Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/regex/_regex_core.py: 43%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

2835 statements  

1# 

2# Secret Labs' Regular Expression Engine core module 

3# 

4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 

5# 

6# This version of the SRE library can be redistributed under CNRI's 

7# Python 1.6 license. For any other use, please contact Secret Labs 

8# AB (info@pythonware.com). 

9# 

10# Portions of this engine have been developed in cooperation with 

11# CNRI. Hewlett-Packard provided funding for 1.6 integration and 

12# other compatibility work. 

13# 

14# 2010-01-16 mrab Python front-end re-written and extended 

15 

16import enum 

17import string 

18import unicodedata 

19from collections import defaultdict 

20 

21import regex._regex as _regex 

22 

# The public names re-exported by the top-level regex module: the flag
# letters/long names, the exception class, and the Scanner helper.
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
  "Scanner", "RegexFlag"]

28 

29# The regex exception. 

class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None

    NOTE: lineno and colno are only set when both pattern and pos are given;
    callers must not assume they always exist.
    """

    def __init__(self, message, pattern=None, pos=None):
        # The pattern may be str or bytes; pick a matching newline so the
        # count/rfind calls below don't mix types.
        newline = '\n' if isinstance(pattern, str) else b'\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            # 1-based line number and column of the failure position.
            self.lineno = pattern.count(newline, 0, pos) + 1
            self.colno = pos - pattern.rfind(newline, 0, pos)

            message = "{} at position {}".format(message, pos)

            # Only mention line/column when the pattern actually spans lines.
            if newline in pattern:
                message += " (line {}, column {})".format(self.lineno,
                  self.colno)

        Exception.__init__(self, message)

58 

59# The exception for when a positional flag has been turned on in the old 

60# behaviour. 

class _UnscopedFlagSet(Exception):
    # Internal control-flow exception raised when a positional (global) flag
    # is found mid-pattern under the old (V0) behaviour; never shown to users.
    pass

63 

64# The exception for when parsing fails and we want to try something else. 

class ParseError(Exception):
    # Internal exception used to backtrack: "this parse attempt failed,
    # rewind and try an alternative interpretation".
    pass

67 

68# The exception for when there isn't a valid first set. 

class _FirstSetError(Exception):
    # Internal exception raised when a usable first set cannot be built.
    pass

71 

72# Flags. 

class RegexFlag(enum.IntFlag):
    """The regex compilation flags; each has a 1-letter and a long alias."""
    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first
                              # fuzzy match.
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A single named flag reprs as "regex.NAME".
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        # Otherwise decompose the value into member flags joined by "|".
        value = self._value_
        members = []
        negative = value < 0

        if negative:
            # Show a negative value as the complement of its bits.
            value = ~value

        for m in self.__class__:
            if value & m._value_:
                value &= ~m._value_
                members.append('regex.%s' % m._name_)

        if value:
            # Bits left over that match no member.
            members.append(hex(value))

        res = '|'.join(members)

        if negative:
            if len(members) > 1:
                res = '~(%s)' % res
            else:
                res = '~%s' % res

        return res

    # Keep the plain int-style str() rather than IntFlag's.
    __str__ = object.__str__

123 

# Re-export every flag member (ASCII, IGNORECASE, ...) as a module global.
globals().update(RegexFlag.__members__)

DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

# Character classes used while parsing patterns.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# NOTE: "" is included so that end-of-pattern (source.get() returning "")
# is treated as a special character by the parser.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags, keyed by their inline-flag letter.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# Normalisation map: FULLCASE without IGNORECASE has no effect.
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}

175 

# The names of the opcodes. Order matters: each opcode's number is its
# position in this list (see the enumerate loop below), and must agree with
# the engine in regex._regex.
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
FUZZY_EXT
"""

# Define the opcodes in a namespace.
class Namespace:
    # Empty attribute holder; opcode numbers are attached below.
    pass

OP = Namespace()
for i, op in enumerate(OPCODES.split()):
    # e.g. OP.FAILURE == 0, OP.SUCCESS == 1, ...
    setattr(OP, op, i)

268 

269def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5): 

270 """Make room in the given cache. 

271 

272 Args: 

273 cache_dict: The cache dictionary to modify. 

274 args_dict: The dictionary of named list args used by patterns. 

275 max_length: Maximum # of entries in cache_dict before it is shrunk. 

276 divisor: Cache will shrink to max_length - 1/divisor*max_length items. 

277 """ 

278 # Toss out a fraction of the entries at random to make room for new ones. 

279 # A random algorithm was chosen as opposed to simply cache_dict.popitem() 

280 # as popitem could penalize the same regular expression repeatedly based 

281 # on its internal hash value. Being random should spread the cache miss 

282 # love around. 

283 cache_keys = tuple(cache_dict.keys()) 

284 overage = len(cache_keys) - max_length 

285 if overage < 0: 

286 # Cache is already within limits. Normally this should not happen 

287 # but it could due to multithreading. 

288 return 

289 

290 number_to_toss = max_length // divisor + overage 

291 

292 # The import is done here to avoid a circular dependency. 

293 import random 

294 if not hasattr(random, 'sample'): 

295 # Do nothing while resolving the circular dependency: 

296 # re->random->warnings->tokenize->string->re 

297 return 

298 

299 for doomed_key in random.sample(cache_keys, number_to_toss): 

300 try: 

301 del cache_dict[doomed_key] 

302 except KeyError: 

303 # Ignore problems if the cache changed from another thread. 

304 pass 

305 

306 # Rebuild the arguments and locale-sensitivity dictionaries. 

307 args_dict.clear() 

308 sensitivity_dict = {} 

309 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict): 

310 args_dict[pattern, pattern_type, flags, default_version, locale] = args 

311 try: 

312 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern] 

313 except KeyError: 

314 pass 

315 

316 locale_sensitive.clear() 

317 locale_sensitive.update(sensitivity_dict) 

318 

def _fold_case(info, string):
    "Folds the case of a string."
    effective = info.flags
    # Without an explicit encoding flag, fall back to the guessed encoding.
    if not effective & _ALL_ENCODINGS:
        effective |= info.guess_encoding
    return _regex.fold_case(effective, string)

326 

def is_cased_i(info, char):
    "Checks whether a character is cased."
    # A cased character has more than one entry in its case-variant list.
    variants = _regex.get_all_cases(info.flags, char)
    return len(variants) > 1

330 

def is_cased_f(flags, char):
    "Checks whether a character is cased."
    # Same test as is_cased_i, but taking the flags directly.
    variants = _regex.get_all_cases(flags, char)
    return len(variants) > 1

334 

def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    reverse = bool(info.flags & REVERSE)
    fs = _check_firstset(info, reverse, fs)
    # No usable firstset (or it matches everything): return no code.
    if not fs or isinstance(fs, AnyAll):
        return []

    # Compile the firstset.
    return fs.compile(reverse)

344 

def _check_firstset(info, reverse, fs):
    """Checks the firstset for the pattern.

    Returns a set node usable as the firstset, or None when no usable
    firstset can be built (empty set, unknown member, negated character,
    or full case-folding in effect).
    """
    if not fs or None in fs:
        return None

    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for i in fs:
        if isinstance(i, Character) and not i.positive:
            return None

#        if i.case_flags:
#            if isinstance(i, Character):
#                if is_cased_i(info, i.value):
#                    return []
#            elif isinstance(i, SetBase):
#                return []
        # Accumulate the case flags and store members case-neutrally.
        case_flags |= i.case_flags
        members.add(i.with_flags(case_flags=NOCASE))

    # Full case-folding makes a simple firstset unusable.
    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    fs = fs.optimise(info, reverse, in_set=True)

    return fs

375 

376def _flatten_code(code): 

377 "Flattens the code from a list of tuples." 

378 flat_code = [] 

379 for c in code: 

380 flat_code.extend(c) 

381 

382 return flat_code 

383 

def make_case_flags(info):
    "Makes the case flags."
    case_flags = info.flags & CASE_FLAGS

    # ASCII mode has no full case-folding, so drop FULLCASE there.
    if info.flags & ASCII:
        case_flags &= ~FULLCASE

    return case_flags

393 

def make_character(info, value, in_set=False):
    "Makes a character literal."
    # Inside a set the character is built case-sensitively; outside, it
    # carries the current case flags.
    if in_set:
        return Character(value)
    return Character(value, case_flags=make_case_flags(info))

401 

def make_ref_group(info, name, position):
    "Makes a group reference."
    case_flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=case_flags)

405 

def make_string_set(info, name):
    "Makes a string set."
    case_flags = make_case_flags(info)
    return StringSet(info, name, case_flags=case_flags)

409 

def make_property(info, prop, in_set):
    "Makes a property."
    # Properties inside a set stay as-is; outside, attach the case flags.
    return prop if in_set else prop.with_flags(case_flags=make_case_flags(info))

416 

def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    # Collect the "|"-separated alternatives.
    alternatives = [parse_sequence(source, info)]
    while source.match("|"):
        alternatives.append(parse_sequence(source, info))

    # A single alternative needs no Branch wrapper.
    return alternatives[0] if len(alternatives) == 1 else Branch(alternatives)

426 

def parse_sequence(source, info):
    """Parses a sequence, eg. 'abc'.

    The sequence list uses None as a sentinel marking "no quantifiable
    element here": it starts with one, and another is pushed after each
    applied quantifier/constraint so that e.g. 'a**' is rejected.
    """
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        # NOTE: "" (end of pattern) is in SPECIAL_CHARS and is handled by
        # the ')|' branch below.
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # It was an inline flag: recompute the case flags.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)
                    if constraints:
                        # It _is_ a fuzzy constraint.
                        apply_constraint(source, info, constraints, case_flags,
                          saved_pos, sequence)
                        sequence.append(None)
                    else:
                        # The element was just a literal.
                        sequence.append(Character(ord(ch),
                          case_flags=case_flags))
            else:
                # A literal.
                sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    # Drop the None sentinels before building the node.
    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)

510 

def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    """Wraps the last element of the sequence in a repeat node.

    counts is the (min, max) pair already parsed; the optional "?"/"+"
    suffix selecting lazy/possessive repetition is consumed here.
    """
    element = sequence.pop()
    if element is None:
        # The sentinel: either the previous element was already quantified
        # ("multiple repeat") or there is nothing before the quantifier.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeated = LazyRepeat
    elif ch == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeated = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat.
        source.pos = saved_pos
        repeated = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number of
    # repeats is fixed at 1.
    if not element.is_empty() and (min_count != 1 or max_count != 1):
        element = repeated(element, min_count, max_count)

    sequence.append(element)

542 

def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    "Wraps the last element of the sequence in a Fuzzy node."
    target = sequence.pop()
    if target is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    if isinstance(target, Group):
        # If a group is marked as fuzzy then put all of the fuzzy part in
        # the group.
        target.subpattern = Fuzzy(target.subpattern, constraints)
    else:
        target = Fuzzy(target, constraints)

    sequence.append(target)

556 

# The (min, max) repeat counts for the shorthand quantifiers.
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}

558 

def parse_quantifier(source, info, ch):
    """Parses a quantifier.

    Returns a (min, max) pair, or None if ch does not start a quantifier.
    """
    counts = _QUANTIFIERS.get(ch)
    if counts:
        # One of the shorthand quantifiers "?", "*" or "+".
        return counts

    if ch == "{":
        # Looks like a limited repeated element, eg. 'a{2,3}'; this returns
        # None if it turns out not to be one.
        return parse_limited_quantifier(source)

    return None

573 

def is_above_limit(count):
    "Checks whether a count is above the maximum."
    # None means "unlimited" in the (min, max) pairs and is never too big.
    if count is None:
        return False
    return count >= UNLIMITED

577 

def parse_limited_quantifier(source):
    """Parses a limited quantifier such as '{2,3}' (the '{' is already read).

    Returns (min, max) with max None for unlimited, or None (with the
    source position rewound) when the braces don't form a quantifier.
    """
    saved_pos = source.pos
    min_count = parse_count(source)
    if source.match(","):
        max_count = parse_count(source)

        # No minimum means 0 and no maximum means unlimited.
        min_count = int(min_count or 0)
        max_count = int(max_count) if max_count else None
    else:
        if not min_count:
            # '{' followed by neither digits nor ',': not a quantifier.
            source.pos = saved_pos
            return None

        min_count = max_count = int(min_count)

    if not source.match ("}"):
        # No closing brace: treat the whole thing as literal text.
        source.pos = saved_pos
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, saved_pos)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          saved_pos)

    return min_count, max_count

607 

def parse_fuzzy(source, info, ch, case_flags):
    """Parses a fuzzy setting, if present.

    Returns the constraints dict, or None (with the source rewound) when
    the braces don't contain a fuzzy setting.
    """
    saved_pos = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all; backtrack.
        source.pos = saved_pos
        return None

    if source.match(":"):
        # An optional test element follows the constraints.
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints

631 

def parse_fuzzy_item(source, constraints):
    """Parses a fuzzy setting item.

    Tries a simple cost constraint first; if that fails, rewinds and
    parses a cost equation instead.
    """
    saved_pos = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        source.pos = saved_pos

        parse_cost_equation(source, constraints)

641 

def parse_cost_constraint(source, constraints):
    """Parses a cost constraint, eg. 'e<=2' or '1<=e<=3'.

    Stores a (min_cost, max_cost) pair in constraints; raises ParseError
    when the text isn't a cost constraint (so the caller can backtrack).
    """
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits?
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()

702 

def parse_cost_limit(source):
    "Parses a cost limit."
    cost_pos = source.pos
    digits = parse_count(source)

    # parse_count returns "" when no digits are present, which is an error
    # here; a non-empty digit string always converts cleanly.
    if digits:
        return int(digits)

    raise error("bad fuzzy cost limit", source.string, cost_pos)

714 

def parse_constraint(source, constraints, ch):
    "Parses a constraint."
    # A constraint is one of the letters "d", "e", "i" or "s" and may not
    # be repeated within the same fuzzy setting.
    if ch not in "deis" or ch in constraints:
        raise ParseError()

    return ch

724 

def parse_fuzzy_compare(source):
    """Parses a cost comparator.

    Returns True for "<=" (inclusive), False for "<" (exclusive), or None
    when no comparator is present.
    """
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None

733 

def parse_cost_equation(source, constraints):
    """Parses a cost equation, eg. '2i+2d+1s<=4'.

    Stores the per-operation coefficients and the maximum total cost under
    constraints["cost"].
    """
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}

    parse_cost_term(source, cost)
    while source.match("+"):
        parse_cost_term(source, cost)

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    max_cost = int(parse_count(source))

    # An exclusive "<" limit becomes an inclusive one.
    if not max_inc:
        max_cost -= 1

    if max_cost < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = max_cost

    constraints["cost"] = cost

760 

def parse_cost_term(source, cost):
    "Parses a cost equation term, eg. '2i' (a coefficient and an operation)."
    coeff = parse_count(source)
    ch = source.get()
    # Only deletions, insertions and substitutions may appear in a cost
    # equation, and each at most once.
    if ch not in "dis":
        raise ParseError()

    if ch in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient defaults to 1.
    cost[ch] = int(coeff) if coeff else 1

772 

def parse_fuzzy_test(source, info, case_flags):
    """Parses the test element of a fuzzy setting (after the ':').

    Accepts an escape, '.', a character set or a literal character;
    anything else is an error.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch in SPECIAL_CHARS:
        if ch == "\\":
            # An escape sequence outside a set.
            return parse_escape(source, info, False)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                return AnyAll()
            elif info.flags & WORD:
                return AnyU()
            else:
                return Any()
        elif ch == "[":
            # A character set.
            return parse_set(source, info)
        else:
            raise error("expected character set", source.string, saved_pos)
    elif ch:
        # A literal.
        return Character(ord(ch), case_flags=case_flags)
    else:
        # End of the pattern.
        raise error("expected character set", source.string, saved_pos)

798 

def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the run of digits at the current position, "" if none.
    return source.get_while(DIGITS)

802 

def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.

    Dispatches on the characters after '(': '?' introduces the extended
    syntaxes, '*' introduces a verb, and anything else is an unnamed
    capture group.
    """
    saved_pos = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Scoped flags end with the group.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)

892 

def parse_extension(source, info):
    """Parses a Python extension, i.e. the syntaxes introduced by '(?P'."""
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Scoped flags end with the group.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)

928 

def parse_comment(source):
    """Parses a comment '(?#...)'.

    Skips to the first unescaped ')' and consumes it; always returns None
    so the caller treats the comment as producing no element.
    """
    while True:
        saved_pos = source.pos
        c = source.get(True)

        if not c or c == ")":
            break

        if c == "\\":
            # Skip the escaped character too.
            c = source.get(True)

    # Step back so expect() consumes the ')' (or reports it missing).
    source.pos = saved_pos
    source.expect(")")

    return None

945 

def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround (lookahead or lookbehind, positive or negative)."
    outer_flags = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Any scoped flags set inside the lookaround end here.
        info.flags = outer_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, body)

957 

def parse_conditional(source, info):
    """Parses a conditional subpattern '(?(group)yes|no)' or a lookaround
    conditional '(?(?=...)yes|no)'.
    """
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

        source.pos = saved_pos
        raise error("expected lookaround conditional", source.string,
          source.pos)

    # A group conditional: re-read the name/number from the start.
    source.pos = saved_pos
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            no_branch = Sequence()

        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    # Both branches empty: the conditional matches nothing either way.
    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)

1000 

def parse_lookaround_conditional(source, info, behind, positive):
    """Parses a lookaround conditional, eg. '(?(?=test)yes|no)'."""
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags from the lookaround end here.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    if source.match("|"):
        no_branch = parse_sequence(source, info)
    else:
        # A missing "no" branch matches the empty string.
        no_branch = Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, subpattern, yes_branch,
      no_branch)

1020 

def parse_atomic(source, info):
    "Parses an atomic subpattern '(?>...)'."
    outer_flags = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Any scoped flags set inside the subpattern end here.
        info.flags = outer_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(body)

1032 

def parse_common(source, info):
    "Parses a common groups branch, '(?|...)'."
    # Capture group numbers in different branches can reuse the group numbers.
    initial_group_count = info.group_count
    branches = [parse_sequence(source, info)]
    final_group_count = info.group_count
    while source.match("|"):
        # Each branch restarts numbering from the same point.
        info.group_count = initial_group_count
        branches.append(parse_sequence(source, info))
        final_group_count = max(final_group_count, info.group_count)

    # The overall count is that of the branch that used the most groups.
    info.group_count = final_group_count
    source.expect(")")

    if len(branches) == 1:
        return branches[0]
    return Branch(branches)

1050 

def parse_call_group(source, info, ch, pos):
    "Parses a call to a group, eg. '(?R)' or '(?2)'."
    # "R" is the whole-pattern recursion, which is group 0.
    group = "0" if ch == "R" else ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)

1061 

def parse_call_named_group(source, info, pos):
    "Parses a call to a named group: (?&name)."
    group_name = parse_name(source)
    source.expect(")")
    return CallGroup(info, group_name, pos)

1068 

def parse_flag_set(source):
    "Parses a set of inline flags, stopping at the first non-flag character."
    flags = 0

    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch == "V":
            # Version flags are two characters: "V0" or "V1".
            ch += source.get()
        try:
            flags |= REGEX_FLAGS[ch]
        except KeyError:
            # Not a flag character; put it back and stop.
            source.pos = saved_pos
            break

    return flags

1084 

def parse_flags(source, info):
    "Parses flags being turned on/off, returning (flags_on, flags_off)."
    flags_on = parse_flag_set(source)
    flags_off = 0
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off

1101 

def parse_subpattern(source, info, flags_on, flags_off):
    "Parses a subpattern with scoped flags: (?flags:...)."
    # The flag changes apply only inside this group; restore them afterwards.
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set.
    # NOTE(review): this re-ORs all of flags_on, not just its encoding bits;
    # looks intentional (flags_on contains at most one encoding flag after
    # validation) — confirm against parse_flags_subpattern.
    if info.flags & (ASCII | LOCALE | UNICODE):
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the outer flags even if parsing raised.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern

1120 

def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's a inline flags, then None is returned.
    """
    flags_on, flags_off = parse_flags(source, info)

    # Global flags can never be turned off once set.
    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        # (The caller catches _UnscopedFlagSet and restarts from scratch with
        # the new global flags in effect.)
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # (?flags:...) — a group with scoped flags.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # (?flags) — positional flags affecting the rest of the pattern.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)

1155 

def parse_positional_flags(source, info, flags_on, flags_off):
    "Parses positional flags: (?flags) affecting the rest of the pattern."
    new_flags = (info.flags | flags_on) & ~flags_off
    info.flags = new_flags
    source.ignore_space = bool(new_flags & VERBOSE)

1160 

def parse_name(source, allow_numeric=False, allow_group_0=False):
    "Parses a group name, optionally allowing a numeric reference."
    # A name runs up to (but not including) ")" or ">".
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        min_group = 0 if allow_group_0 else 1
        if not allow_numeric or int(name) < min_group:
            raise error("bad character in group name", source.string,
              source.pos)
    elif not name.isidentifier():
        raise error("bad character in group name", source.string,
          source.pos)

    return name

1179 

def is_octal(string):
    "Checks whether a string consists entirely of octal digits."
    return not any(ch not in OCT_DIGITS for ch in string)

1183 

def is_decimal(string):
    "Checks whether a string consists entirely of decimal digits."
    return not any(ch not in DIGITS for ch in string)

1187 

def is_hexadecimal(string):
    "Checks whether a string consists entirely of hexadecimal digits."
    return not any(ch not in HEX_DIGITS for ch in string)

1191 

def parse_escape(source, info, in_set):
    "Parses an escape sequence (the backslash has already been consumed)."
    # Read the escape character with whitespace significant, even in VERBOSE
    # mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            charset.extend([0x85, 0x2028, 0x2029])

        # \R matches CR LF as a unit (atomically), or any single line-ending
        # codepoint from the charset above.
        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            # The meaning of \b etc. depends on the WORD/ASCII/UNICODE flags.
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # Charset escapes (\d, \w, \s, ...) also depend on the encoding flags.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # Single-character escapes such as \n and \t.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        # Unknown alphabetic escapes are errors, not literals.
        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)

1274 

def parse_numeric_escape(source, info, ch, in_set):
    "Parses a numeric escape sequence (octal escape or group reference)."
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits. Inside a set a digit escape is
        # always octal; outside, only if it starts with "0".
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            if encoding == ASCII or encoding == LOCALE:
                # Byte patterns wrap octal values into a single byte.
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference: push back the last character read.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)

1307 

def parse_octal_escape(source, info, digits, in_set):
    "Parses an octal escape sequence (up to 3 octal digits)."
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Push back the first non-octal character.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # int() failed: the first digit wasn't octal (e.g. \8 or \9), or the
        # sequence was otherwise malformed.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)

1328 

def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    "Parses a hex escape sequence of a fixed number of digits."
    saved_pos = source.pos
    digits = []
    for _ in range(expected_len):
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, saved_pos)
        digits.append(ch)

    text = ''.join(digits)
    try:
        value = int(text, 16)
    except ValueError:
        value = None

    # Only codepoints below the Unicode limit are valid.
    if value is not None and value < 0x110000:
        return make_character(info, value, in_set)

    # Bad hex escape.
    raise error("bad hex escape \\%s%s" % (esc, text), source.string,
      saved_pos)

1351 

def parse_group_ref(source, info):
    "Parses a group reference: \\g<name>."
    source.expect("<")
    name_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")
    # A group can't refer to itself while still being parsed.
    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, name_pos)

1362 

def parse_string_set(source, info):
    "Parses a string set reference: \\L<name>."
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")
    # The name must match a keyword argument supplied to the pattern.
    if name is None or name not in info.kwargs:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, name)

1372 

def parse_named_char(source, info, in_set):
    "Parses a named character: \\N{NAME}."
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                value = unicodedata.lookup(name)
                return make_character(info, ord(value), in_set)
            except KeyError:
                # unicodedata.lookup raises KeyError for unknown names.
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed \N{...}; backtrack and treat as a literal "N".
    source.pos = saved_pos
    return make_character(info, ord("N"), in_set)

1388 

def _property_encoding(info):
    "Returns the property-encoding constant implied by the current flags."
    if info.flags & ASCII:
        return ASCII_ENCODING
    if info.flags & UNICODE:
        return UNICODE_ENCODING
    return 0

def parse_property(source, info, positive, in_set):
    """Parses a Unicode property: \\p{...}, \\P{...} or abbreviated \\pL.

    'positive' is True for \\p and False for \\P; an explicit "^" inside the
    braces negates again. Falls back to a literal "p"/"P" when the input
    isn't actually a property.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            prop = lookup_property(prop_name, name, positive != negate, source,
              encoding=_property_encoding(info))
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        prop = lookup_property(None, ch, positive, source,
          encoding=_property_encoding(info))
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)

1424 

def parse_property_name(source):
    """Parses a property name, which may be qualified.

    Returns (prop_name, name): prop_name is None for an unqualified name,
    otherwise the part before ":" or "=".
    """
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            # (saved_pos still points before the ":"/"=", so it is re-read by
            # the caller.)
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Rewind to just after the last character that belongs to the name.
    source.pos = saved_pos
    return prop_name, name

1446 

def parse_set(source, info):
    "Parses a character set: [...] (the opening bracket is already consumed)."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # Whitespace is always significant inside a set, even in VERBOSE mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    # Negative set?
    negate = source.match("^")
    try:
        # VERSION1 sets support set operations (||, ~~, &&, --).
        parser = parse_set_imp_union if version == VERSION0 else parse_set_union
        item = parser(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    return item.with_flags(case_flags=make_case_flags(info))

1472 

def parse_set_union(source, info):
    "Parses a set union ([x||y])."
    items = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        items.append(parse_set_symm_diff(source, info))

    return items[0] if len(items) == 1 else SetUnion(info, items)

1482 

def parse_set_symm_diff(source, info):
    "Parses a set symmetric difference ([x~~y])."
    items = [parse_set_inter(source, info)]
    while source.match("~~"):
        items.append(parse_set_inter(source, info))

    return items[0] if len(items) == 1 else SetSymDiff(info, items)

1492 

def parse_set_inter(source, info):
    "Parses a set intersection ([x&&y])."
    items = [parse_set_diff(source, info)]
    while source.match("&&"):
        items.append(parse_set_diff(source, info))

    return items[0] if len(items) == 1 else SetInter(info, items)

1502 

def parse_set_diff(source, info):
    "Parses a set difference ([x--y])."
    items = [parse_set_imp_union(source, info)]
    while source.match("--"):
        items.append(parse_set_imp_union(source, info))

    return items[0] if len(items) == 1 else SetDiff(info, items)

1512 

def parse_set_imp_union(source, info):
    "Parses a set implicit union ([xy]): adjacent members unioned together."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set.
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators; stop before one so the
            # caller can handle it.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)

1535 

def parse_set_member(source, info):
    "Parses a member in a character set: a single item or a range like a-z."
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    # Only a positive single character can start a range.
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A degenerate range like a-a is just the character.
        return start

    return Range(start.value, end.value)

1576 

def parse_set_item(source, info):
    "Parses an item in a character set: escape, POSIX class, nested set, or literal."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class; backtrack and treat "[" normally.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set (VERSION1 only).

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    return Character(ord(ch))

1614 

def parse_posix_class(source, info):
    "Parses a POSIX character class: [:name:]."
    negate = source.match("^")
    prop_name, name = parse_property_name(source)
    if not source.match(":]"):
        # Not terminated by ":]", so not a POSIX class after all; the caller
        # backtracks on ParseError.
        raise ParseError()

    return lookup_property(prop_name, name, not negate, source, posix=True)

1623 

def float_to_rational(flt):
    """Converts a float to a rational pair (numerator, denominator).

    Uses a continued-fraction expansion, stopping once the fractional
    remainder is below 0.0001.
    """
    int_part = int(flt)
    # Renamed from 'error': the old name shadowed the module's error class.
    frac_part = flt - int_part
    if abs(frac_part) < 0.0001:
        return int_part, 1

    # Recurse on the reciprocal of the remainder (continued fraction).
    den, num = float_to_rational(1.0 / frac_part)

    return int_part * den + num, den

1634 

def numeric_to_rational(numeric):
    "Converts a numeric string to a rational string, if possible."
    sign = ""
    if numeric[ : 1] == "-":
        sign, numeric = numeric[0], numeric[1 : ]

    parts = numeric.split("/")
    if len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    elif len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    else:
        # More than one "/" isn't a valid rational.
        raise ValueError()

    result = "{}{}/{}".format(sign, num, den)
    # "n/1" is reported as just the integer n.
    return result[ : -2] if result.endswith("/1") else result

1655 

def standardise_name(name):
    "Standardises a property or value name for lookup."
    try:
        # Numeric-valued names are normalised to a rational string.
        return numeric_to_rational("".join(name))
    except (ValueError, ZeroDivisionError):
        # Otherwise strip separators and uppercase for comparison.
        return "".join(ch for ch in name if ch not in "_- ").upper()

1662 

# Names that get a "POSIX" prefix when looked up via a [[:...:]] class
# (see lookup_property).
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The full set of value names indicating a binary (yes/no) property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())

1666 

def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    """Looks up a property, returning a Property node.

    'property' may be None, in which case 'value' alone is resolved through a
    series of fallbacks: GC/script/block value, binary property name, then
    "IS"/"IN"-prefixed script or block names.
    """
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    # "Assigned" is the complement of "Unassigned".
    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and value.upper() in _POSIX_CLASSES:
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        # Property ids pack the property in the high 16 bits and the value in
        # the low 16 bits.
        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            # val_id 1 is the "yes" value of a binary property.
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        # Non-binary property named without a value: match "not value 0".
        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix.
    # (Both prefixes are 2 characters, hence the fixed value[2:] slice.)
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)

1736 

def _compile_replacement(source, pattern, is_unicode):
    """Compiles a replacement template escape sequence.

    Returns (is_group, codes): is_group is True when 'codes' holds a group
    index; otherwise 'codes' is a list of literal codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence (\u/\U only in unicode templates).
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group preference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    # Octal values are wrapped to a byte in bytes templates.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]

1808 

def parse_repl_hex_escape(source, expected_len, type):
    "Parses a hex escape sequence in a replacement string."
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, source.pos)
        digits.append(ch)

    return int("".join(digits), 16)

1820 

def parse_repl_named_char(source):
    "Parses a named character in a replacement string; None if malformed."
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                return ord(unicodedata.lookup(name))
            except KeyError:
                # unicodedata.lookup raises KeyError for unknown names.
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed \N{...}; backtrack and report no named character.
    source.pos = saved_pos
    return None

1837 

def compile_repl_group(source, pattern):
    "Compiles a replacement template group reference, returning its index."
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")
    if not name.isdigit():
        # A named group; unknown names raise IndexError, as in 're'.
        try:
            return pattern.groupindex[name]
        except KeyError:
            raise IndexError("unknown group")

    index = int(name)
    if not 0 <= index <= pattern.groups:
        raise error("invalid group reference", source.string, source.pos)

    return index

1855 

# The regular expression is parsed into a syntax tree. The different types of
# node are defined below.

# Indentation unit used by the node dump() methods.
INDENT = "  "
# Flag bits packed into compiled opcode tuples.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# The encoding value is stored above the flag bits.
ENCODING_OP_SHIFT = 5

# Text fragments used by dump() for polarity and case flags.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}

1870 

def make_sequence(items):
    "Wraps a list of nodes in a Sequence, unless it's a single node."
    if len(items) != 1:
        return Sequence(items)
    return items[0]

1875 

1876# Common base class for all nodes. 

class RegexBase:
    """Common base class for all syntax-tree nodes.

    Subclasses override the tree-transformation hooks (optimise,
    pack_characters, remove_captures, ...) and the compilation hooks
    (_compile, max_width, get_firstset, ...) as needed; the defaults here are
    the no-op/leaf behaviours.
    """
    def __init__(self):
        # _key identifies the node for equality/hashing; subclasses extend it
        # with their distinguishing attributes.
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        # Returns self unchanged when nothing differs, otherwise a rebuilt
        # copy (subclasses provide rebuild()).
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            # Normalise the case flags to a canonical combination.
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        # Leaf nodes have no group references to fix.
        pass

    def optimise(self, info, reverse):
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # By default a node can't report its first-character set.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        # (offset, required-string); None means no required string.
        return self.max_width(), None

1945 

1946# Base class for zero-width nodes. 

class ZeroWidthBase(RegexBase):
    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        self.positive = bool(positive)
        self.encoding = encoding

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # Matches without consuming a character.
        return set([None])

    def _compile(self, reverse, fuzzy):
        # Pack polarity, fuzziness, direction and encoding into the opcode
        # flags.
        flags = POSITIVE_OP if self.positive else 0
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        return 0

1975 

class Any(RegexBase):
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        # The only flag carried is fuzziness; direction selects the opcode.
        return [(self._opcode[reverse], FUZZY_OP if fuzzy else 0)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        return 1

1994 

class AnyAll(Any):
    # "." with DOTALL semantics (exact matching behaviour is defined by the
    # engine opcode).
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"

    def __init__(self):
        # NOTE: doesn't call Any/RegexBase __init__; sets the standard node
        # attributes directly.
        self.positive = True
        self.zerowidth = False
        self.case_flags = 0

        self._key = self.__class__, self.positive

2005 

class AnyU(Any):
    # Unicode variant of ANY (exact matching behaviour is defined by the
    # engine opcode).
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"

2009 

class Atomic(RegexBase):
    """An atomic (non-backtracking) subpattern: (?>...).

    Most hooks simply delegate to the wrapped subpattern.
    """
    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An empty atomic group is just an empty sequence.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        # The subpattern is bracketed by ATOMIC ... END opcodes.
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

class Boundary(ZeroWidthBase):
    # \b: a zero-width word boundary.
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"

2069 

class Branch(RegexBase):
    """A set of alternatives: a|b|c.

    The optimiser flattens nested branches, hoists common prefixes and
    suffixes out of the alternatives, and merges adjacent
    single-character alternatives into sets.
    """

    def __init__(self, branches):
        RegexBase.__init__(self)
        self.branches = branches

    def fix_groups(self, pattern, reverse, fuzzy):
        for b in self.branches:
            b.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        if not self.branches:
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            if not prefix or not suffix:
                # We might be able to add a quick precheck before the
                # branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)

    def _add_precheck(self, info, reverse, branches):
        # A precheck is only possible if every branch is a case-sensitive
        # literal; collect the first (or last, when reversed) character of
        # each.
        charset = set()
        pos = -1 if reverse else 0

        for branch in branches:
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in
          charset])

    def pack_characters(self, info):
        self.branches = [b.pack_characters(info) for b in self.branches]
        return self

    def remove_captures(self):
        self.branches = [b.remove_captures() for b in self.branches]
        return self

    def is_atomic(self):
        return all(b.is_atomic() for b in self.branches)

    def can_be_affix(self):
        return all(b.can_be_affix() for b in self.branches)

    def contains_group(self):
        return any(b.contains_group() for b in self.branches)

    def get_firstset(self, reverse):
        fs = set()
        for b in self.branches:
            fs |= b.get_firstset(reverse)

        return fs or set([None])

    def _compile(self, reverse, fuzzy):
        if not self.branches:
            return []

        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        # Replace the final NEXT with END.
        code[-1] = (OP.END, )

        return code

    def dump(self, indent, reverse):
        print("{}BRANCH".format(INDENT * indent))
        self.branches[0].dump(indent + 1, reverse)
        for b in self.branches[1 : ]:
            print("{}OR".format(INDENT * indent))
            b.dump(indent + 1, reverse)

    @staticmethod
    def _flatten_branches(info, reverse, branches):
        # Flatten the branches so that there aren't branches of branches.
        new_branches = []
        for b in branches:
            b = b.optimise(info, reverse)
            if isinstance(b, Branch):
                new_branches.extend(b.branches)
            else:
                new_branches.append(b)

        return new_branches

    @staticmethod
    def _split_common_prefix(info, branches):
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches

    @staticmethod
    def _split_common_suffix(info, branches):
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix?
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
              in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches

    @staticmethod
    def _can_split(items, count):
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True

    @staticmethod
    def _can_split_rev(items, count):
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True

    @staticmethod
    def _merge_common_prefixes(info, reverse, branches):
        # Branches with the same case-sensitive character prefix can be
        # grouped together if they are separated only by other branches with
        # a character prefix.
        prefixed = defaultdict(list)
        order = {}
        new_branches = []
        for b in branches:
            if Branch._is_simple_character(b):
                # Branch starts with a simple character.
                prefixed[b.value].append([b])
                order.setdefault(b.value, len(order))
            elif (isinstance(b, Sequence) and b.items and
              Branch._is_simple_character(b.items[0])):
                # Branch starts with a simple character.
                prefixed[b.items[0].value].append(b.items)
                order.setdefault(b.items[0].value, len(order))
            else:
                Branch._flush_char_prefix(info, reverse, prefixed, order,
                  new_branches)

                new_branches.append(b)

        # BUG FIX: the 'reverse' argument was previously omitted from this
        # final flush, which made the call raise TypeError when reached.
        Branch._flush_char_prefix(info, reverse, prefixed, order,
          new_branches)

        return new_branches

    @staticmethod
    def _is_simple_character(c):
        # A positive Character with no case flags.
        return isinstance(c, Character) and c.positive and not c.case_flags

    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        # Can the branches be reduced to a set?
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items, case_flags,
                      new_branches)

                    case_flags = b.case_flags

                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches

    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        # Flush the prefixed branches, preserving first-seen order.
        if not prefixed:
            return

        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        subbranches.append(Sequence())
                        optional = True

                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()

    @staticmethod
    def _flush_set_members(info, reverse, items, case_flags, new_branches):
        # Flush the set members.
        if not items:
            return

        if len(items) == 1:
            item = list(items)[0]
        else:
            item = SetUnion(info, list(items)).optimise(info, reverse)

        new_branches.append(item.with_flags(case_flags=case_flags))

        items.clear()

    @staticmethod
    def _is_full_case(items, i):
        # Is the item at index i a positive Character with full
        # case-folding enabled?
        if not 0 <= i < len(items):
            return False

        item = items[i]
        return (isinstance(item, Character) and item.positive and
          (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)

    @staticmethod
    def _is_folded(items):
        # Do these characters together form the full case-folding of some
        # single character?
        if len(items) < 2:
            return False

        for i in items:
            if (not isinstance(i, Character) or not i.positive or not
              i.case_flags):
                return False

        folded = "".join(chr(i.value) for i in items)
        folded = _regex.fold_case(FULL_CASE_FOLDING, folded)

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        for c in expanding_chars:
            if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
                return True

        return False

    def is_empty(self):
        return all(b.is_empty() for b in self.branches)

    def __eq__(self, other):
        return type(self) is type(other) and self.branches == other.branches

    def max_width(self):
        return max(b.max_width() for b in self.branches)

2461 

class CallGroup(RegexBase):
    """A subpattern (group) call: (?n) or (?&name)."""

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve a name to a group number; validate the number.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        # The group may have changed from a name to a number, so rebuild
        # the key.
        self._key = self.__class__, self.group

    def remove_captures(self):
        # BUG FIX: this previously referenced the undefined name 'pattern',
        # raising NameError instead of the intended error.
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        # A called group can match anything up to unlimited width.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

2507 

class CallRef(RegexBase):
    """Wraps a parsed subpattern so it can be the target of a group call."""

    def __init__(self, ref, parsed):
        self.parsed = parsed
        self.ref = ref

    def _compile(self, reverse, fuzzy):
        body = self.parsed._compile(reverse, fuzzy)
        return [(OP.CALL_REF, self.ref)] + body + [(OP.END, )]

2516 

class Character(RegexBase):
    """A single character, either positive ("a") or negative ("[^a]").

    The opcode depends on the combination of case flags and match
    direction (reverse or not).
    """
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Under full case-folding a single character may fold to several
        # codepoints (e.g. sharp-s folds to "ss").
        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Return a copy of this node with different flags.
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding, so also try to
            # match its multi-character folded form.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (ch == self.value) == self.positive

    def max_width(self):
        # May be > 1 when the character expands on full case-folding.
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self

2590 

class Conditional(RegexBase):
    """A conditional subpattern: (?(group)yes|no)."""

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve a name to a group number; validate the number.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # BUG FIX: previously this returned None, but callers assign the
        # result (node = node.remove_captures()), which would replace this
        # node with None. Return self, as every sibling class does.
        return self

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

2679 

class DefaultBoundary(ZeroWidthBase):
    # \b using the default (non-Unicode-aware) word definition.
    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"

2683 

class DefaultEndOfWord(ZeroWidthBase):
    # End of a word using the default word definition.
    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"

2687 

class DefaultStartOfWord(ZeroWidthBase):
    # Start of a word using the default word definition.
    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"

2691 

class EndOfLine(ZeroWidthBase):
    # $ in MULTILINE mode: end of a line.
    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"

2695 

class EndOfLineU(EndOfLine):
    # End of a line under Unicode line-break rules.
    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"

2699 

class EndOfString(ZeroWidthBase):
    # \Z: end of the string.
    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"

2703 

class EndOfStringLine(ZeroWidthBase):
    # $ outside MULTILINE mode: end of string, or before a final newline.
    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"

2707 

class EndOfStringLineU(EndOfStringLine):
    # Unicode variant of EndOfStringLine.
    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"

2711 

class EndOfWord(ZeroWidthBase):
    # \M / end-of-word assertion.
    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"

2715 

class Failure(ZeroWidthBase):
    # (*FAIL): always fails to match.
    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        return [(OP.FAILURE, )]

2721 

class Fuzzy(RegexBase):
    """Fuzzy (approximate) matching of a subpattern, e.g. (?:...){e<=2}.

    The constraints dict maps the error types — deletions ("d"),
    insertions ("i"), substitutions ("s") and total errors ("e") — to
    (min, max) pairs, where a max of None means unlimited, plus a "cost"
    dict holding the per-error-type cost coefficients and the overall
    cost limit ("max"). Missing entries are filled in with defaults by
    __init__; the defaulting order below is significant.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is itself compiled as fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        # Insertions make the matched width unbounded.
        return UNLIMITED

    def _constraints_to_string(self):
        # Render the constraints in the pattern's "{...}" syntax for dump().
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)

2854 

class Grapheme(RegexBase):
    """\X: a single extended grapheme cluster."""

    def _compile(self, reverse, fuzzy):
        # Match at least 1 character until a grapheme boundary is reached. Note
        # that this is the same whether matching forwards or backwards.
        grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
          GraphemeBoundary()]))

        return grapheme_matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print("{}GRAPHEME".format(INDENT * indent))

    def max_width(self):
        # A grapheme cluster can contain any number of codepoints.
        return UNLIMITED

2869 

class GraphemeBoundary:
    # Zero-width assertion at a grapheme-cluster boundary; a minimal node
    # that doesn't inherit from RegexBase (it only needs compile()).
    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]

2873 

class GreedyRepeat(RegexBase):
    """A greedy repeat: X*, X+, X{m,n}.

    A max_count of None means unbounded. LazyRepeat and PossessiveRepeat
    subclass this, overriding only the opcode and/or compilation.
    """
    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) so that subclasses rebuild as themselves.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        # A fixed repeat count can't backtrack on the number of
        # repetitions.
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat can match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        return ([tuple(repeat)] + subpattern + [(OP.END, )])

    def dump(self, indent, reverse):
        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # Optional, so nothing is required.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None

class PossessiveRepeat(GreedyRepeat):
    """A possessive repeat: X*+, X++, X{m,n}+.

    Compiled as a greedy repeat wrapped in an atomic group, so the engine
    never backtracks into it.
    """

    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        # ATOMIC ( REPEAT ... END ) END
        return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ),
          (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)

2995 

class Group(RegexBase):
    """A capture group: (...) or (?P<name>...).

    Negative group numbers denote private (hidden) groups created during
    parsing; self.info.private_groups maps them to their public numbers.
    """

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Dropping the capture means the group node itself disappears.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # Map a private (negative) group number to its public number and a
        # positive private slot.
        public_group = private_group = self.group
        if private_group < 0:
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            # BUG FIX: this previously read the undefined global
            # 'private_groups', raising NameError; the mapping lives on
            # self.info (as used in _compile above).
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

3077 

class Keep(ZeroWidthBase):
    # Zero-width KEEP node: opcode and dump label for the base class.
    _opcode = OP.KEEP
    _op_name = "KEEP"

3081 

class LazyRepeat(GreedyRepeat):
    # Same structure as a greedy repeat; only the opcode differs.
    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"

3085 

class LookAround(RegexBase):
    """A lookaround assertion node.

    ``behind`` selects lookbehind vs lookahead; ``positive`` selects the
    assertion's polarity.  The subpattern is always processed in the
    direction implied by ``behind``, independent of the overall scan
    direction.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # The subpattern runs backwards iff this is a lookbehind.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        if self.positive and subpattern.is_empty():
            # A positive assertion on an empty pattern always succeeds.
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        # Only a positive assertion scanned in its own direction constrains
        # the first character; otherwise anything may start the match.
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        # NOTE(review): the subpattern is compiled without the fuzzy flag —
        # confirm that fuzziness is intentionally not propagated here.
        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Assertions consume no characters.
        return 0

3153 

class LookAroundConditional(RegexBase):
    """A conditional whose test is a lookaround assertion.

    Matches ``yes_item`` if the lookaround subpattern succeeds (according
    to ``positive``), otherwise ``no_item``.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # BUG FIX: callers (e.g. Sequence.remove_captures) assign the return
        # value of remove_captures(), so returning None (as the original
        # implicitly did) would drop this node from the tree.
        return self

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        # Layout: CONDITIONAL test NEXT yes [NEXT no] END.  The "no" branch
        # is omitted when it compiles to nothing.
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # NOTE(review): due to operator precedence this evaluates as
        # (subpattern and yes_item) or no_item — confirm that is intended.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None

3238 

class PrecompiledCode(RegexBase):
    """A node that wraps already-compiled code and emits it verbatim."""

    def __init__(self, code):
        self.code = code

    def _compile(self, reverse, fuzzy):
        # Direction and fuzziness cannot alter precompiled code.
        frozen = tuple(self.code)
        return [frozen]

3245 

class Property(RegexBase):
    """A character-property test node.

    ``value`` packs the property identifier in the high 16 bits and the
    property value in the low 16 bits (see dump()).
    """

    # Opcode selection keyed by (case flags, reverse direction).
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False, encoding=0):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        self.encoding = encoding

        # Key used for equality/hashing by RegexBase.
        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Same property, different polarity/case/width flags.
        return Property(self.value, positive, case_flags, zerowidth,
          self.encoding)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # The encoding occupies its own bit-field within the flags word.
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        # Unpack the property id (high 16 bits) and value (low 16 bits).
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        # NOTE(review): assumes self.encoding is 0 or 1 — confirm.
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
          ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1

3301 

class Prune(ZeroWidthBase):
    # Zero-width PRUNE node; overrides _compile because the opcode takes
    # no flag arguments.
    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        return [(OP.PRUNE, )]

3307 

class Range(RegexBase):
    """An inclusive character-codepoint range test [lower..upper]."""

    # Opcode selection keyed by (case flags, reverse direction).
    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
      (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
      (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
      (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
    _op_name = "RANGE"

    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.lower = lower
        self.upper = upper
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Key used for equality/hashing by RegexBase.
        self._key = (self.__class__, self.lower, self.upper, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Range(self.lower, self.upper, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # Is the range case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
          FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Build a String alternative for each expanding character that lies
        # inside this range.
        items = []
        for ch in expanding_chars:
            if self.lower <= ord(ch) <= self.upper:
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                items.append(String([ord(c) for c in folded],
                  case_flags=self.case_flags))

        if not items:
            # We can fall back to simple case-folding.
            return self

        if len(items) < self.upper - self.lower + 1:
            # Not all the characters are covered by the full case-folding,
            # so keep the range itself as the first alternative.
            items.insert(0, self)

        return Branch(items)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
          self.upper)]

    def dump(self, indent, reverse):
        # ascii() gives a quoted repr; lstrip removes any b/u prefix.
        display_lower = ascii(chr(self.lower)).lstrip("bu")
        display_upper = ascii(chr(self.upper)).lstrip("bu")
        print("{}RANGE {} {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display_lower, display_upper,
          CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (self.lower <= ch <= self.upper) == self.positive

    def max_width(self):
        return 1

3384 

class RefGroup(RegexBase):
    """A backreference to a capture group (by number or by name)."""

    # Opcode selection keyed by (case flags, reverse direction).
    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve a group name or number string to a valid group number.

        Raises error if the name is unknown or the number out of range.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # Rebuild the key now that the group has been resolved to a number.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # BUG FIX: the original referenced an undefined name 'pattern' here,
        # raising NameError instead of the intended error.  The pattern text
        # isn't available at this point, so pass None.
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        # A group can match a string of any length.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

3433 

class SearchAnchor(ZeroWidthBase):
    # Zero-width anchor at the position where the search started.
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"

3437 

class Sequence(RegexBase):
    """A concatenation of pattern nodes, matched one after another."""

    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        items = []
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither
                    # the previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # A non-character node ends the current run.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def contains_group(self):
        return any(s.contains_group() for s in self.items)

    def get_firstset(self, reverse):
        # Accumulate firstsets from the start of the scan direction until a
        # node that cannot match empty (no None in its firstset) is found.
        fs = set()
        items = self.items
        if reverse:
            # NOTE(review): this reverses self.items IN PLACE, permanently
            # reordering the sequence — confirm this mutation is intended.
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        # Every item could match empty, so the sequence can too.
        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Items are emitted in scan order.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        # Convert the pending run of characters into Character/String nodes
        # and clear the run.
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        # Find every occurrence of every expanded folding in the literal.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Emit IGNORECASE literals between the merged full-fold chunks and
        # FULLIGNORECASE literals for the chunks themselves.
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        # Merge overlapping/adjacent (start, end) intervals.
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        # Return (offset, node) for the first required string found in
        # scan order, or (total offset, None) if there is none.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None

3650 

class SetBase(RegexBase):
    """Base class for character-set operation nodes (union, diff, ...).

    Subclasses supply ``_opcode`` (keyed by case flags and direction) and
    ``_op_name``, and implement ``matches`` and ``optimise``.
    """

    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # A set member matches one character at a time.
        self.char_width = 1

        # Key used for equality/hashing by RegexBase.
        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        # Layout: set opcode, then each member's code, then END.
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Build a String alternative for each distinct folding of a member
        # character that expands to multiple codepoints.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # The widest match is the longest folding of any member character.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

3755 

class SetDiff(SetBase):
    # Set difference: matches the first item but not the second.
    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = self.items
        if len(items) > 2:
            # A -- B -- C is equivalent to A -- (B | C).
            items = [items[0], SetUnion(info, items[1 : ])]

        if len(items) == 1:
            # A difference with a single operand is just that operand.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(m.optimise(info, reverse, in_set=True) for m in
          items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = self.items[0].matches(ch) and not self.items[1].matches(ch)
        return m == self.positive

3781 

class SetInter(SetBase):
    """Set intersection: matches only characters matched by every member."""

    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        """Optimise members, flattening nested positive intersections."""
        flattened = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetInter) and member.positive:
                # Hoist the members of a nested intersection.
                flattened.extend(member.items)
            else:
                flattened.append(member)

        if len(flattened) == 1:
            # A single-member intersection is just that member.
            only = flattened[0]
            return only.with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        matched = all(member.matches(ch) for member in self.items)
        return matched == self.positive

3811 

class SetSymDiff(SetBase):
    """Set symmetric difference: matches characters in an odd number of
    members."""

    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        """Optimise members, flattening nested positive symmetric diffs."""
        flattened = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetSymDiff) and member.positive:
                # Hoist the members of a nested symmetric difference.
                flattened.extend(member.items)
            else:
                flattened.append(member)

        if len(flattened) == 1:
            # A single-member symmetric difference is just that member.
            only = flattened[0]
            return only.with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR-fold the member results: true iff an odd number match.
        parity = False
        for member in self.items:
            if member.matches(ch):
                parity = not parity

        return parity == self.positive

3844 

class SetUnion(SetBase):
    """Set union: matches characters matched by any member."""

    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union: hoist the nested members.
                items.extend(m.items)
            elif isinstance(m, AnyAll):
                # A union containing "match anything" matches anything.
                return AnyAll()
            else:
                items.append(m)

        # Are there complementary properties?  A property and its negation
        # together match anything.
        properties = (set(), set())

        for m in items:
            if isinstance(m, Property):
                properties[m.positive].add((m.value, m.case_flags, m.zerowidth))

        if properties[0] & properties[1]:
            return AnyAll()

        if len(items) == 1:
            # A single-member union is just that member, with polarity
            # combined from the member and the union itself.
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Batch Character members together (by polarity) so they can be
        # emitted as a single CHARACTER/STRING op instead of one op each.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive

3922 

class Skip(ZeroWidthBase):
    # Zero-width SKIP node: opcode and dump label for the base class.
    _op_name = "SKIP"
    _opcode = OP.SKIP

3926 

class StartOfLine(ZeroWidthBase):
    # Zero-width start-of-line anchor.
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"

3930 

class StartOfLineU(StartOfLine):
    # Start-of-line anchor variant (the "_U" opcode); behaviour otherwise
    # inherited from StartOfLine.
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"

3934 

class StartOfString(ZeroWidthBase):
    # Zero-width start-of-string anchor.
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"

3938 

class StartOfWord(ZeroWidthBase):
    # Zero-width start-of-word anchor.
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"

3942 

class String(RegexBase):
    """A literal run of characters, stored as a tuple of codepoints."""

    # Opcode selection keyed by (case flags, reverse direction).
    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-fold the characters so the engine compares folded text.
            folded_characters = []
            for char in self.characters:
                folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
                folded_characters.extend(ord(c) for c in folded)
        else:
            folded_characters = self.characters

        self.folded_characters = tuple(folded_characters)
        # Set to True if this string becomes the pattern's required string.
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # The first character depends on the scan direction.
        if reverse:
            pos = -1
        else:
            pos = 0
        return set([Character(self.characters[pos],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
          len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        # ascii() gives a quoted repr; lstrip removes any b/u prefix.
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        return 0, self

3997 

class Literal(String):
    """A String produced by literal-text splitting; differs only in how it
    is dumped."""

    def dump(self, indent, reverse):
        text = "".join(map(chr, self.characters))
        # ascii() gives a quoted repr; lstrip removes any b/u prefix.
        display = ascii(text).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

4004 

class StringSet(Branch):
    """A named list of strings (\\L<name>), compiled as a Branch of
    per-string Sequences.

    Note: Branch.__init__ is deliberately not called; self.branches is
    assigned directly at the end of __init__.
    """

    def __init__(self, info, name, case_flags=NOCASE):
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        # Register this (name, case_flags) combination, assigning it the
        # next available index.
        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        index = self.info.named_lists_used[self.set_key]
        # The actual strings are supplied as a keyword argument.
        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        encoding = self.info.flags & _ALL_ENCODINGS
        fold_flags = encoding | case_flags

        choices = []

        for string in items:
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

4045 

class Source:
    """Scanner for the regular expression source string.

    A bytes pattern is decoded as Latin-1 so that it can be scanned as text;
    char_type rebuilds a unit of the original type from a code point. When
    ignore_space is True (VERBOSE mode) every read method transparently
    skips whitespace and #-comments, using IndexError (end of string) and
    ValueError (comment running to the end, from str.index) as the
    end-of-input signals.
    """
    def __init__(self, string):
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            # Latin-1 maps bytes 0-255 one-to-one onto code points, so the
            # decode is lossless.
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        self.ignore_space = False
        # An empty slice of the *original* argument: "" for str input,
        # b"" for bytes input.
        self.sep = string[ : 0]

    def get(self, override_ignore=False):
        """Returns the next significant character and advances past it, or
        returns an empty string at the end of the pattern.

        override_ignore: if True, do not skip whitespace/comments even in
            VERBOSE mode.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        """Returns up to `count` significant characters as a string,
        stopping early at the end of the pattern.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                # Collect characters one at a time, skipping whitespace and
                # comments before each.
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string; return what was
            # collected so far.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True, keep_spaces=False):
        """Returns the longest run of characters whose membership in
        `test_set` equals `include`, advancing past them.

        keep_spaces: if True, whitespace/comments are significant even in
            VERBOSE mode.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space and not keep_spaces:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string; the run extends to
                # the end.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        """Advances past characters whose membership in `test_set` equals
        `include`, returning nothing.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        """Returns whether `substring` appears at the current position,
        advancing past it if so; the position is unchanged on failure.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                # Compare character by character, skipping whitespace and
                # comments before each expected character.
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        "Matches `substring` or raises a 'missing' error."
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        """Returns whether only insignificant characters (if any) remain
        before the end of the pattern. Does not move the position.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        pos += 1
                    elif string[pos] == "#":
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True

4267 

class Info:
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs=None):
        """Collects parser state for one compilation.

        flags: initial regex flags; the default flags of the selected
            version are OR'd in.
        char_type: callable turning a code point into one unit of the
            pattern's string type (chr for str, bytes([c]) for bytes).
        kwargs: mapping of named lists supplied to the compile call.
            (Default changed from a shared mutable `{}` to None to avoid
            the mutable-default-argument pitfall; behaviour is unchanged.)
        """
        if kwargs is None:
            kwargs = {}

        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = kwargs

        self.group_count = 0
        self.group_index = {}      # group name -> group number
        self.group_name = {}       # group number -> group name
        self.char_type = char_type
        self.named_lists_used = {}
        self.open_groups = []      # stack of currently-open group numbers
        self.open_group_count = {}
        self.defined_groups = {}
        self.group_calls = []
        self.private_groups = {}

    def open_group(self, name=None):
        """Opens a (possibly named) capture group and returns its number.

        Re-opening a group that is already open (a nested named group) gets
        a negative private alias which is mapped back to the real group
        later.
        """
        group = self.group_index.get(name)
        if group is None:
            # Find the next group number not already taken by a named
            # group.
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        "Closes the most recently opened group."
        self.open_groups.pop()

    def is_open_group(self, name):
        """Returns whether the group referred to by `name` (a name or a
        decimal number string) is currently open.
        """
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups

4332 

4333def _check_group_features(info, parsed): 

4334 """Checks whether the reverse and fuzzy features of the group calls match 

4335 the groups which they call. 

4336 """ 

4337 call_refs = {} 

4338 additional_groups = [] 

4339 for call, reverse, fuzzy in info.group_calls: 

4340 # Look up the reference of this group call. 

4341 key = (call.group, reverse, fuzzy) 

4342 ref = call_refs.get(key) 

4343 if ref is None: 

4344 # This group doesn't have a reference yet, so look up its features. 

4345 if call.group == 0: 

4346 # Calling the pattern as a whole. 

4347 rev = bool(info.flags & REVERSE) 

4348 fuz = isinstance(parsed, Fuzzy) 

4349 if (rev, fuz) != (reverse, fuzzy): 

4350 # The pattern as a whole doesn't have the features we want, 

4351 # so we'll need to make a copy of it with the desired 

4352 # features. 

4353 additional_groups.append((CallRef(len(call_refs), parsed), 

4354 reverse, fuzzy)) 

4355 else: 

4356 # Calling a capture group. 

4357 def_info = info.defined_groups[call.group] 

4358 group = def_info[0] 

4359 if def_info[1 : ] != (reverse, fuzzy): 

4360 # The group doesn't have the features we want, so we'll 

4361 # need to make a copy of it with the desired features. 

4362 additional_groups.append((group, reverse, fuzzy)) 

4363 

4364 ref = len(call_refs) 

4365 call_refs[key] = ref 

4366 

4367 call.call_ref = ref 

4368 

4369 info.call_refs = call_refs 

4370 info.additional_groups = additional_groups 

4371 

def _get_required_string(parsed, flags):
    """Gets the required string and related info of a parsed pattern.

    Returns (offset, folded characters, case flags); a pattern with no
    required string yields (0, (), 0). Marks the required node so that the
    compiler emits it with the REQUIRED flag.
    """
    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
    if not required:
        return 0, (), 0

    required.required = True
    # An unbounded offset is signalled to the matcher as -1.
    if req_offset >= UNLIMITED:
        req_offset = -1

    req_flags = required.case_flags
    # Drop the Unicode case bit when the pattern itself isn't Unicode.
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags

4392 

class Scanner:
    """A simple lexical scanner: each lexicon entry is a (phrase, action)
    pair, compiled into one compound pattern with one group per phrase.
    """

    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Parse each phrase separately.
        subpatterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            subpatterns.append(parsed.remove_captures())

        # Wrap each phrase in its own group and branch between them.
        info = Info(flags)
        subpatterns = [Group(info, number + 1, subpattern) for number,
          subpattern in enumerate(subpatterns)]
        parsed = Branch(subpatterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Group calls (recursion) aren't supported by the Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile to a list of opcode tuples, then flatten into ints.
        code = _flatten_code(parsed.compile(reverse) + [(OP.SUCCESS, )])

        if not parsed.has_simple_start():
            # Prefix the code with the first set, if one can be built.
            try:
                code = _flatten_code(_compile_firstset(info,
                  parsed.get_firstset(reverse))) + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(subpatterns))

    def scan(self, string):
        """Tokenises `string`, returning (tokens, unmatched remainder).

        For each match, the corresponding lexicon action is either appended
        directly or, if callable, invoked as action(self, matched_text)
        with self.match set to the match object; a None result is dropped.
        """
        tokens = []
        match = self.scanner.scanner(string).match
        start = 0
        while True:
            m = match()
            if not m:
                break
            end = m.end()
            if start == end:
                # A zero-width match would loop forever; stop here.
                break
            action = self.lexicon[m.lastindex - 1][1]
            if callable(action):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                tokens.append(action)
            start = end

        return tokens, string[start : ]

4488 

# Get the known properties dict from the C extension: maps a property name
# to (property id, {value name: value id}).
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict.
#
# PROPERTY_NAMES maps a property id to (canonical name, {value id:
# canonical value name}), where the canonical name is the longest of the
# registered aliases.
PROPERTY_NAMES = {}
for prop_name, (prop_id, values) in PROPERTIES.items():
    name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {}))
    name = max(name, prop_name, key=len)
    PROPERTY_NAMES[prop_id] = name, prop_values

    for val_name, val_id in values.items():
        prop_values[val_id] = max(prop_values.get(val_id, ""), val_name,
          key=len)

# Character escape sequences.
CHARACTER_ESCAPES = {
    "a": "\a",
    "b": "\b",
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
    "v": "\v",
}

# Encoding selectors passed to lookup_property for the ASCII/Unicode
# variants of the escapes below.
ASCII_ENCODING = 1
UNICODE_ENCODING = 2

# Predefined character set escape sequences (default encoding).
CHARSET_ESCAPES = {
    "d": lookup_property(None, "Digit", True),
    "D": lookup_property(None, "Digit", False),
    "h": lookup_property(None, "Blank", True),
    "s": lookup_property(None, "Space", True),
    "S": lookup_property(None, "Space", False),
    "w": lookup_property(None, "Word", True),
    "W": lookup_property(None, "Word", False),
}

# ASCII/Unicode variants override every entry except \h (Blank).
ASCII_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
ASCII_CHARSET_ESCAPES.update({
    "d": lookup_property(None, "Digit", True, encoding=ASCII_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=ASCII_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=ASCII_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=ASCII_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=ASCII_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=ASCII_ENCODING),
})
UNICODE_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
UNICODE_CHARSET_ESCAPES.update({
    "d": lookup_property(None, "Digit", True, encoding=UNICODE_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=UNICODE_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=UNICODE_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=UNICODE_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=UNICODE_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=UNICODE_ENCODING),
})

# Positional escape sequences (default encoding).
POSITION_ESCAPES = {
    "A": StartOfString(),
    "b": Boundary(),
    "B": Boundary(False),
    "K": Keep(),
    "m": StartOfWord(),
    "M": EndOfWord(),
    "Z": EndOfString(),
}
# ASCII/Unicode variants override only the word-related escapes; \A, \K
# and \Z are encoding-independent.
ASCII_POSITION_ESCAPES = dict(POSITION_ESCAPES)
ASCII_POSITION_ESCAPES.update({
    "b": Boundary(encoding=ASCII_ENCODING),
    "B": Boundary(False, encoding=ASCII_ENCODING),
    "m": StartOfWord(encoding=ASCII_ENCODING),
    "M": EndOfWord(encoding=ASCII_ENCODING),
})
UNICODE_POSITION_ESCAPES = dict(POSITION_ESCAPES)
UNICODE_POSITION_ESCAPES.update({
    "b": Boundary(encoding=UNICODE_ENCODING),
    "B": Boundary(False, encoding=UNICODE_ENCODING),
    "m": StartOfWord(encoding=UNICODE_ENCODING),
    "M": EndOfWord(encoding=UNICODE_ENCODING),
})

# Positional escape sequences when WORD flag set.
WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES)
WORD_POSITION_ESCAPES.update({
    "b": DefaultBoundary(),
    "B": DefaultBoundary(False),
    "m": DefaultStartOfWord(),
    "M": DefaultEndOfWord(),
})

# Regex control verbs, e.g. (*FAIL) and (*SKIP).
VERBS = {
    "FAIL": Failure(),
    "F": Failure(),
    "PRUNE": Prune(),
    "SKIP": Skip(),
}