Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/regex/_regex_core.py: 26%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#
2# Secret Labs' Regular Expression Engine core module
3#
4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
5#
6# This version of the SRE library can be redistributed under CNRI's
7# Python 1.6 license. For any other use, please contact Secret Labs
8# AB (info@pythonware.com).
9#
10# Portions of this engine have been developed in cooperation with
11# CNRI. Hewlett-Packard provided funding for 1.6 integration and
12# other compatibility work.
13#
14# 2010-01-16 mrab Python front-end re-written and extended
16import enum
17import string
18import unicodedata
19from collections import defaultdict
21from regex import _regex
23__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
24 "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
25 "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
26 "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
27 "Scanner", "RegexFlag"]
29# The regex exception.
class error(Exception):
    """Raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        # The pattern may be a str or a bytes object; pick a matching newline.
        newline = '\n' if isinstance(pattern, str) else b'\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            # Work out where in the pattern the failure occurred so the
            # formatted message can point at it.
            self.lineno = pattern.count(newline, 0, pos) + 1
            self.colno = pos - pattern.rfind(newline, 0, pos)
            message = f"{message} at position {pos}"
            # Only mention line/column for multi-line patterns.
            if newline in pattern:
                message += f" (line {self.lineno}, column {self.colno})"
        Exception.__init__(self, message)
59# The exception for when a positional flag has been turned on in the old
60# behaviour.
class _UnscopedFlagSet(Exception):
    "Raised when a positional (unscoped) flag is turned on in the old behaviour."
    pass
64# The exception for when parsing fails and we want to try something else.
class ParseError(Exception):
    "Raised internally when parsing fails and something else should be tried."
    pass
68# The exception for when there isn't a valid first set.
class _FirstSetError(Exception):
    "Raised internally when there isn't a valid first set."
    pass
72# Flags.
class RegexFlag(enum.IntFlag):
    """The flags understood by the regex module.

    Each flag has a full name and a single-letter alias (the letter is the
    canonical member; the full name is an alias for it).
    """
    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the
                              # first fuzzy match.
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A named member renders simply as e.g. 'regex.I'.
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        value = self._value_
        negative = value < 0
        if negative:
            value = ~value

        # Decompose the value into known flags plus any leftover bits.
        parts = []
        for member in self.__class__:
            if value & member._value_:
                value &= ~member._value_
                parts.append('regex.%s' % member._name_)

        if value:
            parts.append(hex(value))

        text = '|'.join(parts)

        if negative:
            text = '~(%s)' % text if len(parts) > 1 else '~%s' % text

        return text

    __str__ = object.__str__
124# Put the flags into the module namespace. Being explicit here helps tools like
125# linters and IDEs understand the code better.
# Full-name aliases for the RegexFlag members.
ASCII = RegexFlag.ASCII
BESTMATCH = RegexFlag.BESTMATCH
DEBUG = RegexFlag.DEBUG
DOTALL = RegexFlag.DOTALL
ENHANCEMATCH = RegexFlag.ENHANCEMATCH
FULLCASE = RegexFlag.FULLCASE
IGNORECASE = RegexFlag.IGNORECASE
LOCALE = RegexFlag.LOCALE
MULTILINE = RegexFlag.MULTILINE
POSIX = RegexFlag.POSIX
REVERSE = RegexFlag.REVERSE
TEMPLATE = RegexFlag.TEMPLATE
UNICODE = RegexFlag.UNICODE
VERBOSE = RegexFlag.VERBOSE
VERSION0 = RegexFlag.VERSION0
VERSION1 = RegexFlag.VERSION1
WORD = RegexFlag.WORD
# Single-letter abbreviations for the same flags.
A = RegexFlag.A
B = RegexFlag.B
D = RegexFlag.D
E = RegexFlag.E
F = RegexFlag.F
I = RegexFlag.I
L = RegexFlag.L
M = RegexFlag.M
P = RegexFlag.P
R = RegexFlag.R
S = RegexFlag.S
U = RegexFlag.U
V0 = RegexFlag.V0
V1 = RegexFlag.V1
W = RegexFlag.W
X = RegexFlag.X
T = RegexFlag.T
# The behaviour version used when none is specified.
DEFAULT_VERSION = VERSION1
_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE
# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}
# The mask for the flags.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)
# Character classes used while parsing.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# "" is included so that the end of the pattern is handled like a special
# character by the parser.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")
# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8
# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1
# The regular expression flags, keyed by their inline-flag letters.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}
# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE
FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}
# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
211# The names of the opcodes.
212OPCODES = """
213FAILURE
214SUCCESS
215ANY
216ANY_ALL
217ANY_ALL_REV
218ANY_REV
219ANY_U
220ANY_U_REV
221ATOMIC
222BOUNDARY
223BRANCH
224CALL_REF
225CHARACTER
226CHARACTER_IGN
227CHARACTER_IGN_REV
228CHARACTER_REV
229CONDITIONAL
230DEFAULT_BOUNDARY
231DEFAULT_END_OF_WORD
232DEFAULT_START_OF_WORD
233END
234END_OF_LINE
235END_OF_LINE_U
236END_OF_STRING
237END_OF_STRING_LINE
238END_OF_STRING_LINE_U
239END_OF_WORD
240FUZZY
241GRAPHEME_BOUNDARY
242GREEDY_REPEAT
243GROUP
244GROUP_CALL
245GROUP_EXISTS
246KEEP
247LAZY_REPEAT
248LOOKAROUND
249NEXT
250PROPERTY
251PROPERTY_IGN
252PROPERTY_IGN_REV
253PROPERTY_REV
254PRUNE
255RANGE
256RANGE_IGN
257RANGE_IGN_REV
258RANGE_REV
259REF_GROUP
260REF_GROUP_FLD
261REF_GROUP_FLD_REV
262REF_GROUP_IGN
263REF_GROUP_IGN_REV
264REF_GROUP_REV
265SEARCH_ANCHOR
266SET_DIFF
267SET_DIFF_IGN
268SET_DIFF_IGN_REV
269SET_DIFF_REV
270SET_INTER
271SET_INTER_IGN
272SET_INTER_IGN_REV
273SET_INTER_REV
274SET_SYM_DIFF
275SET_SYM_DIFF_IGN
276SET_SYM_DIFF_IGN_REV
277SET_SYM_DIFF_REV
278SET_UNION
279SET_UNION_IGN
280SET_UNION_IGN_REV
281SET_UNION_REV
282SKIP
283START_OF_LINE
284START_OF_LINE_U
285START_OF_STRING
286START_OF_WORD
287STRING
288STRING_FLD
289STRING_FLD_REV
290STRING_IGN
291STRING_IGN_REV
292STRING_REV
293FUZZY_EXT
294"""
296# Define the opcodes in a namespace.
class Namespace:
    "A bare attribute container; OP gets one attribute per opcode name."
    pass
# OP.<NAME> is the integer opcode; values follow the order of OPCODES.
OP = Namespace()
for i, op in enumerate(OPCODES.split()):
    setattr(OP, op, i)
304def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
305 """Make room in the given cache.
307 Args:
308 cache_dict: The cache dictionary to modify.
309 args_dict: The dictionary of named list args used by patterns.
310 max_length: Maximum # of entries in cache_dict before it is shrunk.
311 divisor: Cache will shrink to max_length - 1/divisor*max_length items.
312 """
313 # Toss out a fraction of the entries at random to make room for new ones.
314 # A random algorithm was chosen as opposed to simply cache_dict.popitem()
315 # as popitem could penalize the same regular expression repeatedly based
316 # on its internal hash value. Being random should spread the cache miss
317 # love around.
318 cache_keys = tuple(cache_dict.keys())
319 overage = len(cache_keys) - max_length
320 if overage < 0:
321 # Cache is already within limits. Normally this should not happen
322 # but it could due to multithreading.
323 return
325 number_to_toss = max_length // divisor + overage
327 # The import is done here to avoid a circular dependency.
328 import random
329 if not hasattr(random, 'sample'):
330 # Do nothing while resolving the circular dependency:
331 # re->random->warnings->tokenize->string->re
332 return
334 for doomed_key in random.sample(cache_keys, number_to_toss):
335 try:
336 del cache_dict[doomed_key]
337 except KeyError:
338 # Ignore problems if the cache changed from another thread.
339 pass
341 # Rebuild the arguments and locale-sensitivity dictionaries.
342 args_dict.clear()
343 sensitivity_dict = {}
344 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
345 args_dict[pattern, pattern_type, flags, default_version, locale] = args
346 try:
347 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
348 except KeyError:
349 pass
351 locale_sensitive.clear()
352 locale_sensitive.update(sensitivity_dict)
def _fold_case(info, string):
    "Folds the case of a string using the pattern's encoding flags."
    flags = info.flags
    if not flags & _ALL_ENCODINGS:
        # No explicit encoding flag was given; use the guessed encoding.
        flags |= info.guess_encoding

    return _regex.fold_case(flags, string)
def is_cased_i(info, char):
    "Checks whether a character is cased, using the flags in `info`."
    cases = _regex.get_all_cases(info.flags, char)
    return len(cases) > 1
def is_cased_f(flags, char):
    "Checks whether a character is cased, using explicit flags."
    cases = _regex.get_all_cases(flags, char)
    return len(cases) > 1
def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)
    # An empty or match-anything firstset gives no useful optimisation.
    if not checked or isinstance(checked, AnyAll):
        return []

    return checked.compile(reverse)
def _check_firstset(info, reverse, fs):
    "Checks the firstset for the pattern; returns an optimised set or None."
    # A None member means some branch could start with anything, so there's
    # no usable firstset.
    if not fs or None in fs:
        return None
    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for i in fs:
        if isinstance(i, Character) and not i.positive:
            return None
#        if i.case_flags:
#            if isinstance(i, Character):
#                if is_cased_i(info, i.value):
#                    return []
#            elif isinstance(i, SetBase):
#                return []
        case_flags |= i.case_flags
        members.add(i.with_flags(case_flags=NOCASE))
    # Full case-folding with case-ignoring is too complex for a firstset.
    if case_flags == (FULLCASE | IGNORECASE):
        return None
    # Build the firstset.
    fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    fs = fs.optimise(info, reverse, in_set=True)
    return fs
411def _flatten_code(code):
412 "Flattens the code from a list of tuples."
413 flat_code = []
414 for c in code:
415 flat_code.extend(c)
417 return flat_code
def make_case_flags(info):
    "Derives the case flags from the current pattern flags."
    case_flags = info.flags & CASE_FLAGS

    # Full case-folding doesn't apply when ASCII is turned on.
    if info.flags & ASCII:
        case_flags &= ~FULLCASE

    return case_flags
def make_character(info, value, in_set=False):
    "Makes a character literal."
    if not in_set:
        return Character(value, case_flags=make_case_flags(info))

    # A character set is built case-sensitively; the set itself carries any
    # case flags.
    return Character(value)
def make_ref_group(info, name, position):
    "Makes a reference to a group, carrying the current case flags."
    case_flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=case_flags)
def make_string_set(info, name):
    "Makes a string set, carrying the current case flags."
    case_flags = make_case_flags(info)
    return StringSet(info, name, case_flags=case_flags)
def make_property(info, prop, in_set):
    "Makes a property."
    if not in_set:
        return prop.with_flags(case_flags=make_case_flags(info))

    # Inside a set the property keeps its own flags.
    return prop
def _parse_pattern(source, info):
    "Parses a pattern: one or more '|'-separated sequences, eg. 'a|b|c'."
    branches = [parse_sequence(source, info)]
    while source.match("|"):
        branches.append(parse_sequence(source, info))

    # A single branch needs no Branch wrapper.
    return branches[0] if len(branches) == 1 else Branch(branches)
def parse_sequence(source, info):
    """Parses a sequence, eg. 'abc'.

    A None placeholder is appended after each quantified or fuzzy element so
    that a following quantifier can't apply to it again; the placeholders are
    filtered out before the Sequence is built.
    """
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # It was an inline flag; the case flags may have changed.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)
                    if constraints:
                        # It _is_ a fuzzy constraint.
                        if is_actually_fuzzy(constraints):
                            apply_constraint(source, info, constraints,
                              case_flags, saved_pos, sequence)
                            sequence.append(None)
                        else:
                            # The element was just a literal.
                            sequence.append(Character(ord(ch),
                              case_flags=case_flags))
                    else:
                        # A literal.
                        sequence.append(Character(ord(ch),
                          case_flags=case_flags))
            else:
                # A special character with no other meaning here (eg. "#"
                # outside VERBOSE mode) is just a literal.  Without this
                # branch such characters would be silently dropped.
                sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # An ordinary literal character.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)
def is_actually_fuzzy(constraints):
    "Checks whether a parsed fuzzy constraint permits any errors at all."
    # No errors allowed overall?
    if constraints.get("e") == (0, 0):
        return False

    # Substitutions, insertions and deletions all individually forbidden?
    zero = (0, 0)
    if (constraints.get("s"), constraints.get("i"), constraints.get("d")) == (zero, zero, zero):
        return False

    return True
def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    "Applies a quantifier (min, max counts) to the last element of a sequence."
    element = sequence.pop()
    if element is None:
        # A None placeholder means the previous element was already
        # quantified or fuzzy.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeated = LazyRepeat
    elif ch == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeated = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat.
        source.pos = saved_pos
        repeated = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number of
    # repeats is fixed at 1.
    if not element.is_empty() and (min_count != 1 or max_count != 1):
        element = repeated(element, min_count, max_count)

    sequence.append(element)
def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    "Wraps the last element of the sequence in a Fuzzy node."
    element = sequence.pop()
    if element is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    if isinstance(element, Group):
        # A fuzzy group keeps the fuzziness inside the group, so the group
        # still captures whatever the fuzzy part matched.
        element.subpattern = Fuzzy(element.subpattern, constraints)
        sequence.append(element)
    else:
        sequence.append(Fuzzy(element, constraints))
# Shorthand quantifiers mapped to (min, max); None means no upper bound.
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
def parse_quantifier(source, info, ch):
    "Parses a quantifier, returning (min, max) counts or None."
    counts = _QUANTIFIERS.get(ch)
    if counts:
        # One of the shorthand quantifiers: "?", "*" or "+".
        return counts

    if ch == "{":
        # Could be a limited repeated element, eg. 'a{2,3}'.
        return parse_limited_quantifier(source)

    return None
def is_above_limit(count):
    "Checks whether a repeat count exceeds what the engine can encode."
    if count is None:
        # No limit at all.
        return False
    return count >= UNLIMITED
def parse_limited_quantifier(source):
    "Parses a limited quantifier, eg. '{2,3}'; returns (min, max) or None."
    saved_pos = source.pos
    min_count = parse_count(source)
    if source.match(","):
        max_count = parse_count(source)

        # No minimum means 0 and no maximum means unlimited.
        min_count = int(min_count or 0)
        max_count = int(max_count) if max_count else None
    else:
        if not min_count:
            # Not a quantifier after all, eg. '{}'.
            source.pos = saved_pos
            return None

        min_count = max_count = int(min_count)

    if not source.match ("}"):
        # No closing brace, so treat it as a literal.
        source.pos = saved_pos
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, saved_pos)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          saved_pos)

    return min_count, max_count
def parse_fuzzy(source, info, ch, case_flags):
    "Parses a fuzzy setting, if present; returns the constraints dict or None."
    saved_pos = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        # One or more comma-separated fuzzy items.
        while True:
            parse_fuzzy_item(source, constraints)
            if not source.match(","):
                break
    except ParseError:
        # It wasn't a fuzzy setting after all; back out.
        source.pos = saved_pos
        return None

    if source.match(":"):
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints
def parse_fuzzy_item(source, constraints):
    "Parses one item of a fuzzy setting."
    saved_pos = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a simple cost constraint; try it as a cost equation instead.
        source.pos = saved_pos
        parse_cost_equation(source, constraints)
def parse_cost_constraint(source, constraints):
    "Parses a cost constraint, eg. 'e<=2' or '1<=i<=3'."
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits?
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()
def parse_cost_limit(source):
    "Parses the numeric limit of a fuzzy cost."
    limit_pos = source.pos
    digits = parse_count(source)

    # get_while only returns digit characters, so int() can fail only when
    # there were no digits at all.
    if digits:
        return int(digits)

    raise error("bad fuzzy cost limit", source.string, limit_pos)
def parse_constraint(source, constraints, ch):
    "Parses a constraint name ('d', 'e', 'i' or 's'), rejecting duplicates."
    if ch not in "deis" or ch in constraints:
        raise ParseError()

    return ch
def parse_fuzzy_compare(source):
    "Parses a cost comparator: True for '<=', False for '<', None if absent."
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None
def parse_cost_equation(source, constraints):
    "Parses a cost equation, eg. '2i+2d+1s<=4'."
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}

    # One or more '+'-separated terms.
    while True:
        parse_cost_term(source, cost)
        if not source.match("+"):
            break

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    max_cost = int(parse_count(source))

    # An exclusive limit ('<') excludes the stated cost itself.
    if not max_inc:
        max_cost -= 1

    if max_cost < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = max_cost

    constraints["cost"] = cost
def parse_cost_term(source, cost):
    "Parses one term of a cost equation, eg. '2i'."
    coefficient = parse_count(source)
    ch = source.get()
    if ch not in "dis":
        raise ParseError()

    if ch in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient means 1.
    cost[ch] = int(coefficient or 1)
def parse_fuzzy_test(source, info, case_flags):
    "Parses the test (after ':') of a fuzzy setting: a set or literal element."
    saved_pos = source.pos
    ch = source.get()
    if ch in SPECIAL_CHARS:
        if ch == "\\":
            # An escape sequence outside a set.
            return parse_escape(source, info, False)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                return AnyAll()
            elif info.flags & WORD:
                return AnyU()
            else:
                return Any()
        elif ch == "[":
            # A character set.
            return parse_set(source, info)
        else:
            # No other special character is a valid fuzzy test.
            raise error("expected character set", source.string, saved_pos)
    elif ch:
        # A literal.
        return Character(ord(ch), case_flags=case_flags)
    else:
        # End of the pattern.
        raise error("expected character set", source.string, saved_pos)
def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the (possibly empty) run of digit characters.
    return source.get_while(DIGITS)
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.  Returns None when the construct consumed input but produced
    no element (eg. a comment or inline flags).
    """
    saved_pos = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Scoped flags must not leak out of the group.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)
        if (ch == "+" or ch == "-") and source.peek() in DIGITS:
            # (?+n... or (?-n...: a relative call to a group.
            return parse_rel_call_group(source, info, ch, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags must not leak out of the group.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)
def parse_extension(source, info):
    "Parses a Python extension: (?P<name>...), (?P=name) or (?P>name)."
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Scoped flags must not leak out of the group.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)
def parse_comment(source):
    "Parses a (?#...) comment; always returns None."
    while True:
        saved_pos = source.pos
        ch = source.get(True)

        if ch == ")" or not ch:
            break

        if ch == "\\":
            # The next character is escaped, so skip it too.
            source.get(True)

    # Reposition on the ")" (or end) so that expect() reports properly.
    source.pos = saved_pos
    source.expect(")")

    return None
def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround: (?=...), (?!...), (?<=...) or (?<!...)."
    saved_flags = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags set inside the lookaround must not leak out.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, body)
def parse_conditional(source, info):
    "Parses a conditional subpattern: (?(group)yes|no) or a lookaround form."
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

        source.pos = saved_pos
        raise error("expected lookaround conditional", source.string,
          source.pos)

    source.pos = saved_pos
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            no_branch = Sequence()

        source.expect(")")
    finally:
        # Restore any scoped flags changed inside the branches.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    # A conditional whose branches are both empty matches nothing either way.
    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)
def parse_lookaround_conditional(source, info, behind, positive):
    "Parses a conditional subpattern whose test is a lookaround."
    saved_flags = info.flags
    try:
        lookaround = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags set inside the lookaround must not leak out.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    no_branch = parse_sequence(source, info) if source.match("|") else Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, lookaround, yes_branch,
      no_branch)
def parse_atomic(source, info):
    "Parses an atomic (non-backtracking) subpattern: (?>...)."
    saved_flags = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags set inside the subpattern must not leak out.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(body)
def parse_common(source, info):
    "Parses a common/reset groups branch: (?|...)."
    # Each branch re-uses the same capture group numbers, so reset the count
    # at the start of every branch and keep the highest count reached.
    start_count = info.group_count
    branches = [parse_sequence(source, info)]
    highest_count = info.group_count
    while source.match("|"):
        info.group_count = start_count
        branches.append(parse_sequence(source, info))
        highest_count = max(highest_count, info.group_count)

    info.group_count = highest_count
    source.expect(")")

    return branches[0] if len(branches) == 1 else Branch(branches)
def parse_call_group(source, info, ch, pos):
    "Parses a call to a group: (?R), (?0), (?1), ..."
    # (?R) is a synonym for calling the whole pattern, ie. group 0.
    group = "0" if ch == "R" else ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)
def parse_rel_call_group(source, info, ch, pos):
    "Parses a relative call to a group: (?+n...) or (?-n...)."
    digits = source.get_while(DIGITS)
    if not digits:
        raise error("missing relative group number", source.string, source.pos)

    offset = int(digits)
    if ch == "+":
        group = info.group_count + offset
    else:
        # -1 refers to the most recently numbered group.
        group = info.group_count - offset + 1
    if group <= 0:
        raise error("invalid relative group number", source.string, source.pos)

    source.expect(")")

    return CallGroup(info, group, pos)
def parse_call_named_group(source, info, pos):
    "Parses a call to a named group: (?&name) or (?P>name)."
    name = parse_name(source)
    source.expect(")")

    return CallGroup(info, name, pos)
def parse_flag_set(source):
    "Parses a set of inline flags, eg. 'imsx'; returns the combined flags."
    flags = 0

    try:
        while True:
            saved_pos = source.pos
            ch = source.get()
            if ch == "V":
                # Version flags are two characters: "V0" or "V1".
                ch += source.get()
            flags |= REGEX_FLAGS[ch]
    except KeyError:
        # Not a flag character: put it back and stop.
        source.pos = saved_pos

    return flags
def parse_flags(source, info):
    "Parses flags being turned on/off; returns (flags_on, flags_off)."
    flags_on = parse_flag_set(source)
    flags_off = 0
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off
def parse_subpattern(source, info, flags_on, flags_off):
    """Parses a subpattern with scoped flags, e.g. "(?i:...)".

    The flag changes apply only until the closing ")", so the previous
    flags are saved and restored.
    """
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set.
    # NOTE(review): this clears all inherited encoding bits and keeps only
    # those present in flags_on — confirm this is the intended resolution
    # when the outer scope already had an encoding flag.
    if info.flags & (ASCII | LOCALE | UNICODE):
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the outer scope's flags even if parsing fails.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern
def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's a inline flags, then None is returned.

    Raises _UnscopedFlagSet to force a reparse of the whole pattern when
    a global flag is turned on mid-pattern.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # "(?flags:...)": a subpattern with scoped flags.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # "(?flags)": positional flags applying from here on.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)
def parse_positional_flags(source, info, flags_on, flags_off):
    """Applies inline flags of the form (?flags) from this point onward."""
    new_flags = (info.flags | flags_on) & ~flags_off
    info.flags = new_flags
    source.ignore_space = bool(new_flags & VERBOSE)
def parse_name(source, allow_numeric=False, allow_group_0=False):
    """Parses a group name and validates it.

    Raises error if the name is empty, is numeric when numbers aren't
    allowed (or below the minimum group number), or isn't an identifier.
    """
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        min_group = 0 if allow_group_0 else 1
        valid = allow_numeric and int(name) >= min_group
    else:
        valid = name.isidentifier()

    if not valid:
        raise error("bad character in group name", source.string,
          source.pos)

    return name
def is_octal(string):
    """Checks whether a string contains only octal digits."""
    for ch in string:
        if ch not in OCT_DIGITS:
            return False

    return True
def is_decimal(string):
    """Checks whether a string contains only decimal digits."""
    for ch in string:
        if ch not in DIGITS:
            return False

    return True
def is_hexadecimal(string):
    """Checks whether a string contains only hexadecimal digits."""
    for ch in string:
        if ch not in HEX_DIGITS:
            return False

    return True
def parse_escape(source, info, in_set):
    """Parses an escape sequence and returns a parse-tree node.

    `in_set` is True when parsing inside a character set, where some
    escapes (group references, anchors, \\L, \\R, \\X, positional
    escapes) aren't allowed.
    """
    # Read the escaped character with whitespace kept significant, even
    # in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending: CR LF as a unit, or any single line-ending
        # character, matched atomically.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            charset.extend([0x85, 0x2028, 0x2029])

        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            # The active flags select which positional-escape table applies.
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # Charset escapes, again chosen by the encoding flags.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # Single-character escapes.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
def parse_numeric_escape(source, info, ch, in_set):
    """Parses a numeric escape: an octal escape or a group reference.

    In a set, or starting with "0", it's always octal; otherwise 3 octal
    digits make an octal escape and 1-2 digits name a group.
    """
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            if encoding == ASCII or encoding == LOCALE:
                # Narrow encodings are limited to a single byte.
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference: push back the last character read.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)
def parse_octal_escape(source, info, digits, in_set):
    """Parses an octal escape sequence of at most 3 digits.

    `digits` holds the digit(s) already read (the first of which may
    not actually be octal, e.g. \\8 inside a set).
    """
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Push back the first non-octal character.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # The digits weren't a valid octal number.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)
def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    """Parses exactly `expected_len` hex digits and builds a character.

    Raises error if a digit is missing or the value is outside the
    Unicode range.
    """
    saved_pos = source.pos
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, saved_pos)
        digits.append(ch)

    try:
        value = int("".join(digits), 16)
    except ValueError:
        value = None

    if value is not None and value < 0x110000:
        return make_character(info, value, in_set)

    # Bad hex escape (out of the Unicode range).
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, saved_pos)
def parse_group_ref(source, info):
    """Parses a group reference of the form <name> or <number>."""
    source.expect("<")
    name_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")

    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, name_pos)
def parse_string_set(source, info):
    """Parses a named-list (string set) reference, e.g. \\L<name>."""
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")

    known = name is not None and name in info.kwargs
    if not known:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, name)
def parse_named_char(source, info, in_set):
    """Parses \\N{...}; falls back to a literal "N" if malformed."""
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                codepoint = unicodedata.lookup(name)
                return make_character(info, ord(codepoint), in_set)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character; treat the "N" as a literal.
    source.pos = saved_pos
    return make_character(info, ord("N"), in_set)
def parse_property(source, info, positive, in_set):
    """Parses a Unicode property escape, \\p{...} or \\pX.

    `positive` is False for \\P.  Falls back to a literal "p"/"P" if a
    valid property doesn't follow.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        # Braced form, possibly negated: \p{^name} or \p{name=value}.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            if info.flags & ASCII:
                encoding = ASCII_ENCODING
            elif info.flags & UNICODE:
                encoding = UNICODE_ENCODING
            else:
                encoding = 0

            prop = lookup_property(prop_name, name, positive != negate, source,
              encoding=encoding)
            return make_property(info, prop, in_set)
        # No closing "}": fall through to the literal fallback below.
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        if info.flags & ASCII:
            encoding = ASCII_ENCODING
        elif info.flags & UNICODE:
            encoding = UNICODE_ENCODING
        else:
            encoding = 0

        prop = lookup_property(None, ch, positive, source, encoding=encoding)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)
def parse_property_name(source):
    """Parses a property name, which may be qualified.

    Returns (prop_name, name): prop_name is None for an unqualified name
    such as "Lu", and set for "Script=Latin" or "Script:Latin".
    """
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Back up past anything not consumed as part of the name.
    source.pos = saved_pos
    return prop_name, name
def parse_set(source, info):
    """Parses a character set "[...]", honouring the regex version."""
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    saved_ignore = source.ignore_space
    source.ignore_space = False
    # A leading "^" negates the set.
    negate = source.match("^")
    try:
        # Version 0 has no set operators, so members form an implicit union.
        parser = parse_set_imp_union if version == VERSION0 else parse_set_union
        item = parser(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    return item.with_flags(case_flags=make_case_flags(info))
def parse_set_union(source, info):
    """Parses a set union ([x||y]) of symmetric differences."""
    members = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        members.append(parse_set_symm_diff(source, info))

    return members[0] if len(members) == 1 else SetUnion(info, members)
def parse_set_symm_diff(source, info):
    """Parses a set symmetric difference ([x~~y]) of intersections."""
    members = [parse_set_inter(source, info)]
    while source.match("~~"):
        members.append(parse_set_inter(source, info))

    return members[0] if len(members) == 1 else SetSymDiff(info, members)
def parse_set_inter(source, info):
    """Parses a set intersection ([x&&y]) of differences."""
    members = [parse_set_diff(source, info)]
    while source.match("&&"):
        members.append(parse_set_diff(source, info))

    return members[0] if len(members) == 1 else SetInter(info, members)
def parse_set_diff(source, info):
    """Parses a set difference ([x--y]) of implicit unions."""
    members = [parse_set_imp_union(source, info)]
    while source.match("--"):
        members.append(parse_set_imp_union(source, info))

    return members[0] if len(members) == 1 else SetDiff(info, members)
def parse_set_imp_union(source, info):
    """Parses a set implicit union ([xy]).

    Members are parsed until "]" or, in version 1, a set operator.
    """
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set: push the "]" back for the caller to consume.
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators; push the operator back.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)
def parse_set_member(source, info):
    """Parses a member in a character set: a single item or a range."""
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A degenerate range such as a-a is just the character.
        return start

    return Range(start.value, end.value)
def parse_set_item(source, info):
    """Parses an item in a character set: an escape, a POSIX class, a
    nested set (version 1 only), or a literal character."""
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    return Character(ord(ch))
def parse_posix_class(source, info):
    """Parses a POSIX class "[:name:]"; raises ParseError if it isn't one."""
    negated = source.match("^")
    qualifier, class_name = parse_property_name(source)
    if not source.match(":]"):
        raise ParseError()

    return lookup_property(qualifier, class_name, not negated, source,
      posix=True)
def float_to_rational(flt):
    """Converts a float to a (numerator, denominator) pair.

    Uses a continued-fraction expansion, stopping once the remaining
    fractional part is negligible.  May raise ZeroDivisionError for
    pathological inputs; callers catch it (see standardise_name).
    """
    int_part = int(flt)
    # Renamed from `error`, which shadowed this module's `error`
    # exception class.
    frac_part = flt - int_part
    if abs(frac_part) < 0.0001:
        return int_part, 1

    den, num = float_to_rational(1.0 / frac_part)

    return int_part * den + num, den
def numeric_to_rational(numeric):
    """Converts a numeric string to "num/den" form, if possible.

    Accepts an optional leading "-" and either "x" or "x/y"; raises
    ValueError for anything else.  A denominator of 1 is omitted.
    """
    if numeric[ : 1] == "-":
        sign, numeric = numeric[0], numeric[1 : ]
    else:
        sign = ""

    parts = numeric.split("/")
    if len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    elif len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    else:
        raise ValueError()

    result = f"{sign}{num}/{den}"
    if result.endswith("/1"):
        result = result[ : -2]

    return result
def standardise_name(name):
    """Standardises a property or value name for table lookup.

    Numeric names become rationals; other names are uppercased with
    underscores, hyphens and spaces removed.
    """
    text = "".join(name)
    try:
        return numeric_to_rational(text)
    except (ValueError, ZeroDivisionError):
        return "".join(ch for ch in text if ch not in "_- ").upper()
# POSIX classes whose names clash with property names; lookup_property
# gives them a "POSIX" prefix.
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The full set of value names of a binary (yes/no) property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    """Looks up a property and returns a Property node.

    Resolution order: explicit property=value pair; a GC, script or
    block value; a binary property name; an "IS"-prefixed binary
    property; an "IS"/"IN"-prefixed script or block.  Raises error if
    nothing matches.
    """
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        # "Assigned" is the complement of "Unassigned".
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and value.upper() in _POSIX_CLASSES:
        # Use the POSIX-specific variant of the class.
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        # Ids are packed as (property << 16) | value.
        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix;
    # value[2 : ] strips the 2-character prefix.
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)
def _compile_replacement(source, pattern, is_unicode):
    """Compiles one escape sequence in a replacement template.

    Returns (is_group, codes): is_group is True when codes holds a group
    index rather than literal codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group preference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    # Bytes replacements are limited to a single byte per octal escape.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence, up to 3 digits.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
def parse_repl_hex_escape(source, expected_len, type):
    """Parses exactly `expected_len` hex digits in a replacement string."""
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, source.pos)
        digits.append(ch)

    return int("".join(digits), 16)
def parse_repl_named_char(source):
    """Parses \\N{...} in a replacement; returns None if malformed."""
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                return ord(unicodedata.lookup(name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character.
    source.pos = saved_pos
    return None
def compile_repl_group(source, pattern):
    """Compiles a group reference in a replacement template to an index."""
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")

    if not name.isdigit():
        # A named group.
        try:
            return pattern.groupindex[name]
        except KeyError:
            raise IndexError("unknown group")

    index = int(name)
    if not 0 <= index <= pattern.groups:
        raise error("invalid group reference", source.string, source.pos)

    return index
# The regular expression is parsed into a syntax tree. The different types of
# node are defined below.

# Indentation unit used by the nodes' dump() methods.
INDENT = " "
# Bit flags packed into compiled opcodes.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# Encoding bits are shifted above the flag bits.
ENCODING_OP_SHIFT = 5

# Human-readable labels used by dump() methods.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
def make_sequence(items):
    """Wraps a list of nodes in a Sequence, unless it's a single node."""
    return items[0] if len(items) == 1 else Sequence(items)
1940# Common base class for all nodes.
class RegexBase:
    """Common base class for all parse-tree nodes.

    Provides default (mostly no-op) implementations that subclasses
    override.  `_key` is the identity used for hashing and equality.
    """
    def __init__(self):
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        # Returns self if nothing changes, otherwise a rebuilt copy with
        # the requested flag values.
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        # Default: nothing to fix.
        pass

    def optimise(self, info, reverse):
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # Default: no usable firstset can be computed.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        return self.max_width(), None
2010# Base class for zero-width nodes.
class ZeroWidthBase(RegexBase):
    """Base class for zero-width (position-matching) nodes."""
    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        self.positive = bool(positive)
        self.encoding = encoding

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # Matches no characters, so the firstset is "anything".
        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        # Zero-width by definition.
        return 0
class Any(RegexBase):
    """The "." atom (opcode OP.ANY)."""
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        # Always exactly one character.
        return 1
class AnyAll(Any):
    """The "." atom compiled with opcode OP.ANY_ALL."""
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"

    def __init__(self):
        # Doesn't call Any.__init__; sets the key attributes directly.
        self.positive = True
        self.zerowidth = False
        self.case_flags = 0

        self._key = self.__class__, self.positive
class AnyU(Any):
    """The "." atom compiled with opcode OP.ANY_U."""
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
class Atomic(RegexBase):
    """An atomic (non-backtracking) group around a subpattern."""
    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An atomic group around an empty subpattern is just the
        # subpattern.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Wrap the compiled subpattern in ATOMIC ... END.
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    # NOTE(review): defining __eq__ without __hash__ makes instances
    # unhashable in Python 3 (RegexBase.__hash__ is masked) — confirm
    # Atomic nodes are never used as dict keys or set members.
    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)
class Boundary(ZeroWidthBase):
    """A word-boundary assertion (positive flag selects \\b vs \\B)."""
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
2134class Branch(RegexBase):
    def __init__(self, branches):
        RegexBase.__init__(self)
        # The alternatives, in order of preference.
        self.branches = branches
2139 def fix_groups(self, pattern, reverse, fuzzy):
2140 for b in self.branches:
2141 b.fix_groups(pattern, reverse, fuzzy)
    def optimise(self, info, reverse):
        """Optimises the branch node.

        Flattens nested branches, hoists a common prefix (or suffix when
        reversed), reduces single-character branches to a set, and adds
        a firstset precheck where possible.
        """
        if not self.branches:
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            if not prefix or not suffix:
                # We might be able to add a quick precheck before the branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    # The precheck goes on the side we match first.
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)
    def _add_precheck(self, info, reverse, branches):
        """Builds a firstset precheck if every branch is a caseless literal.

        Returns None (via a bare return or explicitly) when no precheck
        can be built.
        """
        charset = set()
        # The character checked is the first (or last, when reversed).
        pos = -1 if reverse else 0

        for branch in branches:
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                # A non-literal branch: no precheck is possible.
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in charset])
2193 def pack_characters(self, info):
2194 self.branches = [b.pack_characters(info) for b in self.branches]
2195 return self
2197 def remove_captures(self):
2198 self.branches = [b.remove_captures() for b in self.branches]
2199 return self
2201 def is_atomic(self):
2202 return all(b.is_atomic() for b in self.branches)
2204 def can_be_affix(self):
2205 return all(b.can_be_affix() for b in self.branches)
2207 def contains_group(self):
2208 return any(b.contains_group() for b in self.branches)
2210 def get_firstset(self, reverse):
2211 fs = set()
2212 for b in self.branches:
2213 fs |= b.get_firstset(reverse)
2215 return fs or set([None])
    def _compile(self, reverse, fuzzy):
        # An empty branch list compiles to nothing.
        if not self.branches:
            return []

        # BRANCH alt1 NEXT alt2 NEXT ... altN END
        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        # The final NEXT is replaced by END to close the branch.
        code[-1] = (OP.END, )

        return code
2230 def dump(self, indent, reverse):
2231 print("{}BRANCH".format(INDENT * indent))
2232 self.branches[0].dump(indent + 1, reverse)
2233 for b in self.branches[1 : ]:
2234 print("{}OR".format(INDENT * indent))
2235 b.dump(indent + 1, reverse)
2237 @staticmethod
2238 def _flatten_branches(info, reverse, branches):
2239 # Flatten the branches so that there aren't branches of branches.
2240 new_branches = []
2241 for b in branches:
2242 b = b.optimise(info, reverse)
2243 if isinstance(b, Branch):
2244 new_branches.extend(b.branches)
2245 else:
2246 new_branches.append(b)
2248 return new_branches
    @staticmethod
    def _split_common_prefix(info, branches):
        """Hoists the longest common prefix out of the branches.

        Returns (prefix_items, new_branches).
        """
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches without the hoisted prefix.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches
    @staticmethod
    def _split_common_suffix(info, branches):
        """Hoist the longest common trailing items out of the branches.

        Mirror image of _split_common_prefix; returns (suffix_items,
        new_branches).
        """
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix? Scanned with negative indices,
        # from -1 backwards.
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
          in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches
    @staticmethod
    def _can_split(items, count):
        """Return whether 'items' may be split after the first 'count' items
        without breaking a run of characters that full-case-fold together.
        """
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True
    @staticmethod
    def _can_split_rev(items, count):
        """Like _can_split, but 'count' is measured from the end of 'items'
        (used when hoisting a common suffix).
        """
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True
2385 @staticmethod
2386 def _merge_common_prefixes(info, reverse, branches):
2387 # Branches with the same case-sensitive character prefix can be grouped
2388 # together if they are separated only by other branches with a
2389 # character prefix.
2390 prefixed = defaultdict(list)
2391 order = {}
2392 new_branches = []
2393 for b in branches:
2394 if Branch._is_simple_character(b):
2395 # Branch starts with a simple character.
2396 prefixed[b.value].append([b])
2397 order.setdefault(b.value, len(order))
2398 elif (isinstance(b, Sequence) and b.items and
2399 Branch._is_simple_character(b.items[0])):
2400 # Branch starts with a simple character.
2401 prefixed[b.items[0].value].append(b.items)
2402 order.setdefault(b.items[0].value, len(order))
2403 else:
2404 Branch._flush_char_prefix(info, reverse, prefixed, order,
2405 new_branches)
2407 new_branches.append(b)
2409 Branch._flush_char_prefix(info, prefixed, order, new_branches)
2411 return new_branches
2413 @staticmethod
2414 def _is_simple_character(c):
2415 return isinstance(c, Character) and c.positive and not c.case_flags
    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        """Collapse runs of single-char branches with the same case flags
        into set nodes; returns the new list of branches.
        """
        # Can the branches be reduced to a set?
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items, case_flags,
                      new_branches)

                    case_flags = b.case_flags

                # Collect the item with its case flags stripped; the flags
                # are re-applied when the set is flushed.
                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches
    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        """Emit the collected same-prefix branches into 'new_branches'.

        'prefixed' maps a leading character value to the item lists of the
        branches starting with it; 'order' records first-seen order. Both are
        cleared afterwards.
        """
        # Flush the prefixed branches.
        if not prefixed:
            return

        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        # Only one empty alternative is needed.
                        subbranches.append(Sequence())
                        optional = True

                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()
    @staticmethod
    def _flush_set_members(info, reverse, items, case_flags, new_branches):
        """Combine the collected single-char items into one node (a lone item
        is kept as-is), re-apply 'case_flags', append it to 'new_branches'
        and clear 'items'.
        """
        # Flush the set members.
        if not items:
            return

        if len(items) == 1:
            item = list(items)[0]
        else:
            item = SetUnion(info, list(items)).optimise(info, reverse)

        new_branches.append(item.with_flags(case_flags=case_flags))

        items.clear()
2486 @staticmethod
2487 def _is_full_case(items, i):
2488 if not 0 <= i < len(items):
2489 return False
2491 item = items[i]
2492 return (isinstance(item, Character) and item.positive and
2493 (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
    @staticmethod
    def _is_folded(items):
        """Return whether the case-folded concatenation of 'items' equals the
        full case-folding of some single character (i.e. the items form one
        folded unit that must not be split).
        """
        if len(items) < 2:
            return False

        # Only positive, case-insensitive Character nodes can participate.
        for i in items:
            if (not isinstance(i, Character) or not i.positive or not
              i.case_flags):
                return False

        folded = "".join(chr(i.value) for i in items)
        folded = _regex.fold_case(FULL_CASE_FOLDING, folded)

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        for c in expanding_chars:
            if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
                return True

        return False
2517 def is_empty(self):
2518 return all(b.is_empty() for b in self.branches)
2520 def __eq__(self, other):
2521 return type(self) is type(other) and self.branches == other.branches
2523 def max_width(self):
2524 return max(b.max_width() for b in self.branches)
class CallGroup(RegexBase):
    """A subpattern call node: re-applies a group's pattern by reference."""

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        # Group name or number; resolved to an int index in fix_groups.
        self.group = group
        # Position within the pattern string, used for error reporting.
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference to a group number and register this
        call with the pattern info for later compilation.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            # Not numeric: look it up as a named group.
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        # A group opened more than once can't be called unambiguously.
        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        self._key = self.__class__, self.group

    def remove_captures(self):
        # NOTE(review): 'self.pattern' is never assigned in this class;
        # confirm RegexBase provides it, otherwise this raises AttributeError
        # instead of the intended error.
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        # 'call_ref' is assigned externally (via info.group_calls) before
        # this node is compiled.
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        # The called group could match anything, so assume unlimited width.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class CallRef(RegexBase):
    """Associates a call reference number with a parsed subpattern."""

    def __init__(self, ref, parsed):
        # Note: RegexBase.__init__ is deliberately not called, matching the
        # original implementation.
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        # CALL_REF <ref> ... END brackets the compiled subpattern.
        code = [(OP.CALL_REF, self.ref)]
        code.extend(self.parsed._compile(reverse, fuzzy))
        code.append((OP.END, ))
        return code
class Character(RegexBase):
    """A node that matches (or, if negative, rejects) a single codepoint."""

    # Opcode selected by (case flags, reversed?); full-case variants compile
    # to the same opcodes as their simple counterparts.
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Under full case-folding one character may fold to several
        # codepoints; keep the folded string for width and compile checks.
        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy with the given polarity/case/zero-width settings."""
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # A single character can't be simplified further.
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding, so also try
            # matching the multi-codepoint folded form.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        """Does codepoint 'ch' satisfy this node (respecting polarity)?"""
        return (ch == self.value) == self.positive

    def max_width(self):
        # More than 1 if the character expands on full case-folding.
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        # Cached for use by the compiler when building the required string.
        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self
class Conditional(RegexBase):
    """A conditional subpattern: matches 'yes_item' if the referenced group
    has matched, otherwise 'no_item'.
    """

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        # Group name or number; resolved to an int index in fix_groups.
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        # Position within the pattern string, used for error reporting.
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference and fix up both alternatives."""
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # BUG FIX: previously returned None, but callers rebind the node to
        # this method's result, as every other node class here does.
        return self

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        # Either alternative could start the match.
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        # GROUP_EXISTS <group> <yes-code> [NEXT <no-code>] END
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class DefaultBoundary(ZeroWidthBase):
    """Zero-width assertion compiled with OP.DEFAULT_BOUNDARY."""

    _op_name = "DEFAULT_BOUNDARY"
    _opcode = OP.DEFAULT_BOUNDARY
class DefaultEndOfWord(ZeroWidthBase):
    """Zero-width assertion compiled with OP.DEFAULT_END_OF_WORD."""

    _op_name = "DEFAULT_END_OF_WORD"
    _opcode = OP.DEFAULT_END_OF_WORD
class DefaultStartOfWord(ZeroWidthBase):
    """Zero-width assertion compiled with OP.DEFAULT_START_OF_WORD."""

    _op_name = "DEFAULT_START_OF_WORD"
    _opcode = OP.DEFAULT_START_OF_WORD
class EndOfLine(ZeroWidthBase):
    """Zero-width assertion compiled with OP.END_OF_LINE."""

    _op_name = "END_OF_LINE"
    _opcode = OP.END_OF_LINE
class EndOfLineU(EndOfLine):
    """EndOfLine variant compiled with the Unicode opcode OP.END_OF_LINE_U."""

    _op_name = "END_OF_LINE_U"
    _opcode = OP.END_OF_LINE_U
class EndOfString(ZeroWidthBase):
    """Zero-width assertion compiled with OP.END_OF_STRING."""

    _op_name = "END_OF_STRING"
    _opcode = OP.END_OF_STRING
class EndOfStringLine(ZeroWidthBase):
    """Zero-width assertion compiled with OP.END_OF_STRING_LINE."""

    _op_name = "END_OF_STRING_LINE"
    _opcode = OP.END_OF_STRING_LINE
class EndOfStringLineU(EndOfStringLine):
    """EndOfStringLine variant compiled with OP.END_OF_STRING_LINE_U."""

    _op_name = "END_OF_STRING_LINE_U"
    _opcode = OP.END_OF_STRING_LINE_U
class EndOfWord(ZeroWidthBase):
    """Zero-width assertion compiled with OP.END_OF_WORD."""

    _op_name = "END_OF_WORD"
    _opcode = OP.END_OF_WORD
class Failure(ZeroWidthBase):
    """Zero-width node that always emits the FAILURE opcode."""

    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        # Direction and fuzziness don't change the emitted instruction.
        return [(OP.FAILURE, )]
class Fuzzy(RegexBase):
    """Wraps a subpattern for fuzzy (approximate) matching.

    'constraints' maps error types ("d"eletions, "i"nsertions,
    "s"ubstitutions, "e"rrors overall) to (min, max) pairs, plus an optional
    "cost" equation and an optional "test" subpattern. Missing entries are
    filled with defaults in __init__.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is itself fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        # Insertions make the width unbounded.
        return UNLIMITED

    def _constraints_to_string(self):
        """Render the constraints in the pattern's textual form, e.g.
        'i<=2,2i+1s<=5'.
        """
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)
class Grapheme(RegexBase):
    """Matches one or more characters up to a grapheme-cluster boundary."""

    def _compile(self, reverse, fuzzy):
        # Match at least 1 character until a grapheme boundary is reached. Note
        # that this is the same whether matching forwards or backwards.
        grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
          GraphemeBoundary()]))

        return grapheme_matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print("{}GRAPHEME".format(INDENT * indent))

    def max_width(self):
        # A grapheme cluster has no fixed upper length.
        return UNLIMITED
class GraphemeBoundary:
    # Zero-width grapheme-boundary check. Not a RegexBase subclass: it only
    # provides 'compile' for use inside Grapheme's generated matcher.
    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]
class GreedyRepeat(RegexBase):
    """A greedy repeat of a subpattern; min_count..max_count repetitions.

    max_count is None for an unbounded repeat. Subclasses override _opcode
    and _op_name (and possibly _compile) for lazy/possessive variants.
    """

    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        # None means unbounded.
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) preserves the greedy/lazy/possessive subclass.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        # Only a fixed-count repeat of an atomic subpattern is atomic.
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat may match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        return ([tuple(repeat)] + subpattern + [(OP.END, )])

    def dump(self, indent, reverse):
        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # An optional repeat contributes no required string, only an
            # offset bound.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None
class PossessiveRepeat(GreedyRepeat):
    """A repeat that never gives back: the repeat is wrapped in ATOMIC."""

    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        # One END closes the repeat, the other closes the ATOMIC wrapper.
        return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ),
          (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)
class Group(RegexBase):
    """A capture group wrapping a subpattern."""

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        # Group number; negative numbers denote private (hidden) groups.
        self.group = group
        self.subpattern = subpattern

        # Assigned externally when this group is the target of a call.
        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # The group itself is dropped; only its contents remain.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # Map a private (negative) group number to its public number and a
        # slot above the public group range.
        public_group = private_group = self.group
        if private_group < 0:
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        # If this group is called via (?n)/(?&name), bracket it with CALL_REF.
        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class Keep(ZeroWidthBase):
    """Zero-width node compiled with OP.KEEP."""

    _op_name = "KEEP"
    _opcode = OP.KEEP
class LazyRepeat(GreedyRepeat):
    """A repeat that reuses GreedyRepeat's machinery with the lazy opcode."""

    _op_name = "LAZY_REPEAT"
    _opcode = OP.LAZY_REPEAT
class LookAround(RegexBase):
    """A lookahead/lookbehind assertion around a subpattern."""

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # The subpattern is matched in the lookaround's own direction, not
        # the enclosing pattern's.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        # A positive lookaround for nothing always succeeds.
        if self.positive and subpattern.is_empty():
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        # Only a positive assertion scanning in the same direction as the
        # match constrains the first character.
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        # NOTE(review): the subpattern is compiled without the 'fuzzy'
        # argument (compile's default applies); confirm this is intentional.
        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Assertions consume no characters.
        return 0
class LookAroundConditional(RegexBase):
    """A conditional whose test is a lookaround: picks 'yes_item' if the
    lookaround subpattern matches, otherwise 'no_item'.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # All three parts are optimised for the lookaround's direction.
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # BUG FIX: previously returned None, but callers rebind the node to
        # this method's result, as every other node class here does.
        return self

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        # CONDITIONAL <test-code> NEXT <yes-code> [NEXT <no-code>] END
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # Parentheses make the existing 'and'/'or' precedence explicit.
        # NOTE(review): '(sub and yes) or no' is what the original computed;
        # 'sub and (yes or no)' may have been intended — confirm upstream.
        return ((self.subpattern.is_empty() and self.yes_item.is_empty()) or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None
class PrecompiledCode(RegexBase):
    """Holds a pre-built instruction; emitted verbatim when compiled."""

    # NOTE(review): RegexBase.__init__ is not called here, so this node has
    # no _key; confirm it is never hashed/compared like the other nodes.
    def __init__(self, code):
        self.code = code

    def _compile(self, reverse, fuzzy):
        # Direction and fuzziness were decided when the code was built.
        return [tuple(self.code)]
class Property(RegexBase):
    """A node matching a single character by Unicode property value."""

    # Opcode selected by (case flags, reversed?); full-case variants compile
    # to the same opcodes as their simple counterparts.
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False, encoding=0):
        RegexBase.__init__(self)
        # Packed property id: high bits select the property, low 16 bits the
        # value (see dump below).
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        # Shifted into the opcode flags; dump treats it as 0 or 1 (ASCII).
        self.encoding = encoding

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy with the given polarity/case/zero-width settings."""
        return Property(self.value, positive, case_flags, zerowidth,
          self.encoding)

    def optimise(self, info, reverse, in_set=False):
        # A property test can't be simplified further.
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        # Unpack property name and value name from the packed id.
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
          ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        """Does codepoint 'ch' satisfy this node (respecting polarity)?"""
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1
class Prune(ZeroWidthBase):
    "The (*PRUNE) backtracking-control verb."
    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        # PRUNE takes no operands; direction and fuzziness are irrelevant.
        return [(OP.PRUNE, )]
class Range(RegexBase):
    "An inclusive range of codepoints, e.g. a-z inside a set."
    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
      (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
      (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
      (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
    _op_name = "RANGE"

    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
      zerowidth=False):
        # lower and upper are inclusive codepoint bounds.
        RegexBase.__init__(self)
        self.lower = lower
        self.upper = upper
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self._key = (self.__class__, self.lower, self.upper, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Make a copy with different polarity/case/zero-width settings.
        return Range(self.lower, self.upper, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        """Expand the range for full case-folding where necessary.

        Characters in the range that fold to multiple codepoints are pulled
        out as String alternatives (so they can match their folded forms);
        the remainder of the range keeps simple case-folding.
        """
        # Is the range case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
          FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the range.
        items = []
        for ch in expanding_chars:
            if self.lower <= ord(ch) <= self.upper:
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                items.append(String([ord(c) for c in folded],
                  case_flags=self.case_flags))

        if not items:
            # We can fall back to simple case-folding.
            return self

        if len(items) < self.upper - self.lower + 1:
            # Not all the characters are covered by the full case-folding.
            items.insert(0, self)

        return Branch(items)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
          self.upper)]

    def dump(self, indent, reverse):
        display_lower = ascii(chr(self.lower)).lstrip("bu")
        display_upper = ascii(chr(self.upper)).lstrip("bu")
        print("{}RANGE {} {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display_lower, display_upper,
          CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (self.lower <= ch <= self.upper) == self.positive

    def max_width(self):
        # A range always matches exactly one codepoint.
        return 1
class RefGroup(RegexBase):
    r"A backreference to a capture group, e.g. \1 or (?P=name)."
    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        # group may still be a name or a digit string here; fix_groups
        # resolves it to a group number.
        self.group = group
        # Where the reference appeared in the pattern, for error reporting.
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the reference to a group number.

        Raises error for an unknown group name or an out-of-range number.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # Rebuild the key now that the group is a number.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # NOTE(review): self.pattern is never assigned on this class, so
        # reaching this line would likely raise AttributeError before the
        # intended error -- confirm (possibly should be a parameter or
        # self.info-derived).
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        # The referenced group's width isn't known here.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SearchAnchor(ZeroWidthBase):
    r"The search anchor \G (matches where the search started)."
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"
class Sequence(RegexBase):
    "A sequence of regex items matched one after another."

    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        # Delegate group fixing to every member.
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        items = []
        # Pending codepoints not yet flushed, all sharing case_flags.
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither
                    # the previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # Not a packable item: flush what we have, then recurse.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def get_firstset(self, reverse):
        # Union the members' first-sets until one member can't be empty
        # (i.e. its first-set doesn't contain None).
        # NOTE(review): items aliases self.items, so items.reverse() below
        # reverses self.items in place -- confirm this side effect is
        # intended.
        fs = set()
        items = self.items
        if reverse:
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        # Every member could match emptily.
        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Compile members in match order (reversed when matching backwards).
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        """Turn the pending codepoints into Character/String items.

        Clears `characters` in place afterwards.
        """
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Full case-folding may need the literal split into chunks.
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        # Find every occurrence of each expanded folding in the folded
        # literal; these spans must keep full case-folding.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Emit alternating IGNORECASE / FULLIGNORECASE literals.
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        # Merge overlapping or touching (start, end) spans after sorting.
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        # Return the first member (in match order) that supplies a required
        # string, with the accumulated offset to reach it.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None
class SetBase(RegexBase):
    "Base class for the character-set operation nodes (union, diff, ...)."

    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Make a copy with different flags and re-optimise it.
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # The set opcode is followed by its members and terminated by END.
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        """Wrap the set in a Branch when full case-folding applies.

        Members of the set that fold to multiple codepoints become String
        alternatives so they can match their folded forms.
        """
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Usually 1 codepoint, but full case-folding can make a set member
        # match a longer folded form.
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SetDiff(SetBase):
    "A set difference: the first item minus the remaining items."
    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = self.items
        if len(items) > 2:
            # A -- B -- C is equivalent to A -- (B | C).
            items = [items[0], SetUnion(info, items[1 : ])]

        if len(items) == 1:
            # A difference with one operand is just that operand.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(m.optimise(info, reverse, in_set=True) for m in
          items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # In the first set but not in the second.
        m = self.items[0].matches(ch) and not self.items[1].matches(ch)
        return m == self.positive
class SetInter(SetBase):
    "A set intersection of its items."
    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetInter) and m.positive:
                # Intersection in intersection: flatten it.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            # An intersection with one operand is just that operand.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = all(i.matches(ch) for i in self.items)
        return m == self.positive
class SetSymDiff(SetBase):
    "A symmetric difference of its items (membership in an odd number)."
    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetSymDiff) and m.positive:
                # Symmetric difference in symmetric difference: flatten it.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            # A symmetric difference with one operand is just that operand.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR-fold the memberships: True iff an odd number of items match.
        m = False
        for i in self.items:
            m = m != i.matches(ch)

        return m == self.positive
class SetUnion(SetBase):
    "A set union of its items."
    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union: flatten it.
                items.extend(m.items)
            elif isinstance(m, AnyAll):
                # A union containing "match anything" matches anything.
                return AnyAll()
            else:
                items.append(m)

        # Are there complementary properties?
        properties = (set(), set())

        for m in items:
            if isinstance(m, Property):
                properties[m.positive].add((m.value, m.case_flags, m.zerowidth))

        # A property and its negation together cover every codepoint.
        if properties[0] & properties[1]:
            return AnyAll()

        if len(items) == 1:
            # A union with one operand is that operand, with the polarity
            # combined (negative union of a negative item is positive).
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Batch the member characters into CHARACTER/STRING operands, which
        # the matcher can test more cheaply than individual items.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive
class Skip(ZeroWidthBase):
    "The (*SKIP) backtracking-control verb."
    _op_name = "SKIP"
    _opcode = OP.SKIP
class StartOfLine(ZeroWidthBase):
    "The start-of-line anchor ^ (MULTILINE behaviour)."
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"
class StartOfLineU(StartOfLine):
    "Start-of-line anchor recognising Unicode line separators (WORD flag)."
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"
class StartOfString(ZeroWidthBase):
    r"The start-of-string anchor \A."
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"
class StartOfWord(ZeroWidthBase):
    r"The start-of-word boundary \m."
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"
class String(RegexBase):
    "A string of codepoints to be matched literally."
    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        # NOTE: RegexBase.__init__ is not called here; _key is set directly
        # below.
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-compute the fully case-folded form used for matching.
            folded_characters = []
            for char in self.characters:
                folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
                folded_characters.extend(ord(c) for c in folded)
        else:
            folded_characters = self.characters

        self.folded_characters = tuple(folded_characters)
        # Set to True if this becomes the pattern's "required string".
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # The first character in match order (last of the string when
        # matching backwards).
        if reverse:
            pos = -1
        else:
            pos = 0
        return set([Character(self.characters[pos],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
          len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        return 0, self
class Literal(String):
    "A String produced by case-folding splitting; dumps as LITERAL MATCH."

    def dump(self, indent, reverse):
        # Render the codepoints as text, then as a printable repr without
        # any leading "b"/"u" prefix characters.
        text = "".join(map(chr, self.characters))
        display = ascii(text).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))
class StringSet(Branch):
    "A named list (\\L<name>), matched as a branch of its member strings."

    def __init__(self, info, name, case_flags=NOCASE):
        # NOTE: Branch.__init__ is not called; self.branches is built
        # directly below.
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            # Allocate the next index for this named list.
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        index = self.info.named_lists_used[self.set_key]
        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        encoding = self.info.flags & _ALL_ENCODINGS
        # NOTE(review): index and fold_flags are computed but not used in
        # this method -- presumably leftovers from an earlier version;
        # confirm before removing.
        fold_flags = encoding | case_flags

        choices = []

        for string in items:
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class Source:
    """Scanner for the regular expression source string.

    Bytes patterns are decoded via latin-1 so all scanning works on str;
    when ignore_space is True (VERBOSE mode), whitespace and #-comments are
    skipped. End-of-string is signalled by IndexError from indexing and
    ValueError from index() when a comment runs to the end; the methods
    below rely on catching exactly those.
    """

    def __init__(self, string):
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            # latin-1 maps each byte to the codepoint of the same ordinal,
            # so the round-trip is lossless.
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        self.ignore_space = False
        # An empty string/bytes of the same type as the original pattern.
        self.sep = string[ : 0]

    def peek(self, override_ignore=False):
        "Returns the next significant character without consuming it."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            return string[pos]
        except IndexError:
            # We've reached the end of the string.
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            return string[ : 0]

    def get(self, override_ignore=False):
        "Returns the next significant character, consuming it."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        "Returns up to `count` significant characters, consuming them."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True, keep_spaces=False):
        """Returns the run of characters (not) in test_set, consuming them.

        include=True collects characters in test_set; include=False collects
        those not in it. keep_spaces=True disables VERBOSE-mode skipping.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space and not keep_spaces:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        "Consumes the run of characters (not) in test_set, discarding them."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        """Consumes substring if it comes next, returning whether it did.

        The position is advanced only on a successful match.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                # Compare character by character, skipping ignorables
                # before each expected character.
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        "Consumes substring, raising error if it isn't next."
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        "Returns whether only ignorable characters (if any) remain."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        pos += 1
                    elif string[pos] == "#":
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True
class Info:
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs=None):
        """Initialise compile-time state for one pattern.

        flags: the requested flag bits; the defaults for the selected
            behaviour version are OR-ed in.
        char_type: chr for str patterns, a bytes factory for bytes patterns.
        kwargs: named lists passed to compile().  Previously this defaulted
            to a shared mutable dict ({}), which all Info instances without
            an explicit argument would alias; a None sentinel with a fresh
            dict per instance is backward-compatible and avoids accidental
            cross-instance sharing.
        """
        if kwargs is None:
            kwargs = {}

        # Fold in the default flags for the selected behaviour version.
        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = kwargs

        self.group_count = 0        # number of capture groups allocated
        self.group_index = {}       # group name -> group number
        self.group_name = {}        # group number -> group name
        self.char_type = char_type
        self.named_lists_used = {}  # (name, case_flags) -> list index
        self.open_groups = []       # stack of currently open groups
        self.open_group_count = {}  # group number -> times opened
        self.defined_groups = {}    # group number -> (parsed, reverse, fuzzy)
        self.group_calls = []
        self.private_groups = {}    # negative alias -> real group number

    def open_group(self, name=None):
        """Allocate (or re-use) a group number for a newly opened group.

        A named group re-opened while still open gets a temporary negative
        alias until a proper positive number can be assigned.
        """
        group = self.group_index.get(name)
        if group is None:
            # Find the next group number not already taken by a name.
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        "Closes the innermost open group."
        self.open_groups.pop()

    def is_open_group(self, name):
        """Returns whether the named/numbered group is currently open.

        In version 1, a group reference can refer to an open group, so we
        just pretend the group isn't open.
        """
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups
def _check_group_features(info, parsed):
    """Checks whether the reverse and fuzzy features of the group calls match
    the groups which they call.

    Each distinct (group, reverse, fuzzy) combination gets a call ref; when
    the called group (or the whole pattern, for group 0) doesn't already
    have the requested features, a copy with those features is queued in
    info.additional_groups.
    """
    call_refs = {}
    additional_groups = []
    for call, reverse, fuzzy in info.group_calls:
        # Look up the reference of this group call.
        key = (call.group, reverse, fuzzy)
        ref = call_refs.get(key)
        if ref is None:
            # This group doesn't have a reference yet, so look up its features.
            if call.group == 0:
                # Calling the pattern as a whole.
                rev = bool(info.flags & REVERSE)
                fuz = isinstance(parsed, Fuzzy)
                if (rev, fuz) != (reverse, fuzzy):
                    # The pattern as a whole doesn't have the features we want,
                    # so we'll need to make a copy of it with the desired
                    # features.
                    additional_groups.append((CallRef(len(call_refs), parsed),
                      reverse, fuzzy))
            else:
                # Calling a capture group.
                def_info = info.defined_groups[call.group]
                group = def_info[0]
                if def_info[1 : ] != (reverse, fuzzy):
                    # The group doesn't have the features we want, so we'll
                    # need to make a copy of it with the desired features.
                    additional_groups.append((group, reverse, fuzzy))

            ref = len(call_refs)
            call_refs[key] = ref

        call.call_ref = ref

    info.call_refs = call_refs
    info.additional_groups = additional_groups
def _get_required_string(parsed, flags):
    """Gets the required string and related info of a parsed pattern.

    Returns (offset, folded characters, case flags); the offset is -1 when
    it is unbounded, and the characters are empty when there is no required
    string.
    """
    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
    if not required:
        # No required string could be determined.
        return 0, (), 0

    # Mark the node so its compiled form carries the REQUIRED flag.
    required.required = True
    if req_offset >= UNLIMITED:
        req_offset = -1

    req_flags = required.case_flags
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags
class Scanner:
    """A lexical scanner built from a lexicon of (pattern, action) pairs.

    Each phrase in the lexicon is parsed (capture groups are forbidden within
    a phrase) and the phrases are combined into one big alternation, each
    wrapped in its own group so that ``lastindex`` identifies which phrase
    matched. ``scan`` then repeatedly matches that alternation against a
    string and invokes the matching phrase's action.
    """

    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern, one group per phrase.
        info = Info(flags)
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(patterns))

    def scan(self, string):
        """Scans the string, returning (list of action results, remainder).

        Scanning stops at the first position where no phrase matches, or
        where a phrase matches the empty string (which would otherwise loop
        forever); the unscanned remainder of the string is returned alongside
        the collected results.
        """
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # Zero-width match; stop to avoid an infinite loop.
                break
            action = self.lexicon[m.lastindex - 1][1]
            # Use callable() rather than hasattr(action, '__call__'): calling
            # an object goes through its type, so an instance-level
            # '__call__' attribute would pass the hasattr() test and then
            # fail when actually invoked below.
            if callable(action):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]
# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict: each property id maps to its
# longest known name together with a dict from value id to longest value
# name. On a tie in length the first name seen is kept, matching
# max(..., key=len) semantics.
PROPERTY_NAMES = {}
for prop_name, (prop_id, values) in PROPERTIES.items():
    longest_name, value_names = PROPERTY_NAMES.setdefault(prop_id, ("", {}))
    if len(prop_name) > len(longest_name):
        PROPERTY_NAMES[prop_id] = prop_name, value_names

    for val_name, val_id in values.items():
        if len(val_name) > len(value_names.get(val_id, "")):
            value_names[val_id] = val_name
# Character escape sequences: escape letter -> control character.
CHARACTER_ESCAPES = dict(
    a="\a",  # bell
    b="\b",  # backspace
    f="\f",  # form feed
    n="\n",  # newline
    r="\r",  # carriage return
    t="\t",  # horizontal tab
    v="\v",  # vertical tab
)
# Encoding selectors for the encoding-specific escape tables below.
ASCII_ENCODING, UNICODE_ENCODING = 1, 2
# Predefined character set escape sequences (\d, \D, \h, \s, \S, \w, \W),
# built from the corresponding properties.
CHARSET_ESCAPES = {
    letter: lookup_property(None, prop, positive)
    for letter, prop, positive in (
        ("d", "Digit", True),
        ("D", "Digit", False),
        ("h", "Blank", True),
        ("s", "Space", True),
        ("S", "Space", False),
        ("w", "Word", True),
        ("W", "Word", False),
    )
}
# Encoding-specific variants of the charset escapes. "h" (Blank) has no
# encoding-specific form here, so it is inherited unchanged from
# CHARSET_ESCAPES; the other entries are overridden.
ASCII_CHARSET_ESCAPES = {
    **CHARSET_ESCAPES,
    **{letter: lookup_property(None, prop, positive,
                               encoding=ASCII_ENCODING)
       for letter, prop, positive in (
           ("d", "Digit", True), ("D", "Digit", False),
           ("s", "Space", True), ("S", "Space", False),
           ("w", "Word", True), ("W", "Word", False),
       )},
}
UNICODE_CHARSET_ESCAPES = {
    **CHARSET_ESCAPES,
    **{letter: lookup_property(None, prop, positive,
                               encoding=UNICODE_ENCODING)
       for letter, prop, positive in (
           ("d", "Digit", True), ("D", "Digit", False),
           ("s", "Space", True), ("S", "Space", False),
           ("w", "Word", True), ("W", "Word", False),
       )},
}
# Positional escape sequences: escape letter -> zero-width assertion node.
POSITION_ESCAPES = {
    "A": StartOfString(),   # \A
    "b": Boundary(),        # \b: word boundary
    "B": Boundary(False),   # \B: not a word boundary
    "K": Keep(),            # \K
    "m": StartOfWord(),     # \m
    "M": EndOfWord(),       # \M
    "Z": EndOfString(),     # \Z
}
# Encoding-specific variants of the positional escapes. Only the
# word-related assertions (\b, \B, \m, \M) differ; the rest are inherited
# unchanged from POSITION_ESCAPES.
ASCII_POSITION_ESCAPES = {
    **POSITION_ESCAPES,
    "b": Boundary(encoding=ASCII_ENCODING),
    "B": Boundary(False, encoding=ASCII_ENCODING),
    "m": StartOfWord(encoding=ASCII_ENCODING),
    "M": EndOfWord(encoding=ASCII_ENCODING),
}
UNICODE_POSITION_ESCAPES = {
    **POSITION_ESCAPES,
    "b": Boundary(encoding=UNICODE_ENCODING),
    "B": Boundary(False, encoding=UNICODE_ENCODING),
    "m": StartOfWord(encoding=UNICODE_ENCODING),
    "M": EndOfWord(encoding=UNICODE_ENCODING),
}
# Positional escape sequences when the WORD flag is set: the word-related
# assertions use the default word-boundary rules instead.
WORD_POSITION_ESCAPES = {
    **POSITION_ESCAPES,
    "b": DefaultBoundary(),
    "B": DefaultBoundary(False),
    "m": DefaultStartOfWord(),
    "M": DefaultEndOfWord(),
}
# Regex control verbs, written in patterns as (*FAIL), (*PRUNE), (*SKIP).
# Note that "FAIL" and "F" deliberately get separate Failure() instances.
VERBS = {
    "FAIL": Failure(),
    "F": Failure(),  # (*F) is shorthand for (*FAIL).
    "PRUNE": Prune(),
    "SKIP": Skip(),
}