Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/regex/_regex_core.py: 44%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#
2# Secret Labs' Regular Expression Engine core module
3#
4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
5#
6# This version of the SRE library can be redistributed under CNRI's
7# Python 1.6 license. For any other use, please contact Secret Labs
8# AB (info@pythonware.com).
9#
10# Portions of this engine have been developed in cooperation with
11# CNRI. Hewlett-Packard provided funding for 1.6 integration and
12# other compatibility work.
13#
14# 2010-01-16 mrab Python front-end re-written and extended
16import enum
17import string
18import unicodedata
19from collections import defaultdict
21from regex import _regex
# The public names re-exported by the top-level regex module: every flag (both
# long and single-letter forms), the error class, Scanner and RegexFlag.
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
  "Scanner", "RegexFlag"]
# The regex exception.
class error(Exception):
    """Raised when a regular expression pattern cannot be compiled.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        # Use a bytes newline when the pattern is a bytes object.
        nl = '\n' if isinstance(pattern, str) else b'\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            # Line/column are derived from the last newline before `pos`.
            self.lineno = 1 + pattern.count(nl, 0, pos)
            self.colno = pos - pattern.rfind(nl, 0, pos)

            message = f"{message} at position {pos}"

            # Only mention line/column for multi-line patterns.
            if nl in pattern:
                message += f" (line {self.lineno}, column {self.colno})"

        Exception.__init__(self, message)
# The exception for when a positional flag has been turned on in the old
# behaviour. It's raised with the new global flags as its argument (see
# parse_flags_subpattern) so the caller can reparse the whole pattern.
class _UnscopedFlagSet(Exception):
    pass
# The exception for when parsing fails and we want to try something else.
# Internal only: it's always caught and never escapes to the caller.
class ParseError(Exception):
    pass
# The exception for when there isn't a valid first set.
class _FirstSetError(Exception):
    pass
# Flags.
class RegexFlag(enum.IntFlag):
    """The option flags, combinable with ``|``.

    Each long name has a single-letter (or two-character, for the version
    flags) alias, eg. ``I`` for ``IGNORECASE``.
    """

    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the
                              # first fuzzy match.
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A single named member renders simply as "regex.NAME".
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        value = self._value_
        members = []
        negative = value < 0

        # For an inverted value, decompose its complement instead.
        if negative:
            value = ~value

        # Pick off each known flag present; anything left over is shown in
        # hex.
        for m in self.__class__:
            if value & m._value_:
                value &= ~m._value_
                members.append('regex.%s' % m._name_)

        if value:
            members.append(hex(value))

        res = '|'.join(members)

        # Re-apply the inversion, parenthesising multi-member results.
        if negative:
            if len(members) > 1:
                res = '~(%s)' % res
            else:
                res = '~%s' % res

        return res

    # str() falls back to the default object behaviour (which uses __repr__).
    __str__ = object.__str__
# Put the flags into the module namespace. Being explicit here helps tools like
# linters and IDEs understand the code better.
ASCII = RegexFlag.ASCII
BESTMATCH = RegexFlag.BESTMATCH
DEBUG = RegexFlag.DEBUG
DOTALL = RegexFlag.DOTALL
ENHANCEMATCH = RegexFlag.ENHANCEMATCH
FULLCASE = RegexFlag.FULLCASE
IGNORECASE = RegexFlag.IGNORECASE
LOCALE = RegexFlag.LOCALE
MULTILINE = RegexFlag.MULTILINE
POSIX = RegexFlag.POSIX
REVERSE = RegexFlag.REVERSE
TEMPLATE = RegexFlag.TEMPLATE
UNICODE = RegexFlag.UNICODE
VERBOSE = RegexFlag.VERBOSE
VERSION0 = RegexFlag.VERSION0
VERSION1 = RegexFlag.VERSION1
WORD = RegexFlag.WORD
# The single-letter aliases.
A = RegexFlag.A
B = RegexFlag.B
D = RegexFlag.D
E = RegexFlag.E
F = RegexFlag.F
I = RegexFlag.I
L = RegexFlag.L
M = RegexFlag.M
P = RegexFlag.P
R = RegexFlag.R
S = RegexFlag.S
U = RegexFlag.U
V0 = RegexFlag.V0
V1 = RegexFlag.V1
W = RegexFlag.W
X = RegexFlag.X
T = RegexFlag.T

# The behaviour used when neither V0 nor V1 is given.
DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

# Character classes used by the parser (ASCII only).
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# "" is included so that end-of-pattern (source.get() returning "") takes the
# special-character path in parse_sequence.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")
# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags, keyed by their inline-flag letter.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# FULLCASE on its own has no effect: it only modifies IGNORECASE.
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
# The names of the opcodes. The position of a name in this list is its
# numeric opcode value (see the enumerate loop below), so the order must
# match the engine's and must not be changed.
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
FUZZY_EXT
"""
# Define the opcodes in a namespace.
class Namespace:
    """An empty attribute container; opcode names are set on an instance
    below."""
    pass
OP = Namespace()
for i, op in enumerate(OPCODES.split()):
    # Each opcode's numeric value is its position in the OPCODES list.
    setattr(OP, op, i)
304def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
305 """Make room in the given cache.
307 Args:
308 cache_dict: The cache dictionary to modify.
309 args_dict: The dictionary of named list args used by patterns.
310 max_length: Maximum # of entries in cache_dict before it is shrunk.
311 divisor: Cache will shrink to max_length - 1/divisor*max_length items.
312 """
313 # Toss out a fraction of the entries at random to make room for new ones.
314 # A random algorithm was chosen as opposed to simply cache_dict.popitem()
315 # as popitem could penalize the same regular expression repeatedly based
316 # on its internal hash value. Being random should spread the cache miss
317 # love around.
318 cache_keys = tuple(cache_dict.keys())
319 overage = len(cache_keys) - max_length
320 if overage < 0:
321 # Cache is already within limits. Normally this should not happen
322 # but it could due to multithreading.
323 return
325 number_to_toss = max_length // divisor + overage
327 # The import is done here to avoid a circular dependency.
328 import random
329 if not hasattr(random, 'sample'):
330 # Do nothing while resolving the circular dependency:
331 # re->random->warnings->tokenize->string->re
332 return
334 for doomed_key in random.sample(cache_keys, number_to_toss):
335 try:
336 del cache_dict[doomed_key]
337 except KeyError:
338 # Ignore problems if the cache changed from another thread.
339 pass
341 # Rebuild the arguments and locale-sensitivity dictionaries.
342 args_dict.clear()
343 sensitivity_dict = {}
344 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
345 args_dict[pattern, pattern_type, flags, default_version, locale] = args
346 try:
347 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
348 except KeyError:
349 pass
351 locale_sensitive.clear()
352 locale_sensitive.update(sensitivity_dict)
def _fold_case(info, string):
    "Folds the case of a string."
    flags = info.flags
    # With no explicit encoding flag, fall back to the guessed encoding.
    if not flags & _ALL_ENCODINGS:
        flags |= info.guess_encoding
    return _regex.fold_case(flags, string)
def is_cased_i(info, char):
    "Checks whether a character is cased (has more than one case variant)."
    variants = _regex.get_all_cases(info.flags, char)
    return len(variants) > 1
def is_cased_f(flags, char):
    "Checks whether a character is cased (has more than one case variant)."
    variants = _regex.get_all_cases(flags, char)
    return len(variants) > 1
def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)
    # An absent firstset, or one that matches any character, is useless.
    if not checked or isinstance(checked, AnyAll):
        return []
    # Compile the firstset.
    return checked.compile(reverse)
def _check_firstset(info, reverse, fs):
    "Checks the firstset for the pattern."
    if not fs or None in fs:
        return None

    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for item in fs:
        # A negated character can't contribute to a firstset.
        if isinstance(item, Character) and not item.positive:
            return None
        case_flags |= item.case_flags
        members.add(item.with_flags(case_flags=NOCASE))

    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    union = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    return union.optimise(info, reverse, in_set=True)
411def _flatten_code(code):
412 "Flattens the code from a list of tuples."
413 flat_code = []
414 for c in code:
415 flat_code.extend(c)
417 return flat_code
def make_case_flags(info):
    "Makes the case flags."
    # FULLCASE is a Unicode notion, so it's dropped when ASCII is on.
    if info.flags & ASCII:
        return info.flags & CASE_FLAGS & ~FULLCASE
    return info.flags & CASE_FLAGS
def make_character(info, value, in_set=False):
    "Makes a character literal."
    if not in_set:
        return Character(value, case_flags=make_case_flags(info))
    # A character set is built case-sensitively.
    return Character(value)
def make_ref_group(info, name, position):
    "Makes a case-aware reference to a group."
    flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=flags)
def make_string_set(info, name):
    "Makes a case-aware string set."
    flags = make_case_flags(info)
    return StringSet(info, name, case_flags=flags)
def make_property(info, prop, in_set):
    "Makes a property."
    # Inside a set the property is used as-is; outside it gets case flags.
    if not in_set:
        return prop.with_flags(case_flags=make_case_flags(info))
    return prop
def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    branches = [parse_sequence(source, info)]
    while source.match("|"):
        branches.append(parse_sequence(source, info))

    # A single branch needs no Branch wrapper.
    if len(branches) > 1:
        return Branch(branches)
    return branches[0]
def parse_sequence(source, info):
    "Parses a sequence, eg. 'abc'."
    # None is a sentinel meaning "nothing quantifiable here": it starts the
    # sequence and is appended after each quantifier/constraint, so
    # apply_quantifier can detect "nothing to repeat"/"multiple repeat".
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        # Note: "" (end of pattern) is in SPECIAL_CHARS and is also a
        # substring of ")|", so end-of-pattern breaks out of the loop below.
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # It was inline flags; they may change the case flags
                    # for the rest of the sequence.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)
                    if constraints:
                        # It _is_ a fuzzy constraint.
                        apply_constraint(source, info, constraints, case_flags,
                          saved_pos, sequence)
                        sequence.append(None)
                    else:
                        # The element was just a literal.
                        sequence.append(Character(ord(ch),
                          case_flags=case_flags))
            else:
                # A literal.
                sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    # Drop the None sentinels before building the sequence.
    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)
def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    "Applies a quantifier to the last element of the sequence."
    target = sequence.pop()
    if target is None:
        # The sentinel: the quantifier follows nothing, or another
        # quantifier.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(target, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    saved_pos = source.pos
    suffix = source.get()
    if suffix == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeat_class = LazyRepeat
    elif suffix == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeat_class = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat; put the character back.
        source.pos = saved_pos
        repeat_class = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number
    # of repeats is fixed at 1.
    if not target.is_empty() and (min_count, max_count) != (1, 1):
        target = repeat_class(target, min_count, max_count)

    sequence.append(target)
def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    "Applies a fuzzy constraint to the last element of the sequence."
    target = sequence.pop()
    if target is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    # If a group is marked as fuzzy then put all of the fuzzy part in the
    # group.
    if isinstance(target, Group):
        target.subpattern = Fuzzy(target.subpattern, constraints)
        sequence.append(target)
    else:
        sequence.append(Fuzzy(target, constraints))
# The simple quantifiers and their (min, max) repeat counts; None means
# unbounded.
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
def parse_quantifier(source, info, ch):
    "Parses a quantifier, returning its (min, max) counts or None."
    counts = _QUANTIFIERS.get(ch)
    if counts:
        # One of "?", "*" or "+".
        return counts

    if ch == "{":
        # Possibly a bounded repeat, eg. 'a{2,3}'.
        bounded = parse_limited_quantifier(source)
        if bounded:
            return bounded

    return None
def is_above_limit(count):
    "Checks whether a count is above the maximum the engine supports."
    if count is None:
        # None means unbounded, which is represented, not exceeded.
        return False
    return count >= UNLIMITED
def parse_limited_quantifier(source):
    "Parses a limited quantifier such as '{2,3}' (the '{' is already read)."
    saved_pos = source.pos
    min_count = parse_count(source)
    if source.match(","):
        max_count = parse_count(source)

        # No minimum means 0 and no maximum means unlimited.
        min_count = int(min_count or 0)
        max_count = int(max_count) if max_count else None
    elif min_count:
        # A single count: exactly that many repeats.
        min_count = max_count = int(min_count)
    else:
        # Neither a count nor a comma: not a quantifier after all.
        source.pos = saved_pos
        return None

    if not source.match("}"):
        source.pos = saved_pos
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, saved_pos)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          saved_pos)

    return min_count, max_count
def parse_fuzzy(source, info, ch, case_flags):
    "Parses a fuzzy setting, if present."
    saved_pos = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        while True:
            parse_fuzzy_item(source, constraints)
            if not source.match(","):
                break
    except ParseError:
        # Not a fuzzy setting after all; back out.
        source.pos = saved_pos
        return None

    if source.match(":"):
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints
def parse_fuzzy_item(source, constraints):
    "Parses a fuzzy setting item."
    saved_pos = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a simple cost constraint; try a cost equation instead.
        source.pos = saved_pos
        parse_cost_equation(source, constraints)
def parse_cost_constraint(source, constraints):
    """Parses a cost constraint, storing it into `constraints` as a
    (min_cost, max_cost) pair keyed by the constraint letter. Raises
    ParseError if this isn't a cost constraint at all."""
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits? Exclusive bounds are normalised to
        # inclusive ones.
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()
def parse_cost_limit(source):
    "Parses a cost limit."
    cost_pos = source.pos
    digits = parse_count(source)

    # int() fails on an empty digit string.
    value = None
    try:
        value = int(digits)
    except ValueError:
        pass

    if value is None:
        raise error("bad fuzzy cost limit", source.string, cost_pos)

    return value
def parse_constraint(source, constraints, ch):
    "Parses a constraint letter."
    # Valid constraints are "d", "e", "i" or "s", each usable only once.
    if ch not in "deis" or ch in constraints:
        raise ParseError()
    return ch
def parse_fuzzy_compare(source):
    "Parses a cost comparator."
    # True -> inclusive ("<="), False -> exclusive ("<"), None -> absent.
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None
def parse_cost_equation(source, constraints):
    "Parses a cost equation."
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}
    parse_cost_term(source, cost)
    while source.match("+"):
        parse_cost_term(source, cost)

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    max_cost = int(parse_count(source))

    # An exclusive limit is normalised to an inclusive one.
    if not max_inc:
        max_cost -= 1

    if max_cost < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = max_cost

    constraints["cost"] = cost
def parse_cost_term(source, cost):
    "Parses a cost equation term."
    coeff = parse_count(source)
    ch = source.get()
    if ch not in "dis":
        raise ParseError()

    if ch in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient defaults to 1.
    cost[ch] = int(coeff) if coeff else 1
def parse_fuzzy_test(source, info, case_flags):
    "Parses the test pattern of a fuzzy setting."
    saved_pos = source.pos
    ch = source.get()
    if ch not in SPECIAL_CHARS:
        if ch:
            # A literal.
            return Character(ord(ch), case_flags=case_flags)
        raise error("expected character set", source.string, saved_pos)
    if ch == "\\":
        # An escape sequence outside a set.
        return parse_escape(source, info, False)
    if ch == ".":
        # Any character.
        if info.flags & DOTALL:
            return AnyAll()
        if info.flags & WORD:
            return AnyU()
        return Any()
    if ch == "[":
        # A character set.
        return parse_set(source, info)
    raise error("expected character set", source.string, saved_pos)
def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the (possibly empty) run of ASCII digits at the current
    # position, as a string.
    return source.get_while(DIGITS)
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.
    """
    saved_pos = source.pos
    # NOTE(review): the True argument to get() presumably overrides
    # verbose-mode whitespace skipping — confirm against Source.get.
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Scoped flags end with the group.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...: possibly a verb such as (*PRUNE).
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags end with the group.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)
def parse_extension(source, info):
    "Parses a Python extension, ie. the part after '(?P'."
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Scoped flags end with the group.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        # A group can't refer to itself while it's still being parsed.
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)
def parse_comment(source):
    "Parses a comment, ie. '(?#...)'. Comments produce no pattern element."
    while True:
        saved_pos = source.pos
        ch = source.get(True)

        # Stop at the closing ")" or at the end of the pattern.
        if not ch or ch == ")":
            break

        if ch == "\\":
            # Skip the escaped character as well.
            source.get(True)

    # Put back the terminator so expect() can verify it's ")".
    source.pos = saved_pos
    source.expect(")")

    return None
def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Any flags scoped inside the lookaround end here.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, subpattern)
def parse_conditional(source, info):
    "Parses a conditional subpattern, ie. the part after '(?('."
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

        source.pos = saved_pos
        raise error("expected lookaround conditional", source.string,
          source.pos)

    # Not a lookaround: the condition is a group name or number.
    source.pos = saved_pos
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            no_branch = Sequence()

        source.expect(")")
    finally:
        # Restore the flags even if parsing fails part-way through.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    # Both branches empty: the conditional is a no-op.
    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)
def parse_lookaround_conditional(source, info, behind, positive):
    """Parses a lookaround conditional, eg. '(?(?=test)yes|no)'. The
    lookaround subpattern has already been opened by the caller."""
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # The lookaround's scoped flags end with it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    if source.match("|"):
        no_branch = parse_sequence(source, info)
    else:
        no_branch = Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, subpattern, yes_branch,
      no_branch)
def parse_atomic(source, info):
    "Parses an atomic subpattern."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Flags scoped inside the atomic group don't escape it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(subpattern)
def parse_common(source, info):
    "Parses a common groups branch, ie. '(?|...)'."
    # Capture group numbers in different branches can reuse the group
    # numbers, so each branch restarts from the same initial count.
    initial_count = info.group_count
    branches = [parse_sequence(source, info)]
    highest_count = info.group_count
    while source.match("|"):
        info.group_count = initial_count
        branches.append(parse_sequence(source, info))
        highest_count = max(highest_count, info.group_count)

    info.group_count = highest_count
    source.expect(")")

    if len(branches) > 1:
        return Branch(branches)
    return branches[0]
def parse_call_group(source, info, ch, pos):
    "Parses a call to a group."
    if ch == "R":
        # (?R) calls the whole pattern, ie. group 0.
        group = "0"
    else:
        group = ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)
def parse_call_named_group(source, info, pos):
    "Parses a call to a named group."
    name = parse_name(source)
    source.expect(")")
    return CallGroup(info, name, pos)
def parse_flag_set(source):
    "Parses a set of inline flags."
    flags = 0

    # The loop stops when a character isn't a flag key: the REGEX_FLAGS
    # lookup raises KeyError and the character is put back.
    try:
        while True:
            saved_pos = source.pos
            ch = source.get()
            if ch == "V":
                # The version flags are two characters: "V0" or "V1".
                ch += source.get()
            flags |= REGEX_FLAGS[ch]
    except KeyError:
        source.pos = saved_pos

    return flags
def parse_flags(source, info):
    "Parses flags being turned on/off."
    flags_on = parse_flag_set(source)
    flags_off = 0
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off
def parse_subpattern(source, info, flags_on, flags_off):
    "Parses a subpattern with scoped flags, eg. '(?i:...)'."
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set.
    # NOTE(review): this tests info.flags rather than flags_on, and re-ORs
    # the whole of flags_on rather than only its encoding bits, so an
    # encoding flag inherited from the enclosing scope is cleared even when
    # the scoped flags set no new encoding (the engine then falls back to
    # the guessed encoding — see _fold_case). Verify against upstream.
    if info.flags & (ASCII | LOCALE | UNICODE):
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # The scoped flags end with the subpattern.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern
def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's an inline flags, then None is returned.

    Raises _UnscopedFlagSet if a new global flag is turned on, so the caller
    can reparse the whole pattern with it applied.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # A scoped-flags subpattern, eg. '(?i:...)'.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # Positional flags, eg. '(?i)'.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)
def parse_positional_flags(source, info, flags_on, flags_off):
    """Applies positional (non-scoped) inline flags to the pattern."""
    new_flags = (info.flags | flags_on) & ~flags_off
    info.flags = new_flags
    # Verbose mode affects how the rest of the pattern is read.
    source.ignore_space = bool(new_flags & VERBOSE)
def parse_name(source, allow_numeric=False, allow_group_0=False):
    """Parses a group name, which may be numeric if allowed."""
    # Collect everything up to (but not including) the closing ")" or ">".
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        # A numeric name is valid only when explicitly permitted; group 0
        # (the whole match) only when additionally permitted.
        lowest = 0 if allow_group_0 else 1
        valid = allow_numeric and int(name) >= lowest
    else:
        valid = name.isidentifier()

    if not valid:
        raise error("bad character in group name", source.string,
          source.pos)

    return name
def is_octal(string):
    """Checks whether a string consists entirely of octal digits."""
    for ch in string:
        if ch not in OCT_DIGITS:
            return False

    return True
def is_decimal(string):
    """Checks whether a string consists entirely of decimal digits."""
    for ch in string:
        if ch not in DIGITS:
            return False

    return True
def is_hexadecimal(string):
    """Checks whether a string consists entirely of hexadecimal digits."""
    for ch in string:
        if ch not in HEX_DIGITS:
            return False

    return True
def parse_escape(source, info, in_set):
    "Parses an escape sequence."
    # The character after the backslash must be read verbatim, even in
    # verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending: either CR LF as a unit or any single line-ending
        # codepoint.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            charset.extend([0x85, 0x2028, 0x2029])

        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # Charset escapes (e.g. \d, \w) depend on the encoding in effect.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # Single-character escapes (e.g. \n, \t).
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
def parse_numeric_escape(source, info, ch, in_set):
    "Parses a numeric escape sequence."
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            # Byte patterns wrap at 0xFF; otherwise at 0x1FF (max 3 octal
            # digits).
            if encoding == ASCII or encoding == LOCALE:
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference; put back the character that wasn't a digit.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)
def parse_octal_escape(source, info, digits, in_set):
    "Parses an octal escape sequence."
    # `digits` already contains the first character; collect up to 3 in all.
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Put back the character that wasn't an octal digit.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # The first character wasn't an octal digit, so the whole escape is
        # invalid.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)
def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    # Note: `type` (the escape letter, used in error messages) shadows the
    # builtin of the same name within this function.
    "Parses a hex escape sequence."
    saved_pos = source.pos
    digits = []
    for i in range(expected_len):
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, saved_pos)
        digits.append(ch)

    try:
        value = int("".join(digits), 16)
    except ValueError:
        pass
    else:
        # The codepoint must be within the Unicode range.
        if value < 0x110000:
            return make_character(info, value, in_set)

    # Bad hex escape.
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, saved_pos)
def parse_group_ref(source, info):
    """Parses a group reference of the form <name> or <number>."""
    source.expect("<")
    name_pos = source.pos
    group_name = parse_name(source, True)
    source.expect(">")
    # Referring back to a group that hasn't been closed yet is an error.
    if info.is_open_group(group_name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, group_name, name_pos)
def parse_string_set(source, info):
    """Parses a string set reference of the form <name>."""
    source.expect("<")
    list_name = parse_name(source, True)
    source.expect(">")
    # The named list must have been supplied as a keyword argument.
    if list_name is None or list_name not in info.kwargs:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, list_name)
def parse_named_char(source, info, in_set):
    "Parses a named character."
    saved_pos = source.pos
    if source.match("{"):
        # Looks like \N{...}; the name may contain spaces.
        name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                value = unicodedata.lookup(name)
                return make_character(info, ord(value), in_set)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed \N{...}; treat "N" as a literal.
    source.pos = saved_pos
    return make_character(info, ord("N"), in_set)
def parse_property(source, info, positive, in_set):
    "Parses a Unicode property."
    # `positive` is False when the escape letter was "P" rather than "p".
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        # Delimited form, e.g. \p{Letter}; "^" negates the property.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            if info.flags & ASCII:
                encoding = ASCII_ENCODING
            elif info.flags & UNICODE:
                encoding = UNICODE_ENCODING
            else:
                encoding = 0

            prop = lookup_property(prop_name, name, positive != negate, source,
              encoding=encoding)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        if info.flags & ASCII:
            encoding = ASCII_ENCODING
        elif info.flags & UNICODE:
            encoding = UNICODE_ENCODING
        else:
            encoding = 0

        prop = lookup_property(None, ch, positive, source, encoding=encoding)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)
def parse_property_name(source):
    "Parses a property name, which may be qualified."
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        # Looks like a qualified name, e.g. "Script: Latin".
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Put back anything not consumed as part of the name.
    source.pos = saved_pos
    return prop_name, name
def parse_set(source, info):
    "Parses a character set."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # Whitespace is significant inside a set, even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    # Negative set?
    negate = source.match("^")
    try:
        # Version 1 sets support the explicit set operators (||, ~~, etc.).
        if version == VERSION0:
            item = parse_set_imp_union(source, info)
        else:
            item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    item = item.with_flags(case_flags=make_case_flags(info))

    return item
def parse_set_union(source, info):
    """Parses a set union ([x||y])."""
    members = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        members.append(parse_set_symm_diff(source, info))

    # A union of one item is just that item.
    return members[0] if len(members) == 1 else SetUnion(info, members)
def parse_set_symm_diff(source, info):
    """Parses a set symmetric difference ([x~~y])."""
    members = [parse_set_inter(source, info)]
    while source.match("~~"):
        members.append(parse_set_inter(source, info))

    # A symmetric difference of one item is just that item.
    return members[0] if len(members) == 1 else SetSymDiff(info, members)
def parse_set_inter(source, info):
    """Parses a set intersection ([x&&y])."""
    members = [parse_set_diff(source, info)]
    while source.match("&&"):
        members.append(parse_set_diff(source, info))

    # An intersection of one item is just that item.
    return members[0] if len(members) == 1 else SetInter(info, members)
def parse_set_diff(source, info):
    """Parses a set difference ([x--y])."""
    members = [parse_set_imp_union(source, info)]
    while source.match("--"):
        members.append(parse_set_imp_union(source, info))

    # A difference of one item is just that item.
    return members[0] if len(members) == 1 else SetDiff(info, members)
def parse_set_imp_union(source, info):
    "Parses a set implicit union ([xy])."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set.
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)
def parse_set_member(source, info):
    "Parses a member in a character set."
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    # Only a single positive character can start a range.
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A degenerate range of one character.
        return start

    return Range(start.value, end.value)
def parse_set_item(source, info):
    "Parses an item in a character set."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    # A plain literal character.
    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    return Character(ord(ch))
def parse_posix_class(source, info):
    """Parses a POSIX character class, e.g. [:alpha:]."""
    negate = source.match("^")
    prop_name, name = parse_property_name(source)
    if source.match(":]"):
        return lookup_property(prop_name, name, not negate, source, posix=True)

    # Not terminated correctly, so it isn't a POSIX class after all.
    raise ParseError()
def float_to_rational(flt):
    """Converts a float to a rational pair (numerator, denominator).

    Uses a continued-fraction expansion, stopping when the remaining
    fractional part is negligible (< 0.0001).
    """
    int_part = int(flt)
    # Named `frac_part` to avoid shadowing the module-level `error` exception
    # class (the original local was called `error`).
    frac_part = flt - int_part
    if abs(frac_part) < 0.0001:
        return int_part, 1

    den, num = float_to_rational(1.0 / frac_part)

    return int_part * den + num, den
def numeric_to_rational(numeric):
    """Converts a numeric string to a rational string, if possible."""
    # Strip a leading minus sign, re-attaching it to the result later.
    sign = ""
    if numeric[ : 1] == "-":
        sign = "-"
        numeric = numeric[1 : ]

    parts = numeric.split("/")
    if len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    elif len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    else:
        # More than one "/" can't be a rational.
        raise ValueError()

    result = "{}{}/{}".format(sign, num, den)
    # Omit a denominator of 1.
    return result[ : -2] if result.endswith("/1") else result
def standardise_name(name):
    """Standardises a property or value name for lookup."""
    try:
        # Numeric values are canonicalised as rationals.
        return numeric_to_rational("".join(name))
    except (ValueError, ZeroDivisionError):
        # Otherwise: uppercase, with spaces, hyphens and underscores removed.
        kept = (ch for ch in name if ch not in "_- ")
        return "".join(kept).upper()
# POSIX classes which get a "POSIX" prefix on lookup because their regex
# definitions differ from the like-named Unicode properties.
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The complete value set of a binary (yes/no) property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    "Looks up a property."
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    # "Assigned" is the complement of "Unassigned".
    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    # Some POSIX classes have their own definitions, distinct from the
    # like-named properties.
    if posix and not property and value.upper() in _POSIX_CLASSES:
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix.
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)
def _compile_replacement(source, pattern, is_unicode):
    # Returns (is_group_reference, list_of_codepoints_or_group_index).
    "Compiles a replacement template escape sequence."
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group reference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    # Byte replacement templates wrap octal values at 0xFF.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
def parse_repl_hex_escape(source, expected_len, type):
    """Parses a hex escape sequence in a replacement string."""
    digits = ""
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, digits),
              source.string, source.pos)
        digits += ch

    return int(digits, 16)
def parse_repl_named_char(source):
    """Parses a named character in a replacement string.

    Returns the codepoint, or None if it isn't a well-formed \\N{...}.
    """
    saved_pos = source.pos
    if source.match("{"):
        char_name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                return ord(unicodedata.lookup(char_name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a named character; back up and let the caller handle it.
    source.pos = saved_pos
    return None
def compile_repl_group(source, pattern):
    """Compiles a replacement template group reference, returning its index."""
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")
    if name.isdigit():
        # A numeric reference must lie within the pattern's group range
        # (0 refers to the whole match).
        index = int(name)
        if index < 0 or index > pattern.groups:
            raise error("invalid group reference", source.string, source.pos)

        return index

    try:
        return pattern.groupindex[name]
    except KeyError:
        raise IndexError("unknown group")
1891# The regular expression is parsed into a syntax tree. The different types of
1892# node are defined below.
1894INDENT = " "
# Bit flags or'd into opcodes when compiling nodes.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# The encoding value is stored in the bits above the boolean flags.
ENCODING_OP_SHIFT = 5

# Text used by the nodes' dump() methods.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
def make_sequence(items):
    """Wraps a list of nodes in a Sequence, unless there's only one."""
    return items[0] if len(items) == 1 else Sequence(items)
# Common base class for all nodes.
class RegexBase:
    def __init__(self):
        # The key used for equality and hashing; subclasses extend it.
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        # Returns this node with the given flags changed, or self unchanged
        # if nothing would differ.
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            # Normalise to a canonical case-flag combination.
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    # The methods below are defaults which subclasses override as needed.

    def fix_groups(self, pattern, reverse, fuzzy):
        pass

    def optimise(self, info, reverse):
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # By default the set of possible first items can't be determined.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        # By default there's no required string, only the node's width.
        return self.max_width(), None
# Base class for zero-width nodes.
class ZeroWidthBase(RegexBase):
    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        self.positive = bool(positive)
        self.encoding = encoding

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # A zero-width node consumes no character.
        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        # The encoding occupies the bits above the boolean flags.
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        # Zero-width by definition.
        return 0
class Any(RegexBase):
    # A node matching any single character (the "." pattern).
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        # The opcode depends on the match direction.
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        return 1
class AnyAll(Any):
    # The DOTALL variant of Any.
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"

    def __init__(self):
        # Doesn't call the base __init__; sets the attributes and key
        # directly.
        self.positive = True
        self.zerowidth = False
        self.case_flags = 0

        self._key = self.__class__, self.positive
class AnyU(Any):
    # The Unicode variant of Any (presumably differs in which line endings
    # are excluded -- see the ANY_U opcode's implementation).
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
class Atomic(RegexBase):
    # A node wrapping a subpattern as an atomic group; most operations
    # delegate to the subpattern.
    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An atomic wrapper around an empty subpattern is redundant.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Bracket the subpattern's code between ATOMIC and END opcodes.
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)
class Boundary(ZeroWidthBase):
    # A word-boundary assertion.
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
2105class Branch(RegexBase):
2106 def __init__(self, branches):
2107 RegexBase.__init__(self)
2108 self.branches = branches
2110 def fix_groups(self, pattern, reverse, fuzzy):
2111 for b in self.branches:
2112 b.fix_groups(pattern, reverse, fuzzy)
    def optimise(self, info, reverse):
        if not self.branches:
            # No alternatives at all: an empty sequence.
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            if not prefix or not suffix:
                # We might be able to add a quick precheck before the branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    # The precheck goes on the side from which matching starts.
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)
    def _add_precheck(self, info, reverse, branches):
        # Builds a quick first-character check for the branches, or returns
        # None/nothing if one can't be built.
        charset = set()
        # The relevant character is the last one when matching in reverse.
        pos = -1 if reverse else 0

        for branch in branches:
            # Only case-sensitive literals are simple enough to precheck.
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in charset])
2164 def pack_characters(self, info):
2165 self.branches = [b.pack_characters(info) for b in self.branches]
2166 return self
2168 def remove_captures(self):
2169 self.branches = [b.remove_captures() for b in self.branches]
2170 return self
2172 def is_atomic(self):
2173 return all(b.is_atomic() for b in self.branches)
2175 def can_be_affix(self):
2176 return all(b.can_be_affix() for b in self.branches)
2178 def contains_group(self):
2179 return any(b.contains_group() for b in self.branches)
2181 def get_firstset(self, reverse):
2182 fs = set()
2183 for b in self.branches:
2184 fs |= b.get_firstset(reverse)
2186 return fs or set([None])
    def _compile(self, reverse, fuzzy):
        if not self.branches:
            return []

        # Alternatives are separated by NEXT opcodes.
        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        # The trailing NEXT becomes the terminating END.
        code[-1] = (OP.END, )

        return code
    def dump(self, indent, reverse):
        # Dump the alternatives, separated by "OR".
        print("{}BRANCH".format(INDENT * indent))
        self.branches[0].dump(indent + 1, reverse)
        for b in self.branches[1 : ]:
            print("{}OR".format(INDENT * indent))
            b.dump(indent + 1, reverse)
2208 @staticmethod
2209 def _flatten_branches(info, reverse, branches):
2210 # Flatten the branches so that there aren't branches of branches.
2211 new_branches = []
2212 for b in branches:
2213 b = b.optimise(info, reverse)
2214 if isinstance(b, Branch):
2215 new_branches.extend(b.branches)
2216 else:
2217 new_branches.append(b)
2219 return new_branches
    @staticmethod
    def _split_common_prefix(info, branches):
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches
    @staticmethod
    def _split_common_suffix(info, branches):
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix? (Scanned with negative indices,
        # from the end towards the start.)
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
              in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches
    @staticmethod
    def _can_split(items, count):
        """Return whether `items` may be split before index `count` without
        breaking a run of fully case-folded characters."""
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True
    @staticmethod
    def _can_split_rev(items, count):
        """Reverse-direction counterpart of _can_split: return whether `items`
        may be split `count` items from the end without breaking a run of
        fully case-folded characters."""
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True
2356 @staticmethod
2357 def _merge_common_prefixes(info, reverse, branches):
2358 # Branches with the same case-sensitive character prefix can be grouped
2359 # together if they are separated only by other branches with a
2360 # character prefix.
2361 prefixed = defaultdict(list)
2362 order = {}
2363 new_branches = []
2364 for b in branches:
2365 if Branch._is_simple_character(b):
2366 # Branch starts with a simple character.
2367 prefixed[b.value].append([b])
2368 order.setdefault(b.value, len(order))
2369 elif (isinstance(b, Sequence) and b.items and
2370 Branch._is_simple_character(b.items[0])):
2371 # Branch starts with a simple character.
2372 prefixed[b.items[0].value].append(b.items)
2373 order.setdefault(b.items[0].value, len(order))
2374 else:
2375 Branch._flush_char_prefix(info, reverse, prefixed, order,
2376 new_branches)
2378 new_branches.append(b)
2380 Branch._flush_char_prefix(info, prefixed, order, new_branches)
2382 return new_branches
2384 @staticmethod
2385 def _is_simple_character(c):
2386 return isinstance(c, Character) and c.positive and not c.case_flags
    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        """Collapse runs of single-item branches into character sets.

        Adjacent branches that are a Character, Property or set and share the
        same case flags are merged into one set via _flush_set_members.
        """
        # Can the branches be reduced to a set?
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items, case_flags,
                      new_branches)

                    case_flags = b.case_flags

                # Strip the case flags here; they are re-applied to the merged
                # set when it is flushed.
                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches
    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        """Emit the collected same-prefix branches into `new_branches`.

        `prefixed` maps a character value to the item lists of the branches
        that start with it; `order` preserves first-seen ordering.  Both are
        cleared afterwards.
        """
        # Flush the prefixed branches.
        if not prefixed:
            return

        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        # Keep everything after the shared first character.
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        # A branch that is only the prefix character makes the
                        # tail optional; one empty alternative is enough.
                        subbranches.append(Sequence())
                        optional = True

                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()
2442 @staticmethod
2443 def _flush_set_members(info, reverse, items, case_flags, new_branches):
2444 # Flush the set members.
2445 if not items:
2446 return
2448 if len(items) == 1:
2449 item = list(items)[0]
2450 else:
2451 item = SetUnion(info, list(items)).optimise(info, reverse)
2453 new_branches.append(item.with_flags(case_flags=case_flags))
2455 items.clear()
2457 @staticmethod
2458 def _is_full_case(items, i):
2459 if not 0 <= i < len(items):
2460 return False
2462 item = items[i]
2463 return (isinstance(item, Character) and item.positive and
2464 (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
2466 @staticmethod
2467 def _is_folded(items):
2468 if len(items) < 2:
2469 return False
2471 for i in items:
2472 if (not isinstance(i, Character) or not i.positive or not
2473 i.case_flags):
2474 return False
2476 folded = "".join(chr(i.value) for i in items)
2477 folded = _regex.fold_case(FULL_CASE_FOLDING, folded)
2479 # Get the characters which expand to multiple codepoints on folding.
2480 expanding_chars = _regex.get_expand_on_folding()
2482 for c in expanding_chars:
2483 if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
2484 return True
2486 return False
2488 def is_empty(self):
2489 return all(b.is_empty() for b in self.branches)
2491 def __eq__(self, other):
2492 return type(self) is type(other) and self.branches == other.branches
2494 def max_width(self):
2495 return max(b.max_width() for b in self.branches)
class CallGroup(RegexBase):
    """A call to a capture group's subpattern, e.g. (?1) or (?&name)."""

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        # May be a name or a number until fix_groups resolves it.
        self.group = group
        # Position in the pattern, for error reporting.
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the reference to a group number and validate it.

        Raises `error` for invalid, unknown or ambiguous references.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            # Not numeric: look the name up in the pattern's group index.
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        # A group opened more than once cannot be called unambiguously.
        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        self._key = self.__class__, self.group

    def remove_captures(self):
        # NOTE(review): this reads self.pattern, which is never assigned in
        # this class -- confirm the attribute is provided elsewhere.
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        # self.call_ref is assigned externally once the called group's code
        # has been given a call ref.
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        # The width of the called subpattern isn't known here.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class CallRef(RegexBase):
    """Wraps a parsed subpattern so it can be the target of group calls."""

    def __init__(self, ref, parsed):
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        head = [(OP.CALL_REF, self.ref)]
        body = self.parsed._compile(reverse, fuzzy)
        return head + body + [(OP.END, )]
class Character(RegexBase):
    """A single codepoint, possibly negated, case-insensitive or zero-width."""

    # Opcode selection keyed on (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            # Pre-compute the full case-folding; it may be longer than one
            # character (e.g. sharp s folds to "ss").
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy with the given polarity/case/zero-width settings."""
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding: match either the
            # single character or its multi-character folding.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        """Return whether codepoint `ch` matches, honouring negation."""
        return (ch == self.value) == self.positive

    def max_width(self):
        # The folded form may span several codepoints.
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self
class Conditional(RegexBase):
    """A conditional subpattern, (?(group)yes|no), keyed on a group match."""

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        # Position in the pattern, for error reporting.
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference, then fix up both alternatives."""
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        # Either alternative may start the match.
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            # Only emit the alternative separator when the "no" side has code.
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class DefaultBoundary(ZeroWidthBase):
    """Zero-width assertion compiled to the DEFAULT_BOUNDARY opcode."""
    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"
class DefaultEndOfWord(ZeroWidthBase):
    """Zero-width assertion compiled to the DEFAULT_END_OF_WORD opcode."""
    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"
class DefaultStartOfWord(ZeroWidthBase):
    """Zero-width assertion compiled to the DEFAULT_START_OF_WORD opcode."""
    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"
class EndOfLine(ZeroWidthBase):
    """Zero-width assertion compiled to the END_OF_LINE opcode."""
    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"
class EndOfLineU(EndOfLine):
    """EndOfLine variant compiled to the END_OF_LINE_U (Unicode) opcode."""
    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"
class EndOfString(ZeroWidthBase):
    """Zero-width assertion compiled to the END_OF_STRING opcode."""
    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"
class EndOfStringLine(ZeroWidthBase):
    """Zero-width assertion compiled to the END_OF_STRING_LINE opcode."""
    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"
class EndOfStringLineU(EndOfStringLine):
    """EndOfStringLine variant compiled to the END_OF_STRING_LINE_U opcode."""
    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"
class EndOfWord(ZeroWidthBase):
    """Zero-width assertion compiled to the END_OF_WORD opcode."""
    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"
class Failure(ZeroWidthBase):
    """Zero-width item that always fails to match (emits OP.FAILURE)."""
    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        return [(OP.FAILURE, )]
class Fuzzy(RegexBase):
    """A fuzzy-matching region with constraints on error counts and cost.

    `constraints` maps "d" (deletions), "i" (insertions), "s" (substitutions)
    and "e" (any error) to (min, max) pairs — max is None for unlimited —
    plus a "cost" dict of per-error coefficients and a "max" cost bound.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is itself compiled as fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits: (min, max) for each of d, i, s, e.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        # An optional "test" subpattern selects the extended fuzzy opcode.
        test = self.constraints.get("test")

        if test:
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        return UNLIMITED

    def _constraints_to_string(self):
        """Format the constraints roughly as they'd appear in a pattern."""
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)
class Grapheme(RegexBase):
    """Matches a single extended grapheme cluster."""

    def _compile(self, reverse, fuzzy):
        # Match at least one character, lazily extended until a grapheme
        # boundary is reached; the same pattern works in both directions.
        matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
          GraphemeBoundary()]))
        return matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print(INDENT * indent + "GRAPHEME")

    def max_width(self):
        return UNLIMITED
class GraphemeBoundary:
    """Zero-width grapheme-cluster boundary check.

    Not a RegexBase subclass; it provides only the compile() method that
    Grapheme._compile needs.
    """
    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]
class GreedyRepeat(RegexBase):
    """A greedy repeat of a subpattern, e.g. a*, a+, a{m,n}.

    `max_count` is None for an unbounded repeat.  Subclasses override the
    opcode (LazyRepeat) or the compilation (PossessiveRepeat).
    """

    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) keeps subclasses (LazyRepeat, PossessiveRepeat) intact.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        # Only a fixed-count repeat of an atomic subpattern is atomic.
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat may match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            # An empty subpattern compiles to nothing at all.
            return []

        return ([tuple(repeat)] + subpattern + [(OP.END, )])

    def dump(self, indent, reverse):
        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # Optional, so nothing is required; report only the offset bound.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None
class PossessiveRepeat(GreedyRepeat):
    """A repeat that never gives back what it has matched (atomic repeat)."""

    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        body = self.subpattern.compile(reverse, fuzzy)
        if not body:
            return []

        limit = UNLIMITED if self.max_count is None else self.max_count
        repeat = (self._opcode, self.min_count, limit)

        # Wrap the repeat in an ATOMIC block.
        return ([(OP.ATOMIC, ), repeat] + body + [(OP.END, ), (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        limit = "INF" if self.max_count is None else self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)
class Group(RegexBase):
    """A capture group wrapping a subpattern."""

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        # Set externally when this group is the target of a group call.
        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Dropping the capture leaves just the subpattern.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # Negative group numbers denote private groups; map to the public
        # number and to an index past the normal groups.
        public_group = private_group = self.group
        if private_group < 0:
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            # This group is callable; wrap its code in a CALL_REF block.
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class Keep(ZeroWidthBase):
    """Zero-width item compiled to the KEEP opcode."""
    _opcode = OP.KEEP
    _op_name = "KEEP"
class LazyRepeat(GreedyRepeat):
    """GreedyRepeat variant compiled with the LAZY_REPEAT opcode."""
    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"
class LookAround(RegexBase):
    """A lookahead/lookbehind assertion, positive or negative."""

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # The subpattern matches in the assertion's own direction, not the
        # enclosing pattern's.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        if self.positive and subpattern.is_empty():
            # A positive assertion on an empty subpattern always succeeds.
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        # Only a positive assertion scanning in the same direction as the
        # match constrains the first character.
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        # NOTE(review): the subpattern is compiled without passing `fuzzy` --
        # presumably compile() defaults it; confirm that is intended.
        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Assertions consume no characters.
        return 0
class LookAroundConditional(RegexBase):
    """A conditional subpattern keyed on a lookaround, e.g. (?(?=test)yes|no)."""

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # All three parts are optimised for the lookaround's direction.
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            # Only emit the alternative separator when the "no" side has code.
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # NOTE(review): `and` binds tighter than `or`, so this evaluates as
        # (subpattern and yes) or no -- confirm that precedence is intended.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None
class PrecompiledCode(RegexBase):
    """Wraps a pre-built list of code words, emitted as a single tuple."""

    def __init__(self, code):
        self.code = code

    def _compile(self, reverse, fuzzy):
        return [tuple(self.code)]
class Property(RegexBase):
    """A property test for a single codepoint, e.g. \\p{...}."""

    # Opcode selection keyed on (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False, encoding=0):
        RegexBase.__init__(self)
        # `value` packs the property id in the high 16 bits and its value in
        # the low 16 bits (see dump()).
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        self.encoding = encoding

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy with the given polarity/case/zero-width settings."""
        return Property(self.value, positive, case_flags, zerowidth,
          self.encoding)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
          ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        """Return whether codepoint `ch` has the property, honouring negation."""
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1
class Prune(ZeroWidthBase):
    """Zero-width item compiled to the PRUNE opcode."""
    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        return [(OP.PRUNE, )]
class Range(RegexBase):
    """An inclusive codepoint range [lower-upper], possibly negated and/or
    case-insensitive.
    """
    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
      (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
      (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
      (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
    _op_name = "RANGE"

    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
      zerowidth=False):
        # lower/upper are inclusive codepoints.
        RegexBase.__init__(self)
        self.lower = lower
        self.upper = upper
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self._key = (self.__class__, self.lower, self.upper, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Copy with different positivity/case/zero-width settings.
        return Range(self.lower, self.upper, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # When full case-folding applies, expand characters whose folding is
        # multi-codepoint into explicit string alternatives.

        # Is the range case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
          FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the range.
        items = []
        for ch in expanding_chars:
            if self.lower <= ord(ch) <= self.upper:
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                items.append(String([ord(c) for c in folded],
                  case_flags=self.case_flags))

        if not items:
            # We can fall back to simple case-folding.
            return self

        if len(items) < self.upper - self.lower + 1:
            # Not all the characters are covered by the full case-folding.
            items.insert(0, self)

        return Branch(items)

    def _compile(self, reverse, fuzzy):
        # Emit a single RANGE opcode with the modifier flags.
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
          self.upper)]

    def dump(self, indent, reverse):
        # lstrip("bu") drops any bytes/unicode literal prefix from ascii().
        display_lower = ascii(chr(self.lower)).lstrip("bu")
        display_upper = ascii(chr(self.upper)).lstrip("bu")
        print("{}RANGE {} {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display_lower, display_upper,
          CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        # Membership test, inverted for a negated range.
        return (self.lower <= ch <= self.upper) == self.positive

    def max_width(self):
        # A range always matches exactly one character.
        return 1
class RefGroup(RegexBase):
    """A backreference to a capture group, by number or by name.

    The group is stored as given (possibly a name) until fix_groups()
    resolves it to a group number.
    """
    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        # position is the offset in the pattern, used for error reports.
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve a name/numeric-string reference to a group number.

        Raises error for an unknown name or an out-of-range number.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # The group may have changed from a name to a number, so rebuild
        # the identity key.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # Backreferences are meaningless once captures are removed.
        # Bug fix: this previously passed the non-existent attribute
        # `self.pattern`, which raised AttributeError instead of the
        # intended parse error. No pattern text is available here, so
        # pass None (error() accepts a None pattern).
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        # The referenced group's width isn't known here.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SearchAnchor(ZeroWidthBase):
    """Zero-width anchor emitting the SEARCH_ANCHOR opcode (\G)."""
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"
class Sequence(RegexBase):
    """A concatenation of regex nodes, matched one after another."""

    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        # Propagate group resolution to every child.
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        # Adjacent positive, non-zero-width Character nodes (and existing
        # String/Literal nodes) with compatible case flags are merged into
        # single String nodes via _flush_characters.
        items = []
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless the neither
                    # the previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # Not packable: flush what we have and recurse into the child.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def contains_group(self):
        return any(s.contains_group() for s in self.items)

    def get_firstset(self, reverse):
        # Union child first-sets until one of them cannot match empty
        # (i.e. no longer contains None).
        # NOTE(review): `items.reverse()` mutates self.items in place when
        # reverse is True — confirm no later consumer depends on the
        # original order.
        fs = set()
        items = self.items
        if reverse:
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        # Every child could match empty.
        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Children are emitted in reverse order when compiling a reverse
        # pattern.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        """Turn the accumulated codepoints into Character/String nodes,
        appending them to `items` and clearing `characters` in place.
        """
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Split into full-casefold and simple-casefold chunks.
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        # Record every (start, end) span where an expanding folding occurs.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Emit IGNORECASE literals between the merged spans and
        # FULLIGNORECASE literals inside them.
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        """Merge overlapping/adjacent (start, end) spans; sorts in place."""
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        # Return the offset to, and node of, the first required string
        # found among the children, accumulating preceding offsets.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None
class SetBase(RegexBase):
    """Base class for character-set nodes (union, intersection, difference,
    symmetric difference). Subclasses supply _opcode, _op_name and matches().
    """

    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Rebuild as the same subclass and re-optimise immediately.
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        # Emit the set opcode, then each member, terminated by END.
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        # If full case-folding applies, add string alternatives for the set
        # members whose folding expands to multiple codepoints.

        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Usually 1, but with full case-folding a member may match a folded
        # multi-codepoint string.

        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SetDiff(SetBase):
    """Set difference: matches the first item but none of the others."""

    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        members = self.items

        # A difference of more than 2 items equals the first item minus the
        # union of all the rest.
        if len(members) > 2:
            members = [members[0], SetUnion(info, members[1 : ])]

        # A one-item difference is just that item.
        if len(members) == 1:
            only = members[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth)
            return only.optimise(info, reverse, in_set)

        self.items = tuple(m.optimise(info, reverse, in_set=True)
          for m in members)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # In the difference iff in the first member but not the second.
        in_diff = self.items[0].matches(ch) and not self.items[1].matches(ch)
        return in_diff == self.positive
class SetInter(SetBase):
    """Set intersection: matches only when every member matches."""

    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        flattened = []
        for member in self.items:
            opt = member.optimise(info, reverse, in_set=True)
            if isinstance(opt, SetInter) and opt.positive:
                # Intersection in intersection: absorb the nested operands.
                flattened.extend(opt.items)
            else:
                flattened.append(opt)

        # A one-item intersection is just that item.
        if len(flattened) == 1:
            only = flattened[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth)
            return only.optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        in_all = all(member.matches(ch) for member in self.items)
        return in_all == self.positive
class SetSymDiff(SetBase):
    """Set symmetric difference: matches when an odd number of members
    match.
    """
    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetSymDiff) and m.positive:
                # Symmetric difference in symmetric difference.
                items.extend(m.items)
            else:
                items.append(m)

        # A one-item symmetric difference is just that item.
        if len(items) == 1:
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR the membership results of all the members.
        m = False
        for i in self.items:
            m = m != i.matches(ch)

        return m == self.positive
class SetUnion(SetBase):
    """Set union: matches when any member matches."""

    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union.
                items.extend(m.items)
            elif isinstance(m, AnyAll):
                # A union containing "match anything" matches anything.
                return AnyAll()
            else:
                items.append(m)

        # Are there complementary properties?
        # A property and its negation together also match anything.
        properties = (set(), set())

        for m in items:
            if isinstance(m, Property):
                properties[m.positive].add((m.value, m.case_flags, m.zerowidth))

        if properties[0] & properties[1]:
            return AnyAll()

        if len(items) == 1:
            # A one-item union is just that item; combine the positivity of
            # the union with the item's own.
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Split the members into plain characters (grouped by positivity)
        # and everything else.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        # Emit each character group as a single CHARACTER or STRING operand.
        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive
class Skip(ZeroWidthBase):
    """The (*SKIP) backtracking-control verb."""
    _op_name = "SKIP"
    _opcode = OP.SKIP
class StartOfLine(ZeroWidthBase):
    """Zero-width anchor emitting the START_OF_LINE opcode (^)."""
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"
class StartOfLineU(StartOfLine):
    """StartOfLine variant emitting the START_OF_LINE_U opcode
    (presumably the Unicode line-ending form — confirm in _regex).
    """
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"
class StartOfString(ZeroWidthBase):
    """Zero-width anchor emitting the START_OF_STRING opcode (\A)."""
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"
class StartOfWord(ZeroWidthBase):
    """Zero-width anchor emitting the START_OF_WORD opcode (\m)."""
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"
class String(RegexBase):
    """A literal run of codepoints, optionally case-insensitive."""

    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        # characters is a sequence of codepoints.
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-compute the fully case-folded form for compilation.
            folded_characters = []
            for char in self.characters:
                folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
                folded_characters.extend(ord(c) for c in folded)
        else:
            folded_characters = self.characters

        self.folded_characters = tuple(folded_characters)
        # Set to True when this is the pattern's "required string".
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # The first-set is the first (or, reversed, the last) character.
        if reverse:
            pos = -1
        else:
            pos = 0
        return set([Character(self.characters[pos],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
          len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        # lstrip("bu") drops any bytes/unicode literal prefix from ascii().
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        return 0, self
class Literal(String):
    """A String produced from a literal run; differs only in debug output."""

    def dump(self, indent, reverse):
        text = "".join(map(chr, self.characters))
        # lstrip("bu") drops any bytes/unicode literal prefix from ascii().
        shown = ascii(text).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, shown,
          CASE_TEXT[self.case_flags]))
class StringSet(Branch):
    """A named list reference (\L<name>): a branch over the alternative
    strings supplied via the keyword arguments, longest first.
    """

    def __init__(self, info, name, case_flags=NOCASE):
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        # Register this (name, case_flags) pair so the pattern object knows
        # which named lists are used.
        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        # NOTE(review): `index` and `fold_flags` are computed but not used
        # in this method — confirm whether they are vestigial.
        index = self.info.named_lists_used[self.set_key]
        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        encoding = self.info.flags & _ALL_ENCODINGS
        fold_flags = encoding | case_flags

        # Build one Character sequence per member string.
        choices = []

        for string in items:
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class Source:
    "Scanner for the regular expression source string."
    # When ignore_space is True (VERBOSE mode), whitespace and #-comments
    # are skipped between significant characters. End-of-string is detected
    # via IndexError; a comment running to end-of-string via ValueError
    # (from str.index).
    def __init__(self, string):
        # Bytes patterns are decoded as Latin-1 so positions map 1:1.
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        self.ignore_space = False
        # An empty string/bytes of the original type.
        self.sep = string[ : 0]

    def get(self, override_ignore=False):
        """Return the next significant character, or an empty string at the
        end of the source.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        """Return up to `count` significant characters as a string."""
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True, keep_spaces=False):
        """Consume and return characters while their membership in
        `test_set` equals `include`.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space and not keep_spaces:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        """Like get_while, but discard the consumed characters."""
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        """If the source continues with `substring`, consume it and return
        True; otherwise leave the position unchanged and return False.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                # Match character by character, skipping whitespace and
                # comments before each one.
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        """Consume `substring` or raise error if it isn't next."""
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        """Return True if only whitespace/comments (if ignored) remain."""
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        pos += 1
                    elif string[pos] == "#":
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True
class Info:
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs={}):
        # NOTE(review): the mutable default `kwargs={}` is shared across
        # calls; it appears to be read-only here, but confirm.
        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = kwargs

        self.group_count = 0
        self.group_index = {}        # name -> group number
        self.group_name = {}         # group number -> name
        self.char_type = char_type
        self.named_lists_used = {}   # (name, case_flags) -> index
        self.open_groups = []        # stack of currently-open groups
        self.open_group_count = {}   # group number -> open count
        self.defined_groups = {}
        self.group_calls = []
        self.private_groups = {}     # negative alias -> real group number

    def open_group(self, name=None):
        """Allocate (or reuse, for a named group) a group number and push
        it onto the open-group stack. Returns the (possibly aliased)
        group number.
        """
        group = self.group_index.get(name)
        if group is None:
            # Find the next group number not already taken by a named group.
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        # Pop the most recently opened group.
        self.open_groups.pop()

    def is_open_group(self, name):
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups
def _check_group_features(info, parsed):
    """Checks whether the reverse and fuzzy features of the group calls match
    the groups which they call.

    Fills in info.call_refs and info.additional_groups, and assigns each
    group call its call_ref index.
    """
    call_refs = {}
    additional_groups = []
    for call, reverse, fuzzy in info.group_calls:
        # Look up the reference of this group call.
        key = (call.group, reverse, fuzzy)
        ref = call_refs.get(key)
        if ref is None:
            # This group doesn't have a reference yet, so look up its features.
            if call.group == 0:
                # Calling the pattern as a whole.
                rev = bool(info.flags & REVERSE)
                fuz = isinstance(parsed, Fuzzy)
                if (rev, fuz) != (reverse, fuzzy):
                    # The pattern as a whole doesn't have the features we want,
                    # so we'll need to make a copy of it with the desired
                    # features.
                    additional_groups.append((CallRef(len(call_refs), parsed),
                      reverse, fuzzy))
            else:
                # Calling a capture group.
                def_info = info.defined_groups[call.group]
                group = def_info[0]
                if def_info[1 : ] != (reverse, fuzzy):
                    # The group doesn't have the features we want, so we'll
                    # need to make a copy of it with the desired features.
                    additional_groups.append((group, reverse, fuzzy))

            ref = len(call_refs)
            call_refs[key] = ref

        call.call_ref = ref

    info.call_refs = call_refs
    info.additional_groups = additional_groups
def _get_required_string(parsed, flags):
    """Return (offset, characters, case_flags) for the parsed pattern's
    required string, or (0, (), 0) if it has none.
    """
    searching_reversed = bool(flags & REVERSE)
    req_offset, required = parsed.get_required_string(searching_reversed)

    if not required:
        return 0, (), 0

    # Mark the node so compilation flags it as required.
    required.required = True

    # An unbounded offset is reported as -1.
    if req_offset >= UNLIMITED:
        req_offset = -1

    req_flags = required.case_flags
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags
class Scanner:
    """A lexical scanner: compiles a lexicon of (pattern, action) pairs into
    one alternation and tokenises strings with it.
    """

    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern.
        # Each phrase becomes group g+1 so lastindex identifies the rule.
        info = Info(flags)
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(patterns))

    def scan(self, string):
        """Tokenise `string`.

        Returns (results, remainder): the list of non-None action results
        and the unconsumed tail of the string.
        """
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            # Stop on a zero-width match to avoid an infinite loop.
            if i == j:
                break
            # lastindex identifies which phrase matched.
            action = self.lexicon[m.lastindex - 1][1]
            if hasattr(action, '__call__'):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]
# Get the known properties dict from the C extension: maps each property
# name/alias to (property id, {value name: value id}).
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict: property id -> (longest name,
# {value id: longest value name}). When names tie in length, the first one
# kept wins (max returns its first maximal argument).
PROPERTY_NAMES = {}
for full_name, (ident, value_map) in PROPERTIES.items():
    best_name, names_by_value = PROPERTY_NAMES.get(ident, ("", {}))
    PROPERTY_NAMES[ident] = (max(best_name, full_name, key=len),
      names_by_value)

    for value_name, value_ident in value_map.items():
        names_by_value[value_ident] = max(names_by_value.get(value_ident,
          ""), value_name, key=len)
# Character escape sequences: maps the letter after the backslash to the
# control character it denotes.
CHARACTER_ESCAPES = dict(
    a="\a",  # bell
    b="\b",  # backspace
    f="\f",  # form feed
    n="\n",  # newline
    r="\r",  # carriage return
    t="\t",  # horizontal tab
    v="\v",  # vertical tab
)
# Encoding selectors for escape-sequence semantics (ASCII-only vs Unicode).
ASCII_ENCODING, UNICODE_ENCODING = 1, 2
# Predefined character set escape sequences (\d, \s, \w, etc.), using the
# default encoding; lowercase letters are the positive sets, uppercase the
# negated ones.
CHARSET_ESCAPES = {
    ch: lookup_property(None, prop, positive)
    for ch, prop, positive in (
        ("d", "Digit", True),
        ("D", "Digit", False),
        ("h", "Blank", True),
        ("s", "Space", True),
        ("S", "Space", False),
        ("w", "Word", True),
        ("W", "Word", False),
    )
}
# ASCII-restricted versions of the character set escapes; \h has no
# ASCII-specific form, so the default entry is inherited unchanged.
ASCII_CHARSET_ESCAPES = {
    **CHARSET_ESCAPES,
    "d": lookup_property(None, "Digit", True, encoding=ASCII_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=ASCII_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=ASCII_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=ASCII_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=ASCII_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=ASCII_ENCODING),
}
# Unicode versions of the character set escapes; \h has no Unicode-specific
# form, so the default entry is inherited unchanged.
UNICODE_CHARSET_ESCAPES = {
    **CHARSET_ESCAPES,
    "d": lookup_property(None, "Digit", True, encoding=UNICODE_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=UNICODE_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=UNICODE_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=UNICODE_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=UNICODE_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=UNICODE_ENCODING),
}
# Positional (zero-width) escape sequences: anchors and boundaries.
POSITION_ESCAPES = dict(
    A=StartOfString(),
    b=Boundary(),
    B=Boundary(False),   # negated word boundary
    K=Keep(),
    m=StartOfWord(),
    M=EndOfWord(),
    Z=EndOfString(),
)
# ASCII-restricted positional escapes; \A, \K and \Z are encoding-neutral
# and inherited unchanged.
ASCII_POSITION_ESCAPES = {
    **POSITION_ESCAPES,
    "b": Boundary(encoding=ASCII_ENCODING),
    "B": Boundary(False, encoding=ASCII_ENCODING),
    "m": StartOfWord(encoding=ASCII_ENCODING),
    "M": EndOfWord(encoding=ASCII_ENCODING),
}
# Unicode positional escapes; \A, \K and \Z are encoding-neutral and
# inherited unchanged.
UNICODE_POSITION_ESCAPES = {
    **POSITION_ESCAPES,
    "b": Boundary(encoding=UNICODE_ENCODING),
    "B": Boundary(False, encoding=UNICODE_ENCODING),
    "m": StartOfWord(encoding=UNICODE_ENCODING),
    "M": EndOfWord(encoding=UNICODE_ENCODING),
}
# Positional escape sequences used when the WORD flag is set: word-boundary
# escapes switch to the "default word boundary" rules.
WORD_POSITION_ESCAPES = {
    **POSITION_ESCAPES,
    "b": DefaultBoundary(),
    "B": DefaultBoundary(False),
    "m": DefaultStartOfWord(),
    "M": DefaultEndOfWord(),
}
# Regex control verbs, e.g. (*FAIL)/(*F), (*PRUNE), (*SKIP). "F" is an
# alias for "FAIL"; each entry deliberately gets its own node instance.
VERBS = dict(
    FAIL=Failure(),
    F=Failure(),
    PRUNE=Prune(),
    SKIP=Skip(),
)