Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/regex/_regex_core.py: 26%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#
2# Secret Labs' Regular Expression Engine core module
3#
4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
5#
6# This version of the SRE library can be redistributed under CNRI's
7# Python 1.6 license. For any other use, please contact Secret Labs
8# AB (info@pythonware.com).
9#
10# Portions of this engine have been developed in cooperation with
11# CNRI. Hewlett-Packard provided funding for 1.6 integration and
12# other compatibility work.
13#
14# 2010-01-16 mrab Python front-end re-written and extended
16import enum
17import string
18import unicodedata
19from collections import defaultdict
21from regex import _regex
23__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
24 "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
25 "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
26 "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
27 "Scanner", "RegexFlag"]
29# The regex exception.
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        # The pattern may be a str or a bytes object; pick a matching newline.
        newline = '\n' if isinstance(pattern, str) else b'\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            # Line and column are 1-based. rfind returns -1 when there is no
            # preceding newline, which makes colno == pos + 1.
            self.lineno = 1 + pattern.count(newline, 0, pos)
            self.colno = pos - pattern.rfind(newline, 0, pos)
            message = f"{message} at position {pos}"
            if newline in pattern:
                message += f" (line {self.lineno}, column {self.colno})"
        super().__init__(message)
59# The exception for when a positional flag has been turned on in the old
60# behaviour.
class _UnscopedFlagSet(Exception):
    """Raised when a global flag is turned on positionally under the old
    behaviour; raised with the new global flags as its argument so the
    caller can reparse the pattern (see parse_flags_subpattern)."""
    pass
64# The exception for when parsing fails and we want to try something else.
class ParseError(Exception):
    """Raised internally when parsing fails and another interpretation of
    the text should be tried instead."""
    pass
68# The exception for when there isn't a valid first set.
class _FirstSetError(Exception):
    """Raised when there isn't a valid first set."""
    pass
72# Flags.
class RegexFlag(enum.IntFlag):
    """The regex flags. Each long name (eg. IGNORECASE) has a one- or
    two-letter alias (eg. I) with the same value."""
    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first
                              # fuzzy match.
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A single named flag prints as, eg., 'regex.I'.
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        value = self._value_
        members = []
        negative = value < 0

        if negative:
            # Show a negative value as the complement of its positive
            # counterpart, eg. ~(regex.I|regex.M).
            value = ~value

        for m in self.__class__:
            if value & m._value_:
                value &= ~m._value_
                members.append('regex.%s' % m._name_)

        if value:
            # Any remaining bits don't correspond to a named flag.
            members.append(hex(value))

        res = '|'.join(members)

        if negative:
            if len(members) > 1:
                res = '~(%s)' % res
            else:
                res = '~%s' % res

        return res

    # Use the plain object str, not IntFlag's.
    __str__ = object.__str__
# Put the flags into the module namespace. Being explicit here helps tools like
# linters and IDEs understand the code better.
ASCII = RegexFlag.ASCII
BESTMATCH = RegexFlag.BESTMATCH
DEBUG = RegexFlag.DEBUG
DOTALL = RegexFlag.DOTALL
ENHANCEMATCH = RegexFlag.ENHANCEMATCH
FULLCASE = RegexFlag.FULLCASE
IGNORECASE = RegexFlag.IGNORECASE
LOCALE = RegexFlag.LOCALE
MULTILINE = RegexFlag.MULTILINE
POSIX = RegexFlag.POSIX
REVERSE = RegexFlag.REVERSE
TEMPLATE = RegexFlag.TEMPLATE
UNICODE = RegexFlag.UNICODE
VERBOSE = RegexFlag.VERBOSE
VERSION0 = RegexFlag.VERSION0
VERSION1 = RegexFlag.VERSION1
WORD = RegexFlag.WORD
# The short aliases.
A = RegexFlag.A
B = RegexFlag.B
D = RegexFlag.D
E = RegexFlag.E
F = RegexFlag.F
I = RegexFlag.I
L = RegexFlag.L
M = RegexFlag.M
P = RegexFlag.P
R = RegexFlag.R
S = RegexFlag.S
U = RegexFlag.U
V0 = RegexFlag.V0
V1 = RegexFlag.V1
W = RegexFlag.W
X = RegexFlag.X
T = RegexFlag.T

# The default behaviour version.
DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

# Character classes used by the parser.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# The empty string marks the end of the pattern as "special".
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# FULLCASE on its own (without IGNORECASE) is treated as no case flags.
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
211# The names of the opcodes.
212OPCODES = """
213FAILURE
214SUCCESS
215ANY
216ANY_ALL
217ANY_ALL_REV
218ANY_REV
219ANY_U
220ANY_U_REV
221ATOMIC
222BOUNDARY
223BRANCH
224CALL_REF
225CHARACTER
226CHARACTER_IGN
227CHARACTER_IGN_REV
228CHARACTER_REV
229CONDITIONAL
230DEFAULT_BOUNDARY
231DEFAULT_END_OF_WORD
232DEFAULT_START_OF_WORD
233END
234END_OF_LINE
235END_OF_LINE_U
236END_OF_STRING
237END_OF_STRING_LINE
238END_OF_STRING_LINE_U
239END_OF_WORD
240FUZZY
241GRAPHEME_BOUNDARY
242GREEDY_REPEAT
243GROUP
244GROUP_CALL
245GROUP_EXISTS
246KEEP
247LAZY_REPEAT
248LOOKAROUND
249NEXT
250PROPERTY
251PROPERTY_IGN
252PROPERTY_IGN_REV
253PROPERTY_REV
254PRUNE
255RANGE
256RANGE_IGN
257RANGE_IGN_REV
258RANGE_REV
259REF_GROUP
260REF_GROUP_FLD
261REF_GROUP_FLD_REV
262REF_GROUP_IGN
263REF_GROUP_IGN_REV
264REF_GROUP_REV
265SEARCH_ANCHOR
266SET_DIFF
267SET_DIFF_IGN
268SET_DIFF_IGN_REV
269SET_DIFF_REV
270SET_INTER
271SET_INTER_IGN
272SET_INTER_IGN_REV
273SET_INTER_REV
274SET_SYM_DIFF
275SET_SYM_DIFF_IGN
276SET_SYM_DIFF_IGN_REV
277SET_SYM_DIFF_REV
278SET_UNION
279SET_UNION_IGN
280SET_UNION_IGN_REV
281SET_UNION_REV
282SKIP
283START_OF_LINE
284START_OF_LINE_U
285START_OF_STRING
286START_OF_WORD
287STRING
288STRING_FLD
289STRING_FLD_REV
290STRING_IGN
291STRING_IGN_REV
292STRING_REV
293FUZZY_EXT
294"""
# Define the opcodes in a namespace.
class Namespace:
    """An empty namespace to hold the opcode constants."""
    pass

OP = Namespace()
# Attach each opcode name with its index, eg. OP.FAILURE == 0.
for i, op in enumerate(OPCODES.split()):
    setattr(OP, op, i)
def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
    """Make room in the given cache.

    Args:
        cache_dict: The cache dictionary to modify.
        args_dict: The dictionary of named list args used by patterns;
          rebuilt to match the entries that survive the shrink.
        locale_sensitive: The dictionary of locale-sensitivity entries,
          keyed by (pattern_type, pattern); also rebuilt to match.
        max_length: Maximum # of entries in cache_dict before it is shrunk.
        divisor: Cache will shrink to max_length - 1/divisor*max_length items.
    """
    # Toss out a fraction of the entries at random to make room for new ones.
    # A random algorithm was chosen as opposed to simply cache_dict.popitem()
    # as popitem could penalize the same regular expression repeatedly based
    # on its internal hash value. Being random should spread the cache miss
    # love around.
    cache_keys = tuple(cache_dict.keys())
    overage = len(cache_keys) - max_length
    if overage < 0:
        # Cache is already within limits. Normally this should not happen
        # but it could due to multithreading.
        return

    number_to_toss = max_length // divisor + overage

    # The import is done here to avoid a circular dependency.
    import random
    if not hasattr(random, 'sample'):
        # Do nothing while resolving the circular dependency:
        #  re->random->warnings->tokenize->string->re
        return

    for doomed_key in random.sample(cache_keys, number_to_toss):
        try:
            del cache_dict[doomed_key]
        except KeyError:
            # Ignore problems if the cache changed from another thread.
            pass

    # Rebuild the arguments and locale-sensitivity dictionaries.
    args_dict.clear()
    sensitivity_dict = {}
    for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
        args_dict[pattern, pattern_type, flags, default_version, locale] = args
        try:
            sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
        except KeyError:
            pass

    locale_sensitive.clear()
    locale_sensitive.update(sensitivity_dict)
def _fold_case(info, string):
    "Folds the case of a string."
    # When none of the encoding flags (ASCII/LOCALE/UNICODE) has been set
    # explicitly, fall back to the guessed encoding.
    flags = info.flags
    if not flags & _ALL_ENCODINGS:
        flags |= info.guess_encoding
    return _regex.fold_case(flags, string)
def is_cased_i(info, char):
    """Checks whether a character is cased, taking the flags from `info`.

    Delegates to is_cased_f so that the two helpers cannot drift apart;
    they differ only in where the flags come from.
    """
    return is_cased_f(info.flags, char)
def is_cased_f(flags, char):
    "Checks whether a character is cased."
    # A character is cased if it has more than one case variant.
    variants = _regex.get_all_cases(flags, char)
    return len(variants) > 1
def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)
    # An empty or match-anything firstset is useless for optimisation.
    if not checked or isinstance(checked, AnyAll):
        return []
    return checked.compile(reverse)
def _check_firstset(info, reverse, fs):
    """Checks the firstset for the pattern, returning an optimised set node
    or None if no usable firstset can be built."""
    # A None member means some alternative contributed no usable firstset.
    if not fs or None in fs:
        return None

    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for i in fs:
        if isinstance(i, Character) and not i.positive:
            return None

#        if i.case_flags:
#            if isinstance(i, Character):
#                if is_cased_i(info, i.value):
#                    return []
#            elif isinstance(i, SetBase):
#                return []
        case_flags |= i.case_flags
        # Store members case-insensitively; the union carries the flags.
        members.add(i.with_flags(case_flags=NOCASE))

    # Full case-folding would make the firstset unreliable, so give up.
    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    fs = fs.optimise(info, reverse, in_set=True)

    return fs
411def _flatten_code(code):
412 "Flattens the code from a list of tuples."
413 flat_code = []
414 for c in code:
415 flat_code.extend(c)
417 return flat_code
def make_case_flags(info):
    "Makes the case flags."
    case_flags = info.flags & CASE_FLAGS
    # Full case-folding is a Unicode feature, so turn it off under ASCII.
    if info.flags & ASCII:
        case_flags &= ~FULLCASE
    return case_flags
def make_character(info, value, in_set=False):
    "Makes a character literal."
    if not in_set:
        # Outside a set the literal carries the current case flags.
        return Character(value, case_flags=make_case_flags(info))
    # A character set is built case-sensitively.
    return Character(value)
def make_ref_group(info, name, position):
    "Makes a group reference."
    case_flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=case_flags)
def make_string_set(info, name):
    "Makes a string set."
    case_flags = make_case_flags(info)
    return StringSet(info, name, case_flags=case_flags)
def make_property(info, prop, in_set):
    "Makes a property."
    if not in_set:
        # Outside a set the property carries the current case flags.
        return prop.with_flags(case_flags=make_case_flags(info))
    return prop
def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    # Collect one sequence per alternative, separated by "|".
    alternatives = [parse_sequence(source, info)]
    while source.match("|"):
        alternatives.append(parse_sequence(source, info))

    # A lone alternative doesn't need a Branch wrapper.
    if len(alternatives) > 1:
        return Branch(alternatives)
    return alternatives[0]
def parse_sequence(source, info):
    "Parses a sequence, eg. 'abc'."
    # None is used as a sentinel marking positions where a quantifier would
    # have nothing to repeat; the sentinels are stripped at the end.
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # Inline flags were parsed; they may change the case
                    # flags for the rest of the sequence.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)
                    if constraints:
                        # It _is_ a fuzzy constraint.
                        apply_constraint(source, info, constraints, case_flags,
                          saved_pos, sequence)
                        sequence.append(None)
                    else:
                        # The element was just a literal.
                        sequence.append(Character(ord(ch),
                          case_flags=case_flags))
            else:
                # A literal.
                sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    # Strip the None sentinels before building the sequence node.
    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)
def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    """Applies a quantifier to the last element of `sequence` in place.

    Raises error if there's nothing to repeat or the previous element was
    itself a quantifier.
    """
    element = sequence.pop()
    if element is None:
        # The None sentinel is on top: either the previous item was a
        # quantifier, or the sequence is empty.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeated = LazyRepeat
    elif ch == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeated = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat.
        source.pos = saved_pos
        repeated = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number of
    # repeats is fixed at 1.
    if not element.is_empty() and (min_count != 1 or max_count != 1):
        element = repeated(element, min_count, max_count)

    sequence.append(element)
def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    "Applies a fuzzy constraint to the last element of `sequence` in place."
    element = sequence.pop()
    if element is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    if isinstance(element, Group):
        # If a group is marked as fuzzy then put all of the fuzzy part in
        # the group.
        element.subpattern = Fuzzy(element.subpattern, constraints)
    else:
        element = Fuzzy(element, constraints)

    sequence.append(element)
# The quantifiers with fixed counts: optional, zero-or-more, one-or-more.
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
def parse_quantifier(source, info, ch):
    "Parses a quantifier, returning (min, max) counts, or None."
    # "?", "*" and "+" have fixed counts.
    counts = _QUANTIFIERS.get(ch)
    if counts:
        return counts

    # "{" may start a counted repeat, eg. 'a{2,3}'.
    if ch == "{":
        return parse_limited_quantifier(source)

    return None
def is_above_limit(count):
    "Checks whether a count is above the maximum."
    if count is None:
        # An unbounded count is represented by None.
        return False
    return count >= UNLIMITED
def parse_limited_quantifier(source):
    "Parses a limited quantifier, eg. the '{2,3}' of 'a{2,3}'."
    saved_pos = source.pos
    min_digits = parse_count(source)
    if source.match(","):
        max_digits = parse_count(source)

        # No minimum means 0 and no maximum means unlimited.
        min_count = int(min_digits) if min_digits else 0
        max_count = int(max_digits) if max_digits else None
    else:
        # A single count with no comma, eg. '{3}'.
        if not min_digits:
            source.pos = saved_pos
            return None

        min_count = max_count = int(min_digits)

    if not source.match("}"):
        # Not a quantifier after all; back out.
        source.pos = saved_pos
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, saved_pos)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          saved_pos)

    return min_count, max_count
def parse_fuzzy(source, info, ch, case_flags):
    """Parses a fuzzy setting, if present.

    Returns a dict of constraints, or None if what follows isn't a fuzzy
    setting (the caller then treats the text as literals).
    """
    saved_pos = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all; back out.
        source.pos = saved_pos
        return None

    if source.match(":"):
        # An optional test pattern follows the constraints.
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints
def parse_fuzzy_item(source, constraints):
    "Parses a fuzzy setting item."
    item_pos = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # It wasn't a cost constraint; rewind and try a cost equation.
        source.pos = item_pos
        parse_cost_equation(source, constraints)
def parse_cost_constraint(source, constraints):
    "Parses a cost constraint, eg. 'e<=2' or '1<=e<=2'."
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits?
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()
def parse_cost_limit(source):
    "Parses a cost limit."
    cost_pos = source.pos
    digits = parse_count(source)
    # parse_count returns a (possibly empty) run of decimal digits, so the
    # conversion can only fail when no digits were present.
    if digits.isdigit():
        return int(digits)
    raise error("bad fuzzy cost limit", source.string, cost_pos)
def parse_constraint(source, constraints, ch):
    "Parses a constraint."
    # Only "d", "e", "i" and "s" are valid, and each at most once.
    if ch not in "deis" or ch in constraints:
        raise ParseError()
    return ch
def parse_fuzzy_compare(source):
    "Parses a cost comparator."
    # True for an inclusive limit ("<="), False for an exclusive one ("<"),
    # None when no comparator is present.
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None
def parse_cost_equation(source, constraints):
    "Parses a cost equation, eg. '2i+2d+1s<=4'."
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}

    parse_cost_term(source, cost)
    while source.match("+"):
        parse_cost_term(source, cost)

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    max_cost = int(parse_count(source))

    # Inclusive or exclusive limit?
    if not max_inc:
        max_cost -= 1

    if max_cost < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = max_cost

    constraints["cost"] = cost
def parse_cost_term(source, cost):
    "Parses a cost equation term, eg. '2i'."
    coeff = parse_count(source)
    ch = source.get()
    if ch not in "dis":
        raise ParseError()
    if ch in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient defaults to 1.
    cost[ch] = int(coeff) if coeff else 1
def parse_fuzzy_test(source, info, case_flags):
    "Parses the test pattern of a fuzzy setting."
    saved_pos = source.pos
    ch = source.get()
    if ch in SPECIAL_CHARS:
        if ch == "\\":
            # An escape sequence outside a set.
            return parse_escape(source, info, False)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                return AnyAll()
            elif info.flags & WORD:
                return AnyU()
            else:
                return Any()
        elif ch == "[":
            # A character set.
            return parse_set(source, info)
        else:
            raise error("expected character set", source.string, saved_pos)
    elif ch:
        # A literal.
        return Character(ord(ch), case_flags=case_flags)
    else:
        # The end of the pattern was reached.
        raise error("expected character set", source.string, saved_pos)
def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the (possibly empty) run of decimal digits at the current
    # position.
    return source.get_while(DIGITS)
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.
    """
    saved_pos = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Any inline flags are scoped to the group.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)
        if (ch == "+" or ch == "-") and source.peek() in DIGITS:
            # (?+n)... or (?-n)...: a relative call to a group.
            return parse_rel_call_group(source, info, ch, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Any inline flags are scoped to the group.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)
def parse_extension(source, info):
    "Parses a Python extension, ie. the text after '(?P'."
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Any inline flags are scoped to the group.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)
def parse_comment(source):
    "Parses a comment, ie. the text after '(?#'. Returns None."
    while True:
        saved_pos = source.pos
        c = source.get(True)

        if not c or c == ")":
            break

        if c == "\\":
            # Skip the escaped character as well.
            c = source.get(True)

    # Rewind to before the last character read, then require the closing ")".
    source.pos = saved_pos
    source.expect(")")

    return None
def parse_lookaround(source, info, behind, positive):
    """Parses a lookaround.

    `behind` is True for a lookbehind; `positive` is False for a negative
    lookaround.
    """
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Any inline flags are scoped to the lookaround.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, subpattern)
def parse_conditional(source, info):
    "Parses a conditional subpattern, ie. the text after '(?('."
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

        source.pos = saved_pos
        raise error("expected lookaround conditional", source.string,
          source.pos)

    # Otherwise the condition is a group name or number.
    source.pos = saved_pos
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            no_branch = Sequence()

        source.expect(")")
    finally:
        # Any inline flags are scoped to the conditional.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)
def parse_lookaround_conditional(source, info, behind, positive):
    """Parses a conditional subpattern whose condition is a lookaround.

    `behind` is True for a lookbehind; `positive` is False for a negative
    lookaround.
    """
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Any inline flags are scoped to the lookaround.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    if source.match("|"):
        no_branch = parse_sequence(source, info)
    else:
        no_branch = Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, subpattern, yes_branch,
      no_branch)
def parse_atomic(source, info):
    "Parses an atomic subpattern, ie. the text after '(?>'."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Any inline flags are scoped to the atomic group.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(subpattern)
def parse_common(source, info):
    "Parses a common groups branch, ie. the text after '(?|'."
    # Capture group numbers in different branches can reuse the group numbers.
    initial_group_count = info.group_count
    branches = [parse_sequence(source, info)]
    final_group_count = info.group_count
    while source.match("|"):
        # Each branch restarts numbering from the initial count.
        info.group_count = initial_group_count
        branches.append(parse_sequence(source, info))
        final_group_count = max(final_group_count, info.group_count)

    info.group_count = final_group_count
    source.expect(")")

    if len(branches) == 1:
        return branches[0]
    return Branch(branches)
def parse_call_group(source, info, ch, pos):
    "Parses a call to a group."
    # "R" recurses into the whole pattern, ie. group 0.
    group = "0" if ch == "R" else ch + source.get_while(DIGITS)
    source.expect(")")
    return CallGroup(info, group, pos)
def parse_rel_call_group(source, info, ch, pos):
    "Parses a relative call to a group, eg. '(?+2)'."
    digits = source.get_while(DIGITS)
    if not digits:
        raise error("missing relative group number", source.string, source.pos)

    offset = int(digits)
    if ch == "+":
        # Count forwards from the number of groups seen so far.
        group = info.group_count + offset
    else:
        # Count backwards; an offset of 1 refers to the current group count.
        group = info.group_count - offset + 1

    if group <= 0:
        raise error("invalid relative group number", source.string, source.pos)

    source.expect(")")

    return CallGroup(info, group, pos)
def parse_call_named_group(source, info, pos):
    "Parses a call to a named group."
    group_name = parse_name(source)
    source.expect(")")
    return CallGroup(info, group_name, pos)
def parse_flag_set(source):
    "Parses a set of inline flags, returning them OR'ed together."
    flags = 0

    try:
        while True:
            saved_pos = source.pos
            ch = source.get()
            if ch == "V":
                # The version flags are two characters: "V0" or "V1".
                ch += source.get()
            flags |= REGEX_FLAGS[ch]
    except KeyError:
        # The character isn't a flag; rewind to before it.
        source.pos = saved_pos

    return flags
def parse_flags(source, info):
    "Parses flags being turned on/off, returning (flags_on, flags_off)."
    flags_on = parse_flag_set(source)
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)
    else:
        flags_off = 0

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off
def parse_subpattern(source, info, flags_on, flags_off):
    "Parses a subpattern with scoped flags, eg. the '...' of '(?i:...)'."
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set.
    # NOTE(review): the test uses info.flags, so an encoding flag inherited
    # from the enclosing scope is cleared even when flags_on turns on no
    # encoding flag of its own — confirm whether `flags_on & _ALL_ENCODINGS`
    # was intended for both the test and the OR below.
    if info.flags & (ASCII | LOCALE | UNICODE):
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the flags of the enclosing scope.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern
def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's inline flags, then None is returned.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # "(?flags:...)": a subpattern with scoped flags.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # "(?flags)": positional inline flags.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)
def parse_positional_flags(source, info, flags_on, flags_off):
    "Parses positional flags, which apply to the rest of the pattern."
    new_flags = (info.flags | flags_on) & ~flags_off
    info.flags = new_flags
    source.ignore_space = bool(new_flags & VERBOSE)
def parse_name(source, allow_numeric=False, allow_group_0=False):
    "Parses a group name, optionally allowing a group number."
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        # A numeric "name" is only valid where group numbers are accepted.
        min_group = 0 if allow_group_0 else 1
        if not allow_numeric or int(name) < min_group:
            raise error("bad character in group name", source.string,
              source.pos)
    elif not name.isidentifier():
        raise error("bad character in group name", source.string, source.pos)

    return name
def is_octal(string):
    "Checks whether a string is octal."
    for ch in string:
        if ch not in OCT_DIGITS:
            return False
    return True
def is_decimal(string):
    "Checks whether a string is decimal."
    for ch in string:
        if ch not in DIGITS:
            return False
    return True
def is_hexadecimal(string):
    "Checks whether a string is hexadecimal."
    for ch in string:
        if ch not in HEX_DIGITS:
            return False
    return True
def parse_escape(source, info, in_set):
    """Parses an escape sequence (the backslash has already been consumed).

    in_set: whether we're inside a character set, which disables the
    group-reference, anchor and positional escapes.
    """
    # Read the escaped character literally, even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending: CRLF as a unit, or any single line-ending character.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            # Unicode adds NEL, LINE SEPARATOR and PARAGRAPH SEPARATOR.
            charset.extend([0x85, 0x2028, 0x2029])

        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            # The escape tables are checked in priority order: WORD beats
            # ASCII beats UNICODE beats the default.
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # A character-class escape such as \d, \s or \w.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # A single-character escape such as \n or \t.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
def parse_numeric_escape(source, info, ch, in_set):
    """Parses a numeric escape sequence, which is either an octal escape or a
    group reference, disambiguated by the digits that follow.
    """
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            # Narrow encodings only keep the low byte.
            encoding = info.flags & _ALL_ENCODINGS
            if encoding == ASCII or encoding == LOCALE:
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference; put back the character that wasn't a digit.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)
def parse_octal_escape(source, info, digits, in_set):
    """Parses an octal escape sequence of at most 3 digits.

    digits: a list already containing the first digit read by the caller.
    """
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Put back the character that wasn't an octal digit.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # int() failed, so the first digit must have been 8 or 9.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)
def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    """Parses a hex escape sequence of exactly expected_len digits.

    esc/type: the escape letter (e.g. "x", "u"), used in error messages.
    """
    saved_pos = source.pos
    digits = []
    for i in range(expected_len):
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, saved_pos)
        digits.append(ch)

    try:
        value = int("".join(digits), 16)
    except ValueError:
        pass
    else:
        # Only codepoints within the Unicode range are accepted.
        if value < 0x110000:
            return make_character(info, value, in_set)

    # Bad hex escape.
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, saved_pos)
def parse_group_ref(source, info):
    "Parses a group reference, e.g. \\g<name>."
    source.expect("<")
    name_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")

    # A group can't be referenced while it's still being parsed.
    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, name_pos)
def parse_string_set(source, info):
    "Parses a string set reference, e.g. \\L<name>."
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")

    # The named list must have been supplied as a keyword argument.
    known = name is not None and name in info.kwargs
    if not known:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, name)
def parse_named_char(source, info, in_set):
    "Parses a named character, e.g. \\N{LATIN SMALL LETTER A}."
    start = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                codepoint = unicodedata.lookup(name)
                return make_character(info, ord(codepoint), in_set)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character; treat the "N" as a literal.
    source.pos = start
    return make_character(info, ord("N"), in_set)
def parse_property(source, info, positive, in_set):
    """Parses a Unicode property, e.g. \\p{Letter} or \\pL.

    positive: True for \\p, False for \\P.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        # A braced property, possibly negated with "^", e.g. \p{^Letter}.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            if info.flags & ASCII:
                encoding = ASCII_ENCODING
            elif info.flags & UNICODE:
                encoding = UNICODE_ENCODING
            else:
                encoding = 0

            # \P{^...} double-negates back to positive.
            prop = lookup_property(prop_name, name, positive != negate, source,
              encoding=encoding)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        if info.flags & ASCII:
            encoding = ASCII_ENCODING
        elif info.flags & UNICODE:
            encoding = UNICODE_ENCODING
        else:
            encoding = 0

        prop = lookup_property(None, ch, positive, source, encoding=encoding)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)
def parse_property_name(source):
    """Parses a property name, which may be qualified, e.g. "Script=Latin".

    Returns (prop_name, name); prop_name is None for an unqualified name.
    """
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Rewind past anything that wasn't consumed as part of the name.
    source.pos = saved_pos
    return prop_name, name
def parse_set(source, info):
    "Parses a character set (the opening '[' has already been consumed)."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # Whitespace is significant inside a set, even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    # Negative set?
    negate = source.match("^")
    try:
        # VERSION0 has no set operators; VERSION1 supports ||, ~~, && and --.
        if version == VERSION0:
            item = parse_set_imp_union(source, info)
        else:
            item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    item = item.with_flags(case_flags=make_case_flags(info))

    return item
def parse_set_union(source, info):
    "Parses a set union ([x||y])."
    operands = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        operands.append(parse_set_symm_diff(source, info))

    # A union of a single operand is just that operand.
    return operands[0] if len(operands) == 1 else SetUnion(info, operands)
def parse_set_symm_diff(source, info):
    "Parses a set symmetric difference ([x~~y])."
    operands = [parse_set_inter(source, info)]
    while source.match("~~"):
        operands.append(parse_set_inter(source, info))

    # A difference of a single operand is just that operand.
    return operands[0] if len(operands) == 1 else SetSymDiff(info, operands)
def parse_set_inter(source, info):
    "Parses a set intersection ([x&&y])."
    operands = [parse_set_diff(source, info)]
    while source.match("&&"):
        operands.append(parse_set_diff(source, info))

    # An intersection of a single operand is just that operand.
    return operands[0] if len(operands) == 1 else SetInter(info, operands)
def parse_set_diff(source, info):
    "Parses a set difference ([x--y])."
    operands = [parse_set_imp_union(source, info)]
    while source.match("--"):
        operands.append(parse_set_imp_union(source, info))

    # A difference of a single operand is just that operand.
    return operands[0] if len(operands) == 1 else SetDiff(info, operands)
def parse_set_imp_union(source, info):
    "Parses a set implicit union ([xy]): members until ']' or a set operator."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set.
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)
def parse_set_member(source, info):
    "Parses a member in a character set: a single item or a range like a-z."
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    # Only a positive single character can start a range.
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    # A degenerate range such as a-a is just the character.
    if start.value == end.value:
        return start

    return Range(start.value, end.value)
def parse_set_item(source, info):
    "Parses an item in a character set: escape, POSIX class, nested set or char."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class.
            source.pos = saved_pos

    # Nested sets are only supported in VERSION1 behaviour.
    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    return Character(ord(ch))
def parse_posix_class(source, info):
    "Parses a POSIX character class, e.g. [[:alpha:]]."
    negated = source.match("^")
    prop_name, name = parse_property_name(source)
    if source.match(":]"):
        return lookup_property(prop_name, name, not negated, source,
          posix=True)

    # Not terminated by ":]", so it isn't a POSIX class after all.
    raise ParseError()
def float_to_rational(flt):
    """Converts a float to a rational pair (numerator, denominator).

    Uses a continued-fraction expansion, stopping once the remaining
    fractional part is negligible (< 0.0001).
    """
    int_part = int(flt)
    # Renamed from 'error', which shadowed this module's 'error' exception
    # class within the function body.
    frac_part = flt - int_part
    if abs(frac_part) < 0.0001:
        return int_part, 1

    # Recurse on the reciprocal of the fractional part.
    den, num = float_to_rational(1.0 / frac_part)

    return int_part * den + num, den
def numeric_to_rational(numeric):
    "Converts a numeric string to a rational string, if possible."
    sign = ""
    if numeric[ : 1] == "-":
        sign, numeric = numeric[0], numeric[1 : ]

    parts = numeric.split("/")
    if len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    elif len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    else:
        raise ValueError()

    result = "{}{}/{}".format(sign, num, den)
    # Suppress a denominator of 1.
    if result.endswith("/1"):
        return result[ : -2]

    return result
def standardise_name(name):
    "Standardises a property or value name."
    joined = "".join(name)
    try:
        return numeric_to_rational(joined)
    except (ValueError, ZeroDivisionError):
        # Not numeric: drop underscores, hyphens and spaces, then uppercase.
        return "".join(ch for ch in joined if ch not in "_- ").upper()
# POSIX classes whose names clash with Unicode properties; in POSIX mode they
# are given a "POSIX" prefix before lookup.
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The value names accepted by binary (yes/no) Unicode properties.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    """Looks up a property by name, returning a Property node.

    property: the property name, or None if only a value/name is given.
    value:    the property value (or an unqualified property name).
    positive: whether the test is positive or negated.
    source:   the pattern source, used only for error positions.
    posix:    whether POSIX class names take precedence.
    encoding: the encoding flag attached to the resulting Property.
    Raises error if the property or value is unknown.
    """
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and value.upper() in _POSIX_CLASSES:
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix.
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            # Strip the actual prefix rather than a hard-coded 2 characters,
            # so this stays correct if a prefix of another length is added.
            val_id = value_dict.get(value[len(prefix) : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)
def _compile_replacement(source, pattern, is_unicode):
    """Compiles a replacement template escape sequence.

    Returns (is_group, codes): is_group is True when codes holds a group
    index, False when it holds literal codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        # \u and \U escapes are only valid in unicode replacement strings.
        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group reference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    # Bytes replacements only keep the low byte of an octal escape.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference; put back the last character read.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
def parse_repl_hex_escape(source, expected_len, type):
    "Parses a hex escape sequence in a replacement string."
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, source.pos)
        digits.append(ch)

    return int("".join(digits), 16)
def parse_repl_named_char(source):
    "Parses a named character in a replacement string."
    start = source.pos
    if source.match("{"):
        name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                return ord(unicodedata.lookup(name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character reference.
    source.pos = start
    return None
def compile_repl_group(source, pattern):
    "Compiles a replacement template group reference, returning its index."
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")
    if name.isdigit():
        # A numeric reference; group 0 is the whole match.
        index = int(name)
        if index < 0 or index > pattern.groups:
            raise error("invalid group reference", source.string, source.pos)

        return index

    try:
        return pattern.groupindex[name]
    except KeyError:
        raise IndexError("unknown group")
# The regular expression is parsed into a syntax tree. The different types of
# node are defined below.

# Indentation unit used by the dump() methods.
INDENT = " "
# Bit flags packed into compiled opcodes.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# The encoding value is stored in the bits above the flags.
ENCODING_OP_SHIFT = 5

# Text fragments used by the dump() methods.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
def make_sequence(items):
    "Wraps a list of nodes in a Sequence unless it has exactly one item."
    return items[0] if len(items) == 1 else Sequence(items)
# Common base class for all nodes.
class RegexBase:
    """Base class of all syntax-tree nodes.

    Subclasses override the no-op defaults below; _key identifies a node for
    equality and hashing and is extended by subclasses.
    """

    def __init__(self):
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        # Returns self unchanged, or a rebuilt copy if any flag differs.
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            # Normalise to a canonical combination of case flags.
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        # Default: no group references to fix.
        pass

    def optimise(self, info, reverse):
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # Default: no usable first-set can be computed for this node.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        # Default: no required string; the first item is the offset.
        return self.max_width(), None
# Base class for zero-width nodes.
class ZeroWidthBase(RegexBase):
    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        # positive: whether this is a matching or non-matching assertion.
        self.positive = bool(positive)
        self.encoding = encoding

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # Zero-width: consumes no character.
        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        # The encoding occupies the bits above the flag bits.
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        return 0
class Any(RegexBase):
    """A node matching any character except a newline (".")."""
    # Opcodes keyed by whether matching is reversed.
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        # Always matches exactly one character.
        return 1
class AnyAll(Any):
    """A node matching any character, including newlines (DOTALL ".")."""
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"

    def __init__(self):
        # Set directly instead of via RegexBase.__init__.
        self.positive = True
        self.zerowidth = False
        self.case_flags = 0

        self._key = self.__class__, self.positive
class AnyU(Any):
    # "." under Unicode line-ending rules; only the opcodes differ from Any.
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
class Atomic(RegexBase):
    """A node wrapping a subpattern in an atomic (non-backtracking) group."""

    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An atomic group around nothing is just nothing.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        # The subpattern's code is bracketed by ATOMIC...END.
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)
class Boundary(ZeroWidthBase):
    # A word-boundary assertion (\b); behaviour comes from ZeroWidthBase.
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
2122class Branch(RegexBase):
    def __init__(self, branches):
        # branches: list of alternative subpattern nodes, tried in order.
        RegexBase.__init__(self)
        self.branches = branches
2127 def fix_groups(self, pattern, reverse, fuzzy):
2128 for b in self.branches:
2129 b.fix_groups(pattern, reverse, fuzzy)
    def optimise(self, info, reverse):
        # An empty branch matches nothing.
        if not self.branches:
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        # When matching in reverse, the suffix is what's shared at the start.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            if not prefix or not suffix:
                # We might be able to add a quick precheck before the branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)
    def _add_precheck(self, info, reverse, branches):
        # Collects the first (or last, when reversed) character of every
        # branch so a single membership test can reject non-matches early.
        charset = set()
        pos = -1 if reverse else 0

        for branch in branches:
            # Only caseless literal branches allow a reliable precheck.
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in charset])
2181 def pack_characters(self, info):
2182 self.branches = [b.pack_characters(info) for b in self.branches]
2183 return self
2185 def remove_captures(self):
2186 self.branches = [b.remove_captures() for b in self.branches]
2187 return self
2189 def is_atomic(self):
2190 return all(b.is_atomic() for b in self.branches)
2192 def can_be_affix(self):
2193 return all(b.can_be_affix() for b in self.branches)
2195 def contains_group(self):
2196 return any(b.contains_group() for b in self.branches)
2198 def get_firstset(self, reverse):
2199 fs = set()
2200 for b in self.branches:
2201 fs |= b.get_firstset(reverse)
2203 return fs or set([None])
    def _compile(self, reverse, fuzzy):
        # An empty branch compiles to nothing.
        if not self.branches:
            return []

        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        # The trailing NEXT is replaced by END to close the branch op.
        code[-1] = (OP.END, )

        return code
2218 def dump(self, indent, reverse):
2219 print("{}BRANCH".format(INDENT * indent))
2220 self.branches[0].dump(indent + 1, reverse)
2221 for b in self.branches[1 : ]:
2222 print("{}OR".format(INDENT * indent))
2223 b.dump(indent + 1, reverse)
2225 @staticmethod
2226 def _flatten_branches(info, reverse, branches):
2227 # Flatten the branches so that there aren't branches of branches.
2228 new_branches = []
2229 for b in branches:
2230 b = b.optimise(info, reverse)
2231 if isinstance(b, Branch):
2232 new_branches.extend(b.branches)
2233 else:
2234 new_branches.append(b)
2236 return new_branches
    @staticmethod
    def _split_common_prefix(info, branches):
        # Common leading items can be moved out of the branches.
        # Returns (prefix_items, remaining_branches).
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches
    @staticmethod
    def _split_common_suffix(info, branches):
        """Split the longest common trailing items off the branches.

        Mirror image of _split_common_prefix; returns (suffix_items,
        new_branches), with suffix_items == [] when nothing was split.
        """
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix?  Scan backwards from the end
        # using negative indices.
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
          in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches
    @staticmethod
    def _can_split(items, count):
        """Return whether splitting `items` after the first `count` items
        would not break a run of full-case-folded characters."""
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True
    @staticmethod
    def _can_split_rev(items, count):
        """Return whether splitting `items` `count` items from the end
        would not break a run of full-case-folded characters."""
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True
2373 @staticmethod
2374 def _merge_common_prefixes(info, reverse, branches):
2375 # Branches with the same case-sensitive character prefix can be grouped
2376 # together if they are separated only by other branches with a
2377 # character prefix.
2378 prefixed = defaultdict(list)
2379 order = {}
2380 new_branches = []
2381 for b in branches:
2382 if Branch._is_simple_character(b):
2383 # Branch starts with a simple character.
2384 prefixed[b.value].append([b])
2385 order.setdefault(b.value, len(order))
2386 elif (isinstance(b, Sequence) and b.items and
2387 Branch._is_simple_character(b.items[0])):
2388 # Branch starts with a simple character.
2389 prefixed[b.items[0].value].append(b.items)
2390 order.setdefault(b.items[0].value, len(order))
2391 else:
2392 Branch._flush_char_prefix(info, reverse, prefixed, order,
2393 new_branches)
2395 new_branches.append(b)
2397 Branch._flush_char_prefix(info, prefixed, order, new_branches)
2399 return new_branches
2401 @staticmethod
2402 def _is_simple_character(c):
2403 return isinstance(c, Character) and c.positive and not c.case_flags
    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        """Collapse runs of single-item branches into character sets.

        Adjacent branches that are a Character, Property or set and share
        the same case flags are gathered and flushed as one SetUnion.
        Returns the new list of branches.
        """
        # Can the branches be reduced to a set?
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items, case_flags,
                      new_branches)

                    case_flags = b.case_flags

                # Stored without case flags; they're re-applied on flush.
                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches
    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        """Emit the branches collected in `prefixed` onto `new_branches`.

        Branches sharing a first character become one Sequence of that
        character followed by a Branch of the remainders.  `prefixed` and
        `order` are cleared afterwards.
        """
        # Flush the prefixed branches.
        if not prefixed:
            return

        # Emit groups in first-seen order of their leading character.
        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        # Only one empty alternative is needed.
                        subbranches.append(Sequence())
                        optional = True

                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()
2459 @staticmethod
2460 def _flush_set_members(info, reverse, items, case_flags, new_branches):
2461 # Flush the set members.
2462 if not items:
2463 return
2465 if len(items) == 1:
2466 item = list(items)[0]
2467 else:
2468 item = SetUnion(info, list(items)).optimise(info, reverse)
2470 new_branches.append(item.with_flags(case_flags=case_flags))
2472 items.clear()
    @staticmethod
    def _is_full_case(items, i):
        """Return whether items[i] is a positive Character with full
        case-folding; False when i is out of range."""
        if not 0 <= i < len(items):
            return False

        item = items[i]
        return (isinstance(item, Character) and item.positive and
          (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
    @staticmethod
    def _is_folded(items):
        """Return whether `items` (>= 2 case-insensitive positive
        Characters) together fold to the same string as some single
        character that expands on full case-folding."""
        if len(items) < 2:
            return False

        for i in items:
            if (not isinstance(i, Character) or not i.positive or not
              i.case_flags):
                return False

        folded = "".join(chr(i.value) for i in items)
        folded = _regex.fold_case(FULL_CASE_FOLDING, folded)

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        for c in expanding_chars:
            if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
                return True

        return False
2505 def is_empty(self):
2506 return all(b.is_empty() for b in self.branches)
2508 def __eq__(self, other):
2509 return type(self) is type(other) and self.branches == other.branches
2511 def max_width(self):
2512 return max(b.max_width() for b in self.branches)
class CallGroup(RegexBase):
    """A call to a group, e.g. (?1) or (?&name): matches the group's
    subpattern recursively."""
    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        # Group number or name; resolved to a number in fix_groups.
        self.group = group
        # Position in the pattern, for error reporting.
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference and register this call."""
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        self._key = self.__class__, self.group

    def remove_captures(self):
        # NOTE(review): 'self.pattern' is never assigned on this class —
        # looks like it should be the pattern passed elsewhere; confirm.
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        # self.call_ref is assigned externally once group calls have been
        # resolved.
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class CallRef(RegexBase):
    """Wraps a parsed subpattern with a call reference so it can be the
    target of group calls."""
    def __init__(self, ref, parsed):
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        # CALL_REF, subpattern code, END.
        code = [(OP.CALL_REF, self.ref)]
        code.extend(self.parsed._compile(reverse, fuzzy))
        code.append((OP.END, ))
        return code
class Character(RegexBase):
    """A single code point, possibly negated, case-insensitive and/or
    zero-width."""
    # Opcode selected by (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Pre-compute the full case-folding; it may expand to more than
        # one character (e.g. sharp s -> "ss").
        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy with the given polarity/case/zerowidth settings."""
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding, so also try to
            # match the folded string.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        """Return whether code point `ch` matches this node."""
        return (ch == self.value) == self.positive

    def max_width(self):
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self
class Conditional(RegexBase):
    """Conditional pattern (?(group)yes|no): matches yes_item when the
    group has matched, otherwise no_item."""
    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference (name or number) to a number."""
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        # Either side may start the match.
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        # GROUP_EXISTS, yes-code, [NEXT, no-code], END.
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class DefaultBoundary(ZeroWidthBase):
    """Zero-width assertion compiled to OP.DEFAULT_BOUNDARY."""
    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"
class DefaultEndOfWord(ZeroWidthBase):
    """Zero-width assertion compiled to OP.DEFAULT_END_OF_WORD."""
    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"
class DefaultStartOfWord(ZeroWidthBase):
    """Zero-width assertion compiled to OP.DEFAULT_START_OF_WORD."""
    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"
class EndOfLine(ZeroWidthBase):
    """Zero-width assertion compiled to OP.END_OF_LINE."""
    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"
class EndOfLineU(EndOfLine):
    """EndOfLine variant compiled to OP.END_OF_LINE_U."""
    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"
class EndOfString(ZeroWidthBase):
    """Zero-width assertion compiled to OP.END_OF_STRING."""
    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"
class EndOfStringLine(ZeroWidthBase):
    """Zero-width assertion compiled to OP.END_OF_STRING_LINE."""
    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"
class EndOfStringLineU(EndOfStringLine):
    """EndOfStringLine variant compiled to OP.END_OF_STRING_LINE_U."""
    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"
class EndOfWord(ZeroWidthBase):
    """Zero-width assertion compiled to OP.END_OF_WORD."""
    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"
class Failure(ZeroWidthBase):
    """Zero-width node that always compiles to a FAILURE opcode."""
    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        return [(OP.FAILURE, )]
class Fuzzy(RegexBase):
    """A fuzzy-matched subpattern with constraints on the numbers/costs of
    deletions ('d'), insertions ('i') and substitutions ('s'), plus the
    generic error count ('e')."""
    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is itself fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits: (min, max) for each of d, i, s, e.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        return UNLIMITED

    def _constraints_to_string(self):
        """Render the constraints in pattern syntax for dump()."""
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)
class Grapheme(RegexBase):
    """Matches a single grapheme cluster (\\X)."""
    def _compile(self, reverse, fuzzy):
        # Match at least 1 character until a grapheme boundary is reached. Note
        # that this is the same whether matching forwards or backwards.
        grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
          GraphemeBoundary()]))

        return grapheme_matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print("{}GRAPHEME".format(INDENT * indent))

    def max_width(self):
        return UNLIMITED
class GraphemeBoundary:
    """Zero-width grapheme-cluster boundary assertion."""
    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]
class GreedyRepeat(RegexBase):
    """Greedy repetition: subpattern{min_count,max_count}.

    max_count is None for an unbounded repeat.  Subclasses override the
    opcode for lazy/possessive variants.
    """
    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) preserves the greedy/lazy/possessive subclass.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat can match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        return ([tuple(repeat)] + subpattern + [(OP.END, )])

    def dump(self, indent, reverse):
        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # The repeat is optional, so nothing is required.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None
class PossessiveRepeat(GreedyRepeat):
    """Possessive repetition: a greedy repeat wrapped in an atomic group,
    so it never gives back what it has matched."""
    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        # ATOMIC, repeat, subpattern, END (repeat), END (atomic).
        return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ),
          (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)
class Group(RegexBase):
    """A capture group: (subpattern)."""
    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        # Set externally if this group is the target of a group call.
        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Removing captures discards the group wrapper entirely.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # A negative group number denotes a private (hidden) group.
        public_group = private_group = self.group
        if private_group < 0:
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            # This group is called elsewhere, so wrap it in a CALL_REF.
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class Keep(ZeroWidthBase):
    """Zero-width assertion compiled to OP.KEEP (\\K)."""
    _opcode = OP.KEEP
    _op_name = "KEEP"
class LazyRepeat(GreedyRepeat):
    """Lazy (non-greedy) repetition; same structure as GreedyRepeat but
    compiled with the LAZY_REPEAT opcode."""
    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"
class LookAround(RegexBase):
    """A lookaround assertion: (?=...), (?!...), (?<=...) or (?<!...)."""
    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # A lookbehind scans backwards, regardless of the outer direction.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        if self.positive and subpattern.is_empty():
            # A positive lookaround at an empty pattern always succeeds.
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        # Only useful when the assertion scans in the same direction as
        # the match.
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Zero-width: consumes no characters.
        return 0
class LookAroundConditional(RegexBase):
    """Conditional on a lookaround: (?(?=test)yes|no)."""
    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        # CONDITIONAL, test, NEXT, yes-code, [NEXT, no-code], END.
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # NOTE(review): 'and' binds tighter than 'or' here, so this is
        # (sub and yes) or no — confirm the grouping is intentional.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None
class PrecompiledCode(RegexBase):
    """Wraps an already-built code sequence so it can appear in the parse
    tree; emitted unchanged as a single instruction tuple."""
    def __init__(self, code):
        self.code = code

    def _compile(self, reverse, fuzzy):
        return [tuple(self.code)]
class Property(RegexBase):
    """A Unicode property test, e.g. \\p{Lu}, possibly negated,
    case-insensitive and/or zero-width."""
    # Opcode selected by (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False, encoding=0):
        RegexBase.__init__(self)
        # Encoded property: (property id << 16) | value id (see dump()).
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        self.encoding = encoding

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy with the given polarity/case/zerowidth settings."""
        return Property(self.value, positive, case_flags, zerowidth,
          self.encoding)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
          ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        """Return whether code point `ch` matches this node."""
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1
class Prune(ZeroWidthBase):
    "The (*PRUNE) backtracking-control verb."

    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        # PRUNE takes no operands.
        return [(OP.PRUNE, )]
class Range(RegexBase):
    "A node matching any codepoint in an inclusive range, e.g. from [a-z]."

    # Opcode selected by (case_flags, reversed); simple case-folding uses the
    # _IGN variants, full case-folding is handled in optimise() instead.
    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
      (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
      (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
      (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
    _op_name = "RANGE"

    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.lower = lower
        self.upper = upper
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self._key = (self.__class__, self.lower, self.upper, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Returns a copy of this range with different flags.
        return Range(self.lower, self.upper, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # Expands the range into a branch when full case-folding would make
        # some of its members match multi-codepoint sequences.

        # Is the range case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
          FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the range.
        items = []
        for ch in expanding_chars:
            if self.lower <= ord(ch) <= self.upper:
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                items.append(String([ord(c) for c in folded],
                  case_flags=self.case_flags))

        if not items:
            # We can fall back to simple case-folding.
            return self

        if len(items) < self.upper - self.lower + 1:
            # Not all the characters are covered by the full case-folding.
            items.insert(0, self)

        return Branch(items)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
          self.upper)]

    def dump(self, indent, reverse):
        # lstrip("bu") drops any bytes/unicode prefix from the repr.
        display_lower = ascii(chr(self.lower)).lstrip("bu")
        display_upper = ascii(chr(self.upper)).lstrip("bu")
        print("{}RANGE {} {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display_lower, display_upper,
          CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (self.lower <= ch <= self.upper) == self.positive

    def max_width(self):
        # A range always matches exactly one codepoint.
        return 1
class RefGroup(RegexBase):
    "A back-reference to a capture group, e.g. \\1 or (?P=name)."

    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        self.group = group  # May still be a name; resolved in fix_groups().
        self.position = position  # Position in the pattern, for error reports.
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolves a named group reference to its group number and checks
        that the number is valid.

        Raises error if the group is unknown or out of range.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # The group may have changed from a name to a number, so rebuild the
        # cache key.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # A group reference can't survive capture removal. BUG FIX: this
        # previously passed `self.pattern`, an attribute that RefGroup never
        # sets, so callers got an AttributeError instead of the intended
        # `error`. The pattern text isn't available here, so pass None
        # (error() accepts a None pattern).
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        # The referenced group's width isn't known here, so be pessimistic.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SearchAnchor(ZeroWidthBase):
    "Zero-width assertion at the starting position of the search."

    _op_name = "SEARCH_ANCHOR"
    _opcode = OP.SEARCH_ANCHOR
class Sequence(RegexBase):
    "A sequence of regex nodes matched one after another."

    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        # Consecutive Character/String/Literal members that share case flags
        # are accumulated in `characters` and flushed as one String.
        items = []
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither
                    # the previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # A non-character member ends the current run.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def get_firstset(self, reverse):
        # Accumulates member firstsets until one member can't be empty (its
        # firstset contains no None).
        fs = set()
        items = self.items
        if reverse:
            # NOTE(review): this reverses self.items in place (`items` is not
            # a copy) — presumably harmless because it's only used during
            # compilation, but worth confirming.
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Members are emitted in reverse order when matching backwards.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        # Converts the accumulated codepoints into Character/String nodes,
        # appending them to `items`, and empties the accumulator.
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        # Record every span of the folded string that matches an expanding
        # character's folded form.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Alternate simple-folding literals (between the spans) with
        # full-folding literals (the spans themselves).
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        # Merges overlapping or touching (start, end) spans.
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        # Returns the offset of the first member that reports a required
        # string, together with that string (or None).
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None
class SetBase(RegexBase):
    "Base class for the character-set nodes (union, intersection, etc.)."

    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Returns a re-optimised copy of this set with different flags.
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # The members are emitted between the set opcode and an END marker.
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        # Expands the set into a branch when full case-folding would make
        # some members match multi-codepoint sequences.

        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        # The widest match is the longest folded expansion.
        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SetDiff(SetBase):
    "Set difference: matches the first member but none of the rest."

    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        members = self.items
        if len(members) > 2:
            # Fold all of the subtracted members into a single union so that
            # the difference always has exactly two operands.
            members = [members[0], SetUnion(info, members[1 : ])]

        if len(members) == 1:
            only = members[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth)
            return only.optimise(info, reverse, in_set)

        self.items = tuple(member.optimise(info, reverse, in_set=True)
          for member in members)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = (self.items[0].matches(ch) and
          not self.items[1].matches(ch))
        return m == self.positive
class SetInter(SetBase):
    "Set intersection: matches only when every member matches."

    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        flattened = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetInter) and member.positive:
                # A positive intersection nested in an intersection can be
                # merged into its parent.
                flattened.extend(member.items)
            else:
                flattened.append(member)

        if len(flattened) == 1:
            # A single member stands on its own, inheriting this set's flags.
            return flattened[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = all(member.matches(ch) for member in self.items)
        return m == self.positive
class SetSymDiff(SetBase):
    "Set symmetric difference: matches when an odd number of members match."

    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        flattened = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetSymDiff) and member.positive:
                # A positive symmetric difference nested in another one can
                # be merged into its parent.
                flattened.extend(member.items)
            else:
                flattened.append(member)

        if len(flattened) == 1:
            # A single member stands on its own, inheriting this set's flags.
            return flattened[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR of the members' results: true when an odd number of them match.
        hits = sum(bool(member.matches(ch)) for member in self.items)
        return (hits % 2 == 1) == self.positive
class SetUnion(SetBase):
    "Set union: matches when any of its members matches."

    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union.
                items.extend(m.items)
            elif isinstance(m, AnyAll):
                # A member that matches anything makes the whole union match
                # anything.
                return AnyAll()
            else:
                items.append(m)

        # Are there complementary properties?
        properties = (set(), set())

        for m in items:
            if isinstance(m, Property):
                properties[m.positive].add((m.value, m.case_flags, m.zerowidth))

        if properties[0] & properties[1]:
            # A property and its complement together match anything.
            return AnyAll()

        if len(items) == 1:
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Group single characters by polarity so they can be emitted as one
        # CHARACTER/STRING member instead of many.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive
class Skip(ZeroWidthBase):
    "The (*SKIP) backtracking-control verb."

    _opcode = OP.SKIP
    _op_name = "SKIP"
class StartOfLine(ZeroWidthBase):
    "Zero-width assertion matching at the start of a line."

    _op_name = "START_OF_LINE"
    _opcode = OP.START_OF_LINE
class StartOfLineU(StartOfLine):
    "A StartOfLine variant compiled to the START_OF_LINE_U opcode."

    _op_name = "START_OF_LINE_U"
    _opcode = OP.START_OF_LINE_U
class StartOfString(ZeroWidthBase):
    "Zero-width assertion matching at the start of the string."

    _op_name = "START_OF_STRING"
    _opcode = OP.START_OF_STRING
class StartOfWord(ZeroWidthBase):
    "Zero-width assertion matching at the start of a word."

    _op_name = "START_OF_WORD"
    _opcode = OP.START_OF_WORD
class String(RegexBase):
    "A node matching a literal sequence of codepoints."

    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-compute the fully case-folded form of the string.
            folded_characters = [ord(c) for char in self.characters
              for c in _regex.fold_case(FULL_CASE_FOLDING, chr(char))]
        else:
            folded_characters = self.characters

        self.folded_characters = tuple(folded_characters)
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # The first character depends on the direction of matching.
        pos = -1 if reverse else 0
        return set([Character(self.characters[pos],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
          len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        # The string itself is the required string, at offset 0.
        return 0, self
class Literal(String):
    "A String shown distinctly (as a literal match) when dumping."

    def dump(self, indent, reverse):
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))
class StringSet(Branch):
    """A named list reference expanded into a branch of literal alternatives
    taken from info.kwargs[name].
    """

    def __init__(self, info, name, case_flags=NOCASE):
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        # Register this named list (keyed by name and case flags), assigning
        # it the next available index.
        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        # CLEANUP: removed the unused locals `index`, `encoding`, `fold_flags`
        # and the redundant `case_flags` alias that the previous version
        # computed but never used.
        items = self.info.kwargs[self.name]

        choices = []

        for string in items:
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=self.case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class Source:
    "Scanner for the regular expression source string."

    def __init__(self, string):
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            # Bytes patterns are scanned through a Latin-1 view of the bytes.
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        self.ignore_space = False  # True when whitespace/comments are skipped.
        self.sep = string[ : 0]    # Empty str or bytes, matching the input.

    def peek(self, override_ignore=False):
        # Returns the next significant character without consuming it, or an
        # empty string at the end of the input.
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            return string[pos]
        except IndexError:
            # We've reached the end of the string.
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            return string[ : 0]

    def get(self, override_ignore=False):
        # Returns the next significant character and consumes it, or an empty
        # string at the end of the input.
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        # Returns up to `count` significant characters, consuming them.
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True, keep_spaces=False):
        # Consumes and returns the run of characters whose membership of
        # `test_set` equals `include`.
        string = self.string
        pos = self.pos

        if self.ignore_space and not keep_spaces:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        # Like get_while, but discards the characters instead of returning
        # them.
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        # Consumes `substring` if it appears next, returning whether it did.
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                # Compare character by character, skipping whitespace and
                # comments before each one.
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        # Consumes `substring`, raising error if it isn't next.
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        # Returns whether only whitespace/comments (if ignored) remain.
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        pos += 1
                    elif string[pos] == "#":
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True
class Info:
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs={}):
        # NOTE(review): `kwargs` has a mutable default; it's only stored and
        # read here, never mutated — presumably safe, but worth confirming.
        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = kwargs

        self.group_count = 0        # Number of capture groups so far.
        self.group_index = {}       # Group name -> group number.
        self.group_name = {}        # Group number -> group name.
        self.char_type = char_type
        self.named_lists_used = {}  # (name, case_flags) -> list index.
        self.open_groups = []       # Stack of currently open groups.
        self.open_group_count = {}  # Group number -> times opened.
        self.defined_groups = {}
        self.group_calls = []       # (call, reverse, fuzzy) tuples.
        self.private_groups = {}    # Negative alias -> real group number.

    def open_group(self, name=None):
        # Opens a (possibly named) group and returns its group number, or a
        # private negative alias for a nested reuse of the same group.
        group = self.group_index.get(name)
        if group is None:
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        self.open_groups.pop()

    def is_open_group(self, name):
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups
def _check_group_features(info, parsed):
    """Checks whether the reverse and fuzzy features of the group calls match
    the groups which they call.

    Sets info.call_refs and info.additional_groups as side effects.
    """
    call_refs = {}
    additional_groups = []
    for call, reverse, fuzzy in info.group_calls:
        # Look up the reference of this group call.
        key = (call.group, reverse, fuzzy)
        ref = call_refs.get(key)
        if ref is None:
            # This group doesn't have a reference yet, so look up its features.
            if call.group == 0:
                # Calling the pattern as a whole.
                rev = bool(info.flags & REVERSE)
                fuz = isinstance(parsed, Fuzzy)
                if (rev, fuz) != (reverse, fuzzy):
                    # The pattern as a whole doesn't have the features we want,
                    # so we'll need to make a copy of it with the desired
                    # features.
                    additional_groups.append((CallRef(len(call_refs), parsed),
                      reverse, fuzzy))
            else:
                # Calling a capture group.
                def_info = info.defined_groups[call.group]
                group = def_info[0]
                if def_info[1 : ] != (reverse, fuzzy):
                    # The group doesn't have the features we want, so we'll
                    # need to make a copy of it with the desired features.
                    additional_groups.append((group, reverse, fuzzy))

            ref = len(call_refs)
            call_refs[key] = ref

        call.call_ref = ref

    info.call_refs = call_refs
    info.additional_groups = additional_groups
def _get_required_string(parsed, flags):
    """Returns (offset, characters, case_flags) describing the required
    string of a parsed pattern, or (0, (), 0) if there isn't one.
    """
    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
    if not required:
        return 0, (), 0

    # Mark the node so that compilation flags it as required.
    required.required = True
    if req_offset >= UNLIMITED:
        req_offset = -1

    req_flags = required.case_flags
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags
class Scanner:
    """A simple lexer built on the regex engine.

    The lexicon is a sequence of (phrase, action) pairs; all phrases are
    compiled into a single alternation, each wrapped in its own group so
    that the matching phrase can be identified.
    """

    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Parse every phrase, stripping capture groups (they aren't
        # allowed within individual phrases).
        subpatterns = []
        for phrase, _action in lexicon:
            src = Source(phrase)
            phrase_info = Info(flags, src.char_type)
            src.ignore_space = bool(phrase_info.flags & VERBOSE)
            parsed = _parse_pattern(src, phrase_info)
            if not src.at_end():
                raise error("unbalanced parenthesis", src.string,
                  src.pos)

            subpatterns.append(parsed.remove_captures())

        # Wrap each phrase in its own numbered group and branch them
        # together into one compound pattern.
        info = Info(flags)
        groups = [Group(info, number, sub)
          for number, sub in enumerate(subpatterns, 1)]
        parsed = Branch(groups)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse).pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Group calls (recursion) aren't supported by the Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              src.string, src.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile to a list of tuples, then flatten into a list of ints.
        code = _flatten_code(parsed.compile(reverse) + [(OP.SUCCESS, )])

        if not parsed.has_simple_start():
            # Prefix the code with the first set, when one can be built.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                code = _flatten_code(fs_code) + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation but aren't
        # needed by the PatternObject itself; conversely, global flags like
        # LOCALE don't affect the code generation but are needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(groups))

    def scan(self, string):
        """Scans the string, returning (tokens, remainder) where remainder is
        the unmatched tail of the string."""
        tokens = []
        match = self.scanner.scanner(string).match
        pos = 0
        while True:
            m = match()
            if not m:
                break
            end = m.end()
            if pos == end:
                # Zero-width match: no progress was made, so stop.
                break
            action = self.lexicon[m.lastindex - 1][1]
            if callable(action):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                tokens.append(action)
            pos = end

        return tokens, string[pos : ]
# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict, keeping the longest name seen
# for each property id and each value id (ties keep the earlier name).
PROPERTY_NAMES = {}
for full_name, (ident, value_map) in PROPERTIES.items():
    best_name, inverse_values = PROPERTY_NAMES.get(ident, ("", {}))
    PROPERTY_NAMES[ident] = max(best_name, full_name, key=len), inverse_values

    for value_name, value_id in value_map.items():
        inverse_values[value_id] = max(inverse_values.get(value_id, ""),
          value_name, key=len)
# Character escape sequences: each escape letter mapped to the control
# character it denotes.
CHARACTER_ESCAPES = dict(zip("abfnrtv", "\a\b\f\n\r\t\v"))
# Encoding selectors used by the escape/assertion tables below.
ASCII_ENCODING, UNICODE_ENCODING = 1, 2
# Predefined character set escape sequences.
CHARSET_ESCAPES = {
    letter: lookup_property(None, prop, positive)
    for letter, prop, positive in (
        ("d", "Digit", True),
        ("D", "Digit", False),
        ("h", "Blank", True),
        ("s", "Space", True),
        ("S", "Space", False),
        ("w", "Word", True),
        ("W", "Word", False),
    )
}

# The same sets with an explicit ASCII encoding ("h" has no encoded form).
ASCII_CHARSET_ESCAPES = dict(CHARSET_ESCAPES, **{
    letter: lookup_property(None, prop, positive, encoding=ASCII_ENCODING)
    for letter, prop, positive in (
        ("d", "Digit", True),
        ("D", "Digit", False),
        ("s", "Space", True),
        ("S", "Space", False),
        ("w", "Word", True),
        ("W", "Word", False),
    )
})

# The same sets with an explicit Unicode encoding.
UNICODE_CHARSET_ESCAPES = dict(CHARSET_ESCAPES, **{
    letter: lookup_property(None, prop, positive, encoding=UNICODE_ENCODING)
    for letter, prop, positive in (
        ("d", "Digit", True),
        ("D", "Digit", False),
        ("s", "Space", True),
        ("S", "Space", False),
        ("w", "Word", True),
        ("W", "Word", False),
    )
})
# Positional escape sequences.
POSITION_ESCAPES = dict(
    A=StartOfString(),
    b=Boundary(),
    B=Boundary(False),
    K=Keep(),
    m=StartOfWord(),
    M=EndOfWord(),
    Z=EndOfString(),
)

# ASCII-encoded versions of the word-related assertions.
ASCII_POSITION_ESCAPES = dict(POSITION_ESCAPES,
    b=Boundary(encoding=ASCII_ENCODING),
    B=Boundary(False, encoding=ASCII_ENCODING),
    m=StartOfWord(encoding=ASCII_ENCODING),
    M=EndOfWord(encoding=ASCII_ENCODING),
)

# Unicode-encoded versions of the word-related assertions.
UNICODE_POSITION_ESCAPES = dict(POSITION_ESCAPES,
    b=Boundary(encoding=UNICODE_ENCODING),
    B=Boundary(False, encoding=UNICODE_ENCODING),
    m=StartOfWord(encoding=UNICODE_ENCODING),
    M=EndOfWord(encoding=UNICODE_ENCODING),
)

# Positional escape sequences used when the WORD flag is set.
WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES,
    b=DefaultBoundary(),
    B=DefaultBoundary(False),
    m=DefaultStartOfWord(),
    M=DefaultEndOfWord(),
)
# Regex control verbs ("FAIL" and "F" are synonyms; each entry gets its
# own instance).
VERBS = dict(
    FAIL=Failure(),
    F=Failure(),
    PRUNE=Prune(),
    SKIP=Skip(),
)