Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/regex/_regex_core.py: 78%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#
2# Secret Labs' Regular Expression Engine core module
3#
4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
5#
6# This version of the SRE library can be redistributed under CNRI's
7# Python 1.6 license. For any other use, please contact Secret Labs
8# AB (info@pythonware.com).
9#
10# Portions of this engine have been developed in cooperation with
11# CNRI. Hewlett-Packard provided funding for 1.6 integration and
12# other compatibility work.
13#
14# 2010-01-16 mrab Python front-end re-written and extended
16import enum
17import string
18import unicodedata
19from collections import defaultdict
21import regex._regex as _regex
23__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
24 "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
25 "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
26 "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
27 "Scanner", "RegexFlag"]
29# The regex exception.
30class error(Exception):
31 """Exception raised for invalid regular expressions.
33 Attributes:
35 msg: The unformatted error message
36 pattern: The regular expression pattern
37 pos: The position in the pattern where compilation failed, or None
38 lineno: The line number where compilation failed, unless pos is None
39 colno: The column number where compilation failed, unless pos is None
40 """
42 def __init__(self, message, pattern=None, pos=None):
43 newline = '\n' if isinstance(pattern, str) else b'\n'
44 self.msg = message
45 self.pattern = pattern
46 self.pos = pos
47 if pattern is not None and pos is not None:
48 self.lineno = pattern.count(newline, 0, pos) + 1
49 self.colno = pos - pattern.rfind(newline, 0, pos)
51 message = "{} at position {}".format(message, pos)
53 if newline in pattern:
54 message += " (line {}, column {})".format(self.lineno,
55 self.colno)
57 Exception.__init__(self, message)
59# The exception for when a positional flag has been turned on in the old
60# behaviour.
61class _UnscopedFlagSet(Exception):
62 pass
64# The exception for when parsing fails and we want to try something else.
65class ParseError(Exception):
66 pass
68# The exception for when there isn't a valid first set.
69class _FirstSetError(Exception):
70 pass
72# Flags.
73class RegexFlag(enum.IntFlag):
74 A = ASCII = 0x80 # Assume ASCII locale.
75 B = BESTMATCH = 0x1000 # Best fuzzy match.
76 D = DEBUG = 0x200 # Print parsed pattern.
77 E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first
78 # fuzzy match.
79 F = FULLCASE = 0x4000 # Unicode full case-folding.
80 I = IGNORECASE = 0x2 # Ignore case.
81 L = LOCALE = 0x4 # Assume current 8-bit locale.
82 M = MULTILINE = 0x8 # Make anchors look for newline.
83 P = POSIX = 0x10000 # POSIX-style matching (leftmost longest).
84 R = REVERSE = 0x400 # Search backwards.
85 S = DOTALL = 0x10 # Make dot match newline.
86 U = UNICODE = 0x20 # Assume Unicode locale.
87 V0 = VERSION0 = 0x2000 # Old legacy behaviour.
88 V1 = VERSION1 = 0x100 # New enhanced behaviour.
89 W = WORD = 0x800 # Default Unicode word breaks.
90 X = VERBOSE = 0x40 # Ignore whitespace and comments.
91 T = TEMPLATE = 0x1 # Template (present because re module has it).
93 def __repr__(self):
94 if self._name_ is not None:
95 return 'regex.%s' % self._name_
97 value = self._value_
98 members = []
99 negative = value < 0
101 if negative:
102 value = ~value
104 for m in self.__class__:
105 if value & m._value_:
106 value &= ~m._value_
107 members.append('regex.%s' % m._name_)
109 if value:
110 members.append(hex(value))
112 res = '|'.join(members)
114 if negative:
115 if len(members) > 1:
116 res = '~(%s)' % res
117 else:
118 res = '~%s' % res
120 return res
122 __str__ = object.__str__
124globals().update(RegexFlag.__members__)
126DEFAULT_VERSION = VERSION1
128_ALL_VERSIONS = VERSION0 | VERSION1
129_ALL_ENCODINGS = ASCII | LOCALE | UNICODE
131# The default flags for the various versions.
132DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}
134# The mask for the flags.
135GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
136 REVERSE)
137SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
138 _ALL_ENCODINGS)
140ALPHA = frozenset(string.ascii_letters)
141DIGITS = frozenset(string.digits)
142ALNUM = ALPHA | DIGITS
143OCT_DIGITS = frozenset(string.octdigits)
144HEX_DIGITS = frozenset(string.hexdigits)
145SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
146NAMED_CHAR_PART = ALNUM | frozenset(" -")
147PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
148SET_OPS = ("||", "~~", "&&", "--")
150# The width of the code words inside the regex engine.
151BYTES_PER_CODE = _regex.get_code_size()
152BITS_PER_CODE = BYTES_PER_CODE * 8
154# The repeat count which represents infinity.
155UNLIMITED = (1 << BITS_PER_CODE) - 1
157# The regular expression flags.
158REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
159 "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
160 "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
161 VERBOSE}
163# The case flags.
164CASE_FLAGS = FULLCASE | IGNORECASE
165NOCASE = 0
166FULLIGNORECASE = FULLCASE | IGNORECASE
168FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE
170CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
171 FULLIGNORECASE: FULLIGNORECASE}
173# The number of digits in hexadecimal escapes.
174HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
176# The names of the opcodes.
177OPCODES = """
178FAILURE
179SUCCESS
180ANY
181ANY_ALL
182ANY_ALL_REV
183ANY_REV
184ANY_U
185ANY_U_REV
186ATOMIC
187BOUNDARY
188BRANCH
189CALL_REF
190CHARACTER
191CHARACTER_IGN
192CHARACTER_IGN_REV
193CHARACTER_REV
194CONDITIONAL
195DEFAULT_BOUNDARY
196DEFAULT_END_OF_WORD
197DEFAULT_START_OF_WORD
198END
199END_OF_LINE
200END_OF_LINE_U
201END_OF_STRING
202END_OF_STRING_LINE
203END_OF_STRING_LINE_U
204END_OF_WORD
205FUZZY
206GRAPHEME_BOUNDARY
207GREEDY_REPEAT
208GROUP
209GROUP_CALL
210GROUP_EXISTS
211KEEP
212LAZY_REPEAT
213LOOKAROUND
214NEXT
215PROPERTY
216PROPERTY_IGN
217PROPERTY_IGN_REV
218PROPERTY_REV
219PRUNE
220RANGE
221RANGE_IGN
222RANGE_IGN_REV
223RANGE_REV
224REF_GROUP
225REF_GROUP_FLD
226REF_GROUP_FLD_REV
227REF_GROUP_IGN
228REF_GROUP_IGN_REV
229REF_GROUP_REV
230SEARCH_ANCHOR
231SET_DIFF
232SET_DIFF_IGN
233SET_DIFF_IGN_REV
234SET_DIFF_REV
235SET_INTER
236SET_INTER_IGN
237SET_INTER_IGN_REV
238SET_INTER_REV
239SET_SYM_DIFF
240SET_SYM_DIFF_IGN
241SET_SYM_DIFF_IGN_REV
242SET_SYM_DIFF_REV
243SET_UNION
244SET_UNION_IGN
245SET_UNION_IGN_REV
246SET_UNION_REV
247SKIP
248START_OF_LINE
249START_OF_LINE_U
250START_OF_STRING
251START_OF_WORD
252STRING
253STRING_FLD
254STRING_FLD_REV
255STRING_IGN
256STRING_IGN_REV
257STRING_REV
258FUZZY_EXT
259"""
261# Define the opcodes in a namespace.
262class Namespace:
263 pass
265OP = Namespace()
266for i, op in enumerate(OPCODES.split()):
267 setattr(OP, op, i)
269def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
270 """Make room in the given cache.
272 Args:
273 cache_dict: The cache dictionary to modify.
274 args_dict: The dictionary of named list args used by patterns.
275 max_length: Maximum # of entries in cache_dict before it is shrunk.
276 divisor: Cache will shrink to max_length - 1/divisor*max_length items.
277 """
278 # Toss out a fraction of the entries at random to make room for new ones.
279 # A random algorithm was chosen as opposed to simply cache_dict.popitem()
280 # as popitem could penalize the same regular expression repeatedly based
281 # on its internal hash value. Being random should spread the cache miss
282 # love around.
283 cache_keys = tuple(cache_dict.keys())
284 overage = len(cache_keys) - max_length
285 if overage < 0:
286 # Cache is already within limits. Normally this should not happen
287 # but it could due to multithreading.
288 return
290 number_to_toss = max_length // divisor + overage
292 # The import is done here to avoid a circular dependency.
293 import random
294 if not hasattr(random, 'sample'):
295 # Do nothing while resolving the circular dependency:
296 # re->random->warnings->tokenize->string->re
297 return
299 for doomed_key in random.sample(cache_keys, number_to_toss):
300 try:
301 del cache_dict[doomed_key]
302 except KeyError:
303 # Ignore problems if the cache changed from another thread.
304 pass
306 # Rebuild the arguments and locale-sensitivity dictionaries.
307 args_dict.clear()
308 sensitivity_dict = {}
309 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
310 args_dict[pattern, pattern_type, flags, default_version, locale] = args
311 try:
312 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
313 except KeyError:
314 pass
316 locale_sensitive.clear()
317 locale_sensitive.update(sensitivity_dict)
319def _fold_case(info, string):
320 "Folds the case of a string."
321 flags = info.flags
322 if (flags & _ALL_ENCODINGS) == 0:
323 flags |= info.guess_encoding
325 return _regex.fold_case(flags, string)
327def is_cased_i(info, char):
328 "Checks whether a character is cased."
329 return len(_regex.get_all_cases(info.flags, char)) > 1
331def is_cased_f(flags, char):
332 "Checks whether a character is cased."
333 return len(_regex.get_all_cases(flags, char)) > 1
335def _compile_firstset(info, fs):
336 "Compiles the firstset for the pattern."
337 reverse = bool(info.flags & REVERSE)
338 fs = _check_firstset(info, reverse, fs)
339 if not fs or isinstance(fs, AnyAll):
340 return []
342 # Compile the firstset.
343 return fs.compile(reverse)
345def _check_firstset(info, reverse, fs):
346 "Checks the firstset for the pattern."
347 if not fs or None in fs:
348 return None
350 # If we ignore the case, for simplicity we won't build a firstset.
351 members = set()
352 case_flags = NOCASE
353 for i in fs:
354 if isinstance(i, Character) and not i.positive:
355 return None
357# if i.case_flags:
358# if isinstance(i, Character):
359# if is_cased_i(info, i.value):
360# return []
361# elif isinstance(i, SetBase):
362# return []
363 case_flags |= i.case_flags
364 members.add(i.with_flags(case_flags=NOCASE))
366 if case_flags == (FULLCASE | IGNORECASE):
367 return None
369 # Build the firstset.
370 fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
371 zerowidth=True)
372 fs = fs.optimise(info, reverse, in_set=True)
374 return fs
376def _flatten_code(code):
377 "Flattens the code from a list of tuples."
378 flat_code = []
379 for c in code:
380 flat_code.extend(c)
382 return flat_code
384def make_case_flags(info):
385 "Makes the case flags."
386 flags = info.flags & CASE_FLAGS
388 # Turn off FULLCASE if ASCII is turned on.
389 if info.flags & ASCII:
390 flags &= ~FULLCASE
392 return flags
394def make_character(info, value, in_set=False):
395 "Makes a character literal."
396 if in_set:
397 # A character set is built case-sensitively.
398 return Character(value)
400 return Character(value, case_flags=make_case_flags(info))
402def make_ref_group(info, name, position):
403 "Makes a group reference."
404 return RefGroup(info, name, position, case_flags=make_case_flags(info))
406def make_string_set(info, name):
407 "Makes a string set."
408 return StringSet(info, name, case_flags=make_case_flags(info))
410def make_property(info, prop, in_set):
411 "Makes a property."
412 if in_set:
413 return prop
415 return prop.with_flags(case_flags=make_case_flags(info))
417def _parse_pattern(source, info):
418 "Parses a pattern, eg. 'a|b|c'."
419 branches = [parse_sequence(source, info)]
420 while source.match("|"):
421 branches.append(parse_sequence(source, info))
423 if len(branches) == 1:
424 return branches[0]
425 return Branch(branches)
427def parse_sequence(source, info):
428 "Parses a sequence, eg. 'abc'."
429 sequence = [None]
430 case_flags = make_case_flags(info)
431 while True:
432 saved_pos = source.pos
433 ch = source.get()
434 if ch in SPECIAL_CHARS:
435 if ch in ")|":
436 # The end of a sequence. At the end of the pattern ch is "".
437 source.pos = saved_pos
438 break
439 elif ch == "\\":
440 # An escape sequence outside a set.
441 sequence.append(parse_escape(source, info, False))
442 elif ch == "(":
443 # A parenthesised subpattern or a flag.
444 element = parse_paren(source, info)
445 if element is None:
446 case_flags = make_case_flags(info)
447 else:
448 sequence.append(element)
449 elif ch == ".":
450 # Any character.
451 if info.flags & DOTALL:
452 sequence.append(AnyAll())
453 elif info.flags & WORD:
454 sequence.append(AnyU())
455 else:
456 sequence.append(Any())
457 elif ch == "[":
458 # A character set.
459 sequence.append(parse_set(source, info))
460 elif ch == "^":
461 # The start of a line or the string.
462 if info.flags & MULTILINE:
463 if info.flags & WORD:
464 sequence.append(StartOfLineU())
465 else:
466 sequence.append(StartOfLine())
467 else:
468 sequence.append(StartOfString())
469 elif ch == "$":
470 # The end of a line or the string.
471 if info.flags & MULTILINE:
472 if info.flags & WORD:
473 sequence.append(EndOfLineU())
474 else:
475 sequence.append(EndOfLine())
476 else:
477 if info.flags & WORD:
478 sequence.append(EndOfStringLineU())
479 else:
480 sequence.append(EndOfStringLine())
481 elif ch in "?*+{":
482 # Looks like a quantifier.
483 counts = parse_quantifier(source, info, ch)
484 if counts:
485 # It _is_ a quantifier.
486 apply_quantifier(source, info, counts, case_flags, ch,
487 saved_pos, sequence)
488 sequence.append(None)
489 else:
490 # It's not a quantifier. Maybe it's a fuzzy constraint.
491 constraints = parse_fuzzy(source, info, ch, case_flags)
492 if constraints:
493 # It _is_ a fuzzy constraint.
494 apply_constraint(source, info, constraints, case_flags,
495 saved_pos, sequence)
496 sequence.append(None)
497 else:
498 # The element was just a literal.
499 sequence.append(Character(ord(ch),
500 case_flags=case_flags))
501 else:
502 # A literal.
503 sequence.append(Character(ord(ch), case_flags=case_flags))
504 else:
505 # A literal.
506 sequence.append(Character(ord(ch), case_flags=case_flags))
508 sequence = [item for item in sequence if item is not None]
509 return Sequence(sequence)
511def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
512 sequence):
513 element = sequence.pop()
514 if element is None:
515 if sequence:
516 raise error("multiple repeat", source.string, saved_pos)
517 raise error("nothing to repeat", source.string, saved_pos)
519 if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
520 raise error("multiple repeat", source.string, saved_pos)
522 min_count, max_count = counts
523 saved_pos = source.pos
524 ch = source.get()
525 if ch == "?":
526 # The "?" suffix that means it's a lazy repeat.
527 repeated = LazyRepeat
528 elif ch == "+":
529 # The "+" suffix that means it's a possessive repeat.
530 repeated = PossessiveRepeat
531 else:
532 # No suffix means that it's a greedy repeat.
533 source.pos = saved_pos
534 repeated = GreedyRepeat
536 # Ignore the quantifier if it applies to a zero-width item or the number of
537 # repeats is fixed at 1.
538 if not element.is_empty() and (min_count != 1 or max_count != 1):
539 element = repeated(element, min_count, max_count)
541 sequence.append(element)
543def apply_constraint(source, info, constraints, case_flags, saved_pos,
544 sequence):
545 element = sequence.pop()
546 if element is None:
547 raise error("nothing for fuzzy constraint", source.string, saved_pos)
549 # If a group is marked as fuzzy then put all of the fuzzy part in the
550 # group.
551 if isinstance(element, Group):
552 element.subpattern = Fuzzy(element.subpattern, constraints)
553 sequence.append(element)
554 else:
555 sequence.append(Fuzzy(element, constraints))
557_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
559def parse_quantifier(source, info, ch):
560 "Parses a quantifier."
561 q = _QUANTIFIERS.get(ch)
562 if q:
563 # It's a quantifier.
564 return q
566 if ch == "{":
567 # Looks like a limited repeated element, eg. 'a{2,3}'.
568 counts = parse_limited_quantifier(source)
569 if counts:
570 return counts
572 return None
574def is_above_limit(count):
575 "Checks whether a count is above the maximum."
576 return count is not None and count >= UNLIMITED
578def parse_limited_quantifier(source):
579 "Parses a limited quantifier."
580 saved_pos = source.pos
581 min_count = parse_count(source)
582 if source.match(","):
583 max_count = parse_count(source)
585 # No minimum means 0 and no maximum means unlimited.
586 min_count = int(min_count or 0)
587 max_count = int(max_count) if max_count else None
588 else:
589 if not min_count:
590 source.pos = saved_pos
591 return None
593 min_count = max_count = int(min_count)
595 if not source.match ("}"):
596 source.pos = saved_pos
597 return None
599 if is_above_limit(min_count) or is_above_limit(max_count):
600 raise error("repeat count too big", source.string, saved_pos)
602 if max_count is not None and min_count > max_count:
603 raise error("min repeat greater than max repeat", source.string,
604 saved_pos)
606 return min_count, max_count
608def parse_fuzzy(source, info, ch, case_flags):
609 "Parses a fuzzy setting, if present."
610 saved_pos = source.pos
612 if ch != "{":
613 return None
615 constraints = {}
616 try:
617 parse_fuzzy_item(source, constraints)
618 while source.match(","):
619 parse_fuzzy_item(source, constraints)
620 except ParseError:
621 source.pos = saved_pos
622 return None
624 if source.match(":"):
625 constraints["test"] = parse_fuzzy_test(source, info, case_flags)
627 if not source.match("}"):
628 raise error("expected }", source.string, source.pos)
630 return constraints
632def parse_fuzzy_item(source, constraints):
633 "Parses a fuzzy setting item."
634 saved_pos = source.pos
635 try:
636 parse_cost_constraint(source, constraints)
637 except ParseError:
638 source.pos = saved_pos
640 parse_cost_equation(source, constraints)
642def parse_cost_constraint(source, constraints):
643 "Parses a cost constraint."
644 saved_pos = source.pos
645 ch = source.get()
646 if ch in ALPHA:
647 # Syntax: constraint [("<=" | "<") cost]
648 constraint = parse_constraint(source, constraints, ch)
650 max_inc = parse_fuzzy_compare(source)
652 if max_inc is None:
653 # No maximum cost.
654 constraints[constraint] = 0, None
655 else:
656 # There's a maximum cost.
657 cost_pos = source.pos
658 max_cost = parse_cost_limit(source)
660 # Inclusive or exclusive limit?
661 if not max_inc:
662 max_cost -= 1
664 if max_cost < 0:
665 raise error("bad fuzzy cost limit", source.string, cost_pos)
667 constraints[constraint] = 0, max_cost
668 elif ch in DIGITS:
669 # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
670 source.pos = saved_pos
672 # Minimum cost.
673 cost_pos = source.pos
674 min_cost = parse_cost_limit(source)
676 min_inc = parse_fuzzy_compare(source)
677 if min_inc is None:
678 raise ParseError()
680 constraint = parse_constraint(source, constraints, source.get())
682 max_inc = parse_fuzzy_compare(source)
683 if max_inc is None:
684 raise ParseError()
686 # Maximum cost.
687 cost_pos = source.pos
688 max_cost = parse_cost_limit(source)
690 # Inclusive or exclusive limits?
691 if not min_inc:
692 min_cost += 1
693 if not max_inc:
694 max_cost -= 1
696 if not 0 <= min_cost <= max_cost:
697 raise error("bad fuzzy cost limit", source.string, cost_pos)
699 constraints[constraint] = min_cost, max_cost
700 else:
701 raise ParseError()
703def parse_cost_limit(source):
704 "Parses a cost limit."
705 cost_pos = source.pos
706 digits = parse_count(source)
708 try:
709 return int(digits)
710 except ValueError:
711 pass
713 raise error("bad fuzzy cost limit", source.string, cost_pos)
715def parse_constraint(source, constraints, ch):
716 "Parses a constraint."
717 if ch not in "deis":
718 raise ParseError()
720 if ch in constraints:
721 raise ParseError()
723 return ch
725def parse_fuzzy_compare(source):
726 "Parses a cost comparator."
727 if source.match("<="):
728 return True
729 elif source.match("<"):
730 return False
731 else:
732 return None
734def parse_cost_equation(source, constraints):
735 "Parses a cost equation."
736 if "cost" in constraints:
737 raise error("more than one cost equation", source.string, source.pos)
739 cost = {}
741 parse_cost_term(source, cost)
742 while source.match("+"):
743 parse_cost_term(source, cost)
745 max_inc = parse_fuzzy_compare(source)
746 if max_inc is None:
747 raise ParseError()
749 max_cost = int(parse_count(source))
751 if not max_inc:
752 max_cost -= 1
754 if max_cost < 0:
755 raise error("bad fuzzy cost limit", source.string, source.pos)
757 cost["max"] = max_cost
759 constraints["cost"] = cost
761def parse_cost_term(source, cost):
762 "Parses a cost equation term."
763 coeff = parse_count(source)
764 ch = source.get()
765 if ch not in "dis":
766 raise ParseError()
768 if ch in cost:
769 raise error("repeated fuzzy cost", source.string, source.pos)
771 cost[ch] = int(coeff or 1)
773def parse_fuzzy_test(source, info, case_flags):
774 saved_pos = source.pos
775 ch = source.get()
776 if ch in SPECIAL_CHARS:
777 if ch == "\\":
778 # An escape sequence outside a set.
779 return parse_escape(source, info, False)
780 elif ch == ".":
781 # Any character.
782 if info.flags & DOTALL:
783 return AnyAll()
784 elif info.flags & WORD:
785 return AnyU()
786 else:
787 return Any()
788 elif ch == "[":
789 # A character set.
790 return parse_set(source, info)
791 else:
792 raise error("expected character set", source.string, saved_pos)
793 elif ch:
794 # A literal.
795 return Character(ord(ch), case_flags=case_flags)
796 else:
797 raise error("expected character set", source.string, saved_pos)
799def parse_count(source):
800 "Parses a quantifier's count, which can be empty."
801 return source.get_while(DIGITS)
803def parse_paren(source, info):
804 """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
805 inline flag.
806 """
807 saved_pos = source.pos
808 ch = source.get(True)
809 if ch == "?":
810 # (?...
811 saved_pos_2 = source.pos
812 ch = source.get(True)
813 if ch == "<":
814 # (?<...
815 saved_pos_3 = source.pos
816 ch = source.get()
817 if ch in ("=", "!"):
818 # (?<=... or (?<!...: lookbehind.
819 return parse_lookaround(source, info, True, ch == "=")
821 # (?<...: a named capture group.
822 source.pos = saved_pos_3
823 name = parse_name(source)
824 group = info.open_group(name)
825 source.expect(">")
826 saved_flags = info.flags
827 try:
828 subpattern = _parse_pattern(source, info)
829 source.expect(")")
830 finally:
831 info.flags = saved_flags
832 source.ignore_space = bool(info.flags & VERBOSE)
834 info.close_group()
835 return Group(info, group, subpattern)
836 if ch in ("=", "!"):
837 # (?=... or (?!...: lookahead.
838 return parse_lookaround(source, info, False, ch == "=")
839 if ch == "P":
840 # (?P...: a Python extension.
841 return parse_extension(source, info)
842 if ch == "#":
843 # (?#...: a comment.
844 return parse_comment(source)
845 if ch == "(":
846 # (?(...: a conditional subpattern.
847 return parse_conditional(source, info)
848 if ch == ">":
849 # (?>...: an atomic subpattern.
850 return parse_atomic(source, info)
851 if ch == "|":
852 # (?|...: a common/reset groups branch.
853 return parse_common(source, info)
854 if ch == "R" or "0" <= ch <= "9":
855 # (?R...: probably a call to a group.
856 return parse_call_group(source, info, ch, saved_pos_2)
857 if ch == "&":
858 # (?&...: a call to a named group.
859 return parse_call_named_group(source, info, saved_pos_2)
861 # (?...: probably a flags subpattern.
862 source.pos = saved_pos_2
863 return parse_flags_subpattern(source, info)
865 if ch == "*":
866 # (*...
867 saved_pos_2 = source.pos
868 word = source.get_while(set(")>"), include=False)
869 if word[ : 1].isalpha():
870 verb = VERBS.get(word)
871 if not verb:
872 raise error("unknown verb", source.string, saved_pos_2)
874 source.expect(")")
876 return verb
878 # (...: an unnamed capture group.
879 source.pos = saved_pos
880 group = info.open_group()
881 saved_flags = info.flags
882 try:
883 subpattern = _parse_pattern(source, info)
884 source.expect(")")
885 finally:
886 info.flags = saved_flags
887 source.ignore_space = bool(info.flags & VERBOSE)
889 info.close_group()
891 return Group(info, group, subpattern)
893def parse_extension(source, info):
894 "Parses a Python extension."
895 saved_pos = source.pos
896 ch = source.get()
897 if ch == "<":
898 # (?P<...: a named capture group.
899 name = parse_name(source)
900 group = info.open_group(name)
901 source.expect(">")
902 saved_flags = info.flags
903 try:
904 subpattern = _parse_pattern(source, info)
905 source.expect(")")
906 finally:
907 info.flags = saved_flags
908 source.ignore_space = bool(info.flags & VERBOSE)
910 info.close_group()
912 return Group(info, group, subpattern)
913 if ch == "=":
914 # (?P=...: a named group reference.
915 name = parse_name(source, allow_numeric=True)
916 source.expect(")")
917 if info.is_open_group(name):
918 raise error("cannot refer to an open group", source.string,
919 saved_pos)
921 return make_ref_group(info, name, saved_pos)
922 if ch == ">" or ch == "&":
923 # (?P>...: a call to a group.
924 return parse_call_named_group(source, info, saved_pos)
926 source.pos = saved_pos
927 raise error("unknown extension", source.string, saved_pos)
929def parse_comment(source):
930 "Parses a comment."
931 while True:
932 saved_pos = source.pos
933 c = source.get(True)
935 if not c or c == ")":
936 break
938 if c == "\\":
939 c = source.get(True)
941 source.pos = saved_pos
942 source.expect(")")
944 return None
946def parse_lookaround(source, info, behind, positive):
947 "Parses a lookaround."
948 saved_flags = info.flags
949 try:
950 subpattern = _parse_pattern(source, info)
951 source.expect(")")
952 finally:
953 info.flags = saved_flags
954 source.ignore_space = bool(info.flags & VERBOSE)
956 return LookAround(behind, positive, subpattern)
958def parse_conditional(source, info):
959 "Parses a conditional subpattern."
960 saved_flags = info.flags
961 saved_pos = source.pos
962 ch = source.get()
963 if ch == "?":
964 # (?(?...
965 ch = source.get()
966 if ch in ("=", "!"):
967 # (?(?=... or (?(?!...: lookahead conditional.
968 return parse_lookaround_conditional(source, info, False, ch == "=")
969 if ch == "<":
970 # (?(?<...
971 ch = source.get()
972 if ch in ("=", "!"):
973 # (?(?<=... or (?(?<!...: lookbehind conditional.
974 return parse_lookaround_conditional(source, info, True, ch ==
975 "=")
977 source.pos = saved_pos
978 raise error("expected lookaround conditional", source.string,
979 source.pos)
981 source.pos = saved_pos
982 try:
983 group = parse_name(source, True)
984 source.expect(")")
985 yes_branch = parse_sequence(source, info)
986 if source.match("|"):
987 no_branch = parse_sequence(source, info)
988 else:
989 no_branch = Sequence()
991 source.expect(")")
992 finally:
993 info.flags = saved_flags
994 source.ignore_space = bool(info.flags & VERBOSE)
996 if yes_branch.is_empty() and no_branch.is_empty():
997 return Sequence()
999 return Conditional(info, group, yes_branch, no_branch, saved_pos)
1001def parse_lookaround_conditional(source, info, behind, positive):
1002 saved_flags = info.flags
1003 try:
1004 subpattern = _parse_pattern(source, info)
1005 source.expect(")")
1006 finally:
1007 info.flags = saved_flags
1008 source.ignore_space = bool(info.flags & VERBOSE)
1010 yes_branch = parse_sequence(source, info)
1011 if source.match("|"):
1012 no_branch = parse_sequence(source, info)
1013 else:
1014 no_branch = Sequence()
1016 source.expect(")")
1018 return LookAroundConditional(behind, positive, subpattern, yes_branch,
1019 no_branch)
1021def parse_atomic(source, info):
1022 "Parses an atomic subpattern."
1023 saved_flags = info.flags
1024 try:
1025 subpattern = _parse_pattern(source, info)
1026 source.expect(")")
1027 finally:
1028 info.flags = saved_flags
1029 source.ignore_space = bool(info.flags & VERBOSE)
1031 return Atomic(subpattern)
1033def parse_common(source, info):
1034 "Parses a common groups branch."
1035 # Capture group numbers in different branches can reuse the group numbers.
1036 initial_group_count = info.group_count
1037 branches = [parse_sequence(source, info)]
1038 final_group_count = info.group_count
1039 while source.match("|"):
1040 info.group_count = initial_group_count
1041 branches.append(parse_sequence(source, info))
1042 final_group_count = max(final_group_count, info.group_count)
1044 info.group_count = final_group_count
1045 source.expect(")")
1047 if len(branches) == 1:
1048 return branches[0]
1049 return Branch(branches)
1051def parse_call_group(source, info, ch, pos):
1052 "Parses a call to a group."
1053 if ch == "R":
1054 group = "0"
1055 else:
1056 group = ch + source.get_while(DIGITS)
1058 source.expect(")")
1060 return CallGroup(info, group, pos)
1062def parse_call_named_group(source, info, pos):
1063 "Parses a call to a named group."
1064 group = parse_name(source)
1065 source.expect(")")
1067 return CallGroup(info, group, pos)
1069def parse_flag_set(source):
1070 "Parses a set of inline flags."
1071 flags = 0
1073 try:
1074 while True:
1075 saved_pos = source.pos
1076 ch = source.get()
1077 if ch == "V":
1078 ch += source.get()
1079 flags |= REGEX_FLAGS[ch]
1080 except KeyError:
1081 source.pos = saved_pos
1083 return flags
1085def parse_flags(source, info):
1086 "Parses flags being turned on/off."
1087 flags_on = parse_flag_set(source)
1088 if source.match("-"):
1089 flags_off = parse_flag_set(source)
1090 if not flags_off:
1091 raise error("bad inline flags: no flags after '-'", source.string,
1092 source.pos)
1093 else:
1094 flags_off = 0
1096 if flags_on & LOCALE:
1097 # Remember that this pattern as an inline locale flag.
1098 info.inline_locale = True
1100 return flags_on, flags_off
1102def parse_subpattern(source, info, flags_on, flags_off):
1103 "Parses a subpattern with scoped flags."
1104 saved_flags = info.flags
1105 info.flags = (info.flags | flags_on) & ~flags_off
1107 # Ensure that there aren't multiple encoding flags set.
1108 if info.flags & (ASCII | LOCALE | UNICODE):
1109 info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on
1111 source.ignore_space = bool(info.flags & VERBOSE)
1112 try:
1113 subpattern = _parse_pattern(source, info)
1114 source.expect(")")
1115 finally:
1116 info.flags = saved_flags
1117 source.ignore_space = bool(info.flags & VERBOSE)
1119 return subpattern
1121def parse_flags_subpattern(source, info):
1122 """Parses a flags subpattern. It could be inline flags or a subpattern
1123 possibly with local flags. If it's a subpattern, then that's returned;
1124 if it's a inline flags, then None is returned.
1125 """
1126 flags_on, flags_off = parse_flags(source, info)
1128 if flags_off & GLOBAL_FLAGS:
1129 raise error("bad inline flags: cannot turn off global flag",
1130 source.string, source.pos)
1132 if flags_on & flags_off:
1133 raise error("bad inline flags: flag turned on and off", source.string,
1134 source.pos)
1136 # Handle flags which are global in all regex behaviours.
1137 new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
1138 if new_global_flags:
1139 info.global_flags |= new_global_flags
1141 # A global has been turned on, so reparse the pattern.
1142 raise _UnscopedFlagSet(info.global_flags)
1144 # Ensure that from now on we have only scoped flags.
1145 flags_on &= ~GLOBAL_FLAGS
1147 if source.match(":"):
1148 return parse_subpattern(source, info, flags_on, flags_off)
1150 if source.match(")"):
1151 parse_positional_flags(source, info, flags_on, flags_off)
1152 return None
1154 raise error("unknown extension", source.string, source.pos)
1156def parse_positional_flags(source, info, flags_on, flags_off):
1157 "Parses positional flags."
1158 info.flags = (info.flags | flags_on) & ~flags_off
1159 source.ignore_space = bool(info.flags & VERBOSE)
1161def parse_name(source, allow_numeric=False, allow_group_0=False):
1162 "Parses a name."
1163 name = source.get_while(set(")>"), include=False)
1165 if not name:
1166 raise error("missing group name", source.string, source.pos)
1168 if name.isdigit():
1169 min_group = 0 if allow_group_0 else 1
1170 if not allow_numeric or int(name) < min_group:
1171 raise error("bad character in group name", source.string,
1172 source.pos)
1173 else:
1174 if not name.isidentifier():
1175 raise error("bad character in group name", source.string,
1176 source.pos)
1178 return name
1180def is_octal(string):
1181 "Checks whether a string is octal."
1182 return all(ch in OCT_DIGITS for ch in string)
1184def is_decimal(string):
1185 "Checks whether a string is decimal."
1186 return all(ch in DIGITS for ch in string)
1188def is_hexadecimal(string):
1189 "Checks whether a string is hexadecimal."
1190 return all(ch in HEX_DIGITS for ch in string)
1192def parse_escape(source, info, in_set):
1193 "Parses an escape sequence."
1194 saved_ignore = source.ignore_space
1195 source.ignore_space = False
1196 ch = source.get()
1197 source.ignore_space = saved_ignore
1198 if not ch:
1199 # A backslash at the end of the pattern.
1200 raise error("bad escape (end of pattern)", source.string, source.pos)
1201 if ch in HEX_ESCAPES:
1202 # A hexadecimal escape sequence.
1203 return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
1204 elif ch == "g" and not in_set:
1205 # A group reference.
1206 saved_pos = source.pos
1207 try:
1208 return parse_group_ref(source, info)
1209 except error:
1210 # Invalid as a group reference, so assume it's a literal.
1211 source.pos = saved_pos
1213 return make_character(info, ord(ch), in_set)
1214 elif ch == "G" and not in_set:
1215 # A search anchor.
1216 return SearchAnchor()
1217 elif ch == "L" and not in_set:
1218 # A string set.
1219 return parse_string_set(source, info)
1220 elif ch == "N":
1221 # A named codepoint.
1222 return parse_named_char(source, info, in_set)
1223 elif ch in "pP":
1224 # A Unicode property, positive or negative.
1225 return parse_property(source, info, ch == "p", in_set)
1226 elif ch == "R" and not in_set:
1227 # A line ending.
1228 charset = [0x0A, 0x0B, 0x0C, 0x0D]
1229 if info.guess_encoding == UNICODE:
1230 charset.extend([0x85, 0x2028, 0x2029])
1232 return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
1233 for c in charset])]))
1234 elif ch == "X" and not in_set:
1235 # A grapheme cluster.
1236 return Grapheme()
1237 elif ch in ALPHA:
1238 # An alphabetic escape sequence.
1239 # Positional escapes aren't allowed inside a character set.
1240 if not in_set:
1241 if info.flags & WORD:
1242 value = WORD_POSITION_ESCAPES.get(ch)
1243 elif info.flags & ASCII:
1244 value = ASCII_POSITION_ESCAPES.get(ch)
1245 elif info.flags & UNICODE:
1246 value = UNICODE_POSITION_ESCAPES.get(ch)
1247 else:
1248 value = POSITION_ESCAPES.get(ch)
1250 if value:
1251 return value
1253 if info.flags & ASCII:
1254 value = ASCII_CHARSET_ESCAPES.get(ch)
1255 elif info.flags & UNICODE:
1256 value = UNICODE_CHARSET_ESCAPES.get(ch)
1257 else:
1258 value = CHARSET_ESCAPES.get(ch)
1260 if value:
1261 return value
1263 value = CHARACTER_ESCAPES.get(ch)
1264 if value:
1265 return Character(ord(value))
1267 raise error("bad escape \\%s" % ch, source.string, source.pos)
1268 elif ch in DIGITS:
1269 # A numeric escape sequence.
1270 return parse_numeric_escape(source, info, ch, in_set)
1271 else:
1272 # A literal.
1273 return make_character(info, ord(ch), in_set)
1275def parse_numeric_escape(source, info, ch, in_set):
1276 "Parses a numeric escape sequence."
1277 if in_set or ch == "0":
1278 # Octal escape sequence, max 3 digits.
1279 return parse_octal_escape(source, info, [ch], in_set)
1281 # At least 1 digit, so either octal escape or group.
1282 digits = ch
1283 saved_pos = source.pos
1284 ch = source.get()
1285 if ch in DIGITS:
1286 # At least 2 digits, so either octal escape or group.
1287 digits += ch
1288 saved_pos = source.pos
1289 ch = source.get()
1290 if is_octal(digits) and ch in OCT_DIGITS:
1291 # 3 octal digits, so octal escape sequence.
1292 encoding = info.flags & _ALL_ENCODINGS
1293 if encoding == ASCII or encoding == LOCALE:
1294 octal_mask = 0xFF
1295 else:
1296 octal_mask = 0x1FF
1298 value = int(digits + ch, 8) & octal_mask
1299 return make_character(info, value)
1301 # Group reference.
1302 source.pos = saved_pos
1303 if info.is_open_group(digits):
1304 raise error("cannot refer to an open group", source.string, source.pos)
1306 return make_ref_group(info, digits, source.pos)
1308def parse_octal_escape(source, info, digits, in_set):
1309 "Parses an octal escape sequence."
1310 saved_pos = source.pos
1311 ch = source.get()
1312 while len(digits) < 3 and ch in OCT_DIGITS:
1313 digits.append(ch)
1314 saved_pos = source.pos
1315 ch = source.get()
1317 source.pos = saved_pos
1318 try:
1319 value = int("".join(digits), 8)
1320 return make_character(info, value, in_set)
1321 except ValueError:
1322 if digits[0] in OCT_DIGITS:
1323 raise error("incomplete escape \\%s" % ''.join(digits),
1324 source.string, source.pos)
1325 else:
1326 raise error("bad escape \\%s" % digits[0], source.string,
1327 source.pos)
1329def parse_hex_escape(source, info, esc, expected_len, in_set, type):
1330 "Parses a hex escape sequence."
1331 saved_pos = source.pos
1332 digits = []
1333 for i in range(expected_len):
1334 ch = source.get()
1335 if ch not in HEX_DIGITS:
1336 raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
1337 source.string, saved_pos)
1338 digits.append(ch)
1340 try:
1341 value = int("".join(digits), 16)
1342 except ValueError:
1343 pass
1344 else:
1345 if value < 0x110000:
1346 return make_character(info, value, in_set)
1348 # Bad hex escape.
1349 raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
1350 source.string, saved_pos)
1352def parse_group_ref(source, info):
1353 "Parses a group reference."
1354 source.expect("<")
1355 saved_pos = source.pos
1356 name = parse_name(source, True)
1357 source.expect(">")
1358 if info.is_open_group(name):
1359 raise error("cannot refer to an open group", source.string, source.pos)
1361 return make_ref_group(info, name, saved_pos)
1363def parse_string_set(source, info):
1364 "Parses a string set reference."
1365 source.expect("<")
1366 name = parse_name(source, True)
1367 source.expect(">")
1368 if name is None or name not in info.kwargs:
1369 raise error("undefined named list", source.string, source.pos)
1371 return make_string_set(info, name)
1373def parse_named_char(source, info, in_set):
1374 "Parses a named character."
1375 saved_pos = source.pos
1376 if source.match("{"):
1377 name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
1378 if source.match("}"):
1379 try:
1380 value = unicodedata.lookup(name)
1381 return make_character(info, ord(value), in_set)
1382 except KeyError:
1383 raise error("undefined character name", source.string,
1384 source.pos)
1386 source.pos = saved_pos
1387 return make_character(info, ord("N"), in_set)
1389def parse_property(source, info, positive, in_set):
1390 "Parses a Unicode property."
1391 saved_pos = source.pos
1392 ch = source.get()
1393 if ch == "{":
1394 negate = source.match("^")
1395 prop_name, name = parse_property_name(source)
1396 if source.match("}"):
1397 # It's correctly delimited.
1398 if info.flags & ASCII:
1399 encoding = ASCII_ENCODING
1400 elif info.flags & UNICODE:
1401 encoding = UNICODE_ENCODING
1402 else:
1403 encoding = 0
1405 prop = lookup_property(prop_name, name, positive != negate, source,
1406 encoding=encoding)
1407 return make_property(info, prop, in_set)
1408 elif ch and ch in "CLMNPSZ":
1409 # An abbreviated property, eg \pL.
1410 if info.flags & ASCII:
1411 encoding = ASCII_ENCODING
1412 elif info.flags & UNICODE:
1413 encoding = UNICODE_ENCODING
1414 else:
1415 encoding = 0
1417 prop = lookup_property(None, ch, positive, source, encoding=encoding)
1418 return make_property(info, prop, in_set)
1420 # Not a property, so treat as a literal "p" or "P".
1421 source.pos = saved_pos
1422 ch = "p" if positive else "P"
1423 return make_character(info, ord(ch), in_set)
1425def parse_property_name(source):
1426 "Parses a property name, which may be qualified."
1427 name = source.get_while(PROPERTY_NAME_PART)
1428 saved_pos = source.pos
1430 ch = source.get()
1431 if ch and ch in ":=":
1432 prop_name = name
1433 name = source.get_while(ALNUM | set(" &_-./")).strip()
1435 if name:
1436 # Name after the ":" or "=", so it's a qualified name.
1437 saved_pos = source.pos
1438 else:
1439 # No name after the ":" or "=", so assume it's an unqualified name.
1440 prop_name, name = None, prop_name
1441 else:
1442 prop_name = None
1444 source.pos = saved_pos
1445 return prop_name, name
1447def parse_set(source, info):
1448 "Parses a character set."
1449 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
1451 saved_ignore = source.ignore_space
1452 source.ignore_space = False
1453 # Negative set?
1454 negate = source.match("^")
1455 try:
1456 if version == VERSION0:
1457 item = parse_set_imp_union(source, info)
1458 else:
1459 item = parse_set_union(source, info)
1461 if not source.match("]"):
1462 raise error("missing ]", source.string, source.pos)
1463 finally:
1464 source.ignore_space = saved_ignore
1466 if negate:
1467 item = item.with_flags(positive=not item.positive)
1469 item = item.with_flags(case_flags=make_case_flags(info))
1471 return item
1473def parse_set_union(source, info):
1474 "Parses a set union ([x||y])."
1475 items = [parse_set_symm_diff(source, info)]
1476 while source.match("||"):
1477 items.append(parse_set_symm_diff(source, info))
1479 if len(items) == 1:
1480 return items[0]
1481 return SetUnion(info, items)
1483def parse_set_symm_diff(source, info):
1484 "Parses a set symmetric difference ([x~~y])."
1485 items = [parse_set_inter(source, info)]
1486 while source.match("~~"):
1487 items.append(parse_set_inter(source, info))
1489 if len(items) == 1:
1490 return items[0]
1491 return SetSymDiff(info, items)
1493def parse_set_inter(source, info):
1494 "Parses a set intersection ([x&&y])."
1495 items = [parse_set_diff(source, info)]
1496 while source.match("&&"):
1497 items.append(parse_set_diff(source, info))
1499 if len(items) == 1:
1500 return items[0]
1501 return SetInter(info, items)
1503def parse_set_diff(source, info):
1504 "Parses a set difference ([x--y])."
1505 items = [parse_set_imp_union(source, info)]
1506 while source.match("--"):
1507 items.append(parse_set_imp_union(source, info))
1509 if len(items) == 1:
1510 return items[0]
1511 return SetDiff(info, items)
1513def parse_set_imp_union(source, info):
1514 "Parses a set implicit union ([xy])."
1515 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
1517 items = [parse_set_member(source, info)]
1518 while True:
1519 saved_pos = source.pos
1520 if source.match("]"):
1521 # End of the set.
1522 source.pos = saved_pos
1523 break
1525 if version == VERSION1 and any(source.match(op) for op in SET_OPS):
1526 # The new behaviour has set operators.
1527 source.pos = saved_pos
1528 break
1530 items.append(parse_set_member(source, info))
1532 if len(items) == 1:
1533 return items[0]
1534 return SetUnion(info, items)
1536def parse_set_member(source, info):
1537 "Parses a member in a character set."
1538 # Parse a set item.
1539 start = parse_set_item(source, info)
1540 saved_pos1 = source.pos
1541 if (not isinstance(start, Character) or not start.positive or not
1542 source.match("-")):
1543 # It's not the start of a range.
1544 return start
1546 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
1548 # It looks like the start of a range of characters.
1549 saved_pos2 = source.pos
1550 if version == VERSION1 and source.match("-"):
1551 # It's actually the set difference operator '--', so return the
1552 # character.
1553 source.pos = saved_pos1
1554 return start
1556 if source.match("]"):
1557 # We've reached the end of the set, so return both the character and
1558 # hyphen.
1559 source.pos = saved_pos2
1560 return SetUnion(info, [start, Character(ord("-"))])
1562 # Parse a set item.
1563 end = parse_set_item(source, info)
1564 if not isinstance(end, Character) or not end.positive:
1565 # It's not a range, so return the character, hyphen and property.
1566 return SetUnion(info, [start, Character(ord("-")), end])
1568 # It _is_ a range.
1569 if start.value > end.value:
1570 raise error("bad character range", source.string, source.pos)
1572 if start.value == end.value:
1573 return start
1575 return Range(start.value, end.value)
1577def parse_set_item(source, info):
1578 "Parses an item in a character set."
1579 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
1581 if source.match("\\"):
1582 # An escape sequence in a set.
1583 return parse_escape(source, info, True)
1585 saved_pos = source.pos
1586 if source.match("[:"):
1587 # Looks like a POSIX character class.
1588 try:
1589 return parse_posix_class(source, info)
1590 except ParseError:
1591 # Not a POSIX character class.
1592 source.pos = saved_pos
1594 if version == VERSION1 and source.match("["):
1595 # It's the start of a nested set.
1597 # Negative set?
1598 negate = source.match("^")
1599 item = parse_set_union(source, info)
1601 if not source.match("]"):
1602 raise error("missing ]", source.string, source.pos)
1604 if negate:
1605 item = item.with_flags(positive=not item.positive)
1607 return item
1609 ch = source.get()
1610 if not ch:
1611 raise error("unterminated character set", source.string, source.pos)
1613 return Character(ord(ch))
1615def parse_posix_class(source, info):
1616 "Parses a POSIX character class."
1617 negate = source.match("^")
1618 prop_name, name = parse_property_name(source)
1619 if not source.match(":]"):
1620 raise ParseError()
1622 return lookup_property(prop_name, name, not negate, source, posix=True)
1624def float_to_rational(flt):
1625 "Converts a float to a rational pair."
1626 int_part = int(flt)
1627 error = flt - int_part
1628 if abs(error) < 0.0001:
1629 return int_part, 1
1631 den, num = float_to_rational(1.0 / error)
1633 return int_part * den + num, den
1635def numeric_to_rational(numeric):
1636 "Converts a numeric string to a rational string, if possible."
1637 if numeric[ : 1] == "-":
1638 sign, numeric = numeric[0], numeric[1 : ]
1639 else:
1640 sign = ""
1642 parts = numeric.split("/")
1643 if len(parts) == 2:
1644 num, den = float_to_rational(float(parts[0]) / float(parts[1]))
1645 elif len(parts) == 1:
1646 num, den = float_to_rational(float(parts[0]))
1647 else:
1648 raise ValueError()
1650 result = "{}{}/{}".format(sign, num, den)
1651 if result.endswith("/1"):
1652 return result[ : -2]
1654 return result
1656def standardise_name(name):
1657 "Standardises a property or value name."
1658 try:
1659 return numeric_to_rational("".join(name))
1660 except (ValueError, ZeroDivisionError):
1661 return "".join(ch for ch in name if ch not in "_- ").upper()
1663_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())
1665_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
1667def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
1668 "Looks up a property."
1669 # Normalise the names (which may still be lists).
1670 property = standardise_name(property) if property else None
1671 value = standardise_name(value)
1673 if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
1674 property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive
1676 if posix and not property and value.upper() in _POSIX_CLASSES:
1677 value = 'POSIX' + value
1679 if property:
1680 # Both the property and the value are provided.
1681 prop = PROPERTIES.get(property)
1682 if not prop:
1683 if not source:
1684 raise error("unknown property")
1686 raise error("unknown property", source.string, source.pos)
1688 prop_id, value_dict = prop
1689 val_id = value_dict.get(value)
1690 if val_id is None:
1691 if not source:
1692 raise error("unknown property value")
1694 raise error("unknown property value", source.string, source.pos)
1696 return Property((prop_id << 16) | val_id, positive, encoding=encoding)
1698 # Only the value is provided.
1699 # It might be the name of a GC, script or block value.
1700 for property in ("GC", "SCRIPT", "BLOCK"):
1701 prop_id, value_dict = PROPERTIES.get(property)
1702 val_id = value_dict.get(value)
1703 if val_id is not None:
1704 return Property((prop_id << 16) | val_id, positive, encoding=encoding)
1706 # It might be the name of a binary property.
1707 prop = PROPERTIES.get(value)
1708 if prop:
1709 prop_id, value_dict = prop
1710 if set(value_dict) == _BINARY_VALUES:
1711 return Property((prop_id << 16) | 1, positive, encoding=encoding)
1713 return Property(prop_id << 16, not positive, encoding=encoding)
1715 # It might be the name of a binary property starting with a prefix.
1716 if value.startswith("IS"):
1717 prop = PROPERTIES.get(value[2 : ])
1718 if prop:
1719 prop_id, value_dict = prop
1720 if "YES" in value_dict:
1721 return Property((prop_id << 16) | 1, positive, encoding=encoding)
1723 # It might be the name of a script or block starting with a prefix.
1724 for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
1725 if value.startswith(prefix):
1726 prop_id, value_dict = PROPERTIES.get(property)
1727 val_id = value_dict.get(value[2 : ])
1728 if val_id is not None:
1729 return Property((prop_id << 16) | val_id, positive, encoding=encoding)
1731 # Unknown property.
1732 if not source:
1733 raise error("unknown property")
1735 raise error("unknown property", source.string, source.pos)
1737def _compile_replacement(source, pattern, is_unicode):
1738 "Compiles a replacement template escape sequence."
1739 ch = source.get()
1740 if ch in ALPHA:
1741 # An alphabetic escape sequence.
1742 value = CHARACTER_ESCAPES.get(ch)
1743 if value:
1744 return False, [ord(value)]
1746 if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
1747 # A hexadecimal escape sequence.
1748 return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]
1750 if ch == "g":
1751 # A group preference.
1752 return True, [compile_repl_group(source, pattern)]
1754 if ch == "N" and is_unicode:
1755 # A named character.
1756 value = parse_repl_named_char(source)
1757 if value is not None:
1758 return False, [value]
1760 raise error("bad escape \\%s" % ch, source.string, source.pos)
1762 if isinstance(source.sep, bytes):
1763 octal_mask = 0xFF
1764 else:
1765 octal_mask = 0x1FF
1767 if ch == "0":
1768 # An octal escape sequence.
1769 digits = ch
1770 while len(digits) < 3:
1771 saved_pos = source.pos
1772 ch = source.get()
1773 if ch not in OCT_DIGITS:
1774 source.pos = saved_pos
1775 break
1776 digits += ch
1778 return False, [int(digits, 8) & octal_mask]
1780 if ch in DIGITS:
1781 # Either an octal escape sequence (3 digits) or a group reference (max
1782 # 2 digits).
1783 digits = ch
1784 saved_pos = source.pos
1785 ch = source.get()
1786 if ch in DIGITS:
1787 digits += ch
1788 saved_pos = source.pos
1789 ch = source.get()
1790 if ch and is_octal(digits + ch):
1791 # An octal escape sequence.
1792 return False, [int(digits + ch, 8) & octal_mask]
1794 # A group reference.
1795 source.pos = saved_pos
1796 return True, [int(digits)]
1798 if ch == "\\":
1799 # An escaped backslash is a backslash.
1800 return False, [ord("\\")]
1802 if not ch:
1803 # A trailing backslash.
1804 raise error("bad escape (end of pattern)", source.string, source.pos)
1806 # An escaped non-backslash is a backslash followed by the literal.
1807 return False, [ord("\\"), ord(ch)]
1809def parse_repl_hex_escape(source, expected_len, type):
1810 "Parses a hex escape sequence in a replacement string."
1811 digits = []
1812 for i in range(expected_len):
1813 ch = source.get()
1814 if ch not in HEX_DIGITS:
1815 raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
1816 source.string, source.pos)
1817 digits.append(ch)
1819 return int("".join(digits), 16)
1821def parse_repl_named_char(source):
1822 "Parses a named character in a replacement string."
1823 saved_pos = source.pos
1824 if source.match("{"):
1825 name = source.get_while(ALPHA | set(" "))
1827 if source.match("}"):
1828 try:
1829 value = unicodedata.lookup(name)
1830 return ord(value)
1831 except KeyError:
1832 raise error("undefined character name", source.string,
1833 source.pos)
1835 source.pos = saved_pos
1836 return None
1838def compile_repl_group(source, pattern):
1839 "Compiles a replacement template group reference."
1840 source.expect("<")
1841 name = parse_name(source, True, True)
1843 source.expect(">")
1844 if name.isdigit():
1845 index = int(name)
1846 if not 0 <= index <= pattern.groups:
1847 raise error("invalid group reference", source.string, source.pos)
1849 return index
1851 try:
1852 return pattern.groupindex[name]
1853 except KeyError:
1854 raise IndexError("unknown group")
1856# The regular expression is parsed into a syntax tree. The different types of
1857# node are defined below.
1859INDENT = " "
1860POSITIVE_OP = 0x1
1861ZEROWIDTH_OP = 0x2
1862FUZZY_OP = 0x4
1863REVERSE_OP = 0x8
1864REQUIRED_OP = 0x10
1865ENCODING_OP_SHIFT = 5
1867POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
1868CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
1869 FULLIGNORECASE: " FULL_IGNORE_CASE"}
1871def make_sequence(items):
1872 if len(items) == 1:
1873 return items[0]
1874 return Sequence(items)
1876# Common base class for all nodes.
1877class RegexBase:
1878 def __init__(self):
1879 self._key = self.__class__
1881 def with_flags(self, positive=None, case_flags=None, zerowidth=None):
1882 if positive is None:
1883 positive = self.positive
1884 else:
1885 positive = bool(positive)
1886 if case_flags is None:
1887 case_flags = self.case_flags
1888 else:
1889 case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
1890 if zerowidth is None:
1891 zerowidth = self.zerowidth
1892 else:
1893 zerowidth = bool(zerowidth)
1895 if (positive == self.positive and case_flags == self.case_flags and
1896 zerowidth == self.zerowidth):
1897 return self
1899 return self.rebuild(positive, case_flags, zerowidth)
1901 def fix_groups(self, pattern, reverse, fuzzy):
1902 pass
1904 def optimise(self, info, reverse):
1905 return self
1907 def pack_characters(self, info):
1908 return self
1910 def remove_captures(self):
1911 return self
1913 def is_atomic(self):
1914 return True
1916 def can_be_affix(self):
1917 return True
1919 def contains_group(self):
1920 return False
1922 def get_firstset(self, reverse):
1923 raise _FirstSetError()
1925 def has_simple_start(self):
1926 return False
1928 def compile(self, reverse=False, fuzzy=False):
1929 return self._compile(reverse, fuzzy)
1931 def is_empty(self):
1932 return False
1934 def __hash__(self):
1935 return hash(self._key)
1937 def __eq__(self, other):
1938 return type(self) is type(other) and self._key == other._key
1940 def __ne__(self, other):
1941 return not self.__eq__(other)
1943 def get_required_string(self, reverse):
1944 return self.max_width(), None
1946# Base class for zero-width nodes.
1947class ZeroWidthBase(RegexBase):
1948 def __init__(self, positive=True, encoding=0):
1949 RegexBase.__init__(self)
1950 self.positive = bool(positive)
1951 self.encoding = encoding
1953 self._key = self.__class__, self.positive
1955 def get_firstset(self, reverse):
1956 return set([None])
1958 def _compile(self, reverse, fuzzy):
1959 flags = 0
1960 if self.positive:
1961 flags |= POSITIVE_OP
1962 if fuzzy:
1963 flags |= FUZZY_OP
1964 if reverse:
1965 flags |= REVERSE_OP
1966 flags |= self.encoding << ENCODING_OP_SHIFT
1967 return [(self._opcode, flags)]
1969 def dump(self, indent, reverse):
1970 print("{}{} {}{}".format(INDENT * indent, self._op_name,
1971 POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))
1973 def max_width(self):
1974 return 0
1976class Any(RegexBase):
1977 _opcode = {False: OP.ANY, True: OP.ANY_REV}
1978 _op_name = "ANY"
1980 def has_simple_start(self):
1981 return True
1983 def _compile(self, reverse, fuzzy):
1984 flags = 0
1985 if fuzzy:
1986 flags |= FUZZY_OP
1987 return [(self._opcode[reverse], flags)]
1989 def dump(self, indent, reverse):
1990 print("{}{}".format(INDENT * indent, self._op_name))
1992 def max_width(self):
1993 return 1
1995class AnyAll(Any):
1996 _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
1997 _op_name = "ANY_ALL"
1999 def __init__(self):
2000 self.positive = True
2001 self.zerowidth = False
2002 self.case_flags = 0
2004class AnyU(Any):
2005 _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
2006 _op_name = "ANY_U"
2008class Atomic(RegexBase):
2009 def __init__(self, subpattern):
2010 RegexBase.__init__(self)
2011 self.subpattern = subpattern
2013 def fix_groups(self, pattern, reverse, fuzzy):
2014 self.subpattern.fix_groups(pattern, reverse, fuzzy)
2016 def optimise(self, info, reverse):
2017 self.subpattern = self.subpattern.optimise(info, reverse)
2019 if self.subpattern.is_empty():
2020 return self.subpattern
2021 return self
2023 def pack_characters(self, info):
2024 self.subpattern = self.subpattern.pack_characters(info)
2025 return self
2027 def remove_captures(self):
2028 self.subpattern = self.subpattern.remove_captures()
2029 return self
2031 def can_be_affix(self):
2032 return self.subpattern.can_be_affix()
2034 def contains_group(self):
2035 return self.subpattern.contains_group()
2037 def get_firstset(self, reverse):
2038 return self.subpattern.get_firstset(reverse)
2040 def has_simple_start(self):
2041 return self.subpattern.has_simple_start()
2043 def _compile(self, reverse, fuzzy):
2044 return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
2045 [(OP.END, )])
2047 def dump(self, indent, reverse):
2048 print("{}ATOMIC".format(INDENT * indent))
2049 self.subpattern.dump(indent + 1, reverse)
2051 def is_empty(self):
2052 return self.subpattern.is_empty()
2054 def __eq__(self, other):
2055 return (type(self) is type(other) and self.subpattern ==
2056 other.subpattern)
2058 def max_width(self):
2059 return self.subpattern.max_width()
2061 def get_required_string(self, reverse):
2062 return self.subpattern.get_required_string(reverse)
2064class Boundary(ZeroWidthBase):
2065 _opcode = OP.BOUNDARY
2066 _op_name = "BOUNDARY"
2068class Branch(RegexBase):
2069 def __init__(self, branches):
2070 RegexBase.__init__(self)
2071 self.branches = branches
2073 def fix_groups(self, pattern, reverse, fuzzy):
2074 for b in self.branches:
2075 b.fix_groups(pattern, reverse, fuzzy)
2077 def optimise(self, info, reverse):
2078 if not self.branches:
2079 return Sequence([])
2081 # Flatten branches within branches.
2082 branches = Branch._flatten_branches(info, reverse, self.branches)
2084 # Move any common prefix or suffix out of the branches.
2085 if reverse:
2086 suffix, branches = Branch._split_common_suffix(info, branches)
2087 prefix = []
2088 else:
2089 prefix, branches = Branch._split_common_prefix(info, branches)
2090 suffix = []
2092 # Try to reduce adjacent single-character branches to sets.
2093 branches = Branch._reduce_to_set(info, reverse, branches)
2095 if len(branches) > 1:
2096 sequence = [Branch(branches)]
2098 if not prefix or not suffix:
2099 # We might be able to add a quick precheck before the branches.
2100 firstset = self._add_precheck(info, reverse, branches)
2102 if firstset:
2103 if reverse:
2104 sequence.append(firstset)
2105 else:
2106 sequence.insert(0, firstset)
2107 else:
2108 sequence = branches
2110 return make_sequence(prefix + sequence + suffix)
2112 def _add_precheck(self, info, reverse, branches):
2113 charset = set()
2114 pos = -1 if reverse else 0
2116 for branch in branches:
2117 if type(branch) is Literal and branch.case_flags == NOCASE:
2118 charset.add(branch.characters[pos])
2119 else:
2120 return
2122 if not charset:
2123 return None
2125 return _check_firstset(info, reverse, [Character(c) for c in charset])
2127 def pack_characters(self, info):
2128 self.branches = [b.pack_characters(info) for b in self.branches]
2129 return self
2131 def remove_captures(self):
2132 self.branches = [b.remove_captures() for b in self.branches]
2133 return self
2135 def is_atomic(self):
2136 return all(b.is_atomic() for b in self.branches)
2138 def can_be_affix(self):
2139 return all(b.can_be_affix() for b in self.branches)
2141 def contains_group(self):
2142 return any(b.contains_group() for b in self.branches)
2144 def get_firstset(self, reverse):
2145 fs = set()
2146 for b in self.branches:
2147 fs |= b.get_firstset(reverse)
2149 return fs or set([None])
2151 def _compile(self, reverse, fuzzy):
2152 if not self.branches:
2153 return []
2155 code = [(OP.BRANCH, )]
2156 for b in self.branches:
2157 code.extend(b.compile(reverse, fuzzy))
2158 code.append((OP.NEXT, ))
2160 code[-1] = (OP.END, )
2162 return code
2164 def dump(self, indent, reverse):
2165 print("{}BRANCH".format(INDENT * indent))
2166 self.branches[0].dump(indent + 1, reverse)
2167 for b in self.branches[1 : ]:
2168 print("{}OR".format(INDENT * indent))
2169 b.dump(indent + 1, reverse)
2171 @staticmethod
2172 def _flatten_branches(info, reverse, branches):
2173 # Flatten the branches so that there aren't branches of branches.
2174 new_branches = []
2175 for b in branches:
2176 b = b.optimise(info, reverse)
2177 if isinstance(b, Branch):
2178 new_branches.extend(b.branches)
2179 else:
2180 new_branches.append(b)
2182 return new_branches
2184 @staticmethod
2185 def _split_common_prefix(info, branches):
2186 # Common leading items can be moved out of the branches.
2187 # Get the items in the branches.
2188 alternatives = []
2189 for b in branches:
2190 if isinstance(b, Sequence):
2191 alternatives.append(b.items)
2192 else:
2193 alternatives.append([b])
2195 # What is the maximum possible length of the prefix?
2196 max_count = min(len(a) for a in alternatives)
2198 # What is the longest common prefix?
2199 prefix = alternatives[0]
2200 pos = 0
2201 end_pos = max_count
2202 while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
2203 prefix[pos] for a in alternatives):
2204 pos += 1
2205 count = pos
2207 if info.flags & UNICODE:
2208 # We need to check that we're not splitting a sequence of
2209 # characters which could form part of full case-folding.
2210 count = pos
2211 while count > 0 and not all(Branch._can_split(a, count) for a in
2212 alternatives):
2213 count -= 1
2215 # No common prefix is possible.
2216 if count == 0:
2217 return [], branches
2219 # Rebuild the branches.
2220 new_branches = []
2221 for a in alternatives:
2222 new_branches.append(make_sequence(a[count : ]))
2224 return prefix[ : count], new_branches
2226 @staticmethod
2227 def _split_common_suffix(info, branches):
2228 # Common trailing items can be moved out of the branches.
2229 # Get the items in the branches.
2230 alternatives = []
2231 for b in branches:
2232 if isinstance(b, Sequence):
2233 alternatives.append(b.items)
2234 else:
2235 alternatives.append([b])
2237 # What is the maximum possible length of the suffix?
2238 max_count = min(len(a) for a in alternatives)
2240 # What is the longest common suffix?
2241 suffix = alternatives[0]
2242 pos = -1
2243 end_pos = -1 - max_count
2244 while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
2245 suffix[pos] for a in alternatives):
2246 pos -= 1
2247 count = -1 - pos
2249 if info.flags & UNICODE:
2250 # We need to check that we're not splitting a sequence of
2251 # characters which could form part of full case-folding.
2252 while count > 0 and not all(Branch._can_split_rev(a, count) for a
2253 in alternatives):
2254 count -= 1
2256 # No common suffix is possible.
2257 if count == 0:
2258 return [], branches
2260 # Rebuild the branches.
2261 new_branches = []
2262 for a in alternatives:
2263 new_branches.append(make_sequence(a[ : -count]))
2265 return suffix[-count : ], new_branches
2267 @staticmethod
2268 def _can_split(items, count):
2269 # Check the characters either side of the proposed split.
2270 if not Branch._is_full_case(items, count - 1):
2271 return True
2273 if not Branch._is_full_case(items, count):
2274 return True
2276 # Check whether a 1-1 split would be OK.
2277 if Branch._is_folded(items[count - 1 : count + 1]):
2278 return False
2280 # Check whether a 1-2 split would be OK.
2281 if (Branch._is_full_case(items, count + 2) and
2282 Branch._is_folded(items[count - 1 : count + 2])):
2283 return False
2285 # Check whether a 2-1 split would be OK.
2286 if (Branch._is_full_case(items, count - 2) and
2287 Branch._is_folded(items[count - 2 : count + 1])):
2288 return False
2290 return True
2292 @staticmethod
2293 def _can_split_rev(items, count):
2294 end = len(items)
2296 # Check the characters either side of the proposed split.
2297 if not Branch._is_full_case(items, end - count):
2298 return True
2300 if not Branch._is_full_case(items, end - count - 1):
2301 return True
2303 # Check whether a 1-1 split would be OK.
2304 if Branch._is_folded(items[end - count - 1 : end - count + 1]):
2305 return False
2307 # Check whether a 1-2 split would be OK.
2308 if (Branch._is_full_case(items, end - count + 2) and
2309 Branch._is_folded(items[end - count - 1 : end - count + 2])):
2310 return False
2312 # Check whether a 2-1 split would be OK.
2313 if (Branch._is_full_case(items, end - count - 2) and
2314 Branch._is_folded(items[end - count - 2 : end - count + 1])):
2315 return False
2317 return True
2319 @staticmethod
2320 def _merge_common_prefixes(info, reverse, branches):
2321 # Branches with the same case-sensitive character prefix can be grouped
2322 # together if they are separated only by other branches with a
2323 # character prefix.
2324 prefixed = defaultdict(list)
2325 order = {}
2326 new_branches = []
2327 for b in branches:
2328 if Branch._is_simple_character(b):
2329 # Branch starts with a simple character.
2330 prefixed[b.value].append([b])
2331 order.setdefault(b.value, len(order))
2332 elif (isinstance(b, Sequence) and b.items and
2333 Branch._is_simple_character(b.items[0])):
2334 # Branch starts with a simple character.
2335 prefixed[b.items[0].value].append(b.items)
2336 order.setdefault(b.items[0].value, len(order))
2337 else:
2338 Branch._flush_char_prefix(info, reverse, prefixed, order,
2339 new_branches)
2341 new_branches.append(b)
2343 Branch._flush_char_prefix(info, prefixed, order, new_branches)
2345 return new_branches
2347 @staticmethod
2348 def _is_simple_character(c):
2349 return isinstance(c, Character) and c.positive and not c.case_flags
2351 @staticmethod
2352 def _reduce_to_set(info, reverse, branches):
2353 # Can the branches be reduced to a set?
2354 new_branches = []
2355 items = set()
2356 case_flags = NOCASE
2357 for b in branches:
2358 if isinstance(b, (Character, Property, SetBase)):
2359 # Branch starts with a single character.
2360 if b.case_flags != case_flags:
2361 # Different case sensitivity, so flush.
2362 Branch._flush_set_members(info, reverse, items, case_flags,
2363 new_branches)
2365 case_flags = b.case_flags
2367 items.add(b.with_flags(case_flags=NOCASE))
2368 else:
2369 Branch._flush_set_members(info, reverse, items, case_flags,
2370 new_branches)
2372 new_branches.append(b)
2374 Branch._flush_set_members(info, reverse, items, case_flags,
2375 new_branches)
2377 return new_branches
2379 @staticmethod
2380 def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
2381 # Flush the prefixed branches.
2382 if not prefixed:
2383 return
2385 for value, branches in sorted(prefixed.items(), key=lambda pair:
2386 order[pair[0]]):
2387 if len(branches) == 1:
2388 new_branches.append(make_sequence(branches[0]))
2389 else:
2390 subbranches = []
2391 optional = False
2392 for b in branches:
2393 if len(b) > 1:
2394 subbranches.append(make_sequence(b[1 : ]))
2395 elif not optional:
2396 subbranches.append(Sequence())
2397 optional = True
2399 sequence = Sequence([Character(value), Branch(subbranches)])
2400 new_branches.append(sequence.optimise(info, reverse))
2402 prefixed.clear()
2403 order.clear()
2405 @staticmethod
2406 def _flush_set_members(info, reverse, items, case_flags, new_branches):
2407 # Flush the set members.
2408 if not items:
2409 return
2411 if len(items) == 1:
2412 item = list(items)[0]
2413 else:
2414 item = SetUnion(info, list(items)).optimise(info, reverse)
2416 new_branches.append(item.with_flags(case_flags=case_flags))
2418 items.clear()
2420 @staticmethod
2421 def _is_full_case(items, i):
2422 if not 0 <= i < len(items):
2423 return False
2425 item = items[i]
2426 return (isinstance(item, Character) and item.positive and
2427 (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
2429 @staticmethod
2430 def _is_folded(items):
2431 if len(items) < 2:
2432 return False
2434 for i in items:
2435 if (not isinstance(i, Character) or not i.positive or not
2436 i.case_flags):
2437 return False
2439 folded = "".join(chr(i.value) for i in items)
2440 folded = _regex.fold_case(FULL_CASE_FOLDING, folded)
2442 # Get the characters which expand to multiple codepoints on folding.
2443 expanding_chars = _regex.get_expand_on_folding()
2445 for c in expanding_chars:
2446 if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
2447 return True
2449 return False
2451 def is_empty(self):
2452 return all(b.is_empty() for b in self.branches)
2454 def __eq__(self, other):
2455 return type(self) is type(other) and self.branches == other.branches
2457 def max_width(self):
2458 return max(b.max_width() for b in self.branches)
2460class CallGroup(RegexBase):
2461 def __init__(self, info, group, position):
2462 RegexBase.__init__(self)
2463 self.info = info
2464 self.group = group
2465 self.position = position
2467 self._key = self.__class__, self.group
2469 def fix_groups(self, pattern, reverse, fuzzy):
2470 try:
2471 self.group = int(self.group)
2472 except ValueError:
2473 try:
2474 self.group = self.info.group_index[self.group]
2475 except KeyError:
2476 raise error("invalid group reference", pattern, self.position)
2478 if not 0 <= self.group <= self.info.group_count:
2479 raise error("unknown group", pattern, self.position)
2481 if self.group > 0 and self.info.open_group_count[self.group] > 1:
2482 raise error("ambiguous group reference", pattern, self.position)
2484 self.info.group_calls.append((self, reverse, fuzzy))
2486 self._key = self.__class__, self.group
2488 def remove_captures(self):
2489 raise error("group reference not allowed", pattern, self.position)
2491 def _compile(self, reverse, fuzzy):
2492 return [(OP.GROUP_CALL, self.call_ref)]
2494 def dump(self, indent, reverse):
2495 print("{}GROUP_CALL {}".format(INDENT * indent, self.group))
2497 def __eq__(self, other):
2498 return type(self) is type(other) and self.group == other.group
2500 def max_width(self):
2501 return UNLIMITED
2503 def __del__(self):
2504 self.info = None
2506class CallRef(RegexBase):
2507 def __init__(self, ref, parsed):
2508 self.ref = ref
2509 self.parsed = parsed
2511 def _compile(self, reverse, fuzzy):
2512 return ([(OP.CALL_REF, self.ref)] + self.parsed._compile(reverse,
2513 fuzzy) + [(OP.END, )])
2515class Character(RegexBase):
2516 _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
2517 OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
2518 False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
2519 True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
2520 (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}
2522 def __init__(self, value, positive=True, case_flags=NOCASE,
2523 zerowidth=False):
2524 RegexBase.__init__(self)
2525 self.value = value
2526 self.positive = bool(positive)
2527 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
2528 self.zerowidth = bool(zerowidth)
2530 if (self.positive and (self.case_flags & FULLIGNORECASE) ==
2531 FULLIGNORECASE):
2532 self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
2533 else:
2534 self.folded = chr(self.value)
2536 self._key = (self.__class__, self.value, self.positive,
2537 self.case_flags, self.zerowidth)
2539 def rebuild(self, positive, case_flags, zerowidth):
2540 return Character(self.value, positive, case_flags, zerowidth)
2542 def optimise(self, info, reverse, in_set=False):
2543 return self
2545 def get_firstset(self, reverse):
2546 return set([self])
2548 def has_simple_start(self):
2549 return True
2551 def _compile(self, reverse, fuzzy):
2552 flags = 0
2553 if self.positive:
2554 flags |= POSITIVE_OP
2555 if self.zerowidth:
2556 flags |= ZEROWIDTH_OP
2557 if fuzzy:
2558 flags |= FUZZY_OP
2560 code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
2561 self.value])
2563 if len(self.folded) > 1:
2564 # The character expands on full case-folding.
2565 code = Branch([code, String([ord(c) for c in self.folded],
2566 case_flags=self.case_flags)])
2568 return code.compile(reverse, fuzzy)
2570 def dump(self, indent, reverse):
2571 display = ascii(chr(self.value)).lstrip("bu")
2572 print("{}CHARACTER {} {}{}".format(INDENT * indent,
2573 POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))
2575 def matches(self, ch):
2576 return (ch == self.value) == self.positive
2578 def max_width(self):
2579 return len(self.folded)
2581 def get_required_string(self, reverse):
2582 if not self.positive:
2583 return 1, None
2585 self.folded_characters = tuple(ord(c) for c in self.folded)
2587 return 0, self
2589class Conditional(RegexBase):
2590 def __init__(self, info, group, yes_item, no_item, position):
2591 RegexBase.__init__(self)
2592 self.info = info
2593 self.group = group
2594 self.yes_item = yes_item
2595 self.no_item = no_item
2596 self.position = position
2598 def fix_groups(self, pattern, reverse, fuzzy):
2599 try:
2600 self.group = int(self.group)
2601 except ValueError:
2602 try:
2603 self.group = self.info.group_index[self.group]
2604 except KeyError:
2605 if self.group == 'DEFINE':
2606 # 'DEFINE' is a special name unless there's a group with
2607 # that name.
2608 self.group = 0
2609 else:
2610 raise error("unknown group", pattern, self.position)
2612 if not 0 <= self.group <= self.info.group_count:
2613 raise error("invalid group reference", pattern, self.position)
2615 self.yes_item.fix_groups(pattern, reverse, fuzzy)
2616 self.no_item.fix_groups(pattern, reverse, fuzzy)
2618 def optimise(self, info, reverse):
2619 yes_item = self.yes_item.optimise(info, reverse)
2620 no_item = self.no_item.optimise(info, reverse)
2622 return Conditional(info, self.group, yes_item, no_item, self.position)
2624 def pack_characters(self, info):
2625 self.yes_item = self.yes_item.pack_characters(info)
2626 self.no_item = self.no_item.pack_characters(info)
2627 return self
2629 def remove_captures(self):
2630 self.yes_item = self.yes_item.remove_captures()
2631 self.no_item = self.no_item.remove_captures()
2633 def is_atomic(self):
2634 return self.yes_item.is_atomic() and self.no_item.is_atomic()
2636 def can_be_affix(self):
2637 return self.yes_item.can_be_affix() and self.no_item.can_be_affix()
2639 def contains_group(self):
2640 return self.yes_item.contains_group() or self.no_item.contains_group()
2642 def get_firstset(self, reverse):
2643 return (self.yes_item.get_firstset(reverse) |
2644 self.no_item.get_firstset(reverse))
2646 def _compile(self, reverse, fuzzy):
2647 code = [(OP.GROUP_EXISTS, self.group)]
2648 code.extend(self.yes_item.compile(reverse, fuzzy))
2649 add_code = self.no_item.compile(reverse, fuzzy)
2650 if add_code:
2651 code.append((OP.NEXT, ))
2652 code.extend(add_code)
2654 code.append((OP.END, ))
2656 return code
2658 def dump(self, indent, reverse):
2659 print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
2660 self.yes_item.dump(indent + 1, reverse)
2661 if not self.no_item.is_empty():
2662 print("{}OR".format(INDENT * indent))
2663 self.no_item.dump(indent + 1, reverse)
2665 def is_empty(self):
2666 return self.yes_item.is_empty() and self.no_item.is_empty()
2668 def __eq__(self, other):
2669 return type(self) is type(other) and (self.group, self.yes_item,
2670 self.no_item) == (other.group, other.yes_item, other.no_item)
2672 def max_width(self):
2673 return max(self.yes_item.max_width(), self.no_item.max_width())
2675 def __del__(self):
2676 self.info = None
2678class DefaultBoundary(ZeroWidthBase):
2679 _opcode = OP.DEFAULT_BOUNDARY
2680 _op_name = "DEFAULT_BOUNDARY"
2682class DefaultEndOfWord(ZeroWidthBase):
2683 _opcode = OP.DEFAULT_END_OF_WORD
2684 _op_name = "DEFAULT_END_OF_WORD"
2686class DefaultStartOfWord(ZeroWidthBase):
2687 _opcode = OP.DEFAULT_START_OF_WORD
2688 _op_name = "DEFAULT_START_OF_WORD"
2690class EndOfLine(ZeroWidthBase):
2691 _opcode = OP.END_OF_LINE
2692 _op_name = "END_OF_LINE"
2694class EndOfLineU(EndOfLine):
2695 _opcode = OP.END_OF_LINE_U
2696 _op_name = "END_OF_LINE_U"
2698class EndOfString(ZeroWidthBase):
2699 _opcode = OP.END_OF_STRING
2700 _op_name = "END_OF_STRING"
2702class EndOfStringLine(ZeroWidthBase):
2703 _opcode = OP.END_OF_STRING_LINE
2704 _op_name = "END_OF_STRING_LINE"
2706class EndOfStringLineU(EndOfStringLine):
2707 _opcode = OP.END_OF_STRING_LINE_U
2708 _op_name = "END_OF_STRING_LINE_U"
2710class EndOfWord(ZeroWidthBase):
2711 _opcode = OP.END_OF_WORD
2712 _op_name = "END_OF_WORD"
2714class Failure(ZeroWidthBase):
2715 _op_name = "FAILURE"
2717 def _compile(self, reverse, fuzzy):
2718 return [(OP.FAILURE, )]
2720class Fuzzy(RegexBase):
2721 def __init__(self, subpattern, constraints=None):
2722 RegexBase.__init__(self)
2723 if constraints is None:
2724 constraints = {}
2725 self.subpattern = subpattern
2726 self.constraints = constraints
2728 # If an error type is mentioned in the cost equation, then its maximum
2729 # defaults to unlimited.
2730 if "cost" in constraints:
2731 for e in "dis":
2732 if e in constraints["cost"]:
2733 constraints.setdefault(e, (0, None))
2735 # If any error type is mentioned, then all the error maxima default to
2736 # 0, otherwise they default to unlimited.
2737 if set(constraints) & set("dis"):
2738 for e in "dis":
2739 constraints.setdefault(e, (0, 0))
2740 else:
2741 for e in "dis":
2742 constraints.setdefault(e, (0, None))
2744 # The maximum of the generic error type defaults to unlimited.
2745 constraints.setdefault("e", (0, None))
2747 # The cost equation defaults to equal costs. Also, the cost of any
2748 # error type not mentioned in the cost equation defaults to 0.
2749 if "cost" in constraints:
2750 for e in "dis":
2751 constraints["cost"].setdefault(e, 0)
2752 else:
2753 constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
2754 constraints["e"][1]}
2756 def fix_groups(self, pattern, reverse, fuzzy):
2757 self.subpattern.fix_groups(pattern, reverse, True)
2759 def pack_characters(self, info):
2760 self.subpattern = self.subpattern.pack_characters(info)
2761 return self
2763 def remove_captures(self):
2764 self.subpattern = self.subpattern.remove_captures()
2765 return self
2767 def is_atomic(self):
2768 return self.subpattern.is_atomic()
2770 def contains_group(self):
2771 return self.subpattern.contains_group()
2773 def _compile(self, reverse, fuzzy):
2774 # The individual limits.
2775 arguments = []
2776 for e in "dise":
2777 v = self.constraints[e]
2778 arguments.append(v[0])
2779 arguments.append(UNLIMITED if v[1] is None else v[1])
2781 # The coeffs of the cost equation.
2782 for e in "dis":
2783 arguments.append(self.constraints["cost"][e])
2785 # The maximum of the cost equation.
2786 v = self.constraints["cost"]["max"]
2787 arguments.append(UNLIMITED if v is None else v)
2789 flags = 0
2790 if reverse:
2791 flags |= REVERSE_OP
2793 test = self.constraints.get("test")
2795 if test:
2796 return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
2797 test.compile(reverse, True) + [(OP.NEXT,)] +
2798 self.subpattern.compile(reverse, True) + [(OP.END,)])
2800 return ([(OP.FUZZY, flags) + tuple(arguments)] +
2801 self.subpattern.compile(reverse, True) + [(OP.END,)])
2803 def dump(self, indent, reverse):
2804 constraints = self._constraints_to_string()
2805 if constraints:
2806 constraints = " " + constraints
2807 print("{}FUZZY{}".format(INDENT * indent, constraints))
2808 self.subpattern.dump(indent + 1, reverse)
2810 def is_empty(self):
2811 return self.subpattern.is_empty()
2813 def __eq__(self, other):
2814 return (type(self) is type(other) and self.subpattern ==
2815 other.subpattern and self.constraints == other.constraints)
2817 def max_width(self):
2818 return UNLIMITED
2820 def _constraints_to_string(self):
2821 constraints = []
2823 for name in "ids":
2824 min, max = self.constraints[name]
2825 if max == 0:
2826 continue
2828 con = ""
2830 if min > 0:
2831 con = "{}<=".format(min)
2833 con += name
2835 if max is not None:
2836 con += "<={}".format(max)
2838 constraints.append(con)
2840 cost = []
2841 for name in "ids":
2842 coeff = self.constraints["cost"][name]
2843 if coeff > 0:
2844 cost.append("{}{}".format(coeff, name))
2846 limit = self.constraints["cost"]["max"]
2847 if limit is not None and limit > 0:
2848 cost = "{}<={}".format("+".join(cost), limit)
2849 constraints.append(cost)
2851 return ",".join(constraints)
2853class Grapheme(RegexBase):
2854 def _compile(self, reverse, fuzzy):
2855 # Match at least 1 character until a grapheme boundary is reached. Note
2856 # that this is the same whether matching forwards or backwards.
2857 grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
2858 GraphemeBoundary()]))
2860 return grapheme_matcher.compile(reverse, fuzzy)
2862 def dump(self, indent, reverse):
2863 print("{}GRAPHEME".format(INDENT * indent))
2865 def max_width(self):
2866 return UNLIMITED
2868class GraphemeBoundary:
2869 def compile(self, reverse, fuzzy):
2870 return [(OP.GRAPHEME_BOUNDARY, 1)]
2872class GreedyRepeat(RegexBase):
2873 _opcode = OP.GREEDY_REPEAT
2874 _op_name = "GREEDY_REPEAT"
2876 def __init__(self, subpattern, min_count, max_count):
2877 RegexBase.__init__(self)
2878 self.subpattern = subpattern
2879 self.min_count = min_count
2880 self.max_count = max_count
2882 def fix_groups(self, pattern, reverse, fuzzy):
2883 self.subpattern.fix_groups(pattern, reverse, fuzzy)
2885 def optimise(self, info, reverse):
2886 subpattern = self.subpattern.optimise(info, reverse)
2888 return type(self)(subpattern, self.min_count, self.max_count)
2890 def pack_characters(self, info):
2891 self.subpattern = self.subpattern.pack_characters(info)
2892 return self
2894 def remove_captures(self):
2895 self.subpattern = self.subpattern.remove_captures()
2896 return self
2898 def is_atomic(self):
2899 return self.min_count == self.max_count and self.subpattern.is_atomic()
2901 def can_be_affix(self):
2902 return False
2904 def contains_group(self):
2905 return self.subpattern.contains_group()
2907 def get_firstset(self, reverse):
2908 fs = self.subpattern.get_firstset(reverse)
2909 if self.min_count == 0:
2910 fs.add(None)
2912 return fs
2914 def _compile(self, reverse, fuzzy):
2915 repeat = [self._opcode, self.min_count]
2916 if self.max_count is None:
2917 repeat.append(UNLIMITED)
2918 else:
2919 repeat.append(self.max_count)
2921 subpattern = self.subpattern.compile(reverse, fuzzy)
2922 if not subpattern:
2923 return []
2925 return ([tuple(repeat)] + subpattern + [(OP.END, )])
2927 def dump(self, indent, reverse):
2928 if self.max_count is None:
2929 limit = "INF"
2930 else:
2931 limit = self.max_count
2932 print("{}{} {} {}".format(INDENT * indent, self._op_name,
2933 self.min_count, limit))
2935 self.subpattern.dump(indent + 1, reverse)
2937 def is_empty(self):
2938 return self.subpattern.is_empty()
2940 def __eq__(self, other):
2941 return type(self) is type(other) and (self.subpattern, self.min_count,
2942 self.max_count) == (other.subpattern, other.min_count,
2943 other.max_count)
2945 def max_width(self):
2946 if self.max_count is None:
2947 return UNLIMITED
2949 return self.subpattern.max_width() * self.max_count
2951 def get_required_string(self, reverse):
2952 max_count = UNLIMITED if self.max_count is None else self.max_count
2953 if self.min_count == 0:
2954 w = self.subpattern.max_width() * max_count
2955 return min(w, UNLIMITED), None
2957 ofs, req = self.subpattern.get_required_string(reverse)
2958 if req:
2959 return ofs, req
2961 w = self.subpattern.max_width() * max_count
2962 return min(w, UNLIMITED), None
2964class PossessiveRepeat(GreedyRepeat):
2965 def is_atomic(self):
2966 return True
2968 def _compile(self, reverse, fuzzy):
2969 subpattern = self.subpattern.compile(reverse, fuzzy)
2970 if not subpattern:
2971 return []
2973 repeat = [self._opcode, self.min_count]
2974 if self.max_count is None:
2975 repeat.append(UNLIMITED)
2976 else:
2977 repeat.append(self.max_count)
2979 return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ),
2980 (OP.END, )])
2982 def dump(self, indent, reverse):
2983 print("{}ATOMIC".format(INDENT * indent))
2985 if self.max_count is None:
2986 limit = "INF"
2987 else:
2988 limit = self.max_count
2989 print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
2990 self.min_count, limit))
2992 self.subpattern.dump(indent + 2, reverse)
2994class Group(RegexBase):
2995 def __init__(self, info, group, subpattern):
2996 RegexBase.__init__(self)
2997 self.info = info
2998 self.group = group
2999 self.subpattern = subpattern
3001 self.call_ref = None
3003 def fix_groups(self, pattern, reverse, fuzzy):
3004 self.info.defined_groups[self.group] = (self, reverse, fuzzy)
3005 self.subpattern.fix_groups(pattern, reverse, fuzzy)
3007 def optimise(self, info, reverse):
3008 subpattern = self.subpattern.optimise(info, reverse)
3010 return Group(self.info, self.group, subpattern)
3012 def pack_characters(self, info):
3013 self.subpattern = self.subpattern.pack_characters(info)
3014 return self
3016 def remove_captures(self):
3017 return self.subpattern.remove_captures()
3019 def is_atomic(self):
3020 return self.subpattern.is_atomic()
3022 def can_be_affix(self):
3023 return False
3025 def contains_group(self):
3026 return True
3028 def get_firstset(self, reverse):
3029 return self.subpattern.get_firstset(reverse)
3031 def has_simple_start(self):
3032 return self.subpattern.has_simple_start()
3034 def _compile(self, reverse, fuzzy):
3035 code = []
3037 public_group = private_group = self.group
3038 if private_group < 0:
3039 public_group = self.info.private_groups[private_group]
3040 private_group = self.info.group_count - private_group
3042 key = self.group, reverse, fuzzy
3043 ref = self.info.call_refs.get(key)
3044 if ref is not None:
3045 code += [(OP.CALL_REF, ref)]
3047 code += [(OP.GROUP, int(not reverse), private_group, public_group)]
3048 code += self.subpattern.compile(reverse, fuzzy)
3049 code += [(OP.END, )]
3051 if ref is not None:
3052 code += [(OP.END, )]
3054 return code
3056 def dump(self, indent, reverse):
3057 group = self.group
3058 if group < 0:
3059 group = private_groups[group]
3060 print("{}GROUP {}".format(INDENT * indent, group))
3061 self.subpattern.dump(indent + 1, reverse)
3063 def __eq__(self, other):
3064 return (type(self) is type(other) and (self.group, self.subpattern) ==
3065 (other.group, other.subpattern))
3067 def max_width(self):
3068 return self.subpattern.max_width()
3070 def get_required_string(self, reverse):
3071 return self.subpattern.get_required_string(reverse)
3073 def __del__(self):
3074 self.info = None
3076class Keep(ZeroWidthBase):
3077 _opcode = OP.KEEP
3078 _op_name = "KEEP"
3080class LazyRepeat(GreedyRepeat):
3081 _opcode = OP.LAZY_REPEAT
3082 _op_name = "LAZY_REPEAT"
3084class LookAround(RegexBase):
3085 _dir_text = {False: "AHEAD", True: "BEHIND"}
3087 def __init__(self, behind, positive, subpattern):
3088 RegexBase.__init__(self)
3089 self.behind = bool(behind)
3090 self.positive = bool(positive)
3091 self.subpattern = subpattern
3093 def fix_groups(self, pattern, reverse, fuzzy):
3094 self.subpattern.fix_groups(pattern, self.behind, fuzzy)
3096 def optimise(self, info, reverse):
3097 subpattern = self.subpattern.optimise(info, self.behind)
3098 if self.positive and subpattern.is_empty():
3099 return subpattern
3101 return LookAround(self.behind, self.positive, subpattern)
3103 def pack_characters(self, info):
3104 self.subpattern = self.subpattern.pack_characters(info)
3105 return self
3107 def remove_captures(self):
3108 return self.subpattern.remove_captures()
3110 def is_atomic(self):
3111 return self.subpattern.is_atomic()
3113 def can_be_affix(self):
3114 return self.subpattern.can_be_affix()
3116 def contains_group(self):
3117 return self.subpattern.contains_group()
3119 def get_firstset(self, reverse):
3120 if self.positive and self.behind == reverse:
3121 return self.subpattern.get_firstset(reverse)
3123 return set([None])
3125 def _compile(self, reverse, fuzzy):
3126 flags = 0
3127 if self.positive:
3128 flags |= POSITIVE_OP
3129 if fuzzy:
3130 flags |= FUZZY_OP
3131 if reverse:
3132 flags |= REVERSE_OP
3134 return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
3135 self.subpattern.compile(self.behind) + [(OP.END, )])
3137 def dump(self, indent, reverse):
3138 print("{}LOOK{} {}".format(INDENT * indent,
3139 self._dir_text[self.behind], POS_TEXT[self.positive]))
3140 self.subpattern.dump(indent + 1, self.behind)
3142 def is_empty(self):
3143 return self.positive and self.subpattern.is_empty()
3145 def __eq__(self, other):
3146 return type(self) is type(other) and (self.behind, self.positive,
3147 self.subpattern) == (other.behind, other.positive, other.subpattern)
3149 def max_width(self):
3150 return 0
3152class LookAroundConditional(RegexBase):
3153 _dir_text = {False: "AHEAD", True: "BEHIND"}
3155 def __init__(self, behind, positive, subpattern, yes_item, no_item):
3156 RegexBase.__init__(self)
3157 self.behind = bool(behind)
3158 self.positive = bool(positive)
3159 self.subpattern = subpattern
3160 self.yes_item = yes_item
3161 self.no_item = no_item
3163 def fix_groups(self, pattern, reverse, fuzzy):
3164 self.subpattern.fix_groups(pattern, reverse, fuzzy)
3165 self.yes_item.fix_groups(pattern, reverse, fuzzy)
3166 self.no_item.fix_groups(pattern, reverse, fuzzy)
3168 def optimise(self, info, reverse):
3169 subpattern = self.subpattern.optimise(info, self.behind)
3170 yes_item = self.yes_item.optimise(info, self.behind)
3171 no_item = self.no_item.optimise(info, self.behind)
3173 return LookAroundConditional(self.behind, self.positive, subpattern,
3174 yes_item, no_item)
3176 def pack_characters(self, info):
3177 self.subpattern = self.subpattern.pack_characters(info)
3178 self.yes_item = self.yes_item.pack_characters(info)
3179 self.no_item = self.no_item.pack_characters(info)
3180 return self
3182 def remove_captures(self):
3183 self.subpattern = self.subpattern.remove_captures()
3184 self.yes_item = self.yes_item.remove_captures()
3185 self.no_item = self.no_item.remove_captures()
3187 def is_atomic(self):
3188 return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
3189 self.no_item.is_atomic())
3191 def can_be_affix(self):
3192 return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
3193 and self.no_item.can_be_affix())
3195 def contains_group(self):
3196 return (self.subpattern.contains_group() or
3197 self.yes_item.contains_group() or self.no_item.contains_group())
3199 def _compile(self, reverse, fuzzy):
3200 code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
3201 code.extend(self.subpattern.compile(self.behind, fuzzy))
3202 code.append((OP.NEXT, ))
3203 code.extend(self.yes_item.compile(reverse, fuzzy))
3204 add_code = self.no_item.compile(reverse, fuzzy)
3205 if add_code:
3206 code.append((OP.NEXT, ))
3207 code.extend(add_code)
3209 code.append((OP.END, ))
3211 return code
3213 def dump(self, indent, reverse):
3214 print("{}CONDITIONAL {} {}".format(INDENT * indent,
3215 self._dir_text[self.behind], POS_TEXT[self.positive]))
3216 self.subpattern.dump(indent + 1, self.behind)
3217 print("{}EITHER".format(INDENT * indent))
3218 self.yes_item.dump(indent + 1, reverse)
3219 if not self.no_item.is_empty():
3220 print("{}OR".format(INDENT * indent))
3221 self.no_item.dump(indent + 1, reverse)
3223 def is_empty(self):
3224 return (self.subpattern.is_empty() and self.yes_item.is_empty() or
3225 self.no_item.is_empty())
3227 def __eq__(self, other):
3228 return type(self) is type(other) and (self.subpattern, self.yes_item,
3229 self.no_item) == (other.subpattern, other.yes_item, other.no_item)
3231 def max_width(self):
3232 return max(self.yes_item.max_width(), self.no_item.max_width())
3234 def get_required_string(self, reverse):
3235 return self.max_width(), None
3237class PrecompiledCode(RegexBase):
3238 def __init__(self, code):
3239 self.code = code
3241 def _compile(self, reverse, fuzzy):
3242 return [tuple(self.code)]
3244class Property(RegexBase):
3245 _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
3246 OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
3247 OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
3248 OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
3249 True): OP.PROPERTY_IGN_REV}
3251 def __init__(self, value, positive=True, case_flags=NOCASE,
3252 zerowidth=False, encoding=0):
3253 RegexBase.__init__(self)
3254 self.value = value
3255 self.positive = bool(positive)
3256 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3257 self.zerowidth = bool(zerowidth)
3258 self.encoding = encoding
3260 self._key = (self.__class__, self.value, self.positive,
3261 self.case_flags, self.zerowidth)
3263 def rebuild(self, positive, case_flags, zerowidth):
3264 return Property(self.value, positive, case_flags, zerowidth,
3265 self.encoding)
3267 def optimise(self, info, reverse, in_set=False):
3268 return self
3270 def get_firstset(self, reverse):
3271 return set([self])
3273 def has_simple_start(self):
3274 return True
3276 def _compile(self, reverse, fuzzy):
3277 flags = 0
3278 if self.positive:
3279 flags |= POSITIVE_OP
3280 if self.zerowidth:
3281 flags |= ZEROWIDTH_OP
3282 if fuzzy:
3283 flags |= FUZZY_OP
3284 flags |= self.encoding << ENCODING_OP_SHIFT
3285 return [(self._opcode[self.case_flags, reverse], flags, self.value)]
3287 def dump(self, indent, reverse):
3288 prop = PROPERTY_NAMES[self.value >> 16]
3289 name, value = prop[0], prop[1][self.value & 0xFFFF]
3290 print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
3291 POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
3292 ["", " ASCII"][self.encoding]))
3294 def matches(self, ch):
3295 return _regex.has_property_value(self.value, ch) == self.positive
3297 def max_width(self):
3298 return 1
3300class Prune(ZeroWidthBase):
3301 _op_name = "PRUNE"
3303 def _compile(self, reverse, fuzzy):
3304 return [(OP.PRUNE, )]
3306class Range(RegexBase):
3307 _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
3308 (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
3309 (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
3310 (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
3311 _op_name = "RANGE"
3313 def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
3314 zerowidth=False):
3315 RegexBase.__init__(self)
3316 self.lower = lower
3317 self.upper = upper
3318 self.positive = bool(positive)
3319 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3320 self.zerowidth = bool(zerowidth)
3322 self._key = (self.__class__, self.lower, self.upper, self.positive,
3323 self.case_flags, self.zerowidth)
3325 def rebuild(self, positive, case_flags, zerowidth):
3326 return Range(self.lower, self.upper, positive, case_flags, zerowidth)
3328 def optimise(self, info, reverse, in_set=False):
3329 # Is the range case-sensitive?
3330 if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
3331 return self
3333 # Is full case-folding possible?
3334 if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
3335 FULLIGNORECASE):
3336 return self
3338 # Get the characters which expand to multiple codepoints on folding.
3339 expanding_chars = _regex.get_expand_on_folding()
3341 # Get the folded characters in the range.
3342 items = []
3343 for ch in expanding_chars:
3344 if self.lower <= ord(ch) <= self.upper:
3345 folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
3346 items.append(String([ord(c) for c in folded],
3347 case_flags=self.case_flags))
3349 if not items:
3350 # We can fall back to simple case-folding.
3351 return self
3353 if len(items) < self.upper - self.lower + 1:
3354 # Not all the characters are covered by the full case-folding.
3355 items.insert(0, self)
3357 return Branch(items)
3359 def _compile(self, reverse, fuzzy):
3360 flags = 0
3361 if self.positive:
3362 flags |= POSITIVE_OP
3363 if self.zerowidth:
3364 flags |= ZEROWIDTH_OP
3365 if fuzzy:
3366 flags |= FUZZY_OP
3367 return [(self._opcode[self.case_flags, reverse], flags, self.lower,
3368 self.upper)]
3370 def dump(self, indent, reverse):
3371 display_lower = ascii(chr(self.lower)).lstrip("bu")
3372 display_upper = ascii(chr(self.upper)).lstrip("bu")
3373 print("{}RANGE {} {} {}{}".format(INDENT * indent,
3374 POS_TEXT[self.positive], display_lower, display_upper,
3375 CASE_TEXT[self.case_flags]))
3377 def matches(self, ch):
3378 return (self.lower <= ch <= self.upper) == self.positive
3380 def max_width(self):
3381 return 1
3383class RefGroup(RegexBase):
3384 _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
3385 OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
3386 False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
3387 True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
3388 (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}
3390 def __init__(self, info, group, position, case_flags=NOCASE):
3391 RegexBase.__init__(self)
3392 self.info = info
3393 self.group = group
3394 self.position = position
3395 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3397 self._key = self.__class__, self.group, self.case_flags
3399 def fix_groups(self, pattern, reverse, fuzzy):
3400 try:
3401 self.group = int(self.group)
3402 except ValueError:
3403 try:
3404 self.group = self.info.group_index[self.group]
3405 except KeyError:
3406 raise error("unknown group", pattern, self.position)
3408 if not 1 <= self.group <= self.info.group_count:
3409 raise error("invalid group reference", pattern, self.position)
3411 self._key = self.__class__, self.group, self.case_flags
3413 def remove_captures(self):
3414 raise error("group reference not allowed", pattern, self.position)
3416 def _compile(self, reverse, fuzzy):
3417 flags = 0
3418 if fuzzy:
3419 flags |= FUZZY_OP
3420 return [(self._opcode[self.case_flags, reverse], flags, self.group)]
3422 def dump(self, indent, reverse):
3423 print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
3424 CASE_TEXT[self.case_flags]))
3426 def max_width(self):
3427 return UNLIMITED
3429 def __del__(self):
3430 self.info = None
3432class SearchAnchor(ZeroWidthBase):
3433 _opcode = OP.SEARCH_ANCHOR
3434 _op_name = "SEARCH_ANCHOR"
3436class Sequence(RegexBase):
3437 def __init__(self, items=None):
3438 RegexBase.__init__(self)
3439 if items is None:
3440 items = []
3442 self.items = items
3444 def fix_groups(self, pattern, reverse, fuzzy):
3445 for s in self.items:
3446 s.fix_groups(pattern, reverse, fuzzy)
3448 def optimise(self, info, reverse):
3449 # Flatten the sequences.
3450 items = []
3451 for s in self.items:
3452 s = s.optimise(info, reverse)
3453 if isinstance(s, Sequence):
3454 items.extend(s.items)
3455 else:
3456 items.append(s)
3458 return make_sequence(items)
3460 def pack_characters(self, info):
3461 "Packs sequences of characters into strings."
3462 items = []
3463 characters = []
3464 case_flags = NOCASE
3465 for s in self.items:
3466 if type(s) is Character and s.positive and not s.zerowidth:
3467 if s.case_flags != case_flags:
3468 # Different case sensitivity, so flush, unless neither the
3469 # previous nor the new character are cased.
3470 if s.case_flags or is_cased_i(info, s.value):
3471 Sequence._flush_characters(info, characters,
3472 case_flags, items)
3474 case_flags = s.case_flags
3476 characters.append(s.value)
3477 elif type(s) is String or type(s) is Literal:
3478 if s.case_flags != case_flags:
3479 # Different case sensitivity, so flush, unless the neither
3480 # the previous nor the new string are cased.
3481 if s.case_flags or any(is_cased_i(info, c) for c in
3482 characters):
3483 Sequence._flush_characters(info, characters,
3484 case_flags, items)
3486 case_flags = s.case_flags
3488 characters.extend(s.characters)
3489 else:
3490 Sequence._flush_characters(info, characters, case_flags, items)
3492 items.append(s.pack_characters(info))
3494 Sequence._flush_characters(info, characters, case_flags, items)
3496 return make_sequence(items)
3498 def remove_captures(self):
3499 self.items = [s.remove_captures() for s in self.items]
3500 return self
3502 def is_atomic(self):
3503 return all(s.is_atomic() for s in self.items)
3505 def can_be_affix(self):
3506 return False
3508 def contains_group(self):
3509 return any(s.contains_group() for s in self.items)
3511 def get_firstset(self, reverse):
3512 fs = set()
3513 items = self.items
3514 if reverse:
3515 items.reverse()
3516 for s in items:
3517 fs |= s.get_firstset(reverse)
3518 if None not in fs:
3519 return fs
3520 fs.discard(None)
3522 return fs | set([None])
3524 def has_simple_start(self):
3525 return bool(self.items) and self.items[0].has_simple_start()
3527 def _compile(self, reverse, fuzzy):
3528 seq = self.items
3529 if reverse:
3530 seq = seq[::-1]
3532 code = []
3533 for s in seq:
3534 code.extend(s.compile(reverse, fuzzy))
3536 return code
3538 def dump(self, indent, reverse):
3539 for s in self.items:
3540 s.dump(indent, reverse)
3542 @staticmethod
3543 def _flush_characters(info, characters, case_flags, items):
3544 if not characters:
3545 return
3547 # Disregard case_flags if all of the characters are case-less.
3548 if case_flags & IGNORECASE:
3549 if not any(is_cased_i(info, c) for c in characters):
3550 case_flags = NOCASE
3552 if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
3553 literals = Sequence._fix_full_casefold(characters)
3555 for item in literals:
3556 chars = item.characters
3558 if len(chars) == 1:
3559 items.append(Character(chars[0], case_flags=item.case_flags))
3560 else:
3561 items.append(String(chars, case_flags=item.case_flags))
3562 else:
3563 if len(characters) == 1:
3564 items.append(Character(characters[0], case_flags=case_flags))
3565 else:
3566 items.append(String(characters, case_flags=case_flags))
3568 characters[:] = []
3570 @staticmethod
3571 def _fix_full_casefold(characters):
3572 # Split a literal needing full case-folding into chunks that need it
3573 # and chunks that can use simple case-folding, which is faster.
3574 expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
3575 _regex.get_expand_on_folding()]
3576 string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
3577 for c in characters)).lower()
3578 chunks = []
3580 for e in expanded:
3581 found = string.find(e)
3583 while found >= 0:
3584 chunks.append((found, found + len(e)))
3585 found = string.find(e, found + 1)
3587 pos = 0
3588 literals = []
3590 for start, end in Sequence._merge_chunks(chunks):
3591 if pos < start:
3592 literals.append(Literal(characters[pos : start],
3593 case_flags=IGNORECASE))
3595 literals.append(Literal(characters[start : end],
3596 case_flags=FULLIGNORECASE))
3597 pos = end
3599 if pos < len(characters):
3600 literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))
3602 return literals
3604 @staticmethod
3605 def _merge_chunks(chunks):
3606 if len(chunks) < 2:
3607 return chunks
3609 chunks.sort()
3611 start, end = chunks[0]
3612 new_chunks = []
3614 for s, e in chunks[1 : ]:
3615 if s <= end:
3616 end = max(end, e)
3617 else:
3618 new_chunks.append((start, end))
3619 start, end = s, e
3621 new_chunks.append((start, end))
3623 return new_chunks
3625 def is_empty(self):
3626 return all(i.is_empty() for i in self.items)
3628 def __eq__(self, other):
3629 return type(self) is type(other) and self.items == other.items
3631 def max_width(self):
3632 return sum(s.max_width() for s in self.items)
3634 def get_required_string(self, reverse):
3635 seq = self.items
3636 if reverse:
3637 seq = seq[::-1]
3639 offset = 0
3641 for s in seq:
3642 ofs, req = s.get_required_string(reverse)
3643 offset += ofs
3644 if req:
3645 return offset, req
3647 return offset, None
3649class SetBase(RegexBase):
3650 def __init__(self, info, items, positive=True, case_flags=NOCASE,
3651 zerowidth=False):
3652 RegexBase.__init__(self)
3653 self.info = info
3654 self.items = tuple(items)
3655 self.positive = bool(positive)
3656 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3657 self.zerowidth = bool(zerowidth)
3659 self.char_width = 1
3661 self._key = (self.__class__, self.items, self.positive,
3662 self.case_flags, self.zerowidth)
3664 def rebuild(self, positive, case_flags, zerowidth):
3665 return type(self)(self.info, self.items, positive, case_flags,
3666 zerowidth).optimise(self.info, False)
3668 def get_firstset(self, reverse):
3669 return set([self])
3671 def has_simple_start(self):
3672 return True
3674 def _compile(self, reverse, fuzzy):
3675 flags = 0
3676 if self.positive:
3677 flags |= POSITIVE_OP
3678 if self.zerowidth:
3679 flags |= ZEROWIDTH_OP
3680 if fuzzy:
3681 flags |= FUZZY_OP
3682 code = [(self._opcode[self.case_flags, reverse], flags)]
3683 for m in self.items:
3684 code.extend(m.compile())
3686 code.append((OP.END, ))
3688 return code
3690 def dump(self, indent, reverse):
3691 print("{}{} {}{}".format(INDENT * indent, self._op_name,
3692 POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
3693 for i in self.items:
3694 i.dump(indent + 1, reverse)
3696 def _handle_case_folding(self, info, in_set):
3697 # Is the set case-sensitive?
3698 if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
3699 return self
3701 # Is full case-folding possible?
3702 if (not (self.info.flags & UNICODE) or (self.case_flags &
3703 FULLIGNORECASE) != FULLIGNORECASE):
3704 return self
3706 # Get the characters which expand to multiple codepoints on folding.
3707 expanding_chars = _regex.get_expand_on_folding()
3709 # Get the folded characters in the set.
3710 items = []
3711 seen = set()
3712 for ch in expanding_chars:
3713 if self.matches(ord(ch)):
3714 folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
3715 if folded not in seen:
3716 items.append(String([ord(c) for c in folded],
3717 case_flags=self.case_flags))
3718 seen.add(folded)
3720 if not items:
3721 # We can fall back to simple case-folding.
3722 return self
3724 return Branch([self] + items)
3726 def max_width(self):
3727 # Is the set case-sensitive?
3728 if not self.positive or not (self.case_flags & IGNORECASE):
3729 return 1
3731 # Is full case-folding possible?
3732 if (not (self.info.flags & UNICODE) or (self.case_flags &
3733 FULLIGNORECASE) != FULLIGNORECASE):
3734 return 1
3736 # Get the characters which expand to multiple codepoints on folding.
3737 expanding_chars = _regex.get_expand_on_folding()
3739 # Get the folded characters in the set.
3740 seen = set()
3741 for ch in expanding_chars:
3742 if self.matches(ord(ch)):
3743 folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
3744 seen.add(folded)
3746 if not seen:
3747 return 1
3749 return max(len(folded) for folded in seen)
3751 def __del__(self):
3752 self.info = None
3754class SetDiff(SetBase):
3755 _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
3756 OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
3757 OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
3758 OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
3759 True): OP.SET_DIFF_IGN_REV}
3760 _op_name = "SET_DIFF"
3762 def optimise(self, info, reverse, in_set=False):
3763 items = self.items
3764 if len(items) > 2:
3765 items = [items[0], SetUnion(info, items[1 : ])]
3767 if len(items) == 1:
3768 return items[0].with_flags(case_flags=self.case_flags,
3769 zerowidth=self.zerowidth).optimise(info, reverse, in_set)
3771 self.items = tuple(m.optimise(info, reverse, in_set=True) for m in
3772 items)
3774 return self._handle_case_folding(info, in_set)
3776 def matches(self, ch):
3777 m = self.items[0].matches(ch) and not self.items[1].matches(ch)
3778 return m == self.positive
3780class SetInter(SetBase):
3781 _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
3782 OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
3783 False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
3784 True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
3785 (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
3786 _op_name = "SET_INTER"
3788 def optimise(self, info, reverse, in_set=False):
3789 items = []
3790 for m in self.items:
3791 m = m.optimise(info, reverse, in_set=True)
3792 if isinstance(m, SetInter) and m.positive:
3793 # Intersection in intersection.
3794 items.extend(m.items)
3795 else:
3796 items.append(m)
3798 if len(items) == 1:
3799 return items[0].with_flags(case_flags=self.case_flags,
3800 zerowidth=self.zerowidth).optimise(info, reverse, in_set)
3802 self.items = tuple(items)
3804 return self._handle_case_folding(info, in_set)
3806 def matches(self, ch):
3807 m = all(i.matches(ch) for i in self.items)
3808 return m == self.positive
3810class SetSymDiff(SetBase):
3811 _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
3812 OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
3813 False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
3814 (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
3815 OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
3816 _op_name = "SET_SYM_DIFF"
3818 def optimise(self, info, reverse, in_set=False):
3819 items = []
3820 for m in self.items:
3821 m = m.optimise(info, reverse, in_set=True)
3822 if isinstance(m, SetSymDiff) and m.positive:
3823 # Symmetric difference in symmetric difference.
3824 items.extend(m.items)
3825 else:
3826 items.append(m)
3828 if len(items) == 1:
3829 return items[0].with_flags(case_flags=self.case_flags,
3830 zerowidth=self.zerowidth).optimise(info, reverse, in_set)
3832 self.items = tuple(items)
3834 return self._handle_case_folding(info, in_set)
3836 def matches(self, ch):
3837 m = False
3838 for i in self.items:
3839 m = m != i.matches(ch)
3841 return m == self.positive
3843class SetUnion(SetBase):
3844 _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
3845 OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
3846 False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
3847 True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
3848 (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
3849 _op_name = "SET_UNION"
3851 def optimise(self, info, reverse, in_set=False):
3852 items = []
3853 for m in self.items:
3854 m = m.optimise(info, reverse, in_set=True)
3855 if isinstance(m, SetUnion) and m.positive:
3856 # Union in union.
3857 items.extend(m.items)
3858 elif isinstance(m, AnyAll):
3859 return AnyAll()
3860 else:
3861 items.append(m)
3863 # Are there complementary properties?
3864 properties = (set(), set())
3866 for m in items:
3867 if isinstance(m, Property):
3868 properties[m.positive].add((m.value, m.case_flags, m.zerowidth))
3870 if properties[0] & properties[1]:
3871 return AnyAll()
3873 if len(items) == 1:
3874 i = items[0]
3875 return i.with_flags(positive=i.positive == self.positive,
3876 case_flags=self.case_flags,
3877 zerowidth=self.zerowidth).optimise(info, reverse, in_set)
3879 self.items = tuple(items)
3881 return self._handle_case_folding(info, in_set)
3883 def _compile(self, reverse, fuzzy):
3884 flags = 0
3885 if self.positive:
3886 flags |= POSITIVE_OP
3887 if self.zerowidth:
3888 flags |= ZEROWIDTH_OP
3889 if fuzzy:
3890 flags |= FUZZY_OP
3892 characters, others = defaultdict(list), []
3893 for m in self.items:
3894 if isinstance(m, Character):
3895 characters[m.positive].append(m.value)
3896 else:
3897 others.append(m)
3899 code = [(self._opcode[self.case_flags, reverse], flags)]
3901 for positive, values in characters.items():
3902 flags = 0
3903 if positive:
3904 flags |= POSITIVE_OP
3905 if len(values) == 1:
3906 code.append((OP.CHARACTER, flags, values[0]))
3907 else:
3908 code.append((OP.STRING, flags, len(values)) + tuple(values))
3910 for m in others:
3911 code.extend(m.compile())
3913 code.append((OP.END, ))
3915 return code
3917 def matches(self, ch):
3918 m = any(i.matches(ch) for i in self.items)
3919 return m == self.positive
3921class Skip(ZeroWidthBase):
3922 _op_name = "SKIP"
3923 _opcode = OP.SKIP
3925class StartOfLine(ZeroWidthBase):
3926 _opcode = OP.START_OF_LINE
3927 _op_name = "START_OF_LINE"
3929class StartOfLineU(StartOfLine):
3930 _opcode = OP.START_OF_LINE_U
3931 _op_name = "START_OF_LINE_U"
3933class StartOfString(ZeroWidthBase):
3934 _opcode = OP.START_OF_STRING
3935 _op_name = "START_OF_STRING"
3937class StartOfWord(ZeroWidthBase):
3938 _opcode = OP.START_OF_WORD
3939 _op_name = "START_OF_WORD"
3941class String(RegexBase):
3942 _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
3943 (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
3944 (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
3945 (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
3946 OP.STRING_FLD_REV}
3948 def __init__(self, characters, case_flags=NOCASE):
3949 self.characters = tuple(characters)
3950 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3952 if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
3953 folded_characters = []
3954 for char in self.characters:
3955 folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
3956 folded_characters.extend(ord(c) for c in folded)
3957 else:
3958 folded_characters = self.characters
3960 self.folded_characters = tuple(folded_characters)
3961 self.required = False
3963 self._key = self.__class__, self.characters, self.case_flags
3965 def get_firstset(self, reverse):
3966 if reverse:
3967 pos = -1
3968 else:
3969 pos = 0
3970 return set([Character(self.characters[pos],
3971 case_flags=self.case_flags)])
3973 def has_simple_start(self):
3974 return True
3976 def _compile(self, reverse, fuzzy):
3977 flags = 0
3978 if fuzzy:
3979 flags |= FUZZY_OP
3980 if self.required:
3981 flags |= REQUIRED_OP
3982 return [(self._opcode[self.case_flags, reverse], flags,
3983 len(self.folded_characters)) + self.folded_characters]
3985 def dump(self, indent, reverse):
3986 display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
3987 print("{}STRING {}{}".format(INDENT * indent, display,
3988 CASE_TEXT[self.case_flags]))
3990 def max_width(self):
3991 return len(self.folded_characters)
3993 def get_required_string(self, reverse):
3994 return 0, self
3996class Literal(String):
3997 def dump(self, indent, reverse):
3998 literal = ''.join(chr(c) for c in self.characters)
3999 display = ascii(literal).lstrip("bu")
4000 print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,
4001 CASE_TEXT[self.case_flags]))
4003class StringSet(Branch):
4004 def __init__(self, info, name, case_flags=NOCASE):
4005 self.info = info
4006 self.name = name
4007 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
4009 self._key = self.__class__, self.name, self.case_flags
4011 self.set_key = (name, self.case_flags)
4012 if self.set_key not in info.named_lists_used:
4013 info.named_lists_used[self.set_key] = len(info.named_lists_used)
4015 index = self.info.named_lists_used[self.set_key]
4016 items = self.info.kwargs[self.name]
4018 case_flags = self.case_flags
4020 encoding = self.info.flags & _ALL_ENCODINGS
4021 fold_flags = encoding | case_flags
4023 choices = []
4025 for string in items:
4026 if isinstance(string, str):
4027 string = [ord(c) for c in string]
4029 choices.append([Character(c, case_flags=case_flags) for c in
4030 string])
4032 # Sort from longest to shortest.
4033 choices.sort(key=len, reverse=True)
4035 self.branches = [Sequence(choice) for choice in choices]
4037 def dump(self, indent, reverse):
4038 print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
4039 CASE_TEXT[self.case_flags]))
4041 def __del__(self):
4042 self.info = None
4044class Source:
4045 "Scanner for the regular expression source string."
4046 def __init__(self, string):
4047 if isinstance(string, str):
4048 self.string = string
4049 self.char_type = chr
4050 else:
4051 self.string = string.decode("latin-1")
4052 self.char_type = lambda c: bytes([c])
4054 self.pos = 0
4055 self.ignore_space = False
4056 self.sep = string[ : 0]
4058 def get(self, override_ignore=False):
4059 string = self.string
4060 pos = self.pos
4062 try:
4063 if self.ignore_space and not override_ignore:
4064 while True:
4065 if string[pos].isspace():
4066 # Skip over the whitespace.
4067 pos += 1
4068 elif string[pos] == "#":
4069 # Skip over the comment to the end of the line.
4070 pos = string.index("\n", pos)
4071 else:
4072 break
4074 ch = string[pos]
4075 self.pos = pos + 1
4076 return ch
4077 except IndexError:
4078 # We've reached the end of the string.
4079 self.pos = pos
4080 return string[ : 0]
4081 except ValueError:
4082 # The comment extended to the end of the string.
4083 self.pos = len(string)
4084 return string[ : 0]
4086 def get_many(self, count=1):
4087 string = self.string
4088 pos = self.pos
4090 try:
4091 if self.ignore_space:
4092 substring = []
4094 while len(substring) < count:
4095 while True:
4096 if string[pos].isspace():
4097 # Skip over the whitespace.
4098 pos += 1
4099 elif string[pos] == "#":
4100 # Skip over the comment to the end of the line.
4101 pos = string.index("\n", pos)
4102 else:
4103 break
4105 substring.append(string[pos])
4106 pos += 1
4108 substring = "".join(substring)
4109 else:
4110 substring = string[pos : pos + count]
4111 pos += len(substring)
4113 self.pos = pos
4114 return substring
4115 except IndexError:
4116 # We've reached the end of the string.
4117 self.pos = len(string)
4118 return "".join(substring)
4119 except ValueError:
4120 # The comment extended to the end of the string.
4121 self.pos = len(string)
4122 return "".join(substring)
4124 def get_while(self, test_set, include=True, keep_spaces=False):
4125 string = self.string
4126 pos = self.pos
4128 if self.ignore_space and not keep_spaces:
4129 try:
4130 substring = []
4132 while True:
4133 if string[pos].isspace():
4134 # Skip over the whitespace.
4135 pos += 1
4136 elif string[pos] == "#":
4137 # Skip over the comment to the end of the line.
4138 pos = string.index("\n", pos)
4139 elif (string[pos] in test_set) == include:
4140 substring.append(string[pos])
4141 pos += 1
4142 else:
4143 break
4145 self.pos = pos
4146 except IndexError:
4147 # We've reached the end of the string.
4148 self.pos = len(string)
4149 except ValueError:
4150 # The comment extended to the end of the string.
4151 self.pos = len(string)
4153 return "".join(substring)
4154 else:
4155 try:
4156 while (string[pos] in test_set) == include:
4157 pos += 1
4159 substring = string[self.pos : pos]
4161 self.pos = pos
4163 return substring
4164 except IndexError:
4165 # We've reached the end of the string.
4166 substring = string[self.pos : pos]
4168 self.pos = pos
4170 return substring
4172 def skip_while(self, test_set, include=True):
4173 string = self.string
4174 pos = self.pos
4176 try:
4177 if self.ignore_space:
4178 while True:
4179 if string[pos].isspace():
4180 # Skip over the whitespace.
4181 pos += 1
4182 elif string[pos] == "#":
4183 # Skip over the comment to the end of the line.
4184 pos = string.index("\n", pos)
4185 elif (string[pos] in test_set) == include:
4186 pos += 1
4187 else:
4188 break
4189 else:
4190 while (string[pos] in test_set) == include:
4191 pos += 1
4193 self.pos = pos
4194 except IndexError:
4195 # We've reached the end of the string.
4196 self.pos = len(string)
4197 except ValueError:
4198 # The comment extended to the end of the string.
4199 self.pos = len(string)
4201 def match(self, substring):
4202 string = self.string
4203 pos = self.pos
4205 if self.ignore_space:
4206 try:
4207 for c in substring:
4208 while True:
4209 if string[pos].isspace():
4210 # Skip over the whitespace.
4211 pos += 1
4212 elif string[pos] == "#":
4213 # Skip over the comment to the end of the line.
4214 pos = string.index("\n", pos)
4215 else:
4216 break
4218 if string[pos] != c:
4219 return False
4221 pos += 1
4223 self.pos = pos
4225 return True
4226 except IndexError:
4227 # We've reached the end of the string.
4228 return False
4229 except ValueError:
4230 # The comment extended to the end of the string.
4231 return False
4232 else:
4233 if not string.startswith(substring, pos):
4234 return False
4236 self.pos = pos + len(substring)
4238 return True
4240 def expect(self, substring):
4241 if not self.match(substring):
4242 raise error("missing {}".format(substring), self.string, self.pos)
4244 def at_end(self):
4245 string = self.string
4246 pos = self.pos
4248 try:
4249 if self.ignore_space:
4250 while True:
4251 if string[pos].isspace():
4252 pos += 1
4253 elif string[pos] == "#":
4254 pos = string.index("\n", pos)
4255 else:
4256 break
4258 return pos >= len(string)
4259 except IndexError:
4260 # We've reached the end of the string.
4261 return True
4262 except ValueError:
4263 # The comment extended to the end of the string.
4264 return True
4266class Info:
4267 "Info about the regular expression."
4269 def __init__(self, flags=0, char_type=None, kwargs={}):
4270 flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
4271 self.flags = flags
4272 self.global_flags = flags
4273 self.inline_locale = False
4275 self.kwargs = kwargs
4277 self.group_count = 0
4278 self.group_index = {}
4279 self.group_name = {}
4280 self.char_type = char_type
4281 self.named_lists_used = {}
4282 self.open_groups = []
4283 self.open_group_count = {}
4284 self.defined_groups = {}
4285 self.group_calls = []
4286 self.private_groups = {}
4288 def open_group(self, name=None):
4289 group = self.group_index.get(name)
4290 if group is None:
4291 while True:
4292 self.group_count += 1
4293 if name is None or self.group_count not in self.group_name:
4294 break
4296 group = self.group_count
4297 if name:
4298 self.group_index[name] = group
4299 self.group_name[group] = name
4301 if group in self.open_groups:
4302 # We have a nested named group. We'll assign it a private group
4303 # number, initially negative until we can assign a proper
4304 # (positive) number.
4305 group_alias = -(len(self.private_groups) + 1)
4306 self.private_groups[group_alias] = group
4307 group = group_alias
4309 self.open_groups.append(group)
4310 self.open_group_count[group] = self.open_group_count.get(group, 0) + 1
4312 return group
4314 def close_group(self):
4315 self.open_groups.pop()
4317 def is_open_group(self, name):
4318 # In version 1, a group reference can refer to an open group. We'll
4319 # just pretend the group isn't open.
4320 version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
4321 if version == VERSION1:
4322 return False
4324 if name.isdigit():
4325 group = int(name)
4326 else:
4327 group = self.group_index.get(name)
4329 return group in self.open_groups
4331def _check_group_features(info, parsed):
4332 """Checks whether the reverse and fuzzy features of the group calls match
4333 the groups which they call.
4334 """
4335 call_refs = {}
4336 additional_groups = []
4337 for call, reverse, fuzzy in info.group_calls:
4338 # Look up the reference of this group call.
4339 key = (call.group, reverse, fuzzy)
4340 ref = call_refs.get(key)
4341 if ref is None:
4342 # This group doesn't have a reference yet, so look up its features.
4343 if call.group == 0:
4344 # Calling the pattern as a whole.
4345 rev = bool(info.flags & REVERSE)
4346 fuz = isinstance(parsed, Fuzzy)
4347 if (rev, fuz) != (reverse, fuzzy):
4348 # The pattern as a whole doesn't have the features we want,
4349 # so we'll need to make a copy of it with the desired
4350 # features.
4351 additional_groups.append((CallRef(len(call_refs), parsed),
4352 reverse, fuzzy))
4353 else:
4354 # Calling a capture group.
4355 def_info = info.defined_groups[call.group]
4356 group = def_info[0]
4357 if def_info[1 : ] != (reverse, fuzzy):
4358 # The group doesn't have the features we want, so we'll
4359 # need to make a copy of it with the desired features.
4360 additional_groups.append((group, reverse, fuzzy))
4362 ref = len(call_refs)
4363 call_refs[key] = ref
4365 call.call_ref = ref
4367 info.call_refs = call_refs
4368 info.additional_groups = additional_groups
4370def _get_required_string(parsed, flags):
4371 "Gets the required string and related info of a parsed pattern."
4373 req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
4374 if required:
4375 required.required = True
4376 if req_offset >= UNLIMITED:
4377 req_offset = -1
4379 req_flags = required.case_flags
4380 if not (flags & UNICODE):
4381 req_flags &= ~UNICODE
4383 req_chars = required.folded_characters
4384 else:
4385 req_offset = 0
4386 req_chars = ()
4387 req_flags = 0
4389 return req_offset, req_chars, req_flags
4391class Scanner:
4392 def __init__(self, lexicon, flags=0):
4393 self.lexicon = lexicon
4395 # Combine phrases into a compound pattern.
4396 patterns = []
4397 for phrase, action in lexicon:
4398 # Parse the regular expression.
4399 source = Source(phrase)
4400 info = Info(flags, source.char_type)
4401 source.ignore_space = bool(info.flags & VERBOSE)
4402 parsed = _parse_pattern(source, info)
4403 if not source.at_end():
4404 raise error("unbalanced parenthesis", source.string,
4405 source.pos)
4407 # We want to forbid capture groups within each phrase.
4408 patterns.append(parsed.remove_captures())
4410 # Combine all the subpatterns into one pattern.
4411 info = Info(flags)
4412 patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
4413 parsed = Branch(patterns)
4415 # Optimise the compound pattern.
4416 reverse = bool(info.flags & REVERSE)
4417 parsed = parsed.optimise(info, reverse)
4418 parsed = parsed.pack_characters(info)
4420 # Get the required string.
4421 req_offset, req_chars, req_flags = _get_required_string(parsed,
4422 info.flags)
4424 # Check the features of the groups.
4425 _check_group_features(info, parsed)
4427 # Complain if there are any group calls. They are not supported by the
4428 # Scanner class.
4429 if info.call_refs:
4430 raise error("recursive regex not supported by Scanner",
4431 source.string, source.pos)
4433 reverse = bool(info.flags & REVERSE)
4435 # Compile the compound pattern. The result is a list of tuples.
4436 code = parsed.compile(reverse) + [(OP.SUCCESS, )]
4438 # Flatten the code into a list of ints.
4439 code = _flatten_code(code)
4441 if not parsed.has_simple_start():
4442 # Get the first set, if possible.
4443 try:
4444 fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
4445 fs_code = _flatten_code(fs_code)
4446 code = fs_code + code
4447 except _FirstSetError:
4448 pass
4450 # Check the global flags for conflicts.
4451 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
4452 if version not in (0, VERSION0, VERSION1):
4453 raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")
4455 # Create the PatternObject.
4456 #
4457 # Local flags like IGNORECASE affect the code generation, but aren't
4458 # needed by the PatternObject itself. Conversely, global flags like
4459 # LOCALE _don't_ affect the code generation but _are_ needed by the
4460 # PatternObject.
4461 self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
4462 code, {}, {}, {}, [], req_offset, req_chars, req_flags,
4463 len(patterns))
4465 def scan(self, string):
4466 result = []
4467 append = result.append
4468 match = self.scanner.scanner(string).match
4469 i = 0
4470 while True:
4471 m = match()
4472 if not m:
4473 break
4474 j = m.end()
4475 if i == j:
4476 break
4477 action = self.lexicon[m.lastindex - 1][1]
4478 if hasattr(action, '__call__'):
4479 self.match = m
4480 action = action(self, m.group())
4481 if action is not None:
4482 append(action)
4483 i = j
4485 return result, string[i : ]
4487# Get the known properties dict.
4488PROPERTIES = _regex.get_properties()
4490# Build the inverse of the properties dict.
4491PROPERTY_NAMES = {}
4492for prop_name, (prop_id, values) in PROPERTIES.items():
4493 name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {}))
4494 name = max(name, prop_name, key=len)
4495 PROPERTY_NAMES[prop_id] = name, prop_values
4497 for val_name, val_id in values.items():
4498 prop_values[val_id] = max(prop_values.get(val_id, ""), val_name,
4499 key=len)
4501# Character escape sequences.
4502CHARACTER_ESCAPES = {
4503 "a": "\a",
4504 "b": "\b",
4505 "f": "\f",
4506 "n": "\n",
4507 "r": "\r",
4508 "t": "\t",
4509 "v": "\v",
4510}
4512ASCII_ENCODING = 1
4513UNICODE_ENCODING = 2
4515# Predefined character set escape sequences.
4516CHARSET_ESCAPES = {
4517 "d": lookup_property(None, "Digit", True),
4518 "D": lookup_property(None, "Digit", False),
4519 "h": lookup_property(None, "Blank", True),
4520 "s": lookup_property(None, "Space", True),
4521 "S": lookup_property(None, "Space", False),
4522 "w": lookup_property(None, "Word", True),
4523 "W": lookup_property(None, "Word", False),
4524}
4526ASCII_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
4527ASCII_CHARSET_ESCAPES.update({
4528 "d": lookup_property(None, "Digit", True, encoding=ASCII_ENCODING),
4529 "D": lookup_property(None, "Digit", False, encoding=ASCII_ENCODING),
4530 "s": lookup_property(None, "Space", True, encoding=ASCII_ENCODING),
4531 "S": lookup_property(None, "Space", False, encoding=ASCII_ENCODING),
4532 "w": lookup_property(None, "Word", True, encoding=ASCII_ENCODING),
4533 "W": lookup_property(None, "Word", False, encoding=ASCII_ENCODING),
4534})
4535UNICODE_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
4536UNICODE_CHARSET_ESCAPES.update({
4537 "d": lookup_property(None, "Digit", True, encoding=UNICODE_ENCODING),
4538 "D": lookup_property(None, "Digit", False, encoding=UNICODE_ENCODING),
4539 "s": lookup_property(None, "Space", True, encoding=UNICODE_ENCODING),
4540 "S": lookup_property(None, "Space", False, encoding=UNICODE_ENCODING),
4541 "w": lookup_property(None, "Word", True, encoding=UNICODE_ENCODING),
4542 "W": lookup_property(None, "Word", False, encoding=UNICODE_ENCODING),
4543})
4545# Positional escape sequences.
4546POSITION_ESCAPES = {
4547 "A": StartOfString(),
4548 "b": Boundary(),
4549 "B": Boundary(False),
4550 "K": Keep(),
4551 "m": StartOfWord(),
4552 "M": EndOfWord(),
4553 "Z": EndOfString(),
4554}
4555ASCII_POSITION_ESCAPES = dict(POSITION_ESCAPES)
4556ASCII_POSITION_ESCAPES.update({
4557 "b": Boundary(encoding=ASCII_ENCODING),
4558 "B": Boundary(False, encoding=ASCII_ENCODING),
4559 "m": StartOfWord(encoding=ASCII_ENCODING),
4560 "M": EndOfWord(encoding=ASCII_ENCODING),
4561})
4562UNICODE_POSITION_ESCAPES = dict(POSITION_ESCAPES)
4563UNICODE_POSITION_ESCAPES.update({
4564 "b": Boundary(encoding=UNICODE_ENCODING),
4565 "B": Boundary(False, encoding=UNICODE_ENCODING),
4566 "m": StartOfWord(encoding=UNICODE_ENCODING),
4567 "M": EndOfWord(encoding=UNICODE_ENCODING),
4568})
4570# Positional escape sequences when WORD flag set.
4571WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES)
4572WORD_POSITION_ESCAPES.update({
4573 "b": DefaultBoundary(),
4574 "B": DefaultBoundary(False),
4575 "m": DefaultStartOfWord(),
4576 "M": DefaultEndOfWord(),
4577})
4579# Regex control verbs.
4580VERBS = {
4581 "FAIL": Failure(),
4582 "F": Failure(),
4583 "PRUNE": Prune(),
4584 "SKIP": Skip(),
4585}