Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/regex-2023.3.23-py3.8-linux-x86_64.egg/regex/_regex_core.py: 87%
2773 statements
« prev ^ index » next coverage.py v7.2.2, created at 2023-03-26 06:34 +0000
« prev ^ index » next coverage.py v7.2.2, created at 2023-03-26 06:34 +0000
1#
2# Secret Labs' Regular Expression Engine core module
3#
4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
5#
6# This version of the SRE library can be redistributed under CNRI's
7# Python 1.6 license. For any other use, please contact Secret Labs
8# AB (info@pythonware.com).
9#
10# Portions of this engine have been developed in cooperation with
11# CNRI. Hewlett-Packard provided funding for 1.6 integration and
12# other compatibility work.
13#
14# 2010-01-16 mrab Python front-end re-written and extended
16import enum
17import string
18import unicodedata
19from collections import defaultdict
21import regex._regex as _regex
# Names re-exported by the top-level regex module.
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
  "Scanner", "RegexFlag"]
# The regex exception.
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        # The pattern may be a str or bytes; pick a matching newline.
        nl = "\n" if isinstance(pattern, str) else b"\n"
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            # Translate the flat position into line/column coordinates.
            self.lineno = pattern.count(nl, 0, pos) + 1
            self.colno = pos - pattern.rfind(nl, 0, pos)

            message = f"{message} at position {pos}"

            if nl in pattern:
                message = f"{message} (line {self.lineno}, column {self.colno})"

        Exception.__init__(self, message)
59# The exception for when a positional flag has been turned on in the old
60# behaviour.
61class _UnscopedFlagSet(Exception):
62 pass
# Raised when parsing fails and another interpretation should be tried.
class ParseError(Exception):
    pass
68# The exception for when there isn't a valid first set.
69class _FirstSetError(Exception):
70 pass
# Flags.
class RegexFlag(enum.IntFlag):
    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the
                              # first fuzzy match.
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A named flag prints as a plain attribute access.
        if self._name_ is not None:
            return "regex." + self._name_

        value = self._value_
        inverted = value < 0

        if inverted:
            value = ~value

        # Decompose the value into known members, in definition order.
        parts = []
        for member in self.__class__:
            if value & member._value_:
                value &= ~member._value_
                parts.append("regex." + member._name_)

        if value:
            # Any unnamed residue is shown in hex.
            parts.append(hex(value))

        text = "|".join(parts)

        if inverted:
            text = "~(%s)" % text if len(parts) > 1 else "~%s" % text

        return text

    __str__ = object.__str__
# Publish the flag members (A, ASCII, I, IGNORECASE, ...) at module level.
globals().update(RegexFlag.__members__)

DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The masks for the flags: global flags apply to the whole pattern; scoped
# flags can be turned on and off within it.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

# Character classes used by the parser.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# "" is included so that end-of-pattern takes the special-character path.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The inline regular expression flag characters.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
# The names of the opcodes, one per line; the engine identifies each opcode
# by its index in this list (see the OP namespace built below).
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
FUZZY_EXT
"""
# A bare attribute container; the opcodes are defined in one of these.
class Namespace:
    pass
# Expose each opcode as OP.<NAME> = <index in OPCODES>.
OP = Namespace()
vars(OP).update({name: code for code, name in enumerate(OPCODES.split())})
269def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
270 """Make room in the given cache.
272 Args:
273 cache_dict: The cache dictionary to modify.
274 args_dict: The dictionary of named list args used by patterns.
275 max_length: Maximum # of entries in cache_dict before it is shrunk.
276 divisor: Cache will shrink to max_length - 1/divisor*max_length items.
277 """
278 # Toss out a fraction of the entries at random to make room for new ones.
279 # A random algorithm was chosen as opposed to simply cache_dict.popitem()
280 # as popitem could penalize the same regular expression repeatedly based
281 # on its internal hash value. Being random should spread the cache miss
282 # love around.
283 cache_keys = tuple(cache_dict.keys())
284 overage = len(cache_keys) - max_length
285 if overage < 0:
286 # Cache is already within limits. Normally this should not happen
287 # but it could due to multithreading.
288 return
290 number_to_toss = max_length // divisor + overage
292 # The import is done here to avoid a circular dependency.
293 import random
294 if not hasattr(random, 'sample'):
295 # Do nothing while resolving the circular dependency:
296 # re->random->warnings->tokenize->string->re
297 return
299 for doomed_key in random.sample(cache_keys, number_to_toss):
300 try:
301 del cache_dict[doomed_key]
302 except KeyError:
303 # Ignore problems if the cache changed from another thread.
304 pass
306 # Rebuild the arguments and locale-sensitivity dictionaries.
307 args_dict.clear()
308 sensitivity_dict = {}
309 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
310 args_dict[pattern, pattern_type, flags, default_version, locale] = args
311 try:
312 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
313 except KeyError:
314 pass
316 locale_sensitive.clear()
317 locale_sensitive.update(sensitivity_dict)
def _fold_case(info, string):
    "Folds the case of a string."
    flags = info.flags
    # Without an explicit encoding flag, fall back to the guessed one.
    if not flags & _ALL_ENCODINGS:
        flags |= info.guess_encoding

    return _regex.fold_case(flags, string)
def is_cased_i(info, char):
    "Checks whether a character is cased."
    # A character with more than one case form counts as cased.
    cases = _regex.get_all_cases(info.flags, char)
    return len(cases) > 1
def is_cased_f(flags, char):
    "Checks whether a character is cased."
    # Same as is_cased_i, but takes the flags directly.
    cases = _regex.get_all_cases(flags, char)
    return len(cases) > 1
def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)
    if not checked:
        # No usable firstset.
        return []

    # Compile the verified firstset.
    return checked.compile(reverse)
def _check_firstset(info, reverse, fs):
    "Checks the firstset for the pattern."
    if not fs or None in fs:
        return None

    # For simplicity, no firstset is built when case is fully ignored.
    members = set()
    case_flags = NOCASE
    for item in fs:
        if isinstance(item, Character) and not item.positive:
            return None

        case_flags |= item.case_flags
        members.add(item.with_flags(case_flags=NOCASE))

    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    firstset = SetUnion(info, list(members), case_flags=case_flags &
      ~FULLCASE, zerowidth=True)
    firstset = firstset.optimise(info, reverse, in_set=True)

    return firstset
376def _flatten_code(code):
377 "Flattens the code from a list of tuples."
378 flat_code = []
379 for c in code:
380 flat_code.extend(c)
382 return flat_code
def make_case_flags(info):
    "Makes the case flags."
    flags = info.flags & CASE_FLAGS

    # FULLCASE is meaningless under ASCII, so drop it.
    if info.flags & ASCII:
        flags &= ~FULLCASE

    return flags
def make_character(info, value, in_set=False):
    "Makes a character literal."
    if in_set:
        # A character set is built case-sensitively.
        return Character(value)

    return Character(value, case_flags=make_case_flags(info))
def make_ref_group(info, name, position):
    "Makes a group reference."
    case_flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=case_flags)
def make_string_set(info, name):
    "Makes a string set."
    case_flags = make_case_flags(info)
    return StringSet(info, name, case_flags=case_flags)
def make_property(info, prop, in_set):
    "Makes a property."
    if in_set:
        # Properties inside a set keep their own casing.
        return prop

    return prop.with_flags(case_flags=make_case_flags(info))
def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    branches = [parse_sequence(source, info)]
    while source.match("|"):
        branches.append(parse_sequence(source, info))

    # A single branch needs no Branch wrapper.
    return branches[0] if len(branches) == 1 else Branch(branches)
def parse_sequence(source, info):
    "Parses a sequence, eg. 'abc'."
    # None is used as a placeholder so quantifiers can detect "nothing to
    # repeat"; placeholders are stripped before returning.
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        start = source.pos
        ch = source.get()
        if ch not in SPECIAL_CHARS:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))
            continue

        if ch in ")|":
            # The end of a sequence. At the end of the pattern ch is "".
            source.pos = start
            break
        elif ch == "\\":
            # An escape sequence outside a set.
            sequence.append(parse_escape(source, info, False))
        elif ch == "(":
            # A parenthesised subpattern or a flag.
            element = parse_paren(source, info)
            if element is None:
                # Inline flags were parsed; refresh the case flags.
                case_flags = make_case_flags(info)
            else:
                sequence.append(element)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                sequence.append(AnyAll())
            elif info.flags & WORD:
                sequence.append(AnyU())
            else:
                sequence.append(Any())
        elif ch == "[":
            # A character set.
            sequence.append(parse_set(source, info))
        elif ch == "^":
            # The start of a line or the string.
            if info.flags & MULTILINE:
                sequence.append(StartOfLineU() if info.flags & WORD else
                  StartOfLine())
            else:
                sequence.append(StartOfString())
        elif ch == "$":
            # The end of a line or the string.
            if info.flags & MULTILINE:
                sequence.append(EndOfLineU() if info.flags & WORD else
                  EndOfLine())
            else:
                sequence.append(EndOfStringLineU() if info.flags & WORD else
                  EndOfStringLine())
        elif ch in "?*+{":
            # Looks like a quantifier.
            counts = parse_quantifier(source, info, ch)
            if counts:
                # It _is_ a quantifier.
                apply_quantifier(source, info, counts, case_flags, ch, start,
                  sequence)
                sequence.append(None)
            else:
                # Not a quantifier. Maybe it's a fuzzy constraint.
                constraints = parse_fuzzy(source, info, ch, case_flags)
                if constraints:
                    # It _is_ a fuzzy constraint.
                    apply_constraint(source, info, constraints, case_flags,
                      start, sequence)
                    sequence.append(None)
                else:
                    # The element was just a literal after all.
                    sequence.append(Character(ord(ch),
                      case_flags=case_flags))
        else:
            # A literal (eg. "#" outside verbose mode).
            sequence.append(Character(ord(ch), case_flags=case_flags))

    return Sequence([item for item in sequence if item is not None])
def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    "Applies a quantifier to the preceding element of a sequence."
    element = sequence.pop()
    if element is None:
        # The placeholder means there was no preceding element.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    start = source.pos
    suffix = source.get()
    if suffix == "?":
        # The "?" suffix means it's a lazy repeat.
        repeat_class = LazyRepeat
    elif suffix == "+":
        # The "+" suffix means it's a possessive repeat.
        repeat_class = PossessiveRepeat
    else:
        # No suffix means it's a greedy repeat.
        source.pos = start
        repeat_class = GreedyRepeat

    # Skip the quantifier if it wraps a zero-width item or repeats exactly
    # once.
    if not element.is_empty() and (min_count != 1 or max_count != 1):
        element = repeat_class(element, min_count, max_count)

    sequence.append(element)
def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    "Applies a fuzzy constraint to the preceding element of a sequence."
    element = sequence.pop()
    if element is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    # A group marked as fuzzy absorbs the fuzziness into its subpattern.
    if isinstance(element, Group):
        element.subpattern = Fuzzy(element.subpattern, constraints)
        sequence.append(element)
    else:
        sequence.append(Fuzzy(element, constraints))
557_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
def parse_quantifier(source, info, ch):
    "Parses a quantifier."
    counts = _QUANTIFIERS.get(ch)
    if counts:
        # One of "?", "*" or "+".
        return counts

    if ch == "{":
        # Could be a bounded repeat, eg. 'a{2,3}'.
        counts = parse_limited_quantifier(source)
        if counts:
            return counts

    return None
def is_above_limit(count):
    "Checks whether a count is above the maximum."
    # None means "unlimited" in the (min, max) sense and is never above.
    if count is None:
        return False
    return count >= UNLIMITED
def parse_limited_quantifier(source):
    "Parses a limited quantifier, eg. '{2,3}' (the '{' is already consumed)."
    start = source.pos
    min_count = parse_count(source)
    if source.match(","):
        max_count = parse_count(source)

        # An empty minimum means 0; an empty maximum means unlimited.
        min_count = int(min_count or 0)
        max_count = int(max_count) if max_count else None
    else:
        if not min_count:
            # Not a quantifier at all; back out.
            source.pos = start
            return None

        min_count = max_count = int(min_count)

    if not source.match("}"):
        source.pos = start
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, start)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          start)

    return min_count, max_count
def parse_fuzzy(source, info, ch, case_flags):
    "Parses a fuzzy setting, if present."
    start = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all.
        source.pos = start
        return None

    if source.match(":"):
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints
def parse_fuzzy_item(source, constraints):
    "Parses a fuzzy setting item."
    start = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a plain constraint; retry as a cost equation.
        source.pos = start

        parse_cost_equation(source, constraints)
def parse_cost_constraint(source, constraints):
    "Parses a cost constraint."
    start = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        upper_inclusive = parse_fuzzy_compare(source)

        if upper_inclusive is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            upper = parse_cost_limit(source)

            # An exclusive limit is one less than the inclusive one.
            if not upper_inclusive:
                upper -= 1

            if upper < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, upper
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = start

        # Minimum cost.
        cost_pos = source.pos
        lower = parse_cost_limit(source)

        lower_inclusive = parse_fuzzy_compare(source)
        if lower_inclusive is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        upper_inclusive = parse_fuzzy_compare(source)
        if upper_inclusive is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        upper = parse_cost_limit(source)

        # Turn exclusive limits into inclusive ones.
        if not lower_inclusive:
            lower += 1
        if not upper_inclusive:
            upper -= 1

        if not 0 <= lower <= upper:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = lower, upper
    else:
        raise ParseError()
def parse_cost_limit(source):
    "Parses a cost limit."
    start = source.pos
    digits = parse_count(source)

    try:
        return int(digits)
    except ValueError:
        # Fall through to the error below (empty or non-numeric).
        pass

    raise error("bad fuzzy cost limit", source.string, start)
def parse_constraint(source, constraints, ch):
    "Parses a constraint."
    # Only the constraint kinds "d", "e", "i" and "s" are valid, once each.
    if ch not in "deis" or ch in constraints:
        raise ParseError()

    return ch
def parse_fuzzy_compare(source):
    "Parses a cost comparator; True if inclusive, False if exclusive."
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None
def parse_cost_equation(source, constraints):
    "Parses a cost equation."
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    equation = {}

    parse_cost_term(source, equation)
    while source.match("+"):
        parse_cost_term(source, equation)

    inclusive = parse_fuzzy_compare(source)
    if inclusive is None:
        raise ParseError()

    limit = int(parse_count(source))

    # An exclusive limit is one less than the inclusive one.
    if not inclusive:
        limit -= 1

    if limit < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    equation["max"] = limit

    constraints["cost"] = equation
def parse_cost_term(source, cost):
    "Parses a cost equation term."
    coefficient = parse_count(source)
    ch = source.get()
    if ch not in "dis":
        raise ParseError()

    if ch in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient defaults to 1.
    cost[ch] = int(coefficient or 1)
def parse_fuzzy_test(source, info, case_flags):
    "Parses the test pattern of a fuzzy setting."
    start = source.pos
    ch = source.get()
    if ch not in SPECIAL_CHARS:
        # A literal. ("" is in SPECIAL_CHARS, so ch is non-empty here.)
        return Character(ord(ch), case_flags=case_flags)

    if ch == "\\":
        # An escape sequence outside a set.
        return parse_escape(source, info, False)
    if ch == ".":
        # Any character.
        if info.flags & DOTALL:
            return AnyAll()
        if info.flags & WORD:
            return AnyU()
        return Any()
    if ch == "[":
        # A character set.
        return parse_set(source, info)

    raise error("expected character set", source.string, start)
def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the (possibly empty) run of decimal digits at the position.
    return source.get_while(DIGITS)
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.
    """
    start = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        start_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            start_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = start_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, start_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, start_2)

        # (?...: probably a flags subpattern.
        source.pos = start_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...
        start_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, start_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = start
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)
def parse_extension(source, info):
    "Parses a Python extension."
    start = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              start)

        return make_ref_group(info, name, start)
    if ch in (">", "&"):
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, start)

    source.pos = start
    raise error("unknown extension", source.string, start)
def parse_comment(source):
    "Parses a comment, consuming everything up to the closing ')'."
    while True:
        start = source.pos
        ch = source.get(True)

        if not ch or ch == ")":
            break

        if ch == "\\":
            # A backslash escapes the next character, even ")".
            ch = source.get(True)

    # Back up so that expect() consumes the ")" (or reports its absence).
    source.pos = start
    source.expect(")")

    return None
def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Flags set inside the lookaround are scoped to it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, subpattern)
def parse_conditional(source, info):
    "Parses a conditional subpattern."
    saved_flags = info.flags
    start = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False,
              ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True,
                  ch == "=")

        source.pos = start
        raise error("expected lookaround conditional", source.string,
          source.pos)

    # (?(group)...: a group-existence conditional.
    source.pos = start
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            no_branch = Sequence()

        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, start)
def parse_lookaround_conditional(source, info, behind, positive):
    "Parses a lookaround conditional subpattern."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    if source.match("|"):
        no_branch = parse_sequence(source, info)
    else:
        no_branch = Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, subpattern, yes_branch,
      no_branch)
def parse_atomic(source, info):
    "Parses an atomic subpattern."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Flags set inside the atomic group are scoped to it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(subpattern)
def parse_common(source, info):
    "Parses a common groups branch."
    # Capture groups in different branches can reuse the same group numbers.
    initial_count = info.group_count
    branches = [parse_sequence(source, info)]
    final_count = info.group_count
    while source.match("|"):
        info.group_count = initial_count
        branches.append(parse_sequence(source, info))
        final_count = max(final_count, info.group_count)

    info.group_count = final_count
    source.expect(")")

    return branches[0] if len(branches) == 1 else Branch(branches)
def parse_call_group(source, info, ch, pos):
    "Parses a call to a group."
    if ch == "R":
        # (?R) calls group 0, ie. the whole pattern.
        group = "0"
    else:
        group = ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)
def parse_call_named_group(source, info, pos):
    "Parses a call to a named group."
    group = parse_name(source)
    source.expect(")")

    return CallGroup(info, group, pos)
def parse_flag_set(source):
    "Parses a set of inline flags."
    flags = 0

    try:
        while True:
            start = source.pos
            ch = source.get()
            if ch == "V":
                # Version flags are two characters, eg. "V0" or "V1".
                ch += source.get()
            flags |= REGEX_FLAGS[ch]
    except KeyError:
        # Not a flag character; back up to just before it.
        source.pos = start

    return flags
def parse_flags(source, info):
    "Parses flags being turned on/off."
    flags_on = parse_flag_set(source)
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)
    else:
        flags_off = 0

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off
def parse_subpattern(source, info, flags_on, flags_off):
    "Parses a subpattern with scoped flags."
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off
    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the flags that were in effect outside this scope.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern
def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's a inline flags, then None is returned.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # From here on only scoped flags remain.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # A subpattern with scoped flags.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # Inline positional flags.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)
def parse_positional_flags(source, info, flags_on, flags_off):
    "Parses positional flags."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
    if version == VERSION0:
        # Positional flags are global and can only be turned on.
        if flags_off:
            raise error("bad inline flags: cannot turn flags off",
              source.string, source.pos)

        new_global_flags = flags_on & ~info.global_flags
        if new_global_flags:
            info.global_flags |= new_global_flags

            # A global has been turned on, so reparse the pattern.
            raise _UnscopedFlagSet(info.global_flags)
    else:
        # Under VERSION1, positional flags apply from here to the end of the
        # enclosing scope.
        info.flags = (info.flags | flags_on) & ~flags_off

    source.ignore_space = bool(info.flags & VERBOSE)
def parse_name(source, allow_numeric=False, allow_group_0=False):
    "Parses a group name (or, optionally, a numeric group reference)."
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        # A numeric reference is accepted only where explicitly allowed.
        min_group = 0 if allow_group_0 else 1
        if not allow_numeric or int(name) < min_group:
            raise error("bad character in group name", source.string,
              source.pos)
    elif not name.isidentifier():
        raise error("bad character in group name", source.string, source.pos)

    return name
def is_octal(string):
    "Checks whether a string consists entirely of octal digits."
    for ch in string:
        if ch not in OCT_DIGITS:
            return False

    return True
def is_decimal(string):
    "Checks whether a string consists entirely of decimal digits."
    for ch in string:
        if ch not in DIGITS:
            return False

    return True
def is_hexadecimal(string):
    "Checks whether a string consists entirely of hexadecimal digits."
    for ch in string:
        if ch not in HEX_DIGITS:
            return False

    return True
def parse_escape(source, info, in_set):
    """Parses an escape sequence (the backslash has already been consumed).

    in_set is True when parsing inside a character set, where group
    references, anchors and positional escapes aren't permitted.
    """
    # Read the escaped character verbatim, even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            # The WORD flag changes which positional escapes are available.
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # A character-set escape such as \d or \w.
        value = CHARSET_ESCAPES.get(ch)
        if value:
            return value

        # A single-character escape such as \n or \t.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
def parse_numeric_escape(source, info, ch, in_set):
    """Parses a numeric escape sequence: either an octal escape or a group
    reference (group references aren't allowed inside a set or after \\0).
    """
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            if encoding == ASCII or encoding == LOCALE:
                # Byte-oriented behaviours wrap the value to a single byte.
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference.
    # Push back the last character read; it isn't part of the reference.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)
def parse_octal_escape(source, info, digits, in_set):
    """Parses an octal escape sequence of at most 3 digits; 'digits' holds the
    digit(s) already consumed.
    """
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Push back the first character that wasn't an octal digit.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # int() can fail only because of the caller-supplied leading digit(s);
        # the loop above accepts octal digits only.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)
def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    "Parses a hex escape sequence with a fixed number of hex digits."
    start_pos = source.pos
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, start_pos)
        digits.append(ch)

    text = "".join(digits)
    try:
        codepoint = int(text, 16)
    except ValueError:
        pass
    else:
        # Only codepoints within the Unicode range are acceptable.
        if codepoint < 0x110000:
            return make_character(info, codepoint, in_set)

    # Bad hex escape.
    raise error("bad hex escape \\%s%s" % (esc, text), source.string,
      start_pos)
def parse_group_ref(source, info):
    "Parses a group reference of the form \"<name>\"."
    source.expect("<")
    name_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")
    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, name_pos)
def parse_string_set(source, info):
    "Parses a string set reference, \"\\L<name>\"."
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")
    known = name is not None and name in info.kwargs
    if not known:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, name)
def parse_named_char(source, info, in_set):
    "Parses a named character, \"\\N{...}\"."
    start = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART)
        if source.match("}"):
            try:
                char = unicodedata.lookup(name)
                return make_character(info, ord(char), in_set)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character, so treat it as a literal "N".
    source.pos = start
    return make_character(info, ord("N"), in_set)
def parse_property(source, info, positive, in_set):
    """Parses a Unicode property, \\p{...} or \\P{...}, or an abbreviated form
    such as \\pL. 'positive' is False for \\P.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        # A leading "^" inside the braces also negates the property.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            prop = lookup_property(prop_name, name, positive != negate, source)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        prop = lookup_property(None, ch, positive, source)
        return make_property(info, prop, in_set)

    # Not a property (note: an unclosed "{...}" also falls through to here),
    # so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)
def parse_property_name(source):
    """Parses a property name, which may be qualified, returning a
    (property_name, value_name) pair; property_name is None if the name is
    unqualified.
    """
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Push back anything read past the end of the name.
    source.pos = saved_pos
    return prop_name, name
def parse_set(source, info):
    "Parses a character set (the \"[\" has already been consumed)."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    saved_ignore = source.ignore_space
    source.ignore_space = False
    # A leading "^" negates the whole set.
    negate = source.match("^")
    # VERSION1 supports set operators; VERSION0 has implicit union only.
    parse_top = parse_set_imp_union if version == VERSION0 else parse_set_union
    try:
        item = parse_top(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    return item.with_flags(case_flags=make_case_flags(info))
def parse_set_union(source, info):
    "Parses a set union ([x||y])."
    operands = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        operands.append(parse_set_symm_diff(source, info))

    return operands[0] if len(operands) == 1 else SetUnion(info, operands)
def parse_set_symm_diff(source, info):
    "Parses a set symmetric difference ([x~~y])."
    operands = [parse_set_inter(source, info)]
    while source.match("~~"):
        operands.append(parse_set_inter(source, info))

    return operands[0] if len(operands) == 1 else SetSymDiff(info, operands)
def parse_set_inter(source, info):
    "Parses a set intersection ([x&&y])."
    operands = [parse_set_diff(source, info)]
    while source.match("&&"):
        operands.append(parse_set_diff(source, info))

    return operands[0] if len(operands) == 1 else SetInter(info, operands)
def parse_set_diff(source, info):
    "Parses a set difference ([x--y])."
    operands = [parse_set_imp_union(source, info)]
    while source.match("--"):
        operands.append(parse_set_imp_union(source, info))

    return operands[0] if len(operands) == 1 else SetDiff(info, operands)
def parse_set_imp_union(source, info):
    "Parses a set implicit union ([xy])."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    members = [parse_set_member(source, info)]
    while True:
        here = source.pos
        if source.match("]"):
            # End of the set.
            source.pos = here
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators, which terminate an implicit
            # union.
            source.pos = here
            break

        members.append(parse_set_member(source, info))

    return members[0] if len(members) == 1 else SetUnion(info, members)
def parse_set_member(source, info):
    """Parses a member in a character set: either a single item or a range
    such as "a-z".
    """
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A degenerate range of a single character.
        return start

    return Range(start.value, end.value)
def parse_set_item(source, info):
    """Parses an item in a character set: an escape, a POSIX class, a nested
    set (VERSION1 only), or a literal character.
    """
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    # A literal character.
    return Character(ord(ch))
def parse_posix_class(source, info):
    "Parses a POSIX character class, \"[:name:]\"."
    negate = source.match("^")
    prop_name, name = parse_property_name(source)
    if source.match(":]"):
        return lookup_property(prop_name, name, not negate, source, posix=True)

    # Not delimited correctly; the caller falls back to a literal parse.
    raise ParseError()
def float_to_rational(flt):
    "Converts a float to a rational pair (numerator, denominator)."
    whole = int(flt)
    remainder = flt - whole
    if abs(remainder) < 0.0001:
        # Close enough to an integer.
        return whole, 1

    # Continued-fraction step: recurse on the reciprocal of the remainder.
    den, num = float_to_rational(1.0 / remainder)

    return whole * den + num, den
def numeric_to_rational(numeric):
    "Converts a numeric string to a rational string, if possible."
    if numeric.startswith("-"):
        sign, numeric = "-", numeric[1 : ]
    else:
        sign = ""

    parts = numeric.split("/")
    if len(parts) == 2:
        # An explicit fraction.
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    elif len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    else:
        raise ValueError()

    result = "{}{}/{}".format(sign, num, den)
    if result.endswith("/1"):
        # An integral value; drop the denominator.
        result = result[ : -2]

    return result
def standardise_name(name):
    "Standardises a property or value name."
    flat = "".join(name)
    try:
        return numeric_to_rational(flat)
    except (ValueError, ZeroDivisionError):
        # Not numeric: strip spacing/punctuation and uppercase it.
        return "".join(ch for ch in flat if ch not in "_- ").upper()
# POSIX classes that have a distinct "POSIX..." property variant when looked
# up via [[:name:]] (see lookup_property).
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The full value set of a binary ("yes/no") property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
def lookup_property(property, value, positive, source=None, posix=False):
    """Looks up a property, returning a Property node.

    'property' may be None for an unqualified name, in which case 'value' is
    tried as a GC/script/block value, a binary property, or an "Is..."/"In..."
    prefixed name. 'posix' selects the POSIX variants of some classes.
    """
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        # "Assigned" is the complement of "Unassigned".
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and value.upper() in _POSIX_CLASSES:
        # Use the POSIX variant of the class.
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        # Property ids are packed as (property << 16) | value.
        return Property((prop_id << 16) | val_id, positive)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            # A binary property; value 1 means "yes".
            return Property((prop_id << 16) | 1, positive)

        # A non-binary property named without a value matches "not the
        # default value" (value 0), hence the inverted 'positive'.
        return Property(prop_id << 16, not positive)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive)

    # It might be the name of a script or block starting with a prefix.
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)
def _compile_replacement(source, pattern, is_unicode):
    """Compiles a replacement template escape sequence.

    Returns (is_group, codes): if is_group is True, codes holds a group index;
    otherwise it holds literal codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group reference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    if isinstance(source.sep, bytes):
        # A bytes template wraps octal values to a single byte.
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
def parse_repl_hex_escape(source, expected_len, type):
    "Parses a hex escape sequence in a replacement string."
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, source.pos)
        digits.append(ch)

    return int("".join(digits), 16)
def parse_repl_named_char(source):
    "Parses a named character, \"\\N{...}\", in a replacement string."
    start = source.pos
    if source.match("{"):
        name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                found = unicodedata.lookup(name)
                return ord(found)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character; backtrack.
    source.pos = start
    return None
def compile_repl_group(source, pattern):
    "Compiles a replacement template group reference, returning its index."
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")
    if name.isdigit():
        # A numeric group reference.
        index = int(name)
        if index < 0 or index > pattern.groups:
            raise error("invalid group reference", source.string, source.pos)

        return index

    # A named group reference.
    try:
        return pattern.groupindex[name]
    except KeyError:
        raise IndexError("unknown group")
# The regular expression is parsed into a syntax tree. The different types of
# node are defined below.

# Indentation unit used by the node dump() methods.
INDENT = " "
# Bit flags attached to opcodes in the compiled pattern.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10

# Text fragments used when dumping nodes.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
def make_sequence(items):
    "Wraps a list of nodes in a Sequence, unless it's a single node."
    if len(items) == 1:
        return items[0]

    return Sequence(items)
# Common base class for all nodes.
class RegexBase:
    """Base class for all parse-tree nodes.

    The methods below are defaults; subclasses override the ones that apply
    to them. Equality and hashing are based on _key, which subclasses extend
    with their distinguishing attributes.
    """
    def __init__(self):
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        """Returns a copy of this node with the given flags applied, or self
        if nothing would change. Reads self.positive/case_flags/zerowidth,
        which the flag-bearing subclasses define.
        """
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        # Fixes up group references; nothing to do by default.
        pass

    def optimise(self, info, reverse):
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # By default the first-set can't be determined.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        # Returns (max offset, required string); None means no required
        # string is known for this node.
        return self.max_width(), None
1922# Base class for zero-width nodes.
# Base class for zero-width nodes.
class ZeroWidthBase(RegexBase):
    """Base class for nodes that match at a position without consuming any
    characters.
    """
    def __init__(self, positive=True):
        RegexBase.__init__(self)
        self.positive = bool(positive)

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # Zero-width, so it doesn't constrain the first character.
        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = POSITIVE_OP if self.positive else 0
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive]))

    def max_width(self):
        return 0
class Any(RegexBase):
    "The \".\" node."
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = FUZZY_OP if fuzzy else 0
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        return 1
# \".\" variant matching any character at all — presumably used under DOTALL;
# confirm against the parser.
class AnyAll(Any):
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"
# \".\" variant with Unicode line-ending semantics (NOTE(review): exact
# newline handling is defined by the ANY_U opcode — confirm there).
class AnyU(Any):
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
class Atomic(RegexBase):
    """An atomic group, \"(?>...)\". Most methods simply delegate to the
    wrapped subpattern.
    """
    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An empty subpattern needs no atomic wrapper.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Wrap the compiled subpattern in ATOMIC ... END.
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)
# The word-boundary assertion.
class Boundary(ZeroWidthBase):
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
2037class Branch(RegexBase):
    def __init__(self, branches):
        # A node matching any one of the alternative subpatterns.
        RegexBase.__init__(self)
        self.branches = branches
2042 def fix_groups(self, pattern, reverse, fuzzy):
2043 for b in self.branches:
2044 b.fix_groups(pattern, reverse, fuzzy)
    def optimise(self, info, reverse):
        """Optimises the branch node: flattens nested branches, hoists any
        common prefix/suffix, and tries to reduce the branches to a set.
        """
        if not self.branches:
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            if not prefix or not suffix:
                # We might be able to add a quick precheck before the branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    # The precheck goes on the side the match starts from.
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)
    def _add_precheck(self, info, reverse, branches):
        """Builds a quick first-character precheck for the branches, or
        returns None if one can't be built (any branch that isn't a
        case-sensitive Literal rules it out).
        """
        charset = set()
        pos = -1 if reverse else 0

        for branch in branches:
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in charset])
2096 def pack_characters(self, info):
2097 self.branches = [b.pack_characters(info) for b in self.branches]
2098 return self
2100 def remove_captures(self):
2101 self.branches = [b.remove_captures() for b in self.branches]
2102 return self
2104 def is_atomic(self):
2105 return all(b.is_atomic() for b in self.branches)
2107 def can_be_affix(self):
2108 return all(b.can_be_affix() for b in self.branches)
2110 def contains_group(self):
2111 return any(b.contains_group() for b in self.branches)
2113 def get_firstset(self, reverse):
2114 fs = set()
2115 for b in self.branches:
2116 fs |= b.get_firstset(reverse)
2118 return fs or set([None])
2120 def _compile(self, reverse, fuzzy):
2121 code = [(OP.BRANCH, )]
2122 for b in self.branches:
2123 code.extend(b.compile(reverse, fuzzy))
2124 code.append((OP.NEXT, ))
2126 code[-1] = (OP.END, )
2128 return code
2130 def dump(self, indent, reverse):
2131 print("{}BRANCH".format(INDENT * indent))
2132 self.branches[0].dump(indent + 1, reverse)
2133 for b in self.branches[1 : ]:
2134 print("{}OR".format(INDENT * indent))
2135 b.dump(indent + 1, reverse)
2137 @staticmethod
2138 def _flatten_branches(info, reverse, branches):
2139 # Flatten the branches so that there aren't branches of branches.
2140 new_branches = []
2141 for b in branches:
2142 b = b.optimise(info, reverse)
2143 if isinstance(b, Branch):
2144 new_branches.extend(b.branches)
2145 else:
2146 new_branches.append(b)
2148 return new_branches
    @staticmethod
    def _split_common_prefix(info, branches):
        """Splits the longest common prefix out of the branches, returning
        (prefix_items, new_branches).
        """
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches
    @staticmethod
    def _split_common_suffix(info, branches):
        """Splits the longest common suffix out of the branches, returning
        (suffix_items, new_branches).
        """
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix? (Scanned from the end using
        # negative indices.)
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
              in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches
    @staticmethod
    def _can_split(items, count):
        """Checks whether 'items' can be split at index 'count' without
        breaking a character sequence that takes part in full case-folding.
        """
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True
    @staticmethod
    def _can_split_rev(items, count):
        """Like _can_split, but for a split 'count' items from the end of
        'items' (used when splitting a common suffix).
        """
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True
2285 @staticmethod
2286 def _merge_common_prefixes(info, reverse, branches):
2287 # Branches with the same case-sensitive character prefix can be grouped
2288 # together if they are separated only by other branches with a
2289 # character prefix.
2290 prefixed = defaultdict(list)
2291 order = {}
2292 new_branches = []
2293 for b in branches:
2294 if Branch._is_simple_character(b):
2295 # Branch starts with a simple character.
2296 prefixed[b.value].append([b])
2297 order.setdefault(b.value, len(order))
2298 elif (isinstance(b, Sequence) and b.items and
2299 Branch._is_simple_character(b.items[0])):
2300 # Branch starts with a simple character.
2301 prefixed[b.items[0].value].append(b.items)
2302 order.setdefault(b.items[0].value, len(order))
2303 else:
2304 Branch._flush_char_prefix(info, reverse, prefixed, order,
2305 new_branches)
2307 new_branches.append(b)
2309 Branch._flush_char_prefix(info, prefixed, order, new_branches)
2311 return new_branches
2313 @staticmethod
2314 def _is_simple_character(c):
2315 return isinstance(c, Character) and c.positive and not c.case_flags
    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        """Merge runs of single-character branches into set unions.

        Consecutive Character/Property/set branches with the same case
        flags are pooled and flushed as one set member list; any other
        branch (or a change of case flags) flushes the pool first.
        Returns the new list of branches.
        """
        # Can the branches be reduced to a set?
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items, case_flags,
                      new_branches)

                    case_flags = b.case_flags

                # Pool the member case-insensitively; the current flags are
                # re-applied when the pool is flushed.
                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        # Flush whatever is left in the pool.
        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches
2345 @staticmethod
2346 def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
2347 # Flush the prefixed branches.
2348 if not prefixed:
2349 return
2351 for value, branches in sorted(prefixed.items(), key=lambda pair:
2352 order[pair[0]]):
2353 if len(branches) == 1:
2354 new_branches.append(make_sequence(branches[0]))
2355 else:
2356 subbranches = []
2357 optional = False
2358 for b in branches:
2359 if len(b) > 1:
2360 subbranches.append(make_sequence(b[1 : ]))
2361 elif not optional:
2362 subbranches.append(Sequence())
2363 optional = True
2365 sequence = Sequence([Character(value), Branch(subbranches)])
2366 new_branches.append(sequence.optimise(info, reverse))
2368 prefixed.clear()
2369 order.clear()
2371 @staticmethod
2372 def _flush_set_members(info, reverse, items, case_flags, new_branches):
2373 # Flush the set members.
2374 if not items:
2375 return
2377 if len(items) == 1:
2378 item = list(items)[0]
2379 else:
2380 item = SetUnion(info, list(items)).optimise(info, reverse)
2382 new_branches.append(item.with_flags(case_flags=case_flags))
2384 items.clear()
2386 @staticmethod
2387 def _is_full_case(items, i):
2388 if not 0 <= i < len(items):
2389 return False
2391 item = items[i]
2392 return (isinstance(item, Character) and item.positive and
2393 (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
2395 @staticmethod
2396 def _is_folded(items):
2397 if len(items) < 2:
2398 return False
2400 for i in items:
2401 if (not isinstance(i, Character) or not i.positive or not
2402 i.case_flags):
2403 return False
2405 folded = "".join(chr(i.value) for i in items)
2406 folded = _regex.fold_case(FULL_CASE_FOLDING, folded)
2408 # Get the characters which expand to multiple codepoints on folding.
2409 expanding_chars = _regex.get_expand_on_folding()
2411 for c in expanding_chars:
2412 if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
2413 return True
2415 return False
2417 def is_empty(self):
2418 return all(b.is_empty() for b in self.branches)
2420 def __eq__(self, other):
2421 return type(self) is type(other) and self.branches == other.branches
2423 def max_width(self):
2424 return max(b.max_width() for b in self.branches)
class CallGroup(RegexBase):
    """A node which calls (re-enters) a capture group's subpattern.

    `group` may initially be a name or a number; fix_groups() resolves it
    to a group number and registers the call on the pattern info.
    """

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve the group, which may be a number or a name.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        # A group opened more than once can't be called unambiguously.
        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        self._key = self.__class__, self.group

    def remove_captures(self):
        # BUG FIX: this previously referenced an undefined name `pattern`,
        # raising NameError instead of the intended regex error. The
        # pattern text isn't available here, so pass None (error() accepts
        # a pattern of None).
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class CallRef(RegexBase):
    """A parsed subpattern bracketed with a call reference so that group
    calls can jump to it.
    """

    def __init__(self, ref, parsed):
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        # CALL_REF <subpattern> END.
        code = [(OP.CALL_REF, self.ref)]
        code.extend(self.parsed._compile(reverse, fuzzy))
        code.append((OP.END, ))
        return code
class Character(RegexBase):
    """A node which matches a single codepoint.

    `value` is the codepoint, `positive` is False for a negated match,
    `case_flags` selects case sensitivity, and `zerowidth` marks a node
    compiled with the zero-width flag.
    """

    # Opcode selected by (case flags, reversed?). Full case-folding uses
    # the same opcodes as simple case-folding here; the expansion is
    # handled separately in _compile().
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Precompute the full case-folding of the character; used both when
        # compiling and for the width calculation.
        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Return a copy with different positive/case/zero-width flags.
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding, so also try to
            # match its folded string.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (ch == self.value) == self.positive

    def max_width(self):
        # A full case-folded character may match several codepoints.
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self
class Conditional(RegexBase):
    """A conditional subpattern: matches `yes_item` if the given group has
    matched, otherwise `no_item`.
    """

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve the group, which may be a number or a name.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        # Either alternative may start the match.
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        # GROUP_EXISTS <yes> [NEXT <no>] END; NEXT is omitted when the 'no'
        # alternative compiles to nothing.
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class DefaultBoundary(ZeroWidthBase):
    """Zero-width node compiled to the DEFAULT_BOUNDARY opcode."""

    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"
class DefaultEndOfWord(ZeroWidthBase):
    """Zero-width node compiled to the DEFAULT_END_OF_WORD opcode."""

    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"
class DefaultStartOfWord(ZeroWidthBase):
    """Zero-width node compiled to the DEFAULT_START_OF_WORD opcode."""

    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"
class EndOfLine(ZeroWidthBase):
    """Zero-width node compiled to the END_OF_LINE opcode."""

    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"
class EndOfLineU(EndOfLine):
    """EndOfLine variant compiled to the END_OF_LINE_U opcode."""

    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"
class EndOfString(ZeroWidthBase):
    """Zero-width node compiled to the END_OF_STRING opcode."""

    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"
class EndOfStringLine(ZeroWidthBase):
    """Zero-width node compiled to the END_OF_STRING_LINE opcode."""

    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"
class EndOfStringLineU(EndOfStringLine):
    """EndOfStringLine variant compiled to the END_OF_STRING_LINE_U
    opcode."""

    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"
class EndOfWord(ZeroWidthBase):
    """Zero-width node compiled to the END_OF_WORD opcode."""

    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"
class Failure(ZeroWidthBase):
    """Zero-width node compiled to a single FAILURE opcode."""

    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        # Unlike the other zero-width nodes, FAILURE takes no flags.
        return [(OP.FAILURE, )]
class Fuzzy(RegexBase):
    """A fuzzy-matching subpattern.

    `constraints` maps each error type — "d", "i", "s" and the generic
    "e" — to a (min, max) pair (max of None meaning unlimited), and "cost"
    to a dict of per-type cost coefficients plus a "max" total cost. The
    constructor fills in all defaults; an optional "test" entry supplies a
    test subpattern compiled with the extended opcode.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy subpattern is itself fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            # FUZZY_EXT <test> NEXT <subpattern> END.
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        # FUZZY <subpattern> END.
        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        return UNLIMITED

    def _constraints_to_string(self):
        # Build a human-readable form of the constraints for dump().
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)
class Grapheme(RegexBase):
    """Matches a whole grapheme: one or more characters, up to a grapheme
    boundary.
    """

    def _compile(self, reverse, fuzzy):
        # Lazily consume at least one character until a grapheme boundary
        # is reached; this works identically forwards and backwards.
        one_or_more = LazyRepeat(AnyAll(), 1, None)
        matcher = Atomic(Sequence([one_or_more, GraphemeBoundary()]))
        return matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print("{}GRAPHEME".format(INDENT * indent))

    def max_width(self):
        return UNLIMITED
class GraphemeBoundary:
    """Zero-width item compiled to the GRAPHEME_BOUNDARY opcode."""

    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]
class GreedyRepeat(RegexBase):
    """A greedy repeat of a subpattern, min_count..max_count times.

    A `max_count` of None means unbounded. Subclasses override `_opcode`
    and `_op_name` (lazy) or `_compile` (possessive).
    """

    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) keeps the subclass (lazy/possessive) on rebuild.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat may match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            # An empty body compiles to nothing at all.
            return []

        # <repeat opcode> <subpattern> END.
        return ([tuple(repeat)] + subpattern + [(OP.END, )])

    def dump(self, indent, reverse):
        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # The repeat is optional, so nothing is required; report the
            # width that may be skipped.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None
class PossessiveRepeat(GreedyRepeat):
    """A possessive repeat: a greedy repeat wrapped in an atomic group."""

    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            # An empty body compiles to nothing at all.
            return []

        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        # ATOMIC <repeat opcode> <subpattern> END END.
        return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ),
          (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)
class Group(RegexBase):
    """A capture group around a subpattern.

    `group` is the group number; negative numbers are private groups which
    map to public numbers via info.private_groups.
    """

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Dropping the capture leaves just the subpattern.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # If this group is called elsewhere, bracket it with a call ref.
        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            code += [(OP.CALL_REF, ref)]

        public_group = private_group = self.group
        if private_group < 0:
            # A private group; map it to its public number.
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        code += ([(OP.GROUP, int(not reverse), private_group, public_group)] +
          self.subpattern.compile(reverse, fuzzy) + [(OP.END, )])

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            # BUG FIX: this was `private_groups[group]`, an undefined name
            # (NameError); look the public number up on the pattern info,
            # exactly as _compile() does.
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class Keep(ZeroWidthBase):
    """Zero-width node compiled to the KEEP opcode."""

    _opcode = OP.KEEP
    _op_name = "KEEP"
class LazyRepeat(GreedyRepeat):
    """A lazy repeat: same structure as GreedyRepeat, different opcode."""

    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"
class LookAround(RegexBase):
    """A lookaround assertion: lookahead or lookbehind, positive or
    negative.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # The subpattern matches in the assertion's own direction, not the
        # enclosing pattern's.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        if self.positive and subpattern.is_empty():
            # A positive lookaround on an empty subpattern is a no-op.
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        # Only a positive assertion looking in the direction of the match
        # constrains the first character.
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        # LOOKAROUND <subpattern> END; the subpattern is compiled in the
        # assertion's own direction.
        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Assertions consume no characters.
        return 0
class LookAroundConditional(RegexBase):
    """A conditional whose test is a lookaround assertion: matches
    `yes_item` if the assertion succeeds, otherwise `no_item`.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        # CONDITIONAL <test> NEXT <yes> [NEXT <no>] END; the test is
        # compiled in the assertion's own direction.
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # NOTE(review): operator precedence makes this
        # (subpattern and yes_item) or no_item; compare Conditional.is_empty
        # which requires both items. Confirm whether that asymmetry is
        # intended.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None
class PrecompiledCode(RegexBase):
    """A node wrapping an already-built opcode argument list; compiling
    just emits it as a single tuple.
    """

    def __init__(self, code):
        self.code = code

    def _compile(self, reverse, fuzzy):
        return [tuple(self.code)]
class Property(RegexBase):
    """A node which matches a codepoint by a Unicode property value.

    `value` packs the property id into the high 16 bits and the value id
    into the low 16 bits (see dump()).
    """

    # Opcode selected by (case flags, reversed?).
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Return a copy with different positive/case/zero-width flags.
        return Property(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        # Decode the packed property id/value for display.
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1
class Prune(ZeroWidthBase):
    """Zero-width node compiled to a single PRUNE opcode."""

    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        # Unlike the other zero-width nodes, PRUNE takes no flags.
        return [(OP.PRUNE, )]
3267class Range(RegexBase):
3268 _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
3269 (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
3270 (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
3271 (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
3272 _op_name = "RANGE"
3274 def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
3275 zerowidth=False):
3276 RegexBase.__init__(self)
3277 self.lower = lower
3278 self.upper = upper
3279 self.positive = bool(positive)
3280 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3281 self.zerowidth = bool(zerowidth)
3283 self._key = (self.__class__, self.lower, self.upper, self.positive,
3284 self.case_flags, self.zerowidth)
3286 def rebuild(self, positive, case_flags, zerowidth):
3287 return Range(self.lower, self.upper, positive, case_flags, zerowidth)
3289 def optimise(self, info, reverse, in_set=False):
3290 # Is the range case-sensitive?
3291 if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
3292 return self
3294 # Is full case-folding possible?
3295 if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
3296 FULLIGNORECASE):
3297 return self
3299 # Get the characters which expand to multiple codepoints on folding.
3300 expanding_chars = _regex.get_expand_on_folding()
3302 # Get the folded characters in the range.
3303 items = []
3304 for ch in expanding_chars:
3305 if self.lower <= ord(ch) <= self.upper:
3306 folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
3307 items.append(String([ord(c) for c in folded],
3308 case_flags=self.case_flags))
3310 if not items:
3311 # We can fall back to simple case-folding.
3312 return self
3314 if len(items) < self.upper - self.lower + 1:
3315 # Not all the characters are covered by the full case-folding.
3316 items.insert(0, self)
3318 return Branch(items)
3320 def _compile(self, reverse, fuzzy):
3321 flags = 0
3322 if self.positive:
3323 flags |= POSITIVE_OP
3324 if self.zerowidth:
3325 flags |= ZEROWIDTH_OP
3326 if fuzzy:
3327 flags |= FUZZY_OP
3328 return [(self._opcode[self.case_flags, reverse], flags, self.lower,
3329 self.upper)]
3331 def dump(self, indent, reverse):
3332 display_lower = ascii(chr(self.lower)).lstrip("bu")
3333 display_upper = ascii(chr(self.upper)).lstrip("bu")
3334 print("{}RANGE {} {} {}{}".format(INDENT * indent,
3335 POS_TEXT[self.positive], display_lower, display_upper,
3336 CASE_TEXT[self.case_flags]))
3338 def matches(self, ch):
3339 return (self.lower <= ch <= self.upper) == self.positive
    def max_width(self):
        # A range always matches exactly one codepoint.
        return 1
class RefGroup(RegexBase):
    """A backreference to a capture group, by number or by name.

    The group is resolved to a number in fix_groups; until then
    self.group may be a name string.
    """
    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        # Position in the pattern, kept for error reporting.
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolves a group name to its number and validates the reference.

        Raises error if the group is unknown or out of range.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # The key must be refreshed now that the group is a number.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # BUG FIX: this previously raised with the undefined name `pattern`,
        # so callers got a NameError instead of the intended `error`. There
        # is no pattern string available in this scope, so pass None (the
        # error class accepts a None pattern).
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        # The referenced group's width isn't known here, so be conservative.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SearchAnchor(ZeroWidthBase):
    """Zero-width assertion anchored at the start of the search attempt."""
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"
class Sequence(RegexBase):
    """A concatenation of parsed nodes, matched one after another."""

    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        # Delegate group fixing to every member.
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        # Adjacent Character/String/Literal nodes with compatible case flags
        # are accumulated in `characters` and flushed into a single node
        # whenever the case flags change or a non-character node is seen.
        items = []
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless the neither
                    # the previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # Not a packable node: flush pending characters, then let the
                # node pack its own children.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def contains_group(self):
        return any(s.contains_group() for s in self.items)

    def get_firstset(self, reverse):
        # Union the firstsets of leading members until one of them cannot
        # match empty (None in the set marks "may match empty").
        # NOTE(review): `items.reverse()` mutates self.items in place since
        # `items` aliases it — presumably harmless because each direction is
        # computed once, but verify before reusing a parsed tree.
        fs = set()
        items = self.items
        if reverse:
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # For reverse matching the members are emitted in reverse order.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        """Converts the accumulated codepoints into Character/String nodes,
        appending them to `items`, and clears the accumulator."""
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        # Clear in place so the caller's list is emptied too.
        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        # Record every (start, end) span where an expanding fold occurs.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Alternate simple-folded and fully-folded literals over the spans.
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        """Merges overlapping/adjacent (start, end) spans after sorting."""
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        # Returns the offset of, and the first member that provides, a
        # required string; offsets of earlier members are accumulated.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None
class SetBase(RegexBase):
    """Base class for character-set nodes (union, intersection, etc.).

    Subclasses supply _opcode/_op_name and implement matches()/optimise().
    """

    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Rebuild with new flags, then re-optimise the result.
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # The set opcode is followed by its members and ended with OP.END.
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        """If full case-folding applies, branches out members whose folding
        expands to multiple codepoints; otherwise returns self."""
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        # The widest fully-folded expansion bounds the match width.
        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SetDiff(SetBase):
    """Set difference: matches the first item but not the second."""
    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = self.items
        # A -- B -- C ... is equivalent to A -- (B | C | ...).
        if len(items) > 2:
            items = [items[0], SetUnion(info, items[1 : ])]

        if len(items) == 1:
            # A difference of one item is just that item.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(m.optimise(info, reverse, in_set=True) for m in
          items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = self.items[0].matches(ch) and not self.items[1].matches(ch)
        return m == self.positive
class SetInter(SetBase):
    """Set intersection: matches only codepoints matched by every item."""
    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetInter) and m.positive:
                # Intersection in intersection.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            # An intersection of one item is just that item.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = all(i.matches(ch) for i in self.items)
        return m == self.positive
class SetSymDiff(SetBase):
    """Set symmetric difference: matches codepoints in an odd number of
    the items."""
    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetSymDiff) and m.positive:
                # Symmetric difference in symmetric difference.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            # A symmetric difference of one item is just that item.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR-fold the member results: true iff an odd number match.
        m = False
        for i in self.items:
            m = m != i.matches(ch)

        return m == self.positive
class SetUnion(SetBase):
    """Set union: matches codepoints matched by any of the items."""
    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            # A union of one item is just that item; the positivity of the
            # union and the item combine.
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Batch plain Character members into CHARACTER/STRING sub-ops,
        # grouped by positivity; other members compile themselves.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive
class Skip(ZeroWidthBase):
    """Zero-width node for the (*SKIP) control verb."""
    _op_name = "SKIP"
    _opcode = OP.SKIP
class StartOfLine(ZeroWidthBase):
    """Zero-width assertion for the start of a line."""
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"
class StartOfLineU(StartOfLine):
    """Variant of StartOfLine using the START_OF_LINE_U opcode."""
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"
class StartOfString(ZeroWidthBase):
    """Zero-width assertion for the start of the string."""
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"
class StartOfWord(ZeroWidthBase):
    """Zero-width assertion for the start of a word."""
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"
class String(RegexBase):
    """A literal string of codepoints.

    With full case-insensitivity the characters are pre-folded once at
    construction time; the compiled opcode uses the folded form.
    """
    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        # NOTE(review): unlike most nodes, RegexBase.__init__ is not called
        # here — presumably intentional since _key is set below; confirm.
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-compute the fully case-folded form.
            folded_characters = []
            for char in self.characters:
                folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
                folded_characters.extend(ord(c) for c in folded)
        else:
            folded_characters = self.characters

        self.folded_characters = tuple(folded_characters)
        # Set to True when this is the pattern's "required string".
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # The firstset is the string's first (or, reversed, last) character.
        if reverse:
            pos = -1
        else:
            pos = 0
        return set([Character(self.characters[pos],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
          len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        # A string is its own required string, at offset 0.
        return 0, self
class Literal(String):
    """A String originating from a literal run in the pattern; it differs
    from String only in how it is dumped."""

    def dump(self, indent, reverse):
        text = "".join(map(chr, self.characters))
        shown = ascii(text).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, shown,
          CASE_TEXT[self.case_flags]))
class StringSet(Branch):
    """A named list of strings (\\L<name>), compiled as a Branch of
    character sequences, longest alternative first."""

    def __init__(self, info, name, case_flags=NOCASE):
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        # Register this (name, case_flags) pair with the pattern's Info so
        # the same list/case combination gets a single index.
        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        index = self.info.named_lists_used[self.set_key]
        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        encoding = self.info.flags & _ALL_ENCODINGS
        fold_flags = encoding | case_flags

        choices = []

        for string in items:
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class Source:
    """Scanner for the regular expression source string.

    Maintains a current position (`pos`) and, when `ignore_space` is set
    (VERBOSE mode), transparently skips whitespace and '#' comments.
    End-of-string and comment-to-end-of-string conditions are handled via
    IndexError/ValueError from indexing and str.index respectively.
    """

    def __init__(self, string):
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            # Bytes patterns are scanned as latin-1 text; char_type converts
            # a codepoint back to the pattern's native type.
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        self.ignore_space = False
        # An empty string/bytes of the pattern's own type.
        self.sep = string[ : 0]

    def get(self, override_ignore=False):
        """Returns the next character, or an empty string at the end."""
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        """Returns up to `count` characters (fewer at the end)."""
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True):
        """Returns the run of characters whose membership of `test_set`
        equals `include`, advancing past them."""
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        """Like get_while, but discards the characters."""
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        """Returns whether `substring` comes next; advances only on a
        successful match."""
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        """Matches `substring` or raises error."""
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        """Returns whether only whitespace/comments (if ignored) remain."""
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        pos += 1
                    elif string[pos] == "#":
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True
class Info:
    """Info about the regular expression.

    Tracks flags, group numbering/naming, currently open groups, and the
    bookkeeping needed for group calls and named lists during parsing.
    """

    def __init__(self, flags=0, char_type=None, kwargs={}):
        # NOTE(review): mutable default `kwargs={}` — only read in the
        # visible code (e.g. StringSet), but verify it is never mutated.
        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = kwargs

        self.group_count = 0
        self.group_index = {}       # name -> group number
        self.group_name = {}        # group number -> name
        self.char_type = char_type
        self.named_lists_used = {}  # (name, case_flags) -> index
        self.open_groups = []       # stack of currently open group numbers
        self.open_group_count = {}  # group number -> times opened
        self.defined_groups = {}
        self.group_calls = []
        self.private_groups = {}    # negative alias -> real group number

    def open_group(self, name=None):
        """Opens a (possibly named) group and returns its number.

        A re-opened (nested) named group gets a temporary negative alias
        which is resolved to a real number later.
        """
        group = self.group_index.get(name)
        if group is None:
            # Find the next group number not already taken by a name.
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        self.open_groups.pop()

    def is_open_group(self, name):
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups
def _check_group_features(info, parsed):
    """Checks whether the reverse and fuzzy features of the group calls match
    the groups which they call.

    Populates info.call_refs (mapping (group, reverse, fuzzy) -> call ref)
    and info.additional_groups (copies of groups needed with different
    reverse/fuzzy features), and assigns each call its call_ref.
    """
    call_refs = {}
    additional_groups = []
    for call, reverse, fuzzy in info.group_calls:
        # Look up the reference of this group call.
        key = (call.group, reverse, fuzzy)
        ref = call_refs.get(key)
        if ref is None:
            # This group doesn't have a reference yet, so look up its features.
            if call.group == 0:
                # Calling the pattern as a whole.
                rev = bool(info.flags & REVERSE)
                fuz = isinstance(parsed, Fuzzy)
                if (rev, fuz) != (reverse, fuzzy):
                    # The pattern as a whole doesn't have the features we want,
                    # so we'll need to make a copy of it with the desired
                    # features.
                    additional_groups.append((CallRef(len(call_refs), parsed),
                      reverse, fuzzy))
            else:
                # Calling a capture group.
                def_info = info.defined_groups[call.group]
                group = def_info[0]
                if def_info[1 : ] != (reverse, fuzzy):
                    # The group doesn't have the features we want, so we'll
                    # need to make a copy of it with the desired features.
                    additional_groups.append((group, reverse, fuzzy))

            ref = len(call_refs)
            call_refs[key] = ref

        call.call_ref = ref

    info.call_refs = call_refs
    info.additional_groups = additional_groups
def _get_required_string(parsed, flags):
    """Returns (offset, characters, case_flags) describing the required
    string of a parsed pattern, or (0, (), 0) if there isn't one."""
    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
    if not required:
        return 0, (), 0

    # Mark the node so compilation emits it with the REQUIRED flag.
    required.required = True

    # An unbounded offset is signalled to the matcher as -1.
    offset = -1 if req_offset >= UNLIMITED else req_offset

    case_flags = required.case_flags
    if not (flags & UNICODE):
        case_flags &= ~UNICODE

    return offset, required.folded_characters, case_flags
class Scanner:
    """A lexical scanner: pairs of (pattern phrase, action).

    The phrases are compiled into a single branched pattern, one capture
    group per phrase; scan() applies the action of whichever phrase
    matched (identified via m.lastindex).
    """

    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern.
        info = Info(flags)
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(patterns))

    def scan(self, string):
        """Scans `string`, returning (list of action results, remainder).

        Stops at the first non-match or zero-width match.
        """
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # A zero-width match would loop forever; stop.
                break
            # m.lastindex identifies which phrase matched.
            action = self.lexicon[m.lastindex - 1][1]
            if hasattr(action, '__call__'):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]
# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict: prop_id -> (longest name,
# {value_id -> longest value name}).
PROPERTY_NAMES = {}
for prop_name, (prop_id, values) in PROPERTIES.items():
    name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {}))
    # Prefer the longest (most descriptive) alias for each property.
    name = max(name, prop_name, key=len)
    PROPERTY_NAMES[prop_id] = name, prop_values

    for val_name, val_id in values.items():
        prop_values[val_id] = max(prop_values.get(val_id, ""), val_name,
          key=len)
4450# Character escape sequences.
4451CHARACTER_ESCAPES = {
4452 "a": "\a",
4453 "b": "\b",
4454 "f": "\f",
4455 "n": "\n",
4456 "r": "\r",
4457 "t": "\t",
4458 "v": "\v",
4459}
4461# Predefined character set escape sequences.
4462CHARSET_ESCAPES = {
4463 "d": lookup_property(None, "Digit", True),
4464 "D": lookup_property(None, "Digit", False),
4465 "h": lookup_property(None, "Blank", True),
4466 "s": lookup_property(None, "Space", True),
4467 "S": lookup_property(None, "Space", False),
4468 "w": lookup_property(None, "Word", True),
4469 "W": lookup_property(None, "Word", False),
4470}
4472# Positional escape sequences.
4473POSITION_ESCAPES = {
4474 "A": StartOfString(),
4475 "b": Boundary(),
4476 "B": Boundary(False),
4477 "K": Keep(),
4478 "m": StartOfWord(),
4479 "M": EndOfWord(),
4480 "Z": EndOfString(),
4481}
4483# Positional escape sequences when WORD flag set.
4484WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES)
4485WORD_POSITION_ESCAPES.update({
4486 "b": DefaultBoundary(),
4487 "B": DefaultBoundary(False),
4488 "m": DefaultStartOfWord(),
4489 "M": DefaultEndOfWord(),
4490})
4492# Regex control verbs.
4493VERBS = {
4494 "FAIL": Failure(),
4495 "F": Failure(),
4496 "PRUNE": Prune(),
4497 "SKIP": Skip(),
4498}