1#
2# Secret Labs' Regular Expression Engine core module
3#
4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
5#
6# This version of the SRE library can be redistributed under CNRI's
7# Python 1.6 license. For any other use, please contact Secret Labs
8# AB (info@pythonware.com).
9#
10# Portions of this engine have been developed in cooperation with
11# CNRI. Hewlett-Packard provided funding for 1.6 integration and
12# other compatibility work.
13#
14# 2010-01-16 mrab Python front-end re-written and extended
15
16import enum
17import string
18import unicodedata
19from collections import defaultdict
20
21import regex._regex as _regex
22
# The public API: flag letters and long names, the exception type, the
# Scanner class and the flag enum.
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
  "Scanner", "RegexFlag"]
28
# The regex exception.
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        self.msg = message
        self.pattern = pattern
        self.pos = pos

        # Use a newline of the same type (str or bytes) as the pattern.
        newline = '\n' if isinstance(pattern, str) else b'\n'

        if pattern is not None and pos is not None:
            # Work out the human-friendly location of the failure.
            before = pattern[ : pos]
            self.lineno = before.count(newline) + 1
            self.colno = pos - before.rfind(newline)

            location = " at position {}".format(pos)
            if newline in pattern:
                location += " (line {}, column {})".format(self.lineno,
                  self.colno)

            message += location

        Exception.__init__(self, message)
58
# The exception for when a positional flag has been turned on in the old
# behaviour.
class _UnscopedFlagSet(Exception):
    """Raised internally to force a reparse when a global flag is set
    mid-pattern; carries the accumulated global flags as its argument."""
    pass
63
# The exception for when parsing fails and we want to try something else.
class ParseError(Exception):
    """Internal backtracking signal: the current parse attempt failed and
    the caller should restore the position and try an alternative."""
    pass
67
# The exception for when there isn't a valid first set.
class _FirstSetError(Exception):
    """Raised while building a firstset when no valid firstset exists."""
    pass
71
72# Flags.
class RegexFlag(enum.IntFlag):
    # In each "X = NAME = value" pair the single letter (assigned first) is
    # the canonical enum member; the long name becomes an alias of it.
    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first
                              # fuzzy match.
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A named (canonical or pseudo) member prints as "regex.NAME".
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        # An unnamed combination: decompose it into known members.
        value = self._value_
        members = []
        negative = value < 0

        if negative:
            # Render a negative value as the inversion of its complement.
            value = ~value

        for m in self.__class__:
            if value & m._value_:
                value &= ~m._value_
                members.append('regex.%s' % m._name_)

        if value:
            # Bits not covered by any named member are shown in hex.
            members.append(hex(value))

        res = '|'.join(members)

        if negative:
            if len(members) > 1:
                res = '~(%s)' % res
            else:
                res = '~%s' % res

        return res

    # object.__str__ delegates to __repr__, so str() shows the same form.
    __str__ = object.__str__
123
# Re-export the flag members as module-level names (A, ASCII, I, ...).
globals().update(RegexFlag.__members__)

# NOTE(review): the regex package documents version 0 (the old,
# re-compatible) behaviour as the default — confirm that VERSION1 is
# intended here rather than VERSION0.
DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags.
# Global flags apply to the pattern as a whole and can't be scoped.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
# Scoped flags can be turned on/off for part of the pattern.
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# "" is included so that the end of the pattern counts as special.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags: inline flag letter -> flag value.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# FULLCASE alone maps to 0: full case-folding matters only with IGNORECASE.
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
175
# The names of the opcodes. The position of each name in this list defines
# its numeric value (see the enumerate() loop building OP below).
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
FUZZY_EXT
"""
260
# Define the opcodes in a namespace.
class Namespace:
    "An empty class used as a simple attribute namespace."
    pass
264
# Build the OP namespace: each opcode name gets its list index as value.
OP = Namespace()
for i, op in enumerate(OPCODES.split()):
    setattr(OP, op, i)
268
269def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
270 """Make room in the given cache.
271
272 Args:
273 cache_dict: The cache dictionary to modify.
274 args_dict: The dictionary of named list args used by patterns.
275 max_length: Maximum # of entries in cache_dict before it is shrunk.
276 divisor: Cache will shrink to max_length - 1/divisor*max_length items.
277 """
278 # Toss out a fraction of the entries at random to make room for new ones.
279 # A random algorithm was chosen as opposed to simply cache_dict.popitem()
280 # as popitem could penalize the same regular expression repeatedly based
281 # on its internal hash value. Being random should spread the cache miss
282 # love around.
283 cache_keys = tuple(cache_dict.keys())
284 overage = len(cache_keys) - max_length
285 if overage < 0:
286 # Cache is already within limits. Normally this should not happen
287 # but it could due to multithreading.
288 return
289
290 number_to_toss = max_length // divisor + overage
291
292 # The import is done here to avoid a circular dependency.
293 import random
294 if not hasattr(random, 'sample'):
295 # Do nothing while resolving the circular dependency:
296 # re->random->warnings->tokenize->string->re
297 return
298
299 for doomed_key in random.sample(cache_keys, number_to_toss):
300 try:
301 del cache_dict[doomed_key]
302 except KeyError:
303 # Ignore problems if the cache changed from another thread.
304 pass
305
306 # Rebuild the arguments and locale-sensitivity dictionaries.
307 args_dict.clear()
308 sensitivity_dict = {}
309 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
310 args_dict[pattern, pattern_type, flags, default_version, locale] = args
311 try:
312 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
313 except KeyError:
314 pass
315
316 locale_sensitive.clear()
317 locale_sensitive.update(sensitivity_dict)
318
def _fold_case(info, string):
    "Folds the case of a string."
    flags = info.flags
    # Without an explicit encoding flag, fall back to the guessed one.
    if not (flags & _ALL_ENCODINGS):
        flags |= info.guess_encoding

    return _regex.fold_case(flags, string)
326
def is_cased_i(info, char):
    "Checks whether a character is cased."
    cases = _regex.get_all_cases(info.flags, char)
    return len(cases) > 1
330
def is_cased_f(flags, char):
    "Checks whether a character is cased."
    cases = _regex.get_all_cases(flags, char)
    return len(cases) > 1
334
def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)
    # A missing or match-anything firstset is useless for optimisation.
    if not checked or isinstance(checked, AnyAll):
        return []

    return checked.compile(reverse)
344
def _check_firstset(info, reverse, fs):
    "Checks the firstset for the pattern; returns None if it isn't usable."
    if not fs or None in fs:
        return None

    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for i in fs:
        # A negated character can't contribute to a usable firstset.
        if isinstance(i, Character) and not i.positive:
            return None

#        if i.case_flags:
#            if isinstance(i, Character):
#                if is_cased_i(info, i.value):
#                    return []
#            elif isinstance(i, SetBase):
#                return []
        # Accumulate the case flags of all members; store each member
        # itself case-insensitively (flags are re-applied on the union).
        case_flags |= i.case_flags
        members.add(i.with_flags(case_flags=NOCASE))

    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    fs = fs.optimise(info, reverse, in_set=True)

    return fs
375
376def _flatten_code(code):
377 "Flattens the code from a list of tuples."
378 flat_code = []
379 for c in code:
380 flat_code.extend(c)
381
382 return flat_code
383
def make_case_flags(info):
    "Makes the case flags."
    flags = info.flags & CASE_FLAGS

    # FULLCASE is a Unicode notion, so drop it when ASCII is turned on.
    return flags & ~FULLCASE if info.flags & ASCII else flags
393
def make_character(info, value, in_set=False):
    "Makes a character literal."
    # A character set is built case-sensitively; outside a set the current
    # case flags apply.
    if in_set:
        return Character(value)

    return Character(value, case_flags=make_case_flags(info))
401
def make_ref_group(info, name, position):
    "Makes a group reference."
    case_flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=case_flags)
405
def make_string_set(info, name):
    "Makes a string set."
    case_flags = make_case_flags(info)
    return StringSet(info, name, case_flags=case_flags)
409
def make_property(info, prop, in_set):
    "Makes a property."
    # Inside a set the property is used as-is; outside it picks up the
    # current case flags.
    if in_set:
        return prop

    return prop.with_flags(case_flags=make_case_flags(info))
416
def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    # Collect "|"-separated alternatives.
    branches = [parse_sequence(source, info)]
    while source.match("|"):
        branches.append(parse_sequence(source, info))

    return branches[0] if len(branches) == 1 else Branch(branches)
426
def parse_sequence(source, info):
    "Parses a sequence, eg. 'abc'."
    # The leading None is a sentinel: apply_quantifier/apply_constraint pop
    # the previous element and use None to mean "nothing to repeat".
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # It was an inline flag; the case flags may have changed.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    # Re-add the sentinel so a second quantifier in a row is
                    # reported as "multiple repeat" by apply_quantifier.
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)
                    if constraints:
                        # It _is_ a fuzzy constraint.
                        apply_constraint(source, info, constraints, case_flags,
                          saved_pos, sequence)
                        sequence.append(None)
                    else:
                        # The element was just a literal.
                        sequence.append(Character(ord(ch),
                          case_flags=case_flags))
            else:
                # A literal.
                sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    # Drop the sentinels before building the sequence.
    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)
510
def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    "Applies a quantifier to the preceding element of a sequence."
    target = sequence.pop()
    if target is None:
        # The sentinel: either the sequence is empty ("nothing to repeat")
        # or the previous element was itself a quantifier.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(target, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    suffix_pos = source.pos
    suffix = source.get()
    if suffix == "?":
        # A "?" suffix makes it a lazy repeat.
        repeat_class = LazyRepeat
    elif suffix == "+":
        # A "+" suffix makes it a possessive repeat.
        repeat_class = PossessiveRepeat
    else:
        # No suffix: a greedy repeat; push the character back.
        source.pos = suffix_pos
        repeat_class = GreedyRepeat

    # A quantifier on a zero-width item, or a fixed {1,1} repeat, is a
    # no-op.
    if not target.is_empty() and (min_count != 1 or max_count != 1):
        target = repeat_class(target, min_count, max_count)

    sequence.append(target)
542
def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    "Applies a fuzzy constraint to the preceding element of a sequence."
    target = sequence.pop()
    if target is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    # If a group is marked as fuzzy, the fuzziness is absorbed into the
    # group's subpattern.
    if isinstance(target, Group):
        target.subpattern = Fuzzy(target.subpattern, constraints)
        sequence.append(target)
    else:
        sequence.append(Fuzzy(target, constraints))
556
557_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
558
def parse_quantifier(source, info, ch):
    "Parses a quantifier; returns (min, max) counts or None."
    counts = _QUANTIFIERS.get(ch)
    if counts:
        # One of the simple quantifiers "?", "*" or "+".
        return counts

    if ch == "{":
        # Could be a bounded repeat such as 'a{2,3}' (None if it isn't).
        return parse_limited_quantifier(source)

    return None
573
def is_above_limit(count):
    "Checks whether a count is above the maximum."
    if count is None:
        # An unbounded count is never above the limit.
        return False

    return count >= UNLIMITED
577
def parse_limited_quantifier(source):
    "Parses a limited quantifier such as '{2,3}'; returns None if invalid."
    saved_pos = source.pos
    min_count = parse_count(source)
    if source.match(","):
        max_count = parse_count(source)

        # An empty minimum means 0; an empty maximum means unlimited.
        min_count = int(min_count) if min_count else 0
        max_count = int(max_count) if max_count else None
    else:
        if not min_count:
            # Not a quantifier after all.
            source.pos = saved_pos
            return None

        min_count = max_count = int(min_count)

    if not source.match("}"):
        # No closing brace, so not a quantifier.
        source.pos = saved_pos
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, saved_pos)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          saved_pos)

    return min_count, max_count
607
def parse_fuzzy(source, info, ch, case_flags):
    "Parses a fuzzy setting, if present; returns None otherwise."
    if ch != "{":
        return None

    saved_pos = source.pos

    constraints = {}
    try:
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all; back out.
        source.pos = saved_pos
        return None

    if source.match(":"):
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints
631
def parse_fuzzy_item(source, constraints):
    "Parses a fuzzy setting item."
    saved_pos = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a cost constraint; retry from the same position as a cost
        # equation.
        source.pos = saved_pos

        parse_cost_equation(source, constraints)
641
def parse_cost_constraint(source, constraints):
    """Parses a cost constraint, eg. 'e<=2' or '1<=e<=2', storing the
    (min, max) bounds in 'constraints' keyed by the constraint letter.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits?
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()
702
def parse_cost_limit(source):
    "Parses a cost limit."
    cost_pos = source.pos
    digits = parse_count(source)

    # parse_count returns only decimal digits, so int() succeeds whenever
    # anything was read at all.
    if digits:
        return int(digits)

    raise error("bad fuzzy cost limit", source.string, cost_pos)
714
def parse_constraint(source, constraints, ch):
    "Parses a constraint letter."
    # Reject unknown constraint names and duplicates.
    if ch not in "deis" or ch in constraints:
        raise ParseError()

    return ch
724
def parse_fuzzy_compare(source):
    "Parses a cost comparator."
    # True for an inclusive bound ("<="), False for an exclusive one ("<"),
    # None when no comparator is present. "<=" must be tried first.
    for token, inclusive in (("<=", True), ("<", False)):
        if source.match(token):
            return inclusive

    return None
733
def parse_cost_equation(source, constraints):
    "Parses a cost equation, eg. '2i+2d+1s<=4'."
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}

    # One or more "+"-separated terms.
    parse_cost_term(source, cost)
    while source.match("+"):
        parse_cost_term(source, cost)

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    limit = int(parse_count(source))

    # An exclusive bound is converted to an inclusive one.
    if not max_inc:
        limit -= 1

    if limit < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = limit

    constraints["cost"] = cost
760
def parse_cost_term(source, cost):
    "Parses a cost equation term, eg. '2i'."
    coefficient = parse_count(source)
    kind = source.get()
    if kind not in "dis":
        raise ParseError()

    if kind in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient means 1.
    cost[kind] = int(coefficient) if coefficient else 1
772
def parse_fuzzy_test(source, info, case_flags):
    "Parses the test of a fuzzy constraint."
    saved_pos = source.pos
    ch = source.get()
    if ch in SPECIAL_CHARS:
        if ch == "\\":
            # An escape sequence outside a set.
            return parse_escape(source, info, False)
        if ch == ".":
            # Any character.
            if info.flags & DOTALL:
                return AnyAll()
            if info.flags & WORD:
                return AnyU()
            return Any()
        if ch == "[":
            # A character set.
            return parse_set(source, info)
        raise error("expected character set", source.string, saved_pos)
    if ch:
        # A literal.
        return Character(ord(ch), case_flags=case_flags)

    raise error("expected character set", source.string, saved_pos)
798
def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    digits = source.get_while(DIGITS)
    return digits
802
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns None when it
    was an inline flag (the flags have already been applied to 'info');
    otherwise returns the parsed element.
    """
    saved_pos = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Scoped flags never leak out of the group.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            # (*WORD): a verb such as (*PRUNE).
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)
892
def parse_extension(source, info):
    "Parses a Python extension, ie. the part after '(?P'."
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Scoped flags never leak out of the group.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)
928
def parse_comment(source):
    "Parses a comment, ie. the part after '(?#'."
    # Scan to the closing ")", honouring backslash escapes; the final
    # position is pushed back so that expect(")") consumes the ")".
    while True:
        here = source.pos
        char = source.get(True)

        if not char or char == ")":
            break

        if char == "\\":
            source.get(True)

    source.pos = here
    source.expect(")")

    return None
945
def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround."
    old_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags never leak out of the lookaround.
        info.flags = old_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, subpattern)
957
def parse_conditional(source, info):
    "Parses a conditional subpattern, ie. the part after '(?('."
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

            source.pos = saved_pos
            raise error("expected lookaround conditional", source.string,
              source.pos)

    # Not a lookaround conditional: a group-existence conditional.
    source.pos = saved_pos
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            no_branch = Sequence()

        source.expect(")")
    finally:
        # Scoped flags never leak out of the conditional.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    # Both branches empty: the conditional matches nothing either way.
    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)
1000
def parse_lookaround_conditional(source, info, behind, positive):
    "Parses a lookaround conditional subpattern."
    old_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags never leak out of the lookaround.
        info.flags = old_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    if source.match("|"):
        no_branch = parse_sequence(source, info)
    else:
        # A missing "no" branch matches the empty string.
        no_branch = Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, subpattern, yes_branch,
      no_branch)
1020
def parse_atomic(source, info):
    "Parses an atomic subpattern."
    old_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags never leak out of the atomic group.
        info.flags = old_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(subpattern)
1032
def parse_common(source, info):
    "Parses a common groups branch."
    # Each branch restarts group numbering so the branches can reuse the
    # same group numbers; afterwards the highest count wins.
    start_count = info.group_count
    branches = [parse_sequence(source, info)]
    highest_count = info.group_count
    while source.match("|"):
        info.group_count = start_count
        branches.append(parse_sequence(source, info))
        highest_count = max(highest_count, info.group_count)

    info.group_count = highest_count
    source.expect(")")

    return branches[0] if len(branches) == 1 else Branch(branches)
1050
def parse_call_group(source, info, ch, pos):
    "Parses a call to a group."
    # "R" is a synonym for calling group 0 (the whole pattern).
    group = "0" if ch == "R" else ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)
1061
def parse_call_named_group(source, info, pos):
    "Parses a call to a named group."
    name = parse_name(source)
    source.expect(")")

    return CallGroup(info, name, pos)
1068
def parse_flag_set(source):
    "Parses a set of inline flags."
    flags = 0

    # Keep consuming flag letters until one isn't recognised; the KeyError
    # from the lookup table ends the loop and restores the position.
    try:
        while True:
            here = source.pos
            letter = source.get()
            if letter == "V":
                # The version flags are two characters: "V0" or "V1".
                letter += source.get()
            flags |= REGEX_FLAGS[letter]
    except KeyError:
        source.pos = here

    return flags
1084
def parse_flags(source, info):
    "Parses flags being turned on/off."
    flags_on = parse_flag_set(source)
    flags_off = 0
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off
1101
def parse_subpattern(source, info, flags_on, flags_off):
    """Parses a subpattern with scoped flags, eg. the '(?i:...)' form.

    The scoped flags apply only while the subpattern is being parsed; the
    caller's flags are restored afterwards.
    """
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set: an encoding
    # flag turned on here replaces any encoding flag inherited from the
    # enclosing scope. The test must be on flags_on, not info.flags —
    # otherwise a scope that sets no encoding flag at all (eg. '(?i:...)')
    # would wrongly clear an inherited encoding flag.
    if flags_on & _ALL_ENCODINGS:
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the enclosing scope's flags even on a parse error.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern
1120
def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's inline flags, then None is returned.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    unseen_globals = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if unseen_globals:
        info.global_flags |= unseen_globals

        # A global flag has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # From here on only scoped flags matter.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # A subpattern with scoped flags.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # Positional (inline) flags.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)
1155
def parse_positional_flags(source, info, flags_on, flags_off):
    "Parses positional flags."
    # Positional flags change the parse state from this point onwards.
    new_flags = (info.flags | flags_on) & ~flags_off
    info.flags = new_flags
    source.ignore_space = bool(new_flags & VERBOSE)
1160
def parse_name(source, allow_numeric=False, allow_group_0=False):
    "Parses a name."
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        # A numeric name is valid only when the caller allows group numbers,
        # and only from the minimum group number upwards.
        min_group = 0 if allow_group_0 else 1
        valid = allow_numeric and int(name) >= min_group
    else:
        # A textual name must be a valid identifier.
        valid = name.isidentifier()

    if not valid:
        raise error("bad character in group name", source.string, source.pos)

    return name
1179
def is_octal(text):
    "Checks whether a string consists entirely of octal digits."
    # The parameter is named `text` rather than `string` to avoid shadowing
    # the stdlib `string` module imported at the top of this file. All
    # callers pass the argument positionally.
    return all(ch in OCT_DIGITS for ch in text)
1183
def is_decimal(text):
    "Checks whether a string consists entirely of decimal digits."
    # The parameter is named `text` rather than `string` to avoid shadowing
    # the stdlib `string` module imported at the top of this file. All
    # callers pass the argument positionally.
    return all(ch in DIGITS for ch in text)
1187
def is_hexadecimal(text):
    "Checks whether a string consists entirely of hexadecimal digits."
    # The parameter is named `text` rather than `string` to avoid shadowing
    # the stdlib `string` module imported at the top of this file. All
    # callers pass the argument positionally.
    return all(ch in HEX_DIGITS for ch in text)
1191
def parse_escape(source, info, in_set):
    """Parses an escape sequence (the backslash has already been consumed).

    `in_set` is True when parsing inside a character set; positional escapes
    and group references aren't allowed there.

    Returns a parsed node, or raises `error` for a bad escape.
    """
    # Read the character after the backslash verbatim, even in verbose mode
    # where whitespace is normally skipped.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence; HEX_ESCAPES maps the escape letter
        # to its expected number of digits.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending: either CR LF as a unit, or any single line-ending
        # character, matched atomically.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            # Unicode adds NEL, LINE SEPARATOR and PARAGRAPH SEPARATOR.
            charset.extend([0x85, 0x2028, 0x2029])

        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            # The flags select which table of positional escapes applies.
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # Charset escapes, looked up in the table for the current encoding.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # Single-character escapes: CHARACTER_ESCAPES maps the letter to the
        # literal character it stands for.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
1274
def parse_numeric_escape(source, info, ch, in_set):
    "Parses a numeric escape sequence."
    # Inside a set, or starting with "0", it can only be an octal escape.
    if in_set or ch == "0":
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either an octal escape or a group reference.
    group_digits = ch
    backtrack_pos = source.pos
    next_ch = source.get()
    if next_ch in DIGITS:
        # At least 2 digits, so still ambiguous.
        group_digits += next_ch
        backtrack_pos = source.pos
        next_ch = source.get()
        if is_octal(group_digits) and next_ch in OCT_DIGITS:
            # 3 octal digits, so it's an octal escape sequence.
            scheme = info.flags & _ALL_ENCODINGS
            # Bytes-oriented encodings wrap at 0xFF; otherwise at 0x1FF.
            if scheme in (ASCII, LOCALE):
                mask = 0xFF
            else:
                mask = 0x1FF

            return make_character(info, int(group_digits + next_ch, 8) & mask)

    # It's a group reference.
    source.pos = backtrack_pos
    if info.is_open_group(group_digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, group_digits, source.pos)
1307
def parse_octal_escape(source, info, digits, in_set):
    "Parses an octal escape sequence (max 3 digits)."
    mark = source.pos
    ch = source.get()
    while ch in OCT_DIGITS and len(digits) < 3:
        digits.append(ch)
        mark = source.pos
        ch = source.get()

    # Push back the first non-octal character.
    source.pos = mark
    escape = "".join(digits)
    try:
        return make_character(info, int(escape, 8), in_set)
    except ValueError:
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % escape, source.string,
              source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)
1328
def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    "Parses a hex escape sequence with a fixed number of digits."
    start_pos = source.pos
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, start_pos)
        digits.append(ch)

    try:
        value = int("".join(digits), 16)
    except ValueError:
        value = None

    # Only codepoints within the Unicode range are acceptable.
    if value is not None and value < 0x110000:
        return make_character(info, value, in_set)

    # Bad hex escape.
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, start_pos)
1351
def parse_group_ref(source, info):
    "Parses a group reference of the form <name>."
    source.expect("<")
    name_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")
    # A group can't refer to itself while it's still being parsed.
    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, name_pos)
1362
def parse_string_set(source, info):
    "Parses a string set reference of the form <name>."
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")
    # The named list must have been passed in as a keyword argument.
    # (parse_name never returns None; the guard is kept for safety.)
    if name is None or name not in info.kwargs:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, name)
1372
def parse_named_char(source, info, in_set):
    "Parses a named character, e.g. \\N{LATIN SMALL LETTER A}."
    start = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                char = unicodedata.lookup(name)
                return make_character(info, ord(char), in_set)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character; treat the "N" as a literal.
    source.pos = start
    return make_character(info, ord("N"), in_set)
1388
def parse_property(source, info, positive, in_set):
    "Parses a Unicode property, either delimited (\\p{...}) or abbreviated."
    def current_encoding():
        # The encoding in force, in order of precedence.
        if info.flags & ASCII:
            return ASCII_ENCODING
        if info.flags & UNICODE:
            return UNICODE_ENCODING
        return 0

    start = source.pos
    ch = source.get()
    if ch == "{":
        # A "^" just inside the braces negates the property.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            prop = lookup_property(prop_name, name, positive != negate,
              source, encoding=current_encoding())
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        prop = lookup_property(None, ch, positive, source,
          encoding=current_encoding())
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = start
    return make_character(info, ord("p" if positive else "P"), in_set)
1424
def parse_property_name(source):
    """Parses a property name, which may be qualified.

    Returns a (prop_name, name) pair: prop_name is the qualifier before a
    ":" or "=" separator, or None when the name is unqualified.
    """
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Backtrack to just after the last character that was consumed as part
    # of the name.
    source.pos = saved_pos
    return prop_name, name
1446
def parse_set(source, info):
    "Parses a character set (the opening '[' has been consumed)."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # Whitespace is significant inside a set, even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    # A leading "^" negates the set.
    negate = source.match("^")
    try:
        if version == VERSION0:
            # Version 0 behaviour has no set operators.
            item = parse_set_imp_union(source, info)
        else:
            item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    return item.with_flags(case_flags=make_case_flags(info))
1472
def parse_set_union(source, info):
    "Parses a set union ([x||y])."
    items = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        items.append(parse_set_symm_diff(source, info))

    # Don't wrap a lone operand.
    return items[0] if len(items) == 1 else SetUnion(info, items)
1482
def parse_set_symm_diff(source, info):
    "Parses a set symmetric difference ([x~~y])."
    items = [parse_set_inter(source, info)]
    while source.match("~~"):
        items.append(parse_set_inter(source, info))

    # Don't wrap a lone operand.
    return items[0] if len(items) == 1 else SetSymDiff(info, items)
1492
def parse_set_inter(source, info):
    "Parses a set intersection ([x&&y])."
    items = [parse_set_diff(source, info)]
    while source.match("&&"):
        items.append(parse_set_diff(source, info))

    # Don't wrap a lone operand.
    return items[0] if len(items) == 1 else SetInter(info, items)
1502
def parse_set_diff(source, info):
    "Parses a set difference ([x--y])."
    items = [parse_set_imp_union(source, info)]
    while source.match("--"):
        items.append(parse_set_imp_union(source, info))

    # Don't wrap a lone operand.
    return items[0] if len(items) == 1 else SetDiff(info, items)
1512
def parse_set_imp_union(source, info):
    "Parses a set implicit union ([xy])."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        mark = source.pos
        if source.match("]"):
            # End of the set; leave the "]" for the caller.
            source.pos = mark
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators; leave them for the caller.
            source.pos = mark
            break

        items.append(parse_set_member(source, info))

    # Don't wrap a lone member.
    return items[0] if len(items) == 1 else SetUnion(info, items)
1535
def parse_set_member(source, info):
    """Parses a member in a character set.

    A member is a single item or a range such as a-z; ambiguities with the
    '--' operator and a trailing hyphen are resolved by backtracking.
    """
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A degenerate range of a single character.
        return start

    return Range(start.value, end.value)
1576
def parse_set_item(source, info):
    "Parses an item in a character set."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    mark = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class after all; backtrack.
            source.pos = mark

    if version == VERSION1 and source.match("["):
        # The start of a nested set; a leading "^" negates it.
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    # A literal member of the set.
    return Character(ord(ch))
1614
def parse_posix_class(source, info):
    "Parses a POSIX character class such as [:alpha:]."
    negate = source.match("^")
    prop_name, name = parse_property_name(source)
    if source.match(":]"):
        return lookup_property(prop_name, name, not negate, source, posix=True)

    # Not terminated by ":]", so it isn't a POSIX class after all.
    raise ParseError()
1623
def float_to_rational(flt):
    """Converts a float to a rational pair (numerator, denominator).

    Uses a continued-fraction expansion, stopping when the remaining
    fractional part is negligible (< 0.0001).
    """
    int_part = int(flt)
    # `frac` was previously named `error`, which shadowed this module's
    # `error` exception class.
    frac = flt - int_part
    if abs(frac) < 0.0001:
        return int_part, 1

    # Recurse on the reciprocal of the fractional part.
    den, num = float_to_rational(1.0 / frac)

    return int_part * den + num, den
1634
def numeric_to_rational(numeric):
    "Converts a numeric string to a rational string, if possible."
    # Strip and remember a leading minus sign.
    if numeric.startswith("-"):
        sign, numeric = "-", numeric[1 : ]
    else:
        sign = ""

    parts = numeric.split("/")
    if len(parts) == 2:
        # Already a fraction; normalise it.
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    elif len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    else:
        raise ValueError()

    result = "{}{}/{}".format(sign, num, den)
    # A denominator of 1 is implied.
    return result[ : -2] if result.endswith("/1") else result
1655
def standardise_name(name):
    "Standardises a property or value name."
    # `name` may be a string or a sequence of characters.
    try:
        return numeric_to_rational("".join(name))
    except (ValueError, ZeroDivisionError):
        # Not numeric: drop underscores, hyphens and spaces, and uppercase.
        return "".join(ch for ch in name if ch not in "_- ").upper()
1662
1663_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())
1664
1665_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
1666
def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    """Looks up a property and returns a `Property` node.

    `property` is the property name (or None when only a value was given)
    and `value` the value name; both may be strings or character lists.
    Raises `error` if the property or value is unknown; the error carries
    position info only when `source` is provided.

    NOTE: the `property` parameter shadows the `property` builtin; renaming
    it would change the keyword interface, so it's left as-is.
    """
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    # "Assigned" is expressed as the inverse of "Unassigned".
    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    # Under POSIX rules these class names are looked up with a "POSIX"
    # prefix.
    if posix and not property and value.upper() in _POSIX_CLASSES:
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        # The property id occupies the high 16 bits, the value id the low.
        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        # Not a binary property: use value id 0 with the sense inverted.
        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix.
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            # Both prefixes are 2 characters long.
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)
1736
def _compile_replacement(source, pattern, is_unicode):
    """Compiles a replacement template escape sequence.

    Returns a (is_group, codes) pair: when is_group is True, `codes` holds a
    single group index; otherwise it holds literal codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group reference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    # Octal values wrap at 0xFF for bytes patterns and at 0x1FF for str
    # patterns.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence, max 3 digits.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                # Push back the non-octal character.
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
1808
def parse_repl_hex_escape(source, expected_len, type):
    "Parses a hex escape sequence in a replacement string."
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, source.pos)
        digits.append(ch)

    return int("".join(digits), 16)
1820
def parse_repl_named_char(source):
    "Parses a named character in a replacement string."
    mark = source.pos
    if source.match("{"):
        name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                return ord(unicodedata.lookup(name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character; backtrack.
    source.pos = mark
    return None
1837
def compile_repl_group(source, pattern):
    "Compiles a replacement template group reference."
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")
    if name.isdigit():
        # A group number; it must refer to an existing group.
        index = int(name)
        if 0 <= index <= pattern.groups:
            return index

        raise error("invalid group reference", source.string, source.pos)

    # A group name; an unknown name raises IndexError.
    if name in pattern.groupindex:
        return pattern.groupindex[name]

    raise IndexError("unknown group")
1855
# The regular expression is parsed into a syntax tree. The different types of
# node are defined below.

# Indentation unit used by the nodes' dump() methods.
INDENT = "  "
# Bit flags ORed into compiled opcodes (see e.g. ZeroWidthBase._compile).
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# The encoding value is stored in the bits above the flag bits.
ENCODING_OP_SHIFT = 5

# Text used by dump() methods for positive/negative matching.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
# Text for the case-insensitivity flags; presumably also used by dump()
# methods further down the file — confirm against the node classes.
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
1870
def make_sequence(items):
    # A single item doesn't need to be wrapped in a Sequence node.
    if len(items) != 1:
        return Sequence(items)

    return items[0]
1875
# Common base class for all nodes.
class RegexBase:
    """Common base class for all syntax-tree nodes.

    The defaults describe a node that contains no groups, captures nothing,
    and provides no first-set information; subclasses override as needed.
    """

    def __init__(self):
        # The key used by __hash__/__eq__; subclasses extend it with their
        # distinguishing attributes.
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        # Returns a node with the given flags changed; returns self when
        # nothing would change. None means "keep the current value".
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            # Normalise to a canonical combination of case flags.
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolves group references; nothing to do by default.
        pass

    def optimise(self, info, reverse):
        # Returns an optimised replacement for this node; default is itself.
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # By default a node can't provide a first-set.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        # By default there's no required string (None); the first element is
        # this node's maximum width.
        return self.max_width(), None
1945
# Base class for zero-width nodes.
class ZeroWidthBase(RegexBase):
    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        self.positive = bool(positive)
        self.encoding = encoding

        # Note: the encoding isn't part of the equality key.
        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # A zero-width node consumes nothing, so it doesn't constrain the
        # first character.
        return set([None])

    def _compile(self, reverse, fuzzy):
        # Start with the encoding bits, then OR in the condition flags.
        flags = self.encoding << ENCODING_OP_SHIFT
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        # Zero-width: matches no characters.
        return 0
1975
class Any(RegexBase):
    # Node for the ANY opcode: matches exactly one character.
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = FUZZY_OP if fuzzy else 0
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        # Always matches exactly one character.
        return 1
1994
class AnyAll(Any):
    # Node for the ANY_ALL opcode.
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"

    def __init__(self):
        # Sets the node attributes directly instead of going through
        # RegexBase.__init__.
        self.case_flags = 0
        self.zerowidth = False
        self.positive = True

        self._key = self.__class__, self.positive
2005
class AnyU(Any):
    # Node for the ANY_U opcode; NOTE(review): presumably "." under Unicode
    # line-ending rules — confirm against the engine's opcode semantics.
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
2009
class Atomic(RegexBase):
    """An atomic (non-backtracking) group node wrapping a subpattern."""

    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An empty subpattern doesn't need an atomic wrapper.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    # The following queries simply delegate to the wrapped subpattern.
    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Bracket the compiled subpattern with ATOMIC ... END opcodes.
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)
2065
class Boundary(ZeroWidthBase):
    # A zero-width boundary assertion node (BOUNDARY opcode).
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
2069
2070class Branch(RegexBase):
2071 def __init__(self, branches):
2072 RegexBase.__init__(self)
2073 self.branches = branches
2074
2075 def fix_groups(self, pattern, reverse, fuzzy):
2076 for b in self.branches:
2077 b.fix_groups(pattern, reverse, fuzzy)
2078
2079 def optimise(self, info, reverse):
2080 if not self.branches:
2081 return Sequence([])
2082
2083 # Flatten branches within branches.
2084 branches = Branch._flatten_branches(info, reverse, self.branches)
2085
2086 # Move any common prefix or suffix out of the branches.
2087 if reverse:
2088 suffix, branches = Branch._split_common_suffix(info, branches)
2089 prefix = []
2090 else:
2091 prefix, branches = Branch._split_common_prefix(info, branches)
2092 suffix = []
2093
2094 # Try to reduce adjacent single-character branches to sets.
2095 branches = Branch._reduce_to_set(info, reverse, branches)
2096
2097 if len(branches) > 1:
2098 sequence = [Branch(branches)]
2099
2100 if not prefix or not suffix:
2101 # We might be able to add a quick precheck before the branches.
2102 firstset = self._add_precheck(info, reverse, branches)
2103
2104 if firstset:
2105 if reverse:
2106 sequence.append(firstset)
2107 else:
2108 sequence.insert(0, firstset)
2109 else:
2110 sequence = branches
2111
2112 return make_sequence(prefix + sequence + suffix)
2113
2114 def _add_precheck(self, info, reverse, branches):
2115 charset = set()
2116 pos = -1 if reverse else 0
2117
2118 for branch in branches:
2119 if type(branch) is Literal and branch.case_flags == NOCASE:
2120 charset.add(branch.characters[pos])
2121 else:
2122 return
2123
2124 if not charset:
2125 return None
2126
2127 return _check_firstset(info, reverse, [Character(c) for c in charset])
2128
2129 def pack_characters(self, info):
2130 self.branches = [b.pack_characters(info) for b in self.branches]
2131 return self
2132
2133 def remove_captures(self):
2134 self.branches = [b.remove_captures() for b in self.branches]
2135 return self
2136
2137 def is_atomic(self):
2138 return all(b.is_atomic() for b in self.branches)
2139
2140 def can_be_affix(self):
2141 return all(b.can_be_affix() for b in self.branches)
2142
2143 def contains_group(self):
2144 return any(b.contains_group() for b in self.branches)
2145
2146 def get_firstset(self, reverse):
2147 fs = set()
2148 for b in self.branches:
2149 fs |= b.get_firstset(reverse)
2150
2151 return fs or set([None])
2152
2153 def _compile(self, reverse, fuzzy):
2154 if not self.branches:
2155 return []
2156
2157 code = [(OP.BRANCH, )]
2158 for b in self.branches:
2159 code.extend(b.compile(reverse, fuzzy))
2160 code.append((OP.NEXT, ))
2161
2162 code[-1] = (OP.END, )
2163
2164 return code
2165
2166 def dump(self, indent, reverse):
2167 print("{}BRANCH".format(INDENT * indent))
2168 self.branches[0].dump(indent + 1, reverse)
2169 for b in self.branches[1 : ]:
2170 print("{}OR".format(INDENT * indent))
2171 b.dump(indent + 1, reverse)
2172
2173 @staticmethod
2174 def _flatten_branches(info, reverse, branches):
2175 # Flatten the branches so that there aren't branches of branches.
2176 new_branches = []
2177 for b in branches:
2178 b = b.optimise(info, reverse)
2179 if isinstance(b, Branch):
2180 new_branches.extend(b.branches)
2181 else:
2182 new_branches.append(b)
2183
2184 return new_branches
2185
    @staticmethod
    def _split_common_prefix(info, branches):
        # Common leading items can be moved out of the branches.
        #
        # Returns (prefix_items, new_branches); the prefix is [] when nothing
        # could be factored out.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                # A lone item is treated as a 1-item sequence.
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches, each without the common prefix.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches
2227
    @staticmethod
    def _split_common_suffix(info, branches):
        # Common trailing items can be moved out of the branches.
        #
        # Mirror image of _split_common_prefix: scans from the end using
        # negative indexes. Returns (suffix_items, new_branches).
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                # A lone item is treated as a 1-item sequence.
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix?
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
              in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches, each without the common suffix.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches
2268
    @staticmethod
    def _can_split(items, count):
        # Can the items be split after the first 'count' items without
        # breaking apart characters that case-fold together?
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True
2293
    @staticmethod
    def _can_split_rev(items, count):
        # Reversed counterpart of _can_split: 'count' is measured from the
        # end of the items.
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True
2320
2321 @staticmethod
2322 def _merge_common_prefixes(info, reverse, branches):
2323 # Branches with the same case-sensitive character prefix can be grouped
2324 # together if they are separated only by other branches with a
2325 # character prefix.
2326 prefixed = defaultdict(list)
2327 order = {}
2328 new_branches = []
2329 for b in branches:
2330 if Branch._is_simple_character(b):
2331 # Branch starts with a simple character.
2332 prefixed[b.value].append([b])
2333 order.setdefault(b.value, len(order))
2334 elif (isinstance(b, Sequence) and b.items and
2335 Branch._is_simple_character(b.items[0])):
2336 # Branch starts with a simple character.
2337 prefixed[b.items[0].value].append(b.items)
2338 order.setdefault(b.items[0].value, len(order))
2339 else:
2340 Branch._flush_char_prefix(info, reverse, prefixed, order,
2341 new_branches)
2342
2343 new_branches.append(b)
2344
2345 Branch._flush_char_prefix(info, prefixed, order, new_branches)
2346
2347 return new_branches
2348
2349 @staticmethod
2350 def _is_simple_character(c):
2351 return isinstance(c, Character) and c.positive and not c.case_flags
2352
    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        # Can the branches be reduced to a set?
        #
        # Consecutive single-item branches sharing the same case flags are
        # collected and flushed as one set; anything else passes through.
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items, case_flags,
                      new_branches)

                    case_flags = b.case_flags

                # Members are stored caseless; the shared flags are re-applied
                # when the set is flushed.
                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        # Flush any remaining collected members.
        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches
2380
    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        # Flush the prefixed branches.
        #
        # 'prefixed' maps a character value to the item lists of the branches
        # starting with that character; 'order' records first-seen order so
        # the original alternation order is preserved. Both are cleared on
        # return.
        if not prefixed:
            return

        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        # Only one empty alternative is needed.
                        subbranches.append(Sequence())
                        optional = True

                # Factor out the common leading character.
                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()
2406
2407 @staticmethod
2408 def _flush_set_members(info, reverse, items, case_flags, new_branches):
2409 # Flush the set members.
2410 if not items:
2411 return
2412
2413 if len(items) == 1:
2414 item = list(items)[0]
2415 else:
2416 item = SetUnion(info, list(items)).optimise(info, reverse)
2417
2418 new_branches.append(item.with_flags(case_flags=case_flags))
2419
2420 items.clear()
2421
2422 @staticmethod
2423 def _is_full_case(items, i):
2424 if not 0 <= i < len(items):
2425 return False
2426
2427 item = items[i]
2428 return (isinstance(item, Character) and item.positive and
2429 (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
2430
2431 @staticmethod
2432 def _is_folded(items):
2433 if len(items) < 2:
2434 return False
2435
2436 for i in items:
2437 if (not isinstance(i, Character) or not i.positive or not
2438 i.case_flags):
2439 return False
2440
2441 folded = "".join(chr(i.value) for i in items)
2442 folded = _regex.fold_case(FULL_CASE_FOLDING, folded)
2443
2444 # Get the characters which expand to multiple codepoints on folding.
2445 expanding_chars = _regex.get_expand_on_folding()
2446
2447 for c in expanding_chars:
2448 if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
2449 return True
2450
2451 return False
2452
2453 def is_empty(self):
2454 return all(b.is_empty() for b in self.branches)
2455
2456 def __eq__(self, other):
2457 return type(self) is type(other) and self.branches == other.branches
2458
2459 def max_width(self):
2460 return max(b.max_width() for b in self.branches)
2461
class CallGroup(RegexBase):
    """A subpattern call (recursion) to a numbered or named group."""

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve a group name to its number; numbers pass through unchanged.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        # Group 0 is the whole pattern; any other group may be defined only
        # once for the call to be unambiguous.
        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        self._key = self.__class__, self.group

    def remove_captures(self):
        # BUG FIX: 'pattern' was an undefined name here, turning this report
        # into a NameError. The pattern text isn't available at this point,
        # so raise the error without it.
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        # A group call can match an arbitrary amount of text.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
2507
class CallRef(RegexBase):
    """Wraps a parsed subpattern so it can be the target of group calls."""

    def __init__(self, ref, parsed):
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        # CALL_REF <ref> ... END brackets the callable subpattern.
        compiled = [(OP.CALL_REF, self.ref)]
        compiled.extend(self.parsed._compile(reverse, fuzzy))
        compiled.append((OP.END, ))
        return compiled
2516
class Character(RegexBase):
    """A single literal character.

    value is the codepoint; positive is False for a negated character;
    case_flags selects the (possibly case-insensitive) opcode variant;
    zerowidth is passed through to the opcode flags.
    """
    # Opcode selection keyed by (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            # Full case-folding may expand one character to several.
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        # Key used for hashing/equality (see RegexBase).
        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy of this character with the given attributes."""
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # A single character is already in its simplest form.
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding, so also try to
            # match the multi-character expansion.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        """Does codepoint ch match, taking the positive flag into account?"""
        return (ch == self.value) == self.positive

    def max_width(self):
        # The folded form may be longer than 1 under full case-folding.
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self
2590
class Conditional(RegexBase):
    """A conditional subpattern: matches yes_item if the given group has
    matched, otherwise no_item."""

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve a group name to its number; numbers pass through unchanged.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # BUG FIX: callers rebind their child to the result of this call
        # (x = x.remove_captures(), as done just above), so returning None
        # here would drop this node from the tree.
        return self

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        # GROUP_EXISTS <group> yes [NEXT no] END
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
2679
class DefaultBoundary(ZeroWidthBase):
    """Zero-width assertion compiled to the DEFAULT_BOUNDARY opcode."""
    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"
2683
class DefaultEndOfWord(ZeroWidthBase):
    """Zero-width assertion compiled to the DEFAULT_END_OF_WORD opcode."""
    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"
2687
class DefaultStartOfWord(ZeroWidthBase):
    """Zero-width assertion compiled to the DEFAULT_START_OF_WORD opcode."""
    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"
2691
class EndOfLine(ZeroWidthBase):
    """Zero-width assertion compiled to the END_OF_LINE opcode."""
    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"
2695
class EndOfLineU(EndOfLine):
    """EndOfLine variant compiled to the END_OF_LINE_U (Unicode) opcode."""
    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"
2699
class EndOfString(ZeroWidthBase):
    """Zero-width assertion compiled to the END_OF_STRING opcode."""
    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"
2703
class EndOfStringLine(ZeroWidthBase):
    """Zero-width assertion compiled to the END_OF_STRING_LINE opcode."""
    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"
2707
class EndOfStringLineU(EndOfStringLine):
    """EndOfStringLine variant compiled to END_OF_STRING_LINE_U (Unicode)."""
    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"
2711
class EndOfWord(ZeroWidthBase):
    """Zero-width assertion compiled to the END_OF_WORD opcode."""
    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"
2715
class Failure(ZeroWidthBase):
    """Zero-width item compiled to the FAILURE opcode (always fails)."""
    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        return [(OP.FAILURE, )]
2721
class Fuzzy(RegexBase):
    """An approximate (fuzzy) match of a subpattern.

    constraints may hold, per error type "d" (deletion), "i" (insertion),
    "s" (substitution) and "e" (any error), a (min, max) pair where max is
    None for unlimited, plus "cost" (the cost equation) and optionally
    "test". Missing entries are filled in with defaults below.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is itself compiled as fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits: (min, max) for each of d, i, s, e, with
        # None encoded as UNLIMITED.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            # FUZZY_EXT carries an extra test subpattern before the body.
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        # Insertions make the width unbounded.
        return UNLIMITED

    def _constraints_to_string(self):
        """Format the constraints back into their pattern syntax."""
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            # A maximum of 0 means this error type is disabled; omit it.
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        # The cost equation, e.g. "2i+1d+3s<=10".
        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)
2854
class Grapheme(RegexBase):
    """Matches a single extended grapheme cluster."""

    def _compile(self, reverse, fuzzy):
        # Match at least 1 character, lazily, until a grapheme boundary is
        # reached; this is the same whether matching forwards or backwards.
        one_or_more = LazyRepeat(AnyAll(), 1, None)
        matcher = Atomic(Sequence([one_or_more, GraphemeBoundary()]))

        return matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print("{}GRAPHEME".format(INDENT * indent))

    def max_width(self):
        # A cluster has no fixed upper length.
        return UNLIMITED
2869
class GraphemeBoundary:
    """Zero-width check compiled to the GRAPHEME_BOUNDARY opcode."""
    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]
2873
class GreedyRepeat(RegexBase):
    """A greedy repeat of a subpattern.

    min_count is the minimum number of repeats; max_count is the maximum,
    or None for unlimited. Subclasses override _opcode/_op_name (LazyRepeat)
    or _compile (PossessiveRepeat).
    """
    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) keeps the subclass (lazy/possessive) intact.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        # A variable repeat count allows backtracking, so only a fixed count
        # with an atomic body is atomic.
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # Zero repeats are allowed, so the repeat may match nothing.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        # <opcode> <min> <max> body END, with None encoded as UNLIMITED.
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        return ([tuple(repeat)] + subpattern + [(OP.END, )])

    def dump(self, indent, reverse):
        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # The body is optional, so it can't be required; report only the
            # offset past this repeat (capped at UNLIMITED).
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None
2965
class PossessiveRepeat(GreedyRepeat):
    """A repeat that never gives back what it has matched."""

    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        body = self.subpattern.compile(reverse, fuzzy)
        if not body:
            return []

        bound = UNLIMITED if self.max_count is None else self.max_count
        repeat = (self._opcode, self.min_count, bound)

        # The repeat is wrapped in ATOMIC to prevent backtracking into it.
        return [(OP.ATOMIC, ), repeat] + body + [(OP.END, ), (OP.END, )]

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        limit = "INF" if self.max_count is None else self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)
2995
class Group(RegexBase):
    """A capture group wrapping a subpattern."""

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Without its capture, the group reduces to its subpattern.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # Negative group numbers are private (hidden) groups; map to the
        # public number and a positive private slot.
        public_group = private_group = self.group
        if private_group < 0:
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        # If this group is the target of group calls, bracket it with a
        # CALL_REF so the calls can find it.
        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            # BUG FIX: 'private_groups' was an undefined name here; the
            # mapping lives on the pattern info, as used in _compile above.
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
3077
class Keep(ZeroWidthBase):
    """Zero-width item compiled to the KEEP opcode."""
    _opcode = OP.KEEP
    _op_name = "KEEP"
3081
class LazyRepeat(GreedyRepeat):
    """A repeat preferring the fewest repetitions; shares GreedyRepeat's
    structure and differs only in opcode."""
    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"
3085
class LookAround(RegexBase):
    """A lookaround assertion: lookahead or lookbehind, positive or
    negative. The subpattern is processed in the lookaround's own direction
    (self.behind), not the surrounding pattern's."""
    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        # A positive lookaround with an empty body always succeeds, so it
        # can be dropped.
        if self.positive and subpattern.is_empty():
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # BUG FIX: this previously returned the bare subpattern, which
        # discarded the lookaround wrapper (turning a zero-width assertion
        # into a consuming match). Strip the captures in place instead.
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        # Only a positive assertion looking in the same direction as the
        # match constrains the first item.
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Zero-width: consumes no text.
        return 0
3153
class LookAroundConditional(RegexBase):
    """A conditional whose condition is a lookaround: matches yes_item if
    the lookaround succeeds, otherwise no_item."""
    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # BUG FIX: callers rebind their child to the result of this call
        # (x = x.remove_captures()), so returning None would drop this node
        # from the tree.
        return self

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        # CONDITIONAL cond NEXT yes [NEXT no] END
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # NOTE(review): 'and' binds tighter than 'or' here, so this reads as
        # (subpattern and yes) or no — confirm that precedence is intended.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None
3238
class PrecompiledCode(RegexBase):
    """Wraps an already-built code sequence so it can sit in the parse
    tree."""
    def __init__(self, code):
        self.code = code

    def _compile(self, reverse, fuzzy):
        return [tuple(self.code)]
3245
class Property(RegexBase):
    """A character-property test: matches one character by property value."""
    # Opcode selection keyed by (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False, encoding=0):
        RegexBase.__init__(self)
        # value packs the property id (high 16 bits) and its value (low 16
        # bits), as unpacked in dump() below.
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        self.encoding = encoding

        # Key used for hashing/equality (see RegexBase).
        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy of this property test with the given attributes."""
        return Property(self.value, positive, case_flags, zerowidth,
          self.encoding)

    def optimise(self, info, reverse, in_set=False):
        # A single property test is already in its simplest form.
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # The encoding is carried in the upper bits of the flags field.
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
          ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        """Does codepoint ch match, taking the positive flag into account?"""
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1
3301
class Prune(ZeroWidthBase):
    "The backtracking-control (*PRUNE) verb."

    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        # PRUNE takes no operands.
        code = [(OP.PRUNE, )]
        return code
3307
class Range(RegexBase):
    "A range of characters, e.g. a-z inside a character set."

    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
      (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
      (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
      (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
    _op_name = "RANGE"

    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        # Inclusive codepoint bounds of the range.
        self.lower = lower
        self.upper = upper
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self._key = (self.__class__, self.lower, self.upper, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        "Makes a copy of this range with different flags."
        return Range(self.lower, self.upper, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        """Optimises the range for full case-folding.

        Characters in the range that expand to multiple codepoints when
        case-folded are split out into separate String alternatives.
        """
        # Is the range case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
          FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the range.
        items = []
        for ch in expanding_chars:
            if self.lower <= ord(ch) <= self.upper:
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                items.append(String([ord(c) for c in folded],
                  case_flags=self.case_flags))

        if not items:
            # We can fall back to simple case-folding.
            return self

        if len(items) < self.upper - self.lower + 1:
            # Not all the characters are covered by the full case-folding.
            items.insert(0, self)

        return Branch(items)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
          self.upper)]

    def dump(self, indent, reverse):
        # ascii() would prefix a "b"/"u" for bytes/unicode; strip it off.
        display_lower = ascii(chr(self.lower)).lstrip("bu")
        display_upper = ascii(chr(self.upper)).lstrip("bu")
        print("{}RANGE {} {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display_lower, display_upper,
          CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (self.lower <= ch <= self.upper) == self.positive

    def max_width(self):
        # A range always matches exactly one character.
        return 1
3384
class RefGroup(RegexBase):
    "A backreference to a capture group."

    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        # group may be a name or a number; fix_groups() resolves it.
        self.group = group
        # position is the offset of the reference within the pattern,
        # used for error reporting.
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolves the group reference to a valid group number.

        Raises error for an unknown group name or an out-of-range number.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # Rebuild the key now that the group number is known.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # BUGFIX: this previously referenced the undefined name 'pattern',
        # raising a NameError instead of the intended error. The pattern
        # string isn't available here, so pass None for it.
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        # The width of the referenced group isn't known here.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
3433
class SearchAnchor(ZeroWidthBase):
    # Zero-width assertion anchored at the position where the search
    # started.
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"
3437
class Sequence(RegexBase):
    "A sequence of regex items matched one after another."

    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        items = []
        # Pending codepoints not yet flushed into a Character/String item.
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither
                    # the previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither
                    # the previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # A non-character item: flush first, then pack it on its own.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def contains_group(self):
        return any(s.contains_group() for s in self.items)

    def get_firstset(self, reverse):
        # None in the result set marks "or the sequence may match empty
        # up to this point".
        fs = set()
        items = self.items
        # NOTE(review): reverse() mutates self.items in place here; this
        # matches the original behaviour and is left untouched.
        if reverse:
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # When matching in reverse, compile the items in reverse order.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        "Turns the pending codepoints into Character/String items."
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        # Clear the pending list in place (the caller holds a reference).
        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        # Record where each expanding character's folded form occurs.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Alternate simple-folding and full-folding literals.
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        "Merges overlapping (start, end) chunks after sorting them."
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        "Returns (offset, required item or None) for this sequence."
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None
3650
class SetBase(RegexBase):
    "Base class for character set operations (union, intersection, etc.)."

    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        "Makes a copy of this set with different flags, then optimises it."
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # The set opcode is followed by its members, ending with END.
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        """Expands the set for full case-folding if necessary.

        Returns a Branch of this set plus String items for any matching
        characters that fold to multiple codepoints, or self when simple
        case-folding suffices.
        """
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Usually 1, but a fully case-folded set may match a character
        # whose folded form is several codepoints long.
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
3755
class SetDiff(SetBase):
    "A set difference, e.g. [[a-z]--[aeiou]]."

    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = self.items
        if len(items) > 2:
            # More than 2 operands: subtract the union of the rest from the
            # first item, so that exactly 2 operands remain.
            items = [items[0], SetUnion(info, items[1 : ])]

        if len(items) == 1:
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(m.optimise(info, reverse, in_set=True) for m in
          items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # In the first item but not in the second.
        m = self.items[0].matches(ch) and not self.items[1].matches(ch)
        return m == self.positive
3781
class SetInter(SetBase):
    "A set intersection, e.g. [[a-z]&&[aeiou]]."

    _opcode = {
      (NOCASE, False): OP.SET_INTER,
      (IGNORECASE, False): OP.SET_INTER_IGN,
      (FULLCASE, False): OP.SET_INTER,
      (FULLIGNORECASE, False): OP.SET_INTER_IGN,
      (NOCASE, True): OP.SET_INTER_REV,
      (IGNORECASE, True): OP.SET_INTER_IGN_REV,
      (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV,
    }
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        "Optimises the set, flattening nested intersections."
        flattened = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetInter) and member.positive:
                # A nested intersection is merged into this one.
                flattened.extend(member.items)
            else:
                flattened.append(member)

        if len(flattened) == 1:
            # A single member stands on its own, carrying this set's flags.
            only = flattened[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth)
            return only.optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # Matches if the character is in every member (xor positivity).
        result = all(member.matches(ch) for member in self.items)
        return result == self.positive
3811
class SetSymDiff(SetBase):
    "A set symmetric difference, e.g. [[a-z]~~[aeiou]]."

    _opcode = {
      (NOCASE, False): OP.SET_SYM_DIFF,
      (IGNORECASE, False): OP.SET_SYM_DIFF_IGN,
      (FULLCASE, False): OP.SET_SYM_DIFF,
      (FULLIGNORECASE, False): OP.SET_SYM_DIFF_IGN,
      (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV,
      (FULLCASE, True): OP.SET_SYM_DIFF_REV,
      (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV,
    }
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        "Optimises the set, flattening nested symmetric differences."
        flattened = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetSymDiff) and member.positive:
                # A nested symmetric difference is merged into this one.
                flattened.extend(member.items)
            else:
                flattened.append(member)

        if len(flattened) == 1:
            # A single member stands on its own, carrying this set's flags.
            only = flattened[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth)
            return only.optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # Matches if the character is in an odd number of the members
        # (xor positivity).
        result = False
        for member in self.items:
            result = result != member.matches(ch)

        return result == self.positive
3844
class SetUnion(SetBase):
    "A set union, e.g. [a-z0-9]."

    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union.
                items.extend(m.items)
            elif isinstance(m, AnyAll):
                # A union containing "any character" matches anything.
                return AnyAll()
            else:
                items.append(m)

        # Are there complementary properties?
        properties = (set(), set())

        for m in items:
            if isinstance(m, Property):
                properties[m.positive].add((m.value, m.case_flags, m.zerowidth))

        if properties[0] & properties[1]:
            # A property and its complement together match anything.
            return AnyAll()

        if len(items) == 1:
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Single characters can be compiled more compactly, grouped by
        # their positivity.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive
3922
class Skip(ZeroWidthBase):
    # The backtracking-control (*SKIP) verb.
    _op_name = "SKIP"
    _opcode = OP.SKIP
3926
class StartOfLine(ZeroWidthBase):
    # Zero-width assertion for the start of a line.
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"
3930
class StartOfLineU(StartOfLine):
    # Unicode variant of the start-of-line assertion (separate opcode).
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"
3934
class StartOfString(ZeroWidthBase):
    # Zero-width assertion for the start of the string.
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"
3938
class StartOfWord(ZeroWidthBase):
    # Zero-width assertion for the start of a word.
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"
3942
class String(RegexBase):
    "A string of characters (codepoints) to be matched."

    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        # characters is a sequence of codepoints.
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-compute the fully case-folded form for matching.
            folded_characters = []
            for char in self.characters:
                folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
                folded_characters.extend(ord(c) for c in folded)
        else:
            folded_characters = self.characters

        self.folded_characters = tuple(folded_characters)
        # Set to True when this string is the pattern's required string.
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # The first character when matching forwards, the last when
        # matching in reverse.
        if reverse:
            pos = -1
        else:
            pos = 0
        return set([Character(self.characters[pos],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
          len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        # ascii() would prefix a "b"/"u" for bytes/unicode; strip it off.
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        return 0, self
3997
class Literal(String):
    "A String chunk produced by full-case-fold splitting; dumps differently."

    def dump(self, indent, reverse):
        text = ''.join(chr(c) for c in self.characters)
        shown = ascii(text).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, shown,
          CASE_TEXT[self.case_flags]))
4004
class StringSet(Branch):
    "A named list of strings (the \\L<name> construct)."

    def __init__(self, info, name, case_flags=NOCASE):
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        # Assign each distinct (name, case_flags) pair a stable index.
        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        # NOTE(review): 'index' and 'fold_flags' below are computed but not
        # used in this method.
        index = self.info.named_lists_used[self.set_key]
        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        encoding = self.info.flags & _ALL_ENCODINGS
        fold_flags = encoding | case_flags

        choices = []

        # Each string becomes a list of Character items.
        for string in items:
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
4045
class Source:
    "Scanner for the regular expression source string."
    def __init__(self, string):
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            # A bytes pattern is scanned as Latin-1 text internally.
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        # In verbose mode, whitespace and #-comments are skipped.
        self.ignore_space = False
        # An empty string/bytes of the same type as the original pattern.
        self.sep = string[ : 0]

    def get(self, override_ignore=False):
        """Returns the next character, or an empty string at the end.

        Whitespace and #-comments are skipped in verbose mode unless
        override_ignore is true.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        """Returns up to 'count' characters, skipping whitespace and
        comments in verbose mode.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string. (Only the verbose path
            # can raise, so 'substring' is still the list built so far.)
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True, keep_spaces=False):
        """Returns the run of characters that are (or, if include is
        False, are not) in test_set.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space and not keep_spaces:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        "Skips characters that are (or are not) in test_set."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        """Tries to match 'substring' at the current position, advancing
        past it and returning True on success.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                # Match character by character, skipping whitespace and
                # comments between them.
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        "Matches 'substring' or raises error if it isn't present."
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        "Returns whether only whitespace/comments (if ignored) remain."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        pos += 1
                    elif string[pos] == "#":
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True
4267
class Info:
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs=None):
        # BUGFIX/idiom: 'kwargs' previously defaulted to a mutable dict
        # ({}), which is shared between every call that omits it; use None
        # as a sentinel instead. Behaviour is unchanged for callers.
        if kwargs is None:
            kwargs = {}

        # Fill in the default flags for the selected version.
        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = kwargs

        self.group_count = 0
        self.group_index = {}
        self.group_name = {}
        self.char_type = char_type
        self.named_lists_used = {}
        self.open_groups = []
        self.open_group_count = {}
        self.defined_groups = {}
        self.group_calls = []
        self.private_groups = {}

    def open_group(self, name=None):
        """Opens a group and returns its group number.

        A nested reuse of a named group gets a negative private alias
        until a proper (positive) number can be assigned.
        """
        group = self.group_index.get(name)
        if group is None:
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        "Closes the most recently opened group."
        self.open_groups.pop()

    def is_open_group(self, name):
        "Returns whether the named/numbered group is currently open."
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups
4332
def _check_group_features(info, parsed):
    """Checks whether the reverse and fuzzy features of the group calls match
    the groups which they call.

    Stores the resulting call references on info.call_refs and any copies
    needed on info.additional_groups.
    """
    # Maps (group, reverse, fuzzy) to a call-reference index.
    call_refs = {}
    additional_groups = []
    for call, reverse, fuzzy in info.group_calls:
        # Look up the reference of this group call.
        key = (call.group, reverse, fuzzy)
        ref = call_refs.get(key)
        if ref is None:
            # This group doesn't have a reference yet, so look up its features.
            if call.group == 0:
                # Calling the pattern as a whole.
                rev = bool(info.flags & REVERSE)
                fuz = isinstance(parsed, Fuzzy)
                if (rev, fuz) != (reverse, fuzzy):
                    # The pattern as a whole doesn't have the features we want,
                    # so we'll need to make a copy of it with the desired
                    # features.
                    additional_groups.append((CallRef(len(call_refs), parsed),
                      reverse, fuzzy))
            else:
                # Calling a capture group.
                def_info = info.defined_groups[call.group]
                group = def_info[0]
                if def_info[1 : ] != (reverse, fuzzy):
                    # The group doesn't have the features we want, so we'll
                    # need to make a copy of it with the desired features.
                    additional_groups.append((group, reverse, fuzzy))

            ref = len(call_refs)
            call_refs[key] = ref

        call.call_ref = ref

    info.call_refs = call_refs
    info.additional_groups = additional_groups
4371
def _get_required_string(parsed, flags):
    """Gets the required string and related info of a parsed pattern.

    Returns a tuple of (required-string offset, folded characters,
    case flags). The offset is -1 when it is unbounded, and the characters
    are empty when there is no required string.
    """
    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
    if not required:
        # No required string at all.
        return 0, (), 0

    required.required = True
    if req_offset >= UNLIMITED:
        # The offset can't be pinned down; signal that with -1.
        req_offset = -1

    req_flags = required.case_flags
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags
4392
class Scanner:
    """A simple lexical scanner built on the regex engine.

    `lexicon` is a list of (phrase, action) pairs. The phrases are parsed
    individually (capture groups forbidden inside them), wrapped each in its
    own numbered group, and combined into a single branch pattern; `scan`
    then matches that compound pattern repeatedly against a string and
    dispatches to the action of whichever phrase matched.
    """
    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                # Leftover input after a complete parse means the phrase is
                # malformed.
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern. Each phrase gets
        # group number g + 1, so m.lastindex identifies the matching phrase.
        info = Info(flags)
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible. This is a best-effort
            # optimisation; failure to build one is not an error.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(patterns))

    def scan(self, string):
        """Scans `string`, returning (results, remainder).

        For each successive non-empty match, the action of the matching
        phrase is looked up; a callable action is invoked with (self,
        matched_text) and its non-None result collected, while a
        non-callable, non-None action is collected as-is. Scanning stops at
        the first failed or zero-width match; the unscanned tail of
        `string` is returned as the remainder.
        """
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # A zero-width match would loop forever; stop scanning.
                break
            # m.lastindex is the 1-based group number of the phrase that
            # matched; the lexicon is 0-based.
            action = self.lexicon[m.lastindex - 1][1]
            if hasattr(action, '__call__'):
                # Expose the match object to the action via self.match.
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]
4488
# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict: property id -> (canonical name,
# {value id -> canonical value name}). The canonical name is the longest of
# the known aliases.
PROPERTY_NAMES = {}
for prop_name, (prop_id, values) in PROPERTIES.items():
    best_name, val_names = PROPERTY_NAMES.setdefault(prop_id, ("", {}))
    # Keep the longest alias as the canonical property name.
    PROPERTY_NAMES[prop_id] = (max(best_name, prop_name, key=len), val_names)

    for val_name, val_id in values.items():
        # Likewise keep the longest alias of each value name.
        val_names[val_id] = max(val_names.get(val_id, ""), val_name, key=len)
4502
# Character escape sequences (e.g. r"\n" -> newline).
CHARACTER_ESCAPES = dict(
    a="\a",
    b="\b",
    f="\f",
    n="\n",
    r="\r",
    t="\t",
    v="\v",
)

# Encodings selectable for the charset and positional escapes below.
ASCII_ENCODING = 1
UNICODE_ENCODING = 2
4516
# Predefined character set escape sequences.
CHARSET_ESCAPES = {
    "d": lookup_property(None, "Digit", True),
    "D": lookup_property(None, "Digit", False),
    "h": lookup_property(None, "Blank", True),
    "s": lookup_property(None, "Space", True),
    "S": lookup_property(None, "Space", False),
    "w": lookup_property(None, "Word", True),
    "W": lookup_property(None, "Word", False),
}

# ASCII-only variants; "h" has no encoding-specific form and is inherited
# unchanged from CHARSET_ESCAPES.
ASCII_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
ASCII_CHARSET_ESCAPES["d"] = lookup_property(None, "Digit", True,
  encoding=ASCII_ENCODING)
ASCII_CHARSET_ESCAPES["D"] = lookup_property(None, "Digit", False,
  encoding=ASCII_ENCODING)
ASCII_CHARSET_ESCAPES["s"] = lookup_property(None, "Space", True,
  encoding=ASCII_ENCODING)
ASCII_CHARSET_ESCAPES["S"] = lookup_property(None, "Space", False,
  encoding=ASCII_ENCODING)
ASCII_CHARSET_ESCAPES["w"] = lookup_property(None, "Word", True,
  encoding=ASCII_ENCODING)
ASCII_CHARSET_ESCAPES["W"] = lookup_property(None, "Word", False,
  encoding=ASCII_ENCODING)

# Unicode variants; again "h" is inherited unchanged.
UNICODE_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
UNICODE_CHARSET_ESCAPES["d"] = lookup_property(None, "Digit", True,
  encoding=UNICODE_ENCODING)
UNICODE_CHARSET_ESCAPES["D"] = lookup_property(None, "Digit", False,
  encoding=UNICODE_ENCODING)
UNICODE_CHARSET_ESCAPES["s"] = lookup_property(None, "Space", True,
  encoding=UNICODE_ENCODING)
UNICODE_CHARSET_ESCAPES["S"] = lookup_property(None, "Space", False,
  encoding=UNICODE_ENCODING)
UNICODE_CHARSET_ESCAPES["w"] = lookup_property(None, "Word", True,
  encoding=UNICODE_ENCODING)
UNICODE_CHARSET_ESCAPES["W"] = lookup_property(None, "Word", False,
  encoding=UNICODE_ENCODING)
4546
# Positional escape sequences.
POSITION_ESCAPES = {
    "A": StartOfString(),
    "b": Boundary(),
    "B": Boundary(False),
    "K": Keep(),
    "m": StartOfWord(),
    "M": EndOfWord(),
    "Z": EndOfString(),
}

# ASCII-only variants; "A", "K" and "Z" have no encoding-specific forms and
# are inherited unchanged from POSITION_ESCAPES.
ASCII_POSITION_ESCAPES = dict(POSITION_ESCAPES)
ASCII_POSITION_ESCAPES["b"] = Boundary(encoding=ASCII_ENCODING)
ASCII_POSITION_ESCAPES["B"] = Boundary(False, encoding=ASCII_ENCODING)
ASCII_POSITION_ESCAPES["m"] = StartOfWord(encoding=ASCII_ENCODING)
ASCII_POSITION_ESCAPES["M"] = EndOfWord(encoding=ASCII_ENCODING)

# Unicode variants; again only the word-related escapes are overridden.
UNICODE_POSITION_ESCAPES = dict(POSITION_ESCAPES)
UNICODE_POSITION_ESCAPES["b"] = Boundary(encoding=UNICODE_ENCODING)
UNICODE_POSITION_ESCAPES["B"] = Boundary(False, encoding=UNICODE_ENCODING)
UNICODE_POSITION_ESCAPES["m"] = StartOfWord(encoding=UNICODE_ENCODING)
UNICODE_POSITION_ESCAPES["M"] = EndOfWord(encoding=UNICODE_ENCODING)

# Positional escape sequences when WORD flag set: the default (Unicode
# word-break) boundaries replace the simple ones.
WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES)
WORD_POSITION_ESCAPES["b"] = DefaultBoundary()
WORD_POSITION_ESCAPES["B"] = DefaultBoundary(False)
WORD_POSITION_ESCAPES["m"] = DefaultStartOfWord()
WORD_POSITION_ESCAPES["M"] = DefaultEndOfWord()
4580
# Regex control verbs: (*FAIL)/(*F), (*PRUNE) and (*SKIP).
VERBS = dict(
    FAIL=Failure(),
    F=Failure(),
    PRUNE=Prune(),
    SKIP=Skip(),
)