1#
2# Secret Labs' Regular Expression Engine core module
3#
4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
5#
6# This version of the SRE library can be redistributed under CNRI's
7# Python 1.6 license. For any other use, please contact Secret Labs
8# AB (info@pythonware.com).
9#
10# Portions of this engine have been developed in cooperation with
11# CNRI. Hewlett-Packard provided funding for 1.6 integration and
12# other compatibility work.
13#
14# 2010-01-16 mrab Python front-end re-written and extended
15
16import enum
17import string
18import unicodedata
19from collections import defaultdict
20
21import regex._regex as _regex
22
23__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
24 "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
25 "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
26 "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
27 "Scanner", "RegexFlag"]
28
29# The regex exception.
30class error(Exception):
31 """Exception raised for invalid regular expressions.
32
33 Attributes:
34
35 msg: The unformatted error message
36 pattern: The regular expression pattern
37 pos: The position in the pattern where compilation failed, or None
38 lineno: The line number where compilation failed, unless pos is None
39 colno: The column number where compilation failed, unless pos is None
40 """
41
42 def __init__(self, message, pattern=None, pos=None):
43 newline = '\n' if isinstance(pattern, str) else b'\n'
44 self.msg = message
45 self.pattern = pattern
46 self.pos = pos
47 if pattern is not None and pos is not None:
48 self.lineno = pattern.count(newline, 0, pos) + 1
49 self.colno = pos - pattern.rfind(newline, 0, pos)
50
51 message = "{} at position {}".format(message, pos)
52
53 if newline in pattern:
54 message += " (line {}, column {})".format(self.lineno,
55 self.colno)
56
57 Exception.__init__(self, message)
58
59# The exception for when a positional flag has been turned on in the old
60# behaviour.
61class _UnscopedFlagSet(Exception):
62 pass
63
64# The exception for when parsing fails and we want to try something else.
65class ParseError(Exception):
66 pass
67
68# The exception for when there isn't a valid first set.
69class _FirstSetError(Exception):
70 pass
71
72# Flags.
73class RegexFlag(enum.IntFlag):
74 A = ASCII = 0x80 # Assume ASCII locale.
75 B = BESTMATCH = 0x1000 # Best fuzzy match.
76 D = DEBUG = 0x200 # Print parsed pattern.
77 E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first
78 # fuzzy match.
79 F = FULLCASE = 0x4000 # Unicode full case-folding.
80 I = IGNORECASE = 0x2 # Ignore case.
81 L = LOCALE = 0x4 # Assume current 8-bit locale.
82 M = MULTILINE = 0x8 # Make anchors look for newline.
83 P = POSIX = 0x10000 # POSIX-style matching (leftmost longest).
84 R = REVERSE = 0x400 # Search backwards.
85 S = DOTALL = 0x10 # Make dot match newline.
86 U = UNICODE = 0x20 # Assume Unicode locale.
87 V0 = VERSION0 = 0x2000 # Old legacy behaviour.
88 V1 = VERSION1 = 0x100 # New enhanced behaviour.
89 W = WORD = 0x800 # Default Unicode word breaks.
90 X = VERBOSE = 0x40 # Ignore whitespace and comments.
91 T = TEMPLATE = 0x1 # Template (present because re module has it).
92
93 def __repr__(self):
94 if self._name_ is not None:
95 return 'regex.%s' % self._name_
96
97 value = self._value_
98 members = []
99 negative = value < 0
100
101 if negative:
102 value = ~value
103
104 for m in self.__class__:
105 if value & m._value_:
106 value &= ~m._value_
107 members.append('regex.%s' % m._name_)
108
109 if value:
110 members.append(hex(value))
111
112 res = '|'.join(members)
113
114 if negative:
115 if len(members) > 1:
116 res = '~(%s)' % res
117 else:
118 res = '~%s' % res
119
120 return res
121
122 __str__ = object.__str__
123
124globals().update(RegexFlag.__members__)
125
126DEFAULT_VERSION = VERSION1
127
128_ALL_VERSIONS = VERSION0 | VERSION1
129_ALL_ENCODINGS = ASCII | LOCALE | UNICODE
130
131# The default flags for the various versions.
132DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}
133
134# The mask for the flags.
135GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
136 REVERSE)
137SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
138 _ALL_ENCODINGS)
139
140ALPHA = frozenset(string.ascii_letters)
141DIGITS = frozenset(string.digits)
142ALNUM = ALPHA | DIGITS
143OCT_DIGITS = frozenset(string.octdigits)
144HEX_DIGITS = frozenset(string.hexdigits)
145SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
146NAMED_CHAR_PART = ALNUM | frozenset(" -")
147PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
148SET_OPS = ("||", "~~", "&&", "--")
149
150# The width of the code words inside the regex engine.
151BYTES_PER_CODE = _regex.get_code_size()
152BITS_PER_CODE = BYTES_PER_CODE * 8
153
154# The repeat count which represents infinity.
155UNLIMITED = (1 << BITS_PER_CODE) - 1
156
157# The regular expression flags.
158REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
159 "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
160 "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
161 VERBOSE}
162
163# The case flags.
164CASE_FLAGS = FULLCASE | IGNORECASE
165NOCASE = 0
166FULLIGNORECASE = FULLCASE | IGNORECASE
167
168FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE
169
170CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
171 FULLIGNORECASE: FULLIGNORECASE}
172
173# The number of digits in hexadecimal escapes.
174HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
175
176# The names of the opcodes.
177OPCODES = """
178FAILURE
179SUCCESS
180ANY
181ANY_ALL
182ANY_ALL_REV
183ANY_REV
184ANY_U
185ANY_U_REV
186ATOMIC
187BOUNDARY
188BRANCH
189CALL_REF
190CHARACTER
191CHARACTER_IGN
192CHARACTER_IGN_REV
193CHARACTER_REV
194CONDITIONAL
195DEFAULT_BOUNDARY
196DEFAULT_END_OF_WORD
197DEFAULT_START_OF_WORD
198END
199END_OF_LINE
200END_OF_LINE_U
201END_OF_STRING
202END_OF_STRING_LINE
203END_OF_STRING_LINE_U
204END_OF_WORD
205FUZZY
206GRAPHEME_BOUNDARY
207GREEDY_REPEAT
208GROUP
209GROUP_CALL
210GROUP_EXISTS
211KEEP
212LAZY_REPEAT
213LOOKAROUND
214NEXT
215PROPERTY
216PROPERTY_IGN
217PROPERTY_IGN_REV
218PROPERTY_REV
219PRUNE
220RANGE
221RANGE_IGN
222RANGE_IGN_REV
223RANGE_REV
224REF_GROUP
225REF_GROUP_FLD
226REF_GROUP_FLD_REV
227REF_GROUP_IGN
228REF_GROUP_IGN_REV
229REF_GROUP_REV
230SEARCH_ANCHOR
231SET_DIFF
232SET_DIFF_IGN
233SET_DIFF_IGN_REV
234SET_DIFF_REV
235SET_INTER
236SET_INTER_IGN
237SET_INTER_IGN_REV
238SET_INTER_REV
239SET_SYM_DIFF
240SET_SYM_DIFF_IGN
241SET_SYM_DIFF_IGN_REV
242SET_SYM_DIFF_REV
243SET_UNION
244SET_UNION_IGN
245SET_UNION_IGN_REV
246SET_UNION_REV
247SKIP
248START_OF_LINE
249START_OF_LINE_U
250START_OF_STRING
251START_OF_WORD
252STRING
253STRING_FLD
254STRING_FLD_REV
255STRING_IGN
256STRING_IGN_REV
257STRING_REV
258FUZZY_EXT
259"""
260
261# Define the opcodes in a namespace.
262class Namespace:
263 pass
264
265OP = Namespace()
266for i, op in enumerate(OPCODES.split()):
267 setattr(OP, op, i)
268
269def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
270 """Make room in the given cache.
271
272 Args:
273 cache_dict: The cache dictionary to modify.
274 args_dict: The dictionary of named list args used by patterns.
275 max_length: Maximum # of entries in cache_dict before it is shrunk.
276 divisor: Cache will shrink to max_length - 1/divisor*max_length items.
277 """
278 # Toss out a fraction of the entries at random to make room for new ones.
279 # A random algorithm was chosen as opposed to simply cache_dict.popitem()
280 # as popitem could penalize the same regular expression repeatedly based
281 # on its internal hash value. Being random should spread the cache miss
282 # love around.
283 cache_keys = tuple(cache_dict.keys())
284 overage = len(cache_keys) - max_length
285 if overage < 0:
286 # Cache is already within limits. Normally this should not happen
287 # but it could due to multithreading.
288 return
289
290 number_to_toss = max_length // divisor + overage
291
292 # The import is done here to avoid a circular dependency.
293 import random
294 if not hasattr(random, 'sample'):
295 # Do nothing while resolving the circular dependency:
296 # re->random->warnings->tokenize->string->re
297 return
298
299 for doomed_key in random.sample(cache_keys, number_to_toss):
300 try:
301 del cache_dict[doomed_key]
302 except KeyError:
303 # Ignore problems if the cache changed from another thread.
304 pass
305
306 # Rebuild the arguments and locale-sensitivity dictionaries.
307 args_dict.clear()
308 sensitivity_dict = {}
309 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
310 args_dict[pattern, pattern_type, flags, default_version, locale] = args
311 try:
312 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
313 except KeyError:
314 pass
315
316 locale_sensitive.clear()
317 locale_sensitive.update(sensitivity_dict)
318
319def _fold_case(info, string):
320 "Folds the case of a string."
321 flags = info.flags
322 if (flags & _ALL_ENCODINGS) == 0:
323 flags |= info.guess_encoding
324
325 return _regex.fold_case(flags, string)
326
327def is_cased_i(info, char):
328 "Checks whether a character is cased."
329 return len(_regex.get_all_cases(info.flags, char)) > 1
330
331def is_cased_f(flags, char):
332 "Checks whether a character is cased."
333 return len(_regex.get_all_cases(flags, char)) > 1
334
335def _compile_firstset(info, fs):
336 "Compiles the firstset for the pattern."
337 reverse = bool(info.flags & REVERSE)
338 fs = _check_firstset(info, reverse, fs)
339 if not fs:
340 return []
341
342 # Compile the firstset.
343 return fs.compile(reverse)
344
345def _check_firstset(info, reverse, fs):
346 "Checks the firstset for the pattern."
347 if not fs or None in fs:
348 return None
349
350 # If we ignore the case, for simplicity we won't build a firstset.
351 members = set()
352 case_flags = NOCASE
353 for i in fs:
354 if isinstance(i, Character) and not i.positive:
355 return None
356
357# if i.case_flags:
358# if isinstance(i, Character):
359# if is_cased_i(info, i.value):
360# return []
361# elif isinstance(i, SetBase):
362# return []
363 case_flags |= i.case_flags
364 members.add(i.with_flags(case_flags=NOCASE))
365
366 if case_flags == (FULLCASE | IGNORECASE):
367 return None
368
369 # Build the firstset.
370 fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
371 zerowidth=True)
372 fs = fs.optimise(info, reverse, in_set=True)
373
374 return fs
375
376def _flatten_code(code):
377 "Flattens the code from a list of tuples."
378 flat_code = []
379 for c in code:
380 flat_code.extend(c)
381
382 return flat_code
383
384def make_case_flags(info):
385 "Makes the case flags."
386 flags = info.flags & CASE_FLAGS
387
388 # Turn off FULLCASE if ASCII is turned on.
389 if info.flags & ASCII:
390 flags &= ~FULLCASE
391
392 return flags
393
394def make_character(info, value, in_set=False):
395 "Makes a character literal."
396 if in_set:
397 # A character set is built case-sensitively.
398 return Character(value)
399
400 return Character(value, case_flags=make_case_flags(info))
401
402def make_ref_group(info, name, position):
403 "Makes a group reference."
404 return RefGroup(info, name, position, case_flags=make_case_flags(info))
405
406def make_string_set(info, name):
407 "Makes a string set."
408 return StringSet(info, name, case_flags=make_case_flags(info))
409
410def make_property(info, prop, in_set):
411 "Makes a property."
412 if in_set:
413 return prop
414
415 return prop.with_flags(case_flags=make_case_flags(info))
416
417def _parse_pattern(source, info):
418 "Parses a pattern, eg. 'a|b|c'."
419 branches = [parse_sequence(source, info)]
420 while source.match("|"):
421 branches.append(parse_sequence(source, info))
422
423 if len(branches) == 1:
424 return branches[0]
425 return Branch(branches)
426
427def parse_sequence(source, info):
428 "Parses a sequence, eg. 'abc'."
429 sequence = [None]
430 case_flags = make_case_flags(info)
431 while True:
432 saved_pos = source.pos
433 ch = source.get()
434 if ch in SPECIAL_CHARS:
435 if ch in ")|":
436 # The end of a sequence. At the end of the pattern ch is "".
437 source.pos = saved_pos
438 break
439 elif ch == "\\":
440 # An escape sequence outside a set.
441 sequence.append(parse_escape(source, info, False))
442 elif ch == "(":
443 # A parenthesised subpattern or a flag.
444 element = parse_paren(source, info)
445 if element is None:
446 case_flags = make_case_flags(info)
447 else:
448 sequence.append(element)
449 elif ch == ".":
450 # Any character.
451 if info.flags & DOTALL:
452 sequence.append(AnyAll())
453 elif info.flags & WORD:
454 sequence.append(AnyU())
455 else:
456 sequence.append(Any())
457 elif ch == "[":
458 # A character set.
459 sequence.append(parse_set(source, info))
460 elif ch == "^":
461 # The start of a line or the string.
462 if info.flags & MULTILINE:
463 if info.flags & WORD:
464 sequence.append(StartOfLineU())
465 else:
466 sequence.append(StartOfLine())
467 else:
468 sequence.append(StartOfString())
469 elif ch == "$":
470 # The end of a line or the string.
471 if info.flags & MULTILINE:
472 if info.flags & WORD:
473 sequence.append(EndOfLineU())
474 else:
475 sequence.append(EndOfLine())
476 else:
477 if info.flags & WORD:
478 sequence.append(EndOfStringLineU())
479 else:
480 sequence.append(EndOfStringLine())
481 elif ch in "?*+{":
482 # Looks like a quantifier.
483 counts = parse_quantifier(source, info, ch)
484 if counts:
485 # It _is_ a quantifier.
486 apply_quantifier(source, info, counts, case_flags, ch,
487 saved_pos, sequence)
488 sequence.append(None)
489 else:
490 # It's not a quantifier. Maybe it's a fuzzy constraint.
491 constraints = parse_fuzzy(source, info, ch, case_flags)
492 if constraints:
493 # It _is_ a fuzzy constraint.
494 apply_constraint(source, info, constraints, case_flags,
495 saved_pos, sequence)
496 sequence.append(None)
497 else:
498 # The element was just a literal.
499 sequence.append(Character(ord(ch),
500 case_flags=case_flags))
501 else:
502 # A literal.
503 sequence.append(Character(ord(ch), case_flags=case_flags))
504 else:
505 # A literal.
506 sequence.append(Character(ord(ch), case_flags=case_flags))
507
508 sequence = [item for item in sequence if item is not None]
509 return Sequence(sequence)
510
511def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
512 sequence):
513 element = sequence.pop()
514 if element is None:
515 if sequence:
516 raise error("multiple repeat", source.string, saved_pos)
517 raise error("nothing to repeat", source.string, saved_pos)
518
519 if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
520 raise error("multiple repeat", source.string, saved_pos)
521
522 min_count, max_count = counts
523 saved_pos = source.pos
524 ch = source.get()
525 if ch == "?":
526 # The "?" suffix that means it's a lazy repeat.
527 repeated = LazyRepeat
528 elif ch == "+":
529 # The "+" suffix that means it's a possessive repeat.
530 repeated = PossessiveRepeat
531 else:
532 # No suffix means that it's a greedy repeat.
533 source.pos = saved_pos
534 repeated = GreedyRepeat
535
536 # Ignore the quantifier if it applies to a zero-width item or the number of
537 # repeats is fixed at 1.
538 if not element.is_empty() and (min_count != 1 or max_count != 1):
539 element = repeated(element, min_count, max_count)
540
541 sequence.append(element)
542
543def apply_constraint(source, info, constraints, case_flags, saved_pos,
544 sequence):
545 element = sequence.pop()
546 if element is None:
547 raise error("nothing for fuzzy constraint", source.string, saved_pos)
548
549 # If a group is marked as fuzzy then put all of the fuzzy part in the
550 # group.
551 if isinstance(element, Group):
552 element.subpattern = Fuzzy(element.subpattern, constraints)
553 sequence.append(element)
554 else:
555 sequence.append(Fuzzy(element, constraints))
556
557_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
558
559def parse_quantifier(source, info, ch):
560 "Parses a quantifier."
561 q = _QUANTIFIERS.get(ch)
562 if q:
563 # It's a quantifier.
564 return q
565
566 if ch == "{":
567 # Looks like a limited repeated element, eg. 'a{2,3}'.
568 counts = parse_limited_quantifier(source)
569 if counts:
570 return counts
571
572 return None
573
574def is_above_limit(count):
575 "Checks whether a count is above the maximum."
576 return count is not None and count >= UNLIMITED
577
578def parse_limited_quantifier(source):
579 "Parses a limited quantifier."
580 saved_pos = source.pos
581 min_count = parse_count(source)
582 if source.match(","):
583 max_count = parse_count(source)
584
585 # No minimum means 0 and no maximum means unlimited.
586 min_count = int(min_count or 0)
587 max_count = int(max_count) if max_count else None
588 else:
589 if not min_count:
590 source.pos = saved_pos
591 return None
592
593 min_count = max_count = int(min_count)
594
595 if not source.match ("}"):
596 source.pos = saved_pos
597 return None
598
599 if is_above_limit(min_count) or is_above_limit(max_count):
600 raise error("repeat count too big", source.string, saved_pos)
601
602 if max_count is not None and min_count > max_count:
603 raise error("min repeat greater than max repeat", source.string,
604 saved_pos)
605
606 return min_count, max_count
607
608def parse_fuzzy(source, info, ch, case_flags):
609 "Parses a fuzzy setting, if present."
610 saved_pos = source.pos
611
612 if ch != "{":
613 return None
614
615 constraints = {}
616 try:
617 parse_fuzzy_item(source, constraints)
618 while source.match(","):
619 parse_fuzzy_item(source, constraints)
620 except ParseError:
621 source.pos = saved_pos
622 return None
623
624 if source.match(":"):
625 constraints["test"] = parse_fuzzy_test(source, info, case_flags)
626
627 if not source.match("}"):
628 raise error("expected }", source.string, source.pos)
629
630 return constraints
631
632def parse_fuzzy_item(source, constraints):
633 "Parses a fuzzy setting item."
634 saved_pos = source.pos
635 try:
636 parse_cost_constraint(source, constraints)
637 except ParseError:
638 source.pos = saved_pos
639
640 parse_cost_equation(source, constraints)
641
642def parse_cost_constraint(source, constraints):
643 "Parses a cost constraint."
644 saved_pos = source.pos
645 ch = source.get()
646 if ch in ALPHA:
647 # Syntax: constraint [("<=" | "<") cost]
648 constraint = parse_constraint(source, constraints, ch)
649
650 max_inc = parse_fuzzy_compare(source)
651
652 if max_inc is None:
653 # No maximum cost.
654 constraints[constraint] = 0, None
655 else:
656 # There's a maximum cost.
657 cost_pos = source.pos
658 max_cost = parse_cost_limit(source)
659
660 # Inclusive or exclusive limit?
661 if not max_inc:
662 max_cost -= 1
663
664 if max_cost < 0:
665 raise error("bad fuzzy cost limit", source.string, cost_pos)
666
667 constraints[constraint] = 0, max_cost
668 elif ch in DIGITS:
669 # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
670 source.pos = saved_pos
671
672 # Minimum cost.
673 cost_pos = source.pos
674 min_cost = parse_cost_limit(source)
675
676 min_inc = parse_fuzzy_compare(source)
677 if min_inc is None:
678 raise ParseError()
679
680 constraint = parse_constraint(source, constraints, source.get())
681
682 max_inc = parse_fuzzy_compare(source)
683 if max_inc is None:
684 raise ParseError()
685
686 # Maximum cost.
687 cost_pos = source.pos
688 max_cost = parse_cost_limit(source)
689
690 # Inclusive or exclusive limits?
691 if not min_inc:
692 min_cost += 1
693 if not max_inc:
694 max_cost -= 1
695
696 if not 0 <= min_cost <= max_cost:
697 raise error("bad fuzzy cost limit", source.string, cost_pos)
698
699 constraints[constraint] = min_cost, max_cost
700 else:
701 raise ParseError()
702
703def parse_cost_limit(source):
704 "Parses a cost limit."
705 cost_pos = source.pos
706 digits = parse_count(source)
707
708 try:
709 return int(digits)
710 except ValueError:
711 pass
712
713 raise error("bad fuzzy cost limit", source.string, cost_pos)
714
715def parse_constraint(source, constraints, ch):
716 "Parses a constraint."
717 if ch not in "deis":
718 raise ParseError()
719
720 if ch in constraints:
721 raise ParseError()
722
723 return ch
724
725def parse_fuzzy_compare(source):
726 "Parses a cost comparator."
727 if source.match("<="):
728 return True
729 elif source.match("<"):
730 return False
731 else:
732 return None
733
734def parse_cost_equation(source, constraints):
735 "Parses a cost equation."
736 if "cost" in constraints:
737 raise error("more than one cost equation", source.string, source.pos)
738
739 cost = {}
740
741 parse_cost_term(source, cost)
742 while source.match("+"):
743 parse_cost_term(source, cost)
744
745 max_inc = parse_fuzzy_compare(source)
746 if max_inc is None:
747 raise ParseError()
748
749 max_cost = int(parse_count(source))
750
751 if not max_inc:
752 max_cost -= 1
753
754 if max_cost < 0:
755 raise error("bad fuzzy cost limit", source.string, source.pos)
756
757 cost["max"] = max_cost
758
759 constraints["cost"] = cost
760
761def parse_cost_term(source, cost):
762 "Parses a cost equation term."
763 coeff = parse_count(source)
764 ch = source.get()
765 if ch not in "dis":
766 raise ParseError()
767
768 if ch in cost:
769 raise error("repeated fuzzy cost", source.string, source.pos)
770
771 cost[ch] = int(coeff or 1)
772
773def parse_fuzzy_test(source, info, case_flags):
774 saved_pos = source.pos
775 ch = source.get()
776 if ch in SPECIAL_CHARS:
777 if ch == "\\":
778 # An escape sequence outside a set.
779 return parse_escape(source, info, False)
780 elif ch == ".":
781 # Any character.
782 if info.flags & DOTALL:
783 return AnyAll()
784 elif info.flags & WORD:
785 return AnyU()
786 else:
787 return Any()
788 elif ch == "[":
789 # A character set.
790 return parse_set(source, info)
791 else:
792 raise error("expected character set", source.string, saved_pos)
793 elif ch:
794 # A literal.
795 return Character(ord(ch), case_flags=case_flags)
796 else:
797 raise error("expected character set", source.string, saved_pos)
798
799def parse_count(source):
800 "Parses a quantifier's count, which can be empty."
801 return source.get_while(DIGITS)
802
803def parse_paren(source, info):
804 """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
805 inline flag.
806 """
807 saved_pos = source.pos
808 ch = source.get(True)
809 if ch == "?":
810 # (?...
811 saved_pos_2 = source.pos
812 ch = source.get(True)
813 if ch == "<":
814 # (?<...
815 saved_pos_3 = source.pos
816 ch = source.get()
817 if ch in ("=", "!"):
818 # (?<=... or (?<!...: lookbehind.
819 return parse_lookaround(source, info, True, ch == "=")
820
821 # (?<...: a named capture group.
822 source.pos = saved_pos_3
823 name = parse_name(source)
824 group = info.open_group(name)
825 source.expect(">")
826 saved_flags = info.flags
827 try:
828 subpattern = _parse_pattern(source, info)
829 source.expect(")")
830 finally:
831 info.flags = saved_flags
832 source.ignore_space = bool(info.flags & VERBOSE)
833
834 info.close_group()
835 return Group(info, group, subpattern)
836 if ch in ("=", "!"):
837 # (?=... or (?!...: lookahead.
838 return parse_lookaround(source, info, False, ch == "=")
839 if ch == "P":
840 # (?P...: a Python extension.
841 return parse_extension(source, info)
842 if ch == "#":
843 # (?#...: a comment.
844 return parse_comment(source)
845 if ch == "(":
846 # (?(...: a conditional subpattern.
847 return parse_conditional(source, info)
848 if ch == ">":
849 # (?>...: an atomic subpattern.
850 return parse_atomic(source, info)
851 if ch == "|":
852 # (?|...: a common/reset groups branch.
853 return parse_common(source, info)
854 if ch == "R" or "0" <= ch <= "9":
855 # (?R...: probably a call to a group.
856 return parse_call_group(source, info, ch, saved_pos_2)
857 if ch == "&":
858 # (?&...: a call to a named group.
859 return parse_call_named_group(source, info, saved_pos_2)
860
861 # (?...: probably a flags subpattern.
862 source.pos = saved_pos_2
863 return parse_flags_subpattern(source, info)
864
865 if ch == "*":
866 # (*...
867 saved_pos_2 = source.pos
868 word = source.get_while(set(")>"), include=False)
869 if word[ : 1].isalpha():
870 verb = VERBS.get(word)
871 if not verb:
872 raise error("unknown verb", source.string, saved_pos_2)
873
874 source.expect(")")
875
876 return verb
877
878 # (...: an unnamed capture group.
879 source.pos = saved_pos
880 group = info.open_group()
881 saved_flags = info.flags
882 try:
883 subpattern = _parse_pattern(source, info)
884 source.expect(")")
885 finally:
886 info.flags = saved_flags
887 source.ignore_space = bool(info.flags & VERBOSE)
888
889 info.close_group()
890
891 return Group(info, group, subpattern)
892
893def parse_extension(source, info):
894 "Parses a Python extension."
895 saved_pos = source.pos
896 ch = source.get()
897 if ch == "<":
898 # (?P<...: a named capture group.
899 name = parse_name(source)
900 group = info.open_group(name)
901 source.expect(">")
902 saved_flags = info.flags
903 try:
904 subpattern = _parse_pattern(source, info)
905 source.expect(")")
906 finally:
907 info.flags = saved_flags
908 source.ignore_space = bool(info.flags & VERBOSE)
909
910 info.close_group()
911
912 return Group(info, group, subpattern)
913 if ch == "=":
914 # (?P=...: a named group reference.
915 name = parse_name(source, allow_numeric=True)
916 source.expect(")")
917 if info.is_open_group(name):
918 raise error("cannot refer to an open group", source.string,
919 saved_pos)
920
921 return make_ref_group(info, name, saved_pos)
922 if ch == ">" or ch == "&":
923 # (?P>...: a call to a group.
924 return parse_call_named_group(source, info, saved_pos)
925
926 source.pos = saved_pos
927 raise error("unknown extension", source.string, saved_pos)
928
929def parse_comment(source):
930 "Parses a comment."
931 while True:
932 saved_pos = source.pos
933 c = source.get(True)
934
935 if not c or c == ")":
936 break
937
938 if c == "\\":
939 c = source.get(True)
940
941 source.pos = saved_pos
942 source.expect(")")
943
944 return None
945
946def parse_lookaround(source, info, behind, positive):
947 "Parses a lookaround."
948 saved_flags = info.flags
949 try:
950 subpattern = _parse_pattern(source, info)
951 source.expect(")")
952 finally:
953 info.flags = saved_flags
954 source.ignore_space = bool(info.flags & VERBOSE)
955
956 return LookAround(behind, positive, subpattern)
957
958def parse_conditional(source, info):
959 "Parses a conditional subpattern."
960 saved_flags = info.flags
961 saved_pos = source.pos
962 ch = source.get()
963 if ch == "?":
964 # (?(?...
965 ch = source.get()
966 if ch in ("=", "!"):
967 # (?(?=... or (?(?!...: lookahead conditional.
968 return parse_lookaround_conditional(source, info, False, ch == "=")
969 if ch == "<":
970 # (?(?<...
971 ch = source.get()
972 if ch in ("=", "!"):
973 # (?(?<=... or (?(?<!...: lookbehind conditional.
974 return parse_lookaround_conditional(source, info, True, ch ==
975 "=")
976
977 source.pos = saved_pos
978 raise error("expected lookaround conditional", source.string,
979 source.pos)
980
981 source.pos = saved_pos
982 try:
983 group = parse_name(source, True)
984 source.expect(")")
985 yes_branch = parse_sequence(source, info)
986 if source.match("|"):
987 no_branch = parse_sequence(source, info)
988 else:
989 no_branch = Sequence()
990
991 source.expect(")")
992 finally:
993 info.flags = saved_flags
994 source.ignore_space = bool(info.flags & VERBOSE)
995
996 if yes_branch.is_empty() and no_branch.is_empty():
997 return Sequence()
998
999 return Conditional(info, group, yes_branch, no_branch, saved_pos)
1000
1001def parse_lookaround_conditional(source, info, behind, positive):
1002 saved_flags = info.flags
1003 try:
1004 subpattern = _parse_pattern(source, info)
1005 source.expect(")")
1006 finally:
1007 info.flags = saved_flags
1008 source.ignore_space = bool(info.flags & VERBOSE)
1009
1010 yes_branch = parse_sequence(source, info)
1011 if source.match("|"):
1012 no_branch = parse_sequence(source, info)
1013 else:
1014 no_branch = Sequence()
1015
1016 source.expect(")")
1017
1018 return LookAroundConditional(behind, positive, subpattern, yes_branch,
1019 no_branch)
1020
1021def parse_atomic(source, info):
1022 "Parses an atomic subpattern."
1023 saved_flags = info.flags
1024 try:
1025 subpattern = _parse_pattern(source, info)
1026 source.expect(")")
1027 finally:
1028 info.flags = saved_flags
1029 source.ignore_space = bool(info.flags & VERBOSE)
1030
1031 return Atomic(subpattern)
1032
1033def parse_common(source, info):
1034 "Parses a common groups branch."
1035 # Capture group numbers in different branches can reuse the group numbers.
1036 initial_group_count = info.group_count
1037 branches = [parse_sequence(source, info)]
1038 final_group_count = info.group_count
1039 while source.match("|"):
1040 info.group_count = initial_group_count
1041 branches.append(parse_sequence(source, info))
1042 final_group_count = max(final_group_count, info.group_count)
1043
1044 info.group_count = final_group_count
1045 source.expect(")")
1046
1047 if len(branches) == 1:
1048 return branches[0]
1049 return Branch(branches)
1050
1051def parse_call_group(source, info, ch, pos):
1052 "Parses a call to a group."
1053 if ch == "R":
1054 group = "0"
1055 else:
1056 group = ch + source.get_while(DIGITS)
1057
1058 source.expect(")")
1059
1060 return CallGroup(info, group, pos)
1061
1062def parse_call_named_group(source, info, pos):
1063 "Parses a call to a named group."
1064 group = parse_name(source)
1065 source.expect(")")
1066
1067 return CallGroup(info, group, pos)
1068
1069def parse_flag_set(source):
1070 "Parses a set of inline flags."
1071 flags = 0
1072
1073 try:
1074 while True:
1075 saved_pos = source.pos
1076 ch = source.get()
1077 if ch == "V":
1078 ch += source.get()
1079 flags |= REGEX_FLAGS[ch]
1080 except KeyError:
1081 source.pos = saved_pos
1082
1083 return flags
1084
1085def parse_flags(source, info):
1086 "Parses flags being turned on/off."
1087 flags_on = parse_flag_set(source)
1088 if source.match("-"):
1089 flags_off = parse_flag_set(source)
1090 if not flags_off:
1091 raise error("bad inline flags: no flags after '-'", source.string,
1092 source.pos)
1093 else:
1094 flags_off = 0
1095
1096 if flags_on & LOCALE:
1097 # Remember that this pattern as an inline locale flag.
1098 info.inline_locale = True
1099
1100 return flags_on, flags_off
1101
1102def parse_subpattern(source, info, flags_on, flags_off):
1103 "Parses a subpattern with scoped flags."
1104 saved_flags = info.flags
1105 info.flags = (info.flags | flags_on) & ~flags_off
1106 source.ignore_space = bool(info.flags & VERBOSE)
1107 try:
1108 subpattern = _parse_pattern(source, info)
1109 source.expect(")")
1110 finally:
1111 info.flags = saved_flags
1112 source.ignore_space = bool(info.flags & VERBOSE)
1113
1114 return subpattern
1115
1116def parse_flags_subpattern(source, info):
1117 """Parses a flags subpattern. It could be inline flags or a subpattern
1118 possibly with local flags. If it's a subpattern, then that's returned;
1119 if it's a inline flags, then None is returned.
1120 """
1121 flags_on, flags_off = parse_flags(source, info)
1122
1123 if flags_off & GLOBAL_FLAGS:
1124 raise error("bad inline flags: cannot turn off global flag",
1125 source.string, source.pos)
1126
1127 if flags_on & flags_off:
1128 raise error("bad inline flags: flag turned on and off", source.string,
1129 source.pos)
1130
1131 # Handle flags which are global in all regex behaviours.
1132 new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
1133 if new_global_flags:
1134 info.global_flags |= new_global_flags
1135
1136 # A global has been turned on, so reparse the pattern.
1137 raise _UnscopedFlagSet(info.global_flags)
1138
1139 # Ensure that from now on we have only scoped flags.
1140 flags_on &= ~GLOBAL_FLAGS
1141
1142 if source.match(":"):
1143 return parse_subpattern(source, info, flags_on, flags_off)
1144
1145 if source.match(")"):
1146 parse_positional_flags(source, info, flags_on, flags_off)
1147 return None
1148
1149 raise error("unknown extension", source.string, source.pos)
1150
1151def parse_positional_flags(source, info, flags_on, flags_off):
1152 "Parses positional flags."
1153 info.flags = (info.flags | flags_on) & ~flags_off
1154 source.ignore_space = bool(info.flags & VERBOSE)
1155
1156def parse_name(source, allow_numeric=False, allow_group_0=False):
1157 "Parses a name."
1158 name = source.get_while(set(")>"), include=False)
1159
1160 if not name:
1161 raise error("missing group name", source.string, source.pos)
1162
1163 if name.isdigit():
1164 min_group = 0 if allow_group_0 else 1
1165 if not allow_numeric or int(name) < min_group:
1166 raise error("bad character in group name", source.string,
1167 source.pos)
1168 else:
1169 if not name.isidentifier():
1170 raise error("bad character in group name", source.string,
1171 source.pos)
1172
1173 return name
1174
1175def is_octal(string):
1176 "Checks whether a string is octal."
1177 return all(ch in OCT_DIGITS for ch in string)
1178
1179def is_decimal(string):
1180 "Checks whether a string is decimal."
1181 return all(ch in DIGITS for ch in string)
1182
1183def is_hexadecimal(string):
1184 "Checks whether a string is hexadecimal."
1185 return all(ch in HEX_DIGITS for ch in string)
1186
1187def parse_escape(source, info, in_set):
1188 "Parses an escape sequence."
1189 saved_ignore = source.ignore_space
1190 source.ignore_space = False
1191 ch = source.get()
1192 source.ignore_space = saved_ignore
1193 if not ch:
1194 # A backslash at the end of the pattern.
1195 raise error("bad escape (end of pattern)", source.string, source.pos)
1196 if ch in HEX_ESCAPES:
1197 # A hexadecimal escape sequence.
1198 return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
1199 elif ch == "g" and not in_set:
1200 # A group reference.
1201 saved_pos = source.pos
1202 try:
1203 return parse_group_ref(source, info)
1204 except error:
1205 # Invalid as a group reference, so assume it's a literal.
1206 source.pos = saved_pos
1207
1208 return make_character(info, ord(ch), in_set)
1209 elif ch == "G" and not in_set:
1210 # A search anchor.
1211 return SearchAnchor()
1212 elif ch == "L" and not in_set:
1213 # A string set.
1214 return parse_string_set(source, info)
1215 elif ch == "N":
1216 # A named codepoint.
1217 return parse_named_char(source, info, in_set)
1218 elif ch in "pP":
1219 # A Unicode property, positive or negative.
1220 return parse_property(source, info, ch == "p", in_set)
1221 elif ch == "R" and not in_set:
1222 # A line ending.
1223 charset = [0x0A, 0x0B, 0x0C, 0x0D]
1224 if info.guess_encoding == UNICODE:
1225 charset.extend([0x85, 0x2028, 0x2029])
1226
1227 return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
1228 for c in charset])]))
1229 elif ch == "X" and not in_set:
1230 # A grapheme cluster.
1231 return Grapheme()
1232 elif ch in ALPHA:
1233 # An alphabetic escape sequence.
1234 # Positional escapes aren't allowed inside a character set.
1235 if not in_set:
1236 if info.flags & WORD:
1237 value = WORD_POSITION_ESCAPES.get(ch)
1238 else:
1239 value = POSITION_ESCAPES.get(ch)
1240
1241 if value:
1242 return value
1243
1244 value = CHARSET_ESCAPES.get(ch)
1245 if value:
1246 return value
1247
1248 value = CHARACTER_ESCAPES.get(ch)
1249 if value:
1250 return Character(ord(value))
1251
1252 raise error("bad escape \\%s" % ch, source.string, source.pos)
1253 elif ch in DIGITS:
1254 # A numeric escape sequence.
1255 return parse_numeric_escape(source, info, ch, in_set)
1256 else:
1257 # A literal.
1258 return make_character(info, ord(ch), in_set)
1259
1260def parse_numeric_escape(source, info, ch, in_set):
1261 "Parses a numeric escape sequence."
1262 if in_set or ch == "0":
1263 # Octal escape sequence, max 3 digits.
1264 return parse_octal_escape(source, info, [ch], in_set)
1265
1266 # At least 1 digit, so either octal escape or group.
1267 digits = ch
1268 saved_pos = source.pos
1269 ch = source.get()
1270 if ch in DIGITS:
1271 # At least 2 digits, so either octal escape or group.
1272 digits += ch
1273 saved_pos = source.pos
1274 ch = source.get()
1275 if is_octal(digits) and ch in OCT_DIGITS:
1276 # 3 octal digits, so octal escape sequence.
1277 encoding = info.flags & _ALL_ENCODINGS
1278 if encoding == ASCII or encoding == LOCALE:
1279 octal_mask = 0xFF
1280 else:
1281 octal_mask = 0x1FF
1282
1283 value = int(digits + ch, 8) & octal_mask
1284 return make_character(info, value)
1285
1286 # Group reference.
1287 source.pos = saved_pos
1288 if info.is_open_group(digits):
1289 raise error("cannot refer to an open group", source.string, source.pos)
1290
1291 return make_ref_group(info, digits, source.pos)
1292
1293def parse_octal_escape(source, info, digits, in_set):
1294 "Parses an octal escape sequence."
1295 saved_pos = source.pos
1296 ch = source.get()
1297 while len(digits) < 3 and ch in OCT_DIGITS:
1298 digits.append(ch)
1299 saved_pos = source.pos
1300 ch = source.get()
1301
1302 source.pos = saved_pos
1303 try:
1304 value = int("".join(digits), 8)
1305 return make_character(info, value, in_set)
1306 except ValueError:
1307 if digits[0] in OCT_DIGITS:
1308 raise error("incomplete escape \\%s" % ''.join(digits),
1309 source.string, source.pos)
1310 else:
1311 raise error("bad escape \\%s" % digits[0], source.string,
1312 source.pos)
1313
1314def parse_hex_escape(source, info, esc, expected_len, in_set, type):
1315 "Parses a hex escape sequence."
1316 saved_pos = source.pos
1317 digits = []
1318 for i in range(expected_len):
1319 ch = source.get()
1320 if ch not in HEX_DIGITS:
1321 raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
1322 source.string, saved_pos)
1323 digits.append(ch)
1324
1325 try:
1326 value = int("".join(digits), 16)
1327 except ValueError:
1328 pass
1329 else:
1330 if value < 0x110000:
1331 return make_character(info, value, in_set)
1332
1333 # Bad hex escape.
1334 raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
1335 source.string, saved_pos)
1336
1337def parse_group_ref(source, info):
1338 "Parses a group reference."
1339 source.expect("<")
1340 saved_pos = source.pos
1341 name = parse_name(source, True)
1342 source.expect(">")
1343 if info.is_open_group(name):
1344 raise error("cannot refer to an open group", source.string, source.pos)
1345
1346 return make_ref_group(info, name, saved_pos)
1347
1348def parse_string_set(source, info):
1349 "Parses a string set reference."
1350 source.expect("<")
1351 name = parse_name(source, True)
1352 source.expect(">")
1353 if name is None or name not in info.kwargs:
1354 raise error("undefined named list", source.string, source.pos)
1355
1356 return make_string_set(info, name)
1357
1358def parse_named_char(source, info, in_set):
1359 "Parses a named character."
1360 saved_pos = source.pos
1361 if source.match("{"):
1362 name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
1363 if source.match("}"):
1364 try:
1365 value = unicodedata.lookup(name)
1366 return make_character(info, ord(value), in_set)
1367 except KeyError:
1368 raise error("undefined character name", source.string,
1369 source.pos)
1370
1371 source.pos = saved_pos
1372 return make_character(info, ord("N"), in_set)
1373
1374def parse_property(source, info, positive, in_set):
1375 "Parses a Unicode property."
1376 saved_pos = source.pos
1377 ch = source.get()
1378 if ch == "{":
1379 negate = source.match("^")
1380 prop_name, name = parse_property_name(source)
1381 if source.match("}"):
1382 # It's correctly delimited.
1383 prop = lookup_property(prop_name, name, positive != negate, source)
1384 return make_property(info, prop, in_set)
1385 elif ch and ch in "CLMNPSZ":
1386 # An abbreviated property, eg \pL.
1387 prop = lookup_property(None, ch, positive, source)
1388 return make_property(info, prop, in_set)
1389
1390 # Not a property, so treat as a literal "p" or "P".
1391 source.pos = saved_pos
1392 ch = "p" if positive else "P"
1393 return make_character(info, ord(ch), in_set)
1394
1395def parse_property_name(source):
1396 "Parses a property name, which may be qualified."
1397 name = source.get_while(PROPERTY_NAME_PART)
1398 saved_pos = source.pos
1399
1400 ch = source.get()
1401 if ch and ch in ":=":
1402 prop_name = name
1403 name = source.get_while(ALNUM | set(" &_-./")).strip()
1404
1405 if name:
1406 # Name after the ":" or "=", so it's a qualified name.
1407 saved_pos = source.pos
1408 else:
1409 # No name after the ":" or "=", so assume it's an unqualified name.
1410 prop_name, name = None, prop_name
1411 else:
1412 prop_name = None
1413
1414 source.pos = saved_pos
1415 return prop_name, name
1416
1417def parse_set(source, info):
1418 "Parses a character set."
1419 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
1420
1421 saved_ignore = source.ignore_space
1422 source.ignore_space = False
1423 # Negative set?
1424 negate = source.match("^")
1425 try:
1426 if version == VERSION0:
1427 item = parse_set_imp_union(source, info)
1428 else:
1429 item = parse_set_union(source, info)
1430
1431 if not source.match("]"):
1432 raise error("missing ]", source.string, source.pos)
1433 finally:
1434 source.ignore_space = saved_ignore
1435
1436 if negate:
1437 item = item.with_flags(positive=not item.positive)
1438
1439 item = item.with_flags(case_flags=make_case_flags(info))
1440
1441 return item
1442
1443def parse_set_union(source, info):
1444 "Parses a set union ([x||y])."
1445 items = [parse_set_symm_diff(source, info)]
1446 while source.match("||"):
1447 items.append(parse_set_symm_diff(source, info))
1448
1449 if len(items) == 1:
1450 return items[0]
1451 return SetUnion(info, items)
1452
1453def parse_set_symm_diff(source, info):
1454 "Parses a set symmetric difference ([x~~y])."
1455 items = [parse_set_inter(source, info)]
1456 while source.match("~~"):
1457 items.append(parse_set_inter(source, info))
1458
1459 if len(items) == 1:
1460 return items[0]
1461 return SetSymDiff(info, items)
1462
1463def parse_set_inter(source, info):
1464 "Parses a set intersection ([x&&y])."
1465 items = [parse_set_diff(source, info)]
1466 while source.match("&&"):
1467 items.append(parse_set_diff(source, info))
1468
1469 if len(items) == 1:
1470 return items[0]
1471 return SetInter(info, items)
1472
1473def parse_set_diff(source, info):
1474 "Parses a set difference ([x--y])."
1475 items = [parse_set_imp_union(source, info)]
1476 while source.match("--"):
1477 items.append(parse_set_imp_union(source, info))
1478
1479 if len(items) == 1:
1480 return items[0]
1481 return SetDiff(info, items)
1482
1483def parse_set_imp_union(source, info):
1484 "Parses a set implicit union ([xy])."
1485 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
1486
1487 items = [parse_set_member(source, info)]
1488 while True:
1489 saved_pos = source.pos
1490 if source.match("]"):
1491 # End of the set.
1492 source.pos = saved_pos
1493 break
1494
1495 if version == VERSION1 and any(source.match(op) for op in SET_OPS):
1496 # The new behaviour has set operators.
1497 source.pos = saved_pos
1498 break
1499
1500 items.append(parse_set_member(source, info))
1501
1502 if len(items) == 1:
1503 return items[0]
1504 return SetUnion(info, items)
1505
1506def parse_set_member(source, info):
1507 "Parses a member in a character set."
1508 # Parse a set item.
1509 start = parse_set_item(source, info)
1510 saved_pos1 = source.pos
1511 if (not isinstance(start, Character) or not start.positive or not
1512 source.match("-")):
1513 # It's not the start of a range.
1514 return start
1515
1516 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
1517
1518 # It looks like the start of a range of characters.
1519 saved_pos2 = source.pos
1520 if version == VERSION1 and source.match("-"):
1521 # It's actually the set difference operator '--', so return the
1522 # character.
1523 source.pos = saved_pos1
1524 return start
1525
1526 if source.match("]"):
1527 # We've reached the end of the set, so return both the character and
1528 # hyphen.
1529 source.pos = saved_pos2
1530 return SetUnion(info, [start, Character(ord("-"))])
1531
1532 # Parse a set item.
1533 end = parse_set_item(source, info)
1534 if not isinstance(end, Character) or not end.positive:
1535 # It's not a range, so return the character, hyphen and property.
1536 return SetUnion(info, [start, Character(ord("-")), end])
1537
1538 # It _is_ a range.
1539 if start.value > end.value:
1540 raise error("bad character range", source.string, source.pos)
1541
1542 if start.value == end.value:
1543 return start
1544
1545 return Range(start.value, end.value)
1546
1547def parse_set_item(source, info):
1548 "Parses an item in a character set."
1549 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
1550
1551 if source.match("\\"):
1552 # An escape sequence in a set.
1553 return parse_escape(source, info, True)
1554
1555 saved_pos = source.pos
1556 if source.match("[:"):
1557 # Looks like a POSIX character class.
1558 try:
1559 return parse_posix_class(source, info)
1560 except ParseError:
1561 # Not a POSIX character class.
1562 source.pos = saved_pos
1563
1564 if version == VERSION1 and source.match("["):
1565 # It's the start of a nested set.
1566
1567 # Negative set?
1568 negate = source.match("^")
1569 item = parse_set_union(source, info)
1570
1571 if not source.match("]"):
1572 raise error("missing ]", source.string, source.pos)
1573
1574 if negate:
1575 item = item.with_flags(positive=not item.positive)
1576
1577 return item
1578
1579 ch = source.get()
1580 if not ch:
1581 raise error("unterminated character set", source.string, source.pos)
1582
1583 return Character(ord(ch))
1584
1585def parse_posix_class(source, info):
1586 "Parses a POSIX character class."
1587 negate = source.match("^")
1588 prop_name, name = parse_property_name(source)
1589 if not source.match(":]"):
1590 raise ParseError()
1591
1592 return lookup_property(prop_name, name, not negate, source, posix=True)
1593
1594def float_to_rational(flt):
1595 "Converts a float to a rational pair."
1596 int_part = int(flt)
1597 error = flt - int_part
1598 if abs(error) < 0.0001:
1599 return int_part, 1
1600
1601 den, num = float_to_rational(1.0 / error)
1602
1603 return int_part * den + num, den
1604
1605def numeric_to_rational(numeric):
1606 "Converts a numeric string to a rational string, if possible."
1607 if numeric[ : 1] == "-":
1608 sign, numeric = numeric[0], numeric[1 : ]
1609 else:
1610 sign = ""
1611
1612 parts = numeric.split("/")
1613 if len(parts) == 2:
1614 num, den = float_to_rational(float(parts[0]) / float(parts[1]))
1615 elif len(parts) == 1:
1616 num, den = float_to_rational(float(parts[0]))
1617 else:
1618 raise ValueError()
1619
1620 result = "{}{}/{}".format(sign, num, den)
1621 if result.endswith("/1"):
1622 return result[ : -2]
1623
1624 return result
1625
1626def standardise_name(name):
1627 "Standardises a property or value name."
1628 try:
1629 return numeric_to_rational("".join(name))
1630 except (ValueError, ZeroDivisionError):
1631 return "".join(ch for ch in name if ch not in "_- ").upper()
1632
1633_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())
1634
1635_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
1636
1637def lookup_property(property, value, positive, source=None, posix=False):
1638 "Looks up a property."
1639 # Normalise the names (which may still be lists).
1640 property = standardise_name(property) if property else None
1641 value = standardise_name(value)
1642
1643 if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
1644 property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive
1645
1646 if posix and not property and value.upper() in _POSIX_CLASSES:
1647 value = 'POSIX' + value
1648
1649 if property:
1650 # Both the property and the value are provided.
1651 prop = PROPERTIES.get(property)
1652 if not prop:
1653 if not source:
1654 raise error("unknown property")
1655
1656 raise error("unknown property", source.string, source.pos)
1657
1658 prop_id, value_dict = prop
1659 val_id = value_dict.get(value)
1660 if val_id is None:
1661 if not source:
1662 raise error("unknown property value")
1663
1664 raise error("unknown property value", source.string, source.pos)
1665
1666 return Property((prop_id << 16) | val_id, positive)
1667
1668 # Only the value is provided.
1669 # It might be the name of a GC, script or block value.
1670 for property in ("GC", "SCRIPT", "BLOCK"):
1671 prop_id, value_dict = PROPERTIES.get(property)
1672 val_id = value_dict.get(value)
1673 if val_id is not None:
1674 return Property((prop_id << 16) | val_id, positive)
1675
1676 # It might be the name of a binary property.
1677 prop = PROPERTIES.get(value)
1678 if prop:
1679 prop_id, value_dict = prop
1680 if set(value_dict) == _BINARY_VALUES:
1681 return Property((prop_id << 16) | 1, positive)
1682
1683 return Property(prop_id << 16, not positive)
1684
1685 # It might be the name of a binary property starting with a prefix.
1686 if value.startswith("IS"):
1687 prop = PROPERTIES.get(value[2 : ])
1688 if prop:
1689 prop_id, value_dict = prop
1690 if "YES" in value_dict:
1691 return Property((prop_id << 16) | 1, positive)
1692
1693 # It might be the name of a script or block starting with a prefix.
1694 for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
1695 if value.startswith(prefix):
1696 prop_id, value_dict = PROPERTIES.get(property)
1697 val_id = value_dict.get(value[2 : ])
1698 if val_id is not None:
1699 return Property((prop_id << 16) | val_id, positive)
1700
1701 # Unknown property.
1702 if not source:
1703 raise error("unknown property")
1704
1705 raise error("unknown property", source.string, source.pos)
1706
1707def _compile_replacement(source, pattern, is_unicode):
1708 "Compiles a replacement template escape sequence."
1709 ch = source.get()
1710 if ch in ALPHA:
1711 # An alphabetic escape sequence.
1712 value = CHARACTER_ESCAPES.get(ch)
1713 if value:
1714 return False, [ord(value)]
1715
1716 if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
1717 # A hexadecimal escape sequence.
1718 return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]
1719
1720 if ch == "g":
1721 # A group preference.
1722 return True, [compile_repl_group(source, pattern)]
1723
1724 if ch == "N" and is_unicode:
1725 # A named character.
1726 value = parse_repl_named_char(source)
1727 if value is not None:
1728 return False, [value]
1729
1730 raise error("bad escape \\%s" % ch, source.string, source.pos)
1731
1732 if isinstance(source.sep, bytes):
1733 octal_mask = 0xFF
1734 else:
1735 octal_mask = 0x1FF
1736
1737 if ch == "0":
1738 # An octal escape sequence.
1739 digits = ch
1740 while len(digits) < 3:
1741 saved_pos = source.pos
1742 ch = source.get()
1743 if ch not in OCT_DIGITS:
1744 source.pos = saved_pos
1745 break
1746 digits += ch
1747
1748 return False, [int(digits, 8) & octal_mask]
1749
1750 if ch in DIGITS:
1751 # Either an octal escape sequence (3 digits) or a group reference (max
1752 # 2 digits).
1753 digits = ch
1754 saved_pos = source.pos
1755 ch = source.get()
1756 if ch in DIGITS:
1757 digits += ch
1758 saved_pos = source.pos
1759 ch = source.get()
1760 if ch and is_octal(digits + ch):
1761 # An octal escape sequence.
1762 return False, [int(digits + ch, 8) & octal_mask]
1763
1764 # A group reference.
1765 source.pos = saved_pos
1766 return True, [int(digits)]
1767
1768 if ch == "\\":
1769 # An escaped backslash is a backslash.
1770 return False, [ord("\\")]
1771
1772 if not ch:
1773 # A trailing backslash.
1774 raise error("bad escape (end of pattern)", source.string, source.pos)
1775
1776 # An escaped non-backslash is a backslash followed by the literal.
1777 return False, [ord("\\"), ord(ch)]
1778
1779def parse_repl_hex_escape(source, expected_len, type):
1780 "Parses a hex escape sequence in a replacement string."
1781 digits = []
1782 for i in range(expected_len):
1783 ch = source.get()
1784 if ch not in HEX_DIGITS:
1785 raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
1786 source.string, source.pos)
1787 digits.append(ch)
1788
1789 return int("".join(digits), 16)
1790
1791def parse_repl_named_char(source):
1792 "Parses a named character in a replacement string."
1793 saved_pos = source.pos
1794 if source.match("{"):
1795 name = source.get_while(ALPHA | set(" "))
1796
1797 if source.match("}"):
1798 try:
1799 value = unicodedata.lookup(name)
1800 return ord(value)
1801 except KeyError:
1802 raise error("undefined character name", source.string,
1803 source.pos)
1804
1805 source.pos = saved_pos
1806 return None
1807
1808def compile_repl_group(source, pattern):
1809 "Compiles a replacement template group reference."
1810 source.expect("<")
1811 name = parse_name(source, True, True)
1812
1813 source.expect(">")
1814 if name.isdigit():
1815 index = int(name)
1816 if not 0 <= index <= pattern.groups:
1817 raise error("invalid group reference", source.string, source.pos)
1818
1819 return index
1820
1821 try:
1822 return pattern.groupindex[name]
1823 except KeyError:
1824 raise IndexError("unknown group")
1825
1826# The regular expression is parsed into a syntax tree. The different types of
1827# node are defined below.
1828
1829INDENT = " "
1830POSITIVE_OP = 0x1
1831ZEROWIDTH_OP = 0x2
1832FUZZY_OP = 0x4
1833REVERSE_OP = 0x8
1834REQUIRED_OP = 0x10
1835
1836POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
1837CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
1838 FULLIGNORECASE: " FULL_IGNORE_CASE"}
1839
1840def make_sequence(items):
1841 if len(items) == 1:
1842 return items[0]
1843 return Sequence(items)
1844
1845# Common base class for all nodes.
1846class RegexBase:
1847 def __init__(self):
1848 self._key = self.__class__
1849
1850 def with_flags(self, positive=None, case_flags=None, zerowidth=None):
1851 if positive is None:
1852 positive = self.positive
1853 else:
1854 positive = bool(positive)
1855 if case_flags is None:
1856 case_flags = self.case_flags
1857 else:
1858 case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
1859 if zerowidth is None:
1860 zerowidth = self.zerowidth
1861 else:
1862 zerowidth = bool(zerowidth)
1863
1864 if (positive == self.positive and case_flags == self.case_flags and
1865 zerowidth == self.zerowidth):
1866 return self
1867
1868 return self.rebuild(positive, case_flags, zerowidth)
1869
1870 def fix_groups(self, pattern, reverse, fuzzy):
1871 pass
1872
1873 def optimise(self, info, reverse):
1874 return self
1875
1876 def pack_characters(self, info):
1877 return self
1878
1879 def remove_captures(self):
1880 return self
1881
1882 def is_atomic(self):
1883 return True
1884
1885 def can_be_affix(self):
1886 return True
1887
1888 def contains_group(self):
1889 return False
1890
1891 def get_firstset(self, reverse):
1892 raise _FirstSetError()
1893
1894 def has_simple_start(self):
1895 return False
1896
1897 def compile(self, reverse=False, fuzzy=False):
1898 return self._compile(reverse, fuzzy)
1899
1900 def is_empty(self):
1901 return False
1902
1903 def __hash__(self):
1904 return hash(self._key)
1905
1906 def __eq__(self, other):
1907 return type(self) is type(other) and self._key == other._key
1908
1909 def __ne__(self, other):
1910 return not self.__eq__(other)
1911
1912 def get_required_string(self, reverse):
1913 return self.max_width(), None
1914
1915# Base class for zero-width nodes.
1916class ZeroWidthBase(RegexBase):
1917 def __init__(self, positive=True):
1918 RegexBase.__init__(self)
1919 self.positive = bool(positive)
1920
1921 self._key = self.__class__, self.positive
1922
1923 def get_firstset(self, reverse):
1924 return set([None])
1925
1926 def _compile(self, reverse, fuzzy):
1927 flags = 0
1928 if self.positive:
1929 flags |= POSITIVE_OP
1930 if fuzzy:
1931 flags |= FUZZY_OP
1932 if reverse:
1933 flags |= REVERSE_OP
1934 return [(self._opcode, flags)]
1935
1936 def dump(self, indent, reverse):
1937 print("{}{} {}".format(INDENT * indent, self._op_name,
1938 POS_TEXT[self.positive]))
1939
1940 def max_width(self):
1941 return 0
1942
1943class Any(RegexBase):
1944 _opcode = {False: OP.ANY, True: OP.ANY_REV}
1945 _op_name = "ANY"
1946
1947 def has_simple_start(self):
1948 return True
1949
1950 def _compile(self, reverse, fuzzy):
1951 flags = 0
1952 if fuzzy:
1953 flags |= FUZZY_OP
1954 return [(self._opcode[reverse], flags)]
1955
1956 def dump(self, indent, reverse):
1957 print("{}{}".format(INDENT * indent, self._op_name))
1958
1959 def max_width(self):
1960 return 1
1961
1962class AnyAll(Any):
1963 _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
1964 _op_name = "ANY_ALL"
1965
1966class AnyU(Any):
1967 _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
1968 _op_name = "ANY_U"
1969
1970class Atomic(RegexBase):
1971 def __init__(self, subpattern):
1972 RegexBase.__init__(self)
1973 self.subpattern = subpattern
1974
1975 def fix_groups(self, pattern, reverse, fuzzy):
1976 self.subpattern.fix_groups(pattern, reverse, fuzzy)
1977
1978 def optimise(self, info, reverse):
1979 self.subpattern = self.subpattern.optimise(info, reverse)
1980
1981 if self.subpattern.is_empty():
1982 return self.subpattern
1983 return self
1984
1985 def pack_characters(self, info):
1986 self.subpattern = self.subpattern.pack_characters(info)
1987 return self
1988
1989 def remove_captures(self):
1990 self.subpattern = self.subpattern.remove_captures()
1991 return self
1992
1993 def can_be_affix(self):
1994 return self.subpattern.can_be_affix()
1995
1996 def contains_group(self):
1997 return self.subpattern.contains_group()
1998
1999 def get_firstset(self, reverse):
2000 return self.subpattern.get_firstset(reverse)
2001
2002 def has_simple_start(self):
2003 return self.subpattern.has_simple_start()
2004
2005 def _compile(self, reverse, fuzzy):
2006 return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
2007 [(OP.END, )])
2008
2009 def dump(self, indent, reverse):
2010 print("{}ATOMIC".format(INDENT * indent))
2011 self.subpattern.dump(indent + 1, reverse)
2012
2013 def is_empty(self):
2014 return self.subpattern.is_empty()
2015
2016 def __eq__(self, other):
2017 return (type(self) is type(other) and self.subpattern ==
2018 other.subpattern)
2019
2020 def max_width(self):
2021 return self.subpattern.max_width()
2022
2023 def get_required_string(self, reverse):
2024 return self.subpattern.get_required_string(reverse)
2025
2026class Boundary(ZeroWidthBase):
2027 _opcode = OP.BOUNDARY
2028 _op_name = "BOUNDARY"
2029
2030class Branch(RegexBase):
2031 def __init__(self, branches):
2032 RegexBase.__init__(self)
2033 self.branches = branches
2034
2035 def fix_groups(self, pattern, reverse, fuzzy):
2036 for b in self.branches:
2037 b.fix_groups(pattern, reverse, fuzzy)
2038
2039 def optimise(self, info, reverse):
2040 if not self.branches:
2041 return Sequence([])
2042
2043 # Flatten branches within branches.
2044 branches = Branch._flatten_branches(info, reverse, self.branches)
2045
2046 # Move any common prefix or suffix out of the branches.
2047 if reverse:
2048 suffix, branches = Branch._split_common_suffix(info, branches)
2049 prefix = []
2050 else:
2051 prefix, branches = Branch._split_common_prefix(info, branches)
2052 suffix = []
2053
2054 # Try to reduce adjacent single-character branches to sets.
2055 branches = Branch._reduce_to_set(info, reverse, branches)
2056
2057 if len(branches) > 1:
2058 sequence = [Branch(branches)]
2059
2060 if not prefix or not suffix:
2061 # We might be able to add a quick precheck before the branches.
2062 firstset = self._add_precheck(info, reverse, branches)
2063
2064 if firstset:
2065 if reverse:
2066 sequence.append(firstset)
2067 else:
2068 sequence.insert(0, firstset)
2069 else:
2070 sequence = branches
2071
2072 return make_sequence(prefix + sequence + suffix)
2073
2074 def _add_precheck(self, info, reverse, branches):
2075 charset = set()
2076 pos = -1 if reverse else 0
2077
2078 for branch in branches:
2079 if type(branch) is Literal and branch.case_flags == NOCASE:
2080 charset.add(branch.characters[pos])
2081 else:
2082 return
2083
2084 if not charset:
2085 return None
2086
2087 return _check_firstset(info, reverse, [Character(c) for c in charset])
2088
2089 def pack_characters(self, info):
2090 self.branches = [b.pack_characters(info) for b in self.branches]
2091 return self
2092
2093 def remove_captures(self):
2094 self.branches = [b.remove_captures() for b in self.branches]
2095 return self
2096
2097 def is_atomic(self):
2098 return all(b.is_atomic() for b in self.branches)
2099
2100 def can_be_affix(self):
2101 return all(b.can_be_affix() for b in self.branches)
2102
2103 def contains_group(self):
2104 return any(b.contains_group() for b in self.branches)
2105
2106 def get_firstset(self, reverse):
2107 fs = set()
2108 for b in self.branches:
2109 fs |= b.get_firstset(reverse)
2110
2111 return fs or set([None])
2112
2113 def _compile(self, reverse, fuzzy):
2114 if not self.branches:
2115 return []
2116
2117 code = [(OP.BRANCH, )]
2118 for b in self.branches:
2119 code.extend(b.compile(reverse, fuzzy))
2120 code.append((OP.NEXT, ))
2121
2122 code[-1] = (OP.END, )
2123
2124 return code
2125
2126 def dump(self, indent, reverse):
2127 print("{}BRANCH".format(INDENT * indent))
2128 self.branches[0].dump(indent + 1, reverse)
2129 for b in self.branches[1 : ]:
2130 print("{}OR".format(INDENT * indent))
2131 b.dump(indent + 1, reverse)
2132
2133 @staticmethod
2134 def _flatten_branches(info, reverse, branches):
2135 # Flatten the branches so that there aren't branches of branches.
2136 new_branches = []
2137 for b in branches:
2138 b = b.optimise(info, reverse)
2139 if isinstance(b, Branch):
2140 new_branches.extend(b.branches)
2141 else:
2142 new_branches.append(b)
2143
2144 return new_branches
2145
2146 @staticmethod
2147 def _split_common_prefix(info, branches):
2148 # Common leading items can be moved out of the branches.
2149 # Get the items in the branches.
2150 alternatives = []
2151 for b in branches:
2152 if isinstance(b, Sequence):
2153 alternatives.append(b.items)
2154 else:
2155 alternatives.append([b])
2156
2157 # What is the maximum possible length of the prefix?
2158 max_count = min(len(a) for a in alternatives)
2159
2160 # What is the longest common prefix?
2161 prefix = alternatives[0]
2162 pos = 0
2163 end_pos = max_count
2164 while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
2165 prefix[pos] for a in alternatives):
2166 pos += 1
2167 count = pos
2168
2169 if info.flags & UNICODE:
2170 # We need to check that we're not splitting a sequence of
2171 # characters which could form part of full case-folding.
2172 count = pos
2173 while count > 0 and not all(Branch._can_split(a, count) for a in
2174 alternatives):
2175 count -= 1
2176
2177 # No common prefix is possible.
2178 if count == 0:
2179 return [], branches
2180
2181 # Rebuild the branches.
2182 new_branches = []
2183 for a in alternatives:
2184 new_branches.append(make_sequence(a[count : ]))
2185
2186 return prefix[ : count], new_branches
2187
2188 @staticmethod
2189 def _split_common_suffix(info, branches):
2190 # Common trailing items can be moved out of the branches.
2191 # Get the items in the branches.
2192 alternatives = []
2193 for b in branches:
2194 if isinstance(b, Sequence):
2195 alternatives.append(b.items)
2196 else:
2197 alternatives.append([b])
2198
2199 # What is the maximum possible length of the suffix?
2200 max_count = min(len(a) for a in alternatives)
2201
2202 # What is the longest common suffix?
2203 suffix = alternatives[0]
2204 pos = -1
2205 end_pos = -1 - max_count
2206 while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
2207 suffix[pos] for a in alternatives):
2208 pos -= 1
2209 count = -1 - pos
2210
2211 if info.flags & UNICODE:
2212 # We need to check that we're not splitting a sequence of
2213 # characters which could form part of full case-folding.
2214 while count > 0 and not all(Branch._can_split_rev(a, count) for a
2215 in alternatives):
2216 count -= 1
2217
2218 # No common suffix is possible.
2219 if count == 0:
2220 return [], branches
2221
2222 # Rebuild the branches.
2223 new_branches = []
2224 for a in alternatives:
2225 new_branches.append(make_sequence(a[ : -count]))
2226
2227 return suffix[-count : ], new_branches
2228
2229 @staticmethod
2230 def _can_split(items, count):
2231 # Check the characters either side of the proposed split.
2232 if not Branch._is_full_case(items, count - 1):
2233 return True
2234
2235 if not Branch._is_full_case(items, count):
2236 return True
2237
2238 # Check whether a 1-1 split would be OK.
2239 if Branch._is_folded(items[count - 1 : count + 1]):
2240 return False
2241
2242 # Check whether a 1-2 split would be OK.
2243 if (Branch._is_full_case(items, count + 2) and
2244 Branch._is_folded(items[count - 1 : count + 2])):
2245 return False
2246
2247 # Check whether a 2-1 split would be OK.
2248 if (Branch._is_full_case(items, count - 2) and
2249 Branch._is_folded(items[count - 2 : count + 1])):
2250 return False
2251
2252 return True
2253
2254 @staticmethod
2255 def _can_split_rev(items, count):
2256 end = len(items)
2257
2258 # Check the characters either side of the proposed split.
2259 if not Branch._is_full_case(items, end - count):
2260 return True
2261
2262 if not Branch._is_full_case(items, end - count - 1):
2263 return True
2264
2265 # Check whether a 1-1 split would be OK.
2266 if Branch._is_folded(items[end - count - 1 : end - count + 1]):
2267 return False
2268
2269 # Check whether a 1-2 split would be OK.
2270 if (Branch._is_full_case(items, end - count + 2) and
2271 Branch._is_folded(items[end - count - 1 : end - count + 2])):
2272 return False
2273
2274 # Check whether a 2-1 split would be OK.
2275 if (Branch._is_full_case(items, end - count - 2) and
2276 Branch._is_folded(items[end - count - 2 : end - count + 1])):
2277 return False
2278
2279 return True
2280
2281 @staticmethod
2282 def _merge_common_prefixes(info, reverse, branches):
2283 # Branches with the same case-sensitive character prefix can be grouped
2284 # together if they are separated only by other branches with a
2285 # character prefix.
2286 prefixed = defaultdict(list)
2287 order = {}
2288 new_branches = []
2289 for b in branches:
2290 if Branch._is_simple_character(b):
2291 # Branch starts with a simple character.
2292 prefixed[b.value].append([b])
2293 order.setdefault(b.value, len(order))
2294 elif (isinstance(b, Sequence) and b.items and
2295 Branch._is_simple_character(b.items[0])):
2296 # Branch starts with a simple character.
2297 prefixed[b.items[0].value].append(b.items)
2298 order.setdefault(b.items[0].value, len(order))
2299 else:
2300 Branch._flush_char_prefix(info, reverse, prefixed, order,
2301 new_branches)
2302
2303 new_branches.append(b)
2304
2305 Branch._flush_char_prefix(info, prefixed, order, new_branches)
2306
2307 return new_branches
2308
2309 @staticmethod
2310 def _is_simple_character(c):
2311 return isinstance(c, Character) and c.positive and not c.case_flags
2312
2313 @staticmethod
2314 def _reduce_to_set(info, reverse, branches):
2315 # Can the branches be reduced to a set?
2316 new_branches = []
2317 items = set()
2318 case_flags = NOCASE
2319 for b in branches:
2320 if isinstance(b, (Character, Property, SetBase)):
2321 # Branch starts with a single character.
2322 if b.case_flags != case_flags:
2323 # Different case sensitivity, so flush.
2324 Branch._flush_set_members(info, reverse, items, case_flags,
2325 new_branches)
2326
2327 case_flags = b.case_flags
2328
2329 items.add(b.with_flags(case_flags=NOCASE))
2330 else:
2331 Branch._flush_set_members(info, reverse, items, case_flags,
2332 new_branches)
2333
2334 new_branches.append(b)
2335
2336 Branch._flush_set_members(info, reverse, items, case_flags,
2337 new_branches)
2338
2339 return new_branches
2340
2341 @staticmethod
2342 def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
2343 # Flush the prefixed branches.
2344 if not prefixed:
2345 return
2346
2347 for value, branches in sorted(prefixed.items(), key=lambda pair:
2348 order[pair[0]]):
2349 if len(branches) == 1:
2350 new_branches.append(make_sequence(branches[0]))
2351 else:
2352 subbranches = []
2353 optional = False
2354 for b in branches:
2355 if len(b) > 1:
2356 subbranches.append(make_sequence(b[1 : ]))
2357 elif not optional:
2358 subbranches.append(Sequence())
2359 optional = True
2360
2361 sequence = Sequence([Character(value), Branch(subbranches)])
2362 new_branches.append(sequence.optimise(info, reverse))
2363
2364 prefixed.clear()
2365 order.clear()
2366
2367 @staticmethod
2368 def _flush_set_members(info, reverse, items, case_flags, new_branches):
2369 # Flush the set members.
2370 if not items:
2371 return
2372
2373 if len(items) == 1:
2374 item = list(items)[0]
2375 else:
2376 item = SetUnion(info, list(items)).optimise(info, reverse)
2377
2378 new_branches.append(item.with_flags(case_flags=case_flags))
2379
2380 items.clear()
2381
2382 @staticmethod
2383 def _is_full_case(items, i):
2384 if not 0 <= i < len(items):
2385 return False
2386
2387 item = items[i]
2388 return (isinstance(item, Character) and item.positive and
2389 (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
2390
2391 @staticmethod
2392 def _is_folded(items):
2393 if len(items) < 2:
2394 return False
2395
2396 for i in items:
2397 if (not isinstance(i, Character) or not i.positive or not
2398 i.case_flags):
2399 return False
2400
2401 folded = "".join(chr(i.value) for i in items)
2402 folded = _regex.fold_case(FULL_CASE_FOLDING, folded)
2403
2404 # Get the characters which expand to multiple codepoints on folding.
2405 expanding_chars = _regex.get_expand_on_folding()
2406
2407 for c in expanding_chars:
2408 if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
2409 return True
2410
2411 return False
2412
2413 def is_empty(self):
2414 return all(b.is_empty() for b in self.branches)
2415
2416 def __eq__(self, other):
2417 return type(self) is type(other) and self.branches == other.branches
2418
2419 def max_width(self):
2420 return max(b.max_width() for b in self.branches)
2421
2422class CallGroup(RegexBase):
2423 def __init__(self, info, group, position):
2424 RegexBase.__init__(self)
2425 self.info = info
2426 self.group = group
2427 self.position = position
2428
2429 self._key = self.__class__, self.group
2430
2431 def fix_groups(self, pattern, reverse, fuzzy):
2432 try:
2433 self.group = int(self.group)
2434 except ValueError:
2435 try:
2436 self.group = self.info.group_index[self.group]
2437 except KeyError:
2438 raise error("invalid group reference", pattern, self.position)
2439
2440 if not 0 <= self.group <= self.info.group_count:
2441 raise error("unknown group", pattern, self.position)
2442
2443 if self.group > 0 and self.info.open_group_count[self.group] > 1:
2444 raise error("ambiguous group reference", pattern, self.position)
2445
2446 self.info.group_calls.append((self, reverse, fuzzy))
2447
2448 self._key = self.__class__, self.group
2449
2450 def remove_captures(self):
2451 raise error("group reference not allowed", pattern, self.position)
2452
2453 def _compile(self, reverse, fuzzy):
2454 return [(OP.GROUP_CALL, self.call_ref)]
2455
2456 def dump(self, indent, reverse):
2457 print("{}GROUP_CALL {}".format(INDENT * indent, self.group))
2458
2459 def __eq__(self, other):
2460 return type(self) is type(other) and self.group == other.group
2461
2462 def max_width(self):
2463 return UNLIMITED
2464
2465 def __del__(self):
2466 self.info = None
2467
2468class CallRef(RegexBase):
2469 def __init__(self, ref, parsed):
2470 self.ref = ref
2471 self.parsed = parsed
2472
2473 def _compile(self, reverse, fuzzy):
2474 return ([(OP.CALL_REF, self.ref)] + self.parsed._compile(reverse,
2475 fuzzy) + [(OP.END, )])
2476
2477class Character(RegexBase):
2478 _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
2479 OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
2480 False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
2481 True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
2482 (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}
2483
2484 def __init__(self, value, positive=True, case_flags=NOCASE,
2485 zerowidth=False):
2486 RegexBase.__init__(self)
2487 self.value = value
2488 self.positive = bool(positive)
2489 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
2490 self.zerowidth = bool(zerowidth)
2491
2492 if (self.positive and (self.case_flags & FULLIGNORECASE) ==
2493 FULLIGNORECASE):
2494 self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
2495 else:
2496 self.folded = chr(self.value)
2497
2498 self._key = (self.__class__, self.value, self.positive,
2499 self.case_flags, self.zerowidth)
2500
2501 def rebuild(self, positive, case_flags, zerowidth):
2502 return Character(self.value, positive, case_flags, zerowidth)
2503
2504 def optimise(self, info, reverse, in_set=False):
2505 return self
2506
2507 def get_firstset(self, reverse):
2508 return set([self])
2509
2510 def has_simple_start(self):
2511 return True
2512
2513 def _compile(self, reverse, fuzzy):
2514 flags = 0
2515 if self.positive:
2516 flags |= POSITIVE_OP
2517 if self.zerowidth:
2518 flags |= ZEROWIDTH_OP
2519 if fuzzy:
2520 flags |= FUZZY_OP
2521
2522 code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
2523 self.value])
2524
2525 if len(self.folded) > 1:
2526 # The character expands on full case-folding.
2527 code = Branch([code, String([ord(c) for c in self.folded],
2528 case_flags=self.case_flags)])
2529
2530 return code.compile(reverse, fuzzy)
2531
2532 def dump(self, indent, reverse):
2533 display = ascii(chr(self.value)).lstrip("bu")
2534 print("{}CHARACTER {} {}{}".format(INDENT * indent,
2535 POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))
2536
2537 def matches(self, ch):
2538 return (ch == self.value) == self.positive
2539
2540 def max_width(self):
2541 return len(self.folded)
2542
2543 def get_required_string(self, reverse):
2544 if not self.positive:
2545 return 1, None
2546
2547 self.folded_characters = tuple(ord(c) for c in self.folded)
2548
2549 return 0, self
2550
2551class Conditional(RegexBase):
2552 def __init__(self, info, group, yes_item, no_item, position):
2553 RegexBase.__init__(self)
2554 self.info = info
2555 self.group = group
2556 self.yes_item = yes_item
2557 self.no_item = no_item
2558 self.position = position
2559
2560 def fix_groups(self, pattern, reverse, fuzzy):
2561 try:
2562 self.group = int(self.group)
2563 except ValueError:
2564 try:
2565 self.group = self.info.group_index[self.group]
2566 except KeyError:
2567 if self.group == 'DEFINE':
2568 # 'DEFINE' is a special name unless there's a group with
2569 # that name.
2570 self.group = 0
2571 else:
2572 raise error("unknown group", pattern, self.position)
2573
2574 if not 0 <= self.group <= self.info.group_count:
2575 raise error("invalid group reference", pattern, self.position)
2576
2577 self.yes_item.fix_groups(pattern, reverse, fuzzy)
2578 self.no_item.fix_groups(pattern, reverse, fuzzy)
2579
2580 def optimise(self, info, reverse):
2581 yes_item = self.yes_item.optimise(info, reverse)
2582 no_item = self.no_item.optimise(info, reverse)
2583
2584 return Conditional(info, self.group, yes_item, no_item, self.position)
2585
2586 def pack_characters(self, info):
2587 self.yes_item = self.yes_item.pack_characters(info)
2588 self.no_item = self.no_item.pack_characters(info)
2589 return self
2590
2591 def remove_captures(self):
2592 self.yes_item = self.yes_item.remove_captures()
2593 self.no_item = self.no_item.remove_captures()
2594
2595 def is_atomic(self):
2596 return self.yes_item.is_atomic() and self.no_item.is_atomic()
2597
2598 def can_be_affix(self):
2599 return self.yes_item.can_be_affix() and self.no_item.can_be_affix()
2600
2601 def contains_group(self):
2602 return self.yes_item.contains_group() or self.no_item.contains_group()
2603
2604 def get_firstset(self, reverse):
2605 return (self.yes_item.get_firstset(reverse) |
2606 self.no_item.get_firstset(reverse))
2607
2608 def _compile(self, reverse, fuzzy):
2609 code = [(OP.GROUP_EXISTS, self.group)]
2610 code.extend(self.yes_item.compile(reverse, fuzzy))
2611 add_code = self.no_item.compile(reverse, fuzzy)
2612 if add_code:
2613 code.append((OP.NEXT, ))
2614 code.extend(add_code)
2615
2616 code.append((OP.END, ))
2617
2618 return code
2619
2620 def dump(self, indent, reverse):
2621 print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
2622 self.yes_item.dump(indent + 1, reverse)
2623 if not self.no_item.is_empty():
2624 print("{}OR".format(INDENT * indent))
2625 self.no_item.dump(indent + 1, reverse)
2626
2627 def is_empty(self):
2628 return self.yes_item.is_empty() and self.no_item.is_empty()
2629
2630 def __eq__(self, other):
2631 return type(self) is type(other) and (self.group, self.yes_item,
2632 self.no_item) == (other.group, other.yes_item, other.no_item)
2633
2634 def max_width(self):
2635 return max(self.yes_item.max_width(), self.no_item.max_width())
2636
2637 def __del__(self):
2638 self.info = None
2639
2640class DefaultBoundary(ZeroWidthBase):
2641 _opcode = OP.DEFAULT_BOUNDARY
2642 _op_name = "DEFAULT_BOUNDARY"
2643
2644class DefaultEndOfWord(ZeroWidthBase):
2645 _opcode = OP.DEFAULT_END_OF_WORD
2646 _op_name = "DEFAULT_END_OF_WORD"
2647
2648class DefaultStartOfWord(ZeroWidthBase):
2649 _opcode = OP.DEFAULT_START_OF_WORD
2650 _op_name = "DEFAULT_START_OF_WORD"
2651
2652class EndOfLine(ZeroWidthBase):
2653 _opcode = OP.END_OF_LINE
2654 _op_name = "END_OF_LINE"
2655
2656class EndOfLineU(EndOfLine):
2657 _opcode = OP.END_OF_LINE_U
2658 _op_name = "END_OF_LINE_U"
2659
2660class EndOfString(ZeroWidthBase):
2661 _opcode = OP.END_OF_STRING
2662 _op_name = "END_OF_STRING"
2663
2664class EndOfStringLine(ZeroWidthBase):
2665 _opcode = OP.END_OF_STRING_LINE
2666 _op_name = "END_OF_STRING_LINE"
2667
2668class EndOfStringLineU(EndOfStringLine):
2669 _opcode = OP.END_OF_STRING_LINE_U
2670 _op_name = "END_OF_STRING_LINE_U"
2671
2672class EndOfWord(ZeroWidthBase):
2673 _opcode = OP.END_OF_WORD
2674 _op_name = "END_OF_WORD"
2675
2676class Failure(ZeroWidthBase):
2677 _op_name = "FAILURE"
2678
2679 def _compile(self, reverse, fuzzy):
2680 return [(OP.FAILURE, )]
2681
2682class Fuzzy(RegexBase):
2683 def __init__(self, subpattern, constraints=None):
2684 RegexBase.__init__(self)
2685 if constraints is None:
2686 constraints = {}
2687 self.subpattern = subpattern
2688 self.constraints = constraints
2689
2690 # If an error type is mentioned in the cost equation, then its maximum
2691 # defaults to unlimited.
2692 if "cost" in constraints:
2693 for e in "dis":
2694 if e in constraints["cost"]:
2695 constraints.setdefault(e, (0, None))
2696
2697 # If any error type is mentioned, then all the error maxima default to
2698 # 0, otherwise they default to unlimited.
2699 if set(constraints) & set("dis"):
2700 for e in "dis":
2701 constraints.setdefault(e, (0, 0))
2702 else:
2703 for e in "dis":
2704 constraints.setdefault(e, (0, None))
2705
2706 # The maximum of the generic error type defaults to unlimited.
2707 constraints.setdefault("e", (0, None))
2708
2709 # The cost equation defaults to equal costs. Also, the cost of any
2710 # error type not mentioned in the cost equation defaults to 0.
2711 if "cost" in constraints:
2712 for e in "dis":
2713 constraints["cost"].setdefault(e, 0)
2714 else:
2715 constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
2716 constraints["e"][1]}
2717
2718 def fix_groups(self, pattern, reverse, fuzzy):
2719 self.subpattern.fix_groups(pattern, reverse, True)
2720
2721 def pack_characters(self, info):
2722 self.subpattern = self.subpattern.pack_characters(info)
2723 return self
2724
2725 def remove_captures(self):
2726 self.subpattern = self.subpattern.remove_captures()
2727 return self
2728
2729 def is_atomic(self):
2730 return self.subpattern.is_atomic()
2731
2732 def contains_group(self):
2733 return self.subpattern.contains_group()
2734
2735 def _compile(self, reverse, fuzzy):
2736 # The individual limits.
2737 arguments = []
2738 for e in "dise":
2739 v = self.constraints[e]
2740 arguments.append(v[0])
2741 arguments.append(UNLIMITED if v[1] is None else v[1])
2742
2743 # The coeffs of the cost equation.
2744 for e in "dis":
2745 arguments.append(self.constraints["cost"][e])
2746
2747 # The maximum of the cost equation.
2748 v = self.constraints["cost"]["max"]
2749 arguments.append(UNLIMITED if v is None else v)
2750
2751 flags = 0
2752 if reverse:
2753 flags |= REVERSE_OP
2754
2755 test = self.constraints.get("test")
2756
2757 if test:
2758 return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
2759 test.compile(reverse, True) + [(OP.NEXT,)] +
2760 self.subpattern.compile(reverse, True) + [(OP.END,)])
2761
2762 return ([(OP.FUZZY, flags) + tuple(arguments)] +
2763 self.subpattern.compile(reverse, True) + [(OP.END,)])
2764
2765 def dump(self, indent, reverse):
2766 constraints = self._constraints_to_string()
2767 if constraints:
2768 constraints = " " + constraints
2769 print("{}FUZZY{}".format(INDENT * indent, constraints))
2770 self.subpattern.dump(indent + 1, reverse)
2771
2772 def is_empty(self):
2773 return self.subpattern.is_empty()
2774
2775 def __eq__(self, other):
2776 return (type(self) is type(other) and self.subpattern ==
2777 other.subpattern and self.constraints == other.constraints)
2778
2779 def max_width(self):
2780 return UNLIMITED
2781
2782 def _constraints_to_string(self):
2783 constraints = []
2784
2785 for name in "ids":
2786 min, max = self.constraints[name]
2787 if max == 0:
2788 continue
2789
2790 con = ""
2791
2792 if min > 0:
2793 con = "{}<=".format(min)
2794
2795 con += name
2796
2797 if max is not None:
2798 con += "<={}".format(max)
2799
2800 constraints.append(con)
2801
2802 cost = []
2803 for name in "ids":
2804 coeff = self.constraints["cost"][name]
2805 if coeff > 0:
2806 cost.append("{}{}".format(coeff, name))
2807
2808 limit = self.constraints["cost"]["max"]
2809 if limit is not None and limit > 0:
2810 cost = "{}<={}".format("+".join(cost), limit)
2811 constraints.append(cost)
2812
2813 return ",".join(constraints)
2814
2815class Grapheme(RegexBase):
2816 def _compile(self, reverse, fuzzy):
2817 # Match at least 1 character until a grapheme boundary is reached. Note
2818 # that this is the same whether matching forwards or backwards.
2819 grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
2820 GraphemeBoundary()]))
2821
2822 return grapheme_matcher.compile(reverse, fuzzy)
2823
2824 def dump(self, indent, reverse):
2825 print("{}GRAPHEME".format(INDENT * indent))
2826
2827 def max_width(self):
2828 return UNLIMITED
2829
2830class GraphemeBoundary:
2831 def compile(self, reverse, fuzzy):
2832 return [(OP.GRAPHEME_BOUNDARY, 1)]
2833
2834class GreedyRepeat(RegexBase):
2835 _opcode = OP.GREEDY_REPEAT
2836 _op_name = "GREEDY_REPEAT"
2837
2838 def __init__(self, subpattern, min_count, max_count):
2839 RegexBase.__init__(self)
2840 self.subpattern = subpattern
2841 self.min_count = min_count
2842 self.max_count = max_count
2843
2844 def fix_groups(self, pattern, reverse, fuzzy):
2845 self.subpattern.fix_groups(pattern, reverse, fuzzy)
2846
2847 def optimise(self, info, reverse):
2848 subpattern = self.subpattern.optimise(info, reverse)
2849
2850 return type(self)(subpattern, self.min_count, self.max_count)
2851
2852 def pack_characters(self, info):
2853 self.subpattern = self.subpattern.pack_characters(info)
2854 return self
2855
2856 def remove_captures(self):
2857 self.subpattern = self.subpattern.remove_captures()
2858 return self
2859
2860 def is_atomic(self):
2861 return self.min_count == self.max_count and self.subpattern.is_atomic()
2862
2863 def can_be_affix(self):
2864 return False
2865
2866 def contains_group(self):
2867 return self.subpattern.contains_group()
2868
2869 def get_firstset(self, reverse):
2870 fs = self.subpattern.get_firstset(reverse)
2871 if self.min_count == 0:
2872 fs.add(None)
2873
2874 return fs
2875
2876 def _compile(self, reverse, fuzzy):
2877 repeat = [self._opcode, self.min_count]
2878 if self.max_count is None:
2879 repeat.append(UNLIMITED)
2880 else:
2881 repeat.append(self.max_count)
2882
2883 subpattern = self.subpattern.compile(reverse, fuzzy)
2884 if not subpattern:
2885 return []
2886
2887 return ([tuple(repeat)] + subpattern + [(OP.END, )])
2888
2889 def dump(self, indent, reverse):
2890 if self.max_count is None:
2891 limit = "INF"
2892 else:
2893 limit = self.max_count
2894 print("{}{} {} {}".format(INDENT * indent, self._op_name,
2895 self.min_count, limit))
2896
2897 self.subpattern.dump(indent + 1, reverse)
2898
2899 def is_empty(self):
2900 return self.subpattern.is_empty()
2901
2902 def __eq__(self, other):
2903 return type(self) is type(other) and (self.subpattern, self.min_count,
2904 self.max_count) == (other.subpattern, other.min_count,
2905 other.max_count)
2906
2907 def max_width(self):
2908 if self.max_count is None:
2909 return UNLIMITED
2910
2911 return self.subpattern.max_width() * self.max_count
2912
2913 def get_required_string(self, reverse):
2914 max_count = UNLIMITED if self.max_count is None else self.max_count
2915 if self.min_count == 0:
2916 w = self.subpattern.max_width() * max_count
2917 return min(w, UNLIMITED), None
2918
2919 ofs, req = self.subpattern.get_required_string(reverse)
2920 if req:
2921 return ofs, req
2922
2923 w = self.subpattern.max_width() * max_count
2924 return min(w, UNLIMITED), None
2925
2926class PossessiveRepeat(GreedyRepeat):
2927 def is_atomic(self):
2928 return True
2929
2930 def _compile(self, reverse, fuzzy):
2931 subpattern = self.subpattern.compile(reverse, fuzzy)
2932 if not subpattern:
2933 return []
2934
2935 repeat = [self._opcode, self.min_count]
2936 if self.max_count is None:
2937 repeat.append(UNLIMITED)
2938 else:
2939 repeat.append(self.max_count)
2940
2941 return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ),
2942 (OP.END, )])
2943
2944 def dump(self, indent, reverse):
2945 print("{}ATOMIC".format(INDENT * indent))
2946
2947 if self.max_count is None:
2948 limit = "INF"
2949 else:
2950 limit = self.max_count
2951 print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
2952 self.min_count, limit))
2953
2954 self.subpattern.dump(indent + 2, reverse)
2955
2956class Group(RegexBase):
2957 def __init__(self, info, group, subpattern):
2958 RegexBase.__init__(self)
2959 self.info = info
2960 self.group = group
2961 self.subpattern = subpattern
2962
2963 self.call_ref = None
2964
2965 def fix_groups(self, pattern, reverse, fuzzy):
2966 self.info.defined_groups[self.group] = (self, reverse, fuzzy)
2967 self.subpattern.fix_groups(pattern, reverse, fuzzy)
2968
2969 def optimise(self, info, reverse):
2970 subpattern = self.subpattern.optimise(info, reverse)
2971
2972 return Group(self.info, self.group, subpattern)
2973
2974 def pack_characters(self, info):
2975 self.subpattern = self.subpattern.pack_characters(info)
2976 return self
2977
2978 def remove_captures(self):
2979 return self.subpattern.remove_captures()
2980
2981 def is_atomic(self):
2982 return self.subpattern.is_atomic()
2983
2984 def can_be_affix(self):
2985 return False
2986
2987 def contains_group(self):
2988 return True
2989
2990 def get_firstset(self, reverse):
2991 return self.subpattern.get_firstset(reverse)
2992
2993 def has_simple_start(self):
2994 return self.subpattern.has_simple_start()
2995
2996 def _compile(self, reverse, fuzzy):
2997 code = []
2998
2999 public_group = private_group = self.group
3000 if private_group < 0:
3001 public_group = self.info.private_groups[private_group]
3002 private_group = self.info.group_count - private_group
3003
3004 key = self.group, reverse, fuzzy
3005 ref = self.info.call_refs.get(key)
3006 if ref is not None:
3007 code += [(OP.CALL_REF, ref)]
3008
3009 code += [(OP.GROUP, int(not reverse), private_group, public_group)]
3010 code += self.subpattern.compile(reverse, fuzzy)
3011 code += [(OP.END, )]
3012
3013 if ref is not None:
3014 code += [(OP.END, )]
3015
3016 return code
3017
3018 def dump(self, indent, reverse):
3019 group = self.group
3020 if group < 0:
3021 group = private_groups[group]
3022 print("{}GROUP {}".format(INDENT * indent, group))
3023 self.subpattern.dump(indent + 1, reverse)
3024
3025 def __eq__(self, other):
3026 return (type(self) is type(other) and (self.group, self.subpattern) ==
3027 (other.group, other.subpattern))
3028
3029 def max_width(self):
3030 return self.subpattern.max_width()
3031
3032 def get_required_string(self, reverse):
3033 return self.subpattern.get_required_string(reverse)
3034
3035 def __del__(self):
3036 self.info = None
3037
3038class Keep(ZeroWidthBase):
3039 _opcode = OP.KEEP
3040 _op_name = "KEEP"
3041
3042class LazyRepeat(GreedyRepeat):
3043 _opcode = OP.LAZY_REPEAT
3044 _op_name = "LAZY_REPEAT"
3045
3046class LookAround(RegexBase):
3047 _dir_text = {False: "AHEAD", True: "BEHIND"}
3048
3049 def __init__(self, behind, positive, subpattern):
3050 RegexBase.__init__(self)
3051 self.behind = bool(behind)
3052 self.positive = bool(positive)
3053 self.subpattern = subpattern
3054
3055 def fix_groups(self, pattern, reverse, fuzzy):
3056 self.subpattern.fix_groups(pattern, self.behind, fuzzy)
3057
3058 def optimise(self, info, reverse):
3059 subpattern = self.subpattern.optimise(info, self.behind)
3060 if self.positive and subpattern.is_empty():
3061 return subpattern
3062
3063 return LookAround(self.behind, self.positive, subpattern)
3064
3065 def pack_characters(self, info):
3066 self.subpattern = self.subpattern.pack_characters(info)
3067 return self
3068
3069 def remove_captures(self):
3070 return self.subpattern.remove_captures()
3071
3072 def is_atomic(self):
3073 return self.subpattern.is_atomic()
3074
3075 def can_be_affix(self):
3076 return self.subpattern.can_be_affix()
3077
3078 def contains_group(self):
3079 return self.subpattern.contains_group()
3080
3081 def get_firstset(self, reverse):
3082 if self.positive and self.behind == reverse:
3083 return self.subpattern.get_firstset(reverse)
3084
3085 return set([None])
3086
3087 def _compile(self, reverse, fuzzy):
3088 flags = 0
3089 if self.positive:
3090 flags |= POSITIVE_OP
3091 if fuzzy:
3092 flags |= FUZZY_OP
3093 if reverse:
3094 flags |= REVERSE_OP
3095
3096 return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
3097 self.subpattern.compile(self.behind) + [(OP.END, )])
3098
3099 def dump(self, indent, reverse):
3100 print("{}LOOK{} {}".format(INDENT * indent,
3101 self._dir_text[self.behind], POS_TEXT[self.positive]))
3102 self.subpattern.dump(indent + 1, self.behind)
3103
3104 def is_empty(self):
3105 return self.positive and self.subpattern.is_empty()
3106
3107 def __eq__(self, other):
3108 return type(self) is type(other) and (self.behind, self.positive,
3109 self.subpattern) == (other.behind, other.positive, other.subpattern)
3110
3111 def max_width(self):
3112 return 0
3113
3114class LookAroundConditional(RegexBase):
3115 _dir_text = {False: "AHEAD", True: "BEHIND"}
3116
3117 def __init__(self, behind, positive, subpattern, yes_item, no_item):
3118 RegexBase.__init__(self)
3119 self.behind = bool(behind)
3120 self.positive = bool(positive)
3121 self.subpattern = subpattern
3122 self.yes_item = yes_item
3123 self.no_item = no_item
3124
3125 def fix_groups(self, pattern, reverse, fuzzy):
3126 self.subpattern.fix_groups(pattern, reverse, fuzzy)
3127 self.yes_item.fix_groups(pattern, reverse, fuzzy)
3128 self.no_item.fix_groups(pattern, reverse, fuzzy)
3129
3130 def optimise(self, info, reverse):
3131 subpattern = self.subpattern.optimise(info, self.behind)
3132 yes_item = self.yes_item.optimise(info, self.behind)
3133 no_item = self.no_item.optimise(info, self.behind)
3134
3135 return LookAroundConditional(self.behind, self.positive, subpattern,
3136 yes_item, no_item)
3137
3138 def pack_characters(self, info):
3139 self.subpattern = self.subpattern.pack_characters(info)
3140 self.yes_item = self.yes_item.pack_characters(info)
3141 self.no_item = self.no_item.pack_characters(info)
3142 return self
3143
3144 def remove_captures(self):
3145 self.subpattern = self.subpattern.remove_captures()
3146 self.yes_item = self.yes_item.remove_captures()
3147 self.no_item = self.no_item.remove_captures()
3148
3149 def is_atomic(self):
3150 return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
3151 self.no_item.is_atomic())
3152
3153 def can_be_affix(self):
3154 return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
3155 and self.no_item.can_be_affix())
3156
3157 def contains_group(self):
3158 return (self.subpattern.contains_group() or
3159 self.yes_item.contains_group() or self.no_item.contains_group())
3160
3161 def _compile(self, reverse, fuzzy):
3162 code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
3163 code.extend(self.subpattern.compile(self.behind, fuzzy))
3164 code.append((OP.NEXT, ))
3165 code.extend(self.yes_item.compile(reverse, fuzzy))
3166 add_code = self.no_item.compile(reverse, fuzzy)
3167 if add_code:
3168 code.append((OP.NEXT, ))
3169 code.extend(add_code)
3170
3171 code.append((OP.END, ))
3172
3173 return code
3174
3175 def dump(self, indent, reverse):
3176 print("{}CONDITIONAL {} {}".format(INDENT * indent,
3177 self._dir_text[self.behind], POS_TEXT[self.positive]))
3178 self.subpattern.dump(indent + 1, self.behind)
3179 print("{}EITHER".format(INDENT * indent))
3180 self.yes_item.dump(indent + 1, reverse)
3181 if not self.no_item.is_empty():
3182 print("{}OR".format(INDENT * indent))
3183 self.no_item.dump(indent + 1, reverse)
3184
3185 def is_empty(self):
3186 return (self.subpattern.is_empty() and self.yes_item.is_empty() or
3187 self.no_item.is_empty())
3188
3189 def __eq__(self, other):
3190 return type(self) is type(other) and (self.subpattern, self.yes_item,
3191 self.no_item) == (other.subpattern, other.yes_item, other.no_item)
3192
3193 def max_width(self):
3194 return max(self.yes_item.max_width(), self.no_item.max_width())
3195
3196 def get_required_string(self, reverse):
3197 return self.max_width(), None
3198
3199class PrecompiledCode(RegexBase):
3200 def __init__(self, code):
3201 self.code = code
3202
3203 def _compile(self, reverse, fuzzy):
3204 return [tuple(self.code)]
3205
3206class Property(RegexBase):
3207 _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
3208 OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
3209 OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
3210 OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
3211 True): OP.PROPERTY_IGN_REV}
3212
3213 def __init__(self, value, positive=True, case_flags=NOCASE,
3214 zerowidth=False):
3215 RegexBase.__init__(self)
3216 self.value = value
3217 self.positive = bool(positive)
3218 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3219 self.zerowidth = bool(zerowidth)
3220
3221 self._key = (self.__class__, self.value, self.positive,
3222 self.case_flags, self.zerowidth)
3223
3224 def rebuild(self, positive, case_flags, zerowidth):
3225 return Property(self.value, positive, case_flags, zerowidth)
3226
3227 def optimise(self, info, reverse, in_set=False):
3228 return self
3229
3230 def get_firstset(self, reverse):
3231 return set([self])
3232
3233 def has_simple_start(self):
3234 return True
3235
3236 def _compile(self, reverse, fuzzy):
3237 flags = 0
3238 if self.positive:
3239 flags |= POSITIVE_OP
3240 if self.zerowidth:
3241 flags |= ZEROWIDTH_OP
3242 if fuzzy:
3243 flags |= FUZZY_OP
3244 return [(self._opcode[self.case_flags, reverse], flags, self.value)]
3245
3246 def dump(self, indent, reverse):
3247 prop = PROPERTY_NAMES[self.value >> 16]
3248 name, value = prop[0], prop[1][self.value & 0xFFFF]
3249 print("{}PROPERTY {} {}:{}{}".format(INDENT * indent,
3250 POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags]))
3251
3252 def matches(self, ch):
3253 return _regex.has_property_value(self.value, ch) == self.positive
3254
3255 def max_width(self):
3256 return 1
3257
3258class Prune(ZeroWidthBase):
3259 _op_name = "PRUNE"
3260
3261 def _compile(self, reverse, fuzzy):
3262 return [(OP.PRUNE, )]
3263
3264class Range(RegexBase):
3265 _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
3266 (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
3267 (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
3268 (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
3269 _op_name = "RANGE"
3270
3271 def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
3272 zerowidth=False):
3273 RegexBase.__init__(self)
3274 self.lower = lower
3275 self.upper = upper
3276 self.positive = bool(positive)
3277 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3278 self.zerowidth = bool(zerowidth)
3279
3280 self._key = (self.__class__, self.lower, self.upper, self.positive,
3281 self.case_flags, self.zerowidth)
3282
3283 def rebuild(self, positive, case_flags, zerowidth):
3284 return Range(self.lower, self.upper, positive, case_flags, zerowidth)
3285
3286 def optimise(self, info, reverse, in_set=False):
3287 # Is the range case-sensitive?
3288 if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
3289 return self
3290
3291 # Is full case-folding possible?
3292 if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
3293 FULLIGNORECASE):
3294 return self
3295
3296 # Get the characters which expand to multiple codepoints on folding.
3297 expanding_chars = _regex.get_expand_on_folding()
3298
3299 # Get the folded characters in the range.
3300 items = []
3301 for ch in expanding_chars:
3302 if self.lower <= ord(ch) <= self.upper:
3303 folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
3304 items.append(String([ord(c) for c in folded],
3305 case_flags=self.case_flags))
3306
3307 if not items:
3308 # We can fall back to simple case-folding.
3309 return self
3310
3311 if len(items) < self.upper - self.lower + 1:
3312 # Not all the characters are covered by the full case-folding.
3313 items.insert(0, self)
3314
3315 return Branch(items)
3316
3317 def _compile(self, reverse, fuzzy):
3318 flags = 0
3319 if self.positive:
3320 flags |= POSITIVE_OP
3321 if self.zerowidth:
3322 flags |= ZEROWIDTH_OP
3323 if fuzzy:
3324 flags |= FUZZY_OP
3325 return [(self._opcode[self.case_flags, reverse], flags, self.lower,
3326 self.upper)]
3327
3328 def dump(self, indent, reverse):
3329 display_lower = ascii(chr(self.lower)).lstrip("bu")
3330 display_upper = ascii(chr(self.upper)).lstrip("bu")
3331 print("{}RANGE {} {} {}{}".format(INDENT * indent,
3332 POS_TEXT[self.positive], display_lower, display_upper,
3333 CASE_TEXT[self.case_flags]))
3334
3335 def matches(self, ch):
3336 return (self.lower <= ch <= self.upper) == self.positive
3337
3338 def max_width(self):
3339 return 1
3340
3341class RefGroup(RegexBase):
3342 _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
3343 OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
3344 False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
3345 True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
3346 (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}
3347
3348 def __init__(self, info, group, position, case_flags=NOCASE):
3349 RegexBase.__init__(self)
3350 self.info = info
3351 self.group = group
3352 self.position = position
3353 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3354
3355 self._key = self.__class__, self.group, self.case_flags
3356
3357 def fix_groups(self, pattern, reverse, fuzzy):
3358 try:
3359 self.group = int(self.group)
3360 except ValueError:
3361 try:
3362 self.group = self.info.group_index[self.group]
3363 except KeyError:
3364 raise error("unknown group", pattern, self.position)
3365
3366 if not 1 <= self.group <= self.info.group_count:
3367 raise error("invalid group reference", pattern, self.position)
3368
3369 self._key = self.__class__, self.group, self.case_flags
3370
3371 def remove_captures(self):
3372 raise error("group reference not allowed", pattern, self.position)
3373
3374 def _compile(self, reverse, fuzzy):
3375 flags = 0
3376 if fuzzy:
3377 flags |= FUZZY_OP
3378 return [(self._opcode[self.case_flags, reverse], flags, self.group)]
3379
3380 def dump(self, indent, reverse):
3381 print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
3382 CASE_TEXT[self.case_flags]))
3383
3384 def max_width(self):
3385 return UNLIMITED
3386
3387 def __del__(self):
3388 self.info = None
3389
3390class SearchAnchor(ZeroWidthBase):
3391 _opcode = OP.SEARCH_ANCHOR
3392 _op_name = "SEARCH_ANCHOR"
3393
3394class Sequence(RegexBase):
3395 def __init__(self, items=None):
3396 RegexBase.__init__(self)
3397 if items is None:
3398 items = []
3399
3400 self.items = items
3401
3402 def fix_groups(self, pattern, reverse, fuzzy):
3403 for s in self.items:
3404 s.fix_groups(pattern, reverse, fuzzy)
3405
3406 def optimise(self, info, reverse):
3407 # Flatten the sequences.
3408 items = []
3409 for s in self.items:
3410 s = s.optimise(info, reverse)
3411 if isinstance(s, Sequence):
3412 items.extend(s.items)
3413 else:
3414 items.append(s)
3415
3416 return make_sequence(items)
3417
3418 def pack_characters(self, info):
3419 "Packs sequences of characters into strings."
3420 items = []
3421 characters = []
3422 case_flags = NOCASE
3423 for s in self.items:
3424 if type(s) is Character and s.positive and not s.zerowidth:
3425 if s.case_flags != case_flags:
3426 # Different case sensitivity, so flush, unless neither the
3427 # previous nor the new character are cased.
3428 if s.case_flags or is_cased_i(info, s.value):
3429 Sequence._flush_characters(info, characters,
3430 case_flags, items)
3431
3432 case_flags = s.case_flags
3433
3434 characters.append(s.value)
3435 elif type(s) is String or type(s) is Literal:
3436 if s.case_flags != case_flags:
3437 # Different case sensitivity, so flush, unless the neither
3438 # the previous nor the new string are cased.
3439 if s.case_flags or any(is_cased_i(info, c) for c in
3440 characters):
3441 Sequence._flush_characters(info, characters,
3442 case_flags, items)
3443
3444 case_flags = s.case_flags
3445
3446 characters.extend(s.characters)
3447 else:
3448 Sequence._flush_characters(info, characters, case_flags, items)
3449
3450 items.append(s.pack_characters(info))
3451
3452 Sequence._flush_characters(info, characters, case_flags, items)
3453
3454 return make_sequence(items)
3455
3456 def remove_captures(self):
3457 self.items = [s.remove_captures() for s in self.items]
3458 return self
3459
3460 def is_atomic(self):
3461 return all(s.is_atomic() for s in self.items)
3462
3463 def can_be_affix(self):
3464 return False
3465
3466 def contains_group(self):
3467 return any(s.contains_group() for s in self.items)
3468
3469 def get_firstset(self, reverse):
3470 fs = set()
3471 items = self.items
3472 if reverse:
3473 items.reverse()
3474 for s in items:
3475 fs |= s.get_firstset(reverse)
3476 if None not in fs:
3477 return fs
3478 fs.discard(None)
3479
3480 return fs | set([None])
3481
3482 def has_simple_start(self):
3483 return bool(self.items) and self.items[0].has_simple_start()
3484
3485 def _compile(self, reverse, fuzzy):
3486 seq = self.items
3487 if reverse:
3488 seq = seq[::-1]
3489
3490 code = []
3491 for s in seq:
3492 code.extend(s.compile(reverse, fuzzy))
3493
3494 return code
3495
3496 def dump(self, indent, reverse):
3497 for s in self.items:
3498 s.dump(indent, reverse)
3499
3500 @staticmethod
3501 def _flush_characters(info, characters, case_flags, items):
3502 if not characters:
3503 return
3504
3505 # Disregard case_flags if all of the characters are case-less.
3506 if case_flags & IGNORECASE:
3507 if not any(is_cased_i(info, c) for c in characters):
3508 case_flags = NOCASE
3509
3510 if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
3511 literals = Sequence._fix_full_casefold(characters)
3512
3513 for item in literals:
3514 chars = item.characters
3515
3516 if len(chars) == 1:
3517 items.append(Character(chars[0], case_flags=item.case_flags))
3518 else:
3519 items.append(String(chars, case_flags=item.case_flags))
3520 else:
3521 if len(characters) == 1:
3522 items.append(Character(characters[0], case_flags=case_flags))
3523 else:
3524 items.append(String(characters, case_flags=case_flags))
3525
3526 characters[:] = []
3527
3528 @staticmethod
3529 def _fix_full_casefold(characters):
3530 # Split a literal needing full case-folding into chunks that need it
3531 # and chunks that can use simple case-folding, which is faster.
3532 expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
3533 _regex.get_expand_on_folding()]
3534 string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
3535 for c in characters)).lower()
3536 chunks = []
3537
3538 for e in expanded:
3539 found = string.find(e)
3540
3541 while found >= 0:
3542 chunks.append((found, found + len(e)))
3543 found = string.find(e, found + 1)
3544
3545 pos = 0
3546 literals = []
3547
3548 for start, end in Sequence._merge_chunks(chunks):
3549 if pos < start:
3550 literals.append(Literal(characters[pos : start],
3551 case_flags=IGNORECASE))
3552
3553 literals.append(Literal(characters[start : end],
3554 case_flags=FULLIGNORECASE))
3555 pos = end
3556
3557 if pos < len(characters):
3558 literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))
3559
3560 return literals
3561
3562 @staticmethod
3563 def _merge_chunks(chunks):
3564 if len(chunks) < 2:
3565 return chunks
3566
3567 chunks.sort()
3568
3569 start, end = chunks[0]
3570 new_chunks = []
3571
3572 for s, e in chunks[1 : ]:
3573 if s <= end:
3574 end = max(end, e)
3575 else:
3576 new_chunks.append((start, end))
3577 start, end = s, e
3578
3579 new_chunks.append((start, end))
3580
3581 return new_chunks
3582
3583 def is_empty(self):
3584 return all(i.is_empty() for i in self.items)
3585
3586 def __eq__(self, other):
3587 return type(self) is type(other) and self.items == other.items
3588
3589 def max_width(self):
3590 return sum(s.max_width() for s in self.items)
3591
3592 def get_required_string(self, reverse):
3593 seq = self.items
3594 if reverse:
3595 seq = seq[::-1]
3596
3597 offset = 0
3598
3599 for s in seq:
3600 ofs, req = s.get_required_string(reverse)
3601 offset += ofs
3602 if req:
3603 return offset, req
3604
3605 return offset, None
3606
3607class SetBase(RegexBase):
3608 def __init__(self, info, items, positive=True, case_flags=NOCASE,
3609 zerowidth=False):
3610 RegexBase.__init__(self)
3611 self.info = info
3612 self.items = tuple(items)
3613 self.positive = bool(positive)
3614 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3615 self.zerowidth = bool(zerowidth)
3616
3617 self.char_width = 1
3618
3619 self._key = (self.__class__, self.items, self.positive,
3620 self.case_flags, self.zerowidth)
3621
3622 def rebuild(self, positive, case_flags, zerowidth):
3623 return type(self)(self.info, self.items, positive, case_flags,
3624 zerowidth).optimise(self.info, False)
3625
3626 def get_firstset(self, reverse):
3627 return set([self])
3628
3629 def has_simple_start(self):
3630 return True
3631
3632 def _compile(self, reverse, fuzzy):
3633 flags = 0
3634 if self.positive:
3635 flags |= POSITIVE_OP
3636 if self.zerowidth:
3637 flags |= ZEROWIDTH_OP
3638 if fuzzy:
3639 flags |= FUZZY_OP
3640 code = [(self._opcode[self.case_flags, reverse], flags)]
3641 for m in self.items:
3642 code.extend(m.compile())
3643
3644 code.append((OP.END, ))
3645
3646 return code
3647
3648 def dump(self, indent, reverse):
3649 print("{}{} {}{}".format(INDENT * indent, self._op_name,
3650 POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
3651 for i in self.items:
3652 i.dump(indent + 1, reverse)
3653
3654 def _handle_case_folding(self, info, in_set):
3655 # Is the set case-sensitive?
3656 if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
3657 return self
3658
3659 # Is full case-folding possible?
3660 if (not (self.info.flags & UNICODE) or (self.case_flags &
3661 FULLIGNORECASE) != FULLIGNORECASE):
3662 return self
3663
3664 # Get the characters which expand to multiple codepoints on folding.
3665 expanding_chars = _regex.get_expand_on_folding()
3666
3667 # Get the folded characters in the set.
3668 items = []
3669 seen = set()
3670 for ch in expanding_chars:
3671 if self.matches(ord(ch)):
3672 folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
3673 if folded not in seen:
3674 items.append(String([ord(c) for c in folded],
3675 case_flags=self.case_flags))
3676 seen.add(folded)
3677
3678 if not items:
3679 # We can fall back to simple case-folding.
3680 return self
3681
3682 return Branch([self] + items)
3683
3684 def max_width(self):
3685 # Is the set case-sensitive?
3686 if not self.positive or not (self.case_flags & IGNORECASE):
3687 return 1
3688
3689 # Is full case-folding possible?
3690 if (not (self.info.flags & UNICODE) or (self.case_flags &
3691 FULLIGNORECASE) != FULLIGNORECASE):
3692 return 1
3693
3694 # Get the characters which expand to multiple codepoints on folding.
3695 expanding_chars = _regex.get_expand_on_folding()
3696
3697 # Get the folded characters in the set.
3698 seen = set()
3699 for ch in expanding_chars:
3700 if self.matches(ord(ch)):
3701 folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
3702 seen.add(folded)
3703
3704 if not seen:
3705 return 1
3706
3707 return max(len(folded) for folded in seen)
3708
3709 def __del__(self):
3710 self.info = None
3711
3712class SetDiff(SetBase):
3713 _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
3714 OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
3715 OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
3716 OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
3717 True): OP.SET_DIFF_IGN_REV}
3718 _op_name = "SET_DIFF"
3719
3720 def optimise(self, info, reverse, in_set=False):
3721 items = self.items
3722 if len(items) > 2:
3723 items = [items[0], SetUnion(info, items[1 : ])]
3724
3725 if len(items) == 1:
3726 return items[0].with_flags(case_flags=self.case_flags,
3727 zerowidth=self.zerowidth).optimise(info, reverse, in_set)
3728
3729 self.items = tuple(m.optimise(info, reverse, in_set=True) for m in
3730 items)
3731
3732 return self._handle_case_folding(info, in_set)
3733
3734 def matches(self, ch):
3735 m = self.items[0].matches(ch) and not self.items[1].matches(ch)
3736 return m == self.positive
3737
3738class SetInter(SetBase):
3739 _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
3740 OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
3741 False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
3742 True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
3743 (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
3744 _op_name = "SET_INTER"
3745
3746 def optimise(self, info, reverse, in_set=False):
3747 items = []
3748 for m in self.items:
3749 m = m.optimise(info, reverse, in_set=True)
3750 if isinstance(m, SetInter) and m.positive:
3751 # Intersection in intersection.
3752 items.extend(m.items)
3753 else:
3754 items.append(m)
3755
3756 if len(items) == 1:
3757 return items[0].with_flags(case_flags=self.case_flags,
3758 zerowidth=self.zerowidth).optimise(info, reverse, in_set)
3759
3760 self.items = tuple(items)
3761
3762 return self._handle_case_folding(info, in_set)
3763
3764 def matches(self, ch):
3765 m = all(i.matches(ch) for i in self.items)
3766 return m == self.positive
3767
3768class SetSymDiff(SetBase):
3769 _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
3770 OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
3771 False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
3772 (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
3773 OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
3774 _op_name = "SET_SYM_DIFF"
3775
3776 def optimise(self, info, reverse, in_set=False):
3777 items = []
3778 for m in self.items:
3779 m = m.optimise(info, reverse, in_set=True)
3780 if isinstance(m, SetSymDiff) and m.positive:
3781 # Symmetric difference in symmetric difference.
3782 items.extend(m.items)
3783 else:
3784 items.append(m)
3785
3786 if len(items) == 1:
3787 return items[0].with_flags(case_flags=self.case_flags,
3788 zerowidth=self.zerowidth).optimise(info, reverse, in_set)
3789
3790 self.items = tuple(items)
3791
3792 return self._handle_case_folding(info, in_set)
3793
3794 def matches(self, ch):
3795 m = False
3796 for i in self.items:
3797 m = m != i.matches(ch)
3798
3799 return m == self.positive
3800
3801class SetUnion(SetBase):
3802 _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
3803 OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
3804 False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
3805 True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
3806 (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
3807 _op_name = "SET_UNION"
3808
3809 def optimise(self, info, reverse, in_set=False):
3810 items = []
3811 for m in self.items:
3812 m = m.optimise(info, reverse, in_set=True)
3813 if isinstance(m, SetUnion) and m.positive:
3814 # Union in union.
3815 items.extend(m.items)
3816 else:
3817 items.append(m)
3818
3819 if len(items) == 1:
3820 i = items[0]
3821 return i.with_flags(positive=i.positive == self.positive,
3822 case_flags=self.case_flags,
3823 zerowidth=self.zerowidth).optimise(info, reverse, in_set)
3824
3825 self.items = tuple(items)
3826
3827 return self._handle_case_folding(info, in_set)
3828
3829 def _compile(self, reverse, fuzzy):
3830 flags = 0
3831 if self.positive:
3832 flags |= POSITIVE_OP
3833 if self.zerowidth:
3834 flags |= ZEROWIDTH_OP
3835 if fuzzy:
3836 flags |= FUZZY_OP
3837
3838 characters, others = defaultdict(list), []
3839 for m in self.items:
3840 if isinstance(m, Character):
3841 characters[m.positive].append(m.value)
3842 else:
3843 others.append(m)
3844
3845 code = [(self._opcode[self.case_flags, reverse], flags)]
3846
3847 for positive, values in characters.items():
3848 flags = 0
3849 if positive:
3850 flags |= POSITIVE_OP
3851 if len(values) == 1:
3852 code.append((OP.CHARACTER, flags, values[0]))
3853 else:
3854 code.append((OP.STRING, flags, len(values)) + tuple(values))
3855
3856 for m in others:
3857 code.extend(m.compile())
3858
3859 code.append((OP.END, ))
3860
3861 return code
3862
3863 def matches(self, ch):
3864 m = any(i.matches(ch) for i in self.items)
3865 return m == self.positive
3866
3867class Skip(ZeroWidthBase):
3868 _op_name = "SKIP"
3869 _opcode = OP.SKIP
3870
3871class StartOfLine(ZeroWidthBase):
3872 _opcode = OP.START_OF_LINE
3873 _op_name = "START_OF_LINE"
3874
3875class StartOfLineU(StartOfLine):
3876 _opcode = OP.START_OF_LINE_U
3877 _op_name = "START_OF_LINE_U"
3878
3879class StartOfString(ZeroWidthBase):
3880 _opcode = OP.START_OF_STRING
3881 _op_name = "START_OF_STRING"
3882
3883class StartOfWord(ZeroWidthBase):
3884 _opcode = OP.START_OF_WORD
3885 _op_name = "START_OF_WORD"
3886
3887class String(RegexBase):
3888 _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
3889 (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
3890 (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
3891 (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
3892 OP.STRING_FLD_REV}
3893
3894 def __init__(self, characters, case_flags=NOCASE):
3895 self.characters = tuple(characters)
3896 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3897
3898 if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
3899 folded_characters = []
3900 for char in self.characters:
3901 folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
3902 folded_characters.extend(ord(c) for c in folded)
3903 else:
3904 folded_characters = self.characters
3905
3906 self.folded_characters = tuple(folded_characters)
3907 self.required = False
3908
3909 self._key = self.__class__, self.characters, self.case_flags
3910
3911 def get_firstset(self, reverse):
3912 if reverse:
3913 pos = -1
3914 else:
3915 pos = 0
3916 return set([Character(self.characters[pos],
3917 case_flags=self.case_flags)])
3918
3919 def has_simple_start(self):
3920 return True
3921
3922 def _compile(self, reverse, fuzzy):
3923 flags = 0
3924 if fuzzy:
3925 flags |= FUZZY_OP
3926 if self.required:
3927 flags |= REQUIRED_OP
3928 return [(self._opcode[self.case_flags, reverse], flags,
3929 len(self.folded_characters)) + self.folded_characters]
3930
3931 def dump(self, indent, reverse):
3932 display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
3933 print("{}STRING {}{}".format(INDENT * indent, display,
3934 CASE_TEXT[self.case_flags]))
3935
3936 def max_width(self):
3937 return len(self.folded_characters)
3938
3939 def get_required_string(self, reverse):
3940 return 0, self
3941
3942class Literal(String):
3943 def dump(self, indent, reverse):
3944 literal = ''.join(chr(c) for c in self.characters)
3945 display = ascii(literal).lstrip("bu")
3946 print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,
3947 CASE_TEXT[self.case_flags]))
3948
3949class StringSet(Branch):
3950 def __init__(self, info, name, case_flags=NOCASE):
3951 self.info = info
3952 self.name = name
3953 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3954
3955 self._key = self.__class__, self.name, self.case_flags
3956
3957 self.set_key = (name, self.case_flags)
3958 if self.set_key not in info.named_lists_used:
3959 info.named_lists_used[self.set_key] = len(info.named_lists_used)
3960
3961 index = self.info.named_lists_used[self.set_key]
3962 items = self.info.kwargs[self.name]
3963
3964 case_flags = self.case_flags
3965
3966 encoding = self.info.flags & _ALL_ENCODINGS
3967 fold_flags = encoding | case_flags
3968
3969 choices = []
3970
3971 for string in items:
3972 if isinstance(string, str):
3973 string = [ord(c) for c in string]
3974
3975 choices.append([Character(c, case_flags=case_flags) for c in
3976 string])
3977
3978 # Sort from longest to shortest.
3979 choices.sort(key=len, reverse=True)
3980
3981 self.branches = [Sequence(choice) for choice in choices]
3982
3983 def dump(self, indent, reverse):
3984 print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
3985 CASE_TEXT[self.case_flags]))
3986
3987 def __del__(self):
3988 self.info = None
3989
3990class Source:
3991 "Scanner for the regular expression source string."
3992 def __init__(self, string):
3993 if isinstance(string, str):
3994 self.string = string
3995 self.char_type = chr
3996 else:
3997 self.string = string.decode("latin-1")
3998 self.char_type = lambda c: bytes([c])
3999
4000 self.pos = 0
4001 self.ignore_space = False
4002 self.sep = string[ : 0]
4003
4004 def get(self, override_ignore=False):
4005 string = self.string
4006 pos = self.pos
4007
4008 try:
4009 if self.ignore_space and not override_ignore:
4010 while True:
4011 if string[pos].isspace():
4012 # Skip over the whitespace.
4013 pos += 1
4014 elif string[pos] == "#":
4015 # Skip over the comment to the end of the line.
4016 pos = string.index("\n", pos)
4017 else:
4018 break
4019
4020 ch = string[pos]
4021 self.pos = pos + 1
4022 return ch
4023 except IndexError:
4024 # We've reached the end of the string.
4025 self.pos = pos
4026 return string[ : 0]
4027 except ValueError:
4028 # The comment extended to the end of the string.
4029 self.pos = len(string)
4030 return string[ : 0]
4031
4032 def get_many(self, count=1):
4033 string = self.string
4034 pos = self.pos
4035
4036 try:
4037 if self.ignore_space:
4038 substring = []
4039
4040 while len(substring) < count:
4041 while True:
4042 if string[pos].isspace():
4043 # Skip over the whitespace.
4044 pos += 1
4045 elif string[pos] == "#":
4046 # Skip over the comment to the end of the line.
4047 pos = string.index("\n", pos)
4048 else:
4049 break
4050
4051 substring.append(string[pos])
4052 pos += 1
4053
4054 substring = "".join(substring)
4055 else:
4056 substring = string[pos : pos + count]
4057 pos += len(substring)
4058
4059 self.pos = pos
4060 return substring
4061 except IndexError:
4062 # We've reached the end of the string.
4063 self.pos = len(string)
4064 return "".join(substring)
4065 except ValueError:
4066 # The comment extended to the end of the string.
4067 self.pos = len(string)
4068 return "".join(substring)
4069
4070 def get_while(self, test_set, include=True, keep_spaces=False):
4071 string = self.string
4072 pos = self.pos
4073
4074 if self.ignore_space and not keep_spaces:
4075 try:
4076 substring = []
4077
4078 while True:
4079 if string[pos].isspace():
4080 # Skip over the whitespace.
4081 pos += 1
4082 elif string[pos] == "#":
4083 # Skip over the comment to the end of the line.
4084 pos = string.index("\n", pos)
4085 elif (string[pos] in test_set) == include:
4086 substring.append(string[pos])
4087 pos += 1
4088 else:
4089 break
4090
4091 self.pos = pos
4092 except IndexError:
4093 # We've reached the end of the string.
4094 self.pos = len(string)
4095 except ValueError:
4096 # The comment extended to the end of the string.
4097 self.pos = len(string)
4098
4099 return "".join(substring)
4100 else:
4101 try:
4102 while (string[pos] in test_set) == include:
4103 pos += 1
4104
4105 substring = string[self.pos : pos]
4106
4107 self.pos = pos
4108
4109 return substring
4110 except IndexError:
4111 # We've reached the end of the string.
4112 substring = string[self.pos : pos]
4113
4114 self.pos = pos
4115
4116 return substring
4117
4118 def skip_while(self, test_set, include=True):
4119 string = self.string
4120 pos = self.pos
4121
4122 try:
4123 if self.ignore_space:
4124 while True:
4125 if string[pos].isspace():
4126 # Skip over the whitespace.
4127 pos += 1
4128 elif string[pos] == "#":
4129 # Skip over the comment to the end of the line.
4130 pos = string.index("\n", pos)
4131 elif (string[pos] in test_set) == include:
4132 pos += 1
4133 else:
4134 break
4135 else:
4136 while (string[pos] in test_set) == include:
4137 pos += 1
4138
4139 self.pos = pos
4140 except IndexError:
4141 # We've reached the end of the string.
4142 self.pos = len(string)
4143 except ValueError:
4144 # The comment extended to the end of the string.
4145 self.pos = len(string)
4146
4147 def match(self, substring):
4148 string = self.string
4149 pos = self.pos
4150
4151 if self.ignore_space:
4152 try:
4153 for c in substring:
4154 while True:
4155 if string[pos].isspace():
4156 # Skip over the whitespace.
4157 pos += 1
4158 elif string[pos] == "#":
4159 # Skip over the comment to the end of the line.
4160 pos = string.index("\n", pos)
4161 else:
4162 break
4163
4164 if string[pos] != c:
4165 return False
4166
4167 pos += 1
4168
4169 self.pos = pos
4170
4171 return True
4172 except IndexError:
4173 # We've reached the end of the string.
4174 return False
4175 except ValueError:
4176 # The comment extended to the end of the string.
4177 return False
4178 else:
4179 if not string.startswith(substring, pos):
4180 return False
4181
4182 self.pos = pos + len(substring)
4183
4184 return True
4185
4186 def expect(self, substring):
4187 if not self.match(substring):
4188 raise error("missing {}".format(substring), self.string, self.pos)
4189
4190 def at_end(self):
4191 string = self.string
4192 pos = self.pos
4193
4194 try:
4195 if self.ignore_space:
4196 while True:
4197 if string[pos].isspace():
4198 pos += 1
4199 elif string[pos] == "#":
4200 pos = string.index("\n", pos)
4201 else:
4202 break
4203
4204 return pos >= len(string)
4205 except IndexError:
4206 # We've reached the end of the string.
4207 return True
4208 except ValueError:
4209 # The comment extended to the end of the string.
4210 return True
4211
4212class Info:
4213 "Info about the regular expression."
4214
4215 def __init__(self, flags=0, char_type=None, kwargs={}):
4216 flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
4217 self.flags = flags
4218 self.global_flags = flags
4219 self.inline_locale = False
4220
4221 self.kwargs = kwargs
4222
4223 self.group_count = 0
4224 self.group_index = {}
4225 self.group_name = {}
4226 self.char_type = char_type
4227 self.named_lists_used = {}
4228 self.open_groups = []
4229 self.open_group_count = {}
4230 self.defined_groups = {}
4231 self.group_calls = []
4232 self.private_groups = {}
4233
4234 def open_group(self, name=None):
4235 group = self.group_index.get(name)
4236 if group is None:
4237 while True:
4238 self.group_count += 1
4239 if name is None or self.group_count not in self.group_name:
4240 break
4241
4242 group = self.group_count
4243 if name:
4244 self.group_index[name] = group
4245 self.group_name[group] = name
4246
4247 if group in self.open_groups:
4248 # We have a nested named group. We'll assign it a private group
4249 # number, initially negative until we can assign a proper
4250 # (positive) number.
4251 group_alias = -(len(self.private_groups) + 1)
4252 self.private_groups[group_alias] = group
4253 group = group_alias
4254
4255 self.open_groups.append(group)
4256 self.open_group_count[group] = self.open_group_count.get(group, 0) + 1
4257
4258 return group
4259
4260 def close_group(self):
4261 self.open_groups.pop()
4262
4263 def is_open_group(self, name):
4264 # In version 1, a group reference can refer to an open group. We'll
4265 # just pretend the group isn't open.
4266 version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
4267 if version == VERSION1:
4268 return False
4269
4270 if name.isdigit():
4271 group = int(name)
4272 else:
4273 group = self.group_index.get(name)
4274
4275 return group in self.open_groups
4276
4277def _check_group_features(info, parsed):
4278 """Checks whether the reverse and fuzzy features of the group calls match
4279 the groups which they call.
4280 """
4281 call_refs = {}
4282 additional_groups = []
4283 for call, reverse, fuzzy in info.group_calls:
4284 # Look up the reference of this group call.
4285 key = (call.group, reverse, fuzzy)
4286 ref = call_refs.get(key)
4287 if ref is None:
4288 # This group doesn't have a reference yet, so look up its features.
4289 if call.group == 0:
4290 # Calling the pattern as a whole.
4291 rev = bool(info.flags & REVERSE)
4292 fuz = isinstance(parsed, Fuzzy)
4293 if (rev, fuz) != (reverse, fuzzy):
4294 # The pattern as a whole doesn't have the features we want,
4295 # so we'll need to make a copy of it with the desired
4296 # features.
4297 additional_groups.append((CallRef(len(call_refs), parsed),
4298 reverse, fuzzy))
4299 else:
4300 # Calling a capture group.
4301 def_info = info.defined_groups[call.group]
4302 group = def_info[0]
4303 if def_info[1 : ] != (reverse, fuzzy):
4304 # The group doesn't have the features we want, so we'll
4305 # need to make a copy of it with the desired features.
4306 additional_groups.append((group, reverse, fuzzy))
4307
4308 ref = len(call_refs)
4309 call_refs[key] = ref
4310
4311 call.call_ref = ref
4312
4313 info.call_refs = call_refs
4314 info.additional_groups = additional_groups
4315
4316def _get_required_string(parsed, flags):
4317 "Gets the required string and related info of a parsed pattern."
4318
4319 req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
4320 if required:
4321 required.required = True
4322 if req_offset >= UNLIMITED:
4323 req_offset = -1
4324
4325 req_flags = required.case_flags
4326 if not (flags & UNICODE):
4327 req_flags &= ~UNICODE
4328
4329 req_chars = required.folded_characters
4330 else:
4331 req_offset = 0
4332 req_chars = ()
4333 req_flags = 0
4334
4335 return req_offset, req_chars, req_flags
4336
4337class Scanner:
4338 def __init__(self, lexicon, flags=0):
4339 self.lexicon = lexicon
4340
4341 # Combine phrases into a compound pattern.
4342 patterns = []
4343 for phrase, action in lexicon:
4344 # Parse the regular expression.
4345 source = Source(phrase)
4346 info = Info(flags, source.char_type)
4347 source.ignore_space = bool(info.flags & VERBOSE)
4348 parsed = _parse_pattern(source, info)
4349 if not source.at_end():
4350 raise error("unbalanced parenthesis", source.string,
4351 source.pos)
4352
4353 # We want to forbid capture groups within each phrase.
4354 patterns.append(parsed.remove_captures())
4355
4356 # Combine all the subpatterns into one pattern.
4357 info = Info(flags)
4358 patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
4359 parsed = Branch(patterns)
4360
4361 # Optimise the compound pattern.
4362 reverse = bool(info.flags & REVERSE)
4363 parsed = parsed.optimise(info, reverse)
4364 parsed = parsed.pack_characters(info)
4365
4366 # Get the required string.
4367 req_offset, req_chars, req_flags = _get_required_string(parsed,
4368 info.flags)
4369
4370 # Check the features of the groups.
4371 _check_group_features(info, parsed)
4372
4373 # Complain if there are any group calls. They are not supported by the
4374 # Scanner class.
4375 if info.call_refs:
4376 raise error("recursive regex not supported by Scanner",
4377 source.string, source.pos)
4378
4379 reverse = bool(info.flags & REVERSE)
4380
4381 # Compile the compound pattern. The result is a list of tuples.
4382 code = parsed.compile(reverse) + [(OP.SUCCESS, )]
4383
4384 # Flatten the code into a list of ints.
4385 code = _flatten_code(code)
4386
4387 if not parsed.has_simple_start():
4388 # Get the first set, if possible.
4389 try:
4390 fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
4391 fs_code = _flatten_code(fs_code)
4392 code = fs_code + code
4393 except _FirstSetError:
4394 pass
4395
4396 # Check the global flags for conflicts.
4397 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
4398 if version not in (0, VERSION0, VERSION1):
4399 raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")
4400
4401 # Create the PatternObject.
4402 #
4403 # Local flags like IGNORECASE affect the code generation, but aren't
4404 # needed by the PatternObject itself. Conversely, global flags like
4405 # LOCALE _don't_ affect the code generation but _are_ needed by the
4406 # PatternObject.
4407 self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
4408 code, {}, {}, {}, [], req_offset, req_chars, req_flags,
4409 len(patterns))
4410
4411 def scan(self, string):
4412 result = []
4413 append = result.append
4414 match = self.scanner.scanner(string).match
4415 i = 0
4416 while True:
4417 m = match()
4418 if not m:
4419 break
4420 j = m.end()
4421 if i == j:
4422 break
4423 action = self.lexicon[m.lastindex - 1][1]
4424 if hasattr(action, '__call__'):
4425 self.match = m
4426 action = action(self, m.group())
4427 if action is not None:
4428 append(action)
4429 i = j
4430
4431 return result, string[i : ]
4432
4433# Get the known properties dict.
4434PROPERTIES = _regex.get_properties()
4435
4436# Build the inverse of the properties dict.
4437PROPERTY_NAMES = {}
4438for prop_name, (prop_id, values) in PROPERTIES.items():
4439 name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {}))
4440 name = max(name, prop_name, key=len)
4441 PROPERTY_NAMES[prop_id] = name, prop_values
4442
4443 for val_name, val_id in values.items():
4444 prop_values[val_id] = max(prop_values.get(val_id, ""), val_name,
4445 key=len)
4446
4447# Character escape sequences.
4448CHARACTER_ESCAPES = {
4449 "a": "\a",
4450 "b": "\b",
4451 "f": "\f",
4452 "n": "\n",
4453 "r": "\r",
4454 "t": "\t",
4455 "v": "\v",
4456}
4457
4458# Predefined character set escape sequences.
4459CHARSET_ESCAPES = {
4460 "d": lookup_property(None, "Digit", True),
4461 "D": lookup_property(None, "Digit", False),
4462 "h": lookup_property(None, "Blank", True),
4463 "s": lookup_property(None, "Space", True),
4464 "S": lookup_property(None, "Space", False),
4465 "w": lookup_property(None, "Word", True),
4466 "W": lookup_property(None, "Word", False),
4467}
4468
4469# Positional escape sequences.
4470POSITION_ESCAPES = {
4471 "A": StartOfString(),
4472 "b": Boundary(),
4473 "B": Boundary(False),
4474 "K": Keep(),
4475 "m": StartOfWord(),
4476 "M": EndOfWord(),
4477 "Z": EndOfString(),
4478}
4479
4480# Positional escape sequences when WORD flag set.
4481WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES)
4482WORD_POSITION_ESCAPES.update({
4483 "b": DefaultBoundary(),
4484 "B": DefaultBoundary(False),
4485 "m": DefaultStartOfWord(),
4486 "M": DefaultEndOfWord(),
4487})
4488
4489# Regex control verbs.
4490VERBS = {
4491 "FAIL": Failure(),
4492 "F": Failure(),
4493 "PRUNE": Prune(),
4494 "SKIP": Skip(),
4495}