1#
2# Secret Labs' Regular Expression Engine core module
3#
4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
5#
6# This version of the SRE library can be redistributed under CNRI's
7# Python 1.6 license. For any other use, please contact Secret Labs
8# AB (info@pythonware.com).
9#
10# Portions of this engine have been developed in cooperation with
11# CNRI. Hewlett-Packard provided funding for 1.6 integration and
12# other compatibility work.
13#
14# 2010-01-16 mrab Python front-end re-written and extended
15
16import enum
17import string
18import unicodedata
19from collections import defaultdict
20
21import regex._regex as _regex
22
# Public names re-exported by the top-level `regex` module.
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
  "Scanner", "RegexFlag"]
28
# The regex exception.
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        # The pattern may be str or bytes; choose a matching newline so the
        # count/rfind calls below compare like with like.
        newline = '\n' if isinstance(pattern, str) else b'\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        # NOTE: lineno/colno exist on the instance only when both pattern
        # and pos were supplied.
        if pattern is not None and pos is not None:
            self.lineno = pattern.count(newline, 0, pos) + 1
            # rfind returns -1 when pos is on the first line, which makes
            # colno 1-based there as well.
            self.colno = pos - pattern.rfind(newline, 0, pos)

            message = "{} at position {}".format(message, pos)

            # Mention line/column only for multi-line patterns.
            if newline in pattern:
                message += " (line {}, column {})".format(self.lineno,
                  self.colno)

        Exception.__init__(self, message)
58
# The exception for when a positional flag has been turned on in the old
# behaviour.
class _UnscopedFlagSet(Exception):
    # Raised during parsing to force a reparse with the global flag applied.
    pass
63
# The exception for when parsing fails and we want to try something else.
class ParseError(Exception):
    # Internal backtracking signal; never propagated to callers.
    pass
67
# The exception for when there isn't a valid first set.
class _FirstSetError(Exception):
    # Raised when a usable firstset cannot be built for the pattern.
    pass
71
# Flags.
class RegexFlag(enum.IntFlag):
    A = ASCII = 0x80 # Assume ASCII locale.
    B = BESTMATCH = 0x1000 # Best fuzzy match.
    D = DEBUG = 0x200 # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first
                              # fuzzy match.
    F = FULLCASE = 0x4000 # Unicode full case-folding.
    I = IGNORECASE = 0x2 # Ignore case.
    L = LOCALE = 0x4 # Assume current 8-bit locale.
    M = MULTILINE = 0x8 # Make anchors look for newline.
    P = POSIX = 0x10000 # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400 # Search backwards.
    S = DOTALL = 0x10 # Make dot match newline.
    U = UNICODE = 0x20 # Assume Unicode locale.
    V0 = VERSION0 = 0x2000 # Old legacy behaviour.
    V1 = VERSION1 = 0x100 # New enhanced behaviour.
    W = WORD = 0x800 # Default Unicode word breaks.
    X = VERBOSE = 0x40 # Ignore whitespace and comments.
    T = TEMPLATE = 0x1 # Template (present because re module has it).

    def __repr__(self):
        # A single named member reprs as its canonical name, eg.
        # 'regex.IGNORECASE'.
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        value = self._value_
        members = []
        negative = value < 0

        # For a negative (inverted) value, decompose its complement and
        # wrap the result in '~' at the end.
        if negative:
            value = ~value

        # Pull out each defined flag that is present in the value.
        for m in self.__class__:
            if value & m._value_:
                value &= ~m._value_
                members.append('regex.%s' % m._name_)

        # Any remaining bits have no named flag; render them as hex.
        if value:
            members.append(hex(value))

        res = '|'.join(members)

        if negative:
            if len(members) > 1:
                res = '~(%s)' % res
            else:
                res = '~%s' % res

        return res

    # IntFlag.__str__ would render differently; use the plain object
    # behaviour (which falls back to __repr__).
    __str__ = object.__str__
123
# Make the flag members (ASCII, IGNORECASE, ...) available at module level.
globals().update(RegexFlag.__members__)

DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

# Character classes used by the parser.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# Characters with special meaning outside a set; "" marks end-of-pattern.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
# The binary set operators available inside character sets.
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags, keyed by their inline-flag letter.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# FULLCASE on its own has no effect; it only modifies IGNORECASE.
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
175
# The names of the opcodes.
# Each name is assigned its index in this list (see the enumerate loop
# below); presumably the ordering must match the opcode numbering in the
# _regex C engine -- confirm against the engine before reordering.
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
FUZZY_EXT
"""
260
# Define the opcodes in a namespace.
class Namespace:
    # An empty attribute holder; the opcode names are set on OP below.
    pass

OP = Namespace()
# Assign each opcode name its index, eg. OP.FAILURE == 0, OP.SUCCESS == 1.
for i, op in enumerate(OPCODES.split()):
    setattr(OP, op, i)
268
269def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
270 """Make room in the given cache.
271
272 Args:
273 cache_dict: The cache dictionary to modify.
274 args_dict: The dictionary of named list args used by patterns.
275 max_length: Maximum # of entries in cache_dict before it is shrunk.
276 divisor: Cache will shrink to max_length - 1/divisor*max_length items.
277 """
278 # Toss out a fraction of the entries at random to make room for new ones.
279 # A random algorithm was chosen as opposed to simply cache_dict.popitem()
280 # as popitem could penalize the same regular expression repeatedly based
281 # on its internal hash value. Being random should spread the cache miss
282 # love around.
283 cache_keys = tuple(cache_dict.keys())
284 overage = len(cache_keys) - max_length
285 if overage < 0:
286 # Cache is already within limits. Normally this should not happen
287 # but it could due to multithreading.
288 return
289
290 number_to_toss = max_length // divisor + overage
291
292 # The import is done here to avoid a circular dependency.
293 import random
294 if not hasattr(random, 'sample'):
295 # Do nothing while resolving the circular dependency:
296 # re->random->warnings->tokenize->string->re
297 return
298
299 for doomed_key in random.sample(cache_keys, number_to_toss):
300 try:
301 del cache_dict[doomed_key]
302 except KeyError:
303 # Ignore problems if the cache changed from another thread.
304 pass
305
306 # Rebuild the arguments and locale-sensitivity dictionaries.
307 args_dict.clear()
308 sensitivity_dict = {}
309 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
310 args_dict[pattern, pattern_type, flags, default_version, locale] = args
311 try:
312 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
313 except KeyError:
314 pass
315
316 locale_sensitive.clear()
317 locale_sensitive.update(sensitivity_dict)
318
def _fold_case(info, string):
    """Return *string* case-folded according to the pattern's flags."""
    flags = info.flags
    # With no explicit encoding flag, fall back to the guessed encoding.
    if not flags & _ALL_ENCODINGS:
        flags |= info.guess_encoding

    return _regex.fold_case(flags, string)
326
def is_cased_i(info, char):
    """Report whether *char* has more than one case variant."""
    variants = _regex.get_all_cases(info.flags, char)
    return len(variants) > 1
330
def is_cased_f(flags, char):
    """Report whether *char* has more than one case variant under *flags*."""
    variants = _regex.get_all_cases(flags, char)
    return len(variants) > 1
334
def _compile_firstset(info, fs):
    """Compile the firstset for the pattern, or return [] if unusable."""
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)
    # An absent or match-anything firstset offers no optimisation.
    if not checked or isinstance(checked, AnyAll):
        return []

    return checked.compile(reverse)
344
def _check_firstset(info, reverse, fs):
    "Checks the firstset for the pattern."
    # A None member means some branch had no usable first item.
    if not fs or None in fs:
        return None

    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for i in fs:
        # Negated characters can't contribute to a simple firstset.
        if isinstance(i, Character) and not i.positive:
            return None

#        if i.case_flags:
#            if isinstance(i, Character):
#                if is_cased_i(info, i.value):
#                    return []
#            elif isinstance(i, SetBase):
#                return []
        case_flags |= i.case_flags
        members.add(i.with_flags(case_flags=NOCASE))

    # Full case-folding can't be represented in a simple firstset.
    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    fs = fs.optimise(info, reverse, in_set=True)

    return fs
375
376def _flatten_code(code):
377 "Flattens the code from a list of tuples."
378 flat_code = []
379 for c in code:
380 flat_code.extend(c)
381
382 return flat_code
383
def make_case_flags(info):
    """Return the pattern's case flags for the current parse state."""
    # Full case-folding is a Unicode notion, so it's dropped in ASCII mode.
    if info.flags & ASCII:
        return info.flags & CASE_FLAGS & ~FULLCASE
    return info.flags & CASE_FLAGS
393
def make_character(info, value, in_set=False):
    """Make a character literal node for codepoint *value*."""
    if in_set:
        # Inside a set the character is built case-sensitively; the set
        # itself carries the case flags.
        return Character(value)

    return Character(value, case_flags=make_case_flags(info))
401
def make_ref_group(info, name, position):
    "Makes a group reference."
    # position is the offset in the pattern, kept for error reporting.
    return RefGroup(info, name, position, case_flags=make_case_flags(info))
405
def make_string_set(info, name):
    "Makes a string set."
    # name refers to a named list supplied to the compile call.
    return StringSet(info, name, case_flags=make_case_flags(info))
409
def make_property(info, prop, in_set):
    """Make a property node, applying case flags when outside a set."""
    # Inside a set the property is used as-is; the set handles case.
    if in_set:
        return prop
    return prop.with_flags(case_flags=make_case_flags(info))
416
def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    # Collect the alternatives separated by '|'.
    branches = [parse_sequence(source, info)]
    while source.match("|"):
        branches.append(parse_sequence(source, info))

    # A single alternative needs no Branch wrapper.
    return branches[0] if len(branches) == 1 else Branch(branches)
426
def parse_sequence(source, info):
    "Parses a sequence, eg. 'abc'."
    # The sequence is seeded with a None marker so that a leading
    # quantifier is detected as having nothing to repeat.
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # It was an inline flag; the case flags may have changed.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    # The marker stops a following quantifier from
                    # re-quantifying this element.
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)
                    if constraints:
                        # It _is_ a fuzzy constraint.
                        apply_constraint(source, info, constraints, case_flags,
                          saved_pos, sequence)
                        sequence.append(None)
                    else:
                        # The element was just a literal.
                        sequence.append(Character(ord(ch),
                          case_flags=case_flags))
            else:
                # A literal.
                sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    # Drop the None markers before building the Sequence.
    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)
510
def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    """Applies a quantifier to the last element of the sequence.

    counts is the (min, max) pair already parsed; saved_pos is the
    position of the quantifier character, used for error reporting.
    """
    element = sequence.pop()
    if element is None:
        # The previous entry was a marker, so the quantifier has no target.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeated = LazyRepeat
    elif ch == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeated = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat.
        source.pos = saved_pos
        repeated = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number of
    # repeats is fixed at 1.
    if not element.is_empty() and (min_count != 1 or max_count != 1):
        element = repeated(element, min_count, max_count)

    sequence.append(element)
542
def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    """Apply a fuzzy constraint to the last element of the sequence."""
    target = sequence.pop()
    if target is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    # When a group is marked as fuzzy, the fuzziness is kept inside the
    # group itself.
    if isinstance(target, Group):
        target.subpattern = Fuzzy(target.subpattern, constraints)
        sequence.append(target)
    else:
        sequence.append(Fuzzy(target, constraints))
556
# Quantifier shorthands mapped to their (min, max) counts; None = unbounded.
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
558
def parse_quantifier(source, info, ch):
    """Parse a quantifier, returning its (min, max) counts or None."""
    # The single-character shorthands: '?', '*', '+'.
    counts = _QUANTIFIERS.get(ch)
    if counts:
        return counts

    if ch != "{":
        return None

    # Looks like a bounded repeat, eg. 'a{2,3}'; returns None if it isn't.
    return parse_limited_quantifier(source)
573
def is_above_limit(count):
    """Check whether a repeat count exceeds the engine's maximum."""
    if count is None:
        # None means unbounded, which the engine represents specially.
        return False
    return count >= UNLIMITED
577
def parse_limited_quantifier(source):
    """Parse a bounded quantifier such as '{2,3}'; None if it isn't one."""
    start = source.pos
    lo_digits = parse_count(source)

    if source.match(","):
        hi_digits = parse_count(source)

        # An absent minimum means 0; an absent maximum means unlimited.
        lo = int(lo_digits or 0)
        hi = int(hi_digits) if hi_digits else None
    elif lo_digits:
        # A single count, eg. '{3}'.
        lo = hi = int(lo_digits)
    else:
        # Not a quantifier after all.
        source.pos = start
        return None

    if not source.match("}"):
        # No closing brace; treat it as a literal.
        source.pos = start
        return None

    if is_above_limit(lo) or is_above_limit(hi):
        raise error("repeat count too big", source.string, start)

    if hi is not None and lo > hi:
        raise error("min repeat greater than max repeat", source.string,
          start)

    return lo, hi
607
def parse_fuzzy(source, info, ch, case_flags):
    "Parses a fuzzy setting, if present."
    saved_pos = source.pos

    # Fuzzy settings use the brace syntax, eg. '{e<=2}'.
    if ch != "{":
        return None

    constraints = {}
    try:
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all; backtrack.
        source.pos = saved_pos
        return None

    # An optional test pattern may follow after ':'.
    if source.match(":"):
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints
631
def parse_fuzzy_item(source, constraints):
    "Parses a fuzzy setting item."
    saved_pos = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a cost constraint; backtrack and try a cost equation instead.
        source.pos = saved_pos

        parse_cost_equation(source, constraints)
641
def parse_cost_constraint(source, constraints):
    "Parses a cost constraint, eg. 'e<=2' or '1<=e<=2'."
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits?
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        # The range must still be sane after adjusting strict comparisons.
        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()
702
def parse_cost_limit(source):
    """Parse a numeric fuzzy cost limit."""
    cost_pos = source.pos
    digits = parse_count(source)

    # parse_count returns only decimal digits, so non-emptiness is enough
    # for int() to succeed.
    if digits:
        return int(digits)

    raise error("bad fuzzy cost limit", source.string, cost_pos)
714
def parse_constraint(source, constraints, ch):
    """Validate a constraint letter, rejecting unknown or duplicate ones."""
    # Valid letters are d, e, i, s; each may appear at most once.
    if ch not in "deis" or ch in constraints:
        raise ParseError()

    return ch
724
def parse_fuzzy_compare(source):
    """Parse '<=' or '<': True if inclusive, False if strict, else None."""
    # Try the longer operator first so '<=' isn't read as '<'.
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None
733
def parse_cost_equation(source, constraints):
    "Parses a cost equation, eg. '2i+2d+1s<=4'."
    # Only one cost equation is allowed per fuzzy setting.
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}

    parse_cost_term(source, cost)
    while source.match("+"):
        parse_cost_term(source, cost)

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    max_cost = int(parse_count(source))

    # A strict '<' limit is converted to an inclusive one.
    if not max_inc:
        max_cost -= 1

    if max_cost < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = max_cost

    constraints["cost"] = cost
760
def parse_cost_term(source, cost):
    """Parse one term of a cost equation into the *cost* dict."""
    coefficient = parse_count(source)
    term = source.get()
    if term not in "dis":
        raise ParseError()

    if term in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient defaults to 1.
    cost[term] = int(coefficient) if coefficient else 1
772
def parse_fuzzy_test(source, info, case_flags):
    "Parses the test pattern of a fuzzy setting."
    saved_pos = source.pos
    ch = source.get()
    if ch in SPECIAL_CHARS:
        if ch == "\\":
            # An escape sequence outside a set.
            return parse_escape(source, info, False)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                return AnyAll()
            elif info.flags & WORD:
                return AnyU()
            else:
                return Any()
        elif ch == "[":
            # A character set.
            return parse_set(source, info)
        else:
            raise error("expected character set", source.string, saved_pos)
    elif ch:
        # A literal.
        return Character(ord(ch), case_flags=case_flags)
    else:
        # The end of the pattern was reached.
        raise error("expected character set", source.string, saved_pos)
798
def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the (possibly empty) run of decimal digits as a string.
    return source.get_while(DIGITS)
802
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.
    """
    saved_pos = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Flags changed inside the group are scoped to it.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            # A verb, eg. (*PRUNE).
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Flags changed inside the group are scoped to it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)
892
def parse_extension(source, info):
    "Parses a Python extension."
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Flags changed inside the group are scoped to it.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        # A group can't be referenced while it's still being parsed.
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)
928
def parse_comment(source):
    "Parses a comment."
    while True:
        saved_pos = source.pos
        c = source.get(True)

        if not c or c == ")":
            # End of the comment (or of the pattern).
            break

        if c == "\\":
            # A backslash escapes the next character, even ')'.
            c = source.get(True)

    # Back up so that expect() can verify (and consume) the ')'.
    source.pos = saved_pos
    source.expect(")")

    return None
945
def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround."
    # Flags changed inside the lookaround are scoped to it.
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, subpattern)
957
def parse_conditional(source, info):
    "Parses a conditional subpattern."
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

            source.pos = saved_pos
            raise error("expected lookaround conditional", source.string,
              source.pos)

    source.pos = saved_pos
    try:
        # The condition is a group name or number.
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            # No alternative branch; default to an empty sequence.
            no_branch = Sequence()

        source.expect(")")
    finally:
        # Restore the flags scoped to the conditional.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    # Both branches empty makes the conditional a no-op.
    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)
1000
def parse_lookaround_conditional(source, info, behind, positive):
    "Parses a lookaround conditional, eg. '(?(?=...)...|...)'."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Flags changed inside the lookaround are scoped to it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    if source.match("|"):
        no_branch = parse_sequence(source, info)
    else:
        # No alternative branch; default to an empty sequence.
        no_branch = Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, subpattern, yes_branch,
      no_branch)
1020
def parse_atomic(source, info):
    "Parses an atomic subpattern."
    # Flags changed inside the atomic group are scoped to it.
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(subpattern)
1032
def parse_common(source, info):
    "Parses a common groups branch, eg. '(?|...|...)'."
    # Capture group numbers in different branches can reuse the group numbers.
    initial_group_count = info.group_count
    branches = [parse_sequence(source, info)]
    final_group_count = info.group_count
    while source.match("|"):
        # Each branch restarts numbering from the initial count.
        info.group_count = initial_group_count
        branches.append(parse_sequence(source, info))
        final_group_count = max(final_group_count, info.group_count)

    # The construct as a whole uses the highest count of any branch.
    info.group_count = final_group_count
    source.expect(")")

    if len(branches) == 1:
        return branches[0]
    return Branch(branches)
1050
def parse_call_group(source, info, ch, pos):
    """Parse a call to a numbered group, eg. '(?1)'; '(?R)' calls group 0."""
    # "R" means recursion into the whole pattern, ie. group 0.
    group = "0" if ch == "R" else ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)
1061
def parse_call_named_group(source, info, pos):
    "Parses a call to a named group."
    # pos is the position of the call, kept for error reporting.
    group = parse_name(source)
    source.expect(")")

    return CallGroup(info, group, pos)
1068
def parse_flag_set(source):
    "Parses a set of inline flags."
    flags = 0

    # Accumulate flag letters until one isn't a valid flag; the KeyError
    # from the lookup terminates the loop.
    try:
        while True:
            saved_pos = source.pos
            ch = source.get()
            # "V" must be followed by a version digit ("V0" or "V1").
            if ch == "V":
                ch += source.get()
            flags |= REGEX_FLAGS[ch]
    except KeyError:
        # Not a flag character; put it back and stop.
        source.pos = saved_pos

    return flags
1084
def parse_flags(source, info):
    "Parses flags being turned on/off."
    flags_on = parse_flag_set(source)
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)
    else:
        flags_off = 0

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off
1101
def parse_subpattern(source, info, flags_on, flags_off):
    """Parses a subpattern with scoped flags, eg. '(?i:...)'.

    The flags in flags_on/flags_off apply only inside the subpattern; the
    enclosing flags are restored afterwards.
    """
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set: a newly
    # supplied encoding flag replaces any inherited one.  (This must test
    # flags_on, not info.flags, otherwise an inherited encoding flag would
    # be wiped even when the subpattern sets no encoding of its own.)
    if flags_on & (ASCII | LOCALE | UNICODE):
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the enclosing scope's flags.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern
1120
def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's a inline flags, then None is returned.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # A subpattern with scoped flags, eg. '(?i:...)'.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # Inline positional flags, eg. '(?i)'.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)
1155
def parse_positional_flags(source, info, flags_on, flags_off):
    """Parses positional flags, applying them from this point onwards."""
    new_flags = (info.flags | flags_on) & ~flags_off
    info.flags = new_flags
    source.ignore_space = bool(new_flags & VERBOSE)
1160
def parse_name(source, allow_numeric=False, allow_group_0=False):
    """Parses a group name, which may be numeric if that's permitted."""
    name = source.get_while(set(")>"), include=False)
    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        # A numeric name must be allowed and not below the minimum group.
        lowest = 0 if allow_group_0 else 1
        valid = allow_numeric and int(name) >= lowest
    else:
        # A symbolic name must be a valid identifier.
        valid = name.isidentifier()

    if not valid:
        raise error("bad character in group name", source.string, source.pos)

    return name
1179
def is_octal(string):
    """Checks whether every character of a string is an octal digit."""
    return frozenset(string).issubset(OCT_DIGITS)
1183
def is_decimal(string):
    """Checks whether every character of a string is a decimal digit."""
    return frozenset(string).issubset(DIGITS)
1187
def is_hexadecimal(string):
    """Checks whether every character of a string is a hex digit."""
    return frozenset(string).issubset(HEX_DIGITS)
1191
def parse_escape(source, info, in_set):
    """Parses an escape sequence (the backslash has been consumed).

    in_set: whether the escape appears inside a character set; several
    escapes (\\g, \\G, \\L, \\R, \\X and positional escapes) aren't allowed
    there.
    """
    # The character after the backslash is always significant, even in
    # verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            charset.extend([0x85, 0x2028, 0x2029])

        # CR LF is matched as a unit; the match is atomic so it can't be
        # split by backtracking.
        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # Then try the charset escapes (e.g. \d, \w), per encoding.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # Finally, the single-character escapes (e.g. \n, \t).
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
1274
def parse_numeric_escape(source, info, ch, in_set):
    """Parses a numeric escape sequence: either an octal escape or a group
    reference (group references aren't possible inside a set)."""
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            # The value is masked to a byte except in Unicode mode.
            if encoding == ASCII or encoding == LOCALE:
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)
1307
def parse_octal_escape(source, info, digits, in_set):
    """Parses an octal escape sequence of at most 3 digits.

    digits: a list already holding the first character (which may turn out
    not to be an octal digit).
    """
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Push back the first character that wasn't consumed.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # The digits weren't all octal.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)
1328
def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    """Parses a hex escape sequence with a fixed number of digits.

    esc and type are used only for the error messages. (NOTE(review): the
    parameter `type` shadows the builtin; renaming it would change the
    function's interface, so it's left as is.)
    """
    saved_pos = source.pos
    digits = []
    for i in range(expected_len):
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, saved_pos)
        digits.append(ch)

    try:
        value = int("".join(digits), 16)
    except ValueError:
        pass
    else:
        # Only codepoints below 0x110000 are valid.
        if value < 0x110000:
            return make_character(info, value, in_set)

    # Bad hex escape.
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, saved_pos)
1351
def parse_group_ref(source, info):
    """Parses a group reference of the form "<name>"."""
    source.expect("<")
    name_pos = source.pos
    group_name = parse_name(source, True)
    source.expect(">")

    # A group can't be referenced while it's still being parsed.
    if info.is_open_group(group_name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, group_name, name_pos)
1362
def parse_string_set(source, info):
    """Parses a string set reference of the form "<name>"."""
    source.expect("<")
    list_name = parse_name(source, True)
    source.expect(">")

    # The name must be one of the named lists passed in as keyword args.
    if list_name is None or list_name not in info.kwargs:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, list_name)
1372
def parse_named_char(source, info, in_set):
    """Parses a named character, e.g. \\N{LATIN SMALL LETTER A}.

    Falls back to a literal "N" if the name isn't correctly delimited.
    """
    start_pos = source.pos
    if source.match("{"):
        char_name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                codepoint = ord(unicodedata.lookup(char_name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

            return make_character(info, codepoint, in_set)

    # Not a valid named character, so treat the "N" as a literal.
    source.pos = start_pos
    return make_character(info, ord("N"), in_set)
1388
def parse_property(source, info, positive, in_set):
    """Parses a Unicode property, either delimited (\\p{name}) or
    abbreviated (e.g. \\pL).

    positive: whether it was introduced by \\p (True) or \\P (False).
    in_set: whether the property appears inside a character set.

    Falls back to a literal "p"/"P" if no valid property follows.
    """
    saved_pos = source.pos

    # The encoding in effect comes from the current flags; it applies to
    # both the delimited and the abbreviated forms, so compute it once.
    if info.flags & ASCII:
        encoding = ASCII_ENCODING
    elif info.flags & UNICODE:
        encoding = UNICODE_ENCODING
    else:
        encoding = 0

    ch = source.get()
    if ch == "{":
        # "^" inside the braces negates the property.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            prop = lookup_property(prop_name, name, positive != negate, source,
              encoding=encoding)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        prop = lookup_property(None, ch, positive, source, encoding=encoding)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)
1424
def parse_property_name(source):
    """Parses a property name, which may be qualified ("name=value" or
    "name:value").

    Returns a pair (prop_name, name); prop_name is None for an unqualified
    name. The source position is left just after the last part consumed.
    """
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Back up to just after the part that was actually accepted.
    source.pos = saved_pos
    return prop_name, name
1446
def parse_set(source, info):
    """Parses a character set (the opening "[" has been consumed)."""
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # Whitespace is significant within a set, even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    # Negative set?
    negate = source.match("^")
    try:
        # VERSION0 has no set operators; VERSION1 supports ||, ~~, && and --.
        if version == VERSION0:
            item = parse_set_imp_union(source, info)
        else:
            item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    item = item.with_flags(case_flags=make_case_flags(info))

    return item
1472
def parse_set_union(source, info):
    """Parses a set union ([x||y])."""
    operands = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        operands.append(parse_set_symm_diff(source, info))

    # A union of one operand is just that operand.
    return operands[0] if len(operands) == 1 else SetUnion(info, operands)
1482
def parse_set_symm_diff(source, info):
    """Parses a set symmetric difference ([x~~y])."""
    operands = [parse_set_inter(source, info)]
    while source.match("~~"):
        operands.append(parse_set_inter(source, info))

    # A symmetric difference of one operand is just that operand.
    return operands[0] if len(operands) == 1 else SetSymDiff(info, operands)
1492
def parse_set_inter(source, info):
    """Parses a set intersection ([x&&y])."""
    operands = [parse_set_diff(source, info)]
    while source.match("&&"):
        operands.append(parse_set_diff(source, info))

    # An intersection of one operand is just that operand.
    return operands[0] if len(operands) == 1 else SetInter(info, operands)
1502
def parse_set_diff(source, info):
    """Parses a set difference ([x--y])."""
    operands = [parse_set_imp_union(source, info)]
    while source.match("--"):
        operands.append(parse_set_imp_union(source, info))

    # A difference of one operand is just that operand.
    return operands[0] if len(operands) == 1 else SetDiff(info, operands)
1512
def parse_set_imp_union(source, info):
    """Parses a set implicit union ([xy]).

    Stops (without consuming) at the closing "]" or, in VERSION1, at any set
    operator.
    """
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set.
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)
1535
def parse_set_member(source, info):
    """Parses a member in a character set: a single item or a range a-b."""
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    # Only a positive single character can start a range.
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A range of one character is just that character.
        return start

    return Range(start.value, end.value)
1576
def parse_set_item(source, info):
    """Parses an item in a character set: an escape, a POSIX class, a nested
    set (VERSION1 only), or a literal character."""
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    # A literal character.
    return Character(ord(ch))
1614
def parse_posix_class(source, info):
    """Parses a POSIX character class such as [:alpha:]."""
    negate = source.match("^")
    prop_name, name = parse_property_name(source)

    if source.match(":]"):
        return lookup_property(prop_name, name, not negate, source, posix=True)

    # Not terminated correctly, so it isn't a POSIX class after all.
    raise ParseError()
1623
def float_to_rational(flt):
    """Converts a float to a rational pair (numerator, denominator).

    Uses a continued-fraction expansion, stopping once the remaining
    fractional part is negligible (< 0.0001).
    """
    int_part = int(flt)
    # Named `frac_part` so it doesn't shadow the module-level `error`
    # exception class (the original local was called `error`).
    frac_part = flt - int_part
    if abs(frac_part) < 0.0001:
        return int_part, 1

    den, num = float_to_rational(1.0 / frac_part)

    return int_part * den + num, den
1634
def numeric_to_rational(numeric):
    """Converts a numeric string to a rational string, if possible.

    Raises ValueError if the string contains more than one "/".
    """
    sign = ""
    if numeric[ : 1] == "-":
        sign = "-"
        numeric = numeric[1 : ]

    parts = numeric.split("/")
    if len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    elif len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    else:
        raise ValueError()

    result = "{}{}/{}".format(sign, num, den)

    # A denominator of 1 is left implicit.
    return result[ : -2] if result.endswith("/1") else result
1655
def standardise_name(name):
    """Standardises a property or value name for lookup."""
    text = "".join(name)
    try:
        # A numeric name is normalised to a rational string.
        return numeric_to_rational(text)
    except (ValueError, ZeroDivisionError):
        # Otherwise drop the separators and uppercase it.
        return "".join(ch for ch in text if ch not in "_- ").upper()
1662
# Class names that are given a "POSIX" prefix when looked up as a POSIX
# class (see lookup_property).
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The complete set of value names that identifies a binary (yes/no) property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
1666
def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    """Looks up a property, returning a Property node.

    Args:
        property: the property name, or None if only a value was given.
        value: the property value (or the unqualified name).
        positive: whether the property is positive or negated.
        source: the pattern source, used only for error positions.
        posix: whether the name appeared in a POSIX class ([:name:]).
        encoding: the encoding value to attach to the Property node.

    Raises:
        error: if the property or its value is unknown.
    """
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    # "Assigned" is the negation of "Unassigned".
    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and value.upper() in _POSIX_CLASSES:
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix.
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            # Strip the prefix by its length, not a hard-coded 2.
            val_id = value_dict.get(value[len(prefix) : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)
1736
def _compile_replacement(source, pattern, is_unicode):
    """Compiles one escape sequence in a replacement template.

    Returns a pair (is_group, codes): if is_group is True, codes holds a
    single group index; otherwise it holds literal codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group reference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    # Octal values are masked to a byte for a bytes pattern.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
1808
def parse_repl_hex_escape(source, expected_len, type):
    """Parses a hex escape sequence in a replacement string, returning the
    codepoint."""
    digits = ""
    for _ in range(expected_len):
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, digits),
              source.string, source.pos)
        digits += ch

    return int(digits, 16)
1820
def parse_repl_named_char(source):
    """Parses a named character in a replacement string.

    Returns the codepoint, or None if it isn't a delimited named character.
    """
    start_pos = source.pos
    if source.match("{"):
        char_name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                return ord(unicodedata.lookup(char_name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    source.pos = start_pos
    return None
1837
def compile_repl_group(source, pattern):
    """Compiles a replacement template group reference, returning the group
    index."""
    source.expect("<")
    name = parse_name(source, True, True)
    source.expect(">")

    if not name.isdigit():
        # A named group.
        try:
            return pattern.groupindex[name]
        except KeyError:
            raise IndexError("unknown group")

    # A numeric group.
    index = int(name)
    if 0 <= index <= pattern.groups:
        return index

    raise error("invalid group reference", source.string, source.pos)
1855
1856# The regular expression is parsed into a syntax tree. The different types of
1857# node are defined below.
1858
# Indentation unit used by the nodes' dump() methods.
INDENT = "  "
# Flag bits attached to opcodes in the compiled pattern.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# The encoding value occupies the bits above the flag bits.
ENCODING_OP_SHIFT = 5

# Strings used by the dump() methods.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
1870
def make_sequence(items):
    """Makes a sequence node, collapsing a single item to the item itself."""
    return items[0] if len(items) == 1 else Sequence(items)
1875
1876# Common base class for all nodes.
class RegexBase:
    """Common base class for all parse-tree nodes.

    Equality and hashing are based on `_key`, which subclasses extend with
    their distinguishing attributes.
    """

    def __init__(self):
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        """Returns this node with the given flags applied, rebuilding it
        only if a flag actually changes (None means "keep as is")."""
        new_positive = self.positive if positive is None else bool(positive)

        if case_flags is None:
            new_case_flags = self.case_flags
        else:
            new_case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]

        new_zerowidth = (self.zerowidth if zerowidth is None else
          bool(zerowidth))

        if (new_positive == self.positive and new_case_flags ==
          self.case_flags and new_zerowidth == self.zerowidth):
            return self

        return self.rebuild(new_positive, new_case_flags, new_zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        pass

    def optimise(self, info, reverse):
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        return self.max_width(), None
1945
1946# Base class for zero-width nodes.
class ZeroWidthBase(RegexBase):
    """Base class for nodes that match a position rather than characters."""

    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        self.positive = bool(positive)
        self.encoding = encoding

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # A zero-width node doesn't consume a character.
        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = self.encoding << ENCODING_OP_SHIFT
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        return 0
1975
class Any(RegexBase):
    """A node matching any single character (compiled to OP.ANY)."""
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = FUZZY_OP if fuzzy else 0
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        # Always matches exactly one character.
        return 1
1994
class AnyAll(Any):
    # A variant of Any compiled with the ANY_ALL opcodes.
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"
1998
class AnyU(Any):
    # A variant of Any compiled with the ANY_U opcodes.
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
2002
class Atomic(RegexBase):
    """A node wrapping a subpattern so that it's matched atomically
    (compiled between OP.ATOMIC and OP.END)."""

    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An atomic wrapper around an empty subpattern is pointless.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    # Defining __eq__ implicitly sets __hash__ to None in Python 3; keep
    # instances hashable by inheriting the base implementation explicitly.
    __hash__ = RegexBase.__hash__

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)
2058
class Boundary(ZeroWidthBase):
    # A zero-width boundary assertion, compiled to OP.BOUNDARY.
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
2062
2063class Branch(RegexBase):
    def __init__(self, branches):
        # branches: the list of alternative subpattern nodes.
        RegexBase.__init__(self)
        self.branches = branches
2067
2068 def fix_groups(self, pattern, reverse, fuzzy):
2069 for b in self.branches:
2070 b.fix_groups(pattern, reverse, fuzzy)
2071
    def optimise(self, info, reverse):
        """Optimises the branches: flattens nested branches, factors out a
        common prefix/suffix, reduces single-character branches to sets, and
        possibly adds a firstset precheck."""
        if not self.branches:
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            # NOTE(review): one of prefix/suffix is always [] above, so this
            # condition is always true — confirm whether checking both was
            # intended.
            if not prefix or not suffix:
                # We might be able to add a quick precheck before the branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)
2106
    def _add_precheck(self, info, reverse, branches):
        """Builds a single-character firstset precheck if every branch is a
        caseless Literal; otherwise returns None."""
        charset = set()
        # The significant character is the last one when matching in reverse.
        pos = -1 if reverse else 0

        for branch in branches:
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in charset])
2121
2122 def pack_characters(self, info):
2123 self.branches = [b.pack_characters(info) for b in self.branches]
2124 return self
2125
2126 def remove_captures(self):
2127 self.branches = [b.remove_captures() for b in self.branches]
2128 return self
2129
2130 def is_atomic(self):
2131 return all(b.is_atomic() for b in self.branches)
2132
2133 def can_be_affix(self):
2134 return all(b.can_be_affix() for b in self.branches)
2135
2136 def contains_group(self):
2137 return any(b.contains_group() for b in self.branches)
2138
2139 def get_firstset(self, reverse):
2140 fs = set()
2141 for b in self.branches:
2142 fs |= b.get_firstset(reverse)
2143
2144 return fs or set([None])
2145
    def _compile(self, reverse, fuzzy):
        """Compiles to a BRANCH opcode with the alternatives separated by
        NEXT and terminated by END."""
        if not self.branches:
            return []

        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        # The final NEXT is replaced by the terminating END.
        code[-1] = (OP.END, )

        return code
2158
2159 def dump(self, indent, reverse):
2160 print("{}BRANCH".format(INDENT * indent))
2161 self.branches[0].dump(indent + 1, reverse)
2162 for b in self.branches[1 : ]:
2163 print("{}OR".format(INDENT * indent))
2164 b.dump(indent + 1, reverse)
2165
2166 @staticmethod
2167 def _flatten_branches(info, reverse, branches):
2168 # Flatten the branches so that there aren't branches of branches.
2169 new_branches = []
2170 for b in branches:
2171 b = b.optimise(info, reverse)
2172 if isinstance(b, Branch):
2173 new_branches.extend(b.branches)
2174 else:
2175 new_branches.append(b)
2176
2177 return new_branches
2178
    @staticmethod
    def _split_common_prefix(info, branches):
        """Returns (prefix, new_branches): the common leading items moved
        out of the branches, and the branches with those items removed."""
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches
2220
    @staticmethod
    def _split_common_suffix(info, branches):
        """Split off the longest common trailing items shared by all branches.

        Mirror image of _split_common_prefix. Returns (suffix_items,
        new_branches); the suffix is empty if nothing could be moved out.
        """
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix? (Scanned with negative indices,
        # from the end towards the start.)
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
              in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches
2261
    @staticmethod
    def _can_split(items, count):
        """Can 'items' be split after 'count' items without breaking full
        case-folding?

        A split is unsafe when the full-case-folded characters straddling the
        split point together fold to a single expanding character.
        """
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True
2286
    @staticmethod
    def _can_split_rev(items, count):
        """Reversed counterpart of _can_split: can 'items' be split 'count'
        items from the end without breaking full case-folding?
        """
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True
2313
2314 @staticmethod
2315 def _merge_common_prefixes(info, reverse, branches):
2316 # Branches with the same case-sensitive character prefix can be grouped
2317 # together if they are separated only by other branches with a
2318 # character prefix.
2319 prefixed = defaultdict(list)
2320 order = {}
2321 new_branches = []
2322 for b in branches:
2323 if Branch._is_simple_character(b):
2324 # Branch starts with a simple character.
2325 prefixed[b.value].append([b])
2326 order.setdefault(b.value, len(order))
2327 elif (isinstance(b, Sequence) and b.items and
2328 Branch._is_simple_character(b.items[0])):
2329 # Branch starts with a simple character.
2330 prefixed[b.items[0].value].append(b.items)
2331 order.setdefault(b.items[0].value, len(order))
2332 else:
2333 Branch._flush_char_prefix(info, reverse, prefixed, order,
2334 new_branches)
2335
2336 new_branches.append(b)
2337
2338 Branch._flush_char_prefix(info, prefixed, order, new_branches)
2339
2340 return new_branches
2341
2342 @staticmethod
2343 def _is_simple_character(c):
2344 return isinstance(c, Character) and c.positive and not c.case_flags
2345
2346 @staticmethod
2347 def _reduce_to_set(info, reverse, branches):
2348 # Can the branches be reduced to a set?
2349 new_branches = []
2350 items = set()
2351 case_flags = NOCASE
2352 for b in branches:
2353 if isinstance(b, (Character, Property, SetBase)):
2354 # Branch starts with a single character.
2355 if b.case_flags != case_flags:
2356 # Different case sensitivity, so flush.
2357 Branch._flush_set_members(info, reverse, items, case_flags,
2358 new_branches)
2359
2360 case_flags = b.case_flags
2361
2362 items.add(b.with_flags(case_flags=NOCASE))
2363 else:
2364 Branch._flush_set_members(info, reverse, items, case_flags,
2365 new_branches)
2366
2367 new_branches.append(b)
2368
2369 Branch._flush_set_members(info, reverse, items, case_flags,
2370 new_branches)
2371
2372 return new_branches
2373
    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        """Emit the branches collected by _merge_common_prefixes.

        Branches sharing a leading character are rebuilt as that character
        followed by a branch of their tails; singletons pass through
        unchanged. Clears 'prefixed' and 'order' afterwards.
        """
        # Flush the prefixed branches.
        if not prefixed:
            return

        # Keep the order in which each prefix character was first seen.
        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        # Only one empty alternative is ever needed.
                        subbranches.append(Sequence())
                        optional = True

                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()
2399
2400 @staticmethod
2401 def _flush_set_members(info, reverse, items, case_flags, new_branches):
2402 # Flush the set members.
2403 if not items:
2404 return
2405
2406 if len(items) == 1:
2407 item = list(items)[0]
2408 else:
2409 item = SetUnion(info, list(items)).optimise(info, reverse)
2410
2411 new_branches.append(item.with_flags(case_flags=case_flags))
2412
2413 items.clear()
2414
2415 @staticmethod
2416 def _is_full_case(items, i):
2417 if not 0 <= i < len(items):
2418 return False
2419
2420 item = items[i]
2421 return (isinstance(item, Character) and item.positive and
2422 (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
2423
2424 @staticmethod
2425 def _is_folded(items):
2426 if len(items) < 2:
2427 return False
2428
2429 for i in items:
2430 if (not isinstance(i, Character) or not i.positive or not
2431 i.case_flags):
2432 return False
2433
2434 folded = "".join(chr(i.value) for i in items)
2435 folded = _regex.fold_case(FULL_CASE_FOLDING, folded)
2436
2437 # Get the characters which expand to multiple codepoints on folding.
2438 expanding_chars = _regex.get_expand_on_folding()
2439
2440 for c in expanding_chars:
2441 if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
2442 return True
2443
2444 return False
2445
2446 def is_empty(self):
2447 return all(b.is_empty() for b in self.branches)
2448
2449 def __eq__(self, other):
2450 return type(self) is type(other) and self.branches == other.branches
2451
2452 def max_width(self):
2453 return max(b.max_width() for b in self.branches)
2454
class CallGroup(RegexBase):
    """A call (subpattern recursion) to a capture group.

    'group' starts out as a name or number from the pattern and is resolved
    to a group number by fix_groups.
    """

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position  # position in the pattern, for errors

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference; raises error if it is invalid,
        out of range, or ambiguous.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        self._key = self.__class__, self.group

    def remove_captures(self):
        # BUG FIX: 'pattern' was an undefined name here, so this raised
        # NameError instead of the intended error. The pattern text isn't
        # available at this point, so pass None.
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
2500
class CallRef(RegexBase):
    """Wraps a parsed subpattern which is the target of a group call."""

    def __init__(self, ref, parsed):
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        code = [(OP.CALL_REF, self.ref)]
        code.extend(self.parsed._compile(reverse, fuzzy))
        code.append((OP.END, ))
        return code
2509
class Character(RegexBase):
    """A single literal character, possibly negated, case-insensitive and/or
    zero-width.
    """

    # Opcode selection keyed by (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        """'value' is the codepoint; positive=False negates the match."""
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Pre-compute the full case-folding; it's longer than 1 character
        # for characters which expand on full case-folding.
        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy of this node with different flags."""
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding, so also try to
            # match the expanded character sequence.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        """Does codepoint 'ch' match this node?"""
        return (ch == self.value) == self.positive

    def max_width(self):
        # The folded form may be more than one character wide.
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self
2583
class Conditional(RegexBase):
    """A conditional subpattern: matches yes_item if the referenced group has
    matched, otherwise no_item.
    """

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group  # name or number; resolved by fix_groups
        self.yes_item = yes_item
        self.no_item = no_item
        self.position = position  # position in the pattern, for errors

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference and fix up both arms."""
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        # Either arm could match first.
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        # GROUP_EXISTS yes [NEXT no] END; the 'no' arm is omitted if empty.
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
2672
class DefaultBoundary(ZeroWidthBase):
    """Zero-width assertion compiling to OP.DEFAULT_BOUNDARY."""
    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"
2676
class DefaultEndOfWord(ZeroWidthBase):
    """Zero-width assertion compiling to OP.DEFAULT_END_OF_WORD."""
    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"
2680
class DefaultStartOfWord(ZeroWidthBase):
    """Zero-width assertion compiling to OP.DEFAULT_START_OF_WORD."""
    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"
2684
class EndOfLine(ZeroWidthBase):
    """Zero-width assertion compiling to OP.END_OF_LINE."""
    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"
2688
class EndOfLineU(EndOfLine):
    """Variant of EndOfLine compiling to OP.END_OF_LINE_U."""
    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"
2692
class EndOfString(ZeroWidthBase):
    """Zero-width assertion compiling to OP.END_OF_STRING."""
    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"
2696
class EndOfStringLine(ZeroWidthBase):
    """Zero-width assertion compiling to OP.END_OF_STRING_LINE."""
    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"
2700
class EndOfStringLineU(EndOfStringLine):
    """Variant of EndOfStringLine compiling to OP.END_OF_STRING_LINE_U."""
    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"
2704
class EndOfWord(ZeroWidthBase):
    """Zero-width assertion compiling to OP.END_OF_WORD."""
    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"
2708
class Failure(ZeroWidthBase):
    """Zero-width node which always fails to match (OP.FAILURE)."""
    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        return [(OP.FAILURE, )]
2714
class Fuzzy(RegexBase):
    """Applies fuzzy (approximate) matching to a subpattern.

    'constraints' maps the error types "d" (deletion), "i" (insertion),
    "s" (substitution) and "e" (any error) to (min, max) pairs, where None
    means unlimited, plus "cost" for the cost equation and optionally "test"
    for a test pattern.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is itself fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits: (min, max) for each of "d", "i", "s", "e".
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        # Insertions make the width unbounded.
        return UNLIMITED

    def _constraints_to_string(self):
        """Render the constraints in the pattern's "{...}" syntax for dump."""
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)
2847
class Grapheme(RegexBase):
    """Matches a whole grapheme: characters up to a grapheme boundary."""

    def _compile(self, reverse, fuzzy):
        # Match at least 1 character until a grapheme boundary is reached.
        # Note that this is the same whether matching forwards or backwards.
        character_run = LazyRepeat(AnyAll(), 1, None)
        matcher = Atomic(Sequence([character_run, GraphemeBoundary()]))

        return matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print("{}GRAPHEME".format(INDENT * indent))

    def max_width(self):
        return UNLIMITED
2862
class GraphemeBoundary:
    """Zero-width grapheme-cluster boundary; used by Grapheme."""

    def compile(self, reverse, fuzzy):
        # NOTE(review): the argument 1 presumably means 'positive' — confirm
        # against the engine's opcode handling.
        return [(OP.GRAPHEME_BOUNDARY, 1)]
2866
class GreedyRepeat(RegexBase):
    """A greedy repeat of a subpattern: min_count..max_count times, where
    max_count of None means unlimited.
    """

    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count  # None means unlimited

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) so subclasses (LazyRepeat, PossessiveRepeat) are
        # preserved.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat can match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        return ([tuple(repeat)] + subpattern + [(OP.END, )])

    def dump(self, indent, reverse):
        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # Optional, so nothing is required; the offset is capped at
            # UNLIMITED.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None
2958
class PossessiveRepeat(GreedyRepeat):
    """A greedy repeat which never gives back what it has matched."""

    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        max_count = UNLIMITED if self.max_count is None else self.max_count
        repeat = (self._opcode, self.min_count, max_count)

        # The repeat is wrapped in an ATOMIC ... END pair.
        return ([(OP.ATOMIC, ), repeat] + subpattern + [(OP.END, ),
          (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        limit = "INF" if self.max_count is None else self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)
2988
class Group(RegexBase):
    """A capture group wrapping a subpattern.

    Negative group numbers denote private (hidden) groups which are mapped
    to their public numbers via info.private_groups.
    """

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Dropping the capture leaves just the subpattern.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # Map a private (negative) group number to its public number and a
        # positive private slot.
        public_group = private_group = self.group
        if private_group < 0:
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            # BUG FIX: 'private_groups' was an undefined bare name here,
            # raising NameError; mirror _compile and look it up on self.info.
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
3070
class Keep(ZeroWidthBase):
    """Zero-width node compiling to OP.KEEP."""
    _opcode = OP.KEEP
    _op_name = "KEEP"
3074
class LazyRepeat(GreedyRepeat):
    """A repeat which matches as few times as possible (OP.LAZY_REPEAT)."""
    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"
3078
class LookAround(RegexBase):
    """A lookaround assertion: lookahead (behind=False) or lookbehind
    (behind=True), positive or negative.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # The subpattern is matched in the direction of the lookaround, not
        # of the enclosing pattern.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        if self.positive and subpattern.is_empty():
            # A positive lookaround for nothing always succeeds; drop it.
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        if self.positive and self.behind == reverse:
            # The assertion looks in the same direction as the match, so its
            # first-set constrains the match's first character.
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        # NOTE(review): 'fuzzy' isn't forwarded to the subpattern here —
        # presumably it relies on compile()'s default; confirm.
        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Zero-width: consumes nothing in the enclosing pattern.
        return 0
3146
class LookAroundConditional(RegexBase):
    """A conditional whose test is a lookaround: matches yes_item if the
    lookaround succeeds, otherwise no_item.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        # CONDITIONAL test NEXT yes [NEXT no] END.
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # NOTE(review): 'and' binds tighter than 'or', so this is
        # (subpattern and yes) or no — confirm the grouping is intended.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None
3231
class PrecompiledCode(RegexBase):
    """Wraps an already-built instruction so it can be spliced into the
    compiled output.
    """

    def __init__(self, code):
        # A flat list forming a single instruction tuple.
        self.code = code

    def _compile(self, reverse, fuzzy):
        return [tuple(self.code)]
3238
class Property(RegexBase):
    """A Unicode property test, e.g. \\p{Lu}, possibly negated and/or
    case-insensitive.
    """

    # Opcode selection keyed by (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False, encoding=0):
        """'value' packs the property and its value (see dump); 'encoding'
        selects e.g. ASCII semantics.
        """
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        self.encoding = encoding

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy of this node with different flags."""
        return Property(self.value, positive, case_flags, zerowidth,
          self.encoding)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # The encoding occupies its own field within the flags.
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        # The property id is in the high 16 bits, its value in the low 16.
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
          ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        """Does codepoint 'ch' match this property test?"""
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1
3294
class Prune(ZeroWidthBase):
    # The (*PRUNE) backtracking-control node.
    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        # PRUNE takes no operands.
        return [(OP.PRUNE, )]
3300
class Range(RegexBase):
    "A node which matches a range of characters, with inclusive bounds."
    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
      (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
      (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
      (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
    _op_name = "RANGE"

    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
      zerowidth=False):
        # lower and upper are codepoints; both ends are inclusive.
        RegexBase.__init__(self)
        self.lower = lower
        self.upper = upper
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self._key = (self.__class__, self.lower, self.upper, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Range(self.lower, self.upper, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # Is the range case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
          FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # For each expanding character inside the range, add a String node
        # matching its full case-folded expansion.
        items = []
        for ch in expanding_chars:
            if self.lower <= ord(ch) <= self.upper:
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                items.append(String([ord(c) for c in folded],
                  case_flags=self.case_flags))

        if not items:
            # We can fall back to simple case-folding.
            return self

        if len(items) < self.upper - self.lower + 1:
            # Not all the characters are covered by the full case-folding,
            # so keep the original range as an alternative.
            items.insert(0, self)

        return Branch(items)

    def _compile(self, reverse, fuzzy):
        # Pack the option bits into the flags operand.
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
          self.upper)]

    def dump(self, indent, reverse):
        # ascii() gives a quoted, escaped form; strip any b/u prefix.
        display_lower = ascii(chr(self.lower)).lstrip("bu")
        display_upper = ascii(chr(self.upper)).lstrip("bu")
        print("{}RANGE {} {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display_lower, display_upper,
          CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (self.lower <= ch <= self.upper) == self.positive

    def max_width(self):
        return 1
3377
class RefGroup(RegexBase):
    "A node which matches a backreference to a capture group."
    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        # group may still be a name at this point; fix_groups resolves it.
        self.group = group
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve a named reference to its group number and validate that
        # the number refers to an existing capture group.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # The key must be refreshed now that the group number is known.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # Bug fix: this previously referenced an undefined name "pattern",
        # so callers got a NameError instead of the intended error. The
        # pattern isn't available in this scope; error() accepts None.
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        # The referenced group's width isn't known here.
        return UNLIMITED

    def __del__(self):
        # Break the reference to the Info object to avoid a reference cycle.
        self.info = None
3426
class SearchAnchor(ZeroWidthBase):
    # Zero-width anchor at the starting position of the search.
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"
3430
class Sequence(RegexBase):
    "A node which matches a sequence of other nodes, one after another."
    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        items = []
        # Pending single codepoints, flushed into Character/String nodes
        # whenever the case flags change or a non-character node is seen.
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither
                    # the previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither
                    # the previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # Not a packable node: flush the pending characters and
                # recurse into the node itself.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def contains_group(self):
        return any(s.contains_group() for s in self.items)

    def get_firstset(self, reverse):
        # Accumulate firstsets while None is present (None is discarded on
        # each step and re-added at the end if it's never resolved).
        # NOTE(review): when reverse is true this calls items.reverse() on
        # self.items itself, mutating the node in place — presumably
        # harmless at this stage of compilation, but confirm.
        fs = set()
        items = self.items
        if reverse:
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # When matching in reverse, the items are compiled in reverse order.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        # Turn the pending codepoints into Character/String/Literal nodes
        # and append them to items; clears the pending list.
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        # Record the (start, end) spans where any folded expansion occurs.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Spans needing full folding become FULLIGNORECASE literals; the
        # gaps between them only need simple IGNORECASE.
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        # Merge overlapping or touching (start, end) spans; sorts in place.
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        # Return the offset of, and the first node that reports, a required
        # string; the offset accumulates the widths of the nodes before it.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None
3643
class SetBase(RegexBase):
    "Base class for the set-operation nodes (union, intersection, etc.)."
    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Rebuild with new flags and re-optimise the result.
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        # Emit the set header, then the compiled members, then END.
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        # Wrap the set in a Branch with String alternatives for members
        # whose full case-folding expands to multiple codepoints.
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Normally one codepoint, but under full case-folding a member can
        # match a multi-codepoint folded expansion.
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference to the Info object to avoid a reference cycle.
        self.info = None
3748
class SetDiff(SetBase):
    "Set difference: matches the first item but not the second."
    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        operands = self.items
        # More than two operands: subtract the union of the rest from the
        # first.
        if len(operands) > 2:
            operands = [operands[0], SetUnion(info, operands[1 : ])]

        # A single operand: the difference degenerates to that operand.
        if len(operands) == 1:
            return operands[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(op.optimise(info, reverse, in_set=True) for op in
          operands)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        in_first = self.items[0].matches(ch)
        in_second = self.items[1].matches(ch)
        return (in_first and not in_second) == self.positive
3774
class SetInter(SetBase):
    "Set intersection: matches only what all of its items match."
    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        flattened = []
        for operand in self.items:
            operand = operand.optimise(info, reverse, in_set=True)
            if isinstance(operand, SetInter) and operand.positive:
                # A nested positive intersection can be folded into this one.
                flattened.extend(operand.items)
            else:
                flattened.append(operand)

        # A single operand: the intersection degenerates to that operand.
        if len(flattened) == 1:
            return flattened[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        return all(i.matches(ch) for i in self.items) == self.positive
3804
class SetSymDiff(SetBase):
    "Set symmetric difference: matches what an odd number of items match."
    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        flattened = []
        for operand in self.items:
            operand = operand.optimise(info, reverse, in_set=True)
            if isinstance(operand, SetSymDiff) and operand.positive:
                # A nested positive symmetric difference can be folded into
                # this one.
                flattened.extend(operand.items)
            else:
                flattened.append(operand)

        # A single operand: the operation degenerates to that operand.
        if len(flattened) == 1:
            return flattened[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # Toggle parity once for each item that matches.
        parity = False
        for i in self.items:
            if i.matches(ch):
                parity = not parity

        return parity == self.positive
3837
class SetUnion(SetBase):
    "Set union: matches what any of its items match."
    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union.
                items.extend(m.items)
            elif isinstance(m, AnyAll):
                # Anything unioned with "match all" matches all.
                return AnyAll()
            else:
                items.append(m)

        # Are there complementary properties? A property and its negation
        # together match everything.
        properties = (set(), set())

        for m in items:
            if isinstance(m, Property):
                properties[m.positive].add((m.value, m.case_flags, m.zerowidth))

        if properties[0] & properties[1]:
            return AnyAll()

        if len(items) == 1:
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Group single characters by polarity so they can be emitted as one
        # CHARACTER or STRING operand per polarity instead of one each.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive
3915
class Skip(ZeroWidthBase):
    # The (*SKIP) backtracking-control node.
    _op_name = "SKIP"
    _opcode = OP.SKIP
3919
class StartOfLine(ZeroWidthBase):
    # Zero-width assertion for the start of a line.
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"
3923
class StartOfLineU(StartOfLine):
    # Start-of-line assertion using the Unicode line-break rules.
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"
3927
class StartOfString(ZeroWidthBase):
    # Zero-width assertion for the start of the string.
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"
3931
class StartOfWord(ZeroWidthBase):
    # Zero-width assertion for the start of a word.
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"
3935
class String(RegexBase):
    "A node which matches a sequence of codepoints as a literal string."
    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-fold each character; a full case-fold may expand a single
            # codepoint into several.
            folded = []
            for char in self.characters:
                expansion = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
                folded.extend(ord(c) for c in expansion)
        else:
            folded = self.characters

        self.folded_characters = tuple(folded)
        # Set to True later if this is the pattern's required string.
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # The first character depends on the matching direction.
        index = -1 if reverse else 0
        return set([Character(self.characters[index],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        opcode = self._opcode[self.case_flags, reverse]
        return [(opcode, flags, len(self.folded_characters)) +
          self.folded_characters]

    def dump(self, indent, reverse):
        # ascii() gives a quoted, escaped form; strip any b/u prefix.
        literal = "".join(chr(c) for c in self.characters)
        display = ascii(literal).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        # The string itself is the required string, at offset 0.
        return 0, self
3990
class Literal(String):
    "A String subclass that dumps with a distinct LITERAL MATCH label."
    def dump(self, indent, reverse):
        text = ''.join(chr(c) for c in self.characters)
        # ascii() gives a quoted, escaped form; strip any b/u prefix.
        display = ascii(text).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))
3997
class StringSet(Branch):
    "A node which matches any one of a named list of strings."
    def __init__(self, info, name, case_flags=NOCASE):
        self.info = info
        # name refers to a keyword argument supplied to the compiler; the
        # actual strings are looked up in info.kwargs.
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        # Register this (name, case_flags) combination, assigning it an
        # index the first time it's seen.
        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        # NOTE(review): index and fold_flags appear to be unused below.
        index = self.info.named_lists_used[self.set_key]
        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        encoding = self.info.flags & _ALL_ENCODINGS
        fold_flags = encoding | case_flags

        choices = []

        # Each member becomes a list of Character nodes; str members are
        # converted to codepoint lists first.
        for string in items:
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Break the reference to the Info object to avoid a reference cycle.
        self.info = None
4038
class Source:
    "Scanner for the regular expression source string."
    def __init__(self, string):
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            # Work on a str internally; Latin-1 maps each byte to the
            # codepoint with the same value.
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        # When True (verbose mode), whitespace and "#" comments are skipped.
        self.ignore_space = False
        # An empty string/bytes of the same type as the input.
        self.sep = string[ : 0]

    def get(self, override_ignore=False):
        "Returns the next character, or an empty string at the end."
        text = self.string
        index = self.pos

        try:
            if self.ignore_space and not override_ignore:
                # In verbose mode, skip whitespace and "#" comments.
                while True:
                    if text[index].isspace():
                        index += 1
                    elif text[index] == "#":
                        index = text.index("\n", index)
                    else:
                        break

            ch = text[index]
            self.pos = index + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = index
            return text[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(text)
            return text[ : 0]

    def get_many(self, count=1):
        "Returns up to count characters."
        text = self.string
        index = self.pos

        try:
            if self.ignore_space:
                collected = []

                while len(collected) < count:
                    # Skip whitespace and "#" comments before each character.
                    while True:
                        if text[index].isspace():
                            index += 1
                        elif text[index] == "#":
                            index = text.index("\n", index)
                        else:
                            break

                    collected.append(text[index])
                    index += 1

                collected = "".join(collected)
            else:
                collected = text[index : index + count]
                index += len(collected)

            self.pos = index
            return collected
        except (IndexError, ValueError):
            # Either the string or a trailing comment ran out; return what
            # was gathered so far.
            self.pos = len(text)
            return "".join(collected)

    def get_while(self, test_set, include=True, keep_spaces=False):
        "Returns the run of characters that are (or aren't) in test_set."
        text = self.string
        index = self.pos

        if self.ignore_space and not keep_spaces:
            try:
                collected = []

                while True:
                    if text[index].isspace():
                        # Skip over the whitespace.
                        index += 1
                    elif text[index] == "#":
                        # Skip over the comment to the end of the line.
                        index = text.index("\n", index)
                    elif (text[index] in test_set) == include:
                        collected.append(text[index])
                        index += 1
                    else:
                        break

                self.pos = index
            except (IndexError, ValueError):
                # End of string, or a comment ran to the end of it.
                self.pos = len(text)

            return "".join(collected)
        else:
            try:
                while (text[index] in test_set) == include:
                    index += 1
            except IndexError:
                # We've reached the end of the string.
                pass

            result = text[self.pos : index]
            self.pos = index
            return result

    def skip_while(self, test_set, include=True):
        "Skips characters that are (or aren't) in test_set."
        text = self.string
        index = self.pos

        try:
            if self.ignore_space:
                while True:
                    if text[index].isspace():
                        # Skip over the whitespace.
                        index += 1
                    elif text[index] == "#":
                        # Skip over the comment to the end of the line.
                        index = text.index("\n", index)
                    elif (text[index] in test_set) == include:
                        index += 1
                    else:
                        break
            else:
                while (text[index] in test_set) == include:
                    index += 1

            self.pos = index
        except (IndexError, ValueError):
            # End of string, or a comment ran to the end of it.
            self.pos = len(text)

    def match(self, substring):
        "Advances past substring if it's next in the source; returns bool."
        text = self.string
        index = self.pos

        if not self.ignore_space:
            if not text.startswith(substring, index):
                return False

            self.pos = index + len(substring)
            return True

        try:
            for wanted in substring:
                # Skip whitespace and "#" comments before each character.
                while True:
                    if text[index].isspace():
                        index += 1
                    elif text[index] == "#":
                        index = text.index("\n", index)
                    else:
                        break

                if text[index] != wanted:
                    return False

                index += 1

            self.pos = index
            return True
        except (IndexError, ValueError):
            # Ran off the end of the string (or a trailing comment).
            return False

    def expect(self, substring):
        "Like match, but raises error if substring isn't next."
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        "Returns whether only ignorable text (if any) remains."
        text = self.string
        index = self.pos

        try:
            if self.ignore_space:
                while True:
                    if text[index].isspace():
                        index += 1
                    elif text[index] == "#":
                        index = text.index("\n", index)
                    else:
                        break

            return index >= len(text)
        except (IndexError, ValueError):
            # End of string, or a comment ran to the end of it.
            return True
4260
class Info:
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs={}):
        # NOTE(review): the mutable default for kwargs is shared across
        # calls — presumably it's only ever read, never mutated; confirm.
        # Fill in the version-specific default flags.
        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = kwargs

        # Capture-group bookkeeping: numbers, names, and which groups are
        # currently open during parsing.
        self.group_count = 0
        self.group_index = {}
        self.group_name = {}
        self.char_type = char_type
        self.named_lists_used = {}
        self.open_groups = []
        self.open_group_count = {}
        self.defined_groups = {}
        self.group_calls = []
        self.private_groups = {}

    def open_group(self, name=None):
        # Returns the group number for a (possibly named) group being
        # opened; allocates a new number if the name is unknown.
        group = self.group_index.get(name)
        if group is None:
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        self.open_groups.pop()

    def is_open_group(self, name):
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups
4325
def _check_group_features(info, parsed):
    """Checks whether the reverse and fuzzy features of the group calls match
    the groups which they call.
    """
    # Maps (group, reverse, fuzzy) to a call-reference index; calls needing
    # a variant of a group that doesn't exist get a copy queued in
    # additional_groups.
    call_refs = {}
    additional_groups = []
    for call, reverse, fuzzy in info.group_calls:
        # Look up the reference of this group call.
        key = (call.group, reverse, fuzzy)
        ref = call_refs.get(key)
        if ref is None:
            # This group doesn't have a reference yet, so look up its features.
            if call.group == 0:
                # Calling the pattern as a whole.
                rev = bool(info.flags & REVERSE)
                fuz = isinstance(parsed, Fuzzy)
                if (rev, fuz) != (reverse, fuzzy):
                    # The pattern as a whole doesn't have the features we want,
                    # so we'll need to make a copy of it with the desired
                    # features.
                    additional_groups.append((CallRef(len(call_refs), parsed),
                      reverse, fuzzy))
            else:
                # Calling a capture group.
                def_info = info.defined_groups[call.group]
                group = def_info[0]
                if def_info[1 : ] != (reverse, fuzzy):
                    # The group doesn't have the features we want, so we'll
                    # need to make a copy of it with the desired features.
                    additional_groups.append((group, reverse, fuzzy))

            ref = len(call_refs)
            call_refs[key] = ref

        call.call_ref = ref

    info.call_refs = call_refs
    info.additional_groups = additional_groups
4364
def _get_required_string(parsed, flags):
    """Gets the required string and related info of a parsed pattern.

    Returns (offset, folded_characters, case_flags); when the pattern has
    no required string, returns (0, (), 0). An unbounded offset is
    reported as -1.
    """
    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
    if not required:
        return 0, (), 0

    # Mark the node so the compiler knows it's the required string.
    required.required = True
    if req_offset >= UNLIMITED:
        req_offset = -1

    req_flags = required.case_flags
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags
4385
class Scanner:
    """A lexical scanner built from a lexicon of (phrase, action) pairs.

    Each phrase is a regex pattern (capture groups are forbidden within a
    phrase); the phrases are combined into one compound pattern with one
    group per phrase. 'scan' repeatedly matches the compound pattern
    against a string, applying the action of whichever phrase matched.
    """

    def __init__(self, lexicon, flags=0):
        """Compiles the lexicon into a compound scanner pattern.

        lexicon: sequence of (phrase, action) pairs; an action is either a
            callable invoked as action(scanner, matched_text) or a plain
            value to emit.
        flags: regex flags applied when parsing every phrase.

        Raises error for a malformed phrase or for group calls (recursion),
        which the Scanner class doesn't support.
        """
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                    source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern, one numbered group
        # per phrase so 'scan' can tell which phrase matched.
        info = Info(flags)
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
            info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
                source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible, and prepend it as a matching
            # hint for the engine.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
            code, {}, {}, {}, [], req_offset, req_chars, req_flags,
            len(patterns))

    def scan(self, string):
        """Scans 'string', returning (tokens, remainder).

        'tokens' is the list of non-None action results in match order;
        'remainder' is the unscanned tail of the string (non-empty if
        scanning stopped at a failed or zero-width match). While a callable
        action runs, the current match object is available as self.match.
        """
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # A zero-width match would never advance; stop scanning.
                break
            action = self.lexicon[m.lastindex - 1][1]
            # Use the modern callable() idiom rather than probing for
            # '__call__' with hasattr().
            if callable(action):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]
4481
# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict: for each property id, keep the
# longest known name for the property and for each of its values.
PROPERTY_NAMES = {}
for _prop_name, (_prop_id, _values) in PROPERTIES.items():
    _best_name, _value_names = PROPERTY_NAMES.get(_prop_id, ("", {}))
    if len(_prop_name) > len(_best_name):
        _best_name = _prop_name
    PROPERTY_NAMES[_prop_id] = _best_name, _value_names

    for _val_name, _val_id in _values.items():
        if len(_val_name) > len(_value_names.get(_val_id, "")):
            _value_names[_val_id] = _val_name
        else:
            _value_names.setdefault(_val_id, "")
4495
# Character escape sequences: \a, \b, \f, \n, \r, \t, \v map to their
# C-style control characters.
CHARACTER_ESCAPES = dict(zip("abfnrtv", "\a\b\f\n\r\t\v"))
4506
ASCII_ENCODING = 1
UNICODE_ENCODING = 2

# Predefined character set escape sequences: \d/\D, \s/\S, \w/\W in their
# positive/negative forms, plus \h (horizontal whitespace, positive only).
CHARSET_ESCAPES = {"h": lookup_property(None, "Blank", True)}
for _letter, _prop in (("d", "Digit"), ("s", "Space"), ("w", "Word")):
    CHARSET_ESCAPES[_letter] = lookup_property(None, _prop, True)
    CHARSET_ESCAPES[_letter.upper()] = lookup_property(None, _prop, False)

def _encoded_charset_escapes(encoding):
    # Build the charset escapes for a specific encoding; \h has no
    # encoding-specific form, so it keeps its default.
    escapes = dict(CHARSET_ESCAPES)
    for letter, prop in (("d", "Digit"), ("s", "Space"), ("w", "Word")):
        escapes[letter] = lookup_property(None, prop, True,
          encoding=encoding)
        escapes[letter.upper()] = lookup_property(None, prop, False,
          encoding=encoding)
    return escapes

ASCII_CHARSET_ESCAPES = _encoded_charset_escapes(ASCII_ENCODING)
UNICODE_CHARSET_ESCAPES = _encoded_charset_escapes(UNICODE_ENCODING)
4539
# Positional escape sequences.
POSITION_ESCAPES = {
    "A": StartOfString(),
    "b": Boundary(),
    "B": Boundary(False),
    "K": Keep(),
    "m": StartOfWord(),
    "M": EndOfWord(),
    "Z": EndOfString(),
}

def _encoded_position_escapes(encoding):
    # \A, \K and \Z are encoding-independent; only the word-related
    # escapes (\b, \B, \m, \M) change with the encoding.
    escapes = dict(POSITION_ESCAPES)
    escapes["b"] = Boundary(encoding=encoding)
    escapes["B"] = Boundary(False, encoding=encoding)
    escapes["m"] = StartOfWord(encoding=encoding)
    escapes["M"] = EndOfWord(encoding=encoding)
    return escapes

ASCII_POSITION_ESCAPES = _encoded_position_escapes(ASCII_ENCODING)
UNICODE_POSITION_ESCAPES = _encoded_position_escapes(UNICODE_ENCODING)
4564
# Positional escape sequences when WORD flag set: the word-related escapes
# switch to the "default word boundary" behaviour.
WORD_POSITION_ESCAPES = {
    **POSITION_ESCAPES,
    "b": DefaultBoundary(),
    "B": DefaultBoundary(False),
    "m": DefaultStartOfWord(),
    "M": DefaultEndOfWord(),
}
4573
# Regex control verbs, as used in (*FAIL), (*PRUNE), etc.; "F" is a
# synonym for "FAIL".
VERBS = dict(
    FAIL=Failure(),
    F=Failure(),
    PRUNE=Prune(),
    SKIP=Skip(),
)