Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pygments/regexopt.py: 96%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2 pygments.regexopt
3 ~~~~~~~~~~~~~~~~~
5 An algorithm that generates optimized regexes for matching long lists of
6 literal strings.
8 :copyright: Copyright 2006-present by the Pygments team, see AUTHORS.
9 :license: BSD, see LICENSE for details.
10"""
12import re
13from re import escape
14from itertools import groupby
15from operator import itemgetter
17CS_ESCAPE = re.compile(r'[\[\^\\\-\]]')
18FIRST_ELEMENT = itemgetter(0)
21def commonprefix(m):
22 """Given an iterable of strings, returns the longest common leading substring"""
23 if not m:
24 return ""
25 s1 = min(m)
26 s2 = max(m)
27 for i, c in enumerate(s1):
28 if c != s2[i]:
29 return s1[:i]
30 return s1
33def make_charset(letters):
34 return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']'
37def regex_opt_inner(strings, open_paren):
38 """Return a regex that matches any string in the sorted list of strings."""
39 close_paren = open_paren and ')' or ''
40 # print strings, repr(open_paren)
41 if not strings:
42 # print '-> nothing left'
43 return ''
44 first = strings[0]
45 if len(strings) == 1:
46 # print '-> only 1 string'
47 return open_paren + escape(first) + close_paren
48 if not first:
49 # print '-> first string empty'
50 return open_paren + regex_opt_inner(strings[1:], '(?:') \
51 + '?' + close_paren
52 if len(first) == 1:
53 # multiple one-char strings? make a charset
54 oneletter = []
55 rest = []
56 for s in strings:
57 if len(s) == 1:
58 oneletter.append(s)
59 else:
60 rest.append(s)
61 if len(oneletter) > 1: # do we have more than one oneletter string?
62 if rest:
63 # print '-> 1-character + rest'
64 return open_paren + regex_opt_inner(rest, '') + '|' \
65 + make_charset(oneletter) + close_paren
66 # print '-> only 1-character'
67 return open_paren + make_charset(oneletter) + close_paren
68 prefix = commonprefix(strings)
69 if prefix:
70 plen = len(prefix)
71 # we have a prefix for all strings
72 # print '-> prefix:', prefix
73 return open_paren + escape(prefix) \
74 + regex_opt_inner([s[plen:] for s in strings], '(?:') \
75 + close_paren
76 # is there a suffix?
77 strings_rev = [s[::-1] for s in strings]
78 suffix = commonprefix(strings_rev)
79 if suffix:
80 slen = len(suffix)
81 # print '-> suffix:', suffix[::-1]
82 return open_paren \
83 + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \
84 + escape(suffix[::-1]) + close_paren
85 # recurse on common 1-string prefixes
86 # print '-> last resort'
87 return open_paren + \
88 '|'.join(regex_opt_inner(list(group[1]), '')
89 for group in groupby(strings, lambda s: s[0] == first[0])) \
90 + close_paren
93def regex_opt(strings, prefix='', suffix=''):
94 """Return a compiled regex that matches any string in the given list.
96 The strings to match must be literal strings, not regexes. They will be
97 regex-escaped.
99 *prefix* and *suffix* are pre- and appended to the final regex.
100 """
101 strings = sorted(strings)
102 return prefix + regex_opt_inner(strings, '(') + suffix