Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pygments/regexopt.py: 98%
42 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
1"""
2 pygments.regexopt
3 ~~~~~~~~~~~~~~~~~
5 An algorithm that generates optimized regexes for matching long lists of
6 literal strings.
8 :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS.
9 :license: BSD, see LICENSE for details.
10"""
12import re
13from re import escape
14from os.path import commonprefix
15from itertools import groupby
16from operator import itemgetter
18CS_ESCAPE = re.compile(r'[\[\^\\\-\]]')
19FIRST_ELEMENT = itemgetter(0)
22def make_charset(letters):
23 return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']'
26def regex_opt_inner(strings, open_paren):
27 """Return a regex that matches any string in the sorted list of strings."""
28 close_paren = open_paren and ')' or ''
29 # print strings, repr(open_paren)
30 if not strings:
31 # print '-> nothing left'
32 return ''
33 first = strings[0]
34 if len(strings) == 1:
35 # print '-> only 1 string'
36 return open_paren + escape(first) + close_paren
37 if not first:
38 # print '-> first string empty'
39 return open_paren + regex_opt_inner(strings[1:], '(?:') \
40 + '?' + close_paren
41 if len(first) == 1:
42 # multiple one-char strings? make a charset
43 oneletter = []
44 rest = []
45 for s in strings:
46 if len(s) == 1:
47 oneletter.append(s)
48 else:
49 rest.append(s)
50 if len(oneletter) > 1: # do we have more than one oneletter string?
51 if rest:
52 # print '-> 1-character + rest'
53 return open_paren + regex_opt_inner(rest, '') + '|' \
54 + make_charset(oneletter) + close_paren
55 # print '-> only 1-character'
56 return open_paren + make_charset(oneletter) + close_paren
57 prefix = commonprefix(strings)
58 if prefix:
59 plen = len(prefix)
60 # we have a prefix for all strings
61 # print '-> prefix:', prefix
62 return open_paren + escape(prefix) \
63 + regex_opt_inner([s[plen:] for s in strings], '(?:') \
64 + close_paren
65 # is there a suffix?
66 strings_rev = [s[::-1] for s in strings]
67 suffix = commonprefix(strings_rev)
68 if suffix:
69 slen = len(suffix)
70 # print '-> suffix:', suffix[::-1]
71 return open_paren \
72 + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \
73 + escape(suffix[::-1]) + close_paren
74 # recurse on common 1-string prefixes
75 # print '-> last resort'
76 return open_paren + \
77 '|'.join(regex_opt_inner(list(group[1]), '')
78 for group in groupby(strings, lambda s: s[0] == first[0])) \
79 + close_paren
82def regex_opt(strings, prefix='', suffix=''):
83 """Return a compiled regex that matches any string in the given list.
85 The strings to match must be literal strings, not regexes. They will be
86 regex-escaped.
88 *prefix* and *suffix* are pre- and appended to the final regex.
89 """
90 strings = sorted(strings)
91 return prefix + regex_opt_inner(strings, '(') + suffix