Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pygments/regexopt.py: 96%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

51 statements  

1""" 

2 pygments.regexopt 

3 ~~~~~~~~~~~~~~~~~ 

4 

5 An algorithm that generates optimized regexes for matching long lists of 

6 literal strings. 

7 

8 :copyright: Copyright 2006-present by the Pygments team, see AUTHORS. 

9 :license: BSD, see LICENSE for details. 

10""" 

11 

12import re 

13from re import escape 

14from itertools import groupby 

15from operator import itemgetter 

16 

17CS_ESCAPE = re.compile(r'[\[\^\\\-\]]') 

18FIRST_ELEMENT = itemgetter(0) 

19 

20 

21def commonprefix(m): 

22 """Given an iterable of strings, returns the longest common leading substring""" 

23 if not m: 

24 return "" 

25 s1 = min(m) 

26 s2 = max(m) 

27 for i, c in enumerate(s1): 

28 if c != s2[i]: 

29 return s1[:i] 

30 return s1 

31 

32 

33def make_charset(letters): 

34 return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']' 

35 

36 

37def regex_opt_inner(strings, open_paren): 

38 """Return a regex that matches any string in the sorted list of strings.""" 

39 close_paren = open_paren and ')' or '' 

40 # print strings, repr(open_paren) 

41 if not strings: 

42 # print '-> nothing left' 

43 return '' 

44 first = strings[0] 

45 if len(strings) == 1: 

46 # print '-> only 1 string' 

47 return open_paren + escape(first) + close_paren 

48 if not first: 

49 # print '-> first string empty' 

50 return open_paren + regex_opt_inner(strings[1:], '(?:') \ 

51 + '?' + close_paren 

52 if len(first) == 1: 

53 # multiple one-char strings? make a charset 

54 oneletter = [] 

55 rest = [] 

56 for s in strings: 

57 if len(s) == 1: 

58 oneletter.append(s) 

59 else: 

60 rest.append(s) 

61 if len(oneletter) > 1: # do we have more than one oneletter string? 

62 if rest: 

63 # print '-> 1-character + rest' 

64 return open_paren + regex_opt_inner(rest, '') + '|' \ 

65 + make_charset(oneletter) + close_paren 

66 # print '-> only 1-character' 

67 return open_paren + make_charset(oneletter) + close_paren 

68 prefix = commonprefix(strings) 

69 if prefix: 

70 plen = len(prefix) 

71 # we have a prefix for all strings 

72 # print '-> prefix:', prefix 

73 return open_paren + escape(prefix) \ 

74 + regex_opt_inner([s[plen:] for s in strings], '(?:') \ 

75 + close_paren 

76 # is there a suffix? 

77 strings_rev = [s[::-1] for s in strings] 

78 suffix = commonprefix(strings_rev) 

79 if suffix: 

80 slen = len(suffix) 

81 # print '-> suffix:', suffix[::-1] 

82 return open_paren \ 

83 + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \ 

84 + escape(suffix[::-1]) + close_paren 

85 # recurse on common 1-string prefixes 

86 # print '-> last resort' 

87 return open_paren + \ 

88 '|'.join(regex_opt_inner(list(group[1]), '') 

89 for group in groupby(strings, lambda s: s[0] == first[0])) \ 

90 + close_paren 

91 

92 

93def regex_opt(strings, prefix='', suffix=''): 

94 """Return a compiled regex that matches any string in the given list. 

95 

96 The strings to match must be literal strings, not regexes. They will be 

97 regex-escaped. 

98 

99 *prefix* and *suffix* are pre- and appended to the final regex. 

100 """ 

101 strings = sorted(strings) 

102 return prefix + regex_opt_inner(strings, '(') + suffix