Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pygments/regexopt.py: 98%

42 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1""" 

2 pygments.regexopt 

3 ~~~~~~~~~~~~~~~~~ 

4 

5 An algorithm that generates optimized regexes for matching long lists of 

6 literal strings. 

7 

8 :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS. 

9 :license: BSD, see LICENSE for details. 

10""" 

11 

12import re 

13from re import escape 

14from os.path import commonprefix 

15from itertools import groupby 

16from operator import itemgetter 

17 

18CS_ESCAPE = re.compile(r'[\[\^\\\-\]]') 

19FIRST_ELEMENT = itemgetter(0) 

20 

21 

22def make_charset(letters): 

23 return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']' 

24 

25 

26def regex_opt_inner(strings, open_paren): 

27 """Return a regex that matches any string in the sorted list of strings.""" 

28 close_paren = open_paren and ')' or '' 

29 # print strings, repr(open_paren) 

30 if not strings: 

31 # print '-> nothing left' 

32 return '' 

33 first = strings[0] 

34 if len(strings) == 1: 

35 # print '-> only 1 string' 

36 return open_paren + escape(first) + close_paren 

37 if not first: 

38 # print '-> first string empty' 

39 return open_paren + regex_opt_inner(strings[1:], '(?:') \ 

40 + '?' + close_paren 

41 if len(first) == 1: 

42 # multiple one-char strings? make a charset 

43 oneletter = [] 

44 rest = [] 

45 for s in strings: 

46 if len(s) == 1: 

47 oneletter.append(s) 

48 else: 

49 rest.append(s) 

50 if len(oneletter) > 1: # do we have more than one oneletter string? 

51 if rest: 

52 # print '-> 1-character + rest' 

53 return open_paren + regex_opt_inner(rest, '') + '|' \ 

54 + make_charset(oneletter) + close_paren 

55 # print '-> only 1-character' 

56 return open_paren + make_charset(oneletter) + close_paren 

57 prefix = commonprefix(strings) 

58 if prefix: 

59 plen = len(prefix) 

60 # we have a prefix for all strings 

61 # print '-> prefix:', prefix 

62 return open_paren + escape(prefix) \ 

63 + regex_opt_inner([s[plen:] for s in strings], '(?:') \ 

64 + close_paren 

65 # is there a suffix? 

66 strings_rev = [s[::-1] for s in strings] 

67 suffix = commonprefix(strings_rev) 

68 if suffix: 

69 slen = len(suffix) 

70 # print '-> suffix:', suffix[::-1] 

71 return open_paren \ 

72 + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \ 

73 + escape(suffix[::-1]) + close_paren 

74 # recurse on common 1-string prefixes 

75 # print '-> last resort' 

76 return open_paren + \ 

77 '|'.join(regex_opt_inner(list(group[1]), '') 

78 for group in groupby(strings, lambda s: s[0] == first[0])) \ 

79 + close_paren 

80 

81 

82def regex_opt(strings, prefix='', suffix=''): 

83 """Return a compiled regex that matches any string in the given list. 

84 

85 The strings to match must be literal strings, not regexes. They will be 

86 regex-escaped. 

87 

88 *prefix* and *suffix* are pre- and appended to the final regex. 

89 """ 

90 strings = sorted(strings) 

91 return prefix + regex_opt_inner(strings, '(') + suffix