Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/textdistance/libraries.py: 75%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

122 statements  

1from __future__ import annotations 

2 

3# built-in 

4import json 

5from collections import defaultdict 

6from copy import deepcopy 

7from importlib import import_module 

8from pathlib import Path 

9from typing import Any, Callable, Sequence 

10 

11 

# JSON file shipped next to this module; LibrariesManager.optimize() loads it
# as the benchmark results used to filter and order registered libraries.
LIBRARIES_PATH = Path(__file__).parent / 'libraries.json'

13 

14 

class LibrariesManager:
    """Registry that maps an algorithm name to its external implementations.

    ``libs`` is a ``defaultdict(list)``: the key is the algorithm name and the
    value is the list of library descriptors, kept in registration order until
    :meth:`optimize` filters and reorders them.
    """

    libs: defaultdict[str, list[LibraryBase]]

    def __init__(self) -> None:
        self.libs = defaultdict(list)

    def register(self, alg: str, lib: LibraryBase) -> None:
        """Append ``lib`` to the implementations known for algorithm ``alg``."""
        implementations = self.libs[alg]
        implementations.append(lib)

    def optimize(self) -> None:
        """Filter and order registered libraries using the benchmark file.

        The JSON file at ``LIBRARIES_PATH`` maps an algorithm name to a list of
        ``[module_name, func_name]`` pairs. For every algorithm that has
        registered libraries, only the libraries listed in the file are kept,
        and they are ordered by their position in that list.
        """
        with LIBRARIES_PATH.open('r', encoding='utf8') as stream:
            benchmarks: dict = json.load(stream)

        for name, ranking in benchmarks.items():
            registered = self.get_libs(name)
            # nothing registered for this algorithm: leave ``self.libs`` alone
            if not registered:
                continue

            def position(lib: LibraryBase) -> int:
                # index of the library inside the benchmark ranking
                return ranking.index([lib.module_name, lib.func_name])

            # keep only the libraries present in the benchmark results ...
            kept = []
            for lib in registered:
                if [lib.module_name, lib.func_name] in ranking:
                    kept.append(lib)
            # ... ordered the same way the benchmark file orders them
            self.libs[name] = sorted(kept, key=position)

    def get_algorithms(self) -> list[str]:
        """Return the names of all algorithms that have an entry in ``libs``."""
        return list(self.libs)

    def get_libs(self, alg: str) -> list[LibraryBase]:
        """Return the libraries registered for ``alg``.

        Unknown algorithms yield a fresh empty list; ``dict.get`` is used so
        that the lookup does not create an entry in the defaultdict.
        """
        return self.libs.get(alg, [])

    def clone(self) -> LibrariesManager:
        """Return a new manager of the same class with a deep copy of ``libs``."""
        duplicate = type(self)()
        duplicate.libs = deepcopy(self.libs)
        return duplicate

60 

61 

class LibraryBase:
    """Description of one external implementation of an algorithm.

    The target callable is located lazily by :meth:`get_function`:
    ``module_name`` is imported, ``func_name`` is read from it, the object is
    optionally called with ``presets`` and finally ``attr`` is optionally read
    from the result.

    Args:
        module_name: dotted name of the module to import.
        func_name: name of the object to take from that module.
        presets: keyword arguments used to call the object (e.g. to
            instantiate a class) before it is used; ``None`` means no call.
        attr: attribute to read from the (possibly instantiated) object;
            ``None`` means the object itself is the function.
        conditions: attribute name -> required value; checked against the
            object passed to :meth:`check_conditions`.
    """

    # Resolution cache for get_function():
    #   NotImplemented -> not resolved yet,
    #   None           -> resolution failed (library unavailable),
    #   anything else  -> the resolved callable.
    func: Callable | None | Any = NotImplemented

    def __init__(
        self,
        module_name: str,
        func_name: str,
        *,
        presets: dict[str, Any] | None = None,
        attr: str | None = None,
        conditions: dict[str, bool] | None = None,
    ) -> None:
        self.module_name = module_name
        self.func_name = func_name
        self.presets = presets
        self.conditions = conditions
        self.attr = attr

    def check_conditions(self, obj: object, *sequences: Sequence) -> bool:
        """Tell whether this library may be used for ``obj`` and ``sequences``.

        Returns ``False`` unless exactly two sequences are given. Otherwise
        every ``conditions`` entry must equal the same-named attribute of
        ``obj``; with no conditions the answer is ``True``.
        """
        # external libs can compare only 2 strings
        if len(sequences) != 2:
            return False
        if not self.conditions:
            return True
        for name, value in self.conditions.items():
            if getattr(obj, name) != value:
                return False

        return True

    def prepare(self, *sequences: Sequence) -> tuple:
        """Return the sequences unchanged; subclasses may convert them."""
        return sequences

    @property
    def setup(self) -> str:
        """Python source that imports the target and binds it to ``func``.

        Mirrors the steps of :meth:`get_function` (import, optional call with
        ``presets``, optional attribute access) as text.
        """
        result = f'from {self.module_name} import {self.func_name} as func'
        result += '\nfunc = func'
        if self.presets is not None:
            result += f'(**{repr(self.presets)})'
        if self.attr is not None:
            result += f'.{self.attr}'
        return result

    def get_function(self) -> Callable | None:
        """Resolve and cache the callable described by this object.

        Returns:
            The resolved callable, or ``None`` when the module cannot be
            imported or does not provide ``func_name`` / ``attr`` (for
            instance an installed version that lacks the expected function).
            The outcome is cached in ``self.func``, so resolution is
            attempted only once per instance.
        """
        # already resolved (successfully or not) on a previous call
        if self.func is not NotImplemented:
            return self.func

        # import module
        try:
            module = import_module(self.module_name)
        except ImportError:
            self.func = None
            return None

        # get object from module; a module that imports but lacks the name
        # is treated like a missing library instead of raising AttributeError
        obj = getattr(module, self.func_name, None)
        if obj is None:
            self.func = None
            return None
        # init class
        if self.presets is not None:
            obj = obj(**self.presets)
        # get needed attribute (missing attribute -> library unavailable)
        if self.attr is not None:
            obj = getattr(obj, self.attr, None)
        self.func = obj

        return self.func

    def __str__(self) -> str:
        return f'{self.module_name}.{self.func_name}'

128 

129 

class TextLibrary(LibraryBase):
    """Library descriptor for implementations that only accept ``str`` input."""

    def check_conditions(self, obj: object, *sequences: Sequence) -> bool:
        """Accept only letter-wise comparison of exact ``str`` objects.

        On top of the parent checks, ``obj.qval`` must equal 1 (a missing
        ``qval`` is read as 0 and therefore rejected) and every sequence must
        be of type ``str`` exactly.
        """
        # parent rules first, then: compare only by letters
        if not super().check_conditions(obj, *sequences) or getattr(obj, 'qval', 0) != 1:
            return False

        # every sequence must be string (exact type, checked with ``is``)
        return all(type(seq) is str for seq in sequences)

    def prepare(self, *sequences: Sequence) -> tuple:
        """Join sequences of letters into strings.

        Only the first sequence is inspected: when it is a tuple or a list,
        every sequence is joined with ``''``; otherwise the input is returned
        as is.
        """
        head = sequences[0]
        if not isinstance(head, (tuple, list)):
            return sequences
        # convert list of letters to string
        return tuple(''.join(letters) for letters in sequences)

150 

151 

class SameLengthLibrary(LibraryBase):
    """Library descriptor for implementations that need equal-length inputs."""

    def check_conditions(self, obj: object, *sequences: Sequence) -> bool:
        """Apply the parent checks, then require all sequences to share one length."""
        if not super().check_conditions(obj, *sequences):
            return False
        # compare only same length iterators
        lengths = [len(seq) for seq in sequences]
        return min(lengths) == max(lengths)

160 

161 

class SameLengthTextLibrary(SameLengthLibrary, TextLibrary):
    """Combination of the same-length and the text-only restrictions.

    Adds no code of its own. ``SameLengthLibrary.check_conditions`` runs first
    and its ``super()`` call continues into ``TextLibrary.check_conditions``,
    so both sets of checks apply; ``prepare`` comes from ``TextLibrary``.
    """

164 

165 

# Module-level registry filled below. Libraries are appended in the order they
# are registered here; LibrariesManager.optimize() may later filter and
# reorder them. ``conditions`` are compared with attributes of the object
# passed to ``check_conditions``.
prototype = LibrariesManager()
reg = prototype.register

alg = 'DamerauLevenshtein'
reg(alg, LibraryBase(
    'pyxdameraulevenshtein',
    'damerau_levenshtein_distance',
    conditions={'restricted': True},
))
reg(alg, TextLibrary(
    'jellyfish',
    'damerau_levenshtein_distance',
    conditions={'restricted': False},
))
reg(alg, LibraryBase(
    'rapidfuzz.distance.DamerauLevenshtein',
    'distance',
    conditions={'restricted': False},
))
reg(alg, LibraryBase(
    'rapidfuzz.distance.OSA',
    'distance',
    conditions={'restricted': True},
))

alg = 'Hamming'
reg(alg, SameLengthTextLibrary('Levenshtein', 'hamming'))
reg(alg, TextLibrary('jellyfish', 'hamming_distance'))
reg(alg, SameLengthLibrary('rapidfuzz.distance.Hamming', 'distance'))

alg = 'Jaro'
reg(alg, TextLibrary('jellyfish', 'jaro_similarity'))
reg(alg, LibraryBase('rapidfuzz.distance.Jaro', 'similarity'))
# currently disabled:
# reg(alg, TextLibrary('Levenshtein', 'jaro'))
# reg(alg, TextLibrary('py_stringmatching.similarity_measure.jaro', 'jaro'))

alg = 'JaroWinkler'
# currently disabled:
# reg(alg, LibraryBase('py_stringmatching.similarity_measure.jaro_winkler', 'jaro_winkler'))
reg(alg, TextLibrary(
    'jellyfish',
    'jaro_winkler_similarity',
    conditions={'winklerize': True},
))
reg(alg, LibraryBase(
    'rapidfuzz.distance.JaroWinkler',
    'similarity',
    conditions={'winklerize': True},
))
# disabled, see https://github.com/life4/textdistance/issues/39
# reg(alg, TextLibrary('Levenshtein', 'jaro_winkler', conditions=dict(winklerize=True)))

alg = 'Levenshtein'
reg(alg, LibraryBase('pylev', 'levenshtein'))
reg(alg, TextLibrary('jellyfish', 'levenshtein_distance'))
reg(alg, TextLibrary('Levenshtein', 'distance'))
reg(alg, LibraryBase('rapidfuzz.distance.Levenshtein', 'distance'))
# currently disabled:
# reg(alg, TextLibrary('py_stringmatching.similarity_measure.levenshtein', 'levenshtein'))