from __future__ import annotations

# built-in
import json
from collections import defaultdict
from copy import deepcopy
from importlib import import_module
from pathlib import Path
from typing import Any, Callable, Sequence
10
11
# JSON file located next to this module; `LibrariesManager.optimize` loads it
# to decide which registered implementations to keep and in which order.
LIBRARIES_PATH = Path(__file__).parent / 'libraries.json'
13
14
class LibrariesManager:
    """Registry of external implementations, grouped by algorithm name.

    For every algorithm the implementations are stored in a list whose order
    is the registration order until `optimize` re-orders it.
    """
    # algorithm name -> registered implementations
    libs: defaultdict[str, list[LibraryBase]]

    def __init__(self) -> None:
        self.libs = defaultdict(list)

    def register(self, alg: str, lib: LibraryBase) -> None:
        """Register new lib

        The library is appended to the end of the list kept for `alg`.
        """
        self.libs[alg].append(lib)

    def optimize(self) -> None:
        """Sort algorithm implementations by speed.

        Loads `LIBRARIES_PATH`, a JSON object mapping an algorithm name to a
        list of `[module_name, func_name]` pairs. For every algorithm that has
        registered libraries, the libraries missing from that list are dropped
        and the remaining ones are re-ordered to follow the list order.
        Algorithms without registered libraries are skipped.
        """
        # load benchmarks results
        with LIBRARIES_PATH.open('r', encoding='utf8') as stream:
            libs_data: dict = json.load(stream)

        for alg, libs_names in libs_data.items():
            libs = self.get_libs(alg)
            if not libs:
                continue
            # drop libs that are not listed for this algorithm
            listed = [
                lib for lib in libs
                if [lib.module_name, lib.func_name] in libs_names
            ]
            # order the survivors by their position in the loaded list
            listed.sort(
                key=lambda lib: libs_names.index([lib.module_name, lib.func_name]),
            )
            self.libs[alg] = listed

    def get_algorithms(self) -> list[str]:
        """Get list of available algorithms.
        """
        return list(self.libs)

    def get_libs(self, alg: str) -> list[LibraryBase]:
        """Get libs list for algorithm

        Returns an empty list for an unknown algorithm.
        """
        # `.get` does not trigger the defaultdict factory, so asking about an
        # unknown algorithm does not add an empty entry to `self.libs`
        return self.libs.get(alg, [])

    def clone(self) -> LibrariesManager:
        """Clone library manager prototype

        The returned manager holds a deep copy of the registry, so changes to
        the clone do not affect the original.
        """
        obj = self.__class__()
        obj.libs = deepcopy(self.libs)
        return obj
60
61
class LibraryBase:
    """Description of one external implementation of an algorithm.

    Args:
        module_name: dotted name of the module to import.
        func_name: name of the object to fetch from that module.
        presets: when not None, the fetched object is called with these
            keyword arguments and the result is used instead of the object.
        attr: when not None, this attribute of the (possibly called) object
            is used as the final callable.
        conditions: attribute name -> required value; see `check_conditions`.
    """
    # Resolved callable cache: `NotImplemented` means "not resolved yet",
    # `None` means "module could not be imported".
    func: Callable | None | Any = NotImplemented

    def __init__(
        self,
        module_name: str,
        func_name: str,
        *,
        presets: dict[str, Any] | None = None,
        attr: str | None = None,
        conditions: dict[str, bool] | None = None,
    ) -> None:
        self.module_name = module_name
        self.func_name = func_name
        self.presets = presets
        self.conditions = conditions
        self.attr = attr

    def check_conditions(self, obj: object, *sequences: Sequence) -> bool:
        """Tell whether this library may be used for the given call.

        Args:
            obj: object whose attributes are compared with `self.conditions`.
            *sequences: the sequences that are going to be compared.

        Returns:
            True when exactly two sequences are given and every attribute
            named in `self.conditions` has the required value on `obj`.
        """
        # external libs can compare only 2 strings
        if len(sequences) != 2:
            return False
        if not self.conditions:
            return True
        return not any(
            getattr(obj, name) != value
            for name, value in self.conditions.items()
        )

    def prepare(self, *sequences: Sequence) -> tuple:
        """Return the sequences in the form expected by the library.

        The base implementation returns them unchanged.
        """
        return sequences

    @property
    def setup(self) -> str:
        """Python source that binds the library callable to the name `func`.

        Mirrors `get_function`: import, optional call with `presets`, optional
        attribute access.
        NOTE(review): presumably used as a benchmark setup snippet -- confirm
        against callers.
        """
        call = '' if self.presets is None else f'(**{self.presets!r})'
        access = '' if self.attr is None else f'.{self.attr}'
        return (
            f'from {self.module_name} import {self.func_name} as func'
            f'\nfunc = func{call}{access}'
        )

    def get_function(self) -> Callable | None:
        """Resolve and cache the callable described by this object.

        Returns:
            The callable, or None when the module cannot be imported. The
            outcome is stored in `self.func`, so the import is attempted only
            once per instance.

        Raises:
            AttributeError: the module was imported but lacks `func_name`
                (or the resolved object lacks `attr`).
        """
        # already resolved (either to a callable or to None)
        if self.func is not NotImplemented:
            return self.func

        # import module
        try:
            module = import_module(self.module_name)
        except ImportError:
            self.func = None
            return None

        # get object from module
        target = getattr(module, self.func_name)
        # init class
        if self.presets is not None:
            target = target(**self.presets)
        # get needed attribute
        if self.attr is not None:
            target = getattr(target, self.attr)
        self.func = target
        return self.func

    def __str__(self) -> str:
        return f'{self.module_name}.{self.func_name}'
128
129
class TextLibrary(LibraryBase):
    """Library that is used only for plain `str` inputs."""

    def check_conditions(self, obj: object, *sequences: Sequence) -> bool:
        """Apply the inherited checks, then require `qval == 1` and `str` inputs.

        Returns:
            True only when the parent check passes, `obj.qval` equals 1 (a
            missing `qval` attribute counts as 0 and therefore fails) and
            every sequence is exactly of type `str`.
        """
        if not super().check_conditions(obj, *sequences):
            return False

        # compare only by letters
        if getattr(obj, 'qval', 0) != 1:
            return False

        # every sequence must be string
        # (exact type comparison: instances of `str` subclasses are rejected)
        return all(type(seq) is str for seq in sequences)

    def prepare(self, *sequences: Sequence) -> tuple:
        """Join sequences of letters into strings.

        The decision is based on the first sequence only: when it is a tuple
        or a list, every sequence is joined with `''.join`. With no sequences
        an empty tuple is returned, matching `LibraryBase.prepare`.
        """
        # convert list of letters to string
        if sequences and isinstance(sequences[0], (tuple, list)):
            sequences = tuple(''.join(seq) for seq in sequences)
        return sequences
150
151
class SameLengthLibrary(LibraryBase):
    """Library that is used only when all sequences have the same length."""

    def check_conditions(self, obj: object, *sequences: Sequence) -> bool:
        """Apply the inherited checks, then require equal lengths.

        Returns:
            True only when the parent check passes and the shortest and the
            longest sequence have the same length.
        """
        if not super().check_conditions(obj, *sequences):
            return False
        # compare only same length iterators
        lengths = [len(seq) for seq in sequences]
        return min(lengths) == max(lengths)
160
161
class SameLengthTextLibrary(SameLengthLibrary, TextLibrary):
    """Library that requires `str` inputs of equal length.

    Adds no behaviour of its own. `check_conditions` resolves through
    `SameLengthLibrary`, then `TextLibrary`, then `LibraryBase` (each calls
    `super()`), so all three sets of checks apply; `prepare` comes from
    `TextLibrary`.
    """
164
165
# Default registry, populated below. Entries are registered per algorithm in
# the order written here; a `conditions` mapping restricts a library to
# objects whose attributes have the given values (see
# `LibraryBase.check_conditions`).
prototype = LibrariesManager()
reg = prototype.register

alg = 'DamerauLevenshtein'
reg(alg, LibraryBase('pyxdameraulevenshtein', 'damerau_levenshtein_distance', conditions=dict(restricted=True)))
reg(alg, TextLibrary('jellyfish', 'damerau_levenshtein_distance', conditions=dict(restricted=False)))
reg(alg, LibraryBase('rapidfuzz.distance.DamerauLevenshtein', 'distance', conditions=dict(restricted=False)))
reg(alg, LibraryBase('rapidfuzz.distance.OSA', 'distance', conditions=dict(restricted=True)))

alg = 'Hamming'
reg(alg, SameLengthTextLibrary('Levenshtein', 'hamming'))
reg(alg, TextLibrary('jellyfish', 'hamming_distance'))
reg(alg, SameLengthLibrary('rapidfuzz.distance.Hamming', 'distance'))

alg = 'Jaro'
reg(alg, TextLibrary('jellyfish', 'jaro_similarity'))
reg(alg, LibraryBase('rapidfuzz.distance.Jaro', 'similarity'))
# reg(alg, TextLibrary('Levenshtein', 'jaro'))
# reg(alg, TextLibrary('py_stringmatching.similarity_measure.jaro', 'jaro'))

alg = 'JaroWinkler'
# reg(alg, LibraryBase('py_stringmatching.similarity_measure.jaro_winkler', 'jaro_winkler'))
reg(alg, TextLibrary('jellyfish', 'jaro_winkler_similarity', conditions=dict(winklerize=True)))
reg(alg, LibraryBase('rapidfuzz.distance.JaroWinkler', 'similarity', conditions=dict(winklerize=True)))
# https://github.com/life4/textdistance/issues/39
# reg(alg, TextLibrary('Levenshtein', 'jaro_winkler', conditions=dict(winklerize=True)))

alg = 'Levenshtein'
reg(alg, LibraryBase('pylev', 'levenshtein'))
reg(alg, TextLibrary('jellyfish', 'levenshtein_distance'))
reg(alg, TextLibrary('Levenshtein', 'distance'))
reg(alg, LibraryBase('rapidfuzz.distance.Levenshtein', 'distance'))
# reg(alg, TextLibrary('py_stringmatching.similarity_measure.levenshtein', 'levenshtein'))