1from __future__ import annotations
2
3# built-in
4from functools import reduce
5from itertools import islice, permutations, repeat
6from math import log
7from typing import Sequence
8
9# app
10from .base import Base as _Base, BaseSimilarity as _BaseSimilarity
11from .edit_based import DamerauLevenshtein
12
13
14__all__ = [
15 'Jaccard', 'Sorensen', 'Tversky',
16 'Overlap', 'Cosine', 'Tanimoto', 'MongeElkan', 'Bag',
17
18 'jaccard', 'sorensen', 'tversky', 'sorensen_dice', 'dice',
19 'overlap', 'cosine', 'tanimoto', 'monge_elkan', 'bag',
20]
21
22
23class Jaccard(_BaseSimilarity):
24 """
25 Compute the Jaccard similarity between the two sequences.
26 They should contain hashable items.
27 The return value is a float between 0 and 1, where 1 means equal,
28 and 0 totally different.
29
30 https://en.wikipedia.org/wiki/Jaccard_index
31 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaccard.js
32 """
33
34 def __init__(
35 self,
36 qval: int = 1,
37 as_set: bool = False,
38 external: bool = True,
39 ) -> None:
40 self.qval = qval
41 self.as_set = as_set
42 self.external = external
43
44 def maximum(self, *sequences: Sequence) -> int:
45 return 1
46
47 def __call__(self, *sequences: Sequence) -> float:
48 result = self.quick_answer(*sequences)
49 if result is not None:
50 return result
51
52 sequences = self._get_counters(*sequences) # sets
53 intersection = self._intersect_counters(*sequences) # set
54 intersection = self._count_counters(intersection) # int
55 union = self._union_counters(*sequences) # set
56 union = self._count_counters(union) # int
57 return intersection / union
58
59
60class Sorensen(_BaseSimilarity):
61 """
62 Compute the Sorensen distance between the two sequences.
63 They should contain hashable items.
64 The return value is a float between 0 and 1, where 1 means equal,
65 and 0 totally different.
66
67 https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
68 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/dice.js
69 """
70
71 def __init__(self, qval: int = 1, as_set: bool = False, external: bool = True) -> None:
72 self.qval = qval
73 self.as_set = as_set
74 self.external = external
75
76 def maximum(self, *sequences: Sequence) -> int:
77 return 1
78
79 def __call__(self, *sequences: Sequence) -> float:
80 result = self.quick_answer(*sequences)
81 if result is not None:
82 return result
83
84 sequences = self._get_counters(*sequences) # sets
85 count = sum(self._count_counters(s) for s in sequences)
86 intersection = self._intersect_counters(*sequences) # set
87 intersection = self._count_counters(intersection) # int
88 return 2.0 * intersection / count
89
90
91class Tversky(_BaseSimilarity):
92 """
93 Compute the Tversky index for two sequences.
94 They should contain hashable items.
95 The return value is a float between 0 and 1, where 1 means equal,
96 and 0 totally different
97
98 https://en.wikipedia.org/wiki/Tversky_index
99 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/tversky.js
100 """
101
102 def __init__(
103 self,
104 qval: int = 1,
105 ks: Sequence[float] = None,
106 bias: float | None = None,
107 as_set: bool = False,
108 external: bool = True,
109 ) -> None:
110 self.qval = qval
111 self.ks = ks or repeat(1)
112 self.bias = bias
113 self.as_set = as_set
114 self.external = external
115
116 def maximum(self, *sequences: Sequence) -> int:
117 return 1
118
119 def __call__(self, *sequences: Sequence) -> float:
120 quick_result = self.quick_answer(*sequences)
121 if quick_result is not None:
122 return quick_result
123
124 sequences = self._get_counters(*sequences) # sets
125 intersection = self._intersect_counters(*sequences) # set
126 intersection = self._count_counters(intersection) # int
127 sequences = [self._count_counters(s) for s in sequences] # ints
128 ks = list(islice(self.ks, len(sequences)))
129
130 if len(sequences) != 2 or self.bias is None:
131 result = intersection
132 for k, s in zip(ks, sequences):
133 result += k * (s - intersection)
134 return intersection / result
135
136 s1, s2 = sequences
137 alpha, beta = ks
138 a_val = min([s1, s2])
139 b_val = max([s1, s2])
140 c_val = intersection + self.bias
141 result = alpha * beta * (a_val - b_val) + b_val * beta
142 return c_val / (result + c_val)
143
144
145class Overlap(_BaseSimilarity):
146 """
147 Compute the Overlap coefficient for two sequences.
148 They should contain hashable items.
149 The return value is a float between 0 and 1, where 1 means equal,
150 and 0 totally different
151
152 https://en.wikipedia.org/wiki/Overlap_coefficient
153 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/overlap.js
154 """
155
156 def __init__(
157 self,
158 qval: int = 1,
159 as_set: bool = False,
160 external: bool = True,
161 ) -> None:
162 self.qval = qval
163 self.as_set = as_set
164 self.external = external
165
166 def maximum(self, *sequences: Sequence) -> int:
167 return 1
168
169 def __call__(self, *sequences: Sequence) -> float:
170 result = self.quick_answer(*sequences)
171 if result is not None:
172 return result
173
174 sequences = self._get_counters(*sequences) # sets
175 intersection = self._intersect_counters(*sequences) # set
176 intersection = self._count_counters(intersection) # int
177 sequences = [self._count_counters(s) for s in sequences] # ints
178
179 return intersection / min(sequences)
180
181
182class Cosine(_BaseSimilarity):
183 """
184 Compute the Cosine similarity (Ochiai coefficient) for two sequences.
185 They should contain hashable items.
186 The return value is a float between 0 and 1, where 1 means equal,
187 and 0 totally different
188
189 https://en.wikipedia.org/wiki/Cosine_similarity
190 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/cosine.js
191 """
192
193 def __init__(
194 self,
195 qval: int = 1,
196 as_set: bool = False,
197 external: bool = True,
198 ) -> None:
199 self.qval = qval
200 self.as_set = as_set
201 self.external = external
202
203 def maximum(self, *sequences: Sequence) -> int:
204 return 1
205
206 def __call__(self, *sequences: Sequence) -> float:
207 result = self.quick_answer(*sequences)
208 if result is not None:
209 return result
210
211 sequences = self._get_counters(*sequences) # sets
212 intersection = self._intersect_counters(*sequences) # set
213 intersection = self._count_counters(intersection) # int
214 sequences = [self._count_counters(s) for s in sequences] # ints
215 prod = reduce(lambda x, y: x * y, sequences)
216
217 return intersection / pow(prod, 1.0 / len(sequences))
218
219
220class Tanimoto(Jaccard):
221 """
222 Compute the Tanimoto distance between two sequences.
223 They should contain hashable items.
224 The return value is a float between -inf and 0, where 0 means equal,
225 and -inf totally different
226
227 This is identical to the Jaccard similarity coefficient
228 and the Tversky index for alpha=1 and beta=1.
229
230 https://en.wikipedia.org/wiki/Jaccard_index#Tanimoto_similarity_and_distance
231 """
232
233 def __call__(self, *sequences: Sequence) -> float:
234 result = super().__call__(*sequences)
235 if result == 0:
236 return float('-inf')
237 else:
238 return log(result, 2)
239
240
241class MongeElkan(_BaseSimilarity):
242 """
243 Compute the Monge Elkan distance between two sequences.
244 They should contain hashable items.
245 The return value is a float between 0 and 2, where 2 means equal,
246 and 0 totally different.
247
248 https://www.academia.edu/200314/Generalized_Monge-Elkan_Method_for_Approximate_Text_String_Comparison
249 http://www.cs.cmu.edu/~wcohen/postscript/kdd-2003-match-ws.pdf
250 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/monge-elkan.js
251 """
252 _damerau_levenshtein = DamerauLevenshtein()
253
254 def __init__(
255 self,
256 algorithm=_damerau_levenshtein,
257 symmetric: bool = False,
258 qval: int = 1,
259 external: bool = True,
260 ) -> None:
261 self.algorithm = algorithm
262 self.symmetric = symmetric
263 self.qval = qval
264 self.external = external
265
266 def maximum(self, *sequences: Sequence) -> float:
267 result = self.algorithm.maximum(sequences)
268 for seq in sequences:
269 if seq:
270 result = max(result, self.algorithm.maximum(*seq))
271 return result
272
273 def _calc(self, seq, *sequences: Sequence) -> float:
274 if not seq:
275 return 0
276 maxes = []
277 for c1 in seq:
278 for s in sequences:
279 max_sim = float('-inf')
280 for c2 in s:
281 max_sim = max(max_sim, self.algorithm.similarity(c1, c2))
282 maxes.append(max_sim)
283 return sum(maxes) / len(seq) / len(maxes)
284
285 def __call__(self, *sequences: Sequence) -> float:
286 quick_result = self.quick_answer(*sequences)
287 if quick_result is not None:
288 return quick_result
289 sequences = self._get_sequences(*sequences)
290
291 if self.symmetric:
292 result = []
293 for seqs in permutations(sequences):
294 result.append(self._calc(*seqs))
295 return sum(result) / len(result)
296 else:
297 return self._calc(*sequences)
298
299
300class Bag(_Base):
301 """
302 Compute the Bag distance between two sequences.
303 They should contain hashable items.
304 The return value is a float between 0 and N, where 0 means equal,
305 and N totally different. N would, at most, be the length of the
306 longest sequence in the comparison.
307
308 http://www-db.disi.unibo.it/research/papers/SPIRE02.pdf
309 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/bag.js
310 """
311
312 def __call__(self, *sequences: Sequence) -> float:
313 sequences = self._get_counters(*sequences) # sets
314 intersection = self._intersect_counters(*sequences) # set
315 return max(self._count_counters(sequence - intersection) for sequence in sequences)
316
317
318bag = Bag()
319cosine = Cosine()
320dice = Sorensen()
321jaccard = Jaccard()
322monge_elkan = MongeElkan()
323overlap = Overlap()
324sorensen = Sorensen()
325sorensen_dice = Sorensen()
326# sorensen_dice = Tversky(ks=[.5, .5])
327tanimoto = Tanimoto()
328tversky = Tversky()