from __future__ import annotations

# built-in
from collections import defaultdict
from itertools import zip_longest
from typing import Any, Sequence, TypeVar

# app
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity
from .types import SimFunc, TestFunc


try:
    # external
    import numpy
except ImportError:
    numpy = None  # type: ignore[assignment]


__all__ = [
    'Hamming', 'MLIPNS',
    'Levenshtein', 'DamerauLevenshtein',
    'Jaro', 'JaroWinkler', 'StrCmp95',
    'NeedlemanWunsch', 'Gotoh', 'SmithWaterman',

    'hamming', 'mlipns',
    'levenshtein', 'damerau_levenshtein',
    'jaro', 'jaro_winkler', 'strcmp95',
    'needleman_wunsch', 'gotoh', 'smith_waterman',
]
T = TypeVar('T')


class Hamming(_Base):
    """
    Compute the Hamming distance between two or more sequences.
    The Hamming distance is the number of differing items in ordered sequences.

    https://en.wikipedia.org/wiki/Hamming_distance
    """

    def __init__(
        self,
        qval: int = 1,
        test_func: TestFunc | None = None,
        truncate: bool = False,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.test_func = test_func or self._ident
        self.truncate = truncate
        self.external = external

    def __call__(self, *sequences: Sequence[object]) -> int:
        sequences = self._get_sequences(*sequences)

        result = self.quick_answer(*sequences)
        if result is not None:
            assert isinstance(result, int)
            return result

        _zip = zip if self.truncate else zip_longest
        return sum(not self.test_func(*es) for es in _zip(*sequences))


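# Usage sketch (illustrative only, not part of the library's test suite).
# By default the shorter sequence is padded, so a length difference also adds
# to the distance; ``truncate=True`` compares only up to the shorter length.
# The values below are the expected results of the pure-Python path:
#
#     Hamming()('karolin', 'kathrin')                              # -> 3
#     Hamming(truncate=True, external=False)('karl', 'karlsson')   # -> 0

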
class Levenshtein(_Base):
    """
    Compute the absolute Levenshtein distance between the two sequences.
    The Levenshtein distance is the minimum number of edit operations necessary
    for transforming one sequence into the other. The edit operations allowed are:

        * deletion: ABC -> BC, AC, AB
        * insertion: ABC -> ABCD, EABC, AEBC..
        * substitution: ABC -> ABE, ADC, FBC..

    https://en.wikipedia.org/wiki/Levenshtein_distance
    TODO: https://gist.github.com/kylebgorman/1081951/9b38b7743a3cb5167ab2c6608ac8eea7fc629dca
    """

    def __init__(
        self,
        qval: int = 1,
        test_func: TestFunc | None = None,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.test_func = test_func or self._ident
        self.external = external

    def _recursive(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        # TODO: more than 2 sequences support
        if not s1 or not s2:
            return len(s1) + len(s2)

        if self.test_func(s1[-1], s2[-1]):
            return self(s1[:-1], s2[:-1])

        # deletion/insertion
        d = min(
            self(s1[:-1], s2),
            self(s1, s2[:-1]),
        )
        # substitution
        s = self(s1[:-1], s2[:-1])
        return min(d, s) + 1

    def _cycled(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        """
        source:
        https://github.com/jamesturk/jellyfish/blob/master/jellyfish/_jellyfish.py#L18
        """
        rows = len(s1) + 1
        cols = len(s2) + 1
        prev = None
        cur: Any
        if numpy:
            cur = numpy.arange(cols)
        else:
            cur = range(cols)

        for r in range(1, rows):
            prev, cur = cur, [r] + [0] * (cols - 1)
            for c in range(1, cols):
                deletion = prev[c] + 1
                insertion = cur[c - 1] + 1
                dist = self.test_func(s1[r - 1], s2[c - 1])
                edit = prev[c - 1] + (not dist)
                cur[c] = min(edit, deletion, insertion)
        return int(cur[-1])

    def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            assert isinstance(result, int)
            return result

        return self._cycled(s1, s2)


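# Usage sketch (illustrative only): the classic pair needs two substitutions
# and one insertion, so three edits in total; the normalized similarity is
# 1 - distance / max(len(s1), len(s2)).
#
#     Levenshtein()('kitten', 'sitting')                        # -> 3
#     Levenshtein().normalized_similarity('kitten', 'sitting')  # -> 4/7, about 0.571

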
class DamerauLevenshtein(_Base):
    """
    Compute the absolute Damerau-Levenshtein distance between the two sequences.
    The Damerau-Levenshtein distance is the minimum number of edit operations necessary
    for transforming one sequence into the other. The edit operations allowed are:

        * deletion: ABC -> BC, AC, AB
        * insertion: ABC -> ABCD, EABC, AEBC..
        * substitution: ABC -> ABE, ADC, FBC..
        * transposition: ABC -> ACB, BAC

    If `restricted=False`, the unrestricted distance is calculated,
    where the same character can be touched more than once.
    So the distance between BA and ACB is 2: BA -> AB -> ACB
    (with the default restricted variant the distance is 3).

    https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
    """

    def __init__(
        self,
        qval: int = 1,
        test_func: TestFunc | None = None,
        external: bool = True,
        restricted: bool = True,
    ) -> None:
        self.qval = qval
        self.test_func = test_func or self._ident
        self.external = external
        self.restricted = restricted

    def _numpy(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        # TODO: doesn't pass tests, need improve
        d = numpy.zeros([len(s1) + 1, len(s2) + 1], dtype=int)

        # matrix
        for i in range(-1, len(s1) + 1):
            d[i][-1] = i + 1
        for j in range(-1, len(s2) + 1):
            d[-1][j] = j + 1

        for i, cs1 in enumerate(s1):
            for j, cs2 in enumerate(s2):
                cost = int(not self.test_func(cs1, cs2))
                # ^ 0 if equal, 1 otherwise

                d[i][j] = min(
                    d[i - 1][j] + 1,  # deletion
                    d[i][j - 1] + 1,  # insertion
                    d[i - 1][j - 1] + cost,  # substitution
                )

                # transposition
                if not i or not j:
                    continue
                if not self.test_func(cs1, s2[j - 1]):
                    continue
                d[i][j] = min(
                    d[i][j],
                    d[i - 2][j - 2] + cost,
                )

        return d[len(s1) - 1][len(s2) - 1]

    def _pure_python_unrestricted(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        """https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
        """
        d: dict[tuple[int, int], int] = {}
        da: dict[T, int] = {}

        len1 = len(s1)
        len2 = len(s2)

        maxdist = len1 + len2
        d[-1, -1] = maxdist

        # matrix
        for i in range(len(s1) + 1):
            d[i, -1] = maxdist
            d[i, 0] = i
        for j in range(len(s2) + 1):
            d[-1, j] = maxdist
            d[0, j] = j

        for i, cs1 in enumerate(s1, start=1):
            db = 0
            for j, cs2 in enumerate(s2, start=1):
                i1 = da.get(cs2, 0)
                j1 = db
                if self.test_func(cs1, cs2):
                    cost = 0
                    db = j
                else:
                    cost = 1

                d[i, j] = min(
                    d[i - 1, j - 1] + cost,  # substitution
                    d[i, j - 1] + 1,  # insertion
                    d[i - 1, j] + 1,  # deletion
                    d[i1 - 1, j1 - 1] + (i - i1) - 1 + (j - j1),  # transposition
                )
            da[cs1] = i

        return d[len1, len2]

    def _pure_python_restricted(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        """
        https://www.guyrutenberg.com/2008/12/15/damerau-levenshtein-distance-in-python/
        """
        d: dict[tuple[int, int], int] = {}

        # matrix
        for i in range(-1, len(s1) + 1):
            d[i, -1] = i + 1
        for j in range(-1, len(s2) + 1):
            d[-1, j] = j + 1

        for i, cs1 in enumerate(s1):
            for j, cs2 in enumerate(s2):
                cost = int(not self.test_func(cs1, cs2))
                # ^ 0 if equal, 1 otherwise

                d[i, j] = min(
                    d[i - 1, j] + 1,  # deletion
                    d[i, j - 1] + 1,  # insertion
                    d[i - 1, j - 1] + cost,  # substitution
                )

                # transposition
                if not i or not j:
                    continue
                if not self.test_func(cs1, s2[j - 1]):
                    continue
                if not self.test_func(s1[i - 1], cs2):
                    continue
                d[i, j] = min(
                    d[i, j],
                    d[i - 2, j - 2] + cost,
                )

        return d[len(s1) - 1, len(s2) - 1]

    def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result  # type: ignore[return-value]

        # if numpy:
        #     return self._numpy(s1, s2)
        # else:
        if self.restricted:
            return self._pure_python_restricted(s1, s2)
        return self._pure_python_unrestricted(s1, s2)


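# Usage sketch (illustrative only): the default restricted variant (optimal
# string alignment) never edits the same substring twice, while
# ``restricted=False`` allows it and can therefore yield a smaller distance.
# Expected results of the pure-Python implementations:
#
#     DamerauLevenshtein()('ab', 'ba')                                   # -> 1 (one transposition)
#     DamerauLevenshtein(restricted=True, external=False)('ba', 'acb')   # -> 3
#     DamerauLevenshtein(restricted=False, external=False)('ba', 'acb')  # -> 2 (BA -> AB -> ACB)

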
class JaroWinkler(_BaseSimilarity):
    """
    Computes the Jaro-Winkler measure between two strings.
    The Jaro-Winkler measure is designed to capture cases where two strings
    have a low Jaro score but share a prefix
    and thus are likely to match.

    https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro.js
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro-winkler.js
    """

    def __init__(
        self,
        long_tolerance: bool = False,
        winklerize: bool = True,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.long_tolerance = long_tolerance
        self.winklerize = winklerize
        self.external = external

    def maximum(self, *sequences: Sequence[object]) -> int:
        return 1

    def __call__(self, s1: Sequence[T], s2: Sequence[T], prefix_weight: float = 0.1) -> float:
        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result

        s1_len = len(s1)
        s2_len = len(s2)

        if not s1_len or not s2_len:
            return 0.0

        min_len = min(s1_len, s2_len)
        search_range = max(s1_len, s2_len)
        search_range = (search_range // 2) - 1
        if search_range < 0:
            search_range = 0

        s1_flags = [False] * s1_len
        s2_flags = [False] * s2_len

        # looking only within search range, count & flag matched pairs
        common_chars = 0
        for i, s1_ch in enumerate(s1):
            low = max(0, i - search_range)
            hi = min(i + search_range, s2_len - 1)
            for j in range(low, hi + 1):
                if not s2_flags[j] and s2[j] == s1_ch:
                    s1_flags[i] = s2_flags[j] = True
                    common_chars += 1
                    break

        # short circuit if no characters match
        if not common_chars:
            return 0.0

        # count transpositions
        k = trans_count = 0
        for i, s1_f in enumerate(s1_flags):
            if s1_f:
                for j in range(k, s2_len):
                    if s2_flags[j]:
                        k = j + 1
                        break
                if s1[i] != s2[j]:
                    trans_count += 1
        trans_count //= 2

        # adjust for similarities in nonmatched characters
        weight = common_chars / s1_len + common_chars / s2_len
        weight += (common_chars - trans_count) / common_chars
        weight /= 3

        # stop to boost if strings are not similar
        if not self.winklerize:
            return weight
        if weight <= 0.7:
            return weight

        # winkler modification
        # adjust for up to first 4 chars in common
        j = min(min_len, 4)
        i = 0
        while i < j and s1[i] == s2[i]:
            i += 1
        if i:
            weight += i * prefix_weight * (1.0 - weight)

        # optionally adjust for long strings
        # after agreeing beginning chars, at least two or more must agree and
        # agreed characters must be > half of remaining characters
        if not self.long_tolerance or min_len <= 4:
            return weight
        if common_chars <= i + 1 or 2 * common_chars < min_len + i:
            return weight
        tmp = (common_chars - i - 1) / (s1_len + s2_len - i * 2 + 2)
        weight += (1.0 - weight) * tmp
        return weight


class Jaro(JaroWinkler):
    def __init__(
        self,
        long_tolerance: bool = False,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        super().__init__(
            long_tolerance=long_tolerance,
            winklerize=False,
            qval=qval,
            external=external,
        )


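# Usage sketch (illustrative only): ``Jaro`` is ``JaroWinkler`` without the
# prefix boost; the shared "MAR" prefix lifts the Jaro-Winkler score above the
# plain Jaro score for this classic pair.
#
#     round(Jaro()('MARTHA', 'MARHTA'), 3)         # -> 0.944
#     round(JaroWinkler()('MARTHA', 'MARHTA'), 3)  # -> 0.961

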
class NeedlemanWunsch(_BaseSimilarity):
    """
    Computes the Needleman-Wunsch measure between two strings.
    The Needleman-Wunsch algorithm generalizes the Levenshtein distance and considers
    global alignment between two strings. Specifically, it is computed by assigning
    a score to each alignment between the two input strings and choosing the
    score of the best alignment, that is, the maximal score.
    An alignment between two strings is a set of correspondences between their
    characters, allowing for gaps.

    https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
    """

    def __init__(
        self,
        gap_cost: float = 1.0,
        sim_func: SimFunc = None,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.gap_cost = gap_cost
        if sim_func:
            self.sim_func = sim_func
        else:
            self.sim_func = self._ident
        self.external = external

    def minimum(self, *sequences: Sequence[object]) -> float:
        return -max(map(len, sequences)) * self.gap_cost

    def maximum(self, *sequences: Sequence[object]) -> float:
        return max(map(len, sequences))

    def distance(self, *sequences: Sequence[object]) -> float:
        """Get distance between sequences
        """
        return -1 * self.similarity(*sequences)

    def normalized_distance(self, *sequences: Sequence[object]) -> float:
        """Get distance from 0 to 1
        """
        minimum = self.minimum(*sequences)
        maximum = self.maximum(*sequences)
        if maximum == 0:
            return 0
        return (self.distance(*sequences) - minimum) / (maximum - minimum)

    def normalized_similarity(self, *sequences: Sequence[object]) -> float:
        """Get similarity from 0 to 1
        """
        minimum = self.minimum(*sequences)
        maximum = self.maximum(*sequences)
        if maximum == 0:
            return 1
        return (self.similarity(*sequences) - minimum) / (maximum * 2)

    def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> float:
        if not numpy:
            raise ImportError('Please, install numpy for Needleman-Wunsch measure')

        s1, s2 = self._get_sequences(s1, s2)

        # result = self.quick_answer(s1, s2)
        # if result is not None:
        #     return result * self.maximum(s1, s2)

        dist_mat = numpy.zeros(
            (len(s1) + 1, len(s2) + 1),
            dtype=float,
        )
        # DP initialization
        for i in range(len(s1) + 1):
            dist_mat[i, 0] = -(i * self.gap_cost)
        # DP initialization
        for j in range(len(s2) + 1):
            dist_mat[0, j] = -(j * self.gap_cost)
        # Needleman-Wunsch DP calculation
        for i, c1 in enumerate(s1, 1):
            for j, c2 in enumerate(s2, 1):
                match = dist_mat[i - 1, j - 1] + self.sim_func(c1, c2)
                delete = dist_mat[i - 1, j] - self.gap_cost
                insert = dist_mat[i, j - 1] - self.gap_cost
                dist_mat[i, j] = max(match, delete, insert)
        return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1]


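# Usage sketch (illustrative only): with the default identity ``sim_func`` a
# match scores +1, a mismatch 0, and each gap position costs ``gap_cost``.
# Globally aligning 'AGT' against 'AAGT' keeps all three matches and pays for
# one gap, so the expected scores are:
#
#     NeedlemanWunsch()('AGT', 'AAGT')            # -> 2.0  (3 matches - 1 gap)
#     NeedlemanWunsch(gap_cost=2)('AGT', 'AAGT')  # -> 1.0  (3 matches - one gap of cost 2)

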
class SmithWaterman(_BaseSimilarity):
    """
    Computes the Smith-Waterman measure between two strings.
    The Smith-Waterman algorithm performs local sequence alignment;
    that is, it determines similar regions between two strings.
    Instead of looking at the total sequence, the Smith-Waterman algorithm compares
    segments of all possible lengths and optimizes the similarity measure.

    https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/smith-waterman.js
    """

    def __init__(
        self,
        gap_cost: float = 1.0,
        sim_func: SimFunc = None,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.gap_cost = gap_cost
        self.sim_func = sim_func or self._ident
        self.external = external

    def maximum(self, *sequences: Sequence[object]) -> int:
        return min(map(len, sequences))

    def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> float:
        if not numpy:
            raise ImportError('Please, install numpy for Smith-Waterman measure')

        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result

        dist_mat = numpy.zeros(
            (len(s1) + 1, len(s2) + 1),
            dtype=float,
        )
        for i, sc1 in enumerate(s1, start=1):
            for j, sc2 in enumerate(s2, start=1):
                # The score for substituting the letter a[i - 1] for b[j - 1].
                # Generally low for mismatch, high for match.
                match = dist_mat[i - 1, j - 1] + self.sim_func(sc1, sc2)
                # The scores for introducing extra letters in one of the strings
                # (or by symmetry, deleting them from the other).
                delete = dist_mat[i - 1, j] - self.gap_cost
                insert = dist_mat[i, j - 1] - self.gap_cost
                dist_mat[i, j] = max(0, match, delete, insert)
        return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1]


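# Usage sketch (illustrative only): with the default identity ``sim_func``,
# matching characters add +1 and mismatches add nothing, so aligning 'test'
# with 'text' scores the three shared characters:
#
#     SmithWaterman()('test', 'text')  # -> 3.0

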
class Gotoh(NeedlemanWunsch):
    """Gotoh score
    Gotoh's algorithm is essentially Needleman-Wunsch with affine gap
    penalties:
    https://www.cs.umd.edu/class/spring2003/cmsc838t/papers/gotoh1982.pdf
    """

    def __init__(
        self,
        gap_open: int = 1,
        gap_ext: float = 0.4,
        sim_func: SimFunc = None,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.gap_open = gap_open
        self.gap_ext = gap_ext
        if sim_func:
            self.sim_func = sim_func
        else:
            self.sim_func = self._ident
        self.external = external

    def minimum(self, *sequences: Sequence[object]) -> int:
        return -min(map(len, sequences))

    def maximum(self, *sequences: Sequence[object]) -> int:
        return min(map(len, sequences))

    def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> float:
        if not numpy:
            raise ImportError('Please, install numpy for Gotoh measure')

        s1, s2 = self._get_sequences(s1, s2)

        # result = self.quick_answer(s1, s2)
        # if result is not None:
        #     return result * self.maximum(s1, s2)

        len_s1 = len(s1)
        len_s2 = len(s2)
        d_mat = numpy.zeros((len_s1 + 1, len_s2 + 1), dtype=float)
        p_mat = numpy.zeros((len_s1 + 1, len_s2 + 1), dtype=float)
        q_mat = numpy.zeros((len_s1 + 1, len_s2 + 1), dtype=float)

        d_mat[0, 0] = 0
        p_mat[0, 0] = float('-inf')
        q_mat[0, 0] = float('-inf')
        for i in range(1, len_s1 + 1):
            d_mat[i, 0] = float('-inf')
            p_mat[i, 0] = -self.gap_open - self.gap_ext * (i - 1)
            q_mat[i, 0] = float('-inf')
            q_mat[i, 1] = -self.gap_open
        for j in range(1, len_s2 + 1):
            d_mat[0, j] = float('-inf')
            p_mat[0, j] = float('-inf')
            p_mat[1, j] = -self.gap_open
            q_mat[0, j] = -self.gap_open - self.gap_ext * (j - 1)

        for i, sc1 in enumerate(s1, start=1):
            for j, sc2 in enumerate(s2, start=1):
                sim_val = self.sim_func(sc1, sc2)
                d_mat[i, j] = max(
                    d_mat[i - 1, j - 1] + sim_val,
                    p_mat[i - 1, j - 1] + sim_val,
                    q_mat[i - 1, j - 1] + sim_val,
                )
                p_mat[i, j] = max(
                    d_mat[i - 1, j] - self.gap_open,
                    p_mat[i - 1, j] - self.gap_ext,
                )
                q_mat[i, j] = max(
                    d_mat[i, j - 1] - self.gap_open,
                    q_mat[i, j - 1] - self.gap_ext,
                )

        i, j = (n - 1 for n in d_mat.shape)
        return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])


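# Usage sketch (illustrative only): gaps pay ``gap_open`` once and ``gap_ext``
# for each further extension, so a single one-character gap costs only the
# opening penalty here (3 matches - 1 = 2):
#
#     Gotoh()('AGT', 'AAGT')  # -> 2.0

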
class StrCmp95(_BaseSimilarity):
    """strcmp95 similarity

    http://cpansearch.perl.org/src/SCW/Text-JaroWinkler-0.1/strcmp95.c
    """
    sp_mx: tuple[tuple[str, str], ...] = (
        ('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'), ('B', 'V'), ('E', 'I'),
        ('E', 'O'), ('E', 'U'), ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
        ('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'), ('W', 'V'), ('X', 'K'),
        ('S', 'Z'), ('X', 'S'), ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
        ('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'), ('5', 'S'), ('8', 'B'),
        ('1', 'I'), ('1', 'L'), ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J'),
    )

    def __init__(self, long_strings: bool = False, external: bool = True) -> None:
        self.long_strings = long_strings
        self.external = external

    def maximum(self, *sequences: Sequence[object]) -> int:
        return 1

    @staticmethod
    def _in_range(char) -> bool:
        return 0 < ord(char) < 91

    def __call__(self, s1: str, s2: str) -> float:
        s1 = s1.strip().upper()
        s2 = s2.strip().upper()

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result

        len_s1 = len(s1)
        len_s2 = len(s2)

        adjwt = defaultdict(int)

        # Initialize the adjwt array on the first call to the function only.
        # The adjwt array is used to give partial credit for characters that
        # may be errors due to known phonetic or character recognition errors.
        # A typical example is to match the letter "O" with the number "0"
        for c1, c2 in self.sp_mx:
            adjwt[c1, c2] = 3
            adjwt[c2, c1] = 3

        if len_s1 > len_s2:
            search_range = len_s1
            minv = len_s2
        else:
            search_range = len_s2
            minv = len_s1

        # Blank out the flags
        s1_flag = [0] * search_range
        s2_flag = [0] * search_range
        search_range = max(0, search_range // 2 - 1)

        # Looking only within the search range, count and flag the matched pairs.
        num_com = 0
        yl1 = len_s2 - 1
        for i, sc1 in enumerate(s1):
            lowlim = max(i - search_range, 0)
            hilim = min(i + search_range, yl1)
            for j in range(lowlim, hilim + 1):
                if s2_flag[j] == 0 and s2[j] == sc1:
                    s2_flag[j] = 1
                    s1_flag[i] = 1
                    num_com += 1
                    break

        # If no characters in common - return
        if num_com == 0:
            return 0.0

        # Count the number of transpositions
        k = n_trans = 0
        for i, sc1 in enumerate(s1):
            if not s1_flag[i]:
                continue
            for j in range(k, len_s2):
                if s2_flag[j] != 0:
                    k = j + 1
                    break
            if sc1 != s2[j]:
                n_trans += 1
        n_trans = n_trans // 2

        # Adjust for similarities in unmatched characters
        n_simi = 0
        if minv > num_com:
            for i in range(len_s1):
                if s1_flag[i] != 0:
                    continue
                if not self._in_range(s1[i]):
                    continue
                for j in range(len_s2):
                    if s2_flag[j] != 0:
                        continue
                    if not self._in_range(s2[j]):
                        continue
                    if (s1[i], s2[j]) not in adjwt:
                        continue
                    n_simi += adjwt[s1[i], s2[j]]
                    s2_flag[j] = 2
                    break
        num_sim = n_simi / 10.0 + num_com

        # Main weight computation
        weight = num_sim / len_s1 + num_sim / len_s2
        weight += (num_com - n_trans) / num_com
        weight = weight / 3.0

        # Continue to boost the weight if the strings are similar
        if weight <= 0.7:
            return weight

        # Adjust for having up to the first 4 characters in common
        j = min(minv, 4)
        i = 0
        for sc1, sc2 in zip(s1, s2):
            if i >= j:
                break
            if sc1 != sc2:
                break
            if sc1.isdigit():
                break
            i += 1
        if i:
            weight += i * 0.1 * (1.0 - weight)

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if not self.long_strings:
            return weight
        if minv <= 4:
            return weight
        if num_com <= i + 1 or 2 * num_com < minv + i:
            return weight
        if s1[0].isdigit():
            return weight
        res = (num_com - i - 1) / (len_s1 + len_s2 - i * 2 + 2)
        weight += (1.0 - weight) * res
        return weight


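# Usage sketch (illustrative only): strcmp95 behaves like Jaro-Winkler on
# clean input, and the ``sp_mx`` table additionally gives partial credit for
# look-alike pairs such as 'Z'/'2' and 'O'/'0':
#
#     round(StrCmp95()('MARTHA', 'MARHTA'), 3)  # -> 0.961
#     StrCmp95()('ZERO', '2ER0')                # -> about 0.767 (partial credit for Z/2 and O/0)

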
class MLIPNS(_BaseSimilarity):
    """
    Compute the Hamming-based MLIPNS similarity between two sequences.
    MLIPNS is a binary measure: it returns 1 if the sequences are considered
    similar (the share of mismatches stays within `threshold` after discarding
    at most `maxmismatches` mismatching items) and 0 otherwise.

    http://www.sial.iias.spb.su/files/386-386-1-PB.pdf
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/mlipns.js
    """

    def __init__(
        self,
        threshold: float = 0.25,
        maxmismatches: int = 2,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.threshold = threshold
        self.maxmismatches = maxmismatches
        self.external = external

    def maximum(self, *sequences: Sequence[object]) -> int:
        return 1

    def __call__(self, *sequences: Sequence[object]) -> float:
        sequences = self._get_sequences(*sequences)

        result = self.quick_answer(*sequences)
        if result is not None:
            return result

        mismatches = 0
        ham = Hamming()(*sequences)
        maxlen = max(map(len, sequences))
        while all(sequences) and mismatches <= self.maxmismatches:
            if not maxlen:
                return 1
            if 1 - (maxlen - ham) / maxlen <= self.threshold:
                return 1
            mismatches += 1
            ham -= 1
            maxlen -= 1

        if not maxlen:
            return 1
        return 0


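# Usage sketch (illustrative only): MLIPNS is binary, reporting 1 when the
# mismatch share stays within ``threshold`` after dropping at most
# ``maxmismatches`` mismatching positions, and 0 otherwise:
#
#     MLIPNS()('Tom', 'Tam')  # -> 1
#     MLIPNS()('abc', 'xyz')  # -> 0

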
hamming = Hamming()
levenshtein = Levenshtein()
damerau = damerau_levenshtein = DamerauLevenshtein()
jaro = Jaro()
jaro_winkler = JaroWinkler()
needleman_wunsch = NeedlemanWunsch()
smith_waterman = SmithWaterman()
gotoh = Gotoh()
strcmp95 = StrCmp95()
mlipns = MLIPNS()