1from __future__ import annotations
2
3# built-in
4from difflib import SequenceMatcher as _SequenceMatcher
5from typing import Any
6
7# app
8from ..utils import find_ngrams
9from .base import BaseSimilarity as _BaseSimilarity
10from .types import TestFunc
11
12
13try:
14 # external
15 import numpy
16except ImportError:
17 # built-in
18 from array import array
19 numpy = None # type: ignore[assignment]
20
21
22__all__ = [
23 'lcsseq', 'lcsstr', 'ratcliff_obershelp',
24 'LCSSeq', 'LCSStr', 'RatcliffObershelp',
25]
26
27
28class LCSSeq(_BaseSimilarity):
29 """longest common subsequence similarity
30
31 https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
32 """
33
34 def __init__(
35 self,
36 qval: int = 1,
37 test_func: TestFunc = None,
38 external: bool = True,
39 ) -> None:
40 self.qval = qval
41 self.test_func = test_func or self._ident
42 self.external = external
43
44 def _dynamic(self, seq1: str, seq2: str) -> str:
45 """
46 https://github.com/chrislit/abydos/blob/master/abydos/distance/_lcsseq.py
47 http://www.dis.uniroma1.it/~bonifaci/algo/LCSSEQ.py
48 http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_8
49 """
50 lengths: Any
51 if numpy:
52 lengths = numpy.zeros((len(seq1) + 1, len(seq2) + 1), dtype=int)
53 else:
54 lengths = [array('L', [0] * (len(seq2) + 1)) for _ in range(len(seq1) + 1)]
55
56 # row 0 and column 0 are initialized to 0 already
57 for i, char1 in enumerate(seq1):
58 for j, char2 in enumerate(seq2):
59 if char1 == char2:
60 lengths[i + 1][j + 1] = lengths[i][j] + 1
61 else:
62 lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])
63
64 # read the substring out from the matrix
65 result = ''
66 i, j = len(seq1), len(seq2)
67 while i != 0 and j != 0:
68 if lengths[i][j] == lengths[i - 1][j]:
69 i -= 1
70 elif lengths[i][j] == lengths[i][j - 1]:
71 j -= 1
72 else:
73 assert seq1[i - 1] == seq2[j - 1]
74 result = seq1[i - 1] + result
75 i -= 1
76 j -= 1
77 return result
78
79 def _recursive(self, *sequences: str) -> str:
80 if not all(sequences):
81 return type(sequences[0])() # empty sequence
82 if self.test_func(*[s[-1] for s in sequences]):
83 c = sequences[0][-1]
84 sequences = tuple(s[:-1] for s in sequences)
85 return self(*sequences) + c
86 m = type(sequences[0])() # empty sequence
87 for i, s in enumerate(sequences):
88 ss = sequences[:i] + (s[:-1], ) + sequences[i + 1:]
89 m = max([self(*ss), m], key=len)
90 return m
91
92 def __call__(self, *sequences: str) -> str:
93 if not sequences:
94 return ''
95 sequences = self._get_sequences(*sequences)
96 if len(sequences) == 2:
97 return self._dynamic(*sequences)
98 else:
99 return self._recursive(*sequences)
100
101 def similarity(self, *sequences) -> int:
102 return len(self(*sequences))
103
104
105class LCSStr(_BaseSimilarity):
106 """longest common substring similarity
107 """
108
109 def _standart(self, s1: str, s2: str) -> str:
110 matcher = _SequenceMatcher(a=s1, b=s2)
111 match = matcher.find_longest_match(0, len(s1), 0, len(s2))
112 return s1[match.a: match.a + match.size]
113
114 def _custom(self, *sequences: str) -> str:
115 short = min(sequences, key=len)
116 length = len(short)
117 for n in range(length, 0, -1):
118 for subseq in find_ngrams(short, n):
119 joined = ''.join(subseq)
120 for seq in sequences:
121 if joined not in seq:
122 break
123 else:
124 return joined
125 return type(short)() # empty sequence
126
127 def __call__(self, *sequences: str) -> str:
128 if not all(sequences):
129 return ''
130 length = len(sequences)
131 if length == 0:
132 return ''
133 if length == 1:
134 return sequences[0]
135
136 sequences = self._get_sequences(*sequences)
137 if length == 2 and max(map(len, sequences)) < 200:
138 return self._standart(*sequences)
139 return self._custom(*sequences)
140
141 def similarity(self, *sequences: str) -> int:
142 return len(self(*sequences))
143
144
145class RatcliffObershelp(_BaseSimilarity):
146 """Ratcliff-Obershelp similarity
147 This follows the Ratcliff-Obershelp algorithm to derive a similarity
148 measure:
149 1. Find the length of the longest common substring in sequences.
150 2. Recurse on the strings to the left & right of each this substring
151 in sequences. The base case is a 0 length common substring, in which
152 case, return 0. Otherwise, return the sum of the current longest
153 common substring and the left & right recursed sums.
154 3. Multiply this length by 2 and divide by the sum of the lengths of
155 sequences.
156
157 https://en.wikipedia.org/wiki/Gestalt_Pattern_Matching
158 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/ratcliff-obershelp.js
159 https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html
160 """
161
162 def maximum(self, *sequences: str) -> int:
163 return 1
164
165 def _find(self, *sequences: str) -> int:
166 subseq = LCSStr()(*sequences)
167 length = len(subseq)
168 if length == 0:
169 return 0
170 before = [s[:s.find(subseq)] for s in sequences]
171 after = [s[s.find(subseq) + length:] for s in sequences]
172 return self._find(*before) + length + self._find(*after)
173
174 def __call__(self, *sequences: str) -> float:
175 result = self.quick_answer(*sequences)
176 if result is not None:
177 return result
178 scount = len(sequences) # sequences count
179 ecount = sum(map(len, sequences)) # elements count
180 sequences = self._get_sequences(*sequences)
181 return scount * self._find(*sequences) / ecount
182
183
184lcsseq = LCSSeq()
185lcsstr = LCSStr()
186ratcliff_obershelp = RatcliffObershelp()