Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/textdistance/algorithms/sequence_based.py: 65%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

111 statements  

1from __future__ import annotations 

2 

3# built-in 

4from difflib import SequenceMatcher as _SequenceMatcher 

5from typing import Any 

6 

7# app 

8from ..utils import find_ngrams 

9from .base import BaseSimilarity as _BaseSimilarity 

10from .types import TestFunc 

11 

12 

13try: 

14 # external 

15 import numpy 

16except ImportError: 

17 # built-in 

18 from array import array 

19 numpy = None # type: ignore[assignment] 

20 

21 

22__all__ = [ 

23 'lcsseq', 'lcsstr', 'ratcliff_obershelp', 

24 'LCSSeq', 'LCSStr', 'RatcliffObershelp', 

25] 

26 

27 

28class LCSSeq(_BaseSimilarity): 

29 """longest common subsequence similarity 

30 

31 https://en.wikipedia.org/wiki/Longest_common_subsequence_problem 

32 """ 

33 

34 def __init__( 

35 self, 

36 qval: int = 1, 

37 test_func: TestFunc = None, 

38 external: bool = True, 

39 ) -> None: 

40 self.qval = qval 

41 self.test_func = test_func or self._ident 

42 self.external = external 

43 

44 def _dynamic(self, seq1: str, seq2: str) -> str: 

45 """ 

46 https://github.com/chrislit/abydos/blob/master/abydos/distance/_lcsseq.py 

47 http://www.dis.uniroma1.it/~bonifaci/algo/LCSSEQ.py 

48 http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_8 

49 """ 

50 lengths: Any 

51 if numpy: 

52 lengths = numpy.zeros((len(seq1) + 1, len(seq2) + 1), dtype=int) 

53 else: 

54 lengths = [array('L', [0] * (len(seq2) + 1)) for _ in range(len(seq1) + 1)] 

55 

56 # row 0 and column 0 are initialized to 0 already 

57 for i, char1 in enumerate(seq1): 

58 for j, char2 in enumerate(seq2): 

59 if char1 == char2: 

60 lengths[i + 1][j + 1] = lengths[i][j] + 1 

61 else: 

62 lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1]) 

63 

64 # read the substring out from the matrix 

65 result = '' 

66 i, j = len(seq1), len(seq2) 

67 while i != 0 and j != 0: 

68 if lengths[i][j] == lengths[i - 1][j]: 

69 i -= 1 

70 elif lengths[i][j] == lengths[i][j - 1]: 

71 j -= 1 

72 else: 

73 assert seq1[i - 1] == seq2[j - 1] 

74 result = seq1[i - 1] + result 

75 i -= 1 

76 j -= 1 

77 return result 

78 

79 def _recursive(self, *sequences: str) -> str: 

80 if not all(sequences): 

81 return type(sequences[0])() # empty sequence 

82 if self.test_func(*[s[-1] for s in sequences]): 

83 c = sequences[0][-1] 

84 sequences = tuple(s[:-1] for s in sequences) 

85 return self(*sequences) + c 

86 m = type(sequences[0])() # empty sequence 

87 for i, s in enumerate(sequences): 

88 ss = sequences[:i] + (s[:-1], ) + sequences[i + 1:] 

89 m = max([self(*ss), m], key=len) 

90 return m 

91 

92 def __call__(self, *sequences: str) -> str: 

93 if not sequences: 

94 return '' 

95 sequences = self._get_sequences(*sequences) 

96 if len(sequences) == 2: 

97 return self._dynamic(*sequences) 

98 else: 

99 return self._recursive(*sequences) 

100 

101 def similarity(self, *sequences) -> int: 

102 return len(self(*sequences)) 

103 

104 

105class LCSStr(_BaseSimilarity): 

106 """longest common substring similarity 

107 """ 

108 

109 def _standart(self, s1: str, s2: str) -> str: 

110 matcher = _SequenceMatcher(a=s1, b=s2) 

111 match = matcher.find_longest_match(0, len(s1), 0, len(s2)) 

112 return s1[match.a: match.a + match.size] 

113 

114 def _custom(self, *sequences: str) -> str: 

115 short = min(sequences, key=len) 

116 length = len(short) 

117 for n in range(length, 0, -1): 

118 for subseq in find_ngrams(short, n): 

119 joined = ''.join(subseq) 

120 for seq in sequences: 

121 if joined not in seq: 

122 break 

123 else: 

124 return joined 

125 return type(short)() # empty sequence 

126 

127 def __call__(self, *sequences: str) -> str: 

128 if not all(sequences): 

129 return '' 

130 length = len(sequences) 

131 if length == 0: 

132 return '' 

133 if length == 1: 

134 return sequences[0] 

135 

136 sequences = self._get_sequences(*sequences) 

137 if length == 2 and max(map(len, sequences)) < 200: 

138 return self._standart(*sequences) 

139 return self._custom(*sequences) 

140 

141 def similarity(self, *sequences: str) -> int: 

142 return len(self(*sequences)) 

143 

144 

145class RatcliffObershelp(_BaseSimilarity): 

146 """Ratcliff-Obershelp similarity 

147 This follows the Ratcliff-Obershelp algorithm to derive a similarity 

148 measure: 

149 1. Find the length of the longest common substring in sequences. 

150 2. Recurse on the strings to the left & right of each this substring 

151 in sequences. The base case is a 0 length common substring, in which 

152 case, return 0. Otherwise, return the sum of the current longest 

153 common substring and the left & right recursed sums. 

154 3. Multiply this length by 2 and divide by the sum of the lengths of 

155 sequences. 

156 

157 https://en.wikipedia.org/wiki/Gestalt_Pattern_Matching 

158 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/ratcliff-obershelp.js 

159 https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html 

160 """ 

161 

162 def maximum(self, *sequences: str) -> int: 

163 return 1 

164 

165 def _find(self, *sequences: str) -> int: 

166 subseq = LCSStr()(*sequences) 

167 length = len(subseq) 

168 if length == 0: 

169 return 0 

170 before = [s[:s.find(subseq)] for s in sequences] 

171 after = [s[s.find(subseq) + length:] for s in sequences] 

172 return self._find(*before) + length + self._find(*after) 

173 

174 def __call__(self, *sequences: str) -> float: 

175 result = self.quick_answer(*sequences) 

176 if result is not None: 

177 return result 

178 scount = len(sequences) # sequences count 

179 ecount = sum(map(len, sequences)) # elements count 

180 sequences = self._get_sequences(*sequences) 

181 return scount * self._find(*sequences) / ecount 

182 

183 

184lcsseq = LCSSeq() 

185lcsstr = LCSStr() 

186ratcliff_obershelp = RatcliffObershelp()