Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/textdistance/algorithms/sequence

1from __future__ import annotations

3# built-in

4from difflib import SequenceMatcher as _SequenceMatcher

5from typing import Any

7# app

8from ..utils import find_ngrams

9from .base import BaseSimilarity as _BaseSimilarity

10from .types import TestFunc

13try:

14 # external

15 import numpy

16except ImportError:

17 # built-in

18 from array import array

19 numpy = None # type: ignore[assignment]

22__all__ = [

23 'lcsseq', 'lcsstr', 'ratcliff_obershelp',

24 'LCSSeq', 'LCSStr', 'RatcliffObershelp',

25]

28class LCSSeq(_BaseSimilarity):

29 """longest common subsequence similarity

31 https://en.wikipedia.org/wiki/Longest_common_subsequence_problem

32 """

34 def __init__(

35 self,

36 qval: int = 1,

37 test_func: TestFunc = None,

38 external: bool = True,

39 ) -> None:

40 self.qval = qval

41 self.test_func = test_func or self._ident

42 self.external = external

44 def _dynamic(self, seq1: str, seq2: str) -> str:

45 """

46 https://github.com/chrislit/abydos/blob/master/abydos/distance/_lcsseq.py

47 http://www.dis.uniroma1.it/~bonifaci/algo/LCSSEQ.py

48 http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_8

49 """

50 lengths: Any

51 if numpy:

52 lengths = numpy.zeros((len(seq1) + 1, len(seq2) + 1), dtype=int)

53 else:

54 lengths = [array('L', [0] * (len(seq2) + 1)) for _ in range(len(seq1) + 1)]

56 # row 0 and column 0 are initialized to 0 already

57 for i, char1 in enumerate(seq1):

58 for j, char2 in enumerate(seq2):

59 if char1 == char2:

60 lengths[i + 1][j + 1] = lengths[i][j] + 1

61 else:

62 lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])

64 # read the substring out from the matrix

65 result = ''

66 i, j = len(seq1), len(seq2)

67 while i != 0 and j != 0:

68 if lengths[i][j] == lengths[i - 1][j]:

69 i -= 1

70 elif lengths[i][j] == lengths[i][j - 1]:

71 j -= 1

72 else:

73 assert seq1[i - 1] == seq2[j - 1]

74 result = seq1[i - 1] + result

75 i -= 1

76 j -= 1

77 return result

79 def _recursive(self, *sequences: str) -> str:

80 if not all(sequences):

81 return type(sequences[0])() # empty sequence

82 if self.test_func(*[s[-1] for s in sequences]):

83 c = sequences[0][-1]

84 sequences = tuple(s[:-1] for s in sequences)

85 return self(*sequences) + c

86 m = type(sequences[0])() # empty sequence

87 for i, s in enumerate(sequences):

88 ss = sequences[:i] + (s[:-1], ) + sequences[i + 1:]

89 m = max([self(*ss), m], key=len)

90 return m

92 def __call__(self, *sequences: str) -> str:

93 if not sequences:

94 return ''

95 sequences = self._get_sequences(*sequences)

96 if len(sequences) == 2:

97 return self._dynamic(*sequences)

98 else:

99 return self._recursive(*sequences)

100

101 def similarity(self, *sequences) -> int:

102 return len(self(*sequences))

103

104

105class LCSStr(_BaseSimilarity):

106 """longest common substring similarity

107 """

108

109 def _standart(self, s1: str, s2: str) -> str:

110 matcher = _SequenceMatcher(a=s1, b=s2)

111 match = matcher.find_longest_match(0, len(s1), 0, len(s2))

112 return s1[match.a: match.a + match.size]

113

114 def _custom(self, *sequences: str) -> str:

115 short = min(sequences, key=len)

116 length = len(short)

117 for n in range(length, 0, -1):

118 for subseq in find_ngrams(short, n):

119 joined = ''.join(subseq)

120 for seq in sequences:

121 if joined not in seq:

122 break

123 else:

124 return joined

125 return type(short)() # empty sequence

126

127 def __call__(self, *sequences: str) -> str:

128 if not all(sequences):

129 return ''

130 length = len(sequences)

131 if length == 0:

132 return ''

133 if length == 1:

134 return sequences[0]

135

136 sequences = self._get_sequences(*sequences)

137 if length == 2 and max(map(len, sequences)) < 200:

138 return self._standart(*sequences)

139 return self._custom(*sequences)

140

141 def similarity(self, *sequences: str) -> int:

142 return len(self(*sequences))

143

144

145class RatcliffObershelp(_BaseSimilarity):

146 """Ratcliff-Obershelp similarity

147 This follows the Ratcliff-Obershelp algorithm to derive a similarity

148 measure:

149 1. Find the length of the longest common substring in sequences.

150 2. Recurse on the strings to the left & right of each this substring

151 in sequences. The base case is a 0 length common substring, in which

152 case, return 0. Otherwise, return the sum of the current longest

153 common substring and the left & right recursed sums.

154 3. Multiply this length by 2 and divide by the sum of the lengths of

155 sequences.

156

157 https://en.wikipedia.org/wiki/Gestalt_Pattern_Matching

158 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/ratcliff-obershelp.js

159 https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html

160 """

161

162 def maximum(self, *sequences: str) -> int:

163 return 1

164

165 def _find(self, *sequences: str) -> int:

166 subseq = LCSStr()(*sequences)

167 length = len(subseq)

168 if length == 0:

169 return 0

170 before = [s[:s.find(subseq)] for s in sequences]

171 after = [s[s.find(subseq) + length:] for s in sequences]

172 return self._find(*before) + length + self._find(*after)

173

174 def __call__(self, *sequences: str) -> float:

175 result = self.quick_answer(*sequences)

176 if result is not None:

177 return result

178 scount = len(sequences) # sequences count

179 ecount = sum(map(len, sequences)) # elements count

180 sequences = self._get_sequences(*sequences)

181 return scount * self._find(*sequences) / ecount

182

183

184lcsseq = LCSSeq()

185lcsstr = LCSStr()

186ratcliff_obershelp = RatcliffObershelp()

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/textdistance/algorithms/sequence_based.py: 65%

111 statements