Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/textdistance/algorithms/token_based.py: 76%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

160 statements  

1from __future__ import annotations 

2 

3# built-in 

4from functools import reduce 

5from itertools import islice, permutations, repeat 

6from math import log 

7from typing import Sequence 

8 

9# app 

10from .base import Base as _Base, BaseSimilarity as _BaseSimilarity 

11from .edit_based import DamerauLevenshtein 

12 

13 

14__all__ = [ 

15 'Jaccard', 'Sorensen', 'Tversky', 

16 'Overlap', 'Cosine', 'Tanimoto', 'MongeElkan', 'Bag', 

17 

18 'jaccard', 'sorensen', 'tversky', 'sorensen_dice', 'dice', 

19 'overlap', 'cosine', 'tanimoto', 'monge_elkan', 'bag', 

20] 

21 

22 

23class Jaccard(_BaseSimilarity): 

24 """ 

25 Compute the Jaccard similarity between the two sequences. 

26 They should contain hashable items. 

27 The return value is a float between 0 and 1, where 1 means equal, 

28 and 0 totally different. 

29 

30 https://en.wikipedia.org/wiki/Jaccard_index 

31 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaccard.js 

32 """ 

33 

34 def __init__( 

35 self, 

36 qval: int = 1, 

37 as_set: bool = False, 

38 external: bool = True, 

39 ) -> None: 

40 self.qval = qval 

41 self.as_set = as_set 

42 self.external = external 

43 

44 def maximum(self, *sequences: Sequence) -> int: 

45 return 1 

46 

47 def __call__(self, *sequences: Sequence) -> float: 

48 result = self.quick_answer(*sequences) 

49 if result is not None: 

50 return result 

51 

52 sequences = self._get_counters(*sequences) # sets 

53 intersection = self._intersect_counters(*sequences) # set 

54 intersection = self._count_counters(intersection) # int 

55 union = self._union_counters(*sequences) # set 

56 union = self._count_counters(union) # int 

57 return intersection / union 

58 

59 

60class Sorensen(_BaseSimilarity): 

61 """ 

62 Compute the Sorensen distance between the two sequences. 

63 They should contain hashable items. 

64 The return value is a float between 0 and 1, where 1 means equal, 

65 and 0 totally different. 

66 

67 https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient 

68 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/dice.js 

69 """ 

70 

71 def __init__(self, qval: int = 1, as_set: bool = False, external: bool = True) -> None: 

72 self.qval = qval 

73 self.as_set = as_set 

74 self.external = external 

75 

76 def maximum(self, *sequences: Sequence) -> int: 

77 return 1 

78 

79 def __call__(self, *sequences: Sequence) -> float: 

80 result = self.quick_answer(*sequences) 

81 if result is not None: 

82 return result 

83 

84 sequences = self._get_counters(*sequences) # sets 

85 count = sum(self._count_counters(s) for s in sequences) 

86 intersection = self._intersect_counters(*sequences) # set 

87 intersection = self._count_counters(intersection) # int 

88 return 2.0 * intersection / count 

89 

90 

91class Tversky(_BaseSimilarity): 

92 """ 

93 Compute the Tversky index for two sequences. 

94 They should contain hashable items. 

95 The return value is a float between 0 and 1, where 1 means equal, 

96 and 0 totally different 

97 

98 https://en.wikipedia.org/wiki/Tversky_index 

99 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/tversky.js 

100 """ 

101 

102 def __init__( 

103 self, 

104 qval: int = 1, 

105 ks: Sequence[float] = None, 

106 bias: float | None = None, 

107 as_set: bool = False, 

108 external: bool = True, 

109 ) -> None: 

110 self.qval = qval 

111 self.ks = ks or repeat(1) 

112 self.bias = bias 

113 self.as_set = as_set 

114 self.external = external 

115 

116 def maximum(self, *sequences: Sequence) -> int: 

117 return 1 

118 

119 def __call__(self, *sequences: Sequence) -> float: 

120 quick_result = self.quick_answer(*sequences) 

121 if quick_result is not None: 

122 return quick_result 

123 

124 sequences = self._get_counters(*sequences) # sets 

125 intersection = self._intersect_counters(*sequences) # set 

126 intersection = self._count_counters(intersection) # int 

127 sequences = [self._count_counters(s) for s in sequences] # ints 

128 ks = list(islice(self.ks, len(sequences))) 

129 

130 if len(sequences) != 2 or self.bias is None: 

131 result = intersection 

132 for k, s in zip(ks, sequences): 

133 result += k * (s - intersection) 

134 return intersection / result 

135 

136 s1, s2 = sequences 

137 alpha, beta = ks 

138 a_val = min([s1, s2]) 

139 b_val = max([s1, s2]) 

140 c_val = intersection + self.bias 

141 result = alpha * beta * (a_val - b_val) + b_val * beta 

142 return c_val / (result + c_val) 

143 

144 

145class Overlap(_BaseSimilarity): 

146 """ 

147 Compute the Overlap coefficient for two sequences. 

148 They should contain hashable items. 

149 The return value is a float between 0 and 1, where 1 means equal, 

150 and 0 totally different 

151 

152 https://en.wikipedia.org/wiki/Overlap_coefficient 

153 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/overlap.js 

154 """ 

155 

156 def __init__( 

157 self, 

158 qval: int = 1, 

159 as_set: bool = False, 

160 external: bool = True, 

161 ) -> None: 

162 self.qval = qval 

163 self.as_set = as_set 

164 self.external = external 

165 

166 def maximum(self, *sequences: Sequence) -> int: 

167 return 1 

168 

169 def __call__(self, *sequences: Sequence) -> float: 

170 result = self.quick_answer(*sequences) 

171 if result is not None: 

172 return result 

173 

174 sequences = self._get_counters(*sequences) # sets 

175 intersection = self._intersect_counters(*sequences) # set 

176 intersection = self._count_counters(intersection) # int 

177 sequences = [self._count_counters(s) for s in sequences] # ints 

178 

179 return intersection / min(sequences) 

180 

181 

182class Cosine(_BaseSimilarity): 

183 """ 

184 Compute the Cosine similarity (Ochiai coefficient) for two sequences. 

185 They should contain hashable items. 

186 The return value is a float between 0 and 1, where 1 means equal, 

187 and 0 totally different 

188 

189 https://en.wikipedia.org/wiki/Cosine_similarity 

190 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/cosine.js 

191 """ 

192 

193 def __init__( 

194 self, 

195 qval: int = 1, 

196 as_set: bool = False, 

197 external: bool = True, 

198 ) -> None: 

199 self.qval = qval 

200 self.as_set = as_set 

201 self.external = external 

202 

203 def maximum(self, *sequences: Sequence) -> int: 

204 return 1 

205 

206 def __call__(self, *sequences: Sequence) -> float: 

207 result = self.quick_answer(*sequences) 

208 if result is not None: 

209 return result 

210 

211 sequences = self._get_counters(*sequences) # sets 

212 intersection = self._intersect_counters(*sequences) # set 

213 intersection = self._count_counters(intersection) # int 

214 sequences = [self._count_counters(s) for s in sequences] # ints 

215 prod = reduce(lambda x, y: x * y, sequences) 

216 

217 return intersection / pow(prod, 1.0 / len(sequences)) 

218 

219 

220class Tanimoto(Jaccard): 

221 """ 

222 Compute the Tanimoto distance between two sequences. 

223 They should contain hashable items. 

224 The return value is a float between -inf and 0, where 0 means equal, 

225 and -inf totally different 

226  

227 This is identical to the Jaccard similarity coefficient 

228 and the Tversky index for alpha=1 and beta=1. 

229 

230 https://en.wikipedia.org/wiki/Jaccard_index#Tanimoto_similarity_and_distance 

231 """ 

232 

233 def __call__(self, *sequences: Sequence) -> float: 

234 result = super().__call__(*sequences) 

235 if result == 0: 

236 return float('-inf') 

237 else: 

238 return log(result, 2) 

239 

240 

241class MongeElkan(_BaseSimilarity): 

242 """ 

243 Compute the Monge Elkan distance between two sequences. 

244 They should contain hashable items. 

245 The return value is a float between 0 and 2, where 2 means equal, 

246 and 0 totally different. 

247  

248 https://www.academia.edu/200314/Generalized_Monge-Elkan_Method_for_Approximate_Text_String_Comparison 

249 http://www.cs.cmu.edu/~wcohen/postscript/kdd-2003-match-ws.pdf 

250 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/monge-elkan.js 

251 """ 

252 _damerau_levenshtein = DamerauLevenshtein() 

253 

254 def __init__( 

255 self, 

256 algorithm=_damerau_levenshtein, 

257 symmetric: bool = False, 

258 qval: int = 1, 

259 external: bool = True, 

260 ) -> None: 

261 self.algorithm = algorithm 

262 self.symmetric = symmetric 

263 self.qval = qval 

264 self.external = external 

265 

266 def maximum(self, *sequences: Sequence) -> float: 

267 result = self.algorithm.maximum(sequences) 

268 for seq in sequences: 

269 if seq: 

270 result = max(result, self.algorithm.maximum(*seq)) 

271 return result 

272 

273 def _calc(self, seq, *sequences: Sequence) -> float: 

274 if not seq: 

275 return 0 

276 maxes = [] 

277 for c1 in seq: 

278 for s in sequences: 

279 max_sim = float('-inf') 

280 for c2 in s: 

281 max_sim = max(max_sim, self.algorithm.similarity(c1, c2)) 

282 maxes.append(max_sim) 

283 return sum(maxes) / len(seq) / len(maxes) 

284 

285 def __call__(self, *sequences: Sequence) -> float: 

286 quick_result = self.quick_answer(*sequences) 

287 if quick_result is not None: 

288 return quick_result 

289 sequences = self._get_sequences(*sequences) 

290 

291 if self.symmetric: 

292 result = [] 

293 for seqs in permutations(sequences): 

294 result.append(self._calc(*seqs)) 

295 return sum(result) / len(result) 

296 else: 

297 return self._calc(*sequences) 

298 

299 

300class Bag(_Base): 

301 """ 

302 Compute the Bag distance between two sequences. 

303 They should contain hashable items. 

304 The return value is a float between 0 and N, where 0 means equal, 

305 and N totally different. N would, at most, be the length of the  

306 longest sequence in the comparison. 

307  

308 http://www-db.disi.unibo.it/research/papers/SPIRE02.pdf 

309 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/bag.js 

310 """ 

311 

312 def __call__(self, *sequences: Sequence) -> float: 

313 sequences = self._get_counters(*sequences) # sets 

314 intersection = self._intersect_counters(*sequences) # set 

315 return max(self._count_counters(sequence - intersection) for sequence in sequences) 

316 

317 

318bag = Bag() 

319cosine = Cosine() 

320dice = Sorensen() 

321jaccard = Jaccard() 

322monge_elkan = MongeElkan() 

323overlap = Overlap() 

324sorensen = Sorensen() 

325sorensen_dice = Sorensen() 

326# sorensen_dice = Tversky(ks=[.5, .5]) 

327tanimoto = Tanimoto() 

328tversky = Tversky()