# textdistance/algorithms/edit_based.py

from __future__ import annotations

# built-in
from collections import defaultdict
from itertools import zip_longest
from typing import Any, Sequence, TypeVar

# app
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity
from .types import SimFunc, TestFunc


try:
    # external
    import numpy
except ImportError:
    numpy = None  # type: ignore[assignment]


__all__ = [
    'Hamming', 'MLIPNS',
    'Levenshtein', 'DamerauLevenshtein',
    'Jaro', 'JaroWinkler', 'StrCmp95',
    'NeedlemanWunsch', 'Gotoh', 'SmithWaterman',

    'hamming', 'mlipns',
    'levenshtein', 'damerau_levenshtein',
    'jaro', 'jaro_winkler', 'strcmp95',
    'needleman_wunsch', 'gotoh', 'smith_waterman',
]
T = TypeVar('T')



class Hamming(_Base):
    """
    Compute the Hamming distance between two or more sequences.
    The Hamming distance is the number of differing items in ordered sequences.

    https://en.wikipedia.org/wiki/Hamming_distance
    """

    def __init__(
        self,
        qval: int = 1,
        test_func: TestFunc | None = None,
        truncate: bool = False,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.test_func = test_func or self._ident
        self.truncate = truncate
        self.external = external

    def __call__(self, *sequences: Sequence[object]) -> int:
        sequences = self._get_sequences(*sequences)

        result = self.quick_answer(*sequences)
        if result is not None:
            assert isinstance(result, int)
            return result

        _zip = zip if self.truncate else zip_longest
        return sum(not self.test_func(*es) for es in _zip(*sequences))


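# Usage sketch (illustrative, not from the library's test suite); values assume
# the default test_func, which simply compares elements for equality:
#
#     Hamming()('test', 'text')                  # -> 1  (one differing position)
#     Hamming()('test', 'testing')               # -> 3  (extra items count by default)
#     Hamming(truncate=True)('test', 'testing')  # -> 0  (extra tail is ignored)

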

class Levenshtein(_Base):
    """
    Compute the absolute Levenshtein distance between the two sequences.
    The Levenshtein distance is the minimum number of edit operations necessary
    for transforming one sequence into the other. The edit operations allowed are:

    * deletion: ABC -> BC, AC, AB
    * insertion: ABC -> ABCD, EABC, AEBC..
    * substitution: ABC -> ABE, ADC, FBC..

    https://en.wikipedia.org/wiki/Levenshtein_distance
    TODO: https://gist.github.com/kylebgorman/1081951/9b38b7743a3cb5167ab2c6608ac8eea7fc629dca
    """

    def __init__(
        self,
        qval: int = 1,
        test_func: TestFunc | None = None,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.test_func = test_func or self._ident
        self.external = external

    def _recursive(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        # TODO: more than 2 sequences support
        if not s1 or not s2:
            return len(s1) + len(s2)

        if self.test_func(s1[-1], s2[-1]):
            return self(s1[:-1], s2[:-1])

        # deletion/insertion
        d = min(
            self(s1[:-1], s2),
            self(s1, s2[:-1]),
        )
        # substitution
        s = self(s1[:-1], s2[:-1])
        return min(d, s) + 1

    def _cycled(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        """
        source:
        https://github.com/jamesturk/jellyfish/blob/master/jellyfish/_jellyfish.py#L18
        """
        rows = len(s1) + 1
        cols = len(s2) + 1
        prev = None
        cur: Any
        if numpy:
            cur = numpy.arange(cols)
        else:
            cur = range(cols)

        for r in range(1, rows):
            prev, cur = cur, [r] + [0] * (cols - 1)
            for c in range(1, cols):
                deletion = prev[c] + 1
                insertion = cur[c - 1] + 1
                dist = self.test_func(s1[r - 1], s2[c - 1])
                edit = prev[c - 1] + (not dist)
                cur[c] = min(edit, deletion, insertion)
        return int(cur[-1])

    def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            assert isinstance(result, int)
            return result

        return self._cycled(s1, s2)


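# Usage sketch (illustrative): the classic example needs two substitutions and
# one insertion, so the distance is 3:
#
#     Levenshtein()('kitten', 'sitting')   # -> 3

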

class DamerauLevenshtein(_Base):
    """
    Compute the absolute Damerau-Levenshtein distance between the two sequences.
    The Damerau-Levenshtein distance is the minimum number of edit operations necessary
    for transforming one sequence into the other. The edit operations allowed are:

    * deletion: ABC -> BC, AC, AB
    * insertion: ABC -> ABCD, EABC, AEBC..
    * substitution: ABC -> ABE, ADC, FBC..
    * transposition: ABC -> ACB, BAC

    If `restricted=False`, it will calculate unrestricted distance,
    where the same character can be touched more than once.
    So the distance between BA and ACB is 2: BA -> AB -> ACB.

    https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
    """

    def __init__(
        self,
        qval: int = 1,
        test_func: TestFunc | None = None,
        external: bool = True,
        restricted: bool = True,
    ) -> None:
        self.qval = qval
        self.test_func = test_func or self._ident
        self.external = external
        self.restricted = restricted

    def _numpy(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        # TODO: doesn't pass tests, need improve
        d = numpy.zeros([len(s1) + 1, len(s2) + 1], dtype=int)

        # matrix
        for i in range(-1, len(s1) + 1):
            d[i][-1] = i + 1
        for j in range(-1, len(s2) + 1):
            d[-1][j] = j + 1

        for i, cs1 in enumerate(s1):
            for j, cs2 in enumerate(s2):
                cost = int(not self.test_func(cs1, cs2))
                # ^ 0 if equal, 1 otherwise

                d[i][j] = min(
                    d[i - 1][j] + 1,         # deletion
                    d[i][j - 1] + 1,         # insertion
                    d[i - 1][j - 1] + cost,  # substitution
                )

                # transposition
                if not i or not j:
                    continue
                if not self.test_func(cs1, s2[j - 1]):
                    continue
                d[i][j] = min(
                    d[i][j],
                    d[i - 2][j - 2] + cost,
                )

        return d[len(s1) - 1][len(s2) - 1]

    def _pure_python_unrestricted(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        """https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
        """
        d: dict[tuple[int, int], int] = {}
        da: dict[T, int] = {}

        len1 = len(s1)
        len2 = len(s2)

        maxdist = len1 + len2
        d[-1, -1] = maxdist

        # matrix
        for i in range(len(s1) + 1):
            d[i, -1] = maxdist
            d[i, 0] = i
        for j in range(len(s2) + 1):
            d[-1, j] = maxdist
            d[0, j] = j

        for i, cs1 in enumerate(s1, start=1):
            db = 0
            for j, cs2 in enumerate(s2, start=1):
                i1 = da.get(cs2, 0)
                j1 = db
                if self.test_func(cs1, cs2):
                    cost = 0
                    db = j
                else:
                    cost = 1

                d[i, j] = min(
                    d[i - 1, j - 1] + cost,                       # substitution
                    d[i, j - 1] + 1,                              # insertion
                    d[i - 1, j] + 1,                              # deletion
                    d[i1 - 1, j1 - 1] + (i - i1) - 1 + (j - j1),  # transposition
                )
            da[cs1] = i

        return d[len1, len2]

    def _pure_python_restricted(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        """
        https://www.guyrutenberg.com/2008/12/15/damerau-levenshtein-distance-in-python/
        """
        d: dict[tuple[int, int], int] = {}

        # matrix
        for i in range(-1, len(s1) + 1):
            d[i, -1] = i + 1
        for j in range(-1, len(s2) + 1):
            d[-1, j] = j + 1

        for i, cs1 in enumerate(s1):
            for j, cs2 in enumerate(s2):
                cost = int(not self.test_func(cs1, cs2))
                # ^ 0 if equal, 1 otherwise

                d[i, j] = min(
                    d[i - 1, j] + 1,         # deletion
                    d[i, j - 1] + 1,         # insertion
                    d[i - 1, j - 1] + cost,  # substitution
                )

                # transposition
                if not i or not j:
                    continue
                if not self.test_func(cs1, s2[j - 1]):
                    continue
                if not self.test_func(s1[i - 1], cs2):
                    continue
                d[i, j] = min(
                    d[i, j],
                    d[i - 2, j - 2] + cost,
                )

        return d[len(s1) - 1, len(s2) - 1]

    def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result  # type: ignore[return-value]

        # if numpy:
        #     return self._numpy(s1, s2)
        # else:
        if self.restricted:
            return self._pure_python_restricted(s1, s2)
        return self._pure_python_unrestricted(s1, s2)


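# Usage sketch (illustrative, values per the pure-Python implementations above;
# with external=True an installed backend library may be consulted instead).
# The default restricted variant (optimal string alignment) never lets a
# transposition overlap another edit, so the docstring's example differs:
#
#     DamerauLevenshtein()('BA', 'ACB')                  # restricted   -> 3
#     DamerauLevenshtein(restricted=False)('BA', 'ACB')  # unrestricted -> 2

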

class JaroWinkler(_BaseSimilarity):
    """
    Computes the Jaro-Winkler measure between two strings.
    The Jaro-Winkler measure is designed to capture cases where two strings
    have a low Jaro score but share a prefix, and thus are likely to match.

    https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro.js
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro-winkler.js
    """

    def __init__(
        self,
        long_tolerance: bool = False,
        winklerize: bool = True,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.long_tolerance = long_tolerance
        self.winklerize = winklerize
        self.external = external

    def maximum(self, *sequences: Sequence[object]) -> int:
        return 1

    def __call__(self, s1: Sequence[T], s2: Sequence[T], prefix_weight: float = 0.1) -> float:
        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result

        s1_len = len(s1)
        s2_len = len(s2)

        if not s1_len or not s2_len:
            return 0.0

        min_len = min(s1_len, s2_len)
        search_range = max(s1_len, s2_len)
        search_range = (search_range // 2) - 1
        if search_range < 0:
            search_range = 0

        s1_flags = [False] * s1_len
        s2_flags = [False] * s2_len

        # looking only within search range, count & flag matched pairs
        common_chars = 0
        for i, s1_ch in enumerate(s1):
            low = max(0, i - search_range)
            hi = min(i + search_range, s2_len - 1)
            for j in range(low, hi + 1):
                if not s2_flags[j] and s2[j] == s1_ch:
                    s1_flags[i] = s2_flags[j] = True
                    common_chars += 1
                    break

        # short circuit if no characters match
        if not common_chars:
            return 0.0

        # count transpositions
        k = trans_count = 0
        for i, s1_f in enumerate(s1_flags):
            if s1_f:
                for j in range(k, s2_len):
                    if s2_flags[j]:
                        k = j + 1
                        break
                if s1[i] != s2[j]:
                    trans_count += 1
        trans_count //= 2

        # adjust for similarities in nonmatched characters
        weight = common_chars / s1_len + common_chars / s2_len
        weight += (common_chars - trans_count) / common_chars
        weight /= 3

        # stop to boost if strings are not similar
        if not self.winklerize:
            return weight
        if weight <= 0.7:
            return weight

        # winkler modification
        # adjust for up to first 4 chars in common
        j = min(min_len, 4)
        i = 0
        while i < j and s1[i] == s2[i]:
            i += 1
        if i:
            weight += i * prefix_weight * (1.0 - weight)

        # optionally adjust for long strings
        # after agreeing beginning chars, at least two or more must agree and
        # agreed characters must be > half of remaining characters
        if not self.long_tolerance or min_len <= 4:
            return weight
        if common_chars <= i + 1 or 2 * common_chars < min_len + i:
            return weight
        tmp = (common_chars - i - 1) / (s1_len + s2_len - i * 2 + 2)
        weight += (1.0 - weight) * tmp
        return weight



class Jaro(JaroWinkler):
    def __init__(
        self,
        long_tolerance: bool = False,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        super().__init__(
            long_tolerance=long_tolerance,
            winklerize=False,
            qval=qval,
            external=external,
        )


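# Usage sketch (illustrative, classic Wikipedia example): 'MARTHA' / 'MARHTA'
# share 6 matching characters with one transposition, and the 3-character
# common prefix lifts the Winkler score above plain Jaro:
#
#     Jaro()('MARTHA', 'MARHTA')         # -> ~0.944
#     JaroWinkler()('MARTHA', 'MARHTA')  # -> ~0.961

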

class NeedlemanWunsch(_BaseSimilarity):
    """
    Computes the Needleman-Wunsch measure between two strings.
    The Needleman-Wunsch measure generalizes the Levenshtein distance and considers global
    alignment between two strings. Specifically, it is computed by assigning
    a score to each alignment between the two input strings and choosing the
    score of the best alignment, that is, the maximal score.
    An alignment between two strings is a set of correspondences between their
    characters, allowing for gaps.

    https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
    """

    def __init__(
        self,
        gap_cost: float = 1.0,
        sim_func: SimFunc = None,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.gap_cost = gap_cost
        if sim_func:
            self.sim_func = sim_func
        else:
            self.sim_func = self._ident
        self.external = external

    def minimum(self, *sequences: Sequence[object]) -> float:
        return -max(map(len, sequences)) * self.gap_cost

    def maximum(self, *sequences: Sequence[object]) -> float:
        return max(map(len, sequences))

    def distance(self, *sequences: Sequence[object]) -> float:
        """Get distance between sequences
        """
        return -1 * self.similarity(*sequences)

    def normalized_distance(self, *sequences: Sequence[object]) -> float:
        """Get distance from 0 to 1
        """
        minimum = self.minimum(*sequences)
        maximum = self.maximum(*sequences)
        if maximum == 0:
            return 0
        return (self.distance(*sequences) - minimum) / (maximum - minimum)

    def normalized_similarity(self, *sequences: Sequence[object]) -> float:
        """Get similarity from 0 to 1
        """
        minimum = self.minimum(*sequences)
        maximum = self.maximum(*sequences)
        if maximum == 0:
            return 1
        return (self.similarity(*sequences) - minimum) / (maximum * 2)

    def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> float:
        if not numpy:
            raise ImportError('Please, install numpy for Needleman-Wunsch measure')

        s1, s2 = self._get_sequences(s1, s2)

        # result = self.quick_answer(s1, s2)
        # if result is not None:
        #     return result * self.maximum(s1, s2)

        dist_mat = numpy.zeros(
            (len(s1) + 1, len(s2) + 1),
            dtype=float,
        )
        # DP initialization
        for i in range(len(s1) + 1):
            dist_mat[i, 0] = -(i * self.gap_cost)
        # DP initialization
        for j in range(len(s2) + 1):
            dist_mat[0, j] = -(j * self.gap_cost)
        # Needleman-Wunsch DP calculation
        for i, c1 in enumerate(s1, 1):
            for j, c2 in enumerate(s2, 1):
                match = dist_mat[i - 1, j - 1] + self.sim_func(c1, c2)
                delete = dist_mat[i - 1, j] - self.gap_cost
                insert = dist_mat[i, j - 1] - self.gap_cost
                dist_mat[i, j] = max(match, delete, insert)
        return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1]


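# Usage sketch (illustrative): with the default scoring (match = 1, mismatch = 0,
# gap_cost = 1) the best global alignment of 'AGC' and 'AC' keeps the two matches
# and pays for one gap, giving a score of 1.0:
#
#     NeedlemanWunsch()('AGC', 'AC')   # -> 1.0  (requires numpy)

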

class SmithWaterman(_BaseSimilarity):
    """
    Computes the Smith-Waterman measure between two strings.
    The Smith-Waterman algorithm performs local sequence alignment;
    that is, it determines similar regions between two strings.
    Instead of looking at the total sequence, the Smith-Waterman algorithm compares
    segments of all possible lengths and optimizes the similarity measure.

    https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/smith-waterman.js
    """

    def __init__(
        self,
        gap_cost: float = 1.0,
        sim_func: SimFunc = None,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.gap_cost = gap_cost
        self.sim_func = sim_func or self._ident
        self.external = external

    def maximum(self, *sequences: Sequence[object]) -> int:
        return min(map(len, sequences))

    def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> float:
        if not numpy:
            raise ImportError('Please, install numpy for Smith-Waterman measure')

        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result

        dist_mat = numpy.zeros(
            (len(s1) + 1, len(s2) + 1),
            dtype=float,
        )
        for i, sc1 in enumerate(s1, start=1):
            for j, sc2 in enumerate(s2, start=1):
                # The score for substituting the letter a[i - 1] for b[j - 1].
                # Generally low for mismatch, high for match.
                match = dist_mat[i - 1, j - 1] + self.sim_func(sc1, sc2)
                # The scores for introducing extra letters in one of the strings
                # (or by symmetry, deleting them from the other).
                delete = dist_mat[i - 1, j] - self.gap_cost
                insert = dist_mat[i, j - 1] - self.gap_cost
                dist_mat[i, j] = max(0, match, delete, insert)
        return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1]


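# Usage sketch (illustrative): with the default scoring (match = 1, mismatch = 0,
# gap_cost = 1) the shared local region 'cat' scores 3, even though the strings
# differ globally:
#
#     SmithWaterman()('cat', 'concat')   # -> 3.0  (requires numpy)

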

class Gotoh(NeedlemanWunsch):
    """Gotoh score
    Gotoh's algorithm is essentially Needleman-Wunsch with affine gap
    penalties:
    https://www.cs.umd.edu/class/spring2003/cmsc838t/papers/gotoh1982.pdf
    """

    def __init__(
        self,
        gap_open: int = 1,
        gap_ext: float = 0.4,
        sim_func: SimFunc = None,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.gap_open = gap_open
        self.gap_ext = gap_ext
        if sim_func:
            self.sim_func = sim_func
        else:
            self.sim_func = self._ident
        self.external = external

    def minimum(self, *sequences: Sequence[object]) -> int:
        return -min(map(len, sequences))

    def maximum(self, *sequences: Sequence[object]) -> int:
        return min(map(len, sequences))

    def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> float:
        if not numpy:
            raise ImportError('Please, install numpy for Gotoh measure')

        s1, s2 = self._get_sequences(s1, s2)

        # result = self.quick_answer(s1, s2)
        # if result is not None:
        #     return result * self.maximum(s1, s2)

        len_s1 = len(s1)
        len_s2 = len(s2)
        d_mat = numpy.zeros((len_s1 + 1, len_s2 + 1), dtype=float)
        p_mat = numpy.zeros((len_s1 + 1, len_s2 + 1), dtype=float)
        q_mat = numpy.zeros((len_s1 + 1, len_s2 + 1), dtype=float)

        d_mat[0, 0] = 0
        p_mat[0, 0] = float('-inf')
        q_mat[0, 0] = float('-inf')
        for i in range(1, len_s1 + 1):
            d_mat[i, 0] = float('-inf')
            p_mat[i, 0] = -self.gap_open - self.gap_ext * (i - 1)
            q_mat[i, 0] = float('-inf')
            q_mat[i, 1] = -self.gap_open
        for j in range(1, len_s2 + 1):
            d_mat[0, j] = float('-inf')
            p_mat[0, j] = float('-inf')
            p_mat[1, j] = -self.gap_open
            q_mat[0, j] = -self.gap_open - self.gap_ext * (j - 1)

        for i, sc1 in enumerate(s1, start=1):
            for j, sc2 in enumerate(s2, start=1):
                sim_val = self.sim_func(sc1, sc2)
                d_mat[i, j] = max(
                    d_mat[i - 1, j - 1] + sim_val,
                    p_mat[i - 1, j - 1] + sim_val,
                    q_mat[i - 1, j - 1] + sim_val,
                )
                p_mat[i, j] = max(
                    d_mat[i - 1, j] - self.gap_open,
                    p_mat[i - 1, j] - self.gap_ext,
                )
                q_mat[i, j] = max(
                    d_mat[i, j - 1] - self.gap_open,
                    q_mat[i, j - 1] - self.gap_ext,
                )

        i, j = (n - 1 for n in d_mat.shape)
        return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])


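# Usage sketch (illustrative): with affine gaps a run of k gap positions costs
# gap_open + (k - 1) * gap_ext instead of k * gap_cost, so long gaps are
# penalized less than under plain Needleman-Wunsch:
#
#     Gotoh()('ATTTC', 'AC')            # one 3-long gap: 2 - (1 + 2 * 0.4) ≈ 0.2
#     NeedlemanWunsch()('ATTTC', 'AC')  # three unit gaps: 2 - 3 = -1.0

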

class StrCmp95(_BaseSimilarity):
    """strcmp95 similarity

    http://cpansearch.perl.org/src/SCW/Text-JaroWinkler-0.1/strcmp95.c
    """
    sp_mx: tuple[tuple[str, str], ...] = (
        ('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'), ('B', 'V'), ('E', 'I'),
        ('E', 'O'), ('E', 'U'), ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
        ('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'), ('W', 'V'), ('X', 'K'),
        ('S', 'Z'), ('X', 'S'), ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
        ('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'), ('5', 'S'), ('8', 'B'),
        ('1', 'I'), ('1', 'L'), ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J'),
    )

    def __init__(self, long_strings: bool = False, external: bool = True) -> None:
        self.long_strings = long_strings
        self.external = external

    def maximum(self, *sequences: Sequence[object]) -> int:
        return 1

    @staticmethod
    def _in_range(char) -> bool:
        return 0 < ord(char) < 91

    def __call__(self, s1: str, s2: str) -> float:
        s1 = s1.strip().upper()
        s2 = s2.strip().upper()

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result

        len_s1 = len(s1)
        len_s2 = len(s2)

        adjwt = defaultdict(int)

        # Initialize the adjwt array on the first call to the function only.
        # The adjwt array is used to give partial credit for characters that
        # may be errors due to known phonetic or character recognition errors.
        # A typical example is to match the letter "O" with the number "0"
        for c1, c2 in self.sp_mx:
            adjwt[c1, c2] = 3
            adjwt[c2, c1] = 3

        if len_s1 > len_s2:
            search_range = len_s1
            minv = len_s2
        else:
            search_range = len_s2
            minv = len_s1

        # Blank out the flags
        s1_flag = [0] * search_range
        s2_flag = [0] * search_range
        search_range = max(0, search_range // 2 - 1)

        # Looking only within the search range, count and flag the matched pairs.
        num_com = 0
        yl1 = len_s2 - 1
        for i, sc1 in enumerate(s1):
            lowlim = max(i - search_range, 0)
            hilim = min(i + search_range, yl1)
            for j in range(lowlim, hilim + 1):
                if s2_flag[j] == 0 and s2[j] == sc1:
                    s2_flag[j] = 1
                    s1_flag[i] = 1
                    num_com += 1
                    break

        # If no characters in common - return
        if num_com == 0:
            return 0.0

        # Count the number of transpositions
        k = n_trans = 0
        for i, sc1 in enumerate(s1):
            if not s1_flag[i]:
                continue
            for j in range(k, len_s2):
                if s2_flag[j] != 0:
                    k = j + 1
                    break
            if sc1 != s2[j]:
                n_trans += 1
        n_trans = n_trans // 2

        # Adjust for similarities in unmatched characters
        n_simi = 0
        if minv > num_com:
            for i in range(len_s1):
                if s1_flag[i] != 0:
                    continue
                if not self._in_range(s1[i]):
                    continue
                for j in range(len_s2):
                    if s2_flag[j] != 0:
                        continue
                    if not self._in_range(s2[j]):
                        continue
                    if (s1[i], s2[j]) not in adjwt:
                        continue
                    n_simi += adjwt[s1[i], s2[j]]
                    s2_flag[j] = 2
                    break
        num_sim = n_simi / 10.0 + num_com

        # Main weight computation
        weight = num_sim / len_s1 + num_sim / len_s2
        weight += (num_com - n_trans) / num_com
        weight = weight / 3.0

        # Continue to boost the weight if the strings are similar
        if weight <= 0.7:
            return weight

        # Adjust for having up to the first 4 characters in common
        j = min(minv, 4)
        i = 0
        for sc1, sc2 in zip(s1, s2):
            if i >= j:
                break
            if sc1 != sc2:
                break
            if sc1.isdigit():
                break
            i += 1
        if i:
            weight += i * 0.1 * (1.0 - weight)

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if not self.long_strings:
            return weight
        if minv <= 4:
            return weight
        if num_com <= i + 1 or 2 * num_com < minv + i:
            return weight
        if s1[0].isdigit():
            return weight
        res = (num_com - i - 1) / (len_s1 + len_s2 - i * 2 + 2)
        weight += (1.0 - weight) * res
        return weight


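# Usage sketch (illustrative): strcmp95 upper-cases its input and, like
# Jaro-Winkler, boosts the score for a shared prefix; unmatched characters that
# appear in the sp_mx table (e.g. 'O'/'0') earn partial credit:
#
#     StrCmp95()('martha', 'marhta')   # -> ~0.961 (same as JaroWinkler here)

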

class MLIPNS(_BaseSimilarity):
    """
    Compute the MLIPNS similarity between two or more sequences.
    MLIPNS (Modified Language-Independent Product Name Search) is a
    Hamming-based measure: it returns 1 if the sequences are considered
    matching and 0 otherwise, tolerating a limited number of mismatches.

    http://www.sial.iias.spb.su/files/386-386-1-PB.pdf
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/mlipns.js
    """

    def __init__(
        self, threshold: float = 0.25,
        maxmismatches: int = 2,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.threshold = threshold
        self.maxmismatches = maxmismatches
        self.external = external

    def maximum(self, *sequences: Sequence[object]) -> int:
        return 1

    def __call__(self, *sequences: Sequence[object]) -> float:
        sequences = self._get_sequences(*sequences)

        result = self.quick_answer(*sequences)
        if result is not None:
            return result

        mismatches = 0
        ham = Hamming()(*sequences)
        maxlen = max(map(len, sequences))
        while all(sequences) and mismatches <= self.maxmismatches:
            if not maxlen:
                return 1
            if 1 - (maxlen - ham) / maxlen <= self.threshold:
                return 1
            mismatches += 1
            ham -= 1
            maxlen -= 1

        if not maxlen:
            return 1
        return 0


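# Usage sketch (illustrative): MLIPNS is binary, returning 1 when the sequences
# are close enough under the threshold/maxmismatches rules and 0 otherwise:
#
#     MLIPNS()('Tomato', 'Tamato')     # -> 1  (a single mismatch is tolerated)
#     MLIPNS()('Tomato', 'Potatoes')   # -> 0  (too many mismatches)

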

hamming = Hamming()
levenshtein = Levenshtein()
damerau = damerau_levenshtein = DamerauLevenshtein()
jaro = Jaro()
jaro_winkler = JaroWinkler()
needleman_wunsch = NeedlemanWunsch()
smith_waterman = SmithWaterman()
gotoh = Gotoh()
strcmp95 = StrCmp95()
mlipns = MLIPNS()
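

# A minimal demo sketch (illustrative, not part of the library): the module-level
# instances above are ready-made shortcuts with default parameters and can be
# called directly.
if __name__ == '__main__':
    # Levenshtein: 'kitten' -> 'sitting' needs two substitutions and one insertion.
    print(levenshtein('kitten', 'sitting'))               # 3
    # Jaro-Winkler: classic pair with a shared 3-character prefix.
    print(round(jaro_winkler('MARTHA', 'MARHTA'), 3))     # ~0.961
    # Hamming with default settings compares positions pairwise.
    print(hamming('test', 'text'))                        # 1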