from __future__ import annotations

# built-in
from collections import defaultdict
from itertools import zip_longest
from typing import Any, Sequence, TypeVar

# app
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity
from .types import SimFunc, TestFunc


try:
    # external
    import numpy
except ImportError:
    numpy = None  # type: ignore[assignment]


__all__ = [
    'Hamming', 'MLIPNS',
    'Levenshtein', 'DamerauLevenshtein',
    'Jaro', 'JaroWinkler', 'StrCmp95',
    'NeedlemanWunsch', 'Gotoh', 'SmithWaterman',

    'hamming', 'mlipns',
    'levenshtein', 'damerau_levenshtein',
    'jaro', 'jaro_winkler', 'strcmp95',
    'needleman_wunsch', 'gotoh', 'smith_waterman',
]
T = TypeVar('T')


class Hamming(_Base):
    """
    Compute the Hamming distance between two or more sequences.
    The Hamming distance is the number of differing items in ordered sequences.

    https://en.wikipedia.org/wiki/Hamming_distance
    """

    def __init__(
        self,
        qval: int = 1,
        test_func: TestFunc | None = None,
        truncate: bool = False,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.test_func = test_func or self._ident
        self.truncate = truncate
        self.external = external

    def __call__(self, *sequences: Sequence[object]) -> int:
        sequences = self._get_sequences(*sequences)

        result = self.quick_answer(*sequences)
        if result is not None:
            assert isinstance(result, int)
            return result

        _zip = zip if self.truncate else zip_longest
        return sum(not self.test_func(*es) for es in _zip(*sequences))


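# Usage sketch (illustrative only, not part of the library's test suite).
# By default the shorter sequence is padded, so a length difference also adds
# to the distance; ``truncate=True`` compares only up to the shorter length.
# The values below are the expected results of the pure-Python path:
#
#     Hamming()('karolin', 'kathrin')                              # -> 3
#     Hamming(truncate=True, external=False)('karl', 'karlsson')   # -> 0

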
class Levenshtein(_Base):
    """
    Compute the absolute Levenshtein distance between the two sequences.
    The Levenshtein distance is the minimum number of edit operations necessary
    for transforming one sequence into the other. The edit operations allowed are:

        * deletion: ABC -> BC, AC, AB
        * insertion: ABC -> ABCD, EABC, AEBC..
        * substitution: ABC -> ABE, ADC, FBC..

    https://en.wikipedia.org/wiki/Levenshtein_distance
    TODO: https://gist.github.com/kylebgorman/1081951/9b38b7743a3cb5167ab2c6608ac8eea7fc629dca
    """

    def __init__(
        self,
        qval: int = 1,
        test_func: TestFunc | None = None,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.test_func = test_func or self._ident
        self.external = external

    def _recursive(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        # TODO: more than 2 sequences support
        if not s1 or not s2:
            return len(s1) + len(s2)

        if self.test_func(s1[-1], s2[-1]):
            return self(s1[:-1], s2[:-1])

        # deletion/insertion
        d = min(
            self(s1[:-1], s2),
            self(s1, s2[:-1]),
        )
        # substitution
        s = self(s1[:-1], s2[:-1])
        return min(d, s) + 1

    def _cycled(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        """
        source:
        https://github.com/jamesturk/jellyfish/blob/master/jellyfish/_jellyfish.py#L18
        """
        rows = len(s1) + 1
        cols = len(s2) + 1
        prev = None
        cur: Any
        if numpy:
            cur = numpy.arange(cols)
        else:
            cur = range(cols)

        for r in range(1, rows):
            prev, cur = cur, [r] + [0] * (cols - 1)
            for c in range(1, cols):
                deletion = prev[c] + 1
                insertion = cur[c - 1] + 1
                dist = self.test_func(s1[r - 1], s2[c - 1])
                edit = prev[c - 1] + (not dist)
                cur[c] = min(edit, deletion, insertion)
        return int(cur[-1])

    def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            assert isinstance(result, int)
            return result

        return self._cycled(s1, s2)


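# Usage sketch (illustrative only): the classic pair needs two substitutions
# and one insertion, so three edits in total; the normalized similarity is
# 1 - distance / max(len(s1), len(s2)).
#
#     Levenshtein()('kitten', 'sitting')                        # -> 3
#     Levenshtein().normalized_similarity('kitten', 'sitting')  # -> 4/7, about 0.571

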
class DamerauLevenshtein(_Base):
    """
    Compute the absolute Damerau-Levenshtein distance between the two sequences.
    The Damerau-Levenshtein distance is the minimum number of edit operations necessary
    for transforming one sequence into the other. The edit operations allowed are:

        * deletion: ABC -> BC, AC, AB
        * insertion: ABC -> ABCD, EABC, AEBC..
        * substitution: ABC -> ABE, ADC, FBC..
        * transposition: ABC -> ACB, BAC

    If `restricted=False`, the unrestricted distance is calculated,
    where the same character can be touched more than once.
    So the distance between BA and ACB is 2: BA -> AB -> ACB
    (with the default restricted variant the distance is 3).

    https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
    """

    def __init__(
        self,
        qval: int = 1,
        test_func: TestFunc | None = None,
        external: bool = True,
        restricted: bool = True,
    ) -> None:
        self.qval = qval
        self.test_func = test_func or self._ident
        self.external = external
        self.restricted = restricted

    def _numpy(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        # TODO: doesn't pass tests, need improve
        d = numpy.zeros([len(s1) + 1, len(s2) + 1], dtype=int)

        # matrix
        for i in range(-1, len(s1) + 1):
            d[i][-1] = i + 1
        for j in range(-1, len(s2) + 1):
            d[-1][j] = j + 1

        for i, cs1 in enumerate(s1):
            for j, cs2 in enumerate(s2):
                cost = int(not self.test_func(cs1, cs2))
                # ^ 0 if equal, 1 otherwise

                d[i][j] = min(
                    d[i - 1][j] + 1,  # deletion
                    d[i][j - 1] + 1,  # insertion
                    d[i - 1][j - 1] + cost,  # substitution
                )

                # transposition
                if not i or not j:
                    continue
                if not self.test_func(cs1, s2[j - 1]):
                    continue
                d[i][j] = min(
                    d[i][j],
                    d[i - 2][j - 2] + cost,
                )

        return d[len(s1) - 1][len(s2) - 1]

    def _pure_python_unrestricted(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        """https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
        """
        d: dict[tuple[int, int], int] = {}
        da: dict[T, int] = {}

        len1 = len(s1)
        len2 = len(s2)

        maxdist = len1 + len2
        d[-1, -1] = maxdist

        # matrix
        for i in range(len(s1) + 1):
            d[i, -1] = maxdist
            d[i, 0] = i
        for j in range(len(s2) + 1):
            d[-1, j] = maxdist
            d[0, j] = j

        for i, cs1 in enumerate(s1, start=1):
            db = 0
            for j, cs2 in enumerate(s2, start=1):
                i1 = da.get(cs2, 0)
                j1 = db
                if self.test_func(cs1, cs2):
                    cost = 0
                    db = j
                else:
                    cost = 1

                d[i, j] = min(
                    d[i - 1, j - 1] + cost,  # substitution
                    d[i, j - 1] + 1,  # insertion
                    d[i - 1, j] + 1,  # deletion
                    d[i1 - 1, j1 - 1] + (i - i1) - 1 + (j - j1),  # transposition
                )
            da[cs1] = i

        return d[len1, len2]

    def _pure_python_restricted(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        """
        https://www.guyrutenberg.com/2008/12/15/damerau-levenshtein-distance-in-python/
        """
        d: dict[tuple[int, int], int] = {}

        # matrix
        for i in range(-1, len(s1) + 1):
            d[i, -1] = i + 1
        for j in range(-1, len(s2) + 1):
            d[-1, j] = j + 1

        for i, cs1 in enumerate(s1):
            for j, cs2 in enumerate(s2):
                cost = int(not self.test_func(cs1, cs2))
                # ^ 0 if equal, 1 otherwise

                d[i, j] = min(
                    d[i - 1, j] + 1,  # deletion
                    d[i, j - 1] + 1,  # insertion
                    d[i - 1, j - 1] + cost,  # substitution
                )

                # transposition
                if not i or not j:
                    continue
                if not self.test_func(cs1, s2[j - 1]):
                    continue
                if not self.test_func(s1[i - 1], cs2):
                    continue
                d[i, j] = min(
                    d[i, j],
                    d[i - 2, j - 2] + cost,
                )

        return d[len(s1) - 1, len(s2) - 1]

    def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> int:
        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result  # type: ignore[return-value]

        # if numpy:
        #     return self._numpy(s1, s2)
        # else:
        if self.restricted:
            return self._pure_python_restricted(s1, s2)
        return self._pure_python_unrestricted(s1, s2)


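# Usage sketch (illustrative only): the default restricted variant (optimal
# string alignment) never edits the same substring twice, while
# ``restricted=False`` allows it and can therefore yield a smaller distance.
# Expected results of the pure-Python implementations:
#
#     DamerauLevenshtein()('ab', 'ba')                                   # -> 1 (one transposition)
#     DamerauLevenshtein(restricted=True, external=False)('ba', 'acb')   # -> 3
#     DamerauLevenshtein(restricted=False, external=False)('ba', 'acb')  # -> 2 (BA -> AB -> ACB)

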
class JaroWinkler(_BaseSimilarity):
    """
    Computes the Jaro-Winkler measure between two strings.
    The Jaro-Winkler measure is designed to capture cases where two strings
    have a low Jaro score but share a prefix
    and thus are likely to match.

    https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro.js
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro-winkler.js
    """

    def __init__(
        self,
        long_tolerance: bool = False,
        winklerize: bool = True,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.long_tolerance = long_tolerance
        self.winklerize = winklerize
        self.external = external

    def maximum(self, *sequences: Sequence[object]) -> int:
        return 1

    def __call__(self, s1: Sequence[T], s2: Sequence[T], prefix_weight: float = 0.1) -> float:
        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result

        s1_len = len(s1)
        s2_len = len(s2)

        if not s1_len or not s2_len:
            return 0.0

        min_len = min(s1_len, s2_len)
        search_range = max(s1_len, s2_len)
        search_range = (search_range // 2) - 1
        if search_range < 0:
            search_range = 0

        s1_flags = [False] * s1_len
        s2_flags = [False] * s2_len

        # looking only within search range, count & flag matched pairs
        common_chars = 0
        for i, s1_ch in enumerate(s1):
            low = max(0, i - search_range)
            hi = min(i + search_range, s2_len - 1)
            for j in range(low, hi + 1):
                if not s2_flags[j] and s2[j] == s1_ch:
                    s1_flags[i] = s2_flags[j] = True
                    common_chars += 1
                    break

        # short circuit if no characters match
        if not common_chars:
            return 0.0

        # count transpositions
        k = trans_count = 0
        for i, s1_f in enumerate(s1_flags):
            if s1_f:
                for j in range(k, s2_len):
                    if s2_flags[j]:
                        k = j + 1
                        break
                if s1[i] != s2[j]:
                    trans_count += 1
        trans_count //= 2

        # adjust for similarities in nonmatched characters
        weight = common_chars / s1_len + common_chars / s2_len
        weight += (common_chars - trans_count) / common_chars
        weight /= 3

        # stop to boost if strings are not similar
        if not self.winklerize:
            return weight
        if weight <= 0.7:
            return weight

        # winkler modification
        # adjust for up to first 4 chars in common
        j = min(min_len, 4)
        i = 0
        while i < j and s1[i] == s2[i]:
            i += 1
        if i:
            weight += i * prefix_weight * (1.0 - weight)

        # optionally adjust for long strings
        # after agreeing beginning chars, at least two or more must agree and
        # agreed characters must be > half of remaining characters
        if not self.long_tolerance or min_len <= 4:
            return weight
        if common_chars <= i + 1 or 2 * common_chars < min_len + i:
            return weight
        tmp = (common_chars - i - 1) / (s1_len + s2_len - i * 2 + 2)
        weight += (1.0 - weight) * tmp
        return weight


class Jaro(JaroWinkler):
    def __init__(
        self,
        long_tolerance: bool = False,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        super().__init__(
            long_tolerance=long_tolerance,
            winklerize=False,
            qval=qval,
            external=external,
        )


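# Usage sketch (illustrative only): ``Jaro`` is ``JaroWinkler`` without the
# prefix boost; the shared "MAR" prefix lifts the Jaro-Winkler score above the
# plain Jaro score for this classic pair.
#
#     round(Jaro()('MARTHA', 'MARHTA'), 3)         # -> 0.944
#     round(JaroWinkler()('MARTHA', 'MARHTA'), 3)  # -> 0.961

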
class NeedlemanWunsch(_BaseSimilarity):
    """
    Computes the Needleman-Wunsch measure between two strings.
    The Needleman-Wunsch algorithm generalizes the Levenshtein distance and considers
    global alignment between two strings. Specifically, it is computed by assigning
    a score to each alignment between the two input strings and choosing the
    score of the best alignment, that is, the maximal score.
    An alignment between two strings is a set of correspondences between their
    characters, allowing for gaps.

    https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
    """

    def __init__(
        self,
        gap_cost: float = 1.0,
        sim_func: SimFunc = None,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.gap_cost = gap_cost
        if sim_func:
            self.sim_func = sim_func
        else:
            self.sim_func = self._ident
        self.external = external

    def minimum(self, *sequences: Sequence[object]) -> float:
        return -max(map(len, sequences)) * self.gap_cost

    def maximum(self, *sequences: Sequence[object]) -> float:
        return max(map(len, sequences))

    def distance(self, *sequences: Sequence[object]) -> float:
        """Get distance between sequences
        """
        return -1 * self.similarity(*sequences)

    def normalized_distance(self, *sequences: Sequence[object]) -> float:
        """Get distance from 0 to 1
        """
        minimum = self.minimum(*sequences)
        maximum = self.maximum(*sequences)
        if maximum == 0:
            return 0
        return (self.distance(*sequences) - minimum) / (maximum - minimum)

    def normalized_similarity(self, *sequences: Sequence[object]) -> float:
        """Get similarity from 0 to 1
        """
        minimum = self.minimum(*sequences)
        maximum = self.maximum(*sequences)
        if maximum == 0:
            return 1
        return (self.similarity(*sequences) - minimum) / (maximum * 2)

    def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> float:
        if not numpy:
            raise ImportError('Please, install numpy for Needleman-Wunsch measure')

        s1, s2 = self._get_sequences(s1, s2)

        # result = self.quick_answer(s1, s2)
        # if result is not None:
        #     return result * self.maximum(s1, s2)

        dist_mat = numpy.zeros(
            (len(s1) + 1, len(s2) + 1),
            dtype=float,
        )
        # DP initialization
        for i in range(len(s1) + 1):
            dist_mat[i, 0] = -(i * self.gap_cost)
        # DP initialization
        for j in range(len(s2) + 1):
            dist_mat[0, j] = -(j * self.gap_cost)
        # Needleman-Wunsch DP calculation
        for i, c1 in enumerate(s1, 1):
            for j, c2 in enumerate(s2, 1):
                match = dist_mat[i - 1, j - 1] + self.sim_func(c1, c2)
                delete = dist_mat[i - 1, j] - self.gap_cost
                insert = dist_mat[i, j - 1] - self.gap_cost
                dist_mat[i, j] = max(match, delete, insert)
        return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1]


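# Usage sketch (illustrative only): with the default identity ``sim_func`` a
# match scores +1, a mismatch 0, and each gap position costs ``gap_cost``.
# Globally aligning 'AGT' against 'AAGT' keeps all three matches and pays for
# one gap, so the expected scores are:
#
#     NeedlemanWunsch()('AGT', 'AAGT')            # -> 2.0  (3 matches - 1 gap)
#     NeedlemanWunsch(gap_cost=2)('AGT', 'AAGT')  # -> 1.0  (3 matches - one gap of cost 2)

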
class SmithWaterman(_BaseSimilarity):
    """
    Computes the Smith-Waterman measure between two strings.
    The Smith-Waterman algorithm performs local sequence alignment;
    that is, it determines similar regions between two strings.
    Instead of looking at the total sequence, the Smith-Waterman algorithm compares
    segments of all possible lengths and optimizes the similarity measure.

    https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/smith-waterman.js
    """

    def __init__(
        self,
        gap_cost: float = 1.0,
        sim_func: SimFunc = None,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.gap_cost = gap_cost
        self.sim_func = sim_func or self._ident
        self.external = external

    def maximum(self, *sequences: Sequence[object]) -> int:
        return min(map(len, sequences))

    def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> float:
        if not numpy:
            raise ImportError('Please, install numpy for Smith-Waterman measure')

        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result

        dist_mat = numpy.zeros(
            (len(s1) + 1, len(s2) + 1),
            dtype=float,
        )
        for i, sc1 in enumerate(s1, start=1):
            for j, sc2 in enumerate(s2, start=1):
                # The score for substituting the letter a[i - 1] for b[j - 1].
                # Generally low for mismatch, high for match.
                match = dist_mat[i - 1, j - 1] + self.sim_func(sc1, sc2)
                # The scores for introducing extra letters in one of the strings
                # (or by symmetry, deleting them from the other).
                delete = dist_mat[i - 1, j] - self.gap_cost
                insert = dist_mat[i, j - 1] - self.gap_cost
                dist_mat[i, j] = max(0, match, delete, insert)
        return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1]


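# Usage sketch (illustrative only): with the default identity ``sim_func``,
# matching characters add +1 and mismatches add nothing, so aligning 'test'
# with 'text' scores the three shared characters:
#
#     SmithWaterman()('test', 'text')  # -> 3.0

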
class Gotoh(NeedlemanWunsch):
    """Gotoh score
    Gotoh's algorithm is essentially Needleman-Wunsch with affine gap
    penalties:
    https://www.cs.umd.edu/class/spring2003/cmsc838t/papers/gotoh1982.pdf
    """

    def __init__(
        self,
        gap_open: int = 1,
        gap_ext: float = 0.4,
        sim_func: SimFunc = None,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.gap_open = gap_open
        self.gap_ext = gap_ext
        if sim_func:
            self.sim_func = sim_func
        else:
            self.sim_func = self._ident
        self.external = external

    def minimum(self, *sequences: Sequence[object]) -> int:
        return -min(map(len, sequences))

    def maximum(self, *sequences: Sequence[object]) -> int:
        return min(map(len, sequences))

    def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> float:
        if not numpy:
            raise ImportError('Please, install numpy for Gotoh measure')

        s1, s2 = self._get_sequences(s1, s2)

        # result = self.quick_answer(s1, s2)
        # if result is not None:
        #     return result * self.maximum(s1, s2)

        len_s1 = len(s1)
        len_s2 = len(s2)
        d_mat = numpy.zeros((len_s1 + 1, len_s2 + 1), dtype=float)
        p_mat = numpy.zeros((len_s1 + 1, len_s2 + 1), dtype=float)
        q_mat = numpy.zeros((len_s1 + 1, len_s2 + 1), dtype=float)

        d_mat[0, 0] = 0
        p_mat[0, 0] = float('-inf')
        q_mat[0, 0] = float('-inf')
        for i in range(1, len_s1 + 1):
            d_mat[i, 0] = float('-inf')
            p_mat[i, 0] = -self.gap_open - self.gap_ext * (i - 1)
            q_mat[i, 0] = float('-inf')
            q_mat[i, 1] = -self.gap_open
        for j in range(1, len_s2 + 1):
            d_mat[0, j] = float('-inf')
            p_mat[0, j] = float('-inf')
            p_mat[1, j] = -self.gap_open
            q_mat[0, j] = -self.gap_open - self.gap_ext * (j - 1)

        for i, sc1 in enumerate(s1, start=1):
            for j, sc2 in enumerate(s2, start=1):
                sim_val = self.sim_func(sc1, sc2)
                d_mat[i, j] = max(
                    d_mat[i - 1, j - 1] + sim_val,
                    p_mat[i - 1, j - 1] + sim_val,
                    q_mat[i - 1, j - 1] + sim_val,
                )
                p_mat[i, j] = max(
                    d_mat[i - 1, j] - self.gap_open,
                    p_mat[i - 1, j] - self.gap_ext,
                )
                q_mat[i, j] = max(
                    d_mat[i, j - 1] - self.gap_open,
                    q_mat[i, j - 1] - self.gap_ext,
                )

        i, j = (n - 1 for n in d_mat.shape)
        return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])


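# Usage sketch (illustrative only): gaps pay ``gap_open`` once and ``gap_ext``
# for each further extension, so a single one-character gap costs only the
# opening penalty here (3 matches - 1 = 2):
#
#     Gotoh()('AGT', 'AAGT')  # -> 2.0

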
class StrCmp95(_BaseSimilarity):
    """strcmp95 similarity

    http://cpansearch.perl.org/src/SCW/Text-JaroWinkler-0.1/strcmp95.c
    """
    sp_mx: tuple[tuple[str, str], ...] = (
        ('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'), ('B', 'V'), ('E', 'I'),
        ('E', 'O'), ('E', 'U'), ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
        ('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'), ('W', 'V'), ('X', 'K'),
        ('S', 'Z'), ('X', 'S'), ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
        ('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'), ('5', 'S'), ('8', 'B'),
        ('1', 'I'), ('1', 'L'), ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J'),
    )

    def __init__(self, long_strings: bool = False, external: bool = True) -> None:
        self.long_strings = long_strings
        self.external = external

    def maximum(self, *sequences: Sequence[object]) -> int:
        return 1

    @staticmethod
    def _in_range(char) -> bool:
        return 0 < ord(char) < 91

    def __call__(self, s1: str, s2: str) -> float:
        s1 = s1.strip().upper()
        s2 = s2.strip().upper()

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result

        len_s1 = len(s1)
        len_s2 = len(s2)

        adjwt = defaultdict(int)

        # Initialize the adjwt array on the first call to the function only.
        # The adjwt array is used to give partial credit for characters that
        # may be errors due to known phonetic or character recognition errors.
        # A typical example is to match the letter "O" with the number "0"
        for c1, c2 in self.sp_mx:
            adjwt[c1, c2] = 3
            adjwt[c2, c1] = 3

        if len_s1 > len_s2:
            search_range = len_s1
            minv = len_s2
        else:
            search_range = len_s2
            minv = len_s1

        # Blank out the flags
        s1_flag = [0] * search_range
        s2_flag = [0] * search_range
        search_range = max(0, search_range // 2 - 1)

        # Looking only within the search range, count and flag the matched pairs.
        num_com = 0
        yl1 = len_s2 - 1
        for i, sc1 in enumerate(s1):
            lowlim = max(i - search_range, 0)
            hilim = min(i + search_range, yl1)
            for j in range(lowlim, hilim + 1):
                if s2_flag[j] == 0 and s2[j] == sc1:
                    s2_flag[j] = 1
                    s1_flag[i] = 1
                    num_com += 1
                    break

        # If no characters in common - return
        if num_com == 0:
            return 0.0

        # Count the number of transpositions
        k = n_trans = 0
        for i, sc1 in enumerate(s1):
            if not s1_flag[i]:
                continue
            for j in range(k, len_s2):
                if s2_flag[j] != 0:
                    k = j + 1
                    break
            if sc1 != s2[j]:
                n_trans += 1
        n_trans = n_trans // 2

        # Adjust for similarities in unmatched characters
        n_simi = 0
        if minv > num_com:
            for i in range(len_s1):
                if s1_flag[i] != 0:
                    continue
                if not self._in_range(s1[i]):
                    continue
                for j in range(len_s2):
                    if s2_flag[j] != 0:
                        continue
                    if not self._in_range(s2[j]):
                        continue
                    if (s1[i], s2[j]) not in adjwt:
                        continue
                    n_simi += adjwt[s1[i], s2[j]]
                    s2_flag[j] = 2
                    break
        num_sim = n_simi / 10.0 + num_com

        # Main weight computation
        weight = num_sim / len_s1 + num_sim / len_s2
        weight += (num_com - n_trans) / num_com
        weight = weight / 3.0

        # Continue to boost the weight if the strings are similar
        if weight <= 0.7:
            return weight

        # Adjust for having up to the first 4 characters in common
        j = min(minv, 4)
        i = 0
        for sc1, sc2 in zip(s1, s2):
            if i >= j:
                break
            if sc1 != sc2:
                break
            if sc1.isdigit():
                break
            i += 1
        if i:
            weight += i * 0.1 * (1.0 - weight)

        # Optionally adjust for long strings.

        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of remaining characters.
        if not self.long_strings:
            return weight
        if minv <= 4:
            return weight
        if num_com <= i + 1 or 2 * num_com < minv + i:
            return weight
        if s1[0].isdigit():
            return weight
        res = (num_com - i - 1) / (len_s1 + len_s2 - i * 2 + 2)
        weight += (1.0 - weight) * res
        return weight


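# Usage sketch (illustrative only): strcmp95 behaves like Jaro-Winkler on
# clean input, and the ``sp_mx`` table additionally gives partial credit for
# look-alike pairs such as 'Z'/'2' and 'O'/'0':
#
#     round(StrCmp95()('MARTHA', 'MARHTA'), 3)  # -> 0.961
#     StrCmp95()('ZERO', '2ER0')                # -> about 0.767 (partial credit for Z/2 and O/0)

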
class MLIPNS(_BaseSimilarity):
    """
    Compute the Hamming-based MLIPNS similarity between two sequences.
    MLIPNS is a binary measure: it returns 1 if the sequences are considered
    similar (the share of mismatches stays within `threshold` after discarding
    at most `maxmismatches` mismatching items) and 0 otherwise.

    http://www.sial.iias.spb.su/files/386-386-1-PB.pdf
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/mlipns.js
    """

    def __init__(
        self,
        threshold: float = 0.25,
        maxmismatches: int = 2,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.threshold = threshold
        self.maxmismatches = maxmismatches
        self.external = external

    def maximum(self, *sequences: Sequence[object]) -> int:
        return 1

    def __call__(self, *sequences: Sequence[object]) -> float:
        sequences = self._get_sequences(*sequences)

        result = self.quick_answer(*sequences)
        if result is not None:
            return result

        mismatches = 0
        ham = Hamming()(*sequences)
        maxlen = max(map(len, sequences))
        while all(sequences) and mismatches <= self.maxmismatches:
            if not maxlen:
                return 1
            if 1 - (maxlen - ham) / maxlen <= self.threshold:
                return 1
            mismatches += 1
            ham -= 1
            maxlen -= 1

        if not maxlen:
            return 1
        return 0


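# Usage sketch (illustrative only): MLIPNS is binary, reporting 1 when the
# mismatch share stays within ``threshold`` after dropping at most
# ``maxmismatches`` mismatching positions, and 0 otherwise:
#
#     MLIPNS()('Tom', 'Tam')  # -> 1
#     MLIPNS()('abc', 'xyz')  # -> 0

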
hamming = Hamming()
levenshtein = Levenshtein()
damerau = damerau_levenshtein = DamerauLevenshtein()
jaro = Jaro()
jaro_winkler = JaroWinkler()
needleman_wunsch = NeedlemanWunsch()
smith_waterman = SmithWaterman()
gotoh = Gotoh()
strcmp95 = StrCmp95()
mlipns = MLIPNS()