Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/spatial/distance.py: 15%

1"""

2Distance computations (:mod:`scipy.spatial.distance`)

3=====================================================

5.. sectionauthor:: Damian Eads

7Function reference

8------------------

10Distance matrix computation from a collection of raw observation vectors

11stored in a rectangular array.

13.. autosummary::

14 :toctree: generated/

16 pdist -- pairwise distances between observation vectors.

17 cdist -- distances between two collections of observation vectors

18 squareform -- convert distance matrix to a condensed one and vice versa

19 directed_hausdorff -- directed Hausdorff distance between arrays

21Predicates for checking the validity of distance matrices, both

22condensed and redundant. Also contained in this module are functions

23for computing the number of observations in a distance matrix.

25.. autosummary::

26 :toctree: generated/

28 is_valid_dm -- checks for a valid distance matrix

29 is_valid_y -- checks for a valid condensed distance matrix

30 num_obs_dm -- # of observations in a distance matrix

31 num_obs_y -- # of observations in a condensed distance matrix

33Distance functions between two numeric vectors ``u`` and ``v``. Computing

34distances over a large collection of vectors is inefficient for these

35functions. Use ``pdist`` for this purpose.

37.. autosummary::

38 :toctree: generated/

40 braycurtis -- the Bray-Curtis distance.

41 canberra -- the Canberra distance.

42 chebyshev -- the Chebyshev distance.

43 cityblock -- the Manhattan distance.

44 correlation -- the Correlation distance.

45 cosine -- the Cosine distance.

46 euclidean -- the Euclidean distance.

47 jensenshannon -- the Jensen-Shannon distance.

48 mahalanobis -- the Mahalanobis distance.

49 minkowski -- the Minkowski distance.

50 seuclidean -- the normalized Euclidean distance.

51 sqeuclidean -- the squared Euclidean distance.

53Distance functions between two boolean vectors (representing sets) ``u`` and

54``v``. As in the case of numerical vectors, ``pdist`` is more efficient for

55computing the distances between all pairs.

57.. autosummary::

58 :toctree: generated/

60 dice -- the Dice dissimilarity.

61 hamming -- the Hamming distance.

62 jaccard -- the Jaccard distance.

63 kulsinski -- the Kulsinski distance.

64 kulczynski1 -- the Kulczynski 1 distance.

65 rogerstanimoto -- the Rogers-Tanimoto dissimilarity.

66 russellrao -- the Russell-Rao dissimilarity.

67 sokalmichener -- the Sokal-Michener dissimilarity.

68 sokalsneath -- the Sokal-Sneath dissimilarity.

69 yule -- the Yule dissimilarity.

71:func:`hamming` also operates over discrete numerical vectors.

72"""

74# Copyright (C) Damian Eads, 2007-2008. New BSD License.

# Public names exported by ``from scipy.spatial.distance import *``.
# Kept in alphabetical order.
__all__ = [
    'braycurtis',
    'canberra',
    'cdist',
    'chebyshev',
    'cityblock',
    'correlation',
    'cosine',
    'dice',
    'directed_hausdorff',
    'euclidean',
    'hamming',
    'is_valid_dm',
    'is_valid_y',
    'jaccard',
    'jensenshannon',
    'kulsinski',
    'kulczynski1',
    'mahalanobis',
    'minkowski',
    'num_obs_dm',
    'num_obs_y',
    'pdist',
    'rogerstanimoto',
    'russellrao',
    'seuclidean',
    'sokalmichener',
    'sokalsneath',
    'sqeuclidean',
    'squareform',
    'yule'
]

108

109

110import warnings

111import numpy as np

112import dataclasses

113

114from typing import List, Optional, Set, Callable

115

116from functools import partial

117from scipy._lib._util import _asarray_validated

118

119from . import _distance_wrap

120from . import _hausdorff

121from ..linalg import norm

122from ..special import rel_entr

123

124from . import _distance_pybind

125

126from .._lib.deprecation import _deprecated

127

128def _copy_array_if_base_present(a):

129 """Copy the array if its base points to a parent array."""

130 if a.base is not None:

131 return a.copy()

132 return a

133

134

def _correlation_cdist_wrap(XA, XB, dm, **kwargs):
    """Fill `dm` with correlation distances between rows of `XA` and `XB`.

    Each row is centered on its own mean and the centered data is handed
    to the cosine cdist wrapper, which writes its result into `dm`.
    """
    centered_a = XA - XA.mean(axis=1, keepdims=True)
    centered_b = XB - XB.mean(axis=1, keepdims=True)
    _distance_wrap.cdist_cosine_double_wrap(centered_a, centered_b, dm,
                                            **kwargs)

139

140

def _correlation_pdist_wrap(X, dm, **kwargs):
    """Fill `dm` with pairwise correlation distances between rows of `X`.

    Each row is centered on its own mean and the centered data is handed
    to the cosine pdist wrapper, which writes its result into `dm`.
    """
    row_means = X.mean(axis=1, keepdims=True)
    centered = X - row_means
    _distance_wrap.pdist_cosine_double_wrap(centered, dm, **kwargs)

144

145

146def _convert_to_type(X, out_type):

147 return np.ascontiguousarray(X, dtype=out_type)

148

149

150def _nbool_correspond_all(u, v, w=None):

151 if u.dtype == v.dtype == bool and w is None:

152 not_u = ~u

153 not_v = ~v

154 nff = (not_u & not_v).sum()

155 nft = (not_u & v).sum()

156 ntf = (u & not_v).sum()

157 ntt = (u & v).sum()

158 else:

159 dtype = np.result_type(int, u.dtype, v.dtype)

160 u = u.astype(dtype)

161 v = v.astype(dtype)

162 not_u = 1.0 - u

163 not_v = 1.0 - v

164 if w is not None:

165 not_u = w * not_u

166 u = w * u

167 nff = (not_u * not_v).sum()

168 nft = (not_u * v).sum()

169 ntf = (u * not_v).sum()

170 ntt = (u * v).sum()

171 return (nff, nft, ntf, ntt)

172

173

174def _nbool_correspond_ft_tf(u, v, w=None):

175 if u.dtype == v.dtype == bool and w is None:

176 not_u = ~u

177 not_v = ~v

178 nft = (not_u & v).sum()

179 ntf = (u & not_v).sum()

180 else:

181 dtype = np.result_type(int, u.dtype, v.dtype)

182 u = u.astype(dtype)

183 v = v.astype(dtype)

184 not_u = 1.0 - u

185 not_v = 1.0 - v

186 if w is not None:

187 not_u = w * not_u

188 u = w * u

189 nft = (not_u * v).sum()

190 ntf = (u * not_v).sum()

191 return (nft, ntf)

192

193

def _validate_cdist_input(XA, XB, mA, mB, n, metric_info, **kwargs):
    """Prepare the two cdist inputs and keyword arguments for a metric.

    The working dtype is ``XA.dtype`` when the metric lists it among its
    supported types, otherwise the metric's first listed type.  Both
    inputs are converted to that dtype, and the metric's validator (if
    it has one) is given the pair ``(XA, XB)``, the combined row count
    ``mA + mB``, `n` and the keyword arguments.

    Returns ``(XA, XB, typ, kwargs)``.
    """
    supported = metric_info.types
    if XA.dtype in supported:
        typ = supported[supported.index(XA.dtype)]
    else:
        typ = supported[0]

    XA = _convert_to_type(XA, out_type=typ)
    XB = _convert_to_type(XB, out_type=typ)

    validator = metric_info.validator
    if validator:
        kwargs = validator((XA, XB), mA + mB, n, **kwargs)
    return XA, XB, typ, kwargs

208

209

210def _validate_weight_with_size(X, m, n, **kwargs):

211 w = kwargs.pop('w', None)

212 if w is None:

213 return kwargs

214

215 if w.ndim != 1 or w.shape[0] != n:

216 raise ValueError("Weights must have same size as input vector. "

217 f"{w.shape[0]} vs. {n}")

218

219 kwargs['w'] = _validate_weights(w)

220 return kwargs

221

222

223def _validate_hamming_kwargs(X, m, n, **kwargs):

224 w = kwargs.get('w', np.ones((n,), dtype='double'))

225

226 if w.ndim != 1 or w.shape[0] != n:

227 raise ValueError("Weights must have same size as input vector. %d vs. %d" % (w.shape[0], n))

228

229 kwargs['w'] = _validate_weights(w)

230 return kwargs

231

232

233def _validate_mahalanobis_kwargs(X, m, n, **kwargs):

234 VI = kwargs.pop('VI', None)

235 if VI is None:

236 if m <= n:

237 # There are fewer observations than the dimension of

238 # the observations.

239 raise ValueError("The number of observations (%d) is too "

240 "small; the covariance matrix is "

241 "singular. For observations with %d "

242 "dimensions, at least %d observations "

243 "are required." % (m, n, n + 1))

244 if isinstance(X, tuple):

245 X = np.vstack(X)

246 CV = np.atleast_2d(np.cov(X.astype(np.double, copy=False).T))

247 VI = np.linalg.inv(CV).T.copy()

248 kwargs["VI"] = _convert_to_double(VI)

249 return kwargs

250

251

def _validate_minkowski_kwargs(X, m, n, **kwargs):
    """Validate the ``w`` and ``p`` keywords for the Minkowski metric.

    Weights are checked by `_validate_weight_with_size`.  A missing
    ``p`` defaults to ``2.``; a supplied ``p`` must be strictly
    positive.

    Raises
    ------
    ValueError
        If ``p`` is supplied and is not greater than 0.
    """
    kwargs = _validate_weight_with_size(X, m, n, **kwargs)
    if 'p' in kwargs:
        if kwargs['p'] <= 0:
            raise ValueError("p must be greater than 0")
    else:
        kwargs['p'] = 2.

    return kwargs

261

262

def _validate_pdist_input(X, m, n, metric_info, **kwargs):
    """Prepare the pdist input and keyword arguments for a metric.

    The working dtype is ``X.dtype`` when the metric lists it among its
    supported types, otherwise the metric's first listed type.  `X` is
    converted to that dtype, and the metric's validator (if it has one)
    is applied to the keyword arguments.

    Returns ``(X, typ, kwargs)``.
    """
    supported = metric_info.types
    if X.dtype in supported:
        typ = supported[supported.index(X.dtype)]
    else:
        typ = supported[0]

    X = _convert_to_type(X, out_type=typ)

    validator = metric_info.validator
    if validator:
        kwargs = validator(X, m, n, **kwargs)
    return X, typ, kwargs

276

277

278def _validate_seuclidean_kwargs(X, m, n, **kwargs):

279 V = kwargs.pop('V', None)

280 if V is None:

281 if isinstance(X, tuple):

282 X = np.vstack(X)

283 V = np.var(X.astype(np.double, copy=False), axis=0, ddof=1)

284 else:

285 V = np.asarray(V, order='c')

286 if len(V.shape) != 1:

287 raise ValueError('Variance vector V must '

288 'be one-dimensional.')

289 if V.shape[0] != n:

290 raise ValueError('Variance vector V must be of the same '

291 'dimension as the vectors on which the distances '

292 'are computed.')

293 kwargs['V'] = _convert_to_double(V)

294 return kwargs

295

296

297def _validate_vector(u, dtype=None):

298 # XXX Is order='c' really necessary?

299 u = np.asarray(u, dtype=dtype, order='c')

300 if u.ndim == 1:

301 return u

302 raise ValueError("Input vector should be 1-D.")

303

304

def _validate_weights(w, dtype=np.double):
    """Convert `w` to a 1-D array of `dtype` with no negative entries.

    Raises
    ------
    ValueError
        If `w` is not 1-D or contains a negative value.
    """
    weights = _validate_vector(w, dtype=dtype)
    if (weights < 0).any():
        raise ValueError("Input weights should be all non-negative")
    return weights

310

311

def directed_hausdorff(u, v, seed=0):
    """
    Compute the directed Hausdorff distance between two 2-D arrays.

    Distances between pairs are calculated using a Euclidean metric.

    Parameters
    ----------
    u : (M,N) array_like
        Input array.
    v : (O,N) array_like
        Input array.
    seed : int or None
        Local `numpy.random.RandomState` seed. Default is 0, a random
        shuffling of u and v that guarantees reproducibility.

    Returns
    -------
    d : double
        The directed Hausdorff distance between arrays `u` and `v`.

    index_1 : int
        Index of the point contributing to the Hausdorff pair in `u`.

    index_2 : int
        Index of the point contributing to the Hausdorff pair in `v`.

    Raises
    ------
    ValueError
        An exception is thrown if `u` and `v` do not have
        the same number of columns.

    Notes
    -----
    Uses the early break technique and the random sampling approach
    described by [1]_. Although worst-case performance is ``O(m * o)``
    (as with the brute force algorithm), this is unlikely in practice
    as the input data would have to require the algorithm to explore
    every single point interaction, and after the algorithm shuffles
    the input points at that. The best case performance is O(m), which
    is satisfied by selecting an inner loop distance that is less than
    cmax and leads to an early break as often as possible. The authors
    have formally shown that the average runtime is closer to O(m).

    .. versionadded:: 0.19.0

    References
    ----------
    .. [1] A. A. Taha and A. Hanbury, "An efficient algorithm for
           calculating the exact Hausdorff distance." IEEE Transactions On
           Pattern Analysis And Machine Intelligence, vol. 37 pp. 2153-63,
           2015.

    See Also
    --------
    scipy.spatial.procrustes : Another similarity test for two data sets

    Examples
    --------
    Find the directed Hausdorff distance between two 2-D arrays of
    coordinates:

    >>> from scipy.spatial.distance import directed_hausdorff
    >>> import numpy as np
    >>> u = np.array([(1.0, 0.0),
    ...               (0.0, 1.0),
    ...               (-1.0, 0.0),
    ...               (0.0, -1.0)])
    >>> v = np.array([(2.0, 0.0),
    ...               (0.0, 2.0),
    ...               (-2.0, 0.0),
    ...               (0.0, -4.0)])

    >>> directed_hausdorff(u, v)[0]
    2.23606797749979
    >>> directed_hausdorff(v, u)[0]
    3.0

    Find the general (symmetric) Hausdorff distance between two 2-D
    arrays of coordinates:

    >>> max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])
    3.0

    Find the indices of the points that generate the Hausdorff distance
    (the Hausdorff pair):

    >>> directed_hausdorff(v, u)[1:]
    (3, 3)

    """
    u = np.asarray(u, dtype=np.float64, order='c')
    v = np.asarray(v, dtype=np.float64, order='c')
    n_cols_u = u.shape[1]
    n_cols_v = v.shape[1]
    if n_cols_u != n_cols_v:
        raise ValueError('u and v need to have the same '
                         'number of columns')
    return _hausdorff.directed_hausdorff(u, v, seed)

411

412

def minkowski(u, v, p=2, w=None):
    """
    Compute the Minkowski distance between two 1-D arrays.

    The Minkowski distance between 1-D arrays `u` and `v`,
    is defined as

    .. math::

       {\\|u-v\\|}_p = (\\sum{|u_i - v_i|^p})^{1/p}.

    When weights `w` are given, the weighted form is used instead:

    .. math::

       \\left(\\sum{w_i(|(u_i - v_i)|^p)}\\right)^{1/p}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    p : scalar
        The order of the norm of the difference :math:`{\\|u-v\\|}_p`. Note
        that for :math:`0 < p < 1`, the triangle inequality only holds with
        an additional multiplicative factor, i.e. it is only a quasi-metric.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    minkowski : double
        The Minkowski distance between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If `p` is not greater than 0.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 1)
    2.0
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 2)
    1.4142135623730951
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 3)
    1.2599210498948732
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 1)
    1.0
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 2)
    1.0
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 3)
    1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if p <= 0:
        raise ValueError("p must be greater than 0")
    diff = u - v
    if w is None:
        return norm(diff, ord=p)

    w = _validate_weights(w)
    # Scale each difference by w_i ** (1/p) so that the plain p-norm of
    # the scaled vector equals the weighted distance.
    if p == 1:
        scale = w
    elif p == 2:
        # better precision and speed
        scale = np.sqrt(w)
    elif p == np.inf:
        scale = (w != 0)
    else:
        scale = np.power(w, 1/p)
    return norm(scale * diff, ord=p)

482

483

def euclidean(u, v, w=None):
    """
    Computes the Euclidean distance between two 1-D arrays.

    The Euclidean distance between 1-D arrays `u` and `v`, is defined as

    .. math::

       {\\|u-v\\|}_2

    or, when weights `w` are given,

    .. math::

       \\left(\\sum{(w_i |(u_i - v_i)|^2)}\\right)^{1/2}

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    euclidean : double
        The Euclidean distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.euclidean([1, 0, 0], [0, 1, 0])
    1.4142135623730951
    >>> distance.euclidean([1, 1, 0], [0, 1, 0])
    1.0

    """
    # The Euclidean distance is the Minkowski distance of order 2.
    order = 2
    return minkowski(u, v, p=order, w=w)

521

522

def sqeuclidean(u, v, w=None):
    """
    Compute the squared Euclidean distance between two 1-D arrays.

    The squared Euclidean distance between `u` and `v` is defined as

    .. math::

       {\\|u-v\\|}_2^2

    or, when weights `w` are given,

    .. math::

       \\left(\\sum{(w_i |(u_i - v_i)|^2)}\\right)

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    sqeuclidean : double
        The squared Euclidean distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.sqeuclidean([1, 0, 0], [0, 1, 0])
    2.0
    >>> distance.sqeuclidean([1, 1, 0], [0, 1, 0])
    1.0

    """
    # Preserve float dtypes, but convert everything else to np.float64
    # for stability.
    u_is_inexact = hasattr(u, "dtype") and np.issubdtype(u.dtype, np.inexact)
    v_is_inexact = hasattr(v, "dtype") and np.issubdtype(v.dtype, np.inexact)
    utype = None if u_is_inexact else np.float64
    vtype = None if v_is_inexact else np.float64

    u = _validate_vector(u, dtype=utype)
    v = _validate_vector(v, dtype=vtype)
    diff = u - v
    if w is None:
        return np.dot(diff, diff)
    # only want weights applied once
    w = _validate_weights(w)
    return np.dot(diff, w * diff)

575

576

def correlation(u, v, w=None, centered=True):
    """
    Compute the correlation distance between two 1-D arrays.

    The correlation distance between `u` and `v`, is
    defined as

    .. math::

        1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
                  {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2}

    where :math:`\\bar{u}` is the mean of the elements of `u`
    and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0
    centered : bool, optional
        If True, `u` and `v` will be centered. Default is True.

    Returns
    -------
    correlation : double
        The correlation distance between 1-D array `u` and `v`.

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)

    def _wmean(values):
        # (Weighted) mean; ``weights=None`` gives the plain mean.
        return np.average(values, weights=w)

    if centered:
        u_mean = _wmean(u)
        v_mean = _wmean(v)
        u = u - u_mean
        v = v - v_mean
    cross = _wmean(u * v)
    u_sq = _wmean(np.square(u))
    v_sq = _wmean(np.square(v))
    dist = 1.0 - cross / np.sqrt(u_sq * v_sq)
    # Return absolute value to avoid small negative value due to rounding
    return np.abs(dist)

625

626

def cosine(u, v, w=None):
    """
    Compute the Cosine distance between 1-D arrays.

    The Cosine distance between `u` and `v`, is defined as

    .. math::

        1 - \\frac{u \\cdot v}
                  {\\|u\\|_2 \\|v\\|_2}.

    where :math:`u \\cdot v` is the dot product of :math:`u` and
    :math:`v`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    cosine : double
        The Cosine distance between vectors `u` and `v`, limited to the
        interval ``[0, 2]``.

    Notes
    -----
    If the underlying computation yields NaN (for instance the ``0/0``
    that arises when one of the vectors has zero norm), NaN is returned
    rather than being silently turned into a distance of 0.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.cosine([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.cosine([100, 0, 0], [0, 1, 0])
    1.0
    >>> distance.cosine([1, 1, 0], [0, 1, 0])
    0.29289321881345254

    """
    # cosine distance is also referred to as 'uncentered correlation',
    #   or 'reflective correlation'
    dist = correlation(u, v, w=w, centered=False)
    # Clamp the result to 0-2.  ``np.clip`` is used instead of the
    # builtin ``max``/``min`` because those discard NaN depending on
    # argument order (``max(0, nan)`` is ``0``), which would report an
    # undefined distance as "identical vectors".
    return np.clip(dist, 0.0, 2.0)

671

672

def hamming(u, v, w=None):
    """
    Compute the Hamming distance between two 1-D arrays.

    The Hamming distance between 1-D arrays `u` and `v`, is simply the
    proportion of disagreeing components in `u` and `v`. If `u` and `v` are
    boolean vectors, the Hamming distance is

    .. math::

       \\frac{c_{01} + c_{10}}{n}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    hamming : double
        The Hamming distance between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If `u` and `v` do not have the same shape.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.hamming([1, 0, 0], [0, 1, 0])
    0.66666666666666663
    >>> distance.hamming([1, 0, 0], [1, 1, 0])
    0.33333333333333331
    >>> distance.hamming([1, 0, 0], [2, 0, 0])
    0.33333333333333331
    >>> distance.hamming([1, 0, 0], [3, 0, 0])
    0.33333333333333331

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if u.shape != v.shape:
        raise ValueError('The 1d arrays must have equal lengths.')
    mismatch = u != v
    weights = None if w is None else _validate_weights(w)
    # The (weighted) mean of the mismatch mask is the disagreeing share.
    return np.average(mismatch, weights=weights)

725

726

def jaccard(u, v, w=None):
    """
    Compute the Jaccard-Needham dissimilarity between two boolean 1-D arrays.

    The Jaccard-Needham dissimilarity between 1-D boolean arrays `u` and `v`,
    is defined as

    .. math::

       \\frac{c_{TF} + c_{FT}}
            {c_{TT} + c_{FT} + c_{TF}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    jaccard : double
        The Jaccard distance between vectors `u` and `v`.

    Notes
    -----
    When both `u` and `v` lead to a `0/0` division i.e. there is no overlap
    between the items in the vectors the returned distance is 0. See the
    Wikipedia page on the Jaccard index [1]_, and this paper [2]_.

    .. versionchanged:: 1.2.0
        Previously, when `u` and `v` lead to a `0/0` division, the function
        would return NaN. This was changed to return 0 instead.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Jaccard_index
    .. [2] S. Kosub, "A note on the triangle inequality for the Jaccard
       distance", 2016, :arxiv:`1612.02696`

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.jaccard([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.jaccard([1, 0, 0], [1, 1, 0])
    0.5
    >>> distance.jaccard([1, 0, 0], [1, 2, 0])
    0.5
    >>> distance.jaccard([1, 0, 0], [1, 1, 1])
    0.66666666666666663

    """
    u = _validate_vector(u)
    v = _validate_vector(v)

    # Positions where at least one input is nonzero, and the subset of
    # those where the two inputs differ.
    either_set = (u != 0) | (v != 0)
    differing = (u != v) & either_set
    if w is not None:
        w = _validate_weights(w)
        either_set = w * either_set
        differing = w * differing
    numerator = np.double(differing.sum())
    denominator = np.double(either_set.sum())
    if denominator != 0:
        return numerator / denominator
    return 0

799

800

@_deprecated("Kulsinski has been deprecated from scipy.spatial.distance"
             " in SciPy 1.9.0 and it will be removed in SciPy 1.11.0."
             " It is superseded by scipy.spatial.distance.kulczynski1.")
def kulsinski(u, v, w=None):
    """
    Compute the Kulsinski dissimilarity between two boolean 1-D arrays.

    The Kulsinski dissimilarity between two boolean 1-D arrays `u` and `v`,
    is defined as

    .. math::

         \\frac{c_{TF} + c_{FT} - c_{TT} + n}
              {c_{FT} + c_{TF} + n}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    .. deprecated:: 1.9.0
       `kulsinski` has been deprecated from `scipy.spatial.distance` in
       SciPy 1.9.0 and it will be removed in SciPy 1.11.0. It is superseded
       by `scipy.spatial.distance.kulczynski1`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    kulsinski : double
        The Kulsinski distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.kulsinski([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.kulsinski([1, 0, 0], [1, 1, 0])
    0.75
    >>> distance.kulsinski([1, 0, 0], [2, 1, 0])
    0.33333333333333331
    >>> distance.kulsinski([1, 0, 0], [3, 1, 0])
    -0.5

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is None:
        n = float(len(u))
    else:
        w = _validate_weights(w)
        # With weights, ``n`` is the total weight rather than the length.
        n = w.sum()
    (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v, w=w)

    return (ntf + nft - ntt + n) / (ntf + nft + n)

863

864

def kulczynski1(u, v, *, w=None):
    """
    Compute the Kulczynski 1 dissimilarity between two boolean 1-D arrays.

    The Kulczynski 1 dissimilarity between two boolean 1-D arrays `u` and `v`
    of length ``n``, is defined as

    .. math::

         \\frac{c_{11}}
              {c_{01} + c_{10}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k \\in {0, 1, ..., n-1}`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    kulczynski1 : float
        The Kulczynski 1 distance between vectors `u` and `v`.

    Notes
    -----
    This measure has a minimum value of 0 and no upper limit.
    It is un-defined when there are no non-matches.

    .. versionadded:: 1.8.0

    References
    ----------
    .. [1] Kulczynski S. et al. Bulletin
           International de l'Academie Polonaise des Sciences
           et des Lettres, Classe des Sciences Mathematiques
           et Naturelles, Serie B (Sciences Naturelles). 1927;
           Supplement II: 57-203.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.kulczynski1([1, 0, 0], [0, 1, 0])
    0.0
    >>> distance.kulczynski1([True, False, False], [True, True, False])
    1.0
    >>> distance.kulczynski1([True, False, False], [True])
    0.5
    >>> distance.kulczynski1([1, 0, 0], [3, 1, 0])
    -3.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    weights = None if w is None else _validate_weights(w)
    # Only the three totals involving at least one "true" are needed.
    counts = _nbool_correspond_all(u, v, w=weights)
    n_false_true, n_true_false, n_true_true = counts[1:]

    return n_true_true / (n_true_false + n_false_true)

931

932

def seuclidean(u, v, V):
    """
    Return the standardized Euclidean distance between two 1-D arrays.

    The standardized Euclidean distance between `u` and `v` is the
    Euclidean distance with each component weighted by ``1/V``.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    V : (N,) array_like
        `V` is an 1-D array of component variances. It is usually computed
        among a larger collection vectors.

    Returns
    -------
    seuclidean : double
        The standardized Euclidean distance between vectors `u` and `v`.

    Raises
    ------
    TypeError
        If `u`, `v` and `V` do not all have the same length.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [0.1, 0.1, 0.1])
    4.4721359549995796
    >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [1, 0.1, 0.1])
    3.3166247903553998
    >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [10, 0.1, 0.1])
    3.1780497164141406

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    V = _validate_vector(V, dtype=np.float64)
    if not (V.shape[0] == u.shape[0] == v.shape[0]):
        raise TypeError('V must be a 1-D array of the same dimension '
                        'as u and v.')
    inverse_variance = 1/V
    return euclidean(u, v, w=inverse_variance)

972

973

def cityblock(u, v, w=None):
    """
    Compute the City Block (Manhattan) distance.

    Computes the Manhattan distance between two 1-D arrays `u` and `v`,
    which is defined as

    .. math::

       \\sum_i {\\left| u_i - v_i \\right|}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    cityblock : double
        The City Block (Manhattan) distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.cityblock([1, 0, 0], [0, 1, 0])
    2
    >>> distance.cityblock([1, 0, 0], [0, 2, 0])
    3
    >>> distance.cityblock([1, 0, 0], [1, 1, 0])
    1

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    abs_diff = abs(u - v)
    if w is None:
        return abs_diff.sum()
    w = _validate_weights(w)
    return (w * abs_diff).sum()

1018

1019

def mahalanobis(u, v, VI):
    """
    Compute the Mahalanobis distance between two 1-D arrays.

    The Mahalanobis distance between 1-D arrays `u` and `v`, is defined as

    .. math::

       \\sqrt{ (u-v) V^{-1} (u-v)^T }

    where ``V`` is the covariance matrix. Note that the argument `VI`
    is the inverse of ``V``.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    VI : array_like
        The inverse of the covariance matrix.

    Returns
    -------
    mahalanobis : double
        The Mahalanobis distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> iv = [[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]]
    >>> distance.mahalanobis([1, 0, 0], [0, 1, 0], iv)
    1.0
    >>> distance.mahalanobis([0, 2, 0], [0, 1, 0], iv)
    1.0
    >>> distance.mahalanobis([2, 0, 0], [0, 1, 0], iv)
    1.7320508075688772

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    VI = np.atleast_2d(VI)
    diff = u - v
    # Quadratic form diff . VI . diff, evaluated left to right.
    left = np.dot(diff, VI)
    quad = np.dot(left, diff)
    return np.sqrt(quad)

1065

1066

def chebyshev(u, v, w=None):
    """
    Compute the Chebyshev distance.

    Computes the Chebyshev distance between two 1-D arrays `u` and `v`,
    which is defined as

    .. math::

       \\max_i {|u_i-v_i|}.

    Parameters
    ----------
    u : (N,) array_like
        Input vector.
    v : (N,) array_like
        Input vector.
    w : (N,) array_like, optional
        Weights for each value in `u` and `v`. Because 'max' is a
        weightless operation, the size of a positive weight has no
        effect; however, components whose weight is zero are excluded
        from the maximum. Default is None, which uses every component.

    Returns
    -------
    chebyshev : double
        The Chebyshev distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.chebyshev([1, 0, 0], [0, 1, 0])
    1
    >>> distance.chebyshev([1, 1, 0], [0, 1, 0])
    1

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
        has_weight = w > 0
        # Only filter when at least one component carries zero weight.
        if has_weight.sum() < w.size:
            u = u[has_weight]
            v = v[has_weight]
    return max(abs(u - v))

1110

1111

def braycurtis(u, v, w=None):
    """
    Compute the Bray-Curtis distance between two 1-D arrays.

    Bray-Curtis distance is defined as

    .. math::

       \\sum{|u_i-v_i|} / \\sum{|u_i+v_i|}

    The Bray-Curtis distance is in the range [0, 1] if all coordinates are
    positive, and is undefined if the inputs are of length zero.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    braycurtis : double
        The Bray-Curtis distance between 1-D arrays `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.braycurtis([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.braycurtis([1, 1, 0], [0, 1, 0])
    0.33333333333333331

    """
    u = _validate_vector(u)
    v = _validate_vector(v, dtype=np.float64)
    abs_diff = abs(u - v)
    abs_sum = abs(u + v)
    if w is not None:
        w = _validate_weights(w)
        abs_diff = w * abs_diff
        abs_sum = w * abs_sum
    numerator = abs_diff.sum()
    denominator = abs_sum.sum()
    return numerator / denominator

1158

1159

def canberra(u, v, w=None):
    """
    Compute the Canberra distance between two 1-D arrays.

    The Canberra distance is defined as

    .. math::

         d(u,v) = \\sum_i \\frac{|u_i-v_i|}
                              {|u_i|+|v_i|}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    canberra : double
        The Canberra distance between vectors `u` and `v`.

    Notes
    -----
    When `u[i]` and `v[i]` are 0 for given i, then the fraction 0/0 = 0 is
    used in the calculation.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.canberra([1, 0, 0], [0, 1, 0])
    2.0
    >>> distance.canberra([1, 1, 0], [0, 1, 0])
    1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v, dtype=np.float64)
    if w is not None:
        w = _validate_weights(w)
    # 0/0 terms evaluate to NaN; silence the resulting "invalid" warning and
    # let nansum skip them, which realises the 0/0 = 0 convention.
    with np.errstate(invalid='ignore'):
        terms = abs(u - v) / (abs(u) + abs(v))
        if w is not None:
            terms = w * terms
        return np.nansum(terms)

1213

1214

def jensenshannon(p, q, base=None, *, axis=0, keepdims=False):
    """
    Compute the Jensen-Shannon distance (metric) between
    two probability arrays. This is the square root
    of the Jensen-Shannon divergence.

    The Jensen-Shannon distance between two probability
    vectors `p` and `q` is defined as,

    .. math::

       \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}

    where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
    and :math:`D` is the Kullback-Leibler divergence.

    This routine will normalize `p` and `q` if they don't sum to 1.0.

    Parameters
    ----------
    p : (N,) array_like
        left probability vector
    q : (N,) array_like
        right probability vector
    base : double, optional
        the base of the logarithm used to compute the output
        if not given, then the routine uses the default base of
        scipy.stats.entropy.
    axis : int, optional
        Axis along which the Jensen-Shannon distances are computed. The default
        is 0.

        .. versionadded:: 1.7.0
    keepdims : bool, optional
        If this is set to `True`, the reduced axes are left in the
        result as dimensions with size one. With this option,
        the result will broadcast correctly against the input array.
        Default is False.

        .. versionadded:: 1.7.0

    Returns
    -------
    js : double or ndarray
        The Jensen-Shannon distances between `p` and `q` along the `axis`.

    Notes
    -----

    .. versionadded:: 1.2.0

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> import numpy as np
    >>> distance.jensenshannon([1.0, 0.0, 0.0], [0.0, 1.0, 0.0], 2.0)
    1.0
    >>> distance.jensenshannon([1.0, 0.0], [0.5, 0.5])
    0.46450140402245893
    >>> distance.jensenshannon([1.0, 0.0, 0.0], [1.0, 0.0, 0.0])
    0.0
    >>> a = np.array([[1, 2, 3, 4],
    ...               [5, 6, 7, 8],
    ...               [9, 10, 11, 12]])
    >>> b = np.array([[13, 14, 15, 16],
    ...               [17, 18, 19, 20],
    ...               [21, 22, 23, 24]])
    >>> distance.jensenshannon(a, b, axis=0)
    array([0.1954288, 0.1447697, 0.1138377, 0.0927636])
    >>> distance.jensenshannon(a, b, axis=1)
    array([0.1402339, 0.0399106, 0.0201815])

    """
    p = np.asarray(p)
    q = np.asarray(q)
    # Normalise along `axis`; keepdims=True keeps the totals broadcastable
    # against the inputs.
    p = p / np.sum(p, axis=axis, keepdims=True)
    q = q / np.sum(q, axis=axis, keepdims=True)
    m = (p + q) / 2.0
    # D(p || m) + D(q || m), each reduced along `axis`.
    divergence = (np.sum(rel_entr(p, m), axis=axis, keepdims=keepdims)
                  + np.sum(rel_entr(q, m), axis=axis, keepdims=keepdims))
    if base is not None:
        # Change of logarithm base.
        divergence /= np.log(base)
    return np.sqrt(divergence / 2.0)

1301

1302

def yule(u, v, w=None):
    """
    Compute the Yule dissimilarity between two boolean 1-D arrays.

    The Yule dissimilarity is defined as

    .. math::

         \\frac{R}{c_{TT} * c_{FF} + \\frac{R}{2}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n` and :math:`R = 2.0 * c_{TF} * c_{FT}`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    yule : double
        The Yule dissimilarity between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.yule([1, 0, 0], [0, 1, 0])
    2.0
    >>> distance.yule([1, 1, 0], [0, 1, 0])
    0.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    nff, nft, ntf, ntt = _nbool_correspond_all(u, v, w=w)
    # R / 2 from the formula above.
    discordant = ntf * nft
    # Return 0 directly when the numerator vanishes, so the quotient is never
    # evaluated (its denominator may be zero as well).
    if discordant == 0:
        return 0.0
    return float(2.0 * discordant / (ntt * nff + discordant))

1351

1352

def dice(u, v, w=None):
    """
    Compute the Dice dissimilarity between two boolean 1-D arrays.

    The Dice dissimilarity between `u` and `v`, is

    .. math::

         \\frac{c_{TF} + c_{FT}}
              {2c_{TT} + c_{FT} + c_{TF}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input 1-D array.
    v : (N,) array_like, bool
        Input 1-D array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    dice : double
        The Dice dissimilarity between 1-D arrays `u` and `v`.

    Notes
    -----
    This function computes the Dice dissimilarity index. To compute the
    Dice similarity index, convert one to the other with similarity =
    1 - dissimilarity.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.dice([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.dice([1, 0, 0], [1, 1, 0])
    0.3333333333333333
    >>> distance.dice([1, 0, 0], [2, 0, 0])
    -0.3333333333333333

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    if w is None and u.dtype == v.dtype == bool:
        # Pure boolean, unweighted input: count positions set in both.
        both_true = (u & v).sum()
    else:
        # Promote both inputs to a common type that is at least `int`
        # before multiplying them.
        common = np.result_type(int, u.dtype, v.dtype)
        u, v = u.astype(common), v.astype(common)
        joint = u * v
        both_true = joint.sum() if w is None else (joint * w).sum()
    nft, ntf = _nbool_correspond_ft_tf(u, v, w=w)
    # Wrapping the denominator in a 0-d array makes this a NumPy division.
    return float((ntf + nft) / np.array(2.0 * both_true + ntf + nft))

1416

1417

def rogerstanimoto(u, v, w=None):
    """
    Compute the Rogers-Tanimoto dissimilarity between two boolean 1-D arrays.

    The Rogers-Tanimoto dissimilarity between two boolean 1-D arrays
    `u` and `v`, is defined as

    .. math::
       \\frac{R}
            {c_{TT} + c_{FF} + R}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    rogerstanimoto : double
        The Rogers-Tanimoto dissimilarity between vectors
        `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.rogerstanimoto([1, 0, 0], [0, 1, 0])
    0.8
    >>> distance.rogerstanimoto([1, 0, 0], [1, 1, 0])
    0.5
    >>> distance.rogerstanimoto([1, 0, 0], [2, 0, 0])
    -1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    nff, nft, ntf, ntt = _nbool_correspond_all(u, v, w=w)
    # R from the formula above: twice the number of disagreeing positions.
    r = 2.0 * (ntf + nft)
    return float(r) / float(ntt + nff + r)

1466

1467

def russellrao(u, v, w=None):
    """
    Compute the Russell-Rao dissimilarity between two boolean 1-D arrays.

    The Russell-Rao dissimilarity between two boolean 1-D arrays, `u` and
    `v`, is defined as

    .. math::

      \\frac{n - c_{TT}}
           {n}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    russellrao : double
        The Russell-Rao dissimilarity between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.russellrao([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.russellrao([1, 0, 0], [1, 1, 0])
    0.6666666666666666
    >>> distance.russellrao([1, 0, 0], [2, 0, 0])
    0.3333333333333333

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is None:
        # Unweighted: n is simply the vector length.
        n = float(len(u))
        if u.dtype == v.dtype == bool:
            ntt = (u & v).sum()
        else:
            ntt = (u * v).sum()
    else:
        # Weighted: n becomes the total weight.
        w = _validate_weights(w)
        ntt = (u * v * w).sum()
        n = w.sum()
    return float(n - ntt) / n

1523

1524

def sokalmichener(u, v, w=None):
    """
    Compute the Sokal-Michener dissimilarity between two boolean 1-D arrays.

    The Sokal-Michener dissimilarity between boolean 1-D arrays `u` and `v`,
    is defined as

    .. math::

       \\frac{R}
            {S + R}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`, :math:`R = 2 * (c_{TF} + c_{FT})` and
    :math:`S = c_{FF} + c_{TT}`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    sokalmichener : double
        The Sokal-Michener dissimilarity between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.sokalmichener([1, 0, 0], [0, 1, 0])
    0.8
    >>> distance.sokalmichener([1, 0, 0], [1, 1, 0])
    0.5
    >>> distance.sokalmichener([1, 0, 0], [2, 0, 0])
    -1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    nff, nft, ntf, ntt = _nbool_correspond_all(u, v, w=w)
    # R from the formula above; S is ntt + nff.
    r = 2.0 * (ntf + nft)
    return float(r) / float(ntt + nff + r)

1574

1575

def sokalsneath(u, v, w=None):
    """
    Compute the Sokal-Sneath dissimilarity between two boolean 1-D arrays.

    The Sokal-Sneath dissimilarity between `u` and `v`,

    .. math::

       \\frac{R}
            {c_{TT} + R}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    sokalsneath : double
        The Sokal-Sneath dissimilarity between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If the denominator :math:`c_{TT} + R` is zero.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.sokalsneath([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.sokalsneath([1, 0, 0], [1, 1, 0])
    0.6666666666666666
    >>> distance.sokalsneath([1, 0, 0], [2, 1, 0])
    0.0
    >>> distance.sokalsneath([1, 0, 0], [3, 1, 0])
    -2.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is None:
        if u.dtype == v.dtype == bool:
            ntt = (u & v).sum()
        else:
            ntt = (u * v).sum()
    else:
        w = _validate_weights(w)
        ntt = (u * v * w).sum()
    nft, ntf = _nbool_correspond_ft_tf(u, v, w=w)
    # R from the formula above.
    r = 2.0 * (ntf + nft)
    denom = np.array(ntt + r)
    # A zero denominator means the ratio is undefined; refuse explicitly.
    if not denom.any():
        raise ValueError('Sokal-Sneath dissimilarity is not defined for '
                         'vectors that are entirely false.')
    return float(r) / denom

1634

1635

# Partial applications of `_convert_to_type` with the output type pre-bound
# to double and to bool respectively.
_convert_to_double = partial(_convert_to_type, out_type=np.double)
_convert_to_bool = partial(_convert_to_type, out_type=bool)

# adding python-only wrappers to _distance_wrap module
# The metric wrapper classes in this file look up their kernels by name with
# getattr(_distance_wrap, '{c,p}dist_<metric>_<typ>_wrap'); attaching the
# Python correlation wrappers under those names lets 'correlation' be
# resolved through the same lookup.
_distance_wrap.pdist_correlation_double_wrap = _correlation_pdist_wrap
_distance_wrap.cdist_correlation_double_wrap = _correlation_cdist_wrap

1642

1643

@dataclasses.dataclass(frozen=True)
class CDistMetricWrapper:
    """Callable computing ``cdist`` for the metric named `metric_name`.

    Without weights the work is handed to the ``cdist_<metric>_<typ>_wrap``
    function looked up on ``_distance_wrap``; when a weight vector ``w`` is
    supplied the metric's Python ``dist_func`` is applied through
    ``_cdist_callable`` instead.
    """

    metric_name: str

    def __call__(self, XA, XB, *, out=None, **kwargs):
        XA = np.ascontiguousarray(XA)
        XB = np.ascontiguousarray(XB)
        rows_a, n_cols = XA.shape
        rows_b, _ = XB.shape
        name = self.metric_name
        info = _METRICS[name]
        XA, XB, typ, kwargs = _validate_cdist_input(
            XA, XB, rows_a, rows_b, n_cols, info, **kwargs)

        weights = kwargs.pop('w', None)
        if weights is not None:
            # Weighted case: use the Python distance function.
            return _cdist_callable(
                XA, XB, metric=info.dist_func, out=out, w=weights, **kwargs)

        result = _prepare_out_argument(out, np.double, (rows_a, rows_b))
        # The kernel is selected by metric name and validated input type.
        kernel = getattr(_distance_wrap, f'cdist_{name}_{typ}_wrap')
        kernel(XA, XB, result, **kwargs)
        return result

1669

1670

@dataclasses.dataclass(frozen=True)
class CDistWeightedMetricWrapper:
    """Callable computing ``cdist`` through a ``_distance_wrap`` kernel.

    The kernel for `metric_name` is used when no weights are given and the
    kernel for `weighted_metric` when a weight vector ``w`` is supplied.
    """

    metric_name: str
    weighted_metric: str

    def __call__(self, XA, XB, *, out=None, **kwargs):
        XA = np.ascontiguousarray(XA)
        XB = np.ascontiguousarray(XB)
        rows_a, n_cols = XA.shape
        rows_b, _ = XB.shape
        name = self.metric_name
        XA, XB, typ, kwargs = _validate_cdist_input(
            XA, XB, rows_a, rows_b, n_cols, _METRICS[name], **kwargs)
        result = _prepare_out_argument(out, np.double, (rows_a, rows_b))

        # Drop a `w` that is None so the kernel never receives it; a real
        # weight vector is put back and switches to the weighted kernel.
        weights = kwargs.pop('w', None)
        if weights is not None:
            name = self.weighted_metric
            kwargs['w'] = weights

        kernel = getattr(_distance_wrap, f'cdist_{name}_{typ}_wrap')
        kernel(XA, XB, result, **kwargs)
        return result

1695

1696

@dataclasses.dataclass(frozen=True)
class PDistMetricWrapper:
    """Callable computing ``pdist`` for the metric named `metric_name`.

    Without weights the work is handed to the ``pdist_<metric>_<typ>_wrap``
    function looked up on ``_distance_wrap``; when a weight vector ``w`` is
    supplied the metric's Python ``dist_func`` is applied through
    ``_pdist_callable`` instead.
    """

    metric_name: str

    def __call__(self, X, *, out=None, **kwargs):
        X = np.ascontiguousarray(X)
        n_obs, n_cols = X.shape
        name = self.metric_name
        info = _METRICS[name]
        X, typ, kwargs = _validate_pdist_input(
            X, n_obs, n_cols, info, **kwargs)

        weights = kwargs.pop('w', None)
        if weights is not None:
            # Weighted case: use the Python distance function.
            return _pdist_callable(
                X, metric=info.dist_func, out=out, w=weights, **kwargs)

        # One entry per unordered pair of observations.
        n_pairs = (n_obs * (n_obs - 1)) // 2
        result = _prepare_out_argument(out, np.double, (n_pairs,))
        # The kernel is selected by metric name and validated input type.
        kernel = getattr(_distance_wrap, f'pdist_{name}_{typ}_wrap')
        kernel(X, result, **kwargs)
        return result

1720

1721

@dataclasses.dataclass(frozen=True)
class PDistWeightedMetricWrapper:
    """Callable computing ``pdist`` through a ``_distance_wrap`` kernel.

    The kernel for `metric_name` is used when no weights are given and the
    kernel for `weighted_metric` when a weight vector ``w`` is supplied.
    """

    metric_name: str
    weighted_metric: str

    def __call__(self, X, *, out=None, **kwargs):
        X = np.ascontiguousarray(X)
        n_obs, n_cols = X.shape
        name = self.metric_name
        X, typ, kwargs = _validate_pdist_input(
            X, n_obs, n_cols, _METRICS[name], **kwargs)
        # One entry per unordered pair of observations.
        n_pairs = (n_obs * (n_obs - 1)) // 2
        result = _prepare_out_argument(out, np.double, (n_pairs,))

        # Drop a `w` that is None so the kernel never receives it; a real
        # weight vector is put back and switches to the weighted kernel.
        weights = kwargs.pop('w', None)
        if weights is not None:
            name = self.weighted_metric
            kwargs['w'] = weights

        kernel = getattr(_distance_wrap, f'pdist_{name}_{typ}_wrap')
        kernel(X, result, **kwargs)
        return result

1745

1746

@dataclasses.dataclass(frozen=True)
class MetricInfo:
    """Immutable record describing one distance metric in the registry."""

    # Name of python distance function
    canonical_name: str
    # All aliases, including canonical_name
    aka: Set[str]
    # unvectorized distance function
    dist_func: Callable
    # Optimized cdist function
    cdist_func: Callable
    # Optimized pdist function
    pdist_func: Callable
    # function that checks kwargs and computes default values:
    # f(X, m, n, **kwargs)
    # None (the default) means no validator is registered for the metric.
    validator: Optional[Callable] = None
    # list of supported types:
    # X (pdist) and XA (cdist) are used to choose the type. if there is no
    # match the first type is used. Default double
    # (default_factory gives every instance its own fresh list)
    types: List[str] = dataclasses.field(default_factory=lambda: ['double'])
    # true if out array must be C-contiguous
    requires_contiguous_out: bool = True

1768

1769

# Registry of implemented metrics:
# One MetricInfo per metric.  The name -> info and alias -> info lookup
# tables defined after this list are derived from it, in this order.
# `cdist_func` / `pdist_func` are either functions taken directly from
# `_distance_pybind` or wrapper objects that dispatch by metric name to
# `_distance_wrap`.
_METRIC_INFOS = [
    MetricInfo(
        canonical_name='braycurtis',
        aka={'braycurtis'},
        dist_func=braycurtis,
        cdist_func=_distance_pybind.cdist_braycurtis,
        pdist_func=_distance_pybind.pdist_braycurtis,
    ),
    MetricInfo(
        canonical_name='canberra',
        aka={'canberra'},
        dist_func=canberra,
        cdist_func=_distance_pybind.cdist_canberra,
        pdist_func=_distance_pybind.pdist_canberra,
    ),
    MetricInfo(
        canonical_name='chebyshev',
        aka={'chebychev', 'chebyshev', 'cheby', 'cheb', 'ch'},
        dist_func=chebyshev,
        cdist_func=_distance_pybind.cdist_chebyshev,
        pdist_func=_distance_pybind.pdist_chebyshev,
    ),
    MetricInfo(
        canonical_name='cityblock',
        aka={'cityblock', 'cblock', 'cb', 'c'},
        dist_func=cityblock,
        cdist_func=_distance_pybind.cdist_cityblock,
        pdist_func=_distance_pybind.pdist_cityblock,
    ),
    MetricInfo(
        canonical_name='correlation',
        aka={'correlation', 'co'},
        dist_func=correlation,
        cdist_func=CDistMetricWrapper('correlation'),
        pdist_func=PDistMetricWrapper('correlation'),
    ),
    MetricInfo(
        canonical_name='cosine',
        aka={'cosine', 'cos'},
        dist_func=cosine,
        cdist_func=CDistMetricWrapper('cosine'),
        pdist_func=PDistMetricWrapper('cosine'),
    ),
    MetricInfo(
        canonical_name='dice',
        aka={'dice'},
        types=['bool'],
        dist_func=dice,
        cdist_func=CDistMetricWrapper('dice'),
        pdist_func=PDistMetricWrapper('dice'),
    ),
    MetricInfo(
        canonical_name='euclidean',
        aka={'euclidean', 'euclid', 'eu', 'e'},
        dist_func=euclidean,
        cdist_func=_distance_pybind.cdist_euclidean,
        pdist_func=_distance_pybind.pdist_euclidean,
    ),
    # 'matching' is registered as an alias of hamming.  The same metric name
    # is given for both the unweighted and the weighted kernel.
    MetricInfo(
        canonical_name='hamming',
        aka={'matching', 'hamming', 'hamm', 'ha', 'h'},
        types=['double', 'bool'],
        validator=_validate_hamming_kwargs,
        dist_func=hamming,
        cdist_func=CDistWeightedMetricWrapper('hamming', 'hamming'),
        pdist_func=PDistWeightedMetricWrapper('hamming', 'hamming'),
    ),
    MetricInfo(
        canonical_name='jaccard',
        aka={'jaccard', 'jacc', 'ja', 'j'},
        types=['double', 'bool'],
        dist_func=jaccard,
        cdist_func=CDistMetricWrapper('jaccard'),
        pdist_func=PDistMetricWrapper('jaccard'),
    ),
    MetricInfo(
        canonical_name='jensenshannon',
        aka={'jensenshannon', 'js'},
        dist_func=jensenshannon,
        cdist_func=CDistMetricWrapper('jensenshannon'),
        pdist_func=PDistMetricWrapper('jensenshannon'),
    ),
    MetricInfo(
        canonical_name='kulsinski',
        aka={'kulsinski'},
        types=['bool'],
        dist_func=kulsinski,
        cdist_func=CDistMetricWrapper('kulsinski'),
        pdist_func=PDistMetricWrapper('kulsinski'),
    ),
    MetricInfo(
        canonical_name='kulczynski1',
        aka={'kulczynski1'},
        types=['bool'],
        dist_func=kulczynski1,
        cdist_func=CDistMetricWrapper('kulczynski1'),
        pdist_func=PDistMetricWrapper('kulczynski1'),
    ),
    MetricInfo(
        canonical_name='mahalanobis',
        aka={'mahalanobis', 'mahal', 'mah'},
        validator=_validate_mahalanobis_kwargs,
        dist_func=mahalanobis,
        cdist_func=CDistMetricWrapper('mahalanobis'),
        pdist_func=PDistMetricWrapper('mahalanobis'),
    ),
    MetricInfo(
        canonical_name='minkowski',
        aka={'minkowski', 'mi', 'm', 'pnorm'},
        validator=_validate_minkowski_kwargs,
        dist_func=minkowski,
        cdist_func=_distance_pybind.cdist_minkowski,
        pdist_func=_distance_pybind.pdist_minkowski,
    ),
    MetricInfo(
        canonical_name='rogerstanimoto',
        aka={'rogerstanimoto'},
        types=['bool'],
        dist_func=rogerstanimoto,
        cdist_func=CDistMetricWrapper('rogerstanimoto'),
        pdist_func=PDistMetricWrapper('rogerstanimoto'),
    ),
    MetricInfo(
        canonical_name='russellrao',
        aka={'russellrao'},
        types=['bool'],
        dist_func=russellrao,
        cdist_func=CDistMetricWrapper('russellrao'),
        pdist_func=PDistMetricWrapper('russellrao'),
    ),
    MetricInfo(
        canonical_name='seuclidean',
        aka={'seuclidean', 'se', 's'},
        validator=_validate_seuclidean_kwargs,
        dist_func=seuclidean,
        cdist_func=CDistMetricWrapper('seuclidean'),
        pdist_func=PDistMetricWrapper('seuclidean'),
    ),
    MetricInfo(
        canonical_name='sokalmichener',
        aka={'sokalmichener'},
        types=['bool'],
        dist_func=sokalmichener,
        cdist_func=CDistMetricWrapper('sokalmichener'),
        pdist_func=PDistMetricWrapper('sokalmichener'),
    ),
    MetricInfo(
        canonical_name='sokalsneath',
        aka={'sokalsneath'},
        types=['bool'],
        dist_func=sokalsneath,
        cdist_func=CDistMetricWrapper('sokalsneath'),
        pdist_func=PDistMetricWrapper('sokalsneath'),
    ),
    MetricInfo(
        canonical_name='sqeuclidean',
        aka={'sqeuclidean', 'sqe', 'sqeuclid'},
        dist_func=sqeuclidean,
        cdist_func=_distance_pybind.cdist_sqeuclidean,
        pdist_func=_distance_pybind.pdist_sqeuclidean,
    ),
    MetricInfo(
        canonical_name='yule',
        aka={'yule'},
        types=['bool'],
        dist_func=yule,
        cdist_func=CDistMetricWrapper('yule'),
        pdist_func=PDistMetricWrapper('yule'),
    ),
]

1941

# Lookup tables derived from the metric registry.

# canonical name -> MetricInfo
_METRICS = {info.canonical_name: info for info in _METRIC_INFOS}

# every alias (canonical name included) -> MetricInfo
_METRIC_ALIAS = {
    alias: info
    for info in _METRIC_INFOS
    for alias in info.aka
}

# canonical names, in registry order
_METRICS_NAMES = list(_METRICS)

# 'test_<canonical name>' -> MetricInfo
_TEST_METRICS = {
    f'test_{info.canonical_name}': info for info in _METRIC_INFOS
}

1950

1951

1952def pdist(X, metric='euclidean', *, out=None, **kwargs):

1953 """

1954 Pairwise distances between observations in n-dimensional space.

1955

1956 See Notes for common calling conventions.

1957

1958 Parameters

1959 ----------

1960 X : array_like

1961 An m by n array of m original observations in an

1962 n-dimensional space.

1963 metric : str or function, optional

1964 The distance metric to use. The distance function can

1965 be 'braycurtis', 'canberra', 'chebyshev', 'cityblock',

1966 'correlation', 'cosine', 'dice', 'euclidean', 'hamming',

1967 'jaccard', 'jensenshannon', 'kulczynski1',

1968 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',

1969 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',

1970 'sqeuclidean', 'yule'.

1971 **kwargs : dict, optional

1972 Extra arguments to `metric`: refer to each metric documentation for a

1973 list of all possible arguments.

1974

1975 Some possible arguments:

1976

1977 p : scalar

1978 The p-norm to apply for Minkowski, weighted and unweighted.

1979 Default: 2.

1980

1981 w : ndarray

1982 The weight vector for metrics that support weights (e.g., Minkowski).

1983

1984 V : ndarray

1985 The variance vector for standardized Euclidean.

1986 Default: var(X, axis=0, ddof=1)

1987

1988 VI : ndarray

1989 The inverse of the covariance matrix for Mahalanobis.

1990 Default: inv(cov(X.T)).T

1991

1992 out : ndarray.

1993 The output array

1994 If not None, condensed distance matrix Y is stored in this array.

1995

1996 Returns

1997 -------

1998 Y : ndarray

1999 Returns a condensed distance matrix Y. For each :math:`i` and :math:`j`

2000 (where :math:`i<j<m`),where m is the number of original observations.

2001 The metric ``dist(u=X[i], v=X[j])`` is computed and stored in entry ``m

2002 * i + j - ((i + 2) * (i + 1)) // 2``.

2003

2004 See Also

2005 --------

2006 squareform : converts between condensed distance matrices and

2007 square distance matrices.

2008

2009 Notes

2010 -----

2011 See ``squareform`` for information on how to calculate the index of

2012 this entry or to convert the condensed distance matrix to a

2013 redundant square matrix.

2014

2015 The following are common calling conventions.

2016

2017 1. ``Y = pdist(X, 'euclidean')``

2018

2019 Computes the distance between m points using Euclidean distance

2020 (2-norm) as the distance metric between the points. The points

2021 are arranged as m n-dimensional row vectors in the matrix X.

2022

2023 2. ``Y = pdist(X, 'minkowski', p=2.)``

2024

2025 Computes the distances using the Minkowski distance

2026 :math:`\\|u-v\\|_p` (:math:`p`-norm) where :math:`p > 0` (note

2027 that this is only a quasi-metric if :math:`0 < p < 1`).

2028

2029 3. ``Y = pdist(X, 'cityblock')``

2030

2031 Computes the city block or Manhattan distance between the

2032 points.

2033

2034 4. ``Y = pdist(X, 'seuclidean', V=None)``

2035

2036 Computes the standardized Euclidean distance. The standardized

2037 Euclidean distance between two n-vectors ``u`` and ``v`` is

2038

2039 .. math::

2040

2041 \\sqrt{\\sum {(u_i-v_i)^2 / V[x_i]}}

2042

2043

2044 V is the variance vector; V[i] is the variance computed over all

2045 the i'th components of the points. If not passed, it is

2046 automatically computed.

2047

2048 5. ``Y = pdist(X, 'sqeuclidean')``

2049

2050 Computes the squared Euclidean distance :math:`\\|u-v\\|_2^2` between

2051 the vectors.

2052

2053 6. ``Y = pdist(X, 'cosine')``

2054

2055 Computes the cosine distance between vectors u and v,

2056

2057 .. math::

2058

2059 1 - \\frac{u \\cdot v}

2060 {{\\|u\\|}_2 {\\|v\\|}_2}

2061

2062 where :math:`\\|*\\|_2` is the 2-norm of its argument ``*``, and

2063 :math:`u \\cdot v` is the dot product of ``u`` and ``v``.

2064

2065 7. ``Y = pdist(X, 'correlation')``

2066

2067 Computes the correlation distance between vectors u and v. This is

2068

2069 .. math::

2070

2071 1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}

2072 {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2}

2073

2074 where :math:`\\bar{v}` is the mean of the elements of vector v,

2075 and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.

2076

2077 8. ``Y = pdist(X, 'hamming')``

2078

2079 Computes the normalized Hamming distance, or the proportion of

2080 those vector elements between two n-vectors ``u`` and ``v``

2081 which disagree. To save memory, the matrix ``X`` can be of type

2082 boolean.

2083

2084 9. ``Y = pdist(X, 'jaccard')``

2085

2086 Computes the Jaccard distance between the points. Given two

2087 vectors, ``u`` and ``v``, the Jaccard distance is the

2088 proportion of those elements ``u[i]`` and ``v[i]`` that

2089 disagree.

2090

2091 10. ``Y = pdist(X, 'jensenshannon')``

2092

2093 Computes the Jensen-Shannon distance between two probability arrays.

2094 Given two probability vectors, :math:`p` and :math:`q`, the

2095 Jensen-Shannon distance is

2096

2097 .. math::

2098

2099 \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}

2100

2101 where :math:`m` is the pointwise mean of :math:`p` and :math:`q`

2102 and :math:`D` is the Kullback-Leibler divergence.

2103

2104 11. ``Y = pdist(X, 'chebyshev')``

2105

2106 Computes the Chebyshev distance between the points. The

2107 Chebyshev distance between two n-vectors ``u`` and ``v`` is the

2108 maximum norm-1 distance between their respective elements. More

2109 precisely, the distance is given by

2110

2111 .. math::

2112

2113 d(u,v) = \\max_i {|u_i-v_i|}

2114

2115 12. ``Y = pdist(X, 'canberra')``

2116

2117 Computes the Canberra distance between the points. The

2118 Canberra distance between two points ``u`` and ``v`` is

2119

2120 .. math::

2121

2122 d(u,v) = \\sum_i \\frac{|u_i-v_i|}

2123 {|u_i|+|v_i|}

2124

2125

2126 13. ``Y = pdist(X, 'braycurtis')``

2127

2128 Computes the Bray-Curtis distance between the points. The

2129 Bray-Curtis distance between two points ``u`` and ``v`` is

2130

2131

2132 .. math::

2133

2134 d(u,v) = \\frac{\\sum_i {|u_i-v_i|}}

2135 {\\sum_i {|u_i+v_i|}}

2136

2137 14. ``Y = pdist(X, 'mahalanobis', VI=None)``

2138

2139 Computes the Mahalanobis distance between the points. The

2140 Mahalanobis distance between two points ``u`` and ``v`` is

2141 :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI``

2142 variable) is the inverse covariance. If ``VI`` is not None,

2143 ``VI`` will be used as the inverse covariance matrix.

2144

2145 15. ``Y = pdist(X, 'yule')``

2146

2147 Computes the Yule distance between each pair of boolean

2148 vectors. (see yule function documentation)

2149

2150 16. ``Y = pdist(X, 'matching')``

2151

2152 Synonym for 'hamming'.

2153

2154 17. ``Y = pdist(X, 'dice')``

2155

2156 Computes the Dice distance between each pair of boolean

2157 vectors. (see dice function documentation)

2158

2159 18. ``Y = pdist(X, 'kulczynski1')``

2160

2161 Computes the kulczynski1 distance between each pair of

2162 boolean vectors. (see kulczynski1 function documentation)

2163

2164 19. ``Y = pdist(X, 'rogerstanimoto')``

2165

2166 Computes the Rogers-Tanimoto distance between each pair of

2167 boolean vectors. (see rogerstanimoto function documentation)

2168

2169 20. ``Y = pdist(X, 'russellrao')``

2170

2171 Computes the Russell-Rao distance between each pair of

2172 boolean vectors. (see russellrao function documentation)

2173

2174 21. ``Y = pdist(X, 'sokalmichener')``

2175

2176 Computes the Sokal-Michener distance between each pair of

2177 boolean vectors. (see sokalmichener function documentation)

2178

2179 22. ``Y = pdist(X, 'sokalsneath')``

2180

2181 Computes the Sokal-Sneath distance between each pair of

2182 boolean vectors. (see sokalsneath function documentation)

2183

2184 23. ``Y = pdist(X, 'kulczynski1')``

2185

2186 Computes the Kulczynski 1 distance between each pair of

2187 boolean vectors. (see kulczynski1 function documentation)

2188

2189 24. ``Y = pdist(X, f)``

2190

2191 Computes the distance between all pairs of vectors in X

2192 using the user supplied 2-arity function f. For example,

2193 Euclidean distance between the vectors could be computed

2194 as follows::

2195

2196 dm = pdist(X, lambda u, v: np.sqrt(((u-v)**2).sum()))

2197

2198 Note that you should avoid passing a reference to one of

2199 the distance functions defined in this library. For example,::

2200

2201 dm = pdist(X, sokalsneath)

2202

2203 would calculate the pair-wise distances between the vectors in

2204 X using the Python function sokalsneath. This would result in

2205 sokalsneath being called :math:`{n \\choose 2}` times, which

2206 is inefficient. Instead, the optimized C version is more

2207 efficient, and we call it using the following syntax.::

2208

2209 dm = pdist(X, 'sokalsneath')

2210

2211 """

2212 # You can also call this as:

2213 # Y = pdist(X, 'test_abc')

2214 # where 'abc' is the metric being tested. This computes the distance

2215 # between all pairs of vectors in X using the distance metric 'abc' but

2216 # with a more succinct, verifiable, but less efficient implementation.

2217

2218 X = _asarray_validated(X, sparse_ok=False, objects_ok=True, mask_ok=True,

2219 check_finite=False)

2220

2221 s = X.shape

2222 if len(s) != 2:

2223 raise ValueError('A 2-dimensional array must be passed.')

2224

2225 m, n = s

2226

2227 if callable(metric):

2228 mstr = getattr(metric, '__name__', 'UnknownCustomMetric')

2229 metric_info = _METRIC_ALIAS.get(mstr, None)

2230

2231 if metric_info is not None:

2232 X, typ, kwargs = _validate_pdist_input(

2233 X, m, n, metric_info, **kwargs)

2234

2235 return _pdist_callable(X, metric=metric, out=out, **kwargs)

2236 elif isinstance(metric, str):

2237 mstr = metric.lower()

2238 metric_info = _METRIC_ALIAS.get(mstr, None)

2239

2240 if metric_info is not None:

2241 pdist_fn = metric_info.pdist_func

2242 return pdist_fn(X, out=out, **kwargs)

2243 elif mstr.startswith("test_"):

2244 metric_info = _TEST_METRICS.get(mstr, None)

2245 if metric_info is None:

2246 raise ValueError(f'Unknown "Test" Distance Metric: {mstr[5:]}')

2247 X, typ, kwargs = _validate_pdist_input(

2248 X, m, n, metric_info, **kwargs)

2249 return _pdist_callable(

2250 X, metric=metric_info.dist_func, out=out, **kwargs)

2251 else:

2252 raise ValueError('Unknown Distance Metric: %s' % mstr)

2253 else:

2254 raise TypeError('2nd argument metric must be a string identifier '

2255 'or a function.')

2256

2257

def squareform(X, force="no", checks=True):
    """
    Convert between the condensed (vector) and redundant (square) forms
    of a distance matrix.

    Parameters
    ----------
    X : array_like
        Either a condensed (1-D) or a redundant (2-D, square) distance
        matrix.
    force : str, optional
        As with MATLAB(TM), ``'tomatrix'`` requires `X` to be a distance
        vector and ``'tovector'`` requires `X` to be a distance matrix;
        a ``ValueError`` is raised when the dimensionality of `X` does
        not agree. The comparison is case-insensitive. For any other
        value the direction of the conversion is decided solely by the
        dimensionality of `X`.
    checks : bool, optional
        If set to False, no checks will be made for matrix
        symmetry nor zero diagonals. This is useful if it is known that
        ``X - X.T`` is small and ``diag(X)`` is close to zero.
        These values are ignored any way so they do not disrupt the
        squareform transformation.

    Returns
    -------
    Y : ndarray
        If a condensed distance matrix is passed, a redundant one is
        returned, or if a redundant one is passed, a condensed distance
        matrix is returned. The result has the same dtype as the input.

    Raises
    ------
    ValueError
        If `force` contradicts the dimensionality of `X`, if a vector
        length is not a binomial coefficient ``n choose 2``, if a matrix
        is not square (or fails validation while `checks` is True), or
        if `X` is neither one- nor two-dimensional.

    Notes
    -----
    1. ``v = squareform(X)``

       Given a square n-by-n symmetric distance matrix ``X``,
       ``v = squareform(X)`` returns a ``n * (n-1) / 2``
       (i.e. binomial coefficient n choose 2) sized vector `v`
       where :math:`v[{n \\choose 2} - {n-i \\choose 2} + (j-i-1)]`
       is the distance between distinct points ``i`` and ``j``.
       If ``X`` is non-square or asymmetric, an error is raised.

    2. ``X = squareform(v)``

       Given a ``n * (n-1) / 2`` sized vector ``v``
       for some integer ``n >= 1`` encoding distances as described,
       ``X = squareform(v)`` returns a n-by-n distance matrix ``X``.
       The ``X[i, j]`` and ``X[j, i]`` values are set to
       :math:`v[{n \\choose 2} - {n-i \\choose 2} + (j-i-1)]`
       and all diagonal elements are zero.

    In SciPy 0.19.0, ``squareform`` stopped casting all input types to
    float64, and started returning arrays of the same dtype as the input.

    """
    X = np.ascontiguousarray(X)
    shape = X.shape
    ndim = len(shape)

    # An explicit direction must agree with the dimensionality of X.
    mode = force.lower()
    if mode == 'tomatrix' and ndim != 1:
        raise ValueError("Forcing 'tomatrix' but input X is not a "
                         "distance vector.")
    if mode == 'tovector' and ndim != 2:
        raise ValueError("Forcing 'tovector' but input X is not a "
                         "distance matrix.")

    if ndim not in (1, 2):
        raise ValueError(('The first argument must be one or two dimensional '
                          'array. A %d-dimensional array is not '
                          'permitted') % ndim)

    if ndim == 1:
        # Condensed vector -> square matrix.
        n_entries = shape[0]
        if n_entries == 0:
            return np.zeros((1, 1), dtype=X.dtype)

        # Candidate side length: the nearest integer above sqrt(2 * len),
        # accepted only if the length really is "side choose 2".
        side = int(np.ceil(np.sqrt(n_entries * 2)))
        if side * (side - 1) != n_entries * 2:
            raise ValueError('Incompatible vector size. It must be a binomial '
                             'coefficient n choose 2 for some integer n >= 2.')

        square = np.zeros((side, side), dtype=X.dtype)

        # The C routine works from dimensions rather than strides, so it
        # is handed the array returned by the copy helper.
        X = _copy_array_if_base_present(X)
        _distance_wrap.to_squareform_from_vector_wrap(square, X)
        return square

    # Square matrix -> condensed vector.
    if shape[0] != shape[1]:
        raise ValueError('The matrix argument must be square.')
    if checks:
        is_valid_dm(X, throw=True, name='X')

    side = shape[0]
    if side <= 1:
        return np.array([], dtype=X.dtype)

    condensed = np.zeros((side * (side - 1)) // 2, dtype=X.dtype)

    # Same stride limitation as above.
    X = _copy_array_if_base_present(X)
    _distance_wrap.to_vector_from_squareform_wrap(X, condensed)
    return condensed

2376

2377

def is_valid_dm(D, tol=0.0, throw=False, name="D", warning=False):
    """
    Return True if input array is a valid distance matrix.

    Distance matrices must be 2-dimensional, square numpy arrays.
    They must have a zero-diagonal, and they must be symmetric.

    Parameters
    ----------
    D : array_like
        The candidate object to test for validity.
    tol : float, optional
        The distance matrix should be symmetric. `tol` is the maximum
        difference between entries ``ij`` and ``ji`` for the distance
        metric to be considered symmetric. It is also the maximum
        absolute value allowed on the diagonal.
    throw : bool, optional
        An exception is thrown if the distance matrix passed is not valid.
    name : str, optional
        The name of the variable to checked. This is useful if
        throw is set to True so the offending variable can be identified
        in the exception message when an exception is thrown.
    warning : bool, optional
        Instead of throwing an exception, a warning message is
        raised.

    Returns
    -------
    valid : bool
        True if the variable `D` passed is a valid distance matrix.

    Raises
    ------
    ValueError
        Only when `throw` is True and `D` is not a valid distance matrix.

    Notes
    -----
    Small numerical differences in `D` and `D.T` and non-zeroness of
    the diagonal are ignored if they are within the tolerance specified
    by `tol`.

    """
    D = np.asarray(D, order='c')
    # Fragment inserted into every message; empty when no name is given so
    # that the named and nameless messages share one format string.
    label = " '%s'" % name if name else ""
    try:
        shape = D.shape
        if len(shape) != 2:
            raise ValueError('Distance matrix%s must have shape=2 '
                             '(i.e. be two-dimensional).' % label)
        # Without this check a 1-by-n array broadcasts against its
        # transpose and would be accepted as "symmetric".
        if shape[0] != shape[1]:
            raise ValueError('Distance matrix%s must be square.' % label)

        diagonal = D.diagonal()
        if tol == 0.0:
            if not (D == D.T).all():
                raise ValueError('Distance matrix%s must be symmetric.'
                                 % label)
            if not (diagonal == 0).all():
                raise ValueError('Distance matrix%s diagonal must be zero.'
                                 % label)
        else:
            # D - D.T is antisymmetric, so bounding every entry from above
            # bounds the absolute difference as well.
            if not (D - D.T <= tol).all():
                raise ValueError('Distance matrix%s must be symmetric '
                                 'within tolerance %5.5f.' % (label, tol))
            # The diagonal has no mirrored entry, so the absolute value
            # is needed to reject large negative entries.
            if not (np.abs(diagonal) <= tol).all():
                raise ValueError('Distance matrix%s diagonal must be close '
                                 'to zero within tolerance %5.5f.'
                                 % (label, tol))
    except Exception as e:
        if throw:
            raise
        if warning:
            warnings.warn(str(e))
        return False
    return True

2464

2465

def is_valid_y(y, warning=False, throw=False, name=None):
    """
    Return True if the input array is a valid condensed distance matrix.

    Condensed distance matrices must be 1-dimensional numpy arrays.
    Their length must be a binomial coefficient :math:`{n \\choose 2}`
    for some positive integer n.

    Parameters
    ----------
    y : array_like
        The condensed distance matrix.
    warning : bool, optional
        Invokes a warning if the variable passed is not a valid
        condensed distance matrix. The warning message explains why
        the distance matrix is not valid. `name` is used when
        referencing the offending variable.
    throw : bool, optional
        Throws an exception if the variable passed is not a valid
        condensed distance matrix.
    name : str, optional
        Used when referencing the offending variable in the
        warning or exception message.

    Returns
    -------
    valid : bool
        True if the input array is a valid condensed distance matrix,
        False otherwise.

    Raises
    ------
    ValueError
        Only when `throw` is True and `y` is not a valid condensed
        distance matrix.

    """
    y = np.asarray(y, order='c')
    # Fragment inserted into every message; empty when no name is given.
    label = " '%s'" % name if name else ""
    try:
        if len(y.shape) != 1:
            raise ValueError('Condensed distance matrix%s must have shape=1 '
                             '(i.e. be one-dimensional).' % label)
        n = y.shape[0]
        # Candidate number of observations; it is accepted only if the
        # length is exactly "d choose 2" (compared in integers).
        d = int(np.ceil(np.sqrt(n * 2)))
        if d * (d - 1) != 2 * n:
            raise ValueError('Length n of condensed distance matrix%s must '
                             'be a binomial coefficient, i.e. there must '
                             'be a k such that (k \\choose 2)=n)!' % label)
    except Exception as e:
        if throw:
            raise
        if warning:
            warnings.warn(str(e))
        return False
    return True

2521

2522

def num_obs_dm(d):
    """
    Return the number of original observations that correspond to a
    square, redundant distance matrix.

    Parameters
    ----------
    d : array_like
        The target distance matrix.

    Returns
    -------
    num_obs_dm : int
        The number of observations in the redundant distance matrix.

    Raises
    ------
    ValueError
        If `d` is not a two-dimensional, square array.

    """
    d = np.asarray(d, order='c')
    # With tol=np.inf a 1-by-n array can pass the validation below, so a
    # non-square 2-D input is rejected explicitly first. Other
    # dimensionalities are left to is_valid_dm and its own message.
    if d.ndim == 2 and d.shape[0] != d.shape[1]:
        raise ValueError("Distance matrix 'd' must be square.")
    # tol=np.inf lets the symmetry and diagonal comparisons pass for any
    # non-NaN entries, leaving the dimensionality as the effective check.
    is_valid_dm(d, tol=np.inf, throw=True, name='d')
    return d.shape[0]

2542

2543

def num_obs_y(Y):
    """
    Return the number of original observations that correspond to a
    condensed distance matrix.

    Parameters
    ----------
    Y : array_like
        Condensed distance matrix.

    Returns
    -------
    n : int
        The number of observations in the condensed distance matrix `Y`.

    Raises
    ------
    ValueError
        If `Y` fails ``is_valid_y`` validation, is empty, or has a
        length that is not a binomial coefficient ``n choose 2``.

    """
    Y = np.asarray(Y, order='c')
    is_valid_y(Y, throw=True, name='Y')

    n_entries = Y.shape[0]
    if not n_entries:
        raise ValueError("The number of observations cannot be determined on "
                         "an empty distance matrix.")

    # Smallest integer whose square reaches 2 * len(Y); it is the number
    # of observations exactly when len(Y) equals "n_obs choose 2".
    n_obs = int(np.ceil(np.sqrt(n_entries * 2)))
    if n_obs * (n_obs - 1) / 2 != n_entries:
        raise ValueError("Invalid condensed distance matrix passed. Must be "
                         "some k where k=(n choose 2) for some n >= 2.")
    return n_obs

2571

2572

def _prepare_out_argument(out, dtype, expected_shape):
    """
    Return the array that results should be written into.

    When `out` is None a new uninitialised array of `expected_shape` and
    `dtype` is allocated. Otherwise `out` itself is returned after
    checking that it has `expected_shape`, is C-contiguous and is of
    double type; a ``ValueError`` is raised for the first check that
    fails.
    """
    if out is not None:
        # The checks run in a fixed order: shape, layout, then dtype.
        if out.shape != expected_shape:
            raise ValueError("Output array has incorrect shape.")
        if not out.flags.c_contiguous:
            raise ValueError("Output array must be C-contiguous.")
        if out.dtype != np.double:
            raise ValueError("Output array must be double type.")
        return out

    return np.empty(expected_shape, dtype=dtype)

2584

2585

def _pdist_callable(X, *, out, metric, **kwargs):
    """
    Fill a condensed distance vector by calling `metric` on every pair of
    rows ``(X[i], X[j])`` with ``i < j``.

    Pairs are visited in row-major order of ``(i, j)`` and written to
    consecutive positions of the result. `out` is validated (or
    allocated when None) by ``_prepare_out_argument``; extra keyword
    arguments are forwarded to `metric`.
    """
    n_obs = X.shape[0]
    n_pairs = (n_obs * (n_obs - 1)) // 2
    dm = _prepare_out_argument(out, np.double, (n_pairs,))

    pos = 0
    for i in range(n_obs - 1):
        u = X[i]
        for j in range(i + 1, n_obs):
            dm[pos] = metric(u, X[j], **kwargs)
            pos += 1
    return dm

2596

2597

def _cdist_callable(XA, XB, *, out, metric, **kwargs):
    """
    Fill a distance matrix by calling `metric` on every pair of rows
    ``(XA[i], XB[j])``.

    Entry ``[i, j]`` of the result holds ``metric(XA[i], XB[j])``. `out`
    is validated (or allocated when None) by ``_prepare_out_argument``;
    extra keyword arguments are forwarded to `metric`.
    """
    rows_a = XA.shape[0]
    rows_b = XB.shape[0]
    dm = _prepare_out_argument(out, np.double, (rows_a, rows_b))

    for i in range(rows_a):
        u = XA[i]
        for j in range(rows_b):
            dm[i, j] = metric(u, XB[j], **kwargs)
    return dm

2606

2607

def cdist(XA, XB, metric='euclidean', *, out=None, **kwargs):
    """
    Compute distance between each pair of the two collections of inputs.

    See Notes for common calling conventions.

    Parameters
    ----------
    XA : array_like
        An :math:`m_A` by :math:`n` array of :math:`m_A`
        original observations in an :math:`n`-dimensional space.
        Inputs are converted to float type.
    XB : array_like
        An :math:`m_B` by :math:`n` array of :math:`m_B`
        original observations in an :math:`n`-dimensional space.
        Inputs are converted to float type.
    metric : str or callable, optional
        The distance metric to use. If a string, the distance function can be
        'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation',
        'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon',
        'kulczynski1', 'mahalanobis', 'matching', 'minkowski',
        'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
        'sokalsneath', 'sqeuclidean', 'yule'.
    out : ndarray, optional
        The output array.
        If not None, the distance matrix Y is stored in this array.
    **kwargs : dict, optional
        Extra arguments to `metric`: refer to each metric documentation for a
        list of all possible arguments.

        Some possible arguments:

        p : scalar
        The p-norm to apply for Minkowski, weighted and unweighted.
        Default: 2.

        w : array_like
        The weight vector for metrics that support weights (e.g., Minkowski).

        V : array_like
        The variance vector for standardized Euclidean.
        Default: var(vstack([XA, XB]), axis=0, ddof=1)

        VI : array_like
        The inverse of the covariance matrix for Mahalanobis.
        Default: inv(cov(vstack([XA, XB]).T)).T

    Returns
    -------
    Y : ndarray
        A :math:`m_A` by :math:`m_B` distance matrix is returned.
        For each :math:`i` and :math:`j`, the metric
        ``dist(u=XA[i], v=XB[j])`` is computed and stored in the
        :math:`ij` th entry.

    Raises
    ------
    ValueError
        An exception is thrown if `XA` or `XB` is not 2-dimensional, if
        `XA` and `XB` do not have the same number of columns, or if
        `metric` is a string that names no known metric.
    TypeError
        An exception is thrown if `metric` is neither a string nor a
        callable.

    Notes
    -----
    The following are common calling conventions:

    1. ``Y = cdist(XA, XB, 'euclidean')``

       Computes the distance between each row of ``XA`` and each row of
       ``XB`` using Euclidean distance (2-norm) as the distance metric.
       The points are arranged as :math:`n`-dimensional row vectors in
       the matrices ``XA`` and ``XB``.

    2. ``Y = cdist(XA, XB, 'minkowski', p=2.)``

       Computes the distances using the Minkowski distance
       :math:`\\|u-v\\|_p` (:math:`p`-norm) where :math:`p > 0` (note
       that this is only a quasi-metric if :math:`0 < p < 1`).

    3. ``Y = cdist(XA, XB, 'cityblock')``

       Computes the city block or Manhattan distance between the
       points.

    4. ``Y = cdist(XA, XB, 'seuclidean', V=None)``

       Computes the standardized Euclidean distance. The standardized
       Euclidean distance between two n-vectors ``u`` and ``v`` is

       .. math::

          \\sqrt{\\sum {(u_i-v_i)^2 / V[x_i]}}.

       V is the variance vector; V[i] is the variance computed over all
       the i'th components of the points. If not passed, it is
       automatically computed.

    5. ``Y = cdist(XA, XB, 'sqeuclidean')``

       Computes the squared Euclidean distance :math:`\\|u-v\\|_2^2` between
       the vectors.

    6. ``Y = cdist(XA, XB, 'cosine')``

       Computes the cosine distance between vectors u and v,

       .. math::

          1 - \\frac{u \\cdot v}
                   {{\\|u\\|}_2 {\\|v\\|}_2}

       where :math:`\\|*\\|_2` is the 2-norm of its argument ``*``, and
       :math:`u \\cdot v` is the dot product of :math:`u` and :math:`v`.

    7. ``Y = cdist(XA, XB, 'correlation')``

       Computes the correlation distance between vectors u and v. This is

       .. math::

          1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
                   {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2}

       where :math:`\\bar{v}` is the mean of the elements of vector v,
       and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.

    8. ``Y = cdist(XA, XB, 'hamming')``

       Computes the normalized Hamming distance, or the proportion of
       those vector elements between two n-vectors ``u`` and ``v``
       which disagree. To save memory, the input matrices can be of
       type boolean.

    9. ``Y = cdist(XA, XB, 'jaccard')``

       Computes the Jaccard distance between the points. Given two
       vectors, ``u`` and ``v``, the Jaccard distance is the
       proportion of those elements ``u[i]`` and ``v[i]`` that
       disagree where at least one of them is non-zero.

    10. ``Y = cdist(XA, XB, 'jensenshannon')``

        Computes the Jensen-Shannon distance between two probability arrays.
        Given two probability vectors, :math:`p` and :math:`q`, the
        Jensen-Shannon distance is

        .. math::

           \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}

        where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
        and :math:`D` is the Kullback-Leibler divergence.

    11. ``Y = cdist(XA, XB, 'chebyshev')``

        Computes the Chebyshev distance between the points. The
        Chebyshev distance between two n-vectors ``u`` and ``v`` is the
        maximum norm-1 distance between their respective elements. More
        precisely, the distance is given by

        .. math::

           d(u,v) = \\max_i {|u_i-v_i|}.

    12. ``Y = cdist(XA, XB, 'canberra')``

        Computes the Canberra distance between the points. The
        Canberra distance between two points ``u`` and ``v`` is

        .. math::

          d(u,v) = \\sum_i \\frac{|u_i-v_i|}
                               {|u_i|+|v_i|}.

    13. ``Y = cdist(XA, XB, 'braycurtis')``

        Computes the Bray-Curtis distance between the points. The
        Bray-Curtis distance between two points ``u`` and ``v`` is

        .. math::

             d(u,v) = \\frac{\\sum_i (|u_i-v_i|)}
                           {\\sum_i (|u_i+v_i|)}

    14. ``Y = cdist(XA, XB, 'mahalanobis', VI=None)``

        Computes the Mahalanobis distance between the points. The
        Mahalanobis distance between two points ``u`` and ``v`` is
        :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI``
        variable) is the inverse covariance. If ``VI`` is not None,
        ``VI`` will be used as the inverse covariance matrix.

    15. ``Y = cdist(XA, XB, 'yule')``

        Computes the Yule distance between the boolean
        vectors. (see `yule` function documentation)

    16. ``Y = cdist(XA, XB, 'matching')``

        Synonym for 'hamming'.

    17. ``Y = cdist(XA, XB, 'dice')``

        Computes the Dice distance between the boolean vectors. (see
        `dice` function documentation)

    18. ``Y = cdist(XA, XB, 'kulczynski1')``

        Computes the Kulczynski 1 distance between the boolean
        vectors. (see `kulczynski1` function documentation)

    19. ``Y = cdist(XA, XB, 'rogerstanimoto')``

        Computes the Rogers-Tanimoto distance between the boolean
        vectors. (see `rogerstanimoto` function documentation)

    20. ``Y = cdist(XA, XB, 'russellrao')``

        Computes the Russell-Rao distance between the boolean
        vectors. (see `russellrao` function documentation)

    21. ``Y = cdist(XA, XB, 'sokalmichener')``

        Computes the Sokal-Michener distance between the boolean
        vectors. (see `sokalmichener` function documentation)

    22. ``Y = cdist(XA, XB, 'sokalsneath')``

        Computes the Sokal-Sneath distance between the vectors. (see
        `sokalsneath` function documentation)

    23. ``Y = cdist(XA, XB, f)``

        Computes the distance between all pairs of rows of ``XA`` and
        ``XB`` using the user supplied 2-arity function f. For example,
        Euclidean distance between the vectors could be computed
        as follows::

          dm = cdist(XA, XB, lambda u, v: np.sqrt(((u-v)**2).sum()))

        Note that you should avoid passing a reference to one of
        the distance functions defined in this library. For example,::

          dm = cdist(XA, XB, sokalsneath)

        would calculate the pair-wise distances between the rows of
        ``XA`` and ``XB`` using the Python function `sokalsneath`. This
        would result in sokalsneath being called :math:`m_A m_B` times,
        which is inefficient. Instead, the optimized C version is more
        efficient, and we call it using the following syntax::

          dm = cdist(XA, XB, 'sokalsneath')

    Examples
    --------
    Find the Euclidean distances between four 2-D coordinates:

    >>> from scipy.spatial import distance
    >>> import numpy as np
    >>> coords = [(35.0456, -85.2672),
    ...           (35.1174, -89.9711),
    ...           (35.9728, -83.9422),
    ...           (36.1667, -86.7833)]
    >>> distance.cdist(coords, coords, 'euclidean')
    array([[ 0.    ,  4.7044,  1.6172,  1.8856],
           [ 4.7044,  0.    ,  6.0893,  3.3561],
           [ 1.6172,  6.0893,  0.    ,  2.8477],
           [ 1.8856,  3.3561,  2.8477,  0.    ]])


    Find the Manhattan distance from a 3-D point to the corners of the unit
    cube:

    >>> a = np.array([[0, 0, 0],
    ...               [0, 0, 1],
    ...               [0, 1, 0],
    ...               [0, 1, 1],
    ...               [1, 0, 0],
    ...               [1, 0, 1],
    ...               [1, 1, 0],
    ...               [1, 1, 1]])
    >>> b = np.array([[ 0.1,  0.2,  0.4]])
    >>> distance.cdist(a, b, 'cityblock')
    array([[ 0.7],
           [ 0.9],
           [ 1.3],
           [ 1.5],
           [ 1.5],
           [ 1.7],
           [ 2.1],
           [ 2.3]])

    """
    # You can also call this as:
    #     Y = cdist(XA, XB, 'test_abc')
    # where 'abc' is the metric being tested.  This computes the distance
    # between all pairs of vectors in XA and XB using the distance metric 'abc'
    # but with a more succinct, verifiable, but less efficient implementation.

    XA = np.asarray(XA)
    XB = np.asarray(XB)

    shape_a = XA.shape
    shape_b = XB.shape

    if len(shape_a) != 2:
        raise ValueError('XA must be a 2-dimensional array.')
    if len(shape_b) != 2:
        raise ValueError('XB must be a 2-dimensional array.')
    if shape_a[1] != shape_b[1]:
        raise ValueError('XA and XB must have the same number of columns '
                         '(i.e. feature dimension.)')

    mA, n = shape_a
    mB = shape_b[0]

    if callable(metric):
        # A callable whose name matches a registered metric gets its inputs
        # validated for that metric before the generic Python loop runs.
        metric_name = getattr(metric, '__name__', 'Unknown')
        info = _METRIC_ALIAS.get(metric_name, None)
        if info is not None:
            XA, XB, _, kwargs = _validate_cdist_input(
                XA, XB, mA, mB, n, info, **kwargs)
        return _cdist_callable(XA, XB, metric=metric, out=out, **kwargs)

    if not isinstance(metric, str):
        raise TypeError('2nd argument metric must be a string identifier '
                        'or a function.')

    metric_name = metric.lower()
    info = _METRIC_ALIAS.get(metric_name, None)
    if info is not None:
        # Registered metric: dispatch to its dedicated cdist routine.
        return info.cdist_func(XA, XB, out=out, **kwargs)

    if not metric_name.startswith("test_"):
        raise ValueError('Unknown Distance Metric: %s' % metric_name)

    # 'test_<name>' selects the reference Python implementation.
    info = _TEST_METRICS.get(metric_name, None)
    if info is None:
        raise ValueError(
            f'Unknown "Test" Distance Metric: {metric_name[5:]}')
    XA, XB, _, kwargs = _validate_cdist_input(
        XA, XB, mA, mB, n, info, **kwargs)
    return _cdist_callable(
        XA, XB, metric=info.dist_func, out=out, **kwargs)