Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/spatial/distance.py: 15%

649 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1""" 

2Distance computations (:mod:`scipy.spatial.distance`) 

3===================================================== 

4 

5.. sectionauthor:: Damian Eads 

6 

7Function reference 

8------------------ 

9 

10Distance matrix computation from a collection of raw observation vectors 

11stored in a rectangular array. 

12 

13.. autosummary:: 

14 :toctree: generated/ 

15 

16 pdist -- pairwise distances between observation vectors. 

17 cdist -- distances between two collections of observation vectors 

18 squareform -- convert distance matrix to a condensed one and vice versa 

19 directed_hausdorff -- directed Hausdorff distance between arrays 

20 

21Predicates for checking the validity of distance matrices, both 

22condensed and redundant. Also contained in this module are functions 

23for computing the number of observations in a distance matrix. 

24 

25.. autosummary:: 

26 :toctree: generated/ 

27 

28 is_valid_dm -- checks for a valid distance matrix 

29 is_valid_y -- checks for a valid condensed distance matrix 

30 num_obs_dm -- # of observations in a distance matrix 

31 num_obs_y -- # of observations in a condensed distance matrix 

32 

33Distance functions between two numeric vectors ``u`` and ``v``. Computing 

34distances over a large collection of vectors is inefficient for these 

35functions. Use ``pdist`` for this purpose. 

36 

37.. autosummary:: 

38 :toctree: generated/ 

39 

40 braycurtis -- the Bray-Curtis distance. 

41 canberra -- the Canberra distance. 

42 chebyshev -- the Chebyshev distance. 

43 cityblock -- the Manhattan distance. 

44 correlation -- the Correlation distance. 

45 cosine -- the Cosine distance. 

46 euclidean -- the Euclidean distance. 

47 jensenshannon -- the Jensen-Shannon distance. 

48 mahalanobis -- the Mahalanobis distance. 

49 minkowski -- the Minkowski distance. 

50 seuclidean -- the standardized Euclidean distance. 

51 sqeuclidean -- the squared Euclidean distance. 

52 

53Distance functions between two boolean vectors (representing sets) ``u`` and 

54``v``. As in the case of numerical vectors, ``pdist`` is more efficient for 

55computing the distances between all pairs. 

56 

57.. autosummary:: 

58 :toctree: generated/ 

59 

60 dice -- the Dice dissimilarity. 

61 hamming -- the Hamming distance. 

62 jaccard -- the Jaccard distance. 

63 kulsinski -- the Kulsinski distance. 

64 kulczynski1 -- the Kulczynski 1 distance. 

65 rogerstanimoto -- the Rogers-Tanimoto dissimilarity. 

66 russellrao -- the Russell-Rao dissimilarity. 

67 sokalmichener -- the Sokal-Michener dissimilarity. 

68 sokalsneath -- the Sokal-Sneath dissimilarity. 

69 yule -- the Yule dissimilarity. 

70 

71:func:`hamming` also operates over discrete numerical vectors. 

72""" 

73 

74# Copyright (C) Damian Eads, 2007-2008. New BSD License. 

75 

76__all__ = [ 

77 'braycurtis', 

78 'canberra', 

79 'cdist', 

80 'chebyshev', 

81 'cityblock', 

82 'correlation', 

83 'cosine', 

84 'dice', 

85 'directed_hausdorff', 

86 'euclidean', 

87 'hamming', 

88 'is_valid_dm', 

89 'is_valid_y', 

90 'jaccard', 

91 'jensenshannon', 

92 'kulsinski', 

93 'kulczynski1', 

94 'mahalanobis', 

95 'minkowski', 

96 'num_obs_dm', 

97 'num_obs_y', 

98 'pdist', 

99 'rogerstanimoto', 

100 'russellrao', 

101 'seuclidean', 

102 'sokalmichener', 

103 'sokalsneath', 

104 'sqeuclidean', 

105 'squareform', 

106 'yule' 

107] 

108 

109 

110import warnings 

111import numpy as np 

112import dataclasses 

113 

114from typing import List, Optional, Set, Callable 

115 

116from functools import partial 

117from scipy._lib._util import _asarray_validated 

118 

119from . import _distance_wrap 

120from . import _hausdorff 

121from ..linalg import norm 

122from ..special import rel_entr 

123 

124from . import _distance_pybind 

125 

126from .._lib.deprecation import _deprecated 

127 

128def _copy_array_if_base_present(a): 

129 """Copy the array if its base points to a parent array.""" 

130 if a.base is not None: 

131 return a.copy() 

132 return a 

133 

134 

def _correlation_cdist_wrap(XA, XB, dm, **kwargs):
    """Run the compiled cosine ``cdist`` routine on row-centered inputs.

    The mean of every row is subtracted from `XA` and from `XB`, and the
    centered arrays are passed, together with `dm` and any extra keyword
    arguments, to ``_distance_wrap.cdist_cosine_double_wrap``. Nothing is
    returned; presumably the compiled routine writes its result into `dm`
    (NOTE(review): confirm against the extension module).
    """
    centered_a = XA - XA.mean(axis=1, keepdims=True)
    centered_b = XB - XB.mean(axis=1, keepdims=True)
    _distance_wrap.cdist_cosine_double_wrap(centered_a, centered_b, dm,
                                            **kwargs)

139 

140 

def _correlation_pdist_wrap(X, dm, **kwargs):
    """Run the compiled cosine ``pdist`` routine on row-centered input.

    The mean of every row is subtracted from `X`, and the centered array
    is passed, together with `dm` and any extra keyword arguments, to
    ``_distance_wrap.pdist_cosine_double_wrap``. Nothing is returned;
    presumably the compiled routine writes its result into `dm`
    (NOTE(review): confirm against the extension module).
    """
    row_means = X.mean(axis=1, keepdims=True)
    centered = X - row_means
    _distance_wrap.pdist_cosine_double_wrap(centered, dm, **kwargs)

144 

145 

146def _convert_to_type(X, out_type): 

147 return np.ascontiguousarray(X, dtype=out_type) 

148 

149 

150def _nbool_correspond_all(u, v, w=None): 

151 if u.dtype == v.dtype == bool and w is None: 

152 not_u = ~u 

153 not_v = ~v 

154 nff = (not_u & not_v).sum() 

155 nft = (not_u & v).sum() 

156 ntf = (u & not_v).sum() 

157 ntt = (u & v).sum() 

158 else: 

159 dtype = np.result_type(int, u.dtype, v.dtype) 

160 u = u.astype(dtype) 

161 v = v.astype(dtype) 

162 not_u = 1.0 - u 

163 not_v = 1.0 - v 

164 if w is not None: 

165 not_u = w * not_u 

166 u = w * u 

167 nff = (not_u * not_v).sum() 

168 nft = (not_u * v).sum() 

169 ntf = (u * not_v).sum() 

170 ntt = (u * v).sum() 

171 return (nff, nft, ntf, ntt) 

172 

173 

174def _nbool_correspond_ft_tf(u, v, w=None): 

175 if u.dtype == v.dtype == bool and w is None: 

176 not_u = ~u 

177 not_v = ~v 

178 nft = (not_u & v).sum() 

179 ntf = (u & not_v).sum() 

180 else: 

181 dtype = np.result_type(int, u.dtype, v.dtype) 

182 u = u.astype(dtype) 

183 v = v.astype(dtype) 

184 not_u = 1.0 - u 

185 not_v = 1.0 - v 

186 if w is not None: 

187 not_u = w * not_u 

188 u = w * u 

189 nft = (not_u * v).sum() 

190 ntf = (u * not_v).sum() 

191 return (nft, ntf) 

192 

193 

def _validate_cdist_input(XA, XB, mA, mB, n, metric_info, **kwargs):
    """Coerce the two ``cdist`` inputs to a supported type and check kwargs.

    The target type is the entry of ``metric_info.types`` that equals
    ``XA.dtype`` when there is one, otherwise the first listed type. Both
    `XA` and `XB` are converted to that type. If ``metric_info.validator``
    is truthy it is called with the pair ``(XA, XB)``, the combined
    observation count ``mA + mB``, `n` and the keyword arguments, and its
    return value replaces `kwargs`.

    Returns
    -------
    tuple
        ``(XA, XB, typ, kwargs)`` with the converted arrays, the chosen
        type entry and the (possibly rewritten) keyword arguments.
    """
    supported = metric_info.types
    # Return the entry from the supported list itself, not XA.dtype.
    if XA.dtype in supported:
        typ = supported[supported.index(XA.dtype)]
    else:
        typ = supported[0]

    XA = _convert_to_type(XA, out_type=typ)
    XB = _convert_to_type(XB, out_type=typ)

    validator = metric_info.validator
    if validator:
        kwargs = validator((XA, XB), mA + mB, n, **kwargs)
    return XA, XB, typ, kwargs

208 

209 

210def _validate_weight_with_size(X, m, n, **kwargs): 

211 w = kwargs.pop('w', None) 

212 if w is None: 

213 return kwargs 

214 

215 if w.ndim != 1 or w.shape[0] != n: 

216 raise ValueError("Weights must have same size as input vector. " 

217 f"{w.shape[0]} vs. {n}") 

218 

219 kwargs['w'] = _validate_weights(w) 

220 return kwargs 

221 

222 

223def _validate_hamming_kwargs(X, m, n, **kwargs): 

224 w = kwargs.get('w', np.ones((n,), dtype='double')) 

225 

226 if w.ndim != 1 or w.shape[0] != n: 

227 raise ValueError("Weights must have same size as input vector. %d vs. %d" % (w.shape[0], n)) 

228 

229 kwargs['w'] = _validate_weights(w) 

230 return kwargs 

231 

232 

233def _validate_mahalanobis_kwargs(X, m, n, **kwargs): 

234 VI = kwargs.pop('VI', None) 

235 if VI is None: 

236 if m <= n: 

237 # There are fewer observations than the dimension of 

238 # the observations. 

239 raise ValueError("The number of observations (%d) is too " 

240 "small; the covariance matrix is " 

241 "singular. For observations with %d " 

242 "dimensions, at least %d observations " 

243 "are required." % (m, n, n + 1)) 

244 if isinstance(X, tuple): 

245 X = np.vstack(X) 

246 CV = np.atleast_2d(np.cov(X.astype(np.double, copy=False).T)) 

247 VI = np.linalg.inv(CV).T.copy() 

248 kwargs["VI"] = _convert_to_double(VI) 

249 return kwargs 

250 

251 

def _validate_minkowski_kwargs(X, m, n, **kwargs):
    """Validate the ``w`` and ``p`` keywords of the Minkowski metric.

    Weights are checked by ``_validate_weight_with_size``. A missing ``p``
    defaults to ``2.``; a supplied ``p`` must be strictly positive.

    Returns
    -------
    dict
        The validated keyword arguments, always containing ``'p'``.

    Raises
    ------
    ValueError
        If ``p`` is given and is not greater than 0 (or if the weight
        validation fails).
    """
    kwargs = _validate_weight_with_size(X, m, n, **kwargs)
    if 'p' in kwargs:
        if kwargs['p'] <= 0:
            raise ValueError("p must be greater than 0")
    else:
        kwargs['p'] = 2.

    return kwargs

261 

262 

def _validate_pdist_input(X, m, n, metric_info, **kwargs):
    """Coerce the ``pdist`` input to a supported type and check kwargs.

    The target type is the entry of ``metric_info.types`` that equals
    ``X.dtype`` when there is one, otherwise the first listed type. `X` is
    converted to that type. If ``metric_info.validator`` is truthy it is
    called with ``(X, m, n, **kwargs)`` and its return value replaces
    `kwargs`.

    Returns
    -------
    tuple
        ``(X, typ, kwargs)`` with the converted array, the chosen type
        entry and the (possibly rewritten) keyword arguments.
    """
    supported = metric_info.types
    # Return the entry from the supported list itself, not X.dtype.
    if X.dtype in supported:
        typ = supported[supported.index(X.dtype)]
    else:
        typ = supported[0]

    X = _convert_to_type(X, out_type=typ)

    validator = metric_info.validator
    if validator:
        kwargs = validator(X, m, n, **kwargs)
    return X, typ, kwargs

276 

277 

278def _validate_seuclidean_kwargs(X, m, n, **kwargs): 

279 V = kwargs.pop('V', None) 

280 if V is None: 

281 if isinstance(X, tuple): 

282 X = np.vstack(X) 

283 V = np.var(X.astype(np.double, copy=False), axis=0, ddof=1) 

284 else: 

285 V = np.asarray(V, order='c') 

286 if len(V.shape) != 1: 

287 raise ValueError('Variance vector V must ' 

288 'be one-dimensional.') 

289 if V.shape[0] != n: 

290 raise ValueError('Variance vector V must be of the same ' 

291 'dimension as the vectors on which the distances ' 

292 'are computed.') 

293 kwargs['V'] = _convert_to_double(V) 

294 return kwargs 

295 

296 

297def _validate_vector(u, dtype=None): 

298 # XXX Is order='c' really necessary? 

299 u = np.asarray(u, dtype=dtype, order='c') 

300 if u.ndim == 1: 

301 return u 

302 raise ValueError("Input vector should be 1-D.") 

303 

304 

def _validate_weights(w, dtype=np.double):
    """Convert `w` to a 1-D array and reject negative entries.

    Parameters
    ----------
    w : array_like
        Candidate weight vector.
    dtype : data-type, optional
        Target dtype for the conversion. Default is ``np.double``.

    Returns
    -------
    ndarray
        The validated weight vector.

    Raises
    ------
    ValueError
        If any weight is negative (or if `w` is not 1-D, as reported by
        ``_validate_vector``).
    """
    weights = _validate_vector(w, dtype=dtype)
    has_negative = np.any(weights < 0)
    if has_negative:
        raise ValueError("Input weights should be all non-negative")
    return weights

310 

311 

def directed_hausdorff(u, v, seed=0):
    """
    Compute the directed Hausdorff distance between two 2-D arrays.

    Pairwise distances between points are measured with the Euclidean
    metric.

    Parameters
    ----------
    u : (M,N) array_like
        Input array.
    v : (O,N) array_like
        Input array.
    seed : int or None
        Local `numpy.random.RandomState` seed. Default is 0, a random
        shuffling of u and v that guarantees reproducibility.

    Returns
    -------
    d : double
        The directed Hausdorff distance between arrays `u` and `v`.

    index_1 : int
        Index of the point contributing to the Hausdorff pair in `u`.

    index_2 : int
        Index of the point contributing to the Hausdorff pair in `v`.

    Raises
    ------
    ValueError
        An exception is thrown if `u` and `v` do not have
        the same number of columns.

    Notes
    -----
    Uses the early break technique and the random sampling approach
    described by [1]_. Although worst-case performance is ``O(m * o)``
    (as with the brute force algorithm), this is unlikely in practice
    as the input data would have to require the algorithm to explore
    every single point interaction, and after the algorithm shuffles
    the input points at that. The best case performance is O(m), which
    is satisfied by selecting an inner loop distance that is less than
    cmax and leads to an early break as often as possible. The authors
    have formally shown that the average runtime is closer to O(m).

    .. versionadded:: 0.19.0

    References
    ----------
    .. [1] A. A. Taha and A. Hanbury, "An efficient algorithm for
           calculating the exact Hausdorff distance." IEEE Transactions On
           Pattern Analysis And Machine Intelligence, vol. 37 pp. 2153-63,
           2015.

    See Also
    --------
    scipy.spatial.procrustes : Another similarity test for two data sets

    Examples
    --------
    Find the directed Hausdorff distance between two 2-D arrays of
    coordinates:

    >>> from scipy.spatial.distance import directed_hausdorff
    >>> import numpy as np
    >>> u = np.array([(1.0, 0.0),
    ...               (0.0, 1.0),
    ...               (-1.0, 0.0),
    ...               (0.0, -1.0)])
    >>> v = np.array([(2.0, 0.0),
    ...               (0.0, 2.0),
    ...               (-2.0, 0.0),
    ...               (0.0, -4.0)])

    >>> directed_hausdorff(u, v)[0]
    2.23606797749979
    >>> directed_hausdorff(v, u)[0]
    3.0

    Find the general (symmetric) Hausdorff distance between two 2-D
    arrays of coordinates:

    >>> max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])
    3.0

    Find the indices of the points that generate the Hausdorff distance
    (the Hausdorff pair):

    >>> directed_hausdorff(v, u)[1:]
    (3, 3)

    """
    points_u = np.asarray(u, dtype=np.float64, order='c')
    points_v = np.asarray(v, dtype=np.float64, order='c')
    cols_u = points_u.shape[1]
    cols_v = points_v.shape[1]
    if cols_u != cols_v:
        raise ValueError('u and v need to have the same number of columns')
    return _hausdorff.directed_hausdorff(points_u, points_v, seed)

411 

412 

def minkowski(u, v, p=2, w=None):
    """
    Compute the Minkowski distance between two 1-D arrays.

    For 1-D arrays `u` and `v`, the Minkowski distance of order `p` is

    .. math::

       {\\|u-v\\|}_p = (\\sum{|u_i - v_i|^p})^{1/p}.

    When weights `w` are supplied, each term is scaled by its weight:

    .. math::

       \\left(\\sum{w_i(|(u_i - v_i)|^p)}\\right)^{1/p}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    p : scalar
        The order of the norm of the difference :math:`{\\|u-v\\|}_p`. Note
        that for :math:`0 < p < 1`, the triangle inequality only holds with
        an additional multiplicative factor, i.e. it is only a quasi-metric.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    minkowski : double
        The Minkowski distance between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If `p` is not greater than 0.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 1)
    2.0
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 2)
    1.4142135623730951
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 3)
    1.2599210498948732
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 1)
    1.0
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 2)
    1.0
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 3)
    1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if p <= 0:
        raise ValueError("p must be greater than 0")
    diff = u - v
    if w is None:
        return norm(diff, ord=p)

    w = _validate_weights(w)
    # Fold the weights into the difference as w**(1/p) so that the plain
    # p-norm of the scaled vector equals the weighted distance.
    if p == 1:
        scale = w
    elif p == 2:
        # better precision and speed
        scale = np.sqrt(w)
    elif p == np.inf:
        # For the max-norm a weight only switches a component on or off.
        scale = (w != 0)
    else:
        scale = np.power(w, 1/p)
    return norm(scale * diff, ord=p)

482 

483 

def euclidean(u, v, w=None):
    """
    Computes the Euclidean distance between two 1-D arrays.

    For 1-D arrays `u` and `v`, the Euclidean distance is

    .. math::

       {\\|u-v\\|}_2

    and, when weights `w` are supplied,

    .. math::

       \\left(\\sum{(w_i |(u_i - v_i)|^2)}\\right)^{1/2}

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    euclidean : double
        The Euclidean distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.euclidean([1, 0, 0], [0, 1, 0])
    1.4142135623730951
    >>> distance.euclidean([1, 1, 0], [0, 1, 0])
    1.0

    """
    # The Euclidean distance is the Minkowski distance of order 2.
    dist = minkowski(u, v, p=2, w=w)
    return dist

521 

522 

def sqeuclidean(u, v, w=None):
    """
    Compute the squared Euclidean distance between two 1-D arrays.

    For `u` and `v`, the squared Euclidean distance is

    .. math::

       {\\|u-v\\|}_2^2

    and, when weights `w` are supplied,

    .. math::

       \\left(\\sum{(w_i |(u_i - v_i)|^2)}\\right)

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    sqeuclidean : double
        The squared Euclidean distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.sqeuclidean([1, 0, 0], [0, 1, 0])
    2.0
    >>> distance.sqeuclidean([1, 1, 0], [0, 1, 0])
    1.0

    """
    # Preserve float dtypes, but convert everything else to np.float64
    # for stability.
    u_is_inexact = hasattr(u, "dtype") and np.issubdtype(u.dtype, np.inexact)
    u_dtype = None if u_is_inexact else np.float64
    v_is_inexact = hasattr(v, "dtype") and np.issubdtype(v.dtype, np.inexact)
    v_dtype = None if v_is_inexact else np.float64

    u = _validate_vector(u, dtype=u_dtype)
    v = _validate_vector(v, dtype=v_dtype)
    diff = u - v
    if w is None:
        weighted_diff = diff
    else:
        # only want weights applied once
        w = _validate_weights(w)
        weighted_diff = w * diff
    return np.dot(diff, weighted_diff)

575 

576 

def correlation(u, v, w=None, centered=True):
    """
    Compute the correlation distance between two 1-D arrays.

    The correlation distance between `u` and `v`, is
    defined as

    .. math::

        1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
                  {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2}

    where :math:`\\bar{u}` is the mean of the elements of `u`
    and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0
    centered : bool, optional
        If True, `u` and `v` will be centered. Default is True.

    Returns
    -------
    correlation : double
        The correlation distance between 1-D array `u` and `v`, clipped
        to the interval ``[0, 2]``.

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    if centered:
        # Subtract the (weighted) means before correlating.
        umu = np.average(u, weights=w)
        vmu = np.average(v, weights=w)
        u = u - umu
        v = v - vmu
    uv = np.average(u * v, weights=w)
    uu = np.average(np.square(u), weights=w)
    vv = np.average(np.square(v), weights=w)
    dist = 1.0 - uv / np.sqrt(uu * vv)
    # Rounding can push the result slightly outside [0, 2]. Clip both ends;
    # taking the absolute value would only mirror a small negative value
    # and would leave values just above 2 untouched.
    return np.clip(dist, 0.0, 2.0)

625 

626 

def cosine(u, v, w=None):
    """
    Compute the Cosine distance between 1-D arrays.

    For `u` and `v`, the Cosine distance is

    .. math::

        1 - \\frac{u \\cdot v}
                  {\\|u\\|_2 \\|v\\|_2}.

    where :math:`u \\cdot v` is the dot product of :math:`u` and
    :math:`v`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    cosine : double
        The Cosine distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.cosine([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.cosine([100, 0, 0], [0, 1, 0])
    1.0
    >>> distance.cosine([1, 1, 0], [0, 1, 0])
    0.29289321881345254

    """
    # cosine distance is also referred to as 'uncentered correlation',
    #   or 'reflective correlation'
    uncentered = correlation(u, v, w=w, centered=False)
    # clamp the result to 0-2
    capped = min(uncentered, 2.0)
    return max(0, capped)

671 

672 

def hamming(u, v, w=None):
    """
    Compute the Hamming distance between two 1-D arrays.

    The Hamming distance between 1-D arrays `u` and `v` is the
    (optionally weighted) proportion of components in which they
    disagree. For boolean vectors this is

    .. math::

       \\frac{c_{01} + c_{10}}{n}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    hamming : double
        The Hamming distance between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If `u` and `v` do not have the same shape.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.hamming([1, 0, 0], [0, 1, 0])
    0.66666666666666663
    >>> distance.hamming([1, 0, 0], [1, 1, 0])
    0.33333333333333331
    >>> distance.hamming([1, 0, 0], [2, 0, 0])
    0.33333333333333331
    >>> distance.hamming([1, 0, 0], [3, 0, 0])
    0.33333333333333331

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if u.shape != v.shape:
        raise ValueError('The 1d arrays must have equal lengths.')
    disagree = u != v
    weights = None if w is None else _validate_weights(w)
    # The mean of the mismatch mask is the proportion of disagreement.
    return np.average(disagree, weights=weights)

725 

726 

def jaccard(u, v, w=None):
    """
    Compute the Jaccard-Needham dissimilarity between two boolean 1-D arrays.

    For 1-D boolean arrays `u` and `v`, the Jaccard-Needham dissimilarity
    is

    .. math::

       \\frac{c_{TF} + c_{FT}}
            {c_{TT} + c_{FT} + c_{TF}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    jaccard : double
        The Jaccard distance between vectors `u` and `v`.

    Notes
    -----
    When the denominator is zero (neither `u` nor `v` has a non-zero
    entry, so the ratio would be `0/0`), the returned distance is 0. See
    the Wikipedia page on the Jaccard index [1]_, and this paper [2]_.

    .. versionchanged:: 1.2.0
        Previously, when `u` and `v` lead to a `0/0` division, the function
        would return NaN. This was changed to return 0 instead.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Jaccard_index
    .. [2] S. Kosub, "A note on the triangle inequality for the Jaccard
       distance", 2016, :arxiv:`1612.02696`

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.jaccard([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.jaccard([1, 0, 0], [1, 1, 0])
    0.5
    >>> distance.jaccard([1, 0, 0], [1, 2, 0])
    0.5
    >>> distance.jaccard([1, 0, 0], [1, 1, 1])
    0.66666666666666663

    """
    u = _validate_vector(u)
    v = _validate_vector(v)

    # Positions where at least one vector is non-zero form the denominator;
    # those among them where the entries differ form the numerator.
    any_set = np.bitwise_or(u != 0, v != 0)
    differing = np.bitwise_and((u != v), any_set)
    if w is not None:
        w = _validate_weights(w)
        any_set = w * any_set
        differing = w * differing
    numerator = np.double(differing.sum())
    denominator = np.double(any_set.sum())
    if denominator != 0:
        return numerator / denominator
    return 0

799 

800 

@_deprecated("Kulsinski has been deprecated from scipy.spatial.distance"
             " in SciPy 1.9.0 and it will be removed in SciPy 1.11.0."
             " It is superseded by scipy.spatial.distance.kulczynski1.")
def kulsinski(u, v, w=None):
    """
    Compute the Kulsinski dissimilarity between two boolean 1-D arrays.

    For two boolean 1-D arrays `u` and `v`, the Kulsinski dissimilarity
    is

    .. math::

         \\frac{c_{TF} + c_{FT} - c_{TT} + n}
              {c_{FT} + c_{TF} + n}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    .. deprecated:: 1.9.0
       `kulsinski` has been deprecated from `scipy.spatial.distance` in
       SciPy 1.9.0 and it will be removed in SciPy 1.11.0. It is superseded
       by `scipy.spatial.distance.kulczynski1`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    kulsinski : double
        The Kulsinski distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.kulsinski([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.kulsinski([1, 0, 0], [1, 1, 0])
    0.75
    >>> distance.kulsinski([1, 0, 0], [2, 1, 0])
    0.33333333333333331
    >>> distance.kulsinski([1, 0, 0], [3, 1, 0])
    -0.5

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
        # With weights, n is the total weight rather than the length.
        n = w.sum()
    else:
        n = float(len(u))
    _, nft, ntf, ntt = _nbool_correspond_all(u, v, w=w)

    mismatches = ntf + nft
    return (mismatches - ntt + n) / (mismatches + n)

863 

864 

def kulczynski1(u, v, *, w=None):
    """
    Compute the Kulczynski 1 dissimilarity between two boolean 1-D arrays.

    For two boolean 1-D arrays `u` and `v` of length ``n``, the
    Kulczynski 1 dissimilarity is

    .. math::

         \\frac{c_{11}}
              {c_{01} + c_{10}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k \\in {0, 1, ..., n-1}`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    kulczynski1 : float
        The Kulczynski 1 distance between vectors `u` and `v`.

    Notes
    -----
    This measure has a minimum value of 0 and no upper limit.
    It is un-defined when there are no non-matches.

    .. versionadded:: 1.8.0

    References
    ----------
    .. [1] Kulczynski S. et al. Bulletin
           International de l'Academie Polonaise des Sciences
           et des Lettres, Classe des Sciences Mathematiques
           et Naturelles, Serie B (Sciences Naturelles). 1927;
           Supplement II: 57-203.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.kulczynski1([1, 0, 0], [0, 1, 0])
    0.0
    >>> distance.kulczynski1([True, False, False], [True, True, False])
    1.0
    >>> distance.kulczynski1([True, False, False], [True])
    0.5
    >>> distance.kulczynski1([1, 0, 0], [3, 1, 0])
    -3.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    counts = _nbool_correspond_all(u, v, w=w)
    # The false/false count (first entry) does not enter the formula.
    nft, ntf, ntt = counts[1:]

    return ntt / (ntf + nft)

931 

932 

def seuclidean(u, v, V):
    """
    Return the standardized Euclidean distance between two 1-D arrays.

    The result is the Euclidean distance between `u` and `v` with each
    component weighted by the reciprocal of the matching entry of `V`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    V : (N,) array_like
        `V` is an 1-D array of component variances. It is usually computed
        among a larger collection vectors.

    Returns
    -------
    seuclidean : double
        The standardized Euclidean distance between vectors `u` and `v`.

    Raises
    ------
    TypeError
        If `u`, `v` and `V` do not all have the same length.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [0.1, 0.1, 0.1])
    4.4721359549995796
    >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [1, 0.1, 0.1])
    3.3166247903553998
    >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [10, 0.1, 0.1])
    3.1780497164141406

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    V = _validate_vector(V, dtype=np.float64)
    length = u.shape[0]
    if not (V.shape[0] == length and length == v.shape[0]):
        raise TypeError('V must be a 1-D array of the same dimension '
                        'as u and v.')
    inverse_variance = 1/V
    return euclidean(u, v, w=inverse_variance)

972 

973 

def cityblock(u, v, w=None):
    """
    Compute the City Block (Manhattan) distance.

    For two 1-D arrays `u` and `v`, the Manhattan distance is

    .. math::

       \\sum_i {\\left| u_i - v_i \\right|}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    cityblock : double
        The City Block (Manhattan) distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.cityblock([1, 0, 0], [0, 1, 0])
    2
    >>> distance.cityblock([1, 0, 0], [0, 2, 0])
    3
    >>> distance.cityblock([1, 0, 0], [1, 1, 0])
    1

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    abs_diff = abs(u - v)
    if w is None:
        return abs_diff.sum()
    w = _validate_weights(w)
    weighted = w * abs_diff
    return weighted.sum()

1018 

1019 

def mahalanobis(u, v, VI):
    """
    Compute the Mahalanobis distance between two 1-D arrays.

    For 1-D arrays `u` and `v`, the Mahalanobis distance is

    .. math::

       \\sqrt{ (u-v) V^{-1} (u-v)^T }

    where ``V`` is the covariance matrix. Note that the argument `VI`
    is the inverse of ``V``.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    VI : array_like
        The inverse of the covariance matrix.

    Returns
    -------
    mahalanobis : double
        The Mahalanobis distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> iv = [[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]]
    >>> distance.mahalanobis([1, 0, 0], [0, 1, 0], iv)
    1.0
    >>> distance.mahalanobis([0, 2, 0], [0, 1, 0], iv)
    1.0
    >>> distance.mahalanobis([2, 0, 0], [0, 1, 0], iv)
    1.7320508075688772

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    inv_cov = np.atleast_2d(VI)
    diff = u - v
    # Quadratic form diff . VI . diff, evaluated left to right.
    left = np.dot(diff, inv_cov)
    quad_form = np.dot(left, diff)
    return np.sqrt(quad_form)

1065 

1066 

def chebyshev(u, v, w=None):
    """
    Compute the Chebyshev distance.

    For two 1-D arrays `u` and `v`, the Chebyshev distance is

    .. math::

       \\max_i {|u_i-v_i|}.

    Parameters
    ----------
    u : (N,) array_like
        Input vector.
    v : (N,) array_like
        Input vector.
    w : (N,) array_like, optional
        Weights. Their magnitude does not scale the result, because the
        maximum is taken over unweighted differences; components whose
        weight is not positive are left out of the maximum.

    Returns
    -------
    chebyshev : double
        The Chebyshev distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.chebyshev([1, 0, 0], [0, 1, 0])
    1
    >>> distance.chebyshev([1, 1, 0], [0, 1, 0])
    1

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
        keep = w > 0
        # Only index when at least one component has to be dropped.
        if not keep.all():
            u = u[keep]
            v = v[keep]
    return max(abs(u - v))

1110 

1111 

def braycurtis(u, v, w=None):
    """
    Compute the Bray-Curtis distance between two 1-D arrays.

    The Bray-Curtis distance is the ratio

    .. math::

       \\sum{|u_i-v_i|} / \\sum{|u_i+v_i|}

    It lies in the range [0, 1] when all coordinates are positive, and
    is undefined for inputs of length zero.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    braycurtis : double
        The Bray-Curtis distance between 1-D arrays `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.braycurtis([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.braycurtis([1, 1, 0], [0, 1, 0])
    0.33333333333333331

    """
    u = _validate_vector(u)
    # v is converted to float64 here, so u - v and u + v below are float.
    v = _validate_vector(v, dtype=np.float64)
    numer_terms = abs(u - v)
    denom_terms = abs(u + v)
    if w is None:
        return numer_terms.sum() / denom_terms.sum()

    w = _validate_weights(w)
    # The same weights scale both the numerator and the denominator terms.
    return (w * numer_terms).sum() / (w * denom_terms).sum()

1158 

1159 

def canberra(u, v, w=None):
    """
    Compute the Canberra distance between two 1-D arrays.

    The Canberra distance is

    .. math::

         d(u,v) = \\sum_i \\frac{|u_i-v_i|}
                              {|u_i|+|v_i|}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    canberra : double
        The Canberra distance between vectors `u` and `v`.

    Notes
    -----
    When `u[i]` and `v[i]` are 0 for given i, then the fraction 0/0 = 0 is
    used in the calculation.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.canberra([1, 0, 0], [0, 1, 0])
    2.0
    >>> distance.canberra([1, 1, 0], [0, 1, 0])
    1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v, dtype=np.float64)
    weights = None if w is None else _validate_weights(w)
    # 0/0 terms evaluate to NaN: silence the "invalid" warning and let
    # nansum treat them as zero contributions.
    with np.errstate(invalid='ignore'):
        terms = abs(u - v) / (abs(u) + abs(v))
        if weights is not None:
            terms = weights * terms
        return np.nansum(terms)

1213 

1214 

def jensenshannon(p, q, base=None, *, axis=0, keepdims=False):
    """
    Compute the Jensen-Shannon distance (metric) between
    two probability arrays. This is the square root
    of the Jensen-Shannon divergence.

    For probability vectors `p` and `q` the distance is

    .. math::

       \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}

    where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
    and :math:`D` is the Kullback-Leibler divergence.

    `p` and `q` are each divided by their sum along `axis` before use, so
    inputs that do not sum to 1.0 are normalized.

    Parameters
    ----------
    p : (N,) array_like
        left probability vector
    q : (N,) array_like
        right probability vector
    base : double, optional
        the base of the logarithm used to compute the output
        if not given, then the routine uses the default base of
        scipy.stats.entropy.
    axis : int, optional
        Axis along which the Jensen-Shannon distances are computed. The default
        is 0.

        .. versionadded:: 1.7.0
    keepdims : bool, optional
        If this is set to `True`, the reduced axes are left in the
        result as dimensions with size one. With this option,
        the result will broadcast correctly against the input array.
        Default is False.

        .. versionadded:: 1.7.0

    Returns
    -------
    js : double or ndarray
        The Jensen-Shannon distances between `p` and `q` along the `axis`.

    Notes
    -----

    .. versionadded:: 1.2.0

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> import numpy as np
    >>> distance.jensenshannon([1.0, 0.0, 0.0], [0.0, 1.0, 0.0], 2.0)
    1.0
    >>> distance.jensenshannon([1.0, 0.0], [0.5, 0.5])
    0.46450140402245893
    >>> distance.jensenshannon([1.0, 0.0, 0.0], [1.0, 0.0, 0.0])
    0.0
    >>> a = np.array([[1, 2, 3, 4],
    ...               [5, 6, 7, 8],
    ...               [9, 10, 11, 12]])
    >>> b = np.array([[13, 14, 15, 16],
    ...               [17, 18, 19, 20],
    ...               [21, 22, 23, 24]])
    >>> distance.jensenshannon(a, b, axis=0)
    array([0.1954288, 0.1447697, 0.1138377, 0.0927636])
    >>> distance.jensenshannon(a, b, axis=1)
    array([0.1402339, 0.0399106, 0.0201815])

    """
    p_arr = np.asarray(p)
    q_arr = np.asarray(q)
    # Normalize along `axis`; keepdims=True keeps the sums broadcastable
    # against the inputs regardless of the caller's `keepdims`.
    p_norm = p_arr / np.sum(p_arr, axis=axis, keepdims=True)
    q_norm = q_arr / np.sum(q_arr, axis=axis, keepdims=True)
    midpoint = (p_norm + q_norm) / 2.0

    kl_p = np.sum(rel_entr(p_norm, midpoint), axis=axis, keepdims=keepdims)
    kl_q = np.sum(rel_entr(q_norm, midpoint), axis=axis, keepdims=keepdims)
    divergence_x2 = kl_p + kl_q
    if base is not None:
        # Change of logarithm base.
        divergence_x2 /= np.log(base)
    return np.sqrt(divergence_x2 / 2.0)

1301 

1302 

def yule(u, v, w=None):
    """
    Compute the Yule dissimilarity between two boolean 1-D arrays.

    The Yule dissimilarity is

    .. math::

         \\frac{R}{c_{TT} * c_{FF} + \\frac{R}{2}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n` and :math:`R = 2.0 * c_{TF} * c_{FT}`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    yule : double
        The Yule dissimilarity between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.yule([1, 0, 0], [0, 1, 0])
    2.0
    >>> distance.yule([1, 1, 0], [0, 1, 0])
    0.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    weights = None if w is None else _validate_weights(w)
    n_ff, n_ft, n_tf, n_tt = _nbool_correspond_all(u, v, w=weights)
    # R / 2 in the formula above.
    discordant = n_tf * n_ft
    if discordant != 0:
        return float(2.0 * discordant / (n_tt * n_ff + discordant))
    # Zero numerator: return 0.0 without evaluating the quotient.
    return 0.0

1351 

1352 

def dice(u, v, w=None):
    """
    Compute the Dice dissimilarity between two boolean 1-D arrays.

    The Dice dissimilarity between `u` and `v` is

    .. math::

         \\frac{c_{TF} + c_{FT}}
              {2c_{TT} + c_{FT} + c_{TF}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input 1-D array.
    v : (N,) array_like, bool
        Input 1-D array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    dice : double
        The Dice dissimilarity between 1-D arrays `u` and `v`.

    Notes
    -----
    This function computes the Dice dissimilarity index. To compute the
    Dice similarity index, convert one to the other with similarity =
    1 - dissimilarity.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.dice([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.dice([1, 0, 0], [1, 1, 0])
    0.3333333333333333
    >>> distance.dice([1, 0, 0], [2, 0, 0])
    -0.3333333333333333

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)

    if w is None and u.dtype == v.dtype == bool:
        # Unweighted boolean input: count positions where both are True.
        n_tt = (u & v).sum()
    else:
        # Otherwise work in a common numeric dtype that is at least int.
        common = np.result_type(int, u.dtype, v.dtype)
        u = u.astype(common)
        v = v.astype(common)
        overlap = u * v
        n_tt = overlap.sum() if w is None else (overlap * w).sum()

    n_ft, n_tf = _nbool_correspond_ft_tf(u, v, w=w)
    return float((n_tf + n_ft) / np.array(2.0 * n_tt + n_tf + n_ft))

1416 

1417 

def rogerstanimoto(u, v, w=None):
    """
    Compute the Rogers-Tanimoto dissimilarity between two boolean 1-D arrays.

    For two boolean 1-D arrays `u` and `v` the Rogers-Tanimoto
    dissimilarity is

    .. math::
       \\frac{R}
            {c_{TT} + c_{FF} + R}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    rogerstanimoto : double
        The Rogers-Tanimoto dissimilarity between vectors
        `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.rogerstanimoto([1, 0, 0], [0, 1, 0])
    0.8
    >>> distance.rogerstanimoto([1, 0, 0], [1, 1, 0])
    0.5
    >>> distance.rogerstanimoto([1, 0, 0], [2, 0, 0])
    -1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    weights = None if w is None else _validate_weights(w)
    n_ff, n_ft, n_tf, n_tt = _nbool_correspond_all(u, v, w=weights)
    # R in the formula above: twice the number of disagreeing positions.
    doubled_mismatch = 2.0 * (n_tf + n_ft)
    return float(doubled_mismatch) / float(n_tt + n_ff + doubled_mismatch)

1466 

1467 

def russellrao(u, v, w=None):
    """
    Compute the Russell-Rao dissimilarity between two boolean 1-D arrays.

    For two boolean 1-D arrays `u` and `v` the Russell-Rao dissimilarity is

    .. math::

      \\frac{n - c_{TT}}
           {n}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    russellrao : double
        The Russell-Rao dissimilarity between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.russellrao([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.russellrao([1, 0, 0], [1, 1, 0])
    0.6666666666666666
    >>> distance.russellrao([1, 0, 0], [2, 0, 0])
    0.3333333333333333

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        # Weighted: the weight total plays the role of n.
        w = _validate_weights(w)
        n_tt = (u * v * w).sum()
        total = w.sum()
    else:
        if u.dtype == v.dtype == bool:
            n_tt = (u & v).sum()
        else:
            n_tt = (u * v).sum()
        total = float(len(u))
    return float(total - n_tt) / total

1523 

1524 

def sokalmichener(u, v, w=None):
    """
    Compute the Sokal-Michener dissimilarity between two boolean 1-D arrays.

    For boolean 1-D arrays `u` and `v` the Sokal-Michener dissimilarity is

    .. math::

       \\frac{R}
            {S + R}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`, :math:`R = 2 * (c_{TF} + c_{FT})` and
    :math:`S = c_{FF} + c_{TT}`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    sokalmichener : double
        The Sokal-Michener dissimilarity between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.sokalmichener([1, 0, 0], [0, 1, 0])
    0.8
    >>> distance.sokalmichener([1, 0, 0], [1, 1, 0])
    0.5
    >>> distance.sokalmichener([1, 0, 0], [2, 0, 0])
    -1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    weights = None if w is None else _validate_weights(w)
    n_ff, n_ft, n_tf, n_tt = _nbool_correspond_all(u, v, w=weights)
    # R in the formula above; S is n_tt + n_ff.
    doubled_mismatch = 2.0 * (n_tf + n_ft)
    return float(doubled_mismatch) / float(n_tt + n_ff + doubled_mismatch)

1574 

1575 

def sokalsneath(u, v, w=None):
    """
    Compute the Sokal-Sneath dissimilarity between two boolean 1-D arrays.

    The Sokal-Sneath dissimilarity between `u` and `v` is

    .. math::

       \\frac{R}
            {c_{TT} + R}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    sokalsneath : double
        The Sokal-Sneath dissimilarity between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If the denominator :math:`c_{TT} + R` is zero.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.sokalsneath([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.sokalsneath([1, 0, 0], [1, 1, 0])
    0.66666666666666663
    >>> distance.sokalsneath([1, 0, 0], [2, 1, 0])
    0.0
    >>> distance.sokalsneath([1, 0, 0], [3, 1, 0])
    -2.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
        n_tt = (u * v * w).sum()
    elif u.dtype == v.dtype == bool:
        n_tt = (u & v).sum()
    else:
        n_tt = (u * v).sum()

    n_ft, n_tf = _nbool_correspond_ft_tf(u, v, w=w)
    # R in the formula above.
    doubled_mismatch = 2.0 * (n_tf + n_ft)
    denom = np.array(n_tt + doubled_mismatch)
    if not denom.any():
        raise ValueError('Sokal-Sneath dissimilarity is not defined for '
                         'vectors that are entirely false.')
    return float(doubled_mismatch) / denom

1634 

1635 

# Python-only correlation wrappers, attached to the _distance_wrap module
# under the same `<p|c>dist_<metric>_<typ>_wrap` names that the metric
# wrapper classes below look up with getattr.
_distance_wrap.pdist_correlation_double_wrap = _correlation_pdist_wrap
_distance_wrap.cdist_correlation_double_wrap = _correlation_cdist_wrap

# `_convert_to_type` with the output type fixed in advance.
_convert_to_bool = partial(_convert_to_type, out_type=bool)
_convert_to_double = partial(_convert_to_type, out_type=np.double)

1642 

1643 

@dataclasses.dataclass(frozen=True)
class CDistMetricWrapper:
    """Callable that runs ``cdist`` for one metric of the `_METRICS` registry.

    Without weights the work is handed to the `_distance_wrap` function
    named ``cdist_<metric_name>_<typ>_wrap``. When the validated keyword
    arguments carry a non-None ``w``, the metric's unvectorized
    ``dist_func`` is applied through `_cdist_callable` instead.
    """
    metric_name: str

    def __call__(self, XA, XB, *, out=None, **kwargs):
        XA = np.ascontiguousarray(XA)
        XB = np.ascontiguousarray(XB)
        rows_a, n_cols = XA.shape
        rows_b, _ = XB.shape
        name = self.metric_name
        info = _METRICS[name]
        XA, XB, typ, kwargs = _validate_cdist_input(
            XA, XB, rows_a, rows_b, n_cols, info, **kwargs)

        weights = kwargs.pop('w', None)
        if weights is None:
            result = _prepare_out_argument(out, np.double, (rows_a, rows_b))
            # Pick the wrapper matching this metric and the validated type.
            kernel = getattr(_distance_wrap, f'cdist_{name}_{typ}_wrap')
            kernel(XA, XB, result, **kwargs)
            return result

        # Weighted request: use the Python-level distance function.
        return _cdist_callable(
            XA, XB, metric=info.dist_func, out=out, w=weights, **kwargs)

1669 

1670 

@dataclasses.dataclass(frozen=True)
class CDistWeightedMetricWrapper:
    """Callable that runs ``cdist`` through a `_distance_wrap` function,
    choosing between an unweighted and a weighted variant.

    ``metric_name`` selects the function when no weights are given;
    ``weighted_metric`` selects it when the validated keyword arguments
    contain a non-None ``w``, which is then forwarded to that function.
    """
    metric_name: str
    weighted_metric: str

    def __call__(self, XA, XB, *, out=None, **kwargs):
        XA = np.ascontiguousarray(XA)
        XB = np.ascontiguousarray(XB)
        rows_a, n_cols = XA.shape
        rows_b, _ = XB.shape
        kernel_name = self.metric_name
        XA, XB, typ, kwargs = _validate_cdist_input(
            XA, XB, rows_a, rows_b, n_cols, _METRICS[kernel_name], **kwargs)
        result = _prepare_out_argument(out, np.double, (rows_a, rows_b))

        # A `w` of None is dropped; a real weight vector is put back and
        # switches the lookup to the weighted variant.
        weights = kwargs.pop('w', None)
        if weights is not None:
            kwargs['w'] = weights
            kernel_name = self.weighted_metric

        kernel = getattr(_distance_wrap, f'cdist_{kernel_name}_{typ}_wrap')
        kernel(XA, XB, result, **kwargs)
        return result

1695 

1696 

@dataclasses.dataclass(frozen=True)
class PDistMetricWrapper:
    """Callable that runs ``pdist`` for one metric of the `_METRICS` registry.

    Without weights the work is handed to the `_distance_wrap` function
    named ``pdist_<metric_name>_<typ>_wrap``, which fills a condensed
    output of length ``m * (m - 1) // 2``. When the validated keyword
    arguments carry a non-None ``w``, the metric's unvectorized
    ``dist_func`` is applied through `_pdist_callable` instead.
    """
    metric_name: str

    def __call__(self, X, *, out=None, **kwargs):
        X = np.ascontiguousarray(X)
        n_rows, n_cols = X.shape
        name = self.metric_name
        info = _METRICS[name]
        X, typ, kwargs = _validate_pdist_input(
            X, n_rows, n_cols, info, **kwargs)
        # Number of unordered pairs of rows.
        n_pairs = (n_rows * (n_rows - 1)) // 2

        weights = kwargs.pop('w', None)
        if weights is None:
            result = _prepare_out_argument(out, np.double, (n_pairs,))
            # Pick the wrapper matching this metric and the validated type.
            kernel = getattr(_distance_wrap, f'pdist_{name}_{typ}_wrap')
            kernel(X, result, **kwargs)
            return result

        # Weighted request: use the Python-level distance function.
        return _pdist_callable(
            X, metric=info.dist_func, out=out, w=weights, **kwargs)

1720 

1721 

@dataclasses.dataclass(frozen=True)
class PDistWeightedMetricWrapper:
    """Callable that runs ``pdist`` through a `_distance_wrap` function,
    choosing between an unweighted and a weighted variant.

    ``metric_name`` selects the function when no weights are given;
    ``weighted_metric`` selects it when the validated keyword arguments
    contain a non-None ``w``, which is then forwarded to that function.
    """
    metric_name: str
    weighted_metric: str

    def __call__(self, X, *, out=None, **kwargs):
        X = np.ascontiguousarray(X)
        n_rows, n_cols = X.shape
        kernel_name = self.metric_name
        X, typ, kwargs = _validate_pdist_input(
            X, n_rows, n_cols, _METRICS[kernel_name], **kwargs)
        # Condensed output: one entry per unordered pair of rows.
        n_pairs = (n_rows * (n_rows - 1)) // 2
        result = _prepare_out_argument(out, np.double, (n_pairs,))

        # A `w` of None is dropped; a real weight vector is put back and
        # switches the lookup to the weighted variant.
        weights = kwargs.pop('w', None)
        if weights is not None:
            kwargs['w'] = weights
            kernel_name = self.weighted_metric

        kernel = getattr(_distance_wrap, f'pdist_{kernel_name}_{typ}_wrap')
        kernel(X, result, **kwargs)
        return result

1745 

1746 

@dataclasses.dataclass(frozen=True)
class MetricInfo:
    """Immutable record describing one distance metric in the registry."""

    # Name of the Python distance function.
    canonical_name: str
    # Every accepted spelling of the metric, canonical_name included.
    aka: Set[str]
    # Unvectorized distance function.
    dist_func: Callable
    # Optimized cdist implementation.
    cdist_func: Callable
    # Optimized pdist implementation.
    pdist_func: Callable
    # Optional function that checks kwargs and computes default values;
    # called as f(X, m, n, **kwargs).
    validator: Optional[Callable] = None
    # Supported input types. X (pdist) and XA (cdist) are used to choose
    # the type; if there is no match the first entry is used.
    # Defaults to ['double'].
    types: List[str] = dataclasses.field(default_factory=lambda: ['double'])
    # True if the out array must be C-contiguous.
    requires_contiguous_out: bool = True

1768 

1769 

def _wrapped_metric(name, aka, dist_func, **extra):
    """Build the MetricInfo for a metric served by the generic wrappers.

    The cdist/pdist entry points are ``CDistMetricWrapper(name)`` and
    ``PDistMetricWrapper(name)``; `extra` carries the optional MetricInfo
    fields (``types``, ``validator``).
    """
    return MetricInfo(
        canonical_name=name,
        aka=aka,
        dist_func=dist_func,
        cdist_func=CDistMetricWrapper(name),
        pdist_func=PDistMetricWrapper(name),
        **extra,
    )


# Registry of implemented metrics, in alphabetical order of canonical name
# (except that 'kulsinski' precedes 'kulczynski1').
_METRIC_INFOS = [
    MetricInfo(
        canonical_name='braycurtis',
        aka={'braycurtis'},
        dist_func=braycurtis,
        cdist_func=_distance_pybind.cdist_braycurtis,
        pdist_func=_distance_pybind.pdist_braycurtis,
    ),
    MetricInfo(
        canonical_name='canberra',
        aka={'canberra'},
        dist_func=canberra,
        cdist_func=_distance_pybind.cdist_canberra,
        pdist_func=_distance_pybind.pdist_canberra,
    ),
    MetricInfo(
        canonical_name='chebyshev',
        aka={'chebychev', 'chebyshev', 'cheby', 'cheb', 'ch'},
        dist_func=chebyshev,
        cdist_func=_distance_pybind.cdist_chebyshev,
        pdist_func=_distance_pybind.pdist_chebyshev,
    ),
    MetricInfo(
        canonical_name='cityblock',
        aka={'cityblock', 'cblock', 'cb', 'c'},
        dist_func=cityblock,
        cdist_func=_distance_pybind.cdist_cityblock,
        pdist_func=_distance_pybind.pdist_cityblock,
    ),
    _wrapped_metric('correlation', {'correlation', 'co'}, correlation),
    _wrapped_metric('cosine', {'cosine', 'cos'}, cosine),
    _wrapped_metric('dice', {'dice'}, dice, types=['bool']),
    MetricInfo(
        canonical_name='euclidean',
        aka={'euclidean', 'euclid', 'eu', 'e'},
        dist_func=euclidean,
        cdist_func=_distance_pybind.cdist_euclidean,
        pdist_func=_distance_pybind.pdist_euclidean,
    ),
    # hamming is the only entry using the weighted wrappers.
    MetricInfo(
        canonical_name='hamming',
        aka={'matching', 'hamming', 'hamm', 'ha', 'h'},
        types=['double', 'bool'],
        validator=_validate_hamming_kwargs,
        dist_func=hamming,
        cdist_func=CDistWeightedMetricWrapper('hamming', 'hamming'),
        pdist_func=PDistWeightedMetricWrapper('hamming', 'hamming'),
    ),
    _wrapped_metric('jaccard', {'jaccard', 'jacc', 'ja', 'j'}, jaccard,
                    types=['double', 'bool']),
    _wrapped_metric('jensenshannon', {'jensenshannon', 'js'}, jensenshannon),
    _wrapped_metric('kulsinski', {'kulsinski'}, kulsinski, types=['bool']),
    _wrapped_metric('kulczynski1', {'kulczynski1'}, kulczynski1,
                    types=['bool']),
    _wrapped_metric('mahalanobis', {'mahalanobis', 'mahal', 'mah'},
                    mahalanobis, validator=_validate_mahalanobis_kwargs),
    MetricInfo(
        canonical_name='minkowski',
        aka={'minkowski', 'mi', 'm', 'pnorm'},
        validator=_validate_minkowski_kwargs,
        dist_func=minkowski,
        cdist_func=_distance_pybind.cdist_minkowski,
        pdist_func=_distance_pybind.pdist_minkowski,
    ),
    _wrapped_metric('rogerstanimoto', {'rogerstanimoto'}, rogerstanimoto,
                    types=['bool']),
    _wrapped_metric('russellrao', {'russellrao'}, russellrao, types=['bool']),
    _wrapped_metric('seuclidean', {'seuclidean', 'se', 's'}, seuclidean,
                    validator=_validate_seuclidean_kwargs),
    _wrapped_metric('sokalmichener', {'sokalmichener'}, sokalmichener,
                    types=['bool']),
    _wrapped_metric('sokalsneath', {'sokalsneath'}, sokalsneath,
                    types=['bool']),
    MetricInfo(
        canonical_name='sqeuclidean',
        aka={'sqeuclidean', 'sqe', 'sqeuclid'},
        dist_func=sqeuclidean,
        cdist_func=_distance_pybind.cdist_sqeuclidean,
        pdist_func=_distance_pybind.pdist_sqeuclidean,
    ),
    _wrapped_metric('yule', {'yule'}, yule, types=['bool']),
]

1941 

# Lookup tables derived from the registry.

# canonical name -> MetricInfo
_METRICS = {info.canonical_name: info for info in _METRIC_INFOS}

# alias -> MetricInfo, one entry per name in each metric's `aka` set
_METRIC_ALIAS = {alias: info
                 for info in _METRIC_INFOS
                 for alias in info.aka}

# canonical names, in registry order
_METRICS_NAMES = list(_METRICS)

# 'test_' + canonical name -> MetricInfo
_TEST_METRICS = {'test_' + info.canonical_name: info for info in _METRIC_INFOS}

1950 

1951 

1952def pdist(X, metric='euclidean', *, out=None, **kwargs): 

1953 """ 

1954 Pairwise distances between observations in n-dimensional space. 

1955 

1956 See Notes for common calling conventions. 

1957 

1958 Parameters 

1959 ---------- 

1960 X : array_like 

1961 An m by n array of m original observations in an 

1962 n-dimensional space. 

1963 metric : str or function, optional 

1964 The distance metric to use. The distance function can 

1965 be 'braycurtis', 'canberra', 'chebyshev', 'cityblock', 

1966 'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 

1967 'jaccard', 'jensenshannon', 'kulczynski1', 

1968 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 

1969 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 

1970 'sqeuclidean', 'yule'. 

1971 **kwargs : dict, optional 

1972 Extra arguments to `metric`: refer to each metric documentation for a 

1973 list of all possible arguments. 

1974 

1975 Some possible arguments: 

1976 

1977 p : scalar 

1978 The p-norm to apply for Minkowski, weighted and unweighted. 

1979 Default: 2. 

1980 

1981 w : ndarray 

1982 The weight vector for metrics that support weights (e.g., Minkowski). 

1983 

1984 V : ndarray 

1985 The variance vector for standardized Euclidean. 

1986 Default: var(X, axis=0, ddof=1) 

1987 

1988 VI : ndarray 

1989 The inverse of the covariance matrix for Mahalanobis. 

1990 Default: inv(cov(X.T)).T 

1991 

1992 out : ndarray. 

1993 The output array 

1994 If not None, condensed distance matrix Y is stored in this array. 

1995 

1996 Returns 

1997 ------- 

1998 Y : ndarray 

1999 Returns a condensed distance matrix Y. For each :math:`i` and :math:`j` 

2000 (where :math:`i<j<m`),where m is the number of original observations. 

2001 The metric ``dist(u=X[i], v=X[j])`` is computed and stored in entry ``m 

2002 * i + j - ((i + 2) * (i + 1)) // 2``. 

2003 

2004 See Also 

2005 -------- 

2006 squareform : converts between condensed distance matrices and 

2007 square distance matrices. 

2008 

2009 Notes 

2010 ----- 

2011 See ``squareform`` for information on how to calculate the index of 

2012 this entry or to convert the condensed distance matrix to a 

2013 redundant square matrix. 

2014 

2015 The following are common calling conventions. 

2016 

2017 1. ``Y = pdist(X, 'euclidean')`` 

2018 

2019 Computes the distance between m points using Euclidean distance 

2020 (2-norm) as the distance metric between the points. The points 

2021 are arranged as m n-dimensional row vectors in the matrix X. 

2022 

2023 2. ``Y = pdist(X, 'minkowski', p=2.)`` 

2024 

2025 Computes the distances using the Minkowski distance 

2026 :math:`\\|u-v\\|_p` (:math:`p`-norm) where :math:`p > 0` (note 

2027 that this is only a quasi-metric if :math:`0 < p < 1`). 

2028 

2029 3. ``Y = pdist(X, 'cityblock')`` 

2030 

2031 Computes the city block or Manhattan distance between the 

2032 points. 

2033 

2034 4. ``Y = pdist(X, 'seuclidean', V=None)`` 

2035 

2036 Computes the standardized Euclidean distance. The standardized 

2037 Euclidean distance between two n-vectors ``u`` and ``v`` is 

2038 

2039 .. math:: 

2040 

2041 \\sqrt{\\sum {(u_i-v_i)^2 / V[x_i]}} 

2042 

2043 

2044 V is the variance vector; V[i] is the variance computed over all 

2045 the i'th components of the points. If not passed, it is 

2046 automatically computed. 

2047 

2048 5. ``Y = pdist(X, 'sqeuclidean')`` 

2049 

2050 Computes the squared Euclidean distance :math:`\\|u-v\\|_2^2` between 

2051 the vectors. 

2052 

2053 6. ``Y = pdist(X, 'cosine')`` 

2054 

2055 Computes the cosine distance between vectors u and v, 

2056 

2057 .. math:: 

2058 

2059 1 - \\frac{u \\cdot v} 

2060 {{\\|u\\|}_2 {\\|v\\|}_2} 

2061 

2062 where :math:`\\|*\\|_2` is the 2-norm of its argument ``*``, and 

2063 :math:`u \\cdot v` is the dot product of ``u`` and ``v``. 

2064 

2065 7. ``Y = pdist(X, 'correlation')`` 

2066 

2067 Computes the correlation distance between vectors u and v. This is 

2068 

2069 .. math:: 

2070 

2071 1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})} 

2072 {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2} 

2073 

2074 where :math:`\\bar{v}` is the mean of the elements of vector v, 

2075 and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`. 

2076 

2077 8. ``Y = pdist(X, 'hamming')`` 

2078 

2079 Computes the normalized Hamming distance, or the proportion of 

2080 those vector elements between two n-vectors ``u`` and ``v`` 

2081 which disagree. To save memory, the matrix ``X`` can be of type 

2082 boolean. 

2083 

2084 9. ``Y = pdist(X, 'jaccard')`` 

2085 

2086 Computes the Jaccard distance between the points. Given two 

2087 vectors, ``u`` and ``v``, the Jaccard distance is the 

2088 proportion of those elements ``u[i]`` and ``v[i]`` that 

2089 disagree. 

2090 

2091 10. ``Y = pdist(X, 'jensenshannon')`` 

2092 

2093 Computes the Jensen-Shannon distance between two probability arrays. 

2094 Given two probability vectors, :math:`p` and :math:`q`, the 

2095 Jensen-Shannon distance is 

2096 

2097 .. math:: 

2098 

2099 \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}} 

2100 

2101 where :math:`m` is the pointwise mean of :math:`p` and :math:`q` 

2102 and :math:`D` is the Kullback-Leibler divergence. 

2103 

2104 11. ``Y = pdist(X, 'chebyshev')`` 

2105 

2106 Computes the Chebyshev distance between the points. The 

2107 Chebyshev distance between two n-vectors ``u`` and ``v`` is the 

2108 maximum norm-1 distance between their respective elements. More 

2109 precisely, the distance is given by 

2110 

2111 .. math:: 

2112 

2113 d(u,v) = \\max_i {|u_i-v_i|} 

2114 

2115 12. ``Y = pdist(X, 'canberra')`` 

2116 

2117 Computes the Canberra distance between the points. The 

2118 Canberra distance between two points ``u`` and ``v`` is 

2119 

2120 .. math:: 

2121 

2122 d(u,v) = \\sum_i \\frac{|u_i-v_i|} 

2123 {|u_i|+|v_i|} 

2124 

2125 

2126 13. ``Y = pdist(X, 'braycurtis')`` 

2127 

2128 Computes the Bray-Curtis distance between the points. The 

2129 Bray-Curtis distance between two points ``u`` and ``v`` is 

2130 

2131 

2132 .. math:: 

2133 

2134 d(u,v) = \\frac{\\sum_i {|u_i-v_i|}} 

2135 {\\sum_i {|u_i+v_i|}} 

2136 

2137 14. ``Y = pdist(X, 'mahalanobis', VI=None)`` 

2138 

2139 Computes the Mahalanobis distance between the points. The 

2140 Mahalanobis distance between two points ``u`` and ``v`` is 

2141 :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI`` 

2142 variable) is the inverse covariance. If ``VI`` is not None, 

2143 ``VI`` will be used as the inverse covariance matrix. 

2144 

2145 15. ``Y = pdist(X, 'yule')`` 

2146 

2147 Computes the Yule distance between each pair of boolean 

2148 vectors. (see yule function documentation) 

2149 

2150 16. ``Y = pdist(X, 'matching')`` 

2151 

2152 Synonym for 'hamming'. 

2153 

2154 17. ``Y = pdist(X, 'dice')`` 

2155 

2156 Computes the Dice distance between each pair of boolean 

2157 vectors. (see dice function documentation) 

2158 

2159 18. ``Y = pdist(X, 'kulczynski1')`` 

2160 

2161 Computes the kulczynski1 distance between each pair of 

2162 boolean vectors. (see kulczynski1 function documentation) 

2163 

2164 19. ``Y = pdist(X, 'rogerstanimoto')`` 

2165 

2166 Computes the Rogers-Tanimoto distance between each pair of 

2167 boolean vectors. (see rogerstanimoto function documentation) 

2168 

2169 20. ``Y = pdist(X, 'russellrao')`` 

2170 

2171 Computes the Russell-Rao distance between each pair of 

2172 boolean vectors. (see russellrao function documentation) 

2173 

2174 21. ``Y = pdist(X, 'sokalmichener')`` 

2175 

2176 Computes the Sokal-Michener distance between each pair of 

2177 boolean vectors. (see sokalmichener function documentation) 

2178 

2179 22. ``Y = pdist(X, 'sokalsneath')`` 

2180 

2181 Computes the Sokal-Sneath distance between each pair of 

2182 boolean vectors. (see sokalsneath function documentation) 

2183 

2184 23. ``Y = pdist(X, 'kulczynski1')`` 

2185 

2186 Computes the Kulczynski 1 distance between each pair of 

2187 boolean vectors. (see kulczynski1 function documentation) 

2188 

2189 24. ``Y = pdist(X, f)`` 

2190 

2191 Computes the distance between all pairs of vectors in X 

2192 using the user supplied 2-arity function f. For example, 

2193 Euclidean distance between the vectors could be computed 

2194 as follows:: 

2195 

2196 dm = pdist(X, lambda u, v: np.sqrt(((u-v)**2).sum())) 

2197 

2198 Note that you should avoid passing a reference to one of 

2199 the distance functions defined in this library. For example,:: 

2200 

2201 dm = pdist(X, sokalsneath) 

2202 

2203 would calculate the pair-wise distances between the vectors in 

2204 X using the Python function sokalsneath. This would result in 

2205 sokalsneath being called :math:`{n \\choose 2}` times, which 

2206 is inefficient. Instead, the optimized C version is more 

2207 efficient, and we call it using the following syntax.:: 

2208 

2209 dm = pdist(X, 'sokalsneath') 

2210 

2211 """ 

2212 # You can also call this as: 

2213 # Y = pdist(X, 'test_abc') 

2214 # where 'abc' is the metric being tested. This computes the distance 

2215 # between all pairs of vectors in X using the distance metric 'abc' but 

2216 # with a more succinct, verifiable, but less efficient implementation. 

2217 

2218 X = _asarray_validated(X, sparse_ok=False, objects_ok=True, mask_ok=True, 

2219 check_finite=False) 

2220 

2221 s = X.shape 

2222 if len(s) != 2: 

2223 raise ValueError('A 2-dimensional array must be passed.') 

2224 

2225 m, n = s 

2226 

2227 if callable(metric): 

2228 mstr = getattr(metric, '__name__', 'UnknownCustomMetric') 

2229 metric_info = _METRIC_ALIAS.get(mstr, None) 

2230 

2231 if metric_info is not None: 

2232 X, typ, kwargs = _validate_pdist_input( 

2233 X, m, n, metric_info, **kwargs) 

2234 

2235 return _pdist_callable(X, metric=metric, out=out, **kwargs) 

2236 elif isinstance(metric, str): 

2237 mstr = metric.lower() 

2238 metric_info = _METRIC_ALIAS.get(mstr, None) 

2239 

2240 if metric_info is not None: 

2241 pdist_fn = metric_info.pdist_func 

2242 return pdist_fn(X, out=out, **kwargs) 

2243 elif mstr.startswith("test_"): 

2244 metric_info = _TEST_METRICS.get(mstr, None) 

2245 if metric_info is None: 

2246 raise ValueError(f'Unknown "Test" Distance Metric: {mstr[5:]}') 

2247 X, typ, kwargs = _validate_pdist_input( 

2248 X, m, n, metric_info, **kwargs) 

2249 return _pdist_callable( 

2250 X, metric=metric_info.dist_func, out=out, **kwargs) 

2251 else: 

2252 raise ValueError('Unknown Distance Metric: %s' % mstr) 

2253 else: 

2254 raise TypeError('2nd argument metric must be a string identifier ' 

2255 'or a function.') 

2256 

2257 

def squareform(X, force="no", checks=True):
    """
    Convert between the condensed (vector) and redundant (square) forms
    of a distance matrix.

    A 1-D input is expanded to a square matrix; a 2-D input is condensed
    to a vector.

    Parameters
    ----------
    X : array_like
        Either a condensed or redundant distance matrix.
    force : str, optional
        As with MATLAB(TM), ``'tovector'`` asserts that the input is a
        distance matrix and ``'tomatrix'`` asserts that it is a distance
        vector; a ``ValueError`` is raised when the input does not have
        the matching number of dimensions. Any other value performs no
        such assertion. The comparison is case-insensitive.
    checks : bool, optional
        If set to False, no checks will be made for matrix
        symmetry nor zero diagonals. This is useful if it is known that
        ``X - X.T`` is small and ``diag(X)`` is close to zero.
        These values are ignored any way so they do not disrupt the
        squareform transformation.

    Returns
    -------
    Y : ndarray
        If a condensed distance matrix is passed, a redundant one is
        returned, or if a redundant one is passed, a condensed distance
        matrix is returned. The result has the same dtype as the input.

    Raises
    ------
    ValueError
        If the input is neither 1-D nor 2-D, if a 2-D input is not
        square, if the length of a 1-D input is not a binomial
        coefficient ``n choose 2``, or if the input contradicts `force`.

    Notes
    -----
    1. ``v = squareform(X)``

       Given a square n-by-n symmetric distance matrix ``X``,
       ``v = squareform(X)`` returns a ``n * (n-1) / 2``
       (i.e. binomial coefficient n choose 2) sized vector `v`
       where :math:`v[{n \\choose 2} - {n-i \\choose 2} + (j-i-1)]`
       is the distance between distinct points ``i`` and ``j``.
       If ``X`` is non-square or asymmetric, an error is raised.

    2. ``X = squareform(v)``

       Given a ``n * (n-1) / 2`` sized vector ``v``
       for some integer ``n >= 1`` encoding distances as described,
       ``X = squareform(v)`` returns a n-by-n distance matrix ``X``.
       The ``X[i, j]`` and ``X[j, i]`` values are set to
       :math:`v[{n \\choose 2} - {n-i \\choose 2} + (j-i-1)]`
       and all diagonal elements are zero.

    In SciPy 0.19.0, ``squareform`` stopped casting all input types to
    float64, and started returning arrays of the same dtype as the input.

    """
    X = np.ascontiguousarray(X)
    ndim = len(X.shape)

    # Honour an explicit direction request before doing any work.
    mode = force.lower()
    if mode == 'tomatrix' and ndim != 1:
        raise ValueError("Forcing 'tomatrix' but input X is not a "
                         "distance vector.")
    if mode == 'tovector' and ndim != 2:
        raise ValueError("Forcing 'tovector' but input X is not a "
                         "distance matrix.")

    if ndim == 1:
        # Vector -> square matrix, i.e. X = squareform(v).
        length = X.shape[0]
        if length == 0:
            return np.zeros((1, 1), dtype=X.dtype)

        # Candidate number of observations: the closest integer above
        # sqrt(2 * length). The vector is valid only if
        # side * (side - 1) / 2 == length.
        side = int(np.ceil(np.sqrt(length * 2)))
        if side * (side - 1) != length * 2:
            raise ValueError('Incompatible vector size. It must be a binomial '
                             'coefficient n choose 2 for some integer n >= 2.')

        square = np.zeros((side, side), dtype=X.dtype)

        # The C code works from the dimensions rather than the strides, so
        # hand it the array returned by the copy helper.
        X = _copy_array_if_base_present(X)
        _distance_wrap.to_squareform_from_vector_wrap(square, X)
        return square

    if ndim == 2:
        # Square matrix -> vector, i.e. v = squareform(X).
        rows, cols = X.shape
        if rows != cols:
            raise ValueError('The matrix argument must be square.')
        if checks:
            is_valid_dm(X, throw=True, name='X')

        # Zero or one observation: there are no pairwise distances.
        if rows <= 1:
            return np.array([], dtype=X.dtype)

        condensed = np.zeros((rows * (rows - 1)) // 2, dtype=X.dtype)

        # The C code works from the dimensions rather than the strides, so
        # hand it the array returned by the copy helper.
        X = _copy_array_if_base_present(X)
        _distance_wrap.to_vector_from_squareform_wrap(X, condensed)
        return condensed

    raise ValueError(('The first argument must be one or two dimensional '
                      'array. A %d-dimensional array is not '
                      'permitted') % ndim)

2376 

2377 

def is_valid_dm(D, tol=0.0, throw=False, name="D", warning=False):
    """
    Return True if input array is a valid distance matrix.

    Distance matrices must be 2-dimensional, square numpy arrays.
    They must have a zero-diagonal, and they must be symmetric.

    Parameters
    ----------
    D : array_like
        The candidate object to test for validity.
    tol : float, optional
        The distance matrix should be symmetric. `tol` is the maximum
        absolute difference between entries ``ij`` and ``ji`` for the
        distance metric to be considered symmetric. It is also the
        maximum absolute value allowed on the diagonal.
    throw : bool, optional
        An exception is thrown if the distance matrix passed is not valid.
    name : str, optional
        The name of the variable to checked. This is useful if
        throw is set to True so the offending variable can be identified
        in the exception message when an exception is thrown.
    warning : bool, optional
        Instead of throwing an exception, a warning message is
        raised.

    Returns
    -------
    valid : bool
        True if the variable `D` passed is a valid distance matrix.

    Raises
    ------
    ValueError
        If `throw` is True and `D` is not two-dimensional, not square,
        not symmetric, or has a non-zero diagonal (beyond `tol`).

    Notes
    -----
    Small numerical differences in `D` and `D.T` and non-zeroness of
    the diagonal are ignored if they are within the tolerance specified
    by `tol`.

    """
    D = np.asarray(D, order='c')

    def _invalid(detail):
        # Build the error lazily so `name` is only formatted on failure.
        label = ("Distance matrix '%s'" % name) if name else 'Distance matrix'
        return ValueError(label + detail)

    valid = True
    try:
        if D.ndim != 2:
            raise _invalid(' must have shape=2 (i.e. be two-dimensional).')
        # Without this check a (1, n) array broadcasts against its (n, 1)
        # transpose and could be reported as a valid distance matrix.
        if D.shape[0] != D.shape[1]:
            raise _invalid(' must be square.')

        diagonal = np.diagonal(D)
        if tol == 0.0:
            # Exact comparison; kept separate from the tolerance branch so
            # that equal infinite entries still compare as symmetric.
            if not (D == D.T).all():
                raise _invalid(' must be symmetric.')
            if not (diagonal == 0).all():
                raise _invalid(' diagonal must be zero.')
        else:
            if not (np.abs(D - D.T) <= tol).all():
                raise _invalid(
                    ' must be symmetric within tolerance %5.5f.' % tol)
            # Use the magnitude: a large negative diagonal entry is just as
            # far from zero as a large positive one.
            if not (np.abs(diagonal) <= tol).all():
                raise _invalid(' diagonal must be close to zero within '
                               'tolerance %5.5f.' % tol)
    except Exception as e:
        # Any failure (including errors raised by the comparisons
        # themselves) marks the matrix as invalid unless `throw` is set.
        if throw:
            raise
        if warning:
            warnings.warn(str(e))
        valid = False
    return valid

2464 

2465 

def is_valid_y(y, warning=False, throw=False, name=None):
    """
    Return True if the input array is a valid condensed distance matrix.

    Condensed distance matrices must be 1-dimensional numpy arrays.
    Their length must be a binomial coefficient :math:`{n \\choose 2}`
    for some positive integer n.

    Parameters
    ----------
    y : array_like
        The condensed distance matrix.
    warning : bool, optional
        Invokes a warning if the variable passed is not a valid
        condensed distance matrix. The warning message explains why
        the distance matrix is not valid. `name` is used when
        referencing the offending variable.
    throw : bool, optional
        Throws an exception if the variable passed is not a valid
        condensed distance matrix.
    name : str, optional
        Used when referencing the offending variable in the
        warning or exception message.

    Returns
    -------
    valid : bool
        True if `y` is one-dimensional and its length is a binomial
        coefficient ``k choose 2``; False otherwise (when `throw` is
        False).

    Raises
    ------
    ValueError
        If `throw` is True and `y` is not a valid condensed distance
        matrix.

    """
    y = np.asarray(y, order='c')

    def _quoted():
        # Optional " 'name'" fragment inserted into the error messages.
        return (" '%s'" % name) if name else ''

    valid = True
    try:
        if y.ndim != 1:
            raise ValueError('Condensed distance matrix' + _quoted() +
                             ' must have shape=1 (i.e. be one-dimensional).')
        n = y.shape[0]
        # Candidate number of observations; y is valid only when
        # d * (d - 1) / 2 == n. Compared in integer arithmetic to avoid a
        # needless float division.
        d = int(np.ceil(np.sqrt(n * 2)))
        if d * (d - 1) != 2 * n:
            raise ValueError('Length n of condensed distance matrix' +
                             _quoted() +
                             ' must be a binomial coefficient, i.e. '
                             'there must be a k such that '
                             '(k \\choose 2)=n)!')
    except Exception as e:
        if throw:
            raise
        if warning:
            warnings.warn(str(e))
        valid = False
    return valid

2521 

2522 

def num_obs_dm(d):
    """
    Return the number of original observations that correspond to a
    square, redundant distance matrix.

    Parameters
    ----------
    d : array_like
        The target distance matrix.

    Returns
    -------
    num_obs_dm : int
        The number of observations in the redundant distance matrix,
        i.e. the length of its first axis.

    Raises
    ------
    ValueError
        If `d` fails the `is_valid_dm` validation performed with an
        infinite tolerance.

    """
    matrix = np.asarray(d, order='c')
    # An infinite tolerance relaxes the symmetry and zero-diagonal
    # comparisons; throw=True turns any remaining failure into an exception.
    is_valid_dm(matrix, tol=np.inf, throw=True, name='d')
    n_observations = matrix.shape[0]
    return n_observations

2542 

2543 

def num_obs_y(Y):
    """
    Return the number of original observations that correspond to a
    condensed distance matrix.

    Parameters
    ----------
    Y : array_like
        Condensed distance matrix.

    Returns
    -------
    n : int
        The number of observations in the condensed distance matrix `Y`.

    Raises
    ------
    ValueError
        If `Y` is not a valid condensed distance matrix, or if it is
        empty (the number of observations is then undetermined).

    """
    condensed = np.asarray(Y, order='c')
    is_valid_y(condensed, throw=True, name='Y')

    length = condensed.shape[0]
    if length == 0:
        raise ValueError("The number of observations cannot be determined on "
                         "an empty distance matrix.")

    # Invert length == n * (n - 1) / 2 for n, then confirm the round trip.
    n_observations = int(np.ceil(np.sqrt(length * 2)))
    if (n_observations * (n_observations - 1) / 2) != length:
        raise ValueError("Invalid condensed distance matrix passed. Must be "
                         "some k where k=(n choose 2) for some n >= 2.")
    return n_observations

2571 

2572 

2573def _prepare_out_argument(out, dtype, expected_shape): 

2574 if out is None: 

2575 return np.empty(expected_shape, dtype=dtype) 

2576 

2577 if out.shape != expected_shape: 

2578 raise ValueError("Output array has incorrect shape.") 

2579 if not out.flags.c_contiguous: 

2580 raise ValueError("Output array must be C-contiguous.") 

2581 if out.dtype != np.double: 

2582 raise ValueError("Output array must be double type.") 

2583 return out 

2584 

2585 

def _pdist_callable(X, *, out, metric, **kwargs):
    """Fill a condensed distance vector by calling `metric` on row pairs.

    `metric(X[i], X[j], **kwargs)` is evaluated for every ``i < j`` in
    row-major order of ``(i, j)`` and stored consecutively in the
    output, which is `out` when given or a newly allocated double array.
    """
    num_rows = X.shape[0]
    result = _prepare_out_argument(
        out, np.double, ((num_rows * (num_rows - 1)) // 2,))

    position = 0
    for i in range(num_rows - 1):
        j = i + 1
        while j < num_rows:
            result[position] = metric(X[i], X[j], **kwargs)
            position += 1
            j += 1
    return result

2596 

2597 

def _cdist_callable(XA, XB, *, out, metric, **kwargs):
    """Fill a distance matrix by calling `metric` on every row pair.

    Entry ``[i, j]`` of the result is ``metric(XA[i], XB[j], **kwargs)``.
    The result is `out` when given, otherwise a newly allocated double
    array of shape ``(XA.shape[0], XB.shape[0])``.
    """
    rows_a, rows_b = XA.shape[0], XB.shape[0]
    result = _prepare_out_argument(out, np.double, (rows_a, rows_b))

    i = 0
    while i < rows_a:
        for j in range(rows_b):
            result[i, j] = metric(XA[i], XB[j], **kwargs)
        i += 1
    return result

2606 

2607 

def cdist(XA, XB, metric='euclidean', *, out=None, **kwargs):
    """
    Compute distance between each pair of the two collections of inputs.

    See Notes for common calling conventions.

    Parameters
    ----------
    XA : array_like
        An :math:`m_A` by :math:`n` array of :math:`m_A`
        original observations in an :math:`n`-dimensional space.
        Inputs are converted to float type.
    XB : array_like
        An :math:`m_B` by :math:`n` array of :math:`m_B`
        original observations in an :math:`n`-dimensional space.
        Inputs are converted to float type.
    metric : str or callable, optional
        The distance metric to use. If a string, the distance function can be
        'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation',
        'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon',
        'kulczynski1', 'mahalanobis', 'matching', 'minkowski',
        'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
        'sokalsneath', 'sqeuclidean', 'yule'. String names are matched
        case-insensitively.
    out : ndarray, optional
        The output array.
        If not None, the distance matrix Y is stored in this array.
    **kwargs : dict, optional
        Extra arguments to `metric`: refer to each metric documentation for a
        list of all possible arguments.

        Some possible arguments:

        p : scalar
            The p-norm to apply for Minkowski, weighted and unweighted.
            Default: 2.

        w : array_like
            The weight vector for metrics that support weights
            (e.g., Minkowski).

        V : array_like
            The variance vector for standardized Euclidean.
            Default: var(vstack([XA, XB]), axis=0, ddof=1)

        VI : array_like
            The inverse of the covariance matrix for Mahalanobis.
            Default: inv(cov(vstack([XA, XB]).T)).T

    Returns
    -------
    Y : ndarray
        A :math:`m_A` by :math:`m_B` distance matrix is returned.
        For each :math:`i` and :math:`j`, the metric
        ``dist(u=XA[i], v=XB[j])`` is computed and stored in the
        :math:`ij` th entry.

    Raises
    ------
    ValueError
        An exception is thrown if `XA` or `XB` is not 2-dimensional, if
        `XA` and `XB` do not have the same number of columns, or if
        `metric` is a string that names no known metric.
    TypeError
        If `metric` is neither a string nor a callable.

    Notes
    -----
    The following are common calling conventions:

    1. ``Y = cdist(XA, XB, 'euclidean')``

       Computes the distance between each of the :math:`m_A` points in
       ``XA`` and each of the :math:`m_B` points in ``XB`` using
       Euclidean distance (2-norm) as the distance metric between the
       points. The points are arranged as :math:`n`-dimensional row
       vectors in the matrices ``XA`` and ``XB``.

    2. ``Y = cdist(XA, XB, 'minkowski', p=2.)``

       Computes the distances using the Minkowski distance
       :math:`\\|u-v\\|_p` (:math:`p`-norm) where :math:`p > 0` (note
       that this is only a quasi-metric if :math:`0 < p < 1`).

    3. ``Y = cdist(XA, XB, 'cityblock')``

       Computes the city block or Manhattan distance between the
       points.

    4. ``Y = cdist(XA, XB, 'seuclidean', V=None)``

       Computes the standardized Euclidean distance. The standardized
       Euclidean distance between two n-vectors ``u`` and ``v`` is

       .. math::

          \\sqrt{\\sum {(u_i-v_i)^2 / V[x_i]}}.

       V is the variance vector; V[i] is the variance computed over all
       the i'th components of the points. If not passed, it is
       automatically computed.

    5. ``Y = cdist(XA, XB, 'sqeuclidean')``

       Computes the squared Euclidean distance :math:`\\|u-v\\|_2^2` between
       the vectors.

    6. ``Y = cdist(XA, XB, 'cosine')``

       Computes the cosine distance between vectors u and v,

       .. math::

          1 - \\frac{u \\cdot v}
                   {{\\|u\\|}_2 {\\|v\\|}_2}

       where :math:`\\|*\\|_2` is the 2-norm of its argument ``*``, and
       :math:`u \\cdot v` is the dot product of :math:`u` and :math:`v`.

    7. ``Y = cdist(XA, XB, 'correlation')``

       Computes the correlation distance between vectors u and v. This is

       .. math::

          1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
                   {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2}

       where :math:`\\bar{v}` is the mean of the elements of vector v,
       and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.

    8. ``Y = cdist(XA, XB, 'hamming')``

       Computes the normalized Hamming distance, or the proportion of
       those vector elements between two n-vectors ``u`` and ``v``
       which disagree. To save memory, the matrices ``XA`` and ``XB``
       can be of type boolean.

    9. ``Y = cdist(XA, XB, 'jaccard')``

       Computes the Jaccard distance between the points. Given two
       vectors, ``u`` and ``v``, the Jaccard distance is the
       proportion of those elements ``u[i]`` and ``v[i]`` that
       disagree where at least one of them is non-zero.

    10. ``Y = cdist(XA, XB, 'jensenshannon')``

        Computes the Jensen-Shannon distance between two probability arrays.
        Given two probability vectors, :math:`p` and :math:`q`, the
        Jensen-Shannon distance is

        .. math::

           \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}

        where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
        and :math:`D` is the Kullback-Leibler divergence.

    11. ``Y = cdist(XA, XB, 'chebyshev')``

        Computes the Chebyshev distance between the points. The
        Chebyshev distance between two n-vectors ``u`` and ``v`` is the
        maximum norm-1 distance between their respective elements. More
        precisely, the distance is given by

        .. math::

           d(u,v) = \\max_i {|u_i-v_i|}.

    12. ``Y = cdist(XA, XB, 'canberra')``

        Computes the Canberra distance between the points. The
        Canberra distance between two points ``u`` and ``v`` is

        .. math::

          d(u,v) = \\sum_i \\frac{|u_i-v_i|}
                               {|u_i|+|v_i|}.

    13. ``Y = cdist(XA, XB, 'braycurtis')``

        Computes the Bray-Curtis distance between the points. The
        Bray-Curtis distance between two points ``u`` and ``v`` is

        .. math::

             d(u,v) = \\frac{\\sum_i (|u_i-v_i|)}
                           {\\sum_i (|u_i+v_i|)}

    14. ``Y = cdist(XA, XB, 'mahalanobis', VI=None)``

        Computes the Mahalanobis distance between the points. The
        Mahalanobis distance between two points ``u`` and ``v`` is
        :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI``
        variable) is the inverse covariance. If ``VI`` is not None,
        ``VI`` will be used as the inverse covariance matrix.

    15. ``Y = cdist(XA, XB, 'yule')``

        Computes the Yule distance between the boolean
        vectors. (see `yule` function documentation)

    16. ``Y = cdist(XA, XB, 'matching')``

        Synonym for 'hamming'.

    17. ``Y = cdist(XA, XB, 'dice')``

        Computes the Dice distance between the boolean vectors. (see
        `dice` function documentation)

    18. ``Y = cdist(XA, XB, 'kulczynski1')``

        Computes the kulczynski distance between the boolean
        vectors. (see `kulczynski1` function documentation)

    19. ``Y = cdist(XA, XB, 'rogerstanimoto')``

        Computes the Rogers-Tanimoto distance between the boolean
        vectors. (see `rogerstanimoto` function documentation)

    20. ``Y = cdist(XA, XB, 'russellrao')``

        Computes the Russell-Rao distance between the boolean
        vectors. (see `russellrao` function documentation)

    21. ``Y = cdist(XA, XB, 'sokalmichener')``

        Computes the Sokal-Michener distance between the boolean
        vectors. (see `sokalmichener` function documentation)

    22. ``Y = cdist(XA, XB, 'sokalsneath')``

        Computes the Sokal-Sneath distance between the vectors. (see
        `sokalsneath` function documentation)

    23. ``Y = cdist(XA, XB, f)``

        Computes the distance between each row of ``XA`` and each row
        of ``XB`` using the user supplied 2-arity function f. For
        example, Euclidean distance between the vectors could be
        computed as follows::

          dm = cdist(XA, XB, lambda u, v: np.sqrt(((u-v)**2).sum()))

        Note that you should avoid passing a reference to one of
        the distance functions defined in this library. For example,::

          dm = cdist(XA, XB, sokalsneath)

        would calculate the pair-wise distances between the vectors in
        ``XA`` and ``XB`` using the Python function `sokalsneath`. This
        would result in sokalsneath being called once for every pair of
        rows, i.e. :math:`m_A m_B` times, which is inefficient. Instead,
        the optimized C version is more efficient, and we call it using
        the following syntax::

          dm = cdist(XA, XB, 'sokalsneath')

    Examples
    --------
    Find the Euclidean distances between four 2-D coordinates:

    >>> from scipy.spatial import distance
    >>> import numpy as np
    >>> coords = [(35.0456, -85.2672),
    ...           (35.1174, -89.9711),
    ...           (35.9728, -83.9422),
    ...           (36.1667, -86.7833)]
    >>> distance.cdist(coords, coords, 'euclidean')
    array([[ 0.    ,  4.7044,  1.6172,  1.8856],
           [ 4.7044,  0.    ,  6.0893,  3.3561],
           [ 1.6172,  6.0893,  0.    ,  2.8477],
           [ 1.8856,  3.3561,  2.8477,  0.    ]])


    Find the Manhattan distance from a 3-D point to the corners of the unit
    cube:

    >>> a = np.array([[0, 0, 0],
    ...               [0, 0, 1],
    ...               [0, 1, 0],
    ...               [0, 1, 1],
    ...               [1, 0, 0],
    ...               [1, 0, 1],
    ...               [1, 1, 0],
    ...               [1, 1, 1]])
    >>> b = np.array([[ 0.1,  0.2,  0.4]])
    >>> distance.cdist(a, b, 'cityblock')
    array([[ 0.7],
           [ 0.9],
           [ 1.3],
           [ 1.5],
           [ 1.5],
           [ 1.7],
           [ 2.1],
           [ 2.3]])

    """
    # You can also call this as:
    #     Y = cdist(XA, XB, 'test_abc')
    # where 'abc' is the metric being tested.  This computes the distance
    # between all pairs of vectors in XA and XB using the distance metric 'abc'
    # but with a more succinct, verifiable, but less efficient implementation.

    XA = np.asarray(XA)
    XB = np.asarray(XB)

    shape_a = XA.shape
    shape_b = XB.shape

    if len(shape_a) != 2:
        raise ValueError('XA must be a 2-dimensional array.')
    if len(shape_b) != 2:
        raise ValueError('XB must be a 2-dimensional array.')
    if shape_a[1] != shape_b[1]:
        raise ValueError('XA and XB must have the same number of columns '
                         '(i.e. feature dimension.)')

    mA, n = shape_a
    mB = shape_b[0]

    if callable(metric):
        # A callable whose __name__ matches a known metric gets the same
        # input validation as that metric; the callable itself is still
        # what gets evaluated on each pair of rows.
        metric_name = getattr(metric, '__name__', 'Unknown')
        info = _METRIC_ALIAS.get(metric_name, None)
        if info is not None:
            XA, XB, _, kwargs = _validate_cdist_input(
                XA, XB, mA, mB, n, info, **kwargs)
        return _cdist_callable(XA, XB, metric=metric, out=out, **kwargs)

    if not isinstance(metric, str):
        raise TypeError('2nd argument metric must be a string identifier '
                        'or a function.')

    metric_name = metric.lower()
    info = _METRIC_ALIAS.get(metric_name, None)
    if info is not None:
        # Known metric name: delegate to its dedicated cdist implementation.
        return info.cdist_func(XA, XB, out=out, **kwargs)

    if not metric_name.startswith("test_"):
        raise ValueError('Unknown Distance Metric: %s' % metric_name)

    # 'test_<name>': run the plain Python distance function pair by pair.
    info = _TEST_METRICS.get(metric_name, None)
    if info is None:
        raise ValueError(
            f'Unknown "Test" Distance Metric: {metric_name[5:]}')
    XA, XB, _, kwargs = _validate_cdist_input(
        XA, XB, mA, mB, n, info, **kwargs)
    return _cdist_callable(
        XA, XB, metric=info.dist_func, out=out, **kwargs)