Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/spatial/distance.py: 15%
649 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
1"""
2Distance computations (:mod:`scipy.spatial.distance`)
3=====================================================
5.. sectionauthor:: Damian Eads
7Function reference
8------------------
10Distance matrix computation from a collection of raw observation vectors
11stored in a rectangular array.
13.. autosummary::
14 :toctree: generated/
16 pdist -- pairwise distances between observation vectors.
17 cdist -- distances between two collections of observation vectors
18 squareform -- convert distance matrix to a condensed one and vice versa
19 directed_hausdorff -- directed Hausdorff distance between arrays
21Predicates for checking the validity of distance matrices, both
22condensed and redundant. Also contained in this module are functions
23for computing the number of observations in a distance matrix.
25.. autosummary::
26 :toctree: generated/
28 is_valid_dm -- checks for a valid distance matrix
29 is_valid_y -- checks for a valid condensed distance matrix
30 num_obs_dm -- # of observations in a distance matrix
31 num_obs_y -- # of observations in a condensed distance matrix
33Distance functions between two numeric vectors ``u`` and ``v``. Computing
34distances over a large collection of vectors is inefficient for these
35functions. Use ``pdist`` for this purpose.
37.. autosummary::
38 :toctree: generated/
40 braycurtis -- the Bray-Curtis distance.
41 canberra -- the Canberra distance.
42 chebyshev -- the Chebyshev distance.
43 cityblock -- the Manhattan distance.
44 correlation -- the Correlation distance.
45 cosine -- the Cosine distance.
46 euclidean -- the Euclidean distance.
47 jensenshannon -- the Jensen-Shannon distance.
48 mahalanobis -- the Mahalanobis distance.
49 minkowski -- the Minkowski distance.
50 seuclidean -- the normalized Euclidean distance.
51 sqeuclidean -- the squared Euclidean distance.
53Distance functions between two boolean vectors (representing sets) ``u`` and
54``v``. As in the case of numerical vectors, ``pdist`` is more efficient for
55computing the distances between all pairs.
57.. autosummary::
58 :toctree: generated/
60 dice -- the Dice dissimilarity.
61 hamming -- the Hamming distance.
62 jaccard -- the Jaccard distance.
63 kulsinski -- the Kulsinski distance.
64 kulczynski1 -- the Kulczynski 1 distance.
65 rogerstanimoto -- the Rogers-Tanimoto dissimilarity.
66 russellrao -- the Russell-Rao dissimilarity.
67 sokalmichener -- the Sokal-Michener dissimilarity.
68 sokalsneath -- the Sokal-Sneath dissimilarity.
69 yule -- the Yule dissimilarity.
71:func:`hamming` also operates over discrete numerical vectors.
72"""
74# Copyright (C) Damian Eads, 2007-2008. New BSD License.
76__all__ = [
77 'braycurtis',
78 'canberra',
79 'cdist',
80 'chebyshev',
81 'cityblock',
82 'correlation',
83 'cosine',
84 'dice',
85 'directed_hausdorff',
86 'euclidean',
87 'hamming',
88 'is_valid_dm',
89 'is_valid_y',
90 'jaccard',
91 'jensenshannon',
92 'kulsinski',
93 'kulczynski1',
94 'mahalanobis',
95 'minkowski',
96 'num_obs_dm',
97 'num_obs_y',
98 'pdist',
99 'rogerstanimoto',
100 'russellrao',
101 'seuclidean',
102 'sokalmichener',
103 'sokalsneath',
104 'sqeuclidean',
105 'squareform',
106 'yule'
107]
110import warnings
111import numpy as np
112import dataclasses
114from typing import List, Optional, Set, Callable
116from functools import partial
117from scipy._lib._util import _asarray_validated
119from . import _distance_wrap
120from . import _hausdorff
121from ..linalg import norm
122from ..special import rel_entr
124from . import _distance_pybind
126from .._lib.deprecation import _deprecated
def _copy_array_if_base_present(a):
    """Return `a` itself, or a copy of it when ``a.base`` is set.

    An array whose ``base`` attribute is ``None`` is handed back
    unchanged; otherwise the result of ``a.copy()`` is returned.
    """
    return a if a.base is None else a.copy()
def _correlation_cdist_wrap(XA, XB, dm, **kwargs):
    """Run the cosine ``cdist`` kernel on row-centered copies of the inputs.

    The mean of every row is subtracted from that row of `XA` and `XB`
    (the caller's arrays are not modified), then the centered arrays,
    `dm` and any extra keyword arguments are forwarded to
    ``_distance_wrap.cdist_cosine_double_wrap``.
    """
    centered_a = XA - XA.mean(axis=1, keepdims=True)
    centered_b = XB - XB.mean(axis=1, keepdims=True)
    _distance_wrap.cdist_cosine_double_wrap(centered_a, centered_b, dm,
                                            **kwargs)
def _correlation_pdist_wrap(X, dm, **kwargs):
    """Run the cosine ``pdist`` kernel on a row-centered copy of `X`.

    The mean of every row is subtracted from that row (the caller's
    array is not modified), then the centered array, `dm` and any extra
    keyword arguments are forwarded to
    ``_distance_wrap.pdist_cosine_double_wrap``.
    """
    row_means = X.mean(axis=1, keepdims=True)
    centered = X - row_means
    _distance_wrap.pdist_cosine_double_wrap(centered, dm, **kwargs)
def _convert_to_type(X, out_type):
    """Return `X` as a C-contiguous array with dtype `out_type`."""
    converted = np.ascontiguousarray(X, dtype=out_type)
    return converted
def _nbool_correspond_all(u, v, w=None):
    """Count (optionally weighted) co-occurrences of two indicator vectors.

    Parameters
    ----------
    u, v : ndarray
        Input vectors. Boolean inputs without weights take a bitwise
        fast path; anything else is cast to a common numeric dtype and
        combined arithmetically, with ``1.0 - x`` as the negation.
    w : ndarray, optional
        Per-component weights, applied once to each product.

    Returns
    -------
    (nff, nft, ntf, ntt) : tuple
        Sums for the (False, False), (False, True), (True, False) and
        (True, True) combinations of ``(u[k], v[k])``.
    """
    if w is None and u.dtype == v.dtype == bool:
        # Pure boolean case: bitwise operators, no casting needed.
        ntt = (u & v).sum()
        ntf = (u & ~v).sum()
        nft = (~u & v).sum()
        nff = (~u & ~v).sum()
        return (nff, nft, ntf, ntt)

    common = np.result_type(int, u.dtype, v.dtype)
    u = u.astype(common)
    v = v.astype(common)
    not_u = 1.0 - u
    not_v = 1.0 - v
    if w is not None:
        # Weight only the u-side factors so each product is weighted once.
        not_u = w * not_u
        u = w * u
    ntt = (u * v).sum()
    ntf = (u * not_v).sum()
    nft = (not_u * v).sum()
    nff = (not_u * not_v).sum()
    return (nff, nft, ntf, ntt)
def _nbool_correspond_ft_tf(u, v, w=None):
    """Count (optionally weighted) disagreements of two indicator vectors.

    Parameters
    ----------
    u, v : ndarray
        Input vectors. Boolean inputs without weights take a bitwise
        fast path; anything else is cast to a common numeric dtype and
        combined arithmetically, with ``1.0 - x`` as the negation.
    w : ndarray, optional
        Per-component weights, applied once to each product.

    Returns
    -------
    (nft, ntf) : tuple
        Sums for the (False, True) and (True, False) combinations of
        ``(u[k], v[k])``.
    """
    if w is None and u.dtype == v.dtype == bool:
        # Pure boolean case: bitwise operators, no casting needed.
        ntf = (u & ~v).sum()
        nft = (~u & v).sum()
        return (nft, ntf)

    common = np.result_type(int, u.dtype, v.dtype)
    u = u.astype(common)
    v = v.astype(common)
    not_u = 1.0 - u
    not_v = 1.0 - v
    if w is not None:
        # Weight only the u-side factors so each product is weighted once.
        not_u = w * not_u
        u = w * u
    ntf = (u * not_v).sum()
    nft = (not_u * v).sum()
    return (nft, ntf)
def _validate_cdist_input(XA, XB, mA, mB, n, metric_info, **kwargs):
    """Coerce the two ``cdist`` inputs and validate metric keyword arguments.

    The working dtype is ``XA.dtype`` when ``metric_info.types`` lists
    it, otherwise the first listed type. Both arrays are converted to
    that dtype. If ``metric_info.validator`` is truthy it is called with
    ``(XA, XB)``, ``mA + mB``, `n` and the keyword arguments, and its
    return value replaces them.

    Returns
    -------
    XA, XB, typ, kwargs
        The converted arrays, the chosen type and the keyword arguments.
    """
    supported = metric_info.types
    if XA.dtype in supported:
        typ = supported[supported.index(XA.dtype)]
    else:
        typ = supported[0]

    XA = _convert_to_type(XA, out_type=typ)
    XB = _convert_to_type(XB, out_type=typ)

    validator = metric_info.validator
    if validator:
        kwargs = validator((XA, XB), mA + mB, n, **kwargs)
    return XA, XB, typ, kwargs
def _validate_weight_with_size(X, m, n, **kwargs):
    """Validate an optional ``w`` keyword argument against length `n`.

    Parameters
    ----------
    X, m : unused
        Accepted so the signature matches the other kwargs validators
        in this file.
    n : int
        Required number of weights.
    **kwargs
        Metric keyword arguments, possibly containing ``w``.

    Returns
    -------
    dict
        `kwargs` with ``w`` removed when it was missing or ``None``,
        otherwise with ``w`` replaced by the output of
        ``_validate_weights``.

    Raises
    ------
    ValueError
        If ``w`` is not one-dimensional with exactly `n` entries.
    """
    w = kwargs.pop('w', None)
    if w is None:
        return kwargs

    # Coerce first so lists/tuples/scalars can be shape-checked instead of
    # failing with AttributeError on ``.ndim``.
    w = np.asarray(w)
    if w.ndim != 1 or w.shape[0] != n:
        # A 0-d array has no shape[0]; report its element count instead.
        got = w.shape[0] if w.ndim else w.size
        raise ValueError("Weights must have same size as input vector. "
                         f"{got} vs. {n}")

    kwargs['w'] = _validate_weights(w)
    return kwargs
def _validate_hamming_kwargs(X, m, n, **kwargs):
    """Validate (or default) the ``w`` keyword argument for Hamming.

    Parameters
    ----------
    X, m : unused
        Accepted so the signature matches the other kwargs validators
        in this file.
    n : int
        Required number of weights.
    **kwargs
        Metric keyword arguments, possibly containing ``w``.

    Returns
    -------
    dict
        `kwargs` with ``w`` always present: the output of
        ``_validate_weights`` applied to the supplied weights, or to a
        vector of `n` ones when ``w`` is missing or ``None``.

    Raises
    ------
    ValueError
        If ``w`` is not one-dimensional with exactly `n` entries.
    """
    w = kwargs.get('w')
    if w is None:
        # Missing and explicit ``None`` both mean "unit weights".
        w = np.ones((n,), dtype='double')
    else:
        # Coerce so lists/tuples/scalars can be shape-checked instead of
        # failing with AttributeError on ``.ndim``.
        w = np.asarray(w)

    if w.ndim != 1 or w.shape[0] != n:
        # A 0-d array has no shape[0]; report its element count instead.
        got = w.shape[0] if w.ndim else w.size
        raise ValueError(
            "Weights must have same size as input vector. %d vs. %d"
            % (got, n))

    kwargs['w'] = _validate_weights(w)
    return kwargs
def _validate_mahalanobis_kwargs(X, m, n, **kwargs):
    """Validate or derive the ``VI`` keyword argument for Mahalanobis.

    When ``VI`` is missing or ``None`` it is computed as the transposed
    inverse of the covariance of `X` (a tuple `X` is stacked vertically
    first). This needs more observations than dimensions.

    Parameters
    ----------
    X : ndarray or tuple of ndarray
        Observations, one per row.
    m : int
        Number of observations.
    n : int
        Dimension of each observation.
    **kwargs
        Metric keyword arguments, possibly containing ``VI``.

    Returns
    -------
    dict
        `kwargs` with ``VI`` passed through ``_convert_to_double``.

    Raises
    ------
    ValueError
        If ``VI`` must be estimated but ``m <= n``.
    """
    VI = kwargs.pop('VI', None)
    if VI is None:
        if m <= n:
            # Too few observations for the dimension: the covariance
            # matrix would be singular.
            raise ValueError("The number of observations (%d) is too "
                             "small; the covariance matrix is "
                             "singular. For observations with %d "
                             "dimensions, at least %d observations "
                             "are required." % (m, n, n + 1))
        samples = np.vstack(X) if isinstance(X, tuple) else X
        covariance = np.cov(samples.astype(np.double, copy=False).T)
        VI = np.linalg.inv(np.atleast_2d(covariance)).T.copy()
    kwargs["VI"] = _convert_to_double(VI)
    return kwargs
def _validate_minkowski_kwargs(X, m, n, **kwargs):
    """Validate the ``w`` and ``p`` keyword arguments for Minkowski.

    ``w`` is checked by ``_validate_weight_with_size``. ``p`` defaults
    to ``2.`` when absent and must be strictly positive when given.

    Raises
    ------
    ValueError
        If ``p <= 0``.
    """
    kwargs = _validate_weight_with_size(X, m, n, **kwargs)
    if 'p' in kwargs:
        if kwargs['p'] <= 0:
            raise ValueError("p must be greater than 0")
    else:
        kwargs['p'] = 2.

    return kwargs
def _validate_pdist_input(X, m, n, metric_info, **kwargs):
    """Coerce the ``pdist`` input and validate metric keyword arguments.

    The working dtype is ``X.dtype`` when ``metric_info.types`` lists
    it, otherwise the first listed type. `X` is converted to that dtype.
    If ``metric_info.validator`` is truthy it is called with `X`, `m`,
    `n` and the keyword arguments, and its return value replaces them.

    Returns
    -------
    X, typ, kwargs
        The converted array, the chosen type and the keyword arguments.
    """
    supported = metric_info.types
    if X.dtype in supported:
        typ = supported[supported.index(X.dtype)]
    else:
        typ = supported[0]

    X = _convert_to_type(X, out_type=typ)

    validator = metric_info.validator
    if validator:
        kwargs = validator(X, m, n, **kwargs)
    return X, typ, kwargs
def _validate_seuclidean_kwargs(X, m, n, **kwargs):
    """Validate or derive the ``V`` keyword argument for seuclidean.

    When ``V`` is missing or ``None`` it is computed as the per-column
    sample variance (``ddof=1``) of `X` (a tuple `X` is stacked
    vertically first). A supplied ``V`` must be one-dimensional with
    `n` entries.

    Returns
    -------
    dict
        `kwargs` with ``V`` passed through ``_convert_to_double``.

    Raises
    ------
    ValueError
        If a supplied ``V`` is not 1-D or does not have `n` entries.
    """
    V = kwargs.pop('V', None)
    if V is not None:
        V = np.asarray(V, order='c')
        if V.ndim != 1:
            raise ValueError('Variance vector V must '
                             'be one-dimensional.')
        if V.shape[0] != n:
            raise ValueError('Variance vector V must be of the same '
                             'dimension as the vectors on which the distances '
                             'are computed.')
    else:
        samples = np.vstack(X) if isinstance(X, tuple) else X
        V = np.var(samples.astype(np.double, copy=False), axis=0, ddof=1)
    kwargs['V'] = _convert_to_double(V)
    return kwargs
def _validate_vector(u, dtype=None):
    """Convert `u` to an array and require it to be one-dimensional.

    Parameters
    ----------
    u : array_like
        Candidate vector.
    dtype : data-type, optional
        Forwarded to ``np.asarray``.

    Returns
    -------
    ndarray
        The converted 1-D array.

    Raises
    ------
    ValueError
        If the converted array is not 1-D.
    """
    # XXX It is unclear whether forcing C order is actually required.
    arr = np.asarray(u, dtype=dtype, order='c')
    if arr.ndim != 1:
        raise ValueError("Input vector should be 1-D.")
    return arr
def _validate_weights(w, dtype=np.double):
    """Convert `w` to a 1-D array of `dtype` and reject negative entries.

    Raises
    ------
    ValueError
        If `w` is not 1-D (raised by ``_validate_vector``) or contains
        a negative value.
    """
    weights = _validate_vector(w, dtype=dtype)
    if (weights < 0).any():
        raise ValueError("Input weights should be all non-negative")
    return weights
def directed_hausdorff(u, v, seed=0):
    """
    Compute the directed Hausdorff distance between two 2-D arrays.

    Distances between pairs are calculated using a Euclidean metric.

    Parameters
    ----------
    u : (M,N) array_like
        Input array.
    v : (O,N) array_like
        Input array.
    seed : int or None
        Local `numpy.random.RandomState` seed. Default is 0, a random
        shuffling of u and v that guarantees reproducibility.

    Returns
    -------
    d : double
        The directed Hausdorff distance between arrays `u` and `v`.

    index_1 : int
        Index of the point contributing to the Hausdorff pair in `u`.

    index_2 : int
        Index of the point contributing to the Hausdorff pair in `v`.

    Raises
    ------
    ValueError
        An exception is thrown if `u` and `v` do not have
        the same number of columns.

    Notes
    -----
    Uses the early break technique and the random sampling approach
    described by [1]_. Although worst-case performance is ``O(m * o)``
    (as with the brute force algorithm), this is unlikely in practice
    as the input data would have to require the algorithm to explore
    every single point interaction, and after the algorithm shuffles
    the input points at that. The best case performance is O(m), which
    is satisfied by selecting an inner loop distance that is less than
    cmax and leads to an early break as often as possible. The authors
    have formally shown that the average runtime is closer to O(m).

    .. versionadded:: 0.19.0

    References
    ----------
    .. [1] A. A. Taha and A. Hanbury, "An efficient algorithm for
           calculating the exact Hausdorff distance." IEEE Transactions On
           Pattern Analysis And Machine Intelligence, vol. 37 pp. 2153-63,
           2015.

    See Also
    --------
    scipy.spatial.procrustes : Another similarity test for two data sets

    Examples
    --------
    Find the directed Hausdorff distance between two 2-D arrays of
    coordinates:

    >>> from scipy.spatial.distance import directed_hausdorff
    >>> import numpy as np
    >>> u = np.array([(1.0, 0.0),
    ...               (0.0, 1.0),
    ...               (-1.0, 0.0),
    ...               (0.0, -1.0)])
    >>> v = np.array([(2.0, 0.0),
    ...               (0.0, 2.0),
    ...               (-2.0, 0.0),
    ...               (0.0, -4.0)])

    >>> directed_hausdorff(u, v)[0]
    2.23606797749979
    >>> directed_hausdorff(v, u)[0]
    3.0

    Find the general (symmetric) Hausdorff distance between two 2-D
    arrays of coordinates:

    >>> max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])
    3.0

    Find the indices of the points that generate the Hausdorff distance
    (the Hausdorff pair):

    >>> directed_hausdorff(v, u)[1:]
    (3, 3)

    """
    u = np.asarray(u, dtype=np.float64, order='c')
    v = np.asarray(v, dtype=np.float64, order='c')
    n_cols_u, n_cols_v = u.shape[1], v.shape[1]
    if n_cols_u != n_cols_v:
        raise ValueError('u and v need to have the same '
                         'number of columns')
    return _hausdorff.directed_hausdorff(u, v, seed)
def minkowski(u, v, p=2, w=None):
    """
    Compute the Minkowski distance between two 1-D arrays.

    The Minkowski distance between 1-D arrays `u` and `v`,
    is defined as

    .. math::

       {\\|u-v\\|}_p = (\\sum{|u_i - v_i|^p})^{1/p}.


       \\left(\\sum{w_i(|(u_i - v_i)|^p)}\\right)^{1/p}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    p : scalar
        The order of the norm of the difference :math:`{\\|u-v\\|}_p`. Note
        that for :math:`0 < p < 1`, the triangle inequality only holds with
        an additional multiplicative factor, i.e. it is only a quasi-metric.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    minkowski : double
        The Minkowski distance between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If ``p <= 0``.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 1)
    2.0
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 2)
    1.4142135623730951
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 3)
    1.2599210498948732
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 1)
    1.0
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 2)
    1.0
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 3)
    1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if p <= 0:
        raise ValueError("p must be greater than 0")
    diff = u - v
    if w is not None:
        w = _validate_weights(w)
        # Fold the weights into the difference as their p-th root so the
        # plain p-norm below yields the weighted distance.
        if p == 1:
            scale = w
        elif p == 2:
            # sqrt is faster and more precise than a generic power
            scale = np.sqrt(w)
        elif p == np.inf:
            scale = (w != 0)
        else:
            scale = np.power(w, 1/p)
        diff = scale * diff
    return norm(diff, ord=p)
def euclidean(u, v, w=None):
    """
    Computes the Euclidean distance between two 1-D arrays.

    The Euclidean distance between 1-D arrays `u` and `v`, is defined as

    .. math::

       {\\|u-v\\|}_2

       \\left(\\sum{(w_i |(u_i - v_i)|^2)}\\right)^{1/2}

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    euclidean : double
        The Euclidean distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.euclidean([1, 0, 0], [0, 1, 0])
    1.4142135623730951
    >>> distance.euclidean([1, 1, 0], [0, 1, 0])
    1.0

    """
    # The Euclidean distance is the Minkowski distance of order 2.
    distance = minkowski(u, v, p=2, w=w)
    return distance
def sqeuclidean(u, v, w=None):
    """
    Compute the squared Euclidean distance between two 1-D arrays.

    The squared Euclidean distance between `u` and `v` is defined as

    .. math::

       {\\|u-v\\|}_2^2

       \\left(\\sum{(w_i |(u_i - v_i)|^2)}\\right)

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    sqeuclidean : double
        The squared Euclidean distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.sqeuclidean([1, 0, 0], [0, 1, 0])
    2.0
    >>> distance.sqeuclidean([1, 1, 0], [0, 1, 0])
    1.0

    """
    # Inputs that already carry an inexact (floating/complex) dtype keep
    # it; everything else is converted to float64 for stability.
    u_dtype = v_dtype = np.float64
    if hasattr(u, "dtype") and np.issubdtype(u.dtype, np.inexact):
        u_dtype = None
    if hasattr(v, "dtype") and np.issubdtype(v.dtype, np.inexact):
        v_dtype = None

    u = _validate_vector(u, dtype=u_dtype)
    v = _validate_vector(v, dtype=v_dtype)
    diff = u - v
    if w is None:
        return np.dot(diff, diff)
    w = _validate_weights(w)
    # Weight only one factor so each term is weighted exactly once.
    return np.dot(diff, w * diff)
def correlation(u, v, w=None, centered=True):
    """
    Compute the correlation distance between two 1-D arrays.

    The correlation distance between `u` and `v`, is
    defined as

    .. math::

        1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
                  {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2}

    where :math:`\\bar{u}` is the mean of the elements of `u`
    and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0
    centered : bool, optional
        If True, `u` and `v` will be centered. Default is True.

    Returns
    -------
    correlation : double
        The correlation distance between 1-D array `u` and `v`.

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    if centered:
        # Subtract the (weighted) mean of each vector.
        u = u - np.average(u, weights=w)
        v = v - np.average(v, weights=w)
    cross = np.average(u * v, weights=w)
    u_sq = np.average(np.square(u), weights=w)
    v_sq = np.average(np.square(v), weights=w)
    # Rounding can push the value slightly below zero, hence the abs().
    return np.abs(1.0 - cross / np.sqrt(u_sq * v_sq))
def cosine(u, v, w=None):
    """
    Compute the Cosine distance between 1-D arrays.

    The Cosine distance between `u` and `v`, is defined as

    .. math::

        1 - \\frac{u \\cdot v}
                  {\\|u\\|_2 \\|v\\|_2}.

    where :math:`u \\cdot v` is the dot product of :math:`u` and
    :math:`v`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    cosine : double
        The Cosine distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.cosine([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.cosine([100, 0, 0], [0, 1, 0])
    1.0
    >>> distance.cosine([1, 1, 0], [0, 1, 0])
    0.29289321881345254

    """
    # The cosine distance is the uncentered (a.k.a. reflective)
    # correlation distance.
    uncentered = correlation(u, v, w=w, centered=False)
    # Clamp the result to the interval [0, 2].
    capped = min(uncentered, 2.0)
    return max(0, capped)
def hamming(u, v, w=None):
    """
    Compute the Hamming distance between two 1-D arrays.

    The Hamming distance between 1-D arrays `u` and `v`, is simply the
    proportion of disagreeing components in `u` and `v`. If `u` and `v` are
    boolean vectors, the Hamming distance is

    .. math::

       \\frac{c_{01} + c_{10}}{n}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    hamming : double
        The Hamming distance between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If `u` and `v` do not have the same shape.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.hamming([1, 0, 0], [0, 1, 0])
    0.66666666666666663
    >>> distance.hamming([1, 0, 0], [1, 1, 0])
    0.33333333333333331
    >>> distance.hamming([1, 0, 0], [2, 0, 0])
    0.33333333333333331
    >>> distance.hamming([1, 0, 0], [3, 0, 0])
    0.33333333333333331

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if u.shape != v.shape:
        raise ValueError('The 1d arrays must have equal lengths.')
    mismatch = u != v
    weights = None if w is None else _validate_weights(w)
    # The (weighted) mean of the mismatch indicator is the proportion of
    # disagreeing components.
    return np.average(mismatch, weights=weights)
def jaccard(u, v, w=None):
    """
    Compute the Jaccard-Needham dissimilarity between two boolean 1-D arrays.

    The Jaccard-Needham dissimilarity between 1-D boolean arrays `u` and `v`,
    is defined as

    .. math::

       \\frac{c_{TF} + c_{FT}}
            {c_{TT} + c_{FT} + c_{TF}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    jaccard : double
        The Jaccard distance between vectors `u` and `v`.

    Notes
    -----
    When both `u` and `v` lead to a `0/0` division i.e. there is no overlap
    between the items in the vectors the returned distance is 0. See the
    Wikipedia page on the Jaccard index [1]_, and this paper [2]_.

    .. versionchanged:: 1.2.0
        Previously, when `u` and `v` lead to a `0/0` division, the function
        would return NaN. This was changed to return 0 instead.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Jaccard_index
    .. [2] S. Kosub, "A note on the triangle inequality for the Jaccard
       distance", 2016, :arxiv:`1612.02696`

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.jaccard([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.jaccard([1, 0, 0], [1, 1, 0])
    0.5
    >>> distance.jaccard([1, 0, 0], [1, 2, 0])
    0.5
    >>> distance.jaccard([1, 0, 0], [1, 1, 1])
    0.66666666666666663

    """
    u = _validate_vector(u)
    v = _validate_vector(v)

    # Components where at least one vector is non-zero, and the subset of
    # those where the two vectors differ.
    either = (u != 0) | (v != 0)
    differing = (u != v) & either
    if w is not None:
        w = _validate_weights(w)
        either = w * either
        differing = w * differing
    numerator = np.double(differing.sum())
    denominator = np.double(either.sum())
    if denominator != 0:
        return numerator / denominator
    # 0/0 is defined as 0 (see Notes).
    return 0
@_deprecated("Kulsinski has been deprecated from scipy.spatial.distance"
             " in SciPy 1.9.0 and it will be removed in SciPy 1.11.0."
             " It is superseded by scipy.spatial.distance.kulczynski1.")
def kulsinski(u, v, w=None):
    """
    Compute the Kulsinski dissimilarity between two boolean 1-D arrays.

    The Kulsinski dissimilarity between two boolean 1-D arrays `u` and `v`,
    is defined as

    .. math::

         \\frac{c_{TF} + c_{FT} - c_{TT} + n}
              {c_{FT} + c_{TF} + n}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    .. deprecated:: 1.9.0
       `kulsinski` has been deprecated from `scipy.spatial.distance` in
       SciPy 1.9.0 and it will be removed in SciPy 1.11.0. It is superseded
       by `scipy.spatial.distance.kulczynski1`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    kulsinski : double
        The Kulsinski distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.kulsinski([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.kulsinski([1, 0, 0], [1, 1, 0])
    0.75
    >>> distance.kulsinski([1, 0, 0], [2, 1, 0])
    0.33333333333333331
    >>> distance.kulsinski([1, 0, 0], [3, 1, 0])
    -0.5

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
        # With weights, n is the total weight rather than the length.
        n = w.sum()
    else:
        n = float(len(u))
    _, nft, ntf, ntt = _nbool_correspond_all(u, v, w=w)

    disagreements = ntf + nft
    return (disagreements - ntt + n) / (disagreements + n)
def kulczynski1(u, v, *, w=None):
    """
    Compute the Kulczynski 1 dissimilarity between two boolean 1-D arrays.

    The Kulczynski 1 dissimilarity between two boolean 1-D arrays `u` and `v`
    of length ``n``, is defined as

    .. math::

         \\frac{c_{11}}
              {c_{01} + c_{10}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k \\in {0, 1, ..., n-1}`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    kulczynski1 : float
        The Kulczynski 1 distance between vectors `u` and `v`.

    Notes
    -----
    This measure has a minimum value of 0 and no upper limit.
    It is un-defined when there are no non-matches.

    .. versionadded:: 1.8.0

    References
    ----------
    .. [1] Kulczynski S. et al. Bulletin
           International de l'Academie Polonaise des Sciences
           et des Lettres, Classe des Sciences Mathematiques
           et Naturelles, Serie B (Sciences Naturelles). 1927;
           Supplement II: 57-203.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.kulczynski1([1, 0, 0], [0, 1, 0])
    0.0
    >>> distance.kulczynski1([True, False, False], [True, True, False])
    1.0
    >>> distance.kulczynski1([True, False, False], [True])
    0.5
    >>> distance.kulczynski1([1, 0, 0], [3, 1, 0])
    -3.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    counts = _nbool_correspond_all(u, v, w=w)
    nft, ntf, ntt = counts[1], counts[2], counts[3]

    # Agreements on True over the total number of disagreements.
    return ntt / (ntf + nft)
def seuclidean(u, v, V):
    """
    Return the standardized Euclidean distance between two 1-D arrays.

    The standardized Euclidean distance between `u` and `v` is computed
    as the Euclidean distance weighted by ``1/V``.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    V : (N,) array_like
        `V` is an 1-D array of component variances. It is usually computed
        among a larger collection vectors.

    Returns
    -------
    seuclidean : double
        The standardized Euclidean distance between vectors `u` and `v`.

    Raises
    ------
    TypeError
        If `u`, `v` and `V` do not all have the same length.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [0.1, 0.1, 0.1])
    4.4721359549995796
    >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [1, 0.1, 0.1])
    3.3166247903553998
    >>> distance.seuclidean([1, 0, 0], [0, 1, 0], [10, 0.1, 0.1])
    3.1780497164141406

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    V = _validate_vector(V, dtype=np.float64)
    if not (V.shape[0] == u.shape[0] == v.shape[0]):
        raise TypeError('V must be a 1-D array of the same dimension '
                        'as u and v.')
    inverse_variance = 1/V
    return euclidean(u, v, w=inverse_variance)
def cityblock(u, v, w=None):
    """
    Compute the City Block (Manhattan) distance.

    Computes the Manhattan distance between two 1-D arrays `u` and `v`,
    which is defined as

    .. math::

       \\sum_i {\\left| u_i - v_i \\right|}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    cityblock : double
        The City Block (Manhattan) distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.cityblock([1, 0, 0], [0, 1, 0])
    2
    >>> distance.cityblock([1, 0, 0], [0, 2, 0])
    3
    >>> distance.cityblock([1, 0, 0], [1, 1, 0])
    1

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    abs_diff = abs(u - v)
    if w is None:
        return abs_diff.sum()
    w = _validate_weights(w)
    return (w * abs_diff).sum()
def mahalanobis(u, v, VI):
    """
    Compute the Mahalanobis distance between two 1-D arrays.

    The Mahalanobis distance between 1-D arrays `u` and `v`, is defined as

    .. math::

       \\sqrt{ (u-v) V^{-1} (u-v)^T }

    where ``V`` is the covariance matrix. Note that the argument `VI`
    is the inverse of ``V``.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    VI : array_like
        The inverse of the covariance matrix.

    Returns
    -------
    mahalanobis : double
        The Mahalanobis distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> iv = [[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]]
    >>> distance.mahalanobis([1, 0, 0], [0, 1, 0], iv)
    1.0
    >>> distance.mahalanobis([0, 2, 0], [0, 1, 0], iv)
    1.0
    >>> distance.mahalanobis([2, 0, 0], [0, 1, 0], iv)
    1.7320508075688772

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    VI = np.atleast_2d(VI)
    delta = u - v
    # Quadratic form delta * VI * delta^T.
    left = np.dot(delta, VI)
    return np.sqrt(np.dot(left, delta))
def chebyshev(u, v, w=None):
    """
    Compute the Chebyshev distance.

    Computes the Chebyshev distance between two 1-D arrays `u` and `v`,
    which is defined as

    .. math::

       \\max_i {|u_i-v_i|}.

    Parameters
    ----------
    u : (N,) array_like
        Input vector.
    v : (N,) array_like
        Input vector.
    w : (N,) array_like, optional
        Weights. The magnitude of a positive weight does not affect the
        result; components whose weight is not greater than 0 are left
        out of the maximum.

    Returns
    -------
    chebyshev : double
        The Chebyshev distance between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.chebyshev([1, 0, 0], [0, 1, 0])
    1
    >>> distance.chebyshev([1, 1, 0], [0, 1, 0])
    1

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
        keep = w > 0
        if not keep.all():
            # Drop the components that carry no weight.
            u = u[keep]
            v = v[keep]
    return max(abs(u - v))
def braycurtis(u, v, w=None):
    """
    Compute the Bray-Curtis distance between two 1-D arrays.

    Bray-Curtis distance is defined as

    .. math::

       \\sum{|u_i-v_i|} / \\sum{|u_i+v_i|}

    The Bray-Curtis distance is in the range [0, 1] if all coordinates are
    positive, and is undefined if the inputs are of length zero.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    braycurtis : double
        The Bray-Curtis distance between 1-D arrays `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.braycurtis([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.braycurtis([1, 1, 0], [0, 1, 0])
    0.33333333333333331

    """
    u = _validate_vector(u)
    v = _validate_vector(v, dtype=np.float64)
    abs_diff = abs(u - v)
    abs_sum = abs(u + v)
    if w is not None:
        w = _validate_weights(w)
        abs_diff = w * abs_diff
        abs_sum = w * abs_sum
    numerator = abs_diff.sum()
    denominator = abs_sum.sum()
    return numerator / denominator
def canberra(u, v, w=None):
    """
    Compute the Canberra distance between two 1-D arrays.

    The Canberra distance is defined as

    .. math::

         d(u,v) = \\sum_i \\frac{|u_i-v_i|}
                              {|u_i|+|v_i|}.

    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    canberra : double
        The Canberra distance between vectors `u` and `v`.

    Notes
    -----
    When `u[i]` and `v[i]` are 0 for given i, then the fraction 0/0 = 0 is
    used in the calculation.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.canberra([1, 0, 0], [0, 1, 0])
    2.0
    >>> distance.canberra([1, 1, 0], [0, 1, 0])
    1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v, dtype=np.float64)
    if w is not None:
        w = _validate_weights(w)
    # 0/0 terms evaluate to NaN: silence the warning here and let nansum
    # treat them as zero.
    with np.errstate(invalid='ignore'):
        terms = abs(u - v) / (abs(u) + abs(v))
        if w is not None:
            terms = w * terms
        total = np.nansum(terms)
    return total
def jensenshannon(p, q, base=None, *, axis=0, keepdims=False):
    """
    Compute the Jensen-Shannon distance (metric) between
    two probability arrays. This is the square root
    of the Jensen-Shannon divergence.

    For probability vectors `p` and `q` the distance is

    .. math::

       \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}

    where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
    and :math:`D` is the Kullback-Leibler divergence.

    This routine will normalize `p` and `q` if they don't sum to 1.0.

    Parameters
    ----------
    p : (N,) array_like
        left probability vector
    q : (N,) array_like
        right probability vector
    base : double, optional
        the base of the logarithm used to compute the output
        if not given, then the routine uses the default base of
        scipy.stats.entropy.
    axis : int, optional
        Axis along which the Jensen-Shannon distances are computed. The default
        is 0.

        .. versionadded:: 1.7.0
    keepdims : bool, optional
        If this is set to `True`, the reduced axes are left in the
        result as dimensions with size one. With this option,
        the result will broadcast correctly against the input array.
        Default is False.

        .. versionadded:: 1.7.0

    Returns
    -------
    js : double or ndarray
        The Jensen-Shannon distances between `p` and `q` along the `axis`.

    Notes
    -----

    .. versionadded:: 1.2.0

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> import numpy as np
    >>> distance.jensenshannon([1.0, 0.0, 0.0], [0.0, 1.0, 0.0], 2.0)
    1.0
    >>> distance.jensenshannon([1.0, 0.0], [0.5, 0.5])
    0.46450140402245893
    >>> distance.jensenshannon([1.0, 0.0, 0.0], [1.0, 0.0, 0.0])
    0.0
    >>> a = np.array([[1, 2, 3, 4],
    ...               [5, 6, 7, 8],
    ...               [9, 10, 11, 12]])
    >>> b = np.array([[13, 14, 15, 16],
    ...               [17, 18, 19, 20],
    ...               [21, 22, 23, 24]])
    >>> distance.jensenshannon(a, b, axis=0)
    array([0.1954288, 0.1447697, 0.1138377, 0.0927636])
    >>> distance.jensenshannon(a, b, axis=1)
    array([0.1402339, 0.0399106, 0.0201815])

    """
    p = np.asarray(p)
    q = np.asarray(q)
    # Rescale both inputs so that they sum to one along `axis`; keepdims is
    # forced on here so the totals broadcast against the inputs.
    p = p / np.sum(p, axis=axis, keepdims=True)
    q = q / np.sum(q, axis=axis, keepdims=True)
    m = (p + q) / 2.0
    # Kullback-Leibler divergences of each input from the mixture m.
    kl_pm = np.sum(rel_entr(p, m), axis=axis, keepdims=keepdims)
    kl_qm = np.sum(rel_entr(q, m), axis=axis, keepdims=keepdims)
    js = kl_pm + kl_qm
    if base is not None:
        # rel_entr works in natural log; rescale to the requested base.
        js = js / np.log(base)
    return np.sqrt(js / 2.0)
def yule(u, v, w=None):
    """
    Compute the Yule dissimilarity between two boolean 1-D arrays.

    The Yule dissimilarity is defined as

    .. math::

         \\frac{R}{c_{TT} * c_{FF} + \\frac{R}{2}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n` and :math:`R = 2.0 * c_{TF} * c_{FT}`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    yule : double
        The Yule dissimilarity between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.yule([1, 0, 0], [0, 1, 0])
    2.0
    >>> distance.yule([1, 1, 0], [0, 1, 0])
    0.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    c_ff, c_ft, c_tf, c_tt = _nbool_correspond_all(u, v, w=w)
    # This product is R / 2 in the notation of the docstring.
    discordant = c_tf * c_ft
    if discordant == 0:
        # Return early so the quotient below is never evaluated as 0 / 0.
        return 0.0
    return float(2.0 * discordant / (c_tt * c_ff + discordant))
def dice(u, v, w=None):
    """
    Compute the Dice dissimilarity between two boolean 1-D arrays.

    The Dice dissimilarity between `u` and `v`, is

    .. math::

         \\frac{c_{TF} + c_{FT}}
              {2c_{TT} + c_{FT} + c_{TF}}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input 1-D array.
    v : (N,) array_like, bool
        Input 1-D array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    dice : double
        The Dice dissimilarity between 1-D arrays `u` and `v`.

    Notes
    -----
    This function computes the Dice dissimilarity index. To compute the
    Dice similarity index, convert one to the other with similarity =
    1 - dissimilarity.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.dice([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.dice([1, 0, 0], [1, 1, 0])
    0.3333333333333333
    >>> distance.dice([1, 0, 0], [2, 0, 0])
    -0.3333333333333333

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    both_bool = u.dtype == v.dtype == bool
    if w is None and both_bool:
        c_tt = (u & v).sum()
    else:
        # Promote to at least an integer type so the products below are
        # arithmetic rather than boolean.
        common = np.result_type(int, u.dtype, v.dtype)
        u = u.astype(common)
        v = v.astype(common)
        c_tt = (u * v).sum() if w is None else (u * v * w).sum()
    c_ft, c_tf = _nbool_correspond_ft_tf(u, v, w=w)
    # Wrapping the denominator in a NumPy array makes the division follow
    # NumPy semantics instead of Python float division.
    denominator = np.array(2.0 * c_tt + c_tf + c_ft)
    return float((c_tf + c_ft) / denominator)
def rogerstanimoto(u, v, w=None):
    """
    Compute the Rogers-Tanimoto dissimilarity between two boolean 1-D arrays.

    The Rogers-Tanimoto dissimilarity between two boolean 1-D arrays
    `u` and `v`, is defined as

    .. math::
       \\frac{R}
            {c_{TT} + c_{FF} + R}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    rogerstanimoto : double
        The Rogers-Tanimoto dissimilarity between vectors
        `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.rogerstanimoto([1, 0, 0], [0, 1, 0])
    0.8
    >>> distance.rogerstanimoto([1, 0, 0], [1, 1, 0])
    0.5
    >>> distance.rogerstanimoto([1, 0, 0], [2, 0, 0])
    -1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    c_ff, c_ft, c_tf, c_tt = _nbool_correspond_all(u, v, w=w)
    # R from the docstring: twice the number of disagreeing positions.
    r = 2.0 * (c_tf + c_ft)
    return float(r) / float(c_tt + c_ff + r)
def russellrao(u, v, w=None):
    """
    Compute the Russell-Rao dissimilarity between two boolean 1-D arrays.

    The Russell-Rao dissimilarity between two boolean 1-D arrays, `u` and
    `v`, is defined as

    .. math::

      \\frac{n - c_{TT}}
           {n}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    russellrao : double
        The Russell-Rao dissimilarity between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.russellrao([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.russellrao([1, 0, 0], [1, 1, 0])
    0.6666666666666666
    >>> distance.russellrao([1, 0, 0], [2, 0, 0])
    0.3333333333333333

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is None:
        # Unweighted: n is simply the number of coordinates.
        n = float(len(u))
        if u.dtype == v.dtype == bool:
            c_tt = (u & v).sum()
        else:
            c_tt = (u * v).sum()
    else:
        # Weighted: the total weight plays the role of n.
        w = _validate_weights(w)
        c_tt = (u * v * w).sum()
        n = w.sum()
    return float(n - c_tt) / n
def sokalmichener(u, v, w=None):
    """
    Compute the Sokal-Michener dissimilarity between two boolean 1-D arrays.

    The Sokal-Michener dissimilarity between boolean 1-D arrays `u` and `v`,
    is defined as

    .. math::

       \\frac{R}
            {S + R}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n`, :math:`R = 2 * (c_{TF} + c_{FT})` and
    :math:`S = c_{FF} + c_{TT}`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    sokalmichener : double
        The Sokal-Michener dissimilarity between vectors `u` and `v`.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.sokalmichener([1, 0, 0], [0, 1, 0])
    0.8
    >>> distance.sokalmichener([1, 0, 0], [1, 1, 0])
    0.5
    >>> distance.sokalmichener([1, 0, 0], [2, 0, 0])
    -1.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is not None:
        w = _validate_weights(w)
    c_ff, c_ft, c_tf, c_tt = _nbool_correspond_all(u, v, w=w)
    mismatch_term = 2.0 * (c_tf + c_ft)  # R in the docstring
    agreement_term = c_tt + c_ff         # S in the docstring
    return float(mismatch_term) / float(agreement_term + mismatch_term)
def sokalsneath(u, v, w=None):
    """
    Compute the Sokal-Sneath dissimilarity between two boolean 1-D arrays.

    The Sokal-Sneath dissimilarity between `u` and `v`,

    .. math::

       \\frac{R}
            {c_{TT} + R}

    where :math:`c_{ij}` is the number of occurrences of
    :math:`\\mathtt{u[k]} = i` and :math:`\\mathtt{v[k]} = j` for
    :math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`.

    Parameters
    ----------
    u : (N,) array_like, bool
        Input array.
    v : (N,) array_like, bool
        Input array.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0

    Returns
    -------
    sokalsneath : double
        The Sokal-Sneath dissimilarity between vectors `u` and `v`.

    Raises
    ------
    ValueError
        If the denominator :math:`c_{TT} + R` is zero.

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.sokalsneath([1, 0, 0], [0, 1, 0])
    1.0
    >>> distance.sokalsneath([1, 0, 0], [1, 1, 0])
    0.66666666666666663
    >>> distance.sokalsneath([1, 0, 0], [2, 1, 0])
    0.0
    >>> distance.sokalsneath([1, 0, 0], [3, 1, 0])
    -2.0

    """
    u = _validate_vector(u)
    v = _validate_vector(v)
    if w is None:
        if u.dtype == v.dtype == bool:
            c_tt = (u & v).sum()
        else:
            c_tt = (u * v).sum()
    else:
        w = _validate_weights(w)
        c_tt = (u * v * w).sum()
    c_ft, c_tf = _nbool_correspond_ft_tf(u, v, w=w)
    r = 2.0 * (c_tf + c_ft)  # R in the docstring
    denom = np.array(c_tt + r)
    if not denom.any():
        raise ValueError('Sokal-Sneath dissimilarity is not defined for '
                         'vectors that are entirely false.')
    return float(r) / denom
# Specializations of _convert_to_type with the output type fixed to
# double and bool respectively.
_convert_to_double = partial(_convert_to_type, out_type=np.double)
_convert_to_bool = partial(_convert_to_type, out_type=bool)

# Attach the Python-only correlation wrappers to the _distance_wrap module.
# The attribute names follow the '{p,c}dist_<metric>_<type>_wrap' pattern
# that the metric wrapper classes below look up with getattr.
_distance_wrap.pdist_correlation_double_wrap = _correlation_pdist_wrap
_distance_wrap.cdist_correlation_double_wrap = _correlation_cdist_wrap
@dataclasses.dataclass(frozen=True)
class CDistMetricWrapper:
    """Callable computing ``cdist`` for the metric named `metric_name`.

    Without weights the work is handed to the
    ``cdist_<metric_name>_<type>_wrap`` function of ``_distance_wrap``.
    When a weight vector ``w`` is supplied, the call is routed through
    ``_cdist_callable`` with the metric's ``dist_func`` instead.
    """
    metric_name: str

    def __call__(self, XA, XB, *, out=None, **kwargs):
        XA = np.ascontiguousarray(XA)
        XB = np.ascontiguousarray(XB)
        rows_a, n_cols = XA.shape
        rows_b, _ = XB.shape
        name = self.metric_name
        info = _METRICS[name]
        XA, XB, typ, kwargs = _validate_cdist_input(
            XA, XB, rows_a, rows_b, n_cols, info, **kwargs)

        weights = kwargs.pop('w', None)
        if weights is not None:
            # Weighted case: use the generic callable-based path.
            return _cdist_callable(
                XA, XB, metric=info.dist_func, out=out, w=weights, **kwargs)

        dm = _prepare_out_argument(out, np.double, (rows_a, rows_b))
        # Look up the wrapper matching this metric and input type.
        wrapper = getattr(_distance_wrap, f'cdist_{name}_{typ}_wrap')
        wrapper(XA, XB, dm, **kwargs)
        return dm
@dataclasses.dataclass(frozen=True)
class CDistWeightedMetricWrapper:
    """Callable computing ``cdist`` with a separate weighted wrapper.

    The unweighted call uses ``cdist_<metric_name>_<type>_wrap`` from
    ``_distance_wrap``; when ``w`` is supplied the wrapper named after
    `weighted_metric` is used and ``w`` is forwarded to it.
    """
    metric_name: str
    weighted_metric: str

    def __call__(self, XA, XB, *, out=None, **kwargs):
        XA = np.ascontiguousarray(XA)
        XB = np.ascontiguousarray(XB)
        rows_a, n_cols = XA.shape
        rows_b, _ = XB.shape
        name = self.metric_name
        XA, XB, typ, kwargs = _validate_cdist_input(
            XA, XB, rows_a, rows_b, n_cols, _METRICS[name], **kwargs)
        dm = _prepare_out_argument(out, np.double, (rows_a, rows_b))

        weights = kwargs.pop('w', None)
        if weights is not None:
            # Switch to the weighted wrapper and pass the weights along.
            name = self.weighted_metric
            kwargs['w'] = weights

        # Look up the wrapper matching the chosen metric and input type.
        wrapper = getattr(_distance_wrap, f'cdist_{name}_{typ}_wrap')
        wrapper(XA, XB, dm, **kwargs)
        return dm
@dataclasses.dataclass(frozen=True)
class PDistMetricWrapper:
    """Callable computing ``pdist`` for the metric named `metric_name`.

    Without weights the work is handed to the
    ``pdist_<metric_name>_<type>_wrap`` function of ``_distance_wrap``.
    When a weight vector ``w`` is supplied, the call is routed through
    ``_pdist_callable`` with the metric's ``dist_func`` instead.
    """
    metric_name: str

    def __call__(self, X, *, out=None, **kwargs):
        X = np.ascontiguousarray(X)
        n_rows, n_cols = X.shape
        name = self.metric_name
        info = _METRICS[name]
        X, typ, kwargs = _validate_pdist_input(
            X, n_rows, n_cols, info, **kwargs)
        # Number of unordered pairs of rows: the condensed output length.
        n_pairs = (n_rows * (n_rows - 1)) // 2
        weights = kwargs.pop('w', None)
        if weights is not None:
            # Weighted case: use the generic callable-based path.
            return _pdist_callable(
                X, metric=info.dist_func, out=out, w=weights, **kwargs)

        dm = _prepare_out_argument(out, np.double, (n_pairs,))
        # Look up the wrapper matching this metric and input type.
        wrapper = getattr(_distance_wrap, f'pdist_{name}_{typ}_wrap')
        wrapper(X, dm, **kwargs)
        return dm
@dataclasses.dataclass(frozen=True)
class PDistWeightedMetricWrapper:
    """Callable computing ``pdist`` with a separate weighted wrapper.

    The unweighted call uses ``pdist_<metric_name>_<type>_wrap`` from
    ``_distance_wrap``; when ``w`` is supplied the wrapper named after
    `weighted_metric` is used and ``w`` is forwarded to it.
    """
    metric_name: str
    weighted_metric: str

    def __call__(self, X, *, out=None, **kwargs):
        X = np.ascontiguousarray(X)
        n_rows, n_cols = X.shape
        name = self.metric_name
        X, typ, kwargs = _validate_pdist_input(
            X, n_rows, n_cols, _METRICS[name], **kwargs)
        # Number of unordered pairs of rows: the condensed output length.
        n_pairs = (n_rows * (n_rows - 1)) // 2
        dm = _prepare_out_argument(out, np.double, (n_pairs,))

        weights = kwargs.pop('w', None)
        if weights is not None:
            # Switch to the weighted wrapper and pass the weights along.
            name = self.weighted_metric
            kwargs['w'] = weights

        # Look up the wrapper matching the chosen metric and input type.
        wrapper = getattr(_distance_wrap, f'pdist_{name}_{typ}_wrap')
        wrapper(X, dm, **kwargs)
        return dm
@dataclasses.dataclass(frozen=True)
class MetricInfo:
    """Immutable record describing one registered distance metric.

    It bundles the metric's names with the scalar distance function, the
    cdist/pdist implementations, and optional validation and type
    information.
    """
    # Name of python distance function
    canonical_name: str
    # All aliases, including canonical_name
    aka: Set[str]
    # unvectorized distance function
    dist_func: Callable
    # Optimized cdist function
    cdist_func: Callable
    # Optimized pdist function
    pdist_func: Callable
    # function that checks kwargs and computes default values:
    # f(X, m, n, **kwargs)
    validator: Optional[Callable] = None
    # list of supported types:
    # X (pdist) and XA (cdist) are used to choose the type. if there is no
    # match the first type is used. Default double
    # (default_factory gives every instance its own list object)
    types: List[str] = dataclasses.field(default_factory=lambda: ['double'])
    # true if out array must be C-contiguous
    requires_contiguous_out: bool = True
# Registry of implemented metrics:
# Each entry ties a canonical metric name and its aliases to the scalar
# distance function and to the cdist/pdist implementations. Some entries
# point directly at _distance_pybind functions; the others go through the
# C/PDist*MetricWrapper classes, which resolve a _distance_wrap function
# by name at call time.
_METRIC_INFOS = [
    MetricInfo(
        canonical_name='braycurtis',
        aka={'braycurtis'},
        dist_func=braycurtis,
        cdist_func=_distance_pybind.cdist_braycurtis,
        pdist_func=_distance_pybind.pdist_braycurtis,
    ),
    MetricInfo(
        canonical_name='canberra',
        aka={'canberra'},
        dist_func=canberra,
        cdist_func=_distance_pybind.cdist_canberra,
        pdist_func=_distance_pybind.pdist_canberra,
    ),
    MetricInfo(
        canonical_name='chebyshev',
        aka={'chebychev', 'chebyshev', 'cheby', 'cheb', 'ch'},
        dist_func=chebyshev,
        cdist_func=_distance_pybind.cdist_chebyshev,
        pdist_func=_distance_pybind.pdist_chebyshev,
    ),
    MetricInfo(
        canonical_name='cityblock',
        aka={'cityblock', 'cblock', 'cb', 'c'},
        dist_func=cityblock,
        cdist_func=_distance_pybind.cdist_cityblock,
        pdist_func=_distance_pybind.pdist_cityblock,
    ),
    MetricInfo(
        canonical_name='correlation',
        aka={'correlation', 'co'},
        dist_func=correlation,
        cdist_func=CDistMetricWrapper('correlation'),
        pdist_func=PDistMetricWrapper('correlation'),
    ),
    MetricInfo(
        canonical_name='cosine',
        aka={'cosine', 'cos'},
        dist_func=cosine,
        cdist_func=CDistMetricWrapper('cosine'),
        pdist_func=PDistMetricWrapper('cosine'),
    ),
    MetricInfo(
        canonical_name='dice',
        aka={'dice'},
        types=['bool'],
        dist_func=dice,
        cdist_func=CDistMetricWrapper('dice'),
        pdist_func=PDistMetricWrapper('dice'),
    ),
    MetricInfo(
        canonical_name='euclidean',
        aka={'euclidean', 'euclid', 'eu', 'e'},
        dist_func=euclidean,
        cdist_func=_distance_pybind.cdist_euclidean,
        pdist_func=_distance_pybind.pdist_euclidean,
    ),
    # 'matching' is registered as an alias of 'hamming'.
    MetricInfo(
        canonical_name='hamming',
        aka={'matching', 'hamming', 'hamm', 'ha', 'h'},
        types=['double', 'bool'],
        validator=_validate_hamming_kwargs,
        dist_func=hamming,
        cdist_func=CDistWeightedMetricWrapper('hamming', 'hamming'),
        pdist_func=PDistWeightedMetricWrapper('hamming', 'hamming'),
    ),
    MetricInfo(
        canonical_name='jaccard',
        aka={'jaccard', 'jacc', 'ja', 'j'},
        types=['double', 'bool'],
        dist_func=jaccard,
        cdist_func=CDistMetricWrapper('jaccard'),
        pdist_func=PDistMetricWrapper('jaccard'),
    ),
    MetricInfo(
        canonical_name='jensenshannon',
        aka={'jensenshannon', 'js'},
        dist_func=jensenshannon,
        cdist_func=CDistMetricWrapper('jensenshannon'),
        pdist_func=PDistMetricWrapper('jensenshannon'),
    ),
    MetricInfo(
        canonical_name='kulsinski',
        aka={'kulsinski'},
        types=['bool'],
        dist_func=kulsinski,
        cdist_func=CDistMetricWrapper('kulsinski'),
        pdist_func=PDistMetricWrapper('kulsinski'),
    ),
    MetricInfo(
        canonical_name='kulczynski1',
        aka={'kulczynski1'},
        types=['bool'],
        dist_func=kulczynski1,
        cdist_func=CDistMetricWrapper('kulczynski1'),
        pdist_func=PDistMetricWrapper('kulczynski1'),
    ),
    MetricInfo(
        canonical_name='mahalanobis',
        aka={'mahalanobis', 'mahal', 'mah'},
        validator=_validate_mahalanobis_kwargs,
        dist_func=mahalanobis,
        cdist_func=CDistMetricWrapper('mahalanobis'),
        pdist_func=PDistMetricWrapper('mahalanobis'),
    ),
    MetricInfo(
        canonical_name='minkowski',
        aka={'minkowski', 'mi', 'm', 'pnorm'},
        validator=_validate_minkowski_kwargs,
        dist_func=minkowski,
        cdist_func=_distance_pybind.cdist_minkowski,
        pdist_func=_distance_pybind.pdist_minkowski,
    ),
    MetricInfo(
        canonical_name='rogerstanimoto',
        aka={'rogerstanimoto'},
        types=['bool'],
        dist_func=rogerstanimoto,
        cdist_func=CDistMetricWrapper('rogerstanimoto'),
        pdist_func=PDistMetricWrapper('rogerstanimoto'),
    ),
    MetricInfo(
        canonical_name='russellrao',
        aka={'russellrao'},
        types=['bool'],
        dist_func=russellrao,
        cdist_func=CDistMetricWrapper('russellrao'),
        pdist_func=PDistMetricWrapper('russellrao'),
    ),
    MetricInfo(
        canonical_name='seuclidean',
        aka={'seuclidean', 'se', 's'},
        validator=_validate_seuclidean_kwargs,
        dist_func=seuclidean,
        cdist_func=CDistMetricWrapper('seuclidean'),
        pdist_func=PDistMetricWrapper('seuclidean'),
    ),
    MetricInfo(
        canonical_name='sokalmichener',
        aka={'sokalmichener'},
        types=['bool'],
        dist_func=sokalmichener,
        cdist_func=CDistMetricWrapper('sokalmichener'),
        pdist_func=PDistMetricWrapper('sokalmichener'),
    ),
    MetricInfo(
        canonical_name='sokalsneath',
        aka={'sokalsneath'},
        types=['bool'],
        dist_func=sokalsneath,
        cdist_func=CDistMetricWrapper('sokalsneath'),
        pdist_func=PDistMetricWrapper('sokalsneath'),
    ),
    MetricInfo(
        canonical_name='sqeuclidean',
        aka={'sqeuclidean', 'sqe', 'sqeuclid'},
        dist_func=sqeuclidean,
        cdist_func=_distance_pybind.cdist_sqeuclidean,
        pdist_func=_distance_pybind.pdist_sqeuclidean,
    ),
    MetricInfo(
        canonical_name='yule',
        aka={'yule'},
        types=['bool'],
        dist_func=yule,
        cdist_func=CDistMetricWrapper('yule'),
        pdist_func=PDistMetricWrapper('yule'),
    ),
]
# Lookup tables derived from the registry.
# canonical name -> MetricInfo
_METRICS = {info.canonical_name: info for info in _METRIC_INFOS}
# every alias (canonical name included) -> MetricInfo
_METRIC_ALIAS = {
    alias: info
    for info in _METRIC_INFOS
    for alias in info.aka
}

_METRICS_NAMES = list(_METRICS)

# 'test_<canonical name>' -> MetricInfo
_TEST_METRICS = {
    f'test_{info.canonical_name}': info for info in _METRIC_INFOS
}
def pdist(X, metric='euclidean', *, out=None, **kwargs):
    """
    Pairwise distances between observations in n-dimensional space.

    See Notes for common calling conventions.

    Parameters
    ----------
    X : array_like
        An m by n array of m original observations in an
        n-dimensional space.
    metric : str or function, optional
        The distance metric to use. The distance function can
        be 'braycurtis', 'canberra', 'chebyshev', 'cityblock',
        'correlation', 'cosine', 'dice', 'euclidean', 'hamming',
        'jaccard', 'jensenshannon', 'kulczynski1',
        'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
        'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
        'sqeuclidean', 'yule'.
    **kwargs : dict, optional
        Extra arguments to `metric`: refer to each metric documentation for a
        list of all possible arguments.

        Some possible arguments:

        p : scalar
        The p-norm to apply for Minkowski, weighted and unweighted.
        Default: 2.

        w : ndarray
        The weight vector for metrics that support weights (e.g., Minkowski).

        V : ndarray
        The variance vector for standardized Euclidean.
        Default: var(X, axis=0, ddof=1)

        VI : ndarray
        The inverse of the covariance matrix for Mahalanobis.
        Default: inv(cov(X.T)).T

        out : ndarray
        The output array.
        If not None, condensed distance matrix Y is stored in this array.

    Returns
    -------
    Y : ndarray
        Returns a condensed distance matrix Y. For each :math:`i` and :math:`j`
        (where :math:`i<j<m`), where m is the number of original observations,
        the metric ``dist(u=X[i], v=X[j])`` is computed and stored in entry ``m
        * i + j - ((i + 2) * (i + 1)) // 2``.

    Raises
    ------
    ValueError
        If `X` is not 2-dimensional, or if `metric` is a string that does
        not name a known metric.
    TypeError
        If `metric` is neither a string nor a callable.

    See Also
    --------
    squareform : converts between condensed distance matrices and
                 square distance matrices.

    Notes
    -----
    See ``squareform`` for information on how to calculate the index of
    this entry or to convert the condensed distance matrix to a
    redundant square matrix.

    The following are common calling conventions.

    1. ``Y = pdist(X, 'euclidean')``

       Computes the distance between m points using Euclidean distance
       (2-norm) as the distance metric between the points. The points
       are arranged as m n-dimensional row vectors in the matrix X.

    2. ``Y = pdist(X, 'minkowski', p=2.)``

       Computes the distances using the Minkowski distance
       :math:`\\|u-v\\|_p` (:math:`p`-norm) where :math:`p > 0` (note
       that this is only a quasi-metric if :math:`0 < p < 1`).

    3. ``Y = pdist(X, 'cityblock')``

       Computes the city block or Manhattan distance between the
       points.

    4. ``Y = pdist(X, 'seuclidean', V=None)``

       Computes the standardized Euclidean distance. The standardized
       Euclidean distance between two n-vectors ``u`` and ``v`` is

       .. math::

          \\sqrt{\\sum {(u_i-v_i)^2 / V[x_i]}}


       V is the variance vector; V[i] is the variance computed over all
       the i'th components of the points. If not passed, it is
       automatically computed.

    5. ``Y = pdist(X, 'sqeuclidean')``

       Computes the squared Euclidean distance :math:`\\|u-v\\|_2^2` between
       the vectors.

    6. ``Y = pdist(X, 'cosine')``

       Computes the cosine distance between vectors u and v,

       .. math::

          1 - \\frac{u \\cdot v}
                   {{\\|u\\|}_2 {\\|v\\|}_2}

       where :math:`\\|*\\|_2` is the 2-norm of its argument ``*``, and
       :math:`u \\cdot v` is the dot product of ``u`` and ``v``.

    7. ``Y = pdist(X, 'correlation')``

       Computes the correlation distance between vectors u and v. This is

       .. math::

          1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
                   {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2}

       where :math:`\\bar{v}` is the mean of the elements of vector v,
       and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.

    8. ``Y = pdist(X, 'hamming')``

       Computes the normalized Hamming distance, or the proportion of
       those vector elements between two n-vectors ``u`` and ``v``
       which disagree. To save memory, the matrix ``X`` can be of type
       boolean.

    9. ``Y = pdist(X, 'jaccard')``

       Computes the Jaccard distance between the points. Given two
       vectors, ``u`` and ``v``, the Jaccard distance is the
       proportion of those elements ``u[i]`` and ``v[i]`` that
       disagree.

    10. ``Y = pdist(X, 'jensenshannon')``

        Computes the Jensen-Shannon distance between two probability arrays.
        Given two probability vectors, :math:`p` and :math:`q`, the
        Jensen-Shannon distance is

        .. math::

           \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}

        where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
        and :math:`D` is the Kullback-Leibler divergence.

    11. ``Y = pdist(X, 'chebyshev')``

        Computes the Chebyshev distance between the points. The
        Chebyshev distance between two n-vectors ``u`` and ``v`` is the
        maximum norm-1 distance between their respective elements. More
        precisely, the distance is given by

        .. math::

           d(u,v) = \\max_i {|u_i-v_i|}

    12. ``Y = pdist(X, 'canberra')``

        Computes the Canberra distance between the points. The
        Canberra distance between two points ``u`` and ``v`` is

        .. math::

          d(u,v) = \\sum_i \\frac{|u_i-v_i|}
                               {|u_i|+|v_i|}


    13. ``Y = pdist(X, 'braycurtis')``

        Computes the Bray-Curtis distance between the points. The
        Bray-Curtis distance between two points ``u`` and ``v`` is


        .. math::

             d(u,v) = \\frac{\\sum_i {|u_i-v_i|}}
                            {\\sum_i {|u_i+v_i|}}

    14. ``Y = pdist(X, 'mahalanobis', VI=None)``

        Computes the Mahalanobis distance between the points. The
        Mahalanobis distance between two points ``u`` and ``v`` is
        :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI``
        variable) is the inverse covariance. If ``VI`` is not None,
        ``VI`` will be used as the inverse covariance matrix.

    15. ``Y = pdist(X, 'yule')``

        Computes the Yule distance between each pair of boolean
        vectors. (see yule function documentation)

    16. ``Y = pdist(X, 'matching')``

        Synonym for 'hamming'.

    17. ``Y = pdist(X, 'dice')``

        Computes the Dice distance between each pair of boolean
        vectors. (see dice function documentation)

    18. ``Y = pdist(X, 'kulczynski1')``

        Computes the Kulczynski 1 distance between each pair of
        boolean vectors. (see kulczynski1 function documentation)

    19. ``Y = pdist(X, 'rogerstanimoto')``

        Computes the Rogers-Tanimoto distance between each pair of
        boolean vectors. (see rogerstanimoto function documentation)

    20. ``Y = pdist(X, 'russellrao')``

        Computes the Russell-Rao distance between each pair of
        boolean vectors. (see russellrao function documentation)

    21. ``Y = pdist(X, 'sokalmichener')``

        Computes the Sokal-Michener distance between each pair of
        boolean vectors. (see sokalmichener function documentation)

    22. ``Y = pdist(X, 'sokalsneath')``

        Computes the Sokal-Sneath distance between each pair of
        boolean vectors. (see sokalsneath function documentation)

    23. ``Y = pdist(X, f)``

        Computes the distance between all pairs of vectors in X
        using the user supplied 2-arity function f. For example,
        Euclidean distance between the vectors could be computed
        as follows::

          dm = pdist(X, lambda u, v: np.sqrt(((u-v)**2).sum()))

        Note that you should avoid passing a reference to one of
        the distance functions defined in this library. For example,::

          dm = pdist(X, sokalsneath)

        would calculate the pair-wise distances between the vectors in
        X using the Python function sokalsneath. This would result in
        sokalsneath being called :math:`{n \\choose 2}` times, which
        is inefficient. Instead, the optimized C version is more
        efficient, and we call it using the following syntax.::

          dm = pdist(X, 'sokalsneath')

    """
    # You can also call this as:
    #     Y = pdist(X, 'test_abc')
    # where 'abc' is the metric being tested.  This computes the distance
    # between all pairs of vectors in X using the distance metric 'abc' but
    # with a more succinct, verifiable, but less efficient implementation.

    X = _asarray_validated(X, sparse_ok=False, objects_ok=True, mask_ok=True,
                           check_finite=False)

    shape = X.shape
    if len(shape) != 2:
        raise ValueError('A 2-dimensional array must be passed.')
    m, n = shape

    if callable(metric):
        # A callable whose __name__ matches a registered alias still gets
        # that metric's input/kwargs validation before the generic loop.
        name = getattr(metric, '__name__', 'UnknownCustomMetric')
        info = _METRIC_ALIAS.get(name, None)
        if info is not None:
            X, _, kwargs = _validate_pdist_input(X, m, n, info, **kwargs)
        return _pdist_callable(X, metric=metric, out=out, **kwargs)

    if not isinstance(metric, str):
        raise TypeError('2nd argument metric must be a string identifier '
                        'or a function.')

    name = metric.lower()
    info = _METRIC_ALIAS.get(name, None)
    if info is not None:
        # Registered metric: delegate to its pdist implementation.
        return info.pdist_func(X, out=out, **kwargs)

    if not name.startswith("test_"):
        raise ValueError('Unknown Distance Metric: %s' % name)

    # 'test_<metric>': run the plain Python distance function pairwise.
    info = _TEST_METRICS.get(name, None)
    if info is None:
        raise ValueError(f'Unknown "Test" Distance Metric: {name[5:]}')
    X, _, kwargs = _validate_pdist_input(X, m, n, info, **kwargs)
    return _pdist_callable(X, metric=info.dist_func, out=out, **kwargs)
def squareform(X, force="no", checks=True):
    """
    Convert a condensed distance vector to a square distance matrix,
    or a square distance matrix to a condensed distance vector.

    Parameters
    ----------
    X : array_like
        Either a condensed or redundant distance matrix.
    force : str, optional
        As with MATLAB(TM), ``'tovector'`` requires the input to be a
        distance matrix (2-D) and ``'tomatrix'`` requires it to be a
        distance vector (1-D); a ``ValueError`` is raised when the input
        does not match. With any other value the direction is chosen from
        the dimensionality of `X`. The comparison is case-insensitive.
    checks : bool, optional
        If set to False, no checks will be made for matrix
        symmetry nor zero diagonals. This is useful if it is known that
        ``X - X.T`` is small and ``diag(X)`` is close to zero.
        These values are ignored any way so they do not disrupt the
        squareform transformation.

    Returns
    -------
    Y : ndarray
        If a condensed distance matrix is passed, a redundant one is
        returned, or if a redundant one is passed, a condensed distance
        matrix is returned.

    Raises
    ------
    ValueError
        If `force` contradicts the dimensionality of `X`, if a vector
        length is not a binomial coefficient, if a matrix is not square
        (or fails validation while `checks` is True), or if `X` has more
        than two dimensions.

    Notes
    -----
    1. ``v = squareform(X)``

       Given a square n-by-n symmetric distance matrix ``X``,
       ``v = squareform(X)`` returns a ``n * (n-1) / 2``
       (i.e. binomial coefficient n choose 2) sized vector `v`
       where :math:`v[{n \\choose 2} - {n-i \\choose 2} + (j-i-1)]`
       is the distance between distinct points ``i`` and ``j``.
       If ``X`` is non-square or asymmetric, an error is raised.

    2. ``X = squareform(v)``

       Given a ``n * (n-1) / 2`` sized vector ``v``
       for some integer ``n >= 1`` encoding distances as described,
       ``X = squareform(v)`` returns a n-by-n distance matrix ``X``.
       The ``X[i, j]`` and ``X[j, i]`` values are set to
       :math:`v[{n \\choose 2} - {n-i \\choose 2} + (j-i-1)]`
       and all diagonal elements are zero.

    In SciPy 0.19.0, ``squareform`` stopped casting all input types to
    float64, and started returning arrays of the same dtype as the input.

    """
    X = np.ascontiguousarray(X)
    shape = X.shape
    ndim = len(shape)

    # Honour an explicitly requested direction before doing any work.
    mode = force.lower()
    if mode == 'tomatrix' and ndim != 1:
        raise ValueError("Forcing 'tomatrix' but input X is not a "
                         "distance vector.")
    if mode == 'tovector' and ndim != 2:
        raise ValueError("Forcing 'tovector' but input X is not a "
                         "distance matrix.")

    if ndim not in (1, 2):
        raise ValueError(('The first argument must be one or two dimensional '
                          'array. A %d-dimensional array is not '
                          'permitted') % ndim)

    if ndim == 1:
        # Condensed vector -> square matrix.
        length = shape[0]
        if length == 0:
            return np.zeros((1, 1), dtype=X.dtype)

        # The only candidate side length is ceil(sqrt(2 * length)); the
        # vector is valid exactly when that candidate satisfies
        # side * (side - 1) == 2 * length.
        side = int(np.ceil(np.sqrt(length * 2)))
        if side * (side - 1) != length * 2:
            raise ValueError('Incompatible vector size. It must be a binomial '
                             'coefficient n choose 2 for some integer n >= 2.')

        square = np.zeros((side, side), dtype=X.dtype)

        # The C routine works from dimensions rather than strides, so hand
        # it whatever _copy_array_if_base_present returns for X.
        X = _copy_array_if_base_present(X)
        _distance_wrap.to_squareform_from_vector_wrap(square, X)
        return square

    # Square matrix -> condensed vector.
    if shape[0] != shape[1]:
        raise ValueError('The matrix argument must be square.')
    if checks:
        is_valid_dm(X, throw=True, name='X')

    side = shape[0]
    if side <= 1:
        # Fewer than two observations: there are no pairwise distances.
        return np.array([], dtype=X.dtype)

    condensed = np.zeros((side * (side - 1)) // 2, dtype=X.dtype)

    # The C routine works from dimensions rather than strides, so hand
    # it whatever _copy_array_if_base_present returns for X.
    X = _copy_array_if_base_present(X)

    # Copy the square matrix's pairwise distances into the vector.
    _distance_wrap.to_vector_from_squareform_wrap(X, condensed)
    return condensed
def is_valid_dm(D, tol=0.0, throw=False, name="D", warning=False):
    """
    Return True if input array is a valid distance matrix.

    Distance matrices must be 2-dimensional, square numpy arrays.
    They must have a zero-diagonal, and they must be symmetric.

    Parameters
    ----------
    D : array_like
        The candidate object to test for validity.
    tol : float, optional
        The distance matrix should be symmetric. `tol` is the maximum
        difference between entries ``ij`` and ``ji`` for the distance
        metric to be considered symmetric. It is also the maximum
        absolute value allowed on the diagonal.
    throw : bool, optional
        An exception is thrown if the distance matrix passed is not valid.
    name : str, optional
        The name of the variable to checked. This is useful if
        throw is set to True so the offending variable can be identified
        in the exception message when an exception is thrown.
    warning : bool, optional
        Instead of throwing an exception, a warning message is
        raised.

    Returns
    -------
    valid : bool
        True if the variable `D` passed is a valid distance matrix.

    Raises
    ------
    ValueError
        If `throw` is True and `D` is not two-dimensional, not square,
        not symmetric, or has a non-zero diagonal (each within `tol`).

    Notes
    -----
    Small numerical differences in `D` and `D.T` and non-zeroness of
    the diagonal are ignored if they are within the tolerance specified
    by `tol`.

    """
    D = np.asarray(D, order='c')

    # Common message prefix; names the offending variable when one is given.
    label = f"Distance matrix '{name}'" if name else "Distance matrix"

    try:
        if D.ndim != 2:
            raise ValueError(f"{label} must have shape=2 "
                             "(i.e. be two-dimensional).")

        # Without this check a 1-by-n (or n-by-1) array broadcasts against
        # its transpose and could be reported as a valid distance matrix.
        if D.shape[0] != D.shape[1]:
            raise ValueError(f"{label} must be square.")

        diagonal = D.diagonal()

        if tol == 0.0:
            if not (D == D.T).all():
                raise ValueError(f"{label} must be symmetric.")
            if not (diagonal == 0).all():
                raise ValueError(f"{label} diagonal must be zero.")
        else:
            # D - D.T is antisymmetric, so bounding every entry from above
            # bounds the absolute difference of each (i, j)/(j, i) pair.
            if not (D - D.T <= tol).all():
                raise ValueError(
                    f"{label} must be symmetric within tolerance "
                    + "%5.5f." % tol)
            # The diagonal has no mirrored entry, so the absolute value is
            # required to reject large negative entries as well.
            if not (np.abs(diagonal) <= tol).all():
                raise ValueError(
                    f"{label} diagonal must be close to zero within "
                    + "tolerance %5.5f." % tol)
    except Exception as e:
        # Best-effort validation: any failure while checking means `D`
        # is not a valid distance matrix.
        if throw:
            raise
        if warning:
            warnings.warn(str(e))
        return False
    return True
def is_valid_y(y, warning=False, throw=False, name=None):
    """
    Return True if the input array is a valid condensed distance matrix.

    Condensed distance matrices must be 1-dimensional numpy arrays.
    Their length must be a binomial coefficient :math:`{n \\choose 2}`
    for some positive integer n.

    Parameters
    ----------
    y : array_like
        The condensed distance matrix.
    warning : bool, optional
        Invokes a warning if the variable passed is not a valid
        condensed distance matrix. The warning message explains why
        the distance matrix is not valid. `name` is used when
        referencing the offending variable.
    throw : bool, optional
        Throws an exception if the variable passed is not a valid
        condensed distance matrix.
    name : str, optional
        Used when referencing the offending variable in the
        warning or exception message.

    Returns
    -------
    valid : bool
        True if the variable `y` passed is a valid condensed distance
        matrix.

    Raises
    ------
    ValueError
        If `throw` is True and `y` is not one-dimensional or its length
        is not a binomial coefficient.

    """
    y = np.asarray(y, order='c')

    # Common message fragment; names the offending variable when given.
    label = (f"condensed distance matrix '{name}'" if name
             else "condensed distance matrix")

    try:
        if y.ndim != 1:
            raise ValueError(f"C{label[1:]} must have shape=1 "
                             "(i.e. be one-dimensional).")

        n = y.shape[0]
        # The only candidate number of observations is ceil(sqrt(2 * n));
        # compare in exact integer arithmetic, as squareform does.
        d = int(np.ceil(np.sqrt(n * 2)))
        if d * (d - 1) != 2 * n:
            raise ValueError(f"Length n of {label} must be a binomial "
                             "coefficient, i.e. there must be a k such "
                             "that (k \\choose 2)=n)!")
    except Exception as e:
        # Best-effort validation: any failure while checking means `y`
        # is not a valid condensed distance matrix.
        if throw:
            raise
        if warning:
            warnings.warn(str(e))
        return False
    return True
def num_obs_dm(d):
    """
    Return the number of original observations that correspond to a
    square, redundant distance matrix.

    Parameters
    ----------
    d : array_like
        The target distance matrix.

    Returns
    -------
    num_obs_dm : int
        The number of observations in the redundant distance matrix.

    Raises
    ------
    Exception
        Whatever `is_valid_dm` raises (called with ``throw=True``) when
        `d` does not pass its validation.

    """
    matrix = np.asarray(d, order='c')
    # tol=np.inf loosens the symmetry / zero-diagonal tolerance as far as
    # it goes, so this call mainly validates the shape of the input.
    is_valid_dm(matrix, tol=np.inf, throw=True, name='d')
    num_rows = matrix.shape[0]
    return num_rows
def num_obs_y(Y):
    """
    Return the number of original observations that correspond to a
    condensed distance matrix.

    Parameters
    ----------
    Y : array_like
        Condensed distance matrix.

    Returns
    -------
    n : int
        The number of observations in the condensed distance matrix `Y`.

    Raises
    ------
    ValueError
        If `Y` is empty, or if its length is not a binomial coefficient
        ``n choose 2``. `is_valid_y` is also called with ``throw=True``
        and may raise for an invalid `Y`.

    """
    condensed = np.asarray(Y, order='c')
    is_valid_y(condensed, throw=True, name='Y')

    length = condensed.shape[0]
    if not length:
        raise ValueError("The number of observations cannot be determined on "
                         "an empty distance matrix.")

    # Candidate observation count; accepted only if it reproduces `length`
    # as (num_obs choose 2).
    num_obs = int(np.ceil(np.sqrt(length * 2)))
    if (num_obs * (num_obs - 1) / 2) != length:
        raise ValueError("Invalid condensed distance matrix passed. Must be "
                         "some k where k=(n choose 2) for some n >= 2.")
    return num_obs
def _prepare_out_argument(out, dtype, expected_shape):
    """Return a usable output array for a distance computation.

    When `out` is None a new uninitialised array of `expected_shape` and
    `dtype` is allocated. Otherwise `out` is returned unchanged after
    checking, in this order, that it has `expected_shape`, is
    C-contiguous, and has double dtype; the first failed check raises
    ``ValueError``.
    """
    if out is not None:
        if out.shape != expected_shape:
            raise ValueError("Output array has incorrect shape.")
        if not out.flags.c_contiguous:
            raise ValueError("Output array must be C-contiguous.")
        # A caller-supplied buffer must be double regardless of `dtype`.
        if out.dtype != np.double:
            raise ValueError("Output array must be double type.")
        return out

    return np.empty(expected_shape, dtype=dtype)
def _pdist_callable(X, *, out, metric, **kwargs):
    """Fill a condensed distance vector by calling `metric` on row pairs.

    `metric` is called as ``metric(X[i], X[j], **kwargs)`` for every pair
    of row indices ``i < j``, and the results are stored consecutively in
    a double array of length ``n * (n - 1) // 2`` (``n = X.shape[0]``)
    obtained from `_prepare_out_argument`, which is then returned.
    """
    num_rows = X.shape[0]
    condensed_len = (num_rows * (num_rows - 1)) // 2
    dm = _prepare_out_argument(out, np.double, (condensed_len,))

    # Row-index pairs (i, j) with i < j, i varying slowest.
    row_pairs = ((i, j)
                 for i in range(num_rows - 1)
                 for j in range(i + 1, num_rows))
    for slot, (i, j) in enumerate(row_pairs):
        dm[slot] = metric(X[i], X[j], **kwargs)
    return dm
def _cdist_callable(XA, XB, *, out, metric, **kwargs):
    """Fill a distance matrix by calling `metric` on every row pair.

    `metric` is called as ``metric(XA[i], XB[j], **kwargs)`` for each row
    ``i`` of `XA` and each row ``j`` of `XB`; the result is stored at
    ``[i, j]`` of a double array of shape ``(XA.shape[0], XB.shape[0])``
    obtained from `_prepare_out_argument`, which is then returned.
    """
    rows_a, rows_b = XA.shape[0], XB.shape[0]
    dm = _prepare_out_argument(out, np.double, (rows_a, rows_b))

    # Visit the cells in row-major order.
    cells = ((i, j) for i in range(rows_a) for j in range(rows_b))
    for i, j in cells:
        dm[i, j] = metric(XA[i], XB[j], **kwargs)
    return dm
def cdist(XA, XB, metric='euclidean', *, out=None, **kwargs):
    """
    Compute distance between each pair of the two collections of inputs.

    See Notes for common calling conventions.

    Parameters
    ----------
    XA : array_like
        An :math:`m_A` by :math:`n` array of :math:`m_A`
        original observations in an :math:`n`-dimensional space.
        Inputs are converted to float type.
    XB : array_like
        An :math:`m_B` by :math:`n` array of :math:`m_B`
        original observations in an :math:`n`-dimensional space.
        Inputs are converted to float type.
    metric : str or callable, optional
        The distance metric to use. If a string, the distance function can be
        'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation',
        'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon',
        'kulczynski1', 'mahalanobis', 'matching', 'minkowski',
        'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
        'sokalsneath', 'sqeuclidean', 'yule'.
    **kwargs : dict, optional
        Extra arguments to `metric`: refer to each metric documentation for a
        list of all possible arguments.

        Some possible arguments:

        p : scalar
        The p-norm to apply for Minkowski, weighted and unweighted.
        Default: 2.

        w : array_like
        The weight vector for metrics that support weights (e.g., Minkowski).

        V : array_like
        The variance vector for standardized Euclidean.
        Default: var(vstack([XA, XB]), axis=0, ddof=1)

        VI : array_like
        The inverse of the covariance matrix for Mahalanobis.
        Default: inv(cov(vstack([XA, XB]).T)).T

        out : ndarray
        The output array.
        If not None, the distance matrix Y is stored in this array.

    Returns
    -------
    Y : ndarray
        A :math:`m_A` by :math:`m_B` distance matrix is returned.
        For each :math:`i` and :math:`j`, the metric
        ``dist(u=XA[i], v=XB[j])`` is computed and stored in the
        :math:`ij` th entry.

    Raises
    ------
    ValueError
        An exception is thrown if `XA` and `XB` do not have
        the same number of columns, if either is not 2-dimensional,
        or if `metric` is a string that names no known metric.
    TypeError
        If `metric` is neither a string nor a callable.

    Notes
    -----
    The following are common calling conventions:

    1. ``Y = cdist(XA, XB, 'euclidean')``

       Computes the distance between each point of ``XA`` and each point
       of ``XB`` using Euclidean distance (2-norm) as the distance metric
       between the points. The points are arranged as
       :math:`n`-dimensional row vectors in the matrices ``XA`` and
       ``XB``.

    2. ``Y = cdist(XA, XB, 'minkowski', p=2.)``

       Computes the distances using the Minkowski distance
       :math:`\\|u-v\\|_p` (:math:`p`-norm) where :math:`p > 0` (note
       that this is only a quasi-metric if :math:`0 < p < 1`).

    3. ``Y = cdist(XA, XB, 'cityblock')``

       Computes the city block or Manhattan distance between the
       points.

    4. ``Y = cdist(XA, XB, 'seuclidean', V=None)``

       Computes the standardized Euclidean distance. The standardized
       Euclidean distance between two n-vectors ``u`` and ``v`` is

       .. math::

          \\sqrt{\\sum {(u_i-v_i)^2 / V[i]}}.

       V is the variance vector; V[i] is the variance computed over all
       the i'th components of the points. If not passed, it is
       automatically computed.

    5. ``Y = cdist(XA, XB, 'sqeuclidean')``

       Computes the squared Euclidean distance :math:`\\|u-v\\|_2^2` between
       the vectors.

    6. ``Y = cdist(XA, XB, 'cosine')``

       Computes the cosine distance between vectors u and v,

       .. math::

          1 - \\frac{u \\cdot v}
                   {{\\|u\\|}_2 {\\|v\\|}_2}

       where :math:`\\|*\\|_2` is the 2-norm of its argument ``*``, and
       :math:`u \\cdot v` is the dot product of :math:`u` and :math:`v`.

    7. ``Y = cdist(XA, XB, 'correlation')``

       Computes the correlation distance between vectors u and v. This is

       .. math::

          1 - \\frac{(u - \\bar{u}) \\cdot (v - \\bar{v})}
                   {{\\|(u - \\bar{u})\\|}_2 {\\|(v - \\bar{v})\\|}_2}

       where :math:`\\bar{v}` is the mean of the elements of vector v,
       and :math:`x \\cdot y` is the dot product of :math:`x` and :math:`y`.


    8. ``Y = cdist(XA, XB, 'hamming')``

       Computes the normalized Hamming distance, or the proportion of
       those vector elements between two n-vectors ``u`` and ``v``
       which disagree. To save memory, the matrices ``XA`` and ``XB``
       can be of type boolean.

    9. ``Y = cdist(XA, XB, 'jaccard')``

       Computes the Jaccard distance between the points. Given two
       vectors, ``u`` and ``v``, the Jaccard distance is the
       proportion of those elements ``u[i]`` and ``v[i]`` that
       disagree where at least one of them is non-zero.

    10. ``Y = cdist(XA, XB, 'jensenshannon')``

        Computes the Jensen-Shannon distance between two probability arrays.
        Given two probability vectors, :math:`p` and :math:`q`, the
        Jensen-Shannon distance is

        .. math::

           \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}

        where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
        and :math:`D` is the Kullback-Leibler divergence.

    11. ``Y = cdist(XA, XB, 'chebyshev')``

        Computes the Chebyshev distance between the points. The
        Chebyshev distance between two n-vectors ``u`` and ``v`` is the
        maximum absolute difference between their respective elements.
        More precisely, the distance is given by

        .. math::

           d(u,v) = \\max_i {|u_i-v_i|}.

    12. ``Y = cdist(XA, XB, 'canberra')``

        Computes the Canberra distance between the points. The
        Canberra distance between two points ``u`` and ``v`` is

        .. math::

          d(u,v) = \\sum_i \\frac{|u_i-v_i|}
                               {|u_i|+|v_i|}.

    13. ``Y = cdist(XA, XB, 'braycurtis')``

        Computes the Bray-Curtis distance between the points. The
        Bray-Curtis distance between two points ``u`` and ``v`` is


        .. math::

             d(u,v) = \\frac{\\sum_i (|u_i-v_i|)}
                           {\\sum_i (|u_i+v_i|)}

    14. ``Y = cdist(XA, XB, 'mahalanobis', VI=None)``

        Computes the Mahalanobis distance between the points. The
        Mahalanobis distance between two points ``u`` and ``v`` is
        :math:`\\sqrt{(u-v)(1/V)(u-v)^T}` where :math:`(1/V)` (the ``VI``
        variable) is the inverse covariance. If ``VI`` is not None,
        ``VI`` will be used as the inverse covariance matrix.

    15. ``Y = cdist(XA, XB, 'yule')``

        Computes the Yule distance between the boolean
        vectors. (see `yule` function documentation)

    16. ``Y = cdist(XA, XB, 'matching')``

        Synonym for 'hamming'.

    17. ``Y = cdist(XA, XB, 'dice')``

        Computes the Dice distance between the boolean vectors. (see
        `dice` function documentation)

    18. ``Y = cdist(XA, XB, 'kulczynski1')``

        Computes the Kulczynski 1 distance between the boolean
        vectors. (see `kulczynski1` function documentation)

    19. ``Y = cdist(XA, XB, 'rogerstanimoto')``

        Computes the Rogers-Tanimoto distance between the boolean
        vectors. (see `rogerstanimoto` function documentation)

    20. ``Y = cdist(XA, XB, 'russellrao')``

        Computes the Russell-Rao distance between the boolean
        vectors. (see `russellrao` function documentation)

    21. ``Y = cdist(XA, XB, 'sokalmichener')``

        Computes the Sokal-Michener distance between the boolean
        vectors. (see `sokalmichener` function documentation)

    22. ``Y = cdist(XA, XB, 'sokalsneath')``

        Computes the Sokal-Sneath distance between the vectors. (see
        `sokalsneath` function documentation)

    23. ``Y = cdist(XA, XB, f)``

        Computes the distance between all pairs of vectors drawn from
        ``XA`` and ``XB`` using the user supplied 2-arity function f.
        For example, Euclidean distance between the vectors could be
        computed as follows::

          dm = cdist(XA, XB, lambda u, v: np.sqrt(((u-v)**2).sum()))

        Note that you should avoid passing a reference to one of
        the distance functions defined in this library. For example,::

          dm = cdist(XA, XB, sokalsneath)

        would calculate the pair-wise distances between the vectors in
        ``XA`` and ``XB`` using the Python function `sokalsneath`. This
        would result in sokalsneath being called :math:`m_A m_B` times,
        which is inefficient. Instead, the optimized C version is more
        efficient, and we call it using the following syntax::

          dm = cdist(XA, XB, 'sokalsneath')

    Examples
    --------
    Find the Euclidean distances between four 2-D coordinates:

    >>> from scipy.spatial import distance
    >>> import numpy as np
    >>> coords = [(35.0456, -85.2672),
    ...           (35.1174, -89.9711),
    ...           (35.9728, -83.9422),
    ...           (36.1667, -86.7833)]
    >>> distance.cdist(coords, coords, 'euclidean')
    array([[ 0.    ,  4.7044,  1.6172,  1.8856],
           [ 4.7044,  0.    ,  6.0893,  3.3561],
           [ 1.6172,  6.0893,  0.    ,  2.8477],
           [ 1.8856,  3.3561,  2.8477,  0.    ]])


    Find the Manhattan distance from a 3-D point to the corners of the unit
    cube:

    >>> a = np.array([[0, 0, 0],
    ...               [0, 0, 1],
    ...               [0, 1, 0],
    ...               [0, 1, 1],
    ...               [1, 0, 0],
    ...               [1, 0, 1],
    ...               [1, 1, 0],
    ...               [1, 1, 1]])
    >>> b = np.array([[ 0.1,  0.2,  0.4]])
    >>> distance.cdist(a, b, 'cityblock')
    array([[ 0.7],
           [ 0.9],
           [ 1.3],
           [ 1.5],
           [ 1.5],
           [ 1.7],
           [ 2.1],
           [ 2.3]])

    """
    # You can also call this as:
    #     Y = cdist(XA, XB, 'test_abc')
    # where 'abc' is the metric being tested.  This computes the distance
    # between all pairs of vectors in XA and XB using the distance metric 'abc'
    # but with a more succinct, verifiable, but less efficient implementation.

    XA = np.asarray(XA)
    XB = np.asarray(XB)

    shape_a = XA.shape
    shape_b = XB.shape

    if len(shape_a) != 2:
        raise ValueError('XA must be a 2-dimensional array.')
    if len(shape_b) != 2:
        raise ValueError('XB must be a 2-dimensional array.')
    if shape_a[1] != shape_b[1]:
        raise ValueError('XA and XB must have the same number of columns '
                         '(i.e. feature dimension.)')

    mA, n = shape_a
    mB = shape_b[0]

    if callable(metric):
        # A callable whose __name__ matches a registered metric gets that
        # metric's input validation before the generic Python loop runs.
        metric_name = getattr(metric, '__name__', 'Unknown')
        metric_info = _METRIC_ALIAS.get(metric_name, None)
        if metric_info is not None:
            XA, XB, _, kwargs = _validate_cdist_input(
                XA, XB, mA, mB, n, metric_info, **kwargs)
        return _cdist_callable(XA, XB, metric=metric, out=out, **kwargs)

    if not isinstance(metric, str):
        raise TypeError('2nd argument metric must be a string identifier '
                        'or a function.')

    mstr = metric.lower()
    metric_info = _METRIC_ALIAS.get(mstr, None)
    if metric_info is not None:
        # Registered metric: dispatch to its dedicated cdist routine.
        return metric_info.cdist_func(XA, XB, out=out, **kwargs)

    if not mstr.startswith("test_"):
        raise ValueError('Unknown Distance Metric: %s' % mstr)

    # 'test_<name>': run the registered test metric's Python distance
    # function through the generic loop.
    metric_info = _TEST_METRICS.get(mstr, None)
    if metric_info is None:
        raise ValueError(f'Unknown "Test" Distance Metric: {mstr[5:]}')
    XA, XB, _, kwargs = _validate_cdist_input(
        XA, XB, mA, mB, n, metric_info, **kwargs)
    return _cdist_callable(
        XA, XB, metric=metric_info.dist_func, out=out, **kwargs)