# Copyright Anne M. Archibald 2008
# Released under the scipy license
import numpy as np
from ._ckdtree import cKDTree, cKDTreeNode

__all__ = ['minkowski_distance_p', 'minkowski_distance',
           'distance_matrix',
           'Rectangle', 'KDTree']


def minkowski_distance_p(x, y, p=2):
    """Compute the pth power of the L**p distance between two arrays.

    For efficiency, this function computes the L**p distance but does
    not extract the pth root. If `p` is 1 or infinity, this is equal to
    the actual L**p distance.

    The last dimensions of `x` and `y` must be the same length. Any
    other dimensions must be compatible for broadcasting.

    Parameters
    ----------
    x : (..., K) array_like
        Input array.
    y : (..., K) array_like
        Input array.
    p : float, 1 <= p <= infinity
        Which Minkowski p-norm to use.

    Returns
    -------
    dist : ndarray
        pth power of the distance between the input arrays.

    Examples
    --------
    >>> from scipy.spatial import minkowski_distance_p
    >>> minkowski_distance_p([[0, 0], [0, 0]], [[1, 1], [0, 1]])
    array([2., 1.])
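
    With ``p=np.inf`` no root extraction is needed, so (an illustrative
    check that follows from the code below) the result already equals
    the true Chebyshev distance:

    >>> import numpy as np
    >>> minkowski_distance_p([[0, 0], [0, 0]], [[1, 1], [0, 1]], p=np.inf)
    array([1., 1.])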
41 """
42 x = np.asarray(x)
43 y = np.asarray(y)
45 # Find smallest common datatype with float64 (return type of this
46 # function) - addresses #10262.
47 # Don't just cast to float64 for complex input case.
48 common_datatype = np.promote_types(np.promote_types(x.dtype, y.dtype),
49 'float64')
51 # Make sure x and y are NumPy arrays of correct datatype.
52 x = x.astype(common_datatype)
53 y = y.astype(common_datatype)
55 if p == np.inf:
56 return np.amax(np.abs(y-x), axis=-1)
57 elif p == 1:
58 return np.sum(np.abs(y-x), axis=-1)
59 else:
60 return np.sum(np.abs(y-x)**p, axis=-1)


def minkowski_distance(x, y, p=2):
    """Compute the L**p distance between two arrays.

    The last dimensions of `x` and `y` must be the same length. Any
    other dimensions must be compatible for broadcasting.

    Parameters
    ----------
    x : (..., K) array_like
        Input array.
    y : (..., K) array_like
        Input array.
    p : float, 1 <= p <= infinity
        Which Minkowski p-norm to use.

    Returns
    -------
    dist : ndarray
        Distance between the input arrays.

    Examples
    --------
    >>> from scipy.spatial import minkowski_distance
    >>> minkowski_distance([[0, 0], [0, 0]], [[1, 1], [0, 1]])
    array([ 1.41421356,  1.        ])
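
    The arrays broadcast against each other, so, for instance, the
    distances from one point to several points can be computed in a
    single call:

    >>> minkowski_distance([0, 0], [[1, 1], [0, 1]])
    array([1.41421356, 1.        ])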
89 """
90 x = np.asarray(x)
91 y = np.asarray(y)
92 if p == np.inf or p == 1:
93 return minkowski_distance_p(x, y, p)
94 else:
95 return minkowski_distance_p(x, y, p)**(1./p)


class Rectangle:
    """Hyperrectangle class.

    Represents a Cartesian product of intervals.
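
    Examples
    --------
    For instance, the unit square can be built from its two corners:

    >>> from scipy.spatial import Rectangle
    >>> rect = Rectangle([1, 1], [0, 0])
    >>> rect.volume()
    1.0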
102 """
103 def __init__(self, maxes, mins):
104 """Construct a hyperrectangle."""
105 self.maxes = np.maximum(maxes,mins).astype(float)
106 self.mins = np.minimum(maxes,mins).astype(float)
107 self.m, = self.maxes.shape
109 def __repr__(self):
110 return "<Rectangle %s>" % list(zip(self.mins, self.maxes))
112 def volume(self):
113 """Total volume."""
114 return np.prod(self.maxes-self.mins)
116 def split(self, d, split):
117 """Produce two hyperrectangles by splitting.
119 In general, if you need to compute maximum and minimum
120 distances to the children, it can be done more efficiently
121 by updating the maximum and minimum distances to the parent.
123 Parameters
124 ----------
125 d : int
126 Axis to split hyperrectangle along.
127 split : float
128 Position along axis `d` to split at.
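
        Examples
        --------
        Splitting the unit square along axis 0 at 0.5, for example,
        yields the two expected halves:

        >>> from scipy.spatial import Rectangle
        >>> less, greater = Rectangle([1, 1], [0, 0]).split(0, 0.5)
        >>> less
        <Rectangle [(0.0, 0.5), (0.0, 1.0)]>
        >>> greater
        <Rectangle [(0.5, 1.0), (0.0, 1.0)]>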
130 """
131 mid = np.copy(self.maxes)
132 mid[d] = split
133 less = Rectangle(self.mins, mid)
134 mid = np.copy(self.mins)
135 mid[d] = split
136 greater = Rectangle(mid, self.maxes)
137 return less, greater
139 def min_distance_point(self, x, p=2.):
140 """
141 Return the minimum distance between input and points in the
142 hyperrectangle.
144 Parameters
145 ----------
146 x : array_like
147 Input.
148 p : float, optional
149 Input.
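
        Examples
        --------
        For a point inside the rectangle the minimum distance is zero;
        outside, it is the distance to the nearest face or corner:

        >>> from scipy.spatial import Rectangle
        >>> rect = Rectangle([1, 1], [0, 0])
        >>> rect.min_distance_point([0.5, 0.5])
        0.0
        >>> rect.min_distance_point([2, 0.5])
        1.0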
151 """
152 return minkowski_distance(
153 0, np.maximum(0, np.maximum(self.mins-x, x-self.maxes)),
154 p
155 )
157 def max_distance_point(self, x, p=2.):
158 """
159 Return the maximum distance between input and points in the hyperrectangle.
161 Parameters
162 ----------
163 x : array_like
164 Input array.
165 p : float, optional
166 Input.
168 """
169 return minkowski_distance(0, np.maximum(self.maxes-x, x-self.mins), p)
171 def min_distance_rectangle(self, other, p=2.):
172 """
173 Compute the minimum distance between points in the two hyperrectangles.
175 Parameters
176 ----------
177 other : hyperrectangle
178 Input.
179 p : float
180 Input.
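
        Examples
        --------
        For example, two unit squares separated by a gap of 1 along
        axis 0 are at minimum distance 1:

        >>> from scipy.spatial import Rectangle
        >>> a = Rectangle([1, 1], [0, 0])
        >>> b = Rectangle([3, 1], [2, 0])
        >>> a.min_distance_rectangle(b)
        1.0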
182 """
183 return minkowski_distance(
184 0,
185 np.maximum(0, np.maximum(self.mins-other.maxes,
186 other.mins-self.maxes)),
187 p
188 )
190 def max_distance_rectangle(self, other, p=2.):
191 """
192 Compute the maximum distance between points in the two hyperrectangles.
194 Parameters
195 ----------
196 other : hyperrectangle
197 Input.
198 p : float, optional
199 Input.
201 """
202 return minkowski_distance(
203 0, np.maximum(self.maxes-other.mins, other.maxes-self.mins), p)


class KDTree(cKDTree):
    """kd-tree for quick nearest-neighbor lookup.

    This class provides an index into a set of k-dimensional points
    which can be used to rapidly look up the nearest neighbors of any
    point.

    Parameters
    ----------
    data : array_like, shape (n,m)
        The n data points of dimension m to be indexed. This array is
        not copied unless this is necessary to produce a contiguous
        array of doubles, and so modifying this data will result in
        bogus results. The data are also copied if the kd-tree is built
        with copy_data=True.
    leafsize : positive int, optional
        The number of points at which the algorithm switches over to
        brute-force. Default: 10.
    compact_nodes : bool, optional
        If True, the kd-tree is built to shrink the hyperrectangles to
        the actual data range. This usually gives a more compact tree that
        is robust against degenerated input data and gives faster queries
        at the expense of longer build time. Default: True.
    copy_data : bool, optional
        If True the data is always copied to protect the kd-tree against
        data corruption. Default: False.
    balanced_tree : bool, optional
        If True, the median is used to split the hyperrectangles instead of
        the midpoint. This usually gives a more compact tree and
        faster queries at the expense of longer build time. Default: True.
    boxsize : array_like or scalar, optional
        Apply an m-d toroidal topology to the KDTree. The topology is
        generated by :math:`x_i + n_i L_i` where :math:`n_i` are integers
        and :math:`L_i` is the boxsize along the i-th dimension. The input
        data must be wrapped into :math:`[0, L_i)`. A ValueError is raised
        if any of the data is outside of this bound.

    Notes
    -----
    The algorithm used is described in Maneewongvatana and Mount 1999.
    The general idea is that the kd-tree is a binary tree, each of whose
    nodes represents an axis-aligned hyperrectangle. Each node specifies
    an axis and splits the set of points based on whether their coordinate
    along that axis is greater than or less than a particular value.

    During construction, the axis and splitting point are chosen by the
    "sliding midpoint" rule, which ensures that the cells do not all
    become long and thin.

    The tree can be queried for the r closest neighbors of any given point
    (optionally returning only those within some maximum distance of the
    point). It can also be queried, with a substantial gain in efficiency,
    for the r approximate closest neighbors.

    For large dimensions (20 is already large) do not expect this to run
    significantly faster than brute force. High-dimensional nearest-neighbor
    queries are a substantial open problem in computer science.

    Attributes
    ----------
    data : ndarray, shape (n,m)
        The n data points of dimension m to be indexed. This array is
        not copied unless this is necessary to produce a contiguous
        array of doubles. The data are also copied if the kd-tree is built
        with `copy_data=True`.
    leafsize : positive int
        The number of points at which the algorithm switches over to
        brute-force.
    m : int
        The dimension of a single data-point.
    n : int
        The number of data points.
    maxes : ndarray, shape (m,)
        The maximum value in each dimension of the n data points.
    mins : ndarray, shape (m,)
        The minimum value in each dimension of the n data points.
    size : int
        The number of nodes in the tree.
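
    Examples
    --------
    As a minimal illustration, build a tree over a small point set and
    inspect the attributes described above:

    >>> import numpy as np
    >>> from scipy.spatial import KDTree
    >>> tree = KDTree(np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]]))
    >>> tree.n, tree.m
    (4, 2)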
285 """
287 class node:
288 @staticmethod
289 def _create(ckdtree_node=None):
290 """Create either an inner or leaf node, wrapping a cKDTreeNode instance"""
291 if ckdtree_node is None:
292 return KDTree.node(ckdtree_node)
293 elif ckdtree_node.split_dim == -1:
294 return KDTree.leafnode(ckdtree_node)
295 else:
296 return KDTree.innernode(ckdtree_node)
298 def __init__(self, ckdtree_node=None):
299 if ckdtree_node is None:
300 ckdtree_node = cKDTreeNode()
301 self._node = ckdtree_node
303 def __lt__(self, other):
304 return id(self) < id(other)
306 def __gt__(self, other):
307 return id(self) > id(other)
309 def __le__(self, other):
310 return id(self) <= id(other)
312 def __ge__(self, other):
313 return id(self) >= id(other)
315 def __eq__(self, other):
316 return id(self) == id(other)
318 class leafnode(node):
319 @property
320 def idx(self):
321 return self._node.indices
323 @property
324 def children(self):
325 return self._node.children
327 class innernode(node):
328 def __init__(self, ckdtreenode):
329 assert isinstance(ckdtreenode, cKDTreeNode)
330 super().__init__(ckdtreenode)
331 self.less = KDTree.node._create(ckdtreenode.lesser)
332 self.greater = KDTree.node._create(ckdtreenode.greater)
334 @property
335 def split_dim(self):
336 return self._node.split_dim
338 @property
339 def split(self):
340 return self._node.split
342 @property
343 def children(self):
344 return self._node.children
346 @property
347 def tree(self):
348 if not hasattr(self, "_tree"):
349 self._tree = KDTree.node._create(super().tree)
351 return self._tree
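
    # Illustrative usage sketch for the wrapper classes above: the `tree`
    # property returns the root as a `KDTree.node`; inner nodes expose
    # `less`/`greater` subtrees and leaves expose `idx`, so the structure
    # can be walked recursively, e.g.:
    #
    #     def leaf_sizes(node):
    #         if isinstance(node, KDTree.leafnode):
    #             return [len(node.idx)]
    #         return leaf_sizes(node.less) + leaf_sizes(node.greater)
    #
    #     sizes = leaf_sizes(KDTree(points).tree)  # points: any (n, m) array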

    def __init__(self, data, leafsize=10, compact_nodes=True, copy_data=False,
                 balanced_tree=True, boxsize=None):
        data = np.asarray(data)
        if data.dtype.kind == 'c':
            raise TypeError("KDTree does not work with complex data")

        # Note KDTree has different default leafsize from cKDTree
        super().__init__(data, leafsize, compact_nodes, copy_data,
                         balanced_tree, boxsize)

    def query(
            self, x, k=1, eps=0, p=2, distance_upper_bound=np.inf, workers=1):
        r"""Query the kd-tree for nearest neighbors.

        Parameters
        ----------
        x : array_like, last dimension self.m
            An array of points to query.
        k : int or Sequence[int], optional
            Either the number of nearest neighbors to return, or a list of the
            k-th nearest neighbors to return, starting from 1.
        eps : nonnegative float, optional
            Return approximate nearest neighbors; the kth returned value
            is guaranteed to be no further than (1+eps) times the
            distance to the real kth nearest neighbor.
        p : float, 1<=p<=infinity, optional
            Which Minkowski p-norm to use.
            1 is the sum-of-absolute-values distance ("Manhattan" distance).
            2 is the usual Euclidean distance.
            infinity is the maximum-coordinate-difference distance.
            A large, finite p may cause a ValueError if overflow can occur.
        distance_upper_bound : nonnegative float, optional
            Return only neighbors within this distance. This is used to prune
            tree searches, so if you are doing a series of nearest-neighbor
            queries, it may help to supply the distance to the nearest neighbor
            of the most recent point.
        workers : int, optional
            Number of workers to use for parallel processing. If -1 is given
            all CPU threads are used. Default: 1.

            .. versionadded:: 1.6.0

        Returns
        -------
        d : float or array of floats
            The distances to the nearest neighbors.
            If ``x`` has shape ``tuple+(self.m,)``, then ``d`` has shape
            ``tuple+(k,)``.
            When k == 1, the last dimension of the output is squeezed.
            Missing neighbors are indicated with infinite distances.
            Hits are sorted by distance (nearest first).

            .. versionchanged:: 1.9.0
               Previously if ``k=None``, then `d` was an object array of
               shape ``tuple``, containing lists of distances. This behavior
               has been removed, use `query_ball_point` instead.

        i : integer or array of integers
            The index of each neighbor in ``self.data``.
            ``i`` is the same shape as d.
            Missing neighbors are indicated with ``self.n``.

        Examples
        --------

        >>> import numpy as np
        >>> from scipy.spatial import KDTree
        >>> x, y = np.mgrid[0:5, 2:8]
        >>> tree = KDTree(np.c_[x.ravel(), y.ravel()])

        To query the nearest neighbors and return squeezed result, use

        >>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=1)
        >>> print(dd, ii, sep='\n')
        [2.        0.2236068]
        [ 0 13]

        To query the nearest neighbors and return unsqueezed result, use

        >>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=[1])
        >>> print(dd, ii, sep='\n')
        [[2.       ]
         [0.2236068]]
        [[ 0]
         [13]]

        To query the second nearest neighbors and return unsqueezed result,
        use

        >>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=[2])
        >>> print(dd, ii, sep='\n')
        [[2.23606798]
         [0.80622577]]
        [[ 6]
         [19]]

        To query the first and second nearest neighbors, use

        >>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=2)
        >>> print(dd, ii, sep='\n')
        [[2.         2.23606798]
         [0.2236068  0.80622577]]
        [[ 0  6]
         [13 19]]

        or, be more specific

        >>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=[1, 2])
        >>> print(dd, ii, sep='\n')
        [[2.         2.23606798]
         [0.2236068  0.80622577]]
        [[ 0  6]
         [13 19]]

        """
        x = np.asarray(x)
        if x.dtype.kind == 'c':
            raise TypeError("KDTree does not work with complex data")

        if k is None:
            raise ValueError("k must be an integer or a sequence of integers")

        d, i = super().query(x, k, eps, p, distance_upper_bound, workers)
        if isinstance(i, int):
            i = np.intp(i)
        return d, i

    def query_ball_point(self, x, r, p=2., eps=0, workers=1,
                         return_sorted=None, return_length=False):
        """Find all points within distance r of point(s) x.

        Parameters
        ----------
        x : array_like, shape tuple + (self.m,)
            The point or points to search for neighbors of.
        r : array_like, float
            The radius of points to return, must broadcast to the length of x.
        p : float, optional
            Which Minkowski p-norm to use. Should be in the range [1, inf].
            A finite large p may cause a ValueError if overflow can occur.
        eps : nonnegative float, optional
            Approximate search. Branches of the tree are not explored if their
            nearest points are further than ``r / (1 + eps)``, and branches are
            added in bulk if their furthest points are nearer than
            ``r * (1 + eps)``.
        workers : int, optional
            Number of jobs to schedule for parallel processing. If -1 is given
            all processors are used. Default: 1.

            .. versionadded:: 1.6.0
        return_sorted : bool, optional
            Sorts returned indices if True and does not sort them if False. If
            None, does not sort single point queries, but does sort
            multi-point queries which was the behavior before this option
            was added.

            .. versionadded:: 1.6.0
        return_length : bool, optional
            Return the number of points inside the radius instead of a list
            of the indices.

            .. versionadded:: 1.6.0

        Returns
        -------
        results : list or array of lists
            If `x` is a single point, returns a list of the indices of the
            neighbors of `x`. If `x` is an array of points, returns an object
            array of shape tuple containing lists of neighbors.

        Notes
        -----
        If you have many points whose neighbors you want to find, you may save
        substantial amounts of time by putting them in a KDTree and using
        query_ball_tree.

        Examples
        --------
        >>> import numpy as np
        >>> from scipy import spatial
        >>> x, y = np.mgrid[0:5, 0:5]
        >>> points = np.c_[x.ravel(), y.ravel()]
        >>> tree = spatial.KDTree(points)
        >>> sorted(tree.query_ball_point([2, 0], 1))
        [5, 10, 11, 15]

        Query multiple points and plot the results:

        >>> import matplotlib.pyplot as plt
        >>> points = np.asarray(points)
        >>> plt.plot(points[:,0], points[:,1], '.')
        >>> for results in tree.query_ball_point(([2, 0], [3, 3]), 1):
        ...     nearby_points = points[results]
        ...     plt.plot(nearby_points[:,0], nearby_points[:,1], 'o')
        >>> plt.margins(0.1, 0.1)
        >>> plt.show()
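
        With ``return_length=True`` the same query, for instance, returns
        just the neighbor count:

        >>> tree.query_ball_point([2, 0], 1, return_length=True)
        4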
550 """
551 x = np.asarray(x)
552 if x.dtype.kind == 'c':
553 raise TypeError("KDTree does not work with complex data")
554 return super().query_ball_point(
555 x, r, p, eps, workers, return_sorted, return_length)
557 def query_ball_tree(self, other, r, p=2., eps=0):
558 """
559 Find all pairs of points between `self` and `other` whose distance is
560 at most r.
562 Parameters
563 ----------
564 other : KDTree instance
565 The tree containing points to search against.
566 r : float
567 The maximum distance, has to be positive.
568 p : float, optional
569 Which Minkowski norm to use. `p` has to meet the condition
570 ``1 <= p <= infinity``.
571 eps : float, optional
572 Approximate search. Branches of the tree are not explored
573 if their nearest points are further than ``r/(1+eps)``, and
574 branches are added in bulk if their furthest points are nearer
575 than ``r * (1+eps)``. `eps` has to be non-negative.
577 Returns
578 -------
579 results : list of lists
580 For each element ``self.data[i]`` of this tree, ``results[i]`` is a
581 list of the indices of its neighbors in ``other.data``.
583 Examples
584 --------
585 You can search all pairs of points between two kd-trees within a distance:
587 >>> import matplotlib.pyplot as plt
588 >>> import numpy as np
589 >>> from scipy.spatial import KDTree
590 >>> rng = np.random.default_rng()
591 >>> points1 = rng.random((15, 2))
592 >>> points2 = rng.random((15, 2))
593 >>> plt.figure(figsize=(6, 6))
594 >>> plt.plot(points1[:, 0], points1[:, 1], "xk", markersize=14)
595 >>> plt.plot(points2[:, 0], points2[:, 1], "og", markersize=14)
596 >>> kd_tree1 = KDTree(points1)
597 >>> kd_tree2 = KDTree(points2)
598 >>> indexes = kd_tree1.query_ball_tree(kd_tree2, r=0.2)
599 >>> for i in range(len(indexes)):
600 ... for j in indexes[i]:
601 ... plt.plot([points1[i, 0], points2[j, 0]],
602 ... [points1[i, 1], points2[j, 1]], "-r")
603 >>> plt.show()
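
        A small deterministic case, for illustration: the first point of
        the first tree has one neighbor in the second tree, the other
        has none:

        >>> tree_a = KDTree([[0., 0.], [1., 1.]])
        >>> tree_b = KDTree([[0., 0.1], [2., 2.]])
        >>> tree_a.query_ball_tree(tree_b, r=0.5)
        [[0], []]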
605 """
606 return super().query_ball_tree(other, r, p, eps)
608 def query_pairs(self, r, p=2., eps=0, output_type='set'):
609 """Find all pairs of points in `self` whose distance is at most r.
611 Parameters
612 ----------
613 r : positive float
614 The maximum distance.
615 p : float, optional
616 Which Minkowski norm to use. `p` has to meet the condition
617 ``1 <= p <= infinity``.
618 eps : float, optional
619 Approximate search. Branches of the tree are not explored
620 if their nearest points are further than ``r/(1+eps)``, and
621 branches are added in bulk if their furthest points are nearer
622 than ``r * (1+eps)``. `eps` has to be non-negative.
623 output_type : string, optional
624 Choose the output container, 'set' or 'ndarray'. Default: 'set'
626 .. versionadded:: 1.6.0
628 Returns
629 -------
630 results : set or ndarray
631 Set of pairs ``(i,j)``, with ``i < j``, for which the corresponding
632 positions are close. If output_type is 'ndarray', an ndarry is
633 returned instead of a set.
635 Examples
636 --------
637 You can search all pairs of points in a kd-tree within a distance:
639 >>> import matplotlib.pyplot as plt
640 >>> import numpy as np
641 >>> from scipy.spatial import KDTree
642 >>> rng = np.random.default_rng()
643 >>> points = rng.random((20, 2))
644 >>> plt.figure(figsize=(6, 6))
645 >>> plt.plot(points[:, 0], points[:, 1], "xk", markersize=14)
646 >>> kd_tree = KDTree(points)
647 >>> pairs = kd_tree.query_pairs(r=0.2)
648 >>> for (i, j) in pairs:
649 ... plt.plot([points[i, 0], points[j, 0]],
650 ... [points[i, 1], points[j, 1]], "-r")
651 >>> plt.show()
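
        With ``output_type='ndarray'`` the pairs come back as an array of
        index rows instead; a small deterministic case, for illustration:

        >>> tiny = KDTree([[0., 0.], [0., 1.], [2., 0.]])
        >>> tiny.query_pairs(r=1.5, output_type='ndarray')
        array([[0, 1]])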
653 """
654 return super().query_pairs(r, p, eps, output_type)
656 def count_neighbors(self, other, r, p=2., weights=None, cumulative=True):
657 """Count how many nearby pairs can be formed.
659 Count the number of pairs ``(x1,x2)`` can be formed, with ``x1`` drawn
660 from ``self`` and ``x2`` drawn from ``other``, and where
661 ``distance(x1, x2, p) <= r``.
663 Data points on ``self`` and ``other`` are optionally weighted by the
664 ``weights`` argument. (See below)
666 This is adapted from the "two-point correlation" algorithm described by
667 Gray and Moore [1]_. See notes for further discussion.
669 Parameters
670 ----------
671 other : KDTree
672 The other tree to draw points from, can be the same tree as self.
673 r : float or one-dimensional array of floats
674 The radius to produce a count for. Multiple radii are searched with
675 a single tree traversal.
676 If the count is non-cumulative(``cumulative=False``), ``r`` defines
677 the edges of the bins, and must be non-decreasing.
678 p : float, optional
679 1<=p<=infinity.
680 Which Minkowski p-norm to use.
681 Default 2.0.
682 A finite large p may cause a ValueError if overflow can occur.
683 weights : tuple, array_like, or None, optional
684 If None, the pair-counting is unweighted.
685 If given as a tuple, weights[0] is the weights of points in
686 ``self``, and weights[1] is the weights of points in ``other``;
687 either can be None to indicate the points are unweighted.
688 If given as an array_like, weights is the weights of points in
689 ``self`` and ``other``. For this to make sense, ``self`` and
690 ``other`` must be the same tree. If ``self`` and ``other`` are two
691 different trees, a ``ValueError`` is raised.
692 Default: None
694 .. versionadded:: 1.6.0
695 cumulative : bool, optional
696 Whether the returned counts are cumulative. When cumulative is set
697 to ``False`` the algorithm is optimized to work with a large number
698 of bins (>10) specified by ``r``. When ``cumulative`` is set to
699 True, the algorithm is optimized to work with a small number of
700 ``r``. Default: True
702 .. versionadded:: 1.6.0
704 Returns
705 -------
706 result : scalar or 1-D array
707 The number of pairs. For unweighted counts, the result is integer.
708 For weighted counts, the result is float.
709 If cumulative is False, ``result[i]`` contains the counts with
710 ``(-inf if i == 0 else r[i-1]) < R <= r[i]``
712 Notes
713 -----
714 Pair-counting is the basic operation used to calculate the two point
715 correlation functions from a data set composed of position of objects.
717 Two point correlation function measures the clustering of objects and
718 is widely used in cosmology to quantify the large scale structure
719 in our Universe, but it may be useful for data analysis in other fields
720 where self-similar assembly of objects also occur.
722 The Landy-Szalay estimator for the two point correlation function of
723 ``D`` measures the clustering signal in ``D``. [2]_
725 For example, given the position of two sets of objects,
727 - objects ``D`` (data) contains the clustering signal, and
729 - objects ``R`` (random) that contains no signal,
731 .. math::
733 \\xi(r) = \\frac{<D, D> - 2 f <D, R> + f^2<R, R>}{f^2<R, R>},
735 where the brackets represents counting pairs between two data sets
736 in a finite bin around ``r`` (distance), corresponding to setting
737 `cumulative=False`, and ``f = float(len(D)) / float(len(R))`` is the
738 ratio between number of objects from data and random.
740 The algorithm implemented here is loosely based on the dual-tree
741 algorithm described in [1]_. We switch between two different
742 pair-cumulation scheme depending on the setting of ``cumulative``.
743 The computing time of the method we use when for
744 ``cumulative == False`` does not scale with the total number of bins.
745 The algorithm for ``cumulative == True`` scales linearly with the
746 number of bins, though it is slightly faster when only
747 1 or 2 bins are used. [5]_.
749 As an extension to the naive pair-counting,
750 weighted pair-counting counts the product of weights instead
751 of number of pairs.
752 Weighted pair-counting is used to estimate marked correlation functions
753 ([3]_, section 2.2),
754 or to properly calculate the average of data per distance bin
755 (e.g. [4]_, section 2.1 on redshift).
757 .. [1] Gray and Moore,
758 "N-body problems in statistical learning",
759 Mining the sky, 2000,
760 https://arxiv.org/abs/astro-ph/0012333
762 .. [2] Landy and Szalay,
763 "Bias and variance of angular correlation functions",
764 The Astrophysical Journal, 1993,
765 http://adsabs.harvard.edu/abs/1993ApJ...412...64L
767 .. [3] Sheth, Connolly and Skibba,
768 "Marked correlations in galaxy formation models",
769 Arxiv e-print, 2005,
770 https://arxiv.org/abs/astro-ph/0511773
772 .. [4] Hawkins, et al.,
773 "The 2dF Galaxy Redshift Survey: correlation functions,
774 peculiar velocities and the matter density of the Universe",
775 Monthly Notices of the Royal Astronomical Society, 2002,
776 http://adsabs.harvard.edu/abs/2003MNRAS.346...78H
778 .. [5] https://github.com/scipy/scipy/pull/5647#issuecomment-168474926
780 Examples
781 --------
782 You can count neighbors number between two kd-trees within a distance:
784 >>> import numpy as np
785 >>> from scipy.spatial import KDTree
786 >>> rng = np.random.default_rng()
787 >>> points1 = rng.random((5, 2))
788 >>> points2 = rng.random((5, 2))
789 >>> kd_tree1 = KDTree(points1)
790 >>> kd_tree2 = KDTree(points2)
791 >>> kd_tree1.count_neighbors(kd_tree2, 0.2)
792 1
794 This number is same as the total pair number calculated by
795 `query_ball_tree`:
797 >>> indexes = kd_tree1.query_ball_tree(kd_tree2, r=0.2)
798 >>> sum([len(i) for i in indexes])
799 1
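
        A deterministic check of the non-cumulative binning, for
        illustration: three points whose pairwise distances are 1, 1 and
        sqrt(2), counting ordered pairs (including self-pairs at distance
        0), give 7 pairs in the bin ``(-inf, 1.0]`` and 2 in
        ``(1.0, 1.5]``:

        >>> tree = KDTree([[0., 0.], [0., 1.], [1., 0.]])
        >>> tree.count_neighbors(tree, r=[1.0, 1.5], cumulative=False)
        array([7, 2])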
801 """
802 return super().count_neighbors(other, r, p, weights, cumulative)
804 def sparse_distance_matrix(
805 self, other, max_distance, p=2., output_type='dok_matrix'):
806 """Compute a sparse distance matrix.
808 Computes a distance matrix between two KDTrees, leaving as zero
809 any distance greater than max_distance.
811 Parameters
812 ----------
813 other : KDTree
815 max_distance : positive float
817 p : float, 1<=p<=infinity
818 Which Minkowski p-norm to use.
819 A finite large p may cause a ValueError if overflow can occur.
821 output_type : string, optional
822 Which container to use for output data. Options: 'dok_matrix',
823 'coo_matrix', 'dict', or 'ndarray'. Default: 'dok_matrix'.
825 .. versionadded:: 1.6.0
827 Returns
828 -------
829 result : dok_matrix, coo_matrix, dict or ndarray
830 Sparse matrix representing the results in "dictionary of keys"
831 format. If a dict is returned the keys are (i,j) tuples of indices.
832 If output_type is 'ndarray' a record array with fields 'i', 'j',
833 and 'v' is returned,
835 Examples
836 --------
837 You can compute a sparse distance matrix between two kd-trees:
839 >>> import numpy as np
840 >>> from scipy.spatial import KDTree
841 >>> rng = np.random.default_rng()
842 >>> points1 = rng.random((5, 2))
843 >>> points2 = rng.random((5, 2))
844 >>> kd_tree1 = KDTree(points1)
845 >>> kd_tree2 = KDTree(points2)
846 >>> sdm = kd_tree1.sparse_distance_matrix(kd_tree2, 0.3)
847 >>> sdm.toarray()
848 array([[0. , 0. , 0.12295571, 0. , 0. ],
849 [0. , 0. , 0. , 0. , 0. ],
850 [0.28942611, 0. , 0. , 0.2333084 , 0. ],
851 [0. , 0. , 0. , 0. , 0. ],
852 [0.24617575, 0.29571802, 0.26836782, 0. , 0. ]])
854 You can check distances above the `max_distance` are zeros:
856 >>> from scipy.spatial import distance_matrix
857 >>> distance_matrix(points1, points2)
858 array([[0.56906522, 0.39923701, 0.12295571, 0.8658745 , 0.79428925],
859 [0.37327919, 0.7225693 , 0.87665969, 0.32580855, 0.75679479],
860 [0.28942611, 0.30088013, 0.6395831 , 0.2333084 , 0.33630734],
861 [0.31994999, 0.72658602, 0.71124834, 0.55396483, 0.90785663],
862 [0.24617575, 0.29571802, 0.26836782, 0.57714465, 0.6473269 ]])
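
        A small deterministic case with ``output_type='dict'``, for
        illustration; only the one pair within `max_distance` appears:

        >>> tiny1 = KDTree([[0., 0.], [1., 0.]])
        >>> tiny2 = KDTree([[0., 0.5]])
        >>> tiny1.sparse_distance_matrix(tiny2, 1.0, output_type='dict')
        {(0, 0): 0.5}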
864 """
865 return super().sparse_distance_matrix(
866 other, max_distance, p, output_type)
869def distance_matrix(x, y, p=2, threshold=1000000):
870 """Compute the distance matrix.
872 Returns the matrix of all pair-wise distances.
874 Parameters
875 ----------
876 x : (M, K) array_like
877 Matrix of M vectors in K dimensions.
878 y : (N, K) array_like
879 Matrix of N vectors in K dimensions.
880 p : float, 1 <= p <= infinity
881 Which Minkowski p-norm to use.
882 threshold : positive int
883 If ``M * N * K`` > `threshold`, algorithm uses a Python loop instead
884 of large temporary arrays.
886 Returns
887 -------
888 result : (M, N) ndarray
889 Matrix containing the distance from every vector in `x` to every vector
890 in `y`.
892 Examples
893 --------
894 >>> from scipy.spatial import distance_matrix
895 >>> distance_matrix([[0,0],[0,1]], [[1,0],[1,1]])
896 array([[ 1. , 1.41421356],
897 [ 1.41421356, 1. ]])
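
    The `threshold` parameter only changes how the result is computed, not
    its value; forcing the loop-based path, for instance, gives the same
    matrix:

    >>> distance_matrix([[0,0],[0,1]], [[1,0],[1,1]], threshold=1)
    array([[ 1.        ,  1.41421356],
           [ 1.41421356,  1.        ]])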
899 """
901 x = np.asarray(x)
902 m, k = x.shape
903 y = np.asarray(y)
904 n, kk = y.shape
906 if k != kk:
907 raise ValueError(f"x contains {k}-dimensional vectors but y contains "
908 f"{kk}-dimensional vectors")
910 if m*n*k <= threshold:
911 return minkowski_distance(x[:,np.newaxis,:],y[np.newaxis,:,:],p)
912 else:
913 result = np.empty((m,n),dtype=float) # FIXME: figure out the best dtype
914 if m < n:
915 for i in range(m):
916 result[i,:] = minkowski_distance(x[i],y,p)
917 else:
918 for j in range(n):
919 result[:,j] = minkowski_distance(x,y[j],p)
920 return result