#-------------------------------------------------------------------------------
#
#  Define classes for (uni/multi)-variate kernel density estimation.
#
#  Currently, only Gaussian kernels are implemented.
#
#  Written by: Robert Kern
#
#  Date: 2004-08-09
#
#  Modified: 2005-02-10 by Robert Kern.
#              Contributed to SciPy
#            2005-10-07 by Robert Kern.
#              Some fixes to match the new scipy_core
#
#  Copyright 2004-2005 by Enthought, Inc.
#
#-------------------------------------------------------------------------------

# Standard library imports.
import warnings

# SciPy imports.
from scipy import linalg, special
from scipy._lib._util import check_random_state

from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, exp, pi,
                   sqrt, ravel, power, atleast_1d, squeeze, sum, transpose,
                   ones, cov)
import numpy as np

# Local imports.
from . import _mvn
from ._stats import gaussian_kernel_estimate, gaussian_kernel_estimate_log

__all__ = ['gaussian_kde']


class gaussian_kde:
    """Representation of a kernel-density estimate using Gaussian kernels.

    Kernel density estimation is a way to estimate the probability density
    function (PDF) of a random variable in a non-parametric way.
    `gaussian_kde` works for both uni-variate and multi-variate data. It
    includes automatic bandwidth determination. The estimation works best for
    a unimodal distribution; bimodal or multi-modal distributions tend to be
    oversmoothed.

    Parameters
    ----------
    dataset : array_like
        Datapoints to estimate from. In case of univariate data this is a 1-D
        array, otherwise a 2-D array with shape (# of dims, # of data).
    bw_method : str, scalar or callable, optional
        The method used to calculate the estimator bandwidth. This can be
        'scott', 'silverman', a scalar constant or a callable. If a scalar,
        this will be used directly as `kde.factor`. If a callable, it should
        take a `gaussian_kde` instance as its only parameter and return a
        scalar. If None (default), 'scott' is used. See Notes for more
        details.
    weights : array_like, optional
        Weights of datapoints. This must be the same shape as dataset.
        If None (default), the samples are assumed to be equally weighted.

    Attributes
    ----------
    dataset : ndarray
        The dataset with which `gaussian_kde` was initialized.
    d : int
        Number of dimensions.
    n : int
        Number of datapoints.
    neff : int
        Effective number of datapoints.

        .. versionadded:: 1.2.0
    factor : float
        The bandwidth factor, obtained from `kde.covariance_factor`. The square
        of `kde.factor` multiplies the covariance matrix of the data in the kde
        estimation.
    covariance : ndarray
        The covariance matrix of `dataset`, scaled by the calculated bandwidth
        (`kde.factor`).
    inv_cov : ndarray
        The inverse of `covariance`.

    Methods
    -------
    evaluate
    __call__
    integrate_gaussian
    integrate_box_1d
    integrate_box
    integrate_kde
    pdf
    logpdf
    resample
    set_bandwidth
    covariance_factor

    Notes
    -----
    Bandwidth selection strongly influences the estimate obtained from the KDE
    (much more so than the actual shape of the kernel). Bandwidth selection
    can be done by a "rule of thumb", by cross-validation, by "plug-in
    methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
    uses a rule of thumb; the default is Scott's Rule.

    Scott's Rule [1]_, implemented as `scotts_factor`, is::

        n**(-1./(d+4)),

    with ``n`` the number of data points and ``d`` the number of dimensions.
    In the case of unequally weighted points, `scotts_factor` becomes::

        neff**(-1./(d+4)),

    with ``neff`` the effective number of datapoints.

    Silverman's Rule [2]_, implemented as `silverman_factor`, is::

        (n * (d + 2) / 4.)**(-1. / (d + 4)).

    or in the case of unequally weighted points::

        (neff * (d + 2) / 4.)**(-1. / (d + 4)).
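
    A quick sanity check of these rules (a minimal doctest sketch with
    made-up data, assuming equal weights so that ``neff == n``):

    >>> import numpy as np
    >>> from scipy import stats
    >>> kde = stats.gaussian_kde(np.arange(10.))
    >>> bool(np.isclose(kde.factor, 10**(-1./(1 + 4))))  # Scott's Rule
    True
    >>> kde.set_bandwidth('silverman')
    >>> bool(np.isclose(kde.factor, (10 * (1 + 2) / 4.)**(-1./(1 + 4))))
    True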

    Good general descriptions of kernel density estimation can be found in [1]_
    and [2]_; the mathematics for this multi-dimensional implementation can be
    found in [1]_.

    With a set of weighted samples, the effective number of datapoints ``neff``
    is defined by::

        neff = sum(weights)^2 / sum(weights^2)

    as detailed in [5]_.
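
    For example, with hypothetical weights ``[1, 2, 3]`` (a minimal sketch;
    `gaussian_kde` normalizes the weights internally):

    >>> import numpy as np
    >>> from scipy import stats
    >>> w = np.array([1., 2., 3.])
    >>> kde = stats.gaussian_kde(np.array([0., 1., 2.]), weights=w)
    >>> bool(np.isclose(kde.neff, w.sum()**2 / (w**2).sum()))
    True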

    `gaussian_kde` does not currently support data that lies in a
    lower-dimensional subspace of the space in which it is expressed. For such
    data, consider performing principal component analysis / dimensionality
    reduction and using `gaussian_kde` with the transformed data.

    References
    ----------
    .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
           Visualization", John Wiley & Sons, New York, Chicester, 1992.
    .. [2] B.W. Silverman, "Density Estimation for Statistics and Data
           Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
           Chapman and Hall, London, 1986.
    .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
           Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
    .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
           conditional density estimation", Computational Statistics & Data
           Analysis, Vol. 36, pp. 279-298, 2001.
    .. [5] P.G. Gray, Journal of the Royal Statistical Society, Series A
           (General), Vol. 132, p. 272, 1969.

    Examples
    --------
    Generate some random two-dimensional data:

    >>> import numpy as np
    >>> from scipy import stats
    >>> def measure(n):
    ...     "Measurement model, return two coupled measurements."
    ...     m1 = np.random.normal(size=n)
    ...     m2 = np.random.normal(scale=0.5, size=n)
    ...     return m1+m2, m1-m2

    >>> m1, m2 = measure(2000)
    >>> xmin = m1.min()
    >>> xmax = m1.max()
    >>> ymin = m2.min()
    >>> ymax = m2.max()

    Perform a kernel density estimate on the data:

    >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    >>> positions = np.vstack([X.ravel(), Y.ravel()])
    >>> values = np.vstack([m1, m2])
    >>> kernel = stats.gaussian_kde(values)
    >>> Z = np.reshape(kernel(positions).T, X.shape)

    Plot the results:

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots()
    >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
    ...           extent=[xmin, xmax, ymin, ymax])
    >>> ax.plot(m1, m2, 'k.', markersize=2)
    >>> ax.set_xlim([xmin, xmax])
    >>> ax.set_ylim([ymin, ymax])
    >>> plt.show()

    """

    def __init__(self, dataset, bw_method=None, weights=None):
        self.dataset = atleast_2d(asarray(dataset))
        if not self.dataset.size > 1:
            raise ValueError("`dataset` input should have multiple elements.")

        self.d, self.n = self.dataset.shape

        if weights is not None:
            self._weights = atleast_1d(weights).astype(float)
            self._weights /= sum(self._weights)
            if self.weights.ndim != 1:
                raise ValueError("`weights` input should be one-dimensional.")
            if len(self._weights) != self.n:
                raise ValueError("`weights` input should be of length n")
            self._neff = 1/sum(self._weights**2)

        # This can be converted to a warning once gh-10205 is resolved
        if self.d > self.n:
            msg = ("Number of dimensions is greater than number of samples. "
                   "This results in a singular data covariance matrix, which "
                   "cannot be treated using the algorithms implemented in "
                   "`gaussian_kde`. Note that `gaussian_kde` interprets each "
                   "*column* of `dataset` to be a point; consider transposing "
                   "the input to `dataset`.")
            raise ValueError(msg)

        try:
            self.set_bandwidth(bw_method=bw_method)
        except linalg.LinAlgError as e:
            msg = ("The data appears to lie in a lower-dimensional subspace "
                   "of the space in which it is expressed. This has resulted "
                   "in a singular data covariance matrix, which cannot be "
                   "treated using the algorithms implemented in "
                   "`gaussian_kde`. Consider performing principal component "
                   "analysis / dimensionality reduction and using "
                   "`gaussian_kde` with the transformed data.")
            raise linalg.LinAlgError(msg) from e

    def evaluate(self, points):
        """Evaluate the estimated pdf on a set of points.

        Parameters
        ----------
        points : (# of dimensions, # of points)-array
            Alternatively, a (# of dimensions,) vector can be passed in and
            treated as a single point.

        Returns
        -------
        values : (# of points,)-array
            The values at each point.

        Raises
        ------
        ValueError
            If the dimensionality of the input points is different from the
            dimensionality of the KDE.
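
        Examples
        --------
        A minimal sketch with made-up data (three points on a line; the
        output has one entry per evaluation point):

        >>> import numpy as np
        >>> from scipy import stats
        >>> kde = stats.gaussian_kde(np.array([-1., 0., 1.]))
        >>> kde.evaluate(np.array([0., 2.])).shape
        (2,)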
253 """
254 points = atleast_2d(asarray(points))
256 d, m = points.shape
257 if d != self.d:
258 if d == 1 and m == self.d:
259 # points was passed in as a row vector
260 points = reshape(points, (self.d, 1))
261 m = 1
262 else:
263 msg = "points have dimension %s, dataset has dimension %s" % (d,
264 self.d)
265 raise ValueError(msg)
267 output_dtype, spec = _get_output_dtype(self.covariance, points)
268 result = gaussian_kernel_estimate[spec](
269 self.dataset.T, self.weights[:, None],
270 points.T, self.cho_cov, output_dtype)
272 return result[:, 0]
274 __call__ = evaluate

    def integrate_gaussian(self, mean, cov):
        """
        Multiply estimated density by a multivariate Gaussian and integrate
        over the whole space.

        Parameters
        ----------
        mean : array_like
            A 1-D array, specifying the mean of the Gaussian.
        cov : array_like
            A 2-D array, specifying the covariance matrix of the Gaussian.

        Returns
        -------
        result : scalar
            The value of the integral.

        Raises
        ------
        ValueError
            If the mean or covariance of the input Gaussian differs from
            the KDE's dimensionality.
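
        Examples
        --------
        A sketch of a consistency check against direct numerical quadrature,
        using made-up data and an arbitrary Gaussian (mean 0.5, variance 1):

        >>> import numpy as np
        >>> from scipy import stats, integrate
        >>> kde = stats.gaussian_kde(np.array([-1., 0., 1.]))
        >>> val = kde.integrate_gaussian(mean=[0.5], cov=[[1.0]])
        >>> quad_val, _ = integrate.quad(
        ...     lambda x: kde(x)[0] * stats.norm.pdf(x, loc=0.5, scale=1.0),
        ...     -np.inf, np.inf)
        >>> bool(np.isclose(val, quad_val))
        True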
299 """
300 mean = atleast_1d(squeeze(mean))
301 cov = atleast_2d(cov)
303 if mean.shape != (self.d,):
304 raise ValueError("mean does not have dimension %s" % self.d)
305 if cov.shape != (self.d, self.d):
306 raise ValueError("covariance does not have dimension %s" % self.d)
308 # make mean a column vector
309 mean = mean[:, newaxis]
311 sum_cov = self.covariance + cov
313 # This will raise LinAlgError if the new cov matrix is not s.p.d
314 # cho_factor returns (ndarray, bool) where bool is a flag for whether
315 # or not ndarray is upper or lower triangular
316 sum_cov_chol = linalg.cho_factor(sum_cov)
318 diff = self.dataset - mean
319 tdiff = linalg.cho_solve(sum_cov_chol, diff)
321 sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
322 norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
324 energies = sum(diff * tdiff, axis=0) / 2.0
325 result = sum(exp(-energies)*self.weights, axis=0) / norm_const
327 return result

    def integrate_box_1d(self, low, high):
        """
        Computes the integral of a 1D pdf between two bounds.

        Parameters
        ----------
        low : scalar
            Lower bound of integration.
        high : scalar
            Upper bound of integration.

        Returns
        -------
        value : scalar
            The result of the integral.

        Raises
        ------
        ValueError
            If the KDE is over more than one dimension.
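
        Examples
        --------
        A minimal sketch with made-up data; integrating over the whole real
        line should recover the total probability mass:

        >>> import numpy as np
        >>> from scipy import stats
        >>> kde = stats.gaussian_kde(np.array([-1., 0., 1.]))
        >>> round(float(kde.integrate_box_1d(-np.inf, np.inf)), 8)
        1.0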
350 """
351 if self.d != 1:
352 raise ValueError("integrate_box_1d() only handles 1D pdfs")
354 stdev = ravel(sqrt(self.covariance))[0]
356 normalized_low = ravel((low - self.dataset) / stdev)
357 normalized_high = ravel((high - self.dataset) / stdev)
359 value = np.sum(self.weights*(
360 special.ndtr(normalized_high) -
361 special.ndtr(normalized_low)))
362 return value

    def integrate_box(self, low_bounds, high_bounds, maxpts=None):
        """Computes the integral of a pdf over a rectangular interval.

        Parameters
        ----------
        low_bounds : array_like
            A 1-D array containing the lower bounds of integration.
        high_bounds : array_like
            A 1-D array containing the upper bounds of integration.
        maxpts : int, optional
            The maximum number of points to use for integration.

        Returns
        -------
        value : scalar
            The result of the integral.
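
        Examples
        --------
        A rough sketch with made-up 2-D data; a box wide enough to hold
        essentially all of the mass should integrate to about 1:

        >>> import numpy as np
        >>> from scipy import stats
        >>> values = np.array([[-1., 0., 1.], [1., -1., 0.]])
        >>> kde = stats.gaussian_kde(values)
        >>> v = kde.integrate_box([-10, -10], [10, 10])
        >>> print(f"{v:.3f}")
        1.000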
381 """
382 if maxpts is not None:
383 extra_kwds = {'maxpts': maxpts}
384 else:
385 extra_kwds = {}
387 value, inform = _mvn.mvnun_weighted(low_bounds, high_bounds,
388 self.dataset, self.weights,
389 self.covariance, **extra_kwds)
390 if inform:
391 msg = ('An integral in _mvn.mvnun requires more points than %s' %
392 (self.d * 1000))
393 warnings.warn(msg)
395 return value

    def integrate_kde(self, other):
        """
        Computes the integral of the product of this kernel density estimate
        with another.

        Parameters
        ----------
        other : gaussian_kde instance
            The other kde.

        Returns
        -------
        value : scalar
            The result of the integral.

        Raises
        ------
        ValueError
            If the KDEs have different dimensionality.
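
        Examples
        --------
        A minimal sketch with made-up data; the integral is symmetric in the
        two estimates, which gives a cheap consistency check:

        >>> import numpy as np
        >>> from scipy import stats
        >>> kde1 = stats.gaussian_kde(np.array([-1., 0., 1.]))
        >>> kde2 = stats.gaussian_kde(np.array([0., 1., 2., 3.]))
        >>> bool(np.isclose(kde1.integrate_kde(kde2),
        ...                 kde2.integrate_kde(kde1)))
        True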
417 """
418 if other.d != self.d:
419 raise ValueError("KDEs are not the same dimensionality")
421 # we want to iterate over the smallest number of points
422 if other.n < self.n:
423 small = other
424 large = self
425 else:
426 small = self
427 large = other
429 sum_cov = small.covariance + large.covariance
430 sum_cov_chol = linalg.cho_factor(sum_cov)
431 result = 0.0
432 for i in range(small.n):
433 mean = small.dataset[:, i, newaxis]
434 diff = large.dataset - mean
435 tdiff = linalg.cho_solve(sum_cov_chol, diff)
437 energies = sum(diff * tdiff, axis=0) / 2.0
438 result += sum(exp(-energies)*large.weights, axis=0)*small.weights[i]
440 sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
441 norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
443 result /= norm_const
445 return result

    def resample(self, size=None, seed=None):
        """Randomly sample a dataset from the estimated pdf.

        Parameters
        ----------
        size : int, optional
            The number of samples to draw. If not provided, then the size is
            the same as the effective number of samples in the underlying
            dataset.
        seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
            If `seed` is None (or `np.random`), the `numpy.random.RandomState`
            singleton is used.
            If `seed` is an int, a new ``RandomState`` instance is used,
            seeded with `seed`.
            If `seed` is already a ``Generator`` or ``RandomState`` instance
            then that instance is used.

        Returns
        -------
        resample : (self.d, `size`) ndarray
            The sampled dataset.
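
        Examples
        --------
        A minimal sketch with made-up data and a seeded generator for
        reproducibility; the result always has shape ``(d, size)``:

        >>> import numpy as np
        >>> from scipy import stats
        >>> kde = stats.gaussian_kde(np.array([-1., 0., 1.]))
        >>> kde.resample(size=5, seed=np.random.default_rng(1234)).shape
        (1, 5)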
469 """
470 if size is None:
471 size = int(self.neff)
473 random_state = check_random_state(seed)
474 norm = transpose(random_state.multivariate_normal(
475 zeros((self.d,), float), self.covariance, size=size
476 ))
477 indices = random_state.choice(self.n, size=size, p=self.weights)
478 means = self.dataset[:, indices]
480 return means + norm

    def scotts_factor(self):
        """Compute Scott's factor.

        Returns
        -------
        s : float
            Scott's factor.
        """
        return power(self.neff, -1./(self.d+4))

    def silverman_factor(self):
        """Compute the Silverman factor.

        Returns
        -------
        s : float
            The Silverman factor.
        """
        return power(self.neff*(self.d+2.0)/4.0, -1./(self.d+4))

    # Default method to calculate bandwidth; can be overridden by a subclass
    covariance_factor = scotts_factor
    covariance_factor.__doc__ = """Computes the coefficient (`kde.factor`) that
        multiplies the data covariance matrix to obtain the kernel covariance
        matrix. The default is `scotts_factor`. A subclass can override this
        method to provide a different method, or set it through a call to
        `kde.set_bandwidth`."""

    def set_bandwidth(self, bw_method=None):
        """Compute the estimator bandwidth with given method.

        The new bandwidth calculated after a call to `set_bandwidth` is used
        for subsequent evaluations of the estimated density.

        Parameters
        ----------
        bw_method : str, scalar or callable, optional
            The method used to calculate the estimator bandwidth. This can be
            'scott', 'silverman', a scalar constant or a callable. If a
            scalar, this will be used directly as `kde.factor`. If a callable,
            it should take a `gaussian_kde` instance as its only parameter and
            return a scalar. If None (default), nothing happens; the current
            `kde.covariance_factor` method is kept.

        Notes
        -----
        .. versionadded:: 0.11

        Examples
        --------
        >>> import numpy as np
        >>> import scipy.stats as stats
        >>> x1 = np.array([-7, -5, 1, 4, 5.])
        >>> kde = stats.gaussian_kde(x1)
        >>> xs = np.linspace(-10, 10, num=50)
        >>> y1 = kde(xs)
        >>> kde.set_bandwidth(bw_method='silverman')
        >>> y2 = kde(xs)
        >>> kde.set_bandwidth(bw_method=kde.factor / 3.)
        >>> y3 = kde(xs)

        >>> import matplotlib.pyplot as plt
        >>> fig, ax = plt.subplots()
        >>> ax.plot(x1, np.full(x1.shape, 1 / (4. * x1.size)), 'bo',
        ...         label='Data points (rescaled)')
        >>> ax.plot(xs, y1, label='Scott (default)')
        >>> ax.plot(xs, y2, label='Silverman')
        >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
        >>> ax.legend()
        >>> plt.show()

        """
        if bw_method is None:
            pass
        elif bw_method == 'scott':
            self.covariance_factor = self.scotts_factor
        elif bw_method == 'silverman':
            self.covariance_factor = self.silverman_factor
        elif np.isscalar(bw_method) and not isinstance(bw_method, str):
            self._bw_method = 'use constant'
            self.covariance_factor = lambda: bw_method
        elif callable(bw_method):
            self._bw_method = bw_method
            self.covariance_factor = lambda: self._bw_method(self)
        else:
            msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
                  "or a callable."
            raise ValueError(msg)

        self._compute_covariance()

    def _compute_covariance(self):
        """Computes the covariance matrix for each Gaussian kernel using
        covariance_factor().
        """
        self.factor = self.covariance_factor()
        # Cache covariance and Cholesky decomp of covariance
        if not hasattr(self, '_data_cho_cov'):
            self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
                                                   bias=False,
                                                   aweights=self.weights))
            self._data_cho_cov = linalg.cholesky(self._data_covariance,
                                                 lower=True)

        self.covariance = self._data_covariance * self.factor**2
        self.cho_cov = (self._data_cho_cov * self.factor).astype(np.float64)
        self.log_det = 2*np.log(np.diag(self.cho_cov
                                        * np.sqrt(2*pi))).sum()

    @property
    def inv_cov(self):
        # Re-compute from scratch each time because I'm not sure how this is
        # used in the wild. (Perhaps users change the `dataset`, since it's
        # not a private attribute?) `_compute_covariance` used to recalculate
        # all these, so we'll recalculate everything now that this is a
        # property.
        self.factor = self.covariance_factor()
        self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
                                               bias=False,
                                               aweights=self.weights))
        return linalg.inv(self._data_covariance) / self.factor**2

    def pdf(self, x):
        """
        Evaluate the estimated pdf on a provided set of points.

        Notes
        -----
        This is an alias for `gaussian_kde.evaluate`. See the ``evaluate``
        docstring for more details.

        """
        return self.evaluate(x)

    def logpdf(self, x):
        """
        Evaluate the log of the estimated pdf on a provided set of points.
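
        A quick sketch of the expected relationship with `pdf`, using made-up
        data (agreement is up to floating-point error):

        >>> import numpy as np
        >>> from scipy import stats
        >>> kde = stats.gaussian_kde(np.array([-1., 0., 1.]))
        >>> xs = np.linspace(-2, 2, 5)
        >>> bool(np.allclose(kde.logpdf(xs), np.log(kde.pdf(xs))))
        True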
618 """
619 points = atleast_2d(x)
621 d, m = points.shape
622 if d != self.d:
623 if d == 1 and m == self.d:
624 # points was passed in as a row vector
625 points = reshape(points, (self.d, 1))
626 m = 1
627 else:
628 msg = (f"points have dimension {d}, "
629 f"dataset has dimension {self.d}")
630 raise ValueError(msg)
632 output_dtype, spec = _get_output_dtype(self.covariance, points)
633 result = gaussian_kernel_estimate_log[spec](
634 self.dataset.T, self.weights[:, None],
635 points.T, self.cho_cov, output_dtype)
637 return result[:, 0]

    def marginal(self, dimensions):
        """Return a marginal KDE distribution.

        Parameters
        ----------
        dimensions : int or 1-d array_like
            The dimensions of the multivariate distribution corresponding
            to the marginal variables, that is, the indices of the dimensions
            that are being retained. The other dimensions are marginalized out.

        Returns
        -------
        marginal_kde : gaussian_kde
            An object representing the marginal distribution.

        Notes
        -----
        .. versionadded:: 1.10.0
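
        Examples
        --------
        A minimal sketch with made-up 2-D data; the marginal over one
        dimension is itself a normalized 1-D estimate:

        >>> import numpy as np
        >>> from scipy import stats
        >>> values = np.array([[-1., 0., 1.], [1., -1., 0.]])
        >>> kde = stats.gaussian_kde(values)
        >>> marg = kde.marginal(0)
        >>> marg.d
        1
        >>> round(float(marg.integrate_box_1d(-np.inf, np.inf)), 8)
        1.0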
658 """
660 dims = np.atleast_1d(dimensions)
662 if not np.issubdtype(dims.dtype, np.integer):
663 msg = ("Elements of `dimensions` must be integers - the indices "
664 "of the marginal variables being retained.")
665 raise ValueError(msg)
667 n = len(self.dataset) # number of dimensions
668 original_dims = dims.copy()
670 dims[dims < 0] = n + dims[dims < 0]
672 if len(np.unique(dims)) != len(dims):
673 msg = ("All elements of `dimensions` must be unique.")
674 raise ValueError(msg)
676 i_invalid = (dims < 0) | (dims >= n)
677 if np.any(i_invalid):
678 msg = (f"Dimensions {original_dims[i_invalid]} are invalid "
679 f"for a distribution in {n} dimensions.")
680 raise ValueError(msg)
682 dataset = self.dataset[dims]
683 weights = self.weights
685 return gaussian_kde(dataset, bw_method=self.covariance_factor(),
686 weights=weights)

    @property
    def weights(self):
        try:
            return self._weights
        except AttributeError:
            self._weights = ones(self.n)/self.n
            return self._weights

    @property
    def neff(self):
        try:
            return self._neff
        except AttributeError:
            self._neff = 1/sum(self.weights**2)
            return self._neff


def _get_output_dtype(covariance, points):
    """
    Calculates the output dtype and the "spec" (=C type name).

    This was necessary in order to deal with the fused types in the Cython
    routine `gaussian_kernel_estimate`. See gh-10824 for details.
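
    For example (a sketch; float64 inputs should map to the C ``double``
    spec):

    >>> import numpy as np
    >>> dtype, spec = _get_output_dtype(np.eye(2), np.zeros((2, 3)))
    >>> np.dtype(dtype), spec
    (dtype('float64'), 'double')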
711 """
712 output_dtype = np.common_type(covariance, points)
713 itemsize = np.dtype(output_dtype).itemsize
714 if itemsize == 4:
715 spec = 'float'
716 elif itemsize == 8:
717 spec = 'double'
718 elif itemsize in (12, 16):
719 spec = 'long double'
720 else:
721 raise ValueError(
722 f"{output_dtype} has unexpected item size: {itemsize}"
723 )
725 return output_dtype, spec