#-------------------------------------------------------------------------------
#
#  Define classes for (uni/multi)-variate kernel density estimation.
#
#  Currently, only Gaussian kernels are implemented.
#
#  Written by: Robert Kern
#
#  Date: 2004-08-09
#
#  Modified: 2005-02-10 by Robert Kern.
#              Contributed to SciPy
#            2005-10-07 by Robert Kern.
#              Some fixes to match the new scipy_core
#
#  Copyright 2004-2005 by Enthought, Inc.
#
#-------------------------------------------------------------------------------

# Standard library imports.
import warnings

# SciPy imports.
from scipy import linalg, special
from scipy._lib._util import check_random_state

from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, exp, pi,
                   sqrt, ravel, power, atleast_1d, squeeze, sum, transpose,
                   ones, cov)
import numpy as np

# Local imports.
from . import _mvn
from ._stats import gaussian_kernel_estimate, gaussian_kernel_estimate_log

__all__ = ['gaussian_kde']


class gaussian_kde:
    """Representation of a kernel-density estimate using Gaussian kernels.

    Kernel density estimation is a way to estimate the probability density
    function (PDF) of a random variable in a non-parametric way.
    `gaussian_kde` works for both uni-variate and multi-variate data. It
    includes automatic bandwidth determination. The estimation works best for
    a unimodal distribution; bimodal or multi-modal distributions tend to be
    oversmoothed.

    Parameters
    ----------
    dataset : array_like
        Datapoints to estimate from. In case of univariate data this is a 1-D
        array, otherwise a 2-D array with shape (# of dims, # of data).
    bw_method : str, scalar or callable, optional
        The method used to calculate the estimator bandwidth. This can be
        'scott', 'silverman', a scalar constant or a callable. If a scalar,
        this will be used directly as `kde.factor`. If a callable, it should
        take a `gaussian_kde` instance as its only parameter and return a
        scalar. If None (default), 'scott' is used. See Notes for more
        details.
    weights : array_like, optional
        Weights of datapoints. This must be the same shape as dataset.
        If None (default), the samples are assumed to be equally weighted.

    Attributes
    ----------
    dataset : ndarray
        The dataset with which `gaussian_kde` was initialized.
    d : int
        Number of dimensions.
    n : int
        Number of datapoints.
    neff : int
        Effective number of datapoints.

        .. versionadded:: 1.2.0
    factor : float
        The bandwidth factor, obtained from `kde.covariance_factor`. The square
        of `kde.factor` multiplies the covariance matrix of the data in the kde
        estimation.
    covariance : ndarray
        The covariance matrix of `dataset`, scaled by the calculated bandwidth
        (`kde.factor`).
    inv_cov : ndarray
        The inverse of `covariance`.

    Methods
    -------
    evaluate
    __call__
    integrate_gaussian
    integrate_box_1d
    integrate_box
    integrate_kde
    pdf
    logpdf
    resample
    set_bandwidth
    covariance_factor

    Notes
    -----
    Bandwidth selection strongly influences the estimate obtained from the KDE
    (much more so than the actual shape of the kernel). Bandwidth selection
    can be done by a "rule of thumb", by cross-validation, by "plug-in
    methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
    uses a rule of thumb; the default is Scott's Rule.

    Scott's Rule [1]_, implemented as `scotts_factor`, is::

        n**(-1./(d+4)),

    with ``n`` the number of data points and ``d`` the number of dimensions.
    In the case of unequally weighted points, `scotts_factor` becomes::

        neff**(-1./(d+4)),

    with ``neff`` the effective number of datapoints.

    Silverman's Rule [2]_, implemented as `silverman_factor`, is::

        (n * (d + 2) / 4.)**(-1. / (d + 4)).

    or in the case of unequally weighted points::

        (neff * (d + 2) / 4.)**(-1. / (d + 4)).
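
    A quick sanity check of these rules (a minimal doctest sketch with
    made-up data, assuming equal weights so that ``neff == n``):

    >>> import numpy as np
    >>> from scipy import stats
    >>> kde = stats.gaussian_kde(np.arange(10.))
    >>> bool(np.isclose(kde.factor, 10**(-1./(1 + 4))))  # Scott's Rule
    True
    >>> kde.set_bandwidth('silverman')
    >>> bool(np.isclose(kde.factor, (10 * (1 + 2) / 4.)**(-1./(1 + 4))))
    True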

    Good general descriptions of kernel density estimation can be found in [1]_
    and [2]_; the mathematics for this multi-dimensional implementation can be
    found in [1]_.

    With a set of weighted samples, the effective number of datapoints ``neff``
    is defined by::

        neff = sum(weights)^2 / sum(weights^2)

    as detailed in [5]_.
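
    For example, with hypothetical weights ``[1, 2, 3]`` (a minimal sketch;
    `gaussian_kde` normalizes the weights internally):

    >>> import numpy as np
    >>> from scipy import stats
    >>> w = np.array([1., 2., 3.])
    >>> kde = stats.gaussian_kde(np.array([0., 1., 2.]), weights=w)
    >>> bool(np.isclose(kde.neff, w.sum()**2 / (w**2).sum()))
    True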

    `gaussian_kde` does not currently support data that lies in a
    lower-dimensional subspace of the space in which it is expressed. For such
    data, consider performing principal component analysis / dimensionality
    reduction and using `gaussian_kde` with the transformed data.

    References
    ----------
    .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
           Visualization", John Wiley & Sons, New York, Chicester, 1992.
    .. [2] B.W. Silverman, "Density Estimation for Statistics and Data
           Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
           Chapman and Hall, London, 1986.
    .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
           Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
    .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
           conditional density estimation", Computational Statistics & Data
           Analysis, Vol. 36, pp. 279-298, 2001.
    .. [5] P.G. Gray, Journal of the Royal Statistical Society, Series A
           (General), Vol. 132, p. 272, 1969.

    Examples
    --------
    Generate some random two-dimensional data:

    >>> import numpy as np
    >>> from scipy import stats
    >>> def measure(n):
    ...     "Measurement model, return two coupled measurements."
    ...     m1 = np.random.normal(size=n)
    ...     m2 = np.random.normal(scale=0.5, size=n)
    ...     return m1+m2, m1-m2

    >>> m1, m2 = measure(2000)
    >>> xmin = m1.min()
    >>> xmax = m1.max()
    >>> ymin = m2.min()
    >>> ymax = m2.max()

    Perform a kernel density estimate on the data:

    >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    >>> positions = np.vstack([X.ravel(), Y.ravel()])
    >>> values = np.vstack([m1, m2])
    >>> kernel = stats.gaussian_kde(values)
    >>> Z = np.reshape(kernel(positions).T, X.shape)

    Plot the results:

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots()
    >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
    ...           extent=[xmin, xmax, ymin, ymax])
    >>> ax.plot(m1, m2, 'k.', markersize=2)
    >>> ax.set_xlim([xmin, xmax])
    >>> ax.set_ylim([ymin, ymax])
    >>> plt.show()

    """

    def __init__(self, dataset, bw_method=None, weights=None):
        self.dataset = atleast_2d(asarray(dataset))
        if not self.dataset.size > 1:
            raise ValueError("`dataset` input should have multiple elements.")

        self.d, self.n = self.dataset.shape

        if weights is not None:
            self._weights = atleast_1d(weights).astype(float)
            self._weights /= sum(self._weights)
            if self.weights.ndim != 1:
                raise ValueError("`weights` input should be one-dimensional.")
            if len(self._weights) != self.n:
                raise ValueError("`weights` input should be of length n")
            self._neff = 1/sum(self._weights**2)

        # This can be converted to a warning once gh-10205 is resolved
        if self.d > self.n:
            msg = ("Number of dimensions is greater than number of samples. "
                   "This results in a singular data covariance matrix, which "
                   "cannot be treated using the algorithms implemented in "
                   "`gaussian_kde`. Note that `gaussian_kde` interprets each "
                   "*column* of `dataset` to be a point; consider transposing "
                   "the input to `dataset`.")
            raise ValueError(msg)

        try:
            self.set_bandwidth(bw_method=bw_method)
        except linalg.LinAlgError as e:
            msg = ("The data appears to lie in a lower-dimensional subspace "
                   "of the space in which it is expressed. This has resulted "
                   "in a singular data covariance matrix, which cannot be "
                   "treated using the algorithms implemented in "
                   "`gaussian_kde`. Consider performing principal component "
                   "analysis / dimensionality reduction and using "
                   "`gaussian_kde` with the transformed data.")
            raise linalg.LinAlgError(msg) from e

    def evaluate(self, points):
        """Evaluate the estimated pdf on a set of points.

        Parameters
        ----------
        points : (# of dimensions, # of points)-array
            Alternatively, a (# of dimensions,) vector can be passed in and
            treated as a single point.

        Returns
        -------
        values : (# of points,)-array
            The values at each point.

        Raises
        ------
        ValueError
            If the dimensionality of the input points is different from the
            dimensionality of the KDE.
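
        Examples
        --------
        A minimal sketch with made-up data (three points on a line; the
        output has one entry per evaluation point):

        >>> import numpy as np
        >>> from scipy import stats
        >>> kde = stats.gaussian_kde(np.array([-1., 0., 1.]))
        >>> kde.evaluate(np.array([0., 2.])).shape
        (2,)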
253 """
254 points = atleast_2d(asarray(points))
256 d, m = points.shape
257 if d != self.d:
258 if d == 1 and m == self.d:
259 # points was passed in as a row vector
260 points = reshape(points, (self.d, 1))
261 m = 1
262 else:
263 msg = "points have dimension %s, dataset has dimension %s" % (d,
264 self.d)
265 raise ValueError(msg)
267 output_dtype, spec = _get_output_dtype(self.covariance, points)
268 result = gaussian_kernel_estimate[spec](
269 self.dataset.T, self.weights[:, None],
270 points.T, self.cho_cov, output_dtype)
272 return result[:, 0]
274 __call__ = evaluate

    def integrate_gaussian(self, mean, cov):
        """
        Multiply estimated density by a multivariate Gaussian and integrate
        over the whole space.

        Parameters
        ----------
        mean : array_like
            A 1-D array, specifying the mean of the Gaussian.
        cov : array_like
            A 2-D array, specifying the covariance matrix of the Gaussian.

        Returns
        -------
        result : scalar
            The value of the integral.

        Raises
        ------
        ValueError
            If the mean or covariance of the input Gaussian differs from
            the KDE's dimensionality.
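
        Examples
        --------
        A sketch of a consistency check against direct numerical quadrature,
        using made-up data and an arbitrary Gaussian (mean 0.5, variance 1):

        >>> import numpy as np
        >>> from scipy import stats, integrate
        >>> kde = stats.gaussian_kde(np.array([-1., 0., 1.]))
        >>> val = kde.integrate_gaussian(mean=[0.5], cov=[[1.0]])
        >>> quad_val, _ = integrate.quad(
        ...     lambda x: kde(x)[0] * stats.norm.pdf(x, loc=0.5, scale=1.0),
        ...     -np.inf, np.inf)
        >>> bool(np.isclose(val, quad_val))
        True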
299 """
300 mean = atleast_1d(squeeze(mean))
301 cov = atleast_2d(cov)
303 if mean.shape != (self.d,):
304 raise ValueError("mean does not have dimension %s" % self.d)
305 if cov.shape != (self.d, self.d):
306 raise ValueError("covariance does not have dimension %s" % self.d)
308 # make mean a column vector
309 mean = mean[:, newaxis]
311 sum_cov = self.covariance + cov
313 # This will raise LinAlgError if the new cov matrix is not s.p.d
314 # cho_factor returns (ndarray, bool) where bool is a flag for whether
315 # or not ndarray is upper or lower triangular
316 sum_cov_chol = linalg.cho_factor(sum_cov)
318 diff = self.dataset - mean
319 tdiff = linalg.cho_solve(sum_cov_chol, diff)
321 sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
322 norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
324 energies = sum(diff * tdiff, axis=0) / 2.0
325 result = sum(exp(-energies)*self.weights, axis=0) / norm_const
327 return result

    def integrate_box_1d(self, low, high):
        """
        Computes the integral of a 1D pdf between two bounds.

        Parameters
        ----------
        low : scalar
            Lower bound of integration.
        high : scalar
            Upper bound of integration.

        Returns
        -------
        value : scalar
            The result of the integral.

        Raises
        ------
        ValueError
            If the KDE is over more than one dimension.
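
        Examples
        --------
        A minimal sketch with made-up data; integrating over the whole real
        line should recover the total probability mass:

        >>> import numpy as np
        >>> from scipy import stats
        >>> kde = stats.gaussian_kde(np.array([-1., 0., 1.]))
        >>> round(float(kde.integrate_box_1d(-np.inf, np.inf)), 8)
        1.0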
350 """
351 if self.d != 1:
352 raise ValueError("integrate_box_1d() only handles 1D pdfs")
354 stdev = ravel(sqrt(self.covariance))[0]
356 normalized_low = ravel((low - self.dataset) / stdev)
357 normalized_high = ravel((high - self.dataset) / stdev)
359 value = np.sum(self.weights*(
360 special.ndtr(normalized_high) -
361 special.ndtr(normalized_low)))
362 return value

    def integrate_box(self, low_bounds, high_bounds, maxpts=None):
        """Computes the integral of a pdf over a rectangular interval.

        Parameters
        ----------
        low_bounds : array_like
            A 1-D array containing the lower bounds of integration.
        high_bounds : array_like
            A 1-D array containing the upper bounds of integration.
        maxpts : int, optional
            The maximum number of points to use for integration.

        Returns
        -------
        value : scalar
            The result of the integral.
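
        Examples
        --------
        A rough sketch with made-up 2-D data; a box wide enough to hold
        essentially all of the mass should integrate to about 1:

        >>> import numpy as np
        >>> from scipy import stats
        >>> values = np.array([[-1., 0., 1.], [1., -1., 0.]])
        >>> kde = stats.gaussian_kde(values)
        >>> v = kde.integrate_box([-10, -10], [10, 10])
        >>> print(f"{v:.3f}")
        1.000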
381 """
382 if maxpts is not None:
383 extra_kwds = {'maxpts': maxpts}
384 else:
385 extra_kwds = {}
387 value, inform = _mvn.mvnun_weighted(low_bounds, high_bounds,
388 self.dataset, self.weights,
389 self.covariance, **extra_kwds)
390 if inform:
391 msg = ('An integral in _mvn.mvnun requires more points than %s' %
392 (self.d * 1000))
393 warnings.warn(msg)
395 return value

    def integrate_kde(self, other):
        """
        Computes the integral of the product of this kernel density estimate
        with another.

        Parameters
        ----------
        other : gaussian_kde instance
            The other kde.

        Returns
        -------
        value : scalar
            The result of the integral.

        Raises
        ------
        ValueError
            If the KDEs have different dimensionality.
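
        Examples
        --------
        A minimal sketch with made-up data; the integral is symmetric in the
        two estimates, which gives a cheap consistency check:

        >>> import numpy as np
        >>> from scipy import stats
        >>> kde1 = stats.gaussian_kde(np.array([-1., 0., 1.]))
        >>> kde2 = stats.gaussian_kde(np.array([0., 1., 2., 3.]))
        >>> bool(np.isclose(kde1.integrate_kde(kde2),
        ...                 kde2.integrate_kde(kde1)))
        True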
417 """
418 if other.d != self.d:
419 raise ValueError("KDEs are not the same dimensionality")
421 # we want to iterate over the smallest number of points
422 if other.n < self.n:
423 small = other
424 large = self
425 else:
426 small = self
427 large = other
429 sum_cov = small.covariance + large.covariance
430 sum_cov_chol = linalg.cho_factor(sum_cov)
431 result = 0.0
432 for i in range(small.n):
433 mean = small.dataset[:, i, newaxis]
434 diff = large.dataset - mean
435 tdiff = linalg.cho_solve(sum_cov_chol, diff)
437 energies = sum(diff * tdiff, axis=0) / 2.0
438 result += sum(exp(-energies)*large.weights, axis=0)*small.weights[i]
440 sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
441 norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
443 result /= norm_const
445 return result

    def resample(self, size=None, seed=None):
        """Randomly sample a dataset from the estimated pdf.

        Parameters
        ----------
        size : int, optional
            The number of samples to draw. If not provided, then the size is
            the same as the effective number of samples in the underlying
            dataset.
        seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
            If `seed` is None (or `np.random`), the `numpy.random.RandomState`
            singleton is used.
            If `seed` is an int, a new ``RandomState`` instance is used,
            seeded with `seed`.
            If `seed` is already a ``Generator`` or ``RandomState`` instance
            then that instance is used.

        Returns
        -------
        resample : (self.d, `size`) ndarray
            The sampled dataset.
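
        Examples
        --------
        A minimal sketch with made-up data and a seeded generator for
        reproducibility; the result always has shape ``(d, size)``:

        >>> import numpy as np
        >>> from scipy import stats
        >>> kde = stats.gaussian_kde(np.array([-1., 0., 1.]))
        >>> kde.resample(size=5, seed=np.random.default_rng(1234)).shape
        (1, 5)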
469 """
470 if size is None:
471 size = int(self.neff)
473 random_state = check_random_state(seed)
474 norm = transpose(random_state.multivariate_normal(
475 zeros((self.d,), float), self.covariance, size=size
476 ))
477 indices = random_state.choice(self.n, size=size, p=self.weights)
478 means = self.dataset[:, indices]
480 return means + norm

    def scotts_factor(self):
        """Compute Scott's factor.

        Returns
        -------
        s : float
            Scott's factor.
        """
        return power(self.neff, -1./(self.d+4))

    def silverman_factor(self):
        """Compute the Silverman factor.

        Returns
        -------
        s : float
            The Silverman factor.
        """
        return power(self.neff*(self.d+2.0)/4.0, -1./(self.d+4))

    # Default method to calculate bandwidth; can be overridden by a subclass
    covariance_factor = scotts_factor
    covariance_factor.__doc__ = """Computes the coefficient (`kde.factor`) that
        multiplies the data covariance matrix to obtain the kernel covariance
        matrix. The default is `scotts_factor`. A subclass can override this
        method to provide a different method, or set it through a call to
        `kde.set_bandwidth`."""

    def set_bandwidth(self, bw_method=None):
        """Compute the estimator bandwidth with given method.

        The new bandwidth calculated after a call to `set_bandwidth` is used
        for subsequent evaluations of the estimated density.

        Parameters
        ----------
        bw_method : str, scalar or callable, optional
            The method used to calculate the estimator bandwidth. This can be
            'scott', 'silverman', a scalar constant or a callable. If a
            scalar, this will be used directly as `kde.factor`. If a callable,
            it should take a `gaussian_kde` instance as its only parameter and
            return a scalar. If None (default), nothing happens; the current
            `kde.covariance_factor` method is kept.

        Notes
        -----
        .. versionadded:: 0.11

        Examples
        --------
        >>> import numpy as np
        >>> import scipy.stats as stats
        >>> x1 = np.array([-7, -5, 1, 4, 5.])
        >>> kde = stats.gaussian_kde(x1)
        >>> xs = np.linspace(-10, 10, num=50)
        >>> y1 = kde(xs)
        >>> kde.set_bandwidth(bw_method='silverman')
        >>> y2 = kde(xs)
        >>> kde.set_bandwidth(bw_method=kde.factor / 3.)
        >>> y3 = kde(xs)

        >>> import matplotlib.pyplot as plt
        >>> fig, ax = plt.subplots()
        >>> ax.plot(x1, np.full(x1.shape, 1 / (4. * x1.size)), 'bo',
        ...         label='Data points (rescaled)')
        >>> ax.plot(xs, y1, label='Scott (default)')
        >>> ax.plot(xs, y2, label='Silverman')
        >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
        >>> ax.legend()
        >>> plt.show()

        """
        if bw_method is None:
            pass
        elif bw_method == 'scott':
            self.covariance_factor = self.scotts_factor
        elif bw_method == 'silverman':
            self.covariance_factor = self.silverman_factor
        elif np.isscalar(bw_method) and not isinstance(bw_method, str):
            self._bw_method = 'use constant'
            self.covariance_factor = lambda: bw_method
        elif callable(bw_method):
            self._bw_method = bw_method
            self.covariance_factor = lambda: self._bw_method(self)
        else:
            msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
                  "or a callable."
            raise ValueError(msg)

        self._compute_covariance()

    def _compute_covariance(self):
        """Computes the covariance matrix for each Gaussian kernel using
        covariance_factor().
        """
        self.factor = self.covariance_factor()
        # Cache covariance and Cholesky decomp of covariance
        if not hasattr(self, '_data_cho_cov'):
            self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
                                                   bias=False,
                                                   aweights=self.weights))
            self._data_cho_cov = linalg.cholesky(self._data_covariance,
                                                 lower=True)

        self.covariance = self._data_covariance * self.factor**2
        self.cho_cov = (self._data_cho_cov * self.factor).astype(np.float64)
        self.log_det = 2*np.log(np.diag(self.cho_cov
                                        * np.sqrt(2*pi))).sum()

    @property
    def inv_cov(self):
        # Re-compute from scratch each time because I'm not sure how this is
        # used in the wild. (Perhaps users change the `dataset`, since it's
        # not a private attribute?) `_compute_covariance` used to recalculate
        # all these, so we'll recalculate everything now that this is a
        # property.
        self.factor = self.covariance_factor()
        self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
                                               bias=False,
                                               aweights=self.weights))
        return linalg.inv(self._data_covariance) / self.factor**2

    def pdf(self, x):
        """
        Evaluate the estimated pdf on a provided set of points.

        Notes
        -----
        This is an alias for `gaussian_kde.evaluate`. See the ``evaluate``
        docstring for more details.

        """
        return self.evaluate(x)

    def logpdf(self, x):
        """
        Evaluate the log of the estimated pdf on a provided set of points.
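
        A quick sketch of the expected relationship with `pdf`, using made-up
        data (agreement is up to floating-point error):

        >>> import numpy as np
        >>> from scipy import stats
        >>> kde = stats.gaussian_kde(np.array([-1., 0., 1.]))
        >>> xs = np.linspace(-2, 2, 5)
        >>> bool(np.allclose(kde.logpdf(xs), np.log(kde.pdf(xs))))
        True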
618 """
619 points = atleast_2d(x)
621 d, m = points.shape
622 if d != self.d:
623 if d == 1 and m == self.d:
624 # points was passed in as a row vector
625 points = reshape(points, (self.d, 1))
626 m = 1
627 else:
628 msg = (f"points have dimension {d}, "
629 f"dataset has dimension {self.d}")
630 raise ValueError(msg)
632 output_dtype, spec = _get_output_dtype(self.covariance, points)
633 result = gaussian_kernel_estimate_log[spec](
634 self.dataset.T, self.weights[:, None],
635 points.T, self.cho_cov, output_dtype)
637 return result[:, 0]

    def marginal(self, dimensions):
        """Return a marginal KDE distribution.

        Parameters
        ----------
        dimensions : int or 1-d array_like
            The dimensions of the multivariate distribution corresponding
            to the marginal variables, that is, the indices of the dimensions
            that are being retained. The other dimensions are marginalized out.

        Returns
        -------
        marginal_kde : gaussian_kde
            An object representing the marginal distribution.

        Notes
        -----
        .. versionadded:: 1.10.0
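
        Examples
        --------
        A minimal sketch with made-up 2-D data; the marginal over one
        dimension is itself a normalized 1-D estimate:

        >>> import numpy as np
        >>> from scipy import stats
        >>> values = np.array([[-1., 0., 1.], [1., -1., 0.]])
        >>> kde = stats.gaussian_kde(values)
        >>> marg = kde.marginal(0)
        >>> marg.d
        1
        >>> round(float(marg.integrate_box_1d(-np.inf, np.inf)), 8)
        1.0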
658 """
660 dims = np.atleast_1d(dimensions)
662 if not np.issubdtype(dims.dtype, np.integer):
663 msg = ("Elements of `dimensions` must be integers - the indices "
664 "of the marginal variables being retained.")
665 raise ValueError(msg)
667 n = len(self.dataset) # number of dimensions
668 original_dims = dims.copy()
670 dims[dims < 0] = n + dims[dims < 0]
672 if len(np.unique(dims)) != len(dims):
673 msg = ("All elements of `dimensions` must be unique.")
674 raise ValueError(msg)
676 i_invalid = (dims < 0) | (dims >= n)
677 if np.any(i_invalid):
678 msg = (f"Dimensions {original_dims[i_invalid]} are invalid "
679 f"for a distribution in {n} dimensions.")
680 raise ValueError(msg)
682 dataset = self.dataset[dims]
683 weights = self.weights
685 return gaussian_kde(dataset, bw_method=self.covariance_factor(),
686 weights=weights)

    @property
    def weights(self):
        try:
            return self._weights
        except AttributeError:
            self._weights = ones(self.n)/self.n
            return self._weights

    @property
    def neff(self):
        try:
            return self._neff
        except AttributeError:
            self._neff = 1/sum(self.weights**2)
            return self._neff


def _get_output_dtype(covariance, points):
    """
    Calculates the output dtype and the "spec" (=C type name).

    This was necessary in order to deal with the fused types in the Cython
    routine `gaussian_kernel_estimate`. See gh-10824 for details.
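
    For example (a sketch; float64 inputs should map to the C ``double``
    spec):

    >>> import numpy as np
    >>> dtype, spec = _get_output_dtype(np.eye(2), np.zeros((2, 3)))
    >>> np.dtype(dtype), spec
    (dtype('float64'), 'double')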
711 """
712 output_dtype = np.common_type(covariance, points)
713 itemsize = np.dtype(output_dtype).itemsize
714 if itemsize == 4:
715 spec = 'float'
716 elif itemsize == 8:
717 spec = 'double'
718 elif itemsize in (12, 16):
719 spec = 'long double'
720 else:
721 raise ValueError(
722 f"{output_dtype} has unexpected item size: {itemsize}"
723 )
725 return output_dtype, spec