Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_morestats.py: 10%
985 statements
coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
1from __future__ import annotations
2import math
3import warnings
4from collections import namedtuple
6import numpy as np
7from numpy import (isscalar, r_, log, around, unique, asarray, zeros,
8 arange, sort, amin, amax, atleast_1d, sqrt, array,
9 compress, pi, exp, ravel, count_nonzero, sin, cos,
10 arctan2, hypot)
12from scipy import optimize
13from scipy import special
14from scipy._lib._bunch import _make_tuple_bunch
15from scipy._lib._util import _rename_parameter, _contains_nan
17from . import _statlib
18from . import _stats_py
19from ._fit import FitResult
20from ._stats_py import find_repeats, _normtest_finish, SignificanceResult
21from .contingency import chi2_contingency
22from . import distributions
23from ._distn_infrastructure import rv_generic
24from ._hypotests import _get_wilcoxon_distr
25from ._axis_nan_policy import _axis_nan_policy_factory
26from .._lib.deprecation import _deprecated
29__all__ = ['mvsdist',
30 'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot',
31 'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot',
32 'shapiro', 'anderson', 'ansari', 'bartlett', 'levene', 'binom_test',
33 'fligner', 'mood', 'wilcoxon', 'median_test',
34 'circmean', 'circvar', 'circstd', 'anderson_ksamp',
35 'yeojohnson_llf', 'yeojohnson', 'yeojohnson_normmax',
36 'yeojohnson_normplot', 'directional_stats'
37 ]
40Mean = namedtuple('Mean', ('statistic', 'minmax'))
41Variance = namedtuple('Variance', ('statistic', 'minmax'))
42Std_dev = namedtuple('Std_dev', ('statistic', 'minmax'))
45def bayes_mvs(data, alpha=0.90):
46 r"""
47 Bayesian confidence intervals for the mean, var, and std.
49 Parameters
50 ----------
51 data : array_like
52 Input data. If multi-dimensional, it is flattened to 1-D by `bayes_mvs`.
53 Requires 2 or more data points.
54 alpha : float, optional
55 Probability that the returned confidence interval contains
56 the true parameter.
58 Returns
59 -------
60 mean_cntr, var_cntr, std_cntr : tuple
61 The three results are for the mean, variance and standard deviation,
62 respectively. Each result is a tuple of the form::
64 (center, (lower, upper))
66 with `center` the mean of the conditional pdf of the value given the
67 data, and `(lower, upper)` a confidence interval, centered on the
68 median, containing the estimate to a probability ``alpha``.
70 See Also
71 --------
72 mvsdist
74 Notes
75 -----
76 Each tuple of mean, variance, and standard deviation estimates represents
77 the (center, (lower, upper)) with center the mean of the conditional pdf
78 of the value given the data and (lower, upper) a confidence interval
79 centered on the median, containing the estimate to a probability
80 ``alpha``.
82 Converts data to 1-D and assumes all data has the same mean and variance.
83 Uses Jeffrey's prior for variance and std.
85 Equivalent to ``tuple((x.mean(), x.interval(alpha)) for x in mvsdist(data))``
87 References
88 ----------
89 T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
90 standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
91 2006.
93 Examples
94 --------
95 First a basic example to demonstrate the outputs:
97 >>> from scipy import stats
98 >>> data = [6, 9, 12, 7, 8, 8, 13]
99 >>> mean, var, std = stats.bayes_mvs(data)
100 >>> mean
101 Mean(statistic=9.0, minmax=(7.103650222612533, 10.896349777387467))
102 >>> var
103 Variance(statistic=10.0, minmax=(3.176724206..., 24.45910382...))
104 >>> std
105 Std_dev(statistic=2.9724954732045084, minmax=(1.7823367265645143, 4.945614605014631))
107 Now we generate some normally distributed random data, and get estimates of
108 mean and standard deviation with 95% confidence intervals for those
109 estimates:
111 >>> n_samples = 100000
112 >>> data = stats.norm.rvs(size=n_samples)
113 >>> res_mean, res_var, res_std = stats.bayes_mvs(data, alpha=0.95)
115 >>> import matplotlib.pyplot as plt
116 >>> fig = plt.figure()
117 >>> ax = fig.add_subplot(111)
118 >>> ax.hist(data, bins=100, density=True, label='Histogram of data')
119 >>> ax.vlines(res_mean.statistic, 0, 0.5, colors='r', label='Estimated mean')
120 >>> ax.axvspan(res_mean.minmax[0],res_mean.minmax[1], facecolor='r',
121 ... alpha=0.2, label=r'Estimated mean (95% limits)')
122 >>> ax.vlines(res_std.statistic, 0, 0.5, colors='g', label='Estimated scale')
123 >>> ax.axvspan(res_std.minmax[0],res_std.minmax[1], facecolor='g', alpha=0.2,
124 ... label=r'Estimated scale (95% limits)')
126 >>> ax.legend(fontsize=10)
127 >>> ax.set_xlim([-4, 4])
128 >>> ax.set_ylim([0, 0.5])
129 >>> plt.show()
131 """
132 m, v, s = mvsdist(data)
133 if alpha >= 1 or alpha <= 0:
134 raise ValueError("0 < alpha < 1 is required, but alpha=%s was given."
135 % alpha)
137 m_res = Mean(m.mean(), m.interval(alpha))
138 v_res = Variance(v.mean(), v.interval(alpha))
139 s_res = Std_dev(s.mean(), s.interval(alpha))
141 return m_res, v_res, s_res
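# Illustrative sketch (hypothetical helper, not part of the original module):
# the Notes above state that `bayes_mvs` is equivalent to taking the mean and
# `interval(alpha)` of each distribution returned by `mvsdist`. A minimal
# check of that claim using only the public scipy.stats API:
def _sketch_bayes_mvs_equivalence(alpha=0.9):
    from scipy import stats
    data = [6, 9, 12, 7, 8, 8, 13]
    via_bayes = stats.bayes_mvs(data, alpha=alpha)
    via_mvsdist = tuple((d.mean(), d.interval(alpha)) for d in stats.mvsdist(data))
    for res, (center, interval) in zip(via_bayes, via_mvsdist):
        assert np.isclose(res.statistic, center)
        assert np.allclose(res.minmax, interval)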
144def mvsdist(data):
145 """
146 'Frozen' distributions for mean, variance, and standard deviation of data.
148 Parameters
149 ----------
150 data : array_like
151 Input array. Converted to 1-D using ravel.
152 Requires 2 or more data-points.
154 Returns
155 -------
156 mdist : "frozen" distribution object
157 Distribution object representing the mean of the data.
158 vdist : "frozen" distribution object
159 Distribution object representing the variance of the data.
160 sdist : "frozen" distribution object
161 Distribution object representing the standard deviation of the data.
163 See Also
164 --------
165 bayes_mvs
167 Notes
168 -----
169 The return values from ``bayes_mvs(data)`` are equivalent to
170 ``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``.
172 In other words, calling ``<dist>.mean()`` and ``<dist>.interval(0.90)``
173 on the three distribution objects returned from this function will give
174 the same results that are returned from `bayes_mvs`.
176 References
177 ----------
178 T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
179 standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
180 2006.
182 Examples
183 --------
184 >>> from scipy import stats
185 >>> data = [6, 9, 12, 7, 8, 8, 13]
186 >>> mean, var, std = stats.mvsdist(data)
188 We now have frozen distribution objects "mean", "var" and "std" that we can
189 examine:
191 >>> mean.mean()
192 9.0
193 >>> mean.interval(0.95)
194 (6.6120585482655692, 11.387941451734431)
195 >>> mean.std()
196 1.1952286093343936
198 """
199 x = ravel(data)
200 n = len(x)
201 if n < 2:
202 raise ValueError("Need at least 2 data-points.")
203 xbar = x.mean()
204 C = x.var()
205 if n > 1000: # gaussian approximations for large n
206 mdist = distributions.norm(loc=xbar, scale=math.sqrt(C / n))
207 sdist = distributions.norm(loc=math.sqrt(C), scale=math.sqrt(C / (2. * n)))
208 vdist = distributions.norm(loc=C, scale=math.sqrt(2.0 / n) * C)
209 else:
210 nm1 = n - 1
211 fac = n * C / 2.
212 val = nm1 / 2.
213 mdist = distributions.t(nm1, loc=xbar, scale=math.sqrt(C / nm1))
214 sdist = distributions.gengamma(val, -2, scale=math.sqrt(fac))
215 vdist = distributions.invgamma(val, scale=fac)
216 return mdist, vdist, sdist
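# Illustrative sketch (hypothetical helper, not part of the original module):
# for fewer than 1000 points the mean distribution built above is a Student t
# with df = n - 1, centered on the sample mean with scale sqrt(C / (n - 1)),
# where C is the biased sample variance. A quick check via the public API:
def _sketch_mvsdist_mean_is_student_t():
    from scipy import stats
    data = np.array([6, 9, 12, 7, 8, 8, 13], dtype=float)
    n, xbar, C = data.size, data.mean(), data.var()
    mdist, _, _ = stats.mvsdist(data)
    expected = stats.t(n - 1, loc=xbar, scale=np.sqrt(C / (n - 1)))
    assert np.isclose(mdist.mean(), expected.mean())
    assert np.allclose(mdist.interval(0.95), expected.interval(0.95))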
219@_axis_nan_policy_factory(
220 lambda x: x, result_to_tuple=lambda x: (x,), n_outputs=1, default_axis=None
221)
222def kstat(data, n=2):
223 r"""
224 Return the nth k-statistic (1<=n<=4 so far).
226 The nth k-statistic k_n is the unique symmetric unbiased estimator of the
227 nth cumulant kappa_n.
229 Parameters
230 ----------
231 data : array_like
232 Input array. Note that n-D input gets flattened.
233 n : int, {1, 2, 3, 4}, optional
234 Default is equal to 2.
236 Returns
237 -------
238 kstat : float
239 The nth k-statistic.
241 See Also
242 --------
243 kstatvar: Returns an unbiased estimator of the variance of the k-statistic.
244 moment: Returns the n-th central moment about the mean for a sample.
246 Notes
247 -----
248 For a sample size n, the first few k-statistics are given by:
250 .. math::
252 k_{1} = \mu
253 k_{2} = \frac{n}{n-1} m_{2}
254 k_{3} = \frac{ n^{2} } {(n-1) (n-2)} m_{3}
255 k_{4} = \frac{ n^{2} [(n + 1)m_{4} - 3(n - 1) m^2_{2}]} {(n-1) (n-2) (n-3)}
257 where :math:`\mu` is the sample mean, :math:`m_2` is the sample
258 variance, and :math:`m_i` is the i-th sample central moment.
260 References
261 ----------
262 http://mathworld.wolfram.com/k-Statistic.html
264 http://mathworld.wolfram.com/Cumulant.html
266 Examples
267 --------
268 >>> from scipy import stats
269 >>> from numpy.random import default_rng
270 >>> rng = default_rng()
272 As sample size increases, n-th moment and n-th k-statistic converge to the
273 same number (although they aren't identical). In the case of the normal
274 distribution, they converge to zero.
276 >>> for n in [2, 3, 4, 5, 6, 7]:
277 ... x = rng.normal(size=10**n)
278 ... m, k = stats.moment(x, 3), stats.kstat(x, 3)
279 ... print("%.3g %.3g %.3g" % (m, k, m-k))
280 -0.631 -0.651 0.0194 # random
281 0.0282 0.0283 -8.49e-05
282 -0.0454 -0.0454 1.36e-05
283 7.53e-05 7.53e-05 -2.26e-09
284 0.00166 0.00166 -4.99e-09
285 -2.88e-06 -2.88e-06 8.63e-13
286 """
287 if n > 4 or n < 1:
288 raise ValueError("k-statistics only supported for 1<=n<=4")
289 n = int(n)
290 S = np.zeros(n + 1, np.float64)
291 data = ravel(data)
292 N = data.size
294 # raise ValueError on empty input
295 if N == 0:
296 raise ValueError("Data input must not be empty")
298 # on nan input, return nan without warning
299 if np.isnan(np.sum(data)):
300 return np.nan
302 for k in range(1, n + 1):
303 S[k] = np.sum(data**k, axis=0)
304 if n == 1:
305 return S[1] * 1.0/N
306 elif n == 2:
307 return (N*S[2] - S[1]**2.0) / (N*(N - 1.0))
308 elif n == 3:
309 return (2*S[1]**3 - 3*N*S[1]*S[2] + N*N*S[3]) / (N*(N - 1.0)*(N - 2.0))
310 elif n == 4:
311 return ((-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 -
312 4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) /
313 (N*(N-1.0)*(N-2.0)*(N-3.0)))
314 else:
315 raise ValueError("Should not be here.")
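# Illustrative sketch (hypothetical helper, not part of the original module):
# per the Notes, k_1 is the sample mean and k_2 = n/(n-1) * m_2, i.e. the
# bias-corrected sample variance, which is easy to confirm numerically:
def _sketch_kstat_low_orders():
    from scipy import stats
    rng = np.random.default_rng(12345)
    x = rng.normal(size=50)
    assert np.isclose(stats.kstat(x, 1), x.mean())
    assert np.isclose(stats.kstat(x, 2), x.var(ddof=1))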
318@_axis_nan_policy_factory(
319 lambda x: x, result_to_tuple=lambda x: (x,), n_outputs=1, default_axis=None
320)
321def kstatvar(data, n=2):
322 r"""Return an unbiased estimator of the variance of the k-statistic.
324 See `kstat` for more details of the k-statistic.
326 Parameters
327 ----------
328 data : array_like
329 Input array. Note that n-D input gets flattened.
330 n : int, {1, 2}, optional
331 Default is equal to 2.
333 Returns
334 -------
335 kstatvar : float
336 The nth k-statistic variance.
338 See Also
339 --------
340 kstat: Returns the n-th k-statistic.
341 moment: Returns the n-th central moment about the mean for a sample.
343 Notes
344 -----
345 The variances of the first few k-statistics are given by:
347 .. math::
349 var(k_{1}) = \frac{\kappa_2}{n}
350 var(k_{2}) = \frac{\kappa_4}{n} + \frac{2\kappa^2_{2}}{n - 1}
351 var(k_{3}) = \frac{\kappa_6}{n} + \frac{9 \kappa_2 \kappa_4}{n - 1} +
352 \frac{9 \kappa^2_{3}}{n - 1} +
353 \frac{6 n \kappa^3_{2}}{(n-1) (n-2)}
354 var(k_{4}) = \frac{\kappa_8}{n} + \frac{16 \kappa_2 \kappa_6}{n - 1} +
355 \frac{48 \kappa_{3} \kappa_5}{n - 1} +
356 \frac{34 \kappa^2_{4}}{n-1} + \frac{72 n \kappa^2_{2} \kappa_4}{(n - 1) (n - 2)} +
357 \frac{144 n \kappa_{2} \kappa^2_{3}}{(n - 1) (n - 2)} +
358 \frac{24 (n + 1) n \kappa^4_{2}}{(n - 1) (n - 2) (n - 3)}
359 """
360 data = ravel(data)
361 N = len(data)
362 if n == 1:
363 return kstat(data, n=2) * 1.0/N
364 elif n == 2:
365 k2 = kstat(data, n=2)
366 k4 = kstat(data, n=4)
367 return (2*N*k2**2 + (N-1)*k4) / (N*(N+1))
368 else:
369 raise ValueError("Only n=1 or n=2 supported.")
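# Illustrative sketch (hypothetical helper, not part of the original module):
# for n=1 the value computed above is kstat(data, 2) / N, i.e. the familiar
# estimate var(mean) = s**2 / N of the variance of the sample mean:
def _sketch_kstatvar_of_the_mean():
    from scipy import stats
    rng = np.random.default_rng(0)
    x = rng.normal(size=100)
    assert np.isclose(stats.kstatvar(x, 1), stats.kstat(x, 2) / x.size)
    assert np.isclose(stats.kstatvar(x, 1), x.var(ddof=1) / x.size)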
372def _calc_uniform_order_statistic_medians(n):
373 """Approximations of uniform order statistic medians.
375 Parameters
376 ----------
377 n : int
378 Sample size.
380 Returns
381 -------
382 v : 1d float array
383 Approximations of the order statistic medians.
385 References
386 ----------
387 .. [1] James J. Filliben, "The Probability Plot Correlation Coefficient
388 Test for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
390 Examples
391 --------
392 Order statistics of the uniform distribution on the unit interval
393 are marginally distributed according to beta distributions.
394 The expectations of these order statistics are evenly spaced across
395 the interval, but the distributions are skewed in a way that
396 pushes the medians slightly towards the endpoints of the unit interval:
398 >>> import numpy as np
399 >>> n = 4
400 >>> k = np.arange(1, n+1)
401 >>> from scipy.stats import beta
402 >>> a = k
403 >>> b = n-k+1
404 >>> beta.mean(a, b)
405 array([0.2, 0.4, 0.6, 0.8])
406 >>> beta.median(a, b)
407 array([0.15910358, 0.38572757, 0.61427243, 0.84089642])
409 The Filliben approximation uses the exact medians of the smallest
410 and greatest order statistics, and the remaining medians are approximated
411 by points spread evenly across a sub-interval of the unit interval:
413 >>> from scipy.stats._morestats import _calc_uniform_order_statistic_medians
414 >>> _calc_uniform_order_statistic_medians(n)
415 array([0.15910358, 0.38545246, 0.61454754, 0.84089642])
417 This plot shows the skewed distributions of the order statistics
418 of a sample of size four from a uniform distribution on the unit interval:
420 >>> import matplotlib.pyplot as plt
421 >>> x = np.linspace(0.0, 1.0, num=50, endpoint=True)
422 >>> pdfs = [beta.pdf(x, a[i], b[i]) for i in range(n)]
423 >>> plt.figure()
424 >>> plt.plot(x, pdfs[0], x, pdfs[1], x, pdfs[2], x, pdfs[3])
426 """
427 v = np.empty(n, dtype=np.float64)
428 v[-1] = 0.5**(1.0 / n)
429 v[0] = 1 - v[-1]
430 i = np.arange(2, n)
431 v[1:-1] = (i - 0.3175) / (n + 0.365)
432 return v
435def _parse_dist_kw(dist, enforce_subclass=True):
436 """Parse `dist` keyword.
438 Parameters
439 ----------
440 dist : str or stats.distributions instance.
441 Several functions take `dist` as a keyword, hence this utility
442 function.
443 enforce_subclass : bool, optional
444 If True (default), `dist` needs to be a
445 `_distn_infrastructure.rv_generic` instance.
446 It can sometimes be useful to set this keyword to False, if a function
447 wants to accept objects that just look somewhat like such an instance
448 (for example, they have a ``ppf`` method).
450 """
451 if isinstance(dist, rv_generic):
452 pass
453 elif isinstance(dist, str):
454 try:
455 dist = getattr(distributions, dist)
456 except AttributeError as e:
457 raise ValueError("%s is not a valid distribution name" % dist) from e
458 elif enforce_subclass:
459 msg = ("`dist` should be a stats.distributions instance or a string "
460 "with the name of such a distribution.")
461 raise ValueError(msg)
463 return dist
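# Illustrative sketch (hypothetical helper, not part of the original module):
# `dist` may be given as a name, as a distribution object, or - when
# enforce_subclass=False - as any object exposing what the caller needs
# (for example a ``ppf`` method):
def _sketch_parse_dist_kw_usage():
    assert _parse_dist_kw('norm') is distributions.norm
    assert _parse_dist_kw(distributions.norm) is distributions.norm
    class _PpfOnly:
        def ppf(self, q):
            return q  # identity "quantile function", purely for illustration
    duck = _PpfOnly()
    assert _parse_dist_kw(duck, enforce_subclass=False) is duck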
466def _add_axis_labels_title(plot, xlabel, ylabel, title):
467 """Helper function to add axes labels and a title to stats plots."""
468 try:
469 if hasattr(plot, 'set_title'):
470 # Matplotlib Axes instance or something that looks like it
471 plot.set_title(title)
472 plot.set_xlabel(xlabel)
473 plot.set_ylabel(ylabel)
474 else:
475 # matplotlib.pyplot module
476 plot.title(title)
477 plot.xlabel(xlabel)
478 plot.ylabel(ylabel)
479 except Exception:
480 # Not an MPL object or something that looks (enough) like it.
481 # Don't crash on adding labels or title
482 pass
485def probplot(x, sparams=(), dist='norm', fit=True, plot=None, rvalue=False):
486 """
487 Calculate quantiles for a probability plot, and optionally show the plot.
489 Generates a probability plot of sample data against the quantiles of a
490 specified theoretical distribution (the normal distribution by default).
491 `probplot` optionally calculates a best-fit line for the data and plots the
492 results using Matplotlib or a given plot function.
494 Parameters
495 ----------
496 x : array_like
497 Sample/response data from which `probplot` creates the plot.
498 sparams : tuple, optional
499 Distribution-specific shape parameters (shape parameters plus location
500 and scale).
501 dist : str or stats.distributions instance, optional
502 Distribution or distribution function name. The default is 'norm' for a
503 normal probability plot. Objects that look enough like a
504 stats.distributions instance (i.e. they have a ``ppf`` method) are also
505 accepted.
506 fit : bool, optional
507 Fit a least-squares regression (best-fit) line to the sample data if
508 True (default).
509 plot : object, optional
510 If given, plots the quantiles.
511 If given and `fit` is True, also plots the least squares fit.
512 `plot` is an object that has to have methods "plot" and "text".
513 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
514 or a custom object with the same methods.
515 Default is None, which means that no plot is created.
516 rvalue : bool, optional
517 If `plot` is provided and `fit` is True, setting `rvalue` to True
518 includes the coefficient of determination on the plot.
519 Default is False.
521 Returns
522 -------
523 (osm, osr) : tuple of ndarrays
524 Tuple of theoretical quantiles (osm, or order statistic medians) and
525 ordered responses (osr). `osr` is simply sorted input `x`.
526 For details on how `osm` is calculated see the Notes section.
527 (slope, intercept, r) : tuple of floats, optional
528 Tuple containing the result of the least-squares fit, if that is
529 performed by `probplot`. `r` is the square root of the coefficient of
530 determination. If ``fit=False`` and ``plot=None``, this tuple is not
531 returned.
533 Notes
534 -----
535 Even if `plot` is given, the figure is not shown or saved by `probplot`;
536 ``plt.show()`` or ``plt.savefig('figname.png')`` should be used after
537 calling `probplot`.
539 `probplot` generates a probability plot, which should not be confused with
540 a Q-Q or a P-P plot. Statsmodels has more extensive functionality of this
541 type, see ``statsmodels.api.ProbPlot``.
543 The formula used for the theoretical quantiles (horizontal axis of the
544 probability plot) is Filliben's estimate::
546 quantiles = dist.ppf(val), for
548         0.5**(1/n),                  for i = n
549   val = (i - 0.3175) / (n + 0.365),  for i = 2, ..., n-1
550         1 - 0.5**(1/n),              for i = 1
552 where ``i`` indicates the i-th ordered value and ``n`` is the total number
553 of values.
555 Examples
556 --------
557 >>> import numpy as np
558 >>> from scipy import stats
559 >>> import matplotlib.pyplot as plt
560 >>> nsample = 100
561 >>> rng = np.random.default_rng()
563 A t distribution with small degrees of freedom:
565 >>> ax1 = plt.subplot(221)
566 >>> x = stats.t.rvs(3, size=nsample, random_state=rng)
567 >>> res = stats.probplot(x, plot=plt)
569 A t distribution with larger degrees of freedom:
571 >>> ax2 = plt.subplot(222)
572 >>> x = stats.t.rvs(25, size=nsample, random_state=rng)
573 >>> res = stats.probplot(x, plot=plt)
575 A mixture of two normal distributions with broadcasting:
577 >>> ax3 = plt.subplot(223)
578 >>> x = stats.norm.rvs(loc=[0,5], scale=[1,1.5],
579 ... size=(nsample//2,2), random_state=rng).ravel()
580 >>> res = stats.probplot(x, plot=plt)
582 A standard normal distribution:
584 >>> ax4 = plt.subplot(224)
585 >>> x = stats.norm.rvs(loc=0, scale=1, size=nsample, random_state=rng)
586 >>> res = stats.probplot(x, plot=plt)
588 Produce a new figure with a loggamma distribution, using the ``dist`` and
589 ``sparams`` keywords:
591 >>> fig = plt.figure()
592 >>> ax = fig.add_subplot(111)
593 >>> x = stats.loggamma.rvs(c=2.5, size=500, random_state=rng)
594 >>> res = stats.probplot(x, dist=stats.loggamma, sparams=(2.5,), plot=ax)
595 >>> ax.set_title("Probplot for loggamma dist with shape parameter 2.5")
597 Show the results with Matplotlib:
599 >>> plt.show()
601 """
602 x = np.asarray(x)
603 if x.size == 0:
604 if fit:
605 return (x, x), (np.nan, np.nan, 0.0)
606 else:
607 return x, x
609 osm_uniform = _calc_uniform_order_statistic_medians(len(x))
610 dist = _parse_dist_kw(dist, enforce_subclass=False)
611 if sparams is None:
612 sparams = ()
613 if isscalar(sparams):
614 sparams = (sparams,)
615 if not isinstance(sparams, tuple):
616 sparams = tuple(sparams)
618 osm = dist.ppf(osm_uniform, *sparams)
619 osr = sort(x)
620 if fit:
621 # perform a linear least squares fit.
622 slope, intercept, r, prob, _ = _stats_py.linregress(osm, osr)
624 if plot is not None:
625 plot.plot(osm, osr, 'bo')
626 if fit:
627 plot.plot(osm, slope*osm + intercept, 'r-')
628 _add_axis_labels_title(plot, xlabel='Theoretical quantiles',
629 ylabel='Ordered Values',
630 title='Probability Plot')
632 # Add R^2 value to the plot as text
633 if fit and rvalue:
634 xmin = amin(osm)
635 xmax = amax(osm)
636 ymin = amin(x)
637 ymax = amax(x)
638 posx = xmin + 0.70 * (xmax - xmin)
639 posy = ymin + 0.01 * (ymax - ymin)
640 plot.text(posx, posy, "$R^2=%1.4f$" % r**2)
642 if fit:
643 return (osm, osr), (slope, intercept, r)
644 else:
645 return osm, osr
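# Illustrative sketch (hypothetical helper, not part of the original module):
# the Notes describe the theoretical quantiles as Filliben's order statistic
# medians mapped through ``dist.ppf``; that is exactly what the returned
# ``osm`` contains, while ``osr`` is just the sorted sample:
def _sketch_probplot_theoretical_quantiles():
    from scipy import stats
    rng = np.random.default_rng(1)
    x = rng.normal(size=25)
    (osm, osr), _ = stats.probplot(x, dist='norm')
    filliben = _calc_uniform_order_statistic_medians(len(x))
    assert np.allclose(osm, stats.norm.ppf(filliben))
    assert np.array_equal(osr, np.sort(x))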
648def ppcc_max(x, brack=(0.0, 1.0), dist='tukeylambda'):
649 """Calculate the shape parameter that maximizes the PPCC.
651 The probability plot correlation coefficient (PPCC) plot can be used
652 to determine the optimal shape parameter for a one-parameter family
653 of distributions. ``ppcc_max`` returns the shape parameter that would
654 maximize the probability plot correlation coefficient for the given
655 data to a one-parameter family of distributions.
657 Parameters
658 ----------
659 x : array_like
660 Input array.
661 brack : tuple, optional
662 Triple (a,b,c) where (a<b<c). If bracket consists of two numbers (a, c)
663 then they are assumed to be a starting interval for a downhill bracket
664 search (see `scipy.optimize.brent`).
665 dist : str or stats.distributions instance, optional
666 Distribution or distribution function name. Objects that look enough
667 like a stats.distributions instance (i.e. they have a ``ppf`` method)
668 are also accepted. The default is ``'tukeylambda'``.
670 Returns
671 -------
672 shape_value : float
673 The shape parameter at which the probability plot correlation
674 coefficient reaches its max value.
676 See Also
677 --------
678 ppcc_plot, probplot, boxcox
680 Notes
681 -----
682 The `brack` keyword serves as a starting point, which is useful in corner
683 cases. One can use a plot to obtain a rough visual estimate of the location
684 of the maximum and start the search near it.
686 References
687 ----------
688 .. [1] J.J. Filliben, "The Probability Plot Correlation Coefficient Test
689 for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
690 .. [2] Engineering Statistics Handbook, NIST/SEMATECH,
691 https://www.itl.nist.gov/div898/handbook/eda/section3/ppccplot.htm
693 Examples
694 --------
695 First we generate some random data from a Weibull distribution
696 with shape parameter 2.5:
698 >>> import numpy as np
699 >>> from scipy import stats
700 >>> import matplotlib.pyplot as plt
701 >>> rng = np.random.default_rng()
702 >>> c = 2.5
703 >>> x = stats.weibull_min.rvs(c, scale=4, size=2000, random_state=rng)
705 Generate the PPCC plot for this data with the Weibull distribution.
707 >>> fig, ax = plt.subplots(figsize=(8, 6))
708 >>> res = stats.ppcc_plot(x, c/2, 2*c, dist='weibull_min', plot=ax)
710 We calculate the value where the shape should reach its maximum and a
711 red line is drawn there. The line should coincide with the highest
712 point in the PPCC graph.
714 >>> cmax = stats.ppcc_max(x, brack=(c/2, 2*c), dist='weibull_min')
715 >>> ax.axvline(cmax, color='r')
716 >>> plt.show()
718 """
719 dist = _parse_dist_kw(dist)
720 osm_uniform = _calc_uniform_order_statistic_medians(len(x))
721 osr = sort(x)
723 # this function computes the x-axis values of the probability plot
724 # and computes a linear regression (including the correlation)
725 # and returns 1-r so that a minimization function maximizes the
726 # correlation
727 def tempfunc(shape, mi, yvals, func):
728 xvals = func(mi, shape)
729 r, prob = _stats_py.pearsonr(xvals, yvals)
730 return 1 - r
732 return optimize.brent(tempfunc, brack=brack,
733 args=(osm_uniform, osr, dist.ppf))
736def ppcc_plot(x, a, b, dist='tukeylambda', plot=None, N=80):
737 """Calculate and optionally plot probability plot correlation coefficient.
739 The probability plot correlation coefficient (PPCC) plot can be used to
740 determine the optimal shape parameter for a one-parameter family of
741 distributions. It cannot be used for distributions without shape
742 parameters
743 (like the normal distribution) or with multiple shape parameters.
745 By default a Tukey-Lambda distribution (`stats.tukeylambda`) is used. A
746 Tukey-Lambda PPCC plot interpolates from long-tailed to short-tailed
747 distributions via an approximately normal one, and is therefore
748 particularly useful in practice.
750 Parameters
751 ----------
752 x : array_like
753 Input array.
754 a, b : scalar
755 Lower and upper bounds of the shape parameter to use.
756 dist : str or stats.distributions instance, optional
757 Distribution or distribution function name. Objects that look enough
758 like a stats.distributions instance (i.e. they have a ``ppf`` method)
759 are also accepted. The default is ``'tukeylambda'``.
760 plot : object, optional
761 If given, plots PPCC against the shape parameter.
762 `plot` is an object that has to have methods "plot" and "text".
763 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
764 or a custom object with the same methods.
765 Default is None, which means that no plot is created.
766 N : int, optional
767 Number of points on the horizontal axis (equally distributed from
768 `a` to `b`).
770 Returns
771 -------
772 svals : ndarray
773 The shape values for which `ppcc` was calculated.
774 ppcc : ndarray
775 The calculated probability plot correlation coefficient values.
777 See Also
778 --------
779 ppcc_max, probplot, boxcox_normplot, tukeylambda
781 References
782 ----------
783 J.J. Filliben, "The Probability Plot Correlation Coefficient Test for
784 Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
786 Examples
787 --------
788 First we generate some random data from a Weibull distribution
789 with shape parameter 2.5, and plot the histogram of the data:
791 >>> import numpy as np
792 >>> from scipy import stats
793 >>> import matplotlib.pyplot as plt
794 >>> rng = np.random.default_rng()
795 >>> c = 2.5
796 >>> x = stats.weibull_min.rvs(c, scale=4, size=2000, random_state=rng)
798 Take a look at the histogram of the data.
800 >>> fig1, ax = plt.subplots(figsize=(9, 4))
801 >>> ax.hist(x, bins=50)
802 >>> ax.set_title('Histogram of x')
803 >>> plt.show()
805 Now we explore this data with a PPCC plot as well as the related
806 probability plot and Box-Cox normplot. A red line is drawn where we
807 expect the PPCC value to be maximal (at the shape parameter ``c``
808 used above):
810 >>> fig2 = plt.figure(figsize=(12, 4))
811 >>> ax1 = fig2.add_subplot(1, 3, 1)
812 >>> ax2 = fig2.add_subplot(1, 3, 2)
813 >>> ax3 = fig2.add_subplot(1, 3, 3)
814 >>> res = stats.probplot(x, plot=ax1)
815 >>> res = stats.boxcox_normplot(x, -4, 4, plot=ax2)
816 >>> res = stats.ppcc_plot(x, c/2, 2*c, dist='weibull_min', plot=ax3)
817 >>> ax3.axvline(c, color='r')
818 >>> plt.show()
820 """
821 if b <= a:
822 raise ValueError("`b` has to be larger than `a`.")
824 svals = np.linspace(a, b, num=N)
825 ppcc = np.empty_like(svals)
826 for k, sval in enumerate(svals):
827 _, r2 = probplot(x, sval, dist=dist, fit=True)
828 ppcc[k] = r2[-1]
830 if plot is not None:
831 plot.plot(svals, ppcc, 'x')
832 _add_axis_labels_title(plot, xlabel='Shape Values',
833 ylabel='Prob Plot Corr. Coef.',
834 title='(%s) PPCC Plot' % dist)
836 return svals, ppcc
839def boxcox_llf(lmb, data):
840 r"""The boxcox log-likelihood function.
842 Parameters
843 ----------
844 lmb : scalar
845 Parameter for Box-Cox transformation. See `boxcox` for details.
846 data : array_like
847 Data to calculate Box-Cox log-likelihood for. If `data` is
848 multi-dimensional, the log-likelihood is calculated along the first
849 axis.
851 Returns
852 -------
853 llf : float or ndarray
854 Box-Cox log-likelihood of `data` given `lmb`. A float for 1-D `data`,
855 an array otherwise.
857 See Also
858 --------
859 boxcox, probplot, boxcox_normplot, boxcox_normmax
861 Notes
862 -----
863 The Box-Cox log-likelihood function is defined here as
865 .. math::
867 llf = (\lambda - 1) \sum_i(\log(x_i)) -
868 N/2 \log(\sum_i (y_i - \bar{y})^2 / N),
870 where ``y`` is the Box-Cox transformed input data ``x``.
872 Examples
873 --------
874 >>> import numpy as np
875 >>> from scipy import stats
876 >>> import matplotlib.pyplot as plt
877 >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes
879 Generate some random variates and calculate Box-Cox log-likelihood values
880 for them for a range of ``lmbda`` values:
882 >>> rng = np.random.default_rng()
883 >>> x = stats.loggamma.rvs(5, loc=10, size=1000, random_state=rng)
884 >>> lmbdas = np.linspace(-2, 10)
885 >>> llf = np.zeros(lmbdas.shape, dtype=float)
886 >>> for ii, lmbda in enumerate(lmbdas):
887 ... llf[ii] = stats.boxcox_llf(lmbda, x)
889 Also find the optimal lmbda value with `boxcox`:
891 >>> x_most_normal, lmbda_optimal = stats.boxcox(x)
893 Plot the log-likelihood as function of lmbda. Add the optimal lmbda as a
894 horizontal line to check that that's really the optimum:
896 >>> fig = plt.figure()
897 >>> ax = fig.add_subplot(111)
898 >>> ax.plot(lmbdas, llf, 'b.-')
899 >>> ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r')
900 >>> ax.set_xlabel('lmbda parameter')
901 >>> ax.set_ylabel('Box-Cox log-likelihood')
903 Now add some probability plots to show that where the log-likelihood is
904 maximized the data transformed with `boxcox` looks closest to normal:
906 >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right'
907 >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
908 ... xt = stats.boxcox(x, lmbda=lmbda)
909 ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
910 ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
911 ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
912 ... ax_inset.set_xticklabels([])
913 ... ax_inset.set_yticklabels([])
914 ... ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)
916 >>> plt.show()
918 """
919 data = np.asarray(data)
920 N = data.shape[0]
921 if N == 0:
922 return np.nan
924 logdata = np.log(data)
926 # Compute the variance of the transformed data.
927 if lmb == 0:
928 variance = np.var(logdata, axis=0)
929 else:
930 # Transform without the constant offset 1/lmb. The offset does
931 # not affect the variance, and the subtraction of the offset can
932 # lead to loss of precision.
933 variance = np.var(data**lmb / lmb, axis=0)
935 return (lmb - 1) * np.sum(logdata, axis=0) - N/2 * np.log(variance)
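# Illustrative sketch (hypothetical helper, not part of the original module):
# evaluating the Notes formula directly, with ``y`` the Box-Cox transform of
# ``x``, gives the same value as the implementation above (which drops the
# constant 1/lmb offset only because it does not change the variance):
def _sketch_boxcox_llf_formula():
    from scipy import stats, special
    rng = np.random.default_rng(2)
    x = rng.gamma(shape=3.0, size=200)
    lmbda = 1.7
    y = special.boxcox(x, lmbda)
    by_hand = (lmbda - 1) * np.sum(np.log(x)) - x.size / 2 * np.log(np.var(y))
    assert np.isclose(by_hand, stats.boxcox_llf(lmbda, x))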
938def _boxcox_conf_interval(x, lmax, alpha):
939 # Need to find the lambda for which
940 # f(x,lmbda) >= f(x,lmax) - 0.5*chi^2_alpha;1
941 fac = 0.5 * distributions.chi2.ppf(1 - alpha, 1)
942 target = boxcox_llf(lmax, x) - fac
944 def rootfunc(lmbda, data, target):
945 return boxcox_llf(lmbda, data) - target
947 # Find positive endpoint of interval in which answer is to be found
948 newlm = lmax + 0.5
949 N = 0
950 while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
951 newlm += 0.1
952 N += 1
954 if N == 500:
955 raise RuntimeError("Could not find endpoint.")
957 lmplus = optimize.brentq(rootfunc, lmax, newlm, args=(x, target))
959 # Now find negative interval in the same way
960 newlm = lmax - 0.5
961 N = 0
962 while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
963 newlm -= 0.1
964 N += 1
966 if N == 500:
967 raise RuntimeError("Could not find endpoint.")
969 lmminus = optimize.brentq(rootfunc, newlm, lmax, args=(x, target))
970 return lmminus, lmplus
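# Illustrative sketch (hypothetical helper, not part of the original module):
# the endpoints found above are the roots of
#     boxcox_llf(lmbda, x) == boxcox_llf(lmax, x) - 0.5 * chi2.ppf(1 - alpha, 1)
# which can be checked against the interval returned by `boxcox`:
def _sketch_boxcox_ci_cutoff(alpha=0.05):
    from scipy import stats
    rng = np.random.default_rng(3)
    x = rng.gamma(shape=2.0, size=500)
    _, lmax, (lo, hi) = stats.boxcox(x, alpha=alpha)
    cutoff = stats.boxcox_llf(lmax, x) - 0.5 * stats.chi2.ppf(1 - alpha, 1)
    assert np.isclose(stats.boxcox_llf(lo, x), cutoff, rtol=1e-4)
    assert np.isclose(stats.boxcox_llf(hi, x), cutoff, rtol=1e-4)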
973def boxcox(x, lmbda=None, alpha=None, optimizer=None):
974 r"""Return a dataset transformed by a Box-Cox power transformation.
976 Parameters
977 ----------
978 x : ndarray
979 Input array to be transformed.
981 If `lmbda` is not None, this is an alias of
982 `scipy.special.boxcox`.
983 Returns nan if ``x < 0``; returns -inf if ``x == 0 and lmbda < 0``.
985 If `lmbda` is None, array must be positive, 1-dimensional, and
986 non-constant.
988 lmbda : scalar, optional
989 If `lmbda` is None (default), find the value of `lmbda` that maximizes
990 the log-likelihood function and return it as the second output
991 argument.
993 If `lmbda` is not None, do the transformation for that value.
995 alpha : float, optional
996 If `lmbda` is None and `alpha` is not None, return the
997 ``100 * (1-alpha)%`` confidence interval for `lmbda` as the third
998 output argument. Must be between 0.0 and 1.0.
1000 If `lmbda` is not None, `alpha` is ignored.
1001 optimizer : callable, optional
1002 If `lmbda` is None, `optimizer` is the scalar optimizer used to find
1003 the value of `lmbda` that minimizes the negative log-likelihood
1004 function. `optimizer` is a callable that accepts one argument:
1006 fun : callable
1007 The objective function, which evaluates the negative
1008 log-likelihood function at a provided value of `lmbda`
1010 and returns an object, such as an instance of
1011 `scipy.optimize.OptimizeResult`, which holds the optimal value of
1012 `lmbda` in an attribute `x`.
1014 See the example in `boxcox_normmax` or the documentation of
1015 `scipy.optimize.minimize_scalar` for more information.
1017 If `lmbda` is not None, `optimizer` is ignored.
1019 Returns
1020 -------
1021 boxcox : ndarray
1022 Box-Cox power transformed array.
1023 maxlog : float, optional
1024 If the `lmbda` parameter is None, the second returned argument is
1025 the `lmbda` that maximizes the log-likelihood function.
1026 (min_ci, max_ci) : tuple of float, optional
1027 If `lmbda` parameter is None and `alpha` is not None, this returned
1028 tuple of floats represents the minimum and maximum confidence limits
1029 given `alpha`.
1031 See Also
1032 --------
1033 probplot, boxcox_normplot, boxcox_normmax, boxcox_llf
1035 Notes
1036 -----
1037 The Box-Cox transform is given by::
1039 y = (x**lmbda - 1) / lmbda, for lmbda != 0
1040 log(x), for lmbda = 0
1042 `boxcox` requires the input data to be positive. Sometimes a Box-Cox
1043 transformation provides a shift parameter to achieve this; `boxcox` does
1044 not. Such a shift parameter is equivalent to adding a positive constant to
1045 `x` before calling `boxcox`.
1047 The confidence limits returned when `alpha` is provided give the interval
1048 where:
1050 .. math::
1052 llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1),
1054 with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared
1055 function.
1057 References
1058 ----------
1059 G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the
1060 Royal Statistical Society B, 26, 211-252 (1964).
1062 Examples
1063 --------
1064 >>> from scipy import stats
1065 >>> import matplotlib.pyplot as plt
1067 We generate some random variates from a non-normal distribution and make a
1068 probability plot for it, to show it is non-normal in the tails:
1070 >>> fig = plt.figure()
1071 >>> ax1 = fig.add_subplot(211)
1072 >>> x = stats.loggamma.rvs(5, size=500) + 5
1073 >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
1074 >>> ax1.set_xlabel('')
1075 >>> ax1.set_title('Probplot against normal distribution')
1077 We now use `boxcox` to transform the data so it's closest to normal:
1079 >>> ax2 = fig.add_subplot(212)
1080 >>> xt, _ = stats.boxcox(x)
1081 >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
1082 >>> ax2.set_title('Probplot after Box-Cox transformation')
1084 >>> plt.show()
1086 """
1087 x = np.asarray(x)
1089 if lmbda is not None: # single transformation
1090 return special.boxcox(x, lmbda)
1092 if x.ndim != 1:
1093 raise ValueError("Data must be 1-dimensional.")
1095 if x.size == 0:
1096 return x
1098 if np.all(x == x[0]):
1099 raise ValueError("Data must not be constant.")
1101 if np.any(x <= 0):
1102 raise ValueError("Data must be positive.")
1104 # If lmbda=None, find the lmbda that maximizes the log-likelihood function.
1105 lmax = boxcox_normmax(x, method='mle', optimizer=optimizer)
1106 y = boxcox(x, lmax)
1108 if alpha is None:
1109 return y, lmax
1110 else:
1111 # Find confidence interval
1112 interval = _boxcox_conf_interval(x, lmax, alpha)
1113 return y, lmax, interval
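# Illustrative sketch (hypothetical helper, not part of the original module):
# as documented above, passing an explicit ``lmbda`` makes `boxcox` an alias
# of `scipy.special.boxcox`, so no fitting takes place:
def _sketch_boxcox_fixed_lmbda():
    from scipy import stats, special
    x = np.array([0.5, 1.0, 2.0, 4.0])
    for lmbda in (-1.0, 0.0, 0.5, 2.0):
        assert np.allclose(stats.boxcox(x, lmbda=lmbda), special.boxcox(x, lmbda))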
1116def boxcox_normmax(x, brack=None, method='pearsonr', optimizer=None):
1117 """Compute optimal Box-Cox transform parameter for input data.
1119 Parameters
1120 ----------
1121 x : array_like
1122 Input array.
1123 brack : 2-tuple, optional, default (-2.0, 2.0)
1124 The starting interval for a downhill bracket search for the default
1125 `optimize.brent` solver. Note that this is in most cases not
1126 critical; the final result is allowed to be outside this bracket.
1127 If `optimizer` is passed, `brack` must be None.
1128 method : str, optional
1129 The method to determine the optimal transform parameter (`boxcox`
1130 ``lmbda`` parameter). Options are:
1132 'pearsonr' (default)
1133 Maximizes the Pearson correlation coefficient between
1134 ``y = boxcox(x)`` and the expected values for ``y`` if `x` would be
1135 normally-distributed.
1137 'mle'
1138 Minimizes the log-likelihood `boxcox_llf`. This is the method used
1139 in `boxcox`.
1141 'all'
1142 Use all optimization methods available, and return all results.
1143 Useful to compare different methods.
1144 optimizer : callable, optional
1145 `optimizer` is a callable that accepts one argument:
1147 fun : callable
1148 The objective function to be optimized. `fun` accepts one argument,
1149 the Box-Cox transform parameter `lmbda`, and returns the negative
1150 log-likelihood function at the provided value. The job of `optimizer`
1151 is to find the value of `lmbda` that minimizes `fun`.
1153 and returns an object, such as an instance of
1154 `scipy.optimize.OptimizeResult`, which holds the optimal value of
1155 `lmbda` in an attribute `x`.
1157 See the example below or the documentation of
1158 `scipy.optimize.minimize_scalar` for more information.
1160 Returns
1161 -------
1162 maxlog : float or ndarray
1163 The optimal transform parameter found. An array instead of a scalar
1164 for ``method='all'``.
1166 See Also
1167 --------
1168 boxcox, boxcox_llf, boxcox_normplot, scipy.optimize.minimize_scalar
1170 Examples
1171 --------
1172 >>> import numpy as np
1173 >>> from scipy import stats
1174 >>> import matplotlib.pyplot as plt
1176 We can generate some data and determine the optimal ``lmbda`` in various
1177 ways:
1179 >>> rng = np.random.default_rng()
1180 >>> x = stats.loggamma.rvs(5, size=30, random_state=rng) + 5
1181 >>> y, lmax_mle = stats.boxcox(x)
1182 >>> lmax_pearsonr = stats.boxcox_normmax(x)
1184 >>> lmax_mle
1185 2.217563431465757
1186 >>> lmax_pearsonr
1187 2.238318660200961
1188 >>> stats.boxcox_normmax(x, method='all')
1189 array([2.23831866, 2.21756343])
1191 >>> fig = plt.figure()
1192 >>> ax = fig.add_subplot(111)
1193 >>> prob = stats.boxcox_normplot(x, -10, 10, plot=ax)
1194 >>> ax.axvline(lmax_mle, color='r')
1195 >>> ax.axvline(lmax_pearsonr, color='g', ls='--')
1197 >>> plt.show()
1199 Alternatively, we can define our own `optimizer` function. Suppose we
1200 are only interested in values of `lmbda` on the interval [6, 7], we
1201 want to use `scipy.optimize.minimize_scalar` with ``method='bounded'``,
1202 and we want to use tighter tolerances when optimizing the log-likelihood
1203 function. To do this, we define a function that accepts positional argument
1204 `fun` and uses `scipy.optimize.minimize_scalar` to minimize `fun` subject
1205 to the provided bounds and tolerances:
1207 >>> from scipy import optimize
1208 >>> options = {'xatol': 1e-12} # absolute tolerance on `x`
1209 >>> def optimizer(fun):
1210 ... return optimize.minimize_scalar(fun, bounds=(6, 7),
1211 ... method="bounded", options=options)
1212 >>> stats.boxcox_normmax(x, optimizer=optimizer)
1213 6.000...
1214 """
1215 # If optimizer is not given, define default 'brent' optimizer.
1216 if optimizer is None:
1218 # Set default value for `brack`.
1219 if brack is None:
1220 brack = (-2.0, 2.0)
1222 def _optimizer(func, args):
1223 return optimize.brent(func, args=args, brack=brack)
1225 # Otherwise check optimizer.
1226 else:
1227 if not callable(optimizer):
1228 raise ValueError("`optimizer` must be a callable")
1230 if brack is not None:
1231 raise ValueError("`brack` must be None if `optimizer` is given")
1233 # `optimizer` is expected to return a `OptimizeResult` object, we here
1234 # get the solution to the optimization problem.
1235 def _optimizer(func, args):
1236 def func_wrapped(x):
1237 return func(x, *args)
1238 return getattr(optimizer(func_wrapped), 'x', None)
1240 def _pearsonr(x):
1241 osm_uniform = _calc_uniform_order_statistic_medians(len(x))
1242 xvals = distributions.norm.ppf(osm_uniform)
1244 def _eval_pearsonr(lmbda, xvals, samps):
1245 # This function computes the x-axis values of the probability plot
1246 # and computes a linear regression (including the correlation) and
1247 # returns ``1 - r`` so that a minimization function maximizes the
1248 # correlation.
1249 y = boxcox(samps, lmbda)
1250 yvals = np.sort(y)
1251 r, prob = _stats_py.pearsonr(xvals, yvals)
1252 return 1 - r
1254 return _optimizer(_eval_pearsonr, args=(xvals, x))
1256 def _mle(x):
1257 def _eval_mle(lmb, data):
1258 # function to minimize
1259 return -boxcox_llf(lmb, data)
1261 return _optimizer(_eval_mle, args=(x,))
1263 def _all(x):
1264 maxlog = np.empty(2, dtype=float)
1265 maxlog[0] = _pearsonr(x)
1266 maxlog[1] = _mle(x)
1267 return maxlog
1269 methods = {'pearsonr': _pearsonr,
1270 'mle': _mle,
1271 'all': _all}
1272 if method not in methods.keys():
1273 raise ValueError("Method %s not recognized." % method)
1275 optimfunc = methods[method]
1276 res = optimfunc(x)
1277 if res is None:
1278 message = ("`optimizer` must return an object containing the optimal "
1279 "`lmbda` in attribute `x`")
1280 raise ValueError(message)
1281 return res
1284def _normplot(method, x, la, lb, plot=None, N=80):
1285 """Compute parameters for a Box-Cox or Yeo-Johnson normality plot,
1286 optionally show it.
1288 See `boxcox_normplot` or `yeojohnson_normplot` for details.
1289 """
1291 if method == 'boxcox':
1292 title = 'Box-Cox Normality Plot'
1293 transform_func = boxcox
1294 else:
1295 title = 'Yeo-Johnson Normality Plot'
1296 transform_func = yeojohnson
1298 x = np.asarray(x)
1299 if x.size == 0:
1300 return x
1302 if lb <= la:
1303 raise ValueError("`lb` has to be larger than `la`.")
1305 if method == 'boxcox' and np.any(x <= 0):
1306 raise ValueError("Data must be positive.")
1308 lmbdas = np.linspace(la, lb, num=N)
1309 ppcc = lmbdas * 0.0
1310 for i, val in enumerate(lmbdas):
1311 # Determine for each lmbda the square root of correlation coefficient
1312 # of transformed x
1313 z = transform_func(x, lmbda=val)
1314 _, (_, _, r) = probplot(z, dist='norm', fit=True)
1315 ppcc[i] = r
1317 if plot is not None:
1318 plot.plot(lmbdas, ppcc, 'x')
1319 _add_axis_labels_title(plot, xlabel='$\\lambda$',
1320 ylabel='Prob Plot Corr. Coef.',
1321 title=title)
1323 return lmbdas, ppcc
1326def boxcox_normplot(x, la, lb, plot=None, N=80):
1327 """Compute parameters for a Box-Cox normality plot, optionally show it.
1329 A Box-Cox normality plot shows graphically what the best transformation
1330 parameter is to use in `boxcox` to obtain a distribution that is close
1331 to normal.
1333 Parameters
1334 ----------
1335 x : array_like
1336 Input array.
1337 la, lb : scalar
1338 The lower and upper bounds for the ``lmbda`` values to pass to `boxcox`
1339 for Box-Cox transformations. These are also the limits of the
1340 horizontal axis of the plot if that is generated.
1341 plot : object, optional
1342 If given, plots the quantiles and least squares fit.
1343 `plot` is an object that has to have methods "plot" and "text".
1344 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
1345 or a custom object with the same methods.
1346 Default is None, which means that no plot is created.
1347 N : int, optional
1348 Number of points on the horizontal axis (equally distributed from
1349 `la` to `lb`).
1351 Returns
1352 -------
1353 lmbdas : ndarray
1354 The ``lmbda`` values for which a Box-Cox transform was done.
1355 ppcc : ndarray
1356 Probability Plot Correlation Coefficient, as obtained from `probplot`
1357 when fitting the Box-Cox transformed input `x` against a normal
1358 distribution.
1360 See Also
1361 --------
1362 probplot, boxcox, boxcox_normmax, boxcox_llf, ppcc_max
1364 Notes
1365 -----
1366 Even if `plot` is given, the figure is not shown or saved by
1367 `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
1368 should be used after calling `boxcox_normplot`.
1370 Examples
1371 --------
1372 >>> from scipy import stats
1373 >>> import matplotlib.pyplot as plt
1375 Generate some non-normally distributed data, and create a Box-Cox plot:
1377 >>> x = stats.loggamma.rvs(5, size=500) + 5
1378 >>> fig = plt.figure()
1379 >>> ax = fig.add_subplot(111)
1380 >>> prob = stats.boxcox_normplot(x, -20, 20, plot=ax)
1382 Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in
1383 the same plot:
1385 >>> _, maxlog = stats.boxcox(x)
1386 >>> ax.axvline(maxlog, color='r')
1388 >>> plt.show()
1390 """
1391 return _normplot('boxcox', x, la, lb, plot, N)
1394def yeojohnson(x, lmbda=None):
1395 r"""Return a dataset transformed by a Yeo-Johnson power transformation.
1397 Parameters
1398 ----------
1399 x : ndarray
1400 Input array. Should be 1-dimensional.
1401 lmbda : float, optional
1402 If ``lmbda`` is ``None``, find the lambda that maximizes the
1403 log-likelihood function and return it as the second output argument.
1404 Otherwise the transformation is done for the given value.
1406 Returns
1407 -------
1408 yeojohnson: ndarray
1409 Yeo-Johnson power transformed array.
1410 maxlog : float, optional
1411 If the `lmbda` parameter is None, the second returned argument is
1412 the lambda that maximizes the log-likelihood function.
1414 See Also
1415 --------
1416 probplot, yeojohnson_normplot, yeojohnson_normmax, yeojohnson_llf, boxcox
1418 Notes
1419 -----
1420 The Yeo-Johnson transform is given by::
1422 y = ((x + 1)**lmbda - 1) / lmbda, for x >= 0, lmbda != 0
1423 log(x + 1), for x >= 0, lmbda = 0
1424 -((-x + 1)**(2 - lmbda) - 1) / (2 - lmbda), for x < 0, lmbda != 2
1425 -log(-x + 1), for x < 0, lmbda = 2
1427 Unlike `boxcox`, `yeojohnson` does not require the input data to be
1428 positive.
1430 .. versionadded:: 1.2.0
1433 References
1434 ----------
1435 I. Yeo and R.A. Johnson, "A New Family of Power Transformations to
1436 Improve Normality or Symmetry", Biometrika 87.4 (2000): 954-959.
1439 Examples
1440 --------
1441 >>> from scipy import stats
1442 >>> import matplotlib.pyplot as plt
1444 We generate some random variates from a non-normal distribution and make a
1445 probability plot for it, to show it is non-normal in the tails:
1447 >>> fig = plt.figure()
1448 >>> ax1 = fig.add_subplot(211)
1449 >>> x = stats.loggamma.rvs(5, size=500) + 5
1450 >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
1451 >>> ax1.set_xlabel('')
1452 >>> ax1.set_title('Probplot against normal distribution')
1454 We now use `yeojohnson` to transform the data so it's closest to normal:
1456 >>> ax2 = fig.add_subplot(212)
1457 >>> xt, lmbda = stats.yeojohnson(x)
1458 >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
1459 >>> ax2.set_title('Probplot after Yeo-Johnson transformation')
1461 >>> plt.show()
1463 """
1464 x = np.asarray(x)
1465 if x.size == 0:
1466 return x
1468 if np.issubdtype(x.dtype, np.complexfloating):
1469 raise ValueError('Yeo-Johnson transformation is not defined for '
1470 'complex numbers.')
1472 if np.issubdtype(x.dtype, np.integer):
1473 x = x.astype(np.float64, copy=False)
1475 if lmbda is not None:
1476 return _yeojohnson_transform(x, lmbda)
1478 # if lmbda=None, find the lmbda that maximizes the log-likelihood function.
1479 lmax = yeojohnson_normmax(x)
1480 y = _yeojohnson_transform(x, lmax)
1482 return y, lmax
1485def _yeojohnson_transform(x, lmbda):
1486 """Returns `x` transformed by the Yeo-Johnson power transform with given
1487 parameter `lmbda`.
1488 """
1489 out = np.zeros_like(x)
1490 pos = x >= 0 # binary mask
1492 # when x >= 0
1493 if abs(lmbda) < np.spacing(1.):
1494 out[pos] = np.log1p(x[pos])
1495 else: # lmbda != 0
1496 out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
1498 # when x < 0
1499 if abs(lmbda - 2) > np.spacing(1.):
1500 out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
1501 else: # lmbda == 2
1502 out[~pos] = -np.log1p(-x[~pos])
1504 return out
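# Illustrative sketch (hypothetical helper, not part of the original module):
# spot-check the piecewise formula quoted in the `yeojohnson` Notes against
# this helper for one positive and one negative value:
def _sketch_yeojohnson_branches():
    x = np.array([2.0, -2.0])
    lmbda = 0.5
    expected = np.array([((x[0] + 1)**lmbda - 1) / lmbda,
                         -((-x[1] + 1)**(2 - lmbda) - 1) / (2 - lmbda)])
    assert np.allclose(_yeojohnson_transform(x, lmbda), expected)
    # lmbda = 0 and lmbda = 2 fall back to the logarithmic branches:
    assert np.allclose(_yeojohnson_transform(x, 0.0),
                       [np.log1p(x[0]), -((-x[1] + 1)**2 - 1) / 2])
    assert np.allclose(_yeojohnson_transform(x, 2.0),
                       [((x[0] + 1)**2 - 1) / 2, -np.log1p(-x[1])])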
1507def yeojohnson_llf(lmb, data):
1508 r"""The yeojohnson log-likelihood function.
1510 Parameters
1511 ----------
1512 lmb : scalar
1513 Parameter for Yeo-Johnson transformation. See `yeojohnson` for
1514 details.
1515 data : array_like
1516 Data to calculate Yeo-Johnson log-likelihood for. If `data` is
1517 multi-dimensional, the log-likelihood is calculated along the first
1518 axis.
1520 Returns
1521 -------
1522 llf : float
1523 Yeo-Johnson log-likelihood of `data` given `lmb`.
1525 See Also
1526 --------
1527 yeojohnson, probplot, yeojohnson_normplot, yeojohnson_normmax
1529 Notes
1530 -----
1531 The Yeo-Johnson log-likelihood function is defined here as
1533 .. math::
1535 llf = -N/2 \log(\hat{\sigma}^2) + (\lambda - 1)
1536 \sum_i \text{ sign }(x_i)\log(|x_i| + 1)
1538 where :math:`\hat{\sigma}^2` is the estimated variance of the Yeo-Johnson
1539 transformed input data ``x``.
1541 .. versionadded:: 1.2.0
1543 Examples
1544 --------
1545 >>> import numpy as np
1546 >>> from scipy import stats
1547 >>> import matplotlib.pyplot as plt
1548 >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes
1550 Generate some random variates and calculate Yeo-Johnson log-likelihood
1551 values for them for a range of ``lmbda`` values:
1553 >>> x = stats.loggamma.rvs(5, loc=10, size=1000)
1554 >>> lmbdas = np.linspace(-2, 10)
1555 >>> llf = np.zeros(lmbdas.shape, dtype=float)
1556 >>> for ii, lmbda in enumerate(lmbdas):
1557 ... llf[ii] = stats.yeojohnson_llf(lmbda, x)
1559 Also find the optimal lmbda value with `yeojohnson`:
1561 >>> x_most_normal, lmbda_optimal = stats.yeojohnson(x)
1563 Plot the log-likelihood as function of lmbda. Add the optimal lmbda as a
1564 horizontal line to check that that's really the optimum:
1566 >>> fig = plt.figure()
1567 >>> ax = fig.add_subplot(111)
1568 >>> ax.plot(lmbdas, llf, 'b.-')
1569 >>> ax.axhline(stats.yeojohnson_llf(lmbda_optimal, x), color='r')
1570 >>> ax.set_xlabel('lmbda parameter')
1571 >>> ax.set_ylabel('Yeo-Johnson log-likelihood')
1573 Now add some probability plots to show that where the log-likelihood is
1574 maximized the data transformed with `yeojohnson` looks closest to normal:
1576 >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right'
1577 >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
1578 ... xt = stats.yeojohnson(x, lmbda=lmbda)
1579 ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
1580 ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
1581 ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
1582 ... ax_inset.set_xticklabels([])
1583 ... ax_inset.set_yticklabels([])
1584 ... ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)
1586 >>> plt.show()
1588 """
1589 data = np.asarray(data)
1590 n_samples = data.shape[0]
1592 if n_samples == 0:
1593 return np.nan
1595 trans = _yeojohnson_transform(data, lmb)
1596 trans_var = trans.var(axis=0)
1597 loglike = np.empty_like(trans_var)
1599 # Avoid RuntimeWarning raised by np.log when the variance is too low
1600 tiny_variance = trans_var < np.finfo(trans_var.dtype).tiny
1601 loglike[tiny_variance] = np.inf
1603 loglike[~tiny_variance] = (
1604 -n_samples / 2 * np.log(trans_var[~tiny_variance]))
1605 loglike[~tiny_variance] += (
1606 (lmb - 1) * (np.sign(data) * np.log(np.abs(data) + 1)).sum(axis=0))
1607 return loglike
1610def yeojohnson_normmax(x, brack=(-2, 2)):
1611 """Compute optimal Yeo-Johnson transform parameter.
1613 Compute optimal Yeo-Johnson transform parameter for input data, using
1614 maximum likelihood estimation.
1616 Parameters
1617 ----------
1618 x : array_like
1619 Input array.
1620 brack : 2-tuple, optional
1621 The starting interval for a downhill bracket search with
1622 `optimize.brent`. Note that this is in most cases not critical; the
1623 final result is allowed to be outside this bracket.
1625 Returns
1626 -------
1627 maxlog : float
1628 The optimal transform parameter found.
1630 See Also
1631 --------
1632 yeojohnson, yeojohnson_llf, yeojohnson_normplot
1634 Notes
1635 -----
1636 .. versionadded:: 1.2.0
1638 Examples
1639 --------
1640 >>> import numpy as np
1641 >>> from scipy import stats
1642 >>> import matplotlib.pyplot as plt
1644 Generate some data and determine optimal ``lmbda``
1646 >>> rng = np.random.default_rng()
1647 >>> x = stats.loggamma.rvs(5, size=30, random_state=rng) + 5
1648 >>> lmax = stats.yeojohnson_normmax(x)
1650 >>> fig = plt.figure()
1651 >>> ax = fig.add_subplot(111)
1652 >>> prob = stats.yeojohnson_normplot(x, -10, 10, plot=ax)
1653 >>> ax.axvline(lmax, color='r')
1655 >>> plt.show()
1657 """
1658 def _neg_llf(lmbda, data):
1659 llf = yeojohnson_llf(lmbda, data)
1660 # reject likelihoods that are inf which are likely due to small
1661 # variance in the transformed space
1662 llf[np.isinf(llf)] = -np.inf
1663 return -llf
1665 with np.errstate(invalid='ignore'):
1666 return optimize.brent(_neg_llf, brack=brack, args=(x,))
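# Illustrative sketch (hypothetical helper, not part of the original module):
# the returned parameter should give a log-likelihood at least as large as
# any point on a coarse grid over the bracketing interval:
def _sketch_yeojohnson_normmax_is_a_maximum():
    from scipy import stats
    rng = np.random.default_rng(4)
    x = stats.loggamma.rvs(5, size=200, random_state=rng) + 5
    lmax = stats.yeojohnson_normmax(x)
    grid = np.linspace(-2, 2, 41)
    best_on_grid = max(stats.yeojohnson_llf(lmb, x) for lmb in grid)
    assert stats.yeojohnson_llf(lmax, x) >= best_on_grid - 1e-8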
1669def yeojohnson_normplot(x, la, lb, plot=None, N=80):
1670 """Compute parameters for a Yeo-Johnson normality plot, optionally show it.
1672 A Yeo-Johnson normality plot shows graphically what the best
1673 transformation parameter is to use in `yeojohnson` to obtain a
1674 distribution that is close to normal.
1676 Parameters
1677 ----------
1678 x : array_like
1679 Input array.
1680 la, lb : scalar
1681 The lower and upper bounds for the ``lmbda`` values to pass to
1682 `yeojohnson` for Yeo-Johnson transformations. These are also the
1683 limits of the horizontal axis of the plot if that is generated.
1684 plot : object, optional
1685 If given, plots the quantiles and least squares fit.
1686 `plot` is an object that has to have methods "plot" and "text".
1687 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
1688 or a custom object with the same methods.
1689 Default is None, which means that no plot is created.
1690 N : int, optional
1691 Number of points on the horizontal axis (equally distributed from
1692 `la` to `lb`).
1694 Returns
1695 -------
1696 lmbdas : ndarray
1697 The ``lmbda`` values for which a Yeo-Johnson transform was done.
1698 ppcc : ndarray
1699 Probability Plot Correlation Coefficient, as obtained from `probplot`
1700 when fitting the Yeo-Johnson transformed input `x` against a normal
1701 distribution.
1703 See Also
1704 --------
1705 probplot, yeojohnson, yeojohnson_normmax, yeojohnson_llf, ppcc_max
1707 Notes
1708 -----
1709 Even if `plot` is given, the figure is not shown or saved by
1710 `yeojohnson_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
1711 should be used after calling `yeojohnson_normplot`.
1713 .. versionadded:: 1.2.0
1715 Examples
1716 --------
1717 >>> from scipy import stats
1718 >>> import matplotlib.pyplot as plt
1720 Generate some non-normally distributed data, and create a Yeo-Johnson plot:
1722 >>> x = stats.loggamma.rvs(5, size=500) + 5
1723 >>> fig = plt.figure()
1724 >>> ax = fig.add_subplot(111)
1725 >>> prob = stats.yeojohnson_normplot(x, -20, 20, plot=ax)
1727 Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in
1728 the same plot:
1730 >>> _, maxlog = stats.yeojohnson(x)
1731 >>> ax.axvline(maxlog, color='r')
1733 >>> plt.show()
1735 """
1736 return _normplot('yeojohnson', x, la, lb, plot, N)
1739ShapiroResult = namedtuple('ShapiroResult', ('statistic', 'pvalue'))
1742def shapiro(x):
1743 """Perform the Shapiro-Wilk test for normality.
1745 The Shapiro-Wilk test tests the null hypothesis that the
1746 data was drawn from a normal distribution.
1748 Parameters
1749 ----------
1750 x : array_like
1751 Array of sample data.
1753 Returns
1754 -------
1755 statistic : float
1756 The test statistic.
1757 p-value : float
1758 The p-value for the hypothesis test.
1760 See Also
1761 --------
1762 anderson : The Anderson-Darling test for normality
1763 kstest : The Kolmogorov-Smirnov test for goodness of fit.
1765 Notes
1766 -----
1767 The algorithm used is described in [4]_ but censoring parameters as
1768 described are not implemented. For N > 5000 the W test statistic is accurate
1769 but the p-value may not be.
1771 The chance of rejecting the null hypothesis when it is true is close to 5%
1772 regardless of sample size.
1774 References
1775 ----------
1776 .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
1777 .. [2] Shapiro, S. S. & Wilk, M.B (1965). An analysis of variance test for
1778 normality (complete samples), Biometrika, Vol. 52, pp. 591-611.
1779 .. [3] Razali, N. M. & Wah, Y. B. (2011) Power comparisons of Shapiro-Wilk,
1780 Kolmogorov-Smirnov, Lilliefors and Anderson-Darling tests, Journal of
1781 Statistical Modeling and Analytics, Vol. 2, pp. 21-33.
1782 .. [4] ALGORITHM AS R94 APPL. STATIST. (1995) VOL. 44, NO. 4.
1784 Examples
1785 --------
1786 >>> import numpy as np
1787 >>> from scipy import stats
1788 >>> rng = np.random.default_rng()
1789 >>> x = stats.norm.rvs(loc=5, scale=3, size=100, random_state=rng)
1790 >>> shapiro_test = stats.shapiro(x)
1791 >>> shapiro_test
1792 ShapiroResult(statistic=0.9813305735588074, pvalue=0.16855233907699585)
1793 >>> shapiro_test.statistic
1794 0.9813305735588074
1795 >>> shapiro_test.pvalue
1796 0.16855233907699585
1798 """
1799 x = np.ravel(x)
1801 N = len(x)
1802 if N < 3:
1803 raise ValueError("Data must be at least length 3.")
1805 x = x - np.median(x)
1807 a = zeros(N, 'f')
1808 init = 0
1810 y = sort(x)
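# _statlib.swilk is the compiled AS R94 routine (ref. [4] in the docstring);
# it returns the updated coefficient array `a`, the W statistic `w`, its
# p-value `pw`, and a fault code `ifault`.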
1811 a, w, pw, ifault = _statlib.swilk(y, a[:N//2], init)
1812 if ifault not in [0, 2]:
1813 warnings.warn("Input data for shapiro has range zero. The results "
1814 "may not be accurate.")
1815 if N > 5000:
1816 warnings.warn("p-value may not be accurate for N > 5000.")
1818 return ShapiroResult(w, pw)
1821# Values from Stephens, M A, "EDF Statistics for Goodness of Fit and
1822# Some Comparisons", Journal of the American Statistical
1823# Association, Vol. 69, Issue 347, Sept. 1974, pp 730-737
1824_Avals_norm = array([0.576, 0.656, 0.787, 0.918, 1.092])
1825_Avals_expon = array([0.922, 1.078, 1.341, 1.606, 1.957])
1826# From Stephens, M A, "Goodness of Fit for the Extreme Value Distribution",
1827# Biometrika, Vol. 64, Issue 3, Dec. 1977, pp 583-588.
1828_Avals_gumbel = array([0.474, 0.637, 0.757, 0.877, 1.038])
1829# From Stephens, M A, "Tests of Fit for the Logistic Distribution Based
1830# on the Empirical Distribution Function.", Biometrika,
1831# Vol. 66, Issue 3, Dec. 1979, pp 591-595.
1832_Avals_logistic = array([0.426, 0.563, 0.660, 0.769, 0.906, 1.010])
1835AndersonResult = _make_tuple_bunch('AndersonResult',
1836 ['statistic', 'critical_values',
1837 'significance_level'], ['fit_result'])
1840def anderson(x, dist='norm'):
1841 """Anderson-Darling test for data coming from a particular distribution.
1843 The Anderson-Darling test tests the null hypothesis that a sample is
1844 drawn from a population that follows a particular distribution.
1845 For the Anderson-Darling test, the critical values depend on
1846 which distribution is being tested against. This function works
1847 for normal, exponential, logistic, or Gumbel (Extreme Value
1848 Type I) distributions.
1850 Parameters
1851 ----------
1852 x : array_like
1853 Array of sample data.
1854 dist : {'norm', 'expon', 'logistic', 'gumbel', 'gumbel_l', 'gumbel_r', 'extreme1'}, optional
1855 The type of distribution to test against. The default is 'norm'.
1856 The names 'extreme1', 'gumbel_l' and 'gumbel' are synonyms for the
1857 same distribution.
1859 Returns
1860 -------
1861 result : AndersonResult
1862 An object with the following attributes:
1864 statistic : float
1865 The Anderson-Darling test statistic.
1866 critical_values : list
1867 The critical values for this distribution.
1868 significance_level : list
1869 The significance levels for the corresponding critical values
1870 in percents. The function returns critical values for a
1871 differing set of significance levels depending on the
1872 distribution that is being tested against.
1873 fit_result : `~scipy.stats._result_classes.FitResult`
1874 An object containing the results of fitting the distribution to
1875 the data.
1877 See Also
1878 --------
1879 kstest : The Kolmogorov-Smirnov test for goodness-of-fit.
1881 Notes
1882 -----
1883 Critical values provided are for the following significance levels:
1885 normal/exponential
1886 15%, 10%, 5%, 2.5%, 1%
1887 logistic
1888 25%, 10%, 5%, 2.5%, 1%, 0.5%
1889 Gumbel
1890 25%, 10%, 5%, 2.5%, 1%
1892 If the returned statistic is larger than these critical values then
1893 for the corresponding significance level, the null hypothesis that
1894 the data come from the chosen distribution can be rejected.
1895 The returned statistic is referred to as 'A2' in the references.
1897 References
1898 ----------
1899 .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
1900 .. [2] Stephens, M. A. (1974). EDF Statistics for Goodness of Fit and
1901 Some Comparisons, Journal of the American Statistical Association,
1902 Vol. 69, pp. 730-737.
1903 .. [3] Stephens, M. A. (1976). Asymptotic Results for Goodness-of-Fit
1904 Statistics with Unknown Parameters, Annals of Statistics, Vol. 4,
1905 pp. 357-369.
1906 .. [4] Stephens, M. A. (1977). Goodness of Fit for the Extreme Value
1907 Distribution, Biometrika, Vol. 64, pp. 583-588.
1908 .. [5] Stephens, M. A. (1977). Goodness of Fit with Special Reference
1909 to Tests for Exponentiality, Technical Report No. 262,
1910 Department of Statistics, Stanford University, Stanford, CA.
1911 .. [6] Stephens, M. A. (1979). Tests of Fit for the Logistic Distribution
1912 Based on the Empirical Distribution Function, Biometrika, Vol. 66,
1913 pp. 591-595.
1915 Examples
1916 --------
1917 Test the null hypothesis that a random sample was drawn from a normal
1918 distribution (with unspecified mean and standard deviation).
1920 >>> import numpy as np
1921 >>> from scipy.stats import anderson
1922 >>> rng = np.random.default_rng()
1923 >>> data = rng.random(size=35)
1924 >>> res = anderson(data)
1925 >>> res.statistic
1926 0.8398018749744764
1927 >>> res.critical_values
1928 array([0.527, 0.6 , 0.719, 0.839, 0.998])
1929 >>> res.significance_level
1930 array([15. , 10. , 5. , 2.5, 1. ])
1932 The value of the statistic (barely) exceeds the critical value associated
1933 with a significance level of 2.5%, so the null hypothesis may be rejected
1934 at a significance level of 2.5%, but not at a significance level of 1%.
1936 """ # noqa
1937 dist = dist.lower()
1938 if dist in {'extreme1', 'gumbel'}:
1939 dist = 'gumbel_l'
1940 dists = {'norm', 'expon', 'gumbel_l', 'gumbel_r', 'logistic'}
1941 if dist not in dists:
1942 raise ValueError(f"Invalid distribution; dist must be in {dists}.")
1943 y = sort(x)
1944 xbar = np.mean(x, axis=0)
1945 N = len(y)
1946 if dist == 'norm':
1947 s = np.std(x, ddof=1, axis=0)
1948 w = (y - xbar) / s
1949 fit_params = xbar, s
1950 logcdf = distributions.norm.logcdf(w)
1951 logsf = distributions.norm.logsf(w)
1952 sig = array([15, 10, 5, 2.5, 1])
1953 critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N), 3)
1954 elif dist == 'expon':
1955 w = y / xbar
1956 fit_params = 0, xbar
1957 logcdf = distributions.expon.logcdf(w)
1958 logsf = distributions.expon.logsf(w)
1959 sig = array([15, 10, 5, 2.5, 1])
1960 critical = around(_Avals_expon / (1.0 + 0.6/N), 3)
1961 elif dist == 'logistic':
1962 def rootfunc(ab, xj, N):
1963 a, b = ab
1964 tmp = (xj - a) / b
1965 tmp2 = exp(tmp)
1966 val = [np.sum(1.0/(1+tmp2), axis=0) - 0.5*N,
1967 np.sum(tmp*(1.0-tmp2)/(1+tmp2), axis=0) + N]
1968 return array(val)
1970 sol0 = array([xbar, np.std(x, ddof=1, axis=0)])
1971 sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5)
1972 w = (y - sol[0]) / sol[1]
1973 fit_params = sol
1974 logcdf = distributions.logistic.logcdf(w)
1975 logsf = distributions.logistic.logsf(w)
1976 sig = array([25, 10, 5, 2.5, 1, 0.5])
1977 critical = around(_Avals_logistic / (1.0 + 0.25/N), 3)
1978 elif dist == 'gumbel_r':
1979 xbar, s = distributions.gumbel_r.fit(x)
1980 w = (y - xbar) / s
1981 fit_params = xbar, s
1982 logcdf = distributions.gumbel_r.logcdf(w)
1983 logsf = distributions.gumbel_r.logsf(w)
1984 sig = array([25, 10, 5, 2.5, 1])
1985 critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
1986 elif dist == 'gumbel_l':
1987 xbar, s = distributions.gumbel_l.fit(x)
1988 w = (y - xbar) / s
1989 fit_params = xbar, s
1990 logcdf = distributions.gumbel_l.logcdf(w)
1991 logsf = distributions.gumbel_l.logsf(w)
1992 sig = array([25, 10, 5, 2.5, 1])
1993 critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
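# Anderson-Darling statistic (the quantity called A2 in the references):
# A2 = -N - (1/N) * sum_{i=1}^{N} (2i - 1) * [log F(w_(i)) + log(1 - F(w_(N+1-i)))],
# where F is the CDF of the hypothesized (standardized) distribution.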
1995 i = arange(1, N + 1)
1996 A2 = -N - np.sum((2*i - 1.0) / N * (logcdf + logsf[::-1]), axis=0)
1998 # FitResult initializer expects an optimize result, so let's work with it
1999 message = '`anderson` successfully fit the distribution to the data.'
2000 res = optimize.OptimizeResult(success=True, message=message)
2001 res.x = np.array(fit_params)
2002 fit_result = FitResult(getattr(distributions, dist), y,
2003 discrete=False, res=res)
2005 return AndersonResult(A2, critical, sig, fit_result=fit_result)
2008def _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N):
2009 """Compute A2akN equation 7 of Scholz and Stephens.
2011 Parameters
2012 ----------
2013 samples : sequence of 1-D array_like
2014 Array of sample arrays.
2015 Z : array_like
2016 Sorted array of all observations.
2017 Zstar : array_like
2018 Sorted array of unique observations.
2019 k : int
2020 Number of samples.
2021 n : array_like
2022 Number of observations in each sample.
2023 N : int
2024 Total number of observations.
2026 Returns
2027 -------
2028 A2aKN : float
2029 The A2aKN statistics of Scholz and Stephens 1987.
2031 """
2032 A2akN = 0.
2033 Z_ssorted_left = Z.searchsorted(Zstar, 'left')
2034 if N == Zstar.size:
2035 lj = 1.
2036 else:
2037 lj = Z.searchsorted(Zstar, 'right') - Z_ssorted_left
2038 Bj = Z_ssorted_left + lj / 2.
2039 for i in arange(0, k):
2040 s = np.sort(samples[i])
2041 s_ssorted_right = s.searchsorted(Zstar, side='right')
2042 Mij = s_ssorted_right.astype(float)
2043 fij = s_ssorted_right - s.searchsorted(Zstar, 'left')
2044 Mij -= fij / 2.
2045 inner = lj / float(N) * (N*Mij - Bj*n[i])**2 / (Bj*(N - Bj) - N*lj/4.)
2046 A2akN += inner.sum() / n[i]
2047 A2akN *= (N - 1.) / N
2048 return A2akN
2051def _anderson_ksamp_right(samples, Z, Zstar, k, n, N):
2052 """Compute A2akN equation 6 of Scholz & Stephens.
2054 Parameters
2055 ----------
2056 samples : sequence of 1-D array_like
2057 Array of sample arrays.
2058 Z : array_like
2059 Sorted array of all observations.
2060 Zstar : array_like
2061 Sorted array of unique observations.
2062 k : int
2063 Number of samples.
2064 n : array_like
2065 Number of observations in each sample.
2066 N : int
2067 Total number of observations.
2069 Returns
2070 -------
2071 A2KN : float
2072 The A2KN statistics of Scholz and Stephens 1987.
2074 """
2075 A2kN = 0.
2076 lj = Z.searchsorted(Zstar[:-1], 'right') - Z.searchsorted(Zstar[:-1],
2077 'left')
2078 Bj = lj.cumsum()
2079 for i in arange(0, k):
2080 s = np.sort(samples[i])
2081 Mij = s.searchsorted(Zstar[:-1], side='right')
2082 inner = lj / float(N) * (N * Mij - Bj * n[i])**2 / (Bj * (N - Bj))
2083 A2kN += inner.sum() / n[i]
2084 return A2kN
2087Anderson_ksampResult = _make_tuple_bunch(
2088 'Anderson_ksampResult',
2089 ['statistic', 'critical_values', 'pvalue'], []
2090)
2093def anderson_ksamp(samples, midrank=True):
2094 """The Anderson-Darling test for k-samples.
2096 The k-sample Anderson-Darling test is a modification of the
2097 one-sample Anderson-Darling test. It tests the null hypothesis
2098 that k-samples are drawn from the same population without having
2099 to specify the distribution function of that population. The
2100 critical values depend on the number of samples.
2102 Parameters
2103 ----------
2104 samples : sequence of 1-D array_like
2105 Array of sample data in arrays.
2106 midrank : bool, optional
2107 Type of Anderson-Darling test which is computed. Default
2108 (True) is the midrank test applicable to continuous and
2109 discrete populations. If False, the right side empirical
2110 distribution is used.
2112 Returns
2113 -------
2114 res : Anderson_ksampResult
2115 An object containing attributes:
2117 statistic : float
2118 Normalized k-sample Anderson-Darling test statistic.
2119 critical_values : array
2120 The critical values for significance levels 25%, 10%, 5%, 2.5%, 1%,
2121 0.5%, 0.1%.
2122 pvalue : float
2123 The approximate p-value of the test. The value is floored / capped
2124 at 0.1% / 25%.
2126 Raises
2127 ------
2128 ValueError
2129 If less than 2 samples are provided, a sample is empty, or no
2130 distinct observations are in the samples.
2132 See Also
2133 --------
2134 ks_2samp : 2 sample Kolmogorov-Smirnov test
2135 anderson : 1 sample Anderson-Darling test
2137 Notes
2138 -----
2139 [1]_ defines three versions of the k-sample Anderson-Darling test:
2140 one for continuous distributions and two for discrete
2141 distributions, in which ties between samples may occur. The
2142 default of this routine is to compute the version based on the
2143 midrank empirical distribution function. This test is applicable
2144 to continuous and discrete data. If midrank is set to False, the
2145 right side empirical distribution is used for a test for discrete
2146 data. According to [1]_, the two discrete test statistics differ
2147 only slightly if a few collisions due to round-off errors occur in
2148 the test not adjusted for ties between samples.
2150 The critical values corresponding to the significance levels from 0.01
2151 to 0.25 are taken from [1]_. p-values are floored / capped
2152 at 0.1% / 25%. Since the range of critical values might be extended in
2153 future releases, it is recommended not to test ``p == 0.25``, but rather
2154 ``p >= 0.25`` (analogously for the lower bound).
2156 .. versionadded:: 0.14.0
2158 References
2159 ----------
2160 .. [1] Scholz, F. W and Stephens, M. A. (1987), K-Sample
2161 Anderson-Darling Tests, Journal of the American Statistical
2162 Association, Vol. 82, pp. 918-924.
2164 Examples
2165 --------
2166 >>> import numpy as np
2167 >>> from scipy import stats
2168 >>> rng = np.random.default_rng()
2169 >>> res = stats.anderson_ksamp([rng.normal(size=50),
2170 ... rng.normal(loc=0.5, size=30)])
2171 >>> res.statistic, res.pvalue
2172 (1.974403288713695, 0.04991293614572478)
2173 >>> res.critical_values
2174 array([0.325, 1.226, 1.961, 2.718, 3.752, 4.592, 6.546])
2176 The null hypothesis that the two random samples come from the same
2177 distribution can be rejected at the 5% level because the returned
2178 test value is greater than the critical value for 5% (1.961) but
2179 not at the 2.5% level. The interpolation gives an approximate
2180 p-value of 4.99%.
2182 >>> res = stats.anderson_ksamp([rng.normal(size=50),
2183 ... rng.normal(size=30), rng.normal(size=20)])
2184 >>> res.statistic, res.pvalue
2185 (-0.29103725200789504, 0.25)
2186 >>> res.critical_values
2187 array([ 0.44925884, 1.3052767 , 1.9434184 , 2.57696569, 3.41634856,
2188 4.07210043, 5.56419101])
2190 The null hypothesis cannot be rejected for three samples from an
2191 identical distribution. The reported p-value (25%) has been capped and
2192 may not be very accurate (since it corresponds to the value 0.449
2193 whereas the statistic is -0.291).
2195 """
2196 k = len(samples)
2197 if (k < 2):
2198 raise ValueError("anderson_ksamp needs at least two samples")
2200 samples = list(map(np.asarray, samples))
2201 Z = np.sort(np.hstack(samples))
2202 N = Z.size
2203 Zstar = np.unique(Z)
2204 if Zstar.size < 2:
2205 raise ValueError("anderson_ksamp needs more than one distinct "
2206 "observation")
2208 n = np.array([sample.size for sample in samples])
2209 if np.any(n == 0):
2210 raise ValueError("anderson_ksamp encountered sample without "
2211 "observations")
2213 if midrank:
2214 A2kN = _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N)
2215 else:
2216 A2kN = _anderson_ksamp_right(samples, Z, Zstar, k, n, N)
2218 H = (1. / n).sum()
2219 hs_cs = (1. / arange(N - 1, 1, -1)).cumsum()
2220 h = hs_cs[-1] + 1
2221 g = (hs_cs / arange(2, N)).sum()
2223 a = (4*g - 6) * (k - 1) + (10 - 6*g)*H
2224 b = (2*g - 4)*k**2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6
2225 c = (6*h + 2*g - 2)*k**2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h
2226 d = (2*h + 6)*k**2 - 4*h*k
2227 sigmasq = (a*N**3 + b*N**2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.))
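# Under the null hypothesis the expected value of A2kN is k - 1, so the
# statistic is standardized as (A2kN - (k - 1)) / sqrt(sigmasq), with the
# variance formula above taken from Scholz & Stephens (1987).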
2228 m = k - 1
2229 A2 = (A2kN - m) / math.sqrt(sigmasq)
2231 # The b_i values are the interpolation coefficients from Table 2
2232 # of Scholz and Stephens 1987
2233 b0 = np.array([0.675, 1.281, 1.645, 1.96, 2.326, 2.573, 3.085])
2234 b1 = np.array([-0.245, 0.25, 0.678, 1.149, 1.822, 2.364, 3.615])
2235 b2 = np.array([-0.105, -0.305, -0.362, -0.391, -0.396, -0.345, -0.154])
2236 critical = b0 + b1 / math.sqrt(m) + b2 / m
2238 sig = np.array([0.25, 0.1, 0.05, 0.025, 0.01, 0.005, 0.001])
2239 if A2 < critical.min():
2240 p = sig.max()
2241 warnings.warn("p-value capped: true value larger than {}".format(p),
2242 stacklevel=2)
2243 elif A2 > critical.max():
2244 p = sig.min()
2245 warnings.warn("p-value floored: true value smaller than {}".format(p),
2246 stacklevel=2)
2247 else:
2248 # quadratic interpolation of the log of the significance level
2249 pf = np.polyfit(critical, log(sig), 2)
2250 p = math.exp(np.polyval(pf, A2))
2252 # create result object with alias for backward compatibility
2253 res = Anderson_ksampResult(A2, critical, p)
2254 res.significance_level = p
2255 return res
2258AnsariResult = namedtuple('AnsariResult', ('statistic', 'pvalue'))
2261class _ABW:
2262 """Distribution of Ansari-Bradley W-statistic under the null hypothesis."""
2263 # TODO: calculate exact distribution considering ties
2264 # We could avoid summing over more than half the frequencies,
2265 # but initially it doesn't seem worth the extra complexity
2267 def __init__(self):
2268 """Minimal initializer."""
2269 self.m = None
2270 self.n = None
2271 self.astart = None
2272 self.total = None
2273 self.freqs = None
2275 def _recalc(self, n, m):
2276 """When necessary, recalculate exact distribution."""
2277 if n != self.n or m != self.m:
2278 self.n, self.m = n, m
2279 # distribution is NOT symmetric when m + n is odd
2280 # n is len(x), m is len(y), and ratio of scales is defined x/y
2281 astart, a1, _ = _statlib.gscale(n, m)
2282 self.astart = astart # minimum value of statistic
2283 # Exact distribution of test statistic under null hypothesis
2284 # expressed as frequencies/counts/integers to maintain precision.
2285 # Stored as floats to avoid overflow of sums.
2286 self.freqs = a1.astype(np.float64)
2287 self.total = self.freqs.sum() # could calculate from m and n
2288 # probability mass is self.freqs / self.total
2290 def pmf(self, k, n, m):
2291 """Probability mass function."""
2292 self._recalc(n, m)
2293 # The convention here is that PMF at k = 12.5 is the same as at k = 12,
2294 # -> use `floor` in case of ties.
2295 ind = np.floor(k - self.astart).astype(int)
2296 return self.freqs[ind] / self.total
2298 def cdf(self, k, n, m):
2299 """Cumulative distribution function."""
2300 self._recalc(n, m)
2301 # Null distribution derived without considering ties is
2302 # approximate. Round down to avoid Type I error.
2303 ind = np.ceil(k - self.astart).astype(int)
2304 return self.freqs[:ind+1].sum() / self.total
2306 def sf(self, k, n, m):
2307 """Survival function."""
2308 self._recalc(n, m)
2309 # Null distribution derived without considering ties is
2310 # approximate. Round down to avoid Type I error.
2311 ind = np.floor(k - self.astart).astype(int)
2312 return self.freqs[ind:].sum() / self.total
2315# Maintain state for faster repeat calls to ansari w/ method='exact'
2316_abw_state = _ABW()
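# For example (illustrative): `_abw_state.pmf(k, n, m)` gives P(W = k) under
# the null hypothesis for sample sizes n and m; the exact two-sided p-value
# used in `ansari` below is 2 * min(_abw_state.cdf(AB, n, m),
# _abw_state.sf(AB, n, m)), clipped to 1.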
2319def ansari(x, y, alternative='two-sided'):
2320 """Perform the Ansari-Bradley test for equal scale parameters.
2322 The Ansari-Bradley test ([1]_, [2]_) is a non-parametric test
2323 for the equality of the scale parameter of the distributions
2324 from which two samples were drawn. The null hypothesis states that
2325 the ratio of the scale of the distribution underlying `x` to the scale
2326 of the distribution underlying `y` is 1.
2328 Parameters
2329 ----------
2330 x, y : array_like
2331 Arrays of sample data.
2332 alternative : {'two-sided', 'less', 'greater'}, optional
2333 Defines the alternative hypothesis. Default is 'two-sided'.
2334 The following options are available:
2336 * 'two-sided': the ratio of scales is not equal to 1.
2337 * 'less': the ratio of scales is less than 1.
2338 * 'greater': the ratio of scales is greater than 1.
2340 .. versionadded:: 1.7.0
2342 Returns
2343 -------
2344 statistic : float
2345 The Ansari-Bradley test statistic.
2346 pvalue : float
2347 The p-value of the hypothesis test.
2349 See Also
2350 --------
2351 fligner : A non-parametric test for the equality of k variances
2352 mood : A non-parametric test for the equality of two scale parameters
2354 Notes
2355 -----
2356 The p-value given is exact when the sample sizes are both less than
2357 55 and there are no ties, otherwise a normal approximation for the
2358 p-value is used.
2360 References
2361 ----------
2362 .. [1] Ansari, A. R. and Bradley, R. A. (1960) Rank-sum tests for
2363 dispersions, Annals of Mathematical Statistics, 31, 1174-1189.
2364 .. [2] Sprent, Peter and N.C. Smeeton. Applied nonparametric
2365 statistical methods. 3rd ed. Chapman and Hall/CRC. 2001.
2366 Section 5.8.2.
2367 .. [3] Nathaniel E. Helwig "Nonparametric Dispersion and Equality
2368 Tests" at http://users.stat.umn.edu/~helwig/notes/npde-Notes.pdf
2370 Examples
2371 --------
2372 >>> import numpy as np
2373 >>> from scipy.stats import ansari
2374 >>> rng = np.random.default_rng()
2376 For these examples, we'll create three random data sets. The first
2377 two, with sizes 35 and 25, are drawn from a normal distribution with
2378 mean 0 and standard deviation 2. The third data set has size 25 and
2379 is drawn from a normal distribution with standard deviation 1.25.
2381 >>> x1 = rng.normal(loc=0, scale=2, size=35)
2382 >>> x2 = rng.normal(loc=0, scale=2, size=25)
2383 >>> x3 = rng.normal(loc=0, scale=1.25, size=25)
2385 First we apply `ansari` to `x1` and `x2`. These samples are drawn
2386 from the same distribution, so we expect the Ansari-Bradley test
2387 should not lead us to conclude that the scales of the distributions
2388 are different.
2390 >>> ansari(x1, x2)
2391 AnsariResult(statistic=541.0, pvalue=0.9762532927399098)
2393 With a p-value close to 1, we cannot conclude that there is a
2394 significant difference in the scales (as expected).
2396 Now apply the test to `x1` and `x3`:
2398 >>> ansari(x1, x3)
2399 AnsariResult(statistic=425.0, pvalue=0.0003087020407974518)
2401 The probability of observing such an extreme value of the statistic
2402 under the null hypothesis of equal scales is only 0.03087%. We take this
2403 as evidence against the null hypothesis in favor of the alternative:
2404 the scales of the distributions from which the samples were drawn
2405 are not equal.
2407 We can use the `alternative` parameter to perform a one-tailed test.
2408 In the above example, the scale of `x1` is greater than `x3` and so
2409 the ratio of scales of `x1` and `x3` is greater than 1. This means
2410 that the p-value when ``alternative='greater'`` should be near 0 and
2411 hence we should be able to reject the null hypothesis:
2413 >>> ansari(x1, x3, alternative='greater')
2414 AnsariResult(statistic=425.0, pvalue=0.0001543510203987259)
2416 As we can see, the p-value is indeed quite low. Use of
2417 ``alternative='less'`` should thus yield a large p-value:
2419 >>> ansari(x1, x3, alternative='less')
2420 AnsariResult(statistic=425.0, pvalue=0.9998643258449039)
2422 """
2423 if alternative not in {'two-sided', 'greater', 'less'}:
2424 raise ValueError("'alternative' must be 'two-sided',"
2425 " 'greater', or 'less'.")
2426 x, y = asarray(x), asarray(y)
2427 n = len(x)
2428 m = len(y)
2429 if m < 1:
2430 raise ValueError("Not enough other observations.")
2431 if n < 1:
2432 raise ValueError("Not enough test observations.")
2434 N = m + n
2435 xy = r_[x, y] # combine
2436 rank = _stats_py.rankdata(xy)
2437 symrank = amin(array((rank, N - rank + 1)), 0)
2438 AB = np.sum(symrank[:n], axis=0)
2439 uxy = unique(xy)
2440 repeats = (len(uxy) != len(xy))
2441 exact = ((m < 55) and (n < 55) and not repeats)
2442 if repeats and (m < 55 or n < 55):
2443 warnings.warn("Ties preclude use of exact statistic.")
2444 if exact:
2445 if alternative == 'two-sided':
2446 pval = 2.0 * np.minimum(_abw_state.cdf(AB, n, m),
2447 _abw_state.sf(AB, n, m))
2448 elif alternative == 'greater':
2449 # AB statistic is _smaller_ when ratio of scales is larger,
2450 # so this is the opposite of the usual calculation
2451 pval = _abw_state.cdf(AB, n, m)
2452 else:
2453 pval = _abw_state.sf(AB, n, m)
2454 return AnsariResult(AB, min(1.0, pval))
2456 # otherwise compute normal approximation
2457 if N % 2: # N odd
2458 mnAB = n * (N+1.0)**2 / 4.0 / N
2459 varAB = n * m * (N+1.0) * (3+N**2) / (48.0 * N**2)
2460 else:
2461 mnAB = n * (N+2.0) / 4.0
2462 varAB = m * n * (N+2) * (N-2.0) / 48 / (N-1.0)
2463 if repeats: # adjust variance estimates
2464 # compute np.sum(tj * rj**2,axis=0)
2465 fac = np.sum(symrank**2, axis=0)
2466 if N % 2: # N odd
2467 varAB = m * n * (16*N*fac - (N+1)**4) / (16.0 * N**2 * (N-1))
2468 else: # N even
2469 varAB = m * n * (16*fac - N*(N+2)**2) / (16.0 * N * (N-1))
2471 # Small values of AB indicate larger dispersion for the x sample.
2472 # Large values of AB indicate larger dispersion for the y sample.
2473 # This is opposite to the way we define the ratio of scales; see [1]_.
2474 z = (mnAB - AB) / sqrt(varAB)
2475 z, pval = _normtest_finish(z, alternative)
2476 return AnsariResult(AB, pval)
2479BartlettResult = namedtuple('BartlettResult', ('statistic', 'pvalue'))
2482def bartlett(*samples):
2483 """Perform Bartlett's test for equal variances.
2485 Bartlett's test tests the null hypothesis that all input samples
2486 are from populations with equal variances. For samples
2487 from significantly non-normal populations, Levene's test
2488 `levene` is more robust.
2490 Parameters
2491 ----------
2492 sample1, sample2, ... : array_like
2493 arrays of sample data. Only 1d arrays are accepted, they may have
2494 different lengths.
2496 Returns
2497 -------
2498 statistic : float
2499 The test statistic.
2500 pvalue : float
2501 The p-value of the test.
2503 See Also
2504 --------
2505 fligner : A non-parametric test for the equality of k variances
2506 levene : A robust parametric test for equality of k variances
2508 Notes
2509 -----
2510 Conover et al. (1981) examine many of the existing parametric and
2511 nonparametric tests by extensive simulations and they conclude that the
2512 tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be
2513 superior in terms of robustness to departures from normality and power
2514 ([3]_).
2516 References
2517 ----------
2518 .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm
2520 .. [2] Snedecor, George W. and Cochran, William G. (1989), Statistical
2521 Methods, Eighth Edition, Iowa State University Press.
2523 .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and
2524 Hypothesis Testing based on Quadratic Inference Function. Technical
2525 Report #99-03, Center for Likelihood Studies, Pennsylvania State
2526 University.
2528 .. [4] Bartlett, M. S. (1937). Properties of Sufficiency and Statistical
2529 Tests. Proceedings of the Royal Society of London. Series A,
2530 Mathematical and Physical Sciences, Vol. 160, No.901, pp. 268-282.
2532 Examples
2533 --------
2534 Test whether or not the lists `a`, `b` and `c` come from populations
2535 with equal variances.
2537 >>> import numpy as np
2538 >>> from scipy.stats import bartlett
2539 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
2540 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
2541 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
2542 >>> stat, p = bartlett(a, b, c)
2543 >>> p
2544 1.1254782518834628e-05
2546 The very small p-value suggests that the populations do not have equal
2547 variances.
2549 This is not surprising, given that the sample variance of `b` is much
2550 larger than that of `a` and `c`:
2552 >>> [np.var(x, ddof=1) for x in [a, b, c]]
2553 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
2555 """
2556 # Handle empty input and input that is not 1d
2557 for sample in samples:
2558 if np.asanyarray(sample).size == 0:
2559 return BartlettResult(np.nan, np.nan)
2560 if np.asanyarray(sample).ndim > 1:
2561 raise ValueError('Samples must be one-dimensional.')
2563 k = len(samples)
2564 if k < 2:
2565 raise ValueError("Must enter at least two input sample vectors.")
2566 Ni = np.empty(k)
2567 ssq = np.empty(k, 'd')
2568 for j in range(k):
2569 Ni[j] = len(samples[j])
2570 ssq[j] = np.var(samples[j], ddof=1)
2571 Ntot = np.sum(Ni, axis=0)
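# Bartlett's statistic ([4]_):
# T = [(Ntot - k) * ln(spsq) - sum_j (N_j - 1) * ln(s_j**2)] / C,
# where spsq is the pooled variance and C is the correction factor computed
# in `denom` below; under the null hypothesis T follows chi2(k - 1).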
2572 spsq = np.sum((Ni - 1)*ssq, axis=0) / (1.0*(Ntot - k))
2573 numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
2574 denom = 1.0 + 1.0/(3*(k - 1)) * ((np.sum(1.0/(Ni - 1.0), axis=0)) -
2575 1.0/(Ntot - k))
2576 T = numer / denom
2577 pval = distributions.chi2.sf(T, k - 1) # 1 - cdf
2579 return BartlettResult(T, pval)
2582LeveneResult = namedtuple('LeveneResult', ('statistic', 'pvalue'))
2585def levene(*samples, center='median', proportiontocut=0.05):
2586 """Perform Levene test for equal variances.
2588 The Levene test tests the null hypothesis that all input samples
2589 are from populations with equal variances. Levene's test is an
2590 alternative to Bartlett's test `bartlett` in the case where
2591 there are significant deviations from normality.
2593 Parameters
2594 ----------
2595 sample1, sample2, ... : array_like
2596 The sample data, possibly with different lengths. Only one-dimensional
2597 samples are accepted.
2598 center : {'mean', 'median', 'trimmed'}, optional
2599 Which function of the data to use in the test. The default
2600 is 'median'.
2601 proportiontocut : float, optional
2602 When `center` is 'trimmed', this gives the proportion of data points
2603 to cut from each end. (See `scipy.stats.trim_mean`.)
2604 Default is 0.05.
2606 Returns
2607 -------
2608 statistic : float
2609 The test statistic.
2610 pvalue : float
2611 The p-value for the test.
2613 Notes
2614 -----
2615 Three variations of Levene's test are possible. The possibilities
2616 and their recommended usages are:
2618 * 'median' : Recommended for skewed (non-normal) distributions.
2619 * 'mean' : Recommended for symmetric, moderate-tailed distributions.
2620 * 'trimmed' : Recommended for heavy-tailed distributions.
2622 The test version using the mean was proposed in the original article
2623 of Levene ([2]_) while the median and trimmed mean have been studied by
2624 Brown and Forsythe ([3]_), sometimes also referred to as Brown-Forsythe
2625 test.
2627 References
2628 ----------
2629 .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm
2630 .. [2] Levene, H. (1960). In Contributions to Probability and Statistics:
2631 Essays in Honor of Harold Hotelling, I. Olkin et al. eds.,
2632 Stanford University Press, pp. 278-292.
2633 .. [3] Brown, M. B. and Forsythe, A. B. (1974), Journal of the American
2634 Statistical Association, 69, 364-367
2636 Examples
2637 --------
2638 Test whether or not the lists `a`, `b` and `c` come from populations
2639 with equal variances.
2641 >>> import numpy as np
2642 >>> from scipy.stats import levene
2643 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
2644 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
2645 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
2646 >>> stat, p = levene(a, b, c)
2647 >>> p
2648 0.002431505967249681
2650 The small p-value suggests that the populations do not have equal
2651 variances.
2653 This is not surprising, given that the sample variance of `b` is much
2654 larger than that of `a` and `c`:
2656 >>> [np.var(x, ddof=1) for x in [a, b, c]]
2657 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
2659 """
2660 if center not in ['mean', 'median', 'trimmed']:
2661 raise ValueError("center must be 'mean', 'median' or 'trimmed'.")
2663 k = len(samples)
2664 if k < 2:
2665 raise ValueError("Must enter at least two input sample vectors.")
2666 # check for 1d input
2667 for j in range(k):
2668 if np.asanyarray(samples[j]).ndim > 1:
2669 raise ValueError('Samples must be one-dimensional.')
2671 Ni = np.empty(k)
2672 Yci = np.empty(k, 'd')
2674 if center == 'median':
2675 func = lambda x: np.median(x, axis=0)
2676 elif center == 'mean':
2677 func = lambda x: np.mean(x, axis=0)
2678 else: # center == 'trimmed'
2679 samples = tuple(_stats_py.trimboth(np.sort(sample), proportiontocut)
2680 for sample in samples)
2681 func = lambda x: np.mean(x, axis=0)
2683 for j in range(k):
2684 Ni[j] = len(samples[j])
2685 Yci[j] = func(samples[j])
2686 Ntot = np.sum(Ni, axis=0)
2688 # compute Zij's
2689 Zij = [None] * k
2690 for i in range(k):
2691 Zij[i] = abs(asarray(samples[i]) - Yci[i])
2693 # compute Zbari
2694 Zbari = np.empty(k, 'd')
2695 Zbar = 0.0
2696 for i in range(k):
2697 Zbari[i] = np.mean(Zij[i], axis=0)
2698 Zbar += Zbari[i] * Ni[i]
2700 Zbar /= Ntot
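# Levene/Brown-Forsythe statistic:
# W = (Ntot - k)/(k - 1) * sum_i N_i*(Zbar_i - Zbar)**2
#                        / sum_{i,j} (Z_ij - Zbar_i)**2,
# which is compared against the F(k - 1, Ntot - k) distribution.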
2701 numer = (Ntot - k) * np.sum(Ni * (Zbari - Zbar)**2, axis=0)
2703 # compute denom_variance
2704 dvar = 0.0
2705 for i in range(k):
2706 dvar += np.sum((Zij[i] - Zbari[i])**2, axis=0)
2708 denom = (k - 1.0) * dvar
2710 W = numer / denom
2711 pval = distributions.f.sf(W, k-1, Ntot-k) # 1 - cdf
2712 return LeveneResult(W, pval)
2715@_deprecated("'binom_test' is deprecated in favour of"
2716 " 'binomtest' from version 1.7.0 and will"
2717 " be removed in Scipy 1.12.0.")
2718def binom_test(x, n=None, p=0.5, alternative='two-sided'):
2719 """Perform a test that the probability of success is p.
2721 This is an exact test of the null hypothesis
2722 that the probability of success in a Bernoulli experiment
2723 is `p`.
2725 .. deprecated:: 1.10.0
2726 `binom_test` is deprecated in favour of `binomtest` and will
2727 be removed in Scipy 1.12.0.
2729 Parameters
2730 ----------
2731 x : int or array_like
2732 The number of successes, or if x has length 2, it is the
2733 number of successes and the number of failures.
2734 n : int
2735 The number of trials. This is ignored if x gives both the
2736 number of successes and failures.
2737 p : float, optional
2738 The hypothesized probability of success. ``0 <= p <= 1``. The
2739 default value is ``p = 0.5``.
2740 alternative : {'two-sided', 'greater', 'less'}, optional
2741 Indicates the alternative hypothesis. The default value is
2742 'two-sided'.
2744 Returns
2745 -------
2746 p-value : float
2747 The p-value of the hypothesis test.
2749 References
2750 ----------
2751 .. [1] https://en.wikipedia.org/wiki/Binomial_test
2753 Examples
2754 --------
2755 >>> from scipy import stats
2757 A car manufacturer claims that no more than 10% of their cars are unsafe.
2758 15 cars are inspected for safety, 3 were found to be unsafe. Test the
2759 manufacturer's claim:
2761 >>> stats.binom_test(3, n=15, p=0.1, alternative='greater')
2762 0.18406106910639114
2764 The null hypothesis cannot be rejected at the 5% level of significance
2765 because the returned p-value is greater than the significance level of 5%.
2767 """
2768 x = atleast_1d(x).astype(np.int_)
2769 if len(x) == 2:
2770 n = x[1] + x[0]
2771 x = x[0]
2772 elif len(x) == 1:
2773 x = x[0]
2774 if n is None or n < x:
2775 raise ValueError("n must be >= x")
2776 n = np.int_(n)
2777 else:
2778 raise ValueError("Incorrect length for x.")
2780 if (p > 1.0) or (p < 0.0):
2781 raise ValueError("p must be in range [0,1]")
2783 if alternative not in ('two-sided', 'less', 'greater'):
2784 raise ValueError("alternative not recognized\n"
2785 "should be 'two-sided', 'less' or 'greater'")
2787 if alternative == 'less':
2788 pval = distributions.binom.cdf(x, n, p)
2789 return pval
2791 if alternative == 'greater':
2792 pval = distributions.binom.sf(x-1, n, p)
2793 return pval
2795 # if alternative was neither 'less' nor 'greater', then it's 'two-sided'
2796 d = distributions.binom.pmf(x, n, p)
2797 rerr = 1 + 1e-7
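# Two-sided p-value: sum the probabilities of all outcomes whose pmf does
# not exceed that of the observed count (`d`); `rerr` is a small relative
# tolerance so that outcomes of essentially equal probability are still
# included despite floating-point rounding.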
2798 if x == p * n:
2799 # special case as shortcut, would also be handled by `else` below
2800 pval = 1.
2801 elif x < p * n:
2802 i = np.arange(np.ceil(p * n), n+1)
2803 y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0)
2804 pval = (distributions.binom.cdf(x, n, p) +
2805 distributions.binom.sf(n - y, n, p))
2806 else:
2807 i = np.arange(np.floor(p*n) + 1)
2808 y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0)
2809 pval = (distributions.binom.cdf(y-1, n, p) +
2810 distributions.binom.sf(x-1, n, p))
2812 return min(1.0, pval)
2815def _apply_func(x, g, func):
2816 # g is list of indices into x
2817 # separating x into different groups
2818 # func should be applied over the groups
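# illustrative example (not from the original source):
# _apply_func(np.arange(6), [2, 4], np.sum) -> array([1, 5, 9])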
2819 g = unique(r_[0, g, len(x)])
2820 output = [func(x[g[k]:g[k+1]]) for k in range(len(g) - 1)]
2822 return asarray(output)
2825FlignerResult = namedtuple('FlignerResult', ('statistic', 'pvalue'))
2828def fligner(*samples, center='median', proportiontocut=0.05):
2829 """Perform Fligner-Killeen test for equality of variance.
2831 Fligner's test tests the null hypothesis that all input samples
2832 are from populations with equal variances. Fligner-Killeen's test is
2833 distribution free when populations are identical [2]_.
2835 Parameters
2836 ----------
2837 sample1, sample2, ... : array_like
2838 Arrays of sample data. Need not be the same length.
2839 center : {'mean', 'median', 'trimmed'}, optional
2840 Keyword argument controlling which function of the data is used in
2841 computing the test statistic. The default is 'median'.
2842 proportiontocut : float, optional
2843 When `center` is 'trimmed', this gives the proportion of data points
2844 to cut from each end. (See `scipy.stats.trim_mean`.)
2845 Default is 0.05.
2847 Returns
2848 -------
2849 statistic : float
2850 The test statistic.
2851 pvalue : float
2852 The p-value for the hypothesis test.
2854 See Also
2855 --------
2856 bartlett : A parametric test for equality of k variances in normal samples
2857 levene : A robust parametric test for equality of k variances
2859 Notes
2860 -----
2861 As with Levene's test there are three variants of Fligner's test that
2862 differ by the measure of central tendency used in the test. See `levene`
2863 for more information.
2865 Conover et al. (1981) examine many of the existing parametric and
2866 nonparametric tests by extensive simulations and they conclude that the
2867 tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be
2868 superior in terms of robustness to departures from normality and power [3]_.
2870 References
2871 ----------
2872 .. [1] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and
2873 Hypothesis Testing based on Quadratic Inference Function. Technical
2874 Report #99-03, Center for Likelihood Studies, Pennsylvania State
2875 University.
2876 https://cecas.clemson.edu/~cspark/cv/paper/qif/draftqif2.pdf
2878 .. [2] Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample
2879 tests for scale. 'Journal of the American Statistical Association.'
2880 71(353), 210-213.
2882 .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and
2883 Hypothesis Testing based on Quadratic Inference Function. Technical
2884 Report #99-03, Center for Likelihood Studies, Pennsylvania State
2885 University.
2887 .. [4] Conover, W. J., Johnson, M. E. and Johnson M. M. (1981). A
2888 comparative study of tests for homogeneity of variances, with
2889 applications to the outer continental shelf bidding data.
2890 Technometrics, 23(4), 351-361.
2892 Examples
2893 --------
2894 Test whether or not the lists `a`, `b` and `c` come from populations
2895 with equal variances.
2897 >>> import numpy as np
2898 >>> from scipy.stats import fligner
2899 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
2900 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
2901 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
2902 >>> stat, p = fligner(a, b, c)
2903 >>> p
2904 0.00450826080004775
2906 The small p-value suggests that the populations do not have equal
2907 variances.
2909 This is not surprising, given that the sample variance of `b` is much
2910 larger than that of `a` and `c`:
2912 >>> [np.var(x, ddof=1) for x in [a, b, c]]
2913 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
2915 """
2916 if center not in ['mean', 'median', 'trimmed']:
2917 raise ValueError("center must be 'mean', 'median' or 'trimmed'.")
2919 # Handle empty input
2920 for sample in samples:
2921 if np.asanyarray(sample).size == 0:
2922 return FlignerResult(np.nan, np.nan)
2924 k = len(samples)
2925 if k < 2:
2926 raise ValueError("Must enter at least two input sample vectors.")
2928 if center == 'median':
2929 func = lambda x: np.median(x, axis=0)
2930 elif center == 'mean':
2931 func = lambda x: np.mean(x, axis=0)
2932 else: # center == 'trimmed'
2933 samples = tuple(_stats_py.trimboth(sample, proportiontocut)
2934 for sample in samples)
2935 func = lambda x: np.mean(x, axis=0)
2937 Ni = asarray([len(samples[j]) for j in range(k)])
2938 Yci = asarray([func(samples[j]) for j in range(k)])
2939 Ntot = np.sum(Ni, axis=0)
2940 # compute Zij's
2941 Zij = [abs(asarray(samples[i]) - Yci[i]) for i in range(k)]
2942 allZij = []
2943 g = [0]
2944 for i in range(k):
2945 allZij.extend(list(Zij[i]))
2946 g.append(len(allZij))
2948 ranks = _stats_py.rankdata(allZij)
2949 sample = distributions.norm.ppf(ranks / (2*(Ntot + 1.0)) + 0.5)
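# The pooled ranks are converted to normal scores
# a_i = Phi^{-1}(1/2 + r_i / (2*(Ntot + 1))); the Fligner-Killeen statistic
# below is a chi-square-type statistic based on the per-sample means of
# these scores.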
2951 # compute Aibar
2952 Aibar = _apply_func(sample, g, np.sum) / Ni
2953 anbar = np.mean(sample, axis=0)
2954 varsq = np.var(sample, axis=0, ddof=1)
2955 Xsq = np.sum(Ni * (asarray(Aibar) - anbar)**2.0, axis=0) / varsq
2956 pval = distributions.chi2.sf(Xsq, k - 1) # 1 - cdf
2957 return FlignerResult(Xsq, pval)
2960@_axis_nan_policy_factory(lambda x1: (x1,), n_samples=4, n_outputs=1)
2961def _mood_inner_lc(xy, x, diffs, sorted_xy, n, m, N) -> float:
2962 # Obtain the unique values and their frequencies from the pooled samples.
2963 # "a_j, + b_j, = t_j, for j = 1, ... k" where `k` is the number of unique
2964 # classes, and "[t]he number of values associated with the x's and y's in
2965 # the jth class will be denoted by a_j, and b_j respectively."
2966 # (Mielke, 312)
2967 # Reuse previously computed sorted array and `diff` arrays to obtain the
2968 # unique values and counts. Prepend `diffs` with a non-zero to indicate
2969 # that the first element should be marked as not matching what preceded it.
2970 diffs_prep = np.concatenate(([1], diffs))
2971 # Unique elements are where there was a difference between elements in the
2972 # sorted array
2973 uniques = sorted_xy[diffs_prep != 0]
2974 # The count of each element is the bin size for each set of consecutive
2975 # differences where the difference is zero. Replace nonzero differences
2976 # with 1 and then use the cumulative sum to count the indices.
2977 t = np.bincount(np.cumsum(np.asarray(diffs_prep != 0, dtype=int)))[1:]
2978 k = len(uniques)
2979 js = np.arange(1, k + 1, dtype=int)
2980 # the `b` array mentioned in the paper is not used, outside of the
2981 # calculation of `t`, so we do not need to calculate it separately. Here
2982 # we calculate `a`. In plain language, `a[j]` is the number of values in
2983 # `x` that equal `uniques[j]`.
2984 sorted_xyx = np.sort(np.concatenate((xy, x)))
2985 diffs = np.diff(sorted_xyx)
2986 diffs_prep = np.concatenate(([1], diffs))
2987 diff_is_nonzero = np.asarray(diffs_prep != 0, dtype=int)
2988 xyx_counts = np.bincount(np.cumsum(diff_is_nonzero))[1:]
2989 a = xyx_counts - t
2990 # "Define .. a_0 = b_0 = t_0 = S_0 = 0" (Mielke 312) so we shift `a`
2991 # and `t` arrays over 1 to allow a first element of 0 to accommodate this
2992 # indexing.
2993 t = np.concatenate(([0], t))
2994 a = np.concatenate(([0], a))
2995 # S is built from `t`, so it does not need a preceding zero added on.
2996 S = np.cumsum(t)
2997 # define a copy of `S` with a prepending zero for later use to avoid
2998 # the need for indexing.
2999 S_i_m1 = np.concatenate(([0], S[:-1]))
3001 # Psi, as defined by the 6th unnumbered equation on page 313 (Mielke).
3002 # Note that the paper contains an error: only the denominator `2` is
3003 # squared there, when the entire expression should be squared.
3004 def psi(indicator):
3005 return (indicator - (N + 1)/2)**2
3007 # define summation range for use in calculation of phi, as seen in sum
3008 # in the unnumbered equation on the bottom of page 312 (Mielke).
3009 s_lower = S[js - 1] + 1
3010 s_upper = S[js] + 1
3011 phi_J = [np.arange(s_lower[idx], s_upper[idx]) for idx in range(k)]
3013 # for every range in the above array, determine the sum of psi(I) for
3014 # every element in the range. Divide all the sums by `t`. Following the
3015 # last unnumbered equation on page 312.
3016 phis = [np.sum(psi(I_j)) for I_j in phi_J] / t[js]
3018 # `T` is equal to a[j] * phi[j], per the first unnumbered equation on
3019 # page 312. `phis` is already in the order based on `js`, so we index
3020 # into `a` with `js` as well.
3021 T = sum(phis * a[js])
3023 # Expectation and variance of T under the null hypothesis
3024 E_0_T = n * (N * N - 1) / 12
3026 varM = (m * n * (N + 1.0) * (N ** 2 - 4) / 180 -
3027 m * n / (180 * N * (N - 1)) * np.sum(
3028 t * (t**2 - 1) * (t**2 - 4 + (15 * (N - S - S_i_m1) ** 2))
3029 ))
3031 return ((T - E_0_T) / np.sqrt(varM),)
3034def mood(x, y, axis=0, alternative="two-sided"):
3035 """Perform Mood's test for equal scale parameters.
3037 Mood's two-sample test for scale parameters is a non-parametric
3038 test for the null hypothesis that two samples are drawn from the
3039 same distribution with the same scale parameter.
3041 Parameters
3042 ----------
3043 x, y : array_like
3044 Arrays of sample data.
3045 axis : int, optional
3046 The axis along which the samples are tested. `x` and `y` can be of
3047 different length along `axis`.
3048 If `axis` is None, `x` and `y` are flattened and the test is done on
3049 all values in the flattened arrays.
3050 alternative : {'two-sided', 'less', 'greater'}, optional
3051 Defines the alternative hypothesis. Default is 'two-sided'.
3052 The following options are available:
3054 * 'two-sided': the scales of the distributions underlying `x` and `y`
3055 are different.
3056 * 'less': the scale of the distribution underlying `x` is less than
3057 the scale of the distribution underlying `y`.
3058 * 'greater': the scale of the distribution underlying `x` is greater
3059 than the scale of the distribution underlying `y`.
3061 .. versionadded:: 1.7.0
3063 Returns
3064 -------
3065 res : SignificanceResult
3066 An object containing attributes:
3068 statistic : scalar or ndarray
3069 The z-score for the hypothesis test. For 1-D inputs a scalar is
3070 returned.
3071 pvalue : scalar or ndarray
3072 The p-value for the hypothesis test.
3074 See Also
3075 --------
3076 fligner : A non-parametric test for the equality of k variances
3077 ansari : A non-parametric test for the equality of 2 variances
3078 bartlett : A parametric test for equality of k variances in normal samples
3079 levene : A parametric test for equality of k variances
3081 Notes
3082 -----
3083 The data are assumed to be drawn from probability distributions ``f(x)``
3084 and ``f(x/s) / s`` respectively, for some probability density function f.
3085 The null hypothesis is that ``s == 1``.
3087 For multi-dimensional arrays, if the inputs are of shapes
3088 ``(n0, n1, n2, n3)`` and ``(n0, m1, n2, n3)``, then if ``axis=1``, the
3089 resulting z and p values will have shape ``(n0, n2, n3)``. Note that
3090 ``n1`` and ``m1`` don't have to be equal, but the other dimensions do.
3092 References
3093 ----------
3094 .. [1] Mielke, Paul W. "Note on Some Squared Rank Tests with Existing Ties."
3095 Technometrics, vol. 9, no. 2, 1967, pp. 312-14. JSTOR,
3096 https://doi.org/10.2307/1266427. Accessed 18 May 2022.
3098 Examples
3099 --------
3100 >>> import numpy as np
3101 >>> from scipy import stats
3102 >>> rng = np.random.default_rng()
3103 >>> x2 = rng.standard_normal((2, 45, 6, 7))
3104 >>> x1 = rng.standard_normal((2, 30, 6, 7))
3105 >>> res = stats.mood(x1, x2, axis=1)
3106 >>> res.pvalue.shape
3107 (2, 6, 7)
3109 Find the number of points where the difference in scale is not significant:
3111 >>> (res.pvalue > 0.1).sum()
3112 78
3114 Perform the test with different scales:
3116 >>> x1 = rng.standard_normal((2, 30))
3117 >>> x2 = rng.standard_normal((2, 35)) * 10.0
3118 >>> stats.mood(x1, x2, axis=1)
3119 SignificanceResult(statistic=array([-5.76174136, -6.12650783]),
3120 pvalue=array([8.32505043e-09, 8.98287869e-10]))
3122 """
3123 x = np.asarray(x, dtype=float)
3124 y = np.asarray(y, dtype=float)
3126 if axis is None:
3127 x = x.flatten()
3128 y = y.flatten()
3129 axis = 0
3131 if axis < 0:
3132 axis = x.ndim + axis
3134 # Determine shape of the result arrays
3135 res_shape = tuple([x.shape[ax] for ax in range(len(x.shape)) if ax != axis])
3136 if not (res_shape == tuple([y.shape[ax] for ax in range(len(y.shape)) if
3137 ax != axis])):
3138 raise ValueError("Dimensions of x and y on all axes except `axis` "
3139 "should match")
3141 n = x.shape[axis]
3142 m = y.shape[axis]
3143 N = m + n
3144 if N < 3:
3145 raise ValueError("Not enough observations.")
3147 xy = np.concatenate((x, y), axis=axis)
3148 # determine if any of the samples contain ties
3149 sorted_xy = np.sort(xy, axis=axis)
3150 diffs = np.diff(sorted_xy, axis=axis)
3151 if 0 in diffs:
3152 z = np.asarray(_mood_inner_lc(xy, x, diffs, sorted_xy, n, m, N,
3153 axis=axis))
3154 else:
3155 if axis != 0:
3156 xy = np.moveaxis(xy, axis, 0)
3158 xy = xy.reshape(xy.shape[0], -1)
3159 # Generalized to the n-dimensional case by adding the axis argument,
3160 # and using for loops, since rankdata is not vectorized. For improving
3161 # performance consider vectorizing rankdata function.
3162 all_ranks = np.empty_like(xy)
3163 for j in range(xy.shape[1]):
3164 all_ranks[:, j] = _stats_py.rankdata(xy[:, j])
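# Mood's statistic: M = sum over the x-sample ranks R_i of (R_i - (N+1)/2)**2.
# With no ties, E[M] = n*(N**2 - 1)/12 and
# Var[M] = m*n*(N+1)*(N+2)*(N-2)/180 under the null hypothesis, giving the
# z-score computed below.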
3166 Ri = all_ranks[:n]
3167 M = np.sum((Ri - (N + 1.0) / 2) ** 2, axis=0)
3168 # Approx stat.
3169 mnM = n * (N * N - 1.0) / 12
3170 varM = m * n * (N + 1.0) * (N + 2) * (N - 2) / 180
3171 z = (M - mnM) / sqrt(varM)
3172 z, pval = _normtest_finish(z, alternative)
3174 if res_shape == ():
3175 # Return scalars, not 0-D arrays
3176 z = z[0]
3177 pval = pval[0]
3178 else:
3179 z.shape = res_shape
3180 pval.shape = res_shape
3181 return SignificanceResult(z, pval)
3184WilcoxonResult = _make_tuple_bunch('WilcoxonResult', ['statistic', 'pvalue'])
3187def wilcoxon_result_unpacker(res):
3188 if hasattr(res, 'zstatistic'):
3189 return res.statistic, res.pvalue, res.zstatistic
3190 else:
3191 return res.statistic, res.pvalue
3194def wilcoxon_result_object(statistic, pvalue, zstatistic=None):
3195 res = WilcoxonResult(statistic, pvalue)
3196 if zstatistic is not None:
3197 res.zstatistic = zstatistic
3198 return res
3201def wilcoxon_outputs(kwds):
3202 method = kwds.get('method', 'auto')
3203 if method == 'approx':
3204 return 3
3205 return 2
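# `wilcoxon_outputs` tells the `_axis_nan_policy_factory` decorator below how
# many outputs to expect: three (statistic, pvalue, zstatistic) when
# method='approx', otherwise two.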
3208@_rename_parameter("mode", "method")
3209@_axis_nan_policy_factory(
3210 wilcoxon_result_object, paired=True,
3211 n_samples=lambda kwds: 2 if kwds.get('y', None) is not None else 1,
3212 result_to_tuple=wilcoxon_result_unpacker, n_outputs=wilcoxon_outputs,
3213)
3214def wilcoxon(x, y=None, zero_method="wilcox", correction=False,
3215 alternative="two-sided", method='auto'):
3216 """Calculate the Wilcoxon signed-rank test.
3218 The Wilcoxon signed-rank test tests the null hypothesis that two
3219 related paired samples come from the same distribution. In particular,
3220 it tests whether the distribution of the differences ``x - y`` is symmetric
3221 about zero. It is a non-parametric version of the paired T-test.
3223 Parameters
3224 ----------
3225 x : array_like
3226 Either the first set of measurements (in which case ``y`` is the second
3227 set of measurements), or the differences between two sets of
3228 measurements (in which case ``y`` is not to be specified.) Must be
3229 one-dimensional.
3230 y : array_like, optional
3231 Either the second set of measurements (if ``x`` is the first set of
3232 measurements), or not specified (if ``x`` is the differences between
3233 two sets of measurements.) Must be one-dimensional.
3234 zero_method : {"wilcox", "pratt", "zsplit"}, optional
3235 There are different conventions for handling pairs of observations
3236 with equal values ("zero-differences", or "zeros").
3238 * "wilcox": Discards all zero-differences (default); see [4]_.
3239 * "pratt": Includes zero-differences in the ranking process,
3240 but drops the ranks of the zeros (more conservative); see [3]_.
3241 In this case, the normal approximation is adjusted as in [5]_.
3242 * "zsplit": Includes zero-differences in the ranking process and
3243 splits the zero rank between positive and negative ones.
3245 correction : bool, optional
3246 If True, apply continuity correction by adjusting the Wilcoxon rank
3247 statistic by 0.5 towards the mean value when computing the
3248 z-statistic if a normal approximation is used. Default is False.
3249 alternative : {"two-sided", "greater", "less"}, optional
3250 Defines the alternative hypothesis. Default is 'two-sided'.
3251 In the following, let ``d`` represent the difference between the paired
3252 samples: ``d = x - y`` if both ``x`` and ``y`` are provided, or
3253 ``d = x`` otherwise.
3255 * 'two-sided': the distribution underlying ``d`` is not symmetric
3256 about zero.
3257 * 'less': the distribution underlying ``d`` is stochastically less
3258 than a distribution symmetric about zero.
3259 * 'greater': the distribution underlying ``d`` is stochastically
3260 greater than a distribution symmetric about zero.
3262 method : {"auto", "exact", "approx"}, optional
3263 Method to calculate the p-value, see Notes. Default is "auto".
3265 Returns
3266 -------
3267 An object with the following attributes.
3269 statistic : array_like
3270 If `alternative` is "two-sided", the sum of the ranks of the
3271 differences above or below zero, whichever is smaller.
3272 Otherwise the sum of the ranks of the differences above zero.
3273 pvalue : array_like
3274 The p-value for the test depending on `alternative` and `method`.
3275 zstatistic : array_like
3276 When ``method = 'approx'``, this is the normalized z-statistic::
3278 z = (T - mn - d) / se
3280 where ``T`` is `statistic` as defined above, ``mn`` is the mean of the
3281 distribution under the null hypothesis, ``d`` is a continuity
3282 correction, and ``se`` is the standard error.
3283 When ``method != 'approx'``, this attribute is not available.
3285 See Also
3286 --------
3287 kruskal, mannwhitneyu
3289 Notes
3290 -----
3291 In the following, let ``d`` represent the difference between the paired
3292 samples: ``d = x - y`` if both ``x`` and ``y`` are provided, or ``d = x``
3293 otherwise. Assume that all elements of ``d`` are independent and
3294 identically distributed observations, and all are distinct and nonzero.
3296 - When ``len(d)`` is sufficiently large, the null distribution of the
3297 normalized test statistic (`zstatistic` above) is approximately normal,
3298 and ``method = 'approx'`` can be used to compute the p-value.
3300 - When ``len(d)`` is small, the normal approximation may not be accurate,
3301 and ``method='exact'`` is preferred (at the cost of additional
3302 execution time).
3304 - The default, ``method='auto'``, selects between the two: when
3305 ``len(d) <= 50``, the exact method is used; otherwise, the approximate
3306 method is used.
3308 The presence of "ties" (i.e. not all elements of ``d`` are unique) and
3309 "zeros" (i.e. elements of ``d`` are zero) changes the null distribution
3310 of the test statistic, and ``method='exact'`` no longer calculates
3311 the exact p-value. If ``method='approx'``, the z-statistic is adjusted
3312 for more accurate comparison against the standard normal, but still,
3313 for finite sample sizes, the standard normal is only an approximation of
3314 the true null distribution of the z-statistic. There is no clear
3315 consensus among references on which method most accurately approximates
3316 the p-value for small samples in the presence of zeros and/or ties. In any
3317 case, this is the behavior of `wilcoxon` when ``method='auto'``:
3318 ``method='exact'`` is used when ``len(d) <= 50`` *and there are no zeros*;
3319 otherwise, ``method='approx'`` is used.
3321 References
3322 ----------
3323 .. [1] https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
3324 .. [2] Conover, W.J., Practical Nonparametric Statistics, 1971.
3325 .. [3] Pratt, J.W., Remarks on Zeros and Ties in the Wilcoxon Signed
3326 Rank Procedures, Journal of the American Statistical Association,
3327 Vol. 54, 1959, pp. 655-667. :doi:`10.1080/01621459.1959.10501526`
3328 .. [4] Wilcoxon, F., Individual Comparisons by Ranking Methods,
3329 Biometrics Bulletin, Vol. 1, 1945, pp. 80-83. :doi:`10.2307/3001968`
3330 .. [5] Cureton, E.E., The Normal Approximation to the Signed-Rank
3331 Sampling Distribution When Zero Differences are Present,
3332 Journal of the American Statistical Association, Vol. 62, 1967,
3333 pp. 1068-1069. :doi:`10.1080/01621459.1967.10500917`
3335 Examples
3336 --------
3337 In [4]_, the differences in height between cross- and self-fertilized
3338 corn plants are given as follows:
3340 >>> d = [6, 8, 14, 16, 23, 24, 28, 29, 41, -48, 49, 56, 60, -67, 75]
3342 Cross-fertilized plants appear to be higher. To test the null
3343 hypothesis that there is no height difference, we can apply the
3344 two-sided test:
3346 >>> from scipy.stats import wilcoxon
3347 >>> res = wilcoxon(d)
3348 >>> res.statistic, res.pvalue
3349 (24.0, 0.041259765625)
3351 Hence, we would reject the null hypothesis at the 5% significance level,
3352 concluding that there is a difference in height between the groups.
3353 To confirm that the median of the differences can be assumed to be
3354 positive, we use:
3356 >>> res = wilcoxon(d, alternative='greater')
3357 >>> res.statistic, res.pvalue
3358 (96.0, 0.0206298828125)
3360 This shows that the null hypothesis that the median is negative can be
3361 rejected at the 5% significance level in favor of the alternative that
3362 the median is greater than zero. The p-values above are exact. Using the
3363 normal approximation gives very similar values:
3365 >>> res = wilcoxon(d, method='approx')
3366 >>> res.statistic, res.pvalue
3367 (24.0, 0.04088813291185591)
3369 Note that the statistic changed to 96 in the one-sided case (the sum
3370 of ranks of positive differences) whereas it is 24 in the two-sided
3371 case (the minimum of the sums of ranks above and below zero).
3373 """
3374 mode = method
3376 if mode not in ["auto", "approx", "exact"]:
3377 raise ValueError("mode must be either 'auto', 'approx' or 'exact'")
3379 if zero_method not in ["wilcox", "pratt", "zsplit"]:
3380 raise ValueError("Zero method must be either 'wilcox' "
3381 "or 'pratt' or 'zsplit'")
3383 if alternative not in ["two-sided", "less", "greater"]:
3384 raise ValueError("Alternative must be either 'two-sided', "
3385 "'greater' or 'less'")
3387 if y is None:
3388 d = asarray(x)
3389 if d.ndim > 1:
3390 raise ValueError('Sample x must be one-dimensional.')
3391 else:
3392 x, y = map(asarray, (x, y))
3393 if x.ndim > 1 or y.ndim > 1:
3394 raise ValueError('Samples x and y must be one-dimensional.')
3395 if len(x) != len(y):
3396 raise ValueError('The samples x and y must have the same length.')
3397 d = x - y
3399 if len(d) == 0:
3400 res = WilcoxonResult(np.nan, np.nan)
3401 if method == 'approx':
3402 res.zstatistic = np.nan
3403 return res
3405 if mode == "auto":
3406 if len(d) <= 50:
3407 mode = "exact"
3408 else:
3409 mode = "approx"
3411 n_zero = np.sum(d == 0)
3412 if n_zero > 0 and mode == "exact":
3413 mode = "approx"
3414 warnings.warn("Exact p-value calculation does not work if there are "
3415 "zeros. Switching to normal approximation.")
3417 if mode == "approx":
3418 if zero_method in ["wilcox", "pratt"]:
3419 if n_zero == len(d):
3420 raise ValueError("zero_method 'wilcox' and 'pratt' do not "
3421 "work if x - y is zero for all elements.")
3422 if zero_method == "wilcox":
3423 # Keep all non-zero differences
3424 d = compress(np.not_equal(d, 0), d)
3426 count = len(d)
3427 if count < 10 and mode == "approx":
3428 warnings.warn("Sample size too small for normal approximation.")
3430 r = _stats_py.rankdata(abs(d))
3431 r_plus = np.sum((d > 0) * r)
3432 r_minus = np.sum((d < 0) * r)
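    # With zero_method='zsplit', the ranks of zero differences are split
    # evenly between the positive and negative rank sums below.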
3434 if zero_method == "zsplit":
3435 r_zero = np.sum((d == 0) * r)
3436 r_plus += r_zero / 2.
3437 r_minus += r_zero / 2.
3439 # return min for two-sided test, but r_plus for one-sided test
3440 # the literature is not consistent here
3441 # r_plus is more informative since r_plus + r_minus = count*(count+1)/2,
3442 # i.e. the sum of the ranks, so r_minus and the min can be inferred
3443 # (If zero_method='pratt', r_plus + r_minus = count*(count+1)/2 - r_zero.)
3444 # [3] uses the r_plus for the one-sided test, keep min for two-sided test
3445 # to keep backwards compatibility
3446 if alternative == "two-sided":
3447 T = min(r_plus, r_minus)
3448 else:
3449 T = r_plus
3451 if mode == "approx":
3452 mn = count * (count + 1.) * 0.25
3453 se = count * (count + 1.) * (2. * count + 1.)
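        # At this point `mn` is the null mean of the signed-rank sum and `se`
        # holds 24x its null variance; after the zero/tie adjustments below,
        # `se` is converted to the standard error via sqrt(se / 24).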
3455 if zero_method == "pratt":
3456 r = r[d != 0]
3457 # normal approximation needs to be adjusted, see Cureton (1967)
3458 mn -= n_zero * (n_zero + 1.) * 0.25
3459 se -= n_zero * (n_zero + 1.) * (2. * n_zero + 1.)
3461 replist, repnum = find_repeats(r)
3462 if repnum.size != 0:
3463 # Correction for repeated elements.
3464 se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()
3466 se = sqrt(se / 24)
3468 # apply continuity correction if applicable
3469 d = 0
3470 if correction:
3471 if alternative == "two-sided":
3472 d = 0.5 * np.sign(T - mn)
3473 elif alternative == "less":
3474 d = -0.5
3475 else:
3476 d = 0.5
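        # The 0.5 shift moves T toward the null mean (two-sided) or in the
        # direction of the alternative hypothesis, accounting for the
        # discreteness of the rank-sum statistic.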
3478 # compute statistic and p-value using normal approximation
3479 z = (T - mn - d) / se
3480 if alternative == "two-sided":
3481 prob = 2. * distributions.norm.sf(abs(z))
3482 elif alternative == "greater":
3483 # large T = r_plus indicates x is greater than y; i.e.
3484 # accept alternative in that case and return small p-value (sf)
3485 prob = distributions.norm.sf(z)
3486 else:
3487 prob = distributions.norm.cdf(z)
3488 elif mode == "exact":
3489 # get pmf of the possible positive ranksums r_plus
3490 pmf = _get_wilcoxon_distr(count)
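        # pmf[k] is the null probability that the sum of positive ranks
        # equals k, for k = 0, 1, ..., count*(count+1)/2.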
3491 # note: r_plus is int (ties not allowed), need int for slices below
3492 r_plus = int(r_plus)
3493 if alternative == "two-sided":
3494 if r_plus == (len(pmf) - 1) // 2:
3495 # r_plus is the center of the distribution.
3496 prob = 1.0
3497 else:
3498 p_less = np.sum(pmf[:r_plus + 1])
3499 p_greater = np.sum(pmf[r_plus:])
3500 prob = 2*min(p_greater, p_less)
3501 elif alternative == "greater":
3502 prob = np.sum(pmf[r_plus:])
3503 else:
3504 prob = np.sum(pmf[:r_plus + 1])
3505 prob = np.clip(prob, 0, 1)
3507 res = WilcoxonResult(T, prob)
3508 if method == 'approx':
3509 res.zstatistic = z
3510 return res
3513MedianTestResult = _make_tuple_bunch(
3514 'MedianTestResult',
3515 ['statistic', 'pvalue', 'median', 'table'], []
3516)
3519def median_test(*samples, ties='below', correction=True, lambda_=1,
3520 nan_policy='propagate'):
3521 """Perform a Mood's median test.
3523 Test that two or more samples come from populations with the same median.
3525 Let ``n = len(samples)`` be the number of samples. The "grand median" of
3526 all the data is computed, and a contingency table is formed by
3527 classifying the values in each sample as being above or below the grand
3528 median. The contingency table, along with `correction` and `lambda_`,
3529 are passed to `scipy.stats.chi2_contingency` to compute the test statistic
3530 and p-value.
3532 Parameters
3533 ----------
3534 sample1, sample2, ... : array_like
3535 The set of samples. There must be at least two samples.
3536 Each sample must be a one-dimensional sequence containing at least
3537 one value. The samples are not required to have the same length.
3538 ties : str, optional
3539 Determines how values equal to the grand median are classified in
3540 the contingency table. The string must be one of::
3542 "below":
3543 Values equal to the grand median are counted as "below".
3544 "above":
3545 Values equal to the grand median are counted as "above".
3546 "ignore":
3547 Values equal to the grand median are not counted.
3549 The default is "below".
3550 correction : bool, optional
3551 If True, *and* there are just two samples, apply Yates' correction
3552 for continuity when computing the test statistic associated with
3553 the contingency table. Default is True.
3554 lambda_ : float or str, optional
3555 By default, the statistic computed in this test is Pearson's
3556 chi-squared statistic. `lambda_` allows a statistic from the
3557 Cressie-Read power divergence family to be used instead. See
3558 `power_divergence` for details.
3559 Default is 1 (Pearson's chi-squared statistic).
3560 nan_policy : {'propagate', 'raise', 'omit'}, optional
3561 Defines how to handle input that contains nan. 'propagate' returns nan,
3562 'raise' raises an error, and 'omit' performs the calculations ignoring nan
3563 values. Default is 'propagate'.
3565 Returns
3566 -------
3567 res : MedianTestResult
3568 An object containing attributes:
3570 statistic : float
3571 The test statistic. The statistic that is returned is determined
3572 by `lambda_`. The default is Pearson's chi-squared statistic.
3573 pvalue : float
3574 The p-value of the test.
3575 median : float
3576 The grand median.
3577 table : ndarray
3578 The contingency table. The shape of the table is (2, n), where
3579 n is the number of samples. The first row holds the counts of the
3580 values above the grand median, and the second row holds the counts
3581 of the values below the grand median. The table allows further
3582 analysis with, for example, `scipy.stats.chi2_contingency`, or with
3583 `scipy.stats.fisher_exact` if there are two samples, without having
3584 to recompute the table. If ``nan_policy`` is "propagate" and there
3585 are nans in the input, the return value for ``table`` is ``None``.
3587 See Also
3588 --------
3589 kruskal : Compute the Kruskal-Wallis H-test for independent samples.
3590 mannwhitneyu : Compute the Mann-Whitney rank test on samples x and y.
3592 Notes
3593 -----
3594 .. versionadded:: 0.15.0
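    A minimal sketch of the procedure described above, assuming two 1-D
    samples ``a`` and ``b`` with no nans and the default ``ties='below'``:
    .. code-block:: python
        import numpy as np
        from scipy.stats import chi2_contingency
        grand_median = np.median(np.concatenate([a, b]))
        above = [np.sum(s > grand_median) for s in (a, b)]
        below = [np.sum(s <= grand_median) for s in (a, b)]  # ties counted as "below"
        table = np.array([above, below])
        # Yates' continuity correction is applied by default for a 2x2 table,
        # matching ``correction=True``.
        stat, p, dof, expected = chi2_contingency(table)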
3596 References
3597 ----------
3598 .. [1] Mood, A. M., Introduction to the Theory of Statistics. McGraw-Hill
3599 (1950), pp. 394-399.
3600 .. [2] Zar, J. H., Biostatistical Analysis, 5th ed. Prentice Hall (2010).
3601 See Sections 8.12 and 10.15.
3603 Examples
3604 --------
3605 A biologist runs an experiment in which there are three groups of plants.
3606 Group 1 has 16 plants, group 2 has 15 plants, and group 3 has 17 plants.
3607 Each plant produces a number of seeds. The seed counts for each group
3608 are::
3610 Group 1: 10 14 14 18 20 22 24 25 31 31 32 39 43 43 48 49
3611 Group 2: 28 30 31 33 34 35 36 40 44 55 57 61 91 92 99
3612 Group 3: 0 3 9 22 23 25 25 33 34 34 40 45 46 48 62 67 84
3614 The following code applies Mood's median test to these samples.
3616 >>> g1 = [10, 14, 14, 18, 20, 22, 24, 25, 31, 31, 32, 39, 43, 43, 48, 49]
3617 >>> g2 = [28, 30, 31, 33, 34, 35, 36, 40, 44, 55, 57, 61, 91, 92, 99]
3618 >>> g3 = [0, 3, 9, 22, 23, 25, 25, 33, 34, 34, 40, 45, 46, 48, 62, 67, 84]
3619 >>> from scipy.stats import median_test
3620 >>> res = median_test(g1, g2, g3)
3622 The median is
3624 >>> res.median
3625 34.0
3627 and the contingency table is
3629 >>> res.table
3630 array([[ 5, 10, 7],
3631 [11, 5, 10]])
3633 The p-value is too large to conclude that the medians are not the same:
3635 >>> res.pvalue
3636 0.12609082774093244
3638 The "G-test" can be performed by passing ``lambda_="log-likelihood"`` to
3639 `median_test`.
3641 >>> res = median_test(g1, g2, g3, lambda_="log-likelihood")
3642 >>> res.pvalue
3643 0.12224779737117837
3645 The median occurs several times in the data, so we'll get a different
3646 result if, for example, ``ties="above"`` is used:
3648 >>> res = median_test(g1, g2, g3, ties="above")
3649 >>> res.pvalue
3650 0.063873276069553273
3652 >>> res.table
3653 array([[ 5, 11, 9],
3654 [11, 4, 8]])
3656 This example demonstrates that if the data set is not large and there
3657 are values equal to the median, the p-value can be sensitive to the
3658 choice of `ties`.
3660 """
3661 if len(samples) < 2:
3662 raise ValueError('median_test requires two or more samples.')
3664 ties_options = ['below', 'above', 'ignore']
3665 if ties not in ties_options:
3666 raise ValueError("invalid 'ties' option '%s'; 'ties' must be one "
3667 "of: %s" % (ties, str(ties_options)[1:-1]))
3669 data = [np.asarray(sample) for sample in samples]
3671 # Validate the sizes and shapes of the arguments.
3672 for k, d in enumerate(data):
3673 if d.size == 0:
3674 raise ValueError("Sample %d is empty. All samples must "
3675 "contain at least one value." % (k + 1))
3676 if d.ndim != 1:
3677 raise ValueError("Sample %d has %d dimensions. All "
3678 "samples must be one-dimensional sequences." %
3679 (k + 1, d.ndim))
3681 cdata = np.concatenate(data)
3682 contains_nan, nan_policy = _contains_nan(cdata, nan_policy)
3683 if contains_nan and nan_policy == 'propagate':
3684 return MedianTestResult(np.nan, np.nan, np.nan, None)
3686 if contains_nan:
3687 grand_median = np.median(cdata[~np.isnan(cdata)])
3688 else:
3689 grand_median = np.median(cdata)
3690 # Note: now that np.nanmedian is available in all supported numpy
3691 # versions, the if/else above could be replaced by the single line:
3692 # grand_median = np.nanmedian(cdata)
3694 # Create the contingency table.
3695 table = np.zeros((2, len(data)), dtype=np.int64)
3696 for k, sample in enumerate(data):
3697 sample = sample[~np.isnan(sample)]
3699 nabove = count_nonzero(sample > grand_median)
3700 nbelow = count_nonzero(sample < grand_median)
3701 nequal = sample.size - (nabove + nbelow)
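        # nequal counts values tied with the grand median; the `ties` option
        # below determines whether they are added to the "above" row, the
        # "below" row, or neither.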
3702 table[0, k] += nabove
3703 table[1, k] += nbelow
3704 if ties == "below":
3705 table[1, k] += nequal
3706 elif ties == "above":
3707 table[0, k] += nequal
3709 # Check that no row or column of the table is all zero.
3710 # Such a table can not be given to chi2_contingency, because it would have
3711 # a zero in the table of expected frequencies.
3712 rowsums = table.sum(axis=1)
3713 if rowsums[0] == 0:
3714 raise ValueError("All values are below the grand median (%r)." %
3715 grand_median)
3716 if rowsums[1] == 0:
3717 raise ValueError("All values are above the grand median (%r)." %
3718 grand_median)
3719 if ties == "ignore":
3720 # We already checked that each sample has at least one value, but it
3721 # is possible that all those values equal the grand median. If `ties`
3722 # is "ignore", that would result in a column of zeros in `table`. We
3723 # check for that case here.
3724 zero_cols = np.nonzero((table == 0).all(axis=0))[0]
3725 if len(zero_cols) > 0:
3726 msg = ("All values in sample %d are equal to the grand "
3727 "median (%r), so they are ignored, resulting in an "
3728 "empty sample." % (zero_cols[0] + 1, grand_median))
3729 raise ValueError(msg)
3731 stat, p, dof, expected = chi2_contingency(table, lambda_=lambda_,
3732 correction=correction)
3733 return MedianTestResult(stat, p, grand_median, table)
3736def _circfuncs_common(samples, high, low, nan_policy='propagate'):
3737 # Ensure samples are array-like and size is not zero
3738 samples = np.asarray(samples)
3739 if samples.size == 0:
3740 return np.nan, np.asarray(np.nan), np.asarray(np.nan), None
3742 # Recast samples as radians that range between 0 and 2 pi and calculate
3743 # the sine and cosine
3744 sin_samp = sin((samples - low)*2.*pi / (high - low))
3745 cos_samp = cos((samples - low)*2.*pi / (high - low))
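    # (With the default range, low=0 and high=2*pi, these are simply
    # sin(samples) and cos(samples).)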
3747 # Apply the NaN policy
3748 contains_nan, nan_policy = _contains_nan(samples, nan_policy)
3749 if contains_nan and nan_policy == 'omit':
3750 mask = np.isnan(samples)
3751 # Set the sines and cosines that are NaN to zero
3752 sin_samp[mask] = 0.0
3753 cos_samp[mask] = 0.0
3754 else:
3755 mask = None
3757 return samples, sin_samp, cos_samp, mask
3760def circmean(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
3761 """Compute the circular mean for samples in a range.
3763 Parameters
3764 ----------
3765 samples : array_like
3766 Input array.
3767 high : float or int, optional
3768 High boundary for the sample range. Default is ``2*pi``.
3769 low : float or int, optional
3770 Low boundary for the sample range. Default is 0.
3771 axis : int, optional
3772 Axis along which means are computed. The default is to compute
3773 the mean of the flattened array.
3774 nan_policy : {'propagate', 'raise', 'omit'}, optional
3775 Defines how to handle input that contains nan. 'propagate' returns nan,
3776 'raise' raises an error, and 'omit' performs the calculations ignoring nan
3777 values. Default is 'propagate'.
3779 Returns
3780 -------
3781 circmean : float
3782 Circular mean.
3784 See Also
3785 --------
3786 circstd : Circular standard deviation.
3787 circvar : Circular variance.
3789 Examples
3790 --------
3791 For simplicity, all angles are printed out in degrees.
3793 >>> import numpy as np
3794 >>> from scipy.stats import circmean
3795 >>> import matplotlib.pyplot as plt
3796 >>> angles = np.deg2rad(np.array([20, 30, 330]))
3797 >>> cmean = circmean(angles)
3798 >>> np.rad2deg(cmean)
3799 7.294976657784009
3801 >>> mean = angles.mean()
3802 >>> np.rad2deg(mean)
3803 126.66666666666666
3805 Plot and compare the circular mean against the arithmetic mean.
3807 >>> plt.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
3808 ... np.sin(np.linspace(0, 2*np.pi, 500)),
3809 ... c='k')
3810 >>> plt.scatter(np.cos(angles), np.sin(angles), c='k')
3811 >>> plt.scatter(np.cos(cmean), np.sin(cmean), c='b',
3812 ... label='circmean')
3813 >>> plt.scatter(np.cos(mean), np.sin(mean), c='r', label='mean')
3814 >>> plt.legend()
3815 >>> plt.axis('equal')
3816 >>> plt.show()
3818 """
3819 samples, sin_samp, cos_samp, nmask = _circfuncs_common(samples, high, low,
3820 nan_policy=nan_policy)
3821 sin_sum = sin_samp.sum(axis=axis)
3822 cos_sum = cos_samp.sum(axis=axis)
3823 res = arctan2(sin_sum, cos_sum)
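    # arctan2 returns angles in [-pi, pi]; the block below shifts negative
    # angles into [0, 2*pi) so that the final result maps back onto
    # [low, high).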
3825 mask_nan = ~np.isnan(res)
3826 if mask_nan.ndim > 0:
3827 mask = res[mask_nan] < 0
3828 else:
3829 mask = res < 0
3831 if mask.ndim > 0:
3832 mask_nan[mask_nan] = mask
3833 res[mask_nan] += 2*pi
3834 elif mask:
3835 res += 2*pi
3837 # Set output to NaN if no samples went into the mean
3838 if nmask is not None:
3839 if nmask.all():
3840 res = np.full(shape=res.shape, fill_value=np.nan)
3841 else:
3843 # Find out if any of the axes being averaged consist
3844 # entirely of NaN. If so, set the result (res) to NaN
3844 nshape = 0 if axis is None else axis
3845 smask = nmask.shape[nshape] == nmask.sum(axis=axis)
3846 if smask.any():
3847 res[smask] = np.nan
3849 return res*(high - low)/2.0/pi + low
3852def circvar(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
3853 """Compute the circular variance for samples assumed to be in a range.
3855 Parameters
3856 ----------
3857 samples : array_like
3858 Input array.
3859 high : float or int, optional
3860 High boundary for the sample range. Default is ``2*pi``.
3861 low : float or int, optional
3862 Low boundary for the sample range. Default is 0.
3863 axis : int, optional
3864 Axis along which variances are computed. The default is to compute
3865 the variance of the flattened array.
3866 nan_policy : {'propagate', 'raise', 'omit'}, optional
3867 Defines how to handle input that contains nan. 'propagate' returns nan,
3868 'raise' raises an error, and 'omit' performs the calculations ignoring nan
3869 values. Default is 'propagate'.
3871 Returns
3872 -------
3873 circvar : float
3874 Circular variance.
3876 See Also
3877 --------
3878 circmean : Circular mean.
3879 circstd : Circular standard deviation.
3881 Notes
3882 -----
3883 This uses the following definition of circular variance: ``1-R``, where
3884 ``R`` is the mean resultant length. The
3885 returned value is in the range [0, 1], 0 standing for no variance, and 1
3886 for a large variance. In the limit of small angles, this value is similar
3887 to half the 'linear' variance.
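    A minimal sketch of the calculation, assuming the default range
    (``low=0``, ``high=2*pi``) and no nan values:
    .. code-block:: python
        import numpy as np
        S = np.sin(samples).mean()
        C = np.cos(samples).mean()
        R = np.sqrt(S**2 + C**2)   # mean resultant length
        circvar = 1 - R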
3889 References
3890 ----------
3891 .. [1] Fisher, N.I. *Statistical analysis of circular data*. Cambridge
3892 University Press, 1993.
3894 Examples
3895 --------
3896 >>> import numpy as np
3897 >>> from scipy.stats import circvar
3898 >>> import matplotlib.pyplot as plt
3899 >>> samples_1 = np.array([0.072, -0.158, 0.077, 0.108, 0.286,
3900 ... 0.133, -0.473, -0.001, -0.348, 0.131])
3901 >>> samples_2 = np.array([0.111, -0.879, 0.078, 0.733, 0.421,
3902 ... 0.104, -0.136, -0.867, 0.012, 0.105])
3903 >>> circvar_1 = circvar(samples_1)
3904 >>> circvar_2 = circvar(samples_2)
3906 Plot the samples.
3908 >>> fig, (left, right) = plt.subplots(ncols=2)
3909 >>> for image in (left, right):
3910 ... image.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
3911 ... np.sin(np.linspace(0, 2*np.pi, 500)),
3912 ... c='k')
3913 ... image.axis('equal')
3914 ... image.axis('off')
3915 >>> left.scatter(np.cos(samples_1), np.sin(samples_1), c='k', s=15)
3916 >>> left.set_title(f"circular variance: {np.round(circvar_1, 2)!r}")
3917 >>> right.scatter(np.cos(samples_2), np.sin(samples_2), c='k', s=15)
3918 >>> right.set_title(f"circular variance: {np.round(circvar_2, 2)!r}")
3919 >>> plt.show()
3921 """
3922 samples, sin_samp, cos_samp, mask = _circfuncs_common(samples, high, low,
3923 nan_policy=nan_policy)
3924 if mask is None:
3925 sin_mean = sin_samp.mean(axis=axis)
3926 cos_mean = cos_samp.mean(axis=axis)
3927 else:
3928 nsum = np.asarray(np.sum(~mask, axis=axis).astype(float))
3929 nsum[nsum == 0] = np.nan
3930 sin_mean = sin_samp.sum(axis=axis) / nsum
3931 cos_mean = cos_samp.sum(axis=axis) / nsum
3932 # hypot can go slightly above 1 due to rounding errors
3933 with np.errstate(invalid='ignore'):
3934 R = np.minimum(1, hypot(sin_mean, cos_mean))
3936 res = 1. - R
3937 return res
3940def circstd(samples, high=2*pi, low=0, axis=None, nan_policy='propagate', *,
3941 normalize=False):
3942 """
3943 Compute the circular standard deviation for samples assumed to be in the
3944 range [low, high].
3946 Parameters
3947 ----------
3948 samples : array_like
3949 Input array.
3950 high : float or int, optional
3951 High boundary for the sample range. Default is ``2*pi``.
3952 low : float or int, optional
3953 Low boundary for the sample range. Default is 0.
3954 axis : int, optional
3955 Axis along which standard deviations are computed. The default is
3956 to compute the standard deviation of the flattened array.
3957 nan_policy : {'propagate', 'raise', 'omit'}, optional
3958 Defines how to handle input that contains nan. 'propagate' returns nan,
3959 'raise' raises an error, and 'omit' performs the calculations ignoring nan
3960 values. Default is 'propagate'.
3961 normalize : boolean, optional
3962 If True, the returned value is equal to ``sqrt(-2*log(R))`` and does
3963 not depend on the variable units. If False (default), the returned
3964 value is scaled by ``((high-low)/(2*pi))``.
3966 Returns
3967 -------
3968 circstd : float
3969 Circular standard deviation.
3971 See Also
3972 --------
3973 circmean : Circular mean.
3974 circvar : Circular variance.
3976 Notes
3977 -----
3978 This uses a definition of circular standard deviation from [1]_.
3979 Essentially, the calculation is as follows.
3981 .. code-block:: python
3983 import numpy as np
3984 C = np.cos(samples).mean()
3985 S = np.sin(samples).mean()
3986 R = np.sqrt(C**2 + S**2)
3987 l = 2*np.pi / (high-low)
3988 circstd = np.sqrt(-2*np.log(R)) / l
3990 In the limit of small angles, it returns a number close to the 'linear'
3991 standard deviation.
3993 References
3994 ----------
3995 .. [1] Mardia, K. V. (1972). 2. In *Statistics of Directional Data*
3996 (pp. 18-24). Academic Press. :doi:`10.1016/C2013-0-07425-7`.
3998 Examples
3999 --------
4000 >>> import numpy as np
4001 >>> from scipy.stats import circstd
4002 >>> import matplotlib.pyplot as plt
4003 >>> samples_1 = np.array([0.072, -0.158, 0.077, 0.108, 0.286,
4004 ... 0.133, -0.473, -0.001, -0.348, 0.131])
4005 >>> samples_2 = np.array([0.111, -0.879, 0.078, 0.733, 0.421,
4006 ... 0.104, -0.136, -0.867, 0.012, 0.105])
4007 >>> circstd_1 = circstd(samples_1)
4008 >>> circstd_2 = circstd(samples_2)
4010 Plot the samples.
4012 >>> fig, (left, right) = plt.subplots(ncols=2)
4013 >>> for image in (left, right):
4014 ... image.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
4015 ... np.sin(np.linspace(0, 2*np.pi, 500)),
4016 ... c='k')
4017 ... image.axis('equal')
4018 ... image.axis('off')
4019 >>> left.scatter(np.cos(samples_1), np.sin(samples_1), c='k', s=15)
4020 >>> left.set_title(f"circular std: {np.round(circstd_1, 2)!r}")
4021 >>> right.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
4022 ... np.sin(np.linspace(0, 2*np.pi, 500)),
4023 ... c='k')
4024 >>> right.scatter(np.cos(samples_2), np.sin(samples_2), c='k', s=15)
4025 >>> right.set_title(f"circular std: {np.round(circstd_2, 2)!r}")
4026 >>> plt.show()
4028 """
4029 samples, sin_samp, cos_samp, mask = _circfuncs_common(samples, high, low,
4030 nan_policy=nan_policy)
4031 if mask is None:
4032 sin_mean = sin_samp.mean(axis=axis) # [1] (2.2.3)
4033 cos_mean = cos_samp.mean(axis=axis) # [1] (2.2.3)
4034 else:
4035 nsum = np.asarray(np.sum(~mask, axis=axis).astype(float))
4036 nsum[nsum == 0] = np.nan
4037 sin_mean = sin_samp.sum(axis=axis) / nsum
4038 cos_mean = cos_samp.sum(axis=axis) / nsum
4039 # hypot can go slightly above 1 due to rounding errors
4040 with np.errstate(invalid='ignore'):
4041 R = np.minimum(1, hypot(sin_mean, cos_mean)) # [1] (2.2.4)
4043 res = sqrt(-2*log(R))
4044 if not normalize:
4045 res *= (high-low)/(2.*pi) # [1] (2.3.14) w/ (2.3.7)
4046 return res
4049class DirectionalStats:
4050 def __init__(self, mean_direction, mean_resultant_length):
4051 self.mean_direction = mean_direction
4052 self.mean_resultant_length = mean_resultant_length
4054 def __repr__(self):
4055 return (f"DirectionalStats(mean_direction={self.mean_direction},"
4056 f" mean_resultant_length={self.mean_resultant_length})")
4059def directional_stats(samples, *, axis=0, normalize=True):
4060 """
4061 Computes sample statistics for directional data.
4063 Computes the directional mean (also called the mean direction vector) and
4064 mean resultant length of a sample of vectors.
4066 The directional mean is a measure of "preferred direction" of vector data.
4067 It is analogous to the sample mean, but it is for use when the magnitude
4068 of each observation is irrelevant (e.g. unit vectors).
4070 The mean resultant length is a value between 0 and 1 used to quantify the
4071 dispersion of directional data: the smaller the mean resultant length, the
4072 greater the dispersion. Several definitions of directional variance
4073 involving the mean resultant length are given in [1]_ and [2]_.
4075 Parameters
4076 ----------
4077 samples : array_like
4078 Input array. Must be at least two-dimensional, and the last axis of the
4079 input must correspond with the dimensionality of the vector space.
4080 When the input is exactly two dimensional, this means that each row
4081 of the data is a vector observation.
4082 axis : int, default: 0
4083 Axis along which the directional mean is computed.
4084 normalize : boolean, default: True
4085 If True, normalize the input to ensure that each observation is a
4086 unit vector. If the observations are already unit vectors, consider
4087 setting this to False to avoid unnecessary computation.
4089 Returns
4090 -------
4091 res : DirectionalStats
4092 An object containing attributes:
4094 mean_direction : ndarray
4095 Directional mean.
4096 mean_resultant_length : ndarray
4097 The mean resultant length [1]_.
4099 See also
4100 --------
4101 circmean : circular mean; i.e. directional mean for 2D *angles*
4102 circvar : circular variance; i.e. directional variance for 2D *angles*
4104 Notes
4105 -----
4106 This uses a definition of directional mean from [1]_.
4107 Assuming the observations are unit vectors, the calculation is as follows.
4109 .. code-block:: python
4111 mean = samples.mean(axis=0)
4112 mean_resultant_length = np.linalg.norm(mean)
4113 mean_direction = mean / mean_resultant_length
4115 This definition is appropriate for *directional* data (i.e. vector data
4116 for which the magnitude of each observation is irrelevant) but not
4117 for *axial* data (i.e. vector data for which the magnitude and *sign* of
4118 each observation is irrelevant).
4120 Several definitions of directional variance involving the mean resultant
4121 length ``R`` have been proposed, including ``1 - R`` [1]_, ``1 - R**2``
4122 [2]_, and ``2 * (1 - R)`` [2]_. Rather than choosing one, this function
4123 returns ``R`` as attribute `mean_resultant_length` so the user can compute
4124 their preferred measure of dispersion.
4126 References
4127 ----------
4128 .. [1] Mardia, Jupp. (2000). *Directional Statistics*
4129 (p. 163). Wiley.
4131 .. [2] https://en.wikipedia.org/wiki/Directional_statistics
4133 Examples
4134 --------
4135 >>> import numpy as np
4136 >>> from scipy.stats import directional_stats
4137 >>> data = np.array([[3, 4], # first observation, 2D vector space
4138 ... [6, -8]]) # second observation
4139 >>> dirstats = directional_stats(data)
4140 >>> dirstats.mean_direction
4141 array([1., 0.])
4143 In contrast, the regular sample mean of the vectors would be influenced
4144 by the magnitude of each observation. Furthermore, the result would not be
4145 a unit vector.
4147 >>> data.mean(axis=0)
4148 array([4.5, -2.])
4150 An exemplary use case for `directional_stats` is to find a *meaningful*
4151 center for a set of observations on a sphere, e.g. geographical locations.
4153 >>> data = np.array([[0.8660254, 0.5, 0.],
4154 ... [0.8660254, -0.5, 0.]])
4155 >>> dirstats = directional_stats(data)
4156 >>> dirstats.mean_direction
4157 array([1., 0., 0.])
4159 The regular sample mean on the other hand yields a result which does not
4160 lie on the surface of the sphere.
4162 >>> data.mean(axis=0)
4163 array([0.8660254, 0., 0.])
4165 The function also returns the mean resultant length, which
4166 can be used to calculate a directional variance. For example, using the
4167 definition ``Var(z) = 1 - R`` from [2]_ where ``R`` is the
4168 mean resultant length, we can calculate the directional variance of the
4169 vectors in the above example as:
4171 >>> 1 - dirstats.mean_resultant_length
4172 0.13397459716167093
4173 """
4174 samples = np.asarray(samples)
4175 if samples.ndim < 2:
4176 raise ValueError("samples must be at least two-dimensional. "
4177 f"Instead samples has shape: {samples.shape!r}")
4178 samples = np.moveaxis(samples, axis, 0)
4179 if normalize:
4180 vectornorms = np.linalg.norm(samples, axis=-1, keepdims=True)
4181 samples = samples/vectornorms
4182 mean = np.mean(samples, axis=0)
4183 mean_resultant_length = np.linalg.norm(mean, axis=-1, keepdims=True)
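    # The trailing length-1 axis kept by `keepdims=True` lets the norm
    # broadcast against `mean` in the division below; it is squeezed out of
    # the returned result.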
4184 mean_direction = mean / mean_resultant_length
4185 return DirectionalStats(mean_direction,
4186 mean_resultant_length.squeeze(-1)[()])