Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_morestats.py: 10%

985 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1from __future__ import annotations 

2import math 

3import warnings 

4from collections import namedtuple 

5 

6import numpy as np 

7from numpy import (isscalar, r_, log, around, unique, asarray, zeros, 

8 arange, sort, amin, amax, atleast_1d, sqrt, array, 

9 compress, pi, exp, ravel, count_nonzero, sin, cos, 

10 arctan2, hypot) 

11 

12from scipy import optimize 

13from scipy import special 

14from scipy._lib._bunch import _make_tuple_bunch 

15from scipy._lib._util import _rename_parameter, _contains_nan 

16 

17from . import _statlib 

18from . import _stats_py 

19from ._fit import FitResult 

20from ._stats_py import find_repeats, _normtest_finish, SignificanceResult 

21from .contingency import chi2_contingency 

22from . import distributions 

23from ._distn_infrastructure import rv_generic 

24from ._hypotests import _get_wilcoxon_distr 

25from ._axis_nan_policy import _axis_nan_policy_factory 

26from .._lib.deprecation import _deprecated 

27 

28 

29__all__ = ['mvsdist', 

30 'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot', 

31 'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot', 

32 'shapiro', 'anderson', 'ansari', 'bartlett', 'levene', 'binom_test', 

33 'fligner', 'mood', 'wilcoxon', 'median_test', 

34 'circmean', 'circvar', 'circstd', 'anderson_ksamp', 

35 'yeojohnson_llf', 'yeojohnson', 'yeojohnson_normmax', 

36 'yeojohnson_normplot', 'directional_stats' 

37 ] 

38 

39 

40Mean = namedtuple('Mean', ('statistic', 'minmax')) 

41Variance = namedtuple('Variance', ('statistic', 'minmax')) 

42Std_dev = namedtuple('Std_dev', ('statistic', 'minmax')) 

43 

44 

45def bayes_mvs(data, alpha=0.90): 

46 r""" 

47 Bayesian confidence intervals for the mean, var, and std. 

48 

49 Parameters 

50 ---------- 

51 data : array_like 

52 Input data, if multi-dimensional it is flattened to 1-D by `bayes_mvs`. 

53 Requires 2 or more data points. 

54 alpha : float, optional 

55 Probability that the returned confidence interval contains 

56 the true parameter. 

57 

58 Returns 

59 ------- 

60 mean_cntr, var_cntr, std_cntr : tuple 

61 The three results are for the mean, variance and standard deviation, 

62 respectively. Each result is a tuple of the form:: 

63 

64 (center, (lower, upper)) 

65 

66 with `center` the mean of the conditional pdf of the value given the 

67 data, and `(lower, upper)` a confidence interval, centered on the 

68 median, containing the estimate to a probability ``alpha``. 

69 

70 See Also 

71 -------- 

72 mvsdist 

73 

74 Notes 

75 ----- 

76 Each tuple of mean, variance, and standard deviation estimates represent 

77 the (center, (lower, upper)) with center the mean of the conditional pdf 

78 of the value given the data and (lower, upper) is a confidence interval 

79 centered on the median, containing the estimate to a probability 

80 ``alpha``. 

81 

82 Converts data to 1-D and assumes all data has the same mean and variance. 

83 Uses Jeffrey's prior for variance and std. 

84 

85 Equivalent to ``tuple((x.mean(), x.interval(alpha)) for x in mvsdist(dat))`` 

86 

87 References 

88 ---------- 

89 T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and 

90 standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278, 

91 2006. 

92 

93 Examples 

94 -------- 

95 First a basic example to demonstrate the outputs: 

96 

97 >>> from scipy import stats 

98 >>> data = [6, 9, 12, 7, 8, 8, 13] 

99 >>> mean, var, std = stats.bayes_mvs(data) 

100 >>> mean 

101 Mean(statistic=9.0, minmax=(7.103650222612533, 10.896349777387467)) 

102 >>> var 

103 Variance(statistic=10.0, minmax=(3.176724206..., 24.45910382...)) 

104 >>> std 

105 Std_dev(statistic=2.9724954732045084, minmax=(1.7823367265645143, 4.945614605014631)) 

106 

107 Now we generate some normally distributed random data, and get estimates of 

108 mean and standard deviation with 95% confidence intervals for those 

109 estimates: 

110 

111 >>> n_samples = 100000 

112 >>> data = stats.norm.rvs(size=n_samples) 

113 >>> res_mean, res_var, res_std = stats.bayes_mvs(data, alpha=0.95) 

114 

115 >>> import matplotlib.pyplot as plt 

116 >>> fig = plt.figure() 

117 >>> ax = fig.add_subplot(111) 

118 >>> ax.hist(data, bins=100, density=True, label='Histogram of data') 

119 >>> ax.vlines(res_mean.statistic, 0, 0.5, colors='r', label='Estimated mean') 

120 >>> ax.axvspan(res_mean.minmax[0],res_mean.minmax[1], facecolor='r', 

121 ... alpha=0.2, label=r'Estimated mean (95% limits)') 

122 >>> ax.vlines(res_std.statistic, 0, 0.5, colors='g', label='Estimated scale') 

123 >>> ax.axvspan(res_std.minmax[0],res_std.minmax[1], facecolor='g', alpha=0.2, 

124 ... label=r'Estimated scale (95% limits)') 

125 

126 >>> ax.legend(fontsize=10) 

127 >>> ax.set_xlim([-4, 4]) 

128 >>> ax.set_ylim([0, 0.5]) 

129 >>> plt.show() 

130 

131 """ 

132 m, v, s = mvsdist(data) 

133 if alpha >= 1 or alpha <= 0: 

134 raise ValueError("0 < alpha < 1 is required, but alpha=%s was given." 

135 % alpha) 

136 

137 m_res = Mean(m.mean(), m.interval(alpha)) 

138 v_res = Variance(v.mean(), v.interval(alpha)) 

139 s_res = Std_dev(s.mean(), s.interval(alpha)) 

140 

141 return m_res, v_res, s_res 

142 

143 

144def mvsdist(data): 

145 """ 

146 'Frozen' distributions for mean, variance, and standard deviation of data. 

147 

148 Parameters 

149 ---------- 

150 data : array_like 

151 Input array. Converted to 1-D using ravel. 

152 Requires 2 or more data-points. 

153 

154 Returns 

155 ------- 

156 mdist : "frozen" distribution object 

157 Distribution object representing the mean of the data. 

158 vdist : "frozen" distribution object 

159 Distribution object representing the variance of the data. 

160 sdist : "frozen" distribution object 

161 Distribution object representing the standard deviation of the data. 

162 

163 See Also 

164 -------- 

165 bayes_mvs 

166 

167 Notes 

168 ----- 

169 The return values from ``bayes_mvs(data)`` is equivalent to 

170 ``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``. 

171 

172 In other words, calling ``<dist>.mean()`` and ``<dist>.interval(0.90)`` 

173 on the three distribution objects returned from this function will give 

174 the same results that are returned from `bayes_mvs`. 

175 

176 References 

177 ---------- 

178 T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and 

179 standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278, 

180 2006. 

181 

182 Examples 

183 -------- 

184 >>> from scipy import stats 

185 >>> data = [6, 9, 12, 7, 8, 8, 13] 

186 >>> mean, var, std = stats.mvsdist(data) 

187 

188 We now have frozen distribution objects "mean", "var" and "std" that we can 

189 examine: 

190 

191 >>> mean.mean() 

192 9.0 

193 >>> mean.interval(0.95) 

194 (6.6120585482655692, 11.387941451734431) 

195 >>> mean.std() 

196 1.1952286093343936 

197 

198 """ 

199 x = ravel(data) 

200 n = len(x) 

201 if n < 2: 

202 raise ValueError("Need at least 2 data-points.") 

203 xbar = x.mean() 

204 C = x.var() 

205 if n > 1000: # gaussian approximations for large n 

206 mdist = distributions.norm(loc=xbar, scale=math.sqrt(C / n)) 

207 sdist = distributions.norm(loc=math.sqrt(C), scale=math.sqrt(C / (2. * n))) 

208 vdist = distributions.norm(loc=C, scale=math.sqrt(2.0 / n) * C) 

209 else: 

210 nm1 = n - 1 

211 fac = n * C / 2. 

212 val = nm1 / 2. 

213 mdist = distributions.t(nm1, loc=xbar, scale=math.sqrt(C / nm1)) 

214 sdist = distributions.gengamma(val, -2, scale=math.sqrt(fac)) 

215 vdist = distributions.invgamma(val, scale=fac) 

216 return mdist, vdist, sdist 

217 

218 

219@_axis_nan_policy_factory( 

220 lambda x: x, result_to_tuple=lambda x: (x,), n_outputs=1, default_axis=None 

221) 

222def kstat(data, n=2): 

223 r""" 

224 Return the nth k-statistic (1<=n<=4 so far). 

225 

226 The nth k-statistic k_n is the unique symmetric unbiased estimator of the 

227 nth cumulant kappa_n. 

228 

229 Parameters 

230 ---------- 

231 data : array_like 

232 Input array. Note that n-D input gets flattened. 

233 n : int, {1, 2, 3, 4}, optional 

234 Default is equal to 2. 

235 

236 Returns 

237 ------- 

238 kstat : float 

239 The nth k-statistic. 

240 

241 See Also 

242 -------- 

243 kstatvar: Returns an unbiased estimator of the variance of the k-statistic. 

244 moment: Returns the n-th central moment about the mean for a sample. 

245 

246 Notes 

247 ----- 

248 For a sample size n, the first few k-statistics are given by: 

249 

250 .. math:: 

251 

252 k_{1} = \mu 

253 k_{2} = \frac{n}{n-1} m_{2} 

254 k_{3} = \frac{ n^{2} } {(n-1) (n-2)} m_{3} 

255 k_{4} = \frac{ n^{2} [(n + 1)m_{4} - 3(n - 1) m^2_{2}]} {(n-1) (n-2) (n-3)} 

256 

257 where :math:`\mu` is the sample mean, :math:`m_2` is the sample 

258 variance, and :math:`m_i` is the i-th sample central moment. 

259 

260 References 

261 ---------- 

262 http://mathworld.wolfram.com/k-Statistic.html 

263 

264 http://mathworld.wolfram.com/Cumulant.html 

265 

266 Examples 

267 -------- 

268 >>> from scipy import stats 

269 >>> from numpy.random import default_rng 

270 >>> rng = default_rng() 

271 

272 As sample size increases, n-th moment and n-th k-statistic converge to the 

273 same number (although they aren't identical). In the case of the normal 

274 distribution, they converge to zero. 

275 

276 >>> for n in [2, 3, 4, 5, 6, 7]: 

277 ... x = rng.normal(size=10**n) 

278 ... m, k = stats.moment(x, 3), stats.kstat(x, 3) 

279 ... print("%.3g %.3g %.3g" % (m, k, m-k)) 

280 -0.631 -0.651 0.0194 # random 

281 0.0282 0.0283 -8.49e-05 

282 -0.0454 -0.0454 1.36e-05 

283 7.53e-05 7.53e-05 -2.26e-09 

284 0.00166 0.00166 -4.99e-09 

285 -2.88e-06 -2.88e-06 8.63e-13 

286 """ 

287 if n > 4 or n < 1: 

288 raise ValueError("k-statistics only supported for 1<=n<=4") 

289 n = int(n) 

290 S = np.zeros(n + 1, np.float64) 

291 data = ravel(data) 

292 N = data.size 

293 

294 # raise ValueError on empty input 

295 if N == 0: 

296 raise ValueError("Data input must not be empty") 

297 

298 # on nan input, return nan without warning 

299 if np.isnan(np.sum(data)): 

300 return np.nan 

301 

302 for k in range(1, n + 1): 

303 S[k] = np.sum(data**k, axis=0) 

304 if n == 1: 

305 return S[1] * 1.0/N 

306 elif n == 2: 

307 return (N*S[2] - S[1]**2.0) / (N*(N - 1.0)) 

308 elif n == 3: 

309 return (2*S[1]**3 - 3*N*S[1]*S[2] + N*N*S[3]) / (N*(N - 1.0)*(N - 2.0)) 

310 elif n == 4: 

311 return ((-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 - 

312 4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) / 

313 (N*(N-1.0)*(N-2.0)*(N-3.0))) 

314 else: 

315 raise ValueError("Should not be here.") 

316 

317 

318@_axis_nan_policy_factory( 

319 lambda x: x, result_to_tuple=lambda x: (x,), n_outputs=1, default_axis=None 

320) 

321def kstatvar(data, n=2): 

322 r"""Return an unbiased estimator of the variance of the k-statistic. 

323 

324 See `kstat` for more details of the k-statistic. 

325 

326 Parameters 

327 ---------- 

328 data : array_like 

329 Input array. Note that n-D input gets flattened. 

330 n : int, {1, 2}, optional 

331 Default is equal to 2. 

332 

333 Returns 

334 ------- 

335 kstatvar : float 

336 The nth k-statistic variance. 

337 

338 See Also 

339 -------- 

340 kstat: Returns the n-th k-statistic. 

341 moment: Returns the n-th central moment about the mean for a sample. 

342 

343 Notes 

344 ----- 

345 The variances of the first few k-statistics are given by: 

346 

347 .. math:: 

348 

349 var(k_{1}) = \frac{\kappa^2}{n} 

350 var(k_{2}) = \frac{\kappa^4}{n} + \frac{2\kappa^2_{2}}{n - 1} 

351 var(k_{3}) = \frac{\kappa^6}{n} + \frac{9 \kappa_2 \kappa_4}{n - 1} + 

352 \frac{9 \kappa^2_{3}}{n - 1} + 

353 \frac{6 n \kappa^3_{2}}{(n-1) (n-2)} 

354 var(k_{4}) = \frac{\kappa^8}{n} + \frac{16 \kappa_2 \kappa_6}{n - 1} + 

355 \frac{48 \kappa_{3} \kappa_5}{n - 1} + 

356 \frac{34 \kappa^2_{4}}{n-1} + \frac{72 n \kappa^2_{2} \kappa_4}{(n - 1) (n - 2)} + 

357 \frac{144 n \kappa_{2} \kappa^2_{3}}{(n - 1) (n - 2)} + 

358 \frac{24 (n + 1) n \kappa^4_{2}}{(n - 1) (n - 2) (n - 3)} 

359 """ 

360 data = ravel(data) 

361 N = len(data) 

362 if n == 1: 

363 return kstat(data, n=2) * 1.0/N 

364 elif n == 2: 

365 k2 = kstat(data, n=2) 

366 k4 = kstat(data, n=4) 

367 return (2*N*k2**2 + (N-1)*k4) / (N*(N+1)) 

368 else: 

369 raise ValueError("Only n=1 or n=2 supported.") 

370 

371 

372def _calc_uniform_order_statistic_medians(n): 

373 """Approximations of uniform order statistic medians. 

374 

375 Parameters 

376 ---------- 

377 n : int 

378 Sample size. 

379 

380 Returns 

381 ------- 

382 v : 1d float array 

383 Approximations of the order statistic medians. 

384 

385 References 

386 ---------- 

387 .. [1] James J. Filliben, "The Probability Plot Correlation Coefficient 

388 Test for Normality", Technometrics, Vol. 17, pp. 111-117, 1975. 

389 

390 Examples 

391 -------- 

392 Order statistics of the uniform distribution on the unit interval 

393 are marginally distributed according to beta distributions. 

394 The expectations of these order statistic are evenly spaced across 

395 the interval, but the distributions are skewed in a way that 

396 pushes the medians slightly towards the endpoints of the unit interval: 

397 

398 >>> import numpy as np 

399 >>> n = 4 

400 >>> k = np.arange(1, n+1) 

401 >>> from scipy.stats import beta 

402 >>> a = k 

403 >>> b = n-k+1 

404 >>> beta.mean(a, b) 

405 array([0.2, 0.4, 0.6, 0.8]) 

406 >>> beta.median(a, b) 

407 array([0.15910358, 0.38572757, 0.61427243, 0.84089642]) 

408 

409 The Filliben approximation uses the exact medians of the smallest 

410 and greatest order statistics, and the remaining medians are approximated 

411 by points spread evenly across a sub-interval of the unit interval: 

412 

413 >>> from scipy.stats._morestats import _calc_uniform_order_statistic_medians 

414 >>> _calc_uniform_order_statistic_medians(n) 

415 array([0.15910358, 0.38545246, 0.61454754, 0.84089642]) 

416 

417 This plot shows the skewed distributions of the order statistics 

418 of a sample of size four from a uniform distribution on the unit interval: 

419 

420 >>> import matplotlib.pyplot as plt 

421 >>> x = np.linspace(0.0, 1.0, num=50, endpoint=True) 

422 >>> pdfs = [beta.pdf(x, a[i], b[i]) for i in range(n)] 

423 >>> plt.figure() 

424 >>> plt.plot(x, pdfs[0], x, pdfs[1], x, pdfs[2], x, pdfs[3]) 

425 

426 """ 

427 v = np.empty(n, dtype=np.float64) 

428 v[-1] = 0.5**(1.0 / n) 

429 v[0] = 1 - v[-1] 

430 i = np.arange(2, n) 

431 v[1:-1] = (i - 0.3175) / (n + 0.365) 

432 return v 

433 

434 

435def _parse_dist_kw(dist, enforce_subclass=True): 

436 """Parse `dist` keyword. 

437 

438 Parameters 

439 ---------- 

440 dist : str or stats.distributions instance. 

441 Several functions take `dist` as a keyword, hence this utility 

442 function. 

443 enforce_subclass : bool, optional 

444 If True (default), `dist` needs to be a 

445 `_distn_infrastructure.rv_generic` instance. 

446 It can sometimes be useful to set this keyword to False, if a function 

447 wants to accept objects that just look somewhat like such an instance 

448 (for example, they have a ``ppf`` method). 

449 

450 """ 

451 if isinstance(dist, rv_generic): 

452 pass 

453 elif isinstance(dist, str): 

454 try: 

455 dist = getattr(distributions, dist) 

456 except AttributeError as e: 

457 raise ValueError("%s is not a valid distribution name" % dist) from e 

458 elif enforce_subclass: 

459 msg = ("`dist` should be a stats.distributions instance or a string " 

460 "with the name of such a distribution.") 

461 raise ValueError(msg) 

462 

463 return dist 

464 

465 

466def _add_axis_labels_title(plot, xlabel, ylabel, title): 

467 """Helper function to add axes labels and a title to stats plots.""" 

468 try: 

469 if hasattr(plot, 'set_title'): 

470 # Matplotlib Axes instance or something that looks like it 

471 plot.set_title(title) 

472 plot.set_xlabel(xlabel) 

473 plot.set_ylabel(ylabel) 

474 else: 

475 # matplotlib.pyplot module 

476 plot.title(title) 

477 plot.xlabel(xlabel) 

478 plot.ylabel(ylabel) 

479 except Exception: 

480 # Not an MPL object or something that looks (enough) like it. 

481 # Don't crash on adding labels or title 

482 pass 

483 

484 

485def probplot(x, sparams=(), dist='norm', fit=True, plot=None, rvalue=False): 

486 """ 

487 Calculate quantiles for a probability plot, and optionally show the plot. 

488 

489 Generates a probability plot of sample data against the quantiles of a 

490 specified theoretical distribution (the normal distribution by default). 

491 `probplot` optionally calculates a best-fit line for the data and plots the 

492 results using Matplotlib or a given plot function. 

493 

494 Parameters 

495 ---------- 

496 x : array_like 

497 Sample/response data from which `probplot` creates the plot. 

498 sparams : tuple, optional 

499 Distribution-specific shape parameters (shape parameters plus location 

500 and scale). 

501 dist : str or stats.distributions instance, optional 

502 Distribution or distribution function name. The default is 'norm' for a 

503 normal probability plot. Objects that look enough like a 

504 stats.distributions instance (i.e. they have a ``ppf`` method) are also 

505 accepted. 

506 fit : bool, optional 

507 Fit a least-squares regression (best-fit) line to the sample data if 

508 True (default). 

509 plot : object, optional 

510 If given, plots the quantiles. 

511 If given and `fit` is True, also plots the least squares fit. 

512 `plot` is an object that has to have methods "plot" and "text". 

513 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, 

514 or a custom object with the same methods. 

515 Default is None, which means that no plot is created. 

516 rvalue : bool, optional 

517 If `plot` is provided and `fit` is True, setting `rvalue` to True 

518 includes the coefficient of determination on the plot. 

519 Default is False. 

520 

521 Returns 

522 ------- 

523 (osm, osr) : tuple of ndarrays 

524 Tuple of theoretical quantiles (osm, or order statistic medians) and 

525 ordered responses (osr). `osr` is simply sorted input `x`. 

526 For details on how `osm` is calculated see the Notes section. 

527 (slope, intercept, r) : tuple of floats, optional 

528 Tuple containing the result of the least-squares fit, if that is 

529 performed by `probplot`. `r` is the square root of the coefficient of 

530 determination. If ``fit=False`` and ``plot=None``, this tuple is not 

531 returned. 

532 

533 Notes 

534 ----- 

535 Even if `plot` is given, the figure is not shown or saved by `probplot`; 

536 ``plt.show()`` or ``plt.savefig('figname.png')`` should be used after 

537 calling `probplot`. 

538 

539 `probplot` generates a probability plot, which should not be confused with 

540 a Q-Q or a P-P plot. Statsmodels has more extensive functionality of this 

541 type, see ``statsmodels.api.ProbPlot``. 

542 

543 The formula used for the theoretical quantiles (horizontal axis of the 

544 probability plot) is Filliben's estimate:: 

545 

546 quantiles = dist.ppf(val), for 

547 

548 0.5**(1/n), for i = n 

549 val = (i - 0.3175) / (n + 0.365), for i = 2, ..., n-1 

550 1 - 0.5**(1/n), for i = 1 

551 

552 where ``i`` indicates the i-th ordered value and ``n`` is the total number 

553 of values. 

554 

555 Examples 

556 -------- 

557 >>> import numpy as np 

558 >>> from scipy import stats 

559 >>> import matplotlib.pyplot as plt 

560 >>> nsample = 100 

561 >>> rng = np.random.default_rng() 

562 

563 A t distribution with small degrees of freedom: 

564 

565 >>> ax1 = plt.subplot(221) 

566 >>> x = stats.t.rvs(3, size=nsample, random_state=rng) 

567 >>> res = stats.probplot(x, plot=plt) 

568 

569 A t distribution with larger degrees of freedom: 

570 

571 >>> ax2 = plt.subplot(222) 

572 >>> x = stats.t.rvs(25, size=nsample, random_state=rng) 

573 >>> res = stats.probplot(x, plot=plt) 

574 

575 A mixture of two normal distributions with broadcasting: 

576 

577 >>> ax3 = plt.subplot(223) 

578 >>> x = stats.norm.rvs(loc=[0,5], scale=[1,1.5], 

579 ... size=(nsample//2,2), random_state=rng).ravel() 

580 >>> res = stats.probplot(x, plot=plt) 

581 

582 A standard normal distribution: 

583 

584 >>> ax4 = plt.subplot(224) 

585 >>> x = stats.norm.rvs(loc=0, scale=1, size=nsample, random_state=rng) 

586 >>> res = stats.probplot(x, plot=plt) 

587 

588 Produce a new figure with a loggamma distribution, using the ``dist`` and 

589 ``sparams`` keywords: 

590 

591 >>> fig = plt.figure() 

592 >>> ax = fig.add_subplot(111) 

593 >>> x = stats.loggamma.rvs(c=2.5, size=500, random_state=rng) 

594 >>> res = stats.probplot(x, dist=stats.loggamma, sparams=(2.5,), plot=ax) 

595 >>> ax.set_title("Probplot for loggamma dist with shape parameter 2.5") 

596 

597 Show the results with Matplotlib: 

598 

599 >>> plt.show() 

600 

601 """ 

602 x = np.asarray(x) 

603 if x.size == 0: 

604 if fit: 

605 return (x, x), (np.nan, np.nan, 0.0) 

606 else: 

607 return x, x 

608 

609 osm_uniform = _calc_uniform_order_statistic_medians(len(x)) 

610 dist = _parse_dist_kw(dist, enforce_subclass=False) 

611 if sparams is None: 

612 sparams = () 

613 if isscalar(sparams): 

614 sparams = (sparams,) 

615 if not isinstance(sparams, tuple): 

616 sparams = tuple(sparams) 

617 

618 osm = dist.ppf(osm_uniform, *sparams) 

619 osr = sort(x) 

620 if fit: 

621 # perform a linear least squares fit. 

622 slope, intercept, r, prob, _ = _stats_py.linregress(osm, osr) 

623 

624 if plot is not None: 

625 plot.plot(osm, osr, 'bo') 

626 if fit: 

627 plot.plot(osm, slope*osm + intercept, 'r-') 

628 _add_axis_labels_title(plot, xlabel='Theoretical quantiles', 

629 ylabel='Ordered Values', 

630 title='Probability Plot') 

631 

632 # Add R^2 value to the plot as text 

633 if fit and rvalue: 

634 xmin = amin(osm) 

635 xmax = amax(osm) 

636 ymin = amin(x) 

637 ymax = amax(x) 

638 posx = xmin + 0.70 * (xmax - xmin) 

639 posy = ymin + 0.01 * (ymax - ymin) 

640 plot.text(posx, posy, "$R^2=%1.4f$" % r**2) 

641 

642 if fit: 

643 return (osm, osr), (slope, intercept, r) 

644 else: 

645 return osm, osr 

646 

647 

648def ppcc_max(x, brack=(0.0, 1.0), dist='tukeylambda'): 

649 """Calculate the shape parameter that maximizes the PPCC. 

650 

651 The probability plot correlation coefficient (PPCC) plot can be used 

652 to determine the optimal shape parameter for a one-parameter family 

653 of distributions. ``ppcc_max`` returns the shape parameter that would 

654 maximize the probability plot correlation coefficient for the given 

655 data to a one-parameter family of distributions. 

656 

657 Parameters 

658 ---------- 

659 x : array_like 

660 Input array. 

661 brack : tuple, optional 

662 Triple (a,b,c) where (a<b<c). If bracket consists of two numbers (a, c) 

663 then they are assumed to be a starting interval for a downhill bracket 

664 search (see `scipy.optimize.brent`). 

665 dist : str or stats.distributions instance, optional 

666 Distribution or distribution function name. Objects that look enough 

667 like a stats.distributions instance (i.e. they have a ``ppf`` method) 

668 are also accepted. The default is ``'tukeylambda'``. 

669 

670 Returns 

671 ------- 

672 shape_value : float 

673 The shape parameter at which the probability plot correlation 

674 coefficient reaches its max value. 

675 

676 See Also 

677 -------- 

678 ppcc_plot, probplot, boxcox 

679 

680 Notes 

681 ----- 

682 The brack keyword serves as a starting point which is useful in corner 

683 cases. One can use a plot to obtain a rough visual estimate of the location 

684 for the maximum to start the search near it. 

685 

686 References 

687 ---------- 

688 .. [1] J.J. Filliben, "The Probability Plot Correlation Coefficient Test 

689 for Normality", Technometrics, Vol. 17, pp. 111-117, 1975. 

690 .. [2] Engineering Statistics Handbook, NIST/SEMATEC, 

691 https://www.itl.nist.gov/div898/handbook/eda/section3/ppccplot.htm 

692 

693 Examples 

694 -------- 

695 First we generate some random data from a Weibull distribution 

696 with shape parameter 2.5: 

697 

698 >>> import numpy as np 

699 >>> from scipy import stats 

700 >>> import matplotlib.pyplot as plt 

701 >>> rng = np.random.default_rng() 

702 >>> c = 2.5 

703 >>> x = stats.weibull_min.rvs(c, scale=4, size=2000, random_state=rng) 

704 

705 Generate the PPCC plot for this data with the Weibull distribution. 

706 

707 >>> fig, ax = plt.subplots(figsize=(8, 6)) 

708 >>> res = stats.ppcc_plot(x, c/2, 2*c, dist='weibull_min', plot=ax) 

709 

710 We calculate the value where the shape should reach its maximum and a 

711 red line is drawn there. The line should coincide with the highest 

712 point in the PPCC graph. 

713 

714 >>> cmax = stats.ppcc_max(x, brack=(c/2, 2*c), dist='weibull_min') 

715 >>> ax.axvline(cmax, color='r') 

716 >>> plt.show() 

717 

718 """ 

719 dist = _parse_dist_kw(dist) 

720 osm_uniform = _calc_uniform_order_statistic_medians(len(x)) 

721 osr = sort(x) 

722 

723 # this function computes the x-axis values of the probability plot 

724 # and computes a linear regression (including the correlation) 

725 # and returns 1-r so that a minimization function maximizes the 

726 # correlation 

727 def tempfunc(shape, mi, yvals, func): 

728 xvals = func(mi, shape) 

729 r, prob = _stats_py.pearsonr(xvals, yvals) 

730 return 1 - r 

731 

732 return optimize.brent(tempfunc, brack=brack, 

733 args=(osm_uniform, osr, dist.ppf)) 

734 

735 

736def ppcc_plot(x, a, b, dist='tukeylambda', plot=None, N=80): 

737 """Calculate and optionally plot probability plot correlation coefficient. 

738 

739 The probability plot correlation coefficient (PPCC) plot can be used to 

740 determine the optimal shape parameter for a one-parameter family of 

741 distributions. It cannot be used for distributions without shape 

742 parameters 

743 (like the normal distribution) or with multiple shape parameters. 

744 

745 By default a Tukey-Lambda distribution (`stats.tukeylambda`) is used. A 

746 Tukey-Lambda PPCC plot interpolates from long-tailed to short-tailed 

747 distributions via an approximately normal one, and is therefore 

748 particularly useful in practice. 

749 

750 Parameters 

751 ---------- 

752 x : array_like 

753 Input array. 

754 a, b : scalar 

755 Lower and upper bounds of the shape parameter to use. 

756 dist : str or stats.distributions instance, optional 

757 Distribution or distribution function name. Objects that look enough 

758 like a stats.distributions instance (i.e. they have a ``ppf`` method) 

759 are also accepted. The default is ``'tukeylambda'``. 

760 plot : object, optional 

761 If given, plots PPCC against the shape parameter. 

762 `plot` is an object that has to have methods "plot" and "text". 

763 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, 

764 or a custom object with the same methods. 

765 Default is None, which means that no plot is created. 

766 N : int, optional 

767 Number of points on the horizontal axis (equally distributed from 

768 `a` to `b`). 

769 

770 Returns 

771 ------- 

772 svals : ndarray 

773 The shape values for which `ppcc` was calculated. 

774 ppcc : ndarray 

775 The calculated probability plot correlation coefficient values. 

776 

777 See Also 

778 -------- 

779 ppcc_max, probplot, boxcox_normplot, tukeylambda 

780 

781 References 

782 ---------- 

783 J.J. Filliben, "The Probability Plot Correlation Coefficient Test for 

784 Normality", Technometrics, Vol. 17, pp. 111-117, 1975. 

785 

786 Examples 

787 -------- 

788 First we generate some random data from a Weibull distribution 

789 with shape parameter 2.5, and plot the histogram of the data: 

790 

791 >>> import numpy as np 

792 >>> from scipy import stats 

793 >>> import matplotlib.pyplot as plt 

794 >>> rng = np.random.default_rng() 

795 >>> c = 2.5 

796 >>> x = stats.weibull_min.rvs(c, scale=4, size=2000, random_state=rng) 

797 

798 Take a look at the histogram of the data. 

799 

800 >>> fig1, ax = plt.subplots(figsize=(9, 4)) 

801 >>> ax.hist(x, bins=50) 

802 >>> ax.set_title('Histogram of x') 

803 >>> plt.show() 

804 

805 Now we explore this data with a PPCC plot as well as the related 

806 probability plot and Box-Cox normplot. A red line is drawn where we 

807 expect the PPCC value to be maximal (at the shape parameter ``c`` 

808 used above): 

809 

810 >>> fig2 = plt.figure(figsize=(12, 4)) 

811 >>> ax1 = fig2.add_subplot(1, 3, 1) 

812 >>> ax2 = fig2.add_subplot(1, 3, 2) 

813 >>> ax3 = fig2.add_subplot(1, 3, 3) 

814 >>> res = stats.probplot(x, plot=ax1) 

815 >>> res = stats.boxcox_normplot(x, -4, 4, plot=ax2) 

816 >>> res = stats.ppcc_plot(x, c/2, 2*c, dist='weibull_min', plot=ax3) 

817 >>> ax3.axvline(c, color='r') 

818 >>> plt.show() 

819 

820 """ 

821 if b <= a: 

822 raise ValueError("`b` has to be larger than `a`.") 

823 

824 svals = np.linspace(a, b, num=N) 

825 ppcc = np.empty_like(svals) 

826 for k, sval in enumerate(svals): 

827 _, r2 = probplot(x, sval, dist=dist, fit=True) 

828 ppcc[k] = r2[-1] 

829 

830 if plot is not None: 

831 plot.plot(svals, ppcc, 'x') 

832 _add_axis_labels_title(plot, xlabel='Shape Values', 

833 ylabel='Prob Plot Corr. Coef.', 

834 title='(%s) PPCC Plot' % dist) 

835 

836 return svals, ppcc 

837 

838 

839def boxcox_llf(lmb, data): 

840 r"""The boxcox log-likelihood function. 

841 

842 Parameters 

843 ---------- 

844 lmb : scalar 

845 Parameter for Box-Cox transformation. See `boxcox` for details. 

846 data : array_like 

847 Data to calculate Box-Cox log-likelihood for. If `data` is 

848 multi-dimensional, the log-likelihood is calculated along the first 

849 axis. 

850 

851 Returns 

852 ------- 

853 llf : float or ndarray 

854 Box-Cox log-likelihood of `data` given `lmb`. A float for 1-D `data`, 

855 an array otherwise. 

856 

857 See Also 

858 -------- 

859 boxcox, probplot, boxcox_normplot, boxcox_normmax 

860 

861 Notes 

862 ----- 

863 The Box-Cox log-likelihood function is defined here as 

864 

865 .. math:: 

866 

867 llf = (\lambda - 1) \sum_i(\log(x_i)) - 

868 N/2 \log(\sum_i (y_i - \bar{y})^2 / N), 

869 

870 where ``y`` is the Box-Cox transformed input data ``x``. 

871 

872 Examples 

873 -------- 

874 >>> import numpy as np 

875 >>> from scipy import stats 

876 >>> import matplotlib.pyplot as plt 

877 >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes 

878 

879 Generate some random variates and calculate Box-Cox log-likelihood values 

880 for them for a range of ``lmbda`` values: 

881 

882 >>> rng = np.random.default_rng() 

883 >>> x = stats.loggamma.rvs(5, loc=10, size=1000, random_state=rng) 

884 >>> lmbdas = np.linspace(-2, 10) 

885 >>> llf = np.zeros(lmbdas.shape, dtype=float) 

886 >>> for ii, lmbda in enumerate(lmbdas): 

887 ... llf[ii] = stats.boxcox_llf(lmbda, x) 

888 

889 Also find the optimal lmbda value with `boxcox`: 

890 

891 >>> x_most_normal, lmbda_optimal = stats.boxcox(x) 

892 

893 Plot the log-likelihood as function of lmbda. Add the optimal lmbda as a 

894 horizontal line to check that that's really the optimum: 

895 

896 >>> fig = plt.figure() 

897 >>> ax = fig.add_subplot(111) 

898 >>> ax.plot(lmbdas, llf, 'b.-') 

899 >>> ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r') 

900 >>> ax.set_xlabel('lmbda parameter') 

901 >>> ax.set_ylabel('Box-Cox log-likelihood') 

902 

903 Now add some probability plots to show that where the log-likelihood is 

904 maximized the data transformed with `boxcox` looks closest to normal: 

905 

906 >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right' 

907 >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs): 

908 ... xt = stats.boxcox(x, lmbda=lmbda) 

909 ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt) 

910 ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc) 

911 ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-') 

912 ... ax_inset.set_xticklabels([]) 

913 ... ax_inset.set_yticklabels([]) 

914 ... ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda) 

915 

916 >>> plt.show() 

917 

918 """ 

919 data = np.asarray(data) 

920 N = data.shape[0] 

921 if N == 0: 

922 return np.nan 

923 

924 logdata = np.log(data) 

925 

926 # Compute the variance of the transformed data. 

927 if lmb == 0: 

928 variance = np.var(logdata, axis=0) 

929 else: 

930 # Transform without the constant offset 1/lmb. The offset does 

931 # not effect the variance, and the subtraction of the offset can 

932 # lead to loss of precision. 

933 variance = np.var(data**lmb / lmb, axis=0) 

934 

935 return (lmb - 1) * np.sum(logdata, axis=0) - N/2 * np.log(variance) 

936 

937 

938def _boxcox_conf_interval(x, lmax, alpha): 

939 # Need to find the lambda for which 

940 # f(x,lmbda) >= f(x,lmax) - 0.5*chi^2_alpha;1 

941 fac = 0.5 * distributions.chi2.ppf(1 - alpha, 1) 

942 target = boxcox_llf(lmax, x) - fac 

943 

944 def rootfunc(lmbda, data, target): 

945 return boxcox_llf(lmbda, data) - target 

946 

947 # Find positive endpoint of interval in which answer is to be found 

948 newlm = lmax + 0.5 

949 N = 0 

950 while (rootfunc(newlm, x, target) > 0.0) and (N < 500): 

951 newlm += 0.1 

952 N += 1 

953 

954 if N == 500: 

955 raise RuntimeError("Could not find endpoint.") 

956 

957 lmplus = optimize.brentq(rootfunc, lmax, newlm, args=(x, target)) 

958 

959 # Now find negative interval in the same way 

960 newlm = lmax - 0.5 

961 N = 0 

962 while (rootfunc(newlm, x, target) > 0.0) and (N < 500): 

963 newlm -= 0.1 

964 N += 1 

965 

966 if N == 500: 

967 raise RuntimeError("Could not find endpoint.") 

968 

969 lmminus = optimize.brentq(rootfunc, newlm, lmax, args=(x, target)) 

970 return lmminus, lmplus 

971 

972 

973def boxcox(x, lmbda=None, alpha=None, optimizer=None): 

974 r"""Return a dataset transformed by a Box-Cox power transformation. 

975 

976 Parameters 

977 ---------- 

978 x : ndarray 

979 Input array to be transformed. 

980 

981 If `lmbda` is not None, this is an alias of 

982 `scipy.special.boxcox`. 

983 Returns nan if ``x < 0``; returns -inf if ``x == 0 and lmbda < 0``. 

984 

985 If `lmbda` is None, array must be positive, 1-dimensional, and 

986 non-constant. 

987 

988 lmbda : scalar, optional 

989 If `lmbda` is None (default), find the value of `lmbda` that maximizes 

990 the log-likelihood function and return it as the second output 

991 argument. 

992 

993 If `lmbda` is not None, do the transformation for that value. 

994 

995 alpha : float, optional 

996 If `lmbda` is None and `alpha` is not None (default), return the 

997 ``100 * (1-alpha)%`` confidence interval for `lmbda` as the third 

998 output argument. Must be between 0.0 and 1.0. 

999 

1000 If `lmbda` is not None, `alpha` is ignored. 

1001 optimizer : callable, optional 

1002 If `lmbda` is None, `optimizer` is the scalar optimizer used to find 

1003 the value of `lmbda` that minimizes the negative log-likelihood 

1004 function. `optimizer` is a callable that accepts one argument: 

1005 

1006 fun : callable 

1007 The objective function, which evaluates the negative 

1008 log-likelihood function at a provided value of `lmbda` 

1009 

1010 and returns an object, such as an instance of 

1011 `scipy.optimize.OptimizeResult`, which holds the optimal value of 

1012 `lmbda` in an attribute `x`. 

1013 

1014 See the example in `boxcox_normmax` or the documentation of 

1015 `scipy.optimize.minimize_scalar` for more information. 

1016 

1017 If `lmbda` is not None, `optimizer` is ignored. 

1018 

1019 Returns 

1020 ------- 

1021 boxcox : ndarray 

1022 Box-Cox power transformed array. 

1023 maxlog : float, optional 

1024 If the `lmbda` parameter is None, the second returned argument is 

1025 the `lmbda` that maximizes the log-likelihood function. 

1026 (min_ci, max_ci) : tuple of float, optional 

1027 If `lmbda` parameter is None and `alpha` is not None, this returned 

1028 tuple of floats represents the minimum and maximum confidence limits 

1029 given `alpha`. 

1030 

1031 See Also 

1032 -------- 

1033 probplot, boxcox_normplot, boxcox_normmax, boxcox_llf 

1034 

1035 Notes 

1036 ----- 

1037 The Box-Cox transform is given by:: 

1038 

1039 y = (x**lmbda - 1) / lmbda, for lmbda != 0 

1040 log(x), for lmbda = 0 

1041 

1042 `boxcox` requires the input data to be positive. Sometimes a Box-Cox 

1043 transformation provides a shift parameter to achieve this; `boxcox` does 

1044 not. Such a shift parameter is equivalent to adding a positive constant to 

1045 `x` before calling `boxcox`. 

1046 

1047 The confidence limits returned when `alpha` is provided give the interval 

1048 where: 

1049 

1050 .. math:: 

1051 

1052 llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1), 

1053 

1054 with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared 

1055 function. 

1056 

1057 References 

1058 ---------- 

1059 G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the 

1060 Royal Statistical Society B, 26, 211-252 (1964). 

1061 

1062 Examples 

1063 -------- 

1064 >>> from scipy import stats 

1065 >>> import matplotlib.pyplot as plt 

1066 

1067 We generate some random variates from a non-normal distribution and make a 

1068 probability plot for it, to show it is non-normal in the tails: 

1069 

1070 >>> fig = plt.figure() 

1071 >>> ax1 = fig.add_subplot(211) 

1072 >>> x = stats.loggamma.rvs(5, size=500) + 5 

1073 >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1) 

1074 >>> ax1.set_xlabel('') 

1075 >>> ax1.set_title('Probplot against normal distribution') 

1076 

1077 We now use `boxcox` to transform the data so it's closest to normal: 

1078 

1079 >>> ax2 = fig.add_subplot(212) 

1080 >>> xt, _ = stats.boxcox(x) 

1081 >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2) 

1082 >>> ax2.set_title('Probplot after Box-Cox transformation') 

1083 

1084 >>> plt.show() 

1085 

1086 """ 

1087 x = np.asarray(x) 

1088 

1089 if lmbda is not None: # single transformation 

1090 return special.boxcox(x, lmbda) 

1091 

1092 if x.ndim != 1: 

1093 raise ValueError("Data must be 1-dimensional.") 

1094 

1095 if x.size == 0: 

1096 return x 

1097 

1098 if np.all(x == x[0]): 

1099 raise ValueError("Data must not be constant.") 

1100 

1101 if np.any(x <= 0): 

1102 raise ValueError("Data must be positive.") 

1103 

1104 # If lmbda=None, find the lmbda that maximizes the log-likelihood function. 

1105 lmax = boxcox_normmax(x, method='mle', optimizer=optimizer) 

1106 y = boxcox(x, lmax) 

1107 

1108 if alpha is None: 

1109 return y, lmax 

1110 else: 

1111 # Find confidence interval 

1112 interval = _boxcox_conf_interval(x, lmax, alpha) 

1113 return y, lmax, interval 

1114 

1115 

1116def boxcox_normmax(x, brack=None, method='pearsonr', optimizer=None): 

1117 """Compute optimal Box-Cox transform parameter for input data. 

1118 

1119 Parameters 

1120 ---------- 

1121 x : array_like 

1122 Input array. 

1123 brack : 2-tuple, optional, default (-2.0, 2.0) 

1124 The starting interval for a downhill bracket search for the default 

1125 `optimize.brent` solver. Note that this is in most cases not 

1126 critical; the final result is allowed to be outside this bracket. 

1127 If `optimizer` is passed, `brack` must be None. 

1128 method : str, optional 

1129 The method to determine the optimal transform parameter (`boxcox` 

1130 ``lmbda`` parameter). Options are: 

1131 

1132 'pearsonr' (default) 

1133 Maximizes the Pearson correlation coefficient between 

1134 ``y = boxcox(x)`` and the expected values for ``y`` if `x` would be 

1135 normally-distributed. 

1136 

1137 'mle' 

1138 Minimizes the log-likelihood `boxcox_llf`. This is the method used 

1139 in `boxcox`. 

1140 

1141 'all' 

1142 Use all optimization methods available, and return all results. 

1143 Useful to compare different methods. 

1144 optimizer : callable, optional 

1145 `optimizer` is a callable that accepts one argument: 

1146 

1147 fun : callable 

1148 The objective function to be optimized. `fun` accepts one argument, 

1149 the Box-Cox transform parameter `lmbda`, and returns the negative 

1150 log-likelihood function at the provided value. The job of `optimizer` 

1151 is to find the value of `lmbda` that minimizes `fun`. 

1152 

1153 and returns an object, such as an instance of 

1154 `scipy.optimize.OptimizeResult`, which holds the optimal value of 

1155 `lmbda` in an attribute `x`. 

1156 

1157 See the example below or the documentation of 

1158 `scipy.optimize.minimize_scalar` for more information. 

1159 

1160 Returns 

1161 ------- 

1162 maxlog : float or ndarray 

1163 The optimal transform parameter found. An array instead of a scalar 

1164 for ``method='all'``. 

1165 

1166 See Also 

1167 -------- 

1168 boxcox, boxcox_llf, boxcox_normplot, scipy.optimize.minimize_scalar 

1169 

1170 Examples 

1171 -------- 

1172 >>> import numpy as np 

1173 >>> from scipy import stats 

1174 >>> import matplotlib.pyplot as plt 

1175 

1176 We can generate some data and determine the optimal ``lmbda`` in various 

1177 ways: 

1178 

1179 >>> rng = np.random.default_rng() 

1180 >>> x = stats.loggamma.rvs(5, size=30, random_state=rng) + 5 

1181 >>> y, lmax_mle = stats.boxcox(x) 

1182 >>> lmax_pearsonr = stats.boxcox_normmax(x) 

1183 

1184 >>> lmax_mle 

1185 2.217563431465757 

1186 >>> lmax_pearsonr 

1187 2.238318660200961 

1188 >>> stats.boxcox_normmax(x, method='all') 

1189 array([2.23831866, 2.21756343]) 

1190 

1191 >>> fig = plt.figure() 

1192 >>> ax = fig.add_subplot(111) 

1193 >>> prob = stats.boxcox_normplot(x, -10, 10, plot=ax) 

1194 >>> ax.axvline(lmax_mle, color='r') 

1195 >>> ax.axvline(lmax_pearsonr, color='g', ls='--') 

1196 

1197 >>> plt.show() 

1198 

1199 Alternatively, we can define our own `optimizer` function. Suppose we 

1200 are only interested in values of `lmbda` on the interval [6, 7], we 

1201 want to use `scipy.optimize.minimize_scalar` with ``method='bounded'``, 

1202 and we want to use tighter tolerances when optimizing the log-likelihood 

1203 function. To do this, we define a function that accepts positional argument 

1204 `fun` and uses `scipy.optimize.minimize_scalar` to minimize `fun` subject 

1205 to the provided bounds and tolerances: 

1206 

1207 >>> from scipy import optimize 

1208 >>> options = {'xatol': 1e-12} # absolute tolerance on `x` 

1209 >>> def optimizer(fun): 

1210 ... return optimize.minimize_scalar(fun, bounds=(6, 7), 

1211 ... method="bounded", options=options) 

1212 >>> stats.boxcox_normmax(x, optimizer=optimizer) 

1213 6.000... 

1214 """ 

1215 # If optimizer is not given, define default 'brent' optimizer. 

1216 if optimizer is None: 

1217 

1218 # Set default value for `brack`. 

1219 if brack is None: 

1220 brack = (-2.0, 2.0) 

1221 

1222 def _optimizer(func, args): 

1223 return optimize.brent(func, args=args, brack=brack) 

1224 

1225 # Otherwise check optimizer. 

1226 else: 

1227 if not callable(optimizer): 

1228 raise ValueError("`optimizer` must be a callable") 

1229 

1230 if brack is not None: 

1231 raise ValueError("`brack` must be None if `optimizer` is given") 

1232 

1233 # `optimizer` is expected to return a `OptimizeResult` object, we here 

1234 # get the solution to the optimization problem. 

1235 def _optimizer(func, args): 

1236 def func_wrapped(x): 

1237 return func(x, *args) 

1238 return getattr(optimizer(func_wrapped), 'x', None) 

1239 

1240 def _pearsonr(x): 

1241 osm_uniform = _calc_uniform_order_statistic_medians(len(x)) 

1242 xvals = distributions.norm.ppf(osm_uniform) 

1243 

1244 def _eval_pearsonr(lmbda, xvals, samps): 

1245 # This function computes the x-axis values of the probability plot 

1246 # and computes a linear regression (including the correlation) and 

1247 # returns ``1 - r`` so that a minimization function maximizes the 

1248 # correlation. 

1249 y = boxcox(samps, lmbda) 

1250 yvals = np.sort(y) 

1251 r, prob = _stats_py.pearsonr(xvals, yvals) 

1252 return 1 - r 

1253 

1254 return _optimizer(_eval_pearsonr, args=(xvals, x)) 

1255 

1256 def _mle(x): 

1257 def _eval_mle(lmb, data): 

1258 # function to minimize 

1259 return -boxcox_llf(lmb, data) 

1260 

1261 return _optimizer(_eval_mle, args=(x,)) 

1262 

1263 def _all(x): 

1264 maxlog = np.empty(2, dtype=float) 

1265 maxlog[0] = _pearsonr(x) 

1266 maxlog[1] = _mle(x) 

1267 return maxlog 

1268 

1269 methods = {'pearsonr': _pearsonr, 

1270 'mle': _mle, 

1271 'all': _all} 

1272 if method not in methods.keys(): 

1273 raise ValueError("Method %s not recognized." % method) 

1274 

1275 optimfunc = methods[method] 

1276 res = optimfunc(x) 

1277 if res is None: 

1278 message = ("`optimizer` must return an object containing the optimal " 

1279 "`lmbda` in attribute `x`") 

1280 raise ValueError(message) 

1281 return res 

1282 

1283 

1284def _normplot(method, x, la, lb, plot=None, N=80): 

1285 """Compute parameters for a Box-Cox or Yeo-Johnson normality plot, 

1286 optionally show it. 

1287 

1288 See `boxcox_normplot` or `yeojohnson_normplot` for details. 

1289 """ 

1290 

1291 if method == 'boxcox': 

1292 title = 'Box-Cox Normality Plot' 

1293 transform_func = boxcox 

1294 else: 

1295 title = 'Yeo-Johnson Normality Plot' 

1296 transform_func = yeojohnson 

1297 

1298 x = np.asarray(x) 

1299 if x.size == 0: 

1300 return x 

1301 

1302 if lb <= la: 

1303 raise ValueError("`lb` has to be larger than `la`.") 

1304 

1305 if method == 'boxcox' and np.any(x <= 0): 

1306 raise ValueError("Data must be positive.") 

1307 

1308 lmbdas = np.linspace(la, lb, num=N) 

1309 ppcc = lmbdas * 0.0 

1310 for i, val in enumerate(lmbdas): 

1311 # Determine for each lmbda the square root of correlation coefficient 

1312 # of transformed x 

1313 z = transform_func(x, lmbda=val) 

1314 _, (_, _, r) = probplot(z, dist='norm', fit=True) 

1315 ppcc[i] = r 

1316 

1317 if plot is not None: 

1318 plot.plot(lmbdas, ppcc, 'x') 

1319 _add_axis_labels_title(plot, xlabel='$\\lambda$', 

1320 ylabel='Prob Plot Corr. Coef.', 

1321 title=title) 

1322 

1323 return lmbdas, ppcc 

1324 

1325 

1326def boxcox_normplot(x, la, lb, plot=None, N=80): 

1327 """Compute parameters for a Box-Cox normality plot, optionally show it. 

1328 

1329 A Box-Cox normality plot shows graphically what the best transformation 

1330 parameter is to use in `boxcox` to obtain a distribution that is close 

1331 to normal. 

1332 

1333 Parameters 

1334 ---------- 

1335 x : array_like 

1336 Input array. 

1337 la, lb : scalar 

1338 The lower and upper bounds for the ``lmbda`` values to pass to `boxcox` 

1339 for Box-Cox transformations. These are also the limits of the 

1340 horizontal axis of the plot if that is generated. 

1341 plot : object, optional 

1342 If given, plots the quantiles and least squares fit. 

1343 `plot` is an object that has to have methods "plot" and "text". 

1344 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, 

1345 or a custom object with the same methods. 

1346 Default is None, which means that no plot is created. 

1347 N : int, optional 

1348 Number of points on the horizontal axis (equally distributed from 

1349 `la` to `lb`). 

1350 

1351 Returns 

1352 ------- 

1353 lmbdas : ndarray 

1354 The ``lmbda`` values for which a Box-Cox transform was done. 

1355 ppcc : ndarray 

1356 Probability Plot Correlelation Coefficient, as obtained from `probplot` 

1357 when fitting the Box-Cox transformed input `x` against a normal 

1358 distribution. 

1359 

1360 See Also 

1361 -------- 

1362 probplot, boxcox, boxcox_normmax, boxcox_llf, ppcc_max 

1363 

1364 Notes 

1365 ----- 

1366 Even if `plot` is given, the figure is not shown or saved by 

1367 `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')`` 

1368 should be used after calling `probplot`. 

1369 

1370 Examples 

1371 -------- 

1372 >>> from scipy import stats 

1373 >>> import matplotlib.pyplot as plt 

1374 

1375 Generate some non-normally distributed data, and create a Box-Cox plot: 

1376 

1377 >>> x = stats.loggamma.rvs(5, size=500) + 5 

1378 >>> fig = plt.figure() 

1379 >>> ax = fig.add_subplot(111) 

1380 >>> prob = stats.boxcox_normplot(x, -20, 20, plot=ax) 

1381 

1382 Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in 

1383 the same plot: 

1384 

1385 >>> _, maxlog = stats.boxcox(x) 

1386 >>> ax.axvline(maxlog, color='r') 

1387 

1388 >>> plt.show() 

1389 

1390 """ 

1391 return _normplot('boxcox', x, la, lb, plot, N) 

1392 

1393 

1394def yeojohnson(x, lmbda=None): 

1395 r"""Return a dataset transformed by a Yeo-Johnson power transformation. 

1396 

1397 Parameters 

1398 ---------- 

1399 x : ndarray 

1400 Input array. Should be 1-dimensional. 

1401 lmbda : float, optional 

1402 If ``lmbda`` is ``None``, find the lambda that maximizes the 

1403 log-likelihood function and return it as the second output argument. 

1404 Otherwise the transformation is done for the given value. 

1405 

1406 Returns 

1407 ------- 

1408 yeojohnson: ndarray 

1409 Yeo-Johnson power transformed array. 

1410 maxlog : float, optional 

1411 If the `lmbda` parameter is None, the second returned argument is 

1412 the lambda that maximizes the log-likelihood function. 

1413 

1414 See Also 

1415 -------- 

1416 probplot, yeojohnson_normplot, yeojohnson_normmax, yeojohnson_llf, boxcox 

1417 

1418 Notes 

1419 ----- 

1420 The Yeo-Johnson transform is given by:: 

1421 

1422 y = ((x + 1)**lmbda - 1) / lmbda, for x >= 0, lmbda != 0 

1423 log(x + 1), for x >= 0, lmbda = 0 

1424 -((-x + 1)**(2 - lmbda) - 1) / (2 - lmbda), for x < 0, lmbda != 2 

1425 -log(-x + 1), for x < 0, lmbda = 2 

1426 

1427 Unlike `boxcox`, `yeojohnson` does not require the input data to be 

1428 positive. 

1429 

1430 .. versionadded:: 1.2.0 

1431 

1432 

1433 References 

1434 ---------- 

1435 I. Yeo and R.A. Johnson, "A New Family of Power Transformations to 

1436 Improve Normality or Symmetry", Biometrika 87.4 (2000): 

1437 

1438 

1439 Examples 

1440 -------- 

1441 >>> from scipy import stats 

1442 >>> import matplotlib.pyplot as plt 

1443 

1444 We generate some random variates from a non-normal distribution and make a 

1445 probability plot for it, to show it is non-normal in the tails: 

1446 

1447 >>> fig = plt.figure() 

1448 >>> ax1 = fig.add_subplot(211) 

1449 >>> x = stats.loggamma.rvs(5, size=500) + 5 

1450 >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1) 

1451 >>> ax1.set_xlabel('') 

1452 >>> ax1.set_title('Probplot against normal distribution') 

1453 

1454 We now use `yeojohnson` to transform the data so it's closest to normal: 

1455 

1456 >>> ax2 = fig.add_subplot(212) 

1457 >>> xt, lmbda = stats.yeojohnson(x) 

1458 >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2) 

1459 >>> ax2.set_title('Probplot after Yeo-Johnson transformation') 

1460 

1461 >>> plt.show() 

1462 

1463 """ 

1464 x = np.asarray(x) 

1465 if x.size == 0: 

1466 return x 

1467 

1468 if np.issubdtype(x.dtype, np.complexfloating): 

1469 raise ValueError('Yeo-Johnson transformation is not defined for ' 

1470 'complex numbers.') 

1471 

1472 if np.issubdtype(x.dtype, np.integer): 

1473 x = x.astype(np.float64, copy=False) 

1474 

1475 if lmbda is not None: 

1476 return _yeojohnson_transform(x, lmbda) 

1477 

1478 # if lmbda=None, find the lmbda that maximizes the log-likelihood function. 

1479 lmax = yeojohnson_normmax(x) 

1480 y = _yeojohnson_transform(x, lmax) 

1481 

1482 return y, lmax 

1483 

1484 

1485def _yeojohnson_transform(x, lmbda): 

1486 """Returns `x` transformed by the Yeo-Johnson power transform with given 

1487 parameter `lmbda`. 

1488 """ 

1489 out = np.zeros_like(x) 

1490 pos = x >= 0 # binary mask 

1491 

1492 # when x >= 0 

1493 if abs(lmbda) < np.spacing(1.): 

1494 out[pos] = np.log1p(x[pos]) 

1495 else: # lmbda != 0 

1496 out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda 

1497 

1498 # when x < 0 

1499 if abs(lmbda - 2) > np.spacing(1.): 

1500 out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda) 

1501 else: # lmbda == 2 

1502 out[~pos] = -np.log1p(-x[~pos]) 

1503 

1504 return out 

1505 

1506 

1507def yeojohnson_llf(lmb, data): 

1508 r"""The yeojohnson log-likelihood function. 

1509 

1510 Parameters 

1511 ---------- 

1512 lmb : scalar 

1513 Parameter for Yeo-Johnson transformation. See `yeojohnson` for 

1514 details. 

1515 data : array_like 

1516 Data to calculate Yeo-Johnson log-likelihood for. If `data` is 

1517 multi-dimensional, the log-likelihood is calculated along the first 

1518 axis. 

1519 

1520 Returns 

1521 ------- 

1522 llf : float 

1523 Yeo-Johnson log-likelihood of `data` given `lmb`. 

1524 

1525 See Also 

1526 -------- 

1527 yeojohnson, probplot, yeojohnson_normplot, yeojohnson_normmax 

1528 

1529 Notes 

1530 ----- 

1531 The Yeo-Johnson log-likelihood function is defined here as 

1532 

1533 .. math:: 

1534 

1535 llf = -N/2 \log(\hat{\sigma}^2) + (\lambda - 1) 

1536 \sum_i \text{ sign }(x_i)\log(|x_i| + 1) 

1537 

1538 where :math:`\hat{\sigma}^2` is estimated variance of the Yeo-Johnson 

1539 transformed input data ``x``. 

1540 

1541 .. versionadded:: 1.2.0 

1542 

1543 Examples 

1544 -------- 

1545 >>> import numpy as np 

1546 >>> from scipy import stats 

1547 >>> import matplotlib.pyplot as plt 

1548 >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes 

1549 

1550 Generate some random variates and calculate Yeo-Johnson log-likelihood 

1551 values for them for a range of ``lmbda`` values: 

1552 

1553 >>> x = stats.loggamma.rvs(5, loc=10, size=1000) 

1554 >>> lmbdas = np.linspace(-2, 10) 

1555 >>> llf = np.zeros(lmbdas.shape, dtype=float) 

1556 >>> for ii, lmbda in enumerate(lmbdas): 

1557 ... llf[ii] = stats.yeojohnson_llf(lmbda, x) 

1558 

1559 Also find the optimal lmbda value with `yeojohnson`: 

1560 

1561 >>> x_most_normal, lmbda_optimal = stats.yeojohnson(x) 

1562 

1563 Plot the log-likelihood as function of lmbda. Add the optimal lmbda as a 

1564 horizontal line to check that that's really the optimum: 

1565 

1566 >>> fig = plt.figure() 

1567 >>> ax = fig.add_subplot(111) 

1568 >>> ax.plot(lmbdas, llf, 'b.-') 

1569 >>> ax.axhline(stats.yeojohnson_llf(lmbda_optimal, x), color='r') 

1570 >>> ax.set_xlabel('lmbda parameter') 

1571 >>> ax.set_ylabel('Yeo-Johnson log-likelihood') 

1572 

1573 Now add some probability plots to show that where the log-likelihood is 

1574 maximized the data transformed with `yeojohnson` looks closest to normal: 

1575 

1576 >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right' 

1577 >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs): 

1578 ... xt = stats.yeojohnson(x, lmbda=lmbda) 

1579 ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt) 

1580 ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc) 

1581 ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-') 

1582 ... ax_inset.set_xticklabels([]) 

1583 ... ax_inset.set_yticklabels([]) 

1584 ... ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda) 

1585 

1586 >>> plt.show() 

1587 

1588 """ 

1589 data = np.asarray(data) 

1590 n_samples = data.shape[0] 

1591 

1592 if n_samples == 0: 

1593 return np.nan 

1594 

1595 trans = _yeojohnson_transform(data, lmb) 

1596 trans_var = trans.var(axis=0) 

1597 loglike = np.empty_like(trans_var) 

1598 

1599 # Avoid RuntimeWarning raised by np.log when the variance is too low 

1600 tiny_variance = trans_var < np.finfo(trans_var.dtype).tiny 

1601 loglike[tiny_variance] = np.inf 

1602 

1603 loglike[~tiny_variance] = ( 

1604 -n_samples / 2 * np.log(trans_var[~tiny_variance])) 

1605 loglike[~tiny_variance] += ( 

1606 (lmb - 1) * (np.sign(data) * np.log(np.abs(data) + 1)).sum(axis=0)) 

1607 return loglike 

1608 

1609 

1610def yeojohnson_normmax(x, brack=(-2, 2)): 

1611 """Compute optimal Yeo-Johnson transform parameter. 

1612 

1613 Compute optimal Yeo-Johnson transform parameter for input data, using 

1614 maximum likelihood estimation. 

1615 

1616 Parameters 

1617 ---------- 

1618 x : array_like 

1619 Input array. 

1620 brack : 2-tuple, optional 

1621 The starting interval for a downhill bracket search with 

1622 `optimize.brent`. Note that this is in most cases not critical; the 

1623 final result is allowed to be outside this bracket. 

1624 

1625 Returns 

1626 ------- 

1627 maxlog : float 

1628 The optimal transform parameter found. 

1629 

1630 See Also 

1631 -------- 

1632 yeojohnson, yeojohnson_llf, yeojohnson_normplot 

1633 

1634 Notes 

1635 ----- 

1636 .. versionadded:: 1.2.0 

1637 

1638 Examples 

1639 -------- 

1640 >>> import numpy as np 

1641 >>> from scipy import stats 

1642 >>> import matplotlib.pyplot as plt 

1643 

1644 Generate some data and determine optimal ``lmbda`` 

1645 

1646 >>> rng = np.random.default_rng() 

1647 >>> x = stats.loggamma.rvs(5, size=30, random_state=rng) + 5 

1648 >>> lmax = stats.yeojohnson_normmax(x) 

1649 

1650 >>> fig = plt.figure() 

1651 >>> ax = fig.add_subplot(111) 

1652 >>> prob = stats.yeojohnson_normplot(x, -10, 10, plot=ax) 

1653 >>> ax.axvline(lmax, color='r') 

1654 

1655 >>> plt.show() 

1656 

1657 """ 

1658 def _neg_llf(lmbda, data): 

1659 llf = yeojohnson_llf(lmbda, data) 

1660 # reject likelihoods that are inf which are likely due to small 

1661 # variance in the transformed space 

1662 llf[np.isinf(llf)] = -np.inf 

1663 return -llf 

1664 

1665 with np.errstate(invalid='ignore'): 

1666 return optimize.brent(_neg_llf, brack=brack, args=(x,)) 

1667 

1668 

1669def yeojohnson_normplot(x, la, lb, plot=None, N=80): 

1670 """Compute parameters for a Yeo-Johnson normality plot, optionally show it. 

1671 

1672 A Yeo-Johnson normality plot shows graphically what the best 

1673 transformation parameter is to use in `yeojohnson` to obtain a 

1674 distribution that is close to normal. 

1675 

1676 Parameters 

1677 ---------- 

1678 x : array_like 

1679 Input array. 

1680 la, lb : scalar 

1681 The lower and upper bounds for the ``lmbda`` values to pass to 

1682 `yeojohnson` for Yeo-Johnson transformations. These are also the 

1683 limits of the horizontal axis of the plot if that is generated. 

1684 plot : object, optional 

1685 If given, plots the quantiles and least squares fit. 

1686 `plot` is an object that has to have methods "plot" and "text". 

1687 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used, 

1688 or a custom object with the same methods. 

1689 Default is None, which means that no plot is created. 

1690 N : int, optional 

1691 Number of points on the horizontal axis (equally distributed from 

1692 `la` to `lb`). 

1693 

1694 Returns 

1695 ------- 

1696 lmbdas : ndarray 

1697 The ``lmbda`` values for which a Yeo-Johnson transform was done. 

1698 ppcc : ndarray 

1699 Probability Plot Correlelation Coefficient, as obtained from `probplot` 

1700 when fitting the Box-Cox transformed input `x` against a normal 

1701 distribution. 

1702 

1703 See Also 

1704 -------- 

1705 probplot, yeojohnson, yeojohnson_normmax, yeojohnson_llf, ppcc_max 

1706 

1707 Notes 

1708 ----- 

1709 Even if `plot` is given, the figure is not shown or saved by 

1710 `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')`` 

1711 should be used after calling `probplot`. 

1712 

1713 .. versionadded:: 1.2.0 

1714 

1715 Examples 

1716 -------- 

1717 >>> from scipy import stats 

1718 >>> import matplotlib.pyplot as plt 

1719 

1720 Generate some non-normally distributed data, and create a Yeo-Johnson plot: 

1721 

1722 >>> x = stats.loggamma.rvs(5, size=500) + 5 

1723 >>> fig = plt.figure() 

1724 >>> ax = fig.add_subplot(111) 

1725 >>> prob = stats.yeojohnson_normplot(x, -20, 20, plot=ax) 

1726 

1727 Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in 

1728 the same plot: 

1729 

1730 >>> _, maxlog = stats.yeojohnson(x) 

1731 >>> ax.axvline(maxlog, color='r') 

1732 

1733 >>> plt.show() 

1734 

1735 """ 

1736 return _normplot('yeojohnson', x, la, lb, plot, N) 

1737 

1738 

1739ShapiroResult = namedtuple('ShapiroResult', ('statistic', 'pvalue')) 

1740 

1741 

1742def shapiro(x): 

1743 """Perform the Shapiro-Wilk test for normality. 

1744 

1745 The Shapiro-Wilk test tests the null hypothesis that the 

1746 data was drawn from a normal distribution. 

1747 

1748 Parameters 

1749 ---------- 

1750 x : array_like 

1751 Array of sample data. 

1752 

1753 Returns 

1754 ------- 

1755 statistic : float 

1756 The test statistic. 

1757 p-value : float 

1758 The p-value for the hypothesis test. 

1759 

1760 See Also 

1761 -------- 

1762 anderson : The Anderson-Darling test for normality 

1763 kstest : The Kolmogorov-Smirnov test for goodness of fit. 

1764 

1765 Notes 

1766 ----- 

1767 The algorithm used is described in [4]_ but censoring parameters as 

1768 described are not implemented. For N > 5000 the W test statistic is accurate 

1769 but the p-value may not be. 

1770 

1771 The chance of rejecting the null hypothesis when it is true is close to 5% 

1772 regardless of sample size. 

1773 

1774 References 

1775 ---------- 

1776 .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm 

1777 .. [2] Shapiro, S. S. & Wilk, M.B (1965). An analysis of variance test for 

1778 normality (complete samples), Biometrika, Vol. 52, pp. 591-611. 

1779 .. [3] Razali, N. M. & Wah, Y. B. (2011) Power comparisons of Shapiro-Wilk, 

1780 Kolmogorov-Smirnov, Lilliefors and Anderson-Darling tests, Journal of 

1781 Statistical Modeling and Analytics, Vol. 2, pp. 21-33. 

1782 .. [4] ALGORITHM AS R94 APPL. STATIST. (1995) VOL. 44, NO. 4. 

1783 

1784 Examples 

1785 -------- 

1786 >>> import numpy as np 

1787 >>> from scipy import stats 

1788 >>> rng = np.random.default_rng() 

1789 >>> x = stats.norm.rvs(loc=5, scale=3, size=100, random_state=rng) 

1790 >>> shapiro_test = stats.shapiro(x) 

1791 >>> shapiro_test 

1792 ShapiroResult(statistic=0.9813305735588074, pvalue=0.16855233907699585) 

1793 >>> shapiro_test.statistic 

1794 0.9813305735588074 

1795 >>> shapiro_test.pvalue 

1796 0.16855233907699585 

1797 

1798 """ 

1799 x = np.ravel(x) 

1800 

1801 N = len(x) 

1802 if N < 3: 

1803 raise ValueError("Data must be at least length 3.") 

1804 

1805 x = x - np.median(x) 

1806 

1807 a = zeros(N, 'f') 

1808 init = 0 

1809 

1810 y = sort(x) 

1811 a, w, pw, ifault = _statlib.swilk(y, a[:N//2], init) 

1812 if ifault not in [0, 2]: 

1813 warnings.warn("Input data for shapiro has range zero. The results " 

1814 "may not be accurate.") 

1815 if N > 5000: 

1816 warnings.warn("p-value may not be accurate for N > 5000.") 

1817 

1818 return ShapiroResult(w, pw) 

1819 

1820 

1821# Values from Stephens, M A, "EDF Statistics for Goodness of Fit and 

1822# Some Comparisons", Journal of the American Statistical 

1823# Association, Vol. 69, Issue 347, Sept. 1974, pp 730-737 

1824_Avals_norm = array([0.576, 0.656, 0.787, 0.918, 1.092]) 

1825_Avals_expon = array([0.922, 1.078, 1.341, 1.606, 1.957]) 

1826# From Stephens, M A, "Goodness of Fit for the Extreme Value Distribution", 

1827# Biometrika, Vol. 64, Issue 3, Dec. 1977, pp 583-588. 

1828_Avals_gumbel = array([0.474, 0.637, 0.757, 0.877, 1.038]) 

1829# From Stephens, M A, "Tests of Fit for the Logistic Distribution Based 

1830# on the Empirical Distribution Function.", Biometrika, 

1831# Vol. 66, Issue 3, Dec. 1979, pp 591-595. 

1832_Avals_logistic = array([0.426, 0.563, 0.660, 0.769, 0.906, 1.010]) 

1833 

1834 

1835AndersonResult = _make_tuple_bunch('AndersonResult', 

1836 ['statistic', 'critical_values', 

1837 'significance_level'], ['fit_result']) 

1838 

1839 

1840def anderson(x, dist='norm'): 

1841 """Anderson-Darling test for data coming from a particular distribution. 

1842 

1843 The Anderson-Darling test tests the null hypothesis that a sample is 

1844 drawn from a population that follows a particular distribution. 

1845 For the Anderson-Darling test, the critical values depend on 

1846 which distribution is being tested against. This function works 

1847 for normal, exponential, logistic, or Gumbel (Extreme Value 

1848 Type I) distributions. 

1849 

1850 Parameters 

1851 ---------- 

1852 x : array_like 

1853 Array of sample data. 

1854 dist : {'norm', 'expon', 'logistic', 'gumbel', 'gumbel_l', 'gumbel_r', 'extreme1'}, optional 

1855 The type of distribution to test against. The default is 'norm'. 

1856 The names 'extreme1', 'gumbel_l' and 'gumbel' are synonyms for the 

1857 same distribution. 

1858 

1859 Returns 

1860 ------- 

1861 result : AndersonResult 

1862 An object with the following attributes: 

1863 

1864 statistic : float 

1865 The Anderson-Darling test statistic. 

1866 critical_values : list 

1867 The critical values for this distribution. 

1868 significance_level : list 

1869 The significance levels for the corresponding critical values 

1870 in percents. The function returns critical values for a 

1871 differing set of significance levels depending on the 

1872 distribution that is being tested against. 

1873 fit_result : `~scipy.stats._result_classes.FitResult` 

1874 An object containing the results of fitting the distribution to 

1875 the data. 

1876 

1877 See Also 

1878 -------- 

1879 kstest : The Kolmogorov-Smirnov test for goodness-of-fit. 

1880 

1881 Notes 

1882 ----- 

1883 Critical values provided are for the following significance levels: 

1884 

1885 normal/exponential 

1886 15%, 10%, 5%, 2.5%, 1% 

1887 logistic 

1888 25%, 10%, 5%, 2.5%, 1%, 0.5% 

1889 Gumbel 

1890 25%, 10%, 5%, 2.5%, 1% 

1891 

1892 If the returned statistic is larger than these critical values then 

1893 for the corresponding significance level, the null hypothesis that 

1894 the data come from the chosen distribution can be rejected. 

1895 The returned statistic is referred to as 'A2' in the references. 

1896 

1897 References 

1898 ---------- 

1899 .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm 

1900 .. [2] Stephens, M. A. (1974). EDF Statistics for Goodness of Fit and 

1901 Some Comparisons, Journal of the American Statistical Association, 

1902 Vol. 69, pp. 730-737. 

1903 .. [3] Stephens, M. A. (1976). Asymptotic Results for Goodness-of-Fit 

1904 Statistics with Unknown Parameters, Annals of Statistics, Vol. 4, 

1905 pp. 357-369. 

1906 .. [4] Stephens, M. A. (1977). Goodness of Fit for the Extreme Value 

1907 Distribution, Biometrika, Vol. 64, pp. 583-588. 

1908 .. [5] Stephens, M. A. (1977). Goodness of Fit with Special Reference 

1909 to Tests for Exponentiality , Technical Report No. 262, 

1910 Department of Statistics, Stanford University, Stanford, CA. 

1911 .. [6] Stephens, M. A. (1979). Tests of Fit for the Logistic Distribution 

1912 Based on the Empirical Distribution Function, Biometrika, Vol. 66, 

1913 pp. 591-595. 

1914 

1915 Examples 

1916 -------- 

1917 Test the null hypothesis that a random sample was drawn from a normal 

1918 distribution (with unspecified mean and standard deviation). 

1919 

1920 >>> import numpy as np 

1921 >>> from scipy.stats import anderson 

1922 >>> rng = np.random.default_rng() 

1923 >>> data = rng.random(size=35) 

1924 >>> res = anderson(data) 

1925 >>> res.statistic 

1926 0.8398018749744764 

1927 >>> res.critical_values 

1928 array([0.527, 0.6 , 0.719, 0.839, 0.998]) 

1929 >>> res.significance_level 

1930 array([15. , 10. , 5. , 2.5, 1. ]) 

1931 

1932 The value of the statistic (barely) exceeds the critical value associated 

1933 with a significance level of 2.5%, so the null hypothesis may be rejected 

1934 at a significance level of 2.5%, but not at a significance level of 1%. 

1935 

1936 """ # noqa 

1937 dist = dist.lower() 

1938 if dist in {'extreme1', 'gumbel'}: 

1939 dist = 'gumbel_l' 

1940 dists = {'norm', 'expon', 'gumbel_l', 'gumbel_r', 'logistic'} 

1941 if dist not in dists: 

1942 raise ValueError(f"Invalid distribution; dist must be in {dists}.") 

1943 y = sort(x) 

1944 xbar = np.mean(x, axis=0) 

1945 N = len(y) 

1946 if dist == 'norm': 

1947 s = np.std(x, ddof=1, axis=0) 

1948 w = (y - xbar) / s 

1949 fit_params = xbar, s 

1950 logcdf = distributions.norm.logcdf(w) 

1951 logsf = distributions.norm.logsf(w) 

1952 sig = array([15, 10, 5, 2.5, 1]) 

1953 critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N), 3) 

1954 elif dist == 'expon': 

1955 w = y / xbar 

1956 fit_params = 0, xbar 

1957 logcdf = distributions.expon.logcdf(w) 

1958 logsf = distributions.expon.logsf(w) 

1959 sig = array([15, 10, 5, 2.5, 1]) 

1960 critical = around(_Avals_expon / (1.0 + 0.6/N), 3) 

1961 elif dist == 'logistic': 

1962 def rootfunc(ab, xj, N): 

1963 a, b = ab 

1964 tmp = (xj - a) / b 

1965 tmp2 = exp(tmp) 

1966 val = [np.sum(1.0/(1+tmp2), axis=0) - 0.5*N, 

1967 np.sum(tmp*(1.0-tmp2)/(1+tmp2), axis=0) + N] 

1968 return array(val) 

1969 

1970 sol0 = array([xbar, np.std(x, ddof=1, axis=0)]) 

1971 sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5) 

1972 w = (y - sol[0]) / sol[1] 

1973 fit_params = sol 

1974 logcdf = distributions.logistic.logcdf(w) 

1975 logsf = distributions.logistic.logsf(w) 

1976 sig = array([25, 10, 5, 2.5, 1, 0.5]) 

1977 critical = around(_Avals_logistic / (1.0 + 0.25/N), 3) 

1978 elif dist == 'gumbel_r': 

1979 xbar, s = distributions.gumbel_r.fit(x) 

1980 w = (y - xbar) / s 

1981 fit_params = xbar, s 

1982 logcdf = distributions.gumbel_r.logcdf(w) 

1983 logsf = distributions.gumbel_r.logsf(w) 

1984 sig = array([25, 10, 5, 2.5, 1]) 

1985 critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3) 

1986 elif dist == 'gumbel_l': 

1987 xbar, s = distributions.gumbel_l.fit(x) 

1988 w = (y - xbar) / s 

1989 fit_params = xbar, s 

1990 logcdf = distributions.gumbel_l.logcdf(w) 

1991 logsf = distributions.gumbel_l.logsf(w) 

1992 sig = array([25, 10, 5, 2.5, 1]) 

1993 critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3) 

1994 

1995 i = arange(1, N + 1) 

1996 A2 = -N - np.sum((2*i - 1.0) / N * (logcdf + logsf[::-1]), axis=0) 

1997 

1998 # FitResult initializer expects an optimize result, so let's work with it 

1999 message = '`anderson` successfully fit the distribution to the data.' 

2000 res = optimize.OptimizeResult(success=True, message=message) 

2001 res.x = np.array(fit_params) 

2002 fit_result = FitResult(getattr(distributions, dist), y, 

2003 discrete=False, res=res) 

2004 

2005 return AndersonResult(A2, critical, sig, fit_result=fit_result) 

2006 

2007 

2008def _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N): 

2009 """Compute A2akN equation 7 of Scholz and Stephens. 

2010 

2011 Parameters 

2012 ---------- 

2013 samples : sequence of 1-D array_like 

2014 Array of sample arrays. 

2015 Z : array_like 

2016 Sorted array of all observations. 

2017 Zstar : array_like 

2018 Sorted array of unique observations. 

2019 k : int 

2020 Number of samples. 

2021 n : array_like 

2022 Number of observations in each sample. 

2023 N : int 

2024 Total number of observations. 

2025 

2026 Returns 

2027 ------- 

2028 A2aKN : float 

2029 The A2aKN statistics of Scholz and Stephens 1987. 

2030 

2031 """ 

2032 A2akN = 0. 

2033 Z_ssorted_left = Z.searchsorted(Zstar, 'left') 

2034 if N == Zstar.size: 

2035 lj = 1. 

2036 else: 

2037 lj = Z.searchsorted(Zstar, 'right') - Z_ssorted_left 

2038 Bj = Z_ssorted_left + lj / 2. 

2039 for i in arange(0, k): 

2040 s = np.sort(samples[i]) 

2041 s_ssorted_right = s.searchsorted(Zstar, side='right') 

2042 Mij = s_ssorted_right.astype(float) 

2043 fij = s_ssorted_right - s.searchsorted(Zstar, 'left') 

2044 Mij -= fij / 2. 

2045 inner = lj / float(N) * (N*Mij - Bj*n[i])**2 / (Bj*(N - Bj) - N*lj/4.) 

2046 A2akN += inner.sum() / n[i] 

2047 A2akN *= (N - 1.) / N 

2048 return A2akN 

2049 

2050 

2051def _anderson_ksamp_right(samples, Z, Zstar, k, n, N): 

2052 """Compute A2akN equation 6 of Scholz & Stephens. 

2053 

2054 Parameters 

2055 ---------- 

2056 samples : sequence of 1-D array_like 

2057 Array of sample arrays. 

2058 Z : array_like 

2059 Sorted array of all observations. 

2060 Zstar : array_like 

2061 Sorted array of unique observations. 

2062 k : int 

2063 Number of samples. 

2064 n : array_like 

2065 Number of observations in each sample. 

2066 N : int 

2067 Total number of observations. 

2068 

2069 Returns 

2070 ------- 

2071 A2KN : float 

2072 The A2KN statistics of Scholz and Stephens 1987. 

2073 

2074 """ 

2075 A2kN = 0. 

2076 lj = Z.searchsorted(Zstar[:-1], 'right') - Z.searchsorted(Zstar[:-1], 

2077 'left') 

2078 Bj = lj.cumsum() 

2079 for i in arange(0, k): 

2080 s = np.sort(samples[i]) 

2081 Mij = s.searchsorted(Zstar[:-1], side='right') 

2082 inner = lj / float(N) * (N * Mij - Bj * n[i])**2 / (Bj * (N - Bj)) 

2083 A2kN += inner.sum() / n[i] 

2084 return A2kN 

2085 

2086 

2087Anderson_ksampResult = _make_tuple_bunch( 

2088 'Anderson_ksampResult', 

2089 ['statistic', 'critical_values', 'pvalue'], [] 

2090) 

2091 

2092 

2093def anderson_ksamp(samples, midrank=True): 

2094 """The Anderson-Darling test for k-samples. 

2095 

2096 The k-sample Anderson-Darling test is a modification of the 

2097 one-sample Anderson-Darling test. It tests the null hypothesis 

2098 that k-samples are drawn from the same population without having 

2099 to specify the distribution function of that population. The 

2100 critical values depend on the number of samples. 

2101 

2102 Parameters 

2103 ---------- 

2104 samples : sequence of 1-D array_like 

2105 Array of sample data in arrays. 

2106 midrank : bool, optional 

2107 Type of Anderson-Darling test which is computed. Default 

2108 (True) is the midrank test applicable to continuous and 

2109 discrete populations. If False, the right side empirical 

2110 distribution is used. 

2111 

2112 Returns 

2113 ------- 

2114 res : Anderson_ksampResult 

2115 An object containing attributes: 

2116 

2117 statistic : float 

2118 Normalized k-sample Anderson-Darling test statistic. 

2119 critical_values : array 

2120 The critical values for significance levels 25%, 10%, 5%, 2.5%, 1%, 

2121 0.5%, 0.1%. 

2122 pvalue : float 

2123 The approximate p-value of the test. The value is floored / capped 

2124 at 0.1% / 25%. 

2125 

2126 Raises 

2127 ------ 

2128 ValueError 

2129 If less than 2 samples are provided, a sample is empty, or no 

2130 distinct observations are in the samples. 

2131 

2132 See Also 

2133 -------- 

2134 ks_2samp : 2 sample Kolmogorov-Smirnov test 

2135 anderson : 1 sample Anderson-Darling test 

2136 

2137 Notes 

2138 ----- 

2139 [1]_ defines three versions of the k-sample Anderson-Darling test: 

2140 one for continuous distributions and two for discrete 

2141 distributions, in which ties between samples may occur. The 

2142 default of this routine is to compute the version based on the 

2143 midrank empirical distribution function. This test is applicable 

2144 to continuous and discrete data. If midrank is set to False, the 

2145 right side empirical distribution is used for a test for discrete 

2146 data. According to [1]_, the two discrete test statistics differ 

2147 only slightly if a few collisions due to round-off errors occur in 

2148 the test not adjusted for ties between samples. 

2149 

2150 The critical values corresponding to the significance levels from 0.01 

2151 to 0.25 are taken from [1]_. p-values are floored / capped 

2152 at 0.1% / 25%. Since the range of critical values might be extended in 

2153 future releases, it is recommended not to test ``p == 0.25``, but rather 

2154 ``p >= 0.25`` (analogously for the lower bound). 

2155 

2156 .. versionadded:: 0.14.0 

2157 

2158 References 

2159 ---------- 

2160 .. [1] Scholz, F. W and Stephens, M. A. (1987), K-Sample 

2161 Anderson-Darling Tests, Journal of the American Statistical 

2162 Association, Vol. 82, pp. 918-924. 

2163 

2164 Examples 

2165 -------- 

2166 >>> import numpy as np 

2167 >>> from scipy import stats 

2168 >>> rng = np.random.default_rng() 

2169 >>> res = stats.anderson_ksamp([rng.normal(size=50), 

2170 ... rng.normal(loc=0.5, size=30)]) 

2171 >>> res.statistic, res.pvalue 

2172 (1.974403288713695, 0.04991293614572478) 

2173 >>> res.critical_values 

2174 array([0.325, 1.226, 1.961, 2.718, 3.752, 4.592, 6.546]) 

2175 

2176 The null hypothesis that the two random samples come from the same 

2177 distribution can be rejected at the 5% level because the returned 

2178 test value is greater than the critical value for 5% (1.961) but 

2179 not at the 2.5% level. The interpolation gives an approximate 

2180 p-value of 4.99%. 

2181 

2182 >>> res = stats.anderson_ksamp([rng.normal(size=50), 

2183 ... rng.normal(size=30), rng.normal(size=20)]) 

2184 >>> res.statistic, res.pvalue 

2185 (-0.29103725200789504, 0.25) 

2186 >>> res.critical_values 

2187 array([ 0.44925884, 1.3052767 , 1.9434184 , 2.57696569, 3.41634856, 

2188 4.07210043, 5.56419101]) 

2189 

2190 The null hypothesis cannot be rejected for three samples from an 

2191 identical distribution. The reported p-value (25%) has been capped and 

2192 may not be very accurate (since it corresponds to the value 0.449 

2193 whereas the statistic is -0.291). 

2194 

2195 """ 

2196 k = len(samples) 

2197 if (k < 2): 

2198 raise ValueError("anderson_ksamp needs at least two samples") 

2199 

2200 samples = list(map(np.asarray, samples)) 

2201 Z = np.sort(np.hstack(samples)) 

2202 N = Z.size 

2203 Zstar = np.unique(Z) 

2204 if Zstar.size < 2: 

2205 raise ValueError("anderson_ksamp needs more than one distinct " 

2206 "observation") 

2207 

2208 n = np.array([sample.size for sample in samples]) 

2209 if np.any(n == 0): 

2210 raise ValueError("anderson_ksamp encountered sample without " 

2211 "observations") 

2212 

2213 if midrank: 

2214 A2kN = _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N) 

2215 else: 

2216 A2kN = _anderson_ksamp_right(samples, Z, Zstar, k, n, N) 

2217 

2218 H = (1. / n).sum() 

2219 hs_cs = (1. / arange(N - 1, 1, -1)).cumsum() 

2220 h = hs_cs[-1] + 1 

2221 g = (hs_cs / arange(2, N)).sum() 

2222 

2223 a = (4*g - 6) * (k - 1) + (10 - 6*g)*H 

2224 b = (2*g - 4)*k**2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6 

2225 c = (6*h + 2*g - 2)*k**2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h 

2226 d = (2*h + 6)*k**2 - 4*h*k 

2227 sigmasq = (a*N**3 + b*N**2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.)) 

2228 m = k - 1 

2229 A2 = (A2kN - m) / math.sqrt(sigmasq) 

2230 

2231 # The b_i values are the interpolation coefficients from Table 2 

2232 # of Scholz and Stephens 1987 

2233 b0 = np.array([0.675, 1.281, 1.645, 1.96, 2.326, 2.573, 3.085]) 

2234 b1 = np.array([-0.245, 0.25, 0.678, 1.149, 1.822, 2.364, 3.615]) 

2235 b2 = np.array([-0.105, -0.305, -0.362, -0.391, -0.396, -0.345, -0.154]) 

2236 critical = b0 + b1 / math.sqrt(m) + b2 / m 

2237 

2238 sig = np.array([0.25, 0.1, 0.05, 0.025, 0.01, 0.005, 0.001]) 

2239 if A2 < critical.min(): 

2240 p = sig.max() 

2241 warnings.warn("p-value capped: true value larger than {}".format(p), 

2242 stacklevel=2) 

2243 elif A2 > critical.max(): 

2244 p = sig.min() 

2245 warnings.warn("p-value floored: true value smaller than {}".format(p), 

2246 stacklevel=2) 

2247 else: 

2248 # interpolation of probit of significance level 

2249 pf = np.polyfit(critical, log(sig), 2) 

2250 p = math.exp(np.polyval(pf, A2)) 

2251 

2252 # create result object with alias for backward compatibility 

2253 res = Anderson_ksampResult(A2, critical, p) 

2254 res.significance_level = p 

2255 return res 

2256 

2257 

2258AnsariResult = namedtuple('AnsariResult', ('statistic', 'pvalue')) 

2259 

2260 

2261class _ABW: 

2262 """Distribution of Ansari-Bradley W-statistic under the null hypothesis.""" 

2263 # TODO: calculate exact distribution considering ties 

2264 # We could avoid summing over more than half the frequencies, 

2265 # but inititally it doesn't seem worth the extra complexity 

2266 

2267 def __init__(self): 

2268 """Minimal initializer.""" 

2269 self.m = None 

2270 self.n = None 

2271 self.astart = None 

2272 self.total = None 

2273 self.freqs = None 

2274 

2275 def _recalc(self, n, m): 

2276 """When necessary, recalculate exact distribution.""" 

2277 if n != self.n or m != self.m: 

2278 self.n, self.m = n, m 

2279 # distribution is NOT symmetric when m + n is odd 

2280 # n is len(x), m is len(y), and ratio of scales is defined x/y 

2281 astart, a1, _ = _statlib.gscale(n, m) 

2282 self.astart = astart # minimum value of statistic 

2283 # Exact distribution of test statistic under null hypothesis 

2284 # expressed as frequencies/counts/integers to maintain precision. 

2285 # Stored as floats to avoid overflow of sums. 

2286 self.freqs = a1.astype(np.float64) 

2287 self.total = self.freqs.sum() # could calculate from m and n 

2288 # probability mass is self.freqs / self.total; 

2289 

2290 def pmf(self, k, n, m): 

2291 """Probability mass function.""" 

2292 self._recalc(n, m) 

2293 # The convention here is that PMF at k = 12.5 is the same as at k = 12, 

2294 # -> use `floor` in case of ties. 

2295 ind = np.floor(k - self.astart).astype(int) 

2296 return self.freqs[ind] / self.total 

2297 

2298 def cdf(self, k, n, m): 

2299 """Cumulative distribution function.""" 

2300 self._recalc(n, m) 

2301 # Null distribution derived without considering ties is 

2302 # approximate. Round down to avoid Type I error. 

2303 ind = np.ceil(k - self.astart).astype(int) 

2304 return self.freqs[:ind+1].sum() / self.total 

2305 

2306 def sf(self, k, n, m): 

2307 """Survival function.""" 

2308 self._recalc(n, m) 

2309 # Null distribution derived without considering ties is 

2310 # approximate. Round down to avoid Type I error. 

2311 ind = np.floor(k - self.astart).astype(int) 

2312 return self.freqs[ind:].sum() / self.total 

2313 

2314 

2315# Maintain state for faster repeat calls to ansari w/ method='exact' 

2316_abw_state = _ABW() 

2317 

2318 

2319def ansari(x, y, alternative='two-sided'): 

2320 """Perform the Ansari-Bradley test for equal scale parameters. 

2321 

2322 The Ansari-Bradley test ([1]_, [2]_) is a non-parametric test 

2323 for the equality of the scale parameter of the distributions 

2324 from which two samples were drawn. The null hypothesis states that 

2325 the ratio of the scale of the distribution underlying `x` to the scale 

2326 of the distribution underlying `y` is 1. 

2327 

2328 Parameters 

2329 ---------- 

2330 x, y : array_like 

2331 Arrays of sample data. 

2332 alternative : {'two-sided', 'less', 'greater'}, optional 

2333 Defines the alternative hypothesis. Default is 'two-sided'. 

2334 The following options are available: 

2335 

2336 * 'two-sided': the ratio of scales is not equal to 1. 

2337 * 'less': the ratio of scales is less than 1. 

2338 * 'greater': the ratio of scales is greater than 1. 

2339 

2340 .. versionadded:: 1.7.0 

2341 

2342 Returns 

2343 ------- 

2344 statistic : float 

2345 The Ansari-Bradley test statistic. 

2346 pvalue : float 

2347 The p-value of the hypothesis test. 

2348 

2349 See Also 

2350 -------- 

2351 fligner : A non-parametric test for the equality of k variances 

2352 mood : A non-parametric test for the equality of two scale parameters 

2353 

2354 Notes 

2355 ----- 

2356 The p-value given is exact when the sample sizes are both less than 

2357 55 and there are no ties, otherwise a normal approximation for the 

2358 p-value is used. 

2359 

2360 References 

2361 ---------- 

2362 .. [1] Ansari, A. R. and Bradley, R. A. (1960) Rank-sum tests for 

2363 dispersions, Annals of Mathematical Statistics, 31, 1174-1189. 

2364 .. [2] Sprent, Peter and N.C. Smeeton. Applied nonparametric 

2365 statistical methods. 3rd ed. Chapman and Hall/CRC. 2001. 

2366 Section 5.8.2. 

2367 .. [3] Nathaniel E. Helwig "Nonparametric Dispersion and Equality 

2368 Tests" at http://users.stat.umn.edu/~helwig/notes/npde-Notes.pdf 

2369 

2370 Examples 

2371 -------- 

2372 >>> import numpy as np 

2373 >>> from scipy.stats import ansari 

2374 >>> rng = np.random.default_rng() 

2375 

2376 For these examples, we'll create three random data sets. The first 

2377 two, with sizes 35 and 25, are drawn from a normal distribution with 

2378 mean 0 and standard deviation 2. The third data set has size 25 and 

2379 is drawn from a normal distribution with standard deviation 1.25. 

2380 

2381 >>> x1 = rng.normal(loc=0, scale=2, size=35) 

2382 >>> x2 = rng.normal(loc=0, scale=2, size=25) 

2383 >>> x3 = rng.normal(loc=0, scale=1.25, size=25) 

2384 

2385 First we apply `ansari` to `x1` and `x2`. These samples are drawn 

2386 from the same distribution, so we expect the Ansari-Bradley test 

2387 should not lead us to conclude that the scales of the distributions 

2388 are different. 

2389 

2390 >>> ansari(x1, x2) 

2391 AnsariResult(statistic=541.0, pvalue=0.9762532927399098) 

2392 

2393 With a p-value close to 1, we cannot conclude that there is a 

2394 significant difference in the scales (as expected). 

2395 

2396 Now apply the test to `x1` and `x3`: 

2397 

2398 >>> ansari(x1, x3) 

2399 AnsariResult(statistic=425.0, pvalue=0.0003087020407974518) 

2400 

2401 The probability of observing such an extreme value of the statistic 

2402 under the null hypothesis of equal scales is only 0.03087%. We take this 

2403 as evidence against the null hypothesis in favor of the alternative: 

2404 the scales of the distributions from which the samples were drawn 

2405 are not equal. 

2406 

2407 We can use the `alternative` parameter to perform a one-tailed test. 

2408 In the above example, the scale of `x1` is greater than `x3` and so 

2409 the ratio of scales of `x1` and `x3` is greater than 1. This means 

2410 that the p-value when ``alternative='greater'`` should be near 0 and 

2411 hence we should be able to reject the null hypothesis: 

2412 

2413 >>> ansari(x1, x3, alternative='greater') 

2414 AnsariResult(statistic=425.0, pvalue=0.0001543510203987259) 

2415 

2416 As we can see, the p-value is indeed quite low. Use of 

2417 ``alternative='less'`` should thus yield a large p-value: 

2418 

2419 >>> ansari(x1, x3, alternative='less') 

2420 AnsariResult(statistic=425.0, pvalue=0.9998643258449039) 

2421 

2422 """ 

2423 if alternative not in {'two-sided', 'greater', 'less'}: 

2424 raise ValueError("'alternative' must be 'two-sided'," 

2425 " 'greater', or 'less'.") 

2426 x, y = asarray(x), asarray(y) 

2427 n = len(x) 

2428 m = len(y) 

2429 if m < 1: 

2430 raise ValueError("Not enough other observations.") 

2431 if n < 1: 

2432 raise ValueError("Not enough test observations.") 

2433 

2434 N = m + n 

2435 xy = r_[x, y] # combine 

2436 rank = _stats_py.rankdata(xy) 

2437 symrank = amin(array((rank, N - rank + 1)), 0) 

2438 AB = np.sum(symrank[:n], axis=0) 

2439 uxy = unique(xy) 

2440 repeats = (len(uxy) != len(xy)) 

2441 exact = ((m < 55) and (n < 55) and not repeats) 

2442 if repeats and (m < 55 or n < 55): 

2443 warnings.warn("Ties preclude use of exact statistic.") 

2444 if exact: 

2445 if alternative == 'two-sided': 

2446 pval = 2.0 * np.minimum(_abw_state.cdf(AB, n, m), 

2447 _abw_state.sf(AB, n, m)) 

2448 elif alternative == 'greater': 

2449 # AB statistic is _smaller_ when ratio of scales is larger, 

2450 # so this is the opposite of the usual calculation 

2451 pval = _abw_state.cdf(AB, n, m) 

2452 else: 

2453 pval = _abw_state.sf(AB, n, m) 

2454 return AnsariResult(AB, min(1.0, pval)) 

2455 

2456 # otherwise compute normal approximation 

2457 if N % 2: # N odd 

2458 mnAB = n * (N+1.0)**2 / 4.0 / N 

2459 varAB = n * m * (N+1.0) * (3+N**2) / (48.0 * N**2) 

2460 else: 

2461 mnAB = n * (N+2.0) / 4.0 

2462 varAB = m * n * (N+2) * (N-2.0) / 48 / (N-1.0) 

2463 if repeats: # adjust variance estimates 

2464 # compute np.sum(tj * rj**2,axis=0) 

2465 fac = np.sum(symrank**2, axis=0) 

2466 if N % 2: # N odd 

2467 varAB = m * n * (16*N*fac - (N+1)**4) / (16.0 * N**2 * (N-1)) 

2468 else: # N even 

2469 varAB = m * n * (16*fac - N*(N+2)**2) / (16.0 * N * (N-1)) 

2470 

2471 # Small values of AB indicate larger dispersion for the x sample. 

2472 # Large values of AB indicate larger dispersion for the y sample. 

2473 # This is opposite to the way we define the ratio of scales. see [1]_. 

2474 z = (mnAB - AB) / sqrt(varAB) 

2475 z, pval = _normtest_finish(z, alternative) 

2476 return AnsariResult(AB, pval) 

2477 

2478 

2479BartlettResult = namedtuple('BartlettResult', ('statistic', 'pvalue')) 

2480 

2481 

2482def bartlett(*samples): 

2483 """Perform Bartlett's test for equal variances. 

2484 

2485 Bartlett's test tests the null hypothesis that all input samples 

2486 are from populations with equal variances. For samples 

2487 from significantly non-normal populations, Levene's test 

2488 `levene` is more robust. 

2489 

2490 Parameters 

2491 ---------- 

2492 sample1, sample2, ... : array_like 

2493 arrays of sample data. Only 1d arrays are accepted, they may have 

2494 different lengths. 

2495 

2496 Returns 

2497 ------- 

2498 statistic : float 

2499 The test statistic. 

2500 pvalue : float 

2501 The p-value of the test. 

2502 

2503 See Also 

2504 -------- 

2505 fligner : A non-parametric test for the equality of k variances 

2506 levene : A robust parametric test for equality of k variances 

2507 

2508 Notes 

2509 ----- 

2510 Conover et al. (1981) examine many of the existing parametric and 

2511 nonparametric tests by extensive simulations and they conclude that the 

2512 tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be 

2513 superior in terms of robustness of departures from normality and power 

2514 ([3]_). 

2515 

2516 References 

2517 ---------- 

2518 .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm 

2519 

2520 .. [2] Snedecor, George W. and Cochran, William G. (1989), Statistical 

2521 Methods, Eighth Edition, Iowa State University Press. 

2522 

2523 .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and 

2524 Hypothesis Testing based on Quadratic Inference Function. Technical 

2525 Report #99-03, Center for Likelihood Studies, Pennsylvania State 

2526 University. 

2527 

2528 .. [4] Bartlett, M. S. (1937). Properties of Sufficiency and Statistical 

2529 Tests. Proceedings of the Royal Society of London. Series A, 

2530 Mathematical and Physical Sciences, Vol. 160, No.901, pp. 268-282. 

2531 

2532 Examples 

2533 -------- 

2534 Test whether or not the lists `a`, `b` and `c` come from populations 

2535 with equal variances. 

2536 

2537 >>> import numpy as np 

2538 >>> from scipy.stats import bartlett 

2539 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] 

2540 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05] 

2541 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98] 

2542 >>> stat, p = bartlett(a, b, c) 

2543 >>> p 

2544 1.1254782518834628e-05 

2545 

2546 The very small p-value suggests that the populations do not have equal 

2547 variances. 

2548 

2549 This is not surprising, given that the sample variance of `b` is much 

2550 larger than that of `a` and `c`: 

2551 

2552 >>> [np.var(x, ddof=1) for x in [a, b, c]] 

2553 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002] 

2554 

2555 """ 

2556 # Handle empty input and input that is not 1d 

2557 for sample in samples: 

2558 if np.asanyarray(sample).size == 0: 

2559 return BartlettResult(np.nan, np.nan) 

2560 if np.asanyarray(sample).ndim > 1: 

2561 raise ValueError('Samples must be one-dimensional.') 

2562 

2563 k = len(samples) 

2564 if k < 2: 

2565 raise ValueError("Must enter at least two input sample vectors.") 

2566 Ni = np.empty(k) 

2567 ssq = np.empty(k, 'd') 

2568 for j in range(k): 

2569 Ni[j] = len(samples[j]) 

2570 ssq[j] = np.var(samples[j], ddof=1) 

2571 Ntot = np.sum(Ni, axis=0) 

2572 spsq = np.sum((Ni - 1)*ssq, axis=0) / (1.0*(Ntot - k)) 

2573 numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0) 

2574 denom = 1.0 + 1.0/(3*(k - 1)) * ((np.sum(1.0/(Ni - 1.0), axis=0)) - 

2575 1.0/(Ntot - k)) 

2576 T = numer / denom 

2577 pval = distributions.chi2.sf(T, k - 1) # 1 - cdf 

2578 

2579 return BartlettResult(T, pval) 

2580 

2581 

2582LeveneResult = namedtuple('LeveneResult', ('statistic', 'pvalue')) 

2583 

2584 

2585def levene(*samples, center='median', proportiontocut=0.05): 

2586 """Perform Levene test for equal variances. 

2587 

2588 The Levene test tests the null hypothesis that all input samples 

2589 are from populations with equal variances. Levene's test is an 

2590 alternative to Bartlett's test `bartlett` in the case where 

2591 there are significant deviations from normality. 

2592 

2593 Parameters 

2594 ---------- 

2595 sample1, sample2, ... : array_like 

2596 The sample data, possibly with different lengths. Only one-dimensional 

2597 samples are accepted. 

2598 center : {'mean', 'median', 'trimmed'}, optional 

2599 Which function of the data to use in the test. The default 

2600 is 'median'. 

2601 proportiontocut : float, optional 

2602 When `center` is 'trimmed', this gives the proportion of data points 

2603 to cut from each end. (See `scipy.stats.trim_mean`.) 

2604 Default is 0.05. 

2605 

2606 Returns 

2607 ------- 

2608 statistic : float 

2609 The test statistic. 

2610 pvalue : float 

2611 The p-value for the test. 

2612 

2613 Notes 

2614 ----- 

2615 Three variations of Levene's test are possible. The possibilities 

2616 and their recommended usages are: 

2617 

2618 * 'median' : Recommended for skewed (non-normal) distributions> 

2619 * 'mean' : Recommended for symmetric, moderate-tailed distributions. 

2620 * 'trimmed' : Recommended for heavy-tailed distributions. 

2621 

2622 The test version using the mean was proposed in the original article 

2623 of Levene ([2]_) while the median and trimmed mean have been studied by 

2624 Brown and Forsythe ([3]_), sometimes also referred to as Brown-Forsythe 

2625 test. 

2626 

2627 References 

2628 ---------- 

2629 .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm 

2630 .. [2] Levene, H. (1960). In Contributions to Probability and Statistics: 

2631 Essays in Honor of Harold Hotelling, I. Olkin et al. eds., 

2632 Stanford University Press, pp. 278-292. 

2633 .. [3] Brown, M. B. and Forsythe, A. B. (1974), Journal of the American 

2634 Statistical Association, 69, 364-367 

2635 

2636 Examples 

2637 -------- 

2638 Test whether or not the lists `a`, `b` and `c` come from populations 

2639 with equal variances. 

2640 

2641 >>> import numpy as np 

2642 >>> from scipy.stats import levene 

2643 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] 

2644 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05] 

2645 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98] 

2646 >>> stat, p = levene(a, b, c) 

2647 >>> p 

2648 0.002431505967249681 

2649 

2650 The small p-value suggests that the populations do not have equal 

2651 variances. 

2652 

2653 This is not surprising, given that the sample variance of `b` is much 

2654 larger than that of `a` and `c`: 

2655 

2656 >>> [np.var(x, ddof=1) for x in [a, b, c]] 

2657 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002] 

2658 

2659 """ 

2660 if center not in ['mean', 'median', 'trimmed']: 

2661 raise ValueError("center must be 'mean', 'median' or 'trimmed'.") 

2662 

2663 k = len(samples) 

2664 if k < 2: 

2665 raise ValueError("Must enter at least two input sample vectors.") 

2666 # check for 1d input 

2667 for j in range(k): 

2668 if np.asanyarray(samples[j]).ndim > 1: 

2669 raise ValueError('Samples must be one-dimensional.') 

2670 

2671 Ni = np.empty(k) 

2672 Yci = np.empty(k, 'd') 

2673 

2674 if center == 'median': 

2675 func = lambda x: np.median(x, axis=0) 

2676 elif center == 'mean': 

2677 func = lambda x: np.mean(x, axis=0) 

2678 else: # center == 'trimmed' 

2679 samples = tuple(_stats_py.trimboth(np.sort(sample), proportiontocut) 

2680 for sample in samples) 

2681 func = lambda x: np.mean(x, axis=0) 

2682 

2683 for j in range(k): 

2684 Ni[j] = len(samples[j]) 

2685 Yci[j] = func(samples[j]) 

2686 Ntot = np.sum(Ni, axis=0) 

2687 

2688 # compute Zij's 

2689 Zij = [None] * k 

2690 for i in range(k): 

2691 Zij[i] = abs(asarray(samples[i]) - Yci[i]) 

2692 

2693 # compute Zbari 

2694 Zbari = np.empty(k, 'd') 

2695 Zbar = 0.0 

2696 for i in range(k): 

2697 Zbari[i] = np.mean(Zij[i], axis=0) 

2698 Zbar += Zbari[i] * Ni[i] 

2699 

2700 Zbar /= Ntot 

2701 numer = (Ntot - k) * np.sum(Ni * (Zbari - Zbar)**2, axis=0) 

2702 

2703 # compute denom_variance 

2704 dvar = 0.0 

2705 for i in range(k): 

2706 dvar += np.sum((Zij[i] - Zbari[i])**2, axis=0) 

2707 

2708 denom = (k - 1.0) * dvar 

2709 

2710 W = numer / denom 

2711 pval = distributions.f.sf(W, k-1, Ntot-k) # 1 - cdf 

2712 return LeveneResult(W, pval) 

2713 

2714 

2715@_deprecated("'binom_test' is deprecated in favour of" 

2716 " 'binomtest' from version 1.7.0 and will" 

2717 " be removed in Scipy 1.12.0.") 

2718def binom_test(x, n=None, p=0.5, alternative='two-sided'): 

2719 """Perform a test that the probability of success is p. 

2720 

2721 This is an exact, two-sided test of the null hypothesis 

2722 that the probability of success in a Bernoulli experiment 

2723 is `p`. 

2724 

2725 .. deprecated:: 1.10.0 

2726 `binom_test` is deprecated in favour of `binomtest` and will 

2727 be removed in Scipy 1.12.0. 

2728 

2729 Parameters 

2730 ---------- 

2731 x : int or array_like 

2732 The number of successes, or if x has length 2, it is the 

2733 number of successes and the number of failures. 

2734 n : int 

2735 The number of trials. This is ignored if x gives both the 

2736 number of successes and failures. 

2737 p : float, optional 

2738 The hypothesized probability of success. ``0 <= p <= 1``. The 

2739 default value is ``p = 0.5``. 

2740 alternative : {'two-sided', 'greater', 'less'}, optional 

2741 Indicates the alternative hypothesis. The default value is 

2742 'two-sided'. 

2743 

2744 Returns 

2745 ------- 

2746 p-value : float 

2747 The p-value of the hypothesis test. 

2748 

2749 References 

2750 ---------- 

2751 .. [1] https://en.wikipedia.org/wiki/Binomial_test 

2752 

2753 Examples 

2754 -------- 

2755 >>> from scipy import stats 

2756 

2757 A car manufacturer claims that no more than 10% of their cars are unsafe. 

2758 15 cars are inspected for safety, 3 were found to be unsafe. Test the 

2759 manufacturer's claim: 

2760 

2761 >>> stats.binom_test(3, n=15, p=0.1, alternative='greater') 

2762 0.18406106910639114 

2763 

2764 The null hypothesis cannot be rejected at the 5% level of significance 

2765 because the returned p-value is greater than the critical value of 5%. 

2766 

2767 """ 

2768 x = atleast_1d(x).astype(np.int_) 

2769 if len(x) == 2: 

2770 n = x[1] + x[0] 

2771 x = x[0] 

2772 elif len(x) == 1: 

2773 x = x[0] 

2774 if n is None or n < x: 

2775 raise ValueError("n must be >= x") 

2776 n = np.int_(n) 

2777 else: 

2778 raise ValueError("Incorrect length for x.") 

2779 

2780 if (p > 1.0) or (p < 0.0): 

2781 raise ValueError("p must be in range [0,1]") 

2782 

2783 if alternative not in ('two-sided', 'less', 'greater'): 

2784 raise ValueError("alternative not recognized\n" 

2785 "should be 'two-sided', 'less' or 'greater'") 

2786 

2787 if alternative == 'less': 

2788 pval = distributions.binom.cdf(x, n, p) 

2789 return pval 

2790 

2791 if alternative == 'greater': 

2792 pval = distributions.binom.sf(x-1, n, p) 

2793 return pval 

2794 

2795 # if alternative was neither 'less' nor 'greater', then it's 'two-sided' 

2796 d = distributions.binom.pmf(x, n, p) 

2797 rerr = 1 + 1e-7 

2798 if x == p * n: 

2799 # special case as shortcut, would also be handled by `else` below 

2800 pval = 1. 

2801 elif x < p * n: 

2802 i = np.arange(np.ceil(p * n), n+1) 

2803 y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0) 

2804 pval = (distributions.binom.cdf(x, n, p) + 

2805 distributions.binom.sf(n - y, n, p)) 

2806 else: 

2807 i = np.arange(np.floor(p*n) + 1) 

2808 y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0) 

2809 pval = (distributions.binom.cdf(y-1, n, p) + 

2810 distributions.binom.sf(x-1, n, p)) 

2811 

2812 return min(1.0, pval) 

2813 

2814 

2815def _apply_func(x, g, func): 

2816 # g is list of indices into x 

2817 # separating x into different groups 

2818 # func should be applied over the groups 

2819 g = unique(r_[0, g, len(x)]) 

2820 output = [func(x[g[k]:g[k+1]]) for k in range(len(g) - 1)] 

2821 

2822 return asarray(output) 

2823 

2824 

2825FlignerResult = namedtuple('FlignerResult', ('statistic', 'pvalue')) 

2826 

2827 

2828def fligner(*samples, center='median', proportiontocut=0.05): 

2829 """Perform Fligner-Killeen test for equality of variance. 

2830 

2831 Fligner's test tests the null hypothesis that all input samples 

2832 are from populations with equal variances. Fligner-Killeen's test is 

2833 distribution free when populations are identical [2]_. 

2834 

2835 Parameters 

2836 ---------- 

2837 sample1, sample2, ... : array_like 

2838 Arrays of sample data. Need not be the same length. 

2839 center : {'mean', 'median', 'trimmed'}, optional 

2840 Keyword argument controlling which function of the data is used in 

2841 computing the test statistic. The default is 'median'. 

2842 proportiontocut : float, optional 

2843 When `center` is 'trimmed', this gives the proportion of data points 

2844 to cut from each end. (See `scipy.stats.trim_mean`.) 

2845 Default is 0.05. 

2846 

2847 Returns 

2848 ------- 

2849 statistic : float 

2850 The test statistic. 

2851 pvalue : float 

2852 The p-value for the hypothesis test. 

2853 

2854 See Also 

2855 -------- 

2856 bartlett : A parametric test for equality of k variances in normal samples 

2857 levene : A robust parametric test for equality of k variances 

2858 

2859 Notes 

2860 ----- 

2861 As with Levene's test there are three variants of Fligner's test that 

2862 differ by the measure of central tendency used in the test. See `levene` 

2863 for more information. 

2864 

2865 Conover et al. (1981) examine many of the existing parametric and 

2866 nonparametric tests by extensive simulations and they conclude that the 

2867 tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be 

2868 superior in terms of robustness of departures from normality and power [3]_. 

2869 

2870 References 

2871 ---------- 

2872 .. [1] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and 

2873 Hypothesis Testing based on Quadratic Inference Function. Technical 

2874 Report #99-03, Center for Likelihood Studies, Pennsylvania State 

2875 University. 

2876 https://cecas.clemson.edu/~cspark/cv/paper/qif/draftqif2.pdf 

2877 

2878 .. [2] Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample 

2879 tests for scale. 'Journal of the American Statistical Association.' 

2880 71(353), 210-213. 

2881 

2882 .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and 

2883 Hypothesis Testing based on Quadratic Inference Function. Technical 

2884 Report #99-03, Center for Likelihood Studies, Pennsylvania State 

2885 University. 

2886 

2887 .. [4] Conover, W. J., Johnson, M. E. and Johnson M. M. (1981). A 

2888 comparative study of tests for homogeneity of variances, with 

2889 applications to the outer continental shelf biding data. 

2890 Technometrics, 23(4), 351-361. 

2891 

2892 Examples 

2893 -------- 

2894 Test whether or not the lists `a`, `b` and `c` come from populations 

2895 with equal variances. 

2896 

2897 >>> import numpy as np 

2898 >>> from scipy.stats import fligner 

2899 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] 

2900 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05] 

2901 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98] 

2902 >>> stat, p = fligner(a, b, c) 

2903 >>> p 

2904 0.00450826080004775 

2905 

2906 The small p-value suggests that the populations do not have equal 

2907 variances. 

2908 

2909 This is not surprising, given that the sample variance of `b` is much 

2910 larger than that of `a` and `c`: 

2911 

2912 >>> [np.var(x, ddof=1) for x in [a, b, c]] 

2913 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002] 

2914 

2915 """ 

2916 if center not in ['mean', 'median', 'trimmed']: 

2917 raise ValueError("center must be 'mean', 'median' or 'trimmed'.") 

2918 

2919 # Handle empty input 

2920 for sample in samples: 

2921 if np.asanyarray(sample).size == 0: 

2922 return FlignerResult(np.nan, np.nan) 

2923 

2924 k = len(samples) 

2925 if k < 2: 

2926 raise ValueError("Must enter at least two input sample vectors.") 

2927 

2928 if center == 'median': 

2929 func = lambda x: np.median(x, axis=0) 

2930 elif center == 'mean': 

2931 func = lambda x: np.mean(x, axis=0) 

2932 else: # center == 'trimmed' 

2933 samples = tuple(_stats_py.trimboth(sample, proportiontocut) 

2934 for sample in samples) 

2935 func = lambda x: np.mean(x, axis=0) 

2936 

2937 Ni = asarray([len(samples[j]) for j in range(k)]) 

2938 Yci = asarray([func(samples[j]) for j in range(k)]) 

2939 Ntot = np.sum(Ni, axis=0) 

2940 # compute Zij's 

2941 Zij = [abs(asarray(samples[i]) - Yci[i]) for i in range(k)] 

2942 allZij = [] 

2943 g = [0] 

2944 for i in range(k): 

2945 allZij.extend(list(Zij[i])) 

2946 g.append(len(allZij)) 

2947 

2948 ranks = _stats_py.rankdata(allZij) 

2949 sample = distributions.norm.ppf(ranks / (2*(Ntot + 1.0)) + 0.5) 

2950 

2951 # compute Aibar 

2952 Aibar = _apply_func(sample, g, np.sum) / Ni 

2953 anbar = np.mean(sample, axis=0) 

2954 varsq = np.var(sample, axis=0, ddof=1) 

2955 Xsq = np.sum(Ni * (asarray(Aibar) - anbar)**2.0, axis=0) / varsq 

2956 pval = distributions.chi2.sf(Xsq, k - 1) # 1 - cdf 

2957 return FlignerResult(Xsq, pval) 

2958 

2959 

2960@_axis_nan_policy_factory(lambda x1: (x1,), n_samples=4, n_outputs=1) 

2961def _mood_inner_lc(xy, x, diffs, sorted_xy, n, m, N) -> float: 

2962 # Obtain the unique values and their frequencies from the pooled samples. 

2963 # "a_j, + b_j, = t_j, for j = 1, ... k" where `k` is the number of unique 

2964 # classes, and "[t]he number of values associated with the x's and y's in 

2965 # the jth class will be denoted by a_j, and b_j respectively." 

2966 # (Mielke, 312) 

2967 # Reuse previously computed sorted array and `diff` arrays to obtain the 

2968 # unique values and counts. Prepend `diffs` with a non-zero to indicate 

2969 # that the first element should be marked as not matching what preceded it. 

2970 diffs_prep = np.concatenate(([1], diffs)) 

2971 # Unique elements are where the was a difference between elements in the 

2972 # sorted array 

2973 uniques = sorted_xy[diffs_prep != 0] 

2974 # The count of each element is the bin size for each set of consecutive 

2975 # differences where the difference is zero. Replace nonzero differences 

2976 # with 1 and then use the cumulative sum to count the indices. 

2977 t = np.bincount(np.cumsum(np.asarray(diffs_prep != 0, dtype=int)))[1:] 

2978 k = len(uniques) 

2979 js = np.arange(1, k + 1, dtype=int) 

2980 # the `b` array mentioned in the paper is not used, outside of the 

2981 # calculation of `t`, so we do not need to calculate it separately. Here 

2982 # we calculate `a`. In plain language, `a[j]` is the number of values in 

2983 # `x` that equal `uniques[j]`. 

2984 sorted_xyx = np.sort(np.concatenate((xy, x))) 

2985 diffs = np.diff(sorted_xyx) 

2986 diffs_prep = np.concatenate(([1], diffs)) 

2987 diff_is_zero = np.asarray(diffs_prep != 0, dtype=int) 

2988 xyx_counts = np.bincount(np.cumsum(diff_is_zero))[1:] 

2989 a = xyx_counts - t 

2990 # "Define .. a_0 = b_0 = t_0 = S_0 = 0" (Mielke 312) so we shift `a` 

2991 # and `t` arrays over 1 to allow a first element of 0 to accommodate this 

2992 # indexing. 

2993 t = np.concatenate(([0], t)) 

2994 a = np.concatenate(([0], a)) 

2995 # S is built from `t`, so it does not need a preceding zero added on. 

2996 S = np.cumsum(t) 

2997 # define a copy of `S` with a prepending zero for later use to avoid 

2998 # the need for indexing. 

2999 S_i_m1 = np.concatenate(([0], S[:-1])) 

3000 

3001 # Psi, as defined by the 6th unnumbered equation on page 313 (Mielke). 

3002 # Note that in the paper there is an error where the denominator `2` is 

3003 # squared when it should be the entire equation. 

3004 def psi(indicator): 

3005 return (indicator - (N + 1)/2)**2 

3006 

3007 # define summation range for use in calculation of phi, as seen in sum 

3008 # in the unnumbered equation on the bottom of page 312 (Mielke). 

3009 s_lower = S[js - 1] + 1 

3010 s_upper = S[js] + 1 

3011 phi_J = [np.arange(s_lower[idx], s_upper[idx]) for idx in range(k)] 

3012 

3013 # for every range in the above array, determine the sum of psi(I) for 

3014 # every element in the range. Divide all the sums by `t`. Following the 

3015 # last unnumbered equation on page 312. 

3016 phis = [np.sum(psi(I_j)) for I_j in phi_J] / t[js] 

3017 

3018 # `T` is equal to a[j] * phi[j], per the first unnumbered equation on 

3019 # page 312. `phis` is already in the order based on `js`, so we index 

3020 # into `a` with `js` as well. 

3021 T = sum(phis * a[js]) 

3022 

3023 # The approximate statistic 

3024 E_0_T = n * (N * N - 1) / 12 

3025 

3026 varM = (m * n * (N + 1.0) * (N ** 2 - 4) / 180 - 

3027 m * n / (180 * N * (N - 1)) * np.sum( 

3028 t * (t**2 - 1) * (t**2 - 4 + (15 * (N - S - S_i_m1) ** 2)) 

3029 )) 

3030 

3031 return ((T - E_0_T) / np.sqrt(varM),) 

3032 

3033 

3034def mood(x, y, axis=0, alternative="two-sided"): 

3035 """Perform Mood's test for equal scale parameters. 

3036 

3037 Mood's two-sample test for scale parameters is a non-parametric 

3038 test for the null hypothesis that two samples are drawn from the 

3039 same distribution with the same scale parameter. 

3040 

3041 Parameters 

3042 ---------- 

3043 x, y : array_like 

3044 Arrays of sample data. 

3045 axis : int, optional 

3046 The axis along which the samples are tested. `x` and `y` can be of 

3047 different length along `axis`. 

3048 If `axis` is None, `x` and `y` are flattened and the test is done on 

3049 all values in the flattened arrays. 

3050 alternative : {'two-sided', 'less', 'greater'}, optional 

3051 Defines the alternative hypothesis. Default is 'two-sided'. 

3052 The following options are available: 

3053 

3054 * 'two-sided': the scales of the distributions underlying `x` and `y` 

3055 are different. 

3056 * 'less': the scale of the distribution underlying `x` is less than 

3057 the scale of the distribution underlying `y`. 

3058 * 'greater': the scale of the distribution underlying `x` is greater 

3059 than the scale of the distribution underlying `y`. 

3060 

3061 .. versionadded:: 1.7.0 

3062 

3063 Returns 

3064 ------- 

3065 res : SignificanceResult 

3066 An object containing attributes: 

3067 

3068 statistic : scalar or ndarray 

3069 The z-score for the hypothesis test. For 1-D inputs a scalar is 

3070 returned. 

3071 pvalue : scalar ndarray 

3072 The p-value for the hypothesis test. 

3073 

3074 See Also 

3075 -------- 

3076 fligner : A non-parametric test for the equality of k variances 

3077 ansari : A non-parametric test for the equality of 2 variances 

3078 bartlett : A parametric test for equality of k variances in normal samples 

3079 levene : A parametric test for equality of k variances 

3080 

3081 Notes 

3082 ----- 

3083 The data are assumed to be drawn from probability distributions ``f(x)`` 

3084 and ``f(x/s) / s`` respectively, for some probability density function f. 

3085 The null hypothesis is that ``s == 1``. 

3086 

3087 For multi-dimensional arrays, if the inputs are of shapes 

3088 ``(n0, n1, n2, n3)`` and ``(n0, m1, n2, n3)``, then if ``axis=1``, the 

3089 resulting z and p values will have shape ``(n0, n2, n3)``. Note that 

3090 ``n1`` and ``m1`` don't have to be equal, but the other dimensions do. 

3091 

3092 References 

3093 ---------- 

3094 [1] Mielke, Paul W. "Note on Some Squared Rank Tests with Existing Ties." 

3095 Technometrics, vol. 9, no. 2, 1967, pp. 312-14. JSTOR, 

3096 https://doi.org/10.2307/1266427. Accessed 18 May 2022. 

3097 

3098 Examples 

3099 -------- 

3100 >>> import numpy as np 

3101 >>> from scipy import stats 

3102 >>> rng = np.random.default_rng() 

3103 >>> x2 = rng.standard_normal((2, 45, 6, 7)) 

3104 >>> x1 = rng.standard_normal((2, 30, 6, 7)) 

3105 >>> res = stats.mood(x1, x2, axis=1) 

3106 >>> res.pvalue.shape 

3107 (2, 6, 7) 

3108 

3109 Find the number of points where the difference in scale is not significant: 

3110 

3111 >>> (res.pvalue > 0.1).sum() 

3112 78 

3113 

3114 Perform the test with different scales: 

3115 

3116 >>> x1 = rng.standard_normal((2, 30)) 

3117 >>> x2 = rng.standard_normal((2, 35)) * 10.0 

3118 >>> stats.mood(x1, x2, axis=1) 

3119 SignificanceResult(statistic=array([-5.76174136, -6.12650783]), 

3120 pvalue=array([8.32505043e-09, 8.98287869e-10])) 

3121 

3122 """ 

3123 x = np.asarray(x, dtype=float) 

3124 y = np.asarray(y, dtype=float) 

3125 

3126 if axis is None: 

3127 x = x.flatten() 

3128 y = y.flatten() 

3129 axis = 0 

3130 

3131 if axis < 0: 

3132 axis = x.ndim + axis 

3133 

3134 # Determine shape of the result arrays 

3135 res_shape = tuple([x.shape[ax] for ax in range(len(x.shape)) if ax != axis]) 

3136 if not (res_shape == tuple([y.shape[ax] for ax in range(len(y.shape)) if 

3137 ax != axis])): 

3138 raise ValueError("Dimensions of x and y on all axes except `axis` " 

3139 "should match") 

3140 

3141 n = x.shape[axis] 

3142 m = y.shape[axis] 

3143 N = m + n 

3144 if N < 3: 

3145 raise ValueError("Not enough observations.") 

3146 

3147 xy = np.concatenate((x, y), axis=axis) 

3148 # determine if any of the samples contain ties 

3149 sorted_xy = np.sort(xy, axis=axis) 

3150 diffs = np.diff(sorted_xy, axis=axis) 

3151 if 0 in diffs: 

3152 z = np.asarray(_mood_inner_lc(xy, x, diffs, sorted_xy, n, m, N, 

3153 axis=axis)) 

3154 else: 

3155 if axis != 0: 

3156 xy = np.moveaxis(xy, axis, 0) 

3157 

3158 xy = xy.reshape(xy.shape[0], -1) 

3159 # Generalized to the n-dimensional case by adding the axis argument, 

3160 # and using for loops, since rankdata is not vectorized. For improving 

3161 # performance consider vectorizing rankdata function. 

3162 all_ranks = np.empty_like(xy) 

3163 for j in range(xy.shape[1]): 

3164 all_ranks[:, j] = _stats_py.rankdata(xy[:, j]) 

3165 

3166 Ri = all_ranks[:n] 

3167 M = np.sum((Ri - (N + 1.0) / 2) ** 2, axis=0) 

3168 # Approx stat. 

3169 mnM = n * (N * N - 1.0) / 12 

3170 varM = m * n * (N + 1.0) * (N + 2) * (N - 2) / 180 

3171 z = (M - mnM) / sqrt(varM) 

3172 z, pval = _normtest_finish(z, alternative) 

3173 

3174 if res_shape == (): 

3175 # Return scalars, not 0-D arrays 

3176 z = z[0] 

3177 pval = pval[0] 

3178 else: 

3179 z.shape = res_shape 

3180 pval.shape = res_shape 

3181 return SignificanceResult(z, pval) 

3182 

3183 

3184WilcoxonResult = _make_tuple_bunch('WilcoxonResult', ['statistic', 'pvalue']) 

3185 

3186 

3187def wilcoxon_result_unpacker(res): 

3188 if hasattr(res, 'zstatistic'): 

3189 return res.statistic, res.pvalue, res.zstatistic 

3190 else: 

3191 return res.statistic, res.pvalue 

3192 

3193 

3194def wilcoxon_result_object(statistic, pvalue, zstatistic=None): 

3195 res = WilcoxonResult(statistic, pvalue) 

3196 if zstatistic is not None: 

3197 res.zstatistic = zstatistic 

3198 return res 

3199 

3200 

3201def wilcoxon_outputs(kwds): 

3202 method = kwds.get('method', 'auto') 

3203 if method == 'approx': 

3204 return 3 

3205 return 2 

3206 

3207 

3208@_rename_parameter("mode", "method") 

3209@_axis_nan_policy_factory( 

3210 wilcoxon_result_object, paired=True, 

3211 n_samples=lambda kwds: 2 if kwds.get('y', None) is not None else 1, 

3212 result_to_tuple=wilcoxon_result_unpacker, n_outputs=wilcoxon_outputs, 

3213) 

3214def wilcoxon(x, y=None, zero_method="wilcox", correction=False, 

3215 alternative="two-sided", method='auto'): 

3216 """Calculate the Wilcoxon signed-rank test. 

3217 

3218 The Wilcoxon signed-rank test tests the null hypothesis that two 

3219 related paired samples come from the same distribution. In particular, 

3220 it tests whether the distribution of the differences ``x - y`` is symmetric 

3221 about zero. It is a non-parametric version of the paired T-test. 

3222 

3223 Parameters 

3224 ---------- 

3225 x : array_like 

3226 Either the first set of measurements (in which case ``y`` is the second 

3227 set of measurements), or the differences between two sets of 

3228 measurements (in which case ``y`` is not to be specified.) Must be 

3229 one-dimensional. 

3230 y : array_like, optional 

3231 Either the second set of measurements (if ``x`` is the first set of 

3232 measurements), or not specified (if ``x`` is the differences between 

3233 two sets of measurements.) Must be one-dimensional. 

3234 zero_method : {"wilcox", "pratt", "zsplit"}, optional 

3235 There are different conventions for handling pairs of observations 

3236 with equal values ("zero-differences", or "zeros"). 

3237 

3238 * "wilcox": Discards all zero-differences (default); see [4]_. 

3239 * "pratt": Includes zero-differences in the ranking process, 

3240 but drops the ranks of the zeros (more conservative); see [3]_. 

3241 In this case, the normal approximation is adjusted as in [5]_. 

3242 * "zsplit": Includes zero-differences in the ranking process and 

3243 splits the zero rank between positive and negative ones. 

3244 

3245 correction : bool, optional 

3246 If True, apply continuity correction by adjusting the Wilcoxon rank 

3247 statistic by 0.5 towards the mean value when computing the 

3248 z-statistic if a normal approximation is used. Default is False. 

3249 alternative : {"two-sided", "greater", "less"}, optional 

3250 Defines the alternative hypothesis. Default is 'two-sided'. 

3251 In the following, let ``d`` represent the difference between the paired 

3252 samples: ``d = x - y`` if both ``x`` and ``y`` are provided, or 

3253 ``d = x`` otherwise. 

3254 

3255 * 'two-sided': the distribution underlying ``d`` is not symmetric 

3256 about zero. 

3257 * 'less': the distribution underlying ``d`` is stochastically less 

3258 than a distribution symmetric about zero. 

3259 * 'greater': the distribution underlying ``d`` is stochastically 

3260 greater than a distribution symmetric about zero. 

3261 

3262 method : {"auto", "exact", "approx"}, optional 

3263 Method to calculate the p-value, see Notes. Default is "auto". 

3264 

3265 Returns 

3266 ------- 

3267 An object with the following attributes. 

3268 

3269 statistic : array_like 

3270 If `alternative` is "two-sided", the sum of the ranks of the 

3271 differences above or below zero, whichever is smaller. 

3272 Otherwise the sum of the ranks of the differences above zero. 

3273 pvalue : array_like 

3274 The p-value for the test depending on `alternative` and `method`. 

3275 zstatistic : array_like 

3276 When ``method = 'approx'``, this is the normalized z-statistic:: 

3277 

3278 z = (T - mn - d) / se 

3279 

3280 where ``T`` is `statistic` as defined above, ``mn`` is the mean of the 

3281 distribution under the null hypothesis, ``d`` is a continuity 

3282 correction, and ``se`` is the standard error. 

3283 When ``method != 'approx'``, this attribute is not available. 

3284 

3285 See Also 

3286 -------- 

3287 kruskal, mannwhitneyu 

3288 

3289 Notes 

3290 ----- 

3291 In the following, let ``d`` represent the difference between the paired 

3292 samples: ``d = x - y`` if both ``x`` and ``y`` are provided, or ``d = x`` 

3293 otherwise. Assume that all elements of ``d`` are independent and 

3294 identically distributed observations, and all are distinct and nonzero. 

3295 

3296 - When ``len(d)`` is sufficiently large, the null distribution of the 

3297 normalized test statistic (`zstatistic` above) is approximately normal, 

3298 and ``method = 'approx'`` can be used to compute the p-value. 

3299 

3300 - When ``len(d)`` is small, the normal approximation may not be accurate, 

3301 and ``method='exact'`` is preferred (at the cost of additional 

3302 execution time). 

3303 

3304 - The default, ``method='auto'``, selects between the two: when 

3305 ``len(d) <= 50``, the exact method is used; otherwise, the approximate 

3306 method is used. 

3307 

3308 The presence of "ties" (i.e. not all elements of ``d`` are unique) and 

3309 "zeros" (i.e. elements of ``d`` are zero) changes the null distribution 

3310 of the test statistic, and ``method='exact'`` no longer calculates 

3311 the exact p-value. If ``method='approx'``, the z-statistic is adjusted 

3312 for more accurate comparison against the standard normal, but still, 

3313 for finite sample sizes, the standard normal is only an approximation of 

3314 the true null distribution of the z-statistic. There is no clear 

3315 consensus among references on which method most accurately approximates 

3316 the p-value for small samples in the presence of zeros and/or ties. In any 

3317 case, this is the behavior of `wilcoxon` when ``method='auto': 

3318 ``method='exact'`` is used when ``len(d) <= 50`` *and there are no zeros*; 

3319 otherwise, ``method='approx'`` is used. 

3320 

3321 References 

3322 ---------- 

3323 .. [1] https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test 

3324 .. [2] Conover, W.J., Practical Nonparametric Statistics, 1971. 

3325 .. [3] Pratt, J.W., Remarks on Zeros and Ties in the Wilcoxon Signed 

3326 Rank Procedures, Journal of the American Statistical Association, 

3327 Vol. 54, 1959, pp. 655-667. :doi:`10.1080/01621459.1959.10501526` 

3328 .. [4] Wilcoxon, F., Individual Comparisons by Ranking Methods, 

3329 Biometrics Bulletin, Vol. 1, 1945, pp. 80-83. :doi:`10.2307/3001968` 

3330 .. [5] Cureton, E.E., The Normal Approximation to the Signed-Rank 

3331 Sampling Distribution When Zero Differences are Present, 

3332 Journal of the American Statistical Association, Vol. 62, 1967, 

3333 pp. 1068-1069. :doi:`10.1080/01621459.1967.10500917` 

3334 

3335 Examples 

3336 -------- 

3337 In [4]_, the differences in height between cross- and self-fertilized 

3338 corn plants is given as follows: 

3339 

3340 >>> d = [6, 8, 14, 16, 23, 24, 28, 29, 41, -48, 49, 56, 60, -67, 75] 

3341 

3342 Cross-fertilized plants appear to be higher. To test the null 

3343 hypothesis that there is no height difference, we can apply the 

3344 two-sided test: 

3345 

3346 >>> from scipy.stats import wilcoxon 

3347 >>> res = wilcoxon(d) 

3348 >>> res.statistic, res.pvalue 

3349 (24.0, 0.041259765625) 

3350 

3351 Hence, we would reject the null hypothesis at a confidence level of 5%, 

3352 concluding that there is a difference in height between the groups. 

3353 To confirm that the median of the differences can be assumed to be 

3354 positive, we use: 

3355 

3356 >>> res = wilcoxon(d, alternative='greater') 

3357 >>> res.statistic, res.pvalue 

3358 (96.0, 0.0206298828125) 

3359 

3360 This shows that the null hypothesis that the median is negative can be 

3361 rejected at a confidence level of 5% in favor of the alternative that 

3362 the median is greater than zero. The p-values above are exact. Using the 

3363 normal approximation gives very similar values: 

3364 

3365 >>> res = wilcoxon(d, method='approx') 

3366 >>> res.statistic, res.pvalue 

3367 (24.0, 0.04088813291185591) 

3368 

3369 Note that the statistic changed to 96 in the one-sided case (the sum 

3370 of ranks of positive differences) whereas it is 24 in the two-sided 

3371 case (the minimum of sum of ranks above and below zero). 

3372 

3373 """ 

3374 mode = method 

3375 

3376 if mode not in ["auto", "approx", "exact"]: 

3377 raise ValueError("mode must be either 'auto', 'approx' or 'exact'") 

3378 

3379 if zero_method not in ["wilcox", "pratt", "zsplit"]: 

3380 raise ValueError("Zero method must be either 'wilcox' " 

3381 "or 'pratt' or 'zsplit'") 

3382 

3383 if alternative not in ["two-sided", "less", "greater"]: 

3384 raise ValueError("Alternative must be either 'two-sided', " 

3385 "'greater' or 'less'") 

3386 

3387 if y is None: 

3388 d = asarray(x) 

3389 if d.ndim > 1: 

3390 raise ValueError('Sample x must be one-dimensional.') 

3391 else: 

3392 x, y = map(asarray, (x, y)) 

3393 if x.ndim > 1 or y.ndim > 1: 

3394 raise ValueError('Samples x and y must be one-dimensional.') 

3395 if len(x) != len(y): 

3396 raise ValueError('The samples x and y must have the same length.') 

3397 d = x - y 

3398 

3399 if len(d) == 0: 

3400 res = WilcoxonResult(np.nan, np.nan) 

3401 if method == 'approx': 

3402 res.zstatistic = np.nan 

3403 return res 

3404 

3405 if mode == "auto": 

3406 if len(d) <= 50: 

3407 mode = "exact" 

3408 else: 

3409 mode = "approx" 

3410 

3411 n_zero = np.sum(d == 0) 

3412 if n_zero > 0 and mode == "exact": 

3413 mode = "approx" 

3414 warnings.warn("Exact p-value calculation does not work if there are " 

3415 "zeros. Switching to normal approximation.") 

3416 

3417 if mode == "approx": 

3418 if zero_method in ["wilcox", "pratt"]: 

3419 if n_zero == len(d): 

3420 raise ValueError("zero_method 'wilcox' and 'pratt' do not " 

3421 "work if x - y is zero for all elements.") 

3422 if zero_method == "wilcox": 

3423 # Keep all non-zero differences 

3424 d = compress(np.not_equal(d, 0), d) 

3425 

3426 count = len(d) 

3427 if count < 10 and mode == "approx": 

3428 warnings.warn("Sample size too small for normal approximation.") 

3429 

3430 r = _stats_py.rankdata(abs(d)) 

3431 r_plus = np.sum((d > 0) * r) 

3432 r_minus = np.sum((d < 0) * r) 

3433 

3434 if zero_method == "zsplit": 

3435 r_zero = np.sum((d == 0) * r) 

3436 r_plus += r_zero / 2. 

3437 r_minus += r_zero / 2. 

3438 

3439 # return min for two-sided test, but r_plus for one-sided test 

3440 # the literature is not consistent here 

3441 # r_plus is more informative since r_plus + r_minus = count*(count+1)/2, 

3442 # i.e. the sum of the ranks, so r_minus and the min can be inferred 

3443 # (If alternative='pratt', r_plus + r_minus = count*(count+1)/2 - r_zero.) 

3444 # [3] uses the r_plus for the one-sided test, keep min for two-sided test 

3445 # to keep backwards compatibility 

3446 if alternative == "two-sided": 

3447 T = min(r_plus, r_minus) 

3448 else: 

3449 T = r_plus 

3450 

3451 if mode == "approx": 

3452 mn = count * (count + 1.) * 0.25 

3453 se = count * (count + 1.) * (2. * count + 1.) 

3454 

3455 if zero_method == "pratt": 

3456 r = r[d != 0] 

3457 # normal approximation needs to be adjusted, see Cureton (1967) 

3458 mn -= n_zero * (n_zero + 1.) * 0.25 

3459 se -= n_zero * (n_zero + 1.) * (2. * n_zero + 1.) 

3460 

3461 replist, repnum = find_repeats(r) 

3462 if repnum.size != 0: 

3463 # Correction for repeated elements. 

3464 se -= 0.5 * (repnum * (repnum * repnum - 1)).sum() 

3465 

3466 se = sqrt(se / 24) 

3467 

3468 # apply continuity correction if applicable 

3469 d = 0 

3470 if correction: 

3471 if alternative == "two-sided": 

3472 d = 0.5 * np.sign(T - mn) 

3473 elif alternative == "less": 

3474 d = -0.5 

3475 else: 

3476 d = 0.5 

3477 

3478 # compute statistic and p-value using normal approximation 

3479 z = (T - mn - d) / se 

3480 if alternative == "two-sided": 

3481 prob = 2. * distributions.norm.sf(abs(z)) 

3482 elif alternative == "greater": 

3483 # large T = r_plus indicates x is greater than y; i.e. 

3484 # accept alternative in that case and return small p-value (sf) 

3485 prob = distributions.norm.sf(z) 

3486 else: 

3487 prob = distributions.norm.cdf(z) 

3488 elif mode == "exact": 

3489 # get pmf of the possible positive ranksums r_plus 

3490 pmf = _get_wilcoxon_distr(count) 

3491 # note: r_plus is int (ties not allowed), need int for slices below 

3492 r_plus = int(r_plus) 

3493 if alternative == "two-sided": 

3494 if r_plus == (len(pmf) - 1) // 2: 

3495 # r_plus is the center of the distribution. 

3496 prob = 1.0 

3497 else: 

3498 p_less = np.sum(pmf[:r_plus + 1]) 

3499 p_greater = np.sum(pmf[r_plus:]) 

3500 prob = 2*min(p_greater, p_less) 

3501 elif alternative == "greater": 

3502 prob = np.sum(pmf[r_plus:]) 

3503 else: 

3504 prob = np.sum(pmf[:r_plus + 1]) 

3505 prob = np.clip(prob, 0, 1) 

3506 

3507 res = WilcoxonResult(T, prob) 

3508 if method == 'approx': 

3509 res.zstatistic = z 

3510 return res 

3511 

3512 

3513MedianTestResult = _make_tuple_bunch( 

3514 'MedianTestResult', 

3515 ['statistic', 'pvalue', 'median', 'table'], [] 

3516) 

3517 

3518 

3519def median_test(*samples, ties='below', correction=True, lambda_=1, 

3520 nan_policy='propagate'): 

3521 """Perform a Mood's median test. 

3522 

3523 Test that two or more samples come from populations with the same median. 

3524 

3525 Let ``n = len(samples)`` be the number of samples. The "grand median" of 

3526 all the data is computed, and a contingency table is formed by 

3527 classifying the values in each sample as being above or below the grand 

3528 median. The contingency table, along with `correction` and `lambda_`, 

3529 are passed to `scipy.stats.chi2_contingency` to compute the test statistic 

3530 and p-value. 

3531 

3532 Parameters 

3533 ---------- 

3534 sample1, sample2, ... : array_like 

3535 The set of samples. There must be at least two samples. 

3536 Each sample must be a one-dimensional sequence containing at least 

3537 one value. The samples are not required to have the same length. 

3538 ties : str, optional 

3539 Determines how values equal to the grand median are classified in 

3540 the contingency table. The string must be one of:: 

3541 

3542 "below": 

3543 Values equal to the grand median are counted as "below". 

3544 "above": 

3545 Values equal to the grand median are counted as "above". 

3546 "ignore": 

3547 Values equal to the grand median are not counted. 

3548 

3549 The default is "below". 

3550 correction : bool, optional 

3551 If True, *and* there are just two samples, apply Yates' correction 

3552 for continuity when computing the test statistic associated with 

3553 the contingency table. Default is True. 

3554 lambda_ : float or str, optional 

3555 By default, the statistic computed in this test is Pearson's 

3556 chi-squared statistic. `lambda_` allows a statistic from the 

3557 Cressie-Read power divergence family to be used instead. See 

3558 `power_divergence` for details. 

3559 Default is 1 (Pearson's chi-squared statistic). 

3560 nan_policy : {'propagate', 'raise', 'omit'}, optional 

3561 Defines how to handle when input contains nan. 'propagate' returns nan, 

3562 'raise' throws an error, 'omit' performs the calculations ignoring nan 

3563 values. Default is 'propagate'. 

3564 

3565 Returns 

3566 ------- 

3567 res : MedianTestResult 

3568 An object containing attributes: 

3569 

3570 statistic : float 

3571 The test statistic. The statistic that is returned is determined 

3572 by `lambda_`. The default is Pearson's chi-squared statistic. 

3573 pvalue : float 

3574 The p-value of the test. 

3575 median : float 

3576 The grand median. 

3577 table : ndarray 

3578 The contingency table. The shape of the table is (2, n), where 

3579 n is the number of samples. The first row holds the counts of the 

3580 values above the grand median, and the second row holds the counts 

3581 of the values below the grand median. The table allows further 

3582 analysis with, for example, `scipy.stats.chi2_contingency`, or with 

3583 `scipy.stats.fisher_exact` if there are two samples, without having 

3584 to recompute the table. If ``nan_policy`` is "propagate" and there 

3585 are nans in the input, the return value for ``table`` is ``None``. 

3586 

3587 See Also 

3588 -------- 

3589 kruskal : Compute the Kruskal-Wallis H-test for independent samples. 

3590 mannwhitneyu : Computes the Mann-Whitney rank test on samples x and y. 

3591 

3592 Notes 

3593 ----- 

3594 .. versionadded:: 0.15.0 

3595 

3596 References 

3597 ---------- 

3598 .. [1] Mood, A. M., Introduction to the Theory of Statistics. McGraw-Hill 

3599 (1950), pp. 394-399. 

3600 .. [2] Zar, J. H., Biostatistical Analysis, 5th ed. Prentice Hall (2010). 

3601 See Sections 8.12 and 10.15. 

3602 

3603 Examples 

3604 -------- 

3605 A biologist runs an experiment in which there are three groups of plants. 

3606 Group 1 has 16 plants, group 2 has 15 plants, and group 3 has 17 plants. 

3607 Each plant produces a number of seeds. The seed counts for each group 

3608 are:: 

3609 

3610 Group 1: 10 14 14 18 20 22 24 25 31 31 32 39 43 43 48 49 

3611 Group 2: 28 30 31 33 34 35 36 40 44 55 57 61 91 92 99 

3612 Group 3: 0 3 9 22 23 25 25 33 34 34 40 45 46 48 62 67 84 

3613 

3614 The following code applies Mood's median test to these samples. 

3615 

3616 >>> g1 = [10, 14, 14, 18, 20, 22, 24, 25, 31, 31, 32, 39, 43, 43, 48, 49] 

3617 >>> g2 = [28, 30, 31, 33, 34, 35, 36, 40, 44, 55, 57, 61, 91, 92, 99] 

3618 >>> g3 = [0, 3, 9, 22, 23, 25, 25, 33, 34, 34, 40, 45, 46, 48, 62, 67, 84] 

3619 >>> from scipy.stats import median_test 

3620 >>> res = median_test(g1, g2, g3) 

3621 

3622 The median is 

3623 

3624 >>> res.median 

3625 34.0 

3626 

3627 and the contingency table is 

3628 

3629 >>> res.table 

3630 array([[ 5, 10, 7], 

3631 [11, 5, 10]]) 

3632 

3633 `p` is too large to conclude that the medians are not the same: 

3634 

3635 >>> res.pvalue 

3636 0.12609082774093244 

3637 

3638 The "G-test" can be performed by passing ``lambda_="log-likelihood"`` to 

3639 `median_test`. 

3640 

3641 >>> res = median_test(g1, g2, g3, lambda_="log-likelihood") 

3642 >>> res.pvalue 

3643 0.12224779737117837 

3644 

3645 The median occurs several times in the data, so we'll get a different 

3646 result if, for example, ``ties="above"`` is used: 

3647 

3648 >>> res = median_test(g1, g2, g3, ties="above") 

3649 >>> res.pvalue 

3650 0.063873276069553273 

3651 

3652 >>> res.table 

3653 array([[ 5, 11, 9], 

3654 [11, 4, 8]]) 

3655 

3656 This example demonstrates that if the data set is not large and there 

3657 are values equal to the median, the p-value can be sensitive to the 

3658 choice of `ties`. 

3659 

3660 """ 

3661 if len(samples) < 2: 

3662 raise ValueError('median_test requires two or more samples.') 

3663 

3664 ties_options = ['below', 'above', 'ignore'] 

3665 if ties not in ties_options: 

3666 raise ValueError("invalid 'ties' option '%s'; 'ties' must be one " 

3667 "of: %s" % (ties, str(ties_options)[1:-1])) 

3668 

3669 data = [np.asarray(sample) for sample in samples] 

3670 

3671 # Validate the sizes and shapes of the arguments. 

3672 for k, d in enumerate(data): 

3673 if d.size == 0: 

3674 raise ValueError("Sample %d is empty. All samples must " 

3675 "contain at least one value." % (k + 1)) 

3676 if d.ndim != 1: 

3677 raise ValueError("Sample %d has %d dimensions. All " 

3678 "samples must be one-dimensional sequences." % 

3679 (k + 1, d.ndim)) 

3680 

3681 cdata = np.concatenate(data) 

3682 contains_nan, nan_policy = _contains_nan(cdata, nan_policy) 

3683 if contains_nan and nan_policy == 'propagate': 

3684 return MedianTestResult(np.nan, np.nan, np.nan, None) 

3685 

3686 if contains_nan: 

3687 grand_median = np.median(cdata[~np.isnan(cdata)]) 

3688 else: 

3689 grand_median = np.median(cdata) 

3690 # When the minimum version of numpy supported by scipy is 1.9.0, 

3691 # the above if/else statement can be replaced by the single line: 

3692 # grand_median = np.nanmedian(cdata) 

3693 

3694 # Create the contingency table. 

3695 table = np.zeros((2, len(data)), dtype=np.int64) 

3696 for k, sample in enumerate(data): 

3697 sample = sample[~np.isnan(sample)] 

3698 

3699 nabove = count_nonzero(sample > grand_median) 

3700 nbelow = count_nonzero(sample < grand_median) 

3701 nequal = sample.size - (nabove + nbelow) 

3702 table[0, k] += nabove 

3703 table[1, k] += nbelow 

3704 if ties == "below": 

3705 table[1, k] += nequal 

3706 elif ties == "above": 

3707 table[0, k] += nequal 

3708 

3709 # Check that no row or column of the table is all zero. 

3710 # Such a table can not be given to chi2_contingency, because it would have 

3711 # a zero in the table of expected frequencies. 

3712 rowsums = table.sum(axis=1) 

3713 if rowsums[0] == 0: 

3714 raise ValueError("All values are below the grand median (%r)." % 

3715 grand_median) 

3716 if rowsums[1] == 0: 

3717 raise ValueError("All values are above the grand median (%r)." % 

3718 grand_median) 

3719 if ties == "ignore": 

3720 # We already checked that each sample has at least one value, but it 

3721 # is possible that all those values equal the grand median. If `ties` 

3722 # is "ignore", that would result in a column of zeros in `table`. We 

3723 # check for that case here. 

3724 zero_cols = np.nonzero((table == 0).all(axis=0))[0] 

3725 if len(zero_cols) > 0: 

3726 msg = ("All values in sample %d are equal to the grand " 

3727 "median (%r), so they are ignored, resulting in an " 

3728 "empty sample." % (zero_cols[0] + 1, grand_median)) 

3729 raise ValueError(msg) 

3730 

3731 stat, p, dof, expected = chi2_contingency(table, lambda_=lambda_, 

3732 correction=correction) 

3733 return MedianTestResult(stat, p, grand_median, table) 

3734 

3735 

3736def _circfuncs_common(samples, high, low, nan_policy='propagate'): 

3737 # Ensure samples are array-like and size is not zero 

3738 samples = np.asarray(samples) 

3739 if samples.size == 0: 

3740 return np.nan, np.asarray(np.nan), np.asarray(np.nan), None 

3741 

3742 # Recast samples as radians that range between 0 and 2 pi and calculate 

3743 # the sine and cosine 

3744 sin_samp = sin((samples - low)*2.*pi / (high - low)) 

3745 cos_samp = cos((samples - low)*2.*pi / (high - low)) 

3746 

3747 # Apply the NaN policy 

3748 contains_nan, nan_policy = _contains_nan(samples, nan_policy) 

3749 if contains_nan and nan_policy == 'omit': 

3750 mask = np.isnan(samples) 

3751 # Set the sines and cosines that are NaN to zero 

3752 sin_samp[mask] = 0.0 

3753 cos_samp[mask] = 0.0 

3754 else: 

3755 mask = None 

3756 

3757 return samples, sin_samp, cos_samp, mask 

3758 

3759 

3760def circmean(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'): 

3761 """Compute the circular mean for samples in a range. 

3762 

3763 Parameters 

3764 ---------- 

3765 samples : array_like 

3766 Input array. 

3767 high : float or int, optional 

3768 High boundary for the sample range. Default is ``2*pi``. 

3769 low : float or int, optional 

3770 Low boundary for the sample range. Default is 0. 

3771 axis : int, optional 

3772 Axis along which means are computed. The default is to compute 

3773 the mean of the flattened array. 

3774 nan_policy : {'propagate', 'raise', 'omit'}, optional 

3775 Defines how to handle when input contains nan. 'propagate' returns nan, 

3776 'raise' throws an error, 'omit' performs the calculations ignoring nan 

3777 values. Default is 'propagate'. 

3778 

3779 Returns 

3780 ------- 

3781 circmean : float 

3782 Circular mean. 

3783 

3784 See Also 

3785 -------- 

3786 circstd : Circular standard deviation. 

3787 circvar : Circular variance. 

3788 

3789 Examples 

3790 -------- 

3791 For simplicity, all angles are printed out in degrees. 

3792 

3793 >>> import numpy as np 

3794 >>> from scipy.stats import circmean 

3795 >>> import matplotlib.pyplot as plt 

3796 >>> angles = np.deg2rad(np.array([20, 30, 330])) 

3797 >>> circmean = circmean(angles) 

3798 >>> np.rad2deg(circmean) 

3799 7.294976657784009 

3800 

3801 >>> mean = angles.mean() 

3802 >>> np.rad2deg(mean) 

3803 126.66666666666666 

3804 

3805 Plot and compare the circular mean against the arithmetic mean. 

3806 

3807 >>> plt.plot(np.cos(np.linspace(0, 2*np.pi, 500)), 

3808 ... np.sin(np.linspace(0, 2*np.pi, 500)), 

3809 ... c='k') 

3810 >>> plt.scatter(np.cos(angles), np.sin(angles), c='k') 

3811 >>> plt.scatter(np.cos(circmean), np.sin(circmean), c='b', 

3812 ... label='circmean') 

3813 >>> plt.scatter(np.cos(mean), np.sin(mean), c='r', label='mean') 

3814 >>> plt.legend() 

3815 >>> plt.axis('equal') 

3816 >>> plt.show() 

3817 

3818 """ 

3819 samples, sin_samp, cos_samp, nmask = _circfuncs_common(samples, high, low, 

3820 nan_policy=nan_policy) 

3821 sin_sum = sin_samp.sum(axis=axis) 

3822 cos_sum = cos_samp.sum(axis=axis) 

3823 res = arctan2(sin_sum, cos_sum) 

3824 

3825 mask_nan = ~np.isnan(res) 

3826 if mask_nan.ndim > 0: 

3827 mask = res[mask_nan] < 0 

3828 else: 

3829 mask = res < 0 

3830 

3831 if mask.ndim > 0: 

3832 mask_nan[mask_nan] = mask 

3833 res[mask_nan] += 2*pi 

3834 elif mask: 

3835 res += 2*pi 

3836 

3837 # Set output to NaN if no samples went into the mean 

3838 if nmask is not None: 

3839 if nmask.all(): 

3840 res = np.full(shape=res.shape, fill_value=np.nan) 

3841 else: 

3842 # Find out if any of the axis that are being averaged consist 

3843 # entirely of NaN. If one exists, set the result (res) to NaN 

3844 nshape = 0 if axis is None else axis 

3845 smask = nmask.shape[nshape] == nmask.sum(axis=axis) 

3846 if smask.any(): 

3847 res[smask] = np.nan 

3848 

3849 return res*(high - low)/2.0/pi + low 

3850 

3851 

3852def circvar(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'): 

3853 """Compute the circular variance for samples assumed to be in a range. 

3854 

3855 Parameters 

3856 ---------- 

3857 samples : array_like 

3858 Input array. 

3859 high : float or int, optional 

3860 High boundary for the sample range. Default is ``2*pi``. 

3861 low : float or int, optional 

3862 Low boundary for the sample range. Default is 0. 

3863 axis : int, optional 

3864 Axis along which variances are computed. The default is to compute 

3865 the variance of the flattened array. 

3866 nan_policy : {'propagate', 'raise', 'omit'}, optional 

3867 Defines how to handle when input contains nan. 'propagate' returns nan, 

3868 'raise' throws an error, 'omit' performs the calculations ignoring nan 

3869 values. Default is 'propagate'. 

3870 

3871 Returns 

3872 ------- 

3873 circvar : float 

3874 Circular variance. 

3875 

3876 See Also 

3877 -------- 

3878 circmean : Circular mean. 

3879 circstd : Circular standard deviation. 

3880 

3881 Notes 

3882 ----- 

3883 This uses the following definition of circular variance: ``1-R``, where 

3884 ``R`` is the mean resultant vector. The 

3885 returned value is in the range [0, 1], 0 standing for no variance, and 1 

3886 for a large variance. In the limit of small angles, this value is similar 

3887 to half the 'linear' variance. 

3888 

3889 References 

3890 ---------- 

3891 .. [1] Fisher, N.I. *Statistical analysis of circular data*. Cambridge 

3892 University Press, 1993. 

3893 

3894 Examples 

3895 -------- 

3896 >>> import numpy as np 

3897 >>> from scipy.stats import circvar 

3898 >>> import matplotlib.pyplot as plt 

3899 >>> samples_1 = np.array([0.072, -0.158, 0.077, 0.108, 0.286, 

3900 ... 0.133, -0.473, -0.001, -0.348, 0.131]) 

3901 >>> samples_2 = np.array([0.111, -0.879, 0.078, 0.733, 0.421, 

3902 ... 0.104, -0.136, -0.867, 0.012, 0.105]) 

3903 >>> circvar_1 = circvar(samples_1) 

3904 >>> circvar_2 = circvar(samples_2) 

3905 

3906 Plot the samples. 

3907 

3908 >>> fig, (left, right) = plt.subplots(ncols=2) 

3909 >>> for image in (left, right): 

3910 ... image.plot(np.cos(np.linspace(0, 2*np.pi, 500)), 

3911 ... np.sin(np.linspace(0, 2*np.pi, 500)), 

3912 ... c='k') 

3913 ... image.axis('equal') 

3914 ... image.axis('off') 

3915 >>> left.scatter(np.cos(samples_1), np.sin(samples_1), c='k', s=15) 

3916 >>> left.set_title(f"circular variance: {np.round(circvar_1, 2)!r}") 

3917 >>> right.scatter(np.cos(samples_2), np.sin(samples_2), c='k', s=15) 

3918 >>> right.set_title(f"circular variance: {np.round(circvar_2, 2)!r}") 

3919 >>> plt.show() 

3920 

3921 """ 

3922 samples, sin_samp, cos_samp, mask = _circfuncs_common(samples, high, low, 

3923 nan_policy=nan_policy) 

3924 if mask is None: 

3925 sin_mean = sin_samp.mean(axis=axis) 

3926 cos_mean = cos_samp.mean(axis=axis) 

3927 else: 

3928 nsum = np.asarray(np.sum(~mask, axis=axis).astype(float)) 

3929 nsum[nsum == 0] = np.nan 

3930 sin_mean = sin_samp.sum(axis=axis) / nsum 

3931 cos_mean = cos_samp.sum(axis=axis) / nsum 

3932 # hypot can go slightly above 1 due to rounding errors 

3933 with np.errstate(invalid='ignore'): 

3934 R = np.minimum(1, hypot(sin_mean, cos_mean)) 

3935 

3936 res = 1. - R 

3937 return res 

3938 

3939 

3940def circstd(samples, high=2*pi, low=0, axis=None, nan_policy='propagate', *, 

3941 normalize=False): 

3942 """ 

3943 Compute the circular standard deviation for samples assumed to be in the 

3944 range [low to high]. 

3945 

3946 Parameters 

3947 ---------- 

3948 samples : array_like 

3949 Input array. 

3950 high : float or int, optional 

3951 High boundary for the sample range. Default is ``2*pi``. 

3952 low : float or int, optional 

3953 Low boundary for the sample range. Default is 0. 

3954 axis : int, optional 

3955 Axis along which standard deviations are computed. The default is 

3956 to compute the standard deviation of the flattened array. 

3957 nan_policy : {'propagate', 'raise', 'omit'}, optional 

3958 Defines how to handle when input contains nan. 'propagate' returns nan, 

3959 'raise' throws an error, 'omit' performs the calculations ignoring nan 

3960 values. Default is 'propagate'. 

3961 normalize : boolean, optional 

3962 If True, the returned value is equal to ``sqrt(-2*log(R))`` and does 

3963 not depend on the variable units. If False (default), the returned 

3964 value is scaled by ``((high-low)/(2*pi))``. 

3965 

3966 Returns 

3967 ------- 

3968 circstd : float 

3969 Circular standard deviation. 

3970 

3971 See Also 

3972 -------- 

3973 circmean : Circular mean. 

3974 circvar : Circular variance. 

3975 

3976 Notes 

3977 ----- 

3978 This uses a definition of circular standard deviation from [1]_. 

3979 Essentially, the calculation is as follows. 

3980 

3981 .. code-block:: python 

3982 

3983 import numpy as np 

3984 C = np.cos(samples).mean() 

3985 S = np.sin(samples).mean() 

3986 R = np.sqrt(C**2 + S**2) 

3987 l = 2*np.pi / (high-low) 

3988 circstd = np.sqrt(-2*np.log(R)) / l 

3989 

3990 In the limit of small angles, it returns a number close to the 'linear' 

3991 standard deviation. 

3992 

3993 References 

3994 ---------- 

3995 .. [1] Mardia, K. V. (1972). 2. In *Statistics of Directional Data* 

3996 (pp. 18-24). Academic Press. :doi:`10.1016/C2013-0-07425-7`. 

3997 

3998 Examples 

3999 -------- 

4000 >>> import numpy as np 

4001 >>> from scipy.stats import circstd 

4002 >>> import matplotlib.pyplot as plt 

4003 >>> samples_1 = np.array([0.072, -0.158, 0.077, 0.108, 0.286, 

4004 ... 0.133, -0.473, -0.001, -0.348, 0.131]) 

4005 >>> samples_2 = np.array([0.111, -0.879, 0.078, 0.733, 0.421, 

4006 ... 0.104, -0.136, -0.867, 0.012, 0.105]) 

4007 >>> circstd_1 = circstd(samples_1) 

4008 >>> circstd_2 = circstd(samples_2) 

4009 

4010 Plot the samples. 

4011 

4012 >>> fig, (left, right) = plt.subplots(ncols=2) 

4013 >>> for image in (left, right): 

4014 ... image.plot(np.cos(np.linspace(0, 2*np.pi, 500)), 

4015 ... np.sin(np.linspace(0, 2*np.pi, 500)), 

4016 ... c='k') 

4017 ... image.axis('equal') 

4018 ... image.axis('off') 

4019 >>> left.scatter(np.cos(samples_1), np.sin(samples_1), c='k', s=15) 

4020 >>> left.set_title(f"circular std: {np.round(circstd_1, 2)!r}") 

4021 >>> right.plot(np.cos(np.linspace(0, 2*np.pi, 500)), 

4022 ... np.sin(np.linspace(0, 2*np.pi, 500)), 

4023 ... c='k') 

4024 >>> right.scatter(np.cos(samples_2), np.sin(samples_2), c='k', s=15) 

4025 >>> right.set_title(f"circular std: {np.round(circstd_2, 2)!r}") 

4026 >>> plt.show() 

4027 

4028 """ 

4029 samples, sin_samp, cos_samp, mask = _circfuncs_common(samples, high, low, 

4030 nan_policy=nan_policy) 

4031 if mask is None: 

4032 sin_mean = sin_samp.mean(axis=axis) # [1] (2.2.3) 

4033 cos_mean = cos_samp.mean(axis=axis) # [1] (2.2.3) 

4034 else: 

4035 nsum = np.asarray(np.sum(~mask, axis=axis).astype(float)) 

4036 nsum[nsum == 0] = np.nan 

4037 sin_mean = sin_samp.sum(axis=axis) / nsum 

4038 cos_mean = cos_samp.sum(axis=axis) / nsum 

4039 # hypot can go slightly above 1 due to rounding errors 

4040 with np.errstate(invalid='ignore'): 

4041 R = np.minimum(1, hypot(sin_mean, cos_mean)) # [1] (2.2.4) 

4042 

4043 res = sqrt(-2*log(R)) 

4044 if not normalize: 

4045 res *= (high-low)/(2.*pi) # [1] (2.3.14) w/ (2.3.7) 

4046 return res 

4047 

4048 

4049class DirectionalStats: 

4050 def __init__(self, mean_direction, mean_resultant_length): 

4051 self.mean_direction = mean_direction 

4052 self.mean_resultant_length = mean_resultant_length 

4053 

4054 def __repr__(self): 

4055 return (f"DirectionalStats(mean_direction={self.mean_direction}," 

4056 f" mean_resultant_length={self.mean_resultant_length})") 

4057 

4058 

4059def directional_stats(samples, *, axis=0, normalize=True): 

4060 """ 

4061 Computes sample statistics for directional data. 

4062 

4063 Computes the directional mean (also called the mean direction vector) and 

4064 mean resultant length of a sample of vectors. 

4065 

4066 The directional mean is a measure of "preferred direction" of vector data. 

4067 It is analogous to the sample mean, but it is for use when the length of 

4068 the data is irrelevant (e.g. unit vectors). 

4069 

4070 The mean resultant length is a value between 0 and 1 used to quantify the 

4071 dispersion of directional data: the smaller the mean resultant length, the 

4072 greater the dispersion. Several definitions of directional variance 

4073 involving the mean resultant length are given in [1]_ and [2]_. 

4074 

4075 Parameters 

4076 ---------- 

4077 samples : array_like 

4078 Input array. Must be at least two-dimensional, and the last axis of the 

4079 input must correspond with the dimensionality of the vector space. 

4080 When the input is exactly two dimensional, this means that each row 

4081 of the data is a vector observation. 

4082 axis : int, default: 0 

4083 Axis along which the directional mean is computed. 

4084 normalize: boolean, default: True 

4085 If True, normalize the input to ensure that each observation is a 

4086 unit vector. It the observations are already unit vectors, consider 

4087 setting this to False to avoid unnecessary computation. 

4088 

4089 Returns 

4090 ------- 

4091 res : DirectionalStats 

4092 An object containing attributes: 

4093 

4094 mean_direction : ndarray 

4095 Directional mean. 

4096 mean_resultant_length : ndarray 

4097 The mean resultant length [1]_. 

4098 

4099 See also 

4100 -------- 

4101 circmean: circular mean; i.e. directional mean for 2D *angles* 

4102 circvar: circular variance; i.e. directional variance for 2D *angles* 

4103 

4104 Notes 

4105 ----- 

4106 This uses a definition of directional mean from [1]_. 

4107 Assuming the observations are unit vectors, the calculation is as follows. 

4108 

4109 .. code-block:: python 

4110 

4111 mean = samples.mean(axis=0) 

4112 mean_resultant_length = np.linalg.norm(mean) 

4113 mean_direction = mean / mean_resultant_length 

4114 

4115 This definition is appropriate for *directional* data (i.e. vector data 

4116 for which the magnitude of each observation is irrelevant) but not 

4117 for *axial* data (i.e. vector data for which the magnitude and *sign* of 

4118 each observation is irrelevant). 

4119 

4120 Several definitions of directional variance involving the mean resultant 

4121 length ``R`` have been proposed, including ``1 - R`` [1]_, ``1 - R**2`` 

4122 [2]_, and ``2 * (1 - R)`` [2]_. Rather than choosing one, this function 

4123 returns ``R`` as attribute `mean_resultant_length` so the user can compute 

4124 their preferred measure of dispersion. 

4125 

4126 References 

4127 ---------- 

4128 .. [1] Mardia, Jupp. (2000). *Directional Statistics* 

4129 (p. 163). Wiley. 

4130 

4131 .. [2] https://en.wikipedia.org/wiki/Directional_statistics 

4132 

4133 Examples 

4134 -------- 

4135 >>> import numpy as np 

4136 >>> from scipy.stats import directional_stats 

4137 >>> data = np.array([[3, 4], # first observation, 2D vector space 

4138 ... [6, -8]]) # second observation 

4139 >>> dirstats = directional_stats(data) 

4140 >>> dirstats.mean_direction 

4141 array([1., 0.]) 

4142 

4143 In contrast, the regular sample mean of the vectors would be influenced 

4144 by the magnitude of each observation. Furthermore, the result would not be 

4145 a unit vector. 

4146 

4147 >>> data.mean(axis=0) 

4148 array([4.5, -2.]) 

4149 

4150 An exemplary use case for `directional_stats` is to find a *meaningful* 

4151 center for a set of observations on a sphere, e.g. geographical locations. 

4152 

4153 >>> data = np.array([[0.8660254, 0.5, 0.], 

4154 ... [0.8660254, -0.5, 0.]]) 

4155 >>> dirstats = directional_stats(data) 

4156 >>> dirstats.mean_direction 

4157 array([1., 0., 0.]) 

4158 

4159 The regular sample mean on the other hand yields a result which does not 

4160 lie on the surface of the sphere. 

4161 

4162 >>> data.mean(axis=0) 

4163 array([0.8660254, 0., 0.]) 

4164 

4165 The function also returns the mean resultant length, which 

4166 can be used to calculate a directional variance. For example, using the 

4167 definition ``Var(z) = 1 - R`` from [2]_ where ``R`` is the 

4168 mean resultant length, we can calculate the directional variance of the 

4169 vectors in the above example as: 

4170 

4171 >>> 1 - dirstats.mean_resultant_length 

4172 0.13397459716167093 

4173 """ 

4174 samples = np.asarray(samples) 

4175 if samples.ndim < 2: 

4176 raise ValueError("samples must at least be two-dimensional. " 

4177 f"Instead samples has shape: {samples.shape!r}") 

4178 samples = np.moveaxis(samples, axis, 0) 

4179 if normalize: 

4180 vectornorms = np.linalg.norm(samples, axis=-1, keepdims=True) 

4181 samples = samples/vectornorms 

4182 mean = np.mean(samples, axis=0) 

4183 mean_resultant_length = np.linalg.norm(mean, axis=-1, keepdims=True) 

4184 mean_direction = mean / mean_resultant_length 

4185 return DirectionalStats(mean_direction, 

4186 mean_resultant_length.squeeze(-1)[()])