Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_hypotests.py: 10%

467 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1from collections import namedtuple 

2from dataclasses import make_dataclass 

3from math import comb 

4import numpy as np 

5import warnings 

6from itertools import combinations 

7import scipy.stats 

8from scipy.optimize import shgo 

9from . import distributions 

10from ._common import ConfidenceInterval 

11from ._continuous_distns import chi2, norm 

12from scipy.special import gamma, kv, gammaln 

13from scipy.fft import ifft 

14from ._stats_pythran import _a_ij_Aij_Dij2 

15from ._stats_pythran import ( 

16 _concordant_pairs as _P, _discordant_pairs as _Q 

17) 

18from scipy.stats import _stats_py 

19 

20__all__ = ['epps_singleton_2samp', 'cramervonmises', 'somersd', 

21 'barnard_exact', 'boschloo_exact', 'cramervonmises_2samp', 

22 'tukey_hsd', 'poisson_means_test'] 

23 

24Epps_Singleton_2sampResult = namedtuple('Epps_Singleton_2sampResult', 

25 ('statistic', 'pvalue')) 

26 

27 

28def epps_singleton_2samp(x, y, t=(0.4, 0.8)): 

29 """Compute the Epps-Singleton (ES) test statistic. 

30 

31 Test the null hypothesis that two samples have the same underlying 

32 probability distribution. 

33 

34 Parameters 

35 ---------- 

36 x, y : array-like 

37 The two samples of observations to be tested. Input must not have more 

38 than one dimension. Samples can have different lengths. 

39 t : array-like, optional 

40 The points (t1, ..., tn) where the empirical characteristic function is 

41 to be evaluated. It should be positive distinct numbers. The default 

42 value (0.4, 0.8) is proposed in [1]_. Input must not have more than 

43 one dimension. 

44 

45 Returns 

46 ------- 

47 statistic : float 

48 The test statistic. 

49 pvalue : float 

50 The associated p-value based on the asymptotic chi2-distribution. 

51 

52 See Also 

53 -------- 

54 ks_2samp, anderson_ksamp 

55 

56 Notes 

57 ----- 

58 Testing whether two samples are generated by the same underlying 

59 distribution is a classical question in statistics. A widely used test is 

60 the Kolmogorov-Smirnov (KS) test which relies on the empirical 

61 distribution function. Epps and Singleton introduce a test based on the 

62 empirical characteristic function in [1]_. 

63 

64 One advantage of the ES test compared to the KS test is that is does 

65 not assume a continuous distribution. In [1]_, the authors conclude 

66 that the test also has a higher power than the KS test in many 

67 examples. They recommend the use of the ES test for discrete samples as 

68 well as continuous samples with at least 25 observations each, whereas 

69 `anderson_ksamp` is recommended for smaller sample sizes in the 

70 continuous case. 

71 

72 The p-value is computed from the asymptotic distribution of the test 

73 statistic which follows a `chi2` distribution. If the sample size of both 

74 `x` and `y` is below 25, the small sample correction proposed in [1]_ is 

75 applied to the test statistic. 

76 

77 The default values of `t` are determined in [1]_ by considering 

78 various distributions and finding good values that lead to a high power 

79 of the test in general. Table III in [1]_ gives the optimal values for 

80 the distributions tested in that study. The values of `t` are scaled by 

81 the semi-interquartile range in the implementation, see [1]_. 

82 

83 References 

84 ---------- 

85 .. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample 

86 problem using the empirical characteristic function", Journal of 

87 Statistical Computation and Simulation 26, p. 177--203, 1986. 

88 

89 .. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions 

90 - the Epps-Singleton two-sample test using the empirical characteristic 

91 function", The Stata Journal 9(3), p. 454--465, 2009. 

92 

93 """ 

94 x, y, t = np.asarray(x), np.asarray(y), np.asarray(t) 

95 # check if x and y are valid inputs 

96 if x.ndim > 1: 

97 raise ValueError('x must be 1d, but x.ndim equals {}.'.format(x.ndim)) 

98 if y.ndim > 1: 

99 raise ValueError('y must be 1d, but y.ndim equals {}.'.format(y.ndim)) 

100 nx, ny = len(x), len(y) 

101 if (nx < 5) or (ny < 5): 

102 raise ValueError('x and y should have at least 5 elements, but len(x) ' 

103 '= {} and len(y) = {}.'.format(nx, ny)) 

104 if not np.isfinite(x).all(): 

105 raise ValueError('x must not contain nonfinite values.') 

106 if not np.isfinite(y).all(): 

107 raise ValueError('y must not contain nonfinite values.') 

108 n = nx + ny 

109 

110 # check if t is valid 

111 if t.ndim > 1: 

112 raise ValueError('t must be 1d, but t.ndim equals {}.'.format(t.ndim)) 

113 if np.less_equal(t, 0).any(): 

114 raise ValueError('t must contain positive elements only.') 

115 

116 # rescale t with semi-iqr as proposed in [1]; import iqr here to avoid 

117 # circular import 

118 from scipy.stats import iqr 

119 sigma = iqr(np.hstack((x, y))) / 2 

120 ts = np.reshape(t, (-1, 1)) / sigma 

121 

122 # covariance estimation of ES test 

123 gx = np.vstack((np.cos(ts*x), np.sin(ts*x))).T # shape = (nx, 2*len(t)) 

124 gy = np.vstack((np.cos(ts*y), np.sin(ts*y))).T 

125 cov_x = np.cov(gx.T, bias=True) # the test uses biased cov-estimate 

126 cov_y = np.cov(gy.T, bias=True) 

127 est_cov = (n/nx)*cov_x + (n/ny)*cov_y 

128 est_cov_inv = np.linalg.pinv(est_cov) 

129 r = np.linalg.matrix_rank(est_cov_inv) 

130 if r < 2*len(t): 

131 warnings.warn('Estimated covariance matrix does not have full rank. ' 

132 'This indicates a bad choice of the input t and the ' 

133 'test might not be consistent.') # see p. 183 in [1]_ 

134 

135 # compute test statistic w distributed asympt. as chisquare with df=r 

136 g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0) 

137 w = n*np.dot(g_diff.T, np.dot(est_cov_inv, g_diff)) 

138 

139 # apply small-sample correction 

140 if (max(nx, ny) < 25): 

141 corr = 1.0/(1.0 + n**(-0.45) + 10.1*(nx**(-1.7) + ny**(-1.7))) 

142 w = corr * w 

143 

144 p = chi2.sf(w, r) 

145 

146 return Epps_Singleton_2sampResult(w, p) 

147 

148 

149def poisson_means_test(k1, n1, k2, n2, *, diff=0, alternative='two-sided'): 

150 r""" 

151 Performs the Poisson means test, AKA the "E-test". 

152 

153 This is a test of the null hypothesis that the difference between means of 

154 two Poisson distributions is `diff`. The samples are provided as the 

155 number of events `k1` and `k2` observed within measurement intervals 

156 (e.g. of time, space, number of observations) of sizes `n1` and `n2`. 

157 

158 Parameters 

159 ---------- 

160 k1 : int 

161 Number of events observed from distribution 1. 

162 n1: float 

163 Size of sample from distribution 1. 

164 k2 : int 

165 Number of events observed from distribution 2. 

166 n2 : float 

167 Size of sample from distribution 2. 

168 diff : float, default=0 

169 The hypothesized difference in means between the distributions 

170 underlying the samples. 

171 alternative : {'two-sided', 'less', 'greater'}, optional 

172 Defines the alternative hypothesis. 

173 The following options are available (default is 'two-sided'): 

174 

175 * 'two-sided': the difference between distribution means is not 

176 equal to `diff` 

177 * 'less': the difference between distribution means is less than 

178 `diff` 

179 * 'greater': the difference between distribution means is greater 

180 than `diff` 

181 

182 Returns 

183 ------- 

184 statistic : float 

185 The test statistic (see [1]_ equation 3.3). 

186 pvalue : float 

187 The probability of achieving such an extreme value of the test 

188 statistic under the null hypothesis. 

189 

190 Notes 

191 ----- 

192 

193 Let: 

194 

195 .. math:: X_1 \sim \mbox{Poisson}(\mathtt{n1}\lambda_1) 

196 

197 be a random variable independent of 

198 

199 .. math:: X_2 \sim \mbox{Poisson}(\mathtt{n2}\lambda_2) 

200 

201 and let ``k1`` and ``k2`` be the observed values of :math:`X_1` 

202 and :math:`X_2`, respectively. Then `poisson_means_test` uses the number 

203 of observed events ``k1`` and ``k2`` from samples of size ``n1`` and 

204 ``n2``, respectively, to test the null hypothesis that 

205 

206 .. math:: 

207 H_0: \lambda_1 - \lambda_2 = \mathtt{diff} 

208 

209 A benefit of the E-test is that it has good power for small sample sizes, 

210 which can reduce sampling costs [1]_. It has been evaluated and determined 

211 to be more powerful than the comparable C-test, sometimes referred to as 

212 the Poisson exact test. 

213 

214 References 

215 ---------- 

216 .. [1] Krishnamoorthy, K., & Thomson, J. (2004). A more powerful test for 

217 comparing two Poisson means. Journal of Statistical Planning and 

218 Inference, 119(1), 23-35. 

219 

220 .. [2] Przyborowski, J., & Wilenski, H. (1940). Homogeneity of results in 

221 testing samples from Poisson series: With an application to testing 

222 clover seed for dodder. Biometrika, 31(3/4), 313-323. 

223 

224 Examples 

225 -------- 

226 

227 Suppose that a gardener wishes to test the number of dodder (weed) seeds 

228 in a sack of clover seeds that they buy from a seed company. It has 

229 previously been established that the number of dodder seeds in clover 

230 follows the Poisson distribution. 

231 

232 A 100 gram sample is drawn from the sack before being shipped to the 

233 gardener. The sample is analyzed, and it is found to contain no dodder 

234 seeds; that is, `k1` is 0. However, upon arrival, the gardener draws 

235 another 100 gram sample from the sack. This time, three dodder seeds are 

236 found in the sample; that is, `k2` is 3. The gardener would like to 

237 know if the difference is significant and not due to chance. The 

238 null hypothesis is that the difference between the two samples is merely 

239 due to chance, or that :math:`\lambda_1 - \lambda_2 = \mathtt{diff}` 

240 where :math:`\mathtt{diff} = 0`. The alternative hypothesis is that the 

241 difference is not due to chance, or :math:`\lambda_1 - \lambda_2 \ne 0`. 

242 The gardener selects a significance level of 5% to reject the null 

243 hypothesis in favor of the alternative [2]_. 

244 

245 >>> import scipy.stats as stats 

246 >>> res = stats.poisson_means_test(0, 100, 3, 100) 

247 >>> res.statistic, res.pvalue 

248 (-1.7320508075688772, 0.08837900929018157) 

249 

250 The p-value is .088, indicating a near 9% chance of observing a value of 

251 the test statistic under the null hypothesis. This exceeds 5%, so the 

252 gardener does not reject the null hypothesis as the difference cannot be 

253 regarded as significant at this level. 

254 """ 

255 

256 _poisson_means_test_iv(k1, n1, k2, n2, diff, alternative) 

257 

258 # "for a given k_1 and k_2, an estimate of \lambda_2 is given by" [1] (3.4) 

259 lmbd_hat2 = ((k1 + k2) / (n1 + n2) - diff * n1 / (n1 + n2)) 

260 

261 # "\hat{\lambda_{2k}} may be less than or equal to zero ... and in this 

262 # case the null hypothesis cannot be rejected ... [and] it is not necessary 

263 # to compute the p-value". [1] page 26 below eq. (3.6). 

264 if lmbd_hat2 <= 0: 

265 return _stats_py.SignificanceResult(0, 1) 

266 

267 # The unbiased variance estimate [1] (3.2) 

268 var = k1 / (n1 ** 2) + k2 / (n2 ** 2) 

269 

270 # The _observed_ pivot statistic from the input. It follows the 

271 # unnumbered equation following equation (3.3) This is used later in 

272 # comparison with the computed pivot statistics in an indicator function. 

273 t_k1k2 = (k1 / n1 - k2 / n2 - diff) / np.sqrt(var) 

274 

275 # Equation (3.5) of [1] is lengthy, so it is broken into several parts, 

276 # beginning here. Note that the probability mass function of poisson is 

277 # exp^(-\mu)*\mu^k/k!, so and this is called with shape \mu, here noted 

278 # here as nlmbd_hat*. The strategy for evaluating the double summation in 

279 # (3.5) is to create two arrays of the values of the two products inside 

280 # the summation and then broadcast them together into a matrix, and then 

281 # sum across the entire matrix. 

282 

283 # Compute constants (as seen in the first and second separated products in 

284 # (3.5).). (This is the shape (\mu) parameter of the poisson distribution.) 

285 nlmbd_hat1 = n1 * (lmbd_hat2 + diff) 

286 nlmbd_hat2 = n2 * lmbd_hat2 

287 

288 # Determine summation bounds for tail ends of distribution rather than 

289 # summing to infinity. `x1*` is for the outer sum and `x2*` is the inner 

290 # sum. 

291 x1_lb, x1_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat1) 

292 x2_lb, x2_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat2) 

293 

294 # Construct arrays to function as the x_1 and x_2 counters on the summation 

295 # in (3.5). `x1` is in columns and `x2` is in rows to allow for 

296 # broadcasting. 

297 x1 = np.arange(x1_lb, x1_ub + 1) 

298 x2 = np.arange(x2_lb, x2_ub + 1)[:, None] 

299 

300 # These are the two products in equation (3.5) with `prob_x1` being the 

301 # first (left side) and `prob_x2` being the second (right side). (To 

302 # make as clear as possible: the 1st contains a "+ d" term, the 2nd does 

303 # not.) 

304 prob_x1 = distributions.poisson.pmf(x1, nlmbd_hat1) 

305 prob_x2 = distributions.poisson.pmf(x2, nlmbd_hat2) 

306 

307 # compute constants for use in the "pivot statistic" per the 

308 # unnumbered equation following (3.3). 

309 lmbd_x1 = x1 / n1 

310 lmbd_x2 = x2 / n2 

311 lmbds_diff = lmbd_x1 - lmbd_x2 - diff 

312 var_x1x2 = lmbd_x1 / n1 + lmbd_x2 / n2 

313 

314 # This is the 'pivot statistic' for use in the indicator of the summation 

315 # (left side of "I[.]"). 

316 with np.errstate(invalid='ignore', divide='ignore'): 

317 t_x1x2 = lmbds_diff / np.sqrt(var_x1x2) 

318 

319 # `[indicator]` implements the "I[.] ... the indicator function" per 

320 # the paragraph following equation (3.5). 

321 if alternative == 'two-sided': 

322 indicator = np.abs(t_x1x2) >= np.abs(t_k1k2) 

323 elif alternative == 'less': 

324 indicator = t_x1x2 <= t_k1k2 

325 else: 

326 indicator = t_x1x2 >= t_k1k2 

327 

328 # Multiply all combinations of the products together, exclude terms 

329 # based on the `indicator` and then sum. (3.5) 

330 pvalue = np.sum((prob_x1 * prob_x2)[indicator]) 

331 return _stats_py.SignificanceResult(t_k1k2, pvalue) 

332 

333 

334def _poisson_means_test_iv(k1, n1, k2, n2, diff, alternative): 

335 # """check for valid types and values of input to `poisson_mean_test`.""" 

336 if k1 != int(k1) or k2 != int(k2): 

337 raise TypeError('`k1` and `k2` must be integers.') 

338 

339 count_err = '`k1` and `k2` must be greater than or equal to 0.' 

340 if k1 < 0 or k2 < 0: 

341 raise ValueError(count_err) 

342 

343 if n1 <= 0 or n2 <= 0: 

344 raise ValueError('`n1` and `n2` must be greater than 0.') 

345 

346 if diff < 0: 

347 raise ValueError('diff must be greater than or equal to 0.') 

348 

349 alternatives = {'two-sided', 'less', 'greater'} 

350 if alternative.lower() not in alternatives: 

351 raise ValueError(f"Alternative must be one of '{alternatives}'.") 

352 

353 

354class CramerVonMisesResult: 

355 def __init__(self, statistic, pvalue): 

356 self.statistic = statistic 

357 self.pvalue = pvalue 

358 

359 def __repr__(self): 

360 return (f"{self.__class__.__name__}(statistic={self.statistic}, " 

361 f"pvalue={self.pvalue})") 

362 

363 

364def _psi1_mod(x): 

365 """ 

366 psi1 is defined in equation 1.10 in Csörgő, S. and Faraway, J. (1996). 

367 This implements a modified version by excluding the term V(x) / 12 

368 (here: _cdf_cvm_inf(x) / 12) to avoid evaluating _cdf_cvm_inf(x) 

369 twice in _cdf_cvm. 

370 

371 Implementation based on MAPLE code of Julian Faraway and R code of the 

372 function pCvM in the package goftest (v1.1.1), permission granted 

373 by Adrian Baddeley. Main difference in the implementation: the code 

374 here keeps adding terms of the series until the terms are small enough. 

375 """ 

376 

377 def _ed2(y): 

378 z = y**2 / 4 

379 b = kv(1/4, z) + kv(3/4, z) 

380 return np.exp(-z) * (y/2)**(3/2) * b / np.sqrt(np.pi) 

381 

382 def _ed3(y): 

383 z = y**2 / 4 

384 c = np.exp(-z) / np.sqrt(np.pi) 

385 return c * (y/2)**(5/2) * (2*kv(1/4, z) + 3*kv(3/4, z) - kv(5/4, z)) 

386 

387 def _Ak(k, x): 

388 m = 2*k + 1 

389 sx = 2 * np.sqrt(x) 

390 y1 = x**(3/4) 

391 y2 = x**(5/4) 

392 

393 e1 = m * gamma(k + 1/2) * _ed2((4 * k + 3)/sx) / (9 * y1) 

394 e2 = gamma(k + 1/2) * _ed3((4 * k + 1) / sx) / (72 * y2) 

395 e3 = 2 * (m + 2) * gamma(k + 3/2) * _ed3((4 * k + 5) / sx) / (12 * y2) 

396 e4 = 7 * m * gamma(k + 1/2) * _ed2((4 * k + 1) / sx) / (144 * y1) 

397 e5 = 7 * m * gamma(k + 1/2) * _ed2((4 * k + 5) / sx) / (144 * y1) 

398 

399 return e1 + e2 + e3 + e4 + e5 

400 

401 x = np.asarray(x) 

402 tot = np.zeros_like(x, dtype='float') 

403 cond = np.ones_like(x, dtype='bool') 

404 k = 0 

405 while np.any(cond): 

406 z = -_Ak(k, x[cond]) / (np.pi * gamma(k + 1)) 

407 tot[cond] = tot[cond] + z 

408 cond[cond] = np.abs(z) >= 1e-7 

409 k += 1 

410 

411 return tot 

412 

413 

414def _cdf_cvm_inf(x): 

415 """ 

416 Calculate the cdf of the Cramér-von Mises statistic (infinite sample size). 

417 

418 See equation 1.2 in Csörgő, S. and Faraway, J. (1996). 

419 

420 Implementation based on MAPLE code of Julian Faraway and R code of the 

421 function pCvM in the package goftest (v1.1.1), permission granted 

422 by Adrian Baddeley. Main difference in the implementation: the code 

423 here keeps adding terms of the series until the terms are small enough. 

424 

425 The function is not expected to be accurate for large values of x, say 

426 x > 4, when the cdf is very close to 1. 

427 """ 

428 x = np.asarray(x) 

429 

430 def term(x, k): 

431 # this expression can be found in [2], second line of (1.3) 

432 u = np.exp(gammaln(k + 0.5) - gammaln(k+1)) / (np.pi**1.5 * np.sqrt(x)) 

433 y = 4*k + 1 

434 q = y**2 / (16*x) 

435 b = kv(0.25, q) 

436 return u * np.sqrt(y) * np.exp(-q) * b 

437 

438 tot = np.zeros_like(x, dtype='float') 

439 cond = np.ones_like(x, dtype='bool') 

440 k = 0 

441 while np.any(cond): 

442 z = term(x[cond], k) 

443 tot[cond] = tot[cond] + z 

444 cond[cond] = np.abs(z) >= 1e-7 

445 k += 1 

446 

447 return tot 

448 

449 

450def _cdf_cvm(x, n=None): 

451 """ 

452 Calculate the cdf of the Cramér-von Mises statistic for a finite sample 

453 size n. If N is None, use the asymptotic cdf (n=inf). 

454 

455 See equation 1.8 in Csörgő, S. and Faraway, J. (1996) for finite samples, 

456 1.2 for the asymptotic cdf. 

457 

458 The function is not expected to be accurate for large values of x, say 

459 x > 2, when the cdf is very close to 1 and it might return values > 1 

460 in that case, e.g. _cdf_cvm(2.0, 12) = 1.0000027556716846. Moreover, it 

461 is not accurate for small values of n, especially close to the bounds of 

462 the distribution's domain, [1/(12*n), n/3], where the value jumps to 0 

463 and 1, respectively. These are limitations of the approximation by Csörgő 

464 and Faraway (1996) implemented in this function. 

465 """ 

466 x = np.asarray(x) 

467 if n is None: 

468 y = _cdf_cvm_inf(x) 

469 else: 

470 # support of the test statistic is [12/n, n/3], see 1.1 in [2] 

471 y = np.zeros_like(x, dtype='float') 

472 sup = (1./(12*n) < x) & (x < n/3.) 

473 # note: _psi1_mod does not include the term _cdf_cvm_inf(x) / 12 

474 # therefore, we need to add it here 

475 y[sup] = _cdf_cvm_inf(x[sup]) * (1 + 1./(12*n)) + _psi1_mod(x[sup]) / n 

476 y[x >= n/3] = 1 

477 

478 if y.ndim == 0: 

479 return y[()] 

480 return y 

481 

482 

483def cramervonmises(rvs, cdf, args=()): 

484 """Perform the one-sample Cramér-von Mises test for goodness of fit. 

485 

486 This performs a test of the goodness of fit of a cumulative distribution 

487 function (cdf) :math:`F` compared to the empirical distribution function 

488 :math:`F_n` of observed random variates :math:`X_1, ..., X_n` that are 

489 assumed to be independent and identically distributed ([1]_). 

490 The null hypothesis is that the :math:`X_i` have cumulative distribution 

491 :math:`F`. 

492 

493 Parameters 

494 ---------- 

495 rvs : array_like 

496 A 1-D array of observed values of the random variables :math:`X_i`. 

497 cdf : str or callable 

498 The cumulative distribution function :math:`F` to test the 

499 observations against. If a string, it should be the name of a 

500 distribution in `scipy.stats`. If a callable, that callable is used 

501 to calculate the cdf: ``cdf(x, *args) -> float``. 

502 args : tuple, optional 

503 Distribution parameters. These are assumed to be known; see Notes. 

504 

505 Returns 

506 ------- 

507 res : object with attributes 

508 statistic : float 

509 Cramér-von Mises statistic. 

510 pvalue : float 

511 The p-value. 

512 

513 See Also 

514 -------- 

515 kstest, cramervonmises_2samp 

516 

517 Notes 

518 ----- 

519 .. versionadded:: 1.6.0 

520 

521 The p-value relies on the approximation given by equation 1.8 in [2]_. 

522 It is important to keep in mind that the p-value is only accurate if 

523 one tests a simple hypothesis, i.e. the parameters of the reference 

524 distribution are known. If the parameters are estimated from the data 

525 (composite hypothesis), the computed p-value is not reliable. 

526 

527 References 

528 ---------- 

529 .. [1] Cramér-von Mises criterion, Wikipedia, 

530 https://en.wikipedia.org/wiki/Cram%C3%A9r%E2%80%93von_Mises_criterion 

531 .. [2] Csörgő, S. and Faraway, J. (1996). The Exact and Asymptotic 

532 Distribution of Cramér-von Mises Statistics. Journal of the 

533 Royal Statistical Society, pp. 221-234. 

534 

535 Examples 

536 -------- 

537 

538 Suppose we wish to test whether data generated by ``scipy.stats.norm.rvs`` 

539 were, in fact, drawn from the standard normal distribution. We choose a 

540 significance level of alpha=0.05. 

541 

542 >>> import numpy as np 

543 >>> from scipy import stats 

544 >>> rng = np.random.default_rng() 

545 >>> x = stats.norm.rvs(size=500, random_state=rng) 

546 >>> res = stats.cramervonmises(x, 'norm') 

547 >>> res.statistic, res.pvalue 

548 (0.49121480855028343, 0.04189256516661377) 

549 

550 The p-value 0.79 exceeds our chosen significance level, so we do not 

551 reject the null hypothesis that the observed sample is drawn from the 

552 standard normal distribution. 

553 

554 Now suppose we wish to check whether the same samples shifted by 2.1 is 

555 consistent with being drawn from a normal distribution with a mean of 2. 

556 

557 >>> y = x + 2.1 

558 >>> res = stats.cramervonmises(y, 'norm', args=(2,)) 

559 >>> res.statistic, res.pvalue 

560 (0.07400330012187435, 0.7274595666160468) 

561 

562 Here we have used the `args` keyword to specify the mean (``loc``) 

563 of the normal distribution to test the data against. This is equivalent 

564 to the following, in which we create a frozen normal distribution with 

565 mean 2.1, then pass its ``cdf`` method as an argument. 

566 

567 >>> frozen_dist = stats.norm(loc=2) 

568 >>> res = stats.cramervonmises(y, frozen_dist.cdf) 

569 >>> res.statistic, res.pvalue 

570 (0.07400330012187435, 0.7274595666160468) 

571 

572 In either case, we would reject the null hypothesis that the observed 

573 sample is drawn from a normal distribution with a mean of 2 (and default 

574 variance of 1) because the p-value 0.04 is less than our chosen 

575 significance level. 

576 

577 """ 

578 if isinstance(cdf, str): 

579 cdf = getattr(distributions, cdf).cdf 

580 

581 vals = np.sort(np.asarray(rvs)) 

582 

583 if vals.size <= 1: 

584 raise ValueError('The sample must contain at least two observations.') 

585 if vals.ndim > 1: 

586 raise ValueError('The sample must be one-dimensional.') 

587 

588 n = len(vals) 

589 cdfvals = cdf(vals, *args) 

590 

591 u = (2*np.arange(1, n+1) - 1)/(2*n) 

592 w = 1/(12*n) + np.sum((u - cdfvals)**2) 

593 

594 # avoid small negative values that can occur due to the approximation 

595 p = max(0, 1. - _cdf_cvm(w, n)) 

596 

597 return CramerVonMisesResult(statistic=w, pvalue=p) 

598 

599 

600def _get_wilcoxon_distr(n): 

601 """ 

602 Distribution of probability of the Wilcoxon ranksum statistic r_plus (sum 

603 of ranks of positive differences). 

604 Returns an array with the probabilities of all the possible ranks 

605 r = 0, ..., n*(n+1)/2 

606 """ 

607 c = np.ones(1, dtype=np.double) 

608 for k in range(1, n + 1): 

609 prev_c = c 

610 c = np.zeros(k * (k + 1) // 2 + 1, dtype=np.double) 

611 m = len(prev_c) 

612 c[:m] = prev_c * 0.5 

613 c[-m:] += prev_c * 0.5 

614 return c 

615 

616 

617def _get_wilcoxon_distr2(n): 

618 """ 

619 Distribution of probability of the Wilcoxon ranksum statistic r_plus (sum 

620 of ranks of positive differences). 

621 Returns an array with the probabilities of all the possible ranks 

622 r = 0, ..., n*(n+1)/2 

623 This is a slower reference function 

624 References 

625 ---------- 

626 .. [1] 1. Harris T, Hardin JW. Exact Wilcoxon Signed-Rank and Wilcoxon 

627 Mann-Whitney Ranksum Tests. The Stata Journal. 2013;13(2):337-343. 

628 """ 

629 ai = np.arange(1, n+1)[:, None] 

630 t = n*(n+1)/2 

631 q = 2*t 

632 j = np.arange(q) 

633 theta = 2*np.pi/q*j 

634 phi_sp = np.prod(np.cos(theta*ai), axis=0) 

635 phi_s = np.exp(1j*theta*t) * phi_sp 

636 p = np.real(ifft(phi_s)) 

637 res = np.zeros(int(t)+1) 

638 res[:-1:] = p[::2] 

639 res[0] /= 2 

640 res[-1] = res[0] 

641 return res 

642 

643 

644def _tau_b(A): 

645 """Calculate Kendall's tau-b and p-value from contingency table.""" 

646 # See [2] 2.2 and 4.2 

647 

648 # contingency table must be truly 2D 

649 if A.shape[0] == 1 or A.shape[1] == 1: 

650 return np.nan, np.nan 

651 

652 NA = A.sum() 

653 PA = _P(A) 

654 QA = _Q(A) 

655 Sri2 = (A.sum(axis=1)**2).sum() 

656 Scj2 = (A.sum(axis=0)**2).sum() 

657 denominator = (NA**2 - Sri2)*(NA**2 - Scj2) 

658 

659 tau = (PA-QA)/(denominator)**0.5 

660 

661 numerator = 4*(_a_ij_Aij_Dij2(A) - (PA - QA)**2 / NA) 

662 s02_tau_b = numerator/denominator 

663 if s02_tau_b == 0: # Avoid divide by zero 

664 return tau, 0 

665 Z = tau/s02_tau_b**0.5 

666 p = 2*norm.sf(abs(Z)) # 2-sided p-value 

667 

668 return tau, p 

669 

670 

671def _somers_d(A, alternative='two-sided'): 

672 """Calculate Somers' D and p-value from contingency table.""" 

673 # See [3] page 1740 

674 

675 # contingency table must be truly 2D 

676 if A.shape[0] <= 1 or A.shape[1] <= 1: 

677 return np.nan, np.nan 

678 

679 NA = A.sum() 

680 NA2 = NA**2 

681 PA = _P(A) 

682 QA = _Q(A) 

683 Sri2 = (A.sum(axis=1)**2).sum() 

684 

685 d = (PA - QA)/(NA2 - Sri2) 

686 

687 S = _a_ij_Aij_Dij2(A) - (PA-QA)**2/NA 

688 

689 with np.errstate(divide='ignore'): 

690 Z = (PA - QA)/(4*(S))**0.5 

691 

692 _, p = scipy.stats._stats_py._normtest_finish(Z, alternative) 

693 

694 return d, p 

695 

696 

697SomersDResult = make_dataclass("SomersDResult", 

698 ("statistic", "pvalue", "table")) 

699 

700 

701def somersd(x, y=None, alternative='two-sided'): 

702 r"""Calculates Somers' D, an asymmetric measure of ordinal association. 

703 

704 Like Kendall's :math:`\tau`, Somers' :math:`D` is a measure of the 

705 correspondence between two rankings. Both statistics consider the 

706 difference between the number of concordant and discordant pairs in two 

707 rankings :math:`X` and :math:`Y`, and both are normalized such that values 

708 close to 1 indicate strong agreement and values close to -1 indicate 

709 strong disagreement. They differ in how they are normalized. To show the 

710 relationship, Somers' :math:`D` can be defined in terms of Kendall's 

711 :math:`\tau_a`: 

712 

713 .. math:: 

714 D(Y|X) = \frac{\tau_a(X, Y)}{\tau_a(X, X)} 

715 

716 Suppose the first ranking :math:`X` has :math:`r` distinct ranks and the 

717 second ranking :math:`Y` has :math:`s` distinct ranks. These two lists of 

718 :math:`n` rankings can also be viewed as an :math:`r \times s` contingency 

719 table in which element :math:`i, j` is the number of rank pairs with rank 

720 :math:`i` in ranking :math:`X` and rank :math:`j` in ranking :math:`Y`. 

721 Accordingly, `somersd` also allows the input data to be supplied as a 

722 single, 2D contingency table instead of as two separate, 1D rankings. 

723 

724 Note that the definition of Somers' :math:`D` is asymmetric: in general, 

725 :math:`D(Y|X) \neq D(X|Y)`. ``somersd(x, y)`` calculates Somers' 

726 :math:`D(Y|X)`: the "row" variable :math:`X` is treated as an independent 

727 variable, and the "column" variable :math:`Y` is dependent. For Somers' 

728 :math:`D(X|Y)`, swap the input lists or transpose the input table. 

729 

730 Parameters 

731 ---------- 

732 x : array_like 

733 1D array of rankings, treated as the (row) independent variable. 

734 Alternatively, a 2D contingency table. 

735 y : array_like, optional 

736 If `x` is a 1D array of rankings, `y` is a 1D array of rankings of the 

737 same length, treated as the (column) dependent variable. 

738 If `x` is 2D, `y` is ignored. 

739 alternative : {'two-sided', 'less', 'greater'}, optional 

740 Defines the alternative hypothesis. Default is 'two-sided'. 

741 The following options are available: 

742 * 'two-sided': the rank correlation is nonzero 

743 * 'less': the rank correlation is negative (less than zero) 

744 * 'greater': the rank correlation is positive (greater than zero) 

745 

746 Returns 

747 ------- 

748 res : SomersDResult 

749 A `SomersDResult` object with the following fields: 

750 

751 statistic : float 

752 The Somers' :math:`D` statistic. 

753 pvalue : float 

754 The p-value for a hypothesis test whose null 

755 hypothesis is an absence of association, :math:`D=0`. 

756 See notes for more information. 

757 table : 2D array 

758 The contingency table formed from rankings `x` and `y` (or the 

759 provided contingency table, if `x` is a 2D array) 

760 

761 See Also 

762 -------- 

763 kendalltau : Calculates Kendall's tau, another correlation measure. 

764 weightedtau : Computes a weighted version of Kendall's tau. 

765 spearmanr : Calculates a Spearman rank-order correlation coefficient. 

766 pearsonr : Calculates a Pearson correlation coefficient. 

767 

768 Notes 

769 ----- 

770 This function follows the contingency table approach of [2]_ and 

771 [3]_. *p*-values are computed based on an asymptotic approximation of 

772 the test statistic distribution under the null hypothesis :math:`D=0`. 

773 

774 Theoretically, hypothesis tests based on Kendall's :math:`tau` and Somers' 

775 :math:`D` should be identical. 

776 However, the *p*-values returned by `kendalltau` are based 

777 on the null hypothesis of *independence* between :math:`X` and :math:`Y` 

778 (i.e. the population from which pairs in :math:`X` and :math:`Y` are 

779 sampled contains equal numbers of all possible pairs), which is more 

780 specific than the null hypothesis :math:`D=0` used here. If the null 

781 hypothesis of independence is desired, it is acceptable to use the 

782 *p*-value returned by `kendalltau` with the statistic returned by 

783 `somersd` and vice versa. For more information, see [2]_. 

784 

785 Contingency tables are formatted according to the convention used by 

786 SAS and R: the first ranking supplied (``x``) is the "row" variable, and 

787 the second ranking supplied (``y``) is the "column" variable. This is 

788 opposite the convention of Somers' original paper [1]_. 

789 

790 References 

791 ---------- 

792 .. [1] Robert H. Somers, "A New Asymmetric Measure of Association for 

793 Ordinal Variables", *American Sociological Review*, Vol. 27, No. 6, 

794 pp. 799--811, 1962. 

795 

796 .. [2] Morton B. Brown and Jacqueline K. Benedetti, "Sampling Behavior of 

797 Tests for Correlation in Two-Way Contingency Tables", *Journal of 

798 the American Statistical Association* Vol. 72, No. 358, pp. 

799 309--315, 1977. 

800 

801 .. [3] SAS Institute, Inc., "The FREQ Procedure (Book Excerpt)", 

802 *SAS/STAT 9.2 User's Guide, Second Edition*, SAS Publishing, 2009. 

803 

804 .. [4] Laerd Statistics, "Somers' d using SPSS Statistics", *SPSS 

805 Statistics Tutorials and Statistical Guides*, 

806 https://statistics.laerd.com/spss-tutorials/somers-d-using-spss-statistics.php, 

807 Accessed July 31, 2020. 

808 

809 Examples 

810 -------- 

811 We calculate Somers' D for the example given in [4]_, in which a hotel 

812 chain owner seeks to determine the association between hotel room 

813 cleanliness and customer satisfaction. The independent variable, hotel 

814 room cleanliness, is ranked on an ordinal scale: "below average (1)", 

815 "average (2)", or "above average (3)". The dependent variable, customer 

816 satisfaction, is ranked on a second scale: "very dissatisfied (1)", 

817 "moderately dissatisfied (2)", "neither dissatisfied nor satisfied (3)", 

818 "moderately satisfied (4)", or "very satisfied (5)". 189 customers 

819 respond to the survey, and the results are cast into a contingency table 

820 with the hotel room cleanliness as the "row" variable and customer 

821 satisfaction as the "column" variable. 

822 

823 +-----+-----+-----+-----+-----+-----+ 

824 | | (1) | (2) | (3) | (4) | (5) | 

825 +=====+=====+=====+=====+=====+=====+ 

826 | (1) | 27 | 25 | 14 | 7 | 0 | 

827 +-----+-----+-----+-----+-----+-----+ 

828 | (2) | 7 | 14 | 18 | 35 | 12 | 

829 +-----+-----+-----+-----+-----+-----+ 

830 | (3) | 1 | 3 | 2 | 7 | 17 | 

831 +-----+-----+-----+-----+-----+-----+ 

832 

833 For example, 27 customers assigned their room a cleanliness ranking of 

834 "below average (1)" and a corresponding satisfaction of "very 

835 dissatisfied (1)". We perform the analysis as follows. 

836 

837 >>> from scipy.stats import somersd 

838 >>> table = [[27, 25, 14, 7, 0], [7, 14, 18, 35, 12], [1, 3, 2, 7, 17]] 

839 >>> res = somersd(table) 

840 >>> res.statistic 

841 0.6032766111513396 

842 >>> res.pvalue 

843 1.0007091191074533e-27 

844 

845 The value of the Somers' D statistic is approximately 0.6, indicating 

846 a positive correlation between room cleanliness and customer satisfaction 

847 in the sample. 

848 The *p*-value is very small, indicating a very small probability of 

849 observing such an extreme value of the statistic under the null 

850 hypothesis that the statistic of the entire population (from which 

851 our sample of 189 customers is drawn) is zero. This supports the 

852 alternative hypothesis that the true value of Somers' D for the population 

853 is nonzero. 

854 

855 """ 

856 x, y = np.array(x), np.array(y) 

857 if x.ndim == 1: 

858 if x.size != y.size: 

859 raise ValueError("Rankings must be of equal length.") 

860 table = scipy.stats.contingency.crosstab(x, y)[1] 

861 elif x.ndim == 2: 

862 if np.any(x < 0): 

863 raise ValueError("All elements of the contingency table must be " 

864 "non-negative.") 

865 if np.any(x != x.astype(int)): 

866 raise ValueError("All elements of the contingency table must be " 

867 "integer.") 

868 if x.nonzero()[0].size < 2: 

869 raise ValueError("At least two elements of the contingency table " 

870 "must be nonzero.") 

871 table = x 

872 else: 

873 raise ValueError("x must be either a 1D or 2D array") 

874 d, p = _somers_d(table, alternative) 

875 

876 # add alias for consistency with other correlation functions 

877 res = SomersDResult(d, p, table) 

878 res.correlation = d 

879 return res 

880 

881 

882# This could be combined with `_all_partitions` in `_resampling.py` 

883def _all_partitions(nx, ny): 

884 """ 

885 Partition a set of indices into two fixed-length sets in all possible ways 

886 

887 Partition a set of indices 0 ... nx + ny - 1 into two sets of length nx and 

888 ny in all possible ways (ignoring order of elements). 

889 """ 

890 z = np.arange(nx+ny) 

891 for c in combinations(z, nx): 

892 x = np.array(c) 

893 mask = np.ones(nx+ny, bool) 

894 mask[x] = False 

895 y = z[mask] 

896 yield x, y 

897 

898 

899def _compute_log_combinations(n): 

900 """Compute all log combination of C(n, k).""" 

901 gammaln_arr = gammaln(np.arange(n + 1) + 1) 

902 return gammaln(n + 1) - gammaln_arr - gammaln_arr[::-1] 

903 

904 

905BarnardExactResult = make_dataclass( 

906 "BarnardExactResult", [("statistic", float), ("pvalue", float)] 

907) 

908 

909 

910def barnard_exact(table, alternative="two-sided", pooled=True, n=32): 

911 r"""Perform a Barnard exact test on a 2x2 contingency table. 

912 

913 Parameters 

914 ---------- 

915 table : array_like of ints 

916 A 2x2 contingency table. Elements should be non-negative integers. 

917 

918 alternative : {'two-sided', 'less', 'greater'}, optional 

919 Defines the null and alternative hypotheses. Default is 'two-sided'. 

920 Please see explanations in the Notes section below. 

921 

922 pooled : bool, optional 

923 Whether to compute score statistic with pooled variance (as in 

924 Student's t-test, for example) or unpooled variance (as in Welch's 

925 t-test). Default is ``True``. 

926 

927 n : int, optional 

928 Number of sampling points used in the construction of the sampling 

929 method. Note that this argument will automatically be converted to 

930 the next higher power of 2 since `scipy.stats.qmc.Sobol` is used to 

931 select sample points. Default is 32. Must be positive. In most cases, 

932 32 points is enough to reach good precision. More points comes at 

933 performance cost. 

934 

935 Returns 

936 ------- 

937 ber : BarnardExactResult 

938 A result object with the following attributes. 

939 

940 statistic : float 

941 The Wald statistic with pooled or unpooled variance, depending 

942 on the user choice of `pooled`. 

943 

944 pvalue : float 

945 P-value, the probability of obtaining a distribution at least as 

946 extreme as the one that was actually observed, assuming that the 

947 null hypothesis is true. 

948 

949 See Also 

950 -------- 

951 chi2_contingency : Chi-square test of independence of variables in a 

952 contingency table. 

953 fisher_exact : Fisher exact test on a 2x2 contingency table. 

954 boschloo_exact : Boschloo's exact test on a 2x2 contingency table, 

955 which is an uniformly more powerful alternative to Fisher's exact test. 

956 

957 Notes 

958 ----- 

959 Barnard's test is an exact test used in the analysis of contingency 

960 tables. It examines the association of two categorical variables, and 

961 is a more powerful alternative than Fisher's exact test 

962 for 2x2 contingency tables. 

963 

964 Let's define :math:`X_0` a 2x2 matrix representing the observed sample, 

965 where each column stores the binomial experiment, as in the example 

966 below. Let's also define :math:`p_1, p_2` the theoretical binomial 

967 probabilities for :math:`x_{11}` and :math:`x_{12}`. When using 

968 Barnard exact test, we can assert three different null hypotheses : 

969 

970 - :math:`H_0 : p_1 \geq p_2` versus :math:`H_1 : p_1 < p_2`, 

971 with `alternative` = "less" 

972 

973 - :math:`H_0 : p_1 \leq p_2` versus :math:`H_1 : p_1 > p_2`, 

974 with `alternative` = "greater" 

975 

976 - :math:`H_0 : p_1 = p_2` versus :math:`H_1 : p_1 \neq p_2`, 

977 with `alternative` = "two-sided" (default one) 

978 

979 In order to compute Barnard's exact test, we are using the Wald 

980 statistic [3]_ with pooled or unpooled variance. 

981 Under the default assumption that both variances are equal 

982 (``pooled = True``), the statistic is computed as: 

983 

984 .. math:: 

985 

986 T(X) = \frac{ 

987 \hat{p}_1 - \hat{p}_2 

988 }{ 

989 \sqrt{ 

990 \hat{p}(1 - \hat{p}) 

991 (\frac{1}{c_1} + 

992 \frac{1}{c_2}) 

993 } 

994 } 

995 

996 with :math:`\hat{p}_1, \hat{p}_2` and :math:`\hat{p}` the estimator of 

997 :math:`p_1, p_2` and :math:`p`, the latter being the combined probability, 

998 given the assumption that :math:`p_1 = p_2`. 

999 

1000 If this assumption is invalid (``pooled = False``), the statistic is: 

1001 

1002 .. math:: 

1003 

1004 T(X) = \frac{ 

1005 \hat{p}_1 - \hat{p}_2 

1006 }{ 

1007 \sqrt{ 

1008 \frac{\hat{p}_1 (1 - \hat{p}_1)}{c_1} + 

1009 \frac{\hat{p}_2 (1 - \hat{p}_2)}{c_2} 

1010 } 

1011 } 

1012 

1013 The p-value is then computed as: 

1014 

1015 .. math:: 

1016 

1017 \sum 

1018 \binom{c_1}{x_{11}} 

1019 \binom{c_2}{x_{12}} 

1020 \pi^{x_{11} + x_{12}} 

1021 (1 - \pi)^{t - x_{11} - x_{12}} 

1022 

1023 where the sum is over all 2x2 contingency tables :math:`X` such that: 

1024 * :math:`T(X) \leq T(X_0)` when `alternative` = "less", 

1025 * :math:`T(X) \geq T(X_0)` when `alternative` = "greater", or 

1026 * :math:`T(X) \geq |T(X_0)|` when `alternative` = "two-sided". 

1027 Above, :math:`c_1, c_2` are the sum of the columns 1 and 2, 

1028 and :math:`t` the total (sum of the 4 sample's element). 

1029 

1030 The returned p-value is the maximum p-value taken over the nuisance 

1031 parameter :math:`\pi`, where :math:`0 \leq \pi \leq 1`. 

1032 

1033 This function's complexity is :math:`O(n c_1 c_2)`, where `n` is the 

1034 number of sample points. 

1035 

1036 References 

1037 ---------- 

1038 .. [1] Barnard, G. A. "Significance Tests for 2x2 Tables". *Biometrika*. 

1039 34.1/2 (1947): 123-138. :doi:`dpgkg3` 

1040 

1041 .. [2] Mehta, Cyrus R., and Pralay Senchaudhuri. "Conditional versus 

1042 unconditional exact tests for comparing two binomials." 

1043 *Cytel Software Corporation* 675 (2003): 1-5. 

1044 

1045 .. [3] "Wald Test". *Wikipedia*. https://en.wikipedia.org/wiki/Wald_test 

1046 

1047 Examples 

1048 -------- 

1049 An example use of Barnard's test is presented in [2]_. 

1050 

1051 Consider the following example of a vaccine efficacy study 

1052 (Chan, 1998). In a randomized clinical trial of 30 subjects, 15 were 

1053 inoculated with a recombinant DNA influenza vaccine and the 15 were 

1054 inoculated with a placebo. Twelve of the 15 subjects in the placebo 

1055 group (80%) eventually became infected with influenza whereas for the 

1056 vaccine group, only 7 of the 15 subjects (47%) became infected. The 

1057 data are tabulated as a 2 x 2 table:: 

1058 

1059 Vaccine Placebo 

1060 Yes 7 12 

1061 No 8 3 

1062 

1063 When working with statistical hypothesis testing, we usually use a 

1064 threshold probability or significance level upon which we decide 

1065 to reject the null hypothesis :math:`H_0`. Suppose we choose the common 

1066 significance level of 5%. 

1067 

1068 Our alternative hypothesis is that the vaccine will lower the chance of 

1069 becoming infected with the virus; that is, the probability :math:`p_1` of 

1070 catching the virus with the vaccine will be *less than* the probability 

1071 :math:`p_2` of catching the virus without the vaccine. Therefore, we call 

1072 `barnard_exact` with the ``alternative="less"`` option: 

1073 

1074 >>> import scipy.stats as stats 

1075 >>> res = stats.barnard_exact([[7, 12], [8, 3]], alternative="less") 

1076 >>> res.statistic 

1077 -1.894... 

1078 >>> res.pvalue 

1079 0.03407... 

1080 

1081 Under the null hypothesis that the vaccine will not lower the chance of 

1082 becoming infected, the probability of obtaining test results at least as 

1083 extreme as the observed data is approximately 3.4%. Since this p-value is 

1084 less than our chosen significance level, we have evidence to reject 

1085 :math:`H_0` in favor of the alternative. 

1086 

1087 Suppose we had used Fisher's exact test instead: 

1088 

1089 >>> _, pvalue = stats.fisher_exact([[7, 12], [8, 3]], alternative="less") 

1090 >>> pvalue 

1091 0.0640... 

1092 

1093 With the same threshold significance of 5%, we would not have been able 

1094 to reject the null hypothesis in favor of the alternative. As stated in 

1095 [2]_, Barnard's test is uniformly more powerful than Fisher's exact test 

1096 because Barnard's test does not condition on any margin. Fisher's test 

1097 should only be used when both sets of marginals are fixed. 

1098 

1099 """ 

1100 if n <= 0: 

1101 raise ValueError( 

1102 "Number of points `n` must be strictly positive, " 

1103 f"found {n!r}" 

1104 ) 

1105 

1106 table = np.asarray(table, dtype=np.int64) 

1107 

1108 if not table.shape == (2, 2): 

1109 raise ValueError("The input `table` must be of shape (2, 2).") 

1110 

1111 if np.any(table < 0): 

1112 raise ValueError("All values in `table` must be nonnegative.") 

1113 

1114 if 0 in table.sum(axis=0): 

1115 # If both values in column are zero, the p-value is 1 and 

1116 # the score's statistic is NaN. 

1117 return BarnardExactResult(np.nan, 1.0) 

1118 

1119 total_col_1, total_col_2 = table.sum(axis=0) 

1120 

1121 x1 = np.arange(total_col_1 + 1, dtype=np.int64).reshape(-1, 1) 

1122 x2 = np.arange(total_col_2 + 1, dtype=np.int64).reshape(1, -1) 

1123 

1124 # We need to calculate the wald statistics for each combination of x1 and 

1125 # x2. 

1126 p1, p2 = x1 / total_col_1, x2 / total_col_2 

1127 

1128 if pooled: 

1129 p = (x1 + x2) / (total_col_1 + total_col_2) 

1130 variances = p * (1 - p) * (1 / total_col_1 + 1 / total_col_2) 

1131 else: 

1132 variances = p1 * (1 - p1) / total_col_1 + p2 * (1 - p2) / total_col_2 

1133 

1134 # To avoid warning when dividing by 0 

1135 with np.errstate(divide="ignore", invalid="ignore"): 

1136 wald_statistic = np.divide((p1 - p2), np.sqrt(variances)) 

1137 

1138 wald_statistic[p1 == p2] = 0 # Removing NaN values 

1139 

1140 wald_stat_obs = wald_statistic[table[0, 0], table[0, 1]] 

1141 

1142 if alternative == "two-sided": 

1143 index_arr = np.abs(wald_statistic) >= abs(wald_stat_obs) 

1144 elif alternative == "less": 

1145 index_arr = wald_statistic <= wald_stat_obs 

1146 elif alternative == "greater": 

1147 index_arr = wald_statistic >= wald_stat_obs 

1148 else: 

1149 msg = ( 

1150 "`alternative` should be one of {'two-sided', 'less', 'greater'}," 

1151 f" found {alternative!r}" 

1152 ) 

1153 raise ValueError(msg) 

1154 

1155 x1_sum_x2 = x1 + x2 

1156 

1157 x1_log_comb = _compute_log_combinations(total_col_1) 

1158 x2_log_comb = _compute_log_combinations(total_col_2) 

1159 x1_sum_x2_log_comb = x1_log_comb[x1] + x2_log_comb[x2] 

1160 

1161 result = shgo( 

1162 _get_binomial_log_p_value_with_nuisance_param, 

1163 args=(x1_sum_x2, x1_sum_x2_log_comb, index_arr), 

1164 bounds=((0, 1),), 

1165 n=n, 

1166 sampling_method="sobol", 

1167 ) 

1168 

1169 # result.fun is the negative log pvalue and therefore needs to be 

1170 # changed before return 

1171 p_value = np.clip(np.exp(-result.fun), a_min=0, a_max=1) 

1172 return BarnardExactResult(wald_stat_obs, p_value) 

1173 

1174 

1175BoschlooExactResult = make_dataclass( 

1176 "BoschlooExactResult", [("statistic", float), ("pvalue", float)] 

1177) 

1178 

1179 

1180def boschloo_exact(table, alternative="two-sided", n=32): 

1181 r"""Perform Boschloo's exact test on a 2x2 contingency table. 

1182 

1183 Parameters 

1184 ---------- 

1185 table : array_like of ints 

1186 A 2x2 contingency table. Elements should be non-negative integers. 

1187 

1188 alternative : {'two-sided', 'less', 'greater'}, optional 

1189 Defines the null and alternative hypotheses. Default is 'two-sided'. 

1190 Please see explanations in the Notes section below. 

1191 

1192 n : int, optional 

1193 Number of sampling points used in the construction of the sampling 

1194 method. Note that this argument will automatically be converted to 

1195 the next higher power of 2 since `scipy.stats.qmc.Sobol` is used to 

1196 select sample points. Default is 32. Must be positive. In most cases, 

1197 32 points is enough to reach good precision. More points comes at 

1198 performance cost. 

1199 

1200 Returns 

1201 ------- 

1202 ber : BoschlooExactResult 

1203 A result object with the following attributes. 

1204 

1205 statistic : float 

1206 The statistic used in Boschloo's test; that is, the p-value 

1207 from Fisher's exact test. 

1208 

1209 pvalue : float 

1210 P-value, the probability of obtaining a distribution at least as 

1211 extreme as the one that was actually observed, assuming that the 

1212 null hypothesis is true. 

1213 

1214 See Also 

1215 -------- 

1216 chi2_contingency : Chi-square test of independence of variables in a 

1217 contingency table. 

1218 fisher_exact : Fisher exact test on a 2x2 contingency table. 

1219 barnard_exact : Barnard's exact test, which is a more powerful alternative 

1220 than Fisher's exact test for 2x2 contingency tables. 

1221 

1222 Notes 

1223 ----- 

1224 Boschloo's test is an exact test used in the analysis of contingency 

1225 tables. It examines the association of two categorical variables, and 

1226 is a uniformly more powerful alternative to Fisher's exact test 

1227 for 2x2 contingency tables. 

1228 

1229 Boschloo's exact test uses the p-value of Fisher's exact test as a 

1230 statistic, and Boschloo's p-value is the probability under the null 

1231 hypothesis of observing such an extreme value of this statistic. 

1232 

1233 Let's define :math:`X_0` a 2x2 matrix representing the observed sample, 

1234 where each column stores the binomial experiment, as in the example 

1235 below. Let's also define :math:`p_1, p_2` the theoretical binomial 

1236 probabilities for :math:`x_{11}` and :math:`x_{12}`. When using 

1237 Boschloo exact test, we can assert three different alternative hypotheses: 

1238 

1239 - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 < p_2`, 

1240 with `alternative` = "less" 

1241 

1242 - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 > p_2`, 

1243 with `alternative` = "greater" 

1244 

1245 - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 \neq p_2`, 

1246 with `alternative` = "two-sided" (default) 

1247 

1248 There are multiple conventions for computing a two-sided p-value when the 

1249 null distribution is asymmetric. Here, we apply the convention that the 

1250 p-value of a two-sided test is twice the minimum of the p-values of the 

1251 one-sided tests (clipped to 1.0). Note that `fisher_exact` follows a 

1252 different convention, so for a given `table`, the statistic reported by 

1253 `boschloo_exact` may differ from the p-value reported by `fisher_exact` 

1254 when ``alternative='two-sided'``. 

1255 

1256 .. versionadded:: 1.7.0 

1257 

1258 References 

1259 ---------- 

1260 .. [1] R.D. Boschloo. "Raised conditional level of significance for the 

1261 2 x 2-table when testing the equality of two probabilities", 

1262 Statistica Neerlandica, 24(1), 1970 

1263 

1264 .. [2] "Boschloo's test", Wikipedia, 

1265 https://en.wikipedia.org/wiki/Boschloo%27s_test 

1266 

1267 .. [3] Lise M. Saari et al. "Employee attitudes and job satisfaction", 

1268 Human Resource Management, 43(4), 395-407, 2004, 

1269 :doi:`10.1002/hrm.20032`. 

1270 

1271 Examples 

1272 -------- 

1273 In the following example, we consider the article "Employee 

1274 attitudes and job satisfaction" [3]_ 

1275 which reports the results of a survey from 63 scientists and 117 college 

1276 professors. Of the 63 scientists, 31 said they were very satisfied with 

1277 their jobs, whereas 74 of the college professors were very satisfied 

1278 with their work. Is this significant evidence that college 

1279 professors are happier with their work than scientists? 

1280 The following table summarizes the data mentioned above:: 

1281 

1282 college professors scientists 

1283 Very Satisfied 74 31 

1284 Dissatisfied 43 32 

1285 

1286 When working with statistical hypothesis testing, we usually use a 

1287 threshold probability or significance level upon which we decide 

1288 to reject the null hypothesis :math:`H_0`. Suppose we choose the common 

1289 significance level of 5%. 

1290 

1291 Our alternative hypothesis is that college professors are truly more 

1292 satisfied with their work than scientists. Therefore, we expect 

1293 :math:`p_1` the proportion of very satisfied college professors to be 

1294 greater than :math:`p_2`, the proportion of very satisfied scientists. 

1295 We thus call `boschloo_exact` with the ``alternative="greater"`` option: 

1296 

1297 >>> import scipy.stats as stats 

1298 >>> res = stats.boschloo_exact([[74, 31], [43, 32]], alternative="greater") 

1299 >>> res.statistic 

1300 0.0483... 

1301 >>> res.pvalue 

1302 0.0355... 

1303 

1304 Under the null hypothesis that scientists are happier in their work than 

1305 college professors, the probability of obtaining test 

1306 results at least as extreme as the observed data is approximately 3.55%. 

1307 Since this p-value is less than our chosen significance level, we have 

1308 evidence to reject :math:`H_0` in favor of the alternative hypothesis. 

1309 

1310 """ 

1311 hypergeom = distributions.hypergeom 

1312 

1313 if n <= 0: 

1314 raise ValueError( 

1315 "Number of points `n` must be strictly positive," 

1316 f" found {n!r}" 

1317 ) 

1318 

1319 table = np.asarray(table, dtype=np.int64) 

1320 

1321 if not table.shape == (2, 2): 

1322 raise ValueError("The input `table` must be of shape (2, 2).") 

1323 

1324 if np.any(table < 0): 

1325 raise ValueError("All values in `table` must be nonnegative.") 

1326 

1327 if 0 in table.sum(axis=0): 

1328 # If both values in column are zero, the p-value is 1 and 

1329 # the score's statistic is NaN. 

1330 return BoschlooExactResult(np.nan, np.nan) 

1331 

1332 total_col_1, total_col_2 = table.sum(axis=0) 

1333 total = total_col_1 + total_col_2 

1334 x1 = np.arange(total_col_1 + 1, dtype=np.int64).reshape(1, -1) 

1335 x2 = np.arange(total_col_2 + 1, dtype=np.int64).reshape(-1, 1) 

1336 x1_sum_x2 = x1 + x2 

1337 

1338 if alternative == 'less': 

1339 pvalues = hypergeom.cdf(x1, total, x1_sum_x2, total_col_1).T 

1340 elif alternative == 'greater': 

1341 # Same formula as the 'less' case, but with the second column. 

1342 pvalues = hypergeom.cdf(x2, total, x1_sum_x2, total_col_2).T 

1343 elif alternative == 'two-sided': 

1344 boschloo_less = boschloo_exact(table, alternative="less", n=n) 

1345 boschloo_greater = boschloo_exact(table, alternative="greater", n=n) 

1346 

1347 res = ( 

1348 boschloo_less if boschloo_less.pvalue < boschloo_greater.pvalue 

1349 else boschloo_greater 

1350 ) 

1351 

1352 # Two-sided p-value is defined as twice the minimum of the one-sided 

1353 # p-values 

1354 pvalue = np.clip(2 * res.pvalue, a_min=0, a_max=1) 

1355 return BoschlooExactResult(res.statistic, pvalue) 

1356 else: 

1357 msg = ( 

1358 f"`alternative` should be one of {'two-sided', 'less', 'greater'}," 

1359 f" found {alternative!r}" 

1360 ) 

1361 raise ValueError(msg) 

1362 

1363 fisher_stat = pvalues[table[0, 0], table[0, 1]] 

1364 

1365 # fisher_stat * (1+1e-13) guards us from small numerical error. It is 

1366 # equivalent to np.isclose with relative tol of 1e-13 and absolute tol of 0 

1367 # For more throughout explanations, see gh-14178 

1368 index_arr = pvalues <= fisher_stat * (1+1e-13) 

1369 

1370 x1, x2, x1_sum_x2 = x1.T, x2.T, x1_sum_x2.T 

1371 x1_log_comb = _compute_log_combinations(total_col_1) 

1372 x2_log_comb = _compute_log_combinations(total_col_2) 

1373 x1_sum_x2_log_comb = x1_log_comb[x1] + x2_log_comb[x2] 

1374 

1375 result = shgo( 

1376 _get_binomial_log_p_value_with_nuisance_param, 

1377 args=(x1_sum_x2, x1_sum_x2_log_comb, index_arr), 

1378 bounds=((0, 1),), 

1379 n=n, 

1380 sampling_method="sobol", 

1381 ) 

1382 

1383 # result.fun is the negative log pvalue and therefore needs to be 

1384 # changed before return 

1385 p_value = np.clip(np.exp(-result.fun), a_min=0, a_max=1) 

1386 return BoschlooExactResult(fisher_stat, p_value) 

1387 

1388 

1389def _get_binomial_log_p_value_with_nuisance_param( 

1390 nuisance_param, x1_sum_x2, x1_sum_x2_log_comb, index_arr 

1391): 

1392 r""" 

1393 Compute the log pvalue in respect of a nuisance parameter considering 

1394 a 2x2 sample space. 

1395 

1396 Parameters 

1397 ---------- 

1398 nuisance_param : float 

1399 nuisance parameter used in the computation of the maximisation of 

1400 the p-value. Must be between 0 and 1 

1401 

1402 x1_sum_x2 : ndarray 

1403 Sum of x1 and x2 inside barnard_exact 

1404 

1405 x1_sum_x2_log_comb : ndarray 

1406 sum of the log combination of x1 and x2 

1407 

1408 index_arr : ndarray of boolean 

1409 

1410 Returns 

1411 ------- 

1412 p_value : float 

1413 Return the maximum p-value considering every nuisance paramater 

1414 between 0 and 1 

1415 

1416 Notes 

1417 ----- 

1418 

1419 Both Barnard's test and Boschloo's test iterate over a nuisance parameter 

1420 :math:`\pi \in [0, 1]` to find the maximum p-value. To search this 

1421 maxima, this function return the negative log pvalue with respect to the 

1422 nuisance parameter passed in params. This negative log p-value is then 

1423 used in `shgo` to find the minimum negative pvalue which is our maximum 

1424 pvalue. 

1425 

1426 Also, to compute the different combination used in the 

1427 p-values' computation formula, this function uses `gammaln` which is 

1428 more tolerant for large value than `scipy.special.comb`. `gammaln` gives 

1429 a log combination. For the little precision loss, performances are 

1430 improved a lot. 

1431 """ 

1432 t1, t2 = x1_sum_x2.shape 

1433 n = t1 + t2 - 2 

1434 with np.errstate(divide="ignore", invalid="ignore"): 

1435 log_nuisance = np.log( 

1436 nuisance_param, 

1437 out=np.zeros_like(nuisance_param), 

1438 where=nuisance_param >= 0, 

1439 ) 

1440 log_1_minus_nuisance = np.log( 

1441 1 - nuisance_param, 

1442 out=np.zeros_like(nuisance_param), 

1443 where=1 - nuisance_param >= 0, 

1444 ) 

1445 

1446 nuisance_power_x1_x2 = log_nuisance * x1_sum_x2 

1447 nuisance_power_x1_x2[(x1_sum_x2 == 0)[:, :]] = 0 

1448 

1449 nuisance_power_n_minus_x1_x2 = log_1_minus_nuisance * (n - x1_sum_x2) 

1450 nuisance_power_n_minus_x1_x2[(x1_sum_x2 == n)[:, :]] = 0 

1451 

1452 tmp_log_values_arr = ( 

1453 x1_sum_x2_log_comb 

1454 + nuisance_power_x1_x2 

1455 + nuisance_power_n_minus_x1_x2 

1456 ) 

1457 

1458 tmp_values_from_index = tmp_log_values_arr[index_arr] 

1459 

1460 # To avoid dividing by zero in log function and getting inf value, 

1461 # values are centered according to the max 

1462 max_value = tmp_values_from_index.max() 

1463 

1464 # To have better result's precision, the log pvalue is taken here. 

1465 # Indeed, pvalue is included inside [0, 1] interval. Passing the 

1466 # pvalue to log makes the interval a lot bigger ([-inf, 0]), and thus 

1467 # help us to achieve better precision 

1468 with np.errstate(divide="ignore", invalid="ignore"): 

1469 log_probs = np.exp(tmp_values_from_index - max_value).sum() 

1470 log_pvalue = max_value + np.log( 

1471 log_probs, 

1472 out=np.full_like(log_probs, -np.inf), 

1473 where=log_probs > 0, 

1474 ) 

1475 

1476 # Since shgo find the minima, minus log pvalue is returned 

1477 return -log_pvalue 

1478 

1479 

1480def _pval_cvm_2samp_exact(s, m, n): 

1481 """ 

1482 Compute the exact p-value of the Cramer-von Mises two-sample test 

1483 for a given value s of the test statistic. 

1484 m and n are the sizes of the samples. 

1485 

1486 [1] Y. Xiao, A. Gordon, and A. Yakovlev, "A C++ Program for 

1487 the Cramér-Von Mises Two-Sample Test", J. Stat. Soft., 

1488 vol. 17, no. 8, pp. 1-15, Dec. 2006. 

1489 [2] T. W. Anderson "On the Distribution of the Two-Sample Cramer-von Mises 

1490 Criterion," The Annals of Mathematical Statistics, Ann. Math. Statist. 

1491 33(3), 1148-1159, (September, 1962) 

1492 """ 

1493 

1494 # [1, p. 3] 

1495 lcm = np.lcm(m, n) 

1496 # [1, p. 4], below eq. 3 

1497 a = lcm // m 

1498 b = lcm // n 

1499 # Combine Eq. 9 in [2] with Eq. 2 in [1] and solve for $\zeta$ 

1500 # Hint: `s` is $U$ in [2], and $T_2$ in [1] is $T$ in [2] 

1501 mn = m * n 

1502 zeta = lcm ** 2 * (m + n) * (6 * s - mn * (4 * mn - 1)) // (6 * mn ** 2) 

1503 

1504 # bound maximum value that may appear in `gs` (remember both rows!) 

1505 zeta_bound = lcm**2 * (m + n) # bound elements in row 1 

1506 combinations = comb(m + n, m) # sum of row 2 

1507 max_gs = max(zeta_bound, combinations) 

1508 dtype = np.min_scalar_type(max_gs) 

1509 

1510 # the frequency table of $g_{u, v}^+$ defined in [1, p. 6] 

1511 gs = ([np.array([[0], [1]], dtype=dtype)] 

1512 + [np.empty((2, 0), dtype=dtype) for _ in range(m)]) 

1513 for u in range(n + 1): 

1514 next_gs = [] 

1515 tmp = np.empty((2, 0), dtype=dtype) 

1516 for v, g in enumerate(gs): 

1517 # Calculate g recursively with eq. 11 in [1]. Even though it 

1518 # doesn't look like it, this also does 12/13 (all of Algorithm 1). 

1519 vi, i0, i1 = np.intersect1d(tmp[0], g[0], return_indices=True) 

1520 tmp = np.concatenate([ 

1521 np.stack([vi, tmp[1, i0] + g[1, i1]]), 

1522 np.delete(tmp, i0, 1), 

1523 np.delete(g, i1, 1) 

1524 ], 1) 

1525 tmp[0] += (a * v - b * u) ** 2 

1526 next_gs.append(tmp) 

1527 gs = next_gs 

1528 value, freq = gs[m] 

1529 return np.float64(np.sum(freq[value >= zeta]) / combinations) 

1530 

1531 

1532def cramervonmises_2samp(x, y, method='auto'): 

1533 """Perform the two-sample Cramér-von Mises test for goodness of fit. 

1534 

1535 This is the two-sample version of the Cramér-von Mises test ([1]_): 

1536 for two independent samples :math:`X_1, ..., X_n` and 

1537 :math:`Y_1, ..., Y_m`, the null hypothesis is that the samples 

1538 come from the same (unspecified) continuous distribution. 

1539 

1540 Parameters 

1541 ---------- 

1542 x : array_like 

1543 A 1-D array of observed values of the random variables :math:`X_i`. 

1544 y : array_like 

1545 A 1-D array of observed values of the random variables :math:`Y_i`. 

1546 method : {'auto', 'asymptotic', 'exact'}, optional 

1547 The method used to compute the p-value, see Notes for details. 

1548 The default is 'auto'. 

1549 

1550 Returns 

1551 ------- 

1552 res : object with attributes 

1553 statistic : float 

1554 Cramér-von Mises statistic. 

1555 pvalue : float 

1556 The p-value. 

1557 

1558 See Also 

1559 -------- 

1560 cramervonmises, anderson_ksamp, epps_singleton_2samp, ks_2samp 

1561 

1562 Notes 

1563 ----- 

1564 .. versionadded:: 1.7.0 

1565 

1566 The statistic is computed according to equation 9 in [2]_. The 

1567 calculation of the p-value depends on the keyword `method`: 

1568 

1569 - ``asymptotic``: The p-value is approximated by using the limiting 

1570 distribution of the test statistic. 

1571 - ``exact``: The exact p-value is computed by enumerating all 

1572 possible combinations of the test statistic, see [2]_. 

1573 

1574 If ``method='auto'``, the exact approach is used 

1575 if both samples contain equal to or less than 20 observations, 

1576 otherwise the asymptotic distribution is used. 

1577 

1578 If the underlying distribution is not continuous, the p-value is likely to 

1579 be conservative (Section 6.2 in [3]_). When ranking the data to compute 

1580 the test statistic, midranks are used if there are ties. 

1581 

1582 References 

1583 ---------- 

1584 .. [1] https://en.wikipedia.org/wiki/Cramer-von_Mises_criterion 

1585 .. [2] Anderson, T.W. (1962). On the distribution of the two-sample 

1586 Cramer-von-Mises criterion. The Annals of Mathematical 

1587 Statistics, pp. 1148-1159. 

1588 .. [3] Conover, W.J., Practical Nonparametric Statistics, 1971. 

1589 

1590 Examples 

1591 -------- 

1592 

1593 Suppose we wish to test whether two samples generated by 

1594 ``scipy.stats.norm.rvs`` have the same distribution. We choose a 

1595 significance level of alpha=0.05. 

1596 

1597 >>> import numpy as np 

1598 >>> from scipy import stats 

1599 >>> rng = np.random.default_rng() 

1600 >>> x = stats.norm.rvs(size=100, random_state=rng) 

1601 >>> y = stats.norm.rvs(size=70, random_state=rng) 

1602 >>> res = stats.cramervonmises_2samp(x, y) 

1603 >>> res.statistic, res.pvalue 

1604 (0.29376470588235293, 0.1412873014573014) 

1605 

1606 The p-value exceeds our chosen significance level, so we do not 

1607 reject the null hypothesis that the observed samples are drawn from the 

1608 same distribution. 

1609 

1610 For small sample sizes, one can compute the exact p-values: 

1611 

1612 >>> x = stats.norm.rvs(size=7, random_state=rng) 

1613 >>> y = stats.t.rvs(df=2, size=6, random_state=rng) 

1614 >>> res = stats.cramervonmises_2samp(x, y, method='exact') 

1615 >>> res.statistic, res.pvalue 

1616 (0.197802197802198, 0.31643356643356646) 

1617 

1618 The p-value based on the asymptotic distribution is a good approximation 

1619 even though the sample size is small. 

1620 

1621 >>> res = stats.cramervonmises_2samp(x, y, method='asymptotic') 

1622 >>> res.statistic, res.pvalue 

1623 (0.197802197802198, 0.2966041181527128) 

1624 

1625 Independent of the method, one would not reject the null hypothesis at the 

1626 chosen significance level in this example. 

1627 

1628 """ 

1629 xa = np.sort(np.asarray(x)) 

1630 ya = np.sort(np.asarray(y)) 

1631 

1632 if xa.size <= 1 or ya.size <= 1: 

1633 raise ValueError('x and y must contain at least two observations.') 

1634 if xa.ndim > 1 or ya.ndim > 1: 

1635 raise ValueError('The samples must be one-dimensional.') 

1636 if method not in ['auto', 'exact', 'asymptotic']: 

1637 raise ValueError('method must be either auto, exact or asymptotic.') 

1638 

1639 nx = len(xa) 

1640 ny = len(ya) 

1641 

1642 if method == 'auto': 

1643 if max(nx, ny) > 20: 

1644 method = 'asymptotic' 

1645 else: 

1646 method = 'exact' 

1647 

1648 # get ranks of x and y in the pooled sample 

1649 z = np.concatenate([xa, ya]) 

1650 # in case of ties, use midrank (see [1]) 

1651 r = scipy.stats.rankdata(z, method='average') 

1652 rx = r[:nx] 

1653 ry = r[nx:] 

1654 

1655 # compute U (eq. 10 in [2]) 

1656 u = nx * np.sum((rx - np.arange(1, nx+1))**2) 

1657 u += ny * np.sum((ry - np.arange(1, ny+1))**2) 

1658 

1659 # compute T (eq. 9 in [2]) 

1660 k, N = nx*ny, nx + ny 

1661 t = u / (k*N) - (4*k - 1)/(6*N) 

1662 

1663 if method == 'exact': 

1664 p = _pval_cvm_2samp_exact(u, nx, ny) 

1665 else: 

1666 # compute expected value and variance of T (eq. 11 and 14 in [2]) 

1667 et = (1 + 1/N)/6 

1668 vt = (N+1) * (4*k*N - 3*(nx**2 + ny**2) - 2*k) 

1669 vt = vt / (45 * N**2 * 4 * k) 

1670 

1671 # computed the normalized statistic (eq. 15 in [2]) 

1672 tn = 1/6 + (t - et) / np.sqrt(45 * vt) 

1673 

1674 # approximate distribution of tn with limiting distribution 

1675 # of the one-sample test statistic 

1676 # if tn < 0.003, the _cdf_cvm_inf(tn) < 1.28*1e-18, return 1.0 directly 

1677 if tn < 0.003: 

1678 p = 1.0 

1679 else: 

1680 p = max(0, 1. - _cdf_cvm_inf(tn)) 

1681 

1682 return CramerVonMisesResult(statistic=t, pvalue=p) 

1683 

1684 

1685class TukeyHSDResult: 

1686 """Result of `scipy.stats.tukey_hsd`. 

1687 

1688 Attributes 

1689 ---------- 

1690 statistic : float ndarray 

1691 The computed statistic of the test for each comparison. The element 

1692 at index ``(i, j)`` is the statistic for the comparison between groups 

1693 ``i`` and ``j``. 

1694 pvalue : float ndarray 

1695 The associated p-value from the studentized range distribution. The 

1696 element at index ``(i, j)`` is the p-value for the comparison 

1697 between groups ``i`` and ``j``. 

1698 

1699 Notes 

1700 ----- 

1701 The string representation of this object displays the most recently 

1702 calculated confidence interval, and if none have been previously 

1703 calculated, it will evaluate ``confidence_interval()``. 

1704 

1705 References 

1706 ---------- 

1707 .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's 

1708 Method." 

1709 https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm, 

1710 28 November 2020. 

1711 """ 

1712 

1713 def __init__(self, statistic, pvalue, _nobs, _ntreatments, _stand_err): 

1714 self.statistic = statistic 

1715 self.pvalue = pvalue 

1716 self._ntreatments = _ntreatments 

1717 self._nobs = _nobs 

1718 self._stand_err = _stand_err 

1719 self._ci = None 

1720 self._ci_cl = None 

1721 

1722 def __str__(self): 

1723 # Note: `__str__` prints the confidence intervals from the most 

1724 # recent call to `confidence_interval`. If it has not been called, 

1725 # it will be called with the default CL of .95. 

1726 if self._ci is None: 

1727 self.confidence_interval(confidence_level=.95) 

1728 s = ("Tukey's HSD Pairwise Group Comparisons" 

1729 f" ({self._ci_cl*100:.1f}% Confidence Interval)\n") 

1730 s += "Comparison Statistic p-value Lower CI Upper CI\n" 

1731 for i in range(self.pvalue.shape[0]): 

1732 for j in range(self.pvalue.shape[0]): 

1733 if i != j: 

1734 s += (f" ({i} - {j}) {self.statistic[i, j]:>10.3f}" 

1735 f"{self.pvalue[i, j]:>10.3f}" 

1736 f"{self._ci.low[i, j]:>10.3f}" 

1737 f"{self._ci.high[i, j]:>10.3f}\n") 

1738 return s 

1739 

1740 def confidence_interval(self, confidence_level=.95): 

1741 """Compute the confidence interval for the specified confidence level. 

1742 

1743 Parameters 

1744 ---------- 

1745 confidence_level : float, optional 

1746 Confidence level for the computed confidence interval 

1747 of the estimated proportion. Default is .95. 

1748 

1749 Returns 

1750 ------- 

1751 ci : ``ConfidenceInterval`` object 

1752 The object has attributes ``low`` and ``high`` that hold the 

1753 lower and upper bounds of the confidence intervals for each 

1754 comparison. The high and low values are accessible for each 

1755 comparison at index ``(i, j)`` between groups ``i`` and ``j``. 

1756 

1757 References 

1758 ---------- 

1759 .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. 

1760 Tukey's Method." 

1761 https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm, 

1762 28 November 2020. 

1763 

1764 Examples 

1765 -------- 

1766 >>> from scipy.stats import tukey_hsd 

1767 >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9] 

1768 >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1] 

1769 >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8] 

1770 >>> result = tukey_hsd(group0, group1, group2) 

1771 >>> ci = result.confidence_interval() 

1772 >>> ci.low 

1773 array([[-3.649159, -8.249159, -3.909159], 

1774 [ 0.950841, -3.649159, 0.690841], 

1775 [-3.389159, -7.989159, -3.649159]]) 

1776 >>> ci.high 

1777 array([[ 3.649159, -0.950841, 3.389159], 

1778 [ 8.249159, 3.649159, 7.989159], 

1779 [ 3.909159, -0.690841, 3.649159]]) 

1780 """ 

1781 # check to see if the supplied confidence level matches that of the 

1782 # previously computed CI. 

1783 if (self._ci is not None and self._ci_cl is not None and 

1784 confidence_level == self._ci_cl): 

1785 return self._ci 

1786 

1787 if not 0 < confidence_level < 1: 

1788 raise ValueError("Confidence level must be between 0 and 1.") 

1789 # determine the critical value of the studentized range using the 

1790 # appropriate confidence level, number of treatments, and degrees 

1791 # of freedom as determined by the number of data less the number of 

1792 # treatments. ("Confidence limits for Tukey's method")[1]. Note that 

1793 # in the cases of unequal sample sizes there will be a criterion for 

1794 # each group comparison. 

1795 params = (confidence_level, self._nobs, self._ntreatments - self._nobs) 

1796 srd = distributions.studentized_range.ppf(*params) 

1797 # also called maximum critical value, the Tukey criterion is the 

1798 # studentized range critical value * the square root of mean square 

1799 # error over the sample size. 

1800 tukey_criterion = srd * self._stand_err 

1801 # the confidence levels are determined by the 

1802 # `mean_differences` +- `tukey_criterion` 

1803 upper_conf = self.statistic + tukey_criterion 

1804 lower_conf = self.statistic - tukey_criterion 

1805 self._ci = ConfidenceInterval(low=lower_conf, high=upper_conf) 

1806 self._ci_cl = confidence_level 

1807 return self._ci 

1808 

1809 

1810def _tukey_hsd_iv(args): 

1811 if (len(args)) < 2: 

1812 raise ValueError("There must be more than 1 treatment.") 

1813 args = [np.asarray(arg) for arg in args] 

1814 for arg in args: 

1815 if arg.ndim != 1: 

1816 raise ValueError("Input samples must be one-dimensional.") 

1817 if arg.size <= 1: 

1818 raise ValueError("Input sample size must be greater than one.") 

1819 if np.isinf(arg).any(): 

1820 raise ValueError("Input samples must be finite.") 

1821 return args 

1822 

1823 

1824def tukey_hsd(*args): 

1825 """Perform Tukey's HSD test for equality of means over multiple treatments. 

1826 

1827 Tukey's honestly significant difference (HSD) test performs pairwise 

1828 comparison of means for a set of samples. Whereas ANOVA (e.g. `f_oneway`) 

1829 assesses whether the true means underlying each sample are identical, 

1830 Tukey's HSD is a post hoc test used to compare the mean of each sample 

1831 to the mean of each other sample. 

1832 

1833 The null hypothesis is that the distributions underlying the samples all 

1834 have the same mean. The test statistic, which is computed for every 

1835 possible pairing of samples, is simply the difference between the sample 

1836 means. For each pair, the p-value is the probability under the null 

1837 hypothesis (and other assumptions; see notes) of observing such an extreme 

1838 value of the statistic, considering that many pairwise comparisons are 

1839 being performed. Confidence intervals for the difference between each pair 

1840 of means are also available. 

1841 

1842 Parameters 

1843 ---------- 

1844 sample1, sample2, ... : array_like 

1845 The sample measurements for each group. There must be at least 

1846 two arguments. 

1847 

1848 Returns 

1849 ------- 

1850 result : `~scipy.stats._result_classes.TukeyHSDResult` instance 

1851 The return value is an object with the following attributes: 

1852 

1853 statistic : float ndarray 

1854 The computed statistic of the test for each comparison. The element 

1855 at index ``(i, j)`` is the statistic for the comparison between 

1856 groups ``i`` and ``j``. 

1857 pvalue : float ndarray 

1858 The computed p-value of the test for each comparison. The element 

1859 at index ``(i, j)`` is the p-value for the comparison between 

1860 groups ``i`` and ``j``. 

1861 

1862 The object has the following methods: 

1863 

1864 confidence_interval(confidence_level=0.95): 

1865 Compute the confidence interval for the specified confidence level. 

1866 

1867 Notes 

1868 ----- 

1869 The use of this test relies on several assumptions. 

1870 

1871 1. The observations are independent within and among groups. 

1872 2. The observations within each group are normally distributed. 

1873 3. The distributions from which the samples are drawn have the same finite 

1874 variance. 

1875 

1876 The original formulation of the test was for samples of equal size [6]_. 

1877 In case of unequal sample sizes, the test uses the Tukey-Kramer method 

1878 [4]_. 

1879 

1880 References 

1881 ---------- 

1882 .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's 

1883 Method." 

1884 https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm, 

1885 28 November 2020. 

1886 .. [2] Abdi, Herve & Williams, Lynne. (2021). "Tukey's Honestly Significant 

1887 Difference (HSD) Test." 

1888 https://personal.utdallas.edu/~herve/abdi-HSD2010-pretty.pdf 

1889 .. [3] "One-Way ANOVA Using SAS PROC ANOVA & PROC GLM." SAS 

1890 Tutorials, 2007, www.stattutorials.com/SAS/TUTORIAL-PROC-GLM.htm. 

1891 .. [4] Kramer, Clyde Young. "Extension of Multiple Range Tests to Group 

1892 Means with Unequal Numbers of Replications." Biometrics, vol. 12, 

1893 no. 3, 1956, pp. 307-310. JSTOR, www.jstor.org/stable/3001469. 

1894 Accessed 25 May 2021. 

1895 .. [5] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.3.3. 

1896 The ANOVA table and tests of hypotheses about means" 

1897 https://www.itl.nist.gov/div898/handbook/prc/section4/prc433.htm, 

1898 2 June 2021. 

1899 .. [6] Tukey, John W. "Comparing Individual Means in the Analysis of 

1900 Variance." Biometrics, vol. 5, no. 2, 1949, pp. 99-114. JSTOR, 

1901 www.jstor.org/stable/3001913. Accessed 14 June 2021. 

1902 

1903 

1904 Examples 

1905 -------- 

1906 Here are some data comparing the time to relief of three brands of 

1907 headache medicine, reported in minutes. Data adapted from [3]_. 

1908 

1909 >>> import numpy as np 

1910 >>> from scipy.stats import tukey_hsd 

1911 >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9] 

1912 >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1] 

1913 >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8] 

1914 

1915 We would like to see if the means between any of the groups are 

1916 significantly different. First, visually examine a box and whisker plot. 

1917 

1918 >>> import matplotlib.pyplot as plt 

1919 >>> fig, ax = plt.subplots(1, 1) 

1920 >>> ax.boxplot([group0, group1, group2]) 

1921 >>> ax.set_xticklabels(["group0", "group1", "group2"]) # doctest: +SKIP 

1922 >>> ax.set_ylabel("mean") # doctest: +SKIP 

1923 >>> plt.show() 

1924 

1925 From the box and whisker plot, we can see overlap in the interquartile 

1926 ranges group 1 to group 2 and group 3, but we can apply the ``tukey_hsd`` 

1927 test to determine if the difference between means is significant. We 

1928 set a significance level of .05 to reject the null hypothesis. 

1929 

1930 >>> res = tukey_hsd(group0, group1, group2) 

1931 >>> print(res) 

1932 Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval) 

1933 Comparison Statistic p-value Lower CI Upper CI 

1934 (0 - 1) -4.600 0.014 -8.249 -0.951 

1935 (0 - 2) -0.260 0.980 -3.909 3.389 

1936 (1 - 0) 4.600 0.014 0.951 8.249 

1937 (1 - 2) 4.340 0.020 0.691 7.989 

1938 (2 - 0) 0.260 0.980 -3.389 3.909 

1939 (2 - 1) -4.340 0.020 -7.989 -0.691 

1940 

1941 The null hypothesis is that each group has the same mean. The p-value for 

1942 comparisons between ``group0`` and ``group1`` as well as ``group1`` and 

1943 ``group2`` do not exceed .05, so we reject the null hypothesis that they 

1944 have the same means. The p-value of the comparison between ``group0`` 

1945 and ``group2`` exceeds .05, so we accept the null hypothesis that there 

1946 is not a significant difference between their means. 

1947 

1948 We can also compute the confidence interval associated with our chosen 

1949 confidence level. 

1950 

1951 >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9] 

1952 >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1] 

1953 >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8] 

1954 >>> result = tukey_hsd(group0, group1, group2) 

1955 >>> conf = res.confidence_interval(confidence_level=.99) 

1956 >>> for ((i, j), l) in np.ndenumerate(conf.low): 

1957 ... # filter out self comparisons 

1958 ... if i != j: 

1959 ... h = conf.high[i,j] 

1960 ... print(f"({i} - {j}) {l:>6.3f} {h:>6.3f}") 

1961 (0 - 1) -9.480 0.280 

1962 (0 - 2) -5.140 4.620 

1963 (1 - 0) -0.280 9.480 

1964 (1 - 2) -0.540 9.220 

1965 (2 - 0) -4.620 5.140 

1966 (2 - 1) -9.220 0.540 

1967 """ 

1968 args = _tukey_hsd_iv(args) 

1969 ntreatments = len(args) 

1970 means = np.asarray([np.mean(arg) for arg in args]) 

1971 nsamples_treatments = np.asarray([a.size for a in args]) 

1972 nobs = np.sum(nsamples_treatments) 

1973 

1974 # determine mean square error [5]. Note that this is sometimes called 

1975 # mean square error within. 

1976 mse = (np.sum([np.var(arg, ddof=1) for arg in args] * 

1977 (nsamples_treatments - 1)) / (nobs - ntreatments)) 

1978 

1979 # The calculation of the standard error differs when treatments differ in 

1980 # size. See ("Unequal sample sizes")[1]. 

1981 if np.unique(nsamples_treatments).size == 1: 

1982 # all input groups are the same length, so only one value needs to be 

1983 # calculated [1]. 

1984 normalize = 2 / nsamples_treatments[0] 

1985 else: 

1986 # to compare groups of differing sizes, we must compute a variance 

1987 # value for each individual comparison. Use broadcasting to get the 

1988 # resulting matrix. [3], verified against [4] (page 308). 

1989 normalize = 1 / nsamples_treatments + 1 / nsamples_treatments[None].T 

1990 

1991 # the standard error is used in the computation of the tukey criterion and 

1992 # finding the p-values. 

1993 stand_err = np.sqrt(normalize * mse / 2) 

1994 

1995 # the mean difference is the test statistic. 

1996 mean_differences = means[None].T - means 

1997 

1998 # Calculate the t-statistic to use within the survival function of the 

1999 # studentized range to get the p-value. 

2000 t_stat = np.abs(mean_differences) / stand_err 

2001 

2002 params = t_stat, ntreatments, nobs - ntreatments 

2003 pvalues = distributions.studentized_range.sf(*params) 

2004 

2005 return TukeyHSDResult(mean_differences, pvalues, ntreatments, 

2006 nobs, stand_err)