Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_hypotests.py: 10%
467 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
1from collections import namedtuple
2from dataclasses import make_dataclass
3from math import comb
4import numpy as np
5import warnings
6from itertools import combinations
7import scipy.stats
8from scipy.optimize import shgo
9from . import distributions
10from ._common import ConfidenceInterval
11from ._continuous_distns import chi2, norm
12from scipy.special import gamma, kv, gammaln
13from scipy.fft import ifft
14from ._stats_pythran import _a_ij_Aij_Dij2
15from ._stats_pythran import (
16 _concordant_pairs as _P, _discordant_pairs as _Q
17)
18from scipy.stats import _stats_py
20__all__ = ['epps_singleton_2samp', 'cramervonmises', 'somersd',
21 'barnard_exact', 'boschloo_exact', 'cramervonmises_2samp',
22 'tukey_hsd', 'poisson_means_test']
24Epps_Singleton_2sampResult = namedtuple('Epps_Singleton_2sampResult',
25 ('statistic', 'pvalue'))
28def epps_singleton_2samp(x, y, t=(0.4, 0.8)):
29 """Compute the Epps-Singleton (ES) test statistic.
31 Test the null hypothesis that two samples have the same underlying
32 probability distribution.
34 Parameters
35 ----------
36 x, y : array-like
37 The two samples of observations to be tested. Input must not have more
38 than one dimension. Samples can have different lengths.
39 t : array-like, optional
40 The points (t1, ..., tn) where the empirical characteristic function is
41 to be evaluated. It should be positive distinct numbers. The default
42 value (0.4, 0.8) is proposed in [1]_. Input must not have more than
43 one dimension.
45 Returns
46 -------
47 statistic : float
48 The test statistic.
49 pvalue : float
50 The associated p-value based on the asymptotic chi2-distribution.
52 See Also
53 --------
54 ks_2samp, anderson_ksamp
56 Notes
57 -----
58 Testing whether two samples are generated by the same underlying
59 distribution is a classical question in statistics. A widely used test is
60 the Kolmogorov-Smirnov (KS) test which relies on the empirical
61 distribution function. Epps and Singleton introduce a test based on the
62 empirical characteristic function in [1]_.
64 One advantage of the ES test compared to the KS test is that is does
65 not assume a continuous distribution. In [1]_, the authors conclude
66 that the test also has a higher power than the KS test in many
67 examples. They recommend the use of the ES test for discrete samples as
68 well as continuous samples with at least 25 observations each, whereas
69 `anderson_ksamp` is recommended for smaller sample sizes in the
70 continuous case.
72 The p-value is computed from the asymptotic distribution of the test
73 statistic which follows a `chi2` distribution. If the sample size of both
74 `x` and `y` is below 25, the small sample correction proposed in [1]_ is
75 applied to the test statistic.
77 The default values of `t` are determined in [1]_ by considering
78 various distributions and finding good values that lead to a high power
79 of the test in general. Table III in [1]_ gives the optimal values for
80 the distributions tested in that study. The values of `t` are scaled by
81 the semi-interquartile range in the implementation, see [1]_.
83 References
84 ----------
85 .. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample
86 problem using the empirical characteristic function", Journal of
87 Statistical Computation and Simulation 26, p. 177--203, 1986.
89 .. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions
90 - the Epps-Singleton two-sample test using the empirical characteristic
91 function", The Stata Journal 9(3), p. 454--465, 2009.
93 """
94 x, y, t = np.asarray(x), np.asarray(y), np.asarray(t)
95 # check if x and y are valid inputs
96 if x.ndim > 1:
97 raise ValueError('x must be 1d, but x.ndim equals {}.'.format(x.ndim))
98 if y.ndim > 1:
99 raise ValueError('y must be 1d, but y.ndim equals {}.'.format(y.ndim))
100 nx, ny = len(x), len(y)
101 if (nx < 5) or (ny < 5):
102 raise ValueError('x and y should have at least 5 elements, but len(x) '
103 '= {} and len(y) = {}.'.format(nx, ny))
104 if not np.isfinite(x).all():
105 raise ValueError('x must not contain nonfinite values.')
106 if not np.isfinite(y).all():
107 raise ValueError('y must not contain nonfinite values.')
108 n = nx + ny
110 # check if t is valid
111 if t.ndim > 1:
112 raise ValueError('t must be 1d, but t.ndim equals {}.'.format(t.ndim))
113 if np.less_equal(t, 0).any():
114 raise ValueError('t must contain positive elements only.')
116 # rescale t with semi-iqr as proposed in [1]; import iqr here to avoid
117 # circular import
118 from scipy.stats import iqr
119 sigma = iqr(np.hstack((x, y))) / 2
120 ts = np.reshape(t, (-1, 1)) / sigma
122 # covariance estimation of ES test
123 gx = np.vstack((np.cos(ts*x), np.sin(ts*x))).T # shape = (nx, 2*len(t))
124 gy = np.vstack((np.cos(ts*y), np.sin(ts*y))).T
125 cov_x = np.cov(gx.T, bias=True) # the test uses biased cov-estimate
126 cov_y = np.cov(gy.T, bias=True)
127 est_cov = (n/nx)*cov_x + (n/ny)*cov_y
128 est_cov_inv = np.linalg.pinv(est_cov)
129 r = np.linalg.matrix_rank(est_cov_inv)
130 if r < 2*len(t):
131 warnings.warn('Estimated covariance matrix does not have full rank. '
132 'This indicates a bad choice of the input t and the '
133 'test might not be consistent.') # see p. 183 in [1]_
135 # compute test statistic w distributed asympt. as chisquare with df=r
136 g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0)
137 w = n*np.dot(g_diff.T, np.dot(est_cov_inv, g_diff))
139 # apply small-sample correction
140 if (max(nx, ny) < 25):
141 corr = 1.0/(1.0 + n**(-0.45) + 10.1*(nx**(-1.7) + ny**(-1.7)))
142 w = corr * w
144 p = chi2.sf(w, r)
146 return Epps_Singleton_2sampResult(w, p)
149def poisson_means_test(k1, n1, k2, n2, *, diff=0, alternative='two-sided'):
150 r"""
151 Performs the Poisson means test, AKA the "E-test".
153 This is a test of the null hypothesis that the difference between means of
154 two Poisson distributions is `diff`. The samples are provided as the
155 number of events `k1` and `k2` observed within measurement intervals
156 (e.g. of time, space, number of observations) of sizes `n1` and `n2`.
158 Parameters
159 ----------
160 k1 : int
161 Number of events observed from distribution 1.
162 n1: float
163 Size of sample from distribution 1.
164 k2 : int
165 Number of events observed from distribution 2.
166 n2 : float
167 Size of sample from distribution 2.
168 diff : float, default=0
169 The hypothesized difference in means between the distributions
170 underlying the samples.
171 alternative : {'two-sided', 'less', 'greater'}, optional
172 Defines the alternative hypothesis.
173 The following options are available (default is 'two-sided'):
175 * 'two-sided': the difference between distribution means is not
176 equal to `diff`
177 * 'less': the difference between distribution means is less than
178 `diff`
179 * 'greater': the difference between distribution means is greater
180 than `diff`
182 Returns
183 -------
184 statistic : float
185 The test statistic (see [1]_ equation 3.3).
186 pvalue : float
187 The probability of achieving such an extreme value of the test
188 statistic under the null hypothesis.
190 Notes
191 -----
193 Let:
195 .. math:: X_1 \sim \mbox{Poisson}(\mathtt{n1}\lambda_1)
197 be a random variable independent of
199 .. math:: X_2 \sim \mbox{Poisson}(\mathtt{n2}\lambda_2)
201 and let ``k1`` and ``k2`` be the observed values of :math:`X_1`
202 and :math:`X_2`, respectively. Then `poisson_means_test` uses the number
203 of observed events ``k1`` and ``k2`` from samples of size ``n1`` and
204 ``n2``, respectively, to test the null hypothesis that
206 .. math::
207 H_0: \lambda_1 - \lambda_2 = \mathtt{diff}
209 A benefit of the E-test is that it has good power for small sample sizes,
210 which can reduce sampling costs [1]_. It has been evaluated and determined
211 to be more powerful than the comparable C-test, sometimes referred to as
212 the Poisson exact test.
214 References
215 ----------
216 .. [1] Krishnamoorthy, K., & Thomson, J. (2004). A more powerful test for
217 comparing two Poisson means. Journal of Statistical Planning and
218 Inference, 119(1), 23-35.
220 .. [2] Przyborowski, J., & Wilenski, H. (1940). Homogeneity of results in
221 testing samples from Poisson series: With an application to testing
222 clover seed for dodder. Biometrika, 31(3/4), 313-323.
224 Examples
225 --------
227 Suppose that a gardener wishes to test the number of dodder (weed) seeds
228 in a sack of clover seeds that they buy from a seed company. It has
229 previously been established that the number of dodder seeds in clover
230 follows the Poisson distribution.
232 A 100 gram sample is drawn from the sack before being shipped to the
233 gardener. The sample is analyzed, and it is found to contain no dodder
234 seeds; that is, `k1` is 0. However, upon arrival, the gardener draws
235 another 100 gram sample from the sack. This time, three dodder seeds are
236 found in the sample; that is, `k2` is 3. The gardener would like to
237 know if the difference is significant and not due to chance. The
238 null hypothesis is that the difference between the two samples is merely
239 due to chance, or that :math:`\lambda_1 - \lambda_2 = \mathtt{diff}`
240 where :math:`\mathtt{diff} = 0`. The alternative hypothesis is that the
241 difference is not due to chance, or :math:`\lambda_1 - \lambda_2 \ne 0`.
242 The gardener selects a significance level of 5% to reject the null
243 hypothesis in favor of the alternative [2]_.
245 >>> import scipy.stats as stats
246 >>> res = stats.poisson_means_test(0, 100, 3, 100)
247 >>> res.statistic, res.pvalue
248 (-1.7320508075688772, 0.08837900929018157)
250 The p-value is .088, indicating a near 9% chance of observing a value of
251 the test statistic under the null hypothesis. This exceeds 5%, so the
252 gardener does not reject the null hypothesis as the difference cannot be
253 regarded as significant at this level.
254 """
256 _poisson_means_test_iv(k1, n1, k2, n2, diff, alternative)
258 # "for a given k_1 and k_2, an estimate of \lambda_2 is given by" [1] (3.4)
259 lmbd_hat2 = ((k1 + k2) / (n1 + n2) - diff * n1 / (n1 + n2))
261 # "\hat{\lambda_{2k}} may be less than or equal to zero ... and in this
262 # case the null hypothesis cannot be rejected ... [and] it is not necessary
263 # to compute the p-value". [1] page 26 below eq. (3.6).
264 if lmbd_hat2 <= 0:
265 return _stats_py.SignificanceResult(0, 1)
267 # The unbiased variance estimate [1] (3.2)
268 var = k1 / (n1 ** 2) + k2 / (n2 ** 2)
270 # The _observed_ pivot statistic from the input. It follows the
271 # unnumbered equation following equation (3.3) This is used later in
272 # comparison with the computed pivot statistics in an indicator function.
273 t_k1k2 = (k1 / n1 - k2 / n2 - diff) / np.sqrt(var)
275 # Equation (3.5) of [1] is lengthy, so it is broken into several parts,
276 # beginning here. Note that the probability mass function of poisson is
277 # exp^(-\mu)*\mu^k/k!, so and this is called with shape \mu, here noted
278 # here as nlmbd_hat*. The strategy for evaluating the double summation in
279 # (3.5) is to create two arrays of the values of the two products inside
280 # the summation and then broadcast them together into a matrix, and then
281 # sum across the entire matrix.
283 # Compute constants (as seen in the first and second separated products in
284 # (3.5).). (This is the shape (\mu) parameter of the poisson distribution.)
285 nlmbd_hat1 = n1 * (lmbd_hat2 + diff)
286 nlmbd_hat2 = n2 * lmbd_hat2
288 # Determine summation bounds for tail ends of distribution rather than
289 # summing to infinity. `x1*` is for the outer sum and `x2*` is the inner
290 # sum.
291 x1_lb, x1_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat1)
292 x2_lb, x2_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat2)
294 # Construct arrays to function as the x_1 and x_2 counters on the summation
295 # in (3.5). `x1` is in columns and `x2` is in rows to allow for
296 # broadcasting.
297 x1 = np.arange(x1_lb, x1_ub + 1)
298 x2 = np.arange(x2_lb, x2_ub + 1)[:, None]
300 # These are the two products in equation (3.5) with `prob_x1` being the
301 # first (left side) and `prob_x2` being the second (right side). (To
302 # make as clear as possible: the 1st contains a "+ d" term, the 2nd does
303 # not.)
304 prob_x1 = distributions.poisson.pmf(x1, nlmbd_hat1)
305 prob_x2 = distributions.poisson.pmf(x2, nlmbd_hat2)
307 # compute constants for use in the "pivot statistic" per the
308 # unnumbered equation following (3.3).
309 lmbd_x1 = x1 / n1
310 lmbd_x2 = x2 / n2
311 lmbds_diff = lmbd_x1 - lmbd_x2 - diff
312 var_x1x2 = lmbd_x1 / n1 + lmbd_x2 / n2
314 # This is the 'pivot statistic' for use in the indicator of the summation
315 # (left side of "I[.]").
316 with np.errstate(invalid='ignore', divide='ignore'):
317 t_x1x2 = lmbds_diff / np.sqrt(var_x1x2)
319 # `[indicator]` implements the "I[.] ... the indicator function" per
320 # the paragraph following equation (3.5).
321 if alternative == 'two-sided':
322 indicator = np.abs(t_x1x2) >= np.abs(t_k1k2)
323 elif alternative == 'less':
324 indicator = t_x1x2 <= t_k1k2
325 else:
326 indicator = t_x1x2 >= t_k1k2
328 # Multiply all combinations of the products together, exclude terms
329 # based on the `indicator` and then sum. (3.5)
330 pvalue = np.sum((prob_x1 * prob_x2)[indicator])
331 return _stats_py.SignificanceResult(t_k1k2, pvalue)
334def _poisson_means_test_iv(k1, n1, k2, n2, diff, alternative):
335 # """check for valid types and values of input to `poisson_mean_test`."""
336 if k1 != int(k1) or k2 != int(k2):
337 raise TypeError('`k1` and `k2` must be integers.')
339 count_err = '`k1` and `k2` must be greater than or equal to 0.'
340 if k1 < 0 or k2 < 0:
341 raise ValueError(count_err)
343 if n1 <= 0 or n2 <= 0:
344 raise ValueError('`n1` and `n2` must be greater than 0.')
346 if diff < 0:
347 raise ValueError('diff must be greater than or equal to 0.')
349 alternatives = {'two-sided', 'less', 'greater'}
350 if alternative.lower() not in alternatives:
351 raise ValueError(f"Alternative must be one of '{alternatives}'.")
354class CramerVonMisesResult:
355 def __init__(self, statistic, pvalue):
356 self.statistic = statistic
357 self.pvalue = pvalue
359 def __repr__(self):
360 return (f"{self.__class__.__name__}(statistic={self.statistic}, "
361 f"pvalue={self.pvalue})")
364def _psi1_mod(x):
365 """
366 psi1 is defined in equation 1.10 in Csörgő, S. and Faraway, J. (1996).
367 This implements a modified version by excluding the term V(x) / 12
368 (here: _cdf_cvm_inf(x) / 12) to avoid evaluating _cdf_cvm_inf(x)
369 twice in _cdf_cvm.
371 Implementation based on MAPLE code of Julian Faraway and R code of the
372 function pCvM in the package goftest (v1.1.1), permission granted
373 by Adrian Baddeley. Main difference in the implementation: the code
374 here keeps adding terms of the series until the terms are small enough.
375 """
377 def _ed2(y):
378 z = y**2 / 4
379 b = kv(1/4, z) + kv(3/4, z)
380 return np.exp(-z) * (y/2)**(3/2) * b / np.sqrt(np.pi)
382 def _ed3(y):
383 z = y**2 / 4
384 c = np.exp(-z) / np.sqrt(np.pi)
385 return c * (y/2)**(5/2) * (2*kv(1/4, z) + 3*kv(3/4, z) - kv(5/4, z))
387 def _Ak(k, x):
388 m = 2*k + 1
389 sx = 2 * np.sqrt(x)
390 y1 = x**(3/4)
391 y2 = x**(5/4)
393 e1 = m * gamma(k + 1/2) * _ed2((4 * k + 3)/sx) / (9 * y1)
394 e2 = gamma(k + 1/2) * _ed3((4 * k + 1) / sx) / (72 * y2)
395 e3 = 2 * (m + 2) * gamma(k + 3/2) * _ed3((4 * k + 5) / sx) / (12 * y2)
396 e4 = 7 * m * gamma(k + 1/2) * _ed2((4 * k + 1) / sx) / (144 * y1)
397 e5 = 7 * m * gamma(k + 1/2) * _ed2((4 * k + 5) / sx) / (144 * y1)
399 return e1 + e2 + e3 + e4 + e5
401 x = np.asarray(x)
402 tot = np.zeros_like(x, dtype='float')
403 cond = np.ones_like(x, dtype='bool')
404 k = 0
405 while np.any(cond):
406 z = -_Ak(k, x[cond]) / (np.pi * gamma(k + 1))
407 tot[cond] = tot[cond] + z
408 cond[cond] = np.abs(z) >= 1e-7
409 k += 1
411 return tot
414def _cdf_cvm_inf(x):
415 """
416 Calculate the cdf of the Cramér-von Mises statistic (infinite sample size).
418 See equation 1.2 in Csörgő, S. and Faraway, J. (1996).
420 Implementation based on MAPLE code of Julian Faraway and R code of the
421 function pCvM in the package goftest (v1.1.1), permission granted
422 by Adrian Baddeley. Main difference in the implementation: the code
423 here keeps adding terms of the series until the terms are small enough.
425 The function is not expected to be accurate for large values of x, say
426 x > 4, when the cdf is very close to 1.
427 """
428 x = np.asarray(x)
430 def term(x, k):
431 # this expression can be found in [2], second line of (1.3)
432 u = np.exp(gammaln(k + 0.5) - gammaln(k+1)) / (np.pi**1.5 * np.sqrt(x))
433 y = 4*k + 1
434 q = y**2 / (16*x)
435 b = kv(0.25, q)
436 return u * np.sqrt(y) * np.exp(-q) * b
438 tot = np.zeros_like(x, dtype='float')
439 cond = np.ones_like(x, dtype='bool')
440 k = 0
441 while np.any(cond):
442 z = term(x[cond], k)
443 tot[cond] = tot[cond] + z
444 cond[cond] = np.abs(z) >= 1e-7
445 k += 1
447 return tot
450def _cdf_cvm(x, n=None):
451 """
452 Calculate the cdf of the Cramér-von Mises statistic for a finite sample
453 size n. If N is None, use the asymptotic cdf (n=inf).
455 See equation 1.8 in Csörgő, S. and Faraway, J. (1996) for finite samples,
456 1.2 for the asymptotic cdf.
458 The function is not expected to be accurate for large values of x, say
459 x > 2, when the cdf is very close to 1 and it might return values > 1
460 in that case, e.g. _cdf_cvm(2.0, 12) = 1.0000027556716846. Moreover, it
461 is not accurate for small values of n, especially close to the bounds of
462 the distribution's domain, [1/(12*n), n/3], where the value jumps to 0
463 and 1, respectively. These are limitations of the approximation by Csörgő
464 and Faraway (1996) implemented in this function.
465 """
466 x = np.asarray(x)
467 if n is None:
468 y = _cdf_cvm_inf(x)
469 else:
470 # support of the test statistic is [12/n, n/3], see 1.1 in [2]
471 y = np.zeros_like(x, dtype='float')
472 sup = (1./(12*n) < x) & (x < n/3.)
473 # note: _psi1_mod does not include the term _cdf_cvm_inf(x) / 12
474 # therefore, we need to add it here
475 y[sup] = _cdf_cvm_inf(x[sup]) * (1 + 1./(12*n)) + _psi1_mod(x[sup]) / n
476 y[x >= n/3] = 1
478 if y.ndim == 0:
479 return y[()]
480 return y
483def cramervonmises(rvs, cdf, args=()):
484 """Perform the one-sample Cramér-von Mises test for goodness of fit.
486 This performs a test of the goodness of fit of a cumulative distribution
487 function (cdf) :math:`F` compared to the empirical distribution function
488 :math:`F_n` of observed random variates :math:`X_1, ..., X_n` that are
489 assumed to be independent and identically distributed ([1]_).
490 The null hypothesis is that the :math:`X_i` have cumulative distribution
491 :math:`F`.
493 Parameters
494 ----------
495 rvs : array_like
496 A 1-D array of observed values of the random variables :math:`X_i`.
497 cdf : str or callable
498 The cumulative distribution function :math:`F` to test the
499 observations against. If a string, it should be the name of a
500 distribution in `scipy.stats`. If a callable, that callable is used
501 to calculate the cdf: ``cdf(x, *args) -> float``.
502 args : tuple, optional
503 Distribution parameters. These are assumed to be known; see Notes.
505 Returns
506 -------
507 res : object with attributes
508 statistic : float
509 Cramér-von Mises statistic.
510 pvalue : float
511 The p-value.
513 See Also
514 --------
515 kstest, cramervonmises_2samp
517 Notes
518 -----
519 .. versionadded:: 1.6.0
521 The p-value relies on the approximation given by equation 1.8 in [2]_.
522 It is important to keep in mind that the p-value is only accurate if
523 one tests a simple hypothesis, i.e. the parameters of the reference
524 distribution are known. If the parameters are estimated from the data
525 (composite hypothesis), the computed p-value is not reliable.
527 References
528 ----------
529 .. [1] Cramér-von Mises criterion, Wikipedia,
530 https://en.wikipedia.org/wiki/Cram%C3%A9r%E2%80%93von_Mises_criterion
531 .. [2] Csörgő, S. and Faraway, J. (1996). The Exact and Asymptotic
532 Distribution of Cramér-von Mises Statistics. Journal of the
533 Royal Statistical Society, pp. 221-234.
535 Examples
536 --------
538 Suppose we wish to test whether data generated by ``scipy.stats.norm.rvs``
539 were, in fact, drawn from the standard normal distribution. We choose a
540 significance level of alpha=0.05.
542 >>> import numpy as np
543 >>> from scipy import stats
544 >>> rng = np.random.default_rng()
545 >>> x = stats.norm.rvs(size=500, random_state=rng)
546 >>> res = stats.cramervonmises(x, 'norm')
547 >>> res.statistic, res.pvalue
548 (0.49121480855028343, 0.04189256516661377)
550 The p-value 0.79 exceeds our chosen significance level, so we do not
551 reject the null hypothesis that the observed sample is drawn from the
552 standard normal distribution.
554 Now suppose we wish to check whether the same samples shifted by 2.1 is
555 consistent with being drawn from a normal distribution with a mean of 2.
557 >>> y = x + 2.1
558 >>> res = stats.cramervonmises(y, 'norm', args=(2,))
559 >>> res.statistic, res.pvalue
560 (0.07400330012187435, 0.7274595666160468)
562 Here we have used the `args` keyword to specify the mean (``loc``)
563 of the normal distribution to test the data against. This is equivalent
564 to the following, in which we create a frozen normal distribution with
565 mean 2.1, then pass its ``cdf`` method as an argument.
567 >>> frozen_dist = stats.norm(loc=2)
568 >>> res = stats.cramervonmises(y, frozen_dist.cdf)
569 >>> res.statistic, res.pvalue
570 (0.07400330012187435, 0.7274595666160468)
572 In either case, we would reject the null hypothesis that the observed
573 sample is drawn from a normal distribution with a mean of 2 (and default
574 variance of 1) because the p-value 0.04 is less than our chosen
575 significance level.
577 """
578 if isinstance(cdf, str):
579 cdf = getattr(distributions, cdf).cdf
581 vals = np.sort(np.asarray(rvs))
583 if vals.size <= 1:
584 raise ValueError('The sample must contain at least two observations.')
585 if vals.ndim > 1:
586 raise ValueError('The sample must be one-dimensional.')
588 n = len(vals)
589 cdfvals = cdf(vals, *args)
591 u = (2*np.arange(1, n+1) - 1)/(2*n)
592 w = 1/(12*n) + np.sum((u - cdfvals)**2)
594 # avoid small negative values that can occur due to the approximation
595 p = max(0, 1. - _cdf_cvm(w, n))
597 return CramerVonMisesResult(statistic=w, pvalue=p)
600def _get_wilcoxon_distr(n):
601 """
602 Distribution of probability of the Wilcoxon ranksum statistic r_plus (sum
603 of ranks of positive differences).
604 Returns an array with the probabilities of all the possible ranks
605 r = 0, ..., n*(n+1)/2
606 """
607 c = np.ones(1, dtype=np.double)
608 for k in range(1, n + 1):
609 prev_c = c
610 c = np.zeros(k * (k + 1) // 2 + 1, dtype=np.double)
611 m = len(prev_c)
612 c[:m] = prev_c * 0.5
613 c[-m:] += prev_c * 0.5
614 return c
617def _get_wilcoxon_distr2(n):
618 """
619 Distribution of probability of the Wilcoxon ranksum statistic r_plus (sum
620 of ranks of positive differences).
621 Returns an array with the probabilities of all the possible ranks
622 r = 0, ..., n*(n+1)/2
623 This is a slower reference function
624 References
625 ----------
626 .. [1] 1. Harris T, Hardin JW. Exact Wilcoxon Signed-Rank and Wilcoxon
627 Mann-Whitney Ranksum Tests. The Stata Journal. 2013;13(2):337-343.
628 """
629 ai = np.arange(1, n+1)[:, None]
630 t = n*(n+1)/2
631 q = 2*t
632 j = np.arange(q)
633 theta = 2*np.pi/q*j
634 phi_sp = np.prod(np.cos(theta*ai), axis=0)
635 phi_s = np.exp(1j*theta*t) * phi_sp
636 p = np.real(ifft(phi_s))
637 res = np.zeros(int(t)+1)
638 res[:-1:] = p[::2]
639 res[0] /= 2
640 res[-1] = res[0]
641 return res
644def _tau_b(A):
645 """Calculate Kendall's tau-b and p-value from contingency table."""
646 # See [2] 2.2 and 4.2
648 # contingency table must be truly 2D
649 if A.shape[0] == 1 or A.shape[1] == 1:
650 return np.nan, np.nan
652 NA = A.sum()
653 PA = _P(A)
654 QA = _Q(A)
655 Sri2 = (A.sum(axis=1)**2).sum()
656 Scj2 = (A.sum(axis=0)**2).sum()
657 denominator = (NA**2 - Sri2)*(NA**2 - Scj2)
659 tau = (PA-QA)/(denominator)**0.5
661 numerator = 4*(_a_ij_Aij_Dij2(A) - (PA - QA)**2 / NA)
662 s02_tau_b = numerator/denominator
663 if s02_tau_b == 0: # Avoid divide by zero
664 return tau, 0
665 Z = tau/s02_tau_b**0.5
666 p = 2*norm.sf(abs(Z)) # 2-sided p-value
668 return tau, p
671def _somers_d(A, alternative='two-sided'):
672 """Calculate Somers' D and p-value from contingency table."""
673 # See [3] page 1740
675 # contingency table must be truly 2D
676 if A.shape[0] <= 1 or A.shape[1] <= 1:
677 return np.nan, np.nan
679 NA = A.sum()
680 NA2 = NA**2
681 PA = _P(A)
682 QA = _Q(A)
683 Sri2 = (A.sum(axis=1)**2).sum()
685 d = (PA - QA)/(NA2 - Sri2)
687 S = _a_ij_Aij_Dij2(A) - (PA-QA)**2/NA
689 with np.errstate(divide='ignore'):
690 Z = (PA - QA)/(4*(S))**0.5
692 _, p = scipy.stats._stats_py._normtest_finish(Z, alternative)
694 return d, p
697SomersDResult = make_dataclass("SomersDResult",
698 ("statistic", "pvalue", "table"))
701def somersd(x, y=None, alternative='two-sided'):
702 r"""Calculates Somers' D, an asymmetric measure of ordinal association.
704 Like Kendall's :math:`\tau`, Somers' :math:`D` is a measure of the
705 correspondence between two rankings. Both statistics consider the
706 difference between the number of concordant and discordant pairs in two
707 rankings :math:`X` and :math:`Y`, and both are normalized such that values
708 close to 1 indicate strong agreement and values close to -1 indicate
709 strong disagreement. They differ in how they are normalized. To show the
710 relationship, Somers' :math:`D` can be defined in terms of Kendall's
711 :math:`\tau_a`:
713 .. math::
714 D(Y|X) = \frac{\tau_a(X, Y)}{\tau_a(X, X)}
716 Suppose the first ranking :math:`X` has :math:`r` distinct ranks and the
717 second ranking :math:`Y` has :math:`s` distinct ranks. These two lists of
718 :math:`n` rankings can also be viewed as an :math:`r \times s` contingency
719 table in which element :math:`i, j` is the number of rank pairs with rank
720 :math:`i` in ranking :math:`X` and rank :math:`j` in ranking :math:`Y`.
721 Accordingly, `somersd` also allows the input data to be supplied as a
722 single, 2D contingency table instead of as two separate, 1D rankings.
724 Note that the definition of Somers' :math:`D` is asymmetric: in general,
725 :math:`D(Y|X) \neq D(X|Y)`. ``somersd(x, y)`` calculates Somers'
726 :math:`D(Y|X)`: the "row" variable :math:`X` is treated as an independent
727 variable, and the "column" variable :math:`Y` is dependent. For Somers'
728 :math:`D(X|Y)`, swap the input lists or transpose the input table.
730 Parameters
731 ----------
732 x : array_like
733 1D array of rankings, treated as the (row) independent variable.
734 Alternatively, a 2D contingency table.
735 y : array_like, optional
736 If `x` is a 1D array of rankings, `y` is a 1D array of rankings of the
737 same length, treated as the (column) dependent variable.
738 If `x` is 2D, `y` is ignored.
739 alternative : {'two-sided', 'less', 'greater'}, optional
740 Defines the alternative hypothesis. Default is 'two-sided'.
741 The following options are available:
742 * 'two-sided': the rank correlation is nonzero
743 * 'less': the rank correlation is negative (less than zero)
744 * 'greater': the rank correlation is positive (greater than zero)
746 Returns
747 -------
748 res : SomersDResult
749 A `SomersDResult` object with the following fields:
751 statistic : float
752 The Somers' :math:`D` statistic.
753 pvalue : float
754 The p-value for a hypothesis test whose null
755 hypothesis is an absence of association, :math:`D=0`.
756 See notes for more information.
757 table : 2D array
758 The contingency table formed from rankings `x` and `y` (or the
759 provided contingency table, if `x` is a 2D array)
761 See Also
762 --------
763 kendalltau : Calculates Kendall's tau, another correlation measure.
764 weightedtau : Computes a weighted version of Kendall's tau.
765 spearmanr : Calculates a Spearman rank-order correlation coefficient.
766 pearsonr : Calculates a Pearson correlation coefficient.
768 Notes
769 -----
770 This function follows the contingency table approach of [2]_ and
771 [3]_. *p*-values are computed based on an asymptotic approximation of
772 the test statistic distribution under the null hypothesis :math:`D=0`.
774 Theoretically, hypothesis tests based on Kendall's :math:`tau` and Somers'
775 :math:`D` should be identical.
776 However, the *p*-values returned by `kendalltau` are based
777 on the null hypothesis of *independence* between :math:`X` and :math:`Y`
778 (i.e. the population from which pairs in :math:`X` and :math:`Y` are
779 sampled contains equal numbers of all possible pairs), which is more
780 specific than the null hypothesis :math:`D=0` used here. If the null
781 hypothesis of independence is desired, it is acceptable to use the
782 *p*-value returned by `kendalltau` with the statistic returned by
783 `somersd` and vice versa. For more information, see [2]_.
785 Contingency tables are formatted according to the convention used by
786 SAS and R: the first ranking supplied (``x``) is the "row" variable, and
787 the second ranking supplied (``y``) is the "column" variable. This is
788 opposite the convention of Somers' original paper [1]_.
790 References
791 ----------
792 .. [1] Robert H. Somers, "A New Asymmetric Measure of Association for
793 Ordinal Variables", *American Sociological Review*, Vol. 27, No. 6,
794 pp. 799--811, 1962.
796 .. [2] Morton B. Brown and Jacqueline K. Benedetti, "Sampling Behavior of
797 Tests for Correlation in Two-Way Contingency Tables", *Journal of
798 the American Statistical Association* Vol. 72, No. 358, pp.
799 309--315, 1977.
801 .. [3] SAS Institute, Inc., "The FREQ Procedure (Book Excerpt)",
802 *SAS/STAT 9.2 User's Guide, Second Edition*, SAS Publishing, 2009.
804 .. [4] Laerd Statistics, "Somers' d using SPSS Statistics", *SPSS
805 Statistics Tutorials and Statistical Guides*,
806 https://statistics.laerd.com/spss-tutorials/somers-d-using-spss-statistics.php,
807 Accessed July 31, 2020.
809 Examples
810 --------
811 We calculate Somers' D for the example given in [4]_, in which a hotel
812 chain owner seeks to determine the association between hotel room
813 cleanliness and customer satisfaction. The independent variable, hotel
814 room cleanliness, is ranked on an ordinal scale: "below average (1)",
815 "average (2)", or "above average (3)". The dependent variable, customer
816 satisfaction, is ranked on a second scale: "very dissatisfied (1)",
817 "moderately dissatisfied (2)", "neither dissatisfied nor satisfied (3)",
818 "moderately satisfied (4)", or "very satisfied (5)". 189 customers
819 respond to the survey, and the results are cast into a contingency table
820 with the hotel room cleanliness as the "row" variable and customer
821 satisfaction as the "column" variable.
823 +-----+-----+-----+-----+-----+-----+
824 | | (1) | (2) | (3) | (4) | (5) |
825 +=====+=====+=====+=====+=====+=====+
826 | (1) | 27 | 25 | 14 | 7 | 0 |
827 +-----+-----+-----+-----+-----+-----+
828 | (2) | 7 | 14 | 18 | 35 | 12 |
829 +-----+-----+-----+-----+-----+-----+
830 | (3) | 1 | 3 | 2 | 7 | 17 |
831 +-----+-----+-----+-----+-----+-----+
833 For example, 27 customers assigned their room a cleanliness ranking of
834 "below average (1)" and a corresponding satisfaction of "very
835 dissatisfied (1)". We perform the analysis as follows.
837 >>> from scipy.stats import somersd
838 >>> table = [[27, 25, 14, 7, 0], [7, 14, 18, 35, 12], [1, 3, 2, 7, 17]]
839 >>> res = somersd(table)
840 >>> res.statistic
841 0.6032766111513396
842 >>> res.pvalue
843 1.0007091191074533e-27
845 The value of the Somers' D statistic is approximately 0.6, indicating
846 a positive correlation between room cleanliness and customer satisfaction
847 in the sample.
848 The *p*-value is very small, indicating a very small probability of
849 observing such an extreme value of the statistic under the null
850 hypothesis that the statistic of the entire population (from which
851 our sample of 189 customers is drawn) is zero. This supports the
852 alternative hypothesis that the true value of Somers' D for the population
853 is nonzero.
855 """
856 x, y = np.array(x), np.array(y)
857 if x.ndim == 1:
858 if x.size != y.size:
859 raise ValueError("Rankings must be of equal length.")
860 table = scipy.stats.contingency.crosstab(x, y)[1]
861 elif x.ndim == 2:
862 if np.any(x < 0):
863 raise ValueError("All elements of the contingency table must be "
864 "non-negative.")
865 if np.any(x != x.astype(int)):
866 raise ValueError("All elements of the contingency table must be "
867 "integer.")
868 if x.nonzero()[0].size < 2:
869 raise ValueError("At least two elements of the contingency table "
870 "must be nonzero.")
871 table = x
872 else:
873 raise ValueError("x must be either a 1D or 2D array")
874 d, p = _somers_d(table, alternative)
876 # add alias for consistency with other correlation functions
877 res = SomersDResult(d, p, table)
878 res.correlation = d
879 return res
882# This could be combined with `_all_partitions` in `_resampling.py`
883def _all_partitions(nx, ny):
884 """
885 Partition a set of indices into two fixed-length sets in all possible ways
887 Partition a set of indices 0 ... nx + ny - 1 into two sets of length nx and
888 ny in all possible ways (ignoring order of elements).
889 """
890 z = np.arange(nx+ny)
891 for c in combinations(z, nx):
892 x = np.array(c)
893 mask = np.ones(nx+ny, bool)
894 mask[x] = False
895 y = z[mask]
896 yield x, y
899def _compute_log_combinations(n):
900 """Compute all log combination of C(n, k)."""
901 gammaln_arr = gammaln(np.arange(n + 1) + 1)
902 return gammaln(n + 1) - gammaln_arr - gammaln_arr[::-1]
905BarnardExactResult = make_dataclass(
906 "BarnardExactResult", [("statistic", float), ("pvalue", float)]
907)
910def barnard_exact(table, alternative="two-sided", pooled=True, n=32):
911 r"""Perform a Barnard exact test on a 2x2 contingency table.
913 Parameters
914 ----------
915 table : array_like of ints
916 A 2x2 contingency table. Elements should be non-negative integers.
918 alternative : {'two-sided', 'less', 'greater'}, optional
919 Defines the null and alternative hypotheses. Default is 'two-sided'.
920 Please see explanations in the Notes section below.
922 pooled : bool, optional
923 Whether to compute score statistic with pooled variance (as in
924 Student's t-test, for example) or unpooled variance (as in Welch's
925 t-test). Default is ``True``.
927 n : int, optional
928 Number of sampling points used in the construction of the sampling
929 method. Note that this argument will automatically be converted to
930 the next higher power of 2 since `scipy.stats.qmc.Sobol` is used to
931 select sample points. Default is 32. Must be positive. In most cases,
932 32 points is enough to reach good precision. More points comes at
933 performance cost.
935 Returns
936 -------
937 ber : BarnardExactResult
938 A result object with the following attributes.
940 statistic : float
941 The Wald statistic with pooled or unpooled variance, depending
942 on the user choice of `pooled`.
944 pvalue : float
945 P-value, the probability of obtaining a distribution at least as
946 extreme as the one that was actually observed, assuming that the
947 null hypothesis is true.
949 See Also
950 --------
951 chi2_contingency : Chi-square test of independence of variables in a
952 contingency table.
953 fisher_exact : Fisher exact test on a 2x2 contingency table.
954 boschloo_exact : Boschloo's exact test on a 2x2 contingency table,
955 which is an uniformly more powerful alternative to Fisher's exact test.
957 Notes
958 -----
959 Barnard's test is an exact test used in the analysis of contingency
960 tables. It examines the association of two categorical variables, and
961 is a more powerful alternative than Fisher's exact test
962 for 2x2 contingency tables.
964 Let's define :math:`X_0` a 2x2 matrix representing the observed sample,
965 where each column stores the binomial experiment, as in the example
966 below. Let's also define :math:`p_1, p_2` the theoretical binomial
967 probabilities for :math:`x_{11}` and :math:`x_{12}`. When using
968 Barnard exact test, we can assert three different null hypotheses :
970 - :math:`H_0 : p_1 \geq p_2` versus :math:`H_1 : p_1 < p_2`,
971 with `alternative` = "less"
973 - :math:`H_0 : p_1 \leq p_2` versus :math:`H_1 : p_1 > p_2`,
974 with `alternative` = "greater"
976 - :math:`H_0 : p_1 = p_2` versus :math:`H_1 : p_1 \neq p_2`,
977 with `alternative` = "two-sided" (default one)
979 In order to compute Barnard's exact test, we are using the Wald
980 statistic [3]_ with pooled or unpooled variance.
981 Under the default assumption that both variances are equal
982 (``pooled = True``), the statistic is computed as:
984 .. math::
986 T(X) = \frac{
987 \hat{p}_1 - \hat{p}_2
988 }{
989 \sqrt{
990 \hat{p}(1 - \hat{p})
991 (\frac{1}{c_1} +
992 \frac{1}{c_2})
993 }
994 }
996 with :math:`\hat{p}_1, \hat{p}_2` and :math:`\hat{p}` the estimator of
997 :math:`p_1, p_2` and :math:`p`, the latter being the combined probability,
998 given the assumption that :math:`p_1 = p_2`.
1000 If this assumption is invalid (``pooled = False``), the statistic is:
1002 .. math::
1004 T(X) = \frac{
1005 \hat{p}_1 - \hat{p}_2
1006 }{
1007 \sqrt{
1008 \frac{\hat{p}_1 (1 - \hat{p}_1)}{c_1} +
1009 \frac{\hat{p}_2 (1 - \hat{p}_2)}{c_2}
1010 }
1011 }
1013 The p-value is then computed as:
1015 .. math::
1017 \sum
1018 \binom{c_1}{x_{11}}
1019 \binom{c_2}{x_{12}}
1020 \pi^{x_{11} + x_{12}}
1021 (1 - \pi)^{t - x_{11} - x_{12}}
1023 where the sum is over all 2x2 contingency tables :math:`X` such that:
1024 * :math:`T(X) \leq T(X_0)` when `alternative` = "less",
1025 * :math:`T(X) \geq T(X_0)` when `alternative` = "greater", or
1026 * :math:`T(X) \geq |T(X_0)|` when `alternative` = "two-sided".
1027 Above, :math:`c_1, c_2` are the sum of the columns 1 and 2,
1028 and :math:`t` the total (sum of the 4 sample's element).
1030 The returned p-value is the maximum p-value taken over the nuisance
1031 parameter :math:`\pi`, where :math:`0 \leq \pi \leq 1`.
1033 This function's complexity is :math:`O(n c_1 c_2)`, where `n` is the
1034 number of sample points.
1036 References
1037 ----------
1038 .. [1] Barnard, G. A. "Significance Tests for 2x2 Tables". *Biometrika*.
1039 34.1/2 (1947): 123-138. :doi:`dpgkg3`
1041 .. [2] Mehta, Cyrus R., and Pralay Senchaudhuri. "Conditional versus
1042 unconditional exact tests for comparing two binomials."
1043 *Cytel Software Corporation* 675 (2003): 1-5.
1045 .. [3] "Wald Test". *Wikipedia*. https://en.wikipedia.org/wiki/Wald_test
1047 Examples
1048 --------
1049 An example use of Barnard's test is presented in [2]_.
1051 Consider the following example of a vaccine efficacy study
1052 (Chan, 1998). In a randomized clinical trial of 30 subjects, 15 were
1053 inoculated with a recombinant DNA influenza vaccine and the 15 were
1054 inoculated with a placebo. Twelve of the 15 subjects in the placebo
1055 group (80%) eventually became infected with influenza whereas for the
1056 vaccine group, only 7 of the 15 subjects (47%) became infected. The
1057 data are tabulated as a 2 x 2 table::
1059 Vaccine Placebo
1060 Yes 7 12
1061 No 8 3
1063 When working with statistical hypothesis testing, we usually use a
1064 threshold probability or significance level upon which we decide
1065 to reject the null hypothesis :math:`H_0`. Suppose we choose the common
1066 significance level of 5%.
1068 Our alternative hypothesis is that the vaccine will lower the chance of
1069 becoming infected with the virus; that is, the probability :math:`p_1` of
1070 catching the virus with the vaccine will be *less than* the probability
1071 :math:`p_2` of catching the virus without the vaccine. Therefore, we call
1072 `barnard_exact` with the ``alternative="less"`` option:
1074 >>> import scipy.stats as stats
1075 >>> res = stats.barnard_exact([[7, 12], [8, 3]], alternative="less")
1076 >>> res.statistic
1077 -1.894...
1078 >>> res.pvalue
1079 0.03407...
1081 Under the null hypothesis that the vaccine will not lower the chance of
1082 becoming infected, the probability of obtaining test results at least as
1083 extreme as the observed data is approximately 3.4%. Since this p-value is
1084 less than our chosen significance level, we have evidence to reject
1085 :math:`H_0` in favor of the alternative.
1087 Suppose we had used Fisher's exact test instead:
1089 >>> _, pvalue = stats.fisher_exact([[7, 12], [8, 3]], alternative="less")
1090 >>> pvalue
1091 0.0640...
1093 With the same threshold significance of 5%, we would not have been able
1094 to reject the null hypothesis in favor of the alternative. As stated in
1095 [2]_, Barnard's test is uniformly more powerful than Fisher's exact test
1096 because Barnard's test does not condition on any margin. Fisher's test
1097 should only be used when both sets of marginals are fixed.
1099 """
1100 if n <= 0:
1101 raise ValueError(
1102 "Number of points `n` must be strictly positive, "
1103 f"found {n!r}"
1104 )
1106 table = np.asarray(table, dtype=np.int64)
1108 if not table.shape == (2, 2):
1109 raise ValueError("The input `table` must be of shape (2, 2).")
1111 if np.any(table < 0):
1112 raise ValueError("All values in `table` must be nonnegative.")
1114 if 0 in table.sum(axis=0):
1115 # If both values in column are zero, the p-value is 1 and
1116 # the score's statistic is NaN.
1117 return BarnardExactResult(np.nan, 1.0)
1119 total_col_1, total_col_2 = table.sum(axis=0)
1121 x1 = np.arange(total_col_1 + 1, dtype=np.int64).reshape(-1, 1)
1122 x2 = np.arange(total_col_2 + 1, dtype=np.int64).reshape(1, -1)
1124 # We need to calculate the wald statistics for each combination of x1 and
1125 # x2.
1126 p1, p2 = x1 / total_col_1, x2 / total_col_2
1128 if pooled:
1129 p = (x1 + x2) / (total_col_1 + total_col_2)
1130 variances = p * (1 - p) * (1 / total_col_1 + 1 / total_col_2)
1131 else:
1132 variances = p1 * (1 - p1) / total_col_1 + p2 * (1 - p2) / total_col_2
1134 # To avoid warning when dividing by 0
1135 with np.errstate(divide="ignore", invalid="ignore"):
1136 wald_statistic = np.divide((p1 - p2), np.sqrt(variances))
1138 wald_statistic[p1 == p2] = 0 # Removing NaN values
1140 wald_stat_obs = wald_statistic[table[0, 0], table[0, 1]]
1142 if alternative == "two-sided":
1143 index_arr = np.abs(wald_statistic) >= abs(wald_stat_obs)
1144 elif alternative == "less":
1145 index_arr = wald_statistic <= wald_stat_obs
1146 elif alternative == "greater":
1147 index_arr = wald_statistic >= wald_stat_obs
1148 else:
1149 msg = (
1150 "`alternative` should be one of {'two-sided', 'less', 'greater'},"
1151 f" found {alternative!r}"
1152 )
1153 raise ValueError(msg)
1155 x1_sum_x2 = x1 + x2
1157 x1_log_comb = _compute_log_combinations(total_col_1)
1158 x2_log_comb = _compute_log_combinations(total_col_2)
1159 x1_sum_x2_log_comb = x1_log_comb[x1] + x2_log_comb[x2]
1161 result = shgo(
1162 _get_binomial_log_p_value_with_nuisance_param,
1163 args=(x1_sum_x2, x1_sum_x2_log_comb, index_arr),
1164 bounds=((0, 1),),
1165 n=n,
1166 sampling_method="sobol",
1167 )
1169 # result.fun is the negative log pvalue and therefore needs to be
1170 # changed before return
1171 p_value = np.clip(np.exp(-result.fun), a_min=0, a_max=1)
1172 return BarnardExactResult(wald_stat_obs, p_value)
1175BoschlooExactResult = make_dataclass(
1176 "BoschlooExactResult", [("statistic", float), ("pvalue", float)]
1177)
1180def boschloo_exact(table, alternative="two-sided", n=32):
1181 r"""Perform Boschloo's exact test on a 2x2 contingency table.
1183 Parameters
1184 ----------
1185 table : array_like of ints
1186 A 2x2 contingency table. Elements should be non-negative integers.
1188 alternative : {'two-sided', 'less', 'greater'}, optional
1189 Defines the null and alternative hypotheses. Default is 'two-sided'.
1190 Please see explanations in the Notes section below.
1192 n : int, optional
1193 Number of sampling points used in the construction of the sampling
1194 method. Note that this argument will automatically be converted to
1195 the next higher power of 2 since `scipy.stats.qmc.Sobol` is used to
1196 select sample points. Default is 32. Must be positive. In most cases,
1197 32 points is enough to reach good precision. More points comes at
1198 performance cost.
1200 Returns
1201 -------
1202 ber : BoschlooExactResult
1203 A result object with the following attributes.
1205 statistic : float
1206 The statistic used in Boschloo's test; that is, the p-value
1207 from Fisher's exact test.
1209 pvalue : float
1210 P-value, the probability of obtaining a distribution at least as
1211 extreme as the one that was actually observed, assuming that the
1212 null hypothesis is true.
1214 See Also
1215 --------
1216 chi2_contingency : Chi-square test of independence of variables in a
1217 contingency table.
1218 fisher_exact : Fisher exact test on a 2x2 contingency table.
1219 barnard_exact : Barnard's exact test, which is a more powerful alternative
1220 than Fisher's exact test for 2x2 contingency tables.
1222 Notes
1223 -----
1224 Boschloo's test is an exact test used in the analysis of contingency
1225 tables. It examines the association of two categorical variables, and
1226 is a uniformly more powerful alternative to Fisher's exact test
1227 for 2x2 contingency tables.
1229 Boschloo's exact test uses the p-value of Fisher's exact test as a
1230 statistic, and Boschloo's p-value is the probability under the null
1231 hypothesis of observing such an extreme value of this statistic.
1233 Let's define :math:`X_0` a 2x2 matrix representing the observed sample,
1234 where each column stores the binomial experiment, as in the example
1235 below. Let's also define :math:`p_1, p_2` the theoretical binomial
1236 probabilities for :math:`x_{11}` and :math:`x_{12}`. When using
1237 Boschloo exact test, we can assert three different alternative hypotheses:
1239 - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 < p_2`,
1240 with `alternative` = "less"
1242 - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 > p_2`,
1243 with `alternative` = "greater"
1245 - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 \neq p_2`,
1246 with `alternative` = "two-sided" (default)
1248 There are multiple conventions for computing a two-sided p-value when the
1249 null distribution is asymmetric. Here, we apply the convention that the
1250 p-value of a two-sided test is twice the minimum of the p-values of the
1251 one-sided tests (clipped to 1.0). Note that `fisher_exact` follows a
1252 different convention, so for a given `table`, the statistic reported by
1253 `boschloo_exact` may differ from the p-value reported by `fisher_exact`
1254 when ``alternative='two-sided'``.
1256 .. versionadded:: 1.7.0
1258 References
1259 ----------
1260 .. [1] R.D. Boschloo. "Raised conditional level of significance for the
1261 2 x 2-table when testing the equality of two probabilities",
1262 Statistica Neerlandica, 24(1), 1970
1264 .. [2] "Boschloo's test", Wikipedia,
1265 https://en.wikipedia.org/wiki/Boschloo%27s_test
1267 .. [3] Lise M. Saari et al. "Employee attitudes and job satisfaction",
1268 Human Resource Management, 43(4), 395-407, 2004,
1269 :doi:`10.1002/hrm.20032`.
1271 Examples
1272 --------
1273 In the following example, we consider the article "Employee
1274 attitudes and job satisfaction" [3]_
1275 which reports the results of a survey from 63 scientists and 117 college
1276 professors. Of the 63 scientists, 31 said they were very satisfied with
1277 their jobs, whereas 74 of the college professors were very satisfied
1278 with their work. Is this significant evidence that college
1279 professors are happier with their work than scientists?
1280 The following table summarizes the data mentioned above::
1282 college professors scientists
1283 Very Satisfied 74 31
1284 Dissatisfied 43 32
1286 When working with statistical hypothesis testing, we usually use a
1287 threshold probability or significance level upon which we decide
1288 to reject the null hypothesis :math:`H_0`. Suppose we choose the common
1289 significance level of 5%.
1291 Our alternative hypothesis is that college professors are truly more
1292 satisfied with their work than scientists. Therefore, we expect
1293 :math:`p_1` the proportion of very satisfied college professors to be
1294 greater than :math:`p_2`, the proportion of very satisfied scientists.
1295 We thus call `boschloo_exact` with the ``alternative="greater"`` option:
1297 >>> import scipy.stats as stats
1298 >>> res = stats.boschloo_exact([[74, 31], [43, 32]], alternative="greater")
1299 >>> res.statistic
1300 0.0483...
1301 >>> res.pvalue
1302 0.0355...
1304 Under the null hypothesis that scientists are happier in their work than
1305 college professors, the probability of obtaining test
1306 results at least as extreme as the observed data is approximately 3.55%.
1307 Since this p-value is less than our chosen significance level, we have
1308 evidence to reject :math:`H_0` in favor of the alternative hypothesis.
1310 """
1311 hypergeom = distributions.hypergeom
1313 if n <= 0:
1314 raise ValueError(
1315 "Number of points `n` must be strictly positive,"
1316 f" found {n!r}"
1317 )
1319 table = np.asarray(table, dtype=np.int64)
1321 if not table.shape == (2, 2):
1322 raise ValueError("The input `table` must be of shape (2, 2).")
1324 if np.any(table < 0):
1325 raise ValueError("All values in `table` must be nonnegative.")
1327 if 0 in table.sum(axis=0):
1328 # If both values in column are zero, the p-value is 1 and
1329 # the score's statistic is NaN.
1330 return BoschlooExactResult(np.nan, np.nan)
1332 total_col_1, total_col_2 = table.sum(axis=0)
1333 total = total_col_1 + total_col_2
1334 x1 = np.arange(total_col_1 + 1, dtype=np.int64).reshape(1, -1)
1335 x2 = np.arange(total_col_2 + 1, dtype=np.int64).reshape(-1, 1)
1336 x1_sum_x2 = x1 + x2
1338 if alternative == 'less':
1339 pvalues = hypergeom.cdf(x1, total, x1_sum_x2, total_col_1).T
1340 elif alternative == 'greater':
1341 # Same formula as the 'less' case, but with the second column.
1342 pvalues = hypergeom.cdf(x2, total, x1_sum_x2, total_col_2).T
1343 elif alternative == 'two-sided':
1344 boschloo_less = boschloo_exact(table, alternative="less", n=n)
1345 boschloo_greater = boschloo_exact(table, alternative="greater", n=n)
1347 res = (
1348 boschloo_less if boschloo_less.pvalue < boschloo_greater.pvalue
1349 else boschloo_greater
1350 )
1352 # Two-sided p-value is defined as twice the minimum of the one-sided
1353 # p-values
1354 pvalue = np.clip(2 * res.pvalue, a_min=0, a_max=1)
1355 return BoschlooExactResult(res.statistic, pvalue)
1356 else:
1357 msg = (
1358 f"`alternative` should be one of {'two-sided', 'less', 'greater'},"
1359 f" found {alternative!r}"
1360 )
1361 raise ValueError(msg)
1363 fisher_stat = pvalues[table[0, 0], table[0, 1]]
1365 # fisher_stat * (1+1e-13) guards us from small numerical error. It is
1366 # equivalent to np.isclose with relative tol of 1e-13 and absolute tol of 0
1367 # For more throughout explanations, see gh-14178
1368 index_arr = pvalues <= fisher_stat * (1+1e-13)
1370 x1, x2, x1_sum_x2 = x1.T, x2.T, x1_sum_x2.T
1371 x1_log_comb = _compute_log_combinations(total_col_1)
1372 x2_log_comb = _compute_log_combinations(total_col_2)
1373 x1_sum_x2_log_comb = x1_log_comb[x1] + x2_log_comb[x2]
1375 result = shgo(
1376 _get_binomial_log_p_value_with_nuisance_param,
1377 args=(x1_sum_x2, x1_sum_x2_log_comb, index_arr),
1378 bounds=((0, 1),),
1379 n=n,
1380 sampling_method="sobol",
1381 )
1383 # result.fun is the negative log pvalue and therefore needs to be
1384 # changed before return
1385 p_value = np.clip(np.exp(-result.fun), a_min=0, a_max=1)
1386 return BoschlooExactResult(fisher_stat, p_value)
1389def _get_binomial_log_p_value_with_nuisance_param(
1390 nuisance_param, x1_sum_x2, x1_sum_x2_log_comb, index_arr
1391):
1392 r"""
1393 Compute the log pvalue in respect of a nuisance parameter considering
1394 a 2x2 sample space.
1396 Parameters
1397 ----------
1398 nuisance_param : float
1399 nuisance parameter used in the computation of the maximisation of
1400 the p-value. Must be between 0 and 1
1402 x1_sum_x2 : ndarray
1403 Sum of x1 and x2 inside barnard_exact
1405 x1_sum_x2_log_comb : ndarray
1406 sum of the log combination of x1 and x2
1408 index_arr : ndarray of boolean
1410 Returns
1411 -------
1412 p_value : float
1413 Return the maximum p-value considering every nuisance paramater
1414 between 0 and 1
1416 Notes
1417 -----
1419 Both Barnard's test and Boschloo's test iterate over a nuisance parameter
1420 :math:`\pi \in [0, 1]` to find the maximum p-value. To search this
1421 maxima, this function return the negative log pvalue with respect to the
1422 nuisance parameter passed in params. This negative log p-value is then
1423 used in `shgo` to find the minimum negative pvalue which is our maximum
1424 pvalue.
1426 Also, to compute the different combination used in the
1427 p-values' computation formula, this function uses `gammaln` which is
1428 more tolerant for large value than `scipy.special.comb`. `gammaln` gives
1429 a log combination. For the little precision loss, performances are
1430 improved a lot.
1431 """
1432 t1, t2 = x1_sum_x2.shape
1433 n = t1 + t2 - 2
1434 with np.errstate(divide="ignore", invalid="ignore"):
1435 log_nuisance = np.log(
1436 nuisance_param,
1437 out=np.zeros_like(nuisance_param),
1438 where=nuisance_param >= 0,
1439 )
1440 log_1_minus_nuisance = np.log(
1441 1 - nuisance_param,
1442 out=np.zeros_like(nuisance_param),
1443 where=1 - nuisance_param >= 0,
1444 )
1446 nuisance_power_x1_x2 = log_nuisance * x1_sum_x2
1447 nuisance_power_x1_x2[(x1_sum_x2 == 0)[:, :]] = 0
1449 nuisance_power_n_minus_x1_x2 = log_1_minus_nuisance * (n - x1_sum_x2)
1450 nuisance_power_n_minus_x1_x2[(x1_sum_x2 == n)[:, :]] = 0
1452 tmp_log_values_arr = (
1453 x1_sum_x2_log_comb
1454 + nuisance_power_x1_x2
1455 + nuisance_power_n_minus_x1_x2
1456 )
1458 tmp_values_from_index = tmp_log_values_arr[index_arr]
1460 # To avoid dividing by zero in log function and getting inf value,
1461 # values are centered according to the max
1462 max_value = tmp_values_from_index.max()
1464 # To have better result's precision, the log pvalue is taken here.
1465 # Indeed, pvalue is included inside [0, 1] interval. Passing the
1466 # pvalue to log makes the interval a lot bigger ([-inf, 0]), and thus
1467 # help us to achieve better precision
1468 with np.errstate(divide="ignore", invalid="ignore"):
1469 log_probs = np.exp(tmp_values_from_index - max_value).sum()
1470 log_pvalue = max_value + np.log(
1471 log_probs,
1472 out=np.full_like(log_probs, -np.inf),
1473 where=log_probs > 0,
1474 )
1476 # Since shgo find the minima, minus log pvalue is returned
1477 return -log_pvalue
1480def _pval_cvm_2samp_exact(s, m, n):
1481 """
1482 Compute the exact p-value of the Cramer-von Mises two-sample test
1483 for a given value s of the test statistic.
1484 m and n are the sizes of the samples.
1486 [1] Y. Xiao, A. Gordon, and A. Yakovlev, "A C++ Program for
1487 the Cramér-Von Mises Two-Sample Test", J. Stat. Soft.,
1488 vol. 17, no. 8, pp. 1-15, Dec. 2006.
1489 [2] T. W. Anderson "On the Distribution of the Two-Sample Cramer-von Mises
1490 Criterion," The Annals of Mathematical Statistics, Ann. Math. Statist.
1491 33(3), 1148-1159, (September, 1962)
1492 """
1494 # [1, p. 3]
1495 lcm = np.lcm(m, n)
1496 # [1, p. 4], below eq. 3
1497 a = lcm // m
1498 b = lcm // n
1499 # Combine Eq. 9 in [2] with Eq. 2 in [1] and solve for $\zeta$
1500 # Hint: `s` is $U$ in [2], and $T_2$ in [1] is $T$ in [2]
1501 mn = m * n
1502 zeta = lcm ** 2 * (m + n) * (6 * s - mn * (4 * mn - 1)) // (6 * mn ** 2)
1504 # bound maximum value that may appear in `gs` (remember both rows!)
1505 zeta_bound = lcm**2 * (m + n) # bound elements in row 1
1506 combinations = comb(m + n, m) # sum of row 2
1507 max_gs = max(zeta_bound, combinations)
1508 dtype = np.min_scalar_type(max_gs)
1510 # the frequency table of $g_{u, v}^+$ defined in [1, p. 6]
1511 gs = ([np.array([[0], [1]], dtype=dtype)]
1512 + [np.empty((2, 0), dtype=dtype) for _ in range(m)])
1513 for u in range(n + 1):
1514 next_gs = []
1515 tmp = np.empty((2, 0), dtype=dtype)
1516 for v, g in enumerate(gs):
1517 # Calculate g recursively with eq. 11 in [1]. Even though it
1518 # doesn't look like it, this also does 12/13 (all of Algorithm 1).
1519 vi, i0, i1 = np.intersect1d(tmp[0], g[0], return_indices=True)
1520 tmp = np.concatenate([
1521 np.stack([vi, tmp[1, i0] + g[1, i1]]),
1522 np.delete(tmp, i0, 1),
1523 np.delete(g, i1, 1)
1524 ], 1)
1525 tmp[0] += (a * v - b * u) ** 2
1526 next_gs.append(tmp)
1527 gs = next_gs
1528 value, freq = gs[m]
1529 return np.float64(np.sum(freq[value >= zeta]) / combinations)
1532def cramervonmises_2samp(x, y, method='auto'):
1533 """Perform the two-sample Cramér-von Mises test for goodness of fit.
1535 This is the two-sample version of the Cramér-von Mises test ([1]_):
1536 for two independent samples :math:`X_1, ..., X_n` and
1537 :math:`Y_1, ..., Y_m`, the null hypothesis is that the samples
1538 come from the same (unspecified) continuous distribution.
1540 Parameters
1541 ----------
1542 x : array_like
1543 A 1-D array of observed values of the random variables :math:`X_i`.
1544 y : array_like
1545 A 1-D array of observed values of the random variables :math:`Y_i`.
1546 method : {'auto', 'asymptotic', 'exact'}, optional
1547 The method used to compute the p-value, see Notes for details.
1548 The default is 'auto'.
1550 Returns
1551 -------
1552 res : object with attributes
1553 statistic : float
1554 Cramér-von Mises statistic.
1555 pvalue : float
1556 The p-value.
1558 See Also
1559 --------
1560 cramervonmises, anderson_ksamp, epps_singleton_2samp, ks_2samp
1562 Notes
1563 -----
1564 .. versionadded:: 1.7.0
1566 The statistic is computed according to equation 9 in [2]_. The
1567 calculation of the p-value depends on the keyword `method`:
1569 - ``asymptotic``: The p-value is approximated by using the limiting
1570 distribution of the test statistic.
1571 - ``exact``: The exact p-value is computed by enumerating all
1572 possible combinations of the test statistic, see [2]_.
1574 If ``method='auto'``, the exact approach is used
1575 if both samples contain equal to or less than 20 observations,
1576 otherwise the asymptotic distribution is used.
1578 If the underlying distribution is not continuous, the p-value is likely to
1579 be conservative (Section 6.2 in [3]_). When ranking the data to compute
1580 the test statistic, midranks are used if there are ties.
1582 References
1583 ----------
1584 .. [1] https://en.wikipedia.org/wiki/Cramer-von_Mises_criterion
1585 .. [2] Anderson, T.W. (1962). On the distribution of the two-sample
1586 Cramer-von-Mises criterion. The Annals of Mathematical
1587 Statistics, pp. 1148-1159.
1588 .. [3] Conover, W.J., Practical Nonparametric Statistics, 1971.
1590 Examples
1591 --------
1593 Suppose we wish to test whether two samples generated by
1594 ``scipy.stats.norm.rvs`` have the same distribution. We choose a
1595 significance level of alpha=0.05.
1597 >>> import numpy as np
1598 >>> from scipy import stats
1599 >>> rng = np.random.default_rng()
1600 >>> x = stats.norm.rvs(size=100, random_state=rng)
1601 >>> y = stats.norm.rvs(size=70, random_state=rng)
1602 >>> res = stats.cramervonmises_2samp(x, y)
1603 >>> res.statistic, res.pvalue
1604 (0.29376470588235293, 0.1412873014573014)
1606 The p-value exceeds our chosen significance level, so we do not
1607 reject the null hypothesis that the observed samples are drawn from the
1608 same distribution.
1610 For small sample sizes, one can compute the exact p-values:
1612 >>> x = stats.norm.rvs(size=7, random_state=rng)
1613 >>> y = stats.t.rvs(df=2, size=6, random_state=rng)
1614 >>> res = stats.cramervonmises_2samp(x, y, method='exact')
1615 >>> res.statistic, res.pvalue
1616 (0.197802197802198, 0.31643356643356646)
1618 The p-value based on the asymptotic distribution is a good approximation
1619 even though the sample size is small.
1621 >>> res = stats.cramervonmises_2samp(x, y, method='asymptotic')
1622 >>> res.statistic, res.pvalue
1623 (0.197802197802198, 0.2966041181527128)
1625 Independent of the method, one would not reject the null hypothesis at the
1626 chosen significance level in this example.
1628 """
1629 xa = np.sort(np.asarray(x))
1630 ya = np.sort(np.asarray(y))
1632 if xa.size <= 1 or ya.size <= 1:
1633 raise ValueError('x and y must contain at least two observations.')
1634 if xa.ndim > 1 or ya.ndim > 1:
1635 raise ValueError('The samples must be one-dimensional.')
1636 if method not in ['auto', 'exact', 'asymptotic']:
1637 raise ValueError('method must be either auto, exact or asymptotic.')
1639 nx = len(xa)
1640 ny = len(ya)
1642 if method == 'auto':
1643 if max(nx, ny) > 20:
1644 method = 'asymptotic'
1645 else:
1646 method = 'exact'
1648 # get ranks of x and y in the pooled sample
1649 z = np.concatenate([xa, ya])
1650 # in case of ties, use midrank (see [1])
1651 r = scipy.stats.rankdata(z, method='average')
1652 rx = r[:nx]
1653 ry = r[nx:]
1655 # compute U (eq. 10 in [2])
1656 u = nx * np.sum((rx - np.arange(1, nx+1))**2)
1657 u += ny * np.sum((ry - np.arange(1, ny+1))**2)
1659 # compute T (eq. 9 in [2])
1660 k, N = nx*ny, nx + ny
1661 t = u / (k*N) - (4*k - 1)/(6*N)
1663 if method == 'exact':
1664 p = _pval_cvm_2samp_exact(u, nx, ny)
1665 else:
1666 # compute expected value and variance of T (eq. 11 and 14 in [2])
1667 et = (1 + 1/N)/6
1668 vt = (N+1) * (4*k*N - 3*(nx**2 + ny**2) - 2*k)
1669 vt = vt / (45 * N**2 * 4 * k)
1671 # computed the normalized statistic (eq. 15 in [2])
1672 tn = 1/6 + (t - et) / np.sqrt(45 * vt)
1674 # approximate distribution of tn with limiting distribution
1675 # of the one-sample test statistic
1676 # if tn < 0.003, the _cdf_cvm_inf(tn) < 1.28*1e-18, return 1.0 directly
1677 if tn < 0.003:
1678 p = 1.0
1679 else:
1680 p = max(0, 1. - _cdf_cvm_inf(tn))
1682 return CramerVonMisesResult(statistic=t, pvalue=p)
1685class TukeyHSDResult:
1686 """Result of `scipy.stats.tukey_hsd`.
1688 Attributes
1689 ----------
1690 statistic : float ndarray
1691 The computed statistic of the test for each comparison. The element
1692 at index ``(i, j)`` is the statistic for the comparison between groups
1693 ``i`` and ``j``.
1694 pvalue : float ndarray
1695 The associated p-value from the studentized range distribution. The
1696 element at index ``(i, j)`` is the p-value for the comparison
1697 between groups ``i`` and ``j``.
1699 Notes
1700 -----
1701 The string representation of this object displays the most recently
1702 calculated confidence interval, and if none have been previously
1703 calculated, it will evaluate ``confidence_interval()``.
1705 References
1706 ----------
1707 .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's
1708 Method."
1709 https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
1710 28 November 2020.
1711 """
1713 def __init__(self, statistic, pvalue, _nobs, _ntreatments, _stand_err):
1714 self.statistic = statistic
1715 self.pvalue = pvalue
1716 self._ntreatments = _ntreatments
1717 self._nobs = _nobs
1718 self._stand_err = _stand_err
1719 self._ci = None
1720 self._ci_cl = None
1722 def __str__(self):
1723 # Note: `__str__` prints the confidence intervals from the most
1724 # recent call to `confidence_interval`. If it has not been called,
1725 # it will be called with the default CL of .95.
1726 if self._ci is None:
1727 self.confidence_interval(confidence_level=.95)
1728 s = ("Tukey's HSD Pairwise Group Comparisons"
1729 f" ({self._ci_cl*100:.1f}% Confidence Interval)\n")
1730 s += "Comparison Statistic p-value Lower CI Upper CI\n"
1731 for i in range(self.pvalue.shape[0]):
1732 for j in range(self.pvalue.shape[0]):
1733 if i != j:
1734 s += (f" ({i} - {j}) {self.statistic[i, j]:>10.3f}"
1735 f"{self.pvalue[i, j]:>10.3f}"
1736 f"{self._ci.low[i, j]:>10.3f}"
1737 f"{self._ci.high[i, j]:>10.3f}\n")
1738 return s
1740 def confidence_interval(self, confidence_level=.95):
1741 """Compute the confidence interval for the specified confidence level.
1743 Parameters
1744 ----------
1745 confidence_level : float, optional
1746 Confidence level for the computed confidence interval
1747 of the estimated proportion. Default is .95.
1749 Returns
1750 -------
1751 ci : ``ConfidenceInterval`` object
1752 The object has attributes ``low`` and ``high`` that hold the
1753 lower and upper bounds of the confidence intervals for each
1754 comparison. The high and low values are accessible for each
1755 comparison at index ``(i, j)`` between groups ``i`` and ``j``.
1757 References
1758 ----------
1759 .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1.
1760 Tukey's Method."
1761 https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
1762 28 November 2020.
1764 Examples
1765 --------
1766 >>> from scipy.stats import tukey_hsd
1767 >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
1768 >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
1769 >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
1770 >>> result = tukey_hsd(group0, group1, group2)
1771 >>> ci = result.confidence_interval()
1772 >>> ci.low
1773 array([[-3.649159, -8.249159, -3.909159],
1774 [ 0.950841, -3.649159, 0.690841],
1775 [-3.389159, -7.989159, -3.649159]])
1776 >>> ci.high
1777 array([[ 3.649159, -0.950841, 3.389159],
1778 [ 8.249159, 3.649159, 7.989159],
1779 [ 3.909159, -0.690841, 3.649159]])
1780 """
1781 # check to see if the supplied confidence level matches that of the
1782 # previously computed CI.
1783 if (self._ci is not None and self._ci_cl is not None and
1784 confidence_level == self._ci_cl):
1785 return self._ci
1787 if not 0 < confidence_level < 1:
1788 raise ValueError("Confidence level must be between 0 and 1.")
1789 # determine the critical value of the studentized range using the
1790 # appropriate confidence level, number of treatments, and degrees
1791 # of freedom as determined by the number of data less the number of
1792 # treatments. ("Confidence limits for Tukey's method")[1]. Note that
1793 # in the cases of unequal sample sizes there will be a criterion for
1794 # each group comparison.
1795 params = (confidence_level, self._nobs, self._ntreatments - self._nobs)
1796 srd = distributions.studentized_range.ppf(*params)
1797 # also called maximum critical value, the Tukey criterion is the
1798 # studentized range critical value * the square root of mean square
1799 # error over the sample size.
1800 tukey_criterion = srd * self._stand_err
1801 # the confidence levels are determined by the
1802 # `mean_differences` +- `tukey_criterion`
1803 upper_conf = self.statistic + tukey_criterion
1804 lower_conf = self.statistic - tukey_criterion
1805 self._ci = ConfidenceInterval(low=lower_conf, high=upper_conf)
1806 self._ci_cl = confidence_level
1807 return self._ci
1810def _tukey_hsd_iv(args):
1811 if (len(args)) < 2:
1812 raise ValueError("There must be more than 1 treatment.")
1813 args = [np.asarray(arg) for arg in args]
1814 for arg in args:
1815 if arg.ndim != 1:
1816 raise ValueError("Input samples must be one-dimensional.")
1817 if arg.size <= 1:
1818 raise ValueError("Input sample size must be greater than one.")
1819 if np.isinf(arg).any():
1820 raise ValueError("Input samples must be finite.")
1821 return args
1824def tukey_hsd(*args):
1825 """Perform Tukey's HSD test for equality of means over multiple treatments.
1827 Tukey's honestly significant difference (HSD) test performs pairwise
1828 comparison of means for a set of samples. Whereas ANOVA (e.g. `f_oneway`)
1829 assesses whether the true means underlying each sample are identical,
1830 Tukey's HSD is a post hoc test used to compare the mean of each sample
1831 to the mean of each other sample.
1833 The null hypothesis is that the distributions underlying the samples all
1834 have the same mean. The test statistic, which is computed for every
1835 possible pairing of samples, is simply the difference between the sample
1836 means. For each pair, the p-value is the probability under the null
1837 hypothesis (and other assumptions; see notes) of observing such an extreme
1838 value of the statistic, considering that many pairwise comparisons are
1839 being performed. Confidence intervals for the difference between each pair
1840 of means are also available.
1842 Parameters
1843 ----------
1844 sample1, sample2, ... : array_like
1845 The sample measurements for each group. There must be at least
1846 two arguments.
1848 Returns
1849 -------
1850 result : `~scipy.stats._result_classes.TukeyHSDResult` instance
1851 The return value is an object with the following attributes:
1853 statistic : float ndarray
1854 The computed statistic of the test for each comparison. The element
1855 at index ``(i, j)`` is the statistic for the comparison between
1856 groups ``i`` and ``j``.
1857 pvalue : float ndarray
1858 The computed p-value of the test for each comparison. The element
1859 at index ``(i, j)`` is the p-value for the comparison between
1860 groups ``i`` and ``j``.
1862 The object has the following methods:
1864 confidence_interval(confidence_level=0.95):
1865 Compute the confidence interval for the specified confidence level.
1867 Notes
1868 -----
1869 The use of this test relies on several assumptions.
1871 1. The observations are independent within and among groups.
1872 2. The observations within each group are normally distributed.
1873 3. The distributions from which the samples are drawn have the same finite
1874 variance.
1876 The original formulation of the test was for samples of equal size [6]_.
1877 In case of unequal sample sizes, the test uses the Tukey-Kramer method
1878 [4]_.
1880 References
1881 ----------
1882 .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's
1883 Method."
1884 https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
1885 28 November 2020.
1886 .. [2] Abdi, Herve & Williams, Lynne. (2021). "Tukey's Honestly Significant
1887 Difference (HSD) Test."
1888 https://personal.utdallas.edu/~herve/abdi-HSD2010-pretty.pdf
1889 .. [3] "One-Way ANOVA Using SAS PROC ANOVA & PROC GLM." SAS
1890 Tutorials, 2007, www.stattutorials.com/SAS/TUTORIAL-PROC-GLM.htm.
1891 .. [4] Kramer, Clyde Young. "Extension of Multiple Range Tests to Group
1892 Means with Unequal Numbers of Replications." Biometrics, vol. 12,
1893 no. 3, 1956, pp. 307-310. JSTOR, www.jstor.org/stable/3001469.
1894 Accessed 25 May 2021.
1895 .. [5] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.3.3.
1896 The ANOVA table and tests of hypotheses about means"
1897 https://www.itl.nist.gov/div898/handbook/prc/section4/prc433.htm,
1898 2 June 2021.
1899 .. [6] Tukey, John W. "Comparing Individual Means in the Analysis of
1900 Variance." Biometrics, vol. 5, no. 2, 1949, pp. 99-114. JSTOR,
1901 www.jstor.org/stable/3001913. Accessed 14 June 2021.
1904 Examples
1905 --------
1906 Here are some data comparing the time to relief of three brands of
1907 headache medicine, reported in minutes. Data adapted from [3]_.
1909 >>> import numpy as np
1910 >>> from scipy.stats import tukey_hsd
1911 >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
1912 >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
1913 >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
1915 We would like to see if the means between any of the groups are
1916 significantly different. First, visually examine a box and whisker plot.
1918 >>> import matplotlib.pyplot as plt
1919 >>> fig, ax = plt.subplots(1, 1)
1920 >>> ax.boxplot([group0, group1, group2])
1921 >>> ax.set_xticklabels(["group0", "group1", "group2"]) # doctest: +SKIP
1922 >>> ax.set_ylabel("mean") # doctest: +SKIP
1923 >>> plt.show()
1925 From the box and whisker plot, we can see overlap in the interquartile
1926 ranges group 1 to group 2 and group 3, but we can apply the ``tukey_hsd``
1927 test to determine if the difference between means is significant. We
1928 set a significance level of .05 to reject the null hypothesis.
1930 >>> res = tukey_hsd(group0, group1, group2)
1931 >>> print(res)
1932 Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
1933 Comparison Statistic p-value Lower CI Upper CI
1934 (0 - 1) -4.600 0.014 -8.249 -0.951
1935 (0 - 2) -0.260 0.980 -3.909 3.389
1936 (1 - 0) 4.600 0.014 0.951 8.249
1937 (1 - 2) 4.340 0.020 0.691 7.989
1938 (2 - 0) 0.260 0.980 -3.389 3.909
1939 (2 - 1) -4.340 0.020 -7.989 -0.691
1941 The null hypothesis is that each group has the same mean. The p-value for
1942 comparisons between ``group0`` and ``group1`` as well as ``group1`` and
1943 ``group2`` do not exceed .05, so we reject the null hypothesis that they
1944 have the same means. The p-value of the comparison between ``group0``
1945 and ``group2`` exceeds .05, so we accept the null hypothesis that there
1946 is not a significant difference between their means.
1948 We can also compute the confidence interval associated with our chosen
1949 confidence level.
1951 >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
1952 >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
1953 >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
1954 >>> result = tukey_hsd(group0, group1, group2)
1955 >>> conf = res.confidence_interval(confidence_level=.99)
1956 >>> for ((i, j), l) in np.ndenumerate(conf.low):
1957 ... # filter out self comparisons
1958 ... if i != j:
1959 ... h = conf.high[i,j]
1960 ... print(f"({i} - {j}) {l:>6.3f} {h:>6.3f}")
1961 (0 - 1) -9.480 0.280
1962 (0 - 2) -5.140 4.620
1963 (1 - 0) -0.280 9.480
1964 (1 - 2) -0.540 9.220
1965 (2 - 0) -4.620 5.140
1966 (2 - 1) -9.220 0.540
1967 """
1968 args = _tukey_hsd_iv(args)
1969 ntreatments = len(args)
1970 means = np.asarray([np.mean(arg) for arg in args])
1971 nsamples_treatments = np.asarray([a.size for a in args])
1972 nobs = np.sum(nsamples_treatments)
1974 # determine mean square error [5]. Note that this is sometimes called
1975 # mean square error within.
1976 mse = (np.sum([np.var(arg, ddof=1) for arg in args] *
1977 (nsamples_treatments - 1)) / (nobs - ntreatments))
1979 # The calculation of the standard error differs when treatments differ in
1980 # size. See ("Unequal sample sizes")[1].
1981 if np.unique(nsamples_treatments).size == 1:
1982 # all input groups are the same length, so only one value needs to be
1983 # calculated [1].
1984 normalize = 2 / nsamples_treatments[0]
1985 else:
1986 # to compare groups of differing sizes, we must compute a variance
1987 # value for each individual comparison. Use broadcasting to get the
1988 # resulting matrix. [3], verified against [4] (page 308).
1989 normalize = 1 / nsamples_treatments + 1 / nsamples_treatments[None].T
1991 # the standard error is used in the computation of the tukey criterion and
1992 # finding the p-values.
1993 stand_err = np.sqrt(normalize * mse / 2)
1995 # the mean difference is the test statistic.
1996 mean_differences = means[None].T - means
1998 # Calculate the t-statistic to use within the survival function of the
1999 # studentized range to get the p-value.
2000 t_stat = np.abs(mean_differences) / stand_err
2002 params = t_stat, ntreatments, nobs - ntreatments
2003 pvalues = distributions.studentized_range.sf(*params)
2005 return TukeyHSDResult(mean_differences, pvalues, ntreatments,
2006 nobs, stand_err)