# scipy/stats/_mannwhitneyu.py
import numpy as np
from collections import namedtuple
from scipy import special
from scipy import stats
from ._axis_nan_policy import _axis_nan_policy_factory


def _broadcast_concatenate(x, y, axis):
    '''Broadcast then concatenate arrays, leaving concatenation axis last'''
    x = np.moveaxis(x, axis, -1)
    y = np.moveaxis(y, axis, -1)
    z = np.broadcast(x[..., 0], y[..., 0])
    x = np.broadcast_to(x, z.shape + (x.shape[-1],))
    y = np.broadcast_to(y, z.shape + (y.shape[-1],))
    z = np.concatenate((x, y), axis=-1)
    return x, y, z
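

# Illustrative sketch (not part of the SciPy source): a quick check of how
# _broadcast_concatenate aligns two stacked samples before ranking. The shapes
# and the helper name _demo_broadcast_concatenate are hypothetical.
def _demo_broadcast_concatenate():
    rng = np.random.default_rng(0)
    x = rng.standard_normal((2, 1, 5))   # 5 observations per slice
    y = rng.standard_normal((1, 3, 7))   # 7 observations per slice
    x_b, y_b, xy = _broadcast_concatenate(x, y, axis=-1)
    # Leading dimensions broadcast to (2, 3); the samples are concatenated
    # along the (now last) observation axis.
    assert x_b.shape == (2, 3, 5) and y_b.shape == (2, 3, 7)
    assert xy.shape == (2, 3, 12)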


class _MWU:
    '''Distribution of MWU statistic under the null hypothesis'''
    # Possible improvement: if m and n are small enough, use integer arithmetic

    def __init__(self):
        '''Minimal initializer'''
        self._fmnks = -np.ones((1, 1, 1))
        self._recursive = None

    def pmf(self, k, m, n):
        if (self._recursive is None and m <= 500 and n <= 500
                or self._recursive):
            return self.pmf_recursive(k, m, n)
        else:
            return self.pmf_iterative(k, m, n)

    def pmf_recursive(self, k, m, n):
        '''Probability mass function, recursive version'''
        self._resize_fmnks(m, n, np.max(k))
        # could loop over just the unique elements, but probably not worth
        # the time to find them
        for i in np.ravel(k):
            self._f(m, n, i)
        return self._fmnks[m, n, k] / special.binom(m + n, m)

    def pmf_iterative(self, k, m, n):
        '''Probability mass function, iterative version'''
        fmnks = {}
        for i in np.ravel(k):
            fmnks = _mwu_f_iterative(m, n, i, fmnks)
        return (np.array([fmnks[(m, n, ki)] for ki in k])
                / special.binom(m + n, m))

    def cdf(self, k, m, n):
        '''Cumulative distribution function'''
        # We could use the fact that the distribution is symmetric to avoid
        # summing more than m*n/2 terms, but it might not be worth the
        # overhead. Let's leave that to an improvement.
        pmfs = self.pmf(np.arange(0, np.max(k) + 1), m, n)
        cdfs = np.cumsum(pmfs)
        return cdfs[k]

    def sf(self, k, m, n):
        '''Survival function'''
        # Use the fact that the distribution is symmetric; i.e.
        # _f(m, n, m*n-k) = _f(m, n, k), and sum from the left
        k = m*n - k
        # Note that both CDF and SF include the PMF at k. The p-value is
        # calculated from the SF and should include the mass at k, so this
        # is desirable.
        return self.cdf(k, m, n)

    def _resize_fmnks(self, m, n, k):
        '''If necessary, expand the array that remembers PMF values'''
        # could probably use `np.pad` but I'm not sure it would save code
        shape_old = np.array(self._fmnks.shape)
        shape_new = np.array((m+1, n+1, k+1))
        if np.any(shape_new > shape_old):
            shape = np.maximum(shape_old, shape_new)
            fmnks = -np.ones(shape)             # create the new array
            m0, n0, k0 = shape_old
            fmnks[:m0, :n0, :k0] = self._fmnks  # copy remembered values
            self._fmnks = fmnks

    def _f(self, m, n, k):
        '''Recursive implementation of function of [3] Theorem 2.5'''

        # [3] Theorem 2.5 Line 1
        if k < 0 or m < 0 or n < 0 or k > m*n:
            return 0

        # if already calculated, return the value
        if self._fmnks[m, n, k] >= 0:
            return self._fmnks[m, n, k]

        if k == 0 and m >= 0 and n >= 0:  # [3] Theorem 2.5 Line 2
            fmnk = 1
        else:  # [3] Theorem 2.5 Line 3 / Equation 3
            fmnk = self._f(m-1, n, k-n) + self._f(m, n-1, k)

        self._fmnks[m, n, k] = fmnk  # remember result

        return fmnk


# Maintain state for faster repeat calls to mannwhitneyu w/ method='exact'
_mwu_state = _MWU()
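

# Illustrative sketch (not part of the SciPy source): the exact null
# distribution served by _mwu_state has unit total mass, is symmetric about
# m*n/2, and its SF includes the mass at the evaluation point. The sample
# sizes below are arbitrary; _demo_exact_null_distribution is hypothetical.
def _demo_exact_null_distribution():
    m, n = 3, 2
    k = np.arange(m*n + 1)
    pmf = _mwu_state.pmf(k, m, n)
    assert np.isclose(pmf.sum(), 1.0)               # total probability is 1
    assert np.allclose(pmf, pmf[::-1])              # symmetric about m*n/2
    assert np.isclose(_mwu_state.sf(0, m, n), 1.0)  # SF(0) includes P(U=0)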


def _mwu_f_iterative(m, n, k, fmnks):
    '''Iterative implementation of function of [3] Theorem 2.5'''

    def _base_case(m, n, k):
        '''Base cases from recursive version'''

        # if already calculated, return the value
        if fmnks.get((m, n, k), -1) >= 0:
            return fmnks[(m, n, k)]

        # [3] Theorem 2.5 Line 1
        elif k < 0 or m < 0 or n < 0 or k > m*n:
            return 0

        # [3] Theorem 2.5 Line 2
        elif k == 0 and m >= 0 and n >= 0:
            return 1

        return None

    stack = [(m, n, k)]
    fmnk = None

    while stack:
        # Popping only if necessary would save a tiny bit of time, but NWI.
        m, n, k = stack.pop()

        # If we're at a base case, continue (stack unwinds)
        fmnk = _base_case(m, n, k)
        if fmnk is not None:
            fmnks[(m, n, k)] = fmnk
            continue

        # If both terms are base cases, continue (stack unwinds)
        f1 = _base_case(m-1, n, k-n)
        f2 = _base_case(m, n-1, k)
        if f1 is not None and f2 is not None:
            # [3] Theorem 2.5 Line 3 / Equation 3
            fmnk = f1 + f2
            fmnks[(m, n, k)] = fmnk
            continue

        # recurse deeper
        stack.append((m, n, k))
        if f1 is None:
            stack.append((m-1, n, k-n))
        if f2 is None:
            stack.append((m, n-1, k))

    return fmnks
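

# Illustrative sketch (not part of the SciPy source): the stack-based
# evaluation should reproduce the memoized recursion for any case small
# enough for the latter. The sizes and _demo_iterative_matches_recursive are
# hypothetical.
def _demo_iterative_matches_recursive():
    m, n = 6, 5
    k = np.arange(m*n + 1)
    assert np.allclose(_mwu_state.pmf_recursive(k, m, n),
                       _mwu_state.pmf_iterative(k, m, n))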


def _tie_term(ranks):
    """Tie correction term"""
    # element i of t is the number of elements sharing rank i
    _, t = np.unique(ranks, return_counts=True, axis=-1)
    return (t**3 - t).sum(axis=-1)
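

# Illustrative sketch (not part of the SciPy source): for the (hypothetical)
# midranks [1, 2.5, 2.5, 4], one group of two tied values contributes
# 2**3 - 2 = 6 to the correction term.
def _demo_tie_term():
    ranks = np.array([1.0, 2.5, 2.5, 4.0])
    assert _tie_term(ranks) == 6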


def _get_mwu_z(U, n1, n2, ranks, axis=0, continuity=True):
    '''Standardized MWU statistic'''
    # Follows mannwhitneyu [2]
    mu = n1 * n2 / 2
    n = n1 + n2

    # Tie correction according to [2]
    tie_term = np.apply_along_axis(_tie_term, -1, ranks)
    s = np.sqrt(n1*n2/12 * ((n + 1) - tie_term/(n*(n-1))))

    # equivalent to using scipy.stats.tiecorrect
    # T = np.apply_along_axis(stats.tiecorrect, -1, ranks)
    # s = np.sqrt(T * n1 * n2 * (n1+n2+1) / 12.0)

    numerator = U - mu

    # Continuity correction.
    # Because SF is always used to calculate the p-value, we can always
    # _subtract_ 0.5 for the continuity correction. This always increases the
    # p-value to account for the rest of the probability mass _at_ q = U.
    if continuity:
        numerator -= 0.5

    # no problem evaluating the norm SF at an infinity
    with np.errstate(divide='ignore', invalid='ignore'):
        z = numerator / s
    return z
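

# Illustrative sketch (not part of the SciPy source): without ties the tie
# term vanishes and the standardization reduces to
# (U - n1*n2/2) / sqrt(n1*n2*(n1+n2+1)/12). The data are hypothetical, as is
# the helper name _demo_get_mwu_z.
def _demo_get_mwu_z():
    x = np.array([19., 22., 16., 29., 24.])
    y = np.array([20., 11., 17., 12.])
    n1, n2 = len(x), len(y)
    ranks = stats.rankdata(np.concatenate((x, y)))
    U1 = ranks[:n1].sum() - n1*(n1 + 1)/2
    z = _get_mwu_z(U1, n1, n2, ranks, continuity=False)
    expected = (U1 - n1*n2/2) / np.sqrt(n1*n2*(n1 + n2 + 1)/12)
    assert np.isclose(z, expected)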


def _mwu_input_validation(x, y, use_continuity, alternative, axis, method):
    '''Input validation and standardization for mannwhitneyu'''
    # Would use np.asarray_chkfinite, but infs are OK
    x, y = np.atleast_1d(x), np.atleast_1d(y)
    if np.isnan(x).any() or np.isnan(y).any():
        raise ValueError('`x` and `y` must not contain NaNs.')
    if np.size(x) == 0 or np.size(y) == 0:
        raise ValueError('`x` and `y` must be of nonzero size.')

    bools = {True, False}
    if use_continuity not in bools:
        raise ValueError(f'`use_continuity` must be one of {bools}.')

    alternatives = {"two-sided", "less", "greater"}
    alternative = alternative.lower()
    if alternative not in alternatives:
        raise ValueError(f'`alternative` must be one of {alternatives}.')

    axis_int = int(axis)
    if axis != axis_int:
        raise ValueError('`axis` must be an integer.')

    methods = {"asymptotic", "exact", "auto"}
    method = method.lower()
    if method not in methods:
        raise ValueError(f'`method` must be one of {methods}.')

    return x, y, use_continuity, alternative, axis_int, method
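

# Illustrative sketch (not part of the SciPy source): NaNs in either sample
# are rejected up front. The arguments and _demo_input_validation are
# hypothetical.
def _demo_input_validation():
    try:
        _mwu_input_validation([1., np.nan], [2., 3.], True,
                              "two-sided", 0, "auto")
    except ValueError as err:
        assert 'must not contain NaNs' in str(err)
    else:
        raise AssertionError("expected a ValueError")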


def _tie_check(xy):
    """Find any ties in data"""
    _, t = np.unique(xy, return_counts=True, axis=-1)
    return np.any(t != 1)


def _mwu_choose_method(n1, n2, xy, method):
    """Choose method 'asymptotic' or 'exact' depending on input size, ties"""

    # if both inputs are large, asymptotic is OK
    if n1 > 8 and n2 > 8:
        return "asymptotic"

    # if there are any ties, asymptotic is preferred
    if np.apply_along_axis(_tie_check, -1, xy).any():
        return "asymptotic"

    return "exact"


MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic', 'pvalue'))


@_axis_nan_policy_factory(MannwhitneyuResult, n_samples=2)
def mannwhitneyu(x, y, use_continuity=True, alternative="two-sided",
                 axis=0, method="auto"):
    r'''Perform the Mann-Whitney U rank test on two independent samples.

    The Mann-Whitney U test is a nonparametric test of the null hypothesis
    that the distribution underlying sample `x` is the same as the
    distribution underlying sample `y`. It is often used as a test of
    difference in location between distributions.

    Parameters
    ----------
    x, y : array-like
        N-d arrays of samples. The arrays must be broadcastable except along
        the dimension given by `axis`.
    use_continuity : bool, optional
        Whether a continuity correction (1/2) should be applied.
        Default is True when `method` is ``'asymptotic'``; has no effect
        otherwise.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        Let *F(u)* and *G(u)* be the cumulative distribution functions of the
        distributions underlying `x` and `y`, respectively. Then the following
        alternative hypotheses are available:

        * 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for
          at least one *u*.
        * 'less': the distribution underlying `x` is stochastically less
          than the distribution underlying `y`, i.e. *F(u) > G(u)* for all *u*.
        * 'greater': the distribution underlying `x` is stochastically greater
          than the distribution underlying `y`, i.e. *F(u) < G(u)* for all *u*.

        Under a more restrictive set of assumptions, the alternative hypotheses
        can be expressed in terms of the locations of the distributions;
        see [5]_ section 5.1.
    axis : int, optional
        Axis along which to perform the test. Default is 0.
    method : {'auto', 'asymptotic', 'exact'}, optional
        Selects the method used to calculate the *p*-value.
        Default is 'auto'. The following options are available.

        * ``'asymptotic'``: compares the standardized test statistic
          against the normal distribution, correcting for ties.
        * ``'exact'``: computes the exact *p*-value by comparing the observed
          :math:`U` statistic against the exact distribution of the :math:`U`
          statistic under the null hypothesis. No correction is made for ties.
        * ``'auto'``: chooses ``'exact'`` when the size of one of the samples
          is 8 or less and there are no ties; chooses ``'asymptotic'``
          otherwise.

    Returns
    -------
    res : MannwhitneyuResult
        An object containing attributes:

        statistic : float
            The Mann-Whitney U statistic corresponding with sample `x`. See
            Notes for the test statistic corresponding with sample `y`.
        pvalue : float
            The associated *p*-value for the chosen `alternative`.

    Notes
    -----
    If ``U1`` is the statistic corresponding with sample `x`, then the
    statistic corresponding with sample `y` is
    ``U2 = x.shape[axis] * y.shape[axis] - U1``.

    `mannwhitneyu` is for independent samples. For related / paired samples,
    consider `scipy.stats.wilcoxon`.

    `method` ``'exact'`` is recommended when there are no ties and when either
    sample size is less than 8 [1]_. The implementation follows the recurrence
    relation originally proposed in [1]_ as it is described in [3]_.
    Note that the exact method is *not* corrected for ties, but
    `mannwhitneyu` will not raise errors or warnings if there are ties in the
    data.

    The Mann-Whitney U test is a non-parametric version of the t-test for
    independent samples. When the means of samples from the populations
    are normally distributed, consider `scipy.stats.ttest_ind`.

    See Also
    --------
    scipy.stats.wilcoxon, scipy.stats.ranksums, scipy.stats.ttest_ind

    References
    ----------
    .. [1] H.B. Mann and D.R. Whitney, "On a test of whether one of two random
           variables is stochastically larger than the other", The Annals of
           Mathematical Statistics, Vol. 18, pp. 50-60, 1947.
    .. [2] Mann-Whitney U Test, Wikipedia,
           http://en.wikipedia.org/wiki/Mann-Whitney_U_test
    .. [3] A. Di Bucchianico, "Combinatorics, computer algebra, and the
           Wilcoxon-Mann-Whitney test", Journal of Statistical Planning and
           Inference, Vol. 79, pp. 349-364, 1999.
    .. [4] Rosie Shier, "Statistics: 2.3 The Mann-Whitney U Test", Mathematics
           Learning Support Centre, 2004.
    .. [5] Michael P. Fay and Michael A. Proschan. "Wilcoxon-Mann-Whitney
           or t-test? On assumptions for hypothesis tests and multiple
           interpretations of decision rules." Statistics Surveys, Vol. 4, pp.
           1-39, 2010. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2857732/

    Examples
    --------
    We follow the example from [4]_: nine randomly sampled young adults were
    diagnosed with type II diabetes at the ages below.

    >>> males = [19, 22, 16, 29, 24]
    >>> females = [20, 11, 17, 12]

    We use the Mann-Whitney U test to assess whether there is a statistically
    significant difference in the diagnosis age of males and females.
    The null hypothesis is that the distribution of male diagnosis ages is
    the same as the distribution of female diagnosis ages. We decide
    that a confidence level of 95% is required to reject the null hypothesis
    in favor of the alternative that the distributions are different.
    Since the number of samples is very small and there are no ties in the
    data, we can compare the observed test statistic against the *exact*
    distribution of the test statistic under the null hypothesis.

    >>> from scipy.stats import mannwhitneyu
    >>> U1, p = mannwhitneyu(males, females, method="exact")
    >>> print(U1)
    17.0

    `mannwhitneyu` always reports the statistic associated with the first
    sample, which, in this case, is males. This agrees with :math:`U_M = 17`
    reported in [4]_. The statistic associated with the second sample
    can be calculated:

    >>> nx, ny = len(males), len(females)
    >>> U2 = nx*ny - U1
    >>> print(U2)
    3.0

    This agrees with :math:`U_F = 3` reported in [4]_. The two-sided
    *p*-value can be calculated from either statistic, and the value produced
    by `mannwhitneyu` agrees with :math:`p = 0.11` reported in [4]_.

    >>> print(p)
    0.1111111111111111

    The exact distribution of the test statistic is asymptotically normal, so
    the example continues by comparing the exact *p*-value against the
    *p*-value produced using the normal approximation.

    >>> _, pnorm = mannwhitneyu(males, females, method="asymptotic")
    >>> print(pnorm)
    0.11134688653314041

    Here `mannwhitneyu`'s reported *p*-value appears to conflict with the
    value :math:`p = 0.09` given in [4]_. The reason is that [4]_
    does not apply the continuity correction performed by `mannwhitneyu`;
    `mannwhitneyu` reduces the distance between the test statistic and the
    mean :math:`\mu = n_x n_y / 2` by 0.5 to correct for the fact that the
    discrete statistic is being compared against a continuous distribution.
    Here, the :math:`U` statistic used is less than the mean, so we reduce
    the distance by adding 0.5 in the numerator.

    >>> import numpy as np
    >>> from scipy.stats import norm
    >>> U = min(U1, U2)
    >>> N = nx + ny
    >>> z = (U - nx*ny/2 + 0.5) / np.sqrt(nx*ny * (N + 1)/12)
    >>> p = 2 * norm.cdf(z)  # use CDF to get p-value from smaller statistic
    >>> print(p)
    0.11134688653314041

    If desired, we can disable the continuity correction to get a result
    that agrees with that reported in [4]_.

    >>> _, pnorm = mannwhitneyu(males, females, use_continuity=False,
    ...                         method="asymptotic")
    >>> print(pnorm)
    0.0864107329737

    Regardless of whether we perform an exact or asymptotic test, the
    probability of the test statistic being as extreme or more extreme by
    chance exceeds 5%, so we do not consider the results statistically
    significant.

    Suppose that, before seeing the data, we had hypothesized that females
    would tend to be diagnosed at a younger age than males.
    In that case, it would be natural to provide the female ages as the
    first input, and we would have performed a one-sided test using
    ``alternative = 'less'``: females are diagnosed at an age that is
    stochastically less than that of males.

    >>> res = mannwhitneyu(females, males, alternative="less", method="exact")
    >>> print(res)
    MannwhitneyuResult(statistic=3.0, pvalue=0.05555555555555555)

    Again, the probability of getting a sufficiently low value of the
    test statistic by chance under the null hypothesis is greater than 5%,
    so we do not reject the null hypothesis in favor of our alternative.

    If it is reasonable to assume that the means of samples from the
    populations are normally distributed, we could have used a t-test to
    perform the analysis.

    >>> from scipy.stats import ttest_ind
    >>> res = ttest_ind(females, males, alternative="less")
    >>> print(res)
    Ttest_indResult(statistic=-2.239334696520584, pvalue=0.030068441095757924)

    Under this assumption, the *p*-value would be low enough to reject the
    null hypothesis in favor of the alternative.

    '''

    x, y, use_continuity, alternative, axis_int, method = (
        _mwu_input_validation(x, y, use_continuity, alternative, axis, method))

    x, y, xy = _broadcast_concatenate(x, y, axis)

    n1, n2 = x.shape[-1], y.shape[-1]

    if method == "auto":
        method = _mwu_choose_method(n1, n2, xy, method)

    # Follows [2]
    ranks = stats.rankdata(xy, axis=-1)  # method 2, step 1
    R1 = ranks[..., :n1].sum(axis=-1)    # method 2, step 2
    U1 = R1 - n1*(n1+1)/2                # method 2, step 3
    U2 = n1 * n2 - U1                    # as U1 + U2 = n1 * n2

    if alternative == "greater":
        U, f = U1, 1  # U is the statistic to use for p-value, f is a factor
    elif alternative == "less":
        U, f = U2, 1  # Due to symmetry, use SF of U2 rather than CDF of U1
    else:
        U, f = np.maximum(U1, U2), 2  # multiply SF by two for two-sided test

    if method == "exact":
        p = _mwu_state.sf(U.astype(int), n1, n2)
    elif method == "asymptotic":
        z = _get_mwu_z(U, n1, n2, ranks, continuity=use_continuity)
        p = stats.norm.sf(z)
    p *= f

    # Ensure that the p-value does not exceed 1.
    # This could happen for the exact test when U = m*n/2.
    p = np.clip(p, 0, 1)

    return MannwhitneyuResult(U1, p)
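

# Illustrative sketch (not part of the SciPy source): swapping the samples
# swaps U1 and U2 (they sum to n1*n2) but leaves the two-sided p-value
# unchanged. The data and _demo_mannwhitneyu_symmetry are hypothetical.
def _demo_mannwhitneyu_symmetry():
    rng = np.random.default_rng(12345)
    x, y = rng.standard_normal(7), rng.standard_normal(9)
    res_xy = mannwhitneyu(x, y)
    res_yx = mannwhitneyu(y, x)
    assert np.isclose(res_xy.statistic + res_yx.statistic, len(x)*len(y))
    assert np.isclose(res_xy.pvalue, res_yx.pvalue)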