Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/

1import numpy as np

2from collections import namedtuple

3from scipy import special

4from scipy import stats

5from ._axis_nan_policy import _axis_nan_policy_factory

8def _broadcast_concatenate(x, y, axis):

9 '''Broadcast then concatenate arrays, leaving concatenation axis last'''

10 x = np.moveaxis(x, axis, -1)

11 y = np.moveaxis(y, axis, -1)

12 z = np.broadcast(x[..., 0], y[..., 0])

13 x = np.broadcast_to(x, z.shape + (x.shape[-1],))

14 y = np.broadcast_to(y, z.shape + (y.shape[-1],))

15 z = np.concatenate((x, y), axis=-1)

16 return x, y, z

19class _MWU:

20 '''Distribution of MWU statistic under the null hypothesis'''

21 # Possible improvement: if m and n are small enough, use integer arithmetic

23 def __init__(self):

24 '''Minimal initializer'''

25 self._fmnks = -np.ones((1, 1, 1))

26 self._recursive = None

28 def pmf(self, k, m, n):

29 if (self._recursive is None and m <= 500 and n <= 500

30 or self._recursive):

31 return self.pmf_recursive(k, m, n)

32 else:

33 return self.pmf_iterative(k, m, n)

35 def pmf_recursive(self, k, m, n):

36 '''Probability mass function, recursive version'''

37 self._resize_fmnks(m, n, np.max(k))

38 # could loop over just the unique elements, but probably not worth

39 # the time to find them

40 for i in np.ravel(k):

41 self._f(m, n, i)

42 return self._fmnks[m, n, k] / special.binom(m + n, m)

44 def pmf_iterative(self, k, m, n):

45 '''Probability mass function, iterative version'''

46 fmnks = {}

47 for i in np.ravel(k):

48 fmnks = _mwu_f_iterative(m, n, i, fmnks)

49 return (np.array([fmnks[(m, n, ki)] for ki in k])

50 / special.binom(m + n, m))

52 def cdf(self, k, m, n):

53 '''Cumulative distribution function'''

54 # We could use the fact that the distribution is symmetric to avoid

55 # summing more than m*n/2 terms, but it might not be worth the

56 # overhead. Let's leave that to an improvement.

57 pmfs = self.pmf(np.arange(0, np.max(k) + 1), m, n)

58 cdfs = np.cumsum(pmfs)

59 return cdfs[k]

61 def sf(self, k, m, n):

62 '''Survival function'''

63 # Use the fact that the distribution is symmetric; i.e.

64 # _f(m, n, m*n-k) = _f(m, n, k), and sum from the left

65 k = m*n - k

66 # Note that both CDF and SF include the PMF at k. The p-value is

67 # calculated from the SF and should include the mass at k, so this

68 # is desirable

69 return self.cdf(k, m, n)

71 def _resize_fmnks(self, m, n, k):

72 '''If necessary, expand the array that remembers PMF values'''

73 # could probably use `np.pad` but I'm not sure it would save code

74 shape_old = np.array(self._fmnks.shape)

75 shape_new = np.array((m+1, n+1, k+1))

76 if np.any(shape_new > shape_old):

77 shape = np.maximum(shape_old, shape_new)

78 fmnks = -np.ones(shape) # create the new array

79 m0, n0, k0 = shape_old

80 fmnks[:m0, :n0, :k0] = self._fmnks # copy remembered values

81 self._fmnks = fmnks

83 def _f(self, m, n, k):

84 '''Recursive implementation of function of [3] Theorem 2.5'''

86 # [3] Theorem 2.5 Line 1

87 if k < 0 or m < 0 or n < 0 or k > m*n:

88 return 0

90 # if already calculated, return the value

91 if self._fmnks[m, n, k] >= 0:

92 return self._fmnks[m, n, k]

94 if k == 0 and m >= 0 and n >= 0: # [3] Theorem 2.5 Line 2

95 fmnk = 1

96 else: # [3] Theorem 2.5 Line 3 / Equation 3

97 fmnk = self._f(m-1, n, k-n) + self._f(m, n-1, k)

99 self._fmnks[m, n, k] = fmnk # remember result

100

101 return fmnk

102

103

104# Maintain state for faster repeat calls to mannwhitneyu w/ method='exact'

105_mwu_state = _MWU()

106

107

108def _mwu_f_iterative(m, n, k, fmnks):

109 '''Iterative implementation of function of [3] Theorem 2.5'''

110

111 def _base_case(m, n, k):

112 '''Base cases from recursive version'''

113

114 # if already calculated, return the value

115 if fmnks.get((m, n, k), -1) >= 0:

116 return fmnks[(m, n, k)]

117

118 # [3] Theorem 2.5 Line 1

119 elif k < 0 or m < 0 or n < 0 or k > m*n:

120 return 0

121

122 # [3] Theorem 2.5 Line 2

123 elif k == 0 and m >= 0 and n >= 0:

124 return 1

125

126 return None

127

128 stack = [(m, n, k)]

129 fmnk = None

130

131 while stack:

132 # Popping only if necessary would save a tiny bit of time, but NWI.

133 m, n, k = stack.pop()

134

135 # If we're at a base case, continue (stack unwinds)

136 fmnk = _base_case(m, n, k)

137 if fmnk is not None:

138 fmnks[(m, n, k)] = fmnk

139 continue

140

141 # If both terms are base cases, continue (stack unwinds)

142 f1 = _base_case(m-1, n, k-n)

143 f2 = _base_case(m, n-1, k)

144 if f1 is not None and f2 is not None:

145 # [3] Theorem 2.5 Line 3 / Equation 3

146 fmnk = f1 + f2

147 fmnks[(m, n, k)] = fmnk

148 continue

149

150 # recurse deeper

151 stack.append((m, n, k))

152 if f1 is None:

153 stack.append((m-1, n, k-n))

154 if f2 is None:

155 stack.append((m, n-1, k))

156

157 return fmnks

158

159

160def _tie_term(ranks):

161 """Tie correction term"""

162 # element i of t is the number of elements sharing rank i

163 _, t = np.unique(ranks, return_counts=True, axis=-1)

164 return (t**3 - t).sum(axis=-1)

165

166

167def _get_mwu_z(U, n1, n2, ranks, axis=0, continuity=True):

168 '''Standardized MWU statistic'''

169 # Follows mannwhitneyu [2]

170 mu = n1 * n2 / 2

171 n = n1 + n2

172

173 # Tie correction according to [2]

174 tie_term = np.apply_along_axis(_tie_term, -1, ranks)

175 s = np.sqrt(n1*n2/12 * ((n + 1) - tie_term/(n*(n-1))))

176

177 # equivalent to using scipy.stats.tiecorrect

178 # T = np.apply_along_axis(stats.tiecorrect, -1, ranks)

179 # s = np.sqrt(T * n1 * n2 * (n1+n2+1) / 12.0)

180

181 numerator = U - mu

182

183 # Continuity correction.

184 # Because SF is always used to calculate the p-value, we can always

185 # _subtract_ 0.5 for the continuity correction. This always increases the

186 # p-value to account for the rest of the probability mass _at_ q = U.

187 if continuity:

188 numerator -= 0.5

189

190 # no problem evaluating the norm SF at an infinity

191 with np.errstate(divide='ignore', invalid='ignore'):

192 z = numerator / s

193 return z

194

195

196def _mwu_input_validation(x, y, use_continuity, alternative, axis, method):

197 ''' Input validation and standardization for mannwhitneyu '''

198 # Would use np.asarray_chkfinite, but infs are OK

199 x, y = np.atleast_1d(x), np.atleast_1d(y)

200 if np.isnan(x).any() or np.isnan(y).any():

201 raise ValueError('`x` and `y` must not contain NaNs.')

202 if np.size(x) == 0 or np.size(y) == 0:

203 raise ValueError('`x` and `y` must be of nonzero size.')

204

205 bools = {True, False}

206 if use_continuity not in bools:

207 raise ValueError(f'`use_continuity` must be one of {bools}.')

208

209 alternatives = {"two-sided", "less", "greater"}

210 alternative = alternative.lower()

211 if alternative not in alternatives:

212 raise ValueError(f'`alternative` must be one of {alternatives}.')

213

214 axis_int = int(axis)

215 if axis != axis_int:

216 raise ValueError('`axis` must be an integer.')

217

218 methods = {"asymptotic", "exact", "auto"}

219 method = method.lower()

220 if method not in methods:

221 raise ValueError(f'`method` must be one of {methods}.')

222

223 return x, y, use_continuity, alternative, axis_int, method

224

225

226def _tie_check(xy):

227 """Find any ties in data"""

228 _, t = np.unique(xy, return_counts=True, axis=-1)

229 return np.any(t != 1)

230

231

232def _mwu_choose_method(n1, n2, xy, method):

233 """Choose method 'asymptotic' or 'exact' depending on input size, ties"""

234

235 # if both inputs are large, asymptotic is OK

236 if n1 > 8 and n2 > 8:

237 return "asymptotic"

238

239 # if there are any ties, asymptotic is preferred

240 if np.apply_along_axis(_tie_check, -1, xy).any():

241 return "asymptotic"

242

243 return "exact"

244

245

246MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic', 'pvalue'))

247

248

249@_axis_nan_policy_factory(MannwhitneyuResult, n_samples=2)

250def mannwhitneyu(x, y, use_continuity=True, alternative="two-sided",

251 axis=0, method="auto"):

252 r'''Perform the Mann-Whitney U rank test on two independent samples.

253

254 The Mann-Whitney U test is a nonparametric test of the null hypothesis

255 that the distribution underlying sample `x` is the same as the

256 distribution underlying sample `y`. It is often used as a test of

257 difference in location between distributions.

258

259 Parameters

260 ----------

261 x, y : array-like

262 N-d arrays of samples. The arrays must be broadcastable except along

263 the dimension given by `axis`.

264 use_continuity : bool, optional

265 Whether a continuity correction (1/2) should be applied.

266 Default is True when `method` is ``'asymptotic'``; has no effect

267 otherwise.

268 alternative : {'two-sided', 'less', 'greater'}, optional

269 Defines the alternative hypothesis. Default is 'two-sided'.

270 Let *F(u)* and *G(u)* be the cumulative distribution functions of the

271 distributions underlying `x` and `y`, respectively. Then the following

272 alternative hypotheses are available:

273

274 * 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for

275 at least one *u*.

276 * 'less': the distribution underlying `x` is stochastically less

277 than the distribution underlying `y`, i.e. *F(u) > G(u)* for all *u*.

278 * 'greater': the distribution underlying `x` is stochastically greater

279 than the distribution underlying `y`, i.e. *F(u) < G(u)* for all *u*.

280

281 Under a more restrictive set of assumptions, the alternative hypotheses

282 can be expressed in terms of the locations of the distributions;

283 see [5] section 5.1.

284 axis : int, optional

285 Axis along which to perform the test. Default is 0.

286 method : {'auto', 'asymptotic', 'exact'}, optional

287 Selects the method used to calculate the *p*-value.

288 Default is 'auto'. The following options are available.

289

290 * ``'asymptotic'``: compares the standardized test statistic

291 against the normal distribution, correcting for ties.

292 * ``'exact'``: computes the exact *p*-value by comparing the observed

293 :math:`U` statistic against the exact distribution of the :math:`U`

294 statistic under the null hypothesis. No correction is made for ties.

295 * ``'auto'``: chooses ``'exact'`` when the size of one of the samples

296 is less than 8 and there are no ties; chooses ``'asymptotic'``

297 otherwise.

298

299 Returns

300 -------

301 res : MannwhitneyuResult

302 An object containing attributes:

303

304 statistic : float

305 The Mann-Whitney U statistic corresponding with sample `x`. See

306 Notes for the test statistic corresponding with sample `y`.

307 pvalue : float

308 The associated *p*-value for the chosen `alternative`.

309

310 Notes

311 -----

312 If ``U1`` is the statistic corresponding with sample `x`, then the

313 statistic corresponding with sample `y` is

314 `U2 = `x.shape[axis] * y.shape[axis] - U1``.

315

316 `mannwhitneyu` is for independent samples. For related / paired samples,

317 consider `scipy.stats.wilcoxon`.

318

319 `method` ``'exact'`` is recommended when there are no ties and when either

320 sample size is less than 8 [1]_. The implementation follows the recurrence

321 relation originally proposed in [1]_ as it is described in [3]_.

322 Note that the exact method is *not* corrected for ties, but

323 `mannwhitneyu` will not raise errors or warnings if there are ties in the

324 data.

325

326 The Mann-Whitney U test is a non-parametric version of the t-test for

327 independent samples. When the means of samples from the populations

328 are normally distributed, consider `scipy.stats.ttest_ind`.

329

330 See Also

331 --------

332 scipy.stats.wilcoxon, scipy.stats.ranksums, scipy.stats.ttest_ind

333

334 References

335 ----------

336 .. [1] H.B. Mann and D.R. Whitney, "On a test of whether one of two random

337 variables is stochastically larger than the other", The Annals of

338 Mathematical Statistics, Vol. 18, pp. 50-60, 1947.

339 .. [2] Mann-Whitney U Test, Wikipedia,

340 http://en.wikipedia.org/wiki/Mann-Whitney_U_test

341 .. [3] A. Di Bucchianico, "Combinatorics, computer algebra, and the

342 Wilcoxon-Mann-Whitney test", Journal of Statistical Planning and

343 Inference, Vol. 79, pp. 349-364, 1999.

344 .. [4] Rosie Shier, "Statistics: 2.3 The Mann-Whitney U Test", Mathematics

345 Learning Support Centre, 2004.

346 .. [5] Michael P. Fay and Michael A. Proschan. "Wilcoxon-Mann-Whitney

347 or t-test? On assumptions for hypothesis tests and multiple \

348 interpretations of decision rules." Statistics surveys, Vol. 4, pp.

349 1-39, 2010. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2857732/

350

351 Examples

352 --------

353 We follow the example from [4]_: nine randomly sampled young adults were

354 diagnosed with type II diabetes at the ages below.

355

356 >>> males = [19, 22, 16, 29, 24]

357 >>> females = [20, 11, 17, 12]

358

359 We use the Mann-Whitney U test to assess whether there is a statistically

360 significant difference in the diagnosis age of males and females.

361 The null hypothesis is that the distribution of male diagnosis ages is

362 the same as the distribution of female diagnosis ages. We decide

363 that a confidence level of 95% is required to reject the null hypothesis

364 in favor of the alternative that the distributions are different.

365 Since the number of samples is very small and there are no ties in the

366 data, we can compare the observed test statistic against the *exact*

367 distribution of the test statistic under the null hypothesis.

368

369 >>> from scipy.stats import mannwhitneyu

370 >>> U1, p = mannwhitneyu(males, females, method="exact")

371 >>> print(U1)

372 17.0

373

374 `mannwhitneyu` always reports the statistic associated with the first

375 sample, which, in this case, is males. This agrees with :math:`U_M = 17`

376 reported in [4]_. The statistic associated with the second statistic

377 can be calculated:

378

379 >>> nx, ny = len(males), len(females)

380 >>> U2 = nx*ny - U1

381 >>> print(U2)

382 3.0

383

384 This agrees with :math:`U_F = 3` reported in [4]_. The two-sided

385 *p*-value can be calculated from either statistic, and the value produced

386 by `mannwhitneyu` agrees with :math:`p = 0.11` reported in [4]_.

387

388 >>> print(p)

389 0.1111111111111111

390

391 The exact distribution of the test statistic is asymptotically normal, so

392 the example continues by comparing the exact *p*-value against the

393 *p*-value produced using the normal approximation.

394

395 >>> _, pnorm = mannwhitneyu(males, females, method="asymptotic")

396 >>> print(pnorm)

397 0.11134688653314041

398

399 Here `mannwhitneyu`'s reported *p*-value appears to conflict with the

400 value :math:`p = 0.09` given in [4]_. The reason is that [4]_

401 does not apply the continuity correction performed by `mannwhitneyu`;

402 `mannwhitneyu` reduces the distance between the test statistic and the

403 mean :math:`\mu = n_x n_y / 2` by 0.5 to correct for the fact that the

404 discrete statistic is being compared against a continuous distribution.

405 Here, the :math:`U` statistic used is less than the mean, so we reduce

406 the distance by adding 0.5 in the numerator.

407

408 >>> import numpy as np

409 >>> from scipy.stats import norm

410 >>> U = min(U1, U2)

411 >>> N = nx + ny

412 >>> z = (U - nx*ny/2 + 0.5) / np.sqrt(nx*ny * (N + 1)/ 12)

413 >>> p = 2 * norm.cdf(z) # use CDF to get p-value from smaller statistic

414 >>> print(p)

415 0.11134688653314041

416

417 If desired, we can disable the continuity correction to get a result

418 that agrees with that reported in [4]_.

419

420 >>> _, pnorm = mannwhitneyu(males, females, use_continuity=False,

421 ... method="asymptotic")

422 >>> print(pnorm)

423 0.0864107329737

424

425 Regardless of whether we perform an exact or asymptotic test, the

426 probability of the test statistic being as extreme or more extreme by

427 chance exceeds 5%, so we do not consider the results statistically

428 significant.

429

430 Suppose that, before seeing the data, we had hypothesized that females

431 would tend to be diagnosed at a younger age than males.

432 In that case, it would be natural to provide the female ages as the

433 first input, and we would have performed a one-sided test using

434 ``alternative = 'less'``: females are diagnosed at an age that is

435 stochastically less than that of males.

436

437 >>> res = mannwhitneyu(females, males, alternative="less", method="exact")

438 >>> print(res)

439 MannwhitneyuResult(statistic=3.0, pvalue=0.05555555555555555)

440

441 Again, the probability of getting a sufficiently low value of the

442 test statistic by chance under the null hypothesis is greater than 5%,

443 so we do not reject the null hypothesis in favor of our alternative.

444

445 If it is reasonable to assume that the means of samples from the

446 populations are normally distributed, we could have used a t-test to

447 perform the analysis.

448

449 >>> from scipy.stats import ttest_ind

450 >>> res = ttest_ind(females, males, alternative="less")

451 >>> print(res)

452 Ttest_indResult(statistic=-2.239334696520584, pvalue=0.030068441095757924)

453

454 Under this assumption, the *p*-value would be low enough to reject the

455 null hypothesis in favor of the alternative.

456

457 '''

458

459 x, y, use_continuity, alternative, axis_int, method = (

460 _mwu_input_validation(x, y, use_continuity, alternative, axis, method))

461

462 x, y, xy = _broadcast_concatenate(x, y, axis)

463

464 n1, n2 = x.shape[-1], y.shape[-1]

465

466 if method == "auto":

467 method = _mwu_choose_method(n1, n2, xy, method)

468

469 # Follows [2]

470 ranks = stats.rankdata(xy, axis=-1) # method 2, step 1

471 R1 = ranks[..., :n1].sum(axis=-1) # method 2, step 2

472 U1 = R1 - n1*(n1+1)/2 # method 2, step 3

473 U2 = n1 * n2 - U1 # as U1 + U2 = n1 * n2

474

475 if alternative == "greater":

476 U, f = U1, 1 # U is the statistic to use for p-value, f is a factor

477 elif alternative == "less":

478 U, f = U2, 1 # Due to symmetry, use SF of U2 rather than CDF of U1

479 else:

480 U, f = np.maximum(U1, U2), 2 # multiply SF by two for two-sided test

481

482 if method == "exact":

483 p = _mwu_state.sf(U.astype(int), n1, n2)

484 elif method == "asymptotic":

485 z = _get_mwu_z(U, n1, n2, ranks, continuity=use_continuity)

486 p = stats.norm.sf(z)

487 p *= f

488

489 # Ensure that test statistic is not greater than 1

490 # This could happen for exact test when U = m*n/2

491 p = np.clip(p, 0, 1)

492

493 return MannwhitneyuResult(U1, p)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_mannwhitneyu.py: 17%

156 statements