Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_mannwhitneyu.py: 17%

156 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1import numpy as np 

2from collections import namedtuple 

3from scipy import special 

4from scipy import stats 

5from ._axis_nan_policy import _axis_nan_policy_factory 

6 

7 

8def _broadcast_concatenate(x, y, axis): 

9 '''Broadcast then concatenate arrays, leaving concatenation axis last''' 

10 x = np.moveaxis(x, axis, -1) 

11 y = np.moveaxis(y, axis, -1) 

12 z = np.broadcast(x[..., 0], y[..., 0]) 

13 x = np.broadcast_to(x, z.shape + (x.shape[-1],)) 

14 y = np.broadcast_to(y, z.shape + (y.shape[-1],)) 

15 z = np.concatenate((x, y), axis=-1) 

16 return x, y, z 

17 

18 

19class _MWU: 

20 '''Distribution of MWU statistic under the null hypothesis''' 

21 # Possible improvement: if m and n are small enough, use integer arithmetic 

22 

23 def __init__(self): 

24 '''Minimal initializer''' 

25 self._fmnks = -np.ones((1, 1, 1)) 

26 self._recursive = None 

27 

28 def pmf(self, k, m, n): 

29 if (self._recursive is None and m <= 500 and n <= 500 

30 or self._recursive): 

31 return self.pmf_recursive(k, m, n) 

32 else: 

33 return self.pmf_iterative(k, m, n) 

34 

35 def pmf_recursive(self, k, m, n): 

36 '''Probability mass function, recursive version''' 

37 self._resize_fmnks(m, n, np.max(k)) 

38 # could loop over just the unique elements, but probably not worth 

39 # the time to find them 

40 for i in np.ravel(k): 

41 self._f(m, n, i) 

42 return self._fmnks[m, n, k] / special.binom(m + n, m) 

43 

44 def pmf_iterative(self, k, m, n): 

45 '''Probability mass function, iterative version''' 

46 fmnks = {} 

47 for i in np.ravel(k): 

48 fmnks = _mwu_f_iterative(m, n, i, fmnks) 

49 return (np.array([fmnks[(m, n, ki)] for ki in k]) 

50 / special.binom(m + n, m)) 

51 

52 def cdf(self, k, m, n): 

53 '''Cumulative distribution function''' 

54 # We could use the fact that the distribution is symmetric to avoid 

55 # summing more than m*n/2 terms, but it might not be worth the 

56 # overhead. Let's leave that to an improvement. 

57 pmfs = self.pmf(np.arange(0, np.max(k) + 1), m, n) 

58 cdfs = np.cumsum(pmfs) 

59 return cdfs[k] 

60 

61 def sf(self, k, m, n): 

62 '''Survival function''' 

63 # Use the fact that the distribution is symmetric; i.e. 

64 # _f(m, n, m*n-k) = _f(m, n, k), and sum from the left 

65 k = m*n - k 

66 # Note that both CDF and SF include the PMF at k. The p-value is 

67 # calculated from the SF and should include the mass at k, so this 

68 # is desirable 

69 return self.cdf(k, m, n) 

70 

71 def _resize_fmnks(self, m, n, k): 

72 '''If necessary, expand the array that remembers PMF values''' 

73 # could probably use `np.pad` but I'm not sure it would save code 

74 shape_old = np.array(self._fmnks.shape) 

75 shape_new = np.array((m+1, n+1, k+1)) 

76 if np.any(shape_new > shape_old): 

77 shape = np.maximum(shape_old, shape_new) 

78 fmnks = -np.ones(shape) # create the new array 

79 m0, n0, k0 = shape_old 

80 fmnks[:m0, :n0, :k0] = self._fmnks # copy remembered values 

81 self._fmnks = fmnks 

82 

83 def _f(self, m, n, k): 

84 '''Recursive implementation of function of [3] Theorem 2.5''' 

85 

86 # [3] Theorem 2.5 Line 1 

87 if k < 0 or m < 0 or n < 0 or k > m*n: 

88 return 0 

89 

90 # if already calculated, return the value 

91 if self._fmnks[m, n, k] >= 0: 

92 return self._fmnks[m, n, k] 

93 

94 if k == 0 and m >= 0 and n >= 0: # [3] Theorem 2.5 Line 2 

95 fmnk = 1 

96 else: # [3] Theorem 2.5 Line 3 / Equation 3 

97 fmnk = self._f(m-1, n, k-n) + self._f(m, n-1, k) 

98 

99 self._fmnks[m, n, k] = fmnk # remember result 

100 

101 return fmnk 

102 

103 

104# Maintain state for faster repeat calls to mannwhitneyu w/ method='exact' 

105_mwu_state = _MWU() 

106 

107 

108def _mwu_f_iterative(m, n, k, fmnks): 

109 '''Iterative implementation of function of [3] Theorem 2.5''' 

110 

111 def _base_case(m, n, k): 

112 '''Base cases from recursive version''' 

113 

114 # if already calculated, return the value 

115 if fmnks.get((m, n, k), -1) >= 0: 

116 return fmnks[(m, n, k)] 

117 

118 # [3] Theorem 2.5 Line 1 

119 elif k < 0 or m < 0 or n < 0 or k > m*n: 

120 return 0 

121 

122 # [3] Theorem 2.5 Line 2 

123 elif k == 0 and m >= 0 and n >= 0: 

124 return 1 

125 

126 return None 

127 

128 stack = [(m, n, k)] 

129 fmnk = None 

130 

131 while stack: 

132 # Popping only if necessary would save a tiny bit of time, but NWI. 

133 m, n, k = stack.pop() 

134 

135 # If we're at a base case, continue (stack unwinds) 

136 fmnk = _base_case(m, n, k) 

137 if fmnk is not None: 

138 fmnks[(m, n, k)] = fmnk 

139 continue 

140 

141 # If both terms are base cases, continue (stack unwinds) 

142 f1 = _base_case(m-1, n, k-n) 

143 f2 = _base_case(m, n-1, k) 

144 if f1 is not None and f2 is not None: 

145 # [3] Theorem 2.5 Line 3 / Equation 3 

146 fmnk = f1 + f2 

147 fmnks[(m, n, k)] = fmnk 

148 continue 

149 

150 # recurse deeper 

151 stack.append((m, n, k)) 

152 if f1 is None: 

153 stack.append((m-1, n, k-n)) 

154 if f2 is None: 

155 stack.append((m, n-1, k)) 

156 

157 return fmnks 

158 

159 

160def _tie_term(ranks): 

161 """Tie correction term""" 

162 # element i of t is the number of elements sharing rank i 

163 _, t = np.unique(ranks, return_counts=True, axis=-1) 

164 return (t**3 - t).sum(axis=-1) 

165 

166 

167def _get_mwu_z(U, n1, n2, ranks, axis=0, continuity=True): 

168 '''Standardized MWU statistic''' 

169 # Follows mannwhitneyu [2] 

170 mu = n1 * n2 / 2 

171 n = n1 + n2 

172 

173 # Tie correction according to [2] 

174 tie_term = np.apply_along_axis(_tie_term, -1, ranks) 

175 s = np.sqrt(n1*n2/12 * ((n + 1) - tie_term/(n*(n-1)))) 

176 

177 # equivalent to using scipy.stats.tiecorrect 

178 # T = np.apply_along_axis(stats.tiecorrect, -1, ranks) 

179 # s = np.sqrt(T * n1 * n2 * (n1+n2+1) / 12.0) 

180 

181 numerator = U - mu 

182 

183 # Continuity correction. 

184 # Because SF is always used to calculate the p-value, we can always 

185 # _subtract_ 0.5 for the continuity correction. This always increases the 

186 # p-value to account for the rest of the probability mass _at_ q = U. 

187 if continuity: 

188 numerator -= 0.5 

189 

190 # no problem evaluating the norm SF at an infinity 

191 with np.errstate(divide='ignore', invalid='ignore'): 

192 z = numerator / s 

193 return z 

194 

195 

196def _mwu_input_validation(x, y, use_continuity, alternative, axis, method): 

197 ''' Input validation and standardization for mannwhitneyu ''' 

198 # Would use np.asarray_chkfinite, but infs are OK 

199 x, y = np.atleast_1d(x), np.atleast_1d(y) 

200 if np.isnan(x).any() or np.isnan(y).any(): 

201 raise ValueError('`x` and `y` must not contain NaNs.') 

202 if np.size(x) == 0 or np.size(y) == 0: 

203 raise ValueError('`x` and `y` must be of nonzero size.') 

204 

205 bools = {True, False} 

206 if use_continuity not in bools: 

207 raise ValueError(f'`use_continuity` must be one of {bools}.') 

208 

209 alternatives = {"two-sided", "less", "greater"} 

210 alternative = alternative.lower() 

211 if alternative not in alternatives: 

212 raise ValueError(f'`alternative` must be one of {alternatives}.') 

213 

214 axis_int = int(axis) 

215 if axis != axis_int: 

216 raise ValueError('`axis` must be an integer.') 

217 

218 methods = {"asymptotic", "exact", "auto"} 

219 method = method.lower() 

220 if method not in methods: 

221 raise ValueError(f'`method` must be one of {methods}.') 

222 

223 return x, y, use_continuity, alternative, axis_int, method 

224 

225 

226def _tie_check(xy): 

227 """Find any ties in data""" 

228 _, t = np.unique(xy, return_counts=True, axis=-1) 

229 return np.any(t != 1) 

230 

231 

232def _mwu_choose_method(n1, n2, xy, method): 

233 """Choose method 'asymptotic' or 'exact' depending on input size, ties""" 

234 

235 # if both inputs are large, asymptotic is OK 

236 if n1 > 8 and n2 > 8: 

237 return "asymptotic" 

238 

239 # if there are any ties, asymptotic is preferred 

240 if np.apply_along_axis(_tie_check, -1, xy).any(): 

241 return "asymptotic" 

242 

243 return "exact" 

244 

245 

246MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic', 'pvalue')) 

247 

248 

249@_axis_nan_policy_factory(MannwhitneyuResult, n_samples=2) 

250def mannwhitneyu(x, y, use_continuity=True, alternative="two-sided", 

251 axis=0, method="auto"): 

252 r'''Perform the Mann-Whitney U rank test on two independent samples. 

253 

254 The Mann-Whitney U test is a nonparametric test of the null hypothesis 

255 that the distribution underlying sample `x` is the same as the 

256 distribution underlying sample `y`. It is often used as a test of 

257 difference in location between distributions. 

258 

259 Parameters 

260 ---------- 

261 x, y : array-like 

262 N-d arrays of samples. The arrays must be broadcastable except along 

263 the dimension given by `axis`. 

264 use_continuity : bool, optional 

265 Whether a continuity correction (1/2) should be applied. 

266 Default is True when `method` is ``'asymptotic'``; has no effect 

267 otherwise. 

268 alternative : {'two-sided', 'less', 'greater'}, optional 

269 Defines the alternative hypothesis. Default is 'two-sided'. 

270 Let *F(u)* and *G(u)* be the cumulative distribution functions of the 

271 distributions underlying `x` and `y`, respectively. Then the following 

272 alternative hypotheses are available: 

273 

274 * 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for 

275 at least one *u*. 

276 * 'less': the distribution underlying `x` is stochastically less 

277 than the distribution underlying `y`, i.e. *F(u) > G(u)* for all *u*. 

278 * 'greater': the distribution underlying `x` is stochastically greater 

279 than the distribution underlying `y`, i.e. *F(u) < G(u)* for all *u*. 

280 

281 Under a more restrictive set of assumptions, the alternative hypotheses 

282 can be expressed in terms of the locations of the distributions; 

283 see [5] section 5.1. 

284 axis : int, optional 

285 Axis along which to perform the test. Default is 0. 

286 method : {'auto', 'asymptotic', 'exact'}, optional 

287 Selects the method used to calculate the *p*-value. 

288 Default is 'auto'. The following options are available. 

289 

290 * ``'asymptotic'``: compares the standardized test statistic 

291 against the normal distribution, correcting for ties. 

292 * ``'exact'``: computes the exact *p*-value by comparing the observed 

293 :math:`U` statistic against the exact distribution of the :math:`U` 

294 statistic under the null hypothesis. No correction is made for ties. 

295 * ``'auto'``: chooses ``'exact'`` when the size of one of the samples 

296 is less than 8 and there are no ties; chooses ``'asymptotic'`` 

297 otherwise. 

298 

299 Returns 

300 ------- 

301 res : MannwhitneyuResult 

302 An object containing attributes: 

303 

304 statistic : float 

305 The Mann-Whitney U statistic corresponding with sample `x`. See 

306 Notes for the test statistic corresponding with sample `y`. 

307 pvalue : float 

308 The associated *p*-value for the chosen `alternative`. 

309 

310 Notes 

311 ----- 

312 If ``U1`` is the statistic corresponding with sample `x`, then the 

313 statistic corresponding with sample `y` is 

314 `U2 = `x.shape[axis] * y.shape[axis] - U1``. 

315 

316 `mannwhitneyu` is for independent samples. For related / paired samples, 

317 consider `scipy.stats.wilcoxon`. 

318 

319 `method` ``'exact'`` is recommended when there are no ties and when either 

320 sample size is less than 8 [1]_. The implementation follows the recurrence 

321 relation originally proposed in [1]_ as it is described in [3]_. 

322 Note that the exact method is *not* corrected for ties, but 

323 `mannwhitneyu` will not raise errors or warnings if there are ties in the 

324 data. 

325 

326 The Mann-Whitney U test is a non-parametric version of the t-test for 

327 independent samples. When the means of samples from the populations 

328 are normally distributed, consider `scipy.stats.ttest_ind`. 

329 

330 See Also 

331 -------- 

332 scipy.stats.wilcoxon, scipy.stats.ranksums, scipy.stats.ttest_ind 

333 

334 References 

335 ---------- 

336 .. [1] H.B. Mann and D.R. Whitney, "On a test of whether one of two random 

337 variables is stochastically larger than the other", The Annals of 

338 Mathematical Statistics, Vol. 18, pp. 50-60, 1947. 

339 .. [2] Mann-Whitney U Test, Wikipedia, 

340 http://en.wikipedia.org/wiki/Mann-Whitney_U_test 

341 .. [3] A. Di Bucchianico, "Combinatorics, computer algebra, and the 

342 Wilcoxon-Mann-Whitney test", Journal of Statistical Planning and 

343 Inference, Vol. 79, pp. 349-364, 1999. 

344 .. [4] Rosie Shier, "Statistics: 2.3 The Mann-Whitney U Test", Mathematics 

345 Learning Support Centre, 2004. 

346 .. [5] Michael P. Fay and Michael A. Proschan. "Wilcoxon-Mann-Whitney 

347 or t-test? On assumptions for hypothesis tests and multiple \ 

348 interpretations of decision rules." Statistics surveys, Vol. 4, pp. 

349 1-39, 2010. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2857732/ 

350 

351 Examples 

352 -------- 

353 We follow the example from [4]_: nine randomly sampled young adults were 

354 diagnosed with type II diabetes at the ages below. 

355 

356 >>> males = [19, 22, 16, 29, 24] 

357 >>> females = [20, 11, 17, 12] 

358 

359 We use the Mann-Whitney U test to assess whether there is a statistically 

360 significant difference in the diagnosis age of males and females. 

361 The null hypothesis is that the distribution of male diagnosis ages is 

362 the same as the distribution of female diagnosis ages. We decide 

363 that a confidence level of 95% is required to reject the null hypothesis 

364 in favor of the alternative that the distributions are different. 

365 Since the number of samples is very small and there are no ties in the 

366 data, we can compare the observed test statistic against the *exact* 

367 distribution of the test statistic under the null hypothesis. 

368 

369 >>> from scipy.stats import mannwhitneyu 

370 >>> U1, p = mannwhitneyu(males, females, method="exact") 

371 >>> print(U1) 

372 17.0 

373 

374 `mannwhitneyu` always reports the statistic associated with the first 

375 sample, which, in this case, is males. This agrees with :math:`U_M = 17` 

376 reported in [4]_. The statistic associated with the second statistic 

377 can be calculated: 

378 

379 >>> nx, ny = len(males), len(females) 

380 >>> U2 = nx*ny - U1 

381 >>> print(U2) 

382 3.0 

383 

384 This agrees with :math:`U_F = 3` reported in [4]_. The two-sided 

385 *p*-value can be calculated from either statistic, and the value produced 

386 by `mannwhitneyu` agrees with :math:`p = 0.11` reported in [4]_. 

387 

388 >>> print(p) 

389 0.1111111111111111 

390 

391 The exact distribution of the test statistic is asymptotically normal, so 

392 the example continues by comparing the exact *p*-value against the 

393 *p*-value produced using the normal approximation. 

394 

395 >>> _, pnorm = mannwhitneyu(males, females, method="asymptotic") 

396 >>> print(pnorm) 

397 0.11134688653314041 

398 

399 Here `mannwhitneyu`'s reported *p*-value appears to conflict with the 

400 value :math:`p = 0.09` given in [4]_. The reason is that [4]_ 

401 does not apply the continuity correction performed by `mannwhitneyu`; 

402 `mannwhitneyu` reduces the distance between the test statistic and the 

403 mean :math:`\mu = n_x n_y / 2` by 0.5 to correct for the fact that the 

404 discrete statistic is being compared against a continuous distribution. 

405 Here, the :math:`U` statistic used is less than the mean, so we reduce 

406 the distance by adding 0.5 in the numerator. 

407 

408 >>> import numpy as np 

409 >>> from scipy.stats import norm 

410 >>> U = min(U1, U2) 

411 >>> N = nx + ny 

412 >>> z = (U - nx*ny/2 + 0.5) / np.sqrt(nx*ny * (N + 1)/ 12) 

413 >>> p = 2 * norm.cdf(z) # use CDF to get p-value from smaller statistic 

414 >>> print(p) 

415 0.11134688653314041 

416 

417 If desired, we can disable the continuity correction to get a result 

418 that agrees with that reported in [4]_. 

419 

420 >>> _, pnorm = mannwhitneyu(males, females, use_continuity=False, 

421 ... method="asymptotic") 

422 >>> print(pnorm) 

423 0.0864107329737 

424 

425 Regardless of whether we perform an exact or asymptotic test, the 

426 probability of the test statistic being as extreme or more extreme by 

427 chance exceeds 5%, so we do not consider the results statistically 

428 significant. 

429 

430 Suppose that, before seeing the data, we had hypothesized that females 

431 would tend to be diagnosed at a younger age than males. 

432 In that case, it would be natural to provide the female ages as the 

433 first input, and we would have performed a one-sided test using 

434 ``alternative = 'less'``: females are diagnosed at an age that is 

435 stochastically less than that of males. 

436 

437 >>> res = mannwhitneyu(females, males, alternative="less", method="exact") 

438 >>> print(res) 

439 MannwhitneyuResult(statistic=3.0, pvalue=0.05555555555555555) 

440 

441 Again, the probability of getting a sufficiently low value of the 

442 test statistic by chance under the null hypothesis is greater than 5%, 

443 so we do not reject the null hypothesis in favor of our alternative. 

444 

445 If it is reasonable to assume that the means of samples from the 

446 populations are normally distributed, we could have used a t-test to 

447 perform the analysis. 

448 

449 >>> from scipy.stats import ttest_ind 

450 >>> res = ttest_ind(females, males, alternative="less") 

451 >>> print(res) 

452 Ttest_indResult(statistic=-2.239334696520584, pvalue=0.030068441095757924) 

453 

454 Under this assumption, the *p*-value would be low enough to reject the 

455 null hypothesis in favor of the alternative. 

456 

457 ''' 

458 

459 x, y, use_continuity, alternative, axis_int, method = ( 

460 _mwu_input_validation(x, y, use_continuity, alternative, axis, method)) 

461 

462 x, y, xy = _broadcast_concatenate(x, y, axis) 

463 

464 n1, n2 = x.shape[-1], y.shape[-1] 

465 

466 if method == "auto": 

467 method = _mwu_choose_method(n1, n2, xy, method) 

468 

469 # Follows [2] 

470 ranks = stats.rankdata(xy, axis=-1) # method 2, step 1 

471 R1 = ranks[..., :n1].sum(axis=-1) # method 2, step 2 

472 U1 = R1 - n1*(n1+1)/2 # method 2, step 3 

473 U2 = n1 * n2 - U1 # as U1 + U2 = n1 * n2 

474 

475 if alternative == "greater": 

476 U, f = U1, 1 # U is the statistic to use for p-value, f is a factor 

477 elif alternative == "less": 

478 U, f = U2, 1 # Due to symmetry, use SF of U2 rather than CDF of U1 

479 else: 

480 U, f = np.maximum(U1, U2), 2 # multiply SF by two for two-sided test 

481 

482 if method == "exact": 

483 p = _mwu_state.sf(U.astype(int), n1, n2) 

484 elif method == "asymptotic": 

485 z = _get_mwu_z(U, n1, n2, ranks, continuity=use_continuity) 

486 p = stats.norm.sf(z) 

487 p *= f 

488 

489 # Ensure that test statistic is not greater than 1 

490 # This could happen for exact test when U = m*n/2 

491 p = np.clip(p, 0, 1) 

492 

493 return MannwhitneyuResult(U1, p)