# Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_mstats_basic.py:
# 11% of 991 statements (coverage.py v7.3.2, created at 2023-12-12 06:31 +0000)

1""" 

2An extension of scipy.stats._stats_py to support masked arrays 

3 

4""" 

5# Original author (2007): Pierre GF Gerard-Marchant 

6 

7 

8__all__ = ['argstoarray', 

9 'count_tied_groups', 

10 'describe', 

11 'f_oneway', 'find_repeats','friedmanchisquare', 

12 'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis', 

13 'ks_twosamp', 'ks_2samp', 'kurtosis', 'kurtosistest', 

14 'ks_1samp', 'kstest', 

15 'linregress', 

16 'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign', 

17 'normaltest', 

18 'obrientransform', 

19 'pearsonr','plotting_positions','pointbiserialr', 

20 'rankdata', 

21 'scoreatpercentile','sem', 

22 'sen_seasonal_slopes','skew','skewtest','spearmanr', 

23 'siegelslopes', 'theilslopes', 

24 'tmax','tmean','tmin','trim','trimboth', 

25 'trimtail','trima','trimr','trimmed_mean','trimmed_std', 

26 'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp', 

27 'ttest_ind','ttest_rel','tvar', 

28 'variation', 

29 'winsorize', 

30 'brunnermunzel', 

31 ] 

32 

33import numpy as np 

34from numpy import ndarray 

35import numpy.ma as ma 

36from numpy.ma import masked, nomask 

37import math 

38 

39import itertools 

40import warnings 

41from collections import namedtuple 

42 

43from . import distributions 

44from scipy._lib._util import _rename_parameter, _contains_nan 

45from scipy._lib._bunch import _make_tuple_bunch 

46import scipy.special as special 

47import scipy.stats._stats_py 

48 

49from ._stats_mstats_common import ( 

50 _find_repeats, 

51 linregress as stats_linregress, 

52 LinregressResult as stats_LinregressResult, 

53 theilslopes as stats_theilslopes, 

54 siegelslopes as stats_siegelslopes 

55 ) 

56 

57def _chk_asarray(a, axis): 

58 # Always returns a masked array, raveled for axis=None 

59 a = ma.asanyarray(a) 

60 if axis is None: 

61 a = ma.ravel(a) 

62 outaxis = 0 

63 else: 

64 outaxis = axis 

65 return a, outaxis 

66 

67 

68def _chk2_asarray(a, b, axis): 

69 a = ma.asanyarray(a) 

70 b = ma.asanyarray(b) 

71 if axis is None: 

72 a = ma.ravel(a) 

73 b = ma.ravel(b) 

74 outaxis = 0 

75 else: 

76 outaxis = axis 

77 return a, b, outaxis 

78 

79 

80def _chk_size(a, b): 

81 a = ma.asanyarray(a) 

82 b = ma.asanyarray(b) 

83 (na, nb) = (a.size, b.size) 

84 if na != nb: 

85 raise ValueError("The size of the input array should match!" 

86 " (%s <> %s)" % (na, nb)) 

87 return (a, b, na) 

88 

89 

90def argstoarray(*args): 

91 """ 

92 Constructs a 2D array from a group of sequences. 

93 

94 Sequences are filled with missing values to match the length of the longest 

95 sequence. 

96 

97 Parameters 

98 ---------- 

99 *args : sequences 

100 Group of sequences. 

101 

102 Returns 

103 ------- 

104 argstoarray : MaskedArray 

105 A ( `m` x `n` ) masked array, where `m` is the number of arguments and 

106 `n` the length of the longest argument. 

107 

108 Notes 

109 ----- 

110 `numpy.ma.row_stack` has identical behavior, but is called with a sequence 

111 of sequences. 

112 

113 Examples 

114 -------- 

115 A 2D masked array constructed from a group of sequences is returned. 

116 

117 >>> from scipy.stats.mstats import argstoarray 

118 >>> argstoarray([1, 2, 3], [4, 5, 6]) 

119 masked_array( 

120 data=[[1.0, 2.0, 3.0], 

121 [4.0, 5.0, 6.0]], 

122 mask=[[False, False, False], 

123 [False, False, False]], 

124 fill_value=1e+20) 

125 

126 The returned masked array filled with missing values when the lengths of 

127 sequences are different. 

128 

129 >>> argstoarray([1, 3], [4, 5, 6]) 

130 masked_array( 

131 data=[[1.0, 3.0, --], 

132 [4.0, 5.0, 6.0]], 

133 mask=[[False, False, True], 

134 [False, False, False]], 

135 fill_value=1e+20) 

136 

137 """ 

138 if len(args) == 1 and not isinstance(args[0], ndarray): 

139 output = ma.asarray(args[0]) 

140 if output.ndim != 2: 

141 raise ValueError("The input should be 2D") 

142 else: 

143 n = len(args) 

144 m = max([len(k) for k in args]) 

145 output = ma.array(np.empty((n,m), dtype=float), mask=True) 

146 for (k,v) in enumerate(args): 

147 output[k,:len(v)] = v 

148 

149 output[np.logical_not(np.isfinite(output._data))] = masked 

150 return output 

151 

152 

153def find_repeats(arr): 

154 """Find repeats in arr and return a tuple (repeats, repeat_count). 

155 

156 The input is cast to float64. Masked values are discarded. 

157 

158 Parameters 

159 ---------- 

160 arr : sequence 

161 Input array. The array is flattened if it is not 1D. 

162 

163 Returns 

164 ------- 

165 repeats : ndarray 

166 Array of repeated values. 

167 counts : ndarray 

168 Array of counts. 

169 

170 Examples 

171 -------- 

172 >>> from scipy.stats import mstats 

173 >>> mstats.find_repeats([2, 1, 2, 3, 2, 2, 5]) 

174 (array([2.]), array([4])) 

175 

176 In the above example, 2 repeats 4 times. 

177 

178 >>> mstats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]]) 

179 (array([4., 5.]), array([2, 2])) 

180 

181 In the above example, both 4 and 5 repeat 2 times. 

182 

183 """ 

184 # Make sure we get a copy. ma.compressed promises a "new array", but can 

185 # actually return a reference. 

186 compr = np.asarray(ma.compressed(arr), dtype=np.float64) 

187 try: 

188 need_copy = np.may_share_memory(compr, arr) 

189 except AttributeError: 

190 # numpy < 1.8.2 bug: np.may_share_memory([], []) raises, 

191 # while in numpy 1.8.2 and above it just (correctly) returns False. 

192 need_copy = False 

193 if need_copy: 

194 compr = compr.copy() 

195 return _find_repeats(compr) 

196 

197 

198def count_tied_groups(x, use_missing=False): 

199 """ 

200 Counts the number of tied values. 

201 

202 Parameters 

203 ---------- 

204 x : sequence 

205 Sequence of data on which to counts the ties 

206 use_missing : bool, optional 

207 Whether to consider missing values as tied. 

208 

209 Returns 

210 ------- 

211 count_tied_groups : dict 

212 Returns a dictionary (nb of ties: nb of groups). 

213 

214 Examples 

215 -------- 

216 >>> from scipy.stats import mstats 

217 >>> import numpy as np 

218 >>> z = [0, 0, 0, 2, 2, 2, 3, 3, 4, 5, 6] 

219 >>> mstats.count_tied_groups(z) 

220 {2: 1, 3: 2} 

221 

222 In the above example, the ties were 0 (3x), 2 (3x) and 3 (2x). 

223 

224 >>> z = np.ma.array([0, 0, 1, 2, 2, 2, 3, 3, 4, 5, 6]) 

225 >>> mstats.count_tied_groups(z) 

226 {2: 2, 3: 1} 

227 >>> z[[1,-1]] = np.ma.masked 

228 >>> mstats.count_tied_groups(z, use_missing=True) 

229 {2: 2, 3: 1} 

230 

231 """ 

232 nmasked = ma.getmask(x).sum() 

233 # We need the copy as find_repeats will overwrite the initial data 

234 data = ma.compressed(x).copy() 

235 (ties, counts) = find_repeats(data) 

236 nties = {} 

237 if len(ties): 

238 nties = dict(zip(np.unique(counts), itertools.repeat(1))) 

239 nties.update(dict(zip(*find_repeats(counts)))) 

240 

241 if nmasked and use_missing: 

242 try: 

243 nties[nmasked] += 1 

244 except KeyError: 

245 nties[nmasked] = 1 

246 

247 return nties 

248 

249 

250def rankdata(data, axis=None, use_missing=False): 

251 """Returns the rank (also known as order statistics) of each data point 

252 along the given axis. 

253 

254 If some values are tied, their rank is averaged. 

255 If some values are masked, their rank is set to 0 if use_missing is False, 

256 or set to the average rank of the unmasked values if use_missing is True. 

257 

258 Parameters 

259 ---------- 

260 data : sequence 

261 Input data. The data is transformed to a masked array 

262 axis : {None,int}, optional 

263 Axis along which to perform the ranking. 

264 If None, the array is first flattened. An exception is raised if 

265 the axis is specified for arrays with a dimension larger than 2 

266 use_missing : bool, optional 

267 Whether the masked values have a rank of 0 (False) or equal to the 

268 average rank of the unmasked values (True). 
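
    Examples
    --------
    A small sketch with hand-picked values (chosen here for illustration;
    the exact output formatting may differ between NumPy versions). Tied
    values share an averaged rank, and the masked entry gets rank 0 unless
    ``use_missing=True``:

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> a = np.ma.array([10, 20, 20, 30], mask=[0, 0, 0, 1])
    >>> mstats.rankdata(a)
    array([1. , 2.5, 2.5, 0. ])
    >>> mstats.rankdata(a, use_missing=True)
    array([1. , 2.5, 2.5, 2. ])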

    """
    def _rank1d(data, use_missing=False):
        n = data.count()
        rk = np.empty(data.size, dtype=float)
        idx = data.argsort()
        rk[idx[:n]] = np.arange(1, n+1)

        if use_missing:
            rk[idx[n:]] = (n+1)/2.
        else:
            rk[idx[n:]] = 0

        repeats = find_repeats(data.copy())
        for r in repeats[0]:
            condition = (data == r).filled(False)
            rk[condition] = rk[condition].mean()
        return rk

    data = ma.array(data, copy=False)
    if axis is None:
        if data.ndim > 1:
            return _rank1d(data.ravel(), use_missing).reshape(data.shape)
        else:
            return _rank1d(data, use_missing)
    else:
        return ma.apply_along_axis(_rank1d, axis, data, use_missing).view(ndarray)


ModeResult = namedtuple('ModeResult', ('mode', 'count'))


def mode(a, axis=0):
    """
    Returns an array of the modal (most common) value in the passed array.

    Parameters
    ----------
    a : array_like
        n-dimensional array of which to find mode(s).
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over
        the whole array `a`.

    Returns
    -------
    mode : ndarray
        Array of modal values.
    count : ndarray
        Array of counts for each mode.

    Notes
    -----
    For more details, see `scipy.stats.mode`.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> from scipy.stats import mstats
    >>> m_arr = np.ma.array([1, 1, 0, 0, 0, 0], mask=[0, 0, 1, 1, 1, 0])
    >>> mstats.mode(m_arr)  # note that most zeros are masked
    ModeResult(mode=array([1.]), count=array([2.]))

    """
    return _mode(a, axis=axis, keepdims=True)


def _mode(a, axis=0, keepdims=True):
    # Don't want to expose `keepdims` from the public `mstats.mode`
    a, axis = _chk_asarray(a, axis)

    def _mode1D(a):
        (rep, cnt) = find_repeats(a)
        if not cnt.ndim:
            return (0, 0)
        elif cnt.size:
            return (rep[cnt.argmax()], cnt.max())
        else:
            return (a.min(), 1)

    if axis is None:
        output = _mode1D(ma.ravel(a))
        output = (ma.array(output[0]), ma.array(output[1]))
    else:
        output = ma.apply_along_axis(_mode1D, axis, a)
        if keepdims is None or keepdims:
            newshape = list(a.shape)
            newshape[axis] = 1
            slices = [slice(None)] * output.ndim
            slices[axis] = 0
            modes = output[tuple(slices)].reshape(newshape)
            slices[axis] = 1
            counts = output[tuple(slices)].reshape(newshape)
            output = (modes, counts)
        else:
            output = np.moveaxis(output, axis, 0)

    return ModeResult(*output)


def _betai(a, b, x):
    x = np.asanyarray(x)
    x = ma.where(x < 1.0, x, 1.0)  # if x > 1 then return 1.0
    return special.betainc(a, b, x)


def msign(x):
    """Returns the sign of x, or 0 if x is masked."""
    return ma.filled(np.sign(x), 0)


def pearsonr(x, y):
    r"""
    Pearson correlation coefficient and p-value for testing non-correlation.

    The Pearson correlation coefficient [1]_ measures the linear relationship
    between two datasets. The calculation of the p-value relies on the
    assumption that each dataset is normally distributed. (See Kowalski [3]_
    for a discussion of the effects of non-normality of the input on the
    distribution of the correlation coefficient.) Like other correlation
    coefficients, this one varies between -1 and +1 with 0 implying no
    correlation. Correlations of -1 or +1 imply an exact linear relationship.

    Parameters
    ----------
    x : (N,) array_like
        Input array.
    y : (N,) array_like
        Input array.

    Returns
    -------
    r : float
        Pearson's correlation coefficient.
    p-value : float
        Two-tailed p-value.

    Warns
    -----
    PearsonRConstantInputWarning
        Raised if an input is a constant array. The correlation coefficient
        is not defined in this case, so ``np.nan`` is returned.

    PearsonRNearConstantInputWarning
        Raised if an input is "nearly" constant. The array ``x`` is considered
        nearly constant if ``norm(x - mean(x)) < 1e-13 * abs(mean(x))``.
        Numerical errors in the calculation ``x - mean(x)`` in this case might
        result in an inaccurate calculation of r.

    See Also
    --------
    spearmanr : Spearman rank-order correlation coefficient.
    kendalltau : Kendall's tau, a correlation measure for ordinal data.

    Notes
    -----
    The correlation coefficient is calculated as follows:

    .. math::

        r = \frac{\sum (x - m_x) (y - m_y)}
                 {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}

    where :math:`m_x` is the mean of the vector x and :math:`m_y` is
    the mean of the vector y.

    Under the assumption that x and y are drawn from independent normal
    distributions (so the population correlation coefficient is 0), the
    probability density function of the sample correlation coefficient r
    is ([1]_, [2]_):

    .. math::

        f(r) = \frac{{(1-r^2)}^{n/2-2}}{\mathrm{B}(\frac{1}{2},\frac{n}{2}-1)}

    where n is the number of samples, and B is the beta function. This
    is sometimes referred to as the exact distribution of r. This is
    the distribution that is used in `pearsonr` to compute the p-value.
    The distribution is a beta distribution on the interval [-1, 1],
    with equal shape parameters a = b = n/2 - 1. In terms of SciPy's
    implementation of the beta distribution, the distribution of r is::

        dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)

    The p-value returned by `pearsonr` is a two-sided p-value. The p-value
    roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. More precisely, for a
    given sample with correlation coefficient r, the p-value is
    the probability that abs(r') of a random sample x' and y' drawn from
    the population with zero correlation would be greater than or equal
    to abs(r). In terms of the object ``dist`` shown above, the p-value
    for a given r and length n can be computed as::

        p = 2*dist.cdf(-abs(r))

    When n is 2, the above continuous distribution is not well-defined.
    One can interpret the limit of the beta distribution as the shape
    parameters a and b approach a = b = 0 as a discrete distribution with
    equal probability masses at r = 1 and r = -1. More directly, one
    can observe that, given the data x = [x1, x2] and y = [y1, y2], and
    assuming x1 != x2 and y1 != y2, the only possible values for r are 1
    and -1. Because abs(r') for any sample x' and y' with length 2 will
    be 1, the two-sided p-value for a sample of length 2 is always 1.

    References
    ----------
    .. [1] "Pearson correlation coefficient", Wikipedia,
           https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
    .. [2] Student, "Probable error of a correlation coefficient",
           Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310.
    .. [3] C. J. Kowalski, "On the Effects of Non-Normality on the
           Distribution of the Sample Product-Moment Correlation Coefficient"
           Journal of the Royal Statistical Society. Series C (Applied
           Statistics), Vol. 21, No. 1 (1972), pp. 1-12.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> from scipy.stats import mstats
    >>> mstats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4])
    (-0.7426106572325057, 0.1505558088534455)

    There is a linear dependence between x and y if y = a + b*x + e, where
    a, b are constants and e is a random error term, assumed to be
    independent of x. For simplicity, assume that x is standard normal, a=0,
    b=1 and let e follow a normal distribution with mean zero and standard
    deviation s>0.

    >>> s = 0.5
    >>> x = stats.norm.rvs(size=500)
    >>> e = stats.norm.rvs(scale=s, size=500)
    >>> y = x + e
    >>> mstats.pearsonr(x, y)
    (0.9029601878969703, 8.428978827629898e-185) # may vary

    This should be close to the exact value given by

    >>> 1/np.sqrt(1 + s**2)
    0.8944271909999159

    For s=0.5, we observe a high level of correlation. In general, a large
    variance of the noise reduces the correlation, while the correlation
    approaches one as the variance of the error goes to zero.

    It is important to keep in mind that no correlation does not imply
    independence unless (x, y) is jointly normal. Correlation can even be
    zero when there is a very simple dependence structure: if x follows a
    standard normal distribution, let y = abs(x). Note that the correlation
    between x and y is zero. Indeed, since the expectation of x is zero,
    cov(x, y) = E[x*y]. By definition, this equals E[x*abs(x)], which is zero
    by symmetry. The following lines of code illustrate this observation:

    >>> y = np.abs(x)
    >>> mstats.pearsonr(x, y)
    (-0.016172891856853524, 0.7182823678751942) # may vary

    A non-zero correlation coefficient can be misleading. For example, if x
    has a standard normal distribution, define y = x if x < 0 and y = 0
    otherwise. A simple calculation shows that corr(x, y) = sqrt(2/Pi) =
    0.797..., implying a high level of correlation:

    >>> y = np.where(x < 0, x, 0)
    >>> mstats.pearsonr(x, y)
    (0.8537091583771509, 3.183461621422181e-143) # may vary

    This is counterintuitive, since there is no dependence of x and y if x
    is larger than zero, which happens in about half of the cases if we
    sample x and y.
    """
    (x, y, n) = _chk_size(x, y)
    (x, y) = (x.ravel(), y.ravel())
    # Get the common mask and the total nb of unmasked elements
    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    n -= m.sum()
    df = n - 2
    if df < 0:
        return (masked, masked)

    return scipy.stats._stats_py.pearsonr(ma.masked_array(x, mask=m).compressed(),
                                          ma.masked_array(y, mask=m).compressed())


def spearmanr(x, y=None, use_ties=True, axis=None, nan_policy='propagate',
              alternative='two-sided'):
    """
    Calculates a Spearman rank-order correlation coefficient and the p-value
    to test for non-correlation.

    The Spearman correlation is a nonparametric measure of the monotonic
    relationship between two datasets. Unlike the Pearson correlation, the
    Spearman correlation does not assume that both datasets are normally
    distributed. Like other correlation coefficients, this one varies
    between -1 and +1 with 0 implying no correlation. Correlations of -1 or
    +1 imply a monotonic relationship. Positive correlations imply that
    as `x` increases, so does `y`. Negative correlations imply that as `x`
    increases, `y` decreases.

    Missing values are discarded pair-wise: if a value is missing in `x`, the
    corresponding value in `y` is masked.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Spearman correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    Parameters
    ----------
    x, y : 1D or 2D array_like, y is optional
        One or two 1-D or 2-D arrays containing multiple variables and
        observations. When these are 1-D, each represents a vector of
        observations of a single variable. For the behavior in the 2-D case,
        see under ``axis``, below.
    use_ties : bool, optional
        DO NOT USE. Does not do anything, keyword is only left in place for
        backwards compatibility reasons.
    axis : int or None, optional
        If axis=0 (default), then each column represents a variable, with
        observations in the rows. If axis=1, the relationship is transposed:
        each row represents a variable, while the columns contain
        observations. If axis=None, then both arrays will be raveled.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns
        nan, 'raise' throws an error, 'omit' performs the calculations
        ignoring nan values. Default is 'propagate'.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the correlation is nonzero
        * 'less': the correlation is negative (less than zero)
        * 'greater': the correlation is positive (greater than zero)

        .. versionadded:: 1.7.0

    Returns
    -------
    res : SignificanceResult
        An object containing attributes:

        statistic : float or ndarray (2-D square)
            Spearman correlation matrix or correlation coefficient (if only 2
            variables are given as parameters). Correlation matrix is square
            with length equal to total number of variables (columns or rows)
            in ``a`` and ``b`` combined.
        pvalue : float
            The p-value for a hypothesis test whose null hypothesis
            is that two sets of data are uncorrelated. See `alternative`
            above for alternative hypotheses. `pvalue` has the same shape
            as `statistic`.

    References
    ----------
    [CRCProbStat2000] section 14.7
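
    Examples
    --------
    A short sketch (data chosen for illustration; ``print`` with rounding is
    used so the output does not depend on the NumPy scalar repr, and the
    digits shown assume the usual rank/t-distribution computation):

    >>> from scipy.stats import mstats
    >>> res = mstats.spearmanr([1, 2, 3, 4, 5], [5, 6, 7, 8, 7])
    >>> print(round(float(res.statistic), 7), round(float(res.pvalue), 7))
    0.8207827 0.088587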

    """
    if not use_ties:
        raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")

    # Always returns a masked array, raveled if axis=None
    x, axisout = _chk_asarray(x, axis)
    if y is not None:
        # Deal only with 2-D `x` case.
        y, _ = _chk_asarray(y, axis)
        if axisout == 0:
            x = ma.column_stack((x, y))
        else:
            x = ma.row_stack((x, y))

    if axisout == 1:
        # To simplify the code that follows (always use `n_obs, n_vars` shape)
        x = x.T

    if nan_policy == 'omit':
        x = ma.masked_invalid(x)

    def _spearmanr_2cols(x):
        # Mask the same observations for all variables, and then drop those
        # observations (can't leave them masked, rankdata is weird).
        x = ma.mask_rowcols(x, axis=0)
        x = x[~x.mask.any(axis=1), :]

        # If either column is entirely NaN or Inf
        if not np.any(x.data):
            res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
            res.correlation = np.nan
            return res

        m = ma.getmask(x)
        n_obs = x.shape[0]
        dof = n_obs - 2 - int(m.sum(axis=0)[0])
        if dof < 0:
            raise ValueError("The input must have at least 3 entries!")

        # Gets the ranks and rank differences
        x_ranked = rankdata(x, axis=0)
        rs = ma.corrcoef(x_ranked, rowvar=False).data

        # rs can have elements equal to 1, so avoid zero division warnings
        with np.errstate(divide='ignore'):
            # clip the small negative values possibly caused by rounding
            # errors before taking the square root
            t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))

        t, prob = scipy.stats._stats_py._ttest_finish(dof, t, alternative)

        # For backwards compatibility, return scalars when comparing 2 columns
        if rs.shape == (2, 2):
            res = scipy.stats._stats_py.SignificanceResult(rs[1, 0],
                                                           prob[1, 0])
            res.correlation = rs[1, 0]
            return res
        else:
            res = scipy.stats._stats_py.SignificanceResult(rs, prob)
            res.correlation = rs
            return res

    # Need to do this per pair of variables, otherwise the dropped
    # observations in a third column mess up the result for a pair.
    n_vars = x.shape[1]
    if n_vars == 2:
        return _spearmanr_2cols(x)
    else:
        rs = np.ones((n_vars, n_vars), dtype=float)
        prob = np.zeros((n_vars, n_vars), dtype=float)
        for var1 in range(n_vars - 1):
            for var2 in range(var1 + 1, n_vars):
                result = _spearmanr_2cols(x[:, [var1, var2]])
                rs[var1, var2] = result.correlation
                rs[var2, var1] = result.correlation
                prob[var1, var2] = result.pvalue
                prob[var2, var1] = result.pvalue

        res = scipy.stats._stats_py.SignificanceResult(rs, prob)
        res.correlation = rs
        return res


def _kendall_p_exact(n, c, alternative='two-sided'):

    # Use the fact that distribution is symmetric: always calculate a CDF in
    # the left tail.
    # This will be the one-sided p-value if `c` is on the side of
    # the null distribution predicted by the alternative hypothesis.
    # The two-sided p-value will be twice this value.
    # If `c` is on the other side of the null distribution, we'll need to
    # take the complement and add back the probability mass at `c`.
    in_right_tail = (c >= (n*(n-1))//2 - c)
    alternative_greater = (alternative == 'greater')
    c = int(min(c, (n*(n-1))//2 - c))

    # Exact p-value, see Maurice G. Kendall, "Rank Correlation Methods"
    # (4th Edition), Charles Griffin & Co., 1970.
    if n <= 0:
        raise ValueError(f'n ({n}) must be positive')
    elif c < 0 or 4*c > n*(n-1):
        raise ValueError(f'c ({c}) must satisfy 0 <= 4c <= n(n-1) = {n*(n-1)}.')
    elif n == 1:
        prob = 1.0
        p_mass_at_c = 1
    elif n == 2:
        prob = 1.0
        p_mass_at_c = 0.5
    elif c == 0:
        prob = 2.0/math.factorial(n) if n < 171 else 0.0
        p_mass_at_c = prob/2
    elif c == 1:
        prob = 2.0/math.factorial(n-1) if n < 172 else 0.0
        p_mass_at_c = (n-1)/math.factorial(n)
    elif 4*c == n*(n-1) and alternative == 'two-sided':
        # I'm sure there's a simple formula for p_mass_at_c in this
        # case, but I don't know it. Use generic formula for one-sided
        # p-value.
        prob = 1.0
    elif n < 171:
        new = np.zeros(c+1)
        new[0:2] = 1.0
        for j in range(3, n+1):
            new = np.cumsum(new)
            if j <= c:
                new[j:] -= new[:c+1-j]
        prob = 2.0*np.sum(new)/math.factorial(n)
        p_mass_at_c = new[-1]/math.factorial(n)
    else:
        new = np.zeros(c+1)
        new[0:2] = 1.0
        for j in range(3, n+1):
            new = np.cumsum(new)/j
            if j <= c:
                new[j:] -= new[:c+1-j]
        prob = np.sum(new)
        p_mass_at_c = new[-1]/2

    if alternative != 'two-sided':
        # if the alternative hypothesis and alternative agree,
        # one-sided p-value is half the two-sided p-value
        if in_right_tail == alternative_greater:
            prob /= 2
        else:
            prob = 1 - prob/2 + p_mass_at_c

    prob = np.clip(prob, 0, 1)

    return prob


def kendalltau(x, y, use_ties=True, use_missing=False, method='auto',
               alternative='two-sided'):
    """
    Computes Kendall's rank correlation tau on two variables *x* and *y*.

    Parameters
    ----------
    x : sequence
        First data list (for example, time).
    y : sequence
        Second data list.
    use_ties : {True, False}, optional
        Whether a tie correction should be performed.
    use_missing : {False, True}, optional
        Whether missing data should be allocated a rank of 0 (False) or the
        average rank (True).
    method : {'auto', 'asymptotic', 'exact'}, optional
        Defines which method is used to calculate the p-value [1]_.
        'asymptotic' uses a normal approximation valid for large samples.
        'exact' computes the exact p-value, but can only be used if no ties
        are present. As the sample size increases, the 'exact' computation
        time may grow and the result may lose some precision.
        'auto' is the default and selects the appropriate
        method based on a trade-off between speed and accuracy.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the rank correlation is nonzero
        * 'less': the rank correlation is negative (less than zero)
        * 'greater': the rank correlation is positive (greater than zero)

    Returns
    -------
    res : SignificanceResult
        An object containing attributes:

        statistic : float
            The tau statistic.
        pvalue : float
            The p-value for a hypothesis test whose null hypothesis is
            an absence of association, tau = 0.

    References
    ----------
    .. [1] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
           Charles Griffin & Co., 1970.
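
    Examples
    --------
    A small sketch, reusing the data from the `scipy.stats.kendalltau`
    documentation (the asymptotic p-value is omitted here because its exact
    digits depend on the approximation used):

    >>> from scipy.stats import mstats
    >>> x1 = [12, 2, 1, 12, 2]
    >>> x2 = [1, 4, 7, 1, 0]
    >>> res = mstats.kendalltau(x1, x2)
    >>> print(round(float(res.statistic), 7))
    -0.4714045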

    """
    (x, y, n) = _chk_size(x, y)
    (x, y) = (x.flatten(), y.flatten())
    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    if m is not nomask:
        x = ma.array(x, mask=m, copy=True)
        y = ma.array(y, mask=m, copy=True)
        # need int() here, otherwise numpy defaults to 32 bit
        # integer on all Windows architectures, causing overflow.
        # int() will keep it at infinite precision.
        n -= int(m.sum())

    if n < 2:
        res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
        res.correlation = np.nan
        return res

    rx = ma.masked_equal(rankdata(x, use_missing=use_missing), 0)
    ry = ma.masked_equal(rankdata(y, use_missing=use_missing), 0)
    idx = rx.argsort()
    (rx, ry) = (rx[idx], ry[idx])
    C = np.sum([((ry[i+1:] > ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum()
                for i in range(len(ry)-1)], dtype=float)
    D = np.sum([((ry[i+1:] < ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum()
                for i in range(len(ry)-1)], dtype=float)
    xties = count_tied_groups(x)
    yties = count_tied_groups(y)
    if use_ties:
        corr_x = np.sum([v*k*(k-1) for (k, v) in xties.items()], dtype=float)
        corr_y = np.sum([v*k*(k-1) for (k, v) in yties.items()], dtype=float)
        denom = ma.sqrt((n*(n-1) - corr_x)/2. * (n*(n-1) - corr_y)/2.)
    else:
        denom = n*(n-1)/2.
    tau = (C-D) / denom

    if method == 'exact' and (xties or yties):
        raise ValueError("Ties found, exact method cannot be used.")

    if method == 'auto':
        if (not xties and not yties) and (n <= 33 or
                                          min(C, n*(n-1)/2.0 - C) <= 1):
            method = 'exact'
        else:
            method = 'asymptotic'

    if not xties and not yties and method == 'exact':
        prob = _kendall_p_exact(n, C, alternative)

    elif method == 'asymptotic':
        var_s = n*(n-1)*(2*n+5)
        if use_ties:
            var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k, v) in xties.items()])
            var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k, v) in yties.items()])
            v1 = (np.sum([v*k*(k-1) for (k, v) in xties.items()], dtype=float) *
                  np.sum([v*k*(k-1) for (k, v) in yties.items()], dtype=float))
            v1 /= 2.*n*(n-1)
            if n > 2:
                v2 = (np.sum([v*k*(k-1)*(k-2) for (k, v) in xties.items()],
                             dtype=float) *
                      np.sum([v*k*(k-1)*(k-2) for (k, v) in yties.items()],
                             dtype=float))
                v2 /= 9.*n*(n-1)*(n-2)
            else:
                v2 = 0
        else:
            v1 = v2 = 0

        var_s /= 18.
        var_s += (v1 + v2)
        z = (C-D)/np.sqrt(var_s)
        _, prob = scipy.stats._stats_py._normtest_finish(z, alternative)
    else:
        raise ValueError("Unknown method " + str(method) + " specified, "
                         "please use auto, exact or asymptotic.")

    res = scipy.stats._stats_py.SignificanceResult(tau, prob)
    res.correlation = tau
    return res


def kendalltau_seasonal(x):
    """
    Computes a multivariate Kendall's rank correlation tau, for seasonal data.

    Parameters
    ----------
    x : 2-D ndarray
        Array of seasonal data, with seasons in columns.
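
    Examples
    --------
    The function returns a dictionary of seasonal and global Kendall
    statistics and p-values. A minimal sketch showing its keys (the data are
    arbitrary illustration values; the statistics themselves are omitted):

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> x = np.array([[1., 2.], [3., 1.], [2., 4.], [4., 3.]])
    >>> result = mstats.kendalltau_seasonal(x)
    >>> sorted(result.keys())
    ['chi2 total', 'chi2 trend', 'global p-value (dep)', 'global p-value (indep)', 'global tau', 'global tau (alt)', 'seasonal p-value', 'seasonal tau']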

    """
    x = ma.array(x, subok=True, copy=False, ndmin=2)
    (n, m) = x.shape
    n_p = x.count(0)

    S_szn = sum(msign(x[i:]-x[i]).sum(0) for i in range(n))
    S_tot = S_szn.sum()

    n_tot = x.count()
    ties = count_tied_groups(x.compressed())
    corr_ties = sum(v*k*(k-1) for (k, v) in ties.items())
    denom_tot = ma.sqrt(1.*n_tot*(n_tot-1)*(n_tot*(n_tot-1)-corr_ties))/2.

    R = rankdata(x, axis=0, use_missing=True)
    K = ma.empty((m, m), dtype=int)
    covmat = ma.empty((m, m), dtype=float)
    denom_szn = ma.empty(m, dtype=float)
    for j in range(m):
        ties_j = count_tied_groups(x[:, j].compressed())
        corr_j = sum(v*k*(k-1) for (k, v) in ties_j.items())
        cmb = n_p[j]*(n_p[j]-1)
        for k in range(j, m, 1):
            K[j, k] = sum(msign((x[i:, j]-x[i, j])*(x[i:, k]-x[i, k])).sum()
                          for i in range(n))
            covmat[j, k] = (K[j, k] + 4*(R[:, j]*R[:, k]).sum() -
                            n*(n_p[j]+1)*(n_p[k]+1))/3.
            K[k, j] = K[j, k]
            covmat[k, j] = covmat[j, k]

        denom_szn[j] = ma.sqrt(cmb*(cmb-corr_j)) / 2.

    var_szn = covmat.diagonal()

    z_szn = msign(S_szn) * (abs(S_szn)-1) / ma.sqrt(var_szn)
    z_tot_ind = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(var_szn.sum())
    z_tot_dep = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(covmat.sum())

    prob_szn = special.erfc(abs(z_szn)/np.sqrt(2))
    prob_tot_ind = special.erfc(abs(z_tot_ind)/np.sqrt(2))
    prob_tot_dep = special.erfc(abs(z_tot_dep)/np.sqrt(2))

    chi2_tot = (z_szn*z_szn).sum()
    chi2_trd = m * z_szn.mean()**2
    output = {'seasonal tau': S_szn/denom_szn,
              'global tau': S_tot/denom_tot,
              'global tau (alt)': S_tot/denom_szn.sum(),
              'seasonal p-value': prob_szn,
              'global p-value (indep)': prob_tot_ind,
              'global p-value (dep)': prob_tot_dep,
              'chi2 total': chi2_tot,
              'chi2 trend': chi2_trd,
              }
    return output


PointbiserialrResult = namedtuple('PointbiserialrResult', ('correlation',
                                                           'pvalue'))


def pointbiserialr(x, y):
    """Calculates a point biserial correlation coefficient and its p-value.

    Parameters
    ----------
    x : array_like of bools
        Input array.
    y : array_like
        Input array.

    Returns
    -------
    correlation : float
        R value.
    pvalue : float
        Two-tailed p-value.

    Notes
    -----
    Missing values are considered pair-wise: if a value is missing in x,
    the corresponding value in y is masked.

    For more details on `pointbiserialr`, see `scipy.stats.pointbiserialr`.
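
    Examples
    --------
    A small sketch with unmasked illustration data (the correlation here can
    be checked by hand; ``print`` is used so the output does not depend on
    the NumPy scalar repr):

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> x = np.array([0, 0, 0, 1, 1, 1, 1], dtype=bool)
    >>> y = np.arange(7.)
    >>> res = mstats.pointbiserialr(x, y)
    >>> print(round(float(res.correlation), 7))
    0.8660254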

    """
    x = ma.fix_invalid(x, copy=True).astype(bool)
    y = ma.fix_invalid(y, copy=True).astype(float)
    # Get rid of the missing data
    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    if m is not nomask:
        unmask = np.logical_not(m)
        x = x[unmask]
        y = y[unmask]

    n = len(x)
    # phat is the fraction of x values that are True
    phat = x.sum() / float(n)
    y0 = y[~x]  # y-values where x is False
    y1 = y[x]  # y-values where x is True
    y0m = y0.mean()
    y1m = y1.mean()

    rpb = (y1m - y0m)*np.sqrt(phat * (1-phat)) / y.std()

    df = n - 2
    t = rpb*ma.sqrt(df/(1.0-rpb**2))
    prob = _betai(0.5*df, 0.5, df/(df+t*t))

    return PointbiserialrResult(rpb, prob)


def linregress(x, y=None):
    r"""
    Linear regression calculation.

    Note that the non-masked version is used, and that this docstring is
    replaced by the non-masked docstring + some info on missing data.
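
    A short masked-data sketch (values chosen for illustration): the fully
    masked fourth point is dropped before the unmasked `scipy.stats`
    regression runs, so the fit is exact on the remaining points.

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> x = np.ma.array([1., 2., 3., 4.], mask=[0, 0, 0, 1])
    >>> y = np.ma.array([2., 4., 6., 100.])
    >>> res = mstats.linregress(x, y)
    >>> print(float(res.slope), float(res.intercept))
    2.0 0.0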

    """
    if y is None:
        x = ma.array(x)
        if x.shape[0] == 2:
            x, y = x
        elif x.shape[1] == 2:
            x, y = x.T
        else:
            raise ValueError("If only `x` is given as input, "
                             "it has to be of shape (2, N) or (N, 2), "
                             f"provided shape was {x.shape}")
    else:
        x = ma.array(x)
        y = ma.array(y)

    x = x.flatten()
    y = y.flatten()

    if np.amax(x) == np.amin(x) and len(x) > 1:
        raise ValueError("Cannot calculate a linear regression "
                         "if all x values are identical")

    m = ma.mask_or(ma.getmask(x), ma.getmask(y), shrink=False)
    if m is not nomask:
        x = ma.array(x, mask=m)
        y = ma.array(y, mask=m)
        if np.any(~m):
            result = stats_linregress(x.data[~m], y.data[~m])
        else:
            # All data is masked
            result = stats_LinregressResult(slope=None, intercept=None,
                                            rvalue=None, pvalue=None,
                                            stderr=None,
                                            intercept_stderr=None)
    else:
        result = stats_linregress(x.data, y.data)

    return result


def theilslopes(y, x=None, alpha=0.95, method='separate'):
    r"""
    Computes the Theil-Sen estimator for a set of points (x, y).

    `theilslopes` implements a method for robust linear regression. It
    computes the slope as the median of all slopes between paired values.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    alpha : float, optional
        Confidence degree between 0 and 1. Default is 95% confidence.
        Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
        interpreted as "find the 90% confidence interval".
    method : {'joint', 'separate'}, optional
        Method to be used for computing the estimate of the intercept.
        The following methods are supported:

        * 'joint': Uses np.median(y - slope * x) as intercept.
        * 'separate': Uses np.median(y) - slope * np.median(x)
          as intercept.

        The default is 'separate'.

        .. versionadded:: 1.8.0

    Returns
    -------
    result : ``TheilslopesResult`` instance
        The return value is an object with the following attributes:

        slope : float
            Theil slope.
        intercept : float
            Intercept of the Theil line.
        low_slope : float
            Lower bound of the confidence interval on `slope`.
        high_slope : float
            Upper bound of the confidence interval on `slope`.

    See Also
    --------
    siegelslopes : a similar technique using repeated medians

    Notes
    -----
    For more details on `theilslopes`, see `scipy.stats.theilslopes`.
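
    Examples
    --------
    A masked-data sketch (values chosen so the slope is exact): the masked
    outlier is dropped before the unmasked `scipy.stats.theilslopes` runs.

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> y = np.ma.array([0., 1., 2., 30.], mask=[0, 0, 0, 1])
    >>> result = mstats.theilslopes(y)
    >>> print(float(result.slope), float(result.intercept))
    1.0 0.0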

    """
    y = ma.asarray(y).flatten()
    if x is None:
        x = ma.arange(len(y), dtype=float)
    else:
        x = ma.asarray(x).flatten()
        if len(x) != len(y):
            raise ValueError("Incompatible lengths! (%s<>%s)" % (len(y), len(x)))

    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    y._mask = x._mask = m
    # Disregard any masked elements of x or y
    y = y.compressed()
    x = x.compressed().astype(float)
    # We now have unmasked arrays so can use `scipy.stats.theilslopes`
    return stats_theilslopes(y, x, alpha=alpha, method=method)


def siegelslopes(y, x=None, method="hierarchical"):
    r"""
    Computes the Siegel estimator for a set of points (x, y).

    `siegelslopes` implements a method for robust linear regression
    using repeated medians to fit a line to the points (x, y).
    The method is robust to outliers with an asymptotic breakdown point
    of 50%.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    method : {'hierarchical', 'separate'}
        If 'hierarchical', estimate the intercept using the estimated
        slope ``slope`` (default option).
        If 'separate', estimate the intercept independent of the estimated
        slope. See Notes for details.

    Returns
    -------
    result : ``SiegelslopesResult`` instance
        The return value is an object with the following attributes:

        slope : float
            Estimate of the slope of the regression line.
        intercept : float
            Estimate of the intercept of the regression line.

    See Also
    --------
    theilslopes : a similar technique without repeated medians

    Notes
    -----
    For more details on `siegelslopes`, see `scipy.stats.siegelslopes`.
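
    Examples
    --------
    A masked-data sketch mirroring the `theilslopes` example above (values
    chosen so the repeated-median slope is exact):

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> y = np.ma.array([0., 2., 4., -90.], mask=[0, 0, 0, 1])
    >>> result = mstats.siegelslopes(y)
    >>> print(float(result.slope), float(result.intercept))
    2.0 0.0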

    """
    y = ma.asarray(y).ravel()
    if x is None:
        x = ma.arange(len(y), dtype=float)
    else:
        x = ma.asarray(x).ravel()
        if len(x) != len(y):
            raise ValueError("Incompatible lengths! (%s<>%s)" % (len(y), len(x)))

    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    y._mask = x._mask = m
    # Disregard any masked elements of x or y
    y = y.compressed()
    x = x.compressed().astype(float)
    # We now have unmasked arrays so can use `scipy.stats.siegelslopes`
    return stats_siegelslopes(y, x, method=method)


SenSeasonalSlopesResult = _make_tuple_bunch('SenSeasonalSlopesResult',
                                            ['intra_slope', 'inter_slope'])


def sen_seasonal_slopes(x):
    r"""
    Computes seasonal Theil-Sen and Kendall slope estimators.

    The seasonal generalization of Sen's slope computes the slopes between
    all pairs of values within a "season" (column) of a 2D array. It returns
    an array containing the median of these "within-season" slopes for each
    season (the Theil-Sen slope estimator of each season), and it returns the
    median of the within-season slopes across all seasons (the seasonal
    Kendall slope estimator).

    Parameters
    ----------
    x : 2D array_like
        Each column of `x` contains measurements of the dependent variable
        within a season. The independent variable (usually time) of each
        season is assumed to be ``np.arange(x.shape[0])``.

    Returns
    -------
    result : ``SenSeasonalSlopesResult`` instance
        The return value is an object with the following attributes:

        intra_slope : ndarray
            For each season, the Theil-Sen slope estimator: the median of
            within-season slopes.
        inter_slope : float
            The seasonal Kendall slope estimator: the median of
            within-season slopes *across all* seasons.

    See Also
    --------
    theilslopes : the analogous function for non-seasonal data
    scipy.stats.theilslopes : non-seasonal slopes for non-masked arrays

    Notes
    -----
    The slopes :math:`d_{ijk}` within season :math:`i` are:

    .. math::

        d_{ijk} = \frac{x_{ij} - x_{ik}}{j - k}

    for pairs of distinct integer indices :math:`j, k` of :math:`x`.

    Element :math:`i` of the returned `intra_slope` array is the median of
    the :math:`d_{ijk}` over all :math:`j < k`; this is the Theil-Sen slope
    estimator of season :math:`i`. The returned `inter_slope` value, better
    known as the seasonal Kendall slope estimator, is the median of the
    :math:`d_{ijk}` over all :math:`i, j, k`.

    References
    ----------
    .. [1] Hirsch, Robert M., James R. Slack, and Richard A. Smith.
           "Techniques of trend analysis for monthly water quality data."
           *Water Resources Research* 18.1 (1982): 107-121.

    Examples
    --------
    Suppose we have 100 observations of a dependent variable for each of four
    seasons:

    >>> import numpy as np
    >>> rng = np.random.default_rng()
    >>> x = rng.random(size=(100, 4))

    We compute the seasonal slopes as:

    >>> from scipy import stats
    >>> intra_slope, inter_slope = stats.mstats.sen_seasonal_slopes(x)

    If we define a function to compute all slopes between observations within
    a season:

    >>> def dijk(yi):
    ...     n = len(yi)
    ...     x = np.arange(n)
    ...     dy = yi - yi[:, np.newaxis]
    ...     dx = x - x[:, np.newaxis]
    ...     # we only want unique pairs of distinct indices
    ...     mask = np.triu(np.ones((n, n), dtype=bool), k=1)
    ...     return dy[mask]/dx[mask]

    then element ``i`` of ``intra_slope`` is the median of ``dijk(x[:, i])``:

    >>> i = 2
    >>> np.allclose(np.median(dijk(x[:, i])), intra_slope[i])
    True

    and ``inter_slope`` is the median of the values returned by ``dijk`` for
    all seasons:

    >>> all_slopes = np.concatenate([dijk(x[:, i]) for i in range(x.shape[1])])
    >>> np.allclose(np.median(all_slopes), inter_slope)
    True

    Because the data are randomly generated, we would expect the median
    slopes to be nearly zero both within and across all seasons, and indeed
    they are:

    >>> intra_slope.data
    array([ 0.00124504, -0.00277761, -0.00221245, -0.00036338])  # may vary
    >>> inter_slope
    -0.0010511779872922058  # may vary

    """
    x = ma.array(x, subok=True, copy=False, ndmin=2)
    (n, _) = x.shape
    # Get list of slopes per season
    szn_slopes = ma.vstack([(x[i+1:]-x[i])/np.arange(1, n-i)[:, None]
                            for i in range(n)])
    szn_medslopes = ma.median(szn_slopes, axis=0)
    medslope = ma.median(szn_slopes, axis=None)
    return SenSeasonalSlopesResult(szn_medslopes, medslope)


Ttest_1sampResult = namedtuple('Ttest_1sampResult', ('statistic', 'pvalue'))


def ttest_1samp(a, popmean, axis=0, alternative='two-sided'):
    """
    Calculates the T-test for the mean of ONE group of scores.

    Parameters
    ----------
    a : array_like
        Sample observation.
    popmean : float or array_like
        Expected value in the null hypothesis; if array_like, then it must
        have the same shape as `a` excluding the axis dimension.
    axis : int or None, optional
        Axis along which to compute test. If None, compute over the whole
        array `a`.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.
        The following options are available (default is 'two-sided'):

        * 'two-sided': the mean of the underlying distribution of the sample
          is different than the given population mean (`popmean`)
        * 'less': the mean of the underlying distribution of the sample is
          less than the given population mean (`popmean`)
        * 'greater': the mean of the underlying distribution of the sample is
          greater than the given population mean (`popmean`)

        .. versionadded:: 1.7.0

    Returns
    -------
    statistic : float or array
        t-statistic.
    pvalue : float or array
        The p-value.

    Notes
    -----
    For more details on `ttest_1samp`, see `scipy.stats.ttest_1samp`.
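
    Examples
    --------
    A deterministic sketch (values chosen so the statistic is exactly zero;
    the masked outlier is ignored):

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> a = np.ma.array([1., 2., 3., 4., 500.], mask=[0, 0, 0, 0, 1])
    >>> res = mstats.ttest_1samp(a, popmean=2.5)
    >>> print(float(res.statistic), float(res.pvalue))
    0.0 1.0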

    """
    a, axis = _chk_asarray(a, axis)
    if a.size == 0:
        return (np.nan, np.nan)

    x = a.mean(axis=axis)
    v = a.var(axis=axis, ddof=1)
    n = a.count(axis=axis)
    # force df to be an array for masked division not to throw a warning
    df = ma.asanyarray(n - 1.0)
    svar = ((n - 1.0) * v) / df
    with np.errstate(divide='ignore', invalid='ignore'):
        t = (x - popmean) / ma.sqrt(svar / n)

    t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
    return Ttest_1sampResult(t, prob)


ttest_onesamp = ttest_1samp


Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue'))


def ttest_ind(a, b, axis=0, equal_var=True, alternative='two-sided'):
    """
    Calculates the T-test for the means of TWO INDEPENDENT samples of scores.

    Parameters
    ----------
    a, b : array_like
        The arrays must have the same shape, except in the dimension
        corresponding to `axis` (the first, by default).
    axis : int or None, optional
        Axis along which to compute test. If None, compute over the whole
        arrays, `a`, and `b`.
    equal_var : bool, optional
        If True, perform a standard independent 2 sample test that assumes
        equal population variances.
        If False, perform Welch's t-test, which does not assume equal
        population variance.

        .. versionadded:: 0.17.0
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.
        The following options are available (default is 'two-sided'):

        * 'two-sided': the means of the distributions underlying the samples
          are unequal.
        * 'less': the mean of the distribution underlying the first sample
          is less than the mean of the distribution underlying the second
          sample.
        * 'greater': the mean of the distribution underlying the first
          sample is greater than the mean of the distribution underlying
          the second sample.

        .. versionadded:: 1.7.0

    Returns
    -------
    statistic : float or array
        The calculated t-statistic.
    pvalue : float or array
        The p-value.

    Notes
    -----
    For more details on `ttest_ind`, see `scipy.stats.ttest_ind`.
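
    Examples
    --------
    A deterministic sketch (the samples have equal means, so the statistic is
    exactly zero; the masked value in `b` is ignored):

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> a = np.ma.array([1., 2., 3., 4.])
    >>> b = np.ma.array([1., 2., 3., 4., 99.], mask=[0, 0, 0, 0, 1])
    >>> res = mstats.ttest_ind(a, b)
    >>> print(float(res.statistic), float(res.pvalue))
    0.0 1.0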

    """
    a, b, axis = _chk2_asarray(a, b, axis)

    if a.size == 0 or b.size == 0:
        return Ttest_indResult(np.nan, np.nan)

    (x1, x2) = (a.mean(axis), b.mean(axis))
    (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1))
    (n1, n2) = (a.count(axis), b.count(axis))

    if equal_var:
        # force df to be an array for masked division not to throw a warning
        df = ma.asanyarray(n1 + n2 - 2.0)
        svar = ((n1-1)*v1 + (n2-1)*v2) / df
        denom = ma.sqrt(svar*(1.0/n1 + 1.0/n2))  # n-D computation here!
    else:
        vn1 = v1/n1
        vn2 = v2/n2
        with np.errstate(divide='ignore', invalid='ignore'):
            df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))

        # If df is undefined, variances are zero.
        # It doesn't matter what df is as long as it is not NaN.
        df = np.where(np.isnan(df), 1, df)
        denom = ma.sqrt(vn1 + vn2)

    with np.errstate(divide='ignore', invalid='ignore'):
        t = (x1-x2) / denom

    t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
    return Ttest_indResult(t, prob)


Ttest_relResult = namedtuple('Ttest_relResult', ('statistic', 'pvalue'))


def ttest_rel(a, b, axis=0, alternative='two-sided'):
    """
    Calculates the T-test on TWO RELATED samples of scores, a and b.

    Parameters
    ----------
    a, b : array_like
        The arrays must have the same shape.
    axis : int or None, optional
        Axis along which to compute test. If None, compute over the whole
        arrays, `a`, and `b`.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.
        The following options are available (default is 'two-sided'):

        * 'two-sided': the means of the distributions underlying the samples
          are unequal.
        * 'less': the mean of the distribution underlying the first sample
          is less than the mean of the distribution underlying the second
          sample.
        * 'greater': the mean of the distribution underlying the first
          sample is greater than the mean of the distribution underlying
          the second sample.

        .. versionadded:: 1.7.0

    Returns
    -------
    statistic : float or array
        t-statistic.
    pvalue : float or array
        Two-tailed p-value.

    Notes
    -----
    For more details on `ttest_rel`, see `scipy.stats.ttest_rel`.
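
    Examples
    --------
    A deterministic sketch (the paired differences average to zero, so the
    statistic is exactly zero):

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> a = np.ma.array([1., 2., 3., 4.])
    >>> b = np.ma.array([2., 1., 4., 3.])
    >>> res = mstats.ttest_rel(a, b)
    >>> print(float(res.statistic), float(res.pvalue))
    0.0 1.0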

    """
    a, b, axis = _chk2_asarray(a, b, axis)
    if len(a) != len(b):
        raise ValueError('unequal length arrays')

    if a.size == 0 or b.size == 0:
        return Ttest_relResult(np.nan, np.nan)

    n = a.count(axis)
    df = ma.asanyarray(n - 1.0)
    d = (a-b).astype('d')
    dm = d.mean(axis)
    v = d.var(axis=axis, ddof=1)
    denom = ma.sqrt(v / n)
    with np.errstate(divide='ignore', invalid='ignore'):
        t = dm / denom

    t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
    return Ttest_relResult(t, prob)


MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic',
                                                       'pvalue'))


def mannwhitneyu(x, y, use_continuity=True):
    """
    Computes the Mann-Whitney statistic.

    Missing values in `x` and/or `y` are discarded.

    Parameters
    ----------
    x : sequence
        Input
    y : sequence
        Input
    use_continuity : {True, False}, optional
        Whether a continuity correction (1/2.) should be taken into account.

    Returns
    -------
    statistic : float
        The minimum of the Mann-Whitney statistics.
    pvalue : float
        Approximate two-sided p-value assuming a normal distribution.
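
    Examples
    --------
    A minimal sketch with non-overlapping samples (illustration values; the
    reported statistic is the smaller of the two U statistics, and the
    p-value is only a normal approximation):

    >>> from scipy.stats import mstats
    >>> res = mstats.mannwhitneyu([1., 2., 3.], [4., 5., 6.])
    >>> print(float(res.statistic))
    0.0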

1543 

1544 """ 

1545 x = ma.asarray(x).compressed().view(ndarray) 

1546 y = ma.asarray(y).compressed().view(ndarray) 

1547 ranks = rankdata(np.concatenate([x,y])) 

1548 (nx, ny) = (len(x), len(y)) 

1549 nt = nx + ny 

1550 U = ranks[:nx].sum() - nx*(nx+1)/2. 

1551 U = max(U, nx*ny - U) 

1552 u = nx*ny - U 

1553 

1554 mu = (nx*ny)/2. 

1555 sigsq = (nt**3 - nt)/12. 

1556 ties = count_tied_groups(ranks) 

1557 sigsq -= sum(v*(k**3-k) for (k,v) in ties.items())/12. 

1558 sigsq *= nx*ny/float(nt*(nt-1)) 

1559 

1560 if use_continuity: 

1561 z = (U - 1/2. - mu) / ma.sqrt(sigsq) 

1562 else: 

1563 z = (U - mu) / ma.sqrt(sigsq) 

1564 

1565 prob = special.erfc(abs(z)/np.sqrt(2)) 

1566 return MannwhitneyuResult(u, prob) 

1567 

1568 

1569KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue')) 

1570 

1571 

1572def kruskal(*args): 

1573 """ 

1574 Compute the Kruskal-Wallis H-test for independent samples 

1575 

1576 Parameters 

1577 ---------- 

1578 sample1, sample2, ... : array_like 

1579 Two or more arrays with the sample measurements can be given as 

1580 arguments. 

1581 

1582 Returns 

1583 ------- 

1584 statistic : float 

1585 The Kruskal-Wallis H statistic, corrected for ties 

1586 pvalue : float 

1587 The p-value for the test using the assumption that H has a chi 

1588 square distribution 

1589 

1590 Notes 

1591 ----- 

1592 For more details on `kruskal`, see `scipy.stats.kruskal`. 

1593 

1594 Examples 

1595 -------- 

1596 >>> from scipy.stats.mstats import kruskal 

1597 

1598 Random samples from three different brands of batteries were tested 

1599 to see how long the charge lasted. Results were as follows: 

1600 

1601 >>> a = [6.3, 5.4, 5.7, 5.2, 5.0] 

1602 >>> b = [6.9, 7.0, 6.1, 7.9] 

1603 >>> c = [7.2, 6.9, 6.1, 6.5] 

1604 

1605 Test the hypotesis that the distribution functions for all of the brands' 

1606 durations are identical. Use 5% level of significance. 

1607 

1608 >>> kruskal(a, b, c) 

1609 KruskalResult(statistic=7.113812154696133, pvalue=0.028526948491942164) 

1610 

1611 The null hypothesis is rejected at the 5% level of significance 

1612 because the returned p-value is less than the critical value of 5%. 

1613 

1614 """ 

1615 output = argstoarray(*args) 

1616 ranks = ma.masked_equal(rankdata(output, use_missing=False), 0) 

1617 sumrk = ranks.sum(-1) 

1618 ngrp = ranks.count(-1) 

1619 ntot = ranks.count() 

1620 H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1) 

1621 # Tie correction 

1622 ties = count_tied_groups(ranks) 

1623 T = 1. - sum(v*(k**3-k) for (k,v) in ties.items())/float(ntot**3-ntot) 

1624 if T == 0: 

1625 raise ValueError('All numbers are identical in kruskal') 

1626 

1627 H /= T 

1628 df = len(output) - 1 

1629 prob = distributions.chi2.sf(H, df) 

1630 return KruskalResult(H, prob) 

1631 

1632 

1633kruskalwallis = kruskal 

1634 

1635 

1636@_rename_parameter("mode", "method") 

1637def ks_1samp(x, cdf, args=(), alternative="two-sided", method='auto'): 

1638 """ 

1639 Computes the Kolmogorov-Smirnov test on one sample of masked values. 

1640 

1641 Missing values in `x` are discarded. 

1642 

1643 Parameters 

1644 ---------- 

1645 x : array_like 

1646 a 1-D array of observations of random variables. 

1647 cdf : str or callable 

1648 If a string, it should be the name of a distribution in `scipy.stats`. 

1649 If a callable, that callable is used to calculate the cdf. 

1650 args : tuple, sequence, optional 

1651 Distribution parameters, used if `cdf` is a string. 

1652 alternative : {'two-sided', 'less', 'greater'}, optional 

1653 Indicates the alternative hypothesis. Default is 'two-sided'. 

1654 method : {'auto', 'exact', 'asymp'}, optional 

1655 Defines the method used for calculating the p-value. 

1656 The following options are available (default is 'auto'): 

1657 

1658 * 'auto' : use 'exact' for small size arrays, 'asymp' for large 

1659 * 'exact' : use approximation to exact distribution of test statistic 

1660 * 'asymp' : use asymptotic distribution of test statistic 

1661 

1662 Returns 

1663 ------- 

1664 d : float 

1665 Value of the Kolmogorov-Smirnov statistic.

1666 p : float 

1667 Corresponding p-value. 

1668 

1669 """ 

1670 alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get( 

1671 alternative.lower()[0], alternative) 

1672 return scipy.stats._stats_py.ks_1samp( 

1673 x, cdf, args=args, alternative=alternative, method=method) 

1674 

1675 

1676@_rename_parameter("mode", "method") 

1677def ks_2samp(data1, data2, alternative="two-sided", method='auto'): 

1678 """ 

1679 Computes the Kolmogorov-Smirnov test on two samples. 

1680 

1681 Missing values in `x` and/or `y` are discarded. 

1682 

1683 Parameters 

1684 ---------- 

1685 data1 : array_like 

1686 First data set 

1687 data2 : array_like 

1688 Second data set 

1689 alternative : {'two-sided', 'less', 'greater'}, optional 

1690 Indicates the alternative hypothesis. Default is 'two-sided'. 

1691 method : {'auto', 'exact', 'asymp'}, optional 

1692 Defines the method used for calculating the p-value. 

1693 The following options are available (default is 'auto'): 

1694 

1695 * 'auto' : use 'exact' for small size arrays, 'asymp' for large 

1696 * 'exact' : use approximation to exact distribution of test statistic 

1697 * 'asymp' : use asymptotic distribution of test statistic 

1698 

1699 Returns 

1700 ------- 

1701 d : float 

1702 Value of the Kolmogorov-Smirnov statistic.

1703 p : float 

1704 Corresponding p-value. 

1705 

1706 """ 

1707 # Ideally this would be accomplished by 

1708 # ks_2samp = scipy.stats._stats_py.ks_2samp 

1709 # but the circular dependencies between _mstats_basic and stats prevent that. 

1710 alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get( 

1711 alternative.lower()[0], alternative) 

1712 return scipy.stats._stats_py.ks_2samp(data1, data2, 

1713 alternative=alternative, 

1714 method=method) 

1715 

1716 

1717ks_twosamp = ks_2samp 

1718 

1719 

1720@_rename_parameter("mode", "method") 

1721def kstest(data1, data2, args=(), alternative='two-sided', method='auto'): 

1722 """ 

1723 Performs the one-sample or two-sample Kolmogorov-Smirnov test.

1724 Parameters 

1725 ---------- 

1726 data1 : array_like 

1727 data2 : str, callable or array_like 

1728 args : tuple, sequence, optional 

1729 Distribution parameters, used if `data1` or `data2` are strings. 

1730 alternative : str, as documented in stats.kstest 

1731 method : str, as documented in stats.kstest 

1732 

1733 Returns 

1734 ------- 

1735 tuple of (K-S statistic, probability) 

1736 

1737 """ 

1738 return scipy.stats._stats_py.kstest(data1, data2, args, 

1739 alternative=alternative, method=method) 

1740 

1741 
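# A minimal usage sketch for `kstest` (illustrative, not part of the
# original module): the call simply dispatches to `scipy.stats.kstest`, so
# a one-sample test against a named distribution looks exactly like the
# non-masked version.
#
# >>> import numpy as np
# >>> rng = np.random.default_rng(12345)
# >>> stat, p = kstest(rng.standard_normal(100), 'norm')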

1742def trima(a, limits=None, inclusive=(True,True)): 

1743 """ 

1744 Trims an array by masking the data outside some given limits. 

1745 

1746 Returns a masked version of the input array. 

1747 

1748 Parameters 

1749 ---------- 

1750 a : array_like 

1751 Input array. 

1752 limits : {None, tuple}, optional 

1753 Tuple of (lower limit, upper limit) in absolute values. 

1754 Values of the input array lower (greater) than the lower (upper) limit 

1755 will be masked. A limit set to None indicates an open interval.

1756 inclusive : (bool, bool) tuple, optional 

1757 Tuple of (lower flag, upper flag), indicating whether values exactly 

1758 equal to the lower (upper) limit are allowed. 

1759 

1760 Examples 

1761 -------- 

1762 >>> from scipy.stats.mstats import trima 

1763 >>> import numpy as np 

1764 

1765 >>> a = np.arange(10) 

1766 

1767 The interval is left-closed and right-open, i.e., `[2, 8)`. 

1768 Trim the array by keeping only values in the interval. 

1769 

1770 >>> trima(a, limits=(2, 8), inclusive=(True, False)) 

1771 masked_array(data=[--, --, 2, 3, 4, 5, 6, 7, --, --], 

1772 mask=[ True, True, False, False, False, False, False, False, 

1773 True, True], 

1774 fill_value=999999) 

1775 

1776 """ 

1777 a = ma.asarray(a) 

1778 a.unshare_mask() 

1779 if (limits is None) or (limits == (None, None)): 

1780 return a 

1781 

1782 (lower_lim, upper_lim) = limits 

1783 (lower_in, upper_in) = inclusive 

1784 condition = False 

1785 if lower_lim is not None: 

1786 if lower_in: 

1787 condition |= (a < lower_lim) 

1788 else: 

1789 condition |= (a <= lower_lim) 

1790 

1791 if upper_lim is not None: 

1792 if upper_in: 

1793 condition |= (a > upper_lim) 

1794 else: 

1795 condition |= (a >= upper_lim) 

1796 

1797 a[condition.filled(True)] = masked 

1798 return a 

1799 

1800 

1801def trimr(a, limits=None, inclusive=(True, True), axis=None): 

1802 """ 

1803 Trims an array by masking some proportion of the data on each end. 

1804 Returns a masked version of the input array. 

1805 

1806 Parameters 

1807 ---------- 

1808 a : sequence 

1809 Input array. 

1810 limits : {None, tuple}, optional 

1811 Tuple of the percentages to cut on each side of the array, with respect 

1812 to the number of unmasked data, as floats between 0. and 1. 

1813 With n the number of unmasked data before trimming, the

1814 (n*limits[0])th smallest data and the (n*limits[1])th largest data are 

1815 masked, and the total number of unmasked data after trimming is 

1816 n*(1.-sum(limits)). The value of one limit can be set to None to 

1817 indicate an open interval. 

1818 inclusive : {(True,True) tuple}, optional 

1819 Tuple of flags indicating whether the number of data being masked on 

1820 the left (right) end should be truncated (True) or rounded (False) to 

1821 integers. 

1822 axis : {None,int}, optional 

1823 Axis along which to trim. If None, the whole array is trimmed, but its 

1824 shape is maintained. 

1825 

1826 """ 

1827 def _trimr1D(a, low_limit, up_limit, low_inclusive, up_inclusive): 

1828 n = a.count() 

1829 idx = a.argsort() 

1830 if low_limit: 

1831 if low_inclusive: 

1832 lowidx = int(low_limit*n) 

1833 else: 

1834 lowidx = int(np.round(low_limit*n)) 

1835 a[idx[:lowidx]] = masked 

1836 if up_limit is not None: 

1837 if up_inclusive: 

1838 upidx = n - int(n*up_limit) 

1839 else: 

1840 upidx = n - int(np.round(n*up_limit)) 

1841 a[idx[upidx:]] = masked 

1842 return a 

1843 

1844 a = ma.asarray(a) 

1845 a.unshare_mask() 

1846 if limits is None: 

1847 return a 

1848 

1849 # Check the limits 

1850 (lolim, uplim) = limits 

1851 errmsg = "The proportion to cut from the %s should be between 0. and 1." 

1852 if lolim is not None: 

1853 if lolim > 1. or lolim < 0: 

1854 raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim) 

1855 if uplim is not None: 

1856 if uplim > 1. or uplim < 0: 

1857 raise ValueError(errmsg % 'end' + "(got %s)" % uplim) 

1858 

1859 (loinc, upinc) = inclusive 

1860 

1861 if axis is None: 

1862 shp = a.shape 

1863 return _trimr1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp) 

1864 else: 

1865 return ma.apply_along_axis(_trimr1D, axis, a, lolim,uplim,loinc,upinc) 

1866 

1867 
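# A minimal usage sketch for `trimr` (illustrative, not part of the
# original module): proportions (0.1, 0.2) on 10 values mask the 1
# smallest and the 2 largest entries while keeping the shape.
#
# >>> import numpy as np
# >>> print(trimr(np.arange(10), limits=(0.1, 0.2)))
# [-- 1 2 3 4 5 6 7 -- --]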

1868trimdoc = """ 

1869 Parameters 

1870 ---------- 

1871 a : sequence 

1872 Input array 

1873 limits : {None, tuple}, optional 

1874 If `relative` is False, tuple (lower limit, upper limit) in absolute values. 

1875 Values of the input array lower (greater) than the lower (upper) limit are 

1876 masked. 

1877 

1878 If `relative` is True, tuple (lower percentage, upper percentage) to cut 

1879 on each side of the array, with respect to the number of unmasked data. 

1880 

1881 With n the number of unmasked data before trimming, the (n*limits[0])th

1882 smallest data and the (n*limits[1])th largest data are masked, and the 

1883 total number of unmasked data after trimming is n*(1.-sum(limits)).

1884 In each case, the value of one limit can be set to None to indicate an 

1885 open interval. 

1886 

1887 If limits is None, no trimming is performed.

1888 inclusive : {(bool, bool) tuple}, optional 

1889 If `relative` is False, tuple indicating whether values exactly equal 

1890 to the absolute limits are allowed. 

1891 If `relative` is True, tuple indicating whether the number of data 

1892 being masked on each side should be truncated (True) or rounded

1893 (False).

1894 relative : bool, optional 

1895 Whether to consider the limits as absolute values (False) or proportions 

1896 to cut (True). 

1897 axis : int, optional 

1898 Axis along which to trim. 

1899""" 

1900 

1901 

1902def trim(a, limits=None, inclusive=(True,True), relative=False, axis=None): 

1903 """ 

1904 Trims an array by masking the data outside some given limits. 

1905 

1906 Returns a masked version of the input array. 

1907 

1908 %s 

1909 

1910 Examples 

1911 -------- 

1912 >>> from scipy.stats.mstats import trim 

1913 >>> z = [ 1, 2, 3, 4, 5, 6, 7, 8, 9,10] 

1914 >>> print(trim(z,(3,8))) 

1915 [-- -- 3 4 5 6 7 8 -- --] 

1916 >>> print(trim(z,(0.1,0.2),relative=True)) 

1917 [-- 2 3 4 5 6 7 8 -- --] 

1918 

1919 """ 

1920 if relative: 

1921 return trimr(a, limits=limits, inclusive=inclusive, axis=axis) 

1922 else: 

1923 return trima(a, limits=limits, inclusive=inclusive) 

1924 

1925 

1926if trim.__doc__: 

1927 trim.__doc__ = trim.__doc__ % trimdoc 

1928 

1929 

1930def trimboth(data, proportiontocut=0.2, inclusive=(True,True), axis=None): 

1931 """ 

1932 Trims the smallest and largest data values. 

1933 

1934 Trims the `data` by masking the ``int(proportiontocut * n)`` smallest and 

1935 ``int(proportiontocut * n)`` largest values of data along the given axis, 

1936 where n is the number of unmasked values before trimming. 

1937 

1938 Parameters 

1939 ---------- 

1940 data : ndarray 

1941 Data to trim. 

1942 proportiontocut : float, optional 

1943 Percentage of trimming (as a float between 0 and 1). 

1944 If n is the number of unmasked values before trimming, the number of 

1945 values after trimming is ``(1 - 2*proportiontocut) * n``. 

1946 Default is 0.2. 

1947 inclusive : {(bool, bool) tuple}, optional 

1948 Tuple indicating whether the number of data being masked on each side 

1949 should be truncated (True) or rounded (False).

1950 axis : int, optional 

1951 Axis along which to perform the trimming. 

1952 If None, the input array is first flattened. 

1953 

1954 """ 

1955 return trimr(data, limits=(proportiontocut,proportiontocut), 

1956 inclusive=inclusive, axis=axis) 

1957 

1958 
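# A minimal usage sketch for `trimboth` (illustrative, not part of the
# original module): the default proportiontocut=0.2 masks the 2 smallest
# and 2 largest of 10 values, i.e. 40% of the data in total.
#
# >>> print(trimboth([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
# [-- -- 3 4 5 6 7 8 -- --]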

1959def trimtail(data, proportiontocut=0.2, tail='left', inclusive=(True,True), 

1960 axis=None): 

1961 """ 

1962 Trims the data by masking values from one tail. 

1963 

1964 Parameters 

1965 ---------- 

1966 data : array_like 

1967 Data to trim. 

1968 proportiontocut : float, optional 

1969 Percentage of trimming. If n is the number of unmasked values 

1970 before trimming, the number of values after trimming is 

1971 ``(1 - proportiontocut) * n``. Default is 0.2. 

1972 tail : {'left','right'}, optional 

1973 If 'left' the `proportiontocut` lowest values will be masked. 

1974 If 'right' the `proportiontocut` highest values will be masked. 

1975 Default is 'left'. 

1976 inclusive : {(bool, bool) tuple}, optional 

1977 Tuple indicating whether the number of data being masked on each side 

1978 should be truncated (True) or rounded (False). Default is

1979 (True, True). 

1980 axis : int, optional 

1981 Axis along which to perform the trimming. 

1982 If None, the input array is first flattened. Default is None. 

1983 

1984 Returns 

1985 ------- 

1986 trimtail : ndarray 

1987 Returned array of same shape as `data` with masked tail values. 

1988 

1989 """ 

1990 tail = str(tail).lower()[0] 

1991 if tail == 'l': 

1992 limits = (proportiontocut,None) 

1993 elif tail == 'r': 

1994 limits = (None, proportiontocut) 

1995 else: 

1996 raise TypeError("The tail argument should be in ('left','right')") 

1997 

1998 return trimr(data, limits=limits, axis=axis, inclusive=inclusive) 

1999 

2000 

2001trim1 = trimtail 

2002 

2003 
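# A minimal usage sketch for `trimtail` (illustrative, not part of the
# original module): only one tail is masked, here the 2 smallest of 10
# values.
#
# >>> print(trimtail([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 0.2, tail='left'))
# [-- -- 3 4 5 6 7 8 9 10]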

2004def trimmed_mean(a, limits=(0.1,0.1), inclusive=(1,1), relative=True, 

2005 axis=None): 

2006 """Returns the trimmed mean of the data along the given axis. 

2007 

2008 %s 

2009 

2010 """ 

2011 if (not isinstance(limits,tuple)) and isinstance(limits,float): 

2012 limits = (limits, limits) 

2013 if relative: 

2014 return trimr(a,limits=limits,inclusive=inclusive,axis=axis).mean(axis=axis) 

2015 else: 

2016 return trima(a,limits=limits,inclusive=inclusive).mean(axis=axis) 

2017 

2018 

2019if trimmed_mean.__doc__: 

2020 trimmed_mean.__doc__ = trimmed_mean.__doc__ % trimdoc 

2021 

2022 
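# A minimal usage sketch for `trimmed_mean` (illustrative, not part of the
# original module): the default relative limits (0.1, 0.1) drop one value
# at each end of 10 values, so the result is mean(2..9) = 5.5.
#
# >>> print(trimmed_mean([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
# 5.5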

2023def trimmed_var(a, limits=(0.1,0.1), inclusive=(1,1), relative=True, 

2024 axis=None, ddof=0): 

2025 """Returns the trimmed variance of the data along the given axis. 

2026 

2027 %s 

2028 ddof : {0,integer}, optional 

2029 Means Delta Degrees of Freedom. The denominator used during computations 

2030 is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un- 

2031 biased estimate of the variance. 

2032 

2033 """ 

2034 if (not isinstance(limits,tuple)) and isinstance(limits,float): 

2035 limits = (limits, limits) 

2036 if relative: 

2037 out = trimr(a,limits=limits, inclusive=inclusive,axis=axis) 

2038 else: 

2039 out = trima(a,limits=limits,inclusive=inclusive) 

2040 

2041 return out.var(axis=axis, ddof=ddof) 

2042 

2043 

2044if trimmed_var.__doc__: 

2045 trimmed_var.__doc__ = trimmed_var.__doc__ % trimdoc 

2046 

2047 

2048def trimmed_std(a, limits=(0.1,0.1), inclusive=(1,1), relative=True, 

2049 axis=None, ddof=0): 

2050 """Returns the trimmed standard deviation of the data along the given axis. 

2051 

2052 %s 

2053 ddof : {0,integer}, optional 

2054 Means Delta Degrees of Freedom. The denominator used during computations 

2055 is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un- 

2056 biased estimate of the variance. 

2057 

2058 """ 

2059 if (not isinstance(limits,tuple)) and isinstance(limits,float): 

2060 limits = (limits, limits) 

2061 if relative: 

2062 out = trimr(a,limits=limits,inclusive=inclusive,axis=axis) 

2063 else: 

2064 out = trima(a,limits=limits,inclusive=inclusive) 

2065 return out.std(axis=axis,ddof=ddof) 

2066 

2067 

2068if trimmed_std.__doc__: 

2069 trimmed_std.__doc__ = trimmed_std.__doc__ % trimdoc 

2070 

2071 
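# A minimal usage sketch for `trimmed_var` / `trimmed_std` (illustrative,
# not part of the original module): on 1..10 with the default (0.1, 0.1)
# limits the retained values are 2..9, so with ddof=0 the variance is 5.25
# and the standard deviation is sqrt(5.25), roughly 2.2913.
#
# >>> print(trimmed_var([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
# 5.25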

2072def trimmed_stde(a, limits=(0.1,0.1), inclusive=(1,1), axis=None): 

2073 """ 

2074 Returns the standard error of the trimmed mean along the given axis. 

2075 

2076 Parameters 

2077 ---------- 

2078 a : sequence 

2079 Input array 

2080 limits : {(0.1,0.1), tuple of float}, optional 

2081 tuple (lower percentage, upper percentage) to cut on each side of the 

2082 array, with respect to the number of unmasked data. 

2083 

2084 If n is the number of unmasked data before trimming, the

2085 ``n * limits[0]`` smallest values and the ``n * limits[1]``

2086 largest values are masked, and the total number of unmasked

2087 data after trimming is ``n * (1.-sum(limits))``. In each case, 

2088 the value of one limit can be set to None to indicate an open interval. 

2089 If `limits` is None, no trimming is performed. 

2090 inclusive : {(bool, bool) tuple} optional 

2091 Tuple indicating whether the number of data being masked on each side 

2092 should be truncated (True) or rounded (False).

2093 axis : int, optional 

2094 Axis along which to trim. 

2095 

2096 Returns 

2097 ------- 

2098 trimmed_stde : scalar or ndarray 

2099 

2100 """ 

2101 def _trimmed_stde_1D(a, low_limit, up_limit, low_inclusive, up_inclusive): 

2102 "Returns the standard error of the trimmed mean for a 1D input data." 

2103 n = a.count() 

2104 idx = a.argsort() 

2105 if low_limit: 

2106 if low_inclusive: 

2107 lowidx = int(low_limit*n) 

2108 else: 

2109 lowidx = int(np.round(low_limit*n))

2110 a[idx[:lowidx]] = masked 

2111 if up_limit is not None: 

2112 if up_inclusive: 

2113 upidx = n - int(n*up_limit) 

2114 else: 

2115 upidx = n - int(np.round(n*up_limit))

2116 a[idx[upidx:]] = masked 

2117 a[idx[:lowidx]] = a[idx[lowidx]] 

2118 a[idx[upidx:]] = a[idx[upidx-1]] 

2119 winstd = a.std(ddof=1) 

2120 return winstd / ((1-low_limit-up_limit)*np.sqrt(len(a))) 

2121 

2122 a = ma.array(a, copy=True, subok=True) 

2123 a.unshare_mask() 

2124 if limits is None: 

2125 return a.std(axis=axis,ddof=1)/ma.sqrt(a.count(axis)) 

2126 if (not isinstance(limits,tuple)) and isinstance(limits,float): 

2127 limits = (limits, limits) 

2128 

2129 # Check the limits 

2130 (lolim, uplim) = limits 

2131 errmsg = "The proportion to cut from the %s should be between 0. and 1." 

2132 if lolim is not None: 

2133 if lolim > 1. or lolim < 0: 

2134 raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim) 

2135 if uplim is not None: 

2136 if uplim > 1. or uplim < 0: 

2137 raise ValueError(errmsg % 'end' + "(got %s)" % uplim) 

2138 

2139 (loinc, upinc) = inclusive 

2140 if (axis is None): 

2141 return _trimmed_stde_1D(a.ravel(),lolim,uplim,loinc,upinc) 

2142 else: 

2143 if a.ndim > 2: 

2144 raise ValueError("Array 'a' must be at most two dimensional, " 

2145 "but got a.ndim = %d" % a.ndim) 

2146 return ma.apply_along_axis(_trimmed_stde_1D, axis, a, 

2147 lolim,uplim,loinc,upinc) 

2148 

2149 

2150def _mask_to_limits(a, limits, inclusive): 

2151 """Mask an array for values outside of given limits. 

2152 

2153 This is primarily a utility function. 

2154 

2155 Parameters 

2156 ---------- 

2157 a : array 

2158 limits : (float or None, float or None) 

2159 A tuple consisting of the (lower limit, upper limit). Values in the 

2160 input array less than the lower limit or greater than the upper limit 

2161 will be masked out. None implies no limit. 

2162 inclusive : (bool, bool) 

2163 A tuple consisting of the (lower flag, upper flag). These flags 

2164 determine whether values exactly equal to lower or upper are allowed. 

2165 

2166 Returns 

2167 ------- 

2168 A MaskedArray. 

2169 

2170 Raises 

2171 ------ 

2172 A ValueError if there are no values within the given limits. 

2173 """ 

2174 lower_limit, upper_limit = limits 

2175 lower_include, upper_include = inclusive 

2176 am = ma.MaskedArray(a) 

2177 if lower_limit is not None: 

2178 if lower_include: 

2179 am = ma.masked_less(am, lower_limit) 

2180 else: 

2181 am = ma.masked_less_equal(am, lower_limit) 

2182 

2183 if upper_limit is not None: 

2184 if upper_include: 

2185 am = ma.masked_greater(am, upper_limit) 

2186 else: 

2187 am = ma.masked_greater_equal(am, upper_limit) 

2188 

2189 if am.count() == 0: 

2190 raise ValueError("No array values within given limits") 

2191 

2192 return am 

2193 

2194 

2195def tmean(a, limits=None, inclusive=(True, True), axis=None): 

2196 """ 

2197 Compute the trimmed mean. 

2198 

2199 Parameters 

2200 ---------- 

2201 a : array_like 

2202 Array of values. 

2203 limits : None or (lower limit, upper limit), optional 

2204 Values in the input array less than the lower limit or greater than the 

2205 upper limit will be ignored. When limits is None (default), then all 

2206 values are used. Either of the limit values in the tuple can also be 

2207 None representing a half-open interval. 

2208 inclusive : (bool, bool), optional 

2209 A tuple consisting of the (lower flag, upper flag). These flags 

2210 determine whether values exactly equal to the lower or upper limits 

2211 are included. The default value is (True, True). 

2212 axis : int or None, optional 

2213 Axis along which to operate. If None, compute over the 

2214 whole array. Default is None. 

2215 

2216 Returns 

2217 ------- 

2218 tmean : float 

2219 

2220 Notes 

2221 ----- 

2222 For more details on `tmean`, see `scipy.stats.tmean`. 

2223 

2224 Examples 

2225 -------- 

2226 >>> import numpy as np 

2227 >>> from scipy.stats import mstats 

2228 >>> a = np.array([[6, 8, 3, 0], 

2229 ... [3, 9, 1, 2], 

2230 ... [8, 7, 8, 2], 

2231 ... [5, 6, 0, 2], 

2232 ... [4, 5, 5, 2]]) 

2233 ... 

2234 ... 

2235 >>> mstats.tmean(a, (2,5)) 

2236 3.3 

2237 >>> mstats.tmean(a, (2,5), axis=0) 

2238 masked_array(data=[4.0, 5.0, 4.0, 2.0], 

2239 mask=[False, False, False, False], 

2240 fill_value=1e+20) 

2241 

2242 """ 

2243 return trima(a, limits=limits, inclusive=inclusive).mean(axis=axis) 

2244 

2245 

2246def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1): 

2247 """ 

2248 Compute the trimmed variance 

2249 

2250 This function computes the sample variance of an array of values, 

2251 while ignoring values which are outside of given `limits`. 

2252 

2253 Parameters 

2254 ---------- 

2255 a : array_like 

2256 Array of values. 

2257 limits : None or (lower limit, upper limit), optional 

2258 Values in the input array less than the lower limit or greater than the 

2259 upper limit will be ignored. When limits is None, then all values are 

2260 used. Either of the limit values in the tuple can also be None 

2261 representing a half-open interval. The default value is None. 

2262 inclusive : (bool, bool), optional 

2263 A tuple consisting of the (lower flag, upper flag). These flags 

2264 determine whether values exactly equal to the lower or upper limits 

2265 are included. The default value is (True, True). 

2266 axis : int or None, optional 

2267 Axis along which to operate. If None, compute over the 

2268 whole array. Default is zero. 

2269 ddof : int, optional 

2270 Delta degrees of freedom. Default is 1. 

2271 

2272 Returns 

2273 ------- 

2274 tvar : float 

2275 Trimmed variance. 

2276 

2277 Notes 

2278 ----- 

2279 For more details on `tvar`, see `scipy.stats.tvar`. 

2280 

2281 """ 

2282 a = a.astype(float).ravel() 

2283 if limits is None: 

2284 n = (~a.mask).sum() # todo: better way to do that? 

2285 return np.ma.var(a) * n/(n-1.) 

2286 am = _mask_to_limits(a, limits=limits, inclusive=inclusive) 

2287 

2288 return np.ma.var(am, axis=axis, ddof=ddof) 

2289 

2290 
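# A minimal usage sketch for `tvar` (illustrative, not part of the
# original module): the input is expected to carry a mask (the
# `limits is None` branch reads ``a.mask``). With no limits, the biased
# variance is rescaled by n/(n-1); for the ten unmasked values 0..9 this
# gives 8.25 * 10/9, roughly 9.1667.
#
# >>> import numpy as np
# >>> a = np.ma.array(np.arange(11.), mask=[0]*10 + [1])
# >>> v = tvar(a)  # roughly 9.1667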

2291def tmin(a, lowerlimit=None, axis=0, inclusive=True): 

2292 """ 

2293 Compute the trimmed minimum 

2294 

2295 Parameters 

2296 ---------- 

2297 a : array_like 

2298 array of values 

2299 lowerlimit : None or float, optional 

2300 Values in the input array less than the given limit will be ignored. 

2301 When lowerlimit is None, then all values are used. The default value 

2302 is None. 

2303 axis : int or None, optional 

2304 Axis along which to operate. Default is 0. If None, compute over the 

2305 whole array `a`. 

2306 inclusive : {True, False}, optional 

2307 This flag determines whether values exactly equal to the lower limit 

2308 are included. The default value is True. 

2309 

2310 Returns 

2311 ------- 

2312 tmin : float, int or ndarray 

2313 

2314 Notes 

2315 ----- 

2316 For more details on `tmin`, see `scipy.stats.tmin`. 

2317 

2318 Examples 

2319 -------- 

2320 >>> import numpy as np 

2321 >>> from scipy.stats import mstats 

2322 >>> a = np.array([[6, 8, 3, 0], 

2323 ... [3, 2, 1, 2], 

2324 ... [8, 1, 8, 2], 

2325 ... [5, 3, 0, 2], 

2326 ... [4, 7, 5, 2]]) 

2327 ... 

2328 >>> mstats.tmin(a, 5) 

2329 masked_array(data=[5, 7, 5, --], 

2330 mask=[False, False, False, True], 

2331 fill_value=999999) 

2332 

2333 """ 

2334 a, axis = _chk_asarray(a, axis) 

2335 am = trima(a, (lowerlimit, None), (inclusive, False)) 

2336 return ma.minimum.reduce(am, axis) 

2337 

2338 

2339def tmax(a, upperlimit=None, axis=0, inclusive=True): 

2340 """ 

2341 Compute the trimmed maximum 

2342 

2343 This function computes the maximum value of an array along a given axis, 

2344 while ignoring values larger than a specified upper limit. 

2345 

2346 Parameters 

2347 ---------- 

2348 a : array_like 

2349 array of values 

2350 upperlimit : None or float, optional 

2351 Values in the input array greater than the given limit will be ignored. 

2352 When upperlimit is None, then all values are used. The default value 

2353 is None. 

2354 axis : int or None, optional 

2355 Axis along which to operate. Default is 0. If None, compute over the 

2356 whole array `a`. 

2357 inclusive : {True, False}, optional 

2358 This flag determines whether values exactly equal to the upper limit 

2359 are included. The default value is True. 

2360 

2361 Returns 

2362 ------- 

2363 tmax : float, int or ndarray 

2364 

2365 Notes 

2366 ----- 

2367 For more details on `tmax`, see `scipy.stats.tmax`. 

2368 

2369 Examples 

2370 -------- 

2371 >>> import numpy as np 

2372 >>> from scipy.stats import mstats 

2373 >>> a = np.array([[6, 8, 3, 0], 

2374 ... [3, 9, 1, 2], 

2375 ... [8, 7, 8, 2], 

2376 ... [5, 6, 0, 2], 

2377 ... [4, 5, 5, 2]]) 

2378 ... 

2379 ... 

2380 >>> mstats.tmax(a, 4) 

2381 masked_array(data=[4, --, 3, 2], 

2382 mask=[False, True, False, False], 

2383 fill_value=999999) 

2384 

2385 """ 

2386 a, axis = _chk_asarray(a, axis) 

2387 am = trima(a, (None, upperlimit), (False, inclusive)) 

2388 return ma.maximum.reduce(am, axis) 

2389 

2390 

2391def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1): 

2392 """ 

2393 Compute the trimmed standard error of the mean. 

2394 

2395 This function finds the standard error of the mean for given 

2396 values, ignoring values outside the given `limits`. 

2397 

2398 Parameters 

2399 ---------- 

2400 a : array_like 

2401 array of values 

2402 limits : None or (lower limit, upper limit), optional 

2403 Values in the input array less than the lower limit or greater than the 

2404 upper limit will be ignored. When limits is None, then all values are 

2405 used. Either of the limit values in the tuple can also be None 

2406 representing a half-open interval. The default value is None. 

2407 inclusive : (bool, bool), optional 

2408 A tuple consisting of the (lower flag, upper flag). These flags 

2409 determine whether values exactly equal to the lower or upper limits 

2410 are included. The default value is (True, True). 

2411 axis : int or None, optional 

2412 Axis along which to operate. If None, compute over the 

2413 whole array. Default is zero. 

2414 ddof : int, optional 

2415 Delta degrees of freedom. Default is 1. 

2416 

2417 Returns 

2418 ------- 

2419 tsem : float 

2420 

2421 Notes 

2422 ----- 

2423 For more details on `tsem`, see `scipy.stats.tsem`. 

2424 

2425 """ 

2426 a = ma.asarray(a).ravel() 

2427 if limits is None: 

2428 n = float(a.count()) 

2429 return a.std(axis=axis, ddof=ddof)/ma.sqrt(n) 

2430 

2431 am = trima(a.ravel(), limits, inclusive) 

2432 sd = np.sqrt(am.var(axis=axis, ddof=ddof)) 

2433 return sd / np.sqrt(am.count()) 

2434 

2435 
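# A minimal usage sketch for `tsem` (illustrative, not part of the
# original module): with no limits this is simply std(ddof=1)/sqrt(n)
# over the unmasked values; for 0..9 that is roughly 0.9574.
#
# >>> s = tsem(list(range(10)))  # roughly 0.9574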

2436def winsorize(a, limits=None, inclusive=(True, True), inplace=False, 

2437 axis=None, nan_policy='propagate'): 

2438 """Returns a Winsorized version of the input array. 

2439 

2440 The (limits[0])th lowest values are set to the (limits[0])th percentile, 

2441 and the (limits[1])th highest values are set to the (1 - limits[1])th 

2442 percentile. 

2443 Masked values are skipped. 

2444 

2445 

2446 Parameters 

2447 ---------- 

2448 a : sequence 

2449 Input array. 

2450 limits : {None, tuple of float}, optional 

2451 Tuple of the percentages to cut on each side of the array, with respect 

2452 to the number of unmasked data, as floats between 0. and 1. 

2453 With n the number of unmasked data before trimming, the

2454 (n*limits[0])th smallest data and the (n*limits[1])th largest data are 

2455 masked, and the total number of unmasked data after trimming 

2456 is n*(1.-sum(limits)). The value of one limit can be set to None to

2457 indicate an open interval. 

2458 inclusive : {(True, True) tuple}, optional 

2459 Tuple indicating whether the number of data being masked on each side 

2460 should be truncated (True) or rounded (False). 

2461 inplace : {False, True}, optional 

2462 Whether to winsorize in place (True) or to use a copy (False) 

2463 axis : {None, int}, optional 

2464 Axis along which to trim. If None, the whole array is trimmed, but its 

2465 shape is maintained. 

2466 nan_policy : {'propagate', 'raise', 'omit'}, optional 

2467 Defines how to handle when input contains nan. 

2468 The following options are available (default is 'propagate'): 

2469 

2470 * 'propagate': allows nan values and may overwrite or propagate them 

2471 * 'raise': throws an error 

2472 * 'omit': performs the calculations ignoring nan values 

2473 

2474 Notes 

2475 ----- 

2476 This function is applied to reduce the effect of possibly spurious outliers 

2477 by limiting the extreme values. 

2478 

2479 Examples 

2480 -------- 

2481 >>> import numpy as np 

2482 >>> from scipy.stats.mstats import winsorize 

2483 

2484 A shuffled array contains integers from 1 to 10. 

2485 

2486 >>> a = np.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6]) 

2487 

2488 The 10% of the lowest value (i.e., `1`) and the 20% of the highest 

2489 values (i.e., `9` and `10`) are replaced. 

2490 

2491 >>> winsorize(a, limits=[0.1, 0.2]) 

2492 masked_array(data=[8, 4, 8, 8, 5, 3, 7, 2, 2, 6], 

2493 mask=False, 

2494 fill_value=999999) 

2495 

2496 """ 

2497 def _winsorize1D(a, low_limit, up_limit, low_include, up_include, 

2498 contains_nan, nan_policy): 

2499 n = a.count() 

2500 idx = a.argsort() 

2501 if contains_nan: 

2502 nan_count = np.count_nonzero(np.isnan(a)) 

2503 if low_limit: 

2504 if low_include: 

2505 lowidx = int(low_limit * n) 

2506 else: 

2507 lowidx = np.round(low_limit * n).astype(int) 

2508 if contains_nan and nan_policy == 'omit': 

2509 lowidx = min(lowidx, n-nan_count-1) 

2510 a[idx[:lowidx]] = a[idx[lowidx]] 

2511 if up_limit is not None: 

2512 if up_include: 

2513 upidx = n - int(n * up_limit) 

2514 else: 

2515 upidx = n - np.round(n * up_limit).astype(int) 

2516 if contains_nan and nan_policy == 'omit': 

2517 a[idx[upidx:-nan_count]] = a[idx[upidx - 1]] 

2518 else: 

2519 a[idx[upidx:]] = a[idx[upidx - 1]] 

2520 return a 

2521 

2522 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

2523 # We are going to modify a: better make a copy 

2524 a = ma.array(a, copy=np.logical_not(inplace)) 

2525 

2526 if limits is None: 

2527 return a 

2528 if (not isinstance(limits, tuple)) and isinstance(limits, float): 

2529 limits = (limits, limits) 

2530 

2531 # Check the limits 

2532 (lolim, uplim) = limits 

2533 errmsg = "The proportion to cut from the %s should be between 0. and 1." 

2534 if lolim is not None: 

2535 if lolim > 1. or lolim < 0: 

2536 raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim) 

2537 if uplim is not None: 

2538 if uplim > 1. or uplim < 0: 

2539 raise ValueError(errmsg % 'end' + "(got %s)" % uplim) 

2540 

2541 (loinc, upinc) = inclusive 

2542 

2543 if axis is None: 

2544 shp = a.shape 

2545 return _winsorize1D(a.ravel(), lolim, uplim, loinc, upinc, 

2546 contains_nan, nan_policy).reshape(shp) 

2547 else: 

2548 return ma.apply_along_axis(_winsorize1D, axis, a, lolim, uplim, loinc, 

2549 upinc, contains_nan, nan_policy) 

2550 

2551 

2552def moment(a, moment=1, axis=0): 

2553 """ 

2554 Calculates the nth moment about the mean for a sample. 

2555 

2556 Parameters 

2557 ---------- 

2558 a : array_like 

2559 data 

2560 moment : int, optional 

2561 order of central moment that is returned 

2562 axis : int or None, optional 

2563 Axis along which the central moment is computed. Default is 0. 

2564 If None, compute over the whole array `a`. 

2565 

2566 Returns 

2567 ------- 

2568 n-th central moment : ndarray or float 

2569 The appropriate moment along the given axis or over all values if axis 

2570 is None. The denominator for the moment calculation is the number of 

2571 observations, no degrees of freedom correction is done. 

2572 

2573 Notes 

2574 ----- 

2575 For more details about `moment`, see `scipy.stats.moment`. 

2576 

2577 """ 

2578 a, axis = _chk_asarray(a, axis) 

2579 if a.size == 0: 

2580 moment_shape = list(a.shape) 

2581 del moment_shape[axis] 

2582 dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64 

2583 # empty array, return nan(s) with shape matching `moment` 

2584 out_shape = (moment_shape if np.isscalar(moment) 

2585 else [len(moment)] + moment_shape) 

2586 if len(out_shape) == 0: 

2587 return dtype(np.nan) 

2588 else: 

2589 return ma.array(np.full(out_shape, np.nan, dtype=dtype)) 

2590 

2591 # for array_like moment input, return a value for each. 

2592 if not np.isscalar(moment): 

2593 mean = a.mean(axis, keepdims=True) 

2594 mmnt = [_moment(a, i, axis, mean=mean) for i in moment] 

2595 return ma.array(mmnt) 

2596 else: 

2597 return _moment(a, moment, axis) 

2598 
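# A minimal usage sketch for `moment` (illustrative, not part of the
# original module): the second central moment of 0..9 is the population
# variance, 8.25; masked entries would be excluded from both the mean and
# the moment.
#
# >>> import numpy as np
# >>> print(moment(np.ma.arange(10.), moment=2))
# 8.25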

2599# Moment with optional pre-computed mean, equal to a.mean(axis, keepdims=True) 

2600def _moment(a, moment, axis, *, mean=None): 

2601 if np.abs(moment - np.round(moment)) > 0: 

2602 raise ValueError("All moment parameters must be integers") 

2603 

2604 if moment == 0 or moment == 1: 

2605 # By definition the zeroth moment about the mean is 1, and the first 

2606 # moment is 0. 

2607 shape = list(a.shape) 

2608 del shape[axis] 

2609 dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64 

2610 

2611 if len(shape) == 0: 

2612 return dtype(1.0 if moment == 0 else 0.0) 

2613 else: 

2614 return (ma.ones(shape, dtype=dtype) if moment == 0 

2615 else ma.zeros(shape, dtype=dtype)) 

2616 else: 

2617 # Exponentiation by squares: form exponent sequence 

2618 n_list = [moment] 

2619 current_n = moment 

2620 while current_n > 2: 

2621 if current_n % 2: 

2622 current_n = (current_n-1)/2 

2623 else: 

2624 current_n /= 2 

2625 n_list.append(current_n) 

2626 

2627 # Starting point for exponentiation by squares 

2628 mean = a.mean(axis, keepdims=True) if mean is None else mean 

2629 a_zero_mean = a - mean 

2630 if n_list[-1] == 1: 

2631 s = a_zero_mean.copy() 

2632 else: 

2633 s = a_zero_mean**2 

2634 

2635 # Perform multiplications 

2636 for n in n_list[-2::-1]: 

2637 s = s**2 

2638 if n % 2: 

2639 s *= a_zero_mean 

2640 return s.mean(axis) 

2641 

2642 

2643def variation(a, axis=0, ddof=0): 

2644 """ 

2645 Compute the coefficient of variation. 

2646 

2647 The coefficient of variation is the standard deviation divided by the 

2648 mean. This function is equivalent to:: 

2649 

2650 np.std(x, axis=axis, ddof=ddof) / np.mean(x, axis=axis)

2651 

2652 The default for ``ddof`` is 0, but many definitions of the coefficient 

2653 of variation use the square root of the unbiased sample variance 

2654 for the sample standard deviation, which corresponds to ``ddof=1``. 

2655 

2656 Parameters 

2657 ---------- 

2658 a : array_like 

2659 Input array. 

2660 axis : int or None, optional 

2661 Axis along which to calculate the coefficient of variation. Default 

2662 is 0. If None, compute over the whole array `a`. 

2663 ddof : int, optional 

2664 Delta degrees of freedom. Default is 0. 

2665 

2666 Returns 

2667 ------- 

2668 variation : ndarray 

2669 The calculated variation along the requested axis. 

2670 

2671 Notes 

2672 ----- 

2673 For more details about `variation`, see `scipy.stats.variation`. 

2674 

2675 Examples 

2676 -------- 

2677 >>> import numpy as np 

2678 >>> from scipy.stats.mstats import variation 

2679 >>> a = np.array([2,8,4]) 

2680 >>> variation(a) 

2681 0.5345224838248487 

2682 >>> b = np.array([2,8,3,4]) 

2683 >>> c = np.ma.masked_array(b, mask=[0,0,1,0]) 

2684 >>> variation(c) 

2685 0.5345224838248487 

2686 

2687 In the example above, it can be seen that this works the same as 

2688 `scipy.stats.variation` except 'stats.mstats.variation' ignores masked 

2689 array elements. 

2690 

2691 """ 

2692 a, axis = _chk_asarray(a, axis) 

2693 return a.std(axis, ddof=ddof)/a.mean(axis) 

2694 

2695 

2696def skew(a, axis=0, bias=True): 

2697 """ 

2698 Computes the skewness of a data set. 

2699 

2700 Parameters 

2701 ---------- 

2702 a : ndarray 

2703 data 

2704 axis : int or None, optional 

2705 Axis along which skewness is calculated. Default is 0. 

2706 If None, compute over the whole array `a`. 

2707 bias : bool, optional 

2708 If False, then the calculations are corrected for statistical bias. 

2709 

2710 Returns 

2711 ------- 

2712 skewness : ndarray 

2713 The skewness of values along an axis, returning 0 where all values are 

2714 equal. 

2715 

2716 Notes 

2717 ----- 

2718 For more details about `skew`, see `scipy.stats.skew`. 

2719 

2720 """ 

2721 a, axis = _chk_asarray(a,axis) 

2722 mean = a.mean(axis, keepdims=True) 

2723 m2 = _moment(a, 2, axis, mean=mean) 

2724 m3 = _moment(a, 3, axis, mean=mean) 

2725 zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2) 

2726 with np.errstate(all='ignore'): 

2727 vals = ma.where(zero, 0, m3 / m2**1.5) 

2728 

2729 if not bias and zero is not ma.masked and m2 is not ma.masked: 

2730 n = a.count(axis) 

2731 can_correct = ~zero & (n > 2) 

2732 if can_correct.any(): 

2733 n = np.extract(can_correct, n) 

2734 m2 = np.extract(can_correct, m2) 

2735 m3 = np.extract(can_correct, m3) 

2736 nval = ma.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5 

2737 np.place(vals, can_correct, nval) 

2738 return vals 

2739 

2740 
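# A minimal usage sketch for `skew` (illustrative, not part of the
# original module): a symmetric sample has a zero third central moment,
# so its skewness is exactly 0.
#
# >>> import numpy as np
# >>> print(skew(np.ma.arange(10.)))
# 0.0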

2741def kurtosis(a, axis=0, fisher=True, bias=True): 

2742 """ 

2743 Computes the kurtosis (Fisher or Pearson) of a dataset. 

2744 

2745 Kurtosis is the fourth central moment divided by the square of the 

2746 variance. If Fisher's definition is used, then 3.0 is subtracted from 

2747 the result to give 0.0 for a normal distribution. 

2748 

2749 If bias is False, then the kurtosis is calculated using k statistics to

2750 eliminate bias coming from biased moment estimators.

2751 

2752 Use `kurtosistest` to see if result is close enough to normal. 

2753 

2754 Parameters 

2755 ---------- 

2756 a : array 

2757 data for which the kurtosis is calculated 

2758 axis : int or None, optional 

2759 Axis along which the kurtosis is calculated. Default is 0. 

2760 If None, compute over the whole array `a`. 

2761 fisher : bool, optional 

2762 If True, Fisher's definition is used (normal ==> 0.0). If False, 

2763 Pearson's definition is used (normal ==> 3.0). 

2764 bias : bool, optional 

2765 If False, then the calculations are corrected for statistical bias. 

2766 

2767 Returns 

2768 ------- 

2769 kurtosis : array 

2770 The kurtosis of values along an axis. If all values are equal, 

2771 return -3 for Fisher's definition and 0 for Pearson's definition. 

2772 

2773 Notes 

2774 ----- 

2775 For more details about `kurtosis`, see `scipy.stats.kurtosis`. 

2776 

2777 """ 

2778 a, axis = _chk_asarray(a, axis) 

2779 mean = a.mean(axis, keepdims=True) 

2780 m2 = _moment(a, 2, axis, mean=mean) 

2781 m4 = _moment(a, 4, axis, mean=mean) 

2782 zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2) 

2783 with np.errstate(all='ignore'): 

2784 vals = ma.where(zero, 0, m4 / m2**2.0) 

2785 

2786 if not bias and zero is not ma.masked and m2 is not ma.masked: 

2787 n = a.count(axis) 

2788 can_correct = ~zero & (n > 3) 

2789 if can_correct.any(): 

2790 n = np.extract(can_correct, n) 

2791 m2 = np.extract(can_correct, m2) 

2792 m4 = np.extract(can_correct, m4) 

2793 nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0) 

2794 np.place(vals, can_correct, nval+3.0) 

2795 if fisher: 

2796 return vals - 3 

2797 else: 

2798 return vals 

2799 

2800 
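# A minimal usage sketch for `kurtosis` (illustrative, not part of the
# original module): for the discrete uniform sample 0..9 the Fisher
# (excess) kurtosis is -(6/5)*(n**2+1)/(n**2-1) with n = 10, roughly
# -1.2242.
#
# >>> import numpy as np
# >>> k = kurtosis(np.ma.arange(10.))  # roughly -1.2242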

2801DescribeResult = namedtuple('DescribeResult', ('nobs', 'minmax', 'mean', 

2802 'variance', 'skewness', 

2803 'kurtosis')) 

2804 

2805 

2806def describe(a, axis=0, ddof=0, bias=True): 

2807 """ 

2808 Computes several descriptive statistics of the passed array. 

2809 

2810 Parameters 

2811 ---------- 

2812 a : array_like 

2813 Data array 

2814 axis : int or None, optional 

2815 Axis along which to calculate statistics. Default 0. If None, 

2816 compute over the whole array `a`. 

2817 ddof : int, optional 

2818 degree of freedom (default 0); note that default ddof is different 

2819 from the same routine in stats.describe 

2820 bias : bool, optional 

2821 If False, then the skewness and kurtosis calculations are corrected for 

2822 statistical bias. 

2823 

2824 Returns 

2825 ------- 

2826 nobs : int 

2827 Size of the data (discarding missing values).

2828 

2829 minmax : (int, int) 

2830 min, max 

2831 

2832 mean : float 

2833 arithmetic mean 

2834 

2835 variance : float 

2836 unbiased variance 

2837 

2838 skewness : float 

2839 biased skewness 

2840 

2841 kurtosis : float 

2842 biased kurtosis 

2843 

2844 Examples 

2845 -------- 

2846 >>> import numpy as np 

2847 >>> from scipy.stats.mstats import describe 

2848 >>> ma = np.ma.array(range(6), mask=[0, 0, 0, 1, 1, 1]) 

2849 >>> describe(ma) 

2850 DescribeResult(nobs=3, minmax=(masked_array(data=0, 

2851 mask=False, 

2852 fill_value=999999), masked_array(data=2, 

2853 mask=False, 

2854 fill_value=999999)), mean=1.0, variance=0.6666666666666666, 

2855 skewness=masked_array(data=0., mask=False, fill_value=1e+20), 

2856 kurtosis=-1.5) 

2857 

2858 """ 

2859 a, axis = _chk_asarray(a, axis) 

2860 n = a.count(axis) 

2861 mm = (ma.minimum.reduce(a, axis=axis), ma.maximum.reduce(a, axis=axis)) 

2862 m = a.mean(axis) 

2863 v = a.var(axis, ddof=ddof) 

2864 sk = skew(a, axis, bias=bias) 

2865 kurt = kurtosis(a, axis, bias=bias) 

2866 

2867 return DescribeResult(n, mm, m, v, sk, kurt) 

2868 

2869 

2870def stde_median(data, axis=None): 

2871 """Returns the McKean-Schrader estimate of the standard error of the sample 

2872 median along the given axis. Masked values are discarded.

2873 

2874 Parameters 

2875 ---------- 

2876 data : ndarray 

2877 Input data.

2878 axis : {None,int}, optional 

2879 Axis along which to compute the standard error.

2880 If None, the input array is first flattened. 

2881 

2882 """ 

2883 def _stdemed_1D(data): 

2884 data = np.sort(data.compressed()) 

2885 n = len(data) 

2886 z = 2.5758293035489004 

2887 k = int(np.round((n+1)/2. - z * np.sqrt(n/4.),0)) 

2888 return ((data[n-k] - data[k-1])/(2.*z)) 

2889 

2890 data = ma.array(data, copy=False, subok=True) 

2891 if (axis is None): 

2892 return _stdemed_1D(data) 

2893 else: 

2894 if data.ndim > 2: 

2895 raise ValueError("Array 'data' must be at most two dimensional, " 

2896 "but got data.ndim = %d" % data.ndim) 

2897 return ma.apply_along_axis(_stdemed_1D, axis, data) 

2898 

2899 
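# A minimal usage sketch for `stde_median` (illustrative, not part of the
# original module): for the nine values 0..8, k rounds to 1, so the
# estimate is (max - min)/(2*z), roughly 1.553.
#
# >>> import numpy as np
# >>> e = stde_median(np.ma.arange(9.))  # roughly 1.553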

2900SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue')) 

2901 

2902 

2903def skewtest(a, axis=0, alternative='two-sided'): 

2904 """ 

2905 Tests whether the skew is different from the normal distribution. 

2906 

2907 Parameters 

2908 ---------- 

2909 a : array_like 

2910 The data to be tested 

2911 axis : int or None, optional 

2912 Axis along which statistics are calculated. Default is 0. 

2913 If None, compute over the whole array `a`. 

2914 alternative : {'two-sided', 'less', 'greater'}, optional 

2915 Defines the alternative hypothesis. Default is 'two-sided'. 

2916 The following options are available: 

2917 

2918 * 'two-sided': the skewness of the distribution underlying the sample 

2919 is different from that of the normal distribution (i.e. 0) 

2920 * 'less': the skewness of the distribution underlying the sample 

2921 is less than that of the normal distribution 

2922 * 'greater': the skewness of the distribution underlying the sample 

2923 is greater than that of the normal distribution 

2924 

2925 .. versionadded:: 1.7.0 

2926 

2927 Returns 

2928 ------- 

2929 statistic : array_like 

2930 The computed z-score for this test. 

2931 pvalue : array_like 

2932 A p-value for the hypothesis test 

2933 

2934 Notes 

2935 ----- 

2936 For more details about `skewtest`, see `scipy.stats.skewtest`. 

2937 

2938 """ 

2939 a, axis = _chk_asarray(a, axis) 

2940 if axis is None: 

2941 a = a.ravel() 

2942 axis = 0 

2943 b2 = skew(a,axis) 

2944 n = a.count(axis) 

2945 if np.min(n) < 8: 

2946 raise ValueError( 

2947 "skewtest is not valid with less than 8 samples; %i samples" 

2948 " were given." % np.min(n)) 

2949 

2950 y = b2 * ma.sqrt(((n+1)*(n+3)) / (6.0*(n-2))) 

2951 beta2 = (3.0*(n*n+27*n-70)*(n+1)*(n+3)) / ((n-2.0)*(n+5)*(n+7)*(n+9)) 

2952 W2 = -1 + ma.sqrt(2*(beta2-1)) 

2953 delta = 1/ma.sqrt(0.5*ma.log(W2)) 

2954 alpha = ma.sqrt(2.0/(W2-1)) 

2955 y = ma.where(y == 0, 1, y) 

2956 Z = delta*ma.log(y/alpha + ma.sqrt((y/alpha)**2+1)) 

2957 

2958 return SkewtestResult(*scipy.stats._stats_py._normtest_finish(Z, alternative)) 

2959 

2960 

2961KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue')) 

2962 

2963 

2964def kurtosistest(a, axis=0, alternative='two-sided'): 

2965 """ 

2966 Tests whether a dataset has normal kurtosis 

2967 

2968 Parameters 

2969 ---------- 

2970 a : array_like 

2971 array of the sample data 

2972 axis : int or None, optional 

2973 Axis along which to compute test. Default is 0. If None, 

2974 compute over the whole array `a`. 

2975 alternative : {'two-sided', 'less', 'greater'}, optional 

2976 Defines the alternative hypothesis. 

2977 The following options are available (default is 'two-sided'): 

2978 

2979 * 'two-sided': the kurtosis of the distribution underlying the sample 

2980 is different from that of the normal distribution 

2981 * 'less': the kurtosis of the distribution underlying the sample 

2982 is less than that of the normal distribution 

2983 * 'greater': the kurtosis of the distribution underlying the sample 

2984 is greater than that of the normal distribution 

2985 

2986 .. versionadded:: 1.7.0 

2987 

2988 Returns 

2989 ------- 

2990 statistic : array_like 

2991 The computed z-score for this test. 

2992 pvalue : array_like 

2993 The p-value for the hypothesis test 

2994 

2995 Notes 

2996 ----- 

2997 For more details about `kurtosistest`, see `scipy.stats.kurtosistest`. 

2998 

2999 """ 

3000 a, axis = _chk_asarray(a, axis) 

3001 n = a.count(axis=axis) 

3002 if np.min(n) < 5: 

3003 raise ValueError( 

3004 "kurtosistest requires at least 5 observations; %i observations" 

3005 " were given." % np.min(n)) 

3006 if np.min(n) < 20: 

3007 warnings.warn( 

3008 "kurtosistest only valid for n>=20 ... continuing anyway, n=%i" % 

3009 np.min(n)) 

3010 

3011 b2 = kurtosis(a, axis, fisher=False) 

3012 E = 3.0*(n-1) / (n+1) 

3013 varb2 = 24.0*n*(n-2.)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5)) 

3014 x = (b2-E)/ma.sqrt(varb2) 

3015 sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) / 

3016 (n*(n-2)*(n-3))) 

3017 A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2))) 

3018 term1 = 1 - 2./(9.0*A) 

3019 denom = 1 + x*ma.sqrt(2/(A-4.0)) 

3020 if np.ma.isMaskedArray(denom): 

3021 # For multi-dimensional array input 

3022 denom[denom == 0.0] = masked 

3023 elif denom == 0.0: 

3024 denom = masked 

3025 

3026 term2 = np.ma.where(denom > 0, ma.power((1-2.0/A)/denom, 1/3.0), 

3027 -ma.power(-(1-2.0/A)/denom, 1/3.0)) 

3028 Z = (term1 - term2) / np.sqrt(2/(9.0*A)) 

3029 

3030 return KurtosistestResult( 

3031 *scipy.stats._stats_py._normtest_finish(Z, alternative) 

3032 ) 

3033 

3034 

3035NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue')) 

3036 

3037 

3038def normaltest(a, axis=0): 

3039 """ 

3040 Tests whether a sample differs from a normal distribution. 

3041 

3042 Parameters 

3043 ---------- 

3044 a : array_like 

3045 The array containing the data to be tested. 

3046 axis : int or None, optional 

3047 Axis along which to compute test. Default is 0. If None, 

3048 compute over the whole array `a`. 

3049 

3050 Returns 

3051 ------- 

3052 statistic : float or array 

3053 ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and 

3054 ``k`` is the z-score returned by `kurtosistest`. 

3055 pvalue : float or array 

3056 A 2-sided chi-squared probability for the hypothesis test.

3057 

3058 Notes 

3059 ----- 

3060 For more details about `normaltest`, see `scipy.stats.normaltest`. 

3061 

3062 """ 

3063 a, axis = _chk_asarray(a, axis) 

3064 s, _ = skewtest(a, axis) 

3065 k, _ = kurtosistest(a, axis) 

3066 k2 = s*s + k*k 

3067 

3068 return NormaltestResult(k2, distributions.chi2.sf(k2, 2)) 

3069 

3070 
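# A minimal usage sketch for `normaltest` (illustrative, not part of the
# original module): the sample must be large enough for both component
# tests (at least 8 unmasked values for `skewtest`; `kurtosistest` warns
# below 20), and masked entries do not count toward that total.
#
# >>> import numpy as np
# >>> rng = np.random.default_rng(0)
# >>> stat, p = normaltest(np.ma.array(rng.standard_normal(30)))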

3071def mquantiles(a, prob=list([.25,.5,.75]), alphap=.4, betap=.4, axis=None, 

3072 limit=()): 

3073 """ 

3074 Computes empirical quantiles for a data array. 

3075 

3076 Sample quantiles are defined by ``Q(p) = (1-gamma)*x[j] + gamma*x[j+1]``,

3077 where ``x[j]`` is the j-th order statistic, and gamma is a function of 

3078 ``j = floor(n*p + m)``, ``m = alphap + p*(1 - alphap - betap)`` and 

3079 ``g = n*p + m - j``. 

3080 

3081 Reinterpreting the above equations to compare to **R** leads to the

3082 equation: ``p(k) = (k - alphap)/(n + 1 - alphap - betap)`` 

3083 

3084 Typical values of (alphap,betap) are: 

3085 - (0,1) : ``p(k) = k/n`` : linear interpolation of cdf 

3086 (**R** type 4) 

3087 - (.5,.5) : ``p(k) = (k - 1/2.)/n`` : piecewise linear function 

3088 (**R** type 5) 

3089 - (0,0) : ``p(k) = k/(n+1)`` : Weibull

3090 (**R** type 6) 

3091 - (1,1) : ``p(k) = (k-1)/(n-1)``: p(k) = mode[F(x[k])]. 

3092 (**R** type 7, **R** default) 

3093 - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``: Then p(k) ~ median[F(x[k])]. 

3094 The resulting quantile estimates are approximately median-unbiased 

3095 regardless of the distribution of x. 

3096 (**R** type 8) 

3097 - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``: Blom. 

3098 The resulting quantile estimates are approximately unbiased 

3099 if x is normally distributed 

3100 (**R** type 9) 

3101 - (.4,.4) : approximately quantile unbiased (Cunnane) 

3102 - (.35,.35): APL, used with PWM 

3103 

3104 Parameters 

3105 ---------- 

3106 a : array_like 

3107 Input data, as a sequence or array of dimension at most 2. 

3108 prob : array_like, optional 

3109 List of quantiles to compute. 

3110 alphap : float, optional 

3111 Plotting positions parameter, default is 0.4. 

3112 betap : float, optional 

3113 Plotting positions parameter, default is 0.4. 

3114 axis : int, optional 

3115 Axis along which to perform the trimming. 

3116 If None (default), the input array is first flattened. 

3117 limit : tuple, optional 

3118 Tuple of (lower, upper) values. 

3119 Values of `a` outside this open interval are ignored. 

3120 

3121 Returns 

3122 ------- 

3123 mquantiles : MaskedArray 

3124 An array containing the calculated quantiles. 

3125 

3126 Notes 

3127 ----- 

3128 This formulation is very similar to **R** except the calculation of 

3129 ``m`` from ``alphap`` and ``betap``, where in **R** ``m`` is defined 

3130 with each type. 

3131 

3132 References 

3133 ---------- 

3134 .. [1] *R* statistical software: https://www.r-project.org/ 

3135 .. [2] *R* ``quantile`` function: 

3136 http://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html 

3137 

3138 Examples 

3139 -------- 

3140 >>> import numpy as np 

3141 >>> from scipy.stats.mstats import mquantiles 

3142 >>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.]) 

3143 >>> mquantiles(a) 

3144 array([ 19.2, 40. , 42.8]) 

3145 

3146 Using a 2D array, specifying axis and limit. 

3147 

3148 >>> data = np.array([[ 6., 7., 1.], 

3149 ... [ 47., 15., 2.], 

3150 ... [ 49., 36., 3.], 

3151 ... [ 15., 39., 4.], 

3152 ... [ 42., 40., -999.], 

3153 ... [ 41., 41., -999.], 

3154 ... [ 7., -999., -999.], 

3155 ... [ 39., -999., -999.], 

3156 ... [ 43., -999., -999.], 

3157 ... [ 40., -999., -999.], 

3158 ... [ 36., -999., -999.]]) 

3159 >>> print(mquantiles(data, axis=0, limit=(0, 50))) 

3160 [[19.2 14.6 1.45] 

3161 [40. 37.5 2.5 ] 

3162 [42.8 40.05 3.55]] 

3163 

3164 >>> data[:, 2] = -999. 

3165 >>> print(mquantiles(data, axis=0, limit=(0, 50))) 

3166 [[19.200000000000003 14.6 --] 

3167 [40.0 37.5 --] 

3168 [42.800000000000004 40.05 --]] 

3169 

3170 """ 

3171 def _quantiles1D(data,m,p): 

3172 x = np.sort(data.compressed()) 

3173 n = len(x) 

3174 if n == 0: 

3175 return ma.array(np.empty(len(p), dtype=float), mask=True) 

3176 elif n == 1: 

3177 return ma.array(np.resize(x, p.shape), mask=nomask) 

3178 aleph = (n*p + m) 

3179 k = np.floor(aleph.clip(1, n-1)).astype(int) 

3180 gamma = (aleph-k).clip(0,1) 

3181 return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()] 

3182 

3183 data = ma.array(a, copy=False) 

3184 if data.ndim > 2: 

3185 raise TypeError("Array should be 2D at most !") 

3186 

3187 if limit: 

3188 condition = (limit[0] < data) & (data < limit[1]) 

3189 data[~condition.filled(True)] = masked 

3190 

3191 p = np.array(prob, copy=False, ndmin=1) 

3192 m = alphap + p*(1.-alphap-betap) 

3193 # Computes quantiles along axis (or globally) 

3194 if (axis is None): 

3195 return _quantiles1D(data, m, p) 

3196 

3197 return ma.apply_along_axis(_quantiles1D, axis, data, m, p) 

3198 

3199 

3200def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4): 

3201 """Calculate the score at the given 'per' percentile of the 

3202 sequence a. For example, the score at per=50 is the median. 

3203 

3204 This function is a shortcut to `mquantiles`.

3205 

3206 """ 

3207 if (per < 0) or (per > 100.): 

3208 raise ValueError("The percentile should be between 0. and 100. !" 

3209 " (got %s)" % per) 

3210 

3211 return mquantiles(data, prob=[per/100.], alphap=alphap, betap=betap, 

3212 limit=limit, axis=0).squeeze() 

3213 

3214 
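# A minimal usage sketch for `scoreatpercentile` (illustrative, not part
# of the original module): per=50 reproduces the middle entry of the
# default `mquantiles` output for the same data.
#
# >>> import numpy as np
# >>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.])
# >>> print(scoreatpercentile(a, 50))
# 40.0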

3215def plotting_positions(data, alpha=0.4, beta=0.4): 

3216 """ 

3217 Returns plotting positions (or empirical percentile points) for the data. 

3218 

3219 Plotting positions are defined as ``(i-alpha)/(n+1-alpha-beta)``, where: 

3220 - i is the rank order statistic

3221 - n is the number of unmasked values along the given axis 

3222 - `alpha` and `beta` are two parameters. 

3223 

3224 Typical values for `alpha` and `beta` are: 

3225 - (0,1) : ``p(k) = k/n``, linear interpolation of cdf (R, type 4) 

3226 - (.5,.5) : ``p(k) = (k-1/2.)/n``, piecewise linear function 

3227 (R, type 5) 

3228 - (0,0) : ``p(k) = k/(n+1)``, Weibull (R type 6) 

3229 - (1,1) : ``p(k) = (k-1)/(n-1)``, in this case, 

3230 ``p(k) = mode[F(x[k])]``. That's R default (R type 7) 

3231 - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``, then 

3232 ``p(k) ~ median[F(x[k])]``. 

3233 The resulting quantile estimates are approximately median-unbiased 

3234 regardless of the distribution of x. (R type 8) 

3235 - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``, Blom. 

3236 The resulting quantile estimates are approximately unbiased 

3237 if x is normally distributed (R type 9) 

3238 - (.4,.4) : approximately quantile unbiased (Cunnane) 

3239 - (.35,.35): APL, used with PWM 

3240 - (.3175, .3175): used in scipy.stats.probplot 

3241 

3242 Parameters 

3243 ---------- 

3244 data : array_like 

3245 Input data, as a sequence or array of dimension at most 2. 

3246 alpha : float, optional 

3247 Plotting positions parameter. Default is 0.4. 

3248 beta : float, optional 

3249 Plotting positions parameter. Default is 0.4. 

3250 

3251 Returns 

3252 ------- 

3253 positions : MaskedArray 

3254 The calculated plotting positions. 

3255 

3256 """ 

3257 data = ma.array(data, copy=False).reshape(1,-1) 

3258 n = data.count() 

3259 plpos = np.empty(data.size, dtype=float) 

3260 plpos[n:] = 0 

3261 plpos[data.argsort(axis=None)[:n]] = ((np.arange(1, n+1) - alpha) / 

3262 (n + 1.0 - alpha - beta)) 

3263 return ma.array(plpos, mask=data._mask) 

3264 

3265 

3266meppf = plotting_positions 

3267 

3268 
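# A minimal usage sketch for `plotting_positions` (alias `meppf`)
# (illustrative, not part of the original module): with the default
# alpha = beta = 0.4, the positions for n = 5 values are
# (i - 0.4)/(n + 0.2) for i = 1..5, roughly
# [0.115, 0.308, 0.5, 0.692, 0.885].
#
# >>> pos = plotting_positions([1., 2., 3., 4., 5.])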

3269def obrientransform(*args): 

3270 """ 

3271 Computes a transform on input data (any number of columns). Used to 

3272 test for homogeneity of variance prior to running one-way stats. Each 

3273 array in ``*args`` is one level of a factor. If an `f_oneway()` run on 

3274 the transformed data is found significant, the variances are unequal. From 

3275 Maxwell and Delaney, p.112. 

3276 

3277 Returns: transformed data for use in an ANOVA 

3278 """ 

3279 data = argstoarray(*args).T 

3280 v = data.var(axis=0,ddof=1) 

3281 m = data.mean(0) 

3282 n = data.count(0).astype(float) 

3283 # result = ((n-1.5)*n*(data-m)**2 - 0.5*v*(n-1))/((n-1)*(n-2)) 

3284 data -= m 

3285 data **= 2 

3286 data *= (n-1.5)*n 

3287 data -= 0.5*v*(n-1) 

3288 data /= (n-1.)*(n-2.) 

3289 if not ma.allclose(v,data.mean(0)): 

3290 raise ValueError("Lack of convergence in obrientransform.") 

3291 

3292 return data 

3293 

3294 
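# A hedged sketch of the intended workflow (invented group data): transform
# the groups, then pass the transformed columns (one per input group) to
# f_oneway; a significant result suggests unequal variances.
#
# >>> from scipy.stats.mstats import obrientransform, f_oneway
# >>> x = [10, 11, 13, 9, 7]
# >>> y = [20, 32, 19, 36, 28]
# >>> t = obrientransform(x, y)   # columns of t correspond to x and y
# >>> f_oneway(t[:, 0], t[:, 1])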

3295def sem(a, axis=0, ddof=1): 

3296 """ 

3297 Calculates the standard error of the mean of the input array. 

3298 

3299 Also sometimes called standard error of measurement. 

3300 

3301 Parameters 

3302 ---------- 

3303 a : array_like 

3304 An array containing the values for which the standard error is 

3305 returned. 

3306 axis : int or None, optional 

3307 If axis is None, ravel `a` first. If axis is an integer, this will be 

3308 the axis over which to operate. Defaults to 0. 

3309 ddof : int, optional 

3310 Delta degrees-of-freedom. How many degrees of freedom to adjust 

3311 for bias in limited samples relative to the population estimate 

3312 of variance. Defaults to 1. 

3313 

3314 Returns 

3315 ------- 

3316 s : ndarray or float 

3317 The standard error of the mean in the sample(s), along the input axis. 

3318 

3319 Notes 

3320 ----- 

3321 The default value for `ddof` changed in scipy 0.15.0 to be consistent with 

3322 `scipy.stats.sem` as well as with the most common definition used (like in 

3323 the R documentation). 

3324 

3325 Examples 

3326 -------- 

3327 Find standard error along the first axis: 

3328 

3329 >>> import numpy as np 

3330 >>> from scipy import stats 

3331 >>> a = np.arange(20).reshape(5,4) 

3332 >>> print(stats.mstats.sem(a)) 

3333 [2.8284271247461903 2.8284271247461903 2.8284271247461903 

3334 2.8284271247461903] 

3335 

3336 Find standard error across the whole array, using n degrees of freedom: 

3337 

3338 >>> print(stats.mstats.sem(a, axis=None, ddof=0)) 

3339 1.2893796958227628 

3340 

3341 """ 

3342 a, axis = _chk_asarray(a, axis) 

3343 n = a.count(axis=axis) 

3344 s = a.std(axis=axis, ddof=ddof) / ma.sqrt(n) 

3345 return s 

3346 

3347 

3348F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue')) 

3349 

3350 

3351def f_oneway(*args): 

3352 """ 

3353 Performs a 1-way ANOVA, returning an F-value and probability given 

3354 any number of groups. From Heiman, pp.394-7. 

3355 

3356 Usage: ``f_oneway(*args)``, where ``*args`` is 2 or more arrays, 

3357 one per treatment group. 

3358 

3359 Returns 

3360 ------- 

3361 statistic : float 

3362 The computed F-value of the test. 

3363 pvalue : float 

3364 The associated p-value from the F-distribution. 

3365 

3366 """ 

3367 # Construct a single array of arguments: each row is a group 

3368 data = argstoarray(*args) 

3369 ngroups = len(data) 

3370 ntot = data.count() 

3371 sstot = (data**2).sum() - (data.sum())**2/float(ntot) 

3372 ssbg = (data.count(-1) * (data.mean(-1)-data.mean())**2).sum() 

3373 sswg = sstot-ssbg 

3374 dfbg = ngroups-1 

3375 dfwg = ntot - ngroups 

3376 msb = ssbg/float(dfbg) 

3377 msw = sswg/float(dfwg) 

3378 f = msb/msw 

3379 prob = special.fdtrc(dfbg, dfwg, f) # equivalent to stats.f.sf 

3380 

3381 return F_onewayResult(f, prob) 

3382 

3383 
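# A small worked example of f_oneway (invented groups, hand-checked):
# between-group mean square 37, within-group mean square 1, so F = 37.0
# on (2, 6) degrees of freedom.
#
# >>> from scipy.stats.mstats import f_oneway
# >>> f_oneway([8, 9, 10], [4, 5, 6], [1, 2, 3])
# # -> statistic 37.0, pvalue approximately 4.2e-04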

3384FriedmanchisquareResult = namedtuple('FriedmanchisquareResult', 

3385 ('statistic', 'pvalue')) 

3386 

3387 

3388def friedmanchisquare(*args): 

3389 """Friedman Chi-Square is a non-parametric, one-way within-subjects ANOVA. 

3390 This function calculates the Friedman Chi-square test for repeated measures 

3391 and returns the result, along with the associated probability value. 

3392 

3393 Each input array is considered a group. Ideally, the number of treatments 

3394 in each group should be equal. If this is not the case, only the first 

3395 n treatments are taken into account, where n is the number of treatments 

3396 of the smallest group. 

3397 If a group has some missing values, the corresponding treatments are masked 

3398 in the other groups. 

3399 The test statistic is corrected for ties. 

3400 

3401 Masked values in one group are propagated to the other groups. 

3402 

3403 Returns 

3404 ------- 

3405 statistic : float 

3406 the test statistic. 

3407 pvalue : float 

3408 the associated p-value. 

3409 

3410 """ 

3411 data = argstoarray(*args).astype(float) 

3412 k = len(data) 

3413 if k < 3: 

3414 raise ValueError("Less than 3 groups (%i): " % k + 

3415 "the Friedman test is NOT appropriate.") 

3416 

3417 ranked = ma.masked_values(rankdata(data, axis=0), 0) 

3418 if ranked._mask is not nomask: 

3419 ranked = ma.mask_cols(ranked) 

3420 ranked = ranked.compressed().reshape(k,-1).view(ndarray) 

3421 else: 

3422 ranked = ranked._data 

3423 (k,n) = ranked.shape 

3424 # Ties correction 

3425 repeats = [find_repeats(row) for row in ranked.T] 

3426 ties = np.array([y for x, y in repeats if x.size > 0]) 

3427 tie_correction = 1 - (ties**3-ties).sum()/float(n*(k**3-k)) 

3428 

3429 ssbg = np.sum((ranked.sum(-1) - n*(k+1)/2.)**2) 

3430 chisq = ssbg * 12./(n*k*(k+1)) * 1./tie_correction 

3431 

3432 return FriedmanchisquareResult(chisq, 

3433 distributions.chi2.sf(chisq, k-1)) 

3434 

3435 
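# A minimal sketch of friedmanchisquare (fabricated repeated measures,
# hand-checked): every subject ranks the three treatments 1 < 2 < 3, so
# the rank sums are (4, 8, 12) and, with no ties, the statistic is
# 12/(n*k*(k+1)) * sum((R_j - n*(k+1)/2)**2) = 8.0 on k-1 = 2 degrees
# of freedom.
#
# >>> from scipy.stats.mstats import friedmanchisquare
# >>> friedmanchisquare([1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6])
# # -> statistic 8.0, pvalue approximately 0.018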

3436BrunnerMunzelResult = namedtuple('BrunnerMunzelResult', ('statistic', 'pvalue')) 

3437 

3438 

3439def brunnermunzel(x, y, alternative="two-sided", distribution="t"): 

3440 """ 

3441 Computes the Brunner-Munzel test on samples x and y. 

3442 

3443 Missing values in `x` and/or `y` are discarded. 

3444 

3445 Parameters 

3446 ---------- 

3447 x, y : array_like 

3448 Array of samples, should be one-dimensional. 

3449 alternative : 'less', 'two-sided', or 'greater', optional 

3450 Whether to get the p-value for the one-sided hypothesis ('less' 

3451 or 'greater') or for the two-sided hypothesis ('two-sided'). 

3452 Default value is 'two-sided'. 

3453 distribution : 't' or 'normal', optional 

3454 Whether to get the p-value by t-distribution or by standard normal 

3455 distribution. 

3456 Default value is 't'. 

3457 

3458 Returns 

3459 ------- 

3460 statistic : float 

3461 The Brunner-Munzel W statistic. 

3462 pvalue : float 

3463 p-value assuming a t distribution. One-sided or 

3464 two-sided, depending on the choice of `alternative` and `distribution`. 

3465 

3466 See Also 

3467 -------- 

3468 mannwhitneyu : Mann-Whitney rank test on two samples. 

3469 

3470 Notes 

3471 ----- 

3472 For more details on `brunnermunzel`, see `scipy.stats.brunnermunzel`. 

3473 

3474 """ 

3475 x = ma.asarray(x).compressed().view(ndarray) 

3476 y = ma.asarray(y).compressed().view(ndarray) 

3477 nx = len(x) 

3478 ny = len(y) 

3479 if nx == 0 or ny == 0: 

3480 return BrunnerMunzelResult(np.nan, np.nan) 

3481 rankc = rankdata(np.concatenate((x,y))) 

3482 rankcx = rankc[0:nx] 

3483 rankcy = rankc[nx:nx+ny] 

3484 rankcx_mean = np.mean(rankcx) 

3485 rankcy_mean = np.mean(rankcy) 

3486 rankx = rankdata(x) 

3487 ranky = rankdata(y) 

3488 rankx_mean = np.mean(rankx) 

3489 ranky_mean = np.mean(ranky) 

3490 

3491 Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0)) 

3492 Sx /= nx - 1 

3493 Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0)) 

3494 Sy /= ny - 1 

3495 

3496 wbfn = nx * ny * (rankcy_mean - rankcx_mean) 

3497 wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy) 

3498 

3499 if distribution == "t": 

3500 df_numer = np.power(nx * Sx + ny * Sy, 2.0) 

3501 df_denom = np.power(nx * Sx, 2.0) / (nx - 1) 

3502 df_denom += np.power(ny * Sy, 2.0) / (ny - 1) 

3503 df = df_numer / df_denom 
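# (The df above is a Satterthwaite-type approximation to the degrees of
# freedom of the t reference distribution, as in scipy.stats.brunnermunzel.)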

3504 p = distributions.t.cdf(wbfn, df) 

3505 elif distribution == "normal": 

3506 p = distributions.norm.cdf(wbfn) 

3507 else: 

3508 raise ValueError( 

3509 "distribution should be 't' or 'normal'") 

3510 

3511 if alternative == "greater": 

3512 pass 

3513 elif alternative == "less": 

3514 p = 1 - p 

3515 elif alternative == "two-sided": 

3516 p = 2 * np.min([p, 1-p]) 

3517 else: 

3518 raise ValueError( 

3519 "alternative should be 'less', 'greater' or 'two-sided'") 

3520 

3521 return BrunnerMunzelResult(wbfn, p)
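# A usage sketch of brunnermunzel; the data mirror the example in the
# scipy.stats.brunnermunzel documentation:
#
# >>> from scipy.stats.mstats import brunnermunzel
# >>> x1 = [1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 1, 1]
# >>> x2 = [3, 3, 4, 3, 1, 2, 3, 1, 1, 5, 4]
# >>> brunnermunzel(x1, x2)
# # -> statistic approximately 3.1375, pvalue approximately 0.0058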