1"""
2An extension of scipy.stats._stats_py to support masked arrays
4"""
5# Original author (2007): Pierre GF Gerard-Marchant


__all__ = ['argstoarray',
           'count_tied_groups',
           'describe',
           'f_oneway', 'find_repeats', 'friedmanchisquare',
           'kendalltau', 'kendalltau_seasonal', 'kruskal', 'kruskalwallis',
           'ks_twosamp', 'ks_2samp', 'kurtosis', 'kurtosistest',
           'ks_1samp', 'kstest',
           'linregress',
           'mannwhitneyu', 'meppf', 'mode', 'moment', 'mquantiles', 'msign',
           'normaltest',
           'obrientransform',
           'pearsonr', 'plotting_positions', 'pointbiserialr',
           'rankdata',
           'scoreatpercentile', 'sem',
           'sen_seasonal_slopes', 'skew', 'skewtest', 'spearmanr',
           'siegelslopes', 'theilslopes',
           'tmax', 'tmean', 'tmin', 'trim', 'trimboth',
           'trimtail', 'trima', 'trimr', 'trimmed_mean', 'trimmed_std',
           'trimmed_stde', 'trimmed_var', 'tsem', 'ttest_1samp', 'ttest_onesamp',
           'ttest_ind', 'ttest_rel', 'tvar',
           'variation',
           'winsorize',
           'brunnermunzel',
           ]

import numpy as np
from numpy import ndarray
import numpy.ma as ma
from numpy.ma import masked, nomask
import math

import itertools
import warnings
from collections import namedtuple

from . import distributions
from scipy._lib._util import _rename_parameter, _contains_nan
from scipy._lib._bunch import _make_tuple_bunch
import scipy.special as special
import scipy.stats._stats_py

from ._stats_mstats_common import (
    _find_repeats,
    linregress as stats_linregress,
    LinregressResult as stats_LinregressResult,
    theilslopes as stats_theilslopes,
    siegelslopes as stats_siegelslopes
    )


def _chk_asarray(a, axis):
    # Always returns a masked array, raveled for axis=None
    a = ma.asanyarray(a)
    if axis is None:
        a = ma.ravel(a)
        outaxis = 0
    else:
        outaxis = axis
    return a, outaxis


def _chk2_asarray(a, b, axis):
    a = ma.asanyarray(a)
    b = ma.asanyarray(b)
    if axis is None:
        a = ma.ravel(a)
        b = ma.ravel(b)
        outaxis = 0
    else:
        outaxis = axis
    return a, b, outaxis


def _chk_size(a, b):
    a = ma.asanyarray(a)
    b = ma.asanyarray(b)
    (na, nb) = (a.size, b.size)
    if na != nb:
        raise ValueError("The size of the input array should match!"
                         " (%s <> %s)" % (na, nb))
    return (a, b, na)


def argstoarray(*args):
    """
    Constructs a 2D array from a group of sequences.

    Sequences are filled with missing values to match the length of the
    longest sequence.

    Parameters
    ----------
    *args : sequences
        Group of sequences.

    Returns
    -------
    argstoarray : MaskedArray
        A ( `m` x `n` ) masked array, where `m` is the number of arguments and
        `n` the length of the longest argument.

    Notes
    -----
    `numpy.ma.row_stack` has identical behavior, but is called with a sequence
    of sequences.

    Examples
    --------
    A 2D masked array constructed from a group of sequences is returned.

    >>> from scipy.stats.mstats import argstoarray
    >>> argstoarray([1, 2, 3], [4, 5, 6])
    masked_array(
      data=[[1.0, 2.0, 3.0],
            [4.0, 5.0, 6.0]],
      mask=[[False, False, False],
            [False, False, False]],
      fill_value=1e+20)

    The returned masked array is filled with missing values when the lengths
    of the sequences differ.

    >>> argstoarray([1, 3], [4, 5, 6])
    masked_array(
      data=[[1.0, 3.0, --],
            [4.0, 5.0, 6.0]],
      mask=[[False, False, True],
            [False, False, False]],
      fill_value=1e+20)

    """
    if len(args) == 1 and not isinstance(args[0], ndarray):
        output = ma.asarray(args[0])
        if output.ndim != 2:
            raise ValueError("The input should be 2D")
    else:
        n = len(args)
        m = max([len(k) for k in args])
        output = ma.array(np.empty((n, m), dtype=float), mask=True)
        for (k, v) in enumerate(args):
            output[k, :len(v)] = v

    output[np.logical_not(np.isfinite(output._data))] = masked
    return output


def find_repeats(arr):
    """Find repeats in arr and return a tuple (repeats, repeat_count).

    The input is cast to float64. Masked values are discarded.

    Parameters
    ----------
    arr : sequence
        Input array. The array is flattened if it is not 1D.

    Returns
    -------
    repeats : ndarray
        Array of repeated values.
    counts : ndarray
        Array of counts.

    Examples
    --------
    >>> from scipy.stats import mstats
    >>> mstats.find_repeats([2, 1, 2, 3, 2, 2, 5])
    (array([2.]), array([4]))

    In the above example, 2 repeats 4 times.

    >>> mstats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
    (array([4., 5.]), array([2, 2]))

    In the above example, both 4 and 5 repeat 2 times.

    """
    # Make sure we get a copy. ma.compressed promises a "new array", but can
    # actually return a reference.
    compr = np.asarray(ma.compressed(arr), dtype=np.float64)
    try:
        need_copy = np.may_share_memory(compr, arr)
    except AttributeError:
        # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
        # while in numpy 1.8.2 and above it just (correctly) returns False.
        need_copy = False
    if need_copy:
        compr = compr.copy()
    return _find_repeats(compr)


def count_tied_groups(x, use_missing=False):
    """
    Counts the number of tied values.

    Parameters
    ----------
    x : sequence
        Sequence of data on which to count the ties.
    use_missing : bool, optional
        Whether to consider missing values as tied.

    Returns
    -------
    count_tied_groups : dict
        Returns a dictionary (nb of ties: nb of groups).

    Examples
    --------
    >>> from scipy.stats import mstats
    >>> import numpy as np
    >>> z = [0, 0, 0, 2, 2, 2, 3, 3, 4, 5, 6]
    >>> mstats.count_tied_groups(z)
    {2: 1, 3: 2}

    In the above example, the ties were 0 (3x), 2 (3x) and 3 (2x).

    >>> z = np.ma.array([0, 0, 1, 2, 2, 2, 3, 3, 4, 5, 6])
    >>> mstats.count_tied_groups(z)
    {2: 2, 3: 1}
    >>> z[[1,-1]] = np.ma.masked
    >>> mstats.count_tied_groups(z, use_missing=True)
    {2: 2, 3: 1}

    """
    nmasked = ma.getmask(x).sum()
    # We need the copy as find_repeats will overwrite the initial data
    data = ma.compressed(x).copy()
    (ties, counts) = find_repeats(data)
    nties = {}
    if len(ties):
        nties = dict(zip(np.unique(counts), itertools.repeat(1)))
        nties.update(dict(zip(*find_repeats(counts))))

    if nmasked and use_missing:
        try:
            nties[nmasked] += 1
        except KeyError:
            nties[nmasked] = 1

    return nties


def rankdata(data, axis=None, use_missing=False):
    """Returns the rank (also known as order statistics) of each data point
    along the given axis.

    If some values are tied, their rank is averaged.
    If some values are masked, their rank is set to 0 if use_missing is False,
    or set to the average rank of the unmasked values if use_missing is True.

    Parameters
    ----------
    data : sequence
        Input data. The data is transformed to a masked array.
    axis : {None,int}, optional
        Axis along which to perform the ranking.
        If None, the array is first flattened. An exception is raised if
        the axis is specified for arrays with a dimension larger than 2.
    use_missing : bool, optional
        Whether the masked values have a rank of 0 (False) or equal to the
        average rank of the unmasked values (True).

    """
    def _rank1d(data, use_missing=False):
        n = data.count()
        rk = np.empty(data.size, dtype=float)
        idx = data.argsort()
        rk[idx[:n]] = np.arange(1, n+1)

        if use_missing:
            rk[idx[n:]] = (n+1)/2.
        else:
            rk[idx[n:]] = 0

        repeats = find_repeats(data.copy())
        for r in repeats[0]:
            condition = (data == r).filled(False)
            rk[condition] = rk[condition].mean()
        return rk

    data = ma.array(data, copy=False)
    if axis is None:
        if data.ndim > 1:
            return _rank1d(data.ravel(), use_missing).reshape(data.shape)
        else:
            return _rank1d(data, use_missing)
    else:
        return ma.apply_along_axis(_rank1d, axis, data, use_missing).view(ndarray)
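
# A minimal usage sketch for `rankdata` (added for illustration; not part of
# the original module). Ties share the average of the ranks they occupy, and
# masked entries get rank 0 unless ``use_missing=True``:
#
#     >>> import numpy.ma as ma
#     >>> from scipy.stats import mstats
#     >>> mstats.rankdata([0, 2, 2, 3])       # ties share rank (2+3)/2
#     array([1. , 2.5, 2.5, 4. ])
#     >>> z = ma.array([0, 2, 2, 3], mask=[False, False, False, True])
#     >>> mstats.rankdata(z)                  # the masked value gets rank 0
#     array([1. , 2.5, 2.5, 0. ])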


ModeResult = namedtuple('ModeResult', ('mode', 'count'))


def mode(a, axis=0):
    """
    Returns an array of the modal (most common) value in the passed array.

    Parameters
    ----------
    a : array_like
        n-dimensional array of which to find mode(s).
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over
        the whole array `a`.

    Returns
    -------
    mode : ndarray
        Array of modal values.
    count : ndarray
        Array of counts for each mode.

    Notes
    -----
    For more details, see `scipy.stats.mode`.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> from scipy.stats import mstats
    >>> m_arr = np.ma.array([1, 1, 0, 0, 0, 0], mask=[0, 0, 1, 1, 1, 0])
    >>> mstats.mode(m_arr)  # note that most zeros are masked
    ModeResult(mode=array([1.]), count=array([2.]))

    """
    return _mode(a, axis=axis, keepdims=True)


def _mode(a, axis=0, keepdims=True):
    # Don't want to expose `keepdims` from the public `mstats.mode`
    a, axis = _chk_asarray(a, axis)

    def _mode1D(a):
        (rep, cnt) = find_repeats(a)
        if not cnt.ndim:
            return (0, 0)
        elif cnt.size:
            return (rep[cnt.argmax()], cnt.max())
        else:
            return (a.min(), 1)

    if axis is None:
        output = _mode1D(ma.ravel(a))
        output = (ma.array(output[0]), ma.array(output[1]))
    else:
        output = ma.apply_along_axis(_mode1D, axis, a)
        if keepdims is None or keepdims:
            newshape = list(a.shape)
            newshape[axis] = 1
            slices = [slice(None)] * output.ndim
            slices[axis] = 0
            modes = output[tuple(slices)].reshape(newshape)
            slices[axis] = 1
            counts = output[tuple(slices)].reshape(newshape)
            output = (modes, counts)
        else:
            output = np.moveaxis(output, axis, 0)

    return ModeResult(*output)


def _betai(a, b, x):
    x = np.asanyarray(x)
    x = ma.where(x < 1.0, x, 1.0)  # if x > 1 then return 1.0
    return special.betainc(a, b, x)


def msign(x):
    """Returns the sign of x, or 0 if x is masked."""
    return ma.filled(np.sign(x), 0)


def pearsonr(x, y):
    r"""
    Pearson correlation coefficient and p-value for testing non-correlation.

    The Pearson correlation coefficient [1]_ measures the linear relationship
    between two datasets. The calculation of the p-value relies on the
    assumption that each dataset is normally distributed. (See Kowalski [3]_
    for a discussion of the effects of non-normality of the input on the
    distribution of the correlation coefficient.) Like other correlation
    coefficients, this one varies between -1 and +1 with 0 implying no
    correlation. Correlations of -1 or +1 imply an exact linear relationship.

    Parameters
    ----------
    x : (N,) array_like
        Input array.
    y : (N,) array_like
        Input array.

    Returns
    -------
    r : float
        Pearson's correlation coefficient.
    p-value : float
        Two-tailed p-value.

    Warns
    -----
    PearsonRConstantInputWarning
        Raised if an input is a constant array. The correlation coefficient
        is not defined in this case, so ``np.nan`` is returned.

    PearsonRNearConstantInputWarning
        Raised if an input is "nearly" constant. The array ``x`` is considered
        nearly constant if ``norm(x - mean(x)) < 1e-13 * abs(mean(x))``.
        Numerical errors in the calculation ``x - mean(x)`` in this case might
        result in an inaccurate calculation of r.

    See Also
    --------
    spearmanr : Spearman rank-order correlation coefficient.
    kendalltau : Kendall's tau, a correlation measure for ordinal data.

    Notes
    -----
    The correlation coefficient is calculated as follows:

    .. math::

        r = \frac{\sum (x - m_x) (y - m_y)}
                 {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}

    where :math:`m_x` is the mean of the vector x and :math:`m_y` is
    the mean of the vector y.

    Under the assumption that x and y are drawn from
    independent normal distributions (so the population correlation coefficient
    is 0), the probability density function of the sample correlation
    coefficient r is ([1]_, [2]_):

    .. math::

        f(r) = \frac{{(1-r^2)}^{n/2-2}}{\mathrm{B}(\frac{1}{2},\frac{n}{2}-1)}

    where n is the number of samples, and B is the beta function. This
    is sometimes referred to as the exact distribution of r. This is
    the distribution that is used in `pearsonr` to compute the p-value.
    The distribution is a beta distribution on the interval [-1, 1],
    with equal shape parameters a = b = n/2 - 1. In terms of SciPy's
    implementation of the beta distribution, the distribution of r is::

        dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)

    The p-value returned by `pearsonr` is a two-sided p-value. The p-value
    roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. More precisely, for a
    given sample with correlation coefficient r, the p-value is
    the probability that abs(r') of a random sample x' and y' drawn from
    the population with zero correlation would be greater than or equal
    to abs(r). In terms of the object ``dist`` shown above, the p-value
    for a given r and length n can be computed as::

        p = 2*dist.cdf(-abs(r))

    When n is 2, the above continuous distribution is not well-defined.
    One can interpret the limit of the beta distribution as the shape
    parameters a and b approach a = b = 0 as a discrete distribution with
    equal probability masses at r = 1 and r = -1. More directly, one
    can observe that, given the data x = [x1, x2] and y = [y1, y2], and
    assuming x1 != x2 and y1 != y2, the only possible values for r are 1
    and -1. Because abs(r') for any sample x' and y' with length 2 will
    be 1, the two-sided p-value for a sample of length 2 is always 1.

    References
    ----------
    .. [1] "Pearson correlation coefficient", Wikipedia,
           https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
    .. [2] Student, "Probable error of a correlation coefficient",
           Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310.
    .. [3] C. J. Kowalski, "On the Effects of Non-Normality on the Distribution
           of the Sample Product-Moment Correlation Coefficient"
           Journal of the Royal Statistical Society. Series C (Applied
           Statistics), Vol. 21, No. 1 (1972), pp. 1-12.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> from scipy.stats import mstats
    >>> mstats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4])
    (-0.7426106572325057, 0.1505558088534455)

    There is a linear dependence between x and y if y = a + b*x + e, where
    a,b are constants and e is a random error term, assumed to be independent
    of x. For simplicity, assume that x is standard normal, a=0, b=1 and let
    e follow a normal distribution with mean zero and standard deviation s>0.

    >>> s = 0.5
    >>> x = stats.norm.rvs(size=500)
    >>> e = stats.norm.rvs(scale=s, size=500)
    >>> y = x + e
    >>> mstats.pearsonr(x, y)
    (0.9029601878969703, 8.428978827629898e-185) # may vary

    This should be close to the exact value given by

    >>> 1/np.sqrt(1 + s**2)
    0.8944271909999159

    For s=0.5, we observe a high level of correlation. In general, a large
    variance of the noise reduces the correlation, while the correlation
    approaches one as the variance of the error goes to zero.

    It is important to keep in mind that no correlation does not imply
    independence unless (x, y) is jointly normal. Correlation can even be zero
    when there is a very simple dependence structure: if X follows a
    standard normal distribution, let y = abs(x). Note that the correlation
    between x and y is zero. Indeed, since the expectation of x is zero,
    cov(x, y) = E[x*y]. By definition, this equals E[x*abs(x)] which is zero
    by symmetry. The following lines of code illustrate this observation:

    >>> y = np.abs(x)
    >>> mstats.pearsonr(x, y)
    (-0.016172891856853524, 0.7182823678751942) # may vary

    A non-zero correlation coefficient can be misleading. For example, if X has
    a standard normal distribution, define y = x if x < 0 and y = 0 otherwise.
    A simple calculation shows that corr(x, y) = sqrt(2/Pi) = 0.797...,
    implying a high level of correlation:

    >>> y = np.where(x < 0, x, 0)
    >>> mstats.pearsonr(x, y)
    (0.8537091583771509, 3.183461621422181e-143) # may vary

    This is unintuitive since there is no dependence of x and y if x is larger
    than zero, which happens in about half of the cases if we sample x and y.

    """
    (x, y, n) = _chk_size(x, y)
    (x, y) = (x.ravel(), y.ravel())
    # Get the common mask and the total nb of unmasked elements
    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    n -= m.sum()
    df = n - 2
    if df < 0:
        return (masked, masked)

    return scipy.stats._stats_py.pearsonr(ma.masked_array(x, mask=m).compressed(),
                                          ma.masked_array(y, mask=m).compressed())


def spearmanr(x, y=None, use_ties=True, axis=None, nan_policy='propagate',
              alternative='two-sided'):
    """
    Calculates a Spearman rank-order correlation coefficient and the p-value
    to test for non-correlation.

    The Spearman correlation is a nonparametric measure of the linear
    relationship between two datasets. Unlike the Pearson correlation, the
    Spearman correlation does not assume that both datasets are normally
    distributed. Like other correlation coefficients, this one varies
    between -1 and +1 with 0 implying no correlation. Correlations of -1 or
    +1 imply a monotonic relationship. Positive correlations imply that
    as `x` increases, so does `y`. Negative correlations imply that as `x`
    increases, `y` decreases.

    Missing values are discarded pair-wise: if a value is missing in `x`, the
    corresponding value in `y` is masked.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Spearman correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    Parameters
    ----------
    x, y : 1D or 2D array_like, y is optional
        One or two 1-D or 2-D arrays containing multiple variables and
        observations. When these are 1-D, each represents a vector of
        observations of a single variable. For the behavior in the 2-D case,
        see under ``axis``, below.
    use_ties : bool, optional
        DO NOT USE. Does not do anything, keyword is only left in place for
        backwards compatibility reasons.
    axis : int or None, optional
        If axis=0 (default), then each column represents a variable, with
        observations in the rows. If axis=1, the relationship is transposed:
        each row represents a variable, while the columns contain observations.
        If axis=None, then both arrays will be raveled.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the correlation is nonzero
        * 'less': the correlation is negative (less than zero)
        * 'greater': the correlation is positive (greater than zero)

        .. versionadded:: 1.7.0

    Returns
    -------
    res : SignificanceResult
        An object containing attributes:

        statistic : float or ndarray (2-D square)
            Spearman correlation matrix or correlation coefficient (if only 2
            variables are given as parameters). Correlation matrix is square
            with length equal to total number of variables (columns or rows) in
            ``a`` and ``b`` combined.
        pvalue : float
            The p-value for a hypothesis test whose null hypothesis
            is that two sets of data are linearly uncorrelated. See
            `alternative` above for alternative hypotheses. `pvalue` has the
            same shape as `statistic`.

    References
    ----------
    [CRCProbStat2000] section 14.7

    """
    if not use_ties:
        raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")

    # Always returns a masked array, raveled if axis=None
    x, axisout = _chk_asarray(x, axis)
    if y is not None:
        # Deal only with 2-D `x` case.
        y, _ = _chk_asarray(y, axis)
        if axisout == 0:
            x = ma.column_stack((x, y))
        else:
            x = ma.row_stack((x, y))

    if axisout == 1:
        # To simplify the code that follows (always use `n_obs, n_vars` shape)
        x = x.T

    if nan_policy == 'omit':
        x = ma.masked_invalid(x)

    def _spearmanr_2cols(x):
        # Mask the same observations for all variables, and then drop those
        # observations (can't leave them masked, rankdata is weird).
        x = ma.mask_rowcols(x, axis=0)
        x = x[~x.mask.any(axis=1), :]

        # If either column is entirely NaN or Inf
        if not np.any(x.data):
            res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
            res.correlation = np.nan
            return res

        m = ma.getmask(x)
        n_obs = x.shape[0]
        dof = n_obs - 2 - int(m.sum(axis=0)[0])
        if dof < 0:
            raise ValueError("The input must have at least 3 entries!")

        # Gets the ranks and rank differences
        x_ranked = rankdata(x, axis=0)
        rs = ma.corrcoef(x_ranked, rowvar=False).data

        # rs can have elements equal to 1, so avoid zero division warnings
        with np.errstate(divide='ignore'):
            # clip the small negative values possibly caused by rounding
            # errors before taking the square root
            t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))

        t, prob = scipy.stats._stats_py._ttest_finish(dof, t, alternative)

        # For backwards compatibility, return scalars when comparing 2 columns
        if rs.shape == (2, 2):
            res = scipy.stats._stats_py.SignificanceResult(rs[1, 0],
                                                           prob[1, 0])
            res.correlation = rs[1, 0]
            return res
        else:
            res = scipy.stats._stats_py.SignificanceResult(rs, prob)
            res.correlation = rs
            return res

    # Need to do this per pair of variables, otherwise the dropped observations
    # in a third column mess up the result for a pair.
    n_vars = x.shape[1]
    if n_vars == 2:
        return _spearmanr_2cols(x)
    else:
        rs = np.ones((n_vars, n_vars), dtype=float)
        prob = np.zeros((n_vars, n_vars), dtype=float)
        for var1 in range(n_vars - 1):
            for var2 in range(var1 + 1, n_vars):
                result = _spearmanr_2cols(x[:, [var1, var2]])
                rs[var1, var2] = result.correlation
                rs[var2, var1] = result.correlation
                prob[var1, var2] = result.pvalue
                prob[var2, var1] = result.pvalue

        res = scipy.stats._stats_py.SignificanceResult(rs, prob)
        res.correlation = rs
        return res
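
# A minimal usage sketch for `spearmanr` (added for illustration; not part of
# the original module). Masked entries are dropped pair-wise before ranking:
#
#     >>> import numpy.ma as ma
#     >>> from scipy.stats import mstats
#     >>> x = ma.array([1, 2, 3, 4, 5], mask=[0, 0, 0, 0, 1])
#     >>> y = [5, 6, 7, 8, 7]
#     >>> res = mstats.spearmanr(x, y)
#     >>> res.statistic   # rank correlation of the 4 unmasked pairs -> 1.0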


def _kendall_p_exact(n, c, alternative='two-sided'):
    # Use the fact that distribution is symmetric: always calculate a CDF in
    # the left tail.
    # This will be the one-sided p-value if `c` is on the side of
    # the null distribution predicted by the alternative hypothesis.
    # The two-sided p-value will be twice this value.
    # If `c` is on the other side of the null distribution, we'll need to
    # take the complement and add back the probability mass at `c`.
    in_right_tail = (c >= (n*(n-1))//2 - c)
    alternative_greater = (alternative == 'greater')
    c = int(min(c, (n*(n-1))//2 - c))

    # Exact p-value, see Maurice G. Kendall, "Rank Correlation Methods"
    # (4th Edition), Charles Griffin & Co., 1970.
    if n <= 0:
        raise ValueError(f'n ({n}) must be positive')
    elif c < 0 or 4*c > n*(n-1):
        raise ValueError(f'c ({c}) must satisfy 0 <= 4c <= n(n-1) = {n*(n-1)}.')
    elif n == 1:
        prob = 1.0
        p_mass_at_c = 1
    elif n == 2:
        prob = 1.0
        p_mass_at_c = 0.5
    elif c == 0:
        prob = 2.0/math.factorial(n) if n < 171 else 0.0
        p_mass_at_c = prob/2
    elif c == 1:
        prob = 2.0/math.factorial(n-1) if n < 172 else 0.0
        p_mass_at_c = (n-1)/math.factorial(n)
    elif 4*c == n*(n-1) and alternative == 'two-sided':
        # I'm sure there's a simple formula for p_mass_at_c in this
        # case, but I don't know it. Use generic formula for one-sided p-value.
        prob = 1.0
    elif n < 171:
        new = np.zeros(c+1)
        new[0:2] = 1.0
        for j in range(3, n+1):
            new = np.cumsum(new)
            if j <= c:
                new[j:] -= new[:c+1-j]
        prob = 2.0*np.sum(new)/math.factorial(n)
        p_mass_at_c = new[-1]/math.factorial(n)
    else:
        new = np.zeros(c+1)
        new[0:2] = 1.0
        for j in range(3, n+1):
            new = np.cumsum(new)/j
            if j <= c:
                new[j:] -= new[:c+1-j]
        prob = np.sum(new)
        p_mass_at_c = new[-1]/2

    if alternative != 'two-sided':
        # If the observed tail agrees with the alternative hypothesis,
        # the one-sided p-value is half the two-sided p-value.
        if in_right_tail == alternative_greater:
            prob /= 2
        else:
            prob = 1 - prob/2 + p_mass_at_c

    prob = np.clip(prob, 0, 1)

    return prob
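
# Illustration (added; not part of the original module): the `new` array above
# implements the classical Kendall counting recursion. After processing
# j = 3..n, ``new[d]`` is (up to the 1/j! normalization applied in the
# large-n branch) the number of permutations of n items with exactly d
# discordant pairs, i.e. the coefficients of
# prod_{j=1..n} (1 + x + ... + x**(j-1)). For n = 3 this polynomial is
# (1 + x)(1 + x + x**2) = 1 + 2x + 2x**2 + x**3, so out of the 3! = 6
# permutations, the counts for d = 0, 1, 2, 3 discordant pairs are 1, 2, 2, 1.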


def kendalltau(x, y, use_ties=True, use_missing=False, method='auto',
               alternative='two-sided'):
    """
    Computes Kendall's rank correlation tau on two variables *x* and *y*.

    Parameters
    ----------
    x : sequence
        First data list (for example, time).
    y : sequence
        Second data list.
    use_ties : {True, False}, optional
        Whether ties correction should be performed.
    use_missing : {False, True}, optional
        Whether missing data should be allocated a rank of 0 (False) or the
        average rank (True).
    method : {'auto', 'asymptotic', 'exact'}, optional
        Defines which method is used to calculate the p-value [1]_.
        'asymptotic' uses a normal approximation valid for large samples.
        'exact' computes the exact p-value, but can only be used if no ties
        are present. As the sample size increases, the 'exact' computation
        time may grow and the result may lose some precision.
        'auto' is the default and selects the appropriate
        method based on a trade-off between speed and accuracy.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the rank correlation is nonzero
        * 'less': the rank correlation is negative (less than zero)
        * 'greater': the rank correlation is positive (greater than zero)

    Returns
    -------
    res : SignificanceResult
        An object containing attributes:

        statistic : float
            The tau statistic.
        pvalue : float
            The p-value for a hypothesis test whose null hypothesis is
            an absence of association, tau = 0.

    References
    ----------
    .. [1] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
           Charles Griffin & Co., 1970.

    """
    (x, y, n) = _chk_size(x, y)
    (x, y) = (x.flatten(), y.flatten())
    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    if m is not nomask:
        x = ma.array(x, mask=m, copy=True)
        y = ma.array(y, mask=m, copy=True)
        # need int() here, otherwise numpy defaults to 32 bit
        # integer on all Windows architectures, causing overflow.
        # int() will keep it infinite precision.
        n -= int(m.sum())

    if n < 2:
        res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
        res.correlation = np.nan
        return res

    rx = ma.masked_equal(rankdata(x, use_missing=use_missing), 0)
    ry = ma.masked_equal(rankdata(y, use_missing=use_missing), 0)
    idx = rx.argsort()
    (rx, ry) = (rx[idx], ry[idx])
    C = np.sum([((ry[i+1:] > ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum()
                for i in range(len(ry)-1)], dtype=float)
    D = np.sum([((ry[i+1:] < ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum()
                for i in range(len(ry)-1)], dtype=float)
    xties = count_tied_groups(x)
    yties = count_tied_groups(y)
    if use_ties:
        corr_x = np.sum([v*k*(k-1) for (k, v) in xties.items()], dtype=float)
        corr_y = np.sum([v*k*(k-1) for (k, v) in yties.items()], dtype=float)
        denom = ma.sqrt((n*(n-1)-corr_x)/2. * (n*(n-1)-corr_y)/2.)
    else:
        denom = n*(n-1)/2.
    tau = (C-D) / denom

    if method == 'exact' and (xties or yties):
        raise ValueError("Ties found, exact method cannot be used.")

    if method == 'auto':
        if (not xties and not yties) and (n <= 33 or min(C, n*(n-1)/2.0-C) <= 1):
            method = 'exact'
        else:
            method = 'asymptotic'

    if not xties and not yties and method == 'exact':
        prob = _kendall_p_exact(n, C, alternative)

    elif method == 'asymptotic':
        var_s = n*(n-1)*(2*n+5)
        if use_ties:
            var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k, v) in xties.items()])
            var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k, v) in yties.items()])
            v1 = np.sum([v*k*(k-1) for (k, v) in xties.items()], dtype=float) *\
                 np.sum([v*k*(k-1) for (k, v) in yties.items()], dtype=float)
            v1 /= 2.*n*(n-1)
            if n > 2:
                v2 = np.sum([v*k*(k-1)*(k-2) for (k, v) in xties.items()],
                            dtype=float) * \
                     np.sum([v*k*(k-1)*(k-2) for (k, v) in yties.items()],
                            dtype=float)
                v2 /= 9.*n*(n-1)*(n-2)
            else:
                v2 = 0
        else:
            v1 = v2 = 0

        var_s /= 18.
        var_s += (v1 + v2)
        z = (C-D)/np.sqrt(var_s)
        _, prob = scipy.stats._stats_py._normtest_finish(z, alternative)
    else:
        raise ValueError("Unknown method "+str(method)+" specified, please "
                         "use auto, exact or asymptotic.")

    res = scipy.stats._stats_py.SignificanceResult(tau, prob)
    res.correlation = tau
    return res
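
# A minimal usage sketch for `kendalltau` (added for illustration; not part of
# the original module). Masked positions in either input are ignored in both:
#
#     >>> import numpy.ma as ma
#     >>> from scipy.stats import mstats
#     >>> x = ma.array([1, 2, 3, 4], mask=[0, 0, 0, 1])
#     >>> y = [1, 3, 2, 9]
#     >>> res = mstats.kendalltau(x, y)
#     >>> res.statistic   # 3 pairs: 2 concordant, 1 discordant -> 1/3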


def kendalltau_seasonal(x):
    """
    Computes a multivariate Kendall's rank correlation tau, for seasonal data.

    Parameters
    ----------
    x : 2-D ndarray
        Array of seasonal data, with seasons in columns.

    """
    x = ma.array(x, subok=True, copy=False, ndmin=2)
    (n, m) = x.shape
    n_p = x.count(0)

    S_szn = sum(msign(x[i:]-x[i]).sum(0) for i in range(n))
    S_tot = S_szn.sum()

    n_tot = x.count()
    ties = count_tied_groups(x.compressed())
    corr_ties = sum(v*k*(k-1) for (k, v) in ties.items())
    denom_tot = ma.sqrt(1.*n_tot*(n_tot-1)*(n_tot*(n_tot-1)-corr_ties))/2.

    R = rankdata(x, axis=0, use_missing=True)
    K = ma.empty((m, m), dtype=int)
    covmat = ma.empty((m, m), dtype=float)
    denom_szn = ma.empty(m, dtype=float)
    for j in range(m):
        ties_j = count_tied_groups(x[:, j].compressed())
        corr_j = sum(v*k*(k-1) for (k, v) in ties_j.items())
        cmb = n_p[j]*(n_p[j]-1)
        for k in range(j, m, 1):
            K[j, k] = sum(msign((x[i:, j]-x[i, j])*(x[i:, k]-x[i, k])).sum()
                          for i in range(n))
            covmat[j, k] = (K[j, k] + 4*(R[:, j]*R[:, k]).sum() -
                            n*(n_p[j]+1)*(n_p[k]+1))/3.
            K[k, j] = K[j, k]
            covmat[k, j] = covmat[j, k]

        denom_szn[j] = ma.sqrt(cmb*(cmb-corr_j)) / 2.

    var_szn = covmat.diagonal()

    z_szn = msign(S_szn) * (abs(S_szn)-1) / ma.sqrt(var_szn)
    z_tot_ind = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(var_szn.sum())
    z_tot_dep = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(covmat.sum())

    prob_szn = special.erfc(abs(z_szn)/np.sqrt(2))
    prob_tot_ind = special.erfc(abs(z_tot_ind)/np.sqrt(2))
    prob_tot_dep = special.erfc(abs(z_tot_dep)/np.sqrt(2))

    chi2_tot = (z_szn*z_szn).sum()
    chi2_trd = m * z_szn.mean()**2
    output = {'seasonal tau': S_szn/denom_szn,
              'global tau': S_tot/denom_tot,
              'global tau (alt)': S_tot/denom_szn.sum(),
              'seasonal p-value': prob_szn,
              'global p-value (indep)': prob_tot_ind,
              'global p-value (dep)': prob_tot_dep,
              'chi2 total': chi2_tot,
              'chi2 trend': chi2_trd,
              }
    return output
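
# A minimal usage sketch for `kendalltau_seasonal` (added for illustration; not
# part of the original module). The function returns a plain dict of results:
#
#     >>> import numpy as np
#     >>> from scipy.stats import mstats
#     >>> x = np.random.default_rng(0).random((12, 4))   # 12 obs, 4 seasons
#     >>> out = mstats.kendalltau_seasonal(x)
#     >>> sorted(out)[:3]
#     ['chi2 total', 'chi2 trend', 'global p-value (dep)']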


PointbiserialrResult = namedtuple('PointbiserialrResult', ('correlation',
                                                           'pvalue'))


def pointbiserialr(x, y):
    """Calculates a point biserial correlation coefficient and its p-value.

    Parameters
    ----------
    x : array_like of bools
        Input array.
    y : array_like
        Input array.

    Returns
    -------
    correlation : float
        R value
    pvalue : float
        2-tailed p-value

    Notes
    -----
    Missing values are considered pair-wise: if a value is missing in x,
    the corresponding value in y is masked.

    For more details on `pointbiserialr`, see `scipy.stats.pointbiserialr`.

    """
    x = ma.fix_invalid(x, copy=True).astype(bool)
    y = ma.fix_invalid(y, copy=True).astype(float)
    # Get rid of the missing data
    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    if m is not nomask:
        unmask = np.logical_not(m)
        x = x[unmask]
        y = y[unmask]

    n = len(x)
    # phat is the fraction of x values that are True
    phat = x.sum() / float(n)
    y0 = y[~x]  # y-values where x is False
    y1 = y[x]  # y-values where x is True
    y0m = y0.mean()
    y1m = y1.mean()

    rpb = (y1m - y0m)*np.sqrt(phat * (1-phat)) / y.std()

    df = n-2
    t = rpb*ma.sqrt(df/(1.0-rpb**2))
    prob = _betai(0.5*df, 0.5, df/(df+t*t))

    return PointbiserialrResult(rpb, prob)
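
# A minimal usage sketch for `pointbiserialr` (added for illustration; not part
# of the original module). `x` is boolean, `y` is numeric:
#
#     >>> from scipy.stats import mstats
#     >>> x = [True, True, True, False, False, False]
#     >>> y = [3.0, 2.5, 3.5, 1.0, 0.5, 1.5]
#     >>> res = mstats.pointbiserialr(x, y)
#     >>> res.correlation > 0   # the x=True group has the larger mean y
#     True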


def linregress(x, y=None):
    r"""
    Linear regression calculation

    Note that the non-masked version is used, and that this docstring is
    replaced by the non-masked docstring + some info on missing data.

    """
    if y is None:
        x = ma.array(x)
        if x.shape[0] == 2:
            x, y = x
        elif x.shape[1] == 2:
            x, y = x.T
        else:
            raise ValueError("If only `x` is given as input, "
                             "it has to be of shape (2, N) or (N, 2), "
                             f"provided shape was {x.shape}")
    else:
        x = ma.array(x)
        y = ma.array(y)

    x = x.flatten()
    y = y.flatten()

    if np.amax(x) == np.amin(x) and len(x) > 1:
        raise ValueError("Cannot calculate a linear regression "
                         "if all x values are identical")

    m = ma.mask_or(ma.getmask(x), ma.getmask(y), shrink=False)
    if m is not nomask:
        x = ma.array(x, mask=m)
        y = ma.array(y, mask=m)
        if np.any(~m):
            result = stats_linregress(x.data[~m], y.data[~m])
        else:
            # All data is masked
            result = stats_LinregressResult(slope=None, intercept=None,
                                            rvalue=None, pvalue=None,
                                            stderr=None,
                                            intercept_stderr=None)
    else:
        result = stats_linregress(x.data, y.data)

    return result
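
# A minimal usage sketch for the masked `linregress` wrapper (added for
# illustration; not part of the original module). Masked points are dropped
# before delegating to the non-masked implementation:
#
#     >>> import numpy.ma as ma
#     >>> from scipy.stats import mstats
#     >>> x = ma.array([0, 1, 2, 3], mask=[0, 0, 0, 1])
#     >>> y = [1, 3, 5, 99]   # the outlier sits at the masked position
#     >>> res = mstats.linregress(x, y)
#     >>> res.slope, res.intercept   # fit of the 3 unmasked points -> (2.0, 1.0)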


def theilslopes(y, x=None, alpha=0.95, method='separate'):
    r"""
    Computes the Theil-Sen estimator for a set of points (x, y).

    `theilslopes` implements a method for robust linear regression. It
    computes the slope as the median of all slopes between paired values.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    alpha : float, optional
        Confidence degree between 0 and 1. Default is 95% confidence.
        Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
        interpreted as "find the 90% confidence interval".
    method : {'joint', 'separate'}, optional
        Method to be used for computing estimate for intercept.
        Following methods are supported,

        * 'joint': Uses np.median(y - slope * x) as intercept.
        * 'separate': Uses np.median(y) - slope * np.median(x)
          as intercept.

        The default is 'separate'.

        .. versionadded:: 1.8.0

    Returns
    -------
    result : ``TheilslopesResult`` instance
        The return value is an object with the following attributes:

        slope : float
            Theil slope.
        intercept : float
            Intercept of the Theil line.
        low_slope : float
            Lower bound of the confidence interval on `slope`.
        high_slope : float
            Upper bound of the confidence interval on `slope`.

    See Also
    --------
    siegelslopes : a similar technique using repeated medians

    Notes
    -----
    For more details on `theilslopes`, see `scipy.stats.theilslopes`.

    """
    y = ma.asarray(y).flatten()
    if x is None:
        x = ma.arange(len(y), dtype=float)
    else:
        x = ma.asarray(x).flatten()
        if len(x) != len(y):
            raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))

    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    y._mask = x._mask = m
    # Disregard any masked elements of x or y
    y = y.compressed()
    x = x.compressed().astype(float)
    # We now have unmasked arrays so can use `scipy.stats.theilslopes`
    return stats_theilslopes(y, x, alpha=alpha, method=method)


def siegelslopes(y, x=None, method="hierarchical"):
    r"""
    Computes the Siegel estimator for a set of points (x, y).

    `siegelslopes` implements a method for robust linear regression
    using repeated medians to fit a line to the points (x, y).
    The method is robust to outliers with an asymptotic breakdown point
    of 50%.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    method : {'hierarchical', 'separate'}
        If 'hierarchical', estimate the intercept using the estimated
        slope ``slope`` (default option).
        If 'separate', estimate the intercept independent of the estimated
        slope. See Notes for details.

    Returns
    -------
    result : ``SiegelslopesResult`` instance
        The return value is an object with the following attributes:

        slope : float
            Estimate of the slope of the regression line.
        intercept : float
            Estimate of the intercept of the regression line.

    See Also
    --------
    theilslopes : a similar technique without repeated medians

    Notes
    -----
    For more details on `siegelslopes`, see `scipy.stats.siegelslopes`.

    """
    y = ma.asarray(y).ravel()
    if x is None:
        x = ma.arange(len(y), dtype=float)
    else:
        x = ma.asarray(x).ravel()
        if len(x) != len(y):
            raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))

    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    y._mask = x._mask = m
    # Disregard any masked elements of x or y
    y = y.compressed()
    x = x.compressed().astype(float)
    # We now have unmasked arrays so can use `scipy.stats.siegelslopes`
    return stats_siegelslopes(y, x, method=method)


SenSeasonalSlopesResult = _make_tuple_bunch('SenSeasonalSlopesResult',
                                            ['intra_slope', 'inter_slope'])


def sen_seasonal_slopes(x):
    r"""
    Computes seasonal Theil-Sen and Kendall slope estimators.

    The seasonal generalization of Sen's slope computes the slopes between all
    pairs of values within a "season" (column) of a 2D array. It returns an
    array containing the median of these "within-season" slopes for each
    season (the Theil-Sen slope estimator of each season), and it returns the
    median of the within-season slopes across all seasons (the seasonal Kendall
    slope estimator).

    Parameters
    ----------
    x : 2D array_like
        Each column of `x` contains measurements of the dependent variable
        within a season. The independent variable (usually time) of each season
        is assumed to be ``np.arange(x.shape[0])``.

    Returns
    -------
    result : ``SenSeasonalSlopesResult`` instance
        The return value is an object with the following attributes:

        intra_slope : ndarray
            For each season, the Theil-Sen slope estimator: the median of
            within-season slopes.
        inter_slope : float
            The seasonal Kendall slope estimator: the median of within-season
            slopes *across all* seasons.

    See Also
    --------
    theilslopes : the analogous function for non-seasonal data
    scipy.stats.theilslopes : non-seasonal slopes for non-masked arrays

    Notes
    -----
    The slopes :math:`d_{ijk}` within season :math:`i` are:

    .. math::

        d_{ijk} = \frac{x_{ij} - x_{ik}}{j - k}

    for pairs of distinct integer indices :math:`j, k` of :math:`x`.

    Element :math:`i` of the returned `intra_slope` array is the median of the
    :math:`d_{ijk}` over all :math:`j < k`; this is the Theil-Sen slope
    estimator of season :math:`i`. The returned `inter_slope` value, better
    known as the seasonal Kendall slope estimator, is the median of the
    :math:`d_{ijk}` over all :math:`i, j, k`.

    References
    ----------
    .. [1] Hirsch, Robert M., James R. Slack, and Richard A. Smith.
           "Techniques of trend analysis for monthly water quality data."
           *Water Resources Research* 18.1 (1982): 107-121.

    Examples
    --------
    Suppose we have 100 observations of a dependent variable for each of four
    seasons:

    >>> import numpy as np
    >>> rng = np.random.default_rng()
    >>> x = rng.random(size=(100, 4))

    We compute the seasonal slopes as:

    >>> from scipy import stats
    >>> intra_slope, inter_slope = stats.mstats.sen_seasonal_slopes(x)

    If we define a function to compute all slopes between observations within
    a season:

    >>> def dijk(yi):
    ...     n = len(yi)
    ...     x = np.arange(n)
    ...     dy = yi - yi[:, np.newaxis]
    ...     dx = x - x[:, np.newaxis]
    ...     # we only want unique pairs of distinct indices
    ...     mask = np.triu(np.ones((n, n), dtype=bool), k=1)
    ...     return dy[mask]/dx[mask]

    then element ``i`` of ``intra_slope`` is the median of ``dijk[x[:, i]]``:

    >>> i = 2
    >>> np.allclose(np.median(dijk(x[:, i])), intra_slope[i])
    True

    and ``inter_slope`` is the median of the values returned by ``dijk`` for
    all seasons:

    >>> all_slopes = np.concatenate([dijk(x[:, i]) for i in range(x.shape[1])])
    >>> np.allclose(np.median(all_slopes), inter_slope)
    True

    Because the data are randomly generated, we would expect the median slopes
    to be nearly zero both within and across all seasons, and indeed they are:

    >>> intra_slope.data
    array([ 0.00124504, -0.00277761, -0.00221245, -0.00036338])
    >>> inter_slope
    -0.0010511779872922058

    """
    x = ma.array(x, subok=True, copy=False, ndmin=2)
    (n, _) = x.shape
    # Get list of slopes per season
    szn_slopes = ma.vstack([(x[i+1:]-x[i])/np.arange(1, n-i)[:, None]
                            for i in range(n)])
    szn_medslopes = ma.median(szn_slopes, axis=0)
    medslope = ma.median(szn_slopes, axis=None)
    return SenSeasonalSlopesResult(szn_medslopes, medslope)


Ttest_1sampResult = namedtuple('Ttest_1sampResult', ('statistic', 'pvalue'))


def ttest_1samp(a, popmean, axis=0, alternative='two-sided'):
    """
    Calculates the T-test for the mean of ONE group of scores.

    Parameters
    ----------
    a : array_like
        sample observation
    popmean : float or array_like
        expected value in null hypothesis, if array_like then it must have the
        same shape as `a` excluding the axis dimension
    axis : int or None, optional
        Axis along which to compute test. If None, compute over the whole
        array `a`.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.
        The following options are available (default is 'two-sided'):

        * 'two-sided': the mean of the underlying distribution of the sample
          is different than the given population mean (`popmean`)
        * 'less': the mean of the underlying distribution of the sample is
          less than the given population mean (`popmean`)
        * 'greater': the mean of the underlying distribution of the sample is
          greater than the given population mean (`popmean`)

        .. versionadded:: 1.7.0

    Returns
    -------
    statistic : float or array
        t-statistic
    pvalue : float or array
        The p-value

    Notes
    -----
    For more details on `ttest_1samp`, see `scipy.stats.ttest_1samp`.

    """
    a, axis = _chk_asarray(a, axis)
    if a.size == 0:
        return (np.nan, np.nan)

    x = a.mean(axis=axis)
    v = a.var(axis=axis, ddof=1)
    n = a.count(axis=axis)
    # force df to be an array for masked division not to throw a warning
    df = ma.asanyarray(n - 1.0)
    svar = ((n - 1.0) * v) / df
    with np.errstate(divide='ignore', invalid='ignore'):
        t = (x - popmean) / ma.sqrt(svar / n)

    t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
    return Ttest_1sampResult(t, prob)
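
# A minimal usage sketch for `ttest_1samp` (added for illustration; not part of
# the original module). Masked entries do not contribute to the mean, the
# variance, or the degrees of freedom:
#
#     >>> import numpy.ma as ma
#     >>> from scipy.stats import mstats
#     >>> a = ma.array([5.1, 4.9, 5.0, 5.2, -99.], mask=[0, 0, 0, 0, 1])
#     >>> res = mstats.ttest_1samp(a, popmean=5.0)
#     >>> res.statistic   # computed from the 4 unmasked values only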


ttest_onesamp = ttest_1samp


Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue'))


def ttest_ind(a, b, axis=0, equal_var=True, alternative='two-sided'):
    """
    Calculates the T-test for the means of TWO INDEPENDENT samples of scores.

    Parameters
    ----------
    a, b : array_like
        The arrays must have the same shape, except in the dimension
        corresponding to `axis` (the first, by default).
    axis : int or None, optional
        Axis along which to compute test. If None, compute over the whole
        arrays, `a`, and `b`.
    equal_var : bool, optional
        If True, perform a standard independent 2 sample test that assumes equal
        population variances.
        If False, perform Welch's t-test, which does not assume equal population
        variance.

        .. versionadded:: 0.17.0
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.
        The following options are available (default is 'two-sided'):

        * 'two-sided': the means of the distributions underlying the samples
          are unequal.
        * 'less': the mean of the distribution underlying the first sample
          is less than the mean of the distribution underlying the second
          sample.
        * 'greater': the mean of the distribution underlying the first
          sample is greater than the mean of the distribution underlying
          the second sample.

        .. versionadded:: 1.7.0

    Returns
    -------
    statistic : float or array
        The calculated t-statistic.
    pvalue : float or array
        The p-value.

    Notes
    -----
    For more details on `ttest_ind`, see `scipy.stats.ttest_ind`.

    """
    a, b, axis = _chk2_asarray(a, b, axis)

    if a.size == 0 or b.size == 0:
        return Ttest_indResult(np.nan, np.nan)

    (x1, x2) = (a.mean(axis), b.mean(axis))
    (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1))
    (n1, n2) = (a.count(axis), b.count(axis))

    if equal_var:
        # force df to be an array for masked division not to throw a warning
        df = ma.asanyarray(n1 + n2 - 2.0)
        svar = ((n1-1)*v1+(n2-1)*v2) / df
        denom = ma.sqrt(svar*(1.0/n1 + 1.0/n2))  # n-D computation here!
    else:
        vn1 = v1/n1
        vn2 = v2/n2
        with np.errstate(divide='ignore', invalid='ignore'):
            df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))

        # If df is undefined, variances are zero.
        # It doesn't matter what df is as long as it is not NaN.
        df = np.where(np.isnan(df), 1, df)
        denom = ma.sqrt(vn1 + vn2)

    with np.errstate(divide='ignore', invalid='ignore'):
        t = (x1-x2) / denom

    t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
    return Ttest_indResult(t, prob)


Ttest_relResult = namedtuple('Ttest_relResult', ('statistic', 'pvalue'))


def ttest_rel(a, b, axis=0, alternative='two-sided'):
    """
    Calculates the T-test on TWO RELATED samples of scores, a and b.

    Parameters
    ----------
    a, b : array_like
        The arrays must have the same shape.
    axis : int or None, optional
        Axis along which to compute test. If None, compute over the whole
        arrays, `a`, and `b`.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.
        The following options are available (default is 'two-sided'):

        * 'two-sided': the means of the distributions underlying the samples
          are unequal.
        * 'less': the mean of the distribution underlying the first sample
          is less than the mean of the distribution underlying the second
          sample.
        * 'greater': the mean of the distribution underlying the first
          sample is greater than the mean of the distribution underlying
          the second sample.

        .. versionadded:: 1.7.0

    Returns
    -------
    statistic : float or array
        t-statistic
    pvalue : float or array
        two-tailed p-value

    Notes
    -----
    For more details on `ttest_rel`, see `scipy.stats.ttest_rel`.

    """
    a, b, axis = _chk2_asarray(a, b, axis)
    if len(a) != len(b):
        raise ValueError('unequal length arrays')

    if a.size == 0 or b.size == 0:
        return Ttest_relResult(np.nan, np.nan)

    n = a.count(axis)
    df = ma.asanyarray(n-1.0)
    d = (a-b).astype('d')
    dm = d.mean(axis)
    v = d.var(axis=axis, ddof=1)
    denom = ma.sqrt(v / n)
    with np.errstate(divide='ignore', invalid='ignore'):
        t = dm / denom

    t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
    return Ttest_relResult(t, prob)


MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic',
                                                       'pvalue'))


def mannwhitneyu(x, y, use_continuity=True):
    """
    Computes the Mann-Whitney statistic

    Missing values in `x` and/or `y` are discarded.

    Parameters
    ----------
    x : sequence
        Input
    y : sequence
        Input
    use_continuity : {True, False}, optional
        Whether a continuity correction (1/2.) should be taken into account.

    Returns
    -------
    statistic : float
        The minimum of the Mann-Whitney statistics
    pvalue : float
        Approximate two-sided p-value assuming a normal distribution.

    """
    x = ma.asarray(x).compressed().view(ndarray)
    y = ma.asarray(y).compressed().view(ndarray)
    ranks = rankdata(np.concatenate([x, y]))
    (nx, ny) = (len(x), len(y))
    nt = nx + ny
    U = ranks[:nx].sum() - nx*(nx+1)/2.
    U = max(U, nx*ny - U)
    u = nx*ny - U

    mu = (nx*ny)/2.
    sigsq = (nt**3 - nt)/12.
    ties = count_tied_groups(ranks)
    sigsq -= sum(v*(k**3-k) for (k, v) in ties.items())/12.
    sigsq *= nx*ny/float(nt*(nt-1))

    if use_continuity:
        z = (U - 1/2. - mu) / ma.sqrt(sigsq)
    else:
        z = (U - mu) / ma.sqrt(sigsq)

    prob = special.erfc(abs(z)/np.sqrt(2))
    return MannwhitneyuResult(u, prob)
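
# A minimal usage sketch for `mannwhitneyu` (added for illustration; not part
# of the original module). The statistic returned is the smaller of the two
# Mann-Whitney U values, and masked entries are dropped first:
#
#     >>> from scipy.stats import mstats
#     >>> x = [1, 2, 3]
#     >>> y = [4, 5, 6, 7]
#     >>> mstats.mannwhitneyu(x, y).statistic   # every y beats every x -> 0.0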


KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue'))


def kruskal(*args):
    """
    Compute the Kruskal-Wallis H-test for independent samples

    Parameters
    ----------
    sample1, sample2, ... : array_like
        Two or more arrays with the sample measurements can be given as
        arguments.

    Returns
    -------
    statistic : float
        The Kruskal-Wallis H statistic, corrected for ties
    pvalue : float
        The p-value for the test using the assumption that H has a chi
        square distribution

    Notes
    -----
    For more details on `kruskal`, see `scipy.stats.kruskal`.

    Examples
    --------
    >>> from scipy.stats.mstats import kruskal

    Random samples from three different brands of batteries were tested
    to see how long the charge lasted. Results were as follows:

    >>> a = [6.3, 5.4, 5.7, 5.2, 5.0]
    >>> b = [6.9, 7.0, 6.1, 7.9]
    >>> c = [7.2, 6.9, 6.1, 6.5]

    Test the hypothesis that the distribution functions for all of the brands'
    durations are identical. Use 5% level of significance.

    >>> kruskal(a, b, c)
    KruskalResult(statistic=7.113812154696133, pvalue=0.028526948491942164)

    The null hypothesis is rejected at the 5% level of significance
    because the returned p-value is less than the critical value of 5%.

    """
    output = argstoarray(*args)
    ranks = ma.masked_equal(rankdata(output, use_missing=False), 0)
    sumrk = ranks.sum(-1)
    ngrp = ranks.count(-1)
    ntot = ranks.count()
    H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1)
    # Tie correction
    ties = count_tied_groups(ranks)
    T = 1. - sum(v*(k**3-k) for (k, v) in ties.items())/float(ntot**3-ntot)
    if T == 0:
        raise ValueError('All numbers are identical in kruskal')

    H /= T
    df = len(output) - 1
    prob = distributions.chi2.sf(H, df)
    return KruskalResult(H, prob)


kruskalwallis = kruskal


@_rename_parameter("mode", "method")
def ks_1samp(x, cdf, args=(), alternative="two-sided", method='auto'):
    """
    Computes the Kolmogorov-Smirnov test on one sample of masked values.

    Missing values in `x` are discarded.

    Parameters
    ----------
    x : array_like
        a 1-D array of observations of random variables.
    cdf : str or callable
        If a string, it should be the name of a distribution in `scipy.stats`.
        If a callable, that callable is used to calculate the cdf.
    args : tuple, sequence, optional
        Distribution parameters, used if `cdf` is a string.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Indicates the alternative hypothesis. Default is 'two-sided'.
    method : {'auto', 'exact', 'asymp'}, optional
        Defines the method used for calculating the p-value.
        The following options are available (default is 'auto'):

        * 'auto' : use 'exact' for small size arrays, 'asymp' for large
        * 'exact' : use approximation to exact distribution of test statistic
        * 'asymp' : use asymptotic distribution of test statistic

    Returns
    -------
    d : float
        Value of the Kolmogorov Smirnov test
    p : float
        Corresponding p-value.

    """
    alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
        alternative.lower()[0], alternative)
    return scipy.stats._stats_py.ks_1samp(
        x, cdf, args=args, alternative=alternative, method=method)


@_rename_parameter("mode", "method")
def ks_2samp(data1, data2, alternative="two-sided", method='auto'):
    """
    Computes the Kolmogorov-Smirnov test on two samples.

    Missing values in `x` and/or `y` are discarded.

    Parameters
    ----------
    data1 : array_like
        First data set
    data2 : array_like
        Second data set
    alternative : {'two-sided', 'less', 'greater'}, optional
        Indicates the alternative hypothesis. Default is 'two-sided'.
    method : {'auto', 'exact', 'asymp'}, optional
        Defines the method used for calculating the p-value.
        The following options are available (default is 'auto'):

        * 'auto' : use 'exact' for small size arrays, 'asymp' for large
        * 'exact' : use approximation to exact distribution of test statistic
        * 'asymp' : use asymptotic distribution of test statistic

    Returns
    -------
    d : float
        Value of the Kolmogorov Smirnov test
    p : float
        Corresponding p-value.

    """
    # Ideally this would be accomplished by
    # ks_2samp = scipy.stats._stats_py.ks_2samp
    # but the circular dependencies between _mstats_basic and stats prevent that.
    alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
        alternative.lower()[0], alternative)
    return scipy.stats._stats_py.ks_2samp(data1, data2,
                                          alternative=alternative,
                                          method=method)


ks_twosamp = ks_2samp


@_rename_parameter("mode", "method")
def kstest(data1, data2, args=(), alternative='two-sided', method='auto'):
    """

    Parameters
    ----------
    data1 : array_like
    data2 : str, callable or array_like
    args : tuple, sequence, optional
        Distribution parameters, used if `data1` or `data2` are strings.
    alternative : str, as documented in stats.kstest
    method : str, as documented in stats.kstest

    Returns
    -------
    tuple of (K-S statistic, probability)

    """
    return scipy.stats._stats_py.kstest(data1, data2, args,
                                        alternative=alternative, method=method)


def trima(a, limits=None, inclusive=(True, True)):
    """
    Trims an array by masking the data outside some given limits.

    Returns a masked version of the input array.

    Parameters
    ----------
    a : array_like
        Input array.
    limits : {None, tuple}, optional
        Tuple of (lower limit, upper limit) in absolute values.
        Values of the input array lower (greater) than the lower (upper) limit
        will be masked. A limit of None indicates an open interval.
    inclusive : (bool, bool) tuple, optional
        Tuple of (lower flag, upper flag), indicating whether values exactly
        equal to the lower (upper) limit are allowed.

    Examples
    --------
    >>> from scipy.stats.mstats import trima
    >>> import numpy as np

    >>> a = np.arange(10)

    The interval is left-closed and right-open, i.e., `[2, 8)`.
    Trim the array by keeping only values in the interval.

    >>> trima(a, limits=(2, 8), inclusive=(True, False))
    masked_array(data=[--, --, 2, 3, 4, 5, 6, 7, --, --],
                 mask=[ True,  True, False, False, False, False, False, False,
                        True,  True],
           fill_value=999999)

    """
    a = ma.asarray(a)
    a.unshare_mask()
    if (limits is None) or (limits == (None, None)):
        return a

    (lower_lim, upper_lim) = limits
    (lower_in, upper_in) = inclusive
    condition = False
    if lower_lim is not None:
        if lower_in:
            condition |= (a < lower_lim)
        else:
            condition |= (a <= lower_lim)

    if upper_lim is not None:
        if upper_in:
            condition |= (a > upper_lim)
        else:
            condition |= (a >= upper_lim)

    a[condition.filled(True)] = masked
    return a
1801def trimr(a, limits=None, inclusive=(True, True), axis=None):
1802 """
1803 Trims an array by masking some proportion of the data on each end.
1804 Returns a masked version of the input array.
1806 Parameters
1807 ----------
1808 a : sequence
1809 Input array.
1810 limits : {None, tuple}, optional
1811 Tuple of the percentages to cut on each side of the array, with respect
1812 to the number of unmasked data, as floats between 0. and 1.
1813 If n denotes the number of unmasked data before trimming, the
1814 (n*limits[0])th smallest data and the (n*limits[1])th largest data are
1815 masked, and the total number of unmasked data after trimming is
1816 n*(1.-sum(limits)). The value of one limit can be set to None to
1817 indicate an open interval.
1818 inclusive : {(True,True) tuple}, optional
1819 Tuple of flags indicating whether the number of data being masked on
1820 the left (right) end should be truncated (True) or rounded (False) to
1821 integers.
1822 axis : {None,int}, optional
1823 Axis along which to trim. If None, the whole array is trimmed, but its
1824 shape is maintained.
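Examples
--------
A small illustrative sketch with the default truncation behaviour
(``inclusive=(True, True)``): with n = 5, ``limits=(0.2, 0.4)`` masks
``int(0.2*5) = 1`` value at the low end and ``int(0.4*5) = 2`` values
at the high end.
>>> from scipy.stats.mstats import trimr
>>> print(trimr([5, 1, 4, 2, 3], limits=(0.2, 0.4)))
[-- -- -- 2 3]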
1826 """
1827 def _trimr1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
1828 n = a.count()
1829 idx = a.argsort()
1830 if low_limit:
1831 if low_inclusive:
1832 lowidx = int(low_limit*n)
1833 else:
1834 lowidx = int(np.round(low_limit*n))
1835 a[idx[:lowidx]] = masked
1836 if up_limit is not None:
1837 if up_inclusive:
1838 upidx = n - int(n*up_limit)
1839 else:
1840 upidx = n - int(np.round(n*up_limit))
1841 a[idx[upidx:]] = masked
1842 return a
1844 a = ma.asarray(a)
1845 a.unshare_mask()
1846 if limits is None:
1847 return a
1849 # Check the limits
1850 (lolim, uplim) = limits
1851 errmsg = "The proportion to cut from the %s should be between 0. and 1."
1852 if lolim is not None:
1853 if lolim > 1. or lolim < 0:
1854 raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
1855 if uplim is not None:
1856 if uplim > 1. or uplim < 0:
1857 raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
1859 (loinc, upinc) = inclusive
1861 if axis is None:
1862 shp = a.shape
1863 return _trimr1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp)
1864 else:
1865 return ma.apply_along_axis(_trimr1D, axis, a, lolim,uplim,loinc,upinc)
1868trimdoc = """
1869 Parameters
1870 ----------
1871 a : sequence
1872 Input array
1873 limits : {None, tuple}, optional
1874 If `relative` is False, tuple (lower limit, upper limit) in absolute values.
1875 Values of the input array lower (greater) than the lower (upper) limit are
1876 masked.
1878 If `relative` is True, tuple (lower percentage, upper percentage) to cut
1879 on each side of the array, with respect to the number of unmasked data.
1881 If n denotes the number of unmasked data before trimming, the (n*limits[0])th
1882 smallest data and the (n*limits[1])th largest data are masked, and the
1883 total number of unmasked data after trimming is n*(1.-sum(limits)).
1884 In each case, the value of one limit can be set to None to indicate an
1885 open interval.
1887 If limits is None, no trimming is performed.
1888 inclusive : {(bool, bool) tuple}, optional
1889 If `relative` is False, tuple indicating whether values exactly equal
1890 to the absolute limits are allowed.
1891 If `relative` is True, tuple indicating whether the number of data
1892 being masked on each side should be truncated (True) or rounded
1893 (False).
1894 relative : bool, optional
1895 Whether to consider the limits as absolute values (False) or proportions
1896 to cut (True).
1897 axis : int, optional
1898 Axis along which to trim.
1899"""
1902def trim(a, limits=None, inclusive=(True,True), relative=False, axis=None):
1903 """
1904 Trims an array by masking the data outside some given limits.
1906 Returns a masked version of the input array.
1908 %s
1910 Examples
1911 --------
1912 >>> from scipy.stats.mstats import trim
1913 >>> z = [ 1, 2, 3, 4, 5, 6, 7, 8, 9,10]
1914 >>> print(trim(z,(3,8)))
1915 [-- -- 3 4 5 6 7 8 -- --]
1916 >>> print(trim(z,(0.1,0.2),relative=True))
1917 [-- 2 3 4 5 6 7 8 -- --]
1919 """
1920 if relative:
1921 return trimr(a, limits=limits, inclusive=inclusive, axis=axis)
1922 else:
1923 return trima(a, limits=limits, inclusive=inclusive)
1926if trim.__doc__:
1927 trim.__doc__ = trim.__doc__ % trimdoc
1930def trimboth(data, proportiontocut=0.2, inclusive=(True,True), axis=None):
1931 """
1932 Trims the smallest and largest data values.
1934 Trims the `data` by masking the ``int(proportiontocut * n)`` smallest and
1935 ``int(proportiontocut * n)`` largest values of data along the given axis,
1936 where n is the number of unmasked values before trimming.
1938 Parameters
1939 ----------
1940 data : ndarray
1941 Data to trim.
1942 proportiontocut : float, optional
1943 Percentage of trimming (as a float between 0 and 1).
1944 If n is the number of unmasked values before trimming, the number of
1945 values after trimming is ``(1 - 2*proportiontocut) * n``.
1946 Default is 0.2.
1947 inclusive : {(bool, bool) tuple}, optional
1948 Tuple indicating whether the number of data being masked on each side
1949 should be truncated (True) or rounded (False).
1950 axis : int, optional
1951 Axis along which to perform the trimming.
1952 If None, the input array is first flattened.
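Examples
--------
A small illustrative sketch with the default ``proportiontocut=0.2``:
for n = 10, ``int(0.2*10) = 2`` values are masked on each end.
>>> from scipy.stats.mstats import trimboth
>>> print(trimboth([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
[-- -- 3 4 5 6 7 8 -- --]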
1954 """
1955 return trimr(data, limits=(proportiontocut,proportiontocut),
1956 inclusive=inclusive, axis=axis)
1959def trimtail(data, proportiontocut=0.2, tail='left', inclusive=(True,True),
1960 axis=None):
1961 """
1962 Trims the data by masking values from one tail.
1964 Parameters
1965 ----------
1966 data : array_like
1967 Data to trim.
1968 proportiontocut : float, optional
1969 Percentage of trimming. If n is the number of unmasked values
1970 before trimming, the number of values after trimming is
1971 ``(1 - proportiontocut) * n``. Default is 0.2.
1972 tail : {'left','right'}, optional
1973 If 'left' the `proportiontocut` lowest values will be masked.
1974 If 'right' the `proportiontocut` highest values will be masked.
1975 Default is 'left'.
1976 inclusive : {(bool, bool) tuple}, optional
1977 Tuple indicating whether the number of data being masked on each side
1978 should be truncated (True) or rounded (False). Default is
1979 (True, True).
1980 axis : int, optional
1981 Axis along which to perform the trimming.
1982 If None, the input array is first flattened. Default is None.
1984 Returns
1985 -------
1986 trimtail : ndarray
1987 Returned array of same shape as `data` with masked tail values.
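Examples
--------
A small illustrative sketch masking the highest 30 percent of ten values
(``int(0.3*10) = 3`` values):
>>> from scipy.stats.mstats import trimtail
>>> print(trimtail([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 0.3, tail='right'))
[1 2 3 4 5 6 7 -- -- --]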
1989 """
1990 tail = str(tail).lower()[0]
1991 if tail == 'l':
1992 limits = (proportiontocut,None)
1993 elif tail == 'r':
1994 limits = (None, proportiontocut)
1995 else:
1996 raise TypeError("The tail argument should be in ('left','right')")
1998 return trimr(data, limits=limits, axis=axis, inclusive=inclusive)
2001trim1 = trimtail
2004def trimmed_mean(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
2005 axis=None):
2006 """Returns the trimmed mean of the data along the given axis.
2008 %s
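Examples
--------
A small illustrative sketch with relative limits (the default): masking
the single lowest and single highest of ten values leaves 2..9, whose
mean is 5.5.
>>> from scipy.stats.mstats import trimmed_mean
>>> print(trimmed_mean([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], limits=(0.1, 0.1)))
5.5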
2010 """
2011 if (not isinstance(limits,tuple)) and isinstance(limits,float):
2012 limits = (limits, limits)
2013 if relative:
2014 return trimr(a,limits=limits,inclusive=inclusive,axis=axis).mean(axis=axis)
2015 else:
2016 return trima(a,limits=limits,inclusive=inclusive).mean(axis=axis)
2019if trimmed_mean.__doc__:
2020 trimmed_mean.__doc__ = trimmed_mean.__doc__ % trimdoc
2023def trimmed_var(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
2024 axis=None, ddof=0):
2025 """Returns the trimmed variance of the data along the given axis.
2027 %s
2028 ddof : {0,integer}, optional
2029 Means Delta Degrees of Freedom. The denominator used during
2030 computations is (n-ddof). DDOF=0 corresponds to a biased estimate,
2031 DDOF=1 to an unbiased estimate of the variance.
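Examples
--------
A small illustrative sketch with relative limits (the default): masking
the single lowest and single highest of ten values leaves 2..9, whose
ddof=0 variance is 5.25.
>>> from scipy.stats.mstats import trimmed_var
>>> print(trimmed_var([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], limits=(0.1, 0.1)))
5.25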
2033 """
2034 if (not isinstance(limits,tuple)) and isinstance(limits,float):
2035 limits = (limits, limits)
2036 if relative:
2037 out = trimr(a,limits=limits, inclusive=inclusive,axis=axis)
2038 else:
2039 out = trima(a,limits=limits,inclusive=inclusive)
2041 return out.var(axis=axis, ddof=ddof)
2044if trimmed_var.__doc__:
2045 trimmed_var.__doc__ = trimmed_var.__doc__ % trimdoc
2048def trimmed_std(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
2049 axis=None, ddof=0):
2050 """Returns the trimmed standard deviation of the data along the given axis.
2052 %s
2053 ddof : {0,integer}, optional
2054 Means Delta Degrees of Freedom. The denominator used during
2055 computations is (n-ddof). DDOF=0 corresponds to a biased estimate,
2056 DDOF=1 to an unbiased estimate of the variance.
2058 """
2059 if (not isinstance(limits,tuple)) and isinstance(limits,float):
2060 limits = (limits, limits)
2061 if relative:
2062 out = trimr(a,limits=limits,inclusive=inclusive,axis=axis)
2063 else:
2064 out = trima(a,limits=limits,inclusive=inclusive)
2065 return out.std(axis=axis,ddof=ddof)
2068if trimmed_std.__doc__:
2069 trimmed_std.__doc__ = trimmed_std.__doc__ % trimdoc
2072def trimmed_stde(a, limits=(0.1,0.1), inclusive=(1,1), axis=None):
2073 """
2074 Returns the standard error of the trimmed mean along the given axis.
2076 Parameters
2077 ----------
2078 a : sequence
2079 Input array
2080 limits : {(0.1,0.1), tuple of float}, optional
2081 tuple (lower percentage, upper percentage) to cut on each side of the
2082 array, with respect to the number of unmasked data.
2084 If n is the number of unmasked data before trimming, the values
2085 smaller than ``n * limits[0]`` and the values larger than
2086 ``n * limits[1]`` are masked, and the total number of unmasked
2087 data after trimming is ``n * (1.-sum(limits))``. In each case,
2088 the value of one limit can be set to None to indicate an open interval.
2089 If `limits` is None, no trimming is performed.
2090 inclusive : {(bool, bool) tuple}, optional
2091 Tuple indicating whether the number of data being masked on each side
2092 should be truncated (True) or rounded (False).
2093 axis : int, optional
2094 Axis along which to trim.
2096 Returns
2097 -------
2098 trimmed_stde : scalar or ndarray
2100 """
2101 def _trimmed_stde_1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
2102 "Returns the standard error of the trimmed mean for a 1D input data."
2103 n = a.count()
2104 idx = a.argsort()
lowidx, upidx = 0, n  # defaults, so the winsorizing step below is well-defined
2105 if low_limit:
2106 if low_inclusive:
2107 lowidx = int(low_limit*n)
2108 else:
2109 lowidx = int(np.round(low_limit*n))
2110 a[idx[:lowidx]] = masked
2111 if up_limit is not None:
2112 if up_inclusive:
2113 upidx = n - int(n*up_limit)
2114 else:
2115 upidx = n - int(np.round(n*up_limit))
2116 a[idx[upidx:]] = masked
2117 a[idx[:lowidx]] = a[idx[lowidx]]
2118 a[idx[upidx:]] = a[idx[upidx-1]]
2119 winstd = a.std(ddof=1)
2120 return winstd / ((1-low_limit-up_limit)*np.sqrt(len(a)))
2122 a = ma.array(a, copy=True, subok=True)
2123 a.unshare_mask()
2124 if limits is None:
2125 return a.std(axis=axis,ddof=1)/ma.sqrt(a.count(axis))
2126 if (not isinstance(limits,tuple)) and isinstance(limits,float):
2127 limits = (limits, limits)
2129 # Check the limits
2130 (lolim, uplim) = limits
2131 errmsg = "The proportion to cut from the %s should be between 0. and 1."
2132 if lolim is not None:
2133 if lolim > 1. or lolim < 0:
2134 raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
2135 if uplim is not None:
2136 if uplim > 1. or uplim < 0:
2137 raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
2139 (loinc, upinc) = inclusive
2140 if (axis is None):
2141 return _trimmed_stde_1D(a.ravel(),lolim,uplim,loinc,upinc)
2142 else:
2143 if a.ndim > 2:
2144 raise ValueError("Array 'a' must be at most two dimensional, "
2145 "but got a.ndim = %d" % a.ndim)
2146 return ma.apply_along_axis(_trimmed_stde_1D, axis, a,
2147 lolim,uplim,loinc,upinc)
2150def _mask_to_limits(a, limits, inclusive):
2151 """Mask an array for values outside of given limits.
2153 This is primarily a utility function.
2155 Parameters
2156 ----------
2157 a : array
2158 limits : (float or None, float or None)
2159 A tuple consisting of the (lower limit, upper limit). Values in the
2160 input array less than the lower limit or greater than the upper limit
2161 will be masked out. None implies no limit.
2162 inclusive : (bool, bool)
2163 A tuple consisting of the (lower flag, upper flag). These flags
2164 determine whether values exactly equal to lower or upper are allowed.
2166 Returns
2167 -------
2168 A MaskedArray.
2170 Raises
2171 ------
2172 A ValueError if there are no values within the given limits.
2173 """
2174 lower_limit, upper_limit = limits
2175 lower_include, upper_include = inclusive
2176 am = ma.MaskedArray(a)
2177 if lower_limit is not None:
2178 if lower_include:
2179 am = ma.masked_less(am, lower_limit)
2180 else:
2181 am = ma.masked_less_equal(am, lower_limit)
2183 if upper_limit is not None:
2184 if upper_include:
2185 am = ma.masked_greater(am, upper_limit)
2186 else:
2187 am = ma.masked_greater_equal(am, upper_limit)
2189 if am.count() == 0:
2190 raise ValueError("No array values within given limits")
2192 return am
2195def tmean(a, limits=None, inclusive=(True, True), axis=None):
2196 """
2197 Compute the trimmed mean.
2199 Parameters
2200 ----------
2201 a : array_like
2202 Array of values.
2203 limits : None or (lower limit, upper limit), optional
2204 Values in the input array less than the lower limit or greater than the
2205 upper limit will be ignored. When limits is None (default), then all
2206 values are used. Either of the limit values in the tuple can also be
2207 None representing a half-open interval.
2208 inclusive : (bool, bool), optional
2209 A tuple consisting of the (lower flag, upper flag). These flags
2210 determine whether values exactly equal to the lower or upper limits
2211 are included. The default value is (True, True).
2212 axis : int or None, optional
2213 Axis along which to operate. If None, compute over the
2214 whole array. Default is None.
2216 Returns
2217 -------
2218 tmean : float
2220 Notes
2221 -----
2222 For more details on `tmean`, see `scipy.stats.tmean`.
2224 Examples
2225 --------
2226 >>> import numpy as np
2227 >>> from scipy.stats import mstats
2228 >>> a = np.array([[6, 8, 3, 0],
2229 ... [3, 9, 1, 2],
2230 ... [8, 7, 8, 2],
2231 ... [5, 6, 0, 2],
2232 ... [4, 5, 5, 2]])
2233 ...
2235 >>> mstats.tmean(a, (2,5))
2236 3.3
2237 >>> mstats.tmean(a, (2,5), axis=0)
2238 masked_array(data=[4.0, 5.0, 4.0, 2.0],
2239 mask=[False, False, False, False],
2240 fill_value=1e+20)
2242 """
2243 return trima(a, limits=limits, inclusive=inclusive).mean(axis=axis)
2246def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
2247 """
2248 Compute the trimmed variance.
2250 This function computes the sample variance of an array of values,
2251 while ignoring values which are outside of given `limits`.
2253 Parameters
2254 ----------
2255 a : array_like
2256 Array of values.
2257 limits : None or (lower limit, upper limit), optional
2258 Values in the input array less than the lower limit or greater than the
2259 upper limit will be ignored. When limits is None, then all values are
2260 used. Either of the limit values in the tuple can also be None
2261 representing a half-open interval. The default value is None.
2262 inclusive : (bool, bool), optional
2263 A tuple consisting of the (lower flag, upper flag). These flags
2264 determine whether values exactly equal to the lower or upper limits
2265 are included. The default value is (True, True).
2266 axis : int or None, optional
2267 Axis along which to operate. If None, compute over the
2268 whole array. Default is zero.
2269 ddof : int, optional
2270 Delta degrees of freedom. Default is 1.
2272 Returns
2273 -------
2274 tvar : float
2275 Trimmed variance.
2277 Notes
2278 -----
2279 For more details on `tvar`, see `scipy.stats.tvar`.
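Examples
--------
A small illustrative sketch (default ``ddof=1``): limiting to [2, 9]
keeps eight values whose sample variance is 42/7 = 6.
>>> import numpy as np
>>> from scipy.stats.mstats import tvar
>>> x = np.ma.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
>>> print(tvar(x, limits=(2, 9)))
6.0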
2281 """
2282 a = ma.asarray(a).astype(float).ravel()
2283 if limits is None:
2284 return np.ma.var(a, ddof=ddof)
2286 am = _mask_to_limits(a, limits=limits, inclusive=inclusive)
2288 return np.ma.var(am, axis=axis, ddof=ddof)
2291def tmin(a, lowerlimit=None, axis=0, inclusive=True):
2292 """
2293 Compute the trimmed minimum.
2295 Parameters
2296 ----------
2297 a : array_like
2298 array of values
2299 lowerlimit : None or float, optional
2300 Values in the input array less than the given limit will be ignored.
2301 When lowerlimit is None, then all values are used. The default value
2302 is None.
2303 axis : int or None, optional
2304 Axis along which to operate. Default is 0. If None, compute over the
2305 whole array `a`.
2306 inclusive : {True, False}, optional
2307 This flag determines whether values exactly equal to the lower limit
2308 are included. The default value is True.
2310 Returns
2311 -------
2312 tmin : float, int or ndarray
2314 Notes
2315 -----
2316 For more details on `tmin`, see `scipy.stats.tmin`.
2318 Examples
2319 --------
2320 >>> import numpy as np
2321 >>> from scipy.stats import mstats
2322 >>> a = np.array([[6, 8, 3, 0],
2323 ... [3, 2, 1, 2],
2324 ... [8, 1, 8, 2],
2325 ... [5, 3, 0, 2],
2326 ... [4, 7, 5, 2]])
2327 ...
2328 >>> mstats.tmin(a, 5)
2329 masked_array(data=[5, 7, 5, --],
2330 mask=[False, False, False, True],
2331 fill_value=999999)
2333 """
2334 a, axis = _chk_asarray(a, axis)
2335 am = trima(a, (lowerlimit, None), (inclusive, False))
2336 return ma.minimum.reduce(am, axis)
2339def tmax(a, upperlimit=None, axis=0, inclusive=True):
2340 """
2341 Compute the trimmed maximum.
2343 This function computes the maximum value of an array along a given axis,
2344 while ignoring values larger than a specified upper limit.
2346 Parameters
2347 ----------
2348 a : array_like
2349 array of values
2350 upperlimit : None or float, optional
2351 Values in the input array greater than the given limit will be ignored.
2352 When upperlimit is None, then all values are used. The default value
2353 is None.
2354 axis : int or None, optional
2355 Axis along which to operate. Default is 0. If None, compute over the
2356 whole array `a`.
2357 inclusive : {True, False}, optional
2358 This flag determines whether values exactly equal to the upper limit
2359 are included. The default value is True.
2361 Returns
2362 -------
2363 tmax : float, int or ndarray
2365 Notes
2366 -----
2367 For more details on `tmax`, see `scipy.stats.tmax`.
2369 Examples
2370 --------
2371 >>> import numpy as np
2372 >>> from scipy.stats import mstats
2373 >>> a = np.array([[6, 8, 3, 0],
2374 ... [3, 9, 1, 2],
2375 ... [8, 7, 8, 2],
2376 ... [5, 6, 0, 2],
2377 ... [4, 5, 5, 2]])
2378 ...
2380 >>> mstats.tmax(a, 4)
2381 masked_array(data=[4, --, 3, 2],
2382 mask=[False, True, False, False],
2383 fill_value=999999)
2385 """
2386 a, axis = _chk_asarray(a, axis)
2387 am = trima(a, (None, upperlimit), (False, inclusive))
2388 return ma.maximum.reduce(am, axis)
2391def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
2392 """
2393 Compute the trimmed standard error of the mean.
2395 This function finds the standard error of the mean for given
2396 values, ignoring values outside the given `limits`.
2398 Parameters
2399 ----------
2400 a : array_like
2401 array of values
2402 limits : None or (lower limit, upper limit), optional
2403 Values in the input array less than the lower limit or greater than the
2404 upper limit will be ignored. When limits is None, then all values are
2405 used. Either of the limit values in the tuple can also be None
2406 representing a half-open interval. The default value is None.
2407 inclusive : (bool, bool), optional
2408 A tuple consisting of the (lower flag, upper flag). These flags
2409 determine whether values exactly equal to the lower or upper limits
2410 are included. The default value is (True, True).
2411 axis : int or None, optional
2412 Axis along which to operate. If None, compute over the
2413 whole array. Default is zero.
2414 ddof : int, optional
2415 Delta degrees of freedom. Default is 1.
2417 Returns
2418 -------
2419 tsem : float
2421 Notes
2422 -----
2423 For more details on `tsem`, see `scipy.stats.tsem`.
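Examples
--------
A small illustrative sketch: restricting to [2, 9] leaves eight values
with sample standard deviation sqrt(6), so the standard error is
sqrt(6)/sqrt(8).
>>> import numpy as np
>>> from scipy.stats.mstats import tsem
>>> x = np.ma.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
>>> print(round(tsem(x, limits=(2, 9)), 4))
0.866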
2425 """
2426 a = ma.asarray(a).ravel()
2427 if limits is None:
2428 n = float(a.count())
2429 return a.std(axis=axis, ddof=ddof)/ma.sqrt(n)
2431 am = trima(a.ravel(), limits, inclusive)
2432 sd = np.sqrt(am.var(axis=axis, ddof=ddof))
2433 return sd / np.sqrt(am.count())
2436def winsorize(a, limits=None, inclusive=(True, True), inplace=False,
2437 axis=None, nan_policy='propagate'):
2438 """Returns a Winsorized version of the input array.
2440 The (limits[0])th lowest values are set to the (limits[0])th percentile,
2441 and the (limits[1])th highest values are set to the (1 - limits[1])th
2442 percentile.
2443 Masked values are skipped.
2446 Parameters
2447 ----------
2448 a : sequence
2449 Input array.
2450 limits : {None, tuple of float}, optional
2451 Tuple of the percentages to cut on each side of the array, with respect
2452 to the number of unmasked data, as floats between 0. and 1.
2453 If n is the number of unmasked data, the
2454 (n*limits[0])th smallest data and the (n*limits[1])th largest data are
2455 replaced by the corresponding percentile values (they are not
2456 masked, unlike in trimming). The value of one limit can be set to None
2457 to indicate an open interval.
2458 inclusive : {(True, True) tuple}, optional
2459 Tuple indicating whether the number of data being masked on each side
2460 should be truncated (True) or rounded (False).
2461 inplace : {False, True}, optional
2462 Whether to winsorize in place (True) or to use a copy (False)
2463 axis : {None, int}, optional
2464 Axis along which to trim. If None, the whole array is trimmed, but its
2465 shape is maintained.
2466 nan_policy : {'propagate', 'raise', 'omit'}, optional
2467 Defines how to handle when input contains nan.
2468 The following options are available (default is 'propagate'):
2470 * 'propagate': allows nan values and may overwrite or propagate them
2471 * 'raise': throws an error
2472 * 'omit': performs the calculations ignoring nan values
2474 Notes
2475 -----
2476 This function is applied to reduce the effect of possibly spurious outliers
2477 by limiting the extreme values.
2479 Examples
2480 --------
2481 >>> import numpy as np
2482 >>> from scipy.stats.mstats import winsorize
2484 A shuffled array contains integers from 1 to 10.
2486 >>> a = np.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6])
2488 The lowest 10% of the values (i.e., `1`) and the highest 20% of the
2489 values (i.e., `9` and `10`) are replaced.
2491 >>> winsorize(a, limits=[0.1, 0.2])
2492 masked_array(data=[8, 4, 8, 8, 5, 3, 7, 2, 2, 6],
2493 mask=False,
2494 fill_value=999999)
2496 """
2497 def _winsorize1D(a, low_limit, up_limit, low_include, up_include,
2498 contains_nan, nan_policy):
2499 n = a.count()
2500 idx = a.argsort()
2501 if contains_nan:
2502 nan_count = np.count_nonzero(np.isnan(a))
2503 if low_limit:
2504 if low_include:
2505 lowidx = int(low_limit * n)
2506 else:
2507 lowidx = np.round(low_limit * n).astype(int)
2508 if contains_nan and nan_policy == 'omit':
2509 lowidx = min(lowidx, n-nan_count-1)
2510 a[idx[:lowidx]] = a[idx[lowidx]]
2511 if up_limit is not None:
2512 if up_include:
2513 upidx = n - int(n * up_limit)
2514 else:
2515 upidx = n - np.round(n * up_limit).astype(int)
2516 if contains_nan and nan_policy == 'omit':
2517 a[idx[upidx:-nan_count]] = a[idx[upidx - 1]]
2518 else:
2519 a[idx[upidx:]] = a[idx[upidx - 1]]
2520 return a
2522 contains_nan, nan_policy = _contains_nan(a, nan_policy)
2523 # We are going to modify a: better make a copy
2524 a = ma.array(a, copy=np.logical_not(inplace))
2526 if limits is None:
2527 return a
2528 if (not isinstance(limits, tuple)) and isinstance(limits, float):
2529 limits = (limits, limits)
2531 # Check the limits
2532 (lolim, uplim) = limits
2533 errmsg = "The proportion to cut from the %s should be between 0. and 1."
2534 if lolim is not None:
2535 if lolim > 1. or lolim < 0:
2536 raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
2537 if uplim is not None:
2538 if uplim > 1. or uplim < 0:
2539 raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
2541 (loinc, upinc) = inclusive
2543 if axis is None:
2544 shp = a.shape
2545 return _winsorize1D(a.ravel(), lolim, uplim, loinc, upinc,
2546 contains_nan, nan_policy).reshape(shp)
2547 else:
2548 return ma.apply_along_axis(_winsorize1D, axis, a, lolim, uplim, loinc,
2549 upinc, contains_nan, nan_policy)
2552def moment(a, moment=1, axis=0):
2553 """
2554 Calculates the nth moment about the mean for a sample.
2556 Parameters
2557 ----------
2558 a : array_like
2559 data
2560 moment : int, optional
2561 order of central moment that is returned
2562 axis : int or None, optional
2563 Axis along which the central moment is computed. Default is 0.
2564 If None, compute over the whole array `a`.
2566 Returns
2567 -------
2568 n-th central moment : ndarray or float
2569 The appropriate moment along the given axis or over all values if axis
2570 is None. The denominator for the moment calculation is the number of
2571 observations; no degrees of freedom correction is done.
2573 Notes
2574 -----
2575 For more details about `moment`, see `scipy.stats.moment`.
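Examples
--------
A small illustrative sketch: the second central moment of 1..5
(mean 3) is (4 + 1 + 0 + 1 + 4)/5 = 2.
>>> from scipy.stats.mstats import moment
>>> print(moment([1, 2, 3, 4, 5], moment=2))
2.0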
2577 """
2578 a, axis = _chk_asarray(a, axis)
2579 if a.size == 0:
2580 moment_shape = list(a.shape)
2581 del moment_shape[axis]
2582 dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
2583 # empty array, return nan(s) with shape matching `moment`
2584 out_shape = (moment_shape if np.isscalar(moment)
2585 else [len(moment)] + moment_shape)
2586 if len(out_shape) == 0:
2587 return dtype(np.nan)
2588 else:
2589 return ma.array(np.full(out_shape, np.nan, dtype=dtype))
2591 # for array_like moment input, return a value for each.
2592 if not np.isscalar(moment):
2593 mean = a.mean(axis, keepdims=True)
2594 mmnt = [_moment(a, i, axis, mean=mean) for i in moment]
2595 return ma.array(mmnt)
2596 else:
2597 return _moment(a, moment, axis)
2599# Moment with optional pre-computed mean, equal to a.mean(axis, keepdims=True)
2600def _moment(a, moment, axis, *, mean=None):
2601 if np.abs(moment - np.round(moment)) > 0:
2602 raise ValueError("All moment parameters must be integers")
2604 if moment == 0 or moment == 1:
2605 # By definition the zeroth moment about the mean is 1, and the first
2606 # moment is 0.
2607 shape = list(a.shape)
2608 del shape[axis]
2609 dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
2611 if len(shape) == 0:
2612 return dtype(1.0 if moment == 0 else 0.0)
2613 else:
2614 return (ma.ones(shape, dtype=dtype) if moment == 0
2615 else ma.zeros(shape, dtype=dtype))
2616 else:
2617 # Exponentiation by squares: form exponent sequence
2618 n_list = [moment]
2619 current_n = moment
2620 while current_n > 2:
2621 if current_n % 2:
2622 current_n = (current_n-1)/2
2623 else:
2624 current_n /= 2
2625 n_list.append(current_n)
2627 # Starting point for exponentiation by squares
2628 mean = a.mean(axis, keepdims=True) if mean is None else mean
2629 a_zero_mean = a - mean
2630 if n_list[-1] == 1:
2631 s = a_zero_mean.copy()
2632 else:
2633 s = a_zero_mean**2
2635 # Perform multiplications
2636 for n in n_list[-2::-1]:
2637 s = s**2
2638 if n % 2:
2639 s *= a_zero_mean
2640 return s.mean(axis)
2643def variation(a, axis=0, ddof=0):
2644 """
2645 Compute the coefficient of variation.
2647 The coefficient of variation is the standard deviation divided by the
2648 mean. This function is equivalent to::
2650 np.std(x, axis=axis, ddof=ddof) / np.mean(x)
2652 The default for ``ddof`` is 0, but many definitions of the coefficient
2653 of variation use the square root of the unbiased sample variance
2654 for the sample standard deviation, which corresponds to ``ddof=1``.
2656 Parameters
2657 ----------
2658 a : array_like
2659 Input array.
2660 axis : int or None, optional
2661 Axis along which to calculate the coefficient of variation. Default
2662 is 0. If None, compute over the whole array `a`.
2663 ddof : int, optional
2664 Delta degrees of freedom. Default is 0.
2666 Returns
2667 -------
2668 variation : ndarray
2669 The calculated variation along the requested axis.
2671 Notes
2672 -----
2673 For more details about `variation`, see `scipy.stats.variation`.
2675 Examples
2676 --------
2677 >>> import numpy as np
2678 >>> from scipy.stats.mstats import variation
2679 >>> a = np.array([2,8,4])
2680 >>> variation(a)
2681 0.5345224838248487
2682 >>> b = np.array([2,8,3,4])
2683 >>> c = np.ma.masked_array(b, mask=[0,0,1,0])
2684 >>> variation(c)
2685 0.5345224838248487
2687 The example above shows that this function behaves the same as
2688 `scipy.stats.variation`, except that `scipy.stats.mstats.variation`
2689 ignores masked array elements.
2691 """
2692 a, axis = _chk_asarray(a, axis)
2693 return a.std(axis, ddof=ddof)/a.mean(axis)
2696def skew(a, axis=0, bias=True):
2697 """
2698 Computes the skewness of a data set.
2700 Parameters
2701 ----------
2702 a : ndarray
2703 data
2704 axis : int or None, optional
2705 Axis along which skewness is calculated. Default is 0.
2706 If None, compute over the whole array `a`.
2707 bias : bool, optional
2708 If False, then the calculations are corrected for statistical bias.
2710 Returns
2711 -------
2712 skewness : ndarray
2713 The skewness of values along an axis, returning 0 where all values are
2714 equal.
2716 Notes
2717 -----
2718 For more details about `skew`, see `scipy.stats.skew`.
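Examples
--------
A small illustrative sketch: a symmetric sample has zero third central
moment, hence zero skewness.
>>> from scipy.stats.mstats import skew
>>> print(skew([1, 2, 3, 4, 5]))
0.0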
2720 """
2721 a, axis = _chk_asarray(a,axis)
2722 mean = a.mean(axis, keepdims=True)
2723 m2 = _moment(a, 2, axis, mean=mean)
2724 m3 = _moment(a, 3, axis, mean=mean)
2725 zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
2726 with np.errstate(all='ignore'):
2727 vals = ma.where(zero, 0, m3 / m2**1.5)
2729 if not bias and zero is not ma.masked and m2 is not ma.masked:
2730 n = a.count(axis)
2731 can_correct = ~zero & (n > 2)
2732 if can_correct.any():
2733 n = np.extract(can_correct, n)
2734 m2 = np.extract(can_correct, m2)
2735 m3 = np.extract(can_correct, m3)
2736 nval = ma.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5
2737 np.place(vals, can_correct, nval)
2738 return vals
2741def kurtosis(a, axis=0, fisher=True, bias=True):
2742 """
2743 Computes the kurtosis (Fisher or Pearson) of a dataset.
2745 Kurtosis is the fourth central moment divided by the square of the
2746 variance. If Fisher's definition is used, then 3.0 is subtracted from
2747 the result to give 0.0 for a normal distribution.
2749 If bias is False then the kurtosis is calculated using k statistics to
2750 eliminate bias coming from biased moment estimators.
2752 Use `kurtosistest` to see if result is close enough to normal.
2754 Parameters
2755 ----------
2756 a : array
2757 data for which the kurtosis is calculated
2758 axis : int or None, optional
2759 Axis along which the kurtosis is calculated. Default is 0.
2760 If None, compute over the whole array `a`.
2761 fisher : bool, optional
2762 If True, Fisher's definition is used (normal ==> 0.0). If False,
2763 Pearson's definition is used (normal ==> 3.0).
2764 bias : bool, optional
2765 If False, then the calculations are corrected for statistical bias.
2767 Returns
2768 -------
2769 kurtosis : array
2770 The kurtosis of values along an axis. If all values are equal,
2771 return -3 for Fisher's definition and 0 for Pearson's definition.
2773 Notes
2774 -----
2775 For more details about `kurtosis`, see `scipy.stats.kurtosis`.
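Examples
--------
A small illustrative sketch with the defaults (Fisher definition,
biased): for 1..5, m4/m2**2 = 6.8/4 = 1.7, so the excess kurtosis is
1.7 - 3 = -1.3.
>>> from scipy.stats.mstats import kurtosis
>>> print(kurtosis([1, 2, 3, 4, 5]))
-1.3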
2777 """
2778 a, axis = _chk_asarray(a, axis)
2779 mean = a.mean(axis, keepdims=True)
2780 m2 = _moment(a, 2, axis, mean=mean)
2781 m4 = _moment(a, 4, axis, mean=mean)
2782 zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
2783 with np.errstate(all='ignore'):
2784 vals = ma.where(zero, 0, m4 / m2**2.0)
2786 if not bias and zero is not ma.masked and m2 is not ma.masked:
2787 n = a.count(axis)
2788 can_correct = ~zero & (n > 3)
2789 if can_correct.any():
2790 n = np.extract(can_correct, n)
2791 m2 = np.extract(can_correct, m2)
2792 m4 = np.extract(can_correct, m4)
2793 nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0)
2794 np.place(vals, can_correct, nval+3.0)
2795 if fisher:
2796 return vals - 3
2797 else:
2798 return vals
2801DescribeResult = namedtuple('DescribeResult', ('nobs', 'minmax', 'mean',
2802 'variance', 'skewness',
2803 'kurtosis'))
2806def describe(a, axis=0, ddof=0, bias=True):
2807 """
2808 Computes several descriptive statistics of the passed array.
2810 Parameters
2811 ----------
2812 a : array_like
2813 Data array
2814 axis : int or None, optional
2815 Axis along which to calculate statistics. Default 0. If None,
2816 compute over the whole array `a`.
2817 ddof : int, optional
2818 degree of freedom (default 0); note that default ddof is different
2819 from the same routine in stats.describe
2820 bias : bool, optional
2821 If False, then the skewness and kurtosis calculations are corrected for
2822 statistical bias.
2824 Returns
2825 -------
2826 nobs : int
2827 size of the data (discarding missing values)
2829 minmax : (int, int)
2830 min, max
2832 mean : float
2833 arithmetic mean
2835 variance : float
2836 variance, computed with the given `ddof` (biased for the default ddof=0)
2838 skewness : float
2839 biased skewness
2841 kurtosis : float
2842 biased kurtosis
2844 Examples
2845 --------
2846 >>> import numpy as np
2847 >>> from scipy.stats.mstats import describe
2848 >>> ma = np.ma.array(range(6), mask=[0, 0, 0, 1, 1, 1])
2849 >>> describe(ma)
2850 DescribeResult(nobs=3, minmax=(masked_array(data=0,
2851 mask=False,
2852 fill_value=999999), masked_array(data=2,
2853 mask=False,
2854 fill_value=999999)), mean=1.0, variance=0.6666666666666666,
2855 skewness=masked_array(data=0., mask=False, fill_value=1e+20),
2856 kurtosis=-1.5)
2858 """
2859 a, axis = _chk_asarray(a, axis)
2860 n = a.count(axis)
2861 mm = (ma.minimum.reduce(a, axis=axis), ma.maximum.reduce(a, axis=axis))
2862 m = a.mean(axis)
2863 v = a.var(axis, ddof=ddof)
2864 sk = skew(a, axis, bias=bias)
2865 kurt = kurtosis(a, axis, bias=bias)
2867 return DescribeResult(n, mm, m, v, sk, kurt)
2870def stde_median(data, axis=None):
2871 """Returns the McKean-Schrader estimate of the standard error of the sample
2872 median along the given axis. masked values are discarded.
2874 Parameters
2875 ----------
2876 data : ndarray
2877 Data to trim.
2878 axis : {None,int}, optional
2879 Axis along which to perform the trimming.
2880 If None, the input array is first flattened.
2882 """
2883 def _stdemed_1D(data):
2884 data = np.sort(data.compressed())
2885 n = len(data)
2886 z = 2.5758293035489004  # 99.5% quantile of the standard normal distribution
2887 k = int(np.round((n+1)/2. - z * np.sqrt(n/4.),0))
2888 return ((data[n-k] - data[k-1])/(2.*z))
2890 data = ma.array(data, copy=False, subok=True)
2891 if (axis is None):
2892 return _stdemed_1D(data)
2893 else:
2894 if data.ndim > 2:
2895 raise ValueError("Array 'data' must be at most two dimensional, "
2896 "but got data.ndim = %d" % data.ndim)
2897 return ma.apply_along_axis(_stdemed_1D, axis, data)
2900SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue'))
2903def skewtest(a, axis=0, alternative='two-sided'):
2904 """
2905 Tests whether the skew is different from the normal distribution.
2907 Parameters
2908 ----------
2909 a : array_like
2910 The data to be tested
2911 axis : int or None, optional
2912 Axis along which statistics are calculated. Default is 0.
2913 If None, compute over the whole array `a`.
2914 alternative : {'two-sided', 'less', 'greater'}, optional
2915 Defines the alternative hypothesis. Default is 'two-sided'.
2916 The following options are available:
2918 * 'two-sided': the skewness of the distribution underlying the sample
2919 is different from that of the normal distribution (i.e. 0)
2920 * 'less': the skewness of the distribution underlying the sample
2921 is less than that of the normal distribution
2922 * 'greater': the skewness of the distribution underlying the sample
2923 is greater than that of the normal distribution
2925 .. versionadded:: 1.7.0
2927 Returns
2928 -------
2929 statistic : array_like
2930 The computed z-score for this test.
2931 pvalue : array_like
2932 A p-value for the hypothesis test
2934 Notes
2935 -----
2936 For more details about `skewtest`, see `scipy.stats.skewtest`.
2938 """
2939 a, axis = _chk_asarray(a, axis)
2940 if axis is None:
2941 a = a.ravel()
2942 axis = 0
2943 b2 = skew(a,axis)
2944 n = a.count(axis)
2945 if np.min(n) < 8:
2946 raise ValueError(
2947 "skewtest is not valid with less than 8 samples; %i samples"
2948 " were given." % np.min(n))
2950 y = b2 * ma.sqrt(((n+1)*(n+3)) / (6.0*(n-2)))
2951 beta2 = (3.0*(n*n+27*n-70)*(n+1)*(n+3)) / ((n-2.0)*(n+5)*(n+7)*(n+9))
2952 W2 = -1 + ma.sqrt(2*(beta2-1))
2953 delta = 1/ma.sqrt(0.5*ma.log(W2))
2954 alpha = ma.sqrt(2.0/(W2-1))
2955 y = ma.where(y == 0, 1, y)
2956 Z = delta*ma.log(y/alpha + ma.sqrt((y/alpha)**2+1))
2958 return SkewtestResult(*scipy.stats._stats_py._normtest_finish(Z, alternative))
2961KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue'))
2964def kurtosistest(a, axis=0, alternative='two-sided'):
2965 """
2966 Tests whether a dataset has normal kurtosis
2968 Parameters
2969 ----------
2970 a : array_like
2971 array of the sample data
2972 axis : int or None, optional
2973 Axis along which to compute test. Default is 0. If None,
2974 compute over the whole array `a`.
2975 alternative : {'two-sided', 'less', 'greater'}, optional
2976 Defines the alternative hypothesis.
2977 The following options are available (default is 'two-sided'):
2979 * 'two-sided': the kurtosis of the distribution underlying the sample
2980 is different from that of the normal distribution
2981 * 'less': the kurtosis of the distribution underlying the sample
2982 is less than that of the normal distribution
2983 * 'greater': the kurtosis of the distribution underlying the sample
2984 is greater than that of the normal distribution
2986 .. versionadded:: 1.7.0
2988 Returns
2989 -------
2990 statistic : array_like
2991 The computed z-score for this test.
2992 pvalue : array_like
2993 The p-value for the hypothesis test
2995 Notes
2996 -----
2997 For more details about `kurtosistest`, see `scipy.stats.kurtosistest`.
2999 """
3000 a, axis = _chk_asarray(a, axis)
3001 n = a.count(axis=axis)
3002 if np.min(n) < 5:
3003 raise ValueError(
3004 "kurtosistest requires at least 5 observations; %i observations"
3005 " were given." % np.min(n))
3006 if np.min(n) < 20:
3007 warnings.warn(
3008 "kurtosistest only valid for n>=20 ... continuing anyway, n=%i" %
3009 np.min(n))
3011 b2 = kurtosis(a, axis, fisher=False)
3012 E = 3.0*(n-1) / (n+1)
3013 varb2 = 24.0*n*(n-2.)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5))
3014 x = (b2-E)/ma.sqrt(varb2)
3015 sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) /
3016 (n*(n-2)*(n-3)))
3017 A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2)))
3018 term1 = 1 - 2./(9.0*A)
3019 denom = 1 + x*ma.sqrt(2/(A-4.0))
3020 if np.ma.isMaskedArray(denom):
3021 # For multi-dimensional array input
3022 denom[denom == 0.0] = masked
3023 elif denom == 0.0:
3024 denom = masked
3026 term2 = np.ma.where(denom > 0, ma.power((1-2.0/A)/denom, 1/3.0),
3027 -ma.power(-(1-2.0/A)/denom, 1/3.0))
3028 Z = (term1 - term2) / np.sqrt(2/(9.0*A))
3030 return KurtosistestResult(
3031 *scipy.stats._stats_py._normtest_finish(Z, alternative)
3032 )
3035NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue'))
3038def normaltest(a, axis=0):
3039 """
3040 Tests whether a sample differs from a normal distribution.
3042 Parameters
3043 ----------
3044 a : array_like
3045 The array containing the data to be tested.
3046 axis : int or None, optional
3047 Axis along which to compute test. Default is 0. If None,
3048 compute over the whole array `a`.
3050 Returns
3051 -------
3052 statistic : float or array
3053 ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and
3054 ``k`` is the z-score returned by `kurtosistest`.
3055 pvalue : float or array
3056 A 2-sided chi squared probability for the hypothesis test.
3058 Notes
3059 -----
3060 For more details about `normaltest`, see `scipy.stats.normaltest`.
3062 """
3063 a, axis = _chk_asarray(a, axis)
3064 s, _ = skewtest(a, axis)
3065 k, _ = kurtosistest(a, axis)
3066 k2 = s*s + k*k
3068 return NormaltestResult(k2, distributions.chi2.sf(k2, 2))
3071def mquantiles(a, prob=list([.25,.5,.75]), alphap=.4, betap=.4, axis=None,
3072 limit=()):
3073 """
3074 Computes empirical quantiles for a data array.
3076 Sample quantiles are defined by ``Q(p) = (1-gamma)*x[j] + gamma*x[j+1]``,
3077 where ``x[j]`` is the j-th order statistic, and gamma is a function of
3078 ``j = floor(n*p + m)``, ``m = alphap + p*(1 - alphap - betap)`` and
3079 ``g = n*p + m - j``.
3081 Reinterpreting the above equations to compare to **R** leads to the
3082 equation: ``p(k) = (k - alphap)/(n + 1 - alphap - betap)``
3084 Typical values of (alphap,betap) are:
3085 - (0,1) : ``p(k) = k/n`` : linear interpolation of cdf
3086 (**R** type 4)
3087 - (.5,.5) : ``p(k) = (k - 1/2.)/n`` : piecewise linear function
3088 (**R** type 5)
3089 - (0,0) : ``p(k) = k/(n+1)`` :
3090 (**R** type 6)
3091 - (1,1) : ``p(k) = (k-1)/(n-1)``: p(k) = mode[F(x[k])].
3092 (**R** type 7, **R** default)
3093 - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``: Then p(k) ~ median[F(x[k])].
3094 The resulting quantile estimates are approximately median-unbiased
3095 regardless of the distribution of x.
3096 (**R** type 8)
3097 - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``: Blom.
3098 The resulting quantile estimates are approximately unbiased
3099 if x is normally distributed
3100 (**R** type 9)
3101 - (.4,.4) : approximately quantile unbiased (Cunnane)
3102 - (.35,.35): APL, used with PWM
3104 Parameters
3105 ----------
3106 a : array_like
3107 Input data, as a sequence or array of dimension at most 2.
3108 prob : array_like, optional
3109 List of quantiles to compute.
3110 alphap : float, optional
3111 Plotting positions parameter, default is 0.4.
3112 betap : float, optional
3113 Plotting positions parameter, default is 0.4.
3114 axis : int, optional
3115 Axis along which to perform the trimming.
3116 If None (default), the input array is first flattened.
3117 limit : tuple, optional
3118 Tuple of (lower, upper) values.
3119 Values of `a` outside this open interval are ignored.
3121 Returns
3122 -------
3123 mquantiles : MaskedArray
3124 An array containing the calculated quantiles.
3126 Notes
3127 -----
3128 This formulation is very similar to **R** except the calculation of
3129 ``m`` from ``alphap`` and ``betap``, where in **R** ``m`` is defined
3130 with each type.
3132 References
3133 ----------
3134 .. [1] *R* statistical software: https://www.r-project.org/
3135 .. [2] *R* ``quantile`` function:
3136 http://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html
3138 Examples
3139 --------
3140 >>> import numpy as np
3141 >>> from scipy.stats.mstats import mquantiles
3142 >>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.])
3143 >>> mquantiles(a)
3144 array([ 19.2, 40. , 42.8])
3146 Using a 2D array, specifying axis and limit.
3148 >>> data = np.array([[ 6., 7., 1.],
3149 ... [ 47., 15., 2.],
3150 ... [ 49., 36., 3.],
3151 ... [ 15., 39., 4.],
3152 ... [ 42., 40., -999.],
3153 ... [ 41., 41., -999.],
3154 ... [ 7., -999., -999.],
3155 ... [ 39., -999., -999.],
3156 ... [ 43., -999., -999.],
3157 ... [ 40., -999., -999.],
3158 ... [ 36., -999., -999.]])
3159 >>> print(mquantiles(data, axis=0, limit=(0, 50)))
3160 [[19.2 14.6 1.45]
3161 [40. 37.5 2.5 ]
3162 [42.8 40.05 3.55]]
3164 >>> data[:, 2] = -999.
3165 >>> print(mquantiles(data, axis=0, limit=(0, 50)))
3166 [[19.200000000000003 14.6 --]
3167 [40.0 37.5 --]
3168 [42.800000000000004 40.05 --]]
3170 """
3171 def _quantiles1D(data,m,p):
3172 x = np.sort(data.compressed())
3173 n = len(x)
3174 if n == 0:
3175 return ma.array(np.empty(len(p), dtype=float), mask=True)
3176 elif n == 1:
3177 return ma.array(np.resize(x, p.shape), mask=nomask)
3178 aleph = (n*p + m)
3179 k = np.floor(aleph.clip(1, n-1)).astype(int)
3180 gamma = (aleph-k).clip(0,1)
3181 return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()]
3183 data = ma.array(a, copy=False)
3184 if data.ndim > 2:
3185 raise TypeError("Array should be 2D at most !")
3187 if limit:
3188 condition = (limit[0] < data) & (data < limit[1])
3189 data[~condition.filled(True)] = masked
3191 p = np.array(prob, copy=False, ndmin=1)
3192 m = alphap + p*(1.-alphap-betap)
3193 # Computes quantiles along axis (or globally)
3194 if (axis is None):
3195 return _quantiles1D(data, m, p)
3197 return ma.apply_along_axis(_quantiles1D, axis, data, m, p)
3200def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4):
3201 """Calculate the score at the given 'per' percentile of the
3202 sequence a. For example, the score at per=50 is the median.
3204 This function is a shortcut to mquantile
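Examples
--------
A small illustrative sketch: the score at the 50th percentile is the
median (here interpolated between the two middle values).
>>> from scipy.stats.mstats import scoreatpercentile
>>> print(scoreatpercentile([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 50))
5.5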
3206 """
3207 if (per < 0) or (per > 100.):
3208 raise ValueError("The percentile should be between 0. and 100. !"
3209 " (got %s)" % per)
3211 return mquantiles(data, prob=[per/100.], alphap=alphap, betap=betap,
3212 limit=limit, axis=0).squeeze()
3215def plotting_positions(data, alpha=0.4, beta=0.4):
3216 """
3217 Returns plotting positions (or empirical percentile points) for the data.
3219 Plotting positions are defined as ``(i-alpha)/(n+1-alpha-beta)``, where:
3220 - i is the rank of the order statistic
3221 - n is the number of unmasked values along the given axis
3222 - `alpha` and `beta` are two parameters.
3224 Typical values for `alpha` and `beta` are:
3225 - (0,1) : ``p(k) = k/n``, linear interpolation of cdf (R, type 4)
3226 - (.5,.5) : ``p(k) = (k-1/2.)/n``, piecewise linear function
3227 (R, type 5)
3228 - (0,0) : ``p(k) = k/(n+1)``, Weibull (R type 6)
3229 - (1,1) : ``p(k) = (k-1)/(n-1)``, in this case,
3230 ``p(k) = mode[F(x[k])]``. That's R default (R type 7)
3231 - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``, then
3232 ``p(k) ~ median[F(x[k])]``.
3233 The resulting quantile estimates are approximately median-unbiased
3234 regardless of the distribution of x. (R type 8)
3235 - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``, Blom.
3236 The resulting quantile estimates are approximately unbiased
3237 if x is normally distributed (R type 9)
3238 - (.4,.4) : approximately quantile unbiased (Cunnane)
3239 - (.35,.35): APL, used with PWM
3240 - (.3175, .3175): used in scipy.stats.probplot
3242 Parameters
3243 ----------
3244 data : array_like
3245 Input data, as a sequence or array of dimension at most 2.
3246 alpha : float, optional
3247 Plotting positions parameter. Default is 0.4.
3248 beta : float, optional
3249 Plotting positions parameter. Default is 0.4.
3251 Returns
3252 -------
3253 positions : MaskedArray
3254 The calculated plotting positions.
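Examples
--------
A small illustrative sketch with the default ``alpha = beta = 0.4``:
for n = 3 the positions are (i - 0.4)/3.2 for ranks i = 1, 2, 3,
returned in the order of the input values.
>>> import numpy as np
>>> from scipy.stats.mstats import plotting_positions
>>> print(plotting_positions(np.ma.array([3, 1, 2])).tolist())
[0.8125, 0.1875, 0.5]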
3256 """
3257 data = ma.array(data, copy=False).reshape(1,-1)
3258 n = data.count()
3259 plpos = np.empty(data.size, dtype=float)
3260 plpos[n:] = 0
3261 plpos[data.argsort(axis=None)[:n]] = ((np.arange(1, n+1) - alpha) /
3262 (n + 1.0 - alpha - beta))
3263 return ma.array(plpos, mask=data._mask)
3266meppf = plotting_positions
3269def obrientransform(*args):
3270 """
3271 Computes a transform on input data (any number of columns). Used to
3272 test for homogeneity of variance prior to running one-way stats. Each
3273 array in ``*args`` is one level of a factor. If `f_oneway()` run on
3274 the transformed data is found significant, the variances are unequal. From
3275 Maxwell and Delaney, p.112.
3277 Returns: transformed data for use in an ANOVA
3278 """
3279 data = argstoarray(*args).T
3280 v = data.var(axis=0,ddof=1)
3281 m = data.mean(0)
3282 n = data.count(0).astype(float)
3283 # result = ((n-1.5)*n*(a-m)**2 - 0.5*v*(n-1))/((n-1)*(n-2))
3284 data -= m
3285 data **= 2
3286 data *= (n-1.5)*n
3287 data -= 0.5*v*(n-1)
3288 data /= (n-1.)*(n-2.)
3289 if not ma.allclose(v,data.mean(0)):
3290 raise ValueError("Lack of convergence in obrientransform.")
3292 return data
3295def sem(a, axis=0, ddof=1):
3296 """
3297 Calculates the standard error of the mean of the input array.
3299 Also sometimes called standard error of measurement.
3301 Parameters
3302 ----------
3303 a : array_like
3304 An array containing the values for which the standard error is
3305 returned.
3306 axis : int or None, optional
3307 If axis is None, ravel `a` first. If axis is an integer, this will be
3308 the axis over which to operate. Defaults to 0.
3309 ddof : int, optional
3310 Delta degrees-of-freedom. How many degrees of freedom to adjust
3311 for bias in limited samples relative to the population estimate
3312 of variance. Defaults to 1.
3314 Returns
3315 -------
3316 s : ndarray or float
3317 The standard error of the mean in the sample(s), along the input axis.
3319 Notes
3320 -----
3321 The default value for `ddof` changed in scipy 0.15.0 to be consistent with
3322 `scipy.stats.sem` as well as with the most common definition used (like in
3323 the R documentation).
3325 Examples
3326 --------
3327 Find standard error along the first axis:
3329 >>> import numpy as np
3330 >>> from scipy import stats
3331 >>> a = np.arange(20).reshape(5,4)
3332 >>> print(stats.mstats.sem(a))
3333 [2.8284271247461903 2.8284271247461903 2.8284271247461903
3334 2.8284271247461903]
3336 Find standard error across the whole array, using n degrees of freedom:
3338 >>> print(stats.mstats.sem(a, axis=None, ddof=0))
3339 1.2893796958227628
3341 """
3342 a, axis = _chk_asarray(a, axis)
3343 n = a.count(axis=axis)
3344 s = a.std(axis=axis, ddof=ddof) / ma.sqrt(n)
3345 return s
3348F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue'))
3351def f_oneway(*args):
3352 """
3353 Performs a 1-way ANOVA, returning an F-value and probability given
3354 any number of groups. From Heiman, pp.394-7.
3356 Usage: ``f_oneway(*args)``, where ``*args`` is 2 or more arrays,
3357 one per treatment group.
3359 Returns
3360 -------
3361 statistic : float
3362 The computed F-value of the test.
3363 pvalue : float
3364 The associated p-value from the F-distribution.
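Examples
--------
A small illustrative sketch with two groups: following the sums-of-squares
computation above, ssbg = 1.5 and sswg = 4.0 over 1 and 4 degrees of
freedom give F = 1.5.
>>> from scipy.stats.mstats import f_oneway
>>> res = f_oneway([1, 2, 3], [2, 3, 4])
>>> print(round(res.statistic, 2))
1.5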
3366 """
3367 # Construct a single array of arguments: each row is a group
3368 data = argstoarray(*args)
3369 ngroups = len(data)
3370 ntot = data.count()
3371 sstot = (data**2).sum() - (data.sum())**2/float(ntot)
3372 ssbg = (data.count(-1) * (data.mean(-1)-data.mean())**2).sum()
3373 sswg = sstot-ssbg
3374 dfbg = ngroups-1
3375 dfwg = ntot - ngroups
3376 msb = ssbg/float(dfbg)
3377 msw = sswg/float(dfwg)
3378 f = msb/msw
3379 prob = special.fdtrc(dfbg, dfwg, f) # equivalent to stats.f.sf
3381 return F_onewayResult(f, prob)
3384FriedmanchisquareResult = namedtuple('FriedmanchisquareResult',
3385 ('statistic', 'pvalue'))
3388def friedmanchisquare(*args):
3389 """Friedman Chi-Square is a non-parametric, one-way within-subjects ANOVA.
3390 This function calculates the Friedman Chi-square test for repeated measures
3391 and returns the result, along with the associated probability value.
3393 Each input is considered a given group. Ideally, the number of treatments
3394 among each group should be equal. If this is not the case, only the first
3395 n treatments are taken into account, where n is the number of treatments
3396 of the smallest group.
3397 If a group has some missing values, the corresponding treatments are masked
3398 in the other groups.
3399 The test statistic is corrected for ties.
3401 Masked values in one group are propagated to the other groups.
3403 Returns
3404 -------
3405 statistic : float
3406 the test statistic.
3407 pvalue : float
3408 the associated p-value.
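Examples
--------
A small illustrative sketch: with three groups whose rank sums across
treatments are all equal, the statistic is 0 and the p-value is 1.
>>> from scipy.stats.mstats import friedmanchisquare
>>> res = friedmanchisquare([1, 2, 3], [2, 3, 1], [3, 1, 2])
>>> print(res.statistic, res.pvalue)
0.0 1.0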
3410 """
3411 data = argstoarray(*args).astype(float)
3412 k = len(data)
3413 if k < 3:
3414 raise ValueError("Less than 3 groups (%i): " % k +
3415 "the Friedman test is NOT appropriate.")
3417 ranked = ma.masked_values(rankdata(data, axis=0), 0)
3418 if ranked._mask is not nomask:
3419 ranked = ma.mask_cols(ranked)
3420 ranked = ranked.compressed().reshape(k,-1).view(ndarray)
3421 else:
3422 ranked = ranked._data
3423 (k,n) = ranked.shape
3424 # Ties correction
3425 repeats = [find_repeats(row) for row in ranked.T]
3426 ties = np.array([y for x, y in repeats if x.size > 0])
3427 tie_correction = 1 - (ties**3-ties).sum()/float(n*(k**3-k))
3429 ssbg = np.sum((ranked.sum(-1) - n*(k+1)/2.)**2)
3430 chisq = ssbg * 12./(n*k*(k+1)) * 1./tie_correction
3432 return FriedmanchisquareResult(chisq,
3433 distributions.chi2.sf(chisq, k-1))
3436BrunnerMunzelResult = namedtuple('BrunnerMunzelResult', ('statistic', 'pvalue'))
3439def brunnermunzel(x, y, alternative="two-sided", distribution="t"):
3440 """
3441 Computes the Brunner-Munzel test on samples x and y
3443 Missing values in `x` and/or `y` are discarded.
3445 Parameters
3446 ----------
3447 x, y : array_like
3448 Array of samples, should be one-dimensional.
3449 alternative : 'less', 'two-sided', or 'greater', optional
3450 Whether to get the p-value for the one-sided hypothesis ('less'
3451 or 'greater') or for the two-sided hypothesis ('two-sided').
3452 Default value is 'two-sided'.
3453 distribution : 't' or 'normal', optional
3454 Whether to get the p-value by t-distribution or by standard normal
3455 distribution.
3456 Default value is 't'.
3458 Returns
3459 -------
3460 statistic : float
3461 The Brunner-Munzel W statistic.
3462 pvalue : float
3463 p-value assuming a t distribution. One-sided or
3464 two-sided, depending on the choice of `alternative` and `distribution`.
3466 See Also
3467 --------
3468 mannwhitneyu : Mann-Whitney rank test on two samples.
3470 Notes
3471 -----
3472 For more details on `brunnermunzel`, see `scipy.stats.brunnermunzel`.
3474 """
3475 x = ma.asarray(x).compressed().view(ndarray)
3476 y = ma.asarray(y).compressed().view(ndarray)
3477 nx = len(x)
3478 ny = len(y)
3479 if nx == 0 or ny == 0:
3480 return BrunnerMunzelResult(np.nan, np.nan)
3481 rankc = rankdata(np.concatenate((x,y)))
3482 rankcx = rankc[0:nx]
3483 rankcy = rankc[nx:nx+ny]
3484 rankcx_mean = np.mean(rankcx)
3485 rankcy_mean = np.mean(rankcy)
3486 rankx = rankdata(x)
3487 ranky = rankdata(y)
3488 rankx_mean = np.mean(rankx)
3489 ranky_mean = np.mean(ranky)
3491 Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0))
3492 Sx /= nx - 1
3493 Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0))
3494 Sy /= ny - 1
3496 wbfn = nx * ny * (rankcy_mean - rankcx_mean)
3497 wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy)
3499 if distribution == "t":
3500 df_numer = np.power(nx * Sx + ny * Sy, 2.0)
3501 df_denom = np.power(nx * Sx, 2.0) / (nx - 1)
3502 df_denom += np.power(ny * Sy, 2.0) / (ny - 1)
3503 df = df_numer / df_denom
3504 p = distributions.t.cdf(wbfn, df)
3505 elif distribution == "normal":
3506 p = distributions.norm.cdf(wbfn)
3507 else:
3508 raise ValueError(
3509 "distribution should be 't' or 'normal'")
3511 if alternative == "greater":
3512 pass
3513 elif alternative == "less":
3514 p = 1 - p
3515 elif alternative == "two-sided":
3516 p = 2 * np.min([p, 1-p])
3517 else:
3518 raise ValueError(
3519 "alternative should be 'less', 'greater' or 'two-sided'")
3521 return BrunnerMunzelResult(wbfn, p)