Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_stats_py.py: 10%
2034 statements
1# Copyright 2002 Gary Strangman. All rights reserved
2# Copyright 2002-2016 The SciPy Developers
3#
4# The original code from Gary Strangman was heavily adapted for
5# use in SciPy by Travis Oliphant. The original code came with the
6# following disclaimer:
7#
8# This software is provided "as-is". There are no expressed or implied
9# warranties of any kind, including, but not limited to, the warranties
10# of merchantability and fitness for a given application. In no event
11# shall Gary Strangman be liable for any direct, indirect, incidental,
12# special, exemplary or consequential damages (including, but not limited
13# to, loss of use, data or profits, or business interruption) however
14# caused and on any theory of liability, whether in contract, strict
15# liability or tort (including negligence or otherwise) arising in any way
16# out of the use of this software, even if advised of the possibility of
17# such damage.
19"""
20A collection of basic statistical functions for Python.
22References
23----------
24.. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
25 Probability and Statistics Tables and Formulae. Chapman & Hall: New
26 York. 2000.
28"""
29import warnings
30import math
31from math import gcd
32from collections import namedtuple, Counter
34import numpy as np
35from numpy import array, asarray, ma
36from numpy.lib import NumpyVersion
37from numpy.testing import suppress_warnings
39from scipy.spatial.distance import cdist
40from scipy.ndimage import _measurements
41from scipy._lib._util import (check_random_state, MapWrapper,
42 rng_integers, _rename_parameter, _contains_nan)
44import scipy.special as special
45from scipy import linalg
46from . import distributions
47from . import _mstats_basic as mstats_basic
48from ._stats_mstats_common import (_find_repeats, linregress, theilslopes,
49 siegelslopes)
50from ._stats import (_kendall_dis, _toint64, _weightedrankedtau,
51 _local_correlations)
52from dataclasses import make_dataclass
53from ._hypotests import _all_partitions
54from ._stats_pythran import _compute_outer_prob_inside_method
55from ._resampling import _batch_generator
56from ._axis_nan_policy import (_axis_nan_policy_factory,
57 _broadcast_concatenate)
58from ._binomtest import _binary_search_for_binom_tst as _binary_search
59from scipy._lib._bunch import _make_tuple_bunch
60from scipy import stats
61from scipy.optimize import root_scalar
64# Functions/classes in other files should be added in `__init__.py`, not here
65__all__ = ['find_repeats', 'gmean', 'hmean', 'pmean', 'mode', 'tmean', 'tvar',
66 'tmin', 'tmax', 'tstd', 'tsem', 'moment',
67 'skew', 'kurtosis', 'describe', 'skewtest', 'kurtosistest',
68 'normaltest', 'jarque_bera',
69 'scoreatpercentile', 'percentileofscore',
70 'cumfreq', 'relfreq', 'obrientransform',
71 'sem', 'zmap', 'zscore', 'gzscore', 'iqr', 'gstd',
72 'median_abs_deviation',
73 'sigmaclip', 'trimboth', 'trim1', 'trim_mean',
74 'f_oneway', 'pearsonr', 'fisher_exact',
75 'spearmanr', 'pointbiserialr',
76 'kendalltau', 'weightedtau', 'multiscale_graphcorr',
77 'linregress', 'siegelslopes', 'theilslopes', 'ttest_1samp',
78 'ttest_ind', 'ttest_ind_from_stats', 'ttest_rel',
79 'kstest', 'ks_1samp', 'ks_2samp',
80 'chisquare', 'power_divergence',
81 'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare',
82 'rankdata',
83 'combine_pvalues', 'wasserstein_distance', 'energy_distance',
84 'brunnermunzel', 'alexandergovern',
85 'expectile', ]
88def _chk_asarray(a, axis):
89 if axis is None:
90 a = np.ravel(a)
91 outaxis = 0
92 else:
93 a = np.asarray(a)
94 outaxis = axis
96 if a.ndim == 0:
97 a = np.atleast_1d(a)
99 return a, outaxis
102def _chk2_asarray(a, b, axis):
103 if axis is None:
104 a = np.ravel(a)
105 b = np.ravel(b)
106 outaxis = 0
107 else:
108 a = np.asarray(a)
109 b = np.asarray(b)
110 outaxis = axis
112 if a.ndim == 0:
113 a = np.atleast_1d(a)
114 if b.ndim == 0:
115 b = np.atleast_1d(b)
117 return a, b, outaxis
120def _shape_with_dropped_axis(a, axis):
121 """
122 Given an array `a` and an integer `axis`, return the shape
123 of `a` with the `axis` dimension removed.
125 Examples
126 --------
127 >>> a = np.zeros((3, 5, 2))
128 >>> _shape_with_dropped_axis(a, 1)
129 (3, 2)
131 """
132 shp = list(a.shape)
133 try:
134 del shp[axis]
135 except IndexError:
136 raise np.AxisError(axis, a.ndim) from None
137 return tuple(shp)
140def _broadcast_shapes(shape1, shape2):
141 """
142 Given two shapes (i.e. tuples of integers), return the shape
143 that would result from broadcasting two arrays with the given
144 shapes.
146 Examples
147 --------
148 >>> _broadcast_shapes((2, 1), (4, 1, 3))
149 (4, 2, 3)
150 """
151 d = len(shape1) - len(shape2)
152 if d <= 0:
153 shp1 = (1,)*(-d) + shape1
154 shp2 = shape2
155 else:
156 shp1 = shape1
157 shp2 = (1,)*d + shape2
158 shape = []
159 for n1, n2 in zip(shp1, shp2):
160 if n1 == 1:
161 n = n2
162 elif n2 == 1 or n1 == n2:
163 n = n1
164 else:
165 raise ValueError(f'shapes {shape1} and {shape2} could not be '
166 'broadcast together')
167 shape.append(n)
168 return tuple(shape)
171def _broadcast_shapes_with_dropped_axis(a, b, axis):
172 """
173 Given two arrays `a` and `b` and an integer `axis`, find the
174 shape of the broadcast result after dropping `axis` from the
175 shapes of `a` and `b`.
177 Examples
178 --------
179 >>> a = np.zeros((5, 2, 1))
180 >>> b = np.zeros((1, 9, 3))
181 >>> _broadcast_shapes_with_dropped_axis(a, b, 1)
182 (5, 3)
183 """
184 shp1 = _shape_with_dropped_axis(a, axis)
185 shp2 = _shape_with_dropped_axis(b, axis)
186 try:
187 shp = _broadcast_shapes(shp1, shp2)
188 except ValueError:
189 raise ValueError(f'non-axis shapes {shp1} and {shp2} could not be '
190 'broadcast together') from None
191 return shp
194SignificanceResult = _make_tuple_bunch('SignificanceResult',
195 ['statistic', 'pvalue'], [])
198# note that `weights` are paired with `x`
199@_axis_nan_policy_factory(
200 lambda x: x, n_samples=1, n_outputs=1, too_small=0, paired=True,
201 result_to_tuple=lambda x: (x,), kwd_samples=['weights'])
202def gmean(a, axis=0, dtype=None, weights=None):
203 r"""Compute the weighted geometric mean along the specified axis.
205 The weighted geometric mean of the array :math:`a_i` associated to weights
206 :math:`w_i` is:
208 .. math::
210 \exp \left( \frac{ \sum_{i=1}^n w_i \ln a_i }{ \sum_{i=1}^n w_i }
211 \right) \, ,
213 and, with equal weights, it gives:
215 .. math::
217 \sqrt[n]{ \prod_{i=1}^n a_i } \, .
219 Parameters
220 ----------
221 a : array_like
222 Input array or object that can be converted to an array.
223 axis : int or None, optional
224 Axis along which the geometric mean is computed. Default is 0.
225 If None, compute over the whole array `a`.
226 dtype : dtype, optional
227 Type to which the input arrays are cast before the calculation is
228 performed.
229 weights : array_like, optional
230 The `weights` array must be broadcastable to the same shape as `a`.
231 Default is None, which gives each value a weight of 1.0.
233 Returns
234 -------
235 gmean : ndarray
236 See `dtype` parameter above.
238 See Also
239 --------
240 numpy.mean : Arithmetic average
241 numpy.average : Weighted average
242 hmean : Harmonic mean
244 References
245 ----------
246 .. [1] "Weighted Geometric Mean", *Wikipedia*,
247 https://en.wikipedia.org/wiki/Weighted_geometric_mean.
249 Examples
250 --------
251 >>> from scipy.stats import gmean
252 >>> gmean([1, 4])
253 2.0
254 >>> gmean([1, 2, 3, 4, 5, 6, 7])
255 3.3800151591412964
256 >>> gmean([1, 4, 7], weights=[3, 1, 3])
257 2.80668351922014
259 """
261 a = np.asarray(a, dtype=dtype)
263 if weights is not None:
264 weights = np.asarray(weights, dtype=dtype)
266 with np.errstate(divide='ignore'):
267 log_a = np.log(a)
269 return np.exp(np.average(log_a, axis=axis, weights=weights))
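# --- Editorial sketch (not part of the original module) ----------------------
# The docstring formula above is exactly what `gmean` computes: the exponential
# of the (weighted) average of logarithms. A minimal check assuming only NumPy
# and the `gmean` defined above; `_x` and `_w` are throwaway illustrative names.
_x, _w = np.array([1.0, 4.0, 7.0]), np.array([3.0, 1.0, 3.0])
assert np.isclose(gmean(_x, weights=_w),
                  np.exp(np.average(np.log(_x), weights=_w)))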
272@_axis_nan_policy_factory(
273 lambda x: x, n_samples=1, n_outputs=1, too_small=0, paired=True,
274 result_to_tuple=lambda x: (x,), kwd_samples=['weights'])
275def hmean(a, axis=0, dtype=None, *, weights=None):
276 r"""Calculate the weighted harmonic mean along the specified axis.
278 The weighted harmonic mean of the array :math:`a_i` associated to weights
279 :math:`w_i` is:
281 .. math::
283 \frac{ \sum_{i=1}^n w_i }{ \sum_{i=1}^n \frac{w_i}{a_i} } \, ,
285 and, with equal weights, it gives:
287 .. math::
289 \frac{ n }{ \sum_{i=1}^n \frac{1}{a_i} } \, .
291 Parameters
292 ----------
293 a : array_like
294 Input array, masked array or object that can be converted to an array.
295 axis : int or None, optional
296 Axis along which the harmonic mean is computed. Default is 0.
297 If None, compute over the whole array `a`.
298 dtype : dtype, optional
299 Type of the returned array and of the accumulator in which the
300 elements are summed. If `dtype` is not specified, it defaults to the
301 dtype of `a`, unless `a` has an integer `dtype` with a precision less
302 than that of the default platform integer. In that case, the default
303 platform integer is used.
304 weights : array_like, optional
305 The weights array can either be 1-D (in which case its length must be
306 the size of `a` along the given `axis`) or of the same shape as `a`.
307 Default is None, which gives each value a weight of 1.0.
309 .. versionadded:: 1.9
311 Returns
312 -------
313 hmean : ndarray
314 See `dtype` parameter above.
316 See Also
317 --------
318 numpy.mean : Arithmetic average
319 numpy.average : Weighted average
320 gmean : Geometric mean
322 Notes
323 -----
324 The harmonic mean is computed over a single dimension of the input
325 array, axis=0 by default, or all values in the array if axis=None.
326 float64 intermediate and return values are used for integer inputs.
328 References
329 ----------
330 .. [1] "Weighted Harmonic Mean", *Wikipedia*,
331 https://en.wikipedia.org/wiki/Harmonic_mean#Weighted_harmonic_mean
332 .. [2] Ferger, F., "The nature and use of the harmonic mean", Journal of
333 the American Statistical Association, vol. 26, pp. 36-40, 1931
335 Examples
336 --------
337 >>> from scipy.stats import hmean
338 >>> hmean([1, 4])
339 1.6000000000000001
340 >>> hmean([1, 2, 3, 4, 5, 6, 7])
341 2.6997245179063363
342 >>> hmean([1, 4, 7], weights=[3, 1, 3])
343 1.9029126213592233
345 """
346 if not isinstance(a, np.ndarray):
347 a = np.array(a, dtype=dtype)
348 elif dtype:
349 # Must change the default dtype allowing array type
350 if isinstance(a, np.ma.MaskedArray):
351 a = np.ma.asarray(a, dtype=dtype)
352 else:
353 a = np.asarray(a, dtype=dtype)
355 if np.all(a >= 0):
356 # Harmonic mean only defined if greater than or equal to zero.
357 if weights is not None:
358 weights = np.asanyarray(weights, dtype=dtype)
360 with np.errstate(divide='ignore'):
361 return 1.0 / np.average(1.0 / a, axis=axis, weights=weights)
362 else:
363 raise ValueError("Harmonic mean only defined if all elements greater "
364 "than or equal to zero")
367@_axis_nan_policy_factory(
368 lambda x: x, n_samples=1, n_outputs=1, too_small=0, paired=True,
369 result_to_tuple=lambda x: (x,), kwd_samples=['weights'])
370def pmean(a, p, *, axis=0, dtype=None, weights=None):
371 r"""Calculate the weighted power mean along the specified axis.
373 The weighted power mean of the array :math:`a_i` associated to weights
374 :math:`w_i` is:
376 .. math::
378 \left( \frac{ \sum_{i=1}^n w_i a_i^p }{ \sum_{i=1}^n w_i }
379 \right)^{ 1 / p } \, ,
381 and, with equal weights, it gives:
383 .. math::
385 \left( \frac{ 1 }{ n } \sum_{i=1}^n a_i^p \right)^{ 1 / p } \, .
387 This mean is also called generalized mean or Hölder mean, and must not be
388 confused with the Kolmogorov generalized mean, also called
389 quasi-arithmetic mean or generalized f-mean [3]_.
391 Parameters
392 ----------
393 a : array_like
394 Input array, masked array or object that can be converted to an array.
395 p : int or float
396 Exponent.
397 axis : int or None, optional
398 Axis along which the power mean is computed. Default is 0.
399 If None, compute over the whole array `a`.
400 dtype : dtype, optional
401 Type of the returned array and of the accumulator in which the
402 elements are summed. If `dtype` is not specified, it defaults to the
403 dtype of `a`, unless `a` has an integer `dtype` with a precision less
404 than that of the default platform integer. In that case, the default
405 platform integer is used.
406 weights : array_like, optional
407 The weights array can either be 1-D (in which case its length must be
408 the size of `a` along the given `axis`) or of the same shape as `a`.
409 Default is None, which gives each value a weight of 1.0.
411 Returns
412 -------
413 pmean : ndarray, see `dtype` parameter above.
414 Output array containing the power mean values.
416 See Also
417 --------
418 numpy.average : Weighted average
419 gmean : Geometric mean
420 hmean : Harmonic mean
422 Notes
423 -----
424 The power mean is computed over a single dimension of the input
425 array, ``axis=0`` by default, or all values in the array if ``axis=None``.
426 float64 intermediate and return values are used for integer inputs.
428 .. versionadded:: 1.9
430 References
431 ----------
432 .. [1] "Generalized Mean", *Wikipedia*,
433 https://en.wikipedia.org/wiki/Generalized_mean
434 .. [2] Norris, N., "Convexity properties of generalized mean value
435 functions", The Annals of Mathematical Statistics, vol. 8,
436 pp. 118-120, 1937
437 .. [3] Bullen, P.S., Handbook of Means and Their Inequalities, 2003
439 Examples
440 --------
441 >>> from scipy.stats import pmean, hmean, gmean
442 >>> pmean([1, 4], 1.3)
443 2.639372938300652
444 >>> pmean([1, 2, 3, 4, 5, 6, 7], 1.3)
445 4.157111214492084
446 >>> pmean([1, 4, 7], -2, weights=[3, 1, 3])
447 1.4969684896631954
449 For p=-1, power mean is equal to harmonic mean:
451 >>> pmean([1, 4, 7], -1, weights=[3, 1, 3])
452 1.9029126213592233
453 >>> hmean([1, 4, 7], weights=[3, 1, 3])
454 1.9029126213592233
456 For p=0, power mean is defined as the geometric mean:
458 >>> pmean([1, 4, 7], 0, weights=[3, 1, 3])
459 2.80668351922014
460 >>> gmean([1, 4, 7], weights=[3, 1, 3])
461 2.80668351922014
463 """
464 if not isinstance(p, (int, float)):
465 raise ValueError("Power mean only defined for exponent of type int or "
466 "float.")
467 if p == 0:
468 return gmean(a, axis=axis, dtype=dtype, weights=weights)
470 if not isinstance(a, np.ndarray):
471 a = np.array(a, dtype=dtype)
472 elif dtype:
473 # Must change the default dtype allowing array type
474 if isinstance(a, np.ma.MaskedArray):
475 a = np.ma.asarray(a, dtype=dtype)
476 else:
477 a = np.asarray(a, dtype=dtype)
479 if np.all(a >= 0):
480 # Power mean only defined if greater than or equal to zero
481 if weights is not None:
482 weights = np.asanyarray(weights, dtype=dtype)
484 with np.errstate(divide='ignore'):
485 return np.float_power(
486 np.average(np.float_power(a, p), axis=axis, weights=weights),
487 1/p)
488 else:
489 raise ValueError("Power mean only defined if all elements greater "
490 "than or equal to zero")
493ModeResult = namedtuple('ModeResult', ('mode', 'count'))
496def mode(a, axis=0, nan_policy='propagate', keepdims=None):
497 r"""Return an array of the modal (most common) value in the passed array.
499 If there is more than one such value, only one is returned.
500 The bin-count for the modal bins is also returned.
502 Parameters
503 ----------
504 a : array_like
505 n-dimensional array of which to find mode(s).
506 axis : int or None, optional
507 Axis along which to operate. Default is 0. If None, compute over
508 the whole array `a`.
509 nan_policy : {'propagate', 'raise', 'omit'}, optional
510 Defines how to handle when input contains nan.
511 The following options are available (default is 'propagate'):
513 * 'propagate': treats nan as it would treat any other value
514 * 'raise': throws an error
515 * 'omit': performs the calculations ignoring nan values
516 keepdims : bool, optional
517 If set to ``False``, the `axis` over which the statistic is taken
518 is consumed (eliminated from the output array) like other reduction
519 functions (e.g. `skew`, `kurtosis`). If set to ``True``, the `axis` is
520 retained with size one, and the result will broadcast correctly
521 against the input array. The default, ``None``, is undefined legacy
522 behavior retained for backward compatibility.
524 .. warning::
525 Unlike other reduction functions (e.g. `skew`, `kurtosis`), the
526 default behavior of `mode` usually retains the axis it acts
527 along. In SciPy 1.11.0, this behavior will change: the default
528 value of `keepdims` will become ``False``, the `axis` over which
529 the statistic is taken will be eliminated, and the value ``None``
530 will no longer be accepted.
531 .. versionadded:: 1.9.0
533 Returns
534 -------
535 mode : ndarray
536 Array of modal values.
537 count : ndarray
538 Array of counts for each mode.
540 Notes
541 -----
542 The mode of object arrays is calculated using `collections.Counter`, which
543 treats NaNs with different binary representations as distinct.
545 .. deprecated:: 1.9.0
546 Support for non-numeric arrays has been deprecated as of SciPy 1.9.0
547 and will be removed in 1.11.0. `pandas.DataFrame.mode`_ can
548 be used instead.
550 .. _pandas.DataFrame.mode: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.mode.html
552 The mode of arrays with other dtypes is calculated using `numpy.unique`.
553 In NumPy versions 1.21 and after, all NaNs - even those with different
554 binary representations - are treated as equivalent and counted as separate
555 instances of the same value.
557 Examples
558 --------
559 >>> import numpy as np
560 >>> a = np.array([[3, 0, 3, 7],
561 ... [3, 2, 6, 2],
562 ... [1, 7, 2, 8],
563 ... [3, 0, 6, 1],
564 ... [3, 2, 5, 5]])
565 >>> from scipy import stats
566 >>> stats.mode(a, keepdims=True)
567 ModeResult(mode=array([[3, 0, 6, 1]]), count=array([[4, 2, 2, 1]]))
569 To get mode of whole array, specify ``axis=None``:
571 >>> stats.mode(a, axis=None, keepdims=True)
572 ModeResult(mode=[3], count=[5])
573 >>> stats.mode(a, axis=None, keepdims=False)
574 ModeResult(mode=3, count=5)
576 """ # noqa: E501
578 if keepdims is None:
579 message = ("Unlike other reduction functions (e.g. `skew`, "
580 "`kurtosis`), the default behavior of `mode` typically "
581 "preserves the axis it acts along. In SciPy 1.11.0, "
582 "this behavior will change: the default value of "
583 "`keepdims` will become False, the `axis` over which "
584 "the statistic is taken will be eliminated, and the value "
585 "None will no longer be accepted. "
586 "Set `keepdims` to True or False to avoid this warning.")
587 warnings.warn(message, FutureWarning, stacklevel=2)
589 a = np.asarray(a)
590 if a.size == 0:
591 if keepdims is None:
592 return ModeResult(np.array([]), np.array([]))
593 else:
594 # this is tricky to get right; let np.mean do it
595 out = np.mean(a, axis=axis, keepdims=keepdims)
596 return ModeResult(out, out.copy())
598 a, axis = _chk_asarray(a, axis)
600 contains_nan, nan_policy = _contains_nan(a, nan_policy)
602 if contains_nan and nan_policy == 'omit':
603 a = ma.masked_invalid(a)
604 return mstats_basic._mode(a, axis, keepdims=keepdims)
606 if not np.issubdtype(a.dtype, np.number):
607 warnings.warn("Support for non-numeric arrays has been deprecated "
608 "as of SciPy 1.9.0 and will be removed in "
609 "1.11.0. `pandas.DataFrame.mode` can be used instead, "
610 "see https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.mode.html.", # noqa: E501
611 DeprecationWarning, stacklevel=2)
613 if a.dtype == object:
614 def _mode1D(a):
615 cntr = Counter(a)
616 mode = max(cntr, key=lambda x: cntr[x])
617 return mode, cntr[mode]
618 else:
619 def _mode1D(a):
620 vals, cnts = np.unique(a, return_counts=True)
621 return vals[cnts.argmax()], cnts.max()
623 # np.apply_along_axis will convert the _mode1D tuples to a numpy array,
624 # casting types in the process.
625 # This recreates the results without that issue
626 # View of a, rotated so the requested axis is last
627 a_view = np.moveaxis(a, axis, -1)
629 inds = np.ndindex(a_view.shape[:-1])
630 modes = np.empty(a_view.shape[:-1], dtype=a.dtype)
631 counts = np.empty(a_view.shape[:-1], dtype=np.int_)
632 for ind in inds:
633 modes[ind], counts[ind] = _mode1D(a_view[ind])
635 if keepdims is None or keepdims:
636 newshape = list(a.shape)
637 newshape[axis] = 1
638 return ModeResult(modes.reshape(newshape), counts.reshape(newshape))
639 else:
640 return ModeResult(modes[()], counts[()])
643def _mask_to_limits(a, limits, inclusive):
644 """Mask an array for values outside of given limits.
646 This is primarily a utility function.
648 Parameters
649 ----------
650 a : array
651 limits : (float or None, float or None)
652 A tuple consisting of the (lower limit, upper limit). Values in the
653 input array less than the lower limit or greater than the upper limit
654 will be masked out. None implies no limit.
655 inclusive : (bool, bool)
656 A tuple consisting of the (lower flag, upper flag). These flags
657 determine whether values exactly equal to lower or upper are allowed.
659 Returns
660 -------
661 A MaskedArray.
663 Raises
664 ------
665 A ValueError if there are no values within the given limits.
667 """
668 lower_limit, upper_limit = limits
669 lower_include, upper_include = inclusive
670 am = ma.MaskedArray(a)
671 if lower_limit is not None:
672 if lower_include:
673 am = ma.masked_less(am, lower_limit)
674 else:
675 am = ma.masked_less_equal(am, lower_limit)
677 if upper_limit is not None:
678 if upper_include:
679 am = ma.masked_greater(am, upper_limit)
680 else:
681 am = ma.masked_greater_equal(am, upper_limit)
683 if am.count() == 0:
684 raise ValueError("No array values within given limits")
686 return am
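# --- Editorial sketch (not part of the original module) ----------------------
# How `_mask_to_limits` interprets `limits` and `inclusive`: with limits
# (2, 7) and inclusive (True, False), values below 2 and values >= 7 are
# masked out, so only 2..6 survive. `_demo` is a throwaway name.
_demo = _mask_to_limits(np.arange(10), (2, 7), (True, False))
assert _demo.compressed().tolist() == [2, 3, 4, 5, 6]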
689def tmean(a, limits=None, inclusive=(True, True), axis=None):
690 """Compute the trimmed mean.
692 This function finds the arithmetic mean of given values, ignoring values
693 outside the given `limits`.
695 Parameters
696 ----------
697 a : array_like
698 Array of values.
699 limits : None or (lower limit, upper limit), optional
700 Values in the input array less than the lower limit or greater than the
701 upper limit will be ignored. When limits is None (default), then all
702 values are used. Either of the limit values in the tuple can also be
703 None representing a half-open interval.
704 inclusive : (bool, bool), optional
705 A tuple consisting of the (lower flag, upper flag). These flags
706 determine whether values exactly equal to the lower or upper limits
707 are included. The default value is (True, True).
708 axis : int or None, optional
709 Axis along which to operate. Default is None.
711 Returns
712 -------
713 tmean : ndarray
714 Trimmed mean.
716 See Also
717 --------
718 trim_mean : Returns mean after trimming a proportion from both tails.
720 Examples
721 --------
722 >>> import numpy as np
723 >>> from scipy import stats
724 >>> x = np.arange(20)
725 >>> stats.tmean(x)
726 9.5
727 >>> stats.tmean(x, (3,17))
728 10.0
730 """
731 a = asarray(a)
732 if limits is None:
733 return np.mean(a, axis)
734 am = _mask_to_limits(a, limits, inclusive)
735 mean = np.ma.filled(am.mean(axis=axis), fill_value=np.nan)
736 return mean if mean.ndim > 0 else mean.item()
739def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
740 """Compute the trimmed variance.
742 This function computes the sample variance of an array of values,
743 while ignoring values which are outside of given `limits`.
745 Parameters
746 ----------
747 a : array_like
748 Array of values.
749 limits : None or (lower limit, upper limit), optional
750 Values in the input array less than the lower limit or greater than the
751 upper limit will be ignored. When limits is None, then all values are
752 used. Either of the limit values in the tuple can also be None
753 representing a half-open interval. The default value is None.
754 inclusive : (bool, bool), optional
755 A tuple consisting of the (lower flag, upper flag). These flags
756 determine whether values exactly equal to the lower or upper limits
757 are included. The default value is (True, True).
758 axis : int or None, optional
759 Axis along which to operate. Default is 0. If None, compute over the
760 whole array `a`.
761 ddof : int, optional
762 Delta degrees of freedom. Default is 1.
764 Returns
765 -------
766 tvar : float
767 Trimmed variance.
769 Notes
770 -----
771 `tvar` computes the unbiased sample variance, i.e. it uses a correction
772 factor ``n / (n - 1)``.
774 Examples
775 --------
776 >>> import numpy as np
777 >>> from scipy import stats
778 >>> x = np.arange(20)
779 >>> stats.tvar(x)
780 35.0
781 >>> stats.tvar(x, (3,17))
782 20.0
784 """
785 a = asarray(a)
786 a = a.astype(float)
787 if limits is None:
788 return a.var(ddof=ddof, axis=axis)
789 am = _mask_to_limits(a, limits, inclusive)
790 amnan = am.filled(fill_value=np.nan)
791 return np.nanvar(amnan, ddof=ddof, axis=axis)
794def tmin(a, lowerlimit=None, axis=0, inclusive=True, nan_policy='propagate'):
795 """Compute the trimmed minimum.
797 This function finds the minimum value of an array `a` along the
798 specified axis, but only considering values greater than a specified
799 lower limit.
801 Parameters
802 ----------
803 a : array_like
804 Array of values.
805 lowerlimit : None or float, optional
806 Values in the input array less than the given limit will be ignored.
807 When lowerlimit is None, then all values are used. The default value
808 is None.
809 axis : int or None, optional
810 Axis along which to operate. Default is 0. If None, compute over the
811 whole array `a`.
812 inclusive : {True, False}, optional
813 This flag determines whether values exactly equal to the lower limit
814 are included. The default value is True.
815 nan_policy : {'propagate', 'raise', 'omit'}, optional
816 Defines how to handle when input contains nan.
817 The following options are available (default is 'propagate'):
819 * 'propagate': returns nan
820 * 'raise': throws an error
821 * 'omit': performs the calculations ignoring nan values
823 Returns
824 -------
825 tmin : float, int or ndarray
826 Trimmed minimum.
828 Examples
829 --------
830 >>> import numpy as np
831 >>> from scipy import stats
832 >>> x = np.arange(20)
833 >>> stats.tmin(x)
834 0
836 >>> stats.tmin(x, 13)
837 13
839 >>> stats.tmin(x, 13, inclusive=False)
840 14
842 """
843 a, axis = _chk_asarray(a, axis)
844 am = _mask_to_limits(a, (lowerlimit, None), (inclusive, False))
846 contains_nan, nan_policy = _contains_nan(am, nan_policy)
848 if contains_nan and nan_policy == 'omit':
849 am = ma.masked_invalid(am)
851 res = ma.minimum.reduce(am, axis).data
852 if res.ndim == 0:
853 return res[()]
854 return res
857def tmax(a, upperlimit=None, axis=0, inclusive=True, nan_policy='propagate'):
858 """Compute the trimmed maximum.
860 This function computes the maximum value of an array along a given axis,
861 while ignoring values larger than a specified upper limit.
863 Parameters
864 ----------
865 a : array_like
866 Array of values.
867 upperlimit : None or float, optional
868 Values in the input array greater than the given limit will be ignored.
869 When upperlimit is None, then all values are used. The default value
870 is None.
871 axis : int or None, optional
872 Axis along which to operate. Default is 0. If None, compute over the
873 whole array `a`.
874 inclusive : {True, False}, optional
875 This flag determines whether values exactly equal to the upper limit
876 are included. The default value is True.
877 nan_policy : {'propagate', 'raise', 'omit'}, optional
878 Defines how to handle when input contains nan.
879 The following options are available (default is 'propagate'):
881 * 'propagate': returns nan
882 * 'raise': throws an error
883 * 'omit': performs the calculations ignoring nan values
885 Returns
886 -------
887 tmax : float, int or ndarray
888 Trimmed maximum.
890 Examples
891 --------
892 >>> import numpy as np
893 >>> from scipy import stats
894 >>> x = np.arange(20)
895 >>> stats.tmax(x)
896 19
898 >>> stats.tmax(x, 13)
899 13
901 >>> stats.tmax(x, 13, inclusive=False)
902 12
904 """
905 a, axis = _chk_asarray(a, axis)
906 am = _mask_to_limits(a, (None, upperlimit), (False, inclusive))
908 contains_nan, nan_policy = _contains_nan(am, nan_policy)
910 if contains_nan and nan_policy == 'omit':
911 am = ma.masked_invalid(am)
913 res = ma.maximum.reduce(am, axis).data
914 if res.ndim == 0:
915 return res[()]
916 return res
919def tstd(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
920 """Compute the trimmed sample standard deviation.
922 This function finds the sample standard deviation of given values,
923 ignoring values outside the given `limits`.
925 Parameters
926 ----------
927 a : array_like
928 Array of values.
929 limits : None or (lower limit, upper limit), optional
930 Values in the input array less than the lower limit or greater than the
931 upper limit will be ignored. When limits is None, then all values are
932 used. Either of the limit values in the tuple can also be None
933 representing a half-open interval. The default value is None.
934 inclusive : (bool, bool), optional
935 A tuple consisting of the (lower flag, upper flag). These flags
936 determine whether values exactly equal to the lower or upper limits
937 are included. The default value is (True, True).
938 axis : int or None, optional
939 Axis along which to operate. Default is 0. If None, compute over the
940 whole array `a`.
941 ddof : int, optional
942 Delta degrees of freedom. Default is 1.
944 Returns
945 -------
946 tstd : float
947 Trimmed sample standard deviation.
949 Notes
950 -----
951 `tstd` computes the unbiased sample standard deviation, i.e. it uses a
952 correction factor ``n / (n - 1)``.
954 Examples
955 --------
956 >>> import numpy as np
957 >>> from scipy import stats
958 >>> x = np.arange(20)
959 >>> stats.tstd(x)
960 5.9160797830996161
961 >>> stats.tstd(x, (3,17))
962 4.4721359549995796
964 """
965 return np.sqrt(tvar(a, limits, inclusive, axis, ddof))
968def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
969 """Compute the trimmed standard error of the mean.
971 This function finds the standard error of the mean for given
972 values, ignoring values outside the given `limits`.
974 Parameters
975 ----------
976 a : array_like
977 Array of values.
978 limits : None or (lower limit, upper limit), optional
979 Values in the input array less than the lower limit or greater than the
980 upper limit will be ignored. When limits is None, then all values are
981 used. Either of the limit values in the tuple can also be None
982 representing a half-open interval. The default value is None.
983 inclusive : (bool, bool), optional
984 A tuple consisting of the (lower flag, upper flag). These flags
985 determine whether values exactly equal to the lower or upper limits
986 are included. The default value is (True, True).
987 axis : int or None, optional
988 Axis along which to operate. Default is 0. If None, compute over the
989 whole array `a`.
990 ddof : int, optional
991 Delta degrees of freedom. Default is 1.
993 Returns
994 -------
995 tsem : float
996 Trimmed standard error of the mean.
998 Notes
999 -----
1000 `tsem` uses unbiased sample standard deviation, i.e. it uses a
1001 correction factor ``n / (n - 1)``.
1003 Examples
1004 --------
1005 >>> import numpy as np
1006 >>> from scipy import stats
1007 >>> x = np.arange(20)
1008 >>> stats.tsem(x)
1009 1.3228756555322954
1010 >>> stats.tsem(x, (3,17))
1011 1.1547005383792515
1013 """
1014 a = np.asarray(a).ravel()
1015 if limits is None:
1016 return a.std(ddof=ddof) / np.sqrt(a.size)
1018 am = _mask_to_limits(a, limits, inclusive)
1019 sd = np.sqrt(np.ma.var(am, ddof=ddof, axis=axis))
1020 return sd / np.sqrt(am.count())
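# --- Editorial sketch (not part of the original module) ----------------------
# With no trimming, the trimmed statistics reduce to the plain ones:
# `tsem` is `tstd` divided by sqrt(n). `_x` is a throwaway name.
_x = np.arange(20.0)
assert np.isclose(tsem(_x), tstd(_x) / np.sqrt(_x.size))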
1023#####################################
1024# MOMENTS #
1025#####################################
1028def _moment_outputs(kwds):
1029 moment = np.atleast_1d(kwds.get('moment', 1))
1030 if moment.size == 0:
1031 raise ValueError("'moment' must be a scalar or a non-empty 1D "
1032 "list/array.")
1033 return len(moment)
1036def _moment_result_object(*args):
1037 if len(args) == 1:
1038 return args[0]
1039 return np.asarray(args)
1041# `moment` fits into the `_axis_nan_policy` pattern, but it is a bit unusual
1042# because the number of outputs is variable. Specifically,
1043# `result_to_tuple=lambda x: (x,)` may be surprising for a function that
1044# can produce more than one output, but it is intended here.
1045# When `moment` is called to produce the output:
1046# - `result_to_tuple` packs the returned array into a single-element tuple,
1047# - `_moment_result_object` extracts and returns that single element.
1048# However, when the input array is empty, `moment` is never called. Instead,
1049# - `_check_empty_inputs` is used to produce an empty array with the
1050# appropriate dimensions.
1051# - A list comprehension creates the appropriate number of copies of this
1052# array, depending on `n_outputs`.
1053# - This list - which may have multiple elements - is passed into
1054# `_moment_result_object`.
1055# - If there is a single output, `_moment_result_object` extracts and returns
1056# the single output from the list.
1057# - If there are multiple outputs, and therefore multiple elements in the list,
1058# `_moment_result_object` converts the list of arrays to a single array and
1059# returns it.
1060# Currently this leads to a slight inconsistency: when the input array is
1061# empty, there is no distinction between the `moment` function being called
1062# with parameter `moment=1` and `moment=[1]`; the latter *should* produce
1063# the same as the former but with a singleton zeroth dimension.
1064@_axis_nan_policy_factory( # noqa: E302
1065 _moment_result_object, n_samples=1, result_to_tuple=lambda x: (x,),
1066 n_outputs=_moment_outputs
1067)
1068def moment(a, moment=1, axis=0, nan_policy='propagate'):
1069 r"""Calculate the nth moment about the mean for a sample.
1071 A moment is a specific quantitative measure of the shape of a set of
1072 points. It is often used to calculate coefficients of skewness and kurtosis
1073 due to its close relationship with them.
1075 Parameters
1076 ----------
1077 a : array_like
1078 Input array.
1079 moment : int or array_like of ints, optional
1080 Order of central moment that is returned. Default is 1.
1081 axis : int or None, optional
1082 Axis along which the central moment is computed. Default is 0.
1083 If None, compute over the whole array `a`.
1084 nan_policy : {'propagate', 'raise', 'omit'}, optional
1085 Defines how to handle when input contains nan.
1086 The following options are available (default is 'propagate'):
1088 * 'propagate': returns nan
1089 * 'raise': throws an error
1090 * 'omit': performs the calculations ignoring nan values
1092 Returns
1093 -------
1094 n-th central moment : ndarray or float
1095 The appropriate moment along the given axis or over all values if axis
1096 is None. The denominator for the moment calculation is the number of
1097 observations; no degrees of freedom correction is done.
1099 See Also
1100 --------
1101 kurtosis, skew, describe
1103 Notes
1104 -----
1105 The k-th central moment of a data sample is:
1107 .. math::
1109 m_k = \frac{1}{n} \sum_{i = 1}^n (x_i - \bar{x})^k
1111 Where n is the number of samples and x-bar is the mean. This function uses
1112 exponentiation by squares [1]_ for efficiency.
1114 Note that, if `a` is an empty array (``a.size == 0``), array `moment` with
1115 one element (`moment.size == 1`) is treated the same as scalar `moment`
1116 (``np.isscalar(moment)``). This might produce arrays of unexpected shape.
1118 References
1119 ----------
1120 .. [1] https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms
1122 Examples
1123 --------
1124 >>> from scipy.stats import moment
1125 >>> moment([1, 2, 3, 4, 5], moment=1)
1126 0.0
1127 >>> moment([1, 2, 3, 4, 5], moment=2)
1128 2.0
1130 """
1131 a, axis = _chk_asarray(a, axis)
1133 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1135 if contains_nan and nan_policy == 'omit':
1136 a = ma.masked_invalid(a)
1137 return mstats_basic.moment(a, moment, axis)
1139 # for array_like moment input, return a value for each.
1140 if not np.isscalar(moment):
1141 mean = a.mean(axis, keepdims=True)
1142 mmnt = [_moment(a, i, axis, mean=mean) for i in moment]
1143 return np.array(mmnt)
1144 else:
1145 return _moment(a, moment, axis)
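# --- Editorial sketch (not part of the original module) ----------------------
# The variable-output packing described in the comment block above `moment`:
# a scalar `moment` order yields a scalar, an array_like of orders yields one
# value per order (here the first and second central moments of [1..5]).
assert np.isclose(moment([1, 2, 3, 4, 5], moment=2), 2.0)
assert np.allclose(moment([1, 2, 3, 4, 5], moment=[1, 2]), [0.0, 2.0])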
1148# Moment with optional pre-computed mean, equal to a.mean(axis, keepdims=True)
1149def _moment(a, moment, axis, *, mean=None):
1150 if np.abs(moment - np.round(moment)) > 0:
1151 raise ValueError("All moment parameters must be integers")
1153 # moment of empty array is the same regardless of order
1154 if a.size == 0:
1155 return np.mean(a, axis=axis)
1157 if moment == 0 or moment == 1:
1158 # By definition the zeroth moment about the mean is 1, and the first
1159 # moment is 0.
1160 shape = list(a.shape)
1161 del shape[axis]
1162 dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
1164 if len(shape) == 0:
1165 return dtype(1.0 if moment == 0 else 0.0)
1166 else:
1167 return (np.ones(shape, dtype=dtype) if moment == 0
1168 else np.zeros(shape, dtype=dtype))
1169 else:
1170 # Exponentiation by squares: form exponent sequence
1171 n_list = [moment]
1172 current_n = moment
1173 while current_n > 2:
1174 if current_n % 2:
1175 current_n = (current_n - 1) / 2
1176 else:
1177 current_n /= 2
1178 n_list.append(current_n)
1180 # Starting point for exponentiation by squares
1181 mean = a.mean(axis, keepdims=True) if mean is None else mean
1182 a_zero_mean = a - mean
1184 eps = np.finfo(a_zero_mean.dtype).resolution * 10
1185 with np.errstate(divide='ignore', invalid='ignore'):
1186 rel_diff = np.max(np.abs(a_zero_mean), axis=axis,
1187 keepdims=True) / np.abs(mean)
1188 with np.errstate(invalid='ignore'):
1189 precision_loss = np.any(rel_diff < eps)
1190 if precision_loss:
1191 message = ("Precision loss occurred in moment calculation due to "
1192 "catastrophic cancellation. This occurs when the data "
1193 "are nearly identical. Results may be unreliable.")
1194 warnings.warn(message, RuntimeWarning, stacklevel=4)
1196 if n_list[-1] == 1:
1197 s = a_zero_mean.copy()
1198 else:
1199 s = a_zero_mean**2
1201 # Perform multiplications
1202 for n in n_list[-2::-1]:
1203 s = s**2
1204 if n % 2:
1205 s *= a_zero_mean
1206 return np.mean(s, axis)
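# --- Editorial sketch (not part of the original module) ----------------------
# Exponentiation by squares in `_moment` is only an efficient way of raising
# the centered data to the requested power; the result matches the direct
# definition of a central moment. `_x` is a throwaway name.
_x = np.array([2.0, 8.0, 0.0, 4.0, 1.0, 9.0, 9.0, 0.0])
assert np.isclose(_moment(_x, 3, 0), np.mean((_x - _x.mean())**3))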
1209def _var(x, axis=0, ddof=0, mean=None):
1210 # Calculate variance of sample, warning if precision is lost
1211 var = _moment(x, 2, axis, mean=mean)
1212 if ddof != 0:
1213 n = x.shape[axis] if axis is not None else x.size
1214 var *= np.divide(n, n-ddof) # to avoid error on division by zero
1215 return var
1218@_axis_nan_policy_factory(
1219 lambda x: x, result_to_tuple=lambda x: (x,), n_outputs=1
1220)
1221def skew(a, axis=0, bias=True, nan_policy='propagate'):
1222 r"""Compute the sample skewness of a data set.
1224 For normally distributed data, the skewness should be about zero. For
1225 unimodal continuous distributions, a skewness value greater than zero means
1226 that there is more weight in the right tail of the distribution. The
1227 function `skewtest` can be used to determine if the skewness value
1228 is close enough to zero, statistically speaking.
1230 Parameters
1231 ----------
1232 a : ndarray
1233 Input array.
1234 axis : int or None, optional
1235 Axis along which skewness is calculated. Default is 0.
1236 If None, compute over the whole array `a`.
1237 bias : bool, optional
1238 If False, then the calculations are corrected for statistical bias.
1239 nan_policy : {'propagate', 'raise', 'omit'}, optional
1240 Defines how to handle when input contains nan.
1241 The following options are available (default is 'propagate'):
1243 * 'propagate': returns nan
1244 * 'raise': throws an error
1245 * 'omit': performs the calculations ignoring nan values
1247 Returns
1248 -------
1249 skewness : ndarray
1250 The skewness of values along an axis, returning NaN where all values
1251 are equal.
1253 Notes
1254 -----
1255 The sample skewness is computed as the Fisher-Pearson coefficient
1256 of skewness, i.e.
1258 .. math::
1260 g_1=\frac{m_3}{m_2^{3/2}}
1262 where
1264 .. math::
1266 m_i=\frac{1}{N}\sum_{n=1}^N(x[n]-\bar{x})^i
1268 is the biased sample :math:`i\texttt{th}` central moment, and
1269 :math:`\bar{x}` is
1270 the sample mean. If ``bias`` is False, the calculations are
1271 corrected for bias and the value computed is the adjusted
1272 Fisher-Pearson standardized moment coefficient, i.e.
1274 .. math::
1276 G_1=\frac{k_3}{k_2^{3/2}}=
1277 \frac{\sqrt{N(N-1)}}{N-2}\frac{m_3}{m_2^{3/2}}.
1279 References
1280 ----------
1281 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
1282 Probability and Statistics Tables and Formulae. Chapman & Hall: New
1283 York. 2000.
1284 Section 2.2.24.1
1286 Examples
1287 --------
1288 >>> from scipy.stats import skew
1289 >>> skew([1, 2, 3, 4, 5])
1290 0.0
1291 >>> skew([2, 8, 0, 4, 1, 9, 9, 0])
1292 0.2650554122698573
1294 """
1295 a, axis = _chk_asarray(a, axis)
1296 n = a.shape[axis]
1298 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1300 if contains_nan and nan_policy == 'omit':
1301 a = ma.masked_invalid(a)
1302 return mstats_basic.skew(a, axis, bias)
1304 mean = a.mean(axis, keepdims=True)
1305 m2 = _moment(a, 2, axis, mean=mean)
1306 m3 = _moment(a, 3, axis, mean=mean)
1307 with np.errstate(all='ignore'):
1308 zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
1309 vals = np.where(zero, np.nan, m3 / m2**1.5)
1310 if not bias:
1311 can_correct = ~zero & (n > 2)
1312 if can_correct.any():
1313 m2 = np.extract(can_correct, m2)
1314 m3 = np.extract(can_correct, m3)
1315 nval = np.sqrt((n - 1.0) * n) / (n - 2.0) * m3 / m2**1.5
1316 np.place(vals, can_correct, nval)
1318 if vals.ndim == 0:
1319 return vals.item()
1321 return vals
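# --- Editorial sketch (not part of the original module) ----------------------
# Relation between the biased Fisher-Pearson coefficient g1 and the
# bias-corrected G1 from the docstring above: G1 = sqrt(n*(n-1))/(n-2) * g1.
# `_x` is a throwaway name (n = 8 here).
_x = np.array([2.0, 8.0, 0.0, 4.0, 1.0, 9.0, 9.0, 0.0])
assert np.isclose(skew(_x, bias=False), np.sqrt(8 * 7) / 6 * skew(_x))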
1324@_axis_nan_policy_factory(
1325 lambda x: x, result_to_tuple=lambda x: (x,), n_outputs=1
1326)
1327def kurtosis(a, axis=0, fisher=True, bias=True, nan_policy='propagate'):
1328 """Compute the kurtosis (Fisher or Pearson) of a dataset.
1330 Kurtosis is the fourth central moment divided by the square of the
1331 variance. If Fisher's definition is used, then 3.0 is subtracted from
1332 the result to give 0.0 for a normal distribution.
1334 If bias is False then the kurtosis is calculated using k statistics to
1335 eliminate bias coming from biased moment estimators.
1337 Use `kurtosistest` to see if result is close enough to normal.
1339 Parameters
1340 ----------
1341 a : array
1342 Data for which the kurtosis is calculated.
1343 axis : int or None, optional
1344 Axis along which the kurtosis is calculated. Default is 0.
1345 If None, compute over the whole array `a`.
1346 fisher : bool, optional
1347 If True, Fisher's definition is used (normal ==> 0.0). If False,
1348 Pearson's definition is used (normal ==> 3.0).
1349 bias : bool, optional
1350 If False, then the calculations are corrected for statistical bias.
1351 nan_policy : {'propagate', 'raise', 'omit'}, optional
1352 Defines how to handle when input contains nan. 'propagate' returns nan,
1353 'raise' throws an error, 'omit' performs the calculations ignoring nan
1354 values. Default is 'propagate'.
1356 Returns
1357 -------
1358 kurtosis : array
1359 The kurtosis of values along an axis, returning NaN where all values
1360 are equal.
1362 References
1363 ----------
1364 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
1365 Probability and Statistics Tables and Formulae. Chapman & Hall: New
1366 York. 2000.
1368 Examples
1369 --------
1370 In Fisher's definition, the kurtosis of the normal distribution is zero.
1371 In the following example, the kurtosis is close to zero, because it was
1372 calculated from the dataset, not from the continuous distribution.
1374 >>> import numpy as np
1375 >>> from scipy.stats import norm, kurtosis
1376 >>> data = norm.rvs(size=1000, random_state=3)
1377 >>> kurtosis(data)
1378 -0.06928694200380558
1380 The distribution with a higher kurtosis has a heavier tail.
1381 The zero valued kurtosis of the normal distribution in Fisher's definition
1382 can serve as a reference point.
1384 >>> import matplotlib.pyplot as plt
1385 >>> import scipy.stats as stats
1386 >>> from scipy.stats import kurtosis
1388 >>> x = np.linspace(-5, 5, 100)
1389 >>> ax = plt.subplot()
1390 >>> distnames = ['laplace', 'norm', 'uniform']
1392 >>> for distname in distnames:
1393 ... if distname == 'uniform':
1394 ... dist = getattr(stats, distname)(loc=-2, scale=4)
1395 ... else:
1396 ... dist = getattr(stats, distname)
1397 ... data = dist.rvs(size=1000)
1398 ... kur = kurtosis(data, fisher=True)
1399 ... y = dist.pdf(x)
1400 ... ax.plot(x, y, label="{}, {}".format(distname, round(kur, 3)))
1401 ... ax.legend()
1403 The Laplace distribution has a heavier tail than the normal distribution.
1404 The uniform distribution (which has negative kurtosis) has the thinnest
1405 tail.
1407 """
1408 a, axis = _chk_asarray(a, axis)
1410 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1412 if contains_nan and nan_policy == 'omit':
1413 a = ma.masked_invalid(a)
1414 return mstats_basic.kurtosis(a, axis, fisher, bias)
1416 n = a.shape[axis]
1417 mean = a.mean(axis, keepdims=True)
1418 m2 = _moment(a, 2, axis, mean=mean)
1419 m4 = _moment(a, 4, axis, mean=mean)
1420 with np.errstate(all='ignore'):
1421 zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
1422 vals = np.where(zero, np.nan, m4 / m2**2.0)
1424 if not bias:
1425 can_correct = ~zero & (n > 3)
1426 if can_correct.any():
1427 m2 = np.extract(can_correct, m2)
1428 m4 = np.extract(can_correct, m4)
1429 nval = 1.0/(n-2)/(n-3) * ((n**2-1.0)*m4/m2**2.0 - 3*(n-1)**2.0)
1430 np.place(vals, can_correct, nval + 3.0)
1432 if vals.ndim == 0:
1433 vals = vals.item() # array scalar
1435 return vals - 3 if fisher else vals
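# --- Editorial sketch (not part of the original module) ----------------------
# Fisher's and Pearson's definitions differ only by the constant 3 subtracted
# to make the normal distribution the zero reference. `_x` is a throwaway name.
_x = np.array([2.0, 8.0, 0.0, 4.0, 1.0, 9.0, 9.0, 0.0])
assert np.isclose(kurtosis(_x, fisher=True), kurtosis(_x, fisher=False) - 3.0)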
1438DescribeResult = namedtuple('DescribeResult',
1439 ('nobs', 'minmax', 'mean', 'variance', 'skewness',
1440 'kurtosis'))
1443def describe(a, axis=0, ddof=1, bias=True, nan_policy='propagate'):
1444 """Compute several descriptive statistics of the passed array.
1446 Parameters
1447 ----------
1448 a : array_like
1449 Input data.
1450 axis : int or None, optional
1451 Axis along which statistics are calculated. Default is 0.
1452 If None, compute over the whole array `a`.
1453 ddof : int, optional
1454 Delta degrees of freedom (only for variance). Default is 1.
1455 bias : bool, optional
1456 If False, then the skewness and kurtosis calculations are corrected
1457 for statistical bias.
1458 nan_policy : {'propagate', 'raise', 'omit'}, optional
1459 Defines how to handle when input contains nan.
1460 The following options are available (default is 'propagate'):
1462 * 'propagate': returns nan
1463 * 'raise': throws an error
1464 * 'omit': performs the calculations ignoring nan values
1466 Returns
1467 -------
1468 nobs : int or ndarray of ints
1469 Number of observations (length of data along `axis`).
1470 When 'omit' is chosen as nan_policy, the length along each axis
1471 slice is counted separately.
1472 minmax: tuple of ndarrays or floats
1473 Minimum and maximum value of `a` along the given axis.
1474 mean : ndarray or float
1475 Arithmetic mean of `a` along the given axis.
1476 variance : ndarray or float
1477 Unbiased variance of `a` along the given axis; denominator is number
1478 of observations minus one.
1479 skewness : ndarray or float
1480 Skewness of `a` along the given axis, based on moment calculations
1481 with denominator equal to the number of observations, i.e. no degrees
1482 of freedom correction.
1483 kurtosis : ndarray or float
1484 Kurtosis (Fisher) of `a` along the given axis. The kurtosis is
1485 normalized so that it is zero for the normal distribution. No
1486 degrees of freedom are used.
1488 See Also
1489 --------
1490 skew, kurtosis
1492 Examples
1493 --------
1494 >>> import numpy as np
1495 >>> from scipy import stats
1496 >>> a = np.arange(10)
1497 >>> stats.describe(a)
1498 DescribeResult(nobs=10, minmax=(0, 9), mean=4.5,
1499 variance=9.166666666666666, skewness=0.0,
1500 kurtosis=-1.2242424242424244)
1501 >>> b = [[1, 2], [3, 4]]
1502 >>> stats.describe(b)
1503 DescribeResult(nobs=2, minmax=(array([1, 2]), array([3, 4])),
1504 mean=array([2., 3.]), variance=array([2., 2.]),
1505 skewness=array([0., 0.]), kurtosis=array([-2., -2.]))
1507 """
1508 a, axis = _chk_asarray(a, axis)
1510 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1512 if contains_nan and nan_policy == 'omit':
1513 a = ma.masked_invalid(a)
1514 return mstats_basic.describe(a, axis, ddof, bias)
1516 if a.size == 0:
1517 raise ValueError("The input must not be empty.")
1518 n = a.shape[axis]
1519 mm = (np.min(a, axis=axis), np.max(a, axis=axis))
1520 m = np.mean(a, axis=axis)
1521 v = _var(a, axis=axis, ddof=ddof)
1522 sk = skew(a, axis, bias=bias)
1523 kurt = kurtosis(a, axis, bias=bias)
1525 return DescribeResult(n, mm, m, v, sk, kurt)
1527#####################################
1528# NORMALITY TESTS #
1529#####################################
1532def _normtest_finish(z, alternative):
1533 """Common code between all the normality-test functions."""
1534 if alternative == 'less':
1535 prob = distributions.norm.cdf(z)
1536 elif alternative == 'greater':
1537 prob = distributions.norm.sf(z)
1538 elif alternative == 'two-sided':
1539 prob = 2 * distributions.norm.sf(np.abs(z))
1540 else:
1541 raise ValueError("alternative must be "
1542 "'less', 'greater' or 'two-sided'")
1544 if z.ndim == 0:
1545 z = z[()]
1547 return z, prob
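# --- Editorial sketch (not part of the original module) ----------------------
# Because the reference distribution here is the symmetric standard normal,
# the two-sided p-value is twice the smaller of the two one-sided p-values.
# `_z` is a throwaway name.
_z = np.asarray(1.3)
assert np.isclose(_normtest_finish(_z, 'two-sided')[1],
                  2 * min(_normtest_finish(_z, 'less')[1],
                          _normtest_finish(_z, 'greater')[1]))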
1550SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue'))
1553def skewtest(a, axis=0, nan_policy='propagate', alternative='two-sided'):
1554 """Test whether the skew is different from the normal distribution.
1556 This function tests the null hypothesis that the skewness of
1557 the population that the sample was drawn from is the same
1558 as that of a corresponding normal distribution.
1560 Parameters
1561 ----------
1562 a : array
1563 The data to be tested.
1564 axis : int or None, optional
1565 Axis along which statistics are calculated. Default is 0.
1566 If None, compute over the whole array `a`.
1567 nan_policy : {'propagate', 'raise', 'omit'}, optional
1568 Defines how to handle when input contains nan.
1569 The following options are available (default is 'propagate'):
1571 * 'propagate': returns nan
1572 * 'raise': throws an error
1573 * 'omit': performs the calculations ignoring nan values
1575 alternative : {'two-sided', 'less', 'greater'}, optional
1576 Defines the alternative hypothesis. Default is 'two-sided'.
1577 The following options are available:
1579 * 'two-sided': the skewness of the distribution underlying the sample
1580 is different from that of the normal distribution (i.e. 0)
1581 * 'less': the skewness of the distribution underlying the sample
1582 is less than that of the normal distribution
1583 * 'greater': the skewness of the distribution underlying the sample
1584 is greater than that of the normal distribution
1586 .. versionadded:: 1.7.0
1588 Returns
1589 -------
1590 statistic : float
1591 The computed z-score for this test.
1592 pvalue : float
1593 The p-value for the hypothesis test.
1595 Notes
1596 -----
1597 The sample size must be at least 8.
1599 References
1600 ----------
1601 .. [1] R. B. D'Agostino, A. J. Belanger and R. B. D'Agostino Jr.,
1602 "A suggestion for using powerful and informative tests of
1603 normality", American Statistician 44, pp. 316-321, 1990.
1605 Examples
1606 --------
1607 >>> from scipy.stats import skewtest
1608 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8])
1609 SkewtestResult(statistic=1.0108048609177787, pvalue=0.3121098361421897)
1610 >>> skewtest([2, 8, 0, 4, 1, 9, 9, 0])
1611 SkewtestResult(statistic=0.44626385374196975, pvalue=0.6554066631275459)
1612 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8000])
1613 SkewtestResult(statistic=3.571773510360407, pvalue=0.0003545719905823133)
1614 >>> skewtest([100, 100, 100, 100, 100, 100, 100, 101])
1615 SkewtestResult(statistic=3.5717766638478072, pvalue=0.000354567720281634)
1616 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8], alternative='less')
1617 SkewtestResult(statistic=1.0108048609177787, pvalue=0.8439450819289052)
1618 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8], alternative='greater')
1619 SkewtestResult(statistic=1.0108048609177787, pvalue=0.15605491807109484)
1621 """
1622 a, axis = _chk_asarray(a, axis)
1624 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1626 if contains_nan and nan_policy == 'omit':
1627 a = ma.masked_invalid(a)
1628 return mstats_basic.skewtest(a, axis, alternative)
1630 if axis is None:
1631 a = np.ravel(a)
1632 axis = 0
1633 b2 = skew(a, axis)
1634 n = a.shape[axis]
1635 if n < 8:
1636 raise ValueError(
1637 "skewtest is not valid with less than 8 samples; %i samples"
1638 " were given." % int(n))
1639 y = b2 * math.sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2)))
1640 beta2 = (3.0 * (n**2 + 27*n - 70) * (n+1) * (n+3) /
1641 ((n-2.0) * (n+5) * (n+7) * (n+9)))
1642 W2 = -1 + math.sqrt(2 * (beta2 - 1))
1643 delta = 1 / math.sqrt(0.5 * math.log(W2))
1644 alpha = math.sqrt(2.0 / (W2 - 1))
1645 y = np.where(y == 0, 1, y)
1646 Z = delta * np.log(y / alpha + np.sqrt((y / alpha)**2 + 1))
1648 return SkewtestResult(*_normtest_finish(Z, alternative))
1651KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue'))
1654def kurtosistest(a, axis=0, nan_policy='propagate', alternative='two-sided'):
1655 """Test whether a dataset has normal kurtosis.
1657 This function tests the null hypothesis that the kurtosis
1658 of the population from which the sample was drawn is that
1659 of the normal distribution.
1661 Parameters
1662 ----------
1663 a : array
1664 Array of the sample data.
1665 axis : int or None, optional
1666 Axis along which to compute test. Default is 0. If None,
1667 compute over the whole array `a`.
1668 nan_policy : {'propagate', 'raise', 'omit'}, optional
1669 Defines how to handle when input contains nan.
1670 The following options are available (default is 'propagate'):
1672 * 'propagate': returns nan
1673 * 'raise': throws an error
1674 * 'omit': performs the calculations ignoring nan values
1676 alternative : {'two-sided', 'less', 'greater'}, optional
1677 Defines the alternative hypothesis.
1678 The following options are available (default is 'two-sided'):
1680 * 'two-sided': the kurtosis of the distribution underlying the sample
1681 is different from that of the normal distribution
1682 * 'less': the kurtosis of the distribution underlying the sample
1683 is less than that of the normal distribution
1684 * 'greater': the kurtosis of the distribution underlying the sample
1685 is greater than that of the normal distribution
1687 .. versionadded:: 1.7.0
1689 Returns
1690 -------
1691 statistic : float
1692 The computed z-score for this test.
1693 pvalue : float
1694 The p-value for the hypothesis test.
1696 Notes
1697 -----
1698 Valid only for n>20. This function uses the method described in [1]_.
1700 References
1701 ----------
1702 .. [1] see e.g. F. J. Anscombe, W. J. Glynn, "Distribution of the kurtosis
1703 statistic b2 for normal samples", Biometrika, vol. 70, pp. 227-234, 1983.
1705 Examples
1706 --------
1707 >>> import numpy as np
1708 >>> from scipy.stats import kurtosistest
1709 >>> kurtosistest(list(range(20)))
1710 KurtosistestResult(statistic=-1.7058104152122062, pvalue=0.08804338332528348)
1711 >>> kurtosistest(list(range(20)), alternative='less')
1712 KurtosistestResult(statistic=-1.7058104152122062, pvalue=0.04402169166264174)
1713 >>> kurtosistest(list(range(20)), alternative='greater')
1714 KurtosistestResult(statistic=-1.7058104152122062, pvalue=0.9559783083373583)
1716 >>> rng = np.random.default_rng()
1717 >>> s = rng.normal(0, 1, 1000)
1718 >>> kurtosistest(s)
1719 KurtosistestResult(statistic=-1.475047944490622, pvalue=0.14019965402996987)
1721 """
1722 a, axis = _chk_asarray(a, axis)
1724 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1726 if contains_nan and nan_policy == 'omit':
1727 a = ma.masked_invalid(a)
1728 return mstats_basic.kurtosistest(a, axis, alternative)
1730 n = a.shape[axis]
1731 if n < 5:
1732 raise ValueError(
1733 "kurtosistest requires at least 5 observations; %i observations"
1734 " were given." % int(n))
1735 if n < 20:
1736 warnings.warn("kurtosistest only valid for n>=20 ... continuing "
1737 "anyway, n=%i" % int(n))
1738 b2 = kurtosis(a, axis, fisher=False)
1740 E = 3.0*(n-1) / (n+1)
1741 varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5)) # [1]_ Eq. 1
1742 x = (b2-E) / np.sqrt(varb2) # [1]_ Eq. 4
1743 # [1]_ Eq. 2:
1744 sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) /
1745 (n*(n-2)*(n-3)))
1746 # [1]_ Eq. 3:
1747 A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2)))
1748 term1 = 1 - 2/(9.0*A)
1749 denom = 1 + x*np.sqrt(2/(A-4.0))
1750 term2 = np.sign(denom) * np.where(denom == 0.0, np.nan,
1751 np.power((1-2.0/A)/np.abs(denom), 1/3.0))
1752 if np.any(denom == 0):
1753 msg = "Test statistic not defined in some cases due to division by " \
1754 "zero. Return nan in that case..."
1755 warnings.warn(msg, RuntimeWarning)
1757 Z = (term1 - term2) / np.sqrt(2/(9.0*A)) # [1]_ Eq. 5
1759 # zprob uses upper tail, so Z needs to be positive
1760 return KurtosistestResult(*_normtest_finish(Z, alternative))
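# A minimal sketch (the helper name below is hypothetical and not part of the
# module) of how the Anscombe-Glynn Z statistic computed above can be
# sanity-checked: for a large sample drawn from a normal distribution the
# statistic should be approximately standard normal and the p-value large.
def _sketch_kurtosistest_check(seed=0):
    rng = np.random.default_rng(seed)
    sample = rng.normal(size=100_000)
    res = kurtosistest(sample)
    # res.statistic should be close to 0; res.pvalue typically well above 0.05.
    return res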
1763NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue'))
1766def normaltest(a, axis=0, nan_policy='propagate'):
1767 """Test whether a sample differs from a normal distribution.
1769 This function tests the null hypothesis that a sample comes
1770 from a normal distribution. It is based on D'Agostino and
1771 Pearson's [1]_, [2]_ test that combines skew and kurtosis to
1772 produce an omnibus test of normality.
1774 Parameters
1775 ----------
1776 a : array_like
1777 The array containing the sample to be tested.
1778 axis : int or None, optional
1779 Axis along which to compute test. Default is 0. If None,
1780 compute over the whole array `a`.
1781 nan_policy : {'propagate', 'raise', 'omit'}, optional
1782 Defines how to handle when input contains nan.
1783 The following options are available (default is 'propagate'):
1785 * 'propagate': returns nan
1786 * 'raise': throws an error
1787 * 'omit': performs the calculations ignoring nan values
1789 Returns
1790 -------
1791 statistic : float or array
1792 ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and
1793 ``k`` is the z-score returned by `kurtosistest`.
1794 pvalue : float or array
1795 A 2-sided chi squared probability for the hypothesis test.
1797 References
1798 ----------
1799 .. [1] D'Agostino, R. B. (1971), "An omnibus test of normality for
1800 moderate and large sample size", Biometrika, 58, 341-348
1802 .. [2] D'Agostino, R. and Pearson, E. S. (1973), "Tests for departure from
1803 normality", Biometrika, 60, 613-622
1805 Examples
1806 --------
1807 >>> import numpy as np
1808 >>> from scipy import stats
1809 >>> rng = np.random.default_rng()
1810 >>> pts = 1000
1811 >>> a = rng.normal(0, 1, size=pts)
1812 >>> b = rng.normal(2, 1, size=pts)
1813 >>> x = np.concatenate((a, b))
1814 >>> k2, p = stats.normaltest(x)
1815 >>> alpha = 1e-3
1816 >>> print("p = {:g}".format(p))
1817 p = 8.4713e-19
1818 >>> if p < alpha: # null hypothesis: x comes from a normal distribution
1819 ... print("The null hypothesis can be rejected")
1820 ... else:
1821 ... print("The null hypothesis cannot be rejected")
1822 The null hypothesis can be rejected
1824 """
1825 a, axis = _chk_asarray(a, axis)
1827 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1829 if contains_nan and nan_policy == 'omit':
1830 a = ma.masked_invalid(a)
1831 return mstats_basic.normaltest(a, axis)
1833 s, _ = skewtest(a, axis)
1834 k, _ = kurtosistest(a, axis)
1835 k2 = s*s + k*k
1837 return NormaltestResult(k2, distributions.chi2.sf(k2, 2))
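# A minimal sketch (hypothetical helper, not part of the module) confirming
# the decomposition implemented above: the normaltest statistic is the sum of
# the squared skewtest and kurtosistest z-scores, and the p-value is the
# chi-squared (df=2) survival function of that sum.
def _sketch_normaltest_decomposition(seed=0):
    rng = np.random.default_rng(seed)
    x = rng.normal(size=1000)
    s, _ = skewtest(x)
    k, _ = kurtosistest(x)
    k2, p = normaltest(x)
    assert np.isclose(k2, s*s + k*k)
    assert np.isclose(p, distributions.chi2.sf(k2, 2))
    return k2, p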
1840@_axis_nan_policy_factory(SignificanceResult, default_axis=None)
1841def jarque_bera(x, *, axis=None):
1842 """Perform the Jarque-Bera goodness of fit test on sample data.
1844 The Jarque-Bera test tests whether the sample data has the skewness and
1845 kurtosis matching a normal distribution.
1847 Note that this test only works for a large enough number of data samples
1848 (>2000) as the test statistic asymptotically has a Chi-squared distribution
1849 with 2 degrees of freedom.
1851 Parameters
1852 ----------
1853 x : array_like
1854 Observations of a random variable.
1855 axis : int or None, default: 0
1856 If an int, the axis of the input along which to compute the statistic.
1857 The statistic of each axis-slice (e.g. row) of the input will appear in
1858 a corresponding element of the output.
1859 If ``None``, the input will be raveled before computing the statistic.
1861 Returns
1862 -------
1863 result : SignificanceResult
1864 An object with the following attributes:
1866 statistic : float
1867 The test statistic.
1868 pvalue : float
1869 The p-value for the hypothesis test.
1871 References
1872 ----------
1873 .. [1] Jarque, C. and Bera, A. (1980) "Efficient tests for normality,
1874 homoscedasticity and serial independence of regression residuals",
1875 6 Econometric Letters 255-259.
1877 Examples
1878 --------
1879 >>> import numpy as np
1880 >>> from scipy import stats
1881 >>> rng = np.random.default_rng()
1882 >>> x = rng.normal(0, 1, 100000)
1883 >>> jarque_bera_test = stats.jarque_bera(x)
1884 >>> jarque_bera_test
1885 Jarque_beraResult(statistic=3.3415184718131554, pvalue=0.18810419594996775)
1886 >>> jarque_bera_test.statistic
1887 3.3415184718131554
1888 >>> jarque_bera_test.pvalue
1889 0.18810419594996775
1891 """
1892 x = np.asarray(x)
1893 if axis is None:
1894 x = x.ravel()
1895 axis = 0
1897 n = x.shape[axis]
1898 if n == 0:
1899 raise ValueError('At least one observation is required.')
1901 mu = x.mean(axis=axis, keepdims=True)
1902 diffx = x - mu
1903 s = skew(diffx, axis=axis, _no_deco=True)
1904 k = kurtosis(diffx, axis=axis, _no_deco=True)
1905 statistic = n / 6 * (s**2 + k**2 / 4)
1906 pvalue = distributions.chi2.sf(statistic, df=2)
1908 return SignificanceResult(statistic, pvalue)
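# A minimal sketch (hypothetical helper, not part of the module) recomputing
# the Jarque-Bera statistic by hand from the sample skewness and excess
# kurtosis, mirroring the body above: JB = n/6 * (s**2 + k**2/4).
def _sketch_jarque_bera_by_hand(seed=0):
    rng = np.random.default_rng(seed)
    x = rng.normal(size=10_000)
    s = skew(x)
    k = kurtosis(x)  # Fisher (excess) kurtosis, as used above
    jb_manual = x.size / 6 * (s**2 + k**2 / 4)
    res = jarque_bera(x)
    assert np.isclose(res.statistic, jb_manual)
    return res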
1911#####################################
1912# FREQUENCY FUNCTIONS #
1913#####################################
1916def scoreatpercentile(a, per, limit=(), interpolation_method='fraction',
1917 axis=None):
1918 """Calculate the score at a given percentile of the input sequence.
1920 For example, the score at `per=50` is the median. If the desired quantile
1921 lies between two data points, we interpolate between them, according to
1922 the value of `interpolation`. If the parameter `limit` is provided, it
1923 should be a tuple (lower, upper) of two values.
1925 Parameters
1926 ----------
1927 a : array_like
1928 A 1-D array of values from which to extract score.
1929 per : array_like
1930 Percentile(s) at which to extract score. Values should be in range
1931 [0,100].
1932 limit : tuple, optional
1933 Tuple of two scalars, the lower and upper limits within which to
1934 compute the percentile. Values of `a` outside
1935 this (closed) interval will be ignored.
1936 interpolation_method : {'fraction', 'lower', 'higher'}, optional
1937 Specifies the interpolation method to use,
1938 when the desired quantile lies between two data points `i` and `j`
1939 The following options are available (default is 'fraction'):
1941 * 'fraction': ``i + (j - i) * fraction`` where ``fraction`` is the
1942 fractional part of the index surrounded by ``i`` and ``j``
1943 * 'lower': ``i``
1944 * 'higher': ``j``
1946 axis : int, optional
1947 Axis along which the percentiles are computed. Default is None. If
1948 None, compute over the whole array `a`.
1950 Returns
1951 -------
1952 score : float or ndarray
1953 Score at percentile(s).
1955 See Also
1956 --------
1957 percentileofscore, numpy.percentile
1959 Notes
1960 -----
1961 This function will become obsolete in the future.
1962 For NumPy 1.9 and higher, `numpy.percentile` provides all the functionality
1963 that `scoreatpercentile` provides, and it is significantly faster.
1964 Users with numpy >= 1.9 are therefore encouraged to use `numpy.percentile`
1965 instead.
1967 Examples
1968 --------
1969 >>> import numpy as np
1970 >>> from scipy import stats
1971 >>> a = np.arange(100)
1972 >>> stats.scoreatpercentile(a, 50)
1973 49.5
1975 """
1976 # adapted from NumPy's percentile function. When we require numpy >= 1.8,
1977 # the implementation of this function can be replaced by np.percentile.
1978 a = np.asarray(a)
1979 if a.size == 0:
1980 # empty array, return nan(s) with shape matching `per`
1981 if np.isscalar(per):
1982 return np.nan
1983 else:
1984 return np.full(np.asarray(per).shape, np.nan, dtype=np.float64)
1986 if limit:
1987 a = a[(limit[0] <= a) & (a <= limit[1])]
1989 sorted_ = np.sort(a, axis=axis)
1990 if axis is None:
1991 axis = 0
1993 return _compute_qth_percentile(sorted_, per, interpolation_method, axis)
1996# handle sequence of per's without calling sort multiple times
1997def _compute_qth_percentile(sorted_, per, interpolation_method, axis):
1998 if not np.isscalar(per):
1999 score = [_compute_qth_percentile(sorted_, i,
2000 interpolation_method, axis)
2001 for i in per]
2002 return np.array(score)
2004 if not (0 <= per <= 100):
2005 raise ValueError("percentile must be in the range [0, 100]")
2007 indexer = [slice(None)] * sorted_.ndim
2008 idx = per / 100. * (sorted_.shape[axis] - 1)
2010 if int(idx) != idx:
2011 # round fractional indices according to interpolation method
2012 if interpolation_method == 'lower':
2013 idx = int(np.floor(idx))
2014 elif interpolation_method == 'higher':
2015 idx = int(np.ceil(idx))
2016 elif interpolation_method == 'fraction':
2017 pass # keep idx as fraction and interpolate
2018 else:
2019 raise ValueError("interpolation_method can only be 'fraction', "
2020 "'lower' or 'higher'")
2022 i = int(idx)
2023 if i == idx:
2024 indexer[axis] = slice(i, i + 1)
2025 weights = array(1)
2026 sumval = 1.0
2027 else:
2028 indexer[axis] = slice(i, i + 2)
2029 j = i + 1
2030 weights = array([(j - idx), (idx - i)], float)
2031 wshape = [1] * sorted_.ndim
2032 wshape[axis] = 2
2033 weights.shape = wshape
2034 sumval = weights.sum()
2036 # Use np.add.reduce (== np.sum but a little faster) to coerce data type
2037 return np.add.reduce(sorted_[tuple(indexer)] * weights, axis=axis) / sumval
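# A minimal sketch (hypothetical helper, not part of the module) illustrating
# that, with the default 'fraction' interpolation implemented above,
# `scoreatpercentile` agrees with `numpy.percentile` (linear interpolation)
# on 1-D data.
def _sketch_scoreatpercentile_vs_numpy(seed=0):
    rng = np.random.default_rng(seed)
    a = rng.normal(size=101)
    per = [10, 25, 50, 75, 90]
    ours = scoreatpercentile(a, per)
    theirs = np.percentile(a, per)
    assert np.allclose(ours, theirs)
    return ours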
2040def percentileofscore(a, score, kind='rank', nan_policy='propagate'):
2041 """Compute the percentile rank of a score relative to a list of scores.
2043 A `percentileofscore` of, for example, 80% means that 80% of the
2044 scores in `a` are below the given score. In the case of gaps or
2045 ties, the exact definition depends on the optional keyword, `kind`.
2047 Parameters
2048 ----------
2049 a : array_like
2050 Array to which `score` is compared.
2051 score : array_like
2052 Scores to compute percentiles for.
2053 kind : {'rank', 'weak', 'strict', 'mean'}, optional
2054 Specifies the interpretation of the resulting score.
2055 The following options are available (default is 'rank'):
2057 * 'rank': Average percentage ranking of score. In case of multiple
2058 matches, average the percentage rankings of all matching scores.
2059 * 'weak': This kind corresponds to the definition of a cumulative
2060 distribution function. A percentileofscore of 80% means that 80%
2061 of values are less than or equal to the provided score.
2062 * 'strict': Similar to "weak", except that only values that are
2063 strictly less than the given score are counted.
2064 * 'mean': The average of the "weak" and "strict" scores, often used
2065 in testing. See https://en.wikipedia.org/wiki/Percentile_rank
2066 nan_policy : {'propagate', 'raise', 'omit'}, optional
2067 Specifies how to treat `nan` values in `a`.
2068 The following options are available (default is 'propagate'):
2070 * 'propagate': returns nan (for each value in `score`).
2071 * 'raise': throws an error
2072 * 'omit': performs the calculations ignoring nan values
2074 Returns
2075 -------
2076 pcos : float
2077 Percentile-position of score (0-100) relative to `a`.
2079 See Also
2080 --------
2081 numpy.percentile
2082 scipy.stats.scoreatpercentile, scipy.stats.rankdata
2084 Examples
2085 --------
2086 Three-quarters of the given values lie below a given score:
2088 >>> import numpy as np
2089 >>> from scipy import stats
2090 >>> stats.percentileofscore([1, 2, 3, 4], 3)
2091 75.0
2093 With multiple matches, note how the scores of the two matches, 0.6
2094 and 0.8 respectively, are averaged:
2096 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3)
2097 70.0
2099 Only 2/5 values are strictly less than 3:
2101 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='strict')
2102 40.0
2104 But 4/5 values are less than or equal to 3:
2106 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='weak')
2107 80.0
2109 The average between the weak and the strict scores is:
2111 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='mean')
2112 60.0
2114 Score arrays (of any dimensionality) are supported:
2116 >>> stats.percentileofscore([1, 2, 3, 3, 4], [2, 3])
2117 array([40., 70.])
2119 The inputs can be infinite:
2121 >>> stats.percentileofscore([-np.inf, 0, 1, np.inf], [1, 2, np.inf])
2122 array([75., 75., 100.])
2124 If `a` is empty, then the resulting percentiles are all `nan`:
2126 >>> stats.percentileofscore([], [1, 2])
2127 array([nan, nan])
2128 """
2130 a = np.asarray(a)
2131 n = len(a)
2132 score = np.asarray(score)
2134 # Nan treatment
2135 cna, npa = _contains_nan(a, nan_policy, use_summation=False)
2136 cns, nps = _contains_nan(score, nan_policy, use_summation=False)
2138 if (cna or cns) and nan_policy == 'raise':
2139 raise ValueError("The input contains nan values")
2141 if cns:
2142 # If a score is nan, then the output should be nan
2143 # (also if nan_policy is "omit", because it only applies to `a`)
2144 score = ma.masked_where(np.isnan(score), score)
2146 if cna:
2147 if nan_policy == "omit":
2148 # Don't count nans
2149 a = ma.masked_where(np.isnan(a), a)
2150 n = a.count()
2152 if nan_policy == "propagate":
2153 # All outputs should be nans
2154 n = 0
2156 # Cannot compare to empty list ==> nan
2157 if n == 0:
2158 perct = np.full_like(score, np.nan, dtype=np.float64)
2160 else:
2161 # Prepare broadcasting
2162 score = score[..., None]
2164 def count(x):
2165 return np.count_nonzero(x, -1)
2167 # Despite using masked_array to omit nan values from processing,
2168 # the CI tests on "Azure pipelines" (but not on the other CI servers)
2169 # emit warnings when there are nan values, contrary to the purpose
2170 # of masked_arrays. As a fix, we simply suppress the warnings.
2171 with suppress_warnings() as sup:
2172 sup.filter(RuntimeWarning,
2173 "invalid value encountered in less")
2174 sup.filter(RuntimeWarning,
2175 "invalid value encountered in greater")
2177 # Main computations/logic
2178 if kind == 'rank':
2179 left = count(a < score)
2180 right = count(a <= score)
2181 plus1 = left < right
2182 perct = (left + right + plus1) * (50.0 / n)
2183 elif kind == 'strict':
2184 perct = count(a < score) * (100.0 / n)
2185 elif kind == 'weak':
2186 perct = count(a <= score) * (100.0 / n)
2187 elif kind == 'mean':
2188 left = count(a < score)
2189 right = count(a <= score)
2190 perct = (left + right) * (50.0 / n)
2191 else:
2192 raise ValueError(
2193 "kind can only be 'rank', 'strict', 'weak' or 'mean'")
2195 # Re-insert nan values
2196 perct = ma.filled(perct, np.nan)
2198 if perct.ndim == 0:
2199 return perct[()]
2200 return perct
2203HistogramResult = namedtuple('HistogramResult',
2204 ('count', 'lowerlimit', 'binsize', 'extrapoints'))
2207def _histogram(a, numbins=10, defaultlimits=None, weights=None,
2208 printextras=False):
2209 """Create a histogram.
2211 Separate the range into several bins and return the number of instances
2212 in each bin.
2214 Parameters
2215 ----------
2216 a : array_like
2217 Array of scores which will be put into bins.
2218 numbins : int, optional
2219 The number of bins to use for the histogram. Default is 10.
2220 defaultlimits : tuple (lower, upper), optional
2221 The lower and upper values for the range of the histogram.
2222 If no value is given, a range slightly larger than the range of the
2223 values in a is used. Specifically ``(a.min() - s, a.max() + s)``,
2224 where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
2225 weights : array_like, optional
2226 The weights for each value in `a`. Default is None, which gives each
2227 value a weight of 1.0
2228 printextras : bool, optional
2229 If True and there are extra points (i.e. points that fall outside
2230 the bin limits), a warning is raised reporting how many such points
2231 there are. Default is False.
2233 Returns
2234 -------
2235 count : ndarray
2236 Number of points (or sum of weights) in each bin.
2237 lowerlimit : float
2238 Lowest value of histogram, the lower limit of the first bin.
2239 binsize : float
2240 The size of the bins (all bins have the same size).
2241 extrapoints : int
2242 The number of points outside the range of the histogram.
2244 See Also
2245 --------
2246 numpy.histogram
2248 Notes
2249 -----
2250 This histogram is based on numpy's histogram but, by default, uses a
2251 slightly larger range if `defaultlimits` is not set.
2253 """
2254 a = np.ravel(a)
2255 if defaultlimits is None:
2256 if a.size == 0:
2257 # handle empty arrays. Undetermined range, so use 0-1.
2258 defaultlimits = (0, 1)
2259 else:
2260 # no range given, so use values in `a`
2261 data_min = a.min()
2262 data_max = a.max()
2263 # Have bins extend past min and max values slightly
2264 s = (data_max - data_min) / (2. * (numbins - 1.))
2265 defaultlimits = (data_min - s, data_max + s)
2267 # use numpy's histogram method to compute bins
2268 hist, bin_edges = np.histogram(a, bins=numbins, range=defaultlimits,
2269 weights=weights)
2271 # hist is not always float; convert to stay consistent with the old output
2271 hist = np.array(hist, dtype=float)
2272 # fixed width for bins is assumed, as numpy's histogram gives
2273 # fixed width bins for int values for 'bins'
2274 binsize = bin_edges[1] - bin_edges[0]
2275 # calculate number of extra points
2276 extrapoints = len([v for v in a
2277 if defaultlimits[0] > v or v > defaultlimits[1]])
2278 if extrapoints > 0 and printextras:
2279 warnings.warn("Points outside given histogram range = %s"
2280 % extrapoints)
2282 return HistogramResult(hist, defaultlimits[0], binsize, extrapoints)
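# A minimal sketch (hypothetical helper, not part of the module) of the
# default limits chosen above: the bins extend half a bin-width beyond the
# data range, and the counts then agree with `numpy.histogram` computed over
# that same range.
def _sketch_histogram_default_limits(seed=0):
    rng = np.random.default_rng(seed)
    a = rng.normal(size=500)
    numbins = 10
    res = _histogram(a, numbins=numbins)
    s = (a.max() - a.min()) / (2. * (numbins - 1.))
    limits = (a.min() - s, a.max() + s)
    counts, _ = np.histogram(a, bins=numbins, range=limits)
    assert np.allclose(res.count, counts)
    return res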
2285CumfreqResult = namedtuple('CumfreqResult',
2286 ('cumcount', 'lowerlimit', 'binsize',
2287 'extrapoints'))
2290def cumfreq(a, numbins=10, defaultreallimits=None, weights=None):
2291 """Return a cumulative frequency histogram, using the histogram function.
2293 A cumulative histogram is a mapping that counts the cumulative number of
2294 observations in all of the bins up to the specified bin.
2296 Parameters
2297 ----------
2298 a : array_like
2299 Input array.
2300 numbins : int, optional
2301 The number of bins to use for the histogram. Default is 10.
2302 defaultreallimits : tuple (lower, upper), optional
2303 The lower and upper values for the range of the histogram.
2304 If no value is given, a range slightly larger than the range of the
2305 values in `a` is used. Specifically ``(a.min() - s, a.max() + s)``,
2306 where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
2307 weights : array_like, optional
2308 The weights for each value in `a`. Default is None, which gives each
2309 value a weight of 1.0
2311 Returns
2312 -------
2313 cumcount : ndarray
2314 Binned values of cumulative frequency.
2315 lowerlimit : float
2316 Lower real limit
2317 binsize : float
2318 Width of each bin.
2319 extrapoints : int
2320 Extra points.
2322 Examples
2323 --------
2324 >>> import numpy as np
2325 >>> import matplotlib.pyplot as plt
2326 >>> from scipy import stats
2327 >>> rng = np.random.default_rng()
2328 >>> x = [1, 4, 2, 1, 3, 1]
2329 >>> res = stats.cumfreq(x, numbins=4, defaultreallimits=(1.5, 5))
2330 >>> res.cumcount
2331 array([ 1., 2., 3., 3.])
2332 >>> res.extrapoints
2333 3
2335 Create a normal distribution with 1000 random values
2337 >>> samples = stats.norm.rvs(size=1000, random_state=rng)
2339 Calculate cumulative frequencies
2341 >>> res = stats.cumfreq(samples, numbins=25)
2343 Calculate space of values for x
2345 >>> x = res.lowerlimit + np.linspace(0, res.binsize*res.cumcount.size,
2346 ... res.cumcount.size)
2348 Plot histogram and cumulative histogram
2350 >>> fig = plt.figure(figsize=(10, 4))
2351 >>> ax1 = fig.add_subplot(1, 2, 1)
2352 >>> ax2 = fig.add_subplot(1, 2, 2)
2353 >>> ax1.hist(samples, bins=25)
2354 >>> ax1.set_title('Histogram')
2355 >>> ax2.bar(x, res.cumcount, width=res.binsize)
2356 >>> ax2.set_title('Cumulative histogram')
2357 >>> ax2.set_xlim([x.min(), x.max()])
2359 >>> plt.show()
2361 """
2362 h, l, b, e = _histogram(a, numbins, defaultreallimits, weights=weights)
2363 cumhist = np.cumsum(h * 1, axis=0)
2364 return CumfreqResult(cumhist, l, b, e)
2367RelfreqResult = namedtuple('RelfreqResult',
2368 ('frequency', 'lowerlimit', 'binsize',
2369 'extrapoints'))
2372def relfreq(a, numbins=10, defaultreallimits=None, weights=None):
2373 """Return a relative frequency histogram, using the histogram function.
2375 A relative frequency histogram is a mapping of the number of
2376 observations in each of the bins relative to the total of observations.
2378 Parameters
2379 ----------
2380 a : array_like
2381 Input array.
2382 numbins : int, optional
2383 The number of bins to use for the histogram. Default is 10.
2384 defaultreallimits : tuple (lower, upper), optional
2385 The lower and upper values for the range of the histogram.
2386 If no value is given, a range slightly larger than the range of the
2387 values in a is used. Specifically ``(a.min() - s, a.max() + s)``,
2388 where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
2389 weights : array_like, optional
2390 The weights for each value in `a`. Default is None, which gives each
2391 value a weight of 1.0
2393 Returns
2394 -------
2395 frequency : ndarray
2396 Binned values of relative frequency.
2397 lowerlimit : float
2398 Lower real limit.
2399 binsize : float
2400 Width of each bin.
2401 extrapoints : int
2402 Extra points.
2404 Examples
2405 --------
2406 >>> import numpy as np
2407 >>> import matplotlib.pyplot as plt
2408 >>> from scipy import stats
2409 >>> rng = np.random.default_rng()
2410 >>> a = np.array([2, 4, 1, 2, 3, 2])
2411 >>> res = stats.relfreq(a, numbins=4)
2412 >>> res.frequency
2413 array([ 0.16666667, 0.5 , 0.16666667, 0.16666667])
2414 >>> np.sum(res.frequency) # relative frequencies should add up to 1
2415 1.0
2417 Create a normal distribution with 1000 random values
2419 >>> samples = stats.norm.rvs(size=1000, random_state=rng)
2421 Calculate relative frequencies
2423 >>> res = stats.relfreq(samples, numbins=25)
2425 Calculate space of values for x
2427 >>> x = res.lowerlimit + np.linspace(0, res.binsize*res.frequency.size,
2428 ... res.frequency.size)
2430 Plot relative frequency histogram
2432 >>> fig = plt.figure(figsize=(5, 4))
2433 >>> ax = fig.add_subplot(1, 1, 1)
2434 >>> ax.bar(x, res.frequency, width=res.binsize)
2435 >>> ax.set_title('Relative frequency histogram')
2436 >>> ax.set_xlim([x.min(), x.max()])
2438 >>> plt.show()
2440 """
2441 a = np.asanyarray(a)
2442 h, l, b, e = _histogram(a, numbins, defaultreallimits, weights=weights)
2443 h = h / a.shape[0]
2445 return RelfreqResult(h, l, b, e)
2448#####################################
2449# VARIABILITY FUNCTIONS #
2450#####################################
2452def obrientransform(*samples):
2453 """Compute the O'Brien transform on input data (any number of arrays).
2455 Used to test for homogeneity of variance prior to running one-way stats.
2456 Each array in ``*samples`` is one level of a factor.
2457 If `f_oneway` is run on the transformed data and found significant,
2458 the variances are unequal. From Maxwell and Delaney [1]_, p.112.
2460 Parameters
2461 ----------
2462 sample1, sample2, ... : array_like
2463 Any number of arrays.
2465 Returns
2466 -------
2467 obrientransform : ndarray
2468 Transformed data for use in an ANOVA. The first dimension
2469 of the result corresponds to the sequence of transformed
2470 arrays. If the arrays given are all 1-D of the same length,
2471 the return value is a 2-D array; otherwise it is a 1-D array
2472 of type object, with each element being an ndarray.
2474 References
2475 ----------
2476 .. [1] S. E. Maxwell and H. D. Delaney, "Designing Experiments and
2477 Analyzing Data: A Model Comparison Perspective", Wadsworth, 1990.
2479 Examples
2480 --------
2481 We'll test the following data sets for differences in their variance.
2483 >>> x = [10, 11, 13, 9, 7, 12, 12, 9, 10]
2484 >>> y = [13, 21, 5, 10, 8, 14, 10, 12, 7, 15]
2486 Apply the O'Brien transform to the data.
2488 >>> from scipy.stats import obrientransform
2489 >>> tx, ty = obrientransform(x, y)
2491 Use `scipy.stats.f_oneway` to apply a one-way ANOVA test to the
2492 transformed data.
2494 >>> from scipy.stats import f_oneway
2495 >>> F, p = f_oneway(tx, ty)
2496 >>> p
2497 0.1314139477040335
2499 If we require that ``p < 0.05`` for significance, we cannot conclude
2500 that the variances are different.
2502 """
2503 TINY = np.sqrt(np.finfo(float).eps)
2505 # `arrays` will hold the transformed arguments.
2506 arrays = []
2507 sLast = None
2509 for sample in samples:
2510 a = np.asarray(sample)
2511 n = len(a)
2512 mu = np.mean(a)
2513 sq = (a - mu)**2
2514 sumsq = sq.sum()
2516 # The O'Brien transform.
2517 t = ((n - 1.5) * n * sq - 0.5 * sumsq) / ((n - 1) * (n - 2))
2519 # Check that the mean of the transformed data is equal to the
2520 # original variance.
2521 var = sumsq / (n - 1)
2522 if abs(var - np.mean(t)) > TINY:
2523 raise ValueError('Lack of convergence in obrientransform.')
2525 arrays.append(t)
2526 sLast = a.shape
2528 if sLast:
2529 for arr in arrays[:-1]:
2530 if sLast != arr.shape:
2531 return np.array(arrays, dtype=object)
2532 return np.array(arrays)
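# A minimal sketch (hypothetical helper, not part of the module) of the
# property that the loop above verifies for each sample: the mean of the
# transformed values equals the (ddof=1) sample variance of the original data.
def _sketch_obrientransform_property():
    x = [10, 11, 13, 9, 7, 12, 12, 9, 10]
    (tx,) = obrientransform(x)
    assert np.isclose(np.mean(tx), np.var(x, ddof=1))
    return tx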
2535def sem(a, axis=0, ddof=1, nan_policy='propagate'):
2536 """Compute standard error of the mean.
2538 Calculate the standard error of the mean (or standard error of
2539 measurement) of the values in the input array.
2541 Parameters
2542 ----------
2543 a : array_like
2544 An array containing the values for which the standard error is
2545 returned.
2546 axis : int or None, optional
2547 Axis along which to operate. Default is 0. If None, compute over
2548 the whole array `a`.
2549 ddof : int, optional
2550 Delta degrees-of-freedom. How many degrees of freedom to adjust
2551 for bias in limited samples relative to the population estimate
2552 of variance. Defaults to 1.
2553 nan_policy : {'propagate', 'raise', 'omit'}, optional
2554 Defines how to handle when input contains nan.
2555 The following options are available (default is 'propagate'):
2557 * 'propagate': returns nan
2558 * 'raise': throws an error
2559 * 'omit': performs the calculations ignoring nan values
2561 Returns
2562 -------
2563 s : ndarray or float
2564 The standard error of the mean in the sample(s), along the input axis.
2566 Notes
2567 -----
2568 The default value for `ddof` is different from the default (0) used by other
2569 ddof-containing routines, such as np.std and np.nanstd.
2571 Examples
2572 --------
2573 Find standard error along the first axis:
2575 >>> import numpy as np
2576 >>> from scipy import stats
2577 >>> a = np.arange(20).reshape(5,4)
2578 >>> stats.sem(a)
2579 array([ 2.8284, 2.8284, 2.8284, 2.8284])
2581 Find standard error across the whole array, using n degrees of freedom:
2583 >>> stats.sem(a, axis=None, ddof=0)
2584 1.2893796958227628
2586 """
2587 a, axis = _chk_asarray(a, axis)
2589 contains_nan, nan_policy = _contains_nan(a, nan_policy)
2591 if contains_nan and nan_policy == 'omit':
2592 a = ma.masked_invalid(a)
2593 return mstats_basic.sem(a, axis, ddof)
2595 n = a.shape[axis]
2596 s = np.std(a, axis=axis, ddof=ddof) / np.sqrt(n)
2597 return s
2600def _isconst(x):
2601 """
2602 Check if all values in x are the same. nans are ignored.
2604 x must be a 1d array.
2606 The return value is a 1d array with length 1, so it can be used
2607 in np.apply_along_axis.
2608 """
2609 y = x[~np.isnan(x)]
2610 if y.size == 0:
2611 return np.array([True])
2612 else:
2613 return (y[0] == y).all(keepdims=True)
2616def _quiet_nanmean(x):
2617 """
2618 Compute nanmean for the 1d array x, but quietly return nan if x is all nan.
2620 The return value is a 1d array with length 1, so it can be used
2621 in np.apply_along_axis.
2622 """
2623 y = x[~np.isnan(x)]
2624 if y.size == 0:
2625 return np.array([np.nan])
2626 else:
2627 return np.mean(y, keepdims=True)
2630def _quiet_nanstd(x, ddof=0):
2631 """
2632 Compute nanstd for the 1d array x, but quietly return nan if x is all nan.
2634 The return value is a 1d array with length 1, so it can be used
2635 in np.apply_along_axis.
2636 """
2637 y = x[~np.isnan(x)]
2638 if y.size == 0:
2639 return np.array([np.nan])
2640 else:
2641 return np.std(y, keepdims=True, ddof=ddof)
2644def zscore(a, axis=0, ddof=0, nan_policy='propagate'):
2645 """
2646 Compute the z score.
2648 Compute the z score of each value in the sample, relative to the
2649 sample mean and standard deviation.
2651 Parameters
2652 ----------
2653 a : array_like
2654 An array like object containing the sample data.
2655 axis : int or None, optional
2656 Axis along which to operate. Default is 0. If None, compute over
2657 the whole array `a`.
2658 ddof : int, optional
2659 Degrees of freedom correction in the calculation of the
2660 standard deviation. Default is 0.
2661 nan_policy : {'propagate', 'raise', 'omit'}, optional
2662 Defines how to handle when input contains nan. 'propagate' returns nan,
2663 'raise' throws an error, 'omit' performs the calculations ignoring nan
2664 values. Default is 'propagate'. Note that when the value is 'omit',
2665 nans in the input also propagate to the output, but they do not affect
2666 the z-scores computed for the non-nan values.
2668 Returns
2669 -------
2670 zscore : array_like
2671 The z-scores, standardized by mean and standard deviation of
2672 input array `a`.
2674 Notes
2675 -----
2676 This function preserves ndarray subclasses, and works also with
2677 matrices and masked arrays (it uses `asanyarray` instead of
2678 `asarray` for parameters).
2680 Examples
2681 --------
2682 >>> import numpy as np
2683 >>> a = np.array([ 0.7972, 0.0767, 0.4383, 0.7866, 0.8091,
2684 ... 0.1954, 0.6307, 0.6599, 0.1065, 0.0508])
2685 >>> from scipy import stats
2686 >>> stats.zscore(a)
2687 array([ 1.1273, -1.247 , -0.0552, 1.0923, 1.1664, -0.8559, 0.5786,
2688 0.6748, -1.1488, -1.3324])
2690 Computing along a specified axis, using n-1 degrees of freedom
2691 (``ddof=1``) to calculate the standard deviation:
2693 >>> b = np.array([[ 0.3148, 0.0478, 0.6243, 0.4608],
2694 ... [ 0.7149, 0.0775, 0.6072, 0.9656],
2695 ... [ 0.6341, 0.1403, 0.9759, 0.4064],
2696 ... [ 0.5918, 0.6948, 0.904 , 0.3721],
2697 ... [ 0.0921, 0.2481, 0.1188, 0.1366]])
2698 >>> stats.zscore(b, axis=1, ddof=1)
2699 array([[-0.19264823, -1.28415119, 1.07259584, 0.40420358],
2700 [ 0.33048416, -1.37380874, 0.04251374, 1.00081084],
2701 [ 0.26796377, -1.12598418, 1.23283094, -0.37481053],
2702 [-0.22095197, 0.24468594, 1.19042819, -1.21416216],
2703 [-0.82780366, 1.4457416 , -0.43867764, -0.1792603 ]])
2705 An example with `nan_policy='omit'`:
2707 >>> x = np.array([[25.11, 30.10, np.nan, 32.02, 43.15],
2708 ... [14.95, 16.06, 121.25, 94.35, 29.81]])
2709 >>> stats.zscore(x, axis=1, nan_policy='omit')
2710 array([[-1.13490897, -0.37830299, nan, -0.08718406, 1.60039602],
2711 [-0.91611681, -0.89090508, 1.4983032 , 0.88731639, -0.5785977 ]])
2712 """
2713 return zmap(a, a, axis=axis, ddof=ddof, nan_policy=nan_policy)
2716def gzscore(a, *, axis=0, ddof=0, nan_policy='propagate'):
2717 """
2718 Compute the geometric standard score.
2720 Compute the geometric z score of each strictly positive value in the
2721 sample, relative to the geometric mean and standard deviation.
2722 Mathematically the geometric z score can be evaluated as::
2724 gzscore = log(a/gmu) / log(gsigma)
2726 where ``gmu`` (resp. ``gsigma``) is the geometric mean (resp. standard
2727 deviation).
2729 Parameters
2730 ----------
2731 a : array_like
2732 Sample data.
2733 axis : int or None, optional
2734 Axis along which to operate. Default is 0. If None, compute over
2735 the whole array `a`.
2736 ddof : int, optional
2737 Degrees of freedom correction in the calculation of the
2738 standard deviation. Default is 0.
2739 nan_policy : {'propagate', 'raise', 'omit'}, optional
2740 Defines how to handle when input contains nan. 'propagate' returns nan,
2741 'raise' throws an error, 'omit' performs the calculations ignoring nan
2742 values. Default is 'propagate'. Note that when the value is 'omit',
2743 nans in the input also propagate to the output, but they do not affect
2744 the geometric z scores computed for the non-nan values.
2746 Returns
2747 -------
2748 gzscore : array_like
2749 The geometric z scores, standardized by geometric mean and geometric
2750 standard deviation of input array `a`.
2752 See Also
2753 --------
2754 gmean : Geometric mean
2755 gstd : Geometric standard deviation
2756 zscore : Standard score
2758 Notes
2759 -----
2760 This function preserves ndarray subclasses, and works also with
2761 matrices and masked arrays (it uses ``asanyarray`` instead of
2762 ``asarray`` for parameters).
2764 .. versionadded:: 1.8
2766 Examples
2767 --------
2768 Draw samples from a log-normal distribution:
2770 >>> import numpy as np
2771 >>> from scipy.stats import zscore, gzscore
2772 >>> import matplotlib.pyplot as plt
2774 >>> rng = np.random.default_rng()
2775 >>> mu, sigma = 3., 1. # mean and standard deviation
2776 >>> x = rng.lognormal(mu, sigma, size=500)
2778 Display the histogram of the samples:
2780 >>> fig, ax = plt.subplots()
2781 >>> ax.hist(x, 50)
2782 >>> plt.show()
2784 Display the histogram of the samples standardized by the classical zscore.
2785 Distribution is rescaled but its shape is unchanged.
2787 >>> fig, ax = plt.subplots()
2788 >>> ax.hist(zscore(x), 50)
2789 >>> plt.show()
2791 Demonstrate that the distribution of geometric zscores is rescaled and
2792 quasinormal:
2794 >>> fig, ax = plt.subplots()
2795 >>> ax.hist(gzscore(x), 50)
2796 >>> plt.show()
2798 """
2799 a = np.asanyarray(a)
2800 log = ma.log if isinstance(a, ma.MaskedArray) else np.log
2802 return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan_policy)
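# A minimal sketch (hypothetical helper, not part of the module) confirming
# the formula quoted in the docstring above: the geometric z score equals
# log(a/gmu) / log(gsigma), with the geometric mean and geometric standard
# deviation computed using the same ddof as `gzscore` (default 0).
def _sketch_gzscore_formula(seed=0):
    rng = np.random.default_rng(seed)
    x = rng.lognormal(mean=3.0, sigma=1.0, size=100)
    gmu = gmean(x)
    gsigma = gstd(x, ddof=0)  # match gzscore's default ddof=0
    manual = np.log(x / gmu) / np.log(gsigma)
    assert np.allclose(gzscore(x), manual)
    return manual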
2805def zmap(scores, compare, axis=0, ddof=0, nan_policy='propagate'):
2806 """
2807 Calculate the relative z-scores.
2809 Return an array of z-scores, i.e., scores that are standardized to
2810 zero mean and unit variance, where mean and variance are calculated
2811 from the comparison array.
2813 Parameters
2814 ----------
2815 scores : array_like
2816 The input for which z-scores are calculated.
2817 compare : array_like
2818 The input from which the mean and standard deviation of the
2819 normalization are taken; assumed to have the same dimension as
2820 `scores`.
2821 axis : int or None, optional
2822 Axis over which mean and variance of `compare` are calculated.
2823 Default is 0. If None, compute over the whole array `scores`.
2824 ddof : int, optional
2825 Degrees of freedom correction in the calculation of the
2826 standard deviation. Default is 0.
2827 nan_policy : {'propagate', 'raise', 'omit'}, optional
2828 Defines how to handle the occurrence of nans in `compare`.
2829 'propagate' returns nan, 'raise' raises an exception, 'omit'
2830 performs the calculations ignoring nan values. Default is
2831 'propagate'. Note that when the value is 'omit', nans in `scores`
2832 also propagate to the output, but they do not affect the z-scores
2833 computed for the non-nan values.
2835 Returns
2836 -------
2837 zscore : array_like
2838 Z-scores, in the same shape as `scores`.
2840 Notes
2841 -----
2842 This function preserves ndarray subclasses, and works also with
2843 matrices and masked arrays (it uses `asanyarray` instead of
2844 `asarray` for parameters).
2846 Examples
2847 --------
2848 >>> from scipy.stats import zmap
2849 >>> a = [0.5, 2.0, 2.5, 3]
2850 >>> b = [0, 1, 2, 3, 4]
2851 >>> zmap(a, b)
2852 array([-1.06066017, 0. , 0.35355339, 0.70710678])
2854 """
2855 a = np.asanyarray(compare)
2857 if a.size == 0:
2858 return np.empty(a.shape)
2860 contains_nan, nan_policy = _contains_nan(a, nan_policy)
2862 if contains_nan and nan_policy == 'omit':
2863 if axis is None:
2864 mn = _quiet_nanmean(a.ravel())
2865 std = _quiet_nanstd(a.ravel(), ddof=ddof)
2866 isconst = _isconst(a.ravel())
2867 else:
2868 mn = np.apply_along_axis(_quiet_nanmean, axis, a)
2869 std = np.apply_along_axis(_quiet_nanstd, axis, a, ddof=ddof)
2870 isconst = np.apply_along_axis(_isconst, axis, a)
2871 else:
2872 mn = a.mean(axis=axis, keepdims=True)
2873 std = a.std(axis=axis, ddof=ddof, keepdims=True)
2874 if axis is None:
2875 isconst = (a.item(0) == a).all()
2876 else:
2877 isconst = (_first(a, axis) == a).all(axis=axis, keepdims=True)
2879 # Set std deviations that are 0 to 1 to avoid division by 0.
2880 std[isconst] = 1.0
2881 z = (scores - mn) / std
2882 # Set the outputs associated with a constant input to nan.
2883 z[np.broadcast_to(isconst, z.shape)] = np.nan
2884 return z
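# A minimal sketch (hypothetical helper, not part of the module) of the
# constant-input handling above: when `compare` has zero spread along the
# chosen axis, the corresponding z-scores are set to nan instead of raising a
# division-by-zero warning.
def _sketch_zmap_constant_compare():
    scores = np.array([1.0, 2.0, 3.0])
    compare = np.array([5.0, 5.0, 5.0])  # constant, so its std is 0
    z = zmap(scores, compare)
    assert np.isnan(z).all()
    return z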
2887def gstd(a, axis=0, ddof=1):
2888 """
2889 Calculate the geometric standard deviation of an array.
2891 The geometric standard deviation describes the spread of a set of numbers
2892 where the geometric mean is preferred. It is a multiplicative factor, and
2893 so is a dimensionless quantity.
2895 It is defined as the exponent of the standard deviation of ``log(a)``.
2896 Mathematically the population geometric standard deviation can be
2897 evaluated as::
2899 gstd = exp(std(log(a)))
2901 .. versionadded:: 1.3.0
2903 Parameters
2904 ----------
2905 a : array_like
2906 An array like object containing the sample data.
2907 axis : int, tuple or None, optional
2908 Axis along which to operate. Default is 0. If None, compute over
2909 the whole array `a`.
2910 ddof : int, optional
2911 Degree of freedom correction in the calculation of the
2912 geometric standard deviation. Default is 1.
2914 Returns
2915 -------
2916 ndarray or float
2917 An array of the geometric standard deviation. If `axis` is None or `a`
2918 is a 1d array a float is returned.
2920 See Also
2921 --------
2922 gmean : Geometric mean
2923 numpy.std : Standard deviation
2925 Notes
2926 -----
2927 As the calculation requires the use of logarithms the geometric standard
2928 deviation only supports strictly positive values. Any non-positive or
2929 infinite values will raise a `ValueError`.
2930 The geometric standard deviation is sometimes confused with the exponent of
2931 the standard deviation, ``exp(std(a))``. Instead the geometric standard
2932 deviation is ``exp(std(log(a)))``.
2933 The default value for `ddof` is different from the default value (0) used
2934 by other ddof-containing functions, such as ``np.std`` and ``np.nanstd``.
2936 References
2937 ----------
2938 .. [1] Kirkwood, T. B., "Geometric means and measures of dispersion",
2939 Biometrics, vol. 35, pp. 908-909, 1979
2941 Examples
2942 --------
2943 Find the geometric standard deviation of a log-normally distributed sample.
2944 Note that the standard deviation of the distribution is one, on a
2945 log scale this evaluates to approximately ``exp(1)``.
2947 >>> import numpy as np
2948 >>> from scipy.stats import gstd
2949 >>> rng = np.random.default_rng()
2950 >>> sample = rng.lognormal(mean=0, sigma=1, size=1000)
2951 >>> gstd(sample)
2952 2.810010162475324
2954 Compute the geometric standard deviation of a multidimensional array and
2955 of a given axis.
2957 >>> a = np.arange(1, 25).reshape(2, 3, 4)
2958 >>> gstd(a, axis=None)
2959 2.2944076136018947
2960 >>> gstd(a, axis=2)
2961 array([[1.82424757, 1.22436866, 1.13183117],
2962 [1.09348306, 1.07244798, 1.05914985]])
2963 >>> gstd(a, axis=(1,2))
2964 array([2.12939215, 1.22120169])
2966 The geometric standard deviation further handles masked arrays.
2968 >>> a = np.arange(1, 25).reshape(2, 3, 4)
2969 >>> ma = np.ma.masked_where(a > 16, a)
2970 >>> ma
2971 masked_array(
2972 data=[[[1, 2, 3, 4],
2973 [5, 6, 7, 8],
2974 [9, 10, 11, 12]],
2975 [[13, 14, 15, 16],
2976 [--, --, --, --],
2977 [--, --, --, --]]],
2978 mask=[[[False, False, False, False],
2979 [False, False, False, False],
2980 [False, False, False, False]],
2981 [[False, False, False, False],
2982 [ True, True, True, True],
2983 [ True, True, True, True]]],
2984 fill_value=999999)
2985 >>> gstd(ma, axis=2)
2986 masked_array(
2987 data=[[1.8242475707663655, 1.2243686572447428, 1.1318311657788478],
2988 [1.0934830582350938, --, --]],
2989 mask=[[False, False, False],
2990 [False, True, True]],
2991 fill_value=999999)
2993 """
2994 a = np.asanyarray(a)
2995 log = ma.log if isinstance(a, ma.MaskedArray) else np.log
2997 try:
2998 with warnings.catch_warnings():
2999 warnings.simplefilter("error", RuntimeWarning)
3000 return np.exp(np.std(log(a), axis=axis, ddof=ddof))
3001 except RuntimeWarning as w:
3002 if np.isinf(a).any():
3003 raise ValueError(
3004 'Infinite value encountered. The geometric standard deviation '
3005 'is defined for strictly positive values only.'
3006 ) from w
3007 a_nan = np.isnan(a)
3008 a_nan_any = a_nan.any()
3009 # exclude NaN's from negativity check, but
3010 # avoid expensive masking for arrays with no NaN
3011 if ((a_nan_any and np.less_equal(np.nanmin(a), 0)) or
3012 (not a_nan_any and np.less_equal(a, 0).any())):
3013 raise ValueError(
3014 'Non positive value encountered. The geometric standard '
3015 'deviation is defined for strictly positive values only.'
3016 ) from w
3017 elif 'Degrees of freedom <= 0 for slice' == str(w):
3018 raise ValueError(w) from w
3019 else:
3020 # Remaining warnings don't need to be exceptions.
3021 return np.exp(np.std(log(a, where=~a_nan), axis=axis, ddof=ddof))
3022 except TypeError as e:
3023 raise ValueError(
3024 'Invalid array input. The inputs could not be '
3025 'safely coerced to any supported types') from e
3028# Private dictionary initialized only once at module level
3029# See https://en.wikipedia.org/wiki/Robust_measures_of_scale
3030_scale_conversions = {'raw': 1.0,
3031 'normal': special.erfinv(0.5) * 2.0 * math.sqrt(2.0)}
3034def iqr(x, axis=None, rng=(25, 75), scale=1.0, nan_policy='propagate',
3035 interpolation='linear', keepdims=False):
3036 r"""
3037 Compute the interquartile range of the data along the specified axis.
3039 The interquartile range (IQR) is the difference between the 75th and
3040 25th percentile of the data. It is a measure of the dispersion
3041 similar to standard deviation or variance, but is much more robust
3042 against outliers [2]_.
3044 The ``rng`` parameter allows this function to compute other
3045 percentile ranges than the actual IQR. For example, setting
3046 ``rng=(0, 100)`` is equivalent to `numpy.ptp`.
3048 The IQR of an empty array is `np.nan`.
3050 .. versionadded:: 0.18.0
3052 Parameters
3053 ----------
3054 x : array_like
3055 Input array or object that can be converted to an array.
3056 axis : int or sequence of int, optional
3057 Axis along which the range is computed. The default is to
3058 compute the IQR for the entire array.
3059 rng : two-element sequence of floats in range [0, 100], optional
3060 Percentiles over which to compute the range. Each must be
3061 between 0 and 100, inclusive. The default is the true IQR:
3062 ``(25, 75)``. The order of the elements is not important.
3063 scale : scalar or str, optional
3064 The numerical value of scale will be divided out of the final
3065 result. The following string values are recognized:
3067 * 'raw' : No scaling, just return the raw IQR.
3068 **Deprecated!** Use ``scale=1`` instead.
3069 * 'normal' : Scale by
3070 :math:`2 \sqrt{2} erf^{-1}(\frac{1}{2}) \approx 1.349`.
3072 The default is 1.0. The use of ``scale='raw'`` is deprecated in favor
3073 of ``scale=1`` and will raise an error in SciPy 1.12.0.
3074 Array-like `scale` is also allowed, as long
3075 as it broadcasts correctly to the output such that
3076 ``out / scale`` is a valid operation. The output dimensions
3077 depend on the input array, `x`, the `axis` argument, and the
3078 `keepdims` flag.
3079 nan_policy : {'propagate', 'raise', 'omit'}, optional
3080 Defines how to handle when input contains nan.
3081 The following options are available (default is 'propagate'):
3083 * 'propagate': returns nan
3084 * 'raise': throws an error
3085 * 'omit': performs the calculations ignoring nan values
3086 interpolation : str, optional
3088 Specifies the interpolation method to use when the percentile
3089 boundaries lie between two data points ``i`` and ``j``.
3090 The following options are available (default is 'linear'):
3092 * 'linear': ``i + (j - i)*fraction``, where ``fraction`` is the
3093 fractional part of the index surrounded by ``i`` and ``j``.
3094 * 'lower': ``i``.
3095 * 'higher': ``j``.
3096 * 'nearest': ``i`` or ``j`` whichever is nearest.
3097 * 'midpoint': ``(i + j)/2``.
3099 For NumPy >= 1.22.0, the additional options provided by the ``method``
3100 keyword of `numpy.percentile` are also valid.
3102 keepdims : bool, optional
3103 If this is set to True, the reduced axes are left in the
3104 result as dimensions with size one. With this option, the result
3105 will broadcast correctly against the original array `x`.
3107 Returns
3108 -------
3109 iqr : scalar or ndarray
3110 If ``axis=None``, a scalar is returned. If the input contains
3111 integers or floats of smaller precision than ``np.float64``, then the
3112 output data-type is ``np.float64``. Otherwise, the output data-type is
3113 the same as that of the input.
3115 See Also
3116 --------
3117 numpy.std, numpy.var
3119 References
3120 ----------
3121 .. [1] "Interquartile range" https://en.wikipedia.org/wiki/Interquartile_range
3122 .. [2] "Robust measures of scale" https://en.wikipedia.org/wiki/Robust_measures_of_scale
3123 .. [3] "Quantile" https://en.wikipedia.org/wiki/Quantile
3125 Examples
3126 --------
3127 >>> import numpy as np
3128 >>> from scipy.stats import iqr
3129 >>> x = np.array([[10, 7, 4], [3, 2, 1]])
3130 >>> x
3131 array([[10, 7, 4],
3132 [ 3, 2, 1]])
3133 >>> iqr(x)
3134 4.0
3135 >>> iqr(x, axis=0)
3136 array([ 3.5, 2.5, 1.5])
3137 >>> iqr(x, axis=1)
3138 array([ 3., 1.])
3139 >>> iqr(x, axis=1, keepdims=True)
3140 array([[ 3.],
3141 [ 1.]])
3143 """
3144 x = asarray(x)
3146 # This check prevents percentile from raising an error later. Also, it is
3147 # consistent with `np.var` and `np.std`.
3148 if not x.size:
3149 return np.nan
3151 # An error may be raised here, so fail-fast, before doing lengthy
3152 # computations, even though `scale` is not used until later
3153 if isinstance(scale, str):
3154 scale_key = scale.lower()
3155 if scale_key not in _scale_conversions:
3156 raise ValueError("{0} not a valid scale for `iqr`".format(scale))
3157 if scale_key == 'raw':
3158 msg = ("The use of 'scale=\"raw\"' is deprecated infavor of "
3159 "'scale=1' and will raise an error in SciPy 1.12.0.")
3160 warnings.warn(msg, DeprecationWarning, stacklevel=2)
3161 scale = _scale_conversions[scale_key]
3163 # Select the percentile function to use based on nans and policy
3164 contains_nan, nan_policy = _contains_nan(x, nan_policy)
3166 if contains_nan and nan_policy == 'omit':
3167 percentile_func = np.nanpercentile
3168 else:
3169 percentile_func = np.percentile
3171 if len(rng) != 2:
3172 raise TypeError("quantile range must be two element sequence")
3174 if np.isnan(rng).any():
3175 raise ValueError("range must not contain NaNs")
3177 rng = sorted(rng)
3178 if NumpyVersion(np.__version__) >= '1.22.0':
3179 pct = percentile_func(x, rng, axis=axis, method=interpolation,
3180 keepdims=keepdims)
3181 else:
3182 pct = percentile_func(x, rng, axis=axis, interpolation=interpolation,
3183 keepdims=keepdims)
3184 out = np.subtract(pct[1], pct[0])
3186 if scale != 1.0:
3187 out /= scale
3189 return out
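# A minimal sketch (hypothetical helper, not part of the module) of the
# 'normal' scaling defined above: dividing the IQR by 2*sqrt(2)*erfinv(0.5)
# (about 1.349) turns it into a robust estimate of the standard deviation, so
# for a large normal sample the scaled IQR is close to the true sigma.
def _sketch_iqr_normal_scale(seed=0):
    rng = np.random.default_rng(seed)
    x = rng.normal(loc=0.0, scale=2.0, size=100_000)
    scaled = iqr(x, scale='normal')
    # Expect roughly 2.0 (the true sigma), up to sampling error.
    return scaled, np.std(x)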
3192def _mad_1d(x, center, nan_policy):
3193 # Median absolute deviation for 1-d array x.
3194 # This is a helper function for `median_abs_deviation`; it assumes its
3195 # arguments have been validated already. In particular, x must be a
3196 # 1-d numpy array, center must be callable, and if nan_policy is not
3197 # 'propagate', it is assumed to be 'omit', because 'raise' is handled
3198 # in `median_abs_deviation`.
3199 # No warning is generated if x is empty or all nan.
3200 isnan = np.isnan(x)
3201 if isnan.any():
3202 if nan_policy == 'propagate':
3203 return np.nan
3204 x = x[~isnan]
3205 if x.size == 0:
3206 # MAD of an empty array is nan.
3207 return np.nan
3208 # Edge cases have been handled, so do the basic MAD calculation.
3209 med = center(x)
3210 mad = np.median(np.abs(x - med))
3211 return mad
3214def median_abs_deviation(x, axis=0, center=np.median, scale=1.0,
3215 nan_policy='propagate'):
3216 r"""
3217 Compute the median absolute deviation of the data along the given axis.
3219 The median absolute deviation (MAD, [1]_) computes the median over the
3220 absolute deviations from the median. It is a measure of dispersion
3221 similar to the standard deviation but more robust to outliers [2]_.
3223 The MAD of an empty array is ``np.nan``.
3225 .. versionadded:: 1.5.0
3227 Parameters
3228 ----------
3229 x : array_like
3230 Input array or object that can be converted to an array.
3231 axis : int or None, optional
3232 Axis along which the range is computed. Default is 0. If None, compute
3233 the MAD over the entire array.
3234 center : callable, optional
3235 A function that will return the central value. The default is to use
3236 np.median. Any user-defined function must have the
3237 signature ``func(arr, axis)``.
3238 scale : scalar or str, optional
3239 The numerical value of scale will be divided out of the final
3240 result. The default is 1.0. The string "normal" is also accepted,
3241 and results in `scale` being the inverse of the standard normal
3242 quantile function at 0.75, which is approximately 0.67449.
3243 Array-like scale is also allowed, as long as it broadcasts correctly
3244 to the output such that ``out / scale`` is a valid operation. The
3245 output dimensions depend on the input array, `x`, and the `axis`
3246 argument.
3247 nan_policy : {'propagate', 'raise', 'omit'}, optional
3248 Defines how to handle when input contains nan.
3249 The following options are available (default is 'propagate'):
3251 * 'propagate': returns nan
3252 * 'raise': throws an error
3253 * 'omit': performs the calculations ignoring nan values
3255 Returns
3256 -------
3257 mad : scalar or ndarray
3258 If ``axis=None``, a scalar is returned. If the input contains
3259 integers or floats of smaller precision than ``np.float64``, then the
3260 output data-type is ``np.float64``. Otherwise, the output data-type is
3261 the same as that of the input.
3263 See Also
3264 --------
3265 numpy.std, numpy.var, numpy.median, scipy.stats.iqr, scipy.stats.tmean,
3266 scipy.stats.tstd, scipy.stats.tvar
3268 Notes
3269 -----
3270 The `center` argument only affects the calculation of the central value
3271 around which the MAD is calculated. That is, passing in ``center=np.mean``
3272 will calculate the MAD around the mean - it will not calculate the *mean*
3273 absolute deviation.
3275 The input array may contain `inf`, but if `center` returns `inf`, the
3276 corresponding MAD for that data will be `nan`.
3278 References
3279 ----------
3280 .. [1] "Median absolute deviation",
3281 https://en.wikipedia.org/wiki/Median_absolute_deviation
3282 .. [2] "Robust measures of scale",
3283 https://en.wikipedia.org/wiki/Robust_measures_of_scale
3285 Examples
3286 --------
3287 When comparing the behavior of `median_abs_deviation` with ``np.std``,
3288 the latter is affected when we change a single value of an array to have an
3289 outlier value while the MAD hardly changes:
3291 >>> import numpy as np
3292 >>> from scipy import stats
3293 >>> x = stats.norm.rvs(size=100, scale=1, random_state=123456)
3294 >>> x.std()
3295 0.9973906394005013
3296 >>> stats.median_abs_deviation(x)
3297 0.82832610097857
3298 >>> x[0] = 345.6
3299 >>> x.std()
3300 34.42304872314415
3301 >>> stats.median_abs_deviation(x)
3302 0.8323442311590675
3304 Axis handling example:
3306 >>> x = np.array([[10, 7, 4], [3, 2, 1]])
3307 >>> x
3308 array([[10, 7, 4],
3309 [ 3, 2, 1]])
3310 >>> stats.median_abs_deviation(x)
3311 array([3.5, 2.5, 1.5])
3312 >>> stats.median_abs_deviation(x, axis=None)
3313 2.0
3315 Scale normal example:
3317 >>> x = stats.norm.rvs(size=1000000, scale=2, random_state=123456)
3318 >>> stats.median_abs_deviation(x)
3319 1.3487398527041636
3320 >>> stats.median_abs_deviation(x, scale='normal')
3321 1.9996446978061115
3323 """
3324 if not callable(center):
3325 raise TypeError("The argument 'center' must be callable. The given "
3326 f"value {repr(center)} is not callable.")
3328 # An error may be raised here, so fail-fast, before doing lengthy
3329 # computations, even though `scale` is not used until later
3330 if isinstance(scale, str):
3331 if scale.lower() == 'normal':
3332 scale = 0.6744897501960817 # special.ndtri(0.75)
3333 else:
3334 raise ValueError(f"{scale} is not a valid scale value.")
3336 x = asarray(x)
3338 # Consistent with `np.var` and `np.std`.
3339 if not x.size:
3340 if axis is None:
3341 return np.nan
3342 nan_shape = tuple(item for i, item in enumerate(x.shape) if i != axis)
3343 if nan_shape == ():
3344 # Return nan, not array(nan)
3345 return np.nan
3346 return np.full(nan_shape, np.nan)
3348 contains_nan, nan_policy = _contains_nan(x, nan_policy)
3350 if contains_nan:
3351 if axis is None:
3352 mad = _mad_1d(x.ravel(), center, nan_policy)
3353 else:
3354 mad = np.apply_along_axis(_mad_1d, axis, x, center, nan_policy)
3355 else:
3356 if axis is None:
3357 med = center(x, axis=None)
3358 mad = np.median(np.abs(x - med))
3359 else:
3360 # Wrap the call to center() in expand_dims() so it acts like
3361 # keepdims=True was used.
3362 med = np.expand_dims(center(x, axis=axis), axis)
3363 mad = np.median(np.abs(x - med), axis=axis)
3365 return mad / scale
3368#####################################
3369# TRIMMING FUNCTIONS #
3370#####################################
3373SigmaclipResult = namedtuple('SigmaclipResult', ('clipped', 'lower', 'upper'))
3376def sigmaclip(a, low=4., high=4.):
3377 """Perform iterative sigma-clipping of array elements.
3379 Starting from the full sample, all elements outside the critical range are
3380 removed, i.e. all elements of the input array `c` that satisfy either of
3381 the following conditions::
3383 c < mean(c) - std(c)*low
3384 c > mean(c) + std(c)*high
3386 The iteration continues with the updated sample until no
3387 elements are outside the (updated) range.
3389 Parameters
3390 ----------
3391 a : array_like
3392 Data array, will be raveled if not 1-D.
3393 low : float, optional
3394 Lower bound factor of sigma clipping. Default is 4.
3395 high : float, optional
3396 Upper bound factor of sigma clipping. Default is 4.
3398 Returns
3399 -------
3400 clipped : ndarray
3401 Input array with clipped elements removed.
3402 lower : float
3403 Lower threshold value used for clipping.
3404 upper : float
3405 Upper threshold value used for clipping.
3407 Examples
3408 --------
3409 >>> import numpy as np
3410 >>> from scipy.stats import sigmaclip
3411 >>> a = np.concatenate((np.linspace(9.5, 10.5, 31),
3412 ... np.linspace(0, 20, 5)))
3413 >>> fact = 1.5
3414 >>> c, low, upp = sigmaclip(a, fact, fact)
3415 >>> c
3416 array([ 9.96666667, 10. , 10.03333333, 10. ])
3417 >>> c.var(), c.std()
3418 (0.00055555555555555165, 0.023570226039551501)
3419 >>> low, c.mean() - fact*c.std(), c.min()
3420 (9.9646446609406727, 9.9646446609406727, 9.9666666666666668)
3421 >>> upp, c.mean() + fact*c.std(), c.max()
3422 (10.035355339059327, 10.035355339059327, 10.033333333333333)
3424 >>> a = np.concatenate((np.linspace(9.5, 10.5, 11),
3425 ... np.linspace(-100, -50, 3)))
3426 >>> c, low, upp = sigmaclip(a, 1.8, 1.8)
3427 >>> (c == np.linspace(9.5, 10.5, 11)).all()
3428 True
3430 """
3431 c = np.asarray(a).ravel()
3432 delta = 1
3433 while delta:
3434 c_std = c.std()
3435 c_mean = c.mean()
3436 size = c.size
3437 critlower = c_mean - c_std * low
3438 critupper = c_mean + c_std * high
3439 c = c[(c >= critlower) & (c <= critupper)]
3440 delta = size - c.size
3442 return SigmaclipResult(c, critlower, critupper)
3445def trimboth(a, proportiontocut, axis=0):
3446 """Slice off a proportion of items from both ends of an array.
3448 Slice off the passed proportion of items from both ends of the passed
3449 array (i.e., with `proportiontocut` = 0.1, slices leftmost 10% **and**
3450 rightmost 10% of scores). The trimmed values are the lowest and
3451 highest ones.
3452 Slice off less if proportion results in a non-integer slice index (i.e.
3453 conservatively slices off `proportiontocut`).
3455 Parameters
3456 ----------
3457 a : array_like
3458 Data to trim.
3459 proportiontocut : float
3460 Proportion (in range 0-1) of total data set to trim off each end.
3461 axis : int or None, optional
3462 Axis along which to trim data. Default is 0. If None, compute over
3463 the whole array `a`.
3465 Returns
3466 -------
3467 out : ndarray
3468 Trimmed version of array `a`. The order of the trimmed content
3469 is undefined.
3471 See Also
3472 --------
3473 trim_mean
3475 Examples
3476 --------
3477 Create an array of 10 values and trim 10% of those values from each end:
3479 >>> import numpy as np
3480 >>> from scipy import stats
3481 >>> a = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
3482 >>> stats.trimboth(a, 0.1)
3483 array([1, 3, 2, 4, 5, 6, 7, 8])
3485 Note that the elements of the input array are trimmed by value, but the
3486 output array is not necessarily sorted.
3488 The number of values to trim from each end is rounded down to the nearest integer. For
3489 instance, trimming 25% of the values from each end of an array of 10
3490 values will return an array of 6 values:
3492 >>> b = np.arange(10)
3493 >>> stats.trimboth(b, 1/4).shape
3494 (6,)
3496 Multidimensional arrays can be trimmed along any axis or across the entire
3497 array:
3499 >>> c = [2, 4, 6, 8, 0, 1, 3, 5, 7, 9]
3500 >>> d = np.array([a, b, c])
3501 >>> stats.trimboth(d, 0.4, axis=0).shape
3502 (1, 10)
3503 >>> stats.trimboth(d, 0.4, axis=1).shape
3504 (3, 2)
3505 >>> stats.trimboth(d, 0.4, axis=None).shape
3506 (6,)
3508 """
3509 a = np.asarray(a)
3511 if a.size == 0:
3512 return a
3514 if axis is None:
3515 a = a.ravel()
3516 axis = 0
3518 nobs = a.shape[axis]
3519 lowercut = int(proportiontocut * nobs)
3520 uppercut = nobs - lowercut
3521 if (lowercut >= uppercut):
3522 raise ValueError("Proportion too big.")
3524 atmp = np.partition(a, (lowercut, uppercut - 1), axis)
3526 sl = [slice(None)] * atmp.ndim
3527 sl[axis] = slice(lowercut, uppercut)
3528 return atmp[tuple(sl)]
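# Illustrative sketch (not part of the SciPy source): np.partition above only
# guarantees that the cut points end up in sorted position, which is all that
# trimming needs. The hypothetical helper below checks that the kept values
# match a full sort followed by slicing.
def _sketch_trimboth_vs_sorting():
    # Hypothetical demonstration only.
    import numpy as np
    from scipy import stats
    a = np.array([2, 4, 6, 8, 0, 1, 3, 5, 7, 9])
    proportiontocut = 0.2
    lowercut = int(proportiontocut * a.size)              # 2 values per end
    expected = np.sort(a)[lowercut:a.size - lowercut]
    assert np.array_equal(np.sort(stats.trimboth(a, proportiontocut)), expected)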
3531def trim1(a, proportiontocut, tail='right', axis=0):
3532 """Slice off a proportion from ONE end of the passed array distribution.
3534 If `proportiontocut` = 0.1, slices off 'leftmost' or 'rightmost'
3535 10% of scores. The lowest or highest values are trimmed (depending on
3536 the tail).
3537 Slice off less if proportion results in a non-integer slice index
3538 (i.e. conservatively slices off `proportiontocut` ).
3540 Parameters
3541 ----------
3542 a : array_like
3543 Input array.
3544 proportiontocut : float
3545 Fraction to cut off of 'left' or 'right' of distribution.
3546 tail : {'left', 'right'}, optional
3547 Defaults to 'right'.
3548 axis : int or None, optional
3549 Axis along which to trim data. Default is 0. If None, compute over
3550 the whole array `a`.
3552 Returns
3553 -------
3554 trim1 : ndarray
3555 Trimmed version of array `a`. The order of the trimmed content is
3556 undefined.
3558 Examples
3559 --------
3560 Create an array of 10 values and trim 20% of its lowest values:
3562 >>> import numpy as np
3563 >>> from scipy import stats
3564 >>> a = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
3565 >>> stats.trim1(a, 0.2, 'left')
3566 array([2, 4, 3, 5, 6, 7, 8, 9])
3568 Note that the elements of the input array are trimmed by value, but the
3569 output array is not necessarily sorted.
3571 The number of values to trim is rounded down to the nearest integer. For
3572 instance, trimming 25% of the values from an array of 10 values will
3573 return an array of 8 values:
3575 >>> b = np.arange(10)
3576 >>> stats.trim1(b, 1/4).shape
3577 (8,)
3579 Multidimensional arrays can be trimmed along any axis or across the entire
3580 array:
3582 >>> c = [2, 4, 6, 8, 0, 1, 3, 5, 7, 9]
3583 >>> d = np.array([a, b, c])
3584 >>> stats.trim1(d, 0.8, axis=0).shape
3585 (1, 10)
3586 >>> stats.trim1(d, 0.8, axis=1).shape
3587 (3, 2)
3588 >>> stats.trim1(d, 0.8, axis=None).shape
3589 (6,)
3591 """
3592 a = np.asarray(a)
3593 if axis is None:
3594 a = a.ravel()
3595 axis = 0
3597 nobs = a.shape[axis]
3599 # avoid possible corner case
3600 if proportiontocut >= 1:
3601 return []
3603 if tail.lower() == 'right':
3604 lowercut = 0
3605 uppercut = nobs - int(proportiontocut * nobs)
3607 elif tail.lower() == 'left':
3608 lowercut = int(proportiontocut * nobs)
3609 uppercut = nobs
3611 atmp = np.partition(a, (lowercut, uppercut - 1), axis)
3613 sl = [slice(None)] * atmp.ndim
3614 sl[axis] = slice(lowercut, uppercut)
3615 return atmp[tuple(sl)]
3618def trim_mean(a, proportiontocut, axis=0):
3619 """Return mean of array after trimming distribution from both tails.
3621 If `proportiontocut` = 0.1, slices off 'leftmost' and 'rightmost' 10% of
3622 scores. The input is sorted before slicing. Slices off less if proportion
3623 results in a non-integer slice index (i.e., conservatively slices off
3624 `proportiontocut` ).
3626 Parameters
3627 ----------
3628 a : array_like
3629 Input array.
3630 proportiontocut : float
3631 Fraction to cut off of both tails of the distribution.
3632 axis : int or None, optional
3633 Axis along which the trimmed means are computed. Default is 0.
3634 If None, compute over the whole array `a`.
3636 Returns
3637 -------
3638 trim_mean : ndarray
3639 Mean of trimmed array.
3641 See Also
3642 --------
3643 trimboth
3644 tmean : Compute the trimmed mean ignoring values outside given `limits`.
3646 Examples
3647 --------
3648 >>> import numpy as np
3649 >>> from scipy import stats
3650 >>> x = np.arange(20)
3651 >>> stats.trim_mean(x, 0.1)
3652 9.5
3653 >>> x2 = x.reshape(5, 4)
3654 >>> x2
3655 array([[ 0, 1, 2, 3],
3656 [ 4, 5, 6, 7],
3657 [ 8, 9, 10, 11],
3658 [12, 13, 14, 15],
3659 [16, 17, 18, 19]])
3660 >>> stats.trim_mean(x2, 0.25)
3661 array([ 8., 9., 10., 11.])
3662 >>> stats.trim_mean(x2, 0.25, axis=1)
3663 array([ 1.5, 5.5, 9.5, 13.5, 17.5])
3665 """
3666 a = np.asarray(a)
3668 if a.size == 0:
3669 return np.nan
3671 if axis is None:
3672 a = a.ravel()
3673 axis = 0
3675 nobs = a.shape[axis]
3676 lowercut = int(proportiontocut * nobs)
3677 uppercut = nobs - lowercut
3678 if (lowercut > uppercut):
3679 raise ValueError("Proportion too big.")
3681 atmp = np.partition(a, (lowercut, uppercut - 1), axis)
3683 sl = [slice(None)] * atmp.ndim
3684 sl[axis] = slice(lowercut, uppercut)
3685 return np.mean(atmp[tuple(sl)], axis=axis)
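# Illustrative sketch (not part of the SciPy source): the hypothetical helper
# below checks that the trimmed mean equals the plain mean of the sorted data
# with `lowercut` values dropped from each end.
def _sketch_trim_mean_by_hand():
    # Hypothetical demonstration only.
    import numpy as np
    from scipy import stats
    x = np.arange(20)
    proportiontocut = 0.1
    lowercut = int(proportiontocut * x.size)              # 2 values per end
    by_hand = np.sort(x)[lowercut:x.size - lowercut].mean()
    assert np.isclose(stats.trim_mean(x, proportiontocut), by_hand)  # 9.5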
3688F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue'))
3691def _create_f_oneway_nan_result(shape, axis):
3692 """
3693 This is a helper function for f_oneway for creating the return values
3694 in certain degenerate conditions. It creates return values that are
3695 all nan with the appropriate shape for the given `shape` and `axis`.
3696 """
3697 axis = np.core.multiarray.normalize_axis_index(axis, len(shape))
3698 shp = shape[:axis] + shape[axis+1:]
3699 if shp == ():
3700 f = np.nan
3701 prob = np.nan
3702 else:
3703 f = np.full(shp, fill_value=np.nan)
3704 prob = f.copy()
3705 return F_onewayResult(f, prob)
3708def _first(arr, axis):
3709 """Return arr[..., 0:1, ...] where 0:1 is in the `axis` position."""
3710 return np.take_along_axis(arr, np.array(0, ndmin=arr.ndim), axis)
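# Illustrative sketch (not part of the SciPy source): `np.array(0, ndmin=n)`
# used above builds an all-ones-shaped integer index, so take_along_axis keeps
# the leading slice while preserving the number of dimensions. The helper name
# below is hypothetical.
def _sketch_first_slice():
    # Hypothetical demonstration only.
    import numpy as np
    arr = np.arange(12).reshape(3, 4)
    idx = np.array(0, ndmin=arr.ndim)                     # shape (1, 1)
    out = np.take_along_axis(arr, idx, axis=1)            # same as arr[:, 0:1]
    assert np.array_equal(out, arr[:, 0:1])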
3713def f_oneway(*samples, axis=0):
3714 """Perform one-way ANOVA.
3716 The one-way ANOVA tests the null hypothesis that two or more groups have
3717 the same population mean. The test is applied to samples from two or
3718 more groups, possibly with differing sizes.
3720 Parameters
3721 ----------
3722 sample1, sample2, ... : array_like
3723 The sample measurements for each group. There must be at least
3724 two arguments. If the arrays are multidimensional, then all the
3725 dimensions of the array must be the same except for `axis`.
3726 axis : int, optional
3727 Axis of the input arrays along which the test is applied.
3728 Default is 0.
3730 Returns
3731 -------
3732 statistic : float
3733 The computed F statistic of the test.
3734 pvalue : float
3735 The associated p-value from the F distribution.
3737 Warns
3738 -----
3739 `~scipy.stats.ConstantInputWarning`
3740 Raised if all values within each of the input arrays are identical.
3741 In this case the F statistic is either infinite or isn't defined,
3742 so ``np.inf`` or ``np.nan`` is returned.
3744 `~scipy.stats.DegenerateDataWarning`
3745 Raised if the length of any input array is 0, or if all the input
3746 arrays have length 1. ``np.nan`` is returned for the F statistic
3747 and the p-value in these cases.
3749 Notes
3750 -----
3751 The ANOVA test has important assumptions that must be satisfied in order
3752 for the associated p-value to be valid.
3754 1. The samples are independent.
3755 2. Each sample is from a normally distributed population.
3756 3. The population standard deviations of the groups are all equal. This
3757 property is known as homoscedasticity.
3759 If these assumptions are not true for a given set of data, it may still
3760 be possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`) or
3761 the Alexander-Govern test (`scipy.stats.alexandergovern`) although with
3762 some loss of power.
3764 The length of each group must be at least one, and there must be at
3765 least one group with length greater than one. If these conditions
3766 are not satisfied, a warning is generated and (``np.nan``, ``np.nan``)
3767 is returned.
3769 If all values in each group are identical, and there exist at least two
3770 groups with different values, the function generates a warning and
3771 returns (``np.inf``, 0).
3773 If all values in all groups are the same, the function generates a warning
3774 and returns (``np.nan``, ``np.nan``).
3776 The algorithm is from Heiman [2]_, pp.394-7.
3778 References
3779 ----------
3780 .. [1] R. Lowry, "Concepts and Applications of Inferential Statistics",
3781 Chapter 14, 2014, http://vassarstats.net/textbook/
3783 .. [2] G.W. Heiman, "Understanding research methods and statistics: An
3784 integrated introduction for psychology", Houghton, Mifflin and
3785 Company, 2001.
3787 .. [3] G.H. McDonald, "Handbook of Biological Statistics", One-way ANOVA.
3788 http://www.biostathandbook.com/onewayanova.html
3790 Examples
3791 --------
3792 >>> import numpy as np
3793 >>> from scipy.stats import f_oneway
3795 Here are some data [3]_ on a shell measurement (the length of the anterior
3796 adductor muscle scar, standardized by dividing by length) in the mussel
3797 Mytilus trossulus from five locations: Tillamook, Oregon; Newport, Oregon;
3798 Petersburg, Alaska; Magadan, Russia; and Tvarminne, Finland, taken from a
3799 much larger data set used in McDonald et al. (1991).
3801 >>> tillamook = [0.0571, 0.0813, 0.0831, 0.0976, 0.0817, 0.0859, 0.0735,
3802 ... 0.0659, 0.0923, 0.0836]
3803 >>> newport = [0.0873, 0.0662, 0.0672, 0.0819, 0.0749, 0.0649, 0.0835,
3804 ... 0.0725]
3805 >>> petersburg = [0.0974, 0.1352, 0.0817, 0.1016, 0.0968, 0.1064, 0.105]
3806 >>> magadan = [0.1033, 0.0915, 0.0781, 0.0685, 0.0677, 0.0697, 0.0764,
3807 ... 0.0689]
3808 >>> tvarminne = [0.0703, 0.1026, 0.0956, 0.0973, 0.1039, 0.1045]
3809 >>> f_oneway(tillamook, newport, petersburg, magadan, tvarminne)
3810 F_onewayResult(statistic=7.121019471642447, pvalue=0.0002812242314534544)
3812 `f_oneway` accepts multidimensional input arrays. When the inputs
3813 are multidimensional and `axis` is not given, the test is performed
3814 along the first axis of the input arrays. For the following data, the
3815 test is performed three times, once for each column.
3817 >>> a = np.array([[9.87, 9.03, 6.81],
3818 ... [7.18, 8.35, 7.00],
3819 ... [8.39, 7.58, 7.68],
3820 ... [7.45, 6.33, 9.35],
3821 ... [6.41, 7.10, 9.33],
3822 ... [8.00, 8.24, 8.44]])
3823 >>> b = np.array([[6.35, 7.30, 7.16],
3824 ... [6.65, 6.68, 7.63],
3825 ... [5.72, 7.73, 6.72],
3826 ... [7.01, 9.19, 7.41],
3827 ... [7.75, 7.87, 8.30],
3828 ... [6.90, 7.97, 6.97]])
3829 >>> c = np.array([[3.31, 8.77, 1.01],
3830 ... [8.25, 3.24, 3.62],
3831 ... [6.32, 8.81, 5.19],
3832 ... [7.48, 8.83, 8.91],
3833 ... [8.59, 6.01, 6.07],
3834 ... [3.07, 9.72, 7.48]])
3835 >>> F, p = f_oneway(a, b, c)
3836 >>> F
3837 array([1.75676344, 0.03701228, 3.76439349])
3838 >>> p
3839 array([0.20630784, 0.96375203, 0.04733157])
3841 """
3842 if len(samples) < 2:
3843 raise TypeError('at least two inputs are required;'
3844 f' got {len(samples)}.')
3846 samples = [np.asarray(sample, dtype=float) for sample in samples]
3848 # ANOVA on N groups, each in its own array
3849 num_groups = len(samples)
3851 # We haven't explicitly validated axis, but if it is bad, this call of
3852 # np.concatenate will raise np.AxisError. The call will raise ValueError
3853 # if the dimensions of all the arrays, except the axis dimension, are not
3854 # the same.
3855 alldata = np.concatenate(samples, axis=axis)
3856 bign = alldata.shape[axis]
3858 # Check this after forming alldata, so shape errors are detected
3859 # and reported before checking for 0 length inputs.
3860 if any(sample.shape[axis] == 0 for sample in samples):
3861 warnings.warn(stats.DegenerateDataWarning('at least one input '
3862 'has length 0'))
3863 return _create_f_oneway_nan_result(alldata.shape, axis)
3865 # Must have at least one group with length greater than 1.
3866 if all(sample.shape[axis] == 1 for sample in samples):
3867 msg = ('all input arrays have length 1. f_oneway requires that at '
3868 'least one input has length greater than 1.')
3869 warnings.warn(stats.DegenerateDataWarning(msg))
3870 return _create_f_oneway_nan_result(alldata.shape, axis)
3872 # Check if all values within each group are identical, and if the common
3873 # value in at least one group is different from that in another group.
3874 # Based on https://github.com/scipy/scipy/issues/11669
3876 # If axis=0, say, and the groups have shape (n0, ...), (n1, ...), ...,
3877 # then is_const is a boolean array with shape (num_groups, ...).
3878 # It is True if the values within the groups along the axis slice are
3879 # identical. In the typical case where each input array is 1-d, is_const is
3880 # a 1-d array with length num_groups.
3881 is_const = np.concatenate(
3882 [(_first(sample, axis) == sample).all(axis=axis,
3883 keepdims=True)
3884 for sample in samples],
3885 axis=axis
3886 )
3888 # all_const is a boolean array with shape (...) (see previous comment).
3889 # It is True if the values within each group along the axis slice are
3890 # the same (e.g. [[3, 3, 3], [5, 5, 5, 5], [4, 4, 4]]).
3891 all_const = is_const.all(axis=axis)
3892 if all_const.any():
3893 msg = ("Each of the input arrays is constant; "
3894 "the F statistic is not defined or infinite")
3895 warnings.warn(stats.ConstantInputWarning(msg))
3897 # all_same_const is True if all the values in the groups along the axis=0
3898 # slice are the same (e.g. [[3, 3, 3], [3, 3, 3, 3], [3, 3, 3]]).
3899 all_same_const = (_first(alldata, axis) == alldata).all(axis=axis)
3901 # Determine the mean of the data, and subtract that from all inputs to a
3902 # variance (via sum_of_sq / sq_of_sum) calculation. Variance is invariant
3903 # to a shift in location, and centering all data around zero vastly
3904 # improves numerical stability.
3905 offset = alldata.mean(axis=axis, keepdims=True)
3906 alldata -= offset
3908 normalized_ss = _square_of_sums(alldata, axis=axis) / bign
3910 sstot = _sum_of_squares(alldata, axis=axis) - normalized_ss
3912 ssbn = 0
3913 for sample in samples:
3914 ssbn += _square_of_sums(sample - offset,
3915 axis=axis) / sample.shape[axis]
3917 # Naming: variables ending in bn/b are for "between treatments", wn/w are
3918 # for "within treatments"
3919 ssbn -= normalized_ss
3920 sswn = sstot - ssbn
3921 dfbn = num_groups - 1
3922 dfwn = bign - num_groups
3923 msb = ssbn / dfbn
3924 msw = sswn / dfwn
3925 with np.errstate(divide='ignore', invalid='ignore'):
3926 f = msb / msw
3928 prob = special.fdtrc(dfbn, dfwn, f) # equivalent to stats.f.sf
3930 # Fix any f values that should be inf or nan because the corresponding
3931 # inputs were constant.
3932 if np.isscalar(f):
3933 if all_same_const:
3934 f = np.nan
3935 prob = np.nan
3936 elif all_const:
3937 f = np.inf
3938 prob = 0.0
3939 else:
3940 f[all_const] = np.inf
3941 prob[all_const] = 0.0
3942 f[all_same_const] = np.nan
3943 prob[all_same_const] = np.nan
3945 return F_onewayResult(f, prob)
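# Illustrative sketch (not part of the SciPy source): the hypothetical helper
# below recomputes the one-way ANOVA F statistic from the between-groups and
# within-groups sums of squares for three small samples and compares it with
# `f_oneway` and the F distribution's survival function.
def _sketch_f_oneway_by_hand():
    # Hypothetical demonstration only; the data are made up.
    import numpy as np
    from scipy import stats
    groups = [np.array([6.9, 5.4, 5.8, 4.6, 4.0]),
              np.array([8.3, 6.8, 7.8, 9.2, 6.5]),
              np.array([8.0, 10.5, 8.1, 6.9, 9.3])]
    alldata = np.concatenate(groups)
    grand_mean = alldata.mean()
    ssbn = sum(g.size * (g.mean() - grand_mean) ** 2 for g in groups)
    sswn = sum(((g - g.mean()) ** 2).sum() for g in groups)
    dfbn, dfwn = len(groups) - 1, alldata.size - len(groups)
    f = (ssbn / dfbn) / (sswn / dfwn)
    p = stats.f.sf(f, dfbn, dfwn)
    res = stats.f_oneway(*groups)
    assert np.isclose(f, res.statistic) and np.isclose(p, res.pvalue)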
3948def alexandergovern(*samples, nan_policy='propagate'):
3949 """Performs the Alexander Govern test.
3951 The Alexander-Govern approximation tests the equality of k independent
3952 means in the face of heterogeneity of variance. The test is applied to
3953 samples from two or more groups, possibly with differing sizes.
3955 Parameters
3956 ----------
3957 sample1, sample2, ... : array_like
3958 The sample measurements for each group. There must be at least
3959 two samples.
3960 nan_policy : {'propagate', 'raise', 'omit'}, optional
3961 Defines how to handle when input contains nan.
3962 The following options are available (default is 'propagate'):
3964 * 'propagate': returns nan
3965 * 'raise': throws an error
3966 * 'omit': performs the calculations ignoring nan values
3968 Returns
3969 -------
3970 statistic : float
3971 The computed A statistic of the test.
3972 pvalue : float
3973 The associated p-value from the chi-squared distribution.
3975 Warns
3976 -----
3977 `~scipy.stats.ConstantInputWarning`
3978 Raised if an input is a constant array. The statistic is not defined
3979 in this case, so ``np.nan`` is returned.
3981 See Also
3982 --------
3983 f_oneway : one-way ANOVA
3985 Notes
3986 -----
3987 The use of this test relies on several assumptions.
3989 1. The samples are independent.
3990 2. Each sample is from a normally distributed population.
3991 3. Unlike `f_oneway`, this test does not assume homoscedasticity;
3992 instead, it relaxes the assumption of equal variances.
3994 Input samples must be finite, one dimensional, and have size greater than
3995 one.
3997 References
3998 ----------
3999 .. [1] Alexander, Ralph A., and Diane M. Govern. "A New and Simpler
4000 Approximation for ANOVA under Variance Heterogeneity." Journal
4001 of Educational Statistics, vol. 19, no. 2, 1994, pp. 91-101.
4002 JSTOR, www.jstor.org/stable/1165140. Accessed 12 Sept. 2020.
4004 Examples
4005 --------
4006 >>> from scipy.stats import alexandergovern
4008 Here are some data on annual percentage rate of interest charged on
4009 new car loans at nine of the largest banks in four American cities
4010 taken from the National Institute of Standards and Technology's
4011 ANOVA dataset.
4013 We use `alexandergovern` to test the null hypothesis that all cities
4014 have the same mean APR against the alternative that the cities do not
4015 all have the same mean APR. We decide that a significance level of 5%
4016 is required to reject the null hypothesis in favor of the alternative.
4018 >>> atlanta = [13.75, 13.75, 13.5, 13.5, 13.0, 13.0, 13.0, 12.75, 12.5]
4019 >>> chicago = [14.25, 13.0, 12.75, 12.5, 12.5, 12.4, 12.3, 11.9, 11.9]
4020 >>> houston = [14.0, 14.0, 13.51, 13.5, 13.5, 13.25, 13.0, 12.5, 12.5]
4021 >>> memphis = [15.0, 14.0, 13.75, 13.59, 13.25, 12.97, 12.5, 12.25,
4022 ... 11.89]
4023 >>> alexandergovern(atlanta, chicago, houston, memphis)
4024 AlexanderGovernResult(statistic=4.65087071883494,
4025 pvalue=0.19922132490385214)
4027 The p-value is 0.1992, indicating a nearly 20% chance of observing
4028 such an extreme value of the test statistic under the null hypothesis.
4029 This exceeds 5%, so we do not reject the null hypothesis in favor of
4030 the alternative.
4032 """
4033 samples = _alexandergovern_input_validation(samples, nan_policy)
4035 if np.any([(sample == sample[0]).all() for sample in samples]):
4036 msg = "An input array is constant; the statistic is not defined."
4037 warnings.warn(stats.ConstantInputWarning(msg))
4038 return AlexanderGovernResult(np.nan, np.nan)
4040 # The following formula numbers reference the equation described on
4041 # page 92 by Alexander, Govern. Formulas 5, 6, and 7 describe other
4042 # tests that serve as the basis for equation (8) but are not needed
4043 # to perform the test.
4045 # precalculate mean and length of each sample
4046 lengths = np.array([ma.count(sample) if nan_policy == 'omit'
4047 else len(sample) for sample in samples])
4048 means = np.array([np.mean(sample) for sample in samples])
4050 # (1) determine standard error of the mean for each sample
4051 standard_errors = [np.std(sample, ddof=1) / np.sqrt(length)
4052 for sample, length in zip(samples, lengths)]
4054 # (2) define a weight for each sample
4055 inv_sq_se = 1 / np.square(standard_errors)
4056 weights = inv_sq_se / np.sum(inv_sq_se)
4058 # (3) determine variance-weighted estimate of the common mean
4059 var_w = np.sum(weights * means)
4061 # (4) determine one-sample t statistic for each group
4062 t_stats = (means - var_w)/standard_errors
4064 # calculate parameters to be used in transformation
4065 v = lengths - 1
4066 a = v - .5
4067 b = 48 * a**2
4068 c = (a * np.log(1 + (t_stats ** 2)/v))**.5
4070 # (8) perform a normalizing transformation on t statistic
4071 z = (c + ((c**3 + 3*c)/b) -
4072 ((4*c**7 + 33*c**5 + 240*c**3 + 855*c) /
4073 (b**2*10 + 8*b*c**4 + 1000*b)))
4075 # (9) calculate statistic
4076 A = np.sum(np.square(z))
4078 # "[the p value is determined from] central chi-square random deviates
4079 # with k - 1 degrees of freedom". Alexander, Govern (94)
4080 p = distributions.chi2.sf(A, len(samples) - 1)
4081 return AlexanderGovernResult(A, p)
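# Illustrative sketch (not part of the SciPy source): a hypothetical usage
# comparison. With markedly unequal group variances, `alexandergovern` still
# tests equality of means, whereas `f_oneway` additionally assumes equal
# variances; the group parameters below are made up.
def _sketch_alexandergovern_usage():
    # Hypothetical demonstration only.
    import numpy as np
    from scipy import stats
    rng = np.random.default_rng(12345)
    g1 = rng.normal(loc=10.0, scale=1.0, size=30)
    g2 = rng.normal(loc=10.0, scale=5.0, size=30)   # same mean, wider spread
    g3 = rng.normal(loc=10.0, scale=9.0, size=30)
    return stats.alexandergovern(g1, g2, g3), stats.f_oneway(g1, g2, g3)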
4084def _alexandergovern_input_validation(samples, nan_policy):
4085 if len(samples) < 2:
4086 raise TypeError(f"2 or more inputs required, got {len(samples)}")
4088 # input arrays are flattened
4089 samples = [np.asarray(sample, dtype=float) for sample in samples]
4091 for i, sample in enumerate(samples):
4092 if np.size(sample) <= 1:
4093 raise ValueError("Input sample size must be greater than one.")
4094 if sample.ndim != 1:
4095 raise ValueError("Input samples must be one-dimensional")
4096 if np.isinf(sample).any():
4097 raise ValueError("Input samples must be finite.")
4099 contains_nan, nan_policy = _contains_nan(sample,
4100 nan_policy=nan_policy)
4101 if contains_nan and nan_policy == 'omit':
4102 samples[i] = ma.masked_invalid(sample)
4103 return samples
4106AlexanderGovernResult = make_dataclass("AlexanderGovernResult", ("statistic",
4107 "pvalue"))
4110def _pearsonr_fisher_ci(r, n, confidence_level, alternative):
4111 """
4112 Compute the confidence interval for Pearson's R.
4114 Fisher's transformation is used to compute the confidence interval
4115 (https://en.wikipedia.org/wiki/Fisher_transformation).
4116 """
4117 if r == 1:
4118 zr = np.inf
4119 elif r == -1:
4120 zr = -np.inf
4121 else:
4122 zr = np.arctanh(r)
4124 if n > 3:
4125 se = np.sqrt(1 / (n - 3))
4126 if alternative == "two-sided":
4127 h = special.ndtri(0.5 + confidence_level/2)
4128 zlo = zr - h*se
4129 zhi = zr + h*se
4130 rlo = np.tanh(zlo)
4131 rhi = np.tanh(zhi)
4132 elif alternative == "less":
4133 h = special.ndtri(confidence_level)
4134 zhi = zr + h*se
4135 rhi = np.tanh(zhi)
4136 rlo = -1.0
4137 else:
4138 # alternative == "greater":
4139 h = special.ndtri(confidence_level)
4140 zlo = zr - h*se
4141 rlo = np.tanh(zlo)
4142 rhi = 1.0
4143 else:
4144 rlo, rhi = -1.0, 1.0
4146 return ConfidenceInterval(low=rlo, high=rhi)
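# Illustrative sketch (not part of the SciPy source): the hypothetical helper
# below spells out the two-sided Fisher-transform interval above in ordinary
# notation and checks it against PearsonRResult.confidence_interval.
def _sketch_fisher_ci_by_hand():
    # Hypothetical demonstration only; the data are made up.
    import numpy as np
    from scipy import special, stats
    x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
    y = np.array([10.0, 9.0, 2.5, 6.0, 4.0, 3.0])
    res = stats.pearsonr(x, y)
    r, n, conf = res.statistic, len(x), 0.95
    zr = np.arctanh(r)
    se = 1.0 / np.sqrt(n - 3)
    h = special.ndtri(0.5 + conf / 2)           # two-sided normal quantile
    lo, hi = np.tanh(zr - h * se), np.tanh(zr + h * se)
    ci = res.confidence_interval(conf)
    assert np.isclose(lo, ci.low) and np.isclose(hi, ci.high)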
4149ConfidenceInterval = namedtuple('ConfidenceInterval', ['low', 'high'])
4151PearsonRResultBase = _make_tuple_bunch('PearsonRResultBase',
4152 ['statistic', 'pvalue'], [])
4155class PearsonRResult(PearsonRResultBase):
4156 """
4157 Result of `scipy.stats.pearsonr`
4159 Attributes
4160 ----------
4161 statistic : float
4162 Pearson product-moment correlation coefficient.
4163 pvalue : float
4164 The p-value associated with the chosen alternative.
4166 Methods
4167 -------
4168 confidence_interval
4169 Computes the confidence interval of the correlation
4170 coefficient `statistic` for the given confidence level.
4172 """
4173 def __init__(self, statistic, pvalue, alternative, n):
4174 super().__init__(statistic, pvalue)
4175 self._alternative = alternative
4176 self._n = n
4178 # add alias for consistency with other correlation functions
4179 self.correlation = statistic
4181 def confidence_interval(self, confidence_level=0.95):
4182 """
4183 The confidence interval for the correlation coefficient.
4185 Compute the confidence interval for the correlation coefficient
4186 ``statistic`` with the given confidence level.
4188 The confidence interval is computed using the Fisher transformation
4189 F(r) = arctanh(r) [1]_. When the sample pairs are drawn from a
4190 bivariate normal distribution, F(r) approximately follows a normal
4191 distribution with standard error ``1/sqrt(n - 3)``, where ``n`` is the
4192 length of the original samples along the calculation axis. When
4193 ``n <= 3``, this approximation does not yield a finite, real standard
4194 error, so we define the confidence interval to be -1 to 1.
4196 Parameters
4197 ----------
4198 confidence_level : float
4199 The confidence level for the calculation of the correlation
4200 coefficient confidence interval. Default is 0.95.
4202 Returns
4203 -------
4204 ci : namedtuple
4205 The confidence interval is returned in a ``namedtuple`` with
4206 fields `low` and `high`.
4208 References
4209 ----------
4210 .. [1] "Pearson correlation coefficient", Wikipedia,
4211 https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
4212 """
4213 return _pearsonr_fisher_ci(self.statistic, self._n, confidence_level,
4214 self._alternative)
4217def pearsonr(x, y, *, alternative='two-sided'):
4218 r"""
4219 Pearson correlation coefficient and p-value for testing non-correlation.
4221 The Pearson correlation coefficient [1]_ measures the linear relationship
4222 between two datasets. Like other correlation
4223 coefficients, this one varies between -1 and +1 with 0 implying no
4224 correlation. Correlations of -1 or +1 imply an exact linear relationship.
4225 Positive correlations imply that as x increases, so does y. Negative
4226 correlations imply that as x increases, y decreases.
4228 This function also performs a test of the null hypothesis that the
4229 distributions underlying the samples are uncorrelated and normally
4230 distributed. (See Kowalski [3]_
4231 for a discussion of the effects of non-normality of the input on the
4232 distribution of the correlation coefficient.)
4233 The p-value roughly indicates the probability of an uncorrelated system
4234 producing datasets that have a Pearson correlation at least as extreme
4235 as the one computed from these datasets.
4237 Parameters
4238 ----------
4239 x : (N,) array_like
4240 Input array.
4241 y : (N,) array_like
4242 Input array.
4243 alternative : {'two-sided', 'greater', 'less'}, optional
4244 Defines the alternative hypothesis. Default is 'two-sided'.
4245 The following options are available:
4247 * 'two-sided': the correlation is nonzero
4248 * 'less': the correlation is negative (less than zero)
4249 * 'greater': the correlation is positive (greater than zero)
4251 .. versionadded:: 1.9.0
4253 Returns
4254 -------
4255 result : `~scipy.stats._result_classes.PearsonRResult`
4256 An object with the following attributes:
4258 statistic : float
4259 Pearson product-moment correlation coefficient.
4260 pvalue : float
4261 The p-value associated with the chosen alternative.
4263 The object has the following method:
4265 confidence_interval(confidence_level=0.95)
4266 This method computes the confidence interval of the correlation
4267 coefficient `statistic` for the given confidence level.
4268 The confidence interval is returned in a ``namedtuple`` with
4269 fields `low` and `high`. See the Notes for more details.
4271 Warns
4272 -----
4273 `~scipy.stats.ConstantInputWarning`
4274 Raised if an input is a constant array. The correlation coefficient
4275 is not defined in this case, so ``np.nan`` is returned.
4277 `~scipy.stats.NearConstantInputWarning`
4278 Raised if an input is "nearly" constant. The array ``x`` is considered
4279 nearly constant if ``norm(x - mean(x)) < 1e-13 * abs(mean(x))``.
4280 Numerical errors in the calculation ``x - mean(x)`` in this case might
4281 result in an inaccurate calculation of r.
4283 See Also
4284 --------
4285 spearmanr : Spearman rank-order correlation coefficient.
4286 kendalltau : Kendall's tau, a correlation measure for ordinal data.
4288 Notes
4289 -----
4290 The correlation coefficient is calculated as follows:
4292 .. math::
4294 r = \frac{\sum (x - m_x) (y - m_y)}
4295 {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}
4297 where :math:`m_x` is the mean of the vector x and :math:`m_y` is
4298 the mean of the vector y.
4300 Under the assumption that x and y are drawn from
4301 independent normal distributions (so the population correlation coefficient
4302 is 0), the probability density function of the sample correlation
4303 coefficient r is ([1]_, [2]_):
4305 .. math::
4306 f(r) = \frac{{(1-r^2)}^{n/2-2}}{\mathrm{B}(\frac{1}{2},\frac{n}{2}-1)}
4308 where n is the number of samples, and B is the beta function. This
4309 is sometimes referred to as the exact distribution of r. This is
4310 the distribution that is used in `pearsonr` to compute the p-value.
4311 The distribution is a beta distribution on the interval [-1, 1],
4312 with equal shape parameters a = b = n/2 - 1. In terms of SciPy's
4313 implementation of the beta distribution, the distribution of r is::
4315 dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)
4317 The default p-value returned by `pearsonr` is a two-sided p-value. For a
4318 given sample with correlation coefficient r, the p-value is
4319 the probability that abs(r') of a random sample x' and y' drawn from
4320 the population with zero correlation would be greater than or equal
4321 to abs(r). In terms of the object ``dist`` shown above, the p-value
4322 for a given r and length n can be computed as::
4324 p = 2*dist.cdf(-abs(r))
4326 When n is 2, the above continuous distribution is not well-defined.
4327 One can interpret the limit of the beta distribution as the shape
4328 parameters a and b approach a = b = 0 as a discrete distribution with
4329 equal probability masses at r = 1 and r = -1. More directly, one
4330 can observe that, given the data x = [x1, x2] and y = [y1, y2], and
4331 assuming x1 != x2 and y1 != y2, the only possible values for r are 1
4332 and -1. Because abs(r') for any sample x' and y' with length 2 will
4333 be 1, the two-sided p-value for a sample of length 2 is always 1.
4335 For backwards compatibility, the object that is returned also behaves
4336 like a tuple of length two that holds the statistic and the p-value.
4338 References
4339 ----------
4340 .. [1] "Pearson correlation coefficient", Wikipedia,
4341 https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
4342 .. [2] Student, "Probable error of a correlation coefficient",
4343 Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310.
4344 .. [3] C. J. Kowalski, "On the Effects of Non-Normality on the Distribution
4345 of the Sample Product-Moment Correlation Coefficient"
4346 Journal of the Royal Statistical Society. Series C (Applied
4347 Statistics), Vol. 21, No. 1 (1972), pp. 1-12.
4349 Examples
4350 --------
4351 >>> import numpy as np
4352 >>> from scipy import stats
4353 >>> res = stats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4])
4354 >>> res
4355 PearsonRResult(statistic=-0.7426106572325056, pvalue=0.15055580885344558)
4356 >>> res.confidence_interval()
4357 ConfidenceInterval(low=-0.9816918044786463, high=0.40501116769030976)
4359 There is a linear dependence between x and y if y = a + b*x + e, where
4360 a,b are constants and e is a random error term, assumed to be independent
4361 of x. For simplicity, assume that x is standard normal, a=0, b=1 and let
4362 e follow a normal distribution with mean zero and standard deviation s>0.
4364 >>> rng = np.random.default_rng()
4365 >>> s = 0.5
4366 >>> x = stats.norm.rvs(size=500, random_state=rng)
4367 >>> e = stats.norm.rvs(scale=s, size=500, random_state=rng)
4368 >>> y = x + e
4369 >>> stats.pearsonr(x, y).statistic
4370 0.9001942438244763
4372 This should be close to the exact value given by
4374 >>> 1/np.sqrt(1 + s**2)
4375 0.8944271909999159
4377 For s=0.5, we observe a high level of correlation. In general, a large
4378 variance of the noise reduces the correlation, while the correlation
4379 approaches one as the variance of the error goes to zero.
4381 It is important to keep in mind that no correlation does not imply
4382 independence unless (x, y) is jointly normal. Correlation can even be zero
4383 when there is a very simple dependence structure: if X follows a
4384 standard normal distribution, let y = abs(x). Note that the correlation
4385 between x and y is zero. Indeed, since the expectation of x is zero,
4386 cov(x, y) = E[x*y]. By definition, this equals E[x*abs(x)] which is zero
4387 by symmetry. The following lines of code illustrate this observation:
4389 >>> y = np.abs(x)
4390 >>> stats.pearsonr(x, y)
4391 PearsonRResult(statistic=-0.05444919272687482, pvalue=0.22422294836207743)
4393 A non-zero correlation coefficient can be misleading. For example, if X has
4394 a standard normal distribution, define y = x if x < 0 and y = 0 otherwise.
4395 A simple calculation shows that corr(x, y) = sqrt(pi/(2*(pi-1))) = 0.8564...,
4396 implying a high level of correlation:
4398 >>> y = np.where(x < 0, x, 0)
4399 >>> stats.pearsonr(x, y)
4400 PearsonRResult(statistic=0.861985781588, pvalue=4.813432002751103e-149)
4402 This is unintuitive since there is no dependence between x and y if x is
4403 larger than zero, which happens in about half of the cases if we sample x and y.
4405 """
4406 n = len(x)
4407 if n != len(y):
4408 raise ValueError('x and y must have the same length.')
4410 if n < 2:
4411 raise ValueError('x and y must have length at least 2.')
4413 x = np.asarray(x)
4414 y = np.asarray(y)
4416 if (np.issubdtype(x.dtype, np.complexfloating)
4417 or np.issubdtype(y.dtype, np.complexfloating)):
4418 raise ValueError('This function does not support complex data')
4420 # If an input is constant, the correlation coefficient is not defined.
4421 if (x == x[0]).all() or (y == y[0]).all():
4422 msg = ("An input array is constant; the correlation coefficient "
4423 "is not defined.")
4424 warnings.warn(stats.ConstantInputWarning(msg))
4425 result = PearsonRResult(statistic=np.nan, pvalue=np.nan, n=n,
4426 alternative=alternative)
4427 return result
4429 # dtype is the data type for the calculations. This expression ensures
4430 # that the data type is at least 64 bit floating point. It might have
4431 # more precision if the input is, for example, np.longdouble.
4432 dtype = type(1.0 + x[0] + y[0])
4434 if n == 2:
4435 r = dtype(np.sign(x[1] - x[0])*np.sign(y[1] - y[0]))
4436 result = PearsonRResult(statistic=r, pvalue=1.0, n=n,
4437 alternative=alternative)
4438 return result
4440 xmean = x.mean(dtype=dtype)
4441 ymean = y.mean(dtype=dtype)
4443 # By using `astype(dtype)`, we ensure that the intermediate calculations
4444 # use at least 64 bit floating point.
4445 xm = x.astype(dtype) - xmean
4446 ym = y.astype(dtype) - ymean
4448 # Unlike np.linalg.norm or the expression sqrt((xm*xm).sum()),
4449 # scipy.linalg.norm(xm) does not overflow if xm is, for example,
4450 # [-5e210, 5e210, 3e200, -3e200]
4451 normxm = linalg.norm(xm)
4452 normym = linalg.norm(ym)
4454 threshold = 1e-13
4455 if normxm < threshold*abs(xmean) or normym < threshold*abs(ymean):
4456 # If all the values in x (likewise y) are very close to the mean,
4457 # the loss of precision that occurs in the subtraction xm = x - xmean
4458 # might result in large errors in r.
4459 msg = ("An input array is nearly constant; the computed "
4460 "correlation coefficient may be inaccurate.")
4461 warnings.warn(stats.NearConstantInputWarning(msg))
4463 r = np.dot(xm/normxm, ym/normym)
4465 # Presumably, if abs(r) > 1, then it is only some small artifact of
4466 # floating point arithmetic.
4467 r = max(min(r, 1.0), -1.0)
4469 # As explained in the docstring, the distribution of `r` under the null
4470 # hypothesis is the beta distribution on (-1, 1) with a = b = n/2 - 1.
4471 ab = n/2 - 1
4472 dist = stats.beta(ab, ab, loc=-1, scale=2)
4473 if alternative == 'two-sided':
4474 prob = 2*dist.sf(abs(r))
4475 elif alternative == 'less':
4476 prob = dist.cdf(r)
4477 elif alternative == 'greater':
4478 prob = dist.sf(r)
4479 else:
4480 raise ValueError('alternative must be one of '
4481 '["two-sided", "less", "greater"]')
4483 return PearsonRResult(statistic=r, pvalue=prob, n=n,
4484 alternative=alternative)
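# Illustrative sketch (not part of the SciPy source): the hypothetical helper
# below recomputes the two-sided p-value from the null beta distribution of r
# described in the docstring and compares it with `pearsonr`.
def _sketch_pearsonr_pvalue_from_beta():
    # Hypothetical demonstration only.
    import numpy as np
    from scipy import stats
    x = [1, 2, 3, 4, 5]
    y = [10, 9, 2.5, 6, 4]
    res = stats.pearsonr(x, y)
    n = len(x)
    ab = n / 2 - 1
    dist = stats.beta(ab, ab, loc=-1, scale=2)   # null distribution of r
    assert np.isclose(2 * dist.sf(abs(res.statistic)), res.pvalue)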
4487def fisher_exact(table, alternative='two-sided'):
4488 """Perform a Fisher exact test on a 2x2 contingency table.
4490 The null hypothesis is that the true odds ratio of the populations
4491 underlying the observations is one, and the observations were sampled
4492 from these populations under a condition: the marginals of the
4493 resulting table must equal those of the observed table. The statistic
4494 returned is the unconditional maximum likelihood estimate of the odds
4495 ratio, and the p-value is the probability under the null hypothesis of
4496 obtaining a table at least as extreme as the one that was actually
4497 observed. There are other possible choices of statistic and two-sided
4498 p-value definition associated with Fisher's exact test; please see the
4499 Notes for more information.
4501 Parameters
4502 ----------
4503 table : array_like of ints
4504 A 2x2 contingency table. Elements must be non-negative integers.
4505 alternative : {'two-sided', 'less', 'greater'}, optional
4506 Defines the alternative hypothesis.
4507 The following options are available (default is 'two-sided'):
4509 * 'two-sided': the odds ratio of the underlying population is not one
4510 * 'less': the odds ratio of the underlying population is less than one
4511 * 'greater': the odds ratio of the underlying population is greater
4512 than one
4514 See the Notes for more details.
4516 Returns
4517 -------
4518 res : SignificanceResult
4519 An object containing attributes:
4521 statistic : float
4522 This is the prior odds ratio, not a posterior estimate.
4523 pvalue : float
4524 The probability under the null hypothesis of obtaining a
4525 table at least as extreme as the one that was actually observed.
4527 See Also
4528 --------
4529 chi2_contingency : Chi-square test of independence of variables in a
4530 contingency table. This can be used as an alternative to
4531 `fisher_exact` when the numbers in the table are large.
4532 contingency.odds_ratio : Compute the odds ratio (sample or conditional
4533 MLE) for a 2x2 contingency table.
4534 barnard_exact : Barnard's exact test, which is a more powerful alternative
4535 than Fisher's exact test for 2x2 contingency tables.
4536 boschloo_exact : Boschloo's exact test, which is a more powerful alternative
4537 than Fisher's exact test for 2x2 contingency tables.
4539 Notes
4540 -----
4541 *Null hypothesis and p-values*
4543 The null hypothesis is that the true odds ratio of the populations
4544 underlying the observations is one, and the observations were sampled at
4545 random from these populations under a condition: the marginals of the
4546 resulting table must equal those of the observed table. Equivalently,
4547 the null hypothesis is that the input table is from the hypergeometric
4548 distribution with parameters (as used in `hypergeom`)
4549 ``M = a + b + c + d``, ``n = a + b`` and ``N = a + c``, where the
4550 input table is ``[[a, b], [c, d]]``. This distribution has support
4551 ``max(0, N + n - M) <= x <= min(N, n)``, or, in terms of the values
4552 in the input table, ``max(0, a - d) <= x <= a + min(b, c)``. ``x``
4553 can be interpreted as the upper-left element of a 2x2 table, so the
4554 tables in the distribution have form::
4556 [ x n - x ]
4557 [N - x M - (n + N) + x]
4559 For example, if::
4561 table = [6 2]
4562 [1 4]
4564 then the support is ``2 <= x <= 7``, and the tables in the distribution
4565 are::
4567 [2 6] [3 5] [4 4] [5 3] [6 2] [7 1]
4568 [5 0] [4 1] [3 2] [2 3] [1 4] [0 5]
4570 The probability of each table is given by the hypergeometric distribution
4571 ``hypergeom.pmf(x, M, n, N)``. For this example, these are (rounded to
4572 three significant digits)::
4574 x 2 3 4 5 6 7
4575 p 0.0163 0.163 0.408 0.326 0.0816 0.00466
4577 These can be computed with::
4579 >>> import numpy as np
4580 >>> from scipy.stats import hypergeom
4581 >>> table = np.array([[6, 2], [1, 4]])
4582 >>> M = table.sum()
4583 >>> n = table[0].sum()
4584 >>> N = table[:, 0].sum()
4585 >>> start, end = hypergeom.support(M, n, N)
4586 >>> hypergeom.pmf(np.arange(start, end+1), M, n, N)
4587 array([0.01631702, 0.16317016, 0.40792541, 0.32634033, 0.08158508,
4588 0.004662 ])
4590 The two-sided p-value is the probability that, under the null hypothesis,
4591 a random table would have a probability equal to or less than the
4592 probability of the input table. For our example, the probability of
4593 the input table (where ``x = 6``) is 0.0816. The x values where the
4594 probability does not exceed this are 2, 6 and 7, so the two-sided p-value
4595 is ``0.0163 + 0.0816 + 0.00466 ~= 0.10256``::
4597 >>> from scipy.stats import fisher_exact
4598 >>> res = fisher_exact(table, alternative='two-sided')
4599 >>> res.pvalue
4600 0.10256410256410257
4602 The one-sided p-value for ``alternative='greater'`` is the probability
4603 that a random table has ``x >= a``, which in our example is ``x >= 6``,
4604 or ``0.0816 + 0.00466 ~= 0.08626``::
4606 >>> res = fisher_exact(table, alternative='greater')
4607 >>> res.pvalue
4608 0.08624708624708627
4610 This is equivalent to computing the survival function of the
4611 distribution at ``x = 5`` (one less than ``x`` from the input table,
4612 because we want to include the probability of ``x = 6`` in the sum)::
4614 >>> hypergeom.sf(5, M, n, N)
4615 0.08624708624708627
4617 For ``alternative='less'``, the one-sided p-value is the probability
4618 that a random table has ``x <= a``, (i.e. ``x <= 6`` in our example),
4619 or ``0.0163 + 0.163 + 0.408 + 0.326 + 0.0816 ~= 0.9949``::
4621 >>> res = fisher_exact(table, alternative='less')
4622 >>> res.pvalue
4623 0.9953379953379957
4625 This is equivalent to computing the cumulative distribution function
4626 of the distribution at ``x = 6``:
4628 >>> hypergeom.cdf(6, M, n, N)
4629 0.9953379953379957
4631 *Odds ratio*
4633 The calculated odds ratio is different from the value computed by the
4634 R function ``fisher.test``. This implementation returns the "sample"
4635 or "unconditional" maximum likelihood estimate, while ``fisher.test``
4636 in R uses the conditional maximum likelihood estimate. To compute the
4637 conditional maximum likelihood estimate of the odds ratio, use
4638 `scipy.stats.contingency.odds_ratio`.
4640 Examples
4641 --------
4642 Say we spend a few days counting whales and sharks in the Atlantic and
4643 Indian oceans. In the Atlantic ocean we find 8 whales and 1 shark, in the
4644 Indian ocean 2 whales and 5 sharks. Then our contingency table is::
4646 Atlantic Indian
4647 whales 8 2
4648 sharks 1 5
4650 We use this table to find the p-value:
4652 >>> from scipy.stats import fisher_exact
4653 >>> res = fisher_exact([[8, 2], [1, 5]])
4654 >>> res.pvalue
4655 0.0349...
4657 The probability that we would observe this or an even more imbalanced ratio
4658 by chance is about 3.5%. A commonly used significance level is 5%--if we
4659 adopt that, we can therefore conclude that our observed imbalance is
4660 statistically significant; whales prefer the Atlantic while sharks prefer
4661 the Indian ocean.
4663 """
4664 hypergeom = distributions.hypergeom
4665 # int32 is not enough for the algorithm
4666 c = np.asarray(table, dtype=np.int64)
4667 if not c.shape == (2, 2):
4668 raise ValueError("The input `table` must be of shape (2, 2).")
4670 if np.any(c < 0):
4671 raise ValueError("All values in `table` must be nonnegative.")
4673 if 0 in c.sum(axis=0) or 0 in c.sum(axis=1):
4674 # If both values in a row or column are zero, the p-value is 1 and
4675 # the odds ratio is NaN.
4676 return SignificanceResult(np.nan, 1.0)
4678 if c[1, 0] > 0 and c[0, 1] > 0:
4679 oddsratio = c[0, 0] * c[1, 1] / (c[1, 0] * c[0, 1])
4680 else:
4681 oddsratio = np.inf
4683 n1 = c[0, 0] + c[0, 1]
4684 n2 = c[1, 0] + c[1, 1]
4685 n = c[0, 0] + c[1, 0]
4687 def pmf(x):
4688 return hypergeom.pmf(x, n1 + n2, n1, n)
4690 if alternative == 'less':
4691 pvalue = hypergeom.cdf(c[0, 0], n1 + n2, n1, n)
4692 elif alternative == 'greater':
4693 # Same formula as the 'less' case, but with the second column.
4694 pvalue = hypergeom.cdf(c[0, 1], n1 + n2, n1, c[0, 1] + c[1, 1])
4695 elif alternative == 'two-sided':
4696 mode = int((n + 1) * (n1 + 1) / (n1 + n2 + 2))
4697 pexact = hypergeom.pmf(c[0, 0], n1 + n2, n1, n)
4698 pmode = hypergeom.pmf(mode, n1 + n2, n1, n)
4700 epsilon = 1e-14
4701 gamma = 1 + epsilon
4703 if np.abs(pexact - pmode) / np.maximum(pexact, pmode) <= epsilon:
4704 return SignificanceResult(oddsratio, 1.)
4706 elif c[0, 0] < mode:
4707 plower = hypergeom.cdf(c[0, 0], n1 + n2, n1, n)
4708 if hypergeom.pmf(n, n1 + n2, n1, n) > pexact * gamma:
4709 return SignificanceResult(oddsratio, plower)
4711 guess = _binary_search(lambda x: -pmf(x), -pexact * gamma, mode, n)
4712 pvalue = plower + hypergeom.sf(guess, n1 + n2, n1, n)
4713 else:
4714 pupper = hypergeom.sf(c[0, 0] - 1, n1 + n2, n1, n)
4715 if hypergeom.pmf(0, n1 + n2, n1, n) > pexact * gamma:
4716 return SignificanceResult(oddsratio, pupper)
4718 guess = _binary_search(pmf, pexact * gamma, 0, mode)
4719 pvalue = pupper + hypergeom.cdf(guess, n1 + n2, n1, n)
4720 else:
4721 msg = "`alternative` should be one of {'two-sided', 'less', 'greater'}"
4722 raise ValueError(msg)
4724 pvalue = min(pvalue, 1.0)
4726 return SignificanceResult(oddsratio, pvalue)
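# Illustrative sketch (not part of the SciPy source): the hypothetical helper
# below reproduces the two-sided p-value by enumerating the hypergeometric
# support and summing the probabilities of tables no more likely than the
# observed one, as described in the Notes above.
def _sketch_fisher_two_sided_by_enumeration():
    # Hypothetical demonstration only.
    import numpy as np
    from scipy import stats
    from scipy.stats import hypergeom
    table = np.array([[6, 2], [1, 4]])
    M, n, N = table.sum(), table[0].sum(), table[:, 0].sum()
    lo, hi = hypergeom.support(M, n, N)
    probs = hypergeom.pmf(np.arange(lo, hi + 1), M, n, N)
    p_obs = hypergeom.pmf(table[0, 0], M, n, N)
    # Small relative tolerance for ties in probability, as in the code above.
    p_two_sided = probs[probs <= p_obs * (1 + 1e-14)].sum()
    res = stats.fisher_exact(table, alternative='two-sided')
    assert np.isclose(p_two_sided, res.pvalue)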
4729def spearmanr(a, b=None, axis=0, nan_policy='propagate',
4730 alternative='two-sided'):
4731 """Calculate a Spearman correlation coefficient with associated p-value.
4733 The Spearman rank-order correlation coefficient is a nonparametric measure
4734 of the monotonicity of the relationship between two datasets.
4735 Like other correlation coefficients,
4736 this one varies between -1 and +1 with 0 implying no correlation.
4737 Correlations of -1 or +1 imply an exact monotonic relationship. Positive
4738 correlations imply that as x increases, so does y. Negative correlations
4739 imply that as x increases, y decreases.
4741 The p-value roughly indicates the probability of an uncorrelated system
4742 producing datasets that have a Spearman correlation at least as extreme
4743 as the one computed from these datasets. Although calculation of the
4744 p-value does not make strong assumptions about the distributions underlying
4745 the samples, it is only accurate for very large samples (>500
4746 observations). For smaller sample sizes, consider a permutation test (see
4747 Examples section below).
4749 Parameters
4750 ----------
4751 a, b : 1D or 2D array_like, b is optional
4752 One or two 1-D or 2-D arrays containing multiple variables and
4753 observations. When these are 1-D, each represents a vector of
4754 observations of a single variable. For the behavior in the 2-D case,
4755 see under ``axis``, below.
4756 Both arrays need to have the same length in the ``axis`` dimension.
4757 axis : int or None, optional
4758 If axis=0 (default), then each column represents a variable, with
4759 observations in the rows. If axis=1, the relationship is transposed:
4760 each row represents a variable, while the columns contain observations.
4761 If axis=None, then both arrays will be raveled.
4762 nan_policy : {'propagate', 'raise', 'omit'}, optional
4763 Defines how to handle when input contains nan.
4764 The following options are available (default is 'propagate'):
4766 * 'propagate': returns nan
4767 * 'raise': throws an error
4768 * 'omit': performs the calculations ignoring nan values
4770 alternative : {'two-sided', 'less', 'greater'}, optional
4771 Defines the alternative hypothesis. Default is 'two-sided'.
4772 The following options are available:
4774 * 'two-sided': the correlation is nonzero
4775 * 'less': the correlation is negative (less than zero)
4776 * 'greater': the correlation is positive (greater than zero)
4778 .. versionadded:: 1.7.0
4780 Returns
4781 -------
4782 res : SignificanceResult
4783 An object containing attributes:
4785 statistic : float or ndarray (2-D square)
4786 Spearman correlation matrix or correlation coefficient (if only 2
4787 variables are given as parameters). Correlation matrix is square
4788 with length equal to total number of variables (columns or rows) in
4789 ``a`` and ``b`` combined.
4790 pvalue : float
4791 The p-value for a hypothesis test whose null hypothesis
4792 is that two sets of data have no ordinal correlation. See
4793 `alternative` above for alternative hypotheses. `pvalue` has the
4794 same shape as `statistic`.
4796 Warns
4797 -----
4798 `~scipy.stats.ConstantInputWarning`
4799 Raised if an input is a constant array. The correlation coefficient
4800 is not defined in this case, so ``np.nan`` is returned.
4802 References
4803 ----------
4804 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
4805 Probability and Statistics Tables and Formulae. Chapman & Hall: New
4806 York. 2000.
4807 Section 14.7
4808 .. [2] Kendall, M. G. and Stuart, A. (1973).
4809 The Advanced Theory of Statistics, Volume 2: Inference and Relationship.
4810 Griffin. 1973.
4811 Section 31.18
4813 Examples
4814 --------
4815 >>> import numpy as np
4816 >>> from scipy import stats
4817 >>> res = stats.spearmanr([1, 2, 3, 4, 5], [5, 6, 7, 8, 7])
4818 >>> res.statistic
4819 0.8207826816681233
4820 >>> res.pvalue
4821 0.08858700531354381
4822 >>> rng = np.random.default_rng()
4823 >>> x2n = rng.standard_normal((100, 2))
4824 >>> y2n = rng.standard_normal((100, 2))
4825 >>> res = stats.spearmanr(x2n)
4826 >>> res.statistic, res.pvalue
4827 (-0.07960396039603959, 0.4311168705769747)
4828 >>> res = stats.spearmanr(x2n[:, 0], x2n[:, 1])
4829 >>> res.statistic, res.pvalue
4830 (-0.07960396039603959, 0.4311168705769747)
4831 >>> res = stats.spearmanr(x2n, y2n)
4832 >>> res.statistic
4833 array([[ 1. , -0.07960396, -0.08314431, 0.09662166],
4834 [-0.07960396, 1. , -0.14448245, 0.16738074],
4835 [-0.08314431, -0.14448245, 1. , 0.03234323],
4836 [ 0.09662166, 0.16738074, 0.03234323, 1. ]])
4837 >>> res.pvalue
4838 array([[0. , 0.43111687, 0.41084066, 0.33891628],
4839 [0.43111687, 0. , 0.15151618, 0.09600687],
4840 [0.41084066, 0.15151618, 0. , 0.74938561],
4841 [0.33891628, 0.09600687, 0.74938561, 0. ]])
4842 >>> res = stats.spearmanr(x2n.T, y2n.T, axis=1)
4843 >>> res.statistic
4844 array([[ 1. , -0.07960396, -0.08314431, 0.09662166],
4845 [-0.07960396, 1. , -0.14448245, 0.16738074],
4846 [-0.08314431, -0.14448245, 1. , 0.03234323],
4847 [ 0.09662166, 0.16738074, 0.03234323, 1. ]])
4848 >>> res = stats.spearmanr(x2n, y2n, axis=None)
4849 >>> res.statistic, res.pvalue
4850 (0.044981624540613524, 0.5270803651336189)
4851 >>> res = stats.spearmanr(x2n.ravel(), y2n.ravel())
4852 >>> res.statistic, res.pvalue
4853 (0.044981624540613524, 0.5270803651336189)
4855 >>> rng = np.random.default_rng()
4856 >>> xint = rng.integers(10, size=(100, 2))
4857 >>> res = stats.spearmanr(xint)
4858 >>> res.statistic, res.pvalue
4859 (0.09800224850707953, 0.3320271757932076)
4861 For small samples, consider performing a permutation test instead of
4862 relying on the asymptotic p-value. Note that to calculate the null
4863 distribution of the statistic (for all possible pairings between
4864 observations in sample ``x`` and ``y``), only one of the two inputs needs
4865 to be permuted.
4867 >>> x = [1.76405235, 0.40015721, 0.97873798,
4868 ... 2.2408932, 1.86755799, -0.97727788]
4869 >>> y = [2.71414076, 0.2488, 0.87551913,
4870 ... 2.6514917, 2.01160156, 0.47699563]
4871 >>> def statistic(x): # permute only `x`
4872 ... return stats.spearmanr(x, y).statistic
4873 >>> res_exact = stats.permutation_test((x,), statistic,
4874 ... permutation_type='pairings')
4875 >>> res_asymptotic = stats.spearmanr(x, y)
4876 >>> res_exact.pvalue, res_asymptotic.pvalue # asymptotic pvalue is too low
4877 (0.10277777777777777, 0.07239650145772594)
4879 """
4880 if axis is not None and axis > 1:
4881 raise ValueError("spearmanr only handles 1-D or 2-D arrays, "
4882 "supplied axis argument {}, please use only "
4883 "values 0, 1 or None for axis".format(axis))
4885 a, axisout = _chk_asarray(a, axis)
4886 if a.ndim > 2:
4887 raise ValueError("spearmanr only handles 1-D or 2-D arrays")
4889 if b is None:
4890 if a.ndim < 2:
4891 raise ValueError("`spearmanr` needs at least 2 "
4892 "variables to compare")
4893 else:
4894 # Concatenate a and b, so that we now only have to handle the case
4895 # of a 2-D `a`.
4896 b, _ = _chk_asarray(b, axis)
4897 if axisout == 0:
4898 a = np.column_stack((a, b))
4899 else:
4900 a = np.row_stack((a, b))
4902 n_vars = a.shape[1 - axisout]
4903 n_obs = a.shape[axisout]
4904 if n_obs <= 1:
4905 # Handle empty arrays or single observations.
4906 res = SignificanceResult(np.nan, np.nan)
4907 res.correlation = np.nan
4908 return res
4910 warn_msg = ("An input array is constant; the correlation coefficient "
4911 "is not defined.")
4912 if axisout == 0:
4913 if (a[:, 0][0] == a[:, 0]).all() or (a[:, 1][0] == a[:, 1]).all():
4914 # If an input is constant, the correlation coefficient
4915 # is not defined.
4916 warnings.warn(stats.ConstantInputWarning(warn_msg))
4917 res = SignificanceResult(np.nan, np.nan)
4918 res.correlation = np.nan
4919 return res
4920 else: # case when axisout == 1 b/c a is 2 dim only
4921 if (a[0, :][0] == a[0, :]).all() or (a[1, :][0] == a[1, :]).all():
4922 # If an input is constant, the correlation coefficient
4923 # is not defined.
4924 warnings.warn(stats.ConstantInputWarning(warn_msg))
4925 res = SignificanceResult(np.nan, np.nan)
4926 res.correlation = np.nan
4927 return res
4929 a_contains_nan, nan_policy = _contains_nan(a, nan_policy)
4930 variable_has_nan = np.zeros(n_vars, dtype=bool)
4931 if a_contains_nan:
4932 if nan_policy == 'omit':
4933 return mstats_basic.spearmanr(a, axis=axis, nan_policy=nan_policy,
4934 alternative=alternative)
4935 elif nan_policy == 'propagate':
4936 if a.ndim == 1 or n_vars <= 2:
4937 res = SignificanceResult(np.nan, np.nan)
4938 res.correlation = np.nan
4939 return res
4940 else:
4941 # Keep track of variables with NaNs, set the outputs to NaN
4942 # only for those variables
4943 variable_has_nan = np.isnan(a).any(axis=axisout)
4945 a_ranked = np.apply_along_axis(rankdata, axisout, a)
4946 rs = np.corrcoef(a_ranked, rowvar=axisout)
4947 dof = n_obs - 2 # degrees of freedom
4949 # rs can have elements equal to 1, so avoid zero division warnings
4950 with np.errstate(divide='ignore'):
4951 # clip the small negative values possibly caused by rounding
4952 # errors before taking the square root
4953 t = rs * np.sqrt((dof/((rs+1.0)*(1.0-rs))).clip(0))
4955 t, prob = _ttest_finish(dof, t, alternative)
4957 # For backwards compatibility, return scalars when comparing 2 columns
4958 if rs.shape == (2, 2):
4959 res = SignificanceResult(rs[1, 0], prob[1, 0])
4960 res.correlation = rs[1, 0]
4961 return res
4962 else:
4963 rs[variable_has_nan, :] = np.nan
4964 rs[:, variable_has_nan] = np.nan
4965 res = SignificanceResult(rs, prob)
4966 res.correlation = rs
4967 return res
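# Illustrative sketch (not part of the SciPy source): for two 1-D samples the
# Spearman statistic is the Pearson correlation of the ranks, mirroring the
# rankdata/corrcoef computation above. The helper name is hypothetical.
def _sketch_spearman_as_pearson_on_ranks():
    # Hypothetical demonstration only.
    import numpy as np
    from scipy import stats
    x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    y = np.array([5.0, 6.0, 7.0, 8.0, 7.0])
    rho = stats.spearmanr(x, y).statistic
    rho_from_ranks = stats.pearsonr(stats.rankdata(x), stats.rankdata(y)).statistic
    assert np.isclose(rho, rho_from_ranks)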
4970def pointbiserialr(x, y):
4971 r"""Calculate a point biserial correlation coefficient and its p-value.
4973 The point biserial correlation is used to measure the relationship
4974 between a binary variable, x, and a continuous variable, y. Like other
4975 correlation coefficients, this one varies between -1 and +1 with 0
4976 implying no correlation. Correlations of -1 or +1 imply a determinative
4977 relationship.
4979 This function may be computed using a shortcut formula but produces the
4980 same result as `pearsonr`.
4982 Parameters
4983 ----------
4984 x : array_like of bools
4985 Input array.
4986 y : array_like
4987 Input array.
4989 Returns
4990 -------
4991 res: SignificanceResult
4992 An object containing attributes:
4994 statistic : float
4995 The R value.
4996 pvalue : float
4997 The two-sided p-value.
4999 Notes
5000 -----
5001 `pointbiserialr` uses a t-test with ``n-2`` degrees of freedom.
5002 It is equivalent to `pearsonr`.
5004 The value of the point-biserial correlation can be calculated from:
5006 .. math::
5008 r_{pb} = \frac{\overline{Y_{1}} -
5009 \overline{Y_{0}}}{s_{y}}\sqrt{\frac{N_{0} N_{1}}{N (N - 1)}}
5011 Where :math:`Y_{0}` and :math:`Y_{1}` are the means of the metric
5012 observations coded 0 and 1 respectively; :math:`N_{0}` and :math:`N_{1}`
5013 are the numbers of observations coded 0 and 1 respectively; :math:`N` is the
5014 total number of observations and :math:`s_{y}` is the standard
5015 deviation of all the metric observations.
5017 A value of :math:`r_{pb}` that is significantly different from zero is
5018 completely equivalent to a significant difference in means between the two
5019 groups. Thus, an independent groups t Test with :math:`N-2` degrees of
5020 freedom may be used to test whether :math:`r_{pb}` is nonzero. The
5021 relation between the t-statistic for comparing two independent groups and
5022 :math:`r_{pb}` is given by:
5024 .. math::
5026 t = \sqrt{N - 2}\frac{r_{pb}}{\sqrt{1 - r^{2}_{pb}}}
5028 References
5029 ----------
5030 .. [1] J. Lev, "The Point Biserial Coefficient of Correlation", Ann. Math.
5031 Statist., Vol. 20, no.1, pp. 125-126, 1949.
5033 .. [2] R.F. Tate, "Correlation Between a Discrete and a Continuous
5034 Variable. Point-Biserial Correlation.", Ann. Math. Statist., Vol. 25,
5035 no. 3, pp. 603-607, 1954.
5037 .. [3] D. Kornbrot "Point Biserial Correlation", In Wiley StatsRef:
5038 Statistics Reference Online (eds N. Balakrishnan, et al.), 2014.
5039 :doi:`10.1002/9781118445112.stat06227`
5041 Examples
5042 --------
5043 >>> import numpy as np
5044 >>> from scipy import stats
5045 >>> a = np.array([0, 0, 0, 1, 1, 1, 1])
5046 >>> b = np.arange(7)
5047 >>> stats.pointbiserialr(a, b)
5048 (0.8660254037844386, 0.011724811003954652)
5049 >>> stats.pearsonr(a, b)
5050 (0.86602540378443871, 0.011724811003954626)
5051 >>> np.corrcoef(a, b)
5052 array([[ 1. , 0.8660254],
5053 [ 0.8660254, 1. ]])
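The t-statistic of the equivalent pooled-variance two-sample t-test can be
recovered from :math:`r_{pb}` as described in the Notes. A minimal sketch,
reusing ``a`` and ``b`` from above (the names ``t_from_r`` and ``t_ind``
are purely illustrative):

>>> rpb, p = stats.pointbiserialr(a, b)
>>> t_from_r = np.sqrt(len(a) - 2) * rpb / np.sqrt(1 - rpb**2)
>>> t_ind = stats.ttest_ind(b[a == 1], b[a == 0]).statistic
>>> bool(np.isclose(t_from_r, t_ind))
True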
5055 """
5056 rpb, prob = pearsonr(x, y)
5057 # create result object with alias for backward compatibility
5058 res = SignificanceResult(rpb, prob)
5059 res.correlation = rpb
5060 return res
5063def kendalltau(x, y, initial_lexsort=None, nan_policy='propagate',
5064 method='auto', variant='b', alternative='two-sided'):
5065 """Calculate Kendall's tau, a correlation measure for ordinal data.
5067 Kendall's tau is a measure of the correspondence between two rankings.
5068 Values close to 1 indicate strong agreement, and values close to -1
5069 indicate strong disagreement. This implements two variants of Kendall's
5070 tau: tau-b (the default) and tau-c (also known as Stuart's tau-c). These
5071 differ only in how they are normalized to lie within the range -1 to 1;
5072 the hypothesis tests (their p-values) are identical. Kendall's original
5073 tau-a is not implemented separately because both tau-b and tau-c reduce
5074 to tau-a in the absence of ties.
5076 Parameters
5077 ----------
5078 x, y : array_like
5079 Arrays of rankings, of the same shape. If arrays are not 1-D, they
5080 will be flattened to 1-D.
5081 initial_lexsort : bool, optional, deprecated
5082 This argument is unused.
5084 .. deprecated:: 1.10.0
5085 `kendalltau` keyword argument `initial_lexsort` is deprecated as it
5086 is unused and will be removed in SciPy 1.12.0.
5087 nan_policy : {'propagate', 'raise', 'omit'}, optional
5088 Defines how to handle when input contains nan.
5089 The following options are available (default is 'propagate'):
5091 * 'propagate': returns nan
5092 * 'raise': throws an error
5093 * 'omit': performs the calculations ignoring nan values
5095 method : {'auto', 'asymptotic', 'exact'}, optional
5096 Defines which method is used to calculate the p-value [5]_.
5097 The following options are available (default is 'auto'):
5099 * 'auto': selects the appropriate method based on a trade-off
5100 between speed and accuracy
5101 * 'asymptotic': uses a normal approximation valid for large samples
5102 * 'exact': computes the exact p-value, but can only be used if no ties
5103 are present. As the sample size increases, the 'exact' computation
5104 time may grow and the result may lose some precision.
5105 variant : {'b', 'c'}, optional
5106 Defines which variant of Kendall's tau is returned. Default is 'b'.
5107 alternative : {'two-sided', 'less', 'greater'}, optional
5108 Defines the alternative hypothesis. Default is 'two-sided'.
5109 The following options are available:
5111 * 'two-sided': the rank correlation is nonzero
5112 * 'less': the rank correlation is negative (less than zero)
5113 * 'greater': the rank correlation is positive (greater than zero)
5115 Returns
5116 -------
5117 res : SignificanceResult
5118 An object containing attributes:
5120 statistic : float
5121 The tau statistic.
5122 pvalue : float
5123 The p-value for a hypothesis test whose null hypothesis is
5124 an absence of association, tau = 0.
5126 See Also
5127 --------
5128 spearmanr : Calculates a Spearman rank-order correlation coefficient.
5129 theilslopes : Computes the Theil-Sen estimator for a set of points (x, y).
5130 weightedtau : Computes a weighted version of Kendall's tau.
5132 Notes
5133 -----
5134 The definition of Kendall's tau that is used is [2]_::
5136 tau_b = (P - Q) / sqrt((P + Q + T) * (P + Q + U))
5138 tau_c = 2 (P - Q) / (n**2 * (m - 1) / m)
5140 where P is the number of concordant pairs, Q the number of discordant
5141 pairs, T the number of ties only in `x`, and U the number of ties only in
5142 `y`. If a tie occurs for the same pair in both `x` and `y`, it is not
5143 added to either T or U. n is the total number of samples, and m is the
5144 number of unique values in either `x` or `y`, whichever is smaller.
5146 References
5147 ----------
5148 .. [1] Maurice G. Kendall, "A New Measure of Rank Correlation", Biometrika
5149 Vol. 30, No. 1/2, pp. 81-93, 1938.
5150 .. [2] Maurice G. Kendall, "The treatment of ties in ranking problems",
5151 Biometrika Vol. 33, No. 3, pp. 239-251. 1945.
5152 .. [3] Gottfried E. Noether, "Elements of Nonparametric Statistics", John
5153 Wiley & Sons, 1967.
5154 .. [4] Peter M. Fenwick, "A new data structure for cumulative frequency
5155 tables", Software: Practice and Experience, Vol. 24, No. 3,
5156 pp. 327-336, 1994.
5157 .. [5] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
5158 Charles Griffin & Co., 1970.
5160 Examples
5161 --------
5162 >>> from scipy import stats
5163 >>> x1 = [12, 2, 1, 12, 2]
5164 >>> x2 = [1, 4, 7, 1, 0]
5165 >>> res = stats.kendalltau(x1, x2)
5166 >>> res.statistic
5167 -0.47140452079103173
5168 >>> res.pvalue
5169 0.2827454599327748
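A sketch of the tau-c variant defined in the Notes; for this data the
smaller number of unique values is 3, so the normalization differs from
tau-b (the statistic is rounded only to keep the output short):

>>> res_c = stats.kendalltau(x1, x2, variant='c')
>>> print(round(res_c.statistic, 2))
-0.48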
5171 """
5172 if initial_lexsort is not None:
5173 msg = ("'kendalltau' keyword argument 'initial_lexsort' is deprecated"
5174 " as it is unused and will be removed in SciPy 1.12.0.")
5175 warnings.warn(msg, DeprecationWarning, stacklevel=2)
5177 x = np.asarray(x).ravel()
5178 y = np.asarray(y).ravel()
5180 if x.size != y.size:
5181 raise ValueError("All inputs to `kendalltau` must be of the same "
5182 f"size, found x-size {x.size} and y-size {y.size}")
5183 elif not x.size or not y.size:
5184 # Return NaN if arrays are empty
5185 res = SignificanceResult(np.nan, np.nan)
5186 res.correlation = np.nan
5187 return res
5189 # check both x and y
5190 cnx, npx = _contains_nan(x, nan_policy)
5191 cny, npy = _contains_nan(y, nan_policy)
5192 contains_nan = cnx or cny
5193 if npx == 'omit' or npy == 'omit':
5194 nan_policy = 'omit'
5196 if contains_nan and nan_policy == 'propagate':
5197 res = SignificanceResult(np.nan, np.nan)
5198 res.correlation = np.nan
5199 return res
5201 elif contains_nan and nan_policy == 'omit':
5202 x = ma.masked_invalid(x)
5203 y = ma.masked_invalid(y)
5204 if variant == 'b':
5205 return mstats_basic.kendalltau(x, y, method=method, use_ties=True,
5206 alternative=alternative)
5207 else:
5208 message = ("nan_policy='omit' is currently compatible only with "
5209 "variant='b'.")
5210 raise ValueError(message)
5212 def count_rank_tie(ranks):
5213 cnt = np.bincount(ranks).astype('int64', copy=False)
5214 cnt = cnt[cnt > 1]
5215 return ((cnt * (cnt - 1) // 2).sum(),
5216 (cnt * (cnt - 1.) * (cnt - 2)).sum(),
5217 (cnt * (cnt - 1.) * (2*cnt + 5)).sum())
5219 size = x.size
5220 perm = np.argsort(y) # sort on y and convert y to dense ranks
5221 x, y = x[perm], y[perm]
5222 y = np.r_[True, y[1:] != y[:-1]].cumsum(dtype=np.intp)
5224 # stable sort on x and convert x to dense ranks
5225 perm = np.argsort(x, kind='mergesort')
5226 x, y = x[perm], y[perm]
5227 x = np.r_[True, x[1:] != x[:-1]].cumsum(dtype=np.intp)
5229 dis = _kendall_dis(x, y) # discordant pairs
5231 obs = np.r_[True, (x[1:] != x[:-1]) | (y[1:] != y[:-1]), True]
5232 cnt = np.diff(np.nonzero(obs)[0]).astype('int64', copy=False)
5234 ntie = (cnt * (cnt - 1) // 2).sum() # joint ties
5235 xtie, x0, x1 = count_rank_tie(x) # ties in x, stats
5236 ytie, y0, y1 = count_rank_tie(y) # ties in y, stats
5238 tot = (size * (size - 1)) // 2
5240 if xtie == tot or ytie == tot:
5241 res = SignificanceResult(np.nan, np.nan)
5242 res.correlation = np.nan
5243 return res
5245 # Note that tot = con + dis + (xtie - ntie) + (ytie - ntie) + ntie
5246 # = con + dis + xtie + ytie - ntie
5247 con_minus_dis = tot - xtie - ytie + ntie - 2 * dis
5248 if variant == 'b':
5249 tau = con_minus_dis / np.sqrt(tot - xtie) / np.sqrt(tot - ytie)
5250 elif variant == 'c':
5251 minclasses = min(len(set(x)), len(set(y)))
5252 tau = 2*con_minus_dis / (size**2 * (minclasses-1)/minclasses)
5253 else:
5254 raise ValueError(f"Unknown variant of the method chosen: {variant}. "
5255 "variant must be 'b' or 'c'.")
5257 # Limit range to fix computational errors
5258 tau = min(1., max(-1., tau))
5260 # The p-value calculation is the same for all variants since the p-value
5261 # depends only on con_minus_dis.
5262 if method == 'exact' and (xtie != 0 or ytie != 0):
5263 raise ValueError("Ties found, exact method cannot be used.")
5265 if method == 'auto':
5266 if (xtie == 0 and ytie == 0) and (size <= 33 or
5267 min(dis, tot-dis) <= 1):
5268 method = 'exact'
5269 else:
5270 method = 'asymptotic'
5272 if xtie == 0 and ytie == 0 and method == 'exact':
5273 pvalue = mstats_basic._kendall_p_exact(size, tot-dis, alternative)
5274 elif method == 'asymptotic':
5275 # con_minus_dis is approx normally distributed with this variance [3]_
5276 m = size * (size - 1.)
5277 var = ((m * (2*size + 5) - x1 - y1) / 18 +
5278 (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))
5279 z = con_minus_dis / np.sqrt(var)
5280 _, pvalue = _normtest_finish(z, alternative)
5281 else:
5282 raise ValueError(f"Unknown method {method} specified. Use 'auto', "
5283 "'exact' or 'asymptotic'.")
5285 # create result object with alias for backward compatibility
5286 res = SignificanceResult(tau, pvalue)
5287 res.correlation = tau
5288 return res
5291def weightedtau(x, y, rank=True, weigher=None, additive=True):
5292 r"""Compute a weighted version of Kendall's :math:`\tau`.
5294 The weighted :math:`\tau` is a weighted version of Kendall's
5295 :math:`\tau` in which exchanges of high weight are more influential than
5296 exchanges of low weight. The default parameters compute the additive
5297 hyperbolic version of the index, :math:`\tau_\mathrm h`, which has
5298 been shown to provide the best balance between important and
5299 unimportant elements [1]_.
5301 The weighting is defined by means of a rank array, which assigns a
5302 nonnegative rank to each element (higher importance ranks being
5303 associated with smaller values, e.g., 0 is the highest possible rank),
5304 and a weigher function, which assigns a weight based on the rank to
5305 each element. The weight of an exchange is then the sum or the product
5306 of the weights of the ranks of the exchanged elements. The default
5307 parameters compute :math:`\tau_\mathrm h`: an exchange between
5308 elements with rank :math:`r` and :math:`s` (starting from zero) has
5309 weight :math:`1/(r+1) + 1/(s+1)`.
5311 Specifying a rank array is meaningful only if you have in mind an
5312 external criterion of importance. If, as it usually happens, you do
5313 not have in mind a specific rank, the weighted :math:`\tau` is
5314 defined by averaging the values obtained using the decreasing
5315 lexicographical rank by (`x`, `y`) and by (`y`, `x`). This is the
5316 behavior with default parameters. Note that the convention used
5317 here for ranking (lower values imply higher importance) is opposite
5318 to that used by other SciPy statistical functions.
5320 Parameters
5321 ----------
5322 x, y : array_like
5323 Arrays of scores, of the same shape. If arrays are not 1-D, they will
5324 be flattened to 1-D.
5325 rank : array_like of ints or bool, optional
5326 A nonnegative rank assigned to each element. If it is None, the
5327 decreasing lexicographical rank by (`x`, `y`) will be used: elements of
5328 higher rank will be those with larger `x`-values, using `y`-values to
5329 break ties (in particular, swapping `x` and `y` will give a different
5330 result). If it is False, the element indices will be used
5331 directly as ranks. The default is True, in which case this
5332 function returns the average of the values obtained using the
5333 decreasing lexicographical rank by (`x`, `y`) and by (`y`, `x`).
5334 weigher : callable, optional
5335 The weigher function. Must map nonnegative integers (zero
5336 representing the most important element) to a nonnegative weight.
5337 The default, None, provides hyperbolic weighting, that is,
5338 rank :math:`r` is mapped to weight :math:`1/(r+1)`.
5339 additive : bool, optional
5340 If True, the weight of an exchange is computed by adding the
5341 weights of the ranks of the exchanged elements; otherwise, the weights
5342 are multiplied. The default is True.
5344 Returns
5345 -------
5346 res: SignificanceResult
5347 An object containing attributes:
5349 statistic : float
5350 The weighted :math:`\tau` correlation index.
5351 pvalue : float
5352 Presently ``np.nan``, as the null distribution of the statistic is
5353 unknown (even in the additive hyperbolic case).
5355 See Also
5356 --------
5357 kendalltau : Calculates Kendall's tau.
5358 spearmanr : Calculates a Spearman rank-order correlation coefficient.
5359 theilslopes : Computes the Theil-Sen estimator for a set of points (x, y).
5361 Notes
5362 -----
5363 This function uses an :math:`O(n \log n)`, mergesort-based algorithm
5364 [1]_ that is a weighted extension of Knight's algorithm for Kendall's
5365 :math:`\tau` [2]_. It can compute Shieh's weighted :math:`\tau` [3]_
5366 between rankings without ties (i.e., permutations) by setting
5367 `additive` and `rank` to False, as the definition given in [1]_ is a
5368 generalization of Shieh's.
5370 NaNs are considered the smallest possible score.
5372 .. versionadded:: 0.19.0
5374 References
5375 ----------
5376 .. [1] Sebastiano Vigna, "A weighted correlation index for rankings with
5377 ties", Proceedings of the 24th international conference on World
5378 Wide Web, pp. 1166-1176, ACM, 2015.
5379 .. [2] W.R. Knight, "A Computer Method for Calculating Kendall's Tau with
5380 Ungrouped Data", Journal of the American Statistical Association,
5381 Vol. 61, No. 314, Part 1, pp. 436-439, 1966.
5382 .. [3] Grace S. Shieh. "A weighted Kendall's tau statistic", Statistics &
5383 Probability Letters, Vol. 39, No. 1, pp. 17-24, 1998.
5385 Examples
5386 --------
5387 >>> import numpy as np
5388 >>> from scipy import stats
5389 >>> x = [12, 2, 1, 12, 2]
5390 >>> y = [1, 4, 7, 1, 0]
5391 >>> res = stats.weightedtau(x, y)
5392 >>> res.statistic
5393 -0.56694968153682723
5394 >>> res.pvalue
5395 nan
5396 >>> res = stats.weightedtau(x, y, additive=False)
5397 >>> res.statistic
5398 -0.62205716951801038
5400 NaNs are considered the smallest possible score:
5402 >>> x = [12, 2, 1, 12, 2]
5403 >>> y = [1, 4, 7, 1, np.nan]
5404 >>> res = stats.weightedtau(x, y)
5405 >>> res.statistic
5406 -0.56694968153682723
5408 This is exactly Kendall's tau:
5410 >>> x = [12, 2, 1, 12, 2]
5411 >>> y = [1, 4, 7, 1, 0]
5412 >>> res = stats.weightedtau(x, y, weigher=lambda x: 1)
5413 >>> res.statistic
5414 -0.47140452079103173
5416 >>> x = [12, 2, 1, 12, 2]
5417 >>> y = [1, 4, 7, 1, 0]
5418 >>> stats.weightedtau(x, y, rank=None)
5419 SignificanceResult(statistic=-0.4157652301037516, pvalue=nan)
5420 >>> stats.weightedtau(y, x, rank=None)
5421 SignificanceResult(statistic=-0.7181341329699028, pvalue=nan)
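A custom `weigher` may also be supplied. A minimal sketch using a steeper,
quadratic hyperbolic weighting (the bound check below holds because the
index is normalized; the p-value is always ``nan``):

>>> x = [12, 2, 1, 12, 2]
>>> y = [1, 4, 7, 1, 0]
>>> res = stats.weightedtau(x, y, weigher=lambda r: 1/(r + 1)**2)
>>> bool(-1.0 <= res.statistic <= 1.0)
True
>>> res.pvalue
nan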
5423 """
5424 x = np.asarray(x).ravel()
5425 y = np.asarray(y).ravel()
5427 if x.size != y.size:
5428 raise ValueError("All inputs to `weightedtau` must be "
5429 "of the same size, "
5430 "found x-size %s and y-size %s" % (x.size, y.size))
5431 if not x.size:
5432 # Return NaN if arrays are empty
5433 res = SignificanceResult(np.nan, np.nan)
5434 res.correlation = np.nan
5435 return res
5437 # If there are NaNs we apply _toint64()
5438 if np.isnan(np.sum(x)):
5439 x = _toint64(x)
5440 if np.isnan(np.sum(y)):
5441 y = _toint64(y)
5443 # Reduce unsupported types to ranks
5444 if x.dtype != y.dtype:
5445 if x.dtype != np.int64:
5446 x = _toint64(x)
5447 if y.dtype != np.int64:
5448 y = _toint64(y)
5449 else:
5450 if x.dtype not in (np.int32, np.int64, np.float32, np.float64):
5451 x = _toint64(x)
5452 y = _toint64(y)
5454 if rank is True:
5455 tau = (
5456 _weightedrankedtau(x, y, None, weigher, additive) +
5457 _weightedrankedtau(y, x, None, weigher, additive)
5458 ) / 2
5459 res = SignificanceResult(tau, np.nan)
5460 res.correlation = tau
5461 return res
5463 if rank is False:
5464 rank = np.arange(x.size, dtype=np.intp)
5465 elif rank is not None:
5466 rank = np.asarray(rank).ravel()
5467 if rank.size != x.size:
5468 raise ValueError(
5469 "All inputs to `weightedtau` must be of the same size, "
5470 "found x-size %s and rank-size %s" % (x.size, rank.size)
5471 )
5473 tau = _weightedrankedtau(x, y, rank, weigher, additive)
5474 res = SignificanceResult(tau, np.nan)
5475 res.correlation = tau
5476 return res
5479# FROM MGCPY: https://github.com/neurodata/mgcpy
5482class _ParallelP:
5483 """Helper function to calculate parallel p-value."""
5485 def __init__(self, x, y, random_states):
5486 self.x = x
5487 self.y = y
5488 self.random_states = random_states
5490 def __call__(self, index):
5491 order = self.random_states[index].permutation(self.y.shape[0])
5492 permy = self.y[order][:, order]
5494 # calculate permuted stats, store in null distribution
5495 perm_stat = _mgc_stat(self.x, permy)[0]
5497 return perm_stat
5500def _perm_test(x, y, stat, reps=1000, workers=-1, random_state=None):
5501 r"""Helper function that calculates the p-value. See below for uses.
5503 Parameters
5504 ----------
5505 x, y : ndarray
5506 `x` and `y` have shapes `(n, p)` and `(n, q)`.
5507 stat : float
5508 The sample test statistic.
5509 reps : int, optional
5510 The number of replications used to estimate the null when using the
5511 permutation test. The default is 1000 replications.
5512 workers : int or map-like callable, optional
5513 If `workers` is an int the population is subdivided into `workers`
5514 sections and evaluated in parallel (uses
5515 `multiprocessing.Pool <multiprocessing>`). Supply `-1` to use all cores
5516 available to the Process. Alternatively supply a map-like callable,
5517 such as `multiprocessing.Pool.map` for evaluating the population in
5518 parallel. This evaluation is carried out as `workers(func, iterable)`.
5519 Requires that `func` be pickleable.
5520 random_state : {None, int, `numpy.random.Generator`,
5521 `numpy.random.RandomState`}, optional
5523 If `random_state` is None (or `np.random`), the `numpy.random.RandomState`
5524 singleton is used.
5525 If `random_state` is an int, a new ``RandomState`` instance is used,
5526 seeded with `random_state`.
5527 If `random_state` is already a ``Generator`` or ``RandomState`` instance then
5528 that instance is used.
5530 Returns
5531 -------
5532 pvalue : float
5533 The sample test p-value.
5534 null_dist : list
5535 The approximated null distribution.
5537 """
5538 # generate seeds for each rep (change to new parallel random number
5539 # capabilities in numpy >= 1.17+)
5540 random_state = check_random_state(random_state)
5541 random_states = [np.random.RandomState(rng_integers(random_state, 1 << 32,
5542 size=4, dtype=np.uint32)) for _ in range(reps)]
5544 # parallelizes with specified workers over number of reps and set seeds
5545 parallelp = _ParallelP(x=x, y=y, random_states=random_states)
5546 with MapWrapper(workers) as mapwrapper:
5547 null_dist = np.array(list(mapwrapper(parallelp, range(reps))))
5549 # calculate the p-value from the null distribution
5550 pvalue = (1 + (null_dist >= stat).sum()) / (1 + reps)
5552 return pvalue, null_dist
5555def _euclidean_dist(x):
5556 return cdist(x, x)
5559MGCResult = _make_tuple_bunch('MGCResult',
5560 ['statistic', 'pvalue', 'mgc_dict'], [])
5563def multiscale_graphcorr(x, y, compute_distance=_euclidean_dist, reps=1000,
5564 workers=1, is_twosamp=False, random_state=None):
5565 r"""Computes the Multiscale Graph Correlation (MGC) test statistic.
5567 Specifically, for each point, MGC finds the :math:`k`-nearest neighbors for
5568 one property (e.g. cloud density), and the :math:`l`-nearest neighbors for
5569 the other property (e.g. grass wetness) [1]_. This pair :math:`(k, l)` is
5570 called the "scale". A priori, however, it is not known which scales will be
5571 most informative. So, MGC computes all distance pairs, and then efficiently
5572 computes the distance correlations for all scales. The local correlations
5573 illustrate which scales are relatively informative about the relationship.
5574 The key, therefore, to successfully discover and decipher relationships
5575 between disparate data modalities is to adaptively determine which scales
5576 are the most informative, and the geometric implication for the most
5577 informative scales. Doing so not only provides an estimate of whether the
5578 modalities are related, but also provides insight into how the
5579 determination was made. This is especially important in high-dimensional
5580 data, where simple visualizations do not reveal relationships to the
5581 unaided human eye. Characterizations of this implementation in particular
5582 have been derived from and benchmarked in [2]_.
5584 Parameters
5585 ----------
5586 x, y : ndarray
5587 If ``x`` and ``y`` have shapes ``(n, p)`` and ``(n, q)`` where `n` is
5588 the number of samples and `p` and `q` are the number of dimensions,
5589 then the MGC independence test will be run. Alternatively, ``x`` and
5590 ``y`` can have shapes ``(n, n)`` if they are distance or similarity
5591 matrices, and ``compute_distance`` must be set to ``None``. If ``x``
5592 and ``y`` have shapes ``(n, p)`` and ``(m, p)``, an unpaired
5593 two-sample MGC test will be run.
5594 compute_distance : callable, optional
5595 A function that computes the distance or similarity among the samples
5596 within each data matrix. Set to ``None`` if ``x`` and ``y`` are
5597 already distance matrices. The default uses the euclidean norm metric.
5598 If you are calling a custom function, either create the distance
5599 matrix beforehand or create a function of the form
5600 ``compute_distance(x)`` where `x` is the data matrix for which
5601 pairwise distances are calculated.
5602 reps : int, optional
5603 The number of replications used to estimate the null when using the
5604 permutation test. The default is ``1000``.
5605 workers : int or map-like callable, optional
5606 If ``workers`` is an int the population is subdivided into ``workers``
5607 sections and evaluated in parallel (uses ``multiprocessing.Pool
5608 <multiprocessing>``). Supply ``-1`` to use all cores available to the
5609 Process. Alternatively supply a map-like callable, such as
5610 ``multiprocessing.Pool.map`` for evaluating the p-value in parallel.
5611 This evaluation is carried out as ``workers(func, iterable)``.
5612 Requires that `func` be pickleable. The default is ``1``.
5613 is_twosamp : bool, optional
5614 If `True`, a two sample test will be run. If ``x`` and ``y`` have
5615 shapes ``(n, p)`` and ``(m, p)``, this option will be overridden and
5616 set to ``True``. Set to ``True`` if ``x`` and ``y`` both have shapes
5617 ``(n, p)`` and a two sample test is desired. The default is ``False``.
5618 Note that this will not run if inputs are distance matrices.
5619 random_state : {None, int, `numpy.random.Generator`,
5620 `numpy.random.RandomState`}, optional
5622 If `random_state` is None (or `np.random`), the `numpy.random.RandomState`
5623 singleton is used.
5624 If `random_state` is an int, a new ``RandomState`` instance is used,
5625 seeded with `random_state`.
5626 If `random_state` is already a ``Generator`` or ``RandomState`` instance then
5627 that instance is used.
5629 Returns
5630 -------
5631 res : MGCResult
5632 An object containing attributes:
5634 statistic : float
5635 The sample MGC test statistic within `[-1, 1]`.
5636 pvalue : float
5637 The p-value obtained via permutation.
5638 mgc_dict : dict
5639 Contains additional useful results:
5641 - mgc_map : ndarray
5642 A 2D representation of the latent geometry of the
5643 relationship.
5644 - opt_scale : (int, int)
5645 The estimated optimal scale as a `(x, y)` pair.
5646 - null_dist : list
5647 The null distribution derived from the permuted matrices.
5649 See Also
5650 --------
5651 pearsonr : Pearson correlation coefficient and p-value for testing
5652 non-correlation.
5653 kendalltau : Calculates Kendall's tau.
5654 spearmanr : Calculates a Spearman rank-order correlation coefficient.
5656 Notes
5657 -----
5658 A description of the process of MGC and applications on neuroscience data
5659 can be found in [1]_. It is performed using the following steps:
5661 #. Two distance matrices :math:`D^X` and :math:`D^Y` are computed and
5662 modified to be mean zero columnwise. This results in two
5663 :math:`n \times n` distance matrices :math:`A` and :math:`B` (the
5664 centering and unbiased modification) [3]_.
5666 #. For all values :math:`k` and :math:`l` from :math:`1, ..., n`,
5668 * The :math:`k`-nearest neighbor and :math:`l`-nearest neighbor graphs
5669 are calculated for each property. Here, :math:`G_k (i, j)` indicates
5670 the :math:`k`-smallest values of the :math:`i`-th row of :math:`A`
5671 and :math:`H_l (i, j)` indicates the :math:`l`-smallest values of
5672 the :math:`i`-th row of :math:`B`
5674 * Let :math:`\circ` denote the entry-wise matrix product; then local
5675 correlations are summed and normalized using the following statistic:
5677 .. math::
5679 c^{kl} = \frac{\sum_{ij} A G_k B H_l}
5680 {\sqrt{\sum_{ij} A^2 G_k \times \sum_{ij} B^2 H_l}}
5682 #. The MGC test statistic is the smoothed optimal local correlation of
5683 :math:`\{ c^{kl} \}`. Denote the smoothing operation as :math:`R(\cdot)`,
5684 which essentially sets all isolated large correlations to 0 and leaves
5685 connected large correlations unchanged (see [3]_). MGC is,
5687 .. math::
5689 MGC_n (x, y) = \max_{(k, l)} R \left(c^{kl} \left( x_n, y_n \right)
5690 \right)
5692 The test statistic returns a value between :math:`(-1, 1)` since it is
5693 normalized.
5695 The p-value returned is calculated using a permutation test. This process
5696 is completed by first randomly permuting :math:`y` to estimate the null
5697 distribution and then calculating the probability of observing a test
5698 statistic, under the null, at least as extreme as the observed test
5699 statistic.
5701 MGC requires at least 5 samples to run with reliable results. It can also
5702 handle high-dimensional data sets.
5703 In addition, by manipulating the input data matrices, the two-sample
5704 testing problem can be reduced to the independence testing problem [4]_.
5705 Given sample data :math:`U` and :math:`V` of sizes :math:`p \times n` and
5706 :math:`p \times m`, data matrices :math:`X` and :math:`Y` can be created as
5707 follows:
5709 .. math::
5711 X = [U | V] \in \mathcal{R}^{p \times (n + m)}
5712 Y = [0_{1 \times n} | 1_{1 \times m}] \in \mathcal{R}^{(n + m)}
5714 Then, the MGC statistic can be calculated as normal. This methodology can
5715 be extended to similar tests such as distance correlation [4]_.
5717 .. versionadded:: 1.4.0
5719 References
5720 ----------
5721 .. [1] Vogelstein, J. T., Bridgeford, E. W., Wang, Q., Priebe, C. E.,
5722 Maggioni, M., & Shen, C. (2019). Discovering and deciphering
5723 relationships across disparate data modalities. ELife.
5724 .. [2] Panda, S., Palaniappan, S., Xiong, J., Swaminathan, A.,
5725 Ramachandran, S., Bridgeford, E. W., ... Vogelstein, J. T. (2019).
5726 mgcpy: A Comprehensive High Dimensional Independence Testing Python
5727 Package. :arXiv:`1907.02088`
5728 .. [3] Shen, C., Priebe, C.E., & Vogelstein, J. T. (2019). From distance
5729 correlation to multiscale graph correlation. Journal of the American
5730 Statistical Association.
5731 .. [4] Shen, C. & Vogelstein, J. T. (2018). The Exact Equivalence of
5732 Distance and Kernel Methods for Hypothesis Testing.
5733 :arXiv:`1806.05514`
5735 Examples
5736 --------
5737 >>> import numpy as np
5738 >>> from scipy.stats import multiscale_graphcorr
5739 >>> x = np.arange(100)
5740 >>> y = x
5741 >>> res = multiscale_graphcorr(x, y)
5742 >>> res.statistic, res.pvalue
5743 (1.0, 0.001)
5745 To run an unpaired two-sample test,
5747 >>> x = np.arange(100)
5748 >>> y = np.arange(79)
5749 >>> res = multiscale_graphcorr(x, y)
5750 >>> res.statistic, res.pvalue # doctest: +SKIP
5751 (0.033258146255703246, 0.023)
5753 or, if shape of the inputs are the same,
5755 >>> x = np.arange(100)
5756 >>> y = x
5757 >>> res = multiscale_graphcorr(x, y, is_twosamp=True)
5758 >>> res.statistic, res.pvalue # doctest: +SKIP
5759 (-0.008021809890200488, 1.0)
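If pairwise distances have already been computed, they may be passed
directly by setting `compute_distance` to ``None``. A sketch reusing the
first example above; the p-value is estimated by random permutation, so the
output is not checked:

>>> from scipy.spatial.distance import cdist
>>> x = np.arange(100).reshape(-1, 1)
>>> dx = cdist(x, x)
>>> res = multiscale_graphcorr(dx, dx, compute_distance=None)
>>> res.statistic, res.pvalue  # doctest: +SKIP
(1.0, 0.001)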
5761 """
5762 if not isinstance(x, np.ndarray) or not isinstance(y, np.ndarray):
5763 raise ValueError("x and y must be ndarrays")
5765 # convert arrays of type (n,) to (n, 1)
5766 if x.ndim == 1:
5767 x = x[:, np.newaxis]
5768 elif x.ndim != 2:
5769 raise ValueError("Expected a 2-D array `x`, found shape "
5770 "{}".format(x.shape))
5771 if y.ndim == 1:
5772 y = y[:, np.newaxis]
5773 elif y.ndim != 2:
5774 raise ValueError("Expected a 2-D array `y`, found shape "
5775 "{}".format(y.shape))
5777 nx, px = x.shape
5778 ny, py = y.shape
5780 # check for NaNs
5781 _contains_nan(x, nan_policy='raise')
5782 _contains_nan(y, nan_policy='raise')
5784 # check for positive or negative infinity and raise error
5785 if np.sum(np.isinf(x)) > 0 or np.sum(np.isinf(y)) > 0:
5786 raise ValueError("Inputs contain infinities")
5788 if nx != ny:
5789 if px == py:
5790 # reshape x and y for two sample testing
5791 is_twosamp = True
5792 else:
5793 raise ValueError("Shape mismatch, x and y must have shape [n, p] "
5794 "and [n, q] or have shape [n, p] and [m, p].")
5796 if nx < 5 or ny < 5:
5797 raise ValueError("MGC requires at least 5 samples to give reasonable "
5798 "results.")
5800 # convert x and y to float
5801 x = x.astype(np.float64)
5802 y = y.astype(np.float64)
5804 # check that compute_distance is a callable
5805 if not callable(compute_distance) and compute_distance is not None:
5806 raise ValueError("Compute_distance must be a function.")
5808 # check that reps is a non-negative integer (a warning is raised if it
5809 # is under 1000)
5810 if not isinstance(reps, int) or reps < 0:
5811 raise ValueError("Number of reps must be an integer greater than 0.")
5812 elif reps < 1000:
5813 msg = ("The number of replications is low (under 1000), and p-value "
5814 "calculations may be unreliable. Use the p-value result, with "
5815 "caution!")
5816 warnings.warn(msg, RuntimeWarning)
5818 if is_twosamp:
5819 if compute_distance is None:
5820 raise ValueError("Cannot run if inputs are distance matrices")
5821 x, y = _two_sample_transform(x, y)
5823 if compute_distance is not None:
5824 # compute distance matrices for x and y
5825 x = compute_distance(x)
5826 y = compute_distance(y)
5828 # calculate MGC stat
5829 stat, stat_dict = _mgc_stat(x, y)
5830 stat_mgc_map = stat_dict["stat_mgc_map"]
5831 opt_scale = stat_dict["opt_scale"]
5833 # calculate permutation MGC p-value
5834 pvalue, null_dist = _perm_test(x, y, stat, reps=reps, workers=workers,
5835 random_state=random_state)
5837 # save all stats (other than stat/p-value) in dictionary
5838 mgc_dict = {"mgc_map": stat_mgc_map,
5839 "opt_scale": opt_scale,
5840 "null_dist": null_dist}
5842 # create result object with alias for backward compatibility
5843 res = MGCResult(stat, pvalue, mgc_dict)
5844 res.stat = stat
5845 return res
5848def _mgc_stat(distx, disty):
5849 r"""Helper function that calculates the MGC stat. See above for use.
5851 Parameters
5852 ----------
5853 distx, disty : ndarray
5854 `distx` and `disty` have shapes `(n, p)` and `(n, q)` or
5855 `(n, n)` and `(n, n)`
5856 if distance matrices.
5858 Returns
5859 -------
5860 stat : float
5861 The sample MGC test statistic within `[-1, 1]`.
5862 stat_dict : dict
5863 Contains additional useful returns with the following
5864 keys:
5866 - stat_mgc_map : ndarray
5867 MGC-map of the statistics.
5868 - opt_scale : (float, float)
5869 The estimated optimal scale as a `(x, y)` pair.
5871 """
5872 # calculate MGC map and optimal scale
5873 stat_mgc_map = _local_correlations(distx, disty, global_corr='mgc')
5875 n, m = stat_mgc_map.shape
5876 if m == 1 or n == 1:
5877 # the global scale is the statistic calculated at maximal nearest
5878 # neighbors. There is not enough local scale to search over, so
5879 # default to global scale
5880 stat = stat_mgc_map[m - 1][n - 1]
5881 opt_scale = m * n
5882 else:
5883 samp_size = len(distx) - 1
5885 # threshold to find connected region of significant local correlations
5886 sig_connect = _threshold_mgc_map(stat_mgc_map, samp_size)
5888 # maximum within the significant region
5889 stat, opt_scale = _smooth_mgc_map(sig_connect, stat_mgc_map)
5891 stat_dict = {"stat_mgc_map": stat_mgc_map,
5892 "opt_scale": opt_scale}
5894 return stat, stat_dict
5897def _threshold_mgc_map(stat_mgc_map, samp_size):
5898 r"""
5899 Finds a connected region of significance in the MGC-map by thresholding.
5901 Parameters
5902 ----------
5903 stat_mgc_map : ndarray
5904 All local correlations within `[-1,1]`.
5905 samp_size : int
5906 The sample size of original data.
5908 Returns
5909 -------
5910 sig_connect : ndarray
5911 A binary matrix with 1's indicating the significant region.
5913 """
5914 m, n = stat_mgc_map.shape
5916 # 0.02 is simply an empirical threshold; it can be set to 0.01 or 0.05
5917 # with varying levels of performance. Threshold is based on a beta
5918 # approximation.
5919 per_sig = 1 - (0.02 / samp_size) # Percentile to consider as significant
5920 threshold = samp_size * (samp_size - 3)/4 - 1/2 # Beta approximation
5921 threshold = distributions.beta.ppf(per_sig, threshold, threshold) * 2 - 1
5923 # the global scale is the statistic calculated at maximal nearest
5924 # neighbors. Threshold is the maximum on the global and local scales
5925 threshold = max(threshold, stat_mgc_map[m - 1][n - 1])
5927 # find the largest connected component of significant correlations
5928 sig_connect = stat_mgc_map > threshold
5929 if np.sum(sig_connect) > 0:
5930 sig_connect, _ = _measurements.label(sig_connect)
5931 _, label_counts = np.unique(sig_connect, return_counts=True)
5933 # skip the first element in label_counts, as it is count(zeros)
5934 max_label = np.argmax(label_counts[1:]) + 1
5935 sig_connect = sig_connect == max_label
5936 else:
5937 sig_connect = np.array([[False]])
5939 return sig_connect
5942def _smooth_mgc_map(sig_connect, stat_mgc_map):
5943 """Finds the smoothed maximal within the significant region R.
5945 If area of R is too small it returns the last local correlation. Otherwise,
5946 returns the maximum within significant_connected_region.
5948 Parameters
5949 ----------
5950 sig_connect : ndarray
5951 A binary matrix with 1's indicating the significant region.
5952 stat_mgc_map : ndarray
5953 All local correlations within `[-1, 1]`.
5955 Returns
5956 -------
5957 stat : float
5958 The sample MGC statistic within `[-1, 1]`.
5959 opt_scale: (float, float)
5960 The estimated optimal scale as an `(x, y)` pair.
5962 """
5963 m, n = stat_mgc_map.shape
5965 # the global scale is the statistic calculated at maximal nearest
5966 # neighbors. By default, statistic and optimal scale are global.
5967 stat = stat_mgc_map[m - 1][n - 1]
5968 opt_scale = [m, n]
5970 if np.linalg.norm(sig_connect) != 0:
5971 # proceed only when the connected region's area is sufficiently large
5972 # 0.02 is simply an empirical threshold; it can be set to 0.01 or 0.05
5973 # with varying levels of performance
5974 if np.sum(sig_connect) >= np.ceil(0.02 * max(m, n)) * min(m, n):
5975 max_corr = max(stat_mgc_map[sig_connect])
5977 # find all scales within significant_connected_region that maximize
5978 # the local correlation
5979 max_corr_index = np.where((stat_mgc_map >= max_corr) & sig_connect)
5981 if max_corr >= stat:
5982 stat = max_corr
5984 k, l = max_corr_index
5985 one_d_indices = k * n + l # 2D to 1D indexing
5986 k = np.max(one_d_indices) // n
5987 l = np.max(one_d_indices) % n
5988 opt_scale = [k+1, l+1] # adding 1s to match R indexing
5990 return stat, opt_scale
5993def _two_sample_transform(u, v):
5994 """Helper function that concatenates x and y for two sample MGC stat.
5996 See above for use.
5998 Parameters
5999 ----------
6000 u, v : ndarray
6001 `u` and `v` have shapes `(n, p)` and `(m, p)`.
6003 Returns
6004 -------
6005 x : ndarray
6006 Concatenation of `u` and `v` along ``axis=0``. `x` thus has shape
6007 `(n + m, p)`.
6008 y : ndarray
6009 Label matrix for `x` where 0 refers to samples that come from `u` and
6010 1 refers to samples that come from `v`. `y` thus has shape `(n + m, 1)`.
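Examples
--------
A minimal sketch of the shapes produced:

>>> import numpy as np
>>> u, v = np.ones((3, 2)), np.zeros((2, 2))
>>> x, y = _two_sample_transform(u, v)
>>> x.shape, y.shape
((5, 2), (5, 1))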
6012 """
6013 nx = u.shape[0]
6014 ny = v.shape[0]
6015 x = np.concatenate([u, v], axis=0)
6016 y = np.concatenate([np.zeros(nx), np.ones(ny)], axis=0).reshape(-1, 1)
6017 return x, y
6020#####################################
6021# INFERENTIAL STATISTICS #
6022#####################################
6024TtestResultBase = _make_tuple_bunch('TtestResultBase',
6025 ['statistic', 'pvalue'], ['df'])
6028class TtestResult(TtestResultBase):
6029 """
6030 Result of a t-test.
6032 See the documentation of the particular t-test function for more
6033 information about the definition of the statistic and meaning of
6034 the confidence interval.
6036 Attributes
6037 ----------
6038 statistic : float or array
6039 The t-statistic of the sample.
6040 pvalue : float or array
6041 The p-value associated with the given alternative.
6042 df : float or array
6043 The number of degrees of freedom used in calculation of the
6044 t-statistic; this is one less than the size of the sample
6045 (``a.shape[axis]-1`` if there are no masked elements or omitted NaNs).
6047 Methods
6048 -------
6049 confidence_interval
6050 Computes a confidence interval around the population statistic
6051 for the given confidence level.
6052 The confidence interval is returned in a ``namedtuple`` with
6053 fields `low` and `high`.
6055 """
6057 def __init__(self, statistic, pvalue, df, # public
6058 alternative, standard_error, estimate): # private
6059 super().__init__(statistic, pvalue, df=df)
6060 self._alternative = alternative
6061 self._standard_error = standard_error # denominator of t-statistic
6062 self._estimate = estimate # point estimate of sample mean
6064 def confidence_interval(self, confidence_level=0.95):
6065 """
6066 Parameters
6067 ----------
6068 confidence_level : float
6069 The confidence level for the calculation of the population mean
6070 confidence interval. Default is 0.95.
6072 Returns
6073 -------
6074 ci : namedtuple
6075 The confidence interval is returned in a ``namedtuple`` with
6076 fields `low` and `high`.
6078 """
6079 low, high = _t_confidence_interval(self.df, self.statistic,
6080 confidence_level, self._alternative)
6081 low = low * self._standard_error + self._estimate
6082 high = high * self._standard_error + self._estimate
6083 return ConfidenceInterval(low=low, high=high)
6086def pack_TtestResult(statistic, pvalue, df, alternative, standard_error,
6087 estimate):
6088 # this could be any number of dimensions (including 0d), but there is
6089 # at most one unique value
6090 alternative = np.atleast_1d(alternative).ravel()
6091 alternative = alternative[0] if alternative.size else np.nan
6092 return TtestResult(statistic, pvalue, df=df, alternative=alternative,
6093 standard_error=standard_error, estimate=estimate)
6096def unpack_TtestResult(res):
6097 return (res.statistic, res.pvalue, res.df, res._alternative,
6098 res._standard_error, res._estimate)
6101@_axis_nan_policy_factory(pack_TtestResult, default_axis=0, n_samples=2,
6102 result_to_tuple=unpack_TtestResult, n_outputs=6)
6103def ttest_1samp(a, popmean, axis=0, nan_policy='propagate',
6104 alternative="two-sided"):
6105 """Calculate the T-test for the mean of ONE group of scores.
6107 This is a test for the null hypothesis that the expected value
6108 (mean) of a sample of independent observations `a` is equal to the given
6109 population mean, `popmean`.
6111 Parameters
6112 ----------
6113 a : array_like
6114 Sample observation.
6115 popmean : float or array_like
6116 Expected value in null hypothesis. If array_like, then its length along
6117 `axis` must equal 1, and it must otherwise be broadcastable with `a`.
6118 axis : int or None, optional
6119 Axis along which to compute test; default is 0. If None, compute over
6120 the whole array `a`.
6121 nan_policy : {'propagate', 'raise', 'omit'}, optional
6122 Defines how to handle when input contains nan.
6123 The following options are available (default is 'propagate'):
6125 * 'propagate': returns nan
6126 * 'raise': throws an error
6127 * 'omit': performs the calculations ignoring nan values
6129 alternative : {'two-sided', 'less', 'greater'}, optional
6130 Defines the alternative hypothesis.
6131 The following options are available (default is 'two-sided'):
6133 * 'two-sided': the mean of the underlying distribution of the sample
6134 is different than the given population mean (`popmean`)
6135 * 'less': the mean of the underlying distribution of the sample is
6136 less than the given population mean (`popmean`)
6137 * 'greater': the mean of the underlying distribution of the sample is
6138 greater than the given population mean (`popmean`)
6140 Returns
6141 -------
6142 result : `~scipy.stats._result_classes.TtestResult`
6143 An object with the following attributes:
6145 statistic : float or array
6146 The t-statistic.
6147 pvalue : float or array
6148 The p-value associated with the given alternative.
6149 df : float or array
6150 The number of degrees of freedom used in calculation of the
6151 t-statistic; this is one less than the size of the sample
6152 (``a.shape[axis]``).
6154 .. versionadded:: 1.10.0
6156 The object also has the following method:
6158 confidence_interval(confidence_level=0.95)
6159 Computes a confidence interval around the population
6160 mean for the given confidence level.
6161 The confidence interval is returned in a ``namedtuple`` with
6162 fields `low` and `high`.
6164 .. versionadded:: 1.10.0
6166 Notes
6167 -----
6168 The statistic is calculated as ``(np.mean(a) - popmean)/se``, where
6169 ``se`` is the standard error. Therefore, the statistic will be positive
6170 when the sample mean is greater than the population mean and negative when
6171 the sample mean is less than the population mean.
6173 Examples
6174 --------
6175 Suppose we wish to test the null hypothesis that the mean of a population
6176 is equal to 0.5. We choose a confidence level of 99%; that is, we will
6177 reject the null hypothesis in favor of the alternative if the p-value is
6178 less than 0.01.
6180 When testing random variates from the standard uniform distribution, which
6181 has a mean of 0.5, we expect the data to be consistent with the null
6182 hypothesis most of the time.
6184 >>> import numpy as np
6185 >>> from scipy import stats
6186 >>> rng = np.random.default_rng()
6187 >>> rvs = stats.uniform.rvs(size=50, random_state=rng)
6188 >>> stats.ttest_1samp(rvs, popmean=0.5)
6189 TtestResult(statistic=2.456308468440, pvalue=0.017628209047638, df=49)
6191 As expected, the p-value of 0.017 is not below our threshold of 0.01, so
6192 we cannot reject the null hypothesis.
6194 When testing data from the standard *normal* distribution, which has a mean
6195 of 0, we would expect the null hypothesis to be rejected.
6197 >>> rvs = stats.norm.rvs(size=50, random_state=rng)
6198 >>> stats.ttest_1samp(rvs, popmean=0.5)
6199 TtestResult(statistic=-7.433605518875, pvalue=1.416760157221e-09, df=49)
6201 Indeed, the p-value is lower than our threshold of 0.01, so we reject the
6202 null hypothesis in favor of the default "two-sided" alternative: the mean
6203 of the population is *not* equal to 0.5.
6205 However, suppose we were to test the null hypothesis against the
6206 one-sided alternative that the mean of the population is *greater* than
6207 0.5. Since the mean of the standard normal is less than 0.5, we would not
6208 expect the null hypothesis to be rejected.
6210 >>> stats.ttest_1samp(rvs, popmean=0.5, alternative='greater')
6211 TtestResult(statistic=-7.433605518875, pvalue=0.99999999929, df=49)
6213 Unsurprisingly, with a p-value greater than our threshold, we would not
6214 reject the null hypothesis.
6216 Note that when working with a confidence level of 99%, a true null
6217 hypothesis will be rejected approximately 1% of the time.
6219 >>> rvs = stats.uniform.rvs(size=(100, 50), random_state=rng)
6220 >>> res = stats.ttest_1samp(rvs, popmean=0.5, axis=1)
6221 >>> np.sum(res.pvalue < 0.01)
6222 1
6224 Indeed, even though all 100 samples above were drawn from the standard
6225 uniform distribution, which *does* have a population mean of 0.5, we would
6226 mistakenly reject the null hypothesis for one of them.
6228 `ttest_1samp` can also compute a confidence interval around the population
6229 mean.
6231 >>> rvs = stats.norm.rvs(size=50, random_state=rng)
6232 >>> res = stats.ttest_1samp(rvs, popmean=0)
6233 >>> ci = res.confidence_interval(confidence_level=0.95)
6234 >>> ci
6235 ConfidenceInterval(low=-0.3193887540880017, high=0.2898583388980972)
6237 The bounds of the 95% confidence interval are the
6238 minimum and maximum values of the parameter `popmean` for which the
6239 p-value of the test would be 0.05.
6241 >>> res = stats.ttest_1samp(rvs, popmean=ci.low)
6242 >>> np.testing.assert_allclose(res.pvalue, 0.05)
6243 >>> res = stats.ttest_1samp(rvs, popmean=ci.high)
6244 >>> np.testing.assert_allclose(res.pvalue, 0.05)
6246 Under certain assumptions about the population from which a sample
6247 is drawn, the confidence interval with confidence level 95% is expected
6248 to contain the true population mean in 95% of sample replications.
6250 >>> rvs = stats.norm.rvs(size=(50, 1000), loc=1, random_state=rng)
6251 >>> res = stats.ttest_1samp(rvs, popmean=0)
6252 >>> ci = res.confidence_interval()
6253 >>> contains_pop_mean = (ci.low < 1) & (ci.high > 1)
6254 >>> contains_pop_mean.sum()
6255 953
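As described under `popmean`, an array of hypothesized means (length 1
along `axis`, otherwise broadcastable with `a`) can be tested in one call;
a sketch in which the two hypothesized means are purely illustrative:

>>> rvs = stats.uniform.rvs(size=(2, 50), random_state=rng)
>>> popmean = np.array([[0.5], [0.4]])  # length 1 along `axis`
>>> res = stats.ttest_1samp(rvs, popmean=popmean, axis=1)
>>> res.statistic.shape
(2,)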
6257 """
6258 a, axis = _chk_asarray(a, axis)
6260 n = a.shape[axis]
6261 df = n - 1
6263 mean = np.mean(a, axis)
6264 try:
6265 popmean = np.squeeze(popmean, axis=axis)
6266 except ValueError as e:
6267 raise ValueError("`popmean.shape[axis]` must equal 1.") from e
6268 d = mean - popmean
6269 v = _var(a, axis, ddof=1)
6270 denom = np.sqrt(v / n)
6272 with np.errstate(divide='ignore', invalid='ignore'):
6273 t = np.divide(d, denom)
6274 t, prob = _ttest_finish(df, t, alternative)
6276 # when nan_policy='omit', `df` can be different for different axis-slices
6277 df = np.broadcast_to(df, t.shape)[()]
6278 # _axis_nan_policy decorator doesn't play well with strings
6279 alternative_num = {"less": -1, "two-sided": 0, "greater": 1}[alternative]
6280 return TtestResult(t, prob, df=df, alternative=alternative_num,
6281 standard_error=denom, estimate=mean)
6284def _t_confidence_interval(df, t, confidence_level, alternative):
6285 # Input validation on `alternative` is already done
6286 # We just need IV on confidence_level
6287 if confidence_level < 0 or confidence_level > 1:
6288 message = "`confidence_level` must be a number between 0 and 1."
6289 raise ValueError(message)
6291 if alternative < 0: # 'less'
6292 p = confidence_level
6293 low, high = np.broadcast_arrays(-np.inf, special.stdtrit(df, p))
6294 elif alternative > 0: # 'greater'
6295 p = 1 - confidence_level
6296 low, high = np.broadcast_arrays(special.stdtrit(df, p), np.inf)
6297 elif alternative == 0: # 'two-sided'
6298 tail_probability = (1 - confidence_level)/2
6299 p = tail_probability, 1-tail_probability
6300 # axis of p must be the zeroth and orthogonal to all the rest
6301 p = np.reshape(p, [2] + [1]*np.asarray(df).ndim)
6302 low, high = special.stdtrit(df, p)
6303 else: # alternative is NaN when input is empty (see _axis_nan_policy)
6304 p, nans = np.broadcast_arrays(t, np.nan)
6305 low, high = nans, nans
6307 return low[()], high[()]
6310def _ttest_finish(df, t, alternative):
6311 """Common code between all 3 t-test functions."""
6312 # We use ``stdtr`` directly here as it handles the case when ``nan``
6313 # values are present in the data and masked arrays are passed
6314 # while ``t.cdf`` emits runtime warnings. This way ``_ttest_finish``
6315 # can be shared between the ``stats`` and ``mstats`` versions.
6317 if alternative == 'less':
6318 pval = special.stdtr(df, t)
6319 elif alternative == 'greater':
6320 pval = special.stdtr(df, -t)
6321 elif alternative == 'two-sided':
6322 pval = special.stdtr(df, -np.abs(t))*2
6323 else:
6324 raise ValueError("alternative must be "
6325 "'less', 'greater' or 'two-sided'")
6327 if t.ndim == 0:
6328 t = t[()]
6329 if pval.ndim == 0:
6330 pval = pval[()]
6332 return t, pval
6335def _ttest_ind_from_stats(mean1, mean2, denom, df, alternative):
6337 d = mean1 - mean2
6338 with np.errstate(divide='ignore', invalid='ignore'):
6339 t = np.divide(d, denom)
6340 t, prob = _ttest_finish(df, t, alternative)
6342 return (t, prob)
6345def _unequal_var_ttest_denom(v1, n1, v2, n2):
6346 vn1 = v1 / n1
6347 vn2 = v2 / n2
6348 with np.errstate(divide='ignore', invalid='ignore'):
6349 df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))
6351 # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0).
6352 # Hence it doesn't matter what df is as long as it's not NaN.
6353 df = np.where(np.isnan(df), 1, df)
6354 denom = np.sqrt(vn1 + vn2)
6355 return df, denom
6358def _equal_var_ttest_denom(v1, n1, v2, n2):
6359 df = n1 + n2 - 2.0
6360 svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df
6361 denom = np.sqrt(svar * (1.0 / n1 + 1.0 / n2))
6362 return df, denom
6365Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue'))
6368def ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2,
6369 equal_var=True, alternative="two-sided"):
6370 r"""
6371 T-test for means of two independent samples from descriptive statistics.
6373 This is a test for the null hypothesis that two independent
6374 samples have identical average (expected) values.
6376 Parameters
6377 ----------
6378 mean1 : array_like
6379 The mean(s) of sample 1.
6380 std1 : array_like
6381 The corrected sample standard deviation of sample 1 (i.e. ``ddof=1``).
6382 nobs1 : array_like
6383 The number(s) of observations of sample 1.
6384 mean2 : array_like
6385 The mean(s) of sample 2.
6386 std2 : array_like
6387 The corrected sample standard deviation of sample 2 (i.e. ``ddof=1``).
6388 nobs2 : array_like
6389 The number(s) of observations of sample 2.
6390 equal_var : bool, optional
6391 If True (default), perform a standard independent 2 sample test
6392 that assumes equal population variances [1]_.
6393 If False, perform Welch's t-test, which does not assume equal
6394 population variance [2]_.
6395 alternative : {'two-sided', 'less', 'greater'}, optional
6396 Defines the alternative hypothesis.
6397 The following options are available (default is 'two-sided'):
6399 * 'two-sided': the means of the distributions are unequal.
6400 * 'less': the mean of the first distribution is less than the
6401 mean of the second distribution.
6402 * 'greater': the mean of the first distribution is greater than the
6403 mean of the second distribution.
6405 .. versionadded:: 1.6.0
6407 Returns
6408 -------
6409 statistic : float or array
6410 The calculated t-statistics.
6411 pvalue : float or array
6412 The two-tailed p-value.
6414 See Also
6415 --------
6416 scipy.stats.ttest_ind
6418 Notes
6419 -----
6420 The statistic is calculated as ``(mean1 - mean2)/se``, where ``se`` is the
6421 standard error. Therefore, the statistic will be positive when `mean1` is
6422 greater than `mean2` and negative when `mean1` is less than `mean2`.
6424 References
6425 ----------
6426 .. [1] https://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test
6428 .. [2] https://en.wikipedia.org/wiki/Welch%27s_t-test
6430 Examples
6431 --------
6432 Suppose we have the summary data for two samples, as follows (with the
6433 Sample Variance being the corrected sample variance)::
6435 Sample Sample
6436 Size Mean Variance
6437 Sample 1 13 15.0 87.5
6438 Sample 2 11 12.0 39.0
6440 Apply the t-test to this data (with the assumption that the population
6441 variances are equal):
6443 >>> import numpy as np
6444 >>> from scipy.stats import ttest_ind_from_stats
6445 >>> ttest_ind_from_stats(mean1=15.0, std1=np.sqrt(87.5), nobs1=13,
6446 ... mean2=12.0, std2=np.sqrt(39.0), nobs2=11)
6447 Ttest_indResult(statistic=0.9051358093310269, pvalue=0.3751996797581487)
6449 For comparison, here is the data from which those summary statistics
6450 were taken. With this data, we can compute the same result using
6451 `scipy.stats.ttest_ind`:
6453 >>> a = np.array([1, 3, 4, 6, 11, 13, 15, 19, 22, 24, 25, 26, 26])
6454 >>> b = np.array([2, 4, 6, 9, 11, 13, 14, 15, 18, 19, 21])
6455 >>> from scipy.stats import ttest_ind
6456 >>> ttest_ind(a, b)
6457 Ttest_indResult(statistic=0.905135809331027, pvalue=0.3751996797581486)
6459 Suppose we instead have binary data and would like to apply a t-test to
6460 compare the proportion of 1s in two independent groups::
6462 Number of Sample Sample
6463 Size ones Mean Variance
6464 Sample 1 150 30 0.2 0.161073
6465 Sample 2 200 45 0.225 0.175251
6467 The sample mean :math:`\hat{p}` is the proportion of ones in the sample
6468 and the variance for a binary observation is estimated by
6469 :math:`\hat{p}(1-\hat{p})`.
6471 >>> ttest_ind_from_stats(mean1=0.2, std1=np.sqrt(0.161073), nobs1=150,
6472 ... mean2=0.225, std2=np.sqrt(0.175251), nobs2=200)
6473 Ttest_indResult(statistic=-0.5627187905196761, pvalue=0.5739887114209541)
6475 For comparison, we could compute the t statistic and p-value using
6476 arrays of 0s and 1s and `scipy.stats.ttest_ind`, as above.
6478 >>> group1 = np.array([1]*30 + [0]*(150-30))
6479 >>> group2 = np.array([1]*45 + [0]*(200-45))
6480 >>> ttest_ind(group1, group2)
6481 Ttest_indResult(statistic=-0.5627179589855622, pvalue=0.573989277115258)
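If the population variances are not assumed equal, Welch's t-test can be
applied to the same summary statistics by passing ``equal_var=False``. A
sketch; only the statistic is printed, rounded for brevity:

>>> res = ttest_ind_from_stats(mean1=15.0, std1=np.sqrt(87.5), nobs1=13,
...                            mean2=12.0, std2=np.sqrt(39.0), nobs2=11,
...                            equal_var=False)
>>> print(round(res.statistic, 4))
0.9358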
6483 """
6484 mean1 = np.asarray(mean1)
6485 std1 = np.asarray(std1)
6486 mean2 = np.asarray(mean2)
6487 std2 = np.asarray(std2)
6488 if equal_var:
6489 df, denom = _equal_var_ttest_denom(std1**2, nobs1, std2**2, nobs2)
6490 else:
6491 df, denom = _unequal_var_ttest_denom(std1**2, nobs1,
6492 std2**2, nobs2)
6494 res = _ttest_ind_from_stats(mean1, mean2, denom, df, alternative)
6495 return Ttest_indResult(*res)
6498def _ttest_nans(a, b, axis, namedtuple_type):
6499 """
6500 Generate an array of `nan`, with shape determined by `a`, `b` and `axis`.
6502 This function is used by ttest_ind and ttest_rel to create the return
6503 value when one of the inputs has size 0.
6505 The shapes of the arrays are determined by dropping `axis` from the
6506 shapes of `a` and `b` and broadcasting what is left.
6508 The return value is a named tuple of the type given in `namedtuple_type`.
6510 Examples
6511 --------
6512 >>> import numpy as np
6513 >>> a = np.zeros((9, 2))
6514 >>> b = np.zeros((5, 1))
6515 >>> _ttest_nans(a, b, 0, Ttest_indResult)
6516 Ttest_indResult(statistic=array([nan, nan]), pvalue=array([nan, nan]))
6518 >>> a = np.zeros((3, 0, 9))
6519 >>> b = np.zeros((1, 10))
6520 >>> stat, p = _ttest_nans(a, b, -1, Ttest_indResult)
6521 >>> stat
6522 array([], shape=(3, 0), dtype=float64)
6523 >>> p
6524 array([], shape=(3, 0), dtype=float64)
6526 >>> a = np.zeros(10)
6527 >>> b = np.zeros(7)
6528 >>> _ttest_nans(a, b, 0, Ttest_indResult)
6529 Ttest_indResult(statistic=nan, pvalue=nan)
6531 """
6532 shp = _broadcast_shapes_with_dropped_axis(a, b, axis)
6533 if len(shp) == 0:
6534 t = np.nan
6535 p = np.nan
6536 else:
6537 t = np.full(shp, fill_value=np.nan)
6538 p = t.copy()
6539 return namedtuple_type(t, p)
6542def ttest_ind(a, b, axis=0, equal_var=True, nan_policy='propagate',
6543 permutations=None, random_state=None, alternative="two-sided",
6544 trim=0):
6545 """
6546 Calculate the T-test for the means of *two independent* samples of scores.
6548 This is a test for the null hypothesis that 2 independent samples
6549 have identical average (expected) values. This test assumes that the
6550 populations have identical variances by default.
6552 Parameters
6553 ----------
6554 a, b : array_like
6555 The arrays must have the same shape, except in the dimension
6556 corresponding to `axis` (the first, by default).
6557 axis : int or None, optional
6558 Axis along which to compute test. If None, compute over the whole
6559 arrays, `a`, and `b`.
6560 equal_var : bool, optional
6561 If True (default), perform a standard independent 2 sample test
6562 that assumes equal population variances [1]_.
6563 If False, perform Welch's t-test, which does not assume equal
6564 population variance [2]_.
6566 .. versionadded:: 0.11.0
6568 nan_policy : {'propagate', 'raise', 'omit'}, optional
6569 Defines how to handle when input contains nan.
6570 The following options are available (default is 'propagate'):
6572 * 'propagate': returns nan
6573 * 'raise': throws an error
6574 * 'omit': performs the calculations ignoring nan values
6576 The 'omit' option is not currently available for permutation tests or
6577 one-sided asymptotic tests.
6579 permutations : non-negative int, np.inf, or None (default), optional
6580 If 0 or None (default), use the t-distribution to calculate p-values.
6581 Otherwise, `permutations` is the number of random permutations that
6582 will be used to estimate p-values using a permutation test. If
6583 `permutations` equals or exceeds the number of distinct partitions of
6584 the pooled data, an exact test is performed instead (i.e. each
6585 distinct partition is used exactly once). See Notes for details.
6587 .. versionadded:: 1.7.0
6589 random_state : {None, int, `numpy.random.Generator`,
6590 `numpy.random.RandomState`}, optional
6592 If `random_state` is None (or `np.random`), the
6593 `numpy.random.RandomState` singleton is used.
6594 If `random_state` is an int, a new ``RandomState`` instance is used,
6595 seeded with `random_state`.
6596 If `random_state` is already a ``Generator`` or ``RandomState``
6597 instance, that instance is used.
6599 Pseudorandom number generator state used to generate permutations
6600 (used only when `permutations` is not None).
6602 .. versionadded:: 1.7.0
6604 alternative : {'two-sided', 'less', 'greater'}, optional
6605 Defines the alternative hypothesis.
6606 The following options are available (default is 'two-sided'):
6608 * 'two-sided': the means of the distributions underlying the samples
6609 are unequal.
6610 * 'less': the mean of the distribution underlying the first sample
6611 is less than the mean of the distribution underlying the second
6612 sample.
6613 * 'greater': the mean of the distribution underlying the first
6614 sample is greater than the mean of the distribution underlying
6615 the second sample.
6617 .. versionadded:: 1.6.0
6619 trim : float, optional
6620 If nonzero, performs a trimmed (Yuen's) t-test.
6621 Defines the fraction of elements to be trimmed from each end of the
6622 input samples. If 0 (default), no elements will be trimmed from either
6623 side. The number of trimmed elements from each tail is the floor of the
6624 trim times the number of elements. Valid range is [0, .5).
6626 .. versionadded:: 1.7
6628 Returns
6629 -------
6630 statistic : float or array
6631 The calculated t-statistic.
6632 pvalue : float or array
6633 The p-value.
6635 Notes
6636 -----
6637 Suppose we observe two independent samples, e.g. flower petal lengths, and
6638 we are considering whether the two samples were drawn from the same
6639 population (e.g. the same species of flower or two species with similar
6640 petal characteristics) or two different populations.
6642 The t-test quantifies the difference between the arithmetic means
6643 of the two samples. The p-value quantifies the probability of observing
6644 as or more extreme values assuming the null hypothesis, that the
6645 samples are drawn from populations with the same population means, is true.
6646 A p-value larger than a chosen threshold (e.g. 5% or 1%) indicates that
6647 our observation is not so unlikely to have occurred by chance. Therefore,
6648 we do not reject the null hypothesis of equal population means.
6649 If the p-value is smaller than our threshold, then we have evidence
6650 against the null hypothesis of equal population means.
6652 By default, the p-value is determined by comparing the t-statistic of the
6653 observed data against a theoretical t-distribution.
6654 When ``1 < permutations < binom(n, k)``, where
6656 * ``k`` is the number of observations in `a`,
6657 * ``n`` is the total number of observations in `a` and `b`, and
6658 * ``binom(n, k)`` is the binomial coefficient (``n`` choose ``k``),
6660 the data are pooled (concatenated), randomly assigned to either group `a`
6661 or `b`, and the t-statistic is calculated. This process is performed
6662 repeatedly (`permutations` times), generating a distribution of the
6663 t-statistic under the null hypothesis, and the t-statistic of the observed
6664 data is compared to this distribution to determine the p-value.
6665 Specifically, the p-value reported is the "achieved significance level"
6666 (ASL) as defined in 4.4 of [3]_. Note that there are other ways of
6667 estimating p-values using randomized permutation tests; for other
6668 options, see the more general `permutation_test`.
6670 When ``permutations >= binom(n, k)``, an exact test is performed: the data
6671 are partitioned between the groups in each distinct way exactly once.
6673 The permutation test can be computationally expensive and not necessarily
6674 more accurate than the analytical test, but it does not make strong
6675 assumptions about the shape of the underlying distribution.
6677 Use of trimming is commonly referred to as the trimmed t-test. Sometimes
6678 called Yuen's t-test, it is an extension of Welch's t-test, with the
6679 difference being the use of the winsorized variance in calculating the
6680 standard error and the trimmed sample size in calculating the statistic. Trimming is
6681 recommended if the underlying distribution is long-tailed or contaminated
6682 with outliers [4]_.
6684 The statistic is calculated as ``(np.mean(a) - np.mean(b))/se``, where
6685 ``se`` is the standard error. Therefore, the statistic will be positive
6686 when the sample mean of `a` is greater than the sample mean of `b` and
6687 negative when the sample mean of `a` is less than the sample mean of
6688 `b`.
6690 References
6691 ----------
6692 .. [1] https://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test
6694 .. [2] https://en.wikipedia.org/wiki/Welch%27s_t-test
6696 .. [3] B. Efron and T. Hastie. Computer Age Statistical Inference. (2016).
6698 .. [4] Yuen, Karen K. "The Two-Sample Trimmed t for Unequal Population
6699 Variances." Biometrika, vol. 61, no. 1, 1974, pp. 165-170. JSTOR,
6700 www.jstor.org/stable/2334299. Accessed 30 Mar. 2021.
6702 .. [5] Yuen, Karen K., and W. J. Dixon. "The Approximate Behaviour and
6703 Performance of the Two-Sample Trimmed t." Biometrika, vol. 60,
6704 no. 2, 1973, pp. 369-374. JSTOR, www.jstor.org/stable/2334550.
6705 Accessed 30 Mar. 2021.
6707 Examples
6708 --------
6709 >>> import numpy as np
6710 >>> from scipy import stats
6711 >>> rng = np.random.default_rng()
6713 Test with sample with identical means:
6715 >>> rvs1 = stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng)
6716 >>> rvs2 = stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng)
6717 >>> stats.ttest_ind(rvs1, rvs2)
6718 Ttest_indResult(statistic=-0.4390847099199348, pvalue=0.6606952038870015)
6719 >>> stats.ttest_ind(rvs1, rvs2, equal_var=False)
6720 Ttest_indResult(statistic=-0.4390847099199348, pvalue=0.6606952553131064)
6722 `ttest_ind` underestimates p for unequal variances:
6724 >>> rvs3 = stats.norm.rvs(loc=5, scale=20, size=500, random_state=rng)
6725 >>> stats.ttest_ind(rvs1, rvs3)
6726 Ttest_indResult(statistic=-1.6370984482905417, pvalue=0.1019251574705033)
6727 >>> stats.ttest_ind(rvs1, rvs3, equal_var=False)
6728 Ttest_indResult(statistic=-1.637098448290542, pvalue=0.10202110497954867)
6730 When ``n1 != n2``, the equal variance t-statistic is no longer equal to the
6731 unequal variance t-statistic:
6733 >>> rvs4 = stats.norm.rvs(loc=5, scale=20, size=100, random_state=rng)
6734 >>> stats.ttest_ind(rvs1, rvs4)
6735 Ttest_indResult(statistic=-1.9481646859513422, pvalue=0.05186270935842703)
6736 >>> stats.ttest_ind(rvs1, rvs4, equal_var=False)
6737 Ttest_indResult(statistic=-1.3146566100751664, pvalue=0.1913495266513811)
6739 T-test with different means, variance, and n:
6741 >>> rvs5 = stats.norm.rvs(loc=8, scale=20, size=100, random_state=rng)
6742 >>> stats.ttest_ind(rvs1, rvs5)
6743 Ttest_indResult(statistic=-2.8415950600298774, pvalue=0.0046418707568707885)
6744 >>> stats.ttest_ind(rvs1, rvs5, equal_var=False)
6745 Ttest_indResult(statistic=-1.8686598649188084, pvalue=0.06434714193919686)
6747 When performing a permutation test, more permutations typically yield
6748 more accurate results. Use a ``np.random.Generator`` to ensure
6749 reproducibility:
6751 >>> stats.ttest_ind(rvs1, rvs5, permutations=10000,
6752 ... random_state=rng)
6753 Ttest_indResult(statistic=-2.8415950600298774, pvalue=0.0052994700529947)
6755 Take these two samples, one of which has an extreme tail.
6757 >>> a = (56, 128.6, 12, 123.8, 64.34, 78, 763.3)
6758 >>> b = (1.1, 2.9, 4.2)
6760 Use the `trim` keyword to perform a trimmed (Yuen) t-test. For example,
6761 using 20% trimming, ``trim=.2``, the test will reduce the impact of one
6762 (``np.floor(trim*len(a))``) element from each tail of sample `a`. It will
6763 have no effect on sample `b` because ``np.floor(trim*len(b))`` is 0.
6765 >>> stats.ttest_ind(a, b, trim=.2)
6766 Ttest_indResult(statistic=3.4463884028073513,
6767 pvalue=0.01369338726499547)
6768 """
6769 if not (0 <= trim < .5):
6770 raise ValueError("Trimming percentage should be 0 <= `trim` < .5.")
6772 a, b, axis = _chk2_asarray(a, b, axis)
6774 # check both a and b
6775 cna, npa = _contains_nan(a, nan_policy)
6776 cnb, npb = _contains_nan(b, nan_policy)
6777 contains_nan = cna or cnb
6778 if npa == 'omit' or npb == 'omit':
6779 nan_policy = 'omit'
6781 if contains_nan and nan_policy == 'omit':
6782 if permutations or trim != 0:
6783 raise ValueError("nan-containing/masked inputs with "
6784 "nan_policy='omit' are currently not "
6785 "supported by permutation tests or "
6786 "trimmed tests.")
6787 a = ma.masked_invalid(a)
6788 b = ma.masked_invalid(b)
6789 return mstats_basic.ttest_ind(a, b, axis, equal_var, alternative)
6791 if a.size == 0 or b.size == 0:
6792 return _ttest_nans(a, b, axis, Ttest_indResult)
6794 if permutations is not None and permutations != 0:
6795 if trim != 0:
6796 raise ValueError("Permutations are currently not supported "
6797 "with trimming.")
6798 if permutations < 0 or (np.isfinite(permutations) and
6799 int(permutations) != permutations):
6800 raise ValueError("Permutations must be a non-negative integer.")
6802 res = _permutation_ttest(a, b, permutations=permutations,
6803 axis=axis, equal_var=equal_var,
6804 nan_policy=nan_policy,
6805 random_state=random_state,
6806 alternative=alternative)
6808 else:
6809 n1 = a.shape[axis]
6810 n2 = b.shape[axis]
6812 if trim == 0:
6813 v1 = _var(a, axis, ddof=1)
6814 v2 = _var(b, axis, ddof=1)
6815 m1 = np.mean(a, axis)
6816 m2 = np.mean(b, axis)
6817 else:
6818 v1, m1, n1 = _ttest_trim_var_mean_len(a, trim, axis)
6819 v2, m2, n2 = _ttest_trim_var_mean_len(b, trim, axis)
6821 if equal_var:
6822 df, denom = _equal_var_ttest_denom(v1, n1, v2, n2)
6823 else:
6824 df, denom = _unequal_var_ttest_denom(v1, n1, v2, n2)
6825 res = _ttest_ind_from_stats(m1, m2, denom, df, alternative)
6826 return Ttest_indResult(*res)
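
# Editor's sketch (illustration only): a hand-rolled version of the randomized
# permutation p-value described in the Notes of `ttest_ind` above -- pool the
# data, reassign labels at random, recompute the statistic, and report the
# achieved significance level with the "+1" adjustment used for randomized
# (non-exact) tests.  Underscore-prefixed names are local to this sketch.
import numpy as np

_rng = np.random.default_rng(12345)
_a = np.array([4.1, 5.2, 6.3, 5.8])
_b = np.array([3.0, 2.8, 3.9])
_t_obs = ttest_ind(_a, _b).statistic
_pooled = np.concatenate([_a, _b])
_n_perm = 199
_count = 0
for _ in range(_n_perm):
    _perm = _rng.permutation(_pooled)
    _t = ttest_ind(_perm[:len(_a)], _perm[len(_a):]).statistic
    if abs(_t) >= abs(_t_obs):           # two-sided comparison
        _count += 1
_p_value = (_count + 1) / (_n_perm + 1)  # "+1" because the test is randomized
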
6829def _ttest_trim_var_mean_len(a, trim, axis):
6830 """Variance, mean, and length of winsorized input along specified axis"""
6831 # for use with `ttest_ind` when trimming.
6832 # further calculations in this test assume that the inputs are sorted.
6833 # From [4] Section 1 "Let x_1, ..., x_n be n ordered observations..."
6834 a = np.sort(a, axis=axis)
6836 # `g` is the number of elements to be replaced on each tail, converted
6837 # from the fraction of elements to trim (`trim`)
6838 n = a.shape[axis]
6839 g = int(n * trim)
6841 # Calculate the Winsorized variance of the input samples according to
6842 # specified `g`
6843 v = _calculate_winsorized_variance(a, g, axis)
6845 # the total number of elements in the trimmed samples
6846 n -= 2 * g
6848 # calculate the g-times trimmed mean, as defined in [4] (1-1)
6849 m = trim_mean(a, trim, axis=axis)
6850 return v, m, n
6853def _calculate_winsorized_variance(a, g, axis):
6854 """Calculates g-times winsorized variance along specified axis"""
6855 # it is expected that the input `a` is sorted along the correct axis
6856 if g == 0:
6857 return _var(a, ddof=1, axis=axis)
6858 # move the intended axis to the end so that it is easier to manipulate
6859 a_win = np.moveaxis(a, axis, -1)
6861 # save where NaNs are for later use.
6862 nans_indices = np.any(np.isnan(a_win), axis=-1)
6864 # Winsorization and variance calculation are done in one step in [4]
6865 # (1-3), but here winsorization is done first; replace the left and
6866 # right sides with the repeating value. This can be seen in effect in (
6867 # 1-3) in [4], where the leftmost and rightmost tails are replaced with
6868 # `(g + 1) * x_{g + 1}` on the left and `(g + 1) * x_{n - g}` on the
6869 # right. Zero-indexing turns `g + 1` to `g`, and `n - g` to `- g - 1` in
6870 # array indexing.
6871 a_win[..., :g] = a_win[..., [g]]
6872 a_win[..., -g:] = a_win[..., [-g - 1]]
6874 # Determine the variance. In [4], the degrees of freedom is expressed as
6875 # `h - 1`, where `h = n - 2g` (unnumbered equations in Section 1, end of
6876 # page 369, beginning of page 370). This is converted to NumPy's format,
6877 # `n - ddof` for use with `np.var`. The result is converted to an
6878 # array to accommodate indexing later.
6879 var_win = np.asarray(_var(a_win, ddof=(2 * g + 1), axis=-1))
6881 # with `nan_policy='propagate'`, NaNs may be completely trimmed out
6882 # because they were sorted into the tail of the array. In these cases,
6883 # replace computed variances with `np.nan`.
6884 var_win[nans_indices] = np.nan
6885 return var_win
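
# Editor's sketch (illustration only): g-times winsorization of a sorted 1-D
# sample, mirroring the tail replacement that `_calculate_winsorized_variance`
# performs above before taking the variance with ``ddof = 2*g + 1``.
import numpy as np

_x = np.sort(np.array([12.0, 56.0, 64.34, 78.0, 123.8, 128.6, 763.3]))
_g = 1                           # number of elements replaced on each tail
_x_win = _x.copy()
_x_win[:_g] = _x[_g]             # left tail -> x_{g+1} (0-indexed: x[g])
_x_win[-_g:] = _x[-_g - 1]       # right tail -> x_{n-g}
_var_win = np.var(_x_win, ddof=2 * _g + 1)  # winsorized variance, Yuen (1974)
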
6888def _permutation_distribution_t(data, permutations, size_a, equal_var,
6889 random_state=None):
6890 """Generation permutation distribution of t statistic"""
6892 random_state = check_random_state(random_state)
6894 # prepare permutation indices
6895 size = data.shape[-1]
6896 # number of distinct combinations
6897 n_max = special.comb(size, size_a)
6899 if permutations < n_max:
6900 perm_generator = (random_state.permutation(size)
6901 for i in range(permutations))
6902 else:
6903 permutations = n_max
6904 perm_generator = (np.concatenate(z)
6905 for z in _all_partitions(size_a, size-size_a))
6907 t_stat = []
6908 for indices in _batch_generator(perm_generator, batch=50):
6909 # get one batch from perm_generator at a time as a list
6910 indices = np.array(indices)
6911 # generate permutations
6912 data_perm = data[..., indices]
6913 # move axis indexing permutations to position 0 to broadcast
6914 # nicely with t_stat_observed, which doesn't have this dimension
6915 data_perm = np.moveaxis(data_perm, -2, 0)
6917 a = data_perm[..., :size_a]
6918 b = data_perm[..., size_a:]
6919 t_stat.append(_calc_t_stat(a, b, equal_var))
6921 t_stat = np.concatenate(t_stat, axis=0)
6923 return t_stat, permutations, n_max
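
# Editor's sketch (illustration only): the exact branch above iterates over
# every distinct assignment of the pooled observations to group `a`; the same
# enumeration can be written with `itertools.combinations` instead of the
# module-private `_all_partitions` helper.  Names here are local to the sketch.
import numpy as np
from math import comb
from itertools import combinations

_size, _size_a = 5, 2
_n_max = comb(_size, _size_a)            # number of distinct partitions
_partitions = []
for _idx_a in combinations(range(_size), _size_a):
    _idx_b = [_i for _i in range(_size) if _i not in _idx_a]
    _partitions.append(np.array(list(_idx_a) + _idx_b))
assert len(_partitions) == _n_max
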
6926def _calc_t_stat(a, b, equal_var, axis=-1):
6927 """Calculate the t statistic along the given dimension."""
6928 na = a.shape[axis]
6929 nb = b.shape[axis]
6930 avg_a = np.mean(a, axis=axis)
6931 avg_b = np.mean(b, axis=axis)
6932 var_a = _var(a, axis=axis, ddof=1)
6933 var_b = _var(b, axis=axis, ddof=1)
6935 if not equal_var:
6936 denom = _unequal_var_ttest_denom(var_a, na, var_b, nb)[1]
6937 else:
6938 denom = _equal_var_ttest_denom(var_a, na, var_b, nb)[1]
6940 return (avg_a-avg_b)/denom
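
# Editor's sketch (illustration only): for the ``equal_var=False`` branch the
# denominator and degrees of freedom follow the standard Welch-Satterthwaite
# formulas, which `_unequal_var_ttest_denom` (defined elsewhere in this
# module) is expected to return.  The numbers below are arbitrary.
import numpy as np

_v1, _n1 = 4.0, 15          # sample variance and size of the first sample
_v2, _n2 = 9.0, 20          # sample variance and size of the second sample
_vn1, _vn2 = _v1 / _n1, _v2 / _n2
_welch_denom = np.sqrt(_vn1 + _vn2)
_welch_df = (_vn1 + _vn2) ** 2 / (_vn1 ** 2 / (_n1 - 1) + _vn2 ** 2 / (_n2 - 1))
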
6943def _permutation_ttest(a, b, permutations, axis=0, equal_var=True,
6944 nan_policy='propagate', random_state=None,
6945 alternative="two-sided"):
6946 """
6947 Calculates the T-test for the means of TWO INDEPENDENT samples of scores
6948 using permutation methods.
6950 This test is similar to `stats.ttest_ind`, except it doesn't rely on an
6951 approximate normality assumption since it uses a permutation test.
6952 This function is only called from ttest_ind when permutations is not None.
6954 Parameters
6955 ----------
6956 a, b : array_like
6957 The arrays must be broadcastable, except along the dimension
6958 corresponding to `axis` (the zeroth, by default).
6959 axis : int, optional
6960 The axis over which to operate on a and b.
6961 permutations : int, optional
6962 Number of permutations used to calculate p-value. If greater than or
6963 equal to the number of distinct permutations, perform an exact test.
6964 equal_var : bool, optional
6965 If True (default), an ordinary pooled-variance t-test is conducted;
6966 if False, Welch's t-test (which does not assume equal variances) is conducted.
6967 random_state : {None, int, `numpy.random.Generator`}, optional
6968 If `random_state` is None, the `numpy.random.RandomState` singleton
6969 is used. If `random_state` is an int, a new ``RandomState`` instance
6970 is used, seeded with `random_state`.
6971 If `random_state` is already a ``Generator`` or ``RandomState``
6972 instance, that instance is used.
6973 Pseudorandom number generator state used for generating random
6974 permutations.
6976 Returns
6977 -------
6978 statistic : float or array
6979 The calculated t-statistic.
6980 pvalue : float or array
6981 The p-value.
6983 """
6984 random_state = check_random_state(random_state)
6986 t_stat_observed = _calc_t_stat(a, b, equal_var, axis=axis)
6988 na = a.shape[axis]
6989 mat = _broadcast_concatenate((a, b), axis=axis)
6990 mat = np.moveaxis(mat, axis, -1)
6992 t_stat, permutations, n_max = _permutation_distribution_t(
6993 mat, permutations, size_a=na, equal_var=equal_var,
6994 random_state=random_state)
6996 compare = {"less": np.less_equal,
6997 "greater": np.greater_equal,
6998 "two-sided": lambda x, y: (x <= -np.abs(y)) | (x >= np.abs(y))}
7000 # Calculate the p-values
7001 cmps = compare[alternative](t_stat, t_stat_observed)
7002 # Randomized test p-value calculation should use biased estimate; see e.g.
7003 # https://www.degruyter.com/document/doi/10.2202/1544-6115.1585/
7004 adjustment = 1 if n_max > permutations else 0
7005 pvalues = (cmps.sum(axis=0) + adjustment) / (permutations + adjustment)
7007 # nans propagate naturally in statistic calculation, but need to be
7008 # propagated manually into pvalues
7009 if nan_policy == 'propagate' and np.isnan(t_stat_observed).any():
7010 if np.ndim(pvalues) == 0:
7011 pvalues = np.float64(np.nan)
7012 else:
7013 pvalues[np.isnan(t_stat_observed)] = np.nan
7015 return (t_stat_observed, pvalues)
7018def _get_len(a, axis, msg):
7019 try:
7020 n = a.shape[axis]
7021 except IndexError:
7022 raise np.AxisError(axis, a.ndim, msg) from None
7023 return n
7026@_axis_nan_policy_factory(pack_TtestResult, default_axis=0, n_samples=2,
7027 result_to_tuple=unpack_TtestResult, n_outputs=6,
7028 paired=True)
7029def ttest_rel(a, b, axis=0, nan_policy='propagate', alternative="two-sided"):
7030 """Calculate the t-test on TWO RELATED samples of scores, a and b.
7032 This is a test for the null hypothesis that two related or
7033 repeated samples have identical average (expected) values.
7035 Parameters
7036 ----------
7037 a, b : array_like
7038 The arrays must have the same shape.
7039 axis : int or None, optional
7040 Axis along which to compute test. If None, compute over the whole
7041 arrays, `a`, and `b`.
7042 nan_policy : {'propagate', 'raise', 'omit'}, optional
7043 Defines how to handle when input contains nan.
7044 The following options are available (default is 'propagate'):
7046 * 'propagate': returns nan
7047 * 'raise': throws an error
7048 * 'omit': performs the calculations ignoring nan values
7049 alternative : {'two-sided', 'less', 'greater'}, optional
7050 Defines the alternative hypothesis.
7051 The following options are available (default is 'two-sided'):
7053 * 'two-sided': the means of the distributions underlying the samples
7054 are unequal.
7055 * 'less': the mean of the distribution underlying the first sample
7056 is less than the mean of the distribution underlying the second
7057 sample.
7058 * 'greater': the mean of the distribution underlying the first
7059 sample is greater than the mean of the distribution underlying
7060 the second sample.
7062 .. versionadded:: 1.6.0
7064 Returns
7065 -------
7066 result : `~scipy.stats._result_classes.TtestResult`
7067 An object with the following attributes:
7069 statistic : float or array
7070 The t-statistic.
7071 pvalue : float or array
7072 The p-value associated with the given alternative.
7073 df : float or array
7074 The number of degrees of freedom used in calculation of the
7075 t-statistic; this is one less than the size of the sample
7076 (``a.shape[axis]``).
7078 .. versionadded:: 1.10.0
7080 The object also has the following method:
7082 confidence_interval(confidence_level=0.95)
7083 Computes a confidence interval around the difference in
7084 population means for the given confidence level.
7085 The confidence interval is returned in a ``namedtuple`` with
7086 fields `low` and `high`.
7088 .. versionadded:: 1.10.0
7090 Notes
7091 -----
7092 Examples for use are scores of the same set of students in
7093 different exams, or repeated sampling from the same units. The
7094 test measures whether the average score differs significantly
7095 across samples (e.g. exams). If we observe a large p-value, for
7096 example greater than 0.05 or 0.1 then we cannot reject the null
7097 hypothesis of identical average scores. If the p-value is smaller
7098 than the threshold, e.g. 1%, 5% or 10%, then we reject the null
7099 hypothesis of equal averages. Small p-values are associated with
7100 large t-statistics.
7102 The t-statistic is calculated as ``np.mean(a - b)/se``, where ``se`` is the
7103 standard error. Therefore, the t-statistic will be positive when the sample
7104 mean of ``a - b`` is greater than zero and negative when the sample mean of
7105 ``a - b`` is less than zero.
7107 References
7108 ----------
7109 https://en.wikipedia.org/wiki/T-test#Dependent_t-test_for_paired_samples
7111 Examples
7112 --------
7113 >>> import numpy as np
7114 >>> from scipy import stats
7115 >>> rng = np.random.default_rng()
7117 >>> rvs1 = stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng)
7118 >>> rvs2 = (stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng)
7119 ... + stats.norm.rvs(scale=0.2, size=500, random_state=rng))
7120 >>> stats.ttest_rel(rvs1, rvs2)
7121 TtestResult(statistic=-0.4549717054410304, pvalue=0.6493274702088672, df=499) # noqa
7122 >>> rvs3 = (stats.norm.rvs(loc=8, scale=10, size=500, random_state=rng)
7123 ... + stats.norm.rvs(scale=0.2, size=500, random_state=rng))
7124 >>> stats.ttest_rel(rvs1, rvs3)
7125 TtestResult(statistic=-5.879467544540889, pvalue=7.540777129099917e-09, df=499) # noqa
7127 """
7128 a, b, axis = _chk2_asarray(a, b, axis)
7130 na = _get_len(a, axis, "first argument")
7131 nb = _get_len(b, axis, "second argument")
7132 if na != nb:
7133 raise ValueError('unequal length arrays')
7135 if na == 0 or nb == 0:
7136 # _axis_nan_policy decorator ensures this only happens with 1d input
7137 return TtestResult(np.nan, np.nan, df=np.nan, alternative=np.nan,
7138 standard_error=np.nan, estimate=np.nan)
7140 n = a.shape[axis]
7141 df = n - 1
7143 d = (a - b).astype(np.float64)
7144 v = _var(d, axis, ddof=1)
7145 dm = np.mean(d, axis)
7146 denom = np.sqrt(v / n)
7148 with np.errstate(divide='ignore', invalid='ignore'):
7149 t = np.divide(dm, denom)
7150 t, prob = _ttest_finish(df, t, alternative)
7152 # when nan_policy='omit', `df` can be different for different axis-slices
7153 df = np.broadcast_to(df, t.shape)[()]
7155 # _axis_nan_policy decorator doesn't play well with strings
7156 alternative_num = {"less": -1, "two-sided": 0, "greater": 1}[alternative]
7157 return TtestResult(t, prob, df=df, alternative=alternative_num,
7158 standard_error=denom, estimate=dm)
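
# Editor's sketch (illustration only): the paired test above is numerically
# equivalent to a one-sample t-test on the differences, which makes a handy
# sanity check.  `ttest_1samp` is assumed to be the function defined earlier
# in this module; underscore-prefixed names are local to the sketch.
import numpy as np

_rng = np.random.default_rng(0)
_a = _rng.normal(loc=5.0, scale=2.0, size=30)
_b = _a + _rng.normal(loc=0.3, scale=1.0, size=30)
_paired = ttest_rel(_a, _b)
_one_sample = ttest_1samp(_a - _b, 0.0)
assert np.isclose(_paired.statistic, _one_sample.statistic)
assert np.isclose(_paired.pvalue, _one_sample.pvalue)
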
7161# Map from names to lambda_ values used in power_divergence().
7162_power_div_lambda_names = {
7163 "pearson": 1,
7164 "log-likelihood": 0,
7165 "freeman-tukey": -0.5,
7166 "mod-log-likelihood": -1,
7167 "neyman": -2,
7168 "cressie-read": 2/3,
7169}
7172def _count(a, axis=None):
7173 """Count the number of non-masked elements of an array.
7175 This function behaves like `np.ma.count`, but is much faster
7176 for ndarrays.
7177 """
7178 if hasattr(a, 'count'):
7179 num = a.count(axis=axis)
7180 if isinstance(num, np.ndarray) and num.ndim == 0:
7181 # In some cases, the `count` method returns a scalar array (e.g.
7182 # np.array(3)), but we want a plain integer.
7183 num = int(num)
7184 else:
7185 if axis is None:
7186 num = a.size
7187 else:
7188 num = a.shape[axis]
7189 return num
7192def _m_broadcast_to(a, shape):
7193 if np.ma.isMaskedArray(a):
7194 return np.ma.masked_array(np.broadcast_to(a, shape),
7195 mask=np.broadcast_to(a.mask, shape))
7196 return np.broadcast_to(a, shape, subok=True)
7199Power_divergenceResult = namedtuple('Power_divergenceResult',
7200 ('statistic', 'pvalue'))
7203def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None):
7204 """Cressie-Read power divergence statistic and goodness of fit test.
7206 This function tests the null hypothesis that the categorical data
7207 has the given frequencies, using the Cressie-Read power divergence
7208 statistic.
7210 Parameters
7211 ----------
7212 f_obs : array_like
7213 Observed frequencies in each category.
7214 f_exp : array_like, optional
7215 Expected frequencies in each category. By default the categories are
7216 assumed to be equally likely.
7217 ddof : int, optional
7218 "Delta degrees of freedom": adjustment to the degrees of freedom
7219 for the p-value. The p-value is computed using a chi-squared
7220 distribution with ``k - 1 - ddof`` degrees of freedom, where `k`
7221 is the number of observed frequencies. The default value of `ddof`
7222 is 0.
7223 axis : int or None, optional
7224 The axis of the broadcast result of `f_obs` and `f_exp` along which to
7225 apply the test. If axis is None, all values in `f_obs` are treated
7226 as a single data set. Default is 0.
7227 lambda_ : float or str, optional
7228 The power in the Cressie-Read power divergence statistic. The default
7229 is 1. For convenience, `lambda_` may be assigned one of the following
7230 strings, in which case the corresponding numerical value is used:
7232 * ``"pearson"`` (value 1)
7233 Pearson's chi-squared statistic. In this case, the function is
7234 equivalent to `chisquare`.
7235 * ``"log-likelihood"`` (value 0)
7236 Log-likelihood ratio. Also known as the G-test [3]_.
7237 * ``"freeman-tukey"`` (value -1/2)
7238 Freeman-Tukey statistic.
7239 * ``"mod-log-likelihood"`` (value -1)
7240 Modified log-likelihood ratio.
7241 * ``"neyman"`` (value -2)
7242 Neyman's statistic.
7243 * ``"cressie-read"`` (value 2/3)
7244 The power recommended in [5]_.
7246 Returns
7247 -------
7248 statistic : float or ndarray
7249 The Cressie-Read power divergence test statistic. The value is
7250 a float if `axis` is None or if `f_obs` and `f_exp` are 1-D.
7251 pvalue : float or ndarray
7252 The p-value of the test. The value is a float if `ddof` and the
7253 return value `stat` are scalars.
7255 See Also
7256 --------
7257 chisquare
7259 Notes
7260 -----
7261 This test is invalid when the observed or expected frequencies in each
7262 category are too small. A typical rule is that all of the observed
7263 and expected frequencies should be at least 5.
7265 Also, the sum of the observed and expected frequencies must be the same
7266 for the test to be valid; `power_divergence` raises an error if the sums
7267 do not agree within a relative tolerance of ``1e-8``.
7269 When `lambda_` is less than zero, the formula for the statistic involves
7270 dividing by `f_obs`, so a warning or error may be generated if any value
7271 in `f_obs` is 0.
7273 Similarly, a warning or error may be generated if any value in `f_exp` is
7274 zero when `lambda_` >= 0.
7276 The default degrees of freedom, k-1, are for the case when no parameters
7277 of the distribution are estimated. If p parameters are estimated by
7278 efficient maximum likelihood then the correct degrees of freedom are
7279 k-1-p. If the parameters are estimated in a different way, then the
7280 dof can be between k-1-p and k-1. However, it is also possible that
7281 the asymptotic distribution is not a chisquare, in which case this
7282 test is not appropriate.
7284 This function handles masked arrays. If an element of `f_obs` or `f_exp`
7285 is masked, then data at that position is ignored, and does not count
7286 towards the size of the data set.
7288 .. versionadded:: 0.13.0
7290 References
7291 ----------
7292 .. [1] Lowry, Richard. "Concepts and Applications of Inferential
7293 Statistics". Chapter 8.
7294 https://web.archive.org/web/20171015035606/http://faculty.vassar.edu/lowry/ch8pt1.html
7295 .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test
7296 .. [3] "G-test", https://en.wikipedia.org/wiki/G-test
7297 .. [4] Sokal, R. R. and Rohlf, F. J. "Biometry: the principles and
7298 practice of statistics in biological research", New York: Freeman
7299 (1981)
7300 .. [5] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
7301 Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
7302 pp. 440-464.
7304 Examples
7305 --------
7306 (See `chisquare` for more examples.)
7308 When just `f_obs` is given, it is assumed that the expected frequencies
7309 are uniform and given by the mean of the observed frequencies. Here we
7310 perform a G-test (i.e. use the log-likelihood ratio statistic):
7312 >>> import numpy as np
7313 >>> from scipy.stats import power_divergence
7314 >>> power_divergence([16, 18, 16, 14, 12, 12], lambda_='log-likelihood')
7315 (2.006573162632538, 0.84823476779463769)
7317 The expected frequencies can be given with the `f_exp` argument:
7319 >>> power_divergence([16, 18, 16, 14, 12, 12],
7320 ... f_exp=[16, 16, 16, 16, 16, 8],
7321 ... lambda_='log-likelihood')
7322 (3.3281031458963746, 0.6495419288047497)
7324 When `f_obs` is 2-D, by default the test is applied to each column.
7326 >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T
7327 >>> obs.shape
7328 (6, 2)
7329 >>> power_divergence(obs, lambda_="log-likelihood")
7330 (array([ 2.00657316, 6.77634498]), array([ 0.84823477, 0.23781225]))
7332 By setting ``axis=None``, the test is applied to all data in the array,
7333 which is equivalent to applying the test to the flattened array.
7335 >>> power_divergence(obs, axis=None)
7336 (23.31034482758621, 0.015975692534127565)
7337 >>> power_divergence(obs.ravel())
7338 (23.31034482758621, 0.015975692534127565)
7340 `ddof` is the change to make to the default degrees of freedom.
7342 >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=1)
7343 (2.0, 0.73575888234288467)
7345 The calculation of the p-values is done by broadcasting the
7346 test statistic with `ddof`.
7348 >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=[0,1,2])
7349 (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ]))
7351 `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has
7352 shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting
7353 `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared
7354 statistics, we must use ``axis=1``:
7356 >>> power_divergence([16, 18, 16, 14, 12, 12],
7357 ... f_exp=[[16, 16, 16, 16, 16, 8],
7358 ... [8, 20, 20, 16, 12, 12]],
7359 ... axis=1)
7360 (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846]))
7362 """
7363 # Convert the input argument `lambda_` to a numerical value.
7364 if isinstance(lambda_, str):
7365 if lambda_ not in _power_div_lambda_names:
7366 names = repr(list(_power_div_lambda_names.keys()))[1:-1]
7367 raise ValueError("invalid string for lambda_: {0!r}. "
7368 "Valid strings are {1}".format(lambda_, names))
7369 lambda_ = _power_div_lambda_names[lambda_]
7370 elif lambda_ is None:
7371 lambda_ = 1
7373 f_obs = np.asanyarray(f_obs)
7374 f_obs_float = f_obs.astype(np.float64)
7376 if f_exp is not None:
7377 f_exp = np.asanyarray(f_exp)
7378 bshape = _broadcast_shapes(f_obs_float.shape, f_exp.shape)
7379 f_obs_float = _m_broadcast_to(f_obs_float, bshape)
7380 f_exp = _m_broadcast_to(f_exp, bshape)
7381 rtol = 1e-8 # to pass existing tests
7382 with np.errstate(invalid='ignore'):
7383 f_obs_sum = f_obs_float.sum(axis=axis)
7384 f_exp_sum = f_exp.sum(axis=axis)
7385 relative_diff = (np.abs(f_obs_sum - f_exp_sum) /
7386 np.minimum(f_obs_sum, f_exp_sum))
7387 diff_gt_tol = (relative_diff > rtol).any()
7388 if diff_gt_tol:
7389 msg = (f"For each axis slice, the sum of the observed "
7390 f"frequencies must agree with the sum of the "
7391 f"expected frequencies to a relative tolerance "
7392 f"of {rtol}, but the percent differences are:\n"
7393 f"{relative_diff}")
7394 raise ValueError(msg)
7396 else:
7397 # Ignore 'invalid' errors so the edge case of a data set with length 0
7398 # is handled without spurious warnings.
7399 with np.errstate(invalid='ignore'):
7400 f_exp = f_obs.mean(axis=axis, keepdims=True)
7402 # `terms` is the array of terms that are summed along `axis` to create
7403 # the test statistic. We use some specialized code for a few special
7404 # cases of lambda_.
7405 if lambda_ == 1:
7406 # Pearson's chi-squared statistic
7407 terms = (f_obs_float - f_exp)**2 / f_exp
7408 elif lambda_ == 0:
7409 # Log-likelihood ratio (i.e. G-test)
7410 terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp)
7411 elif lambda_ == -1:
7412 # Modified log-likelihood ratio
7413 terms = 2.0 * special.xlogy(f_exp, f_exp / f_obs)
7414 else:
7415 # General Cressie-Read power divergence.
7416 terms = f_obs * ((f_obs / f_exp)**lambda_ - 1)
7417 terms /= 0.5 * lambda_ * (lambda_ + 1)
7419 stat = terms.sum(axis=axis)
7421 num_obs = _count(terms, axis=axis)
7422 ddof = asarray(ddof)
7423 p = distributions.chi2.sf(stat, num_obs - 1 - ddof)
7425 return Power_divergenceResult(stat, p)
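
# Editor's sketch (illustration only): the log-likelihood (G-test) branch of
# `power_divergence` computed by hand, ``2 * sum(f_obs * log(f_obs / f_exp))``
# with uniform expected frequencies, checked against the function above.
import numpy as np

_f_obs = np.array([16, 18, 16, 14, 12, 12], dtype=float)
_f_exp = np.full_like(_f_obs, _f_obs.mean())
_g_manual = 2.0 * np.sum(_f_obs * np.log(_f_obs / _f_exp))
_g_stat, _g_p = power_divergence(_f_obs, lambda_='log-likelihood')
assert np.isclose(_g_manual, _g_stat)
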
7428def chisquare(f_obs, f_exp=None, ddof=0, axis=0):
7429 """Calculate a one-way chi-square test.
7431 The chi-square test tests the null hypothesis that the categorical data
7432 has the given frequencies.
7434 Parameters
7435 ----------
7436 f_obs : array_like
7437 Observed frequencies in each category.
7438 f_exp : array_like, optional
7439 Expected frequencies in each category. By default the categories are
7440 assumed to be equally likely.
7441 ddof : int, optional
7442 "Delta degrees of freedom": adjustment to the degrees of freedom
7443 for the p-value. The p-value is computed using a chi-squared
7444 distribution with ``k - 1 - ddof`` degrees of freedom, where `k`
7445 is the number of observed frequencies. The default value of `ddof`
7446 is 0.
7447 axis : int or None, optional
7448 The axis of the broadcast result of `f_obs` and `f_exp` along which to
7449 apply the test. If axis is None, all values in `f_obs` are treated
7450 as a single data set. Default is 0.
7452 Returns
7453 -------
7454 chisq : float or ndarray
7455 The chi-squared test statistic. The value is a float if `axis` is
7456 None or `f_obs` and `f_exp` are 1-D.
7457 p : float or ndarray
7458 The p-value of the test. The value is a float if `ddof` and the
7459 return value `chisq` are scalars.
7461 See Also
7462 --------
7463 scipy.stats.power_divergence
7464 scipy.stats.fisher_exact : Fisher exact test on a 2x2 contingency table.
7465 scipy.stats.barnard_exact : An unconditional exact test. An alternative
7466 to chi-squared test for small sample sizes.
7468 Notes
7469 -----
7470 This test is invalid when the observed or expected frequencies in each
7471 category are too small. A typical rule is that all of the observed
7472 and expected frequencies should be at least 5. According to [3]_, the
7473 total number of samples is recommended to be greater than 13,
7474 otherwise exact tests (such as Barnard's Exact test) should be used
7475 because they do not overreject.
7477 Also, the sum of the observed and expected frequencies must be the same
7478 for the test to be valid; `chisquare` raises an error if the sums do not
7479 agree within a relative tolerance of ``1e-8``.
7481 The default degrees of freedom, k-1, are for the case when no parameters
7482 of the distribution are estimated. If p parameters are estimated by
7483 efficient maximum likelihood then the correct degrees of freedom are
7484 k-1-p. If the parameters are estimated in a different way, then the
7485 dof can be between k-1-p and k-1. However, it is also possible that
7486 the asymptotic distribution is not chi-square, in which case this test
7487 is not appropriate.
7489 References
7490 ----------
7491 .. [1] Lowry, Richard. "Concepts and Applications of Inferential
7492 Statistics". Chapter 8.
7493 https://web.archive.org/web/20171022032306/http://vassarstats.net:80/textbook/ch8pt1.html
7494 .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test
7495 .. [3] Pearson, Karl. "On the criterion that a given system of deviations from the probable
7496 in the case of a correlated system of variables is such that it can be reasonably
7497 supposed to have arisen from random sampling", Philosophical Magazine. Series 5. 50
7498 (1900), pp. 157-175.
7500 Examples
7501 --------
7502 When just `f_obs` is given, it is assumed that the expected frequencies
7503 are uniform and given by the mean of the observed frequencies.
7505 >>> import numpy as np
7506 >>> from scipy.stats import chisquare
7507 >>> chisquare([16, 18, 16, 14, 12, 12])
7508 (2.0, 0.84914503608460956)
7510 With `f_exp` the expected frequencies can be given.
7512 >>> chisquare([16, 18, 16, 14, 12, 12], f_exp=[16, 16, 16, 16, 16, 8])
7513 (3.5, 0.62338762774958223)
7515 When `f_obs` is 2-D, by default the test is applied to each column.
7517 >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T
7518 >>> obs.shape
7519 (6, 2)
7520 >>> chisquare(obs)
7521 (array([ 2. , 6.66666667]), array([ 0.84914504, 0.24663415]))
7523 By setting ``axis=None``, the test is applied to all data in the array,
7524 which is equivalent to applying the test to the flattened array.
7526 >>> chisquare(obs, axis=None)
7527 (23.31034482758621, 0.015975692534127565)
7528 >>> chisquare(obs.ravel())
7529 (23.31034482758621, 0.015975692534127565)
7531 `ddof` is the change to make to the default degrees of freedom.
7533 >>> chisquare([16, 18, 16, 14, 12, 12], ddof=1)
7534 (2.0, 0.73575888234288467)
7536 The calculation of the p-values is done by broadcasting the
7537 chi-squared statistic with `ddof`.
7539 >>> chisquare([16, 18, 16, 14, 12, 12], ddof=[0,1,2])
7540 (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ]))
7542 `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has
7543 shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting
7544 `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared
7545 statistics, we use ``axis=1``:
7547 >>> chisquare([16, 18, 16, 14, 12, 12],
7548 ... f_exp=[[16, 16, 16, 16, 16, 8], [8, 20, 20, 16, 12, 12]],
7549 ... axis=1)
7550 (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846]))
7552 """
7553 return power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis,
7554 lambda_="pearson")
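
# Editor's sketch (illustration only): Pearson's statistic and its p-value
# reproduced by hand for the default case (uniform expected frequencies)
# handled by `chisquare` above.  The chi-squared survival function comes from
# the module's `distributions` import, as used in `power_divergence`.
import numpy as np

_f_obs = np.array([16, 18, 16, 14, 12, 12], dtype=float)
_f_exp = np.full_like(_f_obs, _f_obs.mean())
_stat_manual = np.sum((_f_obs - _f_exp) ** 2 / _f_exp)
_p_manual = distributions.chi2.sf(_stat_manual, len(_f_obs) - 1)
_stat, _p = chisquare(_f_obs)
assert np.isclose(_stat_manual, _stat) and np.isclose(_p_manual, _p)
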
7557KstestResult = _make_tuple_bunch('KstestResult', ['statistic', 'pvalue'],
7558 ['statistic_location', 'statistic_sign'])
7561def _compute_dplus(cdfvals, x):
7562 """Computes D+ as used in the Kolmogorov-Smirnov test.
7564 Parameters
7565 ----------
7566 cdfvals : array_like
7567 Sorted array of CDF values between 0 and 1
7568 x: array_like
7569 Sorted array of the stochastic variable itself
7571 Returns
7572 -------
7573 res: Pair with the following elements:
7574 - The maximum distance of the CDF values below Uniform(0, 1).
7575 - The location at which the maximum is reached.
7577 """
7578 n = len(cdfvals)
7579 dplus = (np.arange(1.0, n + 1) / n - cdfvals)
7580 amax = dplus.argmax()
7581 loc_max = x[amax]
7582 return (dplus[amax], loc_max)
7585def _compute_dminus(cdfvals, x):
7586 """Computes D- as used in the Kolmogorov-Smirnov test.
7588 Parameters
7589 ----------
7590 cdfvals : array_like
7591 Sorted array of CDF values between 0 and 1
7592 x: array_like
7593 Sorted array of the stochastic variable itself
7595 Returns
7596 -------
7597 res: Pair with the following elements:
7598 - Maximum distance of the CDF values above Uniform(0, 1)
7599 - The location at which the maximum is reached.
7600 """
7601 n = len(cdfvals)
7602 dminus = (cdfvals - np.arange(0.0, n)/n)
7603 amax = dminus.argmax()
7604 loc_max = x[amax]
7605 return (dminus[amax], loc_max)
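
# Editor's sketch (illustration only): D+ and D- for a tiny sorted sample
# tested against the Uniform(0, 1) CDF (for which the CDF values equal the
# observations themselves), checked against the two helpers above.
import numpy as np

_x = np.sort(np.array([0.1, 0.35, 0.5, 0.8]))
_cdfvals = _x                                  # Uniform(0, 1) CDF is identity
_n = len(_x)
_dplus = np.arange(1.0, _n + 1) / _n - _cdfvals
_dminus = _cdfvals - np.arange(0.0, _n) / _n
assert np.isclose(_dplus.max(), _compute_dplus(_cdfvals, _x)[0])
assert np.isclose(_dminus.max(), _compute_dminus(_cdfvals, _x)[0])
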
7608@_rename_parameter("mode", "method")
7609def ks_1samp(x, cdf, args=(), alternative='two-sided', method='auto'):
7610 """
7611 Performs the one-sample Kolmogorov-Smirnov test for goodness of fit.
7613 This test compares the underlying distribution F(x) of a sample
7614 against a given continuous distribution G(x). See Notes for a description
7615 of the available null and alternative hypotheses.
7617 Parameters
7618 ----------
7619 x : array_like
7620 a 1-D array of observations of iid random variables.
7621 cdf : callable
7622 callable used to calculate the cdf.
7623 args : tuple, sequence, optional
7624 Distribution parameters, used with `cdf`.
7625 alternative : {'two-sided', 'less', 'greater'}, optional
7626 Defines the null and alternative hypotheses. Default is 'two-sided'.
7627 Please see explanations in the Notes below.
7628 method : {'auto', 'exact', 'approx', 'asymp'}, optional
7629 Defines the distribution used for calculating the p-value.
7630 The following options are available (default is 'auto'):
7632 * 'auto' : selects one of the other options.
7633 * 'exact' : uses the exact distribution of test statistic.
7634 * 'approx' : approximates the two-sided probability with twice
7635 the one-sided probability
7636 * 'asymp': uses asymptotic distribution of test statistic
7638 Returns
7639 -------
7640 res: KstestResult
7641 An object containing attributes:
7643 statistic : float
7644 KS test statistic, either D+, D-, or D (the maximum of the two)
7645 pvalue : float
7646 One-tailed or two-tailed p-value.
7647 statistic_location : float
7648 Value of `x` corresponding with the KS statistic; i.e., the
7649 distance between the empirical distribution function and the
7650 hypothesized cumulative distribution function is measured at this
7651 observation.
7652 statistic_sign : int
7653 +1 if the KS statistic is the maximum positive difference between
7654 the empirical distribution function and the hypothesized cumulative
7655 distribution function (D+); -1 if the KS statistic is the maximum
7656 negative difference (D-).
7659 See Also
7660 --------
7661 ks_2samp, kstest
7663 Notes
7664 -----
7665 There are three options for the null and corresponding alternative
7666 hypothesis that can be selected using the `alternative` parameter.
7668 - `two-sided`: The null hypothesis is that the two distributions are
7669 identical, F(x)=G(x) for all x; the alternative is that they are not
7670 identical.
7672 - `less`: The null hypothesis is that F(x) >= G(x) for all x; the
7673 alternative is that F(x) < G(x) for at least one x.
7675 - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the
7676 alternative is that F(x) > G(x) for at least one x.
7678 Note that the alternative hypotheses describe the *CDFs* of the
7679 underlying distributions, not the observed values. For example,
7680 suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in
7681 x1 tend to be less than those in x2.
7683 Examples
7684 --------
7685 Suppose we wish to test the null hypothesis that a sample is distributed
7686 according to the standard normal.
7687 We choose a confidence level of 95%; that is, we will reject the null
7688 hypothesis in favor of the alternative if the p-value is less than 0.05.
7690 When testing uniformly distributed data, we would expect the
7691 null hypothesis to be rejected.
7693 >>> import numpy as np
7694 >>> from scipy import stats
7695 >>> rng = np.random.default_rng()
7696 >>> stats.ks_1samp(stats.uniform.rvs(size=100, random_state=rng),
7697 ... stats.norm.cdf)
7698 KstestResult(statistic=0.5001899973268688, pvalue=1.1616392184763533e-23)
7700 Indeed, the p-value is lower than our threshold of 0.05, so we reject the
7701 null hypothesis in favor of the default "two-sided" alternative: the data
7702 are *not* distributed according to the standard normal.
7704 When testing random variates from the standard normal distribution, we
7705 expect the data to be consistent with the null hypothesis most of the time.
7707 >>> x = stats.norm.rvs(size=100, random_state=rng)
7708 >>> stats.ks_1samp(x, stats.norm.cdf)
7709 KstestResult(statistic=0.05345882212970396, pvalue=0.9227159037744717)
7711 As expected, the p-value of 0.92 is not below our threshold of 0.05, so
7712 we cannot reject the null hypothesis.
7714 Suppose, however, that the random variates are distributed according to
7715 a normal distribution that is shifted toward greater values. In this case,
7716 the cumulative distribution function (CDF) of the underlying distribution tends
7717 to be *less* than the CDF of the standard normal. Therefore, we would
7718 expect the null hypothesis to be rejected with ``alternative='less'``:
7720 >>> x = stats.norm.rvs(size=100, loc=0.5, random_state=rng)
7721 >>> stats.ks_1samp(x, stats.norm.cdf, alternative='less')
7722 KstestResult(statistic=0.17482387821055168, pvalue=0.001913921057766743)
7724 and indeed, with p-value smaller than our threshold, we reject the null
7725 hypothesis in favor of the alternative.
7727 """
7728 mode = method
7730 alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
7731 alternative.lower()[0], alternative)
7732 if alternative not in ['two-sided', 'greater', 'less']:
7733 raise ValueError("Unexpected alternative %s" % alternative)
7734 if np.ma.is_masked(x):
7735 x = x.compressed()
7737 N = len(x)
7738 x = np.sort(x)
7739 cdfvals = cdf(x, *args)
7741 if alternative == 'greater':
7742 Dplus, d_location = _compute_dplus(cdfvals, x)
7743 return KstestResult(Dplus, distributions.ksone.sf(Dplus, N),
7744 statistic_location=d_location,
7745 statistic_sign=1)
7747 if alternative == 'less':
7748 Dminus, d_location = _compute_dminus(cdfvals, x)
7749 return KstestResult(Dminus, distributions.ksone.sf(Dminus, N),
7750 statistic_location=d_location,
7751 statistic_sign=-1)
7753 # alternative == 'two-sided':
7754 Dplus, dplus_location = _compute_dplus(cdfvals, x)
7755 Dminus, dminus_location = _compute_dminus(cdfvals, x)
7756 if Dplus > Dminus:
7757 D = Dplus
7758 d_location = dplus_location
7759 d_sign = 1
7760 else:
7761 D = Dminus
7762 d_location = dminus_location
7763 d_sign = -1
7765 if mode == 'auto': # Always select exact
7766 mode = 'exact'
7767 if mode == 'exact':
7768 prob = distributions.kstwo.sf(D, N)
7769 elif mode == 'asymp':
7770 prob = distributions.kstwobign.sf(D * np.sqrt(N))
7771 else:
7772 # mode == 'approx'
7773 prob = 2 * distributions.ksone.sf(D, N)
7774 prob = np.clip(prob, 0, 1)
7775 return KstestResult(D, prob,
7776 statistic_location=d_location,
7777 statistic_sign=d_sign)
7780Ks_2sampResult = KstestResult
7783def _compute_prob_outside_square(n, h):
7784 """
7785 Compute the proportion of paths that pass outside the two diagonal lines.
7787 Parameters
7788 ----------
7789 n : integer
7790 n > 0
7791 h : integer
7792 0 <= h <= n
7794 Returns
7795 -------
7796 p : float
7797 The proportion of paths that pass outside the lines x-y = +/-h.
7799 """
7800 # Compute Pr(D_{n,n} >= h/n)
7801 # Prob = 2 * ( binom(2n, n-h) - binom(2n, n-2h) + binom(2n, n-3h) - ... )
7802 # / binom(2n, n)
7803 # This formulation exhibits subtractive cancellation.
7804 # Instead divide each term by binom(2n, n), then factor common terms
7805 # and use a Horner-like algorithm
7806 # P = 2 * A0 * (1 - A1*(1 - A2*(1 - A3*(1 - A4*(...)))))
7808 P = 0.0
7809 k = int(np.floor(n / h))
7810 while k >= 0:
7811 p1 = 1.0
7812 # Each of the Ai terms has numerator and denominator with
7813 # h simple terms.
7814 for j in range(h):
7815 p1 = (n - k * h - j) * p1 / (n + k * h + j + 1)
7816 P = p1 * (1.0 - P)
7817 k -= 1
7818 return 2 * P
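
# Editor's sketch (small n only): the direct alternating-binomial form of the
# probability noted in the comments above; the function reorganizes it to
# avoid subtractive cancellation, and the two agree for tiny inputs.
from math import comb

_n, _h = 10, 3
_direct = 2 * sum((-1) ** (_j + 1) * comb(2 * _n, _n - _j * _h)
                  for _j in range(1, _n // _h + 1)) / comb(2 * _n, _n)
assert abs(_direct - _compute_prob_outside_square(_n, _h)) < 1e-12
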
7821def _count_paths_outside_method(m, n, g, h):
7822 """Count the number of paths that pass outside the specified diagonal.
7824 Parameters
7825 ----------
7826 m : integer
7827 m > 0
7828 n : integer
7829 n > 0
7830 g : integer
7831 g is greatest common divisor of m and n
7832 h : integer
7833 0 <= h <= lcm(m,n)
7835 Returns
7836 -------
7837 p : float
7838 The number of paths that pass below the specified diagonal.
7839 The calculation may overflow; check for a finite answer.
7841 Notes
7842 -----
7843 Count the integer lattice paths from (0, 0) to (m, n), which at some
7844 point (x, y) along the path, satisfy:
7845 m*y <= n*x - h*g
7846 The paths make steps of size +1 in either positive x or positive y
7847 directions.
7849 We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk.
7850 Hodges, J.L. Jr.,
7851 "The Significance Probability of the Smirnov Two-Sample Test,"
7852 Arkiv för Matematik, 3, No. 43 (1958), 469-86.
7854 """
7855 # Compute #paths which stay lower than x/m-y/n = h/lcm(m,n)
7856 # B(x, y) = #{paths from (0,0) to (x,y) without
7857 # previously crossing the boundary}
7858 # = binom(x, y) - #{paths which already reached the boundary}
7859 # Multiply by the number of path extensions going from (x, y) to (m, n)
7860 # Sum.
7862 # Probability is symmetrical in m, n. Computation below assumes m >= n.
7863 if m < n:
7864 m, n = n, m
7865 mg = m // g
7866 ng = n // g
7868 # Not every x needs to be considered.
7869 # xj holds the list of x values to be checked.
7870 # Wherever n*x/m + ng*h crosses an integer
7871 lxj = n + (mg-h)//mg
7872 xj = [(h + mg * j + ng-1)//ng for j in range(lxj)]
7873 # B is an array just holding a few values of B(x,y), the ones needed.
7874 # B[j] == B(x_j, j)
7875 if lxj == 0:
7876 return special.binom(m + n, n)
7877 B = np.zeros(lxj)
7878 B[0] = 1
7879 # Compute the B(x, y) terms
7880 for j in range(1, lxj):
7881 Bj = special.binom(xj[j] + j, j)
7882 for i in range(j):
7883 bin = special.binom(xj[j] - xj[i] + j - i, j-i)
7884 Bj -= bin * B[i]
7885 B[j] = Bj
7886 # Compute the number of path extensions...
7887 num_paths = 0
7888 for j in range(lxj):
7889 bin = special.binom((m-xj[j]) + (n - j), n-j)
7890 term = B[j] * bin
7891 num_paths += term
7892 return num_paths
7895def _attempt_exact_2kssamp(n1, n2, g, d, alternative):
7896 """Attempts to compute the exact 2sample probability.
7898 n1, n2 are the sample sizes
7899 g is the gcd(n1, n2)
7900 d is the computed max difference in ECDFs
7902 Returns (success, d, probability)
7903 """
7904 lcm = (n1 // g) * n2
7905 h = int(np.round(d * lcm))
7906 d = h * 1.0 / lcm
7907 if h == 0:
7908 return True, d, 1.0
7909 saw_fp_error, prob = False, np.nan
7910 try:
7911 with np.errstate(invalid="raise", over="raise"):
7912 if alternative == 'two-sided':
7913 if n1 == n2:
7914 prob = _compute_prob_outside_square(n1, h)
7915 else:
7916 prob = _compute_outer_prob_inside_method(n1, n2, g, h)
7917 else:
7918 if n1 == n2:
7919 # prob = binom(2n, n-h) / binom(2n, n)
7920 # Evaluating in that form incurs roundoff errors
7921 # from special.binom. Instead calculate directly
7922 jrange = np.arange(h)
7923 prob = np.prod((n1 - jrange) / (n1 + jrange + 1.0))
7924 else:
7925 with np.errstate(over='raise'):
7926 num_paths = _count_paths_outside_method(n1, n2, g, h)
7927 bin = special.binom(n1 + n2, n1)
7928 if num_paths > bin or np.isinf(bin):
7929 saw_fp_error = True
7930 else:
7931 prob = num_paths / bin
7933 except (FloatingPointError, OverflowError):
7934 saw_fp_error = True
7936 if saw_fp_error:
7937 return False, d, np.nan
7938 if not (0 <= prob <= 1):
7939 return False, d, prob
7940 return True, d, prob
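
# Editor's sketch (illustration only): for equal sample sizes the one-sided
# branch above evaluates ``binom(2n, n-h) / binom(2n, n)`` as a running
# product to avoid round-off from large binomials; a direct check for small n.
import numpy as np
from math import comb

_n, _h = 12, 4
_jrange = np.arange(_h)
_prob_product = np.prod((_n - _jrange) / (_n + _jrange + 1.0))
_prob_binom = comb(2 * _n, _n - _h) / comb(2 * _n, _n)
assert np.isclose(_prob_product, _prob_binom)
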
7943@_rename_parameter("mode", "method")
7944def ks_2samp(data1, data2, alternative='two-sided', method='auto'):
7945 """
7946 Performs the two-sample Kolmogorov-Smirnov test for goodness of fit.
7948 This test compares the underlying continuous distributions F(x) and G(x)
7949 of two independent samples. See Notes for a description of the available
7950 null and alternative hypotheses.
7952 Parameters
7953 ----------
7954 data1, data2 : array_like, 1-Dimensional
7955 Two arrays of sample observations assumed to be drawn from a continuous
7956 distribution; the sample sizes can be different.
7957 alternative : {'two-sided', 'less', 'greater'}, optional
7958 Defines the null and alternative hypotheses. Default is 'two-sided'.
7959 Please see explanations in the Notes below.
7960 method : {'auto', 'exact', 'asymp'}, optional
7961 Defines the method used for calculating the p-value.
7962 The following options are available (default is 'auto'):
7964 * 'auto' : use 'exact' for small size arrays, 'asymp' for large
7965 * 'exact' : use exact distribution of test statistic
7966 * 'asymp' : use asymptotic distribution of test statistic
7968 Returns
7969 -------
7970 res: KstestResult
7971 An object containing attributes:
7973 statistic : float
7974 KS test statistic.
7975 pvalue : float
7976 One-tailed or two-tailed p-value.
7977 statistic_location : float
7978 Value from `data1` or `data2` corresponding with the KS statistic;
7979 i.e., the distance between the empirical distribution functions is
7980 measured at this observation.
7981 statistic_sign : int
7982 +1 if the empirical distribution function of `data1` exceeds
7983 the empirical distribution function of `data2` at
7984 `statistic_location`, otherwise -1.
7986 See Also
7987 --------
7988 kstest, ks_1samp, epps_singleton_2samp, anderson_ksamp
7990 Notes
7991 -----
7992 There are three options for the null and corresponding alternative
7993 hypothesis that can be selected using the `alternative` parameter.
7995 - `less`: The null hypothesis is that F(x) >= G(x) for all x; the
7996 alternative is that F(x) < G(x) for at least one x. The statistic
7997 is the magnitude of the minimum (most negative) difference between the
7998 empirical distribution functions of the samples.
8000 - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the
8001 alternative is that F(x) > G(x) for at least one x. The statistic
8002 is the maximum (most positive) difference between the empirical
8003 distribution functions of the samples.
8005 - `two-sided`: The null hypothesis is that the two distributions are
8006 identical, F(x)=G(x) for all x; the alternative is that they are not
8007 identical. The statistic is the maximum absolute difference between the
8008 empirical distribution functions of the samples.
8010 Note that the alternative hypotheses describe the *CDFs* of the
8011 underlying distributions, not the observed values of the data. For example,
8012 suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in
8013 x1 tend to be less than those in x2.
8015 If the KS statistic is large, then the p-value will be small, and this may
8016 be taken as evidence against the null hypothesis in favor of the
8017 alternative.
8019 If ``method='exact'``, `ks_2samp` attempts to compute an exact p-value,
8020 that is, the probability under the null hypothesis of obtaining a test
8021 statistic value as extreme as the value computed from the data.
8022 If ``method='asymp'``, the asymptotic Kolmogorov-Smirnov distribution is
8023 used to compute an approximate p-value.
8024 If ``method='auto'``, an exact p-value computation is attempted if both
8025 sample sizes are less than 10000; otherwise, the asymptotic method is used.
8026 In any case, if an exact p-value calculation is attempted and fails, a
8027 warning will be emitted, and the asymptotic p-value will be returned.
8029 The 'two-sided' 'exact' computation computes the complementary probability
8030 and then subtracts from 1. As such, the minimum probability it can return
8031 is about 1e-16. While the algorithm itself is exact, numerical
8032 errors may accumulate for large sample sizes. It is most suited to
8033 situations in which one of the sample sizes is only a few thousand.
8035 We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk [1]_.
8037 References
8038 ----------
8039 .. [1] Hodges, J.L. Jr., "The Significance Probability of the Smirnov
8040 Two-Sample Test," Arkiv för Matematik, 3, No. 43 (1958), 469-86.
8042 Examples
8043 --------
8044 Suppose we wish to test the null hypothesis that two samples were drawn
8045 from the same distribution.
8046 We choose a confidence level of 95%; that is, we will reject the null
8047 hypothesis in favor of the alternative if the p-value is less than 0.05.
8049 If the first sample were drawn from a uniform distribution and the second
8050 were drawn from the standard normal, we would expect the null hypothesis
8051 to be rejected.
8053 >>> import numpy as np
8054 >>> from scipy import stats
8055 >>> rng = np.random.default_rng()
8056 >>> sample1 = stats.uniform.rvs(size=100, random_state=rng)
8057 >>> sample2 = stats.norm.rvs(size=110, random_state=rng)
8058 >>> stats.ks_2samp(sample1, sample2)
8059 KstestResult(statistic=0.5454545454545454, pvalue=7.37417839555191e-15)
8061 Indeed, the p-value is lower than our threshold of 0.05, so we reject the
8062 null hypothesis in favor of the default "two-sided" alternative: the data
8063 were *not* drawn from the same distribution.
8065 When both samples are drawn from the same distribution, we expect the data
8066 to be consistent with the null hypothesis most of the time.
8068 >>> sample1 = stats.norm.rvs(size=105, random_state=rng)
8069 >>> sample2 = stats.norm.rvs(size=95, random_state=rng)
8070 >>> stats.ks_2samp(sample1, sample2)
8071 KstestResult(statistic=0.10927318295739348, pvalue=0.5438289009927495)
8073 As expected, the p-value of 0.54 is not below our threshold of 0.05, so
8074 we cannot reject the null hypothesis.
8076 Suppose, however, that the first sample were drawn from
8077 a normal distribution shifted toward greater values. In this case,
8078 the cumulative distribution function (CDF) of the underlying distribution tends
8079 to be *less* than the CDF underlying the second sample. Therefore, we would
8080 expect the null hypothesis to be rejected with ``alternative='less'``:
8082 >>> sample1 = stats.norm.rvs(size=105, loc=0.5, random_state=rng)
8083 >>> stats.ks_2samp(sample1, sample2, alternative='less')
8084 KstestResult(statistic=0.4055137844611529, pvalue=3.5474563068855554e-08)
8086 and indeed, with p-value smaller than our threshold, we reject the null
8087 hypothesis in favor of the alternative.
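As a final illustrative sketch (outputs omitted because they depend on the
random samples drawn), the p-value computation described in the Notes can be
selected explicitly via `method`:

>>> stats.ks_2samp(sample1, sample2, method='exact').pvalue  # doctest: +SKIP
>>> stats.ks_2samp(sample1, sample2, method='asymp').pvalue  # doctest: +SKIP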
8089 """
8090 mode = method
8092 if mode not in ['auto', 'exact', 'asymp']:
8093 raise ValueError(f'Invalid value for mode: {mode}')
8094 alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
8095 alternative.lower()[0], alternative)
8096 if alternative not in ['two-sided', 'less', 'greater']:
8097 raise ValueError(f'Invalid value for alternative: {alternative}')
8098 MAX_AUTO_N = 10000 # 'auto' will attempt to be exact if n1,n2 <= MAX_AUTO_N
8099 if np.ma.is_masked(data1):
8100 data1 = data1.compressed()
8101 if np.ma.is_masked(data2):
8102 data2 = data2.compressed()
8103 data1 = np.sort(data1)
8104 data2 = np.sort(data2)
8105 n1 = data1.shape[0]
8106 n2 = data2.shape[0]
8107 if min(n1, n2) == 0:
8108 raise ValueError('Data passed to ks_2samp must not be empty')
8110 data_all = np.concatenate([data1, data2])
8111 # using searchsorted solves equal data problem
8112 cdf1 = np.searchsorted(data1, data_all, side='right') / n1
8113 cdf2 = np.searchsorted(data2, data_all, side='right') / n2
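# cdf1[i] and cdf2[i] are the empirical CDFs of data1 and data2 evaluated at
# data_all[i]; their pointwise difference gives the signed KS distances.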
8114 cddiffs = cdf1 - cdf2
8116 # Identify the location of the statistic
8117 argminS = np.argmin(cddiffs)
8118 argmaxS = np.argmax(cddiffs)
8119 loc_minS = data_all[argminS]
8120 loc_maxS = data_all[argmaxS]
8122 # Ensure sign of minS is not negative.
8123 minS = np.clip(-cddiffs[argminS], 0, 1)
8124 maxS = cddiffs[argmaxS]
8126 if alternative == 'less' or (alternative == 'two-sided' and minS > maxS):
8127 d = minS
8128 d_location = loc_minS
8129 d_sign = -1
8130 else:
8131 d = maxS
8132 d_location = loc_maxS
8133 d_sign = 1
8134 g = gcd(n1, n2)
8135 n1g = n1 // g
8136 n2g = n2 // g
8137 prob = -np.inf
8138 if mode == 'auto':
8139 mode = 'exact' if max(n1, n2) <= MAX_AUTO_N else 'asymp'
8140 elif mode == 'exact':
8141 # If lcm(n1, n2) is too big, switch from exact to asymp
8142 if n1g >= np.iinfo(np.int32).max / n2g:
8143 mode = 'asymp'
8144 warnings.warn(
8145 f"Exact ks_2samp calculation not possible with samples sizes "
8146 f"{n1} and {n2}. Switching to 'asymp'.", RuntimeWarning,
8147 stacklevel=3)
8149 if mode == 'exact':
8150 success, d, prob = _attempt_exact_2kssamp(n1, n2, g, d, alternative)
8151 if not success:
8152 mode = 'asymp'
8153 warnings.warn(f"ks_2samp: Exact calculation unsuccessful. "
8154 f"Switching to method={mode}.", RuntimeWarning,
8155 stacklevel=3)
8157 if mode == 'asymp':
8158 # The product n1*n2 is large. Use Smirnov's asymptotic formula.
8159 # Ensure float to avoid overflow in multiplication
8160 # sorted because the one-sided formula is not symmetric in n1, n2
8161 m, n = sorted([float(n1), float(n2)], reverse=True)
8162 en = m * n / (m + n)
8163 if alternative == 'two-sided':
8164 prob = distributions.kstwo.sf(d, np.round(en))
8165 else:
8166 z = np.sqrt(en) * d
8167 # Use Hodges' suggested approximation Eqn 5.3
8168 # Requires m to be the larger of (n1, n2)
8169 expt = -2 * z**2 - 2 * z * (m + 2*n)/np.sqrt(m*n*(m+n))/3.0
8170 prob = np.exp(expt)
8172 prob = np.clip(prob, 0, 1)
8173 return KstestResult(d, prob, statistic_location=d_location,
8174 statistic_sign=d_sign)
8177def _parse_kstest_args(data1, data2, args, N):
8178 # kstest allows many different variations of arguments.
8179 # Pull out the parsing into a separate function
8180 # (xvals, yvals, ) # 2sample
8181 # (xvals, cdf function,..)
8182 # (xvals, name of distribution, ...)
8183 # (name of distribution, name of distribution, ...)
8185 # Returns xvals, yvals, cdf
8186 # where cdf is a cdf function, or None
8187 # and yvals is either an array_like of values, or None
8188 # and xvals is array_like.
8189 rvsfunc, cdf = None, None
8190 if isinstance(data1, str):
8191 rvsfunc = getattr(distributions, data1).rvs
8192 elif callable(data1):
8193 rvsfunc = data1
8195 if isinstance(data2, str):
8196 cdf = getattr(distributions, data2).cdf
8197 data2 = None
8198 elif callable(data2):
8199 cdf = data2
8200 data2 = None
8202 data1 = np.sort(rvsfunc(*args, size=N) if rvsfunc else data1)
8203 return data1, data2, cdf
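# A minimal sketch of the call patterns accepted above (purely illustrative;
# the literal values are made up):
#
#   _parse_kstest_args([0.1, 0.5, 0.9], 'norm', (), 20)  # data vs. named cdf
#   _parse_kstest_args('norm', 'norm', (), 20)           # 20 variates vs. cdf
#   _parse_kstest_args([0.1, 0.5], [0.2, 0.7], (), 20)   # two-sample; cdf is None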
8206@_rename_parameter("mode", "method")
8207def kstest(rvs, cdf, args=(), N=20, alternative='two-sided', method='auto'):
8208 """
8209 Performs the (one-sample or two-sample) Kolmogorov-Smirnov test for
8210 goodness of fit.
8212 The one-sample test compares the underlying distribution F(x) of a sample
8213 against a given distribution G(x). The two-sample test compares the
8214 underlying distributions of two independent samples. Both tests are valid
8215 only for continuous distributions.
8217 Parameters
8218 ----------
8219 rvs : str, array_like, or callable
8220 If an array, it should be a 1-D array of observations of random
8221 variables.
8222 If a callable, it should be a function to generate random variables;
8223 it is required to have a keyword argument `size`.
8224 If a string, it should be the name of a distribution in `scipy.stats`,
8225 which will be used to generate random variables.
8226 cdf : str, array_like or callable
8227 If array_like, it should be a 1-D array of observations of random
8228 variables, and the two-sample test is performed
8229 (and rvs must be array_like).
8230 If a callable, that callable is used to calculate the cdf.
8231 If a string, it should be the name of a distribution in `scipy.stats`,
8232 which will be used as the cdf function.
8233 args : tuple, sequence, optional
8234 Distribution parameters, used if `rvs` or `cdf` are strings or
8235 callables.
8236 N : int, optional
8237 Sample size if `rvs` is string or callable. Default is 20.
8238 alternative : {'two-sided', 'less', 'greater'}, optional
8239 Defines the null and alternative hypotheses. Default is 'two-sided'.
8240 Please see explanations in the Notes below.
8241 method : {'auto', 'exact', 'approx', 'asymp'}, optional
8242 Defines the distribution used for calculating the p-value.
8243 The following options are available (default is 'auto'):
8245 * 'auto' : selects one of the other options.
8246 * 'exact' : uses the exact distribution of test statistic.
8247 * 'approx' : approximates the two-sided probability with twice the
8248 one-sided probability
8249 * 'asymp': uses asymptotic distribution of test statistic
8251 Returns
8252 -------
8253 res: KstestResult
8254 An object containing attributes:
8256 statistic : float
8257 KS test statistic, either D+, D-, or D (the maximum of the two)
8258 pvalue : float
8259 One-tailed or two-tailed p-value.
8260 statistic_location : float
8261 In a one-sample test, this is the value of `rvs`
8262 corresponding with the KS statistic; i.e., the distance between
8263 the empirical distribution function and the hypothesized cumulative
8264 distribution function is measured at this observation.
8266 In a two-sample test, this is the value from `rvs` or `cdf`
8267 corresponding with the KS statistic; i.e., the distance between
8268 the empirical distribution functions is measured at this
8269 observation.
8270 statistic_sign : int
8271 In a one-sample test, this is +1 if the KS statistic is the
8272 maximum positive difference between the empirical distribution
8273 function and the hypothesized cumulative distribution function
8274 (D+); it is -1 if the KS statistic is the maximum negative
8275 difference (D-).
8277 In a two-sample test, this is +1 if the empirical distribution
8278 function of `rvs` exceeds the empirical distribution
8279 function of `cdf` at `statistic_location`, otherwise -1.
8281 See Also
8282 --------
8283 ks_1samp, ks_2samp
8285 Notes
8286 -----
8287 There are three options for the null and corresponding alternative
8288 hypothesis that can be selected using the `alternative` parameter.
8290 - `two-sided`: The null hypothesis is that the two distributions are
8291 identical, F(x)=G(x) for all x; the alternative is that they are not
8292 identical.
8294 - `less`: The null hypothesis is that F(x) >= G(x) for all x; the
8295 alternative is that F(x) < G(x) for at least one x.
8297 - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the
8298 alternative is that F(x) > G(x) for at least one x.
8300 Note that the alternative hypotheses describe the *CDFs* of the
8301 underlying distributions, not the observed values. For example,
8302 suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in
8303 x1 tend to be less than those in x2.
8306 Examples
8307 --------
8308 Suppose we wish to test the null hypothesis that a sample is distributed
8309 according to the standard normal.
8310 We choose a confidence level of 95%; that is, we will reject the null
8311 hypothesis in favor of the alternative if the p-value is less than 0.05.
8313 When testing uniformly distributed data, we would expect the
8314 null hypothesis to be rejected.
8316 >>> import numpy as np
8317 >>> from scipy import stats
8318 >>> rng = np.random.default_rng()
8319 >>> stats.kstest(stats.uniform.rvs(size=100, random_state=rng),
8320 ... stats.norm.cdf)
8321 KstestResult(statistic=0.5001899973268688, pvalue=1.1616392184763533e-23)
8323 Indeed, the p-value is lower than our threshold of 0.05, so we reject the
8324 null hypothesis in favor of the default "two-sided" alternative: the data
8325 are *not* distributed according to the standard normal.
8327 When testing random variates from the standard normal distribution, we
8328 expect the data to be consistent with the null hypothesis most of the time.
8330 >>> x = stats.norm.rvs(size=100, random_state=rng)
8331 >>> stats.kstest(x, stats.norm.cdf)
8332 KstestResult(statistic=0.05345882212970396, pvalue=0.9227159037744717)
8334 As expected, the p-value of 0.92 is not below our threshold of 0.05, so
8335 we cannot reject the null hypothesis.
8337 Suppose, however, that the random variates are distributed according to
8338 a normal distribution that is shifted toward greater values. In this case,
8339 the cumulative distribution function (CDF) of the underlying distribution tends
8340 to be *less* than the CDF of the standard normal. Therefore, we would
8341 expect the null hypothesis to be rejected with ``alternative='less'``:
8343 >>> x = stats.norm.rvs(size=100, loc=0.5, random_state=rng)
8344 >>> stats.kstest(x, stats.norm.cdf, alternative='less')
8345 KstestResult(statistic=0.17482387821055168, pvalue=0.001913921057766743)
8347 and indeed, with p-value smaller than our threshold, we reject the null
8348 hypothesis in favor of the alternative.
8350 For convenience, the previous test can be performed using the name of the
8351 distribution as the second argument.
8353 >>> stats.kstest(x, "norm", alternative='less')
8354 KstestResult(statistic=0.17482387821055168, pvalue=0.001913921057766743)
8356 The examples above have all been one-sample tests identical to those
8357 performed by `ks_1samp`. Note that `kstest` can also perform two-sample
8358 tests identical to those performed by `ks_2samp`. For example, when two
8359 samples are drawn from the same distribution, we expect the data to be
8360 consistent with the null hypothesis most of the time.
8362 >>> sample1 = stats.laplace.rvs(size=105, random_state=rng)
8363 >>> sample2 = stats.laplace.rvs(size=95, random_state=rng)
8364 >>> stats.kstest(sample1, sample2)
8365 KstestResult(statistic=0.11779448621553884, pvalue=0.4494256912629795)
8367 As expected, the p-value of 0.45 is not below our threshold of 0.05, so
8368 we cannot reject the null hypothesis.
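As an illustrative sketch of the callable form of `rvs` (output omitted
because it depends on the generated variates), a function accepting a
``size`` keyword can be passed in place of data:

>>> def rvs(size):
...     return stats.norm.rvs(size=size, random_state=rng)
>>> stats.kstest(rvs, "norm", N=100)  # doctest: +SKIP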
8370 """
8371 # to not break compatibility with existing code
8372 if alternative == 'two_sided':
8373 alternative = 'two-sided'
8374 if alternative not in ['two-sided', 'greater', 'less']:
8375 raise ValueError("Unexpected alternative %s" % alternative)
8376 xvals, yvals, cdf = _parse_kstest_args(rvs, cdf, args, N)
8377 if cdf:
8378 return ks_1samp(xvals, cdf, args=args, alternative=alternative,
8379 method=method)
8380 return ks_2samp(xvals, yvals, alternative=alternative, method=method)
8383def tiecorrect(rankvals):
8384 """Tie correction factor for Mann-Whitney U and Kruskal-Wallis H tests.
8386 Parameters
8387 ----------
8388 rankvals : array_like
8389 A 1-D sequence of ranks. Typically this will be the array
8390 returned by `~scipy.stats.rankdata`.
8392 Returns
8393 -------
8394 factor : float
8395 Correction factor for U or H.
8397 See Also
8398 --------
8399 rankdata : Assign ranks to the data
8400 mannwhitneyu : Mann-Whitney rank test
8401 kruskal : Kruskal-Wallis H test
8403 References
8404 ----------
8405 .. [1] Siegel, S. (1956) Nonparametric Statistics for the Behavioral
8406 Sciences. New York: McGraw-Hill.
8408 Examples
8409 --------
8410 >>> from scipy.stats import tiecorrect, rankdata
8411 >>> tiecorrect([1, 2.5, 2.5, 4])
8412 0.9
8413 >>> ranks = rankdata([1, 3, 2, 4, 5, 7, 2, 8, 4])
8414 >>> ranks
8415 array([ 1. , 4. , 2.5, 5.5, 7. , 8. , 2.5, 9. , 5.5])
8416 >>> tiecorrect(ranks)
8417 0.9833333333333333
8419 """
8420 arr = np.sort(rankvals)
8421 idx = np.nonzero(np.r_[True, arr[1:] != arr[:-1], True])[0]
8422 cnt = np.diff(idx).astype(np.float64)
8424 size = np.float64(arr.size)
8425 return 1.0 if size < 2 else 1.0 - (cnt**3 - cnt).sum() / (size**3 - size)
8428RanksumsResult = namedtuple('RanksumsResult', ('statistic', 'pvalue'))
8431@_axis_nan_policy_factory(RanksumsResult, n_samples=2)
8432def ranksums(x, y, alternative='two-sided'):
8433 """Compute the Wilcoxon rank-sum statistic for two samples.
8435 The Wilcoxon rank-sum test tests the null hypothesis that two sets
8436 of measurements are drawn from the same distribution. The alternative
8437 hypothesis is that values in one sample are more likely to be
8438 larger than the values in the other sample.
8440 This test should be used to compare two samples from continuous
8441 distributions. It does not handle ties between measurements
8442 in x and y. For tie-handling and an optional continuity correction
8443 see `scipy.stats.mannwhitneyu`.
8445 Parameters
8446 ----------
8447 x,y : array_like
8448 The data from the two samples.
8449 alternative : {'two-sided', 'less', 'greater'}, optional
8450 Defines the alternative hypothesis. Default is 'two-sided'.
8451 The following options are available:
8453 * 'two-sided': one of the distributions (underlying `x` or `y`) is
8454 stochastically greater than the other.
8455 * 'less': the distribution underlying `x` is stochastically less
8456 than the distribution underlying `y`.
8457 * 'greater': the distribution underlying `x` is stochastically greater
8458 than the distribution underlying `y`.
8460 .. versionadded:: 1.7.0
8462 Returns
8463 -------
8464 statistic : float
8465 The test statistic under the large-sample approximation that the
8466 rank sum statistic is normally distributed.
8467 pvalue : float
8468 The p-value of the test.
8470 References
8471 ----------
8472 .. [1] https://en.wikipedia.org/wiki/Wilcoxon_rank-sum_test
8474 Examples
8475 --------
8476 We can test the hypothesis that two independent unequal-sized samples are
8477 drawn from the same distribution by computing the Wilcoxon rank-sum
8478 statistic.
8480 >>> import numpy as np
8481 >>> from scipy.stats import ranksums
8482 >>> rng = np.random.default_rng()
8483 >>> sample1 = rng.uniform(-1, 1, 200)
8484 >>> sample2 = rng.uniform(-0.5, 1.5, 300) # a shifted distribution
8485 >>> ranksums(sample1, sample2)
8486 RanksumsResult(statistic=-7.887059, pvalue=3.09390448e-15) # may vary
8487 >>> ranksums(sample1, sample2, alternative='less')
8488 RanksumsResult(statistic=-7.750585297581713, pvalue=4.573497606342543e-15) # may vary
8489 >>> ranksums(sample1, sample2, alternative='greater')
8490 RanksumsResult(statistic=-7.750585297581713, pvalue=0.9999999999999954) # may vary
8492 The p-value of less than ``0.05`` indicates that this test rejects the
8493 hypothesis at the 5% significance level.
8495 """
8496 x, y = map(np.asarray, (x, y))
8497 n1 = len(x)
8498 n2 = len(y)
8499 alldata = np.concatenate((x, y))
8500 ranked = rankdata(alldata)
8501 x = ranked[:n1]
8502 s = np.sum(x, axis=0)
8503 expected = n1 * (n1+n2+1) / 2.0
8504 z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)
8505 z, prob = _normtest_finish(z, alternative)
8507 return RanksumsResult(z, prob)
8510KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue'))
8513@_axis_nan_policy_factory(KruskalResult, n_samples=None)
8514def kruskal(*samples, nan_policy='propagate'):
8515 """Compute the Kruskal-Wallis H-test for independent samples.
8517 The Kruskal-Wallis H-test tests the null hypothesis that the population
8518 medians of all of the groups are equal. It is a non-parametric version of
8519 ANOVA. The test works on 2 or more independent samples, which may have
8520 different sizes. Note that rejecting the null hypothesis does not
8521 indicate which of the groups differs. Post hoc comparisons between
8522 groups are required to determine which groups are different.
8524 Parameters
8525 ----------
8526 sample1, sample2, ... : array_like
8527 Two or more arrays with the sample measurements can be given as
8528 arguments. Samples must be one-dimensional.
8529 nan_policy : {'propagate', 'raise', 'omit'}, optional
8530 Defines how to handle when input contains nan.
8531 The following options are available (default is 'propagate'):
8533 * 'propagate': returns nan
8534 * 'raise': throws an error
8535 * 'omit': performs the calculations ignoring nan values
8537 Returns
8538 -------
8539 statistic : float
8540 The Kruskal-Wallis H statistic, corrected for ties.
8541 pvalue : float
8542 The p-value for the test using the assumption that H has a chi
8543 square distribution. The p-value returned is the survival function of
8544 the chi square distribution evaluated at H.
8546 See Also
8547 --------
8548 f_oneway : 1-way ANOVA.
8549 mannwhitneyu : Mann-Whitney rank test on two samples.
8550 friedmanchisquare : Friedman test for repeated measurements.
8552 Notes
8553 -----
8554 Due to the assumption that H has a chi square distribution, the number
8555 of samples in each group must not be too small. A typical rule is
8556 that each sample must have at least 5 measurements.
8558 References
8559 ----------
8560 .. [1] W. H. Kruskal & W. W. Wallis, "Use of Ranks in
8561 One-Criterion Variance Analysis", Journal of the American Statistical
8562 Association, Vol. 47, Issue 260, pp. 583-621, 1952.
8563 .. [2] https://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance
8565 Examples
8566 --------
8567 >>> from scipy import stats
8568 >>> x = [1, 3, 5, 7, 9]
8569 >>> y = [2, 4, 6, 8, 10]
8570 >>> stats.kruskal(x, y)
8571 KruskalResult(statistic=0.2727272727272734, pvalue=0.6015081344405895)
8573 >>> x = [1, 1, 1]
8574 >>> y = [2, 2, 2]
8575 >>> z = [2, 2]
8576 >>> stats.kruskal(x, y, z)
8577 KruskalResult(statistic=7.0, pvalue=0.0301973834223185)
8579 """
8580 samples = list(map(np.asarray, samples))
8582 num_groups = len(samples)
8583 if num_groups < 2:
8584 raise ValueError("Need at least two groups in stats.kruskal()")
8586 for sample in samples:
8587 if sample.size == 0:
8588 return KruskalResult(np.nan, np.nan)
8589 elif sample.ndim != 1:
8590 raise ValueError("Samples must be one-dimensional.")
8592 n = np.asarray(list(map(len, samples)))
8594 if nan_policy not in ('propagate', 'raise', 'omit'):
8595 raise ValueError("nan_policy must be 'propagate', 'raise' or 'omit'")
8597 contains_nan = False
8598 for sample in samples:
8599 cn = _contains_nan(sample, nan_policy)
8600 if cn[0]:
8601 contains_nan = True
8602 break
8604 if contains_nan and nan_policy == 'omit':
8605 # mask invalid (nan) entries before delegating to the masked-array routine
8606 samples = [ma.masked_invalid(sample) for sample in samples]
8607 return mstats_basic.kruskal(*samples)
8609 if contains_nan and nan_policy == 'propagate':
8610 return KruskalResult(np.nan, np.nan)
8612 alldata = np.concatenate(samples)
8613 ranked = rankdata(alldata)
8614 ties = tiecorrect(ranked)
8615 if ties == 0:
8616 raise ValueError('All numbers are identical in kruskal')
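# Kruskal-Wallis statistic: H = 12 / (N*(N+1)) * sum_j(R_j**2 / n_j) - 3*(N+1),
# where R_j is the rank sum of group j and N = sum_j(n_j); H is then divided
# by the tie-correction factor.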
8618 # Compute sum^2/n for each group and sum
8619 j = np.insert(np.cumsum(n), 0, 0)
8620 ssbn = 0
8621 for i in range(num_groups):
8622 ssbn += _square_of_sums(ranked[j[i]:j[i+1]]) / n[i]
8624 totaln = np.sum(n, dtype=float)
8625 h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1)
8626 df = num_groups - 1
8627 h /= ties
8629 return KruskalResult(h, distributions.chi2.sf(h, df))
8632FriedmanchisquareResult = namedtuple('FriedmanchisquareResult',
8633 ('statistic', 'pvalue'))
8636def friedmanchisquare(*samples):
8637 """Compute the Friedman test for repeated samples.
8639 The Friedman test tests the null hypothesis that repeated samples of
8640 the same individuals have the same distribution. It is often used
8641 to test for consistency among samples obtained in different ways.
8642 For example, if two sampling techniques are used on the same set of
8643 individuals, the Friedman test can be used to determine if the two
8644 sampling techniques are consistent.
8646 Parameters
8647 ----------
8648 sample1, sample2, sample3... : array_like
8649 Arrays of observations. All of the arrays must have the same number
8650 of elements. At least three samples must be given.
8652 Returns
8653 -------
8654 statistic : float
8655 The test statistic, correcting for ties.
8656 pvalue : float
8657 The associated p-value assuming that the test statistic has a chi
8658 squared distribution.
8660 Notes
8661 -----
8662 Due to the assumption that the test statistic has a chi squared
8663 distribution, the p-value is only reliable for n > 10 and more than
8664 6 repeated samples.
8666 References
8667 ----------
8668 .. [1] https://en.wikipedia.org/wiki/Friedman_test
8670 """
8671 k = len(samples)
8672 if k < 3:
8673 raise ValueError('At least 3 sets of samples must be given '
8674 'for Friedman test, got {}.'.format(k))
8676 n = len(samples[0])
8677 for i in range(1, k):
8678 if len(samples[i]) != n:
8679 raise ValueError('Unequal N in friedmanchisquare. Aborting.')
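# Friedman statistic: ranks are assigned within each individual (row); with
# column rank sums R_j, the statistic is
# (12 / (k*n*(k+1)) * sum_j(R_j**2) - 3*n*(k+1)) / c,
# where c is the tie-correction factor computed below.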
8681 # Rank data
8682 data = np.vstack(samples).T
8683 data = data.astype(float)
8684 for i in range(len(data)):
8685 data[i] = rankdata(data[i])
8687 # Handle ties
8688 ties = 0
8689 for d in data:
8690 replist, repnum = find_repeats(array(d))
8691 for t in repnum:
8692 ties += t * (t*t - 1)
8693 c = 1 - ties / (k*(k*k - 1)*n)
8695 ssbn = np.sum(data.sum(axis=0)**2)
8696 chisq = (12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)) / c
8698 return FriedmanchisquareResult(chisq, distributions.chi2.sf(chisq, k - 1))
8701BrunnerMunzelResult = namedtuple('BrunnerMunzelResult',
8702 ('statistic', 'pvalue'))
8705def brunnermunzel(x, y, alternative="two-sided", distribution="t",
8706 nan_policy='propagate'):
8707 """Compute the Brunner-Munzel test on samples x and y.
8709 The Brunner-Munzel test is a nonparametric test of the null hypothesis that
8710 when values are taken one by one from each group, the probabilities of
8711 getting large values in both groups are equal.
8712 Unlike the Wilcoxon-Mann-Whitney U test, this does not require the
8713 assumption of equal variances in the two groups. Note that it does not assume
8714 that the distributions are the same. This test works on two independent samples,
8715 which may have different sizes.
8717 Parameters
8718 ----------
8719 x, y : array_like
8720 Array of samples, should be one-dimensional.
8721 alternative : {'two-sided', 'less', 'greater'}, optional
8722 Defines the alternative hypothesis.
8723 The following options are available (default is 'two-sided'):
8725 * 'two-sided'
8726 * 'less': one-sided
8727 * 'greater': one-sided
8728 distribution : {'t', 'normal'}, optional
8729 Defines how to get the p-value.
8730 The following options are available (default is 't'):
8732 * 't': get the p-value by t-distribution
8733 * 'normal': get the p-value by standard normal distribution.
8734 nan_policy : {'propagate', 'raise', 'omit'}, optional
8735 Defines how to handle when input contains nan.
8736 The following options are available (default is 'propagate'):
8738 * 'propagate': returns nan
8739 * 'raise': throws an error
8740 * 'omit': performs the calculations ignoring nan values
8742 Returns
8743 -------
8744 statistic : float
8745 The Brunner-Munzel W statistic.
8746 pvalue : float
8747 p-value assuming a t distribution. One-sided or
8748 two-sided, depending on the choice of `alternative` and `distribution`.
8750 See Also
8751 --------
8752 mannwhitneyu : Mann-Whitney rank test on two samples.
8754 Notes
8755 -----
8756 Brunner and Munzel recommended estimating the p-value with the t-distribution
8757 when the sample size is 50 or less. If the size is smaller than 10, it is
8758 better to use the permuted Brunner-Munzel test (see [2]_).
8760 References
8761 ----------
8762 .. [1] Brunner, E. and Munzel, U. "The nonparametric Behrens-Fisher
8763 problem: Asymptotic theory and a small-sample approximation".
8764 Biometrical Journal. Vol. 42(2000): 17-25.
8765 .. [2] Neubert, K. and Brunner, E. "A studentized permutation test for the
8766 non-parametric Behrens-Fisher problem". Computational Statistics and
8767 Data Analysis. Vol. 51(2007): 5192-5204.
8769 Examples
8770 --------
8771 >>> from scipy import stats
8772 >>> x1 = [1,2,1,1,1,1,1,1,1,1,2,4,1,1]
8773 >>> x2 = [3,3,4,3,1,2,3,1,1,5,4]
8774 >>> w, p_value = stats.brunnermunzel(x1, x2)
8775 >>> w
8776 3.1374674823029505
8777 >>> p_value
8778 0.0057862086661515377
8780 """
8781 x = np.asarray(x)
8782 y = np.asarray(y)
8784 # check both x and y
8785 cnx, npx = _contains_nan(x, nan_policy)
8786 cny, npy = _contains_nan(y, nan_policy)
8787 contains_nan = cnx or cny
8788 if npx == "omit" or npy == "omit":
8789 nan_policy = "omit"
8791 if contains_nan and nan_policy == "propagate":
8792 return BrunnerMunzelResult(np.nan, np.nan)
8793 elif contains_nan and nan_policy == "omit":
8794 x = ma.masked_invalid(x)
8795 y = ma.masked_invalid(y)
8796 return mstats_basic.brunnermunzel(x, y, alternative, distribution)
8798 nx = len(x)
8799 ny = len(y)
8800 if nx == 0 or ny == 0:
8801 return BrunnerMunzelResult(np.nan, np.nan)
8802 rankc = rankdata(np.concatenate((x, y)))
8803 rankcx = rankc[0:nx]
8804 rankcy = rankc[nx:nx+ny]
8805 rankcx_mean = np.mean(rankcx)
8806 rankcy_mean = np.mean(rankcy)
8807 rankx = rankdata(x)
8808 ranky = rankdata(y)
8809 rankx_mean = np.mean(rankx)
8810 ranky_mean = np.mean(ranky)
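# Rank-based variance estimates S_x and S_y; the Brunner-Munzel statistic is
#   W = n_x*n_y*(mean pooled rank of y - mean pooled rank of x)
#       / ((n_x + n_y) * sqrt(n_x*S_x + n_y*S_y)),
# with a Satterthwaite-type approximation for the degrees of freedom.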
8812 Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0))
8813 Sx /= nx - 1
8814 Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0))
8815 Sy /= ny - 1
8817 wbfn = nx * ny * (rankcy_mean - rankcx_mean)
8818 wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy)
8820 if distribution == "t":
8821 df_numer = np.power(nx * Sx + ny * Sy, 2.0)
8822 df_denom = np.power(nx * Sx, 2.0) / (nx - 1)
8823 df_denom += np.power(ny * Sy, 2.0) / (ny - 1)
8824 df = df_numer / df_denom
8826 if (df_numer == 0) and (df_denom == 0):
8827 message = ("p-value cannot be estimated with `distribution='t'` "
8828 "because the degrees of freedom parameter is undefined "
8829 "(0/0). Try using `distribution='normal'`.")
8830 warnings.warn(message, RuntimeWarning)
8832 p = distributions.t.cdf(wbfn, df)
8833 elif distribution == "normal":
8834 p = distributions.norm.cdf(wbfn)
8835 else:
8836 raise ValueError(
8837 "distribution should be 't' or 'normal'")
8839 if alternative == "greater":
8840 pass
8841 elif alternative == "less":
8842 p = 1 - p
8843 elif alternative == "two-sided":
8844 p = 2 * np.min([p, 1-p])
8845 else:
8846 raise ValueError(
8847 "alternative should be 'less', 'greater' or 'two-sided'")
8849 return BrunnerMunzelResult(wbfn, p)
8852def combine_pvalues(pvalues, method='fisher', weights=None):
8853 """
8854 Combine p-values from independent tests that bear upon the same hypothesis.
8856 These methods are intended only for combining p-values from hypothesis
8857 tests based upon continuous distributions.
8859 Each method assumes that under the null hypothesis, the p-values are
8860 sampled independently and uniformly from the interval [0, 1]. A test
8861 statistic (different for each method) is computed and a combined
8862 p-value is calculated based upon the distribution of this test statistic
8863 under the null hypothesis.
8865 Parameters
8866 ----------
8867 pvalues : array_like, 1-D
8868 Array of p-values assumed to come from independent tests based on
8869 continuous distributions.
8870 method : {'fisher', 'pearson', 'tippett', 'stouffer', 'mudholkar_george'}
8872 Name of method to use to combine p-values.
8874 The available methods are (see Notes for details):
8876 * 'fisher': Fisher's method (Fisher's combined probability test)
8877 * 'pearson': Pearson's method
8878 * 'mudholkar_george': Mudholkar's and George's method
8879 * 'tippett': Tippett's method
8880 * 'stouffer': Stouffer's Z-score method
8881 weights : array_like, 1-D, optional
8882 Optional array of weights used only for Stouffer's Z-score method.
8884 Returns
8885 -------
8886 res : SignificanceResult
8887 An object containing attributes:
8889 statistic : float
8890 The statistic calculated by the specified method.
8891 pvalue : float
8892 The combined p-value.
8894 Notes
8895 -----
8896 If this function is applied to tests with a discrete statistic such as
8897 any rank test or contingency-table test, it will yield systematically
8898 wrong results, e.g. Fisher's method will systematically overestimate the
8899 p-value [1]_. This problem becomes less severe for large sample sizes
8900 when the discrete distributions become approximately continuous.
8902 The differences between the methods can be best illustrated by their
8903 statistics and what aspects of a combination of p-values they emphasise
8904 when considering significance [2]_. For example, methods emphasising large
8905 p-values are more sensitive to strong false and true negatives; conversely
8906 methods focussing on small p-values are sensitive to positives.
8908 * The statistic of Fisher's method (also known as Fisher's combined
8909 probability test) [3]_ is :math:`-2\\sum_i \\log(p_i)`, which is
8910 equivalent (as a test statistic) to the product of individual p-values:
8911 :math:`\\prod_i p_i`. Under the null hypothesis, this statistic follows
8912 a :math:`\\chi^2` distribution. This method emphasises small p-values.
8913 * Pearson's method uses :math:`-2\\sum_i\\log(1-p_i)`, which is equivalent
8914 to :math:`\\prod_i \\frac{1}{1-p_i}` [2]_.
8915 It thus emphasises large p-values.
8916 * Mudholkar and George compromise between Fisher's and Pearson's method by
8917 averaging their statistics [4]_. Their method emphasises extreme
8918 p-values, both close to 1 and 0.
8919 * Stouffer's method [5]_ uses Z-scores and the statistic:
8920 :math:`\\sum_i \\Phi^{-1} (p_i)`, where :math:`\\Phi` is the CDF of the
8921 standard normal distribution. The advantage of this method is that it is
8922 straightforward to introduce weights, which can make Stouffer's method
8923 more powerful than Fisher's method when the p-values are from studies
8924 of different size [6]_ [7]_.
8925 * Tippett's method uses the smallest p-value as a statistic.
8926 (Mind that this minimum is not the combined p-value.)
8928 Fisher's method may be extended to combine p-values from dependent tests
8929 [8]_. Extensions such as Brown's method and Kost's method are not currently
8930 implemented.
8932 .. versionadded:: 0.15.0
8934 References
8935 ----------
8936 .. [1] Kincaid, W. M., "The Combination of Tests Based on Discrete
8937 Distributions." Journal of the American Statistical Association 57,
8938 no. 297 (1962), 10-19.
8939 .. [2] Heard, N. and Rubin-Delanchey, P. "Choosing between methods of
8940 combining p-values." Biometrika 105.1 (2018): 239-246.
8941 .. [3] https://en.wikipedia.org/wiki/Fisher%27s_method
8942 .. [4] George, E. O., and G. S. Mudholkar. "On the convolution of logistic
8943 random variables." Metrika 30.1 (1983): 1-13.
8944 .. [5] https://en.wikipedia.org/wiki/Fisher%27s_method#Relation_to_Stouffer.27s_Z-score_method
8945 .. [6] Whitlock, M. C. "Combining probability from independent tests: the
8946 weighted Z-method is superior to Fisher's approach." Journal of
8947 Evolutionary Biology 18, no. 5 (2005): 1368-1373.
8948 .. [7] Zaykin, Dmitri V. "Optimally weighted Z-test is a powerful method
8949 for combining probabilities in meta-analysis." Journal of
8950 Evolutionary Biology 24, no. 8 (2011): 1836-1841.
8951 .. [8] https://en.wikipedia.org/wiki/Extensions_of_Fisher%27s_method
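Examples
--------
A minimal usage sketch (the p-values below are arbitrary illustrative numbers,
not outputs of real tests, and the printed result is omitted):

>>> from scipy import stats
>>> stats.combine_pvalues([0.01, 0.2, 0.3], method='fisher')  # doctest: +SKIP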
8953 """
8954 pvalues = np.asarray(pvalues)
8955 if pvalues.ndim != 1:
8956 raise ValueError("pvalues is not 1-D")
8958 if method == 'fisher':
8959 statistic = -2 * np.sum(np.log(pvalues))
8960 pval = distributions.chi2.sf(statistic, 2 * len(pvalues))
8961 elif method == 'pearson':
8962 statistic = 2 * np.sum(np.log1p(-pvalues))
8963 pval = distributions.chi2.cdf(-statistic, 2 * len(pvalues))
8964 elif method == 'mudholkar_george':
8965 normalizing_factor = np.sqrt(3/len(pvalues))/np.pi
8966 statistic = -np.sum(np.log(pvalues)) + np.sum(np.log1p(-pvalues))
8967 nu = 5 * len(pvalues) + 4
8968 approx_factor = np.sqrt(nu / (nu - 2))
8969 pval = distributions.t.sf(statistic * normalizing_factor
8970 * approx_factor, nu)
8971 elif method == 'tippett':
8972 statistic = np.min(pvalues)
8973 pval = distributions.beta.cdf(statistic, 1, len(pvalues))
8974 elif method == 'stouffer':
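# Stouffer's Z: statistic = sum(w_i * Phi^{-1}(1 - p_i)) / ||w||_2, compared
# against the standard normal; norm.isf(p) computes Phi^{-1}(1 - p).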
8975 if weights is None:
8976 weights = np.ones_like(pvalues)
8977 elif len(weights) != len(pvalues):
8978 raise ValueError("pvalues and weights must be of the same size.")
8980 weights = np.asarray(weights)
8981 if weights.ndim != 1:
8982 raise ValueError("weights is not 1-D")
8984 Zi = distributions.norm.isf(pvalues)
8985 statistic = np.dot(weights, Zi) / np.linalg.norm(weights)
8986 pval = distributions.norm.sf(statistic)
8988 else:
8989 raise ValueError(
8990 f"Invalid method {method!r}. Valid methods are 'fisher', "
8991 "'pearson', 'mudholkar_george', 'tippett', and 'stouffer'"
8992 )
8994 return SignificanceResult(statistic, pval)
8997#####################################
8998# STATISTICAL DISTANCES #
8999#####################################
9002def wasserstein_distance(u_values, v_values, u_weights=None, v_weights=None):
9003 r"""
9004 Compute the first Wasserstein distance between two 1D distributions.
9006 This distance is also known as the earth mover's distance, since it can be
9007 seen as the minimum amount of "work" required to transform :math:`u` into
9008 :math:`v`, where "work" is measured as the amount of distribution weight
9009 that must be moved, multiplied by the distance it has to be moved.
9011 .. versionadded:: 1.0.0
9013 Parameters
9014 ----------
9015 u_values, v_values : array_like
9016 Values observed in the (empirical) distribution.
9017 u_weights, v_weights : array_like, optional
9018 Weight for each value. If unspecified, each value is assigned the same
9019 weight.
9020 `u_weights` (resp. `v_weights`) must have the same length as
9021 `u_values` (resp. `v_values`). If the weight sum differs from 1, it
9022 must still be positive and finite so that the weights can be normalized
9023 to sum to 1.
9025 Returns
9026 -------
9027 distance : float
9028 The computed distance between the distributions.
9030 Notes
9031 -----
9032 The first Wasserstein distance between the distributions :math:`u` and
9033 :math:`v` is:
9035 .. math::
9037 l_1 (u, v) = \inf_{\pi \in \Gamma (u, v)} \int_{\mathbb{R} \times
9038 \mathbb{R}} |x-y| \mathrm{d} \pi (x, y)
9040 where :math:`\Gamma (u, v)` is the set of (probability) distributions on
9041 :math:`\mathbb{R} \times \mathbb{R}` whose marginals are :math:`u` and
9042 :math:`v` on the first and second factors respectively.
9044 If :math:`U` and :math:`V` are the respective CDFs of :math:`u` and
9045 :math:`v`, this distance also equals:
9047 .. math::
9049 l_1(u, v) = \int_{-\infty}^{+\infty} |U-V|
9051 See [2]_ for a proof of the equivalence of both definitions.
9053 The input distributions can be empirical, therefore coming from samples
9054 whose values are effectively inputs of the function, or they can be seen as
9055 generalized functions, in which case they are weighted sums of Dirac delta
9056 functions located at the specified values.
9058 References
9059 ----------
9060 .. [1] "Wasserstein metric", https://en.wikipedia.org/wiki/Wasserstein_metric
9061 .. [2] Ramdas, Garcia, Cuturi "On Wasserstein Two Sample Testing and Related
9062 Families of Nonparametric Tests" (2015). :arXiv:`1509.02237`.
9064 Examples
9065 --------
9066 >>> from scipy.stats import wasserstein_distance
9067 >>> wasserstein_distance([0, 1, 3], [5, 6, 8])
9068 5.0
9069 >>> wasserstein_distance([0, 1], [0, 1], [3, 1], [2, 2])
9070 0.25
9071 >>> wasserstein_distance([3.4, 3.9, 7.5, 7.8], [4.5, 1.4],
9072 ... [1.4, 0.9, 3.1, 7.2], [3.2, 3.5])
9073 4.0781331438047861
9075 """
9076 return _cdf_distance(1, u_values, v_values, u_weights, v_weights)
9079def energy_distance(u_values, v_values, u_weights=None, v_weights=None):
9080 r"""Compute the energy distance between two 1D distributions.
9082 .. versionadded:: 1.0.0
9084 Parameters
9085 ----------
9086 u_values, v_values : array_like
9087 Values observed in the (empirical) distribution.
9088 u_weights, v_weights : array_like, optional
9089 Weight for each value. If unspecified, each value is assigned the same
9090 weight.
9091 `u_weights` (resp. `v_weights`) must have the same length as
9092 `u_values` (resp. `v_values`). If the weight sum differs from 1, it
9093 must still be positive and finite so that the weights can be normalized
9094 to sum to 1.
9096 Returns
9097 -------
9098 distance : float
9099 The computed distance between the distributions.
9101 Notes
9102 -----
9103 The energy distance between two distributions :math:`u` and :math:`v`, whose
9104 respective CDFs are :math:`U` and :math:`V`, is given by:
9106 .. math::
9108 D(u, v) = \left( 2\mathbb E|X - Y| - \mathbb E|X - X'| -
9109 \mathbb E|Y - Y'| \right)^{1/2}
9111 where :math:`X` and :math:`X'` (resp. :math:`Y` and :math:`Y'`) are
9112 independent random variables whose probability distribution is :math:`u`
9113 (resp. :math:`v`).
9115 Sometimes the square of this quantity is referred to as the "energy
9116 distance" (e.g. in [2]_, [4]_), but as noted in [1]_ and [3]_, only the
9117 definition above satisfies the axioms of a distance function (metric).
9119 As shown in [2]_, for one-dimensional real-valued variables, the energy
9120 distance is linked to the non-distribution-free version of the Cramér-von
9121 Mises distance:
9123 .. math::
9125 D(u, v) = \sqrt{2} l_2(u, v) = \left( 2 \int_{-\infty}^{+\infty} (U-V)^2
9126 \right)^{1/2}
9128 Note that the common Cramér-von Mises criterion uses the distribution-free
9129 version of the distance. See [2]_ (section 2), for more details about both
9130 versions of the distance.
9132 The input distributions can be empirical, therefore coming from samples
9133 whose values are effectively inputs of the function, or they can be seen as
9134 generalized functions, in which case they are weighted sums of Dirac delta
9135 functions located at the specified values.
9137 References
9138 ----------
9139 .. [1] Rizzo, Szekely "Energy distance." Wiley Interdisciplinary Reviews:
9140 Computational Statistics, 8(1):27-38 (2015).
9141 .. [2] Szekely "E-statistics: The energy of statistical samples." Bowling
9142 Green State University, Department of Mathematics and Statistics,
9143 Technical Report 02-16 (2002).
9144 .. [3] "Energy distance", https://en.wikipedia.org/wiki/Energy_distance
9145 .. [4] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer,
9146 Munos "The Cramer Distance as a Solution to Biased Wasserstein
9147 Gradients" (2017). :arXiv:`1705.10743`.
9149 Examples
9150 --------
9151 >>> from scipy.stats import energy_distance
9152 >>> energy_distance([0], [2])
9153 2.0000000000000004
9154 >>> energy_distance([0, 8], [0, 8], [3, 1], [2, 2])
9155 1.0000000000000002
9156 >>> energy_distance([0.7, 7.4, 2.4, 6.8], [1.4, 8. ],
9157 ... [2.1, 4.2, 7.4, 8. ], [7.6, 8.8])
9158 0.88003340976158217
9160 """
9161 return np.sqrt(2) * _cdf_distance(2, u_values, v_values,
9162 u_weights, v_weights)
9165def _cdf_distance(p, u_values, v_values, u_weights=None, v_weights=None):
9166 r"""
9167 Compute, between two one-dimensional distributions :math:`u` and
9168 :math:`v`, whose respective CDFs are :math:`U` and :math:`V`, the
9169 statistical distance that is defined as:
9171 .. math::
9173 l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p}
9175 p is a positive parameter; p = 1 gives the Wasserstein distance, p = 2
9176 gives the energy distance.
9178 Parameters
9179 ----------
9180 u_values, v_values : array_like
9181 Values observed in the (empirical) distribution.
9182 u_weights, v_weights : array_like, optional
9183 Weight for each value. If unspecified, each value is assigned the same
9184 weight.
9185 `u_weights` (resp. `v_weights`) must have the same length as
9186 `u_values` (resp. `v_values`). If the weight sum differs from 1, it
9187 must still be positive and finite so that the weights can be normalized
9188 to sum to 1.
9190 Returns
9191 -------
9192 distance : float
9193 The computed distance between the distributions.
9195 Notes
9196 -----
9197 The input distributions can be empirical, therefore coming from samples
9198 whose values are effectively inputs of the function, or they can be seen as
9199 generalized functions, in which case they are weighted sums of Dirac delta
9200 functions located at the specified values.
9202 References
9203 ----------
9204 .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer,
9205 Munos "The Cramer Distance as a Solution to Biased Wasserstein
9206 Gradients" (2017). :arXiv:`1705.10743`.
9208 """
9209 u_values, u_weights = _validate_distribution(u_values, u_weights)
9210 v_values, v_weights = _validate_distribution(v_values, v_weights)
9212 u_sorter = np.argsort(u_values)
9213 v_sorter = np.argsort(v_values)
9215 all_values = np.concatenate((u_values, v_values))
9216 all_values.sort(kind='mergesort')
9218 # Compute the differences between pairs of successive values of u and v.
9219 deltas = np.diff(all_values)
9221 # Get the respective positions of the values of u and v among the values of
9222 # both distributions.
9223 u_cdf_indices = u_values[u_sorter].searchsorted(all_values[:-1], 'right')
9224 v_cdf_indices = v_values[v_sorter].searchsorted(all_values[:-1], 'right')
9226 # Calculate the CDFs of u and v using their weights, if specified.
9227 if u_weights is None:
9228 u_cdf = u_cdf_indices / u_values.size
9229 else:
9230 u_sorted_cumweights = np.concatenate(([0],
9231 np.cumsum(u_weights[u_sorter])))
9232 u_cdf = u_sorted_cumweights[u_cdf_indices] / u_sorted_cumweights[-1]
9234 if v_weights is None:
9235 v_cdf = v_cdf_indices / v_values.size
9236 else:
9237 v_sorted_cumweights = np.concatenate(([0],
9238 np.cumsum(v_weights[v_sorter])))
9239 v_cdf = v_sorted_cumweights[v_cdf_indices] / v_sorted_cumweights[-1]
9241 # Compute the value of the integral based on the CDFs.
9242 # If p = 1 or p = 2, we avoid using np.power, which introduces an overhead
9243 # of about 15%.
9244 if p == 1:
9245 return np.sum(np.multiply(np.abs(u_cdf - v_cdf), deltas))
9246 if p == 2:
9247 return np.sqrt(np.sum(np.multiply(np.square(u_cdf - v_cdf), deltas)))
9248 return np.power(np.sum(np.multiply(np.power(np.abs(u_cdf - v_cdf), p),
9249 deltas)), 1/p)
9252def _validate_distribution(values, weights):
9253 """
9254 Validate the values and weights from a distribution input of `cdf_distance`
9255 and return them as ndarray objects.
9257 Parameters
9258 ----------
9259 values : array_like
9260 Values observed in the (empirical) distribution.
9261 weights : array_like
9262 Weight for each value.
9264 Returns
9265 -------
9266 values : ndarray
9267 Values as ndarray.
9268 weights : ndarray
9269 Weights as ndarray.
9271 """
9272 # Validate the value array.
9273 values = np.asarray(values, dtype=float)
9274 if len(values) == 0:
9275 raise ValueError("Distribution can't be empty.")
9277 # Validate the weight array, if specified.
9278 if weights is not None:
9279 weights = np.asarray(weights, dtype=float)
9280 if len(weights) != len(values):
9281 raise ValueError('Value and weight array-likes for the same '
9282 'empirical distribution must be of the same size.')
9283 if np.any(weights < 0):
9284 raise ValueError('All weights must be non-negative.')
9285 if not 0 < np.sum(weights) < np.inf:
9286 raise ValueError('Weight array-like sum must be positive and '
9287 'finite. Set as None for an equal distribution of '
9288 'weight.')
9290 return values, weights
9292 return values, None
9295#####################################
9296# SUPPORT FUNCTIONS #
9297#####################################
9299RepeatedResults = namedtuple('RepeatedResults', ('values', 'counts'))
9302def find_repeats(arr):
9303 """Find repeats and repeat counts.
9305 Parameters
9306 ----------
9307 arr : array_like
9308 Input array. This is cast to float64.
9310 Returns
9311 -------
9312 values : ndarray
9313 The unique values from the (flattened) input that are repeated.
9315 counts : ndarray
9316 Number of times the corresponding 'value' is repeated.
9318 Notes
9319 -----
9320 In numpy >= 1.9 `numpy.unique` provides similar functionality. The main
9321 difference is that `find_repeats` only returns repeated values.
9323 Examples
9324 --------
9325 >>> from scipy import stats
9326 >>> stats.find_repeats([2, 1, 2, 3, 2, 2, 5])
9327 RepeatedResults(values=array([2.]), counts=array([4]))
9329 >>> stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
9330 RepeatedResults(values=array([4., 5.]), counts=array([2, 2]))
9332 """
9333 # Note: always copies.
9334 return RepeatedResults(*_find_repeats(np.array(arr, dtype=np.float64)))
9337def _sum_of_squares(a, axis=0):
9338 """Square each element of the input array, and return the sum(s) of that.
9340 Parameters
9341 ----------
9342 a : array_like
9343 Input array.
9344 axis : int or None, optional
9345 Axis along which to calculate. Default is 0. If None, compute over
9346 the whole array `a`.
9348 Returns
9349 -------
9350 sum_of_squares : ndarray
9351 The sum along the given axis for (a**2).
9353 See Also
9354 --------
9355 _square_of_sums : The square(s) of the sum(s) (the opposite of
9356 `_sum_of_squares`).
9358 """
9359 a, axis = _chk_asarray(a, axis)
9360 return np.sum(a*a, axis)
9363def _square_of_sums(a, axis=0):
9364 """Sum elements of the input array, and return the square(s) of that sum.
9366 Parameters
9367 ----------
9368 a : array_like
9369 Input array.
9370 axis : int or None, optional
9371 Axis along which to calculate. Default is 0. If None, compute over
9372 the whole array `a`.
9374 Returns
9375 -------
9376 square_of_sums : float or ndarray
9377 The square of the sum over `axis`.
9379 See Also
9380 --------
9381 _sum_of_squares : The sum of squares (the opposite of `_square_of_sums`).
9383 """
9384 a, axis = _chk_asarray(a, axis)
9385 s = np.sum(a, axis)
9386 if not np.isscalar(s):
9387 return s.astype(float) * s
9388 else:
9389 return float(s) * s
9392def rankdata(a, method='average', *, axis=None, nan_policy='propagate'):
9393 """Assign ranks to data, dealing with ties appropriately.
9395 By default (``axis=None``), the data array is first flattened, and a flat
9396 array of ranks is returned. Separately reshape the rank array to the
9397 shape of the data array if desired (see Examples).
9399 Ranks begin at 1. The `method` argument controls how ranks are assigned
9400 to equal values. See [1]_ for further discussion of ranking methods.
9402 Parameters
9403 ----------
9404 a : array_like
9405 The array of values to be ranked.
9406 method : {'average', 'min', 'max', 'dense', 'ordinal'}, optional
9407 The method used to assign ranks to tied elements.
9408 The following methods are available (default is 'average'):
9410 * 'average': The average of the ranks that would have been assigned to
9411 all the tied values is assigned to each value.
9412 * 'min': The minimum of the ranks that would have been assigned to all
9413 the tied values is assigned to each value. (This is also
9414 referred to as "competition" ranking.)
9415 * 'max': The maximum of the ranks that would have been assigned to all
9416 the tied values is assigned to each value.
9417 * 'dense': Like 'min', but the rank of the next highest element is
9418 assigned the rank immediately after those assigned to the tied
9419 elements.
9420 * 'ordinal': All values are given a distinct rank, corresponding to
9421 the order that the values occur in `a`.
9422 axis : {None, int}, optional
9423 Axis along which to perform the ranking. If ``None``, the data array
9424 is first flattened.
9425 nan_policy : {'propagate', 'omit', 'raise'}, optional
9426 Defines how to handle when input contains nan.
9427 The following options are available (default is 'propagate'):
9429 * 'propagate': propagates nans through the rank calculation
9430 * 'omit': performs the calculations ignoring nan values
9431 * 'raise': raises an error
9433 .. note::
9435 When `nan_policy` is 'propagate', the output is an array of *all*
9436 nans because ranks relative to nans in the input are undefined.
9437 When `nan_policy` is 'omit', nans in `a` are ignored when ranking
9438 the other values, and the corresponding locations of the output
9439 are nan.
9441 .. versionadded:: 1.10
9443 Returns
9444 -------
9445 ranks : ndarray
9446 An array of size equal to the size of `a`, containing rank
9447 scores.
9449 References
9450 ----------
9451 .. [1] "Ranking", https://en.wikipedia.org/wiki/Ranking
9453 Examples
9454 --------
9455 >>> import numpy as np
9456 >>> from scipy.stats import rankdata
9457 >>> rankdata([0, 2, 3, 2])
9458 array([ 1. , 2.5, 4. , 2.5])
9459 >>> rankdata([0, 2, 3, 2], method='min')
9460 array([ 1, 2, 4, 2])
9461 >>> rankdata([0, 2, 3, 2], method='max')
9462 array([ 1, 3, 4, 3])
9463 >>> rankdata([0, 2, 3, 2], method='dense')
9464 array([ 1, 2, 3, 2])
9465 >>> rankdata([0, 2, 3, 2], method='ordinal')
9466 array([ 1, 2, 4, 3])
9467 >>> rankdata([[0, 2], [3, 2]]).reshape(2,2)
9468 array([[1. , 2.5],
9469 [4. , 2.5]])
9470 >>> rankdata([[0, 2, 2], [3, 2, 5]], axis=1)
9471 array([[1. , 2.5, 2.5],
9472 [2. , 1. , 3. ]])
9473 >>> rankdata([0, 2, 3, np.nan, -2, np.nan], nan_policy="propagate")
9474 array([nan, nan, nan, nan, nan, nan])
9475 >>> rankdata([0, 2, 3, np.nan, -2, np.nan], nan_policy="omit")
9476 array([ 2., 3., 4., nan, 1., nan])
9478 """
9479 if method not in ('average', 'min', 'max', 'dense', 'ordinal'):
9480 raise ValueError('unknown method "{0}"'.format(method))
9482 a = np.asarray(a)
9484 if axis is not None:
9485 if a.size == 0:
9486 # The return values of `normalize_axis_index` are ignored. The
9487 # call validates `axis`, even though we won't use it.
9488 # use scipy._lib._util._normalize_axis_index when available
9489 np.core.multiarray.normalize_axis_index(axis, a.ndim)
9490 dt = np.float64 if method == 'average' else np.int_
9491 return np.empty(a.shape, dtype=dt)
9492 return np.apply_along_axis(rankdata, axis, a, method,
9493 nan_policy=nan_policy)
9495 arr = np.ravel(a)
9496 contains_nan, nan_policy = _contains_nan(arr, nan_policy)
9497 nan_indexes = None
9498 if contains_nan:
9499 if nan_policy == 'omit':
9500 nan_indexes = np.isnan(arr)
9501 if nan_policy == 'propagate':
9502 return np.full_like(arr, np.nan)
9504 algo = 'mergesort' if method == 'ordinal' else 'quicksort'
9505 sorter = np.argsort(arr, kind=algo)
9507 inv = np.empty(sorter.size, dtype=np.intp)
9508 inv[sorter] = np.arange(sorter.size, dtype=np.intp)
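# `inv` is the (0-based) position of each element of `arr` in sorted order.
# Below, `obs` marks the first element of each run of tied values in the
# sorted array, `dense` assigns consecutive ranks to tie groups, and `count`
# holds cumulative tie-group boundaries used for 'min', 'max' and 'average'.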
9510 if method == 'ordinal':
9511 result = inv + 1
9513 arr = arr[sorter]
9514 obs = np.r_[True, arr[1:] != arr[:-1]]
9515 dense = obs.cumsum()[inv]
9517 if method == 'dense':
9518 result = dense
9520 # cumulative counts of each unique value
9521 count = np.r_[np.nonzero(obs)[0], len(obs)]
9523 if method == 'max':
9524 result = count[dense]
9526 if method == 'min':
9527 result = count[dense - 1] + 1
9529 if method == 'average':
9530 result = .5 * (count[dense] + count[dense - 1] + 1)
9532 if nan_indexes is not None:
9533 result = result.astype('float64')
9534 result[nan_indexes] = np.nan
9536 return result
9539def expectile(a, alpha=0.5, *, weights=None):
9540 r"""Compute the expectile at the specified level.
9542 Expectiles are a generalization of the expectation in the same way as
9543 quantiles are a generalization of the median. The expectile at level
9544 `alpha = 0.5` is the mean (average). See Notes for more details.
9546 Parameters
9547 ----------
9548 a : array_like
9549 Array containing numbers whose expectile is desired.
9550 alpha : float, default: 0.5
9551 The level of the expectile; e.g., `alpha=0.5` gives the mean.
9552 weights : array_like, optional
9553 An array of weights associated with the values in `a`.
9554 The `weights` must be broadcastable to the same shape as `a`.
9555 Default is None, which gives each value a weight of 1.0.
9556 An integer valued weight element acts like repeating the corresponding
9557 observation in `a` that many times. See Notes for more details.
9559 Returns
9560 -------
9561 expectile : ndarray
9562 The empirical expectile at level `alpha`.
9564 See Also
9565 --------
9566 numpy.mean : Arithmetic average
9567 numpy.quantile : Quantile
9569 Notes
9570 -----
9571 In general, the expectile at level :math:`\alpha` of a random variable
9572 :math:`X` with cumulative distribution function (CDF) :math:`F` is given
9573 by the unique solution :math:`t` of:
9575 .. math::
9577 \alpha E((X - t)_+) = (1 - \alpha) E((t - X)_+) \,.
9579 Here, :math:`(x)_+ = \max(0, x)` is the positive part of :math:`x`.
9580 This equation can be equivalently written as:
9582 .. math::
9584 \alpha \int_t^\infty (x - t)\mathrm{d}F(x)
9585 = (1 - \alpha) \int_{-\infty}^t (t - x)\mathrm{d}F(x) \,.
9587 The empirical expectile at level :math:`\alpha` (`alpha`) of a sample
9588 :math:`a_i` (the array `a`) is defined by plugging in the empirical CDF of
9589 `a`. Given sample or case weights :math:`w` (the array `weights`), it
9590 reads :math:`F_a(x) = \frac{1}{\sum_i w_i} \sum_i w_i 1_{a_i \leq x}`
9591 with indicator function :math:`1_{A}`. This leads to the definition of the
9592 empirical expectile at level `alpha` as the unique solution :math:`t` of:
9594 .. math::
9596 \alpha \sum_{i=1}^n w_i (a_i - t)_+ =
9597 (1 - \alpha) \sum_{i=1}^n w_i (t - a_i)_+ \,.
9599 For :math:`\alpha=0.5`, this simplifies to the weighted average.
9600 Furthermore, the larger :math:`\alpha`, the larger the value of the
9601 expectile.
9603 As a final remark, the expectile at level :math:`\alpha` can also be
9604 written as a minimization problem. One often used choice is
9606 .. math::
9608 \operatorname{argmin}_t
9609 E(\lvert 1_{t\geq X} - \alpha\rvert(t - X)^2) \,.
9611 References
9612 ----------
9613 .. [1] W. K. Newey and J. L. Powell (1987), "Asymmetric Least Squares
9614 Estimation and Testing," Econometrica, 55, 819-847.
9615 .. [2] T. Gneiting (2009). "Making and Evaluating Point Forecasts,"
9616 Journal of the American Statistical Association, 106, 746 - 762.
9617 :doi:`10.48550/arXiv.0912.0902`
9619 Examples
9620 --------
9621 >>> import numpy as np
9622 >>> from scipy.stats import expectile
9623 >>> a = [1, 4, 2, -1]
9624 >>> expectile(a, alpha=0.5) == np.mean(a)
9625 True
9626 >>> expectile(a, alpha=0.2)
9627 0.42857142857142855
9628 >>> expectile(a, alpha=0.8)
9629 2.5714285714285716
9630 >>> weights = [1, 3, 1, 1]
>>> expectile(a, weights=weights) == np.average(a, weights=weights)
True
9632 """
9633 if alpha < 0 or alpha > 1:
9634 raise ValueError(
9635 "The expectile level alpha must be in the range [0, 1]."
9636 )
9637 a = np.asarray(a)
9639 if weights is not None:
9640 weights = np.broadcast_to(weights, a.shape)
9642 # This is the empirical equivalent of Eq. (13) with identification
9643 # function from Table 9 (omitting a factor of 2) in [2] (their y is our
9644 # data a, their x is our t)
9645 def first_order(t):
9646 return np.average(np.abs((a <= t) - alpha) * (t - a), weights=weights)
9648 if alpha >= 0.5:
9649 x0 = np.average(a, weights=weights)
9650 x1 = np.amax(a)
9651 else:
9652 x1 = np.average(a, weights=weights)
9653 x0 = np.amin(a)
9655 if x0 == x1:
9656 # a has a single unique element
9657 return x0
9659 # Note that the expectile is the unique solution, so no worries about
9660 # finding a wrong root.
9661 res = root_scalar(first_order, x0=x0, x1=x1)
9662 return res.root