Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_stats_py.py: 10%
2034 statements
1# Copyright 2002 Gary Strangman. All rights reserved
2# Copyright 2002-2016 The SciPy Developers
3#
4# The original code from Gary Strangman was heavily adapted for
5# use in SciPy by Travis Oliphant. The original code came with the
6# following disclaimer:
7#
8# This software is provided "as-is". There are no expressed or implied
9# warranties of any kind, including, but not limited to, the warranties
10# of merchantability and fitness for a given application. In no event
11# shall Gary Strangman be liable for any direct, indirect, incidental,
12# special, exemplary or consequential damages (including, but not limited
13# to, loss of use, data or profits, or business interruption) however
14# caused and on any theory of liability, whether in contract, strict
15# liability or tort (including negligence or otherwise) arising in any way
16# out of the use of this software, even if advised of the possibility of
17# such damage.
19"""
20A collection of basic statistical functions for Python.
22References
23----------
24.. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
25 Probability and Statistics Tables and Formulae. Chapman & Hall: New
26 York. 2000.
28"""
29import warnings
30import math
31from math import gcd
32from collections import namedtuple, Counter
34import numpy as np
35from numpy import array, asarray, ma
36from numpy.lib import NumpyVersion
37from numpy.testing import suppress_warnings
39from scipy.spatial.distance import cdist
40from scipy.ndimage import _measurements
41from scipy._lib._util import (check_random_state, MapWrapper,
42 rng_integers, _rename_parameter, _contains_nan)
44import scipy.special as special
45from scipy import linalg
46from . import distributions
47from . import _mstats_basic as mstats_basic
48from ._stats_mstats_common import (_find_repeats, linregress, theilslopes,
49 siegelslopes)
50from ._stats import (_kendall_dis, _toint64, _weightedrankedtau,
51 _local_correlations)
52from dataclasses import make_dataclass
53from ._hypotests import _all_partitions
54from ._stats_pythran import _compute_outer_prob_inside_method
55from ._resampling import _batch_generator
56from ._axis_nan_policy import (_axis_nan_policy_factory,
57 _broadcast_concatenate)
58from ._binomtest import _binary_search_for_binom_tst as _binary_search
59from scipy._lib._bunch import _make_tuple_bunch
60from scipy import stats
61from scipy.optimize import root_scalar
64# Functions/classes in other files should be added in `__init__.py`, not here
65__all__ = ['find_repeats', 'gmean', 'hmean', 'pmean', 'mode', 'tmean', 'tvar',
66 'tmin', 'tmax', 'tstd', 'tsem', 'moment',
67 'skew', 'kurtosis', 'describe', 'skewtest', 'kurtosistest',
68 'normaltest', 'jarque_bera',
69 'scoreatpercentile', 'percentileofscore',
70 'cumfreq', 'relfreq', 'obrientransform',
71 'sem', 'zmap', 'zscore', 'gzscore', 'iqr', 'gstd',
72 'median_abs_deviation',
73 'sigmaclip', 'trimboth', 'trim1', 'trim_mean',
74 'f_oneway', 'pearsonr', 'fisher_exact',
75 'spearmanr', 'pointbiserialr',
76 'kendalltau', 'weightedtau', 'multiscale_graphcorr',
77 'linregress', 'siegelslopes', 'theilslopes', 'ttest_1samp',
78 'ttest_ind', 'ttest_ind_from_stats', 'ttest_rel',
79 'kstest', 'ks_1samp', 'ks_2samp',
80 'chisquare', 'power_divergence',
81 'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare',
82 'rankdata',
83 'combine_pvalues', 'wasserstein_distance', 'energy_distance',
84 'brunnermunzel', 'alexandergovern',
85 'expectile', ]
88def _chk_asarray(a, axis):
89 if axis is None:
90 a = np.ravel(a)
91 outaxis = 0
92 else:
93 a = np.asarray(a)
94 outaxis = axis
96 if a.ndim == 0:
97 a = np.atleast_1d(a)
99 return a, outaxis
102def _chk2_asarray(a, b, axis):
103 if axis is None:
104 a = np.ravel(a)
105 b = np.ravel(b)
106 outaxis = 0
107 else:
108 a = np.asarray(a)
109 b = np.asarray(b)
110 outaxis = axis
112 if a.ndim == 0:
113 a = np.atleast_1d(a)
114 if b.ndim == 0:
115 b = np.atleast_1d(b)
117 return a, b, outaxis
120def _shape_with_dropped_axis(a, axis):
121 """
122 Given an array `a` and an integer `axis`, return the shape
123 of `a` with the `axis` dimension removed.
125 Examples
126 --------
127 >>> a = np.zeros((3, 5, 2))
128 >>> _shape_with_dropped_axis(a, 1)
129 (3, 2)
131 """
132 shp = list(a.shape)
133 try:
134 del shp[axis]
135 except IndexError:
136 raise np.AxisError(axis, a.ndim) from None
137 return tuple(shp)
140def _broadcast_shapes(shape1, shape2):
141 """
142 Given two shapes (i.e. tuples of integers), return the shape
143 that would result from broadcasting two arrays with the given
144 shapes.
146 Examples
147 --------
148 >>> _broadcast_shapes((2, 1), (4, 1, 3))
149 (4, 2, 3)
150 """
151 d = len(shape1) - len(shape2)
152 if d <= 0:
153 shp1 = (1,)*(-d) + shape1
154 shp2 = shape2
155 else:
156 shp1 = shape1
157 shp2 = (1,)*d + shape2
158 shape = []
159 for n1, n2 in zip(shp1, shp2):
160 if n1 == 1:
161 n = n2
162 elif n2 == 1 or n1 == n2:
163 n = n1
164 else:
165 raise ValueError(f'shapes {shape1} and {shape2} could not be '
166 'broadcast together')
167 shape.append(n)
168 return tuple(shape)
171def _broadcast_shapes_with_dropped_axis(a, b, axis):
172 """
173 Given two arrays `a` and `b` and an integer `axis`, find the
174 shape of the broadcast result after dropping `axis` from the
175 shapes of `a` and `b`.
177 Examples
178 --------
179 >>> a = np.zeros((5, 2, 1))
180 >>> b = np.zeros((1, 9, 3))
181 >>> _broadcast_shapes_with_dropped_axis(a, b, 1)
182 (5, 3)
183 """
184 shp1 = _shape_with_dropped_axis(a, axis)
185 shp2 = _shape_with_dropped_axis(b, axis)
186 try:
187 shp = _broadcast_shapes(shp1, shp2)
188 except ValueError:
189 raise ValueError(f'non-axis shapes {shp1} and {shp2} could not be '
190 'broadcast together') from None
191 return shp
194SignificanceResult = _make_tuple_bunch('SignificanceResult',
195 ['statistic', 'pvalue'], [])
198# note that `weights` are paired with `x`
199@_axis_nan_policy_factory(
200 lambda x: x, n_samples=1, n_outputs=1, too_small=0, paired=True,
201 result_to_tuple=lambda x: (x,), kwd_samples=['weights'])
202def gmean(a, axis=0, dtype=None, weights=None):
203 r"""Compute the weighted geometric mean along the specified axis.
205 The weighted geometric mean of the array :math:`a_i` associated to weights
206 :math:`w_i` is:
208 .. math::
210 \exp \left( \frac{ \sum_{i=1}^n w_i \ln a_i }{ \sum_{i=1}^n w_i }
211 \right) \, ,
213 and, with equal weights, it gives:
215 .. math::
217 \sqrt[n]{ \prod_{i=1}^n a_i } \, .
219 Parameters
220 ----------
221 a : array_like
222 Input array or object that can be converted to an array.
223 axis : int or None, optional
224 Axis along which the geometric mean is computed. Default is 0.
225 If None, compute over the whole array `a`.
226 dtype : dtype, optional
227 Type to which the input arrays are cast before the calculation is
228 performed.
229 weights : array_like, optional
230 The `weights` array must be broadcastable to the same shape as `a`.
231 Default is None, which gives each value a weight of 1.0.
233 Returns
234 -------
235 gmean : ndarray
236 See `dtype` parameter above.
238 See Also
239 --------
240 numpy.mean : Arithmetic average
241 numpy.average : Weighted average
242 hmean : Harmonic mean
244 References
245 ----------
246 .. [1] "Weighted Geometric Mean", *Wikipedia*,
247 https://en.wikipedia.org/wiki/Weighted_geometric_mean.
249 Examples
250 --------
251 >>> from scipy.stats import gmean
252 >>> gmean([1, 4])
253 2.0
254 >>> gmean([1, 2, 3, 4, 5, 6, 7])
255 3.3800151591412964
256 >>> gmean([1, 4, 7], weights=[3, 1, 3])
257 2.80668351922014
259 """
261 a = np.asarray(a, dtype=dtype)
263 if weights is not None:
264 weights = np.asarray(weights, dtype=dtype)
266 with np.errstate(divide='ignore'):
267 log_a = np.log(a)
269 return np.exp(np.average(log_a, axis=axis, weights=weights))
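# --- Editorial sketch (not part of the original module) ----------------------
# The docstring formula above is exactly what `gmean` computes: the exponential
# of the (weighted) average of logarithms. A minimal check assuming only NumPy
# and the `gmean` defined above; `_x` and `_w` are throwaway illustrative names.
_x, _w = np.array([1.0, 4.0, 7.0]), np.array([3.0, 1.0, 3.0])
assert np.isclose(gmean(_x, weights=_w),
                  np.exp(np.average(np.log(_x), weights=_w)))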
272@_axis_nan_policy_factory(
273 lambda x: x, n_samples=1, n_outputs=1, too_small=0, paired=True,
274 result_to_tuple=lambda x: (x,), kwd_samples=['weights'])
275def hmean(a, axis=0, dtype=None, *, weights=None):
276 r"""Calculate the weighted harmonic mean along the specified axis.
278 The weighted harmonic mean of the array :math:`a_i` associated to weights
279 :math:`w_i` is:
281 .. math::
283 \frac{ \sum_{i=1}^n w_i }{ \sum_{i=1}^n \frac{w_i}{a_i} } \, ,
285 and, with equal weights, it gives:
287 .. math::
289 \frac{ n }{ \sum_{i=1}^n \frac{1}{a_i} } \, .
291 Parameters
292 ----------
293 a : array_like
294 Input array, masked array or object that can be converted to an array.
295 axis : int or None, optional
296 Axis along which the harmonic mean is computed. Default is 0.
297 If None, compute over the whole array `a`.
298 dtype : dtype, optional
299 Type of the returned array and of the accumulator in which the
300 elements are summed. If `dtype` is not specified, it defaults to the
301 dtype of `a`, unless `a` has an integer `dtype` with a precision less
302 than that of the default platform integer. In that case, the default
303 platform integer is used.
304 weights : array_like, optional
305 The weights array can either be 1-D (in which case its length must be
306 the size of `a` along the given `axis`) or of the same shape as `a`.
307 Default is None, which gives each value a weight of 1.0.
309 .. versionadded:: 1.9
311 Returns
312 -------
313 hmean : ndarray
314 See `dtype` parameter above.
316 See Also
317 --------
318 numpy.mean : Arithmetic average
319 numpy.average : Weighted average
320 gmean : Geometric mean
322 Notes
323 -----
324 The harmonic mean is computed over a single dimension of the input
325 array, axis=0 by default, or all values in the array if axis=None.
326 float64 intermediate and return values are used for integer inputs.
328 References
329 ----------
330 .. [1] "Weighted Harmonic Mean", *Wikipedia*,
331 https://en.wikipedia.org/wiki/Harmonic_mean#Weighted_harmonic_mean
332 .. [2] Ferger, F., "The nature and use of the harmonic mean", Journal of
333 the American Statistical Association, vol. 26, pp. 36-40, 1931
335 Examples
336 --------
337 >>> from scipy.stats import hmean
338 >>> hmean([1, 4])
339 1.6000000000000001
340 >>> hmean([1, 2, 3, 4, 5, 6, 7])
341 2.6997245179063363
342 >>> hmean([1, 4, 7], weights=[3, 1, 3])
343 1.9029126213592233
345 """
346 if not isinstance(a, np.ndarray):
347 a = np.array(a, dtype=dtype)
348 elif dtype:
349 # Must change the default dtype allowing array type
350 if isinstance(a, np.ma.MaskedArray):
351 a = np.ma.asarray(a, dtype=dtype)
352 else:
353 a = np.asarray(a, dtype=dtype)
355 if np.all(a >= 0):
356 # Harmonic mean only defined if greater than or equal to zero.
357 if weights is not None:
358 weights = np.asanyarray(weights, dtype=dtype)
360 with np.errstate(divide='ignore'):
361 return 1.0 / np.average(1.0 / a, axis=axis, weights=weights)
362 else:
363 raise ValueError("Harmonic mean only defined if all elements greater "
364 "than or equal to zero")
367@_axis_nan_policy_factory(
368 lambda x: x, n_samples=1, n_outputs=1, too_small=0, paired=True,
369 result_to_tuple=lambda x: (x,), kwd_samples=['weights'])
370def pmean(a, p, *, axis=0, dtype=None, weights=None):
371 r"""Calculate the weighted power mean along the specified axis.
373 The weighted power mean of the array :math:`a_i` associated to weights
374 :math:`w_i` is:
376 .. math::
378 \left( \frac{ \sum_{i=1}^n w_i a_i^p }{ \sum_{i=1}^n w_i }
379 \right)^{ 1 / p } \, ,
381 and, with equal weights, it gives:
383 .. math::
385 \left( \frac{ 1 }{ n } \sum_{i=1}^n a_i^p \right)^{ 1 / p } \, .
387 This mean is also called generalized mean or Hölder mean, and must not be
388 confused with the Kolmogorov generalized mean, also called
389 quasi-arithmetic mean or generalized f-mean [3]_.
391 Parameters
392 ----------
393 a : array_like
394 Input array, masked array or object that can be converted to an array.
395 p : int or float
396 Exponent.
397 axis : int or None, optional
398 Axis along which the power mean is computed. Default is 0.
399 If None, compute over the whole array `a`.
400 dtype : dtype, optional
401 Type of the returned array and of the accumulator in which the
402 elements are summed. If `dtype` is not specified, it defaults to the
403 dtype of `a`, unless `a` has an integer `dtype` with a precision less
404 than that of the default platform integer. In that case, the default
405 platform integer is used.
406 weights : array_like, optional
407 The weights array can either be 1-D (in which case its length must be
408 the size of `a` along the given `axis`) or of the same shape as `a`.
409 Default is None, which gives each value a weight of 1.0.
411 Returns
412 -------
413 pmean : ndarray, see `dtype` parameter above.
414 Output array containing the power mean values.
416 See Also
417 --------
418 numpy.average : Weighted average
419 gmean : Geometric mean
420 hmean : Harmonic mean
422 Notes
423 -----
424 The power mean is computed over a single dimension of the input
425 array, ``axis=0`` by default, or all values in the array if ``axis=None``.
426 float64 intermediate and return values are used for integer inputs.
428 .. versionadded:: 1.9
430 References
431 ----------
432 .. [1] "Generalized Mean", *Wikipedia*,
433 https://en.wikipedia.org/wiki/Generalized_mean
434 .. [2] Norris, N., "Convexity properties of generalized mean value
435 functions", The Annals of Mathematical Statistics, vol. 8,
436 pp. 118-120, 1937
437 .. [3] Bullen, P.S., Handbook of Means and Their Inequalities, 2003
439 Examples
440 --------
441 >>> from scipy.stats import pmean, hmean, gmean
442 >>> pmean([1, 4], 1.3)
443 2.639372938300652
444 >>> pmean([1, 2, 3, 4, 5, 6, 7], 1.3)
445 4.157111214492084
446 >>> pmean([1, 4, 7], -2, weights=[3, 1, 3])
447 1.4969684896631954
449 For p=-1, power mean is equal to harmonic mean:
451 >>> pmean([1, 4, 7], -1, weights=[3, 1, 3])
452 1.9029126213592233
453 >>> hmean([1, 4, 7], weights=[3, 1, 3])
454 1.9029126213592233
456 For p=0, power mean is defined as the geometric mean:
458 >>> pmean([1, 4, 7], 0, weights=[3, 1, 3])
459 2.80668351922014
460 >>> gmean([1, 4, 7], weights=[3, 1, 3])
461 2.80668351922014
463 """
464 if not isinstance(p, (int, float)):
465 raise ValueError("Power mean only defined for exponent of type int or "
466 "float.")
467 if p == 0:
468 return gmean(a, axis=axis, dtype=dtype, weights=weights)
470 if not isinstance(a, np.ndarray):
471 a = np.array(a, dtype=dtype)
472 elif dtype:
473 # Must change the default dtype allowing array type
474 if isinstance(a, np.ma.MaskedArray):
475 a = np.ma.asarray(a, dtype=dtype)
476 else:
477 a = np.asarray(a, dtype=dtype)
479 if np.all(a >= 0):
480 # Power mean only defined if greater than or equal to zero
481 if weights is not None:
482 weights = np.asanyarray(weights, dtype=dtype)
484 with np.errstate(divide='ignore'):
485 return np.float_power(
486 np.average(np.float_power(a, p), axis=axis, weights=weights),
487 1/p)
488 else:
489 raise ValueError("Power mean only defined if all elements greater "
490 "than or equal to zero")
493ModeResult = namedtuple('ModeResult', ('mode', 'count'))
496def mode(a, axis=0, nan_policy='propagate', keepdims=None):
497 r"""Return an array of the modal (most common) value in the passed array.
499 If there is more than one such value, only one is returned.
500 The bin-count for the modal bins is also returned.
502 Parameters
503 ----------
504 a : array_like
505 n-dimensional array of which to find mode(s).
506 axis : int or None, optional
507 Axis along which to operate. Default is 0. If None, compute over
508 the whole array `a`.
509 nan_policy : {'propagate', 'raise', 'omit'}, optional
510 Defines how to handle when input contains nan.
511 The following options are available (default is 'propagate'):
513 * 'propagate': treats nan as it would treat any other value
514 * 'raise': throws an error
515 * 'omit': performs the calculations ignoring nan values
516 keepdims : bool, optional
517 If set to ``False``, the `axis` over which the statistic is taken
518 is consumed (eliminated from the output array) like other reduction
519 functions (e.g. `skew`, `kurtosis`). If set to ``True``, the `axis` is
520 retained with size one, and the result will broadcast correctly
521 against the input array. The default, ``None``, is undefined legacy
522 behavior retained for backward compatibility.
524 .. warning::
525 Unlike other reduction functions (e.g. `skew`, `kurtosis`), the
526 default behavior of `mode` usually retains the axis it acts
527 along. In SciPy 1.11.0, this behavior will change: the default
528 value of `keepdims` will become ``False``, the `axis` over which
529 the statistic is taken will be eliminated, and the value ``None``
530 will no longer be accepted.
531 .. versionadded:: 1.9.0
533 Returns
534 -------
535 mode : ndarray
536 Array of modal values.
537 count : ndarray
538 Array of counts for each mode.
540 Notes
541 -----
542 The mode of object arrays is calculated using `collections.Counter`, which
543 treats NaNs with different binary representations as distinct.
545 .. deprecated:: 1.9.0
546 Support for non-numeric arrays has been deprecated as of SciPy 1.9.0
547 and will be removed in 1.11.0. `pandas.DataFrame.mode`_ can
548 be used instead.
550 .. _pandas.DataFrame.mode: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.mode.html
552 The mode of arrays with other dtypes is calculated using `numpy.unique`.
553 In NumPy versions 1.21 and after, all NaNs - even those with different
554 binary representations - are treated as equivalent and counted as separate
555 instances of the same value.
557 Examples
558 --------
559 >>> import numpy as np
560 >>> a = np.array([[3, 0, 3, 7],
561 ... [3, 2, 6, 2],
562 ... [1, 7, 2, 8],
563 ... [3, 0, 6, 1],
564 ... [3, 2, 5, 5]])
565 >>> from scipy import stats
566 >>> stats.mode(a, keepdims=True)
567 ModeResult(mode=array([[3, 0, 6, 1]]), count=array([[4, 2, 2, 1]]))
569 To get mode of whole array, specify ``axis=None``:
571 >>> stats.mode(a, axis=None, keepdims=True)
572 ModeResult(mode=[3], count=[5])
573 >>> stats.mode(a, axis=None, keepdims=False)
574 ModeResult(mode=3, count=5)
576 """ # noqa: E501
578 if keepdims is None:
579 message = ("Unlike other reduction functions (e.g. `skew`, "
580 "`kurtosis`), the default behavior of `mode` typically "
581 "preserves the axis it acts along. In SciPy 1.11.0, "
582 "this behavior will change: the default value of "
583 "`keepdims` will become False, the `axis` over which "
584 "the statistic is taken will be eliminated, and the value "
585 "None will no longer be accepted. "
586 "Set `keepdims` to True or False to avoid this warning.")
587 warnings.warn(message, FutureWarning, stacklevel=2)
589 a = np.asarray(a)
590 if a.size == 0:
591 if keepdims is None:
592 return ModeResult(np.array([]), np.array([]))
593 else:
594 # this is tricky to get right; let np.mean do it
595 out = np.mean(a, axis=axis, keepdims=keepdims)
596 return ModeResult(out, out.copy())
598 a, axis = _chk_asarray(a, axis)
600 contains_nan, nan_policy = _contains_nan(a, nan_policy)
602 if contains_nan and nan_policy == 'omit':
603 a = ma.masked_invalid(a)
604 return mstats_basic._mode(a, axis, keepdims=keepdims)
606 if not np.issubdtype(a.dtype, np.number):
607 warnings.warn("Support for non-numeric arrays has been deprecated "
608 "as of SciPy 1.9.0 and will be removed in "
609 "1.11.0. `pandas.DataFrame.mode` can be used instead, "
610 "see https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.mode.html.", # noqa: E501
611 DeprecationWarning, stacklevel=2)
613 if a.dtype == object:
614 def _mode1D(a):
615 cntr = Counter(a)
616 mode = max(cntr, key=lambda x: cntr[x])
617 return mode, cntr[mode]
618 else:
619 def _mode1D(a):
620 vals, cnts = np.unique(a, return_counts=True)
621 return vals[cnts.argmax()], cnts.max()
623 # np.apply_along_axis will convert the _mode1D tuples to a numpy array,
624 # casting types in the process.
625 # This recreates the results without that issue
626 # View of a, rotated so the requested axis is last
627 a_view = np.moveaxis(a, axis, -1)
629 inds = np.ndindex(a_view.shape[:-1])
630 modes = np.empty(a_view.shape[:-1], dtype=a.dtype)
631 counts = np.empty(a_view.shape[:-1], dtype=np.int_)
632 for ind in inds:
633 modes[ind], counts[ind] = _mode1D(a_view[ind])
635 if keepdims is None or keepdims:
636 newshape = list(a.shape)
637 newshape[axis] = 1
638 return ModeResult(modes.reshape(newshape), counts.reshape(newshape))
639 else:
640 return ModeResult(modes[()], counts[()])
643def _mask_to_limits(a, limits, inclusive):
644 """Mask an array for values outside of given limits.
646 This is primarily a utility function.
648 Parameters
649 ----------
650 a : array
651 limits : (float or None, float or None)
652 A tuple consisting of the (lower limit, upper limit). Values in the
653 input array less than the lower limit or greater than the upper limit
654 will be masked out. None implies no limit.
655 inclusive : (bool, bool)
656 A tuple consisting of the (lower flag, upper flag). These flags
657 determine whether values exactly equal to lower or upper are allowed.
659 Returns
660 -------
661 A MaskedArray.
663 Raises
664 ------
665 A ValueError if there are no values within the given limits.
667 """
668 lower_limit, upper_limit = limits
669 lower_include, upper_include = inclusive
670 am = ma.MaskedArray(a)
671 if lower_limit is not None:
672 if lower_include:
673 am = ma.masked_less(am, lower_limit)
674 else:
675 am = ma.masked_less_equal(am, lower_limit)
677 if upper_limit is not None:
678 if upper_include:
679 am = ma.masked_greater(am, upper_limit)
680 else:
681 am = ma.masked_greater_equal(am, upper_limit)
683 if am.count() == 0:
684 raise ValueError("No array values within given limits")
686 return am
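# --- Editorial sketch (not part of the original module) ----------------------
# How `_mask_to_limits` interprets `limits` and `inclusive`: with limits
# (2, 7) and inclusive (True, False), values below 2 and values >= 7 are
# masked out, so only 2..6 survive. `_demo` is a throwaway name.
_demo = _mask_to_limits(np.arange(10), (2, 7), (True, False))
assert _demo.compressed().tolist() == [2, 3, 4, 5, 6]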
689def tmean(a, limits=None, inclusive=(True, True), axis=None):
690 """Compute the trimmed mean.
692 This function finds the arithmetic mean of given values, ignoring values
693 outside the given `limits`.
695 Parameters
696 ----------
697 a : array_like
698 Array of values.
699 limits : None or (lower limit, upper limit), optional
700 Values in the input array less than the lower limit or greater than the
701 upper limit will be ignored. When limits is None (default), then all
702 values are used. Either of the limit values in the tuple can also be
703 None representing a half-open interval.
704 inclusive : (bool, bool), optional
705 A tuple consisting of the (lower flag, upper flag). These flags
706 determine whether values exactly equal to the lower or upper limits
707 are included. The default value is (True, True).
708 axis : int or None, optional
709 Axis along which to operate. Default is None.
711 Returns
712 -------
713 tmean : ndarray
714 Trimmed mean.
716 See Also
717 --------
718 trim_mean : Returns mean after trimming a proportion from both tails.
720 Examples
721 --------
722 >>> import numpy as np
723 >>> from scipy import stats
724 >>> x = np.arange(20)
725 >>> stats.tmean(x)
726 9.5
727 >>> stats.tmean(x, (3,17))
728 10.0
730 """
731 a = asarray(a)
732 if limits is None:
733 return np.mean(a, axis)
734 am = _mask_to_limits(a, limits, inclusive)
735 mean = np.ma.filled(am.mean(axis=axis), fill_value=np.nan)
736 return mean if mean.ndim > 0 else mean.item()
739def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
740 """Compute the trimmed variance.
742 This function computes the sample variance of an array of values,
743 while ignoring values which are outside of given `limits`.
745 Parameters
746 ----------
747 a : array_like
748 Array of values.
749 limits : None or (lower limit, upper limit), optional
750 Values in the input array less than the lower limit or greater than the
751 upper limit will be ignored. When limits is None, then all values are
752 used. Either of the limit values in the tuple can also be None
753 representing a half-open interval. The default value is None.
754 inclusive : (bool, bool), optional
755 A tuple consisting of the (lower flag, upper flag). These flags
756 determine whether values exactly equal to the lower or upper limits
757 are included. The default value is (True, True).
758 axis : int or None, optional
759 Axis along which to operate. Default is 0. If None, compute over the
760 whole array `a`.
761 ddof : int, optional
762 Delta degrees of freedom. Default is 1.
764 Returns
765 -------
766 tvar : float
767 Trimmed variance.
769 Notes
770 -----
771 `tvar` computes the unbiased sample variance, i.e. it uses a correction
772 factor ``n / (n - 1)``.
774 Examples
775 --------
776 >>> import numpy as np
777 >>> from scipy import stats
778 >>> x = np.arange(20)
779 >>> stats.tvar(x)
780 35.0
781 >>> stats.tvar(x, (3,17))
782 20.0
784 """
785 a = asarray(a)
786 a = a.astype(float)
787 if limits is None:
788 return a.var(ddof=ddof, axis=axis)
789 am = _mask_to_limits(a, limits, inclusive)
790 amnan = am.filled(fill_value=np.nan)
791 return np.nanvar(amnan, ddof=ddof, axis=axis)
794def tmin(a, lowerlimit=None, axis=0, inclusive=True, nan_policy='propagate'):
795 """Compute the trimmed minimum.
797 This function finds the minimum value of an array `a` along the
798 specified axis, but only considering values greater than a specified
799 lower limit.
801 Parameters
802 ----------
803 a : array_like
804 Array of values.
805 lowerlimit : None or float, optional
806 Values in the input array less than the given limit will be ignored.
807 When lowerlimit is None, then all values are used. The default value
808 is None.
809 axis : int or None, optional
810 Axis along which to operate. Default is 0. If None, compute over the
811 whole array `a`.
812 inclusive : {True, False}, optional
813 This flag determines whether values exactly equal to the lower limit
814 are included. The default value is True.
815 nan_policy : {'propagate', 'raise', 'omit'}, optional
816 Defines how to handle when input contains nan.
817 The following options are available (default is 'propagate'):
819 * 'propagate': returns nan
820 * 'raise': throws an error
821 * 'omit': performs the calculations ignoring nan values
823 Returns
824 -------
825 tmin : float, int or ndarray
826 Trimmed minimum.
828 Examples
829 --------
830 >>> import numpy as np
831 >>> from scipy import stats
832 >>> x = np.arange(20)
833 >>> stats.tmin(x)
834 0
836 >>> stats.tmin(x, 13)
837 13
839 >>> stats.tmin(x, 13, inclusive=False)
840 14
842 """
843 a, axis = _chk_asarray(a, axis)
844 am = _mask_to_limits(a, (lowerlimit, None), (inclusive, False))
846 contains_nan, nan_policy = _contains_nan(am, nan_policy)
848 if contains_nan and nan_policy == 'omit':
849 am = ma.masked_invalid(am)
851 res = ma.minimum.reduce(am, axis).data
852 if res.ndim == 0:
853 return res[()]
854 return res
857def tmax(a, upperlimit=None, axis=0, inclusive=True, nan_policy='propagate'):
858 """Compute the trimmed maximum.
860 This function computes the maximum value of an array along a given axis,
861 while ignoring values larger than a specified upper limit.
863 Parameters
864 ----------
865 a : array_like
866 Array of values.
867 upperlimit : None or float, optional
868 Values in the input array greater than the given limit will be ignored.
869 When upperlimit is None, then all values are used. The default value
870 is None.
871 axis : int or None, optional
872 Axis along which to operate. Default is 0. If None, compute over the
873 whole array `a`.
874 inclusive : {True, False}, optional
875 This flag determines whether values exactly equal to the upper limit
876 are included. The default value is True.
877 nan_policy : {'propagate', 'raise', 'omit'}, optional
878 Defines how to handle when input contains nan.
879 The following options are available (default is 'propagate'):
881 * 'propagate': returns nan
882 * 'raise': throws an error
883 * 'omit': performs the calculations ignoring nan values
885 Returns
886 -------
887 tmax : float, int or ndarray
888 Trimmed maximum.
890 Examples
891 --------
892 >>> import numpy as np
893 >>> from scipy import stats
894 >>> x = np.arange(20)
895 >>> stats.tmax(x)
896 19
898 >>> stats.tmax(x, 13)
899 13
901 >>> stats.tmax(x, 13, inclusive=False)
902 12
904 """
905 a, axis = _chk_asarray(a, axis)
906 am = _mask_to_limits(a, (None, upperlimit), (False, inclusive))
908 contains_nan, nan_policy = _contains_nan(am, nan_policy)
910 if contains_nan and nan_policy == 'omit':
911 am = ma.masked_invalid(am)
913 res = ma.maximum.reduce(am, axis).data
914 if res.ndim == 0:
915 return res[()]
916 return res
919def tstd(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
920 """Compute the trimmed sample standard deviation.
922 This function finds the sample standard deviation of given values,
923 ignoring values outside the given `limits`.
925 Parameters
926 ----------
927 a : array_like
928 Array of values.
929 limits : None or (lower limit, upper limit), optional
930 Values in the input array less than the lower limit or greater than the
931 upper limit will be ignored. When limits is None, then all values are
932 used. Either of the limit values in the tuple can also be None
933 representing a half-open interval. The default value is None.
934 inclusive : (bool, bool), optional
935 A tuple consisting of the (lower flag, upper flag). These flags
936 determine whether values exactly equal to the lower or upper limits
937 are included. The default value is (True, True).
938 axis : int or None, optional
939 Axis along which to operate. Default is 0. If None, compute over the
940 whole array `a`.
941 ddof : int, optional
942 Delta degrees of freedom. Default is 1.
944 Returns
945 -------
946 tstd : float
947 Trimmed sample standard deviation.
949 Notes
950 -----
951 `tstd` computes the unbiased sample standard deviation, i.e. it uses a
952 correction factor ``n / (n - 1)``.
954 Examples
955 --------
956 >>> import numpy as np
957 >>> from scipy import stats
958 >>> x = np.arange(20)
959 >>> stats.tstd(x)
960 5.9160797830996161
961 >>> stats.tstd(x, (3,17))
962 4.4721359549995796
964 """
965 return np.sqrt(tvar(a, limits, inclusive, axis, ddof))
968def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
969 """Compute the trimmed standard error of the mean.
971 This function finds the standard error of the mean for given
972 values, ignoring values outside the given `limits`.
974 Parameters
975 ----------
976 a : array_like
977 Array of values.
978 limits : None or (lower limit, upper limit), optional
979 Values in the input array less than the lower limit or greater than the
980 upper limit will be ignored. When limits is None, then all values are
981 used. Either of the limit values in the tuple can also be None
982 representing a half-open interval. The default value is None.
983 inclusive : (bool, bool), optional
984 A tuple consisting of the (lower flag, upper flag). These flags
985 determine whether values exactly equal to the lower or upper limits
986 are included. The default value is (True, True).
987 axis : int or None, optional
988 Axis along which to operate. Default is 0. If None, compute over the
989 whole array `a`.
990 ddof : int, optional
991 Delta degrees of freedom. Default is 1.
993 Returns
994 -------
995 tsem : float
996 Trimmed standard error of the mean.
998 Notes
999 -----
1000 `tsem` uses unbiased sample standard deviation, i.e. it uses a
1001 correction factor ``n / (n - 1)``.
1003 Examples
1004 --------
1005 >>> import numpy as np
1006 >>> from scipy import stats
1007 >>> x = np.arange(20)
1008 >>> stats.tsem(x)
1009 1.3228756555322954
1010 >>> stats.tsem(x, (3,17))
1011 1.1547005383792515
1013 """
1014 a = np.asarray(a).ravel()
1015 if limits is None:
1016 return a.std(ddof=ddof) / np.sqrt(a.size)
1018 am = _mask_to_limits(a, limits, inclusive)
1019 sd = np.sqrt(np.ma.var(am, ddof=ddof, axis=axis))
1020 return sd / np.sqrt(am.count())
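# --- Editorial sketch (not part of the original module) ----------------------
# With no trimming, the trimmed statistics reduce to the plain ones:
# `tsem` is `tstd` divided by sqrt(n). `_x` is a throwaway name.
_x = np.arange(20.0)
assert np.isclose(tsem(_x), tstd(_x) / np.sqrt(_x.size))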
1023#####################################
1024# MOMENTS #
1025#####################################
1028def _moment_outputs(kwds):
1029 moment = np.atleast_1d(kwds.get('moment', 1))
1030 if moment.size == 0:
1031 raise ValueError("'moment' must be a scalar or a non-empty 1D "
1032 "list/array.")
1033 return len(moment)
1036def _moment_result_object(*args):
1037 if len(args) == 1:
1038 return args[0]
1039 return np.asarray(args)
1041# `moment` fits into the `_axis_nan_policy` pattern, but it is a bit unusual
1042# because the number of outputs is variable. Specifically,
1043# `result_to_tuple=lambda x: (x,)` may be surprising for a function that
1044# can produce more than one output, but it is intended here.
1045# When `moment` is called to produce the output:
1046# - `result_to_tuple` packs the returned array into a single-element tuple,
1047# - `_moment_result_object` extracts and returns that single element.
1048# However, when the input array is empty, `moment` is never called. Instead,
1049# - `_check_empty_inputs` is used to produce an empty array with the
1050# appropriate dimensions.
1051# - A list comprehension creates the appropriate number of copies of this
1052# array, depending on `n_outputs`.
1053# - This list - which may have multiple elements - is passed into
1054# `_moment_result_object`.
1055# - If there is a single output, `_moment_result_object` extracts and returns
1056# the single output from the list.
1057# - If there are multiple outputs, and therefore multiple elements in the list,
1058# `_moment_result_object` converts the list of arrays to a single array and
1059# returns it.
1060# Currently this leads to a slight inconsistency: when the input array is
1061# empty, there is no distinction between the `moment` function being called
1062# with parameter `moment=1` and `moment=[1]`; the latter *should* produce
1063# the same as the former but with a singleton zeroth dimension.
1064@_axis_nan_policy_factory( # noqa: E302
1065 _moment_result_object, n_samples=1, result_to_tuple=lambda x: (x,),
1066 n_outputs=_moment_outputs
1067)
1068def moment(a, moment=1, axis=0, nan_policy='propagate'):
1069 r"""Calculate the nth moment about the mean for a sample.
1071 A moment is a specific quantitative measure of the shape of a set of
1072 points. It is often used to calculate coefficients of skewness and kurtosis
1073 due to its close relationship with them.
1075 Parameters
1076 ----------
1077 a : array_like
1078 Input array.
1079 moment : int or array_like of ints, optional
1080 Order of central moment that is returned. Default is 1.
1081 axis : int or None, optional
1082 Axis along which the central moment is computed. Default is 0.
1083 If None, compute over the whole array `a`.
1084 nan_policy : {'propagate', 'raise', 'omit'}, optional
1085 Defines how to handle when input contains nan.
1086 The following options are available (default is 'propagate'):
1088 * 'propagate': returns nan
1089 * 'raise': throws an error
1090 * 'omit': performs the calculations ignoring nan values
1092 Returns
1093 -------
1094 n-th central moment : ndarray or float
1095 The appropriate moment along the given axis or over all values if axis
1096 is None. The denominator for the moment calculation is the number of
1097 observations; no degrees of freedom correction is done.
1099 See Also
1100 --------
1101 kurtosis, skew, describe
1103 Notes
1104 -----
1105 The k-th central moment of a data sample is:
1107 .. math::
1109 m_k = \frac{1}{n} \sum_{i = 1}^n (x_i - \bar{x})^k
1111 Where n is the number of samples and x-bar is the mean. This function uses
1112 exponentiation by squares [1]_ for efficiency.
1114 Note that, if `a` is an empty array (``a.size == 0``), array `moment` with
1115 one element (`moment.size == 1`) is treated the same as scalar `moment`
1116 (``np.isscalar(moment)``). This might produce arrays of unexpected shape.
1118 References
1119 ----------
1120 .. [1] https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms
1122 Examples
1123 --------
1124 >>> from scipy.stats import moment
1125 >>> moment([1, 2, 3, 4, 5], moment=1)
1126 0.0
1127 >>> moment([1, 2, 3, 4, 5], moment=2)
1128 2.0
1130 """
1131 a, axis = _chk_asarray(a, axis)
1133 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1135 if contains_nan and nan_policy == 'omit':
1136 a = ma.masked_invalid(a)
1137 return mstats_basic.moment(a, moment, axis)
1139 # for array_like moment input, return a value for each.
1140 if not np.isscalar(moment):
1141 mean = a.mean(axis, keepdims=True)
1142 mmnt = [_moment(a, i, axis, mean=mean) for i in moment]
1143 return np.array(mmnt)
1144 else:
1145 return _moment(a, moment, axis)
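# --- Editorial sketch (not part of the original module) ----------------------
# The variable-output packing described in the comment block above `moment`:
# a scalar `moment` order yields a scalar, an array_like of orders yields one
# value per order (here the first and second central moments of [1..5]).
assert np.isclose(moment([1, 2, 3, 4, 5], moment=2), 2.0)
assert np.allclose(moment([1, 2, 3, 4, 5], moment=[1, 2]), [0.0, 2.0])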
1148# Moment with optional pre-computed mean, equal to a.mean(axis, keepdims=True)
1149def _moment(a, moment, axis, *, mean=None):
1150 if np.abs(moment - np.round(moment)) > 0:
1151 raise ValueError("All moment parameters must be integers")
1153 # moment of empty array is the same regardless of order
1154 if a.size == 0:
1155 return np.mean(a, axis=axis)
1157 if moment == 0 or moment == 1:
1158 # By definition the zeroth moment about the mean is 1, and the first
1159 # moment is 0.
1160 shape = list(a.shape)
1161 del shape[axis]
1162 dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
1164 if len(shape) == 0:
1165 return dtype(1.0 if moment == 0 else 0.0)
1166 else:
1167 return (np.ones(shape, dtype=dtype) if moment == 0
1168 else np.zeros(shape, dtype=dtype))
1169 else:
1170 # Exponentiation by squares: form exponent sequence
1171 n_list = [moment]
1172 current_n = moment
1173 while current_n > 2:
1174 if current_n % 2:
1175 current_n = (current_n - 1) / 2
1176 else:
1177 current_n /= 2
1178 n_list.append(current_n)
1180 # Starting point for exponentiation by squares
1181 mean = a.mean(axis, keepdims=True) if mean is None else mean
1182 a_zero_mean = a - mean
1184 eps = np.finfo(a_zero_mean.dtype).resolution * 10
1185 with np.errstate(divide='ignore', invalid='ignore'):
1186 rel_diff = np.max(np.abs(a_zero_mean), axis=axis,
1187 keepdims=True) / np.abs(mean)
1188 with np.errstate(invalid='ignore'):
1189 precision_loss = np.any(rel_diff < eps)
1190 if precision_loss:
1191 message = ("Precision loss occurred in moment calculation due to "
1192 "catastrophic cancellation. This occurs when the data "
1193 "are nearly identical. Results may be unreliable.")
1194 warnings.warn(message, RuntimeWarning, stacklevel=4)
1196 if n_list[-1] == 1:
1197 s = a_zero_mean.copy()
1198 else:
1199 s = a_zero_mean**2
1201 # Perform multiplications
1202 for n in n_list[-2::-1]:
1203 s = s**2
1204 if n % 2:
1205 s *= a_zero_mean
1206 return np.mean(s, axis)
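# --- Editorial sketch (not part of the original module) ----------------------
# Exponentiation by squares in `_moment` is only an efficient way of raising
# the centered data to the requested power; the result matches the direct
# definition of a central moment. `_x` is a throwaway name.
_x = np.array([2.0, 8.0, 0.0, 4.0, 1.0, 9.0, 9.0, 0.0])
assert np.isclose(_moment(_x, 3, 0), np.mean((_x - _x.mean())**3))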
1209def _var(x, axis=0, ddof=0, mean=None):
1210 # Calculate variance of sample, warning if precision is lost
1211 var = _moment(x, 2, axis, mean=mean)
1212 if ddof != 0:
1213 n = x.shape[axis] if axis is not None else x.size
1214 var *= np.divide(n, n-ddof) # to avoid error on division by zero
1215 return var
1218@_axis_nan_policy_factory(
1219 lambda x: x, result_to_tuple=lambda x: (x,), n_outputs=1
1220)
1221def skew(a, axis=0, bias=True, nan_policy='propagate'):
1222 r"""Compute the sample skewness of a data set.
1224 For normally distributed data, the skewness should be about zero. For
1225 unimodal continuous distributions, a skewness value greater than zero means
1226 that there is more weight in the right tail of the distribution. The
1227 function `skewtest` can be used to determine if the skewness value
1228 is close enough to zero, statistically speaking.
1230 Parameters
1231 ----------
1232 a : ndarray
1233 Input array.
1234 axis : int or None, optional
1235 Axis along which skewness is calculated. Default is 0.
1236 If None, compute over the whole array `a`.
1237 bias : bool, optional
1238 If False, then the calculations are corrected for statistical bias.
1239 nan_policy : {'propagate', 'raise', 'omit'}, optional
1240 Defines how to handle when input contains nan.
1241 The following options are available (default is 'propagate'):
1243 * 'propagate': returns nan
1244 * 'raise': throws an error
1245 * 'omit': performs the calculations ignoring nan values
1247 Returns
1248 -------
1249 skewness : ndarray
1250 The skewness of values along an axis, returning NaN where all values
1251 are equal.
1253 Notes
1254 -----
1255 The sample skewness is computed as the Fisher-Pearson coefficient
1256 of skewness, i.e.
1258 .. math::
1260 g_1=\frac{m_3}{m_2^{3/2}}
1262 where
1264 .. math::
1266 m_i=\frac{1}{N}\sum_{n=1}^N(x[n]-\bar{x})^i
1268 is the biased sample :math:`i\texttt{th}` central moment, and
1269 :math:`\bar{x}` is
1270 the sample mean. If ``bias`` is False, the calculations are
1271 corrected for bias and the value computed is the adjusted
1272 Fisher-Pearson standardized moment coefficient, i.e.
1274 .. math::
1276 G_1=\frac{k_3}{k_2^{3/2}}=
1277 \frac{\sqrt{N(N-1)}}{N-2}\frac{m_3}{m_2^{3/2}}.
1279 References
1280 ----------
1281 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
1282 Probability and Statistics Tables and Formulae. Chapman & Hall: New
1283 York. 2000.
1284 Section 2.2.24.1
1286 Examples
1287 --------
1288 >>> from scipy.stats import skew
1289 >>> skew([1, 2, 3, 4, 5])
1290 0.0
1291 >>> skew([2, 8, 0, 4, 1, 9, 9, 0])
1292 0.2650554122698573
1294 """
1295 a, axis = _chk_asarray(a, axis)
1296 n = a.shape[axis]
1298 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1300 if contains_nan and nan_policy == 'omit':
1301 a = ma.masked_invalid(a)
1302 return mstats_basic.skew(a, axis, bias)
1304 mean = a.mean(axis, keepdims=True)
1305 m2 = _moment(a, 2, axis, mean=mean)
1306 m3 = _moment(a, 3, axis, mean=mean)
1307 with np.errstate(all='ignore'):
1308 zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
1309 vals = np.where(zero, np.nan, m3 / m2**1.5)
1310 if not bias:
1311 can_correct = ~zero & (n > 2)
1312 if can_correct.any():
1313 m2 = np.extract(can_correct, m2)
1314 m3 = np.extract(can_correct, m3)
1315 nval = np.sqrt((n - 1.0) * n) / (n - 2.0) * m3 / m2**1.5
1316 np.place(vals, can_correct, nval)
1318 if vals.ndim == 0:
1319 return vals.item()
1321 return vals
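# --- Editorial sketch (not part of the original module) ----------------------
# Relation between the biased Fisher-Pearson coefficient g1 and the
# bias-corrected G1 from the docstring above: G1 = sqrt(n*(n-1))/(n-2) * g1.
# `_x` is a throwaway name (n = 8 here).
_x = np.array([2.0, 8.0, 0.0, 4.0, 1.0, 9.0, 9.0, 0.0])
assert np.isclose(skew(_x, bias=False), np.sqrt(8 * 7) / 6 * skew(_x))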
1324@_axis_nan_policy_factory(
1325 lambda x: x, result_to_tuple=lambda x: (x,), n_outputs=1
1326)
1327def kurtosis(a, axis=0, fisher=True, bias=True, nan_policy='propagate'):
1328 """Compute the kurtosis (Fisher or Pearson) of a dataset.
1330 Kurtosis is the fourth central moment divided by the square of the
1331 variance. If Fisher's definition is used, then 3.0 is subtracted from
1332 the result to give 0.0 for a normal distribution.
1334 If bias is False then the kurtosis is calculated using k statistics to
1335 eliminate bias coming from biased moment estimators.
1337 Use `kurtosistest` to see if result is close enough to normal.
1339 Parameters
1340 ----------
1341 a : array
1342 Data for which the kurtosis is calculated.
1343 axis : int or None, optional
1344 Axis along which the kurtosis is calculated. Default is 0.
1345 If None, compute over the whole array `a`.
1346 fisher : bool, optional
1347 If True, Fisher's definition is used (normal ==> 0.0). If False,
1348 Pearson's definition is used (normal ==> 3.0).
1349 bias : bool, optional
1350 If False, then the calculations are corrected for statistical bias.
1351 nan_policy : {'propagate', 'raise', 'omit'}, optional
1352 Defines how to handle when input contains nan. 'propagate' returns nan,
1353 'raise' throws an error, 'omit' performs the calculations ignoring nan
1354 values. Default is 'propagate'.
1356 Returns
1357 -------
1358 kurtosis : array
1359 The kurtosis of values along an axis, returning NaN where all values
1360 are equal.
1362 References
1363 ----------
1364 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
1365 Probability and Statistics Tables and Formulae. Chapman & Hall: New
1366 York. 2000.
1368 Examples
1369 --------
1370 In Fisher's definition, the kurtosis of the normal distribution is zero.
1371 In the following example, the kurtosis is close to zero, because it was
1372 calculated from the dataset, not from the continuous distribution.
1374 >>> import numpy as np
1375 >>> from scipy.stats import norm, kurtosis
1376 >>> data = norm.rvs(size=1000, random_state=3)
1377 >>> kurtosis(data)
1378 -0.06928694200380558
1380 The distribution with a higher kurtosis has a heavier tail.
1381 The zero valued kurtosis of the normal distribution in Fisher's definition
1382 can serve as a reference point.
1384 >>> import matplotlib.pyplot as plt
1385 >>> import scipy.stats as stats
1386 >>> from scipy.stats import kurtosis
1388 >>> x = np.linspace(-5, 5, 100)
1389 >>> ax = plt.subplot()
1390 >>> distnames = ['laplace', 'norm', 'uniform']
1392 >>> for distname in distnames:
1393 ... if distname == 'uniform':
1394 ... dist = getattr(stats, distname)(loc=-2, scale=4)
1395 ... else:
1396 ... dist = getattr(stats, distname)
1397 ... data = dist.rvs(size=1000)
1398 ... kur = kurtosis(data, fisher=True)
1399 ... y = dist.pdf(x)
1400 ... ax.plot(x, y, label="{}, {}".format(distname, round(kur, 3)))
1401 ... ax.legend()
1403 The Laplace distribution has a heavier tail than the normal distribution.
1404 The uniform distribution (which has negative kurtosis) has the thinnest
1405 tail.
1407 """
1408 a, axis = _chk_asarray(a, axis)
1410 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1412 if contains_nan and nan_policy == 'omit':
1413 a = ma.masked_invalid(a)
1414 return mstats_basic.kurtosis(a, axis, fisher, bias)
1416 n = a.shape[axis]
1417 mean = a.mean(axis, keepdims=True)
1418 m2 = _moment(a, 2, axis, mean=mean)
1419 m4 = _moment(a, 4, axis, mean=mean)
1420 with np.errstate(all='ignore'):
1421 zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
1422 vals = np.where(zero, np.nan, m4 / m2**2.0)
1424 if not bias:
1425 can_correct = ~zero & (n > 3)
1426 if can_correct.any():
1427 m2 = np.extract(can_correct, m2)
1428 m4 = np.extract(can_correct, m4)
1429 nval = 1.0/(n-2)/(n-3) * ((n**2-1.0)*m4/m2**2.0 - 3*(n-1)**2.0)
1430 np.place(vals, can_correct, nval + 3.0)
1432 if vals.ndim == 0:
1433 vals = vals.item() # array scalar
1435 return vals - 3 if fisher else vals
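# --- Editorial sketch (not part of the original module) ----------------------
# Fisher's and Pearson's definitions differ only by the constant 3 subtracted
# to make the normal distribution the zero reference. `_x` is a throwaway name.
_x = np.array([2.0, 8.0, 0.0, 4.0, 1.0, 9.0, 9.0, 0.0])
assert np.isclose(kurtosis(_x, fisher=True), kurtosis(_x, fisher=False) - 3.0)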
1438DescribeResult = namedtuple('DescribeResult',
1439 ('nobs', 'minmax', 'mean', 'variance', 'skewness',
1440 'kurtosis'))
1443def describe(a, axis=0, ddof=1, bias=True, nan_policy='propagate'):
1444 """Compute several descriptive statistics of the passed array.
1446 Parameters
1447 ----------
1448 a : array_like
1449 Input data.
1450 axis : int or None, optional
1451 Axis along which statistics are calculated. Default is 0.
1452 If None, compute over the whole array `a`.
1453 ddof : int, optional
1454 Delta degrees of freedom (only for variance). Default is 1.
1455 bias : bool, optional
1456 If False, then the skewness and kurtosis calculations are corrected
1457 for statistical bias.
1458 nan_policy : {'propagate', 'raise', 'omit'}, optional
1459 Defines how to handle when input contains nan.
1460 The following options are available (default is 'propagate'):
1462 * 'propagate': returns nan
1463 * 'raise': throws an error
1464 * 'omit': performs the calculations ignoring nan values
1466 Returns
1467 -------
1468 nobs : int or ndarray of ints
1469 Number of observations (length of data along `axis`).
1470 When 'omit' is chosen as nan_policy, the length along each axis
1471 slice is counted separately.
1472 minmax: tuple of ndarrays or floats
1473 Minimum and maximum value of `a` along the given axis.
1474 mean : ndarray or float
1475 Arithmetic mean of `a` along the given axis.
1476 variance : ndarray or float
1477 Unbiased variance of `a` along the given axis; denominator is number
1478 of observations minus one.
1479 skewness : ndarray or float
1480 Skewness of `a` along the given axis, based on moment calculations
1481 with denominator equal to the number of observations, i.e. no degrees
1482 of freedom correction.
1483 kurtosis : ndarray or float
1484 Kurtosis (Fisher) of `a` along the given axis. The kurtosis is
1485 normalized so that it is zero for the normal distribution. No
1486 degrees of freedom are used.
1488 See Also
1489 --------
1490 skew, kurtosis
1492 Examples
1493 --------
1494 >>> import numpy as np
1495 >>> from scipy import stats
1496 >>> a = np.arange(10)
1497 >>> stats.describe(a)
1498 DescribeResult(nobs=10, minmax=(0, 9), mean=4.5,
1499 variance=9.166666666666666, skewness=0.0,
1500 kurtosis=-1.2242424242424244)
1501 >>> b = [[1, 2], [3, 4]]
1502 >>> stats.describe(b)
1503 DescribeResult(nobs=2, minmax=(array([1, 2]), array([3, 4])),
1504 mean=array([2., 3.]), variance=array([2., 2.]),
1505 skewness=array([0., 0.]), kurtosis=array([-2., -2.]))
1507 """
1508 a, axis = _chk_asarray(a, axis)
1510 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1512 if contains_nan and nan_policy == 'omit':
1513 a = ma.masked_invalid(a)
1514 return mstats_basic.describe(a, axis, ddof, bias)
1516 if a.size == 0:
1517 raise ValueError("The input must not be empty.")
1518 n = a.shape[axis]
1519 mm = (np.min(a, axis=axis), np.max(a, axis=axis))
1520 m = np.mean(a, axis=axis)
1521 v = _var(a, axis=axis, ddof=ddof)
1522 sk = skew(a, axis, bias=bias)
1523 kurt = kurtosis(a, axis, bias=bias)
1525 return DescribeResult(n, mm, m, v, sk, kurt)
1527#####################################
1528# NORMALITY TESTS #
1529#####################################
1532def _normtest_finish(z, alternative):
1533 """Common code between all the normality-test functions."""
1534 if alternative == 'less':
1535 prob = distributions.norm.cdf(z)
1536 elif alternative == 'greater':
1537 prob = distributions.norm.sf(z)
1538 elif alternative == 'two-sided':
1539 prob = 2 * distributions.norm.sf(np.abs(z))
1540 else:
1541 raise ValueError("alternative must be "
1542 "'less', 'greater' or 'two-sided'")
1544 if z.ndim == 0:
1545 z = z[()]
1547 return z, prob
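# --- Editorial sketch (not part of the original module) ----------------------
# Because the reference distribution here is the symmetric standard normal,
# the two-sided p-value is twice the smaller of the two one-sided p-values.
# `_z` is a throwaway name.
_z = np.asarray(1.3)
assert np.isclose(_normtest_finish(_z, 'two-sided')[1],
                  2 * min(_normtest_finish(_z, 'less')[1],
                          _normtest_finish(_z, 'greater')[1]))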
1550SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue'))
1553def skewtest(a, axis=0, nan_policy='propagate', alternative='two-sided'):
1554 """Test whether the skew is different from the normal distribution.
1556 This function tests the null hypothesis that the skewness of
1557 the population that the sample was drawn from is the same
1558 as that of a corresponding normal distribution.
1560 Parameters
1561 ----------
1562 a : array
1563 The data to be tested.
1564 axis : int or None, optional
1565 Axis along which statistics are calculated. Default is 0.
1566 If None, compute over the whole array `a`.
1567 nan_policy : {'propagate', 'raise', 'omit'}, optional
1568 Defines how to handle when input contains nan.
1569 The following options are available (default is 'propagate'):
1571 * 'propagate': returns nan
1572 * 'raise': throws an error
1573 * 'omit': performs the calculations ignoring nan values
1575 alternative : {'two-sided', 'less', 'greater'}, optional
1576 Defines the alternative hypothesis. Default is 'two-sided'.
1577 The following options are available:
1579 * 'two-sided': the skewness of the distribution underlying the sample
1580 is different from that of the normal distribution (i.e. 0)
1581 * 'less': the skewness of the distribution underlying the sample
1582 is less than that of the normal distribution
1583 * 'greater': the skewness of the distribution underlying the sample
1584 is greater than that of the normal distribution
1586 .. versionadded:: 1.7.0
1588 Returns
1589 -------
1590 statistic : float
1591 The computed z-score for this test.
1592 pvalue : float
1593 The p-value for the hypothesis test.
1595 Notes
1596 -----
1597 The sample size must be at least 8.
1599 References
1600 ----------
1601 .. [1] R. B. D'Agostino, A. J. Belanger and R. B. D'Agostino Jr.,
1602 "A suggestion for using powerful and informative tests of
1603 normality", American Statistician 44, pp. 316-321, 1990.
1605 Examples
1606 --------
1607 >>> from scipy.stats import skewtest
1608 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8])
1609 SkewtestResult(statistic=1.0108048609177787, pvalue=0.3121098361421897)
1610 >>> skewtest([2, 8, 0, 4, 1, 9, 9, 0])
1611 SkewtestResult(statistic=0.44626385374196975, pvalue=0.6554066631275459)
1612 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8000])
1613 SkewtestResult(statistic=3.571773510360407, pvalue=0.0003545719905823133)
1614 >>> skewtest([100, 100, 100, 100, 100, 100, 100, 101])
1615 SkewtestResult(statistic=3.5717766638478072, pvalue=0.000354567720281634)
1616 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8], alternative='less')
1617 SkewtestResult(statistic=1.0108048609177787, pvalue=0.8439450819289052)
1618 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8], alternative='greater')
1619 SkewtestResult(statistic=1.0108048609177787, pvalue=0.15605491807109484)
1621 """
1622 a, axis = _chk_asarray(a, axis)
1624 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1626 if contains_nan and nan_policy == 'omit':
1627 a = ma.masked_invalid(a)
1628 return mstats_basic.skewtest(a, axis, alternative)
1630 if axis is None:
1631 a = np.ravel(a)
1632 axis = 0
1633 b2 = skew(a, axis)
1634 n = a.shape[axis]
1635 if n < 8:
1636 raise ValueError(
1637 "skewtest is not valid with less than 8 samples; %i samples"
1638 " were given." % int(n))
1639 y = b2 * math.sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2)))
1640 beta2 = (3.0 * (n**2 + 27*n - 70) * (n+1) * (n+3) /
1641 ((n-2.0) * (n+5) * (n+7) * (n+9)))
1642 W2 = -1 + math.sqrt(2 * (beta2 - 1))
1643 delta = 1 / math.sqrt(0.5 * math.log(W2))
1644 alpha = math.sqrt(2.0 / (W2 - 1))
1645 y = np.where(y == 0, 1, y)
1646 Z = delta * np.log(y / alpha + np.sqrt((y / alpha)**2 + 1))
1648 return SkewtestResult(*_normtest_finish(Z, alternative))
1651KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue'))
1654def kurtosistest(a, axis=0, nan_policy='propagate', alternative='two-sided'):
1655 """Test whether a dataset has normal kurtosis.
1657 This function tests the null hypothesis that the kurtosis
1658 of the population from which the sample was drawn is that
1659 of the normal distribution.
1661 Parameters
1662 ----------
1663 a : array
1664 Array of the sample data.
1665 axis : int or None, optional
1666 Axis along which to compute test. Default is 0. If None,
1667 compute over the whole array `a`.
1668 nan_policy : {'propagate', 'raise', 'omit'}, optional
1669 Defines how to handle when input contains nan.
1670 The following options are available (default is 'propagate'):
1672 * 'propagate': returns nan
1673 * 'raise': throws an error
1674 * 'omit': performs the calculations ignoring nan values
1676 alternative : {'two-sided', 'less', 'greater'}, optional
1677 Defines the alternative hypothesis.
1678 The following options are available (default is 'two-sided'):
1680 * 'two-sided': the kurtosis of the distribution underlying the sample
1681 is different from that of the normal distribution
1682 * 'less': the kurtosis of the distribution underlying the sample
1683 is less than that of the normal distribution
1684 * 'greater': the kurtosis of the distribution underlying the sample
1685 is greater than that of the normal distribution
1687 .. versionadded:: 1.7.0
1689 Returns
1690 -------
1691 statistic : float
1692 The computed z-score for this test.
1693 pvalue : float
1694 The p-value for the hypothesis test.
1696 Notes
1697 -----
1698 Valid only for n>20. This function uses the method described in [1]_.
1700 References
1701 ----------
1702 .. [1] see e.g. F. J. Anscombe, W. J. Glynn, "Distribution of the kurtosis
1703 statistic b2 for normal samples", Biometrika, vol. 70, pp. 227-234, 1983.
1705 Examples
1706 --------
1707 >>> import numpy as np
1708 >>> from scipy.stats import kurtosistest
1709 >>> kurtosistest(list(range(20)))
1710 KurtosistestResult(statistic=-1.7058104152122062, pvalue=0.08804338332528348)
1711 >>> kurtosistest(list(range(20)), alternative='less')
1712 KurtosistestResult(statistic=-1.7058104152122062, pvalue=0.04402169166264174)
1713 >>> kurtosistest(list(range(20)), alternative='greater')
1714 KurtosistestResult(statistic=-1.7058104152122062, pvalue=0.9559783083373583)
1716 >>> rng = np.random.default_rng()
1717 >>> s = rng.normal(0, 1, 1000)
1718 >>> kurtosistest(s)
1719 KurtosistestResult(statistic=-1.475047944490622, pvalue=0.14019965402996987)
1721 """
1722 a, axis = _chk_asarray(a, axis)
1724 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1726 if contains_nan and nan_policy == 'omit':
1727 a = ma.masked_invalid(a)
1728 return mstats_basic.kurtosistest(a, axis, alternative)
1730 n = a.shape[axis]
1731 if n < 5:
1732 raise ValueError(
1733 "kurtosistest requires at least 5 observations; %i observations"
1734 " were given." % int(n))
1735 if n < 20:
1736 warnings.warn("kurtosistest only valid for n>=20 ... continuing "
1737 "anyway, n=%i" % int(n))
1738 b2 = kurtosis(a, axis, fisher=False)
1740 E = 3.0*(n-1) / (n+1)
1741 varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5)) # [1]_ Eq. 1
1742 x = (b2-E) / np.sqrt(varb2) # [1]_ Eq. 4
1743 # [1]_ Eq. 2:
1744 sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) /
1745 (n*(n-2)*(n-3)))
1746 # [1]_ Eq. 3:
1747 A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2)))
1748 term1 = 1 - 2/(9.0*A)
1749 denom = 1 + x*np.sqrt(2/(A-4.0))
1750 term2 = np.sign(denom) * np.where(denom == 0.0, np.nan,
1751 np.power((1-2.0/A)/np.abs(denom), 1/3.0))
1752 if np.any(denom == 0):
1753 msg = "Test statistic not defined in some cases due to division by " \
1754 "zero. Return nan in that case..."
1755 warnings.warn(msg, RuntimeWarning)
1757 Z = (term1 - term2) / np.sqrt(2/(9.0*A)) # [1]_ Eq. 5
1759 # zprob uses upper tail, so Z needs to be positive
1760 return KurtosistestResult(*_normtest_finish(Z, alternative))
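# A minimal sketch (the helper name below is hypothetical and not part of the
# module) of how the Anscombe-Glynn Z statistic computed above can be
# sanity-checked: for a large sample drawn from a normal distribution the
# statistic should be approximately standard normal and the p-value large.
def _sketch_kurtosistest_check(seed=0):
    rng = np.random.default_rng(seed)
    sample = rng.normal(size=100_000)
    res = kurtosistest(sample)
    # res.statistic should be close to 0; res.pvalue typically well above 0.05.
    return res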
1763NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue'))
1766def normaltest(a, axis=0, nan_policy='propagate'):
1767 """Test whether a sample differs from a normal distribution.
1769 This function tests the null hypothesis that a sample comes
1770 from a normal distribution. It is based on D'Agostino and
1771 Pearson's [1]_, [2]_ test that combines skew and kurtosis to
1772 produce an omnibus test of normality.
1774 Parameters
1775 ----------
1776 a : array_like
1777 The array containing the sample to be tested.
1778 axis : int or None, optional
1779 Axis along which to compute test. Default is 0. If None,
1780 compute over the whole array `a`.
1781 nan_policy : {'propagate', 'raise', 'omit'}, optional
1782 Defines how to handle when input contains nan.
1783 The following options are available (default is 'propagate'):
1785 * 'propagate': returns nan
1786 * 'raise': throws an error
1787 * 'omit': performs the calculations ignoring nan values
1789 Returns
1790 -------
1791 statistic : float or array
1792 ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and
1793 ``k`` is the z-score returned by `kurtosistest`.
1794 pvalue : float or array
1795 A 2-sided chi squared probability for the hypothesis test.
1797 References
1798 ----------
1799 .. [1] D'Agostino, R. B. (1971), "An omnibus test of normality for
1800 moderate and large sample size", Biometrika, 58, 341-348
1802 .. [2] D'Agostino, R. and Pearson, E. S. (1973), "Tests for departure from
1803 normality", Biometrika, 60, 613-622
1805 Examples
1806 --------
1807 >>> import numpy as np
1808 >>> from scipy import stats
1809 >>> rng = np.random.default_rng()
1810 >>> pts = 1000
1811 >>> a = rng.normal(0, 1, size=pts)
1812 >>> b = rng.normal(2, 1, size=pts)
1813 >>> x = np.concatenate((a, b))
1814 >>> k2, p = stats.normaltest(x)
1815 >>> alpha = 1e-3
1816 >>> print("p = {:g}".format(p))
1817 p = 8.4713e-19
1818 >>> if p < alpha: # null hypothesis: x comes from a normal distribution
1819 ... print("The null hypothesis can be rejected")
1820 ... else:
1821 ... print("The null hypothesis cannot be rejected")
1822 The null hypothesis can be rejected
1824 """
1825 a, axis = _chk_asarray(a, axis)
1827 contains_nan, nan_policy = _contains_nan(a, nan_policy)
1829 if contains_nan and nan_policy == 'omit':
1830 a = ma.masked_invalid(a)
1831 return mstats_basic.normaltest(a, axis)
1833 s, _ = skewtest(a, axis)
1834 k, _ = kurtosistest(a, axis)
1835 k2 = s*s + k*k
1837 return NormaltestResult(k2, distributions.chi2.sf(k2, 2))
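# A minimal sketch (hypothetical helper, not part of the module) confirming
# the decomposition implemented above: the normaltest statistic is the sum of
# the squared skewtest and kurtosistest z-scores, and the p-value is the
# chi-squared (df=2) survival function of that sum.
def _sketch_normaltest_decomposition(seed=0):
    rng = np.random.default_rng(seed)
    x = rng.normal(size=1000)
    s, _ = skewtest(x)
    k, _ = kurtosistest(x)
    k2, p = normaltest(x)
    assert np.isclose(k2, s*s + k*k)
    assert np.isclose(p, distributions.chi2.sf(k2, 2))
    return k2, p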
1840@_axis_nan_policy_factory(SignificanceResult, default_axis=None)
1841def jarque_bera(x, *, axis=None):
1842 """Perform the Jarque-Bera goodness of fit test on sample data.
1844 The Jarque-Bera test tests whether the sample data has the skewness and
1845 kurtosis matching a normal distribution.
1847 Note that this test only works for a large enough number of data samples
1848 (>2000) as the test statistic asymptotically has a Chi-squared distribution
1849 with 2 degrees of freedom.
1851 Parameters
1852 ----------
1853 x : array_like
1854 Observations of a random variable.
1855 axis : int or None, default: 0
1856 If an int, the axis of the input along which to compute the statistic.
1857 The statistic of each axis-slice (e.g. row) of the input will appear in
1858 a corresponding element of the output.
1859 If ``None``, the input will be raveled before computing the statistic.
1861 Returns
1862 -------
1863 result : SignificanceResult
1864 An object with the following attributes:
1866 statistic : float
1867 The test statistic.
1868 pvalue : float
1869 The p-value for the hypothesis test.
1871 References
1872 ----------
1873 .. [1] Jarque, C. and Bera, A. (1980) "Efficient tests for normality,
1874 homoscedasticity and serial independence of regression residuals",
1875 6 Econometric Letters 255-259.
1877 Examples
1878 --------
1879 >>> import numpy as np
1880 >>> from scipy import stats
1881 >>> rng = np.random.default_rng()
1882 >>> x = rng.normal(0, 1, 100000)
1883 >>> jarque_bera_test = stats.jarque_bera(x)
1884 >>> jarque_bera_test
1885 Jarque_beraResult(statistic=3.3415184718131554, pvalue=0.18810419594996775)
1886 >>> jarque_bera_test.statistic
1887 3.3415184718131554
1888 >>> jarque_bera_test.pvalue
1889 0.18810419594996775
1891 """
1892 x = np.asarray(x)
1893 if axis is None:
1894 x = x.ravel()
1895 axis = 0
1897 n = x.shape[axis]
1898 if n == 0:
1899 raise ValueError('At least one observation is required.')
1901 mu = x.mean(axis=axis, keepdims=True)
1902 diffx = x - mu
1903 s = skew(diffx, axis=axis, _no_deco=True)
1904 k = kurtosis(diffx, axis=axis, _no_deco=True)
1905 statistic = n / 6 * (s**2 + k**2 / 4)
1906 pvalue = distributions.chi2.sf(statistic, df=2)
1908 return SignificanceResult(statistic, pvalue)
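# A minimal sketch (hypothetical helper, not part of the module) recomputing
# the Jarque-Bera statistic by hand from the sample skewness and excess
# kurtosis, mirroring the body above: JB = n/6 * (s**2 + k**2/4).
def _sketch_jarque_bera_by_hand(seed=0):
    rng = np.random.default_rng(seed)
    x = rng.normal(size=10_000)
    s = skew(x)
    k = kurtosis(x)  # Fisher (excess) kurtosis, as used above
    jb_manual = x.size / 6 * (s**2 + k**2 / 4)
    res = jarque_bera(x)
    assert np.isclose(res.statistic, jb_manual)
    return res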
1911#####################################
1912# FREQUENCY FUNCTIONS #
1913#####################################
1916def scoreatpercentile(a, per, limit=(), interpolation_method='fraction',
1917 axis=None):
1918 """Calculate the score at a given percentile of the input sequence.
1920 For example, the score at `per=50` is the median. If the desired quantile
1921 lies between two data points, we interpolate between them, according to
1922 the value of `interpolation`. If the parameter `limit` is provided, it
1923 should be a tuple (lower, upper) of two values.
1925 Parameters
1926 ----------
1927 a : array_like
1928 A 1-D array of values from which to extract score.
1929 per : array_like
1930 Percentile(s) at which to extract score. Values should be in range
1931 [0,100].
1932 limit : tuple, optional
1933 Tuple of two scalars, the lower and upper limits within which to
1934 compute the percentile. Values of `a` outside
1935 this (closed) interval will be ignored.
1936 interpolation_method : {'fraction', 'lower', 'higher'}, optional
1937 Specifies the interpolation method to use,
1938 when the desired quantile lies between two data points `i` and `j`
1939 The following options are available (default is 'fraction'):
1941 * 'fraction': ``i + (j - i) * fraction`` where ``fraction`` is the
1942 fractional part of the index surrounded by ``i`` and ``j``
1943 * 'lower': ``i``
1944 * 'higher': ``j``
1946 axis : int, optional
1947 Axis along which the percentiles are computed. Default is None. If
1948 None, compute over the whole array `a`.
1950 Returns
1951 -------
1952 score : float or ndarray
1953 Score at percentile(s).
1955 See Also
1956 --------
1957 percentileofscore, numpy.percentile
1959 Notes
1960 -----
1961 This function will become obsolete in the future.
1962 For NumPy 1.9 and higher, `numpy.percentile` provides all the functionality
1963 that `scoreatpercentile` provides, and it is significantly faster.
1964 Users with numpy >= 1.9 are therefore encouraged to use `numpy.percentile`
1965 instead.
1967 Examples
1968 --------
1969 >>> import numpy as np
1970 >>> from scipy import stats
1971 >>> a = np.arange(100)
1972 >>> stats.scoreatpercentile(a, 50)
1973 49.5
1975 """
1976 # adapted from NumPy's percentile function. When we require numpy >= 1.8,
1977 # the implementation of this function can be replaced by np.percentile.
1978 a = np.asarray(a)
1979 if a.size == 0:
1980 # empty array, return nan(s) with shape matching `per`
1981 if np.isscalar(per):
1982 return np.nan
1983 else:
1984 return np.full(np.asarray(per).shape, np.nan, dtype=np.float64)
1986 if limit:
1987 a = a[(limit[0] <= a) & (a <= limit[1])]
1989 sorted_ = np.sort(a, axis=axis)
1990 if axis is None:
1991 axis = 0
1993 return _compute_qth_percentile(sorted_, per, interpolation_method, axis)
1996# handle sequence of per's without calling sort multiple times
1997def _compute_qth_percentile(sorted_, per, interpolation_method, axis):
1998 if not np.isscalar(per):
1999 score = [_compute_qth_percentile(sorted_, i,
2000 interpolation_method, axis)
2001 for i in per]
2002 return np.array(score)
2004 if not (0 <= per <= 100):
2005 raise ValueError("percentile must be in the range [0, 100]")
2007 indexer = [slice(None)] * sorted_.ndim
2008 idx = per / 100. * (sorted_.shape[axis] - 1)
2010 if int(idx) != idx:
2011 # round fractional indices according to interpolation method
2012 if interpolation_method == 'lower':
2013 idx = int(np.floor(idx))
2014 elif interpolation_method == 'higher':
2015 idx = int(np.ceil(idx))
2016 elif interpolation_method == 'fraction':
2017 pass # keep idx as fraction and interpolate
2018 else:
2019 raise ValueError("interpolation_method can only be 'fraction', "
2020 "'lower' or 'higher'")
2022 i = int(idx)
2023 if i == idx:
2024 indexer[axis] = slice(i, i + 1)
2025 weights = array(1)
2026 sumval = 1.0
2027 else:
2028 indexer[axis] = slice(i, i + 2)
2029 j = i + 1
2030 weights = array([(j - idx), (idx - i)], float)
2031 wshape = [1] * sorted_.ndim
2032 wshape[axis] = 2
2033 weights.shape = wshape
2034 sumval = weights.sum()
2036 # Use np.add.reduce (== np.sum but a little faster) to coerce data type
2037 return np.add.reduce(sorted_[tuple(indexer)] * weights, axis=axis) / sumval
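# A minimal sketch (hypothetical helper, not part of the module) illustrating
# that, with the default 'fraction' interpolation implemented above,
# `scoreatpercentile` agrees with `numpy.percentile` (linear interpolation)
# on 1-D data.
def _sketch_scoreatpercentile_vs_numpy(seed=0):
    rng = np.random.default_rng(seed)
    a = rng.normal(size=101)
    per = [10, 25, 50, 75, 90]
    ours = scoreatpercentile(a, per)
    theirs = np.percentile(a, per)
    assert np.allclose(ours, theirs)
    return ours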
2040def percentileofscore(a, score, kind='rank', nan_policy='propagate'):
2041 """Compute the percentile rank of a score relative to a list of scores.
2043 A `percentileofscore` of, for example, 80% means that 80% of the
2044 scores in `a` are below the given score. In the case of gaps or
2045 ties, the exact definition depends on the optional keyword, `kind`.
2047 Parameters
2048 ----------
2049 a : array_like
2050 Array to which `score` is compared.
2051 score : array_like
2052 Scores to compute percentiles for.
2053 kind : {'rank', 'weak', 'strict', 'mean'}, optional
2054 Specifies the interpretation of the resulting score.
2055 The following options are available (default is 'rank'):
2057 * 'rank': Average percentage ranking of score. In case of multiple
2058 matches, average the percentage rankings of all matching scores.
2059 * 'weak': This kind corresponds to the definition of a cumulative
2060 distribution function. A percentileofscore of 80% means that 80%
2061 of values are less than or equal to the provided score.
2062 * 'strict': Similar to "weak", except that only values that are
2063 strictly less than the given score are counted.
2064 * 'mean': The average of the "weak" and "strict" scores, often used
2065 in testing. See https://en.wikipedia.org/wiki/Percentile_rank
2066 nan_policy : {'propagate', 'raise', 'omit'}, optional
2067 Specifies how to treat `nan` values in `a`.
2068 The following options are available (default is 'propagate'):
2070 * 'propagate': returns nan (for each value in `score`).
2071 * 'raise': throws an error
2072 * 'omit': performs the calculations ignoring nan values
2074 Returns
2075 -------
2076 pcos : float
2077 Percentile-position of score (0-100) relative to `a`.
2079 See Also
2080 --------
2081 numpy.percentile
2082 scipy.stats.scoreatpercentile, scipy.stats.rankdata
2084 Examples
2085 --------
2086 Three-quarters of the given values lie below a given score:
2088 >>> import numpy as np
2089 >>> from scipy import stats
2090 >>> stats.percentileofscore([1, 2, 3, 4], 3)
2091 75.0
2093 With multiple matches, note how the scores of the two matches, 0.6
2094 and 0.8 respectively, are averaged:
2096 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3)
2097 70.0
2099 Only 2/5 values are strictly less than 3:
2101 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='strict')
2102 40.0
2104 But 4/5 values are less than or equal to 3:
2106 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='weak')
2107 80.0
2109 The average between the weak and the strict scores is:
2111 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='mean')
2112 60.0
2114 Score arrays (of any dimensionality) are supported:
2116 >>> stats.percentileofscore([1, 2, 3, 3, 4], [2, 3])
2117 array([40., 70.])
2119 The inputs can be infinite:
2121 >>> stats.percentileofscore([-np.inf, 0, 1, np.inf], [1, 2, np.inf])
2122 array([75., 75., 100.])
2124 If `a` is empty, then the resulting percentiles are all `nan`:
2126 >>> stats.percentileofscore([], [1, 2])
2127 array([nan, nan])
2128 """
2130 a = np.asarray(a)
2131 n = len(a)
2132 score = np.asarray(score)
2134 # Nan treatment
2135 cna, npa = _contains_nan(a, nan_policy, use_summation=False)
2136 cns, nps = _contains_nan(score, nan_policy, use_summation=False)
2138 if (cna or cns) and nan_policy == 'raise':
2139 raise ValueError("The input contains nan values")
2141 if cns:
2142 # If a score is nan, then the output should be nan
2143 # (also if nan_policy is "omit", because it only applies to `a`)
2144 score = ma.masked_where(np.isnan(score), score)
2146 if cna:
2147 if nan_policy == "omit":
2148 # Don't count nans
2149 a = ma.masked_where(np.isnan(a), a)
2150 n = a.count()
2152 if nan_policy == "propagate":
2153 # All outputs should be nans
2154 n = 0
2156 # Cannot compare to empty list ==> nan
2157 if n == 0:
2158 perct = np.full_like(score, np.nan, dtype=np.float64)
2160 else:
2161 # Prepare broadcasting
2162 score = score[..., None]
2164 def count(x):
2165 return np.count_nonzero(x, -1)
2167 # Despite using masked_array to omit nan values from processing,
2168 # the CI tests on "Azure pipelines" (but not on the other CI servers)
2169 # emit warnings when there are nan values, contrary to the purpose
2170 # of masked_arrays. As a fix, we simply suppress the warnings.
2171 with suppress_warnings() as sup:
2172 sup.filter(RuntimeWarning,
2173 "invalid value encountered in less")
2174 sup.filter(RuntimeWarning,
2175 "invalid value encountered in greater")
2177 # Main computations/logic
2178 if kind == 'rank':
2179 left = count(a < score)
2180 right = count(a <= score)
2181 plus1 = left < right
2182 perct = (left + right + plus1) * (50.0 / n)
2183 elif kind == 'strict':
2184 perct = count(a < score) * (100.0 / n)
2185 elif kind == 'weak':
2186 perct = count(a <= score) * (100.0 / n)
2187 elif kind == 'mean':
2188 left = count(a < score)
2189 right = count(a <= score)
2190 perct = (left + right) * (50.0 / n)
2191 else:
2192 raise ValueError(
2193 "kind can only be 'rank', 'strict', 'weak' or 'mean'")
2195 # Re-insert nan values
2196 perct = ma.filled(perct, np.nan)
2198 if perct.ndim == 0:
2199 return perct[()]
2200 return perct
2203HistogramResult = namedtuple('HistogramResult',
2204 ('count', 'lowerlimit', 'binsize', 'extrapoints'))
2207def _histogram(a, numbins=10, defaultlimits=None, weights=None,
2208 printextras=False):
2209 """Create a histogram.
2211 Separate the range into several bins and return the number of instances
2212 in each bin.
2214 Parameters
2215 ----------
2216 a : array_like
2217 Array of scores which will be put into bins.
2218 numbins : int, optional
2219 The number of bins to use for the histogram. Default is 10.
2220 defaultlimits : tuple (lower, upper), optional
2221 The lower and upper values for the range of the histogram.
2222 If no value is given, a range slightly larger than the range of the
2223 values in a is used. Specifically ``(a.min() - s, a.max() + s)``,
2224 where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
2225 weights : array_like, optional
2226 The weights for each value in `a`. Default is None, which gives each
2227 value a weight of 1.0
2228 printextras : bool, optional
2229 If True and there are extra points (i.e. points that fall outside
2230 the bin limits), a warning is raised reporting how many such points
2231 there are. Default is False.
2233 Returns
2234 -------
2235 count : ndarray
2236 Number of points (or sum of weights) in each bin.
2237 lowerlimit : float
2238 Lowest value of histogram, the lower limit of the first bin.
2239 binsize : float
2240 The size of the bins (all bins have the same size).
2241 extrapoints : int
2242 The number of points outside the range of the histogram.
2244 See Also
2245 --------
2246 numpy.histogram
2248 Notes
2249 -----
2250 This histogram is based on numpy's histogram but, by default, uses a
2251 slightly larger range if `defaultlimits` is not set.
2253 """
2254 a = np.ravel(a)
2255 if defaultlimits is None:
2256 if a.size == 0:
2257 # handle empty arrays. Undetermined range, so use 0-1.
2258 defaultlimits = (0, 1)
2259 else:
2260 # no range given, so use values in `a`
2261 data_min = a.min()
2262 data_max = a.max()
2263 # Have bins extend past min and max values slightly
2264 s = (data_max - data_min) / (2. * (numbins - 1.))
2265 defaultlimits = (data_min - s, data_max + s)
2267 # use numpy's histogram method to compute bins
2268 hist, bin_edges = np.histogram(a, bins=numbins, range=defaultlimits,
2269 weights=weights)
2271 # hist is not always float; convert to stay consistent with the old output
2271 hist = np.array(hist, dtype=float)
2272 # fixed width for bins is assumed, as numpy's histogram gives
2273 # fixed width bins for int values for 'bins'
2274 binsize = bin_edges[1] - bin_edges[0]
2275 # calculate number of extra points
2276 extrapoints = len([v for v in a
2277 if defaultlimits[0] > v or v > defaultlimits[1]])
2278 if extrapoints > 0 and printextras:
2279 warnings.warn("Points outside given histogram range = %s"
2280 % extrapoints)
2282 return HistogramResult(hist, defaultlimits[0], binsize, extrapoints)
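# A minimal sketch (hypothetical helper, not part of the module) of the
# default limits chosen above: the bins extend half a bin-width beyond the
# data range, and the counts then agree with `numpy.histogram` computed over
# that same range.
def _sketch_histogram_default_limits(seed=0):
    rng = np.random.default_rng(seed)
    a = rng.normal(size=500)
    numbins = 10
    res = _histogram(a, numbins=numbins)
    s = (a.max() - a.min()) / (2. * (numbins - 1.))
    limits = (a.min() - s, a.max() + s)
    counts, _ = np.histogram(a, bins=numbins, range=limits)
    assert np.allclose(res.count, counts)
    return res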
2285CumfreqResult = namedtuple('CumfreqResult',
2286 ('cumcount', 'lowerlimit', 'binsize',
2287 'extrapoints'))
2290def cumfreq(a, numbins=10, defaultreallimits=None, weights=None):
2291 """Return a cumulative frequency histogram, using the histogram function.
2293 A cumulative histogram is a mapping that counts the cumulative number of
2294 observations in all of the bins up to the specified bin.
2296 Parameters
2297 ----------
2298 a : array_like
2299 Input array.
2300 numbins : int, optional
2301 The number of bins to use for the histogram. Default is 10.
2302 defaultreallimits : tuple (lower, upper), optional
2303 The lower and upper values for the range of the histogram.
2304 If no value is given, a range slightly larger than the range of the
2305 values in `a` is used. Specifically ``(a.min() - s, a.max() + s)``,
2306 where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
2307 weights : array_like, optional
2308 The weights for each value in `a`. Default is None, which gives each
2309 value a weight of 1.0
2311 Returns
2312 -------
2313 cumcount : ndarray
2314 Binned values of cumulative frequency.
2315 lowerlimit : float
2316 Lower real limit
2317 binsize : float
2318 Width of each bin.
2319 extrapoints : int
2320 Extra points.
2322 Examples
2323 --------
2324 >>> import numpy as np
2325 >>> import matplotlib.pyplot as plt
2326 >>> from scipy import stats
2327 >>> rng = np.random.default_rng()
2328 >>> x = [1, 4, 2, 1, 3, 1]
2329 >>> res = stats.cumfreq(x, numbins=4, defaultreallimits=(1.5, 5))
2330 >>> res.cumcount
2331 array([ 1., 2., 3., 3.])
2332 >>> res.extrapoints
2333 3
2335 Create a normal distribution with 1000 random values
2337 >>> samples = stats.norm.rvs(size=1000, random_state=rng)
2339 Calculate cumulative frequencies
2341 >>> res = stats.cumfreq(samples, numbins=25)
2343 Calculate space of values for x
2345 >>> x = res.lowerlimit + np.linspace(0, res.binsize*res.cumcount.size,
2346 ... res.cumcount.size)
2348 Plot histogram and cumulative histogram
2350 >>> fig = plt.figure(figsize=(10, 4))
2351 >>> ax1 = fig.add_subplot(1, 2, 1)
2352 >>> ax2 = fig.add_subplot(1, 2, 2)
2353 >>> ax1.hist(samples, bins=25)
2354 >>> ax1.set_title('Histogram')
2355 >>> ax2.bar(x, res.cumcount, width=res.binsize)
2356 >>> ax2.set_title('Cumulative histogram')
2357 >>> ax2.set_xlim([x.min(), x.max()])
2359 >>> plt.show()
2361 """
2362 h, l, b, e = _histogram(a, numbins, defaultreallimits, weights=weights)
2363 cumhist = np.cumsum(h * 1, axis=0)
2364 return CumfreqResult(cumhist, l, b, e)
2367RelfreqResult = namedtuple('RelfreqResult',
2368 ('frequency', 'lowerlimit', 'binsize',
2369 'extrapoints'))
2372def relfreq(a, numbins=10, defaultreallimits=None, weights=None):
2373 """Return a relative frequency histogram, using the histogram function.
2375 A relative frequency histogram is a mapping of the number of
2376 observations in each of the bins relative to the total of observations.
2378 Parameters
2379 ----------
2380 a : array_like
2381 Input array.
2382 numbins : int, optional
2383 The number of bins to use for the histogram. Default is 10.
2384 defaultreallimits : tuple (lower, upper), optional
2385 The lower and upper values for the range of the histogram.
2386 If no value is given, a range slightly larger than the range of the
2387 values in a is used. Specifically ``(a.min() - s, a.max() + s)``,
2388 where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
2389 weights : array_like, optional
2390 The weights for each value in `a`. Default is None, which gives each
2391 value a weight of 1.0
2393 Returns
2394 -------
2395 frequency : ndarray
2396 Binned values of relative frequency.
2397 lowerlimit : float
2398 Lower real limit.
2399 binsize : float
2400 Width of each bin.
2401 extrapoints : int
2402 Extra points.
2404 Examples
2405 --------
2406 >>> import numpy as np
2407 >>> import matplotlib.pyplot as plt
2408 >>> from scipy import stats
2409 >>> rng = np.random.default_rng()
2410 >>> a = np.array([2, 4, 1, 2, 3, 2])
2411 >>> res = stats.relfreq(a, numbins=4)
2412 >>> res.frequency
2413 array([ 0.16666667, 0.5 , 0.16666667, 0.16666667])
2414 >>> np.sum(res.frequency) # relative frequencies should add up to 1
2415 1.0
2417 Create a normal distribution with 1000 random values
2419 >>> samples = stats.norm.rvs(size=1000, random_state=rng)
2421 Calculate relative frequencies
2423 >>> res = stats.relfreq(samples, numbins=25)
2425 Calculate space of values for x
2427 >>> x = res.lowerlimit + np.linspace(0, res.binsize*res.frequency.size,
2428 ... res.frequency.size)
2430 Plot relative frequency histogram
2432 >>> fig = plt.figure(figsize=(5, 4))
2433 >>> ax = fig.add_subplot(1, 1, 1)
2434 >>> ax.bar(x, res.frequency, width=res.binsize)
2435 >>> ax.set_title('Relative frequency histogram')
2436 >>> ax.set_xlim([x.min(), x.max()])
2438 >>> plt.show()
2440 """
2441 a = np.asanyarray(a)
2442 h, l, b, e = _histogram(a, numbins, defaultreallimits, weights=weights)
2443 h = h / a.shape[0]
2445 return RelfreqResult(h, l, b, e)
2448#####################################
2449# VARIABILITY FUNCTIONS #
2450#####################################
2452def obrientransform(*samples):
2453 """Compute the O'Brien transform on input data (any number of arrays).
2455 Used to test for homogeneity of variance prior to running one-way stats.
2456 Each array in ``*samples`` is one level of a factor.
2457 If `f_oneway` is run on the transformed data and found significant,
2458 the variances are unequal. From Maxwell and Delaney [1]_, p.112.
2460 Parameters
2461 ----------
2462 sample1, sample2, ... : array_like
2463 Any number of arrays.
2465 Returns
2466 -------
2467 obrientransform : ndarray
2468 Transformed data for use in an ANOVA. The first dimension
2469 of the result corresponds to the sequence of transformed
2470 arrays. If the arrays given are all 1-D of the same length,
2471 the return value is a 2-D array; otherwise it is a 1-D array
2472 of type object, with each element being an ndarray.
2474 References
2475 ----------
2476 .. [1] S. E. Maxwell and H. D. Delaney, "Designing Experiments and
2477 Analyzing Data: A Model Comparison Perspective", Wadsworth, 1990.
2479 Examples
2480 --------
2481 We'll test the following data sets for differences in their variance.
2483 >>> x = [10, 11, 13, 9, 7, 12, 12, 9, 10]
2484 >>> y = [13, 21, 5, 10, 8, 14, 10, 12, 7, 15]
2486 Apply the O'Brien transform to the data.
2488 >>> from scipy.stats import obrientransform
2489 >>> tx, ty = obrientransform(x, y)
2491 Use `scipy.stats.f_oneway` to apply a one-way ANOVA test to the
2492 transformed data.
2494 >>> from scipy.stats import f_oneway
2495 >>> F, p = f_oneway(tx, ty)
2496 >>> p
2497 0.1314139477040335
2499 If we require that ``p < 0.05`` for significance, we cannot conclude
2500 that the variances are different.
2502 """
2503 TINY = np.sqrt(np.finfo(float).eps)
2505 # `arrays` will hold the transformed arguments.
2506 arrays = []
2507 sLast = None
2509 for sample in samples:
2510 a = np.asarray(sample)
2511 n = len(a)
2512 mu = np.mean(a)
2513 sq = (a - mu)**2
2514 sumsq = sq.sum()
2516 # The O'Brien transform.
2517 t = ((n - 1.5) * n * sq - 0.5 * sumsq) / ((n - 1) * (n - 2))
2519 # Check that the mean of the transformed data is equal to the
2520 # original variance.
2521 var = sumsq / (n - 1)
2522 if abs(var - np.mean(t)) > TINY:
2523 raise ValueError('Lack of convergence in obrientransform.')
2525 arrays.append(t)
2526 sLast = a.shape
2528 if sLast:
2529 for arr in arrays[:-1]:
2530 if sLast != arr.shape:
2531 return np.array(arrays, dtype=object)
2532 return np.array(arrays)
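# A minimal sketch (hypothetical helper, not part of the module) of the
# property that the loop above verifies for each sample: the mean of the
# transformed values equals the (ddof=1) sample variance of the original data.
def _sketch_obrientransform_property():
    x = [10, 11, 13, 9, 7, 12, 12, 9, 10]
    (tx,) = obrientransform(x)
    assert np.isclose(np.mean(tx), np.var(x, ddof=1))
    return tx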
2535def sem(a, axis=0, ddof=1, nan_policy='propagate'):
2536 """Compute standard error of the mean.
2538 Calculate the standard error of the mean (or standard error of
2539 measurement) of the values in the input array.
2541 Parameters
2542 ----------
2543 a : array_like
2544 An array containing the values for which the standard error is
2545 returned.
2546 axis : int or None, optional
2547 Axis along which to operate. Default is 0. If None, compute over
2548 the whole array `a`.
2549 ddof : int, optional
2550 Delta degrees-of-freedom. How many degrees of freedom to adjust
2551 for bias in limited samples relative to the population estimate
2552 of variance. Defaults to 1.
2553 nan_policy : {'propagate', 'raise', 'omit'}, optional
2554 Defines how to handle when input contains nan.
2555 The following options are available (default is 'propagate'):
2557 * 'propagate': returns nan
2558 * 'raise': throws an error
2559 * 'omit': performs the calculations ignoring nan values
2561 Returns
2562 -------
2563 s : ndarray or float
2564 The standard error of the mean in the sample(s), along the input axis.
2566 Notes
2567 -----
2568 The default value for `ddof` is different from the default (0) used by other
2569 ddof-containing routines, such as np.std and np.nanstd.
2571 Examples
2572 --------
2573 Find standard error along the first axis:
2575 >>> import numpy as np
2576 >>> from scipy import stats
2577 >>> a = np.arange(20).reshape(5,4)
2578 >>> stats.sem(a)
2579 array([ 2.8284, 2.8284, 2.8284, 2.8284])
2581 Find standard error across the whole array, using n degrees of freedom:
2583 >>> stats.sem(a, axis=None, ddof=0)
2584 1.2893796958227628
2586 """
2587 a, axis = _chk_asarray(a, axis)
2589 contains_nan, nan_policy = _contains_nan(a, nan_policy)
2591 if contains_nan and nan_policy == 'omit':
2592 a = ma.masked_invalid(a)
2593 return mstats_basic.sem(a, axis, ddof)
2595 n = a.shape[axis]
2596 s = np.std(a, axis=axis, ddof=ddof) / np.sqrt(n)
2597 return s
2600def _isconst(x):
2601 """
2602 Check if all values in x are the same. nans are ignored.
2604 x must be a 1d array.
2606 The return value is a 1d array with length 1, so it can be used
2607 in np.apply_along_axis.
2608 """
2609 y = x[~np.isnan(x)]
2610 if y.size == 0:
2611 return np.array([True])
2612 else:
2613 return (y[0] == y).all(keepdims=True)
2616def _quiet_nanmean(x):
2617 """
2618 Compute nanmean for the 1d array x, but quietly return nan if x is all nan.
2620 The return value is a 1d array with length 1, so it can be used
2621 in np.apply_along_axis.
2622 """
2623 y = x[~np.isnan(x)]
2624 if y.size == 0:
2625 return np.array([np.nan])
2626 else:
2627 return np.mean(y, keepdims=True)
2630def _quiet_nanstd(x, ddof=0):
2631 """
2632 Compute nanstd for the 1d array x, but quietly return nan if x is all nan.
2634 The return value is a 1d array with length 1, so it can be used
2635 in np.apply_along_axis.
2636 """
2637 y = x[~np.isnan(x)]
2638 if y.size == 0:
2639 return np.array([np.nan])
2640 else:
2641 return np.std(y, keepdims=True, ddof=ddof)
2644def zscore(a, axis=0, ddof=0, nan_policy='propagate'):
2645 """
2646 Compute the z score.
2648 Compute the z score of each value in the sample, relative to the
2649 sample mean and standard deviation.
2651 Parameters
2652 ----------
2653 a : array_like
2654 An array like object containing the sample data.
2655 axis : int or None, optional
2656 Axis along which to operate. Default is 0. If None, compute over
2657 the whole array `a`.
2658 ddof : int, optional
2659 Degrees of freedom correction in the calculation of the
2660 standard deviation. Default is 0.
2661 nan_policy : {'propagate', 'raise', 'omit'}, optional
2662 Defines how to handle when input contains nan. 'propagate' returns nan,
2663 'raise' throws an error, 'omit' performs the calculations ignoring nan
2664 values. Default is 'propagate'. Note that when the value is 'omit',
2665 nans in the input also propagate to the output, but they do not affect
2666 the z-scores computed for the non-nan values.
2668 Returns
2669 -------
2670 zscore : array_like
2671 The z-scores, standardized by mean and standard deviation of
2672 input array `a`.
2674 Notes
2675 -----
2676 This function preserves ndarray subclasses, and works also with
2677 matrices and masked arrays (it uses `asanyarray` instead of
2678 `asarray` for parameters).
2680 Examples
2681 --------
2682 >>> import numpy as np
2683 >>> a = np.array([ 0.7972, 0.0767, 0.4383, 0.7866, 0.8091,
2684 ... 0.1954, 0.6307, 0.6599, 0.1065, 0.0508])
2685 >>> from scipy import stats
2686 >>> stats.zscore(a)
2687 array([ 1.1273, -1.247 , -0.0552, 1.0923, 1.1664, -0.8559, 0.5786,
2688 0.6748, -1.1488, -1.3324])
2690 Computing along a specified axis, using n-1 degrees of freedom
2691 (``ddof=1``) to calculate the standard deviation:
2693 >>> b = np.array([[ 0.3148, 0.0478, 0.6243, 0.4608],
2694 ... [ 0.7149, 0.0775, 0.6072, 0.9656],
2695 ... [ 0.6341, 0.1403, 0.9759, 0.4064],
2696 ... [ 0.5918, 0.6948, 0.904 , 0.3721],
2697 ... [ 0.0921, 0.2481, 0.1188, 0.1366]])
2698 >>> stats.zscore(b, axis=1, ddof=1)
2699 array([[-0.19264823, -1.28415119, 1.07259584, 0.40420358],
2700 [ 0.33048416, -1.37380874, 0.04251374, 1.00081084],
2701 [ 0.26796377, -1.12598418, 1.23283094, -0.37481053],
2702 [-0.22095197, 0.24468594, 1.19042819, -1.21416216],
2703 [-0.82780366, 1.4457416 , -0.43867764, -0.1792603 ]])
2705 An example with `nan_policy='omit'`:
2707 >>> x = np.array([[25.11, 30.10, np.nan, 32.02, 43.15],
2708 ... [14.95, 16.06, 121.25, 94.35, 29.81]])
2709 >>> stats.zscore(x, axis=1, nan_policy='omit')
2710 array([[-1.13490897, -0.37830299, nan, -0.08718406, 1.60039602],
2711 [-0.91611681, -0.89090508, 1.4983032 , 0.88731639, -0.5785977 ]])
2712 """
2713 return zmap(a, a, axis=axis, ddof=ddof, nan_policy=nan_policy)
2716def gzscore(a, *, axis=0, ddof=0, nan_policy='propagate'):
2717 """
2718 Compute the geometric standard score.
2720 Compute the geometric z score of each strictly positive value in the
2721 sample, relative to the geometric mean and standard deviation.
2722 Mathematically the geometric z score can be evaluated as::
2724 gzscore = log(a/gmu) / log(gsigma)
2726 where ``gmu`` (resp. ``gsigma``) is the geometric mean (resp. standard
2727 deviation).
2729 Parameters
2730 ----------
2731 a : array_like
2732 Sample data.
2733 axis : int or None, optional
2734 Axis along which to operate. Default is 0. If None, compute over
2735 the whole array `a`.
2736 ddof : int, optional
2737 Degrees of freedom correction in the calculation of the
2738 standard deviation. Default is 0.
2739 nan_policy : {'propagate', 'raise', 'omit'}, optional
2740 Defines how to handle when input contains nan. 'propagate' returns nan,
2741 'raise' throws an error, 'omit' performs the calculations ignoring nan
2742 values. Default is 'propagate'. Note that when the value is 'omit',
2743 nans in the input also propagate to the output, but they do not affect
2744 the geometric z scores computed for the non-nan values.
2746 Returns
2747 -------
2748 gzscore : array_like
2749 The geometric z scores, standardized by geometric mean and geometric
2750 standard deviation of input array `a`.
2752 See Also
2753 --------
2754 gmean : Geometric mean
2755 gstd : Geometric standard deviation
2756 zscore : Standard score
2758 Notes
2759 -----
2760 This function preserves ndarray subclasses, and works also with
2761 matrices and masked arrays (it uses ``asanyarray`` instead of
2762 ``asarray`` for parameters).
2764 .. versionadded:: 1.8
2766 Examples
2767 --------
2768 Draw samples from a log-normal distribution:
2770 >>> import numpy as np
2771 >>> from scipy.stats import zscore, gzscore
2772 >>> import matplotlib.pyplot as plt
2774 >>> rng = np.random.default_rng()
2775 >>> mu, sigma = 3., 1. # mean and standard deviation
2776 >>> x = rng.lognormal(mu, sigma, size=500)
2778 Display the histogram of the samples:
2780 >>> fig, ax = plt.subplots()
2781 >>> ax.hist(x, 50)
2782 >>> plt.show()
2784 Display the histogram of the samples standardized by the classical zscore.
2785 Distribution is rescaled but its shape is unchanged.
2787 >>> fig, ax = plt.subplots()
2788 >>> ax.hist(zscore(x), 50)
2789 >>> plt.show()
2791 Demonstrate that the distribution of geometric zscores is rescaled and
2792 quasinormal:
2794 >>> fig, ax = plt.subplots()
2795 >>> ax.hist(gzscore(x), 50)
2796 >>> plt.show()
2798 """
2799 a = np.asanyarray(a)
2800 log = ma.log if isinstance(a, ma.MaskedArray) else np.log
2802 return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan_policy)
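# A minimal sketch (hypothetical helper, not part of the module) confirming
# the formula quoted in the docstring above: the geometric z score equals
# log(a/gmu) / log(gsigma), with the geometric mean and geometric standard
# deviation computed using the same ddof as `gzscore` (default 0).
def _sketch_gzscore_formula(seed=0):
    rng = np.random.default_rng(seed)
    x = rng.lognormal(mean=3.0, sigma=1.0, size=100)
    gmu = gmean(x)
    gsigma = gstd(x, ddof=0)  # match gzscore's default ddof=0
    manual = np.log(x / gmu) / np.log(gsigma)
    assert np.allclose(gzscore(x), manual)
    return manual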
2805def zmap(scores, compare, axis=0, ddof=0, nan_policy='propagate'):
2806 """
2807 Calculate the relative z-scores.
2809 Return an array of z-scores, i.e., scores that are standardized to
2810 zero mean and unit variance, where mean and variance are calculated
2811 from the comparison array.
2813 Parameters
2814 ----------
2815 scores : array_like
2816 The input for which z-scores are calculated.
2817 compare : array_like
2818 The input from which the mean and standard deviation of the
2819 normalization are taken; assumed to have the same dimension as
2820 `scores`.
2821 axis : int or None, optional
2822 Axis over which mean and variance of `compare` are calculated.
2823 Default is 0. If None, compute over the whole array `scores`.
2824 ddof : int, optional
2825 Degrees of freedom correction in the calculation of the
2826 standard deviation. Default is 0.
2827 nan_policy : {'propagate', 'raise', 'omit'}, optional
2828 Defines how to handle the occurrence of nans in `compare`.
2829 'propagate' returns nan, 'raise' raises an exception, 'omit'
2830 performs the calculations ignoring nan values. Default is
2831 'propagate'. Note that when the value is 'omit', nans in `scores`
2832 also propagate to the output, but they do not affect the z-scores
2833 computed for the non-nan values.
2835 Returns
2836 -------
2837 zscore : array_like
2838 Z-scores, in the same shape as `scores`.
2840 Notes
2841 -----
2842 This function preserves ndarray subclasses, and works also with
2843 matrices and masked arrays (it uses `asanyarray` instead of
2844 `asarray` for parameters).
2846 Examples
2847 --------
2848 >>> from scipy.stats import zmap
2849 >>> a = [0.5, 2.0, 2.5, 3]
2850 >>> b = [0, 1, 2, 3, 4]
2851 >>> zmap(a, b)
2852 array([-1.06066017, 0. , 0.35355339, 0.70710678])
2854 """
2855 a = np.asanyarray(compare)
2857 if a.size == 0:
2858 return np.empty(a.shape)
2860 contains_nan, nan_policy = _contains_nan(a, nan_policy)
2862 if contains_nan and nan_policy == 'omit':
2863 if axis is None:
2864 mn = _quiet_nanmean(a.ravel())
2865 std = _quiet_nanstd(a.ravel(), ddof=ddof)
2866 isconst = _isconst(a.ravel())
2867 else:
2868 mn = np.apply_along_axis(_quiet_nanmean, axis, a)
2869 std = np.apply_along_axis(_quiet_nanstd, axis, a, ddof=ddof)
2870 isconst = np.apply_along_axis(_isconst, axis, a)
2871 else:
2872 mn = a.mean(axis=axis, keepdims=True)
2873 std = a.std(axis=axis, ddof=ddof, keepdims=True)
2874 if axis is None:
2875 isconst = (a.item(0) == a).all()
2876 else:
2877 isconst = (_first(a, axis) == a).all(axis=axis, keepdims=True)
2879 # Set std deviations that are 0 to 1 to avoid division by 0.
2880 std[isconst] = 1.0
2881 z = (scores - mn) / std
2882 # Set the outputs associated with a constant input to nan.
2883 z[np.broadcast_to(isconst, z.shape)] = np.nan
2884 return z
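# A minimal sketch (hypothetical helper, not part of the module) of the
# constant-input handling above: when `compare` has zero spread along the
# chosen axis, the corresponding z-scores are set to nan instead of raising a
# division-by-zero warning.
def _sketch_zmap_constant_compare():
    scores = np.array([1.0, 2.0, 3.0])
    compare = np.array([5.0, 5.0, 5.0])  # constant, so its std is 0
    z = zmap(scores, compare)
    assert np.isnan(z).all()
    return z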
2887def gstd(a, axis=0, ddof=1):
2888 """
2889 Calculate the geometric standard deviation of an array.
2891 The geometric standard deviation describes the spread of a set of numbers
2892 where the geometric mean is preferred. It is a multiplicative factor, and
2893 so is a dimensionless quantity.
2895 It is defined as the exponent of the standard deviation of ``log(a)``.
2896 Mathematically the population geometric standard deviation can be
2897 evaluated as::
2899 gstd = exp(std(log(a)))
2901 .. versionadded:: 1.3.0
2903 Parameters
2904 ----------
2905 a : array_like
2906 An array like object containing the sample data.
2907 axis : int, tuple or None, optional
2908 Axis along which to operate. Default is 0. If None, compute over
2909 the whole array `a`.
2910 ddof : int, optional
2911 Degree of freedom correction in the calculation of the
2912 geometric standard deviation. Default is 1.
2914 Returns
2915 -------
2916 ndarray or float
2917 An array of the geometric standard deviation. If `axis` is None or `a`
2918 is a 1d array a float is returned.
2920 See Also
2921 --------
2922 gmean : Geometric mean
2923 numpy.std : Standard deviation
2925 Notes
2926 -----
2927 As the calculation requires the use of logarithms the geometric standard
2928 deviation only supports strictly positive values. Any non-positive or
2929 infinite values will raise a `ValueError`.
2930 The geometric standard deviation is sometimes confused with the exponent of
2931 the standard deviation, ``exp(std(a))``. Instead the geometric standard
2932 deviation is ``exp(std(log(a)))``.
2933 The default value for `ddof` is different from the default value (0) used
2934 by other ddof-containing functions, such as ``np.std`` and ``np.nanstd``.
2936 References
2937 ----------
2938 .. [1] Kirkwood, T. B., "Geometric means and measures of dispersion",
2939 Biometrics, vol. 35, pp. 908-909, 1979
2941 Examples
2942 --------
2943 Find the geometric standard deviation of a log-normally distributed sample.
2944 Note that the standard deviation of the distribution is one, on a
2945 log scale this evaluates to approximately ``exp(1)``.
2947 >>> import numpy as np
2948 >>> from scipy.stats import gstd
2949 >>> rng = np.random.default_rng()
2950 >>> sample = rng.lognormal(mean=0, sigma=1, size=1000)
2951 >>> gstd(sample)
2952 2.810010162475324
2954 Compute the geometric standard deviation of a multidimensional array and
2955 of a given axis.
2957 >>> a = np.arange(1, 25).reshape(2, 3, 4)
2958 >>> gstd(a, axis=None)
2959 2.2944076136018947
2960 >>> gstd(a, axis=2)
2961 array([[1.82424757, 1.22436866, 1.13183117],
2962 [1.09348306, 1.07244798, 1.05914985]])
2963 >>> gstd(a, axis=(1,2))
2964 array([2.12939215, 1.22120169])
2966 The geometric standard deviation further handles masked arrays.
2968 >>> a = np.arange(1, 25).reshape(2, 3, 4)
2969 >>> ma = np.ma.masked_where(a > 16, a)
2970 >>> ma
2971 masked_array(
2972 data=[[[1, 2, 3, 4],
2973 [5, 6, 7, 8],
2974 [9, 10, 11, 12]],
2975 [[13, 14, 15, 16],
2976 [--, --, --, --],
2977 [--, --, --, --]]],
2978 mask=[[[False, False, False, False],
2979 [False, False, False, False],
2980 [False, False, False, False]],
2981 [[False, False, False, False],
2982 [ True, True, True, True],
2983 [ True, True, True, True]]],
2984 fill_value=999999)
2985 >>> gstd(ma, axis=2)
2986 masked_array(
2987 data=[[1.8242475707663655, 1.2243686572447428, 1.1318311657788478],
2988 [1.0934830582350938, --, --]],
2989 mask=[[False, False, False],
2990 [False, True, True]],
2991 fill_value=999999)
2993 """
2994 a = np.asanyarray(a)
2995 log = ma.log if isinstance(a, ma.MaskedArray) else np.log
2997 try:
2998 with warnings.catch_warnings():
2999 warnings.simplefilter("error", RuntimeWarning)
3000 return np.exp(np.std(log(a), axis=axis, ddof=ddof))
3001 except RuntimeWarning as w:
3002 if np.isinf(a).any():
3003 raise ValueError(
3004 'Infinite value encountered. The geometric standard deviation '
3005 'is defined for strictly positive values only.'
3006 ) from w
3007 a_nan = np.isnan(a)
3008 a_nan_any = a_nan.any()
3009 # exclude NaN's from negativity check, but
3010 # avoid expensive masking for arrays with no NaN
3011 if ((a_nan_any and np.less_equal(np.nanmin(a), 0)) or
3012 (not a_nan_any and np.less_equal(a, 0).any())):
3013 raise ValueError(
3014 'Non positive value encountered. The geometric standard '
3015 'deviation is defined for strictly positive values only.'
3016 ) from w
3017 elif 'Degrees of freedom <= 0 for slice' == str(w):
3018 raise ValueError(w) from w
3019 else:
3020 # Remaining warnings don't need to be exceptions.
3021 return np.exp(np.std(log(a, where=~a_nan), axis=axis, ddof=ddof))
3022 except TypeError as e:
3023 raise ValueError(
3024 'Invalid array input. The inputs could not be '
3025 'safely coerced to any supported types') from e
3028# Private dictionary initialized only once at module level
3029# See https://en.wikipedia.org/wiki/Robust_measures_of_scale
3030_scale_conversions = {'raw': 1.0,
3031 'normal': special.erfinv(0.5) * 2.0 * math.sqrt(2.0)}
3034def iqr(x, axis=None, rng=(25, 75), scale=1.0, nan_policy='propagate',
3035 interpolation='linear', keepdims=False):
3036 r"""
3037 Compute the interquartile range of the data along the specified axis.
3039 The interquartile range (IQR) is the difference between the 75th and
3040 25th percentile of the data. It is a measure of the dispersion
3041 similar to standard deviation or variance, but is much more robust
3042 against outliers [2]_.
3044 The ``rng`` parameter allows this function to compute other
3045 percentile ranges than the actual IQR. For example, setting
3046 ``rng=(0, 100)`` is equivalent to `numpy.ptp`.
3048 The IQR of an empty array is `np.nan`.
3050 .. versionadded:: 0.18.0
3052 Parameters
3053 ----------
3054 x : array_like
3055 Input array or object that can be converted to an array.
3056 axis : int or sequence of int, optional
3057 Axis along which the range is computed. The default is to
3058 compute the IQR for the entire array.
3059 rng : two-element sequence of floats in range [0, 100], optional
3060 Percentiles over which to compute the range. Each must be
3061 between 0 and 100, inclusive. The default is the true IQR:
3062 ``(25, 75)``. The order of the elements is not important.
3063 scale : scalar or str, optional
3064 The numerical value of scale will be divided out of the final
3065 result. The following string values are recognized:
3067 * 'raw' : No scaling, just return the raw IQR.
3068 **Deprecated!** Use ``scale=1`` instead.
3069 * 'normal' : Scale by
3070 :math:`2 \sqrt{2} erf^{-1}(\frac{1}{2}) \approx 1.349`.
3072 The default is 1.0. The use of ``scale='raw'`` is deprecated in favor
3073 of ``scale=1`` and will raise an error in SciPy 1.12.0.
3074 Array-like `scale` is also allowed, as long
3075 as it broadcasts correctly to the output such that
3076 ``out / scale`` is a valid operation. The output dimensions
3077 depend on the input array, `x`, the `axis` argument, and the
3078 `keepdims` flag.
3079 nan_policy : {'propagate', 'raise', 'omit'}, optional
3080 Defines how to handle when input contains nan.
3081 The following options are available (default is 'propagate'):
3083 * 'propagate': returns nan
3084 * 'raise': throws an error
3085 * 'omit': performs the calculations ignoring nan values
3086 interpolation : str, optional
3088 Specifies the interpolation method to use when the percentile
3089 boundaries lie between two data points ``i`` and ``j``.
3090 The following options are available (default is 'linear'):
3092 * 'linear': ``i + (j - i)*fraction``, where ``fraction`` is the
3093 fractional part of the index surrounded by ``i`` and ``j``.
3094 * 'lower': ``i``.
3095 * 'higher': ``j``.
3096 * 'nearest': ``i`` or ``j`` whichever is nearest.
3097 * 'midpoint': ``(i + j)/2``.
3099 For NumPy >= 1.22.0, the additional options provided by the ``method``
3100 keyword of `numpy.percentile` are also valid.
3102 keepdims : bool, optional
3103 If this is set to True, the reduced axes are left in the
3104 result as dimensions with size one. With this option, the result
3105 will broadcast correctly against the original array `x`.
3107 Returns
3108 -------
3109 iqr : scalar or ndarray
3110 If ``axis=None``, a scalar is returned. If the input contains
3111 integers or floats of smaller precision than ``np.float64``, then the
3112 output data-type is ``np.float64``. Otherwise, the output data-type is
3113 the same as that of the input.
3115 See Also
3116 --------
3117 numpy.std, numpy.var
3119 References
3120 ----------
3121 .. [1] "Interquartile range" https://en.wikipedia.org/wiki/Interquartile_range
3122 .. [2] "Robust measures of scale" https://en.wikipedia.org/wiki/Robust_measures_of_scale
3123 .. [3] "Quantile" https://en.wikipedia.org/wiki/Quantile
3125 Examples
3126 --------
3127 >>> import numpy as np
3128 >>> from scipy.stats import iqr
3129 >>> x = np.array([[10, 7, 4], [3, 2, 1]])
3130 >>> x
3131 array([[10, 7, 4],
3132 [ 3, 2, 1]])
3133 >>> iqr(x)
3134 4.0
3135 >>> iqr(x, axis=0)
3136 array([ 3.5, 2.5, 1.5])
3137 >>> iqr(x, axis=1)
3138 array([ 3., 1.])
3139 >>> iqr(x, axis=1, keepdims=True)
3140 array([[ 3.],
3141 [ 1.]])
3143 """
3144 x = asarray(x)
3146 # This check prevents percentile from raising an error later. Also, it is
3147 # consistent with `np.var` and `np.std`.
3148 if not x.size:
3149 return np.nan
3151 # An error may be raised here, so fail-fast, before doing lengthy
3152 # computations, even though `scale` is not used until later
3153 if isinstance(scale, str):
3154 scale_key = scale.lower()
3155 if scale_key not in _scale_conversions:
3156 raise ValueError("{0} not a valid scale for `iqr`".format(scale))
3157 if scale_key == 'raw':
3158 msg = ("The use of 'scale=\"raw\"' is deprecated infavor of "
3159 "'scale=1' and will raise an error in SciPy 1.12.0.")
3160 warnings.warn(msg, DeprecationWarning, stacklevel=2)
3161 scale = _scale_conversions[scale_key]
3163 # Select the percentile function to use based on nans and policy
3164 contains_nan, nan_policy = _contains_nan(x, nan_policy)
3166 if contains_nan and nan_policy == 'omit':
3167 percentile_func = np.nanpercentile
3168 else:
3169 percentile_func = np.percentile
3171 if len(rng) != 2:
3172 raise TypeError("quantile range must be two element sequence")
3174 if np.isnan(rng).any():
3175 raise ValueError("range must not contain NaNs")
3177 rng = sorted(rng)
3178 if NumpyVersion(np.__version__) >= '1.22.0':
3179 pct = percentile_func(x, rng, axis=axis, method=interpolation,
3180 keepdims=keepdims)
3181 else:
3182 pct = percentile_func(x, rng, axis=axis, interpolation=interpolation,
3183 keepdims=keepdims)
3184 out = np.subtract(pct[1], pct[0])
3186 if scale != 1.0:
3187 out /= scale
3189 return out
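# A minimal sketch (hypothetical helper, not part of the module) of the
# 'normal' scaling defined above: dividing the IQR by 2*sqrt(2)*erfinv(0.5)
# (about 1.349) turns it into a robust estimate of the standard deviation, so
# for a large normal sample the scaled IQR is close to the true sigma.
def _sketch_iqr_normal_scale(seed=0):
    rng = np.random.default_rng(seed)
    x = rng.normal(loc=0.0, scale=2.0, size=100_000)
    scaled = iqr(x, scale='normal')
    # Expect roughly 2.0 (the true sigma), up to sampling error.
    return scaled, np.std(x)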
3192def _mad_1d(x, center, nan_policy):
3193 # Median absolute deviation for 1-d array x.
3194 # This is a helper function for `median_abs_deviation`; it assumes its
3195 # arguments have been validated already. In particular, x must be a
3196 # 1-d numpy array, center must be callable, and if nan_policy is not
3197 # 'propagate', it is assumed to be 'omit', because 'raise' is handled
3198 # in `median_abs_deviation`.
3199 # No warning is generated if x is empty or all nan.
3200 isnan = np.isnan(x)
3201 if isnan.any():
3202 if nan_policy == 'propagate':
3203 return np.nan
3204 x = x[~isnan]
3205 if x.size == 0:
3206 # MAD of an empty array is nan.
3207 return np.nan
3208 # Edge cases have been handled, so do the basic MAD calculation.
3209 med = center(x)
3210 mad = np.median(np.abs(x - med))
3211 return mad
3214def median_abs_deviation(x, axis=0, center=np.median, scale=1.0,
3215 nan_policy='propagate'):
3216 r"""
3217 Compute the median absolute deviation of the data along the given axis.
3219 The median absolute deviation (MAD, [1]_) computes the median over the
3220 absolute deviations from the median. It is a measure of dispersion
3221 similar to the standard deviation but more robust to outliers [2]_.
3223 The MAD of an empty array is ``np.nan``.
3225 .. versionadded:: 1.5.0
3227 Parameters
3228 ----------
3229 x : array_like
3230 Input array or object that can be converted to an array.
3231 axis : int or None, optional
3232 Axis along which the range is computed. Default is 0. If None, compute
3233 the MAD over the entire array.
3234 center : callable, optional
3235 A function that will return the central value. The default is to use
3236 np.median. Any user-defined function must have the
3237 signature ``func(arr, axis)``.
3238 scale : scalar or str, optional
3239 The numerical value of scale will be divided out of the final
3240 result. The default is 1.0. The string "normal" is also accepted,
3241 and results in `scale` being the inverse of the standard normal
3242 quantile function at 0.75, which is approximately 0.67449.
3243 Array-like scale is also allowed, as long as it broadcasts correctly
3244 to the output such that ``out / scale`` is a valid operation. The
3245 output dimensions depend on the input array, `x`, and the `axis`
3246 argument.
3247 nan_policy : {'propagate', 'raise', 'omit'}, optional
3248 Defines how to handle when input contains nan.
3249 The following options are available (default is 'propagate'):
3251 * 'propagate': returns nan
3252 * 'raise': throws an error
3253 * 'omit': performs the calculations ignoring nan values
3255 Returns
3256 -------
3257 mad : scalar or ndarray
3258 If ``axis=None``, a scalar is returned. If the input contains
3259 integers or floats of smaller precision than ``np.float64``, then the
3260 output data-type is ``np.float64``. Otherwise, the output data-type is
3261 the same as that of the input.
3263 See Also
3264 --------
3265 numpy.std, numpy.var, numpy.median, scipy.stats.iqr, scipy.stats.tmean,
3266 scipy.stats.tstd, scipy.stats.tvar
3268 Notes
3269 -----
3270 The `center` argument only affects the calculation of the central value
3271 around which the MAD is calculated. That is, passing in ``center=np.mean``
3272 will calculate the MAD around the mean - it will not calculate the *mean*
3273 absolute deviation.
3275 The input array may contain `inf`, but if `center` returns `inf`, the
3276 corresponding MAD for that data will be `nan`.
3278 References
3279 ----------
3280 .. [1] "Median absolute deviation",
3281 https://en.wikipedia.org/wiki/Median_absolute_deviation
3282 .. [2] "Robust measures of scale",
3283 https://en.wikipedia.org/wiki/Robust_measures_of_scale
3285 Examples
3286 --------
3287 When comparing the behavior of `median_abs_deviation` with ``np.std``,
3288 the latter is affected when we change a single value of an array to have an
3289 outlier value while the MAD hardly changes:
3291 >>> import numpy as np
3292 >>> from scipy import stats
3293 >>> x = stats.norm.rvs(size=100, scale=1, random_state=123456)
3294 >>> x.std()
3295 0.9973906394005013
3296 >>> stats.median_abs_deviation(x)
3297 0.82832610097857
3298 >>> x[0] = 345.6
3299 >>> x.std()
3300 34.42304872314415
3301 >>> stats.median_abs_deviation(x)
3302 0.8323442311590675
3304 Axis handling example:
3306 >>> x = np.array([[10, 7, 4], [3, 2, 1]])
3307 >>> x
3308 array([[10, 7, 4],
3309 [ 3, 2, 1]])
3310 >>> stats.median_abs_deviation(x)
3311 array([3.5, 2.5, 1.5])
3312 >>> stats.median_abs_deviation(x, axis=None)
3313 2.0
3315 Scale normal example:
3317 >>> x = stats.norm.rvs(size=1000000, scale=2, random_state=123456)
3318 >>> stats.median_abs_deviation(x)
3319 1.3487398527041636
3320 >>> stats.median_abs_deviation(x, scale='normal')
3321 1.9996446978061115
3323 """
3324 if not callable(center):
3325 raise TypeError("The argument 'center' must be callable. The given "
3326 f"value {repr(center)} is not callable.")
3328 # An error may be raised here, so fail-fast, before doing lengthy
3329 # computations, even though `scale` is not used until later
3330 if isinstance(scale, str):
3331 if scale.lower() == 'normal':
3332 scale = 0.6744897501960817 # special.ndtri(0.75)
3333 else:
3334 raise ValueError(f"{scale} is not a valid scale value.")
3336 x = asarray(x)
3338 # Consistent with `np.var` and `np.std`.
3339 if not x.size:
3340 if axis is None:
3341 return np.nan
3342 nan_shape = tuple(item for i, item in enumerate(x.shape) if i != axis)
3343 if nan_shape == ():
3344 # Return nan, not array(nan)
3345 return np.nan
3346 return np.full(nan_shape, np.nan)
3348 contains_nan, nan_policy = _contains_nan(x, nan_policy)
3350 if contains_nan:
3351 if axis is None:
3352 mad = _mad_1d(x.ravel(), center, nan_policy)
3353 else:
3354 mad = np.apply_along_axis(_mad_1d, axis, x, center, nan_policy)
3355 else:
3356 if axis is None:
3357 med = center(x, axis=None)
3358 mad = np.median(np.abs(x - med))
3359 else:
3360 # Wrap the call to center() in expand_dims() so it acts like
3361 # keepdims=True was used.
3362 med = np.expand_dims(center(x, axis=axis), axis)
3363 mad = np.median(np.abs(x - med), axis=axis)
3365 return mad / scale
3368#####################################
3369# TRIMMING FUNCTIONS #
3370#####################################
3373SigmaclipResult = namedtuple('SigmaclipResult', ('clipped', 'lower', 'upper'))
3376def sigmaclip(a, low=4., high=4.):
3377 """Perform iterative sigma-clipping of array elements.
3379 Starting from the full sample, all elements outside the critical range are
3380 removed, i.e. all elements of the input array `c` that satisfy either of
3381 the following conditions::
3383 c < mean(c) - std(c)*low
3384 c > mean(c) + std(c)*high
3386 The iteration continues with the updated sample until no
3387 elements are outside the (updated) range.
3389 Parameters
3390 ----------
3391 a : array_like
3392 Data array, will be raveled if not 1-D.
3393 low : float, optional
3394 Lower bound factor of sigma clipping. Default is 4.
3395 high : float, optional
3396 Upper bound factor of sigma clipping. Default is 4.
3398 Returns
3399 -------
3400 clipped : ndarray
3401 Input array with clipped elements removed.
3402 lower : float
3403 Lower threshold value used for clipping.
3404 upper : float
3405 Upper threshold value used for clipping.
3407 Examples
3408 --------
3409 >>> import numpy as np
3410 >>> from scipy.stats import sigmaclip
3411 >>> a = np.concatenate((np.linspace(9.5, 10.5, 31),
3412 ... np.linspace(0, 20, 5)))
3413 >>> fact = 1.5
3414 >>> c, low, upp = sigmaclip(a, fact, fact)
3415 >>> c
3416 array([ 9.96666667, 10. , 10.03333333, 10. ])
3417 >>> c.var(), c.std()
3418 (0.00055555555555555165, 0.023570226039551501)
3419 >>> low, c.mean() - fact*c.std(), c.min()
3420 (9.9646446609406727, 9.9646446609406727, 9.9666666666666668)
3421 >>> upp, c.mean() + fact*c.std(), c.max()
3422 (10.035355339059327, 10.035355339059327, 10.033333333333333)
3424 >>> a = np.concatenate((np.linspace(9.5, 10.5, 11),
3425 ... np.linspace(-100, -50, 3)))
3426 >>> c, low, upp = sigmaclip(a, 1.8, 1.8)
3427 >>> (c == np.linspace(9.5, 10.5, 11)).all()
3428 True
3430 """
3431 c = np.asarray(a).ravel()
3432 delta = 1
3433 while delta:
3434 c_std = c.std()
3435 c_mean = c.mean()
3436 size = c.size
3437 critlower = c_mean - c_std * low
3438 critupper = c_mean + c_std * high
3439 c = c[(c >= critlower) & (c <= critupper)]
3440 delta = size - c.size
3442 return SigmaclipResult(c, critlower, critupper)
3445def trimboth(a, proportiontocut, axis=0):
3446 """Slice off a proportion of items from both ends of an array.
3448 Slice off the passed proportion of items from both ends of the passed
3449 array (i.e., with `proportiontocut` = 0.1, slices leftmost 10% **and**
3450 rightmost 10% of scores). The trimmed values are the lowest and
3451 highest ones.
3452 Slice off less if proportion results in a non-integer slice index (i.e.
3453 conservatively slices off `proportiontocut`).
3455 Parameters
3456 ----------
3457 a : array_like
3458 Data to trim.
3459 proportiontocut : float
3460 Proportion (in range 0-1) of total data set to trim off each end.
3461 axis : int or None, optional
3462 Axis along which to trim data. Default is 0. If None, compute over
3463 the whole array `a`.
3465 Returns
3466 -------
3467 out : ndarray
3468 Trimmed version of array `a`. The order of the trimmed content
3469 is undefined.
3471 See Also
3472 --------
3473 trim_mean
3475 Examples
3476 --------
3477 Create an array of 10 values and trim 10% of those values from each end:
3479 >>> import numpy as np
3480 >>> from scipy import stats
3481 >>> a = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
3482 >>> stats.trimboth(a, 0.1)
3483 array([1, 3, 2, 4, 5, 6, 7, 8])
3485 Note that the elements of the input array are trimmed by value, but the
3486 output array is not necessarily sorted.
3488 The number of values to trim from each end is rounded down to the nearest integer. For
3489 instance, trimming 25% of the values from each end of an array of 10
3490 values will return an array of 6 values:
3492 >>> b = np.arange(10)
3493 >>> stats.trimboth(b, 1/4).shape
3494 (6,)
3496 Multidimensional arrays can be trimmed along any axis or across the entire
3497 array:
3499 >>> c = [2, 4, 6, 8, 0, 1, 3, 5, 7, 9]
3500 >>> d = np.array([a, b, c])
3501 >>> stats.trimboth(d, 0.4, axis=0).shape
3502 (1, 10)
3503 >>> stats.trimboth(d, 0.4, axis=1).shape
3504 (3, 2)
3505 >>> stats.trimboth(d, 0.4, axis=None).shape
3506 (6,)
3508 """
3509 a = np.asarray(a)
3511 if a.size == 0:
3512 return a
3514 if axis is None:
3515 a = a.ravel()
3516 axis = 0
3518 nobs = a.shape[axis]
3519 lowercut = int(proportiontocut * nobs)
3520 uppercut = nobs - lowercut
3521 if (lowercut >= uppercut):
3522 raise ValueError("Proportion too big.")
3524 atmp = np.partition(a, (lowercut, uppercut - 1), axis)
3526 sl = [slice(None)] * atmp.ndim
3527 sl[axis] = slice(lowercut, uppercut)
3528 return atmp[tuple(sl)]
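# Illustrative sketch (not part of the SciPy source): np.partition above only
# guarantees that the cut points end up in sorted position, which is all that
# trimming needs. The hypothetical helper below checks that the kept values
# match a full sort followed by slicing.
def _sketch_trimboth_vs_sorting():
    # Hypothetical demonstration only.
    import numpy as np
    from scipy import stats
    a = np.array([2, 4, 6, 8, 0, 1, 3, 5, 7, 9])
    proportiontocut = 0.2
    lowercut = int(proportiontocut * a.size)              # 2 values per end
    expected = np.sort(a)[lowercut:a.size - lowercut]
    assert np.array_equal(np.sort(stats.trimboth(a, proportiontocut)), expected)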
3531def trim1(a, proportiontocut, tail='right', axis=0):
3532 """Slice off a proportion from ONE end of the passed array distribution.
3534 If `proportiontocut` = 0.1, slices off 'leftmost' or 'rightmost'
3535 10% of scores. The lowest or highest values are trimmed (depending on
3536 the tail).
3537 Slice off less if proportion results in a non-integer slice index
3538 (i.e. conservatively slices off `proportiontocut` ).
3540 Parameters
3541 ----------
3542 a : array_like
3543 Input array.
3544 proportiontocut : float
3545 Fraction to cut off of 'left' or 'right' of distribution.
3546 tail : {'left', 'right'}, optional
3547 Defaults to 'right'.
3548 axis : int or None, optional
3549 Axis along which to trim data. Default is 0. If None, compute over
3550 the whole array `a`.
3552 Returns
3553 -------
3554 trim1 : ndarray
3555 Trimmed version of array `a`. The order of the trimmed content is
3556 undefined.
3558 Examples
3559 --------
3560 Create an array of 10 values and trim 20% of its lowest values:
3562 >>> import numpy as np
3563 >>> from scipy import stats
3564 >>> a = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
3565 >>> stats.trim1(a, 0.2, 'left')
3566 array([2, 4, 3, 5, 6, 7, 8, 9])
3568 Note that the elements of the input array are trimmed by value, but the
3569 output array is not necessarily sorted.
3571 The number of values to trim is rounded down to the nearest integer. For
3572 instance, trimming 25% of the values from an array of 10 values will
3573 return an array of 8 values:
3575 >>> b = np.arange(10)
3576 >>> stats.trim1(b, 1/4).shape
3577 (8,)
3579 Multidimensional arrays can be trimmed along any axis or across the entire
3580 array:
3582 >>> c = [2, 4, 6, 8, 0, 1, 3, 5, 7, 9]
3583 >>> d = np.array([a, b, c])
3584 >>> stats.trim1(d, 0.8, axis=0).shape
3585 (1, 10)
3586 >>> stats.trim1(d, 0.8, axis=1).shape
3587 (3, 2)
3588 >>> stats.trim1(d, 0.8, axis=None).shape
3589 (6,)
3591 """
3592 a = np.asarray(a)
3593 if axis is None:
3594 a = a.ravel()
3595 axis = 0
3597 nobs = a.shape[axis]
3599 # avoid possible corner case
3600 if proportiontocut >= 1:
3601 return []
3603 if tail.lower() == 'right':
3604 lowercut = 0
3605 uppercut = nobs - int(proportiontocut * nobs)
3607 elif tail.lower() == 'left':
3608 lowercut = int(proportiontocut * nobs)
3609 uppercut = nobs
3611 atmp = np.partition(a, (lowercut, uppercut - 1), axis)
3613 sl = [slice(None)] * atmp.ndim
3614 sl[axis] = slice(lowercut, uppercut)
3615 return atmp[tuple(sl)]
3618def trim_mean(a, proportiontocut, axis=0):
3619 """Return mean of array after trimming distribution from both tails.
3621 If `proportiontocut` = 0.1, slices off 'leftmost' and 'rightmost' 10% of
3622 scores. The input is sorted before slicing. Slices off less if proportion
3623 results in a non-integer slice index (i.e., conservatively slices off
3624 `proportiontocut` ).
3626 Parameters
3627 ----------
3628 a : array_like
3629 Input array.
3630 proportiontocut : float
3631 Fraction to cut off of both tails of the distribution.
3632 axis : int or None, optional
3633 Axis along which the trimmed means are computed. Default is 0.
3634 If None, compute over the whole array `a`.
3636 Returns
3637 -------
3638 trim_mean : ndarray
3639 Mean of trimmed array.
3641 See Also
3642 --------
3643 trimboth
3644 tmean : Compute the trimmed mean ignoring values outside given `limits`.
3646 Examples
3647 --------
3648 >>> import numpy as np
3649 >>> from scipy import stats
3650 >>> x = np.arange(20)
3651 >>> stats.trim_mean(x, 0.1)
3652 9.5
3653 >>> x2 = x.reshape(5, 4)
3654 >>> x2
3655 array([[ 0, 1, 2, 3],
3656 [ 4, 5, 6, 7],
3657 [ 8, 9, 10, 11],
3658 [12, 13, 14, 15],
3659 [16, 17, 18, 19]])
3660 >>> stats.trim_mean(x2, 0.25)
3661 array([ 8., 9., 10., 11.])
3662 >>> stats.trim_mean(x2, 0.25, axis=1)
3663 array([ 1.5, 5.5, 9.5, 13.5, 17.5])
3665 """
3666 a = np.asarray(a)
3668 if a.size == 0:
3669 return np.nan
3671 if axis is None:
3672 a = a.ravel()
3673 axis = 0
3675 nobs = a.shape[axis]
3676 lowercut = int(proportiontocut * nobs)
3677 uppercut = nobs - lowercut
3678 if (lowercut > uppercut):
3679 raise ValueError("Proportion too big.")
3681 atmp = np.partition(a, (lowercut, uppercut - 1), axis)
3683 sl = [slice(None)] * atmp.ndim
3684 sl[axis] = slice(lowercut, uppercut)
3685 return np.mean(atmp[tuple(sl)], axis=axis)
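# Illustrative sketch (not part of the SciPy source): the hypothetical helper
# below checks that the trimmed mean equals the plain mean of the sorted data
# with `lowercut` values dropped from each end.
def _sketch_trim_mean_by_hand():
    # Hypothetical demonstration only.
    import numpy as np
    from scipy import stats
    x = np.arange(20)
    proportiontocut = 0.1
    lowercut = int(proportiontocut * x.size)              # 2 values per end
    by_hand = np.sort(x)[lowercut:x.size - lowercut].mean()
    assert np.isclose(stats.trim_mean(x, proportiontocut), by_hand)  # 9.5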
3688F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue'))
3691def _create_f_oneway_nan_result(shape, axis):
3692 """
3693 This is a helper function for f_oneway for creating the return values
3694 in certain degenerate conditions. It creates return values that are
3695 all nan with the appropriate shape for the given `shape` and `axis`.
3696 """
3697 axis = np.core.multiarray.normalize_axis_index(axis, len(shape))
3698 shp = shape[:axis] + shape[axis+1:]
3699 if shp == ():
3700 f = np.nan
3701 prob = np.nan
3702 else:
3703 f = np.full(shp, fill_value=np.nan)
3704 prob = f.copy()
3705 return F_onewayResult(f, prob)
3708def _first(arr, axis):
3709 """Return arr[..., 0:1, ...] where 0:1 is in the `axis` position."""
3710 return np.take_along_axis(arr, np.array(0, ndmin=arr.ndim), axis)
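# Illustrative sketch (not part of the SciPy source): `np.array(0, ndmin=n)`
# used above builds an all-ones-shaped integer index, so take_along_axis keeps
# the leading slice while preserving the number of dimensions. The helper name
# below is hypothetical.
def _sketch_first_slice():
    # Hypothetical demonstration only.
    import numpy as np
    arr = np.arange(12).reshape(3, 4)
    idx = np.array(0, ndmin=arr.ndim)                     # shape (1, 1)
    out = np.take_along_axis(arr, idx, axis=1)            # same as arr[:, 0:1]
    assert np.array_equal(out, arr[:, 0:1])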
3713def f_oneway(*samples, axis=0):
3714 """Perform one-way ANOVA.
3716 The one-way ANOVA tests the null hypothesis that two or more groups have
3717 the same population mean. The test is applied to samples from two or
3718 more groups, possibly with differing sizes.
3720 Parameters
3721 ----------
3722 sample1, sample2, ... : array_like
3723 The sample measurements for each group. There must be at least
3724 two arguments. If the arrays are multidimensional, then all the
3725 dimensions of the array must be the same except for `axis`.
3726 axis : int, optional
3727 Axis of the input arrays along which the test is applied.
3728 Default is 0.
3730 Returns
3731 -------
3732 statistic : float
3733 The computed F statistic of the test.
3734 pvalue : float
3735 The associated p-value from the F distribution.
3737 Warns
3738 -----
3739 `~scipy.stats.ConstantInputWarning`
3740 Raised if all values within each of the input arrays are identical.
3741 In this case the F statistic is either infinite or isn't defined,
3742 so ``np.inf`` or ``np.nan`` is returned.
3744 `~scipy.stats.DegenerateDataWarning`
3745 Raised if the length of any input array is 0, or if all the input
3746 arrays have length 1. ``np.nan`` is returned for the F statistic
3747 and the p-value in these cases.
3749 Notes
3750 -----
3751 The ANOVA test has important assumptions that must be satisfied in order
3752 for the associated p-value to be valid.
3754 1. The samples are independent.
3755 2. Each sample is from a normally distributed population.
3756 3. The population standard deviations of the groups are all equal. This
3757 property is known as homoscedasticity.
3759 If these assumptions are not true for a given set of data, it may still
3760 be possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`) or
3761 the Alexander-Govern test (`scipy.stats.alexandergovern`) although with
3762 some loss of power.
3764 The length of each group must be at least one, and there must be at
3765 least one group with length greater than one. If these conditions
3766 are not satisfied, a warning is generated and (``np.nan``, ``np.nan``)
3767 is returned.
3769 If all values in each group are identical, and there exist at least two
3770 groups with different values, the function generates a warning and
3771 returns (``np.inf``, 0).
3773 If all values in all groups are the same, the function generates a warning
3774 and returns (``np.nan``, ``np.nan``).
3776 The algorithm is from Heiman [2]_, pp.394-7.
3778 References
3779 ----------
3780 .. [1] R. Lowry, "Concepts and Applications of Inferential Statistics",
3781 Chapter 14, 2014, http://vassarstats.net/textbook/
3783 .. [2] G.W. Heiman, "Understanding research methods and statistics: An
3784 integrated introduction for psychology", Houghton, Mifflin and
3785 Company, 2001.
3787 .. [3] G.H. McDonald, "Handbook of Biological Statistics", One-way ANOVA.
3788 http://www.biostathandbook.com/onewayanova.html
3790 Examples
3791 --------
3792 >>> import numpy as np
3793 >>> from scipy.stats import f_oneway
3795 Here are some data [3]_ on a shell measurement (the length of the anterior
3796 adductor muscle scar, standardized by dividing by length) in the mussel
3797 Mytilus trossulus from five locations: Tillamook, Oregon; Newport, Oregon;
3798 Petersburg, Alaska; Magadan, Russia; and Tvarminne, Finland, taken from a
3799 much larger data set used in McDonald et al. (1991).
3801 >>> tillamook = [0.0571, 0.0813, 0.0831, 0.0976, 0.0817, 0.0859, 0.0735,
3802 ... 0.0659, 0.0923, 0.0836]
3803 >>> newport = [0.0873, 0.0662, 0.0672, 0.0819, 0.0749, 0.0649, 0.0835,
3804 ... 0.0725]
3805 >>> petersburg = [0.0974, 0.1352, 0.0817, 0.1016, 0.0968, 0.1064, 0.105]
3806 >>> magadan = [0.1033, 0.0915, 0.0781, 0.0685, 0.0677, 0.0697, 0.0764,
3807 ... 0.0689]
3808 >>> tvarminne = [0.0703, 0.1026, 0.0956, 0.0973, 0.1039, 0.1045]
3809 >>> f_oneway(tillamook, newport, petersburg, magadan, tvarminne)
3810 F_onewayResult(statistic=7.121019471642447, pvalue=0.0002812242314534544)
3812 `f_oneway` accepts multidimensional input arrays. When the inputs
3813 are multidimensional and `axis` is not given, the test is performed
3814 along the first axis of the input arrays. For the following data, the
3815 test is performed three times, once for each column.
3817 >>> a = np.array([[9.87, 9.03, 6.81],
3818 ... [7.18, 8.35, 7.00],
3819 ... [8.39, 7.58, 7.68],
3820 ... [7.45, 6.33, 9.35],
3821 ... [6.41, 7.10, 9.33],
3822 ... [8.00, 8.24, 8.44]])
3823 >>> b = np.array([[6.35, 7.30, 7.16],
3824 ... [6.65, 6.68, 7.63],
3825 ... [5.72, 7.73, 6.72],
3826 ... [7.01, 9.19, 7.41],
3827 ... [7.75, 7.87, 8.30],
3828 ... [6.90, 7.97, 6.97]])
3829 >>> c = np.array([[3.31, 8.77, 1.01],
3830 ... [8.25, 3.24, 3.62],
3831 ... [6.32, 8.81, 5.19],
3832 ... [7.48, 8.83, 8.91],
3833 ... [8.59, 6.01, 6.07],
3834 ... [3.07, 9.72, 7.48]])
3835 >>> F, p = f_oneway(a, b, c)
3836 >>> F
3837 array([1.75676344, 0.03701228, 3.76439349])
3838 >>> p
3839 array([0.20630784, 0.96375203, 0.04733157])
3841 """
3842 if len(samples) < 2:
3843 raise TypeError('at least two inputs are required;'
3844 f' got {len(samples)}.')
3846 samples = [np.asarray(sample, dtype=float) for sample in samples]
3848 # ANOVA on N groups, each in its own array
3849 num_groups = len(samples)
3851 # We haven't explicitly validated axis, but if it is bad, this call of
3852 # np.concatenate will raise np.AxisError. The call will raise ValueError
3853 # if the dimensions of all the arrays, except the axis dimension, are not
3854 # the same.
3855 alldata = np.concatenate(samples, axis=axis)
3856 bign = alldata.shape[axis]
3858 # Check this after forming alldata, so shape errors are detected
3859 # and reported before checking for 0 length inputs.
3860 if any(sample.shape[axis] == 0 for sample in samples):
3861 warnings.warn(stats.DegenerateDataWarning('at least one input '
3862 'has length 0'))
3863 return _create_f_oneway_nan_result(alldata.shape, axis)
3865 # Must have at least one group with length greater than 1.
3866 if all(sample.shape[axis] == 1 for sample in samples):
3867 msg = ('all input arrays have length 1. f_oneway requires that at '
3868 'least one input has length greater than 1.')
3869 warnings.warn(stats.DegenerateDataWarning(msg))
3870 return _create_f_oneway_nan_result(alldata.shape, axis)
3872 # Check if all values within each group are identical, and if the common
3873 # value in at least one group is different from that in another group.
3874 # Based on https://github.com/scipy/scipy/issues/11669
3876 # If axis=0, say, and the groups have shape (n0, ...), (n1, ...), ...,
3877 # then is_const is a boolean array with shape (num_groups, ...).
3878 # It is True if the values within the groups along the axis slice are
3879 # identical. In the typical case where each input array is 1-d, is_const is
3880 # a 1-d array with length num_groups.
3881 is_const = np.concatenate(
3882 [(_first(sample, axis) == sample).all(axis=axis,
3883 keepdims=True)
3884 for sample in samples],
3885 axis=axis
3886 )
3888 # all_const is a boolean array with shape (...) (see previous comment).
3889 # It is True if the values within each group along the axis slice are
3890 # the same (e.g. [[3, 3, 3], [5, 5, 5, 5], [4, 4, 4]]).
3891 all_const = is_const.all(axis=axis)
3892 if all_const.any():
3893 msg = ("Each of the input arrays is constant; "
3894 "the F statistic is not defined or infinite")
3895 warnings.warn(stats.ConstantInputWarning(msg))
3897 # all_same_const is True if all the values in the groups along the axis=0
3898 # slice are the same (e.g. [[3, 3, 3], [3, 3, 3, 3], [3, 3, 3]]).
3899 all_same_const = (_first(alldata, axis) == alldata).all(axis=axis)
3901 # Determine the mean of the data, and subtract that from all inputs to a
3902 # variance (via sum_of_sq / sq_of_sum) calculation. Variance is invariant
3903 # to a shift in location, and centering all data around zero vastly
3904 # improves numerical stability.
3905 offset = alldata.mean(axis=axis, keepdims=True)
3906 alldata -= offset
3908 normalized_ss = _square_of_sums(alldata, axis=axis) / bign
3910 sstot = _sum_of_squares(alldata, axis=axis) - normalized_ss
3912 ssbn = 0
3913 for sample in samples:
3914 ssbn += _square_of_sums(sample - offset,
3915 axis=axis) / sample.shape[axis]
3917 # Naming: variables ending in bn/b are for "between treatments", wn/w are
3918 # for "within treatments"
3919 ssbn -= normalized_ss
3920 sswn = sstot - ssbn
3921 dfbn = num_groups - 1
3922 dfwn = bign - num_groups
3923 msb = ssbn / dfbn
3924 msw = sswn / dfwn
3925 with np.errstate(divide='ignore', invalid='ignore'):
3926 f = msb / msw
3928 prob = special.fdtrc(dfbn, dfwn, f) # equivalent to stats.f.sf
3930 # Fix any f values that should be inf or nan because the corresponding
3931 # inputs were constant.
3932 if np.isscalar(f):
3933 if all_same_const:
3934 f = np.nan
3935 prob = np.nan
3936 elif all_const:
3937 f = np.inf
3938 prob = 0.0
3939 else:
3940 f[all_const] = np.inf
3941 prob[all_const] = 0.0
3942 f[all_same_const] = np.nan
3943 prob[all_same_const] = np.nan
3945 return F_onewayResult(f, prob)
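# Illustrative sketch (not part of the SciPy source): the hypothetical helper
# below recomputes the one-way ANOVA F statistic from the between-groups and
# within-groups sums of squares for three small samples and compares it with
# `f_oneway` and the F distribution's survival function.
def _sketch_f_oneway_by_hand():
    # Hypothetical demonstration only; the data are made up.
    import numpy as np
    from scipy import stats
    groups = [np.array([6.9, 5.4, 5.8, 4.6, 4.0]),
              np.array([8.3, 6.8, 7.8, 9.2, 6.5]),
              np.array([8.0, 10.5, 8.1, 6.9, 9.3])]
    alldata = np.concatenate(groups)
    grand_mean = alldata.mean()
    ssbn = sum(g.size * (g.mean() - grand_mean) ** 2 for g in groups)
    sswn = sum(((g - g.mean()) ** 2).sum() for g in groups)
    dfbn, dfwn = len(groups) - 1, alldata.size - len(groups)
    f = (ssbn / dfbn) / (sswn / dfwn)
    p = stats.f.sf(f, dfbn, dfwn)
    res = stats.f_oneway(*groups)
    assert np.isclose(f, res.statistic) and np.isclose(p, res.pvalue)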
3948def alexandergovern(*samples, nan_policy='propagate'):
3949 """Performs the Alexander Govern test.
3951 The Alexander-Govern approximation tests the equality of k independent
3952 means in the face of heterogeneity of variance. The test is applied to
3953 samples from two or more groups, possibly with differing sizes.
3955 Parameters
3956 ----------
3957 sample1, sample2, ... : array_like
3958 The sample measurements for each group. There must be at least
3959 two samples.
3960 nan_policy : {'propagate', 'raise', 'omit'}, optional
3961 Defines how to handle when input contains nan.
3962 The following options are available (default is 'propagate'):
3964 * 'propagate': returns nan
3965 * 'raise': throws an error
3966 * 'omit': performs the calculations ignoring nan values
3968 Returns
3969 -------
3970 statistic : float
3971 The computed A statistic of the test.
3972 pvalue : float
3973 The associated p-value from the chi-squared distribution.
3975 Warns
3976 -----
3977 `~scipy.stats.ConstantInputWarning`
3978 Raised if an input is a constant array. The statistic is not defined
3979 in this case, so ``np.nan`` is returned.
3981 See Also
3982 --------
3983 f_oneway : one-way ANOVA
3985 Notes
3986 -----
3987 The use of this test relies on several assumptions.
3989 1. The samples are independent.
3990 2. Each sample is from a normally distributed population.
3991 3. Unlike `f_oneway`, this test does not assume homoscedasticity;
3992 instead, it relaxes the assumption of equal variances.
3994 Input samples must be finite, one dimensional, and have size greater than
3995 one.
3997 References
3998 ----------
3999 .. [1] Alexander, Ralph A., and Diane M. Govern. "A New and Simpler
4000 Approximation for ANOVA under Variance Heterogeneity." Journal
4001 of Educational Statistics, vol. 19, no. 2, 1994, pp. 91-101.
4002 JSTOR, www.jstor.org/stable/1165140. Accessed 12 Sept. 2020.
4004 Examples
4005 --------
4006 >>> from scipy.stats import alexandergovern
4008 Here are some data on annual percentage rate of interest charged on
4009 new car loans at nine of the largest banks in four American cities
4010 taken from the National Institute of Standards and Technology's
4011 ANOVA dataset.
4013 We use `alexandergovern` to test the null hypothesis that all cities
4014 have the same mean APR against the alternative that the cities do not
4015 all have the same mean APR. We decide that a significance level of 5%
4016 is required to reject the null hypothesis in favor of the alternative.
4018 >>> atlanta = [13.75, 13.75, 13.5, 13.5, 13.0, 13.0, 13.0, 12.75, 12.5]
4019 >>> chicago = [14.25, 13.0, 12.75, 12.5, 12.5, 12.4, 12.3, 11.9, 11.9]
4020 >>> houston = [14.0, 14.0, 13.51, 13.5, 13.5, 13.25, 13.0, 12.5, 12.5]
4021 >>> memphis = [15.0, 14.0, 13.75, 13.59, 13.25, 12.97, 12.5, 12.25,
4022 ... 11.89]
4023 >>> alexandergovern(atlanta, chicago, houston, memphis)
4024 AlexanderGovernResult(statistic=4.65087071883494,
4025 pvalue=0.19922132490385214)
4027 The p-value is 0.1992, indicating a nearly 20% chance of observing
4028 such an extreme value of the test statistic under the null hypothesis.
4029 This exceeds 5%, so we do not reject the null hypothesis in favor of
4030 the alternative.
4032 """
4033 samples = _alexandergovern_input_validation(samples, nan_policy)
4035 if np.any([(sample == sample[0]).all() for sample in samples]):
4036 msg = "An input array is constant; the statistic is not defined."
4037 warnings.warn(stats.ConstantInputWarning(msg))
4038 return AlexanderGovernResult(np.nan, np.nan)
4040 # The following formula numbers reference the equation described on
4041 # page 92 by Alexander, Govern. Formulas 5, 6, and 7 describe other
4042 # tests that serve as the basis for equation (8) but are not needed
4043 # to perform the test.
4045 # precalculate mean and length of each sample
4046 lengths = np.array([ma.count(sample) if nan_policy == 'omit'
4047 else len(sample) for sample in samples])
4048 means = np.array([np.mean(sample) for sample in samples])
4050 # (1) determine standard error of the mean for each sample
4051 standard_errors = [np.std(sample, ddof=1) / np.sqrt(length)
4052 for sample, length in zip(samples, lengths)]
4054 # (2) define a weight for each sample
4055 inv_sq_se = 1 / np.square(standard_errors)
4056 weights = inv_sq_se / np.sum(inv_sq_se)
4058 # (3) determine variance-weighted estimate of the common mean
4059 var_w = np.sum(weights * means)
4061 # (4) determine one-sample t statistic for each group
4062 t_stats = (means - var_w)/standard_errors
4064 # calculate parameters to be used in transformation
4065 v = lengths - 1
4066 a = v - .5
4067 b = 48 * a**2
4068 c = (a * np.log(1 + (t_stats ** 2)/v))**.5
4070 # (8) perform a normalizing transformation on t statistic
4071 z = (c + ((c**3 + 3*c)/b) -
4072 ((4*c**7 + 33*c**5 + 240*c**3 + 855*c) /
4073 (b**2*10 + 8*b*c**4 + 1000*b)))
4075 # (9) calculate statistic
4076 A = np.sum(np.square(z))
4078 # "[the p value is determined from] central chi-square random deviates
4079 # with k - 1 degrees of freedom". Alexander, Govern (94)
4080 p = distributions.chi2.sf(A, len(samples) - 1)
4081 return AlexanderGovernResult(A, p)
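# Illustrative sketch (not part of the SciPy source): a hypothetical usage
# comparison. With markedly unequal group variances, `alexandergovern` still
# tests equality of means, whereas `f_oneway` additionally assumes equal
# variances; the group parameters below are made up.
def _sketch_alexandergovern_usage():
    # Hypothetical demonstration only.
    import numpy as np
    from scipy import stats
    rng = np.random.default_rng(12345)
    g1 = rng.normal(loc=10.0, scale=1.0, size=30)
    g2 = rng.normal(loc=10.0, scale=5.0, size=30)   # same mean, wider spread
    g3 = rng.normal(loc=10.0, scale=9.0, size=30)
    return stats.alexandergovern(g1, g2, g3), stats.f_oneway(g1, g2, g3)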
4084def _alexandergovern_input_validation(samples, nan_policy):
4085 if len(samples) < 2:
4086 raise TypeError(f"2 or more inputs required, got {len(samples)}")
4088 # input arrays are flattened
4089 samples = [np.asarray(sample, dtype=float) for sample in samples]
4091 for i, sample in enumerate(samples):
4092 if np.size(sample) <= 1:
4093 raise ValueError("Input sample size must be greater than one.")
4094 if sample.ndim != 1:
4095 raise ValueError("Input samples must be one-dimensional")
4096 if np.isinf(sample).any():
4097 raise ValueError("Input samples must be finite.")
4099 contains_nan, nan_policy = _contains_nan(sample,
4100 nan_policy=nan_policy)
4101 if contains_nan and nan_policy == 'omit':
4102 samples[i] = ma.masked_invalid(sample)
4103 return samples
4106AlexanderGovernResult = make_dataclass("AlexanderGovernResult", ("statistic",
4107 "pvalue"))
4110def _pearsonr_fisher_ci(r, n, confidence_level, alternative):
4111 """
4112 Compute the confidence interval for Pearson's R.
4114 Fisher's transformation is used to compute the confidence interval
4115 (https://en.wikipedia.org/wiki/Fisher_transformation).
4116 """
4117 if r == 1:
4118 zr = np.inf
4119 elif r == -1:
4120 zr = -np.inf
4121 else:
4122 zr = np.arctanh(r)
4124 if n > 3:
4125 se = np.sqrt(1 / (n - 3))
4126 if alternative == "two-sided":
4127 h = special.ndtri(0.5 + confidence_level/2)
4128 zlo = zr - h*se
4129 zhi = zr + h*se
4130 rlo = np.tanh(zlo)
4131 rhi = np.tanh(zhi)
4132 elif alternative == "less":
4133 h = special.ndtri(confidence_level)
4134 zhi = zr + h*se
4135 rhi = np.tanh(zhi)
4136 rlo = -1.0
4137 else:
4138 # alternative == "greater":
4139 h = special.ndtri(confidence_level)
4140 zlo = zr - h*se
4141 rlo = np.tanh(zlo)
4142 rhi = 1.0
4143 else:
4144 rlo, rhi = -1.0, 1.0
4146 return ConfidenceInterval(low=rlo, high=rhi)
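# Illustrative sketch (not part of the SciPy source): the hypothetical helper
# below spells out the two-sided Fisher-transform interval above in ordinary
# notation and checks it against PearsonRResult.confidence_interval.
def _sketch_fisher_ci_by_hand():
    # Hypothetical demonstration only; the data are made up.
    import numpy as np
    from scipy import special, stats
    x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
    y = np.array([10.0, 9.0, 2.5, 6.0, 4.0, 3.0])
    res = stats.pearsonr(x, y)
    r, n, conf = res.statistic, len(x), 0.95
    zr = np.arctanh(r)
    se = 1.0 / np.sqrt(n - 3)
    h = special.ndtri(0.5 + conf / 2)           # two-sided normal quantile
    lo, hi = np.tanh(zr - h * se), np.tanh(zr + h * se)
    ci = res.confidence_interval(conf)
    assert np.isclose(lo, ci.low) and np.isclose(hi, ci.high)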
4149ConfidenceInterval = namedtuple('ConfidenceInterval', ['low', 'high'])
4151PearsonRResultBase = _make_tuple_bunch('PearsonRResultBase',
4152 ['statistic', 'pvalue'], [])
4155class PearsonRResult(PearsonRResultBase):
4156 """
4157 Result of `scipy.stats.pearsonr`
4159 Attributes
4160 ----------
4161 statistic : float
4162 Pearson product-moment correlation coefficient.
4163 pvalue : float
4164 The p-value associated with the chosen alternative.
4166 Methods
4167 -------
4168 confidence_interval
4169 Computes the confidence interval of the correlation
4170 coefficient `statistic` for the given confidence level.
4172 """
4173 def __init__(self, statistic, pvalue, alternative, n):
4174 super().__init__(statistic, pvalue)
4175 self._alternative = alternative
4176 self._n = n
4178 # add alias for consistency with other correlation functions
4179 self.correlation = statistic
4181 def confidence_interval(self, confidence_level=0.95):
4182 """
4183 The confidence interval for the correlation coefficient.
4185 Compute the confidence interval for the correlation coefficient
4186 ``statistic`` with the given confidence level.
4188 The confidence interval is computed using the Fisher transformation
4189 F(r) = arctanh(r) [1]_. When the sample pairs are drawn from a
4190 bivariate normal distribution, F(r) approximately follows a normal
4191 distribution with standard error ``1/sqrt(n - 3)``, where ``n`` is the
4192 length of the original samples along the calculation axis. When
4193 ``n <= 3``, this approximation does not yield a finite, real standard
4194 error, so we define the confidence interval to be -1 to 1.
4196 Parameters
4197 ----------
4198 confidence_level : float
4199 The confidence level for the calculation of the correlation
4200 coefficient confidence interval. Default is 0.95.
4202 Returns
4203 -------
4204 ci : namedtuple
4205 The confidence interval is returned in a ``namedtuple`` with
4206 fields `low` and `high`.
4208 References
4209 ----------
4210 .. [1] "Pearson correlation coefficient", Wikipedia,
4211 https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
4212 """
4213 return _pearsonr_fisher_ci(self.statistic, self._n, confidence_level,
4214 self._alternative)
4217def pearsonr(x, y, *, alternative='two-sided'):
4218 r"""
4219 Pearson correlation coefficient and p-value for testing non-correlation.
4221 The Pearson correlation coefficient [1]_ measures the linear relationship
4222 between two datasets. Like other correlation
4223 coefficients, this one varies between -1 and +1 with 0 implying no
4224 correlation. Correlations of -1 or +1 imply an exact linear relationship.
4225 Positive correlations imply that as x increases, so does y. Negative
4226 correlations imply that as x increases, y decreases.
4228 This function also performs a test of the null hypothesis that the
4229 distributions underlying the samples are uncorrelated and normally
4230 distributed. (See Kowalski [3]_
4231 for a discussion of the effects of non-normality of the input on the
4232 distribution of the correlation coefficient.)
4233 The p-value roughly indicates the probability of an uncorrelated system
4234 producing datasets that have a Pearson correlation at least as extreme
4235 as the one computed from these datasets.
4237 Parameters
4238 ----------
4239 x : (N,) array_like
4240 Input array.
4241 y : (N,) array_like
4242 Input array.
4243 alternative : {'two-sided', 'greater', 'less'}, optional
4244 Defines the alternative hypothesis. Default is 'two-sided'.
4245 The following options are available:
4247 * 'two-sided': the correlation is nonzero
4248 * 'less': the correlation is negative (less than zero)
4249 * 'greater': the correlation is positive (greater than zero)
4251 .. versionadded:: 1.9.0
4253 Returns
4254 -------
4255 result : `~scipy.stats._result_classes.PearsonRResult`
4256 An object with the following attributes:
4258 statistic : float
4259 Pearson product-moment correlation coefficient.
4260 pvalue : float
4261 The p-value associated with the chosen alternative.
4263 The object has the following method:
4265 confidence_interval(confidence_level=0.95)
4266 This method computes the confidence interval of the correlation
4267 coefficient `statistic` for the given confidence level.
4268 The confidence interval is returned in a ``namedtuple`` with
4269 fields `low` and `high`. See the Notes for more details.
4271 Warns
4272 -----
4273 `~scipy.stats.ConstantInputWarning`
4274 Raised if an input is a constant array. The correlation coefficient
4275 is not defined in this case, so ``np.nan`` is returned.
4277 `~scipy.stats.NearConstantInputWarning`
4278 Raised if an input is "nearly" constant. The array ``x`` is considered
4279 nearly constant if ``norm(x - mean(x)) < 1e-13 * abs(mean(x))``.
4280 Numerical errors in the calculation ``x - mean(x)`` in this case might
4281 result in an inaccurate calculation of r.
4283 See Also
4284 --------
4285 spearmanr : Spearman rank-order correlation coefficient.
4286 kendalltau : Kendall's tau, a correlation measure for ordinal data.
4288 Notes
4289 -----
4290 The correlation coefficient is calculated as follows:
4292 .. math::
4294 r = \frac{\sum (x - m_x) (y - m_y)}
4295 {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}
4297 where :math:`m_x` is the mean of the vector x and :math:`m_y` is
4298 the mean of the vector y.
4300 Under the assumption that x and y are drawn from
4301 independent normal distributions (so the population correlation coefficient
4302 is 0), the probability density function of the sample correlation
4303 coefficient r is ([1]_, [2]_):
4305 .. math::
4306 f(r) = \frac{{(1-r^2)}^{n/2-2}}{\mathrm{B}(\frac{1}{2},\frac{n}{2}-1)}
4308 where n is the number of samples, and B is the beta function. This
4309 is sometimes referred to as the exact distribution of r. This is
4310 the distribution that is used in `pearsonr` to compute the p-value.
4311 The distribution is a beta distribution on the interval [-1, 1],
4312 with equal shape parameters a = b = n/2 - 1. In terms of SciPy's
4313 implementation of the beta distribution, the distribution of r is::
4315 dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)
4317 The default p-value returned by `pearsonr` is a two-sided p-value. For a
4318 given sample with correlation coefficient r, the p-value is
4319 the probability that abs(r') of a random sample x' and y' drawn from
4320 the population with zero correlation would be greater than or equal
4321 to abs(r). In terms of the object ``dist`` shown above, the p-value
4322 for a given r and length n can be computed as::
4324 p = 2*dist.cdf(-abs(r))
4326 When n is 2, the above continuous distribution is not well-defined.
4327 One can interpret the limit of the beta distribution as the shape
4328 parameters a and b approach a = b = 0 as a discrete distribution with
4329 equal probability masses at r = 1 and r = -1. More directly, one
4330 can observe that, given the data x = [x1, x2] and y = [y1, y2], and
4331 assuming x1 != x2 and y1 != y2, the only possible values for r are 1
4332 and -1. Because abs(r') for any sample x' and y' with length 2 will
4333 be 1, the two-sided p-value for a sample of length 2 is always 1.
4335 For backwards compatibility, the object that is returned also behaves
4336 like a tuple of length two that holds the statistic and the p-value.
4338 References
4339 ----------
4340 .. [1] "Pearson correlation coefficient", Wikipedia,
4341 https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
4342 .. [2] Student, "Probable error of a correlation coefficient",
4343 Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310.
4344 .. [3] C. J. Kowalski, "On the Effects of Non-Normality on the Distribution
4345 of the Sample Product-Moment Correlation Coefficient"
4346 Journal of the Royal Statistical Society. Series C (Applied
4347 Statistics), Vol. 21, No. 1 (1972), pp. 1-12.
4349 Examples
4350 --------
4351 >>> import numpy as np
4352 >>> from scipy import stats
4353 >>> res = stats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4])
4354 >>> res
4355 PearsonRResult(statistic=-0.7426106572325056, pvalue=0.15055580885344558)
4356 >>> res.confidence_interval()
4357 ConfidenceInterval(low=-0.9816918044786463, high=0.40501116769030976)
4359 There is a linear dependence between x and y if y = a + b*x + e, where
4360 a,b are constants and e is a random error term, assumed to be independent
4361 of x. For simplicity, assume that x is standard normal, a=0, b=1 and let
4362 e follow a normal distribution with mean zero and standard deviation s>0.
4364 >>> rng = np.random.default_rng()
4365 >>> s = 0.5
4366 >>> x = stats.norm.rvs(size=500, random_state=rng)
4367 >>> e = stats.norm.rvs(scale=s, size=500, random_state=rng)
4368 >>> y = x + e
4369 >>> stats.pearsonr(x, y).statistic
4370 0.9001942438244763
4372 This should be close to the exact value given by
4374 >>> 1/np.sqrt(1 + s**2)
4375 0.8944271909999159
4377 For s=0.5, we observe a high level of correlation. In general, a large
4378 variance of the noise reduces the correlation, while the correlation
4379 approaches one as the variance of the error goes to zero.
4381 It is important to keep in mind that no correlation does not imply
4382 independence unless (x, y) is jointly normal. Correlation can even be zero
4383 when there is a very simple dependence structure: if X follows a
4384 standard normal distribution, let y = abs(x). Note that the correlation
4385 between x and y is zero. Indeed, since the expectation of x is zero,
4386 cov(x, y) = E[x*y]. By definition, this equals E[x*abs(x)] which is zero
4387 by symmetry. The following lines of code illustrate this observation:
4389 >>> y = np.abs(x)
4390 >>> stats.pearsonr(x, y)
4391 PearsonRResult(statistic=-0.05444919272687482, pvalue=0.22422294836207743)
4393 A non-zero correlation coefficient can be misleading. For example, if X has
4394 a standard normal distribution, define y = x if x < 0 and y = 0 otherwise.
4395 A simple calculation shows that corr(x, y) = sqrt(pi/(2*(pi-1))) = 0.8564...,
4396 implying a high level of correlation:
4398 >>> y = np.where(x < 0, x, 0)
4399 >>> stats.pearsonr(x, y)
4400 PearsonRResult(statistic=0.861985781588, pvalue=4.813432002751103e-149)
4402 This is unintuitive since there is no dependence between x and y if x is
4403 larger than zero, which happens in about half of the cases if we sample x and y.
4405 """
4406 n = len(x)
4407 if n != len(y):
4408 raise ValueError('x and y must have the same length.')
4410 if n < 2:
4411 raise ValueError('x and y must have length at least 2.')
4413 x = np.asarray(x)
4414 y = np.asarray(y)
4416 if (np.issubdtype(x.dtype, np.complexfloating)
4417 or np.issubdtype(y.dtype, np.complexfloating)):
4418 raise ValueError('This function does not support complex data')
4420 # If an input is constant, the correlation coefficient is not defined.
4421 if (x == x[0]).all() or (y == y[0]).all():
4422 msg = ("An input array is constant; the correlation coefficient "
4423 "is not defined.")
4424 warnings.warn(stats.ConstantInputWarning(msg))
4425 result = PearsonRResult(statistic=np.nan, pvalue=np.nan, n=n,
4426 alternative=alternative)
4427 return result
4429 # dtype is the data type for the calculations. This expression ensures
4430 # that the data type is at least 64 bit floating point. It might have
4431 # more precision if the input is, for example, np.longdouble.
4432 dtype = type(1.0 + x[0] + y[0])
4434 if n == 2:
4435 r = dtype(np.sign(x[1] - x[0])*np.sign(y[1] - y[0]))
4436 result = PearsonRResult(statistic=r, pvalue=1.0, n=n,
4437 alternative=alternative)
4438 return result
4440 xmean = x.mean(dtype=dtype)
4441 ymean = y.mean(dtype=dtype)
4443 # By using `astype(dtype)`, we ensure that the intermediate calculations
4444 # use at least 64 bit floating point.
4445 xm = x.astype(dtype) - xmean
4446 ym = y.astype(dtype) - ymean
4448 # Unlike np.linalg.norm or the expression sqrt((xm*xm).sum()),
4449 # scipy.linalg.norm(xm) does not overflow if xm is, for example,
4450 # [-5e210, 5e210, 3e200, -3e200]
4451 normxm = linalg.norm(xm)
4452 normym = linalg.norm(ym)
4454 threshold = 1e-13
4455 if normxm < threshold*abs(xmean) or normym < threshold*abs(ymean):
4456 # If all the values in x (likewise y) are very close to the mean,
4457 # the loss of precision that occurs in the subtraction xm = x - xmean
4458 # might result in large errors in r.
4459 msg = ("An input array is nearly constant; the computed "
4460 "correlation coefficient may be inaccurate.")
4461 warnings.warn(stats.NearConstantInputWarning(msg))
4463 r = np.dot(xm/normxm, ym/normym)
4465 # Presumably, if abs(r) > 1, then it is only some small artifact of
4466 # floating point arithmetic.
4467 r = max(min(r, 1.0), -1.0)
4469 # As explained in the docstring, the distribution of `r` under the null
4470 # hypothesis is the beta distribution on (-1, 1) with a = b = n/2 - 1.
4471 ab = n/2 - 1
4472 dist = stats.beta(ab, ab, loc=-1, scale=2)
4473 if alternative == 'two-sided':
4474 prob = 2*dist.sf(abs(r))
4475 elif alternative == 'less':
4476 prob = dist.cdf(r)
4477 elif alternative == 'greater':
4478 prob = dist.sf(r)
4479 else:
4480 raise ValueError('alternative must be one of '
4481 '["two-sided", "less", "greater"]')
4483 return PearsonRResult(statistic=r, pvalue=prob, n=n,
4484 alternative=alternative)
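# Illustrative sketch (not part of the SciPy source): the hypothetical helper
# below recomputes the two-sided p-value from the null beta distribution of r
# described in the docstring and compares it with `pearsonr`.
def _sketch_pearsonr_pvalue_from_beta():
    # Hypothetical demonstration only.
    import numpy as np
    from scipy import stats
    x = [1, 2, 3, 4, 5]
    y = [10, 9, 2.5, 6, 4]
    res = stats.pearsonr(x, y)
    n = len(x)
    ab = n / 2 - 1
    dist = stats.beta(ab, ab, loc=-1, scale=2)   # null distribution of r
    assert np.isclose(2 * dist.sf(abs(res.statistic)), res.pvalue)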
4487def fisher_exact(table, alternative='two-sided'):
4488 """Perform a Fisher exact test on a 2x2 contingency table.
4490 The null hypothesis is that the true odds ratio of the populations
4491 underlying the observations is one, and the observations were sampled
4492 from these populations under a condition: the marginals of the
4493 resulting table must equal those of the observed table. The statistic
4494 returned is the unconditional maximum likelihood estimate of the odds
4495 ratio, and the p-value is the probability under the null hypothesis of
4496 obtaining a table at least as extreme as the one that was actually
4497 observed. There are other possible choices of statistic and two-sided
4498 p-value definition associated with Fisher's exact test; please see the
4499 Notes for more information.
4501 Parameters
4502 ----------
4503 table : array_like of ints
4504 A 2x2 contingency table. Elements must be non-negative integers.
4505 alternative : {'two-sided', 'less', 'greater'}, optional
4506 Defines the alternative hypothesis.
4507 The following options are available (default is 'two-sided'):
4509 * 'two-sided': the odds ratio of the underlying population is not one
4510 * 'less': the odds ratio of the underlying population is less than one
4511 * 'greater': the odds ratio of the underlying population is greater
4512 than one
4514 See the Notes for more details.
4516 Returns
4517 -------
4518 res : SignificanceResult
4519 An object containing attributes:
4521 statistic : float
4522 This is the prior odds ratio, not a posterior estimate.
4523 pvalue : float
4524 The probability under the null hypothesis of obtaining a
4525 table at least as extreme as the one that was actually observed.
4527 See Also
4528 --------
4529 chi2_contingency : Chi-square test of independence of variables in a
4530 contingency table. This can be used as an alternative to
4531 `fisher_exact` when the numbers in the table are large.
4532 contingency.odds_ratio : Compute the odds ratio (sample or conditional
4533 MLE) for a 2x2 contingency table.
4534 barnard_exact : Barnard's exact test, which is a more powerful alternative
4535 than Fisher's exact test for 2x2 contingency tables.
4536 boschloo_exact : Boschloo's exact test, which is a more powerful alternative
4537 than Fisher's exact test for 2x2 contingency tables.
4539 Notes
4540 -----
4541 *Null hypothesis and p-values*
4543 The null hypothesis is that the true odds ratio of the populations
4544 underlying the observations is one, and the observations were sampled at
4545 random from these populations under a condition: the marginals of the
4546 resulting table must equal those of the observed table. Equivalently,
4547 the null hypothesis is that the input table is from the hypergeometric
4548 distribution with parameters (as used in `hypergeom`)
4549 ``M = a + b + c + d``, ``n = a + b`` and ``N = a + c``, where the
4550 input table is ``[[a, b], [c, d]]``. This distribution has support
4551 ``max(0, N + n - M) <= x <= min(N, n)``, or, in terms of the values
4552 in the input table, ``max(0, a - d) <= x <= a + min(b, c)``. ``x``
4553 can be interpreted as the upper-left element of a 2x2 table, so the
4554 tables in the distribution have form::
4556 [ x n - x ]
4557 [N - x M - (n + N) + x]
4559 For example, if::
4561 table = [6 2]
4562 [1 4]
4564 then the support is ``2 <= x <= 7``, and the tables in the distribution
4565 are::
4567 [2 6] [3 5] [4 4] [5 3] [6 2] [7 1]
4568 [5 0] [4 1] [3 2] [2 3] [1 4] [0 5]
4570 The probability of each table is given by the hypergeometric distribution
4571 ``hypergeom.pmf(x, M, n, N)``. For this example, these are (rounded to
4572 three significant digits)::
4574 x 2 3 4 5 6 7
4575 p 0.0163 0.163 0.408 0.326 0.0816 0.00466
4577 These can be computed with::
4579 >>> import numpy as np
4580 >>> from scipy.stats import hypergeom
4581 >>> table = np.array([[6, 2], [1, 4]])
4582 >>> M = table.sum()
4583 >>> n = table[0].sum()
4584 >>> N = table[:, 0].sum()
4585 >>> start, end = hypergeom.support(M, n, N)
4586 >>> hypergeom.pmf(np.arange(start, end+1), M, n, N)
4587 array([0.01631702, 0.16317016, 0.40792541, 0.32634033, 0.08158508,
4588 0.004662 ])
4590 The two-sided p-value is the probability that, under the null hypothesis,
4591 a random table would have a probability equal to or less than the
4592 probability of the input table. For our example, the probability of
4593 the input table (where ``x = 6``) is 0.0816. The x values where the
4594 probability does not exceed this are 2, 6 and 7, so the two-sided p-value
4595 is ``0.0163 + 0.0816 + 0.00466 ~= 0.10256``::
4597 >>> from scipy.stats import fisher_exact
4598 >>> res = fisher_exact(table, alternative='two-sided')
4599 >>> res.pvalue
4600 0.10256410256410257
4602 The one-sided p-value for ``alternative='greater'`` is the probability
4603 that a random table has ``x >= a``, which in our example is ``x >= 6``,
4604 or ``0.0816 + 0.00466 ~= 0.08626``::
4606 >>> res = fisher_exact(table, alternative='greater')
4607 >>> res.pvalue
4608 0.08624708624708627
4610 This is equivalent to computing the survival function of the
4611 distribution at ``x = 5`` (one less than ``x`` from the input table,
4612 because we want to include the probability of ``x = 6`` in the sum)::
4614 >>> hypergeom.sf(5, M, n, N)
4615 0.08624708624708627
4617 For ``alternative='less'``, the one-sided p-value is the probability
4618 that a random table has ``x <= a``, (i.e. ``x <= 6`` in our example),
4619 or ``0.0163 + 0.163 + 0.408 + 0.326 + 0.0816 ~= 0.9949``::
4621 >>> res = fisher_exact(table, alternative='less')
4622 >>> res.pvalue
4623 0.9953379953379957
4625 This is equivalent to computing the cumulative distribution function
4626 of the distribution at ``x = 6``:
4628 >>> hypergeom.cdf(6, M, n, N)
4629 0.9953379953379957
4631 *Odds ratio*
4633 The calculated odds ratio is different from the value computed by the
4634 R function ``fisher.test``. This implementation returns the "sample"
4635 or "unconditional" maximum likelihood estimate, while ``fisher.test``
4636 in R uses the conditional maximum likelihood estimate. To compute the
4637 conditional maximum likelihood estimate of the odds ratio, use
4638 `scipy.stats.contingency.odds_ratio`.
4640 Examples
4641 --------
4642 Say we spend a few days counting whales and sharks in the Atlantic and
4643 Indian oceans. In the Atlantic ocean we find 8 whales and 1 shark, in the
4644 Indian ocean 2 whales and 5 sharks. Then our contingency table is::
4646 Atlantic Indian
4647 whales 8 2
4648 sharks 1 5
4650 We use this table to find the p-value:
4652 >>> from scipy.stats import fisher_exact
4653 >>> res = fisher_exact([[8, 2], [1, 5]])
4654 >>> res.pvalue
4655 0.0349...
4657 The probability that we would observe this or an even more imbalanced ratio
4658 by chance is about 3.5%. A commonly used significance level is 5%--if we
4659 adopt that, we can therefore conclude that our observed imbalance is
4660 statistically significant; whales prefer the Atlantic while sharks prefer
4661 the Indian ocean.
4663 """
4664 hypergeom = distributions.hypergeom
4665 # int32 is not enough for the algorithm
4666 c = np.asarray(table, dtype=np.int64)
4667 if not c.shape == (2, 2):
4668 raise ValueError("The input `table` must be of shape (2, 2).")
4670 if np.any(c < 0):
4671 raise ValueError("All values in `table` must be nonnegative.")
4673 if 0 in c.sum(axis=0) or 0 in c.sum(axis=1):
4674 # If both values in a row or column are zero, the p-value is 1 and
4675 # the odds ratio is NaN.
4676 return SignificanceResult(np.nan, 1.0)
4678 if c[1, 0] > 0 and c[0, 1] > 0:
4679 oddsratio = c[0, 0] * c[1, 1] / (c[1, 0] * c[0, 1])
4680 else:
4681 oddsratio = np.inf
4683 n1 = c[0, 0] + c[0, 1]
4684 n2 = c[1, 0] + c[1, 1]
4685 n = c[0, 0] + c[1, 0]
4687 def pmf(x):
4688 return hypergeom.pmf(x, n1 + n2, n1, n)
4690 if alternative == 'less':
4691 pvalue = hypergeom.cdf(c[0, 0], n1 + n2, n1, n)
4692 elif alternative == 'greater':
4693 # Same formula as the 'less' case, but with the second column.
4694 pvalue = hypergeom.cdf(c[0, 1], n1 + n2, n1, c[0, 1] + c[1, 1])
4695 elif alternative == 'two-sided':
4696 mode = int((n + 1) * (n1 + 1) / (n1 + n2 + 2))
4697 pexact = hypergeom.pmf(c[0, 0], n1 + n2, n1, n)
4698 pmode = hypergeom.pmf(mode, n1 + n2, n1, n)
4700 epsilon = 1e-14
4701 gamma = 1 + epsilon
4703 if np.abs(pexact - pmode) / np.maximum(pexact, pmode) <= epsilon:
4704 return SignificanceResult(oddsratio, 1.)
4706 elif c[0, 0] < mode:
4707 plower = hypergeom.cdf(c[0, 0], n1 + n2, n1, n)
4708 if hypergeom.pmf(n, n1 + n2, n1, n) > pexact * gamma:
4709 return SignificanceResult(oddsratio, plower)
4711 guess = _binary_search(lambda x: -pmf(x), -pexact * gamma, mode, n)
4712 pvalue = plower + hypergeom.sf(guess, n1 + n2, n1, n)
4713 else:
4714 pupper = hypergeom.sf(c[0, 0] - 1, n1 + n2, n1, n)
4715 if hypergeom.pmf(0, n1 + n2, n1, n) > pexact * gamma:
4716 return SignificanceResult(oddsratio, pupper)
4718 guess = _binary_search(pmf, pexact * gamma, 0, mode)
4719 pvalue = pupper + hypergeom.cdf(guess, n1 + n2, n1, n)
4720 else:
4721 msg = "`alternative` should be one of {'two-sided', 'less', 'greater'}"
4722 raise ValueError(msg)
4724 pvalue = min(pvalue, 1.0)
4726 return SignificanceResult(oddsratio, pvalue)
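# Illustrative sketch (not part of the SciPy source): the hypothetical helper
# below reproduces the two-sided p-value by enumerating the hypergeometric
# support and summing the probabilities of tables no more likely than the
# observed one, as described in the Notes above.
def _sketch_fisher_two_sided_by_enumeration():
    # Hypothetical demonstration only.
    import numpy as np
    from scipy import stats
    from scipy.stats import hypergeom
    table = np.array([[6, 2], [1, 4]])
    M, n, N = table.sum(), table[0].sum(), table[:, 0].sum()
    lo, hi = hypergeom.support(M, n, N)
    probs = hypergeom.pmf(np.arange(lo, hi + 1), M, n, N)
    p_obs = hypergeom.pmf(table[0, 0], M, n, N)
    # Small relative tolerance for ties in probability, as in the code above.
    p_two_sided = probs[probs <= p_obs * (1 + 1e-14)].sum()
    res = stats.fisher_exact(table, alternative='two-sided')
    assert np.isclose(p_two_sided, res.pvalue)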
4729def spearmanr(a, b=None, axis=0, nan_policy='propagate',
4730 alternative='two-sided'):
4731 """Calculate a Spearman correlation coefficient with associated p-value.
4733 The Spearman rank-order correlation coefficient is a nonparametric measure
4734 of the monotonicity of the relationship between two datasets.
4735 Like other correlation coefficients,
4736 this one varies between -1 and +1 with 0 implying no correlation.
4737 Correlations of -1 or +1 imply an exact monotonic relationship. Positive
4738 correlations imply that as x increases, so does y. Negative correlations
4739 imply that as x increases, y decreases.
4741 The p-value roughly indicates the probability of an uncorrelated system
4742 producing datasets that have a Spearman correlation at least as extreme
4743 as the one computed from these datasets. Although calculation of the
4744 p-value does not make strong assumptions about the distributions underlying
4745 the samples, it is only accurate for very large samples (>500
4746 observations). For smaller sample sizes, consider a permutation test (see
4747 Examples section below).
4749 Parameters
4750 ----------
4751 a, b : 1D or 2D array_like, b is optional
4752 One or two 1-D or 2-D arrays containing multiple variables and
4753 observations. When these are 1-D, each represents a vector of
4754 observations of a single variable. For the behavior in the 2-D case,
4755 see under ``axis``, below.
4756 Both arrays need to have the same length in the ``axis`` dimension.
4757 axis : int or None, optional
4758 If axis=0 (default), then each column represents a variable, with
4759 observations in the rows. If axis=1, the relationship is transposed:
4760 each row represents a variable, while the columns contain observations.
4761 If axis=None, then both arrays will be raveled.
4762 nan_policy : {'propagate', 'raise', 'omit'}, optional
4763 Defines how to handle when input contains nan.
4764 The following options are available (default is 'propagate'):
4766 * 'propagate': returns nan
4767 * 'raise': throws an error
4768 * 'omit': performs the calculations ignoring nan values
4770 alternative : {'two-sided', 'less', 'greater'}, optional
4771 Defines the alternative hypothesis. Default is 'two-sided'.
4772 The following options are available:
4774 * 'two-sided': the correlation is nonzero
4775 * 'less': the correlation is negative (less than zero)
4776 * 'greater': the correlation is positive (greater than zero)
4778 .. versionadded:: 1.7.0
4780 Returns
4781 -------
4782 res : SignificanceResult
4783 An object containing attributes:
4785 statistic : float or ndarray (2-D square)
4786 Spearman correlation matrix or correlation coefficient (if only 2
4787 variables are given as parameters). Correlation matrix is square
4788 with length equal to total number of variables (columns or rows) in
4789 ``a`` and ``b`` combined.
4790 pvalue : float
4791 The p-value for a hypothesis test whose null hypothesis
4792 is that two sets of data have no ordinal correlation. See
4793 `alternative` above for alternative hypotheses. `pvalue` has the
4794 same shape as `statistic`.
4796 Warns
4797 -----
4798 `~scipy.stats.ConstantInputWarning`
4799 Raised if an input is a constant array. The correlation coefficient
4800 is not defined in this case, so ``np.nan`` is returned.
4802 References
4803 ----------
4804 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
4805 Probability and Statistics Tables and Formulae. Chapman & Hall: New
4806 York. 2000.
4807 Section 14.7
4808 .. [2] Kendall, M. G. and Stuart, A. (1973).
4809 The Advanced Theory of Statistics, Volume 2: Inference and Relationship.
4810 Griffin. 1973.
4811 Section 31.18
4813 Examples
4814 --------
4815 >>> import numpy as np
4816 >>> from scipy import stats
4817 >>> res = stats.spearmanr([1, 2, 3, 4, 5], [5, 6, 7, 8, 7])
4818 >>> res.statistic
4819 0.8207826816681233
4820 >>> res.pvalue
4821 0.08858700531354381
4822 >>> rng = np.random.default_rng()
4823 >>> x2n = rng.standard_normal((100, 2))
4824 >>> y2n = rng.standard_normal((100, 2))
4825 >>> res = stats.spearmanr(x2n)
4826 >>> res.statistic, res.pvalue
4827 (-0.07960396039603959, 0.4311168705769747)
4828 >>> res = stats.spearmanr(x2n[:, 0], x2n[:, 1])
4829 >>> res.statistic, res.pvalue
4830 (-0.07960396039603959, 0.4311168705769747)
4831 >>> res = stats.spearmanr(x2n, y2n)
4832 >>> res.statistic
4833 array([[ 1. , -0.07960396, -0.08314431, 0.09662166],
4834 [-0.07960396, 1. , -0.14448245, 0.16738074],
4835 [-0.08314431, -0.14448245, 1. , 0.03234323],
4836 [ 0.09662166, 0.16738074, 0.03234323, 1. ]])
4837 >>> res.pvalue
4838 array([[0. , 0.43111687, 0.41084066, 0.33891628],
4839 [0.43111687, 0. , 0.15151618, 0.09600687],
4840 [0.41084066, 0.15151618, 0. , 0.74938561],
4841 [0.33891628, 0.09600687, 0.74938561, 0. ]])
4842 >>> res = stats.spearmanr(x2n.T, y2n.T, axis=1)
4843 >>> res.statistic
4844 array([[ 1. , -0.07960396, -0.08314431, 0.09662166],
4845 [-0.07960396, 1. , -0.14448245, 0.16738074],
4846 [-0.08314431, -0.14448245, 1. , 0.03234323],
4847 [ 0.09662166, 0.16738074, 0.03234323, 1. ]])
4848 >>> res = stats.spearmanr(x2n, y2n, axis=None)
4849 >>> res.statistic, res.pvalue
4850 (0.044981624540613524, 0.5270803651336189)
4851 >>> res = stats.spearmanr(x2n.ravel(), y2n.ravel())
4852 >>> res.statistic, res.pvalue
4853 (0.044981624540613524, 0.5270803651336189)
4855 >>> rng = np.random.default_rng()
4856 >>> xint = rng.integers(10, size=(100, 2))
4857 >>> res = stats.spearmanr(xint)
4858 >>> res.statistic, res.pvalue
4859 (0.09800224850707953, 0.3320271757932076)
4861 For small samples, consider performing a permutation test instead of
4862 relying on the asymptotic p-value. Note that to calculate the null
4863 distribution of the statistic (for all possible pairings between
4864 observations in sample ``x`` and ``y``), only one of the two inputs needs
4865 to be permuted.
4867 >>> x = [1.76405235, 0.40015721, 0.97873798,
4868 ... 2.2408932, 1.86755799, -0.97727788]
4869 >>> y = [2.71414076, 0.2488, 0.87551913,
4870 ... 2.6514917, 2.01160156, 0.47699563]
4871 >>> def statistic(x): # permute only `x`
4872 ... return stats.spearmanr(x, y).statistic
4873 >>> res_exact = stats.permutation_test((x,), statistic,
4874 ... permutation_type='pairings')
4875 >>> res_asymptotic = stats.spearmanr(x, y)
4876 >>> res_exact.pvalue, res_asymptotic.pvalue # asymptotic pvalue is too low
4877 (0.10277777777777777, 0.07239650145772594)
4879 """
4880 if axis is not None and axis > 1:
4881 raise ValueError("spearmanr only handles 1-D or 2-D arrays, "
4882 "supplied axis argument {}, please use only "
4883 "values 0, 1 or None for axis".format(axis))
4885 a, axisout = _chk_asarray(a, axis)
4886 if a.ndim > 2:
4887 raise ValueError("spearmanr only handles 1-D or 2-D arrays")
4889 if b is None:
4890 if a.ndim < 2:
4891 raise ValueError("`spearmanr` needs at least 2 "
4892 "variables to compare")
4893 else:
4894 # Concatenate a and b, so that we now only have to handle the case
4895 # of a 2-D `a`.
4896 b, _ = _chk_asarray(b, axis)
4897 if axisout == 0:
4898 a = np.column_stack((a, b))
4899 else:
4900 a = np.row_stack((a, b))
4902 n_vars = a.shape[1 - axisout]
4903 n_obs = a.shape[axisout]
4904 if n_obs <= 1:
4905 # Handle empty arrays or single observations.
4906 res = SignificanceResult(np.nan, np.nan)
4907 res.correlation = np.nan
4908 return res
4910 warn_msg = ("An input array is constant; the correlation coefficient "
4911 "is not defined.")
4912 if axisout == 0:
4913 if (a[:, 0][0] == a[:, 0]).all() or (a[:, 1][0] == a[:, 1]).all():
4914 # If an input is constant, the correlation coefficient
4915 # is not defined.
4916 warnings.warn(stats.ConstantInputWarning(warn_msg))
4917 res = SignificanceResult(np.nan, np.nan)
4918 res.correlation = np.nan
4919 return res
4920 else: # case when axisout == 1 b/c a is 2 dim only
4921 if (a[0, :][0] == a[0, :]).all() or (a[1, :][0] == a[1, :]).all():
4922 # If an input is constant, the correlation coefficient
4923 # is not defined.
4924 warnings.warn(stats.ConstantInputWarning(warn_msg))
4925 res = SignificanceResult(np.nan, np.nan)
4926 res.correlation = np.nan
4927 return res
4929 a_contains_nan, nan_policy = _contains_nan(a, nan_policy)
4930 variable_has_nan = np.zeros(n_vars, dtype=bool)
4931 if a_contains_nan:
4932 if nan_policy == 'omit':
4933 return mstats_basic.spearmanr(a, axis=axis, nan_policy=nan_policy,
4934 alternative=alternative)
4935 elif nan_policy == 'propagate':
4936 if a.ndim == 1 or n_vars <= 2:
4937 res = SignificanceResult(np.nan, np.nan)
4938 res.correlation = np.nan
4939 return res
4940 else:
4941 # Keep track of variables with NaNs, set the outputs to NaN
4942 # only for those variables
4943 variable_has_nan = np.isnan(a).any(axis=axisout)
4945 a_ranked = np.apply_along_axis(rankdata, axisout, a)
4946 rs = np.corrcoef(a_ranked, rowvar=axisout)
4947 dof = n_obs - 2 # degrees of freedom
4949 # rs can have elements equal to 1, so avoid zero division warnings
4950 with np.errstate(divide='ignore'):
4951 # clip the small negative values possibly caused by rounding
4952 # errors before taking the square root
4953 t = rs * np.sqrt((dof/((rs+1.0)*(1.0-rs))).clip(0))
4955 t, prob = _ttest_finish(dof, t, alternative)
4957 # For backwards compatibility, return scalars when comparing 2 columns
4958 if rs.shape == (2, 2):
4959 res = SignificanceResult(rs[1, 0], prob[1, 0])
4960 res.correlation = rs[1, 0]
4961 return res
4962 else:
4963 rs[variable_has_nan, :] = np.nan
4964 rs[:, variable_has_nan] = np.nan
4965 res = SignificanceResult(rs, prob)
4966 res.correlation = rs
4967 return res
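# Illustrative sketch (not part of the SciPy source): for two 1-D samples the
# Spearman statistic is the Pearson correlation of the ranks, mirroring the
# rankdata/corrcoef computation above. The helper name is hypothetical.
def _sketch_spearman_as_pearson_on_ranks():
    # Hypothetical demonstration only.
    import numpy as np
    from scipy import stats
    x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    y = np.array([5.0, 6.0, 7.0, 8.0, 7.0])
    rho = stats.spearmanr(x, y).statistic
    rho_from_ranks = stats.pearsonr(stats.rankdata(x), stats.rankdata(y)).statistic
    assert np.isclose(rho, rho_from_ranks)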
4970def pointbiserialr(x, y):
4971 r"""Calculate a point biserial correlation coefficient and its p-value.
4973 The point biserial correlation is used to measure the relationship
4974 between a binary variable, x, and a continuous variable, y. Like other
4975 correlation coefficients, this one varies between -1 and +1 with 0
4976 implying no correlation. Correlations of -1 or +1 imply a determinative
4977 relationship.
4979 This function may be computed using a shortcut formula but produces the
4980 same result as `pearsonr`.
4982 Parameters
4983 ----------
4984 x : array_like of bools
4985 Input array.
4986 y : array_like
4987 Input array.
4989 Returns
4990 -------
4991 res: SignificanceResult
4992 An object containing attributes:
4994 statistic : float
4995 The R value.
4996 pvalue : float
4997 The two-sided p-value.
4999 Notes
5000 -----
5001 `pointbiserialr` uses a t-test with ``n-2`` degrees of freedom.
5002 It is equivalent to `pearsonr`.
5004 The value of the point-biserial correlation can be calculated from:
5006 .. math::
5008 r_{pb} = \frac{\overline{Y_{1}} -
5009 \overline{Y_{0}}}{s_{y}}\sqrt{\frac{N_{0} N_{1}}{N (N - 1)}}
5011 Where :math:`Y_{0}` and :math:`Y_{1}` are the means of the metric
5012 observations coded 0 and 1 respectively; :math:`N_{0}` and :math:`N_{1}`
5013 are the numbers of observations coded 0 and 1 respectively; :math:`N` is the
5014 total number of observations and :math:`s_{y}` is the standard
5015 deviation of all the metric observations.
5017 A value of :math:`r_{pb}` that is significantly different from zero is
5018 completely equivalent to a significant difference in means between the two
5019 groups. Thus, an independent groups t Test with :math:`N-2` degrees of
5020 freedom may be used to test whether :math:`r_{pb}` is nonzero. The
5021 relation between the t-statistic for comparing two independent groups and
5022 :math:`r_{pb}` is given by:
5024 .. math::
5026 t = \sqrt{N - 2}\frac{r_{pb}}{\sqrt{1 - r^{2}_{pb}}}
5028 References
5029 ----------
5030 .. [1] J. Lev, "The Point Biserial Coefficient of Correlation", Ann. Math.
5031 Statist., Vol. 20, no.1, pp. 125-126, 1949.
5033 .. [2] R.F. Tate, "Correlation Between a Discrete and a Continuous
5034 Variable. Point-Biserial Correlation.", Ann. Math. Statist., Vol. 25,
5035 no. 3, pp. 603-607, 1954.
5037 .. [3] D. Kornbrot "Point Biserial Correlation", In Wiley StatsRef:
5038 Statistics Reference Online (eds N. Balakrishnan, et al.), 2014.
5039 :doi:`10.1002/9781118445112.stat06227`
5041 Examples
5042 --------
5043 >>> import numpy as np
5044 >>> from scipy import stats
5045 >>> a = np.array([0, 0, 0, 1, 1, 1, 1])
5046 >>> b = np.arange(7)
5047 >>> stats.pointbiserialr(a, b)
5048 (0.8660254037844386, 0.011724811003954652)
5049 >>> stats.pearsonr(a, b)
5050 (0.86602540378443871, 0.011724811003954626)
5051 >>> np.corrcoef(a, b)
5052 array([[ 1. , 0.8660254],
5053 [ 0.8660254, 1. ]])
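The t-statistic of the equivalent pooled-variance two-sample t-test can be
recovered from :math:`r_{pb}` as described in the Notes. A minimal sketch,
reusing ``a`` and ``b`` from above (the names ``t_from_r`` and ``t_ind``
are purely illustrative):

>>> rpb, p = stats.pointbiserialr(a, b)
>>> t_from_r = np.sqrt(len(a) - 2) * rpb / np.sqrt(1 - rpb**2)
>>> t_ind = stats.ttest_ind(b[a == 1], b[a == 0]).statistic
>>> bool(np.isclose(t_from_r, t_ind))
True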
5055 """
5056 rpb, prob = pearsonr(x, y)
5057 # create result object with alias for backward compatibility
5058 res = SignificanceResult(rpb, prob)
5059 res.correlation = rpb
5060 return res
5063def kendalltau(x, y, initial_lexsort=None, nan_policy='propagate',
5064 method='auto', variant='b', alternative='two-sided'):
5065 """Calculate Kendall's tau, a correlation measure for ordinal data.
5067 Kendall's tau is a measure of the correspondence between two rankings.
5068 Values close to 1 indicate strong agreement, and values close to -1
5069 indicate strong disagreement. This implements two variants of Kendall's
5070 tau: tau-b (the default) and tau-c (also known as Stuart's tau-c). These
5071 differ only in how they are normalized to lie within the range -1 to 1;
5072 the hypothesis tests (their p-values) are identical. Kendall's original
5073 tau-a is not implemented separately because both tau-b and tau-c reduce
5074 to tau-a in the absence of ties.
5076 Parameters
5077 ----------
5078 x, y : array_like
5079 Arrays of rankings, of the same shape. If arrays are not 1-D, they
5080 will be flattened to 1-D.
5081 initial_lexsort : bool, optional, deprecated
5082 This argument is unused.
5084 .. deprecated:: 1.10.0
5085 `kendalltau` keyword argument `initial_lexsort` is deprecated as it
5086 is unused and will be removed in SciPy 1.12.0.
5087 nan_policy : {'propagate', 'raise', 'omit'}, optional
5088 Defines how to handle when input contains nan.
5089 The following options are available (default is 'propagate'):
5091 * 'propagate': returns nan
5092 * 'raise': throws an error
5093 * 'omit': performs the calculations ignoring nan values
5095 method : {'auto', 'asymptotic', 'exact'}, optional
5096 Defines which method is used to calculate the p-value [5]_.
5097 The following options are available (default is 'auto'):
5099 * 'auto': selects the appropriate method based on a trade-off
5100 between speed and accuracy
5101 * 'asymptotic': uses a normal approximation valid for large samples
5102 * 'exact': computes the exact p-value, but can only be used if no ties
5103 are present. As the sample size increases, the 'exact' computation
5104 time may grow and the result may lose some precision.
5105 variant : {'b', 'c'}, optional
5106 Defines which variant of Kendall's tau is returned. Default is 'b'.
5107 alternative : {'two-sided', 'less', 'greater'}, optional
5108 Defines the alternative hypothesis. Default is 'two-sided'.
5109 The following options are available:
5111 * 'two-sided': the rank correlation is nonzero
5112 * 'less': the rank correlation is negative (less than zero)
5113 * 'greater': the rank correlation is positive (greater than zero)
5115 Returns
5116 -------
5117 res : SignificanceResult
5118 An object containing attributes:
5120 statistic : float
5121 The tau statistic.
5122 pvalue : float
5123 The p-value for a hypothesis test whose null hypothesis is
5124 an absence of association, tau = 0.
5126 See Also
5127 --------
5128 spearmanr : Calculates a Spearman rank-order correlation coefficient.
5129 theilslopes : Computes the Theil-Sen estimator for a set of points (x, y).
5130 weightedtau : Computes a weighted version of Kendall's tau.
5132 Notes
5133 -----
5134 The definition of Kendall's tau that is used is [2]_::
5136 tau_b = (P - Q) / sqrt((P + Q + T) * (P + Q + U))
5138 tau_c = 2 (P - Q) / (n**2 * (m - 1) / m)
5140 where P is the number of concordant pairs, Q the number of discordant
5141 pairs, T the number of ties only in `x`, and U the number of ties only in
5142 `y`. If a tie occurs for the same pair in both `x` and `y`, it is not
5143 added to either T or U. n is the total number of samples, and m is the
5144 number of unique values in either `x` or `y`, whichever is smaller.
5146 References
5147 ----------
5148 .. [1] Maurice G. Kendall, "A New Measure of Rank Correlation", Biometrika
5149 Vol. 30, No. 1/2, pp. 81-93, 1938.
5150 .. [2] Maurice G. Kendall, "The treatment of ties in ranking problems",
5151 Biometrika Vol. 33, No. 3, pp. 239-251. 1945.
5152 .. [3] Gottfried E. Noether, "Elements of Nonparametric Statistics", John
5153 Wiley & Sons, 1967.
5154 .. [4] Peter M. Fenwick, "A new data structure for cumulative frequency
5155 tables", Software: Practice and Experience, Vol. 24, No. 3,
5156 pp. 327-336, 1994.
5157 .. [5] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
5158 Charles Griffin & Co., 1970.
5160 Examples
5161 --------
5162 >>> from scipy import stats
5163 >>> x1 = [12, 2, 1, 12, 2]
5164 >>> x2 = [1, 4, 7, 1, 0]
5165 >>> res = stats.kendalltau(x1, x2)
5166 >>> res.statistic
5167 -0.47140452079103173
5168 >>> res.pvalue
5169 0.2827454599327748
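A sketch of the tau-c variant defined in the Notes; for this data the
smaller number of unique values is 3, so the normalization differs from
tau-b (the statistic is rounded only to keep the output short):

>>> res_c = stats.kendalltau(x1, x2, variant='c')
>>> print(round(res_c.statistic, 2))
-0.48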
5171 """
5172 if initial_lexsort is not None:
5173 msg = ("'kendalltau' keyword argument 'initial_lexsort' is deprecated"
5174 " as it is unused and will be removed in SciPy 1.12.0.")
5175 warnings.warn(msg, DeprecationWarning, stacklevel=2)
5177 x = np.asarray(x).ravel()
5178 y = np.asarray(y).ravel()
5180 if x.size != y.size:
5181 raise ValueError("All inputs to `kendalltau` must be of the same "
5182 f"size, found x-size {x.size} and y-size {y.size}")
5183 elif not x.size or not y.size:
5184 # Return NaN if arrays are empty
5185 res = SignificanceResult(np.nan, np.nan)
5186 res.correlation = np.nan
5187 return res
5189 # check both x and y
5190 cnx, npx = _contains_nan(x, nan_policy)
5191 cny, npy = _contains_nan(y, nan_policy)
5192 contains_nan = cnx or cny
5193 if npx == 'omit' or npy == 'omit':
5194 nan_policy = 'omit'
5196 if contains_nan and nan_policy == 'propagate':
5197 res = SignificanceResult(np.nan, np.nan)
5198 res.correlation = np.nan
5199 return res
5201 elif contains_nan and nan_policy == 'omit':
5202 x = ma.masked_invalid(x)
5203 y = ma.masked_invalid(y)
5204 if variant == 'b':
5205 return mstats_basic.kendalltau(x, y, method=method, use_ties=True,
5206 alternative=alternative)
5207 else:
5208 message = ("nan_policy='omit' is currently compatible only with "
5209 "variant='b'.")
5210 raise ValueError(message)
5212 def count_rank_tie(ranks):
5213 cnt = np.bincount(ranks).astype('int64', copy=False)
5214 cnt = cnt[cnt > 1]
5215 return ((cnt * (cnt - 1) // 2).sum(),
5216 (cnt * (cnt - 1.) * (cnt - 2)).sum(),
5217 (cnt * (cnt - 1.) * (2*cnt + 5)).sum())
5219 size = x.size
5220 perm = np.argsort(y) # sort on y and convert y to dense ranks
5221 x, y = x[perm], y[perm]
5222 y = np.r_[True, y[1:] != y[:-1]].cumsum(dtype=np.intp)
5224 # stable sort on x and convert x to dense ranks
5225 perm = np.argsort(x, kind='mergesort')
5226 x, y = x[perm], y[perm]
5227 x = np.r_[True, x[1:] != x[:-1]].cumsum(dtype=np.intp)
5229 dis = _kendall_dis(x, y) # discordant pairs
5231 obs = np.r_[True, (x[1:] != x[:-1]) | (y[1:] != y[:-1]), True]
5232 cnt = np.diff(np.nonzero(obs)[0]).astype('int64', copy=False)
5234 ntie = (cnt * (cnt - 1) // 2).sum() # joint ties
5235 xtie, x0, x1 = count_rank_tie(x) # ties in x, stats
5236 ytie, y0, y1 = count_rank_tie(y) # ties in y, stats
5238 tot = (size * (size - 1)) // 2
5240 if xtie == tot or ytie == tot:
5241 res = SignificanceResult(np.nan, np.nan)
5242 res.correlation = np.nan
5243 return res
5245 # Note that tot = con + dis + (xtie - ntie) + (ytie - ntie) + ntie
5246 # = con + dis + xtie + ytie - ntie
5247 con_minus_dis = tot - xtie - ytie + ntie - 2 * dis
5248 if variant == 'b':
5249 tau = con_minus_dis / np.sqrt(tot - xtie) / np.sqrt(tot - ytie)
5250 elif variant == 'c':
5251 minclasses = min(len(set(x)), len(set(y)))
5252 tau = 2*con_minus_dis / (size**2 * (minclasses-1)/minclasses)
5253 else:
5254 raise ValueError(f"Unknown variant of the method chosen: {variant}. "
5255 "variant must be 'b' or 'c'.")
5257 # Limit range to fix computational errors
5258 tau = min(1., max(-1., tau))
5260 # The p-value calculation is the same for all variants since the p-value
5261 # depends only on con_minus_dis.
5262 if method == 'exact' and (xtie != 0 or ytie != 0):
5263 raise ValueError("Ties found, exact method cannot be used.")
5265 if method == 'auto':
5266 if (xtie == 0 and ytie == 0) and (size <= 33 or
5267 min(dis, tot-dis) <= 1):
5268 method = 'exact'
5269 else:
5270 method = 'asymptotic'
5272 if xtie == 0 and ytie == 0 and method == 'exact':
5273 pvalue = mstats_basic._kendall_p_exact(size, tot-dis, alternative)
5274 elif method == 'asymptotic':
5275 # con_minus_dis is approx normally distributed with this variance [3]_
5276 m = size * (size - 1.)
5277 var = ((m * (2*size + 5) - x1 - y1) / 18 +
5278 (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))
5279 z = con_minus_dis / np.sqrt(var)
5280 _, pvalue = _normtest_finish(z, alternative)
5281 else:
5282 raise ValueError(f"Unknown method {method} specified. Use 'auto', "
5283 "'exact' or 'asymptotic'.")
5285 # create result object with alias for backward compatibility
5286 res = SignificanceResult(tau, pvalue)
5287 res.correlation = tau
5288 return res
5291def weightedtau(x, y, rank=True, weigher=None, additive=True):
5292 r"""Compute a weighted version of Kendall's :math:`\tau`.
5294 The weighted :math:`\tau` is a weighted version of Kendall's
5295 :math:`\tau` in which exchanges of high weight are more influential than
5296 exchanges of low weight. The default parameters compute the additive
5297 hyperbolic version of the index, :math:`\tau_\mathrm h`, which has
5298 been shown to provide the best balance between important and
5299 unimportant elements [1]_.
5301 The weighting is defined by means of a rank array, which assigns a
5302 nonnegative rank to each element (higher importance ranks being
5303 associated with smaller values, e.g., 0 is the highest possible rank),
5304 and a weigher function, which assigns a weight based on the rank to
5305 each element. The weight of an exchange is then the sum or the product
5306 of the weights of the ranks of the exchanged elements. The default
5307 parameters compute :math:`\tau_\mathrm h`: an exchange between
5308 elements with rank :math:`r` and :math:`s` (starting from zero) has
5309 weight :math:`1/(r+1) + 1/(s+1)`.
5311 Specifying a rank array is meaningful only if you have in mind an
5312 external criterion of importance. If, as it usually happens, you do
5313 not have in mind a specific rank, the weighted :math:`\tau` is
5314 defined by averaging the values obtained using the decreasing
5315 lexicographical rank by (`x`, `y`) and by (`y`, `x`). This is the
5316 behavior with default parameters. Note that the convention used
5317 here for ranking (lower values imply higher importance) is opposite
5318 to that used by other SciPy statistical functions.
5320 Parameters
5321 ----------
5322 x, y : array_like
5323 Arrays of scores, of the same shape. If arrays are not 1-D, they will
5324 be flattened to 1-D.
5325 rank : array_like of ints or bool, optional
5326 A nonnegative rank assigned to each element. If it is None, the
5327 decreasing lexicographical rank by (`x`, `y`) will be used: elements of
5328 higher rank will be those with larger `x`-values, using `y`-values to
5329 break ties (in particular, swapping `x` and `y` will give a different
5330 result). If it is False, the element indices will be used
5331 directly as ranks. The default is True, in which case this
5332 function returns the average of the values obtained using the
5333 decreasing lexicographical rank by (`x`, `y`) and by (`y`, `x`).
5334 weigher : callable, optional
5335 The weigher function. Must map nonnegative integers (zero
5336 representing the most important element) to a nonnegative weight.
5337 The default, None, provides hyperbolic weighting, that is,
5338 rank :math:`r` is mapped to weight :math:`1/(r+1)`.
5339 additive : bool, optional
5340 If True, the weight of an exchange is computed by adding the
5341 weights of the ranks of the exchanged elements; otherwise, the weights
5342 are multiplied. The default is True.
5344 Returns
5345 -------
5346 res: SignificanceResult
5347 An object containing attributes:
5349 statistic : float
5350 The weighted :math:`\tau` correlation index.
5351 pvalue : float
5352 Presently ``np.nan``, as the null distribution of the statistic is
5353 unknown (even in the additive hyperbolic case).
5355 See Also
5356 --------
5357 kendalltau : Calculates Kendall's tau.
5358 spearmanr : Calculates a Spearman rank-order correlation coefficient.
5359 theilslopes : Computes the Theil-Sen estimator for a set of points (x, y).
5361 Notes
5362 -----
5363 This function uses an :math:`O(n \log n)`, mergesort-based algorithm
5364 [1]_ that is a weighted extension of Knight's algorithm for Kendall's
5365 :math:`\tau` [2]_. It can compute Shieh's weighted :math:`\tau` [3]_
5366 between rankings without ties (i.e., permutations) by setting
5367 `additive` and `rank` to False, as the definition given in [1]_ is a
5368 generalization of Shieh's.
5370 NaNs are considered the smallest possible score.
5372 .. versionadded:: 0.19.0
5374 References
5375 ----------
5376 .. [1] Sebastiano Vigna, "A weighted correlation index for rankings with
5377 ties", Proceedings of the 24th international conference on World
5378 Wide Web, pp. 1166-1176, ACM, 2015.
5379 .. [2] W.R. Knight, "A Computer Method for Calculating Kendall's Tau with
5380 Ungrouped Data", Journal of the American Statistical Association,
5381 Vol. 61, No. 314, Part 1, pp. 436-439, 1966.
5382 .. [3] Grace S. Shieh. "A weighted Kendall's tau statistic", Statistics &
5383 Probability Letters, Vol. 39, No. 1, pp. 17-24, 1998.
5385 Examples
5386 --------
5387 >>> import numpy as np
5388 >>> from scipy import stats
5389 >>> x = [12, 2, 1, 12, 2]
5390 >>> y = [1, 4, 7, 1, 0]
5391 >>> res = stats.weightedtau(x, y)
5392 >>> res.statistic
5393 -0.56694968153682723
5394 >>> res.pvalue
5395 nan
5396 >>> res = stats.weightedtau(x, y, additive=False)
5397 >>> res.statistic
5398 -0.62205716951801038
5400 NaNs are considered the smallest possible score:
5402 >>> x = [12, 2, 1, 12, 2]
5403 >>> y = [1, 4, 7, 1, np.nan]
5404 >>> res = stats.weightedtau(x, y)
5405 >>> res.statistic
5406 -0.56694968153682723
5408 This is exactly Kendall's tau:
5410 >>> x = [12, 2, 1, 12, 2]
5411 >>> y = [1, 4, 7, 1, 0]
5412 >>> res = stats.weightedtau(x, y, weigher=lambda x: 1)
5413 >>> res.statistic
5414 -0.47140452079103173
5416 >>> x = [12, 2, 1, 12, 2]
5417 >>> y = [1, 4, 7, 1, 0]
5418 >>> stats.weightedtau(x, y, rank=None)
5419 SignificanceResult(statistic=-0.4157652301037516, pvalue=nan)
5420 >>> stats.weightedtau(y, x, rank=None)
5421 SignificanceResult(statistic=-0.7181341329699028, pvalue=nan)
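A custom `weigher` may also be supplied. A minimal sketch using a steeper,
quadratic hyperbolic weighting (the bound check below holds because the
index is normalized; the p-value is always ``nan``):

>>> x = [12, 2, 1, 12, 2]
>>> y = [1, 4, 7, 1, 0]
>>> res = stats.weightedtau(x, y, weigher=lambda r: 1/(r + 1)**2)
>>> bool(-1.0 <= res.statistic <= 1.0)
True
>>> res.pvalue
nan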
5423 """
5424 x = np.asarray(x).ravel()
5425 y = np.asarray(y).ravel()
5427 if x.size != y.size:
5428 raise ValueError("All inputs to `weightedtau` must be "
5429 "of the same size, "
5430 "found x-size %s and y-size %s" % (x.size, y.size))
5431 if not x.size:
5432 # Return NaN if arrays are empty
5433 res = SignificanceResult(np.nan, np.nan)
5434 res.correlation = np.nan
5435 return res
5437 # If there are NaNs we apply _toint64()
5438 if np.isnan(np.sum(x)):
5439 x = _toint64(x)
5440 if np.isnan(np.sum(y)):
5441 y = _toint64(y)
5443 # Reduce unsupported types to ranks
5444 if x.dtype != y.dtype:
5445 if x.dtype != np.int64:
5446 x = _toint64(x)
5447 if y.dtype != np.int64:
5448 y = _toint64(y)
5449 else:
5450 if x.dtype not in (np.int32, np.int64, np.float32, np.float64):
5451 x = _toint64(x)
5452 y = _toint64(y)
5454 if rank is True:
5455 tau = (
5456 _weightedrankedtau(x, y, None, weigher, additive) +
5457 _weightedrankedtau(y, x, None, weigher, additive)
5458 ) / 2
5459 res = SignificanceResult(tau, np.nan)
5460 res.correlation = tau
5461 return res
5463 if rank is False:
5464 rank = np.arange(x.size, dtype=np.intp)
5465 elif rank is not None:
5466 rank = np.asarray(rank).ravel()
5467 if rank.size != x.size:
5468 raise ValueError(
5469 "All inputs to `weightedtau` must be of the same size, "
5470 "found x-size %s and rank-size %s" % (x.size, rank.size)
5471 )
5473 tau = _weightedrankedtau(x, y, rank, weigher, additive)
5474 res = SignificanceResult(tau, np.nan)
5475 res.correlation = tau
5476 return res
5479# FROM MGCPY: https://github.com/neurodata/mgcpy
5482class _ParallelP:
5483 """Helper function to calculate parallel p-value."""
5485 def __init__(self, x, y, random_states):
5486 self.x = x
5487 self.y = y
5488 self.random_states = random_states
5490 def __call__(self, index):
5491 order = self.random_states[index].permutation(self.y.shape[0])
5492 permy = self.y[order][:, order]
5494 # calculate permuted stats, store in null distribution
5495 perm_stat = _mgc_stat(self.x, permy)[0]
5497 return perm_stat
5500def _perm_test(x, y, stat, reps=1000, workers=-1, random_state=None):
5501 r"""Helper function that calculates the p-value. See below for uses.
5503 Parameters
5504 ----------
5505 x, y : ndarray
5506 `x` and `y` have shapes `(n, p)` and `(n, q)`.
5507 stat : float
5508 The sample test statistic.
5509 reps : int, optional
5510 The number of replications used to estimate the null when using the
5511 permutation test. The default is 1000 replications.
5512 workers : int or map-like callable, optional
5513 If `workers` is an int the population is subdivided into `workers`
5514 sections and evaluated in parallel (uses
5515 `multiprocessing.Pool <multiprocessing>`). Supply `-1` to use all cores
5516 available to the Process. Alternatively supply a map-like callable,
5517 such as `multiprocessing.Pool.map` for evaluating the population in
5518 parallel. This evaluation is carried out as `workers(func, iterable)`.
5519 Requires that `func` be pickleable.
5520 random_state : {None, int, `numpy.random.Generator`,
5521 `numpy.random.RandomState`}, optional
5523 If `random_state` is None (or `np.random`), the `numpy.random.RandomState`
5524 singleton is used.
5525 If `random_state` is an int, a new ``RandomState`` instance is used,
5526 seeded with `random_state`.
5527 If `random_state` is already a ``Generator`` or ``RandomState`` instance then
5528 that instance is used.
5530 Returns
5531 -------
5532 pvalue : float
5533 The sample test p-value.
5534 null_dist : list
5535 The approximated null distribution.
5537 """
5538 # generate seeds for each rep (change to new parallel random number
5539 # capabilities in numpy >= 1.17+)
5540 random_state = check_random_state(random_state)
5541 random_states = [np.random.RandomState(rng_integers(random_state, 1 << 32,
5542 size=4, dtype=np.uint32)) for _ in range(reps)]
5544 # parallelizes with specified workers over number of reps and set seeds
5545 parallelp = _ParallelP(x=x, y=y, random_states=random_states)
5546 with MapWrapper(workers) as mapwrapper:
5547 null_dist = np.array(list(mapwrapper(parallelp, range(reps))))
5549 # calculate the p-value from the null distribution
5550 pvalue = (1 + (null_dist >= stat).sum()) / (1 + reps)
5552 return pvalue, null_dist
5555def _euclidean_dist(x):
5556 return cdist(x, x)
5559MGCResult = _make_tuple_bunch('MGCResult',
5560 ['statistic', 'pvalue', 'mgc_dict'], [])
5563def multiscale_graphcorr(x, y, compute_distance=_euclidean_dist, reps=1000,
5564 workers=1, is_twosamp=False, random_state=None):
5565 r"""Computes the Multiscale Graph Correlation (MGC) test statistic.
5567 Specifically, for each point, MGC finds the :math:`k`-nearest neighbors for
5568 one property (e.g. cloud density), and the :math:`l`-nearest neighbors for
5569 the other property (e.g. grass wetness) [1]_. This pair :math:`(k, l)` is
5570 called the "scale". A priori, however, it is not known which scales will be
5571 most informative. So, MGC computes all distance pairs, and then efficiently
5572 computes the distance correlations for all scales. The local correlations
5573 illustrate which scales are relatively informative about the relationship.
5574 The key, therefore, to successfully discover and decipher relationships
5575 between disparate data modalities is to adaptively determine which scales
5576 are the most informative, and the geometric implication for the most
5577 informative scales. Doing so not only provides an estimate of whether the
5578 modalities are related, but also provides insight into how the
5579 determination was made. This is especially important in high-dimensional
5580 data, where simple visualizations do not reveal relationships to the
5581 unaided human eye. Characterizations of this implementation in particular
5582 have been derived from and benchmarked in [2]_.
5584 Parameters
5585 ----------
5586 x, y : ndarray
5587 If ``x`` and ``y`` have shapes ``(n, p)`` and ``(n, q)`` where `n` is
5588 the number of samples and `p` and `q` are the number of dimensions,
5589 then the MGC independence test will be run. Alternatively, ``x`` and
5590 ``y`` can have shapes ``(n, n)`` if they are distance or similarity
5591 matrices, and ``compute_distance`` must be set to ``None``. If ``x``
5592 and ``y`` have shapes ``(n, p)`` and ``(m, p)``, an unpaired
5593 two-sample MGC test will be run.
5594 compute_distance : callable, optional
5595 A function that computes the distance or similarity among the samples
5596 within each data matrix. Set to ``None`` if ``x`` and ``y`` are
5597 already distance matrices. The default uses the euclidean norm metric.
5598 If you are calling a custom function, either create the distance
5599 matrix beforehand or create a function of the form
5600 ``compute_distance(x)`` where `x` is the data matrix for which
5601 pairwise distances are calculated.
5602 reps : int, optional
5603 The number of replications used to estimate the null when using the
5604 permutation test. The default is ``1000``.
5605 workers : int or map-like callable, optional
5606 If ``workers`` is an int the population is subdivided into ``workers``
5607 sections and evaluated in parallel (uses ``multiprocessing.Pool
5608 <multiprocessing>``). Supply ``-1`` to use all cores available to the
5609 Process. Alternatively supply a map-like callable, such as
5610 ``multiprocessing.Pool.map`` for evaluating the p-value in parallel.
5611 This evaluation is carried out as ``workers(func, iterable)``.
5612 Requires that `func` be pickleable. The default is ``1``.
5613 is_twosamp : bool, optional
5614 If `True`, a two sample test will be run. If ``x`` and ``y`` have
5615 shapes ``(n, p)`` and ``(m, p)``, this option will be overridden and
5616 set to ``True``. Set to ``True`` if ``x`` and ``y`` both have shapes
5617 ``(n, p)`` and a two sample test is desired. The default is ``False``.
5618 Note that this will not run if inputs are distance matrices.
5619 random_state : {None, int, `numpy.random.Generator`,
5620 `numpy.random.RandomState`}, optional
5622 If `random_state` is None (or `np.random`), the `numpy.random.RandomState`
5623 singleton is used.
5624 If `random_state` is an int, a new ``RandomState`` instance is used,
5625 seeded with `random_state`.
5626 If `random_state` is already a ``Generator`` or ``RandomState`` instance then
5627 that instance is used.
5629 Returns
5630 -------
5631 res : MGCResult
5632 An object containing attributes:
5634 statistic : float
5635 The sample MGC test statistic within `[-1, 1]`.
5636 pvalue : float
5637 The p-value obtained via permutation.
5638 mgc_dict : dict
5639 Contains additional useful results:
5641 - mgc_map : ndarray
5642 A 2D representation of the latent geometry of the
5643 relationship.
5644 - opt_scale : (int, int)
5645 The estimated optimal scale as a `(x, y)` pair.
5646 - null_dist : list
5647 The null distribution derived from the permuted matrices.
5649 See Also
5650 --------
5651 pearsonr : Pearson correlation coefficient and p-value for testing
5652 non-correlation.
5653 kendalltau : Calculates Kendall's tau.
5654 spearmanr : Calculates a Spearman rank-order correlation coefficient.
5656 Notes
5657 -----
5658 A description of the process of MGC and applications on neuroscience data
5659 can be found in [1]_. It is performed using the following steps:
5661 #. Two distance matrices :math:`D^X` and :math:`D^Y` are computed and
5662 modified to be mean zero columnwise. This results in two
5663 :math:`n \times n` distance matrices :math:`A` and :math:`B` (the
5664 centering and unbiased modification) [3]_.
5666 #. For all values :math:`k` and :math:`l` from :math:`1, ..., n`,
5668 * The :math:`k`-nearest neighbor and :math:`l`-nearest neighbor graphs
5669 are calculated for each property. Here, :math:`G_k (i, j)` indicates
5670 the :math:`k`-smallest values of the :math:`i`-th row of :math:`A`
5671 and :math:`H_l (i, j)` indicates the :math:`l`-smallest values of
5672 the :math:`i`-th row of :math:`B`
5674 * Let :math:`\circ` denote the entry-wise matrix product; then local
5675 correlations are summed and normalized using the following statistic:
5677 .. math::
5679 c^{kl} = \frac{\sum_{ij} A G_k B H_l}
5680 {\sqrt{\sum_{ij} A^2 G_k \times \sum_{ij} B^2 H_l}}
5682 #. The MGC test statistic is the smoothed optimal local correlation of
5683 :math:`\{ c^{kl} \}`. Denote the smoothing operation as :math:`R(\cdot)`,
5684 which essentially sets all isolated large correlations to 0 and leaves
5685 connected large correlations unchanged (see [3]_). MGC is,
5687 .. math::
5689 MGC_n (x, y) = \max_{(k, l)} R \left(c^{kl} \left( x_n, y_n \right)
5690 \right)
5692 The test statistic returns a value between :math:`(-1, 1)` since it is
5693 normalized.
5695 The p-value returned is calculated using a permutation test. This process
5696 is completed by first randomly permuting :math:`y` to estimate the null
5697 distribution and then calculating the probability of observing a test
5698 statistic, under the null, at least as extreme as the observed test
5699 statistic.
5701 MGC requires at least 5 samples to run with reliable results. It can also
5702 handle high-dimensional data sets.
5703 In addition, by manipulating the input data matrices, the two-sample
5704 testing problem can be reduced to the independence testing problem [4]_.
5705 Given sample data :math:`U` and :math:`V` of sizes :math:`p \times n` and
5706 :math:`p \times m`, data matrices :math:`X` and :math:`Y` can be created as
5707 follows:
5709 .. math::
5711 X = [U | V] \in \mathcal{R}^{p \times (n + m)}
5712 Y = [0_{1 \times n} | 1_{1 \times m}] \in \mathcal{R}^{(n + m)}
5714 Then, the MGC statistic can be calculated as normal. This methodology can
5715 be extended to similar tests such as distance correlation [4]_.
5717 .. versionadded:: 1.4.0
5719 References
5720 ----------
5721 .. [1] Vogelstein, J. T., Bridgeford, E. W., Wang, Q., Priebe, C. E.,
5722 Maggioni, M., & Shen, C. (2019). Discovering and deciphering
5723 relationships across disparate data modalities. ELife.
5724 .. [2] Panda, S., Palaniappan, S., Xiong, J., Swaminathan, A.,
5725 Ramachandran, S., Bridgeford, E. W., ... Vogelstein, J. T. (2019).
5726 mgcpy: A Comprehensive High Dimensional Independence Testing Python
5727 Package. :arXiv:`1907.02088`
5728 .. [3] Shen, C., Priebe, C.E., & Vogelstein, J. T. (2019). From distance
5729 correlation to multiscale graph correlation. Journal of the American
5730 Statistical Association.
5731 .. [4] Shen, C. & Vogelstein, J. T. (2018). The Exact Equivalence of
5732 Distance and Kernel Methods for Hypothesis Testing.
5733 :arXiv:`1806.05514`
5735 Examples
5736 --------
5737 >>> import numpy as np
5738 >>> from scipy.stats import multiscale_graphcorr
5739 >>> x = np.arange(100)
5740 >>> y = x
5741 >>> res = multiscale_graphcorr(x, y)
5742 >>> res.statistic, res.pvalue
5743 (1.0, 0.001)
5745 To run an unpaired two-sample test,
5747 >>> x = np.arange(100)
5748 >>> y = np.arange(79)
5749 >>> res = multiscale_graphcorr(x, y)
5750 >>> res.statistic, res.pvalue # doctest: +SKIP
5751 (0.033258146255703246, 0.023)
5753 or, if shape of the inputs are the same,
5755 >>> x = np.arange(100)
5756 >>> y = x
5757 >>> res = multiscale_graphcorr(x, y, is_twosamp=True)
5758 >>> res.statistic, res.pvalue # doctest: +SKIP
5759 (-0.008021809890200488, 1.0)
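If pairwise distances have already been computed, they may be passed
directly by setting `compute_distance` to ``None``. A sketch reusing the
first example above; the p-value is estimated by random permutation, so the
output is not checked:

>>> from scipy.spatial.distance import cdist
>>> x = np.arange(100).reshape(-1, 1)
>>> dx = cdist(x, x)
>>> res = multiscale_graphcorr(dx, dx, compute_distance=None)
>>> res.statistic, res.pvalue  # doctest: +SKIP
(1.0, 0.001)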
5761 """
5762 if not isinstance(x, np.ndarray) or not isinstance(y, np.ndarray):
5763 raise ValueError("x and y must be ndarrays")
5765 # convert arrays of type (n,) to (n, 1)
5766 if x.ndim == 1:
5767 x = x[:, np.newaxis]
5768 elif x.ndim != 2:
5769 raise ValueError("Expected a 2-D array `x`, found shape "
5770 "{}".format(x.shape))
5771 if y.ndim == 1:
5772 y = y[:, np.newaxis]
5773 elif y.ndim != 2:
5774 raise ValueError("Expected a 2-D array `y`, found shape "
5775 "{}".format(y.shape))
5777 nx, px = x.shape
5778 ny, py = y.shape
5780 # check for NaNs
5781 _contains_nan(x, nan_policy='raise')
5782 _contains_nan(y, nan_policy='raise')
5784 # check for positive or negative infinity and raise error
5785 if np.sum(np.isinf(x)) > 0 or np.sum(np.isinf(y)) > 0:
5786 raise ValueError("Inputs contain infinities")
5788 if nx != ny:
5789 if px == py:
5790 # reshape x and y for two sample testing
5791 is_twosamp = True
5792 else:
5793 raise ValueError("Shape mismatch, x and y must have shape [n, p] "
5794 "and [n, q] or have shape [n, p] and [m, p].")
5796 if nx < 5 or ny < 5:
5797 raise ValueError("MGC requires at least 5 samples to give reasonable "
5798 "results.")
5800 # convert x and y to float
5801 x = x.astype(np.float64)
5802 y = y.astype(np.float64)
5804 # check that compute_distance is a callable
5805 if not callable(compute_distance) and compute_distance is not None:
5806 raise ValueError("Compute_distance must be a function.")
5808 # check that reps is a non-negative integer (a warning is raised if it
5809 # is under 1000)
5810 if not isinstance(reps, int) or reps < 0:
5811 raise ValueError("Number of reps must be an integer greater than 0.")
5812 elif reps < 1000:
5813 msg = ("The number of replications is low (under 1000), and p-value "
5814 "calculations may be unreliable. Use the p-value result, with "
5815 "caution!")
5816 warnings.warn(msg, RuntimeWarning)
5818 if is_twosamp:
5819 if compute_distance is None:
5820 raise ValueError("Cannot run if inputs are distance matrices")
5821 x, y = _two_sample_transform(x, y)
5823 if compute_distance is not None:
5824 # compute distance matrices for x and y
5825 x = compute_distance(x)
5826 y = compute_distance(y)
5828 # calculate MGC stat
5829 stat, stat_dict = _mgc_stat(x, y)
5830 stat_mgc_map = stat_dict["stat_mgc_map"]
5831 opt_scale = stat_dict["opt_scale"]
5833 # calculate permutation MGC p-value
5834 pvalue, null_dist = _perm_test(x, y, stat, reps=reps, workers=workers,
5835 random_state=random_state)
5837 # save all stats (other than stat/p-value) in dictionary
5838 mgc_dict = {"mgc_map": stat_mgc_map,
5839 "opt_scale": opt_scale,
5840 "null_dist": null_dist}
5842 # create result object with alias for backward compatibility
5843 res = MGCResult(stat, pvalue, mgc_dict)
5844 res.stat = stat
5845 return res
5848def _mgc_stat(distx, disty):
5849 r"""Helper function that calculates the MGC stat. See above for use.
5851 Parameters
5852 ----------
5853 distx, disty : ndarray
5854 `distx` and `disty` have shapes `(n, p)` and `(n, q)` or
5855 `(n, n)` and `(n, n)`
5856 if distance matrices.
5858 Returns
5859 -------
5860 stat : float
5861 The sample MGC test statistic within `[-1, 1]`.
5862 stat_dict : dict
5863 Contains additional useful returns with the following
5864 keys:
5866 - stat_mgc_map : ndarray
5867 MGC-map of the statistics.
5868 - opt_scale : (float, float)
5869 The estimated optimal scale as a `(x, y)` pair.
5871 """
5872 # calculate MGC map and optimal scale
5873 stat_mgc_map = _local_correlations(distx, disty, global_corr='mgc')
5875 n, m = stat_mgc_map.shape
5876 if m == 1 or n == 1:
5877 # the global scale is the statistic calculated at maximal nearest
5878 # neighbors. There is not enough local scale to search over, so
5879 # default to global scale
5880 stat = stat_mgc_map[m - 1][n - 1]
5881 opt_scale = m * n
5882 else:
5883 samp_size = len(distx) - 1
5885 # threshold to find connected region of significant local correlations
5886 sig_connect = _threshold_mgc_map(stat_mgc_map, samp_size)
5888 # maximum within the significant region
5889 stat, opt_scale = _smooth_mgc_map(sig_connect, stat_mgc_map)
5891 stat_dict = {"stat_mgc_map": stat_mgc_map,
5892 "opt_scale": opt_scale}
5894 return stat, stat_dict
5897def _threshold_mgc_map(stat_mgc_map, samp_size):
5898 r"""
5899 Finds a connected region of significance in the MGC-map by thresholding.
5901 Parameters
5902 ----------
5903 stat_mgc_map : ndarray
5904 All local correlations within `[-1,1]`.
5905 samp_size : int
5906 The sample size of original data.
5908 Returns
5909 -------
5910 sig_connect : ndarray
5911 A binary matrix with 1's indicating the significant region.
5913 """
5914 m, n = stat_mgc_map.shape
5916 # 0.02 is simply an empirical threshold; it can be set to 0.01 or 0.05
5917 # with varying levels of performance. Threshold is based on a beta
5918 # approximation.
5919 per_sig = 1 - (0.02 / samp_size) # Percentile to consider as significant
5920 threshold = samp_size * (samp_size - 3)/4 - 1/2 # Beta approximation
5921 threshold = distributions.beta.ppf(per_sig, threshold, threshold) * 2 - 1
5923 # the global scale is the statistic calculated at maximal nearest
5924 # neighbors. Threshold is the maximum on the global and local scales
5925 threshold = max(threshold, stat_mgc_map[m - 1][n - 1])
5927 # find the largest connected component of significant correlations
5928 sig_connect = stat_mgc_map > threshold
5929 if np.sum(sig_connect) > 0:
5930 sig_connect, _ = _measurements.label(sig_connect)
5931 _, label_counts = np.unique(sig_connect, return_counts=True)
5933 # skip the first element in label_counts, as it is count(zeros)
5934 max_label = np.argmax(label_counts[1:]) + 1
5935 sig_connect = sig_connect == max_label
5936 else:
5937 sig_connect = np.array([[False]])
5939 return sig_connect
5942def _smooth_mgc_map(sig_connect, stat_mgc_map):
5943 """Finds the smoothed maximal within the significant region R.
5945 If area of R is too small it returns the last local correlation. Otherwise,
5946 returns the maximum within significant_connected_region.
5948 Parameters
5949 ----------
5950 sig_connect : ndarray
5951 A binary matrix with 1's indicating the significant region.
5952 stat_mgc_map : ndarray
5953 All local correlations within `[-1, 1]`.
5955 Returns
5956 -------
5957 stat : float
5958 The sample MGC statistic within `[-1, 1]`.
5959 opt_scale: (float, float)
5960 The estimated optimal scale as an `(x, y)` pair.
5962 """
5963 m, n = stat_mgc_map.shape
5965 # the global scale is the statistic calculated at maximal nearest
5966 # neighbors. By default, statistic and optimal scale are global.
5967 stat = stat_mgc_map[m - 1][n - 1]
5968 opt_scale = [m, n]
5970 if np.linalg.norm(sig_connect) != 0:
5971 # proceed only when the connected region's area is sufficiently large
5972 # 0.02 is simply an empirical threshold; it can be set to 0.01 or 0.05
5973 # with varying levels of performance
5974 if np.sum(sig_connect) >= np.ceil(0.02 * max(m, n)) * min(m, n):
5975 max_corr = max(stat_mgc_map[sig_connect])
5977 # find all scales within significant_connected_region that maximize
5978 # the local correlation
5979 max_corr_index = np.where((stat_mgc_map >= max_corr) & sig_connect)
5981 if max_corr >= stat:
5982 stat = max_corr
5984 k, l = max_corr_index
5985 one_d_indices = k * n + l # 2D to 1D indexing
5986 k = np.max(one_d_indices) // n
5987 l = np.max(one_d_indices) % n
5988 opt_scale = [k+1, l+1] # adding 1s to match R indexing
5990 return stat, opt_scale
5993def _two_sample_transform(u, v):
5994 """Helper function that concatenates x and y for two sample MGC stat.
5996 See above for use.
5998 Parameters
5999 ----------
6000 u, v : ndarray
6001 `u` and `v` have shapes `(n, p)` and `(m, p)`.
6003 Returns
6004 -------
6005 x : ndarray
6006 Concatenation of `u` and `v` along ``axis=0``. `x` thus has shape
6007 `(n + m, p)`.
6008 y : ndarray
6009 Label matrix for `x` where 0 refers to samples that come from `u` and
6010 1 refers to samples that come from `v`. `y` thus has shape `(n + m, 1)`.
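Examples
--------
A minimal sketch of the shapes produced:

>>> import numpy as np
>>> u, v = np.ones((3, 2)), np.zeros((2, 2))
>>> x, y = _two_sample_transform(u, v)
>>> x.shape, y.shape
((5, 2), (5, 1))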
6012 """
6013 nx = u.shape[0]
6014 ny = v.shape[0]
6015 x = np.concatenate([u, v], axis=0)
6016 y = np.concatenate([np.zeros(nx), np.ones(ny)], axis=0).reshape(-1, 1)
6017 return x, y
6020#####################################
6021# INFERENTIAL STATISTICS #
6022#####################################
6024TtestResultBase = _make_tuple_bunch('TtestResultBase',
6025 ['statistic', 'pvalue'], ['df'])
6028class TtestResult(TtestResultBase):
6029 """
6030 Result of a t-test.
6032 See the documentation of the particular t-test function for more
6033 information about the definition of the statistic and meaning of
6034 the confidence interval.
6036 Attributes
6037 ----------
6038 statistic : float or array
6039 The t-statistic of the sample.
6040 pvalue : float or array
6041 The p-value associated with the given alternative.
6042 df : float or array
6043 The number of degrees of freedom used in calculation of the
6044 t-statistic; this is one less than the size of the sample
6045 (``a.shape[axis]-1`` if there are no masked elements or omitted NaNs).
6047 Methods
6048 -------
6049 confidence_interval
6050 Computes a confidence interval around the population statistic
6051 for the given confidence level.
6052 The confidence interval is returned in a ``namedtuple`` with
6053 fields `low` and `high`.
6055 """
6057 def __init__(self, statistic, pvalue, df, # public
6058 alternative, standard_error, estimate): # private
6059 super().__init__(statistic, pvalue, df=df)
6060 self._alternative = alternative
6061 self._standard_error = standard_error # denominator of t-statistic
6062 self._estimate = estimate # point estimate of sample mean
6064 def confidence_interval(self, confidence_level=0.95):
6065 """
6066 Parameters
6067 ----------
6068 confidence_level : float
6069 The confidence level for the calculation of the population mean
6070 confidence interval. Default is 0.95.
6072 Returns
6073 -------
6074 ci : namedtuple
6075 The confidence interval is returned in a ``namedtuple`` with
6076 fields `low` and `high`.
6078 """
6079 low, high = _t_confidence_interval(self.df, self.statistic,
6080 confidence_level, self._alternative)
6081 low = low * self._standard_error + self._estimate
6082 high = high * self._standard_error + self._estimate
6083 return ConfidenceInterval(low=low, high=high)
6086def pack_TtestResult(statistic, pvalue, df, alternative, standard_error,
6087 estimate):
6088 # this could be any number of dimensions (including 0d), but there is
6089 # at most one unique value
6090 alternative = np.atleast_1d(alternative).ravel()
6091 alternative = alternative[0] if alternative.size else np.nan
6092 return TtestResult(statistic, pvalue, df=df, alternative=alternative,
6093 standard_error=standard_error, estimate=estimate)
6096def unpack_TtestResult(res):
6097 return (res.statistic, res.pvalue, res.df, res._alternative,
6098 res._standard_error, res._estimate)
6101@_axis_nan_policy_factory(pack_TtestResult, default_axis=0, n_samples=2,
6102 result_to_tuple=unpack_TtestResult, n_outputs=6)
6103def ttest_1samp(a, popmean, axis=0, nan_policy='propagate',
6104 alternative="two-sided"):
6105 """Calculate the T-test for the mean of ONE group of scores.
6107 This is a test for the null hypothesis that the expected value
6108 (mean) of a sample of independent observations `a` is equal to the given
6109 population mean, `popmean`.
6111 Parameters
6112 ----------
6113 a : array_like
6114 Sample observation.
6115 popmean : float or array_like
6116 Expected value in null hypothesis. If array_like, then its length along
6117 `axis` must equal 1, and it must otherwise be broadcastable with `a`.
6118 axis : int or None, optional
6119 Axis along which to compute test; default is 0. If None, compute over
6120 the whole array `a`.
6121 nan_policy : {'propagate', 'raise', 'omit'}, optional
6122 Defines how to handle when input contains nan.
6123 The following options are available (default is 'propagate'):
6125 * 'propagate': returns nan
6126 * 'raise': throws an error
6127 * 'omit': performs the calculations ignoring nan values
6129 alternative : {'two-sided', 'less', 'greater'}, optional
6130 Defines the alternative hypothesis.
6131 The following options are available (default is 'two-sided'):
6133 * 'two-sided': the mean of the underlying distribution of the sample
6134 is different than the given population mean (`popmean`)
6135 * 'less': the mean of the underlying distribution of the sample is
6136 less than the given population mean (`popmean`)
6137 * 'greater': the mean of the underlying distribution of the sample is
6138 greater than the given population mean (`popmean`)
6140 Returns
6141 -------
6142 result : `~scipy.stats._result_classes.TtestResult`
6143 An object with the following attributes:
6145 statistic : float or array
6146 The t-statistic.
6147 pvalue : float or array
6148 The p-value associated with the given alternative.
6149 df : float or array
6150 The number of degrees of freedom used in calculation of the
6151 t-statistic; this is one less than the size of the sample
6152 (``a.shape[axis]``).
6154 .. versionadded:: 1.10.0
6156 The object also has the following method:
6158 confidence_interval(confidence_level=0.95)
6159 Computes a confidence interval around the population
6160 mean for the given confidence level.
6161 The confidence interval is returned in a ``namedtuple`` with
6162 fields `low` and `high`.
6164 .. versionadded:: 1.10.0
6166 Notes
6167 -----
6168 The statistic is calculated as ``(np.mean(a) - popmean)/se``, where
6169 ``se`` is the standard error. Therefore, the statistic will be positive
6170 when the sample mean is greater than the population mean and negative when
6171 the sample mean is less than the population mean.
6173 Examples
6174 --------
6175 Suppose we wish to test the null hypothesis that the mean of a population
6176 is equal to 0.5. We choose a confidence level of 99%; that is, we will
6177 reject the null hypothesis in favor of the alternative if the p-value is
6178 less than 0.01.
6180 When testing random variates from the standard uniform distribution, which
6181 has a mean of 0.5, we expect the data to be consistent with the null
6182 hypothesis most of the time.
6184 >>> import numpy as np
6185 >>> from scipy import stats
6186 >>> rng = np.random.default_rng()
6187 >>> rvs = stats.uniform.rvs(size=50, random_state=rng)
6188 >>> stats.ttest_1samp(rvs, popmean=0.5)
6189 TtestResult(statistic=2.456308468440, pvalue=0.017628209047638, df=49)
6191 As expected, the p-value of 0.017 is not below our threshold of 0.01, so
6192 we cannot reject the null hypothesis.
6194 When testing data from the standard *normal* distribution, which has a mean
6195 of 0, we would expect the null hypothesis to be rejected.
6197 >>> rvs = stats.norm.rvs(size=50, random_state=rng)
6198 >>> stats.ttest_1samp(rvs, popmean=0.5)
6199 TtestResult(statistic=-7.433605518875, pvalue=1.416760157221e-09, df=49)
6201 Indeed, the p-value is lower than our threshold of 0.01, so we reject the
6202 null hypothesis in favor of the default "two-sided" alternative: the mean
6203 of the population is *not* equal to 0.5.
6205 However, suppose we were to test the null hypothesis against the
6206 one-sided alternative that the mean of the population is *greater* than
6207 0.5. Since the mean of the standard normal is less than 0.5, we would not
6208 expect the null hypothesis to be rejected.
6210 >>> stats.ttest_1samp(rvs, popmean=0.5, alternative='greater')
6211 TtestResult(statistic=-7.433605518875, pvalue=0.99999999929, df=49)
6213 Unsurprisingly, with a p-value greater than our threshold, we would not
6214 reject the null hypothesis.
6216 Note that when working with a confidence level of 99%, a true null
6217 hypothesis will be rejected approximately 1% of the time.
6219 >>> rvs = stats.uniform.rvs(size=(100, 50), random_state=rng)
6220 >>> res = stats.ttest_1samp(rvs, popmean=0.5, axis=1)
6221 >>> np.sum(res.pvalue < 0.01)
6222 1
6224 Indeed, even though all 100 samples above were drawn from the standard
6225 uniform distribution, which *does* have a population mean of 0.5, we would
6226 mistakenly reject the null hypothesis for one of them.
6228 `ttest_1samp` can also compute a confidence interval around the population
6229 mean.
6231 >>> rvs = stats.norm.rvs(size=50, random_state=rng)
6232 >>> res = stats.ttest_1samp(rvs, popmean=0)
6233 >>> ci = res.confidence_interval(confidence_level=0.95)
6234 >>> ci
6235 ConfidenceInterval(low=-0.3193887540880017, high=0.2898583388980972)
6237 The bounds of the 95% confidence interval are the
6238 minimum and maximum values of the parameter `popmean` for which the
6239 p-value of the test would be 0.05.
6241 >>> res = stats.ttest_1samp(rvs, popmean=ci.low)
6242 >>> np.testing.assert_allclose(res.pvalue, 0.05)
6243 >>> res = stats.ttest_1samp(rvs, popmean=ci.high)
6244 >>> np.testing.assert_allclose(res.pvalue, 0.05)
6246 Under certain assumptions about the population from which a sample
6247 is drawn, the confidence interval with confidence level 95% is expected
6248 to contain the true population mean in 95% of sample replications.
6250 >>> rvs = stats.norm.rvs(size=(50, 1000), loc=1, random_state=rng)
6251 >>> res = stats.ttest_1samp(rvs, popmean=0)
6252 >>> ci = res.confidence_interval()
6253 >>> contains_pop_mean = (ci.low < 1) & (ci.high > 1)
6254 >>> contains_pop_mean.sum()
6255 953
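As described under `popmean`, an array of hypothesized means (length 1
along `axis`, otherwise broadcastable with `a`) can be tested in one call;
a sketch in which the two hypothesized means are purely illustrative:

>>> rvs = stats.uniform.rvs(size=(2, 50), random_state=rng)
>>> popmean = np.array([[0.5], [0.4]])  # length 1 along `axis`
>>> res = stats.ttest_1samp(rvs, popmean=popmean, axis=1)
>>> res.statistic.shape
(2,)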
6257 """
6258 a, axis = _chk_asarray(a, axis)
6260 n = a.shape[axis]
6261 df = n - 1
6263 mean = np.mean(a, axis)
6264 try:
6265 popmean = np.squeeze(popmean, axis=axis)
6266 except ValueError as e:
6267 raise ValueError("`popmean.shape[axis]` must equal 1.") from e
6268 d = mean - popmean
6269 v = _var(a, axis, ddof=1)
6270 denom = np.sqrt(v / n)
6272 with np.errstate(divide='ignore', invalid='ignore'):
6273 t = np.divide(d, denom)
6274 t, prob = _ttest_finish(df, t, alternative)
6276 # when nan_policy='omit', `df` can be different for different axis-slices
6277 df = np.broadcast_to(df, t.shape)[()]
6278 # _axis_nan_policy decorator doesn't play well with strings
6279 alternative_num = {"less": -1, "two-sided": 0, "greater": 1}[alternative]
6280 return TtestResult(t, prob, df=df, alternative=alternative_num,
6281 standard_error=denom, estimate=mean)
6284def _t_confidence_interval(df, t, confidence_level, alternative):
6285 # Input validation on `alternative` is already done
6286 # We just need IV on confidence_level
6287 if confidence_level < 0 or confidence_level > 1:
6288 message = "`confidence_level` must be a number between 0 and 1."
6289 raise ValueError(message)
6291 if alternative < 0: # 'less'
6292 p = confidence_level
6293 low, high = np.broadcast_arrays(-np.inf, special.stdtrit(df, p))
6294 elif alternative > 0: # 'greater'
6295 p = 1 - confidence_level
6296 low, high = np.broadcast_arrays(special.stdtrit(df, p), np.inf)
6297 elif alternative == 0: # 'two-sided'
6298 tail_probability = (1 - confidence_level)/2
6299 p = tail_probability, 1-tail_probability
6300 # axis of p must be the zeroth and orthogonal to all the rest
6301 p = np.reshape(p, [2] + [1]*np.asarray(df).ndim)
6302 low, high = special.stdtrit(df, p)
6303 else: # alternative is NaN when input is empty (see _axis_nan_policy)
6304 p, nans = np.broadcast_arrays(t, np.nan)
6305 low, high = nans, nans
6307 return low[()], high[()]
6310def _ttest_finish(df, t, alternative):
6311 """Common code between all 3 t-test functions."""
6312 # We use ``stdtr`` directly here as it handles the case when ``nan``
6313 # values are present in the data and masked arrays are passed
6314 # while ``t.cdf`` emits runtime warnings. This way ``_ttest_finish``
6315 # can be shared between the ``stats`` and ``mstats`` versions.
6317 if alternative == 'less':
6318 pval = special.stdtr(df, t)
6319 elif alternative == 'greater':
6320 pval = special.stdtr(df, -t)
6321 elif alternative == 'two-sided':
6322 pval = special.stdtr(df, -np.abs(t))*2
6323 else:
6324 raise ValueError("alternative must be "
6325 "'less', 'greater' or 'two-sided'")
6327 if t.ndim == 0:
6328 t = t[()]
6329 if pval.ndim == 0:
6330 pval = pval[()]
6332 return t, pval
6335def _ttest_ind_from_stats(mean1, mean2, denom, df, alternative):
6337 d = mean1 - mean2
6338 with np.errstate(divide='ignore', invalid='ignore'):
6339 t = np.divide(d, denom)
6340 t, prob = _ttest_finish(df, t, alternative)
6342 return (t, prob)
6345def _unequal_var_ttest_denom(v1, n1, v2, n2):
6346 vn1 = v1 / n1
6347 vn2 = v2 / n2
6348 with np.errstate(divide='ignore', invalid='ignore'):
6349 df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))
6351 # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0).
6352 # Hence it doesn't matter what df is as long as it's not NaN.
6353 df = np.where(np.isnan(df), 1, df)
6354 denom = np.sqrt(vn1 + vn2)
6355 return df, denom
6358def _equal_var_ttest_denom(v1, n1, v2, n2):
6359 df = n1 + n2 - 2.0
6360 svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df
6361 denom = np.sqrt(svar * (1.0 / n1 + 1.0 / n2))
6362 return df, denom
6365Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue'))
6368def ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2,
6369 equal_var=True, alternative="two-sided"):
6370 r"""
6371 T-test for means of two independent samples from descriptive statistics.
6373 This is a test for the null hypothesis that two independent
6374 samples have identical average (expected) values.
6376 Parameters
6377 ----------
6378 mean1 : array_like
6379 The mean(s) of sample 1.
6380 std1 : array_like
6381 The corrected sample standard deviation of sample 1 (i.e. ``ddof=1``).
6382 nobs1 : array_like
6383 The number(s) of observations of sample 1.
6384 mean2 : array_like
6385 The mean(s) of sample 2.
6386 std2 : array_like
6387 The corrected sample standard deviation of sample 2 (i.e. ``ddof=1``).
6388 nobs2 : array_like
6389 The number(s) of observations of sample 2.
6390 equal_var : bool, optional
6391 If True (default), perform a standard independent 2 sample test
6392 that assumes equal population variances [1]_.
6393 If False, perform Welch's t-test, which does not assume equal
6394 population variance [2]_.
6395 alternative : {'two-sided', 'less', 'greater'}, optional
6396 Defines the alternative hypothesis.
6397 The following options are available (default is 'two-sided'):
6399 * 'two-sided': the means of the distributions are unequal.
6400 * 'less': the mean of the first distribution is less than the
6401 mean of the second distribution.
6402 * 'greater': the mean of the first distribution is greater than the
6403 mean of the second distribution.
6405 .. versionadded:: 1.6.0
6407 Returns
6408 -------
6409 statistic : float or array
6410 The calculated t-statistics.
6411 pvalue : float or array
6412 The two-tailed p-value.
6414 See Also
6415 --------
6416 scipy.stats.ttest_ind
6418 Notes
6419 -----
6420 The statistic is calculated as ``(mean1 - mean2)/se``, where ``se`` is the
6421 standard error. Therefore, the statistic will be positive when `mean1` is
6422 greater than `mean2` and negative when `mean1` is less than `mean2`.
6424 References
6425 ----------
6426 .. [1] https://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test
6428 .. [2] https://en.wikipedia.org/wiki/Welch%27s_t-test
6430 Examples
6431 --------
6432 Suppose we have the summary data for two samples, as follows (with the
6433 Sample Variance being the corrected sample variance)::
6435 Sample Sample
6436 Size Mean Variance
6437 Sample 1 13 15.0 87.5
6438 Sample 2 11 12.0 39.0
6440 Apply the t-test to this data (with the assumption that the population
6441 variances are equal):
6443 >>> import numpy as np
6444 >>> from scipy.stats import ttest_ind_from_stats
6445 >>> ttest_ind_from_stats(mean1=15.0, std1=np.sqrt(87.5), nobs1=13,
6446 ... mean2=12.0, std2=np.sqrt(39.0), nobs2=11)
6447 Ttest_indResult(statistic=0.9051358093310269, pvalue=0.3751996797581487)
6449 For comparison, here is the data from which those summary statistics
6450 were taken. With this data, we can compute the same result using
6451 `scipy.stats.ttest_ind`:
6453 >>> a = np.array([1, 3, 4, 6, 11, 13, 15, 19, 22, 24, 25, 26, 26])
6454 >>> b = np.array([2, 4, 6, 9, 11, 13, 14, 15, 18, 19, 21])
6455 >>> from scipy.stats import ttest_ind
6456 >>> ttest_ind(a, b)
6457 Ttest_indResult(statistic=0.905135809331027, pvalue=0.3751996797581486)
6459 Suppose we instead have binary data and would like to apply a t-test to
6460 compare the proportion of 1s in two independent groups::
6462 Number of Sample Sample
6463 Size ones Mean Variance
6464 Sample 1 150 30 0.2 0.161073
6465 Sample 2 200 45 0.225 0.175251
6467 The sample mean :math:`\hat{p}` is the proportion of ones in the sample
6468 and the variance for a binary observation is estimated by
6469 :math:`\hat{p}(1-\hat{p})`.
6471 >>> ttest_ind_from_stats(mean1=0.2, std1=np.sqrt(0.161073), nobs1=150,
6472 ... mean2=0.225, std2=np.sqrt(0.175251), nobs2=200)
6473 Ttest_indResult(statistic=-0.5627187905196761, pvalue=0.5739887114209541)
6475 For comparison, we could compute the t statistic and p-value using
6476 arrays of 0s and 1s and `scipy.stats.ttest_ind`, as above.
6478 >>> group1 = np.array([1]*30 + [0]*(150-30))
6479 >>> group2 = np.array([1]*45 + [0]*(200-45))
6480 >>> ttest_ind(group1, group2)
6481 Ttest_indResult(statistic=-0.5627179589855622, pvalue=0.573989277115258)
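If the population variances are not assumed equal, Welch's t-test can be
applied to the same summary statistics by passing ``equal_var=False``. A
sketch; only the statistic is printed, rounded for brevity:

>>> res = ttest_ind_from_stats(mean1=15.0, std1=np.sqrt(87.5), nobs1=13,
...                            mean2=12.0, std2=np.sqrt(39.0), nobs2=11,
...                            equal_var=False)
>>> print(round(res.statistic, 4))
0.9358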
6483 """
6484 mean1 = np.asarray(mean1)
6485 std1 = np.asarray(std1)
6486 mean2 = np.asarray(mean2)
6487 std2 = np.asarray(std2)
6488 if equal_var:
6489 df, denom = _equal_var_ttest_denom(std1**2, nobs1, std2**2, nobs2)
6490 else:
6491 df, denom = _unequal_var_ttest_denom(std1**2, nobs1,
6492 std2**2, nobs2)
6494 res = _ttest_ind_from_stats(mean1, mean2, denom, df, alternative)
6495 return Ttest_indResult(*res)
6498def _ttest_nans(a, b, axis, namedtuple_type):
6499 """
6500 Generate an array of `nan`, with shape determined by `a`, `b` and `axis`.
6502 This function is used by ttest_ind and ttest_rel to create the return
6503 value when one of the inputs has size 0.
6505 The shapes of the arrays are determined by dropping `axis` from the
6506 shapes of `a` and `b` and broadcasting what is left.
6508 The return value is a named tuple of the type given in `namedtuple_type`.
6510 Examples
6511 --------
6512 >>> import numpy as np
6513 >>> a = np.zeros((9, 2))
6514 >>> b = np.zeros((5, 1))
6515 >>> _ttest_nans(a, b, 0, Ttest_indResult)
6516 Ttest_indResult(statistic=array([nan, nan]), pvalue=array([nan, nan]))
6518 >>> a = np.zeros((3, 0, 9))
6519 >>> b = np.zeros((1, 10))
6520 >>> stat, p = _ttest_nans(a, b, -1, Ttest_indResult)
6521 >>> stat
6522 array([], shape=(3, 0), dtype=float64)
6523 >>> p
6524 array([], shape=(3, 0), dtype=float64)
6526 >>> a = np.zeros(10)
6527 >>> b = np.zeros(7)
6528 >>> _ttest_nans(a, b, 0, Ttest_indResult)
6529 Ttest_indResult(statistic=nan, pvalue=nan)
6531 """
6532 shp = _broadcast_shapes_with_dropped_axis(a, b, axis)
6533 if len(shp) == 0:
6534 t = np.nan
6535 p = np.nan
6536 else:
6537 t = np.full(shp, fill_value=np.nan)
6538 p = t.copy()
6539 return namedtuple_type(t, p)
6542def ttest_ind(a, b, axis=0, equal_var=True, nan_policy='propagate',
6543 permutations=None, random_state=None, alternative="two-sided",
6544 trim=0):
6545 """
6546 Calculate the T-test for the means of *two independent* samples of scores.
6548 This is a test for the null hypothesis that 2 independent samples
6549 have identical average (expected) values. This test assumes that the
6550 populations have identical variances by default.
6552 Parameters
6553 ----------
6554 a, b : array_like
6555 The arrays must have the same shape, except in the dimension
6556 corresponding to `axis` (the first, by default).
6557 axis : int or None, optional
6558 Axis along which to compute test. If None, compute over the whole
6559 arrays, `a`, and `b`.
6560 equal_var : bool, optional
6561 If True (default), perform a standard independent 2 sample test
6562 that assumes equal population variances [1]_.
6563 If False, perform Welch's t-test, which does not assume equal
6564 population variance [2]_.
6566 .. versionadded:: 0.11.0
6568 nan_policy : {'propagate', 'raise', 'omit'}, optional
6569 Defines how to handle when input contains nan.
6570 The following options are available (default is 'propagate'):
6572 * 'propagate': returns nan
6573 * 'raise': throws an error
6574 * 'omit': performs the calculations ignoring nan values
6576 The 'omit' option is not currently available for permutation tests or
6577 one-sided asymptotic tests.
6579 permutations : non-negative int, np.inf, or None (default), optional
6580 If 0 or None (default), use the t-distribution to calculate p-values.
6581 Otherwise, `permutations` is the number of random permutations that
6582 will be used to estimate p-values using a permutation test. If
6583 `permutations` equals or exceeds the number of distinct partitions of
6584 the pooled data, an exact test is performed instead (i.e. each
6585 distinct partition is used exactly once). See Notes for details.
6587 .. versionadded:: 1.7.0
6589 random_state : {None, int, `numpy.random.Generator`,
6590 `numpy.random.RandomState`}, optional
6592 If `random_state` is None (or `np.random`), the
6593 `numpy.random.RandomState` singleton is used.
6594 If `random_state` is an int, a new ``RandomState`` instance is used,
6595 seeded with `random_state`.
6596 If `random_state` is already a ``Generator`` or ``RandomState``
6597 instance, that instance is used.
6599 Pseudorandom number generator state used to generate permutations
6600 (used only when `permutations` is not None).
6602 .. versionadded:: 1.7.0
6604 alternative : {'two-sided', 'less', 'greater'}, optional
6605 Defines the alternative hypothesis.
6606 The following options are available (default is 'two-sided'):
6608 * 'two-sided': the means of the distributions underlying the samples
6609 are unequal.
6610 * 'less': the mean of the distribution underlying the first sample
6611 is less than the mean of the distribution underlying the second
6612 sample.
6613 * 'greater': the mean of the distribution underlying the first
6614 sample is greater than the mean of the distribution underlying
6615 the second sample.
6617 .. versionadded:: 1.6.0
6619 trim : float, optional
6620 If nonzero, performs a trimmed (Yuen's) t-test.
6621 Defines the fraction of elements to be trimmed from each end of the
6622 input samples. If 0 (default), no elements will be trimmed from either
6623 side. The number of trimmed elements from each tail is the floor of the
6624 trim times the number of elements. Valid range is [0, .5).
6626 .. versionadded:: 1.7
6628 Returns
6629 -------
6630 statistic : float or array
6631 The calculated t-statistic.
6632 pvalue : float or array
6633 The p-value.
6635 Notes
6636 -----
6637 Suppose we observe two independent samples, e.g. flower petal lengths, and
6638 we are considering whether the two samples were drawn from the same
6639 population (e.g. the same species of flower or two species with similar
6640 petal characteristics) or two different populations.
6642 The t-test quantifies the difference between the arithmetic means
6643 of the two samples. The p-value quantifies the probability of observing
6644 as or more extreme values assuming the null hypothesis, that the
6645 samples are drawn from populations with the same population means, is true.
6646 A p-value larger than a chosen threshold (e.g. 5% or 1%) indicates that
6647 our observation is not so unlikely to have occurred by chance. Therefore,
6648 we do not reject the null hypothesis of equal population means.
6649 If the p-value is smaller than our threshold, then we have evidence
6650 against the null hypothesis of equal population means.
6652 By default, the p-value is determined by comparing the t-statistic of the
6653 observed data against a theoretical t-distribution.
6654 When ``1 < permutations < binom(n, k)``, where
6656 * ``k`` is the number of observations in `a`,
6657 * ``n`` is the total number of observations in `a` and `b`, and
6658 * ``binom(n, k)`` is the binomial coefficient (``n`` choose ``k``),
6660 the data are pooled (concatenated), randomly assigned to either group `a`
6661 or `b`, and the t-statistic is calculated. This process is performed
6662 repeatedly (`permutations` times), generating a distribution of the
6663 t-statistic under the null hypothesis, and the t-statistic of the observed
6664 data is compared to this distribution to determine the p-value.
6665 Specifically, the p-value reported is the "achieved significance level"
6666 (ASL) as defined in 4.4 of [3]_. Note that there are other ways of
6667 estimating p-values using randomized permutation tests; for other
6668 options, see the more general `permutation_test`.
6670 When ``permutations >= binom(n, k)``, an exact test is performed: the data
6671 are partitioned between the groups in each distinct way exactly once.
6673 The permutation test can be computationally expensive and not necessarily
6674 more accurate than the analytical test, but it does not make strong
6675 assumptions about the shape of the underlying distribution.
6677 Use of trimming is commonly referred to as the trimmed t-test. Sometimes
6678 called Yuen's t-test, it is an extension of Welch's t-test, with the
6679 difference being the use of the winsorized variance in calculating the
6680 standard error and the trimmed sample size in calculating the statistic. Trimming is
6681 recommended if the underlying distribution is long-tailed or contaminated
6682 with outliers [4]_.
6684 The statistic is calculated as ``(np.mean(a) - np.mean(b))/se``, where
6685 ``se`` is the standard error. Therefore, the statistic will be positive
6686 when the sample mean of `a` is greater than the sample mean of `b` and
6687 negative when the sample mean of `a` is less than the sample mean of
6688 `b`.
6690 References
6691 ----------
6692 .. [1] https://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test
6694 .. [2] https://en.wikipedia.org/wiki/Welch%27s_t-test
6696 .. [3] B. Efron and T. Hastie. Computer Age Statistical Inference. (2016).
6698 .. [4] Yuen, Karen K. "The Two-Sample Trimmed t for Unequal Population
6699 Variances." Biometrika, vol. 61, no. 1, 1974, pp. 165-170. JSTOR,
6700 www.jstor.org/stable/2334299. Accessed 30 Mar. 2021.
6702 .. [5] Yuen, Karen K., and W. J. Dixon. "The Approximate Behaviour and
6703 Performance of the Two-Sample Trimmed t." Biometrika, vol. 60,
6704 no. 2, 1973, pp. 369-374. JSTOR, www.jstor.org/stable/2334550.
6705 Accessed 30 Mar. 2021.
6707 Examples
6708 --------
6709 >>> import numpy as np
6710 >>> from scipy import stats
6711 >>> rng = np.random.default_rng()
6713 Test with sample with identical means:
6715 >>> rvs1 = stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng)
6716 >>> rvs2 = stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng)
6717 >>> stats.ttest_ind(rvs1, rvs2)
6718 Ttest_indResult(statistic=-0.4390847099199348, pvalue=0.6606952038870015)
6719 >>> stats.ttest_ind(rvs1, rvs2, equal_var=False)
6720 Ttest_indResult(statistic=-0.4390847099199348, pvalue=0.6606952553131064)
6722 `ttest_ind` underestimates p for unequal variances:
6724 >>> rvs3 = stats.norm.rvs(loc=5, scale=20, size=500, random_state=rng)
6725 >>> stats.ttest_ind(rvs1, rvs3)
6726 Ttest_indResult(statistic=-1.6370984482905417, pvalue=0.1019251574705033)
6727 >>> stats.ttest_ind(rvs1, rvs3, equal_var=False)
6728 Ttest_indResult(statistic=-1.637098448290542, pvalue=0.10202110497954867)
6730 When ``n1 != n2``, the equal variance t-statistic is no longer equal to the
6731 unequal variance t-statistic:
6733 >>> rvs4 = stats.norm.rvs(loc=5, scale=20, size=100, random_state=rng)
6734 >>> stats.ttest_ind(rvs1, rvs4)
6735 Ttest_indResult(statistic=-1.9481646859513422, pvalue=0.05186270935842703)
6736 >>> stats.ttest_ind(rvs1, rvs4, equal_var=False)
6737 Ttest_indResult(statistic=-1.3146566100751664, pvalue=0.1913495266513811)
6739 T-test with different means, variance, and n:
6741 >>> rvs5 = stats.norm.rvs(loc=8, scale=20, size=100, random_state=rng)
6742 >>> stats.ttest_ind(rvs1, rvs5)
6743 Ttest_indResult(statistic=-2.8415950600298774, pvalue=0.0046418707568707885)
6744 >>> stats.ttest_ind(rvs1, rvs5, equal_var=False)
6745 Ttest_indResult(statistic=-1.8686598649188084, pvalue=0.06434714193919686)
6747 When performing a permutation test, more permutations typically yield
6748 more accurate results. Use a ``np.random.Generator`` to ensure
6749 reproducibility:
6751 >>> stats.ttest_ind(rvs1, rvs5, permutations=10000,
6752 ... random_state=rng)
6753 Ttest_indResult(statistic=-2.8415950600298774, pvalue=0.0052994700529947)
6755 Take these two samples, one of which has an extreme tail.
6757 >>> a = (56, 128.6, 12, 123.8, 64.34, 78, 763.3)
6758 >>> b = (1.1, 2.9, 4.2)
6760 Use the `trim` keyword to perform a trimmed (Yuen) t-test. For example,
6761 using 20% trimming, ``trim=.2``, the test will reduce the impact of one
6762 (``np.floor(trim*len(a))``) element from each tail of sample `a`. It will
6763 have no effect on sample `b` because ``np.floor(trim*len(b))`` is 0.
6765 >>> stats.ttest_ind(a, b, trim=.2)
6766 Ttest_indResult(statistic=3.4463884028073513,
6767 pvalue=0.01369338726499547)
6768 """
6769 if not (0 <= trim < .5):
6770 raise ValueError("Trimming percentage should be 0 <= `trim` < .5.")
6772 a, b, axis = _chk2_asarray(a, b, axis)
6774 # check both a and b
6775 cna, npa = _contains_nan(a, nan_policy)
6776 cnb, npb = _contains_nan(b, nan_policy)
6777 contains_nan = cna or cnb
6778 if npa == 'omit' or npb == 'omit':
6779 nan_policy = 'omit'
6781 if contains_nan and nan_policy == 'omit':
6782 if permutations or trim != 0:
6783 raise ValueError("nan-containing/masked inputs with "
6784 "nan_policy='omit' are currently not "
6785 "supported by permutation tests or "
6786 "trimmed tests.")
6787 a = ma.masked_invalid(a)
6788 b = ma.masked_invalid(b)
6789 return mstats_basic.ttest_ind(a, b, axis, equal_var, alternative)
6791 if a.size == 0 or b.size == 0:
6792 return _ttest_nans(a, b, axis, Ttest_indResult)
6794 if permutations is not None and permutations != 0:
6795 if trim != 0:
6796 raise ValueError("Permutations are currently not supported "
6797 "with trimming.")
6798 if permutations < 0 or (np.isfinite(permutations) and
6799 int(permutations) != permutations):
6800 raise ValueError("Permutations must be a non-negative integer.")
6802 res = _permutation_ttest(a, b, permutations=permutations,
6803 axis=axis, equal_var=equal_var,
6804 nan_policy=nan_policy,
6805 random_state=random_state,
6806 alternative=alternative)
6808 else:
6809 n1 = a.shape[axis]
6810 n2 = b.shape[axis]
6812 if trim == 0:
6813 v1 = _var(a, axis, ddof=1)
6814 v2 = _var(b, axis, ddof=1)
6815 m1 = np.mean(a, axis)
6816 m2 = np.mean(b, axis)
6817 else:
6818 v1, m1, n1 = _ttest_trim_var_mean_len(a, trim, axis)
6819 v2, m2, n2 = _ttest_trim_var_mean_len(b, trim, axis)
6821 if equal_var:
6822 df, denom = _equal_var_ttest_denom(v1, n1, v2, n2)
6823 else:
6824 df, denom = _unequal_var_ttest_denom(v1, n1, v2, n2)
6825 res = _ttest_ind_from_stats(m1, m2, denom, df, alternative)
6826 return Ttest_indResult(*res)
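
# Editor's sketch (illustration only): a hand-rolled version of the randomized
# permutation p-value described in the Notes of `ttest_ind` above -- pool the
# data, reassign labels at random, recompute the statistic, and report the
# achieved significance level with the "+1" adjustment used for randomized
# (non-exact) tests.  Underscore-prefixed names are local to this sketch.
import numpy as np

_rng = np.random.default_rng(12345)
_a = np.array([4.1, 5.2, 6.3, 5.8])
_b = np.array([3.0, 2.8, 3.9])
_t_obs = ttest_ind(_a, _b).statistic
_pooled = np.concatenate([_a, _b])
_n_perm = 199
_count = 0
for _ in range(_n_perm):
    _perm = _rng.permutation(_pooled)
    _t = ttest_ind(_perm[:len(_a)], _perm[len(_a):]).statistic
    if abs(_t) >= abs(_t_obs):           # two-sided comparison
        _count += 1
_p_value = (_count + 1) / (_n_perm + 1)  # "+1" because the test is randomized
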
6829def _ttest_trim_var_mean_len(a, trim, axis):
6830 """Variance, mean, and length of winsorized input along specified axis"""
6831 # for use with `ttest_ind` when trimming.
6832 # further calculations in this test assume that the inputs are sorted.
6833 # From [4] Section 1 "Let x_1, ..., x_n be n ordered observations..."
6834 a = np.sort(a, axis=axis)
6836 # `g` is the number of elements to be replaced on each tail, converted
6837 # from the fraction of elements to trim (`trim`)
6838 n = a.shape[axis]
6839 g = int(n * trim)
6841 # Calculate the Winsorized variance of the input samples according to
6842 # specified `g`
6843 v = _calculate_winsorized_variance(a, g, axis)
6845 # the total number of elements in the trimmed samples
6846 n -= 2 * g
6848 # calculate the g-times trimmed mean, as defined in [4] (1-1)
6849 m = trim_mean(a, trim, axis=axis)
6850 return v, m, n
6853def _calculate_winsorized_variance(a, g, axis):
6854 """Calculates g-times winsorized variance along specified axis"""
6855 # it is expected that the input `a` is sorted along the correct axis
6856 if g == 0:
6857 return _var(a, ddof=1, axis=axis)
6858 # move the intended axis to the end so that it is easier to manipulate
6859 a_win = np.moveaxis(a, axis, -1)
6861 # save where NaNs are for later use.
6862 nans_indices = np.any(np.isnan(a_win), axis=-1)
6864 # Winsorization and variance calculation are done in one step in [4]
6865 # (1-3), but here winsorization is done first; replace the left and
6866 # right sides with the repeating value. This can be seen in effect in (
6867 # 1-3) in [4], where the leftmost and rightmost tails are replaced with
6868 # `(g + 1) * x_{g + 1}` on the left and `(g + 1) * x_{n - g}` on the
6869 # right. Zero-indexing turns `g + 1` to `g`, and `n - g` to `- g - 1` in
6870 # array indexing.
6871 a_win[..., :g] = a_win[..., [g]]
6872 a_win[..., -g:] = a_win[..., [-g - 1]]
6874 # Determine the variance. In [4], the degrees of freedom is expressed as
6875 # `h - 1`, where `h = n - 2g` (unnumbered equations in Section 1, end of
6876 # page 369, beginning of page 370). This is converted to NumPy's format,
6877 # `n - ddof` for use with `np.var`. The result is converted to an
6878 # array to accommodate indexing later.
6879 var_win = np.asarray(_var(a_win, ddof=(2 * g + 1), axis=-1))
6881 # with `nan_policy='propagate'`, NaNs may be completely trimmed out
6882 # because they were sorted into the tail of the array. In these cases,
6883 # replace computed variances with `np.nan`.
6884 var_win[nans_indices] = np.nan
6885 return var_win
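
# Editor's sketch (illustration only): g-times winsorization of a sorted 1-D
# sample, mirroring the tail replacement that `_calculate_winsorized_variance`
# performs above before taking the variance with ``ddof = 2*g + 1``.
import numpy as np

_x = np.sort(np.array([12.0, 56.0, 64.34, 78.0, 123.8, 128.6, 763.3]))
_g = 1                           # number of elements replaced on each tail
_x_win = _x.copy()
_x_win[:_g] = _x[_g]             # left tail -> x_{g+1} (0-indexed: x[g])
_x_win[-_g:] = _x[-_g - 1]       # right tail -> x_{n-g}
_var_win = np.var(_x_win, ddof=2 * _g + 1)  # winsorized variance, Yuen (1974)
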
6888def _permutation_distribution_t(data, permutations, size_a, equal_var,
6889 random_state=None):
6890 """Generation permutation distribution of t statistic"""
6892 random_state = check_random_state(random_state)
6894 # prepare permutation indices
6895 size = data.shape[-1]
6896 # number of distinct combinations
6897 n_max = special.comb(size, size_a)
6899 if permutations < n_max:
6900 perm_generator = (random_state.permutation(size)
6901 for i in range(permutations))
6902 else:
6903 permutations = n_max
6904 perm_generator = (np.concatenate(z)
6905 for z in _all_partitions(size_a, size-size_a))
6907 t_stat = []
6908 for indices in _batch_generator(perm_generator, batch=50):
6909 # get one batch from perm_generator at a time as a list
6910 indices = np.array(indices)
6911 # generate permutations
6912 data_perm = data[..., indices]
6913 # move axis indexing permutations to position 0 to broadcast
6914 # nicely with t_stat_observed, which doesn't have this dimension
6915 data_perm = np.moveaxis(data_perm, -2, 0)
6917 a = data_perm[..., :size_a]
6918 b = data_perm[..., size_a:]
6919 t_stat.append(_calc_t_stat(a, b, equal_var))
6921 t_stat = np.concatenate(t_stat, axis=0)
6923 return t_stat, permutations, n_max
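
# Editor's sketch (illustration only): the exact branch above iterates over
# every distinct assignment of the pooled observations to group `a`; the same
# enumeration can be written with `itertools.combinations` instead of the
# module-private `_all_partitions` helper.  Names here are local to the sketch.
import numpy as np
from math import comb
from itertools import combinations

_size, _size_a = 5, 2
_n_max = comb(_size, _size_a)            # number of distinct partitions
_partitions = []
for _idx_a in combinations(range(_size), _size_a):
    _idx_b = [_i for _i in range(_size) if _i not in _idx_a]
    _partitions.append(np.array(list(_idx_a) + _idx_b))
assert len(_partitions) == _n_max
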
6926def _calc_t_stat(a, b, equal_var, axis=-1):
6927 """Calculate the t statistic along the given dimension."""
6928 na = a.shape[axis]
6929 nb = b.shape[axis]
6930 avg_a = np.mean(a, axis=axis)
6931 avg_b = np.mean(b, axis=axis)
6932 var_a = _var(a, axis=axis, ddof=1)
6933 var_b = _var(b, axis=axis, ddof=1)
6935 if not equal_var:
6936 denom = _unequal_var_ttest_denom(var_a, na, var_b, nb)[1]
6937 else:
6938 denom = _equal_var_ttest_denom(var_a, na, var_b, nb)[1]
6940 return (avg_a-avg_b)/denom
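
# Editor's sketch (illustration only): for the ``equal_var=False`` branch the
# denominator and degrees of freedom follow the standard Welch-Satterthwaite
# formulas, which `_unequal_var_ttest_denom` (defined elsewhere in this
# module) is expected to return.  The numbers below are arbitrary.
import numpy as np

_v1, _n1 = 4.0, 15          # sample variance and size of the first sample
_v2, _n2 = 9.0, 20          # sample variance and size of the second sample
_vn1, _vn2 = _v1 / _n1, _v2 / _n2
_welch_denom = np.sqrt(_vn1 + _vn2)
_welch_df = (_vn1 + _vn2) ** 2 / (_vn1 ** 2 / (_n1 - 1) + _vn2 ** 2 / (_n2 - 1))
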
6943def _permutation_ttest(a, b, permutations, axis=0, equal_var=True,
6944 nan_policy='propagate', random_state=None,
6945 alternative="two-sided"):
6946 """
6947 Calculates the T-test for the means of TWO INDEPENDENT samples of scores
6948 using permutation methods.
6950 This test is similar to `stats.ttest_ind`, except it doesn't rely on an
6951 approximate normality assumption since it uses a permutation test.
6952 This function is only called from ttest_ind when permutations is not None.
6954 Parameters
6955 ----------
6956 a, b : array_like
6957 The arrays must be broadcastable, except along the dimension
6958 corresponding to `axis` (the zeroth, by default).
6959 axis : int, optional
6960 The axis over which to operate on a and b.
6961 permutations : int, optional
6962 Number of permutations used to calculate p-value. If greater than or
6963 equal to the number of distinct permutations, perform an exact test.
6964 equal_var : bool, optional
6965 If True (default), an ordinary pooled-variance t-test is conducted;
6966 if False, Welch's t-test (which does not assume equal variances) is conducted.
6967 random_state : {None, int, `numpy.random.Generator`}, optional
6968 If `random_state` is None, the `numpy.random.RandomState` singleton
6969 is used. If `random_state` is an int, a new ``RandomState`` instance
6970 is used, seeded with `random_state`.
6971 If `random_state` is already a ``Generator`` or ``RandomState``
6972 instance, that instance is used.
6973 Pseudorandom number generator state used for generating random
6974 permutations.
6976 Returns
6977 -------
6978 statistic : float or array
6979 The calculated t-statistic.
6980 pvalue : float or array
6981 The p-value.
6983 """
6984 random_state = check_random_state(random_state)
6986 t_stat_observed = _calc_t_stat(a, b, equal_var, axis=axis)
6988 na = a.shape[axis]
6989 mat = _broadcast_concatenate((a, b), axis=axis)
6990 mat = np.moveaxis(mat, axis, -1)
6992 t_stat, permutations, n_max = _permutation_distribution_t(
6993 mat, permutations, size_a=na, equal_var=equal_var,
6994 random_state=random_state)
6996 compare = {"less": np.less_equal,
6997 "greater": np.greater_equal,
6998 "two-sided": lambda x, y: (x <= -np.abs(y)) | (x >= np.abs(y))}
7000 # Calculate the p-values
7001 cmps = compare[alternative](t_stat, t_stat_observed)
7002 # Randomized test p-value calculation should use biased estimate; see e.g.
7003 # https://www.degruyter.com/document/doi/10.2202/1544-6115.1585/
7004 adjustment = 1 if n_max > permutations else 0
7005 pvalues = (cmps.sum(axis=0) + adjustment) / (permutations + adjustment)
7007 # nans propagate naturally in statistic calculation, but need to be
7008 # propagated manually into pvalues
7009 if nan_policy == 'propagate' and np.isnan(t_stat_observed).any():
7010 if np.ndim(pvalues) == 0:
7011 pvalues = np.float64(np.nan)
7012 else:
7013 pvalues[np.isnan(t_stat_observed)] = np.nan
7015 return (t_stat_observed, pvalues)
7018def _get_len(a, axis, msg):
7019 try:
7020 n = a.shape[axis]
7021 except IndexError:
7022 raise np.AxisError(axis, a.ndim, msg) from None
7023 return n
7026@_axis_nan_policy_factory(pack_TtestResult, default_axis=0, n_samples=2,
7027 result_to_tuple=unpack_TtestResult, n_outputs=6,
7028 paired=True)
7029def ttest_rel(a, b, axis=0, nan_policy='propagate', alternative="two-sided"):
7030 """Calculate the t-test on TWO RELATED samples of scores, a and b.
7032 This is a test for the null hypothesis that two related or
7033 repeated samples have identical average (expected) values.
7035 Parameters
7036 ----------
7037 a, b : array_like
7038 The arrays must have the same shape.
7039 axis : int or None, optional
7040 Axis along which to compute test. If None, compute over the whole
7041 arrays, `a`, and `b`.
7042 nan_policy : {'propagate', 'raise', 'omit'}, optional
7043 Defines how to handle when input contains nan.
7044 The following options are available (default is 'propagate'):
7046 * 'propagate': returns nan
7047 * 'raise': throws an error
7048 * 'omit': performs the calculations ignoring nan values
7049 alternative : {'two-sided', 'less', 'greater'}, optional
7050 Defines the alternative hypothesis.
7051 The following options are available (default is 'two-sided'):
7053 * 'two-sided': the means of the distributions underlying the samples
7054 are unequal.
7055 * 'less': the mean of the distribution underlying the first sample
7056 is less than the mean of the distribution underlying the second
7057 sample.
7058 * 'greater': the mean of the distribution underlying the first
7059 sample is greater than the mean of the distribution underlying
7060 the second sample.
7062 .. versionadded:: 1.6.0
7064 Returns
7065 -------
7066 result : `~scipy.stats._result_classes.TtestResult`
7067 An object with the following attributes:
7069 statistic : float or array
7070 The t-statistic.
7071 pvalue : float or array
7072 The p-value associated with the given alternative.
7073 df : float or array
7074 The number of degrees of freedom used in calculation of the
7075 t-statistic; this is one less than the size of the sample
7076 (``a.shape[axis]``).
7078 .. versionadded:: 1.10.0
7080 The object also has the following method:
7082 confidence_interval(confidence_level=0.95)
7083 Computes a confidence interval around the difference in
7084 population means for the given confidence level.
7085 The confidence interval is returned in a ``namedtuple`` with
7086 fields `low` and `high`.
7088 .. versionadded:: 1.10.0
7090 Notes
7091 -----
7092 Examples for use are scores of the same set of students in
7093 different exams, or repeated sampling from the same units. The
7094 test measures whether the average score differs significantly
7095 across samples (e.g. exams). If we observe a large p-value, for
7096 example greater than 0.05 or 0.1 then we cannot reject the null
7097 hypothesis of identical average scores. If the p-value is smaller
7098 than the threshold, e.g. 1%, 5% or 10%, then we reject the null
7099 hypothesis of equal averages. Small p-values are associated with
7100 large t-statistics.
7102 The t-statistic is calculated as ``np.mean(a - b)/se``, where ``se`` is the
7103 standard error. Therefore, the t-statistic will be positive when the sample
7104 mean of ``a - b`` is greater than zero and negative when the sample mean of
7105 ``a - b`` is less than zero.
7107 References
7108 ----------
7109 https://en.wikipedia.org/wiki/T-test#Dependent_t-test_for_paired_samples
7111 Examples
7112 --------
7113 >>> import numpy as np
7114 >>> from scipy import stats
7115 >>> rng = np.random.default_rng()
7117 >>> rvs1 = stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng)
7118 >>> rvs2 = (stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng)
7119 ... + stats.norm.rvs(scale=0.2, size=500, random_state=rng))
7120 >>> stats.ttest_rel(rvs1, rvs2)
7121 TtestResult(statistic=-0.4549717054410304, pvalue=0.6493274702088672, df=499) # noqa
7122 >>> rvs3 = (stats.norm.rvs(loc=8, scale=10, size=500, random_state=rng)
7123 ... + stats.norm.rvs(scale=0.2, size=500, random_state=rng))
7124 >>> stats.ttest_rel(rvs1, rvs3)
7125 TtestResult(statistic=-5.879467544540889, pvalue=7.540777129099917e-09, df=499) # noqa
7127 """
7128 a, b, axis = _chk2_asarray(a, b, axis)
7130 na = _get_len(a, axis, "first argument")
7131 nb = _get_len(b, axis, "second argument")
7132 if na != nb:
7133 raise ValueError('unequal length arrays')
7135 if na == 0 or nb == 0:
7136 # _axis_nan_policy decorator ensures this only happens with 1d input
7137 return TtestResult(np.nan, np.nan, df=np.nan, alternative=np.nan,
7138 standard_error=np.nan, estimate=np.nan)
7140 n = a.shape[axis]
7141 df = n - 1
7143 d = (a - b).astype(np.float64)
7144 v = _var(d, axis, ddof=1)
7145 dm = np.mean(d, axis)
7146 denom = np.sqrt(v / n)
7148 with np.errstate(divide='ignore', invalid='ignore'):
7149 t = np.divide(dm, denom)
7150 t, prob = _ttest_finish(df, t, alternative)
7152 # when nan_policy='omit', `df` can be different for different axis-slices
7153 df = np.broadcast_to(df, t.shape)[()]
7155 # _axis_nan_policy decorator doesn't play well with strings
7156 alternative_num = {"less": -1, "two-sided": 0, "greater": 1}[alternative]
7157 return TtestResult(t, prob, df=df, alternative=alternative_num,
7158 standard_error=denom, estimate=dm)
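
# Editor's sketch (illustration only): the paired test above is numerically
# equivalent to a one-sample t-test on the differences, which makes a handy
# sanity check.  `ttest_1samp` is assumed to be the function defined earlier
# in this module; underscore-prefixed names are local to the sketch.
import numpy as np

_rng = np.random.default_rng(0)
_a = _rng.normal(loc=5.0, scale=2.0, size=30)
_b = _a + _rng.normal(loc=0.3, scale=1.0, size=30)
_paired = ttest_rel(_a, _b)
_one_sample = ttest_1samp(_a - _b, 0.0)
assert np.isclose(_paired.statistic, _one_sample.statistic)
assert np.isclose(_paired.pvalue, _one_sample.pvalue)
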
7161# Map from names to lambda_ values used in power_divergence().
7162_power_div_lambda_names = {
7163 "pearson": 1,
7164 "log-likelihood": 0,
7165 "freeman-tukey": -0.5,
7166 "mod-log-likelihood": -1,
7167 "neyman": -2,
7168 "cressie-read": 2/3,
7169}
7172def _count(a, axis=None):
7173 """Count the number of non-masked elements of an array.
7175 This function behaves like `np.ma.count`, but is much faster
7176 for ndarrays.
7177 """
7178 if hasattr(a, 'count'):
7179 num = a.count(axis=axis)
7180 if isinstance(num, np.ndarray) and num.ndim == 0:
7181 # In some cases, the `count` method returns a scalar array (e.g.
7182 # np.array(3)), but we want a plain integer.
7183 num = int(num)
7184 else:
7185 if axis is None:
7186 num = a.size
7187 else:
7188 num = a.shape[axis]
7189 return num
7192def _m_broadcast_to(a, shape):
7193 if np.ma.isMaskedArray(a):
7194 return np.ma.masked_array(np.broadcast_to(a, shape),
7195 mask=np.broadcast_to(a.mask, shape))
7196 return np.broadcast_to(a, shape, subok=True)
7199Power_divergenceResult = namedtuple('Power_divergenceResult',
7200 ('statistic', 'pvalue'))
7203def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None):
7204 """Cressie-Read power divergence statistic and goodness of fit test.
7206 This function tests the null hypothesis that the categorical data
7207 has the given frequencies, using the Cressie-Read power divergence
7208 statistic.
7210 Parameters
7211 ----------
7212 f_obs : array_like
7213 Observed frequencies in each category.
7214 f_exp : array_like, optional
7215 Expected frequencies in each category. By default the categories are
7216 assumed to be equally likely.
7217 ddof : int, optional
7218 "Delta degrees of freedom": adjustment to the degrees of freedom
7219 for the p-value. The p-value is computed using a chi-squared
7220 distribution with ``k - 1 - ddof`` degrees of freedom, where `k`
7221 is the number of observed frequencies. The default value of `ddof`
7222 is 0.
7223 axis : int or None, optional
7224 The axis of the broadcast result of `f_obs` and `f_exp` along which to
7225 apply the test. If axis is None, all values in `f_obs` are treated
7226 as a single data set. Default is 0.
7227 lambda_ : float or str, optional
7228 The power in the Cressie-Read power divergence statistic. The default
7229 is 1. For convenience, `lambda_` may be assigned one of the following
7230 strings, in which case the corresponding numerical value is used:
7232 * ``"pearson"`` (value 1)
7233 Pearson's chi-squared statistic. In this case, the function is
7234 equivalent to `chisquare`.
7235 * ``"log-likelihood"`` (value 0)
7236 Log-likelihood ratio. Also known as the G-test [3]_.
7237 * ``"freeman-tukey"`` (value -1/2)
7238 Freeman-Tukey statistic.
7239 * ``"mod-log-likelihood"`` (value -1)
7240 Modified log-likelihood ratio.
7241 * ``"neyman"`` (value -2)
7242 Neyman's statistic.
7243 * ``"cressie-read"`` (value 2/3)
7244 The power recommended in [5]_.
7246 Returns
7247 -------
7248 statistic : float or ndarray
7249 The Cressie-Read power divergence test statistic. The value is
7250 a float if `axis` is None or if `f_obs` and `f_exp` are 1-D.
7251 pvalue : float or ndarray
7252 The p-value of the test. The value is a float if `ddof` and the
7253 return value `stat` are scalars.
7255 See Also
7256 --------
7257 chisquare
7259 Notes
7260 -----
7261 This test is invalid when the observed or expected frequencies in each
7262 category are too small. A typical rule is that all of the observed
7263 and expected frequencies should be at least 5.
7265 Also, the sum of the observed and expected frequencies must be the same
7266 for the test to be valid; `power_divergence` raises an error if the sums
7267 do not agree within a relative tolerance of ``1e-8``.
7269 When `lambda_` is less than zero, the formula for the statistic involves
7270 dividing by `f_obs`, so a warning or error may be generated if any value
7271 in `f_obs` is 0.
7273 Similarly, a warning or error may be generated if any value in `f_exp` is
7274 zero when `lambda_` >= 0.
7276 The default degrees of freedom, k-1, are for the case when no parameters
7277 of the distribution are estimated. If p parameters are estimated by
7278 efficient maximum likelihood then the correct degrees of freedom are
7279 k-1-p. If the parameters are estimated in a different way, then the
7280 dof can be between k-1-p and k-1. However, it is also possible that
7281 the asymptotic distribution is not a chisquare, in which case this
7282 test is not appropriate.
7284 This function handles masked arrays. If an element of `f_obs` or `f_exp`
7285 is masked, then data at that position is ignored, and does not count
7286 towards the size of the data set.
7288 .. versionadded:: 0.13.0
7290 References
7291 ----------
7292 .. [1] Lowry, Richard. "Concepts and Applications of Inferential
7293 Statistics". Chapter 8.
7294 https://web.archive.org/web/20171015035606/http://faculty.vassar.edu/lowry/ch8pt1.html
7295 .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test
7296 .. [3] "G-test", https://en.wikipedia.org/wiki/G-test
7297 .. [4] Sokal, R. R. and Rohlf, F. J. "Biometry: the principles and
7298 practice of statistics in biological research", New York: Freeman
7299 (1981)
7300 .. [5] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
7301 Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
7302 pp. 440-464.
7304 Examples
7305 --------
7306 (See `chisquare` for more examples.)
7308 When just `f_obs` is given, it is assumed that the expected frequencies
7309 are uniform and given by the mean of the observed frequencies. Here we
7310 perform a G-test (i.e. use the log-likelihood ratio statistic):
7312 >>> import numpy as np
7313 >>> from scipy.stats import power_divergence
7314 >>> power_divergence([16, 18, 16, 14, 12, 12], lambda_='log-likelihood')
7315 (2.006573162632538, 0.84823476779463769)
7317 The expected frequencies can be given with the `f_exp` argument:
7319 >>> power_divergence([16, 18, 16, 14, 12, 12],
7320 ... f_exp=[16, 16, 16, 16, 16, 8],
7321 ... lambda_='log-likelihood')
7322 (3.3281031458963746, 0.6495419288047497)
7324 When `f_obs` is 2-D, by default the test is applied to each column.
7326 >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T
7327 >>> obs.shape
7328 (6, 2)
7329 >>> power_divergence(obs, lambda_="log-likelihood")
7330 (array([ 2.00657316, 6.77634498]), array([ 0.84823477, 0.23781225]))
7332 By setting ``axis=None``, the test is applied to all data in the array,
7333 which is equivalent to applying the test to the flattened array.
7335 >>> power_divergence(obs, axis=None)
7336 (23.31034482758621, 0.015975692534127565)
7337 >>> power_divergence(obs.ravel())
7338 (23.31034482758621, 0.015975692534127565)
7340 `ddof` is the change to make to the default degrees of freedom.
7342 >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=1)
7343 (2.0, 0.73575888234288467)
7345 The calculation of the p-values is done by broadcasting the
7346 test statistic with `ddof`.
7348 >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=[0,1,2])
7349 (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ]))
7351 `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has
7352 shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting
7353 `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared
7354 statistics, we must use ``axis=1``:
7356 >>> power_divergence([16, 18, 16, 14, 12, 12],
7357 ... f_exp=[[16, 16, 16, 16, 16, 8],
7358 ... [8, 20, 20, 16, 12, 12]],
7359 ... axis=1)
7360 (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846]))
7362 """
7363 # Convert the input argument `lambda_` to a numerical value.
7364 if isinstance(lambda_, str):
7365 if lambda_ not in _power_div_lambda_names:
7366 names = repr(list(_power_div_lambda_names.keys()))[1:-1]
7367 raise ValueError("invalid string for lambda_: {0!r}. "
7368 "Valid strings are {1}".format(lambda_, names))
7369 lambda_ = _power_div_lambda_names[lambda_]
7370 elif lambda_ is None:
7371 lambda_ = 1
7373 f_obs = np.asanyarray(f_obs)
7374 f_obs_float = f_obs.astype(np.float64)
7376 if f_exp is not None:
7377 f_exp = np.asanyarray(f_exp)
7378 bshape = _broadcast_shapes(f_obs_float.shape, f_exp.shape)
7379 f_obs_float = _m_broadcast_to(f_obs_float, bshape)
7380 f_exp = _m_broadcast_to(f_exp, bshape)
7381 rtol = 1e-8 # to pass existing tests
7382 with np.errstate(invalid='ignore'):
7383 f_obs_sum = f_obs_float.sum(axis=axis)
7384 f_exp_sum = f_exp.sum(axis=axis)
7385 relative_diff = (np.abs(f_obs_sum - f_exp_sum) /
7386 np.minimum(f_obs_sum, f_exp_sum))
7387 diff_gt_tol = (relative_diff > rtol).any()
7388 if diff_gt_tol:
7389 msg = (f"For each axis slice, the sum of the observed "
7390 f"frequencies must agree with the sum of the "
7391 f"expected frequencies to a relative tolerance "
7392 f"of {rtol}, but the percent differences are:\n"
7393 f"{relative_diff}")
7394 raise ValueError(msg)
7396 else:
7397 # Ignore 'invalid' errors so the edge case of a data set with length 0
7398 # is handled without spurious warnings.
7399 with np.errstate(invalid='ignore'):
7400 f_exp = f_obs.mean(axis=axis, keepdims=True)
7402 # `terms` is the array of terms that are summed along `axis` to create
7403 # the test statistic. We use some specialized code for a few special
7404 # cases of lambda_.
7405 if lambda_ == 1:
7406 # Pearson's chi-squared statistic
7407 terms = (f_obs_float - f_exp)**2 / f_exp
7408 elif lambda_ == 0:
7409 # Log-likelihood ratio (i.e. G-test)
7410 terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp)
7411 elif lambda_ == -1:
7412 # Modified log-likelihood ratio
7413 terms = 2.0 * special.xlogy(f_exp, f_exp / f_obs)
7414 else:
7415 # General Cressie-Read power divergence.
7416 terms = f_obs * ((f_obs / f_exp)**lambda_ - 1)
7417 terms /= 0.5 * lambda_ * (lambda_ + 1)
7419 stat = terms.sum(axis=axis)
7421 num_obs = _count(terms, axis=axis)
7422 ddof = asarray(ddof)
7423 p = distributions.chi2.sf(stat, num_obs - 1 - ddof)
7425 return Power_divergenceResult(stat, p)
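
# Editor's sketch (illustration only): the log-likelihood (G-test) branch of
# `power_divergence` computed by hand, ``2 * sum(f_obs * log(f_obs / f_exp))``
# with uniform expected frequencies, checked against the function above.
import numpy as np

_f_obs = np.array([16, 18, 16, 14, 12, 12], dtype=float)
_f_exp = np.full_like(_f_obs, _f_obs.mean())
_g_manual = 2.0 * np.sum(_f_obs * np.log(_f_obs / _f_exp))
_g_stat, _g_p = power_divergence(_f_obs, lambda_='log-likelihood')
assert np.isclose(_g_manual, _g_stat)
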
7428def chisquare(f_obs, f_exp=None, ddof=0, axis=0):
7429 """Calculate a one-way chi-square test.
7431 The chi-square test tests the null hypothesis that the categorical data
7432 has the given frequencies.
7434 Parameters
7435 ----------
7436 f_obs : array_like
7437 Observed frequencies in each category.
7438 f_exp : array_like, optional
7439 Expected frequencies in each category. By default the categories are
7440 assumed to be equally likely.
7441 ddof : int, optional
7442 "Delta degrees of freedom": adjustment to the degrees of freedom
7443 for the p-value. The p-value is computed using a chi-squared
7444 distribution with ``k - 1 - ddof`` degrees of freedom, where `k`
7445 is the number of observed frequencies. The default value of `ddof`
7446 is 0.
7447 axis : int or None, optional
7448 The axis of the broadcast result of `f_obs` and `f_exp` along which to
7449 apply the test. If axis is None, all values in `f_obs` are treated
7450 as a single data set. Default is 0.
7452 Returns
7453 -------
7454 chisq : float or ndarray
7455 The chi-squared test statistic. The value is a float if `axis` is
7456 None or `f_obs` and `f_exp` are 1-D.
7457 p : float or ndarray
7458 The p-value of the test. The value is a float if `ddof` and the
7459 return value `chisq` are scalars.
7461 See Also
7462 --------
7463 scipy.stats.power_divergence
7464 scipy.stats.fisher_exact : Fisher exact test on a 2x2 contingency table.
7465 scipy.stats.barnard_exact : An unconditional exact test. An alternative
7466 to chi-squared test for small sample sizes.
7468 Notes
7469 -----
7470 This test is invalid when the observed or expected frequencies in each
7471 category are too small. A typical rule is that all of the observed
7472 and expected frequencies should be at least 5. According to [3]_, the
7473 total number of samples is recommended to be greater than 13,
7474 otherwise exact tests (such as Barnard's Exact test) should be used
7475 because they do not overreject.
7477 Also, the sum of the observed and expected frequencies must be the same
7478 for the test to be valid; `chisquare` raises an error if the sums do not
7479 agree within a relative tolerance of ``1e-8``.
7481 The default degrees of freedom, k-1, are for the case when no parameters
7482 of the distribution are estimated. If p parameters are estimated by
7483 efficient maximum likelihood then the correct degrees of freedom are
7484 k-1-p. If the parameters are estimated in a different way, then the
7485 dof can be between k-1-p and k-1. However, it is also possible that
7486 the asymptotic distribution is not chi-square, in which case this test
7487 is not appropriate.
7489 References
7490 ----------
7491 .. [1] Lowry, Richard. "Concepts and Applications of Inferential
7492 Statistics". Chapter 8.
7493 https://web.archive.org/web/20171022032306/http://vassarstats.net:80/textbook/ch8pt1.html
7494 .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test
7495 .. [3] Pearson, Karl. "On the criterion that a given system of deviations from the probable
7496 in the case of a correlated system of variables is such that it can be reasonably
7497 supposed to have arisen from random sampling", Philosophical Magazine. Series 5. 50
7498 (1900), pp. 157-175.
7500 Examples
7501 --------
7502 When just `f_obs` is given, it is assumed that the expected frequencies
7503 are uniform and given by the mean of the observed frequencies.
7505 >>> import numpy as np
7506 >>> from scipy.stats import chisquare
7507 >>> chisquare([16, 18, 16, 14, 12, 12])
7508 (2.0, 0.84914503608460956)
7510 With `f_exp` the expected frequencies can be given.
7512 >>> chisquare([16, 18, 16, 14, 12, 12], f_exp=[16, 16, 16, 16, 16, 8])
7513 (3.5, 0.62338762774958223)
7515 When `f_obs` is 2-D, by default the test is applied to each column.
7517 >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T
7518 >>> obs.shape
7519 (6, 2)
7520 >>> chisquare(obs)
7521 (array([ 2. , 6.66666667]), array([ 0.84914504, 0.24663415]))
7523 By setting ``axis=None``, the test is applied to all data in the array,
7524 which is equivalent to applying the test to the flattened array.
7526 >>> chisquare(obs, axis=None)
7527 (23.31034482758621, 0.015975692534127565)
7528 >>> chisquare(obs.ravel())
7529 (23.31034482758621, 0.015975692534127565)
7531 `ddof` is the change to make to the default degrees of freedom.
7533 >>> chisquare([16, 18, 16, 14, 12, 12], ddof=1)
7534 (2.0, 0.73575888234288467)
7536 The calculation of the p-values is done by broadcasting the
7537 chi-squared statistic with `ddof`.
7539 >>> chisquare([16, 18, 16, 14, 12, 12], ddof=[0,1,2])
7540 (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ]))
7542 `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has
7543 shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting
7544 `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared
7545 statistics, we use ``axis=1``:
7547 >>> chisquare([16, 18, 16, 14, 12, 12],
7548 ... f_exp=[[16, 16, 16, 16, 16, 8], [8, 20, 20, 16, 12, 12]],
7549 ... axis=1)
7550 (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846]))
7552 """
7553 return power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis,
7554 lambda_="pearson")
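
# Editor's sketch (illustration only): Pearson's statistic and its p-value
# reproduced by hand for the default case (uniform expected frequencies)
# handled by `chisquare` above.  The chi-squared survival function comes from
# the module's `distributions` import, as used in `power_divergence`.
import numpy as np

_f_obs = np.array([16, 18, 16, 14, 12, 12], dtype=float)
_f_exp = np.full_like(_f_obs, _f_obs.mean())
_stat_manual = np.sum((_f_obs - _f_exp) ** 2 / _f_exp)
_p_manual = distributions.chi2.sf(_stat_manual, len(_f_obs) - 1)
_stat, _p = chisquare(_f_obs)
assert np.isclose(_stat_manual, _stat) and np.isclose(_p_manual, _p)
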
7557KstestResult = _make_tuple_bunch('KstestResult', ['statistic', 'pvalue'],
7558 ['statistic_location', 'statistic_sign'])
7561def _compute_dplus(cdfvals, x):
7562 """Computes D+ as used in the Kolmogorov-Smirnov test.
7564 Parameters
7565 ----------
7566 cdfvals : array_like
7567 Sorted array of CDF values between 0 and 1
7568 x: array_like
7569 Sorted array of the stochastic variable itself
7571 Returns
7572 -------
7573 res: Pair with the following elements:
7574 - The maximum distance of the CDF values below Uniform(0, 1).
7575 - The location at which the maximum is reached.
7577 """
7578 n = len(cdfvals)
7579 dplus = (np.arange(1.0, n + 1) / n - cdfvals)
7580 amax = dplus.argmax()
7581 loc_max = x[amax]
7582 return (dplus[amax], loc_max)
7585def _compute_dminus(cdfvals, x):
7586 """Computes D- as used in the Kolmogorov-Smirnov test.
7588 Parameters
7589 ----------
7590 cdfvals : array_like
7591 Sorted array of CDF values between 0 and 1
7592 x: array_like
7593 Sorted array of the stochastic variable itself
7595 Returns
7596 -------
7597 res: Pair with the following elements:
7598 - Maximum distance of the CDF values above Uniform(0, 1)
7599 - The location at which the maximum is reached.
7600 """
7601 n = len(cdfvals)
7602 dminus = (cdfvals - np.arange(0.0, n)/n)
7603 amax = dminus.argmax()
7604 loc_max = x[amax]
7605 return (dminus[amax], loc_max)
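
# Editor's sketch (illustration only): D+ and D- for a tiny sorted sample
# tested against the Uniform(0, 1) CDF (for which the CDF values equal the
# observations themselves), checked against the two helpers above.
import numpy as np

_x = np.sort(np.array([0.1, 0.35, 0.5, 0.8]))
_cdfvals = _x                                  # Uniform(0, 1) CDF is identity
_n = len(_x)
_dplus = np.arange(1.0, _n + 1) / _n - _cdfvals
_dminus = _cdfvals - np.arange(0.0, _n) / _n
assert np.isclose(_dplus.max(), _compute_dplus(_cdfvals, _x)[0])
assert np.isclose(_dminus.max(), _compute_dminus(_cdfvals, _x)[0])
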
7608@_rename_parameter("mode", "method")
7609def ks_1samp(x, cdf, args=(), alternative='two-sided', method='auto'):
7610 """
7611 Performs the one-sample Kolmogorov-Smirnov test for goodness of fit.
7613 This test compares the underlying distribution F(x) of a sample
7614 against a given continuous distribution G(x). See Notes for a description
7615 of the available null and alternative hypotheses.
7617 Parameters
7618 ----------
7619 x : array_like
7620 a 1-D array of observations of iid random variables.
7621 cdf : callable
7622 callable used to calculate the cdf.
7623 args : tuple, sequence, optional
7624 Distribution parameters, used with `cdf`.
7625 alternative : {'two-sided', 'less', 'greater'}, optional
7626 Defines the null and alternative hypotheses. Default is 'two-sided'.
7627 Please see explanations in the Notes below.
7628 method : {'auto', 'exact', 'approx', 'asymp'}, optional
7629 Defines the distribution used for calculating the p-value.
7630 The following options are available (default is 'auto'):
7632 * 'auto' : selects one of the other options.
7633 * 'exact' : uses the exact distribution of test statistic.
7634 * 'approx' : approximates the two-sided probability with twice
7635 the one-sided probability
7636 * 'asymp': uses asymptotic distribution of test statistic
7638 Returns
7639 -------
7640 res: KstestResult
7641 An object containing attributes:
7643 statistic : float
7644 KS test statistic, either D+, D-, or D (the maximum of the two)
7645 pvalue : float
7646 One-tailed or two-tailed p-value.
7647 statistic_location : float
7648 Value of `x` corresponding with the KS statistic; i.e., the
7649 distance between the empirical distribution function and the
7650 hypothesized cumulative distribution function is measured at this
7651 observation.
7652 statistic_sign : int
7653 +1 if the KS statistic is the maximum positive difference between
7654 the empirical distribution function and the hypothesized cumulative
7655 distribution function (D+); -1 if the KS statistic is the maximum
7656 negative difference (D-).
7659 See Also
7660 --------
7661 ks_2samp, kstest
7663 Notes
7664 -----
7665 There are three options for the null and corresponding alternative
7666 hypothesis that can be selected using the `alternative` parameter.
7668 - `two-sided`: The null hypothesis is that the two distributions are
7669 identical, F(x)=G(x) for all x; the alternative is that they are not
7670 identical.
7672 - `less`: The null hypothesis is that F(x) >= G(x) for all x; the
7673 alternative is that F(x) < G(x) for at least one x.
7675 - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the
7676 alternative is that F(x) > G(x) for at least one x.
7678 Note that the alternative hypotheses describe the *CDFs* of the
7679 underlying distributions, not the observed values. For example,
7680 suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in
7681 x1 tend to be less than those in x2.
7683 Examples
7684 --------
7685 Suppose we wish to test the null hypothesis that a sample is distributed
7686 according to the standard normal.
7687 We choose a confidence level of 95%; that is, we will reject the null
7688 hypothesis in favor of the alternative if the p-value is less than 0.05.
7690 When testing uniformly distributed data, we would expect the
7691 null hypothesis to be rejected.
7693 >>> import numpy as np
7694 >>> from scipy import stats
7695 >>> rng = np.random.default_rng()
7696 >>> stats.ks_1samp(stats.uniform.rvs(size=100, random_state=rng),
7697 ... stats.norm.cdf)
7698 KstestResult(statistic=0.5001899973268688, pvalue=1.1616392184763533e-23)
7700 Indeed, the p-value is lower than our threshold of 0.05, so we reject the
7701 null hypothesis in favor of the default "two-sided" alternative: the data
7702 are *not* distributed according to the standard normal.
7704 When testing random variates from the standard normal distribution, we
7705 expect the data to be consistent with the null hypothesis most of the time.
7707 >>> x = stats.norm.rvs(size=100, random_state=rng)
7708 >>> stats.ks_1samp(x, stats.norm.cdf)
7709 KstestResult(statistic=0.05345882212970396, pvalue=0.9227159037744717)
7711 As expected, the p-value of 0.92 is not below our threshold of 0.05, so
7712 we cannot reject the null hypothesis.
7714 Suppose, however, that the random variates are distributed according to
7715 a normal distribution that is shifted toward greater values. In this case,
7716 the cumulative distribution function (CDF) of the underlying distribution tends
7717 to be *less* than the CDF of the standard normal. Therefore, we would
7718 expect the null hypothesis to be rejected with ``alternative='less'``:
7720 >>> x = stats.norm.rvs(size=100, loc=0.5, random_state=rng)
7721 >>> stats.ks_1samp(x, stats.norm.cdf, alternative='less')
7722 KstestResult(statistic=0.17482387821055168, pvalue=0.001913921057766743)
7724 and indeed, with p-value smaller than our threshold, we reject the null
7725 hypothesis in favor of the alternative.
7727 """
7728 mode = method
7730 alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
7731 alternative.lower()[0], alternative)
7732 if alternative not in ['two-sided', 'greater', 'less']:
7733 raise ValueError("Unexpected alternative %s" % alternative)
7734 if np.ma.is_masked(x):
7735 x = x.compressed()
7737 N = len(x)
7738 x = np.sort(x)
7739 cdfvals = cdf(x, *args)
7741 if alternative == 'greater':
7742 Dplus, d_location = _compute_dplus(cdfvals, x)
7743 return KstestResult(Dplus, distributions.ksone.sf(Dplus, N),
7744 statistic_location=d_location,
7745 statistic_sign=1)
7747 if alternative == 'less':
7748 Dminus, d_location = _compute_dminus(cdfvals, x)
7749 return KstestResult(Dminus, distributions.ksone.sf(Dminus, N),
7750 statistic_location=d_location,
7751 statistic_sign=-1)
7753 # alternative == 'two-sided':
7754 Dplus, dplus_location = _compute_dplus(cdfvals, x)
7755 Dminus, dminus_location = _compute_dminus(cdfvals, x)
7756 if Dplus > Dminus:
7757 D = Dplus
7758 d_location = dplus_location
7759 d_sign = 1
7760 else:
7761 D = Dminus
7762 d_location = dminus_location
7763 d_sign = -1
7765 if mode == 'auto': # Always select exact
7766 mode = 'exact'
7767 if mode == 'exact':
7768 prob = distributions.kstwo.sf(D, N)
7769 elif mode == 'asymp':
7770 prob = distributions.kstwobign.sf(D * np.sqrt(N))
7771 else:
7772 # mode == 'approx'
7773 prob = 2 * distributions.ksone.sf(D, N)
7774 prob = np.clip(prob, 0, 1)
7775 return KstestResult(D, prob,
7776 statistic_location=d_location,
7777 statistic_sign=d_sign)
7780Ks_2sampResult = KstestResult
7783def _compute_prob_outside_square(n, h):
7784 """
7785 Compute the proportion of paths that pass outside the two diagonal lines.
7787 Parameters
7788 ----------
7789 n : integer
7790 n > 0
7791 h : integer
7792 0 <= h <= n
7794 Returns
7795 -------
7796 p : float
7797 The proportion of paths that pass outside the lines x-y = +/-h.
7799 """
7800 # Compute Pr(D_{n,n} >= h/n)
7801 # Prob = 2 * ( binom(2n, n-h) - binom(2n, n-2h) + binom(2n, n-3h) - ... )
7802 # / binom(2n, n)
7803 # This formulation exhibits subtractive cancellation.
7804 # Instead divide each term by binom(2n, n), then factor common terms
7805 # and use a Horner-like algorithm
7806 # P = 2 * A0 * (1 - A1*(1 - A2*(1 - A3*(1 - A4*(...)))))
7808 P = 0.0
7809 k = int(np.floor(n / h))
7810 while k >= 0:
7811 p1 = 1.0
7812 # Each of the Ai terms has numerator and denominator with
7813 # h simple terms.
7814 for j in range(h):
7815 p1 = (n - k * h - j) * p1 / (n + k * h + j + 1)
7816 P = p1 * (1.0 - P)
7817 k -= 1
7818 return 2 * P
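
# Editor's sketch (small n only): the direct alternating-binomial form of the
# probability noted in the comments above; the function reorganizes it to
# avoid subtractive cancellation, and the two agree for tiny inputs.
from math import comb

_n, _h = 10, 3
_direct = 2 * sum((-1) ** (_j + 1) * comb(2 * _n, _n - _j * _h)
                  for _j in range(1, _n // _h + 1)) / comb(2 * _n, _n)
assert abs(_direct - _compute_prob_outside_square(_n, _h)) < 1e-12
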
7821def _count_paths_outside_method(m, n, g, h):
7822 """Count the number of paths that pass outside the specified diagonal.
7824 Parameters
7825 ----------
7826 m : integer
7827 m > 0
7828 n : integer
7829 n > 0
7830 g : integer
7831 g is greatest common divisor of m and n
7832 h : integer
7833 0 <= h <= lcm(m,n)
7835 Returns
7836 -------
7837 p : float
7838 The number of paths that pass below the specified diagonal.
7839 The calculation may overflow; check for a finite answer.
7841 Notes
7842 -----
7843 Count the integer lattice paths from (0, 0) to (m, n), which at some
7844 point (x, y) along the path, satisfy:
7845 m*y <= n*x - h*g
7846 The paths make steps of size +1 in either positive x or positive y
7847 directions.
7849 We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk.
7850 Hodges, J.L. Jr.,
7851 "The Significance Probability of the Smirnov Two-Sample Test,"
7852 Arkiv för Matematik, 3, No. 43 (1958), 469-86.
7854 """
7855 # Compute #paths which stay lower than x/m-y/n = h/lcm(m,n)
7856 # B(x, y) = #{paths from (0,0) to (x,y) without
7857 # previously crossing the boundary}
7858 # = binom(x, y) - #{paths which already reached the boundary}
7859 # Multiply by the number of path extensions going from (x, y) to (m, n)
7860 # Sum.
7862 # Probability is symmetrical in m, n. Computation below assumes m >= n.
7863 if m < n:
7864 m, n = n, m
7865 mg = m // g
7866 ng = n // g
7868 # Not every x needs to be considered.
7869 # xj holds the list of x values to be checked.
7870 # Wherever n*x/m + ng*h crosses an integer
7871 lxj = n + (mg-h)//mg
7872 xj = [(h + mg * j + ng-1)//ng for j in range(lxj)]
7873 # B is an array just holding a few values of B(x,y), the ones needed.
7874 # B[j] == B(x_j, j)
7875 if lxj == 0:
7876 return special.binom(m + n, n)
7877 B = np.zeros(lxj)
7878 B[0] = 1
7879 # Compute the B(x, y) terms
7880 for j in range(1, lxj):
7881 Bj = special.binom(xj[j] + j, j)
7882 for i in range(j):
7883 bin = special.binom(xj[j] - xj[i] + j - i, j-i)
7884 Bj -= bin * B[i]
7885 B[j] = Bj
7886 # Compute the number of path extensions...
7887 num_paths = 0
7888 for j in range(lxj):
7889 bin = special.binom((m-xj[j]) + (n - j), n-j)
7890 term = B[j] * bin
7891 num_paths += term
7892 return num_paths
7895def _attempt_exact_2kssamp(n1, n2, g, d, alternative):
7896 """Attempts to compute the exact 2sample probability.
7898 n1, n2 are the sample sizes
7899 g is the gcd(n1, n2)
7900 d is the computed max difference in ECDFs
7902 Returns (success, d, probability)
7903 """
7904 lcm = (n1 // g) * n2
7905 h = int(np.round(d * lcm))
7906 d = h * 1.0 / lcm
7907 if h == 0:
7908 return True, d, 1.0
7909 saw_fp_error, prob = False, np.nan
7910 try:
7911 with np.errstate(invalid="raise", over="raise"):
7912 if alternative == 'two-sided':
7913 if n1 == n2:
7914 prob = _compute_prob_outside_square(n1, h)
7915 else:
7916 prob = _compute_outer_prob_inside_method(n1, n2, g, h)
7917 else:
7918 if n1 == n2:
7919 # prob = binom(2n, n-h) / binom(2n, n)
7920 # Evaluating in that form incurs roundoff errors
7921 # from special.binom. Instead calculate directly
7922 jrange = np.arange(h)
7923 prob = np.prod((n1 - jrange) / (n1 + jrange + 1.0))
7924 else:
7925 with np.errstate(over='raise'):
7926 num_paths = _count_paths_outside_method(n1, n2, g, h)
7927 bin = special.binom(n1 + n2, n1)
7928 if num_paths > bin or np.isinf(bin):
7929 saw_fp_error = True
7930 else:
7931 prob = num_paths / bin
7933 except (FloatingPointError, OverflowError):
7934 saw_fp_error = True
7936 if saw_fp_error:
7937 return False, d, np.nan
7938 if not (0 <= prob <= 1):
7939 return False, d, prob
7940 return True, d, prob
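
# Editor's sketch (illustration only): for equal sample sizes the one-sided
# branch above evaluates ``binom(2n, n-h) / binom(2n, n)`` as a running
# product to avoid round-off from large binomials; a direct check for small n.
import numpy as np
from math import comb

_n, _h = 12, 4
_jrange = np.arange(_h)
_prob_product = np.prod((_n - _jrange) / (_n + _jrange + 1.0))
_prob_binom = comb(2 * _n, _n - _h) / comb(2 * _n, _n)
assert np.isclose(_prob_product, _prob_binom)
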
7943@_rename_parameter("mode", "method")
7944def ks_2samp(data1, data2, alternative='two-sided', method='auto'):
7945 """
7946 Performs the two-sample Kolmogorov-Smirnov test for goodness of fit.
7948 This test compares the underlying continuous distributions F(x) and G(x)
7949 of two independent samples. See Notes for a description of the available
7950 null and alternative hypotheses.
7952 Parameters
7953 ----------
7954 data1, data2 : array_like, 1-Dimensional
7955 Two arrays of sample observations assumed to be drawn from a continuous
7956 distribution; the sample sizes can be different.
7957 alternative : {'two-sided', 'less', 'greater'}, optional
7958 Defines the null and alternative hypotheses. Default is 'two-sided'.
7959 Please see explanations in the Notes below.
7960 method : {'auto', 'exact', 'asymp'}, optional
7961 Defines the method used for calculating the p-value.
7962 The following options are available (default is 'auto'):
7964 * 'auto' : use 'exact' for small size arrays, 'asymp' for large
7965 * 'exact' : use exact distribution of test statistic
7966 * 'asymp' : use asymptotic distribution of test statistic
7968 Returns
7969 -------
7970 res: KstestResult
7971 An object containing attributes:
7973 statistic : float
7974 KS test statistic.
7975 pvalue : float
7976 One-tailed or two-tailed p-value.
7977 statistic_location : float
7978 Value from `data1` or `data2` corresponding with the KS statistic;
7979 i.e., the distance between the empirical distribution functions is
7980 measured at this observation.
7981 statistic_sign : int
7982 +1 if the empirical distribution function of `data1` exceeds
7983 the empirical distribution function of `data2` at
7984 `statistic_location`, otherwise -1.
7986 See Also
7987 --------
7988 kstest, ks_1samp, epps_singleton_2samp, anderson_ksamp
7990 Notes
7991 -----
7992 There are three options for the null and corresponding alternative
7993 hypothesis that can be selected using the `alternative` parameter.
7995 - `less`: The null hypothesis is that F(x) >= G(x) for all x; the
7996 alternative is that F(x) < G(x) for at least one x. The statistic
7997 is the magnitude of the minimum (most negative) difference between the
7998 empirical distribution functions of the samples.
8000 - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the
8001 alternative is that F(x) > G(x) for at least one x. The statistic
8002 is the maximum (most positive) difference between the empirical
8003 distribution functions of the samples.
8005 - `two-sided`: The null hypothesis is that the two distributions are
8006 identical, F(x)=G(x) for all x; the alternative is that they are not
8007 identical. The statistic is the maximum absolute difference between the
8008 empirical distribution functions of the samples.
8010 Note that the alternative hypotheses describe the *CDFs* of the
8011 underlying distributions, not the observed values of the data. For example,
8012 suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in
8013 x1 tend to be less than those in x2.
8015 If the KS statistic is large, then the p-value will be small, and this may
8016 be taken as evidence against the null hypothesis in favor of the
8017 alternative.
8019 If ``method='exact'``, `ks_2samp` attempts to compute an exact p-value,
8020 that is, the probability under the null hypothesis of obtaining a test
8021 statistic value as extreme as the value computed from the data.
8022 If ``method='asymp'``, the asymptotic Kolmogorov-Smirnov distribution is
8023 used to compute an approximate p-value.
8024 If ``method='auto'``, an exact p-value computation is attempted if both
8025 sample sizes are less than 10000; otherwise, the asymptotic method is used.
8026 In any case, if an exact p-value calculation is attempted and fails, a
8027 warning will be emitted, and the asymptotic p-value will be returned.
8029 The 'two-sided' 'exact' computation computes the complementary probability
8030 and then subtracts from 1. As such, the minimum probability it can return
8031 is about 1e-16. While the algorithm itself is exact, numerical
8032 errors may accumulate for large sample sizes. It is most suited to
8033 situations in which one of the sample sizes is only a few thousand.
8035 We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk [1]_.
8037 References
8038 ----------
8039 .. [1] Hodges, J.L. Jr., "The Significance Probability of the Smirnov
8040 Two-Sample Test," Arkiv för Matematik, 3, No. 43 (1958), 469-86.
8042 Examples
8043 --------
8044 Suppose we wish to test the null hypothesis that two samples were drawn
8045 from the same distribution.
8046 We choose a confidence level of 95%; that is, we will reject the null
8047 hypothesis in favor of the alternative if the p-value is less than 0.05.
8049 If the first sample were drawn from a uniform distribution and the second
8050 were drawn from the standard normal, we would expect the null hypothesis
8051 to be rejected.
8053 >>> import numpy as np
8054 >>> from scipy import stats
8055 >>> rng = np.random.default_rng()
8056 >>> sample1 = stats.uniform.rvs(size=100, random_state=rng)
8057 >>> sample2 = stats.norm.rvs(size=110, random_state=rng)
8058 >>> stats.ks_2samp(sample1, sample2)
8059 KstestResult(statistic=0.5454545454545454, pvalue=7.37417839555191e-15)
8061 Indeed, the p-value is lower than our threshold of 0.05, so we reject the
8062 null hypothesis in favor of the default "two-sided" alternative: the data
8063 were *not* drawn from the same distribution.
8065 When both samples are drawn from the same distribution, we expect the data
8066 to be consistent with the null hypothesis most of the time.
8068 >>> sample1 = stats.norm.rvs(size=105, random_state=rng)
8069 >>> sample2 = stats.norm.rvs(size=95, random_state=rng)
8070 >>> stats.ks_2samp(sample1, sample2)
8071 KstestResult(statistic=0.10927318295739348, pvalue=0.5438289009927495)
8073 As expected, the p-value of 0.54 is not below our threshold of 0.05, so
8074 we cannot reject the null hypothesis.
8076 Suppose, however, that the first sample were drawn from
8077 a normal distribution shifted toward greater values. In this case,
8078 the cumulative distribution function (CDF) of the underlying distribution tends
8079 to be *less* than the CDF underlying the second sample. Therefore, we would
8080 expect the null hypothesis to be rejected with ``alternative='less'``:
8082 >>> sample1 = stats.norm.rvs(size=105, loc=0.5, random_state=rng)
8083 >>> stats.ks_2samp(sample1, sample2, alternative='less')
8084 KstestResult(statistic=0.4055137844611529, pvalue=3.5474563068855554e-08)
8086 and indeed, with p-value smaller than our threshold, we reject the null
8087 hypothesis in favor of the alternative.
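As a final illustrative sketch (outputs omitted because they depend on the
random samples drawn), the p-value computation described in the Notes can be
selected explicitly via `method`:

>>> stats.ks_2samp(sample1, sample2, method='exact').pvalue  # doctest: +SKIP
>>> stats.ks_2samp(sample1, sample2, method='asymp').pvalue  # doctest: +SKIP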
8089 """
8090 mode = method
8092 if mode not in ['auto', 'exact', 'asymp']:
8093 raise ValueError(f'Invalid value for mode: {mode}')
8094 alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
8095 alternative.lower()[0], alternative)
8096 if alternative not in ['two-sided', 'less', 'greater']:
8097 raise ValueError(f'Invalid value for alternative: {alternative}')
8098 MAX_AUTO_N = 10000 # 'auto' will attempt to be exact if n1,n2 <= MAX_AUTO_N
8099 if np.ma.is_masked(data1):
8100 data1 = data1.compressed()
8101 if np.ma.is_masked(data2):
8102 data2 = data2.compressed()
8103 data1 = np.sort(data1)
8104 data2 = np.sort(data2)
8105 n1 = data1.shape[0]
8106 n2 = data2.shape[0]
8107 if min(n1, n2) == 0:
8108 raise ValueError('Data passed to ks_2samp must not be empty')
8110 data_all = np.concatenate([data1, data2])
8111 # using searchsorted solves equal data problem
8112 cdf1 = np.searchsorted(data1, data_all, side='right') / n1
8113 cdf2 = np.searchsorted(data2, data_all, side='right') / n2
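# cdf1[i] and cdf2[i] are the empirical CDFs of data1 and data2 evaluated at
# data_all[i]; their pointwise difference gives the signed KS distances.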
8114 cddiffs = cdf1 - cdf2
8116 # Identify the location of the statistic
8117 argminS = np.argmin(cddiffs)
8118 argmaxS = np.argmax(cddiffs)
8119 loc_minS = data_all[argminS]
8120 loc_maxS = data_all[argmaxS]
8122 # Ensure sign of minS is not negative.
8123 minS = np.clip(-cddiffs[argminS], 0, 1)
8124 maxS = cddiffs[argmaxS]
8126 if alternative == 'less' or (alternative == 'two-sided' and minS > maxS):
8127 d = minS
8128 d_location = loc_minS
8129 d_sign = -1
8130 else:
8131 d = maxS
8132 d_location = loc_maxS
8133 d_sign = 1
8134 g = gcd(n1, n2)
8135 n1g = n1 // g
8136 n2g = n2 // g
8137 prob = -np.inf
8138 if mode == 'auto':
8139 mode = 'exact' if max(n1, n2) <= MAX_AUTO_N else 'asymp'
8140 elif mode == 'exact':
8141 # If lcm(n1, n2) is too big, switch from exact to asymp
8142 if n1g >= np.iinfo(np.int32).max / n2g:
8143 mode = 'asymp'
8144 warnings.warn(
8145 f"Exact ks_2samp calculation not possible with samples sizes "
8146 f"{n1} and {n2}. Switching to 'asymp'.", RuntimeWarning,
8147 stacklevel=3)
8149 if mode == 'exact':
8150 success, d, prob = _attempt_exact_2kssamp(n1, n2, g, d, alternative)
8151 if not success:
8152 mode = 'asymp'
8153 warnings.warn(f"ks_2samp: Exact calculation unsuccessful. "
8154 f"Switching to method={mode}.", RuntimeWarning,
8155 stacklevel=3)
8157 if mode == 'asymp':
8158 # The product n1*n2 is large. Use Smirnov's asymptotic formula.
8159 # Ensure float to avoid overflow in multiplication
8160 # sorted because the one-sided formula is not symmetric in n1, n2
8161 m, n = sorted([float(n1), float(n2)], reverse=True)
8162 en = m * n / (m + n)
8163 if alternative == 'two-sided':
8164 prob = distributions.kstwo.sf(d, np.round(en))
8165 else:
8166 z = np.sqrt(en) * d
8167 # Use Hodges' suggested approximation Eqn 5.3
8168 # Requires m to be the larger of (n1, n2)
8169 expt = -2 * z**2 - 2 * z * (m + 2*n)/np.sqrt(m*n*(m+n))/3.0
8170 prob = np.exp(expt)
8172 prob = np.clip(prob, 0, 1)
8173 return KstestResult(d, prob, statistic_location=d_location,
8174 statistic_sign=d_sign)
8177def _parse_kstest_args(data1, data2, args, N):
8178 # kstest allows many different variations of arguments.
8179 # Pull out the parsing into a separate function
8180 # (xvals, yvals, ) # 2sample
8181 # (xvals, cdf function,..)
8182 # (xvals, name of distribution, ...)
8183 # (name of distribution, name of distribution, ...)
8185 # Returns xvals, yvals, cdf
8186 # where cdf is a cdf function, or None
8187 # and yvals is either an array_like of values, or None
8188 # and xvals is array_like.
8189 rvsfunc, cdf = None, None
8190 if isinstance(data1, str):
8191 rvsfunc = getattr(distributions, data1).rvs
8192 elif callable(data1):
8193 rvsfunc = data1
8195 if isinstance(data2, str):
8196 cdf = getattr(distributions, data2).cdf
8197 data2 = None
8198 elif callable(data2):
8199 cdf = data2
8200 data2 = None
8202 data1 = np.sort(rvsfunc(*args, size=N) if rvsfunc else data1)
8203 return data1, data2, cdf
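# A minimal sketch of the call patterns accepted above (purely illustrative;
# the literal values are made up):
#
#   _parse_kstest_args([0.1, 0.5, 0.9], 'norm', (), 20)  # data vs. named cdf
#   _parse_kstest_args('norm', 'norm', (), 20)           # 20 variates vs. cdf
#   _parse_kstest_args([0.1, 0.5], [0.2, 0.7], (), 20)   # two-sample; cdf is None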
8206@_rename_parameter("mode", "method")
8207def kstest(rvs, cdf, args=(), N=20, alternative='two-sided', method='auto'):
8208 """
8209 Performs the (one-sample or two-sample) Kolmogorov-Smirnov test for
8210 goodness of fit.
8212 The one-sample test compares the underlying distribution F(x) of a sample
8213 against a given distribution G(x). The two-sample test compares the
8214 underlying distributions of two independent samples. Both tests are valid
8215 only for continuous distributions.
8217 Parameters
8218 ----------
8219 rvs : str, array_like, or callable
8220 If an array, it should be a 1-D array of observations of random
8221 variables.
8222 If a callable, it should be a function to generate random variables;
8223 it is required to have a keyword argument `size`.
8224 If a string, it should be the name of a distribution in `scipy.stats`,
8225 which will be used to generate random variables.
8226 cdf : str, array_like or callable
8227 If array_like, it should be a 1-D array of observations of random
8228 variables, and the two-sample test is performed
8229 (and rvs must be array_like).
8230 If a callable, that callable is used to calculate the cdf.
8231 If a string, it should be the name of a distribution in `scipy.stats`,
8232 which will be used as the cdf function.
8233 args : tuple, sequence, optional
8234 Distribution parameters, used if `rvs` or `cdf` are strings or
8235 callables.
8236 N : int, optional
8237 Sample size if `rvs` is string or callable. Default is 20.
8238 alternative : {'two-sided', 'less', 'greater'}, optional
8239 Defines the null and alternative hypotheses. Default is 'two-sided'.
8240 Please see explanations in the Notes below.
8241 method : {'auto', 'exact', 'approx', 'asymp'}, optional
8242 Defines the distribution used for calculating the p-value.
8243 The following options are available (default is 'auto'):
8245 * 'auto' : selects one of the other options.
8246 * 'exact' : uses the exact distribution of test statistic.
8247 * 'approx' : approximates the two-sided probability with twice the
8248 one-sided probability
8249 * 'asymp': uses asymptotic distribution of test statistic
8251 Returns
8252 -------
8253 res: KstestResult
8254 An object containing attributes:
8256 statistic : float
8257 KS test statistic, either D+, D-, or D (the maximum of the two)
8258 pvalue : float
8259 One-tailed or two-tailed p-value.
8260 statistic_location : float
8261 In a one-sample test, this is the value of `rvs`
8262 corresponding with the KS statistic; i.e., the distance between
8263 the empirical distribution function and the hypothesized cumulative
8264 distribution function is measured at this observation.
8266 In a two-sample test, this is the value from `rvs` or `cdf`
8267 corresponding with the KS statistic; i.e., the distance between
8268 the empirical distribution functions is measured at this
8269 observation.
8270 statistic_sign : int
8271 In a one-sample test, this is +1 if the KS statistic is the
8272 maximum positive difference between the empirical distribution
8273 function and the hypothesized cumulative distribution function
8274 (D+); it is -1 if the KS statistic is the maximum negative
8275 difference (D-).
8277 In a two-sample test, this is +1 if the empirical distribution
8278 function of `rvs` exceeds the empirical distribution
8279 function of `cdf` at `statistic_location`, otherwise -1.
8281 See Also
8282 --------
8283 ks_1samp, ks_2samp
8285 Notes
8286 -----
8287 There are three options for the null and corresponding alternative
8288 hypothesis that can be selected using the `alternative` parameter.
8290 - `two-sided`: The null hypothesis is that the two distributions are
8291 identical, F(x)=G(x) for all x; the alternative is that they are not
8292 identical.
8294 - `less`: The null hypothesis is that F(x) >= G(x) for all x; the
8295 alternative is that F(x) < G(x) for at least one x.
8297 - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the
8298 alternative is that F(x) > G(x) for at least one x.
8300 Note that the alternative hypotheses describe the *CDFs* of the
8301 underlying distributions, not the observed values. For example,
8302 suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in
8303 x1 tend to be less than those in x2.
8306 Examples
8307 --------
8308 Suppose we wish to test the null hypothesis that a sample is distributed
8309 according to the standard normal.
8310 We choose a confidence level of 95%; that is, we will reject the null
8311 hypothesis in favor of the alternative if the p-value is less than 0.05.
8313 When testing uniformly distributed data, we would expect the
8314 null hypothesis to be rejected.
8316 >>> import numpy as np
8317 >>> from scipy import stats
8318 >>> rng = np.random.default_rng()
8319 >>> stats.kstest(stats.uniform.rvs(size=100, random_state=rng),
8320 ... stats.norm.cdf)
8321 KstestResult(statistic=0.5001899973268688, pvalue=1.1616392184763533e-23)
8323 Indeed, the p-value is lower than our threshold of 0.05, so we reject the
8324 null hypothesis in favor of the default "two-sided" alternative: the data
8325 are *not* distributed according to the standard normal.
8327 When testing random variates from the standard normal distribution, we
8328 expect the data to be consistent with the null hypothesis most of the time.
8330 >>> x = stats.norm.rvs(size=100, random_state=rng)
8331 >>> stats.kstest(x, stats.norm.cdf)
8332 KstestResult(statistic=0.05345882212970396, pvalue=0.9227159037744717)
8334 As expected, the p-value of 0.92 is not below our threshold of 0.05, so
8335 we cannot reject the null hypothesis.
8337 Suppose, however, that the random variates are distributed according to
8338 a normal distribution that is shifted toward greater values. In this case,
8339 the cumulative distribution function (CDF) of the underlying distribution tends
8340 to be *less* than the CDF of the standard normal. Therefore, we would
8341 expect the null hypothesis to be rejected with ``alternative='less'``:
8343 >>> x = stats.norm.rvs(size=100, loc=0.5, random_state=rng)
8344 >>> stats.kstest(x, stats.norm.cdf, alternative='less')
8345 KstestResult(statistic=0.17482387821055168, pvalue=0.001913921057766743)
8347 and indeed, with p-value smaller than our threshold, we reject the null
8348 hypothesis in favor of the alternative.
8350 For convenience, the previous test can be performed using the name of the
8351 distribution as the second argument.
8353 >>> stats.kstest(x, "norm", alternative='less')
8354 KstestResult(statistic=0.17482387821055168, pvalue=0.001913921057766743)
8356 The examples above have all been one-sample tests identical to those
8357 performed by `ks_1samp`. Note that `kstest` can also perform two-sample
8358 tests identical to those performed by `ks_2samp`. For example, when two
8359 samples are drawn from the same distribution, we expect the data to be
8360 consistent with the null hypothesis most of the time.
8362 >>> sample1 = stats.laplace.rvs(size=105, random_state=rng)
8363 >>> sample2 = stats.laplace.rvs(size=95, random_state=rng)
8364 >>> stats.kstest(sample1, sample2)
8365 KstestResult(statistic=0.11779448621553884, pvalue=0.4494256912629795)
8367 As expected, the p-value of 0.45 is not below our threshold of 0.05, so
8368 we cannot reject the null hypothesis.
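As an illustrative sketch of the callable form of `rvs` (output omitted
because it depends on the generated variates), a function accepting a
``size`` keyword can be passed in place of data:

>>> def rvs(size):
...     return stats.norm.rvs(size=size, random_state=rng)
>>> stats.kstest(rvs, "norm", N=100)  # doctest: +SKIP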
8370 """
8371 # to not break compatibility with existing code
8372 if alternative == 'two_sided':
8373 alternative = 'two-sided'
8374 if alternative not in ['two-sided', 'greater', 'less']:
8375 raise ValueError("Unexpected alternative %s" % alternative)
8376 xvals, yvals, cdf = _parse_kstest_args(rvs, cdf, args, N)
8377 if cdf:
8378 return ks_1samp(xvals, cdf, args=args, alternative=alternative,
8379 method=method)
8380 return ks_2samp(xvals, yvals, alternative=alternative, method=method)
8383def tiecorrect(rankvals):
8384 """Tie correction factor for Mann-Whitney U and Kruskal-Wallis H tests.
8386 Parameters
8387 ----------
8388 rankvals : array_like
8389 A 1-D sequence of ranks. Typically this will be the array
8390 returned by `~scipy.stats.rankdata`.
8392 Returns
8393 -------
8394 factor : float
8395 Correction factor for U or H.
8397 See Also
8398 --------
8399 rankdata : Assign ranks to the data
8400 mannwhitneyu : Mann-Whitney rank test
8401 kruskal : Kruskal-Wallis H test
8403 References
8404 ----------
8405 .. [1] Siegel, S. (1956) Nonparametric Statistics for the Behavioral
8406 Sciences. New York: McGraw-Hill.
8408 Examples
8409 --------
8410 >>> from scipy.stats import tiecorrect, rankdata
8411 >>> tiecorrect([1, 2.5, 2.5, 4])
8412 0.9
8413 >>> ranks = rankdata([1, 3, 2, 4, 5, 7, 2, 8, 4])
8414 >>> ranks
8415 array([ 1. , 4. , 2.5, 5.5, 7. , 8. , 2.5, 9. , 5.5])
8416 >>> tiecorrect(ranks)
8417 0.9833333333333333
8419 """
8420 arr = np.sort(rankvals)
8421 idx = np.nonzero(np.r_[True, arr[1:] != arr[:-1], True])[0]
8422 cnt = np.diff(idx).astype(np.float64)
8424 size = np.float64(arr.size)
8425 return 1.0 if size < 2 else 1.0 - (cnt**3 - cnt).sum() / (size**3 - size)
8428RanksumsResult = namedtuple('RanksumsResult', ('statistic', 'pvalue'))
8431@_axis_nan_policy_factory(RanksumsResult, n_samples=2)
8432def ranksums(x, y, alternative='two-sided'):
8433 """Compute the Wilcoxon rank-sum statistic for two samples.
8435 The Wilcoxon rank-sum test tests the null hypothesis that two sets
8436 of measurements are drawn from the same distribution. The alternative
8437 hypothesis is that values in one sample are more likely to be
8438 larger than the values in the other sample.
8440 This test should be used to compare two samples from continuous
8441 distributions. It does not handle ties between measurements
8442 in x and y. For tie-handling and an optional continuity correction
8443 see `scipy.stats.mannwhitneyu`.
8445 Parameters
8446 ----------
8447 x,y : array_like
8448 The data from the two samples.
8449 alternative : {'two-sided', 'less', 'greater'}, optional
8450 Defines the alternative hypothesis. Default is 'two-sided'.
8451 The following options are available:
8453 * 'two-sided': one of the distributions (underlying `x` or `y`) is
8454 stochastically greater than the other.
8455 * 'less': the distribution underlying `x` is stochastically less
8456 than the distribution underlying `y`.
8457 * 'greater': the distribution underlying `x` is stochastically greater
8458 than the distribution underlying `y`.
8460 .. versionadded:: 1.7.0
8462 Returns
8463 -------
8464 statistic : float
8465 The test statistic under the large-sample approximation that the
8466 rank sum statistic is normally distributed.
8467 pvalue : float
8468 The p-value of the test.
8470 References
8471 ----------
8472 .. [1] https://en.wikipedia.org/wiki/Wilcoxon_rank-sum_test
8474 Examples
8475 --------
8476 We can test the hypothesis that two independent unequal-sized samples are
8477 drawn from the same distribution by computing the Wilcoxon rank-sum
8478 statistic.
8480 >>> import numpy as np
8481 >>> from scipy.stats import ranksums
8482 >>> rng = np.random.default_rng()
8483 >>> sample1 = rng.uniform(-1, 1, 200)
8484 >>> sample2 = rng.uniform(-0.5, 1.5, 300) # a shifted distribution
8485 >>> ranksums(sample1, sample2)
8486 RanksumsResult(statistic=-7.887059, pvalue=3.09390448e-15) # may vary
8487 >>> ranksums(sample1, sample2, alternative='less')
8488 RanksumsResult(statistic=-7.750585297581713, pvalue=4.573497606342543e-15) # may vary
8489 >>> ranksums(sample1, sample2, alternative='greater')
8490 RanksumsResult(statistic=-7.750585297581713, pvalue=0.9999999999999954) # may vary
8492 The p-value of less than ``0.05`` indicates that this test rejects the
8493 hypothesis at the 5% significance level.
8495 """
8496 x, y = map(np.asarray, (x, y))
8497 n1 = len(x)
8498 n2 = len(y)
8499 alldata = np.concatenate((x, y))
8500 ranked = rankdata(alldata)
8501 x = ranked[:n1]
8502 s = np.sum(x, axis=0)
8503 expected = n1 * (n1+n2+1) / 2.0
8504 z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)
8505 z, prob = _normtest_finish(z, alternative)
8507 return RanksumsResult(z, prob)
8510KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue'))
8513@_axis_nan_policy_factory(KruskalResult, n_samples=None)
8514def kruskal(*samples, nan_policy='propagate'):
8515 """Compute the Kruskal-Wallis H-test for independent samples.
8517 The Kruskal-Wallis H-test tests the null hypothesis that the population
8518 medians of all of the groups are equal. It is a non-parametric version of
8519 ANOVA. The test works on 2 or more independent samples, which may have
8520 different sizes. Note that rejecting the null hypothesis does not
8521 indicate which of the groups differs. Post hoc comparisons between
8522 groups are required to determine which groups are different.
8524 Parameters
8525 ----------
8526 sample1, sample2, ... : array_like
8527 Two or more arrays with the sample measurements can be given as
8528 arguments. Samples must be one-dimensional.
8529 nan_policy : {'propagate', 'raise', 'omit'}, optional
8530 Defines how to handle when input contains nan.
8531 The following options are available (default is 'propagate'):
8533 * 'propagate': returns nan
8534 * 'raise': throws an error
8535 * 'omit': performs the calculations ignoring nan values
8537 Returns
8538 -------
8539 statistic : float
8540 The Kruskal-Wallis H statistic, corrected for ties.
8541 pvalue : float
8542 The p-value for the test using the assumption that H has a chi
8543 square distribution. The p-value returned is the survival function of
8544 the chi square distribution evaluated at H.
8546 See Also
8547 --------
8548 f_oneway : 1-way ANOVA.
8549 mannwhitneyu : Mann-Whitney rank test on two samples.
8550 friedmanchisquare : Friedman test for repeated measurements.
8552 Notes
8553 -----
8554 Due to the assumption that H has a chi square distribution, the number
8555 of samples in each group must not be too small. A typical rule is
8556 that each sample must have at least 5 measurements.
8558 References
8559 ----------
8560 .. [1] W. H. Kruskal & W. W. Wallis, "Use of Ranks in
8561 One-Criterion Variance Analysis", Journal of the American Statistical
8562 Association, Vol. 47, Issue 260, pp. 583-621, 1952.
8563 .. [2] https://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance
8565 Examples
8566 --------
8567 >>> from scipy import stats
8568 >>> x = [1, 3, 5, 7, 9]
8569 >>> y = [2, 4, 6, 8, 10]
8570 >>> stats.kruskal(x, y)
8571 KruskalResult(statistic=0.2727272727272734, pvalue=0.6015081344405895)
8573 >>> x = [1, 1, 1]
8574 >>> y = [2, 2, 2]
8575 >>> z = [2, 2]
8576 >>> stats.kruskal(x, y, z)
8577 KruskalResult(statistic=7.0, pvalue=0.0301973834223185)
8579 """
8580 samples = list(map(np.asarray, samples))
8582 num_groups = len(samples)
8583 if num_groups < 2:
8584 raise ValueError("Need at least two groups in stats.kruskal()")
8586 for sample in samples:
8587 if sample.size == 0:
8588 return KruskalResult(np.nan, np.nan)
8589 elif sample.ndim != 1:
8590 raise ValueError("Samples must be one-dimensional.")
8592 n = np.asarray(list(map(len, samples)))
8594 if nan_policy not in ('propagate', 'raise', 'omit'):
8595 raise ValueError("nan_policy must be 'propagate', 'raise' or 'omit'")
8597 contains_nan = False
8598 for sample in samples:
8599 cn = _contains_nan(sample, nan_policy)
8600 if cn[0]:
8601 contains_nan = True
8602 break
8604 if contains_nan and nan_policy == 'omit':
8605 # mask invalid (nan) entries before delegating to the masked-array routine
8606 samples = [ma.masked_invalid(sample) for sample in samples]
8607 return mstats_basic.kruskal(*samples)
8609 if contains_nan and nan_policy == 'propagate':
8610 return KruskalResult(np.nan, np.nan)
8612 alldata = np.concatenate(samples)
8613 ranked = rankdata(alldata)
8614 ties = tiecorrect(ranked)
8615 if ties == 0:
8616 raise ValueError('All numbers are identical in kruskal')
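# Kruskal-Wallis statistic: H = 12 / (N*(N+1)) * sum_j(R_j**2 / n_j) - 3*(N+1),
# where R_j is the rank sum of group j and N = sum_j(n_j); H is then divided
# by the tie-correction factor.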
8618 # Compute sum^2/n for each group and sum
8619 j = np.insert(np.cumsum(n), 0, 0)
8620 ssbn = 0
8621 for i in range(num_groups):
8622 ssbn += _square_of_sums(ranked[j[i]:j[i+1]]) / n[i]
8624 totaln = np.sum(n, dtype=float)
8625 h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1)
8626 df = num_groups - 1
8627 h /= ties
8629 return KruskalResult(h, distributions.chi2.sf(h, df))
8632FriedmanchisquareResult = namedtuple('FriedmanchisquareResult',
8633 ('statistic', 'pvalue'))
8636def friedmanchisquare(*samples):
8637 """Compute the Friedman test for repeated samples.
8639 The Friedman test tests the null hypothesis that repeated samples of
8640 the same individuals have the same distribution. It is often used
8641 to test for consistency among samples obtained in different ways.
8642 For example, if two sampling techniques are used on the same set of
8643 individuals, the Friedman test can be used to determine if the two
8644 sampling techniques are consistent.
8646 Parameters
8647 ----------
8648 sample1, sample2, sample3... : array_like
8649 Arrays of observations. All of the arrays must have the same number
8650 of elements. At least three samples must be given.
8652 Returns
8653 -------
8654 statistic : float
8655 The test statistic, correcting for ties.
8656 pvalue : float
8657 The associated p-value assuming that the test statistic has a chi
8658 squared distribution.
8660 Notes
8661 -----
8662 Due to the assumption that the test statistic has a chi squared
8663 distribution, the p-value is only reliable for n > 10 and more than
8664 6 repeated samples.
8666 References
8667 ----------
8668 .. [1] https://en.wikipedia.org/wiki/Friedman_test
8670 """
8671 k = len(samples)
8672 if k < 3:
8673 raise ValueError('At least 3 sets of samples must be given '
8674 'for Friedman test, got {}.'.format(k))
8676 n = len(samples[0])
8677 for i in range(1, k):
8678 if len(samples[i]) != n:
8679 raise ValueError('Unequal N in friedmanchisquare. Aborting.')
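# Friedman statistic: ranks are assigned within each individual (row); with
# column rank sums R_j, the statistic is
# (12 / (k*n*(k+1)) * sum_j(R_j**2) - 3*n*(k+1)) / c,
# where c is the tie-correction factor computed below.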
8681 # Rank data
8682 data = np.vstack(samples).T
8683 data = data.astype(float)
8684 for i in range(len(data)):
8685 data[i] = rankdata(data[i])
8687 # Handle ties
8688 ties = 0
8689 for d in data:
8690 replist, repnum = find_repeats(array(d))
8691 for t in repnum:
8692 ties += t * (t*t - 1)
8693 c = 1 - ties / (k*(k*k - 1)*n)
8695 ssbn = np.sum(data.sum(axis=0)**2)
8696 chisq = (12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)) / c
8698 return FriedmanchisquareResult(chisq, distributions.chi2.sf(chisq, k - 1))
8701BrunnerMunzelResult = namedtuple('BrunnerMunzelResult',
8702 ('statistic', 'pvalue'))
8705def brunnermunzel(x, y, alternative="two-sided", distribution="t",
8706 nan_policy='propagate'):
8707 """Compute the Brunner-Munzel test on samples x and y.
8709 The Brunner-Munzel test is a nonparametric test of the null hypothesis that
8710 when values are taken one by one from each group, the probabilities of
8711 getting large values in both groups are equal.
8712 Unlike the Wilcoxon-Mann-Whitney U test, this does not require the
8713 assumption of equal variances in the two groups. Note that it does not assume
8714 that the distributions are the same. This test works on two independent samples,
8715 which may have different sizes.
8717 Parameters
8718 ----------
8719 x, y : array_like
8720 Array of samples, should be one-dimensional.
8721 alternative : {'two-sided', 'less', 'greater'}, optional
8722 Defines the alternative hypothesis.
8723 The following options are available (default is 'two-sided'):
8725 * 'two-sided'
8726 * 'less': one-sided
8727 * 'greater': one-sided
8728 distribution : {'t', 'normal'}, optional
8729 Defines how to get the p-value.
8730 The following options are available (default is 't'):
8732 * 't': get the p-value by t-distribution
8733 * 'normal': get the p-value by standard normal distribution.
8734 nan_policy : {'propagate', 'raise', 'omit'}, optional
8735 Defines how to handle when input contains nan.
8736 The following options are available (default is 'propagate'):
8738 * 'propagate': returns nan
8739 * 'raise': throws an error
8740 * 'omit': performs the calculations ignoring nan values
8742 Returns
8743 -------
8744 statistic : float
8745 The Brunner-Munzel W statistic.
8746 pvalue : float
8747 p-value assuming a t distribution. One-sided or
8748 two-sided, depending on the choice of `alternative` and `distribution`.
8750 See Also
8751 --------
8752 mannwhitneyu : Mann-Whitney rank test on two samples.
8754 Notes
8755 -----
8756 Brunner and Munzel recommended estimating the p-value with the t-distribution
8757 when the sample size is 50 or less. If the size is smaller than 10, it is
8758 better to use the permuted Brunner-Munzel test (see [2]_).
8760 References
8761 ----------
8762 .. [1] Brunner, E. and Munzel, U. "The nonparametric Behrens-Fisher
8763 problem: Asymptotic theory and a small-sample approximation".
8764 Biometrical Journal. Vol. 42(2000): 17-25.
8765 .. [2] Neubert, K. and Brunner, E. "A studentized permutation test for the
8766 non-parametric Behrens-Fisher problem". Computational Statistics and
8767 Data Analysis. Vol. 51(2007): 5192-5204.
8769 Examples
8770 --------
8771 >>> from scipy import stats
8772 >>> x1 = [1,2,1,1,1,1,1,1,1,1,2,4,1,1]
8773 >>> x2 = [3,3,4,3,1,2,3,1,1,5,4]
8774 >>> w, p_value = stats.brunnermunzel(x1, x2)
8775 >>> w
8776 3.1374674823029505
8777 >>> p_value
8778 0.0057862086661515377
8780 """
8781 x = np.asarray(x)
8782 y = np.asarray(y)
8784 # check both x and y
8785 cnx, npx = _contains_nan(x, nan_policy)
8786 cny, npy = _contains_nan(y, nan_policy)
8787 contains_nan = cnx or cny
8788 if npx == "omit" or npy == "omit":
8789 nan_policy = "omit"
8791 if contains_nan and nan_policy == "propagate":
8792 return BrunnerMunzelResult(np.nan, np.nan)
8793 elif contains_nan and nan_policy == "omit":
8794 x = ma.masked_invalid(x)
8795 y = ma.masked_invalid(y)
8796 return mstats_basic.brunnermunzel(x, y, alternative, distribution)
8798 nx = len(x)
8799 ny = len(y)
8800 if nx == 0 or ny == 0:
8801 return BrunnerMunzelResult(np.nan, np.nan)
8802 rankc = rankdata(np.concatenate((x, y)))
8803 rankcx = rankc[0:nx]
8804 rankcy = rankc[nx:nx+ny]
8805 rankcx_mean = np.mean(rankcx)
8806 rankcy_mean = np.mean(rankcy)
8807 rankx = rankdata(x)
8808 ranky = rankdata(y)
8809 rankx_mean = np.mean(rankx)
8810 ranky_mean = np.mean(ranky)
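# Rank-based variance estimates S_x and S_y; the Brunner-Munzel statistic is
#   W = n_x*n_y*(mean pooled rank of y - mean pooled rank of x)
#       / ((n_x + n_y) * sqrt(n_x*S_x + n_y*S_y)),
# with a Satterthwaite-type approximation for the degrees of freedom.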
8812 Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0))
8813 Sx /= nx - 1
8814 Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0))
8815 Sy /= ny - 1
8817 wbfn = nx * ny * (rankcy_mean - rankcx_mean)
8818 wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy)
8820 if distribution == "t":
8821 df_numer = np.power(nx * Sx + ny * Sy, 2.0)
8822 df_denom = np.power(nx * Sx, 2.0) / (nx - 1)
8823 df_denom += np.power(ny * Sy, 2.0) / (ny - 1)
8824 df = df_numer / df_denom
8826 if (df_numer == 0) and (df_denom == 0):
8827 message = ("p-value cannot be estimated with `distribution='t'` "
8828 "because the degrees of freedom parameter is undefined "
8829 "(0/0). Try using `distribution='normal'`.")
8830 warnings.warn(message, RuntimeWarning)
8832 p = distributions.t.cdf(wbfn, df)
8833 elif distribution == "normal":
8834 p = distributions.norm.cdf(wbfn)
8835 else:
8836 raise ValueError(
8837 "distribution should be 't' or 'normal'")
8839 if alternative == "greater":
8840 pass
8841 elif alternative == "less":
8842 p = 1 - p
8843 elif alternative == "two-sided":
8844 p = 2 * np.min([p, 1-p])
8845 else:
8846 raise ValueError(
8847 "alternative should be 'less', 'greater' or 'two-sided'")
8849 return BrunnerMunzelResult(wbfn, p)
8852def combine_pvalues(pvalues, method='fisher', weights=None):
8853 """
8854 Combine p-values from independent tests that bear upon the same hypothesis.
8856 These methods are intended only for combining p-values from hypothesis
8857 tests based upon continuous distributions.
8859 Each method assumes that under the null hypothesis, the p-values are
8860 sampled independently and uniformly from the interval [0, 1]. A test
8861 statistic (different for each method) is computed and a combined
8862 p-value is calculated based upon the distribution of this test statistic
8863 under the null hypothesis.
8865 Parameters
8866 ----------
8867 pvalues : array_like, 1-D
8868 Array of p-values assumed to come from independent tests based on
8869 continuous distributions.
8870 method : {'fisher', 'pearson', 'tippett', 'stouffer', 'mudholkar_george'}
8872 Name of method to use to combine p-values.
8874 The available methods are (see Notes for details):
8876 * 'fisher': Fisher's method (Fisher's combined probability test)
8877 * 'pearson': Pearson's method
8878 * 'mudholkar_george': Mudholkar's and George's method
8879 * 'tippett': Tippett's method
8880 * 'stouffer': Stouffer's Z-score method
8881 weights : array_like, 1-D, optional
8882 Optional array of weights used only for Stouffer's Z-score method.
8884 Returns
8885 -------
8886 res : SignificanceResult
8887 An object containing attributes:
8889 statistic : float
8890 The statistic calculated by the specified method.
8891 pvalue : float
8892 The combined p-value.
8894 Notes
8895 -----
8896 If this function is applied to tests with a discrete statistic such as
8897 any rank test or contingency-table test, it will yield systematically
8898 wrong results, e.g. Fisher's method will systematically overestimate the
8899 p-value [1]_. This problem becomes less severe for large sample sizes
8900 when the discrete distributions become approximately continuous.
8902 The differences between the methods can be best illustrated by their
8903 statistics and what aspects of a combination of p-values they emphasise
8904 when considering significance [2]_. For example, methods emphasising large
8905 p-values are more sensitive to strong false and true negatives; conversely
8906 methods focussing on small p-values are sensitive to positives.
8908 * The statistic of Fisher's method (also known as Fisher's combined
8909 probability test) [3]_ is :math:`-2\\sum_i \\log(p_i)`, which is
8910 equivalent (as a test statistic) to the product of individual p-values:
8911 :math:`\\prod_i p_i`. Under the null hypothesis, this statistic follows
8912 a :math:`\\chi^2` distribution. This method emphasises small p-values.
8913 * Pearson's method uses :math:`-2\\sum_i\\log(1-p_i)`, which is equivalent
8914 to :math:`\\prod_i \\frac{1}{1-p_i}` [2]_.
8915 It thus emphasises large p-values.
8916 * Mudholkar and George compromise between Fisher's and Pearson's method by
8917 averaging their statistics [4]_. Their method emphasises extreme
8918 p-values, both close to 1 and 0.
8919 * Stouffer's method [5]_ uses Z-scores and the statistic:
8920 :math:`\\sum_i \\Phi^{-1} (p_i)`, where :math:`\\Phi` is the CDF of the
8921 standard normal distribution. The advantage of this method is that it is
8922 straightforward to introduce weights, which can make Stouffer's method
8923 more powerful than Fisher's method when the p-values are from studies
8924 of different size [6]_ [7]_.
8925 * Tippett's method uses the smallest p-value as a statistic.
8926 (Mind that this minimum is not the combined p-value.)
8928 Fisher's method may be extended to combine p-values from dependent tests
8929 [8]_. Extensions such as Brown's method and Kost's method are not currently
8930 implemented.
8932 .. versionadded:: 0.15.0
8934 References
8935 ----------
8936 .. [1] Kincaid, W. M., "The Combination of Tests Based on Discrete
8937 Distributions." Journal of the American Statistical Association 57,
8938 no. 297 (1962), 10-19.
8939 .. [2] Heard, N. and Rubin-Delanchey, P. "Choosing between methods of
8940 combining p-values." Biometrika 105.1 (2018): 239-246.
8941 .. [3] https://en.wikipedia.org/wiki/Fisher%27s_method
8942 .. [4] George, E. O., and G. S. Mudholkar. "On the convolution of logistic
8943 random variables." Metrika 30.1 (1983): 1-13.
8944 .. [5] https://en.wikipedia.org/wiki/Fisher%27s_method#Relation_to_Stouffer.27s_Z-score_method
8945 .. [6] Whitlock, M. C. "Combining probability from independent tests: the
8946 weighted Z-method is superior to Fisher's approach." Journal of
8947 Evolutionary Biology 18, no. 5 (2005): 1368-1373.
8948 .. [7] Zaykin, Dmitri V. "Optimally weighted Z-test is a powerful method
8949 for combining probabilities in meta-analysis." Journal of
8950 Evolutionary Biology 24, no. 8 (2011): 1836-1841.
8951 .. [8] https://en.wikipedia.org/wiki/Extensions_of_Fisher%27s_method
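Examples
--------
A minimal usage sketch (the p-values below are arbitrary illustrative numbers,
not outputs of real tests, and the printed result is omitted):

>>> from scipy import stats
>>> stats.combine_pvalues([0.01, 0.2, 0.3], method='fisher')  # doctest: +SKIP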
8953 """
8954 pvalues = np.asarray(pvalues)
8955 if pvalues.ndim != 1:
8956 raise ValueError("pvalues is not 1-D")
8958 if method == 'fisher':
8959 statistic = -2 * np.sum(np.log(pvalues))
8960 pval = distributions.chi2.sf(statistic, 2 * len(pvalues))
8961 elif method == 'pearson':
8962 statistic = 2 * np.sum(np.log1p(-pvalues))
8963 pval = distributions.chi2.cdf(-statistic, 2 * len(pvalues))
8964 elif method == 'mudholkar_george':
8965 normalizing_factor = np.sqrt(3/len(pvalues))/np.pi
8966 statistic = -np.sum(np.log(pvalues)) + np.sum(np.log1p(-pvalues))
8967 nu = 5 * len(pvalues) + 4
8968 approx_factor = np.sqrt(nu / (nu - 2))
8969 pval = distributions.t.sf(statistic * normalizing_factor
8970 * approx_factor, nu)
8971 elif method == 'tippett':
8972 statistic = np.min(pvalues)
8973 pval = distributions.beta.cdf(statistic, 1, len(pvalues))
8974 elif method == 'stouffer':
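# Stouffer's Z: statistic = sum(w_i * Phi^{-1}(1 - p_i)) / ||w||_2, compared
# against the standard normal; norm.isf(p) computes Phi^{-1}(1 - p).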
8975 if weights is None:
8976 weights = np.ones_like(pvalues)
8977 elif len(weights) != len(pvalues):
8978 raise ValueError("pvalues and weights must be of the same size.")
8980 weights = np.asarray(weights)
8981 if weights.ndim != 1:
8982 raise ValueError("weights is not 1-D")
8984 Zi = distributions.norm.isf(pvalues)
8985 statistic = np.dot(weights, Zi) / np.linalg.norm(weights)
8986 pval = distributions.norm.sf(statistic)
8988 else:
8989 raise ValueError(
8990 f"Invalid method {method!r}. Valid methods are 'fisher', "
8991 "'pearson', 'mudholkar_george', 'tippett', and 'stouffer'"
8992 )
8994 return SignificanceResult(statistic, pval)
8997#####################################
8998# STATISTICAL DISTANCES #
8999#####################################
9002def wasserstein_distance(u_values, v_values, u_weights=None, v_weights=None):
9003 r"""
9004 Compute the first Wasserstein distance between two 1D distributions.
9006 This distance is also known as the earth mover's distance, since it can be
9007 seen as the minimum amount of "work" required to transform :math:`u` into
9008 :math:`v`, where "work" is measured as the amount of distribution weight
9009 that must be moved, multiplied by the distance it has to be moved.
9011 .. versionadded:: 1.0.0
9013 Parameters
9014 ----------
9015 u_values, v_values : array_like
9016 Values observed in the (empirical) distribution.
9017 u_weights, v_weights : array_like, optional
9018 Weight for each value. If unspecified, each value is assigned the same
9019 weight.
9020 `u_weights` (resp. `v_weights`) must have the same length as
9021 `u_values` (resp. `v_values`). If the weight sum differs from 1, it
9022 must still be positive and finite so that the weights can be normalized
9023 to sum to 1.
9025 Returns
9026 -------
9027 distance : float
9028 The computed distance between the distributions.
9030 Notes
9031 -----
9032 The first Wasserstein distance between the distributions :math:`u` and
9033 :math:`v` is:
9035 .. math::
9037 l_1 (u, v) = \inf_{\pi \in \Gamma (u, v)} \int_{\mathbb{R} \times
9038 \mathbb{R}} |x-y| \mathrm{d} \pi (x, y)
9040 where :math:`\Gamma (u, v)` is the set of (probability) distributions on
9041 :math:`\mathbb{R} \times \mathbb{R}` whose marginals are :math:`u` and
9042 :math:`v` on the first and second factors respectively.
9044 If :math:`U` and :math:`V` are the respective CDFs of :math:`u` and
9045 :math:`v`, this distance also equals:
9047 .. math::
9049 l_1(u, v) = \int_{-\infty}^{+\infty} |U-V|
9051 See [2]_ for a proof of the equivalence of both definitions.
9053 The input distributions can be empirical, therefore coming from samples
9054 whose values are effectively inputs of the function, or they can be seen as
9055 generalized functions, in which case they are weighted sums of Dirac delta
9056 functions located at the specified values.
9058 References
9059 ----------
9060 .. [1] "Wasserstein metric", https://en.wikipedia.org/wiki/Wasserstein_metric
9061 .. [2] Ramdas, Garcia, Cuturi "On Wasserstein Two Sample Testing and Related
9062 Families of Nonparametric Tests" (2015). :arXiv:`1509.02237`.
9064 Examples
9065 --------
9066 >>> from scipy.stats import wasserstein_distance
9067 >>> wasserstein_distance([0, 1, 3], [5, 6, 8])
9068 5.0
9069 >>> wasserstein_distance([0, 1], [0, 1], [3, 1], [2, 2])
9070 0.25
9071 >>> wasserstein_distance([3.4, 3.9, 7.5, 7.8], [4.5, 1.4],
9072 ... [1.4, 0.9, 3.1, 7.2], [3.2, 3.5])
9073 4.0781331438047861
9075 """
9076 return _cdf_distance(1, u_values, v_values, u_weights, v_weights)
9079def energy_distance(u_values, v_values, u_weights=None, v_weights=None):
9080 r"""Compute the energy distance between two 1D distributions.
9082 .. versionadded:: 1.0.0
9084 Parameters
9085 ----------
9086 u_values, v_values : array_like
9087 Values observed in the (empirical) distribution.
9088 u_weights, v_weights : array_like, optional
9089 Weight for each value. If unspecified, each value is assigned the same
9090 weight.
9091 `u_weights` (resp. `v_weights`) must have the same length as
9092 `u_values` (resp. `v_values`). If the weight sum differs from 1, it
9093 must still be positive and finite so that the weights can be normalized
9094 to sum to 1.
9096 Returns
9097 -------
9098 distance : float
9099 The computed distance between the distributions.
9101 Notes
9102 -----
9103 The energy distance between two distributions :math:`u` and :math:`v`, whose
9104 respective CDFs are :math:`U` and :math:`V`, is given by:
9106 .. math::
9108 D(u, v) = \left( 2\mathbb E|X - Y| - \mathbb E|X - X'| -
9109 \mathbb E|Y - Y'| \right)^{1/2}
9111 where :math:`X` and :math:`X'` (resp. :math:`Y` and :math:`Y'`) are
9112 independent random variables whose probability distribution is :math:`u`
9113 (resp. :math:`v`).
9115 Sometimes the square of this quantity is referred to as the "energy
9116 distance" (e.g. in [2]_, [4]_), but as noted in [1]_ and [3]_, only the
9117 definition above satisfies the axioms of a distance function (metric).
9119 As shown in [2]_, for one-dimensional real-valued variables, the energy
9120 distance is linked to the non-distribution-free version of the Cramér-von
9121 Mises distance:
9123 .. math::
9125 D(u, v) = \sqrt{2} l_2(u, v) = \left( 2 \int_{-\infty}^{+\infty} (U-V)^2
9126 \right)^{1/2}
9128 Note that the common Cramér-von Mises criterion uses the distribution-free
9129 version of the distance. See [2]_ (section 2), for more details about both
9130 versions of the distance.
9132 The input distributions can be empirical, therefore coming from samples
9133 whose values are effectively inputs of the function, or they can be seen as
9134 generalized functions, in which case they are weighted sums of Dirac delta
9135 functions located at the specified values.
9137 References
9138 ----------
9139 .. [1] Rizzo, Szekely "Energy distance." Wiley Interdisciplinary Reviews:
9140 Computational Statistics, 8(1):27-38 (2015).
9141 .. [2] Szekely "E-statistics: The energy of statistical samples." Bowling
9142 Green State University, Department of Mathematics and Statistics,
9143 Technical Report 02-16 (2002).
9144 .. [3] "Energy distance", https://en.wikipedia.org/wiki/Energy_distance
9145 .. [4] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer,
9146 Munos "The Cramer Distance as a Solution to Biased Wasserstein
9147 Gradients" (2017). :arXiv:`1705.10743`.
9149 Examples
9150 --------
9151 >>> from scipy.stats import energy_distance
9152 >>> energy_distance([0], [2])
9153 2.0000000000000004
9154 >>> energy_distance([0, 8], [0, 8], [3, 1], [2, 2])
9155 1.0000000000000002
9156 >>> energy_distance([0.7, 7.4, 2.4, 6.8], [1.4, 8. ],
9157 ... [2.1, 4.2, 7.4, 8. ], [7.6, 8.8])
9158 0.88003340976158217
9160 """
9161 return np.sqrt(2) * _cdf_distance(2, u_values, v_values,
9162 u_weights, v_weights)
9165def _cdf_distance(p, u_values, v_values, u_weights=None, v_weights=None):
9166 r"""
9167 Compute, between two one-dimensional distributions :math:`u` and
9168 :math:`v`, whose respective CDFs are :math:`U` and :math:`V`, the
9169 statistical distance that is defined as:
9171 .. math::
9173 l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p}
9175 p is a positive parameter; p = 1 gives the Wasserstein distance, p = 2
9176 gives the energy distance.
9178 Parameters
9179 ----------
9180 u_values, v_values : array_like
9181 Values observed in the (empirical) distribution.
9182 u_weights, v_weights : array_like, optional
9183 Weight for each value. If unspecified, each value is assigned the same
9184 weight.
9185 `u_weights` (resp. `v_weights`) must have the same length as
9186 `u_values` (resp. `v_values`). If the weight sum differs from 1, it
9187 must still be positive and finite so that the weights can be normalized
9188 to sum to 1.
9190 Returns
9191 -------
9192 distance : float
9193 The computed distance between the distributions.
9195 Notes
9196 -----
9197 The input distributions can be empirical, therefore coming from samples
9198 whose values are effectively inputs of the function, or they can be seen as
9199 generalized functions, in which case they are weighted sums of Dirac delta
9200 functions located at the specified values.
9202 References
9203 ----------
9204 .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer,
9205 Munos "The Cramer Distance as a Solution to Biased Wasserstein
9206 Gradients" (2017). :arXiv:`1705.10743`.
9208 """
9209 u_values, u_weights = _validate_distribution(u_values, u_weights)
9210 v_values, v_weights = _validate_distribution(v_values, v_weights)
9212 u_sorter = np.argsort(u_values)
9213 v_sorter = np.argsort(v_values)
9215 all_values = np.concatenate((u_values, v_values))
9216 all_values.sort(kind='mergesort')
9218 # Compute the differences between pairs of successive values of u and v.
9219 deltas = np.diff(all_values)
9221 # Get the respective positions of the values of u and v among the values of
9222 # both distributions.
9223 u_cdf_indices = u_values[u_sorter].searchsorted(all_values[:-1], 'right')
9224 v_cdf_indices = v_values[v_sorter].searchsorted(all_values[:-1], 'right')
9226 # Calculate the CDFs of u and v using their weights, if specified.
9227 if u_weights is None:
9228 u_cdf = u_cdf_indices / u_values.size
9229 else:
9230 u_sorted_cumweights = np.concatenate(([0],
9231 np.cumsum(u_weights[u_sorter])))
9232 u_cdf = u_sorted_cumweights[u_cdf_indices] / u_sorted_cumweights[-1]
9234 if v_weights is None:
9235 v_cdf = v_cdf_indices / v_values.size
9236 else:
9237 v_sorted_cumweights = np.concatenate(([0],
9238 np.cumsum(v_weights[v_sorter])))
9239 v_cdf = v_sorted_cumweights[v_cdf_indices] / v_sorted_cumweights[-1]
9241 # Compute the value of the integral based on the CDFs.
9242 # If p = 1 or p = 2, we avoid using np.power, which introduces an overhead
9243 # of about 15%.
9244 if p == 1:
9245 return np.sum(np.multiply(np.abs(u_cdf - v_cdf), deltas))
9246 if p == 2:
9247 return np.sqrt(np.sum(np.multiply(np.square(u_cdf - v_cdf), deltas)))
9248 return np.power(np.sum(np.multiply(np.power(np.abs(u_cdf - v_cdf), p),
9249 deltas)), 1/p)
9252def _validate_distribution(values, weights):
9253 """
9254 Validate the values and weights from a distribution input of `cdf_distance`
9255 and return them as ndarray objects.
9257 Parameters
9258 ----------
9259 values : array_like
9260 Values observed in the (empirical) distribution.
9261 weights : array_like
9262 Weight for each value.
9264 Returns
9265 -------
9266 values : ndarray
9267 Values as ndarray.
9268 weights : ndarray
9269 Weights as ndarray.
9271 """
9272 # Validate the value array.
9273 values = np.asarray(values, dtype=float)
9274 if len(values) == 0:
9275 raise ValueError("Distribution can't be empty.")
9277 # Validate the weight array, if specified.
9278 if weights is not None:
9279 weights = np.asarray(weights, dtype=float)
9280 if len(weights) != len(values):
9281 raise ValueError('Value and weight array-likes for the same '
9282 'empirical distribution must be of the same size.')
9283 if np.any(weights < 0):
9284 raise ValueError('All weights must be non-negative.')
9285 if not 0 < np.sum(weights) < np.inf:
9286 raise ValueError('Weight array-like sum must be positive and '
9287 'finite. Set as None for an equal distribution of '
9288 'weight.')
9290 return values, weights
9292 return values, None
9295#####################################
9296# SUPPORT FUNCTIONS #
9297#####################################
9299RepeatedResults = namedtuple('RepeatedResults', ('values', 'counts'))
9302def find_repeats(arr):
9303 """Find repeats and repeat counts.
9305 Parameters
9306 ----------
9307 arr : array_like
9308 Input array. This is cast to float64.
9310 Returns
9311 -------
9312 values : ndarray
9313 The unique values from the (flattened) input that are repeated.
9315 counts : ndarray
9316 Number of times the corresponding 'value' is repeated.
9318 Notes
9319 -----
9320 In numpy >= 1.9 `numpy.unique` provides similar functionality. The main
9321 difference is that `find_repeats` only returns repeated values.
9323 Examples
9324 --------
9325 >>> from scipy import stats
9326 >>> stats.find_repeats([2, 1, 2, 3, 2, 2, 5])
9327 RepeatedResults(values=array([2.]), counts=array([4]))
9329 >>> stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
9330 RepeatedResults(values=array([4., 5.]), counts=array([2, 2]))
9332 """
9333 # Note: always copies.
9334 return RepeatedResults(*_find_repeats(np.array(arr, dtype=np.float64)))
9337def _sum_of_squares(a, axis=0):
9338 """Square each element of the input array, and return the sum(s) of that.
9340 Parameters
9341 ----------
9342 a : array_like
9343 Input array.
9344 axis : int or None, optional
9345 Axis along which to calculate. Default is 0. If None, compute over
9346 the whole array `a`.
9348 Returns
9349 -------
9350 sum_of_squares : ndarray
9351 The sum along the given axis for (a**2).
9353 See Also
9354 --------
9355 _square_of_sums : The square(s) of the sum(s) (the opposite of
9356 `_sum_of_squares`).
9358 """
9359 a, axis = _chk_asarray(a, axis)
9360 return np.sum(a*a, axis)
9363def _square_of_sums(a, axis=0):
9364 """Sum elements of the input array, and return the square(s) of that sum.
9366 Parameters
9367 ----------
9368 a : array_like
9369 Input array.
9370 axis : int or None, optional
9371 Axis along which to calculate. Default is 0. If None, compute over
9372 the whole array `a`.
9374 Returns
9375 -------
9376 square_of_sums : float or ndarray
9377 The square of the sum over `axis`.
9379 See Also
9380 --------
9381 _sum_of_squares : The sum of squares (the opposite of `_square_of_sums`).
9383 """
9384 a, axis = _chk_asarray(a, axis)
9385 s = np.sum(a, axis)
9386 if not np.isscalar(s):
9387 return s.astype(float) * s
9388 else:
9389 return float(s) * s
9392def rankdata(a, method='average', *, axis=None, nan_policy='propagate'):
9393 """Assign ranks to data, dealing with ties appropriately.
9395 By default (``axis=None``), the data array is first flattened, and a flat
9396 array of ranks is returned. Separately reshape the rank array to the
9397 shape of the data array if desired (see Examples).
9399 Ranks begin at 1. The `method` argument controls how ranks are assigned
9400 to equal values. See [1]_ for further discussion of ranking methods.
9402 Parameters
9403 ----------
9404 a : array_like
9405 The array of values to be ranked.
9406 method : {'average', 'min', 'max', 'dense', 'ordinal'}, optional
9407 The method used to assign ranks to tied elements.
9408 The following methods are available (default is 'average'):
9410 * 'average': The average of the ranks that would have been assigned to
9411 all the tied values is assigned to each value.
9412 * 'min': The minimum of the ranks that would have been assigned to all
9413 the tied values is assigned to each value. (This is also
9414 referred to as "competition" ranking.)
9415 * 'max': The maximum of the ranks that would have been assigned to all
9416 the tied values is assigned to each value.
9417 * 'dense': Like 'min', but the rank of the next highest element is
9418 assigned the rank immediately after those assigned to the tied
9419 elements.
9420 * 'ordinal': All values are given a distinct rank, corresponding to
9421 the order that the values occur in `a`.
9422 axis : {None, int}, optional
9423 Axis along which to perform the ranking. If ``None``, the data array
9424 is first flattened.
9425 nan_policy : {'propagate', 'omit', 'raise'}, optional
9426 Defines how to handle when input contains nan.
9427 The following options are available (default is 'propagate'):
9429 * 'propagate': propagates nans through the rank calculation
9430 * 'omit': performs the calculations ignoring nan values
9431 * 'raise': raises an error
9433 .. note::
9435 When `nan_policy` is 'propagate', the output is an array of *all*
9436 nans because ranks relative to nans in the input are undefined.
9437 When `nan_policy` is 'omit', nans in `a` are ignored when ranking
9438 the other values, and the corresponding locations of the output
9439 are nan.
9441 .. versionadded:: 1.10
9443 Returns
9444 -------
9445 ranks : ndarray
9446 An array of size equal to the size of `a`, containing rank
9447 scores.
9449 References
9450 ----------
9451 .. [1] "Ranking", https://en.wikipedia.org/wiki/Ranking
9453 Examples
9454 --------
9455 >>> import numpy as np
9456 >>> from scipy.stats import rankdata
9457 >>> rankdata([0, 2, 3, 2])
9458 array([ 1. , 2.5, 4. , 2.5])
9459 >>> rankdata([0, 2, 3, 2], method='min')
9460 array([ 1, 2, 4, 2])
9461 >>> rankdata([0, 2, 3, 2], method='max')
9462 array([ 1, 3, 4, 3])
9463 >>> rankdata([0, 2, 3, 2], method='dense')
9464 array([ 1, 2, 3, 2])
9465 >>> rankdata([0, 2, 3, 2], method='ordinal')
9466 array([ 1, 2, 4, 3])
9467 >>> rankdata([[0, 2], [3, 2]]).reshape(2,2)
9468 array([[1. , 2.5],
9469 [4. , 2.5]])
9470 >>> rankdata([[0, 2, 2], [3, 2, 5]], axis=1)
9471 array([[1. , 2.5, 2.5],
9472 [2. , 1. , 3. ]])
9473 >>> rankdata([0, 2, 3, np.nan, -2, np.nan], nan_policy="propagate")
9474 array([nan, nan, nan, nan, nan, nan])
9475 >>> rankdata([0, 2, 3, np.nan, -2, np.nan], nan_policy="omit")
9476 array([ 2., 3., 4., nan, 1., nan])
9478 """
9479 if method not in ('average', 'min', 'max', 'dense', 'ordinal'):
9480 raise ValueError('unknown method "{0}"'.format(method))
9482 a = np.asarray(a)
9484 if axis is not None:
9485 if a.size == 0:
9486 # The return values of `normalize_axis_index` are ignored. The
9487 # call validates `axis`, even though we won't use it.
9488 # use scipy._lib._util._normalize_axis_index when available
9489 np.core.multiarray.normalize_axis_index(axis, a.ndim)
9490 dt = np.float64 if method == 'average' else np.int_
9491 return np.empty(a.shape, dtype=dt)
9492 return np.apply_along_axis(rankdata, axis, a, method,
9493 nan_policy=nan_policy)
9495 arr = np.ravel(a)
9496 contains_nan, nan_policy = _contains_nan(arr, nan_policy)
9497 nan_indexes = None
9498 if contains_nan:
9499 if nan_policy == 'omit':
9500 nan_indexes = np.isnan(arr)
9501 if nan_policy == 'propagate':
9502 return np.full_like(arr, np.nan)
9504 algo = 'mergesort' if method == 'ordinal' else 'quicksort'
9505 sorter = np.argsort(arr, kind=algo)
9507 inv = np.empty(sorter.size, dtype=np.intp)
9508 inv[sorter] = np.arange(sorter.size, dtype=np.intp)
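# `inv` is the (0-based) position of each element of `arr` in sorted order.
# Below, `obs` marks the first element of each run of tied values in the
# sorted array, `dense` assigns consecutive ranks to tie groups, and `count`
# holds cumulative tie-group boundaries used for 'min', 'max' and 'average'.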
9510 if method == 'ordinal':
9511 result = inv + 1
9513 arr = arr[sorter]
9514 obs = np.r_[True, arr[1:] != arr[:-1]]
9515 dense = obs.cumsum()[inv]
9517 if method == 'dense':
9518 result = dense
9520 # cumulative counts of each unique value
9521 count = np.r_[np.nonzero(obs)[0], len(obs)]
9523 if method == 'max':
9524 result = count[dense]
9526 if method == 'min':
9527 result = count[dense - 1] + 1
9529 if method == 'average':
9530 result = .5 * (count[dense] + count[dense - 1] + 1)
9532 if nan_indexes is not None:
9533 result = result.astype('float64')
9534 result[nan_indexes] = np.nan
9536 return result
9539def expectile(a, alpha=0.5, *, weights=None):
9540 r"""Compute the expectile at the specified level.
9542 Expectiles are a generalization of the expectation in the same way as
9543 quantiles are a generalization of the median. The expectile at level
9544 `alpha = 0.5` is the mean (average). See Notes for more details.
9546 Parameters
9547 ----------
9548 a : array_like
9549 Array containing numbers whose expectile is desired.
9550 alpha : float, default: 0.5
9551 The level of the expectile; e.g., `alpha=0.5` gives the mean.
9552 weights : array_like, optional
9553 An array of weights associated with the values in `a`.
9554 The `weights` must be broadcastable to the same shape as `a`.
9555 Default is None, which gives each value a weight of 1.0.
9556 An integer valued weight element acts like repeating the corresponding
9557 observation in `a` that many times. See Notes for more details.
9559 Returns
9560 -------
9561 expectile : ndarray
9562 The empirical expectile at level `alpha`.
9564 See Also
9565 --------
9566 numpy.mean : Arithmetic average
9567 numpy.quantile : Quantile
9569 Notes
9570 -----
9571 In general, the expectile at level :math:`\alpha` of a random variable
9572 :math:`X` with cumulative distribution function (CDF) :math:`F` is given
9573 by the unique solution :math:`t` of:
9575 .. math::
9577 \alpha E((X - t)_+) = (1 - \alpha) E((t - X)_+) \,.
9579 Here, :math:`(x)_+ = \max(0, x)` is the positive part of :math:`x`.
9580 This equation can be equivalently written as:
9582 .. math::
9584 \alpha \int_t^\infty (x - t)\mathrm{d}F(x)
9585 = (1 - \alpha) \int_{-\infty}^t (t - x)\mathrm{d}F(x) \,.
9587 The empirical expectile at level :math:`\alpha` (`alpha`) of a sample
9588 :math:`a_i` (the array `a`) is defined by plugging in the empirical CDF of
9589 `a`. Given sample or case weights :math:`w` (the array `weights`), it
9590 reads :math:`F_a(x) = \frac{1}{\sum_i w_i} \sum_i w_i 1_{a_i \leq x}`
9591 with indicator function :math:`1_{A}`. This leads to the definition of the
9592 empirical expectile at level `alpha` as the unique solution :math:`t` of:
9594 .. math::
9596 \alpha \sum_{i=1}^n w_i (a_i - t)_+ =
9597 (1 - \alpha) \sum_{i=1}^n w_i (t - a_i)_+ \,.
9599 For :math:`\alpha=0.5`, this simplifies to the weighted average.
9600 Furthermore, the larger :math:`\alpha`, the larger the value of the
9601 expectile.
9603 As a final remark, the expectile at level :math:`\alpha` can also be
9604 written as a minimization problem. One often used choice is
9606 .. math::
9608 \operatorname{argmin}_t
9609 E(\lvert 1_{t\geq X} - \alpha\rvert(t - X)^2) \,.
9611 References
9612 ----------
9613 .. [1] W. K. Newey and J. L. Powell (1987), "Asymmetric Least Squares
9614 Estimation and Testing," Econometrica, 55, 819-847.
9615 .. [2] T. Gneiting (2009). "Making and Evaluating Point Forecasts,"
9616 Journal of the American Statistical Association, 106, 746 - 762.
9617 :doi:`10.48550/arXiv.0912.0902`
9619 Examples
9620 --------
9621 >>> import numpy as np
9622 >>> from scipy.stats import expectile
9623 >>> a = [1, 4, 2, -1]
9624 >>> expectile(a, alpha=0.5) == np.mean(a)
9625 True
9626 >>> expectile(a, alpha=0.2)
9627 0.42857142857142855
9628 >>> expectile(a, alpha=0.8)
9629 2.5714285714285716
9630 >>> weights = [1, 3, 1, 1]
>>> expectile(a, weights=weights) == np.average(a, weights=weights)
True
9632 """
9633 if alpha < 0 or alpha > 1:
9634 raise ValueError(
9635 "The expectile level alpha must be in the range [0, 1]."
9636 )
9637 a = np.asarray(a)
9639 if weights is not None:
9640 weights = np.broadcast_to(weights, a.shape)
9642 # This is the empirical equivalent of Eq. (13) with identification
9643 # function from Table 9 (omitting a factor of 2) in [2] (their y is our
9644 # data a, their x is our t)
9645 def first_order(t):
9646 return np.average(np.abs((a <= t) - alpha) * (t - a), weights=weights)
9648 if alpha >= 0.5:
9649 x0 = np.average(a, weights=weights)
9650 x1 = np.amax(a)
9651 else:
9652 x1 = np.average(a, weights=weights)
9653 x0 = np.amin(a)
9655 if x0 == x1:
9656 # a has a single unique element
9657 return x0
9659 # Note that the expectile is the unique solution, so no worries about
9660 # finding a wrong root.
9661 res = root_scalar(first_order, x0=x0, x1=x1)
9662 return res.root