1"""
2An extension of scipy.stats._stats_py to support masked arrays
4"""
5# Original author (2007): Pierre GF Gerard-Marchant


__all__ = ['argstoarray',
           'count_tied_groups',
           'describe',
           'f_oneway', 'find_repeats', 'friedmanchisquare',
           'kendalltau', 'kendalltau_seasonal', 'kruskal', 'kruskalwallis',
           'ks_twosamp', 'ks_2samp', 'kurtosis', 'kurtosistest',
           'ks_1samp', 'kstest',
           'linregress',
           'mannwhitneyu', 'meppf', 'mode', 'moment', 'mquantiles', 'msign',
           'normaltest',
           'obrientransform',
           'pearsonr', 'plotting_positions', 'pointbiserialr',
           'rankdata',
           'scoreatpercentile', 'sem',
           'sen_seasonal_slopes', 'skew', 'skewtest', 'spearmanr',
           'siegelslopes', 'theilslopes',
           'tmax', 'tmean', 'tmin', 'trim', 'trimboth',
           'trimtail', 'trima', 'trimr', 'trimmed_mean', 'trimmed_std',
           'trimmed_stde', 'trimmed_var', 'tsem', 'ttest_1samp', 'ttest_onesamp',
           'ttest_ind', 'ttest_rel', 'tvar',
           'variation',
           'winsorize',
           'brunnermunzel',
           ]

import numpy as np
from numpy import ndarray
import numpy.ma as ma
from numpy.ma import masked, nomask
import math

import itertools
import warnings
from collections import namedtuple

from . import distributions
from scipy._lib._util import _rename_parameter, _contains_nan
from scipy._lib._bunch import _make_tuple_bunch
import scipy.special as special
import scipy.stats._stats_py

from ._stats_mstats_common import (
    _find_repeats,
    linregress as stats_linregress,
    LinregressResult as stats_LinregressResult,
    theilslopes as stats_theilslopes,
    siegelslopes as stats_siegelslopes
    )


def _chk_asarray(a, axis):
    # Always returns a masked array, raveled for axis=None
    a = ma.asanyarray(a)
    if axis is None:
        a = ma.ravel(a)
        outaxis = 0
    else:
        outaxis = axis
    return a, outaxis


def _chk2_asarray(a, b, axis):
    a = ma.asanyarray(a)
    b = ma.asanyarray(b)
    if axis is None:
        a = ma.ravel(a)
        b = ma.ravel(b)
        outaxis = 0
    else:
        outaxis = axis
    return a, b, outaxis


def _chk_size(a, b):
    a = ma.asanyarray(a)
    b = ma.asanyarray(b)
    (na, nb) = (a.size, b.size)
    if na != nb:
        raise ValueError("The size of the input array should match!"
                         " (%s <> %s)" % (na, nb))
    return (a, b, na)


def argstoarray(*args):
    """
    Constructs a 2D array from a group of sequences.

    Sequences are filled with missing values to match the length of the
    longest sequence.

    Parameters
    ----------
    *args : sequences
        Group of sequences.

    Returns
    -------
    argstoarray : MaskedArray
        A ( `m` x `n` ) masked array, where `m` is the number of arguments and
        `n` the length of the longest argument.

    Notes
    -----
    `numpy.ma.row_stack` has identical behavior, but is called with a sequence
    of sequences.

    Examples
    --------
    A 2D masked array constructed from a group of sequences is returned.

    >>> from scipy.stats.mstats import argstoarray
    >>> argstoarray([1, 2, 3], [4, 5, 6])
    masked_array(
      data=[[1.0, 2.0, 3.0],
            [4.0, 5.0, 6.0]],
      mask=[[False, False, False],
            [False, False, False]],
      fill_value=1e+20)

    The returned masked array is filled with missing values when the lengths
    of the sequences differ.

    >>> argstoarray([1, 3], [4, 5, 6])
    masked_array(
      data=[[1.0, 3.0, --],
            [4.0, 5.0, 6.0]],
      mask=[[False, False, True],
            [False, False, False]],
      fill_value=1e+20)

    """
    if len(args) == 1 and not isinstance(args[0], ndarray):
        output = ma.asarray(args[0])
        if output.ndim != 2:
            raise ValueError("The input should be 2D")
    else:
        n = len(args)
        m = max([len(k) for k in args])
        output = ma.array(np.empty((n, m), dtype=float), mask=True)
        for (k, v) in enumerate(args):
            output[k, :len(v)] = v

    output[np.logical_not(np.isfinite(output._data))] = masked
    return output


def find_repeats(arr):
    """Find repeats in arr and return a tuple (repeats, repeat_count).

    The input is cast to float64. Masked values are discarded.

    Parameters
    ----------
    arr : sequence
        Input array. The array is flattened if it is not 1D.

    Returns
    -------
    repeats : ndarray
        Array of repeated values.
    counts : ndarray
        Array of counts.

    Examples
    --------
    >>> from scipy.stats import mstats
    >>> mstats.find_repeats([2, 1, 2, 3, 2, 2, 5])
    (array([2.]), array([4]))

    In the above example, 2 repeats 4 times.

    >>> mstats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
    (array([4., 5.]), array([2, 2]))

    In the above example, both 4 and 5 repeat 2 times.

    """
    # Make sure we get a copy. ma.compressed promises a "new array", but can
    # actually return a reference.
    compr = np.asarray(ma.compressed(arr), dtype=np.float64)
    try:
        need_copy = np.may_share_memory(compr, arr)
    except AttributeError:
        # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
        # while in numpy 1.8.2 and above it just (correctly) returns False.
        need_copy = False
    if need_copy:
        compr = compr.copy()
    return _find_repeats(compr)


def count_tied_groups(x, use_missing=False):
    """
    Counts the number of tied values.

    Parameters
    ----------
    x : sequence
        Sequence of data on which to count the ties.
    use_missing : bool, optional
        Whether to consider missing values as tied.

    Returns
    -------
    count_tied_groups : dict
        Returns a dictionary (nb of ties: nb of groups).

    Examples
    --------
    >>> from scipy.stats import mstats
    >>> import numpy as np
    >>> z = [0, 0, 0, 2, 2, 2, 3, 3, 4, 5, 6]
    >>> mstats.count_tied_groups(z)
    {2: 1, 3: 2}

    In the above example, the ties were 0 (3x), 2 (3x) and 3 (2x).

    >>> z = np.ma.array([0, 0, 1, 2, 2, 2, 3, 3, 4, 5, 6])
    >>> mstats.count_tied_groups(z)
    {2: 2, 3: 1}
    >>> z[[1,-1]] = np.ma.masked
    >>> mstats.count_tied_groups(z, use_missing=True)
    {2: 2, 3: 1}

    """
    nmasked = ma.getmask(x).sum()
    # We need the copy as find_repeats will overwrite the initial data
    data = ma.compressed(x).copy()
    (ties, counts) = find_repeats(data)
    nties = {}
    if len(ties):
        nties = dict(zip(np.unique(counts), itertools.repeat(1)))
        nties.update(dict(zip(*find_repeats(counts))))

    if nmasked and use_missing:
        try:
            nties[nmasked] += 1
        except KeyError:
            nties[nmasked] = 1

    return nties


def rankdata(data, axis=None, use_missing=False):
    """Returns the rank (also known as order statistics) of each data point
    along the given axis.

    If some values are tied, their rank is averaged.
    If some values are masked, their rank is set to 0 if use_missing is False,
    or set to the average rank of the unmasked values if use_missing is True.

    Parameters
    ----------
    data : sequence
        Input data. The data is transformed to a masked array.
    axis : {None,int}, optional
        Axis along which to perform the ranking.
        If None, the array is first flattened. An exception is raised if
        the axis is specified for arrays with a dimension larger than 2.
    use_missing : bool, optional
        Whether the masked values have a rank of 0 (False) or equal to the
        average rank of the unmasked values (True).

    """
    def _rank1d(data, use_missing=False):
        n = data.count()
        rk = np.empty(data.size, dtype=float)
        idx = data.argsort()
        rk[idx[:n]] = np.arange(1, n+1)

        if use_missing:
            rk[idx[n:]] = (n+1)/2.
        else:
            rk[idx[n:]] = 0

        repeats = find_repeats(data.copy())
        for r in repeats[0]:
            condition = (data == r).filled(False)
            rk[condition] = rk[condition].mean()
        return rk

    data = ma.array(data, copy=False)
    if axis is None:
        if data.ndim > 1:
            return _rank1d(data.ravel(), use_missing).reshape(data.shape)
        else:
            return _rank1d(data, use_missing)
    else:
        return ma.apply_along_axis(_rank1d, axis, data, use_missing).view(ndarray)
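
# A minimal usage sketch for `rankdata` (added for illustration; not part of
# the original module). Ties share the average of the ranks they occupy, and
# masked entries get rank 0 unless ``use_missing=True``:
#
#     >>> import numpy.ma as ma
#     >>> from scipy.stats import mstats
#     >>> mstats.rankdata([0, 2, 2, 3])       # ties share rank (2+3)/2
#     array([1. , 2.5, 2.5, 4. ])
#     >>> z = ma.array([0, 2, 2, 3], mask=[False, False, False, True])
#     >>> mstats.rankdata(z)                  # the masked value gets rank 0
#     array([1. , 2.5, 2.5, 0. ])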


ModeResult = namedtuple('ModeResult', ('mode', 'count'))


def mode(a, axis=0):
    """
    Returns an array of the modal (most common) value in the passed array.

    Parameters
    ----------
    a : array_like
        n-dimensional array of which to find mode(s).
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over
        the whole array `a`.

    Returns
    -------
    mode : ndarray
        Array of modal values.
    count : ndarray
        Array of counts for each mode.

    Notes
    -----
    For more details, see `scipy.stats.mode`.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> from scipy.stats import mstats
    >>> m_arr = np.ma.array([1, 1, 0, 0, 0, 0], mask=[0, 0, 1, 1, 1, 0])
    >>> mstats.mode(m_arr)  # note that most zeros are masked
    ModeResult(mode=array([1.]), count=array([2.]))

    """
    return _mode(a, axis=axis, keepdims=True)


def _mode(a, axis=0, keepdims=True):
    # Don't want to expose `keepdims` from the public `mstats.mode`
    a, axis = _chk_asarray(a, axis)

    def _mode1D(a):
        (rep, cnt) = find_repeats(a)
        if not cnt.ndim:
            return (0, 0)
        elif cnt.size:
            return (rep[cnt.argmax()], cnt.max())
        else:
            return (a.min(), 1)

    if axis is None:
        output = _mode1D(ma.ravel(a))
        output = (ma.array(output[0]), ma.array(output[1]))
    else:
        output = ma.apply_along_axis(_mode1D, axis, a)
        if keepdims is None or keepdims:
            newshape = list(a.shape)
            newshape[axis] = 1
            slices = [slice(None)] * output.ndim
            slices[axis] = 0
            modes = output[tuple(slices)].reshape(newshape)
            slices[axis] = 1
            counts = output[tuple(slices)].reshape(newshape)
            output = (modes, counts)
        else:
            output = np.moveaxis(output, axis, 0)

    return ModeResult(*output)


def _betai(a, b, x):
    x = np.asanyarray(x)
    x = ma.where(x < 1.0, x, 1.0)  # if x > 1 then return 1.0
    return special.betainc(a, b, x)


def msign(x):
    """Returns the sign of x, or 0 if x is masked."""
    return ma.filled(np.sign(x), 0)


def pearsonr(x, y):
    r"""
    Pearson correlation coefficient and p-value for testing non-correlation.

    The Pearson correlation coefficient [1]_ measures the linear relationship
    between two datasets. The calculation of the p-value relies on the
    assumption that each dataset is normally distributed. (See Kowalski [3]_
    for a discussion of the effects of non-normality of the input on the
    distribution of the correlation coefficient.) Like other correlation
    coefficients, this one varies between -1 and +1 with 0 implying no
    correlation. Correlations of -1 or +1 imply an exact linear relationship.

    Parameters
    ----------
    x : (N,) array_like
        Input array.
    y : (N,) array_like
        Input array.

    Returns
    -------
    r : float
        Pearson's correlation coefficient.
    p-value : float
        Two-tailed p-value.

    Warns
    -----
    PearsonRConstantInputWarning
        Raised if an input is a constant array. The correlation coefficient
        is not defined in this case, so ``np.nan`` is returned.

    PearsonRNearConstantInputWarning
        Raised if an input is "nearly" constant. The array ``x`` is considered
        nearly constant if ``norm(x - mean(x)) < 1e-13 * abs(mean(x))``.
        Numerical errors in the calculation ``x - mean(x)`` in this case might
        result in an inaccurate calculation of r.

    See Also
    --------
    spearmanr : Spearman rank-order correlation coefficient.
    kendalltau : Kendall's tau, a correlation measure for ordinal data.

    Notes
    -----
    The correlation coefficient is calculated as follows:

    .. math::

        r = \frac{\sum (x - m_x) (y - m_y)}
                 {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}

    where :math:`m_x` is the mean of the vector x and :math:`m_y` is
    the mean of the vector y.

    Under the assumption that x and y are drawn from
    independent normal distributions (so the population correlation coefficient
    is 0), the probability density function of the sample correlation
    coefficient r is ([1]_, [2]_):

    .. math::

        f(r) = \frac{{(1-r^2)}^{n/2-2}}{\mathrm{B}(\frac{1}{2},\frac{n}{2}-1)}

    where n is the number of samples, and B is the beta function. This
    is sometimes referred to as the exact distribution of r. This is
    the distribution that is used in `pearsonr` to compute the p-value.
    The distribution is a beta distribution on the interval [-1, 1],
    with equal shape parameters a = b = n/2 - 1. In terms of SciPy's
    implementation of the beta distribution, the distribution of r is::

        dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)

    The p-value returned by `pearsonr` is a two-sided p-value. The p-value
    roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. More precisely, for a
    given sample with correlation coefficient r, the p-value is
    the probability that abs(r') of a random sample x' and y' drawn from
    the population with zero correlation would be greater than or equal
    to abs(r). In terms of the object ``dist`` shown above, the p-value
    for a given r and length n can be computed as::

        p = 2*dist.cdf(-abs(r))

    When n is 2, the above continuous distribution is not well-defined.
    One can interpret the limit of the beta distribution as the shape
    parameters a and b approach a = b = 0 as a discrete distribution with
    equal probability masses at r = 1 and r = -1. More directly, one
    can observe that, given the data x = [x1, x2] and y = [y1, y2], and
    assuming x1 != x2 and y1 != y2, the only possible values for r are 1
    and -1. Because abs(r') for any sample x' and y' with length 2 will
    be 1, the two-sided p-value for a sample of length 2 is always 1.

    References
    ----------
    .. [1] "Pearson correlation coefficient", Wikipedia,
           https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
    .. [2] Student, "Probable error of a correlation coefficient",
           Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310.
    .. [3] C. J. Kowalski, "On the Effects of Non-Normality on the Distribution
           of the Sample Product-Moment Correlation Coefficient"
           Journal of the Royal Statistical Society. Series C (Applied
           Statistics), Vol. 21, No. 1 (1972), pp. 1-12.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> from scipy.stats import mstats
    >>> mstats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4])
    (-0.7426106572325057, 0.1505558088534455)

    There is a linear dependence between x and y if y = a + b*x + e, where
    a,b are constants and e is a random error term, assumed to be independent
    of x. For simplicity, assume that x is standard normal, a=0, b=1 and let
    e follow a normal distribution with mean zero and standard deviation s>0.

    >>> s = 0.5
    >>> x = stats.norm.rvs(size=500)
    >>> e = stats.norm.rvs(scale=s, size=500)
    >>> y = x + e
    >>> mstats.pearsonr(x, y)
    (0.9029601878969703, 8.428978827629898e-185) # may vary

    This should be close to the exact value given by

    >>> 1/np.sqrt(1 + s**2)
    0.8944271909999159

    For s=0.5, we observe a high level of correlation. In general, a large
    variance of the noise reduces the correlation, while the correlation
    approaches one as the variance of the error goes to zero.

    It is important to keep in mind that no correlation does not imply
    independence unless (x, y) is jointly normal. Correlation can even be zero
    when there is a very simple dependence structure: if X follows a
    standard normal distribution, let y = abs(x). Note that the correlation
    between x and y is zero. Indeed, since the expectation of x is zero,
    cov(x, y) = E[x*y]. By definition, this equals E[x*abs(x)] which is zero
    by symmetry. The following lines of code illustrate this observation:

    >>> y = np.abs(x)
    >>> mstats.pearsonr(x, y)
    (-0.016172891856853524, 0.7182823678751942) # may vary

    A non-zero correlation coefficient can be misleading. For example, if X has
    a standard normal distribution, define y = x if x < 0 and y = 0 otherwise.
    A simple calculation shows that corr(x, y) = sqrt(2/Pi) = 0.797...,
    implying a high level of correlation:

    >>> y = np.where(x < 0, x, 0)
    >>> mstats.pearsonr(x, y)
    (0.8537091583771509, 3.183461621422181e-143) # may vary

    This is unintuitive since there is no dependence of x and y if x is larger
    than zero, which happens in about half of the cases if we sample x and y.

    """
    (x, y, n) = _chk_size(x, y)
    (x, y) = (x.ravel(), y.ravel())
    # Get the common mask and the total nb of unmasked elements
    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    n -= m.sum()
    df = n - 2
    if df < 0:
        return (masked, masked)

    return scipy.stats._stats_py.pearsonr(ma.masked_array(x, mask=m).compressed(),
                                          ma.masked_array(y, mask=m).compressed())


def spearmanr(x, y=None, use_ties=True, axis=None, nan_policy='propagate',
              alternative='two-sided'):
    """
    Calculates a Spearman rank-order correlation coefficient and the p-value
    to test for non-correlation.

    The Spearman correlation is a nonparametric measure of the linear
    relationship between two datasets. Unlike the Pearson correlation, the
    Spearman correlation does not assume that both datasets are normally
    distributed. Like other correlation coefficients, this one varies
    between -1 and +1 with 0 implying no correlation. Correlations of -1 or
    +1 imply a monotonic relationship. Positive correlations imply that
    as `x` increases, so does `y`. Negative correlations imply that as `x`
    increases, `y` decreases.

    Missing values are discarded pair-wise: if a value is missing in `x`, the
    corresponding value in `y` is masked.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Spearman correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    Parameters
    ----------
    x, y : 1D or 2D array_like, y is optional
        One or two 1-D or 2-D arrays containing multiple variables and
        observations. When these are 1-D, each represents a vector of
        observations of a single variable. For the behavior in the 2-D case,
        see under ``axis``, below.
    use_ties : bool, optional
        DO NOT USE. Does not do anything, keyword is only left in place for
        backwards compatibility reasons.
    axis : int or None, optional
        If axis=0 (default), then each column represents a variable, with
        observations in the rows. If axis=1, the relationship is transposed:
        each row represents a variable, while the columns contain observations.
        If axis=None, then both arrays will be raveled.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the correlation is nonzero
        * 'less': the correlation is negative (less than zero)
        * 'greater': the correlation is positive (greater than zero)

        .. versionadded:: 1.7.0

    Returns
    -------
    res : SignificanceResult
        An object containing attributes:

        statistic : float or ndarray (2-D square)
            Spearman correlation matrix or correlation coefficient (if only 2
            variables are given as parameters). Correlation matrix is square
            with length equal to total number of variables (columns or rows) in
            ``a`` and ``b`` combined.
        pvalue : float
            The p-value for a hypothesis test whose null hypothesis
            is that two sets of data are linearly uncorrelated. See
            `alternative` above for alternative hypotheses. `pvalue` has the
            same shape as `statistic`.

    References
    ----------
    [CRCProbStat2000] section 14.7

    """
    if not use_ties:
        raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")

    # Always returns a masked array, raveled if axis=None
    x, axisout = _chk_asarray(x, axis)
    if y is not None:
        # Deal only with 2-D `x` case.
        y, _ = _chk_asarray(y, axis)
        if axisout == 0:
            x = ma.column_stack((x, y))
        else:
            x = ma.row_stack((x, y))

    if axisout == 1:
        # To simplify the code that follows (always use `n_obs, n_vars` shape)
        x = x.T

    if nan_policy == 'omit':
        x = ma.masked_invalid(x)

    def _spearmanr_2cols(x):
        # Mask the same observations for all variables, and then drop those
        # observations (can't leave them masked, rankdata is weird).
        x = ma.mask_rowcols(x, axis=0)
        x = x[~x.mask.any(axis=1), :]

        # If either column is entirely NaN or Inf
        if not np.any(x.data):
            res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
            res.correlation = np.nan
            return res

        m = ma.getmask(x)
        n_obs = x.shape[0]
        dof = n_obs - 2 - int(m.sum(axis=0)[0])
        if dof < 0:
            raise ValueError("The input must have at least 3 entries!")

        # Gets the ranks and rank differences
        x_ranked = rankdata(x, axis=0)
        rs = ma.corrcoef(x_ranked, rowvar=False).data

        # rs can have elements equal to 1, so avoid zero division warnings
        with np.errstate(divide='ignore'):
            # clip the small negative values possibly caused by rounding
            # errors before taking the square root
            t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))

        t, prob = scipy.stats._stats_py._ttest_finish(dof, t, alternative)

        # For backwards compatibility, return scalars when comparing 2 columns
        if rs.shape == (2, 2):
            res = scipy.stats._stats_py.SignificanceResult(rs[1, 0],
                                                           prob[1, 0])
            res.correlation = rs[1, 0]
            return res
        else:
            res = scipy.stats._stats_py.SignificanceResult(rs, prob)
            res.correlation = rs
            return res

    # Need to do this per pair of variables, otherwise the dropped observations
    # in a third column mess up the result for a pair.
    n_vars = x.shape[1]
    if n_vars == 2:
        return _spearmanr_2cols(x)
    else:
        rs = np.ones((n_vars, n_vars), dtype=float)
        prob = np.zeros((n_vars, n_vars), dtype=float)
        for var1 in range(n_vars - 1):
            for var2 in range(var1 + 1, n_vars):
                result = _spearmanr_2cols(x[:, [var1, var2]])
                rs[var1, var2] = result.correlation
                rs[var2, var1] = result.correlation
                prob[var1, var2] = result.pvalue
                prob[var2, var1] = result.pvalue

        res = scipy.stats._stats_py.SignificanceResult(rs, prob)
        res.correlation = rs
        return res
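
# A minimal usage sketch for `spearmanr` (added for illustration; not part of
# the original module). Masked entries are dropped pair-wise before ranking:
#
#     >>> import numpy.ma as ma
#     >>> from scipy.stats import mstats
#     >>> x = ma.array([1, 2, 3, 4, 5], mask=[0, 0, 0, 0, 1])
#     >>> y = [5, 6, 7, 8, 7]
#     >>> res = mstats.spearmanr(x, y)
#     >>> res.statistic   # rank correlation of the 4 unmasked pairs -> 1.0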


def _kendall_p_exact(n, c, alternative='two-sided'):
    # Use the fact that distribution is symmetric: always calculate a CDF in
    # the left tail.
    # This will be the one-sided p-value if `c` is on the side of
    # the null distribution predicted by the alternative hypothesis.
    # The two-sided p-value will be twice this value.
    # If `c` is on the other side of the null distribution, we'll need to
    # take the complement and add back the probability mass at `c`.
    in_right_tail = (c >= (n*(n-1))//2 - c)
    alternative_greater = (alternative == 'greater')
    c = int(min(c, (n*(n-1))//2 - c))

    # Exact p-value, see Maurice G. Kendall, "Rank Correlation Methods"
    # (4th Edition), Charles Griffin & Co., 1970.
    if n <= 0:
        raise ValueError(f'n ({n}) must be positive')
    elif c < 0 or 4*c > n*(n-1):
        raise ValueError(f'c ({c}) must satisfy 0 <= 4c <= n(n-1) = {n*(n-1)}.')
    elif n == 1:
        prob = 1.0
        p_mass_at_c = 1
    elif n == 2:
        prob = 1.0
        p_mass_at_c = 0.5
    elif c == 0:
        prob = 2.0/math.factorial(n) if n < 171 else 0.0
        p_mass_at_c = prob/2
    elif c == 1:
        prob = 2.0/math.factorial(n-1) if n < 172 else 0.0
        p_mass_at_c = (n-1)/math.factorial(n)
    elif 4*c == n*(n-1) and alternative == 'two-sided':
        # I'm sure there's a simple formula for p_mass_at_c in this
        # case, but I don't know it. Use generic formula for one-sided p-value.
        prob = 1.0
    elif n < 171:
        new = np.zeros(c+1)
        new[0:2] = 1.0
        for j in range(3, n+1):
            new = np.cumsum(new)
            if j <= c:
                new[j:] -= new[:c+1-j]
        prob = 2.0*np.sum(new)/math.factorial(n)
        p_mass_at_c = new[-1]/math.factorial(n)
    else:
        new = np.zeros(c+1)
        new[0:2] = 1.0
        for j in range(3, n+1):
            new = np.cumsum(new)/j
            if j <= c:
                new[j:] -= new[:c+1-j]
        prob = np.sum(new)
        p_mass_at_c = new[-1]/2

    if alternative != 'two-sided':
        # If the observed tail agrees with the alternative hypothesis,
        # the one-sided p-value is half the two-sided p-value.
        if in_right_tail == alternative_greater:
            prob /= 2
        else:
            prob = 1 - prob/2 + p_mass_at_c

    prob = np.clip(prob, 0, 1)

    return prob
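
# Illustration (added; not part of the original module): the `new` array above
# implements the classical Kendall counting recursion. After processing
# j = 3..n, ``new[d]`` is (up to the 1/j! normalization applied in the
# large-n branch) the number of permutations of n items with exactly d
# discordant pairs, i.e. the coefficients of
# prod_{j=1..n} (1 + x + ... + x**(j-1)). For n = 3 this polynomial is
# (1 + x)(1 + x + x**2) = 1 + 2x + 2x**2 + x**3, so out of the 3! = 6
# permutations, the counts for d = 0, 1, 2, 3 discordant pairs are 1, 2, 2, 1.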


def kendalltau(x, y, use_ties=True, use_missing=False, method='auto',
               alternative='two-sided'):
    """
    Computes Kendall's rank correlation tau on two variables *x* and *y*.

    Parameters
    ----------
    x : sequence
        First data list (for example, time).
    y : sequence
        Second data list.
    use_ties : {True, False}, optional
        Whether ties correction should be performed.
    use_missing : {False, True}, optional
        Whether missing data should be allocated a rank of 0 (False) or the
        average rank (True).
    method : {'auto', 'asymptotic', 'exact'}, optional
        Defines which method is used to calculate the p-value [1]_.
        'asymptotic' uses a normal approximation valid for large samples.
        'exact' computes the exact p-value, but can only be used if no ties
        are present. As the sample size increases, the 'exact' computation
        time may grow and the result may lose some precision.
        'auto' is the default and selects the appropriate
        method based on a trade-off between speed and accuracy.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the rank correlation is nonzero
        * 'less': the rank correlation is negative (less than zero)
        * 'greater': the rank correlation is positive (greater than zero)

    Returns
    -------
    res : SignificanceResult
        An object containing attributes:

        statistic : float
            The tau statistic.
        pvalue : float
            The p-value for a hypothesis test whose null hypothesis is
            an absence of association, tau = 0.

    References
    ----------
    .. [1] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
           Charles Griffin & Co., 1970.

    """
    (x, y, n) = _chk_size(x, y)
    (x, y) = (x.flatten(), y.flatten())
    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    if m is not nomask:
        x = ma.array(x, mask=m, copy=True)
        y = ma.array(y, mask=m, copy=True)
        # need int() here, otherwise numpy defaults to 32 bit
        # integer on all Windows architectures, causing overflow.
        # int() will keep it infinite precision.
        n -= int(m.sum())

    if n < 2:
        res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
        res.correlation = np.nan
        return res

    rx = ma.masked_equal(rankdata(x, use_missing=use_missing), 0)
    ry = ma.masked_equal(rankdata(y, use_missing=use_missing), 0)
    idx = rx.argsort()
    (rx, ry) = (rx[idx], ry[idx])
    C = np.sum([((ry[i+1:] > ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum()
                for i in range(len(ry)-1)], dtype=float)
    D = np.sum([((ry[i+1:] < ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum()
                for i in range(len(ry)-1)], dtype=float)
    xties = count_tied_groups(x)
    yties = count_tied_groups(y)
    if use_ties:
        corr_x = np.sum([v*k*(k-1) for (k, v) in xties.items()], dtype=float)
        corr_y = np.sum([v*k*(k-1) for (k, v) in yties.items()], dtype=float)
        denom = ma.sqrt((n*(n-1)-corr_x)/2. * (n*(n-1)-corr_y)/2.)
    else:
        denom = n*(n-1)/2.
    tau = (C-D) / denom

    if method == 'exact' and (xties or yties):
        raise ValueError("Ties found, exact method cannot be used.")

    if method == 'auto':
        if (not xties and not yties) and (n <= 33 or min(C, n*(n-1)/2.0-C) <= 1):
            method = 'exact'
        else:
            method = 'asymptotic'

    if not xties and not yties and method == 'exact':
        prob = _kendall_p_exact(n, C, alternative)

    elif method == 'asymptotic':
        var_s = n*(n-1)*(2*n+5)
        if use_ties:
            var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k, v) in xties.items()])
            var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k, v) in yties.items()])
            v1 = np.sum([v*k*(k-1) for (k, v) in xties.items()], dtype=float) *\
                 np.sum([v*k*(k-1) for (k, v) in yties.items()], dtype=float)
            v1 /= 2.*n*(n-1)
            if n > 2:
                v2 = np.sum([v*k*(k-1)*(k-2) for (k, v) in xties.items()],
                            dtype=float) * \
                     np.sum([v*k*(k-1)*(k-2) for (k, v) in yties.items()],
                            dtype=float)
                v2 /= 9.*n*(n-1)*(n-2)
            else:
                v2 = 0
        else:
            v1 = v2 = 0

        var_s /= 18.
        var_s += (v1 + v2)
        z = (C-D)/np.sqrt(var_s)
        _, prob = scipy.stats._stats_py._normtest_finish(z, alternative)
    else:
        raise ValueError("Unknown method "+str(method)+" specified, please "
                         "use auto, exact or asymptotic.")

    res = scipy.stats._stats_py.SignificanceResult(tau, prob)
    res.correlation = tau
    return res
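
# A minimal usage sketch for `kendalltau` (added for illustration; not part of
# the original module). Masked positions in either input are ignored in both:
#
#     >>> import numpy.ma as ma
#     >>> from scipy.stats import mstats
#     >>> x = ma.array([1, 2, 3, 4], mask=[0, 0, 0, 1])
#     >>> y = [1, 3, 2, 9]
#     >>> res = mstats.kendalltau(x, y)
#     >>> res.statistic   # 3 pairs: 2 concordant, 1 discordant -> 1/3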


def kendalltau_seasonal(x):
    """
    Computes a multivariate Kendall's rank correlation tau, for seasonal data.

    Parameters
    ----------
    x : 2-D ndarray
        Array of seasonal data, with seasons in columns.

    """
    x = ma.array(x, subok=True, copy=False, ndmin=2)
    (n, m) = x.shape
    n_p = x.count(0)

    S_szn = sum(msign(x[i:]-x[i]).sum(0) for i in range(n))
    S_tot = S_szn.sum()

    n_tot = x.count()
    ties = count_tied_groups(x.compressed())
    corr_ties = sum(v*k*(k-1) for (k, v) in ties.items())
    denom_tot = ma.sqrt(1.*n_tot*(n_tot-1)*(n_tot*(n_tot-1)-corr_ties))/2.

    R = rankdata(x, axis=0, use_missing=True)
    K = ma.empty((m, m), dtype=int)
    covmat = ma.empty((m, m), dtype=float)
    denom_szn = ma.empty(m, dtype=float)
    for j in range(m):
        ties_j = count_tied_groups(x[:, j].compressed())
        corr_j = sum(v*k*(k-1) for (k, v) in ties_j.items())
        cmb = n_p[j]*(n_p[j]-1)
        for k in range(j, m, 1):
            K[j, k] = sum(msign((x[i:, j]-x[i, j])*(x[i:, k]-x[i, k])).sum()
                          for i in range(n))
            covmat[j, k] = (K[j, k] + 4*(R[:, j]*R[:, k]).sum() -
                            n*(n_p[j]+1)*(n_p[k]+1))/3.
            K[k, j] = K[j, k]
            covmat[k, j] = covmat[j, k]

        denom_szn[j] = ma.sqrt(cmb*(cmb-corr_j)) / 2.

    var_szn = covmat.diagonal()

    z_szn = msign(S_szn) * (abs(S_szn)-1) / ma.sqrt(var_szn)
    z_tot_ind = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(var_szn.sum())
    z_tot_dep = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(covmat.sum())

    prob_szn = special.erfc(abs(z_szn)/np.sqrt(2))
    prob_tot_ind = special.erfc(abs(z_tot_ind)/np.sqrt(2))
    prob_tot_dep = special.erfc(abs(z_tot_dep)/np.sqrt(2))

    chi2_tot = (z_szn*z_szn).sum()
    chi2_trd = m * z_szn.mean()**2
    output = {'seasonal tau': S_szn/denom_szn,
              'global tau': S_tot/denom_tot,
              'global tau (alt)': S_tot/denom_szn.sum(),
              'seasonal p-value': prob_szn,
              'global p-value (indep)': prob_tot_ind,
              'global p-value (dep)': prob_tot_dep,
              'chi2 total': chi2_tot,
              'chi2 trend': chi2_trd,
              }
    return output
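
# A minimal usage sketch for `kendalltau_seasonal` (added for illustration; not
# part of the original module). The function returns a plain dict of results:
#
#     >>> import numpy as np
#     >>> from scipy.stats import mstats
#     >>> x = np.random.default_rng(0).random((12, 4))   # 12 obs, 4 seasons
#     >>> out = mstats.kendalltau_seasonal(x)
#     >>> sorted(out)[:3]
#     ['chi2 total', 'chi2 trend', 'global p-value (dep)']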


PointbiserialrResult = namedtuple('PointbiserialrResult', ('correlation',
                                                           'pvalue'))


def pointbiserialr(x, y):
    """Calculates a point biserial correlation coefficient and its p-value.

    Parameters
    ----------
    x : array_like of bools
        Input array.
    y : array_like
        Input array.

    Returns
    -------
    correlation : float
        R value
    pvalue : float
        2-tailed p-value

    Notes
    -----
    Missing values are considered pair-wise: if a value is missing in x,
    the corresponding value in y is masked.

    For more details on `pointbiserialr`, see `scipy.stats.pointbiserialr`.

    """
    x = ma.fix_invalid(x, copy=True).astype(bool)
    y = ma.fix_invalid(y, copy=True).astype(float)
    # Get rid of the missing data
    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    if m is not nomask:
        unmask = np.logical_not(m)
        x = x[unmask]
        y = y[unmask]

    n = len(x)
    # phat is the fraction of x values that are True
    phat = x.sum() / float(n)
    y0 = y[~x]  # y-values where x is False
    y1 = y[x]  # y-values where x is True
    y0m = y0.mean()
    y1m = y1.mean()

    rpb = (y1m - y0m)*np.sqrt(phat * (1-phat)) / y.std()

    df = n-2
    t = rpb*ma.sqrt(df/(1.0-rpb**2))
    prob = _betai(0.5*df, 0.5, df/(df+t*t))

    return PointbiserialrResult(rpb, prob)
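
# A minimal usage sketch for `pointbiserialr` (added for illustration; not part
# of the original module). `x` is boolean, `y` is numeric:
#
#     >>> from scipy.stats import mstats
#     >>> x = [True, True, True, False, False, False]
#     >>> y = [3.0, 2.5, 3.5, 1.0, 0.5, 1.5]
#     >>> res = mstats.pointbiserialr(x, y)
#     >>> res.correlation > 0   # the x=True group has the larger mean y
#     True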


def linregress(x, y=None):
    r"""
    Linear regression calculation

    Note that the non-masked version is used, and that this docstring is
    replaced by the non-masked docstring + some info on missing data.

    """
    if y is None:
        x = ma.array(x)
        if x.shape[0] == 2:
            x, y = x
        elif x.shape[1] == 2:
            x, y = x.T
        else:
            raise ValueError("If only `x` is given as input, "
                             "it has to be of shape (2, N) or (N, 2), "
                             f"provided shape was {x.shape}")
    else:
        x = ma.array(x)
        y = ma.array(y)

    x = x.flatten()
    y = y.flatten()

    if np.amax(x) == np.amin(x) and len(x) > 1:
        raise ValueError("Cannot calculate a linear regression "
                         "if all x values are identical")

    m = ma.mask_or(ma.getmask(x), ma.getmask(y), shrink=False)
    if m is not nomask:
        x = ma.array(x, mask=m)
        y = ma.array(y, mask=m)
        if np.any(~m):
            result = stats_linregress(x.data[~m], y.data[~m])
        else:
            # All data is masked
            result = stats_LinregressResult(slope=None, intercept=None,
                                            rvalue=None, pvalue=None,
                                            stderr=None,
                                            intercept_stderr=None)
    else:
        result = stats_linregress(x.data, y.data)

    return result
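
# A minimal usage sketch for the masked `linregress` wrapper (added for
# illustration; not part of the original module). Masked points are dropped
# before delegating to the non-masked implementation:
#
#     >>> import numpy.ma as ma
#     >>> from scipy.stats import mstats
#     >>> x = ma.array([0, 1, 2, 3], mask=[0, 0, 0, 1])
#     >>> y = [1, 3, 5, 99]   # the outlier sits at the masked position
#     >>> res = mstats.linregress(x, y)
#     >>> res.slope, res.intercept   # fit of the 3 unmasked points -> (2.0, 1.0)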


def theilslopes(y, x=None, alpha=0.95, method='separate'):
    r"""
    Computes the Theil-Sen estimator for a set of points (x, y).

    `theilslopes` implements a method for robust linear regression. It
    computes the slope as the median of all slopes between paired values.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    alpha : float, optional
        Confidence degree between 0 and 1. Default is 95% confidence.
        Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
        interpreted as "find the 90% confidence interval".
    method : {'joint', 'separate'}, optional
        Method to be used for computing estimate for intercept.
        Following methods are supported,

        * 'joint': Uses np.median(y - slope * x) as intercept.
        * 'separate': Uses np.median(y) - slope * np.median(x)
          as intercept.

        The default is 'separate'.

        .. versionadded:: 1.8.0

    Returns
    -------
    result : ``TheilslopesResult`` instance
        The return value is an object with the following attributes:

        slope : float
            Theil slope.
        intercept : float
            Intercept of the Theil line.
        low_slope : float
            Lower bound of the confidence interval on `slope`.
        high_slope : float
            Upper bound of the confidence interval on `slope`.

    See Also
    --------
    siegelslopes : a similar technique using repeated medians

    Notes
    -----
    For more details on `theilslopes`, see `scipy.stats.theilslopes`.

    """
    y = ma.asarray(y).flatten()
    if x is None:
        x = ma.arange(len(y), dtype=float)
    else:
        x = ma.asarray(x).flatten()
        if len(x) != len(y):
            raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))

    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    y._mask = x._mask = m
    # Disregard any masked elements of x or y
    y = y.compressed()
    x = x.compressed().astype(float)
    # We now have unmasked arrays so can use `scipy.stats.theilslopes`
    return stats_theilslopes(y, x, alpha=alpha, method=method)


def siegelslopes(y, x=None, method="hierarchical"):
    r"""
    Computes the Siegel estimator for a set of points (x, y).

    `siegelslopes` implements a method for robust linear regression
    using repeated medians to fit a line to the points (x, y).
    The method is robust to outliers with an asymptotic breakdown point
    of 50%.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    method : {'hierarchical', 'separate'}
        If 'hierarchical', estimate the intercept using the estimated
        slope ``slope`` (default option).
        If 'separate', estimate the intercept independent of the estimated
        slope. See Notes for details.

    Returns
    -------
    result : ``SiegelslopesResult`` instance
        The return value is an object with the following attributes:

        slope : float
            Estimate of the slope of the regression line.
        intercept : float
            Estimate of the intercept of the regression line.

    See Also
    --------
    theilslopes : a similar technique without repeated medians

    Notes
    -----
    For more details on `siegelslopes`, see `scipy.stats.siegelslopes`.

    """
    y = ma.asarray(y).ravel()
    if x is None:
        x = ma.arange(len(y), dtype=float)
    else:
        x = ma.asarray(x).ravel()
        if len(x) != len(y):
            raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))

    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    y._mask = x._mask = m
    # Disregard any masked elements of x or y
    y = y.compressed()
    x = x.compressed().astype(float)
    # We now have unmasked arrays so can use `scipy.stats.siegelslopes`
    return stats_siegelslopes(y, x, method=method)


SenSeasonalSlopesResult = _make_tuple_bunch('SenSeasonalSlopesResult',
                                            ['intra_slope', 'inter_slope'])


def sen_seasonal_slopes(x):
    r"""
    Computes seasonal Theil-Sen and Kendall slope estimators.

    The seasonal generalization of Sen's slope computes the slopes between all
    pairs of values within a "season" (column) of a 2D array. It returns an
    array containing the median of these "within-season" slopes for each
    season (the Theil-Sen slope estimator of each season), and it returns the
    median of the within-season slopes across all seasons (the seasonal Kendall
    slope estimator).

    Parameters
    ----------
    x : 2D array_like
        Each column of `x` contains measurements of the dependent variable
        within a season. The independent variable (usually time) of each season
        is assumed to be ``np.arange(x.shape[0])``.

    Returns
    -------
    result : ``SenSeasonalSlopesResult`` instance
        The return value is an object with the following attributes:

        intra_slope : ndarray
            For each season, the Theil-Sen slope estimator: the median of
            within-season slopes.
        inter_slope : float
            The seasonal Kendall slope estimator: the median of within-season
            slopes *across all* seasons.

    See Also
    --------
    theilslopes : the analogous function for non-seasonal data
    scipy.stats.theilslopes : non-seasonal slopes for non-masked arrays

    Notes
    -----
    The slopes :math:`d_{ijk}` within season :math:`i` are:

    .. math::

        d_{ijk} = \frac{x_{ij} - x_{ik}}{j - k}

    for pairs of distinct integer indices :math:`j, k` of :math:`x`.

    Element :math:`i` of the returned `intra_slope` array is the median of the
    :math:`d_{ijk}` over all :math:`j < k`; this is the Theil-Sen slope
    estimator of season :math:`i`. The returned `inter_slope` value, better
    known as the seasonal Kendall slope estimator, is the median of the
    :math:`d_{ijk}` over all :math:`i, j, k`.

    References
    ----------
    .. [1] Hirsch, Robert M., James R. Slack, and Richard A. Smith.
           "Techniques of trend analysis for monthly water quality data."
           *Water Resources Research* 18.1 (1982): 107-121.

    Examples
    --------
    Suppose we have 100 observations of a dependent variable for each of four
    seasons:

    >>> import numpy as np
    >>> rng = np.random.default_rng()
    >>> x = rng.random(size=(100, 4))

    We compute the seasonal slopes as:

    >>> from scipy import stats
    >>> intra_slope, inter_slope = stats.mstats.sen_seasonal_slopes(x)

    If we define a function to compute all slopes between observations within
    a season:

    >>> def dijk(yi):
    ...     n = len(yi)
    ...     x = np.arange(n)
    ...     dy = yi - yi[:, np.newaxis]
    ...     dx = x - x[:, np.newaxis]
    ...     # we only want unique pairs of distinct indices
    ...     mask = np.triu(np.ones((n, n), dtype=bool), k=1)
    ...     return dy[mask]/dx[mask]

    then element ``i`` of ``intra_slope`` is the median of ``dijk[x[:, i]]``:

    >>> i = 2
    >>> np.allclose(np.median(dijk(x[:, i])), intra_slope[i])
    True

    and ``inter_slope`` is the median of the values returned by ``dijk`` for
    all seasons:

    >>> all_slopes = np.concatenate([dijk(x[:, i]) for i in range(x.shape[1])])
    >>> np.allclose(np.median(all_slopes), inter_slope)
    True

    Because the data are randomly generated, we would expect the median slopes
    to be nearly zero both within and across all seasons, and indeed they are:

    >>> intra_slope.data
    array([ 0.00124504, -0.00277761, -0.00221245, -0.00036338])
    >>> inter_slope
    -0.0010511779872922058

    """
    x = ma.array(x, subok=True, copy=False, ndmin=2)
    (n, _) = x.shape
    # Get list of slopes per season
    szn_slopes = ma.vstack([(x[i+1:]-x[i])/np.arange(1, n-i)[:, None]
                            for i in range(n)])
    szn_medslopes = ma.median(szn_slopes, axis=0)
    medslope = ma.median(szn_slopes, axis=None)
    return SenSeasonalSlopesResult(szn_medslopes, medslope)


Ttest_1sampResult = namedtuple('Ttest_1sampResult', ('statistic', 'pvalue'))


def ttest_1samp(a, popmean, axis=0, alternative='two-sided'):
    """
    Calculates the T-test for the mean of ONE group of scores.

    Parameters
    ----------
    a : array_like
        sample observation
    popmean : float or array_like
        expected value in null hypothesis, if array_like then it must have the
        same shape as `a` excluding the axis dimension
    axis : int or None, optional
        Axis along which to compute test. If None, compute over the whole
        array `a`.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.
        The following options are available (default is 'two-sided'):

        * 'two-sided': the mean of the underlying distribution of the sample
          is different than the given population mean (`popmean`)
        * 'less': the mean of the underlying distribution of the sample is
          less than the given population mean (`popmean`)
        * 'greater': the mean of the underlying distribution of the sample is
          greater than the given population mean (`popmean`)

        .. versionadded:: 1.7.0

    Returns
    -------
    statistic : float or array
        t-statistic
    pvalue : float or array
        The p-value

    Notes
    -----
    For more details on `ttest_1samp`, see `scipy.stats.ttest_1samp`.

    """
    a, axis = _chk_asarray(a, axis)
    if a.size == 0:
        return (np.nan, np.nan)

    x = a.mean(axis=axis)
    v = a.var(axis=axis, ddof=1)
    n = a.count(axis=axis)
    # force df to be an array for masked division not to throw a warning
    df = ma.asanyarray(n - 1.0)
    svar = ((n - 1.0) * v) / df
    with np.errstate(divide='ignore', invalid='ignore'):
        t = (x - popmean) / ma.sqrt(svar / n)

    t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
    return Ttest_1sampResult(t, prob)
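
# A minimal usage sketch for `ttest_1samp` (added for illustration; not part of
# the original module). Masked entries do not contribute to the mean, the
# variance, or the degrees of freedom:
#
#     >>> import numpy.ma as ma
#     >>> from scipy.stats import mstats
#     >>> a = ma.array([5.1, 4.9, 5.0, 5.2, -99.], mask=[0, 0, 0, 0, 1])
#     >>> res = mstats.ttest_1samp(a, popmean=5.0)
#     >>> res.statistic   # computed from the 4 unmasked values only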


ttest_onesamp = ttest_1samp


Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue'))


def ttest_ind(a, b, axis=0, equal_var=True, alternative='two-sided'):
    """
    Calculates the T-test for the means of TWO INDEPENDENT samples of scores.

    Parameters
    ----------
    a, b : array_like
        The arrays must have the same shape, except in the dimension
        corresponding to `axis` (the first, by default).
    axis : int or None, optional
        Axis along which to compute test. If None, compute over the whole
        arrays, `a`, and `b`.
    equal_var : bool, optional
        If True, perform a standard independent 2 sample test that assumes equal
        population variances.
        If False, perform Welch's t-test, which does not assume equal population
        variance.

        .. versionadded:: 0.17.0
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.
        The following options are available (default is 'two-sided'):

        * 'two-sided': the means of the distributions underlying the samples
          are unequal.
        * 'less': the mean of the distribution underlying the first sample
          is less than the mean of the distribution underlying the second
          sample.
        * 'greater': the mean of the distribution underlying the first
          sample is greater than the mean of the distribution underlying
          the second sample.

        .. versionadded:: 1.7.0

    Returns
    -------
    statistic : float or array
        The calculated t-statistic.
    pvalue : float or array
        The p-value.

    Notes
    -----
    For more details on `ttest_ind`, see `scipy.stats.ttest_ind`.

    """
    a, b, axis = _chk2_asarray(a, b, axis)

    if a.size == 0 or b.size == 0:
        return Ttest_indResult(np.nan, np.nan)

    (x1, x2) = (a.mean(axis), b.mean(axis))
    (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1))
    (n1, n2) = (a.count(axis), b.count(axis))

    if equal_var:
        # force df to be an array for masked division not to throw a warning
        df = ma.asanyarray(n1 + n2 - 2.0)
        svar = ((n1-1)*v1+(n2-1)*v2) / df
        denom = ma.sqrt(svar*(1.0/n1 + 1.0/n2))  # n-D computation here!
    else:
        vn1 = v1/n1
        vn2 = v2/n2
        with np.errstate(divide='ignore', invalid='ignore'):
            df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))

        # If df is undefined, variances are zero.
        # It doesn't matter what df is as long as it is not NaN.
        df = np.where(np.isnan(df), 1, df)
        denom = ma.sqrt(vn1 + vn2)

    with np.errstate(divide='ignore', invalid='ignore'):
        t = (x1-x2) / denom

    t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
    return Ttest_indResult(t, prob)


Ttest_relResult = namedtuple('Ttest_relResult', ('statistic', 'pvalue'))


def ttest_rel(a, b, axis=0, alternative='two-sided'):
    """
    Calculates the T-test on TWO RELATED samples of scores, a and b.

    Parameters
    ----------
    a, b : array_like
        The arrays must have the same shape.
    axis : int or None, optional
        Axis along which to compute test. If None, compute over the whole
        arrays, `a`, and `b`.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.
        The following options are available (default is 'two-sided'):

        * 'two-sided': the means of the distributions underlying the samples
          are unequal.
        * 'less': the mean of the distribution underlying the first sample
          is less than the mean of the distribution underlying the second
          sample.
        * 'greater': the mean of the distribution underlying the first
          sample is greater than the mean of the distribution underlying
          the second sample.

        .. versionadded:: 1.7.0

    Returns
    -------
    statistic : float or array
        t-statistic
    pvalue : float or array
        two-tailed p-value

    Notes
    -----
    For more details on `ttest_rel`, see `scipy.stats.ttest_rel`.

    """
    a, b, axis = _chk2_asarray(a, b, axis)
    if len(a) != len(b):
        raise ValueError('unequal length arrays')

    if a.size == 0 or b.size == 0:
        return Ttest_relResult(np.nan, np.nan)

    n = a.count(axis)
    df = ma.asanyarray(n-1.0)
    d = (a-b).astype('d')
    dm = d.mean(axis)
    v = d.var(axis=axis, ddof=1)
    denom = ma.sqrt(v / n)
    with np.errstate(divide='ignore', invalid='ignore'):
        t = dm / denom

    t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
    return Ttest_relResult(t, prob)


MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic',
                                                       'pvalue'))


def mannwhitneyu(x, y, use_continuity=True):
    """
    Computes the Mann-Whitney statistic

    Missing values in `x` and/or `y` are discarded.

    Parameters
    ----------
    x : sequence
        Input
    y : sequence
        Input
    use_continuity : {True, False}, optional
        Whether a continuity correction (1/2.) should be taken into account.

    Returns
    -------
    statistic : float
        The minimum of the Mann-Whitney statistics
    pvalue : float
        Approximate two-sided p-value assuming a normal distribution.

    """
    x = ma.asarray(x).compressed().view(ndarray)
    y = ma.asarray(y).compressed().view(ndarray)
    ranks = rankdata(np.concatenate([x, y]))
    (nx, ny) = (len(x), len(y))
    nt = nx + ny
    U = ranks[:nx].sum() - nx*(nx+1)/2.
    U = max(U, nx*ny - U)
    u = nx*ny - U

    mu = (nx*ny)/2.
    sigsq = (nt**3 - nt)/12.
    ties = count_tied_groups(ranks)
    sigsq -= sum(v*(k**3-k) for (k, v) in ties.items())/12.
    sigsq *= nx*ny/float(nt*(nt-1))

    if use_continuity:
        z = (U - 1/2. - mu) / ma.sqrt(sigsq)
    else:
        z = (U - mu) / ma.sqrt(sigsq)

    prob = special.erfc(abs(z)/np.sqrt(2))
    return MannwhitneyuResult(u, prob)
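
# A minimal usage sketch for `mannwhitneyu` (added for illustration; not part
# of the original module). The statistic returned is the smaller of the two
# Mann-Whitney U values, and masked entries are dropped first:
#
#     >>> from scipy.stats import mstats
#     >>> x = [1, 2, 3]
#     >>> y = [4, 5, 6, 7]
#     >>> mstats.mannwhitneyu(x, y).statistic   # every y beats every x -> 0.0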


KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue'))


def kruskal(*args):
    """
    Compute the Kruskal-Wallis H-test for independent samples

    Parameters
    ----------
    sample1, sample2, ... : array_like
        Two or more arrays with the sample measurements can be given as
        arguments.

    Returns
    -------
    statistic : float
        The Kruskal-Wallis H statistic, corrected for ties
    pvalue : float
        The p-value for the test using the assumption that H has a chi
        square distribution

    Notes
    -----
    For more details on `kruskal`, see `scipy.stats.kruskal`.

    Examples
    --------
    >>> from scipy.stats.mstats import kruskal

    Random samples from three different brands of batteries were tested
    to see how long the charge lasted. Results were as follows:

    >>> a = [6.3, 5.4, 5.7, 5.2, 5.0]
    >>> b = [6.9, 7.0, 6.1, 7.9]
    >>> c = [7.2, 6.9, 6.1, 6.5]

    Test the hypothesis that the distribution functions for all of the brands'
    durations are identical. Use 5% level of significance.

    >>> kruskal(a, b, c)
    KruskalResult(statistic=7.113812154696133, pvalue=0.028526948491942164)

    The null hypothesis is rejected at the 5% level of significance
    because the returned p-value is less than the critical value of 5%.

    """
    output = argstoarray(*args)
    ranks = ma.masked_equal(rankdata(output, use_missing=False), 0)
    sumrk = ranks.sum(-1)
    ngrp = ranks.count(-1)
    ntot = ranks.count()
    H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1)
    # Tie correction
    ties = count_tied_groups(ranks)
    T = 1. - sum(v*(k**3-k) for (k, v) in ties.items())/float(ntot**3-ntot)
    if T == 0:
        raise ValueError('All numbers are identical in kruskal')

    H /= T
    df = len(output) - 1
    prob = distributions.chi2.sf(H, df)
    return KruskalResult(H, prob)


kruskalwallis = kruskal


@_rename_parameter("mode", "method")
def ks_1samp(x, cdf, args=(), alternative="two-sided", method='auto'):
    """
    Computes the Kolmogorov-Smirnov test on one sample of masked values.

    Missing values in `x` are discarded.

    Parameters
    ----------
    x : array_like
        a 1-D array of observations of random variables.
    cdf : str or callable
        If a string, it should be the name of a distribution in `scipy.stats`.
        If a callable, that callable is used to calculate the cdf.
    args : tuple, sequence, optional
        Distribution parameters, used if `cdf` is a string.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Indicates the alternative hypothesis. Default is 'two-sided'.
    method : {'auto', 'exact', 'asymp'}, optional
        Defines the method used for calculating the p-value.
        The following options are available (default is 'auto'):

        * 'auto' : use 'exact' for small size arrays, 'asymp' for large
        * 'exact' : use approximation to exact distribution of test statistic
        * 'asymp' : use asymptotic distribution of test statistic

    Returns
    -------
    d : float
        Value of the Kolmogorov Smirnov test
    p : float
        Corresponding p-value.

    """
    alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
        alternative.lower()[0], alternative)
    return scipy.stats._stats_py.ks_1samp(
        x, cdf, args=args, alternative=alternative, method=method)


@_rename_parameter("mode", "method")
def ks_2samp(data1, data2, alternative="two-sided", method='auto'):
    """
    Computes the Kolmogorov-Smirnov test on two samples.

    Missing values in `x` and/or `y` are discarded.

    Parameters
    ----------
    data1 : array_like
        First data set
    data2 : array_like
        Second data set
    alternative : {'two-sided', 'less', 'greater'}, optional
        Indicates the alternative hypothesis. Default is 'two-sided'.
    method : {'auto', 'exact', 'asymp'}, optional
        Defines the method used for calculating the p-value.
        The following options are available (default is 'auto'):

        * 'auto' : use 'exact' for small size arrays, 'asymp' for large
        * 'exact' : use approximation to exact distribution of test statistic
        * 'asymp' : use asymptotic distribution of test statistic

    Returns
    -------
    d : float
        Value of the Kolmogorov Smirnov test
    p : float
        Corresponding p-value.

    """
    # Ideally this would be accomplished by
    # ks_2samp = scipy.stats._stats_py.ks_2samp
    # but the circular dependencies between _mstats_basic and stats prevent that.
    alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
        alternative.lower()[0], alternative)
    return scipy.stats._stats_py.ks_2samp(data1, data2,
                                          alternative=alternative,
                                          method=method)


ks_twosamp = ks_2samp


@_rename_parameter("mode", "method")
def kstest(data1, data2, args=(), alternative='two-sided', method='auto'):
    """

    Parameters
    ----------
    data1 : array_like
    data2 : str, callable or array_like
    args : tuple, sequence, optional
        Distribution parameters, used if `data1` or `data2` are strings.
    alternative : str, as documented in stats.kstest
    method : str, as documented in stats.kstest

    Returns
    -------
    tuple of (K-S statistic, probability)

    """
    return scipy.stats._stats_py.kstest(data1, data2, args,
                                        alternative=alternative, method=method)


def trima(a, limits=None, inclusive=(True, True)):
    """
    Trims an array by masking the data outside some given limits.

    Returns a masked version of the input array.

    Parameters
    ----------
    a : array_like
        Input array.
    limits : {None, tuple}, optional
        Tuple of (lower limit, upper limit) in absolute values.
        Values of the input array lower (greater) than the lower (upper) limit
        will be masked. A limit of None indicates an open interval.
    inclusive : (bool, bool) tuple, optional
        Tuple of (lower flag, upper flag), indicating whether values exactly
        equal to the lower (upper) limit are allowed.

    Examples
    --------
    >>> from scipy.stats.mstats import trima
    >>> import numpy as np

    >>> a = np.arange(10)

    The interval is left-closed and right-open, i.e., `[2, 8)`.
    Trim the array by keeping only values in the interval.

    >>> trima(a, limits=(2, 8), inclusive=(True, False))
    masked_array(data=[--, --, 2, 3, 4, 5, 6, 7, --, --],
                 mask=[ True,  True, False, False, False, False, False, False,
                        True,  True],
           fill_value=999999)

    """
    a = ma.asarray(a)
    a.unshare_mask()
    if (limits is None) or (limits == (None, None)):
        return a

    (lower_lim, upper_lim) = limits
    (lower_in, upper_in) = inclusive
    condition = False
    if lower_lim is not None:
        if lower_in:
            condition |= (a < lower_lim)
        else:
            condition |= (a <= lower_lim)

    if upper_lim is not None:
        if upper_in:
            condition |= (a > upper_lim)
        else:
            condition |= (a >= upper_lim)

    a[condition.filled(True)] = masked
    return a
1801def trimr(a, limits=None, inclusive=(True, True), axis=None):
1802 """
1803 Trims an array by masking some proportion of the data on each end.
1804 Returns a masked version of the input array.
1806 Parameters
1807 ----------
1808 a : sequence
1809 Input array.
1810 limits : {None, tuple}, optional
1811 Tuple of the percentages to cut on each side of the array, with respect
1812 to the number of unmasked data, as floats between 0. and 1.
1813 If n denotes the number of unmasked data before trimming, the
1814 (n*limits[0])th smallest data and the (n*limits[1])th largest data are
1815 masked, and the total number of unmasked data after trimming is
1816 n*(1.-sum(limits)). The value of one limit can be set to None to
1817 indicate an open interval.
1818 inclusive : {(True,True) tuple}, optional
1819 Tuple of flags indicating whether the number of data being masked on
1820 the left (right) end should be truncated (True) or rounded (False) to
1821 integers.
1822 axis : {None,int}, optional
1823 Axis along which to trim. If None, the whole array is trimmed, but its
1824 shape is maintained.
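Examples
--------
A small illustrative sketch with the default truncation behaviour
(``inclusive=(True, True)``): with n = 5, ``limits=(0.2, 0.4)`` masks
``int(0.2*5) = 1`` value at the low end and ``int(0.4*5) = 2`` values
at the high end.
>>> from scipy.stats.mstats import trimr
>>> print(trimr([5, 1, 4, 2, 3], limits=(0.2, 0.4)))
[-- -- -- 2 3]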
1826 """
1827 def _trimr1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
1828 n = a.count()
1829 idx = a.argsort()
1830 if low_limit:
1831 if low_inclusive:
1832 lowidx = int(low_limit*n)
1833 else:
1834 lowidx = int(np.round(low_limit*n))
1835 a[idx[:lowidx]] = masked
1836 if up_limit is not None:
1837 if up_inclusive:
1838 upidx = n - int(n*up_limit)
1839 else:
1840 upidx = n - int(np.round(n*up_limit))
1841 a[idx[upidx:]] = masked
1842 return a
1844 a = ma.asarray(a)
1845 a.unshare_mask()
1846 if limits is None:
1847 return a
1849 # Check the limits
1850 (lolim, uplim) = limits
1851 errmsg = "The proportion to cut from the %s should be between 0. and 1."
1852 if lolim is not None:
1853 if lolim > 1. or lolim < 0:
1854 raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
1855 if uplim is not None:
1856 if uplim > 1. or uplim < 0:
1857 raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
1859 (loinc, upinc) = inclusive
1861 if axis is None:
1862 shp = a.shape
1863 return _trimr1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp)
1864 else:
1865 return ma.apply_along_axis(_trimr1D, axis, a, lolim,uplim,loinc,upinc)
1868trimdoc = """
1869 Parameters
1870 ----------
1871 a : sequence
1872 Input array
1873 limits : {None, tuple}, optional
1874 If `relative` is False, tuple (lower limit, upper limit) in absolute values.
1875 Values of the input array lower (greater) than the lower (upper) limit are
1876 masked.
1878 If `relative` is True, tuple (lower percentage, upper percentage) to cut
1879 on each side of the array, with respect to the number of unmasked data.
1881 If n denotes the number of unmasked data before trimming, the (n*limits[0])th
1882 smallest data and the (n*limits[1])th largest data are masked, and the
1883 total number of unmasked data after trimming is n*(1.-sum(limits)).
1884 In each case, the value of one limit can be set to None to indicate an
1885 open interval.
1887 If limits is None, no trimming is performed.
1888 inclusive : {(bool, bool) tuple}, optional
1889 If `relative` is False, tuple indicating whether values exactly equal
1890 to the absolute limits are allowed.
1891 If `relative` is True, tuple indicating whether the number of data
1892 being masked on each side should be truncated (True) or rounded
1893 (False).
1894 relative : bool, optional
1895 Whether to consider the limits as absolute values (False) or proportions
1896 to cut (True).
1897 axis : int, optional
1898 Axis along which to trim.
1899"""
1902def trim(a, limits=None, inclusive=(True,True), relative=False, axis=None):
1903 """
1904 Trims an array by masking the data outside some given limits.
1906 Returns a masked version of the input array.
1908 %s
1910 Examples
1911 --------
1912 >>> from scipy.stats.mstats import trim
1913 >>> z = [ 1, 2, 3, 4, 5, 6, 7, 8, 9,10]
1914 >>> print(trim(z,(3,8)))
1915 [-- -- 3 4 5 6 7 8 -- --]
1916 >>> print(trim(z,(0.1,0.2),relative=True))
1917 [-- 2 3 4 5 6 7 8 -- --]
1919 """
1920 if relative:
1921 return trimr(a, limits=limits, inclusive=inclusive, axis=axis)
1922 else:
1923 return trima(a, limits=limits, inclusive=inclusive)
1926if trim.__doc__:
1927 trim.__doc__ = trim.__doc__ % trimdoc
1930def trimboth(data, proportiontocut=0.2, inclusive=(True,True), axis=None):
1931 """
1932 Trims the smallest and largest data values.
1934 Trims the `data` by masking the ``int(proportiontocut * n)`` smallest and
1935 ``int(proportiontocut * n)`` largest values of data along the given axis,
1936 where n is the number of unmasked values before trimming.
1938 Parameters
1939 ----------
1940 data : ndarray
1941 Data to trim.
1942 proportiontocut : float, optional
1943 Percentage of trimming (as a float between 0 and 1).
1944 If n is the number of unmasked values before trimming, the number of
1945 values after trimming is ``(1 - 2*proportiontocut) * n``.
1946 Default is 0.2.
1947 inclusive : {(bool, bool) tuple}, optional
1948 Tuple indicating whether the number of data being masked on each side
1949 should be truncated (True) or rounded (False).
1950 axis : int, optional
1951 Axis along which to perform the trimming.
1952 If None, the input array is first flattened.
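Examples
--------
A small illustrative sketch with the default ``proportiontocut=0.2``:
for n = 10, ``int(0.2*10) = 2`` values are masked on each end.
>>> from scipy.stats.mstats import trimboth
>>> print(trimboth([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
[-- -- 3 4 5 6 7 8 -- --]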
1954 """
1955 return trimr(data, limits=(proportiontocut,proportiontocut),
1956 inclusive=inclusive, axis=axis)
1959def trimtail(data, proportiontocut=0.2, tail='left', inclusive=(True,True),
1960 axis=None):
1961 """
1962 Trims the data by masking values from one tail.
1964 Parameters
1965 ----------
1966 data : array_like
1967 Data to trim.
1968 proportiontocut : float, optional
1969 Percentage of trimming. If n is the number of unmasked values
1970 before trimming, the number of values after trimming is
1971 ``(1 - proportiontocut) * n``. Default is 0.2.
1972 tail : {'left','right'}, optional
1973 If 'left' the `proportiontocut` lowest values will be masked.
1974 If 'right' the `proportiontocut` highest values will be masked.
1975 Default is 'left'.
1976 inclusive : {(bool, bool) tuple}, optional
1977 Tuple indicating whether the number of data being masked on each side
1978 should be truncated (True) or rounded (False). Default is
1979 (True, True).
1980 axis : int, optional
1981 Axis along which to perform the trimming.
1982 If None, the input array is first flattened. Default is None.
1984 Returns
1985 -------
1986 trimtail : ndarray
1987 Returned array of same shape as `data` with masked tail values.
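Examples
--------
A small illustrative sketch masking the highest 30 percent of ten values
(``int(0.3*10) = 3`` values):
>>> from scipy.stats.mstats import trimtail
>>> print(trimtail([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 0.3, tail='right'))
[1 2 3 4 5 6 7 -- -- --]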
1989 """
1990 tail = str(tail).lower()[0]
1991 if tail == 'l':
1992 limits = (proportiontocut,None)
1993 elif tail == 'r':
1994 limits = (None, proportiontocut)
1995 else:
1996 raise TypeError("The tail argument should be in ('left','right')")
1998 return trimr(data, limits=limits, axis=axis, inclusive=inclusive)
2001trim1 = trimtail
2004def trimmed_mean(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
2005 axis=None):
2006 """Returns the trimmed mean of the data along the given axis.
2008 %s
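Examples
--------
A small illustrative sketch with relative limits (the default): masking
the single lowest and single highest of ten values leaves 2..9, whose
mean is 5.5.
>>> from scipy.stats.mstats import trimmed_mean
>>> print(trimmed_mean([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], limits=(0.1, 0.1)))
5.5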
2010 """
2011 if (not isinstance(limits,tuple)) and isinstance(limits,float):
2012 limits = (limits, limits)
2013 if relative:
2014 return trimr(a,limits=limits,inclusive=inclusive,axis=axis).mean(axis=axis)
2015 else:
2016 return trima(a,limits=limits,inclusive=inclusive).mean(axis=axis)
2019if trimmed_mean.__doc__:
2020 trimmed_mean.__doc__ = trimmed_mean.__doc__ % trimdoc
2023def trimmed_var(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
2024 axis=None, ddof=0):
2025 """Returns the trimmed variance of the data along the given axis.
2027 %s
2028 ddof : {0,integer}, optional
2029 Means Delta Degrees of Freedom. The denominator used during
2030 computations is (n-ddof). DDOF=0 corresponds to a biased estimate,
2031 DDOF=1 to an unbiased estimate of the variance.
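Examples
--------
A small illustrative sketch with relative limits (the default): masking
the single lowest and single highest of ten values leaves 2..9, whose
ddof=0 variance is 5.25.
>>> from scipy.stats.mstats import trimmed_var
>>> print(trimmed_var([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], limits=(0.1, 0.1)))
5.25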
2033 """
2034 if (not isinstance(limits,tuple)) and isinstance(limits,float):
2035 limits = (limits, limits)
2036 if relative:
2037 out = trimr(a,limits=limits, inclusive=inclusive,axis=axis)
2038 else:
2039 out = trima(a,limits=limits,inclusive=inclusive)
2041 return out.var(axis=axis, ddof=ddof)
2044if trimmed_var.__doc__:
2045 trimmed_var.__doc__ = trimmed_var.__doc__ % trimdoc
2048def trimmed_std(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
2049 axis=None, ddof=0):
2050 """Returns the trimmed standard deviation of the data along the given axis.
2052 %s
2053 ddof : {0,integer}, optional
2054 Means Delta Degrees of Freedom. The denominator used during
2055 computations is (n-ddof). DDOF=0 corresponds to a biased estimate,
2056 DDOF=1 to an unbiased estimate of the variance.
2058 """
2059 if (not isinstance(limits,tuple)) and isinstance(limits,float):
2060 limits = (limits, limits)
2061 if relative:
2062 out = trimr(a,limits=limits,inclusive=inclusive,axis=axis)
2063 else:
2064 out = trima(a,limits=limits,inclusive=inclusive)
2065 return out.std(axis=axis,ddof=ddof)
2068if trimmed_std.__doc__:
2069 trimmed_std.__doc__ = trimmed_std.__doc__ % trimdoc
2072def trimmed_stde(a, limits=(0.1,0.1), inclusive=(1,1), axis=None):
2073 """
2074 Returns the standard error of the trimmed mean along the given axis.
2076 Parameters
2077 ----------
2078 a : sequence
2079 Input array
2080 limits : {(0.1,0.1), tuple of float}, optional
2081 tuple (lower percentage, upper percentage) to cut on each side of the
2082 array, with respect to the number of unmasked data.
2084 If n is the number of unmasked data before trimming, the values
2085 smaller than ``n * limits[0]`` and the values larger than
2086 ``n * limits[1]`` are masked, and the total number of unmasked
2087 data after trimming is ``n * (1.-sum(limits))``. In each case,
2088 the value of one limit can be set to None to indicate an open interval.
2089 If `limits` is None, no trimming is performed.
2090 inclusive : {(bool, bool) tuple}, optional
2091 Tuple indicating whether the number of data being masked on each side
2092 should be truncated (True) or rounded (False).
2093 axis : int, optional
2094 Axis along which to trim.
2096 Returns
2097 -------
2098 trimmed_stde : scalar or ndarray
2100 """
2101 def _trimmed_stde_1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
2102 "Returns the standard error of the trimmed mean for a 1D input data."
2103 n = a.count()
2104 idx = a.argsort()
lowidx, upidx = 0, n  # defaults, so the winsorizing step below is well-defined
2105 if low_limit:
2106 if low_inclusive:
2107 lowidx = int(low_limit*n)
2108 else:
2109 lowidx = int(np.round(low_limit*n))
2110 a[idx[:lowidx]] = masked
2111 if up_limit is not None:
2112 if up_inclusive:
2113 upidx = n - int(n*up_limit)
2114 else:
2115 upidx = n - int(np.round(n*up_limit))
2116 a[idx[upidx:]] = masked
2117 a[idx[:lowidx]] = a[idx[lowidx]]
2118 a[idx[upidx:]] = a[idx[upidx-1]]
2119 winstd = a.std(ddof=1)
2120 return winstd / ((1-low_limit-up_limit)*np.sqrt(len(a)))
2122 a = ma.array(a, copy=True, subok=True)
2123 a.unshare_mask()
2124 if limits is None:
2125 return a.std(axis=axis,ddof=1)/ma.sqrt(a.count(axis))
2126 if (not isinstance(limits,tuple)) and isinstance(limits,float):
2127 limits = (limits, limits)
2129 # Check the limits
2130 (lolim, uplim) = limits
2131 errmsg = "The proportion to cut from the %s should be between 0. and 1."
2132 if lolim is not None:
2133 if lolim > 1. or lolim < 0:
2134 raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
2135 if uplim is not None:
2136 if uplim > 1. or uplim < 0:
2137 raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
2139 (loinc, upinc) = inclusive
2140 if (axis is None):
2141 return _trimmed_stde_1D(a.ravel(),lolim,uplim,loinc,upinc)
2142 else:
2143 if a.ndim > 2:
2144 raise ValueError("Array 'a' must be at most two dimensional, "
2145 "but got a.ndim = %d" % a.ndim)
2146 return ma.apply_along_axis(_trimmed_stde_1D, axis, a,
2147 lolim,uplim,loinc,upinc)
2150def _mask_to_limits(a, limits, inclusive):
2151 """Mask an array for values outside of given limits.
2153 This is primarily a utility function.
2155 Parameters
2156 ----------
2157 a : array
2158 limits : (float or None, float or None)
2159 A tuple consisting of the (lower limit, upper limit). Values in the
2160 input array less than the lower limit or greater than the upper limit
2161 will be masked out. None implies no limit.
2162 inclusive : (bool, bool)
2163 A tuple consisting of the (lower flag, upper flag). These flags
2164 determine whether values exactly equal to lower or upper are allowed.
2166 Returns
2167 -------
2168 A MaskedArray.
2170 Raises
2171 ------
2172 A ValueError if there are no values within the given limits.
2173 """
2174 lower_limit, upper_limit = limits
2175 lower_include, upper_include = inclusive
2176 am = ma.MaskedArray(a)
2177 if lower_limit is not None:
2178 if lower_include:
2179 am = ma.masked_less(am, lower_limit)
2180 else:
2181 am = ma.masked_less_equal(am, lower_limit)
2183 if upper_limit is not None:
2184 if upper_include:
2185 am = ma.masked_greater(am, upper_limit)
2186 else:
2187 am = ma.masked_greater_equal(am, upper_limit)
2189 if am.count() == 0:
2190 raise ValueError("No array values within given limits")
2192 return am
2195def tmean(a, limits=None, inclusive=(True, True), axis=None):
2196 """
2197 Compute the trimmed mean.
2199 Parameters
2200 ----------
2201 a : array_like
2202 Array of values.
2203 limits : None or (lower limit, upper limit), optional
2204 Values in the input array less than the lower limit or greater than the
2205 upper limit will be ignored. When limits is None (default), then all
2206 values are used. Either of the limit values in the tuple can also be
2207 None representing a half-open interval.
2208 inclusive : (bool, bool), optional
2209 A tuple consisting of the (lower flag, upper flag). These flags
2210 determine whether values exactly equal to the lower or upper limits
2211 are included. The default value is (True, True).
2212 axis : int or None, optional
2213 Axis along which to operate. If None, compute over the
2214 whole array. Default is None.
2216 Returns
2217 -------
2218 tmean : float
2220 Notes
2221 -----
2222 For more details on `tmean`, see `scipy.stats.tmean`.
2224 Examples
2225 --------
2226 >>> import numpy as np
2227 >>> from scipy.stats import mstats
2228 >>> a = np.array([[6, 8, 3, 0],
2229 ... [3, 9, 1, 2],
2230 ... [8, 7, 8, 2],
2231 ... [5, 6, 0, 2],
2232 ... [4, 5, 5, 2]])
2233 ...
2235 >>> mstats.tmean(a, (2,5))
2236 3.3
2237 >>> mstats.tmean(a, (2,5), axis=0)
2238 masked_array(data=[4.0, 5.0, 4.0, 2.0],
2239 mask=[False, False, False, False],
2240 fill_value=1e+20)
2242 """
2243 return trima(a, limits=limits, inclusive=inclusive).mean(axis=axis)
2246def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
2247 """
2248 Compute the trimmed variance.
2250 This function computes the sample variance of an array of values,
2251 while ignoring values which are outside of given `limits`.
2253 Parameters
2254 ----------
2255 a : array_like
2256 Array of values.
2257 limits : None or (lower limit, upper limit), optional
2258 Values in the input array less than the lower limit or greater than the
2259 upper limit will be ignored. When limits is None, then all values are
2260 used. Either of the limit values in the tuple can also be None
2261 representing a half-open interval. The default value is None.
2262 inclusive : (bool, bool), optional
2263 A tuple consisting of the (lower flag, upper flag). These flags
2264 determine whether values exactly equal to the lower or upper limits
2265 are included. The default value is (True, True).
2266 axis : int or None, optional
2267 Axis along which to operate. If None, compute over the
2268 whole array. Default is zero.
2269 ddof : int, optional
2270 Delta degrees of freedom. Default is 1.
2272 Returns
2273 -------
2274 tvar : float
2275 Trimmed variance.
2277 Notes
2278 -----
2279 For more details on `tvar`, see `scipy.stats.tvar`.
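Examples
--------
A small illustrative sketch (default ``ddof=1``): limiting to [2, 9]
keeps eight values whose sample variance is 42/7 = 6.
>>> import numpy as np
>>> from scipy.stats.mstats import tvar
>>> x = np.ma.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
>>> print(tvar(x, limits=(2, 9)))
6.0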
2281 """
2282 a = ma.asarray(a).astype(float).ravel()
2283 if limits is None:
2284 return np.ma.var(a, ddof=ddof)
2286 am = _mask_to_limits(a, limits=limits, inclusive=inclusive)
2288 return np.ma.var(am, axis=axis, ddof=ddof)
2291def tmin(a, lowerlimit=None, axis=0, inclusive=True):
2292 """
2293 Compute the trimmed minimum.
2295 Parameters
2296 ----------
2297 a : array_like
2298 array of values
2299 lowerlimit : None or float, optional
2300 Values in the input array less than the given limit will be ignored.
2301 When lowerlimit is None, then all values are used. The default value
2302 is None.
2303 axis : int or None, optional
2304 Axis along which to operate. Default is 0. If None, compute over the
2305 whole array `a`.
2306 inclusive : {True, False}, optional
2307 This flag determines whether values exactly equal to the lower limit
2308 are included. The default value is True.
2310 Returns
2311 -------
2312 tmin : float, int or ndarray
2314 Notes
2315 -----
2316 For more details on `tmin`, see `scipy.stats.tmin`.
2318 Examples
2319 --------
2320 >>> import numpy as np
2321 >>> from scipy.stats import mstats
2322 >>> a = np.array([[6, 8, 3, 0],
2323 ... [3, 2, 1, 2],
2324 ... [8, 1, 8, 2],
2325 ... [5, 3, 0, 2],
2326 ... [4, 7, 5, 2]])
2327 ...
2328 >>> mstats.tmin(a, 5)
2329 masked_array(data=[5, 7, 5, --],
2330 mask=[False, False, False, True],
2331 fill_value=999999)
2333 """
2334 a, axis = _chk_asarray(a, axis)
2335 am = trima(a, (lowerlimit, None), (inclusive, False))
2336 return ma.minimum.reduce(am, axis)
2339def tmax(a, upperlimit=None, axis=0, inclusive=True):
2340 """
2341 Compute the trimmed maximum.
2343 This function computes the maximum value of an array along a given axis,
2344 while ignoring values larger than a specified upper limit.
2346 Parameters
2347 ----------
2348 a : array_like
2349 array of values
2350 upperlimit : None or float, optional
2351 Values in the input array greater than the given limit will be ignored.
2352 When upperlimit is None, then all values are used. The default value
2353 is None.
2354 axis : int or None, optional
2355 Axis along which to operate. Default is 0. If None, compute over the
2356 whole array `a`.
2357 inclusive : {True, False}, optional
2358 This flag determines whether values exactly equal to the upper limit
2359 are included. The default value is True.
2361 Returns
2362 -------
2363 tmax : float, int or ndarray
2365 Notes
2366 -----
2367 For more details on `tmax`, see `scipy.stats.tmax`.
2369 Examples
2370 --------
2371 >>> import numpy as np
2372 >>> from scipy.stats import mstats
2373 >>> a = np.array([[6, 8, 3, 0],
2374 ... [3, 9, 1, 2],
2375 ... [8, 7, 8, 2],
2376 ... [5, 6, 0, 2],
2377 ... [4, 5, 5, 2]])
2378 ...
2380 >>> mstats.tmax(a, 4)
2381 masked_array(data=[4, --, 3, 2],
2382 mask=[False, True, False, False],
2383 fill_value=999999)
2385 """
2386 a, axis = _chk_asarray(a, axis)
2387 am = trima(a, (None, upperlimit), (False, inclusive))
2388 return ma.maximum.reduce(am, axis)
2391def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
2392 """
2393 Compute the trimmed standard error of the mean.
2395 This function finds the standard error of the mean for given
2396 values, ignoring values outside the given `limits`.
2398 Parameters
2399 ----------
2400 a : array_like
2401 array of values
2402 limits : None or (lower limit, upper limit), optional
2403 Values in the input array less than the lower limit or greater than the
2404 upper limit will be ignored. When limits is None, then all values are
2405 used. Either of the limit values in the tuple can also be None
2406 representing a half-open interval. The default value is None.
2407 inclusive : (bool, bool), optional
2408 A tuple consisting of the (lower flag, upper flag). These flags
2409 determine whether values exactly equal to the lower or upper limits
2410 are included. The default value is (True, True).
2411 axis : int or None, optional
2412 Axis along which to operate. If None, compute over the
2413 whole array. Default is zero.
2414 ddof : int, optional
2415 Delta degrees of freedom. Default is 1.
2417 Returns
2418 -------
2419 tsem : float
2421 Notes
2422 -----
2423 For more details on `tsem`, see `scipy.stats.tsem`.
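Examples
--------
A small illustrative sketch: restricting to [2, 9] leaves eight values
with sample standard deviation sqrt(6), so the standard error is
sqrt(6)/sqrt(8).
>>> import numpy as np
>>> from scipy.stats.mstats import tsem
>>> x = np.ma.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
>>> print(round(tsem(x, limits=(2, 9)), 4))
0.866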
2425 """
2426 a = ma.asarray(a).ravel()
2427 if limits is None:
2428 n = float(a.count())
2429 return a.std(axis=axis, ddof=ddof)/ma.sqrt(n)
2431 am = trima(a.ravel(), limits, inclusive)
2432 sd = np.sqrt(am.var(axis=axis, ddof=ddof))
2433 return sd / np.sqrt(am.count())
2436def winsorize(a, limits=None, inclusive=(True, True), inplace=False,
2437 axis=None, nan_policy='propagate'):
2438 """Returns a Winsorized version of the input array.
2440 The (limits[0])th lowest values are set to the (limits[0])th percentile,
2441 and the (limits[1])th highest values are set to the (1 - limits[1])th
2442 percentile.
2443 Masked values are skipped.
2446 Parameters
2447 ----------
2448 a : sequence
2449 Input array.
2450 limits : {None, tuple of float}, optional
2451 Tuple of the percentages to cut on each side of the array, with respect
2452 to the number of unmasked data, as floats between 0. and 1.
2453 If n is the number of unmasked data, the
2454 (n*limits[0])th smallest data and the (n*limits[1])th largest data are
2455 replaced by the corresponding percentile values (they are not
2456 masked, unlike in trimming). The value of one limit can be set to None
2457 to indicate an open interval.
2458 inclusive : {(True, True) tuple}, optional
2459 Tuple indicating whether the number of data being masked on each side
2460 should be truncated (True) or rounded (False).
2461 inplace : {False, True}, optional
2462 Whether to winsorize in place (True) or to use a copy (False)
2463 axis : {None, int}, optional
2464 Axis along which to trim. If None, the whole array is trimmed, but its
2465 shape is maintained.
2466 nan_policy : {'propagate', 'raise', 'omit'}, optional
2467 Defines how to handle when input contains nan.
2468 The following options are available (default is 'propagate'):
2470 * 'propagate': allows nan values and may overwrite or propagate them
2471 * 'raise': throws an error
2472 * 'omit': performs the calculations ignoring nan values
2474 Notes
2475 -----
2476 This function is applied to reduce the effect of possibly spurious outliers
2477 by limiting the extreme values.
2479 Examples
2480 --------
2481 >>> import numpy as np
2482 >>> from scipy.stats.mstats import winsorize
2484 A shuffled array contains integers from 1 to 10.
2486 >>> a = np.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6])
2488 The lowest 10% of the values (i.e., `1`) and the highest 20% of the
2489 values (i.e., `9` and `10`) are replaced.
2491 >>> winsorize(a, limits=[0.1, 0.2])
2492 masked_array(data=[8, 4, 8, 8, 5, 3, 7, 2, 2, 6],
2493 mask=False,
2494 fill_value=999999)
2496 """
2497 def _winsorize1D(a, low_limit, up_limit, low_include, up_include,
2498 contains_nan, nan_policy):
2499 n = a.count()
2500 idx = a.argsort()
2501 if contains_nan:
2502 nan_count = np.count_nonzero(np.isnan(a))
2503 if low_limit:
2504 if low_include:
2505 lowidx = int(low_limit * n)
2506 else:
2507 lowidx = np.round(low_limit * n).astype(int)
2508 if contains_nan and nan_policy == 'omit':
2509 lowidx = min(lowidx, n-nan_count-1)
2510 a[idx[:lowidx]] = a[idx[lowidx]]
2511 if up_limit is not None:
2512 if up_include:
2513 upidx = n - int(n * up_limit)
2514 else:
2515 upidx = n - np.round(n * up_limit).astype(int)
2516 if contains_nan and nan_policy == 'omit':
2517 a[idx[upidx:-nan_count]] = a[idx[upidx - 1]]
2518 else:
2519 a[idx[upidx:]] = a[idx[upidx - 1]]
2520 return a
2522 contains_nan, nan_policy = _contains_nan(a, nan_policy)
2523 # We are going to modify a: better make a copy
2524 a = ma.array(a, copy=np.logical_not(inplace))
2526 if limits is None:
2527 return a
2528 if (not isinstance(limits, tuple)) and isinstance(limits, float):
2529 limits = (limits, limits)
2531 # Check the limits
2532 (lolim, uplim) = limits
2533 errmsg = "The proportion to cut from the %s should be between 0. and 1."
2534 if lolim is not None:
2535 if lolim > 1. or lolim < 0:
2536 raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
2537 if uplim is not None:
2538 if uplim > 1. or uplim < 0:
2539 raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
2541 (loinc, upinc) = inclusive
2543 if axis is None:
2544 shp = a.shape
2545 return _winsorize1D(a.ravel(), lolim, uplim, loinc, upinc,
2546 contains_nan, nan_policy).reshape(shp)
2547 else:
2548 return ma.apply_along_axis(_winsorize1D, axis, a, lolim, uplim, loinc,
2549 upinc, contains_nan, nan_policy)
2552def moment(a, moment=1, axis=0):
2553 """
2554 Calculates the nth moment about the mean for a sample.
2556 Parameters
2557 ----------
2558 a : array_like
2559 data
2560 moment : int, optional
2561 order of central moment that is returned
2562 axis : int or None, optional
2563 Axis along which the central moment is computed. Default is 0.
2564 If None, compute over the whole array `a`.
2566 Returns
2567 -------
2568 n-th central moment : ndarray or float
2569 The appropriate moment along the given axis or over all values if axis
2570 is None. The denominator for the moment calculation is the number of
2571 observations; no degrees of freedom correction is done.
2573 Notes
2574 -----
2575 For more details about `moment`, see `scipy.stats.moment`.
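Examples
--------
A small illustrative sketch: the second central moment of 1..5
(mean 3) is (4 + 1 + 0 + 1 + 4)/5 = 2.
>>> from scipy.stats.mstats import moment
>>> print(moment([1, 2, 3, 4, 5], moment=2))
2.0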
2577 """
2578 a, axis = _chk_asarray(a, axis)
2579 if a.size == 0:
2580 moment_shape = list(a.shape)
2581 del moment_shape[axis]
2582 dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
2583 # empty array, return nan(s) with shape matching `moment`
2584 out_shape = (moment_shape if np.isscalar(moment)
2585 else [len(moment)] + moment_shape)
2586 if len(out_shape) == 0:
2587 return dtype(np.nan)
2588 else:
2589 return ma.array(np.full(out_shape, np.nan, dtype=dtype))
2591 # for array_like moment input, return a value for each.
2592 if not np.isscalar(moment):
2593 mean = a.mean(axis, keepdims=True)
2594 mmnt = [_moment(a, i, axis, mean=mean) for i in moment]
2595 return ma.array(mmnt)
2596 else:
2597 return _moment(a, moment, axis)
2599# Moment with optional pre-computed mean, equal to a.mean(axis, keepdims=True)
2600def _moment(a, moment, axis, *, mean=None):
2601 if np.abs(moment - np.round(moment)) > 0:
2602 raise ValueError("All moment parameters must be integers")
2604 if moment == 0 or moment == 1:
2605 # By definition the zeroth moment about the mean is 1, and the first
2606 # moment is 0.
2607 shape = list(a.shape)
2608 del shape[axis]
2609 dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
2611 if len(shape) == 0:
2612 return dtype(1.0 if moment == 0 else 0.0)
2613 else:
2614 return (ma.ones(shape, dtype=dtype) if moment == 0
2615 else ma.zeros(shape, dtype=dtype))
2616 else:
2617 # Exponentiation by squares: form exponent sequence
2618 n_list = [moment]
2619 current_n = moment
2620 while current_n > 2:
2621 if current_n % 2:
2622 current_n = (current_n-1)/2
2623 else:
2624 current_n /= 2
2625 n_list.append(current_n)
2627 # Starting point for exponentiation by squares
2628 mean = a.mean(axis, keepdims=True) if mean is None else mean
2629 a_zero_mean = a - mean
2630 if n_list[-1] == 1:
2631 s = a_zero_mean.copy()
2632 else:
2633 s = a_zero_mean**2
2635 # Perform multiplications
2636 for n in n_list[-2::-1]:
2637 s = s**2
2638 if n % 2:
2639 s *= a_zero_mean
2640 return s.mean(axis)
2643def variation(a, axis=0, ddof=0):
2644 """
2645 Compute the coefficient of variation.
2647 The coefficient of variation is the standard deviation divided by the
2648 mean. This function is equivalent to::
2650 np.std(x, axis=axis, ddof=ddof) / np.mean(x)
2652 The default for ``ddof`` is 0, but many definitions of the coefficient
2653 of variation use the square root of the unbiased sample variance
2654 for the sample standard deviation, which corresponds to ``ddof=1``.
2656 Parameters
2657 ----------
2658 a : array_like
2659 Input array.
2660 axis : int or None, optional
2661 Axis along which to calculate the coefficient of variation. Default
2662 is 0. If None, compute over the whole array `a`.
2663 ddof : int, optional
2664 Delta degrees of freedom. Default is 0.
2666 Returns
2667 -------
2668 variation : ndarray
2669 The calculated variation along the requested axis.
2671 Notes
2672 -----
2673 For more details about `variation`, see `scipy.stats.variation`.
2675 Examples
2676 --------
2677 >>> import numpy as np
2678 >>> from scipy.stats.mstats import variation
2679 >>> a = np.array([2,8,4])
2680 >>> variation(a)
2681 0.5345224838248487
2682 >>> b = np.array([2,8,3,4])
2683 >>> c = np.ma.masked_array(b, mask=[0,0,1,0])
2684 >>> variation(c)
2685 0.5345224838248487
2687 The example above shows that this function behaves the same as
2688 `scipy.stats.variation`, except that `scipy.stats.mstats.variation`
2689 ignores masked array elements.
2691 """
2692 a, axis = _chk_asarray(a, axis)
2693 return a.std(axis, ddof=ddof)/a.mean(axis)
2696def skew(a, axis=0, bias=True):
2697 """
2698 Computes the skewness of a data set.
2700 Parameters
2701 ----------
2702 a : ndarray
2703 data
2704 axis : int or None, optional
2705 Axis along which skewness is calculated. Default is 0.
2706 If None, compute over the whole array `a`.
2707 bias : bool, optional
2708 If False, then the calculations are corrected for statistical bias.
2710 Returns
2711 -------
2712 skewness : ndarray
2713 The skewness of values along an axis, returning 0 where all values are
2714 equal.
2716 Notes
2717 -----
2718 For more details about `skew`, see `scipy.stats.skew`.
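Examples
--------
A small illustrative sketch: a symmetric sample has zero third central
moment, hence zero skewness.
>>> from scipy.stats.mstats import skew
>>> print(skew([1, 2, 3, 4, 5]))
0.0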
2720 """
2721 a, axis = _chk_asarray(a,axis)
2722 mean = a.mean(axis, keepdims=True)
2723 m2 = _moment(a, 2, axis, mean=mean)
2724 m3 = _moment(a, 3, axis, mean=mean)
2725 zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
2726 with np.errstate(all='ignore'):
2727 vals = ma.where(zero, 0, m3 / m2**1.5)
2729 if not bias and zero is not ma.masked and m2 is not ma.masked:
2730 n = a.count(axis)
2731 can_correct = ~zero & (n > 2)
2732 if can_correct.any():
2733 n = np.extract(can_correct, n)
2734 m2 = np.extract(can_correct, m2)
2735 m3 = np.extract(can_correct, m3)
2736 nval = ma.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5
2737 np.place(vals, can_correct, nval)
2738 return vals
2741def kurtosis(a, axis=0, fisher=True, bias=True):
2742 """
2743 Computes the kurtosis (Fisher or Pearson) of a dataset.
2745 Kurtosis is the fourth central moment divided by the square of the
2746 variance. If Fisher's definition is used, then 3.0 is subtracted from
2747 the result to give 0.0 for a normal distribution.
2749 If bias is False then the kurtosis is calculated using k statistics to
2750 eliminate bias coming from biased moment estimators.
2752 Use `kurtosistest` to see if result is close enough to normal.
2754 Parameters
2755 ----------
2756 a : array
2757 data for which the kurtosis is calculated
2758 axis : int or None, optional
2759 Axis along which the kurtosis is calculated. Default is 0.
2760 If None, compute over the whole array `a`.
2761 fisher : bool, optional
2762 If True, Fisher's definition is used (normal ==> 0.0). If False,
2763 Pearson's definition is used (normal ==> 3.0).
2764 bias : bool, optional
2765 If False, then the calculations are corrected for statistical bias.
2767 Returns
2768 -------
2769 kurtosis : array
2770 The kurtosis of values along an axis. If all values are equal,
2771 return -3 for Fisher's definition and 0 for Pearson's definition.
2773 Notes
2774 -----
2775 For more details about `kurtosis`, see `scipy.stats.kurtosis`.
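Examples
--------
A small illustrative sketch with the defaults (Fisher definition,
biased): for 1..5, m4/m2**2 = 6.8/4 = 1.7, so the excess kurtosis is
1.7 - 3 = -1.3.
>>> from scipy.stats.mstats import kurtosis
>>> print(kurtosis([1, 2, 3, 4, 5]))
-1.3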
2777 """
2778 a, axis = _chk_asarray(a, axis)
2779 mean = a.mean(axis, keepdims=True)
2780 m2 = _moment(a, 2, axis, mean=mean)
2781 m4 = _moment(a, 4, axis, mean=mean)
2782 zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
2783 with np.errstate(all='ignore'):
2784 vals = ma.where(zero, 0, m4 / m2**2.0)
2786 if not bias and zero is not ma.masked and m2 is not ma.masked:
2787 n = a.count(axis)
2788 can_correct = ~zero & (n > 3)
2789 if can_correct.any():
2790 n = np.extract(can_correct, n)
2791 m2 = np.extract(can_correct, m2)
2792 m4 = np.extract(can_correct, m4)
2793 nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0)
2794 np.place(vals, can_correct, nval+3.0)
2795 if fisher:
2796 return vals - 3
2797 else:
2798 return vals
2801DescribeResult = namedtuple('DescribeResult', ('nobs', 'minmax', 'mean',
2802 'variance', 'skewness',
2803 'kurtosis'))
2806def describe(a, axis=0, ddof=0, bias=True):
2807 """
2808 Computes several descriptive statistics of the passed array.
2810 Parameters
2811 ----------
2812 a : array_like
2813 Data array
2814 axis : int or None, optional
2815 Axis along which to calculate statistics. Default 0. If None,
2816 compute over the whole array `a`.
2817 ddof : int, optional
2818 degree of freedom (default 0); note that default ddof is different
2819 from the same routine in stats.describe
2820 bias : bool, optional
2821 If False, then the skewness and kurtosis calculations are corrected for
2822 statistical bias.
2824 Returns
2825 -------
2826 nobs : int
2827 size of the data (discarding missing values)
2829 minmax : (int, int)
2830 min, max
2832 mean : float
2833 arithmetic mean
2835 variance : float
2836 variance, computed with the given `ddof` (biased for the default ddof=0)
2838 skewness : float
2839 biased skewness
2841 kurtosis : float
2842 biased kurtosis
2844 Examples
2845 --------
2846 >>> import numpy as np
2847 >>> from scipy.stats.mstats import describe
2848 >>> ma = np.ma.array(range(6), mask=[0, 0, 0, 1, 1, 1])
2849 >>> describe(ma)
2850 DescribeResult(nobs=3, minmax=(masked_array(data=0,
2851 mask=False,
2852 fill_value=999999), masked_array(data=2,
2853 mask=False,
2854 fill_value=999999)), mean=1.0, variance=0.6666666666666666,
2855 skewness=masked_array(data=0., mask=False, fill_value=1e+20),
2856 kurtosis=-1.5)
2858 """
2859 a, axis = _chk_asarray(a, axis)
2860 n = a.count(axis)
2861 mm = (ma.minimum.reduce(a, axis=axis), ma.maximum.reduce(a, axis=axis))
2862 m = a.mean(axis)
2863 v = a.var(axis, ddof=ddof)
2864 sk = skew(a, axis, bias=bias)
2865 kurt = kurtosis(a, axis, bias=bias)
2867 return DescribeResult(n, mm, m, v, sk, kurt)
2870def stde_median(data, axis=None):
2871 """Returns the McKean-Schrader estimate of the standard error of the sample
2872 median along the given axis. masked values are discarded.
2874 Parameters
2875 ----------
2876 data : ndarray
2877 Data to trim.
2878 axis : {None,int}, optional
2879 Axis along which to perform the trimming.
2880 If None, the input array is first flattened.
2882 """
2883 def _stdemed_1D(data):
2884 data = np.sort(data.compressed())
2885 n = len(data)
2886 z = 2.5758293035489004  # 99.5% quantile of the standard normal distribution
2887 k = int(np.round((n+1)/2. - z * np.sqrt(n/4.),0))
2888 return ((data[n-k] - data[k-1])/(2.*z))
2890 data = ma.array(data, copy=False, subok=True)
2891 if (axis is None):
2892 return _stdemed_1D(data)
2893 else:
2894 if data.ndim > 2:
2895 raise ValueError("Array 'data' must be at most two dimensional, "
2896 "but got data.ndim = %d" % data.ndim)
2897 return ma.apply_along_axis(_stdemed_1D, axis, data)
2900SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue'))
2903def skewtest(a, axis=0, alternative='two-sided'):
2904 """
2905 Tests whether the skew is different from the normal distribution.
2907 Parameters
2908 ----------
2909 a : array_like
2910 The data to be tested
2911 axis : int or None, optional
2912 Axis along which statistics are calculated. Default is 0.
2913 If None, compute over the whole array `a`.
2914 alternative : {'two-sided', 'less', 'greater'}, optional
2915 Defines the alternative hypothesis. Default is 'two-sided'.
2916 The following options are available:
2918 * 'two-sided': the skewness of the distribution underlying the sample
2919 is different from that of the normal distribution (i.e. 0)
2920 * 'less': the skewness of the distribution underlying the sample
2921 is less than that of the normal distribution
2922 * 'greater': the skewness of the distribution underlying the sample
2923 is greater than that of the normal distribution
2925 .. versionadded:: 1.7.0
2927 Returns
2928 -------
2929 statistic : array_like
2930 The computed z-score for this test.
2931 pvalue : array_like
2932 A p-value for the hypothesis test
2934 Notes
2935 -----
2936 For more details about `skewtest`, see `scipy.stats.skewtest`.
2938 """
2939 a, axis = _chk_asarray(a, axis)
2940 if axis is None:
2941 a = a.ravel()
2942 axis = 0
2943 b2 = skew(a,axis)
2944 n = a.count(axis)
2945 if np.min(n) < 8:
2946 raise ValueError(
2947 "skewtest is not valid with less than 8 samples; %i samples"
2948 " were given." % np.min(n))
2950 y = b2 * ma.sqrt(((n+1)*(n+3)) / (6.0*(n-2)))
2951 beta2 = (3.0*(n*n+27*n-70)*(n+1)*(n+3)) / ((n-2.0)*(n+5)*(n+7)*(n+9))
2952 W2 = -1 + ma.sqrt(2*(beta2-1))
2953 delta = 1/ma.sqrt(0.5*ma.log(W2))
2954 alpha = ma.sqrt(2.0/(W2-1))
2955 y = ma.where(y == 0, 1, y)
2956 Z = delta*ma.log(y/alpha + ma.sqrt((y/alpha)**2+1))
2958 return SkewtestResult(*scipy.stats._stats_py._normtest_finish(Z, alternative))
2961KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue'))
2964def kurtosistest(a, axis=0, alternative='two-sided'):
2965 """
2966 Tests whether a dataset has normal kurtosis
2968 Parameters
2969 ----------
2970 a : array_like
2971 array of the sample data
2972 axis : int or None, optional
2973 Axis along which to compute test. Default is 0. If None,
2974 compute over the whole array `a`.
2975 alternative : {'two-sided', 'less', 'greater'}, optional
2976 Defines the alternative hypothesis.
2977 The following options are available (default is 'two-sided'):
2979 * 'two-sided': the kurtosis of the distribution underlying the sample
2980 is different from that of the normal distribution
2981 * 'less': the kurtosis of the distribution underlying the sample
2982 is less than that of the normal distribution
2983 * 'greater': the kurtosis of the distribution underlying the sample
2984 is greater than that of the normal distribution
2986 .. versionadded:: 1.7.0
2988 Returns
2989 -------
2990 statistic : array_like
2991 The computed z-score for this test.
2992 pvalue : array_like
2993 The p-value for the hypothesis test
2995 Notes
2996 -----
2997 For more details about `kurtosistest`, see `scipy.stats.kurtosistest`.
2999 """
3000 a, axis = _chk_asarray(a, axis)
3001 n = a.count(axis=axis)
3002 if np.min(n) < 5:
3003 raise ValueError(
3004 "kurtosistest requires at least 5 observations; %i observations"
3005 " were given." % np.min(n))
3006 if np.min(n) < 20:
3007 warnings.warn(
3008 "kurtosistest only valid for n>=20 ... continuing anyway, n=%i" %
3009 np.min(n))
3011 b2 = kurtosis(a, axis, fisher=False)
3012 E = 3.0*(n-1) / (n+1)
3013 varb2 = 24.0*n*(n-2.)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5))
3014 x = (b2-E)/ma.sqrt(varb2)
3015 sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) /
3016 (n*(n-2)*(n-3)))
3017 A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2)))
3018 term1 = 1 - 2./(9.0*A)
3019 denom = 1 + x*ma.sqrt(2/(A-4.0))
3020 if np.ma.isMaskedArray(denom):
3021 # For multi-dimensional array input
3022 denom[denom == 0.0] = masked
3023 elif denom == 0.0:
3024 denom = masked
3026 term2 = np.ma.where(denom > 0, ma.power((1-2.0/A)/denom, 1/3.0),
3027 -ma.power(-(1-2.0/A)/denom, 1/3.0))
3028 Z = (term1 - term2) / np.sqrt(2/(9.0*A))
3030 return KurtosistestResult(
3031 *scipy.stats._stats_py._normtest_finish(Z, alternative)
3032 )
3035NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue'))
3038def normaltest(a, axis=0):
3039 """
3040 Tests whether a sample differs from a normal distribution.
3042 Parameters
3043 ----------
3044 a : array_like
3045 The array containing the data to be tested.
3046 axis : int or None, optional
3047 Axis along which to compute test. Default is 0. If None,
3048 compute over the whole array `a`.
3050 Returns
3051 -------
3052 statistic : float or array
3053 ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and
3054 ``k`` is the z-score returned by `kurtosistest`.
3055 pvalue : float or array
3056 A 2-sided chi squared probability for the hypothesis test.
3058 Notes
3059 -----
3060 For more details about `normaltest`, see `scipy.stats.normaltest`.
3062 """
3063 a, axis = _chk_asarray(a, axis)
3064 s, _ = skewtest(a, axis)
3065 k, _ = kurtosistest(a, axis)
3066 k2 = s*s + k*k
3068 return NormaltestResult(k2, distributions.chi2.sf(k2, 2))
3071def mquantiles(a, prob=list([.25,.5,.75]), alphap=.4, betap=.4, axis=None,
3072 limit=()):
3073 """
3074 Computes empirical quantiles for a data array.
3076 Sample quantiles are defined by ``Q(p) = (1-gamma)*x[j] + gamma*x[j+1]``,
3077 where ``x[j]`` is the j-th order statistic, and gamma is a function of
3078 ``j = floor(n*p + m)``, ``m = alphap + p*(1 - alphap - betap)`` and
3079 ``g = n*p + m - j``.
3081 Reinterpreting the above equations to compare to **R** leads to the
3082 equation: ``p(k) = (k - alphap)/(n + 1 - alphap - betap)``
3084 Typical values of (alphap,betap) are:
3085 - (0,1) : ``p(k) = k/n`` : linear interpolation of cdf
3086 (**R** type 4)
3087 - (.5,.5) : ``p(k) = (k - 1/2.)/n`` : piecewise linear function
3088 (**R** type 5)
3089 - (0,0) : ``p(k) = k/(n+1)`` :
3090 (**R** type 6)
3091 - (1,1) : ``p(k) = (k-1)/(n-1)``: p(k) = mode[F(x[k])].
3092 (**R** type 7, **R** default)
3093 - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``: Then p(k) ~ median[F(x[k])].
3094 The resulting quantile estimates are approximately median-unbiased
3095 regardless of the distribution of x.
3096 (**R** type 8)
3097 - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``: Blom.
3098 The resulting quantile estimates are approximately unbiased
3099 if x is normally distributed
3100 (**R** type 9)
3101 - (.4,.4) : approximately quantile unbiased (Cunnane)
3102 - (.35,.35): APL, used with PWM
3104 Parameters
3105 ----------
3106 a : array_like
3107 Input data, as a sequence or array of dimension at most 2.
3108 prob : array_like, optional
3109 List of quantiles to compute.
3110 alphap : float, optional
3111 Plotting positions parameter, default is 0.4.
3112 betap : float, optional
3113 Plotting positions parameter, default is 0.4.
3114 axis : int, optional
3115 Axis along which to perform the trimming.
3116 If None (default), the input array is first flattened.
3117 limit : tuple, optional
3118 Tuple of (lower, upper) values.
3119 Values of `a` outside this open interval are ignored.
3121 Returns
3122 -------
3123 mquantiles : MaskedArray
3124 An array containing the calculated quantiles.
3126 Notes
3127 -----
3128 This formulation is very similar to **R** except the calculation of
3129 ``m`` from ``alphap`` and ``betap``, where in **R** ``m`` is defined
3130 with each type.
3132 References
3133 ----------
3134 .. [1] *R* statistical software: https://www.r-project.org/
3135 .. [2] *R* ``quantile`` function:
3136 http://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html
3138 Examples
3139 --------
3140 >>> import numpy as np
3141 >>> from scipy.stats.mstats import mquantiles
3142 >>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.])
3143 >>> mquantiles(a)
3144 array([ 19.2, 40. , 42.8])
3146 Using a 2D array, specifying axis and limit.
3148 >>> data = np.array([[ 6., 7., 1.],
3149 ... [ 47., 15., 2.],
3150 ... [ 49., 36., 3.],
3151 ... [ 15., 39., 4.],
3152 ... [ 42., 40., -999.],
3153 ... [ 41., 41., -999.],
3154 ... [ 7., -999., -999.],
3155 ... [ 39., -999., -999.],
3156 ... [ 43., -999., -999.],
3157 ... [ 40., -999., -999.],
3158 ... [ 36., -999., -999.]])
3159 >>> print(mquantiles(data, axis=0, limit=(0, 50)))
3160 [[19.2 14.6 1.45]
3161 [40. 37.5 2.5 ]
3162 [42.8 40.05 3.55]]
3164 >>> data[:, 2] = -999.
3165 >>> print(mquantiles(data, axis=0, limit=(0, 50)))
3166 [[19.200000000000003 14.6 --]
3167 [40.0 37.5 --]
3168 [42.800000000000004 40.05 --]]
3170 """
3171 def _quantiles1D(data,m,p):
3172 x = np.sort(data.compressed())
3173 n = len(x)
3174 if n == 0:
3175 return ma.array(np.empty(len(p), dtype=float), mask=True)
3176 elif n == 1:
3177 return ma.array(np.resize(x, p.shape), mask=nomask)
3178 aleph = (n*p + m)
3179 k = np.floor(aleph.clip(1, n-1)).astype(int)
3180 gamma = (aleph-k).clip(0,1)
3181 return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()]
3183 data = ma.array(a, copy=False)
3184 if data.ndim > 2:
3185 raise TypeError("Array should be 2D at most !")
3187 if limit:
3188 condition = (limit[0] < data) & (data < limit[1])
3189 data[~condition.filled(True)] = masked
3191 p = np.array(prob, copy=False, ndmin=1)
3192 m = alphap + p*(1.-alphap-betap)
3193 # Computes quantiles along axis (or globally)
3194 if (axis is None):
3195 return _quantiles1D(data, m, p)
3197 return ma.apply_along_axis(_quantiles1D, axis, data, m, p)
3200def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4):
3201 """Calculate the score at the given 'per' percentile of the
3202 sequence a. For example, the score at per=50 is the median.
3204 This function is a shortcut to mquantile
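Examples
--------
A small illustrative sketch: the score at the 50th percentile is the
median (here interpolated between the two middle values).
>>> from scipy.stats.mstats import scoreatpercentile
>>> print(scoreatpercentile([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 50))
5.5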
3206 """
3207 if (per < 0) or (per > 100.):
3208 raise ValueError("The percentile should be between 0. and 100. !"
3209 " (got %s)" % per)
3211 return mquantiles(data, prob=[per/100.], alphap=alphap, betap=betap,
3212 limit=limit, axis=0).squeeze()
3215def plotting_positions(data, alpha=0.4, beta=0.4):
3216 """
3217 Returns plotting positions (or empirical percentile points) for the data.
3219 Plotting positions are defined as ``(i-alpha)/(n+1-alpha-beta)``, where:
3220 - i is the rank of the order statistic
3221 - n is the number of unmasked values along the given axis
3222 - `alpha` and `beta` are two parameters.
3224 Typical values for `alpha` and `beta` are:
3225 - (0,1) : ``p(k) = k/n``, linear interpolation of cdf (R, type 4)
3226 - (.5,.5) : ``p(k) = (k-1/2.)/n``, piecewise linear function
3227 (R, type 5)
3228 - (0,0) : ``p(k) = k/(n+1)``, Weibull (R type 6)
3229 - (1,1) : ``p(k) = (k-1)/(n-1)``, in this case,
3230 ``p(k) = mode[F(x[k])]``. That's R default (R type 7)
3231 - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``, then
3232 ``p(k) ~ median[F(x[k])]``.
3233 The resulting quantile estimates are approximately median-unbiased
3234 regardless of the distribution of x. (R type 8)
3235 - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``, Blom.
3236 The resulting quantile estimates are approximately unbiased
3237 if x is normally distributed (R type 9)
3238 - (.4,.4) : approximately quantile unbiased (Cunnane)
3239 - (.35,.35): APL, used with PWM
3240 - (.3175, .3175): used in scipy.stats.probplot
3242 Parameters
3243 ----------
3244 data : array_like
3245 Input data, as a sequence or array of dimension at most 2.
3246 alpha : float, optional
3247 Plotting positions parameter. Default is 0.4.
3248 beta : float, optional
3249 Plotting positions parameter. Default is 0.4.
3251 Returns
3252 -------
3253 positions : MaskedArray
3254 The calculated plotting positions.
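Examples
--------
A small illustrative sketch with the default ``alpha = beta = 0.4``:
for n = 3 the positions are (i - 0.4)/3.2 for ranks i = 1, 2, 3,
returned in the order of the input values.
>>> import numpy as np
>>> from scipy.stats.mstats import plotting_positions
>>> print(plotting_positions(np.ma.array([3, 1, 2])).tolist())
[0.8125, 0.1875, 0.5]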
3256 """
3257 data = ma.array(data, copy=False).reshape(1,-1)
3258 n = data.count()
3259 plpos = np.empty(data.size, dtype=float)
3260 plpos[n:] = 0
3261 plpos[data.argsort(axis=None)[:n]] = ((np.arange(1, n+1) - alpha) /
3262 (n + 1.0 - alpha - beta))
3263 return ma.array(plpos, mask=data._mask)
3266meppf = plotting_positions
3269def obrientransform(*args):
3270 """
3271 Computes a transform on input data (any number of columns). Used to
3272 test for homogeneity of variance prior to running one-way stats. Each
3273 array in ``*args`` is one level of a factor. If `f_oneway()` run on
3274 the transformed data is found significant, the variances are unequal. From
3275 Maxwell and Delaney, p.112.
3277 Returns: transformed data for use in an ANOVA
3278 """
3279 data = argstoarray(*args).T
3280 v = data.var(axis=0,ddof=1)
3281 m = data.mean(0)
3282 n = data.count(0).astype(float)
3283 # result = ((n-1.5)*n*(a-m)**2 - 0.5*v*(n-1))/((n-1)*(n-2))
3284 data -= m
3285 data **= 2
3286 data *= (n-1.5)*n
3287 data -= 0.5*v*(n-1)
3288 data /= (n-1.)*(n-2.)
3289 if not ma.allclose(v,data.mean(0)):
3290 raise ValueError("Lack of convergence in obrientransform.")
3292 return data
3295def sem(a, axis=0, ddof=1):
3296 """
3297 Calculates the standard error of the mean of the input array.
3299 Also sometimes called standard error of measurement.
3301 Parameters
3302 ----------
3303 a : array_like
3304 An array containing the values for which the standard error is
3305 returned.
3306 axis : int or None, optional
3307 If axis is None, ravel `a` first. If axis is an integer, this will be
3308 the axis over which to operate. Defaults to 0.
3309 ddof : int, optional
3310 Delta degrees-of-freedom. How many degrees of freedom to adjust
3311 for bias in limited samples relative to the population estimate
3312 of variance. Defaults to 1.
3314 Returns
3315 -------
3316 s : ndarray or float
3317 The standard error of the mean in the sample(s), along the input axis.
3319 Notes
3320 -----
3321 The default value for `ddof` changed in scipy 0.15.0 to be consistent with
3322 `scipy.stats.sem` as well as with the most common definition used (like in
3323 the R documentation).
3325 Examples
3326 --------
3327 Find standard error along the first axis:
3329 >>> import numpy as np
3330 >>> from scipy import stats
3331 >>> a = np.arange(20).reshape(5,4)
3332 >>> print(stats.mstats.sem(a))
3333 [2.8284271247461903 2.8284271247461903 2.8284271247461903
3334 2.8284271247461903]
3336 Find standard error across the whole array, using n degrees of freedom:
3338 >>> print(stats.mstats.sem(a, axis=None, ddof=0))
3339 1.2893796958227628
3341 """
3342 a, axis = _chk_asarray(a, axis)
3343 n = a.count(axis=axis)
3344 s = a.std(axis=axis, ddof=ddof) / ma.sqrt(n)
3345 return s
3348F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue'))
3351def f_oneway(*args):
3352 """
3353 Performs a 1-way ANOVA, returning an F-value and probability given
3354 any number of groups. From Heiman, pp.394-7.
3356 Usage: ``f_oneway(*args)``, where ``*args`` is 2 or more arrays,
3357 one per treatment group.
3359 Returns
3360 -------
3361 statistic : float
3362 The computed F-value of the test.
3363 pvalue : float
3364 The associated p-value from the F-distribution.
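Examples
--------
A small illustrative sketch with two groups: following the sums-of-squares
computation above, ssbg = 1.5 and sswg = 4.0 over 1 and 4 degrees of
freedom give F = 1.5.
>>> from scipy.stats.mstats import f_oneway
>>> res = f_oneway([1, 2, 3], [2, 3, 4])
>>> print(round(res.statistic, 2))
1.5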
3366 """
3367 # Construct a single array of arguments: each row is a group
3368 data = argstoarray(*args)
3369 ngroups = len(data)
3370 ntot = data.count()
3371 sstot = (data**2).sum() - (data.sum())**2/float(ntot)
3372 ssbg = (data.count(-1) * (data.mean(-1)-data.mean())**2).sum()
3373 sswg = sstot-ssbg
3374 dfbg = ngroups-1
3375 dfwg = ntot - ngroups
3376 msb = ssbg/float(dfbg)
3377 msw = sswg/float(dfwg)
3378 f = msb/msw
3379 prob = special.fdtrc(dfbg, dfwg, f) # equivalent to stats.f.sf
3381 return F_onewayResult(f, prob)
3384FriedmanchisquareResult = namedtuple('FriedmanchisquareResult',
3385 ('statistic', 'pvalue'))
3388def friedmanchisquare(*args):
3389 """Friedman Chi-Square is a non-parametric, one-way within-subjects ANOVA.
3390 This function calculates the Friedman Chi-square test for repeated measures
3391 and returns the result, along with the associated probability value.
3393 Each input is considered a given group. Ideally, the number of treatments
3394 among each group should be equal. If this is not the case, only the first
3395 n treatments are taken into account, where n is the number of treatments
3396 of the smallest group.
3397 If a group has some missing values, the corresponding treatments are masked
3398 in the other groups.
3399 The test statistic is corrected for ties.
3401 Masked values in one group are propagated to the other groups.
3403 Returns
3404 -------
3405 statistic : float
3406 the test statistic.
3407 pvalue : float
3408 the associated p-value.
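Examples
--------
A small illustrative sketch: with three groups whose rank sums across
treatments are all equal, the statistic is 0 and the p-value is 1.
>>> from scipy.stats.mstats import friedmanchisquare
>>> res = friedmanchisquare([1, 2, 3], [2, 3, 1], [3, 1, 2])
>>> print(res.statistic, res.pvalue)
0.0 1.0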
3410 """
3411 data = argstoarray(*args).astype(float)
3412 k = len(data)
3413 if k < 3:
3414 raise ValueError("Less than 3 groups (%i): " % k +
3415 "the Friedman test is NOT appropriate.")
3417 ranked = ma.masked_values(rankdata(data, axis=0), 0)
3418 if ranked._mask is not nomask:
3419 ranked = ma.mask_cols(ranked)
3420 ranked = ranked.compressed().reshape(k,-1).view(ndarray)
3421 else:
3422 ranked = ranked._data
3423 (k,n) = ranked.shape
3424 # Ties correction
3425 repeats = [find_repeats(row) for row in ranked.T]
3426 ties = np.array([y for x, y in repeats if x.size > 0])
3427 tie_correction = 1 - (ties**3-ties).sum()/float(n*(k**3-k))
3429 ssbg = np.sum((ranked.sum(-1) - n*(k+1)/2.)**2)
3430 chisq = ssbg * 12./(n*k*(k+1)) * 1./tie_correction
3432 return FriedmanchisquareResult(chisq,
3433 distributions.chi2.sf(chisq, k-1))
3436BrunnerMunzelResult = namedtuple('BrunnerMunzelResult', ('statistic', 'pvalue'))
3439def brunnermunzel(x, y, alternative="two-sided", distribution="t"):
3440 """
3441 Computes the Brunner-Munzel test on samples x and y
3443 Missing values in `x` and/or `y` are discarded.
3445 Parameters
3446 ----------
3447 x, y : array_like
3448 Array of samples, should be one-dimensional.
3449 alternative : 'less', 'two-sided', or 'greater', optional
3450 Whether to get the p-value for the one-sided hypothesis ('less'
3451 or 'greater') or for the two-sided hypothesis ('two-sided').
3452 Default value is 'two-sided'.
3453 distribution : 't' or 'normal', optional
3454 Whether to get the p-value by t-distribution or by standard normal
3455 distribution.
3456 Default value is 't'.
3458 Returns
3459 -------
3460 statistic : float
3461 The Brunner-Munzel W statistic.
3462 pvalue : float
3463 p-value assuming a t distribution. One-sided or
3464 two-sided, depending on the choice of `alternative` and `distribution`.
3466 See Also
3467 --------
3468 mannwhitneyu : Mann-Whitney rank test on two samples.
3470 Notes
3471 -----
3472 For more details on `brunnermunzel`, see `scipy.stats.brunnermunzel`.
3474 """
3475 x = ma.asarray(x).compressed().view(ndarray)
3476 y = ma.asarray(y).compressed().view(ndarray)
3477 nx = len(x)
3478 ny = len(y)
3479 if nx == 0 or ny == 0:
3480 return BrunnerMunzelResult(np.nan, np.nan)
3481 rankc = rankdata(np.concatenate((x,y)))
3482 rankcx = rankc[0:nx]
3483 rankcy = rankc[nx:nx+ny]
3484 rankcx_mean = np.mean(rankcx)
3485 rankcy_mean = np.mean(rankcy)
3486 rankx = rankdata(x)
3487 ranky = rankdata(y)
3488 rankx_mean = np.mean(rankx)
3489 ranky_mean = np.mean(ranky)
3491 Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0))
3492 Sx /= nx - 1
3493 Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0))
3494 Sy /= ny - 1
3496 wbfn = nx * ny * (rankcy_mean - rankcx_mean)
3497 wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy)
3499 if distribution == "t":
3500 df_numer = np.power(nx * Sx + ny * Sy, 2.0)
3501 df_denom = np.power(nx * Sx, 2.0) / (nx - 1)
3502 df_denom += np.power(ny * Sy, 2.0) / (ny - 1)
3503 df = df_numer / df_denom
3504 p = distributions.t.cdf(wbfn, df)
3505 elif distribution == "normal":
3506 p = distributions.norm.cdf(wbfn)
3507 else:
3508 raise ValueError(
3509 "distribution should be 't' or 'normal'")
3511 if alternative == "greater":
3512 pass
3513 elif alternative == "less":
3514 p = 1 - p
3515 elif alternative == "two-sided":
3516 p = 2 * np.min([p, 1-p])
3517 else:
3518 raise ValueError(
3519 "alternative should be 'less', 'greater' or 'two-sided'")
3521 return BrunnerMunzelResult(wbfn, p)