Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_morestats.py: 10%
985 statements
coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
1from __future__ import annotations
2import math
3import warnings
4from collections import namedtuple
6import numpy as np
7from numpy import (isscalar, r_, log, around, unique, asarray, zeros,
8 arange, sort, amin, amax, atleast_1d, sqrt, array,
9 compress, pi, exp, ravel, count_nonzero, sin, cos,
10 arctan2, hypot)
12from scipy import optimize
13from scipy import special
14from scipy._lib._bunch import _make_tuple_bunch
15from scipy._lib._util import _rename_parameter, _contains_nan
17from . import _statlib
18from . import _stats_py
19from ._fit import FitResult
20from ._stats_py import find_repeats, _normtest_finish, SignificanceResult
21from .contingency import chi2_contingency
22from . import distributions
23from ._distn_infrastructure import rv_generic
24from ._hypotests import _get_wilcoxon_distr
25from ._axis_nan_policy import _axis_nan_policy_factory
26from .._lib.deprecation import _deprecated
29__all__ = ['mvsdist',
30 'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot',
31 'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot',
32 'shapiro', 'anderson', 'ansari', 'bartlett', 'levene', 'binom_test',
33 'fligner', 'mood', 'wilcoxon', 'median_test',
34 'circmean', 'circvar', 'circstd', 'anderson_ksamp',
35 'yeojohnson_llf', 'yeojohnson', 'yeojohnson_normmax',
36 'yeojohnson_normplot', 'directional_stats'
37 ]
40Mean = namedtuple('Mean', ('statistic', 'minmax'))
41Variance = namedtuple('Variance', ('statistic', 'minmax'))
42Std_dev = namedtuple('Std_dev', ('statistic', 'minmax'))
45def bayes_mvs(data, alpha=0.90):
46 r"""
47 Bayesian confidence intervals for the mean, var, and std.
49 Parameters
50 ----------
51 data : array_like
52 Input data. If multi-dimensional, it is flattened to 1-D by `bayes_mvs`.
53 Requires 2 or more data points.
54 alpha : float, optional
55 Probability that the returned confidence interval contains
56 the true parameter.
58 Returns
59 -------
60 mean_cntr, var_cntr, std_cntr : tuple
61 The three results are for the mean, variance and standard deviation,
62 respectively. Each result is a tuple of the form::
64 (center, (lower, upper))
66 with `center` the mean of the conditional pdf of the value given the
67 data, and `(lower, upper)` a confidence interval, centered on the
68 median, containing the estimate to a probability ``alpha``.
70 See Also
71 --------
72 mvsdist
74 Notes
75 -----
76 Each tuple of mean, variance, and standard deviation estimates represents
77 the (center, (lower, upper)) with center the mean of the conditional pdf
78 of the value given the data and (lower, upper) a confidence interval
79 centered on the median, containing the estimate to a probability
80 ``alpha``.
82 Converts data to 1-D and assumes all data has the same mean and variance.
83 Uses Jeffrey's prior for variance and std.
85 Equivalent to ``tuple((x.mean(), x.interval(alpha)) for x in mvsdist(data))``
87 References
88 ----------
89 T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
90 standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
91 2006.
93 Examples
94 --------
95 First a basic example to demonstrate the outputs:
97 >>> from scipy import stats
98 >>> data = [6, 9, 12, 7, 8, 8, 13]
99 >>> mean, var, std = stats.bayes_mvs(data)
100 >>> mean
101 Mean(statistic=9.0, minmax=(7.103650222612533, 10.896349777387467))
102 >>> var
103 Variance(statistic=10.0, minmax=(3.176724206..., 24.45910382...))
104 >>> std
105 Std_dev(statistic=2.9724954732045084, minmax=(1.7823367265645143, 4.945614605014631))
107 Now we generate some normally distributed random data, and get estimates of
108 mean and standard deviation with 95% confidence intervals for those
109 estimates:
111 >>> n_samples = 100000
112 >>> data = stats.norm.rvs(size=n_samples)
113 >>> res_mean, res_var, res_std = stats.bayes_mvs(data, alpha=0.95)
115 >>> import matplotlib.pyplot as plt
116 >>> fig = plt.figure()
117 >>> ax = fig.add_subplot(111)
118 >>> ax.hist(data, bins=100, density=True, label='Histogram of data')
119 >>> ax.vlines(res_mean.statistic, 0, 0.5, colors='r', label='Estimated mean')
120 >>> ax.axvspan(res_mean.minmax[0],res_mean.minmax[1], facecolor='r',
121 ... alpha=0.2, label=r'Estimated mean (95% limits)')
122 >>> ax.vlines(res_std.statistic, 0, 0.5, colors='g', label='Estimated scale')
123 >>> ax.axvspan(res_std.minmax[0],res_std.minmax[1], facecolor='g', alpha=0.2,
124 ... label=r'Estimated scale (95% limits)')
126 >>> ax.legend(fontsize=10)
127 >>> ax.set_xlim([-4, 4])
128 >>> ax.set_ylim([0, 0.5])
129 >>> plt.show()
131 """
132 m, v, s = mvsdist(data)
133 if alpha >= 1 or alpha <= 0:
134 raise ValueError("0 < alpha < 1 is required, but alpha=%s was given."
135 % alpha)
137 m_res = Mean(m.mean(), m.interval(alpha))
138 v_res = Variance(v.mean(), v.interval(alpha))
139 s_res = Std_dev(s.mean(), s.interval(alpha))
141 return m_res, v_res, s_res
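# Illustrative sketch (hypothetical helper, not part of the original module):
# the Notes above state that `bayes_mvs` is equivalent to taking the mean and
# `interval(alpha)` of each distribution returned by `mvsdist`. A minimal
# check of that claim using only the public scipy.stats API:
def _sketch_bayes_mvs_equivalence(alpha=0.9):
    from scipy import stats
    data = [6, 9, 12, 7, 8, 8, 13]
    via_bayes = stats.bayes_mvs(data, alpha=alpha)
    via_mvsdist = tuple((d.mean(), d.interval(alpha)) for d in stats.mvsdist(data))
    for res, (center, interval) in zip(via_bayes, via_mvsdist):
        assert np.isclose(res.statistic, center)
        assert np.allclose(res.minmax, interval)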
144def mvsdist(data):
145 """
146 'Frozen' distributions for mean, variance, and standard deviation of data.
148 Parameters
149 ----------
150 data : array_like
151 Input array. Converted to 1-D using ravel.
152 Requires 2 or more data-points.
154 Returns
155 -------
156 mdist : "frozen" distribution object
157 Distribution object representing the mean of the data.
158 vdist : "frozen" distribution object
159 Distribution object representing the variance of the data.
160 sdist : "frozen" distribution object
161 Distribution object representing the standard deviation of the data.
163 See Also
164 --------
165 bayes_mvs
167 Notes
168 -----
169 The return values from ``bayes_mvs(data)`` are equivalent to
170 ``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``.
172 In other words, calling ``<dist>.mean()`` and ``<dist>.interval(0.90)``
173 on the three distribution objects returned from this function will give
174 the same results that are returned from `bayes_mvs`.
176 References
177 ----------
178 T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
179 standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
180 2006.
182 Examples
183 --------
184 >>> from scipy import stats
185 >>> data = [6, 9, 12, 7, 8, 8, 13]
186 >>> mean, var, std = stats.mvsdist(data)
188 We now have frozen distribution objects "mean", "var" and "std" that we can
189 examine:
191 >>> mean.mean()
192 9.0
193 >>> mean.interval(0.95)
194 (6.6120585482655692, 11.387941451734431)
195 >>> mean.std()
196 1.1952286093343936
198 """
199 x = ravel(data)
200 n = len(x)
201 if n < 2:
202 raise ValueError("Need at least 2 data-points.")
203 xbar = x.mean()
204 C = x.var()
205 if n > 1000: # gaussian approximations for large n
206 mdist = distributions.norm(loc=xbar, scale=math.sqrt(C / n))
207 sdist = distributions.norm(loc=math.sqrt(C), scale=math.sqrt(C / (2. * n)))
208 vdist = distributions.norm(loc=C, scale=math.sqrt(2.0 / n) * C)
209 else:
210 nm1 = n - 1
211 fac = n * C / 2.
212 val = nm1 / 2.
213 mdist = distributions.t(nm1, loc=xbar, scale=math.sqrt(C / nm1))
214 sdist = distributions.gengamma(val, -2, scale=math.sqrt(fac))
215 vdist = distributions.invgamma(val, scale=fac)
216 return mdist, vdist, sdist
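# Illustrative sketch (hypothetical helper, not part of the original module):
# for fewer than 1000 points the mean distribution built above is a Student t
# with df = n - 1, centered on the sample mean with scale sqrt(C / (n - 1)),
# where C is the biased sample variance. A quick check via the public API:
def _sketch_mvsdist_mean_is_student_t():
    from scipy import stats
    data = np.array([6, 9, 12, 7, 8, 8, 13], dtype=float)
    n, xbar, C = data.size, data.mean(), data.var()
    mdist, _, _ = stats.mvsdist(data)
    expected = stats.t(n - 1, loc=xbar, scale=np.sqrt(C / (n - 1)))
    assert np.isclose(mdist.mean(), expected.mean())
    assert np.allclose(mdist.interval(0.95), expected.interval(0.95))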
219@_axis_nan_policy_factory(
220 lambda x: x, result_to_tuple=lambda x: (x,), n_outputs=1, default_axis=None
221)
222def kstat(data, n=2):
223 r"""
224 Return the nth k-statistic (1<=n<=4 so far).
226 The nth k-statistic k_n is the unique symmetric unbiased estimator of the
227 nth cumulant kappa_n.
229 Parameters
230 ----------
231 data : array_like
232 Input array. Note that n-D input gets flattened.
233 n : int, {1, 2, 3, 4}, optional
234 Default is equal to 2.
236 Returns
237 -------
238 kstat : float
239 The nth k-statistic.
241 See Also
242 --------
243 kstatvar: Returns an unbiased estimator of the variance of the k-statistic.
244 moment: Returns the n-th central moment about the mean for a sample.
246 Notes
247 -----
248 For a sample size n, the first few k-statistics are given by:
250 .. math::
252 k_{1} = \mu
253 k_{2} = \frac{n}{n-1} m_{2}
254 k_{3} = \frac{ n^{2} } {(n-1) (n-2)} m_{3}
255 k_{4} = \frac{ n^{2} [(n + 1)m_{4} - 3(n - 1) m^2_{2}]} {(n-1) (n-2) (n-3)}
257 where :math:`\mu` is the sample mean, :math:`m_2` is the sample
258 variance, and :math:`m_i` is the i-th sample central moment.
260 References
261 ----------
262 http://mathworld.wolfram.com/k-Statistic.html
264 http://mathworld.wolfram.com/Cumulant.html
266 Examples
267 --------
268 >>> from scipy import stats
269 >>> from numpy.random import default_rng
270 >>> rng = default_rng()
272 As sample size increases, n-th moment and n-th k-statistic converge to the
273 same number (although they aren't identical). In the case of the normal
274 distribution, they converge to zero.
276 >>> for n in [2, 3, 4, 5, 6, 7]:
277 ... x = rng.normal(size=10**n)
278 ... m, k = stats.moment(x, 3), stats.kstat(x, 3)
279 ... print("%.3g %.3g %.3g" % (m, k, m-k))
280 -0.631 -0.651 0.0194 # random
281 0.0282 0.0283 -8.49e-05
282 -0.0454 -0.0454 1.36e-05
283 7.53e-05 7.53e-05 -2.26e-09
284 0.00166 0.00166 -4.99e-09
285 -2.88e-06 -2.88e-06 8.63e-13
286 """
287 if n > 4 or n < 1:
288 raise ValueError("k-statistics only supported for 1<=n<=4")
289 n = int(n)
290 S = np.zeros(n + 1, np.float64)
291 data = ravel(data)
292 N = data.size
294 # raise ValueError on empty input
295 if N == 0:
296 raise ValueError("Data input must not be empty")
298 # on nan input, return nan without warning
299 if np.isnan(np.sum(data)):
300 return np.nan
302 for k in range(1, n + 1):
303 S[k] = np.sum(data**k, axis=0)
304 if n == 1:
305 return S[1] * 1.0/N
306 elif n == 2:
307 return (N*S[2] - S[1]**2.0) / (N*(N - 1.0))
308 elif n == 3:
309 return (2*S[1]**3 - 3*N*S[1]*S[2] + N*N*S[3]) / (N*(N - 1.0)*(N - 2.0))
310 elif n == 4:
311 return ((-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 -
312 4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) /
313 (N*(N-1.0)*(N-2.0)*(N-3.0)))
314 else:
315 raise ValueError("Should not be here.")
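# Illustrative sketch (hypothetical helper, not part of the original module):
# per the Notes, k_1 is the sample mean and k_2 = n/(n-1) * m_2, i.e. the
# bias-corrected sample variance, which is easy to confirm numerically:
def _sketch_kstat_low_orders():
    from scipy import stats
    rng = np.random.default_rng(12345)
    x = rng.normal(size=50)
    assert np.isclose(stats.kstat(x, 1), x.mean())
    assert np.isclose(stats.kstat(x, 2), x.var(ddof=1))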
318@_axis_nan_policy_factory(
319 lambda x: x, result_to_tuple=lambda x: (x,), n_outputs=1, default_axis=None
320)
321def kstatvar(data, n=2):
322 r"""Return an unbiased estimator of the variance of the k-statistic.
324 See `kstat` for more details of the k-statistic.
326 Parameters
327 ----------
328 data : array_like
329 Input array. Note that n-D input gets flattened.
330 n : int, {1, 2}, optional
331 Default is equal to 2.
333 Returns
334 -------
335 kstatvar : float
336 The nth k-statistic variance.
338 See Also
339 --------
340 kstat: Returns the n-th k-statistic.
341 moment: Returns the n-th central moment about the mean for a sample.
343 Notes
344 -----
345 The variances of the first few k-statistics are given by:
347 .. math::
349 var(k_{1}) = \frac{\kappa_2}{n}
350 var(k_{2}) = \frac{\kappa_4}{n} + \frac{2\kappa^2_{2}}{n - 1}
351 var(k_{3}) = \frac{\kappa_6}{n} + \frac{9 \kappa_2 \kappa_4}{n - 1} +
352 \frac{9 \kappa^2_{3}}{n - 1} +
353 \frac{6 n \kappa^3_{2}}{(n-1) (n-2)}
354 var(k_{4}) = \frac{\kappa_8}{n} + \frac{16 \kappa_2 \kappa_6}{n - 1} +
355 \frac{48 \kappa_{3} \kappa_5}{n - 1} +
356 \frac{34 \kappa^2_{4}}{n-1} + \frac{72 n \kappa^2_{2} \kappa_4}{(n - 1) (n - 2)} +
357 \frac{144 n \kappa_{2} \kappa^2_{3}}{(n - 1) (n - 2)} +
358 \frac{24 (n + 1) n \kappa^4_{2}}{(n - 1) (n - 2) (n - 3)}
359 """
360 data = ravel(data)
361 N = len(data)
362 if n == 1:
363 return kstat(data, n=2) * 1.0/N
364 elif n == 2:
365 k2 = kstat(data, n=2)
366 k4 = kstat(data, n=4)
367 return (2*N*k2**2 + (N-1)*k4) / (N*(N+1))
368 else:
369 raise ValueError("Only n=1 or n=2 supported.")
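# Illustrative sketch (hypothetical helper, not part of the original module):
# for n=1 the value computed above is kstat(data, 2) / N, i.e. the familiar
# estimate var(mean) = s**2 / N of the variance of the sample mean:
def _sketch_kstatvar_of_the_mean():
    from scipy import stats
    rng = np.random.default_rng(0)
    x = rng.normal(size=100)
    assert np.isclose(stats.kstatvar(x, 1), stats.kstat(x, 2) / x.size)
    assert np.isclose(stats.kstatvar(x, 1), x.var(ddof=1) / x.size)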
372def _calc_uniform_order_statistic_medians(n):
373 """Approximations of uniform order statistic medians.
375 Parameters
376 ----------
377 n : int
378 Sample size.
380 Returns
381 -------
382 v : 1d float array
383 Approximations of the order statistic medians.
385 References
386 ----------
387 .. [1] James J. Filliben, "The Probability Plot Correlation Coefficient
388 Test for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
390 Examples
391 --------
392 Order statistics of the uniform distribution on the unit interval
393 are marginally distributed according to beta distributions.
394 The expectations of these order statistics are evenly spaced across
395 the interval, but the distributions are skewed in a way that
396 pushes the medians slightly towards the endpoints of the unit interval:
398 >>> import numpy as np
399 >>> n = 4
400 >>> k = np.arange(1, n+1)
401 >>> from scipy.stats import beta
402 >>> a = k
403 >>> b = n-k+1
404 >>> beta.mean(a, b)
405 array([0.2, 0.4, 0.6, 0.8])
406 >>> beta.median(a, b)
407 array([0.15910358, 0.38572757, 0.61427243, 0.84089642])
409 The Filliben approximation uses the exact medians of the smallest
410 and greatest order statistics, and the remaining medians are approximated
411 by points spread evenly across a sub-interval of the unit interval:
413 >>> from scipy.stats._morestats import _calc_uniform_order_statistic_medians
414 >>> _calc_uniform_order_statistic_medians(n)
415 array([0.15910358, 0.38545246, 0.61454754, 0.84089642])
417 This plot shows the skewed distributions of the order statistics
418 of a sample of size four from a uniform distribution on the unit interval:
420 >>> import matplotlib.pyplot as plt
421 >>> x = np.linspace(0.0, 1.0, num=50, endpoint=True)
422 >>> pdfs = [beta.pdf(x, a[i], b[i]) for i in range(n)]
423 >>> plt.figure()
424 >>> plt.plot(x, pdfs[0], x, pdfs[1], x, pdfs[2], x, pdfs[3])
426 """
427 v = np.empty(n, dtype=np.float64)
428 v[-1] = 0.5**(1.0 / n)
429 v[0] = 1 - v[-1]
430 i = np.arange(2, n)
431 v[1:-1] = (i - 0.3175) / (n + 0.365)
432 return v
435def _parse_dist_kw(dist, enforce_subclass=True):
436 """Parse `dist` keyword.
438 Parameters
439 ----------
440 dist : str or stats.distributions instance.
441 Several functions take `dist` as a keyword, hence this utility
442 function.
443 enforce_subclass : bool, optional
444 If True (default), `dist` needs to be a
445 `_distn_infrastructure.rv_generic` instance.
446 It can sometimes be useful to set this keyword to False, if a function
447 wants to accept objects that just look somewhat like such an instance
448 (for example, they have a ``ppf`` method).
450 """
451 if isinstance(dist, rv_generic):
452 pass
453 elif isinstance(dist, str):
454 try:
455 dist = getattr(distributions, dist)
456 except AttributeError as e:
457 raise ValueError("%s is not a valid distribution name" % dist) from e
458 elif enforce_subclass:
459 msg = ("`dist` should be a stats.distributions instance or a string "
460 "with the name of such a distribution.")
461 raise ValueError(msg)
463 return dist
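# Illustrative sketch (hypothetical helper, not part of the original module):
# `dist` may be given as a name, as a distribution object, or - when
# enforce_subclass=False - as any object exposing what the caller needs
# (for example a ``ppf`` method):
def _sketch_parse_dist_kw_usage():
    assert _parse_dist_kw('norm') is distributions.norm
    assert _parse_dist_kw(distributions.norm) is distributions.norm
    class _PpfOnly:
        def ppf(self, q):
            return q  # identity "quantile function", purely for illustration
    duck = _PpfOnly()
    assert _parse_dist_kw(duck, enforce_subclass=False) is duck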
466def _add_axis_labels_title(plot, xlabel, ylabel, title):
467 """Helper function to add axes labels and a title to stats plots."""
468 try:
469 if hasattr(plot, 'set_title'):
470 # Matplotlib Axes instance or something that looks like it
471 plot.set_title(title)
472 plot.set_xlabel(xlabel)
473 plot.set_ylabel(ylabel)
474 else:
475 # matplotlib.pyplot module
476 plot.title(title)
477 plot.xlabel(xlabel)
478 plot.ylabel(ylabel)
479 except Exception:
480 # Not an MPL object or something that looks (enough) like it.
481 # Don't crash on adding labels or title
482 pass
485def probplot(x, sparams=(), dist='norm', fit=True, plot=None, rvalue=False):
486 """
487 Calculate quantiles for a probability plot, and optionally show the plot.
489 Generates a probability plot of sample data against the quantiles of a
490 specified theoretical distribution (the normal distribution by default).
491 `probplot` optionally calculates a best-fit line for the data and plots the
492 results using Matplotlib or a given plot function.
494 Parameters
495 ----------
496 x : array_like
497 Sample/response data from which `probplot` creates the plot.
498 sparams : tuple, optional
499 Distribution-specific shape parameters (shape parameters plus location
500 and scale).
501 dist : str or stats.distributions instance, optional
502 Distribution or distribution function name. The default is 'norm' for a
503 normal probability plot. Objects that look enough like a
504 stats.distributions instance (i.e. they have a ``ppf`` method) are also
505 accepted.
506 fit : bool, optional
507 Fit a least-squares regression (best-fit) line to the sample data if
508 True (default).
509 plot : object, optional
510 If given, plots the quantiles.
511 If given and `fit` is True, also plots the least squares fit.
512 `plot` is an object that has to have methods "plot" and "text".
513 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
514 or a custom object with the same methods.
515 Default is None, which means that no plot is created.
516 rvalue : bool, optional
517 If `plot` is provided and `fit` is True, setting `rvalue` to True
518 includes the coefficient of determination on the plot.
519 Default is False.
521 Returns
522 -------
523 (osm, osr) : tuple of ndarrays
524 Tuple of theoretical quantiles (osm, or order statistic medians) and
525 ordered responses (osr). `osr` is simply sorted input `x`.
526 For details on how `osm` is calculated see the Notes section.
527 (slope, intercept, r) : tuple of floats, optional
528 Tuple containing the result of the least-squares fit, if that is
529 performed by `probplot`. `r` is the square root of the coefficient of
530 determination. If ``fit=False`` and ``plot=None``, this tuple is not
531 returned.
533 Notes
534 -----
535 Even if `plot` is given, the figure is not shown or saved by `probplot`;
536 ``plt.show()`` or ``plt.savefig('figname.png')`` should be used after
537 calling `probplot`.
539 `probplot` generates a probability plot, which should not be confused with
540 a Q-Q or a P-P plot. Statsmodels has more extensive functionality of this
541 type, see ``statsmodels.api.ProbPlot``.
543 The formula used for the theoretical quantiles (horizontal axis of the
544 probability plot) is Filliben's estimate::
546 quantiles = dist.ppf(val), for
548         0.5**(1/n),                  for i = n
549   val = (i - 0.3175) / (n + 0.365),  for i = 2, ..., n-1
550         1 - 0.5**(1/n),              for i = 1
552 where ``i`` indicates the i-th ordered value and ``n`` is the total number
553 of values.
555 Examples
556 --------
557 >>> import numpy as np
558 >>> from scipy import stats
559 >>> import matplotlib.pyplot as plt
560 >>> nsample = 100
561 >>> rng = np.random.default_rng()
563 A t distribution with small degrees of freedom:
565 >>> ax1 = plt.subplot(221)
566 >>> x = stats.t.rvs(3, size=nsample, random_state=rng)
567 >>> res = stats.probplot(x, plot=plt)
569 A t distribution with larger degrees of freedom:
571 >>> ax2 = plt.subplot(222)
572 >>> x = stats.t.rvs(25, size=nsample, random_state=rng)
573 >>> res = stats.probplot(x, plot=plt)
575 A mixture of two normal distributions with broadcasting:
577 >>> ax3 = plt.subplot(223)
578 >>> x = stats.norm.rvs(loc=[0,5], scale=[1,1.5],
579 ... size=(nsample//2,2), random_state=rng).ravel()
580 >>> res = stats.probplot(x, plot=plt)
582 A standard normal distribution:
584 >>> ax4 = plt.subplot(224)
585 >>> x = stats.norm.rvs(loc=0, scale=1, size=nsample, random_state=rng)
586 >>> res = stats.probplot(x, plot=plt)
588 Produce a new figure with a loggamma distribution, using the ``dist`` and
589 ``sparams`` keywords:
591 >>> fig = plt.figure()
592 >>> ax = fig.add_subplot(111)
593 >>> x = stats.loggamma.rvs(c=2.5, size=500, random_state=rng)
594 >>> res = stats.probplot(x, dist=stats.loggamma, sparams=(2.5,), plot=ax)
595 >>> ax.set_title("Probplot for loggamma dist with shape parameter 2.5")
597 Show the results with Matplotlib:
599 >>> plt.show()
601 """
602 x = np.asarray(x)
603 if x.size == 0:
604 if fit:
605 return (x, x), (np.nan, np.nan, 0.0)
606 else:
607 return x, x
609 osm_uniform = _calc_uniform_order_statistic_medians(len(x))
610 dist = _parse_dist_kw(dist, enforce_subclass=False)
611 if sparams is None:
612 sparams = ()
613 if isscalar(sparams):
614 sparams = (sparams,)
615 if not isinstance(sparams, tuple):
616 sparams = tuple(sparams)
618 osm = dist.ppf(osm_uniform, *sparams)
619 osr = sort(x)
620 if fit:
621 # perform a linear least squares fit.
622 slope, intercept, r, prob, _ = _stats_py.linregress(osm, osr)
624 if plot is not None:
625 plot.plot(osm, osr, 'bo')
626 if fit:
627 plot.plot(osm, slope*osm + intercept, 'r-')
628 _add_axis_labels_title(plot, xlabel='Theoretical quantiles',
629 ylabel='Ordered Values',
630 title='Probability Plot')
632 # Add R^2 value to the plot as text
633 if fit and rvalue:
634 xmin = amin(osm)
635 xmax = amax(osm)
636 ymin = amin(x)
637 ymax = amax(x)
638 posx = xmin + 0.70 * (xmax - xmin)
639 posy = ymin + 0.01 * (ymax - ymin)
640 plot.text(posx, posy, "$R^2=%1.4f$" % r**2)
642 if fit:
643 return (osm, osr), (slope, intercept, r)
644 else:
645 return osm, osr
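# Illustrative sketch (hypothetical helper, not part of the original module):
# the Notes describe the theoretical quantiles as Filliben's order statistic
# medians mapped through ``dist.ppf``; that is exactly what the returned
# ``osm`` contains, while ``osr`` is just the sorted sample:
def _sketch_probplot_theoretical_quantiles():
    from scipy import stats
    rng = np.random.default_rng(1)
    x = rng.normal(size=25)
    (osm, osr), _ = stats.probplot(x, dist='norm')
    filliben = _calc_uniform_order_statistic_medians(len(x))
    assert np.allclose(osm, stats.norm.ppf(filliben))
    assert np.array_equal(osr, np.sort(x))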
648def ppcc_max(x, brack=(0.0, 1.0), dist='tukeylambda'):
649 """Calculate the shape parameter that maximizes the PPCC.
651 The probability plot correlation coefficient (PPCC) plot can be used
652 to determine the optimal shape parameter for a one-parameter family
653 of distributions. ``ppcc_max`` returns the shape parameter that would
654 maximize the probability plot correlation coefficient for the given
655 data to a one-parameter family of distributions.
657 Parameters
658 ----------
659 x : array_like
660 Input array.
661 brack : tuple, optional
662 Triple (a,b,c) where (a<b<c). If bracket consists of two numbers (a, c)
663 then they are assumed to be a starting interval for a downhill bracket
664 search (see `scipy.optimize.brent`).
665 dist : str or stats.distributions instance, optional
666 Distribution or distribution function name. Objects that look enough
667 like a stats.distributions instance (i.e. they have a ``ppf`` method)
668 are also accepted. The default is ``'tukeylambda'``.
670 Returns
671 -------
672 shape_value : float
673 The shape parameter at which the probability plot correlation
674 coefficient reaches its max value.
676 See Also
677 --------
678 ppcc_plot, probplot, boxcox
680 Notes
681 -----
682 The `brack` keyword serves as a starting point, which is useful in corner
683 cases. One can use a plot to obtain a rough visual estimate of the location
684 of the maximum and start the search near it.
686 References
687 ----------
688 .. [1] J.J. Filliben, "The Probability Plot Correlation Coefficient Test
689 for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
690 .. [2] Engineering Statistics Handbook, NIST/SEMATECH,
691 https://www.itl.nist.gov/div898/handbook/eda/section3/ppccplot.htm
693 Examples
694 --------
695 First we generate some random data from a Weibull distribution
696 with shape parameter 2.5:
698 >>> import numpy as np
699 >>> from scipy import stats
700 >>> import matplotlib.pyplot as plt
701 >>> rng = np.random.default_rng()
702 >>> c = 2.5
703 >>> x = stats.weibull_min.rvs(c, scale=4, size=2000, random_state=rng)
705 Generate the PPCC plot for this data with the Weibull distribution.
707 >>> fig, ax = plt.subplots(figsize=(8, 6))
708 >>> res = stats.ppcc_plot(x, c/2, 2*c, dist='weibull_min', plot=ax)
710 We calculate the value where the shape should reach its maximum and a
711 red line is drawn there. The line should coincide with the highest
712 point in the PPCC graph.
714 >>> cmax = stats.ppcc_max(x, brack=(c/2, 2*c), dist='weibull_min')
715 >>> ax.axvline(cmax, color='r')
716 >>> plt.show()
718 """
719 dist = _parse_dist_kw(dist)
720 osm_uniform = _calc_uniform_order_statistic_medians(len(x))
721 osr = sort(x)
723 # this function computes the x-axis values of the probability plot
724 # and computes a linear regression (including the correlation)
725 # and returns 1-r so that a minimization function maximizes the
726 # correlation
727 def tempfunc(shape, mi, yvals, func):
728 xvals = func(mi, shape)
729 r, prob = _stats_py.pearsonr(xvals, yvals)
730 return 1 - r
732 return optimize.brent(tempfunc, brack=brack,
733 args=(osm_uniform, osr, dist.ppf))
736def ppcc_plot(x, a, b, dist='tukeylambda', plot=None, N=80):
737 """Calculate and optionally plot probability plot correlation coefficient.
739 The probability plot correlation coefficient (PPCC) plot can be used to
740 determine the optimal shape parameter for a one-parameter family of
741 distributions. It cannot be used for distributions without shape
742 parameters
743 (like the normal distribution) or with multiple shape parameters.
745 By default a Tukey-Lambda distribution (`stats.tukeylambda`) is used. A
746 Tukey-Lambda PPCC plot interpolates from long-tailed to short-tailed
747 distributions via an approximately normal one, and is therefore
748 particularly useful in practice.
750 Parameters
751 ----------
752 x : array_like
753 Input array.
754 a, b : scalar
755 Lower and upper bounds of the shape parameter to use.
756 dist : str or stats.distributions instance, optional
757 Distribution or distribution function name. Objects that look enough
758 like a stats.distributions instance (i.e. they have a ``ppf`` method)
759 are also accepted. The default is ``'tukeylambda'``.
760 plot : object, optional
761 If given, plots PPCC against the shape parameter.
762 `plot` is an object that has to have methods "plot" and "text".
763 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
764 or a custom object with the same methods.
765 Default is None, which means that no plot is created.
766 N : int, optional
767 Number of points on the horizontal axis (equally distributed from
768 `a` to `b`).
770 Returns
771 -------
772 svals : ndarray
773 The shape values for which `ppcc` was calculated.
774 ppcc : ndarray
775 The calculated probability plot correlation coefficient values.
777 See Also
778 --------
779 ppcc_max, probplot, boxcox_normplot, tukeylambda
781 References
782 ----------
783 J.J. Filliben, "The Probability Plot Correlation Coefficient Test for
784 Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
786 Examples
787 --------
788 First we generate some random data from a Weibull distribution
789 with shape parameter 2.5, and plot the histogram of the data:
791 >>> import numpy as np
792 >>> from scipy import stats
793 >>> import matplotlib.pyplot as plt
794 >>> rng = np.random.default_rng()
795 >>> c = 2.5
796 >>> x = stats.weibull_min.rvs(c, scale=4, size=2000, random_state=rng)
798 Take a look at the histogram of the data.
800 >>> fig1, ax = plt.subplots(figsize=(9, 4))
801 >>> ax.hist(x, bins=50)
802 >>> ax.set_title('Histogram of x')
803 >>> plt.show()
805 Now we explore this data with a PPCC plot as well as the related
806 probability plot and Box-Cox normplot. A red line is drawn where we
807 expect the PPCC value to be maximal (at the shape parameter ``c``
808 used above):
810 >>> fig2 = plt.figure(figsize=(12, 4))
811 >>> ax1 = fig2.add_subplot(1, 3, 1)
812 >>> ax2 = fig2.add_subplot(1, 3, 2)
813 >>> ax3 = fig2.add_subplot(1, 3, 3)
814 >>> res = stats.probplot(x, plot=ax1)
815 >>> res = stats.boxcox_normplot(x, -4, 4, plot=ax2)
816 >>> res = stats.ppcc_plot(x, c/2, 2*c, dist='weibull_min', plot=ax3)
817 >>> ax3.axvline(c, color='r')
818 >>> plt.show()
820 """
821 if b <= a:
822 raise ValueError("`b` has to be larger than `a`.")
824 svals = np.linspace(a, b, num=N)
825 ppcc = np.empty_like(svals)
826 for k, sval in enumerate(svals):
827 _, r2 = probplot(x, sval, dist=dist, fit=True)
828 ppcc[k] = r2[-1]
830 if plot is not None:
831 plot.plot(svals, ppcc, 'x')
832 _add_axis_labels_title(plot, xlabel='Shape Values',
833 ylabel='Prob Plot Corr. Coef.',
834 title='(%s) PPCC Plot' % dist)
836 return svals, ppcc
839def boxcox_llf(lmb, data):
840 r"""The boxcox log-likelihood function.
842 Parameters
843 ----------
844 lmb : scalar
845 Parameter for Box-Cox transformation. See `boxcox` for details.
846 data : array_like
847 Data to calculate Box-Cox log-likelihood for. If `data` is
848 multi-dimensional, the log-likelihood is calculated along the first
849 axis.
851 Returns
852 -------
853 llf : float or ndarray
854 Box-Cox log-likelihood of `data` given `lmb`. A float for 1-D `data`,
855 an array otherwise.
857 See Also
858 --------
859 boxcox, probplot, boxcox_normplot, boxcox_normmax
861 Notes
862 -----
863 The Box-Cox log-likelihood function is defined here as
865 .. math::
867 llf = (\lambda - 1) \sum_i(\log(x_i)) -
868 N/2 \log(\sum_i (y_i - \bar{y})^2 / N),
870 where ``y`` is the Box-Cox transformed input data ``x``.
872 Examples
873 --------
874 >>> import numpy as np
875 >>> from scipy import stats
876 >>> import matplotlib.pyplot as plt
877 >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes
879 Generate some random variates and calculate Box-Cox log-likelihood values
880 for them for a range of ``lmbda`` values:
882 >>> rng = np.random.default_rng()
883 >>> x = stats.loggamma.rvs(5, loc=10, size=1000, random_state=rng)
884 >>> lmbdas = np.linspace(-2, 10)
885 >>> llf = np.zeros(lmbdas.shape, dtype=float)
886 >>> for ii, lmbda in enumerate(lmbdas):
887 ... llf[ii] = stats.boxcox_llf(lmbda, x)
889 Also find the optimal lmbda value with `boxcox`:
891 >>> x_most_normal, lmbda_optimal = stats.boxcox(x)
893 Plot the log-likelihood as function of lmbda. Add the optimal lmbda as a
894 horizontal line to check that that's really the optimum:
896 >>> fig = plt.figure()
897 >>> ax = fig.add_subplot(111)
898 >>> ax.plot(lmbdas, llf, 'b.-')
899 >>> ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r')
900 >>> ax.set_xlabel('lmbda parameter')
901 >>> ax.set_ylabel('Box-Cox log-likelihood')
903 Now add some probability plots to show that where the log-likelihood is
904 maximized the data transformed with `boxcox` looks closest to normal:
906 >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right'
907 >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
908 ... xt = stats.boxcox(x, lmbda=lmbda)
909 ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
910 ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
911 ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
912 ... ax_inset.set_xticklabels([])
913 ... ax_inset.set_yticklabels([])
914 ... ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)
916 >>> plt.show()
918 """
919 data = np.asarray(data)
920 N = data.shape[0]
921 if N == 0:
922 return np.nan
924 logdata = np.log(data)
926 # Compute the variance of the transformed data.
927 if lmb == 0:
928 variance = np.var(logdata, axis=0)
929 else:
930 # Transform without the constant offset 1/lmb. The offset does
931 # not affect the variance, and the subtraction of the offset can
932 # lead to loss of precision.
933 variance = np.var(data**lmb / lmb, axis=0)
935 return (lmb - 1) * np.sum(logdata, axis=0) - N/2 * np.log(variance)
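# Illustrative sketch (hypothetical helper, not part of the original module):
# evaluating the Notes formula directly, with ``y`` the Box-Cox transform of
# ``x``, gives the same value as the implementation above (which drops the
# constant 1/lmb offset only because it does not change the variance):
def _sketch_boxcox_llf_formula():
    from scipy import stats, special
    rng = np.random.default_rng(2)
    x = rng.gamma(shape=3.0, size=200)
    lmbda = 1.7
    y = special.boxcox(x, lmbda)
    by_hand = (lmbda - 1) * np.sum(np.log(x)) - x.size / 2 * np.log(np.var(y))
    assert np.isclose(by_hand, stats.boxcox_llf(lmbda, x))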
938def _boxcox_conf_interval(x, lmax, alpha):
939 # Need to find the lambda for which
940 # f(x,lmbda) >= f(x,lmax) - 0.5*chi^2_alpha;1
941 fac = 0.5 * distributions.chi2.ppf(1 - alpha, 1)
942 target = boxcox_llf(lmax, x) - fac
944 def rootfunc(lmbda, data, target):
945 return boxcox_llf(lmbda, data) - target
947 # Find positive endpoint of interval in which answer is to be found
948 newlm = lmax + 0.5
949 N = 0
950 while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
951 newlm += 0.1
952 N += 1
954 if N == 500:
955 raise RuntimeError("Could not find endpoint.")
957 lmplus = optimize.brentq(rootfunc, lmax, newlm, args=(x, target))
959 # Now find negative interval in the same way
960 newlm = lmax - 0.5
961 N = 0
962 while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
963 newlm -= 0.1
964 N += 1
966 if N == 500:
967 raise RuntimeError("Could not find endpoint.")
969 lmminus = optimize.brentq(rootfunc, newlm, lmax, args=(x, target))
970 return lmminus, lmplus
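# Illustrative sketch (hypothetical helper, not part of the original module):
# the endpoints found above are the roots of
#     boxcox_llf(lmbda, x) == boxcox_llf(lmax, x) - 0.5 * chi2.ppf(1 - alpha, 1)
# which can be checked against the interval returned by `boxcox`:
def _sketch_boxcox_ci_cutoff(alpha=0.05):
    from scipy import stats
    rng = np.random.default_rng(3)
    x = rng.gamma(shape=2.0, size=500)
    _, lmax, (lo, hi) = stats.boxcox(x, alpha=alpha)
    cutoff = stats.boxcox_llf(lmax, x) - 0.5 * stats.chi2.ppf(1 - alpha, 1)
    assert np.isclose(stats.boxcox_llf(lo, x), cutoff, rtol=1e-4)
    assert np.isclose(stats.boxcox_llf(hi, x), cutoff, rtol=1e-4)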
973def boxcox(x, lmbda=None, alpha=None, optimizer=None):
974 r"""Return a dataset transformed by a Box-Cox power transformation.
976 Parameters
977 ----------
978 x : ndarray
979 Input array to be transformed.
981 If `lmbda` is not None, this is an alias of
982 `scipy.special.boxcox`.
983 Returns nan if ``x < 0``; returns -inf if ``x == 0 and lmbda < 0``.
985 If `lmbda` is None, array must be positive, 1-dimensional, and
986 non-constant.
988 lmbda : scalar, optional
989 If `lmbda` is None (default), find the value of `lmbda` that maximizes
990 the log-likelihood function and return it as the second output
991 argument.
993 If `lmbda` is not None, do the transformation for that value.
995 alpha : float, optional
996 If `lmbda` is None and `alpha` is not None, return the
997 ``100 * (1-alpha)%`` confidence interval for `lmbda` as the third
998 output argument. Must be between 0.0 and 1.0.
1000 If `lmbda` is not None, `alpha` is ignored.
1001 optimizer : callable, optional
1002 If `lmbda` is None, `optimizer` is the scalar optimizer used to find
1003 the value of `lmbda` that minimizes the negative log-likelihood
1004 function. `optimizer` is a callable that accepts one argument:
1006 fun : callable
1007 The objective function, which evaluates the negative
1008 log-likelihood function at a provided value of `lmbda`
1010 and returns an object, such as an instance of
1011 `scipy.optimize.OptimizeResult`, which holds the optimal value of
1012 `lmbda` in an attribute `x`.
1014 See the example in `boxcox_normmax` or the documentation of
1015 `scipy.optimize.minimize_scalar` for more information.
1017 If `lmbda` is not None, `optimizer` is ignored.
1019 Returns
1020 -------
1021 boxcox : ndarray
1022 Box-Cox power transformed array.
1023 maxlog : float, optional
1024 If the `lmbda` parameter is None, the second returned argument is
1025 the `lmbda` that maximizes the log-likelihood function.
1026 (min_ci, max_ci) : tuple of float, optional
1027 If `lmbda` parameter is None and `alpha` is not None, this returned
1028 tuple of floats represents the minimum and maximum confidence limits
1029 given `alpha`.
1031 See Also
1032 --------
1033 probplot, boxcox_normplot, boxcox_normmax, boxcox_llf
1035 Notes
1036 -----
1037 The Box-Cox transform is given by::
1039 y = (x**lmbda - 1) / lmbda, for lmbda != 0
1040 log(x), for lmbda = 0
1042 `boxcox` requires the input data to be positive. Sometimes a Box-Cox
1043 transformation provides a shift parameter to achieve this; `boxcox` does
1044 not. Such a shift parameter is equivalent to adding a positive constant to
1045 `x` before calling `boxcox`.
1047 The confidence limits returned when `alpha` is provided give the interval
1048 where:
1050 .. math::
1052 llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1),
1054 with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared
1055 function.
1057 References
1058 ----------
1059 G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the
1060 Royal Statistical Society B, 26, 211-252 (1964).
1062 Examples
1063 --------
1064 >>> from scipy import stats
1065 >>> import matplotlib.pyplot as plt
1067 We generate some random variates from a non-normal distribution and make a
1068 probability plot for it, to show it is non-normal in the tails:
1070 >>> fig = plt.figure()
1071 >>> ax1 = fig.add_subplot(211)
1072 >>> x = stats.loggamma.rvs(5, size=500) + 5
1073 >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
1074 >>> ax1.set_xlabel('')
1075 >>> ax1.set_title('Probplot against normal distribution')
1077 We now use `boxcox` to transform the data so it's closest to normal:
1079 >>> ax2 = fig.add_subplot(212)
1080 >>> xt, _ = stats.boxcox(x)
1081 >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
1082 >>> ax2.set_title('Probplot after Box-Cox transformation')
1084 >>> plt.show()
1086 """
1087 x = np.asarray(x)
1089 if lmbda is not None: # single transformation
1090 return special.boxcox(x, lmbda)
1092 if x.ndim != 1:
1093 raise ValueError("Data must be 1-dimensional.")
1095 if x.size == 0:
1096 return x
1098 if np.all(x == x[0]):
1099 raise ValueError("Data must not be constant.")
1101 if np.any(x <= 0):
1102 raise ValueError("Data must be positive.")
1104 # If lmbda=None, find the lmbda that maximizes the log-likelihood function.
1105 lmax = boxcox_normmax(x, method='mle', optimizer=optimizer)
1106 y = boxcox(x, lmax)
1108 if alpha is None:
1109 return y, lmax
1110 else:
1111 # Find confidence interval
1112 interval = _boxcox_conf_interval(x, lmax, alpha)
1113 return y, lmax, interval
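# Illustrative sketch (hypothetical helper, not part of the original module):
# as documented above, passing an explicit ``lmbda`` makes `boxcox` an alias
# of `scipy.special.boxcox`, so no fitting takes place:
def _sketch_boxcox_fixed_lmbda():
    from scipy import stats, special
    x = np.array([0.5, 1.0, 2.0, 4.0])
    for lmbda in (-1.0, 0.0, 0.5, 2.0):
        assert np.allclose(stats.boxcox(x, lmbda=lmbda), special.boxcox(x, lmbda))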
1116def boxcox_normmax(x, brack=None, method='pearsonr', optimizer=None):
1117 """Compute optimal Box-Cox transform parameter for input data.
1119 Parameters
1120 ----------
1121 x : array_like
1122 Input array.
1123 brack : 2-tuple, optional, default (-2.0, 2.0)
1124 The starting interval for a downhill bracket search for the default
1125 `optimize.brent` solver. Note that this is in most cases not
1126 critical; the final result is allowed to be outside this bracket.
1127 If `optimizer` is passed, `brack` must be None.
1128 method : str, optional
1129 The method to determine the optimal transform parameter (`boxcox`
1130 ``lmbda`` parameter). Options are:
1132 'pearsonr' (default)
1133 Maximizes the Pearson correlation coefficient between
1134 ``y = boxcox(x)`` and the expected values for ``y`` if `x` would be
1135 normally-distributed.
1137 'mle'
1138 Minimizes the log-likelihood `boxcox_llf`. This is the method used
1139 in `boxcox`.
1141 'all'
1142 Use all optimization methods available, and return all results.
1143 Useful to compare different methods.
1144 optimizer : callable, optional
1145 `optimizer` is a callable that accepts one argument:
1147 fun : callable
1148 The objective function to be optimized. `fun` accepts one argument,
1149 the Box-Cox transform parameter `lmbda`, and returns the negative
1150 log-likelihood function at the provided value. The job of `optimizer`
1151 is to find the value of `lmbda` that minimizes `fun`.
1153 and returns an object, such as an instance of
1154 `scipy.optimize.OptimizeResult`, which holds the optimal value of
1155 `lmbda` in an attribute `x`.
1157 See the example below or the documentation of
1158 `scipy.optimize.minimize_scalar` for more information.
1160 Returns
1161 -------
1162 maxlog : float or ndarray
1163 The optimal transform parameter found. An array instead of a scalar
1164 for ``method='all'``.
1166 See Also
1167 --------
1168 boxcox, boxcox_llf, boxcox_normplot, scipy.optimize.minimize_scalar
1170 Examples
1171 --------
1172 >>> import numpy as np
1173 >>> from scipy import stats
1174 >>> import matplotlib.pyplot as plt
1176 We can generate some data and determine the optimal ``lmbda`` in various
1177 ways:
1179 >>> rng = np.random.default_rng()
1180 >>> x = stats.loggamma.rvs(5, size=30, random_state=rng) + 5
1181 >>> y, lmax_mle = stats.boxcox(x)
1182 >>> lmax_pearsonr = stats.boxcox_normmax(x)
1184 >>> lmax_mle
1185 2.217563431465757
1186 >>> lmax_pearsonr
1187 2.238318660200961
1188 >>> stats.boxcox_normmax(x, method='all')
1189 array([2.23831866, 2.21756343])
1191 >>> fig = plt.figure()
1192 >>> ax = fig.add_subplot(111)
1193 >>> prob = stats.boxcox_normplot(x, -10, 10, plot=ax)
1194 >>> ax.axvline(lmax_mle, color='r')
1195 >>> ax.axvline(lmax_pearsonr, color='g', ls='--')
1197 >>> plt.show()
1199 Alternatively, we can define our own `optimizer` function. Suppose we
1200 are only interested in values of `lmbda` on the interval [6, 7], we
1201 want to use `scipy.optimize.minimize_scalar` with ``method='bounded'``,
1202 and we want to use tighter tolerances when optimizing the log-likelihood
1203 function. To do this, we define a function that accepts positional argument
1204 `fun` and uses `scipy.optimize.minimize_scalar` to minimize `fun` subject
1205 to the provided bounds and tolerances:
1207 >>> from scipy import optimize
1208 >>> options = {'xatol': 1e-12} # absolute tolerance on `x`
1209 >>> def optimizer(fun):
1210 ... return optimize.minimize_scalar(fun, bounds=(6, 7),
1211 ... method="bounded", options=options)
1212 >>> stats.boxcox_normmax(x, optimizer=optimizer)
1213 6.000...
1214 """
1215 # If optimizer is not given, define default 'brent' optimizer.
1216 if optimizer is None:
1218 # Set default value for `brack`.
1219 if brack is None:
1220 brack = (-2.0, 2.0)
1222 def _optimizer(func, args):
1223 return optimize.brent(func, args=args, brack=brack)
1225 # Otherwise check optimizer.
1226 else:
1227 if not callable(optimizer):
1228 raise ValueError("`optimizer` must be a callable")
1230 if brack is not None:
1231 raise ValueError("`brack` must be None if `optimizer` is given")
1233 # `optimizer` is expected to return a `OptimizeResult` object, we here
1234 # get the solution to the optimization problem.
1235 def _optimizer(func, args):
1236 def func_wrapped(x):
1237 return func(x, *args)
1238 return getattr(optimizer(func_wrapped), 'x', None)
1240 def _pearsonr(x):
1241 osm_uniform = _calc_uniform_order_statistic_medians(len(x))
1242 xvals = distributions.norm.ppf(osm_uniform)
1244 def _eval_pearsonr(lmbda, xvals, samps):
1245 # This function computes the x-axis values of the probability plot
1246 # and computes a linear regression (including the correlation) and
1247 # returns ``1 - r`` so that a minimization function maximizes the
1248 # correlation.
1249 y = boxcox(samps, lmbda)
1250 yvals = np.sort(y)
1251 r, prob = _stats_py.pearsonr(xvals, yvals)
1252 return 1 - r
1254 return _optimizer(_eval_pearsonr, args=(xvals, x))
1256 def _mle(x):
1257 def _eval_mle(lmb, data):
1258 # function to minimize
1259 return -boxcox_llf(lmb, data)
1261 return _optimizer(_eval_mle, args=(x,))
1263 def _all(x):
1264 maxlog = np.empty(2, dtype=float)
1265 maxlog[0] = _pearsonr(x)
1266 maxlog[1] = _mle(x)
1267 return maxlog
1269 methods = {'pearsonr': _pearsonr,
1270 'mle': _mle,
1271 'all': _all}
1272 if method not in methods.keys():
1273 raise ValueError("Method %s not recognized." % method)
1275 optimfunc = methods[method]
1276 res = optimfunc(x)
1277 if res is None:
1278 message = ("`optimizer` must return an object containing the optimal "
1279 "`lmbda` in attribute `x`")
1280 raise ValueError(message)
1281 return res
1284def _normplot(method, x, la, lb, plot=None, N=80):
1285 """Compute parameters for a Box-Cox or Yeo-Johnson normality plot,
1286 optionally show it.
1288 See `boxcox_normplot` or `yeojohnson_normplot` for details.
1289 """
1291 if method == 'boxcox':
1292 title = 'Box-Cox Normality Plot'
1293 transform_func = boxcox
1294 else:
1295 title = 'Yeo-Johnson Normality Plot'
1296 transform_func = yeojohnson
1298 x = np.asarray(x)
1299 if x.size == 0:
1300 return x
1302 if lb <= la:
1303 raise ValueError("`lb` has to be larger than `la`.")
1305 if method == 'boxcox' and np.any(x <= 0):
1306 raise ValueError("Data must be positive.")
1308 lmbdas = np.linspace(la, lb, num=N)
1309 ppcc = lmbdas * 0.0
1310 for i, val in enumerate(lmbdas):
1311 # Determine for each lmbda the square root of correlation coefficient
1312 # of transformed x
1313 z = transform_func(x, lmbda=val)
1314 _, (_, _, r) = probplot(z, dist='norm', fit=True)
1315 ppcc[i] = r
1317 if plot is not None:
1318 plot.plot(lmbdas, ppcc, 'x')
1319 _add_axis_labels_title(plot, xlabel='$\\lambda$',
1320 ylabel='Prob Plot Corr. Coef.',
1321 title=title)
1323 return lmbdas, ppcc
1326def boxcox_normplot(x, la, lb, plot=None, N=80):
1327 """Compute parameters for a Box-Cox normality plot, optionally show it.
1329 A Box-Cox normality plot shows graphically what the best transformation
1330 parameter is to use in `boxcox` to obtain a distribution that is close
1331 to normal.
1333 Parameters
1334 ----------
1335 x : array_like
1336 Input array.
1337 la, lb : scalar
1338 The lower and upper bounds for the ``lmbda`` values to pass to `boxcox`
1339 for Box-Cox transformations. These are also the limits of the
1340 horizontal axis of the plot if that is generated.
1341 plot : object, optional
1342 If given, plots the quantiles and least squares fit.
1343 `plot` is an object that has to have methods "plot" and "text".
1344 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
1345 or a custom object with the same methods.
1346 Default is None, which means that no plot is created.
1347 N : int, optional
1348 Number of points on the horizontal axis (equally distributed from
1349 `la` to `lb`).
1351 Returns
1352 -------
1353 lmbdas : ndarray
1354 The ``lmbda`` values for which a Box-Cox transform was done.
1355 ppcc : ndarray
1356 Probability Plot Correlation Coefficient, as obtained from `probplot`
1357 when fitting the Box-Cox transformed input `x` against a normal
1358 distribution.
1360 See Also
1361 --------
1362 probplot, boxcox, boxcox_normmax, boxcox_llf, ppcc_max
1364 Notes
1365 -----
1366 Even if `plot` is given, the figure is not shown or saved by
1367 `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
1368 should be used after calling `boxcox_normplot`.
1370 Examples
1371 --------
1372 >>> from scipy import stats
1373 >>> import matplotlib.pyplot as plt
1375 Generate some non-normally distributed data, and create a Box-Cox plot:
1377 >>> x = stats.loggamma.rvs(5, size=500) + 5
1378 >>> fig = plt.figure()
1379 >>> ax = fig.add_subplot(111)
1380 >>> prob = stats.boxcox_normplot(x, -20, 20, plot=ax)
1382 Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in
1383 the same plot:
1385 >>> _, maxlog = stats.boxcox(x)
1386 >>> ax.axvline(maxlog, color='r')
1388 >>> plt.show()
1390 """
1391 return _normplot('boxcox', x, la, lb, plot, N)
1394def yeojohnson(x, lmbda=None):
1395 r"""Return a dataset transformed by a Yeo-Johnson power transformation.
1397 Parameters
1398 ----------
1399 x : ndarray
1400 Input array. Should be 1-dimensional.
1401 lmbda : float, optional
1402 If ``lmbda`` is ``None``, find the lambda that maximizes the
1403 log-likelihood function and return it as the second output argument.
1404 Otherwise the transformation is done for the given value.
1406 Returns
1407 -------
1408 yeojohnson: ndarray
1409 Yeo-Johnson power transformed array.
1410 maxlog : float, optional
1411 If the `lmbda` parameter is None, the second returned argument is
1412 the lambda that maximizes the log-likelihood function.
1414 See Also
1415 --------
1416 probplot, yeojohnson_normplot, yeojohnson_normmax, yeojohnson_llf, boxcox
1418 Notes
1419 -----
1420 The Yeo-Johnson transform is given by::
1422 y = ((x + 1)**lmbda - 1) / lmbda, for x >= 0, lmbda != 0
1423 log(x + 1), for x >= 0, lmbda = 0
1424 -((-x + 1)**(2 - lmbda) - 1) / (2 - lmbda), for x < 0, lmbda != 2
1425 -log(-x + 1), for x < 0, lmbda = 2
1427 Unlike `boxcox`, `yeojohnson` does not require the input data to be
1428 positive.
1430 .. versionadded:: 1.2.0
1433 References
1434 ----------
1435 I. Yeo and R.A. Johnson, "A New Family of Power Transformations to
1436 Improve Normality or Symmetry", Biometrika 87.4 (2000): 954-959.
1439 Examples
1440 --------
1441 >>> from scipy import stats
1442 >>> import matplotlib.pyplot as plt
1444 We generate some random variates from a non-normal distribution and make a
1445 probability plot for it, to show it is non-normal in the tails:
1447 >>> fig = plt.figure()
1448 >>> ax1 = fig.add_subplot(211)
1449 >>> x = stats.loggamma.rvs(5, size=500) + 5
1450 >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
1451 >>> ax1.set_xlabel('')
1452 >>> ax1.set_title('Probplot against normal distribution')
1454 We now use `yeojohnson` to transform the data so it's closest to normal:
1456 >>> ax2 = fig.add_subplot(212)
1457 >>> xt, lmbda = stats.yeojohnson(x)
1458 >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
1459 >>> ax2.set_title('Probplot after Yeo-Johnson transformation')
1461 >>> plt.show()
1463 """
1464 x = np.asarray(x)
1465 if x.size == 0:
1466 return x
1468 if np.issubdtype(x.dtype, np.complexfloating):
1469 raise ValueError('Yeo-Johnson transformation is not defined for '
1470 'complex numbers.')
1472 if np.issubdtype(x.dtype, np.integer):
1473 x = x.astype(np.float64, copy=False)
1475 if lmbda is not None:
1476 return _yeojohnson_transform(x, lmbda)
1478 # if lmbda=None, find the lmbda that maximizes the log-likelihood function.
1479 lmax = yeojohnson_normmax(x)
1480 y = _yeojohnson_transform(x, lmax)
1482 return y, lmax
1485def _yeojohnson_transform(x, lmbda):
1486 """Returns `x` transformed by the Yeo-Johnson power transform with given
1487 parameter `lmbda`.
1488 """
1489 out = np.zeros_like(x)
1490 pos = x >= 0 # binary mask
1492 # when x >= 0
1493 if abs(lmbda) < np.spacing(1.):
1494 out[pos] = np.log1p(x[pos])
1495 else: # lmbda != 0
1496 out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
1498 # when x < 0
1499 if abs(lmbda - 2) > np.spacing(1.):
1500 out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
1501 else: # lmbda == 2
1502 out[~pos] = -np.log1p(-x[~pos])
1504 return out
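# Illustrative sketch (hypothetical helper, not part of the original module):
# spot-check the piecewise formula quoted in the `yeojohnson` Notes against
# this helper for one positive and one negative value:
def _sketch_yeojohnson_branches():
    x = np.array([2.0, -2.0])
    lmbda = 0.5
    expected = np.array([((x[0] + 1)**lmbda - 1) / lmbda,
                         -((-x[1] + 1)**(2 - lmbda) - 1) / (2 - lmbda)])
    assert np.allclose(_yeojohnson_transform(x, lmbda), expected)
    # lmbda = 0 and lmbda = 2 fall back to the logarithmic branches:
    assert np.allclose(_yeojohnson_transform(x, 0.0),
                       [np.log1p(x[0]), -((-x[1] + 1)**2 - 1) / 2])
    assert np.allclose(_yeojohnson_transform(x, 2.0),
                       [((x[0] + 1)**2 - 1) / 2, -np.log1p(-x[1])])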
1507def yeojohnson_llf(lmb, data):
1508 r"""The yeojohnson log-likelihood function.
1510 Parameters
1511 ----------
1512 lmb : scalar
1513 Parameter for Yeo-Johnson transformation. See `yeojohnson` for
1514 details.
1515 data : array_like
1516 Data to calculate Yeo-Johnson log-likelihood for. If `data` is
1517 multi-dimensional, the log-likelihood is calculated along the first
1518 axis.
1520 Returns
1521 -------
1522 llf : float
1523 Yeo-Johnson log-likelihood of `data` given `lmb`.
1525 See Also
1526 --------
1527 yeojohnson, probplot, yeojohnson_normplot, yeojohnson_normmax
1529 Notes
1530 -----
1531 The Yeo-Johnson log-likelihood function is defined here as
1533 .. math::
1535 llf = -N/2 \log(\hat{\sigma}^2) + (\lambda - 1)
1536 \sum_i \text{ sign }(x_i)\log(|x_i| + 1)
1538 where :math:`\hat{\sigma}^2` is the estimated variance of the Yeo-Johnson
1539 transformed input data ``x``.
1541 .. versionadded:: 1.2.0
1543 Examples
1544 --------
1545 >>> import numpy as np
1546 >>> from scipy import stats
1547 >>> import matplotlib.pyplot as plt
1548 >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes
1550 Generate some random variates and calculate Yeo-Johnson log-likelihood
1551 values for them for a range of ``lmbda`` values:
1553 >>> x = stats.loggamma.rvs(5, loc=10, size=1000)
1554 >>> lmbdas = np.linspace(-2, 10)
1555 >>> llf = np.zeros(lmbdas.shape, dtype=float)
1556 >>> for ii, lmbda in enumerate(lmbdas):
1557 ... llf[ii] = stats.yeojohnson_llf(lmbda, x)
1559 Also find the optimal lmbda value with `yeojohnson`:
1561 >>> x_most_normal, lmbda_optimal = stats.yeojohnson(x)
1563 Plot the log-likelihood as function of lmbda. Add the optimal lmbda as a
1564 horizontal line to check that that's really the optimum:
1566 >>> fig = plt.figure()
1567 >>> ax = fig.add_subplot(111)
1568 >>> ax.plot(lmbdas, llf, 'b.-')
1569 >>> ax.axhline(stats.yeojohnson_llf(lmbda_optimal, x), color='r')
1570 >>> ax.set_xlabel('lmbda parameter')
1571 >>> ax.set_ylabel('Yeo-Johnson log-likelihood')
1573 Now add some probability plots to show that where the log-likelihood is
1574 maximized the data transformed with `yeojohnson` looks closest to normal:
1576 >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right'
1577 >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
1578 ... xt = stats.yeojohnson(x, lmbda=lmbda)
1579 ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
1580 ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
1581 ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
1582 ... ax_inset.set_xticklabels([])
1583 ... ax_inset.set_yticklabels([])
1584 ... ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)
1586 >>> plt.show()
1588 """
1589 data = np.asarray(data)
1590 n_samples = data.shape[0]
1592 if n_samples == 0:
1593 return np.nan
1595 trans = _yeojohnson_transform(data, lmb)
1596 trans_var = trans.var(axis=0)
1597 loglike = np.empty_like(trans_var)
1599 # Avoid RuntimeWarning raised by np.log when the variance is too low
1600 tiny_variance = trans_var < np.finfo(trans_var.dtype).tiny
1601 loglike[tiny_variance] = np.inf
1603 loglike[~tiny_variance] = (
1604 -n_samples / 2 * np.log(trans_var[~tiny_variance]))
1605 loglike[~tiny_variance] += (
1606 (lmb - 1) * (np.sign(data) * np.log(np.abs(data) + 1)).sum(axis=0))
1607 return loglike
1610def yeojohnson_normmax(x, brack=(-2, 2)):
1611 """Compute optimal Yeo-Johnson transform parameter.
1613 Compute optimal Yeo-Johnson transform parameter for input data, using
1614 maximum likelihood estimation.
1616 Parameters
1617 ----------
1618 x : array_like
1619 Input array.
1620 brack : 2-tuple, optional
1621 The starting interval for a downhill bracket search with
1622 `optimize.brent`. Note that this is in most cases not critical; the
1623 final result is allowed to be outside this bracket.
1625 Returns
1626 -------
1627 maxlog : float
1628 The optimal transform parameter found.
1630 See Also
1631 --------
1632 yeojohnson, yeojohnson_llf, yeojohnson_normplot
1634 Notes
1635 -----
1636 .. versionadded:: 1.2.0
1638 Examples
1639 --------
1640 >>> import numpy as np
1641 >>> from scipy import stats
1642 >>> import matplotlib.pyplot as plt
1644 Generate some data and determine optimal ``lmbda``
1646 >>> rng = np.random.default_rng()
1647 >>> x = stats.loggamma.rvs(5, size=30, random_state=rng) + 5
1648 >>> lmax = stats.yeojohnson_normmax(x)
1650 >>> fig = plt.figure()
1651 >>> ax = fig.add_subplot(111)
1652 >>> prob = stats.yeojohnson_normplot(x, -10, 10, plot=ax)
1653 >>> ax.axvline(lmax, color='r')
1655 >>> plt.show()
1657 """
1658 def _neg_llf(lmbda, data):
1659 llf = yeojohnson_llf(lmbda, data)
1660 # reject likelihoods that are inf which are likely due to small
1661 # variance in the transformed space
1662 llf[np.isinf(llf)] = -np.inf
1663 return -llf
1665 with np.errstate(invalid='ignore'):
1666 return optimize.brent(_neg_llf, brack=brack, args=(x,))
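# Illustrative sketch (hypothetical helper, not part of the original module):
# the returned parameter should give a log-likelihood at least as large as
# any point on a coarse grid over the bracketing interval:
def _sketch_yeojohnson_normmax_is_a_maximum():
    from scipy import stats
    rng = np.random.default_rng(4)
    x = stats.loggamma.rvs(5, size=200, random_state=rng) + 5
    lmax = stats.yeojohnson_normmax(x)
    grid = np.linspace(-2, 2, 41)
    best_on_grid = max(stats.yeojohnson_llf(lmb, x) for lmb in grid)
    assert stats.yeojohnson_llf(lmax, x) >= best_on_grid - 1e-8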
1669def yeojohnson_normplot(x, la, lb, plot=None, N=80):
1670 """Compute parameters for a Yeo-Johnson normality plot, optionally show it.
1672 A Yeo-Johnson normality plot shows graphically what the best
1673 transformation parameter is to use in `yeojohnson` to obtain a
1674 distribution that is close to normal.
1676 Parameters
1677 ----------
1678 x : array_like
1679 Input array.
1680 la, lb : scalar
1681 The lower and upper bounds for the ``lmbda`` values to pass to
1682 `yeojohnson` for Yeo-Johnson transformations. These are also the
1683 limits of the horizontal axis of the plot if that is generated.
1684 plot : object, optional
1685 If given, plots the quantiles and least squares fit.
1686 `plot` is an object that has to have methods "plot" and "text".
1687 The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
1688 or a custom object with the same methods.
1689 Default is None, which means that no plot is created.
1690 N : int, optional
1691 Number of points on the horizontal axis (equally distributed from
1692 `la` to `lb`).
1694 Returns
1695 -------
1696 lmbdas : ndarray
1697 The ``lmbda`` values for which a Yeo-Johnson transform was done.
1698 ppcc : ndarray
1699 Probability Plot Correlation Coefficient, as obtained from `probplot`
1700 when fitting the Yeo-Johnson transformed input `x` against a normal
1701 distribution.
1703 See Also
1704 --------
1705 probplot, yeojohnson, yeojohnson_normmax, yeojohnson_llf, ppcc_max
1707 Notes
1708 -----
1709 Even if `plot` is given, the figure is not shown or saved by
1710 `yeojohnson_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
1711 should be used after calling `yeojohnson_normplot`.
1713 .. versionadded:: 1.2.0
1715 Examples
1716 --------
1717 >>> from scipy import stats
1718 >>> import matplotlib.pyplot as plt
1720 Generate some non-normally distributed data, and create a Yeo-Johnson plot:
1722 >>> x = stats.loggamma.rvs(5, size=500) + 5
1723 >>> fig = plt.figure()
1724 >>> ax = fig.add_subplot(111)
1725 >>> prob = stats.yeojohnson_normplot(x, -20, 20, plot=ax)
1727 Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in
1728 the same plot:
1730 >>> _, maxlog = stats.yeojohnson(x)
1731 >>> ax.axvline(maxlog, color='r')
1733 >>> plt.show()
1735 """
1736 return _normplot('yeojohnson', x, la, lb, plot, N)
1739ShapiroResult = namedtuple('ShapiroResult', ('statistic', 'pvalue'))
1742def shapiro(x):
1743 """Perform the Shapiro-Wilk test for normality.
1745 The Shapiro-Wilk test tests the null hypothesis that the
1746 data was drawn from a normal distribution.
1748 Parameters
1749 ----------
1750 x : array_like
1751 Array of sample data.
1753 Returns
1754 -------
1755 statistic : float
1756 The test statistic.
1757 p-value : float
1758 The p-value for the hypothesis test.
1760 See Also
1761 --------
1762 anderson : The Anderson-Darling test for normality
1763 kstest : The Kolmogorov-Smirnov test for goodness of fit.
1765 Notes
1766 -----
1767 The algorithm used is described in [4]_ but censoring parameters as
1768 described are not implemented. For N > 5000 the W test statistic is accurate
1769 but the p-value may not be.
1771 The chance of rejecting the null hypothesis when it is true is close to 5%
1772 regardless of sample size.
1774 References
1775 ----------
1776 .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
1777 .. [2] Shapiro, S. S. & Wilk, M.B (1965). An analysis of variance test for
1778 normality (complete samples), Biometrika, Vol. 52, pp. 591-611.
1779 .. [3] Razali, N. M. & Wah, Y. B. (2011) Power comparisons of Shapiro-Wilk,
1780 Kolmogorov-Smirnov, Lilliefors and Anderson-Darling tests, Journal of
1781 Statistical Modeling and Analytics, Vol. 2, pp. 21-33.
1782 .. [4] ALGORITHM AS R94 APPL. STATIST. (1995) VOL. 44, NO. 4.
1784 Examples
1785 --------
1786 >>> import numpy as np
1787 >>> from scipy import stats
1788 >>> rng = np.random.default_rng()
1789 >>> x = stats.norm.rvs(loc=5, scale=3, size=100, random_state=rng)
1790 >>> shapiro_test = stats.shapiro(x)
1791 >>> shapiro_test
1792 ShapiroResult(statistic=0.9813305735588074, pvalue=0.16855233907699585)
1793 >>> shapiro_test.statistic
1794 0.9813305735588074
1795 >>> shapiro_test.pvalue
1796 0.16855233907699585
1798 """
1799 x = np.ravel(x)
1801 N = len(x)
1802 if N < 3:
1803 raise ValueError("Data must be at least length 3.")
1805 x = x - np.median(x)
1807 a = zeros(N, 'f')
1808 init = 0
1810 y = sort(x)
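# _statlib.swilk is the compiled AS R94 routine (ref. [4] in the docstring);
# it returns the updated coefficient array `a`, the W statistic `w`, its
# p-value `pw`, and a fault code `ifault`.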
1811 a, w, pw, ifault = _statlib.swilk(y, a[:N//2], init)
1812 if ifault not in [0, 2]:
1813 warnings.warn("Input data for shapiro has range zero. The results "
1814 "may not be accurate.")
1815 if N > 5000:
1816 warnings.warn("p-value may not be accurate for N > 5000.")
1818 return ShapiroResult(w, pw)
1821# Values from Stephens, M A, "EDF Statistics for Goodness of Fit and
1822# Some Comparisons", Journal of the American Statistical
1823# Association, Vol. 69, Issue 347, Sept. 1974, pp 730-737
1824_Avals_norm = array([0.576, 0.656, 0.787, 0.918, 1.092])
1825_Avals_expon = array([0.922, 1.078, 1.341, 1.606, 1.957])
1826# From Stephens, M A, "Goodness of Fit for the Extreme Value Distribution",
1827# Biometrika, Vol. 64, Issue 3, Dec. 1977, pp 583-588.
1828_Avals_gumbel = array([0.474, 0.637, 0.757, 0.877, 1.038])
1829# From Stephens, M A, "Tests of Fit for the Logistic Distribution Based
1830# on the Empirical Distribution Function.", Biometrika,
1831# Vol. 66, Issue 3, Dec. 1979, pp 591-595.
1832_Avals_logistic = array([0.426, 0.563, 0.660, 0.769, 0.906, 1.010])
1835AndersonResult = _make_tuple_bunch('AndersonResult',
1836 ['statistic', 'critical_values',
1837 'significance_level'], ['fit_result'])
1840def anderson(x, dist='norm'):
1841 """Anderson-Darling test for data coming from a particular distribution.
1843 The Anderson-Darling test tests the null hypothesis that a sample is
1844 drawn from a population that follows a particular distribution.
1845 For the Anderson-Darling test, the critical values depend on
1846 which distribution is being tested against. This function works
1847 for normal, exponential, logistic, or Gumbel (Extreme Value
1848 Type I) distributions.
1850 Parameters
1851 ----------
1852 x : array_like
1853 Array of sample data.
1854 dist : {'norm', 'expon', 'logistic', 'gumbel', 'gumbel_l', 'gumbel_r', 'extreme1'}, optional
1855 The type of distribution to test against. The default is 'norm'.
1856 The names 'extreme1', 'gumbel_l' and 'gumbel' are synonyms for the
1857 same distribution.
1859 Returns
1860 -------
1861 result : AndersonResult
1862 An object with the following attributes:
1864 statistic : float
1865 The Anderson-Darling test statistic.
1866 critical_values : list
1867 The critical values for this distribution.
1868 significance_level : list
1869 The significance levels for the corresponding critical values
1870 in percents. The function returns critical values for a
1871 differing set of significance levels depending on the
1872 distribution that is being tested against.
1873 fit_result : `~scipy.stats._result_classes.FitResult`
1874 An object containing the results of fitting the distribution to
1875 the data.
1877 See Also
1878 --------
1879 kstest : The Kolmogorov-Smirnov test for goodness-of-fit.
1881 Notes
1882 -----
1883 Critical values provided are for the following significance levels:
1885 normal/exponential
1886 15%, 10%, 5%, 2.5%, 1%
1887 logistic
1888 25%, 10%, 5%, 2.5%, 1%, 0.5%
1889 Gumbel
1890 25%, 10%, 5%, 2.5%, 1%
1892 If the returned statistic is larger than these critical values then
1893 for the corresponding significance level, the null hypothesis that
1894 the data come from the chosen distribution can be rejected.
1895 The returned statistic is referred to as 'A2' in the references.
1897 References
1898 ----------
1899 .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
1900 .. [2] Stephens, M. A. (1974). EDF Statistics for Goodness of Fit and
1901 Some Comparisons, Journal of the American Statistical Association,
1902 Vol. 69, pp. 730-737.
1903 .. [3] Stephens, M. A. (1976). Asymptotic Results for Goodness-of-Fit
1904 Statistics with Unknown Parameters, Annals of Statistics, Vol. 4,
1905 pp. 357-369.
1906 .. [4] Stephens, M. A. (1977). Goodness of Fit for the Extreme Value
1907 Distribution, Biometrika, Vol. 64, pp. 583-588.
1908 .. [5] Stephens, M. A. (1977). Goodness of Fit with Special Reference
1909 to Tests for Exponentiality, Technical Report No. 262,
1910 Department of Statistics, Stanford University, Stanford, CA.
1911 .. [6] Stephens, M. A. (1979). Tests of Fit for the Logistic Distribution
1912 Based on the Empirical Distribution Function, Biometrika, Vol. 66,
1913 pp. 591-595.
1915 Examples
1916 --------
1917 Test the null hypothesis that a random sample was drawn from a normal
1918 distribution (with unspecified mean and standard deviation).
1920 >>> import numpy as np
1921 >>> from scipy.stats import anderson
1922 >>> rng = np.random.default_rng()
1923 >>> data = rng.random(size=35)
1924 >>> res = anderson(data)
1925 >>> res.statistic
1926 0.8398018749744764
1927 >>> res.critical_values
1928 array([0.527, 0.6 , 0.719, 0.839, 0.998])
1929 >>> res.significance_level
1930 array([15. , 10. , 5. , 2.5, 1. ])
1932 The value of the statistic (barely) exceeds the critical value associated
1933 with a significance level of 2.5%, so the null hypothesis may be rejected
1934 at a significance level of 2.5%, but not at a significance level of 1%.
1936 """ # noqa
1937 dist = dist.lower()
1938 if dist in {'extreme1', 'gumbel'}:
1939 dist = 'gumbel_l'
1940 dists = {'norm', 'expon', 'gumbel_l', 'gumbel_r', 'logistic'}
1941 if dist not in dists:
1942 raise ValueError(f"Invalid distribution; dist must be in {dists}.")
1943 y = sort(x)
1944 xbar = np.mean(x, axis=0)
1945 N = len(y)
1946 if dist == 'norm':
1947 s = np.std(x, ddof=1, axis=0)
1948 w = (y - xbar) / s
1949 fit_params = xbar, s
1950 logcdf = distributions.norm.logcdf(w)
1951 logsf = distributions.norm.logsf(w)
1952 sig = array([15, 10, 5, 2.5, 1])
1953 critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N), 3)
1954 elif dist == 'expon':
1955 w = y / xbar
1956 fit_params = 0, xbar
1957 logcdf = distributions.expon.logcdf(w)
1958 logsf = distributions.expon.logsf(w)
1959 sig = array([15, 10, 5, 2.5, 1])
1960 critical = around(_Avals_expon / (1.0 + 0.6/N), 3)
1961 elif dist == 'logistic':
1962 def rootfunc(ab, xj, N):
1963 a, b = ab
1964 tmp = (xj - a) / b
1965 tmp2 = exp(tmp)
1966 val = [np.sum(1.0/(1+tmp2), axis=0) - 0.5*N,
1967 np.sum(tmp*(1.0-tmp2)/(1+tmp2), axis=0) + N]
1968 return array(val)
1970 sol0 = array([xbar, np.std(x, ddof=1, axis=0)])
1971 sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5)
1972 w = (y - sol[0]) / sol[1]
1973 fit_params = sol
1974 logcdf = distributions.logistic.logcdf(w)
1975 logsf = distributions.logistic.logsf(w)
1976 sig = array([25, 10, 5, 2.5, 1, 0.5])
1977 critical = around(_Avals_logistic / (1.0 + 0.25/N), 3)
1978 elif dist == 'gumbel_r':
1979 xbar, s = distributions.gumbel_r.fit(x)
1980 w = (y - xbar) / s
1981 fit_params = xbar, s
1982 logcdf = distributions.gumbel_r.logcdf(w)
1983 logsf = distributions.gumbel_r.logsf(w)
1984 sig = array([25, 10, 5, 2.5, 1])
1985 critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
1986 elif dist == 'gumbel_l':
1987 xbar, s = distributions.gumbel_l.fit(x)
1988 w = (y - xbar) / s
1989 fit_params = xbar, s
1990 logcdf = distributions.gumbel_l.logcdf(w)
1991 logsf = distributions.gumbel_l.logsf(w)
1992 sig = array([25, 10, 5, 2.5, 1])
1993 critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
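# Anderson-Darling statistic (the quantity called A2 in the references):
# A2 = -N - (1/N) * sum_{i=1}^{N} (2i - 1) * [log F(w_(i)) + log(1 - F(w_(N+1-i)))],
# where F is the CDF of the hypothesized (standardized) distribution.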
1995 i = arange(1, N + 1)
1996 A2 = -N - np.sum((2*i - 1.0) / N * (logcdf + logsf[::-1]), axis=0)
1998 # FitResult initializer expects an optimize result, so let's work with it
1999 message = '`anderson` successfully fit the distribution to the data.'
2000 res = optimize.OptimizeResult(success=True, message=message)
2001 res.x = np.array(fit_params)
2002 fit_result = FitResult(getattr(distributions, dist), y,
2003 discrete=False, res=res)
2005 return AndersonResult(A2, critical, sig, fit_result=fit_result)
2008def _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N):
2009 """Compute A2akN equation 7 of Scholz and Stephens.
2011 Parameters
2012 ----------
2013 samples : sequence of 1-D array_like
2014 Array of sample arrays.
2015 Z : array_like
2016 Sorted array of all observations.
2017 Zstar : array_like
2018 Sorted array of unique observations.
2019 k : int
2020 Number of samples.
2021 n : array_like
2022 Number of observations in each sample.
2023 N : int
2024 Total number of observations.
2026 Returns
2027 -------
2028 A2aKN : float
2029 The A2aKN statistics of Scholz and Stephens 1987.
2031 """
2032 A2akN = 0.
2033 Z_ssorted_left = Z.searchsorted(Zstar, 'left')
2034 if N == Zstar.size:
2035 lj = 1.
2036 else:
2037 lj = Z.searchsorted(Zstar, 'right') - Z_ssorted_left
2038 Bj = Z_ssorted_left + lj / 2.
2039 for i in arange(0, k):
2040 s = np.sort(samples[i])
2041 s_ssorted_right = s.searchsorted(Zstar, side='right')
2042 Mij = s_ssorted_right.astype(float)
2043 fij = s_ssorted_right - s.searchsorted(Zstar, 'left')
2044 Mij -= fij / 2.
2045 inner = lj / float(N) * (N*Mij - Bj*n[i])**2 / (Bj*(N - Bj) - N*lj/4.)
2046 A2akN += inner.sum() / n[i]
2047 A2akN *= (N - 1.) / N
2048 return A2akN
2051def _anderson_ksamp_right(samples, Z, Zstar, k, n, N):
2052 """Compute A2akN equation 6 of Scholz & Stephens.
2054 Parameters
2055 ----------
2056 samples : sequence of 1-D array_like
2057 Array of sample arrays.
2058 Z : array_like
2059 Sorted array of all observations.
2060 Zstar : array_like
2061 Sorted array of unique observations.
2062 k : int
2063 Number of samples.
2064 n : array_like
2065 Number of observations in each sample.
2066 N : int
2067 Total number of observations.
2069 Returns
2070 -------
2071 A2KN : float
2072 The A2KN statistics of Scholz and Stephens 1987.
2074 """
2075 A2kN = 0.
2076 lj = Z.searchsorted(Zstar[:-1], 'right') - Z.searchsorted(Zstar[:-1],
2077 'left')
2078 Bj = lj.cumsum()
2079 for i in arange(0, k):
2080 s = np.sort(samples[i])
2081 Mij = s.searchsorted(Zstar[:-1], side='right')
2082 inner = lj / float(N) * (N * Mij - Bj * n[i])**2 / (Bj * (N - Bj))
2083 A2kN += inner.sum() / n[i]
2084 return A2kN
2087Anderson_ksampResult = _make_tuple_bunch(
2088 'Anderson_ksampResult',
2089 ['statistic', 'critical_values', 'pvalue'], []
2090)
2093def anderson_ksamp(samples, midrank=True):
2094 """The Anderson-Darling test for k-samples.
2096 The k-sample Anderson-Darling test is a modification of the
2097 one-sample Anderson-Darling test. It tests the null hypothesis
2098 that k-samples are drawn from the same population without having
2099 to specify the distribution function of that population. The
2100 critical values depend on the number of samples.
2102 Parameters
2103 ----------
2104 samples : sequence of 1-D array_like
2105 Array of sample data in arrays.
2106 midrank : bool, optional
2107 Type of Anderson-Darling test which is computed. Default
2108 (True) is the midrank test applicable to continuous and
2109 discrete populations. If False, the right side empirical
2110 distribution is used.
2112 Returns
2113 -------
2114 res : Anderson_ksampResult
2115 An object containing attributes:
2117 statistic : float
2118 Normalized k-sample Anderson-Darling test statistic.
2119 critical_values : array
2120 The critical values for significance levels 25%, 10%, 5%, 2.5%, 1%,
2121 0.5%, 0.1%.
2122 pvalue : float
2123 The approximate p-value of the test. The value is floored / capped
2124 at 0.1% / 25%.
2126 Raises
2127 ------
2128 ValueError
2129 If less than 2 samples are provided, a sample is empty, or no
2130 distinct observations are in the samples.
2132 See Also
2133 --------
2134 ks_2samp : 2 sample Kolmogorov-Smirnov test
2135 anderson : 1 sample Anderson-Darling test
2137 Notes
2138 -----
2139 [1]_ defines three versions of the k-sample Anderson-Darling test:
2140 one for continuous distributions and two for discrete
2141 distributions, in which ties between samples may occur. The
2142 default of this routine is to compute the version based on the
2143 midrank empirical distribution function. This test is applicable
2144 to continuous and discrete data. If midrank is set to False, the
2145 right side empirical distribution is used for a test for discrete
2146 data. According to [1]_, the two discrete test statistics differ
2147 only slightly if a few collisions due to round-off errors occur in
2148 the test not adjusted for ties between samples.
2150 The critical values corresponding to the significance levels from 0.01
2151 to 0.25 are taken from [1]_. p-values are floored / capped
2152 at 0.1% / 25%. Since the range of critical values might be extended in
2153 future releases, it is recommended not to test ``p == 0.25``, but rather
2154 ``p >= 0.25`` (analogously for the lower bound).
2156 .. versionadded:: 0.14.0
2158 References
2159 ----------
2160 .. [1] Scholz, F. W and Stephens, M. A. (1987), K-Sample
2161 Anderson-Darling Tests, Journal of the American Statistical
2162 Association, Vol. 82, pp. 918-924.
2164 Examples
2165 --------
2166 >>> import numpy as np
2167 >>> from scipy import stats
2168 >>> rng = np.random.default_rng()
2169 >>> res = stats.anderson_ksamp([rng.normal(size=50),
2170 ... rng.normal(loc=0.5, size=30)])
2171 >>> res.statistic, res.pvalue
2172 (1.974403288713695, 0.04991293614572478)
2173 >>> res.critical_values
2174 array([0.325, 1.226, 1.961, 2.718, 3.752, 4.592, 6.546])
2176 The null hypothesis that the two random samples come from the same
2177 distribution can be rejected at the 5% level because the returned
2178 test value is greater than the critical value for 5% (1.961) but
2179 not at the 2.5% level. The interpolation gives an approximate
2180 p-value of 4.99%.
2182 >>> res = stats.anderson_ksamp([rng.normal(size=50),
2183 ... rng.normal(size=30), rng.normal(size=20)])
2184 >>> res.statistic, res.pvalue
2185 (-0.29103725200789504, 0.25)
2186 >>> res.critical_values
2187 array([ 0.44925884, 1.3052767 , 1.9434184 , 2.57696569, 3.41634856,
2188 4.07210043, 5.56419101])
2190 The null hypothesis cannot be rejected for three samples from an
2191 identical distribution. The reported p-value (25%) has been capped and
2192 may not be very accurate (since it corresponds to the value 0.449
2193 whereas the statistic is -0.291).
2195 """
2196 k = len(samples)
2197 if (k < 2):
2198 raise ValueError("anderson_ksamp needs at least two samples")
2200 samples = list(map(np.asarray, samples))
2201 Z = np.sort(np.hstack(samples))
2202 N = Z.size
2203 Zstar = np.unique(Z)
2204 if Zstar.size < 2:
2205 raise ValueError("anderson_ksamp needs more than one distinct "
2206 "observation")
2208 n = np.array([sample.size for sample in samples])
2209 if np.any(n == 0):
2210 raise ValueError("anderson_ksamp encountered sample without "
2211 "observations")
2213 if midrank:
2214 A2kN = _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N)
2215 else:
2216 A2kN = _anderson_ksamp_right(samples, Z, Zstar, k, n, N)
2218 H = (1. / n).sum()
2219 hs_cs = (1. / arange(N - 1, 1, -1)).cumsum()
2220 h = hs_cs[-1] + 1
2221 g = (hs_cs / arange(2, N)).sum()
2223 a = (4*g - 6) * (k - 1) + (10 - 6*g)*H
2224 b = (2*g - 4)*k**2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6
2225 c = (6*h + 2*g - 2)*k**2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h
2226 d = (2*h + 6)*k**2 - 4*h*k
2227 sigmasq = (a*N**3 + b*N**2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.))
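# Under the null hypothesis the expected value of A2kN is k - 1, so the
# statistic is standardized as (A2kN - (k - 1)) / sqrt(sigmasq), with the
# variance formula above taken from Scholz & Stephens (1987).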
2228 m = k - 1
2229 A2 = (A2kN - m) / math.sqrt(sigmasq)
2231 # The b_i values are the interpolation coefficients from Table 2
2232 # of Scholz and Stephens 1987
2233 b0 = np.array([0.675, 1.281, 1.645, 1.96, 2.326, 2.573, 3.085])
2234 b1 = np.array([-0.245, 0.25, 0.678, 1.149, 1.822, 2.364, 3.615])
2235 b2 = np.array([-0.105, -0.305, -0.362, -0.391, -0.396, -0.345, -0.154])
2236 critical = b0 + b1 / math.sqrt(m) + b2 / m
2238 sig = np.array([0.25, 0.1, 0.05, 0.025, 0.01, 0.005, 0.001])
2239 if A2 < critical.min():
2240 p = sig.max()
2241 warnings.warn("p-value capped: true value larger than {}".format(p),
2242 stacklevel=2)
2243 elif A2 > critical.max():
2244 p = sig.min()
2245 warnings.warn("p-value floored: true value smaller than {}".format(p),
2246 stacklevel=2)
2247 else:
2248 # quadratic interpolation of the log of the significance level
2249 pf = np.polyfit(critical, log(sig), 2)
2250 p = math.exp(np.polyval(pf, A2))
2252 # create result object with alias for backward compatibility
2253 res = Anderson_ksampResult(A2, critical, p)
2254 res.significance_level = p
2255 return res
2258AnsariResult = namedtuple('AnsariResult', ('statistic', 'pvalue'))
2261class _ABW:
2262 """Distribution of Ansari-Bradley W-statistic under the null hypothesis."""
2263 # TODO: calculate exact distribution considering ties
2264 # We could avoid summing over more than half the frequencies,
2265 # but initially it doesn't seem worth the extra complexity
2267 def __init__(self):
2268 """Minimal initializer."""
2269 self.m = None
2270 self.n = None
2271 self.astart = None
2272 self.total = None
2273 self.freqs = None
2275 def _recalc(self, n, m):
2276 """When necessary, recalculate exact distribution."""
2277 if n != self.n or m != self.m:
2278 self.n, self.m = n, m
2279 # distribution is NOT symmetric when m + n is odd
2280 # n is len(x), m is len(y), and ratio of scales is defined x/y
2281 astart, a1, _ = _statlib.gscale(n, m)
2282 self.astart = astart # minimum value of statistic
2283 # Exact distribution of test statistic under null hypothesis
2284 # expressed as frequencies/counts/integers to maintain precision.
2285 # Stored as floats to avoid overflow of sums.
2286 self.freqs = a1.astype(np.float64)
2287 self.total = self.freqs.sum() # could calculate from m and n
2288 # probability mass is self.freqs / self.total
2290 def pmf(self, k, n, m):
2291 """Probability mass function."""
2292 self._recalc(n, m)
2293 # The convention here is that PMF at k = 12.5 is the same as at k = 12,
2294 # -> use `floor` in case of ties.
2295 ind = np.floor(k - self.astart).astype(int)
2296 return self.freqs[ind] / self.total
2298 def cdf(self, k, n, m):
2299 """Cumulative distribution function."""
2300 self._recalc(n, m)
2301 # Null distribution derived without considering ties is
2302 # approximate. Round down to avoid Type I error.
2303 ind = np.ceil(k - self.astart).astype(int)
2304 return self.freqs[:ind+1].sum() / self.total
2306 def sf(self, k, n, m):
2307 """Survival function."""
2308 self._recalc(n, m)
2309 # Null distribution derived without considering ties is
2310 # approximate. Round down to avoid Type I error.
2311 ind = np.floor(k - self.astart).astype(int)
2312 return self.freqs[ind:].sum() / self.total
2315# Maintain state for faster repeat calls to ansari w/ method='exact'
2316_abw_state = _ABW()
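# For example (illustrative): `_abw_state.pmf(k, n, m)` gives P(W = k) under
# the null hypothesis for sample sizes n and m; the exact two-sided p-value
# used in `ansari` below is 2 * min(_abw_state.cdf(AB, n, m),
# _abw_state.sf(AB, n, m)), clipped to 1.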
2319def ansari(x, y, alternative='two-sided'):
2320 """Perform the Ansari-Bradley test for equal scale parameters.
2322 The Ansari-Bradley test ([1]_, [2]_) is a non-parametric test
2323 for the equality of the scale parameter of the distributions
2324 from which two samples were drawn. The null hypothesis states that
2325 the ratio of the scale of the distribution underlying `x` to the scale
2326 of the distribution underlying `y` is 1.
2328 Parameters
2329 ----------
2330 x, y : array_like
2331 Arrays of sample data.
2332 alternative : {'two-sided', 'less', 'greater'}, optional
2333 Defines the alternative hypothesis. Default is 'two-sided'.
2334 The following options are available:
2336 * 'two-sided': the ratio of scales is not equal to 1.
2337 * 'less': the ratio of scales is less than 1.
2338 * 'greater': the ratio of scales is greater than 1.
2340 .. versionadded:: 1.7.0
2342 Returns
2343 -------
2344 statistic : float
2345 The Ansari-Bradley test statistic.
2346 pvalue : float
2347 The p-value of the hypothesis test.
2349 See Also
2350 --------
2351 fligner : A non-parametric test for the equality of k variances
2352 mood : A non-parametric test for the equality of two scale parameters
2354 Notes
2355 -----
2356 The p-value given is exact when the sample sizes are both less than
2357 55 and there are no ties, otherwise a normal approximation for the
2358 p-value is used.
2360 References
2361 ----------
2362 .. [1] Ansari, A. R. and Bradley, R. A. (1960) Rank-sum tests for
2363 dispersions, Annals of Mathematical Statistics, 31, 1174-1189.
2364 .. [2] Sprent, Peter and N.C. Smeeton. Applied nonparametric
2365 statistical methods. 3rd ed. Chapman and Hall/CRC. 2001.
2366 Section 5.8.2.
2367 .. [3] Nathaniel E. Helwig "Nonparametric Dispersion and Equality
2368 Tests" at http://users.stat.umn.edu/~helwig/notes/npde-Notes.pdf
2370 Examples
2371 --------
2372 >>> import numpy as np
2373 >>> from scipy.stats import ansari
2374 >>> rng = np.random.default_rng()
2376 For these examples, we'll create three random data sets. The first
2377 two, with sizes 35 and 25, are drawn from a normal distribution with
2378 mean 0 and standard deviation 2. The third data set has size 25 and
2379 is drawn from a normal distribution with standard deviation 1.25.
2381 >>> x1 = rng.normal(loc=0, scale=2, size=35)
2382 >>> x2 = rng.normal(loc=0, scale=2, size=25)
2383 >>> x3 = rng.normal(loc=0, scale=1.25, size=25)
2385 First we apply `ansari` to `x1` and `x2`. These samples are drawn
2386 from the same distribution, so we expect the Ansari-Bradley test
2387 should not lead us to conclude that the scales of the distributions
2388 are different.
2390 >>> ansari(x1, x2)
2391 AnsariResult(statistic=541.0, pvalue=0.9762532927399098)
2393 With a p-value close to 1, we cannot conclude that there is a
2394 significant difference in the scales (as expected).
2396 Now apply the test to `x1` and `x3`:
2398 >>> ansari(x1, x3)
2399 AnsariResult(statistic=425.0, pvalue=0.0003087020407974518)
2401 The probability of observing such an extreme value of the statistic
2402 under the null hypothesis of equal scales is only 0.03087%. We take this
2403 as evidence against the null hypothesis in favor of the alternative:
2404 the scales of the distributions from which the samples were drawn
2405 are not equal.
2407 We can use the `alternative` parameter to perform a one-tailed test.
2408 In the above example, the scale of `x1` is greater than `x3` and so
2409 the ratio of scales of `x1` and `x3` is greater than 1. This means
2410 that the p-value when ``alternative='greater'`` should be near 0 and
2411 hence we should be able to reject the null hypothesis:
2413 >>> ansari(x1, x3, alternative='greater')
2414 AnsariResult(statistic=425.0, pvalue=0.0001543510203987259)
2416 As we can see, the p-value is indeed quite low. Use of
2417 ``alternative='less'`` should thus yield a large p-value:
2419 >>> ansari(x1, x3, alternative='less')
2420 AnsariResult(statistic=425.0, pvalue=0.9998643258449039)
2422 """
2423 if alternative not in {'two-sided', 'greater', 'less'}:
2424 raise ValueError("'alternative' must be 'two-sided',"
2425 " 'greater', or 'less'.")
2426 x, y = asarray(x), asarray(y)
2427 n = len(x)
2428 m = len(y)
2429 if m < 1:
2430 raise ValueError("Not enough other observations.")
2431 if n < 1:
2432 raise ValueError("Not enough test observations.")
2434 N = m + n
2435 xy = r_[x, y] # combine
2436 rank = _stats_py.rankdata(xy)
2437 symrank = amin(array((rank, N - rank + 1)), 0)
2438 AB = np.sum(symrank[:n], axis=0)
2439 uxy = unique(xy)
2440 repeats = (len(uxy) != len(xy))
2441 exact = ((m < 55) and (n < 55) and not repeats)
2442 if repeats and (m < 55 or n < 55):
2443 warnings.warn("Ties preclude use of exact statistic.")
2444 if exact:
2445 if alternative == 'two-sided':
2446 pval = 2.0 * np.minimum(_abw_state.cdf(AB, n, m),
2447 _abw_state.sf(AB, n, m))
2448 elif alternative == 'greater':
2449 # AB statistic is _smaller_ when ratio of scales is larger,
2450 # so this is the opposite of the usual calculation
2451 pval = _abw_state.cdf(AB, n, m)
2452 else:
2453 pval = _abw_state.sf(AB, n, m)
2454 return AnsariResult(AB, min(1.0, pval))
2456 # otherwise compute normal approximation
2457 if N % 2: # N odd
2458 mnAB = n * (N+1.0)**2 / 4.0 / N
2459 varAB = n * m * (N+1.0) * (3+N**2) / (48.0 * N**2)
2460 else:
2461 mnAB = n * (N+2.0) / 4.0
2462 varAB = m * n * (N+2) * (N-2.0) / 48 / (N-1.0)
2463 if repeats: # adjust variance estimates
2464 # compute np.sum(tj * rj**2,axis=0)
2465 fac = np.sum(symrank**2, axis=0)
2466 if N % 2: # N odd
2467 varAB = m * n * (16*N*fac - (N+1)**4) / (16.0 * N**2 * (N-1))
2468 else: # N even
2469 varAB = m * n * (16*fac - N*(N+2)**2) / (16.0 * N * (N-1))
2471 # Small values of AB indicate larger dispersion for the x sample.
2472 # Large values of AB indicate larger dispersion for the y sample.
2473 # This is opposite to the way we define the ratio of scales; see [1]_.
2474 z = (mnAB - AB) / sqrt(varAB)
2475 z, pval = _normtest_finish(z, alternative)
2476 return AnsariResult(AB, pval)
2479BartlettResult = namedtuple('BartlettResult', ('statistic', 'pvalue'))
2482def bartlett(*samples):
2483 """Perform Bartlett's test for equal variances.
2485 Bartlett's test tests the null hypothesis that all input samples
2486 are from populations with equal variances. For samples
2487 from significantly non-normal populations, Levene's test
2488 `levene` is more robust.
2490 Parameters
2491 ----------
2492 sample1, sample2, ... : array_like
2493 arrays of sample data. Only 1d arrays are accepted, they may have
2494 different lengths.
2496 Returns
2497 -------
2498 statistic : float
2499 The test statistic.
2500 pvalue : float
2501 The p-value of the test.
2503 See Also
2504 --------
2505 fligner : A non-parametric test for the equality of k variances
2506 levene : A robust parametric test for equality of k variances
2508 Notes
2509 -----
2510 Conover et al. (1981) examine many of the existing parametric and
2511 nonparametric tests by extensive simulations and they conclude that the
2512 tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be
2513 superior in terms of robustness to departures from normality and power
2514 ([3]_).
2516 References
2517 ----------
2518 .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm
2520 .. [2] Snedecor, George W. and Cochran, William G. (1989), Statistical
2521 Methods, Eighth Edition, Iowa State University Press.
2523 .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and
2524 Hypothesis Testing based on Quadratic Inference Function. Technical
2525 Report #99-03, Center for Likelihood Studies, Pennsylvania State
2526 University.
2528 .. [4] Bartlett, M. S. (1937). Properties of Sufficiency and Statistical
2529 Tests. Proceedings of the Royal Society of London. Series A,
2530 Mathematical and Physical Sciences, Vol. 160, No.901, pp. 268-282.
2532 Examples
2533 --------
2534 Test whether or not the lists `a`, `b` and `c` come from populations
2535 with equal variances.
2537 >>> import numpy as np
2538 >>> from scipy.stats import bartlett
2539 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
2540 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
2541 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
2542 >>> stat, p = bartlett(a, b, c)
2543 >>> p
2544 1.1254782518834628e-05
2546 The very small p-value suggests that the populations do not have equal
2547 variances.
2549 This is not surprising, given that the sample variance of `b` is much
2550 larger than that of `a` and `c`:
2552 >>> [np.var(x, ddof=1) for x in [a, b, c]]
2553 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
2555 """
2556 # Handle empty input and input that is not 1d
2557 for sample in samples:
2558 if np.asanyarray(sample).size == 0:
2559 return BartlettResult(np.nan, np.nan)
2560 if np.asanyarray(sample).ndim > 1:
2561 raise ValueError('Samples must be one-dimensional.')
2563 k = len(samples)
2564 if k < 2:
2565 raise ValueError("Must enter at least two input sample vectors.")
2566 Ni = np.empty(k)
2567 ssq = np.empty(k, 'd')
2568 for j in range(k):
2569 Ni[j] = len(samples[j])
2570 ssq[j] = np.var(samples[j], ddof=1)
2571 Ntot = np.sum(Ni, axis=0)
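# Bartlett's statistic ([4]_):
# T = [(Ntot - k) * ln(spsq) - sum_j (N_j - 1) * ln(s_j**2)] / C,
# where spsq is the pooled variance and C is the correction factor computed
# in `denom` below; under the null hypothesis T follows chi2(k - 1).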
2572 spsq = np.sum((Ni - 1)*ssq, axis=0) / (1.0*(Ntot - k))
2573 numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
2574 denom = 1.0 + 1.0/(3*(k - 1)) * ((np.sum(1.0/(Ni - 1.0), axis=0)) -
2575 1.0/(Ntot - k))
2576 T = numer / denom
2577 pval = distributions.chi2.sf(T, k - 1) # 1 - cdf
2579 return BartlettResult(T, pval)
2582LeveneResult = namedtuple('LeveneResult', ('statistic', 'pvalue'))
2585def levene(*samples, center='median', proportiontocut=0.05):
2586 """Perform Levene test for equal variances.
2588 The Levene test tests the null hypothesis that all input samples
2589 are from populations with equal variances. Levene's test is an
2590 alternative to Bartlett's test `bartlett` in the case where
2591 there are significant deviations from normality.
2593 Parameters
2594 ----------
2595 sample1, sample2, ... : array_like
2596 The sample data, possibly with different lengths. Only one-dimensional
2597 samples are accepted.
2598 center : {'mean', 'median', 'trimmed'}, optional
2599 Which function of the data to use in the test. The default
2600 is 'median'.
2601 proportiontocut : float, optional
2602 When `center` is 'trimmed', this gives the proportion of data points
2603 to cut from each end. (See `scipy.stats.trim_mean`.)
2604 Default is 0.05.
2606 Returns
2607 -------
2608 statistic : float
2609 The test statistic.
2610 pvalue : float
2611 The p-value for the test.
2613 Notes
2614 -----
2615 Three variations of Levene's test are possible. The possibilities
2616 and their recommended usages are:
2618 * 'median' : Recommended for skewed (non-normal) distributions.
2619 * 'mean' : Recommended for symmetric, moderate-tailed distributions.
2620 * 'trimmed' : Recommended for heavy-tailed distributions.
2622 The test version using the mean was proposed in the original article
2623 of Levene ([2]_) while the median and trimmed mean have been studied by
2624 Brown and Forsythe ([3]_), sometimes also referred to as Brown-Forsythe
2625 test.
2627 References
2628 ----------
2629 .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm
2630 .. [2] Levene, H. (1960). In Contributions to Probability and Statistics:
2631 Essays in Honor of Harold Hotelling, I. Olkin et al. eds.,
2632 Stanford University Press, pp. 278-292.
2633 .. [3] Brown, M. B. and Forsythe, A. B. (1974), Journal of the American
2634 Statistical Association, 69, 364-367
2636 Examples
2637 --------
2638 Test whether or not the lists `a`, `b` and `c` come from populations
2639 with equal variances.
2641 >>> import numpy as np
2642 >>> from scipy.stats import levene
2643 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
2644 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
2645 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
2646 >>> stat, p = levene(a, b, c)
2647 >>> p
2648 0.002431505967249681
2650 The small p-value suggests that the populations do not have equal
2651 variances.
2653 This is not surprising, given that the sample variance of `b` is much
2654 larger than that of `a` and `c`:
2656 >>> [np.var(x, ddof=1) for x in [a, b, c]]
2657 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
2659 """
2660 if center not in ['mean', 'median', 'trimmed']:
2661 raise ValueError("center must be 'mean', 'median' or 'trimmed'.")
2663 k = len(samples)
2664 if k < 2:
2665 raise ValueError("Must enter at least two input sample vectors.")
2666 # check for 1d input
2667 for j in range(k):
2668 if np.asanyarray(samples[j]).ndim > 1:
2669 raise ValueError('Samples must be one-dimensional.')
2671 Ni = np.empty(k)
2672 Yci = np.empty(k, 'd')
2674 if center == 'median':
2675 func = lambda x: np.median(x, axis=0)
2676 elif center == 'mean':
2677 func = lambda x: np.mean(x, axis=0)
2678 else: # center == 'trimmed'
2679 samples = tuple(_stats_py.trimboth(np.sort(sample), proportiontocut)
2680 for sample in samples)
2681 func = lambda x: np.mean(x, axis=0)
2683 for j in range(k):
2684 Ni[j] = len(samples[j])
2685 Yci[j] = func(samples[j])
2686 Ntot = np.sum(Ni, axis=0)
2688 # compute Zij's
2689 Zij = [None] * k
2690 for i in range(k):
2691 Zij[i] = abs(asarray(samples[i]) - Yci[i])
2693 # compute Zbari
2694 Zbari = np.empty(k, 'd')
2695 Zbar = 0.0
2696 for i in range(k):
2697 Zbari[i] = np.mean(Zij[i], axis=0)
2698 Zbar += Zbari[i] * Ni[i]
2700 Zbar /= Ntot
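# Levene/Brown-Forsythe statistic:
# W = (Ntot - k)/(k - 1) * sum_i N_i*(Zbar_i - Zbar)**2
#                        / sum_{i,j} (Z_ij - Zbar_i)**2,
# which is compared against the F(k - 1, Ntot - k) distribution.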
2701 numer = (Ntot - k) * np.sum(Ni * (Zbari - Zbar)**2, axis=0)
2703 # compute denom_variance
2704 dvar = 0.0
2705 for i in range(k):
2706 dvar += np.sum((Zij[i] - Zbari[i])**2, axis=0)
2708 denom = (k - 1.0) * dvar
2710 W = numer / denom
2711 pval = distributions.f.sf(W, k-1, Ntot-k) # 1 - cdf
2712 return LeveneResult(W, pval)
2715@_deprecated("'binom_test' is deprecated in favour of"
2716 " 'binomtest' from version 1.7.0 and will"
2717 " be removed in Scipy 1.12.0.")
2718def binom_test(x, n=None, p=0.5, alternative='two-sided'):
2719 """Perform a test that the probability of success is p.
2721 This is an exact test of the null hypothesis
2722 that the probability of success in a Bernoulli experiment
2723 is `p`.
2725 .. deprecated:: 1.10.0
2726 `binom_test` is deprecated in favour of `binomtest` and will
2727 be removed in Scipy 1.12.0.
2729 Parameters
2730 ----------
2731 x : int or array_like
2732 The number of successes, or if x has length 2, it is the
2733 number of successes and the number of failures.
2734 n : int
2735 The number of trials. This is ignored if x gives both the
2736 number of successes and failures.
2737 p : float, optional
2738 The hypothesized probability of success. ``0 <= p <= 1``. The
2739 default value is ``p = 0.5``.
2740 alternative : {'two-sided', 'greater', 'less'}, optional
2741 Indicates the alternative hypothesis. The default value is
2742 'two-sided'.
2744 Returns
2745 -------
2746 p-value : float
2747 The p-value of the hypothesis test.
2749 References
2750 ----------
2751 .. [1] https://en.wikipedia.org/wiki/Binomial_test
2753 Examples
2754 --------
2755 >>> from scipy import stats
2757 A car manufacturer claims that no more than 10% of their cars are unsafe.
2758 15 cars are inspected for safety, 3 were found to be unsafe. Test the
2759 manufacturer's claim:
2761 >>> stats.binom_test(3, n=15, p=0.1, alternative='greater')
2762 0.18406106910639114
2764 The null hypothesis cannot be rejected at the 5% level of significance
2765 because the returned p-value is greater than the significance level of 5%.
2767 """
2768 x = atleast_1d(x).astype(np.int_)
2769 if len(x) == 2:
2770 n = x[1] + x[0]
2771 x = x[0]
2772 elif len(x) == 1:
2773 x = x[0]
2774 if n is None or n < x:
2775 raise ValueError("n must be >= x")
2776 n = np.int_(n)
2777 else:
2778 raise ValueError("Incorrect length for x.")
2780 if (p > 1.0) or (p < 0.0):
2781 raise ValueError("p must be in range [0,1]")
2783 if alternative not in ('two-sided', 'less', 'greater'):
2784 raise ValueError("alternative not recognized\n"
2785 "should be 'two-sided', 'less' or 'greater'")
2787 if alternative == 'less':
2788 pval = distributions.binom.cdf(x, n, p)
2789 return pval
2791 if alternative == 'greater':
2792 pval = distributions.binom.sf(x-1, n, p)
2793 return pval
2795 # if alternative was neither 'less' nor 'greater', then it's 'two-sided'
2796 d = distributions.binom.pmf(x, n, p)
2797 rerr = 1 + 1e-7
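# Two-sided p-value: sum the probabilities of all outcomes whose pmf does
# not exceed that of the observed count (`d`); `rerr` is a small relative
# tolerance so that outcomes of essentially equal probability are still
# included despite floating-point rounding.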
2798 if x == p * n:
2799 # special case as shortcut, would also be handled by `else` below
2800 pval = 1.
2801 elif x < p * n:
2802 i = np.arange(np.ceil(p * n), n+1)
2803 y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0)
2804 pval = (distributions.binom.cdf(x, n, p) +
2805 distributions.binom.sf(n - y, n, p))
2806 else:
2807 i = np.arange(np.floor(p*n) + 1)
2808 y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0)
2809 pval = (distributions.binom.cdf(y-1, n, p) +
2810 distributions.binom.sf(x-1, n, p))
2812 return min(1.0, pval)
2815def _apply_func(x, g, func):
2816 # g is list of indices into x
2817 # separating x into different groups
2818 # func should be applied over the groups
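# illustrative example (not from the original source):
# _apply_func(np.arange(6), [2, 4], np.sum) -> array([1, 5, 9])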
2819 g = unique(r_[0, g, len(x)])
2820 output = [func(x[g[k]:g[k+1]]) for k in range(len(g) - 1)]
2822 return asarray(output)
2825FlignerResult = namedtuple('FlignerResult', ('statistic', 'pvalue'))
2828def fligner(*samples, center='median', proportiontocut=0.05):
2829 """Perform Fligner-Killeen test for equality of variance.
2831 Fligner's test tests the null hypothesis that all input samples
2832 are from populations with equal variances. Fligner-Killeen's test is
2833 distribution free when populations are identical [2]_.
2835 Parameters
2836 ----------
2837 sample1, sample2, ... : array_like
2838 Arrays of sample data. Need not be the same length.
2839 center : {'mean', 'median', 'trimmed'}, optional
2840 Keyword argument controlling which function of the data is used in
2841 computing the test statistic. The default is 'median'.
2842 proportiontocut : float, optional
2843 When `center` is 'trimmed', this gives the proportion of data points
2844 to cut from each end. (See `scipy.stats.trim_mean`.)
2845 Default is 0.05.
2847 Returns
2848 -------
2849 statistic : float
2850 The test statistic.
2851 pvalue : float
2852 The p-value for the hypothesis test.
2854 See Also
2855 --------
2856 bartlett : A parametric test for equality of k variances in normal samples
2857 levene : A robust parametric test for equality of k variances
2859 Notes
2860 -----
2861 As with Levene's test there are three variants of Fligner's test that
2862 differ by the measure of central tendency used in the test. See `levene`
2863 for more information.
2865 Conover et al. (1981) examine many of the existing parametric and
2866 nonparametric tests by extensive simulations and they conclude that the
2867 tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be
2868 superior in terms of robustness to departures from normality and power [3]_.
2870 References
2871 ----------
2872 .. [1] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and
2873 Hypothesis Testing based on Quadratic Inference Function. Technical
2874 Report #99-03, Center for Likelihood Studies, Pennsylvania State
2875 University.
2876 https://cecas.clemson.edu/~cspark/cv/paper/qif/draftqif2.pdf
2878 .. [2] Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample
2879 tests for scale. 'Journal of the American Statistical Association.'
2880 71(353), 210-213.
2882 .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and
2883 Hypothesis Testing based on Quadratic Inference Function. Technical
2884 Report #99-03, Center for Likelihood Studies, Pennsylvania State
2885 University.
2887 .. [4] Conover, W. J., Johnson, M. E. and Johnson M. M. (1981). A
2888 comparative study of tests for homogeneity of variances, with
2889 applications to the outer continental shelf bidding data.
2890 Technometrics, 23(4), 351-361.
2892 Examples
2893 --------
2894 Test whether or not the lists `a`, `b` and `c` come from populations
2895 with equal variances.
2897 >>> import numpy as np
2898 >>> from scipy.stats import fligner
2899 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
2900 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
2901 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
2902 >>> stat, p = fligner(a, b, c)
2903 >>> p
2904 0.00450826080004775
2906 The small p-value suggests that the populations do not have equal
2907 variances.
2909 This is not surprising, given that the sample variance of `b` is much
2910 larger than that of `a` and `c`:
2912 >>> [np.var(x, ddof=1) for x in [a, b, c]]
2913 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
2915 """
2916 if center not in ['mean', 'median', 'trimmed']:
2917 raise ValueError("center must be 'mean', 'median' or 'trimmed'.")
2919 # Handle empty input
2920 for sample in samples:
2921 if np.asanyarray(sample).size == 0:
2922 return FlignerResult(np.nan, np.nan)
2924 k = len(samples)
2925 if k < 2:
2926 raise ValueError("Must enter at least two input sample vectors.")
2928 if center == 'median':
2929 func = lambda x: np.median(x, axis=0)
2930 elif center == 'mean':
2931 func = lambda x: np.mean(x, axis=0)
2932 else: # center == 'trimmed'
2933 samples = tuple(_stats_py.trimboth(sample, proportiontocut)
2934 for sample in samples)
2935 func = lambda x: np.mean(x, axis=0)
2937 Ni = asarray([len(samples[j]) for j in range(k)])
2938 Yci = asarray([func(samples[j]) for j in range(k)])
2939 Ntot = np.sum(Ni, axis=0)
2940 # compute Zij's
2941 Zij = [abs(asarray(samples[i]) - Yci[i]) for i in range(k)]
2942 allZij = []
2943 g = [0]
2944 for i in range(k):
2945 allZij.extend(list(Zij[i]))
2946 g.append(len(allZij))
2948 ranks = _stats_py.rankdata(allZij)
2949 sample = distributions.norm.ppf(ranks / (2*(Ntot + 1.0)) + 0.5)
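# The pooled ranks are converted to normal scores
# a_i = Phi^{-1}(1/2 + r_i / (2*(Ntot + 1))); the Fligner-Killeen statistic
# below is a chi-square-type statistic based on the per-sample means of
# these scores.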
2951 # compute Aibar
2952 Aibar = _apply_func(sample, g, np.sum) / Ni
2953 anbar = np.mean(sample, axis=0)
2954 varsq = np.var(sample, axis=0, ddof=1)
2955 Xsq = np.sum(Ni * (asarray(Aibar) - anbar)**2.0, axis=0) / varsq
2956 pval = distributions.chi2.sf(Xsq, k - 1) # 1 - cdf
2957 return FlignerResult(Xsq, pval)
2960@_axis_nan_policy_factory(lambda x1: (x1,), n_samples=4, n_outputs=1)
2961def _mood_inner_lc(xy, x, diffs, sorted_xy, n, m, N) -> float:
2962 # Obtain the unique values and their frequencies from the pooled samples.
2963 # "a_j, + b_j, = t_j, for j = 1, ... k" where `k` is the number of unique
2964 # classes, and "[t]he number of values associated with the x's and y's in
2965 # the jth class will be denoted by a_j, and b_j respectively."
2966 # (Mielke, 312)
2967 # Reuse previously computed sorted array and `diff` arrays to obtain the
2968 # unique values and counts. Prepend `diffs` with a non-zero to indicate
2969 # that the first element should be marked as not matching what preceded it.
2970 diffs_prep = np.concatenate(([1], diffs))
2971 # Unique elements are where there was a difference between elements in the
2972 # sorted array
2973 uniques = sorted_xy[diffs_prep != 0]
2974 # The count of each element is the bin size for each set of consecutive
2975 # differences where the difference is zero. Replace nonzero differences
2976 # with 1 and then use the cumulative sum to count the indices.
2977 t = np.bincount(np.cumsum(np.asarray(diffs_prep != 0, dtype=int)))[1:]
2978 k = len(uniques)
2979 js = np.arange(1, k + 1, dtype=int)
2980 # the `b` array mentioned in the paper is not used, outside of the
2981 # calculation of `t`, so we do not need to calculate it separately. Here
2982 # we calculate `a`. In plain language, `a[j]` is the number of values in
2983 # `x` that equal `uniques[j]`.
2984 sorted_xyx = np.sort(np.concatenate((xy, x)))
2985 diffs = np.diff(sorted_xyx)
2986 diffs_prep = np.concatenate(([1], diffs))
2987 diff_is_nonzero = np.asarray(diffs_prep != 0, dtype=int)
2988 xyx_counts = np.bincount(np.cumsum(diff_is_nonzero))[1:]
2989 a = xyx_counts - t
2990 # "Define .. a_0 = b_0 = t_0 = S_0 = 0" (Mielke 312) so we shift `a`
2991 # and `t` arrays over 1 to allow a first element of 0 to accommodate this
2992 # indexing.
2993 t = np.concatenate(([0], t))
2994 a = np.concatenate(([0], a))
2995 # S is built from `t`, so it does not need a preceding zero added on.
2996 S = np.cumsum(t)
2997 # define a copy of `S` with a prepending zero for later use to avoid
2998 # the need for indexing.
2999 S_i_m1 = np.concatenate(([0], S[:-1]))
3001 # Psi, as defined by the 6th unnumbered equation on page 313 (Mielke).
3002 # Note that the paper contains an error: only the denominator `2` is
3003 # squared there, when the entire expression should be squared.
3004 def psi(indicator):
3005 return (indicator - (N + 1)/2)**2
3007 # define summation range for use in calculation of phi, as seen in sum
3008 # in the unnumbered equation on the bottom of page 312 (Mielke).
3009 s_lower = S[js - 1] + 1
3010 s_upper = S[js] + 1
3011 phi_J = [np.arange(s_lower[idx], s_upper[idx]) for idx in range(k)]
3013 # for every range in the above array, determine the sum of psi(I) for
3014 # every element in the range. Divide all the sums by `t`. Following the
3015 # last unnumbered equation on page 312.
3016 phis = [np.sum(psi(I_j)) for I_j in phi_J] / t[js]
3018 # `T` is equal to a[j] * phi[j], per the first unnumbered equation on
3019 # page 312. `phis` is already in the order based on `js`, so we index
3020 # into `a` with `js` as well.
3021 T = sum(phis * a[js])
3023 # Expectation and variance of T under the null hypothesis
3024 E_0_T = n * (N * N - 1) / 12
3026 varM = (m * n * (N + 1.0) * (N ** 2 - 4) / 180 -
3027 m * n / (180 * N * (N - 1)) * np.sum(
3028 t * (t**2 - 1) * (t**2 - 4 + (15 * (N - S - S_i_m1) ** 2))
3029 ))
3031 return ((T - E_0_T) / np.sqrt(varM),)
3034def mood(x, y, axis=0, alternative="two-sided"):
3035 """Perform Mood's test for equal scale parameters.
3037 Mood's two-sample test for scale parameters is a non-parametric
3038 test for the null hypothesis that two samples are drawn from the
3039 same distribution with the same scale parameter.
3041 Parameters
3042 ----------
3043 x, y : array_like
3044 Arrays of sample data.
3045 axis : int, optional
3046 The axis along which the samples are tested. `x` and `y` can be of
3047 different length along `axis`.
3048 If `axis` is None, `x` and `y` are flattened and the test is done on
3049 all values in the flattened arrays.
3050 alternative : {'two-sided', 'less', 'greater'}, optional
3051 Defines the alternative hypothesis. Default is 'two-sided'.
3052 The following options are available:
3054 * 'two-sided': the scales of the distributions underlying `x` and `y`
3055 are different.
3056 * 'less': the scale of the distribution underlying `x` is less than
3057 the scale of the distribution underlying `y`.
3058 * 'greater': the scale of the distribution underlying `x` is greater
3059 than the scale of the distribution underlying `y`.
3061 .. versionadded:: 1.7.0
3063 Returns
3064 -------
3065 res : SignificanceResult
3066 An object containing attributes:
3068 statistic : scalar or ndarray
3069 The z-score for the hypothesis test. For 1-D inputs a scalar is
3070 returned.
3071 pvalue : scalar or ndarray
3072 The p-value for the hypothesis test.
3074 See Also
3075 --------
3076 fligner : A non-parametric test for the equality of k variances
3077 ansari : A non-parametric test for the equality of 2 variances
3078 bartlett : A parametric test for equality of k variances in normal samples
3079 levene : A parametric test for equality of k variances
3081 Notes
3082 -----
3083 The data are assumed to be drawn from probability distributions ``f(x)``
3084 and ``f(x/s) / s`` respectively, for some probability density function f.
3085 The null hypothesis is that ``s == 1``.
3087 For multi-dimensional arrays, if the inputs are of shapes
3088 ``(n0, n1, n2, n3)`` and ``(n0, m1, n2, n3)``, then if ``axis=1``, the
3089 resulting z and p values will have shape ``(n0, n2, n3)``. Note that
3090 ``n1`` and ``m1`` don't have to be equal, but the other dimensions do.
3092 References
3093 ----------
3094 .. [1] Mielke, Paul W. "Note on Some Squared Rank Tests with Existing Ties."
3095 Technometrics, vol. 9, no. 2, 1967, pp. 312-14. JSTOR,
3096 https://doi.org/10.2307/1266427. Accessed 18 May 2022.
3098 Examples
3099 --------
3100 >>> import numpy as np
3101 >>> from scipy import stats
3102 >>> rng = np.random.default_rng()
3103 >>> x2 = rng.standard_normal((2, 45, 6, 7))
3104 >>> x1 = rng.standard_normal((2, 30, 6, 7))
3105 >>> res = stats.mood(x1, x2, axis=1)
3106 >>> res.pvalue.shape
3107 (2, 6, 7)
3109 Find the number of points where the difference in scale is not significant:
3111 >>> (res.pvalue > 0.1).sum()
3112 78
3114 Perform the test with different scales:
3116 >>> x1 = rng.standard_normal((2, 30))
3117 >>> x2 = rng.standard_normal((2, 35)) * 10.0
3118 >>> stats.mood(x1, x2, axis=1)
3119 SignificanceResult(statistic=array([-5.76174136, -6.12650783]),
3120 pvalue=array([8.32505043e-09, 8.98287869e-10]))
3122 """
3123 x = np.asarray(x, dtype=float)
3124 y = np.asarray(y, dtype=float)
3126 if axis is None:
3127 x = x.flatten()
3128 y = y.flatten()
3129 axis = 0
3131 if axis < 0:
3132 axis = x.ndim + axis
3134 # Determine shape of the result arrays
3135 res_shape = tuple([x.shape[ax] for ax in range(len(x.shape)) if ax != axis])
3136 if not (res_shape == tuple([y.shape[ax] for ax in range(len(y.shape)) if
3137 ax != axis])):
3138 raise ValueError("Dimensions of x and y on all axes except `axis` "
3139 "should match")
3141 n = x.shape[axis]
3142 m = y.shape[axis]
3143 N = m + n
3144 if N < 3:
3145 raise ValueError("Not enough observations.")
3147 xy = np.concatenate((x, y), axis=axis)
3148 # determine if any of the samples contain ties
3149 sorted_xy = np.sort(xy, axis=axis)
3150 diffs = np.diff(sorted_xy, axis=axis)
3151 if 0 in diffs:
3152 z = np.asarray(_mood_inner_lc(xy, x, diffs, sorted_xy, n, m, N,
3153 axis=axis))
3154 else:
3155 if axis != 0:
3156 xy = np.moveaxis(xy, axis, 0)
3158 xy = xy.reshape(xy.shape[0], -1)
3159 # Generalized to the n-dimensional case by adding the axis argument,
3160 # and using for loops, since rankdata is not vectorized. For improving
3161 # performance consider vectorizing rankdata function.
3162 all_ranks = np.empty_like(xy)
3163 for j in range(xy.shape[1]):
3164 all_ranks[:, j] = _stats_py.rankdata(xy[:, j])
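# Mood's statistic: M = sum over the x-sample ranks R_i of (R_i - (N+1)/2)**2.
# With no ties, E[M] = n*(N**2 - 1)/12 and
# Var[M] = m*n*(N+1)*(N+2)*(N-2)/180 under the null hypothesis, giving the
# z-score computed below.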
3166 Ri = all_ranks[:n]
3167 M = np.sum((Ri - (N + 1.0) / 2) ** 2, axis=0)
3168 # Approx stat.
3169 mnM = n * (N * N - 1.0) / 12
3170 varM = m * n * (N + 1.0) * (N + 2) * (N - 2) / 180
3171 z = (M - mnM) / sqrt(varM)
3172 z, pval = _normtest_finish(z, alternative)
3174 if res_shape == ():
3175 # Return scalars, not 0-D arrays
3176 z = z[0]
3177 pval = pval[0]
3178 else:
3179 z.shape = res_shape
3180 pval.shape = res_shape
3181 return SignificanceResult(z, pval)
3184WilcoxonResult = _make_tuple_bunch('WilcoxonResult', ['statistic', 'pvalue'])
3187def wilcoxon_result_unpacker(res):
3188 if hasattr(res, 'zstatistic'):
3189 return res.statistic, res.pvalue, res.zstatistic
3190 else:
3191 return res.statistic, res.pvalue
3194def wilcoxon_result_object(statistic, pvalue, zstatistic=None):
3195 res = WilcoxonResult(statistic, pvalue)
3196 if zstatistic is not None:
3197 res.zstatistic = zstatistic
3198 return res
3201def wilcoxon_outputs(kwds):
3202 method = kwds.get('method', 'auto')
3203 if method == 'approx':
3204 return 3
3205 return 2
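# `wilcoxon_outputs` tells the `_axis_nan_policy_factory` decorator below how
# many outputs to expect: three (statistic, pvalue, zstatistic) when
# method='approx', otherwise two.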
3208@_rename_parameter("mode", "method")
3209@_axis_nan_policy_factory(
3210 wilcoxon_result_object, paired=True,
3211 n_samples=lambda kwds: 2 if kwds.get('y', None) is not None else 1,
3212 result_to_tuple=wilcoxon_result_unpacker, n_outputs=wilcoxon_outputs,
3213)
3214def wilcoxon(x, y=None, zero_method="wilcox", correction=False,
3215 alternative="two-sided", method='auto'):
3216 """Calculate the Wilcoxon signed-rank test.
3218 The Wilcoxon signed-rank test tests the null hypothesis that two
3219 related paired samples come from the same distribution. In particular,
3220 it tests whether the distribution of the differences ``x - y`` is symmetric
3221 about zero. It is a non-parametric version of the paired T-test.
3223 Parameters
3224 ----------
3225 x : array_like
3226 Either the first set of measurements (in which case ``y`` is the second
3227 set of measurements), or the differences between two sets of
3228 measurements (in which case ``y`` is not to be specified.) Must be
3229 one-dimensional.
3230 y : array_like, optional
3231 Either the second set of measurements (if ``x`` is the first set of
3232 measurements), or not specified (if ``x`` is the differences between
3233 two sets of measurements.) Must be one-dimensional.
3234 zero_method : {"wilcox", "pratt", "zsplit"}, optional
3235 There are different conventions for handling pairs of observations
3236 with equal values ("zero-differences", or "zeros").
3238 * "wilcox": Discards all zero-differences (default); see [4]_.
3239 * "pratt": Includes zero-differences in the ranking process,
3240 but drops the ranks of the zeros (more conservative); see [3]_.
3241 In this case, the normal approximation is adjusted as in [5]_.
3242 * "zsplit": Includes zero-differences in the ranking process and
3243 splits the zero rank between positive and negative ones.
3245 correction : bool, optional
3246 If True, apply continuity correction by adjusting the Wilcoxon rank
3247 statistic by 0.5 towards the mean value when computing the
3248 z-statistic if a normal approximation is used. Default is False.
3249 alternative : {"two-sided", "greater", "less"}, optional
3250 Defines the alternative hypothesis. Default is 'two-sided'.
3251 In the following, let ``d`` represent the difference between the paired
3252 samples: ``d = x - y`` if both ``x`` and ``y`` are provided, or
3253 ``d = x`` otherwise.
3255 * 'two-sided': the distribution underlying ``d`` is not symmetric
3256 about zero.
3257 * 'less': the distribution underlying ``d`` is stochastically less
3258 than a distribution symmetric about zero.
3259 * 'greater': the distribution underlying ``d`` is stochastically
3260 greater than a distribution symmetric about zero.
3262 method : {"auto", "exact", "approx"}, optional
3263 Method to calculate the p-value, see Notes. Default is "auto".
3265 Returns
3266 -------
3267 An object with the following attributes.
3269 statistic : array_like
3270 If `alternative` is "two-sided", the sum of the ranks of the
3271 differences above or below zero, whichever is smaller.
3272 Otherwise the sum of the ranks of the differences above zero.
3273 pvalue : array_like
3274 The p-value for the test depending on `alternative` and `method`.
3275 zstatistic : array_like
3276 When ``method = 'approx'``, this is the normalized z-statistic::
3278 z = (T - mn - d) / se
3280 where ``T`` is `statistic` as defined above, ``mn`` is the mean of the
3281 distribution under the null hypothesis, ``d`` is a continuity
3282 correction, and ``se`` is the standard error.
3283 When ``method != 'approx'``, this attribute is not available.
3285 See Also
3286 --------
3287 kruskal, mannwhitneyu
3289 Notes
3290 -----
3291 In the following, let ``d`` represent the difference between the paired
3292 samples: ``d = x - y`` if both ``x`` and ``y`` are provided, or ``d = x``
3293 otherwise. Assume that all elements of ``d`` are independent and
3294 identically distributed observations, and all are distinct and nonzero.
3296 - When ``len(d)`` is sufficiently large, the null distribution of the
3297 normalized test statistic (`zstatistic` above) is approximately normal,
3298 and ``method = 'approx'`` can be used to compute the p-value.
3300 - When ``len(d)`` is small, the normal approximation may not be accurate,
3301 and ``method='exact'`` is preferred (at the cost of additional
3302 execution time).
3304 - The default, ``method='auto'``, selects between the two: when
3305 ``len(d) <= 50``, the exact method is used; otherwise, the approximate
3306 method is used.
3308 The presence of "ties" (i.e. not all elements of ``d`` are unique) and
3309 "zeros" (i.e. elements of ``d`` are zero) changes the null distribution
3310 of the test statistic, and ``method='exact'`` no longer calculates
3311 the exact p-value. If ``method='approx'``, the z-statistic is adjusted
3312 for more accurate comparison against the standard normal, but still,
3313 for finite sample sizes, the standard normal is only an approximation of
3314 the true null distribution of the z-statistic. There is no clear
3315 consensus among references on which method most accurately approximates
3316 the p-value for small samples in the presence of zeros and/or ties. In any
3317 case, this is the behavior of `wilcoxon` when ``method='auto'``:
3318 ``method='exact'`` is used when ``len(d) <= 50`` *and there are no zeros*;
3319 otherwise, ``method='approx'`` is used.
3321 References
3322 ----------
3323 .. [1] https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
3324 .. [2] Conover, W.J., Practical Nonparametric Statistics, 1971.
3325 .. [3] Pratt, J.W., Remarks on Zeros and Ties in the Wilcoxon Signed
3326 Rank Procedures, Journal of the American Statistical Association,
3327 Vol. 54, 1959, pp. 655-667. :doi:`10.1080/01621459.1959.10501526`
3328 .. [4] Wilcoxon, F., Individual Comparisons by Ranking Methods,
3329 Biometrics Bulletin, Vol. 1, 1945, pp. 80-83. :doi:`10.2307/3001968`
3330 .. [5] Cureton, E.E., The Normal Approximation to the Signed-Rank
3331 Sampling Distribution When Zero Differences are Present,
3332 Journal of the American Statistical Association, Vol. 62, 1967,
3333 pp. 1068-1069. :doi:`10.1080/01621459.1967.10500917`
3335 Examples
3336 --------
3337 In [4]_, the differences in height between cross- and self-fertilized
3338 corn plants are given as follows:
3340 >>> d = [6, 8, 14, 16, 23, 24, 28, 29, 41, -48, 49, 56, 60, -67, 75]
3342 Cross-fertilized plants appear to be higher. To test the null
3343 hypothesis that there is no height difference, we can apply the
3344 two-sided test:
3346 >>> from scipy.stats import wilcoxon
3347 >>> res = wilcoxon(d)
3348 >>> res.statistic, res.pvalue
3349 (24.0, 0.041259765625)
3351 Hence, we would reject the null hypothesis at the 5% significance level,
3352 concluding that there is a difference in height between the groups.
3353 To confirm that the median of the differences can be assumed to be
3354 positive, we use:
3356 >>> res = wilcoxon(d, alternative='greater')
3357 >>> res.statistic, res.pvalue
3358 (96.0, 0.0206298828125)
3360 This shows that the null hypothesis that the median is negative can be
3361 rejected at the 5% significance level in favor of the alternative that
3362 the median is greater than zero. The p-values above are exact. Using the
3363 normal approximation gives very similar values:
3365 >>> res = wilcoxon(d, method='approx')
3366 >>> res.statistic, res.pvalue
3367 (24.0, 0.04088813291185591)
3369 Note that the statistic changed to 96 in the one-sided case (the sum
3370 of ranks of positive differences) whereas it is 24 in the two-sided
3371 case (the minimum of the sums of ranks above and below zero).
3373 """
3374 mode = method
3376 if mode not in ["auto", "approx", "exact"]:
3377 raise ValueError("mode must be either 'auto', 'approx' or 'exact'")
3379 if zero_method not in ["wilcox", "pratt", "zsplit"]:
3380 raise ValueError("Zero method must be either 'wilcox' "
3381 "or 'pratt' or 'zsplit'")
3383 if alternative not in ["two-sided", "less", "greater"]:
3384 raise ValueError("Alternative must be either 'two-sided', "
3385 "'greater' or 'less'")
3387 if y is None:
3388 d = asarray(x)
3389 if d.ndim > 1:
3390 raise ValueError('Sample x must be one-dimensional.')
3391 else:
3392 x, y = map(asarray, (x, y))
3393 if x.ndim > 1 or y.ndim > 1:
3394 raise ValueError('Samples x and y must be one-dimensional.')
3395 if len(x) != len(y):
3396 raise ValueError('The samples x and y must have the same length.')
3397 d = x - y
3399 if len(d) == 0:
3400 res = WilcoxonResult(np.nan, np.nan)
3401 if method == 'approx':
3402 res.zstatistic = np.nan
3403 return res
3405 if mode == "auto":
3406 if len(d) <= 50:
3407 mode = "exact"
3408 else:
3409 mode = "approx"
3411 n_zero = np.sum(d == 0)
3412 if n_zero > 0 and mode == "exact":
3413 mode = "approx"
3414 warnings.warn("Exact p-value calculation does not work if there are "
3415 "zeros. Switching to normal approximation.")
3417 if mode == "approx":
3418 if zero_method in ["wilcox", "pratt"]:
3419 if n_zero == len(d):
3420 raise ValueError("zero_method 'wilcox' and 'pratt' do not "
3421 "work if x - y is zero for all elements.")
3422 if zero_method == "wilcox":
3423 # Keep all non-zero differences
3424 d = compress(np.not_equal(d, 0), d)
3426 count = len(d)
3427 if count < 10 and mode == "approx":
3428 warnings.warn("Sample size too small for normal approximation.")
3430 r = _stats_py.rankdata(abs(d))
3431 r_plus = np.sum((d > 0) * r)
3432 r_minus = np.sum((d < 0) * r)
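    # With zero_method='zsplit', the ranks of zero differences are split
    # evenly between the positive and negative rank sums below.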
3434 if zero_method == "zsplit":
3435 r_zero = np.sum((d == 0) * r)
3436 r_plus += r_zero / 2.
3437 r_minus += r_zero / 2.
3439 # return min for two-sided test, but r_plus for one-sided test
3440 # the literature is not consistent here
3441 # r_plus is more informative since r_plus + r_minus = count*(count+1)/2,
3442 # i.e. the sum of the ranks, so r_minus and the min can be inferred
3443 # (If zero_method='pratt', r_plus + r_minus = count*(count+1)/2 - r_zero.)
3444 # [3] uses the r_plus for the one-sided test, keep min for two-sided test
3445 # to keep backwards compatibility
3446 if alternative == "two-sided":
3447 T = min(r_plus, r_minus)
3448 else:
3449 T = r_plus
3451 if mode == "approx":
3452 mn = count * (count + 1.) * 0.25
3453 se = count * (count + 1.) * (2. * count + 1.)
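        # At this point `mn` is the null mean of the signed-rank sum and `se`
        # holds 24x its null variance; after the zero/tie adjustments below,
        # `se` is converted to the standard error via sqrt(se / 24).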
3455 if zero_method == "pratt":
3456 r = r[d != 0]
3457 # normal approximation needs to be adjusted, see Cureton (1967)
3458 mn -= n_zero * (n_zero + 1.) * 0.25
3459 se -= n_zero * (n_zero + 1.) * (2. * n_zero + 1.)
3461 replist, repnum = find_repeats(r)
3462 if repnum.size != 0:
3463 # Correction for repeated elements.
3464 se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()
3466 se = sqrt(se / 24)
3468 # apply continuity correction if applicable
3469 d = 0
3470 if correction:
3471 if alternative == "two-sided":
3472 d = 0.5 * np.sign(T - mn)
3473 elif alternative == "less":
3474 d = -0.5
3475 else:
3476 d = 0.5
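        # The 0.5 shift moves T toward the null mean (two-sided) or in the
        # direction of the alternative hypothesis, accounting for the
        # discreteness of the rank-sum statistic.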
3478 # compute statistic and p-value using normal approximation
3479 z = (T - mn - d) / se
3480 if alternative == "two-sided":
3481 prob = 2. * distributions.norm.sf(abs(z))
3482 elif alternative == "greater":
3483 # large T = r_plus indicates x is greater than y; i.e.
3484 # accept alternative in that case and return small p-value (sf)
3485 prob = distributions.norm.sf(z)
3486 else:
3487 prob = distributions.norm.cdf(z)
3488 elif mode == "exact":
3489 # get pmf of the possible positive ranksums r_plus
3490 pmf = _get_wilcoxon_distr(count)
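        # pmf[k] is the null probability that the sum of positive ranks
        # equals k, for k = 0, 1, ..., count*(count+1)/2.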
3491 # note: r_plus is int (ties not allowed), need int for slices below
3492 r_plus = int(r_plus)
3493 if alternative == "two-sided":
3494 if r_plus == (len(pmf) - 1) // 2:
3495 # r_plus is the center of the distribution.
3496 prob = 1.0
3497 else:
3498 p_less = np.sum(pmf[:r_plus + 1])
3499 p_greater = np.sum(pmf[r_plus:])
3500 prob = 2*min(p_greater, p_less)
3501 elif alternative == "greater":
3502 prob = np.sum(pmf[r_plus:])
3503 else:
3504 prob = np.sum(pmf[:r_plus + 1])
3505 prob = np.clip(prob, 0, 1)
3507 res = WilcoxonResult(T, prob)
3508 if method == 'approx':
3509 res.zstatistic = z
3510 return res
3513MedianTestResult = _make_tuple_bunch(
3514 'MedianTestResult',
3515 ['statistic', 'pvalue', 'median', 'table'], []
3516)
3519def median_test(*samples, ties='below', correction=True, lambda_=1,
3520 nan_policy='propagate'):
3521 """Perform a Mood's median test.
3523 Test that two or more samples come from populations with the same median.
3525 Let ``n = len(samples)`` be the number of samples. The "grand median" of
3526 all the data is computed, and a contingency table is formed by
3527 classifying the values in each sample as being above or below the grand
3528 median. The contingency table, along with `correction` and `lambda_`,
3529 are passed to `scipy.stats.chi2_contingency` to compute the test statistic
3530 and p-value.
3532 Parameters
3533 ----------
3534 sample1, sample2, ... : array_like
3535 The set of samples. There must be at least two samples.
3536 Each sample must be a one-dimensional sequence containing at least
3537 one value. The samples are not required to have the same length.
3538 ties : str, optional
3539 Determines how values equal to the grand median are classified in
3540 the contingency table. The string must be one of::
3542 "below":
3543 Values equal to the grand median are counted as "below".
3544 "above":
3545 Values equal to the grand median are counted as "above".
3546 "ignore":
3547 Values equal to the grand median are not counted.
3549 The default is "below".
3550 correction : bool, optional
3551 If True, *and* there are just two samples, apply Yates' correction
3552 for continuity when computing the test statistic associated with
3553 the contingency table. Default is True.
3554 lambda_ : float or str, optional
3555 By default, the statistic computed in this test is Pearson's
3556 chi-squared statistic. `lambda_` allows a statistic from the
3557 Cressie-Read power divergence family to be used instead. See
3558 `power_divergence` for details.
3559 Default is 1 (Pearson's chi-squared statistic).
3560 nan_policy : {'propagate', 'raise', 'omit'}, optional
3561 Defines how to handle input that contains nan. 'propagate' returns nan,
3562 'raise' raises an error, and 'omit' performs the calculations ignoring nan
3563 values. Default is 'propagate'.
3565 Returns
3566 -------
3567 res : MedianTestResult
3568 An object containing attributes:
3570 statistic : float
3571 The test statistic. The statistic that is returned is determined
3572 by `lambda_`. The default is Pearson's chi-squared statistic.
3573 pvalue : float
3574 The p-value of the test.
3575 median : float
3576 The grand median.
3577 table : ndarray
3578 The contingency table. The shape of the table is (2, n), where
3579 n is the number of samples. The first row holds the counts of the
3580 values above the grand median, and the second row holds the counts
3581 of the values below the grand median. The table allows further
3582 analysis with, for example, `scipy.stats.chi2_contingency`, or with
3583 `scipy.stats.fisher_exact` if there are two samples, without having
3584 to recompute the table. If ``nan_policy`` is "propagate" and there
3585 are nans in the input, the return value for ``table`` is ``None``.
3587 See Also
3588 --------
3589 kruskal : Compute the Kruskal-Wallis H-test for independent samples.
3590 mannwhitneyu : Compute the Mann-Whitney rank test on samples x and y.
3592 Notes
3593 -----
3594 .. versionadded:: 0.15.0
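    A minimal sketch of the procedure described above, assuming two 1-D
    samples ``a`` and ``b`` with no nans and the default ``ties='below'``:
    .. code-block:: python
        import numpy as np
        from scipy.stats import chi2_contingency
        grand_median = np.median(np.concatenate([a, b]))
        above = [np.sum(s > grand_median) for s in (a, b)]
        below = [np.sum(s <= grand_median) for s in (a, b)]  # ties counted as "below"
        table = np.array([above, below])
        # Yates' continuity correction is applied by default for a 2x2 table,
        # matching ``correction=True``.
        stat, p, dof, expected = chi2_contingency(table)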
3596 References
3597 ----------
3598 .. [1] Mood, A. M., Introduction to the Theory of Statistics. McGraw-Hill
3599 (1950), pp. 394-399.
3600 .. [2] Zar, J. H., Biostatistical Analysis, 5th ed. Prentice Hall (2010).
3601 See Sections 8.12 and 10.15.
3603 Examples
3604 --------
3605 A biologist runs an experiment in which there are three groups of plants.
3606 Group 1 has 16 plants, group 2 has 15 plants, and group 3 has 17 plants.
3607 Each plant produces a number of seeds. The seed counts for each group
3608 are::
3610 Group 1: 10 14 14 18 20 22 24 25 31 31 32 39 43 43 48 49
3611 Group 2: 28 30 31 33 34 35 36 40 44 55 57 61 91 92 99
3612 Group 3: 0 3 9 22 23 25 25 33 34 34 40 45 46 48 62 67 84
3614 The following code applies Mood's median test to these samples.
3616 >>> g1 = [10, 14, 14, 18, 20, 22, 24, 25, 31, 31, 32, 39, 43, 43, 48, 49]
3617 >>> g2 = [28, 30, 31, 33, 34, 35, 36, 40, 44, 55, 57, 61, 91, 92, 99]
3618 >>> g3 = [0, 3, 9, 22, 23, 25, 25, 33, 34, 34, 40, 45, 46, 48, 62, 67, 84]
3619 >>> from scipy.stats import median_test
3620 >>> res = median_test(g1, g2, g3)
3622 The median is
3624 >>> res.median
3625 34.0
3627 and the contingency table is
3629 >>> res.table
3630 array([[ 5, 10, 7],
3631 [11, 5, 10]])
3633 The p-value is too large to conclude that the medians are not the same:
3635 >>> res.pvalue
3636 0.12609082774093244
3638 The "G-test" can be performed by passing ``lambda_="log-likelihood"`` to
3639 `median_test`.
3641 >>> res = median_test(g1, g2, g3, lambda_="log-likelihood")
3642 >>> res.pvalue
3643 0.12224779737117837
3645 The median occurs several times in the data, so we'll get a different
3646 result if, for example, ``ties="above"`` is used:
3648 >>> res = median_test(g1, g2, g3, ties="above")
3649 >>> res.pvalue
3650 0.063873276069553273
3652 >>> res.table
3653 array([[ 5, 11, 9],
3654 [11, 4, 8]])
3656 This example demonstrates that if the data set is not large and there
3657 are values equal to the median, the p-value can be sensitive to the
3658 choice of `ties`.
3660 """
3661 if len(samples) < 2:
3662 raise ValueError('median_test requires two or more samples.')
3664 ties_options = ['below', 'above', 'ignore']
3665 if ties not in ties_options:
3666 raise ValueError("invalid 'ties' option '%s'; 'ties' must be one "
3667 "of: %s" % (ties, str(ties_options)[1:-1]))
3669 data = [np.asarray(sample) for sample in samples]
3671 # Validate the sizes and shapes of the arguments.
3672 for k, d in enumerate(data):
3673 if d.size == 0:
3674 raise ValueError("Sample %d is empty. All samples must "
3675 "contain at least one value." % (k + 1))
3676 if d.ndim != 1:
3677 raise ValueError("Sample %d has %d dimensions. All "
3678 "samples must be one-dimensional sequences." %
3679 (k + 1, d.ndim))
3681 cdata = np.concatenate(data)
3682 contains_nan, nan_policy = _contains_nan(cdata, nan_policy)
3683 if contains_nan and nan_policy == 'propagate':
3684 return MedianTestResult(np.nan, np.nan, np.nan, None)
3686 if contains_nan:
3687 grand_median = np.median(cdata[~np.isnan(cdata)])
3688 else:
3689 grand_median = np.median(cdata)
3690 # Note: now that np.nanmedian is available in all supported numpy
3691 # versions, the if/else above could be replaced by the single line:
3692 # grand_median = np.nanmedian(cdata)
3694 # Create the contingency table.
3695 table = np.zeros((2, len(data)), dtype=np.int64)
3696 for k, sample in enumerate(data):
3697 sample = sample[~np.isnan(sample)]
3699 nabove = count_nonzero(sample > grand_median)
3700 nbelow = count_nonzero(sample < grand_median)
3701 nequal = sample.size - (nabove + nbelow)
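        # nequal counts values tied with the grand median; the `ties` option
        # below determines whether they are added to the "above" row, the
        # "below" row, or neither.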
3702 table[0, k] += nabove
3703 table[1, k] += nbelow
3704 if ties == "below":
3705 table[1, k] += nequal
3706 elif ties == "above":
3707 table[0, k] += nequal
3709 # Check that no row or column of the table is all zero.
3710 # Such a table can not be given to chi2_contingency, because it would have
3711 # a zero in the table of expected frequencies.
3712 rowsums = table.sum(axis=1)
3713 if rowsums[0] == 0:
3714 raise ValueError("All values are below the grand median (%r)." %
3715 grand_median)
3716 if rowsums[1] == 0:
3717 raise ValueError("All values are above the grand median (%r)." %
3718 grand_median)
3719 if ties == "ignore":
3720 # We already checked that each sample has at least one value, but it
3721 # is possible that all those values equal the grand median. If `ties`
3722 # is "ignore", that would result in a column of zeros in `table`. We
3723 # check for that case here.
3724 zero_cols = np.nonzero((table == 0).all(axis=0))[0]
3725 if len(zero_cols) > 0:
3726 msg = ("All values in sample %d are equal to the grand "
3727 "median (%r), so they are ignored, resulting in an "
3728 "empty sample." % (zero_cols[0] + 1, grand_median))
3729 raise ValueError(msg)
3731 stat, p, dof, expected = chi2_contingency(table, lambda_=lambda_,
3732 correction=correction)
3733 return MedianTestResult(stat, p, grand_median, table)
3736def _circfuncs_common(samples, high, low, nan_policy='propagate'):
3737 # Ensure samples are array-like and size is not zero
3738 samples = np.asarray(samples)
3739 if samples.size == 0:
3740 return np.nan, np.asarray(np.nan), np.asarray(np.nan), None
3742 # Recast samples as radians that range between 0 and 2 pi and calculate
3743 # the sine and cosine
3744 sin_samp = sin((samples - low)*2.*pi / (high - low))
3745 cos_samp = cos((samples - low)*2.*pi / (high - low))
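    # (With the default range, low=0 and high=2*pi, these are simply
    # sin(samples) and cos(samples).)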
3747 # Apply the NaN policy
3748 contains_nan, nan_policy = _contains_nan(samples, nan_policy)
3749 if contains_nan and nan_policy == 'omit':
3750 mask = np.isnan(samples)
3751 # Set the sines and cosines that are NaN to zero
3752 sin_samp[mask] = 0.0
3753 cos_samp[mask] = 0.0
3754 else:
3755 mask = None
3757 return samples, sin_samp, cos_samp, mask
3760def circmean(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
3761 """Compute the circular mean for samples in a range.
3763 Parameters
3764 ----------
3765 samples : array_like
3766 Input array.
3767 high : float or int, optional
3768 High boundary for the sample range. Default is ``2*pi``.
3769 low : float or int, optional
3770 Low boundary for the sample range. Default is 0.
3771 axis : int, optional
3772 Axis along which means are computed. The default is to compute
3773 the mean of the flattened array.
3774 nan_policy : {'propagate', 'raise', 'omit'}, optional
3775 Defines how to handle input that contains nan. 'propagate' returns nan,
3776 'raise' raises an error, and 'omit' performs the calculations ignoring nan
3777 values. Default is 'propagate'.
3779 Returns
3780 -------
3781 circmean : float
3782 Circular mean.
3784 See Also
3785 --------
3786 circstd : Circular standard deviation.
3787 circvar : Circular variance.
3789 Examples
3790 --------
3791 For simplicity, all angles are printed out in degrees.
3793 >>> import numpy as np
3794 >>> from scipy.stats import circmean
3795 >>> import matplotlib.pyplot as plt
3796 >>> angles = np.deg2rad(np.array([20, 30, 330]))
3797 >>> cmean = circmean(angles)
3798 >>> np.rad2deg(cmean)
3799 7.294976657784009
3801 >>> mean = angles.mean()
3802 >>> np.rad2deg(mean)
3803 126.66666666666666
3805 Plot and compare the circular mean against the arithmetic mean.
3807 >>> plt.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
3808 ... np.sin(np.linspace(0, 2*np.pi, 500)),
3809 ... c='k')
3810 >>> plt.scatter(np.cos(angles), np.sin(angles), c='k')
3811 >>> plt.scatter(np.cos(cmean), np.sin(cmean), c='b',
3812 ... label='circmean')
3813 >>> plt.scatter(np.cos(mean), np.sin(mean), c='r', label='mean')
3814 >>> plt.legend()
3815 >>> plt.axis('equal')
3816 >>> plt.show()
3818 """
3819 samples, sin_samp, cos_samp, nmask = _circfuncs_common(samples, high, low,
3820 nan_policy=nan_policy)
3821 sin_sum = sin_samp.sum(axis=axis)
3822 cos_sum = cos_samp.sum(axis=axis)
3823 res = arctan2(sin_sum, cos_sum)
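    # arctan2 returns angles in [-pi, pi]; the block below shifts negative
    # angles into [0, 2*pi) so that the final result maps back onto
    # [low, high).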
3825 mask_nan = ~np.isnan(res)
3826 if mask_nan.ndim > 0:
3827 mask = res[mask_nan] < 0
3828 else:
3829 mask = res < 0
3831 if mask.ndim > 0:
3832 mask_nan[mask_nan] = mask
3833 res[mask_nan] += 2*pi
3834 elif mask:
3835 res += 2*pi
3837 # Set output to NaN if no samples went into the mean
3838 if nmask is not None:
3839 if nmask.all():
3840 res = np.full(shape=res.shape, fill_value=np.nan)
3841 else:
3843 # Find out if any of the axes being averaged consist
3844 # entirely of NaN. If so, set the result (res) to NaN
3844 nshape = 0 if axis is None else axis
3845 smask = nmask.shape[nshape] == nmask.sum(axis=axis)
3846 if smask.any():
3847 res[smask] = np.nan
3849 return res*(high - low)/2.0/pi + low
3852def circvar(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
3853 """Compute the circular variance for samples assumed to be in a range.
3855 Parameters
3856 ----------
3857 samples : array_like
3858 Input array.
3859 high : float or int, optional
3860 High boundary for the sample range. Default is ``2*pi``.
3861 low : float or int, optional
3862 Low boundary for the sample range. Default is 0.
3863 axis : int, optional
3864 Axis along which variances are computed. The default is to compute
3865 the variance of the flattened array.
3866 nan_policy : {'propagate', 'raise', 'omit'}, optional
3867 Defines how to handle input that contains nan. 'propagate' returns nan,
3868 'raise' raises an error, and 'omit' performs the calculations ignoring nan
3869 values. Default is 'propagate'.
3871 Returns
3872 -------
3873 circvar : float
3874 Circular variance.
3876 See Also
3877 --------
3878 circmean : Circular mean.
3879 circstd : Circular standard deviation.
3881 Notes
3882 -----
3883 This uses the following definition of circular variance: ``1-R``, where
3884 ``R`` is the mean resultant length. The
3885 returned value is in the range [0, 1], 0 standing for no variance, and 1
3886 for a large variance. In the limit of small angles, this value is similar
3887 to half the 'linear' variance.
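    A minimal sketch of the calculation, assuming the default range
    (``low=0``, ``high=2*pi``) and no nan values:
    .. code-block:: python
        import numpy as np
        S = np.sin(samples).mean()
        C = np.cos(samples).mean()
        R = np.sqrt(S**2 + C**2)   # mean resultant length
        circvar = 1 - R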
3889 References
3890 ----------
3891 .. [1] Fisher, N.I. *Statistical analysis of circular data*. Cambridge
3892 University Press, 1993.
3894 Examples
3895 --------
3896 >>> import numpy as np
3897 >>> from scipy.stats import circvar
3898 >>> import matplotlib.pyplot as plt
3899 >>> samples_1 = np.array([0.072, -0.158, 0.077, 0.108, 0.286,
3900 ... 0.133, -0.473, -0.001, -0.348, 0.131])
3901 >>> samples_2 = np.array([0.111, -0.879, 0.078, 0.733, 0.421,
3902 ... 0.104, -0.136, -0.867, 0.012, 0.105])
3903 >>> circvar_1 = circvar(samples_1)
3904 >>> circvar_2 = circvar(samples_2)
3906 Plot the samples.
3908 >>> fig, (left, right) = plt.subplots(ncols=2)
3909 >>> for image in (left, right):
3910 ... image.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
3911 ... np.sin(np.linspace(0, 2*np.pi, 500)),
3912 ... c='k')
3913 ... image.axis('equal')
3914 ... image.axis('off')
3915 >>> left.scatter(np.cos(samples_1), np.sin(samples_1), c='k', s=15)
3916 >>> left.set_title(f"circular variance: {np.round(circvar_1, 2)!r}")
3917 >>> right.scatter(np.cos(samples_2), np.sin(samples_2), c='k', s=15)
3918 >>> right.set_title(f"circular variance: {np.round(circvar_2, 2)!r}")
3919 >>> plt.show()
3921 """
3922 samples, sin_samp, cos_samp, mask = _circfuncs_common(samples, high, low,
3923 nan_policy=nan_policy)
3924 if mask is None:
3925 sin_mean = sin_samp.mean(axis=axis)
3926 cos_mean = cos_samp.mean(axis=axis)
3927 else:
3928 nsum = np.asarray(np.sum(~mask, axis=axis).astype(float))
3929 nsum[nsum == 0] = np.nan
3930 sin_mean = sin_samp.sum(axis=axis) / nsum
3931 cos_mean = cos_samp.sum(axis=axis) / nsum
3932 # hypot can go slightly above 1 due to rounding errors
3933 with np.errstate(invalid='ignore'):
3934 R = np.minimum(1, hypot(sin_mean, cos_mean))
3936 res = 1. - R
3937 return res
3940def circstd(samples, high=2*pi, low=0, axis=None, nan_policy='propagate', *,
3941 normalize=False):
3942 """
3943 Compute the circular standard deviation for samples assumed to be in the
3944 range [low, high].
3946 Parameters
3947 ----------
3948 samples : array_like
3949 Input array.
3950 high : float or int, optional
3951 High boundary for the sample range. Default is ``2*pi``.
3952 low : float or int, optional
3953 Low boundary for the sample range. Default is 0.
3954 axis : int, optional
3955 Axis along which standard deviations are computed. The default is
3956 to compute the standard deviation of the flattened array.
3957 nan_policy : {'propagate', 'raise', 'omit'}, optional
3958 Defines how to handle input that contains nan. 'propagate' returns nan,
3959 'raise' raises an error, and 'omit' performs the calculations ignoring nan
3960 values. Default is 'propagate'.
3961 normalize : boolean, optional
3962 If True, the returned value is equal to ``sqrt(-2*log(R))`` and does
3963 not depend on the variable units. If False (default), the returned
3964 value is scaled by ``((high-low)/(2*pi))``.
3966 Returns
3967 -------
3968 circstd : float
3969 Circular standard deviation.
3971 See Also
3972 --------
3973 circmean : Circular mean.
3974 circvar : Circular variance.
3976 Notes
3977 -----
3978 This uses a definition of circular standard deviation from [1]_.
3979 Essentially, the calculation is as follows.
3981 .. code-block:: python
3983 import numpy as np
3984 C = np.cos(samples).mean()
3985 S = np.sin(samples).mean()
3986 R = np.sqrt(C**2 + S**2)
3987 l = 2*np.pi / (high-low)
3988 circstd = np.sqrt(-2*np.log(R)) / l
3990 In the limit of small angles, it returns a number close to the 'linear'
3991 standard deviation.
3993 References
3994 ----------
3995 .. [1] Mardia, K. V. (1972). 2. In *Statistics of Directional Data*
3996 (pp. 18-24). Academic Press. :doi:`10.1016/C2013-0-07425-7`.
3998 Examples
3999 --------
4000 >>> import numpy as np
4001 >>> from scipy.stats import circstd
4002 >>> import matplotlib.pyplot as plt
4003 >>> samples_1 = np.array([0.072, -0.158, 0.077, 0.108, 0.286,
4004 ... 0.133, -0.473, -0.001, -0.348, 0.131])
4005 >>> samples_2 = np.array([0.111, -0.879, 0.078, 0.733, 0.421,
4006 ... 0.104, -0.136, -0.867, 0.012, 0.105])
4007 >>> circstd_1 = circstd(samples_1)
4008 >>> circstd_2 = circstd(samples_2)
4010 Plot the samples.
4012 >>> fig, (left, right) = plt.subplots(ncols=2)
4013 >>> for image in (left, right):
4014 ... image.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
4015 ... np.sin(np.linspace(0, 2*np.pi, 500)),
4016 ... c='k')
4017 ... image.axis('equal')
4018 ... image.axis('off')
4019 >>> left.scatter(np.cos(samples_1), np.sin(samples_1), c='k', s=15)
4020 >>> left.set_title(f"circular std: {np.round(circstd_1, 2)!r}")
4021 >>> right.plot(np.cos(np.linspace(0, 2*np.pi, 500)),
4022 ... np.sin(np.linspace(0, 2*np.pi, 500)),
4023 ... c='k')
4024 >>> right.scatter(np.cos(samples_2), np.sin(samples_2), c='k', s=15)
4025 >>> right.set_title(f"circular std: {np.round(circstd_2, 2)!r}")
4026 >>> plt.show()
4028 """
4029 samples, sin_samp, cos_samp, mask = _circfuncs_common(samples, high, low,
4030 nan_policy=nan_policy)
4031 if mask is None:
4032 sin_mean = sin_samp.mean(axis=axis) # [1] (2.2.3)
4033 cos_mean = cos_samp.mean(axis=axis) # [1] (2.2.3)
4034 else:
4035 nsum = np.asarray(np.sum(~mask, axis=axis).astype(float))
4036 nsum[nsum == 0] = np.nan
4037 sin_mean = sin_samp.sum(axis=axis) / nsum
4038 cos_mean = cos_samp.sum(axis=axis) / nsum
4039 # hypot can go slightly above 1 due to rounding errors
4040 with np.errstate(invalid='ignore'):
4041 R = np.minimum(1, hypot(sin_mean, cos_mean)) # [1] (2.2.4)
4043 res = sqrt(-2*log(R))
4044 if not normalize:
4045 res *= (high-low)/(2.*pi) # [1] (2.3.14) w/ (2.3.7)
4046 return res
4049class DirectionalStats:
4050 def __init__(self, mean_direction, mean_resultant_length):
4051 self.mean_direction = mean_direction
4052 self.mean_resultant_length = mean_resultant_length
4054 def __repr__(self):
4055 return (f"DirectionalStats(mean_direction={self.mean_direction},"
4056 f" mean_resultant_length={self.mean_resultant_length})")
4059def directional_stats(samples, *, axis=0, normalize=True):
4060 """
4061 Computes sample statistics for directional data.
4063 Computes the directional mean (also called the mean direction vector) and
4064 mean resultant length of a sample of vectors.
4066 The directional mean is a measure of "preferred direction" of vector data.
4067 It is analogous to the sample mean, but it is for use when the magnitude
4068 of each observation is irrelevant (e.g. unit vectors).
4070 The mean resultant length is a value between 0 and 1 used to quantify the
4071 dispersion of directional data: the smaller the mean resultant length, the
4072 greater the dispersion. Several definitions of directional variance
4073 involving the mean resultant length are given in [1]_ and [2]_.
4075 Parameters
4076 ----------
4077 samples : array_like
4078 Input array. Must be at least two-dimensional, and the last axis of the
4079 input must correspond with the dimensionality of the vector space.
4080 When the input is exactly two dimensional, this means that each row
4081 of the data is a vector observation.
4082 axis : int, default: 0
4083 Axis along which the directional mean is computed.
4084 normalize : boolean, default: True
4085 If True, normalize the input to ensure that each observation is a
4086 unit vector. If the observations are already unit vectors, consider
4087 setting this to False to avoid unnecessary computation.
4089 Returns
4090 -------
4091 res : DirectionalStats
4092 An object containing attributes:
4094 mean_direction : ndarray
4095 Directional mean.
4096 mean_resultant_length : ndarray
4097 The mean resultant length [1]_.
4099 See also
4100 --------
4101 circmean : circular mean; i.e. directional mean for 2D *angles*
4102 circvar : circular variance; i.e. directional variance for 2D *angles*
4104 Notes
4105 -----
4106 This uses a definition of directional mean from [1]_.
4107 Assuming the observations are unit vectors, the calculation is as follows.
4109 .. code-block:: python
4111 mean = samples.mean(axis=0)
4112 mean_resultant_length = np.linalg.norm(mean)
4113 mean_direction = mean / mean_resultant_length
4115 This definition is appropriate for *directional* data (i.e. vector data
4116 for which the magnitude of each observation is irrelevant) but not
4117 for *axial* data (i.e. vector data for which the magnitude and *sign* of
4118 each observation is irrelevant).
4120 Several definitions of directional variance involving the mean resultant
4121 length ``R`` have been proposed, including ``1 - R`` [1]_, ``1 - R**2``
4122 [2]_, and ``2 * (1 - R)`` [2]_. Rather than choosing one, this function
4123 returns ``R`` as attribute `mean_resultant_length` so the user can compute
4124 their preferred measure of dispersion.
4126 References
4127 ----------
4128 .. [1] Mardia, Jupp. (2000). *Directional Statistics*
4129 (p. 163). Wiley.
4131 .. [2] https://en.wikipedia.org/wiki/Directional_statistics
4133 Examples
4134 --------
4135 >>> import numpy as np
4136 >>> from scipy.stats import directional_stats
4137 >>> data = np.array([[3, 4], # first observation, 2D vector space
4138 ... [6, -8]]) # second observation
4139 >>> dirstats = directional_stats(data)
4140 >>> dirstats.mean_direction
4141 array([1., 0.])
4143 In contrast, the regular sample mean of the vectors would be influenced
4144 by the magnitude of each observation. Furthermore, the result would not be
4145 a unit vector.
4147 >>> data.mean(axis=0)
4148 array([4.5, -2.])
4150 An exemplary use case for `directional_stats` is to find a *meaningful*
4151 center for a set of observations on a sphere, e.g. geographical locations.
4153 >>> data = np.array([[0.8660254, 0.5, 0.],
4154 ... [0.8660254, -0.5, 0.]])
4155 >>> dirstats = directional_stats(data)
4156 >>> dirstats.mean_direction
4157 array([1., 0., 0.])
4159 The regular sample mean on the other hand yields a result which does not
4160 lie on the surface of the sphere.
4162 >>> data.mean(axis=0)
4163 array([0.8660254, 0., 0.])
4165 The function also returns the mean resultant length, which
4166 can be used to calculate a directional variance. For example, using the
4167 definition ``Var(z) = 1 - R`` from [2]_ where ``R`` is the
4168 mean resultant length, we can calculate the directional variance of the
4169 vectors in the above example as:
4171 >>> 1 - dirstats.mean_resultant_length
4172 0.13397459716167093
4173 """
4174 samples = np.asarray(samples)
4175 if samples.ndim < 2:
4176 raise ValueError("samples must be at least two-dimensional. "
4177 f"Instead samples has shape: {samples.shape!r}")
4178 samples = np.moveaxis(samples, axis, 0)
4179 if normalize:
4180 vectornorms = np.linalg.norm(samples, axis=-1, keepdims=True)
4181 samples = samples/vectornorms
4182 mean = np.mean(samples, axis=0)
4183 mean_resultant_length = np.linalg.norm(mean, axis=-1, keepdims=True)
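    # The trailing length-1 axis kept by `keepdims=True` lets the norm
    # broadcast against `mean` in the division below; it is squeezed out of
    # the returned result.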
4184 mean_direction = mean / mean_resultant_length
4185 return DirectionalStats(mean_direction,
4186 mean_resultant_length.squeeze(-1)[()])