Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_mstats_extras.py: 11%
159 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
1"""
2Additional statistics functions with support for masked arrays.
4"""
6# Original author (2007): Pierre GF Gerard-Marchant
9__all__ = ['compare_medians_ms',
10 'hdquantiles', 'hdmedian', 'hdquantiles_sd',
11 'idealfourths',
12 'median_cihs','mjci','mquantiles_cimj',
13 'rsh',
14 'trimmed_mean_ci',]
17import numpy as np
18from numpy import float_, int_, ndarray
20import numpy.ma as ma
21from numpy.ma import MaskedArray
23from . import _mstats_basic as mstats
25from scipy.stats.distributions import norm, beta, t, binom
28def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
29 """
30 Computes quantile estimates with the Harrell-Davis method.
32 The quantile estimates are calculated as a weighted linear combination
33 of order statistics.
35 Parameters
36 ----------
37 data : array_like
38 Data array.
39 prob : sequence, optional
40 Sequence of quantiles to compute.
41 axis : int or None, optional
42 Axis along which to compute the quantiles. If None, use a flattened
43 array.
44 var : bool, optional
45 Whether to return the variance of the estimate.
47 Returns
48 -------
49 hdquantiles : MaskedArray
50 A (p,) array of quantiles (if `var` is False), or a (2,p) array of
51 quantiles and variances (if `var` is True), where ``p`` is the
52 number of quantiles.
54 See Also
55 --------
56 hdquantiles_sd
58 """
59 def _hd_1D(data,prob,var):
60 "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
61 xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
62 # Don't use length here, in case we have a numpy scalar
63 n = xsorted.size
65 hd = np.empty((2,len(prob)), float_)
66 if n < 2:
67 hd.flat = np.nan
68 if var:
69 return hd
70 return hd[0]
72 v = np.arange(n+1) / float(n)
73 betacdf = beta.cdf
74 for (i,p) in enumerate(prob):
75 _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
76 w = _w[1:] - _w[:-1]
77 hd_mean = np.dot(w, xsorted)
78 hd[0,i] = hd_mean
79 #
80 hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
81 #
82 hd[0, prob == 0] = xsorted[0]
83 hd[0, prob == 1] = xsorted[-1]
84 if var:
85 hd[1, prob == 0] = hd[1, prob == 1] = np.nan
86 return hd
87 return hd[0]
88 # Initialization & checks
89 data = ma.array(data, copy=False, dtype=float_)
90 p = np.array(prob, copy=False, ndmin=1)
91 # Computes quantiles along axis (or globally)
92 if (axis is None) or (data.ndim == 1):
93 result = _hd_1D(data, p, var)
94 else:
95 if data.ndim > 2:
96 raise ValueError("Array 'data' must be at most two dimensional, "
97 "but got data.ndim = %d" % data.ndim)
98 result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
100 return ma.fix_invalid(result, copy=False)
103def hdmedian(data, axis=-1, var=False):
104 """
105 Returns the Harrell-Davis estimate of the median along the given axis.
107 Parameters
108 ----------
109 data : ndarray
110 Data array.
111 axis : int, optional
112 Axis along which to compute the quantiles. If None, use a flattened
113 array.
114 var : bool, optional
115 Whether to return the variance of the estimate.
117 Returns
118 -------
119 hdmedian : MaskedArray
120 The median values. If ``var=True``, the variance is returned inside
121 the masked array. E.g. for a 1-D array the shape change from (1,) to
122 (2,).
124 """
125 result = hdquantiles(data,[0.5], axis=axis, var=var)
126 return result.squeeze()
129def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
130 """
131 The standard error of the Harrell-Davis quantile estimates by jackknife.
133 Parameters
134 ----------
135 data : array_like
136 Data array.
137 prob : sequence, optional
138 Sequence of quantiles to compute.
139 axis : int, optional
140 Axis along which to compute the quantiles. If None, use a flattened
141 array.
143 Returns
144 -------
145 hdquantiles_sd : MaskedArray
146 Standard error of the Harrell-Davis quantile estimates.
148 See Also
149 --------
150 hdquantiles
152 """
153 def _hdsd_1D(data, prob):
154 "Computes the std error for 1D arrays."
155 xsorted = np.sort(data.compressed())
156 n = len(xsorted)
158 hdsd = np.empty(len(prob), float_)
159 if n < 2:
160 hdsd.flat = np.nan
162 vv = np.arange(n) / float(n-1)
163 betacdf = beta.cdf
165 for (i,p) in enumerate(prob):
166 _w = betacdf(vv, n*p, n*(1-p))
167 w = _w[1:] - _w[:-1]
168 # cumulative sum of weights and data points if
169 # ith point is left out for jackknife
170 mx_ = np.zeros_like(xsorted)
171 mx_[1:] = np.cumsum(w * xsorted[:-1])
172 # similar but from the right
173 mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1]
174 hdsd[i] = np.sqrt(mx_.var() * (n - 1))
175 return hdsd
177 # Initialization & checks
178 data = ma.array(data, copy=False, dtype=float_)
179 p = np.array(prob, copy=False, ndmin=1)
180 # Computes quantiles along axis (or globally)
181 if (axis is None):
182 result = _hdsd_1D(data, p)
183 else:
184 if data.ndim > 2:
185 raise ValueError("Array 'data' must be at most two dimensional, "
186 "but got data.ndim = %d" % data.ndim)
187 result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
189 return ma.fix_invalid(result, copy=False).ravel()
192def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
193 alpha=0.05, axis=None):
194 """
195 Selected confidence interval of the trimmed mean along the given axis.
197 Parameters
198 ----------
199 data : array_like
200 Input data.
201 limits : {None, tuple}, optional
202 None or a two item tuple.
203 Tuple of the percentages to cut on each side of the array, with respect
204 to the number of unmasked data, as floats between 0. and 1. If ``n``
205 is the number of unmasked data before trimming, then
206 (``n * limits[0]``)th smallest data and (``n * limits[1]``)th
207 largest data are masked. The total number of unmasked data after
208 trimming is ``n * (1. - sum(limits))``.
209 The value of one limit can be set to None to indicate an open interval.
211 Defaults to (0.2, 0.2).
212 inclusive : (2,) tuple of boolean, optional
213 If relative==False, tuple indicating whether values exactly equal to
214 the absolute limits are allowed.
215 If relative==True, tuple indicating whether the number of data being
216 masked on each side should be rounded (True) or truncated (False).
218 Defaults to (True, True).
219 alpha : float, optional
220 Confidence level of the intervals.
222 Defaults to 0.05.
223 axis : int, optional
224 Axis along which to cut. If None, uses a flattened version of `data`.
226 Defaults to None.
228 Returns
229 -------
230 trimmed_mean_ci : (2,) ndarray
231 The lower and upper confidence intervals of the trimmed data.
233 """
234 data = ma.array(data, copy=False)
235 trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
236 tmean = trimmed.mean(axis)
237 tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
238 df = trimmed.count(axis) - 1
239 tppf = t.ppf(1-alpha/2.,df)
240 return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
243def mjci(data, prob=[0.25,0.5,0.75], axis=None):
244 """
245 Returns the Maritz-Jarrett estimators of the standard error of selected
246 experimental quantiles of the data.
248 Parameters
249 ----------
250 data : ndarray
251 Data array.
252 prob : sequence, optional
253 Sequence of quantiles to compute.
254 axis : int or None, optional
255 Axis along which to compute the quantiles. If None, use a flattened
256 array.
258 """
259 def _mjci_1D(data, p):
260 data = np.sort(data.compressed())
261 n = data.size
262 prob = (np.array(p) * n + 0.5).astype(int_)
263 betacdf = beta.cdf
265 mj = np.empty(len(prob), float_)
266 x = np.arange(1,n+1, dtype=float_) / n
267 y = x - 1./n
268 for (i,m) in enumerate(prob):
269 W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
270 C1 = np.dot(W,data)
271 C2 = np.dot(W,data**2)
272 mj[i] = np.sqrt(C2 - C1**2)
273 return mj
275 data = ma.array(data, copy=False)
276 if data.ndim > 2:
277 raise ValueError("Array 'data' must be at most two dimensional, "
278 "but got data.ndim = %d" % data.ndim)
280 p = np.array(prob, copy=False, ndmin=1)
281 # Computes quantiles along axis (or globally)
282 if (axis is None):
283 return _mjci_1D(data, p)
284 else:
285 return ma.apply_along_axis(_mjci_1D, axis, data, p)
288def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
289 """
290 Computes the alpha confidence interval for the selected quantiles of the
291 data, with Maritz-Jarrett estimators.
293 Parameters
294 ----------
295 data : ndarray
296 Data array.
297 prob : sequence, optional
298 Sequence of quantiles to compute.
299 alpha : float, optional
300 Confidence level of the intervals.
301 axis : int or None, optional
302 Axis along which to compute the quantiles.
303 If None, use a flattened array.
305 Returns
306 -------
307 ci_lower : ndarray
308 The lower boundaries of the confidence interval. Of the same length as
309 `prob`.
310 ci_upper : ndarray
311 The upper boundaries of the confidence interval. Of the same length as
312 `prob`.
314 """
315 alpha = min(alpha, 1 - alpha)
316 z = norm.ppf(1 - alpha/2.)
317 xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
318 smj = mjci(data, prob, axis=axis)
319 return (xq - z * smj, xq + z * smj)
322def median_cihs(data, alpha=0.05, axis=None):
323 """
324 Computes the alpha-level confidence interval for the median of the data.
326 Uses the Hettmasperger-Sheather method.
328 Parameters
329 ----------
330 data : array_like
331 Input data. Masked values are discarded. The input should be 1D only,
332 or `axis` should be set to None.
333 alpha : float, optional
334 Confidence level of the intervals.
335 axis : int or None, optional
336 Axis along which to compute the quantiles. If None, use a flattened
337 array.
339 Returns
340 -------
341 median_cihs
342 Alpha level confidence interval.
344 """
345 def _cihs_1D(data, alpha):
346 data = np.sort(data.compressed())
347 n = len(data)
348 alpha = min(alpha, 1-alpha)
349 k = int(binom._ppf(alpha/2., n, 0.5))
350 gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
351 if gk < 1-alpha:
352 k -= 1
353 gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
354 gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
355 I = (gk - 1 + alpha)/(gk - gkk)
356 lambd = (n-k) * I / float(k + (n-2*k)*I)
357 lims = (lambd*data[k] + (1-lambd)*data[k-1],
358 lambd*data[n-k-1] + (1-lambd)*data[n-k])
359 return lims
360 data = ma.array(data, copy=False)
361 # Computes quantiles along axis (or globally)
362 if (axis is None):
363 result = _cihs_1D(data, alpha)
364 else:
365 if data.ndim > 2:
366 raise ValueError("Array 'data' must be at most two dimensional, "
367 "but got data.ndim = %d" % data.ndim)
368 result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
370 return result
373def compare_medians_ms(group_1, group_2, axis=None):
374 """
375 Compares the medians from two independent groups along the given axis.
377 The comparison is performed using the McKean-Schrader estimate of the
378 standard error of the medians.
380 Parameters
381 ----------
382 group_1 : array_like
383 First dataset. Has to be of size >=7.
384 group_2 : array_like
385 Second dataset. Has to be of size >=7.
386 axis : int, optional
387 Axis along which the medians are estimated. If None, the arrays are
388 flattened. If `axis` is not None, then `group_1` and `group_2`
389 should have the same shape.
391 Returns
392 -------
393 compare_medians_ms : {float, ndarray}
394 If `axis` is None, then returns a float, otherwise returns a 1-D
395 ndarray of floats with a length equal to the length of `group_1`
396 along `axis`.
398 Examples
399 --------
401 >>> from scipy import stats
402 >>> a = [1, 2, 3, 4, 5, 6, 7]
403 >>> b = [8, 9, 10, 11, 12, 13, 14]
404 >>> stats.mstats.compare_medians_ms(a, b, axis=None)
405 1.0693225866553746e-05
407 The function is vectorized to compute along a given axis.
409 >>> import numpy as np
410 >>> rng = np.random.default_rng()
411 >>> x = rng.random(size=(3, 7))
412 >>> y = rng.random(size=(3, 8))
413 >>> stats.mstats.compare_medians_ms(x, y, axis=1)
414 array([0.36908985, 0.36092538, 0.2765313 ])
416 References
417 ----------
418 .. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods
419 for studentizing the sample median." Communications in
420 Statistics-Simulation and Computation 13.6 (1984): 751-773.
422 """
423 (med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
424 (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
425 mstats.stde_median(group_2, axis=axis))
426 W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
427 return 1 - norm.cdf(W)
430def idealfourths(data, axis=None):
431 """
432 Returns an estimate of the lower and upper quartiles.
434 Uses the ideal fourths algorithm.
436 Parameters
437 ----------
438 data : array_like
439 Input array.
440 axis : int, optional
441 Axis along which the quartiles are estimated. If None, the arrays are
442 flattened.
444 Returns
445 -------
446 idealfourths : {list of floats, masked array}
447 Returns the two internal values that divide `data` into four parts
448 using the ideal fourths algorithm either along the flattened array
449 (if `axis` is None) or along `axis` of `data`.
451 """
452 def _idf(data):
453 x = data.compressed()
454 n = len(x)
455 if n < 3:
456 return [np.nan,np.nan]
457 (j,h) = divmod(n/4. + 5/12.,1)
458 j = int(j)
459 qlo = (1-h)*x[j-1] + h*x[j]
460 k = n - j
461 qup = (1-h)*x[k] + h*x[k-1]
462 return [qlo, qup]
463 data = ma.sort(data, axis=axis).view(MaskedArray)
464 if (axis is None):
465 return _idf(data)
466 else:
467 return ma.apply_along_axis(_idf, axis, data)
470def rsh(data, points=None):
471 """
472 Evaluates Rosenblatt's shifted histogram estimators for each data point.
474 Rosenblatt's estimator is a centered finite-difference approximation to the
475 derivative of the empirical cumulative distribution function.
477 Parameters
478 ----------
479 data : sequence
480 Input data, should be 1-D. Masked values are ignored.
481 points : sequence or None, optional
482 Sequence of points where to evaluate Rosenblatt shifted histogram.
483 If None, use the data.
485 """
486 data = ma.array(data, copy=False)
487 if points is None:
488 points = data
489 else:
490 points = np.array(points, copy=False, ndmin=1)
492 if data.ndim != 1:
493 raise AttributeError("The input array should be 1D only !")
495 n = data.count()
496 r = idealfourths(data, axis=None)
497 h = 1.2 * (r[-1]-r[0]) / n**(1./5)
498 nhi = (data[:,None] <= points[None,:] + h).sum(0)
499 nlo = (data[:,None] < points[None,:] - h).sum(0)
500 return (nhi-nlo) / (2.*n*h)