Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_mstats_extras.py: 11% (159 statements)
coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1""" 

2Additional statistics functions with support for masked arrays. 

3 

4""" 

5 

6# Original author (2007): Pierre GF Gerard-Marchant 

7 

8 

9__all__ = ['compare_medians_ms', 

10 'hdquantiles', 'hdmedian', 'hdquantiles_sd', 

11 'idealfourths', 

12 'median_cihs','mjci','mquantiles_cimj', 

13 'rsh', 

14 'trimmed_mean_ci',] 

15 

16 

17import numpy as np 

18from numpy import float_, int_, ndarray 

19 

20import numpy.ma as ma 

21from numpy.ma import MaskedArray 

22 

23from . import _mstats_basic as mstats 

24 

25from scipy.stats.distributions import norm, beta, t, binom 

26 

27 

def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
    """
    Computes quantile estimates with the Harrell-Davis method.

    The quantile estimates are calculated as a weighted linear combination
    of order statistics.

    Parameters
    ----------
    data : array_like
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.
    var : bool, optional
        Whether to return the variance of the estimate.

    Returns
    -------
    hdquantiles : MaskedArray
        A (p,) array of quantiles (if `var` is False), or a (2,p) array of
        quantiles and variances (if `var` is True), where ``p`` is the
        number of quantiles.

    See Also
    --------
    hdquantiles_sd

    """
    def _hd_1D(data,prob,var):
        "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
        xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
        # Don't use length here, in case we have a numpy scalar
        n = xsorted.size

        hd = np.empty((2,len(prob)), float_)
        if n < 2:
            hd.flat = np.nan
            if var:
                return hd
            return hd[0]

        v = np.arange(n+1) / float(n)
        betacdf = beta.cdf
        for (i,p) in enumerate(prob):
            _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
            w = _w[1:] - _w[:-1]
            hd_mean = np.dot(w, xsorted)
            hd[0,i] = hd_mean
            #
            hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
            #
        hd[0, prob == 0] = xsorted[0]
        hd[0, prob == 1] = xsorted[-1]
        if var:
            hd[1, prob == 0] = hd[1, prob == 1] = np.nan
            return hd
        return hd[0]
    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float_)
    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None) or (data.ndim == 1):
        result = _hd_1D(data, p, var)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hd_1D, axis, data, p, var)

    return ma.fix_invalid(result, copy=False)

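# Illustrative usage sketch (not part of the original scipy sources; the sample
# values are made up): `hdquantiles` is normally reached through the
# `scipy.stats.mstats` namespace and accepts masked input directly.
#
#     import numpy.ma as ma
#     from scipy.stats.mstats import hdquantiles
#     sample = ma.masked_values([1., 2., -999., 4., 7., 9., 12.], -999.)
#     hdquantiles(sample, prob=[0.25, 0.5, 0.75])             # (3,) quantile estimates
#     hdquantiles(sample, prob=[0.25, 0.5, 0.75], var=True)   # (2, 3): estimates and variances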

def hdmedian(data, axis=-1, var=False):
    """
    Returns the Harrell-Davis estimate of the median along the given axis.

    Parameters
    ----------
    data : ndarray
        Data array.
    axis : int, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.
    var : bool, optional
        Whether to return the variance of the estimate.

    Returns
    -------
    hdmedian : MaskedArray
        The median values. If ``var=True``, the variance is returned inside
        the masked array. E.g. for a 1-D array the shape changes from (1,) to
        (2,).

    """
    result = hdquantiles(data,[0.5], axis=axis, var=var)
    return result.squeeze()

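# Illustrative usage sketch (not part of the original scipy sources; the sample
# values are made up): `hdmedian` is a thin wrapper around `hdquantiles` at
# prob=0.5, so with the default ``axis=-1`` it works row-wise on 2-D input.
#
#     import numpy.ma as ma
#     from scipy.stats.mstats import hdmedian
#     x = ma.array([[1., 2., 3., 4.], [10., 20., 30., 40.]])
#     hdmedian(x, axis=-1)   # one Harrell-Davis median per row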

def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
    """
    The standard error of the Harrell-Davis quantile estimates by jackknife.

    Parameters
    ----------
    data : array_like
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    hdquantiles_sd : MaskedArray
        Standard error of the Harrell-Davis quantile estimates.

    See Also
    --------
    hdquantiles

    """
    def _hdsd_1D(data, prob):
        "Computes the std error for 1D arrays."
        xsorted = np.sort(data.compressed())
        n = len(xsorted)

        hdsd = np.empty(len(prob), float_)
        if n < 2:
            hdsd.flat = np.nan

        vv = np.arange(n) / float(n-1)
        betacdf = beta.cdf

        for (i,p) in enumerate(prob):
            _w = betacdf(vv, n*p, n*(1-p))
            w = _w[1:] - _w[:-1]
            # cumulative sum of weights and data points if
            # ith point is left out for jackknife
            mx_ = np.zeros_like(xsorted)
            mx_[1:] = np.cumsum(w * xsorted[:-1])
            # similar but from the right
            mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1]
            hdsd[i] = np.sqrt(mx_.var() * (n - 1))
        return hdsd

    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float_)
    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _hdsd_1D(data, p)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hdsd_1D, axis, data, p)

    return ma.fix_invalid(result, copy=False).ravel()

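# Illustrative usage sketch (not part of the original scipy sources; the sample
# values are made up): the jackknife standard errors pair naturally with the
# point estimates from `hdquantiles`, e.g. for rough normal-theory intervals.
#
#     import numpy.ma as ma
#     from scipy.stats.mstats import hdquantiles, hdquantiles_sd
#     sample = ma.array([2., 4., 5., 7., 11., 13., 17., 19.])
#     est = hdquantiles(sample, prob=[0.25, 0.5, 0.75])
#     se = hdquantiles_sd(sample, prob=[0.25, 0.5, 0.75])
#     lower, upper = est - 1.96 * se, est + 1.96 * se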

def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
                    alpha=0.05, axis=None):
    """
    Selected confidence interval of the trimmed mean along the given axis.

    Parameters
    ----------
    data : array_like
        Input data.
    limits : {None, tuple}, optional
        None or a two-item tuple.
        Tuple of the percentages to cut on each side of the array, with respect
        to the number of unmasked data, as floats between 0. and 1. If ``n``
        is the number of unmasked data before trimming, then the
        (``n * limits[0]``)th smallest data and the (``n * limits[1]``)th
        largest data are masked. The total number of unmasked data after
        trimming is ``n * (1. - sum(limits))``.
        The value of one limit can be set to None to indicate an open interval.

        Defaults to (0.2, 0.2).
    inclusive : (2,) tuple of boolean, optional
        If relative==False, tuple indicating whether values exactly equal to
        the absolute limits are allowed.
        If relative==True, tuple indicating whether the number of data being
        masked on each side should be rounded (True) or truncated (False).

        Defaults to (True, True).
    alpha : float, optional
        Confidence level of the intervals.

        Defaults to 0.05.
    axis : int, optional
        Axis along which to cut. If None, uses a flattened version of `data`.

        Defaults to None.

    Returns
    -------
    trimmed_mean_ci : (2,) ndarray
        The lower and upper confidence intervals of the trimmed data.

    """
    data = ma.array(data, copy=False)
    trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
    tmean = trimmed.mean(axis)
    tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
    df = trimmed.count(axis) - 1
    tppf = t.ppf(1-alpha/2.,df)
    return np.array((tmean - tppf*tstde, tmean+tppf*tstde))

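# Illustrative usage sketch (not part of the original scipy sources; the sample
# values are made up): a 95% confidence interval for the 20% trimmed mean, with
# the default `limits` and `alpha` spelled out explicitly.
#
#     import numpy.ma as ma
#     from scipy.stats.mstats import trimmed_mean_ci
#     x = ma.array([4., 8., 15., 16., 23., 42., 108., 5., 9., 12.])
#     lower, upper = trimmed_mean_ci(x, limits=(0.2, 0.2), alpha=0.05)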

def mjci(data, prob=[0.25,0.5,0.75], axis=None):
    """
    Returns the Maritz-Jarrett estimators of the standard error of selected
    experimental quantiles of the data.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    mjci : ndarray
        Maritz-Jarrett standard errors, one per entry of `prob`.

    """
    def _mjci_1D(data, p):
        data = np.sort(data.compressed())
        n = data.size
        prob = (np.array(p) * n + 0.5).astype(int_)
        betacdf = beta.cdf

        mj = np.empty(len(prob), float_)
        x = np.arange(1,n+1, dtype=float_) / n
        y = x - 1./n
        for (i,m) in enumerate(prob):
            W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
            C1 = np.dot(W,data)
            C2 = np.dot(W,data**2)
            mj[i] = np.sqrt(C2 - C1**2)
        return mj

    data = ma.array(data, copy=False)
    if data.ndim > 2:
        raise ValueError("Array 'data' must be at most two dimensional, "
                         "but got data.ndim = %d" % data.ndim)

    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        return _mjci_1D(data, p)
    else:
        return ma.apply_along_axis(_mjci_1D, axis, data, p)

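# Illustrative usage sketch (not part of the original scipy sources; the sample
# values are made up): Maritz-Jarrett standard errors for the three default
# quartile levels of a 1-D masked sample.
#
#     import numpy.ma as ma
#     from scipy.stats.mstats import mjci
#     x = ma.array([1., 3., 4., 6., 8., 9., 11., 14., 15., 18.])
#     mjci(x, prob=[0.25, 0.5, 0.75])   # one standard error per quantile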

def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
    """
    Computes the alpha confidence interval for the selected quantiles of the
    data, with Maritz-Jarrett estimators.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    alpha : float, optional
        Confidence level of the intervals.
    axis : int or None, optional
        Axis along which to compute the quantiles.
        If None, use a flattened array.

    Returns
    -------
    ci_lower : ndarray
        The lower boundaries of the confidence interval, of the same length
        as `prob`.
    ci_upper : ndarray
        The upper boundaries of the confidence interval, of the same length
        as `prob`.

    """
    alpha = min(alpha, 1 - alpha)
    z = norm.ppf(1 - alpha/2.)
    xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
    smj = mjci(data, prob, axis=axis)
    return (xq - z * smj, xq + z * smj)

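# Illustrative usage sketch (not part of the original scipy sources; the sample
# values are made up): the confidence limits are the `mstats.mquantiles` point
# estimates plus/minus a normal quantile times the Maritz-Jarrett standard
# error from `mjci`.
#
#     import numpy.ma as ma
#     from scipy.stats.mstats import mquantiles_cimj
#     x = ma.array([1., 3., 4., 6., 8., 9., 11., 14., 15., 18.])
#     ci_lower, ci_upper = mquantiles_cimj(x, prob=[0.25, 0.5, 0.75], alpha=0.05)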

def median_cihs(data, alpha=0.05, axis=None):
    """
    Computes the alpha-level confidence interval for the median of the data.

    Uses the Hettmansperger-Sheather method.

    Parameters
    ----------
    data : array_like
        Input data. Masked values are discarded. The input should be 1D only,
        or `axis` should be set to None.
    alpha : float, optional
        Confidence level of the intervals.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    median_cihs
        Alpha level confidence interval.

    """
    def _cihs_1D(data, alpha):
        data = np.sort(data.compressed())
        n = len(data)
        alpha = min(alpha, 1-alpha)
        k = int(binom._ppf(alpha/2., n, 0.5))
        gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
        if gk < 1-alpha:
            k -= 1
            gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
        gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
        I = (gk - 1 + alpha)/(gk - gkk)
        lambd = (n-k) * I / float(k + (n-2*k)*I)
        lims = (lambd*data[k] + (1-lambd)*data[k-1],
                lambd*data[n-k-1] + (1-lambd)*data[n-k])
        return lims

    data = ma.array(data, copy=False)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _cihs_1D(data, alpha)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)

    return result

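# Illustrative usage sketch (not part of the original scipy sources; the sample
# values are made up): a Hettmansperger-Sheather interval for the median of a
# single 1-D sample (the docstring notes the input should be 1-D when `axis`
# is None).
#
#     import numpy.ma as ma
#     from scipy.stats.mstats import median_cihs
#     x = ma.array([2., 3., 5., 7., 11., 13., 17., 19., 23., 29.])
#     low, high = median_cihs(x, alpha=0.05)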

def compare_medians_ms(group_1, group_2, axis=None):
    """
    Compares the medians from two independent groups along the given axis.

    The comparison is performed using the McKean-Schrader estimate of the
    standard error of the medians.

    Parameters
    ----------
    group_1 : array_like
        First dataset. Has to be of size >=7.
    group_2 : array_like
        Second dataset. Has to be of size >=7.
    axis : int, optional
        Axis along which the medians are estimated. If None, the arrays are
        flattened. If `axis` is not None, then `group_1` and `group_2`
        should have the same shape.

    Returns
    -------
    compare_medians_ms : {float, ndarray}
        If `axis` is None, then returns a float, otherwise returns a 1-D
        ndarray of floats with a length equal to the length of `group_1`
        along `axis`.

    Examples
    --------

    >>> from scipy import stats
    >>> a = [1, 2, 3, 4, 5, 6, 7]
    >>> b = [8, 9, 10, 11, 12, 13, 14]
    >>> stats.mstats.compare_medians_ms(a, b, axis=None)
    1.0693225866553746e-05

    The function is vectorized to compute along a given axis.

    >>> import numpy as np
    >>> rng = np.random.default_rng()
    >>> x = rng.random(size=(3, 7))
    >>> y = rng.random(size=(3, 8))
    >>> stats.mstats.compare_medians_ms(x, y, axis=1)
    array([0.36908985, 0.36092538, 0.2765313 ])

    References
    ----------
    .. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods
       for studentizing the sample median." Communications in
       Statistics-Simulation and Computation 13.6 (1984): 751-773.

    """
    (med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
    (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
                      mstats.stde_median(group_2, axis=axis))
    W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
    return 1 - norm.cdf(W)


def idealfourths(data, axis=None):
    """
    Returns an estimate of the lower and upper quartiles.

    Uses the ideal fourths algorithm.

    Parameters
    ----------
    data : array_like
        Input array.
    axis : int, optional
        Axis along which the quartiles are estimated. If None, the arrays are
        flattened.

    Returns
    -------
    idealfourths : {list of floats, masked array}
        Returns the two internal values that divide `data` into four parts
        using the ideal fourths algorithm either along the flattened array
        (if `axis` is None) or along `axis` of `data`.

    """
    def _idf(data):
        x = data.compressed()
        n = len(x)
        if n < 3:
            return [np.nan,np.nan]
        (j,h) = divmod(n/4. + 5/12.,1)
        j = int(j)
        qlo = (1-h)*x[j-1] + h*x[j]
        k = n - j
        qup = (1-h)*x[k] + h*x[k-1]
        return [qlo, qup]
    data = ma.sort(data, axis=axis).view(MaskedArray)
    if (axis is None):
        return _idf(data)
    else:
        return ma.apply_along_axis(_idf, axis, data)

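# Illustrative usage sketch (not part of the original scipy sources; the sample
# values are made up): ideal-fourths quartiles, from which an interquartile
# range can be derived.
#
#     import numpy.ma as ma
#     from scipy.stats.mstats import idealfourths
#     x = ma.array([1., 2., 4., 7., 8., 9., 12., 15., 19., 25.])
#     qlo, qup = idealfourths(x)
#     iqr = qup - qlo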

def rsh(data, points=None):
    """
    Evaluates Rosenblatt's shifted histogram estimators for each data point.

    Rosenblatt's estimator is a centered finite-difference approximation to the
    derivative of the empirical cumulative distribution function.

    Parameters
    ----------
    data : sequence
        Input data, should be 1-D. Masked values are ignored.
    points : sequence or None, optional
        Sequence of points at which to evaluate the Rosenblatt shifted
        histogram. If None, use the data.

    Returns
    -------
    rsh : ndarray
        Estimated density at each evaluation point.

    """
    data = ma.array(data, copy=False)
    if points is None:
        points = data
    else:
        points = np.array(points, copy=False, ndmin=1)

    if data.ndim != 1:
        raise AttributeError("The input array should be 1D only !")

    n = data.count()
    r = idealfourths(data, axis=None)
    h = 1.2 * (r[-1]-r[0]) / n**(1./5)
    nhi = (data[:,None] <= points[None,:] + h).sum(0)
    nlo = (data[:,None] < points[None,:] - h).sum(0)
    return (nhi-nlo) / (2.*n*h)

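# Illustrative usage sketch (not part of the original scipy sources; the sample
# values are made up): a crude density estimate over a grid of evaluation
# points; the bandwidth is set internally from the ideal-fourths interquartile
# range.
#
#     import numpy as np
#     import numpy.ma as ma
#     from scipy.stats.mstats import rsh
#     data = ma.array([1.2, 1.9, 2.4, 2.5, 3.1, 3.8, 4.0, 4.4, 5.2])
#     grid = np.linspace(1.0, 5.5, 10)
#     density = rsh(data, points=grid)   # one estimate per grid point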