Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_mstats

1"""

2Additional statistics functions with support for masked arrays.

4"""

6# Original author (2007): Pierre GF Gerard-Marchant

9__all__ = ['compare_medians_ms',

10 'hdquantiles', 'hdmedian', 'hdquantiles_sd',

11 'idealfourths',

12 'median_cihs','mjci','mquantiles_cimj',

13 'rsh',

14 'trimmed_mean_ci',]

17import numpy as np

18from numpy import float_, int_, ndarray

20import numpy.ma as ma

21from numpy.ma import MaskedArray

23from . import _mstats_basic as mstats

25from scipy.stats.distributions import norm, beta, t, binom

28def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):

29 """

30 Computes quantile estimates with the Harrell-Davis method.

32 The quantile estimates are calculated as a weighted linear combination

33 of order statistics.

35 Parameters

36 ----------

37 data : array_like

38 Data array.

39 prob : sequence, optional

40 Sequence of quantiles to compute.

41 axis : int or None, optional

42 Axis along which to compute the quantiles. If None, use a flattened

43 array.

44 var : bool, optional

45 Whether to return the variance of the estimate.

47 Returns

48 -------

49 hdquantiles : MaskedArray

50 A (p,) array of quantiles (if `var` is False), or a (2,p) array of

51 quantiles and variances (if `var` is True), where ``p`` is the

52 number of quantiles.

54 See Also

55 --------

56 hdquantiles_sd

58 """

59 def _hd_1D(data,prob,var):

60 "Computes the HD quantiles for a 1D array. Returns nan for invalid data."

61 xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))

62 # Don't use length here, in case we have a numpy scalar

63 n = xsorted.size

65 hd = np.empty((2,len(prob)), float_)

66 if n < 2:

67 hd.flat = np.nan

68 if var:

69 return hd

70 return hd[0]

72 v = np.arange(n+1) / float(n)

73 betacdf = beta.cdf

74 for (i,p) in enumerate(prob):

75 _w = betacdf(v, (n+1)*p, (n+1)*(1-p))

76 w = _w[1:] - _w[:-1]

77 hd_mean = np.dot(w, xsorted)

78 hd[0,i] = hd_mean

79 #

80 hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)

81 #

82 hd[0, prob == 0] = xsorted[0]

83 hd[0, prob == 1] = xsorted[-1]

84 if var:

85 hd[1, prob == 0] = hd[1, prob == 1] = np.nan

86 return hd

87 return hd[0]

88 # Initialization & checks

89 data = ma.array(data, copy=False, dtype=float_)

90 p = np.array(prob, copy=False, ndmin=1)

91 # Computes quantiles along axis (or globally)

92 if (axis is None) or (data.ndim == 1):

93 result = _hd_1D(data, p, var)

94 else:

95 if data.ndim > 2:

96 raise ValueError("Array 'data' must be at most two dimensional, "

97 "but got data.ndim = %d" % data.ndim)

98 result = ma.apply_along_axis(_hd_1D, axis, data, p, var)

100 return ma.fix_invalid(result, copy=False)

101

102

103def hdmedian(data, axis=-1, var=False):

104 """

105 Returns the Harrell-Davis estimate of the median along the given axis.

106

107 Parameters

108 ----------

109 data : ndarray

110 Data array.

111 axis : int, optional

112 Axis along which to compute the quantiles. If None, use a flattened

113 array.

114 var : bool, optional

115 Whether to return the variance of the estimate.

116

117 Returns

118 -------

119 hdmedian : MaskedArray

120 The median values. If ``var=True``, the variance is returned inside

121 the masked array. E.g. for a 1-D array the shape change from (1,) to

122 (2,).

123

124 """

125 result = hdquantiles(data,[0.5], axis=axis, var=var)

126 return result.squeeze()

127

128

129def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):

130 """

131 The standard error of the Harrell-Davis quantile estimates by jackknife.

132

133 Parameters

134 ----------

135 data : array_like

136 Data array.

137 prob : sequence, optional

138 Sequence of quantiles to compute.

139 axis : int, optional

140 Axis along which to compute the quantiles. If None, use a flattened

141 array.

142

143 Returns

144 -------

145 hdquantiles_sd : MaskedArray

146 Standard error of the Harrell-Davis quantile estimates.

147

148 See Also

149 --------

150 hdquantiles

151

152 """

153 def _hdsd_1D(data, prob):

154 "Computes the std error for 1D arrays."

155 xsorted = np.sort(data.compressed())

156 n = len(xsorted)

157

158 hdsd = np.empty(len(prob), float_)

159 if n < 2:

160 hdsd.flat = np.nan

161

162 vv = np.arange(n) / float(n-1)

163 betacdf = beta.cdf

164

165 for (i,p) in enumerate(prob):

166 _w = betacdf(vv, n*p, n*(1-p))

167 w = _w[1:] - _w[:-1]

168 # cumulative sum of weights and data points if

169 # ith point is left out for jackknife

170 mx_ = np.zeros_like(xsorted)

171 mx_[1:] = np.cumsum(w * xsorted[:-1])

172 # similar but from the right

173 mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1]

174 hdsd[i] = np.sqrt(mx_.var() * (n - 1))

175 return hdsd

176

177 # Initialization & checks

178 data = ma.array(data, copy=False, dtype=float_)

179 p = np.array(prob, copy=False, ndmin=1)

180 # Computes quantiles along axis (or globally)

181 if (axis is None):

182 result = _hdsd_1D(data, p)

183 else:

184 if data.ndim > 2:

185 raise ValueError("Array 'data' must be at most two dimensional, "

186 "but got data.ndim = %d" % data.ndim)

187 result = ma.apply_along_axis(_hdsd_1D, axis, data, p)

188

189 return ma.fix_invalid(result, copy=False).ravel()

190

191

192def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),

193 alpha=0.05, axis=None):

194 """

195 Selected confidence interval of the trimmed mean along the given axis.

196

197 Parameters

198 ----------

199 data : array_like

200 Input data.

201 limits : {None, tuple}, optional

202 None or a two item tuple.

203 Tuple of the percentages to cut on each side of the array, with respect

204 to the number of unmasked data, as floats between 0. and 1. If ``n``

205 is the number of unmasked data before trimming, then

206 (``n * limits[0]``)th smallest data and (``n * limits[1]``)th

207 largest data are masked. The total number of unmasked data after

208 trimming is ``n * (1. - sum(limits))``.

209 The value of one limit can be set to None to indicate an open interval.

210

211 Defaults to (0.2, 0.2).

212 inclusive : (2,) tuple of boolean, optional

213 If relative==False, tuple indicating whether values exactly equal to

214 the absolute limits are allowed.

215 If relative==True, tuple indicating whether the number of data being

216 masked on each side should be rounded (True) or truncated (False).

217

218 Defaults to (True, True).

219 alpha : float, optional

220 Confidence level of the intervals.

221

222 Defaults to 0.05.

223 axis : int, optional

224 Axis along which to cut. If None, uses a flattened version of `data`.

225

226 Defaults to None.

227

228 Returns

229 -------

230 trimmed_mean_ci : (2,) ndarray

231 The lower and upper confidence intervals of the trimmed data.

232

233 """

234 data = ma.array(data, copy=False)

235 trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)

236 tmean = trimmed.mean(axis)

237 tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)

238 df = trimmed.count(axis) - 1

239 tppf = t.ppf(1-alpha/2.,df)

240 return np.array((tmean - tppf*tstde, tmean+tppf*tstde))

241

242

243def mjci(data, prob=[0.25,0.5,0.75], axis=None):

244 """

245 Returns the Maritz-Jarrett estimators of the standard error of selected

246 experimental quantiles of the data.

247

248 Parameters

249 ----------

250 data : ndarray

251 Data array.

252 prob : sequence, optional

253 Sequence of quantiles to compute.

254 axis : int or None, optional

255 Axis along which to compute the quantiles. If None, use a flattened

256 array.

257

258 """

259 def _mjci_1D(data, p):

260 data = np.sort(data.compressed())

261 n = data.size

262 prob = (np.array(p) * n + 0.5).astype(int_)

263 betacdf = beta.cdf

264

265 mj = np.empty(len(prob), float_)

266 x = np.arange(1,n+1, dtype=float_) / n

267 y = x - 1./n

268 for (i,m) in enumerate(prob):

269 W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)

270 C1 = np.dot(W,data)

271 C2 = np.dot(W,data**2)

272 mj[i] = np.sqrt(C2 - C1**2)

273 return mj

274

275 data = ma.array(data, copy=False)

276 if data.ndim > 2:

277 raise ValueError("Array 'data' must be at most two dimensional, "

278 "but got data.ndim = %d" % data.ndim)

279

280 p = np.array(prob, copy=False, ndmin=1)

281 # Computes quantiles along axis (or globally)

282 if (axis is None):

283 return _mjci_1D(data, p)

284 else:

285 return ma.apply_along_axis(_mjci_1D, axis, data, p)

286

287

288def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):

289 """

290 Computes the alpha confidence interval for the selected quantiles of the

291 data, with Maritz-Jarrett estimators.

292

293 Parameters

294 ----------

295 data : ndarray

296 Data array.

297 prob : sequence, optional

298 Sequence of quantiles to compute.

299 alpha : float, optional

300 Confidence level of the intervals.

301 axis : int or None, optional

302 Axis along which to compute the quantiles.

303 If None, use a flattened array.

304

305 Returns

306 -------

307 ci_lower : ndarray

308 The lower boundaries of the confidence interval. Of the same length as

309 `prob`.

310 ci_upper : ndarray

311 The upper boundaries of the confidence interval. Of the same length as

312 `prob`.

313

314 """

315 alpha = min(alpha, 1 - alpha)

316 z = norm.ppf(1 - alpha/2.)

317 xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)

318 smj = mjci(data, prob, axis=axis)

319 return (xq - z * smj, xq + z * smj)

320

321

322def median_cihs(data, alpha=0.05, axis=None):

323 """

324 Computes the alpha-level confidence interval for the median of the data.

325

326 Uses the Hettmasperger-Sheather method.

327

328 Parameters

329 ----------

330 data : array_like

331 Input data. Masked values are discarded. The input should be 1D only,

332 or `axis` should be set to None.

333 alpha : float, optional

334 Confidence level of the intervals.

335 axis : int or None, optional

336 Axis along which to compute the quantiles. If None, use a flattened

337 array.

338

339 Returns

340 -------

341 median_cihs

342 Alpha level confidence interval.

343

344 """

345 def _cihs_1D(data, alpha):

346 data = np.sort(data.compressed())

347 n = len(data)

348 alpha = min(alpha, 1-alpha)

349 k = int(binom._ppf(alpha/2., n, 0.5))

350 gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)

351 if gk < 1-alpha:

352 k -= 1

353 gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)

354 gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)

355 I = (gk - 1 + alpha)/(gk - gkk)

356 lambd = (n-k) * I / float(k + (n-2*k)*I)

357 lims = (lambd*data[k] + (1-lambd)*data[k-1],

358 lambd*data[n-k-1] + (1-lambd)*data[n-k])

359 return lims

360 data = ma.array(data, copy=False)

361 # Computes quantiles along axis (or globally)

362 if (axis is None):

363 result = _cihs_1D(data, alpha)

364 else:

365 if data.ndim > 2:

366 raise ValueError("Array 'data' must be at most two dimensional, "

367 "but got data.ndim = %d" % data.ndim)

368 result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)

369

370 return result

371

372

373def compare_medians_ms(group_1, group_2, axis=None):

374 """

375 Compares the medians from two independent groups along the given axis.

376

377 The comparison is performed using the McKean-Schrader estimate of the

378 standard error of the medians.

379

380 Parameters

381 ----------

382 group_1 : array_like

383 First dataset. Has to be of size >=7.

384 group_2 : array_like

385 Second dataset. Has to be of size >=7.

386 axis : int, optional

387 Axis along which the medians are estimated. If None, the arrays are

388 flattened. If `axis` is not None, then `group_1` and `group_2`

389 should have the same shape.

390

391 Returns

392 -------

393 compare_medians_ms : {float, ndarray}

394 If `axis` is None, then returns a float, otherwise returns a 1-D

395 ndarray of floats with a length equal to the length of `group_1`

396 along `axis`.

397

398 Examples

399 --------

400

401 >>> from scipy import stats

402 >>> a = [1, 2, 3, 4, 5, 6, 7]

403 >>> b = [8, 9, 10, 11, 12, 13, 14]

404 >>> stats.mstats.compare_medians_ms(a, b, axis=None)

405 1.0693225866553746e-05

406

407 The function is vectorized to compute along a given axis.

408

409 >>> import numpy as np

410 >>> rng = np.random.default_rng()

411 >>> x = rng.random(size=(3, 7))

412 >>> y = rng.random(size=(3, 8))

413 >>> stats.mstats.compare_medians_ms(x, y, axis=1)

414 array([0.36908985, 0.36092538, 0.2765313 ])

415

416 References

417 ----------

418 .. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods

419 for studentizing the sample median." Communications in

420 Statistics-Simulation and Computation 13.6 (1984): 751-773.

421

422 """

423 (med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))

424 (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),

425 mstats.stde_median(group_2, axis=axis))

426 W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)

427 return 1 - norm.cdf(W)

428

429

430def idealfourths(data, axis=None):

431 """

432 Returns an estimate of the lower and upper quartiles.

433

434 Uses the ideal fourths algorithm.

435

436 Parameters

437 ----------

438 data : array_like

439 Input array.

440 axis : int, optional

441 Axis along which the quartiles are estimated. If None, the arrays are

442 flattened.

443

444 Returns

445 -------

446 idealfourths : {list of floats, masked array}

447 Returns the two internal values that divide `data` into four parts

448 using the ideal fourths algorithm either along the flattened array

449 (if `axis` is None) or along `axis` of `data`.

450

451 """

452 def _idf(data):

453 x = data.compressed()

454 n = len(x)

455 if n < 3:

456 return [np.nan,np.nan]

457 (j,h) = divmod(n/4. + 5/12.,1)

458 j = int(j)

459 qlo = (1-h)*x[j-1] + h*x[j]

460 k = n - j

461 qup = (1-h)*x[k] + h*x[k-1]

462 return [qlo, qup]

463 data = ma.sort(data, axis=axis).view(MaskedArray)

464 if (axis is None):

465 return _idf(data)

466 else:

467 return ma.apply_along_axis(_idf, axis, data)

468

469

470def rsh(data, points=None):

471 """

472 Evaluates Rosenblatt's shifted histogram estimators for each data point.

473

474 Rosenblatt's estimator is a centered finite-difference approximation to the

475 derivative of the empirical cumulative distribution function.

476

477 Parameters

478 ----------

479 data : sequence

480 Input data, should be 1-D. Masked values are ignored.

481 points : sequence or None, optional

482 Sequence of points where to evaluate Rosenblatt shifted histogram.

483 If None, use the data.

484

485 """

486 data = ma.array(data, copy=False)

487 if points is None:

488 points = data

489 else:

490 points = np.array(points, copy=False, ndmin=1)

491

492 if data.ndim != 1:

493 raise AttributeError("The input array should be 1D only !")

494

495 n = data.count()

496 r = idealfourths(data, axis=None)

497 h = 1.2 * (r[-1]-r[0]) / n**(1./5)

498 nhi = (data[:,None] <= points[None,:] + h).sum(0)

499 nlo = (data[:,None] < points[None,:] - h).sum(0)

500 return (nhi-nlo) / (2.*n*h)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_mstats_extras.py: 11%

159 statements