Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_stats_mstats_common.py: 13%

110 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1import warnings 

2import numpy as np 

3import scipy.stats._stats_py 

4from . import distributions 

5from .._lib._bunch import _make_tuple_bunch 

6from ._stats_pythran import siegelslopes as siegelslopes_pythran 

7 

8__all__ = ['_find_repeats', 'linregress', 'theilslopes', 'siegelslopes'] 

9 

10# This is not a namedtuple for backwards compatibility. See PR #12983 

11LinregressResult = _make_tuple_bunch('LinregressResult', 

12 ['slope', 'intercept', 'rvalue', 

13 'pvalue', 'stderr'], 

14 extra_field_names=['intercept_stderr']) 

15TheilslopesResult = _make_tuple_bunch('TheilslopesResult', 

16 ['slope', 'intercept', 

17 'low_slope', 'high_slope']) 

18SiegelslopesResult = _make_tuple_bunch('SiegelslopesResult', 

19 ['slope', 'intercept']) 

20 

21 

22def linregress(x, y=None, alternative='two-sided'): 

23 """ 

24 Calculate a linear least-squares regression for two sets of measurements. 

25 

26 Parameters 

27 ---------- 

28 x, y : array_like 

29 Two sets of measurements. Both arrays should have the same length. If 

30 only `x` is given (and ``y=None``), then it must be a two-dimensional 

31 array where one dimension has length 2. The two sets of measurements 

32 are then found by splitting the array along the length-2 dimension. In 

33 the case where ``y=None`` and `x` is a 2x2 array, ``linregress(x)`` is 

34 equivalent to ``linregress(x[0], x[1])``. 

35 alternative : {'two-sided', 'less', 'greater'}, optional 

36 Defines the alternative hypothesis. Default is 'two-sided'. 

37 The following options are available: 

38 

39 * 'two-sided': the slope of the regression line is nonzero 

40 * 'less': the slope of the regression line is less than zero 

41 * 'greater': the slope of the regression line is greater than zero 

42 

43 .. versionadded:: 1.7.0 

44 

45 Returns 

46 ------- 

47 result : ``LinregressResult`` instance 

48 The return value is an object with the following attributes: 

49 

50 slope : float 

51 Slope of the regression line. 

52 intercept : float 

53 Intercept of the regression line. 

54 rvalue : float 

55 The Pearson correlation coefficient. The square of ``rvalue`` 

56 is equal to the coefficient of determination. 

57 pvalue : float 

58 The p-value for a hypothesis test whose null hypothesis is 

59 that the slope is zero, using Wald Test with t-distribution of 

60 the test statistic. See `alternative` above for alternative 

61 hypotheses. 

62 stderr : float 

63 Standard error of the estimated slope (gradient), under the 

64 assumption of residual normality. 

65 intercept_stderr : float 

66 Standard error of the estimated intercept, under the assumption 

67 of residual normality. 

68 

69 See Also 

70 -------- 

71 scipy.optimize.curve_fit : 

72 Use non-linear least squares to fit a function to data. 

73 scipy.optimize.leastsq : 

74 Minimize the sum of squares of a set of equations. 

75 

76 Notes 

77 ----- 

78 Missing values are considered pair-wise: if a value is missing in `x`, 

79 the corresponding value in `y` is masked. 

80 

81 For compatibility with older versions of SciPy, the return value acts 

82 like a ``namedtuple`` of length 5, with fields ``slope``, ``intercept``, 

83 ``rvalue``, ``pvalue`` and ``stderr``, so one can continue to write:: 

84 

85 slope, intercept, r, p, se = linregress(x, y) 

86 

87 With that style, however, the standard error of the intercept is not 

88 available. To have access to all the computed values, including the 

89 standard error of the intercept, use the return value as an object 

90 with attributes, e.g.:: 

91 

92 result = linregress(x, y) 

93 print(result.intercept, result.intercept_stderr) 

94 

95 Examples 

96 -------- 

97 >>> import numpy as np 

98 >>> import matplotlib.pyplot as plt 

99 >>> from scipy import stats 

100 >>> rng = np.random.default_rng() 

101 

102 Generate some data: 

103 

104 >>> x = rng.random(10) 

105 >>> y = 1.6*x + rng.random(10) 

106 

107 Perform the linear regression: 

108 

109 >>> res = stats.linregress(x, y) 

110 

111 Coefficient of determination (R-squared): 

112 

113 >>> print(f"R-squared: {res.rvalue**2:.6f}") 

114 R-squared: 0.717533 

115 

116 Plot the data along with the fitted line: 

117 

118 >>> plt.plot(x, y, 'o', label='original data') 

119 >>> plt.plot(x, res.intercept + res.slope*x, 'r', label='fitted line') 

120 >>> plt.legend() 

121 >>> plt.show() 

122 

123 Calculate 95% confidence interval on slope and intercept: 

124 

125 >>> # Two-sided inverse Students t-distribution 

126 >>> # p - probability, df - degrees of freedom 

127 >>> from scipy.stats import t 

128 >>> tinv = lambda p, df: abs(t.ppf(p/2, df)) 

129 

130 >>> ts = tinv(0.05, len(x)-2) 

131 >>> print(f"slope (95%): {res.slope:.6f} +/- {ts*res.stderr:.6f}") 

132 slope (95%): 1.453392 +/- 0.743465 

133 >>> print(f"intercept (95%): {res.intercept:.6f}" 

134 ... f" +/- {ts*res.intercept_stderr:.6f}") 

135 intercept (95%): 0.616950 +/- 0.544475 

136 

137 """ 

138 TINY = 1.0e-20 

139 if y is None: # x is a (2, N) or (N, 2) shaped array_like 

140 x = np.asarray(x) 

141 if x.shape[0] == 2: 

142 x, y = x 

143 elif x.shape[1] == 2: 

144 x, y = x.T 

145 else: 

146 raise ValueError("If only `x` is given as input, it has to " 

147 "be of shape (2, N) or (N, 2); provided shape " 

148 f"was {x.shape}.") 

149 else: 

150 x = np.asarray(x) 

151 y = np.asarray(y) 

152 

153 if x.size == 0 or y.size == 0: 

154 raise ValueError("Inputs must not be empty.") 

155 

156 if np.amax(x) == np.amin(x) and len(x) > 1: 

157 raise ValueError("Cannot calculate a linear regression " 

158 "if all x values are identical") 

159 

160 n = len(x) 

161 xmean = np.mean(x, None) 

162 ymean = np.mean(y, None) 

163 

164 # Average sums of square differences from the mean 

165 # ssxm = mean( (x-mean(x))^2 ) 

166 # ssxym = mean( (x-mean(x)) * (y-mean(y)) ) 

167 ssxm, ssxym, _, ssym = np.cov(x, y, bias=1).flat 

168 

169 # R-value 

170 # r = ssxym / sqrt( ssxm * ssym ) 

171 if ssxm == 0.0 or ssym == 0.0: 

172 # If the denominator was going to be 0 

173 r = 0.0 

174 else: 

175 r = ssxym / np.sqrt(ssxm * ssym) 

176 # Test for numerical error propagation (make sure -1 < r < 1) 

177 if r > 1.0: 

178 r = 1.0 

179 elif r < -1.0: 

180 r = -1.0 

181 

182 slope = ssxym / ssxm 

183 intercept = ymean - slope*xmean 

184 if n == 2: 

185 # handle case when only two points are passed in 

186 if y[0] == y[1]: 

187 prob = 1.0 

188 else: 

189 prob = 0.0 

190 slope_stderr = 0.0 

191 intercept_stderr = 0.0 

192 else: 

193 df = n - 2 # Number of degrees of freedom 

194 # n-2 degrees of freedom because 2 has been used up 

195 # to estimate the mean and standard deviation 

196 t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY))) 

197 t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative) 

198 

199 slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df) 

200 

201 # Also calculate the standard error of the intercept 

202 # The following relationship is used: 

203 # ssxm = mean( (x-mean(x))^2 ) 

204 # = ssx - sx*sx 

205 # = mean( x^2 ) - mean(x)^2 

206 intercept_stderr = slope_stderr * np.sqrt(ssxm + xmean**2) 

207 

208 return LinregressResult(slope=slope, intercept=intercept, rvalue=r, 

209 pvalue=prob, stderr=slope_stderr, 

210 intercept_stderr=intercept_stderr) 

211 

212 

213def theilslopes(y, x=None, alpha=0.95, method='separate'): 

214 r""" 

215 Computes the Theil-Sen estimator for a set of points (x, y). 

216 

217 `theilslopes` implements a method for robust linear regression. It 

218 computes the slope as the median of all slopes between paired values. 

219 

220 Parameters 

221 ---------- 

222 y : array_like 

223 Dependent variable. 

224 x : array_like or None, optional 

225 Independent variable. If None, use ``arange(len(y))`` instead. 

226 alpha : float, optional 

227 Confidence degree between 0 and 1. Default is 95% confidence. 

228 Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are 

229 interpreted as "find the 90% confidence interval". 

230 method : {'joint', 'separate'}, optional 

231 Method to be used for computing estimate for intercept. 

232 Following methods are supported, 

233 

234 * 'joint': Uses np.median(y - slope * x) as intercept. 

235 * 'separate': Uses np.median(y) - slope * np.median(x) 

236 as intercept. 

237 

238 The default is 'separate'. 

239 

240 .. versionadded:: 1.8.0 

241 

242 Returns 

243 ------- 

244 result : ``TheilslopesResult`` instance 

245 The return value is an object with the following attributes: 

246 

247 slope : float 

248 Theil slope. 

249 intercept : float 

250 Intercept of the Theil line. 

251 low_slope : float 

252 Lower bound of the confidence interval on `slope`. 

253 high_slope : float 

254 Upper bound of the confidence interval on `slope`. 

255 

256 See Also 

257 -------- 

258 siegelslopes : a similar technique using repeated medians 

259 

260 Notes 

261 ----- 

262 The implementation of `theilslopes` follows [1]_. The intercept is 

263 not defined in [1]_, and here it is defined as ``median(y) - 

264 slope*median(x)``, which is given in [3]_. Other definitions of 

265 the intercept exist in the literature such as ``median(y - slope*x)`` 

266 in [4]_. The approach to compute the intercept can be determined by the 

267 parameter ``method``. A confidence interval for the intercept is not 

268 given as this question is not addressed in [1]_. 

269 

270 For compatibility with older versions of SciPy, the return value acts 

271 like a ``namedtuple`` of length 4, with fields ``slope``, ``intercept``, 

272 ``low_slope``, and ``high_slope``, so one can continue to write:: 

273 

274 slope, intercept, low_slope, high_slope = theilslopes(y, x) 

275 

276 References 

277 ---------- 

278 .. [1] P.K. Sen, "Estimates of the regression coefficient based on 

279 Kendall's tau", J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968. 

280 .. [2] H. Theil, "A rank-invariant method of linear and polynomial 

281 regression analysis I, II and III", Nederl. Akad. Wetensch., Proc. 

282 53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950. 

283 .. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed., 

284 John Wiley and Sons, New York, pp. 493. 

285 .. [4] https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator 

286 

287 Examples 

288 -------- 

289 >>> import numpy as np 

290 >>> from scipy import stats 

291 >>> import matplotlib.pyplot as plt 

292 

293 >>> x = np.linspace(-5, 5, num=150) 

294 >>> y = x + np.random.normal(size=x.size) 

295 >>> y[11:15] += 10 # add outliers 

296 >>> y[-5:] -= 7 

297 

298 Compute the slope, intercept and 90% confidence interval. For comparison, 

299 also compute the least-squares fit with `linregress`: 

300 

301 >>> res = stats.theilslopes(y, x, 0.90, method='separate') 

302 >>> lsq_res = stats.linregress(x, y) 

303 

304 Plot the results. The Theil-Sen regression line is shown in red, with the 

305 dashed red lines illustrating the confidence interval of the slope (note 

306 that the dashed red lines are not the confidence interval of the regression 

307 as the confidence interval of the intercept is not included). The green 

308 line shows the least-squares fit for comparison. 

309 

310 >>> fig = plt.figure() 

311 >>> ax = fig.add_subplot(111) 

312 >>> ax.plot(x, y, 'b.') 

313 >>> ax.plot(x, res[1] + res[0] * x, 'r-') 

314 >>> ax.plot(x, res[1] + res[2] * x, 'r--') 

315 >>> ax.plot(x, res[1] + res[3] * x, 'r--') 

316 >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-') 

317 >>> plt.show() 

318 

319 """ 

320 if method not in ['joint', 'separate']: 

321 raise ValueError(("method must be either 'joint' or 'separate'." 

322 "'{}' is invalid.".format(method))) 

323 # We copy both x and y so we can use _find_repeats. 

324 y = np.array(y).flatten() 

325 if x is None: 

326 x = np.arange(len(y), dtype=float) 

327 else: 

328 x = np.array(x, dtype=float).flatten() 

329 if len(x) != len(y): 

330 raise ValueError("Incompatible lengths ! (%s<>%s)" % 

331 (len(y), len(x))) 

332 

333 # Compute sorted slopes only when deltax > 0 

334 deltax = x[:, np.newaxis] - x 

335 deltay = y[:, np.newaxis] - y 

336 slopes = deltay[deltax > 0] / deltax[deltax > 0] 

337 if not slopes.size: 

338 msg = "All `x` coordinates are identical." 

339 warnings.warn(msg, RuntimeWarning, stacklevel=2) 

340 slopes.sort() 

341 medslope = np.median(slopes) 

342 if method == 'joint': 

343 medinter = np.median(y - medslope * x) 

344 else: 

345 medinter = np.median(y) - medslope * np.median(x) 

346 # Now compute confidence intervals 

347 if alpha > 0.5: 

348 alpha = 1. - alpha 

349 

350 z = distributions.norm.ppf(alpha / 2.) 

351 # This implements (2.6) from Sen (1968) 

352 _, nxreps = _find_repeats(x) 

353 _, nyreps = _find_repeats(y) 

354 nt = len(slopes) # N in Sen (1968) 

355 ny = len(y) # n in Sen (1968) 

356 # Equation 2.6 in Sen (1968): 

357 sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) - 

358 sum(k * (k-1) * (2*k + 5) for k in nxreps) - 

359 sum(k * (k-1) * (2*k + 5) for k in nyreps)) 

360 # Find the confidence interval indices in `slopes` 

361 try: 

362 sigma = np.sqrt(sigsq) 

363 Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1) 

364 Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0) 

365 delta = slopes[[Rl, Ru]] 

366 except (ValueError, IndexError): 

367 delta = (np.nan, np.nan) 

368 

369 return TheilslopesResult(slope=medslope, intercept=medinter, 

370 low_slope=delta[0], high_slope=delta[1]) 

371 

372 

373def _find_repeats(arr): 

374 # This function assumes it may clobber its input. 

375 if len(arr) == 0: 

376 return np.array(0, np.float64), np.array(0, np.intp) 

377 

378 # XXX This cast was previously needed for the Fortran implementation, 

379 # should we ditch it? 

380 arr = np.asarray(arr, np.float64).ravel() 

381 arr.sort() 

382 

383 # Taken from NumPy 1.9's np.unique. 

384 change = np.concatenate(([True], arr[1:] != arr[:-1])) 

385 unique = arr[change] 

386 change_idx = np.concatenate(np.nonzero(change) + ([arr.size],)) 

387 freq = np.diff(change_idx) 

388 atleast2 = freq > 1 

389 return unique[atleast2], freq[atleast2] 

390 

391 

392def siegelslopes(y, x=None, method="hierarchical"): 

393 r""" 

394 Computes the Siegel estimator for a set of points (x, y). 

395 

396 `siegelslopes` implements a method for robust linear regression 

397 using repeated medians (see [1]_) to fit a line to the points (x, y). 

398 The method is robust to outliers with an asymptotic breakdown point 

399 of 50%. 

400 

401 Parameters 

402 ---------- 

403 y : array_like 

404 Dependent variable. 

405 x : array_like or None, optional 

406 Independent variable. If None, use ``arange(len(y))`` instead. 

407 method : {'hierarchical', 'separate'} 

408 If 'hierarchical', estimate the intercept using the estimated 

409 slope ``slope`` (default option). 

410 If 'separate', estimate the intercept independent of the estimated 

411 slope. See Notes for details. 

412 

413 Returns 

414 ------- 

415 result : ``SiegelslopesResult`` instance 

416 The return value is an object with the following attributes: 

417 

418 slope : float 

419 Estimate of the slope of the regression line. 

420 intercept : float 

421 Estimate of the intercept of the regression line. 

422 

423 See Also 

424 -------- 

425 theilslopes : a similar technique without repeated medians 

426 

427 Notes 

428 ----- 

429 With ``n = len(y)``, compute ``m_j`` as the median of 

430 the slopes from the point ``(x[j], y[j])`` to all other `n-1` points. 

431 ``slope`` is then the median of all slopes ``m_j``. 

432 Two ways are given to estimate the intercept in [1]_ which can be chosen 

433 via the parameter ``method``. 

434 The hierarchical approach uses the estimated slope ``slope`` 

435 and computes ``intercept`` as the median of ``y - slope*x``. 

436 The other approach estimates the intercept separately as follows: for 

437 each point ``(x[j], y[j])``, compute the intercepts of all the `n-1` 

438 lines through the remaining points and take the median ``i_j``. 

439 ``intercept`` is the median of the ``i_j``. 

440 

441 The implementation computes `n` times the median of a vector of size `n` 

442 which can be slow for large vectors. There are more efficient algorithms 

443 (see [2]_) which are not implemented here. 

444 

445 For compatibility with older versions of SciPy, the return value acts 

446 like a ``namedtuple`` of length 2, with fields ``slope`` and 

447 ``intercept``, so one can continue to write:: 

448 

449 slope, intercept = siegelslopes(y, x) 

450 

451 References 

452 ---------- 

453 .. [1] A. Siegel, "Robust Regression Using Repeated Medians", 

454 Biometrika, Vol. 69, pp. 242-244, 1982. 

455 

456 .. [2] A. Stein and M. Werman, "Finding the repeated median regression 

457 line", Proceedings of the Third Annual ACM-SIAM Symposium on 

458 Discrete Algorithms, pp. 409-413, 1992. 

459 

460 Examples 

461 -------- 

462 >>> import numpy as np 

463 >>> from scipy import stats 

464 >>> import matplotlib.pyplot as plt 

465 

466 >>> x = np.linspace(-5, 5, num=150) 

467 >>> y = x + np.random.normal(size=x.size) 

468 >>> y[11:15] += 10 # add outliers 

469 >>> y[-5:] -= 7 

470 

471 Compute the slope and intercept. For comparison, also compute the 

472 least-squares fit with `linregress`: 

473 

474 >>> res = stats.siegelslopes(y, x) 

475 >>> lsq_res = stats.linregress(x, y) 

476 

477 Plot the results. The Siegel regression line is shown in red. The green 

478 line shows the least-squares fit for comparison. 

479 

480 >>> fig = plt.figure() 

481 >>> ax = fig.add_subplot(111) 

482 >>> ax.plot(x, y, 'b.') 

483 >>> ax.plot(x, res[1] + res[0] * x, 'r-') 

484 >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-') 

485 >>> plt.show() 

486 

487 """ 

488 if method not in ['hierarchical', 'separate']: 

489 raise ValueError("method can only be 'hierarchical' or 'separate'") 

490 y = np.asarray(y).ravel() 

491 if x is None: 

492 x = np.arange(len(y), dtype=float) 

493 else: 

494 x = np.asarray(x, dtype=float).ravel() 

495 if len(x) != len(y): 

496 raise ValueError("Incompatible lengths ! (%s<>%s)" % 

497 (len(y), len(x))) 

498 dtype = np.result_type(x, y, np.float32) # use at least float32 

499 y, x = y.astype(dtype), x.astype(dtype) 

500 medslope, medinter = siegelslopes_pythran(y, x, method) 

501 return SiegelslopesResult(slope=medslope, intercept=medinter)