Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_stats_mstats

1import warnings

2import numpy as np

3import scipy.stats._stats_py

4from . import distributions

5from .._lib._bunch import _make_tuple_bunch

6from ._stats_pythran import siegelslopes as siegelslopes_pythran

8__all__ = ['_find_repeats', 'linregress', 'theilslopes', 'siegelslopes']

10# This is not a namedtuple for backwards compatibility. See PR #12983

11LinregressResult = _make_tuple_bunch('LinregressResult',

12 ['slope', 'intercept', 'rvalue',

13 'pvalue', 'stderr'],

14 extra_field_names=['intercept_stderr'])

15TheilslopesResult = _make_tuple_bunch('TheilslopesResult',

16 ['slope', 'intercept',

17 'low_slope', 'high_slope'])

18SiegelslopesResult = _make_tuple_bunch('SiegelslopesResult',

19 ['slope', 'intercept'])

22def linregress(x, y=None, alternative='two-sided'):

23 """

24 Calculate a linear least-squares regression for two sets of measurements.

26 Parameters

27 ----------

28 x, y : array_like

29 Two sets of measurements. Both arrays should have the same length. If

30 only `x` is given (and ``y=None``), then it must be a two-dimensional

31 array where one dimension has length 2. The two sets of measurements

32 are then found by splitting the array along the length-2 dimension. In

33 the case where ``y=None`` and `x` is a 2x2 array, ``linregress(x)`` is

34 equivalent to ``linregress(x[0], x[1])``.

35 alternative : {'two-sided', 'less', 'greater'}, optional

36 Defines the alternative hypothesis. Default is 'two-sided'.

37 The following options are available:

39 * 'two-sided': the slope of the regression line is nonzero

40 * 'less': the slope of the regression line is less than zero

41 * 'greater': the slope of the regression line is greater than zero

43 .. versionadded:: 1.7.0

45 Returns

46 -------

47 result : ``LinregressResult`` instance

48 The return value is an object with the following attributes:

50 slope : float

51 Slope of the regression line.

52 intercept : float

53 Intercept of the regression line.

54 rvalue : float

55 The Pearson correlation coefficient. The square of ``rvalue``

56 is equal to the coefficient of determination.

57 pvalue : float

58 The p-value for a hypothesis test whose null hypothesis is

59 that the slope is zero, using Wald Test with t-distribution of

60 the test statistic. See `alternative` above for alternative

61 hypotheses.

62 stderr : float

63 Standard error of the estimated slope (gradient), under the

64 assumption of residual normality.

65 intercept_stderr : float

66 Standard error of the estimated intercept, under the assumption

67 of residual normality.

69 See Also

70 --------

71 scipy.optimize.curve_fit :

72 Use non-linear least squares to fit a function to data.

73 scipy.optimize.leastsq :

74 Minimize the sum of squares of a set of equations.

76 Notes

77 -----

78 Missing values are considered pair-wise: if a value is missing in `x`,

79 the corresponding value in `y` is masked.

81 For compatibility with older versions of SciPy, the return value acts

82 like a ``namedtuple`` of length 5, with fields ``slope``, ``intercept``,

83 ``rvalue``, ``pvalue`` and ``stderr``, so one can continue to write::

85 slope, intercept, r, p, se = linregress(x, y)

87 With that style, however, the standard error of the intercept is not

88 available. To have access to all the computed values, including the

89 standard error of the intercept, use the return value as an object

90 with attributes, e.g.::

92 result = linregress(x, y)

93 print(result.intercept, result.intercept_stderr)

95 Examples

96 --------

97 >>> import numpy as np

98 >>> import matplotlib.pyplot as plt

99 >>> from scipy import stats

100 >>> rng = np.random.default_rng()

101

102 Generate some data:

103

104 >>> x = rng.random(10)

105 >>> y = 1.6*x + rng.random(10)

106

107 Perform the linear regression:

108

109 >>> res = stats.linregress(x, y)

110

111 Coefficient of determination (R-squared):

112

113 >>> print(f"R-squared: {res.rvalue**2:.6f}")

114 R-squared: 0.717533

115

116 Plot the data along with the fitted line:

117

118 >>> plt.plot(x, y, 'o', label='original data')

119 >>> plt.plot(x, res.intercept + res.slope*x, 'r', label='fitted line')

120 >>> plt.legend()

121 >>> plt.show()

122

123 Calculate 95% confidence interval on slope and intercept:

124

125 >>> # Two-sided inverse Students t-distribution

126 >>> # p - probability, df - degrees of freedom

127 >>> from scipy.stats import t

128 >>> tinv = lambda p, df: abs(t.ppf(p/2, df))

129

130 >>> ts = tinv(0.05, len(x)-2)

131 >>> print(f"slope (95%): {res.slope:.6f} +/- {ts*res.stderr:.6f}")

132 slope (95%): 1.453392 +/- 0.743465

133 >>> print(f"intercept (95%): {res.intercept:.6f}"

134 ... f" +/- {ts*res.intercept_stderr:.6f}")

135 intercept (95%): 0.616950 +/- 0.544475

136

137 """

138 TINY = 1.0e-20

139 if y is None: # x is a (2, N) or (N, 2) shaped array_like

140 x = np.asarray(x)

141 if x.shape[0] == 2:

142 x, y = x

143 elif x.shape[1] == 2:

144 x, y = x.T

145 else:

146 raise ValueError("If only `x` is given as input, it has to "

147 "be of shape (2, N) or (N, 2); provided shape "

148 f"was {x.shape}.")

149 else:

150 x = np.asarray(x)

151 y = np.asarray(y)

152

153 if x.size == 0 or y.size == 0:

154 raise ValueError("Inputs must not be empty.")

155

156 if np.amax(x) == np.amin(x) and len(x) > 1:

157 raise ValueError("Cannot calculate a linear regression "

158 "if all x values are identical")

159

160 n = len(x)

161 xmean = np.mean(x, None)

162 ymean = np.mean(y, None)

163

164 # Average sums of square differences from the mean

165 # ssxm = mean( (x-mean(x))^2 )

166 # ssxym = mean( (x-mean(x)) * (y-mean(y)) )

167 ssxm, ssxym, _, ssym = np.cov(x, y, bias=1).flat

168

169 # R-value

170 # r = ssxym / sqrt( ssxm * ssym )

171 if ssxm == 0.0 or ssym == 0.0:

172 # If the denominator was going to be 0

173 r = 0.0

174 else:

175 r = ssxym / np.sqrt(ssxm * ssym)

176 # Test for numerical error propagation (make sure -1 < r < 1)

177 if r > 1.0:

178 r = 1.0

179 elif r < -1.0:

180 r = -1.0

181

182 slope = ssxym / ssxm

183 intercept = ymean - slope*xmean

184 if n == 2:

185 # handle case when only two points are passed in

186 if y[0] == y[1]:

187 prob = 1.0

188 else:

189 prob = 0.0

190 slope_stderr = 0.0

191 intercept_stderr = 0.0

192 else:

193 df = n - 2 # Number of degrees of freedom

194 # n-2 degrees of freedom because 2 has been used up

195 # to estimate the mean and standard deviation

196 t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))

197 t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)

198

199 slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)

200

201 # Also calculate the standard error of the intercept

202 # The following relationship is used:

203 # ssxm = mean( (x-mean(x))^2 )

204 # = ssx - sx*sx

205 # = mean( x^2 ) - mean(x)^2

206 intercept_stderr = slope_stderr * np.sqrt(ssxm + xmean**2)

207

208 return LinregressResult(slope=slope, intercept=intercept, rvalue=r,

209 pvalue=prob, stderr=slope_stderr,

210 intercept_stderr=intercept_stderr)

211

212

213def theilslopes(y, x=None, alpha=0.95, method='separate'):

214 r"""

215 Computes the Theil-Sen estimator for a set of points (x, y).

216

217 `theilslopes` implements a method for robust linear regression. It

218 computes the slope as the median of all slopes between paired values.

219

220 Parameters

221 ----------

222 y : array_like

223 Dependent variable.

224 x : array_like or None, optional

225 Independent variable. If None, use ``arange(len(y))`` instead.

226 alpha : float, optional

227 Confidence degree between 0 and 1. Default is 95% confidence.

228 Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are

229 interpreted as "find the 90% confidence interval".

230 method : {'joint', 'separate'}, optional

231 Method to be used for computing estimate for intercept.

232 Following methods are supported,

233

234 * 'joint': Uses np.median(y - slope * x) as intercept.

235 * 'separate': Uses np.median(y) - slope * np.median(x)

236 as intercept.

237

238 The default is 'separate'.

239

240 .. versionadded:: 1.8.0

241

242 Returns

243 -------

244 result : ``TheilslopesResult`` instance

245 The return value is an object with the following attributes:

246

247 slope : float

248 Theil slope.

249 intercept : float

250 Intercept of the Theil line.

251 low_slope : float

252 Lower bound of the confidence interval on `slope`.

253 high_slope : float

254 Upper bound of the confidence interval on `slope`.

255

256 See Also

257 --------

258 siegelslopes : a similar technique using repeated medians

259

260 Notes

261 -----

262 The implementation of `theilslopes` follows [1]_. The intercept is

263 not defined in [1]_, and here it is defined as ``median(y) -

264 slope*median(x)``, which is given in [3]_. Other definitions of

265 the intercept exist in the literature such as ``median(y - slope*x)``

266 in [4]_. The approach to compute the intercept can be determined by the

267 parameter ``method``. A confidence interval for the intercept is not

268 given as this question is not addressed in [1]_.

269

270 For compatibility with older versions of SciPy, the return value acts

271 like a ``namedtuple`` of length 4, with fields ``slope``, ``intercept``,

272 ``low_slope``, and ``high_slope``, so one can continue to write::

273

274 slope, intercept, low_slope, high_slope = theilslopes(y, x)

275

276 References

277 ----------

278 .. [1] P.K. Sen, "Estimates of the regression coefficient based on

279 Kendall's tau", J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.

280 .. [2] H. Theil, "A rank-invariant method of linear and polynomial

281 regression analysis I, II and III", Nederl. Akad. Wetensch., Proc.

282 53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.

283 .. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed.,

284 John Wiley and Sons, New York, pp. 493.

285 .. [4] https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator

286

287 Examples

288 --------

289 >>> import numpy as np

290 >>> from scipy import stats

291 >>> import matplotlib.pyplot as plt

292

293 >>> x = np.linspace(-5, 5, num=150)

294 >>> y = x + np.random.normal(size=x.size)

295 >>> y[11:15] += 10 # add outliers

296 >>> y[-5:] -= 7

297

298 Compute the slope, intercept and 90% confidence interval. For comparison,

299 also compute the least-squares fit with `linregress`:

300

301 >>> res = stats.theilslopes(y, x, 0.90, method='separate')

302 >>> lsq_res = stats.linregress(x, y)

303

304 Plot the results. The Theil-Sen regression line is shown in red, with the

305 dashed red lines illustrating the confidence interval of the slope (note

306 that the dashed red lines are not the confidence interval of the regression

307 as the confidence interval of the intercept is not included). The green

308 line shows the least-squares fit for comparison.

309

310 >>> fig = plt.figure()

311 >>> ax = fig.add_subplot(111)

312 >>> ax.plot(x, y, 'b.')

313 >>> ax.plot(x, res[1] + res[0] * x, 'r-')

314 >>> ax.plot(x, res[1] + res[2] * x, 'r--')

315 >>> ax.plot(x, res[1] + res[3] * x, 'r--')

316 >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')

317 >>> plt.show()

318

319 """

320 if method not in ['joint', 'separate']:

321 raise ValueError(("method must be either 'joint' or 'separate'."

322 "'{}' is invalid.".format(method)))

323 # We copy both x and y so we can use _find_repeats.

324 y = np.array(y).flatten()

325 if x is None:

326 x = np.arange(len(y), dtype=float)

327 else:

328 x = np.array(x, dtype=float).flatten()

329 if len(x) != len(y):

330 raise ValueError("Incompatible lengths ! (%s<>%s)" %

331 (len(y), len(x)))

332

333 # Compute sorted slopes only when deltax > 0

334 deltax = x[:, np.newaxis] - x

335 deltay = y[:, np.newaxis] - y

336 slopes = deltay[deltax > 0] / deltax[deltax > 0]

337 if not slopes.size:

338 msg = "All `x` coordinates are identical."

339 warnings.warn(msg, RuntimeWarning, stacklevel=2)

340 slopes.sort()

341 medslope = np.median(slopes)

342 if method == 'joint':

343 medinter = np.median(y - medslope * x)

344 else:

345 medinter = np.median(y) - medslope * np.median(x)

346 # Now compute confidence intervals

347 if alpha > 0.5:

348 alpha = 1. - alpha

349

350 z = distributions.norm.ppf(alpha / 2.)

351 # This implements (2.6) from Sen (1968)

352 _, nxreps = _find_repeats(x)

353 _, nyreps = _find_repeats(y)

354 nt = len(slopes) # N in Sen (1968)

355 ny = len(y) # n in Sen (1968)

356 # Equation 2.6 in Sen (1968):

357 sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -

358 sum(k * (k-1) * (2*k + 5) for k in nxreps) -

359 sum(k * (k-1) * (2*k + 5) for k in nyreps))

360 # Find the confidence interval indices in `slopes`

361 try:

362 sigma = np.sqrt(sigsq)

363 Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)

364 Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)

365 delta = slopes[[Rl, Ru]]

366 except (ValueError, IndexError):

367 delta = (np.nan, np.nan)

368

369 return TheilslopesResult(slope=medslope, intercept=medinter,

370 low_slope=delta[0], high_slope=delta[1])

371

372

373def _find_repeats(arr):

374 # This function assumes it may clobber its input.

375 if len(arr) == 0:

376 return np.array(0, np.float64), np.array(0, np.intp)

377

378 # XXX This cast was previously needed for the Fortran implementation,

379 # should we ditch it?

380 arr = np.asarray(arr, np.float64).ravel()

381 arr.sort()

382

383 # Taken from NumPy 1.9's np.unique.

384 change = np.concatenate(([True], arr[1:] != arr[:-1]))

385 unique = arr[change]

386 change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))

387 freq = np.diff(change_idx)

388 atleast2 = freq > 1

389 return unique[atleast2], freq[atleast2]

390

391

392def siegelslopes(y, x=None, method="hierarchical"):

393 r"""

394 Computes the Siegel estimator for a set of points (x, y).

395

396 `siegelslopes` implements a method for robust linear regression

397 using repeated medians (see [1]_) to fit a line to the points (x, y).

398 The method is robust to outliers with an asymptotic breakdown point

399 of 50%.

400

401 Parameters

402 ----------

403 y : array_like

404 Dependent variable.

405 x : array_like or None, optional

406 Independent variable. If None, use ``arange(len(y))`` instead.

407 method : {'hierarchical', 'separate'}

408 If 'hierarchical', estimate the intercept using the estimated

409 slope ``slope`` (default option).

410 If 'separate', estimate the intercept independent of the estimated

411 slope. See Notes for details.

412

413 Returns

414 -------

415 result : ``SiegelslopesResult`` instance

416 The return value is an object with the following attributes:

417

418 slope : float

419 Estimate of the slope of the regression line.

420 intercept : float

421 Estimate of the intercept of the regression line.

422

423 See Also

424 --------

425 theilslopes : a similar technique without repeated medians

426

427 Notes

428 -----

429 With ``n = len(y)``, compute ``m_j`` as the median of

430 the slopes from the point ``(x[j], y[j])`` to all other `n-1` points.

431 ``slope`` is then the median of all slopes ``m_j``.

432 Two ways are given to estimate the intercept in [1]_ which can be chosen

433 via the parameter ``method``.

434 The hierarchical approach uses the estimated slope ``slope``

435 and computes ``intercept`` as the median of ``y - slope*x``.

436 The other approach estimates the intercept separately as follows: for

437 each point ``(x[j], y[j])``, compute the intercepts of all the `n-1`

438 lines through the remaining points and take the median ``i_j``.

439 ``intercept`` is the median of the ``i_j``.

440

441 The implementation computes `n` times the median of a vector of size `n`

442 which can be slow for large vectors. There are more efficient algorithms

443 (see [2]_) which are not implemented here.

444

445 For compatibility with older versions of SciPy, the return value acts

446 like a ``namedtuple`` of length 2, with fields ``slope`` and

447 ``intercept``, so one can continue to write::

448

449 slope, intercept = siegelslopes(y, x)

450

451 References

452 ----------

453 .. [1] A. Siegel, "Robust Regression Using Repeated Medians",

454 Biometrika, Vol. 69, pp. 242-244, 1982.

455

456 .. [2] A. Stein and M. Werman, "Finding the repeated median regression

457 line", Proceedings of the Third Annual ACM-SIAM Symposium on

458 Discrete Algorithms, pp. 409-413, 1992.

459

460 Examples

461 --------

462 >>> import numpy as np

463 >>> from scipy import stats

464 >>> import matplotlib.pyplot as plt

465

466 >>> x = np.linspace(-5, 5, num=150)

467 >>> y = x + np.random.normal(size=x.size)

468 >>> y[11:15] += 10 # add outliers

469 >>> y[-5:] -= 7

470

471 Compute the slope and intercept. For comparison, also compute the

472 least-squares fit with `linregress`:

473

474 >>> res = stats.siegelslopes(y, x)

475 >>> lsq_res = stats.linregress(x, y)

476

477 Plot the results. The Siegel regression line is shown in red. The green

478 line shows the least-squares fit for comparison.

479

480 >>> fig = plt.figure()

481 >>> ax = fig.add_subplot(111)

482 >>> ax.plot(x, y, 'b.')

483 >>> ax.plot(x, res[1] + res[0] * x, 'r-')

484 >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')

485 >>> plt.show()

486

487 """

488 if method not in ['hierarchical', 'separate']:

489 raise ValueError("method can only be 'hierarchical' or 'separate'")

490 y = np.asarray(y).ravel()

491 if x is None:

492 x = np.arange(len(y), dtype=float)

493 else:

494 x = np.asarray(x, dtype=float).ravel()

495 if len(x) != len(y):

496 raise ValueError("Incompatible lengths ! (%s<>%s)" %

497 (len(y), len(x)))

498 dtype = np.result_type(x, y, np.float32) # use at least float32

499 y, x = y.astype(dtype), x.astype(dtype)

500 medslope, medinter = siegelslopes_pythran(y, x, method)

501 return SiegelslopesResult(slope=medslope, intercept=medinter)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_stats_mstats_common.py: 13%

110 statements