Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_stats_mstats_common.py: 13%
110 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
1import warnings
2import numpy as np
3import scipy.stats._stats_py
4from . import distributions
5from .._lib._bunch import _make_tuple_bunch
6from ._stats_pythran import siegelslopes as siegelslopes_pythran
8__all__ = ['_find_repeats', 'linregress', 'theilslopes', 'siegelslopes']
10# This is not a namedtuple for backwards compatibility. See PR #12983
11LinregressResult = _make_tuple_bunch('LinregressResult',
12 ['slope', 'intercept', 'rvalue',
13 'pvalue', 'stderr'],
14 extra_field_names=['intercept_stderr'])
15TheilslopesResult = _make_tuple_bunch('TheilslopesResult',
16 ['slope', 'intercept',
17 'low_slope', 'high_slope'])
18SiegelslopesResult = _make_tuple_bunch('SiegelslopesResult',
19 ['slope', 'intercept'])
22def linregress(x, y=None, alternative='two-sided'):
23 """
24 Calculate a linear least-squares regression for two sets of measurements.
26 Parameters
27 ----------
28 x, y : array_like
29 Two sets of measurements. Both arrays should have the same length. If
30 only `x` is given (and ``y=None``), then it must be a two-dimensional
31 array where one dimension has length 2. The two sets of measurements
32 are then found by splitting the array along the length-2 dimension. In
33 the case where ``y=None`` and `x` is a 2x2 array, ``linregress(x)`` is
34 equivalent to ``linregress(x[0], x[1])``.
35 alternative : {'two-sided', 'less', 'greater'}, optional
36 Defines the alternative hypothesis. Default is 'two-sided'.
37 The following options are available:
39 * 'two-sided': the slope of the regression line is nonzero
40 * 'less': the slope of the regression line is less than zero
41 * 'greater': the slope of the regression line is greater than zero
43 .. versionadded:: 1.7.0
45 Returns
46 -------
47 result : ``LinregressResult`` instance
48 The return value is an object with the following attributes:
50 slope : float
51 Slope of the regression line.
52 intercept : float
53 Intercept of the regression line.
54 rvalue : float
55 The Pearson correlation coefficient. The square of ``rvalue``
56 is equal to the coefficient of determination.
57 pvalue : float
58 The p-value for a hypothesis test whose null hypothesis is
59 that the slope is zero, using Wald Test with t-distribution of
60 the test statistic. See `alternative` above for alternative
61 hypotheses.
62 stderr : float
63 Standard error of the estimated slope (gradient), under the
64 assumption of residual normality.
65 intercept_stderr : float
66 Standard error of the estimated intercept, under the assumption
67 of residual normality.
69 See Also
70 --------
71 scipy.optimize.curve_fit :
72 Use non-linear least squares to fit a function to data.
73 scipy.optimize.leastsq :
74 Minimize the sum of squares of a set of equations.
76 Notes
77 -----
78 Missing values are considered pair-wise: if a value is missing in `x`,
79 the corresponding value in `y` is masked.
81 For compatibility with older versions of SciPy, the return value acts
82 like a ``namedtuple`` of length 5, with fields ``slope``, ``intercept``,
83 ``rvalue``, ``pvalue`` and ``stderr``, so one can continue to write::
85 slope, intercept, r, p, se = linregress(x, y)
87 With that style, however, the standard error of the intercept is not
88 available. To have access to all the computed values, including the
89 standard error of the intercept, use the return value as an object
90 with attributes, e.g.::
92 result = linregress(x, y)
93 print(result.intercept, result.intercept_stderr)
95 Examples
96 --------
97 >>> import numpy as np
98 >>> import matplotlib.pyplot as plt
99 >>> from scipy import stats
100 >>> rng = np.random.default_rng()
102 Generate some data:
104 >>> x = rng.random(10)
105 >>> y = 1.6*x + rng.random(10)
107 Perform the linear regression:
109 >>> res = stats.linregress(x, y)
111 Coefficient of determination (R-squared):
113 >>> print(f"R-squared: {res.rvalue**2:.6f}")
114 R-squared: 0.717533
116 Plot the data along with the fitted line:
118 >>> plt.plot(x, y, 'o', label='original data')
119 >>> plt.plot(x, res.intercept + res.slope*x, 'r', label='fitted line')
120 >>> plt.legend()
121 >>> plt.show()
123 Calculate 95% confidence interval on slope and intercept:
125 >>> # Two-sided inverse Students t-distribution
126 >>> # p - probability, df - degrees of freedom
127 >>> from scipy.stats import t
128 >>> tinv = lambda p, df: abs(t.ppf(p/2, df))
130 >>> ts = tinv(0.05, len(x)-2)
131 >>> print(f"slope (95%): {res.slope:.6f} +/- {ts*res.stderr:.6f}")
132 slope (95%): 1.453392 +/- 0.743465
133 >>> print(f"intercept (95%): {res.intercept:.6f}"
134 ... f" +/- {ts*res.intercept_stderr:.6f}")
135 intercept (95%): 0.616950 +/- 0.544475
137 """
138 TINY = 1.0e-20
139 if y is None: # x is a (2, N) or (N, 2) shaped array_like
140 x = np.asarray(x)
141 if x.shape[0] == 2:
142 x, y = x
143 elif x.shape[1] == 2:
144 x, y = x.T
145 else:
146 raise ValueError("If only `x` is given as input, it has to "
147 "be of shape (2, N) or (N, 2); provided shape "
148 f"was {x.shape}.")
149 else:
150 x = np.asarray(x)
151 y = np.asarray(y)
153 if x.size == 0 or y.size == 0:
154 raise ValueError("Inputs must not be empty.")
156 if np.amax(x) == np.amin(x) and len(x) > 1:
157 raise ValueError("Cannot calculate a linear regression "
158 "if all x values are identical")
160 n = len(x)
161 xmean = np.mean(x, None)
162 ymean = np.mean(y, None)
164 # Average sums of square differences from the mean
165 # ssxm = mean( (x-mean(x))^2 )
166 # ssxym = mean( (x-mean(x)) * (y-mean(y)) )
167 ssxm, ssxym, _, ssym = np.cov(x, y, bias=1).flat
169 # R-value
170 # r = ssxym / sqrt( ssxm * ssym )
171 if ssxm == 0.0 or ssym == 0.0:
172 # If the denominator was going to be 0
173 r = 0.0
174 else:
175 r = ssxym / np.sqrt(ssxm * ssym)
176 # Test for numerical error propagation (make sure -1 < r < 1)
177 if r > 1.0:
178 r = 1.0
179 elif r < -1.0:
180 r = -1.0
182 slope = ssxym / ssxm
183 intercept = ymean - slope*xmean
184 if n == 2:
185 # handle case when only two points are passed in
186 if y[0] == y[1]:
187 prob = 1.0
188 else:
189 prob = 0.0
190 slope_stderr = 0.0
191 intercept_stderr = 0.0
192 else:
193 df = n - 2 # Number of degrees of freedom
194 # n-2 degrees of freedom because 2 has been used up
195 # to estimate the mean and standard deviation
196 t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
197 t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
199 slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
201 # Also calculate the standard error of the intercept
202 # The following relationship is used:
203 # ssxm = mean( (x-mean(x))^2 )
204 # = ssx - sx*sx
205 # = mean( x^2 ) - mean(x)^2
206 intercept_stderr = slope_stderr * np.sqrt(ssxm + xmean**2)
208 return LinregressResult(slope=slope, intercept=intercept, rvalue=r,
209 pvalue=prob, stderr=slope_stderr,
210 intercept_stderr=intercept_stderr)
213def theilslopes(y, x=None, alpha=0.95, method='separate'):
214 r"""
215 Computes the Theil-Sen estimator for a set of points (x, y).
217 `theilslopes` implements a method for robust linear regression. It
218 computes the slope as the median of all slopes between paired values.
220 Parameters
221 ----------
222 y : array_like
223 Dependent variable.
224 x : array_like or None, optional
225 Independent variable. If None, use ``arange(len(y))`` instead.
226 alpha : float, optional
227 Confidence degree between 0 and 1. Default is 95% confidence.
228 Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
229 interpreted as "find the 90% confidence interval".
230 method : {'joint', 'separate'}, optional
231 Method to be used for computing estimate for intercept.
232 Following methods are supported,
234 * 'joint': Uses np.median(y - slope * x) as intercept.
235 * 'separate': Uses np.median(y) - slope * np.median(x)
236 as intercept.
238 The default is 'separate'.
240 .. versionadded:: 1.8.0
242 Returns
243 -------
244 result : ``TheilslopesResult`` instance
245 The return value is an object with the following attributes:
247 slope : float
248 Theil slope.
249 intercept : float
250 Intercept of the Theil line.
251 low_slope : float
252 Lower bound of the confidence interval on `slope`.
253 high_slope : float
254 Upper bound of the confidence interval on `slope`.
256 See Also
257 --------
258 siegelslopes : a similar technique using repeated medians
260 Notes
261 -----
262 The implementation of `theilslopes` follows [1]_. The intercept is
263 not defined in [1]_, and here it is defined as ``median(y) -
264 slope*median(x)``, which is given in [3]_. Other definitions of
265 the intercept exist in the literature such as ``median(y - slope*x)``
266 in [4]_. The approach to compute the intercept can be determined by the
267 parameter ``method``. A confidence interval for the intercept is not
268 given as this question is not addressed in [1]_.
270 For compatibility with older versions of SciPy, the return value acts
271 like a ``namedtuple`` of length 4, with fields ``slope``, ``intercept``,
272 ``low_slope``, and ``high_slope``, so one can continue to write::
274 slope, intercept, low_slope, high_slope = theilslopes(y, x)
276 References
277 ----------
278 .. [1] P.K. Sen, "Estimates of the regression coefficient based on
279 Kendall's tau", J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.
280 .. [2] H. Theil, "A rank-invariant method of linear and polynomial
281 regression analysis I, II and III", Nederl. Akad. Wetensch., Proc.
282 53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.
283 .. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed.,
284 John Wiley and Sons, New York, pp. 493.
285 .. [4] https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator
287 Examples
288 --------
289 >>> import numpy as np
290 >>> from scipy import stats
291 >>> import matplotlib.pyplot as plt
293 >>> x = np.linspace(-5, 5, num=150)
294 >>> y = x + np.random.normal(size=x.size)
295 >>> y[11:15] += 10 # add outliers
296 >>> y[-5:] -= 7
298 Compute the slope, intercept and 90% confidence interval. For comparison,
299 also compute the least-squares fit with `linregress`:
301 >>> res = stats.theilslopes(y, x, 0.90, method='separate')
302 >>> lsq_res = stats.linregress(x, y)
304 Plot the results. The Theil-Sen regression line is shown in red, with the
305 dashed red lines illustrating the confidence interval of the slope (note
306 that the dashed red lines are not the confidence interval of the regression
307 as the confidence interval of the intercept is not included). The green
308 line shows the least-squares fit for comparison.
310 >>> fig = plt.figure()
311 >>> ax = fig.add_subplot(111)
312 >>> ax.plot(x, y, 'b.')
313 >>> ax.plot(x, res[1] + res[0] * x, 'r-')
314 >>> ax.plot(x, res[1] + res[2] * x, 'r--')
315 >>> ax.plot(x, res[1] + res[3] * x, 'r--')
316 >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
317 >>> plt.show()
319 """
320 if method not in ['joint', 'separate']:
321 raise ValueError(("method must be either 'joint' or 'separate'."
322 "'{}' is invalid.".format(method)))
323 # We copy both x and y so we can use _find_repeats.
324 y = np.array(y).flatten()
325 if x is None:
326 x = np.arange(len(y), dtype=float)
327 else:
328 x = np.array(x, dtype=float).flatten()
329 if len(x) != len(y):
330 raise ValueError("Incompatible lengths ! (%s<>%s)" %
331 (len(y), len(x)))
333 # Compute sorted slopes only when deltax > 0
334 deltax = x[:, np.newaxis] - x
335 deltay = y[:, np.newaxis] - y
336 slopes = deltay[deltax > 0] / deltax[deltax > 0]
337 if not slopes.size:
338 msg = "All `x` coordinates are identical."
339 warnings.warn(msg, RuntimeWarning, stacklevel=2)
340 slopes.sort()
341 medslope = np.median(slopes)
342 if method == 'joint':
343 medinter = np.median(y - medslope * x)
344 else:
345 medinter = np.median(y) - medslope * np.median(x)
346 # Now compute confidence intervals
347 if alpha > 0.5:
348 alpha = 1. - alpha
350 z = distributions.norm.ppf(alpha / 2.)
351 # This implements (2.6) from Sen (1968)
352 _, nxreps = _find_repeats(x)
353 _, nyreps = _find_repeats(y)
354 nt = len(slopes) # N in Sen (1968)
355 ny = len(y) # n in Sen (1968)
356 # Equation 2.6 in Sen (1968):
357 sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -
358 sum(k * (k-1) * (2*k + 5) for k in nxreps) -
359 sum(k * (k-1) * (2*k + 5) for k in nyreps))
360 # Find the confidence interval indices in `slopes`
361 try:
362 sigma = np.sqrt(sigsq)
363 Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
364 Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
365 delta = slopes[[Rl, Ru]]
366 except (ValueError, IndexError):
367 delta = (np.nan, np.nan)
369 return TheilslopesResult(slope=medslope, intercept=medinter,
370 low_slope=delta[0], high_slope=delta[1])
373def _find_repeats(arr):
374 # This function assumes it may clobber its input.
375 if len(arr) == 0:
376 return np.array(0, np.float64), np.array(0, np.intp)
378 # XXX This cast was previously needed for the Fortran implementation,
379 # should we ditch it?
380 arr = np.asarray(arr, np.float64).ravel()
381 arr.sort()
383 # Taken from NumPy 1.9's np.unique.
384 change = np.concatenate(([True], arr[1:] != arr[:-1]))
385 unique = arr[change]
386 change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
387 freq = np.diff(change_idx)
388 atleast2 = freq > 1
389 return unique[atleast2], freq[atleast2]
392def siegelslopes(y, x=None, method="hierarchical"):
393 r"""
394 Computes the Siegel estimator for a set of points (x, y).
396 `siegelslopes` implements a method for robust linear regression
397 using repeated medians (see [1]_) to fit a line to the points (x, y).
398 The method is robust to outliers with an asymptotic breakdown point
399 of 50%.
401 Parameters
402 ----------
403 y : array_like
404 Dependent variable.
405 x : array_like or None, optional
406 Independent variable. If None, use ``arange(len(y))`` instead.
407 method : {'hierarchical', 'separate'}
408 If 'hierarchical', estimate the intercept using the estimated
409 slope ``slope`` (default option).
410 If 'separate', estimate the intercept independent of the estimated
411 slope. See Notes for details.
413 Returns
414 -------
415 result : ``SiegelslopesResult`` instance
416 The return value is an object with the following attributes:
418 slope : float
419 Estimate of the slope of the regression line.
420 intercept : float
421 Estimate of the intercept of the regression line.
423 See Also
424 --------
425 theilslopes : a similar technique without repeated medians
427 Notes
428 -----
429 With ``n = len(y)``, compute ``m_j`` as the median of
430 the slopes from the point ``(x[j], y[j])`` to all other `n-1` points.
431 ``slope`` is then the median of all slopes ``m_j``.
432 Two ways are given to estimate the intercept in [1]_ which can be chosen
433 via the parameter ``method``.
434 The hierarchical approach uses the estimated slope ``slope``
435 and computes ``intercept`` as the median of ``y - slope*x``.
436 The other approach estimates the intercept separately as follows: for
437 each point ``(x[j], y[j])``, compute the intercepts of all the `n-1`
438 lines through the remaining points and take the median ``i_j``.
439 ``intercept`` is the median of the ``i_j``.
441 The implementation computes `n` times the median of a vector of size `n`
442 which can be slow for large vectors. There are more efficient algorithms
443 (see [2]_) which are not implemented here.
445 For compatibility with older versions of SciPy, the return value acts
446 like a ``namedtuple`` of length 2, with fields ``slope`` and
447 ``intercept``, so one can continue to write::
449 slope, intercept = siegelslopes(y, x)
451 References
452 ----------
453 .. [1] A. Siegel, "Robust Regression Using Repeated Medians",
454 Biometrika, Vol. 69, pp. 242-244, 1982.
456 .. [2] A. Stein and M. Werman, "Finding the repeated median regression
457 line", Proceedings of the Third Annual ACM-SIAM Symposium on
458 Discrete Algorithms, pp. 409-413, 1992.
460 Examples
461 --------
462 >>> import numpy as np
463 >>> from scipy import stats
464 >>> import matplotlib.pyplot as plt
466 >>> x = np.linspace(-5, 5, num=150)
467 >>> y = x + np.random.normal(size=x.size)
468 >>> y[11:15] += 10 # add outliers
469 >>> y[-5:] -= 7
471 Compute the slope and intercept. For comparison, also compute the
472 least-squares fit with `linregress`:
474 >>> res = stats.siegelslopes(y, x)
475 >>> lsq_res = stats.linregress(x, y)
477 Plot the results. The Siegel regression line is shown in red. The green
478 line shows the least-squares fit for comparison.
480 >>> fig = plt.figure()
481 >>> ax = fig.add_subplot(111)
482 >>> ax.plot(x, y, 'b.')
483 >>> ax.plot(x, res[1] + res[0] * x, 'r-')
484 >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
485 >>> plt.show()
487 """
488 if method not in ['hierarchical', 'separate']:
489 raise ValueError("method can only be 'hierarchical' or 'separate'")
490 y = np.asarray(y).ravel()
491 if x is None:
492 x = np.arange(len(y), dtype=float)
493 else:
494 x = np.asarray(x, dtype=float).ravel()
495 if len(x) != len(y):
496 raise ValueError("Incompatible lengths ! (%s<>%s)" %
497 (len(y), len(x)))
498 dtype = np.result_type(x, y, np.float32) # use at least float32
499 y, x = y.astype(dtype), x.astype(dtype)
500 medslope, medinter = siegelslopes_pythran(y, x, method)
501 return SiegelslopesResult(slope=medslope, intercept=medinter)