import warnings
from collections import namedtuple
import numpy as np
from scipy import optimize, stats
from scipy._lib._util import check_random_state


def _combine_bounds(name, user_bounds, shape_domain, integral):
    """Intersection of user-defined bounds and distribution PDF/PMF domain"""

    user_bounds = np.atleast_1d(user_bounds)

    if user_bounds[0] > user_bounds[1]:
        message = (f"There are no values for `{name}` on the interval "
                   f"{list(user_bounds)}.")
        raise ValueError(message)

    bounds = (max(user_bounds[0], shape_domain[0]),
              min(user_bounds[1], shape_domain[1]))

    if integral and (np.ceil(bounds[0]) > np.floor(bounds[1])):
        message = (f"There are no integer values for `{name}` on the interval "
                   f"defined by the user-provided bounds and the domain "
                   "of the distribution.")
        raise ValueError(message)
    elif not integral and (bounds[0] > bounds[1]):
        message = (f"There are no values for `{name}` on the interval "
                   f"defined by the user-provided bounds and the domain "
                   "of the distribution.")
        raise ValueError(message)

    if not np.all(np.isfinite(bounds)):
        message = (f"The intersection of user-provided bounds for `{name}` "
                   f"and the domain of the distribution is not finite. Please "
                   f"provide finite bounds for shape `{name}` in `bounds`.")
        raise ValueError(message)

    return bounds
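
# Illustrative note (added comment, not part of the original module): for a
# hypothetical integer-valued shape `n` with domain (0, np.inf),
# `_combine_bounds` intersects the user bounds with the domain and checks that
# at least one integer remains, e.g.
#
#     >>> _combine_bounds('n', (0.5, 30), (0, np.inf), integral=True)
#     (0.5, 30.0)
#
# whereas user bounds such as (0.2, 0.8) with ``integral=True`` would raise a
# ValueError because no integer lies in that interval.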


class FitResult:
    r"""Result of fitting a discrete or continuous distribution to data

    Attributes
    ----------
    params : namedtuple
        A namedtuple containing the maximum likelihood estimates of the
        shape parameters, location, and (if applicable) scale of the
        distribution.
    success : bool or None
        Whether the optimizer considered the optimization to terminate
        successfully or not.
    message : str or None
        Any status message provided by the optimizer.

    """

    def __init__(self, dist, data, discrete, res):
        self._dist = dist
        self._data = data
        self.discrete = discrete
        self.pxf = getattr(dist, "pmf", None) or getattr(dist, "pdf", None)

        shape_names = [] if dist.shapes is None else dist.shapes.split(", ")
        if not discrete:
            FitParams = namedtuple('FitParams', shape_names + ['loc', 'scale'])
        else:
            FitParams = namedtuple('FitParams', shape_names + ['loc'])
        self.params = FitParams(*res.x)

        # Optimizer can report success even when nllf is infinite
        if res.success and not np.isfinite(self.nllf()):
            res.success = False
            res.message = ("Optimization converged to parameter values that "
                           "are inconsistent with the data.")
        self.success = getattr(res, "success", None)
        self.message = getattr(res, "message", None)

    def __repr__(self):
        keys = ["params", "success", "message"]
        m = max(map(len, keys)) + 1
        return '\n'.join([key.rjust(m) + ': ' + repr(getattr(self, key))
                          for key in keys if getattr(self, key) is not None])

    def nllf(self, params=None, data=None):
        """Negative log-likelihood function

        Evaluates the negative of the log-likelihood function of the provided
        data at the provided parameters.

        Parameters
        ----------
        params : tuple, optional
            The shape parameters, location, and (if applicable) scale of the
            distribution as a single tuple. Default is the maximum likelihood
            estimates (``self.params``).
        data : array_like, optional
            The data for which the log-likelihood function is to be evaluated.
            Default is the data to which the distribution was fit.

        Returns
        -------
        nllf : float
            The negative of the log-likelihood function.

        """
        params = params if params is not None else self.params
        data = data if data is not None else self._data
        return self._dist.nnlf(theta=params, x=data)

    def plot(self, ax=None, *, plot_type="hist"):
        """Visually compare the data against the fitted distribution.

        Available only if ``matplotlib`` is installed.

        Parameters
        ----------
        ax : matplotlib.axes.Axes
            Axes object to draw the plot onto, otherwise uses the current Axes.
        plot_type : {"hist", "qq", "pp", "cdf"}
            Type of plot to draw. Options include:

            - "hist": Superposes the PDF/PMF of the fitted distribution
              over a normalized histogram of the data.
            - "qq": Scatter plot of theoretical quantiles against the
              empirical quantiles. Specifically, the x-coordinates are the
              values of the fitted distribution PPF evaluated at the
              percentiles ``(np.arange(1, n) - 0.5)/n``, where ``n`` is the
              number of data points, and the y-coordinates are the sorted
              data points.
            - "pp": Scatter plot of theoretical percentiles against the
              observed percentiles. Specifically, the x-coordinates are the
              percentiles ``(np.arange(1, n) - 0.5)/n``, where ``n`` is
              the number of data points, and the y-coordinates are the values
              of the fitted distribution CDF evaluated at the sorted
              data points.
            - "cdf": Superposes the CDF of the fitted distribution over the
              empirical CDF. Specifically, the x-coordinates of the empirical
              CDF are the sorted data points, and the y-coordinates are the
              percentiles ``(np.arange(1, n) - 0.5)/n``, where ``n`` is
              the number of data points.

        Returns
        -------
        ax : matplotlib.axes.Axes
            The matplotlib Axes object on which the plot was drawn.
        """
        try:
            import matplotlib  # noqa
        except ModuleNotFoundError as exc:
            message = "matplotlib must be installed to use method `plot`."
            raise ModuleNotFoundError(message) from exc

        plots = {'histogram': self._hist_plot, 'qq': self._qq_plot,
                 'pp': self._pp_plot, 'cdf': self._cdf_plot,
                 'hist': self._hist_plot}
        if plot_type.lower() not in plots:
            message = f"`plot_type` must be one of {set(plots.keys())}"
            raise ValueError(message)
        plot = plots[plot_type.lower()]

        if ax is None:
            import matplotlib.pyplot as plt
            ax = plt.gca()

        fit_params = np.atleast_1d(self.params)

        return plot(ax=ax, fit_params=fit_params)

    def _hist_plot(self, ax, fit_params):
        from matplotlib.ticker import MaxNLocator

        support = self._dist.support(*fit_params)
        lb = support[0] if np.isfinite(support[0]) else min(self._data)
        ub = support[1] if np.isfinite(support[1]) else max(self._data)
        pxf = "PMF" if self.discrete else "PDF"

        if self.discrete:
            x = np.arange(lb, ub + 2)
            y = self.pxf(x, *fit_params)
            ax.vlines(x[:-1], 0, y[:-1], label='Fitted Distribution PMF',
                      color='C0')
            options = dict(density=True, bins=x, align='left', color='C1')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.set_xlabel('k')
            ax.set_ylabel('PMF')
        else:
            x = np.linspace(lb, ub, 200)
            y = self.pxf(x, *fit_params)
            ax.plot(x, y, '--', label='Fitted Distribution PDF', color='C0')
            options = dict(density=True, bins=50, align='mid', color='C1')
            ax.set_xlabel('x')
            ax.set_ylabel('PDF')

        if len(self._data) > 50 or self.discrete:
            ax.hist(self._data, label="Histogram of Data", **options)
        else:
            ax.plot(self._data, np.zeros_like(self._data), "*",
                    label='Data', color='C1')

        ax.set_title(rf"Fitted $\tt {self._dist.name}$ {pxf} and Histogram")
        ax.legend(*ax.get_legend_handles_labels())
        return ax

    def _qp_plot(self, ax, fit_params, qq):
        data = np.sort(self._data)
        ps = self._plotting_positions(len(self._data))

        if qq:
            qp = "Quantiles"
            plot_type = 'Q-Q'
            x = self._dist.ppf(ps, *fit_params)
            y = data
        else:
            qp = "Percentiles"
            plot_type = 'P-P'
            x = ps
            y = self._dist.cdf(data, *fit_params)

        ax.plot(x, y, '.', label=f'Fitted Distribution {plot_type}',
                color='C0', zorder=1)
        xlim = ax.get_xlim()
        ylim = ax.get_ylim()
        lim = [min(xlim[0], ylim[0]), max(xlim[1], ylim[1])]
        if not qq:
            lim = max(lim[0], 0), min(lim[1], 1)

        if self.discrete and qq:
            q_min, q_max = int(lim[0]), int(lim[1]+1)
            q_ideal = np.arange(q_min, q_max)
            # q_ideal = np.unique(self._dist.ppf(ps, *fit_params))
            ax.plot(q_ideal, q_ideal, 'o', label='Reference', color='k',
                    alpha=0.25, markerfacecolor='none', clip_on=True)
        elif self.discrete and not qq:
            # The intent of this is to match the plot that would be produced
            # if x were continuous on [0, 1] and y were cdf(ppf(x)).
            # It can be approximated by letting x = np.linspace(0, 1, 1000),
            # but this might not look great when zooming in. The vertical
            # portions are included to indicate where the transitions occur
            # in places where the data completely obscure the horizontal
            # portions.
            p_min, p_max = lim
            a, b = self._dist.support(*fit_params)
            p_min = max(p_min, 0 if np.isfinite(a) else 1e-3)
            p_max = min(p_max, 1 if np.isfinite(b) else 1-1e-3)
            q_min, q_max = self._dist.ppf([p_min, p_max], *fit_params)
            qs = np.arange(q_min-1, q_max+1)
            ps = self._dist.cdf(qs, *fit_params)
            ax.step(ps, ps, '-', label='Reference', color='k', alpha=0.25,
                    clip_on=True)
        else:
            ax.plot(lim, lim, '-', label='Reference', color='k', alpha=0.25,
                    clip_on=True)

        ax.set_xlim(lim)
        ax.set_ylim(lim)
        ax.set_xlabel(rf"Fitted $\tt {self._dist.name}$ Theoretical {qp}")
        ax.set_ylabel(f"Data {qp}")
        ax.set_title(rf"Fitted $\tt {self._dist.name}$ {plot_type} Plot")
        ax.legend(*ax.get_legend_handles_labels())
        ax.set_aspect('equal')
        return ax

    def _qq_plot(self, **kwargs):
        return self._qp_plot(qq=True, **kwargs)

    def _pp_plot(self, **kwargs):
        return self._qp_plot(qq=False, **kwargs)

    def _plotting_positions(self, n, a=.5):
        # See https://en.wikipedia.org/wiki/Q%E2%80%93Q_plot#Plotting_positions
        k = np.arange(1, n+1)
        return (k-a) / (n + 1 - 2*a)
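
    # Note (added for clarity, not in the original source): with the default
    # a = 0.5, (k - a)/(n + 1 - 2*a) reduces to (k - 0.5)/n, the percentiles
    # referred to in the `plot` docstring; e.g., n = 4 gives plotting
    # positions 0.125, 0.375, 0.625, 0.875.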

    def _cdf_plot(self, ax, fit_params):
        data = np.sort(self._data)
        ecdf = self._plotting_positions(len(self._data))
        ls = '--' if len(np.unique(data)) < 30 else '.'
        xlabel = 'k' if self.discrete else 'x'
        ax.step(data, ecdf, ls, label='Empirical CDF', color='C1', zorder=0)

        xlim = ax.get_xlim()
        q = np.linspace(*xlim, 300)
        tcdf = self._dist.cdf(q, *fit_params)

        ax.plot(q, tcdf, label='Fitted Distribution CDF', color='C0', zorder=1)
        ax.set_xlim(xlim)
        ax.set_ylim(0, 1)
        ax.set_xlabel(xlabel)
        ax.set_ylabel("CDF")
        ax.set_title(rf"Fitted $\tt {self._dist.name}$ and Empirical CDF")
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[::-1], labels[::-1])
        return ax


def fit(dist, data, bounds=None, *, guess=None, method='mle',
        optimizer=optimize.differential_evolution):
    r"""Fit a discrete or continuous distribution to data

    Given a distribution, data, and bounds on the parameters of the
    distribution, return maximum likelihood estimates of the parameters.

    Parameters
    ----------
    dist : `scipy.stats.rv_continuous` or `scipy.stats.rv_discrete`
        The object representing the distribution to be fit to the data.
    data : 1D array_like
        The data to which the distribution is to be fit. If the data contain
        any of ``np.nan``, ``np.inf``, or ``-np.inf``, the fit method will
        raise a ``ValueError``.
    bounds : dict or sequence of tuples, optional
        If a dictionary, each key is the name of a parameter of the
        distribution, and the corresponding value is a tuple containing the
        lower and upper bound on that parameter. If the distribution is
        defined only for a finite range of values of that parameter, no entry
        for that parameter is required; e.g., some distributions have
        parameters which must be on the interval [0, 1]. Bounds for parameters
        location (``loc``) and scale (``scale``) are optional; by default,
        they are fixed to 0 and 1, respectively.

        If a sequence, element *i* is a tuple containing the lower and upper
        bound on the *i*\ th parameter of the distribution. In this case,
        bounds for *all* distribution shape parameters must be provided.
        Optionally, bounds for location and scale may follow the
        distribution shape parameters.

        If a shape is to be held fixed (e.g. if it is known), the
        lower and upper bounds may be equal. If a user-provided lower or upper
        bound is beyond a bound of the domain for which the distribution is
        defined, the bound of the distribution's domain will replace the
        user-provided value. Similarly, parameters which must be integral
        will be constrained to integral values within the user-provided bounds.
    guess : dict or array_like, optional
        If a dictionary, each key is the name of a parameter of the
        distribution, and the corresponding value is a guess for the value
        of the parameter.

        If a sequence, element *i* is a guess for the *i*\ th parameter of the
        distribution. In this case, guesses for *all* distribution shape
        parameters must be provided.

        If `guess` is not provided, guesses for the decision variables will
        not be passed to the optimizer. If `guess` is provided, guesses for
        any missing parameters will be set at the mean of the lower and
        upper bounds. Guesses for parameters which must be integral will be
        rounded to integral values, and guesses that lie outside the
        intersection of the user-provided bounds and the domain of the
        distribution will be clipped.
    method : {'mle', 'mse'}
        With ``method="mle"`` (default), the fit is computed by minimizing
        the negative log-likelihood function. A large, finite penalty
        (rather than infinite negative log-likelihood) is applied for
        observations beyond the support of the distribution.
        With ``method="mse"``, the fit is computed by minimizing
        the negative log-product spacing function. The same penalty is applied
        for observations beyond the support. We follow the approach of [1]_,
        which is generalized for samples with repeated observations.
    optimizer : callable, optional
        `optimizer` is a callable that accepts the following positional
        argument.

        fun : callable
            The objective function to be optimized. `fun` accepts one argument
            ``x``, candidate shape parameters of the distribution, and returns
            the objective function value given ``x``, `dist`, and the provided
            `data`.
            The job of `optimizer` is to find values of the decision variables
            that minimize `fun`.

        `optimizer` must also accept the following keyword argument.

        bounds : sequence of tuples
            The bounds on values of the decision variables; each element will
            be a tuple containing the lower and upper bound on a decision
            variable.

        If `guess` is provided, `optimizer` must also accept the following
        keyword argument.

        x0 : array_like
            The guesses for each decision variable.

        If the distribution has any shape parameters that must be integral or
        if the distribution is discrete and the location parameter is not
        fixed, `optimizer` must also accept the following keyword argument.

        integrality : array_like of bools
            For each decision variable, True if the decision variable
            must be constrained to integer values and False if the decision
            variable is continuous.

        `optimizer` must return an object, such as an instance of
        `scipy.optimize.OptimizeResult`, which holds the optimal values of
        the decision variables in an attribute ``x``. If attributes
        ``fun``, ``status``, or ``message`` are provided, they will be
        included in the result object returned by `fit`.

    Returns
    -------
    result : `~scipy.stats._result_classes.FitResult`
        An object with the following fields.

        params : namedtuple
            A namedtuple containing the maximum likelihood estimates of the
            shape parameters, location, and (if applicable) scale of the
            distribution.
        success : bool or None
            Whether the optimizer considered the optimization to terminate
            successfully or not.
        message : str or None
            Any status message provided by the optimizer.

        The object has the following methods:

        nllf(params=None, data=None)
            By default, the negative log-likelihood function at the fitted
            `params` for the given `data`. Accepts a tuple containing
            alternative shapes, location, and scale of the distribution and
            an array of alternative data.

        plot(ax=None)
            Superposes the PDF/PMF of the fitted distribution over a normalized
            histogram of the data.

    See Also
    --------
    rv_continuous, rv_discrete

    Notes
    -----
    Optimization is more likely to converge to the maximum likelihood estimate
    when the user provides tight bounds containing the maximum likelihood
    estimate. For example, when fitting a binomial distribution to data, the
    number of experiments underlying each sample may be known, in which case
    the corresponding shape parameter ``n`` can be fixed.

    References
    ----------
    .. [1] Shao, Yongzhao, and Marjorie G. Hahn. "Maximum product of spacings
           method: a unified formulation with illustration of strong
           consistency." Illinois Journal of Mathematics 43.3 (1999): 489-499.

    Examples
    --------
    Suppose we wish to fit a distribution to the following data.

    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.default_rng()
    >>> dist = stats.nbinom
    >>> shapes = (5, 0.5)
    >>> data = dist.rvs(*shapes, size=1000, random_state=rng)

    Suppose we do not know how the data were generated, but we suspect that
    it follows a negative binomial distribution with parameters *n* and *p*\.
    (See `scipy.stats.nbinom`.) We believe that the parameter *n* was fewer
    than 30, and we know that the parameter *p* must lie on the interval
    [0, 1]. We record this information in a variable `bounds` and pass
    this information to `fit`.

    >>> bounds = [(0, 30), (0, 1)]
    >>> res = stats.fit(dist, data, bounds)

    `fit` searches within the user-specified `bounds` for the
    values that best match the data (in the sense of maximum likelihood
    estimation). In this case, it found shape values similar to those
    from which the data were actually generated.

    >>> res.params
    FitParams(n=5.0, p=0.5028157644634368, loc=0.0)  # may vary

    We can visualize the results by superposing the probability mass function
    of the distribution (with the shapes fit to the data) over a normalized
    histogram of the data.

    >>> import matplotlib.pyplot as plt  # matplotlib must be installed to plot
    >>> res.plot()
    >>> plt.show()

    Note that the estimate for *n* was exactly integral; this is because
    the domain of the `nbinom` PMF includes only integral *n*, and the `nbinom`
    object "knows" that. `nbinom` also knows that the shape *p* must be a
    value between 0 and 1. In such a case - when the domain of the distribution
    with respect to a parameter is finite - we are not required to specify
    bounds for the parameter.

    >>> bounds = {'n': (0, 30)}  # omit parameter p using a `dict`
    >>> res2 = stats.fit(dist, data, bounds)
    >>> res2.params
    FitParams(n=5.0, p=0.5016492009232932, loc=0.0)  # may vary

    If we wish to force the distribution to be fit with *n* fixed at 6, we can
    set both the lower and upper bounds on *n* to 6. Note, however, that the
    value of the objective function being optimized is typically worse (higher)
    in this case.

    >>> bounds = {'n': (6, 6)}  # fix parameter `n`
    >>> res3 = stats.fit(dist, data, bounds)
    >>> res3.params
    FitParams(n=6.0, p=0.5486556076755706, loc=0.0)  # may vary
    >>> res3.nllf() > res.nllf()
    True  # may vary

    Note that the numerical results of the previous examples are typical, but
    they may vary because the default optimizer used by `fit`,
    `scipy.optimize.differential_evolution`, is stochastic. However, we can
    customize the settings used by the optimizer to ensure reproducibility -
    or even use a different optimizer entirely - using the `optimizer`
    parameter.

    >>> from scipy.optimize import differential_evolution
    >>> rng = np.random.default_rng(767585560716548)
    >>> def optimizer(fun, bounds, *, integrality):
    ...     return differential_evolution(fun, bounds, strategy='best2bin',
    ...                                   seed=rng, integrality=integrality)
    >>> bounds = [(0, 30), (0, 1)]
    >>> res4 = stats.fit(dist, data, bounds, optimizer=optimizer)
    >>> res4.params
    FitParams(n=5.0, p=0.5015183149259951, loc=0.0)

    """
    # --- Input Validation / Standardization --- #
    user_bounds = bounds
    user_guess = guess

    # distribution input validation and information collection
    if hasattr(dist, "pdf"):  # can't use isinstance for types
        default_bounds = {'loc': (0, 0), 'scale': (1, 1)}
        discrete = False
    elif hasattr(dist, "pmf"):
        default_bounds = {'loc': (0, 0)}
        discrete = True
    else:
        message = ("`dist` must be an instance of `rv_continuous` "
                   "or `rv_discrete`.")
        raise ValueError(message)

    try:
        param_info = dist._param_info()
    except AttributeError as e:
        message = (f"Distribution `{dist.name}` is not yet supported by "
                   "`scipy.stats.fit` because shape information has "
                   "not been defined.")
        raise ValueError(message) from e

    # data input validation
    data = np.asarray(data)
    if data.ndim != 1:
        message = "`data` must be exactly one-dimensional."
        raise ValueError(message)
    if not (np.issubdtype(data.dtype, np.number)
            and np.all(np.isfinite(data))):
        message = "All elements of `data` must be finite numbers."
        raise ValueError(message)

    # bounds input validation and information collection
    n_params = len(param_info)
    n_shapes = n_params - (1 if discrete else 2)
    param_list = [param.name for param in param_info]
    param_names = ", ".join(param_list)
    shape_names = ", ".join(param_list[:n_shapes])

    if user_bounds is None:
        user_bounds = {}

    if isinstance(user_bounds, dict):
        default_bounds.update(user_bounds)
        user_bounds = default_bounds
        user_bounds_array = np.empty((n_params, 2))
        for i in range(n_params):
            param_name = param_info[i].name
            user_bound = user_bounds.pop(param_name, None)
            if user_bound is None:
                user_bound = param_info[i].domain
            user_bounds_array[i] = user_bound
        if user_bounds:
            message = ("Bounds provided for the following unrecognized "
                       f"parameters will be ignored: {set(user_bounds)}")
            warnings.warn(message, RuntimeWarning, stacklevel=2)

    else:
        try:
            user_bounds = np.asarray(user_bounds, dtype=float)
            if user_bounds.size == 0:
                user_bounds = np.empty((0, 2))
        except ValueError as e:
            message = ("Each element of a `bounds` sequence must be a tuple "
                       "containing two elements: the lower and upper bound of "
                       "a distribution parameter.")
            raise ValueError(message) from e
        if (user_bounds.ndim != 2 or user_bounds.shape[1] != 2):
            message = ("Each element of `bounds` must be a tuple specifying "
                       "the lower and upper bounds of a shape parameter")
            raise ValueError(message)
        if user_bounds.shape[0] < n_shapes:
            message = (f"A `bounds` sequence must contain at least {n_shapes} "
                       "elements: tuples specifying the lower and upper "
                       f"bounds of all shape parameters {shape_names}.")
            raise ValueError(message)
        if user_bounds.shape[0] > n_params:
            message = ("A `bounds` sequence may not contain more than "
                       f"{n_params} elements: tuples specifying the lower and "
                       "upper bounds of distribution parameters "
                       f"{param_names}.")
            raise ValueError(message)

        user_bounds_array = np.empty((n_params, 2))
        user_bounds_array[n_shapes:] = list(default_bounds.values())
        user_bounds_array[:len(user_bounds)] = user_bounds

    user_bounds = user_bounds_array
    validated_bounds = []
    for i in range(n_params):
        name = param_info[i].name
        user_bound = user_bounds_array[i]
        param_domain = param_info[i].domain
        integral = param_info[i].integrality
        combined = _combine_bounds(name, user_bound, param_domain, integral)
        validated_bounds.append(combined)

    bounds = np.asarray(validated_bounds)
    integrality = [param.integrality for param in param_info]

    # guess input validation

    if user_guess is None:
        guess_array = None
    elif isinstance(user_guess, dict):
        default_guess = {param.name: np.mean(bound)
                         for param, bound in zip(param_info, bounds)}
        unrecognized = set(user_guess) - set(default_guess)
        if unrecognized:
            message = ("Guesses provided for the following unrecognized "
                       f"parameters will be ignored: {unrecognized}")
            warnings.warn(message, RuntimeWarning, stacklevel=2)
        default_guess.update(user_guess)

        message = ("Each element of `guess` must be a scalar "
                   "guess for a distribution parameter.")
        try:
            guess_array = np.asarray([default_guess[param.name]
                                      for param in param_info], dtype=float)
        except ValueError as e:
            raise ValueError(message) from e

    else:
        message = ("Each element of `guess` must be a scalar "
                   "guess for a distribution parameter.")
        try:
            user_guess = np.asarray(user_guess, dtype=float)
        except ValueError as e:
            raise ValueError(message) from e
        if user_guess.ndim != 1:
            raise ValueError(message)
        if user_guess.shape[0] < n_shapes:
            message = (f"A `guess` sequence must contain at least {n_shapes} "
                       "elements: scalar guesses for the distribution shape "
                       f"parameters {shape_names}.")
            raise ValueError(message)
        if user_guess.shape[0] > n_params:
            message = ("A `guess` sequence may not contain more than "
                       f"{n_params} elements: scalar guesses for the "
                       f"distribution parameters {param_names}.")
            raise ValueError(message)

        guess_array = np.mean(bounds, axis=1)
        guess_array[:len(user_guess)] = user_guess

    if guess_array is not None:
        guess_rounded = guess_array.copy()

        guess_rounded[integrality] = np.round(guess_rounded[integrality])
        rounded = np.where(guess_rounded != guess_array)[0]
        for i in rounded:
            message = (f"Guess for parameter `{param_info[i].name}` "
                       f"rounded from {guess_array[i]} to {guess_rounded[i]}.")
            warnings.warn(message, RuntimeWarning, stacklevel=2)

        guess_clipped = np.clip(guess_rounded, bounds[:, 0], bounds[:, 1])
        clipped = np.where(guess_clipped != guess_rounded)[0]
        for i in clipped:
            message = (f"Guess for parameter `{param_info[i].name}` "
                       f"clipped from {guess_rounded[i]} to "
                       f"{guess_clipped[i]}.")
            warnings.warn(message, RuntimeWarning, stacklevel=2)

        guess = guess_clipped
    else:
        guess = None

    # --- Fitting --- #
    def nllf(free_params, data=data):  # bind data NOW
        with np.errstate(invalid='ignore', divide='ignore'):
            return dist._penalized_nnlf(free_params, data)

    def nlpsf(free_params, data=data):  # bind data NOW
        with np.errstate(invalid='ignore', divide='ignore'):
            return dist._penalized_nlpsf(free_params, data)

    methods = {'mle': nllf, 'mse': nlpsf}
    objective = methods[method.lower()]

    with np.errstate(invalid='ignore', divide='ignore'):
        kwds = {}
        if bounds is not None:
            kwds['bounds'] = bounds
        if np.any(integrality):
            kwds['integrality'] = integrality
        if guess is not None:
            kwds['x0'] = guess
        res = optimizer(objective, **kwds)

    return FitResult(dist, data, discrete, res)


GoodnessOfFitResult = namedtuple('GoodnessOfFitResult',
                                 ('fit_result', 'statistic', 'pvalue',
                                  'null_distribution'))


def goodness_of_fit(dist, data, *, known_params=None, fit_params=None,
                    guessed_params=None, statistic='ad', n_mc_samples=9999,
                    random_state=None):
    r"""
    Perform a goodness of fit test comparing data to a distribution family.

    Given a distribution family and data, perform a test of the null hypothesis
    that the data were drawn from a distribution in that family. Any known
    parameters of the distribution may be specified. Remaining parameters of
    the distribution will be fit to the data, and the p-value of the test
    is computed accordingly. Several statistics for comparing the distribution
    to data are available.

    Parameters
    ----------
    dist : `scipy.stats.rv_continuous`
        The object representing the distribution family under the null
        hypothesis.
    data : 1D array_like
        Finite, uncensored data to be tested.
    known_params : dict, optional
        A dictionary containing name-value pairs of known distribution
        parameters. Monte Carlo samples are randomly drawn from the
        null-hypothesized distribution with these values of the parameters.
        Before the statistic is evaluated for each Monte Carlo sample, only
        remaining unknown parameters of the null-hypothesized distribution
        family are fit to the samples; the known parameters are held fixed.
        If all parameters of the distribution family are known, then the step
        of fitting the distribution family to each sample is omitted.
    fit_params : dict, optional
        A dictionary containing name-value pairs of distribution parameters
        that have already been fit to the data, e.g. using `scipy.stats.fit`
        or the ``fit`` method of `dist`. Monte Carlo samples are drawn from the
        null-hypothesized distribution with these specified values of the
        parameter. On those Monte Carlo samples, however, these and all other
        unknown parameters of the null-hypothesized distribution family are
        fit before the statistic is evaluated.
    guessed_params : dict, optional
        A dictionary containing name-value pairs of distribution parameters
        which have been guessed. These parameters are always considered as
        free parameters and are fit both to the provided `data` as well as
        to the Monte Carlo samples drawn from the null-hypothesized
        distribution. The purpose of these `guessed_params` is to be used as
        initial values for the numerical fitting procedure.
    statistic : {"ad", "ks", "cvm"}, optional
        The statistic used to compare data to a distribution after fitting
        unknown parameters of the distribution family to the data. The
        Anderson-Darling ("ad"), Kolmogorov-Smirnov ("ks"), and
        Cramer-von Mises ("cvm") statistics are available [1]_.
    n_mc_samples : int, default: 9999
        The number of Monte Carlo samples drawn from the null hypothesized
        distribution to form the null distribution of the statistic. The
        sample size of each is the same as the given `data`.
    random_state : {None, int, `numpy.random.Generator`,
                    `numpy.random.RandomState`}, optional

        Pseudorandom number generator state used to generate the Monte Carlo
        samples.

        If `random_state` is ``None`` (default), the
        `numpy.random.RandomState` singleton is used.
        If `random_state` is an int, a new ``RandomState`` instance is used,
        seeded with `random_state`.
        If `random_state` is already a ``Generator`` or ``RandomState``
        instance, then the provided instance is used.

    Returns
    -------
    res : GoodnessOfFitResult
        An object with the following attributes.

        fit_result : `~scipy.stats._result_classes.FitResult`
            An object representing the fit of the provided `dist` to `data`.
            This object includes the values of distribution family parameters
            that fully define the null-hypothesized distribution, that is,
            the distribution from which Monte Carlo samples are drawn.
        statistic : float
            The value of the statistic comparing provided `data` to the
            null-hypothesized distribution.
        pvalue : float
            The proportion of elements in the null distribution with
            statistic values at least as extreme as the statistic value of the
            provided `data`.
        null_distribution : ndarray
            The value of the statistic for each Monte Carlo sample
            drawn from the null-hypothesized distribution.

    Notes
    -----
    This is a generalized Monte Carlo goodness-of-fit procedure, special cases
    of which correspond with various Anderson-Darling tests, Lilliefors' test,
    etc. The test is described in [2]_, [3]_, and [4]_ as a parametric
    bootstrap test. This is a Monte Carlo test in which parameters that
    specify the distribution from which samples are drawn have been estimated
    from the data. We describe the test using "Monte Carlo" rather than
    "parametric bootstrap" throughout to avoid confusion with the more familiar
    nonparametric bootstrap, and describe how the test is performed below.

    *Traditional goodness of fit tests*

    Traditionally, critical values corresponding with a fixed set of
    significance levels are pre-calculated using Monte Carlo methods. Users
    perform the test by calculating the value of the test statistic only for
    their observed `data` and comparing this value to tabulated critical
    values. This practice is not very flexible, as tables are not available for
    all distributions and combinations of known and unknown parameter values.
    Also, results can be inaccurate when critical values are interpolated from
    limited tabulated data to correspond with the user's sample size and
    fitted parameter values. To overcome these shortcomings, this function
    allows the user to perform the Monte Carlo trials adapted to their
    particular data.

    *Algorithmic overview*

    In brief, this routine executes the following steps:

    1. Fit unknown parameters to the given `data`, thereby forming the
       "null-hypothesized" distribution, and compute the statistic of
       this pair of data and distribution.
    2. Draw random samples from this null-hypothesized distribution.
    3. Fit the unknown parameters to each random sample.
    4. Calculate the statistic between each sample and the distribution that
       has been fit to the sample.
    5. Compare the value of the statistic corresponding with `data` from (1)
       against the values of the statistic corresponding with the random
       samples from (4). The p-value is the proportion of samples with a
       statistic value greater than or equal to the statistic of the observed
       data.

    In more detail, the steps are as follows.

    First, any unknown parameters of the distribution family specified by
    `dist` are fit to the provided `data` using maximum likelihood estimation.
    (One exception is the normal distribution with unknown location and scale:
    we use the bias-corrected standard deviation ``np.std(data, ddof=1)`` for
    the scale as recommended in [1]_.)
    These values of the parameters specify a particular member of the
    distribution family referred to as the "null-hypothesized distribution",
    that is, the distribution from which the data were sampled under the null
    hypothesis. The `statistic`, which compares data to a distribution, is
    computed between `data` and the null-hypothesized distribution.

    Next, many (specifically `n_mc_samples`) new samples, each containing the
    same number of observations as `data`, are drawn from the
    null-hypothesized distribution. All unknown parameters of the distribution
    family `dist` are fit to *each resample*, and the `statistic` is computed
    between each sample and its corresponding fitted distribution. These
    values of the statistic form the Monte Carlo null distribution (not to be
    confused with the "null-hypothesized distribution" above).

    The p-value of the test is the proportion of statistic values in the Monte
    Carlo null distribution that are at least as extreme as the statistic value
    of the provided `data`. More precisely, the p-value is given by

    .. math::

        p = \frac{b + 1}{m + 1}

    where :math:`b` is the number of statistic values in the Monte Carlo null
    distribution that are greater than or equal to the statistic value
    calculated for `data`, and :math:`m` is the number of elements in the
    Monte Carlo null distribution (`n_mc_samples`). The addition of :math:`1`
    to the numerator and denominator can be thought of as including the
    value of the statistic corresponding with `data` in the null distribution,
    but a more formal explanation is given in [5]_.

    *Limitations*

    The test can be very slow for some distribution families because unknown
    parameters of the distribution family must be fit to each of the Monte
    Carlo samples, and for most distributions in SciPy, distribution fitting
    is performed via numerical optimization.

    *Anti-Pattern*

    For this reason, it may be tempting
    to treat parameters of the distribution pre-fit to `data` (by the user)
    as though they were `known_params`, as specification of all parameters of
    the distribution precludes the need to fit the distribution to each Monte
    Carlo sample. (This is essentially how the original Kolmogorov-Smirnov
    test is performed.) Although such a test can provide evidence against the
    null hypothesis, the test is conservative in the sense that small p-values
    will tend to (greatly) *overestimate* the probability of making a type I
    error (that is, rejecting the null hypothesis although it is true), and the
    power of the test is low (that is, it is less likely to reject the null
    hypothesis even when the null hypothesis is false).
    This is because the Monte Carlo samples are less likely to agree with the
    null-hypothesized distribution as well as `data`. This tends to increase
    the values of the statistic recorded in the null distribution, so that a
    larger number of them exceed the value of the statistic for `data`, thereby
    inflating the p-value.

    References
    ----------
    .. [1] M. A. Stephens (1974). "EDF Statistics for Goodness of Fit and
           Some Comparisons." Journal of the American Statistical Association,
           Vol. 69, pp. 730-737.
    .. [2] W. Stute, W. G. Manteiga, and M. P. Quindimil (1993).
           "Bootstrap based goodness-of-fit-tests." Metrika 40.1: 243-256.
    .. [3] C. Genest and B. Rémillard (2008). "Validity of the parametric
           bootstrap for goodness-of-fit testing in semiparametric models."
           Annales de l'IHP Probabilités et statistiques. Vol. 44. No. 6.
    .. [4] I. Kojadinovic and J. Yan (2012). "Goodness-of-fit testing based on
           a weighted bootstrap: A fast large-sample alternative to the
           parametric bootstrap." Canadian Journal of Statistics 40.3: 480-500.
    .. [5] B. Phipson and G. K. Smyth (2010). "Permutation P-values Should
           Never Be Zero: Calculating Exact P-values When Permutations Are
           Randomly Drawn." Statistical Applications in Genetics and Molecular
           Biology 9.1.
    .. [6] H. W. Lilliefors (1967). "On the Kolmogorov-Smirnov test for
           normality with mean and variance unknown." Journal of the American
           Statistical Association 62.318: 399-402.

    Examples
    --------
    A well-known test of the null hypothesis that data were drawn from a
    given distribution is the Kolmogorov-Smirnov (KS) test, available in SciPy
    as `scipy.stats.ks_1samp`. Suppose we wish to test whether the following
    data:

    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.default_rng()
    >>> x = stats.uniform.rvs(size=75, random_state=rng)

    were sampled from a normal distribution. To perform a KS test, the
    empirical distribution function of the observed data will be compared
    against the (theoretical) cumulative distribution function of a normal
    distribution. Of course, to do this, the normal distribution under the null
    hypothesis must be fully specified. This is commonly done by first fitting
    the ``loc`` and ``scale`` parameters of the distribution to the observed
    data, then performing the test.

    >>> loc, scale = np.mean(x), np.std(x, ddof=1)
    >>> cdf = stats.norm(loc, scale).cdf
    >>> stats.ks_1samp(x, cdf)
    KstestResult(statistic=0.1119257570456813, pvalue=0.2827756409939257)

    An advantage of the KS test is that the p-value - the probability of
    obtaining a value of the test statistic under the null hypothesis as
    extreme as the value obtained from the observed data - can be calculated
    exactly and efficiently. `goodness_of_fit` can only approximate these
    results.

    >>> known_params = {'loc': loc, 'scale': scale}
    >>> res = stats.goodness_of_fit(stats.norm, x, known_params=known_params,
    ...                             statistic='ks', random_state=rng)
    >>> res.statistic, res.pvalue
    (0.1119257570456813, 0.2788)

    The statistic matches exactly, but the p-value is estimated by forming
    a "Monte Carlo null distribution", that is, by explicitly drawing random
    samples from `scipy.stats.norm` with the provided parameters and
    calculating the statistic for each. The fraction of these statistic values
    at least as extreme as ``res.statistic`` approximates the exact p-value
    calculated by `scipy.stats.ks_1samp`.

    However, in many cases, we would prefer to test only that the data were
    sampled from one of *any* member of the normal distribution family, not
    specifically from the normal distribution with the location and scale
    fitted to the observed sample. In this case, Lilliefors [6]_ argued that
    the KS test is far too conservative (that is, the p-value overstates
    the actual probability of rejecting a true null hypothesis) and thus lacks
    power - the ability to reject the null hypothesis when the null hypothesis
    is actually false.
    Indeed, our p-value above is approximately 0.28, which is far too large
    to reject the null hypothesis at any common significance level.

    Consider why this might be. Note that in the KS test above, the statistic
    always compares data against the CDF of a normal distribution fitted to the
    *observed data*. This tends to reduce the value of the statistic for the
    observed data, but it is "unfair" when computing the statistic for other
    samples, such as those we randomly draw to form the Monte Carlo null
    distribution. It is easy to correct for this: whenever we compute the KS
    statistic of a sample, we use the CDF of a normal distribution fitted
    to *that sample*. The null distribution in this case has not been
    calculated exactly and is typically approximated using Monte Carlo methods
    as described above. This is where `goodness_of_fit` excels.

    >>> res = stats.goodness_of_fit(stats.norm, x, statistic='ks',
    ...                             random_state=rng)
    >>> res.statistic, res.pvalue
    (0.1119257570456813, 0.0196)

    Indeed, this p-value is much smaller, and small enough to (correctly)
    reject the null hypothesis at common significance levels, including 5% and
    2.5%.

    However, the KS statistic is not very sensitive to all deviations from
    normality. The original advantage of the KS statistic was the ability
    to compute the null distribution theoretically, but a more sensitive
    statistic - resulting in a higher test power - can be used now that we can
    approximate the null distribution
    computationally. The Anderson-Darling statistic [1]_ tends to be more
    sensitive, and critical values of this statistic have been tabulated
    for various significance levels and sample sizes using Monte Carlo methods.

    >>> res = stats.anderson(x, 'norm')
    >>> print(res.statistic)
    1.2139573337497467
    >>> print(res.critical_values)
    [0.549 0.625 0.75 0.875 1.041]
    >>> print(res.significance_level)
    [15. 10. 5. 2.5 1. ]

    Here, the observed value of the statistic exceeds the critical value
    corresponding with a 1% significance level. This tells us that the p-value
    of the observed data is less than 1%, but what is it? We could interpolate
    from these (already-interpolated) values, but `goodness_of_fit` can
    estimate it directly.

    >>> res = stats.goodness_of_fit(stats.norm, x, statistic='ad',
    ...                             random_state=rng)
    >>> res.statistic, res.pvalue
    (1.2139573337497467, 0.0034)

    A further advantage is that use of `goodness_of_fit` is not limited to
    a particular set of distributions or conditions on which parameters
    are known versus which must be estimated from data. Instead,
    `goodness_of_fit` can estimate p-values relatively quickly for any
    distribution with a sufficiently fast and reliable ``fit`` method. For
    instance, here we perform a goodness of fit test using the Cramer-von Mises
    statistic against the Rayleigh distribution with known location and unknown
    scale.

    >>> rng = np.random.default_rng()
    >>> x = stats.chi(df=2.2, loc=0, scale=2).rvs(size=1000, random_state=rng)
    >>> res = stats.goodness_of_fit(stats.rayleigh, x, statistic='cvm',
    ...                             known_params={'loc': 0}, random_state=rng)

    This executes fairly quickly, but to check the reliability of the ``fit``
    method, we should inspect the fit result.

    >>> res.fit_result  # location is as specified, and scale is reasonable
      params: FitParams(loc=0.0, scale=2.1026719844231243)
     success: True
     message: 'The fit was performed successfully.'
    >>> import matplotlib.pyplot as plt  # matplotlib must be installed to plot
    >>> res.fit_result.plot()
    >>> plt.show()

    If the distribution is not fit to the observed data as well as possible,
    the test may not control the type I error rate, that is, the chance of
    rejecting the null hypothesis even when it is true.

    We should also look for extreme outliers in the null distribution that
    may be caused by unreliable fitting. These do not necessarily invalidate
    the result, but they tend to reduce the test's power.

    >>> _, ax = plt.subplots()
    >>> ax.hist(np.log10(res.null_distribution))
    >>> ax.set_xlabel("log10 of CVM statistic under the null hypothesis")
    >>> ax.set_ylabel("Frequency")
    >>> ax.set_title("Histogram of the Monte Carlo null distribution")
    >>> plt.show()

    This plot seems reassuring.

    If the ``fit`` method is working reliably, and if the distribution of the
    test statistic is not particularly sensitive to the values of the fitted
    parameters, then the p-value provided by `goodness_of_fit` is expected to
    be a good approximation.

    >>> res.statistic, res.pvalue
    (0.2231991510248692, 0.0525)

    """
    args = _gof_iv(dist, data, known_params, fit_params, guessed_params,
                   statistic, n_mc_samples, random_state)
    (dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params,
     guessed_rfd_params, statistic, n_mc_samples_int, random_state) = args

    # Fit null hypothesis distribution to data
    nhd_fit_fun = _get_fit_fun(dist, data, guessed_nhd_params,
                               fixed_nhd_params)
    nhd_vals = nhd_fit_fun(data)
    nhd_dist = dist(*nhd_vals)

    def rvs(size):
        return nhd_dist.rvs(size=size, random_state=random_state)

    # Define statistic
    fit_fun = _get_fit_fun(dist, data, guessed_rfd_params, fixed_rfd_params)
    compare_fun = _compare_dict[statistic]

    def statistic_fun(data, axis=-1):
        # Make things simple by always working along the last axis.
        data = np.moveaxis(data, axis, -1)
        rfd_vals = fit_fun(data)
        rfd_dist = dist(*rfd_vals)
        return compare_fun(rfd_dist, data)

    res = stats.monte_carlo_test(data, rvs, statistic_fun, vectorized=True,
                                 n_resamples=n_mc_samples, axis=-1,
                                 alternative='greater')
    opt_res = optimize.OptimizeResult()
    opt_res.success = True
    opt_res.message = "The fit was performed successfully."
    opt_res.x = nhd_vals
    # Only continuous distributions for now, hence discrete=False.
    # There's no fundamental limitation; it's just that we're not using
    # stats.fit, discrete distributions don't have a `fit` method, and
    # we haven't written any vectorized fit functions for a discrete
    # distribution yet.
    return GoodnessOfFitResult(FitResult(dist, data, False, opt_res),
                               res.statistic, res.pvalue,
                               res.null_distribution)


def _get_fit_fun(dist, data, guessed_params, fixed_params):

    shape_names = [] if dist.shapes is None else dist.shapes.split(", ")
    param_names = shape_names + ['loc', 'scale']
    fparam_names = ['f'+name for name in param_names]
    all_fixed = not set(fparam_names).difference(fixed_params)
    guessed_shapes = [guessed_params.pop(x, None)
                      for x in shape_names if x in guessed_params]

    # Define statistic, including fitting distribution to data
    if dist in _fit_funs:
        def fit_fun(data):
            params = _fit_funs[dist](data, **fixed_params)
            params = np.asarray(np.broadcast_arrays(*params))
            if params.ndim > 1:
                params = params[..., np.newaxis]
            return params

    elif all_fixed:
        def fit_fun(data):
            return [fixed_params[name] for name in fparam_names]

    else:
        def fit_fun_1d(data):
            return dist.fit(data, *guessed_shapes, **guessed_params,
                            **fixed_params)

        def fit_fun(data):
            params = np.apply_along_axis(fit_fun_1d, axis=-1, arr=data)
            if params.ndim > 1:
                params = params.T[..., np.newaxis]
            return params

    return fit_fun
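
# Note (added comment, not in the original source): `_get_fit_fun` returns one
# of three fitting strategies: a vectorized closed-form fit registered in
# `_fit_funs` below, a trivial "fit" that simply returns the fixed parameters
# when every parameter is known, or a fallback that applies the distribution's
# `fit` method to each sample via `np.apply_along_axis`.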


# Vectorized fitting functions. These accept N-D `data` in which each row
# (slice along the last axis) is a sample to fit, plus scalar fixed
# parameters. They return a tuple of shape parameter arrays, each of shape
# data.shape[:-1].
def _fit_norm(data, floc=None, fscale=None):
    loc = floc
    scale = fscale
    if loc is None and scale is None:
        loc = np.mean(data, axis=-1)
        scale = np.std(data, ddof=1, axis=-1)
    elif loc is None:
        loc = np.mean(data, axis=-1)
    elif scale is None:
        scale = np.sqrt(((data - loc)**2).mean(axis=-1))
    return loc, scale


_fit_funs = {stats.norm: _fit_norm}  # type: ignore[attr-defined]
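
# Rough sketch (added comment, not in the original source) of what the
# vectorized normal fit returns, assuming 3 Monte Carlo samples of length 5:
#
#     >>> samples = np.random.default_rng(0).normal(size=(3, 5))
#     >>> loc, scale = _fit_norm(samples)
#     >>> loc.shape, scale.shape
#     ((3,), (3,))
#
# `_get_fit_fun` uses this mapping so that a test against `stats.norm` avoids
# numerical optimization entirely.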


# Vectorized goodness of fit statistic functions. These accept a frozen
# distribution object and `data` in which each row (slice along the last axis)
# is a sample.


def _anderson_darling(dist, data):
    x = np.sort(data, axis=-1)
    n = data.shape[-1]
    i = np.arange(1, n+1)
    Si = (2*i - 1)/n * (dist.logcdf(x) + dist.logsf(x[..., ::-1]))
    S = np.sum(Si, axis=-1)
    return -n - S
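
# For reference (added comment, not in the original source): the expression
# above is the usual Anderson-Darling statistic
#     A^2 = -n - (1/n) * sum_{i=1}^{n} (2i - 1)
#                        * [ln F(x_(i)) + ln(1 - F(x_(n+1-i)))],
# where F is the fitted CDF and x_(1) <= ... <= x_(n) is the sorted sample;
# the reversed slice ``x[..., ::-1]`` supplies x_(n+1-i) to ``logsf``.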


def _compute_dplus(cdfvals):  # adapted from _stats_py before gh-17062
    n = cdfvals.shape[-1]
    return (np.arange(1.0, n + 1) / n - cdfvals).max(axis=-1)


def _compute_dminus(cdfvals, axis=-1):
    n = cdfvals.shape[-1]
    return (cdfvals - np.arange(0.0, n)/n).max(axis=-1)


def _kolmogorov_smirnov(dist, data):
    x = np.sort(data, axis=-1)
    cdfvals = dist.cdf(x)
    Dplus = _compute_dplus(cdfvals)  # always works along last axis
    Dminus = _compute_dminus(cdfvals)
    return np.maximum(Dplus, Dminus)
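
# For reference (added comment, not in the original source): with fitted CDF
# values F(x_(i)) of the sorted data, the two one-sided statistics above are
#     D+ = max_i (i/n - F(x_(i)))   and   D- = max_i (F(x_(i)) - (i - 1)/n),
# and the Kolmogorov-Smirnov statistic is D = max(D+, D-).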


def _cramer_von_mises(dist, data):
    x = np.sort(data, axis=-1)
    n = data.shape[-1]
    cdfvals = dist.cdf(x)
    u = (2*np.arange(1, n+1) - 1)/(2*n)
    w = 1 / (12*n) + np.sum((u - cdfvals)**2, axis=-1)
    return w
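
# For reference (added comment, not in the original source): this is the
# standard Cramer-von Mises statistic
#     W^2 = 1/(12 n) + sum_{i=1}^{n} ((2i - 1)/(2 n) - F(x_(i)))^2,
# evaluated with the CDF F of the fitted (frozen) distribution.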


_compare_dict = {"ad": _anderson_darling, "ks": _kolmogorov_smirnov,
                 "cvm": _cramer_von_mises}


def _gof_iv(dist, data, known_params, fit_params, guessed_params, statistic,
            n_mc_samples, random_state):

    if not isinstance(dist, stats.rv_continuous):
        message = ("`dist` must be a (non-frozen) instance of "
                   "`stats.rv_continuous`.")
        raise TypeError(message)

    data = np.asarray(data, dtype=float)
    if not data.ndim == 1:
        message = "`data` must be a one-dimensional array of numbers."
        raise ValueError(message)

    # Leave validation of these key/value pairs to the `fit` method,
    # but collect them into the dictionaries that will be used below.
    known_params = known_params or dict()
    fit_params = fit_params or dict()
    guessed_params = guessed_params or dict()

    known_params_f = {("f"+key): val for key, val in known_params.items()}
    fit_params_f = {("f"+key): val for key, val in fit_params.items()}

    # These are the values of the parameters of the null-hypothesized
    # distribution family with which resamples are drawn
    fixed_nhd_params = known_params_f.copy()
    fixed_nhd_params.update(fit_params_f)

    # These are fixed when fitting the distribution family to resamples
    fixed_rfd_params = known_params_f.copy()

    # These are used as guesses when fitting the distribution family to
    # the original data
    guessed_nhd_params = guessed_params.copy()

    # These are used as guesses when fitting the distribution family to
    # resamples
    guessed_rfd_params = fit_params.copy()
    guessed_rfd_params.update(guessed_params)

    statistics = {'ad', 'ks', 'cvm'}
    if statistic.lower() not in statistics:
        message = f"`statistic` must be one of {statistics}."
        raise ValueError(message)

    n_mc_samples_int = int(n_mc_samples)
    if n_mc_samples_int != n_mc_samples:
        message = "`n_mc_samples` must be an integer."
        raise TypeError(message)

    random_state = check_random_state(random_state)

    return (dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params,
            guessed_rfd_params, statistic, n_mc_samples_int, random_state)