import warnings
from collections import namedtuple
import numpy as np
from scipy import optimize, stats
from scipy._lib._util import check_random_state


def _combine_bounds(name, user_bounds, shape_domain, integral):
    """Intersection of user-defined bounds and distribution PDF/PMF domain"""

    user_bounds = np.atleast_1d(user_bounds)

    if user_bounds[0] > user_bounds[1]:
        message = (f"There are no values for `{name}` on the interval "
                   f"{list(user_bounds)}.")
        raise ValueError(message)

    bounds = (max(user_bounds[0], shape_domain[0]),
              min(user_bounds[1], shape_domain[1]))

    if integral and (np.ceil(bounds[0]) > np.floor(bounds[1])):
        message = (f"There are no integer values for `{name}` on the interval "
                   f"defined by the user-provided bounds and the domain "
                   "of the distribution.")
        raise ValueError(message)
    elif not integral and (bounds[0] > bounds[1]):
        message = (f"There are no values for `{name}` on the interval "
                   f"defined by the user-provided bounds and the domain "
                   "of the distribution.")
        raise ValueError(message)

    if not np.all(np.isfinite(bounds)):
        message = (f"The intersection of user-provided bounds for `{name}` "
                   f"and the domain of the distribution is not finite. Please "
                   f"provide finite bounds for shape `{name}` in `bounds`.")
        raise ValueError(message)

    return bounds
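
# Illustrative note (added comment, not part of the original module): for a
# hypothetical integer-valued shape `n` with domain (0, np.inf),
# `_combine_bounds` intersects the user bounds with the domain and checks that
# at least one integer remains, e.g.
#
#     >>> _combine_bounds('n', (0.5, 30), (0, np.inf), integral=True)
#     (0.5, 30.0)
#
# whereas user bounds such as (0.2, 0.8) with ``integral=True`` would raise a
# ValueError because no integer lies in that interval.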


class FitResult:
    r"""Result of fitting a discrete or continuous distribution to data

    Attributes
    ----------
    params : namedtuple
        A namedtuple containing the maximum likelihood estimates of the
        shape parameters, location, and (if applicable) scale of the
        distribution.
    success : bool or None
        Whether the optimizer considered the optimization to terminate
        successfully or not.
    message : str or None
        Any status message provided by the optimizer.

    """

    def __init__(self, dist, data, discrete, res):
        self._dist = dist
        self._data = data
        self.discrete = discrete
        self.pxf = getattr(dist, "pmf", None) or getattr(dist, "pdf", None)

        shape_names = [] if dist.shapes is None else dist.shapes.split(", ")
        if not discrete:
            FitParams = namedtuple('FitParams', shape_names + ['loc', 'scale'])
        else:
            FitParams = namedtuple('FitParams', shape_names + ['loc'])
        self.params = FitParams(*res.x)

        # Optimizer can report success even when nllf is infinite
        if res.success and not np.isfinite(self.nllf()):
            res.success = False
            res.message = ("Optimization converged to parameter values that "
                           "are inconsistent with the data.")
        self.success = getattr(res, "success", None)
        self.message = getattr(res, "message", None)

    def __repr__(self):
        keys = ["params", "success", "message"]
        m = max(map(len, keys)) + 1
        return '\n'.join([key.rjust(m) + ': ' + repr(getattr(self, key))
                          for key in keys if getattr(self, key) is not None])

    def nllf(self, params=None, data=None):
        """Negative log-likelihood function

        Evaluates the negative of the log-likelihood function of the provided
        data at the provided parameters.

        Parameters
        ----------
        params : tuple, optional
            The shape parameters, location, and (if applicable) scale of the
            distribution as a single tuple. Default is the maximum likelihood
            estimates (``self.params``).
        data : array_like, optional
            The data for which the log-likelihood function is to be evaluated.
            Default is the data to which the distribution was fit.

        Returns
        -------
        nllf : float
            The negative of the log-likelihood function.

        """
        params = params if params is not None else self.params
        data = data if data is not None else self._data
        return self._dist.nnlf(theta=params, x=data)

    def plot(self, ax=None, *, plot_type="hist"):
        """Visually compare the data against the fitted distribution.

        Available only if ``matplotlib`` is installed.

        Parameters
        ----------
        ax : matplotlib.axes.Axes
            Axes object to draw the plot onto, otherwise uses the current Axes.
        plot_type : {"hist", "qq", "pp", "cdf"}
            Type of plot to draw. Options include:

            - "hist": Superposes the PDF/PMF of the fitted distribution
              over a normalized histogram of the data.
            - "qq": Scatter plot of theoretical quantiles against the
              empirical quantiles. Specifically, the x-coordinates are the
              values of the fitted distribution PPF evaluated at the
              percentiles ``(np.arange(1, n) - 0.5)/n``, where ``n`` is the
              number of data points, and the y-coordinates are the sorted
              data points.
            - "pp": Scatter plot of theoretical percentiles against the
              observed percentiles. Specifically, the x-coordinates are the
              percentiles ``(np.arange(1, n) - 0.5)/n``, where ``n`` is
              the number of data points, and the y-coordinates are the values
              of the fitted distribution CDF evaluated at the sorted
              data points.
            - "cdf": Superposes the CDF of the fitted distribution over the
              empirical CDF. Specifically, the x-coordinates of the empirical
              CDF are the sorted data points, and the y-coordinates are the
              percentiles ``(np.arange(1, n) - 0.5)/n``, where ``n`` is
              the number of data points.

        Returns
        -------
        ax : matplotlib.axes.Axes
            The matplotlib Axes object on which the plot was drawn.
        """
        try:
            import matplotlib  # noqa
        except ModuleNotFoundError as exc:
            message = "matplotlib must be installed to use method `plot`."
            raise ModuleNotFoundError(message) from exc

        plots = {'histogram': self._hist_plot, 'qq': self._qq_plot,
                 'pp': self._pp_plot, 'cdf': self._cdf_plot,
                 'hist': self._hist_plot}
        if plot_type.lower() not in plots:
            message = f"`plot_type` must be one of {set(plots.keys())}"
            raise ValueError(message)
        plot = plots[plot_type.lower()]

        if ax is None:
            import matplotlib.pyplot as plt
            ax = plt.gca()

        fit_params = np.atleast_1d(self.params)

        return plot(ax=ax, fit_params=fit_params)

    def _hist_plot(self, ax, fit_params):
        from matplotlib.ticker import MaxNLocator

        support = self._dist.support(*fit_params)
        lb = support[0] if np.isfinite(support[0]) else min(self._data)
        ub = support[1] if np.isfinite(support[1]) else max(self._data)
        pxf = "PMF" if self.discrete else "PDF"

        if self.discrete:
            x = np.arange(lb, ub + 2)
            y = self.pxf(x, *fit_params)
            ax.vlines(x[:-1], 0, y[:-1], label='Fitted Distribution PMF',
                      color='C0')
            options = dict(density=True, bins=x, align='left', color='C1')
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.set_xlabel('k')
            ax.set_ylabel('PMF')
        else:
            x = np.linspace(lb, ub, 200)
            y = self.pxf(x, *fit_params)
            ax.plot(x, y, '--', label='Fitted Distribution PDF', color='C0')
            options = dict(density=True, bins=50, align='mid', color='C1')
            ax.set_xlabel('x')
            ax.set_ylabel('PDF')

        if len(self._data) > 50 or self.discrete:
            ax.hist(self._data, label="Histogram of Data", **options)
        else:
            ax.plot(self._data, np.zeros_like(self._data), "*",
                    label='Data', color='C1')

        ax.set_title(rf"Fitted $\tt {self._dist.name}$ {pxf} and Histogram")
        ax.legend(*ax.get_legend_handles_labels())
        return ax

    def _qp_plot(self, ax, fit_params, qq):
        data = np.sort(self._data)
        ps = self._plotting_positions(len(self._data))

        if qq:
            qp = "Quantiles"
            plot_type = 'Q-Q'
            x = self._dist.ppf(ps, *fit_params)
            y = data
        else:
            qp = "Percentiles"
            plot_type = 'P-P'
            x = ps
            y = self._dist.cdf(data, *fit_params)

        ax.plot(x, y, '.', label=f'Fitted Distribution {plot_type}',
                color='C0', zorder=1)
        xlim = ax.get_xlim()
        ylim = ax.get_ylim()
        lim = [min(xlim[0], ylim[0]), max(xlim[1], ylim[1])]
        if not qq:
            lim = max(lim[0], 0), min(lim[1], 1)

        if self.discrete and qq:
            q_min, q_max = int(lim[0]), int(lim[1]+1)
            q_ideal = np.arange(q_min, q_max)
            # q_ideal = np.unique(self._dist.ppf(ps, *fit_params))
            ax.plot(q_ideal, q_ideal, 'o', label='Reference', color='k',
                    alpha=0.25, markerfacecolor='none', clip_on=True)
        elif self.discrete and not qq:
            # The intent of this is to match the plot that would be produced
            # if x were continuous on [0, 1] and y were cdf(ppf(x)).
            # It can be approximated by letting x = np.linspace(0, 1, 1000),
            # but this might not look great when zooming in. The vertical
            # portions are included to indicate where the transitions occur
            # in places where the data completely obscure the horizontal
            # portions.
            p_min, p_max = lim
            a, b = self._dist.support(*fit_params)
            p_min = max(p_min, 0 if np.isfinite(a) else 1e-3)
            p_max = min(p_max, 1 if np.isfinite(b) else 1-1e-3)
            q_min, q_max = self._dist.ppf([p_min, p_max], *fit_params)
            qs = np.arange(q_min-1, q_max+1)
            ps = self._dist.cdf(qs, *fit_params)
            ax.step(ps, ps, '-', label='Reference', color='k', alpha=0.25,
                    clip_on=True)
        else:
            ax.plot(lim, lim, '-', label='Reference', color='k', alpha=0.25,
                    clip_on=True)

        ax.set_xlim(lim)
        ax.set_ylim(lim)
        ax.set_xlabel(rf"Fitted $\tt {self._dist.name}$ Theoretical {qp}")
        ax.set_ylabel(f"Data {qp}")
        ax.set_title(rf"Fitted $\tt {self._dist.name}$ {plot_type} Plot")
        ax.legend(*ax.get_legend_handles_labels())
        ax.set_aspect('equal')
        return ax

    def _qq_plot(self, **kwargs):
        return self._qp_plot(qq=True, **kwargs)

    def _pp_plot(self, **kwargs):
        return self._qp_plot(qq=False, **kwargs)

    def _plotting_positions(self, n, a=.5):
        # See https://en.wikipedia.org/wiki/Q%E2%80%93Q_plot#Plotting_positions
        k = np.arange(1, n+1)
        return (k-a) / (n + 1 - 2*a)
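
    # Note (added for clarity, not in the original source): with the default
    # a = 0.5, (k - a)/(n + 1 - 2*a) reduces to (k - 0.5)/n, the percentiles
    # referred to in the `plot` docstring; e.g., n = 4 gives plotting
    # positions 0.125, 0.375, 0.625, 0.875.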

    def _cdf_plot(self, ax, fit_params):
        data = np.sort(self._data)
        ecdf = self._plotting_positions(len(self._data))
        ls = '--' if len(np.unique(data)) < 30 else '.'
        xlabel = 'k' if self.discrete else 'x'
        ax.step(data, ecdf, ls, label='Empirical CDF', color='C1', zorder=0)

        xlim = ax.get_xlim()
        q = np.linspace(*xlim, 300)
        tcdf = self._dist.cdf(q, *fit_params)

        ax.plot(q, tcdf, label='Fitted Distribution CDF', color='C0', zorder=1)
        ax.set_xlim(xlim)
        ax.set_ylim(0, 1)
        ax.set_xlabel(xlabel)
        ax.set_ylabel("CDF")
        ax.set_title(rf"Fitted $\tt {self._dist.name}$ and Empirical CDF")
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[::-1], labels[::-1])
        return ax


def fit(dist, data, bounds=None, *, guess=None, method='mle',
        optimizer=optimize.differential_evolution):
    r"""Fit a discrete or continuous distribution to data

    Given a distribution, data, and bounds on the parameters of the
    distribution, return maximum likelihood estimates of the parameters.

    Parameters
    ----------
    dist : `scipy.stats.rv_continuous` or `scipy.stats.rv_discrete`
        The object representing the distribution to be fit to the data.
    data : 1D array_like
        The data to which the distribution is to be fit. If the data contain
        any of ``np.nan``, ``np.inf``, or ``-np.inf``, the fit method will
        raise a ``ValueError``.
    bounds : dict or sequence of tuples, optional
        If a dictionary, each key is the name of a parameter of the
        distribution, and the corresponding value is a tuple containing the
        lower and upper bound on that parameter. If the distribution is
        defined only for a finite range of values of that parameter, no entry
        for that parameter is required; e.g., some distributions have
        parameters which must be on the interval [0, 1]. Bounds for parameters
        location (``loc``) and scale (``scale``) are optional; by default,
        they are fixed to 0 and 1, respectively.

        If a sequence, element *i* is a tuple containing the lower and upper
        bound on the *i*\ th parameter of the distribution. In this case,
        bounds for *all* distribution shape parameters must be provided.
        Optionally, bounds for location and scale may follow the
        distribution shape parameters.

        If a shape is to be held fixed (e.g. if it is known), the
        lower and upper bounds may be equal. If a user-provided lower or upper
        bound is beyond a bound of the domain for which the distribution is
        defined, the bound of the distribution's domain will replace the
        user-provided value. Similarly, parameters which must be integral
        will be constrained to integral values within the user-provided bounds.
    guess : dict or array_like, optional
        If a dictionary, each key is the name of a parameter of the
        distribution, and the corresponding value is a guess for the value
        of the parameter.

        If a sequence, element *i* is a guess for the *i*\ th parameter of the
        distribution. In this case, guesses for *all* distribution shape
        parameters must be provided.

        If `guess` is not provided, guesses for the decision variables will
        not be passed to the optimizer. If `guess` is provided, guesses for
        any missing parameters will be set at the mean of the lower and
        upper bounds. Guesses for parameters which must be integral will be
        rounded to integral values, and guesses that lie outside the
        intersection of the user-provided bounds and the domain of the
        distribution will be clipped.
    method : {'mle', 'mse'}
        With ``method="mle"`` (default), the fit is computed by minimizing
        the negative log-likelihood function. A large, finite penalty
        (rather than infinite negative log-likelihood) is applied for
        observations beyond the support of the distribution.
        With ``method="mse"``, the fit is computed by minimizing
        the negative log-product spacing function. The same penalty is applied
        for observations beyond the support. We follow the approach of [1]_,
        which is generalized for samples with repeated observations.
    optimizer : callable, optional
        `optimizer` is a callable that accepts the following positional
        argument.

        fun : callable
            The objective function to be optimized. `fun` accepts one argument
            ``x``, candidate shape parameters of the distribution, and returns
            the objective function value given ``x``, `dist`, and the provided
            `data`.
            The job of `optimizer` is to find values of the decision variables
            that minimize `fun`.

        `optimizer` must also accept the following keyword argument.

        bounds : sequence of tuples
            The bounds on values of the decision variables; each element will
            be a tuple containing the lower and upper bound on a decision
            variable.

        If `guess` is provided, `optimizer` must also accept the following
        keyword argument.

        x0 : array_like
            The guesses for each decision variable.

        If the distribution has any shape parameters that must be integral or
        if the distribution is discrete and the location parameter is not
        fixed, `optimizer` must also accept the following keyword argument.

        integrality : array_like of bools
            For each decision variable, True if the decision variable
            must be constrained to integer values and False if the decision
            variable is continuous.

        `optimizer` must return an object, such as an instance of
        `scipy.optimize.OptimizeResult`, which holds the optimal values of
        the decision variables in an attribute ``x``. If attributes
        ``fun``, ``status``, or ``message`` are provided, they will be
        included in the result object returned by `fit`.

    Returns
    -------
    result : `~scipy.stats._result_classes.FitResult`
        An object with the following fields.

        params : namedtuple
            A namedtuple containing the maximum likelihood estimates of the
            shape parameters, location, and (if applicable) scale of the
            distribution.
        success : bool or None
            Whether the optimizer considered the optimization to terminate
            successfully or not.
        message : str or None
            Any status message provided by the optimizer.

        The object has the following methods:

        nllf(params=None, data=None)
            By default, the negative log-likelihood function at the fitted
            `params` for the given `data`. Accepts a tuple containing
            alternative shapes, location, and scale of the distribution and
            an array of alternative data.

        plot(ax=None)
            Superposes the PDF/PMF of the fitted distribution over a normalized
            histogram of the data.

    See Also
    --------
    rv_continuous, rv_discrete

    Notes
    -----
    Optimization is more likely to converge to the maximum likelihood estimate
    when the user provides tight bounds containing the maximum likelihood
    estimate. For example, when fitting a binomial distribution to data, the
    number of experiments underlying each sample may be known, in which case
    the corresponding shape parameter ``n`` can be fixed.

    References
    ----------
    .. [1] Shao, Yongzhao, and Marjorie G. Hahn. "Maximum product of spacings
           method: a unified formulation with illustration of strong
           consistency." Illinois Journal of Mathematics 43.3 (1999): 489-499.

    Examples
    --------
    Suppose we wish to fit a distribution to the following data.

    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.default_rng()
    >>> dist = stats.nbinom
    >>> shapes = (5, 0.5)
    >>> data = dist.rvs(*shapes, size=1000, random_state=rng)

    Suppose we do not know how the data were generated, but we suspect that
    it follows a negative binomial distribution with parameters *n* and *p*\.
    (See `scipy.stats.nbinom`.) We believe that the parameter *n* was fewer
    than 30, and we know that the parameter *p* must lie on the interval
    [0, 1]. We record this information in a variable `bounds` and pass
    this information to `fit`.

    >>> bounds = [(0, 30), (0, 1)]
    >>> res = stats.fit(dist, data, bounds)

    `fit` searches within the user-specified `bounds` for the
    values that best match the data (in the sense of maximum likelihood
    estimation). In this case, it found shape values similar to those
    from which the data were actually generated.

    >>> res.params
    FitParams(n=5.0, p=0.5028157644634368, loc=0.0)  # may vary

    We can visualize the results by superposing the probability mass function
    of the distribution (with the shapes fit to the data) over a normalized
    histogram of the data.

    >>> import matplotlib.pyplot as plt  # matplotlib must be installed to plot
    >>> res.plot()
    >>> plt.show()

    Note that the estimate for *n* was exactly integral; this is because
    the domain of the `nbinom` PMF includes only integral *n*, and the `nbinom`
    object "knows" that. `nbinom` also knows that the shape *p* must be a
    value between 0 and 1. In such a case - when the domain of the distribution
    with respect to a parameter is finite - we are not required to specify
    bounds for the parameter.

    >>> bounds = {'n': (0, 30)}  # omit parameter p using a `dict`
    >>> res2 = stats.fit(dist, data, bounds)
    >>> res2.params
    FitParams(n=5.0, p=0.5016492009232932, loc=0.0)  # may vary

    If we wish to force the distribution to be fit with *n* fixed at 6, we can
    set both the lower and upper bounds on *n* to 6. Note, however, that the
    value of the objective function being optimized is typically worse (higher)
    in this case.

    >>> bounds = {'n': (6, 6)}  # fix parameter `n`
    >>> res3 = stats.fit(dist, data, bounds)
    >>> res3.params
    FitParams(n=6.0, p=0.5486556076755706, loc=0.0)  # may vary
    >>> res3.nllf() > res.nllf()
    True  # may vary

    Note that the numerical results of the previous examples are typical, but
    they may vary because the default optimizer used by `fit`,
    `scipy.optimize.differential_evolution`, is stochastic. However, we can
    customize the settings used by the optimizer to ensure reproducibility -
    or even use a different optimizer entirely - using the `optimizer`
    parameter.

    >>> from scipy.optimize import differential_evolution
    >>> rng = np.random.default_rng(767585560716548)
    >>> def optimizer(fun, bounds, *, integrality):
    ...     return differential_evolution(fun, bounds, strategy='best2bin',
    ...                                   seed=rng, integrality=integrality)
    >>> bounds = [(0, 30), (0, 1)]
    >>> res4 = stats.fit(dist, data, bounds, optimizer=optimizer)
    >>> res4.params
    FitParams(n=5.0, p=0.5015183149259951, loc=0.0)

    """
    # --- Input Validation / Standardization --- #
    user_bounds = bounds
    user_guess = guess

    # distribution input validation and information collection
    if hasattr(dist, "pdf"):  # can't use isinstance for types
        default_bounds = {'loc': (0, 0), 'scale': (1, 1)}
        discrete = False
    elif hasattr(dist, "pmf"):
        default_bounds = {'loc': (0, 0)}
        discrete = True
    else:
        message = ("`dist` must be an instance of `rv_continuous` "
                   "or `rv_discrete`.")
        raise ValueError(message)

    try:
        param_info = dist._param_info()
    except AttributeError as e:
        message = (f"Distribution `{dist.name}` is not yet supported by "
                   "`scipy.stats.fit` because shape information has "
                   "not been defined.")
        raise ValueError(message) from e

    # data input validation
    data = np.asarray(data)
    if data.ndim != 1:
        message = "`data` must be exactly one-dimensional."
        raise ValueError(message)
    if not (np.issubdtype(data.dtype, np.number)
            and np.all(np.isfinite(data))):
        message = "All elements of `data` must be finite numbers."
        raise ValueError(message)

    # bounds input validation and information collection
    n_params = len(param_info)
    n_shapes = n_params - (1 if discrete else 2)
    param_list = [param.name for param in param_info]
    param_names = ", ".join(param_list)
    shape_names = ", ".join(param_list[:n_shapes])

    if user_bounds is None:
        user_bounds = {}

    if isinstance(user_bounds, dict):
        default_bounds.update(user_bounds)
        user_bounds = default_bounds
        user_bounds_array = np.empty((n_params, 2))
        for i in range(n_params):
            param_name = param_info[i].name
            user_bound = user_bounds.pop(param_name, None)
            if user_bound is None:
                user_bound = param_info[i].domain
            user_bounds_array[i] = user_bound
        if user_bounds:
            message = ("Bounds provided for the following unrecognized "
                       f"parameters will be ignored: {set(user_bounds)}")
            warnings.warn(message, RuntimeWarning, stacklevel=2)

    else:
        try:
            user_bounds = np.asarray(user_bounds, dtype=float)
            if user_bounds.size == 0:
                user_bounds = np.empty((0, 2))
        except ValueError as e:
            message = ("Each element of a `bounds` sequence must be a tuple "
                       "containing two elements: the lower and upper bound of "
                       "a distribution parameter.")
            raise ValueError(message) from e
        if (user_bounds.ndim != 2 or user_bounds.shape[1] != 2):
            message = ("Each element of `bounds` must be a tuple specifying "
                       "the lower and upper bounds of a shape parameter")
            raise ValueError(message)
        if user_bounds.shape[0] < n_shapes:
            message = (f"A `bounds` sequence must contain at least {n_shapes} "
                       "elements: tuples specifying the lower and upper "
                       f"bounds of all shape parameters {shape_names}.")
            raise ValueError(message)
        if user_bounds.shape[0] > n_params:
            message = ("A `bounds` sequence may not contain more than "
                       f"{n_params} elements: tuples specifying the lower and "
                       "upper bounds of distribution parameters "
                       f"{param_names}.")
            raise ValueError(message)

        user_bounds_array = np.empty((n_params, 2))
        user_bounds_array[n_shapes:] = list(default_bounds.values())
        user_bounds_array[:len(user_bounds)] = user_bounds

    user_bounds = user_bounds_array
    validated_bounds = []
    for i in range(n_params):
        name = param_info[i].name
        user_bound = user_bounds_array[i]
        param_domain = param_info[i].domain
        integral = param_info[i].integrality
        combined = _combine_bounds(name, user_bound, param_domain, integral)
        validated_bounds.append(combined)

    bounds = np.asarray(validated_bounds)
    integrality = [param.integrality for param in param_info]

    # guess input validation

    if user_guess is None:
        guess_array = None
    elif isinstance(user_guess, dict):
        default_guess = {param.name: np.mean(bound)
                         for param, bound in zip(param_info, bounds)}
        unrecognized = set(user_guess) - set(default_guess)
        if unrecognized:
            message = ("Guesses provided for the following unrecognized "
                       f"parameters will be ignored: {unrecognized}")
            warnings.warn(message, RuntimeWarning, stacklevel=2)
        default_guess.update(user_guess)

        message = ("Each element of `guess` must be a scalar "
                   "guess for a distribution parameter.")
        try:
            guess_array = np.asarray([default_guess[param.name]
                                      for param in param_info], dtype=float)
        except ValueError as e:
            raise ValueError(message) from e

    else:
        message = ("Each element of `guess` must be a scalar "
                   "guess for a distribution parameter.")
        try:
            user_guess = np.asarray(user_guess, dtype=float)
        except ValueError as e:
            raise ValueError(message) from e
        if user_guess.ndim != 1:
            raise ValueError(message)
        if user_guess.shape[0] < n_shapes:
            message = (f"A `guess` sequence must contain at least {n_shapes} "
                       "elements: scalar guesses for the distribution shape "
                       f"parameters {shape_names}.")
            raise ValueError(message)
        if user_guess.shape[0] > n_params:
            message = ("A `guess` sequence may not contain more than "
                       f"{n_params} elements: scalar guesses for the "
                       f"distribution parameters {param_names}.")
            raise ValueError(message)

        guess_array = np.mean(bounds, axis=1)
        guess_array[:len(user_guess)] = user_guess

    if guess_array is not None:
        guess_rounded = guess_array.copy()

        guess_rounded[integrality] = np.round(guess_rounded[integrality])
        rounded = np.where(guess_rounded != guess_array)[0]
        for i in rounded:
            message = (f"Guess for parameter `{param_info[i].name}` "
                       f"rounded from {guess_array[i]} to {guess_rounded[i]}.")
            warnings.warn(message, RuntimeWarning, stacklevel=2)

        guess_clipped = np.clip(guess_rounded, bounds[:, 0], bounds[:, 1])
        clipped = np.where(guess_clipped != guess_rounded)[0]
        for i in clipped:
            message = (f"Guess for parameter `{param_info[i].name}` "
                       f"clipped from {guess_rounded[i]} to "
                       f"{guess_clipped[i]}.")
            warnings.warn(message, RuntimeWarning, stacklevel=2)

        guess = guess_clipped
    else:
        guess = None

    # --- Fitting --- #
    def nllf(free_params, data=data):  # bind data NOW
        with np.errstate(invalid='ignore', divide='ignore'):
            return dist._penalized_nnlf(free_params, data)

    def nlpsf(free_params, data=data):  # bind data NOW
        with np.errstate(invalid='ignore', divide='ignore'):
            return dist._penalized_nlpsf(free_params, data)

    methods = {'mle': nllf, 'mse': nlpsf}
    objective = methods[method.lower()]

    with np.errstate(invalid='ignore', divide='ignore'):
        kwds = {}
        if bounds is not None:
            kwds['bounds'] = bounds
        if np.any(integrality):
            kwds['integrality'] = integrality
        if guess is not None:
            kwds['x0'] = guess
        res = optimizer(objective, **kwds)

    return FitResult(dist, data, discrete, res)


GoodnessOfFitResult = namedtuple('GoodnessOfFitResult',
                                 ('fit_result', 'statistic', 'pvalue',
                                  'null_distribution'))


def goodness_of_fit(dist, data, *, known_params=None, fit_params=None,
                    guessed_params=None, statistic='ad', n_mc_samples=9999,
                    random_state=None):
    r"""
    Perform a goodness of fit test comparing data to a distribution family.

    Given a distribution family and data, perform a test of the null hypothesis
    that the data were drawn from a distribution in that family. Any known
    parameters of the distribution may be specified. Remaining parameters of
    the distribution will be fit to the data, and the p-value of the test
    is computed accordingly. Several statistics for comparing the distribution
    to data are available.

    Parameters
    ----------
    dist : `scipy.stats.rv_continuous`
        The object representing the distribution family under the null
        hypothesis.
    data : 1D array_like
        Finite, uncensored data to be tested.
    known_params : dict, optional
        A dictionary containing name-value pairs of known distribution
        parameters. Monte Carlo samples are randomly drawn from the
        null-hypothesized distribution with these values of the parameters.
        Before the statistic is evaluated for each Monte Carlo sample, only
        remaining unknown parameters of the null-hypothesized distribution
        family are fit to the samples; the known parameters are held fixed.
        If all parameters of the distribution family are known, then the step
        of fitting the distribution family to each sample is omitted.
    fit_params : dict, optional
        A dictionary containing name-value pairs of distribution parameters
        that have already been fit to the data, e.g. using `scipy.stats.fit`
        or the ``fit`` method of `dist`. Monte Carlo samples are drawn from the
        null-hypothesized distribution with these specified values of the
        parameter. On those Monte Carlo samples, however, these and all other
        unknown parameters of the null-hypothesized distribution family are
        fit before the statistic is evaluated.
    guessed_params : dict, optional
        A dictionary containing name-value pairs of distribution parameters
        which have been guessed. These parameters are always considered as
        free parameters and are fit both to the provided `data` as well as
        to the Monte Carlo samples drawn from the null-hypothesized
        distribution. The purpose of these `guessed_params` is to be used as
        initial values for the numerical fitting procedure.
    statistic : {"ad", "ks", "cvm"}, optional
        The statistic used to compare data to a distribution after fitting
        unknown parameters of the distribution family to the data. The
        Anderson-Darling ("ad"), Kolmogorov-Smirnov ("ks"), and
        Cramer-von Mises ("cvm") statistics are available [1]_.
    n_mc_samples : int, default: 9999
        The number of Monte Carlo samples drawn from the null hypothesized
        distribution to form the null distribution of the statistic. The
        sample size of each is the same as the given `data`.
    random_state : {None, int, `numpy.random.Generator`,
                    `numpy.random.RandomState`}, optional

        Pseudorandom number generator state used to generate the Monte Carlo
        samples.

        If `random_state` is ``None`` (default), the
        `numpy.random.RandomState` singleton is used.
        If `random_state` is an int, a new ``RandomState`` instance is used,
        seeded with `random_state`.
        If `random_state` is already a ``Generator`` or ``RandomState``
        instance, then the provided instance is used.

    Returns
    -------
    res : GoodnessOfFitResult
        An object with the following attributes.

        fit_result : `~scipy.stats._result_classes.FitResult`
            An object representing the fit of the provided `dist` to `data`.
            This object includes the values of distribution family parameters
            that fully define the null-hypothesized distribution, that is,
            the distribution from which Monte Carlo samples are drawn.
        statistic : float
            The value of the statistic comparing provided `data` to the
            null-hypothesized distribution.
        pvalue : float
            The proportion of elements in the null distribution with
            statistic values at least as extreme as the statistic value of the
            provided `data`.
        null_distribution : ndarray
            The value of the statistic for each Monte Carlo sample
            drawn from the null-hypothesized distribution.

    Notes
    -----
    This is a generalized Monte Carlo goodness-of-fit procedure, special cases
    of which correspond with various Anderson-Darling tests, Lilliefors' test,
    etc. The test is described in [2]_, [3]_, and [4]_ as a parametric
    bootstrap test. This is a Monte Carlo test in which parameters that
    specify the distribution from which samples are drawn have been estimated
    from the data. We describe the test using "Monte Carlo" rather than
    "parametric bootstrap" throughout to avoid confusion with the more familiar
    nonparametric bootstrap, and describe how the test is performed below.

    *Traditional goodness of fit tests*

    Traditionally, critical values corresponding with a fixed set of
    significance levels are pre-calculated using Monte Carlo methods. Users
    perform the test by calculating the value of the test statistic only for
    their observed `data` and comparing this value to tabulated critical
    values. This practice is not very flexible, as tables are not available for
    all distributions and combinations of known and unknown parameter values.
    Also, results can be inaccurate when critical values are interpolated from
    limited tabulated data to correspond with the user's sample size and
    fitted parameter values. To overcome these shortcomings, this function
    allows the user to perform the Monte Carlo trials adapted to their
    particular data.

    *Algorithmic overview*

    In brief, this routine executes the following steps:

    1. Fit unknown parameters to the given `data`, thereby forming the
       "null-hypothesized" distribution, and compute the statistic of
       this pair of data and distribution.
    2. Draw random samples from this null-hypothesized distribution.
    3. Fit the unknown parameters to each random sample.
    4. Calculate the statistic between each sample and the distribution that
       has been fit to the sample.
    5. Compare the value of the statistic corresponding with `data` from (1)
       against the values of the statistic corresponding with the random
       samples from (4). The p-value is the proportion of samples with a
       statistic value greater than or equal to the statistic of the observed
       data.

    In more detail, the steps are as follows.

    First, any unknown parameters of the distribution family specified by
    `dist` are fit to the provided `data` using maximum likelihood estimation.
    (One exception is the normal distribution with unknown location and scale:
    we use the bias-corrected standard deviation ``np.std(data, ddof=1)`` for
    the scale as recommended in [1]_.)
    These values of the parameters specify a particular member of the
    distribution family referred to as the "null-hypothesized distribution",
    that is, the distribution from which the data were sampled under the null
    hypothesis. The `statistic`, which compares data to a distribution, is
    computed between `data` and the null-hypothesized distribution.

    Next, many (specifically `n_mc_samples`) new samples, each containing the
    same number of observations as `data`, are drawn from the
    null-hypothesized distribution. All unknown parameters of the distribution
    family `dist` are fit to *each resample*, and the `statistic` is computed
    between each sample and its corresponding fitted distribution. These
    values of the statistic form the Monte Carlo null distribution (not to be
    confused with the "null-hypothesized distribution" above).

    The p-value of the test is the proportion of statistic values in the Monte
    Carlo null distribution that are at least as extreme as the statistic value
    of the provided `data`. More precisely, the p-value is given by

    .. math::

        p = \frac{b + 1}{m + 1}

    where :math:`b` is the number of statistic values in the Monte Carlo null
    distribution that are greater than or equal to the statistic value
    calculated for `data`, and :math:`m` is the number of elements in the
    Monte Carlo null distribution (`n_mc_samples`). The addition of :math:`1`
    to the numerator and denominator can be thought of as including the
    value of the statistic corresponding with `data` in the null distribution,
    but a more formal explanation is given in [5]_.

    *Limitations*

    The test can be very slow for some distribution families because unknown
    parameters of the distribution family must be fit to each of the Monte
    Carlo samples, and for most distributions in SciPy, distribution fitting
    is performed via numerical optimization.

    *Anti-Pattern*

    For this reason, it may be tempting
    to treat parameters of the distribution pre-fit to `data` (by the user)
    as though they were `known_params`, as specification of all parameters of
    the distribution precludes the need to fit the distribution to each Monte
    Carlo sample. (This is essentially how the original Kolmogorov-Smirnov
    test is performed.) Although such a test can provide evidence against the
    null hypothesis, the test is conservative in the sense that small p-values
    will tend to (greatly) *overestimate* the probability of making a type I
    error (that is, rejecting the null hypothesis although it is true), and the
    power of the test is low (that is, it is less likely to reject the null
    hypothesis even when the null hypothesis is false).
    This is because the Monte Carlo samples are less likely to agree with the
    null-hypothesized distribution as well as `data`. This tends to increase
    the values of the statistic recorded in the null distribution, so that a
    larger number of them exceed the value of the statistic for `data`, thereby
    inflating the p-value.

    References
    ----------
    .. [1] M. A. Stephens (1974). "EDF Statistics for Goodness of Fit and
           Some Comparisons." Journal of the American Statistical Association,
           Vol. 69, pp. 730-737.
    .. [2] W. Stute, W. G. Manteiga, and M. P. Quindimil (1993).
           "Bootstrap based goodness-of-fit-tests." Metrika 40.1: 243-256.
    .. [3] C. Genest and B. Rémillard (2008). "Validity of the parametric
           bootstrap for goodness-of-fit testing in semiparametric models."
           Annales de l'IHP Probabilités et statistiques. Vol. 44. No. 6.
    .. [4] I. Kojadinovic and J. Yan (2012). "Goodness-of-fit testing based on
           a weighted bootstrap: A fast large-sample alternative to the
           parametric bootstrap." Canadian Journal of Statistics 40.3: 480-500.
    .. [5] B. Phipson and G. K. Smyth (2010). "Permutation P-values Should
           Never Be Zero: Calculating Exact P-values When Permutations Are
           Randomly Drawn." Statistical Applications in Genetics and Molecular
           Biology 9.1.
    .. [6] H. W. Lilliefors (1967). "On the Kolmogorov-Smirnov test for
           normality with mean and variance unknown." Journal of the American
           Statistical Association 62.318: 399-402.

    Examples
    --------
    A well-known test of the null hypothesis that data were drawn from a
    given distribution is the Kolmogorov-Smirnov (KS) test, available in SciPy
    as `scipy.stats.ks_1samp`. Suppose we wish to test whether the following
    data:

    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.default_rng()
    >>> x = stats.uniform.rvs(size=75, random_state=rng)

    were sampled from a normal distribution. To perform a KS test, the
    empirical distribution function of the observed data will be compared
    against the (theoretical) cumulative distribution function of a normal
    distribution. Of course, to do this, the normal distribution under the null
    hypothesis must be fully specified. This is commonly done by first fitting
    the ``loc`` and ``scale`` parameters of the distribution to the observed
    data, then performing the test.

    >>> loc, scale = np.mean(x), np.std(x, ddof=1)
    >>> cdf = stats.norm(loc, scale).cdf
    >>> stats.ks_1samp(x, cdf)
    KstestResult(statistic=0.1119257570456813, pvalue=0.2827756409939257)

    An advantage of the KS test is that the p-value - the probability of
    obtaining a value of the test statistic under the null hypothesis as
    extreme as the value obtained from the observed data - can be calculated
    exactly and efficiently. `goodness_of_fit` can only approximate these
    results.

    >>> known_params = {'loc': loc, 'scale': scale}
    >>> res = stats.goodness_of_fit(stats.norm, x, known_params=known_params,
    ...                             statistic='ks', random_state=rng)
    >>> res.statistic, res.pvalue
    (0.1119257570456813, 0.2788)

    The statistic matches exactly, but the p-value is estimated by forming
    a "Monte Carlo null distribution", that is, by explicitly drawing random
    samples from `scipy.stats.norm` with the provided parameters and
    calculating the statistic for each. The fraction of these statistic values
    at least as extreme as ``res.statistic`` approximates the exact p-value
    calculated by `scipy.stats.ks_1samp`.

    However, in many cases, we would prefer to test only that the data were
    sampled from one of *any* member of the normal distribution family, not
    specifically from the normal distribution with the location and scale
    fitted to the observed sample. In this case, Lilliefors [6]_ argued that
    the KS test is far too conservative (that is, the p-value overstates
    the actual probability of rejecting a true null hypothesis) and thus lacks
    power - the ability to reject the null hypothesis when the null hypothesis
    is actually false.
    Indeed, our p-value above is approximately 0.28, which is far too large
    to reject the null hypothesis at any common significance level.

    Consider why this might be. Note that in the KS test above, the statistic
    always compares data against the CDF of a normal distribution fitted to the
    *observed data*. This tends to reduce the value of the statistic for the
    observed data, but it is "unfair" when computing the statistic for other
    samples, such as those we randomly draw to form the Monte Carlo null
    distribution. It is easy to correct for this: whenever we compute the KS
    statistic of a sample, we use the CDF of a normal distribution fitted
    to *that sample*. The null distribution in this case has not been
    calculated exactly and is typically approximated using Monte Carlo methods
    as described above. This is where `goodness_of_fit` excels.

    >>> res = stats.goodness_of_fit(stats.norm, x, statistic='ks',
    ...                             random_state=rng)
    >>> res.statistic, res.pvalue
    (0.1119257570456813, 0.0196)

    Indeed, this p-value is much smaller, and small enough to (correctly)
    reject the null hypothesis at common significance levels, including 5% and
    2.5%.

    However, the KS statistic is not very sensitive to all deviations from
    normality. The original advantage of the KS statistic was the ability
    to compute the null distribution theoretically, but a more sensitive
    statistic - resulting in a higher test power - can be used now that we can
    approximate the null distribution
    computationally. The Anderson-Darling statistic [1]_ tends to be more
    sensitive, and critical values of this statistic have been tabulated
    for various significance levels and sample sizes using Monte Carlo methods.

    >>> res = stats.anderson(x, 'norm')
    >>> print(res.statistic)
    1.2139573337497467
    >>> print(res.critical_values)
    [0.549 0.625 0.75 0.875 1.041]
    >>> print(res.significance_level)
    [15. 10. 5. 2.5 1. ]

    Here, the observed value of the statistic exceeds the critical value
    corresponding with a 1% significance level. This tells us that the p-value
    of the observed data is less than 1%, but what is it? We could interpolate
    from these (already-interpolated) values, but `goodness_of_fit` can
    estimate it directly.

    >>> res = stats.goodness_of_fit(stats.norm, x, statistic='ad',
    ...                             random_state=rng)
    >>> res.statistic, res.pvalue
    (1.2139573337497467, 0.0034)

    A further advantage is that use of `goodness_of_fit` is not limited to
    a particular set of distributions or conditions on which parameters
    are known versus which must be estimated from data. Instead,
    `goodness_of_fit` can estimate p-values relatively quickly for any
    distribution with a sufficiently fast and reliable ``fit`` method. For
    instance, here we perform a goodness of fit test using the Cramer-von Mises
    statistic against the Rayleigh distribution with known location and unknown
    scale.

    >>> rng = np.random.default_rng()
    >>> x = stats.chi(df=2.2, loc=0, scale=2).rvs(size=1000, random_state=rng)
    >>> res = stats.goodness_of_fit(stats.rayleigh, x, statistic='cvm',
    ...                             known_params={'loc': 0}, random_state=rng)

    This executes fairly quickly, but to check the reliability of the ``fit``
    method, we should inspect the fit result.

    >>> res.fit_result  # location is as specified, and scale is reasonable
      params: FitParams(loc=0.0, scale=2.1026719844231243)
     success: True
     message: 'The fit was performed successfully.'
    >>> import matplotlib.pyplot as plt  # matplotlib must be installed to plot
    >>> res.fit_result.plot()
    >>> plt.show()

    If the distribution is not fit to the observed data as well as possible,
    the test may not control the type I error rate, that is, the chance of
    rejecting the null hypothesis even when it is true.

    We should also look for extreme outliers in the null distribution that
    may be caused by unreliable fitting. These do not necessarily invalidate
    the result, but they tend to reduce the test's power.

    >>> _, ax = plt.subplots()
    >>> ax.hist(np.log10(res.null_distribution))
    >>> ax.set_xlabel("log10 of CVM statistic under the null hypothesis")
    >>> ax.set_ylabel("Frequency")
    >>> ax.set_title("Histogram of the Monte Carlo null distribution")
    >>> plt.show()

    This plot seems reassuring.

    If the ``fit`` method is working reliably, and if the distribution of the
    test statistic is not particularly sensitive to the values of the fitted
    parameters, then the p-value provided by `goodness_of_fit` is expected to
    be a good approximation.

    >>> res.statistic, res.pvalue
    (0.2231991510248692, 0.0525)

    """
    args = _gof_iv(dist, data, known_params, fit_params, guessed_params,
                   statistic, n_mc_samples, random_state)
    (dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params,
     guessed_rfd_params, statistic, n_mc_samples_int, random_state) = args

    # Fit null hypothesis distribution to data
    nhd_fit_fun = _get_fit_fun(dist, data, guessed_nhd_params,
                               fixed_nhd_params)
    nhd_vals = nhd_fit_fun(data)
    nhd_dist = dist(*nhd_vals)

    def rvs(size):
        return nhd_dist.rvs(size=size, random_state=random_state)

    # Define statistic
    fit_fun = _get_fit_fun(dist, data, guessed_rfd_params, fixed_rfd_params)
    compare_fun = _compare_dict[statistic]

    def statistic_fun(data, axis=-1):
        # Make things simple by always working along the last axis.
        data = np.moveaxis(data, axis, -1)
        rfd_vals = fit_fun(data)
        rfd_dist = dist(*rfd_vals)
        return compare_fun(rfd_dist, data)

    res = stats.monte_carlo_test(data, rvs, statistic_fun, vectorized=True,
                                 n_resamples=n_mc_samples, axis=-1,
                                 alternative='greater')
    opt_res = optimize.OptimizeResult()
    opt_res.success = True
    opt_res.message = "The fit was performed successfully."
    opt_res.x = nhd_vals
    # Only continuous distributions for now, hence discrete=False.
    # There's no fundamental limitation; it's just that we're not using
    # stats.fit, discrete distributions don't have a `fit` method, and
    # we haven't written any vectorized fit functions for a discrete
    # distribution yet.
    return GoodnessOfFitResult(FitResult(dist, data, False, opt_res),
                               res.statistic, res.pvalue,
                               res.null_distribution)


def _get_fit_fun(dist, data, guessed_params, fixed_params):

    shape_names = [] if dist.shapes is None else dist.shapes.split(", ")
    param_names = shape_names + ['loc', 'scale']
    fparam_names = ['f'+name for name in param_names]
    all_fixed = not set(fparam_names).difference(fixed_params)
    guessed_shapes = [guessed_params.pop(x, None)
                      for x in shape_names if x in guessed_params]

    # Define statistic, including fitting distribution to data
    if dist in _fit_funs:
        def fit_fun(data):
            params = _fit_funs[dist](data, **fixed_params)
            params = np.asarray(np.broadcast_arrays(*params))
            if params.ndim > 1:
                params = params[..., np.newaxis]
            return params

    elif all_fixed:
        def fit_fun(data):
            return [fixed_params[name] for name in fparam_names]

    else:
        def fit_fun_1d(data):
            return dist.fit(data, *guessed_shapes, **guessed_params,
                            **fixed_params)

        def fit_fun(data):
            params = np.apply_along_axis(fit_fun_1d, axis=-1, arr=data)
            if params.ndim > 1:
                params = params.T[..., np.newaxis]
            return params

    return fit_fun
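
# Note (added comment, not in the original source): `_get_fit_fun` returns one
# of three fitting strategies: a vectorized closed-form fit registered in
# `_fit_funs` below, a trivial "fit" that simply returns the fixed parameters
# when every parameter is known, or a fallback that applies the distribution's
# `fit` method to each sample via `np.apply_along_axis`.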


# Vectorized fitting functions. These accept N-D `data` in which each row
# (slice along the last axis) is a sample to fit, plus scalar fixed
# parameters. They return a tuple of shape parameter arrays, each of shape
# data.shape[:-1].
def _fit_norm(data, floc=None, fscale=None):
    loc = floc
    scale = fscale
    if loc is None and scale is None:
        loc = np.mean(data, axis=-1)
        scale = np.std(data, ddof=1, axis=-1)
    elif loc is None:
        loc = np.mean(data, axis=-1)
    elif scale is None:
        scale = np.sqrt(((data - loc)**2).mean(axis=-1))
    return loc, scale


_fit_funs = {stats.norm: _fit_norm}  # type: ignore[attr-defined]
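
# Rough sketch (added comment, not in the original source) of what the
# vectorized normal fit returns, assuming 3 Monte Carlo samples of length 5:
#
#     >>> samples = np.random.default_rng(0).normal(size=(3, 5))
#     >>> loc, scale = _fit_norm(samples)
#     >>> loc.shape, scale.shape
#     ((3,), (3,))
#
# `_get_fit_fun` uses this mapping so that a test against `stats.norm` avoids
# numerical optimization entirely.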


# Vectorized goodness of fit statistic functions. These accept a frozen
# distribution object and `data` in which each row (slice along the last axis)
# is a sample.


def _anderson_darling(dist, data):
    x = np.sort(data, axis=-1)
    n = data.shape[-1]
    i = np.arange(1, n+1)
    Si = (2*i - 1)/n * (dist.logcdf(x) + dist.logsf(x[..., ::-1]))
    S = np.sum(Si, axis=-1)
    return -n - S
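
# For reference (added comment, not in the original source): the expression
# above is the usual Anderson-Darling statistic
#     A^2 = -n - (1/n) * sum_{i=1}^{n} (2i - 1)
#                        * [ln F(x_(i)) + ln(1 - F(x_(n+1-i)))],
# where F is the fitted CDF and x_(1) <= ... <= x_(n) is the sorted sample;
# the reversed slice ``x[..., ::-1]`` supplies x_(n+1-i) to ``logsf``.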


def _compute_dplus(cdfvals):  # adapted from _stats_py before gh-17062
    n = cdfvals.shape[-1]
    return (np.arange(1.0, n + 1) / n - cdfvals).max(axis=-1)


def _compute_dminus(cdfvals, axis=-1):
    n = cdfvals.shape[-1]
    return (cdfvals - np.arange(0.0, n)/n).max(axis=-1)


def _kolmogorov_smirnov(dist, data):
    x = np.sort(data, axis=-1)
    cdfvals = dist.cdf(x)
    Dplus = _compute_dplus(cdfvals)  # always works along last axis
    Dminus = _compute_dminus(cdfvals)
    return np.maximum(Dplus, Dminus)
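
# For reference (added comment, not in the original source): with fitted CDF
# values F(x_(i)) of the sorted data, the two one-sided statistics above are
#     D+ = max_i (i/n - F(x_(i)))   and   D- = max_i (F(x_(i)) - (i - 1)/n),
# and the Kolmogorov-Smirnov statistic is D = max(D+, D-).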


def _cramer_von_mises(dist, data):
    x = np.sort(data, axis=-1)
    n = data.shape[-1]
    cdfvals = dist.cdf(x)
    u = (2*np.arange(1, n+1) - 1)/(2*n)
    w = 1 / (12*n) + np.sum((u - cdfvals)**2, axis=-1)
    return w
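
# For reference (added comment, not in the original source): this is the
# standard Cramer-von Mises statistic
#     W^2 = 1/(12 n) + sum_{i=1}^{n} ((2i - 1)/(2 n) - F(x_(i)))^2,
# evaluated with the CDF F of the fitted (frozen) distribution.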


_compare_dict = {"ad": _anderson_darling, "ks": _kolmogorov_smirnov,
                 "cvm": _cramer_von_mises}


def _gof_iv(dist, data, known_params, fit_params, guessed_params, statistic,
            n_mc_samples, random_state):

    if not isinstance(dist, stats.rv_continuous):
        message = ("`dist` must be a (non-frozen) instance of "
                   "`stats.rv_continuous`.")
        raise TypeError(message)

    data = np.asarray(data, dtype=float)
    if not data.ndim == 1:
        message = "`data` must be a one-dimensional array of numbers."
        raise ValueError(message)

    # Leave validation of these key/value pairs to the `fit` method,
    # but collect them into the dictionaries that will be used below.
    known_params = known_params or dict()
    fit_params = fit_params or dict()
    guessed_params = guessed_params or dict()

    known_params_f = {("f"+key): val for key, val in known_params.items()}
    fit_params_f = {("f"+key): val for key, val in fit_params.items()}

    # These are the values of the parameters of the null-hypothesized
    # distribution family with which resamples are drawn
    fixed_nhd_params = known_params_f.copy()
    fixed_nhd_params.update(fit_params_f)

    # These are fixed when fitting the distribution family to resamples
    fixed_rfd_params = known_params_f.copy()

    # These are used as guesses when fitting the distribution family to
    # the original data
    guessed_nhd_params = guessed_params.copy()

    # These are used as guesses when fitting the distribution family to
    # resamples
    guessed_rfd_params = fit_params.copy()
    guessed_rfd_params.update(guessed_params)

    statistics = {'ad', 'ks', 'cvm'}
    if statistic.lower() not in statistics:
        message = f"`statistic` must be one of {statistics}."
        raise ValueError(message)

    n_mc_samples_int = int(n_mc_samples)
    if n_mc_samples_int != n_mc_samples:
        message = "`n_mc_samples` must be an integer."
        raise TypeError(message)

    random_state = check_random_state(random_state)

    return (dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params,
            guessed_rfd_params, statistic, n_mc_samples_int, random_state)