Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_fit.py: 8%

400 statements (coverage.py v7.3.2, created at 2023-12-12 06:31 +0000)

1import warnings 

2from collections import namedtuple 

3import numpy as np 

4from scipy import optimize, stats 

5from scipy._lib._util import check_random_state 

6 

7 

8def _combine_bounds(name, user_bounds, shape_domain, integral): 

9 """Intersection of user-defined bounds and distribution PDF/PMF domain""" 

10 

11 user_bounds = np.atleast_1d(user_bounds) 

12 

13 if user_bounds[0] > user_bounds[1]: 

14 message = (f"There are no values for `{name}` on the interval " 

15 f"{list(user_bounds)}.") 

16 raise ValueError(message) 

17 

18 bounds = (max(user_bounds[0], shape_domain[0]), 

19 min(user_bounds[1], shape_domain[1])) 

20 

21 if integral and (np.ceil(bounds[0]) > np.floor(bounds[1])): 

22 message = (f"There are no integer values for `{name}` on the interval " 

23 f"defined by the user-provided bounds and the domain " 

24 "of the distribution.") 

25 raise ValueError(message) 

26 elif not integral and (bounds[0] > bounds[1]): 

27 message = (f"There are no values for `{name}` on the interval " 

28 f"defined by the user-provided bounds and the domain " 

29 "of the distribution.") 

30 raise ValueError(message) 

31 

32 if not np.all(np.isfinite(bounds)): 

33 message = (f"The intersection of user-provided bounds for `{name}` " 

34 f"and the domain of the distribution is not finite. Please " 

35 f"provide finite bounds for shape `{name}` in `bounds`.") 

36 raise ValueError(message) 

37 

38 return bounds 
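# Illustrative sketch (not part of SciPy): a hypothetical helper showing how
# `_combine_bounds` above intersects user-provided bounds with a parameter's
# domain. For an integer-valued shape with domain (0, inf), user bounds of
# (0.5, 30) are narrowed to (0.5, 30.0); the intersection is accepted because
# ceil(0.5) <= floor(30.0), whereas an empty or non-finite intersection would
# raise `ValueError` as coded above.
def _example_combine_bounds():
    combined = _combine_bounds('n', (0.5, 30), (0, np.inf), integral=True)
    return combined  # (0.5, 30.0)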

39 

40 

41class FitResult: 

42 r"""Result of fitting a discrete or continuous distribution to data 

43 

44 Attributes 

45 ---------- 

46 params : namedtuple 

47 A namedtuple containing the maximum likelihood estimates of the 

48 shape parameters, location, and (if applicable) scale of the 

49 distribution. 

50 success : bool or None 

51 Whether the optimizer considered the optimization to terminate 

52 successfully or not. 

53 message : str or None 

54 Any status message provided by the optimizer. 

55 

56 """ 

57 

58 def __init__(self, dist, data, discrete, res): 

59 self._dist = dist 

60 self._data = data 

61 self.discrete = discrete 

62 self.pxf = getattr(dist, "pmf", None) or getattr(dist, "pdf", None) 

63 

64 shape_names = [] if dist.shapes is None else dist.shapes.split(", ") 

65 if not discrete: 

66 FitParams = namedtuple('FitParams', shape_names + ['loc', 'scale']) 

67 else: 

68 FitParams = namedtuple('FitParams', shape_names + ['loc']) 

69 

70 self.params = FitParams(*res.x) 

71 

72 # Optimizer can report success even when nllf is infinite 

73 if res.success and not np.isfinite(self.nllf()): 

74 res.success = False 

75 res.message = ("Optimization converged to parameter values that " 

76 "are inconsistent with the data.") 

77 self.success = getattr(res, "success", None) 

78 self.message = getattr(res, "message", None) 

79 

80 def __repr__(self): 

81 keys = ["params", "success", "message"] 

82 m = max(map(len, keys)) + 1 

83 return '\n'.join([key.rjust(m) + ': ' + repr(getattr(self, key)) 

84 for key in keys if getattr(self, key) is not None]) 

85 

86 def nllf(self, params=None, data=None): 

87 """Negative log-likelihood function 

88 

89 Evaluates the negative of the log-likelihood function of the provided 

90 data at the provided parameters. 

91 

92 Parameters 

93 ---------- 

94 params : tuple, optional 

95 The shape parameters, location, and (if applicable) scale of the 

96 distribution as a single tuple. Default is the maximum likelihood 

97 estimates (``self.params``). 

98 data : array_like, optional 

99 The data for which the log-likelihood function is to be evaluated. 

100 Default is the data to which the distribution was fit. 

101 

102 Returns 

103 ------- 

104 nllf : float 

105 The negative of the log-likelihood function. 

106 

107 """ 

108 params = params if params is not None else self.params 

109 data = data if data is not None else self._data 

110 return self._dist.nnlf(theta=params, x=data) 
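    # Hypothetical illustration (not part of SciPy): when every observation
    # lies strictly inside the support of the distribution, the value returned
    # by `nllf` above equals the negative sum of log-PDF/PMF values evaluated
    # at the data.
    def _nllf_manual(self, params=None, data=None):
        params = params if params is not None else self.params
        data = data if data is not None else self._data
        # `self.pxf` is the distribution's pmf (discrete) or pdf (continuous).
        return -np.sum(np.log(self.pxf(data, *params)))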

111 

112 def plot(self, ax=None, *, plot_type="hist"): 

113 """Visually compare the data against the fitted distribution. 

114 

115 Available only if ``matplotlib`` is installed. 

116 

117 Parameters 

118 ---------- 

119 ax : matplotlib.axes.Axes 

120 Axes object to draw the plot onto, otherwise uses the current Axes. 

121 plot_type : {"hist", "qq", "pp", "cdf"} 

122 Type of plot to draw. Options include: 

123 

124 - "hist": Superposes the PDF/PMF of the fitted distribution 

125 over a normalized histogram of the data. 

126 - "qq": Scatter plot of theoretical quantiles against the 

127 empirical quantiles. Specifically, the x-coordinates are the 

128 values of the fitted distribution PPF evaluated at the 

129 percentiles ``(np.arange(1, n) - 0.5)/n``, where ``n`` is the 

130 number of data points, and the y-coordinates are the sorted 

131 data points. 

132 - "pp": Scatter plot of theoretical percentiles against the 

133 observed percentiles. Specifically, the x-coordinates are the 

134 percentiles ``(np.arange(1, n) - 0.5)/n``, where ``n`` is 

135 the number of data points, and the y-coordinates are the values 

136 of the fitted distribution CDF evaluated at the sorted 

137 data points. 

138 - "cdf": Superposes the CDF of the fitted distribution over the 

139 empirical CDF. Specifically, the x-coordinates of the empirical 

140 CDF are the sorted data points, and the y-coordinates are the 

141 percentiles ``(np.arange(1, n) - 0.5)/n``, where ``n`` is 

142 the number of data points. 

143 

144 Returns 

145 ------- 

146 ax : matplotlib.axes.Axes 

147 The matplotlib Axes object on which the plot was drawn. 

148 """ 

149 try: 

150 import matplotlib # noqa 

151 except ModuleNotFoundError as exc: 

152 message = "matplotlib must be installed to use method `plot`." 

153 raise ModuleNotFoundError(message) from exc 

154 

155 plots = {'histogram': self._hist_plot, 'qq': self._qq_plot, 

156 'pp': self._pp_plot, 'cdf': self._cdf_plot, 

157 'hist': self._hist_plot} 

158 if plot_type.lower() not in plots: 

159 message = f"`plot_type` must be one of {set(plots.keys())}" 

160 raise ValueError(message) 

161 plot = plots[plot_type.lower()] 

162 

163 if ax is None: 

164 import matplotlib.pyplot as plt 

165 ax = plt.gca() 

166 

167 fit_params = np.atleast_1d(self.params) 

168 

169 return plot(ax=ax, fit_params=fit_params) 

170 

171 def _hist_plot(self, ax, fit_params): 

172 from matplotlib.ticker import MaxNLocator 

173 

174 support = self._dist.support(*fit_params) 

175 lb = support[0] if np.isfinite(support[0]) else min(self._data) 

176 ub = support[1] if np.isfinite(support[1]) else max(self._data) 

177 pxf = "PMF" if self.discrete else "PDF" 

178 

179 if self.discrete: 

180 x = np.arange(lb, ub + 2) 

181 y = self.pxf(x, *fit_params) 

182 ax.vlines(x[:-1], 0, y[:-1], label='Fitted Distribution PMF', 

183 color='C0') 

184 options = dict(density=True, bins=x, align='left', color='C1') 

185 ax.xaxis.set_major_locator(MaxNLocator(integer=True)) 

186 ax.set_xlabel('k') 

187 ax.set_ylabel('PMF') 

188 else: 

189 x = np.linspace(lb, ub, 200) 

190 y = self.pxf(x, *fit_params) 

191 ax.plot(x, y, '--', label='Fitted Distribution PDF', color='C0') 

192 options = dict(density=True, bins=50, align='mid', color='C1') 

193 ax.set_xlabel('x') 

194 ax.set_ylabel('PDF') 

195 

196 if len(self._data) > 50 or self.discrete: 

197 ax.hist(self._data, label="Histogram of Data", **options) 

198 else: 

199 ax.plot(self._data, np.zeros_like(self._data), "*", 

200 label='Data', color='C1') 

201 

202 ax.set_title(rf"Fitted $\tt {self._dist.name}$ {pxf} and Histogram") 

203 ax.legend(*ax.get_legend_handles_labels()) 

204 return ax 

205 

206 def _qp_plot(self, ax, fit_params, qq): 

207 data = np.sort(self._data) 

208 ps = self._plotting_positions(len(self._data)) 

209 

210 if qq: 

211 qp = "Quantiles" 

212 plot_type = 'Q-Q' 

213 x = self._dist.ppf(ps, *fit_params) 

214 y = data 

215 else: 

216 qp = "Percentiles" 

217 plot_type = 'P-P' 

218 x = ps 

219 y = self._dist.cdf(data, *fit_params) 

220 

221 ax.plot(x, y, '.', label=f'Fitted Distribution {plot_type}', 

222 color='C0', zorder=1) 

223 xlim = ax.get_xlim() 

224 ylim = ax.get_ylim() 

225 lim = [min(xlim[0], ylim[0]), max(xlim[1], ylim[1])] 

226 if not qq: 

227 lim = max(lim[0], 0), min(lim[1], 1) 

228 

229 if self.discrete and qq: 

230 q_min, q_max = int(lim[0]), int(lim[1]+1) 

231 q_ideal = np.arange(q_min, q_max) 

232 # q_ideal = np.unique(self._dist.ppf(ps, *fit_params)) 

233 ax.plot(q_ideal, q_ideal, 'o', label='Reference', color='k', 

234 alpha=0.25, markerfacecolor='none', clip_on=True) 

235 elif self.discrete and not qq: 

236 # The intent of this is to match the plot that would be produced 

237 # if x were continuous on [0, 1] and y were cdf(ppf(x)). 

238 # It can be approximated by letting x = np.linspace(0, 1, 1000), 

239 # but this might not look great when zooming in. The vertical 

240 # portions are included to indicate where the transition occurs 

241 # where the data completely obscures the horizontal portions. 

242 p_min, p_max = lim 

243 a, b = self._dist.support(*fit_params) 

244 p_min = max(p_min, 0 if np.isfinite(a) else 1e-3) 

245 p_max = min(p_max, 1 if np.isfinite(b) else 1-1e-3) 

246 q_min, q_max = self._dist.ppf([p_min, p_max], *fit_params) 

247 qs = np.arange(q_min-1, q_max+1) 

248 ps = self._dist.cdf(qs, *fit_params) 

249 ax.step(ps, ps, '-', label='Reference', color='k', alpha=0.25, 

250 clip_on=True) 

251 else: 

252 ax.plot(lim, lim, '-', label='Reference', color='k', alpha=0.25, 

253 clip_on=True) 

254 

255 ax.set_xlim(lim) 

256 ax.set_ylim(lim) 

257 ax.set_xlabel(rf"Fitted $\tt {self._dist.name}$ Theoretical {qp}") 

258 ax.set_ylabel(f"Data {qp}") 

259 ax.set_title(rf"Fitted $\tt {self._dist.name}$ {plot_type} Plot") 

260 ax.legend(*ax.get_legend_handles_labels()) 

261 ax.set_aspect('equal') 

262 return ax 

263 

264 def _qq_plot(self, **kwargs): 

265 return self._qp_plot(qq=True, **kwargs) 

266 

267 def _pp_plot(self, **kwargs): 

268 return self._qp_plot(qq=False, **kwargs) 

269 

270 def _plotting_positions(self, n, a=.5): 

271 # See https://en.wikipedia.org/wiki/Q%E2%80%93Q_plot#Plotting_positions 

272 k = np.arange(1, n+1) 

273 return (k-a) / (n + 1 - 2*a) 

274 

275 def _cdf_plot(self, ax, fit_params): 

276 data = np.sort(self._data) 

277 ecdf = self._plotting_positions(len(self._data)) 

278 ls = '--' if len(np.unique(data)) < 30 else '.' 

279 xlabel = 'k' if self.discrete else 'x' 

280 ax.step(data, ecdf, ls, label='Empirical CDF', color='C1', zorder=0) 

281 

282 xlim = ax.get_xlim() 

283 q = np.linspace(*xlim, 300) 

284 tcdf = self._dist.cdf(q, *fit_params) 

285 

286 ax.plot(q, tcdf, label='Fitted Distribution CDF', color='C0', zorder=1) 

287 ax.set_xlim(xlim) 

288 ax.set_ylim(0, 1) 

289 ax.set_xlabel(xlabel) 

290 ax.set_ylabel("CDF") 

291 ax.set_title(rf"Fitted $\tt {self._dist.name}$ and Empirical CDF") 

292 handles, labels = ax.get_legend_handles_labels() 

293 ax.legend(handles[::-1], labels[::-1]) 

294 return ax 

295 
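# Illustrative sketch (not part of SciPy): a hypothetical helper showing how
# the Q-Q coordinates described in `FitResult.plot` are formed. The x-values
# are the fitted distribution's PPF evaluated at the plotting positions
# (k - 0.5) / n (i.e. `_plotting_positions` with its default ``a=0.5``), and
# the y-values are the sorted data.
def _example_qq_coordinates(data, frozen_dist):
    data = np.sort(np.asarray(data))
    n = data.size
    ps = (np.arange(1, n + 1) - 0.5) / n  # plotting positions
    x = frozen_dist.ppf(ps)               # theoretical quantiles
    y = data                              # empirical quantiles
    return x, y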

296 

297def fit(dist, data, bounds=None, *, guess=None, method='mle', 

298 optimizer=optimize.differential_evolution): 

299 r"""Fit a discrete or continuous distribution to data 

300 

301 Given a distribution, data, and bounds on the parameters of the 

302 distribution, return maximum likelihood estimates of the parameters. 

303 

304 Parameters 

305 ---------- 

306 dist : `scipy.stats.rv_continuous` or `scipy.stats.rv_discrete` 

307 The object representing the distribution to be fit to the data. 

308 data : 1D array_like 

309 The data to which the distribution is to be fit. If the data contain 

310 any of ``np.nan``, ``np.inf``, or -``np.inf``, the fit method will 

311 raise a ``ValueError``. 

312 bounds : dict or sequence of tuples, optional 

313 If a dictionary, each key is the name of a parameter of the 

314 distribution, and the corresponding value is a tuple containing the 

315 lower and upper bound on that parameter. If the distribution is 

316 defined only for a finite range of values of that parameter, no entry 

317 for that parameter is required; e.g., some distributions have 

318 parameters which must be on the interval [0, 1]. Bounds for parameters 

319 location (``loc``) and scale (``scale``) are optional; by default, 

320 they are fixed to 0 and 1, respectively. 

321 

322 If a sequence, element *i* is a tuple containing the lower and upper 

323 bound on the *i*\ th parameter of the distribution. In this case, 

324 bounds for *all* distribution shape parameters must be provided. 

325 Optionally, bounds for location and scale may follow the 

326 distribution shape parameters. 

327 

328 If a shape is to be held fixed (e.g. if it is known), the 

329 lower and upper bounds may be equal. If a user-provided lower or upper 

330 bound is beyond a bound of the domain for which the distribution is 

331 defined, the bound of the distribution's domain will replace the 

332 user-provided value. Similarly, parameters which must be integral 

333 will be constrained to integral values within the user-provided bounds. 

334 guess : dict or array_like, optional 

335 If a dictionary, each key is the name of a parameter of the 

336 distribution, and the corresponding value is a guess for the value 

337 of the parameter. 

338 

339 If a sequence, element *i* is a guess for the *i*\ th parameter of the 

340 distribution. In this case, guesses for *all* distribution shape 

341 parameters must be provided. 

342 

343 If `guess` is not provided, guesses for the decision variables will 

344 not be passed to the optimizer. If `guess` is provided, guesses for 

345 any missing parameters will be set at the mean of the lower and 

346 upper bounds. Guesses for parameters which must be integral will be 

347 rounded to integral values, and guesses that lie outside the 

348 intersection of the user-provided bounds and the domain of the 

349 distribution will be clipped. 

350 method : {'mle', 'mse'} 

351 With ``method="mle"`` (default), the fit is computed by minimizing 

352 the negative log-likelihood function. A large, finite penalty 

353 (rather than infinite negative log-likelihood) is applied for 

354 observations beyond the support of the distribution. 

355 With ``method="mse"``, the fit is computed by minimizing 

356 the negative log-product spacing function. The same penalty is applied 

357 for observations beyond the support. We follow the approach of [1]_, 

358 which is generalized for samples with repeated observations. 

359 optimizer : callable, optional 

360 `optimizer` is a callable that accepts the following positional 

361 argument. 

362 

363 fun : callable 

364 The objective function to be optimized. `fun` accepts one argument 

365 ``x``, candidate shape parameters of the distribution, and returns 

366 the objective function value given ``x``, `dist`, and the provided 

367 `data`. 

368 The job of `optimizer` is to find values of the decision variables 

369 that minimize `fun`.

370 

371 `optimizer` must also accept the following keyword argument. 

372 

373 bounds : sequence of tuples 

374 The bounds on values of the decision variables; each element will 

375 be a tuple containing the lower and upper bound on a decision 

376 variable. 

377 

378 If `guess` is provided, `optimizer` must also accept the following 

379 keyword argument. 

380 

381 x0 : array_like 

382 The guesses for each decision variable. 

383 

384 If the distribution has any shape parameters that must be integral or 

385 if the distribution is discrete and the location parameter is not 

386 fixed, `optimizer` must also accept the following keyword argument. 

387 

388 integrality : array_like of bools 

389 For each decision variable, True if the decision variable 

390 must be constrained to integer values and False if the decision 

391 variable is continuous. 

392 

393 `optimizer` must return an object, such as an instance of 

394 `scipy.optimize.OptimizeResult`, which holds the optimal values of 

395 the decision variables in an attribute ``x``. If attributes 

396 ``fun``, ``status``, or ``message`` are provided, they will be 

397 included in the result object returned by `fit`. 

398 

399 Returns 

400 ------- 

401 result : `~scipy.stats._result_classes.FitResult` 

402 An object with the following fields. 

403 

404 params : namedtuple 

405 A namedtuple containing the maximum likelihood estimates of the 

406 shape parameters, location, and (if applicable) scale of the 

407 distribution. 

408 success : bool or None 

409 Whether the optimizer considered the optimization to terminate 

410 successfully or not. 

411 message : str or None 

412 Any status message provided by the optimizer. 

413 

414 The object has the following method: 

415 

416 nllf(params=None, data=None) 

417 By default, the negative log-likelihood function at the fitted

418 `params` for the given `data`. Accepts a tuple containing 

419 alternative shapes, location, and scale of the distribution and 

420 an array of alternative data. 

421 

422 plot(ax=None) 

423 Superposes the PDF/PMF of the fitted distribution over a normalized 

424 histogram of the data. 

425 

426 See Also 

427 -------- 

428 rv_continuous, rv_discrete 

429 

430 Notes 

431 ----- 

432 Optimization is more likely to converge to the maximum likelihood estimate 

433 when the user provides tight bounds containing the maximum likelihood 

434 estimate. For example, when fitting a binomial distribution to data, the 

435 number of experiments underlying each sample may be known, in which case 

436 the corresponding shape parameter ``n`` can be fixed. 

437 

438 References 

439 ---------- 

440 .. [1] Shao, Yongzhao, and Marjorie G. Hahn. "Maximum product of spacings 

441 method: a unified formulation with illustration of strong 

442 consistency." Illinois Journal of Mathematics 43.3 (1999): 489-499. 

443 

444 Examples 

445 -------- 

446 Suppose we wish to fit a distribution to the following data. 

447 

448 >>> import numpy as np 

449 >>> from scipy import stats 

450 >>> rng = np.random.default_rng() 

451 >>> dist = stats.nbinom 

452 >>> shapes = (5, 0.5) 

453 >>> data = dist.rvs(*shapes, size=1000, random_state=rng) 

454 

455 Suppose we do not know how the data were generated, but we suspect that 

456 they follow a negative binomial distribution with parameters *n* and *p*\.

457 (See `scipy.stats.nbinom`.) We believe that the parameter *n* was less

458 than 30, and we know that the parameter *p* must lie on the interval 

459 [0, 1]. We record this information in a variable `bounds` and pass 

460 this information to `fit`. 

461 

462 >>> bounds = [(0, 30), (0, 1)] 

463 >>> res = stats.fit(dist, data, bounds) 

464 

465 `fit` searches within the user-specified `bounds` for the 

466 values that best match the data (in the sense of maximum likelihood 

467 estimation). In this case, it found shape values similar to those 

468 from which the data were actually generated. 

469 

470 >>> res.params 

471 FitParams(n=5.0, p=0.5028157644634368, loc=0.0) # may vary 

472 

473 We can visualize the results by superposing the probability mass function 

474 of the distribution (with the shapes fit to the data) over a normalized 

475 histogram of the data. 

476 

477 >>> import matplotlib.pyplot as plt # matplotlib must be installed to plot 

478 >>> res.plot() 

479 >>> plt.show() 

480 

481 Note that the estimate for *n* was exactly integral; this is because 

482 the domain of the `nbinom` PMF includes only integral *n*, and the `nbinom` 

483 object "knows" that. `nbinom` also knows that the shape *p* must be a 

484 value between 0 and 1. In such a case - when the domain of the distribution 

485 with respect to a parameter is finite - we are not required to specify 

486 bounds for the parameter. 

487 

488 >>> bounds = {'n': (0, 30)} # omit parameter p using a `dict` 

489 >>> res2 = stats.fit(dist, data, bounds) 

490 >>> res2.params 

491 FitParams(n=5.0, p=0.5016492009232932, loc=0.0) # may vary 

492 

493 If we wish to force the distribution to be fit with *n* fixed at 6, we can 

494 set both the lower and upper bounds on *n* to 6. Note, however, that the 

495 value of the objective function being optimized is typically worse (higher) 

496 in this case. 

497 

498 >>> bounds = {'n': (6, 6)} # fix parameter `n` 

499 >>> res3 = stats.fit(dist, data, bounds) 

500 >>> res3.params 

501 FitParams(n=6.0, p=0.5486556076755706, loc=0.0) # may vary 

502 >>> res3.nllf() > res.nllf() 

503 True # may vary 

504 

505 Note that the numerical results of the previous examples are typical, but 

506 they may vary because the default optimizer used by `fit`, 

507 `scipy.optimize.differential_evolution`, is stochastic. However, we can 

508 customize the settings used by the optimizer to ensure reproducibility - 

509 or even use a different optimizer entirely - using the `optimizer` 

510 parameter. 

511 

512 >>> from scipy.optimize import differential_evolution 

513 >>> rng = np.random.default_rng(767585560716548) 

514 >>> def optimizer(fun, bounds, *, integrality): 

515 ... return differential_evolution(fun, bounds, strategy='best2bin', 

516 ... seed=rng, integrality=integrality) 

517 >>> bounds = [(0, 30), (0, 1)] 

518 >>> res4 = stats.fit(dist, data, bounds, optimizer=optimizer) 

519 >>> res4.params 

520 FitParams(n=5.0, p=0.5015183149259951, loc=0.0) 

521 

522 """ 

523 # --- Input Validation / Standardization --- # 

524 user_bounds = bounds 

525 user_guess = guess 

526 

527 # distribution input validation and information collection 

528 if hasattr(dist, "pdf"): # can't use isinstance for types 

529 default_bounds = {'loc': (0, 0), 'scale': (1, 1)} 

530 discrete = False 

531 elif hasattr(dist, "pmf"): 

532 default_bounds = {'loc': (0, 0)} 

533 discrete = True 

534 else: 

535 message = ("`dist` must be an instance of `rv_continuous` " 

536 "or `rv_discrete.`") 

537 raise ValueError(message) 

538 

539 try: 

540 param_info = dist._param_info() 

541 except AttributeError as e: 

542 message = (f"Distribution `{dist.name}` is not yet supported by " 

543 "`scipy.stats.fit` because shape information has " 

544 "not been defined.") 

545 raise ValueError(message) from e 

546 

547 # data input validation 

548 data = np.asarray(data) 

549 if data.ndim != 1: 

550 message = "`data` must be exactly one-dimensional." 

551 raise ValueError(message) 

552 if not (np.issubdtype(data.dtype, np.number) 

553 and np.all(np.isfinite(data))): 

554 message = "All elements of `data` must be finite numbers." 

555 raise ValueError(message) 

556 

557 # bounds input validation and information collection 

558 n_params = len(param_info) 

559 n_shapes = n_params - (1 if discrete else 2) 

560 param_list = [param.name for param in param_info] 

561 param_names = ", ".join(param_list) 

562 shape_names = ", ".join(param_list[:n_shapes]) 

563 

564 if user_bounds is None: 

565 user_bounds = {} 

566 

567 if isinstance(user_bounds, dict): 

568 default_bounds.update(user_bounds) 

569 user_bounds = default_bounds 

570 user_bounds_array = np.empty((n_params, 2)) 

571 for i in range(n_params): 

572 param_name = param_info[i].name 

573 user_bound = user_bounds.pop(param_name, None) 

574 if user_bound is None: 

575 user_bound = param_info[i].domain 

576 user_bounds_array[i] = user_bound 

577 if user_bounds: 

578 message = ("Bounds provided for the following unrecognized " 

579 f"parameters will be ignored: {set(user_bounds)}") 

580 warnings.warn(message, RuntimeWarning, stacklevel=2) 

581 

582 else: 

583 try: 

584 user_bounds = np.asarray(user_bounds, dtype=float) 

585 if user_bounds.size == 0: 

586 user_bounds = np.empty((0, 2)) 

587 except ValueError as e: 

588 message = ("Each element of a `bounds` sequence must be a tuple " 

589 "containing two elements: the lower and upper bound of " 

590 "a distribution parameter.") 

591 raise ValueError(message) from e 

592 if (user_bounds.ndim != 2 or user_bounds.shape[1] != 2): 

593 message = ("Each element of `bounds` must be a tuple specifying " 

594 "the lower and upper bounds of a shape parameter") 

595 raise ValueError(message) 

596 if user_bounds.shape[0] < n_shapes: 

597 message = (f"A `bounds` sequence must contain at least {n_shapes} " 

598 "elements: tuples specifying the lower and upper " 

599 f"bounds of all shape parameters {shape_names}.") 

600 raise ValueError(message) 

601 if user_bounds.shape[0] > n_params: 

602 message = ("A `bounds` sequence may not contain more than " 

603 f"{n_params} elements: tuples specifying the lower and " 

604 "upper bounds of distribution parameters " 

605 f"{param_names}.") 

606 raise ValueError(message) 

607 

608 user_bounds_array = np.empty((n_params, 2)) 

609 user_bounds_array[n_shapes:] = list(default_bounds.values()) 

610 user_bounds_array[:len(user_bounds)] = user_bounds 

611 

612 user_bounds = user_bounds_array 

613 validated_bounds = [] 

614 for i in range(n_params): 

615 name = param_info[i].name 

616 user_bound = user_bounds_array[i] 

617 param_domain = param_info[i].domain 

618 integral = param_info[i].integrality 

619 combined = _combine_bounds(name, user_bound, param_domain, integral) 

620 validated_bounds.append(combined) 

621 

622 bounds = np.asarray(validated_bounds) 

623 integrality = [param.integrality for param in param_info] 

624 

625 # guess input validation 

626 

627 if user_guess is None: 

628 guess_array = None 

629 elif isinstance(user_guess, dict): 

630 default_guess = {param.name: np.mean(bound) 

631 for param, bound in zip(param_info, bounds)} 

632 unrecognized = set(user_guess) - set(default_guess) 

633 if unrecognized: 

634 message = ("Guesses provided for the following unrecognized " 

635 f"parameters will be ignored: {unrecognized}") 

636 warnings.warn(message, RuntimeWarning, stacklevel=2) 

637 default_guess.update(user_guess) 

638 

639 message = ("Each element of `guess` must be a scalar " 

640 "guess for a distribution parameter.") 

641 try: 

642 guess_array = np.asarray([default_guess[param.name] 

643 for param in param_info], dtype=float) 

644 except ValueError as e: 

645 raise ValueError(message) from e 

646 

647 else: 

648 message = ("Each element of `guess` must be a scalar " 

649 "guess for a distribution parameter.") 

650 try: 

651 user_guess = np.asarray(user_guess, dtype=float) 

652 except ValueError as e: 

653 raise ValueError(message) from e 

654 if user_guess.ndim != 1: 

655 raise ValueError(message) 

656 if user_guess.shape[0] < n_shapes: 

657 message = (f"A `guess` sequence must contain at least {n_shapes} " 

658 "elements: scalar guesses for the distribution shape " 

659 f"parameters {shape_names}.") 

660 raise ValueError(message) 

661 if user_guess.shape[0] > n_params: 

662 message = ("A `guess` sequence may not contain more than " 

663 f"{n_params} elements: scalar guesses for the " 

664 f"distribution parameters {param_names}.") 

665 raise ValueError(message) 

666 

667 guess_array = np.mean(bounds, axis=1) 

668 guess_array[:len(user_guess)] = user_guess 

669 

670 if guess_array is not None: 

671 guess_rounded = guess_array.copy() 

672 

673 guess_rounded[integrality] = np.round(guess_rounded[integrality]) 

674 rounded = np.where(guess_rounded != guess_array)[0] 

675 for i in rounded: 

676 message = (f"Guess for parameter `{param_info[i].name}` " 

677 f"rounded from {guess_array[i]} to {guess_rounded[i]}.") 

678 warnings.warn(message, RuntimeWarning, stacklevel=2) 

679 

680 guess_clipped = np.clip(guess_rounded, bounds[:, 0], bounds[:, 1]) 

681 clipped = np.where(guess_clipped != guess_rounded)[0] 

682 for i in clipped: 

683 message = (f"Guess for parameter `{param_info[i].name}` " 

684 f"clipped from {guess_rounded[i]} to " 

685 f"{guess_clipped[i]}.") 

686 warnings.warn(message, RuntimeWarning, stacklevel=2) 

687 

688 guess = guess_clipped 

689 else: 

690 guess = None 

691 

692 # --- Fitting --- # 

693 def nllf(free_params, data=data): # bind data NOW 

694 with np.errstate(invalid='ignore', divide='ignore'): 

695 return dist._penalized_nnlf(free_params, data) 

696 

697 def nlpsf(free_params, data=data): # bind data NOW 

698 with np.errstate(invalid='ignore', divide='ignore'): 

699 return dist._penalized_nlpsf(free_params, data) 

700 

701 methods = {'mle': nllf, 'mse': nlpsf} 

702 objective = methods[method.lower()] 

703 

704 with np.errstate(invalid='ignore', divide='ignore'): 

705 kwds = {} 

706 if bounds is not None: 

707 kwds['bounds'] = bounds 

708 if np.any(integrality): 

709 kwds['integrality'] = integrality 

710 if guess is not None: 

711 kwds['x0'] = guess 

712 res = optimizer(objective, **kwds) 

713 

714 return FitResult(dist, data, discrete, res) 
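# Illustrative usage sketch (not part of SciPy; names below are hypothetical):
# fitting a continuous distribution with `fit`. For continuous distributions,
# `loc` and `scale` are fixed to 0 and 1 by default, so bounds for them must
# be supplied explicitly if they are to be estimated as well.
def _example_fit_continuous():
    rng = np.random.default_rng(12345)
    data = stats.gamma.rvs(a=2.5, loc=0, scale=3, size=500, random_state=rng)
    # Bound the shape `a`, and free `loc` and `scale` so they are also fit.
    bounds = {'a': (0.1, 10), 'loc': (-1, 1), 'scale': (0.1, 10)}
    res = fit(stats.gamma, data, bounds)
    return res.params  # FitParams(a=..., loc=..., scale=...)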

715 

716 

717GoodnessOfFitResult = namedtuple('GoodnessOfFitResult', 

718 ('fit_result', 'statistic', 'pvalue', 

719 'null_distribution')) 

720 

721 

722def goodness_of_fit(dist, data, *, known_params=None, fit_params=None, 

723 guessed_params=None, statistic='ad', n_mc_samples=9999, 

724 random_state=None): 

725 r""" 

726 Perform a goodness of fit test comparing data to a distribution family. 

727 

728 Given a distribution family and data, perform a test of the null hypothesis 

729 that the data were drawn from a distribution in that family. Any known 

730 parameters of the distribution may be specified. Remaining parameters of 

731 the distribution will be fit to the data, and the p-value of the test 

732 is computed accordingly. Several statistics for comparing the distribution 

733 to data are available. 

734 

735 Parameters 

736 ---------- 

737 dist : `scipy.stats.rv_continuous` 

738 The object representing the distribution family under the null 

739 hypothesis. 

740 data : 1D array_like 

741 Finite, uncensored data to be tested. 

742 known_params : dict, optional 

743 A dictionary containing name-value pairs of known distribution 

744 parameters. Monte Carlo samples are randomly drawn from the 

745 null-hypothesized distribution with these values of the parameters. 

746 Before the statistic is evaluated for each Monte Carlo sample, only 

747 remaining unknown parameters of the null-hypothesized distribution 

748 family are fit to the samples; the known parameters are held fixed. 

749 If all parameters of the distribution family are known, then the step 

750 of fitting the distribution family to each sample is omitted. 

751 fit_params : dict, optional 

752 A dictionary containing name-value pairs of distribution parameters 

753 that have already been fit to the data, e.g. using `scipy.stats.fit` 

754 or the ``fit`` method of `dist`. Monte Carlo samples are drawn from the 

755 null-hypothesized distribution with these specified values of the 

756 parameter. On those Monte Carlo samples, however, these and all other 

757 unknown parameters of the null-hypothesized distribution family are 

758 fit before the statistic is evaluated. 

759 guessed_params : dict, optional 

760 A dictionary containing name-value pairs of distribution parameters 

761 which have been guessed. These parameters are always considered as 

762 free parameters and are fit both to the provided `data` as well as 

763 to the Monte Carlo samples drawn from the null-hypothesized 

764 distribution. The purpose of these `guessed_params` is to be used as 

765 initial values for the numerical fitting procedure. 

766 statistic : {"ad", "ks", "cvm"}, optional 

767 The statistic used to compare data to a distribution after fitting 

768 unknown parameters of the distribution family to the data. The 

769 Anderson-Darling ("ad"), Kolmogorov-Smirnov ("ks"), and 

770 Cramer-von Mises ("cvm") statistics are available [1]_. 

771 n_mc_samples : int, default: 9999 

772 The number of Monte Carlo samples drawn from the null hypothesized 

773 distribution to form the null distribution of the statistic. The 

774 sample size of each is the same as the given `data`. 

775 random_state : {None, int, `numpy.random.Generator`, 

776 `numpy.random.RandomState`}, optional 

777 

778 Pseudorandom number generator state used to generate the Monte Carlo 

779 samples. 

780 

781 If `random_state` is ``None`` (default), the 

782 `numpy.random.RandomState` singleton is used. 

783 If `random_state` is an int, a new ``RandomState`` instance is used, 

784 seeded with `random_state`. 

785 If `random_state` is already a ``Generator`` or ``RandomState`` 

786 instance, then the provided instance is used. 

787 

788 Returns 

789 ------- 

790 res : GoodnessOfFitResult 

791 An object with the following attributes. 

792 

793 fit_result : `~scipy.stats._result_classes.FitResult` 

794 An object representing the fit of the provided `dist` to `data`. 

795 This object includes the values of distribution family parameters 

796 that fully define the null-hypothesized distribution, that is, 

797 the distribution from which Monte Carlo samples are drawn. 

798 statistic : float 

799 The value of the statistic comparing provided `data` to the 

800 null-hypothesized distribution. 

801 pvalue : float 

802 The proportion of elements in the null distribution with 

803 statistic values at least as extreme as the statistic value of the 

804 provided `data`. 

805 null_distribution : ndarray 

806 The value of the statistic for each Monte Carlo sample 

807 drawn from the null-hypothesized distribution. 

808 

809 Notes 

810 ----- 

811 This is a generalized Monte Carlo goodness-of-fit procedure, special cases 

812 of which correspond with various Anderson-Darling tests, Lilliefors' test, 

813 etc. The test is described in [2]_, [3]_, and [4]_ as a parametric 

814 bootstrap test. This is a Monte Carlo test in which parameters that 

815 specify the distribution from which samples are drawn have been estimated 

816 from the data. We describe the test using "Monte Carlo" rather than 

817 "parametric bootstrap" throughout to avoid confusion with the more familiar 

818 nonparametric bootstrap, and describe how the test is performed below. 

819 

820 *Traditional goodness of fit tests* 

821 

822 Traditionally, critical values corresponding with a fixed set of 

823 significance levels are pre-calculated using Monte Carlo methods. Users 

824 perform the test by calculating the value of the test statistic only for 

825 their observed `data` and comparing this value to tabulated critical 

826 values. This practice is not very flexible, as tables are not available for 

827 all distributions and combinations of known and unknown parameter values. 

828 Also, results can be inaccurate when critical values are interpolated from 

829 limited tabulated data to correspond with the user's sample size and 

830 fitted parameter values. To overcome these shortcomings, this function 

831 allows the user to perform the Monte Carlo trials adapted to their 

832 particular data. 

833 

834 *Algorithmic overview* 

835 

836 In brief, this routine executes the following steps: 

837 

838 1. Fit unknown parameters to the given `data`, thereby forming the 

839 "null-hypothesized" distribution, and compute the statistic of 

840 this pair of data and distribution. 

841 2. Draw random samples from this null-hypothesized distribution. 

842 3. Fit the unknown parameters to each random sample. 

843 4. Calculate the statistic between each sample and the distribution that 

844 has been fit to the sample. 

845 5. Compare the value of the statistic corresponding with `data` from (1) 

846 against the values of the statistic corresponding with the random 

847 samples from (4). The p-value is the proportion of samples with a 

848 statistic value greater than or equal to the statistic of the observed 

849 data. 

850 

851 In more detail, the steps are as follows. 

852 

853 First, any unknown parameters of the distribution family specified by 

854 `dist` are fit to the provided `data` using maximum likelihood estimation. 

855 (One exception is the normal distribution with unknown location and scale: 

856 we use the bias-corrected standard deviation ``np.std(data, ddof=1)`` for 

857 the scale as recommended in [1]_.) 

858 These values of the parameters specify a particular member of the 

859 distribution family referred to as the "null-hypothesized distribution", 

860 that is, the distribution from which the data were sampled under the null 

861 hypothesis. The `statistic`, which compares data to a distribution, is 

862 computed between `data` and the null-hypothesized distribution. 

863 

864 Next, many (specifically `n_mc_samples`) new samples, each containing the 

865 same number of observations as `data`, are drawn from the 

866 null-hypothesized distribution. All unknown parameters of the distribution 

867 family `dist` are fit to *each resample*, and the `statistic` is computed 

868 between each sample and its corresponding fitted distribution. These 

869 values of the statistic form the Monte Carlo null distribution (not to be 

870 confused with the "null-hypothesized distribution" above). 

871 

872 The p-value of the test is the proportion of statistic values in the Monte 

873 Carlo null distribution that are at least as extreme as the statistic value 

874 of the provided `data`. More precisely, the p-value is given by 

875 

876 .. math:: 

877 

878 p = \frac{b + 1} 

879 {m + 1} 

880 

881 where :math:`b` is the number of statistic values in the Monte Carlo null 

882 distribution that are greater than or equal to the statistic value

883 calculated for `data`, and :math:`m` is the number of elements in the 

884 Monte Carlo null distribution (`n_mc_samples`). The addition of :math:`1` 

885 to the numerator and denominator can be thought of as including the 

886 value of the statistic corresponding with `data` in the null distribution, 

887 but a more formal explanation is given in [5]_. 

888 
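For illustration only, with hypothetical values of the observed statistic
and a (tiny) Monte Carlo null distribution, the p-value defined above would
be computed as:

>>> import numpy as np
>>> statistic = 0.8  # hypothetical observed statistic
>>> null_distribution = np.array([0.1, 0.3, 0.5, 0.7, 0.9])  # hypothetical
>>> b = np.count_nonzero(null_distribution >= statistic)
>>> (b + 1) / (len(null_distribution) + 1)
0.3333333333333333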

889 *Limitations* 

890 

891 The test can be very slow for some distribution families because unknown 

892 parameters of the distribution family must be fit to each of the Monte 

893 Carlo samples, and for most distributions in SciPy, distribution fitting 

894 is performed via numerical optimization.

895 

896 *Anti-Pattern* 

897 

898 For this reason, it may be tempting 

899 to treat parameters of the distribution pre-fit to `data` (by the user) 

900 as though they were `known_params`, as specification of all parameters of 

901 the distribution precludes the need to fit the distribution to each Monte 

902 Carlo sample. (This is essentially how the original Kolmogorov-Smirnov

903 test is performed.) Although such a test can provide evidence against the 

904 null hypothesis, the test is conservative in the sense that small p-values 

905 will tend to (greatly) *overestimate* the probability of making a type I 

906 error (that is, rejecting the null hypothesis although it is true), and the 

907 power of the test is low (that is, it is less likely to reject the null 

908 hypothesis even when the null hypothesis is false). 

909 This is because the Monte Carlo samples are less likely to agree with the 

910 null-hypothesized distribution as well as `data`. This tends to increase 

911 the values of the statistic recorded in the null distribution, so that a 

912 larger number of them exceed the value of statistic for `data`, thereby 

913 inflating the p-value. 

914 

915 References 

916 ---------- 

917 .. [1] M. A. Stephens (1974). "EDF Statistics for Goodness of Fit and 

918 Some Comparisons." Journal of the American Statistical Association, 

919 Vol. 69, pp. 730-737. 

920 .. [2] W. Stute, W. G. Manteiga, and M. P. Quindimil (1993). 

921 "Bootstrap based goodness-of-fit-tests." Metrika 40.1: 243-256. 

922 .. [3] C. Genest and B. Rémillard (2008). "Validity of the parametric

923 bootstrap for goodness-of-fit testing in semiparametric models." 

924 Annales de l'IHP Probabilités et statistiques. Vol. 44. No. 6. 

925 .. [4] I. Kojadinovic and J. Yan (2012). "Goodness-of-fit testing based on 

926 a weighted bootstrap: A fast large-sample alternative to the 

927 parametric bootstrap." Canadian Journal of Statistics 40.3: 480-500. 

928 .. [5] B. Phipson and G. K. Smyth (2010). "Permutation P-values Should 

929 Never Be Zero: Calculating Exact P-values When Permutations Are 

930 Randomly Drawn." Statistical Applications in Genetics and Molecular 

931 Biology 9.1. 

932 .. [6] H. W. Lilliefors (1967). "On the Kolmogorov-Smirnov test for 

933 normality with mean and variance unknown." Journal of the American 

934 Statistical Association 62.318: 399-402.

935 

936 Examples 

937 -------- 

938 A well-known test of the null hypothesis that data were drawn from a 

939 given distribution is the Kolmogorov-Smirnov (KS) test, available in SciPy 

940 as `scipy.stats.ks_1samp`. Suppose we wish to test whether the following 

941 data: 

942 

943 >>> import numpy as np 

944 >>> from scipy import stats 

945 >>> rng = np.random.default_rng() 

946 >>> x = stats.uniform.rvs(size=75, random_state=rng) 

947 

948 were sampled from a normal distribution. To perform a KS test, the 

949 empirical distribution function of the observed data will be compared 

950 against the (theoretical) cumulative distribution function of a normal 

951 distribution. Of course, to do this, the normal distribution under the null 

952 hypothesis must be fully specified. This is commonly done by first fitting 

953 the ``loc`` and ``scale`` parameters of the distribution to the observed 

954 data, then performing the test. 

955 

956 >>> loc, scale = np.mean(x), np.std(x, ddof=1) 

957 >>> cdf = stats.norm(loc, scale).cdf 

958 >>> stats.ks_1samp(x, cdf) 

959 KstestResult(statistic=0.1119257570456813, pvalue=0.2827756409939257) 

960 

961 An advantage of the KS-test is that the p-value - the probability of 

962 obtaining a value of the test statistic under the null hypothesis as 

963 extreme as the value obtained from the observed data - can be calculated 

964 exactly and efficiently. `goodness_of_fit` can only approximate these 

965 results. 

966 

967 >>> known_params = {'loc': loc, 'scale': scale} 

968 >>> res = stats.goodness_of_fit(stats.norm, x, known_params=known_params, 

969 ... statistic='ks', random_state=rng) 

970 >>> res.statistic, res.pvalue 

971 (0.1119257570456813, 0.2788) 

972 

973 The statistic matches exactly, but the p-value is estimated by forming 

974 a "Monte Carlo null distribution", that is, by explicitly drawing random 

975 samples from `scipy.stats.norm` with the provided parameters and 

976 calculating the statistic for each. The fraction of these statistic values

977 at least as extreme as ``res.statistic`` approximates the exact p-value 

978 calculated by `scipy.stats.ks_1samp`. 

979 

980 However, in many cases, we would prefer to test only that the data were 

981 sampled from one of *any* member of the normal distribution family, not 

982 specifically from the normal distribution with the location and scale 

983 fitted to the observed sample. In this case, Lilliefors [6]_ argued that 

984 the KS test is far too conservative (that is, the p-value overstates 

985 the actual probability of rejecting a true null hypothesis) and thus lacks 

986 power - the ability to reject the null hypothesis when the null hypothesis 

987 is actually false. 

988 Indeed, our p-value above is approximately 0.28, which is far too large 

989 to reject the null hypothesis at any common significance level. 

990 

991 Consider why this might be. Note that in the KS test above, the statistic 

992 always compares data against the CDF of a normal distribution fitted to the 

993 *observed data*. This tends to reduce the value of the statistic for the 

994 observed data, but it is "unfair" when computing the statistic for other 

995 samples, such as those we randomly draw to form the Monte Carlo null 

996 distribution. It is easy to correct for this: whenever we compute the KS 

997 statistic of a sample, we use the CDF of a normal distribution fitted 

998 to *that sample*. The null distribution in this case has not been 

999 calculated exactly and is typically approximated using Monte Carlo methods

1000 as described above. This is where `goodness_of_fit` excels. 

1001 

1002 >>> res = stats.goodness_of_fit(stats.norm, x, statistic='ks', 

1003 ... random_state=rng) 

1004 >>> res.statistic, res.pvalue 

1005 (0.1119257570456813, 0.0196) 

1006 

1007 Indeed, this p-value is much smaller, and small enough to (correctly) 

1008 reject the null hypothesis at common significance levels, including 5% and

1009 2.5%. 

1010 

1011 However, the KS statistic is not very sensitive to all deviations from 

1012 normality. The original advantage of the KS statistic was the ability 

1013 to compute the null distribution theoretically, but a more sensitive 

1014 statistic - resulting in a higher test power - can be used now that we can 

1015 approximate the null distribution 

1016 computationally. The Anderson-Darling statistic [1]_ tends to be more 

1017 sensitive, and critical values of this statistic have been tabulated

1018 for various significance levels and sample sizes using Monte Carlo methods. 

1019 

1020 >>> res = stats.anderson(x, 'norm') 

1021 >>> print(res.statistic) 

1022 1.2139573337497467 

1023 >>> print(res.critical_values) 

1024 [0.549 0.625 0.75 0.875 1.041] 

1025 >>> print(res.significance_level) 

1026 [15. 10. 5. 2.5 1. ] 

1027 

1028 Here, the observed value of the statistic exceeds the critical value 

1029 corresponding with a 1% significance level. This tells us that the p-value 

1030 of the observed data is less than 1%, but what is it? We could interpolate 

1031 from these (already-interpolated) values, but `goodness_of_fit` can 

1032 estimate it directly. 

1033 

1034 >>> res = stats.goodness_of_fit(stats.norm, x, statistic='ad', 

1035 ... random_state=rng) 

1036 >>> res.statistic, res.pvalue 

1037 (1.2139573337497467, 0.0034) 

1038 

1039 A further advantage is that use of `goodness_of_fit` is not limited to 

1040 a particular set of distributions or conditions on which parameters 

1041 are known versus which must be estimated from data. Instead, 

1042 `goodness_of_fit` can estimate p-values relatively quickly for any 

1043 distribution with a sufficiently fast and reliable ``fit`` method. For 

1044 instance, here we perform a goodness of fit test using the Cramer-von Mises 

1045 statistic against the Rayleigh distribution with known location and unknown 

1046 scale. 

1047 

1048 >>> rng = np.random.default_rng() 

1049 >>> x = stats.chi(df=2.2, loc=0, scale=2).rvs(size=1000, random_state=rng) 

1050 >>> res = stats.goodness_of_fit(stats.rayleigh, x, statistic='cvm', 

1051 ... known_params={'loc': 0}, random_state=rng) 

1052 

1053 This executes fairly quickly, but to check the reliability of the ``fit`` 

1054 method, we should inspect the fit result. 

1055 

1056 >>> res.fit_result # location is as specified, and scale is reasonable 

1057 params: FitParams(loc=0.0, scale=2.1026719844231243) 

1058 success: True 

1059 message: 'The fit was performed successfully.' 

1060 >>> import matplotlib.pyplot as plt # matplotlib must be installed to plot 

1061 >>> res.fit_result.plot() 

1062 >>> plt.show() 

1063 

1064 If the distribution is not fit to the observed data as well as possible, 

1065 the test may not control the type I error rate, that is, the chance of 

1066 rejecting the null hypothesis even when it is true. 

1067 

1068 We should also look for extreme outliers in the null distribution that 

1069 may be caused by unreliable fitting. These do not necessarily invalidate 

1070 the result, but they tend to reduce the test's power. 

1071 

1072 >>> _, ax = plt.subplots() 

1073 >>> ax.hist(np.log10(res.null_distribution)) 

1074 >>> ax.set_xlabel("log10 of CVM statistic under the null hypothesis") 

1075 >>> ax.set_ylabel("Frequency") 

1076 >>> ax.set_title("Histogram of the Monte Carlo null distribution") 

1077 >>> plt.show() 

1078 

1079 This plot seems reassuring. 

1080 

1081 If the ``fit`` method is working reliably, and if the distribution of the test

1082 statistic is not particularly sensitive to the values of the fitted 

1083 parameters, then the p-value provided by `goodness_of_fit` is expected to 

1084 be a good approximation. 

1085 

1086 >>> res.statistic, res.pvalue 

1087 (0.2231991510248692, 0.0525) 

1088 

1089 """ 

1090 args = _gof_iv(dist, data, known_params, fit_params, guessed_params, 

1091 statistic, n_mc_samples, random_state) 

1092 (dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params, 

1093 guessed_rfd_params, statistic, n_mc_samples_int, random_state) = args 

1094 

1095 # Fit null hypothesis distribution to data 

1096 nhd_fit_fun = _get_fit_fun(dist, data, guessed_nhd_params, 

1097 fixed_nhd_params) 

1098 nhd_vals = nhd_fit_fun(data) 

1099 nhd_dist = dist(*nhd_vals) 

1100 

1101 def rvs(size): 

1102 return nhd_dist.rvs(size=size, random_state=random_state) 

1103 

1104 # Define statistic 

1105 fit_fun = _get_fit_fun(dist, data, guessed_rfd_params, fixed_rfd_params) 

1106 compare_fun = _compare_dict[statistic] 

1107 

1108 def statistic_fun(data, axis=-1): 

1109 # Make things simple by always working along the last axis. 

1110 data = np.moveaxis(data, axis, -1) 

1111 rfd_vals = fit_fun(data) 

1112 rfd_dist = dist(*rfd_vals) 

1113 return compare_fun(rfd_dist, data) 

1114 

1115 res = stats.monte_carlo_test(data, rvs, statistic_fun, vectorized=True, 

1116 n_resamples=n_mc_samples, axis=-1, 

1117 alternative='greater') 

1118 opt_res = optimize.OptimizeResult() 

1119 opt_res.success = True 

1120 opt_res.message = "The fit was performed successfully." 

1121 opt_res.x = nhd_vals 

1122 # Only continuous distributions for now, hence discrete=False 

1123 # There's no fundamental limitation; it's just that we're not using 

1124 # stats.fit, discrete distributions don't have `fit` method, and 

1125 # we haven't written any vectorized fit functions for a discrete 

1126 # distribution yet. 

1127 return GoodnessOfFitResult(FitResult(dist, data, False, opt_res), 

1128 res.statistic, res.pvalue, 

1129 res.null_distribution) 

1130 
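# Illustrative sketch (not part of SciPy; `_example_monte_carlo_gof` is a
# hypothetical name): a simplified, loop-based version of the Monte Carlo
# procedure described in the docstring of `goodness_of_fit`, specialized to
# the normal family and the KS statistic. Each Monte Carlo sample is re-fit
# before its statistic is computed, and the p-value is (b + 1) / (m + 1).
def _example_monte_carlo_gof(data, n_mc_samples=999, random_state=None):
    rng = check_random_state(random_state)
    data = np.asarray(data, dtype=float)

    def ks_stat(sample):
        # Fit the free parameters to *this* sample, then compare the sample
        # against the distribution fitted to it.
        loc, scale = np.mean(sample), np.std(sample, ddof=1)
        return stats.ks_1samp(sample, stats.norm(loc, scale).cdf).statistic

    observed = ks_stat(data)
    loc, scale = np.mean(data), np.std(data, ddof=1)  # null-hypothesized dist
    null_distribution = np.array([
        ks_stat(stats.norm.rvs(loc, scale, size=data.size, random_state=rng))
        for _ in range(n_mc_samples)
    ])
    b = np.count_nonzero(null_distribution >= observed)
    pvalue = (b + 1) / (n_mc_samples + 1)
    return observed, pvalue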

1131 

1132def _get_fit_fun(dist, data, guessed_params, fixed_params): 

1133 

1134 shape_names = [] if dist.shapes is None else dist.shapes.split(", ") 

1135 param_names = shape_names + ['loc', 'scale'] 

1136 fparam_names = ['f'+name for name in param_names] 

1137 all_fixed = not set(fparam_names).difference(fixed_params) 

1138 guessed_shapes = [guessed_params.pop(x, None) 

1139 for x in shape_names if x in guessed_params] 

1140 

1141 # Define statistic, including fitting distribution to data 

1142 if dist in _fit_funs: 

1143 def fit_fun(data): 

1144 params = _fit_funs[dist](data, **fixed_params) 

1145 params = np.asarray(np.broadcast_arrays(*params)) 

1146 if params.ndim > 1: 

1147 params = params[..., np.newaxis] 

1148 return params 

1149 

1150 elif all_fixed: 

1151 def fit_fun(data): 

1152 return [fixed_params[name] for name in fparam_names] 

1153 

1154 else: 

1155 def fit_fun_1d(data): 

1156 return dist.fit(data, *guessed_shapes, **guessed_params, 

1157 **fixed_params) 

1158 

1159 def fit_fun(data): 

1160 params = np.apply_along_axis(fit_fun_1d, axis=-1, arr=data) 

1161 if params.ndim > 1: 

1162 params = params.T[..., np.newaxis] 

1163 return params 

1164 

1165 return fit_fun 

1166 

1167 

1168# Vectorized fitting functions. These accept ND `data`, in which each row

1169# (slice along the last axis) is a sample to fit, and scalar fixed parameters.

1170# They return a tuple of shape parameter arrays, each of shape data.shape[:-1]. 

1171def _fit_norm(data, floc=None, fscale=None): 

1172 loc = floc 

1173 scale = fscale 

1174 if loc is None and scale is None: 

1175 loc = np.mean(data, axis=-1) 

1176 scale = np.std(data, ddof=1, axis=-1) 

1177 elif loc is None: 

1178 loc = np.mean(data, axis=-1) 

1179 elif scale is None: 

1180 scale = np.sqrt(((data - loc)**2).mean(axis=-1)) 

1181 return loc, scale 
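# Hypothetical usage sketch (not part of SciPy): `_fit_norm` above fits each
# row (slice along the last axis) independently, so a (3, 100) input yields
# `loc` and `scale` arrays of shape (3,).
def _example_fit_norm_vectorized():
    rng = np.random.default_rng(0)
    data = rng.normal(loc=[[0.0], [1.0], [2.0]], scale=1.0, size=(3, 100))
    loc, scale = _fit_norm(data)
    return loc.shape, scale.shape  # both (3,)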

1182 

1183 

1184_fit_funs = {stats.norm: _fit_norm} # type: ignore[attr-defined] 

1185 

1186 

1187# Vectorized goodness of fit statistic functions. These accept a frozen 

1188# distribution object and `data` in which each row (slice along last axis) is 

1189# a sample. 

1190 

1191 

1192def _anderson_darling(dist, data): 

1193 x = np.sort(data, axis=-1) 

1194 n = data.shape[-1] 

1195 i = np.arange(1, n+1) 

1196 Si = (2*i - 1)/n * (dist.logcdf(x) + dist.logsf(x[..., ::-1])) 

1197 S = np.sum(Si, axis=-1) 

1198 return -n - S 

1199 

1200 

1201def _compute_dplus(cdfvals): # adapted from _stats_py before gh-17062 

1202 n = cdfvals.shape[-1] 

1203 return (np.arange(1.0, n + 1) / n - cdfvals).max(axis=-1) 

1204 

1205 

1206def _compute_dminus(cdfvals, axis=-1): 

1207 n = cdfvals.shape[-1] 

1208 return (cdfvals - np.arange(0.0, n)/n).max(axis=-1) 

1209 

1210 

1211def _kolmogorov_smirnov(dist, data): 

1212 x = np.sort(data, axis=-1) 

1213 cdfvals = dist.cdf(x) 

1214 Dplus = _compute_dplus(cdfvals) # always works along last axis 

1215 Dminus = _compute_dminus(cdfvals) 

1216 return np.maximum(Dplus, Dminus) 

1217 

1218 

1219def _cramer_von_mises(dist, data): 

1220 x = np.sort(data, axis=-1) 

1221 n = data.shape[-1] 

1222 cdfvals = dist.cdf(x) 

1223 u = (2*np.arange(1, n+1) - 1)/(2*n) 

1224 w = 1 / (12*n) + np.sum((u - cdfvals)**2, axis=-1) 

1225 return w 

1226 

1227 

1228_compare_dict = {"ad": _anderson_darling, "ks": _kolmogorov_smirnov, 

1229 "cvm": _cramer_von_mises} 

1230 

1231 

1232def _gof_iv(dist, data, known_params, fit_params, guessed_params, statistic, 

1233 n_mc_samples, random_state): 

1234 

1235 if not isinstance(dist, stats.rv_continuous): 

1236 message = ("`dist` must be a (non-frozen) instance of " 

1237 "`stats.rv_continuous`.") 

1238 raise TypeError(message) 

1239 

1240 data = np.asarray(data, dtype=float) 

1241 if not data.ndim == 1: 

1242 message = "`data` must be a one-dimensional array of numbers." 

1243 raise ValueError(message) 

1244 

1245 # Leave validation of these key/value pairs to the `fit` method, 

1246 # but collect these into dictionaries that will be used 

1247 known_params = known_params or dict() 

1248 fit_params = fit_params or dict() 

1249 guessed_params = guessed_params or dict() 

1250 

1251 known_params_f = {("f"+key): val for key, val in known_params.items()} 

1252 fit_params_f = {("f"+key): val for key, val in fit_params.items()} 

1253 

1254 # These are the values of parameters of the null distribution family

1255 # with which resamples are drawn 

1256 fixed_nhd_params = known_params_f.copy() 

1257 fixed_nhd_params.update(fit_params_f) 

1258 

1259 # These are fixed when fitting the distribution family to resamples 

1260 fixed_rfd_params = known_params_f.copy() 

1261 

1262 # These are used as guesses when fitting the distribution family to 

1263 # the original data 

1264 guessed_nhd_params = guessed_params.copy() 

1265 

1266 # These are used as guesses when fitting the distribution family to 

1267 # resamples 

1268 guessed_rfd_params = fit_params.copy() 

1269 guessed_rfd_params.update(guessed_params) 

1270 

1271 statistics = {'ad', 'ks', 'cvm'} 

1272 if statistic.lower() not in statistics: 

1273 message = f"`statistic` must be one of {statistics}." 

1274 raise ValueError(message) 

1275 

1276 n_mc_samples_int = int(n_mc_samples) 

1277 if n_mc_samples_int != n_mc_samples: 

1278 message = "`n_mc_samples` must be an integer." 

1279 raise TypeError(message) 

1280 

1281 random_state = check_random_state(random_state) 

1282 

1283 return (dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params, 

1284 guessed_rfd_params, statistic, n_mc_samples_int, random_state)