Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_stats_py.py: 10%

2034 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1# Copyright 2002 Gary Strangman. All rights reserved 

2# Copyright 2002-2016 The SciPy Developers 

3# 

4# The original code from Gary Strangman was heavily adapted for 

5# use in SciPy by Travis Oliphant. The original code came with the 

6# following disclaimer: 

7# 

8# This software is provided "as-is". There are no expressed or implied 

9# warranties of any kind, including, but not limited to, the warranties 

10# of merchantability and fitness for a given application. In no event 

11# shall Gary Strangman be liable for any direct, indirect, incidental, 

12# special, exemplary or consequential damages (including, but not limited 

13# to, loss of use, data or profits, or business interruption) however 

14# caused and on any theory of liability, whether in contract, strict 

15# liability or tort (including negligence or otherwise) arising in any way 

16# out of the use of this software, even if advised of the possibility of 

17# such damage. 

18 

19""" 

20A collection of basic statistical functions for Python. 

21 

22References 

23---------- 

24.. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard 

25 Probability and Statistics Tables and Formulae. Chapman & Hall: New 

26 York. 2000. 

27 

28""" 

29import warnings 

30import math 

31from math import gcd 

32from collections import namedtuple, Counter 

33 

34import numpy as np 

35from numpy import array, asarray, ma 

36from numpy.lib import NumpyVersion 

37from numpy.testing import suppress_warnings 

38 

39from scipy.spatial.distance import cdist 

40from scipy.ndimage import _measurements 

41from scipy._lib._util import (check_random_state, MapWrapper, 

42 rng_integers, _rename_parameter, _contains_nan) 

43 

44import scipy.special as special 

45from scipy import linalg 

46from . import distributions 

47from . import _mstats_basic as mstats_basic 

48from ._stats_mstats_common import (_find_repeats, linregress, theilslopes, 

49 siegelslopes) 

50from ._stats import (_kendall_dis, _toint64, _weightedrankedtau, 

51 _local_correlations) 

52from dataclasses import make_dataclass 

53from ._hypotests import _all_partitions 

54from ._stats_pythran import _compute_outer_prob_inside_method 

55from ._resampling import _batch_generator 

56from ._axis_nan_policy import (_axis_nan_policy_factory, 

57 _broadcast_concatenate) 

58from ._binomtest import _binary_search_for_binom_tst as _binary_search 

59from scipy._lib._bunch import _make_tuple_bunch 

60from scipy import stats 

61from scipy.optimize import root_scalar 

62 

63 

64# Functions/classes in other files should be added in `__init__.py`, not here 

65__all__ = ['find_repeats', 'gmean', 'hmean', 'pmean', 'mode', 'tmean', 'tvar', 

66 'tmin', 'tmax', 'tstd', 'tsem', 'moment', 

67 'skew', 'kurtosis', 'describe', 'skewtest', 'kurtosistest', 

68 'normaltest', 'jarque_bera', 

69 'scoreatpercentile', 'percentileofscore', 

70 'cumfreq', 'relfreq', 'obrientransform', 

71 'sem', 'zmap', 'zscore', 'gzscore', 'iqr', 'gstd', 

72 'median_abs_deviation', 

73 'sigmaclip', 'trimboth', 'trim1', 'trim_mean', 

74 'f_oneway', 'pearsonr', 'fisher_exact', 

75 'spearmanr', 'pointbiserialr', 

76 'kendalltau', 'weightedtau', 'multiscale_graphcorr', 

77 'linregress', 'siegelslopes', 'theilslopes', 'ttest_1samp', 

78 'ttest_ind', 'ttest_ind_from_stats', 'ttest_rel', 

79 'kstest', 'ks_1samp', 'ks_2samp', 

80 'chisquare', 'power_divergence', 

81 'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare', 

82 'rankdata', 

83 'combine_pvalues', 'wasserstein_distance', 'energy_distance', 

84 'brunnermunzel', 'alexandergovern', 

85 'expectile', ] 

86 

87 

88def _chk_asarray(a, axis): 

89 if axis is None: 

90 a = np.ravel(a) 

91 outaxis = 0 

92 else: 

93 a = np.asarray(a) 

94 outaxis = axis 

95 

96 if a.ndim == 0: 

97 a = np.atleast_1d(a) 

98 

99 return a, outaxis 

100 

101 

102def _chk2_asarray(a, b, axis): 

103 if axis is None: 

104 a = np.ravel(a) 

105 b = np.ravel(b) 

106 outaxis = 0 

107 else: 

108 a = np.asarray(a) 

109 b = np.asarray(b) 

110 outaxis = axis 

111 

112 if a.ndim == 0: 

113 a = np.atleast_1d(a) 

114 if b.ndim == 0: 

115 b = np.atleast_1d(b) 

116 

117 return a, b, outaxis 

118 

119 

120def _shape_with_dropped_axis(a, axis): 

121 """ 

122 Given an array `a` and an integer `axis`, return the shape 

123 of `a` with the `axis` dimension removed. 

124 

125 Examples 

126 -------- 

127 >>> a = np.zeros((3, 5, 2)) 

128 >>> _shape_with_dropped_axis(a, 1) 

129 (3, 2) 

130 

131 """ 

132 shp = list(a.shape) 

133 try: 

134 del shp[axis] 

135 except IndexError: 

136 raise np.AxisError(axis, a.ndim) from None 

137 return tuple(shp) 

138 

139 

140def _broadcast_shapes(shape1, shape2): 

141 """ 

142 Given two shapes (i.e. tuples of integers), return the shape 

143 that would result from broadcasting two arrays with the given 

144 shapes. 

145 

146 Examples 

147 -------- 

148 >>> _broadcast_shapes((2, 1), (4, 1, 3)) 

149 (4, 2, 3) 

150 """ 

151 d = len(shape1) - len(shape2) 

152 if d <= 0: 

153 shp1 = (1,)*(-d) + shape1 

154 shp2 = shape2 

155 else: 

156 shp1 = shape1 

157 shp2 = (1,)*d + shape2 

158 shape = [] 

159 for n1, n2 in zip(shp1, shp2): 

160 if n1 == 1: 

161 n = n2 

162 elif n2 == 1 or n1 == n2: 

163 n = n1 

164 else: 

165 raise ValueError(f'shapes {shape1} and {shape2} could not be ' 

166 'broadcast together') 

167 shape.append(n) 

168 return tuple(shape) 

169 

170 

171def _broadcast_shapes_with_dropped_axis(a, b, axis): 

172 """ 

173 Given two arrays `a` and `b` and an integer `axis`, find the 

174 shape of the broadcast result after dropping `axis` from the 

175 shapes of `a` and `b`. 

176 

177 Examples 

178 -------- 

179 >>> a = np.zeros((5, 2, 1)) 

180 >>> b = np.zeros((1, 9, 3)) 

181 >>> _broadcast_shapes_with_dropped_axis(a, b, 1) 

182 (5, 3) 

183 """ 

184 shp1 = _shape_with_dropped_axis(a, axis) 

185 shp2 = _shape_with_dropped_axis(b, axis) 

186 try: 

187 shp = _broadcast_shapes(shp1, shp2) 

188 except ValueError: 

189 raise ValueError(f'non-axis shapes {shp1} and {shp2} could not be ' 

190 'broadcast together') from None 

191 return shp 

192 

193 

194SignificanceResult = _make_tuple_bunch('SignificanceResult', 

195 ['statistic', 'pvalue'], []) 

196 

197 

198# note that `weights` are paired with `x` 

199@_axis_nan_policy_factory( 

200 lambda x: x, n_samples=1, n_outputs=1, too_small=0, paired=True, 

201 result_to_tuple=lambda x: (x,), kwd_samples=['weights']) 

202def gmean(a, axis=0, dtype=None, weights=None): 

203 r"""Compute the weighted geometric mean along the specified axis. 

204 

205 The weighted geometric mean of the array :math:`a_i` associated to weights 

206 :math:`w_i` is: 

207 

208 .. math:: 

209 

210 \exp \left( \frac{ \sum_{i=1}^n w_i \ln a_i }{ \sum_{i=1}^n w_i } 

211 \right) \, , 

212 

213 and, with equal weights, it gives: 

214 

215 .. math:: 

216 

217 \sqrt[n]{ \prod_{i=1}^n a_i } \, . 

218 

219 Parameters 

220 ---------- 

221 a : array_like 

222 Input array or object that can be converted to an array. 

223 axis : int or None, optional 

224 Axis along which the geometric mean is computed. Default is 0. 

225 If None, compute over the whole array `a`. 

226 dtype : dtype, optional 

227 Type to which the input arrays are cast before the calculation is 

228 performed. 

229 weights : array_like, optional 

230 The `weights` array must be broadcastable to the same shape as `a`. 

231 Default is None, which gives each value a weight of 1.0. 

232 

233 Returns 

234 ------- 

235 gmean : ndarray 

236 See `dtype` parameter above. 

237 

238 See Also 

239 -------- 

240 numpy.mean : Arithmetic average 

241 numpy.average : Weighted average 

242 hmean : Harmonic mean 

243 

244 References 

245 ---------- 

246 .. [1] "Weighted Geometric Mean", *Wikipedia*, 

247 https://en.wikipedia.org/wiki/Weighted_geometric_mean. 

248 

249 Examples 

250 -------- 

251 >>> from scipy.stats import gmean 

252 >>> gmean([1, 4]) 

253 2.0 

254 >>> gmean([1, 2, 3, 4, 5, 6, 7]) 

255 3.3800151591412964 

256 >>> gmean([1, 4, 7], weights=[3, 1, 3]) 

257 2.80668351922014 

258 

259 """ 

260 

261 a = np.asarray(a, dtype=dtype) 

262 

263 if weights is not None: 

264 weights = np.asarray(weights, dtype=dtype) 

265 

266 with np.errstate(divide='ignore'): 

267 log_a = np.log(a) 

268 

269 return np.exp(np.average(log_a, axis=axis, weights=weights)) 

270 

271 

272@_axis_nan_policy_factory( 

273 lambda x: x, n_samples=1, n_outputs=1, too_small=0, paired=True, 

274 result_to_tuple=lambda x: (x,), kwd_samples=['weights']) 

275def hmean(a, axis=0, dtype=None, *, weights=None): 

276 r"""Calculate the weighted harmonic mean along the specified axis. 

277 

278 The weighted harmonic mean of the array :math:`a_i` associated to weights 

279 :math:`w_i` is: 

280 

281 .. math:: 

282 

283 \frac{ \sum_{i=1}^n w_i }{ \sum_{i=1}^n \frac{w_i}{a_i} } \, , 

284 

285 and, with equal weights, it gives: 

286 

287 .. math:: 

288 

289 \frac{ n }{ \sum_{i=1}^n \frac{1}{a_i} } \, . 

290 

291 Parameters 

292 ---------- 

293 a : array_like 

294 Input array, masked array or object that can be converted to an array. 

295 axis : int or None, optional 

296 Axis along which the harmonic mean is computed. Default is 0. 

297 If None, compute over the whole array `a`. 

298 dtype : dtype, optional 

299 Type of the returned array and of the accumulator in which the 

300 elements are summed. If `dtype` is not specified, it defaults to the 

301 dtype of `a`, unless `a` has an integer `dtype` with a precision less 

302 than that of the default platform integer. In that case, the default 

303 platform integer is used. 

304 weights : array_like, optional 

305 The weights array can either be 1-D (in which case its length must be 

306 the size of `a` along the given `axis`) or of the same shape as `a`. 

307 Default is None, which gives each value a weight of 1.0. 

308 

309 .. versionadded:: 1.9 

310 

311 Returns 

312 ------- 

313 hmean : ndarray 

314 See `dtype` parameter above. 

315 

316 See Also 

317 -------- 

318 numpy.mean : Arithmetic average 

319 numpy.average : Weighted average 

320 gmean : Geometric mean 

321 

322 Notes 

323 ----- 

324 The harmonic mean is computed over a single dimension of the input 

325 array, axis=0 by default, or all values in the array if axis=None. 

326 float64 intermediate and return values are used for integer inputs. 

327 

328 References 

329 ---------- 

330 .. [1] "Weighted Harmonic Mean", *Wikipedia*, 

331 https://en.wikipedia.org/wiki/Harmonic_mean#Weighted_harmonic_mean 

332 .. [2] Ferger, F., "The nature and use of the harmonic mean", Journal of 

333 the American Statistical Association, vol. 26, pp. 36-40, 1931 

334 

335 Examples 

336 -------- 

337 >>> from scipy.stats import hmean 

338 >>> hmean([1, 4]) 

339 1.6000000000000001 

340 >>> hmean([1, 2, 3, 4, 5, 6, 7]) 

341 2.6997245179063363 

342 >>> hmean([1, 4, 7], weights=[3, 1, 3]) 

343 1.9029126213592233 

344 

345 """ 

346 if not isinstance(a, np.ndarray): 

347 a = np.array(a, dtype=dtype) 

348 elif dtype: 

349 # Must change the default dtype allowing array type 

350 if isinstance(a, np.ma.MaskedArray): 

351 a = np.ma.asarray(a, dtype=dtype) 

352 else: 

353 a = np.asarray(a, dtype=dtype) 

354 

355 if np.all(a >= 0): 

356 # Harmonic mean only defined if greater than or equal to zero. 

357 if weights is not None: 

358 weights = np.asanyarray(weights, dtype=dtype) 

359 

360 with np.errstate(divide='ignore'): 

361 return 1.0 / np.average(1.0 / a, axis=axis, weights=weights) 

362 else: 

363 raise ValueError("Harmonic mean only defined if all elements greater " 

364 "than or equal to zero") 

365 

366 

367@_axis_nan_policy_factory( 

368 lambda x: x, n_samples=1, n_outputs=1, too_small=0, paired=True, 

369 result_to_tuple=lambda x: (x,), kwd_samples=['weights']) 

370def pmean(a, p, *, axis=0, dtype=None, weights=None): 

371 r"""Calculate the weighted power mean along the specified axis. 

372 

373 The weighted power mean of the array :math:`a_i` associated to weights 

374 :math:`w_i` is: 

375 

376 .. math:: 

377 

378 \left( \frac{ \sum_{i=1}^n w_i a_i^p }{ \sum_{i=1}^n w_i } 

379 \right)^{ 1 / p } \, , 

380 

381 and, with equal weights, it gives: 

382 

383 .. math:: 

384 

385 \left( \frac{ 1 }{ n } \sum_{i=1}^n a_i^p \right)^{ 1 / p } \, . 

386 

387 This mean is also called generalized mean or Hölder mean, and must not be 

388 confused with the Kolmogorov generalized mean, also called 

389 quasi-arithmetic mean or generalized f-mean [3]_. 

390 

391 Parameters 

392 ---------- 

393 a : array_like 

394 Input array, masked array or object that can be converted to an array. 

395 p : int or float 

396 Exponent. 

397 axis : int or None, optional 

398 Axis along which the power mean is computed. Default is 0. 

399 If None, compute over the whole array `a`. 

400 dtype : dtype, optional 

401 Type of the returned array and of the accumulator in which the 

402 elements are summed. If `dtype` is not specified, it defaults to the 

403 dtype of `a`, unless `a` has an integer `dtype` with a precision less 

404 than that of the default platform integer. In that case, the default 

405 platform integer is used. 

406 weights : array_like, optional 

407 The weights array can either be 1-D (in which case its length must be 

408 the size of `a` along the given `axis`) or of the same shape as `a`. 

409 Default is None, which gives each value a weight of 1.0. 

410 

411 Returns 

412 ------- 

413 pmean : ndarray, see `dtype` parameter above. 

414 Output array containing the power mean values. 

415 

416 See Also 

417 -------- 

418 numpy.average : Weighted average 

419 gmean : Geometric mean 

420 hmean : Harmonic mean 

421 

422 Notes 

423 ----- 

424 The power mean is computed over a single dimension of the input 

425 array, ``axis=0`` by default, or all values in the array if ``axis=None``. 

426 float64 intermediate and return values are used for integer inputs. 

427 

428 .. versionadded:: 1.9 

429 

430 References 

431 ---------- 

432 .. [1] "Generalized Mean", *Wikipedia*, 

433 https://en.wikipedia.org/wiki/Generalized_mean 

434 .. [2] Norris, N., "Convexity properties of generalized mean value 

435 functions", The Annals of Mathematical Statistics, vol. 8, 

436 pp. 118-120, 1937 

437 .. [3] Bullen, P.S., Handbook of Means and Their Inequalities, 2003 

438 

439 Examples 

440 -------- 

441 >>> from scipy.stats import pmean, hmean, gmean 

442 >>> pmean([1, 4], 1.3) 

443 2.639372938300652 

444 >>> pmean([1, 2, 3, 4, 5, 6, 7], 1.3) 

445 4.157111214492084 

446 >>> pmean([1, 4, 7], -2, weights=[3, 1, 3]) 

447 1.4969684896631954 

448 

449 For p=-1, power mean is equal to harmonic mean: 

450 

451 >>> pmean([1, 4, 7], -1, weights=[3, 1, 3]) 

452 1.9029126213592233 

453 >>> hmean([1, 4, 7], weights=[3, 1, 3]) 

454 1.9029126213592233 

455 

456 For p=0, power mean is defined as the geometric mean: 

457 

458 >>> pmean([1, 4, 7], 0, weights=[3, 1, 3]) 

459 2.80668351922014 

460 >>> gmean([1, 4, 7], weights=[3, 1, 3]) 

461 2.80668351922014 

462 

463 """ 

464 if not isinstance(p, (int, float)): 

465 raise ValueError("Power mean only defined for exponent of type int or " 

466 "float.") 

467 if p == 0: 

468 return gmean(a, axis=axis, dtype=dtype, weights=weights) 

469 

470 if not isinstance(a, np.ndarray): 

471 a = np.array(a, dtype=dtype) 

472 elif dtype: 

473 # Must change the default dtype allowing array type 

474 if isinstance(a, np.ma.MaskedArray): 

475 a = np.ma.asarray(a, dtype=dtype) 

476 else: 

477 a = np.asarray(a, dtype=dtype) 

478 

479 if np.all(a >= 0): 

480 # Power mean only defined if greater than or equal to zero 

481 if weights is not None: 

482 weights = np.asanyarray(weights, dtype=dtype) 

483 

484 with np.errstate(divide='ignore'): 

485 return np.float_power( 

486 np.average(np.float_power(a, p), axis=axis, weights=weights), 

487 1/p) 

488 else: 

489 raise ValueError("Power mean only defined if all elements greater " 

490 "than or equal to zero") 

491 

492 

493ModeResult = namedtuple('ModeResult', ('mode', 'count')) 

494 

495 

496def mode(a, axis=0, nan_policy='propagate', keepdims=None): 

497 r"""Return an array of the modal (most common) value in the passed array. 

498 

499 If there is more than one such value, only one is returned. 

500 The bin-count for the modal bins is also returned. 

501 

502 Parameters 

503 ---------- 

504 a : array_like 

505 n-dimensional array of which to find mode(s). 

506 axis : int or None, optional 

507 Axis along which to operate. Default is 0. If None, compute over 

508 the whole array `a`. 

509 nan_policy : {'propagate', 'raise', 'omit'}, optional 

510 Defines how to handle when input contains nan. 

511 The following options are available (default is 'propagate'): 

512 

513 * 'propagate': treats nan as it would treat any other value 

514 * 'raise': throws an error 

515 * 'omit': performs the calculations ignoring nan values 

516 keepdims : bool, optional 

517 If set to ``False``, the `axis` over which the statistic is taken 

518 is consumed (eliminated from the output array) like other reduction 

519 functions (e.g. `skew`, `kurtosis`). If set to ``True``, the `axis` is 

520 retained with size one, and the result will broadcast correctly 

521 against the input array. The default, ``None``, is undefined legacy 

522 behavior retained for backward compatibility. 

523 

524 .. warning:: 

525 Unlike other reduction functions (e.g. `skew`, `kurtosis`), the 

526 default behavior of `mode` usually retains the axis it acts 

527 along. In SciPy 1.11.0, this behavior will change: the default 

528 value of `keepdims` will become ``False``, the `axis` over which 

529 the statistic is taken will be eliminated, and the value ``None`` 

530 will no longer be accepted. 

531 .. versionadded:: 1.9.0 

532 

533 Returns 

534 ------- 

535 mode : ndarray 

536 Array of modal values. 

537 count : ndarray 

538 Array of counts for each mode. 

539 

540 Notes 

541 ----- 

542 The mode of object arrays is calculated using `collections.Counter`, which 

543 treats NaNs with different binary representations as distinct. 

544 

545 .. deprecated:: 1.9.0 

546 Support for non-numeric arrays has been deprecated as of SciPy 1.9.0 

547 and will be removed in 1.11.0. `pandas.DataFrame.mode`_ can 

548 be used instead. 

549 

550 .. _pandas.DataFrame.mode: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.mode.html 

551 

552 The mode of arrays with other dtypes is calculated using `numpy.unique`. 

553 In NumPy versions 1.21 and after, all NaNs - even those with different 

554 binary representations - are treated as equivalent and counted as separate 

555 instances of the same value. 

556 

557 Examples 

558 -------- 

559 >>> import numpy as np 

560 >>> a = np.array([[3, 0, 3, 7], 

561 ... [3, 2, 6, 2], 

562 ... [1, 7, 2, 8], 

563 ... [3, 0, 6, 1], 

564 ... [3, 2, 5, 5]]) 

565 >>> from scipy import stats 

566 >>> stats.mode(a, keepdims=True) 

567 ModeResult(mode=array([[3, 0, 6, 1]]), count=array([[4, 2, 2, 1]])) 

568 

569 To get mode of whole array, specify ``axis=None``: 

570 

571 >>> stats.mode(a, axis=None, keepdims=True) 

572 ModeResult(mode=[3], count=[5]) 

573 >>> stats.mode(a, axis=None, keepdims=False) 

574 ModeResult(mode=3, count=5) 

575 

576 """ # noqa: E501 

577 

578 if keepdims is None: 

579 message = ("Unlike other reduction functions (e.g. `skew`, " 

580 "`kurtosis`), the default behavior of `mode` typically " 

581 "preserves the axis it acts along. In SciPy 1.11.0, " 

582 "this behavior will change: the default value of " 

583 "`keepdims` will become False, the `axis` over which " 

584 "the statistic is taken will be eliminated, and the value " 

585 "None will no longer be accepted. " 

586 "Set `keepdims` to True or False to avoid this warning.") 

587 warnings.warn(message, FutureWarning, stacklevel=2) 

588 

589 a = np.asarray(a) 

590 if a.size == 0: 

591 if keepdims is None: 

592 return ModeResult(np.array([]), np.array([])) 

593 else: 

594 # this is tricky to get right; let np.mean do it 

595 out = np.mean(a, axis=axis, keepdims=keepdims) 

596 return ModeResult(out, out.copy()) 

597 

598 a, axis = _chk_asarray(a, axis) 

599 

600 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

601 

602 if contains_nan and nan_policy == 'omit': 

603 a = ma.masked_invalid(a) 

604 return mstats_basic._mode(a, axis, keepdims=keepdims) 

605 

606 if not np.issubdtype(a.dtype, np.number): 

607 warnings.warn("Support for non-numeric arrays has been deprecated " 

608 "as of SciPy 1.9.0 and will be removed in " 

609 "1.11.0. `pandas.DataFrame.mode` can be used instead, " 

610 "see https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.mode.html.", # noqa: E501 

611 DeprecationWarning, stacklevel=2) 

612 

613 if a.dtype == object: 

614 def _mode1D(a): 

615 cntr = Counter(a) 

616 mode = max(cntr, key=lambda x: cntr[x]) 

617 return mode, cntr[mode] 

618 else: 

619 def _mode1D(a): 

620 vals, cnts = np.unique(a, return_counts=True) 

621 return vals[cnts.argmax()], cnts.max() 

622 

623 # np.apply_along_axis will convert the _mode1D tuples to a numpy array, 

624 # casting types in the process. 

625 # This recreates the results without that issue 

626 # View of a, rotated so the requested axis is last 

627 a_view = np.moveaxis(a, axis, -1) 

628 

629 inds = np.ndindex(a_view.shape[:-1]) 

630 modes = np.empty(a_view.shape[:-1], dtype=a.dtype) 

631 counts = np.empty(a_view.shape[:-1], dtype=np.int_) 

632 for ind in inds: 

633 modes[ind], counts[ind] = _mode1D(a_view[ind]) 

634 

635 if keepdims is None or keepdims: 

636 newshape = list(a.shape) 

637 newshape[axis] = 1 

638 return ModeResult(modes.reshape(newshape), counts.reshape(newshape)) 

639 else: 

640 return ModeResult(modes[()], counts[()]) 

641 

642 

643def _mask_to_limits(a, limits, inclusive): 

644 """Mask an array for values outside of given limits. 

645 

646 This is primarily a utility function. 

647 

648 Parameters 

649 ---------- 

650 a : array 

651 limits : (float or None, float or None) 

652 A tuple consisting of the (lower limit, upper limit). Values in the 

653 input array less than the lower limit or greater than the upper limit 

654 will be masked out. None implies no limit. 

655 inclusive : (bool, bool) 

656 A tuple consisting of the (lower flag, upper flag). These flags 

657 determine whether values exactly equal to lower or upper are allowed. 

658 

659 Returns 

660 ------- 

661 A MaskedArray. 

662 

663 Raises 

664 ------ 

665 A ValueError if there are no values within the given limits. 

666 

667 """ 

668 lower_limit, upper_limit = limits 

669 lower_include, upper_include = inclusive 

670 am = ma.MaskedArray(a) 

671 if lower_limit is not None: 

672 if lower_include: 

673 am = ma.masked_less(am, lower_limit) 

674 else: 

675 am = ma.masked_less_equal(am, lower_limit) 

676 

677 if upper_limit is not None: 

678 if upper_include: 

679 am = ma.masked_greater(am, upper_limit) 

680 else: 

681 am = ma.masked_greater_equal(am, upper_limit) 

682 

683 if am.count() == 0: 

684 raise ValueError("No array values within given limits") 

685 

686 return am 

687 

688 

689def tmean(a, limits=None, inclusive=(True, True), axis=None): 

690 """Compute the trimmed mean. 

691 

692 This function finds the arithmetic mean of given values, ignoring values 

693 outside the given `limits`. 

694 

695 Parameters 

696 ---------- 

697 a : array_like 

698 Array of values. 

699 limits : None or (lower limit, upper limit), optional 

700 Values in the input array less than the lower limit or greater than the 

701 upper limit will be ignored. When limits is None (default), then all 

702 values are used. Either of the limit values in the tuple can also be 

703 None representing a half-open interval. 

704 inclusive : (bool, bool), optional 

705 A tuple consisting of the (lower flag, upper flag). These flags 

706 determine whether values exactly equal to the lower or upper limits 

707 are included. The default value is (True, True). 

708 axis : int or None, optional 

709 Axis along which to compute test. Default is None. 

710 

711 Returns 

712 ------- 

713 tmean : ndarray 

714 Trimmed mean. 

715 

716 See Also 

717 -------- 

718 trim_mean : Returns mean after trimming a proportion from both tails. 

719 

720 Examples 

721 -------- 

722 >>> import numpy as np 

723 >>> from scipy import stats 

724 >>> x = np.arange(20) 

725 >>> stats.tmean(x) 

726 9.5 

727 >>> stats.tmean(x, (3,17)) 

728 10.0 

729 

730 """ 

731 a = asarray(a) 

732 if limits is None: 

733 return np.mean(a, axis) 

734 am = _mask_to_limits(a, limits, inclusive) 

735 mean = np.ma.filled(am.mean(axis=axis), fill_value=np.nan) 

736 return mean if mean.ndim > 0 else mean.item() 

737 

738 

739def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1): 

740 """Compute the trimmed variance. 

741 

742 This function computes the sample variance of an array of values, 

743 while ignoring values which are outside of given `limits`. 

744 

745 Parameters 

746 ---------- 

747 a : array_like 

748 Array of values. 

749 limits : None or (lower limit, upper limit), optional 

750 Values in the input array less than the lower limit or greater than the 

751 upper limit will be ignored. When limits is None, then all values are 

752 used. Either of the limit values in the tuple can also be None 

753 representing a half-open interval. The default value is None. 

754 inclusive : (bool, bool), optional 

755 A tuple consisting of the (lower flag, upper flag). These flags 

756 determine whether values exactly equal to the lower or upper limits 

757 are included. The default value is (True, True). 

758 axis : int or None, optional 

759 Axis along which to operate. Default is 0. If None, compute over the 

760 whole array `a`. 

761 ddof : int, optional 

762 Delta degrees of freedom. Default is 1. 

763 

764 Returns 

765 ------- 

766 tvar : float 

767 Trimmed variance. 

768 

769 Notes 

770 ----- 

771 `tvar` computes the unbiased sample variance, i.e. it uses a correction 

772 factor ``n / (n - 1)``. 

773 

774 Examples 

775 -------- 

776 >>> import numpy as np 

777 >>> from scipy import stats 

778 >>> x = np.arange(20) 

779 >>> stats.tvar(x) 

780 35.0 

781 >>> stats.tvar(x, (3,17)) 

782 20.0 

783 

784 """ 

785 a = asarray(a) 

786 a = a.astype(float) 

787 if limits is None: 

788 return a.var(ddof=ddof, axis=axis) 

789 am = _mask_to_limits(a, limits, inclusive) 

790 amnan = am.filled(fill_value=np.nan) 

791 return np.nanvar(amnan, ddof=ddof, axis=axis) 

792 

793 

794def tmin(a, lowerlimit=None, axis=0, inclusive=True, nan_policy='propagate'): 

795 """Compute the trimmed minimum. 

796 

797 This function finds the miminum value of an array `a` along the 

798 specified axis, but only considering values greater than a specified 

799 lower limit. 

800 

801 Parameters 

802 ---------- 

803 a : array_like 

804 Array of values. 

805 lowerlimit : None or float, optional 

806 Values in the input array less than the given limit will be ignored. 

807 When lowerlimit is None, then all values are used. The default value 

808 is None. 

809 axis : int or None, optional 

810 Axis along which to operate. Default is 0. If None, compute over the 

811 whole array `a`. 

812 inclusive : {True, False}, optional 

813 This flag determines whether values exactly equal to the lower limit 

814 are included. The default value is True. 

815 nan_policy : {'propagate', 'raise', 'omit'}, optional 

816 Defines how to handle when input contains nan. 

817 The following options are available (default is 'propagate'): 

818 

819 * 'propagate': returns nan 

820 * 'raise': throws an error 

821 * 'omit': performs the calculations ignoring nan values 

822 

823 Returns 

824 ------- 

825 tmin : float, int or ndarray 

826 Trimmed minimum. 

827 

828 Examples 

829 -------- 

830 >>> import numpy as np 

831 >>> from scipy import stats 

832 >>> x = np.arange(20) 

833 >>> stats.tmin(x) 

834 0 

835 

836 >>> stats.tmin(x, 13) 

837 13 

838 

839 >>> stats.tmin(x, 13, inclusive=False) 

840 14 

841 

842 """ 

843 a, axis = _chk_asarray(a, axis) 

844 am = _mask_to_limits(a, (lowerlimit, None), (inclusive, False)) 

845 

846 contains_nan, nan_policy = _contains_nan(am, nan_policy) 

847 

848 if contains_nan and nan_policy == 'omit': 

849 am = ma.masked_invalid(am) 

850 

851 res = ma.minimum.reduce(am, axis).data 

852 if res.ndim == 0: 

853 return res[()] 

854 return res 

855 

856 

857def tmax(a, upperlimit=None, axis=0, inclusive=True, nan_policy='propagate'): 

858 """Compute the trimmed maximum. 

859 

860 This function computes the maximum value of an array along a given axis, 

861 while ignoring values larger than a specified upper limit. 

862 

863 Parameters 

864 ---------- 

865 a : array_like 

866 Array of values. 

867 upperlimit : None or float, optional 

868 Values in the input array greater than the given limit will be ignored. 

869 When upperlimit is None, then all values are used. The default value 

870 is None. 

871 axis : int or None, optional 

872 Axis along which to operate. Default is 0. If None, compute over the 

873 whole array `a`. 

874 inclusive : {True, False}, optional 

875 This flag determines whether values exactly equal to the upper limit 

876 are included. The default value is True. 

877 nan_policy : {'propagate', 'raise', 'omit'}, optional 

878 Defines how to handle when input contains nan. 

879 The following options are available (default is 'propagate'): 

880 

881 * 'propagate': returns nan 

882 * 'raise': throws an error 

883 * 'omit': performs the calculations ignoring nan values 

884 

885 Returns 

886 ------- 

887 tmax : float, int or ndarray 

888 Trimmed maximum. 

889 

890 Examples 

891 -------- 

892 >>> import numpy as np 

893 >>> from scipy import stats 

894 >>> x = np.arange(20) 

895 >>> stats.tmax(x) 

896 19 

897 

898 >>> stats.tmax(x, 13) 

899 13 

900 

901 >>> stats.tmax(x, 13, inclusive=False) 

902 12 

903 

904 """ 

905 a, axis = _chk_asarray(a, axis) 

906 am = _mask_to_limits(a, (None, upperlimit), (False, inclusive)) 

907 

908 contains_nan, nan_policy = _contains_nan(am, nan_policy) 

909 

910 if contains_nan and nan_policy == 'omit': 

911 am = ma.masked_invalid(am) 

912 

913 res = ma.maximum.reduce(am, axis).data 

914 if res.ndim == 0: 

915 return res[()] 

916 return res 

917 

918 

919def tstd(a, limits=None, inclusive=(True, True), axis=0, ddof=1): 

920 """Compute the trimmed sample standard deviation. 

921 

922 This function finds the sample standard deviation of given values, 

923 ignoring values outside the given `limits`. 

924 

925 Parameters 

926 ---------- 

927 a : array_like 

928 Array of values. 

929 limits : None or (lower limit, upper limit), optional 

930 Values in the input array less than the lower limit or greater than the 

931 upper limit will be ignored. When limits is None, then all values are 

932 used. Either of the limit values in the tuple can also be None 

933 representing a half-open interval. The default value is None. 

934 inclusive : (bool, bool), optional 

935 A tuple consisting of the (lower flag, upper flag). These flags 

936 determine whether values exactly equal to the lower or upper limits 

937 are included. The default value is (True, True). 

938 axis : int or None, optional 

939 Axis along which to operate. Default is 0. If None, compute over the 

940 whole array `a`. 

941 ddof : int, optional 

942 Delta degrees of freedom. Default is 1. 

943 

944 Returns 

945 ------- 

946 tstd : float 

947 Trimmed sample standard deviation. 

948 

949 Notes 

950 ----- 

951 `tstd` computes the unbiased sample standard deviation, i.e. it uses a 

952 correction factor ``n / (n - 1)``. 

953 

954 Examples 

955 -------- 

956 >>> import numpy as np 

957 >>> from scipy import stats 

958 >>> x = np.arange(20) 

959 >>> stats.tstd(x) 

960 5.9160797830996161 

961 >>> stats.tstd(x, (3,17)) 

962 4.4721359549995796 

963 

964 """ 

965 return np.sqrt(tvar(a, limits, inclusive, axis, ddof)) 

966 

967 

968def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1): 

969 """Compute the trimmed standard error of the mean. 

970 

971 This function finds the standard error of the mean for given 

972 values, ignoring values outside the given `limits`. 

973 

974 Parameters 

975 ---------- 

976 a : array_like 

977 Array of values. 

978 limits : None or (lower limit, upper limit), optional 

979 Values in the input array less than the lower limit or greater than the 

980 upper limit will be ignored. When limits is None, then all values are 

981 used. Either of the limit values in the tuple can also be None 

982 representing a half-open interval. The default value is None. 

983 inclusive : (bool, bool), optional 

984 A tuple consisting of the (lower flag, upper flag). These flags 

985 determine whether values exactly equal to the lower or upper limits 

986 are included. The default value is (True, True). 

987 axis : int or None, optional 

988 Axis along which to operate. Default is 0. If None, compute over the 

989 whole array `a`. 

990 ddof : int, optional 

991 Delta degrees of freedom. Default is 1. 

992 

993 Returns 

994 ------- 

995 tsem : float 

996 Trimmed standard error of the mean. 

997 

998 Notes 

999 ----- 

1000 `tsem` uses unbiased sample standard deviation, i.e. it uses a 

1001 correction factor ``n / (n - 1)``. 

1002 

1003 Examples 

1004 -------- 

1005 >>> import numpy as np 

1006 >>> from scipy import stats 

1007 >>> x = np.arange(20) 

1008 >>> stats.tsem(x) 

1009 1.3228756555322954 

1010 >>> stats.tsem(x, (3,17)) 

1011 1.1547005383792515 

1012 

1013 """ 

1014 a = np.asarray(a).ravel() 

1015 if limits is None: 

1016 return a.std(ddof=ddof) / np.sqrt(a.size) 

1017 

1018 am = _mask_to_limits(a, limits, inclusive) 

1019 sd = np.sqrt(np.ma.var(am, ddof=ddof, axis=axis)) 

1020 return sd / np.sqrt(am.count()) 

1021 

1022 

1023##################################### 

1024# MOMENTS # 

1025##################################### 

1026 

1027 

1028def _moment_outputs(kwds): 

1029 moment = np.atleast_1d(kwds.get('moment', 1)) 

1030 if moment.size == 0: 

1031 raise ValueError("'moment' must be a scalar or a non-empty 1D " 

1032 "list/array.") 

1033 return len(moment) 

1034 

1035 

1036def _moment_result_object(*args): 

1037 if len(args) == 1: 

1038 return args[0] 

1039 return np.asarray(args) 

1040 

1041# `moment` fits into the `_axis_nan_policy` pattern, but it is a bit unusual 

1042# because the number of outputs is variable. Specifically, 

1043# `result_to_tuple=lambda x: (x,)` may be surprising for a function that 

1044# can produce more than one output, but it is intended here. 

1045# When `moment is called to produce the output: 

1046# - `result_to_tuple` packs the returned array into a single-element tuple, 

1047# - `_moment_result_object` extracts and returns that single element. 

1048# However, when the input array is empty, `moment` is never called. Instead, 

1049# - `_check_empty_inputs` is used to produce an empty array with the 

1050# appropriate dimensions. 

1051# - A list comprehension creates the appropriate number of copies of this 

1052# array, depending on `n_outputs`. 

1053# - This list - which may have multiple elements - is passed into 

1054# `_moment_result_object`. 

1055# - If there is a single output, `_moment_result_object` extracts and returns 

1056# the single output from the list. 

1057# - If there are multiple outputs, and therefore multiple elements in the list, 

1058# `_moment_result_object` converts the list of arrays to a single array and 

1059# returns it. 

1060# Currently this leads to a slight inconsistency: when the input array is 

1061# empty, there is no distinction between the `moment` function being called 

1062# with parameter `moments=1` and `moments=[1]`; the latter *should* produce 

1063# the same as the former but with a singleton zeroth dimension. 

1064@_axis_nan_policy_factory( # noqa: E302 

1065 _moment_result_object, n_samples=1, result_to_tuple=lambda x: (x,), 

1066 n_outputs=_moment_outputs 

1067) 

1068def moment(a, moment=1, axis=0, nan_policy='propagate'): 

1069 r"""Calculate the nth moment about the mean for a sample. 

1070 

1071 A moment is a specific quantitative measure of the shape of a set of 

1072 points. It is often used to calculate coefficients of skewness and kurtosis 

1073 due to its close relationship with them. 

1074 

1075 Parameters 

1076 ---------- 

1077 a : array_like 

1078 Input array. 

1079 moment : int or array_like of ints, optional 

1080 Order of central moment that is returned. Default is 1. 

1081 axis : int or None, optional 

1082 Axis along which the central moment is computed. Default is 0. 

1083 If None, compute over the whole array `a`. 

1084 nan_policy : {'propagate', 'raise', 'omit'}, optional 

1085 Defines how to handle when input contains nan. 

1086 The following options are available (default is 'propagate'): 

1087 

1088 * 'propagate': returns nan 

1089 * 'raise': throws an error 

1090 * 'omit': performs the calculations ignoring nan values 

1091 

1092 Returns 

1093 ------- 

1094 n-th central moment : ndarray or float 

1095 The appropriate moment along the given axis or over all values if axis 

1096 is None. The denominator for the moment calculation is the number of 

1097 observations, no degrees of freedom correction is done. 

1098 

1099 See Also 

1100 -------- 

1101 kurtosis, skew, describe 

1102 

1103 Notes 

1104 ----- 

1105 The k-th central moment of a data sample is: 

1106 

1107 .. math:: 

1108 

1109 m_k = \frac{1}{n} \sum_{i = 1}^n (x_i - \bar{x})^k 

1110 

1111 Where n is the number of samples and x-bar is the mean. This function uses 

1112 exponentiation by squares [1]_ for efficiency. 

1113 

1114 Note that, if `a` is an empty array (``a.size == 0``), array `moment` with 

1115 one element (`moment.size == 1`) is treated the same as scalar `moment` 

1116 (``np.isscalar(moment)``). This might produce arrays of unexpected shape. 

1117 

1118 References 

1119 ---------- 

1120 .. [1] https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms 

1121 

1122 Examples 

1123 -------- 

1124 >>> from scipy.stats import moment 

1125 >>> moment([1, 2, 3, 4, 5], moment=1) 

1126 0.0 

1127 >>> moment([1, 2, 3, 4, 5], moment=2) 

1128 2.0 

1129 

1130 """ 

1131 a, axis = _chk_asarray(a, axis) 

1132 

1133 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

1134 

1135 if contains_nan and nan_policy == 'omit': 

1136 a = ma.masked_invalid(a) 

1137 return mstats_basic.moment(a, moment, axis) 

1138 

1139 # for array_like moment input, return a value for each. 

1140 if not np.isscalar(moment): 

1141 mean = a.mean(axis, keepdims=True) 

1142 mmnt = [_moment(a, i, axis, mean=mean) for i in moment] 

1143 return np.array(mmnt) 

1144 else: 

1145 return _moment(a, moment, axis) 

1146 

1147 

1148# Moment with optional pre-computed mean, equal to a.mean(axis, keepdims=True) 

1149def _moment(a, moment, axis, *, mean=None): 

1150 if np.abs(moment - np.round(moment)) > 0: 

1151 raise ValueError("All moment parameters must be integers") 

1152 

1153 # moment of empty array is the same regardless of order 

1154 if a.size == 0: 

1155 return np.mean(a, axis=axis) 

1156 

1157 if moment == 0 or moment == 1: 

1158 # By definition the zeroth moment about the mean is 1, and the first 

1159 # moment is 0. 

1160 shape = list(a.shape) 

1161 del shape[axis] 

1162 dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64 

1163 

1164 if len(shape) == 0: 

1165 return dtype(1.0 if moment == 0 else 0.0) 

1166 else: 

1167 return (np.ones(shape, dtype=dtype) if moment == 0 

1168 else np.zeros(shape, dtype=dtype)) 

1169 else: 

1170 # Exponentiation by squares: form exponent sequence 

1171 n_list = [moment] 

1172 current_n = moment 

1173 while current_n > 2: 

1174 if current_n % 2: 

1175 current_n = (current_n - 1) / 2 

1176 else: 

1177 current_n /= 2 

1178 n_list.append(current_n) 

1179 

1180 # Starting point for exponentiation by squares 

1181 mean = a.mean(axis, keepdims=True) if mean is None else mean 

1182 a_zero_mean = a - mean 

1183 

1184 eps = np.finfo(a_zero_mean.dtype).resolution * 10 

1185 with np.errstate(divide='ignore', invalid='ignore'): 

1186 rel_diff = np.max(np.abs(a_zero_mean), axis=axis, 

1187 keepdims=True) / np.abs(mean) 

1188 with np.errstate(invalid='ignore'): 

1189 precision_loss = np.any(rel_diff < eps) 

1190 if precision_loss: 

1191 message = ("Precision loss occurred in moment calculation due to " 

1192 "catastrophic cancellation. This occurs when the data " 

1193 "are nearly identical. Results may be unreliable.") 

1194 warnings.warn(message, RuntimeWarning, stacklevel=4) 

1195 

1196 if n_list[-1] == 1: 

1197 s = a_zero_mean.copy() 

1198 else: 

1199 s = a_zero_mean**2 

1200 

1201 # Perform multiplications 

1202 for n in n_list[-2::-1]: 

1203 s = s**2 

1204 if n % 2: 

1205 s *= a_zero_mean 

1206 return np.mean(s, axis) 

1207 

1208 

1209def _var(x, axis=0, ddof=0, mean=None): 

1210 # Calculate variance of sample, warning if precision is lost 

1211 var = _moment(x, 2, axis, mean=mean) 

1212 if ddof != 0: 

1213 n = x.shape[axis] if axis is not None else x.size 

1214 var *= np.divide(n, n-ddof) # to avoid error on division by zero 

1215 return var 

1216 

1217 

1218@_axis_nan_policy_factory( 

1219 lambda x: x, result_to_tuple=lambda x: (x,), n_outputs=1 

1220) 

1221def skew(a, axis=0, bias=True, nan_policy='propagate'): 

1222 r"""Compute the sample skewness of a data set. 

1223 

1224 For normally distributed data, the skewness should be about zero. For 

1225 unimodal continuous distributions, a skewness value greater than zero means 

1226 that there is more weight in the right tail of the distribution. The 

1227 function `skewtest` can be used to determine if the skewness value 

1228 is close enough to zero, statistically speaking. 

1229 

1230 Parameters 

1231 ---------- 

1232 a : ndarray 

1233 Input array. 

1234 axis : int or None, optional 

1235 Axis along which skewness is calculated. Default is 0. 

1236 If None, compute over the whole array `a`. 

1237 bias : bool, optional 

1238 If False, then the calculations are corrected for statistical bias. 

1239 nan_policy : {'propagate', 'raise', 'omit'}, optional 

1240 Defines how to handle when input contains nan. 

1241 The following options are available (default is 'propagate'): 

1242 

1243 * 'propagate': returns nan 

1244 * 'raise': throws an error 

1245 * 'omit': performs the calculations ignoring nan values 

1246 

1247 Returns 

1248 ------- 

1249 skewness : ndarray 

1250 The skewness of values along an axis, returning NaN where all values 

1251 are equal. 

1252 

1253 Notes 

1254 ----- 

1255 The sample skewness is computed as the Fisher-Pearson coefficient 

1256 of skewness, i.e. 

1257 

1258 .. math:: 

1259 

1260 g_1=\frac{m_3}{m_2^{3/2}} 

1261 

1262 where 

1263 

1264 .. math:: 

1265 

1266 m_i=\frac{1}{N}\sum_{n=1}^N(x[n]-\bar{x})^i 

1267 

1268 is the biased sample :math:`i\texttt{th}` central moment, and 

1269 :math:`\bar{x}` is 

1270 the sample mean. If ``bias`` is False, the calculations are 

1271 corrected for bias and the value computed is the adjusted 

1272 Fisher-Pearson standardized moment coefficient, i.e. 

1273 

1274 .. math:: 

1275 

1276 G_1=\frac{k_3}{k_2^{3/2}}= 

1277 \frac{\sqrt{N(N-1)}}{N-2}\frac{m_3}{m_2^{3/2}}. 

1278 

1279 References 

1280 ---------- 

1281 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard 

1282 Probability and Statistics Tables and Formulae. Chapman & Hall: New 

1283 York. 2000. 

1284 Section 2.2.24.1 

1285 

1286 Examples 

1287 -------- 

1288 >>> from scipy.stats import skew 

1289 >>> skew([1, 2, 3, 4, 5]) 

1290 0.0 

1291 >>> skew([2, 8, 0, 4, 1, 9, 9, 0]) 

1292 0.2650554122698573 

1293 

1294 """ 

1295 a, axis = _chk_asarray(a, axis) 

1296 n = a.shape[axis] 

1297 

1298 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

1299 

1300 if contains_nan and nan_policy == 'omit': 

1301 a = ma.masked_invalid(a) 

1302 return mstats_basic.skew(a, axis, bias) 

1303 

1304 mean = a.mean(axis, keepdims=True) 

1305 m2 = _moment(a, 2, axis, mean=mean) 

1306 m3 = _moment(a, 3, axis, mean=mean) 

1307 with np.errstate(all='ignore'): 

1308 zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2) 

1309 vals = np.where(zero, np.nan, m3 / m2**1.5) 

1310 if not bias: 

1311 can_correct = ~zero & (n > 2) 

1312 if can_correct.any(): 

1313 m2 = np.extract(can_correct, m2) 

1314 m3 = np.extract(can_correct, m3) 

1315 nval = np.sqrt((n - 1.0) * n) / (n - 2.0) * m3 / m2**1.5 

1316 np.place(vals, can_correct, nval) 

1317 

1318 if vals.ndim == 0: 

1319 return vals.item() 

1320 

1321 return vals 

1322 

1323 

1324@_axis_nan_policy_factory( 

1325 lambda x: x, result_to_tuple=lambda x: (x,), n_outputs=1 

1326) 

1327def kurtosis(a, axis=0, fisher=True, bias=True, nan_policy='propagate'): 

1328 """Compute the kurtosis (Fisher or Pearson) of a dataset. 

1329 

1330 Kurtosis is the fourth central moment divided by the square of the 

1331 variance. If Fisher's definition is used, then 3.0 is subtracted from 

1332 the result to give 0.0 for a normal distribution. 

1333 

1334 If bias is False then the kurtosis is calculated using k statistics to 

1335 eliminate bias coming from biased moment estimators 

1336 

1337 Use `kurtosistest` to see if result is close enough to normal. 

1338 

1339 Parameters 

1340 ---------- 

1341 a : array 

1342 Data for which the kurtosis is calculated. 

1343 axis : int or None, optional 

1344 Axis along which the kurtosis is calculated. Default is 0. 

1345 If None, compute over the whole array `a`. 

1346 fisher : bool, optional 

1347 If True, Fisher's definition is used (normal ==> 0.0). If False, 

1348 Pearson's definition is used (normal ==> 3.0). 

1349 bias : bool, optional 

1350 If False, then the calculations are corrected for statistical bias. 

1351 nan_policy : {'propagate', 'raise', 'omit'}, optional 

1352 Defines how to handle when input contains nan. 'propagate' returns nan, 

1353 'raise' throws an error, 'omit' performs the calculations ignoring nan 

1354 values. Default is 'propagate'. 

1355 

1356 Returns 

1357 ------- 

1358 kurtosis : array 

1359 The kurtosis of values along an axis, returning NaN where all values 

1360 are equal. 

1361 

1362 References 

1363 ---------- 

1364 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard 

1365 Probability and Statistics Tables and Formulae. Chapman & Hall: New 

1366 York. 2000. 

1367 

1368 Examples 

1369 -------- 

1370 In Fisher's definiton, the kurtosis of the normal distribution is zero. 

1371 In the following example, the kurtosis is close to zero, because it was 

1372 calculated from the dataset, not from the continuous distribution. 

1373 

1374 >>> import numpy as np 

1375 >>> from scipy.stats import norm, kurtosis 

1376 >>> data = norm.rvs(size=1000, random_state=3) 

1377 >>> kurtosis(data) 

1378 -0.06928694200380558 

1379 

1380 The distribution with a higher kurtosis has a heavier tail. 

1381 The zero valued kurtosis of the normal distribution in Fisher's definition 

1382 can serve as a reference point. 

1383 

1384 >>> import matplotlib.pyplot as plt 

1385 >>> import scipy.stats as stats 

1386 >>> from scipy.stats import kurtosis 

1387 

1388 >>> x = np.linspace(-5, 5, 100) 

1389 >>> ax = plt.subplot() 

1390 >>> distnames = ['laplace', 'norm', 'uniform'] 

1391 

1392 >>> for distname in distnames: 

1393 ... if distname == 'uniform': 

1394 ... dist = getattr(stats, distname)(loc=-2, scale=4) 

1395 ... else: 

1396 ... dist = getattr(stats, distname) 

1397 ... data = dist.rvs(size=1000) 

1398 ... kur = kurtosis(data, fisher=True) 

1399 ... y = dist.pdf(x) 

1400 ... ax.plot(x, y, label="{}, {}".format(distname, round(kur, 3))) 

1401 ... ax.legend() 

1402 

1403 The Laplace distribution has a heavier tail than the normal distribution. 

1404 The uniform distribution (which has negative kurtosis) has the thinnest 

1405 tail. 

1406 

1407 """ 

1408 a, axis = _chk_asarray(a, axis) 

1409 

1410 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

1411 

1412 if contains_nan and nan_policy == 'omit': 

1413 a = ma.masked_invalid(a) 

1414 return mstats_basic.kurtosis(a, axis, fisher, bias) 

1415 

1416 n = a.shape[axis] 

1417 mean = a.mean(axis, keepdims=True) 

1418 m2 = _moment(a, 2, axis, mean=mean) 

1419 m4 = _moment(a, 4, axis, mean=mean) 

1420 with np.errstate(all='ignore'): 

1421 zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2) 

1422 vals = np.where(zero, np.nan, m4 / m2**2.0) 

1423 

1424 if not bias: 

1425 can_correct = ~zero & (n > 3) 

1426 if can_correct.any(): 

1427 m2 = np.extract(can_correct, m2) 

1428 m4 = np.extract(can_correct, m4) 

1429 nval = 1.0/(n-2)/(n-3) * ((n**2-1.0)*m4/m2**2.0 - 3*(n-1)**2.0) 

1430 np.place(vals, can_correct, nval + 3.0) 

1431 

1432 if vals.ndim == 0: 

1433 vals = vals.item() # array scalar 

1434 

1435 return vals - 3 if fisher else vals 

1436 

1437 

1438DescribeResult = namedtuple('DescribeResult', 

1439 ('nobs', 'minmax', 'mean', 'variance', 'skewness', 

1440 'kurtosis')) 

1441 

1442 

1443def describe(a, axis=0, ddof=1, bias=True, nan_policy='propagate'): 

1444 """Compute several descriptive statistics of the passed array. 

1445 

1446 Parameters 

1447 ---------- 

1448 a : array_like 

1449 Input data. 

1450 axis : int or None, optional 

1451 Axis along which statistics are calculated. Default is 0. 

1452 If None, compute over the whole array `a`. 

1453 ddof : int, optional 

1454 Delta degrees of freedom (only for variance). Default is 1. 

1455 bias : bool, optional 

1456 If False, then the skewness and kurtosis calculations are corrected 

1457 for statistical bias. 

1458 nan_policy : {'propagate', 'raise', 'omit'}, optional 

1459 Defines how to handle when input contains nan. 

1460 The following options are available (default is 'propagate'): 

1461 

1462 * 'propagate': returns nan 

1463 * 'raise': throws an error 

1464 * 'omit': performs the calculations ignoring nan values 

1465 

1466 Returns 

1467 ------- 

1468 nobs : int or ndarray of ints 

1469 Number of observations (length of data along `axis`). 

1470 When 'omit' is chosen as nan_policy, the length along each axis 

1471 slice is counted separately. 

1472 minmax: tuple of ndarrays or floats 

1473 Minimum and maximum value of `a` along the given axis. 

1474 mean : ndarray or float 

1475 Arithmetic mean of `a` along the given axis. 

1476 variance : ndarray or float 

1477 Unbiased variance of `a` along the given axis; denominator is number 

1478 of observations minus one. 

1479 skewness : ndarray or float 

1480 Skewness of `a` along the given axis, based on moment calculations 

1481 with denominator equal to the number of observations, i.e. no degrees 

1482 of freedom correction. 

1483 kurtosis : ndarray or float 

1484 Kurtosis (Fisher) of `a` along the given axis. The kurtosis is 

1485 normalized so that it is zero for the normal distribution. No 

1486 degrees of freedom are used. 

1487 

1488 See Also 

1489 -------- 

1490 skew, kurtosis 

1491 

1492 Examples 

1493 -------- 

1494 >>> import numpy as np 

1495 >>> from scipy import stats 

1496 >>> a = np.arange(10) 

1497 >>> stats.describe(a) 

1498 DescribeResult(nobs=10, minmax=(0, 9), mean=4.5, 

1499 variance=9.166666666666666, skewness=0.0, 

1500 kurtosis=-1.2242424242424244) 

1501 >>> b = [[1, 2], [3, 4]] 

1502 >>> stats.describe(b) 

1503 DescribeResult(nobs=2, minmax=(array([1, 2]), array([3, 4])), 

1504 mean=array([2., 3.]), variance=array([2., 2.]), 

1505 skewness=array([0., 0.]), kurtosis=array([-2., -2.])) 

1506 

1507 """ 

1508 a, axis = _chk_asarray(a, axis) 

1509 

1510 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

1511 

1512 if contains_nan and nan_policy == 'omit': 

1513 a = ma.masked_invalid(a) 

1514 return mstats_basic.describe(a, axis, ddof, bias) 

1515 

1516 if a.size == 0: 

1517 raise ValueError("The input must not be empty.") 

1518 n = a.shape[axis] 

1519 mm = (np.min(a, axis=axis), np.max(a, axis=axis)) 

1520 m = np.mean(a, axis=axis) 

1521 v = _var(a, axis=axis, ddof=ddof) 

1522 sk = skew(a, axis, bias=bias) 

1523 kurt = kurtosis(a, axis, bias=bias) 

1524 

1525 return DescribeResult(n, mm, m, v, sk, kurt) 

1526 

1527##################################### 

1528# NORMALITY TESTS # 

1529##################################### 

1530 

1531 

1532def _normtest_finish(z, alternative): 

1533 """Common code between all the normality-test functions.""" 

1534 if alternative == 'less': 

1535 prob = distributions.norm.cdf(z) 

1536 elif alternative == 'greater': 

1537 prob = distributions.norm.sf(z) 

1538 elif alternative == 'two-sided': 

1539 prob = 2 * distributions.norm.sf(np.abs(z)) 

1540 else: 

1541 raise ValueError("alternative must be " 

1542 "'less', 'greater' or 'two-sided'") 

1543 

1544 if z.ndim == 0: 

1545 z = z[()] 

1546 

1547 return z, prob 

1548 

1549 

1550SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue')) 

1551 

1552 

1553def skewtest(a, axis=0, nan_policy='propagate', alternative='two-sided'): 

1554 """Test whether the skew is different from the normal distribution. 

1555 

1556 This function tests the null hypothesis that the skewness of 

1557 the population that the sample was drawn from is the same 

1558 as that of a corresponding normal distribution. 

1559 

1560 Parameters 

1561 ---------- 

1562 a : array 

1563 The data to be tested. 

1564 axis : int or None, optional 

1565 Axis along which statistics are calculated. Default is 0. 

1566 If None, compute over the whole array `a`. 

1567 nan_policy : {'propagate', 'raise', 'omit'}, optional 

1568 Defines how to handle when input contains nan. 

1569 The following options are available (default is 'propagate'): 

1570 

1571 * 'propagate': returns nan 

1572 * 'raise': throws an error 

1573 * 'omit': performs the calculations ignoring nan values 

1574 

1575 alternative : {'two-sided', 'less', 'greater'}, optional 

1576 Defines the alternative hypothesis. Default is 'two-sided'. 

1577 The following options are available: 

1578 

1579 * 'two-sided': the skewness of the distribution underlying the sample 

1580 is different from that of the normal distribution (i.e. 0) 

1581 * 'less': the skewness of the distribution underlying the sample 

1582 is less than that of the normal distribution 

1583 * 'greater': the skewness of the distribution underlying the sample 

1584 is greater than that of the normal distribution 

1585 

1586 .. versionadded:: 1.7.0 

1587 

1588 Returns 

1589 ------- 

1590 statistic : float 

1591 The computed z-score for this test. 

1592 pvalue : float 

1593 The p-value for the hypothesis test. 

1594 

1595 Notes 

1596 ----- 

1597 The sample size must be at least 8. 

1598 

1599 References 

1600 ---------- 

1601 .. [1] R. B. D'Agostino, A. J. Belanger and R. B. D'Agostino Jr., 

1602 "A suggestion for using powerful and informative tests of 

1603 normality", American Statistician 44, pp. 316-321, 1990. 

1604 

1605 Examples 

1606 -------- 

1607 >>> from scipy.stats import skewtest 

1608 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8]) 

1609 SkewtestResult(statistic=1.0108048609177787, pvalue=0.3121098361421897) 

1610 >>> skewtest([2, 8, 0, 4, 1, 9, 9, 0]) 

1611 SkewtestResult(statistic=0.44626385374196975, pvalue=0.6554066631275459) 

1612 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8000]) 

1613 SkewtestResult(statistic=3.571773510360407, pvalue=0.0003545719905823133) 

1614 >>> skewtest([100, 100, 100, 100, 100, 100, 100, 101]) 

1615 SkewtestResult(statistic=3.5717766638478072, pvalue=0.000354567720281634) 

1616 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8], alternative='less') 

1617 SkewtestResult(statistic=1.0108048609177787, pvalue=0.8439450819289052) 

1618 >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8], alternative='greater') 

1619 SkewtestResult(statistic=1.0108048609177787, pvalue=0.15605491807109484) 

1620 

1621 """ 

1622 a, axis = _chk_asarray(a, axis) 

1623 

1624 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

1625 

1626 if contains_nan and nan_policy == 'omit': 

1627 a = ma.masked_invalid(a) 

1628 return mstats_basic.skewtest(a, axis, alternative) 

1629 

1630 if axis is None: 

1631 a = np.ravel(a) 

1632 axis = 0 

1633 b2 = skew(a, axis) 

1634 n = a.shape[axis] 

1635 if n < 8: 

1636 raise ValueError( 

1637 "skewtest is not valid with less than 8 samples; %i samples" 

1638 " were given." % int(n)) 

1639 y = b2 * math.sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2))) 

1640 beta2 = (3.0 * (n**2 + 27*n - 70) * (n+1) * (n+3) / 

1641 ((n-2.0) * (n+5) * (n+7) * (n+9))) 

1642 W2 = -1 + math.sqrt(2 * (beta2 - 1)) 

1643 delta = 1 / math.sqrt(0.5 * math.log(W2)) 

1644 alpha = math.sqrt(2.0 / (W2 - 1)) 

1645 y = np.where(y == 0, 1, y) 

1646 Z = delta * np.log(y / alpha + np.sqrt((y / alpha)**2 + 1)) 

1647 

1648 return SkewtestResult(*_normtest_finish(Z, alternative)) 

1649 

1650 

1651KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue')) 

1652 

1653 

1654def kurtosistest(a, axis=0, nan_policy='propagate', alternative='two-sided'): 

1655 """Test whether a dataset has normal kurtosis. 

1656 

1657 This function tests the null hypothesis that the kurtosis 

1658 of the population from which the sample was drawn is that 

1659 of the normal distribution. 

1660 

1661 Parameters 

1662 ---------- 

1663 a : array 

1664 Array of the sample data. 

1665 axis : int or None, optional 

1666 Axis along which to compute test. Default is 0. If None, 

1667 compute over the whole array `a`. 

1668 nan_policy : {'propagate', 'raise', 'omit'}, optional 

1669 Defines how to handle when input contains nan. 

1670 The following options are available (default is 'propagate'): 

1671 

1672 * 'propagate': returns nan 

1673 * 'raise': throws an error 

1674 * 'omit': performs the calculations ignoring nan values 

1675 

1676 alternative : {'two-sided', 'less', 'greater'}, optional 

1677 Defines the alternative hypothesis. 

1678 The following options are available (default is 'two-sided'): 

1679 

1680 * 'two-sided': the kurtosis of the distribution underlying the sample 

1681 is different from that of the normal distribution 

1682 * 'less': the kurtosis of the distribution underlying the sample 

1683 is less than that of the normal distribution 

1684 * 'greater': the kurtosis of the distribution underlying the sample 

1685 is greater than that of the normal distribution 

1686 

1687 .. versionadded:: 1.7.0 

1688 

1689 Returns 

1690 ------- 

1691 statistic : float 

1692 The computed z-score for this test. 

1693 pvalue : float 

1694 The p-value for the hypothesis test. 

1695 

1696 Notes 

1697 ----- 

1698 Valid only for n>20. This function uses the method described in [1]_. 

1699 

1700 References 

1701 ---------- 

1702 .. [1] see e.g. F. J. Anscombe, W. J. Glynn, "Distribution of the kurtosis 

1703 statistic b2 for normal samples", Biometrika, vol. 70, pp. 227-234, 1983. 

1704 

1705 Examples 

1706 -------- 

1707 >>> import numpy as np 

1708 >>> from scipy.stats import kurtosistest 

1709 >>> kurtosistest(list(range(20))) 

1710 KurtosistestResult(statistic=-1.7058104152122062, pvalue=0.08804338332528348) 

1711 >>> kurtosistest(list(range(20)), alternative='less') 

1712 KurtosistestResult(statistic=-1.7058104152122062, pvalue=0.04402169166264174) 

1713 >>> kurtosistest(list(range(20)), alternative='greater') 

1714 KurtosistestResult(statistic=-1.7058104152122062, pvalue=0.9559783083373583) 

1715 

1716 >>> rng = np.random.default_rng() 

1717 >>> s = rng.normal(0, 1, 1000) 

1718 >>> kurtosistest(s) 

1719 KurtosistestResult(statistic=-1.475047944490622, pvalue=0.14019965402996987) 

1720 

1721 """ 

1722 a, axis = _chk_asarray(a, axis) 

1723 

1724 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

1725 

1726 if contains_nan and nan_policy == 'omit': 

1727 a = ma.masked_invalid(a) 

1728 return mstats_basic.kurtosistest(a, axis, alternative) 

1729 

1730 n = a.shape[axis] 

1731 if n < 5: 

1732 raise ValueError( 

1733 "kurtosistest requires at least 5 observations; %i observations" 

1734 " were given." % int(n)) 

1735 if n < 20: 

1736 warnings.warn("kurtosistest only valid for n>=20 ... continuing " 

1737 "anyway, n=%i" % int(n)) 

1738 b2 = kurtosis(a, axis, fisher=False) 

1739 

1740 E = 3.0*(n-1) / (n+1) 

1741 varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5)) # [1]_ Eq. 1 

1742 x = (b2-E) / np.sqrt(varb2) # [1]_ Eq. 4 

1743 # [1]_ Eq. 2: 

1744 sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) / 

1745 (n*(n-2)*(n-3))) 

1746 # [1]_ Eq. 3: 

1747 A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2))) 

1748 term1 = 1 - 2/(9.0*A) 

1749 denom = 1 + x*np.sqrt(2/(A-4.0)) 

1750 term2 = np.sign(denom) * np.where(denom == 0.0, np.nan, 

1751 np.power((1-2.0/A)/np.abs(denom), 1/3.0)) 

1752 if np.any(denom == 0): 

1753 msg = "Test statistic not defined in some cases due to division by " \ 

1754 "zero. Return nan in that case..." 

1755 warnings.warn(msg, RuntimeWarning) 

1756 

1757 Z = (term1 - term2) / np.sqrt(2/(9.0*A)) # [1]_ Eq. 5 

1758 

1759 # zprob uses upper tail, so Z needs to be positive 

1760 return KurtosistestResult(*_normtest_finish(Z, alternative)) 

1761 

1762 

1763NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue')) 

1764 

1765 

1766def normaltest(a, axis=0, nan_policy='propagate'): 

1767 """Test whether a sample differs from a normal distribution. 

1768 

1769 This function tests the null hypothesis that a sample comes 

1770 from a normal distribution. It is based on D'Agostino and 

1771 Pearson's [1]_, [2]_ test that combines skew and kurtosis to 

1772 produce an omnibus test of normality. 

1773 

1774 Parameters 

1775 ---------- 

1776 a : array_like 

1777 The array containing the sample to be tested. 

1778 axis : int or None, optional 

1779 Axis along which to compute test. Default is 0. If None, 

1780 compute over the whole array `a`. 

1781 nan_policy : {'propagate', 'raise', 'omit'}, optional 

1782 Defines how to handle when input contains nan. 

1783 The following options are available (default is 'propagate'): 

1784 

1785 * 'propagate': returns nan 

1786 * 'raise': throws an error 

1787 * 'omit': performs the calculations ignoring nan values 

1788 

1789 Returns 

1790 ------- 

1791 statistic : float or array 

1792 ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and 

1793 ``k`` is the z-score returned by `kurtosistest`. 

1794 pvalue : float or array 

1795 A 2-sided chi squared probability for the hypothesis test. 

1796 

1797 References 

1798 ---------- 

1799 .. [1] D'Agostino, R. B. (1971), "An omnibus test of normality for 

1800 moderate and large sample size", Biometrika, 58, 341-348 

1801 

1802 .. [2] D'Agostino, R. and Pearson, E. S. (1973), "Tests for departure from 

1803 normality", Biometrika, 60, 613-622 

1804 

1805 Examples 

1806 -------- 

1807 >>> import numpy as np 

1808 >>> from scipy import stats 

1809 >>> rng = np.random.default_rng() 

1810 >>> pts = 1000 

1811 >>> a = rng.normal(0, 1, size=pts) 

1812 >>> b = rng.normal(2, 1, size=pts) 

1813 >>> x = np.concatenate((a, b)) 

1814 >>> k2, p = stats.normaltest(x) 

1815 >>> alpha = 1e-3 

1816 >>> print("p = {:g}".format(p)) 

1817 p = 8.4713e-19 

1818 >>> if p < alpha: # null hypothesis: x comes from a normal distribution 

1819 ... print("The null hypothesis can be rejected") 

1820 ... else: 

1821 ... print("The null hypothesis cannot be rejected") 

1822 The null hypothesis can be rejected 

1823 

1824 """ 

1825 a, axis = _chk_asarray(a, axis) 

1826 

1827 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

1828 

1829 if contains_nan and nan_policy == 'omit': 

1830 a = ma.masked_invalid(a) 

1831 return mstats_basic.normaltest(a, axis) 

1832 

1833 s, _ = skewtest(a, axis) 

1834 k, _ = kurtosistest(a, axis) 

1835 k2 = s*s + k*k 

1836 

1837 return NormaltestResult(k2, distributions.chi2.sf(k2, 2)) 

1838 

1839 

1840@_axis_nan_policy_factory(SignificanceResult, default_axis=None) 

1841def jarque_bera(x, *, axis=None): 

1842 """Perform the Jarque-Bera goodness of fit test on sample data. 

1843 

1844 The Jarque-Bera test tests whether the sample data has the skewness and 

1845 kurtosis matching a normal distribution. 

1846 

1847 Note that this test only works for a large enough number of data samples 

1848 (>2000) as the test statistic asymptotically has a Chi-squared distribution 

1849 with 2 degrees of freedom. 

1850 

1851 Parameters 

1852 ---------- 

1853 x : array_like 

1854 Observations of a random variable. 

1855 axis : int or None, default: 0 

1856 If an int, the axis of the input along which to compute the statistic. 

1857 The statistic of each axis-slice (e.g. row) of the input will appear in 

1858 a corresponding element of the output. 

1859 If ``None``, the input will be raveled before computing the statistic. 

1860 

1861 Returns 

1862 ------- 

1863 result : SignificanceResult 

1864 An object with the following attributes: 

1865 

1866 statistic : float 

1867 The test statistic. 

1868 pvalue : float 

1869 The p-value for the hypothesis test. 

1870 

1871 References 

1872 ---------- 

1873 .. [1] Jarque, C. and Bera, A. (1980) "Efficient tests for normality, 

1874 homoscedasticity and serial independence of regression residuals", 

1875 6 Econometric Letters 255-259. 

1876 

1877 Examples 

1878 -------- 

1879 >>> import numpy as np 

1880 >>> from scipy import stats 

1881 >>> rng = np.random.default_rng() 

1882 >>> x = rng.normal(0, 1, 100000) 

1883 >>> jarque_bera_test = stats.jarque_bera(x) 

1884 >>> jarque_bera_test 

1885 Jarque_beraResult(statistic=3.3415184718131554, pvalue=0.18810419594996775) 

1886 >>> jarque_bera_test.statistic 

1887 3.3415184718131554 

1888 >>> jarque_bera_test.pvalue 

1889 0.18810419594996775 

1890 

1891 """ 

1892 x = np.asarray(x) 

1893 if axis is None: 

1894 x = x.ravel() 

1895 axis = 0 

1896 

1897 n = x.shape[axis] 

1898 if n == 0: 

1899 raise ValueError('At least one observation is required.') 

1900 

1901 mu = x.mean(axis=axis, keepdims=True) 

1902 diffx = x - mu 

1903 s = skew(diffx, axis=axis, _no_deco=True) 

1904 k = kurtosis(diffx, axis=axis, _no_deco=True) 

1905 statistic = n / 6 * (s**2 + k**2 / 4) 

1906 pvalue = distributions.chi2.sf(statistic, df=2) 

1907 

1908 return SignificanceResult(statistic, pvalue) 

1909 

1910 

1911##################################### 

1912# FREQUENCY FUNCTIONS # 

1913##################################### 

1914 

1915 

1916def scoreatpercentile(a, per, limit=(), interpolation_method='fraction', 

1917 axis=None): 

1918 """Calculate the score at a given percentile of the input sequence. 

1919 

1920 For example, the score at `per=50` is the median. If the desired quantile 

1921 lies between two data points, we interpolate between them, according to 

1922 the value of `interpolation`. If the parameter `limit` is provided, it 

1923 should be a tuple (lower, upper) of two values. 

1924 

1925 Parameters 

1926 ---------- 

1927 a : array_like 

1928 A 1-D array of values from which to extract score. 

1929 per : array_like 

1930 Percentile(s) at which to extract score. Values should be in range 

1931 [0,100]. 

1932 limit : tuple, optional 

1933 Tuple of two scalars, the lower and upper limits within which to 

1934 compute the percentile. Values of `a` outside 

1935 this (closed) interval will be ignored. 

1936 interpolation_method : {'fraction', 'lower', 'higher'}, optional 

1937 Specifies the interpolation method to use, 

1938 when the desired quantile lies between two data points `i` and `j` 

1939 The following options are available (default is 'fraction'): 

1940 

1941 * 'fraction': ``i + (j - i) * fraction`` where ``fraction`` is the 

1942 fractional part of the index surrounded by ``i`` and ``j`` 

1943 * 'lower': ``i`` 

1944 * 'higher': ``j`` 

1945 

1946 axis : int, optional 

1947 Axis along which the percentiles are computed. Default is None. If 

1948 None, compute over the whole array `a`. 

1949 

1950 Returns 

1951 ------- 

1952 score : float or ndarray 

1953 Score at percentile(s). 

1954 

1955 See Also 

1956 -------- 

1957 percentileofscore, numpy.percentile 

1958 

1959 Notes 

1960 ----- 

1961 This function will become obsolete in the future. 

1962 For NumPy 1.9 and higher, `numpy.percentile` provides all the functionality 

1963 that `scoreatpercentile` provides. And it's significantly faster. 

1964 Therefore it's recommended to use `numpy.percentile` for users that have 

1965 numpy >= 1.9. 

1966 

1967 Examples 

1968 -------- 

1969 >>> import numpy as np 

1970 >>> from scipy import stats 

1971 >>> a = np.arange(100) 

1972 >>> stats.scoreatpercentile(a, 50) 

1973 49.5 

1974 

1975 """ 

1976 # adapted from NumPy's percentile function. When we require numpy >= 1.8, 

1977 # the implementation of this function can be replaced by np.percentile. 

1978 a = np.asarray(a) 

1979 if a.size == 0: 

1980 # empty array, return nan(s) with shape matching `per` 

1981 if np.isscalar(per): 

1982 return np.nan 

1983 else: 

1984 return np.full(np.asarray(per).shape, np.nan, dtype=np.float64) 

1985 

1986 if limit: 

1987 a = a[(limit[0] <= a) & (a <= limit[1])] 

1988 

1989 sorted_ = np.sort(a, axis=axis) 

1990 if axis is None: 

1991 axis = 0 

1992 

1993 return _compute_qth_percentile(sorted_, per, interpolation_method, axis) 

1994 

1995 

1996# handle sequence of per's without calling sort multiple times 

1997def _compute_qth_percentile(sorted_, per, interpolation_method, axis): 

1998 if not np.isscalar(per): 

1999 score = [_compute_qth_percentile(sorted_, i, 

2000 interpolation_method, axis) 

2001 for i in per] 

2002 return np.array(score) 

2003 

2004 if not (0 <= per <= 100): 

2005 raise ValueError("percentile must be in the range [0, 100]") 

2006 

2007 indexer = [slice(None)] * sorted_.ndim 

2008 idx = per / 100. * (sorted_.shape[axis] - 1) 

2009 

2010 if int(idx) != idx: 

2011 # round fractional indices according to interpolation method 

2012 if interpolation_method == 'lower': 

2013 idx = int(np.floor(idx)) 

2014 elif interpolation_method == 'higher': 

2015 idx = int(np.ceil(idx)) 

2016 elif interpolation_method == 'fraction': 

2017 pass # keep idx as fraction and interpolate 

2018 else: 

2019 raise ValueError("interpolation_method can only be 'fraction', " 

2020 "'lower' or 'higher'") 

2021 

2022 i = int(idx) 

2023 if i == idx: 

2024 indexer[axis] = slice(i, i + 1) 

2025 weights = array(1) 

2026 sumval = 1.0 

2027 else: 

2028 indexer[axis] = slice(i, i + 2) 

2029 j = i + 1 

2030 weights = array([(j - idx), (idx - i)], float) 

2031 wshape = [1] * sorted_.ndim 

2032 wshape[axis] = 2 

2033 weights.shape = wshape 

2034 sumval = weights.sum() 

2035 

2036 # Use np.add.reduce (== np.sum but a little faster) to coerce data type 

2037 return np.add.reduce(sorted_[tuple(indexer)] * weights, axis=axis) / sumval 

2038 

2039 

2040def percentileofscore(a, score, kind='rank', nan_policy='propagate'): 

2041 """Compute the percentile rank of a score relative to a list of scores. 

2042 

2043 A `percentileofscore` of, for example, 80% means that 80% of the 

2044 scores in `a` are below the given score. In the case of gaps or 

2045 ties, the exact definition depends on the optional keyword, `kind`. 

2046 

2047 Parameters 

2048 ---------- 

2049 a : array_like 

2050 Array to which `score` is compared. 

2051 score : array_like 

2052 Scores to compute percentiles for. 

2053 kind : {'rank', 'weak', 'strict', 'mean'}, optional 

2054 Specifies the interpretation of the resulting score. 

2055 The following options are available (default is 'rank'): 

2056 

2057 * 'rank': Average percentage ranking of score. In case of multiple 

2058 matches, average the percentage rankings of all matching scores. 

2059 * 'weak': This kind corresponds to the definition of a cumulative 

2060 distribution function. A percentileofscore of 80% means that 80% 

2061 of values are less than or equal to the provided score. 

2062 * 'strict': Similar to "weak", except that only values that are 

2063 strictly less than the given score are counted. 

2064 * 'mean': The average of the "weak" and "strict" scores, often used 

2065 in testing. See https://en.wikipedia.org/wiki/Percentile_rank 

2066 nan_policy : {'propagate', 'raise', 'omit'}, optional 

2067 Specifies how to treat `nan` values in `a`. 

2068 The following options are available (default is 'propagate'): 

2069 

2070 * 'propagate': returns nan (for each value in `score`). 

2071 * 'raise': throws an error 

2072 * 'omit': performs the calculations ignoring nan values 

2073 

2074 Returns 

2075 ------- 

2076 pcos : float 

2077 Percentile-position of score (0-100) relative to `a`. 

2078 

2079 See Also 

2080 -------- 

2081 numpy.percentile 

2082 scipy.stats.scoreatpercentile, scipy.stats.rankdata 

2083 

2084 Examples 

2085 -------- 

2086 Three-quarters of the given values lie below a given score: 

2087 

2088 >>> import numpy as np 

2089 >>> from scipy import stats 

2090 >>> stats.percentileofscore([1, 2, 3, 4], 3) 

2091 75.0 

2092 

2093 With multiple matches, note how the scores of the two matches, 0.6 

2094 and 0.8 respectively, are averaged: 

2095 

2096 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3) 

2097 70.0 

2098 

2099 Only 2/5 values are strictly less than 3: 

2100 

2101 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='strict') 

2102 40.0 

2103 

2104 But 4/5 values are less than or equal to 3: 

2105 

2106 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='weak') 

2107 80.0 

2108 

2109 The average between the weak and the strict scores is: 

2110 

2111 >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='mean') 

2112 60.0 

2113 

2114 Score arrays (of any dimensionality) are supported: 

2115 

2116 >>> stats.percentileofscore([1, 2, 3, 3, 4], [2, 3]) 

2117 array([40., 70.]) 

2118 

2119 The inputs can be infinite: 

2120 

2121 >>> stats.percentileofscore([-np.inf, 0, 1, np.inf], [1, 2, np.inf]) 

2122 array([75., 75., 100.]) 

2123 

2124 If `a` is empty, then the resulting percentiles are all `nan`: 

2125 

2126 >>> stats.percentileofscore([], [1, 2]) 

2127 array([nan, nan]) 

2128 """ 

2129 

2130 a = np.asarray(a) 

2131 n = len(a) 

2132 score = np.asarray(score) 

2133 

2134 # Nan treatment 

2135 cna, npa = _contains_nan(a, nan_policy, use_summation=False) 

2136 cns, nps = _contains_nan(score, nan_policy, use_summation=False) 

2137 

2138 if (cna or cns) and nan_policy == 'raise': 

2139 raise ValueError("The input contains nan values") 

2140 

2141 if cns: 

2142 # If a score is nan, then the output should be nan 

2143 # (also if nan_policy is "omit", because it only applies to `a`) 

2144 score = ma.masked_where(np.isnan(score), score) 

2145 

2146 if cna: 

2147 if nan_policy == "omit": 

2148 # Don't count nans 

2149 a = ma.masked_where(np.isnan(a), a) 

2150 n = a.count() 

2151 

2152 if nan_policy == "propagate": 

2153 # All outputs should be nans 

2154 n = 0 

2155 

2156 # Cannot compare to empty list ==> nan 

2157 if n == 0: 

2158 perct = np.full_like(score, np.nan, dtype=np.float64) 

2159 

2160 else: 

2161 # Prepare broadcasting 

2162 score = score[..., None] 

2163 

2164 def count(x): 

2165 return np.count_nonzero(x, -1) 

2166 

2167 # Despite using masked_array to omit nan values from processing, 

2168 # the CI tests on "Azure pipelines" (but not on the other CI servers) 

2169 # emits warnings when there are nan values, contrarily to the purpose 

2170 # of masked_arrays. As a fix, we simply suppress the warnings. 

2171 with suppress_warnings() as sup: 

2172 sup.filter(RuntimeWarning, 

2173 "invalid value encountered in less") 

2174 sup.filter(RuntimeWarning, 

2175 "invalid value encountered in greater") 

2176 

2177 # Main computations/logic 

2178 if kind == 'rank': 

2179 left = count(a < score) 

2180 right = count(a <= score) 

2181 plus1 = left < right 

2182 perct = (left + right + plus1) * (50.0 / n) 

2183 elif kind == 'strict': 

2184 perct = count(a < score) * (100.0 / n) 

2185 elif kind == 'weak': 

2186 perct = count(a <= score) * (100.0 / n) 

2187 elif kind == 'mean': 

2188 left = count(a < score) 

2189 right = count(a <= score) 

2190 perct = (left + right) * (50.0 / n) 

2191 else: 

2192 raise ValueError( 

2193 "kind can only be 'rank', 'strict', 'weak' or 'mean'") 

2194 

2195 # Re-insert nan values 

2196 perct = ma.filled(perct, np.nan) 

2197 

2198 if perct.ndim == 0: 

2199 return perct[()] 

2200 return perct 

2201 

2202 

2203HistogramResult = namedtuple('HistogramResult', 

2204 ('count', 'lowerlimit', 'binsize', 'extrapoints')) 

2205 

2206 

2207def _histogram(a, numbins=10, defaultlimits=None, weights=None, 

2208 printextras=False): 

2209 """Create a histogram. 

2210 

2211 Separate the range into several bins and return the number of instances 

2212 in each bin. 

2213 

2214 Parameters 

2215 ---------- 

2216 a : array_like 

2217 Array of scores which will be put into bins. 

2218 numbins : int, optional 

2219 The number of bins to use for the histogram. Default is 10. 

2220 defaultlimits : tuple (lower, upper), optional 

2221 The lower and upper values for the range of the histogram. 

2222 If no value is given, a range slightly larger than the range of the 

2223 values in a is used. Specifically ``(a.min() - s, a.max() + s)``, 

2224 where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``. 

2225 weights : array_like, optional 

2226 The weights for each value in `a`. Default is None, which gives each 

2227 value a weight of 1.0 

2228 printextras : bool, optional 

2229 If True, if there are extra points (i.e. the points that fall outside 

2230 the bin limits) a warning is raised saying how many of those points 

2231 there are. Default is False. 

2232 

2233 Returns 

2234 ------- 

2235 count : ndarray 

2236 Number of points (or sum of weights) in each bin. 

2237 lowerlimit : float 

2238 Lowest value of histogram, the lower limit of the first bin. 

2239 binsize : float 

2240 The size of the bins (all bins have the same size). 

2241 extrapoints : int 

2242 The number of points outside the range of the histogram. 

2243 

2244 See Also 

2245 -------- 

2246 numpy.histogram 

2247 

2248 Notes 

2249 ----- 

2250 This histogram is based on numpy's histogram but has a larger range by 

2251 default if default limits is not set. 

2252 

2253 """ 

2254 a = np.ravel(a) 

2255 if defaultlimits is None: 

2256 if a.size == 0: 

2257 # handle empty arrays. Undetermined range, so use 0-1. 

2258 defaultlimits = (0, 1) 

2259 else: 

2260 # no range given, so use values in `a` 

2261 data_min = a.min() 

2262 data_max = a.max() 

2263 # Have bins extend past min and max values slightly 

2264 s = (data_max - data_min) / (2. * (numbins - 1.)) 

2265 defaultlimits = (data_min - s, data_max + s) 

2266 

2267 # use numpy's histogram method to compute bins 

2268 hist, bin_edges = np.histogram(a, bins=numbins, range=defaultlimits, 

2269 weights=weights) 

2270 # hist are not always floats, convert to keep with old output 

2271 hist = np.array(hist, dtype=float) 

2272 # fixed width for bins is assumed, as numpy's histogram gives 

2273 # fixed width bins for int values for 'bins' 

2274 binsize = bin_edges[1] - bin_edges[0] 

2275 # calculate number of extra points 

2276 extrapoints = len([v for v in a 

2277 if defaultlimits[0] > v or v > defaultlimits[1]]) 

2278 if extrapoints > 0 and printextras: 

2279 warnings.warn("Points outside given histogram range = %s" 

2280 % extrapoints) 

2281 

2282 return HistogramResult(hist, defaultlimits[0], binsize, extrapoints) 

2283 

2284 

2285CumfreqResult = namedtuple('CumfreqResult', 

2286 ('cumcount', 'lowerlimit', 'binsize', 

2287 'extrapoints')) 

2288 

2289 

2290def cumfreq(a, numbins=10, defaultreallimits=None, weights=None): 

2291 """Return a cumulative frequency histogram, using the histogram function. 

2292 

2293 A cumulative histogram is a mapping that counts the cumulative number of 

2294 observations in all of the bins up to the specified bin. 

2295 

2296 Parameters 

2297 ---------- 

2298 a : array_like 

2299 Input array. 

2300 numbins : int, optional 

2301 The number of bins to use for the histogram. Default is 10. 

2302 defaultreallimits : tuple (lower, upper), optional 

2303 The lower and upper values for the range of the histogram. 

2304 If no value is given, a range slightly larger than the range of the 

2305 values in `a` is used. Specifically ``(a.min() - s, a.max() + s)``, 

2306 where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``. 

2307 weights : array_like, optional 

2308 The weights for each value in `a`. Default is None, which gives each 

2309 value a weight of 1.0 

2310 

2311 Returns 

2312 ------- 

2313 cumcount : ndarray 

2314 Binned values of cumulative frequency. 

2315 lowerlimit : float 

2316 Lower real limit 

2317 binsize : float 

2318 Width of each bin. 

2319 extrapoints : int 

2320 Extra points. 

2321 

2322 Examples 

2323 -------- 

2324 >>> import numpy as np 

2325 >>> import matplotlib.pyplot as plt 

2326 >>> from scipy import stats 

2327 >>> rng = np.random.default_rng() 

2328 >>> x = [1, 4, 2, 1, 3, 1] 

2329 >>> res = stats.cumfreq(x, numbins=4, defaultreallimits=(1.5, 5)) 

2330 >>> res.cumcount 

2331 array([ 1., 2., 3., 3.]) 

2332 >>> res.extrapoints 

2333 3 

2334 

2335 Create a normal distribution with 1000 random values 

2336 

2337 >>> samples = stats.norm.rvs(size=1000, random_state=rng) 

2338 

2339 Calculate cumulative frequencies 

2340 

2341 >>> res = stats.cumfreq(samples, numbins=25) 

2342 

2343 Calculate space of values for x 

2344 

2345 >>> x = res.lowerlimit + np.linspace(0, res.binsize*res.cumcount.size, 

2346 ... res.cumcount.size) 

2347 

2348 Plot histogram and cumulative histogram 

2349 

2350 >>> fig = plt.figure(figsize=(10, 4)) 

2351 >>> ax1 = fig.add_subplot(1, 2, 1) 

2352 >>> ax2 = fig.add_subplot(1, 2, 2) 

2353 >>> ax1.hist(samples, bins=25) 

2354 >>> ax1.set_title('Histogram') 

2355 >>> ax2.bar(x, res.cumcount, width=res.binsize) 

2356 >>> ax2.set_title('Cumulative histogram') 

2357 >>> ax2.set_xlim([x.min(), x.max()]) 

2358 

2359 >>> plt.show() 

2360 

2361 """ 

2362 h, l, b, e = _histogram(a, numbins, defaultreallimits, weights=weights) 

2363 cumhist = np.cumsum(h * 1, axis=0) 

2364 return CumfreqResult(cumhist, l, b, e) 

2365 

2366 

2367RelfreqResult = namedtuple('RelfreqResult', 

2368 ('frequency', 'lowerlimit', 'binsize', 

2369 'extrapoints')) 

2370 

2371 

2372def relfreq(a, numbins=10, defaultreallimits=None, weights=None): 

2373 """Return a relative frequency histogram, using the histogram function. 

2374 

2375 A relative frequency histogram is a mapping of the number of 

2376 observations in each of the bins relative to the total of observations. 

2377 

2378 Parameters 

2379 ---------- 

2380 a : array_like 

2381 Input array. 

2382 numbins : int, optional 

2383 The number of bins to use for the histogram. Default is 10. 

2384 defaultreallimits : tuple (lower, upper), optional 

2385 The lower and upper values for the range of the histogram. 

2386 If no value is given, a range slightly larger than the range of the 

2387 values in a is used. Specifically ``(a.min() - s, a.max() + s)``, 

2388 where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``. 

2389 weights : array_like, optional 

2390 The weights for each value in `a`. Default is None, which gives each 

2391 value a weight of 1.0 

2392 

2393 Returns 

2394 ------- 

2395 frequency : ndarray 

2396 Binned values of relative frequency. 

2397 lowerlimit : float 

2398 Lower real limit. 

2399 binsize : float 

2400 Width of each bin. 

2401 extrapoints : int 

2402 Extra points. 

2403 

2404 Examples 

2405 -------- 

2406 >>> import numpy as np 

2407 >>> import matplotlib.pyplot as plt 

2408 >>> from scipy import stats 

2409 >>> rng = np.random.default_rng() 

2410 >>> a = np.array([2, 4, 1, 2, 3, 2]) 

2411 >>> res = stats.relfreq(a, numbins=4) 

2412 >>> res.frequency 

2413 array([ 0.16666667, 0.5 , 0.16666667, 0.16666667]) 

2414 >>> np.sum(res.frequency) # relative frequencies should add up to 1 

2415 1.0 

2416 

2417 Create a normal distribution with 1000 random values 

2418 

2419 >>> samples = stats.norm.rvs(size=1000, random_state=rng) 

2420 

2421 Calculate relative frequencies 

2422 

2423 >>> res = stats.relfreq(samples, numbins=25) 

2424 

2425 Calculate space of values for x 

2426 

2427 >>> x = res.lowerlimit + np.linspace(0, res.binsize*res.frequency.size, 

2428 ... res.frequency.size) 

2429 

2430 Plot relative frequency histogram 

2431 

2432 >>> fig = plt.figure(figsize=(5, 4)) 

2433 >>> ax = fig.add_subplot(1, 1, 1) 

2434 >>> ax.bar(x, res.frequency, width=res.binsize) 

2435 >>> ax.set_title('Relative frequency histogram') 

2436 >>> ax.set_xlim([x.min(), x.max()]) 

2437 

2438 >>> plt.show() 

2439 

2440 """ 

2441 a = np.asanyarray(a) 

2442 h, l, b, e = _histogram(a, numbins, defaultreallimits, weights=weights) 

2443 h = h / a.shape[0] 

2444 

2445 return RelfreqResult(h, l, b, e) 

2446 

2447 

2448##################################### 

2449# VARIABILITY FUNCTIONS # 

2450##################################### 

2451 

2452def obrientransform(*samples): 

2453 """Compute the O'Brien transform on input data (any number of arrays). 

2454 

2455 Used to test for homogeneity of variance prior to running one-way stats. 

2456 Each array in ``*samples`` is one level of a factor. 

2457 If `f_oneway` is run on the transformed data and found significant, 

2458 the variances are unequal. From Maxwell and Delaney [1]_, p.112. 

2459 

2460 Parameters 

2461 ---------- 

2462 sample1, sample2, ... : array_like 

2463 Any number of arrays. 

2464 

2465 Returns 

2466 ------- 

2467 obrientransform : ndarray 

2468 Transformed data for use in an ANOVA. The first dimension 

2469 of the result corresponds to the sequence of transformed 

2470 arrays. If the arrays given are all 1-D of the same length, 

2471 the return value is a 2-D array; otherwise it is a 1-D array 

2472 of type object, with each element being an ndarray. 

2473 

2474 References 

2475 ---------- 

2476 .. [1] S. E. Maxwell and H. D. Delaney, "Designing Experiments and 

2477 Analyzing Data: A Model Comparison Perspective", Wadsworth, 1990. 

2478 

2479 Examples 

2480 -------- 

2481 We'll test the following data sets for differences in their variance. 

2482 

2483 >>> x = [10, 11, 13, 9, 7, 12, 12, 9, 10] 

2484 >>> y = [13, 21, 5, 10, 8, 14, 10, 12, 7, 15] 

2485 

2486 Apply the O'Brien transform to the data. 

2487 

2488 >>> from scipy.stats import obrientransform 

2489 >>> tx, ty = obrientransform(x, y) 

2490 

2491 Use `scipy.stats.f_oneway` to apply a one-way ANOVA test to the 

2492 transformed data. 

2493 

2494 >>> from scipy.stats import f_oneway 

2495 >>> F, p = f_oneway(tx, ty) 

2496 >>> p 

2497 0.1314139477040335 

2498 

2499 If we require that ``p < 0.05`` for significance, we cannot conclude 

2500 that the variances are different. 

2501 

2502 """ 

2503 TINY = np.sqrt(np.finfo(float).eps) 

2504 

2505 # `arrays` will hold the transformed arguments. 

2506 arrays = [] 

2507 sLast = None 

2508 

2509 for sample in samples: 

2510 a = np.asarray(sample) 

2511 n = len(a) 

2512 mu = np.mean(a) 

2513 sq = (a - mu)**2 

2514 sumsq = sq.sum() 

2515 

2516 # The O'Brien transform. 

2517 t = ((n - 1.5) * n * sq - 0.5 * sumsq) / ((n - 1) * (n - 2)) 

2518 

2519 # Check that the mean of the transformed data is equal to the 

2520 # original variance. 

2521 var = sumsq / (n - 1) 

2522 if abs(var - np.mean(t)) > TINY: 

2523 raise ValueError('Lack of convergence in obrientransform.') 

2524 

2525 arrays.append(t) 

2526 sLast = a.shape 

2527 

2528 if sLast: 

2529 for arr in arrays[:-1]: 

2530 if sLast != arr.shape: 

2531 return np.array(arrays, dtype=object) 

2532 return np.array(arrays) 

2533 

2534 

2535def sem(a, axis=0, ddof=1, nan_policy='propagate'): 

2536 """Compute standard error of the mean. 

2537 

2538 Calculate the standard error of the mean (or standard error of 

2539 measurement) of the values in the input array. 

2540 

2541 Parameters 

2542 ---------- 

2543 a : array_like 

2544 An array containing the values for which the standard error is 

2545 returned. 

2546 axis : int or None, optional 

2547 Axis along which to operate. Default is 0. If None, compute over 

2548 the whole array `a`. 

2549 ddof : int, optional 

2550 Delta degrees-of-freedom. How many degrees of freedom to adjust 

2551 for bias in limited samples relative to the population estimate 

2552 of variance. Defaults to 1. 

2553 nan_policy : {'propagate', 'raise', 'omit'}, optional 

2554 Defines how to handle when input contains nan. 

2555 The following options are available (default is 'propagate'): 

2556 

2557 * 'propagate': returns nan 

2558 * 'raise': throws an error 

2559 * 'omit': performs the calculations ignoring nan values 

2560 

2561 Returns 

2562 ------- 

2563 s : ndarray or float 

2564 The standard error of the mean in the sample(s), along the input axis. 

2565 

2566 Notes 

2567 ----- 

2568 The default value for `ddof` is different to the default (0) used by other 

2569 ddof containing routines, such as np.std and np.nanstd. 

2570 

2571 Examples 

2572 -------- 

2573 Find standard error along the first axis: 

2574 

2575 >>> import numpy as np 

2576 >>> from scipy import stats 

2577 >>> a = np.arange(20).reshape(5,4) 

2578 >>> stats.sem(a) 

2579 array([ 2.8284, 2.8284, 2.8284, 2.8284]) 

2580 

2581 Find standard error across the whole array, using n degrees of freedom: 

2582 

2583 >>> stats.sem(a, axis=None, ddof=0) 

2584 1.2893796958227628 

2585 

2586 """ 

2587 a, axis = _chk_asarray(a, axis) 

2588 

2589 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

2590 

2591 if contains_nan and nan_policy == 'omit': 

2592 a = ma.masked_invalid(a) 

2593 return mstats_basic.sem(a, axis, ddof) 

2594 

2595 n = a.shape[axis] 

2596 s = np.std(a, axis=axis, ddof=ddof) / np.sqrt(n) 

2597 return s 

2598 

2599 

2600def _isconst(x): 

2601 """ 

2602 Check if all values in x are the same. nans are ignored. 

2603 

2604 x must be a 1d array. 

2605 

2606 The return value is a 1d array with length 1, so it can be used 

2607 in np.apply_along_axis. 

2608 """ 

2609 y = x[~np.isnan(x)] 

2610 if y.size == 0: 

2611 return np.array([True]) 

2612 else: 

2613 return (y[0] == y).all(keepdims=True) 

2614 

2615 

2616def _quiet_nanmean(x): 

2617 """ 

2618 Compute nanmean for the 1d array x, but quietly return nan if x is all nan. 

2619 

2620 The return value is a 1d array with length 1, so it can be used 

2621 in np.apply_along_axis. 

2622 """ 

2623 y = x[~np.isnan(x)] 

2624 if y.size == 0: 

2625 return np.array([np.nan]) 

2626 else: 

2627 return np.mean(y, keepdims=True) 

2628 

2629 

2630def _quiet_nanstd(x, ddof=0): 

2631 """ 

2632 Compute nanstd for the 1d array x, but quietly return nan if x is all nan. 

2633 

2634 The return value is a 1d array with length 1, so it can be used 

2635 in np.apply_along_axis. 

2636 """ 

2637 y = x[~np.isnan(x)] 

2638 if y.size == 0: 

2639 return np.array([np.nan]) 

2640 else: 

2641 return np.std(y, keepdims=True, ddof=ddof) 

2642 

2643 

2644def zscore(a, axis=0, ddof=0, nan_policy='propagate'): 

2645 """ 

2646 Compute the z score. 

2647 

2648 Compute the z score of each value in the sample, relative to the 

2649 sample mean and standard deviation. 

2650 

2651 Parameters 

2652 ---------- 

2653 a : array_like 

2654 An array like object containing the sample data. 

2655 axis : int or None, optional 

2656 Axis along which to operate. Default is 0. If None, compute over 

2657 the whole array `a`. 

2658 ddof : int, optional 

2659 Degrees of freedom correction in the calculation of the 

2660 standard deviation. Default is 0. 

2661 nan_policy : {'propagate', 'raise', 'omit'}, optional 

2662 Defines how to handle when input contains nan. 'propagate' returns nan, 

2663 'raise' throws an error, 'omit' performs the calculations ignoring nan 

2664 values. Default is 'propagate'. Note that when the value is 'omit', 

2665 nans in the input also propagate to the output, but they do not affect 

2666 the z-scores computed for the non-nan values. 

2667 

2668 Returns 

2669 ------- 

2670 zscore : array_like 

2671 The z-scores, standardized by mean and standard deviation of 

2672 input array `a`. 

2673 

2674 Notes 

2675 ----- 

2676 This function preserves ndarray subclasses, and works also with 

2677 matrices and masked arrays (it uses `asanyarray` instead of 

2678 `asarray` for parameters). 

2679 

2680 Examples 

2681 -------- 

2682 >>> import numpy as np 

2683 >>> a = np.array([ 0.7972, 0.0767, 0.4383, 0.7866, 0.8091, 

2684 ... 0.1954, 0.6307, 0.6599, 0.1065, 0.0508]) 

2685 >>> from scipy import stats 

2686 >>> stats.zscore(a) 

2687 array([ 1.1273, -1.247 , -0.0552, 1.0923, 1.1664, -0.8559, 0.5786, 

2688 0.6748, -1.1488, -1.3324]) 

2689 

2690 Computing along a specified axis, using n-1 degrees of freedom 

2691 (``ddof=1``) to calculate the standard deviation: 

2692 

2693 >>> b = np.array([[ 0.3148, 0.0478, 0.6243, 0.4608], 

2694 ... [ 0.7149, 0.0775, 0.6072, 0.9656], 

2695 ... [ 0.6341, 0.1403, 0.9759, 0.4064], 

2696 ... [ 0.5918, 0.6948, 0.904 , 0.3721], 

2697 ... [ 0.0921, 0.2481, 0.1188, 0.1366]]) 

2698 >>> stats.zscore(b, axis=1, ddof=1) 

2699 array([[-0.19264823, -1.28415119, 1.07259584, 0.40420358], 

2700 [ 0.33048416, -1.37380874, 0.04251374, 1.00081084], 

2701 [ 0.26796377, -1.12598418, 1.23283094, -0.37481053], 

2702 [-0.22095197, 0.24468594, 1.19042819, -1.21416216], 

2703 [-0.82780366, 1.4457416 , -0.43867764, -0.1792603 ]]) 

2704 

2705 An example with `nan_policy='omit'`: 

2706 

2707 >>> x = np.array([[25.11, 30.10, np.nan, 32.02, 43.15], 

2708 ... [14.95, 16.06, 121.25, 94.35, 29.81]]) 

2709 >>> stats.zscore(x, axis=1, nan_policy='omit') 

2710 array([[-1.13490897, -0.37830299, nan, -0.08718406, 1.60039602], 

2711 [-0.91611681, -0.89090508, 1.4983032 , 0.88731639, -0.5785977 ]]) 

2712 """ 

2713 return zmap(a, a, axis=axis, ddof=ddof, nan_policy=nan_policy) 

2714 

2715 

2716def gzscore(a, *, axis=0, ddof=0, nan_policy='propagate'): 

2717 """ 

2718 Compute the geometric standard score. 

2719 

2720 Compute the geometric z score of each strictly positive value in the 

2721 sample, relative to the geometric mean and standard deviation. 

2722 Mathematically the geometric z score can be evaluated as:: 

2723 

2724 gzscore = log(a/gmu) / log(gsigma) 

2725 

2726 where ``gmu`` (resp. ``gsigma``) is the geometric mean (resp. standard 

2727 deviation). 

2728 

2729 Parameters 

2730 ---------- 

2731 a : array_like 

2732 Sample data. 

2733 axis : int or None, optional 

2734 Axis along which to operate. Default is 0. If None, compute over 

2735 the whole array `a`. 

2736 ddof : int, optional 

2737 Degrees of freedom correction in the calculation of the 

2738 standard deviation. Default is 0. 

2739 nan_policy : {'propagate', 'raise', 'omit'}, optional 

2740 Defines how to handle when input contains nan. 'propagate' returns nan, 

2741 'raise' throws an error, 'omit' performs the calculations ignoring nan 

2742 values. Default is 'propagate'. Note that when the value is 'omit', 

2743 nans in the input also propagate to the output, but they do not affect 

2744 the geometric z scores computed for the non-nan values. 

2745 

2746 Returns 

2747 ------- 

2748 gzscore : array_like 

2749 The geometric z scores, standardized by geometric mean and geometric 

2750 standard deviation of input array `a`. 

2751 

2752 See Also 

2753 -------- 

2754 gmean : Geometric mean 

2755 gstd : Geometric standard deviation 

2756 zscore : Standard score 

2757 

2758 Notes 

2759 ----- 

2760 This function preserves ndarray subclasses, and works also with 

2761 matrices and masked arrays (it uses ``asanyarray`` instead of 

2762 ``asarray`` for parameters). 

2763 

2764 .. versionadded:: 1.8 

2765 

2766 Examples 

2767 -------- 

2768 Draw samples from a log-normal distribution: 

2769 

2770 >>> import numpy as np 

2771 >>> from scipy.stats import zscore, gzscore 

2772 >>> import matplotlib.pyplot as plt 

2773 

2774 >>> rng = np.random.default_rng() 

2775 >>> mu, sigma = 3., 1. # mean and standard deviation 

2776 >>> x = rng.lognormal(mu, sigma, size=500) 

2777 

2778 Display the histogram of the samples: 

2779 

2780 >>> fig, ax = plt.subplots() 

2781 >>> ax.hist(x, 50) 

2782 >>> plt.show() 

2783 

2784 Display the histogram of the samples standardized by the classical zscore. 

2785 Distribution is rescaled but its shape is unchanged. 

2786 

2787 >>> fig, ax = plt.subplots() 

2788 >>> ax.hist(zscore(x), 50) 

2789 >>> plt.show() 

2790 

2791 Demonstrate that the distribution of geometric zscores is rescaled and 

2792 quasinormal: 

2793 

2794 >>> fig, ax = plt.subplots() 

2795 >>> ax.hist(gzscore(x), 50) 

2796 >>> plt.show() 

2797 

2798 """ 

2799 a = np.asanyarray(a) 

2800 log = ma.log if isinstance(a, ma.MaskedArray) else np.log 

2801 

2802 return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan_policy) 

2803 

2804 

2805def zmap(scores, compare, axis=0, ddof=0, nan_policy='propagate'): 

2806 """ 

2807 Calculate the relative z-scores. 

2808 

2809 Return an array of z-scores, i.e., scores that are standardized to 

2810 zero mean and unit variance, where mean and variance are calculated 

2811 from the comparison array. 

2812 

2813 Parameters 

2814 ---------- 

2815 scores : array_like 

2816 The input for which z-scores are calculated. 

2817 compare : array_like 

2818 The input from which the mean and standard deviation of the 

2819 normalization are taken; assumed to have the same dimension as 

2820 `scores`. 

2821 axis : int or None, optional 

2822 Axis over which mean and variance of `compare` are calculated. 

2823 Default is 0. If None, compute over the whole array `scores`. 

2824 ddof : int, optional 

2825 Degrees of freedom correction in the calculation of the 

2826 standard deviation. Default is 0. 

2827 nan_policy : {'propagate', 'raise', 'omit'}, optional 

2828 Defines how to handle the occurrence of nans in `compare`. 

2829 'propagate' returns nan, 'raise' raises an exception, 'omit' 

2830 performs the calculations ignoring nan values. Default is 

2831 'propagate'. Note that when the value is 'omit', nans in `scores` 

2832 also propagate to the output, but they do not affect the z-scores 

2833 computed for the non-nan values. 

2834 

2835 Returns 

2836 ------- 

2837 zscore : array_like 

2838 Z-scores, in the same shape as `scores`. 

2839 

2840 Notes 

2841 ----- 

2842 This function preserves ndarray subclasses, and works also with 

2843 matrices and masked arrays (it uses `asanyarray` instead of 

2844 `asarray` for parameters). 

2845 

2846 Examples 

2847 -------- 

2848 >>> from scipy.stats import zmap 

2849 >>> a = [0.5, 2.0, 2.5, 3] 

2850 >>> b = [0, 1, 2, 3, 4] 

2851 >>> zmap(a, b) 

2852 array([-1.06066017, 0. , 0.35355339, 0.70710678]) 

2853 

2854 """ 

2855 a = np.asanyarray(compare) 

2856 

2857 if a.size == 0: 

2858 return np.empty(a.shape) 

2859 

2860 contains_nan, nan_policy = _contains_nan(a, nan_policy) 

2861 

2862 if contains_nan and nan_policy == 'omit': 

2863 if axis is None: 

2864 mn = _quiet_nanmean(a.ravel()) 

2865 std = _quiet_nanstd(a.ravel(), ddof=ddof) 

2866 isconst = _isconst(a.ravel()) 

2867 else: 

2868 mn = np.apply_along_axis(_quiet_nanmean, axis, a) 

2869 std = np.apply_along_axis(_quiet_nanstd, axis, a, ddof=ddof) 

2870 isconst = np.apply_along_axis(_isconst, axis, a) 

2871 else: 

2872 mn = a.mean(axis=axis, keepdims=True) 

2873 std = a.std(axis=axis, ddof=ddof, keepdims=True) 

2874 if axis is None: 

2875 isconst = (a.item(0) == a).all() 

2876 else: 

2877 isconst = (_first(a, axis) == a).all(axis=axis, keepdims=True) 

2878 

2879 # Set std deviations that are 0 to 1 to avoid division by 0. 

2880 std[isconst] = 1.0 

2881 z = (scores - mn) / std 

2882 # Set the outputs associated with a constant input to nan. 

2883 z[np.broadcast_to(isconst, z.shape)] = np.nan 

2884 return z 

2885 

2886 

2887def gstd(a, axis=0, ddof=1): 

2888 """ 

2889 Calculate the geometric standard deviation of an array. 

2890 

2891 The geometric standard deviation describes the spread of a set of numbers 

2892 where the geometric mean is preferred. It is a multiplicative factor, and 

2893 so a dimensionless quantity. 

2894 

2895 It is defined as the exponent of the standard deviation of ``log(a)``. 

2896 Mathematically the population geometric standard deviation can be 

2897 evaluated as:: 

2898 

2899 gstd = exp(std(log(a))) 

2900 

2901 .. versionadded:: 1.3.0 

2902 

2903 Parameters 

2904 ---------- 

2905 a : array_like 

2906 An array like object containing the sample data. 

2907 axis : int, tuple or None, optional 

2908 Axis along which to operate. Default is 0. If None, compute over 

2909 the whole array `a`. 

2910 ddof : int, optional 

2911 Degree of freedom correction in the calculation of the 

2912 geometric standard deviation. Default is 1. 

2913 

2914 Returns 

2915 ------- 

2916 ndarray or float 

2917 An array of the geometric standard deviation. If `axis` is None or `a` 

2918 is a 1d array a float is returned. 

2919 

2920 See Also 

2921 -------- 

2922 gmean : Geometric mean 

2923 numpy.std : Standard deviation 

2924 

2925 Notes 

2926 ----- 

2927 As the calculation requires the use of logarithms the geometric standard 

2928 deviation only supports strictly positive values. Any non-positive or 

2929 infinite values will raise a `ValueError`. 

2930 The geometric standard deviation is sometimes confused with the exponent of 

2931 the standard deviation, ``exp(std(a))``. Instead the geometric standard 

2932 deviation is ``exp(std(log(a)))``. 

2933 The default value for `ddof` is different to the default value (0) used 

2934 by other ddof containing functions, such as ``np.std`` and ``np.nanstd``. 

2935 

2936 References 

2937 ---------- 

2938 .. [1] Kirkwood, T. B., "Geometric means and measures of dispersion", 

2939 Biometrics, vol. 35, pp. 908-909, 1979 

2940 

2941 Examples 

2942 -------- 

2943 Find the geometric standard deviation of a log-normally distributed sample. 

2944 Note that the standard deviation of the distribution is one, on a 

2945 log scale this evaluates to approximately ``exp(1)``. 

2946 

2947 >>> import numpy as np 

2948 >>> from scipy.stats import gstd 

2949 >>> rng = np.random.default_rng() 

2950 >>> sample = rng.lognormal(mean=0, sigma=1, size=1000) 

2951 >>> gstd(sample) 

2952 2.810010162475324 

2953 

2954 Compute the geometric standard deviation of a multidimensional array and 

2955 of a given axis. 

2956 

2957 >>> a = np.arange(1, 25).reshape(2, 3, 4) 

2958 >>> gstd(a, axis=None) 

2959 2.2944076136018947 

2960 >>> gstd(a, axis=2) 

2961 array([[1.82424757, 1.22436866, 1.13183117], 

2962 [1.09348306, 1.07244798, 1.05914985]]) 

2963 >>> gstd(a, axis=(1,2)) 

2964 array([2.12939215, 1.22120169]) 

2965 

2966 The geometric standard deviation further handles masked arrays. 

2967 

2968 >>> a = np.arange(1, 25).reshape(2, 3, 4) 

2969 >>> ma = np.ma.masked_where(a > 16, a) 

2970 >>> ma 

2971 masked_array( 

2972 data=[[[1, 2, 3, 4], 

2973 [5, 6, 7, 8], 

2974 [9, 10, 11, 12]], 

2975 [[13, 14, 15, 16], 

2976 [--, --, --, --], 

2977 [--, --, --, --]]], 

2978 mask=[[[False, False, False, False], 

2979 [False, False, False, False], 

2980 [False, False, False, False]], 

2981 [[False, False, False, False], 

2982 [ True, True, True, True], 

2983 [ True, True, True, True]]], 

2984 fill_value=999999) 

2985 >>> gstd(ma, axis=2) 

2986 masked_array( 

2987 data=[[1.8242475707663655, 1.2243686572447428, 1.1318311657788478], 

2988 [1.0934830582350938, --, --]], 

2989 mask=[[False, False, False], 

2990 [False, True, True]], 

2991 fill_value=999999) 

2992 

2993 """ 

2994 a = np.asanyarray(a) 

2995 log = ma.log if isinstance(a, ma.MaskedArray) else np.log 

2996 

2997 try: 

2998 with warnings.catch_warnings(): 

2999 warnings.simplefilter("error", RuntimeWarning) 

3000 return np.exp(np.std(log(a), axis=axis, ddof=ddof)) 

3001 except RuntimeWarning as w: 

3002 if np.isinf(a).any(): 

3003 raise ValueError( 

3004 'Infinite value encountered. The geometric standard deviation ' 

3005 'is defined for strictly positive values only.' 

3006 ) from w 

3007 a_nan = np.isnan(a) 

3008 a_nan_any = a_nan.any() 

3009 # exclude NaN's from negativity check, but 

3010 # avoid expensive masking for arrays with no NaN 

3011 if ((a_nan_any and np.less_equal(np.nanmin(a), 0)) or 

3012 (not a_nan_any and np.less_equal(a, 0).any())): 

3013 raise ValueError( 

3014 'Non positive value encountered. The geometric standard ' 

3015 'deviation is defined for strictly positive values only.' 

3016 ) from w 

3017 elif 'Degrees of freedom <= 0 for slice' == str(w): 

3018 raise ValueError(w) from w 

3019 else: 

3020 # Remaining warnings don't need to be exceptions. 

3021 return np.exp(np.std(log(a, where=~a_nan), axis=axis, ddof=ddof)) 

3022 except TypeError as e: 

3023 raise ValueError( 

3024 'Invalid array input. The inputs could not be ' 

3025 'safely coerced to any supported types') from e 

3026 

3027 

3028# Private dictionary initialized only once at module level 

3029# See https://en.wikipedia.org/wiki/Robust_measures_of_scale 

3030_scale_conversions = {'raw': 1.0, 

3031 'normal': special.erfinv(0.5) * 2.0 * math.sqrt(2.0)} 

3032 

3033 

3034def iqr(x, axis=None, rng=(25, 75), scale=1.0, nan_policy='propagate', 

3035 interpolation='linear', keepdims=False): 

3036 r""" 

3037 Compute the interquartile range of the data along the specified axis. 

3038 

3039 The interquartile range (IQR) is the difference between the 75th and 

3040 25th percentile of the data. It is a measure of the dispersion 

3041 similar to standard deviation or variance, but is much more robust 

3042 against outliers [2]_. 

3043 

3044 The ``rng`` parameter allows this function to compute other 

3045 percentile ranges than the actual IQR. For example, setting 

3046 ``rng=(0, 100)`` is equivalent to `numpy.ptp`. 

3047 

3048 The IQR of an empty array is `np.nan`. 

3049 

3050 .. versionadded:: 0.18.0 

3051 

3052 Parameters 

3053 ---------- 

3054 x : array_like 

3055 Input array or object that can be converted to an array. 

3056 axis : int or sequence of int, optional 

3057 Axis along which the range is computed. The default is to 

3058 compute the IQR for the entire array. 

3059 rng : Two-element sequence containing floats in range of [0,100] optional 

3060 Percentiles over which to compute the range. Each must be 

3061 between 0 and 100, inclusive. The default is the true IQR: 

3062 ``(25, 75)``. The order of the elements is not important. 

3063 scale : scalar or str, optional 

3064 The numerical value of scale will be divided out of the final 

3065 result. The following string values are recognized: 

3066 

3067 * 'raw' : No scaling, just return the raw IQR. 

3068 **Deprecated!** Use ``scale=1`` instead. 

3069 * 'normal' : Scale by 

3070 :math:`2 \sqrt{2} erf^{-1}(\frac{1}{2}) \approx 1.349`. 

3071 

3072 The default is 1.0. The use of ``scale='raw'`` is deprecated infavor 

3073 of ``scale=1`` and will raise an error in SciPy 1.12.0. 

3074 Array-like `scale` is also allowed, as long 

3075 as it broadcasts correctly to the output such that 

3076 ``out / scale`` is a valid operation. The output dimensions 

3077 depend on the input array, `x`, the `axis` argument, and the 

3078 `keepdims` flag. 

3079 nan_policy : {'propagate', 'raise', 'omit'}, optional 

3080 Defines how to handle when input contains nan. 

3081 The following options are available (default is 'propagate'): 

3082 

3083 * 'propagate': returns nan 

3084 * 'raise': throws an error 

3085 * 'omit': performs the calculations ignoring nan values 

3086 interpolation : str, optional 

3087 

3088 Specifies the interpolation method to use when the percentile 

3089 boundaries lie between two data points ``i`` and ``j``. 

3090 The following options are available (default is 'linear'): 

3091 

3092 * 'linear': ``i + (j - i)*fraction``, where ``fraction`` is the 

3093 fractional part of the index surrounded by ``i`` and ``j``. 

3094 * 'lower': ``i``. 

3095 * 'higher': ``j``. 

3096 * 'nearest': ``i`` or ``j`` whichever is nearest. 

3097 * 'midpoint': ``(i + j)/2``. 

3098 

3099 For NumPy >= 1.22.0, the additional options provided by the ``method`` 

3100 keyword of `numpy.percentile` are also valid. 

3101 

3102 keepdims : bool, optional 

3103 If this is set to True, the reduced axes are left in the 

3104 result as dimensions with size one. With this option, the result 

3105 will broadcast correctly against the original array `x`. 

3106 

3107 Returns 

3108 ------- 

3109 iqr : scalar or ndarray 

3110 If ``axis=None``, a scalar is returned. If the input contains 

3111 integers or floats of smaller precision than ``np.float64``, then the 

3112 output data-type is ``np.float64``. Otherwise, the output data-type is 

3113 the same as that of the input. 

3114 

3115 See Also 

3116 -------- 

3117 numpy.std, numpy.var 

3118 

3119 References 

3120 ---------- 

3121 .. [1] "Interquartile range" https://en.wikipedia.org/wiki/Interquartile_range 

3122 .. [2] "Robust measures of scale" https://en.wikipedia.org/wiki/Robust_measures_of_scale 

3123 .. [3] "Quantile" https://en.wikipedia.org/wiki/Quantile 

3124 

3125 Examples 

3126 -------- 

3127 >>> import numpy as np 

3128 >>> from scipy.stats import iqr 

3129 >>> x = np.array([[10, 7, 4], [3, 2, 1]]) 

3130 >>> x 

3131 array([[10, 7, 4], 

3132 [ 3, 2, 1]]) 

3133 >>> iqr(x) 

3134 4.0 

3135 >>> iqr(x, axis=0) 

3136 array([ 3.5, 2.5, 1.5]) 

3137 >>> iqr(x, axis=1) 

3138 array([ 3., 1.]) 

3139 >>> iqr(x, axis=1, keepdims=True) 

3140 array([[ 3.], 

3141 [ 1.]]) 

3142 

3143 """ 

3144 x = asarray(x) 

3145 

3146 # This check prevents percentile from raising an error later. Also, it is 

3147 # consistent with `np.var` and `np.std`. 

3148 if not x.size: 

3149 return np.nan 

3150 

3151 # An error may be raised here, so fail-fast, before doing lengthy 

3152 # computations, even though `scale` is not used until later 

3153 if isinstance(scale, str): 

3154 scale_key = scale.lower() 

3155 if scale_key not in _scale_conversions: 

3156 raise ValueError("{0} not a valid scale for `iqr`".format(scale)) 

3157 if scale_key == 'raw': 

3158 msg = ("The use of 'scale=\"raw\"' is deprecated infavor of " 

3159 "'scale=1' and will raise an error in SciPy 1.12.0.") 

3160 warnings.warn(msg, DeprecationWarning, stacklevel=2) 

3161 scale = _scale_conversions[scale_key] 

3162 

3163 # Select the percentile function to use based on nans and policy 

3164 contains_nan, nan_policy = _contains_nan(x, nan_policy) 

3165 

3166 if contains_nan and nan_policy == 'omit': 

3167 percentile_func = np.nanpercentile 

3168 else: 

3169 percentile_func = np.percentile 

3170 

3171 if len(rng) != 2: 

3172 raise TypeError("quantile range must be two element sequence") 

3173 

3174 if np.isnan(rng).any(): 

3175 raise ValueError("range must not contain NaNs") 

3176 

3177 rng = sorted(rng) 

3178 if NumpyVersion(np.__version__) >= '1.22.0': 

3179 pct = percentile_func(x, rng, axis=axis, method=interpolation, 

3180 keepdims=keepdims) 

3181 else: 

3182 pct = percentile_func(x, rng, axis=axis, interpolation=interpolation, 

3183 keepdims=keepdims) 

3184 out = np.subtract(pct[1], pct[0]) 

3185 

3186 if scale != 1.0: 

3187 out /= scale 

3188 

3189 return out 

3190 

3191 

3192def _mad_1d(x, center, nan_policy): 

3193 # Median absolute deviation for 1-d array x. 

3194 # This is a helper function for `median_abs_deviation`; it assumes its 

3195 # arguments have been validated already. In particular, x must be a 

3196 # 1-d numpy array, center must be callable, and if nan_policy is not 

3197 # 'propagate', it is assumed to be 'omit', because 'raise' is handled 

3198 # in `median_abs_deviation`. 

3199 # No warning is generated if x is empty or all nan. 

3200 isnan = np.isnan(x) 

3201 if isnan.any(): 

3202 if nan_policy == 'propagate': 

3203 return np.nan 

3204 x = x[~isnan] 

3205 if x.size == 0: 

3206 # MAD of an empty array is nan. 

3207 return np.nan 

3208 # Edge cases have been handled, so do the basic MAD calculation. 

3209 med = center(x) 

3210 mad = np.median(np.abs(x - med)) 

3211 return mad 

3212 

3213 

3214def median_abs_deviation(x, axis=0, center=np.median, scale=1.0, 

3215 nan_policy='propagate'): 

3216 r""" 

3217 Compute the median absolute deviation of the data along the given axis. 

3218 

3219 The median absolute deviation (MAD, [1]_) computes the median over the 

3220 absolute deviations from the median. It is a measure of dispersion 

3221 similar to the standard deviation but more robust to outliers [2]_. 

3222 

3223 The MAD of an empty array is ``np.nan``. 

3224 

3225 .. versionadded:: 1.5.0 

3226 

3227 Parameters 

3228 ---------- 

3229 x : array_like 

3230 Input array or object that can be converted to an array. 

3231 axis : int or None, optional 

3232 Axis along which the range is computed. Default is 0. If None, compute 

3233 the MAD over the entire array. 

3234 center : callable, optional 

3235 A function that will return the central value. The default is to use 

3236 np.median. Any user defined function used will need to have the 

3237 function signature ``func(arr, axis)``. 

3238 scale : scalar or str, optional 

3239 The numerical value of scale will be divided out of the final 

3240 result. The default is 1.0. The string "normal" is also accepted, 

3241 and results in `scale` being the inverse of the standard normal 

3242 quantile function at 0.75, which is approximately 0.67449. 

3243 Array-like scale is also allowed, as long as it broadcasts correctly 

3244 to the output such that ``out / scale`` is a valid operation. The 

3245 output dimensions depend on the input array, `x`, and the `axis` 

3246 argument. 

3247 nan_policy : {'propagate', 'raise', 'omit'}, optional 

3248 Defines how to handle when input contains nan. 

3249 The following options are available (default is 'propagate'): 

3250 

3251 * 'propagate': returns nan 

3252 * 'raise': throws an error 

3253 * 'omit': performs the calculations ignoring nan values 

3254 

3255 Returns 

3256 ------- 

3257 mad : scalar or ndarray 

3258 If ``axis=None``, a scalar is returned. If the input contains 

3259 integers or floats of smaller precision than ``np.float64``, then the 

3260 output data-type is ``np.float64``. Otherwise, the output data-type is 

3261 the same as that of the input. 

3262 

3263 See Also 

3264 -------- 

3265 numpy.std, numpy.var, numpy.median, scipy.stats.iqr, scipy.stats.tmean, 

3266 scipy.stats.tstd, scipy.stats.tvar 

3267 

3268 Notes 

3269 ----- 

3270 The `center` argument only affects the calculation of the central value 

3271 around which the MAD is calculated. That is, passing in ``center=np.mean`` 

3272 will calculate the MAD around the mean - it will not calculate the *mean* 

3273 absolute deviation. 

3274 

3275 The input array may contain `inf`, but if `center` returns `inf`, the 

3276 corresponding MAD for that data will be `nan`. 

3277 

3278 References 

3279 ---------- 

3280 .. [1] "Median absolute deviation", 

3281 https://en.wikipedia.org/wiki/Median_absolute_deviation 

3282 .. [2] "Robust measures of scale", 

3283 https://en.wikipedia.org/wiki/Robust_measures_of_scale 

3284 

3285 Examples 

3286 -------- 

3287 When comparing the behavior of `median_abs_deviation` with ``np.std``, 

3288 the latter is affected when we change a single value of an array to have an 

3289 outlier value while the MAD hardly changes: 

3290 

3291 >>> import numpy as np 

3292 >>> from scipy import stats 

3293 >>> x = stats.norm.rvs(size=100, scale=1, random_state=123456) 

3294 >>> x.std() 

3295 0.9973906394005013 

3296 >>> stats.median_abs_deviation(x) 

3297 0.82832610097857 

3298 >>> x[0] = 345.6 

3299 >>> x.std() 

3300 34.42304872314415 

3301 >>> stats.median_abs_deviation(x) 

3302 0.8323442311590675 

3303 

3304 Axis handling example: 

3305 

3306 >>> x = np.array([[10, 7, 4], [3, 2, 1]]) 

3307 >>> x 

3308 array([[10, 7, 4], 

3309 [ 3, 2, 1]]) 

3310 >>> stats.median_abs_deviation(x) 

3311 array([3.5, 2.5, 1.5]) 

3312 >>> stats.median_abs_deviation(x, axis=None) 

3313 2.0 

3314 

3315 Scale normal example: 

3316 

3317 >>> x = stats.norm.rvs(size=1000000, scale=2, random_state=123456) 

3318 >>> stats.median_abs_deviation(x) 

3319 1.3487398527041636 

3320 >>> stats.median_abs_deviation(x, scale='normal') 

3321 1.9996446978061115 

3322 

3323 """ 

3324 if not callable(center): 

3325 raise TypeError("The argument 'center' must be callable. The given " 

3326 f"value {repr(center)} is not callable.") 

3327 

3328 # An error may be raised here, so fail-fast, before doing lengthy 

3329 # computations, even though `scale` is not used until later 

3330 if isinstance(scale, str): 

3331 if scale.lower() == 'normal': 

3332 scale = 0.6744897501960817 # special.ndtri(0.75) 

3333 else: 

3334 raise ValueError(f"{scale} is not a valid scale value.") 

3335 

3336 x = asarray(x) 

3337 

3338 # Consistent with `np.var` and `np.std`. 

3339 if not x.size: 

3340 if axis is None: 

3341 return np.nan 

3342 nan_shape = tuple(item for i, item in enumerate(x.shape) if i != axis) 

3343 if nan_shape == (): 

3344 # Return nan, not array(nan) 

3345 return np.nan 

3346 return np.full(nan_shape, np.nan) 

3347 

3348 contains_nan, nan_policy = _contains_nan(x, nan_policy) 

3349 

3350 if contains_nan: 

3351 if axis is None: 

3352 mad = _mad_1d(x.ravel(), center, nan_policy) 

3353 else: 

3354 mad = np.apply_along_axis(_mad_1d, axis, x, center, nan_policy) 

3355 else: 

3356 if axis is None: 

3357 med = center(x, axis=None) 

3358 mad = np.median(np.abs(x - med)) 

3359 else: 

3360 # Wrap the call to center() in expand_dims() so it acts like 

3361 # keepdims=True was used. 

3362 med = np.expand_dims(center(x, axis=axis), axis) 

3363 mad = np.median(np.abs(x - med), axis=axis) 

3364 

3365 return mad / scale 

3366 

3367 

3368##################################### 

3369# TRIMMING FUNCTIONS # 

3370##################################### 

3371 

3372 

3373SigmaclipResult = namedtuple('SigmaclipResult', ('clipped', 'lower', 'upper')) 

3374 

3375 

3376def sigmaclip(a, low=4., high=4.): 

3377 """Perform iterative sigma-clipping of array elements. 

3378 

3379 Starting from the full sample, all elements outside the critical range are 

3380 removed, i.e. all elements of the input array `c` that satisfy either of 

3381 the following conditions:: 

3382 

3383 c < mean(c) - std(c)*low 

3384 c > mean(c) + std(c)*high 

3385 

3386 The iteration continues with the updated sample until no 

3387 elements are outside the (updated) range. 

3388 

3389 Parameters 

3390 ---------- 

3391 a : array_like 

3392 Data array, will be raveled if not 1-D. 

3393 low : float, optional 

3394 Lower bound factor of sigma clipping. Default is 4. 

3395 high : float, optional 

3396 Upper bound factor of sigma clipping. Default is 4. 

3397 

3398 Returns 

3399 ------- 

3400 clipped : ndarray 

3401 Input array with clipped elements removed. 

3402 lower : float 

3403 Lower threshold value use for clipping. 

3404 upper : float 

3405 Upper threshold value use for clipping. 

3406 

3407 Examples 

3408 -------- 

3409 >>> import numpy as np 

3410 >>> from scipy.stats import sigmaclip 

3411 >>> a = np.concatenate((np.linspace(9.5, 10.5, 31), 

3412 ... np.linspace(0, 20, 5))) 

3413 >>> fact = 1.5 

3414 >>> c, low, upp = sigmaclip(a, fact, fact) 

3415 >>> c 

3416 array([ 9.96666667, 10. , 10.03333333, 10. ]) 

3417 >>> c.var(), c.std() 

3418 (0.00055555555555555165, 0.023570226039551501) 

3419 >>> low, c.mean() - fact*c.std(), c.min() 

3420 (9.9646446609406727, 9.9646446609406727, 9.9666666666666668) 

3421 >>> upp, c.mean() + fact*c.std(), c.max() 

3422 (10.035355339059327, 10.035355339059327, 10.033333333333333) 

3423 

3424 >>> a = np.concatenate((np.linspace(9.5, 10.5, 11), 

3425 ... np.linspace(-100, -50, 3))) 

3426 >>> c, low, upp = sigmaclip(a, 1.8, 1.8) 

3427 >>> (c == np.linspace(9.5, 10.5, 11)).all() 

3428 True 

3429 

3430 """ 

3431 c = np.asarray(a).ravel() 

3432 delta = 1 

3433 while delta: 

3434 c_std = c.std() 

3435 c_mean = c.mean() 

3436 size = c.size 

3437 critlower = c_mean - c_std * low 

3438 critupper = c_mean + c_std * high 

3439 c = c[(c >= critlower) & (c <= critupper)] 

3440 delta = size - c.size 

3441 

3442 return SigmaclipResult(c, critlower, critupper) 

3443 

3444 

3445def trimboth(a, proportiontocut, axis=0): 

3446 """Slice off a proportion of items from both ends of an array. 

3447 

3448 Slice off the passed proportion of items from both ends of the passed 

3449 array (i.e., with `proportiontocut` = 0.1, slices leftmost 10% **and** 

3450 rightmost 10% of scores). The trimmed values are the lowest and 

3451 highest ones. 

3452 Slice off less if proportion results in a non-integer slice index (i.e. 

3453 conservatively slices off `proportiontocut`). 

3454 

3455 Parameters 

3456 ---------- 

3457 a : array_like 

3458 Data to trim. 

3459 proportiontocut : float 

3460 Proportion (in range 0-1) of total data set to trim of each end. 

3461 axis : int or None, optional 

3462 Axis along which to trim data. Default is 0. If None, compute over 

3463 the whole array `a`. 

3464 

3465 Returns 

3466 ------- 

3467 out : ndarray 

3468 Trimmed version of array `a`. The order of the trimmed content 

3469 is undefined. 

3470 

3471 See Also 

3472 -------- 

3473 trim_mean 

3474 

3475 Examples 

3476 -------- 

3477 Create an array of 10 values and trim 10% of those values from each end: 

3478 

3479 >>> import numpy as np 

3480 >>> from scipy import stats 

3481 >>> a = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 

3482 >>> stats.trimboth(a, 0.1) 

3483 array([1, 3, 2, 4, 5, 6, 7, 8]) 

3484 

3485 Note that the elements of the input array are trimmed by value, but the 

3486 output array is not necessarily sorted. 

3487 

3488 The proportion to trim is rounded down to the nearest integer. For 

3489 instance, trimming 25% of the values from each end of an array of 10 

3490 values will return an array of 6 values: 

3491 

3492 >>> b = np.arange(10) 

3493 >>> stats.trimboth(b, 1/4).shape 

3494 (6,) 

3495 

3496 Multidimensional arrays can be trimmed along any axis or across the entire 

3497 array: 

3498 

3499 >>> c = [2, 4, 6, 8, 0, 1, 3, 5, 7, 9] 

3500 >>> d = np.array([a, b, c]) 

3501 >>> stats.trimboth(d, 0.4, axis=0).shape 

3502 (1, 10) 

3503 >>> stats.trimboth(d, 0.4, axis=1).shape 

3504 (3, 2) 

3505 >>> stats.trimboth(d, 0.4, axis=None).shape 

3506 (6,) 

3507 

3508 """ 

3509 a = np.asarray(a) 

3510 

3511 if a.size == 0: 

3512 return a 

3513 

3514 if axis is None: 

3515 a = a.ravel() 

3516 axis = 0 

3517 

3518 nobs = a.shape[axis] 

3519 lowercut = int(proportiontocut * nobs) 

3520 uppercut = nobs - lowercut 

3521 if (lowercut >= uppercut): 

3522 raise ValueError("Proportion too big.") 

3523 

3524 atmp = np.partition(a, (lowercut, uppercut - 1), axis) 

3525 

3526 sl = [slice(None)] * atmp.ndim 

3527 sl[axis] = slice(lowercut, uppercut) 

3528 return atmp[tuple(sl)] 

3529 

3530 

3531def trim1(a, proportiontocut, tail='right', axis=0): 

3532 """Slice off a proportion from ONE end of the passed array distribution. 

3533 

3534 If `proportiontocut` = 0.1, slices off 'leftmost' or 'rightmost' 

3535 10% of scores. The lowest or highest values are trimmed (depending on 

3536 the tail). 

3537 Slice off less if proportion results in a non-integer slice index 

3538 (i.e. conservatively slices off `proportiontocut` ). 

3539 

3540 Parameters 

3541 ---------- 

3542 a : array_like 

3543 Input array. 

3544 proportiontocut : float 

3545 Fraction to cut off of 'left' or 'right' of distribution. 

3546 tail : {'left', 'right'}, optional 

3547 Defaults to 'right'. 

3548 axis : int or None, optional 

3549 Axis along which to trim data. Default is 0. If None, compute over 

3550 the whole array `a`. 

3551 

3552 Returns 

3553 ------- 

3554 trim1 : ndarray 

3555 Trimmed version of array `a`. The order of the trimmed content is 

3556 undefined. 

3557 

3558 Examples 

3559 -------- 

3560 Create an array of 10 values and trim 20% of its lowest values: 

3561 

3562 >>> import numpy as np 

3563 >>> from scipy import stats 

3564 >>> a = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 

3565 >>> stats.trim1(a, 0.2, 'left') 

3566 array([2, 4, 3, 5, 6, 7, 8, 9]) 

3567 

3568 Note that the elements of the input array are trimmed by value, but the 

3569 output array is not necessarily sorted. 

3570 

3571 The proportion to trim is rounded down to the nearest integer. For 

3572 instance, trimming 25% of the values from an array of 10 values will 

3573 return an array of 8 values: 

3574 

3575 >>> b = np.arange(10) 

3576 >>> stats.trim1(b, 1/4).shape 

3577 (8,) 

3578 

3579 Multidimensional arrays can be trimmed along any axis or across the entire 

3580 array: 

3581 

3582 >>> c = [2, 4, 6, 8, 0, 1, 3, 5, 7, 9] 

3583 >>> d = np.array([a, b, c]) 

3584 >>> stats.trim1(d, 0.8, axis=0).shape 

3585 (1, 10) 

3586 >>> stats.trim1(d, 0.8, axis=1).shape 

3587 (3, 2) 

3588 >>> stats.trim1(d, 0.8, axis=None).shape 

3589 (6,) 

3590 

3591 """ 

3592 a = np.asarray(a) 

3593 if axis is None: 

3594 a = a.ravel() 

3595 axis = 0 

3596 

3597 nobs = a.shape[axis] 

3598 

3599 # avoid possible corner case 

3600 if proportiontocut >= 1: 

3601 return [] 

3602 

3603 if tail.lower() == 'right': 

3604 lowercut = 0 

3605 uppercut = nobs - int(proportiontocut * nobs) 

3606 

3607 elif tail.lower() == 'left': 

3608 lowercut = int(proportiontocut * nobs) 

3609 uppercut = nobs 

3610 

3611 atmp = np.partition(a, (lowercut, uppercut - 1), axis) 

3612 

3613 sl = [slice(None)] * atmp.ndim 

3614 sl[axis] = slice(lowercut, uppercut) 

3615 return atmp[tuple(sl)] 

3616 

3617 

3618def trim_mean(a, proportiontocut, axis=0): 

3619 """Return mean of array after trimming distribution from both tails. 

3620 

3621 If `proportiontocut` = 0.1, slices off 'leftmost' and 'rightmost' 10% of 

3622 scores. The input is sorted before slicing. Slices off less if proportion 

3623 results in a non-integer slice index (i.e., conservatively slices off 

3624 `proportiontocut` ). 

3625 

3626 Parameters 

3627 ---------- 

3628 a : array_like 

3629 Input array. 

3630 proportiontocut : float 

3631 Fraction to cut off of both tails of the distribution. 

3632 axis : int or None, optional 

3633 Axis along which the trimmed means are computed. Default is 0. 

3634 If None, compute over the whole array `a`. 

3635 

3636 Returns 

3637 ------- 

3638 trim_mean : ndarray 

3639 Mean of trimmed array. 

3640 

3641 See Also 

3642 -------- 

3643 trimboth 

3644 tmean : Compute the trimmed mean ignoring values outside given `limits`. 

3645 

3646 Examples 

3647 -------- 

3648 >>> import numpy as np 

3649 >>> from scipy import stats 

3650 >>> x = np.arange(20) 

3651 >>> stats.trim_mean(x, 0.1) 

3652 9.5 

3653 >>> x2 = x.reshape(5, 4) 

3654 >>> x2 

3655 array([[ 0, 1, 2, 3], 

3656 [ 4, 5, 6, 7], 

3657 [ 8, 9, 10, 11], 

3658 [12, 13, 14, 15], 

3659 [16, 17, 18, 19]]) 

3660 >>> stats.trim_mean(x2, 0.25) 

3661 array([ 8., 9., 10., 11.]) 

3662 >>> stats.trim_mean(x2, 0.25, axis=1) 

3663 array([ 1.5, 5.5, 9.5, 13.5, 17.5]) 

3664 

3665 """ 

3666 a = np.asarray(a) 

3667 

3668 if a.size == 0: 

3669 return np.nan 

3670 

3671 if axis is None: 

3672 a = a.ravel() 

3673 axis = 0 

3674 

3675 nobs = a.shape[axis] 

3676 lowercut = int(proportiontocut * nobs) 

3677 uppercut = nobs - lowercut 

3678 if (lowercut > uppercut): 

3679 raise ValueError("Proportion too big.") 

3680 

3681 atmp = np.partition(a, (lowercut, uppercut - 1), axis) 

3682 

3683 sl = [slice(None)] * atmp.ndim 

3684 sl[axis] = slice(lowercut, uppercut) 

3685 return np.mean(atmp[tuple(sl)], axis=axis) 

3686 

3687 

3688F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue')) 

3689 

3690 

3691def _create_f_oneway_nan_result(shape, axis): 

3692 """ 

3693 This is a helper function for f_oneway for creating the return values 

3694 in certain degenerate conditions. It creates return values that are 

3695 all nan with the appropriate shape for the given `shape` and `axis`. 

3696 """ 

3697 axis = np.core.multiarray.normalize_axis_index(axis, len(shape)) 

3698 shp = shape[:axis] + shape[axis+1:] 

3699 if shp == (): 

3700 f = np.nan 

3701 prob = np.nan 

3702 else: 

3703 f = np.full(shp, fill_value=np.nan) 

3704 prob = f.copy() 

3705 return F_onewayResult(f, prob) 

3706 

3707 

3708def _first(arr, axis): 

3709 """Return arr[..., 0:1, ...] where 0:1 is in the `axis` position.""" 

3710 return np.take_along_axis(arr, np.array(0, ndmin=arr.ndim), axis) 

3711 

3712 

3713def f_oneway(*samples, axis=0): 

3714 """Perform one-way ANOVA. 

3715 

3716 The one-way ANOVA tests the null hypothesis that two or more groups have 

3717 the same population mean. The test is applied to samples from two or 

3718 more groups, possibly with differing sizes. 

3719 

3720 Parameters 

3721 ---------- 

3722 sample1, sample2, ... : array_like 

3723 The sample measurements for each group. There must be at least 

3724 two arguments. If the arrays are multidimensional, then all the 

3725 dimensions of the array must be the same except for `axis`. 

3726 axis : int, optional 

3727 Axis of the input arrays along which the test is applied. 

3728 Default is 0. 

3729 

3730 Returns 

3731 ------- 

3732 statistic : float 

3733 The computed F statistic of the test. 

3734 pvalue : float 

3735 The associated p-value from the F distribution. 

3736 

3737 Warns 

3738 ----- 

3739 `~scipy.stats.ConstantInputWarning` 

3740 Raised if all values within each of the input arrays are identical. 

3741 In this case the F statistic is either infinite or isn't defined, 

3742 so ``np.inf`` or ``np.nan`` is returned. 

3743 

3744 `~scipy.stats.DegenerateDataWarning` 

3745 Raised if the length of any input array is 0, or if all the input 

3746 arrays have length 1. ``np.nan`` is returned for the F statistic 

3747 and the p-value in these cases. 

3748 

3749 Notes 

3750 ----- 

3751 The ANOVA test has important assumptions that must be satisfied in order 

3752 for the associated p-value to be valid. 

3753 

3754 1. The samples are independent. 

3755 2. Each sample is from a normally distributed population. 

3756 3. The population standard deviations of the groups are all equal. This 

3757 property is known as homoscedasticity. 

3758 

3759 If these assumptions are not true for a given set of data, it may still 

3760 be possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`) or 

3761 the Alexander-Govern test (`scipy.stats.alexandergovern`) although with 

3762 some loss of power. 

3763 

3764 The length of each group must be at least one, and there must be at 

3765 least one group with length greater than one. If these conditions 

3766 are not satisfied, a warning is generated and (``np.nan``, ``np.nan``) 

3767 is returned. 

3768 

3769 If all values in each group are identical, and there exist at least two 

3770 groups with different values, the function generates a warning and 

3771 returns (``np.inf``, 0). 

3772 

3773 If all values in all groups are the same, function generates a warning 

3774 and returns (``np.nan``, ``np.nan``). 

3775 

3776 The algorithm is from Heiman [2]_, pp.394-7. 

3777 

3778 References 

3779 ---------- 

3780 .. [1] R. Lowry, "Concepts and Applications of Inferential Statistics", 

3781 Chapter 14, 2014, http://vassarstats.net/textbook/ 

3782 

3783 .. [2] G.W. Heiman, "Understanding research methods and statistics: An 

3784 integrated introduction for psychology", Houghton, Mifflin and 

3785 Company, 2001. 

3786 

3787 .. [3] G.H. McDonald, "Handbook of Biological Statistics", One-way ANOVA. 

3788 http://www.biostathandbook.com/onewayanova.html 

3789 

3790 Examples 

3791 -------- 

3792 >>> import numpy as np 

3793 >>> from scipy.stats import f_oneway 

3794 

3795 Here are some data [3]_ on a shell measurement (the length of the anterior 

3796 adductor muscle scar, standardized by dividing by length) in the mussel 

3797 Mytilus trossulus from five locations: Tillamook, Oregon; Newport, Oregon; 

3798 Petersburg, Alaska; Magadan, Russia; and Tvarminne, Finland, taken from a 

3799 much larger data set used in McDonald et al. (1991). 

3800 

3801 >>> tillamook = [0.0571, 0.0813, 0.0831, 0.0976, 0.0817, 0.0859, 0.0735, 

3802 ... 0.0659, 0.0923, 0.0836] 

3803 >>> newport = [0.0873, 0.0662, 0.0672, 0.0819, 0.0749, 0.0649, 0.0835, 

3804 ... 0.0725] 

3805 >>> petersburg = [0.0974, 0.1352, 0.0817, 0.1016, 0.0968, 0.1064, 0.105] 

3806 >>> magadan = [0.1033, 0.0915, 0.0781, 0.0685, 0.0677, 0.0697, 0.0764, 

3807 ... 0.0689] 

3808 >>> tvarminne = [0.0703, 0.1026, 0.0956, 0.0973, 0.1039, 0.1045] 

3809 >>> f_oneway(tillamook, newport, petersburg, magadan, tvarminne) 

3810 F_onewayResult(statistic=7.121019471642447, pvalue=0.0002812242314534544) 

3811 

3812 `f_oneway` accepts multidimensional input arrays. When the inputs 

3813 are multidimensional and `axis` is not given, the test is performed 

3814 along the first axis of the input arrays. For the following data, the 

3815 test is performed three times, once for each column. 

3816 

3817 >>> a = np.array([[9.87, 9.03, 6.81], 

3818 ... [7.18, 8.35, 7.00], 

3819 ... [8.39, 7.58, 7.68], 

3820 ... [7.45, 6.33, 9.35], 

3821 ... [6.41, 7.10, 9.33], 

3822 ... [8.00, 8.24, 8.44]]) 

3823 >>> b = np.array([[6.35, 7.30, 7.16], 

3824 ... [6.65, 6.68, 7.63], 

3825 ... [5.72, 7.73, 6.72], 

3826 ... [7.01, 9.19, 7.41], 

3827 ... [7.75, 7.87, 8.30], 

3828 ... [6.90, 7.97, 6.97]]) 

3829 >>> c = np.array([[3.31, 8.77, 1.01], 

3830 ... [8.25, 3.24, 3.62], 

3831 ... [6.32, 8.81, 5.19], 

3832 ... [7.48, 8.83, 8.91], 

3833 ... [8.59, 6.01, 6.07], 

3834 ... [3.07, 9.72, 7.48]]) 

3835 >>> F, p = f_oneway(a, b, c) 

3836 >>> F 

3837 array([1.75676344, 0.03701228, 3.76439349]) 

3838 >>> p 

3839 array([0.20630784, 0.96375203, 0.04733157]) 

3840 

3841 """ 

3842 if len(samples) < 2: 

3843 raise TypeError('at least two inputs are required;' 

3844 f' got {len(samples)}.') 

3845 

3846 samples = [np.asarray(sample, dtype=float) for sample in samples] 

3847 

3848 # ANOVA on N groups, each in its own array 

3849 num_groups = len(samples) 

3850 

3851 # We haven't explicitly validated axis, but if it is bad, this call of 

3852 # np.concatenate will raise np.AxisError. The call will raise ValueError 

3853 # if the dimensions of all the arrays, except the axis dimension, are not 

3854 # the same. 

3855 alldata = np.concatenate(samples, axis=axis) 

3856 bign = alldata.shape[axis] 

3857 

3858 # Check this after forming alldata, so shape errors are detected 

3859 # and reported before checking for 0 length inputs. 

3860 if any(sample.shape[axis] == 0 for sample in samples): 

3861 warnings.warn(stats.DegenerateDataWarning('at least one input ' 

3862 'has length 0')) 

3863 return _create_f_oneway_nan_result(alldata.shape, axis) 

3864 

3865 # Must have at least one group with length greater than 1. 

3866 if all(sample.shape[axis] == 1 for sample in samples): 

3867 msg = ('all input arrays have length 1. f_oneway requires that at ' 

3868 'least one input has length greater than 1.') 

3869 warnings.warn(stats.DegenerateDataWarning(msg)) 

3870 return _create_f_oneway_nan_result(alldata.shape, axis) 

3871 

3872 # Check if all values within each group are identical, and if the common 

3873 # value in at least one group is different from that in another group. 

3874 # Based on https://github.com/scipy/scipy/issues/11669 

3875 

3876 # If axis=0, say, and the groups have shape (n0, ...), (n1, ...), ..., 

3877 # then is_const is a boolean array with shape (num_groups, ...). 

3878 # It is True if the values within the groups along the axis slice are 

3879 # identical. In the typical case where each input array is 1-d, is_const is 

3880 # a 1-d array with length num_groups. 

3881 is_const = np.concatenate( 

3882 [(_first(sample, axis) == sample).all(axis=axis, 

3883 keepdims=True) 

3884 for sample in samples], 

3885 axis=axis 

3886 ) 

3887 

3888 # all_const is a boolean array with shape (...) (see previous comment). 

3889 # It is True if the values within each group along the axis slice are 

3890 # the same (e.g. [[3, 3, 3], [5, 5, 5, 5], [4, 4, 4]]). 

3891 all_const = is_const.all(axis=axis) 

3892 if all_const.any(): 

3893 msg = ("Each of the input arrays is constant;" 

3894 "the F statistic is not defined or infinite") 

3895 warnings.warn(stats.ConstantInputWarning(msg)) 

3896 

3897 # all_same_const is True if all the values in the groups along the axis=0 

3898 # slice are the same (e.g. [[3, 3, 3], [3, 3, 3, 3], [3, 3, 3]]). 

3899 all_same_const = (_first(alldata, axis) == alldata).all(axis=axis) 

3900 

3901 # Determine the mean of the data, and subtract that from all inputs to a 

3902 # variance (via sum_of_sq / sq_of_sum) calculation. Variance is invariant 

3903 # to a shift in location, and centering all data around zero vastly 

3904 # improves numerical stability. 

3905 offset = alldata.mean(axis=axis, keepdims=True) 

3906 alldata -= offset 

3907 

3908 normalized_ss = _square_of_sums(alldata, axis=axis) / bign 

3909 

3910 sstot = _sum_of_squares(alldata, axis=axis) - normalized_ss 

3911 

3912 ssbn = 0 

3913 for sample in samples: 

3914 ssbn += _square_of_sums(sample - offset, 

3915 axis=axis) / sample.shape[axis] 

3916 

3917 # Naming: variables ending in bn/b are for "between treatments", wn/w are 

3918 # for "within treatments" 

3919 ssbn -= normalized_ss 

3920 sswn = sstot - ssbn 

3921 dfbn = num_groups - 1 

3922 dfwn = bign - num_groups 

3923 msb = ssbn / dfbn 

3924 msw = sswn / dfwn 

3925 with np.errstate(divide='ignore', invalid='ignore'): 

3926 f = msb / msw 

3927 

3928 prob = special.fdtrc(dfbn, dfwn, f) # equivalent to stats.f.sf 

3929 

3930 # Fix any f values that should be inf or nan because the corresponding 

3931 # inputs were constant. 

3932 if np.isscalar(f): 

3933 if all_same_const: 

3934 f = np.nan 

3935 prob = np.nan 

3936 elif all_const: 

3937 f = np.inf 

3938 prob = 0.0 

3939 else: 

3940 f[all_const] = np.inf 

3941 prob[all_const] = 0.0 

3942 f[all_same_const] = np.nan 

3943 prob[all_same_const] = np.nan 

3944 

3945 return F_onewayResult(f, prob) 

3946 

3947 

3948def alexandergovern(*samples, nan_policy='propagate'): 

3949 """Performs the Alexander Govern test. 

3950 

3951 The Alexander-Govern approximation tests the equality of k independent 

3952 means in the face of heterogeneity of variance. The test is applied to 

3953 samples from two or more groups, possibly with differing sizes. 

3954 

3955 Parameters 

3956 ---------- 

3957 sample1, sample2, ... : array_like 

3958 The sample measurements for each group. There must be at least 

3959 two samples. 

3960 nan_policy : {'propagate', 'raise', 'omit'}, optional 

3961 Defines how to handle when input contains nan. 

3962 The following options are available (default is 'propagate'): 

3963 

3964 * 'propagate': returns nan 

3965 * 'raise': throws an error 

3966 * 'omit': performs the calculations ignoring nan values 

3967 

3968 Returns 

3969 ------- 

3970 statistic : float 

3971 The computed A statistic of the test. 

3972 pvalue : float 

3973 The associated p-value from the chi-squared distribution. 

3974 

3975 Warns 

3976 ----- 

3977 `~scipy.stats.ConstantInputWarning` 

3978 Raised if an input is a constant array. The statistic is not defined 

3979 in this case, so ``np.nan`` is returned. 

3980 

3981 See Also 

3982 -------- 

3983 f_oneway : one-way ANOVA 

3984 

3985 Notes 

3986 ----- 

3987 The use of this test relies on several assumptions. 

3988 

3989 1. The samples are independent. 

3990 2. Each sample is from a normally distributed population. 

3991 3. Unlike `f_oneway`, this test does not assume on homoscedasticity, 

3992 instead relaxing the assumption of equal variances. 

3993 

3994 Input samples must be finite, one dimensional, and with size greater than 

3995 one. 

3996 

3997 References 

3998 ---------- 

3999 .. [1] Alexander, Ralph A., and Diane M. Govern. "A New and Simpler 

4000 Approximation for ANOVA under Variance Heterogeneity." Journal 

4001 of Educational Statistics, vol. 19, no. 2, 1994, pp. 91-101. 

4002 JSTOR, www.jstor.org/stable/1165140. Accessed 12 Sept. 2020. 

4003 

4004 Examples 

4005 -------- 

4006 >>> from scipy.stats import alexandergovern 

4007 

4008 Here are some data on annual percentage rate of interest charged on 

4009 new car loans at nine of the largest banks in four American cities 

4010 taken from the National Institute of Standards and Technology's 

4011 ANOVA dataset. 

4012 

4013 We use `alexandergovern` to test the null hypothesis that all cities 

4014 have the same mean APR against the alternative that the cities do not 

4015 all have the same mean APR. We decide that a significance level of 5% 

4016 is required to reject the null hypothesis in favor of the alternative. 

4017 

4018 >>> atlanta = [13.75, 13.75, 13.5, 13.5, 13.0, 13.0, 13.0, 12.75, 12.5] 

4019 >>> chicago = [14.25, 13.0, 12.75, 12.5, 12.5, 12.4, 12.3, 11.9, 11.9] 

4020 >>> houston = [14.0, 14.0, 13.51, 13.5, 13.5, 13.25, 13.0, 12.5, 12.5] 

4021 >>> memphis = [15.0, 14.0, 13.75, 13.59, 13.25, 12.97, 12.5, 12.25, 

4022 ... 11.89] 

4023 >>> alexandergovern(atlanta, chicago, houston, memphis) 

4024 AlexanderGovernResult(statistic=4.65087071883494, 

4025 pvalue=0.19922132490385214) 

4026 

4027 The p-value is 0.1992, indicating a nearly 20% chance of observing 

4028 such an extreme value of the test statistic under the null hypothesis. 

4029 This exceeds 5%, so we do not reject the null hypothesis in favor of 

4030 the alternative. 

4031 

4032 """ 

4033 samples = _alexandergovern_input_validation(samples, nan_policy) 

4034 

4035 if np.any([(sample == sample[0]).all() for sample in samples]): 

4036 msg = "An input array is constant; the statistic is not defined." 

4037 warnings.warn(stats.ConstantInputWarning(msg)) 

4038 return AlexanderGovernResult(np.nan, np.nan) 

4039 

4040 # The following formula numbers reference the equation described on 

4041 # page 92 by Alexander, Govern. Formulas 5, 6, and 7 describe other 

4042 # tests that serve as the basis for equation (8) but are not needed 

4043 # to perform the test. 

4044 

4045 # precalculate mean and length of each sample 

4046 lengths = np.array([ma.count(sample) if nan_policy == 'omit' 

4047 else len(sample) for sample in samples]) 

4048 means = np.array([np.mean(sample) for sample in samples]) 

4049 

4050 # (1) determine standard error of the mean for each sample 

4051 standard_errors = [np.std(sample, ddof=1) / np.sqrt(length) 

4052 for sample, length in zip(samples, lengths)] 

4053 

4054 # (2) define a weight for each sample 

4055 inv_sq_se = 1 / np.square(standard_errors) 

4056 weights = inv_sq_se / np.sum(inv_sq_se) 

4057 

4058 # (3) determine variance-weighted estimate of the common mean 

4059 var_w = np.sum(weights * means) 

4060 

4061 # (4) determine one-sample t statistic for each group 

4062 t_stats = (means - var_w)/standard_errors 

4063 

4064 # calculate parameters to be used in transformation 

4065 v = lengths - 1 

4066 a = v - .5 

4067 b = 48 * a**2 

4068 c = (a * np.log(1 + (t_stats ** 2)/v))**.5 

4069 

4070 # (8) perform a normalizing transformation on t statistic 

4071 z = (c + ((c**3 + 3*c)/b) - 

4072 ((4*c**7 + 33*c**5 + 240*c**3 + 855*c) / 

4073 (b**2*10 + 8*b*c**4 + 1000*b))) 

4074 

4075 # (9) calculate statistic 

4076 A = np.sum(np.square(z)) 

4077 

4078 # "[the p value is determined from] central chi-square random deviates 

4079 # with k - 1 degrees of freedom". Alexander, Govern (94) 

4080 p = distributions.chi2.sf(A, len(samples) - 1) 

4081 return AlexanderGovernResult(A, p) 

4082 

4083 

4084def _alexandergovern_input_validation(samples, nan_policy): 

4085 if len(samples) < 2: 

4086 raise TypeError(f"2 or more inputs required, got {len(samples)}") 

4087 

4088 # input arrays are flattened 

4089 samples = [np.asarray(sample, dtype=float) for sample in samples] 

4090 

4091 for i, sample in enumerate(samples): 

4092 if np.size(sample) <= 1: 

4093 raise ValueError("Input sample size must be greater than one.") 

4094 if sample.ndim != 1: 

4095 raise ValueError("Input samples must be one-dimensional") 

4096 if np.isinf(sample).any(): 

4097 raise ValueError("Input samples must be finite.") 

4098 

4099 contains_nan, nan_policy = _contains_nan(sample, 

4100 nan_policy=nan_policy) 

4101 if contains_nan and nan_policy == 'omit': 

4102 samples[i] = ma.masked_invalid(sample) 

4103 return samples 

4104 

4105 

4106AlexanderGovernResult = make_dataclass("AlexanderGovernResult", ("statistic", 

4107 "pvalue")) 

4108 

4109 

4110def _pearsonr_fisher_ci(r, n, confidence_level, alternative): 

4111 """ 

4112 Compute the confidence interval for Pearson's R. 

4113 

4114 Fisher's transformation is used to compute the confidence interval 

4115 (https://en.wikipedia.org/wiki/Fisher_transformation). 

4116 """ 

4117 if r == 1: 

4118 zr = np.inf 

4119 elif r == -1: 

4120 zr = -np.inf 

4121 else: 

4122 zr = np.arctanh(r) 

4123 

4124 if n > 3: 

4125 se = np.sqrt(1 / (n - 3)) 

4126 if alternative == "two-sided": 

4127 h = special.ndtri(0.5 + confidence_level/2) 

4128 zlo = zr - h*se 

4129 zhi = zr + h*se 

4130 rlo = np.tanh(zlo) 

4131 rhi = np.tanh(zhi) 

4132 elif alternative == "less": 

4133 h = special.ndtri(confidence_level) 

4134 zhi = zr + h*se 

4135 rhi = np.tanh(zhi) 

4136 rlo = -1.0 

4137 else: 

4138 # alternative == "greater": 

4139 h = special.ndtri(confidence_level) 

4140 zlo = zr - h*se 

4141 rlo = np.tanh(zlo) 

4142 rhi = 1.0 

4143 else: 

4144 rlo, rhi = -1.0, 1.0 

4145 

4146 return ConfidenceInterval(low=rlo, high=rhi) 

4147 

4148 

4149ConfidenceInterval = namedtuple('ConfidenceInterval', ['low', 'high']) 

4150 

4151PearsonRResultBase = _make_tuple_bunch('PearsonRResultBase', 

4152 ['statistic', 'pvalue'], []) 

4153 

4154 

4155class PearsonRResult(PearsonRResultBase): 

4156 """ 

4157 Result of `scipy.stats.pearsonr` 

4158 

4159 Attributes 

4160 ---------- 

4161 statistic : float 

4162 Pearson product-moment correlation coefficient. 

4163 pvalue : float 

4164 The p-value associated with the chosen alternative. 

4165 

4166 Methods 

4167 ------- 

4168 confidence_interval 

4169 Computes the confidence interval of the correlation 

4170 coefficient `statistic` for the given confidence level. 

4171 

4172 """ 

4173 def __init__(self, statistic, pvalue, alternative, n): 

4174 super().__init__(statistic, pvalue) 

4175 self._alternative = alternative 

4176 self._n = n 

4177 

4178 # add alias for consistency with other correlation functions 

4179 self.correlation = statistic 

4180 

4181 def confidence_interval(self, confidence_level=0.95): 

4182 """ 

4183 The confidence interval for the correlation coefficient. 

4184 

4185 Compute the confidence interval for the correlation coefficient 

4186 ``statistic`` with the given confidence level. 

4187 

4188 The confidence interval is computed using the Fisher transformation 

4189 F(r) = arctanh(r) [1]_. When the sample pairs are drawn from a 

4190 bivariate normal distribution, F(r) approximately follows a normal 

4191 distribution with standard error ``1/sqrt(n - 3)``, where ``n`` is the 

4192 length of the original samples along the calculation axis. When 

4193 ``n <= 3``, this approximation does not yield a finite, real standard 

4194 error, so we define the confidence interval to be -1 to 1. 

4195 

4196 Parameters 

4197 ---------- 

4198 confidence_level : float 

4199 The confidence level for the calculation of the correlation 

4200 coefficient confidence interval. Default is 0.95. 

4201 

4202 Returns 

4203 ------- 

4204 ci : namedtuple 

4205 The confidence interval is returned in a ``namedtuple`` with 

4206 fields `low` and `high`. 

4207 

4208 References 

4209 ---------- 

4210 .. [1] "Pearson correlation coefficient", Wikipedia, 

4211 https://en.wikipedia.org/wiki/Pearson_correlation_coefficient 

4212 """ 

4213 return _pearsonr_fisher_ci(self.statistic, self._n, confidence_level, 

4214 self._alternative) 

4215 

4216 

4217def pearsonr(x, y, *, alternative='two-sided'): 

4218 r""" 

4219 Pearson correlation coefficient and p-value for testing non-correlation. 

4220 

4221 The Pearson correlation coefficient [1]_ measures the linear relationship 

4222 between two datasets. Like other correlation 

4223 coefficients, this one varies between -1 and +1 with 0 implying no 

4224 correlation. Correlations of -1 or +1 imply an exact linear relationship. 

4225 Positive correlations imply that as x increases, so does y. Negative 

4226 correlations imply that as x increases, y decreases. 

4227 

4228 This function also performs a test of the null hypothesis that the 

4229 distributions underlying the samples are uncorrelated and normally 

4230 distributed. (See Kowalski [3]_ 

4231 for a discussion of the effects of non-normality of the input on the 

4232 distribution of the correlation coefficient.) 

4233 The p-value roughly indicates the probability of an uncorrelated system 

4234 producing datasets that have a Pearson correlation at least as extreme 

4235 as the one computed from these datasets. 

4236 

4237 Parameters 

4238 ---------- 

4239 x : (N,) array_like 

4240 Input array. 

4241 y : (N,) array_like 

4242 Input array. 

4243 alternative : {'two-sided', 'greater', 'less'}, optional 

4244 Defines the alternative hypothesis. Default is 'two-sided'. 

4245 The following options are available: 

4246 

4247 * 'two-sided': the correlation is nonzero 

4248 * 'less': the correlation is negative (less than zero) 

4249 * 'greater': the correlation is positive (greater than zero) 

4250 

4251 .. versionadded:: 1.9.0 

4252 

4253 Returns 

4254 ------- 

4255 result : `~scipy.stats._result_classes.PearsonRResult` 

4256 An object with the following attributes: 

4257 

4258 statistic : float 

4259 Pearson product-moment correlation coefficient. 

4260 pvalue : float 

4261 The p-value associated with the chosen alternative. 

4262 

4263 The object has the following method: 

4264 

4265 confidence_interval(confidence_level=0.95) 

4266 This method computes the confidence interval of the correlation 

4267 coefficient `statistic` for the given confidence level. 

4268 The confidence interval is returned in a ``namedtuple`` with 

4269 fields `low` and `high`. See the Notes for more details. 

4270 

4271 Warns 

4272 ----- 

4273 `~scipy.stats.ConstantInputWarning` 

4274 Raised if an input is a constant array. The correlation coefficient 

4275 is not defined in this case, so ``np.nan`` is returned. 

4276 

4277 `~scipy.stats.NearConstantInputWarning` 

4278 Raised if an input is "nearly" constant. The array ``x`` is considered 

4279 nearly constant if ``norm(x - mean(x)) < 1e-13 * abs(mean(x))``. 

4280 Numerical errors in the calculation ``x - mean(x)`` in this case might 

4281 result in an inaccurate calculation of r. 

4282 

4283 See Also 

4284 -------- 

4285 spearmanr : Spearman rank-order correlation coefficient. 

4286 kendalltau : Kendall's tau, a correlation measure for ordinal data. 

4287 

4288 Notes 

4289 ----- 

4290 The correlation coefficient is calculated as follows: 

4291 

4292 .. math:: 

4293 

4294 r = \frac{\sum (x - m_x) (y - m_y)} 

4295 {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}} 

4296 

4297 where :math:`m_x` is the mean of the vector x and :math:`m_y` is 

4298 the mean of the vector y. 

4299 

4300 Under the assumption that x and y are drawn from 

4301 independent normal distributions (so the population correlation coefficient 

4302 is 0), the probability density function of the sample correlation 

4303 coefficient r is ([1]_, [2]_): 

4304 

4305 .. math:: 

4306 f(r) = \frac{{(1-r^2)}^{n/2-2}}{\mathrm{B}(\frac{1}{2},\frac{n}{2}-1)} 

4307 

4308 where n is the number of samples, and B is the beta function. This 

4309 is sometimes referred to as the exact distribution of r. This is 

4310 the distribution that is used in `pearsonr` to compute the p-value. 

4311 The distribution is a beta distribution on the interval [-1, 1], 

4312 with equal shape parameters a = b = n/2 - 1. In terms of SciPy's 

4313 implementation of the beta distribution, the distribution of r is:: 

4314 

4315 dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2) 

4316 

4317 The default p-value returned by `pearsonr` is a two-sided p-value. For a 

4318 given sample with correlation coefficient r, the p-value is 

4319 the probability that abs(r') of a random sample x' and y' drawn from 

4320 the population with zero correlation would be greater than or equal 

4321 to abs(r). In terms of the object ``dist`` shown above, the p-value 

4322 for a given r and length n can be computed as:: 

4323 

4324 p = 2*dist.cdf(-abs(r)) 

4325 

4326 When n is 2, the above continuous distribution is not well-defined. 

4327 One can interpret the limit of the beta distribution as the shape 

4328 parameters a and b approach a = b = 0 as a discrete distribution with 

4329 equal probability masses at r = 1 and r = -1. More directly, one 

4330 can observe that, given the data x = [x1, x2] and y = [y1, y2], and 

4331 assuming x1 != x2 and y1 != y2, the only possible values for r are 1 

4332 and -1. Because abs(r') for any sample x' and y' with length 2 will 

4333 be 1, the two-sided p-value for a sample of length 2 is always 1. 

4334 

4335 For backwards compatibility, the object that is returned also behaves 

4336 like a tuple of length two that holds the statistic and the p-value. 

4337 

4338 References 

4339 ---------- 

4340 .. [1] "Pearson correlation coefficient", Wikipedia, 

4341 https://en.wikipedia.org/wiki/Pearson_correlation_coefficient 

4342 .. [2] Student, "Probable error of a correlation coefficient", 

4343 Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310. 

4344 .. [3] C. J. Kowalski, "On the Effects of Non-Normality on the Distribution 

4345 of the Sample Product-Moment Correlation Coefficient" 

4346 Journal of the Royal Statistical Society. Series C (Applied 

4347 Statistics), Vol. 21, No. 1 (1972), pp. 1-12. 

4348 

4349 Examples 

4350 -------- 

4351 >>> import numpy as np 

4352 >>> from scipy import stats 

4353 >>> res = stats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4]) 

4354 >>> res 

4355 PearsonRResult(statistic=-0.7426106572325056, pvalue=0.15055580885344558) 

4356 >>> res.confidence_interval() 

4357 ConfidenceInterval(low=-0.9816918044786463, high=0.40501116769030976) 

4358 

4359 There is a linear dependence between x and y if y = a + b*x + e, where 

4360 a,b are constants and e is a random error term, assumed to be independent 

4361 of x. For simplicity, assume that x is standard normal, a=0, b=1 and let 

4362 e follow a normal distribution with mean zero and standard deviation s>0. 

4363 

4364 >>> rng = np.random.default_rng() 

4365 >>> s = 0.5 

4366 >>> x = stats.norm.rvs(size=500, random_state=rng) 

4367 >>> e = stats.norm.rvs(scale=s, size=500, random_state=rng) 

4368 >>> y = x + e 

4369 >>> stats.pearsonr(x, y).statistic 

4370 0.9001942438244763 

4371 

4372 This should be close to the exact value given by 

4373 

4374 >>> 1/np.sqrt(1 + s**2) 

4375 0.8944271909999159 

4376 

4377 For s=0.5, we observe a high level of correlation. In general, a large 

4378 variance of the noise reduces the correlation, while the correlation 

4379 approaches one as the variance of the error goes to zero. 

4380 

4381 It is important to keep in mind that no correlation does not imply 

4382 independence unless (x, y) is jointly normal. Correlation can even be zero 

4383 when there is a very simple dependence structure: if X follows a 

4384 standard normal distribution, let y = abs(x). Note that the correlation 

4385 between x and y is zero. Indeed, since the expectation of x is zero, 

4386 cov(x, y) = E[x*y]. By definition, this equals E[x*abs(x)] which is zero 

4387 by symmetry. The following lines of code illustrate this observation: 

4388 

4389 >>> y = np.abs(x) 

4390 >>> stats.pearsonr(x, y) 

4391 PearsonRResult(statistic=-0.05444919272687482, pvalue=0.22422294836207743) 

4392 

4393 A non-zero correlation coefficient can be misleading. For example, if X has 

4394 a standard normal distribution, define y = x if x < 0 and y = 0 otherwise. 

4395 A simple calculation shows that corr(x, y) = sqrt(2/Pi) = 0.797..., 

4396 implying a high level of correlation: 

4397 

4398 >>> y = np.where(x < 0, x, 0) 

4399 >>> stats.pearsonr(x, y) 

4400 PearsonRResult(statistic=0.861985781588, pvalue=4.813432002751103e-149) 

4401 

4402 This is unintuitive since there is no dependence of x and y if x is larger 

4403 than zero which happens in about half of the cases if we sample x and y. 

4404 

4405 """ 

4406 n = len(x) 

4407 if n != len(y): 

4408 raise ValueError('x and y must have the same length.') 

4409 

4410 if n < 2: 

4411 raise ValueError('x and y must have length at least 2.') 

4412 

4413 x = np.asarray(x) 

4414 y = np.asarray(y) 

4415 

4416 if (np.issubdtype(x.dtype, np.complexfloating) 

4417 or np.issubdtype(y.dtype, np.complexfloating)): 

4418 raise ValueError('This function does not support complex data') 

4419 

4420 # If an input is constant, the correlation coefficient is not defined. 

4421 if (x == x[0]).all() or (y == y[0]).all(): 

4422 msg = ("An input array is constant; the correlation coefficient " 

4423 "is not defined.") 

4424 warnings.warn(stats.ConstantInputWarning(msg)) 

4425 result = PearsonRResult(statistic=np.nan, pvalue=np.nan, n=n, 

4426 alternative=alternative) 

4427 return result 

4428 

4429 # dtype is the data type for the calculations. This expression ensures 

4430 # that the data type is at least 64 bit floating point. It might have 

4431 # more precision if the input is, for example, np.longdouble. 

4432 dtype = type(1.0 + x[0] + y[0]) 

4433 

4434 if n == 2: 

4435 r = dtype(np.sign(x[1] - x[0])*np.sign(y[1] - y[0])) 

4436 result = PearsonRResult(statistic=r, pvalue=1.0, n=n, 

4437 alternative=alternative) 

4438 return result 

4439 

4440 xmean = x.mean(dtype=dtype) 

4441 ymean = y.mean(dtype=dtype) 

4442 

4443 # By using `astype(dtype)`, we ensure that the intermediate calculations 

4444 # use at least 64 bit floating point. 

4445 xm = x.astype(dtype) - xmean 

4446 ym = y.astype(dtype) - ymean 

4447 

4448 # Unlike np.linalg.norm or the expression sqrt((xm*xm).sum()), 

4449 # scipy.linalg.norm(xm) does not overflow if xm is, for example, 

4450 # [-5e210, 5e210, 3e200, -3e200] 

4451 normxm = linalg.norm(xm) 

4452 normym = linalg.norm(ym) 

4453 

4454 threshold = 1e-13 

4455 if normxm < threshold*abs(xmean) or normym < threshold*abs(ymean): 

4456 # If all the values in x (likewise y) are very close to the mean, 

4457 # the loss of precision that occurs in the subtraction xm = x - xmean 

4458 # might result in large errors in r. 

4459 msg = ("An input array is nearly constant; the computed " 

4460 "correlation coefficient may be inaccurate.") 

4461 warnings.warn(stats.NearConstantInputWarning(msg)) 

4462 

4463 r = np.dot(xm/normxm, ym/normym) 

4464 

4465 # Presumably, if abs(r) > 1, then it is only some small artifact of 

4466 # floating point arithmetic. 

4467 r = max(min(r, 1.0), -1.0) 

4468 

4469 # As explained in the docstring, the distribution of `r` under the null 

4470 # hypothesis is the beta distribution on (-1, 1) with a = b = n/2 - 1. 

4471 ab = n/2 - 1 

4472 dist = stats.beta(ab, ab, loc=-1, scale=2) 

4473 if alternative == 'two-sided': 

4474 prob = 2*dist.sf(abs(r)) 

4475 elif alternative == 'less': 

4476 prob = dist.cdf(r) 

4477 elif alternative == 'greater': 

4478 prob = dist.sf(r) 

4479 else: 

4480 raise ValueError('alternative must be one of ' 

4481 '["two-sided", "less", "greater"]') 

4482 

4483 return PearsonRResult(statistic=r, pvalue=prob, n=n, 

4484 alternative=alternative) 

4485 

4486 

4487def fisher_exact(table, alternative='two-sided'): 

4488 """Perform a Fisher exact test on a 2x2 contingency table. 

4489 

4490 The null hypothesis is that the true odds ratio of the populations 

4491 underlying the observations is one, and the observations were sampled 

4492 from these populations under a condition: the marginals of the 

4493 resulting table must equal those of the observed table. The statistic 

4494 returned is the unconditional maximum likelihood estimate of the odds 

4495 ratio, and the p-value is the probability under the null hypothesis of 

4496 obtaining a table at least as extreme as the one that was actually 

4497 observed. There are other possible choices of statistic and two-sided 

4498 p-value definition associated with Fisher's exact test; please see the 

4499 Notes for more information. 

4500 

4501 Parameters 

4502 ---------- 

4503 table : array_like of ints 

4504 A 2x2 contingency table. Elements must be non-negative integers. 

4505 alternative : {'two-sided', 'less', 'greater'}, optional 

4506 Defines the alternative hypothesis. 

4507 The following options are available (default is 'two-sided'): 

4508 

4509 * 'two-sided': the odds ratio of the underlying population is not one 

4510 * 'less': the odds ratio of the underlying population is less than one 

4511 * 'greater': the odds ratio of the underlying population is greater 

4512 than one 

4513 

4514 See the Notes for more details. 

4515 

4516 Returns 

4517 ------- 

4518 res : SignificanceResult 

4519 An object containing attributes: 

4520 

4521 statistic : float 

4522 This is the prior odds ratio, not a posterior estimate. 

4523 pvalue : float 

4524 The probability under the null hypothesis of obtaining a 

4525 table at least as extreme as the one that was actually observed. 

4526 

4527 See Also 

4528 -------- 

4529 chi2_contingency : Chi-square test of independence of variables in a 

4530 contingency table. This can be used as an alternative to 

4531 `fisher_exact` when the numbers in the table are large. 

4532 contingency.odds_ratio : Compute the odds ratio (sample or conditional 

4533 MLE) for a 2x2 contingency table. 

4534 barnard_exact : Barnard's exact test, which is a more powerful alternative 

4535 than Fisher's exact test for 2x2 contingency tables. 

4536 boschloo_exact : Boschloo's exact test, which is a more powerful alternative 

4537 than Fisher's exact test for 2x2 contingency tables. 

4538 

4539 Notes 

4540 ----- 

4541 *Null hypothesis and p-values* 

4542 

4543 The null hypothesis is that the true odds ratio of the populations 

4544 underlying the observations is one, and the observations were sampled at 

4545 random from these populations under a condition: the marginals of the 

4546 resulting table must equal those of the observed table. Equivalently, 

4547 the null hypothesis is that the input table is from the hypergeometric 

4548 distribution with parameters (as used in `hypergeom`) 

4549 ``M = a + b + c + d``, ``n = a + b`` and ``N = a + c``, where the 

4550 input table is ``[[a, b], [c, d]]``. This distribution has support 

4551 ``max(0, N + n - M) <= x <= min(N, n)``, or, in terms of the values 

4552 in the input table, ``min(0, a - d) <= x <= a + min(b, c)``. ``x`` 

4553 can be interpreted as the upper-left element of a 2x2 table, so the 

4554 tables in the distribution have form:: 

4555 

4556 [ x n - x ] 

4557 [N - x M - (n + N) + x] 

4558 

4559 For example, if:: 

4560 

4561 table = [6 2] 

4562 [1 4] 

4563 

4564 then the support is ``2 <= x <= 7``, and the tables in the distribution 

4565 are:: 

4566 

4567 [2 6] [3 5] [4 4] [5 3] [6 2] [7 1] 

4568 [5 0] [4 1] [3 2] [2 3] [1 4] [0 5] 

4569 

4570 The probability of each table is given by the hypergeometric distribution 

4571 ``hypergeom.pmf(x, M, n, N)``. For this example, these are (rounded to 

4572 three significant digits):: 

4573 

4574 x 2 3 4 5 6 7 

4575 p 0.0163 0.163 0.408 0.326 0.0816 0.00466 

4576 

4577 These can be computed with:: 

4578 

4579 >>> import numpy as np 

4580 >>> from scipy.stats import hypergeom 

4581 >>> table = np.array([[6, 2], [1, 4]]) 

4582 >>> M = table.sum() 

4583 >>> n = table[0].sum() 

4584 >>> N = table[:, 0].sum() 

4585 >>> start, end = hypergeom.support(M, n, N) 

4586 >>> hypergeom.pmf(np.arange(start, end+1), M, n, N) 

4587 array([0.01631702, 0.16317016, 0.40792541, 0.32634033, 0.08158508, 

4588 0.004662 ]) 

4589 

4590 The two-sided p-value is the probability that, under the null hypothesis, 

4591 a random table would have a probability equal to or less than the 

4592 probability of the input table. For our example, the probability of 

4593 the input table (where ``x = 6``) is 0.0816. The x values where the 

4594 probability does not exceed this are 2, 6 and 7, so the two-sided p-value 

4595 is ``0.0163 + 0.0816 + 0.00466 ~= 0.10256``:: 

4596 

4597 >>> from scipy.stats import fisher_exact 

4598 >>> res = fisher_exact(table, alternative='two-sided') 

4599 >>> res.pvalue 

4600 0.10256410256410257 

4601 

4602 The one-sided p-value for ``alternative='greater'`` is the probability 

4603 that a random table has ``x >= a``, which in our example is ``x >= 6``, 

4604 or ``0.0816 + 0.00466 ~= 0.08626``:: 

4605 

4606 >>> res = fisher_exact(table, alternative='greater') 

4607 >>> res.pvalue 

4608 0.08624708624708627 

4609 

4610 This is equivalent to computing the survival function of the 

4611 distribution at ``x = 5`` (one less than ``x`` from the input table, 

4612 because we want to include the probability of ``x = 6`` in the sum):: 

4613 

4614 >>> hypergeom.sf(5, M, n, N) 

4615 0.08624708624708627 

4616 

4617 For ``alternative='less'``, the one-sided p-value is the probability 

4618 that a random table has ``x <= a``, (i.e. ``x <= 6`` in our example), 

4619 or ``0.0163 + 0.163 + 0.408 + 0.326 + 0.0816 ~= 0.9949``:: 

4620 

4621 >>> res = fisher_exact(table, alternative='less') 

4622 >>> res.pvalue 

4623 0.9953379953379957 

4624 

4625 This is equivalent to computing the cumulative distribution function 

4626 of the distribution at ``x = 6``: 

4627 

4628 >>> hypergeom.cdf(6, M, n, N) 

4629 0.9953379953379957 

4630 

4631 *Odds ratio* 

4632 

4633 The calculated odds ratio is different from the value computed by the 

4634 R function ``fisher.test``. This implementation returns the "sample" 

4635 or "unconditional" maximum likelihood estimate, while ``fisher.test`` 

4636 in R uses the conditional maximum likelihood estimate. To compute the 

4637 conditional maximum likelihood estimate of the odds ratio, use 

4638 `scipy.stats.contingency.odds_ratio`. 

4639 

4640 Examples 

4641 -------- 

4642 Say we spend a few days counting whales and sharks in the Atlantic and 

4643 Indian oceans. In the Atlantic ocean we find 8 whales and 1 shark, in the 

4644 Indian ocean 2 whales and 5 sharks. Then our contingency table is:: 

4645 

4646 Atlantic Indian 

4647 whales 8 2 

4648 sharks 1 5 

4649 

4650 We use this table to find the p-value: 

4651 

4652 >>> from scipy.stats import fisher_exact 

4653 >>> res = fisher_exact([[8, 2], [1, 5]]) 

4654 >>> res.pvalue 

4655 0.0349... 

4656 

4657 The probability that we would observe this or an even more imbalanced ratio 

4658 by chance is about 3.5%. A commonly used significance level is 5%--if we 

4659 adopt that, we can therefore conclude that our observed imbalance is 

4660 statistically significant; whales prefer the Atlantic while sharks prefer 

4661 the Indian ocean. 

4662 

4663 """ 

4664 hypergeom = distributions.hypergeom 

4665 # int32 is not enough for the algorithm 

4666 c = np.asarray(table, dtype=np.int64) 

4667 if not c.shape == (2, 2): 

4668 raise ValueError("The input `table` must be of shape (2, 2).") 

4669 

4670 if np.any(c < 0): 

4671 raise ValueError("All values in `table` must be nonnegative.") 

4672 

4673 if 0 in c.sum(axis=0) or 0 in c.sum(axis=1): 

4674 # If both values in a row or column are zero, the p-value is 1 and 

4675 # the odds ratio is NaN. 

4676 return SignificanceResult(np.nan, 1.0) 

4677 

4678 if c[1, 0] > 0 and c[0, 1] > 0: 

4679 oddsratio = c[0, 0] * c[1, 1] / (c[1, 0] * c[0, 1]) 

4680 else: 

4681 oddsratio = np.inf 

4682 

4683 n1 = c[0, 0] + c[0, 1] 

4684 n2 = c[1, 0] + c[1, 1] 

4685 n = c[0, 0] + c[1, 0] 

4686 

4687 def pmf(x): 

4688 return hypergeom.pmf(x, n1 + n2, n1, n) 

4689 

4690 if alternative == 'less': 

4691 pvalue = hypergeom.cdf(c[0, 0], n1 + n2, n1, n) 

4692 elif alternative == 'greater': 

4693 # Same formula as the 'less' case, but with the second column. 

4694 pvalue = hypergeom.cdf(c[0, 1], n1 + n2, n1, c[0, 1] + c[1, 1]) 

4695 elif alternative == 'two-sided': 

4696 mode = int((n + 1) * (n1 + 1) / (n1 + n2 + 2)) 

4697 pexact = hypergeom.pmf(c[0, 0], n1 + n2, n1, n) 

4698 pmode = hypergeom.pmf(mode, n1 + n2, n1, n) 

4699 

4700 epsilon = 1e-14 

4701 gamma = 1 + epsilon 

4702 

4703 if np.abs(pexact - pmode) / np.maximum(pexact, pmode) <= epsilon: 

4704 return SignificanceResult(oddsratio, 1.) 

4705 

4706 elif c[0, 0] < mode: 

4707 plower = hypergeom.cdf(c[0, 0], n1 + n2, n1, n) 

4708 if hypergeom.pmf(n, n1 + n2, n1, n) > pexact * gamma: 

4709 return SignificanceResult(oddsratio, plower) 

4710 

4711 guess = _binary_search(lambda x: -pmf(x), -pexact * gamma, mode, n) 

4712 pvalue = plower + hypergeom.sf(guess, n1 + n2, n1, n) 

4713 else: 

4714 pupper = hypergeom.sf(c[0, 0] - 1, n1 + n2, n1, n) 

4715 if hypergeom.pmf(0, n1 + n2, n1, n) > pexact * gamma: 

4716 return SignificanceResult(oddsratio, pupper) 

4717 

4718 guess = _binary_search(pmf, pexact * gamma, 0, mode) 

4719 pvalue = pupper + hypergeom.cdf(guess, n1 + n2, n1, n) 

4720 else: 

4721 msg = "`alternative` should be one of {'two-sided', 'less', 'greater'}" 

4722 raise ValueError(msg) 

4723 

4724 pvalue = min(pvalue, 1.0) 

4725 

4726 return SignificanceResult(oddsratio, pvalue) 

4727 

4728 

4729def spearmanr(a, b=None, axis=0, nan_policy='propagate', 

4730 alternative='two-sided'): 

4731 """Calculate a Spearman correlation coefficient with associated p-value. 

4732 

4733 The Spearman rank-order correlation coefficient is a nonparametric measure 

4734 of the monotonicity of the relationship between two datasets. 

4735 Like other correlation coefficients, 

4736 this one varies between -1 and +1 with 0 implying no correlation. 

4737 Correlations of -1 or +1 imply an exact monotonic relationship. Positive 

4738 correlations imply that as x increases, so does y. Negative correlations 

4739 imply that as x increases, y decreases. 

4740 

4741 The p-value roughly indicates the probability of an uncorrelated system 

4742 producing datasets that have a Spearman correlation at least as extreme 

4743 as the one computed from these datasets. Although calculation of the 

4744 p-value does not make strong assumptions about the distributions underlying 

4745 the samples, it is only accurate for very large samples (>500 

4746 observations). For smaller sample sizes, consider a permutation test (see 

4747 Examples section below). 

4748 

4749 Parameters 

4750 ---------- 

4751 a, b : 1D or 2D array_like, b is optional 

4752 One or two 1-D or 2-D arrays containing multiple variables and 

4753 observations. When these are 1-D, each represents a vector of 

4754 observations of a single variable. For the behavior in the 2-D case, 

4755 see under ``axis``, below. 

4756 Both arrays need to have the same length in the ``axis`` dimension. 

4757 axis : int or None, optional 

4758 If axis=0 (default), then each column represents a variable, with 

4759 observations in the rows. If axis=1, the relationship is transposed: 

4760 each row represents a variable, while the columns contain observations. 

4761 If axis=None, then both arrays will be raveled. 

4762 nan_policy : {'propagate', 'raise', 'omit'}, optional 

4763 Defines how to handle when input contains nan. 

4764 The following options are available (default is 'propagate'): 

4765 

4766 * 'propagate': returns nan 

4767 * 'raise': throws an error 

4768 * 'omit': performs the calculations ignoring nan values 

4769 

4770 alternative : {'two-sided', 'less', 'greater'}, optional 

4771 Defines the alternative hypothesis. Default is 'two-sided'. 

4772 The following options are available: 

4773 

4774 * 'two-sided': the correlation is nonzero 

4775 * 'less': the correlation is negative (less than zero) 

4776 * 'greater': the correlation is positive (greater than zero) 

4777 

4778 .. versionadded:: 1.7.0 

4779 

4780 Returns 

4781 ------- 

4782 res : SignificanceResult 

4783 An object containing attributes: 

4784 

4785 statistic : float or ndarray (2-D square) 

4786 Spearman correlation matrix or correlation coefficient (if only 2 

4787 variables are given as parameters). Correlation matrix is square 

4788 with length equal to total number of variables (columns or rows) in 

4789 ``a`` and ``b`` combined. 

4790 pvalue : float 

4791 The p-value for a hypothesis test whose null hypothesis 

4792 is that two sets of data are linearly uncorrelated. See 

4793 `alternative` above for alternative hypotheses. `pvalue` has the 

4794 same shape as `statistic`. 

4795 

4796 Warns 

4797 ----- 

4798 `~scipy.stats.ConstantInputWarning` 

4799 Raised if an input is a constant array. The correlation coefficient 

4800 is not defined in this case, so ``np.nan`` is returned. 

4801 

4802 References 

4803 ---------- 

4804 .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard 

4805 Probability and Statistics Tables and Formulae. Chapman & Hall: New 

4806 York. 2000. 

4807 Section 14.7 

4808 .. [2] Kendall, M. G. and Stuart, A. (1973). 

4809 The Advanced Theory of Statistics, Volume 2: Inference and Relationship. 

4810 Griffin. 1973. 

4811 Section 31.18 

4812 

4813 Examples 

4814 -------- 

4815 >>> import numpy as np 

4816 >>> from scipy import stats 

4817 >>> res = stats.spearmanr([1, 2, 3, 4, 5], [5, 6, 7, 8, 7]) 

4818 >>> res.statistic 

4819 0.8207826816681233 

4820 >>> res.pvalue 

4821 0.08858700531354381 

4822 >>> rng = np.random.default_rng() 

4823 >>> x2n = rng.standard_normal((100, 2)) 

4824 >>> y2n = rng.standard_normal((100, 2)) 

4825 >>> res = stats.spearmanr(x2n) 

4826 >>> res.statistic, res.pvalue 

4827 (-0.07960396039603959, 0.4311168705769747) 

4828 >>> res = stats.spearmanr(x2n[:, 0], x2n[:, 1]) 

4829 >>> res.statistic, res.pvalue 

4830 (-0.07960396039603959, 0.4311168705769747) 

4831 >>> res = stats.spearmanr(x2n, y2n) 

4832 >>> res.statistic 

4833 array([[ 1. , -0.07960396, -0.08314431, 0.09662166], 

4834 [-0.07960396, 1. , -0.14448245, 0.16738074], 

4835 [-0.08314431, -0.14448245, 1. , 0.03234323], 

4836 [ 0.09662166, 0.16738074, 0.03234323, 1. ]]) 

4837 >>> res.pvalue 

4838 array([[0. , 0.43111687, 0.41084066, 0.33891628], 

4839 [0.43111687, 0. , 0.15151618, 0.09600687], 

4840 [0.41084066, 0.15151618, 0. , 0.74938561], 

4841 [0.33891628, 0.09600687, 0.74938561, 0. ]]) 

4842 >>> res = stats.spearmanr(x2n.T, y2n.T, axis=1) 

4843 >>> res.statistic 

4844 array([[ 1. , -0.07960396, -0.08314431, 0.09662166], 

4845 [-0.07960396, 1. , -0.14448245, 0.16738074], 

4846 [-0.08314431, -0.14448245, 1. , 0.03234323], 

4847 [ 0.09662166, 0.16738074, 0.03234323, 1. ]]) 

4848 >>> res = stats.spearmanr(x2n, y2n, axis=None) 

4849 >>> res.statistic, res.pvalue 

4850 (0.044981624540613524, 0.5270803651336189) 

4851 >>> res = stats.spearmanr(x2n.ravel(), y2n.ravel()) 

4852 >>> res.statistic, res.pvalue 

4853 (0.044981624540613524, 0.5270803651336189) 

4854 

4855 >>> rng = np.random.default_rng() 

4856 >>> xint = rng.integers(10, size=(100, 2)) 

4857 >>> res = stats.spearmanr(xint) 

4858 >>> res.statistic, res.pvalue 

4859 (0.09800224850707953, 0.3320271757932076) 

4860 

4861 For small samples, consider performing a permutation test instead of 

4862 relying on the asymptotic p-value. Note that to calculate the null 

4863 distribution of the statistic (for all possibly pairings between 

4864 observations in sample ``x`` and ``y``), only one of the two inputs needs 

4865 to be permuted. 

4866 

4867 >>> x = [1.76405235, 0.40015721, 0.97873798, 

4868 ... 2.2408932, 1.86755799, -0.97727788] 

4869 >>> y = [2.71414076, 0.2488, 0.87551913, 

4870 ... 2.6514917, 2.01160156, 0.47699563] 

4871 >>> def statistic(x): # permute only `x` 

4872 ... return stats.spearmanr(x, y).statistic 

4873 >>> res_exact = stats.permutation_test((x,), statistic, 

4874 ... permutation_type='pairings') 

4875 >>> res_asymptotic = stats.spearmanr(x, y) 

4876 >>> res_exact.pvalue, res_asymptotic.pvalue # asymptotic pvalue is too low 

4877 (0.10277777777777777, 0.07239650145772594) 

4878 

4879 """ 

4880 if axis is not None and axis > 1: 

4881 raise ValueError("spearmanr only handles 1-D or 2-D arrays, " 

4882 "supplied axis argument {}, please use only " 

4883 "values 0, 1 or None for axis".format(axis)) 

4884 

4885 a, axisout = _chk_asarray(a, axis) 

4886 if a.ndim > 2: 

4887 raise ValueError("spearmanr only handles 1-D or 2-D arrays") 

4888 

4889 if b is None: 

4890 if a.ndim < 2: 

4891 raise ValueError("`spearmanr` needs at least 2 " 

4892 "variables to compare") 

4893 else: 

4894 # Concatenate a and b, so that we now only have to handle the case 

4895 # of a 2-D `a`. 

4896 b, _ = _chk_asarray(b, axis) 

4897 if axisout == 0: 

4898 a = np.column_stack((a, b)) 

4899 else: 

4900 a = np.row_stack((a, b)) 

4901 

4902 n_vars = a.shape[1 - axisout] 

4903 n_obs = a.shape[axisout] 

4904 if n_obs <= 1: 

4905 # Handle empty arrays or single observations. 

4906 res = SignificanceResult(np.nan, np.nan) 

4907 res.correlation = np.nan 

4908 return res 

4909 

4910 warn_msg = ("An input array is constant; the correlation coefficient " 

4911 "is not defined.") 

4912 if axisout == 0: 

4913 if (a[:, 0][0] == a[:, 0]).all() or (a[:, 1][0] == a[:, 1]).all(): 

4914 # If an input is constant, the correlation coefficient 

4915 # is not defined. 

4916 warnings.warn(stats.ConstantInputWarning(warn_msg)) 

4917 res = SignificanceResult(np.nan, np.nan) 

4918 res.correlation = np.nan 

4919 return res 

4920 else: # case when axisout == 1 b/c a is 2 dim only 

4921 if (a[0, :][0] == a[0, :]).all() or (a[1, :][0] == a[1, :]).all(): 

4922 # If an input is constant, the correlation coefficient 

4923 # is not defined. 

4924 warnings.warn(stats.ConstantInputWarning(warn_msg)) 

4925 res = SignificanceResult(np.nan, np.nan) 

4926 res.correlation = np.nan 

4927 return res 

4928 

4929 a_contains_nan, nan_policy = _contains_nan(a, nan_policy) 

4930 variable_has_nan = np.zeros(n_vars, dtype=bool) 

4931 if a_contains_nan: 

4932 if nan_policy == 'omit': 

4933 return mstats_basic.spearmanr(a, axis=axis, nan_policy=nan_policy, 

4934 alternative=alternative) 

4935 elif nan_policy == 'propagate': 

4936 if a.ndim == 1 or n_vars <= 2: 

4937 res = SignificanceResult(np.nan, np.nan) 

4938 res.correlation = np.nan 

4939 return res 

4940 else: 

4941 # Keep track of variables with NaNs, set the outputs to NaN 

4942 # only for those variables 

4943 variable_has_nan = np.isnan(a).any(axis=axisout) 

4944 

4945 a_ranked = np.apply_along_axis(rankdata, axisout, a) 

4946 rs = np.corrcoef(a_ranked, rowvar=axisout) 

4947 dof = n_obs - 2 # degrees of freedom 

4948 

4949 # rs can have elements equal to 1, so avoid zero division warnings 

4950 with np.errstate(divide='ignore'): 

4951 # clip the small negative values possibly caused by rounding 

4952 # errors before taking the square root 

4953 t = rs * np.sqrt((dof/((rs+1.0)*(1.0-rs))).clip(0)) 

4954 

4955 t, prob = _ttest_finish(dof, t, alternative) 

4956 

4957 # For backwards compatibility, return scalars when comparing 2 columns 

4958 if rs.shape == (2, 2): 

4959 res = SignificanceResult(rs[1, 0], prob[1, 0]) 

4960 res.correlation = rs[1, 0] 

4961 return res 

4962 else: 

4963 rs[variable_has_nan, :] = np.nan 

4964 rs[:, variable_has_nan] = np.nan 

4965 res = SignificanceResult(rs, prob) 

4966 res.correlation = rs 

4967 return res 

4968 

4969 

4970def pointbiserialr(x, y): 

4971 r"""Calculate a point biserial correlation coefficient and its p-value. 

4972 

4973 The point biserial correlation is used to measure the relationship 

4974 between a binary variable, x, and a continuous variable, y. Like other 

4975 correlation coefficients, this one varies between -1 and +1 with 0 

4976 implying no correlation. Correlations of -1 or +1 imply a determinative 

4977 relationship. 

4978 

4979 This function may be computed using a shortcut formula but produces the 

4980 same result as `pearsonr`. 

4981 

4982 Parameters 

4983 ---------- 

4984 x : array_like of bools 

4985 Input array. 

4986 y : array_like 

4987 Input array. 

4988 

4989 Returns 

4990 ------- 

4991 res: SignificanceResult 

4992 An object containing attributes: 

4993 

4994 statistic : float 

4995 The R value. 

4996 pvalue : float 

4997 The two-sided p-value. 

4998 

4999 Notes 

5000 ----- 

5001 `pointbiserialr` uses a t-test with ``n-1`` degrees of freedom. 

5002 It is equivalent to `pearsonr`. 

5003 

5004 The value of the point-biserial correlation can be calculated from: 

5005 

5006 .. math:: 

5007 

5008 r_{pb} = \frac{\overline{Y_{1}} - 

5009 \overline{Y_{0}}}{s_{y}}\sqrt{\frac{N_{1} N_{2}}{N (N - 1))}} 

5010 

5011 Where :math:`Y_{0}` and :math:`Y_{1}` are means of the metric 

5012 observations coded 0 and 1 respectively; :math:`N_{0}` and :math:`N_{1}` 

5013 are number of observations coded 0 and 1 respectively; :math:`N` is the 

5014 total number of observations and :math:`s_{y}` is the standard 

5015 deviation of all the metric observations. 

5016 

5017 A value of :math:`r_{pb}` that is significantly different from zero is 

5018 completely equivalent to a significant difference in means between the two 

5019 groups. Thus, an independent groups t Test with :math:`N-2` degrees of 

5020 freedom may be used to test whether :math:`r_{pb}` is nonzero. The 

5021 relation between the t-statistic for comparing two independent groups and 

5022 :math:`r_{pb}` is given by: 

5023 

5024 .. math:: 

5025 

5026 t = \sqrt{N - 2}\frac{r_{pb}}{\sqrt{1 - r^{2}_{pb}}} 

5027 

5028 References 

5029 ---------- 

5030 .. [1] J. Lev, "The Point Biserial Coefficient of Correlation", Ann. Math. 

5031 Statist., Vol. 20, no.1, pp. 125-126, 1949. 

5032 

5033 .. [2] R.F. Tate, "Correlation Between a Discrete and a Continuous 

5034 Variable. Point-Biserial Correlation.", Ann. Math. Statist., Vol. 25, 

5035 np. 3, pp. 603-607, 1954. 

5036 

5037 .. [3] D. Kornbrot "Point Biserial Correlation", In Wiley StatsRef: 

5038 Statistics Reference Online (eds N. Balakrishnan, et al.), 2014. 

5039 :doi:`10.1002/9781118445112.stat06227` 

5040 

5041 Examples 

5042 -------- 

5043 >>> import numpy as np 

5044 >>> from scipy import stats 

5045 >>> a = np.array([0, 0, 0, 1, 1, 1, 1]) 

5046 >>> b = np.arange(7) 

5047 >>> stats.pointbiserialr(a, b) 

5048 (0.8660254037844386, 0.011724811003954652) 

5049 >>> stats.pearsonr(a, b) 

5050 (0.86602540378443871, 0.011724811003954626) 

5051 >>> np.corrcoef(a, b) 

5052 array([[ 1. , 0.8660254], 

5053 [ 0.8660254, 1. ]]) 

5054 

5055 """ 

5056 rpb, prob = pearsonr(x, y) 

5057 # create result object with alias for backward compatibility 

5058 res = SignificanceResult(rpb, prob) 

5059 res.correlation = rpb 

5060 return res 

5061 

5062 

5063def kendalltau(x, y, initial_lexsort=None, nan_policy='propagate', 

5064 method='auto', variant='b', alternative='two-sided'): 

5065 """Calculate Kendall's tau, a correlation measure for ordinal data. 

5066 

5067 Kendall's tau is a measure of the correspondence between two rankings. 

5068 Values close to 1 indicate strong agreement, and values close to -1 

5069 indicate strong disagreement. This implements two variants of Kendall's 

5070 tau: tau-b (the default) and tau-c (also known as Stuart's tau-c). These 

5071 differ only in how they are normalized to lie within the range -1 to 1; 

5072 the hypothesis tests (their p-values) are identical. Kendall's original 

5073 tau-a is not implemented separately because both tau-b and tau-c reduce 

5074 to tau-a in the absence of ties. 

5075 

5076 Parameters 

5077 ---------- 

5078 x, y : array_like 

5079 Arrays of rankings, of the same shape. If arrays are not 1-D, they 

5080 will be flattened to 1-D. 

5081 initial_lexsort : bool, optional, deprecated 

5082 This argument is unused. 

5083 

5084 .. deprecated:: 1.10.0 

5085 `kendalltau` keyword argument `initial_lexsort` is deprecated as it 

5086 is unused and will be removed in SciPy 1.12.0. 

5087 nan_policy : {'propagate', 'raise', 'omit'}, optional 

5088 Defines how to handle when input contains nan. 

5089 The following options are available (default is 'propagate'): 

5090 

5091 * 'propagate': returns nan 

5092 * 'raise': throws an error 

5093 * 'omit': performs the calculations ignoring nan values 

5094 

5095 method : {'auto', 'asymptotic', 'exact'}, optional 

5096 Defines which method is used to calculate the p-value [5]_. 

5097 The following options are available (default is 'auto'): 

5098 

5099 * 'auto': selects the appropriate method based on a trade-off 

5100 between speed and accuracy 

5101 * 'asymptotic': uses a normal approximation valid for large samples 

5102 * 'exact': computes the exact p-value, but can only be used if no ties 

5103 are present. As the sample size increases, the 'exact' computation 

5104 time may grow and the result may lose some precision. 

5105 variant : {'b', 'c'}, optional 

5106 Defines which variant of Kendall's tau is returned. Default is 'b'. 

5107 alternative : {'two-sided', 'less', 'greater'}, optional 

5108 Defines the alternative hypothesis. Default is 'two-sided'. 

5109 The following options are available: 

5110 

5111 * 'two-sided': the rank correlation is nonzero 

5112 * 'less': the rank correlation is negative (less than zero) 

5113 * 'greater': the rank correlation is positive (greater than zero) 

5114 

5115 Returns 

5116 ------- 

5117 res : SignificanceResult 

5118 An object containing attributes: 

5119 

5120 statistic : float 

5121 The tau statistic. 

5122 pvalue : float 

5123 The p-value for a hypothesis test whose null hypothesis is 

5124 an absence of association, tau = 0. 

5125 

5126 See Also 

5127 -------- 

5128 spearmanr : Calculates a Spearman rank-order correlation coefficient. 

5129 theilslopes : Computes the Theil-Sen estimator for a set of points (x, y). 

5130 weightedtau : Computes a weighted version of Kendall's tau. 

5131 

5132 Notes 

5133 ----- 

5134 The definition of Kendall's tau that is used is [2]_:: 

5135 

5136 tau_b = (P - Q) / sqrt((P + Q + T) * (P + Q + U)) 

5137 

5138 tau_c = 2 (P - Q) / (n**2 * (m - 1) / m) 

5139 

5140 where P is the number of concordant pairs, Q the number of discordant 

5141 pairs, T the number of ties only in `x`, and U the number of ties only in 

5142 `y`. If a tie occurs for the same pair in both `x` and `y`, it is not 

5143 added to either T or U. n is the total number of samples, and m is the 

5144 number of unique values in either `x` or `y`, whichever is smaller. 

5145 

5146 References 

5147 ---------- 

5148 .. [1] Maurice G. Kendall, "A New Measure of Rank Correlation", Biometrika 

5149 Vol. 30, No. 1/2, pp. 81-93, 1938. 

5150 .. [2] Maurice G. Kendall, "The treatment of ties in ranking problems", 

5151 Biometrika Vol. 33, No. 3, pp. 239-251. 1945. 

5152 .. [3] Gottfried E. Noether, "Elements of Nonparametric Statistics", John 

5153 Wiley & Sons, 1967. 

5154 .. [4] Peter M. Fenwick, "A new data structure for cumulative frequency 

5155 tables", Software: Practice and Experience, Vol. 24, No. 3, 

5156 pp. 327-336, 1994. 

5157 .. [5] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition), 

5158 Charles Griffin & Co., 1970. 

5159 

5160 Examples 

5161 -------- 

5162 >>> from scipy import stats 

5163 >>> x1 = [12, 2, 1, 12, 2] 

5164 >>> x2 = [1, 4, 7, 1, 0] 

5165 >>> res = stats.kendalltau(x1, x2) 

5166 >>> res.statistic 

5167 -0.47140452079103173 

5168 >>> res.pvalue 

5169 0.2827454599327748 

5170 

5171 """ 

5172 if initial_lexsort is not None: 

5173 msg = ("'kendalltau' keyword argument 'initial_lexsort' is deprecated" 

5174 " as it is unused and will be removed in SciPy 1.12.0.") 

5175 warnings.warn(msg, DeprecationWarning, stacklevel=2) 

5176 

5177 x = np.asarray(x).ravel() 

5178 y = np.asarray(y).ravel() 

5179 

5180 if x.size != y.size: 

5181 raise ValueError("All inputs to `kendalltau` must be of the same " 

5182 f"size, found x-size {x.size} and y-size {y.size}") 

5183 elif not x.size or not y.size: 

5184 # Return NaN if arrays are empty 

5185 res = SignificanceResult(np.nan, np.nan) 

5186 res.correlation = np.nan 

5187 return res 

5188 

5189 # check both x and y 

5190 cnx, npx = _contains_nan(x, nan_policy) 

5191 cny, npy = _contains_nan(y, nan_policy) 

5192 contains_nan = cnx or cny 

5193 if npx == 'omit' or npy == 'omit': 

5194 nan_policy = 'omit' 

5195 

5196 if contains_nan and nan_policy == 'propagate': 

5197 res = SignificanceResult(np.nan, np.nan) 

5198 res.correlation = np.nan 

5199 return res 

5200 

5201 elif contains_nan and nan_policy == 'omit': 

5202 x = ma.masked_invalid(x) 

5203 y = ma.masked_invalid(y) 

5204 if variant == 'b': 

5205 return mstats_basic.kendalltau(x, y, method=method, use_ties=True, 

5206 alternative=alternative) 

5207 else: 

5208 message = ("nan_policy='omit' is currently compatible only with " 

5209 "variant='b'.") 

5210 raise ValueError(message) 

5211 

5212 def count_rank_tie(ranks): 

5213 cnt = np.bincount(ranks).astype('int64', copy=False) 

5214 cnt = cnt[cnt > 1] 

5215 return ((cnt * (cnt - 1) // 2).sum(), 

5216 (cnt * (cnt - 1.) * (cnt - 2)).sum(), 

5217 (cnt * (cnt - 1.) * (2*cnt + 5)).sum()) 

5218 

5219 size = x.size 

5220 perm = np.argsort(y) # sort on y and convert y to dense ranks 

5221 x, y = x[perm], y[perm] 

5222 y = np.r_[True, y[1:] != y[:-1]].cumsum(dtype=np.intp) 

5223 

5224 # stable sort on x and convert x to dense ranks 

5225 perm = np.argsort(x, kind='mergesort') 

5226 x, y = x[perm], y[perm] 

5227 x = np.r_[True, x[1:] != x[:-1]].cumsum(dtype=np.intp) 

5228 

5229 dis = _kendall_dis(x, y) # discordant pairs 

5230 

5231 obs = np.r_[True, (x[1:] != x[:-1]) | (y[1:] != y[:-1]), True] 

5232 cnt = np.diff(np.nonzero(obs)[0]).astype('int64', copy=False) 

5233 

5234 ntie = (cnt * (cnt - 1) // 2).sum() # joint ties 

5235 xtie, x0, x1 = count_rank_tie(x) # ties in x, stats 

5236 ytie, y0, y1 = count_rank_tie(y) # ties in y, stats 

5237 

5238 tot = (size * (size - 1)) // 2 

5239 

5240 if xtie == tot or ytie == tot: 

5241 res = SignificanceResult(np.nan, np.nan) 

5242 res.correlation = np.nan 

5243 return res 

5244 

5245 # Note that tot = con + dis + (xtie - ntie) + (ytie - ntie) + ntie 

5246 # = con + dis + xtie + ytie - ntie 

5247 con_minus_dis = tot - xtie - ytie + ntie - 2 * dis 

5248 if variant == 'b': 

5249 tau = con_minus_dis / np.sqrt(tot - xtie) / np.sqrt(tot - ytie) 

5250 elif variant == 'c': 

5251 minclasses = min(len(set(x)), len(set(y))) 

5252 tau = 2*con_minus_dis / (size**2 * (minclasses-1)/minclasses) 

5253 else: 

5254 raise ValueError(f"Unknown variant of the method chosen: {variant}. " 

5255 "variant must be 'b' or 'c'.") 

5256 

5257 # Limit range to fix computational errors 

5258 tau = min(1., max(-1., tau)) 

5259 

5260 # The p-value calculation is the same for all variants since the p-value 

5261 # depends only on con_minus_dis. 

5262 if method == 'exact' and (xtie != 0 or ytie != 0): 

5263 raise ValueError("Ties found, exact method cannot be used.") 

5264 

5265 if method == 'auto': 

5266 if (xtie == 0 and ytie == 0) and (size <= 33 or 

5267 min(dis, tot-dis) <= 1): 

5268 method = 'exact' 

5269 else: 

5270 method = 'asymptotic' 

5271 

5272 if xtie == 0 and ytie == 0 and method == 'exact': 

5273 pvalue = mstats_basic._kendall_p_exact(size, tot-dis, alternative) 

5274 elif method == 'asymptotic': 

5275 # con_minus_dis is approx normally distributed with this variance [3]_ 

5276 m = size * (size - 1.) 

5277 var = ((m * (2*size + 5) - x1 - y1) / 18 + 

5278 (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2))) 

5279 z = con_minus_dis / np.sqrt(var) 

5280 _, pvalue = _normtest_finish(z, alternative) 

5281 else: 

5282 raise ValueError(f"Unknown method {method} specified. Use 'auto', " 

5283 "'exact' or 'asymptotic'.") 

5284 

5285 # create result object with alias for backward compatibility 

5286 res = SignificanceResult(tau, pvalue) 

5287 res.correlation = tau 

5288 return res 

5289 

5290 

5291def weightedtau(x, y, rank=True, weigher=None, additive=True): 

5292 r"""Compute a weighted version of Kendall's :math:`\tau`. 

5293 

5294 The weighted :math:`\tau` is a weighted version of Kendall's 

5295 :math:`\tau` in which exchanges of high weight are more influential than 

5296 exchanges of low weight. The default parameters compute the additive 

5297 hyperbolic version of the index, :math:`\tau_\mathrm h`, which has 

5298 been shown to provide the best balance between important and 

5299 unimportant elements [1]_. 

5300 

5301 The weighting is defined by means of a rank array, which assigns a 

5302 nonnegative rank to each element (higher importance ranks being 

5303 associated with smaller values, e.g., 0 is the highest possible rank), 

5304 and a weigher function, which assigns a weight based on the rank to 

5305 each element. The weight of an exchange is then the sum or the product 

5306 of the weights of the ranks of the exchanged elements. The default 

5307 parameters compute :math:`\tau_\mathrm h`: an exchange between 

5308 elements with rank :math:`r` and :math:`s` (starting from zero) has 

5309 weight :math:`1/(r+1) + 1/(s+1)`. 

5310 

5311 Specifying a rank array is meaningful only if you have in mind an 

5312 external criterion of importance. If, as it usually happens, you do 

5313 not have in mind a specific rank, the weighted :math:`\tau` is 

5314 defined by averaging the values obtained using the decreasing 

5315 lexicographical rank by (`x`, `y`) and by (`y`, `x`). This is the 

5316 behavior with default parameters. Note that the convention used 

5317 here for ranking (lower values imply higher importance) is opposite 

5318 to that used by other SciPy statistical functions. 

5319 

5320 Parameters 

5321 ---------- 

5322 x, y : array_like 

5323 Arrays of scores, of the same shape. If arrays are not 1-D, they will 

5324 be flattened to 1-D. 

5325 rank : array_like of ints or bool, optional 

5326 A nonnegative rank assigned to each element. If it is None, the 

5327 decreasing lexicographical rank by (`x`, `y`) will be used: elements of 

5328 higher rank will be those with larger `x`-values, using `y`-values to 

5329 break ties (in particular, swapping `x` and `y` will give a different 

5330 result). If it is False, the element indices will be used 

5331 directly as ranks. The default is True, in which case this 

5332 function returns the average of the values obtained using the 

5333 decreasing lexicographical rank by (`x`, `y`) and by (`y`, `x`). 

5334 weigher : callable, optional 

5335 The weigher function. Must map nonnegative integers (zero 

5336 representing the most important element) to a nonnegative weight. 

5337 The default, None, provides hyperbolic weighing, that is, 

5338 rank :math:`r` is mapped to weight :math:`1/(r+1)`. 

5339 additive : bool, optional 

5340 If True, the weight of an exchange is computed by adding the 

5341 weights of the ranks of the exchanged elements; otherwise, the weights 

5342 are multiplied. The default is True. 

5343 

5344 Returns 

5345 ------- 

5346 res: SignificanceResult 

5347 An object containing attributes: 

5348 

5349 statistic : float 

5350 The weighted :math:`\tau` correlation index. 

5351 pvalue : float 

5352 Presently ``np.nan``, as the null distribution of the statistic is 

5353 unknown (even in the additive hyperbolic case). 

5354 

5355 See Also 

5356 -------- 

5357 kendalltau : Calculates Kendall's tau. 

5358 spearmanr : Calculates a Spearman rank-order correlation coefficient. 

5359 theilslopes : Computes the Theil-Sen estimator for a set of points (x, y). 

5360 

5361 Notes 

5362 ----- 

5363 This function uses an :math:`O(n \log n)`, mergesort-based algorithm 

5364 [1]_ that is a weighted extension of Knight's algorithm for Kendall's 

5365 :math:`\tau` [2]_. It can compute Shieh's weighted :math:`\tau` [3]_ 

5366 between rankings without ties (i.e., permutations) by setting 

5367 `additive` and `rank` to False, as the definition given in [1]_ is a 

5368 generalization of Shieh's. 

5369 

5370 NaNs are considered the smallest possible score. 

5371 

5372 .. versionadded:: 0.19.0 

5373 

5374 References 

5375 ---------- 

5376 .. [1] Sebastiano Vigna, "A weighted correlation index for rankings with 

5377 ties", Proceedings of the 24th international conference on World 

5378 Wide Web, pp. 1166-1176, ACM, 2015. 

5379 .. [2] W.R. Knight, "A Computer Method for Calculating Kendall's Tau with 

5380 Ungrouped Data", Journal of the American Statistical Association, 

5381 Vol. 61, No. 314, Part 1, pp. 436-439, 1966. 

5382 .. [3] Grace S. Shieh. "A weighted Kendall's tau statistic", Statistics & 

5383 Probability Letters, Vol. 39, No. 1, pp. 17-24, 1998. 

5384 

5385 Examples 

5386 -------- 

5387 >>> import numpy as np 

5388 >>> from scipy import stats 

5389 >>> x = [12, 2, 1, 12, 2] 

5390 >>> y = [1, 4, 7, 1, 0] 

5391 >>> res = stats.weightedtau(x, y) 

5392 >>> res.statistic 

5393 -0.56694968153682723 

5394 >>> res.pvalue 

5395 nan 

5396 >>> res = stats.weightedtau(x, y, additive=False) 

5397 >>> res.statistic 

5398 -0.62205716951801038 

5399 

5400 NaNs are considered the smallest possible score: 

5401 

5402 >>> x = [12, 2, 1, 12, 2] 

5403 >>> y = [1, 4, 7, 1, np.nan] 

5404 >>> res = stats.weightedtau(x, y) 

5405 >>> res.statistic 

5406 -0.56694968153682723 

5407 

5408 This is exactly Kendall's tau: 

5409 

5410 >>> x = [12, 2, 1, 12, 2] 

5411 >>> y = [1, 4, 7, 1, 0] 

5412 >>> res = stats.weightedtau(x, y, weigher=lambda x: 1) 

5413 >>> res.statistic 

5414 -0.47140452079103173 

5415 

5416 >>> x = [12, 2, 1, 12, 2] 

5417 >>> y = [1, 4, 7, 1, 0] 

5418 >>> stats.weightedtau(x, y, rank=None) 

5419 SignificanceResult(statistic=-0.4157652301037516, pvalue=nan) 

5420 >>> stats.weightedtau(y, x, rank=None) 

5421 SignificanceResult(statistic=-0.7181341329699028, pvalue=nan) 

5422 

5423 """ 

5424 x = np.asarray(x).ravel() 

5425 y = np.asarray(y).ravel() 

5426 

5427 if x.size != y.size: 

5428 raise ValueError("All inputs to `weightedtau` must be " 

5429 "of the same size, " 

5430 "found x-size %s and y-size %s" % (x.size, y.size)) 

5431 if not x.size: 

5432 # Return NaN if arrays are empty 

5433 res = SignificanceResult(np.nan, np.nan) 

5434 res.correlation = np.nan 

5435 return res 

5436 

5437 # If there are NaNs we apply _toint64() 

5438 if np.isnan(np.sum(x)): 

5439 x = _toint64(x) 

5440 if np.isnan(np.sum(y)): 

5441 y = _toint64(y) 

5442 

5443 # Reduce to ranks unsupported types 

5444 if x.dtype != y.dtype: 

5445 if x.dtype != np.int64: 

5446 x = _toint64(x) 

5447 if y.dtype != np.int64: 

5448 y = _toint64(y) 

5449 else: 

5450 if x.dtype not in (np.int32, np.int64, np.float32, np.float64): 

5451 x = _toint64(x) 

5452 y = _toint64(y) 

5453 

5454 if rank is True: 

5455 tau = ( 

5456 _weightedrankedtau(x, y, None, weigher, additive) + 

5457 _weightedrankedtau(y, x, None, weigher, additive) 

5458 ) / 2 

5459 res = SignificanceResult(tau, np.nan) 

5460 res.correlation = tau 

5461 return res 

5462 

5463 if rank is False: 

5464 rank = np.arange(x.size, dtype=np.intp) 

5465 elif rank is not None: 

5466 rank = np.asarray(rank).ravel() 

5467 if rank.size != x.size: 

5468 raise ValueError( 

5469 "All inputs to `weightedtau` must be of the same size, " 

5470 "found x-size %s and rank-size %s" % (x.size, rank.size) 

5471 ) 

5472 

5473 tau = _weightedrankedtau(x, y, rank, weigher, additive) 

5474 res = SignificanceResult(tau, np.nan) 

5475 res.correlation = tau 

5476 return res 

5477 

5478 

5479# FROM MGCPY: https://github.com/neurodata/mgcpy 

5480 

5481 

5482class _ParallelP: 

5483 """Helper function to calculate parallel p-value.""" 

5484 

5485 def __init__(self, x, y, random_states): 

5486 self.x = x 

5487 self.y = y 

5488 self.random_states = random_states 

5489 

5490 def __call__(self, index): 

5491 order = self.random_states[index].permutation(self.y.shape[0]) 

5492 permy = self.y[order][:, order] 

5493 

5494 # calculate permuted stats, store in null distribution 

5495 perm_stat = _mgc_stat(self.x, permy)[0] 

5496 

5497 return perm_stat 

5498 

5499 

5500def _perm_test(x, y, stat, reps=1000, workers=-1, random_state=None): 

5501 r"""Helper function that calculates the p-value. See below for uses. 

5502 

5503 Parameters 

5504 ---------- 

5505 x, y : ndarray 

5506 `x` and `y` have shapes `(n, p)` and `(n, q)`. 

5507 stat : float 

5508 The sample test statistic. 

5509 reps : int, optional 

5510 The number of replications used to estimate the null when using the 

5511 permutation test. The default is 1000 replications. 

5512 workers : int or map-like callable, optional 

5513 If `workers` is an int the population is subdivided into `workers` 

5514 sections and evaluated in parallel (uses 

5515 `multiprocessing.Pool <multiprocessing>`). Supply `-1` to use all cores 

5516 available to the Process. Alternatively supply a map-like callable, 

5517 such as `multiprocessing.Pool.map` for evaluating the population in 

5518 parallel. This evaluation is carried out as `workers(func, iterable)`. 

5519 Requires that `func` be pickleable. 

5520 random_state : {None, int, `numpy.random.Generator`, 

5521 `numpy.random.RandomState`}, optional 

5522 

5523 If `seed` is None (or `np.random`), the `numpy.random.RandomState` 

5524 singleton is used. 

5525 If `seed` is an int, a new ``RandomState`` instance is used, 

5526 seeded with `seed`. 

5527 If `seed` is already a ``Generator`` or ``RandomState`` instance then 

5528 that instance is used. 

5529 

5530 Returns 

5531 ------- 

5532 pvalue : float 

5533 The sample test p-value. 

5534 null_dist : list 

5535 The approximated null distribution. 

5536 

5537 """ 

5538 # generate seeds for each rep (change to new parallel random number 

5539 # capabilities in numpy >= 1.17+) 

5540 random_state = check_random_state(random_state) 

5541 random_states = [np.random.RandomState(rng_integers(random_state, 1 << 32, 

5542 size=4, dtype=np.uint32)) for _ in range(reps)] 

5543 

5544 # parallelizes with specified workers over number of reps and set seeds 

5545 parallelp = _ParallelP(x=x, y=y, random_states=random_states) 

5546 with MapWrapper(workers) as mapwrapper: 

5547 null_dist = np.array(list(mapwrapper(parallelp, range(reps)))) 

5548 

5549 # calculate p-value and significant permutation map through list 

5550 pvalue = (1 + (null_dist >= stat).sum()) / (1 + reps) 

5551 

5552 return pvalue, null_dist 

5553 

5554 

5555def _euclidean_dist(x): 

5556 return cdist(x, x) 

5557 

5558 

5559MGCResult = _make_tuple_bunch('MGCResult', 

5560 ['statistic', 'pvalue', 'mgc_dict'], []) 

5561 

5562 

5563def multiscale_graphcorr(x, y, compute_distance=_euclidean_dist, reps=1000, 

5564 workers=1, is_twosamp=False, random_state=None): 

5565 r"""Computes the Multiscale Graph Correlation (MGC) test statistic. 

5566 

5567 Specifically, for each point, MGC finds the :math:`k`-nearest neighbors for 

5568 one property (e.g. cloud density), and the :math:`l`-nearest neighbors for 

5569 the other property (e.g. grass wetness) [1]_. This pair :math:`(k, l)` is 

5570 called the "scale". A priori, however, it is not know which scales will be 

5571 most informative. So, MGC computes all distance pairs, and then efficiently 

5572 computes the distance correlations for all scales. The local correlations 

5573 illustrate which scales are relatively informative about the relationship. 

5574 The key, therefore, to successfully discover and decipher relationships 

5575 between disparate data modalities is to adaptively determine which scales 

5576 are the most informative, and the geometric implication for the most 

5577 informative scales. Doing so not only provides an estimate of whether the 

5578 modalities are related, but also provides insight into how the 

5579 determination was made. This is especially important in high-dimensional 

5580 data, where simple visualizations do not reveal relationships to the 

5581 unaided human eye. Characterizations of this implementation in particular 

5582 have been derived from and benchmarked within in [2]_. 

5583 

5584 Parameters 

5585 ---------- 

5586 x, y : ndarray 

5587 If ``x`` and ``y`` have shapes ``(n, p)`` and ``(n, q)`` where `n` is 

5588 the number of samples and `p` and `q` are the number of dimensions, 

5589 then the MGC independence test will be run. Alternatively, ``x`` and 

5590 ``y`` can have shapes ``(n, n)`` if they are distance or similarity 

5591 matrices, and ``compute_distance`` must be sent to ``None``. If ``x`` 

5592 and ``y`` have shapes ``(n, p)`` and ``(m, p)``, an unpaired 

5593 two-sample MGC test will be run. 

5594 compute_distance : callable, optional 

5595 A function that computes the distance or similarity among the samples 

5596 within each data matrix. Set to ``None`` if ``x`` and ``y`` are 

5597 already distance matrices. The default uses the euclidean norm metric. 

5598 If you are calling a custom function, either create the distance 

5599 matrix before-hand or create a function of the form 

5600 ``compute_distance(x)`` where `x` is the data matrix for which 

5601 pairwise distances are calculated. 

5602 reps : int, optional 

5603 The number of replications used to estimate the null when using the 

5604 permutation test. The default is ``1000``. 

5605 workers : int or map-like callable, optional 

5606 If ``workers`` is an int the population is subdivided into ``workers`` 

5607 sections and evaluated in parallel (uses ``multiprocessing.Pool 

5608 <multiprocessing>``). Supply ``-1`` to use all cores available to the 

5609 Process. Alternatively supply a map-like callable, such as 

5610 ``multiprocessing.Pool.map`` for evaluating the p-value in parallel. 

5611 This evaluation is carried out as ``workers(func, iterable)``. 

5612 Requires that `func` be pickleable. The default is ``1``. 

5613 is_twosamp : bool, optional 

5614 If `True`, a two sample test will be run. If ``x`` and ``y`` have 

5615 shapes ``(n, p)`` and ``(m, p)``, this optional will be overridden and 

5616 set to ``True``. Set to ``True`` if ``x`` and ``y`` both have shapes 

5617 ``(n, p)`` and a two sample test is desired. The default is ``False``. 

5618 Note that this will not run if inputs are distance matrices. 

5619 random_state : {None, int, `numpy.random.Generator`, 

5620 `numpy.random.RandomState`}, optional 

5621 

5622 If `seed` is None (or `np.random`), the `numpy.random.RandomState` 

5623 singleton is used. 

5624 If `seed` is an int, a new ``RandomState`` instance is used, 

5625 seeded with `seed`. 

5626 If `seed` is already a ``Generator`` or ``RandomState`` instance then 

5627 that instance is used. 

5628 

5629 Returns 

5630 ------- 

5631 res : MGCResult 

5632 An object containing attributes: 

5633 

5634 statistic : float 

5635 The sample MGC test statistic within `[-1, 1]`. 

5636 pvalue : float 

5637 The p-value obtained via permutation. 

5638 mgc_dict : dict 

5639 Contains additional useful results: 

5640 

5641 - mgc_map : ndarray 

5642 A 2D representation of the latent geometry of the 

5643 relationship. 

5644 - opt_scale : (int, int) 

5645 The estimated optimal scale as a `(x, y)` pair. 

5646 - null_dist : list 

5647 The null distribution derived from the permuted matrices. 

5648 

5649 See Also 

5650 -------- 

5651 pearsonr : Pearson correlation coefficient and p-value for testing 

5652 non-correlation. 

5653 kendalltau : Calculates Kendall's tau. 

5654 spearmanr : Calculates a Spearman rank-order correlation coefficient. 

5655 

5656 Notes 

5657 ----- 

5658 A description of the process of MGC and applications on neuroscience data 

5659 can be found in [1]_. It is performed using the following steps: 

5660 

5661 #. Two distance matrices :math:`D^X` and :math:`D^Y` are computed and 

5662 modified to be mean zero columnwise. This results in two 

5663 :math:`n \times n` distance matrices :math:`A` and :math:`B` (the 

5664 centering and unbiased modification) [3]_. 

5665 

5666 #. For all values :math:`k` and :math:`l` from :math:`1, ..., n`, 

5667 

5668 * The :math:`k`-nearest neighbor and :math:`l`-nearest neighbor graphs 

5669 are calculated for each property. Here, :math:`G_k (i, j)` indicates 

5670 the :math:`k`-smallest values of the :math:`i`-th row of :math:`A` 

5671 and :math:`H_l (i, j)` indicates the :math:`l` smallested values of 

5672 the :math:`i`-th row of :math:`B` 

5673 

5674 * Let :math:`\circ` denotes the entry-wise matrix product, then local 

5675 correlations are summed and normalized using the following statistic: 

5676 

5677 .. math:: 

5678 

5679 c^{kl} = \frac{\sum_{ij} A G_k B H_l} 

5680 {\sqrt{\sum_{ij} A^2 G_k \times \sum_{ij} B^2 H_l}} 

5681 

5682 #. The MGC test statistic is the smoothed optimal local correlation of 

5683 :math:`\{ c^{kl} \}`. Denote the smoothing operation as :math:`R(\cdot)` 

5684 (which essentially set all isolated large correlations) as 0 and 

5685 connected large correlations the same as before, see [3]_.) MGC is, 

5686 

5687 .. math:: 

5688 

5689 MGC_n (x, y) = \max_{(k, l)} R \left(c^{kl} \left( x_n, y_n \right) 

5690 \right) 

5691 

5692 The test statistic returns a value between :math:`(-1, 1)` since it is 

5693 normalized. 

5694 

5695 The p-value returned is calculated using a permutation test. This process 

5696 is completed by first randomly permuting :math:`y` to estimate the null 

5697 distribution and then calculating the probability of observing a test 

5698 statistic, under the null, at least as extreme as the observed test 

5699 statistic. 

5700 

5701 MGC requires at least 5 samples to run with reliable results. It can also 

5702 handle high-dimensional data sets. 

5703 In addition, by manipulating the input data matrices, the two-sample 

5704 testing problem can be reduced to the independence testing problem [4]_. 

5705 Given sample data :math:`U` and :math:`V` of sizes :math:`p \times n` 

5706 :math:`p \times m`, data matrix :math:`X` and :math:`Y` can be created as 

5707 follows: 

5708 

5709 .. math:: 

5710 

5711 X = [U | V] \in \mathcal{R}^{p \times (n + m)} 

5712 Y = [0_{1 \times n} | 1_{1 \times m}] \in \mathcal{R}^{(n + m)} 

5713 

5714 Then, the MGC statistic can be calculated as normal. This methodology can 

5715 be extended to similar tests such as distance correlation [4]_. 

5716 

5717 .. versionadded:: 1.4.0 

5718 

5719 References 

5720 ---------- 

5721 .. [1] Vogelstein, J. T., Bridgeford, E. W., Wang, Q., Priebe, C. E., 

5722 Maggioni, M., & Shen, C. (2019). Discovering and deciphering 

5723 relationships across disparate data modalities. ELife. 

5724 .. [2] Panda, S., Palaniappan, S., Xiong, J., Swaminathan, A., 

5725 Ramachandran, S., Bridgeford, E. W., ... Vogelstein, J. T. (2019). 

5726 mgcpy: A Comprehensive High Dimensional Independence Testing Python 

5727 Package. :arXiv:`1907.02088` 

5728 .. [3] Shen, C., Priebe, C.E., & Vogelstein, J. T. (2019). From distance 

5729 correlation to multiscale graph correlation. Journal of the American 

5730 Statistical Association. 

5731 .. [4] Shen, C. & Vogelstein, J. T. (2018). The Exact Equivalence of 

5732 Distance and Kernel Methods for Hypothesis Testing. 

5733 :arXiv:`1806.05514` 

5734 

5735 Examples 

5736 -------- 

5737 >>> import numpy as np 

5738 >>> from scipy.stats import multiscale_graphcorr 

5739 >>> x = np.arange(100) 

5740 >>> y = x 

5741 >>> res = multiscale_graphcorr(x, y) 

5742 >>> res.statistic, res.pvalue 

5743 (1.0, 0.001) 

5744 

5745 To run an unpaired two-sample test, 

5746 

5747 >>> x = np.arange(100) 

5748 >>> y = np.arange(79) 

5749 >>> res = multiscale_graphcorr(x, y) 

5750 >>> res.statistic, res.pvalue # doctest: +SKIP 

5751 (0.033258146255703246, 0.023) 

5752 

5753 or, if shape of the inputs are the same, 

5754 

5755 >>> x = np.arange(100) 

5756 >>> y = x 

5757 >>> res = multiscale_graphcorr(x, y, is_twosamp=True) 

5758 >>> res.statistic, res.pvalue # doctest: +SKIP 

5759 (-0.008021809890200488, 1.0) 

5760 

5761 """ 

5762 if not isinstance(x, np.ndarray) or not isinstance(y, np.ndarray): 

5763 raise ValueError("x and y must be ndarrays") 

5764 

5765 # convert arrays of type (n,) to (n, 1) 

5766 if x.ndim == 1: 

5767 x = x[:, np.newaxis] 

5768 elif x.ndim != 2: 

5769 raise ValueError("Expected a 2-D array `x`, found shape " 

5770 "{}".format(x.shape)) 

5771 if y.ndim == 1: 

5772 y = y[:, np.newaxis] 

5773 elif y.ndim != 2: 

5774 raise ValueError("Expected a 2-D array `y`, found shape " 

5775 "{}".format(y.shape)) 

5776 

5777 nx, px = x.shape 

5778 ny, py = y.shape 

5779 

5780 # check for NaNs 

5781 _contains_nan(x, nan_policy='raise') 

5782 _contains_nan(y, nan_policy='raise') 

5783 

5784 # check for positive or negative infinity and raise error 

5785 if np.sum(np.isinf(x)) > 0 or np.sum(np.isinf(y)) > 0: 

5786 raise ValueError("Inputs contain infinities") 

5787 

5788 if nx != ny: 

5789 if px == py: 

5790 # reshape x and y for two sample testing 

5791 is_twosamp = True 

5792 else: 

5793 raise ValueError("Shape mismatch, x and y must have shape [n, p] " 

5794 "and [n, q] or have shape [n, p] and [m, p].") 

5795 

5796 if nx < 5 or ny < 5: 

5797 raise ValueError("MGC requires at least 5 samples to give reasonable " 

5798 "results.") 

5799 

5800 # convert x and y to float 

5801 x = x.astype(np.float64) 

5802 y = y.astype(np.float64) 

5803 

5804 # check if compute_distance_matrix if a callable() 

5805 if not callable(compute_distance) and compute_distance is not None: 

5806 raise ValueError("Compute_distance must be a function.") 

5807 

5808 # check if number of reps exists, integer, or > 0 (if under 1000 raises 

5809 # warning) 

5810 if not isinstance(reps, int) or reps < 0: 

5811 raise ValueError("Number of reps must be an integer greater than 0.") 

5812 elif reps < 1000: 

5813 msg = ("The number of replications is low (under 1000), and p-value " 

5814 "calculations may be unreliable. Use the p-value result, with " 

5815 "caution!") 

5816 warnings.warn(msg, RuntimeWarning) 

5817 

5818 if is_twosamp: 

5819 if compute_distance is None: 

5820 raise ValueError("Cannot run if inputs are distance matrices") 

5821 x, y = _two_sample_transform(x, y) 

5822 

5823 if compute_distance is not None: 

5824 # compute distance matrices for x and y 

5825 x = compute_distance(x) 

5826 y = compute_distance(y) 

5827 

5828 # calculate MGC stat 

5829 stat, stat_dict = _mgc_stat(x, y) 

5830 stat_mgc_map = stat_dict["stat_mgc_map"] 

5831 opt_scale = stat_dict["opt_scale"] 

5832 

5833 # calculate permutation MGC p-value 

5834 pvalue, null_dist = _perm_test(x, y, stat, reps=reps, workers=workers, 

5835 random_state=random_state) 

5836 

5837 # save all stats (other than stat/p-value) in dictionary 

5838 mgc_dict = {"mgc_map": stat_mgc_map, 

5839 "opt_scale": opt_scale, 

5840 "null_dist": null_dist} 

5841 

5842 # create result object with alias for backward compatibility 

5843 res = MGCResult(stat, pvalue, mgc_dict) 

5844 res.stat = stat 

5845 return res 

5846 

5847 

5848def _mgc_stat(distx, disty): 

5849 r"""Helper function that calculates the MGC stat. See above for use. 

5850 

5851 Parameters 

5852 ---------- 

5853 distx, disty : ndarray 

5854 `distx` and `disty` have shapes `(n, p)` and `(n, q)` or 

5855 `(n, n)` and `(n, n)` 

5856 if distance matrices. 

5857 

5858 Returns 

5859 ------- 

5860 stat : float 

5861 The sample MGC test statistic within `[-1, 1]`. 

5862 stat_dict : dict 

5863 Contains additional useful additional returns containing the following 

5864 keys: 

5865 

5866 - stat_mgc_map : ndarray 

5867 MGC-map of the statistics. 

5868 - opt_scale : (float, float) 

5869 The estimated optimal scale as a `(x, y)` pair. 

5870 

5871 """ 

5872 # calculate MGC map and optimal scale 

5873 stat_mgc_map = _local_correlations(distx, disty, global_corr='mgc') 

5874 

5875 n, m = stat_mgc_map.shape 

5876 if m == 1 or n == 1: 

5877 # the global scale at is the statistic calculated at maximial nearest 

5878 # neighbors. There is not enough local scale to search over, so 

5879 # default to global scale 

5880 stat = stat_mgc_map[m - 1][n - 1] 

5881 opt_scale = m * n 

5882 else: 

5883 samp_size = len(distx) - 1 

5884 

5885 # threshold to find connected region of significant local correlations 

5886 sig_connect = _threshold_mgc_map(stat_mgc_map, samp_size) 

5887 

5888 # maximum within the significant region 

5889 stat, opt_scale = _smooth_mgc_map(sig_connect, stat_mgc_map) 

5890 

5891 stat_dict = {"stat_mgc_map": stat_mgc_map, 

5892 "opt_scale": opt_scale} 

5893 

5894 return stat, stat_dict 

5895 

5896 

5897def _threshold_mgc_map(stat_mgc_map, samp_size): 

5898 r""" 

5899 Finds a connected region of significance in the MGC-map by thresholding. 

5900 

5901 Parameters 

5902 ---------- 

5903 stat_mgc_map : ndarray 

5904 All local correlations within `[-1,1]`. 

5905 samp_size : int 

5906 The sample size of original data. 

5907 

5908 Returns 

5909 ------- 

5910 sig_connect : ndarray 

5911 A binary matrix with 1's indicating the significant region. 

5912 

5913 """ 

5914 m, n = stat_mgc_map.shape 

5915 

5916 # 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05 

5917 # with varying levels of performance. Threshold is based on a beta 

5918 # approximation. 

5919 per_sig = 1 - (0.02 / samp_size) # Percentile to consider as significant 

5920 threshold = samp_size * (samp_size - 3)/4 - 1/2 # Beta approximation 

5921 threshold = distributions.beta.ppf(per_sig, threshold, threshold) * 2 - 1 

5922 

5923 # the global scale at is the statistic calculated at maximial nearest 

5924 # neighbors. Threshold is the maximum on the global and local scales 

5925 threshold = max(threshold, stat_mgc_map[m - 1][n - 1]) 

5926 

5927 # find the largest connected component of significant correlations 

5928 sig_connect = stat_mgc_map > threshold 

5929 if np.sum(sig_connect) > 0: 

5930 sig_connect, _ = _measurements.label(sig_connect) 

5931 _, label_counts = np.unique(sig_connect, return_counts=True) 

5932 

5933 # skip the first element in label_counts, as it is count(zeros) 

5934 max_label = np.argmax(label_counts[1:]) + 1 

5935 sig_connect = sig_connect == max_label 

5936 else: 

5937 sig_connect = np.array([[False]]) 

5938 

5939 return sig_connect 

5940 

5941 

5942def _smooth_mgc_map(sig_connect, stat_mgc_map): 

5943 """Finds the smoothed maximal within the significant region R. 

5944 

5945 If area of R is too small it returns the last local correlation. Otherwise, 

5946 returns the maximum within significant_connected_region. 

5947 

5948 Parameters 

5949 ---------- 

5950 sig_connect : ndarray 

5951 A binary matrix with 1's indicating the significant region. 

5952 stat_mgc_map : ndarray 

5953 All local correlations within `[-1, 1]`. 

5954 

5955 Returns 

5956 ------- 

5957 stat : float 

5958 The sample MGC statistic within `[-1, 1]`. 

5959 opt_scale: (float, float) 

5960 The estimated optimal scale as an `(x, y)` pair. 

5961 

5962 """ 

5963 m, n = stat_mgc_map.shape 

5964 

5965 # the global scale at is the statistic calculated at maximial nearest 

5966 # neighbors. By default, statistic and optimal scale are global. 

5967 stat = stat_mgc_map[m - 1][n - 1] 

5968 opt_scale = [m, n] 

5969 

5970 if np.linalg.norm(sig_connect) != 0: 

5971 # proceed only when the connected region's area is sufficiently large 

5972 # 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05 

5973 # with varying levels of performance 

5974 if np.sum(sig_connect) >= np.ceil(0.02 * max(m, n)) * min(m, n): 

5975 max_corr = max(stat_mgc_map[sig_connect]) 

5976 

5977 # find all scales within significant_connected_region that maximize 

5978 # the local correlation 

5979 max_corr_index = np.where((stat_mgc_map >= max_corr) & sig_connect) 

5980 

5981 if max_corr >= stat: 

5982 stat = max_corr 

5983 

5984 k, l = max_corr_index 

5985 one_d_indices = k * n + l # 2D to 1D indexing 

5986 k = np.max(one_d_indices) // n 

5987 l = np.max(one_d_indices) % n 

5988 opt_scale = [k+1, l+1] # adding 1s to match R indexing 

5989 

5990 return stat, opt_scale 

5991 

5992 

5993def _two_sample_transform(u, v): 

5994 """Helper function that concatenates x and y for two sample MGC stat. 

5995 

5996 See above for use. 

5997 

5998 Parameters 

5999 ---------- 

6000 u, v : ndarray 

6001 `u` and `v` have shapes `(n, p)` and `(m, p)`. 

6002 

6003 Returns 

6004 ------- 

6005 x : ndarray 

6006 Concatenate `u` and `v` along the `axis = 0`. `x` thus has shape 

6007 `(2n, p)`. 

6008 y : ndarray 

6009 Label matrix for `x` where 0 refers to samples that comes from `u` and 

6010 1 refers to samples that come from `v`. `y` thus has shape `(2n, 1)`. 

6011 

6012 """ 

6013 nx = u.shape[0] 

6014 ny = v.shape[0] 

6015 x = np.concatenate([u, v], axis=0) 

6016 y = np.concatenate([np.zeros(nx), np.ones(ny)], axis=0).reshape(-1, 1) 

6017 return x, y 

6018 

6019 

6020##################################### 

6021# INFERENTIAL STATISTICS # 

6022##################################### 

6023 

6024TtestResultBase = _make_tuple_bunch('TtestResultBase', 

6025 ['statistic', 'pvalue'], ['df']) 

6026 

6027 

6028class TtestResult(TtestResultBase): 

6029 """ 

6030 Result of a t-test. 

6031 

6032 See the documentation of the particular t-test function for more 

6033 information about the definition of the statistic and meaning of 

6034 the confidence interval. 

6035 

6036 Attributes 

6037 ---------- 

6038 statistic : float or array 

6039 The t-statistic of the sample. 

6040 pvalue : float or array 

6041 The p-value associated with the given alternative. 

6042 df : float or array 

6043 The number of degrees of freedom used in calculation of the 

6044 t-statistic; this is one less than the size of the sample 

6045 (``a.shape[axis]-1`` if there are no masked elements or omitted NaNs). 

6046 

6047 Methods 

6048 ------- 

6049 confidence_interval 

6050 Computes a confidence interval around the population statistic 

6051 for the given confidence level. 

6052 The confidence interval is returned in a ``namedtuple`` with 

6053 fields `low` and `high`. 

6054 

6055 """ 

6056 

6057 def __init__(self, statistic, pvalue, df, # public 

6058 alternative, standard_error, estimate): # private 

6059 super().__init__(statistic, pvalue, df=df) 

6060 self._alternative = alternative 

6061 self._standard_error = standard_error # denominator of t-statistic 

6062 self._estimate = estimate # point estimate of sample mean 

6063 

6064 def confidence_interval(self, confidence_level=0.95): 

6065 """ 

6066 Parameters 

6067 ---------- 

6068 confidence_level : float 

6069 The confidence level for the calculation of the population mean 

6070 confidence interval. Default is 0.95. 

6071 

6072 Returns 

6073 ------- 

6074 ci : namedtuple 

6075 The confidence interval is returned in a ``namedtuple`` with 

6076 fields `low` and `high`. 

6077 

6078 """ 

6079 low, high = _t_confidence_interval(self.df, self.statistic, 

6080 confidence_level, self._alternative) 

6081 low = low * self._standard_error + self._estimate 

6082 high = high * self._standard_error + self._estimate 

6083 return ConfidenceInterval(low=low, high=high) 

6084 

6085 

6086def pack_TtestResult(statistic, pvalue, df, alternative, standard_error, 

6087 estimate): 

6088 # this could be any number of dimensions (including 0d), but there is 

6089 # at most one unique value 

6090 alternative = np.atleast_1d(alternative).ravel() 

6091 alternative = alternative[0] if alternative.size else np.nan 

6092 return TtestResult(statistic, pvalue, df=df, alternative=alternative, 

6093 standard_error=standard_error, estimate=estimate) 

6094 

6095 

6096def unpack_TtestResult(res): 

6097 return (res.statistic, res.pvalue, res.df, res._alternative, 

6098 res._standard_error, res._estimate) 

6099 

6100 

6101@_axis_nan_policy_factory(pack_TtestResult, default_axis=0, n_samples=2, 

6102 result_to_tuple=unpack_TtestResult, n_outputs=6) 

6103def ttest_1samp(a, popmean, axis=0, nan_policy='propagate', 

6104 alternative="two-sided"): 

6105 """Calculate the T-test for the mean of ONE group of scores. 

6106 

6107 This is a test for the null hypothesis that the expected value 

6108 (mean) of a sample of independent observations `a` is equal to the given 

6109 population mean, `popmean`. 

6110 

6111 Parameters 

6112 ---------- 

6113 a : array_like 

6114 Sample observation. 

6115 popmean : float or array_like 

6116 Expected value in null hypothesis. If array_like, then its length along 

6117 `axis` must equal 1, and it must otherwise be broadcastable with `a`. 

6118 axis : int or None, optional 

6119 Axis along which to compute test; default is 0. If None, compute over 

6120 the whole array `a`. 

6121 nan_policy : {'propagate', 'raise', 'omit'}, optional 

6122 Defines how to handle when input contains nan. 

6123 The following options are available (default is 'propagate'): 

6124 

6125 * 'propagate': returns nan 

6126 * 'raise': throws an error 

6127 * 'omit': performs the calculations ignoring nan values 

6128 

6129 alternative : {'two-sided', 'less', 'greater'}, optional 

6130 Defines the alternative hypothesis. 

6131 The following options are available (default is 'two-sided'): 

6132 

6133 * 'two-sided': the mean of the underlying distribution of the sample 

6134 is different than the given population mean (`popmean`) 

6135 * 'less': the mean of the underlying distribution of the sample is 

6136 less than the given population mean (`popmean`) 

6137 * 'greater': the mean of the underlying distribution of the sample is 

6138 greater than the given population mean (`popmean`) 

6139 

6140 Returns 

6141 ------- 

6142 result : `~scipy.stats._result_classes.TtestResult` 

6143 An object with the following attributes: 

6144 

6145 statistic : float or array 

6146 The t-statistic. 

6147 pvalue : float or array 

6148 The p-value associated with the given alternative. 

6149 df : float or array 

6150 The number of degrees of freedom used in calculation of the 

6151 t-statistic; this is one less than the size of the sample 

6152 (``a.shape[axis]``). 

6153 

6154 .. versionadded:: 1.10.0 

6155 

6156 The object also has the following method: 

6157 

6158 confidence_interval(confidence_level=0.95) 

6159 Computes a confidence interval around the population 

6160 mean for the given confidence level. 

6161 The confidence interval is returned in a ``namedtuple`` with 

6162 fields `low` and `high`. 

6163 

6164 .. versionadded:: 1.10.0 

6165 

6166 Notes 

6167 ----- 

6168 The statistic is calculated as ``(np.mean(a) - popmean)/se``, where 

6169 ``se`` is the standard error. Therefore, the statistic will be positive 

6170 when the sample mean is greater than the population mean and negative when 

6171 the sample mean is less than the population mean. 

6172 

6173 Examples 

6174 -------- 

6175 Suppose we wish to test the null hypothesis that the mean of a population 

6176 is equal to 0.5. We choose a confidence level of 99%; that is, we will 

6177 reject the null hypothesis in favor of the alternative if the p-value is 

6178 less than 0.01. 

6179 

6180 When testing random variates from the standard uniform distribution, which 

6181 has a mean of 0.5, we expect the data to be consistent with the null 

6182 hypothesis most of the time. 

6183 

6184 >>> import numpy as np 

6185 >>> from scipy import stats 

6186 >>> rng = np.random.default_rng() 

6187 >>> rvs = stats.uniform.rvs(size=50, random_state=rng) 

6188 >>> stats.ttest_1samp(rvs, popmean=0.5) 

6189 TtestResult(statistic=2.456308468440, pvalue=0.017628209047638, df=49) 

6190 

6191 As expected, the p-value of 0.017 is not below our threshold of 0.01, so 

6192 we cannot reject the null hypothesis. 

6193 

6194 When testing data from the standard *normal* distribution, which has a mean 

6195 of 0, we would expect the null hypothesis to be rejected. 

6196 

6197 >>> rvs = stats.norm.rvs(size=50, random_state=rng) 

6198 >>> stats.ttest_1samp(rvs, popmean=0.5) 

6199 TtestResult(statistic=-7.433605518875, pvalue=1.416760157221e-09, df=49) 

6200 

6201 Indeed, the p-value is lower than our threshold of 0.01, so we reject the 

6202 null hypothesis in favor of the default "two-sided" alternative: the mean 

6203 of the population is *not* equal to 0.5. 

6204 

6205 However, suppose we were to test the null hypothesis against the 

6206 one-sided alternative that the mean of the population is *greater* than 

6207 0.5. Since the mean of the standard normal is less than 0.5, we would not 

6208 expect the null hypothesis to be rejected. 

6209 

6210 >>> stats.ttest_1samp(rvs, popmean=0.5, alternative='greater') 

6211 TtestResult(statistic=-7.433605518875, pvalue=0.99999999929, df=49) 

6212 

6213 Unsurprisingly, with a p-value greater than our threshold, we would not 

6214 reject the null hypothesis. 

6215 

6216 Note that when working with a confidence level of 99%, a true null 

6217 hypothesis will be rejected approximately 1% of the time. 

6218 

6219 >>> rvs = stats.uniform.rvs(size=(100, 50), random_state=rng) 

6220 >>> res = stats.ttest_1samp(rvs, popmean=0.5, axis=1) 

6221 >>> np.sum(res.pvalue < 0.01) 

6222 1 

6223 

6224 Indeed, even though all 100 samples above were drawn from the standard 

6225 uniform distribution, which *does* have a population mean of 0.5, we would 

6226 mistakenly reject the null hypothesis for one of them. 

6227 

6228 `ttest_1samp` can also compute a confidence interval around the population 

6229 mean. 

6230 

6231 >>> rvs = stats.norm.rvs(size=50, random_state=rng) 

6232 >>> res = stats.ttest_1samp(rvs, popmean=0) 

6233 >>> ci = res.confidence_interval(confidence_level=0.95) 

6234 >>> ci 

6235 ConfidenceInterval(low=-0.3193887540880017, high=0.2898583388980972) 

6236 

6237 The bounds of the 95% confidence interval are the 

6238 minimum and maximum values of the parameter `popmean` for which the 

6239 p-value of the test would be 0.05. 

6240 

6241 >>> res = stats.ttest_1samp(rvs, popmean=ci.low) 

6242 >>> np.testing.assert_allclose(res.pvalue, 0.05) 

6243 >>> res = stats.ttest_1samp(rvs, popmean=ci.high) 

6244 >>> np.testing.assert_allclose(res.pvalue, 0.05) 

6245 

6246 Under certain assumptions about the population from which a sample 

6247 is drawn, the confidence interval with confidence level 95% is expected 

6248 to contain the true population mean in 95% of sample replications. 

6249 

6250 >>> rvs = stats.norm.rvs(size=(50, 1000), loc=1, random_state=rng) 

6251 >>> res = stats.ttest_1samp(rvs, popmean=0) 

6252 >>> ci = res.confidence_interval() 

6253 >>> contains_pop_mean = (ci.low < 1) & (ci.high > 1) 

6254 >>> contains_pop_mean.sum() 

6255 953 

6256 

6257 """ 

6258 a, axis = _chk_asarray(a, axis) 

6259 

6260 n = a.shape[axis] 

6261 df = n - 1 

6262 

6263 mean = np.mean(a, axis) 

6264 try: 

6265 popmean = np.squeeze(popmean, axis=axis) 

6266 except ValueError as e: 

6267 raise ValueError("`popmean.shape[axis]` must equal 1.") from e 

6268 d = mean - popmean 

6269 v = _var(a, axis, ddof=1) 

6270 denom = np.sqrt(v / n) 

6271 

6272 with np.errstate(divide='ignore', invalid='ignore'): 

6273 t = np.divide(d, denom) 

6274 t, prob = _ttest_finish(df, t, alternative) 

6275 

6276 # when nan_policy='omit', `df` can be different for different axis-slices 

6277 df = np.broadcast_to(df, t.shape)[()] 

6278 # _axis_nan_policy decorator doesn't play well with strings 

6279 alternative_num = {"less": -1, "two-sided": 0, "greater": 1}[alternative] 

6280 return TtestResult(t, prob, df=df, alternative=alternative_num, 

6281 standard_error=denom, estimate=mean) 

6282 

6283 

6284def _t_confidence_interval(df, t, confidence_level, alternative): 

6285 # Input validation on `alternative` is already done 

6286 # We just need IV on confidence_level 

6287 if confidence_level < 0 or confidence_level > 1: 

6288 message = "`confidence_level` must be a number between 0 and 1." 

6289 raise ValueError(message) 

6290 

6291 if alternative < 0: # 'less' 

6292 p = confidence_level 

6293 low, high = np.broadcast_arrays(-np.inf, special.stdtrit(df, p)) 

6294 elif alternative > 0: # 'greater' 

6295 p = 1 - confidence_level 

6296 low, high = np.broadcast_arrays(special.stdtrit(df, p), np.inf) 

6297 elif alternative == 0: # 'two-sided' 

6298 tail_probability = (1 - confidence_level)/2 

6299 p = tail_probability, 1-tail_probability 

6300 # axis of p must be the zeroth and orthogonal to all the rest 

6301 p = np.reshape(p, [2] + [1]*np.asarray(df).ndim) 

6302 low, high = special.stdtrit(df, p) 

6303 else: # alternative is NaN when input is empty (see _axis_nan_policy) 

6304 p, nans = np.broadcast_arrays(t, np.nan) 

6305 low, high = nans, nans 

6306 

6307 return low[()], high[()] 

6308 

6309 

6310def _ttest_finish(df, t, alternative): 

6311 """Common code between all 3 t-test functions.""" 

6312 # We use ``stdtr`` directly here as it handles the case when ``nan`` 

6313 # values are present in the data and masked arrays are passed 

6314 # while ``t.cdf`` emits runtime warnings. This way ``_ttest_finish`` 

6315 # can be shared between the ``stats`` and ``mstats`` versions. 

6316 

6317 if alternative == 'less': 

6318 pval = special.stdtr(df, t) 

6319 elif alternative == 'greater': 

6320 pval = special.stdtr(df, -t) 

6321 elif alternative == 'two-sided': 

6322 pval = special.stdtr(df, -np.abs(t))*2 

6323 else: 

6324 raise ValueError("alternative must be " 

6325 "'less', 'greater' or 'two-sided'") 

6326 

6327 if t.ndim == 0: 

6328 t = t[()] 

6329 if pval.ndim == 0: 

6330 pval = pval[()] 

6331 

6332 return t, pval 

6333 

6334 

6335def _ttest_ind_from_stats(mean1, mean2, denom, df, alternative): 

6336 

6337 d = mean1 - mean2 

6338 with np.errstate(divide='ignore', invalid='ignore'): 

6339 t = np.divide(d, denom) 

6340 t, prob = _ttest_finish(df, t, alternative) 

6341 

6342 return (t, prob) 

6343 

6344 

6345def _unequal_var_ttest_denom(v1, n1, v2, n2): 

6346 vn1 = v1 / n1 

6347 vn2 = v2 / n2 

6348 with np.errstate(divide='ignore', invalid='ignore'): 

6349 df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1)) 

6350 

6351 # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0). 

6352 # Hence it doesn't matter what df is as long as it's not NaN. 

6353 df = np.where(np.isnan(df), 1, df) 

6354 denom = np.sqrt(vn1 + vn2) 

6355 return df, denom 

6356 

6357 

6358def _equal_var_ttest_denom(v1, n1, v2, n2): 

6359 df = n1 + n2 - 2.0 

6360 svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df 

6361 denom = np.sqrt(svar * (1.0 / n1 + 1.0 / n2)) 

6362 return df, denom 

6363 

6364 

6365Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue')) 

6366 

6367 

6368def ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2, 

6369 equal_var=True, alternative="two-sided"): 

6370 r""" 

6371 T-test for means of two independent samples from descriptive statistics. 

6372 

6373 This is a test for the null hypothesis that two independent 

6374 samples have identical average (expected) values. 

6375 

6376 Parameters 

6377 ---------- 

6378 mean1 : array_like 

6379 The mean(s) of sample 1. 

6380 std1 : array_like 

6381 The corrected sample standard deviation of sample 1 (i.e. ``ddof=1``). 

6382 nobs1 : array_like 

6383 The number(s) of observations of sample 1. 

6384 mean2 : array_like 

6385 The mean(s) of sample 2. 

6386 std2 : array_like 

6387 The corrected sample standard deviation of sample 2 (i.e. ``ddof=1``). 

6388 nobs2 : array_like 

6389 The number(s) of observations of sample 2. 

6390 equal_var : bool, optional 

6391 If True (default), perform a standard independent 2 sample test 

6392 that assumes equal population variances [1]_. 

6393 If False, perform Welch's t-test, which does not assume equal 

6394 population variance [2]_. 

6395 alternative : {'two-sided', 'less', 'greater'}, optional 

6396 Defines the alternative hypothesis. 

6397 The following options are available (default is 'two-sided'): 

6398 

6399 * 'two-sided': the means of the distributions are unequal. 

6400 * 'less': the mean of the first distribution is less than the 

6401 mean of the second distribution. 

6402 * 'greater': the mean of the first distribution is greater than the 

6403 mean of the second distribution. 

6404 

6405 .. versionadded:: 1.6.0 

6406 

6407 Returns 

6408 ------- 

6409 statistic : float or array 

6410 The calculated t-statistics. 

6411 pvalue : float or array 

6412 The two-tailed p-value. 

6413 

6414 See Also 

6415 -------- 

6416 scipy.stats.ttest_ind 

6417 

6418 Notes 

6419 ----- 

6420 The statistic is calculated as ``(mean1 - mean2)/se``, where ``se`` is the 

6421 standard error. Therefore, the statistic will be positive when `mean1` is 

6422 greater than `mean2` and negative when `mean1` is less than `mean2`. 

6423 

6424 References 

6425 ---------- 

6426 .. [1] https://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test 

6427 

6428 .. [2] https://en.wikipedia.org/wiki/Welch%27s_t-test 

6429 

6430 Examples 

6431 -------- 

6432 Suppose we have the summary data for two samples, as follows (with the 

6433 Sample Variance being the corrected sample variance):: 

6434 

6435 Sample Sample 

6436 Size Mean Variance 

6437 Sample 1 13 15.0 87.5 

6438 Sample 2 11 12.0 39.0 

6439 

6440 Apply the t-test to this data (with the assumption that the population 

6441 variances are equal): 

6442 

6443 >>> import numpy as np 

6444 >>> from scipy.stats import ttest_ind_from_stats 

6445 >>> ttest_ind_from_stats(mean1=15.0, std1=np.sqrt(87.5), nobs1=13, 

6446 ... mean2=12.0, std2=np.sqrt(39.0), nobs2=11) 

6447 Ttest_indResult(statistic=0.9051358093310269, pvalue=0.3751996797581487) 

6448 

6449 For comparison, here is the data from which those summary statistics 

6450 were taken. With this data, we can compute the same result using 

6451 `scipy.stats.ttest_ind`: 

6452 

6453 >>> a = np.array([1, 3, 4, 6, 11, 13, 15, 19, 22, 24, 25, 26, 26]) 

6454 >>> b = np.array([2, 4, 6, 9, 11, 13, 14, 15, 18, 19, 21]) 

6455 >>> from scipy.stats import ttest_ind 

6456 >>> ttest_ind(a, b) 

6457 Ttest_indResult(statistic=0.905135809331027, pvalue=0.3751996797581486) 

6458 

6459 Suppose we instead have binary data and would like to apply a t-test to 

6460 compare the proportion of 1s in two independent groups:: 

6461 

6462 Number of Sample Sample 

6463 Size ones Mean Variance 

6464 Sample 1 150 30 0.2 0.161073 

6465 Sample 2 200 45 0.225 0.175251 

6466 

6467 The sample mean :math:`\hat{p}` is the proportion of ones in the sample 

6468 and the variance for a binary observation is estimated by 

6469 :math:`\hat{p}(1-\hat{p})`. 

6470 

6471 >>> ttest_ind_from_stats(mean1=0.2, std1=np.sqrt(0.161073), nobs1=150, 

6472 ... mean2=0.225, std2=np.sqrt(0.175251), nobs2=200) 

6473 Ttest_indResult(statistic=-0.5627187905196761, pvalue=0.5739887114209541) 

6474 

6475 For comparison, we could compute the t statistic and p-value using 

6476 arrays of 0s and 1s and `scipy.stat.ttest_ind`, as above. 

6477 

6478 >>> group1 = np.array([1]*30 + [0]*(150-30)) 

6479 >>> group2 = np.array([1]*45 + [0]*(200-45)) 

6480 >>> ttest_ind(group1, group2) 

6481 Ttest_indResult(statistic=-0.5627179589855622, pvalue=0.573989277115258) 

6482 

6483 """ 

6484 mean1 = np.asarray(mean1) 

6485 std1 = np.asarray(std1) 

6486 mean2 = np.asarray(mean2) 

6487 std2 = np.asarray(std2) 

6488 if equal_var: 

6489 df, denom = _equal_var_ttest_denom(std1**2, nobs1, std2**2, nobs2) 

6490 else: 

6491 df, denom = _unequal_var_ttest_denom(std1**2, nobs1, 

6492 std2**2, nobs2) 

6493 

6494 res = _ttest_ind_from_stats(mean1, mean2, denom, df, alternative) 

6495 return Ttest_indResult(*res) 

6496 

6497 

6498def _ttest_nans(a, b, axis, namedtuple_type): 

6499 """ 

6500 Generate an array of `nan`, with shape determined by `a`, `b` and `axis`. 

6501 

6502 This function is used by ttest_ind and ttest_rel to create the return 

6503 value when one of the inputs has size 0. 

6504 

6505 The shapes of the arrays are determined by dropping `axis` from the 

6506 shapes of `a` and `b` and broadcasting what is left. 

6507 

6508 The return value is a named tuple of the type given in `namedtuple_type`. 

6509 

6510 Examples 

6511 -------- 

6512 >>> import numpy as np 

6513 >>> a = np.zeros((9, 2)) 

6514 >>> b = np.zeros((5, 1)) 

6515 >>> _ttest_nans(a, b, 0, Ttest_indResult) 

6516 Ttest_indResult(statistic=array([nan, nan]), pvalue=array([nan, nan])) 

6517 

6518 >>> a = np.zeros((3, 0, 9)) 

6519 >>> b = np.zeros((1, 10)) 

6520 >>> stat, p = _ttest_nans(a, b, -1, Ttest_indResult) 

6521 >>> stat 

6522 array([], shape=(3, 0), dtype=float64) 

6523 >>> p 

6524 array([], shape=(3, 0), dtype=float64) 

6525 

6526 >>> a = np.zeros(10) 

6527 >>> b = np.zeros(7) 

6528 >>> _ttest_nans(a, b, 0, Ttest_indResult) 

6529 Ttest_indResult(statistic=nan, pvalue=nan) 

6530 

6531 """ 

6532 shp = _broadcast_shapes_with_dropped_axis(a, b, axis) 

6533 if len(shp) == 0: 

6534 t = np.nan 

6535 p = np.nan 

6536 else: 

6537 t = np.full(shp, fill_value=np.nan) 

6538 p = t.copy() 

6539 return namedtuple_type(t, p) 

6540 

6541 

6542def ttest_ind(a, b, axis=0, equal_var=True, nan_policy='propagate', 

6543 permutations=None, random_state=None, alternative="two-sided", 

6544 trim=0): 

6545 """ 

6546 Calculate the T-test for the means of *two independent* samples of scores. 

6547 

6548 This is a test for the null hypothesis that 2 independent samples 

6549 have identical average (expected) values. This test assumes that the 

6550 populations have identical variances by default. 

6551 

6552 Parameters 

6553 ---------- 

6554 a, b : array_like 

6555 The arrays must have the same shape, except in the dimension 

6556 corresponding to `axis` (the first, by default). 

6557 axis : int or None, optional 

6558 Axis along which to compute test. If None, compute over the whole 

6559 arrays, `a`, and `b`. 

6560 equal_var : bool, optional 

6561 If True (default), perform a standard independent 2 sample test 

6562 that assumes equal population variances [1]_. 

6563 If False, perform Welch's t-test, which does not assume equal 

6564 population variance [2]_. 

6565 

6566 .. versionadded:: 0.11.0 

6567 

6568 nan_policy : {'propagate', 'raise', 'omit'}, optional 

6569 Defines how to handle when input contains nan. 

6570 The following options are available (default is 'propagate'): 

6571 

6572 * 'propagate': returns nan 

6573 * 'raise': throws an error 

6574 * 'omit': performs the calculations ignoring nan values 

6575 

6576 The 'omit' option is not currently available for permutation tests or 

6577 one-sided asympyotic tests. 

6578 

6579 permutations : non-negative int, np.inf, or None (default), optional 

6580 If 0 or None (default), use the t-distribution to calculate p-values. 

6581 Otherwise, `permutations` is the number of random permutations that 

6582 will be used to estimate p-values using a permutation test. If 

6583 `permutations` equals or exceeds the number of distinct partitions of 

6584 the pooled data, an exact test is performed instead (i.e. each 

6585 distinct partition is used exactly once). See Notes for details. 

6586 

6587 .. versionadded:: 1.7.0 

6588 

6589 random_state : {None, int, `numpy.random.Generator`, 

6590 `numpy.random.RandomState`}, optional 

6591 

6592 If `seed` is None (or `np.random`), the `numpy.random.RandomState` 

6593 singleton is used. 

6594 If `seed` is an int, a new ``RandomState`` instance is used, 

6595 seeded with `seed`. 

6596 If `seed` is already a ``Generator`` or ``RandomState`` instance then 

6597 that instance is used. 

6598 

6599 Pseudorandom number generator state used to generate permutations 

6600 (used only when `permutations` is not None). 

6601 

6602 .. versionadded:: 1.7.0 

6603 

6604 alternative : {'two-sided', 'less', 'greater'}, optional 

6605 Defines the alternative hypothesis. 

6606 The following options are available (default is 'two-sided'): 

6607 

6608 * 'two-sided': the means of the distributions underlying the samples 

6609 are unequal. 

6610 * 'less': the mean of the distribution underlying the first sample 

6611 is less than the mean of the distribution underlying the second 

6612 sample. 

6613 * 'greater': the mean of the distribution underlying the first 

6614 sample is greater than the mean of the distribution underlying 

6615 the second sample. 

6616 

6617 .. versionadded:: 1.6.0 

6618 

6619 trim : float, optional 

6620 If nonzero, performs a trimmed (Yuen's) t-test. 

6621 Defines the fraction of elements to be trimmed from each end of the 

6622 input samples. If 0 (default), no elements will be trimmed from either 

6623 side. The number of trimmed elements from each tail is the floor of the 

6624 trim times the number of elements. Valid range is [0, .5). 

6625 

6626 .. versionadded:: 1.7 

6627 

6628 Returns 

6629 ------- 

6630 statistic : float or array 

6631 The calculated t-statistic. 

6632 pvalue : float or array 

6633 The p-value. 

6634 

6635 Notes 

6636 ----- 

6637 Suppose we observe two independent samples, e.g. flower petal lengths, and 

6638 we are considering whether the two samples were drawn from the same 

6639 population (e.g. the same species of flower or two species with similar 

6640 petal characteristics) or two different populations. 

6641 

6642 The t-test quantifies the difference between the arithmetic means 

6643 of the two samples. The p-value quantifies the probability of observing 

6644 as or more extreme values assuming the null hypothesis, that the 

6645 samples are drawn from populations with the same population means, is true. 

6646 A p-value larger than a chosen threshold (e.g. 5% or 1%) indicates that 

6647 our observation is not so unlikely to have occurred by chance. Therefore, 

6648 we do not reject the null hypothesis of equal population means. 

6649 If the p-value is smaller than our threshold, then we have evidence 

6650 against the null hypothesis of equal population means. 

6651 

6652 By default, the p-value is determined by comparing the t-statistic of the 

6653 observed data against a theoretical t-distribution. 

6654 When ``1 < permutations < binom(n, k)``, where 

6655 

6656 * ``k`` is the number of observations in `a`, 

6657 * ``n`` is the total number of observations in `a` and `b`, and 

6658 * ``binom(n, k)`` is the binomial coefficient (``n`` choose ``k``), 

6659 

6660 the data are pooled (concatenated), randomly assigned to either group `a` 

6661 or `b`, and the t-statistic is calculated. This process is performed 

6662 repeatedly (`permutation` times), generating a distribution of the 

6663 t-statistic under the null hypothesis, and the t-statistic of the observed 

6664 data is compared to this distribution to determine the p-value. 

6665 Specifically, the p-value reported is the "achieved significance level" 

6666 (ASL) as defined in 4.4 of [3]_. Note that there are other ways of 

6667 estimating p-values using randomized permutation tests; for other 

6668 options, see the more general `permutation_test`. 

6669 

6670 When ``permutations >= binom(n, k)``, an exact test is performed: the data 

6671 are partitioned between the groups in each distinct way exactly once. 

6672 

6673 The permutation test can be computationally expensive and not necessarily 

6674 more accurate than the analytical test, but it does not make strong 

6675 assumptions about the shape of the underlying distribution. 

6676 

6677 Use of trimming is commonly referred to as the trimmed t-test. At times 

6678 called Yuen's t-test, this is an extension of Welch's t-test, with the 

6679 difference being the use of winsorized means in calculation of the variance 

6680 and the trimmed sample size in calculation of the statistic. Trimming is 

6681 recommended if the underlying distribution is long-tailed or contaminated 

6682 with outliers [4]_. 

6683 

6684 The statistic is calculated as ``(np.mean(a) - np.mean(b))/se``, where 

6685 ``se`` is the standard error. Therefore, the statistic will be positive 

6686 when the sample mean of `a` is greater than the sample mean of `b` and 

6687 negative when the sample mean of `a` is less than the sample mean of 

6688 `b`. 

6689 

6690 References 

6691 ---------- 

6692 .. [1] https://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test 

6693 

6694 .. [2] https://en.wikipedia.org/wiki/Welch%27s_t-test 

6695 

6696 .. [3] B. Efron and T. Hastie. Computer Age Statistical Inference. (2016). 

6697 

6698 .. [4] Yuen, Karen K. "The Two-Sample Trimmed t for Unequal Population 

6699 Variances." Biometrika, vol. 61, no. 1, 1974, pp. 165-170. JSTOR, 

6700 www.jstor.org/stable/2334299. Accessed 30 Mar. 2021. 

6701 

6702 .. [5] Yuen, Karen K., and W. J. Dixon. "The Approximate Behaviour and 

6703 Performance of the Two-Sample Trimmed t." Biometrika, vol. 60, 

6704 no. 2, 1973, pp. 369-374. JSTOR, www.jstor.org/stable/2334550. 

6705 Accessed 30 Mar. 2021. 

6706 

6707 Examples 

6708 -------- 

6709 >>> import numpy as np 

6710 >>> from scipy import stats 

6711 >>> rng = np.random.default_rng() 

6712 

6713 Test with sample with identical means: 

6714 

6715 >>> rvs1 = stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng) 

6716 >>> rvs2 = stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng) 

6717 >>> stats.ttest_ind(rvs1, rvs2) 

6718 Ttest_indResult(statistic=-0.4390847099199348, pvalue=0.6606952038870015) 

6719 >>> stats.ttest_ind(rvs1, rvs2, equal_var=False) 

6720 Ttest_indResult(statistic=-0.4390847099199348, pvalue=0.6606952553131064) 

6721 

6722 `ttest_ind` underestimates p for unequal variances: 

6723 

6724 >>> rvs3 = stats.norm.rvs(loc=5, scale=20, size=500, random_state=rng) 

6725 >>> stats.ttest_ind(rvs1, rvs3) 

6726 Ttest_indResult(statistic=-1.6370984482905417, pvalue=0.1019251574705033) 

6727 >>> stats.ttest_ind(rvs1, rvs3, equal_var=False) 

6728 Ttest_indResult(statistic=-1.637098448290542, pvalue=0.10202110497954867) 

6729 

6730 When ``n1 != n2``, the equal variance t-statistic is no longer equal to the 

6731 unequal variance t-statistic: 

6732 

6733 >>> rvs4 = stats.norm.rvs(loc=5, scale=20, size=100, random_state=rng) 

6734 >>> stats.ttest_ind(rvs1, rvs4) 

6735 Ttest_indResult(statistic=-1.9481646859513422, pvalue=0.05186270935842703) 

6736 >>> stats.ttest_ind(rvs1, rvs4, equal_var=False) 

6737 Ttest_indResult(statistic=-1.3146566100751664, pvalue=0.1913495266513811) 

6738 

6739 T-test with different means, variance, and n: 

6740 

6741 >>> rvs5 = stats.norm.rvs(loc=8, scale=20, size=100, random_state=rng) 

6742 >>> stats.ttest_ind(rvs1, rvs5) 

6743 Ttest_indResult(statistic=-2.8415950600298774, pvalue=0.0046418707568707885) 

6744 >>> stats.ttest_ind(rvs1, rvs5, equal_var=False) 

6745 Ttest_indResult(statistic=-1.8686598649188084, pvalue=0.06434714193919686) 

6746 

6747 When performing a permutation test, more permutations typically yields 

6748 more accurate results. Use a ``np.random.Generator`` to ensure 

6749 reproducibility: 

6750 

6751 >>> stats.ttest_ind(rvs1, rvs5, permutations=10000, 

6752 ... random_state=rng) 

6753 Ttest_indResult(statistic=-2.8415950600298774, pvalue=0.0052994700529947) 

6754 

6755 Take these two samples, one of which has an extreme tail. 

6756 

6757 >>> a = (56, 128.6, 12, 123.8, 64.34, 78, 763.3) 

6758 >>> b = (1.1, 2.9, 4.2) 

6759 

6760 Use the `trim` keyword to perform a trimmed (Yuen) t-test. For example, 

6761 using 20% trimming, ``trim=.2``, the test will reduce the impact of one 

6762 (``np.floor(trim*len(a))``) element from each tail of sample `a`. It will 

6763 have no effect on sample `b` because ``np.floor(trim*len(b))`` is 0. 

6764 

6765 >>> stats.ttest_ind(a, b, trim=.2) 

6766 Ttest_indResult(statistic=3.4463884028073513, 

6767 pvalue=0.01369338726499547) 

6768 """ 

6769 if not (0 <= trim < .5): 

6770 raise ValueError("Trimming percentage should be 0 <= `trim` < .5.") 

6771 

6772 a, b, axis = _chk2_asarray(a, b, axis) 

6773 

6774 # check both a and b 

6775 cna, npa = _contains_nan(a, nan_policy) 

6776 cnb, npb = _contains_nan(b, nan_policy) 

6777 contains_nan = cna or cnb 

6778 if npa == 'omit' or npb == 'omit': 

6779 nan_policy = 'omit' 

6780 

6781 if contains_nan and nan_policy == 'omit': 

6782 if permutations or trim != 0: 

6783 raise ValueError("nan-containing/masked inputs with " 

6784 "nan_policy='omit' are currently not " 

6785 "supported by permutation tests or " 

6786 "trimmed tests.") 

6787 a = ma.masked_invalid(a) 

6788 b = ma.masked_invalid(b) 

6789 return mstats_basic.ttest_ind(a, b, axis, equal_var, alternative) 

6790 

6791 if a.size == 0 or b.size == 0: 

6792 return _ttest_nans(a, b, axis, Ttest_indResult) 

6793 

6794 if permutations is not None and permutations != 0: 

6795 if trim != 0: 

6796 raise ValueError("Permutations are currently not supported " 

6797 "with trimming.") 

6798 if permutations < 0 or (np.isfinite(permutations) and 

6799 int(permutations) != permutations): 

6800 raise ValueError("Permutations must be a non-negative integer.") 

6801 

6802 res = _permutation_ttest(a, b, permutations=permutations, 

6803 axis=axis, equal_var=equal_var, 

6804 nan_policy=nan_policy, 

6805 random_state=random_state, 

6806 alternative=alternative) 

6807 

6808 else: 

6809 n1 = a.shape[axis] 

6810 n2 = b.shape[axis] 

6811 

6812 if trim == 0: 

6813 v1 = _var(a, axis, ddof=1) 

6814 v2 = _var(b, axis, ddof=1) 

6815 m1 = np.mean(a, axis) 

6816 m2 = np.mean(b, axis) 

6817 else: 

6818 v1, m1, n1 = _ttest_trim_var_mean_len(a, trim, axis) 

6819 v2, m2, n2 = _ttest_trim_var_mean_len(b, trim, axis) 

6820 

6821 if equal_var: 

6822 df, denom = _equal_var_ttest_denom(v1, n1, v2, n2) 

6823 else: 

6824 df, denom = _unequal_var_ttest_denom(v1, n1, v2, n2) 

6825 res = _ttest_ind_from_stats(m1, m2, denom, df, alternative) 

6826 return Ttest_indResult(*res) 

6827 

6828 

6829def _ttest_trim_var_mean_len(a, trim, axis): 

6830 """Variance, mean, and length of winsorized input along specified axis""" 

6831 # for use with `ttest_ind` when trimming. 

6832 # further calculations in this test assume that the inputs are sorted. 

6833 # From [4] Section 1 "Let x_1, ..., x_n be n ordered observations..." 

6834 a = np.sort(a, axis=axis) 

6835 

6836 # `g` is the number of elements to be replaced on each tail, converted 

6837 # from a percentage amount of trimming 

6838 n = a.shape[axis] 

6839 g = int(n * trim) 

6840 

6841 # Calculate the Winsorized variance of the input samples according to 

6842 # specified `g` 

6843 v = _calculate_winsorized_variance(a, g, axis) 

6844 

6845 # the total number of elements in the trimmed samples 

6846 n -= 2 * g 

6847 

6848 # calculate the g-times trimmed mean, as defined in [4] (1-1) 

6849 m = trim_mean(a, trim, axis=axis) 

6850 return v, m, n 

6851 

6852 

6853def _calculate_winsorized_variance(a, g, axis): 

6854 """Calculates g-times winsorized variance along specified axis""" 

6855 # it is expected that the input `a` is sorted along the correct axis 

6856 if g == 0: 

6857 return _var(a, ddof=1, axis=axis) 

6858 # move the intended axis to the end that way it is easier to manipulate 

6859 a_win = np.moveaxis(a, axis, -1) 

6860 

6861 # save where NaNs are for later use. 

6862 nans_indices = np.any(np.isnan(a_win), axis=-1) 

6863 

6864 # Winsorization and variance calculation are done in one step in [4] 

6865 # (1-3), but here winsorization is done first; replace the left and 

6866 # right sides with the repeating value. This can be see in effect in ( 

6867 # 1-3) in [4], where the leftmost and rightmost tails are replaced with 

6868 # `(g + 1) * x_{g + 1}` on the left and `(g + 1) * x_{n - g}` on the 

6869 # right. Zero-indexing turns `g + 1` to `g`, and `n - g` to `- g - 1` in 

6870 # array indexing. 

6871 a_win[..., :g] = a_win[..., [g]] 

6872 a_win[..., -g:] = a_win[..., [-g - 1]] 

6873 

6874 # Determine the variance. In [4], the degrees of freedom is expressed as 

6875 # `h - 1`, where `h = n - 2g` (unnumbered equations in Section 1, end of 

6876 # page 369, beginning of page 370). This is converted to NumPy's format, 

6877 # `n - ddof` for use with `np.var`. The result is converted to an 

6878 # array to accommodate indexing later. 

6879 var_win = np.asarray(_var(a_win, ddof=(2 * g + 1), axis=-1)) 

6880 

6881 # with `nan_policy='propagate'`, NaNs may be completely trimmed out 

6882 # because they were sorted into the tail of the array. In these cases, 

6883 # replace computed variances with `np.nan`. 

6884 var_win[nans_indices] = np.nan 

6885 return var_win 

6886 

6887 

6888def _permutation_distribution_t(data, permutations, size_a, equal_var, 

6889 random_state=None): 

6890 """Generation permutation distribution of t statistic""" 

6891 

6892 random_state = check_random_state(random_state) 

6893 

6894 # prepare permutation indices 

6895 size = data.shape[-1] 

6896 # number of distinct combinations 

6897 n_max = special.comb(size, size_a) 

6898 

6899 if permutations < n_max: 

6900 perm_generator = (random_state.permutation(size) 

6901 for i in range(permutations)) 

6902 else: 

6903 permutations = n_max 

6904 perm_generator = (np.concatenate(z) 

6905 for z in _all_partitions(size_a, size-size_a)) 

6906 

6907 t_stat = [] 

6908 for indices in _batch_generator(perm_generator, batch=50): 

6909 # get one batch from perm_generator at a time as a list 

6910 indices = np.array(indices) 

6911 # generate permutations 

6912 data_perm = data[..., indices] 

6913 # move axis indexing permutations to position 0 to broadcast 

6914 # nicely with t_stat_observed, which doesn't have this dimension 

6915 data_perm = np.moveaxis(data_perm, -2, 0) 

6916 

6917 a = data_perm[..., :size_a] 

6918 b = data_perm[..., size_a:] 

6919 t_stat.append(_calc_t_stat(a, b, equal_var)) 

6920 

6921 t_stat = np.concatenate(t_stat, axis=0) 

6922 

6923 return t_stat, permutations, n_max 

6924 

6925 

6926def _calc_t_stat(a, b, equal_var, axis=-1): 

6927 """Calculate the t statistic along the given dimension.""" 

6928 na = a.shape[axis] 

6929 nb = b.shape[axis] 

6930 avg_a = np.mean(a, axis=axis) 

6931 avg_b = np.mean(b, axis=axis) 

6932 var_a = _var(a, axis=axis, ddof=1) 

6933 var_b = _var(b, axis=axis, ddof=1) 

6934 

6935 if not equal_var: 

6936 denom = _unequal_var_ttest_denom(var_a, na, var_b, nb)[1] 

6937 else: 

6938 denom = _equal_var_ttest_denom(var_a, na, var_b, nb)[1] 

6939 

6940 return (avg_a-avg_b)/denom 

6941 

6942 

6943def _permutation_ttest(a, b, permutations, axis=0, equal_var=True, 

6944 nan_policy='propagate', random_state=None, 

6945 alternative="two-sided"): 

6946 """ 

6947 Calculates the T-test for the means of TWO INDEPENDENT samples of scores 

6948 using permutation methods. 

6949 

6950 This test is similar to `stats.ttest_ind`, except it doesn't rely on an 

6951 approximate normality assumption since it uses a permutation test. 

6952 This function is only called from ttest_ind when permutations is not None. 

6953 

6954 Parameters 

6955 ---------- 

6956 a, b : array_like 

6957 The arrays must be broadcastable, except along the dimension 

6958 corresponding to `axis` (the zeroth, by default). 

6959 axis : int, optional 

6960 The axis over which to operate on a and b. 

6961 permutations : int, optional 

6962 Number of permutations used to calculate p-value. If greater than or 

6963 equal to the number of distinct permutations, perform an exact test. 

6964 equal_var : bool, optional 

6965 If False, an equal variance (Welch's) t-test is conducted. Otherwise, 

6966 an ordinary t-test is conducted. 

6967 random_state : {None, int, `numpy.random.Generator`}, optional 

6968 If `seed` is None the `numpy.random.Generator` singleton is used. 

6969 If `seed` is an int, a new ``Generator`` instance is used, 

6970 seeded with `seed`. 

6971 If `seed` is already a ``Generator`` instance then that instance is 

6972 used. 

6973 Pseudorandom number generator state used for generating random 

6974 permutations. 

6975 

6976 Returns 

6977 ------- 

6978 statistic : float or array 

6979 The calculated t-statistic. 

6980 pvalue : float or array 

6981 The p-value. 

6982 

6983 """ 

6984 random_state = check_random_state(random_state) 

6985 

6986 t_stat_observed = _calc_t_stat(a, b, equal_var, axis=axis) 

6987 

6988 na = a.shape[axis] 

6989 mat = _broadcast_concatenate((a, b), axis=axis) 

6990 mat = np.moveaxis(mat, axis, -1) 

6991 

6992 t_stat, permutations, n_max = _permutation_distribution_t( 

6993 mat, permutations, size_a=na, equal_var=equal_var, 

6994 random_state=random_state) 

6995 

6996 compare = {"less": np.less_equal, 

6997 "greater": np.greater_equal, 

6998 "two-sided": lambda x, y: (x <= -np.abs(y)) | (x >= np.abs(y))} 

6999 

7000 # Calculate the p-values 

7001 cmps = compare[alternative](t_stat, t_stat_observed) 

7002 # Randomized test p-value calculation should use biased estimate; see e.g. 

7003 # https://www.degruyter.com/document/doi/10.2202/1544-6115.1585/ 

7004 adjustment = 1 if n_max > permutations else 0 

7005 pvalues = (cmps.sum(axis=0) + adjustment) / (permutations + adjustment) 

7006 

7007 # nans propagate naturally in statistic calculation, but need to be 

7008 # propagated manually into pvalues 

7009 if nan_policy == 'propagate' and np.isnan(t_stat_observed).any(): 

7010 if np.ndim(pvalues) == 0: 

7011 pvalues = np.float64(np.nan) 

7012 else: 

7013 pvalues[np.isnan(t_stat_observed)] = np.nan 

7014 

7015 return (t_stat_observed, pvalues) 

7016 

7017 

7018def _get_len(a, axis, msg): 

7019 try: 

7020 n = a.shape[axis] 

7021 except IndexError: 

7022 raise np.AxisError(axis, a.ndim, msg) from None 

7023 return n 

7024 

7025 

7026@_axis_nan_policy_factory(pack_TtestResult, default_axis=0, n_samples=2, 

7027 result_to_tuple=unpack_TtestResult, n_outputs=6, 

7028 paired=True) 

7029def ttest_rel(a, b, axis=0, nan_policy='propagate', alternative="two-sided"): 

7030 """Calculate the t-test on TWO RELATED samples of scores, a and b. 

7031 

7032 This is a test for the null hypothesis that two related or 

7033 repeated samples have identical average (expected) values. 

7034 

7035 Parameters 

7036 ---------- 

7037 a, b : array_like 

7038 The arrays must have the same shape. 

7039 axis : int or None, optional 

7040 Axis along which to compute test. If None, compute over the whole 

7041 arrays, `a`, and `b`. 

7042 nan_policy : {'propagate', 'raise', 'omit'}, optional 

7043 Defines how to handle when input contains nan. 

7044 The following options are available (default is 'propagate'): 

7045 

7046 * 'propagate': returns nan 

7047 * 'raise': throws an error 

7048 * 'omit': performs the calculations ignoring nan values 

7049 alternative : {'two-sided', 'less', 'greater'}, optional 

7050 Defines the alternative hypothesis. 

7051 The following options are available (default is 'two-sided'): 

7052 

7053 * 'two-sided': the means of the distributions underlying the samples 

7054 are unequal. 

7055 * 'less': the mean of the distribution underlying the first sample 

7056 is less than the mean of the distribution underlying the second 

7057 sample. 

7058 * 'greater': the mean of the distribution underlying the first 

7059 sample is greater than the mean of the distribution underlying 

7060 the second sample. 

7061 

7062 .. versionadded:: 1.6.0 

7063 

7064 Returns 

7065 ------- 

7066 result : `~scipy.stats._result_classes.TtestResult` 

7067 An object with the following attributes: 

7068 

7069 statistic : float or array 

7070 The t-statistic. 

7071 pvalue : float or array 

7072 The p-value associated with the given alternative. 

7073 df : float or array 

7074 The number of degrees of freedom used in calculation of the 

7075 t-statistic; this is one less than the size of the sample 

7076 (``a.shape[axis]``). 

7077 

7078 .. versionadded:: 1.10.0 

7079 

7080 The object also has the following method: 

7081 

7082 confidence_interval(confidence_level=0.95) 

7083 Computes a confidence interval around the difference in 

7084 population means for the given confidence level. 

7085 The confidence interval is returned in a ``namedtuple`` with 

7086 fields `low` and `high`. 

7087 

7088 .. versionadded:: 1.10.0 

7089 

7090 Notes 

7091 ----- 

7092 Examples for use are scores of the same set of student in 

7093 different exams, or repeated sampling from the same units. The 

7094 test measures whether the average score differs significantly 

7095 across samples (e.g. exams). If we observe a large p-value, for 

7096 example greater than 0.05 or 0.1 then we cannot reject the null 

7097 hypothesis of identical average scores. If the p-value is smaller 

7098 than the threshold, e.g. 1%, 5% or 10%, then we reject the null 

7099 hypothesis of equal averages. Small p-values are associated with 

7100 large t-statistics. 

7101 

7102 The t-statistic is calculated as ``np.mean(a - b)/se``, where ``se`` is the 

7103 standard error. Therefore, the t-statistic will be positive when the sample 

7104 mean of ``a - b`` is greater than zero and negative when the sample mean of 

7105 ``a - b`` is less than zero. 

7106 

7107 References 

7108 ---------- 

7109 https://en.wikipedia.org/wiki/T-test#Dependent_t-test_for_paired_samples 

7110 

7111 Examples 

7112 -------- 

7113 >>> import numpy as np 

7114 >>> from scipy import stats 

7115 >>> rng = np.random.default_rng() 

7116 

7117 >>> rvs1 = stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng) 

7118 >>> rvs2 = (stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng) 

7119 ... + stats.norm.rvs(scale=0.2, size=500, random_state=rng)) 

7120 >>> stats.ttest_rel(rvs1, rvs2) 

7121 TtestResult(statistic=-0.4549717054410304, pvalue=0.6493274702088672, df=499) # noqa 

7122 >>> rvs3 = (stats.norm.rvs(loc=8, scale=10, size=500, random_state=rng) 

7123 ... + stats.norm.rvs(scale=0.2, size=500, random_state=rng)) 

7124 >>> stats.ttest_rel(rvs1, rvs3) 

7125 TtestResult(statistic=-5.879467544540889, pvalue=7.540777129099917e-09, df=499) # noqa 

7126 

7127 """ 

7128 a, b, axis = _chk2_asarray(a, b, axis) 

7129 

7130 na = _get_len(a, axis, "first argument") 

7131 nb = _get_len(b, axis, "second argument") 

7132 if na != nb: 

7133 raise ValueError('unequal length arrays') 

7134 

7135 if na == 0 or nb == 0: 

7136 # _axis_nan_policy decorator ensures this only happens with 1d input 

7137 return TtestResult(np.nan, np.nan, df=np.nan, alternative=np.nan, 

7138 standard_error=np.nan, estimate=np.nan) 

7139 

7140 n = a.shape[axis] 

7141 df = n - 1 

7142 

7143 d = (a - b).astype(np.float64) 

7144 v = _var(d, axis, ddof=1) 

7145 dm = np.mean(d, axis) 

7146 denom = np.sqrt(v / n) 

7147 

7148 with np.errstate(divide='ignore', invalid='ignore'): 

7149 t = np.divide(dm, denom) 

7150 t, prob = _ttest_finish(df, t, alternative) 

7151 

7152 # when nan_policy='omit', `df` can be different for different axis-slices 

7153 df = np.broadcast_to(df, t.shape)[()] 

7154 

7155 # _axis_nan_policy decorator doesn't play well with strings 

7156 alternative_num = {"less": -1, "two-sided": 0, "greater": 1}[alternative] 

7157 return TtestResult(t, prob, df=df, alternative=alternative_num, 

7158 standard_error=denom, estimate=dm) 

7159 

7160 

7161# Map from names to lambda_ values used in power_divergence(). 

7162_power_div_lambda_names = { 

7163 "pearson": 1, 

7164 "log-likelihood": 0, 

7165 "freeman-tukey": -0.5, 

7166 "mod-log-likelihood": -1, 

7167 "neyman": -2, 

7168 "cressie-read": 2/3, 

7169} 

7170 

7171 

7172def _count(a, axis=None): 

7173 """Count the number of non-masked elements of an array. 

7174 

7175 This function behaves like `np.ma.count`, but is much faster 

7176 for ndarrays. 

7177 """ 

7178 if hasattr(a, 'count'): 

7179 num = a.count(axis=axis) 

7180 if isinstance(num, np.ndarray) and num.ndim == 0: 

7181 # In some cases, the `count` method returns a scalar array (e.g. 

7182 # np.array(3)), but we want a plain integer. 

7183 num = int(num) 

7184 else: 

7185 if axis is None: 

7186 num = a.size 

7187 else: 

7188 num = a.shape[axis] 

7189 return num 

7190 

7191 

7192def _m_broadcast_to(a, shape): 

7193 if np.ma.isMaskedArray(a): 

7194 return np.ma.masked_array(np.broadcast_to(a, shape), 

7195 mask=np.broadcast_to(a.mask, shape)) 

7196 return np.broadcast_to(a, shape, subok=True) 

7197 

7198 

7199Power_divergenceResult = namedtuple('Power_divergenceResult', 

7200 ('statistic', 'pvalue')) 

7201 

7202 

7203def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None): 

7204 """Cressie-Read power divergence statistic and goodness of fit test. 

7205 

7206 This function tests the null hypothesis that the categorical data 

7207 has the given frequencies, using the Cressie-Read power divergence 

7208 statistic. 

7209 

7210 Parameters 

7211 ---------- 

7212 f_obs : array_like 

7213 Observed frequencies in each category. 

7214 f_exp : array_like, optional 

7215 Expected frequencies in each category. By default the categories are 

7216 assumed to be equally likely. 

7217 ddof : int, optional 

7218 "Delta degrees of freedom": adjustment to the degrees of freedom 

7219 for the p-value. The p-value is computed using a chi-squared 

7220 distribution with ``k - 1 - ddof`` degrees of freedom, where `k` 

7221 is the number of observed frequencies. The default value of `ddof` 

7222 is 0. 

7223 axis : int or None, optional 

7224 The axis of the broadcast result of `f_obs` and `f_exp` along which to 

7225 apply the test. If axis is None, all values in `f_obs` are treated 

7226 as a single data set. Default is 0. 

7227 lambda_ : float or str, optional 

7228 The power in the Cressie-Read power divergence statistic. The default 

7229 is 1. For convenience, `lambda_` may be assigned one of the following 

7230 strings, in which case the corresponding numerical value is used: 

7231 

7232 * ``"pearson"`` (value 1) 

7233 Pearson's chi-squared statistic. In this case, the function is 

7234 equivalent to `chisquare`. 

7235 * ``"log-likelihood"`` (value 0) 

7236 Log-likelihood ratio. Also known as the G-test [3]_. 

7237 * ``"freeman-tukey"`` (value -1/2) 

7238 Freeman-Tukey statistic. 

7239 * ``"mod-log-likelihood"`` (value -1) 

7240 Modified log-likelihood ratio. 

7241 * ``"neyman"`` (value -2) 

7242 Neyman's statistic. 

7243 * ``"cressie-read"`` (value 2/3) 

7244 The power recommended in [5]_. 

7245 

7246 Returns 

7247 ------- 

7248 statistic : float or ndarray 

7249 The Cressie-Read power divergence test statistic. The value is 

7250 a float if `axis` is None or if` `f_obs` and `f_exp` are 1-D. 

7251 pvalue : float or ndarray 

7252 The p-value of the test. The value is a float if `ddof` and the 

7253 return value `stat` are scalars. 

7254 

7255 See Also 

7256 -------- 

7257 chisquare 

7258 

7259 Notes 

7260 ----- 

7261 This test is invalid when the observed or expected frequencies in each 

7262 category are too small. A typical rule is that all of the observed 

7263 and expected frequencies should be at least 5. 

7264 

7265 Also, the sum of the observed and expected frequencies must be the same 

7266 for the test to be valid; `power_divergence` raises an error if the sums 

7267 do not agree within a relative tolerance of ``1e-8``. 

7268 

7269 When `lambda_` is less than zero, the formula for the statistic involves 

7270 dividing by `f_obs`, so a warning or error may be generated if any value 

7271 in `f_obs` is 0. 

7272 

7273 Similarly, a warning or error may be generated if any value in `f_exp` is 

7274 zero when `lambda_` >= 0. 

7275 

7276 The default degrees of freedom, k-1, are for the case when no parameters 

7277 of the distribution are estimated. If p parameters are estimated by 

7278 efficient maximum likelihood then the correct degrees of freedom are 

7279 k-1-p. If the parameters are estimated in a different way, then the 

7280 dof can be between k-1-p and k-1. However, it is also possible that 

7281 the asymptotic distribution is not a chisquare, in which case this 

7282 test is not appropriate. 

7283 

7284 This function handles masked arrays. If an element of `f_obs` or `f_exp` 

7285 is masked, then data at that position is ignored, and does not count 

7286 towards the size of the data set. 

7287 

7288 .. versionadded:: 0.13.0 

7289 

7290 References 

7291 ---------- 

7292 .. [1] Lowry, Richard. "Concepts and Applications of Inferential 

7293 Statistics". Chapter 8. 

7294 https://web.archive.org/web/20171015035606/http://faculty.vassar.edu/lowry/ch8pt1.html 

7295 .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test 

7296 .. [3] "G-test", https://en.wikipedia.org/wiki/G-test 

7297 .. [4] Sokal, R. R. and Rohlf, F. J. "Biometry: the principles and 

7298 practice of statistics in biological research", New York: Freeman 

7299 (1981) 

7300 .. [5] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit 

7301 Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984), 

7302 pp. 440-464. 

7303 

7304 Examples 

7305 -------- 

7306 (See `chisquare` for more examples.) 

7307 

7308 When just `f_obs` is given, it is assumed that the expected frequencies 

7309 are uniform and given by the mean of the observed frequencies. Here we 

7310 perform a G-test (i.e. use the log-likelihood ratio statistic): 

7311 

7312 >>> import numpy as np 

7313 >>> from scipy.stats import power_divergence 

7314 >>> power_divergence([16, 18, 16, 14, 12, 12], lambda_='log-likelihood') 

7315 (2.006573162632538, 0.84823476779463769) 

7316 

7317 The expected frequencies can be given with the `f_exp` argument: 

7318 

7319 >>> power_divergence([16, 18, 16, 14, 12, 12], 

7320 ... f_exp=[16, 16, 16, 16, 16, 8], 

7321 ... lambda_='log-likelihood') 

7322 (3.3281031458963746, 0.6495419288047497) 

7323 

7324 When `f_obs` is 2-D, by default the test is applied to each column. 

7325 

7326 >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T 

7327 >>> obs.shape 

7328 (6, 2) 

7329 >>> power_divergence(obs, lambda_="log-likelihood") 

7330 (array([ 2.00657316, 6.77634498]), array([ 0.84823477, 0.23781225])) 

7331 

7332 By setting ``axis=None``, the test is applied to all data in the array, 

7333 which is equivalent to applying the test to the flattened array. 

7334 

7335 >>> power_divergence(obs, axis=None) 

7336 (23.31034482758621, 0.015975692534127565) 

7337 >>> power_divergence(obs.ravel()) 

7338 (23.31034482758621, 0.015975692534127565) 

7339 

7340 `ddof` is the change to make to the default degrees of freedom. 

7341 

7342 >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=1) 

7343 (2.0, 0.73575888234288467) 

7344 

7345 The calculation of the p-values is done by broadcasting the 

7346 test statistic with `ddof`. 

7347 

7348 >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=[0,1,2]) 

7349 (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ])) 

7350 

7351 `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has 

7352 shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting 

7353 `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared 

7354 statistics, we must use ``axis=1``: 

7355 

7356 >>> power_divergence([16, 18, 16, 14, 12, 12], 

7357 ... f_exp=[[16, 16, 16, 16, 16, 8], 

7358 ... [8, 20, 20, 16, 12, 12]], 

7359 ... axis=1) 

7360 (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) 

7361 

7362 """ 

7363 # Convert the input argument `lambda_` to a numerical value. 

7364 if isinstance(lambda_, str): 

7365 if lambda_ not in _power_div_lambda_names: 

7366 names = repr(list(_power_div_lambda_names.keys()))[1:-1] 

7367 raise ValueError("invalid string for lambda_: {0!r}. " 

7368 "Valid strings are {1}".format(lambda_, names)) 

7369 lambda_ = _power_div_lambda_names[lambda_] 

7370 elif lambda_ is None: 

7371 lambda_ = 1 

7372 

7373 f_obs = np.asanyarray(f_obs) 

7374 f_obs_float = f_obs.astype(np.float64) 

7375 

7376 if f_exp is not None: 

7377 f_exp = np.asanyarray(f_exp) 

7378 bshape = _broadcast_shapes(f_obs_float.shape, f_exp.shape) 

7379 f_obs_float = _m_broadcast_to(f_obs_float, bshape) 

7380 f_exp = _m_broadcast_to(f_exp, bshape) 

7381 rtol = 1e-8 # to pass existing tests 

7382 with np.errstate(invalid='ignore'): 

7383 f_obs_sum = f_obs_float.sum(axis=axis) 

7384 f_exp_sum = f_exp.sum(axis=axis) 

7385 relative_diff = (np.abs(f_obs_sum - f_exp_sum) / 

7386 np.minimum(f_obs_sum, f_exp_sum)) 

7387 diff_gt_tol = (relative_diff > rtol).any() 

7388 if diff_gt_tol: 

7389 msg = (f"For each axis slice, the sum of the observed " 

7390 f"frequencies must agree with the sum of the " 

7391 f"expected frequencies to a relative tolerance " 

7392 f"of {rtol}, but the percent differences are:\n" 

7393 f"{relative_diff}") 

7394 raise ValueError(msg) 

7395 

7396 else: 

7397 # Ignore 'invalid' errors so the edge case of a data set with length 0 

7398 # is handled without spurious warnings. 

7399 with np.errstate(invalid='ignore'): 

7400 f_exp = f_obs.mean(axis=axis, keepdims=True) 

7401 

7402 # `terms` is the array of terms that are summed along `axis` to create 

7403 # the test statistic. We use some specialized code for a few special 

7404 # cases of lambda_. 

7405 if lambda_ == 1: 

7406 # Pearson's chi-squared statistic 

7407 terms = (f_obs_float - f_exp)**2 / f_exp 

7408 elif lambda_ == 0: 

7409 # Log-likelihood ratio (i.e. G-test) 

7410 terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp) 

7411 elif lambda_ == -1: 

7412 # Modified log-likelihood ratio 

7413 terms = 2.0 * special.xlogy(f_exp, f_exp / f_obs) 

7414 else: 

7415 # General Cressie-Read power divergence. 

7416 terms = f_obs * ((f_obs / f_exp)**lambda_ - 1) 

7417 terms /= 0.5 * lambda_ * (lambda_ + 1) 

7418 

7419 stat = terms.sum(axis=axis) 

7420 

7421 num_obs = _count(terms, axis=axis) 

7422 ddof = asarray(ddof) 

7423 p = distributions.chi2.sf(stat, num_obs - 1 - ddof) 

7424 

7425 return Power_divergenceResult(stat, p) 

7426 

7427 

7428def chisquare(f_obs, f_exp=None, ddof=0, axis=0): 

7429 """Calculate a one-way chi-square test. 

7430 

7431 The chi-square test tests the null hypothesis that the categorical data 

7432 has the given frequencies. 

7433 

7434 Parameters 

7435 ---------- 

7436 f_obs : array_like 

7437 Observed frequencies in each category. 

7438 f_exp : array_like, optional 

7439 Expected frequencies in each category. By default the categories are 

7440 assumed to be equally likely. 

7441 ddof : int, optional 

7442 "Delta degrees of freedom": adjustment to the degrees of freedom 

7443 for the p-value. The p-value is computed using a chi-squared 

7444 distribution with ``k - 1 - ddof`` degrees of freedom, where `k` 

7445 is the number of observed frequencies. The default value of `ddof` 

7446 is 0. 

7447 axis : int or None, optional 

7448 The axis of the broadcast result of `f_obs` and `f_exp` along which to 

7449 apply the test. If axis is None, all values in `f_obs` are treated 

7450 as a single data set. Default is 0. 

7451 

7452 Returns 

7453 ------- 

7454 chisq : float or ndarray 

7455 The chi-squared test statistic. The value is a float if `axis` is 

7456 None or `f_obs` and `f_exp` are 1-D. 

7457 p : float or ndarray 

7458 The p-value of the test. The value is a float if `ddof` and the 

7459 return value `chisq` are scalars. 

7460 

7461 See Also 

7462 -------- 

7463 scipy.stats.power_divergence 

7464 scipy.stats.fisher_exact : Fisher exact test on a 2x2 contingency table. 

7465 scipy.stats.barnard_exact : An unconditional exact test. An alternative 

7466 to chi-squared test for small sample sizes. 

7467 

7468 Notes 

7469 ----- 

7470 This test is invalid when the observed or expected frequencies in each 

7471 category are too small. A typical rule is that all of the observed 

7472 and expected frequencies should be at least 5. According to [3]_, the 

7473 total number of samples is recommended to be greater than 13, 

7474 otherwise exact tests (such as Barnard's Exact test) should be used 

7475 because they do not overreject. 

7476 

7477 Also, the sum of the observed and expected frequencies must be the same 

7478 for the test to be valid; `chisquare` raises an error if the sums do not 

7479 agree within a relative tolerance of ``1e-8``. 

7480 

7481 The default degrees of freedom, k-1, are for the case when no parameters 

7482 of the distribution are estimated. If p parameters are estimated by 

7483 efficient maximum likelihood then the correct degrees of freedom are 

7484 k-1-p. If the parameters are estimated in a different way, then the 

7485 dof can be between k-1-p and k-1. However, it is also possible that 

7486 the asymptotic distribution is not chi-square, in which case this test 

7487 is not appropriate. 

7488 

7489 References 

7490 ---------- 

7491 .. [1] Lowry, Richard. "Concepts and Applications of Inferential 

7492 Statistics". Chapter 8. 

7493 https://web.archive.org/web/20171022032306/http://vassarstats.net:80/textbook/ch8pt1.html 

7494 .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test 

7495 .. [3] Pearson, Karl. "On the criterion that a given system of deviations from the probable 

7496 in the case of a correlated system of variables is such that it can be reasonably 

7497 supposed to have arisen from random sampling", Philosophical Magazine. Series 5. 50 

7498 (1900), pp. 157-175. 

7499 

7500 Examples 

7501 -------- 

7502 When just `f_obs` is given, it is assumed that the expected frequencies 

7503 are uniform and given by the mean of the observed frequencies. 

7504 

7505 >>> import numpy as np 

7506 >>> from scipy.stats import chisquare 

7507 >>> chisquare([16, 18, 16, 14, 12, 12]) 

7508 (2.0, 0.84914503608460956) 

7509 

7510 With `f_exp` the expected frequencies can be given. 

7511 

7512 >>> chisquare([16, 18, 16, 14, 12, 12], f_exp=[16, 16, 16, 16, 16, 8]) 

7513 (3.5, 0.62338762774958223) 

7514 

7515 When `f_obs` is 2-D, by default the test is applied to each column. 

7516 

7517 >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T 

7518 >>> obs.shape 

7519 (6, 2) 

7520 >>> chisquare(obs) 

7521 (array([ 2. , 6.66666667]), array([ 0.84914504, 0.24663415])) 

7522 

7523 By setting ``axis=None``, the test is applied to all data in the array, 

7524 which is equivalent to applying the test to the flattened array. 

7525 

7526 >>> chisquare(obs, axis=None) 

7527 (23.31034482758621, 0.015975692534127565) 

7528 >>> chisquare(obs.ravel()) 

7529 (23.31034482758621, 0.015975692534127565) 

7530 

7531 `ddof` is the change to make to the default degrees of freedom. 

7532 

7533 >>> chisquare([16, 18, 16, 14, 12, 12], ddof=1) 

7534 (2.0, 0.73575888234288467) 

7535 

7536 The calculation of the p-values is done by broadcasting the 

7537 chi-squared statistic with `ddof`. 

7538 

7539 >>> chisquare([16, 18, 16, 14, 12, 12], ddof=[0,1,2]) 

7540 (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ])) 

7541 

7542 `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has 

7543 shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting 

7544 `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared 

7545 statistics, we use ``axis=1``: 

7546 

7547 >>> chisquare([16, 18, 16, 14, 12, 12], 

7548 ... f_exp=[[16, 16, 16, 16, 16, 8], [8, 20, 20, 16, 12, 12]], 

7549 ... axis=1) 

7550 (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) 

7551 

7552 """ 

7553 return power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis, 

7554 lambda_="pearson") 

7555 

7556 

7557KstestResult = _make_tuple_bunch('KstestResult', ['statistic', 'pvalue'], 

7558 ['statistic_location', 'statistic_sign']) 

7559 

7560 

7561def _compute_dplus(cdfvals, x): 

7562 """Computes D+ as used in the Kolmogorov-Smirnov test. 

7563 

7564 Parameters 

7565 ---------- 

7566 cdfvals : array_like 

7567 Sorted array of CDF values between 0 and 1 

7568 x: array_like 

7569 Sorted array of the stochastic variable itself 

7570 

7571 Returns 

7572 ------- 

7573 res: Pair with the following elements: 

7574 - The maximum distance of the CDF values below Uniform(0, 1). 

7575 - The location at which the maximum is reached. 

7576 

7577 """ 

7578 n = len(cdfvals) 

7579 dplus = (np.arange(1.0, n + 1) / n - cdfvals) 

7580 amax = dplus.argmax() 

7581 loc_max = x[amax] 

7582 return (dplus[amax], loc_max) 

7583 

7584 

7585def _compute_dminus(cdfvals, x): 

7586 """Computes D- as used in the Kolmogorov-Smirnov test. 

7587 

7588 Parameters 

7589 ---------- 

7590 cdfvals : array_like 

7591 Sorted array of CDF values between 0 and 1 

7592 x: array_like 

7593 Sorted array of the stochastic variable itself 

7594 

7595 Returns 

7596 ------- 

7597 res: Pair with the following elements: 

7598 - Maximum distance of the CDF values above Uniform(0, 1) 

7599 - The location at which the maximum is reached. 

7600 """ 

7601 n = len(cdfvals) 

7602 dminus = (cdfvals - np.arange(0.0, n)/n) 

7603 amax = dminus.argmax() 

7604 loc_max = x[amax] 

7605 return (dminus[amax], loc_max) 

7606 

7607 

7608@_rename_parameter("mode", "method") 

7609def ks_1samp(x, cdf, args=(), alternative='two-sided', method='auto'): 

7610 """ 

7611 Performs the one-sample Kolmogorov-Smirnov test for goodness of fit. 

7612 

7613 This test compares the underlying distribution F(x) of a sample 

7614 against a given continuous distribution G(x). See Notes for a description 

7615 of the available null and alternative hypotheses. 

7616 

7617 Parameters 

7618 ---------- 

7619 x : array_like 

7620 a 1-D array of observations of iid random variables. 

7621 cdf : callable 

7622 callable used to calculate the cdf. 

7623 args : tuple, sequence, optional 

7624 Distribution parameters, used with `cdf`. 

7625 alternative : {'two-sided', 'less', 'greater'}, optional 

7626 Defines the null and alternative hypotheses. Default is 'two-sided'. 

7627 Please see explanations in the Notes below. 

7628 method : {'auto', 'exact', 'approx', 'asymp'}, optional 

7629 Defines the distribution used for calculating the p-value. 

7630 The following options are available (default is 'auto'): 

7631 

7632 * 'auto' : selects one of the other options. 

7633 * 'exact' : uses the exact distribution of test statistic. 

7634 * 'approx' : approximates the two-sided probability with twice 

7635 the one-sided probability 

7636 * 'asymp': uses asymptotic distribution of test statistic 

7637 

7638 Returns 

7639 ------- 

7640 res: KstestResult 

7641 An object containing attributes: 

7642 

7643 statistic : float 

7644 KS test statistic, either D+, D-, or D (the maximum of the two) 

7645 pvalue : float 

7646 One-tailed or two-tailed p-value. 

7647 statistic_location : float 

7648 Value of `x` corresponding with the KS statistic; i.e., the 

7649 distance between the empirical distribution function and the 

7650 hypothesized cumulative distribution function is measured at this 

7651 observation. 

7652 statistic_sign : int 

7653 +1 if the KS statistic is the maximum positive difference between 

7654 the empirical distribution function and the hypothesized cumulative 

7655 distribution function (D+); -1 if the KS statistic is the maximum 

7656 negative difference (D-). 

7657 

7658 

7659 See Also 

7660 -------- 

7661 ks_2samp, kstest 

7662 

7663 Notes 

7664 ----- 

7665 There are three options for the null and corresponding alternative 

7666 hypothesis that can be selected using the `alternative` parameter. 

7667 

7668 - `two-sided`: The null hypothesis is that the two distributions are 

7669 identical, F(x)=G(x) for all x; the alternative is that they are not 

7670 identical. 

7671 

7672 - `less`: The null hypothesis is that F(x) >= G(x) for all x; the 

7673 alternative is that F(x) < G(x) for at least one x. 

7674 

7675 - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the 

7676 alternative is that F(x) > G(x) for at least one x. 

7677 

7678 Note that the alternative hypotheses describe the *CDFs* of the 

7679 underlying distributions, not the observed values. For example, 

7680 suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in 

7681 x1 tend to be less than those in x2. 

7682 

7683 Examples 

7684 -------- 

7685 Suppose we wish to test the null hypothesis that a sample is distributed 

7686 according to the standard normal. 

7687 We choose a confidence level of 95%; that is, we will reject the null 

7688 hypothesis in favor of the alternative if the p-value is less than 0.05. 

7689 

7690 When testing uniformly distributed data, we would expect the 

7691 null hypothesis to be rejected. 

7692 

7693 >>> import numpy as np 

7694 >>> from scipy import stats 

7695 >>> rng = np.random.default_rng() 

7696 >>> stats.ks_1samp(stats.uniform.rvs(size=100, random_state=rng), 

7697 ... stats.norm.cdf) 

7698 KstestResult(statistic=0.5001899973268688, pvalue=1.1616392184763533e-23) 

7699 

7700 Indeed, the p-value is lower than our threshold of 0.05, so we reject the 

7701 null hypothesis in favor of the default "two-sided" alternative: the data 

7702 are *not* distributed according to the standard normal. 

7703 

7704 When testing random variates from the standard normal distribution, we 

7705 expect the data to be consistent with the null hypothesis most of the time. 

7706 

7707 >>> x = stats.norm.rvs(size=100, random_state=rng) 

7708 >>> stats.ks_1samp(x, stats.norm.cdf) 

7709 KstestResult(statistic=0.05345882212970396, pvalue=0.9227159037744717) 

7710 

7711 As expected, the p-value of 0.92 is not below our threshold of 0.05, so 

7712 we cannot reject the null hypothesis. 

7713 

7714 Suppose, however, that the random variates are distributed according to 

7715 a normal distribution that is shifted toward greater values. In this case, 

7716 the cumulative density function (CDF) of the underlying distribution tends 

7717 to be *less* than the CDF of the standard normal. Therefore, we would 

7718 expect the null hypothesis to be rejected with ``alternative='less'``: 

7719 

7720 >>> x = stats.norm.rvs(size=100, loc=0.5, random_state=rng) 

7721 >>> stats.ks_1samp(x, stats.norm.cdf, alternative='less') 

7722 KstestResult(statistic=0.17482387821055168, pvalue=0.001913921057766743) 

7723 

7724 and indeed, with p-value smaller than our threshold, we reject the null 

7725 hypothesis in favor of the alternative. 

7726 

7727 """ 

7728 mode = method 

7729 

7730 alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get( 

7731 alternative.lower()[0], alternative) 

7732 if alternative not in ['two-sided', 'greater', 'less']: 

7733 raise ValueError("Unexpected alternative %s" % alternative) 

7734 if np.ma.is_masked(x): 

7735 x = x.compressed() 

7736 

7737 N = len(x) 

7738 x = np.sort(x) 

7739 cdfvals = cdf(x, *args) 

7740 

7741 if alternative == 'greater': 

7742 Dplus, d_location = _compute_dplus(cdfvals, x) 

7743 return KstestResult(Dplus, distributions.ksone.sf(Dplus, N), 

7744 statistic_location=d_location, 

7745 statistic_sign=1) 

7746 

7747 if alternative == 'less': 

7748 Dminus, d_location = _compute_dminus(cdfvals, x) 

7749 return KstestResult(Dminus, distributions.ksone.sf(Dminus, N), 

7750 statistic_location=d_location, 

7751 statistic_sign=-1) 

7752 

7753 # alternative == 'two-sided': 

7754 Dplus, dplus_location = _compute_dplus(cdfvals, x) 

7755 Dminus, dminus_location = _compute_dminus(cdfvals, x) 

7756 if Dplus > Dminus: 

7757 D = Dplus 

7758 d_location = dplus_location 

7759 d_sign = 1 

7760 else: 

7761 D = Dminus 

7762 d_location = dminus_location 

7763 d_sign = -1 

7764 

7765 if mode == 'auto': # Always select exact 

7766 mode = 'exact' 

7767 if mode == 'exact': 

7768 prob = distributions.kstwo.sf(D, N) 

7769 elif mode == 'asymp': 

7770 prob = distributions.kstwobign.sf(D * np.sqrt(N)) 

7771 else: 

7772 # mode == 'approx' 

7773 prob = 2 * distributions.ksone.sf(D, N) 

7774 prob = np.clip(prob, 0, 1) 

7775 return KstestResult(D, prob, 

7776 statistic_location=d_location, 

7777 statistic_sign=d_sign) 

7778 

7779 

7780Ks_2sampResult = KstestResult 

7781 

7782 

7783def _compute_prob_outside_square(n, h): 

7784 """ 

7785 Compute the proportion of paths that pass outside the two diagonal lines. 

7786 

7787 Parameters 

7788 ---------- 

7789 n : integer 

7790 n > 0 

7791 h : integer 

7792 0 <= h <= n 

7793 

7794 Returns 

7795 ------- 

7796 p : float 

7797 The proportion of paths that pass outside the lines x-y = +/-h. 

7798 

7799 """ 

7800 # Compute Pr(D_{n,n} >= h/n) 

7801 # Prob = 2 * ( binom(2n, n-h) - binom(2n, n-2a) + binom(2n, n-3a) - ... ) 

7802 # / binom(2n, n) 

7803 # This formulation exhibits subtractive cancellation. 

7804 # Instead divide each term by binom(2n, n), then factor common terms 

7805 # and use a Horner-like algorithm 

7806 # P = 2 * A0 * (1 - A1*(1 - A2*(1 - A3*(1 - A4*(...))))) 

7807 

7808 P = 0.0 

7809 k = int(np.floor(n / h)) 

7810 while k >= 0: 

7811 p1 = 1.0 

7812 # Each of the Ai terms has numerator and denominator with 

7813 # h simple terms. 

7814 for j in range(h): 

7815 p1 = (n - k * h - j) * p1 / (n + k * h + j + 1) 

7816 P = p1 * (1.0 - P) 

7817 k -= 1 

7818 return 2 * P 

7819 

7820 

7821def _count_paths_outside_method(m, n, g, h): 

7822 """Count the number of paths that pass outside the specified diagonal. 

7823 

7824 Parameters 

7825 ---------- 

7826 m : integer 

7827 m > 0 

7828 n : integer 

7829 n > 0 

7830 g : integer 

7831 g is greatest common divisor of m and n 

7832 h : integer 

7833 0 <= h <= lcm(m,n) 

7834 

7835 Returns 

7836 ------- 

7837 p : float 

7838 The number of paths that go low. 

7839 The calculation may overflow - check for a finite answer. 

7840 

7841 Notes 

7842 ----- 

7843 Count the integer lattice paths from (0, 0) to (m, n), which at some 

7844 point (x, y) along the path, satisfy: 

7845 m*y <= n*x - h*g 

7846 The paths make steps of size +1 in either positive x or positive y 

7847 directions. 

7848 

7849 We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk. 

7850 Hodges, J.L. Jr., 

7851 "The Significance Probability of the Smirnov Two-Sample Test," 

7852 Arkiv fiur Matematik, 3, No. 43 (1958), 469-86. 

7853 

7854 """ 

7855 # Compute #paths which stay lower than x/m-y/n = h/lcm(m,n) 

7856 # B(x, y) = #{paths from (0,0) to (x,y) without 

7857 # previously crossing the boundary} 

7858 # = binom(x, y) - #{paths which already reached the boundary} 

7859 # Multiply by the number of path extensions going from (x, y) to (m, n) 

7860 # Sum. 

7861 

7862 # Probability is symmetrical in m, n. Computation below assumes m >= n. 

7863 if m < n: 

7864 m, n = n, m 

7865 mg = m // g 

7866 ng = n // g 

7867 

7868 # Not every x needs to be considered. 

7869 # xj holds the list of x values to be checked. 

7870 # Wherever n*x/m + ng*h crosses an integer 

7871 lxj = n + (mg-h)//mg 

7872 xj = [(h + mg * j + ng-1)//ng for j in range(lxj)] 

7873 # B is an array just holding a few values of B(x,y), the ones needed. 

7874 # B[j] == B(x_j, j) 

7875 if lxj == 0: 

7876 return special.binom(m + n, n) 

7877 B = np.zeros(lxj) 

7878 B[0] = 1 

7879 # Compute the B(x, y) terms 

7880 for j in range(1, lxj): 

7881 Bj = special.binom(xj[j] + j, j) 

7882 for i in range(j): 

7883 bin = special.binom(xj[j] - xj[i] + j - i, j-i) 

7884 Bj -= bin * B[i] 

7885 B[j] = Bj 

7886 # Compute the number of path extensions... 

7887 num_paths = 0 

7888 for j in range(lxj): 

7889 bin = special.binom((m-xj[j]) + (n - j), n-j) 

7890 term = B[j] * bin 

7891 num_paths += term 

7892 return num_paths 

7893 

7894 

7895def _attempt_exact_2kssamp(n1, n2, g, d, alternative): 

7896 """Attempts to compute the exact 2sample probability. 

7897 

7898 n1, n2 are the sample sizes 

7899 g is the gcd(n1, n2) 

7900 d is the computed max difference in ECDFs 

7901 

7902 Returns (success, d, probability) 

7903 """ 

7904 lcm = (n1 // g) * n2 

7905 h = int(np.round(d * lcm)) 

7906 d = h * 1.0 / lcm 

7907 if h == 0: 

7908 return True, d, 1.0 

7909 saw_fp_error, prob = False, np.nan 

7910 try: 

7911 with np.errstate(invalid="raise", over="raise"): 

7912 if alternative == 'two-sided': 

7913 if n1 == n2: 

7914 prob = _compute_prob_outside_square(n1, h) 

7915 else: 

7916 prob = _compute_outer_prob_inside_method(n1, n2, g, h) 

7917 else: 

7918 if n1 == n2: 

7919 # prob = binom(2n, n-h) / binom(2n, n) 

7920 # Evaluating in that form incurs roundoff errors 

7921 # from special.binom. Instead calculate directly 

7922 jrange = np.arange(h) 

7923 prob = np.prod((n1 - jrange) / (n1 + jrange + 1.0)) 

7924 else: 

7925 with np.errstate(over='raise'): 

7926 num_paths = _count_paths_outside_method(n1, n2, g, h) 

7927 bin = special.binom(n1 + n2, n1) 

7928 if num_paths > bin or np.isinf(bin): 

7929 saw_fp_error = True 

7930 else: 

7931 prob = num_paths / bin 

7932 

7933 except (FloatingPointError, OverflowError): 

7934 saw_fp_error = True 

7935 

7936 if saw_fp_error: 

7937 return False, d, np.nan 

7938 if not (0 <= prob <= 1): 

7939 return False, d, prob 

7940 return True, d, prob 

7941 

7942 

7943@_rename_parameter("mode", "method") 

7944def ks_2samp(data1, data2, alternative='two-sided', method='auto'): 

7945 """ 

7946 Performs the two-sample Kolmogorov-Smirnov test for goodness of fit. 

7947 

7948 This test compares the underlying continuous distributions F(x) and G(x) 

7949 of two independent samples. See Notes for a description of the available 

7950 null and alternative hypotheses. 

7951 

7952 Parameters 

7953 ---------- 

7954 data1, data2 : array_like, 1-Dimensional 

7955 Two arrays of sample observations assumed to be drawn from a continuous 

7956 distribution, sample sizes can be different. 

7957 alternative : {'two-sided', 'less', 'greater'}, optional 

7958 Defines the null and alternative hypotheses. Default is 'two-sided'. 

7959 Please see explanations in the Notes below. 

7960 method : {'auto', 'exact', 'asymp'}, optional 

7961 Defines the method used for calculating the p-value. 

7962 The following options are available (default is 'auto'): 

7963 

7964 * 'auto' : use 'exact' for small size arrays, 'asymp' for large 

7965 * 'exact' : use exact distribution of test statistic 

7966 * 'asymp' : use asymptotic distribution of test statistic 

7967 

7968 Returns 

7969 ------- 

7970 res: KstestResult 

7971 An object containing attributes: 

7972 

7973 statistic : float 

7974 KS test statistic. 

7975 pvalue : float 

7976 One-tailed or two-tailed p-value. 

7977 statistic_location : float 

7978 Value from `data1` or `data2` corresponding with the KS statistic; 

7979 i.e., the distance between the empirical distribution functions is 

7980 measured at this observation. 

7981 statistic_sign : int 

7982 +1 if the empirical distribution function of `data1` exceeds 

7983 the empirical distribution function of `data2` at 

7984 `statistic_location`, otherwise -1. 

7985 

7986 See Also 

7987 -------- 

7988 kstest, ks_1samp, epps_singleton_2samp, anderson_ksamp 

7989 

7990 Notes 

7991 ----- 

7992 There are three options for the null and corresponding alternative 

7993 hypothesis that can be selected using the `alternative` parameter. 

7994 

7995 - `less`: The null hypothesis is that F(x) >= G(x) for all x; the 

7996 alternative is that F(x) < G(x) for at least one x. The statistic 

7997 is the magnitude of the minimum (most negative) difference between the 

7998 empirical distribution functions of the samples. 

7999 

8000 - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the 

8001 alternative is that F(x) > G(x) for at least one x. The statistic 

8002 is the maximum (most positive) difference between the empirical 

8003 distribution functions of the samples. 

8004 

8005 - `two-sided`: The null hypothesis is that the two distributions are 

8006 identical, F(x)=G(x) for all x; the alternative is that they are not 

8007 identical. The statistic is the maximum absolute difference between the 

8008 empirical distribution functions of the samples. 

8009 

8010 Note that the alternative hypotheses describe the *CDFs* of the 

8011 underlying distributions, not the observed values of the data. For example, 

8012 suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in 

8013 x1 tend to be less than those in x2. 

8014 

8015 If the KS statistic is large, then the p-value will be small, and this may 

8016 be taken as evidence against the null hypothesis in favor of the 

8017 alternative. 

8018 

8019 If ``method='exact'``, `ks_2samp` attempts to compute an exact p-value, 

8020 that is, the probability under the null hypothesis of obtaining a test 

8021 statistic value as extreme as the value computed from the data. 

8022 If ``method='asymp'``, the asymptotic Kolmogorov-Smirnov distribution is 

8023 used to compute an approximate p-value. 

8024 If ``method='auto'``, an exact p-value computation is attempted if both 

8025 sample sizes are less than 10000; otherwise, the asymptotic method is used. 

8026 In any case, if an exact p-value calculation is attempted and fails, a 

8027 warning will be emitted, and the asymptotic p-value will be returned. 

8028 

8029 The 'two-sided' 'exact' computation computes the complementary probability 

8030 and then subtracts from 1. As such, the minimum probability it can return 

8031 is about 1e-16. While the algorithm itself is exact, numerical 

8032 errors may accumulate for large sample sizes. It is most suited to 

8033 situations in which one of the sample sizes is only a few thousand. 

8034 

8035 We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk [1]_. 

8036 

8037 References 

8038 ---------- 

8039 .. [1] Hodges, J.L. Jr., "The Significance Probability of the Smirnov 

8040 Two-Sample Test," Arkiv fiur Matematik, 3, No. 43 (1958), 469-86. 

8041 

8042 Examples 

8043 -------- 

8044 Suppose we wish to test the null hypothesis that two samples were drawn 

8045 from the same distribution. 

8046 We choose a confidence level of 95%; that is, we will reject the null 

8047 hypothesis in favor of the alternative if the p-value is less than 0.05. 

8048 

8049 If the first sample were drawn from a uniform distribution and the second 

8050 were drawn from the standard normal, we would expect the null hypothesis 

8051 to be rejected. 

8052 

8053 >>> import numpy as np 

8054 >>> from scipy import stats 

8055 >>> rng = np.random.default_rng() 

8056 >>> sample1 = stats.uniform.rvs(size=100, random_state=rng) 

8057 >>> sample2 = stats.norm.rvs(size=110, random_state=rng) 

8058 >>> stats.ks_2samp(sample1, sample2) 

8059 KstestResult(statistic=0.5454545454545454, pvalue=7.37417839555191e-15) 

8060 

8061 Indeed, the p-value is lower than our threshold of 0.05, so we reject the 

8062 null hypothesis in favor of the default "two-sided" alternative: the data 

8063 were *not* drawn from the same distribution. 

8064 

8065 When both samples are drawn from the same distribution, we expect the data 

8066 to be consistent with the null hypothesis most of the time. 

8067 

8068 >>> sample1 = stats.norm.rvs(size=105, random_state=rng) 

8069 >>> sample2 = stats.norm.rvs(size=95, random_state=rng) 

8070 >>> stats.ks_2samp(sample1, sample2) 

8071 KstestResult(statistic=0.10927318295739348, pvalue=0.5438289009927495) 

8072 

8073 As expected, the p-value of 0.54 is not below our threshold of 0.05, so 

8074 we cannot reject the null hypothesis. 

8075 

8076 Suppose, however, that the first sample were drawn from 

8077 a normal distribution shifted toward greater values. In this case, 

8078 the cumulative density function (CDF) of the underlying distribution tends 

8079 to be *less* than the CDF underlying the second sample. Therefore, we would 

8080 expect the null hypothesis to be rejected with ``alternative='less'``: 

8081 

8082 >>> sample1 = stats.norm.rvs(size=105, loc=0.5, random_state=rng) 

8083 >>> stats.ks_2samp(sample1, sample2, alternative='less') 

8084 KstestResult(statistic=0.4055137844611529, pvalue=3.5474563068855554e-08) 

8085 

8086 and indeed, with p-value smaller than our threshold, we reject the null 

8087 hypothesis in favor of the alternative. 

8088 

8089 """ 

8090 mode = method 

8091 

8092 if mode not in ['auto', 'exact', 'asymp']: 

8093 raise ValueError(f'Invalid value for mode: {mode}') 

8094 alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get( 

8095 alternative.lower()[0], alternative) 

8096 if alternative not in ['two-sided', 'less', 'greater']: 

8097 raise ValueError(f'Invalid value for alternative: {alternative}') 

8098 MAX_AUTO_N = 10000 # 'auto' will attempt to be exact if n1,n2 <= MAX_AUTO_N 

8099 if np.ma.is_masked(data1): 

8100 data1 = data1.compressed() 

8101 if np.ma.is_masked(data2): 

8102 data2 = data2.compressed() 

8103 data1 = np.sort(data1) 

8104 data2 = np.sort(data2) 

8105 n1 = data1.shape[0] 

8106 n2 = data2.shape[0] 

8107 if min(n1, n2) == 0: 

8108 raise ValueError('Data passed to ks_2samp must not be empty') 

8109 

8110 data_all = np.concatenate([data1, data2]) 

8111 # using searchsorted solves equal data problem 

8112 cdf1 = np.searchsorted(data1, data_all, side='right') / n1 

8113 cdf2 = np.searchsorted(data2, data_all, side='right') / n2 

8114 cddiffs = cdf1 - cdf2 

8115 

8116 # Identify the location of the statistic 

8117 argminS = np.argmin(cddiffs) 

8118 argmaxS = np.argmax(cddiffs) 

8119 loc_minS = data_all[argminS] 

8120 loc_maxS = data_all[argmaxS] 

8121 

8122 # Ensure sign of minS is not negative. 

8123 minS = np.clip(-cddiffs[argminS], 0, 1) 

8124 maxS = cddiffs[argmaxS] 

8125 

8126 if alternative == 'less' or (alternative == 'two-sided' and minS > maxS): 

8127 d = minS 

8128 d_location = loc_minS 

8129 d_sign = -1 

8130 else: 

8131 d = maxS 

8132 d_location = loc_maxS 

8133 d_sign = 1 

8134 g = gcd(n1, n2) 

8135 n1g = n1 // g 

8136 n2g = n2 // g 

8137 prob = -np.inf 

8138 if mode == 'auto': 

8139 mode = 'exact' if max(n1, n2) <= MAX_AUTO_N else 'asymp' 

8140 elif mode == 'exact': 

8141 # If lcm(n1, n2) is too big, switch from exact to asymp 

8142 if n1g >= np.iinfo(np.int32).max / n2g: 

8143 mode = 'asymp' 

8144 warnings.warn( 

8145 f"Exact ks_2samp calculation not possible with samples sizes " 

8146 f"{n1} and {n2}. Switching to 'asymp'.", RuntimeWarning, 

8147 stacklevel=3) 

8148 

8149 if mode == 'exact': 

8150 success, d, prob = _attempt_exact_2kssamp(n1, n2, g, d, alternative) 

8151 if not success: 

8152 mode = 'asymp' 

8153 warnings.warn(f"ks_2samp: Exact calculation unsuccessful. " 

8154 f"Switching to method={mode}.", RuntimeWarning, 

8155 stacklevel=3) 

8156 

8157 if mode == 'asymp': 

8158 # The product n1*n2 is large. Use Smirnov's asymptoptic formula. 

8159 # Ensure float to avoid overflow in multiplication 

8160 # sorted because the one-sided formula is not symmetric in n1, n2 

8161 m, n = sorted([float(n1), float(n2)], reverse=True) 

8162 en = m * n / (m + n) 

8163 if alternative == 'two-sided': 

8164 prob = distributions.kstwo.sf(d, np.round(en)) 

8165 else: 

8166 z = np.sqrt(en) * d 

8167 # Use Hodges' suggested approximation Eqn 5.3 

8168 # Requires m to be the larger of (n1, n2) 

8169 expt = -2 * z**2 - 2 * z * (m + 2*n)/np.sqrt(m*n*(m+n))/3.0 

8170 prob = np.exp(expt) 

8171 

8172 prob = np.clip(prob, 0, 1) 

8173 return KstestResult(d, prob, statistic_location=d_location, 

8174 statistic_sign=d_sign) 

8175 

8176 

8177def _parse_kstest_args(data1, data2, args, N): 

8178 # kstest allows many different variations of arguments. 

8179 # Pull out the parsing into a separate function 

8180 # (xvals, yvals, ) # 2sample 

8181 # (xvals, cdf function,..) 

8182 # (xvals, name of distribution, ...) 

8183 # (name of distribution, name of distribution, ...) 

8184 

8185 # Returns xvals, yvals, cdf 

8186 # where cdf is a cdf function, or None 

8187 # and yvals is either an array_like of values, or None 

8188 # and xvals is array_like. 

8189 rvsfunc, cdf = None, None 

8190 if isinstance(data1, str): 

8191 rvsfunc = getattr(distributions, data1).rvs 

8192 elif callable(data1): 

8193 rvsfunc = data1 

8194 

8195 if isinstance(data2, str): 

8196 cdf = getattr(distributions, data2).cdf 

8197 data2 = None 

8198 elif callable(data2): 

8199 cdf = data2 

8200 data2 = None 

8201 

8202 data1 = np.sort(rvsfunc(*args, size=N) if rvsfunc else data1) 

8203 return data1, data2, cdf 

8204 

8205 

8206@_rename_parameter("mode", "method") 

8207def kstest(rvs, cdf, args=(), N=20, alternative='two-sided', method='auto'): 

8208 """ 

8209 Performs the (one-sample or two-sample) Kolmogorov-Smirnov test for 

8210 goodness of fit. 

8211 

8212 The one-sample test compares the underlying distribution F(x) of a sample 

8213 against a given distribution G(x). The two-sample test compares the 

8214 underlying distributions of two independent samples. Both tests are valid 

8215 only for continuous distributions. 

8216 

8217 Parameters 

8218 ---------- 

8219 rvs : str, array_like, or callable 

8220 If an array, it should be a 1-D array of observations of random 

8221 variables. 

8222 If a callable, it should be a function to generate random variables; 

8223 it is required to have a keyword argument `size`. 

8224 If a string, it should be the name of a distribution in `scipy.stats`, 

8225 which will be used to generate random variables. 

8226 cdf : str, array_like or callable 

8227 If array_like, it should be a 1-D array of observations of random 

8228 variables, and the two-sample test is performed 

8229 (and rvs must be array_like). 

8230 If a callable, that callable is used to calculate the cdf. 

8231 If a string, it should be the name of a distribution in `scipy.stats`, 

8232 which will be used as the cdf function. 

8233 args : tuple, sequence, optional 

8234 Distribution parameters, used if `rvs` or `cdf` are strings or 

8235 callables. 

8236 N : int, optional 

8237 Sample size if `rvs` is string or callable. Default is 20. 

8238 alternative : {'two-sided', 'less', 'greater'}, optional 

8239 Defines the null and alternative hypotheses. Default is 'two-sided'. 

8240 Please see explanations in the Notes below. 

8241 method : {'auto', 'exact', 'approx', 'asymp'}, optional 

8242 Defines the distribution used for calculating the p-value. 

8243 The following options are available (default is 'auto'): 

8244 

8245 * 'auto' : selects one of the other options. 

8246 * 'exact' : uses the exact distribution of test statistic. 

8247 * 'approx' : approximates the two-sided probability with twice the 

8248 one-sided probability 

8249 * 'asymp': uses asymptotic distribution of test statistic 

8250 

8251 Returns 

8252 ------- 

8253 res: KstestResult 

8254 An object containing attributes: 

8255 

8256 statistic : float 

8257 KS test statistic, either D+, D-, or D (the maximum of the two) 

8258 pvalue : float 

8259 One-tailed or two-tailed p-value. 

8260 statistic_location : float 

8261 In a one-sample test, this is the value of `rvs` 

8262 corresponding with the KS statistic; i.e., the distance between 

8263 the empirical distribution function and the hypothesized cumulative 

8264 distribution function is measured at this observation. 

8265 

8266 In a two-sample test, this is the value from `rvs` or `cdf` 

8267 corresponding with the KS statistic; i.e., the distance between 

8268 the empirical distribution functions is measured at this 

8269 observation. 

8270 statistic_sign : int 

8271 In a one-sample test, this is +1 if the KS statistic is the 

8272 maximum positive difference between the empirical distribution 

8273 function and the hypothesized cumulative distribution function 

8274 (D+); it is -1 if the KS statistic is the maximum negative 

8275 difference (D-). 

8276 

8277 In a two-sample test, this is +1 if the empirical distribution 

8278 function of `rvs` exceeds the empirical distribution 

8279 function of `cdf` at `statistic_location`, otherwise -1. 

8280 

8281 See Also 

8282 -------- 

8283 ks_1samp, ks_2samp 

8284 

8285 Notes 

8286 ----- 

8287 There are three options for the null and corresponding alternative 

8288 hypothesis that can be selected using the `alternative` parameter. 

8289 

8290 - `two-sided`: The null hypothesis is that the two distributions are 

8291 identical, F(x)=G(x) for all x; the alternative is that they are not 

8292 identical. 

8293 

8294 - `less`: The null hypothesis is that F(x) >= G(x) for all x; the 

8295 alternative is that F(x) < G(x) for at least one x. 

8296 

8297 - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the 

8298 alternative is that F(x) > G(x) for at least one x. 

8299 

8300 Note that the alternative hypotheses describe the *CDFs* of the 

8301 underlying distributions, not the observed values. For example, 

8302 suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in 

8303 x1 tend to be less than those in x2. 

8304 

8305 

8306 Examples 

8307 -------- 

8308 Suppose we wish to test the null hypothesis that a sample is distributed 

8309 according to the standard normal. 

8310 We choose a confidence level of 95%; that is, we will reject the null 

8311 hypothesis in favor of the alternative if the p-value is less than 0.05. 

8312 

8313 When testing uniformly distributed data, we would expect the 

8314 null hypothesis to be rejected. 

8315 

8316 >>> import numpy as np 

8317 >>> from scipy import stats 

8318 >>> rng = np.random.default_rng() 

8319 >>> stats.kstest(stats.uniform.rvs(size=100, random_state=rng), 

8320 ... stats.norm.cdf) 

8321 KstestResult(statistic=0.5001899973268688, pvalue=1.1616392184763533e-23) 

8322 

8323 Indeed, the p-value is lower than our threshold of 0.05, so we reject the 

8324 null hypothesis in favor of the default "two-sided" alternative: the data 

8325 are *not* distributed according to the standard normal. 

8326 

8327 When testing random variates from the standard normal distribution, we 

8328 expect the data to be consistent with the null hypothesis most of the time. 

8329 

8330 >>> x = stats.norm.rvs(size=100, random_state=rng) 

8331 >>> stats.kstest(x, stats.norm.cdf) 

8332 KstestResult(statistic=0.05345882212970396, pvalue=0.9227159037744717) 

8333 

8334 As expected, the p-value of 0.92 is not below our threshold of 0.05, so 

8335 we cannot reject the null hypothesis. 

8336 

8337 Suppose, however, that the random variates are distributed according to 

8338 a normal distribution that is shifted toward greater values. In this case, 

8339 the cumulative density function (CDF) of the underlying distribution tends 

8340 to be *less* than the CDF of the standard normal. Therefore, we would 

8341 expect the null hypothesis to be rejected with ``alternative='less'``: 

8342 

8343 >>> x = stats.norm.rvs(size=100, loc=0.5, random_state=rng) 

8344 >>> stats.kstest(x, stats.norm.cdf, alternative='less') 

8345 KstestResult(statistic=0.17482387821055168, pvalue=0.001913921057766743) 

8346 

8347 and indeed, with p-value smaller than our threshold, we reject the null 

8348 hypothesis in favor of the alternative. 

8349 

8350 For convenience, the previous test can be performed using the name of the 

8351 distribution as the second argument. 

8352 

8353 >>> stats.kstest(x, "norm", alternative='less') 

8354 KstestResult(statistic=0.17482387821055168, pvalue=0.001913921057766743) 

8355 

8356 The examples above have all been one-sample tests identical to those 

8357 performed by `ks_1samp`. Note that `kstest` can also perform two-sample 

8358 tests identical to those performed by `ks_2samp`. For example, when two 

8359 samples are drawn from the same distribution, we expect the data to be 

8360 consistent with the null hypothesis most of the time. 

8361 

8362 >>> sample1 = stats.laplace.rvs(size=105, random_state=rng) 

8363 >>> sample2 = stats.laplace.rvs(size=95, random_state=rng) 

8364 >>> stats.kstest(sample1, sample2) 

8365 KstestResult(statistic=0.11779448621553884, pvalue=0.4494256912629795) 

8366 

8367 As expected, the p-value of 0.45 is not below our threshold of 0.05, so 

8368 we cannot reject the null hypothesis. 

8369 

8370 """ 

8371 # to not break compatibility with existing code 

8372 if alternative == 'two_sided': 

8373 alternative = 'two-sided' 

8374 if alternative not in ['two-sided', 'greater', 'less']: 

8375 raise ValueError("Unexpected alternative %s" % alternative) 

8376 xvals, yvals, cdf = _parse_kstest_args(rvs, cdf, args, N) 

8377 if cdf: 

8378 return ks_1samp(xvals, cdf, args=args, alternative=alternative, 

8379 method=method) 

8380 return ks_2samp(xvals, yvals, alternative=alternative, method=method) 

8381 

8382 

8383def tiecorrect(rankvals): 

8384 """Tie correction factor for Mann-Whitney U and Kruskal-Wallis H tests. 

8385 

8386 Parameters 

8387 ---------- 

8388 rankvals : array_like 

8389 A 1-D sequence of ranks. Typically this will be the array 

8390 returned by `~scipy.stats.rankdata`. 

8391 

8392 Returns 

8393 ------- 

8394 factor : float 

8395 Correction factor for U or H. 

8396 

8397 See Also 

8398 -------- 

8399 rankdata : Assign ranks to the data 

8400 mannwhitneyu : Mann-Whitney rank test 

8401 kruskal : Kruskal-Wallis H test 

8402 

8403 References 

8404 ---------- 

8405 .. [1] Siegel, S. (1956) Nonparametric Statistics for the Behavioral 

8406 Sciences. New York: McGraw-Hill. 

8407 

8408 Examples 

8409 -------- 

8410 >>> from scipy.stats import tiecorrect, rankdata 

8411 >>> tiecorrect([1, 2.5, 2.5, 4]) 

8412 0.9 

8413 >>> ranks = rankdata([1, 3, 2, 4, 5, 7, 2, 8, 4]) 

8414 >>> ranks 

8415 array([ 1. , 4. , 2.5, 5.5, 7. , 8. , 2.5, 9. , 5.5]) 

8416 >>> tiecorrect(ranks) 

8417 0.9833333333333333 

8418 

8419 """ 

8420 arr = np.sort(rankvals) 

8421 idx = np.nonzero(np.r_[True, arr[1:] != arr[:-1], True])[0] 

8422 cnt = np.diff(idx).astype(np.float64) 

8423 

8424 size = np.float64(arr.size) 

8425 return 1.0 if size < 2 else 1.0 - (cnt**3 - cnt).sum() / (size**3 - size) 

8426 

8427 

8428RanksumsResult = namedtuple('RanksumsResult', ('statistic', 'pvalue')) 

8429 

8430 

8431@_axis_nan_policy_factory(RanksumsResult, n_samples=2) 

8432def ranksums(x, y, alternative='two-sided'): 

8433 """Compute the Wilcoxon rank-sum statistic for two samples. 

8434 

8435 The Wilcoxon rank-sum test tests the null hypothesis that two sets 

8436 of measurements are drawn from the same distribution. The alternative 

8437 hypothesis is that values in one sample are more likely to be 

8438 larger than the values in the other sample. 

8439 

8440 This test should be used to compare two samples from continuous 

8441 distributions. It does not handle ties between measurements 

8442 in x and y. For tie-handling and an optional continuity correction 

8443 see `scipy.stats.mannwhitneyu`. 

8444 

8445 Parameters 

8446 ---------- 

8447 x,y : array_like 

8448 The data from the two samples. 

8449 alternative : {'two-sided', 'less', 'greater'}, optional 

8450 Defines the alternative hypothesis. Default is 'two-sided'. 

8451 The following options are available: 

8452 

8453 * 'two-sided': one of the distributions (underlying `x` or `y`) is 

8454 stochastically greater than the other. 

8455 * 'less': the distribution underlying `x` is stochastically less 

8456 than the distribution underlying `y`. 

8457 * 'greater': the distribution underlying `x` is stochastically greater 

8458 than the distribution underlying `y`. 

8459 

8460 .. versionadded:: 1.7.0 

8461 

8462 Returns 

8463 ------- 

8464 statistic : float 

8465 The test statistic under the large-sample approximation that the 

8466 rank sum statistic is normally distributed. 

8467 pvalue : float 

8468 The p-value of the test. 

8469 

8470 References 

8471 ---------- 

8472 .. [1] https://en.wikipedia.org/wiki/Wilcoxon_rank-sum_test 

8473 

8474 Examples 

8475 -------- 

8476 We can test the hypothesis that two independent unequal-sized samples are 

8477 drawn from the same distribution with computing the Wilcoxon rank-sum 

8478 statistic. 

8479 

8480 >>> import numpy as np 

8481 >>> from scipy.stats import ranksums 

8482 >>> rng = np.random.default_rng() 

8483 >>> sample1 = rng.uniform(-1, 1, 200) 

8484 >>> sample2 = rng.uniform(-0.5, 1.5, 300) # a shifted distribution 

8485 >>> ranksums(sample1, sample2) 

8486 RanksumsResult(statistic=-7.887059, pvalue=3.09390448e-15) # may vary 

8487 >>> ranksums(sample1, sample2, alternative='less') 

8488 RanksumsResult(statistic=-7.750585297581713, pvalue=4.573497606342543e-15) # may vary 

8489 >>> ranksums(sample1, sample2, alternative='greater') 

8490 RanksumsResult(statistic=-7.750585297581713, pvalue=0.9999999999999954) # may vary 

8491 

8492 The p-value of less than ``0.05`` indicates that this test rejects the 

8493 hypothesis at the 5% significance level. 

8494 

8495 """ 

8496 x, y = map(np.asarray, (x, y)) 

8497 n1 = len(x) 

8498 n2 = len(y) 

8499 alldata = np.concatenate((x, y)) 

8500 ranked = rankdata(alldata) 

8501 x = ranked[:n1] 

8502 s = np.sum(x, axis=0) 

8503 expected = n1 * (n1+n2+1) / 2.0 

8504 z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0) 

8505 z, prob = _normtest_finish(z, alternative) 

8506 

8507 return RanksumsResult(z, prob) 

8508 

8509 

8510KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue')) 

8511 

8512 

8513@_axis_nan_policy_factory(KruskalResult, n_samples=None) 

8514def kruskal(*samples, nan_policy='propagate'): 

8515 """Compute the Kruskal-Wallis H-test for independent samples. 

8516 

8517 The Kruskal-Wallis H-test tests the null hypothesis that the population 

8518 median of all of the groups are equal. It is a non-parametric version of 

8519 ANOVA. The test works on 2 or more independent samples, which may have 

8520 different sizes. Note that rejecting the null hypothesis does not 

8521 indicate which of the groups differs. Post hoc comparisons between 

8522 groups are required to determine which groups are different. 

8523 

8524 Parameters 

8525 ---------- 

8526 sample1, sample2, ... : array_like 

8527 Two or more arrays with the sample measurements can be given as 

8528 arguments. Samples must be one-dimensional. 

8529 nan_policy : {'propagate', 'raise', 'omit'}, optional 

8530 Defines how to handle when input contains nan. 

8531 The following options are available (default is 'propagate'): 

8532 

8533 * 'propagate': returns nan 

8534 * 'raise': throws an error 

8535 * 'omit': performs the calculations ignoring nan values 

8536 

8537 Returns 

8538 ------- 

8539 statistic : float 

8540 The Kruskal-Wallis H statistic, corrected for ties. 

8541 pvalue : float 

8542 The p-value for the test using the assumption that H has a chi 

8543 square distribution. The p-value returned is the survival function of 

8544 the chi square distribution evaluated at H. 

8545 

8546 See Also 

8547 -------- 

8548 f_oneway : 1-way ANOVA. 

8549 mannwhitneyu : Mann-Whitney rank test on two samples. 

8550 friedmanchisquare : Friedman test for repeated measurements. 

8551 

8552 Notes 

8553 ----- 

8554 Due to the assumption that H has a chi square distribution, the number 

8555 of samples in each group must not be too small. A typical rule is 

8556 that each sample must have at least 5 measurements. 

8557 

8558 References 

8559 ---------- 

8560 .. [1] W. H. Kruskal & W. W. Wallis, "Use of Ranks in 

8561 One-Criterion Variance Analysis", Journal of the American Statistical 

8562 Association, Vol. 47, Issue 260, pp. 583-621, 1952. 

8563 .. [2] https://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance 

8564 

8565 Examples 

8566 -------- 

8567 >>> from scipy import stats 

8568 >>> x = [1, 3, 5, 7, 9] 

8569 >>> y = [2, 4, 6, 8, 10] 

8570 >>> stats.kruskal(x, y) 

8571 KruskalResult(statistic=0.2727272727272734, pvalue=0.6015081344405895) 

8572 

8573 >>> x = [1, 1, 1] 

8574 >>> y = [2, 2, 2] 

8575 >>> z = [2, 2] 

8576 >>> stats.kruskal(x, y, z) 

8577 KruskalResult(statistic=7.0, pvalue=0.0301973834223185) 

8578 

8579 """ 

8580 samples = list(map(np.asarray, samples)) 

8581 

8582 num_groups = len(samples) 

8583 if num_groups < 2: 

8584 raise ValueError("Need at least two groups in stats.kruskal()") 

8585 

8586 for sample in samples: 

8587 if sample.size == 0: 

8588 return KruskalResult(np.nan, np.nan) 

8589 elif sample.ndim != 1: 

8590 raise ValueError("Samples must be one-dimensional.") 

8591 

8592 n = np.asarray(list(map(len, samples))) 

8593 

8594 if nan_policy not in ('propagate', 'raise', 'omit'): 

8595 raise ValueError("nan_policy must be 'propagate', 'raise' or 'omit'") 

8596 

8597 contains_nan = False 

8598 for sample in samples: 

8599 cn = _contains_nan(sample, nan_policy) 

8600 if cn[0]: 

8601 contains_nan = True 

8602 break 

8603 

8604 if contains_nan and nan_policy == 'omit': 

8605 for sample in samples: 

8606 sample = ma.masked_invalid(sample) 

8607 return mstats_basic.kruskal(*samples) 

8608 

8609 if contains_nan and nan_policy == 'propagate': 

8610 return KruskalResult(np.nan, np.nan) 

8611 

8612 alldata = np.concatenate(samples) 

8613 ranked = rankdata(alldata) 

8614 ties = tiecorrect(ranked) 

8615 if ties == 0: 

8616 raise ValueError('All numbers are identical in kruskal') 

8617 

8618 # Compute sum^2/n for each group and sum 

8619 j = np.insert(np.cumsum(n), 0, 0) 

8620 ssbn = 0 

8621 for i in range(num_groups): 

8622 ssbn += _square_of_sums(ranked[j[i]:j[i+1]]) / n[i] 

8623 

8624 totaln = np.sum(n, dtype=float) 

8625 h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1) 

8626 df = num_groups - 1 

8627 h /= ties 

8628 

8629 return KruskalResult(h, distributions.chi2.sf(h, df)) 

8630 

8631 

8632FriedmanchisquareResult = namedtuple('FriedmanchisquareResult', 

8633 ('statistic', 'pvalue')) 

8634 

8635 

8636def friedmanchisquare(*samples): 

8637 """Compute the Friedman test for repeated samples. 

8638 

8639 The Friedman test tests the null hypothesis that repeated samples of 

8640 the same individuals have the same distribution. It is often used 

8641 to test for consistency among samples obtained in different ways. 

8642 For example, if two sampling techniques are used on the same set of 

8643 individuals, the Friedman test can be used to determine if the two 

8644 sampling techniques are consistent. 

8645 

8646 Parameters 

8647 ---------- 

8648 sample1, sample2, sample3... : array_like 

8649 Arrays of observations. All of the arrays must have the same number 

8650 of elements. At least three samples must be given. 

8651 

8652 Returns 

8653 ------- 

8654 statistic : float 

8655 The test statistic, correcting for ties. 

8656 pvalue : float 

8657 The associated p-value assuming that the test statistic has a chi 

8658 squared distribution. 

8659 

8660 Notes 

8661 ----- 

8662 Due to the assumption that the test statistic has a chi squared 

8663 distribution, the p-value is only reliable for n > 10 and more than 

8664 6 repeated samples. 

8665 

8666 References 

8667 ---------- 

8668 .. [1] https://en.wikipedia.org/wiki/Friedman_test 

8669 

8670 """ 

8671 k = len(samples) 

8672 if k < 3: 

8673 raise ValueError('At least 3 sets of samples must be given ' 

8674 'for Friedman test, got {}.'.format(k)) 

8675 

8676 n = len(samples[0]) 

8677 for i in range(1, k): 

8678 if len(samples[i]) != n: 

8679 raise ValueError('Unequal N in friedmanchisquare. Aborting.') 

8680 

8681 # Rank data 

8682 data = np.vstack(samples).T 

8683 data = data.astype(float) 

8684 for i in range(len(data)): 

8685 data[i] = rankdata(data[i]) 

8686 

8687 # Handle ties 

8688 ties = 0 

8689 for d in data: 

8690 replist, repnum = find_repeats(array(d)) 

8691 for t in repnum: 

8692 ties += t * (t*t - 1) 

8693 c = 1 - ties / (k*(k*k - 1)*n) 

8694 

8695 ssbn = np.sum(data.sum(axis=0)**2) 

8696 chisq = (12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)) / c 

8697 

8698 return FriedmanchisquareResult(chisq, distributions.chi2.sf(chisq, k - 1)) 

8699 

8700 

8701BrunnerMunzelResult = namedtuple('BrunnerMunzelResult', 

8702 ('statistic', 'pvalue')) 

8703 

8704 

8705def brunnermunzel(x, y, alternative="two-sided", distribution="t", 

8706 nan_policy='propagate'): 

8707 """Compute the Brunner-Munzel test on samples x and y. 

8708 

8709 The Brunner-Munzel test is a nonparametric test of the null hypothesis that 

8710 when values are taken one by one from each group, the probabilities of 

8711 getting large values in both groups are equal. 

8712 Unlike the Wilcoxon-Mann-Whitney's U test, this does not require the 

8713 assumption of equivariance of two groups. Note that this does not assume 

8714 the distributions are same. This test works on two independent samples, 

8715 which may have different sizes. 

8716 

8717 Parameters 

8718 ---------- 

8719 x, y : array_like 

8720 Array of samples, should be one-dimensional. 

8721 alternative : {'two-sided', 'less', 'greater'}, optional 

8722 Defines the alternative hypothesis. 

8723 The following options are available (default is 'two-sided'): 

8724 

8725 * 'two-sided' 

8726 * 'less': one-sided 

8727 * 'greater': one-sided 

8728 distribution : {'t', 'normal'}, optional 

8729 Defines how to get the p-value. 

8730 The following options are available (default is 't'): 

8731 

8732 * 't': get the p-value by t-distribution 

8733 * 'normal': get the p-value by standard normal distribution. 

8734 nan_policy : {'propagate', 'raise', 'omit'}, optional 

8735 Defines how to handle when input contains nan. 

8736 The following options are available (default is 'propagate'): 

8737 

8738 * 'propagate': returns nan 

8739 * 'raise': throws an error 

8740 * 'omit': performs the calculations ignoring nan values 

8741 

8742 Returns 

8743 ------- 

8744 statistic : float 

8745 The Brunner-Munzer W statistic. 

8746 pvalue : float 

8747 p-value assuming an t distribution. One-sided or 

8748 two-sided, depending on the choice of `alternative` and `distribution`. 

8749 

8750 See Also 

8751 -------- 

8752 mannwhitneyu : Mann-Whitney rank test on two samples. 

8753 

8754 Notes 

8755 ----- 

8756 Brunner and Munzel recommended to estimate the p-value by t-distribution 

8757 when the size of data is 50 or less. If the size is lower than 10, it would 

8758 be better to use permuted Brunner Munzel test (see [2]_). 

8759 

8760 References 

8761 ---------- 

8762 .. [1] Brunner, E. and Munzel, U. "The nonparametric Benhrens-Fisher 

8763 problem: Asymptotic theory and a small-sample approximation". 

8764 Biometrical Journal. Vol. 42(2000): 17-25. 

8765 .. [2] Neubert, K. and Brunner, E. "A studentized permutation test for the 

8766 non-parametric Behrens-Fisher problem". Computational Statistics and 

8767 Data Analysis. Vol. 51(2007): 5192-5204. 

8768 

8769 Examples 

8770 -------- 

8771 >>> from scipy import stats 

8772 >>> x1 = [1,2,1,1,1,1,1,1,1,1,2,4,1,1] 

8773 >>> x2 = [3,3,4,3,1,2,3,1,1,5,4] 

8774 >>> w, p_value = stats.brunnermunzel(x1, x2) 

8775 >>> w 

8776 3.1374674823029505 

8777 >>> p_value 

8778 0.0057862086661515377 

8779 

8780 """ 

8781 x = np.asarray(x) 

8782 y = np.asarray(y) 

8783 

8784 # check both x and y 

8785 cnx, npx = _contains_nan(x, nan_policy) 

8786 cny, npy = _contains_nan(y, nan_policy) 

8787 contains_nan = cnx or cny 

8788 if npx == "omit" or npy == "omit": 

8789 nan_policy = "omit" 

8790 

8791 if contains_nan and nan_policy == "propagate": 

8792 return BrunnerMunzelResult(np.nan, np.nan) 

8793 elif contains_nan and nan_policy == "omit": 

8794 x = ma.masked_invalid(x) 

8795 y = ma.masked_invalid(y) 

8796 return mstats_basic.brunnermunzel(x, y, alternative, distribution) 

8797 

8798 nx = len(x) 

8799 ny = len(y) 

8800 if nx == 0 or ny == 0: 

8801 return BrunnerMunzelResult(np.nan, np.nan) 

8802 rankc = rankdata(np.concatenate((x, y))) 

8803 rankcx = rankc[0:nx] 

8804 rankcy = rankc[nx:nx+ny] 

8805 rankcx_mean = np.mean(rankcx) 

8806 rankcy_mean = np.mean(rankcy) 

8807 rankx = rankdata(x) 

8808 ranky = rankdata(y) 

8809 rankx_mean = np.mean(rankx) 

8810 ranky_mean = np.mean(ranky) 

8811 

8812 Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0)) 

8813 Sx /= nx - 1 

8814 Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0)) 

8815 Sy /= ny - 1 

8816 

8817 wbfn = nx * ny * (rankcy_mean - rankcx_mean) 

8818 wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy) 

8819 

8820 if distribution == "t": 

8821 df_numer = np.power(nx * Sx + ny * Sy, 2.0) 

8822 df_denom = np.power(nx * Sx, 2.0) / (nx - 1) 

8823 df_denom += np.power(ny * Sy, 2.0) / (ny - 1) 

8824 df = df_numer / df_denom 

8825 

8826 if (df_numer == 0) and (df_denom == 0): 

8827 message = ("p-value cannot be estimated with `distribution='t' " 

8828 "because degrees of freedom parameter is undefined " 

8829 "(0/0). Try using `distribution='normal'") 

8830 warnings.warn(message, RuntimeWarning) 

8831 

8832 p = distributions.t.cdf(wbfn, df) 

8833 elif distribution == "normal": 

8834 p = distributions.norm.cdf(wbfn) 

8835 else: 

8836 raise ValueError( 

8837 "distribution should be 't' or 'normal'") 

8838 

8839 if alternative == "greater": 

8840 pass 

8841 elif alternative == "less": 

8842 p = 1 - p 

8843 elif alternative == "two-sided": 

8844 p = 2 * np.min([p, 1-p]) 

8845 else: 

8846 raise ValueError( 

8847 "alternative should be 'less', 'greater' or 'two-sided'") 

8848 

8849 return BrunnerMunzelResult(wbfn, p) 

8850 

8851 

8852def combine_pvalues(pvalues, method='fisher', weights=None): 

8853 """ 

8854 Combine p-values from independent tests that bear upon the same hypothesis. 

8855 

8856 These methods are intended only for combining p-values from hypothesis 

8857 tests based upon continuous distributions. 

8858 

8859 Each method assumes that under the null hypothesis, the p-values are 

8860 sampled independently and uniformly from the interval [0, 1]. A test 

8861 statistic (different for each method) is computed and a combined 

8862 p-value is calculated based upon the distribution of this test statistic 

8863 under the null hypothesis. 

8864 

8865 Parameters 

8866 ---------- 

8867 pvalues : array_like, 1-D 

8868 Array of p-values assumed to come from independent tests based on 

8869 continuous distributions. 

8870 method : {'fisher', 'pearson', 'tippett', 'stouffer', 'mudholkar_george'} 

8871 

8872 Name of method to use to combine p-values. 

8873 

8874 The available methods are (see Notes for details): 

8875 

8876 * 'fisher': Fisher's method (Fisher's combined probability test) 

8877 * 'pearson': Pearson's method 

8878 * 'mudholkar_george': Mudholkar's and George's method 

8879 * 'tippett': Tippett's method 

8880 * 'stouffer': Stouffer's Z-score method 

8881 weights : array_like, 1-D, optional 

8882 Optional array of weights used only for Stouffer's Z-score method. 

8883 

8884 Returns 

8885 ------- 

8886 res : SignificanceResult 

8887 An object containing attributes: 

8888 

8889 statistic : float 

8890 The statistic calculated by the specified method. 

8891 pvalue : float 

8892 The combined p-value. 

8893 

8894 Notes 

8895 ----- 

8896 If this function is applied to tests with a discrete statistics such as 

8897 any rank test or contingency-table test, it will yield systematically 

8898 wrong results, e.g. Fisher's method will systematically overestimate the 

8899 p-value [1]_. This problem becomes less severe for large sample sizes 

8900 when the discrete distributions become approximately continuous. 

8901 

8902 The differences between the methods can be best illustrated by their 

8903 statistics and what aspects of a combination of p-values they emphasise 

8904 when considering significance [2]_. For example, methods emphasising large 

8905 p-values are more sensitive to strong false and true negatives; conversely 

8906 methods focussing on small p-values are sensitive to positives. 

8907 

8908 * The statistics of Fisher's method (also known as Fisher's combined 

8909 probability test) [3]_ is :math:`-2\\sum_i \\log(p_i)`, which is 

8910 equivalent (as a test statistics) to the product of individual p-values: 

8911 :math:`\\prod_i p_i`. Under the null hypothesis, this statistics follows 

8912 a :math:`\\chi^2` distribution. This method emphasises small p-values. 

8913 * Pearson's method uses :math:`-2\\sum_i\\log(1-p_i)`, which is equivalent 

8914 to :math:`\\prod_i \\frac{1}{1-p_i}` [2]_. 

8915 It thus emphasises large p-values. 

8916 * Mudholkar and George compromise between Fisher's and Pearson's method by 

8917 averaging their statistics [4]_. Their method emphasises extreme 

8918 p-values, both close to 1 and 0. 

8919 * Stouffer's method [5]_ uses Z-scores and the statistic: 

8920 :math:`\\sum_i \\Phi^{-1} (p_i)`, where :math:`\\Phi` is the CDF of the 

8921 standard normal distribution. The advantage of this method is that it is 

8922 straightforward to introduce weights, which can make Stouffer's method 

8923 more powerful than Fisher's method when the p-values are from studies 

8924 of different size [6]_ [7]_. 

8925 * Tippett's method uses the smallest p-value as a statistic. 

8926 (Mind that this minimum is not the combined p-value.) 

8927 

8928 Fisher's method may be extended to combine p-values from dependent tests 

8929 [8]_. Extensions such as Brown's method and Kost's method are not currently 

8930 implemented. 

8931 

8932 .. versionadded:: 0.15.0 

8933 

8934 References 

8935 ---------- 

8936 .. [1] Kincaid, W. M., "The Combination of Tests Based on Discrete 

8937 Distributions." Journal of the American Statistical Association 57, 

8938 no. 297 (1962), 10-19. 

8939 .. [2] Heard, N. and Rubin-Delanchey, P. "Choosing between methods of 

8940 combining p-values." Biometrika 105.1 (2018): 239-246. 

8941 .. [3] https://en.wikipedia.org/wiki/Fisher%27s_method 

8942 .. [4] George, E. O., and G. S. Mudholkar. "On the convolution of logistic 

8943 random variables." Metrika 30.1 (1983): 1-13. 

8944 .. [5] https://en.wikipedia.org/wiki/Fisher%27s_method#Relation_to_Stouffer.27s_Z-score_method 

8945 .. [6] Whitlock, M. C. "Combining probability from independent tests: the 

8946 weighted Z-method is superior to Fisher's approach." Journal of 

8947 Evolutionary Biology 18, no. 5 (2005): 1368-1373. 

8948 .. [7] Zaykin, Dmitri V. "Optimally weighted Z-test is a powerful method 

8949 for combining probabilities in meta-analysis." Journal of 

8950 Evolutionary Biology 24, no. 8 (2011): 1836-1841. 

8951 .. [8] https://en.wikipedia.org/wiki/Extensions_of_Fisher%27s_method 

8952 

8953 """ 

8954 pvalues = np.asarray(pvalues) 

8955 if pvalues.ndim != 1: 

8956 raise ValueError("pvalues is not 1-D") 

8957 

8958 if method == 'fisher': 

8959 statistic = -2 * np.sum(np.log(pvalues)) 

8960 pval = distributions.chi2.sf(statistic, 2 * len(pvalues)) 

8961 elif method == 'pearson': 

8962 statistic = 2 * np.sum(np.log1p(-pvalues)) 

8963 pval = distributions.chi2.cdf(-statistic, 2 * len(pvalues)) 

8964 elif method == 'mudholkar_george': 

8965 normalizing_factor = np.sqrt(3/len(pvalues))/np.pi 

8966 statistic = -np.sum(np.log(pvalues)) + np.sum(np.log1p(-pvalues)) 

8967 nu = 5 * len(pvalues) + 4 

8968 approx_factor = np.sqrt(nu / (nu - 2)) 

8969 pval = distributions.t.sf(statistic * normalizing_factor 

8970 * approx_factor, nu) 

8971 elif method == 'tippett': 

8972 statistic = np.min(pvalues) 

8973 pval = distributions.beta.cdf(statistic, 1, len(pvalues)) 

8974 elif method == 'stouffer': 

8975 if weights is None: 

8976 weights = np.ones_like(pvalues) 

8977 elif len(weights) != len(pvalues): 

8978 raise ValueError("pvalues and weights must be of the same size.") 

8979 

8980 weights = np.asarray(weights) 

8981 if weights.ndim != 1: 

8982 raise ValueError("weights is not 1-D") 

8983 

8984 Zi = distributions.norm.isf(pvalues) 

8985 statistic = np.dot(weights, Zi) / np.linalg.norm(weights) 

8986 pval = distributions.norm.sf(statistic) 

8987 

8988 else: 

8989 raise ValueError( 

8990 f"Invalid method {method!r}. Valid methods are 'fisher', " 

8991 "'pearson', 'mudholkar_george', 'tippett', and 'stouffer'" 

8992 ) 

8993 

8994 return SignificanceResult(statistic, pval) 

8995 

8996 

8997##################################### 

8998# STATISTICAL DISTANCES # 

8999##################################### 

9000 

9001 

9002def wasserstein_distance(u_values, v_values, u_weights=None, v_weights=None): 

9003 r""" 

9004 Compute the first Wasserstein distance between two 1D distributions. 

9005 

9006 This distance is also known as the earth mover's distance, since it can be 

9007 seen as the minimum amount of "work" required to transform :math:`u` into 

9008 :math:`v`, where "work" is measured as the amount of distribution weight 

9009 that must be moved, multiplied by the distance it has to be moved. 

9010 

9011 .. versionadded:: 1.0.0 

9012 

9013 Parameters 

9014 ---------- 

9015 u_values, v_values : array_like 

9016 Values observed in the (empirical) distribution. 

9017 u_weights, v_weights : array_like, optional 

9018 Weight for each value. If unspecified, each value is assigned the same 

9019 weight. 

9020 `u_weights` (resp. `v_weights`) must have the same length as 

9021 `u_values` (resp. `v_values`). If the weight sum differs from 1, it 

9022 must still be positive and finite so that the weights can be normalized 

9023 to sum to 1. 

9024 

9025 Returns 

9026 ------- 

9027 distance : float 

9028 The computed distance between the distributions. 

9029 

9030 Notes 

9031 ----- 

9032 The first Wasserstein distance between the distributions :math:`u` and 

9033 :math:`v` is: 

9034 

9035 .. math:: 

9036 

9037 l_1 (u, v) = \inf_{\pi \in \Gamma (u, v)} \int_{\mathbb{R} \times 

9038 \mathbb{R}} |x-y| \mathrm{d} \pi (x, y) 

9039 

9040 where :math:`\Gamma (u, v)` is the set of (probability) distributions on 

9041 :math:`\mathbb{R} \times \mathbb{R}` whose marginals are :math:`u` and 

9042 :math:`v` on the first and second factors respectively. 

9043 

9044 If :math:`U` and :math:`V` are the respective CDFs of :math:`u` and 

9045 :math:`v`, this distance also equals to: 

9046 

9047 .. math:: 

9048 

9049 l_1(u, v) = \int_{-\infty}^{+\infty} |U-V| 

9050 

9051 See [2]_ for a proof of the equivalence of both definitions. 

9052 

9053 The input distributions can be empirical, therefore coming from samples 

9054 whose values are effectively inputs of the function, or they can be seen as 

9055 generalized functions, in which case they are weighted sums of Dirac delta 

9056 functions located at the specified values. 

9057 

9058 References 

9059 ---------- 

9060 .. [1] "Wasserstein metric", https://en.wikipedia.org/wiki/Wasserstein_metric 

9061 .. [2] Ramdas, Garcia, Cuturi "On Wasserstein Two Sample Testing and Related 

9062 Families of Nonparametric Tests" (2015). :arXiv:`1509.02237`. 

9063 

9064 Examples 

9065 -------- 

9066 >>> from scipy.stats import wasserstein_distance 

9067 >>> wasserstein_distance([0, 1, 3], [5, 6, 8]) 

9068 5.0 

9069 >>> wasserstein_distance([0, 1], [0, 1], [3, 1], [2, 2]) 

9070 0.25 

9071 >>> wasserstein_distance([3.4, 3.9, 7.5, 7.8], [4.5, 1.4], 

9072 ... [1.4, 0.9, 3.1, 7.2], [3.2, 3.5]) 

9073 4.0781331438047861 

9074 

9075 """ 

9076 return _cdf_distance(1, u_values, v_values, u_weights, v_weights) 

9077 

9078 

9079def energy_distance(u_values, v_values, u_weights=None, v_weights=None): 

9080 r"""Compute the energy distance between two 1D distributions. 

9081 

9082 .. versionadded:: 1.0.0 

9083 

9084 Parameters 

9085 ---------- 

9086 u_values, v_values : array_like 

9087 Values observed in the (empirical) distribution. 

9088 u_weights, v_weights : array_like, optional 

9089 Weight for each value. If unspecified, each value is assigned the same 

9090 weight. 

9091 `u_weights` (resp. `v_weights`) must have the same length as 

9092 `u_values` (resp. `v_values`). If the weight sum differs from 1, it 

9093 must still be positive and finite so that the weights can be normalized 

9094 to sum to 1. 

9095 

9096 Returns 

9097 ------- 

9098 distance : float 

9099 The computed distance between the distributions. 

9100 

9101 Notes 

9102 ----- 

9103 The energy distance between two distributions :math:`u` and :math:`v`, whose 

9104 respective CDFs are :math:`U` and :math:`V`, equals to: 

9105 

9106 .. math:: 

9107 

9108 D(u, v) = \left( 2\mathbb E|X - Y| - \mathbb E|X - X'| - 

9109 \mathbb E|Y - Y'| \right)^{1/2} 

9110 

9111 where :math:`X` and :math:`X'` (resp. :math:`Y` and :math:`Y'`) are 

9112 independent random variables whose probability distribution is :math:`u` 

9113 (resp. :math:`v`). 

9114 

9115 Sometimes the square of this quantity is referred to as the "energy 

9116 distance" (e.g. in [2]_, [4]_), but as noted in [1]_ and [3]_, only the 

9117 definition above satisfies the axioms of a distance function (metric). 

9118 

9119 As shown in [2]_, for one-dimensional real-valued variables, the energy 

9120 distance is linked to the non-distribution-free version of the Cramér-von 

9121 Mises distance: 

9122 

9123 .. math:: 

9124 

9125 D(u, v) = \sqrt{2} l_2(u, v) = \left( 2 \int_{-\infty}^{+\infty} (U-V)^2 

9126 \right)^{1/2} 

9127 

9128 Note that the common Cramér-von Mises criterion uses the distribution-free 

9129 version of the distance. See [2]_ (section 2), for more details about both 

9130 versions of the distance. 

9131 

9132 The input distributions can be empirical, therefore coming from samples 

9133 whose values are effectively inputs of the function, or they can be seen as 

9134 generalized functions, in which case they are weighted sums of Dirac delta 

9135 functions located at the specified values. 

9136 

9137 References 

9138 ---------- 

9139 .. [1] Rizzo, Szekely "Energy distance." Wiley Interdisciplinary Reviews: 

9140 Computational Statistics, 8(1):27-38 (2015). 

9141 .. [2] Szekely "E-statistics: The energy of statistical samples." Bowling 

9142 Green State University, Department of Mathematics and Statistics, 

9143 Technical Report 02-16 (2002). 

9144 .. [3] "Energy distance", https://en.wikipedia.org/wiki/Energy_distance 

9145 .. [4] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer, 

9146 Munos "The Cramer Distance as a Solution to Biased Wasserstein 

9147 Gradients" (2017). :arXiv:`1705.10743`. 

9148 

9149 Examples 

9150 -------- 

9151 >>> from scipy.stats import energy_distance 

9152 >>> energy_distance([0], [2]) 

9153 2.0000000000000004 

9154 >>> energy_distance([0, 8], [0, 8], [3, 1], [2, 2]) 

9155 1.0000000000000002 

9156 >>> energy_distance([0.7, 7.4, 2.4, 6.8], [1.4, 8. ], 

9157 ... [2.1, 4.2, 7.4, 8. ], [7.6, 8.8]) 

9158 0.88003340976158217 

9159 

9160 """ 

9161 return np.sqrt(2) * _cdf_distance(2, u_values, v_values, 

9162 u_weights, v_weights) 

9163 

9164 

9165def _cdf_distance(p, u_values, v_values, u_weights=None, v_weights=None): 

9166 r""" 

9167 Compute, between two one-dimensional distributions :math:`u` and 

9168 :math:`v`, whose respective CDFs are :math:`U` and :math:`V`, the 

9169 statistical distance that is defined as: 

9170 

9171 .. math:: 

9172 

9173 l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p} 

9174 

9175 p is a positive parameter; p = 1 gives the Wasserstein distance, p = 2 

9176 gives the energy distance. 

9177 

9178 Parameters 

9179 ---------- 

9180 u_values, v_values : array_like 

9181 Values observed in the (empirical) distribution. 

9182 u_weights, v_weights : array_like, optional 

9183 Weight for each value. If unspecified, each value is assigned the same 

9184 weight. 

9185 `u_weights` (resp. `v_weights`) must have the same length as 

9186 `u_values` (resp. `v_values`). If the weight sum differs from 1, it 

9187 must still be positive and finite so that the weights can be normalized 

9188 to sum to 1. 

9189 

9190 Returns 

9191 ------- 

9192 distance : float 

9193 The computed distance between the distributions. 

9194 

9195 Notes 

9196 ----- 

9197 The input distributions can be empirical, therefore coming from samples 

9198 whose values are effectively inputs of the function, or they can be seen as 

9199 generalized functions, in which case they are weighted sums of Dirac delta 

9200 functions located at the specified values. 

9201 

9202 References 

9203 ---------- 

9204 .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer, 

9205 Munos "The Cramer Distance as a Solution to Biased Wasserstein 

9206 Gradients" (2017). :arXiv:`1705.10743`. 

9207 

9208 """ 

9209 u_values, u_weights = _validate_distribution(u_values, u_weights) 

9210 v_values, v_weights = _validate_distribution(v_values, v_weights) 

9211 

9212 u_sorter = np.argsort(u_values) 

9213 v_sorter = np.argsort(v_values) 

9214 

9215 all_values = np.concatenate((u_values, v_values)) 

9216 all_values.sort(kind='mergesort') 

9217 

9218 # Compute the differences between pairs of successive values of u and v. 

9219 deltas = np.diff(all_values) 

9220 

9221 # Get the respective positions of the values of u and v among the values of 

9222 # both distributions. 

9223 u_cdf_indices = u_values[u_sorter].searchsorted(all_values[:-1], 'right') 

9224 v_cdf_indices = v_values[v_sorter].searchsorted(all_values[:-1], 'right') 

9225 

9226 # Calculate the CDFs of u and v using their weights, if specified. 

9227 if u_weights is None: 

9228 u_cdf = u_cdf_indices / u_values.size 

9229 else: 

9230 u_sorted_cumweights = np.concatenate(([0], 

9231 np.cumsum(u_weights[u_sorter]))) 

9232 u_cdf = u_sorted_cumweights[u_cdf_indices] / u_sorted_cumweights[-1] 

9233 

9234 if v_weights is None: 

9235 v_cdf = v_cdf_indices / v_values.size 

9236 else: 

9237 v_sorted_cumweights = np.concatenate(([0], 

9238 np.cumsum(v_weights[v_sorter]))) 

9239 v_cdf = v_sorted_cumweights[v_cdf_indices] / v_sorted_cumweights[-1] 

9240 

9241 # Compute the value of the integral based on the CDFs. 

9242 # If p = 1 or p = 2, we avoid using np.power, which introduces an overhead 

9243 # of about 15%. 

9244 if p == 1: 

9245 return np.sum(np.multiply(np.abs(u_cdf - v_cdf), deltas)) 

9246 if p == 2: 

9247 return np.sqrt(np.sum(np.multiply(np.square(u_cdf - v_cdf), deltas))) 

9248 return np.power(np.sum(np.multiply(np.power(np.abs(u_cdf - v_cdf), p), 

9249 deltas)), 1/p) 

9250 

9251 

9252def _validate_distribution(values, weights): 

9253 """ 

9254 Validate the values and weights from a distribution input of `cdf_distance` 

9255 and return them as ndarray objects. 

9256 

9257 Parameters 

9258 ---------- 

9259 values : array_like 

9260 Values observed in the (empirical) distribution. 

9261 weights : array_like 

9262 Weight for each value. 

9263 

9264 Returns 

9265 ------- 

9266 values : ndarray 

9267 Values as ndarray. 

9268 weights : ndarray 

9269 Weights as ndarray. 

9270 

9271 """ 

9272 # Validate the value array. 

9273 values = np.asarray(values, dtype=float) 

9274 if len(values) == 0: 

9275 raise ValueError("Distribution can't be empty.") 

9276 

9277 # Validate the weight array, if specified. 

9278 if weights is not None: 

9279 weights = np.asarray(weights, dtype=float) 

9280 if len(weights) != len(values): 

9281 raise ValueError('Value and weight array-likes for the same ' 

9282 'empirical distribution must be of the same size.') 

9283 if np.any(weights < 0): 

9284 raise ValueError('All weights must be non-negative.') 

9285 if not 0 < np.sum(weights) < np.inf: 

9286 raise ValueError('Weight array-like sum must be positive and ' 

9287 'finite. Set as None for an equal distribution of ' 

9288 'weight.') 

9289 

9290 return values, weights 

9291 

9292 return values, None 

9293 

9294 

9295##################################### 

9296# SUPPORT FUNCTIONS # 

9297##################################### 

9298 

9299RepeatedResults = namedtuple('RepeatedResults', ('values', 'counts')) 

9300 

9301 

9302def find_repeats(arr): 

9303 """Find repeats and repeat counts. 

9304 

9305 Parameters 

9306 ---------- 

9307 arr : array_like 

9308 Input array. This is cast to float64. 

9309 

9310 Returns 

9311 ------- 

9312 values : ndarray 

9313 The unique values from the (flattened) input that are repeated. 

9314 

9315 counts : ndarray 

9316 Number of times the corresponding 'value' is repeated. 

9317 

9318 Notes 

9319 ----- 

9320 In numpy >= 1.9 `numpy.unique` provides similar functionality. The main 

9321 difference is that `find_repeats` only returns repeated values. 

9322 

9323 Examples 

9324 -------- 

9325 >>> from scipy import stats 

9326 >>> stats.find_repeats([2, 1, 2, 3, 2, 2, 5]) 

9327 RepeatedResults(values=array([2.]), counts=array([4])) 

9328 

9329 >>> stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]]) 

9330 RepeatedResults(values=array([4., 5.]), counts=array([2, 2])) 

9331 

9332 """ 

9333 # Note: always copies. 

9334 return RepeatedResults(*_find_repeats(np.array(arr, dtype=np.float64))) 

9335 

9336 

9337def _sum_of_squares(a, axis=0): 

9338 """Square each element of the input array, and return the sum(s) of that. 

9339 

9340 Parameters 

9341 ---------- 

9342 a : array_like 

9343 Input array. 

9344 axis : int or None, optional 

9345 Axis along which to calculate. Default is 0. If None, compute over 

9346 the whole array `a`. 

9347 

9348 Returns 

9349 ------- 

9350 sum_of_squares : ndarray 

9351 The sum along the given axis for (a**2). 

9352 

9353 See Also 

9354 -------- 

9355 _square_of_sums : The square(s) of the sum(s) (the opposite of 

9356 `_sum_of_squares`). 

9357 

9358 """ 

9359 a, axis = _chk_asarray(a, axis) 

9360 return np.sum(a*a, axis) 

9361 

9362 

9363def _square_of_sums(a, axis=0): 

9364 """Sum elements of the input array, and return the square(s) of that sum. 

9365 

9366 Parameters 

9367 ---------- 

9368 a : array_like 

9369 Input array. 

9370 axis : int or None, optional 

9371 Axis along which to calculate. Default is 0. If None, compute over 

9372 the whole array `a`. 

9373 

9374 Returns 

9375 ------- 

9376 square_of_sums : float or ndarray 

9377 The square of the sum over `axis`. 

9378 

9379 See Also 

9380 -------- 

9381 _sum_of_squares : The sum of squares (the opposite of `square_of_sums`). 

9382 

9383 """ 

9384 a, axis = _chk_asarray(a, axis) 

9385 s = np.sum(a, axis) 

9386 if not np.isscalar(s): 

9387 return s.astype(float) * s 

9388 else: 

9389 return float(s) * s 

9390 

9391 

9392def rankdata(a, method='average', *, axis=None, nan_policy='propagate'): 

9393 """Assign ranks to data, dealing with ties appropriately. 

9394 

9395 By default (``axis=None``), the data array is first flattened, and a flat 

9396 array of ranks is returned. Separately reshape the rank array to the 

9397 shape of the data array if desired (see Examples). 

9398 

9399 Ranks begin at 1. The `method` argument controls how ranks are assigned 

9400 to equal values. See [1]_ for further discussion of ranking methods. 

9401 

9402 Parameters 

9403 ---------- 

9404 a : array_like 

9405 The array of values to be ranked. 

9406 method : {'average', 'min', 'max', 'dense', 'ordinal'}, optional 

9407 The method used to assign ranks to tied elements. 

9408 The following methods are available (default is 'average'): 

9409 

9410 * 'average': The average of the ranks that would have been assigned to 

9411 all the tied values is assigned to each value. 

9412 * 'min': The minimum of the ranks that would have been assigned to all 

9413 the tied values is assigned to each value. (This is also 

9414 referred to as "competition" ranking.) 

9415 * 'max': The maximum of the ranks that would have been assigned to all 

9416 the tied values is assigned to each value. 

9417 * 'dense': Like 'min', but the rank of the next highest element is 

9418 assigned the rank immediately after those assigned to the tied 

9419 elements. 

9420 * 'ordinal': All values are given a distinct rank, corresponding to 

9421 the order that the values occur in `a`. 

9422 axis : {None, int}, optional 

9423 Axis along which to perform the ranking. If ``None``, the data array 

9424 is first flattened. 

9425 nan_policy : {'propagate', 'omit', 'raise'}, optional 

9426 Defines how to handle when input contains nan. 

9427 The following options are available (default is 'propagate'): 

9428 

9429 * 'propagate': propagates nans through the rank calculation 

9430 * 'omit': performs the calculations ignoring nan values 

9431 * 'raise': raises an error 

9432 

9433 .. note:: 

9434 

9435 When `nan_policy` is 'propagate', the output is an array of *all* 

9436 nans because ranks relative to nans in the input are undefined. 

9437 When `nan_policy` is 'omit', nans in `a` are ignored when ranking 

9438 the other values, and the corresponding locations of the output 

9439 are nan. 

9440 

9441 .. versionadded:: 1.10 

9442 

9443 Returns 

9444 ------- 

9445 ranks : ndarray 

9446 An array of size equal to the size of `a`, containing rank 

9447 scores. 

9448 

9449 References 

9450 ---------- 

9451 .. [1] "Ranking", https://en.wikipedia.org/wiki/Ranking 

9452 

9453 Examples 

9454 -------- 

9455 >>> import numpy as np 

9456 >>> from scipy.stats import rankdata 

9457 >>> rankdata([0, 2, 3, 2]) 

9458 array([ 1. , 2.5, 4. , 2.5]) 

9459 >>> rankdata([0, 2, 3, 2], method='min') 

9460 array([ 1, 2, 4, 2]) 

9461 >>> rankdata([0, 2, 3, 2], method='max') 

9462 array([ 1, 3, 4, 3]) 

9463 >>> rankdata([0, 2, 3, 2], method='dense') 

9464 array([ 1, 2, 3, 2]) 

9465 >>> rankdata([0, 2, 3, 2], method='ordinal') 

9466 array([ 1, 2, 4, 3]) 

9467 >>> rankdata([[0, 2], [3, 2]]).reshape(2,2) 

9468 array([[1. , 2.5], 

9469 [4. , 2.5]]) 

9470 >>> rankdata([[0, 2, 2], [3, 2, 5]], axis=1) 

9471 array([[1. , 2.5, 2.5], 

9472 [2. , 1. , 3. ]]) 

9473 >>> rankdata([0, 2, 3, np.nan, -2, np.nan], nan_policy="propagate") 

9474 array([nan, nan, nan, nan, nan, nan]) 

9475 >>> rankdata([0, 2, 3, np.nan, -2, np.nan], nan_policy="omit") 

9476 array([ 2., 3., 4., nan, 1., nan]) 

9477 

9478 """ 

9479 if method not in ('average', 'min', 'max', 'dense', 'ordinal'): 

9480 raise ValueError('unknown method "{0}"'.format(method)) 

9481 

9482 a = np.asarray(a) 

9483 

9484 if axis is not None: 

9485 if a.size == 0: 

9486 # The return values of `normalize_axis_index` are ignored. The 

9487 # call validates `axis`, even though we won't use it. 

9488 # use scipy._lib._util._normalize_axis_index when available 

9489 np.core.multiarray.normalize_axis_index(axis, a.ndim) 

9490 dt = np.float64 if method == 'average' else np.int_ 

9491 return np.empty(a.shape, dtype=dt) 

9492 return np.apply_along_axis(rankdata, axis, a, method, 

9493 nan_policy=nan_policy) 

9494 

9495 arr = np.ravel(a) 

9496 contains_nan, nan_policy = _contains_nan(arr, nan_policy) 

9497 nan_indexes = None 

9498 if contains_nan: 

9499 if nan_policy == 'omit': 

9500 nan_indexes = np.isnan(arr) 

9501 if nan_policy == 'propagate': 

9502 return np.full_like(arr, np.nan) 

9503 

9504 algo = 'mergesort' if method == 'ordinal' else 'quicksort' 

9505 sorter = np.argsort(arr, kind=algo) 

9506 

9507 inv = np.empty(sorter.size, dtype=np.intp) 

9508 inv[sorter] = np.arange(sorter.size, dtype=np.intp) 

9509 

9510 if method == 'ordinal': 

9511 result = inv + 1 

9512 

9513 arr = arr[sorter] 

9514 obs = np.r_[True, arr[1:] != arr[:-1]] 

9515 dense = obs.cumsum()[inv] 

9516 

9517 if method == 'dense': 

9518 result = dense 

9519 

9520 # cumulative counts of each unique value 

9521 count = np.r_[np.nonzero(obs)[0], len(obs)] 

9522 

9523 if method == 'max': 

9524 result = count[dense] 

9525 

9526 if method == 'min': 

9527 result = count[dense - 1] + 1 

9528 

9529 if method == 'average': 

9530 result = .5 * (count[dense] + count[dense - 1] + 1) 

9531 

9532 if nan_indexes is not None: 

9533 result = result.astype('float64') 

9534 result[nan_indexes] = np.nan 

9535 

9536 return result 

9537 

9538 

9539def expectile(a, alpha=0.5, *, weights=None): 

9540 r"""Compute the expectile at the specified level. 

9541 

9542 Expectiles are a generalization of the expectation in the same way as 

9543 quantiles are a generalization of the median. The expectile at level 

9544 `alpha = 0.5` is the mean (average). See Notes for more details. 

9545 

9546 Parameters 

9547 ---------- 

9548 a : array_like 

9549 Array containing numbers whose expectile is desired. 

9550 alpha : float, default: 0.5 

9551 The level of the expectile; e.g., `alpha=0.5` gives the mean. 

9552 weights : array_like, optional 

9553 An array of weights associated with the values in `a`. 

9554 The `weights` must be broadcastable to the same shape as `a`. 

9555 Default is None, which gives each value a weight of 1.0. 

9556 An integer valued weight element acts like repeating the corresponding 

9557 observation in `a` that many times. See Notes for more details. 

9558 

9559 Returns 

9560 ------- 

9561 expectile : ndarray 

9562 The empirical expectile at level `alpha`. 

9563 

9564 See Also 

9565 -------- 

9566 numpy.mean : Arithmetic average 

9567 numpy.quantile : Quantile 

9568 

9569 Notes 

9570 ----- 

9571 In general, the expectile at level :math:`\alpha` of a random variable 

9572 :math:`X` with cumulative distribution function (CDF) :math:`F` is given 

9573 by the unique solution :math:`t` of: 

9574 

9575 .. math:: 

9576 

9577 \alpha E((X - t)_+) = (1 - \alpha) E((t - X)_+) \,. 

9578 

9579 Here, :math:`(x)_+ = \max(0, x)` is the positive part of :math:`x`. 

9580 This equation can be equivalently written as: 

9581 

9582 .. math:: 

9583 

9584 \alpha \int_t^\infty (x - t)\mathrm{d}F(x) 

9585 = (1 - \alpha) \int_{-\infty}^t (t - x)\mathrm{d}F(x) \,. 

9586 

9587 The empirical expectile at level :math:`\alpha` (`alpha`) of a sample 

9588 :math:`a_i` (the array `a`) is defined by plugging in the empirical CDF of 

9589 `a`. Given sample or case weights :math:`w` (the array `weights`), it 

9590 reads :math:`F_a(x) = \frac{1}{\sum_i a_i} \sum_i w_i 1_{a_i \leq x}` 

9591 with indicator function :math:`1_{A}`. This leads to the definition of the 

9592 empirical expectile at level `alpha` as the unique solution :math:`t` of: 

9593 

9594 .. math:: 

9595 

9596 \alpha \sum_{i=1}^n w_i (a_i - t)_+ = 

9597 (1 - \alpha) \sum_{i=1}^n w_i (t - a_i)_+ \,. 

9598 

9599 For :math:`\alpha=0.5`, this simplifies to the weighted average. 

9600 Furthermore, the larger :math:`\alpha`, the larger the value of the 

9601 expectile. 

9602 

9603 As a final remark, the expectile at level :math:`\alpha` can also be 

9604 written as a minimization problem. One often used choice is 

9605 

9606 .. math:: 

9607 

9608 \operatorname{argmin}_t 

9609 E(\lvert 1_{t\geq X} - \alpha\rvert(t - X)^2) \,. 

9610 

9611 References 

9612 ---------- 

9613 .. [1] W. K. Newey and J. L. Powell (1987), "Asymmetric Least Squares 

9614 Estimation and Testing," Econometrica, 55, 819-847. 

9615 .. [2] T. Gneiting (2009). "Making and Evaluating Point Forecasts," 

9616 Journal of the American Statistical Association, 106, 746 - 762. 

9617 :doi:`10.48550/arXiv.0912.0902` 

9618 

9619 Examples 

9620 -------- 

9621 >>> import numpy as np 

9622 >>> from scipy.stats import expectile 

9623 >>> a = [1, 4, 2, -1] 

9624 >>> expectile(a, alpha=0.5) == np.mean(a) 

9625 True 

9626 >>> expectile(a, alpha=0.2) 

9627 0.42857142857142855 

9628 >>> expectile(a, alpha=0.8) 

9629 2.5714285714285716 

9630 >>> weights = [1, 3, 1, 1] 

9631 

9632 """ 

9633 if alpha < 0 or alpha > 1: 

9634 raise ValueError( 

9635 "The expectile level alpha must be in the range [0, 1]." 

9636 ) 

9637 a = np.asarray(a) 

9638 

9639 if weights is not None: 

9640 weights = np.broadcast_to(weights, a.shape) 

9641 

9642 # This is the empirical equivalent of Eq. (13) with identification 

9643 # function from Table 9 (omitting a factor of 2) in [2] (their y is our 

9644 # data a, their x is our t) 

9645 def first_order(t): 

9646 return np.average(np.abs((a <= t) - alpha) * (t - a), weights=weights) 

9647 

9648 if alpha >= 0.5: 

9649 x0 = np.average(a, weights=weights) 

9650 x1 = np.amax(a) 

9651 else: 

9652 x1 = np.average(a, weights=weights) 

9653 x0 = np.amin(a) 

9654 

9655 if x0 == x1: 

9656 # a has a single unique element 

9657 return x0 

9658 

9659 # Note that the expectile is the unique solution, so no worries about 

9660 # finding a wrong root. 

9661 res = root_scalar(first_order, x0=x0, x1=x1) 

9662 return res.root