1""" 

2Contingency table functions (:mod:`scipy.stats.contingency`) 

3============================================================ 

4 

5Functions for creating and analyzing contingency tables. 

6 

7.. currentmodule:: scipy.stats.contingency 

8 

9.. autosummary:: 

10 :toctree: generated/ 

11 

12 chi2_contingency 

13 relative_risk 

14 odds_ratio 

15 crosstab 

16 association 

17 

18 expected_freq 

19 margins 

20 

21""" 

22 

23 

24from functools import reduce 

25import math 

26import numpy as np 

27from ._stats_py import power_divergence 

28from ._relative_risk import relative_risk 

29from ._crosstab import crosstab 

30from ._odds_ratio import odds_ratio 

31from scipy._lib._bunch import _make_tuple_bunch 

32 

33 

34__all__ = ['margins', 'expected_freq', 'chi2_contingency', 'crosstab', 

35 'association', 'relative_risk', 'odds_ratio'] 

36 

37 

38def margins(a): 

39 """Return a list of the marginal sums of the array `a`. 

40 

41 Parameters 

42 ---------- 

43 a : ndarray 

44 The array for which to compute the marginal sums. 

45 

46 Returns 

47 ------- 

48 margsums : list of ndarrays 

49 A list of length `a.ndim`. `margsums[k]` is the result 

50 of summing `a` over all axes except `k`; it has the same 

51 number of dimensions as `a`, but the length of each axis 

52 except axis `k` will be 1. 

53 

54 Examples 

55 -------- 

56 >>> import numpy as np 

57 >>> from scipy.stats.contingency import margins 

58 

59 >>> a = np.arange(12).reshape(2, 6) 

60 >>> a 

61 array([[ 0, 1, 2, 3, 4, 5], 

62 [ 6, 7, 8, 9, 10, 11]]) 

63 >>> m0, m1 = margins(a) 

64 >>> m0 

65 array([[15], 

66 [51]]) 

67 >>> m1 

68 array([[ 6, 8, 10, 12, 14, 16]]) 

69 

70 >>> b = np.arange(24).reshape(2,3,4) 

71 >>> m0, m1, m2 = margins(b) 

72 >>> m0 

73 array([[[ 66]], 

74 [[210]]]) 

75 >>> m1 

76 array([[[ 60], 

77 [ 92], 

78 [124]]]) 

79 >>> m2 

80 array([[[60, 66, 72, 78]]]) 

81 """ 

82 margsums = [] 

83 ranged = list(range(a.ndim)) 

84 for k in ranged: 

85 marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k]) 

86 margsums.append(marg) 

87 return margsums 
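

# A minimal sketch (hypothetical helper, not part of the SciPy API) of an
# equivalent implementation of `margins`: summing over all axes except one
# with ``keepdims=True`` produces the same shapes as `np.apply_over_axes`.
def _margins_keepdims(a):
    return [a.sum(axis=tuple(j for j in range(a.ndim) if j != k),
                  keepdims=True)
            for k in range(a.ndim)]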



def expected_freq(observed):
    """
    Compute the expected frequencies from a contingency table.

    Given an n-dimensional contingency table of observed frequencies,
    compute the expected frequencies for the table based on the marginal
    sums under the assumption that the groups associated with each
    dimension are independent.

    Parameters
    ----------
    observed : array_like
        The table of observed frequencies.  (While this function can handle
        a 1-D array, that case is trivial.  Generally `observed` is at
        least 2-D.)

    Returns
    -------
    expected : ndarray of float64
        The expected frequencies, based on the marginal sums of the table.
        Same shape as `observed`.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats.contingency import expected_freq
    >>> observed = np.array([[10, 10, 20],[20, 20, 20]])
    >>> expected_freq(observed)
    array([[ 12.,  12.,  16.],
           [ 18.,  18.,  24.]])

    """
    # Typically `observed` is an integer array. If `observed` has a large
    # number of dimensions or holds large values, some of the following
    # computations may overflow, so we first switch to floating point.
    observed = np.asarray(observed, dtype=np.float64)

    # Create a list of the marginal sums.
    margsums = margins(observed)

    # Create the array of expected frequencies.  The shapes of the
    # marginal sums returned by apply_over_axes() are just what we
    # need for broadcasting in the following product.
    d = observed.ndim
    expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
    return expected
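

# A minimal sketch (hypothetical helper, assuming a 2-D table) of what the
# reduce/broadcast product above computes: for a two-way table, the expected
# frequencies are the outer product of the row and column sums divided by
# the grand total.
def _expected_freq_2d(observed):
    observed = np.asarray(observed, dtype=np.float64)
    row_sums = observed.sum(axis=1)
    col_sums = observed.sum(axis=0)
    return np.outer(row_sums, col_sums) / observed.sum()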



Chi2ContingencyResult = _make_tuple_bunch(
    'Chi2ContingencyResult',
    ['statistic', 'pvalue', 'dof', 'expected_freq'], []
)
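

# A small sketch (hypothetical, for illustration only): classes produced by
# `_make_tuple_bunch` are namedtuple-like, so a result supports both
# attribute access and tuple unpacking.
def _result_access_demo(res):
    statistic, pvalue, dof, expected = res
    return statistic == res.statistic and dof == res.dof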



def chi2_contingency(observed, correction=True, lambda_=None):
    """Chi-square test of independence of variables in a contingency table.

    This function computes the chi-square statistic and p-value for the
    hypothesis test of independence of the observed frequencies in the
    contingency table [1]_ `observed`.  The expected frequencies are computed
    based on the marginal sums under the assumption of independence; see
    `scipy.stats.contingency.expected_freq`.  The number of degrees of
    freedom is (expressed using numpy functions and attributes)::

        dof = observed.size - sum(observed.shape) + observed.ndim - 1

    Parameters
    ----------
    observed : array_like
        The contingency table.  The table contains the observed frequencies
        (i.e. number of occurrences) in each category.  In the two-dimensional
        case, the table is often described as an "R x C table".
    correction : bool, optional
        If True, *and* the degrees of freedom is 1, apply Yates' correction
        for continuity.  The effect of the correction is to adjust each
        observed value by 0.5 towards the corresponding expected value.
    lambda_ : float or str, optional
        By default, the statistic computed in this test is Pearson's
        chi-squared statistic [2]_.  `lambda_` allows a statistic from the
        Cressie-Read power divergence family [3]_ to be used instead.  See
        `scipy.stats.power_divergence` for details.

    Returns
    -------
    res : Chi2ContingencyResult
        An object containing attributes:

        statistic : float
            The test statistic.
        pvalue : float
            The p-value of the test.
        dof : int
            The degrees of freedom.
        expected_freq : ndarray, same shape as `observed`
            The expected frequencies, based on the marginal sums of the
            table.

    See Also
    --------
    scipy.stats.contingency.expected_freq
    scipy.stats.fisher_exact
    scipy.stats.chisquare
    scipy.stats.power_divergence
    scipy.stats.barnard_exact
    scipy.stats.boschloo_exact

    Notes
    -----
    An often quoted guideline for the validity of this calculation is that
    the test should be used only if the observed and expected frequencies
    in each cell are at least 5.

    This is a test for the independence of different categories of a
    population.  The test is only meaningful when the dimension of
    `observed` is two or more.  Applying the test to a one-dimensional
    table will always result in `expected` equal to `observed` and a
    chi-square statistic equal to 0.

    This function does not handle masked arrays, because the calculation
    does not make sense with missing values.

    Like `scipy.stats.chisquare`, this function computes a chi-square
    statistic; the convenience this function provides is to figure out the
    expected frequencies and degrees of freedom from the given contingency
    table.  If these were already known, and if Yates' correction were not
    required, one could use `scipy.stats.chisquare`.  That is, if one calls::

        res = chi2_contingency(obs, correction=False)

    then the following is true (where ``ex = res.expected_freq`` and
    ``dof = res.dof``)::

        (res.statistic, res.pvalue) == stats.chisquare(obs.ravel(),
                                                       f_exp=ex.ravel(),
                                                       ddof=obs.size - 1 - dof)

    The `lambda_` argument was added in version 0.13.0 of scipy.

    References
    ----------
    .. [1] "Contingency table",
           https://en.wikipedia.org/wiki/Contingency_table
    .. [2] "Pearson's chi-squared test",
           https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
    .. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
           Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
           pp. 440-464.

    Examples
    --------
    A two-way example (2 x 3):

    >>> import numpy as np
    >>> from scipy.stats import chi2_contingency
    >>> obs = np.array([[10, 10, 20], [20, 20, 20]])
    >>> res = chi2_contingency(obs)
    >>> res.statistic
    2.7777777777777777
    >>> res.pvalue
    0.24935220877729619
    >>> res.dof
    2
    >>> res.expected_freq
    array([[ 12.,  12.,  16.],
           [ 18.,  18.,  24.]])

    Perform the test using the log-likelihood ratio (i.e. the "G-test")
    instead of Pearson's chi-squared statistic.

    >>> res = chi2_contingency(obs, lambda_="log-likelihood")
    >>> res.statistic
    2.7688587616781319
    >>> res.pvalue
    0.25046668010954165

    A four-way example (2 x 2 x 2 x 2):

    >>> obs = np.array(
    ...     [[[[12, 17],
    ...        [11, 16]],
    ...       [[11, 12],
    ...        [15, 16]]],
    ...      [[[23, 15],
    ...        [30, 22]],
    ...       [[14, 17],
    ...        [15, 16]]]])
    >>> res = chi2_contingency(obs)
    >>> res.statistic
    8.7584514426741897
    >>> res.pvalue
    0.64417725029295503
    """
    observed = np.asarray(observed)
    if np.any(observed < 0):
        raise ValueError("All values in `observed` must be nonnegative.")
    if observed.size == 0:
        raise ValueError("No data; `observed` has size 0.")

    expected = expected_freq(observed)
    if np.any(expected == 0):
        # Include one of the positions where expected is zero in
        # the exception message.
        zeropos = list(zip(*np.nonzero(expected == 0)))[0]
        raise ValueError("The internally computed table of expected "
                         "frequencies has a zero element at %s." % (zeropos,))

    # The degrees of freedom
    dof = expected.size - sum(expected.shape) + expected.ndim - 1

    if dof == 0:
        # Degenerate case; this occurs when `observed` is 1D (or, more
        # generally, when it has only one nontrivial dimension).  In this
        # case, we also have observed == expected, so chi2 is 0.
        chi2 = 0.0
        p = 1.0
    else:
        if dof == 1 and correction:
            # Adjust `observed` according to Yates' correction for continuity.
            # Magnitude of correction no bigger than difference; see gh-13875
            diff = expected - observed
            direction = np.sign(diff)
            magnitude = np.minimum(0.5, np.abs(diff))
            observed = observed + magnitude * direction

        chi2, p = power_divergence(observed, expected,
                                   ddof=observed.size - 1 - dof, axis=None,
                                   lambda_=lambda_)

    return Chi2ContingencyResult(chi2, p, dof, expected)
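

# A minimal sketch (hypothetical helper, not part of the SciPy API) of the
# equivalence described in the Notes above: with ``correction=False``, the
# statistic and p-value match a flat call to `scipy.stats.chisquare` using
# the internally computed expected frequencies and an adjusted `ddof`.
def _check_chisquare_equivalence(obs):
    from scipy import stats
    obs = np.asarray(obs)
    res = chi2_contingency(obs, correction=False)
    stat, p = stats.chisquare(obs.ravel(),
                              f_exp=res.expected_freq.ravel(),
                              ddof=obs.size - 1 - res.dof)
    return np.allclose([stat, p], [res.statistic, res.pvalue])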



def association(observed, method="cramer", correction=False, lambda_=None):
    """Calculates degree of association between two nominal variables.

    The function provides the option for computing one of three measures of
    association between two nominal variables from the data given in a 2d
    contingency table: Tschuprow's T, Pearson's Contingency Coefficient
    and Cramer's V.

    Parameters
    ----------
    observed : array_like
        The array of observed values.
    method : {"cramer", "tschuprow", "pearson"} (default = "cramer")
        The association test statistic.
    correction : bool, optional
        Inherited from `scipy.stats.contingency.chi2_contingency()`.
    lambda_ : float or str, optional
        Inherited from `scipy.stats.contingency.chi2_contingency()`.

    Returns
    -------
    statistic : float
        Value of the test statistic.

    Notes
    -----
    Cramer's V, Tschuprow's T and Pearson's Contingency Coefficient all
    measure the degree to which two nominal or ordinal variables are
    related, or the level of their association.  This differs from
    correlation, although many often mistakenly consider them equivalent.
    Correlation measures in what way two variables are related, whereas
    association measures how related the variables are.  As such,
    association does not distinguish between dependent and independent
    variables and is closely tied to a test of independence.  A value of
    1.0 indicates perfect association, and 0.0 means the variables have no
    association.

    Both Cramer's V and Tschuprow's T are extensions of the phi
    coefficient.  Moreover, due to the close relationship between
    Cramer's V and Tschuprow's T, the returned values can often be similar
    or even equivalent.  They are likely to diverge more as the table
    shape diverges from a 2x2.

    References
    ----------
    .. [1] "Tschuprow's T",
           https://en.wikipedia.org/wiki/Tschuprow's_T
    .. [2] Tschuprow, A. A. (1939)
           Principles of the Mathematical Theory of Correlation;
           translated by M. Kantorowitsch.  W. Hodge & Co.
    .. [3] "Cramer's V", https://en.wikipedia.org/wiki/Cramer's_V
    .. [4] "Nominal Association: Phi and Cramer's V",
           http://www.people.vcu.edu/~pdattalo/702SuppRead/MeasAssoc/NominalAssoc.html
    .. [5] Gingrich, Paul, "Association Between Variables",
           http://uregina.ca/~gingrich/ch11a.pdf

    Examples
    --------
    An example with a 4x2 contingency table:

    >>> import numpy as np
    >>> from scipy.stats.contingency import association
    >>> obs4x2 = np.array([[100, 150], [203, 322], [420, 700], [320, 210]])

    Pearson's contingency coefficient:

    >>> association(obs4x2, method="pearson")
    0.18303298140595667

    Cramer's V:

    >>> association(obs4x2, method="cramer")
    0.18617813077483678

    Tschuprow's T:

    >>> association(obs4x2, method="tschuprow")
    0.14146478765062995
    """
    arr = np.asarray(observed)
    if not np.issubdtype(arr.dtype, np.integer):
        raise ValueError("`observed` must be an integer array.")

    if len(arr.shape) != 2:
        raise ValueError("method only accepts 2d arrays")

    chi2_stat = chi2_contingency(arr, correction=correction,
                                 lambda_=lambda_)

    phi2 = chi2_stat.statistic / arr.sum()
    n_rows, n_cols = arr.shape
    if method == "cramer":
        value = phi2 / min(n_cols - 1, n_rows - 1)
    elif method == "tschuprow":
        value = phi2 / math.sqrt((n_rows - 1) * (n_cols - 1))
    elif method == 'pearson':
        value = phi2 / (1 + phi2)
    else:
        raise ValueError("Invalid argument value: 'method' argument must "
                         "be 'cramer', 'tschuprow', or 'pearson'")

    return math.sqrt(value)
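

# A minimal hand-check (hypothetical helper, not part of the SciPy API) of
# the Cramer's V branch above: phi^2 = chi2 / n, scaled by
# min(n_rows - 1, n_cols - 1) before taking the square root.
def _cramers_v_by_hand(observed):
    observed = np.asarray(observed)
    chi2 = chi2_contingency(observed, correction=False).statistic
    phi2 = chi2 / observed.sum()
    n_rows, n_cols = observed.shape
    return math.sqrt(phi2 / min(n_rows - 1, n_cols - 1))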