1"""
2Contingency table functions (:mod:`scipy.stats.contingency`)
3============================================================
5Functions for creating and analyzing contingency tables.
7.. currentmodule:: scipy.stats.contingency
9.. autosummary::
10 :toctree: generated/
12 chi2_contingency
13 relative_risk
14 odds_ratio
15 crosstab
16 association
18 expected_freq
19 margins
21"""

from functools import reduce
import math
import numpy as np
from ._stats_py import power_divergence
from ._relative_risk import relative_risk
from ._crosstab import crosstab
from ._odds_ratio import odds_ratio
from scipy._lib._bunch import _make_tuple_bunch


__all__ = ['margins', 'expected_freq', 'chi2_contingency', 'crosstab',
           'association', 'relative_risk', 'odds_ratio']


def margins(a):
    """Return a list of the marginal sums of the array `a`.

    Parameters
    ----------
    a : ndarray
        The array for which to compute the marginal sums.

    Returns
    -------
    margsums : list of ndarrays
        A list of length `a.ndim`.  `margsums[k]` is the result
        of summing `a` over all axes except `k`; it has the same
        number of dimensions as `a`, but the length of each axis
        except axis `k` will be 1.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats.contingency import margins

    >>> a = np.arange(12).reshape(2, 6)
    >>> a
    array([[ 0,  1,  2,  3,  4,  5],
           [ 6,  7,  8,  9, 10, 11]])
    >>> m0, m1 = margins(a)
    >>> m0
    array([[15],
           [51]])
    >>> m1
    array([[ 6,  8, 10, 12, 14, 16]])

    >>> b = np.arange(24).reshape(2, 3, 4)
    >>> m0, m1, m2 = margins(b)
    >>> m0
    array([[[ 66]],
           [[210]]])
    >>> m1
    array([[[ 60],
            [ 92],
            [124]]])
    >>> m2
    array([[[60, 66, 72, 78]]])
    """
    margsums = []
    ranged = list(range(a.ndim))
    for k in ranged:
        marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k])
        margsums.append(marg)
    return margsums
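
# An illustrative aside (a sketch, not part of the original module): each
# marginal sum computed above with ``np.apply_over_axes`` could equivalently
# be written with ``ndarray.sum`` over the complementary axes, keeping the
# reduced dimensions so the shapes stay broadcastable:
#
#     axes = tuple(j for j in ranged if j != k)
#     marg = a.sum(axis=axes, keepdims=True)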


def expected_freq(observed):
    """
    Compute the expected frequencies from a contingency table.

    Given an n-dimensional contingency table of observed frequencies,
    compute the expected frequencies for the table based on the marginal
    sums under the assumption that the groups associated with each
    dimension are independent.

    Parameters
    ----------
    observed : array_like
        The table of observed frequencies.  (While this function can handle
        a 1-D array, that case is trivial.  Generally `observed` is at
        least 2-D.)

    Returns
    -------
    expected : ndarray of float64
        The expected frequencies, based on the marginal sums of the table.
        Same shape as `observed`.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats.contingency import expected_freq
    >>> observed = np.array([[10, 10, 20], [20, 20, 20]])
    >>> expected_freq(observed)
    array([[ 12.,  12.,  16.],
           [ 18.,  18.,  24.]])

    """
    # Typically `observed` is an integer array. If `observed` has a large
    # number of dimensions or holds large values, some of the following
    # computations may overflow, so we first switch to floating point.
    observed = np.asarray(observed, dtype=np.float64)

    # Create a list of the marginal sums.
    margsums = margins(observed)

    # Create the array of expected frequencies.  The shapes of the
    # marginal sums returned by apply_over_axes() are just what we
    # need for broadcasting in the following product.
    d = observed.ndim
    expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
    return expected
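
# An illustrative 2-D special case (a sketch, not part of the module): for a
# two-way table, the product-of-margins formula above reduces to the familiar
# outer product of the row and column sums divided by the grand total:
#
#     obs = np.array([[10, 10, 20], [20, 20, 20]])
#     rowsums = obs.sum(axis=1)
#     colsums = obs.sum(axis=0)
#     expected = np.outer(rowsums, colsums) / obs.sum()
#     # matches expected_freq(obs)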


Chi2ContingencyResult = _make_tuple_bunch(
    'Chi2ContingencyResult',
    ['statistic', 'pvalue', 'dof', 'expected_freq'], []
)
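
# An informal note (an assumption based on how ``_make_tuple_bunch`` results
# are used, not part of the original module): the result object behaves both
# as a plain tuple and as an object with named attributes, so either style
# works:
#
#     res = chi2_contingency(obs)
#     chi2, p, dof, expected = res    # tuple-style unpacking
#     chi2 == res.statistic           # attribute-style access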


def chi2_contingency(observed, correction=True, lambda_=None):
    """Chi-square test of independence of variables in a contingency table.

    This function computes the chi-square statistic and p-value for the
    hypothesis test of independence of the observed frequencies in the
    contingency table [1]_ `observed`.  The expected frequencies are computed
    based on the marginal sums under the assumption of independence; see
    `scipy.stats.contingency.expected_freq`.  The number of degrees of
    freedom is (expressed using numpy functions and attributes)::

        dof = observed.size - sum(observed.shape) + observed.ndim - 1
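
    For example, a 2x3 table has ``dof = 6 - (2 + 3) + 2 - 1 = 2``.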

    Parameters
    ----------
    observed : array_like
        The contingency table.  The table contains the observed frequencies
        (i.e. number of occurrences) in each category.  In the two-dimensional
        case, the table is often described as an "R x C table".
    correction : bool, optional
        If True, *and* the degrees of freedom is 1, apply Yates' correction
        for continuity.  The effect of the correction is to adjust each
        observed value by 0.5 towards the corresponding expected value.
    lambda_ : float or str, optional
        By default, the statistic computed in this test is Pearson's
        chi-squared statistic [2]_.  `lambda_` allows a statistic from the
        Cressie-Read power divergence family [3]_ to be used instead.  See
        `scipy.stats.power_divergence` for details.

    Returns
    -------
    res : Chi2ContingencyResult
        An object containing attributes:

        statistic : float
            The test statistic.
        pvalue : float
            The p-value of the test.
        dof : int
            The degrees of freedom.
        expected_freq : ndarray, same shape as `observed`
            The expected frequencies, based on the marginal sums of the
            table.

    See Also
    --------
    scipy.stats.contingency.expected_freq
    scipy.stats.fisher_exact
    scipy.stats.chisquare
    scipy.stats.power_divergence
    scipy.stats.barnard_exact
    scipy.stats.boschloo_exact

    Notes
    -----
    An often quoted guideline for the validity of this calculation is that
    the test should be used only if the observed and expected frequencies
    in each cell are at least 5.

    This is a test for the independence of different categories of a
    population.  The test is only meaningful when the dimension of
    `observed` is two or more.  Applying the test to a one-dimensional
    table will always result in `expected` equal to `observed` and a
    chi-square statistic equal to 0.

    This function does not handle masked arrays, because the calculation
    does not make sense with missing values.

    Like `scipy.stats.chisquare`, this function computes a chi-square
    statistic; the convenience this function provides is to figure out the
    expected frequencies and degrees of freedom from the given contingency
    table.  If these were already known, and if Yates' correction were not
    required, one could use `scipy.stats.chisquare`.  That is, if one calls::

        res = chi2_contingency(obs, correction=False)

    then the following is true (with ``stats`` being `scipy.stats`)::

        (res.statistic, res.pvalue) == stats.chisquare(
            obs.ravel(), f_exp=res.expected_freq.ravel(),
            ddof=obs.size - 1 - res.dof)

    The `lambda_` argument was added in SciPy 0.13.0.

    References
    ----------
    .. [1] "Contingency table",
           https://en.wikipedia.org/wiki/Contingency_table
    .. [2] "Pearson's chi-squared test",
           https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
    .. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
           Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
           pp. 440-464.

    Examples
    --------
    A two-way example (2 x 3):

    >>> import numpy as np
    >>> from scipy.stats import chi2_contingency
    >>> obs = np.array([[10, 10, 20], [20, 20, 20]])
    >>> res = chi2_contingency(obs)
    >>> res.statistic
    2.7777777777777777
    >>> res.pvalue
    0.24935220877729619
    >>> res.dof
    2
    >>> res.expected_freq
    array([[ 12.,  12.,  16.],
           [ 18.,  18.,  24.]])

    Perform the test using the log-likelihood ratio (i.e. the "G-test")
    instead of Pearson's chi-squared statistic.

    >>> res = chi2_contingency(obs, lambda_="log-likelihood")
    >>> res.statistic
    2.7688587616781319
    >>> res.pvalue
    0.25046668010954165

    A four-way example (2 x 2 x 2 x 2):

    >>> obs = np.array(
    ...     [[[[12, 17],
    ...        [11, 16]],
    ...       [[11, 12],
    ...        [15, 16]]],
    ...      [[[23, 15],
    ...        [30, 22]],
    ...       [[14, 17],
    ...        [15, 16]]]])
    >>> res = chi2_contingency(obs)
    >>> res.statistic
    8.7584514426741897
    >>> res.pvalue
    0.64417725029295503
    """
    observed = np.asarray(observed)
    if np.any(observed < 0):
        raise ValueError("All values in `observed` must be nonnegative.")
    if observed.size == 0:
        raise ValueError("No data; `observed` has size 0.")

    expected = expected_freq(observed)
    if np.any(expected == 0):
        # Include one of the positions where expected is zero in
        # the exception message.
        zeropos = list(zip(*np.nonzero(expected == 0)))[0]
        raise ValueError("The internally computed table of expected "
                         "frequencies has a zero element at %s." % (zeropos,))

    # The degrees of freedom
    dof = expected.size - sum(expected.shape) + expected.ndim - 1

    if dof == 0:
        # Degenerate case; this occurs when `observed` is 1D (or, more
        # generally, when it has only one nontrivial dimension).  In this
        # case, we also have observed == expected, so chi2 is 0.
        chi2 = 0.0
        p = 1.0
    else:
        if dof == 1 and correction:
            # Adjust `observed` according to Yates' correction for continuity.
            # Magnitude of correction no bigger than difference; see gh-13875
            diff = expected - observed
            direction = np.sign(diff)
            magnitude = np.minimum(0.5, np.abs(diff))
            observed = observed + magnitude * direction

        chi2, p = power_divergence(observed, expected,
                                   ddof=observed.size - 1 - dof, axis=None,
                                   lambda_=lambda_)

    return Chi2ContingencyResult(chi2, p, dof, expected)
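
# A quick consistency sketch (illustrative, not part of the original module):
# the Notes in the docstring above state that, without Yates' correction,
# this function agrees with ``scipy.stats.chisquare`` applied to the
# flattened table:
#
#     from scipy import stats
#     obs = np.array([[10, 10, 20], [20, 20, 20]])
#     res = chi2_contingency(obs, correction=False)
#     chk = stats.chisquare(obs.ravel(),
#                           f_exp=res.expected_freq.ravel(),
#                           ddof=obs.size - 1 - res.dof)
#     # res.statistic == chk.statistic and res.pvalue == chk.pvalue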


def association(observed, method="cramer", correction=False, lambda_=None):
    """Calculates the degree of association between two nominal variables.

    The function provides the option for computing one of three measures of
    association between two nominal variables from the data given in a 2-D
    contingency table: Tschuprow's T, Pearson's Contingency Coefficient
    and Cramer's V.

    Parameters
    ----------
    observed : array_like
        The array of observed values.
    method : {"cramer", "tschuprow", "pearson"}, optional
        The association test statistic.  Default is "cramer".
    correction : bool, optional
        Passed through to `scipy.stats.contingency.chi2_contingency`;
        whether to apply Yates' correction for continuity.
    lambda_ : float or str, optional
        Passed through to `scipy.stats.contingency.chi2_contingency`;
        selects a statistic from the Cressie-Read power divergence family.

    Returns
    -------
    statistic : float
        Value of the test statistic.

    Notes
    -----
    Cramer's V, Tschuprow's T and Pearson's Contingency Coefficient all
    measure the degree to which two nominal or ordinal variables are related,
    or the level of their association.  This differs from correlation,
    although many often mistakenly consider them equivalent.  Correlation
    measures in what way two variables are related, whereas association
    measures how strongly they are related.  As such, association does not
    distinguish dependent from independent variables; it quantifies how far
    the table departs from independence.  A value of 1.0 indicates perfect
    association, and 0.0 means the variables have no association.

    Both Cramer's V and Tschuprow's T are extensions of the phi
    coefficient.  Moreover, due to the close relationship between
    Cramer's V and Tschuprow's T, the returned values can often be similar
    or even equivalent.  They are likely to diverge more as the table shape
    departs from 2x2.
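
    With ``chi2`` the statistic returned by `chi2_contingency`,
    ``n = observed.sum()`` and ``phi2 = chi2 / n``, the quantities computed
    here (summarizing the implementation below) are
    ``sqrt(phi2 / min(r - 1, c - 1))`` for Cramer's V,
    ``sqrt(phi2 / sqrt((r - 1) * (c - 1)))`` for Tschuprow's T and
    ``sqrt(phi2 / (1 + phi2))`` for Pearson's Contingency Coefficient, where
    ``r`` and ``c`` are the numbers of rows and columns of `observed`.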

    References
    ----------
    .. [1] "Tschuprow's T",
           https://en.wikipedia.org/wiki/Tschuprow's_T
    .. [2] Tschuprow, A. A. (1939)
           Principles of the Mathematical Theory of Correlation;
           translated by M. Kantorowitsch. W. Hodge & Co.
    .. [3] "Cramer's V", https://en.wikipedia.org/wiki/Cramer's_V
    .. [4] "Nominal Association: Phi and Cramer's V",
           http://www.people.vcu.edu/~pdattalo/702SuppRead/MeasAssoc/NominalAssoc.html
    .. [5] Gingrich, Paul, "Association Between Variables",
           http://uregina.ca/~gingrich/ch11a.pdf

    Examples
    --------
    An example with a 4x2 contingency table:

    >>> import numpy as np
    >>> from scipy.stats.contingency import association
    >>> obs4x2 = np.array([[100, 150], [203, 322], [420, 700], [320, 210]])

    Pearson's contingency coefficient

    >>> association(obs4x2, method="pearson")
    0.18303298140595667

    Cramer's V

    >>> association(obs4x2, method="cramer")
    0.18617813077483678

    Tschuprow's T

    >>> association(obs4x2, method="tschuprow")
    0.14146478765062995
    """
    arr = np.asarray(observed)
    if not np.issubdtype(arr.dtype, np.integer):
        raise ValueError("`observed` must be an integer array.")

    if arr.ndim != 2:
        raise ValueError("`observed` must be a 2-D array.")

    chi2_stat = chi2_contingency(arr, correction=correction,
                                 lambda_=lambda_)

    phi2 = chi2_stat.statistic / arr.sum()
    n_rows, n_cols = arr.shape
    if method == "cramer":
        value = phi2 / min(n_cols - 1, n_rows - 1)
    elif method == "tschuprow":
        value = phi2 / math.sqrt((n_rows - 1) * (n_cols - 1))
    elif method == "pearson":
        value = phi2 / (1 + phi2)
    else:
        raise ValueError("Invalid argument value: 'method' argument must "
                         "be 'cramer', 'tschuprow', or 'pearson'")

    return math.sqrt(value)
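
# A small hand-check sketch (illustrative, not part of the module): computing
# Cramer's V directly from the chi-square statistic for the 4x2 example in
# the docstring should match ``association(obs4x2, method="cramer")``:
#
#     obs4x2 = np.array([[100, 150], [203, 322], [420, 700], [320, 210]])
#     chi2 = chi2_contingency(obs4x2, correction=False).statistic
#     phi2 = chi2 / obs4x2.sum()
#     cramers_v = math.sqrt(phi2 / (min(obs4x2.shape) - 1))
#     # cramers_v ~= 0.18617813077483678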