1"""
2Contingency table functions (:mod:`scipy.stats.contingency`)
3============================================================
5Functions for creating and analyzing contingency tables.
7.. currentmodule:: scipy.stats.contingency
9.. autosummary::
10 :toctree: generated/
12 chi2_contingency
13 relative_risk
14 odds_ratio
15 crosstab
16 association
18 expected_freq
19 margins
21"""

from functools import reduce
import math
import numpy as np
from ._stats_py import power_divergence
from ._relative_risk import relative_risk
from ._crosstab import crosstab
from ._odds_ratio import odds_ratio
from scipy._lib._bunch import _make_tuple_bunch


__all__ = ['margins', 'expected_freq', 'chi2_contingency', 'crosstab',
           'association', 'relative_risk', 'odds_ratio']


def margins(a):
    """Return a list of the marginal sums of the array `a`.

    Parameters
    ----------
    a : ndarray
        The array for which to compute the marginal sums.

    Returns
    -------
    margsums : list of ndarrays
        A list of length `a.ndim`.  `margsums[k]` is the result
        of summing `a` over all axes except `k`; it has the same
        number of dimensions as `a`, but the length of each axis
        except axis `k` will be 1.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats.contingency import margins

    >>> a = np.arange(12).reshape(2, 6)
    >>> a
    array([[ 0,  1,  2,  3,  4,  5],
           [ 6,  7,  8,  9, 10, 11]])
    >>> m0, m1 = margins(a)
    >>> m0
    array([[15],
           [51]])
    >>> m1
    array([[ 6,  8, 10, 12, 14, 16]])

    >>> b = np.arange(24).reshape(2, 3, 4)
    >>> m0, m1, m2 = margins(b)
    >>> m0
    array([[[ 66]],
           [[210]]])
    >>> m1
    array([[[ 60],
            [ 92],
            [124]]])
    >>> m2
    array([[[60, 66, 72, 78]]])
    """
    margsums = []
    ranged = list(range(a.ndim))
    for k in ranged:
        marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k])
        margsums.append(marg)
    return margsums
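
# An illustrative aside (a sketch, not part of the original module): each
# marginal sum computed above with ``np.apply_over_axes`` could equivalently
# be written with ``ndarray.sum`` over the complementary axes, keeping the
# reduced dimensions so the shapes stay broadcastable:
#
#     axes = tuple(j for j in ranged if j != k)
#     marg = a.sum(axis=axes, keepdims=True)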


def expected_freq(observed):
    """
    Compute the expected frequencies from a contingency table.

    Given an n-dimensional contingency table of observed frequencies,
    compute the expected frequencies for the table based on the marginal
    sums under the assumption that the groups associated with each
    dimension are independent.

    Parameters
    ----------
    observed : array_like
        The table of observed frequencies.  (While this function can handle
        a 1-D array, that case is trivial.  Generally `observed` is at
        least 2-D.)

    Returns
    -------
    expected : ndarray of float64
        The expected frequencies, based on the marginal sums of the table.
        Same shape as `observed`.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats.contingency import expected_freq
    >>> observed = np.array([[10, 10, 20], [20, 20, 20]])
    >>> expected_freq(observed)
    array([[ 12.,  12.,  16.],
           [ 18.,  18.,  24.]])

    """
    # Typically `observed` is an integer array. If `observed` has a large
    # number of dimensions or holds large values, some of the following
    # computations may overflow, so we first switch to floating point.
    observed = np.asarray(observed, dtype=np.float64)

    # Create a list of the marginal sums.
    margsums = margins(observed)

    # Create the array of expected frequencies.  The shapes of the
    # marginal sums returned by apply_over_axes() are just what we
    # need for broadcasting in the following product.
    d = observed.ndim
    expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
    return expected
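
# An illustrative 2-D special case (a sketch, not part of the module): for a
# two-way table, the product-of-margins formula above reduces to the familiar
# outer product of the row and column sums divided by the grand total:
#
#     obs = np.array([[10, 10, 20], [20, 20, 20]])
#     rowsums = obs.sum(axis=1)
#     colsums = obs.sum(axis=0)
#     expected = np.outer(rowsums, colsums) / obs.sum()
#     # matches expected_freq(obs)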


Chi2ContingencyResult = _make_tuple_bunch(
    'Chi2ContingencyResult',
    ['statistic', 'pvalue', 'dof', 'expected_freq'], []
)
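
# An informal note (an assumption based on how ``_make_tuple_bunch`` results
# are used, not part of the original module): the result object behaves both
# as a plain tuple and as an object with named attributes, so either style
# works:
#
#     res = chi2_contingency(obs)
#     chi2, p, dof, expected = res    # tuple-style unpacking
#     chi2 == res.statistic           # attribute-style access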


def chi2_contingency(observed, correction=True, lambda_=None):
    """Chi-square test of independence of variables in a contingency table.

    This function computes the chi-square statistic and p-value for the
    hypothesis test of independence of the observed frequencies in the
    contingency table [1]_ `observed`.  The expected frequencies are computed
    based on the marginal sums under the assumption of independence; see
    `scipy.stats.contingency.expected_freq`.  The number of degrees of
    freedom is (expressed using numpy functions and attributes)::

        dof = observed.size - sum(observed.shape) + observed.ndim - 1
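
    For example, a 2x3 table has ``dof = 6 - (2 + 3) + 2 - 1 = 2``.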

    Parameters
    ----------
    observed : array_like
        The contingency table.  The table contains the observed frequencies
        (i.e. number of occurrences) in each category.  In the two-dimensional
        case, the table is often described as an "R x C table".
    correction : bool, optional
        If True, *and* the degrees of freedom is 1, apply Yates' correction
        for continuity.  The effect of the correction is to adjust each
        observed value by 0.5 towards the corresponding expected value.
    lambda_ : float or str, optional
        By default, the statistic computed in this test is Pearson's
        chi-squared statistic [2]_.  `lambda_` allows a statistic from the
        Cressie-Read power divergence family [3]_ to be used instead.  See
        `scipy.stats.power_divergence` for details.

    Returns
    -------
    res : Chi2ContingencyResult
        An object containing attributes:

        statistic : float
            The test statistic.
        pvalue : float
            The p-value of the test.
        dof : int
            The degrees of freedom.
        expected_freq : ndarray, same shape as `observed`
            The expected frequencies, based on the marginal sums of the
            table.

    See Also
    --------
    scipy.stats.contingency.expected_freq
    scipy.stats.fisher_exact
    scipy.stats.chisquare
    scipy.stats.power_divergence
    scipy.stats.barnard_exact
    scipy.stats.boschloo_exact

    Notes
    -----
    An often quoted guideline for the validity of this calculation is that
    the test should be used only if the observed and expected frequencies
    in each cell are at least 5.

    This is a test for the independence of different categories of a
    population.  The test is only meaningful when the dimension of
    `observed` is two or more.  Applying the test to a one-dimensional
    table will always result in `expected` equal to `observed` and a
    chi-square statistic equal to 0.

    This function does not handle masked arrays, because the calculation
    does not make sense with missing values.

    Like `scipy.stats.chisquare`, this function computes a chi-square
    statistic; the convenience this function provides is to figure out the
    expected frequencies and degrees of freedom from the given contingency
    table.  If these were already known, and if Yates' correction were not
    required, one could use `scipy.stats.chisquare`.  That is, if one calls::

        res = chi2_contingency(obs, correction=False)

    then the following is true (with ``stats`` being `scipy.stats`)::

        (res.statistic, res.pvalue) == stats.chisquare(
            obs.ravel(), f_exp=res.expected_freq.ravel(),
            ddof=obs.size - 1 - res.dof)

    The `lambda_` argument was added in SciPy 0.13.0.

    References
    ----------
    .. [1] "Contingency table",
           https://en.wikipedia.org/wiki/Contingency_table
    .. [2] "Pearson's chi-squared test",
           https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
    .. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
           Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
           pp. 440-464.

    Examples
    --------
    A two-way example (2 x 3):

    >>> import numpy as np
    >>> from scipy.stats import chi2_contingency
    >>> obs = np.array([[10, 10, 20], [20, 20, 20]])
    >>> res = chi2_contingency(obs)
    >>> res.statistic
    2.7777777777777777
    >>> res.pvalue
    0.24935220877729619
    >>> res.dof
    2
    >>> res.expected_freq
    array([[ 12.,  12.,  16.],
           [ 18.,  18.,  24.]])

    Perform the test using the log-likelihood ratio (i.e. the "G-test")
    instead of Pearson's chi-squared statistic.

    >>> res = chi2_contingency(obs, lambda_="log-likelihood")
    >>> res.statistic
    2.7688587616781319
    >>> res.pvalue
    0.25046668010954165

    A four-way example (2 x 2 x 2 x 2):

    >>> obs = np.array(
    ...     [[[[12, 17],
    ...        [11, 16]],
    ...       [[11, 12],
    ...        [15, 16]]],
    ...      [[[23, 15],
    ...        [30, 22]],
    ...       [[14, 17],
    ...        [15, 16]]]])
    >>> res = chi2_contingency(obs)
    >>> res.statistic
    8.7584514426741897
    >>> res.pvalue
    0.64417725029295503
    """
    observed = np.asarray(observed)
    if np.any(observed < 0):
        raise ValueError("All values in `observed` must be nonnegative.")
    if observed.size == 0:
        raise ValueError("No data; `observed` has size 0.")

    expected = expected_freq(observed)
    if np.any(expected == 0):
        # Include one of the positions where expected is zero in
        # the exception message.
        zeropos = list(zip(*np.nonzero(expected == 0)))[0]
        raise ValueError("The internally computed table of expected "
                         "frequencies has a zero element at %s." % (zeropos,))

    # The degrees of freedom
    dof = expected.size - sum(expected.shape) + expected.ndim - 1

    if dof == 0:
        # Degenerate case; this occurs when `observed` is 1D (or, more
        # generally, when it has only one nontrivial dimension).  In this
        # case, we also have observed == expected, so chi2 is 0.
        chi2 = 0.0
        p = 1.0
    else:
        if dof == 1 and correction:
            # Adjust `observed` according to Yates' correction for continuity.
            # Magnitude of correction no bigger than difference; see gh-13875
            diff = expected - observed
            direction = np.sign(diff)
            magnitude = np.minimum(0.5, np.abs(diff))
            observed = observed + magnitude * direction

        chi2, p = power_divergence(observed, expected,
                                   ddof=observed.size - 1 - dof, axis=None,
                                   lambda_=lambda_)

    return Chi2ContingencyResult(chi2, p, dof, expected)
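
# A quick consistency sketch (illustrative, not part of the original module):
# the Notes in the docstring above state that, without Yates' correction,
# this function agrees with ``scipy.stats.chisquare`` applied to the
# flattened table:
#
#     from scipy import stats
#     obs = np.array([[10, 10, 20], [20, 20, 20]])
#     res = chi2_contingency(obs, correction=False)
#     chk = stats.chisquare(obs.ravel(),
#                           f_exp=res.expected_freq.ravel(),
#                           ddof=obs.size - 1 - res.dof)
#     # res.statistic == chk.statistic and res.pvalue == chk.pvalue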


def association(observed, method="cramer", correction=False, lambda_=None):
    """Calculates the degree of association between two nominal variables.

    The function provides the option for computing one of three measures of
    association between two nominal variables from the data given in a 2-D
    contingency table: Tschuprow's T, Pearson's Contingency Coefficient
    and Cramer's V.

    Parameters
    ----------
    observed : array_like
        The array of observed values.
    method : {"cramer", "tschuprow", "pearson"}, optional
        The association test statistic.  Default is "cramer".
    correction : bool, optional
        Passed through to `scipy.stats.contingency.chi2_contingency`;
        whether to apply Yates' correction for continuity.
    lambda_ : float or str, optional
        Passed through to `scipy.stats.contingency.chi2_contingency`;
        selects a statistic from the Cressie-Read power divergence family.

    Returns
    -------
    statistic : float
        Value of the test statistic.

    Notes
    -----
    Cramer's V, Tschuprow's T and Pearson's Contingency Coefficient all
    measure the degree to which two nominal or ordinal variables are related,
    or the level of their association.  This differs from correlation,
    although many often mistakenly consider them equivalent.  Correlation
    measures in what way two variables are related, whereas association
    measures how strongly they are related.  As such, association does not
    distinguish dependent from independent variables; it quantifies how far
    the table departs from independence.  A value of 1.0 indicates perfect
    association, and 0.0 means the variables have no association.

    Both Cramer's V and Tschuprow's T are extensions of the phi
    coefficient.  Moreover, due to the close relationship between
    Cramer's V and Tschuprow's T, the returned values can often be similar
    or even equivalent.  They are likely to diverge more as the table shape
    departs from 2x2.
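
    With ``chi2`` the statistic returned by `chi2_contingency`,
    ``n = observed.sum()`` and ``phi2 = chi2 / n``, the quantities computed
    here (summarizing the implementation below) are
    ``sqrt(phi2 / min(r - 1, c - 1))`` for Cramer's V,
    ``sqrt(phi2 / sqrt((r - 1) * (c - 1)))`` for Tschuprow's T and
    ``sqrt(phi2 / (1 + phi2))`` for Pearson's Contingency Coefficient, where
    ``r`` and ``c`` are the numbers of rows and columns of `observed`.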

    References
    ----------
    .. [1] "Tschuprow's T",
           https://en.wikipedia.org/wiki/Tschuprow's_T
    .. [2] Tschuprow, A. A. (1939)
           Principles of the Mathematical Theory of Correlation;
           translated by M. Kantorowitsch. W. Hodge & Co.
    .. [3] "Cramer's V", https://en.wikipedia.org/wiki/Cramer's_V
    .. [4] "Nominal Association: Phi and Cramer's V",
           http://www.people.vcu.edu/~pdattalo/702SuppRead/MeasAssoc/NominalAssoc.html
    .. [5] Gingrich, Paul, "Association Between Variables",
           http://uregina.ca/~gingrich/ch11a.pdf

    Examples
    --------
    An example with a 4x2 contingency table:

    >>> import numpy as np
    >>> from scipy.stats.contingency import association
    >>> obs4x2 = np.array([[100, 150], [203, 322], [420, 700], [320, 210]])

    Pearson's contingency coefficient

    >>> association(obs4x2, method="pearson")
    0.18303298140595667

    Cramer's V

    >>> association(obs4x2, method="cramer")
    0.18617813077483678

    Tschuprow's T

    >>> association(obs4x2, method="tschuprow")
    0.14146478765062995
    """
    arr = np.asarray(observed)
    if not np.issubdtype(arr.dtype, np.integer):
        raise ValueError("`observed` must be an integer array.")

    if arr.ndim != 2:
        raise ValueError("`observed` must be a 2-D array.")

    chi2_stat = chi2_contingency(arr, correction=correction,
                                 lambda_=lambda_)

    phi2 = chi2_stat.statistic / arr.sum()
    n_rows, n_cols = arr.shape
    if method == "cramer":
        value = phi2 / min(n_cols - 1, n_rows - 1)
    elif method == "tschuprow":
        value = phi2 / math.sqrt((n_rows - 1) * (n_cols - 1))
    elif method == "pearson":
        value = phi2 / (1 + phi2)
    else:
        raise ValueError("Invalid argument value: 'method' argument must "
                         "be 'cramer', 'tschuprow', or 'pearson'")

    return math.sqrt(value)
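
# A small hand-check sketch (illustrative, not part of the module): computing
# Cramer's V directly from the chi-square statistic for the 4x2 example in
# the docstring should match ``association(obs4x2, method="cramer")``:
#
#     obs4x2 = np.array([[100, 150], [203, 322], [420, 700], [320, 210]])
#     chi2 = chi2_contingency(obs4x2, correction=False).statistic
#     phi2 = chi2 / obs4x2.sum()
#     cramers_v = math.sqrt(phi2 / (min(obs4x2.shape) - 1))
#     # cramers_v ~= 0.18617813077483678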