Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_odds_ratio.py: 15%
137 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
2import numpy as np
4from scipy.special import ndtri
5from scipy.optimize import brentq
6from ._discrete_distns import nchypergeom_fisher
7from ._common import ConfidenceInterval
10def _sample_odds_ratio(table):
11 """
12 Given a table [[a, b], [c, d]], compute a*d/(b*c).
14 Return nan if the numerator and denominator are 0.
15 Return inf if just the denominator is 0.
16 """
17 # table must be a 2x2 numpy array.
18 if table[1, 0] > 0 and table[0, 1] > 0:
19 oddsratio = table[0, 0] * table[1, 1] / (table[1, 0] * table[0, 1])
20 elif table[0, 0] == 0 or table[1, 1] == 0:
21 oddsratio = np.nan
22 else:
23 oddsratio = np.inf
24 return oddsratio
27def _solve(func):
28 """
29 Solve func(nc) = 0. func must be an increasing function.
30 """
31 # We could just as well call the variable `x` instead of `nc`, but we
32 # always call this function with functions for which nc (the noncentrality
33 # parameter) is the variable for which we are solving.
34 nc = 1.0
35 value = func(nc)
36 if value == 0:
37 return nc
39 # Multiplicative factor by which to increase or decrease nc when
40 # searching for a bracketing interval.
41 factor = 2.0
42 # Find a bracketing interval.
43 if value > 0:
44 nc /= factor
45 while func(nc) > 0:
46 nc /= factor
47 lo = nc
48 hi = factor*nc
49 else:
50 nc *= factor
51 while func(nc) < 0:
52 nc *= factor
53 lo = nc/factor
54 hi = nc
56 # lo and hi bracket the solution for nc.
57 nc = brentq(func, lo, hi, xtol=1e-13)
58 return nc
61def _nc_hypergeom_mean_inverse(x, M, n, N):
62 """
63 For the given noncentral hypergeometric parameters x, M, n,and N
64 (table[0,0], total, row 0 sum and column 0 sum, resp., of a 2x2
65 contingency table), find the noncentrality parameter of Fisher's
66 noncentral hypergeometric distribution whose mean is x.
67 """
68 nc = _solve(lambda nc: nchypergeom_fisher.mean(M, n, N, nc) - x)
69 return nc
72def _hypergeom_params_from_table(table):
73 # The notation M, n and N is consistent with stats.hypergeom and
74 # stats.nchypergeom_fisher.
75 x = table[0, 0]
76 M = table.sum()
77 n = table[0].sum()
78 N = table[:, 0].sum()
79 return x, M, n, N
82def _ci_upper(table, alpha):
83 """
84 Compute the upper end of the confidence interval.
85 """
86 if _sample_odds_ratio(table) == np.inf:
87 return np.inf
89 x, M, n, N = _hypergeom_params_from_table(table)
91 # nchypergeom_fisher.cdf is a decreasing function of nc, so we negate
92 # it in the lambda expression.
93 nc = _solve(lambda nc: -nchypergeom_fisher.cdf(x, M, n, N, nc) + alpha)
94 return nc
97def _ci_lower(table, alpha):
98 """
99 Compute the lower end of the confidence interval.
100 """
101 if _sample_odds_ratio(table) == 0:
102 return 0
104 x, M, n, N = _hypergeom_params_from_table(table)
106 nc = _solve(lambda nc: nchypergeom_fisher.sf(x - 1, M, n, N, nc) - alpha)
107 return nc
110def _conditional_oddsratio(table):
111 """
112 Conditional MLE of the odds ratio for the 2x2 contingency table.
113 """
114 x, M, n, N = _hypergeom_params_from_table(table)
115 # Get the bounds of the support. The support of the noncentral
116 # hypergeometric distribution with parameters M, n, and N is the same
117 # for all values of the noncentrality parameter, so we can use 1 here.
118 lo, hi = nchypergeom_fisher.support(M, n, N, 1)
120 # Check if x is at one of the extremes of the support. If so, we know
121 # the odds ratio is either 0 or inf.
122 if x == lo:
123 # x is at the low end of the support.
124 return 0
125 if x == hi:
126 # x is at the high end of the support.
127 return np.inf
129 nc = _nc_hypergeom_mean_inverse(x, M, n, N)
130 return nc
133def _conditional_oddsratio_ci(table, confidence_level=0.95,
134 alternative='two-sided'):
135 """
136 Conditional exact confidence interval for the odds ratio.
137 """
138 if alternative == 'two-sided':
139 alpha = 0.5*(1 - confidence_level)
140 lower = _ci_lower(table, alpha)
141 upper = _ci_upper(table, alpha)
142 elif alternative == 'less':
143 lower = 0.0
144 upper = _ci_upper(table, 1 - confidence_level)
145 else:
146 # alternative == 'greater'
147 lower = _ci_lower(table, 1 - confidence_level)
148 upper = np.inf
150 return lower, upper
153def _sample_odds_ratio_ci(table, confidence_level=0.95,
154 alternative='two-sided'):
155 oddsratio = _sample_odds_ratio(table)
156 log_or = np.log(oddsratio)
157 se = np.sqrt((1/table).sum())
158 if alternative == 'less':
159 z = ndtri(confidence_level)
160 loglow = -np.inf
161 loghigh = log_or + z*se
162 elif alternative == 'greater':
163 z = ndtri(confidence_level)
164 loglow = log_or - z*se
165 loghigh = np.inf
166 else:
167 # alternative is 'two-sided'
168 z = ndtri(0.5*confidence_level + 0.5)
169 loglow = log_or - z*se
170 loghigh = log_or + z*se
172 return np.exp(loglow), np.exp(loghigh)
175class OddsRatioResult:
176 """
177 Result of `scipy.stats.contingency.odds_ratio`. See the
178 docstring for `odds_ratio` for more details.
180 Attributes
181 ----------
182 statistic : float
183 The computed odds ratio.
185 * If `kind` is ``'sample'``, this is sample (or unconditional)
186 estimate, given by
187 ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
188 * If `kind` is ``'conditional'``, this is the conditional
189 maximum likelihood estimate for the odds ratio. It is
190 the noncentrality parameter of Fisher's noncentral
191 hypergeometric distribution with the same hypergeometric
192 parameters as `table` and whose mean is ``table[0, 0]``.
194 Methods
195 -------
196 confidence_interval :
197 Confidence interval for the odds ratio.
198 """
200 def __init__(self, _table, _kind, statistic):
201 # for now, no need to make _table and _kind public, since this sort of
202 # information is returned in very few `scipy.stats` results
203 self._table = _table
204 self._kind = _kind
205 self.statistic = statistic
207 def __repr__(self):
208 return f"OddsRatioResult(statistic={self.statistic})"
210 def confidence_interval(self, confidence_level=0.95,
211 alternative='two-sided'):
212 """
213 Confidence interval for the odds ratio.
215 Parameters
216 ----------
217 confidence_level: float
218 Desired confidence level for the confidence interval.
219 The value must be given as a fraction between 0 and 1.
220 Default is 0.95 (meaning 95%).
222 alternative : {'two-sided', 'less', 'greater'}, optional
223 The alternative hypothesis of the hypothesis test to which the
224 confidence interval corresponds. That is, suppose the null
225 hypothesis is that the true odds ratio equals ``OR`` and the
226 confidence interval is ``(low, high)``. Then the following options
227 for `alternative` are available (default is 'two-sided'):
229 * 'two-sided': the true odds ratio is not equal to ``OR``. There
230 is evidence against the null hypothesis at the chosen
231 `confidence_level` if ``high < OR`` or ``low > OR``.
232 * 'less': the true odds ratio is less than ``OR``. The ``low`` end
233 of the confidence interval is 0, and there is evidence against
234 the null hypothesis at the chosen `confidence_level` if
235 ``high < OR``.
236 * 'greater': the true odds ratio is greater than ``OR``. The
237 ``high`` end of the confidence interval is ``np.inf``, and there
238 is evidence against the null hypothesis at the chosen
239 `confidence_level` if ``low > OR``.
241 Returns
242 -------
243 ci : ``ConfidenceInterval`` instance
244 The confidence interval, represented as an object with
245 attributes ``low`` and ``high``.
247 Notes
248 -----
249 When `kind` is ``'conditional'``, the limits of the confidence
250 interval are the conditional "exact confidence limits" as described
251 by Fisher [1]_. The conditional odds ratio and confidence interval are
252 also discussed in Section 4.1.2 of the text by Sahai and Khurshid [2]_.
254 When `kind` is ``'sample'``, the confidence interval is computed
255 under the assumption that the logarithm of the odds ratio is normally
256 distributed with standard error given by::
258 se = sqrt(1/a + 1/b + 1/c + 1/d)
260 where ``a``, ``b``, ``c`` and ``d`` are the elements of the
261 contingency table. (See, for example, [2]_, section 3.1.3.2,
262 or [3]_, section 2.3.3).
264 References
265 ----------
266 .. [1] R. A. Fisher (1935), The logic of inductive inference,
267 Journal of the Royal Statistical Society, Vol. 98, No. 1,
268 pp. 39-82.
269 .. [2] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
270 Methods, Techniques, and Applications, CRC Press LLC, Boca
271 Raton, Florida.
272 .. [3] Alan Agresti, An Introduction to Categorical Data Analyis
273 (second edition), Wiley, Hoboken, NJ, USA (2007).
274 """
275 if alternative not in ['two-sided', 'less', 'greater']:
276 raise ValueError("`alternative` must be 'two-sided', 'less' or "
277 "'greater'.")
279 if confidence_level < 0 or confidence_level > 1:
280 raise ValueError('confidence_level must be between 0 and 1')
282 if self._kind == 'conditional':
283 ci = self._conditional_odds_ratio_ci(confidence_level, alternative)
284 else:
285 ci = self._sample_odds_ratio_ci(confidence_level, alternative)
286 return ci
288 def _conditional_odds_ratio_ci(self, confidence_level=0.95,
289 alternative='two-sided'):
290 """
291 Confidence interval for the conditional odds ratio.
292 """
294 table = self._table
295 if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
296 # If both values in a row or column are zero, the p-value is 1,
297 # the odds ratio is NaN and the confidence interval is (0, inf).
298 ci = (0, np.inf)
299 else:
300 ci = _conditional_oddsratio_ci(table,
301 confidence_level=confidence_level,
302 alternative=alternative)
303 return ConfidenceInterval(low=ci[0], high=ci[1])
305 def _sample_odds_ratio_ci(self, confidence_level=0.95,
306 alternative='two-sided'):
307 """
308 Confidence interval for the sample odds ratio.
309 """
310 if confidence_level < 0 or confidence_level > 1:
311 raise ValueError('confidence_level must be between 0 and 1')
313 table = self._table
314 if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
315 # If both values in a row or column are zero, the p-value is 1,
316 # the odds ratio is NaN and the confidence interval is (0, inf).
317 ci = (0, np.inf)
318 else:
319 ci = _sample_odds_ratio_ci(table,
320 confidence_level=confidence_level,
321 alternative=alternative)
322 return ConfidenceInterval(low=ci[0], high=ci[1])
325def odds_ratio(table, *, kind='conditional'):
326 r"""
327 Compute the odds ratio for a 2x2 contingency table.
329 Parameters
330 ----------
331 table : array_like of ints
332 A 2x2 contingency table. Elements must be non-negative integers.
333 kind : str, optional
334 Which kind of odds ratio to compute, either the sample
335 odds ratio (``kind='sample'``) or the conditional odds ratio
336 (``kind='conditional'``). Default is ``'conditional'``.
338 Returns
339 -------
340 result : `~scipy.stats._result_classes.OddsRatioResult` instance
341 The returned object has two computed attributes:
343 statistic : float
344 * If `kind` is ``'sample'``, this is sample (or unconditional)
345 estimate, given by
346 ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
347 * If `kind` is ``'conditional'``, this is the conditional
348 maximum likelihood estimate for the odds ratio. It is
349 the noncentrality parameter of Fisher's noncentral
350 hypergeometric distribution with the same hypergeometric
351 parameters as `table` and whose mean is ``table[0, 0]``.
353 The object has the method `confidence_interval` that computes
354 the confidence interval of the odds ratio.
356 See Also
357 --------
358 scipy.stats.fisher_exact
359 relative_risk
361 Notes
362 -----
363 The conditional odds ratio was discussed by Fisher (see "Example 1"
364 of [1]_). Texts that cover the odds ratio include [2]_ and [3]_.
366 .. versionadded:: 1.10.0
368 References
369 ----------
370 .. [1] R. A. Fisher (1935), The logic of inductive inference,
371 Journal of the Royal Statistical Society, Vol. 98, No. 1,
372 pp. 39-82.
373 .. [2] Breslow NE, Day NE (1980). Statistical methods in cancer research.
374 Volume I - The analysis of case-control studies. IARC Sci Publ.
375 (32):5-338. PMID: 7216345. (See section 4.2.)
376 .. [3] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
377 Methods, Techniques, and Applications, CRC Press LLC, Boca
378 Raton, Florida.
380 Examples
381 --------
382 In epidemiology, individuals are classified as "exposed" or
383 "unexposed" to some factor or treatment. If the occurrence of some
384 illness is under study, those who have the illness are often
385 classifed as "cases", and those without it are "noncases". The
386 counts of the occurrences of these classes gives a contingency
387 table::
389 exposed unexposed
390 cases a b
391 noncases c d
393 The sample odds ratio may be written ``(a/c) / (b/d)``. ``a/c`` can
394 be interpreted as the odds of a case occurring in the exposed group,
395 and ``b/d`` as the odds of a case occurring in the unexposed group.
396 The sample odds ratio is the ratio of these odds. If the odds ratio
397 is greater than 1, it suggests that there is a positive association
398 between being exposed and being a case.
400 Interchanging the rows or columns of the contingency table inverts
401 the odds ratio, so it is import to understand the meaning of labels
402 given to the rows and columns of the table when interpreting the
403 odds ratio.
405 Consider a hypothetical example where it is hypothesized that
406 exposure to a certain chemical is assocated with increased occurrence
407 of a certain disease. Suppose we have the following table for a
408 collection of 410 people::
410 exposed unexposed
411 cases 7 15
412 noncases 58 472
414 The question we ask is "Is exposure to the chemical associated with
415 increased risk of the disease?"
417 Compute the odds ratio:
419 >>> from scipy.stats.contingency import odds_ratio
420 >>> res = odds_ratio([[7, 15], [58, 472]])
421 >>> res.statistic
422 3.7836687705553493
424 For this sample, the odds of getting the disease for those who have
425 been exposed to the chemical are almost 3.8 times that of those who
426 have not been exposed.
428 We can compute the 95% confidence interval for the odds ratio:
430 >>> res.confidence_interval(confidence_level=0.95)
431 ConfidenceInterval(low=1.2514829132266785, high=10.363493716701269)
433 The 95% confidence interval for the conditional odds ratio is
434 approximately (1.25, 10.4).
435 """
436 if kind not in ['conditional', 'sample']:
437 raise ValueError("`kind` must be 'conditional' or 'sample'.")
439 c = np.asarray(table)
441 if c.shape != (2, 2):
442 raise ValueError(f"Invalid shape {c.shape}. The input `table` must be "
443 "of shape (2, 2).")
445 if not np.issubdtype(c.dtype, np.integer):
446 raise ValueError("`table` must be an array of integers, but got "
447 f"type {c.dtype}")
448 c = c.astype(np.int64)
450 if np.any(c < 0):
451 raise ValueError("All values in `table` must be nonnegative.")
453 if 0 in c.sum(axis=0) or 0 in c.sum(axis=1):
454 # If both values in a row or column are zero, the p-value is NaN and
455 # the odds ratio is NaN.
456 result = OddsRatioResult(_table=c, _kind=kind, statistic=np.nan)
457 return result
459 if kind == 'sample':
460 oddsratio = _sample_odds_ratio(c)
461 else: # kind is 'conditional'
462 oddsratio = _conditional_oddsratio(c)
464 result = OddsRatioResult(_table=c, _kind=kind, statistic=oddsratio)
465 return result