Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_crosstab.py: 13%
39 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
1import numpy as np
2from scipy.sparse import coo_matrix
3from scipy._lib._bunch import _make_tuple_bunch
# Result type returned by `crosstab`: a tuple-like bunch with two fields,
# `elements` (the labels of each table dimension) and `count` (the table),
# which can be accessed by attribute or unpacked like a 2-tuple.
CrosstabResult = _make_tuple_bunch("CrosstabResult", ["elements", "count"])
def crosstab(*args, levels=None, sparse=False):
    """
    Return table of counts for each possible unique combination in ``*args``.

    When ``len(args) > 1``, the array computed by this function is
    often referred to as a *contingency table* [1]_.

    The arguments must be sequences with the same length.  The second return
    value, `count`, is an integer array with ``len(args)`` dimensions.  If
    `levels` is None, the shape of `count` is ``(n0, n1, ...)``, where ``nk``
    is the number of unique elements in ``args[k]``.

    Parameters
    ----------
    *args : sequences
        A sequence of sequences whose unique aligned elements are to be
        counted.  The sequences in args must all be the same length.
    levels : sequence, optional
        If `levels` is given, it must be a sequence that is the same length as
        `args`.  Each element in `levels` is either a sequence or None.  If it
        is a sequence, it gives the values in the corresponding sequence in
        `args` that are to be counted.  If any value in the sequences in `args`
        does not occur in the corresponding sequence in `levels`, that value
        is ignored and not counted in the returned array `count`.  The default
        value of `levels` for ``args[i]`` is ``np.unique(args[i])``
    sparse : bool, optional
        If True, return a sparse matrix.  The matrix will be an instance of
        the `scipy.sparse.coo_matrix` class.  Because SciPy's sparse matrices
        must be 2-d, only two input sequences are allowed when `sparse` is
        True.  The matrix has the same shape as the dense result would have,
        i.e. ``(len(elements[0]), len(elements[1]))``.  Default is False.

    Returns
    -------
    res : CrosstabResult
        An object containing the following attributes:

        elements : tuple of numpy.ndarrays.
            Tuple of length ``len(args)`` containing the arrays of elements
            that are counted in `count`.  These can be interpreted as the
            labels of the corresponding dimensions of `count`. If `levels` was
            given, then if ``levels[i]`` is not None, ``elements[i]`` will
            hold the values given in ``levels[i]``.  (When `levels` is given,
            `elements` is a list rather than a tuple, and the entries taken
            from `levels` are returned as they were passed in.)
        count : numpy.ndarray or scipy.sparse.coo_matrix
            Counts of the unique elements in ``zip(*args)``, stored in an
            array. Also known as a *contingency table* when ``len(args) > 1``.

    Raises
    ------
    TypeError
        If no input sequences are given.
    ValueError
        If the input sequences do not all have the same length, if `sparse`
        is True and the number of input sequences is not two, or if `levels`
        is given and ``len(levels)`` differs from the number of input
        sequences.

    See Also
    --------
    numpy.unique

    Notes
    -----
    .. versionadded:: 1.7.0

    References
    ----------
    .. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table

    Examples
    --------
    >>> from scipy.stats.contingency import crosstab

    Given the lists `a` and `x`, create a contingency table that counts the
    frequencies of the corresponding pairs.

    >>> a = ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B']
    >>> x = ['X', 'X', 'X', 'Y', 'Z', 'Z', 'Y', 'Y', 'Z', 'Z']
    >>> res = crosstab(a, x)
    >>> avals, xvals = res.elements
    >>> avals
    array(['A', 'B'], dtype='<U1')
    >>> xvals
    array(['X', 'Y', 'Z'], dtype='<U1')
    >>> res.count
    array([[2, 3, 0],
           [1, 0, 4]])

    So `('A', 'X')` occurs twice, `('A', 'Y')` occurs three times, etc.

    Higher dimensional contingency tables can be created.

    >>> p = [0, 0, 0, 0, 1, 1, 1, 0, 0, 1]
    >>> res = crosstab(a, x, p)
    >>> res.count
    array([[[2, 0],
            [2, 1],
            [0, 0]],
           [[1, 0],
            [0, 0],
            [1, 3]]])
    >>> res.count.shape
    (2, 3, 2)

    The values to be counted can be set by using the `levels` argument.
    It allows the elements of interest in each input sequence to be
    given explicitly instead of finding the unique elements of the sequence.

    For example, suppose one of the arguments is an array containing the
    answers to a survey question, with integer values 1 to 4.  Even if the
    value 1 does not occur in the data, we want an entry for it in the table.

    >>> q1 = [2, 3, 3, 2, 4, 4, 2, 3, 4, 4, 4, 3, 3, 3, 4]  # 1 does not occur.
    >>> q2 = [4, 4, 2, 2, 2, 4, 1, 1, 2, 2, 4, 2, 2, 2, 4]  # 3 does not occur.
    >>> options = [1, 2, 3, 4]
    >>> res = crosstab(q1, q2, levels=(options, options))
    >>> res.count
    array([[0, 0, 0, 0],
           [1, 1, 0, 1],
           [1, 4, 0, 1],
           [0, 3, 0, 3]])

    If `levels` is given, but an element of `levels` is None, the unique values
    of the corresponding argument are used. For example,

    >>> res = crosstab(q1, q2, levels=(None, options))
    >>> res.elements
    [array([2, 3, 4]), [1, 2, 3, 4]]
    >>> res.count
    array([[1, 1, 0, 1],
           [1, 4, 0, 1],
           [0, 3, 0, 3]])

    If we want to ignore the pairs where 4 occurs in ``q2``, we can
    give just the values [1, 2] to `levels`, and the 4 will be ignored:

    >>> res = crosstab(q1, q2, levels=(None, [1, 2]))
    >>> res.elements
    [array([2, 3, 4]), [1, 2]]
    >>> res.count
    array([[1, 1],
           [1, 4],
           [0, 3]])

    Finally, let's repeat the first example, but return a sparse matrix:

    >>> res = crosstab(a, x, sparse=True)
    >>> res.count
    <2x3 sparse matrix of type '<class 'numpy.int64'>'
            with 4 stored elements in COOrdinate format>
    >>> res.count.toarray()
    array([[2, 3, 0],
           [1, 0, 4]])

    """
    nargs = len(args)
    if nargs == 0:
        raise TypeError("At least one input sequence is required.")

    len0 = len(args[0])
    if not all(len(a) == len0 for a in args[1:]):
        raise ValueError("All input sequences must have the same length.")

    if sparse and nargs != 2:
        raise ValueError("When `sparse` is True, only two input sequences "
                         "are allowed.")

    if levels is None:
        # Call np.unique with return_inverse=True on each argument.  The
        # inverse arrays are exactly the per-dimension table indices of
        # each observation.
        actual_levels, indices = zip(*[np.unique(a, return_inverse=True)
                                       for a in args])
    else:
        # `levels` is not None...
        if len(levels) != nargs:
            raise ValueError('len(levels) must equal the number of input '
                             'sequences')

        args = [np.asarray(arg) for arg in args]
        # mask[k, j] is True when args[k][j] is one of the levels counted
        # for dimension k; inv[k, j] is the index of that level.
        mask = np.zeros((nargs, len0), dtype=np.bool_)
        inv = np.zeros((nargs, len0), dtype=np.intp)
        actual_levels = []
        for k, (levels_list, arg) in enumerate(zip(levels, args)):
            if levels_list is None:
                # No explicit levels for this argument: every value counts.
                levels_list, inv[k, :] = np.unique(arg, return_inverse=True)
                mask[k, :] = True
            else:
                # q[i, j] is True when arg[j] == levels_list[i].
                q = arg == np.asarray(levels_list).reshape(-1, 1)
                mask[k, :] = np.any(q, axis=0)
                # Transposing first makes nonzero() yield
                # (position in arg, position in levels_list) pairs.
                qnz = q.T.nonzero()
                inv[k, qnz[0]] = qnz[1]
            actual_levels.append(levels_list)

        # Keep only the observations whose value is a counted level in
        # every dimension.
        mask_all = mask.all(axis=0)
        indices = tuple(inv[:, mask_all])

    shape = tuple(len(u) for u in actual_levels)
    if sparse:
        # Pass the shape explicitly.  If it were inferred from the largest
        # index present, levels that never occur in the data would be
        # dropped from the end of each axis, and construction would fail
        # outright when no observation is counted.
        count = coo_matrix((np.ones(len(indices[0]), dtype=int),
                            (indices[0], indices[1])),
                           shape=shape)
        count.sum_duplicates()
    else:
        count = np.zeros(shape, dtype=int)
        # np.add.at accumulates repeated index tuples, unlike `count[...] += 1`.
        np.add.at(count, indices, 1)

    return CrosstabResult(actual_levels, count)