Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_crosstab.py: 13%

39 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1import numpy as np 

2from scipy.sparse import coo_matrix 

3from scipy._lib._bunch import _make_tuple_bunch 

4 

5 

# Result type returned by `crosstab`, built by `_make_tuple_bunch` with the
# two fields `elements` and `count` (in that order).
CrosstabResult = _make_tuple_bunch("CrosstabResult", ["elements", "count"])

9 

def crosstab(*args, levels=None, sparse=False):
    """
    Return table of counts for each possible unique combination in ``*args``.

    When ``len(args) > 1``, the array computed by this function is
    often referred to as a *contingency table* [1]_.

    The arguments must be sequences with the same length.  The second return
    value, `count`, is an integer array with ``len(args)`` dimensions.  If
    `levels` is None, the shape of `count` is ``(n0, n1, ...)``, where ``nk``
    is the number of unique elements in ``args[k]``.

    Parameters
    ----------
    *args : sequences
        A sequence of sequences whose unique aligned elements are to be
        counted.  The sequences in args must all be the same length.
    levels : sequence, optional
        If `levels` is given, it must be a sequence that is the same length as
        `args`.  Each element in `levels` is either a sequence or None.  If it
        is a sequence, it gives the values in the corresponding sequence in
        `args` that are to be counted.  If any value in the sequences in `args`
        does not occur in the corresponding sequence in `levels`, that value
        is ignored and not counted in the returned array `count`.  The default
        value of `levels` for ``args[i]`` is ``np.unique(args[i])``
    sparse : bool, optional
        If True, return a sparse matrix.  The matrix will be an instance of
        the `scipy.sparse.coo_matrix` class.  Because SciPy's sparse matrices
        must be 2-d, only two input sequences are allowed when `sparse` is
        True.  Default is False.

    Returns
    -------
    res : CrosstabResult
        An object containing the following attributes:

        elements : tuple of numpy.ndarrays.
            Tuple of length ``len(args)`` containing the arrays of elements
            that are counted in `count`.  These can be interpreted as the
            labels of the corresponding dimensions of `count`. If `levels` was
            given, then if ``levels[i]`` is not None, ``elements[i]`` will
            hold the values given in ``levels[i]``.
        count : numpy.ndarray or scipy.sparse.coo_matrix
            Counts of the unique elements in ``zip(*args)``, stored in an
            array. Also known as a *contingency table* when ``len(args) > 1``.
            The shape is ``(len(elements[0]), len(elements[1]), ...)`` for
            both the dense and the sparse result.

    Raises
    ------
    TypeError
        If no input sequence is given.
    ValueError
        If the input sequences do not all have the same length, if `sparse`
        is True and the number of input sequences is not two, or if `levels`
        is given and ``len(levels)`` differs from the number of input
        sequences.

    See Also
    --------
    numpy.unique

    Notes
    -----
    .. versionadded:: 1.7.0

    References
    ----------
    .. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table

    Examples
    --------
    >>> from scipy.stats.contingency import crosstab

    Given the lists `a` and `x`, create a contingency table that counts the
    frequencies of the corresponding pairs.

    >>> a = ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B']
    >>> x = ['X', 'X', 'X', 'Y', 'Z', 'Z', 'Y', 'Y', 'Z', 'Z']
    >>> res = crosstab(a, x)
    >>> avals, xvals = res.elements
    >>> avals
    array(['A', 'B'], dtype='<U1')
    >>> xvals
    array(['X', 'Y', 'Z'], dtype='<U1')
    >>> res.count
    array([[2, 3, 0],
           [1, 0, 4]])

    So `('A', 'X')` occurs twice, `('A', 'Y')` occurs three times, etc.

    Higher dimensional contingency tables can be created.

    >>> p = [0, 0, 0, 0, 1, 1, 1, 0, 0, 1]
    >>> res = crosstab(a, x, p)
    >>> res.count
    array([[[2, 0],
            [2, 1],
            [0, 0]],
           [[1, 0],
            [0, 0],
            [1, 3]]])
    >>> res.count.shape
    (2, 3, 2)

    The values to be counted can be set by using the `levels` argument.
    It allows the elements of interest in each input sequence to be
    given explicitly instead of finding the unique elements of the sequence.

    For example, suppose one of the arguments is an array containing the
    answers to a survey question, with integer values 1 to 4.  Even if the
    value 1 does not occur in the data, we want an entry for it in the table.

    >>> q1 = [2, 3, 3, 2, 4, 4, 2, 3, 4, 4, 4, 3, 3, 3, 4]  # 1 does not occur.
    >>> q2 = [4, 4, 2, 2, 2, 4, 1, 1, 2, 2, 4, 2, 2, 2, 4]  # 3 does not occur.
    >>> options = [1, 2, 3, 4]
    >>> res = crosstab(q1, q2, levels=(options, options))
    >>> res.count
    array([[0, 0, 0, 0],
           [1, 1, 0, 1],
           [1, 4, 0, 1],
           [0, 3, 0, 3]])

    If `levels` is given, but an element of `levels` is None, the unique values
    of the corresponding argument are used. For example,

    >>> res = crosstab(q1, q2, levels=(None, options))
    >>> res.elements
    [array([2, 3, 4]), [1, 2, 3, 4]]
    >>> res.count
    array([[1, 1, 0, 1],
           [1, 4, 0, 1],
           [0, 3, 0, 3]])

    If we want to ignore the pairs where 4 occurs in ``q2``, we can
    give just the values [1, 2] to `levels`, and the 4 will be ignored:

    >>> res = crosstab(q1, q2, levels=(None, [1, 2]))
    >>> res.elements
    [array([2, 3, 4]), [1, 2]]
    >>> res.count
    array([[1, 1],
           [1, 4],
           [0, 3]])

    Finally, let's repeat the first example, but return a sparse matrix:

    >>> res = crosstab(a, x, sparse=True)
    >>> res.count
    <2x3 sparse matrix of type '<class 'numpy.int64'>'
            with 4 stored elements in COOrdinate format>
    >>> res.count.toarray()
    array([[2, 3, 0],
           [1, 0, 4]])

    """
    nargs = len(args)
    if nargs == 0:
        raise TypeError("At least one input sequence is required.")

    len0 = len(args[0])
    if not all(len(a) == len0 for a in args[1:]):
        raise ValueError("All input sequences must have the same length.")

    if sparse and nargs != 2:
        raise ValueError("When `sparse` is True, only two input sequences "
                         "are allowed.")

    if levels is None:
        # Call np.unique with return_inverse=True on each argument.  The
        # inverse arrays are the per-dimension indices into the count table.
        actual_levels, indices = zip(*[np.unique(a, return_inverse=True)
                                       for a in args])
    else:
        # `levels` is not None...
        if len(levels) != nargs:
            raise ValueError('len(levels) must equal the number of input '
                             'sequences')

        args = [np.asarray(arg) for arg in args]
        # mask[k, j] is True when args[k][j] is one of the counted levels;
        # inv[k, j] is then the position of that value among the levels.
        mask = np.zeros((nargs, len0), dtype=np.bool_)
        inv = np.zeros((nargs, len0), dtype=np.intp)
        actual_levels = []
        for k, (levels_list, arg) in enumerate(zip(levels, args)):
            if levels_list is None:
                levels_list, inv[k, :] = np.unique(arg, return_inverse=True)
                mask[k, :] = True
            else:
                # q[i, j] is True when arg[j] == levels_list[i].
                q = arg == np.asarray(levels_list).reshape(-1, 1)
                mask[k, :] = np.any(q, axis=0)
                qnz = q.T.nonzero()
                inv[k, qnz[0]] = qnz[1]
            actual_levels.append(levels_list)

        # Keep only the positions where every argument holds a counted value.
        mask_all = mask.all(axis=0)
        indices = tuple(inv[:, mask_all])

    shape = [len(u) for u in actual_levels]
    if sparse:
        # Pass the shape explicitly: letting coo_matrix infer it from the
        # indices drops trailing levels that never occur and fails when
        # there is nothing to count.
        count = coo_matrix((np.ones(len(indices[0]), dtype=int),
                            (indices[0], indices[1])),
                           shape=tuple(shape))
        count.sum_duplicates()
    else:
        count = np.zeros(shape, dtype=int)
        # np.add.at accumulates repeated index tuples, unlike `count[...] += 1`.
        np.add.at(count, indices, 1)

    return CrosstabResult(actual_levels, count)