Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scikit_learn-1.4.dev0-py3.8-linux-x86_64.egg/sklearn/utils/class_weight.py: 11%

64 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1""" 

2The :mod:`sklearn.utils.class_weight` module includes utilities for handling 

3weights based on class labels. 

4""" 

5 

6# Authors: Andreas Mueller 

7# Manoj Kumar 

8# License: BSD 3 clause 

9 

10import numpy as np 

11from scipy import sparse 

12 

13from ._param_validation import StrOptions, validate_params 

14 

15 

@validate_params(
    {
        "class_weight": [dict, StrOptions({"balanced"}), None],
        "classes": [np.ndarray],
        "y": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def compute_class_weight(class_weight, *, classes, y):
    """Estimate class weights for unbalanced datasets.

    Parameters
    ----------
    class_weight : dict, "balanced" or None
        If "balanced", class weights will be given by
        `n_samples / (n_classes * np.bincount(y))`.
        If a dictionary is given, keys are classes and values are corresponding class
        weights.
        If `None` is given, the class weights will be uniform.

    classes : ndarray
        Array of the classes occurring in the data, as given by
        `np.unique(y_org)` with `y_org` the original class labels.

    y : array-like of shape (n_samples,)
        Array of original class labels per sample.

    Returns
    -------
    class_weight_vect : ndarray of shape (n_classes,)
        Array with `class_weight_vect[i]` the weight for i-th class.

    References
    ----------
    The "balanced" heuristic is inspired by
    Logistic Regression in Rare Events Data, King, Zen, 2001.
    """
    # Deferred import: importing LabelEncoder at module level would create a
    # circular import with sklearn.preprocessing.
    from ..preprocessing import LabelEncoder

    if set(y) - set(classes):
        raise ValueError("classes should include all valid labels that can be in y")

    if class_weight is None or len(class_weight) == 0:
        # No weighting requested: every class gets weight 1.
        return np.ones(classes.shape[0], dtype=np.float64, order="C")

    if class_weight == "balanced":
        # Weight each class by the inverse of its frequency in y.
        encoder = LabelEncoder()
        encoded_y = encoder.fit_transform(y)
        if not all(np.isin(classes, encoder.classes_)):
            raise ValueError("classes should have valid labels that are in y")

        # n_samples / (n_classes * count_per_class)
        inv_freq = len(y) / (
            len(encoder.classes_) * np.bincount(encoded_y).astype(np.float64)
        )
        return inv_freq[encoder.transform(classes)]

    # User-supplied {class_label: weight} mapping; classes absent from the
    # mapping keep the default weight of 1.
    weight = np.ones(classes.shape[0], dtype=np.float64, order="C")
    missing = []
    for idx, label in enumerate(classes):
        if label in class_weight:
            weight[idx] = class_weight[label]
        else:
            missing.append(label)

    n_weighted = len(classes) - len(missing)
    if missing and n_weighted != len(class_weight):
        # Only an error when the mapping also contains labels not in `classes`,
        # i.e. the user likely misspelled or mismatched the class labels.
        missing_repr = np.array(missing).tolist()
        raise ValueError(f"The classes, {missing_repr}, are not in class_weight")

    return weight

89 

90 

@validate_params(
    {
        "class_weight": [dict, list, StrOptions({"balanced"}), None],
        "y": ["array-like", "sparse matrix"],
        "indices": ["array-like", None],
    },
    prefer_skip_nested_validation=True,
)
def compute_sample_weight(class_weight, y, *, indices=None):
    """Estimate sample weights by class for unbalanced datasets.

    Parameters
    ----------
    class_weight : dict, list of dicts, "balanced", or None
        Weights associated with classes in the form `{class_label: weight}`.
        If not given, all classes are supposed to have weight one. For
        multi-output problems, a list of dicts can be provided in the same
        order as the columns of y.

        Note that for multioutput (including multilabel) weights should be
        defined for each class of every column in its own dict. For example,
        for four-class multilabel classification weights should be
        `[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}]` instead of
        `[{1:1}, {2:5}, {3:1}, {4:1}]`.

        The `"balanced"` mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data:
        `n_samples / (n_classes * np.bincount(y))`.

        For multi-output, the weights of each column of y will be multiplied.

    y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)
        Array of original class labels per sample.

    indices : array-like of shape (n_subsample,), default=None
        Array of indices to be used in a subsample. Can be of length less than
        `n_samples` in the case of a subsample, or equal to `n_samples` in the
        case of a bootstrap subsample with repeated indices. If `None`, the
        sample weight will be calculated over the full sample. Only `"balanced"`
        is supported for `class_weight` if this is provided.

    Returns
    -------
    sample_weight_vect : ndarray of shape (n_samples,)
        Array with sample weights as applied to the original `y`.
    """
    # Sparse input is already 2D; dense input is coerced to
    # shape (n_samples, n_outputs).
    if not sparse.issparse(y):
        y = np.atleast_1d(y)
        if y.ndim == 1:
            y = np.reshape(y, (-1, 1))
    n_outputs = y.shape[1]

    if indices is not None and class_weight != "balanced":
        raise ValueError(
            "The only valid class_weight for subsampling is 'balanced'. "
            f"Given {class_weight}."
        )
    elif n_outputs > 1:
        if class_weight is None or isinstance(class_weight, dict):
            raise ValueError(
                "For multi-output, class_weight should be a list of dicts, or the "
                "string 'balanced'."
            )
        elif isinstance(class_weight, list) and len(class_weight) != n_outputs:
            raise ValueError(
                "For multi-output, number of elements in class_weight should match "
                f"number of outputs. Got {len(class_weight)} element(s) while having "
                f"{n_outputs} outputs."
            )

    per_output_weights = []
    for output_idx in range(n_outputs):
        if sparse.issparse(y):
            # Densifying a single column at a time keeps memory usage bounded.
            column = y[:, [output_idx]].toarray().flatten()
        else:
            column = y[:, output_idx]
        column_classes = np.unique(column)

        if class_weight == "balanced" or n_outputs == 1:
            output_class_weight = class_weight
        else:
            output_class_weight = class_weight[output_idx]

        absent_classes = None
        if indices is None:
            class_weights = compute_class_weight(
                output_class_weight, classes=column_classes, y=column
            )
        else:
            # Compute weights on the subsample, then map them back onto every
            # class present in the full column: some classes may be missing
            # from the subsample.
            subsample = column[indices]
            subsample_classes = np.unique(subsample)

            class_weights = np.take(
                compute_class_weight(
                    output_class_weight, classes=subsample_classes, y=subsample
                ),
                np.searchsorted(subsample_classes, column_classes),
                mode="clip",
            )
            absent_classes = set(column_classes) - set(subsample_classes)

        # Expand per-class weights to one weight per sample.
        sample_weights = class_weights[np.searchsorted(column_classes, column)]

        if absent_classes:
            # Classes absent from the subsample contribute zero weight.
            sample_weights[np.isin(column, list(absent_classes))] = 0.0

        per_output_weights.append(sample_weights)

    # Multi-output: per-sample weights are the product across outputs.
    return np.prod(per_output_weights, axis=0, dtype=np.float64)