1"""
2The :mod:`sklearn.utils.class_weight` module includes utilities for handling
3weights based on class labels.
4"""
5
6# Authors: Andreas Mueller
7# Manoj Kumar
8# License: BSD 3 clause
9
10import numpy as np
11from scipy import sparse
12
13from ._param_validation import StrOptions, validate_params
14
15
@validate_params(
    {
        "class_weight": [dict, StrOptions({"balanced"}), None],
        "classes": [np.ndarray],
        "y": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def compute_class_weight(class_weight, *, classes, y):
    """Estimate class weights for unbalanced datasets.

    Parameters
    ----------
    class_weight : dict, "balanced" or None
        If "balanced", class weights will be given by
        `n_samples / (n_classes * np.bincount(y))`.
        If a dictionary is given, keys are classes and values are corresponding class
        weights.
        If `None` is given, the class weights will be uniform.

    classes : ndarray
        Array of the classes occurring in the data, as given by
        `np.unique(y_org)` with `y_org` the original class labels.

    y : array-like of shape (n_samples,)
        Array of original class labels per sample.

    Returns
    -------
    class_weight_vect : ndarray of shape (n_classes,)
        Array with `class_weight_vect[i]` the weight for i-th class.

    References
    ----------
    The "balanced" heuristic is inspired by
    Logistic Regression in Rare Events Data, King, Zeng, 2001.
    """
    # Imported locally to break a circular import with sklearn.preprocessing.
    from ..preprocessing import LabelEncoder

    if set(y) - set(classes):
        raise ValueError("classes should include all valid labels that can be in y")

    if class_weight == "balanced":
        # Weight every class inversely proportional to its frequency in y.
        encoder = LabelEncoder()
        encoded_y = encoder.fit_transform(y)
        if not all(np.isin(classes, encoder.classes_)):
            raise ValueError("classes should have valid labels that are in y")

        counts = np.bincount(encoded_y).astype(np.float64)
        balanced_weights = len(y) / (len(encoder.classes_) * counts)
        weight = balanced_weights[encoder.transform(classes)]
    elif class_weight is None or len(class_weight) == 0:
        # No weighting requested: every class gets weight one.
        weight = np.ones(classes.shape[0], dtype=np.float64, order="C")
    else:
        # User-provided mapping {class_label: weight}; classes absent from the
        # mapping keep the default weight of one.
        weight = np.ones(classes.shape[0], dtype=np.float64, order="C")
        absent_classes = []
        for idx, label in enumerate(classes):
            try:
                weight[idx] = class_weight[label]
            except KeyError:
                absent_classes.append(label)

        # Raise only when some classes are unweighted AND the mapping does not
        # exactly cover the weighted ones (i.e. it contains stray keys).
        n_covered = len(classes) - len(absent_classes)
        if absent_classes and n_covered != len(class_weight):
            readable_absent = np.array(absent_classes).tolist()
            raise ValueError(
                f"The classes, {readable_absent}, are not in class_weight"
            )

    return weight
89
90
@validate_params(
    {
        "class_weight": [dict, list, StrOptions({"balanced"}), None],
        "y": ["array-like", "sparse matrix"],
        "indices": ["array-like", None],
    },
    prefer_skip_nested_validation=True,
)
def compute_sample_weight(class_weight, y, *, indices=None):
    """Estimate sample weights by class for unbalanced datasets.

    Parameters
    ----------
    class_weight : dict, list of dicts, "balanced", or None
        Weights associated with classes in the form `{class_label: weight}`.
        If not given, all classes are supposed to have weight one. For
        multi-output problems, a list of dicts can be provided in the same
        order as the columns of y.

        Note that for multioutput (including multilabel) weights should be
        defined for each class of every column in its own dict. For example,
        for four-class multilabel classification weights should be
        `[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}]` instead of
        `[{1:1}, {2:5}, {3:1}, {4:1}]`.

        The `"balanced"` mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data:
        `n_samples / (n_classes * np.bincount(y))`.

        For multi-output, the weights of each column of y will be multiplied.

    y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)
        Array of original class labels per sample.

    indices : array-like of shape (n_subsample,), default=None
        Array of indices to be used in a subsample. Can be of length less than
        `n_samples` in the case of a subsample, or equal to `n_samples` in the
        case of a bootstrap subsample with repeated indices. If `None`, the
        sample weight will be calculated over the full sample. Only `"balanced"`
        is supported for `class_weight` if this is provided.

    Returns
    -------
    sample_weight_vect : ndarray of shape (n_samples,)
        Array with sample weights as applied to the original `y`.
    """

    # Normalize y to 2D; sparse matrices are inherently 2D already.
    if not sparse.issparse(y):
        y = np.atleast_1d(y)
        if y.ndim == 1:
            y = y.reshape(-1, 1)
    n_outputs = y.shape[1]

    # Validate the class_weight / indices / n_outputs combination up front.
    if indices is not None and class_weight != "balanced":
        raise ValueError(
            "The only valid class_weight for subsampling is 'balanced'. "
            f"Given {class_weight}."
        )
    elif n_outputs > 1:
        if class_weight is None or isinstance(class_weight, dict):
            raise ValueError(
                "For multi-output, class_weight should be a list of dicts, or the "
                "string 'balanced'."
            )
        elif isinstance(class_weight, list) and len(class_weight) != n_outputs:
            raise ValueError(
                "For multi-output, number of elements in class_weight should match "
                f"number of outputs. Got {len(class_weight)} element(s) while having "
                f"{n_outputs} outputs."
            )

    per_output_weights = []
    for output_idx in range(n_outputs):
        if sparse.issparse(y):
            # Densify one column at a time to keep memory usage bounded.
            column = y[:, [output_idx]].toarray().ravel()
        else:
            column = y[:, output_idx]
        observed_classes = np.unique(column)
        missing_from_subsample = None

        # Pick the weighting spec for this output: "balanced" and single-output
        # specs apply as-is; otherwise class_weight is a per-output list.
        if n_outputs == 1 or class_weight == "balanced":
            column_class_weight = class_weight
        else:
            column_class_weight = class_weight[output_idx]

        if indices is None:
            column_weights = compute_class_weight(
                column_class_weight, classes=observed_classes, y=column
            )
        else:
            # Compute weights on the subsample, then expand them to cover all
            # classes of the full data — some may be absent from the subsample.
            subsample = column[indices]
            subsample_classes = np.unique(subsample)

            column_weights = np.take(
                compute_class_weight(
                    column_class_weight, classes=subsample_classes, y=subsample
                ),
                np.searchsorted(subsample_classes, observed_classes),
                mode="clip",
            )

            missing_from_subsample = set(observed_classes) - set(subsample_classes)

        # Expand the per-class weights to one weight per sample.
        column_weights = column_weights[np.searchsorted(observed_classes, column)]

        if missing_from_subsample:
            # Classes absent from the subsample contribute zero weight.
            column_weights[np.isin(column, list(missing_from_subsample))] = 0.0

        per_output_weights.append(column_weights)

    # Multi-output sample weights are the product over outputs.
    return np.prod(per_output_weights, axis=0, dtype=np.float64)