1"""
2The :mod:`sklearn.utils.class_weight` module includes utilities for handling
3weights based on class labels.
4"""
5
6# Authors: Andreas Mueller
7# Manoj Kumar
8# License: BSD 3 clause
9
10import numpy as np
11from scipy import sparse
12
13from ._param_validation import StrOptions, validate_params
14
15
@validate_params(
    {
        "class_weight": [dict, StrOptions({"balanced"}), None],
        "classes": [np.ndarray],
        "y": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def compute_class_weight(class_weight, *, classes, y):
    """Estimate class weights for unbalanced datasets.

    Parameters
    ----------
    class_weight : dict, "balanced" or None
        If "balanced", class weights will be given by
        `n_samples / (n_classes * np.bincount(y))`.
        If a dictionary is given, keys are classes and values are corresponding class
        weights.
        If `None` is given, the class weights will be uniform.

    classes : ndarray
        Array of the classes occurring in the data, as given by
        `np.unique(y_org)` with `y_org` the original class labels.

    y : array-like of shape (n_samples,)
        Array of original class labels per sample.

    Returns
    -------
    class_weight_vect : ndarray of shape (n_classes,)
        Array with `class_weight_vect[i]` the weight for i-th class.

    References
    ----------
    The "balanced" heuristic is inspired by
    Logistic Regression in Rare Events Data, King, Zeng, 2001.
    """
    # Imported locally to break a circular import with sklearn.preprocessing.
    from ..preprocessing import LabelEncoder

    if set(y) - set(classes):
        raise ValueError("classes should include all valid labels that can be in y")

    if class_weight == "balanced":
        # Weight every class inversely proportional to its frequency in y.
        encoder = LabelEncoder()
        encoded_y = encoder.fit_transform(y)
        if not all(np.isin(classes, encoder.classes_)):
            raise ValueError("classes should have valid labels that are in y")

        counts = np.bincount(encoded_y).astype(np.float64)
        balanced_weights = len(y) / (len(encoder.classes_) * counts)
        weight = balanced_weights[encoder.transform(classes)]
    elif class_weight is None or len(class_weight) == 0:
        # No weighting requested: every class gets weight one.
        weight = np.ones(classes.shape[0], dtype=np.float64, order="C")
    else:
        # User-provided mapping {class_label: weight}; classes absent from the
        # mapping keep the default weight of one.
        weight = np.ones(classes.shape[0], dtype=np.float64, order="C")
        absent_classes = []
        for idx, label in enumerate(classes):
            try:
                weight[idx] = class_weight[label]
            except KeyError:
                absent_classes.append(label)

        # Raise only when some classes are unweighted AND the mapping does not
        # exactly cover the weighted ones (i.e. it contains stray keys).
        n_covered = len(classes) - len(absent_classes)
        if absent_classes and n_covered != len(class_weight):
            readable_absent = np.array(absent_classes).tolist()
            raise ValueError(
                f"The classes, {readable_absent}, are not in class_weight"
            )

    return weight
89
90
@validate_params(
    {
        "class_weight": [dict, list, StrOptions({"balanced"}), None],
        "y": ["array-like", "sparse matrix"],
        "indices": ["array-like", None],
    },
    prefer_skip_nested_validation=True,
)
def compute_sample_weight(class_weight, y, *, indices=None):
    """Estimate sample weights by class for unbalanced datasets.

    Parameters
    ----------
    class_weight : dict, list of dicts, "balanced", or None
        Weights associated with classes in the form `{class_label: weight}`.
        If not given, all classes are supposed to have weight one. For
        multi-output problems, a list of dicts can be provided in the same
        order as the columns of y.

        Note that for multioutput (including multilabel) weights should be
        defined for each class of every column in its own dict. For example,
        for four-class multilabel classification weights should be
        `[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}]` instead of
        `[{1:1}, {2:5}, {3:1}, {4:1}]`.

        The `"balanced"` mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data:
        `n_samples / (n_classes * np.bincount(y))`.

        For multi-output, the weights of each column of y will be multiplied.

    y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)
        Array of original class labels per sample.

    indices : array-like of shape (n_subsample,), default=None
        Array of indices to be used in a subsample. Can be of length less than
        `n_samples` in the case of a subsample, or equal to `n_samples` in the
        case of a bootstrap subsample with repeated indices. If `None`, the
        sample weight will be calculated over the full sample. Only `"balanced"`
        is supported for `class_weight` if this is provided.

    Returns
    -------
    sample_weight_vect : ndarray of shape (n_samples,)
        Array with sample weights as applied to the original `y`.
    """

    # Normalize y to 2D; sparse matrices are inherently 2D already.
    if not sparse.issparse(y):
        y = np.atleast_1d(y)
        if y.ndim == 1:
            y = y.reshape(-1, 1)
    n_outputs = y.shape[1]

    # Validate the class_weight / indices / n_outputs combination up front.
    if indices is not None and class_weight != "balanced":
        raise ValueError(
            "The only valid class_weight for subsampling is 'balanced'. "
            f"Given {class_weight}."
        )
    elif n_outputs > 1:
        if class_weight is None or isinstance(class_weight, dict):
            raise ValueError(
                "For multi-output, class_weight should be a list of dicts, or the "
                "string 'balanced'."
            )
        elif isinstance(class_weight, list) and len(class_weight) != n_outputs:
            raise ValueError(
                "For multi-output, number of elements in class_weight should match "
                f"number of outputs. Got {len(class_weight)} element(s) while having "
                f"{n_outputs} outputs."
            )

    per_output_weights = []
    for output_idx in range(n_outputs):
        if sparse.issparse(y):
            # Densify one column at a time to keep memory usage bounded.
            column = y[:, [output_idx]].toarray().ravel()
        else:
            column = y[:, output_idx]
        observed_classes = np.unique(column)
        missing_from_subsample = None

        # Pick the weighting spec for this output: "balanced" and single-output
        # specs apply as-is; otherwise class_weight is a per-output list.
        if n_outputs == 1 or class_weight == "balanced":
            column_class_weight = class_weight
        else:
            column_class_weight = class_weight[output_idx]

        if indices is None:
            column_weights = compute_class_weight(
                column_class_weight, classes=observed_classes, y=column
            )
        else:
            # Compute weights on the subsample, then expand them to cover all
            # classes of the full data — some may be absent from the subsample.
            subsample = column[indices]
            subsample_classes = np.unique(subsample)

            column_weights = np.take(
                compute_class_weight(
                    column_class_weight, classes=subsample_classes, y=subsample
                ),
                np.searchsorted(subsample_classes, observed_classes),
                mode="clip",
            )

            missing_from_subsample = set(observed_classes) - set(subsample_classes)

        # Expand the per-class weights to one weight per sample.
        column_weights = column_weights[np.searchsorted(observed_classes, column)]

        if missing_from_subsample:
            # Classes absent from the subsample contribute zero weight.
            column_weights[np.isin(column, list(missing_from_subsample))] = 0.0

        per_output_weights.append(column_weights)

    # Multi-output sample weights are the product over outputs.
    return np.prod(per_output_weights, axis=0, dtype=np.float64)