1import warnings
2
3import numpy as np
4
5from ..base import BaseEstimator, TransformerMixin, _fit_context
6from ..utils._param_validation import StrOptions
7from ..utils._set_output import _get_output_config
8from ..utils.metaestimators import available_if
9from ..utils.validation import (
10 _allclose_dense_sparse,
11 _check_feature_names_in,
12 _is_pandas_df,
13 check_array,
14)
15
16
17def _identity(X):
18 """The identity function."""
19 return X
20
21
class FunctionTransformer(TransformerMixin, BaseEstimator):
    """Constructs a transformer from an arbitrary callable.

    A FunctionTransformer forwards its X argument to a user-defined function
    or function object and returns the result of this function. This is
    useful for stateless transformations such as taking the log of
    frequencies, doing custom scaling, etc.

    Note: If a lambda is used as the function, then the resulting
    transformer will not be pickleable.

    .. versionadded:: 0.17

    Read more in the :ref:`User Guide <function_transformer>`.

    Parameters
    ----------
    func : callable, default=None
        The callable to use for the transformation. This will be passed
        the same arguments as transform, with args and kwargs forwarded.
        If func is None, then func will be the identity function.

    inverse_func : callable, default=None
        The callable to use for the inverse transformation. This will be
        passed the same arguments as inverse transform, with args and
        kwargs forwarded. If inverse_func is None, then inverse_func
        will be the identity function.

    validate : bool, default=False
        Indicate that the input X array should be checked before calling
        ``func``. The possibilities are:

        - If False, there is no input validation.
        - If True, then X will be converted to a 2-dimensional NumPy array or
          sparse matrix. If the conversion is not possible an exception is
          raised.

        .. versionchanged:: 0.22
           The default of ``validate`` changed from True to False.

    accept_sparse : bool, default=False
        Indicate that func accepts a sparse matrix as input. If validate is
        False, this has no effect. Otherwise, if accept_sparse is false,
        sparse matrix inputs will cause an exception to be raised.

    check_inverse : bool, default=True
        Whether to check that ``func`` followed by ``inverse_func`` leads to
        the original inputs. It can be used for a sanity check, raising a
        warning when the condition is not fulfilled. The check is only run
        during ``fit`` and only when both ``func`` and ``inverse_func`` are
        provided.

        .. versionadded:: 0.20

    feature_names_out : callable, 'one-to-one' or None, default=None
        Determines the list of feature names that will be returned by the
        `get_feature_names_out` method. If it is 'one-to-one', then the output
        feature names will be equal to the input feature names. If it is a
        callable, then it must take two positional arguments: this
        `FunctionTransformer` (`self`) and an array-like of input feature names
        (`input_features`). It must return an array-like of output feature
        names. The `get_feature_names_out` method is only defined if
        `feature_names_out` is not None.

        See ``get_feature_names_out`` for more details.

        .. versionadded:: 1.1

    kw_args : dict, default=None
        Dictionary of additional keyword arguments to pass to func.

        .. versionadded:: 0.18

    inv_kw_args : dict, default=None
        Dictionary of additional keyword arguments to pass to inverse_func.

        .. versionadded:: 0.18

    Attributes
    ----------
    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X` has feature
        names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    MaxAbsScaler : Scale each feature by its maximum absolute value.
    StandardScaler : Standardize features by removing the mean and
        scaling to unit variance.
    LabelBinarizer : Binarize labels in a one-vs-all fashion.
    MultiLabelBinarizer : Transform between iterable of iterables
        and a multilabel format.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import FunctionTransformer
    >>> transformer = FunctionTransformer(np.log1p)
    >>> X = np.array([[0, 1], [2, 3]])
    >>> transformer.transform(X)
    array([[0.       , 0.6931...],
           [1.0986..., 1.3862...]])
    """

    _parameter_constraints: dict = {
        "func": [callable, None],
        "inverse_func": [callable, None],
        "validate": ["boolean"],
        "accept_sparse": ["boolean"],
        "check_inverse": ["boolean"],
        "feature_names_out": [callable, StrOptions({"one-to-one"}), None],
        "kw_args": [dict, None],
        "inv_kw_args": [dict, None],
    }

    def __init__(
        self,
        func=None,
        inverse_func=None,
        *,
        validate=False,
        accept_sparse=False,
        check_inverse=True,
        feature_names_out=None,
        kw_args=None,
        inv_kw_args=None,
    ):
        self.func = func
        self.inverse_func = inverse_func
        self.validate = validate
        self.accept_sparse = accept_sparse
        self.check_inverse = check_inverse
        self.feature_names_out = feature_names_out
        self.kw_args = kw_args
        self.inv_kw_args = inv_kw_args

    def _check_input(self, X, *, reset):
        """Validate `X` when `validate=True`, otherwise pass it through.

        With `validate=False` and `reset=True`, the feature count and feature
        names are still recorded, but `X` itself is returned untouched.
        """
        if self.validate:
            return self._validate_data(X, accept_sparse=self.accept_sparse, reset=reset)
        elif reset:
            # Set feature_names_in_ and n_features_in_ even if validate=False
            # We run this only when reset==True to store the attributes but not
            # validate them, because validate=False
            self._check_n_features(X, reset=reset)
            self._check_feature_names(X, reset=reset)
        return X

    def _check_inverse_transform(self, X):
        """Check that func and inverse_func are the inverse.

        The round trip is evaluated on a strided subsample of the rows of `X`
        (step ``max(1, n_samples // 100)``). A `UserWarning` is emitted when
        the round trip does not reproduce the subsample; a `ValueError` is
        raised when `X` reports a non-numerical dtype.
        """
        idx_selected = slice(None, None, max(1, X.shape[0] // 100))
        X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))

        if hasattr(X, "dtype"):
            dtypes = [X.dtype]
        elif hasattr(X, "dtypes"):
            # Dataframes can have multiple dtypes
            dtypes = X.dtypes
        else:
            # The container exposes no dtype information. Skip the dtype
            # screening (instead of failing on an unbound local name) and let
            # the closeness comparison below decide.
            dtypes = []

        if not all(np.issubdtype(d, np.number) for d in dtypes):
            raise ValueError(
                "'check_inverse' is only supported when all the elements in `X` is"
                " numerical."
            )

        if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
            warnings.warn(
                (
                    "The provided functions are not strictly"
                    " inverse of each other. If you are sure you"
                    " want to proceed regardless, set"
                    " 'check_inverse=False'."
                ),
                UserWarning,
            )

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit transformer by checking X.

        If ``validate`` is ``True``, ``X`` will be checked.

        Parameters
        ----------
        X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
                if `validate=True` else any object that `func` can handle
            Input array.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            FunctionTransformer class instance.
        """
        X = self._check_input(X, reset=True)
        # The inverse check is only meaningful when both directions are
        # user-provided; a missing callable falls back to the identity.
        if self.check_inverse and not (self.func is None or self.inverse_func is None):
            self._check_inverse_transform(X)
        return self

    def transform(self, X):
        """Transform X using the forward function.

        Parameters
        ----------
        X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
                if `validate=True` else any object that `func` can handle
            Input array.

        Returns
        -------
        X_out : array-like, shape (n_samples, n_features)
            Transformed input.

        Raises
        ------
        ValueError
            If `feature_names_out` is set and the output of `func` has a
            `columns` attribute that differs from `get_feature_names_out()`.
        """
        X = self._check_input(X, reset=False)
        out = self._transform(X, func=self.func, kw_args=self.kw_args)

        if hasattr(out, "columns") and self.feature_names_out is not None:
            # check the consistency between the column names of the output and the
            # one generated by `get_feature_names_out`. Both lists are computed
            # once so that a user-provided callable is not invoked repeatedly.
            output_columns = list(out.columns)
            expected_columns = list(self.get_feature_names_out())
            if output_columns != expected_columns:
                raise ValueError(
                    "The output generated by `func` have different column names than "
                    "the one generated by the method `get_feature_names_out`. "
                    f"Got output with columns names: {output_columns} and "
                    "`get_feature_names_out` returned: "
                    f"{expected_columns}. "
                    "This can be fixed in different manners depending on your use case:"
                    "\n(i) If `func` returns a container with column names, make sure "
                    "they are consistent with the output of `get_feature_names_out`.\n"
                    "(ii) If `func` is a NumPy `ufunc`, then forcing `validate=True` "
                    "could be considered to internally convert the input container to "
                    "a NumPy array before calling the `ufunc`.\n"
                    "(iii) The column names can be overriden by setting "
                    "`set_output(transform='pandas')` such that the column names are "
                    "set to the names provided by `get_feature_names_out`."
                )

        output_config = _get_output_config("transform", self)["dense"]
        if (
            output_config == "pandas"
            and self.feature_names_out is None
            and not _is_pandas_df(out)
        ):
            warnings.warn(
                "When `set_output` is configured to be 'pandas', `func` should return "
                "a DataFrame to follow the `set_output` API or `feature_names_out` "
                "should be defined.",
                UserWarning,
            )
        return out

    def inverse_transform(self, X):
        """Transform X using the inverse function.

        Parameters
        ----------
        X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
                if `validate=True` else any object that `inverse_func` can handle
            Input array.

        Returns
        -------
        X_out : array-like, shape (n_samples, n_features)
            Transformed input.
        """
        if self.validate:
            X = check_array(X, accept_sparse=self.accept_sparse)
        return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args)

    @available_if(lambda self: self.feature_names_out is not None)
    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        This method is only defined if `feature_names_out` is not None.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input feature names.

            - If `input_features` is None, then `feature_names_in_` is
              used as the input feature names. If `feature_names_in_` is not
              defined, then names are generated:
              `[x0, x1, ..., x(n_features_in_ - 1)]`.
            - If `input_features` is array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.

            - If `feature_names_out` is 'one-to-one', the input feature names
              are returned (see `input_features` above). This requires
              `feature_names_in_` and/or `n_features_in_` to be defined, which
              is done automatically if `validate=True`. Alternatively, you can
              set them in `func`.
            - If `feature_names_out` is a callable, then it is called with two
              arguments, `self` and `input_features`, and its return value is
              returned by this method.
        """
        # Only normalise the names when there is something to check them
        # against; otherwise `input_features` stays None and is handed to the
        # branches below as-is.
        if hasattr(self, "n_features_in_") or input_features is not None:
            input_features = _check_feature_names_in(self, input_features)
        if self.feature_names_out == "one-to-one":
            names_out = input_features
        elif callable(self.feature_names_out):
            names_out = self.feature_names_out(self, input_features)
        else:
            raise ValueError(
                f"feature_names_out={self.feature_names_out!r} is invalid. "
                'It must either be "one-to-one" or a callable with two '
                "arguments: the function transformer and an array-like of "
                "input feature names. The callable must return an array-like "
                "of output feature names."
            )
        return np.asarray(names_out, dtype=object)

    def _transform(self, X, func=None, kw_args=None):
        """Apply `func` (identity when None) to `X` with optional keyword args."""
        if func is None:
            func = _identity

        return func(X, **(kw_args if kw_args else {}))

    def __sklearn_is_fitted__(self):
        """Return True since FunctionTransformer is stateless."""
        return True

    def _more_tags(self):
        return {"no_validation": not self.validate, "stateless": True}

    def set_output(self, *, transform=None):
        """Set output container.

        See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
        for an example on how to use the API.

        Parameters
        ----------
        transform : {"default", "pandas", "polars"}, default=None
            Configure output of `transform` and `fit_transform`.

            - `"default"`: Default output format of a transformer
            - `"pandas"`: DataFrame output
            - `"polars"`: Polars output
            - `None`: Transform configuration is unchanged

            .. versionadded:: 1.4
                `"polars"` option was added.

        Returns
        -------
        self : estimator instance
            Estimator instance.
        """
        if transform is None:
            # Documented contract: `None` leaves any existing configuration
            # untouched instead of overwriting it with `None`.
            return self

        if not hasattr(self, "_sklearn_output_config"):
            self._sklearn_output_config = {}

        self._sklearn_output_config["transform"] = transform
        return self