Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scikit_learn-1.4.dev0-py3.8-linux-x86_64.egg/sklearn/preprocessing/_function_transformer.py: 28%

81 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1import warnings 

2 

3import numpy as np 

4 

5from ..base import BaseEstimator, TransformerMixin, _fit_context 

6from ..utils._param_validation import StrOptions 

7from ..utils._set_output import _get_output_config 

8from ..utils.metaestimators import available_if 

9from ..utils.validation import ( 

10 _allclose_dense_sparse, 

11 _check_feature_names_in, 

12 _is_pandas_df, 

13 check_array, 

14) 

15 

16 

17def _identity(X): 

18 """The identity function.""" 

19 return X 

20 

21 

class FunctionTransformer(TransformerMixin, BaseEstimator):
    """Constructs a transformer from an arbitrary callable.

    A FunctionTransformer forwards its X (and optionally y) arguments to a
    user-defined function or function object and returns the result of this
    function. This is useful for stateless transformations such as taking the
    log of frequencies, doing custom scaling, etc.

    Note: If a lambda is used as the function, then the resulting
    transformer will not be pickleable.

    .. versionadded:: 0.17

    Read more in the :ref:`User Guide <function_transformer>`.

    Parameters
    ----------
    func : callable, default=None
        The callable to use for the transformation. This will be passed
        the same arguments as transform, with args and kwargs forwarded.
        If func is None, then func will be the identity function.

    inverse_func : callable, default=None
        The callable to use for the inverse transformation. This will be
        passed the same arguments as inverse transform, with args and
        kwargs forwarded. If inverse_func is None, then inverse_func
        will be the identity function.

    validate : bool, default=False
        Indicate that the input X array should be checked before calling
        ``func``. The possibilities are:

        - If False, there is no input validation.
        - If True, then X will be converted to a 2-dimensional NumPy array or
          sparse matrix. If the conversion is not possible an exception is
          raised.

        .. versionchanged:: 0.22
           The default of ``validate`` changed from True to False.

    accept_sparse : bool, default=False
        Indicate that func accepts a sparse matrix as input. If validate is
        False, this has no effect. Otherwise, if accept_sparse is false,
        sparse matrix inputs will cause an exception to be raised.

    check_inverse : bool, default=True
        Whether to check that ``func`` followed by ``inverse_func`` leads to
        the original inputs. It can be used for a sanity check, raising a
        warning when the condition is not fulfilled.

        .. versionadded:: 0.20

    feature_names_out : callable, 'one-to-one' or None, default=None
        Determines the list of feature names that will be returned by the
        `get_feature_names_out` method. If it is 'one-to-one', then the output
        feature names will be equal to the input feature names. If it is a
        callable, then it must take two positional arguments: this
        `FunctionTransformer` (`self`) and an array-like of input feature names
        (`input_features`). It must return an array-like of output feature
        names. The `get_feature_names_out` method is only defined if
        `feature_names_out` is not None.

        See ``get_feature_names_out`` for more details.

        .. versionadded:: 1.1

    kw_args : dict, default=None
        Dictionary of additional keyword arguments to pass to func.

        .. versionadded:: 0.18

    inv_kw_args : dict, default=None
        Dictionary of additional keyword arguments to pass to inverse_func.

        .. versionadded:: 0.18

    Attributes
    ----------
    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X` has feature
        names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    MaxAbsScaler : Scale each feature by its maximum absolute value.
    StandardScaler : Standardize features by removing the mean and
        scaling to unit variance.
    LabelBinarizer : Binarize labels in a one-vs-all fashion.
    MultiLabelBinarizer : Transform between iterable of iterables
        and a multilabel format.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import FunctionTransformer
    >>> transformer = FunctionTransformer(np.log1p)
    >>> X = np.array([[0, 1], [2, 3]])
    >>> transformer.transform(X)
    array([[0.       , 0.6931...],
           [1.0986..., 1.3862...]])
    """

    _parameter_constraints: dict = {
        "func": [callable, None],
        "inverse_func": [callable, None],
        "validate": ["boolean"],
        "accept_sparse": ["boolean"],
        "check_inverse": ["boolean"],
        "feature_names_out": [callable, StrOptions({"one-to-one"}), None],
        "kw_args": [dict, None],
        "inv_kw_args": [dict, None],
    }

    def __init__(
        self,
        func=None,
        inverse_func=None,
        *,
        validate=False,
        accept_sparse=False,
        check_inverse=True,
        feature_names_out=None,
        kw_args=None,
        inv_kw_args=None,
    ):
        self.func = func
        self.inverse_func = inverse_func
        self.validate = validate
        self.accept_sparse = accept_sparse
        self.check_inverse = check_inverse
        self.feature_names_out = feature_names_out
        self.kw_args = kw_args
        self.inv_kw_args = inv_kw_args

    def _check_input(self, X, *, reset):
        """Validate `X` when `validate=True`, otherwise pass it through.

        Parameters
        ----------
        X : object
            Input handed to `fit` or `transform`.

        reset : bool
            Forwarded to the validation helpers. When `validate=False` the
            feature-count and feature-name helpers are only called if
            `reset` is True.

        Returns
        -------
        X : object
            The result of `_validate_data` when `validate=True`, else the
            input itself.
        """
        if self.validate:
            return self._validate_data(X, accept_sparse=self.accept_sparse, reset=reset)
        elif reset:
            # Set feature_names_in_ and n_features_in_ even if validate=False
            # We run this only when reset==True to store the attributes but not
            # validate them, because validate=False
            self._check_n_features(X, reset=reset)
            self._check_feature_names(X, reset=reset)
        return X

    def _check_inverse_transform(self, X):
        """Check that func and inverse_func are the inverse.

        The round trip is evaluated on a strided sub-sample of `X` (every
        ``max(1, n_samples // 100)``-th row) rather than on the full input.

        Parameters
        ----------
        X : object exposing `shape` and supporting slicing
            Data on which the round trip is evaluated.

        Raises
        ------
        ValueError
            If at least one dtype of `X` is not a sub-dtype of `np.number`.

        Warns
        -----
        UserWarning
            If the round-tripped sub-sample is not close to the original one.
        """
        idx_selected = slice(None, None, max(1, X.shape[0] // 100))
        X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))

        if hasattr(X, "dtype"):
            dtypes = [X.dtype]
        elif hasattr(X, "dtypes"):
            # Dataframes can have multiple dtypes
            dtypes = X.dtypes
        else:
            # Neither attribute is available: infer the dtype from an array
            # view of the sub-sample instead of leaving `dtypes` unbound
            # (which previously surfaced as an UnboundLocalError below).
            dtypes = [np.asarray(X[idx_selected]).dtype]

        if not all(np.issubdtype(d, np.number) for d in dtypes):
            raise ValueError(
                "'check_inverse' is only supported when all the elements in `X` is"
                " numerical."
            )

        if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
            warnings.warn(
                (
                    "The provided functions are not strictly"
                    " inverse of each other. If you are sure you"
                    " want to proceed regardless, set"
                    " 'check_inverse=False'."
                ),
                UserWarning,
            )

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit transformer by checking X.

        If ``validate`` is ``True``, ``X`` will be checked.

        Parameters
        ----------
        X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
                if `validate=True` else any object that `func` can handle
            Input array.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            FunctionTransformer class instance.
        """
        X = self._check_input(X, reset=True)
        # The round-trip check only makes sense when both callables are
        # user-provided.
        if self.check_inverse and not (self.func is None or self.inverse_func is None):
            self._check_inverse_transform(X)
        return self

    def transform(self, X):
        """Transform X using the forward function.

        Parameters
        ----------
        X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
                if `validate=True` else any object that `func` can handle
            Input array.

        Returns
        -------
        X_out : array-like, shape (n_samples, n_features)
            Transformed input.

        Raises
        ------
        ValueError
            If the output of `func` has a `columns` attribute,
            `feature_names_out` is not None and the column names differ from
            the names returned by `get_feature_names_out`.
        """
        X = self._check_input(X, reset=False)
        out = self._transform(X, func=self.func, kw_args=self.kw_args)

        if hasattr(out, "columns") and self.feature_names_out is not None:
            # check the consistency between the column names of the output and the
            # one generated by `get_feature_names_out`; both lists are computed
            # once and reused in the error message.
            out_columns = list(out.columns)
            expected_names = list(self.get_feature_names_out())
            if out_columns != expected_names:
                raise ValueError(
                    "The output generated by `func` have different column names than "
                    "the one generated by the method `get_feature_names_out`. "
                    f"Got output with columns names: {out_columns} and "
                    "`get_feature_names_out` returned: "
                    f"{expected_names}. "
                    "This can be fixed in different manners depending on your use case:"
                    "\n(i) If `func` returns a container with column names, make sure "
                    "they are consistent with the output of `get_feature_names_out`.\n"
                    "(ii) If `func` is a NumPy `ufunc`, then forcing `validate=True` "
                    "could be considered to internally convert the input container to "
                    "a NumPy array before calling the `ufunc`.\n"
                    "(iii) The column names can be overriden by setting "
                    "`set_output(transform='pandas')` such that the column names are "
                    "set to the names provided by `get_feature_names_out`."
                )

        output_config = _get_output_config("transform", self)["dense"]
        if (
            output_config == "pandas"
            and self.feature_names_out is None
            and not _is_pandas_df(out)
        ):
            warnings.warn(
                "When `set_output` is configured to be 'pandas', `func` should return "
                "a DataFrame to follow the `set_output` API or `feature_names_out` "
                "should be defined.",
                UserWarning,
            )
        return out

    def inverse_transform(self, X):
        """Transform X using the inverse function.

        Parameters
        ----------
        X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
                if `validate=True` else any object that `inverse_func` can handle
            Input array.

        Returns
        -------
        X_out : array-like, shape (n_samples, n_features)
            Transformed input.
        """
        if self.validate:
            X = check_array(X, accept_sparse=self.accept_sparse)
        return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args)

    @available_if(lambda self: self.feature_names_out is not None)
    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        This method is only defined if `feature_names_out` is not None.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input feature names.

            - If `input_features` is None, then `feature_names_in_` is
              used as the input feature names. If `feature_names_in_` is not
              defined, then names are generated:
              `[x0, x1, ..., x(n_features_in_ - 1)]`.
            - If `input_features` is array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.

            - If `feature_names_out` is 'one-to-one', the input feature names
              are returned (see `input_features` above). This requires
              `feature_names_in_` and/or `n_features_in_` to be defined, which
              is done automatically if `validate=True`. Alternatively, you can
              set them in `func`.
            - If `feature_names_out` is a callable, then it is called with two
              arguments, `self` and `input_features`, and its return value is
              returned by this method.

        Raises
        ------
        ValueError
            If `feature_names_out` is neither 'one-to-one' nor a callable.
        """
        if hasattr(self, "n_features_in_") or input_features is not None:
            input_features = _check_feature_names_in(self, input_features)
        if self.feature_names_out == "one-to-one":
            names_out = input_features
        elif callable(self.feature_names_out):
            names_out = self.feature_names_out(self, input_features)
        else:
            raise ValueError(
                f"feature_names_out={self.feature_names_out!r} is invalid. "
                'It must either be "one-to-one" or a callable with two '
                "arguments: the function transformer and an array-like of "
                "input feature names. The callable must return an array-like "
                "of output feature names."
            )
        return np.asarray(names_out, dtype=object)

    def _transform(self, X, func=None, kw_args=None):
        """Apply `func` (identity when None) to `X` with optional keyword args.

        Parameters
        ----------
        X : object
            Input forwarded as the first positional argument of `func`.

        func : callable, default=None
            Callable to apply; `_identity` is used when it is None.

        kw_args : dict, default=None
            Keyword arguments forwarded to `func`; a falsy value means no
            keyword arguments.

        Returns
        -------
        out : object
            Whatever `func` returns.
        """
        if func is None:
            func = _identity

        return func(X, **(kw_args if kw_args else {}))

    def __sklearn_is_fitted__(self):
        """Return True since FunctionTransformer is stateless."""
        return True

    def _more_tags(self):
        """Return the estimator tags specific to this transformer."""
        return {"no_validation": not self.validate, "stateless": True}

    def set_output(self, *, transform=None):
        """Set output container.

        See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
        for an example on how to use the API.

        Parameters
        ----------
        transform : {"default", "pandas", "polars"}, default=None
            Configure output of `transform` and `fit_transform`.

            - `"default"`: Default output format of a transformer
            - `"pandas"`: DataFrame output
            - `"polars"`: Polars output
            - `None`: Transform configuration is unchanged

            .. versionadded:: 1.4
                `"polars"` option was added.

        Returns
        -------
        self : estimator instance
            Estimator instance.
        """
        if not hasattr(self, "_sklearn_output_config"):
            self._sklearn_output_config = {}

        self._sklearn_output_config["transform"] = transform
        return self

383 

384 self._sklearn_output_config["transform"] = transform 

385 return self