Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/reshape/encoding.py: 11%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

176 statements  

1from __future__ import annotations 

2 

3from collections import defaultdict 

4from collections.abc import ( 

5 Hashable, 

6 Iterable, 

7) 

8import itertools 

9from typing import ( 

10 TYPE_CHECKING, 

11 cast, 

12) 

13 

14import numpy as np 

15 

16from pandas._libs.sparse import IntIndex 

17 

18from pandas.core.dtypes.common import ( 

19 is_integer_dtype, 

20 is_list_like, 

21 is_object_dtype, 

22 pandas_dtype, 

23) 

24from pandas.core.dtypes.dtypes import ( 

25 ArrowDtype, 

26 CategoricalDtype, 

27) 

28 

29from pandas.core.arrays import SparseArray 

30from pandas.core.arrays.categorical import factorize_from_iterable 

31from pandas.core.arrays.string_ import StringDtype 

32from pandas.core.frame import DataFrame 

33from pandas.core.indexes.api import ( 

34 Index, 

35 default_index, 

36) 

37from pandas.core.series import Series 

38 

39if TYPE_CHECKING: 

40 from pandas._typing import NpDtype 

41 

42 

def get_dummies(
    data,
    prefix=None,
    prefix_sep: str | Iterable[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    columns=None,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: NpDtype | None = None,
) -> DataFrame:
    """
    Convert categorical variable into dummy/indicator variables.

    Each distinct value becomes its own 0/1 (by default boolean) column in
    the output.  For DataFrame input, the originating column's name is
    prepended to each generated column name.

    Parameters
    ----------
    data : array-like, Series, or DataFrame
        Data of which to get dummy indicators.
    prefix : str, list of str, or dict of str, default None
        String(s) to prepend to the generated column names.  A list must
        have one entry per encoded column; a dict maps column names to
        prefixes.
    prefix_sep : str, list, or dict, default '_'
        Separator placed between the prefix and the level value.  Accepts
        a list or dict with the same semantics as `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs; if False, NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.  If None, all columns
        with `object`, `string`, or `category` dtype are converted.
    sparse : bool, default False
        Whether the dummy-encoded columns are backed by a
        :class:`SparseArray` (True) or a regular NumPy array (False).
    drop_first : bool, default False
        Get k-1 dummies out of k categorical levels by removing the first
        level.
    dtype : dtype, default bool
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame
        Dummy-coded data.  Columns of `data` that were not encoded are
        prepended, unaltered, to the result.

    See Also
    --------
    Series.str.get_dummies : Convert Series of strings to dummy codes.
    :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.dummies>` for more examples.

    Examples
    --------
    >>> pd.get_dummies(pd.Series(list('abca')))
           a      b      c
    0   True  False  False
    1  False   True  False
    2  False  False   True
    3   True  False  False

    >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]})
    >>> pd.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1    True   False   False    True   False
    1  2   False    True    True   False   False
    2  3    True   False   False   False    True

    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
           b      c
    0  False  False
    1   True  False
    2  False   True
    3  False  False
    4  False  False
    """
    from pandas.core.reshape.concat import concat

    dtypes_to_encode = ["object", "string", "category"]

    if not isinstance(data, DataFrame):
        # 1-D input (Series, list, ndarray, ...): encode it directly.
        return _get_dummies_1d(
            data,
            prefix,
            prefix_sep,
            dummy_na,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
        )

    # Determine the set of columns being encoded.
    if columns is None:
        data_to_encode = data.select_dtypes(include=dtypes_to_encode)
    elif not is_list_like(columns):
        raise TypeError("Input must be a list-like for parameter `columns`")
    else:
        data_to_encode = data[columns]

    def check_len(item, name: str) -> None:
        # A list-like prefix/separator must line up one-to-one with the
        # encoded columns, otherwise columns would be silently mislabelled.
        if is_list_like(item) and len(item) != data_to_encode.shape[1]:
            raise ValueError(
                f"Length of '{name}' ({len(item)}) did not match the "
                "length of the columns being encoded "
                f"({data_to_encode.shape[1]})."
            )

    check_len(prefix, "prefix")
    check_len(prefix_sep, "prefix_sep")

    # Normalize `prefix` to a per-column iterable.
    if isinstance(prefix, str):
        prefix = itertools.cycle([prefix])
    if isinstance(prefix, dict):
        prefix = [prefix[col] for col in data_to_encode.columns]
    if prefix is None:
        prefix = data_to_encode.columns

    # Normalize `prefix_sep` the same way.
    if isinstance(prefix_sep, str):
        prefix_sep = itertools.cycle([prefix_sep])
    elif isinstance(prefix_sep, dict):
        prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]

    # Columns that are NOT encoded get prepended, unaltered, to the result.
    with_dummies: list[DataFrame]
    if data_to_encode.shape == data.shape:
        # Encoding the entire frame: nothing to prepend.
        with_dummies = []
    elif columns is not None:
        # Only the explicitly requested columns are encoded; keep the rest.
        with_dummies = [data.drop(columns, axis=1)]
    else:
        # Only object/string/category columns are encoded; keep the rest.
        with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]

    for (_, column), pre, sep in zip(data_to_encode.items(), prefix, prefix_sep):
        with_dummies.append(
            _get_dummies_1d(
                column,
                prefix=pre,
                prefix_sep=sep,
                dummy_na=dummy_na,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            )
        )
    return concat(with_dummies, axis=1)

236 

237 

def _get_dummies_1d(
    data,
    prefix,
    prefix_sep: str | Iterable[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: NpDtype | None = None,
) -> DataFrame:
    """
    Encode a single 1-D input (Series or array-like) into a dummy DataFrame.

    Workhorse for :func:`get_dummies`; one call per encoded column.  The
    parameters mirror ``get_dummies`` but ``prefix``/``prefix_sep`` are
    scalars here (already resolved per-column by the caller).
    """
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data, copy=False))

    # Infer the output dtype from the input when none was requested:
    # Arrow-backed input -> Arrow bool, masked-string input -> nullable
    # "boolean", everything else -> plain numpy bool.
    if dtype is None and hasattr(data, "dtype"):
        input_dtype = data.dtype
        if isinstance(input_dtype, CategoricalDtype):
            # Inference is driven by the categories' dtype, not "category".
            input_dtype = input_dtype.categories.dtype

        if isinstance(input_dtype, ArrowDtype):
            import pyarrow as pa

            dtype = ArrowDtype(pa.bool_())  # type: ignore[assignment]
        elif (
            isinstance(input_dtype, StringDtype)
            and input_dtype.storage != "pyarrow_numpy"
        ):
            dtype = pandas_dtype("boolean")  # type: ignore[assignment]
        else:
            dtype = np.dtype(bool)
    elif dtype is None:
        dtype = np.dtype(bool)

    _dtype = pandas_dtype(dtype)

    if is_object_dtype(_dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        # Zero-column frame that still preserves the input's row index.
        index: Index | np.ndarray
        if isinstance(data, Series):
            index = data.index
        else:
            index = default_index(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    # Copy before mutating: factorize may hand back a shared codes array.
    codes = codes.copy()
    if dummy_na:
        # Re-map the NaN sentinel (-1) to a real trailing NaN level.
        codes[codes == -1] = len(levels)
        levels = levels.insert(len(levels), np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Index | None
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        # Sparse path: build one SparseArray-backed Series per level, storing
        # only the positions where that level occurs.
        fill_value: bool | float
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == np.dtype(bool):
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
        # Drop NaN positions (-1 codes); they contribute to no column.
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        # Bucket row positions by level code.
        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col, copy=False))

        return concat(sparse_series, axis=1, copy=False)

    else:
        # Dense path: one-hot matrix via fancy indexing.
        # ensure ndarray layout is column-major
        shape = len(codes), number_of_cols
        dummy_dtype: NpDtype
        if isinstance(_dtype, np.dtype):
            dummy_dtype = _dtype
        else:
            # Extension dtype: build as numpy bool, cast on DataFrame creation.
            dummy_dtype = np.bool_
        dummy_mat = np.zeros(shape=shape, dtype=dummy_dtype, order="F")
        dummy_mat[np.arange(len(codes)), codes] = 1

        if not dummy_na:
            # reset NaN GH4446
            # (codes == -1 wrapped around and set the LAST column above)
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype)

365 

366 

def from_dummies(
    data: DataFrame,
    sep: None | str = None,
    default_category: None | Hashable | dict[str, Hashable] = None,
) -> DataFrame:
    """
    Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.

    Inverts the operation performed by :func:`~pandas.get_dummies`.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : DataFrame
        Data which contains dummy-coded variables in form of integer columns of
        1's and 0's.
    sep : str, default None
        Separator used in the column names of the dummy categories; it is the
        character indicating the separation of the categorical names from the
        prefixes. For example, if your column names are 'prefix_A' and
        'prefix_B', you can strip the underscore by specifying sep='_'.
    default_category : None, Hashable or dict of Hashables, default None
        The default category is the implied category when a value has none of the
        listed categories specified with a one, i.e. if all dummies in a row are
        zero. Can be a single value for all variables or a dict directly mapping
        the default categories to a prefix of a variable.

    Returns
    -------
    DataFrame
        Categorical data decoded from the dummy input-data.

    Raises
    ------
    ValueError
        * When the input ``DataFrame`` ``data`` contains NA values.
        * When the input ``DataFrame`` ``data`` contains column names with separators
          that do not match the separator specified with ``sep``.
        * When a ``dict`` passed to ``default_category`` does not include an implied
          category for each prefix.
        * When a value in ``data`` has more than one category assigned to it.
        * When ``default_category=None`` and a value in ``data`` has no category
          assigned to it.
    TypeError
        * When the input ``data`` is not of type ``DataFrame``.
        * When the input ``DataFrame`` ``data`` contains non-dummy data.
        * When the passed ``sep`` is of a wrong data type.
        * When the passed ``default_category`` is of a wrong data type.

    See Also
    --------
    :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
    :class:`~pandas.Categorical` : Represent a categorical variable in classic
        R / S-plus fashion.

    Notes
    -----
    The columns of the passed dummy data should only include 1's and 0's,
    or boolean values.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
    ...                    "c": [0, 0, 1, 0]})

    >>> df
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> pd.from_dummies(df)
    0     a
    1     b
    2     c
    3     a

    >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 1]})

    >>> df
          col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       1       0       0       0       1

    >>> pd.from_dummies(df, sep="_")
        col1    col2
    0    a       b
    1    b       a
    2    a       c

    >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 0]})

    >>> df
          col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       0       0       0       0       0

    >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
        col1    col2
    0    a       b
    1    b       a
    2    d       e
    """
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        raise TypeError(
            "Expected 'data' to be a 'DataFrame'; "
            f"Received 'data' of type: {type(data).__name__}"
        )

    # Per-column mask: True where the column contains at least one NA.
    col_isna_mask = cast(Series, data.isna().any())

    if col_isna_mask.any():
        raise ValueError(
            "Dummy DataFrame contains NA value in column: "
            f"'{col_isna_mask.idxmax()}'"
        )

    # index data with a list of all columns that are dummies
    try:
        data_to_decode = data.astype("boolean", copy=False)
    except TypeError:
        # astype fails for anything that is not 0/1/bool-like.
        raise TypeError("Passed DataFrame contains non-dummy data")

    # collect prefixes and get lists to slice data for each prefix
    variables_slice = defaultdict(list)
    if sep is None:
        # No separator: every column belongs to one anonymous variable.
        variables_slice[""] = list(data.columns)
    elif isinstance(sep, str):
        for col in data_to_decode.columns:
            prefix = col.split(sep)[0]
            if len(prefix) == len(col):
                # split found no separator in this column name.
                raise ValueError(f"Separator not specified for column: {col}")
            variables_slice[prefix].append(col)
    else:
        raise TypeError(
            "Expected 'sep' to be of type 'str' or 'None'; "
            f"Received 'sep' of type: {type(sep).__name__}"
        )

    if default_category is not None:
        if isinstance(default_category, dict):
            # A dict must provide exactly one default per prefix.
            if not len(default_category) == len(variables_slice):
                len_msg = (
                    f"Length of 'default_category' ({len(default_category)}) "
                    f"did not match the length of the columns being encoded "
                    f"({len(variables_slice)})"
                )
                raise ValueError(len_msg)
        elif isinstance(default_category, Hashable):
            # Broadcast a scalar default to every prefix.
            default_category = dict(
                zip(variables_slice, [default_category] * len(variables_slice))
            )
        else:
            raise TypeError(
                "Expected 'default_category' to be of type "
                "'None', 'Hashable', or 'dict'; "
                "Received 'default_category' of type: "
                f"{type(default_category).__name__}"
            )

    cat_data = {}
    for prefix, prefix_slice in variables_slice.items():
        # Category labels: the column names with the "<prefix><sep>" stripped.
        if sep is None:
            cats = prefix_slice.copy()
        else:
            cats = [col[len(prefix + sep) :] for col in prefix_slice]
        # Number of categories set per row; must be exactly 1 (or 0 when a
        # default category is available).
        assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1)
        if any(assigned > 1):
            raise ValueError(
                "Dummy DataFrame contains multi-assignment(s); "
                f"First instance in row: {assigned.idxmax()}"
            )
        if any(assigned == 0):
            if isinstance(default_category, dict):
                # Append a synthetic column that is True exactly where no
                # dummy fired, so idxmax below resolves to the default.
                cats.append(default_category[prefix])
            else:
                raise ValueError(
                    "Dummy DataFrame contains unassigned value(s); "
                    f"First instance in row: {assigned.idxmin()}"
                )
            data_slice = concat(
                (data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1
            )
        else:
            data_slice = data_to_decode.loc[:, prefix_slice]
        cats_array = data._constructor_sliced(cats, dtype=data.columns.dtype)
        # get indices of True entries along axis=1
        true_values = data_slice.idxmax(axis=1)
        indexer = data_slice.columns.get_indexer_for(true_values)
        cat_data[prefix] = cats_array.take(indexer).set_axis(data.index)

    result = DataFrame(cat_data)
    if sep is not None:
        # Keep the output column dtype consistent with the input's columns.
        result.columns = result.columns.astype(data.columns.dtype)
    return result