Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/reshape/encoding.py: 10%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

154 statements  

1from __future__ import annotations 

2 

3from collections import defaultdict 

4import itertools 

5from typing import ( 

6 Hashable, 

7 Iterable, 

8) 

9 

10import numpy as np 

11 

12from pandas._libs.sparse import IntIndex 

13from pandas._typing import NpDtype 

14 

15from pandas.core.dtypes.common import ( 

16 is_integer_dtype, 

17 is_list_like, 

18 is_object_dtype, 

19 pandas_dtype, 

20) 

21 

22from pandas.core.arrays import SparseArray 

23from pandas.core.arrays.categorical import factorize_from_iterable 

24from pandas.core.frame import DataFrame 

25from pandas.core.indexes.api import ( 

26 Index, 

27 default_index, 

28) 

29from pandas.core.series import Series 

30 

31 

def get_dummies(
    data,
    prefix=None,
    prefix_sep: str | Iterable[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    columns=None,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: NpDtype | None = None,
) -> DataFrame:
    """
    Convert categorical variable into dummy/indicator variables.

    Each distinct value of an encoded column becomes its own 0/1 output
    column named after the value; for DataFrame input, the originating
    column name (or a caller-supplied prefix) is prepended to each value.

    Parameters
    ----------
    data : array-like, Series, or DataFrame
        Data of which to get dummy indicators.
    prefix : str, list of str, or dict of str, default None
        String to append DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : str, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object`, `string`, or `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.
    dtype : dtype, default bool
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame
        Dummy-coded data. If `data` contains other columns than the
        dummy-coded one(s), these will be prepended, unaltered, to the result.

    See Also
    --------
    Series.str.get_dummies : Convert Series of strings to dummy codes.
    :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.dummies>` for more examples.

    Examples
    --------
    >>> pd.get_dummies(pd.Series(list('abca')))
           a      b      c
    0   True  False  False
    1  False   True  False
    2  False  False   True
    3   True  False  False

    >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
         a    b    c
    0  1.0  0.0  0.0
    1  0.0  1.0  0.0
    2  0.0  0.0  1.0
    """
    from pandas.core.reshape.concat import concat

    dtypes_to_encode = ["object", "string", "category"]

    if not isinstance(data, DataFrame):
        # 1-D input: hand the whole thing to the single-column worker.
        return _get_dummies_1d(
            data,
            prefix,
            prefix_sep,
            dummy_na,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
        )

    # Work out which columns get encoded.
    if columns is None:
        data_to_encode = data.select_dtypes(include=dtypes_to_encode)
    elif not is_list_like(columns):
        raise TypeError("Input must be a list-like for parameter `columns`")
    else:
        data_to_encode = data[columns]

    def check_len(item, name) -> None:
        # A list-like prefix/separator must line up 1:1 with the encoded
        # columns, otherwise columns would silently be mis-labelled.
        if is_list_like(item) and len(item) != data_to_encode.shape[1]:
            len_msg = (
                f"Length of '{name}' ({len(item)}) did not match the "
                "length of the columns being encoded "
                f"({data_to_encode.shape[1]})."
            )
            raise ValueError(len_msg)

    check_len(prefix, "prefix")
    check_len(prefix_sep, "prefix_sep")

    # Normalize `prefix` into something that zips against the encoded columns.
    if isinstance(prefix, str):
        prefix = itertools.cycle([prefix])
    if isinstance(prefix, dict):
        prefix = [prefix[col] for col in data_to_encode.columns]
    if prefix is None:
        prefix = data_to_encode.columns

    # Same normalization for the separators.
    if isinstance(prefix_sep, str):
        prefix_sep = itertools.cycle([prefix_sep])
    elif isinstance(prefix_sep, dict):
        prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]

    # Columns that are *not* encoded are prepended to the result unchanged.
    with_dummies: list[DataFrame]
    if data_to_encode.shape == data.shape:
        # Encoding the entire df, nothing to prepend.
        with_dummies = []
    elif columns is not None:
        # Only the explicitly requested columns are encoded.
        with_dummies = [data.drop(columns, axis=1)]
    else:
        # Only object/string/category columns are encoded.
        with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]

    for (_, column), pre, sep in zip(data_to_encode.items(), prefix, prefix_sep):
        dummy = _get_dummies_1d(
            column,
            prefix=pre,
            prefix_sep=sep,
            dummy_na=dummy_na,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
        )
        with_dummies.append(dummy)
    return concat(with_dummies, axis=1)

225 

226 

227def _get_dummies_1d( 

228 data, 

229 prefix, 

230 prefix_sep: str | Iterable[str] | dict[str, str] = "_", 

231 dummy_na: bool = False, 

232 sparse: bool = False, 

233 drop_first: bool = False, 

234 dtype: NpDtype | None = None, 

235) -> DataFrame: 

236 from pandas.core.reshape.concat import concat 

237 

238 # Series avoids inconsistent NaN handling 

239 codes, levels = factorize_from_iterable(Series(data, copy=False)) 

240 

241 if dtype is None: 

242 dtype = np.dtype(bool) 

243 _dtype = pandas_dtype(dtype) 

244 

245 if is_object_dtype(_dtype): 

246 raise ValueError("dtype=object is not a valid dtype for get_dummies") 

247 

248 def get_empty_frame(data) -> DataFrame: 

249 index: Index | np.ndarray 

250 if isinstance(data, Series): 

251 index = data.index 

252 else: 

253 index = default_index(len(data)) 

254 return DataFrame(index=index) 

255 

256 # if all NaN 

257 if not dummy_na and len(levels) == 0: 

258 return get_empty_frame(data) 

259 

260 codes = codes.copy() 

261 if dummy_na: 

262 codes[codes == -1] = len(levels) 

263 levels = levels.insert(len(levels), np.nan) 

264 

265 # if dummy_na, we just fake a nan level. drop_first will drop it again 

266 if drop_first and len(levels) == 1: 

267 return get_empty_frame(data) 

268 

269 number_of_cols = len(levels) 

270 

271 if prefix is None: 

272 dummy_cols = levels 

273 else: 

274 dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels]) 

275 

276 index: Index | None 

277 if isinstance(data, Series): 

278 index = data.index 

279 else: 

280 index = None 

281 

282 if sparse: 

283 fill_value: bool | float 

284 if is_integer_dtype(dtype): 

285 fill_value = 0 

286 elif dtype == np.dtype(bool): 

287 fill_value = False 

288 else: 

289 fill_value = 0.0 

290 

291 sparse_series = [] 

292 N = len(data) 

293 sp_indices: list[list] = [[] for _ in range(len(dummy_cols))] 

294 mask = codes != -1 

295 codes = codes[mask] 

296 n_idx = np.arange(N)[mask] 

297 

298 for ndx, code in zip(n_idx, codes): 

299 sp_indices[code].append(ndx) 

300 

301 if drop_first: 

302 # remove first categorical level to avoid perfect collinearity 

303 # GH12042 

304 sp_indices = sp_indices[1:] 

305 dummy_cols = dummy_cols[1:] 

306 for col, ixs in zip(dummy_cols, sp_indices): 

307 sarr = SparseArray( 

308 np.ones(len(ixs), dtype=dtype), 

309 sparse_index=IntIndex(N, ixs), 

310 fill_value=fill_value, 

311 dtype=dtype, 

312 ) 

313 sparse_series.append(Series(data=sarr, index=index, name=col, copy=False)) 

314 

315 return concat(sparse_series, axis=1, copy=False) 

316 

317 else: 

318 # take on axis=1 + transpose to ensure ndarray layout is column-major 

319 eye_dtype: NpDtype 

320 if isinstance(_dtype, np.dtype): 

321 eye_dtype = _dtype 

322 else: 

323 eye_dtype = np.bool_ 

324 dummy_mat = np.eye(number_of_cols, dtype=eye_dtype).take(codes, axis=1).T 

325 

326 if not dummy_na: 

327 # reset NaN GH4446 

328 dummy_mat[codes == -1] = 0 

329 

330 if drop_first: 

331 # remove first GH12042 

332 dummy_mat = dummy_mat[:, 1:] 

333 dummy_cols = dummy_cols[1:] 

334 return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype) 

335 

336 

def from_dummies(
    data: DataFrame,
    sep: None | str = None,
    default_category: None | Hashable | dict[str, Hashable] = None,
) -> DataFrame:
    """
    Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.

    Inverts the operation performed by :func:`~pandas.get_dummies`.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : DataFrame
        Dummy-coded data: integer (or boolean) columns of 1's and 0's.
    sep : str, default None
        Separator between a variable's prefix and its category in the
        column names; e.g. ``sep="_"`` strips the underscore from columns
        named like ``'prefix_A'`` and ``'prefix_B'``.
    default_category : None, Hashable or dict of Hashables, default None
        Category implied for a row whose dummies are all zero. Either one
        value shared by all variables or a dict mapping each prefix to its
        default category.

    Returns
    -------
    DataFrame
        Categorical data decoded from the dummy input-data.

    Raises
    ------
    ValueError
        When ``data`` contains NA values; when column names do not contain
        ``sep``; when a ``default_category`` dict does not cover every
        prefix; when a row has more than one category assigned; or when a
        row has no category assigned and ``default_category`` is None.
    TypeError
        When ``data`` is not a ``DataFrame`` or contains non-dummy data, or
        when ``sep`` / ``default_category`` have a wrong type.

    See Also
    --------
    :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
    :class:`~pandas.Categorical` : Represent a categorical variable in classic.

    Notes
    -----
    The columns of the passed dummy data should only include 1's and 0's,
    or boolean values.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
    ...                    "c": [0, 0, 1, 0]})
    >>> pd.from_dummies(df)
    0 a
    1 b
    2 c
    3 a

    >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 1]})
    >>> pd.from_dummies(df, sep="_")
      col1 col2
    0    a    b
    1    b    a
    2    a    c
    """
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        raise TypeError(
            "Expected 'data' to be a 'DataFrame'; "
            f"Received 'data' of type: {type(data).__name__}"
        )

    col_has_na = data.isna().any()
    if col_has_na.any():
        raise ValueError(
            "Dummy DataFrame contains NA value in column: "
            f"'{col_has_na.idxmax()}'"
        )

    # The nullable-boolean cast doubles as validation: non-dummy data fails it.
    try:
        data_to_decode = data.astype("boolean", copy=False)
    except TypeError:
        raise TypeError("Passed DataFrame contains non-dummy data")

    # Group the dummy columns by variable prefix.
    variables_slice = defaultdict(list)
    if sep is None:
        # No separator: everything belongs to one anonymous variable.
        variables_slice[""] = list(data.columns)
    elif isinstance(sep, str):
        for col in data_to_decode.columns:
            prefix = col.split(sep)[0]
            if len(prefix) == len(col):
                raise ValueError(f"Separator not specified for column: {col}")
            variables_slice[prefix].append(col)
    else:
        raise TypeError(
            "Expected 'sep' to be of type 'str' or 'None'; "
            f"Received 'sep' of type: {type(sep).__name__}"
        )

    # Normalize default_category into a dict keyed by prefix.
    if default_category is not None:
        if isinstance(default_category, dict):
            if len(default_category) != len(variables_slice):
                len_msg = (
                    f"Length of 'default_category' ({len(default_category)}) "
                    f"did not match the length of the columns being encoded "
                    f"({len(variables_slice)})"
                )
                raise ValueError(len_msg)
        elif isinstance(default_category, Hashable):
            default_category = dict(
                zip(variables_slice, [default_category] * len(variables_slice))
            )
        else:
            raise TypeError(
                "Expected 'default_category' to be of type "
                "'None', 'Hashable', or 'dict'; "
                "Received 'default_category' of type: "
                f"{type(default_category).__name__}"
            )

    cat_data = {}
    for prefix, prefix_slice in variables_slice.items():
        if sep is None:
            cats = prefix_slice.copy()
        else:
            cats = [col[len(prefix + sep) :] for col in prefix_slice]
        assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1)
        if (assigned > 1).any():
            raise ValueError(
                "Dummy DataFrame contains multi-assignment(s); "
                f"First instance in row: {assigned.idxmax()}"
            )
        if (assigned == 0).any():
            if not isinstance(default_category, dict):
                raise ValueError(
                    "Dummy DataFrame contains unassigned value(s); "
                    f"First instance in row: {assigned.idxmin()}"
                )
            # All-zero rows decode to this prefix's default category, added
            # as an extra pseudo-column that is True exactly where needed.
            cats.append(default_category[prefix])
            data_slice = concat(
                (data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1
            )
        else:
            data_slice = data_to_decode.loc[:, prefix_slice]
        cats_array = np.array(cats, dtype="object")
        # Exactly one True per row; its column position indexes the category.
        cat_data[prefix] = cats_array[data_slice.to_numpy().nonzero()[1]]

    return DataFrame(cat_data)