1from __future__ import annotations
2
3from collections import defaultdict
4from collections.abc import (
5 Hashable,
6 Iterable,
7)
8import itertools
9from typing import (
10 TYPE_CHECKING,
11 cast,
12)
13
14import numpy as np
15
16from pandas._libs.sparse import IntIndex
17
18from pandas.core.dtypes.common import (
19 is_integer_dtype,
20 is_list_like,
21 is_object_dtype,
22 pandas_dtype,
23)
24from pandas.core.dtypes.dtypes import (
25 ArrowDtype,
26 CategoricalDtype,
27)
28
29from pandas.core.arrays import SparseArray
30from pandas.core.arrays.categorical import factorize_from_iterable
31from pandas.core.arrays.string_ import StringDtype
32from pandas.core.frame import DataFrame
33from pandas.core.indexes.api import (
34 Index,
35 default_index,
36)
37from pandas.core.series import Series
38
39if TYPE_CHECKING:
40 from pandas._typing import NpDtype
41
42
def get_dummies(
    data,
    prefix=None,
    prefix_sep: str | Iterable[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    columns=None,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: NpDtype | None = None,
) -> DataFrame:
    """
    Convert categorical variable into dummy/indicator variables.

    Each variable is converted in as many 0/1 variables as there are different
    values. Columns in the output are each named after a value; if the input is
    a DataFrame, the name of the original variable is prepended to the value.

    Parameters
    ----------
    data : array-like, Series, or DataFrame
        Data of which to get dummy indicators.
    prefix : str, list of str, or dict of str, default None
        String to append DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : str, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object`, `string`, or `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.
    dtype : dtype, default bool
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame
        Dummy-coded data. If `data` contains other columns than the
        dummy-coded one(s), these will be prepended, unaltered, to the result.

    See Also
    --------
    Series.str.get_dummies : Convert Series of strings to dummy codes.
    :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.dummies>` for more examples.

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> pd.get_dummies(s)
           a      b      c
    0   True  False  False
    1  False   True  False
    2  False  False   True
    3   True  False  False

    >>> s1 = ['a', 'b', np.nan]

    >>> pd.get_dummies(s1)
           a      b
    0   True  False
    1  False   True
    2  False  False

    >>> pd.get_dummies(s1, dummy_na=True)
           a      b    NaN
    0   True  False  False
    1  False   True  False
    2  False  False   True

    >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]})

    >>> pd.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1    True   False   False    True   False
    1  2   False    True    True   False   False
    2  3    True   False   False   False    True

    >>> pd.get_dummies(pd.Series(list('abcaa')))
           a      b      c
    0   True  False  False
    1  False   True  False
    2  False  False   True
    3   True  False  False
    4   True  False  False

    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
           b      c
    0  False  False
    1   True  False
    2  False   True
    3  False  False
    4  False  False

    >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
         a    b    c
    0  1.0  0.0  0.0
    1  0.0  1.0  0.0
    2  0.0  0.0  1.0
    """
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        # 1-D input: encode directly, no per-column prefix bookkeeping needed
        return _get_dummies_1d(
            data,
            prefix,
            prefix_sep,
            dummy_na,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
        )

    dtypes_to_encode = ["object", "string", "category"]

    # determine which columns get encoded
    if columns is None:
        data_to_encode = data.select_dtypes(include=dtypes_to_encode)
    elif not is_list_like(columns):
        raise TypeError("Input must be a list-like for parameter `columns`")
    else:
        data_to_encode = data[columns]

    def _validate_length(item, name: str) -> None:
        # a list-like prefix/prefix_sep must supply one entry per encoded
        # column, otherwise columns would be silently dropped
        if is_list_like(item) and len(item) != data_to_encode.shape[1]:
            raise ValueError(
                f"Length of '{name}' ({len(item)}) did not match the "
                "length of the columns being encoded "
                f"({data_to_encode.shape[1]})."
            )

    _validate_length(prefix, "prefix")
    _validate_length(prefix_sep, "prefix_sep")

    # normalize prefix / prefix_sep into one-value-per-column iterables
    if isinstance(prefix, str):
        prefix = itertools.cycle([prefix])
    elif isinstance(prefix, dict):
        prefix = [prefix[col] for col in data_to_encode.columns]
    elif prefix is None:
        prefix = data_to_encode.columns

    if isinstance(prefix_sep, str):
        prefix_sep = itertools.cycle([prefix_sep])
    elif isinstance(prefix_sep, dict):
        prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]

    pieces: list[DataFrame]
    if data_to_encode.shape == data.shape:
        # encoding the entire df, do not prepend any dropped columns
        pieces = []
    elif columns is not None:
        # keep the columns the caller did not ask to encode
        pieces = [data.drop(columns, axis=1)]
    else:
        # keep every column whose dtype is not subject to encoding
        pieces = [data.select_dtypes(exclude=dtypes_to_encode)]

    for (_, column), pre, sep in zip(data_to_encode.items(), prefix, prefix_sep):
        pieces.append(
            _get_dummies_1d(
                column,
                prefix=pre,
                prefix_sep=sep,
                dummy_na=dummy_na,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            )
        )
    return concat(pieces, axis=1)
236
237
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep: str | Iterable[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: NpDtype | None = None,
) -> DataFrame:
    """
    Encode a single 1-D array-like as a DataFrame of dummy columns.

    Backend for :func:`get_dummies`; here ``prefix`` and ``prefix_sep`` are
    single values applying to this one column only.

    Parameters
    ----------
    data : array-like or Series
        Values to encode; factorized to determine the levels.
    prefix : str or None
        If not None, each output column is named
        ``f"{prefix}{prefix_sep}{level}"``; otherwise the levels themselves
        are used as column labels.
    prefix_sep : str, default "_"
        Separator placed between ``prefix`` and the level name.
    dummy_na : bool, default False
        If True, append an extra column representing NaN values.
    sparse : bool, default False
        If True, back each output column with a :class:`SparseArray`.
    drop_first : bool, default False
        If True, drop the column for the first level.
    dtype : dtype, optional
        Output dtype; when None it is inferred from ``data.dtype``
        (falling back to ``bool``).

    Returns
    -------
    DataFrame
        One column per (kept) level; index taken from ``data`` when it is a
        Series.
    """
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data, copy=False))

    # Infer an output dtype from the input when none was given:
    # Arrow-backed input -> Arrow bool; StringDtype with a storage other than
    # "pyarrow_numpy" -> nullable "boolean"; anything else -> numpy bool.
    if dtype is None and hasattr(data, "dtype"):
        input_dtype = data.dtype
        if isinstance(input_dtype, CategoricalDtype):
            # categorical: base the inference on the categories' dtype
            input_dtype = input_dtype.categories.dtype

        if isinstance(input_dtype, ArrowDtype):
            import pyarrow as pa

            dtype = ArrowDtype(pa.bool_())  # type: ignore[assignment]
        elif (
            isinstance(input_dtype, StringDtype)
            and input_dtype.storage != "pyarrow_numpy"
        ):
            dtype = pandas_dtype("boolean")  # type: ignore[assignment]
        else:
            dtype = np.dtype(bool)
    elif dtype is None:
        # input has no dtype attribute (e.g. plain list): default to bool
        dtype = np.dtype(bool)

    _dtype = pandas_dtype(dtype)

    if is_object_dtype(_dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        # zero-column frame that preserves the input's index (or a default
        # RangeIndex for non-Series input)
        index: Index | np.ndarray
        if isinstance(data, Series):
            index = data.index
        else:
            index = default_index(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        # map missing codes (-1) onto a synthetic NaN level appended at the end
        codes[codes == -1] = len(levels)
        levels = levels.insert(len(levels), np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Index | None
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        # Sparse path: build one SparseArray-backed Series per level, storing
        # only the row positions where that level occurs.
        fill_value: bool | float
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == np.dtype(bool):
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        # bucket row positions by level code
        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col, copy=False))

        return concat(sparse_series, axis=1, copy=False)

    else:
        # ensure ndarray layout is column-major
        shape = len(codes), number_of_cols
        dummy_dtype: NpDtype
        if isinstance(_dtype, np.dtype):
            dummy_dtype = _dtype
        else:
            # extension dtypes: build as bool and convert via the DataFrame
            # constructor below
            dummy_dtype = np.bool_
        dummy_mat = np.zeros(shape=shape, dtype=dummy_dtype, order="F")
        # one-hot assignment: set the matching level column for every row
        dummy_mat[np.arange(len(codes)), codes] = 1

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype)
365
366
def from_dummies(
    data: DataFrame,
    sep: None | str = None,
    default_category: None | Hashable | dict[str, Hashable] = None,
) -> DataFrame:
    """
    Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.

    Inverts the operation performed by :func:`~pandas.get_dummies`.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : DataFrame
        Data which contains dummy-coded variables in form of integer columns of
        1's and 0's.
    sep : str, default None
        Separator used in the column names of the dummy categories; it is the
        character separating the category names from the prefixes.
        For example, if your column names are 'prefix_A' and 'prefix_B',
        you can strip the underscore by specifying sep='_'.
    default_category : None, Hashable or dict of Hashables, default None
        The default category is the implied category when a value has none of the
        listed categories specified with a one, i.e. if all dummies in a row are
        zero. Can be a single value for all variables or a dict directly mapping
        the default categories to a prefix of a variable.

    Returns
    -------
    DataFrame
        Categorical data decoded from the dummy input-data.

    Raises
    ------
    ValueError
        * When the input ``DataFrame`` ``data`` contains NA values.
        * When the input ``DataFrame`` ``data`` contains column names with separators
          that do not match the separator specified with ``sep``.
        * When a ``dict`` passed to ``default_category`` does not include an implied
          category for each prefix.
        * When a value in ``data`` has more than one category assigned to it.
        * When ``default_category=None`` and a value in ``data`` has no category
          assigned to it.
    TypeError
        * When the input ``data`` is not of type ``DataFrame``.
        * When the input ``DataFrame`` ``data`` contains non-dummy data.
        * When the passed ``sep`` is of a wrong data type.
        * When the passed ``default_category`` is of a wrong data type.

    See Also
    --------
    :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
    :class:`~pandas.Categorical` : Represent a categorical variable in classic
        R / S-plus fashion.

    Notes
    -----
    The columns of the passed dummy data should only include 1's and 0's,
    or boolean values.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
    ...                    "c": [0, 0, 1, 0]})

    >>> df
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> pd.from_dummies(df)
    0     a
    1     b
    2     c
    3     a

    >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 1]})

    >>> df
       col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       1       0       0       0       1

    >>> pd.from_dummies(df, sep="_")
      col1 col2
    0    a    b
    1    b    a
    2    a    c

    >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 0]})

    >>> df
       col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       0       0       0       0       0

    >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
      col1 col2
    0    a    b
    1    b    a
    2    d    e
    """
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        raise TypeError(
            "Expected 'data' to be a 'DataFrame'; "
            f"Received 'data' of type: {type(data).__name__}"
        )

    # per-column mask: True where a column contains at least one NA
    col_isna_mask = cast(Series, data.isna().any())

    if col_isna_mask.any():
        raise ValueError(
            "Dummy DataFrame contains NA value in column: "
            f"'{col_isna_mask.idxmax()}'"
        )

    # index data with a list of all columns that are dummies
    try:
        data_to_decode = data.astype("boolean", copy=False)
    except TypeError:
        raise TypeError("Passed DataFrame contains non-dummy data")

    # collect prefixes and get lists to slice data for each prefix
    variables_slice = defaultdict(list)
    if sep is None:
        # no separator: the whole frame is one variable with an empty prefix
        variables_slice[""] = list(data.columns)
    elif isinstance(sep, str):
        for col in data_to_decode.columns:
            prefix = col.split(sep)[0]
            # a column name without the separator is malformed for this sep
            if len(prefix) == len(col):
                raise ValueError(f"Separator not specified for column: {col}")
            variables_slice[prefix].append(col)
    else:
        raise TypeError(
            "Expected 'sep' to be of type 'str' or 'None'; "
            f"Received 'sep' of type: {type(sep).__name__}"
        )

    if default_category is not None:
        if isinstance(default_category, dict):
            # a dict default must name exactly one category per prefix
            if not len(default_category) == len(variables_slice):
                len_msg = (
                    f"Length of 'default_category' ({len(default_category)}) "
                    f"did not match the length of the columns being encoded "
                    f"({len(variables_slice)})"
                )
                raise ValueError(len_msg)
        elif isinstance(default_category, Hashable):
            # broadcast a single default value to every prefix
            default_category = dict(
                zip(variables_slice, [default_category] * len(variables_slice))
            )
        else:
            raise TypeError(
                "Expected 'default_category' to be of type "
                "'None', 'Hashable', or 'dict'; "
                "Received 'default_category' of type: "
                f"{type(default_category).__name__}"
            )

    cat_data = {}
    for prefix, prefix_slice in variables_slice.items():
        # recover category names by stripping "<prefix><sep>" when sep is used
        if sep is None:
            cats = prefix_slice.copy()
        else:
            cats = [col[len(prefix + sep) :] for col in prefix_slice]
        # per-row count of set dummies for this variable; must be exactly one
        # (or zero if a default category was supplied)
        assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1)
        if any(assigned > 1):
            raise ValueError(
                "Dummy DataFrame contains multi-assignment(s); "
                f"First instance in row: {assigned.idxmax()}"
            )
        if any(assigned == 0):
            if isinstance(default_category, dict):
                # unassigned rows map to the default category; represent it as
                # an extra (virtual) column appended to the slice below
                cats.append(default_category[prefix])
            else:
                raise ValueError(
                    "Dummy DataFrame contains unassigned value(s); "
                    f"First instance in row: {assigned.idxmin()}"
                )
            data_slice = concat(
                (data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1
            )
        else:
            data_slice = data_to_decode.loc[:, prefix_slice]
        cats_array = data._constructor_sliced(cats, dtype=data.columns.dtype)
        # get indices of True entries along axis=1
        true_values = data_slice.idxmax(axis=1)
        indexer = data_slice.columns.get_indexer_for(true_values)
        cat_data[prefix] = cats_array.take(indexer).set_axis(data.index)

    result = DataFrame(cat_data)
    if sep is not None:
        result.columns = result.columns.astype(data.columns.dtype)
    return result