1from __future__ import annotations
2
3from collections import defaultdict
4import itertools
5from typing import (
6 Hashable,
7 Iterable,
8)
9
10import numpy as np
11
12from pandas._libs.sparse import IntIndex
13from pandas._typing import NpDtype
14
15from pandas.core.dtypes.common import (
16 is_integer_dtype,
17 is_list_like,
18 is_object_dtype,
19 pandas_dtype,
20)
21
22from pandas.core.arrays import SparseArray
23from pandas.core.arrays.categorical import factorize_from_iterable
24from pandas.core.frame import DataFrame
25from pandas.core.indexes.api import (
26 Index,
27 default_index,
28)
29from pandas.core.series import Series
30
31
def get_dummies(
    data,
    prefix=None,
    prefix_sep: str | Iterable[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    columns=None,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: NpDtype | None = None,
) -> DataFrame:
    """
    Convert categorical variable into dummy/indicator variables.

    Each distinct value becomes its own 0/1 indicator column. Output columns
    are named after the values; for DataFrame input, the originating column
    name is prepended to each value.

    Parameters
    ----------
    data : array-like, Series, or DataFrame
        Data of which to get dummy indicators.
    prefix : str, list of str, or dict of str, default None
        String to append DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : str, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object`, `string`, or `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.
    dtype : dtype, default bool
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame
        Dummy-coded data. If `data` contains other columns than the
        dummy-coded one(s), these will be prepended, unaltered, to the result.

    See Also
    --------
    Series.str.get_dummies : Convert Series of strings to dummy codes.
    :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.dummies>` for more examples.

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> pd.get_dummies(s)
           a      b      c
    0   True  False  False
    1  False   True  False
    2  False  False   True
    3   True  False  False

    >>> s1 = ['a', 'b', np.nan]

    >>> pd.get_dummies(s1)
           a      b
    0   True  False
    1  False   True
    2  False  False

    >>> pd.get_dummies(s1, dummy_na=True)
           a      b    NaN
    0   True  False  False
    1  False   True  False
    2  False  False   True

    >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]})

    >>> pd.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1    True   False   False    True   False
    1  2   False    True    True   False   False
    2  3    True   False   False   False    True

    >>> pd.get_dummies(pd.Series(list('abcaa')))
           a      b      c
    0   True  False  False
    1  False   True  False
    2  False  False   True
    3   True  False  False
    4   True  False  False

    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
           b      c
    0  False  False
    1   True  False
    2  False   True
    3  False  False
    4  False  False

    >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
         a    b    c
    0  1.0  0.0  0.0
    1  0.0  1.0  0.0
    2  0.0  0.0  1.0
    """
    from pandas.core.reshape.concat import concat

    dtypes_to_encode = ["object", "string", "category"]

    if not isinstance(data, DataFrame):
        # 1-D input (Series, list, ndarray, ...): encode it directly.
        return _get_dummies_1d(
            data,
            prefix,
            prefix_sep,
            dummy_na,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
        )

    # Determine which columns get encoded.
    if columns is None:
        data_to_encode = data.select_dtypes(include=dtypes_to_encode)
    elif not is_list_like(columns):
        raise TypeError("Input must be a list-like for parameter `columns`")
    else:
        data_to_encode = data[columns]

    def check_len(item, name) -> None:
        # A list-like prefix/separator must supply one entry per encoded
        # column; otherwise columns would silently be dropped by zip().
        if is_list_like(item) and len(item) != data_to_encode.shape[1]:
            raise ValueError(
                f"Length of '{name}' ({len(item)}) did not match the "
                "length of the columns being encoded "
                f"({data_to_encode.shape[1]})."
            )

    check_len(prefix, "prefix")
    check_len(prefix_sep, "prefix_sep")

    def per_column(spec, default):
        # Expand a str/dict/None spec into one value per encoded column.
        # A missing dict key raises KeyError, matching the documented contract.
        if isinstance(spec, str):
            return itertools.cycle([spec])
        if isinstance(spec, dict):
            return [spec[col] for col in data_to_encode.columns]
        return default if spec is None else spec

    prefixes = per_column(prefix, data_to_encode.columns)
    separators = per_column(prefix_sep, prefix_sep)

    # Columns that are not being encoded are prepended, unaltered.
    if data_to_encode.shape == data.shape:
        # Entire frame is being encoded: nothing to prepend.
        pieces: list[DataFrame] = []
    elif columns is not None:
        # Explicit column list: everything else is passed through.
        pieces = [data.drop(columns, axis=1)]
    else:
        # Dtype-based selection: pass through the non-encodable dtypes.
        pieces = [data.select_dtypes(exclude=dtypes_to_encode)]

    for (_, column), pre, sep in zip(data_to_encode.items(), prefixes, separators):
        pieces.append(
            _get_dummies_1d(
                column,
                prefix=pre,
                prefix_sep=sep,
                dummy_na=dummy_na,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            )
        )
    return concat(pieces, axis=1)
225
226
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep: str | Iterable[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: NpDtype | None = None,
) -> DataFrame:
    """
    Encode a single 1-D array/Series of categoricals as dummy columns.

    Backend for :func:`get_dummies`; handles one column at a time.

    Parameters
    ----------
    data : array-like or Series
        Values to encode; factorized into integer codes and levels.
    prefix : str or None
        If not None, output columns are named ``f"{prefix}{prefix_sep}{level}"``;
        otherwise the levels themselves are used as column labels.
    prefix_sep : str, default "_"
        Separator between prefix and level in output column names.
    dummy_na : bool, default False
        If True, add an extra column for missing values.
    sparse : bool, default False
        If True, back each dummy column with a SparseArray.
    drop_first : bool, default False
        If True, drop the first level's column (k-1 dummies out of k levels).
    dtype : dtype, optional
        Dtype for the dummy columns; defaults to bool. ``object`` is rejected.

    Returns
    -------
    DataFrame
        One indicator column per (kept) level, indexed like ``data`` when it
        is a Series.
    """
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data, copy=False))

    if dtype is None:
        # Default output dtype for the indicator columns.
        dtype = np.dtype(bool)
    _dtype = pandas_dtype(dtype)

    if is_object_dtype(_dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        # Column-less frame that preserves the input's index (or a default
        # RangeIndex for non-Series input).
        index: Index | np.ndarray
        if isinstance(data, Series):
            index = data.index
        else:
            index = default_index(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    # Copy before mutating: factorize may hand back a shared buffer.
    codes = codes.copy()
    if dummy_na:
        # Remap missing values (code -1) onto a synthetic trailing NaN level.
        codes[codes == -1] = len(levels)
        levels = levels.insert(len(levels), np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Index | None
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        # Sparse path: build one SparseArray per level from the row positions
        # at which that level occurs.
        fill_value: bool | float
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == np.dtype(bool):
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
        # Rows still coded -1 here are missing values with dummy_na=False:
        # they get no indicator at all.
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        # Bucket row positions by level code.
        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col, copy=False))

        return concat(sparse_series, axis=1, copy=False)

    else:
        # Dense path: selecting identity-matrix columns by code yields the
        # indicator matrix in one vectorized step.
        # take on axis=1 + transpose to ensure ndarray layout is column-major
        eye_dtype: NpDtype
        if isinstance(_dtype, np.dtype):
            eye_dtype = _dtype
        else:
            # Extension dtype: build as bool here, cast via the DataFrame
            # constructor below.
            eye_dtype = np.bool_
        dummy_mat = np.eye(number_of_cols, dtype=eye_dtype).take(codes, axis=1).T

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype)
335
336
def from_dummies(
    data: DataFrame,
    sep: None | str = None,
    default_category: None | Hashable | dict[str, Hashable] = None,
) -> DataFrame:
    """
    Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.

    Inverts the operation performed by :func:`~pandas.get_dummies`.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : DataFrame
        Data which contains dummy-coded variables in form of integer columns of
        1's and 0's.
    sep : str, default None
        Separator used in the column names of the dummy categories; the
        character(s) separating the categorical names from the prefixes.
        For example, if your column names are 'prefix_A' and 'prefix_B',
        you can strip the underscore by specifying sep='_'.
    default_category : None, Hashable or dict of Hashables, default None
        The default category is the implied category when a value has none of the
        listed categories specified with a one, i.e. if all dummies in a row are
        zero. Can be a single value for all variables or a dict directly mapping
        the default categories to a prefix of a variable.

    Returns
    -------
    DataFrame
        Categorical data decoded from the dummy input-data.

    Raises
    ------
    ValueError
        * When the input ``DataFrame`` ``data`` contains NA values.
        * When the input ``DataFrame`` ``data`` contains column names with separators
          that do not match the separator specified with ``sep``.
        * When a ``dict`` passed to ``default_category`` does not include an implied
          category for each prefix.
        * When a value in ``data`` has more than one category assigned to it.
        * When ``default_category=None`` and a value in ``data`` has no category
          assigned to it.
    TypeError
        * When the input ``data`` is not of type ``DataFrame``.
        * When the input ``DataFrame`` ``data`` contains non-dummy data.
        * When the passed ``sep`` is of a wrong data type.
        * When the passed ``default_category`` is of a wrong data type.

    See Also
    --------
    :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
    :class:`~pandas.Categorical` : Represent a categorical variable in classic
        R / S-plus fashion.

    Notes
    -----
    The columns of the passed dummy data should only include 1's and 0's,
    or boolean values.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
    ...                    "c": [0, 0, 1, 0]})

    >>> df
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> pd.from_dummies(df)
    0     a
    1     b
    2     c
    3     a

    >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 1]})

    >>> df
          col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       1       0       0       0       1

    >>> pd.from_dummies(df, sep="_")
        col1    col2
    0    a       b
    1    b       a
    2    a       c

    >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 0]})

    >>> df
          col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       0       0       0       0       0

    >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
        col1    col2
    0    a       b
    1    b       a
    2    d       e
    """
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        raise TypeError(
            "Expected 'data' to be a 'DataFrame'; "
            f"Received 'data' of type: {type(data).__name__}"
        )

    # Compute the per-column NA indicator once; it drives both the check and
    # the error message (previously recomputed inside the f-string).
    col_isna_mask = data.isna().any()
    if col_isna_mask.any():
        raise ValueError(
            "Dummy DataFrame contains NA value in column: "
            f"'{col_isna_mask.idxmax()}'"
        )

    # Cast to the nullable boolean dtype; anything that cannot be interpreted
    # as 0/1/bool is rejected here. Chain the original error for debuggability.
    try:
        data_to_decode = data.astype("boolean", copy=False)
    except TypeError as err:
        raise TypeError("Passed DataFrame contains non-dummy data") from err

    # Collect prefixes and the list of dummy columns belonging to each one.
    variables_slice = defaultdict(list)
    if sep is None:
        # No separator: the whole frame is a single unnamed variable.
        variables_slice[""] = list(data.columns)
    elif isinstance(sep, str):
        for col in data_to_decode.columns:
            prefix = col.split(sep)[0]
            if len(prefix) == len(col):
                raise ValueError(f"Separator not specified for column: {col}")
            variables_slice[prefix].append(col)
    else:
        raise TypeError(
            "Expected 'sep' to be of type 'str' or 'None'; "
            f"Received 'sep' of type: {type(sep).__name__}"
        )

    if default_category is not None:
        if isinstance(default_category, dict):
            # A dict must name a default for every prefix.
            if not len(default_category) == len(variables_slice):
                len_msg = (
                    f"Length of 'default_category' ({len(default_category)}) "
                    f"did not match the length of the columns being encoded "
                    f"({len(variables_slice)})"
                )
                raise ValueError(len_msg)
        elif isinstance(default_category, Hashable):
            # Scalar default: apply the same category to every prefix.
            default_category = dict(
                zip(variables_slice, [default_category] * len(variables_slice))
            )
        else:
            raise TypeError(
                "Expected 'default_category' to be of type "
                "'None', 'Hashable', or 'dict'; "
                "Received 'default_category' of type: "
                f"{type(default_category).__name__}"
            )

    cat_data = {}
    for prefix, prefix_slice in variables_slice.items():
        # Category labels are the column names with the prefix+sep stripped.
        if sep is None:
            cats = prefix_slice.copy()
        else:
            cats = [col[len(prefix + sep) :] for col in prefix_slice]
        # Each row must have exactly one dummy set per variable.
        assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1)
        if any(assigned > 1):
            raise ValueError(
                "Dummy DataFrame contains multi-assignment(s); "
                f"First instance in row: {assigned.idxmax()}"
            )
        if any(assigned == 0):
            if isinstance(default_category, dict):
                # Rows with no dummy set decode to the default category:
                # append it as an extra "virtual" indicator column.
                cats.append(default_category[prefix])
            else:
                raise ValueError(
                    "Dummy DataFrame contains unassigned value(s); "
                    f"First instance in row: {assigned.idxmin()}"
                )
            data_slice = concat(
                (data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1
            )
        else:
            data_slice = data_to_decode.loc[:, prefix_slice]
        cats_array = np.array(cats, dtype="object")
        # get indices of True entries along axis=1
        cat_data[prefix] = cats_array[data_slice.to_numpy().nonzero()[1]]

    return DataFrame(cat_data)