1from __future__ import annotations
2
3from collections import defaultdict
4from collections.abc import (
5 Hashable,
6 Iterable,
7)
8import itertools
9from typing import (
10 TYPE_CHECKING,
11 cast,
12)
13
14import numpy as np
15
16from pandas._libs.sparse import IntIndex
17
18from pandas.core.dtypes.common import (
19 is_integer_dtype,
20 is_list_like,
21 is_object_dtype,
22 pandas_dtype,
23)
24from pandas.core.dtypes.dtypes import (
25 ArrowDtype,
26 CategoricalDtype,
27)
28
29from pandas.core.arrays import SparseArray
30from pandas.core.arrays.categorical import factorize_from_iterable
31from pandas.core.arrays.string_ import StringDtype
32from pandas.core.frame import DataFrame
33from pandas.core.indexes.api import (
34 Index,
35 default_index,
36)
37from pandas.core.series import Series
38
39if TYPE_CHECKING:
40 from pandas._typing import NpDtype
41
42
def get_dummies(
    data,
    prefix=None,
    prefix_sep: str | Iterable[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    columns=None,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: NpDtype | None = None,
) -> DataFrame:
    """
    Convert categorical variable into dummy/indicator variables.

    Each variable is converted in as many 0/1 variables as there are different
    values. Columns in the output are each named after a value; if the input is
    a DataFrame, the name of the original variable is prepended to the value.

    Parameters
    ----------
    data : array-like, Series, or DataFrame
        Data of which to get dummy indicators.
    prefix : str, list of str, or dict of str, default None
        String to append DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : str, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object`, `string`, or `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.
    dtype : dtype, default bool
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame
        Dummy-coded data. If `data` contains other columns than the
        dummy-coded one(s), these will be prepended, unaltered, to the result.

    See Also
    --------
    Series.str.get_dummies : Convert Series of strings to dummy codes.
    :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.dummies>` for more examples.

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> pd.get_dummies(s)
           a      b      c
    0   True  False  False
    1  False   True  False
    2  False  False   True
    3   True  False  False

    >>> s1 = ['a', 'b', np.nan]

    >>> pd.get_dummies(s1)
           a      b
    0   True  False
    1  False   True
    2  False  False

    >>> pd.get_dummies(s1, dummy_na=True)
           a      b    NaN
    0   True  False  False
    1  False   True  False
    2  False  False   True

    >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]})

    >>> pd.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1    True   False   False    True   False
    1  2   False    True    True   False   False
    2  3    True   False   False   False    True

    >>> pd.get_dummies(pd.Series(list('abcaa')))
           a      b      c
    0   True  False  False
    1  False   True  False
    2  False  False   True
    3   True  False  False
    4   True  False  False

    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
           b      c
    0  False  False
    1   True  False
    2  False   True
    3  False  False
    4  False  False

    >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
         a    b    c
    0  1.0  0.0  0.0
    1  0.0  1.0  0.0
    2  0.0  0.0  1.0
    """
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        # 1-D input: encode directly, no per-column prefix bookkeeping needed
        return _get_dummies_1d(
            data,
            prefix,
            prefix_sep,
            dummy_na,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
        )

    dtypes_to_encode = ["object", "string", "category"]

    # determine which columns get encoded
    if columns is None:
        data_to_encode = data.select_dtypes(include=dtypes_to_encode)
    elif not is_list_like(columns):
        raise TypeError("Input must be a list-like for parameter `columns`")
    else:
        data_to_encode = data[columns]

    def _validate_length(item, name: str) -> None:
        # a list-like prefix/prefix_sep must supply one entry per encoded
        # column, otherwise columns would be silently dropped
        if is_list_like(item) and len(item) != data_to_encode.shape[1]:
            raise ValueError(
                f"Length of '{name}' ({len(item)}) did not match the "
                "length of the columns being encoded "
                f"({data_to_encode.shape[1]})."
            )

    _validate_length(prefix, "prefix")
    _validate_length(prefix_sep, "prefix_sep")

    # normalize prefix / prefix_sep into one-value-per-column iterables
    if isinstance(prefix, str):
        prefix = itertools.cycle([prefix])
    elif isinstance(prefix, dict):
        prefix = [prefix[col] for col in data_to_encode.columns]
    elif prefix is None:
        prefix = data_to_encode.columns

    if isinstance(prefix_sep, str):
        prefix_sep = itertools.cycle([prefix_sep])
    elif isinstance(prefix_sep, dict):
        prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]

    pieces: list[DataFrame]
    if data_to_encode.shape == data.shape:
        # encoding the entire df, do not prepend any dropped columns
        pieces = []
    elif columns is not None:
        # keep the columns the caller did not ask to encode
        pieces = [data.drop(columns, axis=1)]
    else:
        # keep every column whose dtype is not subject to encoding
        pieces = [data.select_dtypes(exclude=dtypes_to_encode)]

    for (_, column), pre, sep in zip(data_to_encode.items(), prefix, prefix_sep):
        pieces.append(
            _get_dummies_1d(
                column,
                prefix=pre,
                prefix_sep=sep,
                dummy_na=dummy_na,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            )
        )
    return concat(pieces, axis=1)
236
237
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep: str | Iterable[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: NpDtype | None = None,
) -> DataFrame:
    """
    Encode a single 1-D array-like as a DataFrame of dummy columns.

    Backend for :func:`get_dummies`; here ``prefix`` and ``prefix_sep`` are
    single values applying to this one column only.

    Parameters
    ----------
    data : array-like or Series
        Values to encode; factorized to determine the levels.
    prefix : str or None
        If not None, each output column is named
        ``f"{prefix}{prefix_sep}{level}"``; otherwise the levels themselves
        are used as column labels.
    prefix_sep : str, default "_"
        Separator placed between ``prefix`` and the level name.
    dummy_na : bool, default False
        If True, append an extra column representing NaN values.
    sparse : bool, default False
        If True, back each output column with a :class:`SparseArray`.
    drop_first : bool, default False
        If True, drop the column for the first level.
    dtype : dtype, optional
        Output dtype; when None it is inferred from ``data.dtype``
        (falling back to ``bool``).

    Returns
    -------
    DataFrame
        One column per (kept) level; index taken from ``data`` when it is a
        Series.
    """
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data, copy=False))

    # Infer an output dtype from the input when none was given:
    # Arrow-backed input -> Arrow bool; StringDtype with a storage other than
    # "pyarrow_numpy" -> nullable "boolean"; anything else -> numpy bool.
    if dtype is None and hasattr(data, "dtype"):
        input_dtype = data.dtype
        if isinstance(input_dtype, CategoricalDtype):
            # categorical: base the inference on the categories' dtype
            input_dtype = input_dtype.categories.dtype

        if isinstance(input_dtype, ArrowDtype):
            import pyarrow as pa

            dtype = ArrowDtype(pa.bool_())  # type: ignore[assignment]
        elif (
            isinstance(input_dtype, StringDtype)
            and input_dtype.storage != "pyarrow_numpy"
        ):
            dtype = pandas_dtype("boolean")  # type: ignore[assignment]
        else:
            dtype = np.dtype(bool)
    elif dtype is None:
        # input has no dtype attribute (e.g. plain list): default to bool
        dtype = np.dtype(bool)

    _dtype = pandas_dtype(dtype)

    if is_object_dtype(_dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        # zero-column frame that preserves the input's index (or a default
        # RangeIndex for non-Series input)
        index: Index | np.ndarray
        if isinstance(data, Series):
            index = data.index
        else:
            index = default_index(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        # map missing codes (-1) onto a synthetic NaN level appended at the end
        codes[codes == -1] = len(levels)
        levels = levels.insert(len(levels), np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Index | None
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        # Sparse path: build one SparseArray-backed Series per level, storing
        # only the row positions where that level occurs.
        fill_value: bool | float
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == np.dtype(bool):
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        # bucket row positions by level code
        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col, copy=False))

        return concat(sparse_series, axis=1, copy=False)

    else:
        # ensure ndarray layout is column-major
        shape = len(codes), number_of_cols
        dummy_dtype: NpDtype
        if isinstance(_dtype, np.dtype):
            dummy_dtype = _dtype
        else:
            # extension dtypes: build as bool and convert via the DataFrame
            # constructor below
            dummy_dtype = np.bool_
        dummy_mat = np.zeros(shape=shape, dtype=dummy_dtype, order="F")
        # one-hot assignment: set the matching level column for every row
        dummy_mat[np.arange(len(codes)), codes] = 1

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype)
365
366
def from_dummies(
    data: DataFrame,
    sep: None | str = None,
    default_category: None | Hashable | dict[str, Hashable] = None,
) -> DataFrame:
    """
    Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.

    Inverts the operation performed by :func:`~pandas.get_dummies`.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : DataFrame
        Data which contains dummy-coded variables in form of integer columns of
        1's and 0's.
    sep : str, default None
        Separator used in the column names of the dummy categories; it is the
        character separating the category names from the prefixes.
        For example, if your column names are 'prefix_A' and 'prefix_B',
        you can strip the underscore by specifying sep='_'.
    default_category : None, Hashable or dict of Hashables, default None
        The default category is the implied category when a value has none of the
        listed categories specified with a one, i.e. if all dummies in a row are
        zero. Can be a single value for all variables or a dict directly mapping
        the default categories to a prefix of a variable.

    Returns
    -------
    DataFrame
        Categorical data decoded from the dummy input-data.

    Raises
    ------
    ValueError
        * When the input ``DataFrame`` ``data`` contains NA values.
        * When the input ``DataFrame`` ``data`` contains column names with separators
          that do not match the separator specified with ``sep``.
        * When a ``dict`` passed to ``default_category`` does not include an implied
          category for each prefix.
        * When a value in ``data`` has more than one category assigned to it.
        * When ``default_category=None`` and a value in ``data`` has no category
          assigned to it.
    TypeError
        * When the input ``data`` is not of type ``DataFrame``.
        * When the input ``DataFrame`` ``data`` contains non-dummy data.
        * When the passed ``sep`` is of a wrong data type.
        * When the passed ``default_category`` is of a wrong data type.

    See Also
    --------
    :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
    :class:`~pandas.Categorical` : Represent a categorical variable in classic
        R / S-plus fashion.

    Notes
    -----
    The columns of the passed dummy data should only include 1's and 0's,
    or boolean values.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
    ...                    "c": [0, 0, 1, 0]})

    >>> df
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> pd.from_dummies(df)
    0     a
    1     b
    2     c
    3     a

    >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 1]})

    >>> df
       col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       1       0       0       0       1

    >>> pd.from_dummies(df, sep="_")
      col1 col2
    0    a    b
    1    b    a
    2    a    c

    >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 0]})

    >>> df
       col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       0       0       0       0       0

    >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
      col1 col2
    0    a    b
    1    b    a
    2    d    e
    """
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        raise TypeError(
            "Expected 'data' to be a 'DataFrame'; "
            f"Received 'data' of type: {type(data).__name__}"
        )

    # per-column mask: True where a column contains at least one NA
    col_isna_mask = cast(Series, data.isna().any())

    if col_isna_mask.any():
        raise ValueError(
            "Dummy DataFrame contains NA value in column: "
            f"'{col_isna_mask.idxmax()}'"
        )

    # index data with a list of all columns that are dummies
    try:
        data_to_decode = data.astype("boolean", copy=False)
    except TypeError:
        raise TypeError("Passed DataFrame contains non-dummy data")

    # collect prefixes and get lists to slice data for each prefix
    variables_slice = defaultdict(list)
    if sep is None:
        # no separator: the whole frame is one variable with an empty prefix
        variables_slice[""] = list(data.columns)
    elif isinstance(sep, str):
        for col in data_to_decode.columns:
            prefix = col.split(sep)[0]
            # a column name without the separator is malformed for this sep
            if len(prefix) == len(col):
                raise ValueError(f"Separator not specified for column: {col}")
            variables_slice[prefix].append(col)
    else:
        raise TypeError(
            "Expected 'sep' to be of type 'str' or 'None'; "
            f"Received 'sep' of type: {type(sep).__name__}"
        )

    if default_category is not None:
        if isinstance(default_category, dict):
            # a dict default must name exactly one category per prefix
            if not len(default_category) == len(variables_slice):
                len_msg = (
                    f"Length of 'default_category' ({len(default_category)}) "
                    f"did not match the length of the columns being encoded "
                    f"({len(variables_slice)})"
                )
                raise ValueError(len_msg)
        elif isinstance(default_category, Hashable):
            # broadcast a single default value to every prefix
            default_category = dict(
                zip(variables_slice, [default_category] * len(variables_slice))
            )
        else:
            raise TypeError(
                "Expected 'default_category' to be of type "
                "'None', 'Hashable', or 'dict'; "
                "Received 'default_category' of type: "
                f"{type(default_category).__name__}"
            )

    cat_data = {}
    for prefix, prefix_slice in variables_slice.items():
        # recover category names by stripping "<prefix><sep>" when sep is used
        if sep is None:
            cats = prefix_slice.copy()
        else:
            cats = [col[len(prefix + sep) :] for col in prefix_slice]
        # per-row count of set dummies for this variable; must be exactly one
        # (or zero if a default category was supplied)
        assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1)
        if any(assigned > 1):
            raise ValueError(
                "Dummy DataFrame contains multi-assignment(s); "
                f"First instance in row: {assigned.idxmax()}"
            )
        if any(assigned == 0):
            if isinstance(default_category, dict):
                # unassigned rows map to the default category; represent it as
                # an extra (virtual) column appended to the slice below
                cats.append(default_category[prefix])
            else:
                raise ValueError(
                    "Dummy DataFrame contains unassigned value(s); "
                    f"First instance in row: {assigned.idxmin()}"
                )
            data_slice = concat(
                (data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1
            )
        else:
            data_slice = data_to_decode.loc[:, prefix_slice]
        cats_array = data._constructor_sliced(cats, dtype=data.columns.dtype)
        # get indices of True entries along axis=1
        true_values = data_slice.idxmax(axis=1)
        indexer = data_slice.columns.get_indexer_for(true_values)
        cat_data[prefix] = cats_array.take(indexer).set_axis(data.index)

    result = DataFrame(cat_data)
    if sep is not None:
        result.columns = result.columns.astype(data.columns.dtype)
    return result