1from __future__ import annotations
2
3from collections import defaultdict
4import itertools
5from typing import (
6 Hashable,
7 Iterable,
8)
9
10import numpy as np
11
12from pandas._libs.sparse import IntIndex
13from pandas._typing import NpDtype
14
15from pandas.core.dtypes.common import (
16 is_integer_dtype,
17 is_list_like,
18 is_object_dtype,
19 pandas_dtype,
20)
21
22from pandas.core.arrays import SparseArray
23from pandas.core.arrays.categorical import factorize_from_iterable
24from pandas.core.frame import DataFrame
25from pandas.core.indexes.api import (
26 Index,
27 default_index,
28)
29from pandas.core.series import Series
30
31
def get_dummies(
    data,
    prefix=None,
    prefix_sep: str | Iterable[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    columns=None,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: NpDtype | None = None,
) -> DataFrame:
    """
    Convert categorical variable into dummy/indicator variables.

    Each distinct value becomes its own 0/1 indicator column. Output columns
    are named after the values; for DataFrame input, the originating column
    name is prepended to each value.

    Parameters
    ----------
    data : array-like, Series, or DataFrame
        Data of which to get dummy indicators.
    prefix : str, list of str, or dict of str, default None
        String to append DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : str, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object`, `string`, or `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.
    dtype : dtype, default bool
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame
        Dummy-coded data. If `data` contains other columns than the
        dummy-coded one(s), these will be prepended, unaltered, to the result.

    See Also
    --------
    Series.str.get_dummies : Convert Series of strings to dummy codes.
    :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.dummies>` for more examples.

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> pd.get_dummies(s)
           a      b      c
    0   True  False  False
    1  False   True  False
    2  False  False   True
    3   True  False  False

    >>> s1 = ['a', 'b', np.nan]

    >>> pd.get_dummies(s1)
           a      b
    0   True  False
    1  False   True
    2  False  False

    >>> pd.get_dummies(s1, dummy_na=True)
           a      b    NaN
    0   True  False  False
    1  False   True  False
    2  False  False   True

    >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]})

    >>> pd.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1    True   False   False    True   False
    1  2   False    True    True   False   False
    2  3    True   False   False   False    True

    >>> pd.get_dummies(pd.Series(list('abcaa')))
           a      b      c
    0   True  False  False
    1  False   True  False
    2  False  False   True
    3   True  False  False
    4   True  False  False

    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
           b      c
    0  False  False
    1   True  False
    2  False   True
    3  False  False
    4  False  False

    >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
         a    b    c
    0  1.0  0.0  0.0
    1  0.0  1.0  0.0
    2  0.0  0.0  1.0
    """
    from pandas.core.reshape.concat import concat

    dtypes_to_encode = ["object", "string", "category"]

    if not isinstance(data, DataFrame):
        # 1-D input (Series, list, ndarray, ...): encode it directly.
        return _get_dummies_1d(
            data,
            prefix,
            prefix_sep,
            dummy_na,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
        )

    # Determine which columns get encoded.
    if columns is None:
        data_to_encode = data.select_dtypes(include=dtypes_to_encode)
    elif not is_list_like(columns):
        raise TypeError("Input must be a list-like for parameter `columns`")
    else:
        data_to_encode = data[columns]

    def check_len(item, name) -> None:
        # A list-like prefix/separator must supply one entry per encoded
        # column; otherwise columns would silently be dropped by zip().
        if is_list_like(item) and len(item) != data_to_encode.shape[1]:
            raise ValueError(
                f"Length of '{name}' ({len(item)}) did not match the "
                "length of the columns being encoded "
                f"({data_to_encode.shape[1]})."
            )

    check_len(prefix, "prefix")
    check_len(prefix_sep, "prefix_sep")

    def per_column(spec, default):
        # Expand a str/dict/None spec into one value per encoded column.
        # A missing dict key raises KeyError, matching the documented contract.
        if isinstance(spec, str):
            return itertools.cycle([spec])
        if isinstance(spec, dict):
            return [spec[col] for col in data_to_encode.columns]
        return default if spec is None else spec

    prefixes = per_column(prefix, data_to_encode.columns)
    separators = per_column(prefix_sep, prefix_sep)

    # Columns that are not being encoded are prepended, unaltered.
    if data_to_encode.shape == data.shape:
        # Entire frame is being encoded: nothing to prepend.
        pieces: list[DataFrame] = []
    elif columns is not None:
        # Explicit column list: everything else is passed through.
        pieces = [data.drop(columns, axis=1)]
    else:
        # Dtype-based selection: pass through the non-encodable dtypes.
        pieces = [data.select_dtypes(exclude=dtypes_to_encode)]

    for (_, column), pre, sep in zip(data_to_encode.items(), prefixes, separators):
        pieces.append(
            _get_dummies_1d(
                column,
                prefix=pre,
                prefix_sep=sep,
                dummy_na=dummy_na,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            )
        )
    return concat(pieces, axis=1)
225
226
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep: str | Iterable[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: NpDtype | None = None,
) -> DataFrame:
    """
    Encode a single 1-D array/Series of categoricals as dummy columns.

    Backend for :func:`get_dummies`; handles one column at a time.

    Parameters
    ----------
    data : array-like or Series
        Values to encode; factorized into integer codes and levels.
    prefix : str or None
        If not None, output columns are named ``f"{prefix}{prefix_sep}{level}"``;
        otherwise the levels themselves are used as column labels.
    prefix_sep : str, default "_"
        Separator between prefix and level in output column names.
    dummy_na : bool, default False
        If True, add an extra column for missing values.
    sparse : bool, default False
        If True, back each dummy column with a SparseArray.
    drop_first : bool, default False
        If True, drop the first level's column (k-1 dummies out of k levels).
    dtype : dtype, optional
        Dtype for the dummy columns; defaults to bool. ``object`` is rejected.

    Returns
    -------
    DataFrame
        One indicator column per (kept) level, indexed like ``data`` when it
        is a Series.
    """
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data, copy=False))

    if dtype is None:
        # Default output dtype for the indicator columns.
        dtype = np.dtype(bool)
    _dtype = pandas_dtype(dtype)

    if is_object_dtype(_dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        # Column-less frame that preserves the input's index (or a default
        # RangeIndex for non-Series input).
        index: Index | np.ndarray
        if isinstance(data, Series):
            index = data.index
        else:
            index = default_index(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    # Copy before mutating: factorize may hand back a shared buffer.
    codes = codes.copy()
    if dummy_na:
        # Remap missing values (code -1) onto a synthetic trailing NaN level.
        codes[codes == -1] = len(levels)
        levels = levels.insert(len(levels), np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Index | None
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        # Sparse path: build one SparseArray per level from the row positions
        # at which that level occurs.
        fill_value: bool | float
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == np.dtype(bool):
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
        # Rows still coded -1 here are missing values with dummy_na=False:
        # they get no indicator at all.
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        # Bucket row positions by level code.
        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col, copy=False))

        return concat(sparse_series, axis=1, copy=False)

    else:
        # Dense path: selecting identity-matrix columns by code yields the
        # indicator matrix in one vectorized step.
        # take on axis=1 + transpose to ensure ndarray layout is column-major
        eye_dtype: NpDtype
        if isinstance(_dtype, np.dtype):
            eye_dtype = _dtype
        else:
            # Extension dtype: build as bool here, cast via the DataFrame
            # constructor below.
            eye_dtype = np.bool_
        dummy_mat = np.eye(number_of_cols, dtype=eye_dtype).take(codes, axis=1).T

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype)
335
336
def from_dummies(
    data: DataFrame,
    sep: None | str = None,
    default_category: None | Hashable | dict[str, Hashable] = None,
) -> DataFrame:
    """
    Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.

    Inverts the operation performed by :func:`~pandas.get_dummies`.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : DataFrame
        Data which contains dummy-coded variables in form of integer columns of
        1's and 0's.
    sep : str, default None
        Separator used in the column names of the dummy categories; the
        character(s) separating the categorical names from the prefixes.
        For example, if your column names are 'prefix_A' and 'prefix_B',
        you can strip the underscore by specifying sep='_'.
    default_category : None, Hashable or dict of Hashables, default None
        The default category is the implied category when a value has none of the
        listed categories specified with a one, i.e. if all dummies in a row are
        zero. Can be a single value for all variables or a dict directly mapping
        the default categories to a prefix of a variable.

    Returns
    -------
    DataFrame
        Categorical data decoded from the dummy input-data.

    Raises
    ------
    ValueError
        * When the input ``DataFrame`` ``data`` contains NA values.
        * When the input ``DataFrame`` ``data`` contains column names with separators
          that do not match the separator specified with ``sep``.
        * When a ``dict`` passed to ``default_category`` does not include an implied
          category for each prefix.
        * When a value in ``data`` has more than one category assigned to it.
        * When ``default_category=None`` and a value in ``data`` has no category
          assigned to it.
    TypeError
        * When the input ``data`` is not of type ``DataFrame``.
        * When the input ``DataFrame`` ``data`` contains non-dummy data.
        * When the passed ``sep`` is of a wrong data type.
        * When the passed ``default_category`` is of a wrong data type.

    See Also
    --------
    :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
    :class:`~pandas.Categorical` : Represent a categorical variable in classic
        R / S-plus fashion.

    Notes
    -----
    The columns of the passed dummy data should only include 1's and 0's,
    or boolean values.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
    ...                    "c": [0, 0, 1, 0]})

    >>> df
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> pd.from_dummies(df)
    0     a
    1     b
    2     c
    3     a

    >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 1]})

    >>> df
          col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       1       0       0       0       1

    >>> pd.from_dummies(df, sep="_")
        col1    col2
    0    a       b
    1    b       a
    2    a       c

    >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 0]})

    >>> df
          col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       0       0       0       0       0

    >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
        col1    col2
    0    a       b
    1    b       a
    2    d       e
    """
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        raise TypeError(
            "Expected 'data' to be a 'DataFrame'; "
            f"Received 'data' of type: {type(data).__name__}"
        )

    # Compute the per-column NA indicator once; it drives both the check and
    # the error message (previously recomputed inside the f-string).
    col_isna_mask = data.isna().any()
    if col_isna_mask.any():
        raise ValueError(
            "Dummy DataFrame contains NA value in column: "
            f"'{col_isna_mask.idxmax()}'"
        )

    # Cast to the nullable boolean dtype; anything that cannot be interpreted
    # as 0/1/bool is rejected here. Chain the original error for debuggability.
    try:
        data_to_decode = data.astype("boolean", copy=False)
    except TypeError as err:
        raise TypeError("Passed DataFrame contains non-dummy data") from err

    # Collect prefixes and the list of dummy columns belonging to each one.
    variables_slice = defaultdict(list)
    if sep is None:
        # No separator: the whole frame is a single unnamed variable.
        variables_slice[""] = list(data.columns)
    elif isinstance(sep, str):
        for col in data_to_decode.columns:
            prefix = col.split(sep)[0]
            if len(prefix) == len(col):
                raise ValueError(f"Separator not specified for column: {col}")
            variables_slice[prefix].append(col)
    else:
        raise TypeError(
            "Expected 'sep' to be of type 'str' or 'None'; "
            f"Received 'sep' of type: {type(sep).__name__}"
        )

    if default_category is not None:
        if isinstance(default_category, dict):
            # A dict must name a default for every prefix.
            if not len(default_category) == len(variables_slice):
                len_msg = (
                    f"Length of 'default_category' ({len(default_category)}) "
                    f"did not match the length of the columns being encoded "
                    f"({len(variables_slice)})"
                )
                raise ValueError(len_msg)
        elif isinstance(default_category, Hashable):
            # Scalar default: apply the same category to every prefix.
            default_category = dict(
                zip(variables_slice, [default_category] * len(variables_slice))
            )
        else:
            raise TypeError(
                "Expected 'default_category' to be of type "
                "'None', 'Hashable', or 'dict'; "
                "Received 'default_category' of type: "
                f"{type(default_category).__name__}"
            )

    cat_data = {}
    for prefix, prefix_slice in variables_slice.items():
        # Category labels are the column names with the prefix+sep stripped.
        if sep is None:
            cats = prefix_slice.copy()
        else:
            cats = [col[len(prefix + sep) :] for col in prefix_slice]
        # Each row must have exactly one dummy set per variable.
        assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1)
        if any(assigned > 1):
            raise ValueError(
                "Dummy DataFrame contains multi-assignment(s); "
                f"First instance in row: {assigned.idxmax()}"
            )
        if any(assigned == 0):
            if isinstance(default_category, dict):
                # Rows with no dummy set decode to the default category:
                # append it as an extra "virtual" indicator column.
                cats.append(default_category[prefix])
            else:
                raise ValueError(
                    "Dummy DataFrame contains unassigned value(s); "
                    f"First instance in row: {assigned.idxmin()}"
                )
            data_slice = concat(
                (data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1
            )
        else:
            data_slice = data_to_decode.loc[:, prefix_slice]
        cats_array = np.array(cats, dtype="object")
        # get indices of True entries along axis=1
        cat_data[prefix] = cats_array[data_slice.to_numpy().nonzero()[1]]

    return DataFrame(cat_data)