1from __future__ import annotations
2
3import re
4from typing import (
5 TYPE_CHECKING,
6 Hashable,
7)
8
9import numpy as np
10
11from pandas.util._decorators import Appender
12
13from pandas.core.dtypes.common import (
14 is_extension_array_dtype,
15 is_list_like,
16)
17from pandas.core.dtypes.concat import concat_compat
18from pandas.core.dtypes.missing import notna
19
20import pandas.core.algorithms as algos
21from pandas.core.arrays import Categorical
22import pandas.core.common as com
23from pandas.core.indexes.api import (
24 Index,
25 MultiIndex,
26)
27from pandas.core.reshape.concat import concat
28from pandas.core.reshape.util import tile_compat
29from pandas.core.shared_docs import _shared_docs
30from pandas.core.tools.numeric import to_numeric
31
32if TYPE_CHECKING:
33 from pandas._typing import AnyArrayLike
34
35 from pandas import DataFrame
36
37
@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name: Hashable = "value",
    col_level=None,
    ignore_index: bool = True,
) -> DataFrame:
    # Unpivot `frame` from wide to long format: `id_vars` columns are repeated
    # once per melted column, while the remaining (or explicitly listed
    # `value_vars`) columns are stacked into a single `value_name` column with
    # their labels recorded under `var_name`.  The public docstring is attached
    # by the @Appender decorator from _shared_docs["melt"].

    # If multiindex, gather names of columns on all level for checking presence
    # of `id_vars` and `value_vars`
    if isinstance(frame.columns, MultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    # The melted-values column label must not collide with an existing column.
    if value_name in frame.columns:
        raise ValueError(
            f"value_name ({value_name}) cannot match an element in "
            "the DataFrame columns."
        )

    if id_vars is not None:
        if not is_list_like(id_vars):
            # Scalar id_vars: wrap in a list.
            id_vars = [id_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'id_vars' are not present "
                    f"in the DataFrame: {list(missing)}"
                )
    else:
        id_vars = []

    if value_vars is not None:
        if not is_list_like(value_vars):
            # Scalar value_vars: wrap in a list.
            value_vars = [value_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'value_vars' are not present in "
                    f"the DataFrame: {list(missing)}"
                )
        # Restrict (and order) the frame to just the id/value columns.
        if col_level is not None:
            idx = frame.columns.get_level_values(col_level).get_indexer(
                id_vars + value_vars
            )
        else:
            idx = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars))
        frame = frame.iloc[:, idx]
    else:
        # No explicit value_vars: everything melts.  Copy so the pop() calls
        # below do not mutate the caller's frame.
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    # Derive default variable-column name(s): one per column level, falling
    # back to "variable"/"variable_<i>" when level names are absent or clash.
    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    # N rows; K columns to be melted (total columns minus the id columns).
    N, K = frame.shape
    K -= len(id_vars)

    mdata: dict[Hashable, AnyArrayLike] = {}
    # Each id column is repeated K times (once per melted column).  pop()
    # removes it from `frame`, so only value columns remain afterwards.
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            if K > 0:
                id_data = concat([id_data] * K, ignore_index=True)
            else:
                # We can't concat empty list. (GH 46044)
                id_data = type(id_data)([], name=id_data.name, dtype=id_data.dtype)
        else:
            # error: Incompatible types in assignment (expression has type
            # "ndarray[Any, dtype[Any]]", variable has type "Series")
            id_data = np.tile(id_data._values, K)  # type: ignore[assignment]
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    # Stack all remaining (value) columns end-to-end into a single column.
    if frame.shape[1] > 0:
        mdata[value_name] = concat(
            [frame.iloc[:, i] for i in range(frame.shape[1])]
        ).values
    else:
        # Zero value columns: produce an empty values array of the right dtype.
        mdata[value_name] = frame._values.ravel("F")
    # One "variable" column per column level; each label is repeated N times
    # to line up with the stacked values.
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(frame.columns._get_level_values(i)).repeat(N)

    result = frame._constructor(mdata, columns=mcolumns)

    if not ignore_index:
        # Preserve original row labels, tiled once per melted column.
        result.index = tile_compat(frame.index, K)

    return result
159
160
def lreshape(data: DataFrame, groups, dropna: bool = True) -> DataFrame:
    """
    Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.

    Accepts a dictionary, ``groups``, in which each key is a new column name
    and each value is a list of old column names that will be "melted" under
    the new column name as part of the reshape.

    Parameters
    ----------
    data : DataFrame
        The wide-format DataFrame.
    groups : dict
        {new_name : list_of_columns}.
    dropna : bool, default True
        Do not include columns whose entries are all NaN.

    Returns
    -------
    DataFrame
        Reshaped DataFrame.

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    pivot : Create a spreadsheet-style pivot table as a DataFrame.
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.
    wide_to_long : Wide panel to long format. Less flexible but more
        user-friendly than melt.

    Examples
    --------
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2007], 'year2': [2008, 2008]})
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2007   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team  year   hr
    0  Red Sox  2007  514
    1  Yankees  2007  573
    2  Red Sox  2008  545
    3  Yankees  2008  526
    """
    # Accept either a mapping or an iterable of (new_name, columns) pairs.
    if isinstance(groups, dict):
        keys, values = list(groups.keys()), list(groups.values())
    else:
        keys, values = zip(*groups)

    # Columns not claimed by any group pass through as identifier columns.
    melted_cols = list(set.union(*(set(group) for group in values)))
    id_cols = list(data.columns.difference(melted_cols))

    # Every group must contribute the same number of source columns.
    group_len = len(values[0])
    if any(len(group) != group_len for group in values):
        raise ValueError("All column lists must be same length")

    mdata = {}
    pivot_cols = []

    # Stack each group's source columns end-to-end under its new name.
    for new_name, old_names in zip(keys, values):
        stacked = [data[old]._values for old in old_names]
        mdata[new_name] = concat_compat(stacked)
        pivot_cols.append(new_name)

    # Identifier columns repeat once per source column in a group.
    for col in id_cols:
        mdata[col] = np.tile(data[col]._values, group_len)

    if dropna:
        # Keep only rows where every melted column is non-missing.
        keep = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for col in pivot_cols:
            keep &= notna(mdata[col])
        if not keep.all():
            mdata = {name: arr[keep] for name, arr in mdata.items()}

    return data._constructor(mdata, columns=id_cols + pivot_cols)
249
250
def wide_to_long(
    df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
) -> DataFrame:
    r"""
    Unpivot a DataFrame from wide to long format.

    Less flexible but more user-friendly than melt.

    With stubnames ['A', 'B'], this function expects to find one or more
    group of columns with format
    A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
    You specify what you want to call this suffix in the resulting long format
    with `j` (for example `j='year'`)

    Each row of these wide variables are assumed to be uniquely identified by
    `i` (can be a single column name or a list of column names)

    All remaining variables in the data frame are left intact.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame.
    stubnames : str or list-like
        The stub name(s). The wide format variables are assumed to
        start with the stub names.
    i : str or list-like
        Column(s) to use as id variable(s).
    j : str
        The name of the sub-observation variable. What you wish to name your
        suffix in the long format.
    sep : str, default ""
        A character indicating the separation of the variable names
        in the wide format, to be stripped from the names in the long format.
        For example, if your column names are A-suffix1, A-suffix2, you
        can strip the hyphen by specifying `sep='-'`.
    suffix : str, default '\\d+'
        A regular expression capturing the wanted suffixes. '\\d+' captures
        numeric suffixes. Suffixes with no numbers could be specified with the
        negated character class '\\D+'. You can also further disambiguate
        suffixes, for example, if your wide variables are of the form A-one,
        B-two,.., and you have an unrelated column A-rating, you can ignore the
        last one by specifying `suffix='(!?one|two)'`. When all suffixes are
        numeric, they are cast to int64/float64.

    Returns
    -------
    DataFrame
        A DataFrame that contains each stub name as a variable, with new index
        (i, j).

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    pivot : Create a spreadsheet-style pivot table as a DataFrame.
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.

    Notes
    -----
    All extra variables are left untouched. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.

    Examples
    --------
    >>> np.random.seed(123)
    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
    ...                    "X"     : dict(zip(range(3), np.random.randn(3)))
    ...                   })
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
    ... # doctest: +NORMALIZE_WHITESPACE
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    With multiple id columns

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9
    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     1    2.8
                2    3.4
          2     1    2.9
                2    3.8
          3     1    2.2
                2    2.9
    2     1     1    2.0
                2    3.2
          2     1    1.8
                2    2.8
          3     1    1.9
                2    2.4
    3     1     1    2.2
                2    3.3
          2     1    2.3
                2    3.4
          3     1    2.1
                2    2.9

    Going from long back to wide just takes some creative use of `unstack`

    >>> w = l.unstack()
    >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
    >>> w.reset_index()
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9

    Less wieldy column names are also handled

    >>> np.random.seed(0)
    >>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3),
    ...                    'A(weekly)-2011': np.random.rand(3),
    ...                    'B(weekly)-2010': np.random.rand(3),
    ...                    'B(weekly)-2011': np.random.rand(3),
    ...                    'X' : np.random.randint(3, size=3)})
    >>> df['id'] = df.index
    >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
       A(weekly)-2010  A(weekly)-2011  B(weekly)-2010  B(weekly)-2011  X  id
    0        0.548814        0.544883        0.437587        0.383442  0   0
    1        0.715189        0.423655        0.891773        0.791725  1   1
    2        0.602763        0.645894        0.963663        0.528895  1   2

    >>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id',
    ...                 j='year', sep='-')
    ... # doctest: +NORMALIZE_WHITESPACE
             X  A(weekly)  B(weekly)
    id year
    0  2010  0   0.548814   0.437587
    1  2010  1   0.715189   0.891773
    2  2010  1   0.602763   0.963663
    0  2011  0   0.544883   0.383442
    1  2011  1   0.423655   0.791725
    2  2011  1   0.645894   0.528895

    If we have many columns, we could also use a regex to find our
    stubnames and pass that list on to wide_to_long

    >>> stubnames = sorted(
    ...     set([match[0] for match in df.columns.str.findall(
    ...         r'[A-B]\(.*\)').values if match != []])
    ... )
    >>> list(stubnames)
    ['A(weekly)', 'B(weekly)']

    All of the above examples have integers as suffixes. It is possible to
    have non-integers as suffixes.

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht_one  ht_two
    0      1      1     2.8     3.4
    1      1      2     2.9     3.8
    2      1      3     2.2     2.9
    3      2      1     2.0     3.2
    4      2      2     1.8     2.8
    5      2      3     1.9     2.4
    6      3      1     2.2     3.3
    7      3      2     2.3     3.4
    8      3      3     2.1     2.9

    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
    ...                     sep='_', suffix=r'\w+')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     one  2.8
                two  3.4
          2     one  2.9
                two  3.8
          3     one  2.2
                two  2.9
    2     1     one  2.0
                two  3.2
          2     one  1.8
                two  2.8
          3     one  1.9
                two  2.4
    3     1     one  2.2
                two  3.3
          2     one  2.3
                two  3.4
          3     one  2.1
                two  2.9
    """

    def get_var_names(df, stub: str, sep: str, suffix: str) -> list[str]:
        # Columns that belong to this stub match "<stub><sep><suffix>",
        # anchored at both ends; stub and sep are escaped so that regex
        # metacharacters in column names (e.g. "A(weekly)") match literally.
        regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
        pattern = re.compile(regex)
        return [col for col in df.columns if pattern.match(col)]

    def melt_stub(df, stub: str, i, j, value_vars, sep: str):
        # Melt one stub's columns into a single value column named after the
        # stub, with the stripped suffix recorded under `j`.
        # NOTE(review): rstrip strips a *character set*, so a stub whose last
        # characters appear in `sep` loses them too — confirm this edge case
        # is acceptable before relying on such stub/sep combinations.
        newdf = melt(
            df,
            id_vars=i,
            value_vars=value_vars,
            value_name=stub.rstrip(sep),
            var_name=j,
        )
        newdf[j] = Categorical(newdf[j])
        newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True)

        # GH17627 Cast numerics suffixes to int/float.  ``errors="ignore"``
        # is deprecated, so attempt the conversion and fall back to the
        # unconverted (string) suffixes when they are not numeric.
        try:
            newdf[j] = to_numeric(newdf[j])
        except (TypeError, ValueError, OverflowError):
            # Non-numeric suffixes (e.g. "one", "two") stay as-is.
            pass

        return newdf.set_index(i + [j])

    if not is_list_like(stubnames):
        stubnames = [stubnames]
    else:
        stubnames = list(stubnames)

    # A stub that is itself a column name would be ambiguous.
    if any(col in stubnames for col in df.columns):
        raise ValueError("stubname can't be identical to a column name")

    if not is_list_like(i):
        i = [i]
    else:
        i = list(i)

    # The id variables must uniquely identify rows for the joins below.
    if df[i].duplicated().any():
        raise ValueError("the id variables need to uniquely identify each row")

    # One list of matching wide columns per stub.
    value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]

    value_vars_flattened = [e for sublist in value_vars for e in sublist]
    # Everything not claimed by a stub passes through untouched.
    id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))

    # Melt each stub separately, then align them on the (i, j) index.
    _melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)]
    melted = _melted[0].join(_melted[1:], how="outer")

    if len(i) == 1:
        new = df[id_vars].set_index(i).join(melted)
        return new

    new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])

    return new