1from __future__ import annotations
2
3import re
4from typing import TYPE_CHECKING
5
6import numpy as np
7
8from pandas.util._decorators import Appender
9
10from pandas.core.dtypes.common import is_list_like
11from pandas.core.dtypes.concat import concat_compat
12from pandas.core.dtypes.missing import notna
13
14import pandas.core.algorithms as algos
15from pandas.core.indexes.api import MultiIndex
16from pandas.core.reshape.concat import concat
17from pandas.core.reshape.util import tile_compat
18from pandas.core.shared_docs import _shared_docs
19from pandas.core.tools.numeric import to_numeric
20
21if TYPE_CHECKING:
22 from collections.abc import Hashable
23
24 from pandas._typing import AnyArrayLike
25
26 from pandas import DataFrame
27
28
def ensure_list_vars(arg_vars, variable: str, columns) -> list:
    """
    Normalize a user-supplied id_vars/value_vars argument to a plain list.

    Parameters
    ----------
    arg_vars : scalar, list-like, or None
        The raw ``id_vars``/``value_vars`` argument.
    variable : str
        Argument name, used only in the error message.
    columns : Index
        The frame's columns; a MultiIndex restricts the accepted input form.

    Returns
    -------
    list
        ``[]`` for None, ``[arg_vars]`` for a scalar, else ``list(arg_vars)``.

    Raises
    ------
    ValueError
        If ``columns`` is a MultiIndex and ``arg_vars`` is list-like but not
        an actual list (MultiIndex labels must be given as a list of tuples).
    """
    if arg_vars is None:
        return []
    if not is_list_like(arg_vars):
        return [arg_vars]
    if isinstance(columns, MultiIndex) and not isinstance(arg_vars, list):
        raise ValueError(
            f"{variable} must be a list of tuples when columns are a MultiIndex"
        )
    return list(arg_vars)
41
42
@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name: Hashable = "value",
    col_level=None,
    ignore_index: bool = True,
) -> DataFrame:
    # Unpivot ``frame`` from wide to long format.  The public docstring is
    # supplied by the @Appender decorator above (rendered from
    # _shared_docs["melt"]), so no docstring is defined here.
    if value_name in frame.columns:
        raise ValueError(
            f"value_name ({value_name}) cannot match an element in "
            "the DataFrame columns."
        )
    # Normalize both arguments to lists; ensure_list_vars raises for
    # list-like-but-not-list input when columns are a MultiIndex.
    id_vars = ensure_list_vars(id_vars, "id_vars", frame.columns)
    # Remember whether the caller passed value_vars explicitly: an explicit
    # (even empty) value_vars restricts the melted columns, while None means
    # "melt every non-id column".
    value_vars_was_not_none = value_vars is not None
    value_vars = ensure_list_vars(value_vars, "value_vars", frame.columns)

    if id_vars or value_vars:
        if col_level is not None:
            # Match labels against a single level of a MultiIndex.
            level = frame.columns.get_level_values(col_level)
        else:
            level = frame.columns
        labels = id_vars + value_vars
        idx = level.get_indexer_for(labels)
        # -1 marks labels not found in the columns; report them all at once.
        missing = idx == -1
        if missing.any():
            missing_labels = [
                lab for lab, not_found in zip(labels, missing) if not_found
            ]
            raise KeyError(
                "The following id_vars or value_vars are not present in "
                f"the DataFrame: {missing_labels}"
            )
        if value_vars_was_not_none:
            # Keep only the requested columns (first occurrence of each);
            # iloc on a fresh indexer also gives us a copy to mutate below.
            frame = frame.iloc[:, algos.unique(idx)]
        else:
            frame = frame.copy()
    else:
        # Work on a copy: frame.pop() below mutates the frame in place.
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy, so flattening the columns is safe.
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        # Derive variable-column name(s) from the columns' metadata.
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                # Duplicate level names would collide; synthesize unique ones.
                var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    elif is_list_like(var_name):
        raise ValueError(f"{var_name=} must be a scalar.")
    else:
        var_name = [var_name]

    num_rows, K = frame.shape
    # Number of columns actually being melted (everything except id_vars).
    num_cols_adjusted = K - len(id_vars)

    mdata: dict[Hashable, AnyArrayLike] = {}
    for col in id_vars:
        # pop() removes the id column so only value columns remain in frame.
        id_data = frame.pop(col)
        if not isinstance(id_data.dtype, np.dtype):
            # i.e. ExtensionDtype: np.tile would densify, so repeat via concat
            # to preserve the extension dtype.
            if num_cols_adjusted > 0:
                mdata[col] = concat([id_data] * num_cols_adjusted, ignore_index=True)
            else:
                # We can't concat empty list. (GH 46044)
                mdata[col] = type(id_data)([], name=id_data.name, dtype=id_data.dtype)
        else:
            mdata[col] = np.tile(id_data._values, num_cols_adjusted)

    mcolumns = id_vars + var_name + [value_name]

    if frame.shape[1] > 0 and not any(
        not isinstance(dt, np.dtype) and dt._supports_2d for dt in frame.dtypes
    ):
        # At least one 1D-only ExtensionDtype present: concatenate column by
        # column so dtype-preserving concat logic applies.
        mdata[value_name] = concat(
            [frame.iloc[:, i] for i in range(frame.shape[1])]
        ).values
    else:
        # Pure numpy (or 2D-capable EA) path: column-major ravel stacks the
        # columns one after another, matching the concat ordering above.
        mdata[value_name] = frame._values.ravel("F")
    for i, col in enumerate(var_name):
        # Each level's labels repeat once per original row.
        mdata[col] = frame.columns._get_level_values(i).repeat(num_rows)

    result = frame._constructor(mdata, columns=mcolumns)

    if not ignore_index:
        # Preserve the original index, tiled once per melted column.
        result.index = tile_compat(frame.index, num_cols_adjusted)

    return result
139
140
def lreshape(data: DataFrame, groups: dict, dropna: bool = True) -> DataFrame:
    """
    Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.

    Accepts a dictionary, ``groups``, in which each key is a new column name
    and each value is a list of old column names that will be "melted" under
    the new column name as part of the reshape.

    Parameters
    ----------
    data : DataFrame
        The wide-format DataFrame.
    groups : dict
        {new_name : list_of_columns}.
    dropna : bool, default True
        Do not include columns whose entries are all NaN.

    Returns
    -------
    DataFrame
        Reshaped DataFrame.

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    pivot : Create a spreadsheet-style pivot table as a DataFrame.
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.
    wide_to_long : Wide panel to long format. Less flexible but more
        user-friendly than melt.

    Examples
    --------
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2007], 'year2': [2008, 2008]})
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2007   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team  year   hr
    0  Red Sox  2007  514
    1  Yankees  2007  573
    2  Red Sox  2008  545
    3  Yankees  2008  526
    """
    mdata = {}
    pivot_cols = []
    melted_cols: set[Hashable] = set()
    # Every group must contribute the same number of source columns.
    group_size = len(next(iter(groups.values())))
    for target, names in groups.items():
        if len(names) != group_size:
            raise ValueError("All column lists must be same length")
        # Stack the group's columns end-to-end under the new name.
        mdata[target] = concat_compat([data[name]._values for name in names])
        pivot_cols.append(target)
        melted_cols.update(names)

    # Columns not named in any group act as identifiers and are tiled.
    id_cols = list(data.columns.difference(melted_cols))
    for col in id_cols:
        mdata[col] = np.tile(data[col]._values, group_size)

    if dropna:
        # Keep only rows where every pivoted column is non-null.
        keep = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for target in pivot_cols:
            keep &= notna(mdata[target])
        if not keep.all():
            mdata = {key: values[keep] for key, values in mdata.items()}

    return data._constructor(mdata, columns=id_cols + pivot_cols)
219
220
def wide_to_long(
    df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
) -> DataFrame:
    r"""
    Unpivot a DataFrame from wide to long format.

    Less flexible but more user-friendly than melt.

    With stubnames ['A', 'B'], this function expects to find one or more
    group of columns with format
    A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
    You specify what you want to call this suffix in the resulting long format
    with `j` (for example `j='year'`)

    Each row of these wide variables are assumed to be uniquely identified by
    `i` (can be a single column name or a list of column names)

    All remaining variables in the data frame are left intact.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame.
    stubnames : str or list-like
        The stub name(s). The wide format variables are assumed to
        start with the stub names.
    i : str or list-like
        Column(s) to use as id variable(s).
    j : str
        The name of the sub-observation variable. What you wish to name your
        suffix in the long format.
    sep : str, default ""
        A character indicating the separation of the variable names
        in the wide format, to be stripped from the names in the long format.
        For example, if your column names are A-suffix1, A-suffix2, you
        can strip the hyphen by specifying `sep='-'`.
    suffix : str, default '\\d+'
        A regular expression capturing the wanted suffixes. '\\d+' captures
        numeric suffixes. Suffixes with no numbers could be specified with the
        negated character class '\\D+'. You can also further disambiguate
        suffixes, for example, if your wide variables are of the form A-one,
        B-two,.., and you have an unrelated column A-rating, you can ignore the
        last one by specifying `suffix='(!?one|two)'`. When all suffixes are
        numeric, they are cast to int64/float64.

    Returns
    -------
    DataFrame
        A DataFrame that contains each stub name as a variable, with new index
        (i, j).

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    pivot : Create a spreadsheet-style pivot table as a DataFrame.
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.

    Notes
    -----
    All extra variables are left untouched. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.

    Examples
    --------
    >>> np.random.seed(123)
    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
    ...                    "X"     : dict(zip(range(3), np.random.randn(3)))
    ...                   })
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
    ... # doctest: +NORMALIZE_WHITESPACE
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    With multiple id columns

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9
    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                  ht
    famid birth age
    1     1     1    2.8
                2    3.4
          2     1    2.9
                2    3.8
          3     1    2.2
                2    2.9
    2     1     1    2.0
                2    3.2
          2     1    1.8
                2    2.8
          3     1    1.9
                2    2.4
    3     1     1    2.2
                2    3.3
          2     1    2.3
                2    3.4
          3     1    2.1
                2    2.9

    Going from long back to wide just takes some creative use of `unstack`

    >>> w = l.unstack()
    >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
    >>> w.reset_index()
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9

    Less wieldy column names are also handled

    >>> np.random.seed(0)
    >>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3),
    ...                    'A(weekly)-2011': np.random.rand(3),
    ...                    'B(weekly)-2010': np.random.rand(3),
    ...                    'B(weekly)-2011': np.random.rand(3),
    ...                    'X' : np.random.randint(3, size=3)})
    >>> df['id'] = df.index
    >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
       A(weekly)-2010  A(weekly)-2011  B(weekly)-2010  B(weekly)-2011  X  id
    0        0.548814        0.544883        0.437587        0.383442  0   0
    1        0.715189        0.423655        0.891773        0.791725  1   1
    2        0.602763        0.645894        0.963663        0.528895  1   2

    >>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id',
    ...                 j='year', sep='-')
    ... # doctest: +NORMALIZE_WHITESPACE
             X  A(weekly)  B(weekly)
    id year
    0  2010  0   0.548814   0.437587
    1  2010  1   0.715189   0.891773
    2  2010  1   0.602763   0.963663
    0  2011  0   0.544883   0.383442
    1  2011  1   0.423655   0.791725
    2  2011  1   0.645894   0.528895

    If we have many columns, we could also use a regex to find our
    stubnames and pass that list on to wide_to_long

    >>> stubnames = sorted(
    ...     set([match[0] for match in df.columns.str.findall(
    ...         r'[A-B]\(.*\)').values if match != []])
    ... )
    >>> list(stubnames)
    ['A(weekly)', 'B(weekly)']

    All of the above examples have integers as suffixes. It is possible to
    have non-integers as suffixes.

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht_one  ht_two
    0      1      1     2.8     3.4
    1      1      2     2.9     3.8
    2      1      3     2.2     2.9
    3      2      1     2.0     3.2
    4      2      2     1.8     2.8
    5      2      3     1.9     2.4
    6      3      1     2.2     3.3
    7      3      2     2.3     3.4
    8      3      3     2.1     2.9

    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
    ...                     sep='_', suffix=r'\w+')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     one  2.8
                two  3.4
          2     one  2.9
                two  3.8
          3     one  2.2
                two  2.9
    2     1     one  2.0
                two  3.2
          2     one  1.8
                two  2.8
          3     one  1.9
                two  2.4
    3     1     one  2.2
                two  3.3
          2     one  2.3
                two  3.4
          3     one  2.1
                two  2.9
    """

    def _stub_columns(frame, stub: str):
        # Columns matching <stub><sep><suffix>, anchored at both ends so an
        # unrelated column that merely starts with the stub is not picked up.
        pattern = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
        return frame.columns[frame.columns.str.match(pattern)]

    def _melt_one_stub(frame, stub: str, value_vars):
        # Melt one stub's columns into a single value column named after the
        # stub (with any trailing separator characters removed).
        long_df = melt(
            frame,
            id_vars=i,
            value_vars=value_vars,
            value_name=stub.rstrip(sep),
            var_name=j,
        )
        # Strip the "<stub><sep>" prefix so only the suffix remains in ``j``.
        long_df[j] = long_df[j].str.replace(re.escape(stub + sep), "", regex=True)

        # GH17627 Cast numeric suffixes to int/float when possible.
        try:
            long_df[j] = to_numeric(long_df[j])
        except (TypeError, ValueError, OverflowError):
            # Non-numeric suffixes stay as strings.
            pass

        return long_df.set_index(i + [j])

    stubnames = list(stubnames) if is_list_like(stubnames) else [stubnames]

    if df.columns.isin(stubnames).any():
        raise ValueError("stubname can't be identical to a column name")

    i = list(i) if is_list_like(i) else [i]

    if df[i].duplicated().any():
        raise ValueError("the id variables need to uniquely identify each row")

    # Melt each stub separately, then align the pieces on the (i, j) index.
    melted_pieces = []
    value_vars_flattened = []
    for stub in stubnames:
        matched = _stub_columns(df, stub)
        value_vars_flattened.extend(matched)
        melted_pieces.append(_melt_one_stub(df, stub, matched))

    melted = concat(melted_pieces, axis=1)
    # Everything that was not melted rides along as an id variable.
    id_vars = df.columns.difference(value_vars_flattened)
    new = df[id_vars]

    if len(i) == 1:
        return new.set_index(i).join(melted)
    return new.merge(melted.reset_index(), on=i).set_index(i + [j])