1from __future__ import annotations
2
3import re
4from typing import TYPE_CHECKING
5
6import numpy as np
7
8from pandas.util._decorators import Appender
9
10from pandas.core.dtypes.common import is_list_like
11from pandas.core.dtypes.concat import concat_compat
12from pandas.core.dtypes.missing import notna
13
14import pandas.core.algorithms as algos
15from pandas.core.indexes.api import MultiIndex
16from pandas.core.reshape.concat import concat
17from pandas.core.reshape.util import tile_compat
18from pandas.core.shared_docs import _shared_docs
19from pandas.core.tools.numeric import to_numeric
20
21if TYPE_CHECKING:
22 from collections.abc import Hashable
23
24 from pandas._typing import AnyArrayLike
25
26 from pandas import DataFrame
27
28
def ensure_list_vars(arg_vars, variable: str, columns) -> list:
    """
    Normalize a user-supplied id_vars/value_vars argument to a plain list.

    Parameters
    ----------
    arg_vars : scalar, list-like, or None
        The raw ``id_vars``/``value_vars`` argument.
    variable : str
        Argument name, used only in the error message.
    columns : Index
        The frame's columns; a MultiIndex restricts the accepted input form.

    Returns
    -------
    list
        ``[]`` for None, ``[arg_vars]`` for a scalar, else ``list(arg_vars)``.

    Raises
    ------
    ValueError
        If ``columns`` is a MultiIndex and ``arg_vars`` is list-like but not
        an actual list (MultiIndex labels must be given as a list of tuples).
    """
    if arg_vars is None:
        return []
    if not is_list_like(arg_vars):
        return [arg_vars]
    if isinstance(columns, MultiIndex) and not isinstance(arg_vars, list):
        raise ValueError(
            f"{variable} must be a list of tuples when columns are a MultiIndex"
        )
    return list(arg_vars)
41
42
@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name: Hashable = "value",
    col_level=None,
    ignore_index: bool = True,
) -> DataFrame:
    # Unpivot ``frame`` from wide to long format.  The public docstring is
    # supplied by the @Appender decorator above (rendered from
    # _shared_docs["melt"]), so no docstring is defined here.
    if value_name in frame.columns:
        raise ValueError(
            f"value_name ({value_name}) cannot match an element in "
            "the DataFrame columns."
        )
    # Normalize both arguments to lists; ensure_list_vars raises for
    # list-like-but-not-list input when columns are a MultiIndex.
    id_vars = ensure_list_vars(id_vars, "id_vars", frame.columns)
    # Remember whether the caller passed value_vars explicitly: an explicit
    # (even empty) value_vars restricts the melted columns, while None means
    # "melt every non-id column".
    value_vars_was_not_none = value_vars is not None
    value_vars = ensure_list_vars(value_vars, "value_vars", frame.columns)

    if id_vars or value_vars:
        if col_level is not None:
            # Match labels against a single level of a MultiIndex.
            level = frame.columns.get_level_values(col_level)
        else:
            level = frame.columns
        labels = id_vars + value_vars
        idx = level.get_indexer_for(labels)
        # -1 marks labels not found in the columns; report them all at once.
        missing = idx == -1
        if missing.any():
            missing_labels = [
                lab for lab, not_found in zip(labels, missing) if not_found
            ]
            raise KeyError(
                "The following id_vars or value_vars are not present in "
                f"the DataFrame: {missing_labels}"
            )
        if value_vars_was_not_none:
            # Keep only the requested columns (first occurrence of each);
            # iloc on a fresh indexer also gives us a copy to mutate below.
            frame = frame.iloc[:, algos.unique(idx)]
        else:
            frame = frame.copy()
    else:
        # Work on a copy: frame.pop() below mutates the frame in place.
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy, so flattening the columns is safe.
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        # Derive variable-column name(s) from the columns' metadata.
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                # Duplicate level names would collide; synthesize unique ones.
                var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    elif is_list_like(var_name):
        raise ValueError(f"{var_name=} must be a scalar.")
    else:
        var_name = [var_name]

    num_rows, K = frame.shape
    # Number of columns actually being melted (everything except id_vars).
    num_cols_adjusted = K - len(id_vars)

    mdata: dict[Hashable, AnyArrayLike] = {}
    for col in id_vars:
        # pop() removes the id column so only value columns remain in frame.
        id_data = frame.pop(col)
        if not isinstance(id_data.dtype, np.dtype):
            # i.e. ExtensionDtype: np.tile would densify, so repeat via concat
            # to preserve the extension dtype.
            if num_cols_adjusted > 0:
                mdata[col] = concat([id_data] * num_cols_adjusted, ignore_index=True)
            else:
                # We can't concat empty list. (GH 46044)
                mdata[col] = type(id_data)([], name=id_data.name, dtype=id_data.dtype)
        else:
            mdata[col] = np.tile(id_data._values, num_cols_adjusted)

    mcolumns = id_vars + var_name + [value_name]

    if frame.shape[1] > 0 and not any(
        not isinstance(dt, np.dtype) and dt._supports_2d for dt in frame.dtypes
    ):
        # At least one 1D-only ExtensionDtype present: concatenate column by
        # column so dtype-preserving concat logic applies.
        mdata[value_name] = concat(
            [frame.iloc[:, i] for i in range(frame.shape[1])]
        ).values
    else:
        # Pure numpy (or 2D-capable EA) path: column-major ravel stacks the
        # columns one after another, matching the concat ordering above.
        mdata[value_name] = frame._values.ravel("F")
    for i, col in enumerate(var_name):
        # Each level's labels repeat once per original row.
        mdata[col] = frame.columns._get_level_values(i).repeat(num_rows)

    result = frame._constructor(mdata, columns=mcolumns)

    if not ignore_index:
        # Preserve the original index, tiled once per melted column.
        result.index = tile_compat(frame.index, num_cols_adjusted)

    return result
139
140
def lreshape(data: DataFrame, groups: dict, dropna: bool = True) -> DataFrame:
    """
    Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.

    Accepts a dictionary, ``groups``, in which each key is a new column name
    and each value is a list of old column names that will be "melted" under
    the new column name as part of the reshape.

    Parameters
    ----------
    data : DataFrame
        The wide-format DataFrame.
    groups : dict
        {new_name : list_of_columns}.
    dropna : bool, default True
        Do not include columns whose entries are all NaN.

    Returns
    -------
    DataFrame
        Reshaped DataFrame.

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    pivot : Create a spreadsheet-style pivot table as a DataFrame.
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.
    wide_to_long : Wide panel to long format. Less flexible but more
        user-friendly than melt.

    Examples
    --------
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2007], 'year2': [2008, 2008]})
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2007   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team  year   hr
    0  Red Sox  2007  514
    1  Yankees  2007  573
    2  Red Sox  2008  545
    3  Yankees  2008  526
    """
    mdata = {}
    pivot_cols = []
    melted_cols: set[Hashable] = set()
    # Every group must contribute the same number of source columns.
    group_size = len(next(iter(groups.values())))
    for target, names in groups.items():
        if len(names) != group_size:
            raise ValueError("All column lists must be same length")
        # Stack the group's columns end-to-end under the new name.
        mdata[target] = concat_compat([data[name]._values for name in names])
        pivot_cols.append(target)
        melted_cols.update(names)

    # Columns not named in any group act as identifiers and are tiled.
    id_cols = list(data.columns.difference(melted_cols))
    for col in id_cols:
        mdata[col] = np.tile(data[col]._values, group_size)

    if dropna:
        # Keep only rows where every pivoted column is non-null.
        keep = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for target in pivot_cols:
            keep &= notna(mdata[target])
        if not keep.all():
            mdata = {key: values[keep] for key, values in mdata.items()}

    return data._constructor(mdata, columns=id_cols + pivot_cols)
219
220
def wide_to_long(
    df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
) -> DataFrame:
    r"""
    Unpivot a DataFrame from wide to long format.

    Less flexible but more user-friendly than melt.

    With stubnames ['A', 'B'], this function expects to find one or more
    group of columns with format
    A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
    You specify what you want to call this suffix in the resulting long format
    with `j` (for example `j='year'`)

    Each row of these wide variables are assumed to be uniquely identified by
    `i` (can be a single column name or a list of column names)

    All remaining variables in the data frame are left intact.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame.
    stubnames : str or list-like
        The stub name(s). The wide format variables are assumed to
        start with the stub names.
    i : str or list-like
        Column(s) to use as id variable(s).
    j : str
        The name of the sub-observation variable. What you wish to name your
        suffix in the long format.
    sep : str, default ""
        A character indicating the separation of the variable names
        in the wide format, to be stripped from the names in the long format.
        For example, if your column names are A-suffix1, A-suffix2, you
        can strip the hyphen by specifying `sep='-'`.
    suffix : str, default '\\d+'
        A regular expression capturing the wanted suffixes. '\\d+' captures
        numeric suffixes. Suffixes with no numbers could be specified with the
        negated character class '\\D+'. You can also further disambiguate
        suffixes, for example, if your wide variables are of the form A-one,
        B-two,.., and you have an unrelated column A-rating, you can ignore the
        last one by specifying `suffix='(!?one|two)'`. When all suffixes are
        numeric, they are cast to int64/float64.

    Returns
    -------
    DataFrame
        A DataFrame that contains each stub name as a variable, with new index
        (i, j).

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    pivot : Create a spreadsheet-style pivot table as a DataFrame.
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.

    Notes
    -----
    All extra variables are left untouched. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.

    Examples
    --------
    >>> np.random.seed(123)
    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
    ...                    "X"     : dict(zip(range(3), np.random.randn(3)))
    ...                   })
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
    ... # doctest: +NORMALIZE_WHITESPACE
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    With multiple id columns

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9
    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                  ht
    famid birth age
    1     1     1    2.8
                2    3.4
          2     1    2.9
                2    3.8
          3     1    2.2
                2    2.9
    2     1     1    2.0
                2    3.2
          2     1    1.8
                2    2.8
          3     1    1.9
                2    2.4
    3     1     1    2.2
                2    3.3
          2     1    2.3
                2    3.4
          3     1    2.1
                2    2.9

    Going from long back to wide just takes some creative use of `unstack`

    >>> w = l.unstack()
    >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
    >>> w.reset_index()
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9

    Less wieldy column names are also handled

    >>> np.random.seed(0)
    >>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3),
    ...                    'A(weekly)-2011': np.random.rand(3),
    ...                    'B(weekly)-2010': np.random.rand(3),
    ...                    'B(weekly)-2011': np.random.rand(3),
    ...                    'X' : np.random.randint(3, size=3)})
    >>> df['id'] = df.index
    >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
       A(weekly)-2010  A(weekly)-2011  B(weekly)-2010  B(weekly)-2011  X  id
    0        0.548814        0.544883        0.437587        0.383442  0   0
    1        0.715189        0.423655        0.891773        0.791725  1   1
    2        0.602763        0.645894        0.963663        0.528895  1   2

    >>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id',
    ...                 j='year', sep='-')
    ... # doctest: +NORMALIZE_WHITESPACE
             X  A(weekly)  B(weekly)
    id year
    0  2010  0   0.548814   0.437587
    1  2010  1   0.715189   0.891773
    2  2010  1   0.602763   0.963663
    0  2011  0   0.544883   0.383442
    1  2011  1   0.423655   0.791725
    2  2011  1   0.645894   0.528895

    If we have many columns, we could also use a regex to find our
    stubnames and pass that list on to wide_to_long

    >>> stubnames = sorted(
    ...     set([match[0] for match in df.columns.str.findall(
    ...         r'[A-B]\(.*\)').values if match != []])
    ... )
    >>> list(stubnames)
    ['A(weekly)', 'B(weekly)']

    All of the above examples have integers as suffixes. It is possible to
    have non-integers as suffixes.

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht_one  ht_two
    0      1      1     2.8     3.4
    1      1      2     2.9     3.8
    2      1      3     2.2     2.9
    3      2      1     2.0     3.2
    4      2      2     1.8     2.8
    5      2      3     1.9     2.4
    6      3      1     2.2     3.3
    7      3      2     2.3     3.4
    8      3      3     2.1     2.9

    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
    ...                     sep='_', suffix=r'\w+')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     one  2.8
                two  3.4
          2     one  2.9
                two  3.8
          3     one  2.2
                two  2.9
    2     1     one  2.0
                two  3.2
          2     one  1.8
                two  2.8
          3     one  1.9
                two  2.4
    3     1     one  2.2
                two  3.3
          2     one  2.3
                two  3.4
          3     one  2.1
                two  2.9
    """

    def _stub_columns(frame, stub: str):
        # Columns matching <stub><sep><suffix>, anchored at both ends so an
        # unrelated column that merely starts with the stub is not picked up.
        pattern = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
        return frame.columns[frame.columns.str.match(pattern)]

    def _melt_one_stub(frame, stub: str, value_vars):
        # Melt one stub's columns into a single value column named after the
        # stub (with any trailing separator characters removed).
        long_df = melt(
            frame,
            id_vars=i,
            value_vars=value_vars,
            value_name=stub.rstrip(sep),
            var_name=j,
        )
        # Strip the "<stub><sep>" prefix so only the suffix remains in ``j``.
        long_df[j] = long_df[j].str.replace(re.escape(stub + sep), "", regex=True)

        # GH17627 Cast numeric suffixes to int/float when possible.
        try:
            long_df[j] = to_numeric(long_df[j])
        except (TypeError, ValueError, OverflowError):
            # Non-numeric suffixes stay as strings.
            pass

        return long_df.set_index(i + [j])

    stubnames = list(stubnames) if is_list_like(stubnames) else [stubnames]

    if df.columns.isin(stubnames).any():
        raise ValueError("stubname can't be identical to a column name")

    i = list(i) if is_list_like(i) else [i]

    if df[i].duplicated().any():
        raise ValueError("the id variables need to uniquely identify each row")

    # Melt each stub separately, then align the pieces on the (i, j) index.
    melted_pieces = []
    value_vars_flattened = []
    for stub in stubnames:
        matched = _stub_columns(df, stub)
        value_vars_flattened.extend(matched)
        melted_pieces.append(_melt_one_stub(df, stub, matched))

    melted = concat(melted_pieces, axis=1)
    # Everything that was not melted rides along as an id variable.
    id_vars = df.columns.difference(value_vars_flattened)
    new = df[id_vars]

    if len(i) == 1:
        return new.set_index(i).join(melted)
    return new.merge(melted.reset_index(), on=i).set_index(i + [j])