Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/reshape/melt.py: 45%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

134 statements  

1from __future__ import annotations 

2 

3import re 

4from typing import TYPE_CHECKING 

5 

6import numpy as np 

7 

8from pandas.util._decorators import Appender 

9 

10from pandas.core.dtypes.common import is_list_like 

11from pandas.core.dtypes.concat import concat_compat 

12from pandas.core.dtypes.missing import notna 

13 

14import pandas.core.algorithms as algos 

15from pandas.core.indexes.api import MultiIndex 

16from pandas.core.reshape.concat import concat 

17from pandas.core.reshape.util import tile_compat 

18from pandas.core.shared_docs import _shared_docs 

19from pandas.core.tools.numeric import to_numeric 

20 

21if TYPE_CHECKING: 

22 from collections.abc import Hashable 

23 

24 from pandas._typing import AnyArrayLike 

25 

26 from pandas import DataFrame 

27 

28 

def ensure_list_vars(arg_vars, variable: str, columns) -> list:
    """
    Normalize a melt ``id_vars``/``value_vars`` argument into a plain list.

    ``None`` becomes an empty list and a scalar is wrapped in a one-element
    list.  For a MultiIndex ``columns``, a list-like that is not an actual
    ``list`` is rejected, because only a list of tuples selects columns
    unambiguously there.
    """
    if arg_vars is None:
        return []
    if not is_list_like(arg_vars):
        # Scalar label -> single-element list.
        return [arg_vars]
    if isinstance(columns, MultiIndex) and not isinstance(arg_vars, list):
        raise ValueError(
            f"{variable} must be a list of tuples when columns are a MultiIndex"
        )
    return list(arg_vars)

41 

42 

@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name: Hashable = "value",
    col_level=None,
    ignore_index: bool = True,
) -> DataFrame:
    # The melted value column must not collide with an existing column label.
    if value_name in frame.columns:
        raise ValueError(
            f"value_name ({value_name}) cannot match an element in "
            "the DataFrame columns."
        )
    id_vars = ensure_list_vars(id_vars, "id_vars", frame.columns)
    # Remember whether the caller explicitly passed value_vars: only in that
    # case do we subset the frame to the requested columns below.
    value_vars_was_not_none = value_vars is not None
    value_vars = ensure_list_vars(value_vars, "value_vars", frame.columns)

    if id_vars or value_vars:
        # Resolve the requested labels against one column level (or all of
        # them) and fail loudly on any label that is not present.
        if col_level is not None:
            level = frame.columns.get_level_values(col_level)
        else:
            level = frame.columns
        labels = id_vars + value_vars
        idx = level.get_indexer_for(labels)
        missing = idx == -1
        if missing.any():
            missing_labels = [
                lab for lab, not_found in zip(labels, missing) if not_found
            ]
            raise KeyError(
                "The following id_vars or value_vars are not present in "
                f"the DataFrame: {missing_labels}"
            )
        if value_vars_was_not_none:
            # Keep only id_vars + value_vars, de-duplicated, in first-seen
            # positional order.
            frame = frame.iloc[:, algos.unique(idx)]
        else:
            # No explicit value_vars: melt every non-id column; copy because
            # the frame is mutated (frame.pop) below.
            frame = frame.copy()
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    # Derive the name(s) of the "variable" column(s) when not supplied.
    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                # Level names are unique: use them directly.
                var_name = frame.columns.names
            else:
                # Duplicate level names would collide; fall back to
                # positional placeholders.
                var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    elif is_list_like(var_name):
        raise ValueError(f"{var_name=} must be a scalar.")
    else:
        var_name = [var_name]

    num_rows, K = frame.shape
    # Number of columns that actually get melted (everything but id_vars).
    num_cols_adjusted = K - len(id_vars)

    mdata: dict[Hashable, AnyArrayLike] = {}
    for col in id_vars:
        # pop() removes the id column so only value columns remain in frame.
        id_data = frame.pop(col)
        if not isinstance(id_data.dtype, np.dtype):
            # i.e. ExtensionDtype
            if num_cols_adjusted > 0:
                # Repeat the id column once per melted column, dtype-preserving.
                mdata[col] = concat([id_data] * num_cols_adjusted, ignore_index=True)
            else:
                # We can't concat empty list. (GH 46044)
                mdata[col] = type(id_data)([], name=id_data.name, dtype=id_data.dtype)
        else:
            mdata[col] = np.tile(id_data._values, num_cols_adjusted)

    mcolumns = id_vars + var_name + [value_name]

    if frame.shape[1] > 0 and not any(
        not isinstance(dt, np.dtype) and dt._supports_2d for dt in frame.dtypes
    ):
        # No 2D-capable extension dtype present: stack the value columns one
        # by one via concat, which handles dtype promotion across columns.
        mdata[value_name] = concat(
            [frame.iloc[:, i] for i in range(frame.shape[1])]
        ).values
    else:
        # Empty frame, or dtypes with a 2D backing array: flatten the block
        # values column-major ("F") so columns are stacked in order.
        mdata[value_name] = frame._values.ravel("F")
    for i, col in enumerate(var_name):
        # Each column label is repeated once per original row.
        mdata[col] = frame.columns._get_level_values(i).repeat(num_rows)

    result = frame._constructor(mdata, columns=mcolumns)

    if not ignore_index:
        # Preserve the original index, tiled once per melted column.
        result.index = tile_compat(frame.index, num_cols_adjusted)

    return result

139 

140 

def lreshape(data: DataFrame, groups: dict, dropna: bool = True) -> DataFrame:
    """
    Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.

    Each key of ``groups`` names a new long-format column, and its value is
    the list of wide-format columns whose values are stacked under that name.
    All column lists must have the same length.  Columns not mentioned in
    ``groups`` are treated as identifiers and tiled alongside the stacked
    values.

    Parameters
    ----------
    data : DataFrame
        The wide-format DataFrame.
    groups : dict
        {new_name : list_of_columns}.
    dropna : bool, default True
        Drop result rows in which any of the stacked columns is NaN.

    Returns
    -------
    DataFrame
        Reshaped DataFrame.

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    wide_to_long : Wide panel to long format. Less flexible but more
        user-friendly than melt.

    Examples
    --------
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2007], 'year2': [2008, 2008]})
    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team  year   hr
    0  Red Sox  2007  514
    1  Yankees  2007  573
    2  Red Sox  2008  545
    3  Yankees  2008  526
    """
    stacked: dict = {}
    long_cols = []
    melted_cols: set[Hashable] = set()
    # Every group must stack the same number of source columns.
    group_size = len(next(iter(groups.values())))
    for new_name, old_names in groups.items():
        if len(old_names) != group_size:
            raise ValueError("All column lists must be same length")
        # Stack the group's columns end-to-end under the new name.
        pieces = [data[name]._values for name in old_names]
        stacked[new_name] = concat_compat(pieces)
        long_cols.append(new_name)
        melted_cols.update(old_names)

    # Columns not consumed by any group act as identifiers: tile them once
    # per stacked source column.
    keep_cols = list(data.columns.difference(melted_cols))
    for name in keep_cols:
        stacked[name] = np.tile(data[name]._values, group_size)

    if dropna:
        # Keep only rows where every stacked column is non-missing.
        keep_mask = np.ones(len(stacked[long_cols[0]]), dtype=bool)
        for name in long_cols:
            keep_mask &= notna(stacked[name])
        if not keep_mask.all():
            stacked = {key: values[keep_mask] for key, values in stacked.items()}

    return data._constructor(stacked, columns=keep_cols + long_cols)

219 

220 

def wide_to_long(
    df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
) -> DataFrame:
    r"""
    Unpivot a DataFrame from wide to long format.

    Less flexible but more user-friendly than melt.

    With stubnames ['A', 'B'], this function expects to find one or more
    group of columns with format
    A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
    You specify what you want to call this suffix in the resulting long format
    with `j` (for example `j='year'`)

    Each row of these wide variables are assumed to be uniquely identified by
    `i` (can be a single column name or a list of column names)

    All remaining variables in the data frame are left intact.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame.
    stubnames : str or list-like
        The stub name(s). The wide format variables are assumed to
        start with the stub names.
    i : str or list-like
        Column(s) to use as id variable(s).
    j : str
        The name of the sub-observation variable. What you wish to name your
        suffix in the long format.
    sep : str, default ""
        A character indicating the separation of the variable names
        in the wide format, to be stripped from the names in the long format.
        For example, if your column names are A-suffix1, A-suffix2, you
        can strip the hyphen by specifying `sep='-'`.
    suffix : str, default '\\d+'
        A regular expression capturing the wanted suffixes. '\\d+' captures
        numeric suffixes. Suffixes with no numbers could be specified with the
        negated character class '\\D+'. You can also further disambiguate
        suffixes, for example, if your wide variables are of the form A-one,
        B-two,.., and you have an unrelated column A-rating, you can ignore the
        last one by specifying `suffix='(!?one|two)'`. When all suffixes are
        numeric, they are cast to int64/float64.

    Returns
    -------
    DataFrame
        A DataFrame that contains each stub name as a variable, with new index
        (i, j).

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    pivot : Create a spreadsheet-style pivot table as a DataFrame.
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.

    Notes
    -----
    All extra variables are left untouched. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.

    Examples
    --------
    >>> np.random.seed(123)
    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
    ...                    "X" : dict(zip(range(3), np.random.randn(3)))
    ...                   })
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
    ... # doctest: +NORMALIZE_WHITESPACE
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    With multiple id columns

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9
    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     1    2.8
                2    3.4
          2     1    2.9
                2    3.8
          3     1    2.2
                2    2.9
    2     1     1    2.0
                2    3.2
          2     1    1.8
                2    2.8
          3     1    1.9
                2    2.4
    3     1     1    2.2
                2    3.3
          2     1    2.3
                2    3.4
          3     1    2.1
                2    2.9

    Going from long back to wide just takes some creative use of `unstack`

    >>> w = l.unstack()
    >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
    >>> w.reset_index()
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9

    Less wieldy column names are also handled

    >>> np.random.seed(0)
    >>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3),
    ...                    'A(weekly)-2011': np.random.rand(3),
    ...                    'B(weekly)-2010': np.random.rand(3),
    ...                    'B(weekly)-2011': np.random.rand(3),
    ...                    'X' : np.random.randint(3, size=3)})
    >>> df['id'] = df.index
    >>> df  # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
       A(weekly)-2010  A(weekly)-2011  B(weekly)-2010  B(weekly)-2011  X  id
    0        0.548814        0.544883        0.437587        0.383442  0   0
    1        0.715189        0.423655        0.891773        0.791725  1   1
    2        0.602763        0.645894        0.963663        0.528895  1   2

    >>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id',
    ...                 j='year', sep='-')
    ... # doctest: +NORMALIZE_WHITESPACE
             X  A(weekly)  B(weekly)
    id year
    0  2010  0   0.548814   0.437587
    1  2010  1   0.715189   0.891773
    2  2010  1   0.602763   0.963663
    0  2011  0   0.544883   0.383442
    1  2011  1   0.423655   0.791725
    2  2011  1   0.645894   0.528895

    If we have many columns, we could also use a regex to find our
    stubnames and pass that list on to wide_to_long

    >>> stubnames = sorted(
    ...     set([match[0] for match in df.columns.str.findall(
    ...         r'[A-B]\(.*\)').values if match != []])
    ... )
    >>> list(stubnames)
    ['A(weekly)', 'B(weekly)']

    All of the above examples have integers as suffixes. It is possible to
    have non-integers as suffixes.

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht_one  ht_two
    0      1      1     2.8     3.4
    1      1      2     2.9     3.8
    2      1      3     2.2     2.9
    3      2      1     2.0     3.2
    4      2      2     1.8     2.8
    5      2      3     1.9     2.4
    6      3      1     2.2     3.3
    7      3      2     2.3     3.4
    8      3      3     2.1     2.9

    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
    ...                     sep='_', suffix=r'\w+')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     one  2.8
                two  3.4
          2     one  2.9
                two  3.8
          3     one  2.2
                two  2.9
    2     1     one  2.0
                two  3.2
          2     one  1.8
                two  2.8
          3     one  1.9
                two  2.4
    3     1     one  2.2
                two  3.3
          2     one  2.3
                two  3.4
          3     one  2.1
                two  2.9
    """

    def get_var_names(df, stub: str, sep: str, suffix: str):
        # Select the columns whose full name matches <stub><sep><suffix>;
        # stub and sep are escaped, suffix is used as a raw regex fragment.
        regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
        return df.columns[df.columns.str.match(regex)]

    def melt_stub(df, stub: str, i, j, value_vars, sep: str):
        # Melt one stub's columns; the value column is named after the stub
        # with any trailing separator characters stripped.
        newdf = melt(
            df,
            id_vars=i,
            value_vars=value_vars,
            value_name=stub.rstrip(sep),
            var_name=j,
        )
        # Remove the "<stub><sep>" prefix so only the suffix remains in `j`.
        # NOTE(review): the pattern is not anchored to the start of the
        # string — confirm a stub occurring inside a suffix cannot be
        # stripped unintentionally.
        newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True)

        # GH17627 Cast numerics suffixes to int/float
        try:
            newdf[j] = to_numeric(newdf[j])
        except (TypeError, ValueError, OverflowError):
            # TODO: anything else to catch?
            pass

        return newdf.set_index(i + [j])

    # Normalize stubnames to a list (a bare string is one stub).
    if not is_list_like(stubnames):
        stubnames = [stubnames]
    else:
        stubnames = list(stubnames)

    # A stub equal to a full column name would match ambiguously.
    if df.columns.isin(stubnames).any():
        raise ValueError("stubname can't be identical to a column name")

    # Normalize the id column(s) to a list as well.
    if not is_list_like(i):
        i = [i]
    else:
        i = list(i)

    # The id columns become (part of) the result index, so they must be
    # unique per row.
    if df[i].duplicated().any():
        raise ValueError("the id variables need to uniquely identify each row")

    # Melt each stub separately, collecting the columns that were consumed.
    _melted = []
    value_vars_flattened = []
    for stub in stubnames:
        value_var = get_var_names(df, stub, sep, suffix)
        value_vars_flattened.extend(value_var)
        _melted.append(melt_stub(df, stub, i, j, value_var, sep))

    # One melted frame per stub, aligned side by side on the (i, j) index.
    melted = concat(_melted, axis=1)
    # Columns not consumed by any stub are carried over untouched.
    id_vars = df.columns.difference(value_vars_flattened)
    new = df[id_vars]

    if len(i) == 1:
        return new.set_index(i).join(melted)
    else:
        # Multiple id columns: merge on them, then restore the (i, j) index.
        return new.merge(melted.reset_index(), on=i).set_index(i + [j])