Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/reshape/melt.py: 15%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

138 statements  

1from __future__ import annotations 

2 

3import re 

4from typing import ( 

5 TYPE_CHECKING, 

6 Hashable, 

7) 

8 

9import numpy as np 

10 

11from pandas.util._decorators import Appender 

12 

13from pandas.core.dtypes.common import ( 

14 is_extension_array_dtype, 

15 is_list_like, 

16) 

17from pandas.core.dtypes.concat import concat_compat 

18from pandas.core.dtypes.missing import notna 

19 

20import pandas.core.algorithms as algos 

21from pandas.core.arrays import Categorical 

22import pandas.core.common as com 

23from pandas.core.indexes.api import ( 

24 Index, 

25 MultiIndex, 

26) 

27from pandas.core.reshape.concat import concat 

28from pandas.core.reshape.util import tile_compat 

29from pandas.core.shared_docs import _shared_docs 

30from pandas.core.tools.numeric import to_numeric 

31 

32if TYPE_CHECKING: 

33 from pandas._typing import AnyArrayLike 

34 

35 from pandas import DataFrame 

36 

37 

# ``melt``'s user-facing docstring comes from the shared-docs template; the
# ``Appender`` decorator fills in the caller-specific pieces.
@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name: Hashable = "value",
    col_level=None,
    ignore_index: bool = True,
) -> DataFrame:
    # Unpivot ``frame`` from wide to long: ``id_vars`` columns are repeated
    # once per melted column, every remaining (or explicitly listed
    # ``value_vars``) column is stacked into a single ``value_name`` column,
    # and the originating column labels are recorded under ``var_name``.
    #
    # If multiindex, gather names of columns on all level for checking presence
    # of `id_vars` and `value_vars`
    if isinstance(frame.columns, MultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    # The melted value column must not collide with an existing column label.
    if value_name in frame.columns:
        raise ValueError(
            f"value_name ({value_name}) cannot match an element in "
            "the DataFrame columns."
        )

    # Normalize ``id_vars`` to a list and validate membership in ``cols``.
    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'id_vars' are not present "
                    f"in the DataFrame: {list(missing)}"
                )
    else:
        id_vars = []

    # Normalize ``value_vars`` the same way, then slice ``frame`` down to the
    # id + value columns (this also makes ``frame`` a new object, so the
    # ``frame.pop`` calls below never mutate the caller's DataFrame).
    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'value_vars' are not present in "
                    f"the DataFrame: {list(missing)}"
                )
        if col_level is not None:
            # NOTE(review): unlike the branch below, positions are not passed
            # through ``algos.unique`` here — confirm duplicate handling.
            idx = frame.columns.get_level_values(col_level).get_indexer(
                id_vars + value_vars
            )
        else:
            idx = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars))
        frame = frame.iloc[:, idx]
    else:
        # No ``value_vars``: melt every non-id column; copy so pops are local.
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    # Derive ``var_name`` when not supplied: MultiIndex level names when they
    # are unique, synthetic ``variable_i`` names otherwise, and the single
    # columns name (or the literal "variable") for a flat index.
    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    # N rows, K value columns (shape captured before the pops below shrink
    # ``frame`` to value columns only).
    N, K = frame.shape
    K -= len(id_vars)

    mdata: dict[Hashable, AnyArrayLike] = {}
    for col in id_vars:
        # ``pop`` removes the id column so only value columns remain in frame.
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            # Extension arrays can't go through np.tile; concat preserves dtype.
            if K > 0:
                id_data = concat([id_data] * K, ignore_index=True)
            else:
                # We can't concat empty list. (GH 46044)
                id_data = type(id_data)([], name=id_data.name, dtype=id_data.dtype)
        else:
            # error: Incompatible types in assignment (expression has type
            # "ndarray[Any, dtype[Any]]", variable has type "Series")
            id_data = np.tile(id_data._values, K)  # type: ignore[assignment]
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    # Stack the K value columns end-to-end (column-major order) into one array.
    if frame.shape[1] > 0:
        mdata[value_name] = concat(
            [frame.iloc[:, i] for i in range(frame.shape[1])]
        ).values
    else:
        mdata[value_name] = frame._values.ravel("F")
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(frame.columns._get_level_values(i)).repeat(N)

    result = frame._constructor(mdata, columns=mcolumns)

    # ``ignore_index=False`` tiles the original row index K times instead of
    # the default fresh RangeIndex.
    if not ignore_index:
        result.index = tile_compat(frame.index, K)

    return result

159 

160 

def lreshape(data: DataFrame, groups, dropna: bool = True) -> DataFrame:
    """
    Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.

    Accepts a dictionary, ``groups``, in which each key is a new column name
    and each value is a list of old column names that will be "melted" under
    the new column name as part of the reshape.

    Parameters
    ----------
    data : DataFrame
        The wide-format DataFrame.
    groups : dict
        {new_name : list_of_columns}.
    dropna : bool, default True
        Do not include columns whose entries are all NaN.

    Returns
    -------
    DataFrame
        Reshaped DataFrame.

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    pivot : Create a spreadsheet-style pivot table as a DataFrame.
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.
    wide_to_long : Wide panel to long format. Less flexible but more
        user-friendly than melt.

    Examples
    --------
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2007], 'year2': [2008, 2008]})
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2007   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team  year   hr
    0  Red Sox  2007  514
    1  Yankees  2007  573
    2  Red Sox  2008  545
    3  Yankees  2008  526
    """
    # Normalize ``groups`` into parallel sequences: the new column names and
    # the lists of existing columns that melt into each of them.
    if isinstance(groups, dict):
        stub_names = list(groups.keys())
        column_lists = list(groups.values())
    else:
        stub_names, column_lists = zip(*groups)

    # Columns not claimed by any group ride along as identifier columns.
    melted_cols = list(set.union(*(set(cols) for cols in column_lists)))
    id_cols = list(data.columns.difference(melted_cols))

    # Every group must contribute the same number of source columns, since
    # the groups are stacked in lockstep.
    group_len = len(column_lists[0])
    if any(len(cols) != group_len for cols in column_lists):
        raise ValueError("All column lists must be same length")

    # Stack each group's source columns end-to-end under its new name.
    mdata = {
        new_name: concat_compat([data[col]._values for col in cols])
        for new_name, cols in zip(stub_names, column_lists)
    }
    pivot_cols = list(stub_names)

    # Identifier columns are repeated once per stacked source column.
    for col in id_cols:
        mdata[col] = np.tile(data[col]._values, group_len)

    if dropna:
        # Keep only rows where every melted column is non-null.
        keep = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for name in pivot_cols:
            keep &= notna(mdata[name])
        if not keep.all():
            mdata = {name: arr[keep] for name, arr in mdata.items()}

    return data._constructor(mdata, columns=id_cols + pivot_cols)

249 

250 

def wide_to_long(
    df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
) -> DataFrame:
    r"""
    Unpivot a DataFrame from wide to long format.

    Less flexible but more user-friendly than melt.

    With stubnames ['A', 'B'], this function expects to find one or more
    group of columns with format
    A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
    You specify what you want to call this suffix in the resulting long format
    with `j` (for example `j='year'`)

    Each row of these wide variables are assumed to be uniquely identified by
    `i` (can be a single column name or a list of column names)

    All remaining variables in the data frame are left intact.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame.
    stubnames : str or list-like
        The stub name(s). The wide format variables are assumed to
        start with the stub names.
    i : str or list-like
        Column(s) to use as id variable(s).
    j : str
        The name of the sub-observation variable. What you wish to name your
        suffix in the long format.
    sep : str, default ""
        A character indicating the separation of the variable names
        in the wide format, to be stripped from the names in the long format.
        For example, if your column names are A-suffix1, A-suffix2, you
        can strip the hyphen by specifying `sep='-'`.
    suffix : str, default '\\d+'
        A regular expression capturing the wanted suffixes. '\\d+' captures
        numeric suffixes. Suffixes with no numbers could be specified with the
        negated character class '\\D+'. You can also further disambiguate
        suffixes, for example, if your wide variables are of the form A-one,
        B-two,.., and you have an unrelated column A-rating, you can ignore the
        last one by specifying `suffix='(!?one|two)'`. When all suffixes are
        numeric, they are cast to int64/float64.

    Returns
    -------
    DataFrame
        A DataFrame that contains each stub name as a variable, with new index
        (i, j).

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    pivot : Create a spreadsheet-style pivot table as a DataFrame.
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.

    Notes
    -----
    All extra variables are left untouched. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.

    Examples
    --------
    >>> np.random.seed(123)
    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
    ...                    "X"     : dict(zip(range(3), np.random.randn(3)))
    ...                   })
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
    ... # doctest: +NORMALIZE_WHITESPACE
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    With multiple id columns

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9
    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     1    2.8
                2    3.4
          2     1    2.9
                2    3.8
          3     1    2.2
                2    2.9
    2     1     1    2.0
                2    3.2
          2     1    1.8
                2    2.8
          3     1    1.9
                2    2.4
    3     1     1    2.2
                2    3.3
          2     1    2.3
                2    3.4
          3     1    2.1
                2    2.9

    Going from long back to wide just takes some creative use of `unstack`

    >>> w = l.unstack()
    >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
    >>> w.reset_index()
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9

    Less wieldy column names are also handled

    >>> np.random.seed(0)
    >>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3),
    ...                    'A(weekly)-2011': np.random.rand(3),
    ...                    'B(weekly)-2010': np.random.rand(3),
    ...                    'B(weekly)-2011': np.random.rand(3),
    ...                    'X' : np.random.randint(3, size=3)})
    >>> df['id'] = df.index
    >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
       A(weekly)-2010  A(weekly)-2011  B(weekly)-2010  B(weekly)-2011  X  id
    0        0.548814        0.544883        0.437587        0.383442  0   0
    1        0.715189        0.423655        0.891773        0.791725  1   1
    2        0.602763        0.645894        0.963663        0.528895  1   2

    >>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id',
    ...                 j='year', sep='-')
    ... # doctest: +NORMALIZE_WHITESPACE
             X  A(weekly)  B(weekly)
    id year
    0  2010  0   0.548814   0.437587
    1  2010  1   0.715189   0.891773
    2  2010  1   0.602763   0.963663
    0  2011  0   0.544883   0.383442
    1  2011  1   0.423655   0.791725
    2  2011  1   0.645894   0.528895

    If we have many columns, we could also use a regex to find our
    stubnames and pass that list on to wide_to_long

    >>> stubnames = sorted(
    ...     set([match[0] for match in df.columns.str.findall(
    ...         r'[A-B]\(.*\)').values if match != []])
    ... )
    >>> list(stubnames)
    ['A(weekly)', 'B(weekly)']

    All of the above examples have integers as suffixes. It is possible to
    have non-integers as suffixes.

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht_one  ht_two
    0      1      1     2.8     3.4
    1      1      2     2.9     3.8
    2      1      3     2.2     2.9
    3      2      1     2.0     3.2
    4      2      2     1.8     2.8
    5      2      3     1.9     2.4
    6      3      1     2.2     3.3
    7      3      2     2.3     3.4
    8      3      3     2.1     2.9

    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
    ...                     sep='_', suffix=r'\w+')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     one  2.8
                two  3.4
          2     one  2.9
                two  3.8
          3     one  2.2
                two  2.9
    2     1     one  2.0
                two  3.2
          2     one  1.8
                two  2.8
          3     one  1.9
                two  2.4
    3     1     one  2.2
                two  3.3
          2     one  2.3
                two  3.4
          3     one  2.1
                two  2.9
    """

    def get_var_names(df, stub: str, sep: str, suffix: str) -> list[str]:
        # Columns matching ``stub`` + ``sep`` + ``suffix`` exactly; ``stub``
        # and ``sep`` are escaped, ``suffix`` is used as a raw regex.
        regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
        pattern = re.compile(regex)
        return [col for col in df.columns if pattern.match(col)]

    def melt_stub(df, stub: str, i, j, value_vars, sep: str):
        # Melt one stub's columns into a single value column named after the
        # stub (with any trailing ``sep`` characters stripped via rstrip).
        newdf = melt(
            df,
            id_vars=i,
            value_vars=value_vars,
            value_name=stub.rstrip(sep),
            var_name=j,
        )
        newdf[j] = Categorical(newdf[j])
        # Strip the "stub + sep" prefix so only the suffix remains in ``j``.
        newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True)

        # GH17627 Cast numerics suffixes to int/float
        # NOTE(review): ``errors="ignore"`` leaves non-numeric suffixes
        # unchanged; this option is deprecated in recent pandas — confirm
        # before upgrading.
        newdf[j] = to_numeric(newdf[j], errors="ignore")

        return newdf.set_index(i + [j])

    # Normalize ``stubnames`` to a list.
    if not is_list_like(stubnames):
        stubnames = [stubnames]
    else:
        stubnames = list(stubnames)

    # A stub equal to an existing column would collide with the melted
    # value column of the same name.
    if any(col in stubnames for col in df.columns):
        raise ValueError("stubname can't be identical to a column name")

    # Normalize ``i`` to a list.
    if not is_list_like(i):
        i = [i]
    else:
        i = list(i)

    # The id columns must uniquely key each row for the joins below.
    if df[i].duplicated().any():
        raise ValueError("the id variables need to uniquely identify each row")

    # One list of matching wide columns per stub.
    value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]

    # Everything not matched by any stub pattern is carried along untouched.
    value_vars_flattened = [e for sublist in value_vars for e in sublist]
    id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))

    # Melt each stub separately, then align the per-stub frames on the
    # shared (i..., j) index.
    _melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)]
    melted = _melted[0].join(_melted[1:], how="outer")

    if len(i) == 1:
        # Single id column: a plain index join suffices.
        new = df[id_vars].set_index(i).join(melted)
        return new

    # Multiple id columns: merge on the id columns, then restore (i..., j).
    new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])

    return new