1from __future__ import annotations
2
3import re
4from typing import (
5 TYPE_CHECKING,
6 Hashable,
7)
8
9import numpy as np
10
11from pandas.util._decorators import Appender
12
13from pandas.core.dtypes.common import (
14 is_extension_array_dtype,
15 is_list_like,
16)
17from pandas.core.dtypes.concat import concat_compat
18from pandas.core.dtypes.missing import notna
19
20import pandas.core.algorithms as algos
21from pandas.core.arrays import Categorical
22import pandas.core.common as com
23from pandas.core.indexes.api import (
24 Index,
25 MultiIndex,
26)
27from pandas.core.reshape.concat import concat
28from pandas.core.reshape.util import tile_compat
29from pandas.core.shared_docs import _shared_docs
30from pandas.core.tools.numeric import to_numeric
31
32if TYPE_CHECKING:
33 from pandas._typing import AnyArrayLike
34
35 from pandas import DataFrame
36
37
@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name: Hashable = "value",
    col_level=None,
    ignore_index: bool = True,
) -> DataFrame:
    # Unpivot `frame` from wide to long format: `id_vars` columns are repeated
    # once per melted column, while the remaining (or explicitly listed
    # `value_vars`) columns are stacked into a single `value_name` column with
    # their labels recorded under `var_name`.  The public docstring is attached
    # by the @Appender decorator from _shared_docs["melt"].

    # If multiindex, gather names of columns on all level for checking presence
    # of `id_vars` and `value_vars`
    if isinstance(frame.columns, MultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    # The melted-values column label must not collide with an existing column.
    if value_name in frame.columns:
        raise ValueError(
            f"value_name ({value_name}) cannot match an element in "
            "the DataFrame columns."
        )

    if id_vars is not None:
        if not is_list_like(id_vars):
            # Scalar id_vars: wrap in a list.
            id_vars = [id_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'id_vars' are not present "
                    f"in the DataFrame: {list(missing)}"
                )
    else:
        id_vars = []

    if value_vars is not None:
        if not is_list_like(value_vars):
            # Scalar value_vars: wrap in a list.
            value_vars = [value_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'value_vars' are not present in "
                    f"the DataFrame: {list(missing)}"
                )
        # Restrict (and order) the frame to just the id/value columns.
        if col_level is not None:
            idx = frame.columns.get_level_values(col_level).get_indexer(
                id_vars + value_vars
            )
        else:
            idx = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars))
        frame = frame.iloc[:, idx]
    else:
        # No explicit value_vars: everything melts.  Copy so the pop() calls
        # below do not mutate the caller's frame.
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    # Derive default variable-column name(s): one per column level, falling
    # back to "variable"/"variable_<i>" when level names are absent or clash.
    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    # N rows; K columns to be melted (total columns minus the id columns).
    N, K = frame.shape
    K -= len(id_vars)

    mdata: dict[Hashable, AnyArrayLike] = {}
    # Each id column is repeated K times (once per melted column).  pop()
    # removes it from `frame`, so only value columns remain afterwards.
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            if K > 0:
                id_data = concat([id_data] * K, ignore_index=True)
            else:
                # We can't concat empty list. (GH 46044)
                id_data = type(id_data)([], name=id_data.name, dtype=id_data.dtype)
        else:
            # error: Incompatible types in assignment (expression has type
            # "ndarray[Any, dtype[Any]]", variable has type "Series")
            id_data = np.tile(id_data._values, K)  # type: ignore[assignment]
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    # Stack all remaining (value) columns end-to-end into a single column.
    if frame.shape[1] > 0:
        mdata[value_name] = concat(
            [frame.iloc[:, i] for i in range(frame.shape[1])]
        ).values
    else:
        # Zero value columns: produce an empty values array of the right dtype.
        mdata[value_name] = frame._values.ravel("F")
    # One "variable" column per column level; each label is repeated N times
    # to line up with the stacked values.
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(frame.columns._get_level_values(i)).repeat(N)

    result = frame._constructor(mdata, columns=mcolumns)

    if not ignore_index:
        # Preserve original row labels, tiled once per melted column.
        result.index = tile_compat(frame.index, K)

    return result
159
160
def lreshape(data: DataFrame, groups, dropna: bool = True) -> DataFrame:
    """
    Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.

    Accepts a dictionary, ``groups``, in which each key is a new column name
    and each value is a list of old column names that will be "melted" under
    the new column name as part of the reshape.

    Parameters
    ----------
    data : DataFrame
        The wide-format DataFrame.
    groups : dict
        {new_name : list_of_columns}.
    dropna : bool, default True
        Do not include columns whose entries are all NaN.

    Returns
    -------
    DataFrame
        Reshaped DataFrame.

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    pivot : Create a spreadsheet-style pivot table as a DataFrame.
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.
    wide_to_long : Wide panel to long format. Less flexible but more
        user-friendly than melt.

    Examples
    --------
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2007], 'year2': [2008, 2008]})
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2007   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team  year   hr
    0  Red Sox  2007  514
    1  Yankees  2007  573
    2  Red Sox  2008  545
    3  Yankees  2008  526
    """
    # Accept either a mapping or an iterable of (new_name, columns) pairs.
    if isinstance(groups, dict):
        keys, values = list(groups.keys()), list(groups.values())
    else:
        keys, values = zip(*groups)

    # Columns not claimed by any group pass through as identifier columns.
    melted_cols = list(set.union(*(set(group) for group in values)))
    id_cols = list(data.columns.difference(melted_cols))

    # Every group must contribute the same number of source columns.
    group_len = len(values[0])
    if any(len(group) != group_len for group in values):
        raise ValueError("All column lists must be same length")

    mdata = {}
    pivot_cols = []

    # Stack each group's source columns end-to-end under its new name.
    for new_name, old_names in zip(keys, values):
        stacked = [data[old]._values for old in old_names]
        mdata[new_name] = concat_compat(stacked)
        pivot_cols.append(new_name)

    # Identifier columns repeat once per source column in a group.
    for col in id_cols:
        mdata[col] = np.tile(data[col]._values, group_len)

    if dropna:
        # Keep only rows where every melted column is non-missing.
        keep = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for col in pivot_cols:
            keep &= notna(mdata[col])
        if not keep.all():
            mdata = {name: arr[keep] for name, arr in mdata.items()}

    return data._constructor(mdata, columns=id_cols + pivot_cols)
249
250
def wide_to_long(
    df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
) -> DataFrame:
    r"""
    Unpivot a DataFrame from wide to long format.

    Less flexible but more user-friendly than melt.

    With stubnames ['A', 'B'], this function expects to find one or more
    group of columns with format
    A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
    You specify what you want to call this suffix in the resulting long format
    with `j` (for example `j='year'`)

    Each row of these wide variables are assumed to be uniquely identified by
    `i` (can be a single column name or a list of column names)

    All remaining variables in the data frame are left intact.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame.
    stubnames : str or list-like
        The stub name(s). The wide format variables are assumed to
        start with the stub names.
    i : str or list-like
        Column(s) to use as id variable(s).
    j : str
        The name of the sub-observation variable. What you wish to name your
        suffix in the long format.
    sep : str, default ""
        A character indicating the separation of the variable names
        in the wide format, to be stripped from the names in the long format.
        For example, if your column names are A-suffix1, A-suffix2, you
        can strip the hyphen by specifying `sep='-'`.
    suffix : str, default '\\d+'
        A regular expression capturing the wanted suffixes. '\\d+' captures
        numeric suffixes. Suffixes with no numbers could be specified with the
        negated character class '\\D+'. You can also further disambiguate
        suffixes, for example, if your wide variables are of the form A-one,
        B-two,.., and you have an unrelated column A-rating, you can ignore the
        last one by specifying `suffix='(!?one|two)'`. When all suffixes are
        numeric, they are cast to int64/float64.

    Returns
    -------
    DataFrame
        A DataFrame that contains each stub name as a variable, with new index
        (i, j).

    See Also
    --------
    melt : Unpivot a DataFrame from wide to long format, optionally leaving
        identifiers set.
    pivot : Create a spreadsheet-style pivot table as a DataFrame.
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.

    Notes
    -----
    All extra variables are left untouched. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.

    Examples
    --------
    >>> np.random.seed(123)
    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
    ...                    "X"     : dict(zip(range(3), np.random.randn(3)))
    ...                   })
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
    ... # doctest: +NORMALIZE_WHITESPACE
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    With multiple id columns

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9
    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     1    2.8
                2    3.4
          2     1    2.9
                2    3.8
          3     1    2.2
                2    2.9
    2     1     1    2.0
                2    3.2
          2     1    1.8
                2    2.8
          3     1    1.9
                2    2.4
    3     1     1    2.2
                2    3.3
          2     1    2.3
                2    3.4
          3     1    2.1
                2    2.9

    Going from long back to wide just takes some creative use of `unstack`

    >>> w = l.unstack()
    >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
    >>> w.reset_index()
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9

    Less wieldy column names are also handled

    >>> np.random.seed(0)
    >>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3),
    ...                    'A(weekly)-2011': np.random.rand(3),
    ...                    'B(weekly)-2010': np.random.rand(3),
    ...                    'B(weekly)-2011': np.random.rand(3),
    ...                    'X' : np.random.randint(3, size=3)})
    >>> df['id'] = df.index
    >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
       A(weekly)-2010  A(weekly)-2011  B(weekly)-2010  B(weekly)-2011  X  id
    0        0.548814        0.544883        0.437587        0.383442  0   0
    1        0.715189        0.423655        0.891773        0.791725  1   1
    2        0.602763        0.645894        0.963663        0.528895  1   2

    >>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id',
    ...                 j='year', sep='-')
    ... # doctest: +NORMALIZE_WHITESPACE
             X  A(weekly)  B(weekly)
    id year
    0  2010  0   0.548814   0.437587
    1  2010  1   0.715189   0.891773
    2  2010  1   0.602763   0.963663
    0  2011  0   0.544883   0.383442
    1  2011  1   0.423655   0.791725
    2  2011  1   0.645894   0.528895

    If we have many columns, we could also use a regex to find our
    stubnames and pass that list on to wide_to_long

    >>> stubnames = sorted(
    ...     set([match[0] for match in df.columns.str.findall(
    ...         r'[A-B]\(.*\)').values if match != []])
    ... )
    >>> list(stubnames)
    ['A(weekly)', 'B(weekly)']

    All of the above examples have integers as suffixes. It is possible to
    have non-integers as suffixes.

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       famid  birth  ht_one  ht_two
    0      1      1     2.8     3.4
    1      1      2     2.9     3.8
    2      1      3     2.2     2.9
    3      2      1     2.0     3.2
    4      2      2     1.8     2.8
    5      2      3     1.9     2.4
    6      3      1     2.2     3.3
    7      3      2     2.3     3.4
    8      3      3     2.1     2.9

    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
    ...                     sep='_', suffix=r'\w+')
    >>> l
    ... # doctest: +NORMALIZE_WHITESPACE
                      ht
    famid birth age
    1     1     one  2.8
                two  3.4
          2     one  2.9
                two  3.8
          3     one  2.2
                two  2.9
    2     1     one  2.0
                two  3.2
          2     one  1.8
                two  2.8
          3     one  1.9
                two  2.4
    3     1     one  2.2
                two  3.3
          2     one  2.3
                two  3.4
          3     one  2.1
                two  2.9
    """

    def get_var_names(df, stub: str, sep: str, suffix: str) -> list[str]:
        # Columns that belong to this stub match "<stub><sep><suffix>",
        # anchored at both ends; stub and sep are escaped so that regex
        # metacharacters in column names (e.g. "A(weekly)") match literally.
        regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
        pattern = re.compile(regex)
        return [col for col in df.columns if pattern.match(col)]

    def melt_stub(df, stub: str, i, j, value_vars, sep: str):
        # Melt one stub's columns into a single value column named after the
        # stub, with the stripped suffix recorded under `j`.
        # NOTE(review): rstrip strips a *character set*, so a stub whose last
        # characters appear in `sep` loses them too — confirm this edge case
        # is acceptable before relying on such stub/sep combinations.
        newdf = melt(
            df,
            id_vars=i,
            value_vars=value_vars,
            value_name=stub.rstrip(sep),
            var_name=j,
        )
        newdf[j] = Categorical(newdf[j])
        newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True)

        # GH17627 Cast numerics suffixes to int/float.  ``errors="ignore"``
        # is deprecated, so attempt the conversion and fall back to the
        # unconverted (string) suffixes when they are not numeric.
        try:
            newdf[j] = to_numeric(newdf[j])
        except (TypeError, ValueError, OverflowError):
            # Non-numeric suffixes (e.g. "one", "two") stay as-is.
            pass

        return newdf.set_index(i + [j])

    if not is_list_like(stubnames):
        stubnames = [stubnames]
    else:
        stubnames = list(stubnames)

    # A stub that is itself a column name would be ambiguous.
    if any(col in stubnames for col in df.columns):
        raise ValueError("stubname can't be identical to a column name")

    if not is_list_like(i):
        i = [i]
    else:
        i = list(i)

    # The id variables must uniquely identify rows for the joins below.
    if df[i].duplicated().any():
        raise ValueError("the id variables need to uniquely identify each row")

    # One list of matching wide columns per stub.
    value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]

    value_vars_flattened = [e for sublist in value_vars for e in sublist]
    # Everything not claimed by a stub passes through untouched.
    id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))

    # Melt each stub separately, then align them on the (i, j) index.
    _melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)]
    melted = _melted[0].join(_melted[1:], how="outer")

    if len(i) == 1:
        new = df[id_vars].set_index(i).join(melted)
        return new

    new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])

    return new