Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/methods/describe.py: 22%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

151 statements  

1""" 

Module responsible for execution of NDFrame.describe() method.

Method NDFrame.describe() delegates actual execution to function describe_ndframe().

5""" 

6from __future__ import annotations 

7 

8from abc import ( 

9 ABC, 

10 abstractmethod, 

11) 

12from typing import ( 

13 TYPE_CHECKING, 

14 Callable, 

15 cast, 

16) 

17 

18import numpy as np 

19 

20from pandas._libs.tslibs import Timestamp 

21from pandas._typing import ( 

22 DtypeObj, 

23 NDFrameT, 

24 npt, 

25) 

26from pandas.util._validators import validate_percentile 

27 

28from pandas.core.dtypes.common import ( 

29 is_bool_dtype, 

30 is_numeric_dtype, 

31) 

32from pandas.core.dtypes.dtypes import ( 

33 ArrowDtype, 

34 DatetimeTZDtype, 

35 ExtensionDtype, 

36) 

37 

38from pandas.core.arrays.floating import Float64Dtype 

39from pandas.core.reshape.concat import concat 

40 

41from pandas.io.formats.format import format_percentiles 

42 

43if TYPE_CHECKING: 

44 from collections.abc import ( 

45 Hashable, 

46 Sequence, 

47 ) 

48 

49 from pandas import ( 

50 DataFrame, 

51 Series, 

52 ) 

53 

54 

def describe_ndframe(
    *,
    obj: NDFrameT,
    include: str | Sequence[str] | None,
    exclude: str | Sequence[str] | None,
    percentiles: Sequence[float] | np.ndarray | None,
) -> NDFrameT:
    """Build the description of a Series or DataFrame.

    Entry point used by pandas.core.generic.NDFrame.describe().

    Parameters
    ----------
    obj : DataFrame or Series
        Object to be described.
    include : 'all', list-like of dtypes or None (default), optional
        Data types to include in the result. Only forwarded for DataFrames;
        it has no effect for a ``Series``.
    exclude : list-like of dtypes or None (default), optional
        Data types to omit from the result. Only forwarded for DataFrames;
        it has no effect for a ``Series``.
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between
        0 and 1. ``None`` selects ``[.25, .5, .75]``.

    Returns
    -------
    DataFrame or Series
        Description of ``obj``.
    """
    # Percentiles are normalised before a describer is built, so an invalid
    # percentile list is reported ahead of any problem with ``obj`` itself.
    refined = _refine_percentiles(percentiles)

    describer: NDFrameDescriberAbstract
    if obj.ndim != 1:
        describer = DataFrameDescriber(
            obj=cast("DataFrame", obj),
            include=include,
            exclude=exclude,
        )
    else:
        describer = SeriesDescriber(obj=cast("Series", obj))

    return cast(NDFrameT, describer.describe(percentiles=refined))

99 

100 

class NDFrameDescriberAbstract(ABC):
    """Common base for objects that describe a Series or a DataFrame.

    Parameters
    ----------
    obj : Series or DataFrame
        The object whose description will be produced.
    """

    def __init__(self, obj: DataFrame | Series) -> None:
        # Kept on the instance so ``describe`` implementations can read it.
        self.obj = obj

    @abstractmethod
    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series:
        """Produce the description of ``self.obj``.

        Concrete subclasses must override this method.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """

122 

123 

class SeriesDescriber(NDFrameDescriberAbstract):
    """Describer for a single Series."""

    obj: Series

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series:
        """Describe ``self.obj`` with the function chosen for its dtype.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """
        target = self.obj
        return select_describe_func(target)(target, percentiles)

134 

135 

class DataFrameDescriber(NDFrameDescriberAbstract):
    """Describer for a DataFrame, working column by column.

    Parameters
    ----------
    obj : DataFrame
        DataFrame to be described.
    include : 'all', list-like of dtypes or None
        Data types to include in the result.
    exclude : list-like of dtypes or None
        Data types to omit from the result.

    Raises
    ------
    ValueError
        If ``obj`` is two-dimensional and has no columns.
    """

    obj: DataFrame

    def __init__(
        self,
        obj: DataFrame,
        *,
        include: str | Sequence[str] | None,
        exclude: str | Sequence[str] | None,
    ) -> None:
        self.include = include
        self.exclude = exclude

        has_no_columns = obj.ndim == 2 and obj.columns.size == 0
        if has_no_columns:
            raise ValueError("Cannot describe a DataFrame without columns")

        super().__init__(obj)

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame:
        """Describe every selected column and combine the results.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """
        selected = self._select_data()

        # One description Series per column, each produced by the function
        # that matches that column's dtype.
        per_column: list[Series] = [
            select_describe_func(column)(column, percentiles)
            for _, column in selected.items()
        ]

        # Columns may yield different statistics; align them on one shared
        # row order before placing them side by side.
        row_labels = reorder_columns(per_column)
        aligned = [desc.reindex(row_labels, copy=False) for desc in per_column]
        described = concat(aligned, axis=1, sort=False)
        described.columns = selected.columns.copy()
        return described

    def _select_data(self) -> DataFrame:
        """Select columns to be described."""
        include, exclude = self.include, self.exclude

        if include is None and exclude is None:
            # Default: keep numeric and datetime columns, but fall back to
            # the whole frame when that selection is empty.
            default_include: list[npt.DTypeLike] = [np.number, "datetime"]
            picked = self.obj.select_dtypes(include=default_include)
            return picked if len(picked.columns) != 0 else self.obj

        if include == "all":
            if exclude is not None:
                msg = "exclude must be None when include is 'all'"
                raise ValueError(msg)
            return self.obj

        return self.obj.select_dtypes(include=include, exclude=exclude)

202 

203 

def reorder_columns(ldesc: Sequence[Series]) -> list[Hashable]:
    """Return the row labels of all descriptions in a display-friendly order.

    Indexes are visited from shortest to longest (ties keep their input
    order); each label is kept at the position where it is first seen.

    Parameters
    ----------
    ldesc : sequence of Series
        Per-column descriptions whose index labels are to be merged.
    """
    shortest_first = sorted((desc.index for desc in ldesc), key=len)
    # dict keys keep insertion order and drop repeats, which gives the
    # "first occurrence wins" de-duplication in a single pass.
    ordered = dict.fromkeys(label for index in shortest_first for label in index)
    return list(ordered)

215 

216 

def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
    """Summarise a Series of numerical data.

    The result holds ``count``, ``mean``, ``std``, ``min``, one entry per
    requested percentile, and ``max``.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    from pandas import Series

    labels = ["count", "mean", "std", "min", *format_percentiles(percentiles), "max"]
    values = [
        series.count(),
        series.mean(),
        series.std(),
        series.min(),
        *series.quantile(percentiles).tolist(),
        series.max(),
    ]

    # GH#48340 - always return float on non-complex numeric data
    source_dtype = series.dtype
    result_dtype: DtypeObj | None
    if isinstance(source_dtype, ArrowDtype):
        if source_dtype.kind == "m":
            # GH53001: describe timedeltas with object dtype
            result_dtype = None
        else:
            import pyarrow as pa

            result_dtype = ArrowDtype(pa.float64())
    elif isinstance(source_dtype, ExtensionDtype):
        result_dtype = Float64Dtype()
    elif source_dtype.kind in "iufb":
        # i.e. numeric but exclude complex dtype
        result_dtype = np.dtype("float")
    else:
        # Anything else: let the Series constructor infer the dtype.
        result_dtype = None

    return Series(values, index=labels, name=series.name, dtype=result_dtype)

256 

257 

def describe_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Summarise a Series of categorical data.

    The result always holds four entries: ``count``, ``unique``, ``top``
    and ``freq``.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.
    """
    from pandas import Series

    counts = data.value_counts()
    # Entries whose count is zero are not counted as distinct values.
    n_unique = len(counts[counts != 0])

    if n_unique > 0:
        # The first entry of the value counts is reported as the top value.
        top = counts.index[0]
        freq = counts.iloc[0]
        result_dtype = None
    else:
        # Nothing was observed: fill 'top' and 'freq' with NaN and force
        # object dtype so the output keeps its four-row shape.
        top = freq = np.nan
        result_dtype = "object"

    return Series(
        [data.count(), n_unique, top, freq],
        index=["count", "unique", "top", "freq"],
        name=data.name,
        dtype=result_dtype,
    )

288 

289 

def describe_timestamp_as_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing timestamp data treated as categorical.

    The result holds ``count``, ``unique``, ``top`` and ``freq``; when at
    least one value is present it additionally holds ``first`` and ``last``
    (the earliest and latest timestamps).

    Parameters
    ----------
    data : Series
        Series to be described. Its ``.dt`` accessor is used, so it must
        hold datetime-like values.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.
    """
    from pandas import Series

    names = ["count", "unique"]
    objcounts = data.value_counts()
    count_unique = len(objcounts[objcounts != 0])
    result: list[float | Timestamp] = [data.count(), count_unique]
    dtype = None
    if count_unique > 0:
        top, freq = objcounts.index[0], objcounts.iloc[0]
        tz = data.dt.tz
        top = Timestamp(top)
        if top.tzinfo is not None and tz is not None:
            # Don't tz_localize(None) if key is already tz-aware
            top = top.tz_convert(tz)
        else:
            top = top.tz_localize(tz)
        names += ["top", "freq", "first", "last"]
        # Take the extremes from the Series itself instead of rebuilding
        # them from an int64 view: Timestamp(int) reads the integer as
        # nanoseconds, which is wrong for data stored at another resolution
        # (e.g. datetime64[s]). min()/max() skip missing values and keep
        # the timezone of the data.
        result += [top, freq, data.min(), data.max()]
    else:
        # Nothing was observed: fill 'top' and 'freq' with NaN and force
        # object dtype so the output shape stays consistent.
        names += ["top", "freq"]
        result += [np.nan, np.nan]
        dtype = "object"

    return Series(result, index=names, name=data.name, dtype=dtype)

336 

337 

def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series:
    """Summarise a Series of datetime64 dtype.

    The result holds ``count``, ``mean``, ``min``, one entry per requested
    percentile, and ``max``.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    # GH-30164
    from pandas import Series

    labels = ["count", "mean", "min", *format_percentiles(percentiles), "max"]
    values = [
        data.count(),
        data.mean(),
        data.min(),
        *data.quantile(percentiles).tolist(),
        data.max(),
    ]
    return Series(values, index=labels, name=data.name)

360 

361 

def select_describe_func(
    data: Series,
) -> Callable:
    """Pick the describe function that matches the dtype of ``data``.

    Parameters
    ----------
    data : Series
        Series to be described.

    Returns
    -------
    Callable
        One of the ``describe_*_1d`` functions; each accepts the Series and
        a list of percentiles.
    """
    dtype = data.dtype

    # Booleans are checked before the numeric test and are summarised as
    # categories.
    if is_bool_dtype(dtype):
        return describe_categorical_1d
    if is_numeric_dtype(data):
        return describe_numeric_1d
    if dtype.kind == "M" or isinstance(dtype, DatetimeTZDtype):
        return describe_timestamp_1d
    if dtype.kind == "m":
        # Timedeltas share the numeric summary.
        return describe_numeric_1d
    return describe_categorical_1d

382 

383 

def _refine_percentiles(
    percentiles: Sequence[float] | np.ndarray | None,
) -> npt.NDArray[np.float64]:
    """
    Ensure that percentiles are unique and sorted.

    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. ``None`` selects
        ``[0.25, 0.5, 0.75]``.

    Returns
    -------
    np.ndarray
        Sorted percentiles, with the median (0.5) added when it was missing.

    Raises
    ------
    ValueError
        If the percentiles contain duplicates, or if ``validate_percentile``
        rejects them.
    """
    if percentiles is None:
        return np.array([0.25, 0.5, 0.75])

    # Work on a private list so the caller's object is never modified.
    requested = list(percentiles)

    # get them all to be in [0, 1]
    validate_percentile(requested)

    # median should always be included
    if 0.5 not in requested:
        requested.append(0.5)

    as_array = np.asarray(requested)

    # np.unique sorts and drops repeats; a shorter result means duplicates.
    deduplicated = np.unique(as_array)
    if len(deduplicated) < len(as_array):
        raise ValueError("percentiles cannot contain duplicates")

    return deduplicated