Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/methods/describe.py: 22%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

148 statements  

1""" 

2Module responsible for execution of NDFrame.describe() method. 

3 

4Method NDFrame.describe() delegates actual execution to function describe_ndframe(). 

5""" 

6from __future__ import annotations 

7 

8from abc import ( 

9 ABC, 

10 abstractmethod, 

11) 

12from typing import ( 

13 TYPE_CHECKING, 

14 Any, 

15 Callable, 

16 Hashable, 

17 Sequence, 

18 cast, 

19) 

20 

21import numpy as np 

22 

23from pandas._libs.tslibs import Timestamp 

24from pandas._typing import ( 

25 DtypeObj, 

26 NDFrameT, 

27 npt, 

28) 

29from pandas.util._validators import validate_percentile 

30 

31from pandas.core.dtypes.common import ( 

32 is_bool_dtype, 

33 is_complex_dtype, 

34 is_extension_array_dtype, 

35 is_numeric_dtype, 

36) 

37from pandas.core.dtypes.dtypes import DatetimeTZDtype 

38 

39from pandas.core.arrays.arrow.dtype import ArrowDtype 

40from pandas.core.arrays.floating import Float64Dtype 

41from pandas.core.reshape.concat import concat 

42 

43from pandas.io.formats.format import format_percentiles 

44 

45if TYPE_CHECKING: 

46 from pandas import ( 

47 DataFrame, 

48 Series, 

49 ) 

50 

51 

def describe_ndframe(
    *,
    obj: NDFrameT,
    include: str | Sequence[str] | None,
    exclude: str | Sequence[str] | None,
    percentiles: Sequence[float] | np.ndarray | None,
) -> NDFrameT:
    """Build the summary returned by ``NDFrame.describe()``.

    The percentiles are normalised with ``refine_percentiles`` and the work is
    then handed to a ``SeriesDescriber`` (one-dimensional input) or a
    ``DataFrameDescriber`` (anything else).

    Parameters
    ----------
    obj : DataFrame or Series
        Object to be described.
    include : 'all', list-like of dtypes or None
        Data types to include in the result. Only forwarded to the
        DataFrame describer; not used when ``obj`` is one-dimensional.
    exclude : list-like of dtypes or None
        Data types to omit from the result. Only forwarded to the
        DataFrame describer; not used when ``obj`` is one-dimensional.
    percentiles : list-like of numbers or None
        Percentiles to report; ``None`` selects the defaults chosen by
        ``refine_percentiles``.

    Returns
    -------
    DataFrame or Series
        The describer's result, cast back to the type of ``obj``.
    """
    refined = refine_percentiles(percentiles)

    if obj.ndim == 1:
        series_describer = SeriesDescriber(obj=cast("Series", obj))
        summary = series_describer.describe(percentiles=refined)
        return cast(NDFrameT, summary)

    frame_describer = DataFrameDescriber(
        obj=cast("DataFrame", obj),
        include=include,
        exclude=exclude,
    )
    return cast(NDFrameT, frame_describer.describe(percentiles=refined))

96 

97 

class NDFrameDescriberAbstract(ABC):
    """Common base for the objects that compute a ``describe()`` summary.

    Concrete subclasses must implement :meth:`describe`.

    Parameters
    ----------
    obj : Series or DataFrame
        The object to summarise; stored unchanged on ``self.obj``.
    """

    def __init__(self, obj: DataFrame | Series) -> None:
        self.obj = obj

    @abstractmethod
    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series:
        """Produce the description of ``self.obj``.

        Parameters
        ----------
        percentiles : list-like of numbers
            Percentiles that should appear in the output.

        Returns
        -------
        DataFrame or Series
            The computed description; the concrete type is decided by the
            subclass.
        """

119 

120 

class SeriesDescriber(NDFrameDescriberAbstract):
    """Describer that summarises a single Series."""

    obj: Series

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series:
        """Describe ``self.obj`` with the function matching its dtype.

        Parameters
        ----------
        percentiles : list-like of numbers
            Percentiles forwarded to the selected describe function.
        """
        series = self.obj
        # select_describe_func returns a callable taking (series, percentiles).
        return select_describe_func(series)(series, percentiles)

131 

132 

class DataFrameDescriber(NDFrameDescriberAbstract):
    """Describer that summarises a DataFrame column by column.

    Parameters
    ----------
    obj : DataFrame
        DataFrame to be described.
    include : 'all', list-like of dtypes or None
        Data types to include in the result.
    exclude : list-like of dtypes or None
        Data types to omit from the result.

    Raises
    ------
    ValueError
        If ``obj`` is two-dimensional and has no columns.
    """

    def __init__(
        self,
        obj: DataFrame,
        *,
        include: str | Sequence[str] | None,
        exclude: str | Sequence[str] | None,
    ) -> None:
        self.include = include
        self.exclude = exclude

        has_no_columns = obj.ndim == 2 and obj.columns.size == 0
        if has_no_columns:
            raise ValueError("Cannot describe a DataFrame without columns")

        super().__init__(obj)

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame:
        """Describe every selected column and join the results side by side.

        Parameters
        ----------
        percentiles : list-like of numbers
            Percentiles forwarded to each per-column describe function.
        """
        data = self._select_data()

        # One description Series per selected column, in column order.
        per_column: list[Series] = [
            select_describe_func(column)(column, percentiles)
            for _, column in data.items()
        ]

        # Align every description on one shared row order before joining.
        row_order = reorder_columns(per_column)
        aligned = [desc.reindex(row_order, copy=False) for desc in per_column]
        summary = concat(aligned, axis=1, sort=False)
        summary.columns = data.columns.copy()
        return summary

    def _select_data(self):
        """Return the part of ``self.obj`` that should be described.

        Raises
        ------
        ValueError
            If ``include`` is ``"all"`` while ``exclude`` is not ``None``.
        """
        include, exclude = self.include, self.exclude

        if include is None and exclude is None:
            # Default: numeric and datetime columns, falling back to the
            # whole frame when none of those exist.
            default_include: list[npt.DTypeLike] = [np.number, "datetime"]
            numeric_like = self.obj.select_dtypes(include=default_include)
            if len(numeric_like.columns) == 0:
                return self.obj
            return numeric_like

        if include == "all":
            if exclude is not None:
                msg = "exclude must be None when include is 'all'"
                raise ValueError(msg)
            return self.obj

        return self.obj.select_dtypes(include=include, exclude=exclude)

197 

198 

def reorder_columns(ldesc: Sequence[Series]) -> list[Hashable]:
    """Return the union of all description labels in display order.

    Indexes are visited from shortest to longest (ties keep their input
    order, since the sort is stable), and each label is kept at the position
    where it is first seen.

    Parameters
    ----------
    ldesc : sequence of Series
        Per-column descriptions whose index labels are merged.
    """
    indexes_by_length = sorted((desc.index for desc in ldesc), key=len)
    # A dict keeps first-insertion order, so it acts as an ordered set here.
    ordered_labels = dict.fromkeys(
        label for index in indexes_by_length for label in index
    )
    return list(ordered_labels)

208 

209 

def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing numerical data.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.

    Returns
    -------
    Series
        Values for ``count``, ``mean``, ``std``, ``min``, one entry per
        formatted percentile, and ``max``, named after ``series``.
    """
    from pandas import Series

    labels = ["count", "mean", "std", "min", *format_percentiles(percentiles), "max"]
    values = [
        series.count(),
        series.mean(),
        series.std(),
        series.min(),
        *series.quantile(percentiles).tolist(),
        series.max(),
    ]

    # GH#48340 - always return float on non-complex numeric data
    result_dtype: DtypeObj | None
    if not is_extension_array_dtype(series):
        if is_numeric_dtype(series) and not is_complex_dtype(series):
            result_dtype = np.dtype("float")
        else:
            result_dtype = None
    elif not isinstance(series.dtype, ArrowDtype):
        result_dtype = Float64Dtype()
    elif series.dtype.kind == "m":
        # GH53001: describe timedeltas with object dtype
        result_dtype = None
    else:
        import pyarrow as pa

        result_dtype = ArrowDtype(pa.float64())

    return Series(values, index=labels, name=series.name, dtype=result_dtype)

248 

249 

def describe_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing categorical data.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.

    Returns
    -------
    Series
        Values for ``count``, ``unique``, ``top`` and ``freq``, named after
        ``data``.
    """
    from pandas import Series

    counts = data.value_counts()
    # Only values that actually occur count as unique.
    n_unique = len(counts[counts != 0])

    if n_unique == 0:
        # Nothing to report: fill 'top' and 'freq' with NaN and force object
        # dtype so the output keeps the same four rows.
        top, freq = np.nan, np.nan
        result_dtype = "object"
    else:
        top, freq = counts.index[0], counts.iloc[0]
        result_dtype = None

    return Series(
        [data.count(), n_unique, top, freq],
        index=["count", "unique", "top", "freq"],
        name=data.name,
        dtype=result_dtype,
    )

280 

281 

def describe_timestamp_as_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing timestamp data treated as categorical.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.

    Returns
    -------
    Series
        ``count`` and ``unique`` are always present. When at least one value
        occurs, ``top``, ``freq``, ``first`` and ``last`` follow; otherwise
        only ``top`` and ``freq`` are added, both NaN, with object dtype.
    """
    from pandas import Series

    names = ["count", "unique"]
    objcounts = data.value_counts()
    count_unique = len(objcounts[objcounts != 0])
    result = [data.count(), count_unique]
    dtype = None
    if count_unique > 0:
        top, freq = objcounts.index[0], objcounts.iloc[0]
        tz = data.dt.tz
        top = Timestamp(top)
        if top.tzinfo is not None and tz is not None:
            # Don't tz_localize(None) if key is already tz-aware
            top = top.tz_convert(tz)
        else:
            top = top.tz_localize(tz)
        names += ["top", "freq", "first", "last"]
        # Use the Series reductions rather than Timestamp(<raw int64>):
        # Timestamp reads a bare integer as nanoseconds, which gives the
        # wrong instant for data stored at second/milli/microsecond
        # resolution. min()/max() keep the stored resolution and timezone.
        result += [
            top,
            freq,
            data.min(),
            data.max(),
        ]
    else:
        # No values to report: fill 'top' and 'freq' with NaN and force
        # object dtype.
        names += ["top", "freq"]
        result += [np.nan, np.nan]
        dtype = "object"

    return Series(result, index=names, name=data.name, dtype=dtype)

328 

329 

def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing datetime64 dtype.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.

    Returns
    -------
    Series
        Values for ``count``, ``mean``, ``min``, one entry per formatted
        percentile, and ``max``, named after ``data``.
    """
    # GH-30164
    from pandas import Series

    labels = ["count", "mean", "min", *format_percentiles(percentiles), "max"]
    values = [
        data.count(),
        data.mean(),
        data.min(),
        *data.quantile(percentiles).tolist(),
        data.max(),
    ]
    return Series(values, index=labels, name=data.name)

352 

353 

def select_describe_func(
    data: Series,
) -> Callable:
    """Pick the describe function that matches the dtype of ``data``.

    Checks run in this order: boolean, numeric, datetime (including
    tz-aware), timedelta; anything else is treated as categorical.

    Parameters
    ----------
    data : Series
        Series to be described.

    Returns
    -------
    Callable
        One of ``describe_categorical_1d``, ``describe_numeric_1d`` or
        ``describe_timestamp_1d``.
    """
    dtype = data.dtype

    # The boolean check comes before the numeric one, so booleans are always
    # described as categorical.
    if is_bool_dtype(dtype):
        return describe_categorical_1d
    if is_numeric_dtype(data):
        return describe_numeric_1d
    if dtype.kind == "M" or isinstance(dtype, DatetimeTZDtype):
        return describe_timestamp_1d
    if dtype.kind == "m":
        # Timedeltas share the numeric summary.
        return describe_numeric_1d
    return describe_categorical_1d

374 

375 

def refine_percentiles(
    percentiles: Sequence[float] | np.ndarray | None,
) -> np.ndarray[Any, np.dtype[np.float64]]:
    """
    Normalise the requested percentiles into a sorted, duplicate-free array.

    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. ``None`` selects
        ``[0.25, 0.5, 0.75]``.

    Returns
    -------
    np.ndarray
        Sorted percentiles; ``0.5`` is added when it was not requested.

    Raises
    ------
    ValueError
        If the percentiles (after adding the median) contain duplicates.
        ``validate_percentile`` is also applied and may reject the input.
    """
    if percentiles is None:
        return np.array([0.25, 0.5, 0.75])

    # Work on a private list so the caller's sequence is never mutated.
    requested = list(percentiles)

    # get them all to be in [0, 1]
    validate_percentile(requested)

    # median should always be included
    if 0.5 not in requested:
        requested.append(0.5)

    as_array = np.asarray(requested)

    # np.unique sorts and drops repeats; a shorter result means duplicates.
    unique_pcts = np.unique(as_array)
    if len(unique_pcts) < len(as_array):
        raise ValueError("percentiles cannot contain duplicates")

    return unique_pcts