1"""
2Module responsible for execution of NDFrame.describe() method.
3
4Method NDFrame.describe() delegates actual execution to function describe_ndframe().
5"""
6from __future__ import annotations
7
8from abc import (
9 ABC,
10 abstractmethod,
11)
12from typing import (
13 TYPE_CHECKING,
14 Callable,
15 cast,
16)
17
18import numpy as np
19
20from pandas._libs.tslibs import Timestamp
21from pandas._typing import (
22 DtypeObj,
23 NDFrameT,
24 npt,
25)
26from pandas.util._validators import validate_percentile
27
28from pandas.core.dtypes.common import (
29 is_bool_dtype,
30 is_numeric_dtype,
31)
32from pandas.core.dtypes.dtypes import (
33 ArrowDtype,
34 DatetimeTZDtype,
35 ExtensionDtype,
36)
37
38from pandas.core.arrays.floating import Float64Dtype
39from pandas.core.reshape.concat import concat
40
41from pandas.io.formats.format import format_percentiles
42
43if TYPE_CHECKING:
44 from collections.abc import (
45 Hashable,
46 Sequence,
47 )
48
49 from pandas import (
50 DataFrame,
51 Series,
52 )
53
54
def describe_ndframe(
    *,
    obj: NDFrameT,
    include: str | Sequence[str] | None,
    exclude: str | Sequence[str] | None,
    percentiles: Sequence[float] | np.ndarray | None,
) -> NDFrameT:
    """Build the description of a Series or DataFrame.

    This is the function ``pandas.core.generic.NDFrame.describe()`` delegates to.

    Parameters
    ----------
    obj : DataFrame or Series
        Object to be described.
    include : 'all', list-like of dtypes or None
        Data types to keep in the result. Only forwarded when ``obj`` is not
        one-dimensional, i.e. it is ignored for ``Series``.
    exclude : list-like of dtypes or None
        Data types to leave out of the result. Only forwarded when ``obj`` is
        not one-dimensional, i.e. it is ignored for ``Series``.
    percentiles : list-like of numbers or None
        Percentiles to report. They are normalised by ``_refine_percentiles``
        before being handed to the describer.

    Returns
    -------
    DataFrame or Series
        Description produced by the describer matching ``obj``.
    """
    # Percentiles are refined first so that invalid percentiles are reported
    # before any problem with the object itself.
    refined = _refine_percentiles(percentiles)

    describer: NDFrameDescriberAbstract
    if obj.ndim != 1:
        describer = DataFrameDescriber(
            obj=cast("DataFrame", obj),
            include=include,
            exclude=exclude,
        )
    else:
        describer = SeriesDescriber(obj=cast("Series", obj))

    return cast(NDFrameT, describer.describe(percentiles=refined))
99
100
101class NDFrameDescriberAbstract(ABC):
102 """Abstract class for describing dataframe or series.
103
104 Parameters
105 ----------
106 obj : Series or DataFrame
107 Object to be described.
108 """
109
110 def __init__(self, obj: DataFrame | Series) -> None:
111 self.obj = obj
112
113 @abstractmethod
114 def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series:
115 """Do describe either series or dataframe.
116
117 Parameters
118 ----------
119 percentiles : list-like of numbers
120 The percentiles to include in the output.
121 """
122
123
class SeriesDescriber(NDFrameDescriberAbstract):
    """Describer for one-dimensional data."""

    obj: Series

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series:
        """Describe the stored series.

        The describing function is picked by ``select_describe_func`` and is
        then called with the series and ``percentiles``.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """
        series = self.obj
        return select_describe_func(series)(series, percentiles)
134
135
class DataFrameDescriber(NDFrameDescriberAbstract):
    """Class responsible for creating DataFrame description.

    Parameters
    ----------
    obj : DataFrame
        DataFrame to be described.
    include : 'all', list-like of dtypes or None
        A white list of data types to include in the result.
    exclude : list-like of dtypes or None
        A black list of data types to omit from the result.

    Raises
    ------
    ValueError
        If ``obj`` is two-dimensional and has no columns.
    """

    obj: DataFrame

    def __init__(
        self,
        obj: DataFrame,
        *,
        include: str | Sequence[str] | None,
        exclude: str | Sequence[str] | None,
    ) -> None:
        self.include = include
        self.exclude = exclude

        if obj.ndim == 2 and obj.columns.size == 0:
            raise ValueError("Cannot describe a DataFrame without columns")

        super().__init__(obj)

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame:
        """Describe every selected column and combine the results.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.

        Returns
        -------
        DataFrame
            One column per selected column of the original frame; the rows
            are the statistic names ordered by ``reorder_columns``.
        """
        data = self._select_data()

        # ``items()`` yields each column as a Series; the label is not needed
        # here because the original column index is restored below.
        ldesc: list[Series] = [
            select_describe_func(series)(series, percentiles)
            for _, series in data.items()
        ]

        col_names = reorder_columns(ldesc)
        d = concat(
            [x.reindex(col_names, copy=False) for x in ldesc],
            axis=1,
            sort=False,
        )
        d.columns = data.columns.copy()
        return d

    def _select_data(self) -> DataFrame:
        """Select columns to be described.

        Raises
        ------
        ValueError
            If ``include`` is ``'all'`` while ``exclude`` is not None.
        """
        if (self.include is None) and (self.exclude is None):
            # when some numerics are found, keep only numerics
            default_include: list[npt.DTypeLike] = [np.number, "datetime"]
            data = self.obj.select_dtypes(include=default_include)
            if len(data.columns) == 0:
                # nothing matched the default dtypes: describe every column
                data = self.obj
        # Only a plain string can be the 'all' sentinel. Checking the type
        # first avoids an element-wise ``==`` when ``include`` is array-like,
        # whose result cannot be used as a single boolean.
        elif isinstance(self.include, str) and self.include == "all":
            if self.exclude is not None:
                msg = "exclude must be None when include is 'all'"
                raise ValueError(msg)
            data = self.obj
        else:
            data = self.obj.select_dtypes(
                include=self.include,
                exclude=self.exclude,
            )
        return data
202
203
def reorder_columns(ldesc: Sequence[Series]) -> list[Hashable]:
    """Return the row labels of all descriptions in a display-friendly order.

    The indexes are visited from shortest to longest (ties keep their input
    order) and every label is kept at the position of its first appearance.
    """
    # A dict keeps insertion order and ignores labels that are already present.
    ordered: dict[Hashable, None] = {}
    for index in sorted([desc.index for desc in ldesc], key=len):
        for label in index:
            ordered.setdefault(label, None)
    return list(ordered)
215
216
217def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
218 """Describe series containing numerical data.
219
220 Parameters
221 ----------
222 series : Series
223 Series to be described.
224 percentiles : list-like of numbers
225 The percentiles to include in the output.
226 """
227 from pandas import Series
228
229 formatted_percentiles = format_percentiles(percentiles)
230
231 stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
232 d = (
233 [series.count(), series.mean(), series.std(), series.min()]
234 + series.quantile(percentiles).tolist()
235 + [series.max()]
236 )
237 # GH#48340 - always return float on non-complex numeric data
238 dtype: DtypeObj | None
239 if isinstance(series.dtype, ExtensionDtype):
240 if isinstance(series.dtype, ArrowDtype):
241 if series.dtype.kind == "m":
242 # GH53001: describe timedeltas with object dtype
243 dtype = None
244 else:
245 import pyarrow as pa
246
247 dtype = ArrowDtype(pa.float64())
248 else:
249 dtype = Float64Dtype()
250 elif series.dtype.kind in "iufb":
251 # i.e. numeric but exclude complex dtype
252 dtype = np.dtype("float")
253 else:
254 dtype = None
255 return Series(d, index=stat_index, name=series.name, dtype=dtype)
256
257
def describe_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Summarise a series holding categorical data.

    The result holds count, number of distinct values, the first entry of
    ``value_counts()`` ("top") and its count ("freq").

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.
    """
    from pandas import Series

    counts = data.value_counts()
    # Entries with a zero count (e.g. unused categories) are not distinct values.
    n_unique = len(counts[counts != 0])

    if n_unique == 0:
        # Nothing to report: fill 'top' and 'freq' with NaN so the output
        # keeps the same shape, and force object dtype.
        top, freq = np.nan, np.nan
        out_dtype = "object"
    else:
        top, freq = counts.index[0], counts.iloc[0]
        out_dtype = None

    return Series(
        [data.count(), n_unique, top, freq],
        index=["count", "unique", "top", "freq"],
        name=data.name,
        dtype=out_dtype,
    )
288
289
def describe_timestamp_as_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing timestamp data treated as categorical.

    The result holds "count" and "unique", followed by "top" and "freq".
    When the series has at least one non-missing value, "first" and "last"
    (the smallest and largest timestamp) are appended as well.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.
    """
    from pandas import Series

    names = ["count", "unique"]
    objcounts = data.value_counts()
    count_unique = len(objcounts[objcounts != 0])
    result: list[float | Timestamp] = [data.count(), count_unique]

    if count_unique == 0:
        # Nothing to report: fill 'top' and 'freq' with NaN so the output
        # keeps a consistent shape, and force object dtype.
        names += ["top", "freq"]
        result += [np.nan, np.nan]
        return Series(result, index=names, name=data.name, dtype="object")

    top, freq = objcounts.index[0], objcounts.iloc[0]
    tz = data.dt.tz
    top = Timestamp(top)
    if top.tzinfo is not None and tz is not None:
        # Don't tz_localize(None) if key is already tz-aware
        top = top.tz_convert(tz)
    else:
        top = top.tz_localize(tz)

    # Take the extremes from the series itself rather than from a raw int64
    # view of the values: an integer passed to ``Timestamp`` is read as
    # nanoseconds, which is wrong for data stored at another resolution.
    names += ["top", "freq", "first", "last"]
    result += [top, freq, data.min(), data.max()]
    return Series(result, index=names, name=data.name, dtype=None)
336
337
def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series:
    """Summarise a series of datetime64 dtype.

    The result holds count, mean, min, the requested percentiles and max,
    in that order.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    # GH-30164
    from pandas import Series

    labels = ["count", "mean", "min", *format_percentiles(percentiles), "max"]
    stats = [
        data.count(),
        data.mean(),
        data.min(),
        *data.quantile(percentiles).tolist(),
        data.max(),
    ]
    return Series(stats, index=labels, name=data.name)
360
361
def select_describe_func(
    data: Series,
) -> Callable:
    """Pick the describing function that matches the dtype of ``data``.

    The checks run in this order: boolean data is described as categorical,
    numeric data as numeric, datetime data (naive or tz-aware) as timestamp,
    timedelta data as numeric, and everything else as categorical.

    Parameters
    ----------
    data : Series
        Series to be described.
    """
    dtype = data.dtype

    # Booleans are tested first so they are never handled as numbers.
    if is_bool_dtype(dtype):
        return describe_categorical_1d
    if is_numeric_dtype(data):
        return describe_numeric_1d
    if dtype.kind == "M" or isinstance(dtype, DatetimeTZDtype):
        return describe_timestamp_1d
    if dtype.kind == "m":
        return describe_numeric_1d
    return describe_categorical_1d
382
383
384def _refine_percentiles(
385 percentiles: Sequence[float] | np.ndarray | None,
386) -> npt.NDArray[np.float64]:
387 """
388 Ensure that percentiles are unique and sorted.
389
390 Parameters
391 ----------
392 percentiles : list-like of numbers, optional
393 The percentiles to include in the output.
394 """
395 if percentiles is None:
396 return np.array([0.25, 0.5, 0.75])
397
398 # explicit conversion of `percentiles` to list
399 percentiles = list(percentiles)
400
401 # get them all to be in [0, 1]
402 validate_percentile(percentiles)
403
404 # median should always be included
405 if 0.5 not in percentiles:
406 percentiles.append(0.5)
407
408 percentiles = np.asarray(percentiles)
409
410 # sort and check for duplicates
411 unique_pcts = np.unique(percentiles)
412 assert percentiles is not None
413 if len(unique_pcts) < len(percentiles):
414 raise ValueError("percentiles cannot contain duplicates")
415
416 return unique_pcts