1"""
2Module responsible for execution of NDFrame.describe() method.
3
4Method NDFrame.describe() delegates actual execution to function describe_ndframe().
5"""
6from __future__ import annotations
7
8from abc import (
9 ABC,
10 abstractmethod,
11)
12from typing import (
13 TYPE_CHECKING,
14 Any,
15 Callable,
16 Hashable,
17 Sequence,
18 cast,
19)
20
21import numpy as np
22
23from pandas._libs.tslibs import Timestamp
24from pandas._typing import (
25 DtypeObj,
26 NDFrameT,
27 npt,
28)
29from pandas.util._validators import validate_percentile
30
31from pandas.core.dtypes.common import (
32 is_bool_dtype,
33 is_complex_dtype,
34 is_extension_array_dtype,
35 is_numeric_dtype,
36)
37from pandas.core.dtypes.dtypes import DatetimeTZDtype
38
39from pandas.core.arrays.arrow.dtype import ArrowDtype
40from pandas.core.arrays.floating import Float64Dtype
41from pandas.core.reshape.concat import concat
42
43from pandas.io.formats.format import format_percentiles
44
45if TYPE_CHECKING:
46 from pandas import (
47 DataFrame,
48 Series,
49 )
50
51
def describe_ndframe(
    *,
    obj: NDFrameT,
    include: str | Sequence[str] | None,
    exclude: str | Sequence[str] | None,
    percentiles: Sequence[float] | np.ndarray | None,
) -> NDFrameT:
    """Compute the summary statistics of a Series or DataFrame.

    Implementation behind ``pandas.core.generic.NDFrame.describe()``.

    Parameters
    ----------
    obj : DataFrame or Series
        Object to summarise.
    include : 'all', list-like of dtypes or None (default), optional
        Data types to keep in the result. Not used when ``obj`` is
        one-dimensional.
    exclude : list-like of dtypes or None (default), optional
        Data types to leave out of the result. Not used when ``obj`` is
        one-dimensional.
    percentiles : list-like of numbers, optional
        Percentiles to report, each between 0 and 1. ``None`` selects
        ``[.25, .5, .75]``.

    Returns
    -------
    DataFrame or Series
        Description of ``obj``.
    """
    # Percentiles are validated first so that an invalid request is reported
    # before any describer is constructed.
    refined = refine_percentiles(percentiles)

    describer: NDFrameDescriberAbstract
    if obj.ndim != 1:
        describer = DataFrameDescriber(
            obj=cast("DataFrame", obj),
            include=include,
            exclude=exclude,
        )
    else:
        # include/exclude are not meaningful for a single column of data.
        describer = SeriesDescriber(obj=cast("Series", obj))

    return cast(NDFrameT, describer.describe(percentiles=refined))
96
97
class NDFrameDescriberAbstract(ABC):
    """Common base class of the Series and DataFrame describers.

    Parameters
    ----------
    obj : Series or DataFrame
        Object whose description will be produced by :meth:`describe`.
    """

    def __init__(self, obj: DataFrame | Series) -> None:
        # Kept untouched; subclasses read it back when describing.
        self.obj = obj

    @abstractmethod
    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series:
        """Build the description of the wrapped object.

        Must be provided by every concrete describer.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """
119
120
class SeriesDescriber(NDFrameDescriberAbstract):
    """Describer for a single Series."""

    obj: Series

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series:
        """Describe the wrapped series.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """
        series = self.obj
        # The describing routine is chosen from the dtype of the series.
        summarize = select_describe_func(series)
        return summarize(series, percentiles)
131
132
class DataFrameDescriber(NDFrameDescriberAbstract):
    """Describer for a DataFrame, summarising it column by column.

    Parameters
    ----------
    obj : DataFrame
        DataFrame to be described.
    include : 'all', list-like of dtypes or None
        Data types to keep in the result.
    exclude : list-like of dtypes or None
        Data types to leave out of the result.

    Raises
    ------
    ValueError
        If ``obj`` is two-dimensional and has no columns.
    """

    def __init__(
        self,
        obj: DataFrame,
        *,
        include: str | Sequence[str] | None,
        exclude: str | Sequence[str] | None,
    ) -> None:
        self.include = include
        self.exclude = exclude

        has_no_columns = obj.ndim == 2 and obj.columns.size == 0
        if has_no_columns:
            raise ValueError("Cannot describe a DataFrame without columns")

        super().__init__(obj)

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame:
        """Describe every selected column and assemble the results.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """
        selected = self._select_data()

        # One description per column, each produced by the routine that
        # matches the dtype of that column.
        per_column: list[Series] = [
            select_describe_func(column)(column, percentiles)
            for _, column in selected.items()
        ]

        # Individual descriptions may carry different statistics; align them
        # on one shared row order before putting them side by side.
        row_labels = reorder_columns(per_column)
        aligned = [desc.reindex(row_labels, copy=False) for desc in per_column]
        summary = concat(aligned, axis=1, sort=False)
        summary.columns = selected.columns.copy()
        return summary

    def _select_data(self) -> DataFrame:
        """Select columns to be described."""
        include, exclude = self.include, self.exclude

        if include is None and exclude is None:
            # when some numerics are found, keep only numerics;
            # otherwise fall back to describing every column
            default_include: list[npt.DTypeLike] = [np.number, "datetime"]
            subset = self.obj.select_dtypes(include=default_include)
            return self.obj if len(subset.columns) == 0 else subset

        if include == "all":
            if exclude is not None:
                raise ValueError("exclude must be None when include is 'all'")
            return self.obj

        return self.obj.select_dtypes(include=include, exclude=exclude)
197
198
def reorder_columns(ldesc: Sequence[Series]) -> list[Hashable]:
    """Return the union of the description labels in a display-friendly order.

    Labels are collected from the shortest description index to the longest
    one; a label is kept at the position of its first occurrence.

    Parameters
    ----------
    ldesc : sequence of Series
        Per-column descriptions whose index labels are to be merged.
    """
    ordered: list[Hashable] = []
    # ``sorted`` is stable, so indexes of equal length keep their input order.
    by_length = sorted([desc.index for desc in ldesc], key=len)
    for label in (lbl for index in by_length for lbl in index):
        if label in ordered:
            continue
        ordered.append(label)
    return ordered
208
209
def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
    """Summarise a series of numerical data.

    The result holds count, mean, std, min, the requested percentiles and
    max, in that order.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    from pandas import Series

    labels = ["count", "mean", "std", "min", *format_percentiles(percentiles), "max"]
    values = [
        series.count(),
        series.mean(),
        series.std(),
        series.min(),
        *series.quantile(percentiles).tolist(),
        series.max(),
    ]

    # GH#48340 - always return float on non-complex numeric data.
    # ``None`` lets the Series constructor infer the dtype.
    result_dtype: DtypeObj | None = None
    if is_extension_array_dtype(series):
        if not isinstance(series.dtype, ArrowDtype):
            result_dtype = Float64Dtype()
        elif series.dtype.kind != "m":
            import pyarrow as pa

            result_dtype = ArrowDtype(pa.float64())
        # GH53001: arrow-backed timedeltas keep ``None`` and are therefore
        # described with object dtype.
    elif is_numeric_dtype(series) and not is_complex_dtype(series):
        result_dtype = np.dtype("float")

    return Series(values, index=labels, name=series.name, dtype=result_dtype)
248
249
def describe_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Summarise a series of categorical data.

    The result holds the number of non-null entries, the number of distinct
    values, the value listed first by ``value_counts`` and its count.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.
    """
    from pandas import Series

    counts = data.value_counts()
    # Entries reported with a zero count do not count as observed values.
    n_unique = len(counts[counts != 0])

    if n_unique > 0:
        top, freq, dtype = counts.index[0], counts.iloc[0], None
    else:
        # Nothing was observed: report 'top' and 'freq' as NaN so that the
        # output keeps the same shape, and force object dtype.
        top, freq, dtype = np.nan, np.nan, "object"

    return Series(
        [data.count(), n_unique, top, freq],
        index=["count", "unique", "top", "freq"],
        name=data.name,
        dtype=dtype,
    )
280
281
def describe_timestamp_as_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing timestamp data treated as categorical.

    The result holds count, unique, top and freq. When at least one value is
    present it additionally holds the earliest (``first``) and latest
    (``last``) timestamp.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.
    """
    from pandas import Series

    names = ["count", "unique"]
    objcounts = data.value_counts()
    # Entries reported with a zero count do not count as observed values.
    count_unique = len(objcounts[objcounts != 0])
    result = [data.count(), count_unique]
    dtype = None
    if count_unique > 0:
        top, freq = objcounts.index[0], objcounts.iloc[0]
        tz = data.dt.tz
        top = Timestamp(top)
        if top.tzinfo is not None and tz is not None:
            # Don't tz_localize(None) if key is already tz-aware
            top = top.tz_convert(tz)
        else:
            top = top.tz_localize(tz)
        names += ["top", "freq", "first", "last"]
        # Take the extremes straight from the series (missing values are
        # skipped). Rebuilding them from the raw int64 view would interpret
        # the integers as nanoseconds, which is wrong for data stored with
        # second, millisecond or microsecond resolution.
        result += [top, freq, data.min(), data.max()]
    else:
        # Nothing was observed: report 'top' and 'freq' as NaN so that the
        # output keeps a consistent shape, and force object dtype.
        names += ["top", "freq"]
        result += [np.nan, np.nan]
        dtype = "object"

    return Series(result, index=names, name=data.name, dtype=dtype)
328
329
def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series:
    """Summarise a series of datetime64 dtype.

    The result holds count, mean, min, the requested percentiles and max,
    in that order.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    # GH-30164
    from pandas import Series

    labels = ["count", "mean", "min", *format_percentiles(percentiles), "max"]

    stats = [data.count(), data.mean(), data.min()]
    stats.extend(data.quantile(percentiles).tolist())
    stats.append(data.max())

    return Series(stats, index=labels, name=data.name)
352
353
def select_describe_func(
    data: Series,
) -> Callable:
    """Pick the describing routine that matches the dtype of a series.

    Booleans are described like categorical data, numeric and timedelta data
    with the numeric routine, datetimes with the timestamp routine, and
    everything else falls back to the categorical routine.

    Parameters
    ----------
    data : Series
        Series to be described.
    """
    dtype = data.dtype

    # The boolean check comes before the numeric one so that boolean data
    # always ends up with the categorical routine.
    if is_bool_dtype(dtype):
        return describe_categorical_1d
    if is_numeric_dtype(data):
        return describe_numeric_1d

    kind = dtype.kind
    if kind == "M" or isinstance(dtype, DatetimeTZDtype):
        return describe_timestamp_1d
    if kind == "m":
        # timedeltas share the numeric statistics
        return describe_numeric_1d
    return describe_categorical_1d
374
375
def refine_percentiles(
    percentiles: Sequence[float] | np.ndarray | None,
) -> np.ndarray[Any, np.dtype[np.float64]]:
    """
    Validate the requested percentiles and return them sorted.

    The median (0.5) is added when it was not requested.

    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. ``None`` selects
        ``[0.25, 0.5, 0.75]``.

    Raises
    ------
    ValueError
        If the same percentile is requested more than once.
    """
    if percentiles is None:
        return np.array([0.25, 0.5, 0.75])

    # Work on a private list so the caller's object is never modified.
    requested = list(percentiles)

    # get them all to be in [0, 1]
    validate_percentile(requested)

    # median should always be included
    if 0.5 not in requested:
        requested.append(0.5)

    as_array = np.asarray(requested)

    # ``np.unique`` sorts and de-duplicates; a shorter result reveals that
    # the request contained repeated values.
    unique_sorted = np.unique(as_array)
    if len(unique_sorted) < len(as_array):
        raise ValueError("percentiles cannot contain duplicates")

    return unique_sorted