from __future__ import annotations

from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
)

from pandas._typing import (
    Axis,
    QuantileInterpolation,
    WindowingRankType,
)

if TYPE_CHECKING:
    from pandas import DataFrame, Series
    from pandas.core.generic import NDFrame

from pandas.util._decorators import doc

from pandas.core.indexers.objects import (
    BaseIndexer,
    ExpandingIndexer,
    GroupbyIndexer,
)
from pandas.core.window.doc import (
    _shared_docs,
    create_section_header,
    kwargs_numeric_only,
    numba_notes,
    template_header,
    template_returns,
    template_see_also,
    window_agg_numba_parameters,
    window_apply_parameters,
)
from pandas.core.window.rolling import (
    BaseWindowGroupby,
    RollingAndExpandingMixin,
)


class Expanding(RollingAndExpandingMixin):
45 """
46 Provide expanding window calculations.
47
48 Parameters
49 ----------
50 min_periods : int, default 1
51 Minimum number of observations in window required to have a value;
52 otherwise, result is ``np.nan``.
53
54 axis : int or str, default 0
55 If ``0`` or ``'index'``, roll across the rows.
56
57 If ``1`` or ``'columns'``, roll across the columns.
58
59 For `Series` this parameter is unused and defaults to 0.
60
61 method : str {'single', 'table'}, default 'single'
        Execute the expanding operation per single column or row (``'single'``)
        or over the entire object (``'table'``).

        This argument is only implemented when specifying ``engine='numba'``
        in the method call.

        .. versionadded:: 1.3.0

    Returns
    -------
    ``Expanding`` subclass

    See Also
    --------
    rolling : Provides rolling window calculations.
    ewm : Provides exponential weighted functions.

    Notes
    -----
    See :ref:`Windowing Operations <window.expanding>` for further usage details
    and examples.

    Examples
    --------
    >>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]})
    >>> df
         B
    0  0.0
    1  1.0
    2  2.0
    3  NaN
    4  4.0

    **min_periods**

    Expanding sum with 1 vs 3 observations needed to calculate a value.

    >>> df.expanding(1).sum()
         B
    0  0.0
    1  1.0
    2  3.0
    3  3.0
    4  7.0
    >>> df.expanding(3).sum()
         B
    0  NaN
    1  NaN
    2  3.0
    3  3.0
    4  7.0
    """

    _attributes: list[str] = ["min_periods", "axis", "method"]

    def __init__(
        self,
        obj: NDFrame,
        min_periods: int = 1,
        axis: Axis = 0,
        method: str = "single",
        selection=None,
    ) -> None:
        super().__init__(
            obj=obj,
            min_periods=min_periods,
            axis=axis,
            method=method,
            selection=selection,
        )

    def _get_window_indexer(self) -> BaseIndexer:
        """
        Return an indexer class that will compute the window start and end bounds
        """
        return ExpandingIndexer()

    @doc(
        _shared_docs["aggregate"],
        see_also=dedent(
            """
        See Also
        --------
        pandas.DataFrame.aggregate : Similar DataFrame method.
        pandas.Series.aggregate : Similar Series method.
        """
        ),
        examples=dedent(
            """
        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
        >>> df
           A  B  C
        0  1  4  7
        1  2  5  8
        2  3  6  9

        >>> df.expanding().mean()
             A    B    C
        0  1.0  4.0  7.0
        1  1.5  4.5  7.5
        2  2.0  5.0  8.0
        """
        ),
        klass="Series/DataFrame",
        axis="",
    )
    def aggregate(self, func, *args, **kwargs):
        return super().aggregate(func, *args, **kwargs)

    agg = aggregate

    @doc(
        template_header,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also[:-1],
        window_method="expanding",
        aggregation_description="count of non NaN observations",
        agg_method="count",
    )
    def count(self, numeric_only: bool = False):
        return super().count(numeric_only=numeric_only)

    @doc(
        template_header,
        create_section_header("Parameters"),
        window_apply_parameters,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also[:-1],
        window_method="expanding",
        aggregation_description="custom aggregation function",
        agg_method="apply",
    )
    def apply(
        self,
        func: Callable[..., Any],
        raw: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
        args: tuple[Any, ...] | None = None,
        kwargs: dict[str, Any] | None = None,
    ):
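        # Hedged usage sketch (comment only, not from the original source): with the
        # default ``min_periods=1``, something like
        #     pd.Series([1, 2, 3]).expanding().apply(lambda x: x.iloc[-1] - x.iloc[0])
        # should evaluate ``func`` on the growing windows [1], [1, 2], [1, 2, 3] and
        # return 0.0, 1.0, 2.0.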
        return super().apply(
            func,
            raw=raw,
            engine=engine,
            engine_kwargs=engine_kwargs,
            args=args,
            kwargs=kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_numeric_only,
        window_agg_numba_parameters(),
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        numba_notes[:-1],
        window_method="expanding",
        aggregation_description="sum",
        agg_method="sum",
    )
    def sum(
        self,
        numeric_only: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        return super().sum(
            numeric_only=numeric_only,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_numeric_only,
        window_agg_numba_parameters(),
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        numba_notes[:-1],
        window_method="expanding",
        aggregation_description="maximum",
        agg_method="max",
    )
    def max(
        self,
        numeric_only: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        return super().max(
            numeric_only=numeric_only,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_numeric_only,
        window_agg_numba_parameters(),
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        numba_notes[:-1],
        window_method="expanding",
        aggregation_description="minimum",
        agg_method="min",
    )
    def min(
        self,
        numeric_only: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        return super().min(
            numeric_only=numeric_only,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_numeric_only,
        window_agg_numba_parameters(),
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        numba_notes[:-1],
        window_method="expanding",
        aggregation_description="mean",
        agg_method="mean",
    )
    def mean(
        self,
        numeric_only: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        return super().mean(
            numeric_only=numeric_only,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_numeric_only,
        window_agg_numba_parameters(),
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        numba_notes[:-1],
        window_method="expanding",
        aggregation_description="median",
        agg_method="median",
    )
    def median(
        self,
        numeric_only: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        return super().median(
            numeric_only=numeric_only,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        ddof : int, default 1
            Delta Degrees of Freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.\n
        """
        ).replace("\n", "", 1),
        kwargs_numeric_only,
        window_agg_numba_parameters("1.4"),
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        "numpy.std : Equivalent method for NumPy array.\n",
        template_see_also,
        create_section_header("Notes"),
        dedent(
            """
        The default ``ddof`` of 1 used in :meth:`Series.std` is different
        than the default ``ddof`` of 0 in :func:`numpy.std`.

        A minimum of one period is required for the calculation.\n
        """
        ).replace("\n", "", 1),
        create_section_header("Examples"),
        dedent(
            """
        >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])

        >>> s.expanding(3).std()
        0         NaN
        1         NaN
        2    0.577350
        3    0.957427
        4    0.894427
        5    0.836660
        6    0.786796
        dtype: float64
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="standard deviation",
        agg_method="std",
    )
    def std(
        self,
        ddof: int = 1,
        numeric_only: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        return super().std(
            ddof=ddof,
            numeric_only=numeric_only,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        ddof : int, default 1
            Delta Degrees of Freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.\n
        """
        ).replace("\n", "", 1),
        kwargs_numeric_only,
        window_agg_numba_parameters("1.4"),
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        "numpy.var : Equivalent method for NumPy array.\n",
        template_see_also,
        create_section_header("Notes"),
        dedent(
            """
        The default ``ddof`` of 1 used in :meth:`Series.var` is different
        than the default ``ddof`` of 0 in :func:`numpy.var`.

        A minimum of one period is required for the calculation.\n
        """
        ).replace("\n", "", 1),
        create_section_header("Examples"),
        dedent(
            """
        >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])

        >>> s.expanding(3).var()
        0         NaN
        1         NaN
        2    0.333333
        3    0.916667
        4    0.800000
        5    0.700000
        6    0.619048
        dtype: float64
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="variance",
        agg_method="var",
    )
    def var(
        self,
        ddof: int = 1,
        numeric_only: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        return super().var(
            ddof=ddof,
            numeric_only=numeric_only,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        ddof : int, default 1
            Delta Degrees of Freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.\n
        """
        ).replace("\n", "", 1),
        kwargs_numeric_only,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        "A minimum of one period is required for the calculation.\n\n",
        create_section_header("Examples"),
        dedent(
            """
        >>> s = pd.Series([0, 1, 2, 3])

        >>> s.expanding().sem()
        0         NaN
        1    0.707107
        2    0.707107
        3    0.745356
        dtype: float64
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="standard error of mean",
        agg_method="sem",
    )
    def sem(self, ddof: int = 1, numeric_only: bool = False):
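        # Note added for clarity (hedged, not original source commentary): in the
        # docstring example above each value appears to equal the expanding
        # std(ddof=1) divided by sqrt(count - ddof); for the window [0, 1, 2] that
        # is 1.0 / sqrt(2) ~= 0.707107.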
        return super().sem(ddof=ddof, numeric_only=numeric_only)

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_numeric_only,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        "scipy.stats.skew : Third moment of a probability density.\n",
        template_see_also,
        create_section_header("Notes"),
520 "A minimum of three periods is required for the rolling calculation.\n",
        window_method="expanding",
        aggregation_description="unbiased skewness",
        agg_method="skew",
    )
    def skew(self, numeric_only: bool = False):
        return super().skew(numeric_only=numeric_only)

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_numeric_only,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        "scipy.stats.kurtosis : Reference SciPy method.\n",
        template_see_also,
        create_section_header("Notes"),
        "A minimum of four periods is required for the calculation.\n\n",
        create_section_header("Examples"),
        dedent(
            """
        The example below will show an expanding calculation with a minimum of four
        periods matching the equivalent function call using `scipy.stats`.

        >>> arr = [1, 2, 3, 4, 999]
        >>> import scipy.stats
        >>> print(f"{{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}}")
        -1.200000
        >>> print(f"{{scipy.stats.kurtosis(arr, bias=False):.6f}}")
        4.999874
        >>> s = pd.Series(arr)
        >>> s.expanding(4).kurt()
        0         NaN
        1         NaN
        2         NaN
        3   -1.200000
        4    4.999874
        dtype: float64
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="Fisher's definition of kurtosis without bias",
        agg_method="kurt",
    )
    def kurt(self, numeric_only: bool = False):
        return super().kurt(numeric_only=numeric_only)

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        quantile : float
            Quantile to compute. 0 <= quantile <= 1.
        interpolation : {{'linear', 'lower', 'higher', 'midpoint', 'nearest'}}
            This optional parameter specifies the interpolation method to use,
            when the desired quantile lies between two data points `i` and `j`:

                * linear: `i + (j - i) * fraction`, where `fraction` is the
                  fractional part of the index surrounded by `i` and `j`.
                * lower: `i`.
                * higher: `j`.
                * nearest: `i` or `j` whichever is nearest.
                * midpoint: (`i` + `j`) / 2.
        """
        ).replace("\n", "", 1),
        kwargs_numeric_only,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also[:-1],
        window_method="expanding",
        aggregation_description="quantile",
        agg_method="quantile",
    )
    def quantile(
        self,
        quantile: float,
        interpolation: QuantileInterpolation = "linear",
        numeric_only: bool = False,
    ):
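        # Hedged illustration (comment only, values not taken from the original
        # source): interpolation matters once a window holds an even number of
        # points, e.g. pd.Series([1, 2, 3, 4]).expanding().quantile(0.5) should give
        # 1.0, 1.5, 2.0, 2.5, while interpolation="lower" should give
        # 1.0, 1.0, 2.0, 2.0.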
        return super().quantile(
            quantile=quantile,
            interpolation=interpolation,
            numeric_only=numeric_only,
        )

    @doc(
        template_header,
        ".. versionadded:: 1.4.0 \n\n",
        create_section_header("Parameters"),
        dedent(
            """
        method : {{'average', 'min', 'max'}}, default 'average'
            How to rank the group of records that have the same value (i.e. ties):

            * average: average rank of the group
            * min: lowest rank in the group
            * max: highest rank in the group

        ascending : bool, default True
            Whether or not the elements should be ranked in ascending order.
        pct : bool, default False
            Whether or not to display the returned rankings in percentile
            form.
        """
        ).replace("\n", "", 1),
        kwargs_numeric_only,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Examples"),
        dedent(
            """
        >>> s = pd.Series([1, 4, 2, 3, 5, 3])
        >>> s.expanding().rank()
        0    1.0
        1    2.0
        2    2.0
        3    3.0
        4    5.0
        5    3.5
        dtype: float64

        >>> s.expanding().rank(method="max")
        0    1.0
        1    2.0
        2    2.0
        3    3.0
        4    5.0
        5    4.0
        dtype: float64

        >>> s.expanding().rank(method="min")
        0    1.0
        1    2.0
        2    2.0
        3    3.0
        4    5.0
        5    3.0
        dtype: float64
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="rank",
        agg_method="rank",
    )
    def rank(
        self,
        method: WindowingRankType = "average",
        ascending: bool = True,
        pct: bool = False,
        numeric_only: bool = False,
    ):
        return super().rank(
            method=method,
            ascending=ascending,
            pct=pct,
            numeric_only=numeric_only,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        other : Series or DataFrame, optional
            If not supplied then will default to self and produce pairwise
            output.
        pairwise : bool, default None
            If False then only matching columns between self and other will be
            used and the output will be a DataFrame.
            If True then all pairwise combinations will be calculated and the
            output will be a MultiIndexed DataFrame in the case of DataFrame
            inputs. In the case of missing elements, only complete pairwise
            observations will be used.
        ddof : int, default 1
            Delta Degrees of Freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.
        """
        ).replace("\n", "", 1),
        kwargs_numeric_only,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also[:-1],
        window_method="expanding",
        aggregation_description="sample covariance",
        agg_method="cov",
    )
    def cov(
        self,
        other: DataFrame | Series | None = None,
        pairwise: bool | None = None,
        ddof: int = 1,
        numeric_only: bool = False,
    ):
        return super().cov(
            other=other,
            pairwise=pairwise,
            ddof=ddof,
            numeric_only=numeric_only,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        other : Series or DataFrame, optional
            If not supplied then will default to self and produce pairwise
            output.
        pairwise : bool, default None
            If False then only matching columns between self and other will be
            used and the output will be a DataFrame.
            If True then all pairwise combinations will be calculated and the
            output will be a MultiIndexed DataFrame in the case of DataFrame
            inputs. In the case of missing elements, only complete pairwise
            observations will be used.
        """
        ).replace("\n", "", 1),
        kwargs_numeric_only,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        dedent(
            """
        cov : Similar method to calculate covariance.
        numpy.corrcoef : NumPy Pearson's correlation calculation.
        """
        ).replace("\n", "", 1),
        template_see_also,
        create_section_header("Notes"),
        dedent(
            """
        This function uses Pearson's definition of correlation
        (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient).

        When `other` is not specified, the output will be self correlation (e.g.
        all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise`
        set to `True`.

        Function will return ``NaN`` for correlations of equal valued sequences;
        this is the result of a 0/0 division error.

        When `pairwise` is set to `False`, only matching columns between `self` and
        `other` will be used.

        When `pairwise` is set to `True`, the output will be a MultiIndex DataFrame
        with the original index on the first level, and the `other` DataFrame
        columns on the second level.

        In the case of missing elements, only complete pairwise observations
        will be used.
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="correlation",
        agg_method="corr",
    )
    def corr(
        self,
        other: DataFrame | Series | None = None,
        pairwise: bool | None = None,
        ddof: int = 1,
        numeric_only: bool = False,
    ):
        return super().corr(
            other=other,
            pairwise=pairwise,
            ddof=ddof,
            numeric_only=numeric_only,
        )


class ExpandingGroupby(BaseWindowGroupby, Expanding):
    """
    Provide an expanding groupby implementation.
    """

    _attributes = Expanding._attributes + BaseWindowGroupby._attributes

    def _get_window_indexer(self) -> GroupbyIndexer:
805 """
806 Return an indexer class that will compute the window start and end bounds
807
808 Returns
809 -------
810 GroupbyIndexer
811 """
        window_indexer = GroupbyIndexer(
            groupby_indices=self._grouper.indices,
            window_indexer=ExpandingIndexer,
        )
        return window_indexer