from __future__ import annotations

import copy
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Callable,
    Literal,
    cast,
    final,
    no_type_check,
)
import warnings

import numpy as np

from pandas._libs import lib
from pandas._libs.tslibs import (
    BaseOffset,
    IncompatibleFrequency,
    NaT,
    Period,
    Timedelta,
    Timestamp,
    to_offset,
)
from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
from pandas._typing import NDFrameT
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import (
    Appender,
    Substitution,
    doc,
)
from pandas.util._exceptions import (
    find_stack_level,
    rewrite_warning,
)

from pandas.core.dtypes.dtypes import ArrowDtype
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)

import pandas.core.algorithms as algos
from pandas.core.apply import (
    ResamplerWindowApply,
    warn_alias_replacement,
)
from pandas.core.arrays import ArrowExtensionArray
from pandas.core.base import (
    PandasObject,
    SelectionMixin,
)
import pandas.core.common as com
from pandas.core.generic import (
    NDFrame,
    _shared_docs,
)
from pandas.core.groupby.generic import SeriesGroupBy
from pandas.core.groupby.groupby import (
    BaseGroupBy,
    GroupBy,
    _apply_groupings_depr,
    _pipe_template,
    get_groupby,
)
from pandas.core.groupby.grouper import Grouper
from pandas.core.groupby.ops import BinGrouper
from pandas.core.indexes.api import MultiIndex
from pandas.core.indexes.base import Index
from pandas.core.indexes.datetimes import (
    DatetimeIndex,
    date_range,
)
from pandas.core.indexes.period import (
    PeriodIndex,
    period_range,
)
from pandas.core.indexes.timedeltas import (
    TimedeltaIndex,
    timedelta_range,
)

from pandas.tseries.frequencies import (
    is_subperiod,
    is_superperiod,
)
from pandas.tseries.offsets import (
    Day,
    Tick,
)

if TYPE_CHECKING:
    from collections.abc import Hashable

    from pandas._typing import (
        AnyArrayLike,
        Axis,
        AxisInt,
        Frequency,
        IndexLabel,
        InterpolateOptions,
        T,
        TimedeltaConvertibleTypes,
        TimeGrouperOrigin,
        TimestampConvertibleTypes,
        npt,
    )

    from pandas import (
        DataFrame,
        Series,
    )

_shared_docs_kwargs: dict[str, str] = {}


class Resampler(BaseGroupBy, PandasObject):
122 """
123 Class for resampling datetimelike data, a groupby-like operation.
124 See aggregate, transform, and apply functions on this object.
125
126 It's easiest to use obj.resample(...) to use Resampler.
127
128 Parameters
129 ----------
130 obj : Series or DataFrame
131 groupby : TimeGrouper
132 axis : int, default 0
133 kind : str or None
134 'period', 'timestamp' to override default index treatment
135
136 Returns
137 -------
138 a Resampler of the appropriate type
139
140 Notes
141 -----
142 After resampling, see aggregate, apply, and transform functions.
143 """
144
145 _grouper: BinGrouper
146 _timegrouper: TimeGrouper
147 binner: DatetimeIndex | TimedeltaIndex | PeriodIndex # depends on subclass
148 exclusions: frozenset[Hashable] = frozenset() # for SelectionMixin compat
149 _internal_names_set = set({"obj", "ax", "_indexer"})
150
151 # to the groupby descriptor
152 _attributes = [
153 "freq",
154 "axis",
155 "closed",
156 "label",
157 "convention",
158 "kind",
159 "origin",
160 "offset",
161 ]
162
163 def __init__(
164 self,
165 obj: NDFrame,
166 timegrouper: TimeGrouper,
167 axis: Axis = 0,
168 kind=None,
169 *,
170 gpr_index: Index,
171 group_keys: bool = False,
172 selection=None,
173 include_groups: bool = True,
174 ) -> None:
175 self._timegrouper = timegrouper
176 self.keys = None
177 self.sort = True
178 self.axis = obj._get_axis_number(axis)
179 self.kind = kind
180 self.group_keys = group_keys
181 self.as_index = True
182 self.include_groups = include_groups
183
184 self.obj, self.ax, self._indexer = self._timegrouper._set_grouper(
185 self._convert_obj(obj), sort=True, gpr_index=gpr_index
186 )
187 self.binner, self._grouper = self._get_binner()
188 self._selection = selection
189 if self._timegrouper.key is not None:
190 self.exclusions = frozenset([self._timegrouper.key])
191 else:
192 self.exclusions = frozenset()
193
194 @final
195 def __str__(self) -> str:
196 """
197 Provide a nice str repr of our rolling object.
198 """
199 attrs = (
200 f"{k}={getattr(self._timegrouper, k)}"
201 for k in self._attributes
202 if getattr(self._timegrouper, k, None) is not None
203 )
204 return f"{type(self).__name__} [{', '.join(attrs)}]"
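    # A sketch of the repr this yields (the attribute list varies with the
    # grouper's settings):
    #   >>> str(df.resample("2min"))  # doctest: +SKIP
    #   'DatetimeIndexResampler [freq=<2 * Minutes>, axis=0, closed=left, ...]'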

    @final
    def __getattr__(self, attr: str):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self._attributes:
            return getattr(self._timegrouper, attr)
        if attr in self.obj:
            return self[attr]

        return object.__getattribute__(self, attr)

    @final
    @property
    def _from_selection(self) -> bool:
        """
        Is the resampling from a DataFrame column or MultiIndex level.
        """
        # upsampling and PeriodIndex resampling do not work
        # with selection, this state used to catch and raise an error
        return self._timegrouper is not None and (
            self._timegrouper.key is not None or self._timegrouper.level is not None
        )

    def _convert_obj(self, obj: NDFrameT) -> NDFrameT:
        """
        Provide any conversions for the object in order to correctly handle.

        Parameters
        ----------
        obj : Series or DataFrame

        Returns
        -------
        Series or DataFrame
        """
        return obj._consolidate()

    def _get_binner_for_time(self):
        raise AbstractMethodError(self)

    @final
    def _get_binner(self):
        """
        Create the BinGrouper, assume that self.set_grouper(obj)
        has already been called.
        """
        binner, bins, binlabels = self._get_binner_for_time()
        assert len(bins) == len(binlabels)
        bin_grouper = BinGrouper(bins, binlabels, indexer=self._indexer)
        return binner, bin_grouper
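    # Sketch of the pieces (exact values depend on closed/label/origin): for
    # daily data resampled to "2D", `binner` is the DatetimeIndex of bin edges
    # at the target freq, `bins` the positions in the sorted data where each
    # bin ends, and `binlabels` the label each bin gets in the result.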

    @final
    @Substitution(
        klass="Resampler",
        examples="""
    >>> df = pd.DataFrame({'A': [1, 2, 3, 4]},
    ...                   index=pd.date_range('2012-08-02', periods=4))
    >>> df
                A
    2012-08-02  1
    2012-08-03  2
    2012-08-04  3
    2012-08-05  4

    To get the difference between each 2-day period's maximum and minimum
    value in one pass, you can do

    >>> df.resample('2D').pipe(lambda x: x.max() - x.min())
                A
    2012-08-02  1
    2012-08-04  1""",
    )
    @Appender(_pipe_template)
    def pipe(
        self,
        func: Callable[..., T] | tuple[Callable[..., T], str],
        *args,
        **kwargs,
    ) -> T:
        return super().pipe(func, *args, **kwargs)

    _agg_see_also_doc = dedent(
        """
    See Also
    --------
    DataFrame.groupby.aggregate : Aggregate using callable, string, dict,
        or list of string/callables.
    DataFrame.resample.transform : Transforms the Series on each group
        based on the given function.
    DataFrame.aggregate: Aggregate using one or more
        operations over the specified axis.
    """
    )

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> s = pd.Series([1, 2, 3, 4, 5],
    ...               index=pd.date_range('20130101', periods=5, freq='s'))
    >>> s
    2013-01-01 00:00:00    1
    2013-01-01 00:00:01    2
    2013-01-01 00:00:02    3
    2013-01-01 00:00:03    4
    2013-01-01 00:00:04    5
    Freq: s, dtype: int64

    >>> r = s.resample('2s')

    >>> r.agg("sum")
    2013-01-01 00:00:00    3
    2013-01-01 00:00:02    7
    2013-01-01 00:00:04    5
    Freq: 2s, dtype: int64

    >>> r.agg(['sum', 'mean', 'max'])
                         sum  mean  max
    2013-01-01 00:00:00    3   1.5    2
    2013-01-01 00:00:02    7   3.5    4
    2013-01-01 00:00:04    5   5.0    5

    >>> r.agg({'result': lambda x: x.mean() / x.std(),
    ...        'total': "sum"})
                           result  total
    2013-01-01 00:00:00  2.121320      3
    2013-01-01 00:00:02  4.949747      7
    2013-01-01 00:00:04       NaN      5

    >>> r.agg(average="mean", total="sum")
                         average  total
    2013-01-01 00:00:00      1.5      3
    2013-01-01 00:00:02      3.5      7
    2013-01-01 00:00:04      5.0      5
    """
    )

    @final
    @doc(
        _shared_docs["aggregate"],
        see_also=_agg_see_also_doc,
        examples=_agg_examples_doc,
        klass="DataFrame",
        axis="",
    )
    def aggregate(self, func=None, *args, **kwargs):
        result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg()
        if result is None:
            how = func
            result = self._groupby_and_aggregate(how, *args, **kwargs)

        return result

    agg = aggregate
    apply = aggregate

    @final
    def transform(self, arg, *args, **kwargs):
        """
        Call function producing a like-indexed Series on each group.

        Return a Series with the transformed values.

        Parameters
        ----------
        arg : function
            To apply to each group. Should return a Series with the same index.

        Returns
        -------
        Series

        Examples
        --------
        >>> s = pd.Series([1, 2],
        ...               index=pd.date_range('20180101',
        ...                                   periods=2,
        ...                                   freq='1h'))
        >>> s
        2018-01-01 00:00:00    1
        2018-01-01 01:00:00    2
        Freq: h, dtype: int64

        >>> resampled = s.resample('15min')
        >>> resampled.transform(lambda x: (x - x.mean()) / x.std())
        2018-01-01 00:00:00   NaN
        2018-01-01 01:00:00   NaN
        Freq: h, dtype: float64
        """
        return self._selected_obj.groupby(self._timegrouper).transform(
            arg, *args, **kwargs
        )

    def _downsample(self, f, **kwargs):
        raise AbstractMethodError(self)

    def _upsample(self, f, limit: int | None = None, fill_value=None):
        raise AbstractMethodError(self)

    def _gotitem(self, key, ndim: int, subset=None):
        """
        Sub-classes to define. Return a sliced object.

        Parameters
        ----------
        key : string / list of selections
        ndim : {1, 2}
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        grouper = self._grouper
        if subset is None:
            subset = self.obj
            if key is not None:
                subset = subset[key]
            else:
                # reached via Apply.agg_dict_like with selection=None and ndim=1
                assert subset.ndim == 1
        if ndim == 1:
            assert subset.ndim == 1

        grouped = get_groupby(
            subset, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys
        )
        return grouped
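    # Sketch: r = df.resample("2D"); r["A"] arrives here via SelectionMixin
    # with key="A" and ndim=1, grouping only the selected column over the
    # resample bins.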

    def _groupby_and_aggregate(self, how, *args, **kwargs):
        """
        Re-evaluate the obj with a groupby aggregation.
        """
        grouper = self._grouper

        # Excludes `on` column when provided
        obj = self._obj_with_exclusions

        grouped = get_groupby(
            obj, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys
        )

        try:
            if callable(how):
                # TODO: test_resample_apply_with_additional_args fails if we go
                #  through the non-lambda path, not clear that it should.
                func = lambda x: how(x, *args, **kwargs)
                result = grouped.aggregate(func)
            else:
                result = grouped.aggregate(how, *args, **kwargs)
        except (AttributeError, KeyError):
            # we have a non-reducing function; try to evaluate
            # alternatively we want to evaluate only a column of the input

            # test_apply_to_one_column_of_df the function being applied references
            #  a DataFrame column, but aggregate_item_by_item operates column-wise
            #  on Series, raising AttributeError or KeyError
            #  (depending on whether the column lookup uses getattr/__getitem__)
            result = _apply(
                grouped, how, *args, include_groups=self.include_groups, **kwargs
            )

        except ValueError as err:
            if "Must produce aggregated value" in str(err):
                # raised in _aggregate_named
                # see test_apply_without_aggregation, test_apply_with_mutated_index
                pass
            else:
                raise

            # we have a non-reducing function
            # try to evaluate
            result = _apply(
                grouped, how, *args, include_groups=self.include_groups, **kwargs
            )

        return self._wrap_result(result)
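    # Example of the fallback path (a sketch): a callable that returns a
    # Series per group is not a reduction, so grouped.aggregate raises and the
    # call is re-routed through _apply, e.g.
    #   df.resample("2D").apply(lambda g: g["A"] - g["A"].mean())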

    @final
    def _get_resampler_for_grouping(
        self, groupby: GroupBy, key, include_groups: bool = True
    ):
        """
        Return the correct class for resampling with groupby.
        """
        return self._resampler_for_grouping(
            groupby=groupby, key=key, parent=self, include_groups=include_groups
        )

    def _wrap_result(self, result):
        """
        Potentially wrap any results.
        """
        # GH 47705
        obj = self.obj
        if (
            isinstance(result, ABCDataFrame)
            and len(result) == 0
            and not isinstance(result.index, PeriodIndex)
        ):
            result = result.set_index(
                _asfreq_compat(obj.index[:0], freq=self.freq), append=True
            )

        if isinstance(result, ABCSeries) and self._selection is not None:
            result.name = self._selection

        if isinstance(result, ABCSeries) and result.empty:
            # When index is all NaT, result is empty but index is not
            result.index = _asfreq_compat(obj.index[:0], freq=self.freq)
            result.name = getattr(obj, "name", None)

        if self._timegrouper._arrow_dtype is not None:
            result.index = result.index.astype(self._timegrouper._arrow_dtype)

        return result
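    # e.g. resampling an all-NaT Series yields an empty result; the branch
    # above still converts its (empty) index to the target freq via
    # _asfreq_compat.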

    @final
    def ffill(self, limit: int | None = None):
        """
        Forward fill the values.

        Parameters
        ----------
        limit : int, optional
            Limit of how many values to fill.

        Returns
        -------
        An upsampled Series.

        See Also
        --------
        Series.fillna: Fill NA/NaN values using the specified method.
        DataFrame.fillna: Fill NA/NaN values using the specified method.

        Examples
        --------
        Here we only create a ``Series``.

        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...                 ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64

        Example for ``ffill`` with downsampling (we have fewer dates after resampling):

        >>> ser.resample('MS').ffill()
        2023-01-01    1
        2023-02-01    3
        Freq: MS, dtype: int64

        Example for ``ffill`` with upsampling (fill the new dates with
        the previous value):

        >>> ser.resample('W').ffill()
        2023-01-01    1
        2023-01-08    1
        2023-01-15    2
        2023-01-22    2
        2023-01-29    2
        2023-02-05    3
        2023-02-12    3
        2023-02-19    4
        Freq: W-SUN, dtype: int64

        With upsampling and limiting (only fill the first new date with the
        previous value):

        >>> ser.resample('W').ffill(limit=1)
        2023-01-01    1.0
        2023-01-08    1.0
        2023-01-15    2.0
        2023-01-22    2.0
        2023-01-29    NaN
        2023-02-05    3.0
        2023-02-12    NaN
        2023-02-19    4.0
        Freq: W-SUN, dtype: float64
        """
        return self._upsample("ffill", limit=limit)

    @final
    def nearest(self, limit: int | None = None):
        """
        Resample by using the nearest value.

        When resampling data, missing values may appear (e.g., when the
        resampling frequency is higher than the original frequency).
        The `nearest` method will replace ``NaN`` values that appeared in
        the resampled data with the value from the nearest member of the
        sequence, based on the index value.
        Missing values that existed in the original data will not be modified.
        If `limit` is given, fill only this many values in each direction for
        each of the original values.

        Parameters
        ----------
        limit : int, optional
            Limit of how many values to fill.

        Returns
        -------
        Series or DataFrame
            An upsampled Series or DataFrame with ``NaN`` values filled with
            their nearest value.

        See Also
        --------
        backfill : Backward fill the new missing values in the resampled data.
        pad : Forward fill ``NaN`` values.

        Examples
        --------
        >>> s = pd.Series([1, 2],
        ...               index=pd.date_range('20180101',
        ...                                   periods=2,
        ...                                   freq='1h'))
        >>> s
        2018-01-01 00:00:00    1
        2018-01-01 01:00:00    2
        Freq: h, dtype: int64

        >>> s.resample('15min').nearest()
        2018-01-01 00:00:00    1
        2018-01-01 00:15:00    1
        2018-01-01 00:30:00    2
        2018-01-01 00:45:00    2
        2018-01-01 01:00:00    2
        Freq: 15min, dtype: int64

        Limit the number of upsampled values imputed by the nearest:

        >>> s.resample('15min').nearest(limit=1)
        2018-01-01 00:00:00    1.0
        2018-01-01 00:15:00    1.0
        2018-01-01 00:30:00    NaN
        2018-01-01 00:45:00    2.0
        2018-01-01 01:00:00    2.0
        Freq: 15min, dtype: float64
        """
        return self._upsample("nearest", limit=limit)

    @final
    def bfill(self, limit: int | None = None):
        """
        Backward fill the new missing values in the resampled data.

        In statistics, imputation is the process of replacing missing data with
        substituted values [1]_. When resampling data, missing values may
        appear (e.g., when the resampling frequency is higher than the original
        frequency). The backward fill will replace NaN values that appeared in
        the resampled data with the next value in the original sequence.
        Missing values that existed in the original data will not be modified.

        Parameters
        ----------
        limit : int, optional
            Limit of how many values to fill.

        Returns
        -------
        Series, DataFrame
            An upsampled Series or DataFrame with backward filled NaN values.

        See Also
        --------
        bfill : Alias of backfill.
        fillna : Fill NaN values using the specified method, which can be
            'backfill'.
        nearest : Fill NaN values with nearest neighbor starting from center.
        ffill : Forward fill NaN values.
        Series.fillna : Fill NaN values in the Series using the
            specified method, which can be 'backfill'.
        DataFrame.fillna : Fill NaN values in the DataFrame using the
            specified method, which can be 'backfill'.

        References
        ----------
        .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics)

        Examples
        --------
        Resampling a Series:

        >>> s = pd.Series([1, 2, 3],
        ...               index=pd.date_range('20180101', periods=3, freq='h'))
        >>> s
        2018-01-01 00:00:00    1
        2018-01-01 01:00:00    2
        2018-01-01 02:00:00    3
        Freq: h, dtype: int64

        >>> s.resample('30min').bfill()
        2018-01-01 00:00:00    1
        2018-01-01 00:30:00    2
        2018-01-01 01:00:00    2
        2018-01-01 01:30:00    3
        2018-01-01 02:00:00    3
        Freq: 30min, dtype: int64

        >>> s.resample('15min').bfill(limit=2)
        2018-01-01 00:00:00    1.0
        2018-01-01 00:15:00    NaN
        2018-01-01 00:30:00    2.0
        2018-01-01 00:45:00    2.0
        2018-01-01 01:00:00    2.0
        2018-01-01 01:15:00    NaN
        2018-01-01 01:30:00    3.0
        2018-01-01 01:45:00    3.0
        2018-01-01 02:00:00    3.0
        Freq: 15min, dtype: float64

        Resampling a DataFrame that has missing values:

        >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]},
        ...                   index=pd.date_range('20180101', periods=3,
        ...                                       freq='h'))
        >>> df
                               a  b
        2018-01-01 00:00:00  2.0  1
        2018-01-01 01:00:00  NaN  3
        2018-01-01 02:00:00  6.0  5

        >>> df.resample('30min').bfill()
                               a  b
        2018-01-01 00:00:00  2.0  1
        2018-01-01 00:30:00  NaN  3
        2018-01-01 01:00:00  NaN  3
        2018-01-01 01:30:00  6.0  5
        2018-01-01 02:00:00  6.0  5

        >>> df.resample('15min').bfill(limit=2)
                               a    b
        2018-01-01 00:00:00  2.0  1.0
        2018-01-01 00:15:00  NaN  NaN
        2018-01-01 00:30:00  NaN  3.0
        2018-01-01 00:45:00  NaN  3.0
        2018-01-01 01:00:00  NaN  3.0
        2018-01-01 01:15:00  NaN  NaN
        2018-01-01 01:30:00  6.0  5.0
        2018-01-01 01:45:00  6.0  5.0
        2018-01-01 02:00:00  6.0  5.0
        """
        return self._upsample("bfill", limit=limit)

    @final
    def fillna(self, method, limit: int | None = None):
        """
        Fill missing values introduced by upsampling.

        In statistics, imputation is the process of replacing missing data with
        substituted values [1]_. When resampling data, missing values may
        appear (e.g., when the resampling frequency is higher than the original
        frequency).

        Missing values that existed in the original data will
        not be modified.

        Parameters
        ----------
        method : {'pad', 'backfill', 'ffill', 'bfill', 'nearest'}
            Method to use for filling holes in resampled data

            * 'pad' or 'ffill': use previous valid observation to fill gap
              (forward fill).
            * 'backfill' or 'bfill': use next valid observation to fill gap.
            * 'nearest': use nearest valid observation to fill gap.

        limit : int, optional
            Limit of how many consecutive missing values to fill.

        Returns
        -------
        Series or DataFrame
            An upsampled Series or DataFrame with missing values filled.

        See Also
        --------
        bfill : Backward fill NaN values in the resampled data.
        ffill : Forward fill NaN values in the resampled data.
        nearest : Fill NaN values in the resampled data
            with nearest neighbor starting from center.
        interpolate : Fill NaN values using interpolation.
        Series.fillna : Fill NaN values in the Series using the
            specified method, which can be 'bfill' and 'ffill'.
        DataFrame.fillna : Fill NaN values in the DataFrame using the
            specified method, which can be 'bfill' and 'ffill'.

        References
        ----------
        .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics)

        Examples
        --------
        Resampling a Series:

        >>> s = pd.Series([1, 2, 3],
        ...               index=pd.date_range('20180101', periods=3, freq='h'))
        >>> s
        2018-01-01 00:00:00    1
        2018-01-01 01:00:00    2
        2018-01-01 02:00:00    3
        Freq: h, dtype: int64

        Without filling the missing values you get:

        >>> s.resample("30min").asfreq()
        2018-01-01 00:00:00    1.0
        2018-01-01 00:30:00    NaN
        2018-01-01 01:00:00    2.0
        2018-01-01 01:30:00    NaN
        2018-01-01 02:00:00    3.0
        Freq: 30min, dtype: float64

        >>> s.resample('30min').fillna("backfill")
        2018-01-01 00:00:00    1
        2018-01-01 00:30:00    2
        2018-01-01 01:00:00    2
        2018-01-01 01:30:00    3
        2018-01-01 02:00:00    3
        Freq: 30min, dtype: int64

        >>> s.resample('15min').fillna("backfill", limit=2)
        2018-01-01 00:00:00    1.0
        2018-01-01 00:15:00    NaN
        2018-01-01 00:30:00    2.0
        2018-01-01 00:45:00    2.0
        2018-01-01 01:00:00    2.0
        2018-01-01 01:15:00    NaN
        2018-01-01 01:30:00    3.0
        2018-01-01 01:45:00    3.0
        2018-01-01 02:00:00    3.0
        Freq: 15min, dtype: float64

        >>> s.resample('30min').fillna("pad")
        2018-01-01 00:00:00    1
        2018-01-01 00:30:00    1
        2018-01-01 01:00:00    2
        2018-01-01 01:30:00    2
        2018-01-01 02:00:00    3
        Freq: 30min, dtype: int64

        >>> s.resample('30min').fillna("nearest")
        2018-01-01 00:00:00    1
        2018-01-01 00:30:00    2
        2018-01-01 01:00:00    2
        2018-01-01 01:30:00    3
        2018-01-01 02:00:00    3
        Freq: 30min, dtype: int64

        Missing values present before the upsampling are not affected.

        >>> sm = pd.Series([1, None, 3],
        ...                index=pd.date_range('20180101', periods=3, freq='h'))
        >>> sm
        2018-01-01 00:00:00    1.0
        2018-01-01 01:00:00    NaN
        2018-01-01 02:00:00    3.0
        Freq: h, dtype: float64

        >>> sm.resample('30min').fillna('backfill')
        2018-01-01 00:00:00    1.0
        2018-01-01 00:30:00    NaN
        2018-01-01 01:00:00    NaN
        2018-01-01 01:30:00    3.0
        2018-01-01 02:00:00    3.0
        Freq: 30min, dtype: float64

        >>> sm.resample('30min').fillna('pad')
        2018-01-01 00:00:00    1.0
        2018-01-01 00:30:00    1.0
        2018-01-01 01:00:00    NaN
        2018-01-01 01:30:00    NaN
        2018-01-01 02:00:00    3.0
        Freq: 30min, dtype: float64

        >>> sm.resample('30min').fillna('nearest')
        2018-01-01 00:00:00    1.0
        2018-01-01 00:30:00    NaN
        2018-01-01 01:00:00    NaN
        2018-01-01 01:30:00    3.0
        2018-01-01 02:00:00    3.0
        Freq: 30min, dtype: float64

        DataFrame resampling is done column-wise. All the same options are
        available.

        >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]},
        ...                   index=pd.date_range('20180101', periods=3,
        ...                                       freq='h'))
        >>> df
                               a  b
        2018-01-01 00:00:00  2.0  1
        2018-01-01 01:00:00  NaN  3
        2018-01-01 02:00:00  6.0  5

        >>> df.resample('30min').fillna("bfill")
                               a  b
        2018-01-01 00:00:00  2.0  1
        2018-01-01 00:30:00  NaN  3
        2018-01-01 01:00:00  NaN  3
        2018-01-01 01:30:00  6.0  5
        2018-01-01 02:00:00  6.0  5
        """
        warnings.warn(
            f"{type(self).__name__}.fillna is deprecated and will be removed "
            "in a future version. Use obj.ffill(), obj.bfill(), "
            "or obj.nearest() instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._upsample(method, limit=limit)

    @final
    def interpolate(
        self,
        method: InterpolateOptions = "linear",
        *,
        axis: Axis = 0,
        limit: int | None = None,
        inplace: bool = False,
        limit_direction: Literal["forward", "backward", "both"] = "forward",
        limit_area=None,
        downcast=lib.no_default,
        **kwargs,
    ):
        """
        Interpolate values between target timestamps according to different methods.

        The original index is first reindexed to target timestamps
        (see :meth:`core.resample.Resampler.asfreq`),
        then the interpolation of ``NaN`` values via :meth:`DataFrame.interpolate`
        happens.

        Parameters
        ----------
        method : str, default 'linear'
            Interpolation technique to use. One of:

            * 'linear': Ignore the index and treat the values as equally
              spaced. This is the only method supported on MultiIndexes.
            * 'time': Works on daily and higher resolution data to interpolate
              given length of interval.
            * 'index', 'values': use the actual numerical values of the index.
            * 'pad': Fill in NaNs using existing values.
            * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
              'barycentric', 'polynomial': Passed to
              `scipy.interpolate.interp1d`, whereas 'spline' is passed to
              `scipy.interpolate.UnivariateSpline`. These methods use the numerical
              values of the index. Both 'polynomial' and 'spline' require that
              you also specify an `order` (int), e.g.
              ``df.interpolate(method='polynomial', order=5)``. Note that,
              `slinear` method in Pandas refers to the Scipy first order `spline`
              instead of Pandas first order `spline`.
            * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima',
              'cubicspline': Wrappers around the SciPy interpolation methods of
              similar names. See `Notes`.
            * 'from_derivatives': Refers to
              `scipy.interpolate.BPoly.from_derivatives`.

        axis : {{0 or 'index', 1 or 'columns', None}}, default None
            Axis to interpolate along. For `Series` this parameter is unused
            and defaults to 0.
        limit : int, optional
            Maximum number of consecutive NaNs to fill. Must be greater than
            0.
        inplace : bool, default False
            Update the data in place if possible.
        limit_direction : {{'forward', 'backward', 'both'}}, Optional
            Consecutive NaNs will be filled in this direction.

            If limit is specified:

            * If 'method' is 'pad' or 'ffill', 'limit_direction' must be 'forward'.
            * If 'method' is 'backfill' or 'bfill', 'limit_direction' must be
              'backward'.

            If 'limit' is not specified:

            * If 'method' is 'backfill' or 'bfill', the default is 'backward'
            * else the default is 'forward'

            raises ValueError if `limit_direction` is 'forward' or 'both' and
                method is 'backfill' or 'bfill'.
            raises ValueError if `limit_direction` is 'backward' or 'both' and
                method is 'pad' or 'ffill'.

        limit_area : {{`None`, 'inside', 'outside'}}, default None
            If limit is specified, consecutive NaNs will be filled with this
            restriction.

            * ``None``: No fill restriction.
            * 'inside': Only fill NaNs surrounded by valid values
              (interpolate).
            * 'outside': Only fill NaNs outside valid values (extrapolate).

        downcast : optional, 'infer' or None, defaults to None
            Downcast dtypes if possible.

            .. deprecated:: 2.1.0

        ``**kwargs`` : optional
            Keyword arguments to pass on to the interpolating function.

        Returns
        -------
        DataFrame or Series
            Interpolated values at the specified freq.

        See Also
        --------
        core.resample.Resampler.asfreq: Return the values at the new freq,
            essentially a reindex.
        DataFrame.interpolate: Fill NaN values using an interpolation method.

        Notes
        -----
        For high-frequency or non-equidistant time-series with timestamps
        the reindexing followed by interpolation may lead to information loss
        as shown in the last example.

        Examples
        --------

        >>> start = "2023-03-01T07:00:00"
        >>> timesteps = pd.date_range(start, periods=5, freq="s")
        >>> series = pd.Series(data=[1, -1, 2, 1, 3], index=timesteps)
        >>> series
        2023-03-01 07:00:00    1
        2023-03-01 07:00:01   -1
        2023-03-01 07:00:02    2
        2023-03-01 07:00:03    1
        2023-03-01 07:00:04    3
        Freq: s, dtype: int64

        Downsample the series to 0.5Hz by providing the period time of 2s.

        >>> series.resample("2s").interpolate("linear")
        2023-03-01 07:00:00    1
        2023-03-01 07:00:02    2
        2023-03-01 07:00:04    3
        Freq: 2s, dtype: int64

        Upsample the series to 2Hz by providing the period time of 500ms.

        >>> series.resample("500ms").interpolate("linear")
        2023-03-01 07:00:00.000    1.0
        2023-03-01 07:00:00.500    0.0
        2023-03-01 07:00:01.000   -1.0
        2023-03-01 07:00:01.500    0.5
        2023-03-01 07:00:02.000    2.0
        2023-03-01 07:00:02.500    1.5
        2023-03-01 07:00:03.000    1.0
        2023-03-01 07:00:03.500    2.0
        2023-03-01 07:00:04.000    3.0
        Freq: 500ms, dtype: float64

        Internal reindexing with ``asfreq()`` prior to interpolation leads to
        an interpolated timeseries on the basis of the reindexed timestamps
        (anchors). Since not all datapoints from original series become anchors,
        it can lead to misleading interpolation results as in the following example:

        >>> series.resample("400ms").interpolate("linear")
        2023-03-01 07:00:00.000    1.0
        2023-03-01 07:00:00.400    1.2
        2023-03-01 07:00:00.800    1.4
        2023-03-01 07:00:01.200    1.6
        2023-03-01 07:00:01.600    1.8
        2023-03-01 07:00:02.000    2.0
        2023-03-01 07:00:02.400    2.2
        2023-03-01 07:00:02.800    2.4
        2023-03-01 07:00:03.200    2.6
        2023-03-01 07:00:03.600    2.8
        2023-03-01 07:00:04.000    3.0
        Freq: 400ms, dtype: float64

        Note that the series erroneously increases between two anchors
        ``07:00:00`` and ``07:00:02``.
        """
        assert downcast is lib.no_default  # just checking coverage
        result = self._upsample("asfreq")
        return result.interpolate(
            method=method,
            axis=axis,
            limit=limit,
            inplace=inplace,
            limit_direction=limit_direction,
            limit_area=limit_area,
            downcast=downcast,
            **kwargs,
        )

    @final
    def asfreq(self, fill_value=None):
        """
        Return the values at the new freq, essentially a reindex.

        Parameters
        ----------
        fill_value : scalar, optional
            Value to use for missing values, applied during upsampling (note
            this does not fill NaNs that already were present).

        Returns
        -------
        DataFrame or Series
            Values at the specified freq.

        See Also
        --------
        Series.asfreq: Convert TimeSeries to specified frequency.
        DataFrame.asfreq: Convert TimeSeries to specified frequency.

        Examples
        --------

        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...                 ['2023-01-01', '2023-01-31', '2023-02-01', '2023-02-28']))
        >>> ser
        2023-01-01    1
        2023-01-31    2
        2023-02-01    3
        2023-02-28    4
        dtype: int64
        >>> ser.resample('MS').asfreq()
        2023-01-01    1
        2023-02-01    3
        Freq: MS, dtype: int64
        """
        return self._upsample("asfreq", fill_value=fill_value)

    @final
    def sum(
        self,
        numeric_only: bool = False,
        min_count: int = 0,
        *args,
        **kwargs,
    ):
        """
        Compute sum of group values.

        Parameters
        ----------
        numeric_only : bool, default False
            Include only float, int, boolean columns.

            .. versionchanged:: 2.0.0

                numeric_only no longer accepts ``None``.

        min_count : int, default 0
            The required number of valid values to perform the operation. If fewer
            than ``min_count`` non-NA values are present the result will be NA.

        Returns
        -------
        Series or DataFrame
            Computed sum of values within each group.

        Examples
        --------
        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...                 ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> ser.resample('MS').sum()
        2023-01-01    3
        2023-02-01    7
        Freq: MS, dtype: int64
        """
        maybe_warn_args_and_kwargs(type(self), "sum", args, kwargs)
        nv.validate_resampler_func("sum", args, kwargs)
        return self._downsample("sum", numeric_only=numeric_only, min_count=min_count)

    @final
    def prod(
        self,
        numeric_only: bool = False,
        min_count: int = 0,
        *args,
        **kwargs,
    ):
        """
        Compute prod of group values.

        Parameters
        ----------
        numeric_only : bool, default False
            Include only float, int, boolean columns.

            .. versionchanged:: 2.0.0

                numeric_only no longer accepts ``None``.

        min_count : int, default 0
            The required number of valid values to perform the operation. If fewer
            than ``min_count`` non-NA values are present the result will be NA.

        Returns
        -------
        Series or DataFrame
            Computed prod of values within each group.

        Examples
        --------
        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...                 ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> ser.resample('MS').prod()
        2023-01-01     2
        2023-02-01    12
        Freq: MS, dtype: int64
        """
        maybe_warn_args_and_kwargs(type(self), "prod", args, kwargs)
        nv.validate_resampler_func("prod", args, kwargs)
        return self._downsample("prod", numeric_only=numeric_only, min_count=min_count)

    @final
    def min(
        self,
        numeric_only: bool = False,
        min_count: int = 0,
        *args,
        **kwargs,
    ):
        """
        Compute min value of group.

        Returns
        -------
        Series or DataFrame

        Examples
        --------
        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...                 ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> ser.resample('MS').min()
        2023-01-01    1
        2023-02-01    3
        Freq: MS, dtype: int64
        """

        maybe_warn_args_and_kwargs(type(self), "min", args, kwargs)
        nv.validate_resampler_func("min", args, kwargs)
        return self._downsample("min", numeric_only=numeric_only, min_count=min_count)

    @final
    def max(
        self,
        numeric_only: bool = False,
        min_count: int = 0,
        *args,
        **kwargs,
    ):
        """
        Compute max value of group.

        Returns
        -------
        Series or DataFrame

        Examples
        --------
        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...                 ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> ser.resample('MS').max()
        2023-01-01    2
        2023-02-01    4
        Freq: MS, dtype: int64
        """
        maybe_warn_args_and_kwargs(type(self), "max", args, kwargs)
        nv.validate_resampler_func("max", args, kwargs)
        return self._downsample("max", numeric_only=numeric_only, min_count=min_count)

    @final
    @doc(GroupBy.first)
    def first(
        self,
        numeric_only: bool = False,
        min_count: int = 0,
        skipna: bool = True,
        *args,
        **kwargs,
    ):
        maybe_warn_args_and_kwargs(type(self), "first", args, kwargs)
        nv.validate_resampler_func("first", args, kwargs)
        return self._downsample(
            "first", numeric_only=numeric_only, min_count=min_count, skipna=skipna
        )

    @final
    @doc(GroupBy.last)
    def last(
        self,
        numeric_only: bool = False,
        min_count: int = 0,
        skipna: bool = True,
        *args,
        **kwargs,
    ):
        maybe_warn_args_and_kwargs(type(self), "last", args, kwargs)
        nv.validate_resampler_func("last", args, kwargs)
        return self._downsample(
            "last", numeric_only=numeric_only, min_count=min_count, skipna=skipna
        )

    @final
    @doc(GroupBy.median)
    def median(self, numeric_only: bool = False, *args, **kwargs):
        maybe_warn_args_and_kwargs(type(self), "median", args, kwargs)
        nv.validate_resampler_func("median", args, kwargs)
        return self._downsample("median", numeric_only=numeric_only)

    @final
    def mean(
        self,
        numeric_only: bool = False,
        *args,
        **kwargs,
    ):
        """
        Compute mean of groups, excluding missing values.

        Parameters
        ----------
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionchanged:: 2.0.0

                numeric_only now defaults to ``False``.

        Returns
        -------
        DataFrame or Series
            Mean of values within each group.

        Examples
        --------

        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...                 ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> ser.resample('MS').mean()
        2023-01-01    1.5
        2023-02-01    3.5
        Freq: MS, dtype: float64
        """
        maybe_warn_args_and_kwargs(type(self), "mean", args, kwargs)
        nv.validate_resampler_func("mean", args, kwargs)
        return self._downsample("mean", numeric_only=numeric_only)

    @final
    def std(
        self,
        ddof: int = 1,
        numeric_only: bool = False,
        *args,
        **kwargs,
    ):
        """
        Compute standard deviation of groups, excluding missing values.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

            .. versionchanged:: 2.0.0

                numeric_only now defaults to ``False``.

        Returns
        -------
        DataFrame or Series
            Standard deviation of values within each group.

        Examples
        --------

        >>> ser = pd.Series([1, 3, 2, 4, 3, 8],
        ...                 index=pd.DatetimeIndex(['2023-01-01',
        ...                                         '2023-01-10',
        ...                                         '2023-01-15',
        ...                                         '2023-02-01',
        ...                                         '2023-02-10',
        ...                                         '2023-02-15']))
        >>> ser.resample('MS').std()
        2023-01-01    1.000000
        2023-02-01    2.645751
        Freq: MS, dtype: float64
        """
        maybe_warn_args_and_kwargs(type(self), "std", args, kwargs)
        nv.validate_resampler_func("std", args, kwargs)
        return self._downsample("std", ddof=ddof, numeric_only=numeric_only)

    @final
    def var(
        self,
        ddof: int = 1,
        numeric_only: bool = False,
        *args,
        **kwargs,
    ):
        """
        Compute variance of groups, excluding missing values.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

            .. versionchanged:: 2.0.0

                numeric_only now defaults to ``False``.

        Returns
        -------
        DataFrame or Series
            Variance of values within each group.

        Examples
        --------

        >>> ser = pd.Series([1, 3, 2, 4, 3, 8],
        ...                 index=pd.DatetimeIndex(['2023-01-01',
        ...                                         '2023-01-10',
        ...                                         '2023-01-15',
        ...                                         '2023-02-01',
        ...                                         '2023-02-10',
        ...                                         '2023-02-15']))
        >>> ser.resample('MS').var()
        2023-01-01    1.0
        2023-02-01    7.0
        Freq: MS, dtype: float64

        >>> ser.resample('MS').var(ddof=0)
        2023-01-01    0.666667
        2023-02-01    4.666667
        Freq: MS, dtype: float64
        """
        maybe_warn_args_and_kwargs(type(self), "var", args, kwargs)
        nv.validate_resampler_func("var", args, kwargs)
        return self._downsample("var", ddof=ddof, numeric_only=numeric_only)

    @final
    @doc(GroupBy.sem)
    def sem(
        self,
        ddof: int = 1,
        numeric_only: bool = False,
        *args,
        **kwargs,
    ):
        maybe_warn_args_and_kwargs(type(self), "sem", args, kwargs)
        nv.validate_resampler_func("sem", args, kwargs)
        return self._downsample("sem", ddof=ddof, numeric_only=numeric_only)

    @final
    @doc(GroupBy.ohlc)
    def ohlc(
        self,
        *args,
        **kwargs,
    ):
        maybe_warn_args_and_kwargs(type(self), "ohlc", args, kwargs)
        nv.validate_resampler_func("ohlc", args, kwargs)

        ax = self.ax
        obj = self._obj_with_exclusions
        if len(ax) == 0:
            # GH#42902
            obj = obj.copy()
            obj.index = _asfreq_compat(obj.index, self.freq)
            if obj.ndim == 1:
                obj = obj.to_frame()
                obj = obj.reindex(["open", "high", "low", "close"], axis=1)
            else:
                mi = MultiIndex.from_product(
                    [obj.columns, ["open", "high", "low", "close"]]
                )
                obj = obj.reindex(mi, axis=1)
            return obj

        return self._downsample("ohlc")

    @final
    @doc(SeriesGroupBy.nunique)
    def nunique(
        self,
        *args,
        **kwargs,
    ):
        maybe_warn_args_and_kwargs(type(self), "nunique", args, kwargs)
        nv.validate_resampler_func("nunique", args, kwargs)
        return self._downsample("nunique")

    @final
    @doc(GroupBy.size)
    def size(self):
        result = self._downsample("size")

        # If the result is a non-empty DataFrame we stack to get a Series
        # GH 46826
        if isinstance(result, ABCDataFrame) and not result.empty:
            result = result.stack(future_stack=True)

        if not len(self.ax):
            from pandas import Series

            if self._selected_obj.ndim == 1:
                name = self._selected_obj.name
            else:
                name = None
            result = Series([], index=result.index, dtype="int64", name=name)
        return result

    @final
    @doc(GroupBy.count)
    def count(self):
        result = self._downsample("count")
        if not len(self.ax):
            if self._selected_obj.ndim == 1:
                result = type(self._selected_obj)(
                    [], index=result.index, dtype="int64", name=self._selected_obj.name
                )
            else:
                from pandas import DataFrame

                result = DataFrame(
                    [], index=result.index, columns=result.columns, dtype="int64"
                )

        return result

    @final
    def quantile(self, q: float | list[float] | AnyArrayLike = 0.5, **kwargs):
        """
        Return value at the given quantile.

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)

        Returns
        -------
        DataFrame or Series
            Quantile of values within each group.

        See Also
        --------
        Series.quantile
            Return a series, where the index is q and the values are the quantiles.
        DataFrame.quantile
            Return a DataFrame, where the columns are the columns of self,
            and the values are the quantiles.
        DataFrameGroupBy.quantile
            Return a DataFrame, where the columns are groupby columns,
            and the values are its quantiles.

        Examples
        --------

        >>> ser = pd.Series([1, 3, 2, 4, 3, 8],
        ...                 index=pd.DatetimeIndex(['2023-01-01',
        ...                                         '2023-01-10',
        ...                                         '2023-01-15',
        ...                                         '2023-02-01',
        ...                                         '2023-02-10',
        ...                                         '2023-02-15']))
        >>> ser.resample('MS').quantile()
        2023-01-01    2.0
        2023-02-01    4.0
        Freq: MS, dtype: float64

        >>> ser.resample('MS').quantile(.25)
        2023-01-01    1.5
        2023-02-01    3.5
        Freq: MS, dtype: float64
        """
        return self._downsample("quantile", q=q, **kwargs)


class _GroupByMixin(PandasObject, SelectionMixin):
    """
    Provide the groupby facilities.
    """

    _attributes: list[str]  # in practice the same as Resampler._attributes
    _selection: IndexLabel | None = None
    _groupby: GroupBy
    _timegrouper: TimeGrouper

    def __init__(
        self,
        *,
        parent: Resampler,
        groupby: GroupBy,
        key=None,
        selection: IndexLabel | None = None,
        include_groups: bool = False,
    ) -> None:
        # reached via ._gotitem and _get_resampler_for_grouping

        assert isinstance(groupby, GroupBy), type(groupby)

        # parent is always a Resampler, sometimes a _GroupByMixin
        assert isinstance(parent, Resampler), type(parent)

        # initialize our GroupByMixin object with
        # the resampler attributes
        for attr in self._attributes:
            setattr(self, attr, getattr(parent, attr))
        self._selection = selection

        self.binner = parent.binner
        self.key = key

        self._groupby = groupby
        self._timegrouper = copy.copy(parent._timegrouper)

        self.ax = parent.ax
        self.obj = parent.obj
        self.include_groups = include_groups

    @no_type_check
    def _apply(self, f, *args, **kwargs):
        """
        Dispatch to _upsample; we are stripping all of the _upsample kwargs and
        performing the original function call on the grouped object.
        """

        def func(x):
            x = self._resampler_cls(x, timegrouper=self._timegrouper, gpr_index=self.ax)

            if isinstance(f, str):
                return getattr(x, f)(**kwargs)

            return x.apply(f, *args, **kwargs)

        result = _apply(self._groupby, func, include_groups=self.include_groups)
        return self._wrap_result(result)
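    # Sketch: df.groupby("key").resample("2D").mean() reaches this path; each
    # group is re-wrapped in the underlying Resampler class (via `func` above)
    # and the reduction is applied per group.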

    _upsample = _apply
    _downsample = _apply
    _groupby_and_aggregate = _apply

    @final
    def _gotitem(self, key, ndim, subset=None):
        """
        Sub-classes to define. Return a sliced object.

        Parameters
        ----------
        key : string / list of selections
        ndim : {1, 2}
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        # create a new object to prevent aliasing
        if subset is None:
            subset = self.obj
            if key is not None:
                subset = subset[key]
            else:
                # reached via Apply.agg_dict_like with selection=None, ndim=1
                assert subset.ndim == 1

        # Try to select from a DataFrame, falling back to a Series
        try:
            if isinstance(key, list) and self.key not in key and self.key is not None:
                key.append(self.key)
            groupby = self._groupby[key]
        except IndexError:
            groupby = self._groupby

        selection = self._infer_selection(key, subset)

        new_rs = type(self)(
            groupby=groupby,
            parent=cast(Resampler, self),
            selection=selection,
        )
        return new_rs


class DatetimeIndexResampler(Resampler):
    ax: DatetimeIndex

    @property
    def _resampler_for_grouping(self):
        return DatetimeIndexResamplerGroupby

    def _get_binner_for_time(self):
        # this is how we are actually creating the bins
        if self.kind == "period":
            return self._timegrouper._get_time_period_bins(self.ax)
        return self._timegrouper._get_time_bins(self.ax)

    def _downsample(self, how, **kwargs):
        """
        Downsample the cython defined function.

        Parameters
        ----------
        how : string / cython mapped function
        **kwargs : kw args passed to how function
        """
        orig_how = how
        how = com.get_cython_func(how) or how
        if orig_how != how:
            warn_alias_replacement(self, orig_how, how)
        ax = self.ax

        # Excludes `on` column when provided
        obj = self._obj_with_exclusions

        if not len(ax):
            # reset to the new freq
            obj = obj.copy()
            obj.index = obj.index._with_freq(self.freq)
            assert obj.index.freq == self.freq, (obj.index.freq, self.freq)
            return obj

        # do we have a regular frequency

        # error: Item "None" of "Optional[Any]" has no attribute "binlabels"
        if (
            (ax.freq is not None or ax.inferred_freq is not None)
            and len(self._grouper.binlabels) > len(ax)
            and how is None
        ):
            # let's do an asfreq
            return self.asfreq()

        # we are downsampling
        # we want to call the actual grouper method here
        if self.axis == 0:
            result = obj.groupby(self._grouper).aggregate(how, **kwargs)
        else:
            # test_resample_axis1
            result = obj.T.groupby(self._grouper).aggregate(how, **kwargs).T

        return self._wrap_result(result)

    def _adjust_binner_for_upsample(self, binner):
        """
        Adjust our binner when upsampling.

        The range of a new index should not be outside specified range
        """
        if self.closed == "right":
            binner = binner[1:]
        else:
            binner = binner[:-1]
        return binner
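    # e.g. with closed="right" the first edge is trimmed (binner[1:]);
    # otherwise the trailing edge is (binner[:-1]), so the upsampled index
    # stays within the range implied by the original data.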

    def _upsample(self, method, limit: int | None = None, fill_value=None):
        """
        Parameters
        ----------
        method : string {'backfill', 'bfill', 'pad',
            'ffill', 'asfreq'} method for upsampling
        limit : int, default None
            Maximum size gap to fill when reindexing
        fill_value : scalar, default None
            Value to use for missing values

        See Also
        --------
        .fillna: Fill NA/NaN values using the specified method.

        """
        if self.axis:
            raise AssertionError("axis must be 0")
        if self._from_selection:
            raise ValueError(
                "Upsampling from level= or on= selection "
                "is not supported, use .set_index(...) "
                "to explicitly set index to datetime-like"
            )

        ax = self.ax
        obj = self._selected_obj
        binner = self.binner
        res_index = self._adjust_binner_for_upsample(binner)

        # if we have the same frequency as our axis, then we are equal sampling
        if (
            limit is None
            and to_offset(ax.inferred_freq) == self.freq
            and len(obj) == len(res_index)
        ):
            result = obj.copy()
            result.index = res_index
        else:
            if method == "asfreq":
                method = None
            result = obj.reindex(
                res_index, method=method, limit=limit, fill_value=fill_value
            )

        return self._wrap_result(result)
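    # Sketch: hourly data upsampled to "30min" reindexes onto res_index;
    # method=None (asfreq) leaves the new slots as NaN, while "ffill"/"bfill"
    # fill them from the neighboring original values.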

    def _wrap_result(self, result):
        result = super()._wrap_result(result)

        # we may have a different kind that we were asked originally
        # convert if needed
        if self.kind == "period" and not isinstance(result.index, PeriodIndex):
            if isinstance(result.index, MultiIndex):
                # GH 24103 - e.g. groupby resample
                if not isinstance(result.index.levels[-1], PeriodIndex):
                    new_level = result.index.levels[-1].to_period(self.freq)
                    result.index = result.index.set_levels(new_level, level=-1)
            else:
                result.index = result.index.to_period(self.freq)
        return result


# error: Definition of "ax" in base class "_GroupByMixin" is incompatible
# with definition in base class "DatetimeIndexResampler"
class DatetimeIndexResamplerGroupby(  # type: ignore[misc]
    _GroupByMixin, DatetimeIndexResampler
):
    """
    Provides a resample of a groupby implementation.
    """

    @property
    def _resampler_cls(self):
        return DatetimeIndexResampler


class PeriodIndexResampler(DatetimeIndexResampler):
    # error: Incompatible types in assignment (expression has type "PeriodIndex", base
    # class "DatetimeIndexResampler" defined the type as "DatetimeIndex")
    ax: PeriodIndex  # type: ignore[assignment]

    @property
    def _resampler_for_grouping(self):
        warnings.warn(
            "Resampling a groupby with a PeriodIndex is deprecated. "
            "Cast to DatetimeIndex before resampling instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return PeriodIndexResamplerGroupby

    def _get_binner_for_time(self):
        if self.kind == "timestamp":
            return super()._get_binner_for_time()
        return self._timegrouper._get_period_bins(self.ax)

    def _convert_obj(self, obj: NDFrameT) -> NDFrameT:
        obj = super()._convert_obj(obj)

        if self._from_selection:
            # see GH 14008, GH 12871
            msg = (
                "Resampling from level= or on= selection "
                "with a PeriodIndex is not currently supported, "
                "use .set_index(...) to explicitly set index"
            )
            raise NotImplementedError(msg)

        # convert to timestamp
        if self.kind == "timestamp":
            obj = obj.to_timestamp(how=self.convention)

        return obj

    def _downsample(self, how, **kwargs):
        """
        Downsample the cython defined function.

        Parameters
        ----------
        how : string / cython mapped function
        **kwargs : kw args passed to how function
        """
        # we may need to actually resample as if we are timestamps
        if self.kind == "timestamp":
            return super()._downsample(how, **kwargs)

        orig_how = how
        how = com.get_cython_func(how) or how
        if orig_how != how:
            warn_alias_replacement(self, orig_how, how)
        ax = self.ax

        if is_subperiod(ax.freq, self.freq):
            # Downsampling
            return self._groupby_and_aggregate(how, **kwargs)
        elif is_superperiod(ax.freq, self.freq):
            if how == "ohlc":
                # GH #13083
                # upsampling to subperiods is handled as an asfreq, which works
                # for pure aggregating/reducing methods
                # OHLC reduces along the time dimension, but creates multiple
                # values for each period -> handle by _groupby_and_aggregate()
                return self._groupby_and_aggregate(how)
            return self.asfreq()
        elif ax.freq == self.freq:
            return self.asfreq()

        raise IncompatibleFrequency(
            f"Frequency {ax.freq} cannot be resampled to {self.freq}, "
            "as they are not sub or super periods"
        )
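    # e.g. is_subperiod(D, M) is True, so daily periods are aggregated into
    # monthly bins, while is_superperiod(M, D) routes through asfreq (or
    # through _groupby_and_aggregate for "ohlc").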
1954
1955 def _upsample(self, method, limit: int | None = None, fill_value=None):
1956 """
1957 Parameters
1958 ----------
1959 method : {'backfill', 'bfill', 'pad', 'ffill'}
1960 Method for upsampling.
1961 limit : int, default None
1962 Maximum size gap to fill when reindexing.
1963 fill_value : scalar, default None
1964 Value to use for missing values.
1965
1966 See Also
1967 --------
1968 .fillna: Fill NA/NaN values using the specified method.
1969
1970 """
1971 # we may need to actually resample as if we are timestamps
1972 if self.kind == "timestamp":
1973 return super()._upsample(method, limit=limit, fill_value=fill_value)
1974
1975 ax = self.ax
1976 obj = self.obj
1977 new_index = self.binner
1978
1979 # Start vs. end of period
1980 memb = ax.asfreq(self.freq, how=self.convention)
1981
1982 # Get the fill indexer
1983 if method == "asfreq":
1984 method = None
1985 indexer = memb.get_indexer(new_index, method=method, limit=limit)
1986 new_obj = _take_new_index(
1987 obj,
1988 indexer,
1989 new_index,
1990 axis=self.axis,
1991 )
1992 return self._wrap_result(new_obj)
1993
1994
1995# error: Definition of "ax" in base class "_GroupByMixin" is incompatible with
1996# definition in base class "PeriodIndexResampler"
1997class PeriodIndexResamplerGroupby( # type: ignore[misc]
1998 _GroupByMixin, PeriodIndexResampler
1999):
2000 """
2001 Provides a resample of a groupby implementation.
2002 """
2003
2004 @property
2005 def _resampler_cls(self):
2006 return PeriodIndexResampler
2007
2008
2009class TimedeltaIndexResampler(DatetimeIndexResampler):
2010 # error: Incompatible types in assignment (expression has type "TimedeltaIndex",
2011 # base class "DatetimeIndexResampler" defined the type as "DatetimeIndex")
2012 ax: TimedeltaIndex # type: ignore[assignment]
2013
2014 @property
2015 def _resampler_for_grouping(self):
2016 return TimedeltaIndexResamplerGroupby
2017
2018 def _get_binner_for_time(self):
2019 return self._timegrouper._get_time_delta_bins(self.ax)
2020
2021 def _adjust_binner_for_upsample(self, binner):
2022 """
2023 Adjust our binner when upsampling.
2024
2025 The range of a new index is allowed to be greater than original range
2026 so we don't need to change the length of a binner, GH 13022
2027 """
2028 return binner
2029
2030
2031# error: Definition of "ax" in base class "_GroupByMixin" is incompatible with
2032# definition in base class "DatetimeIndexResampler"
2033class TimedeltaIndexResamplerGroupby( # type: ignore[misc]
2034 _GroupByMixin, TimedeltaIndexResampler
2035):
2036 """
2037 Provides a resample of a groupby implementation.
2038 """
2039
2040 @property
2041 def _resampler_cls(self):
2042 return TimedeltaIndexResampler
2043
2044
2045def get_resampler(obj: Series | DataFrame, kind=None, **kwds) -> Resampler:
2046 """
2047 Create a TimeGrouper and return our resampler.
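
    A doctest-style sketch (values are illustrative; ``obj.resample(...)`` is
    the public path that ends up here):

    >>> import pandas as pd
    >>> s = pd.Series(range(4), index=pd.date_range("2000", periods=4, freq="h"))
    >>> get_resampler(s, freq="2h").sum()
    2000-01-01 00:00:00    1
    2000-01-01 02:00:00    5
    Freq: 2h, dtype: int64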
2048 """
2049 tg = TimeGrouper(obj, **kwds) # type: ignore[arg-type]
2050 return tg._get_resampler(obj, kind=kind)
2051
2052
2053get_resampler.__doc__ = Resampler.__doc__
2054
2055
2056def get_resampler_for_grouping(
2057 groupby: GroupBy,
2058 rule,
2059 how=None,
2060 fill_method=None,
2061 limit: int | None = None,
2062 kind=None,
2063 on=None,
2064 include_groups: bool = True,
2065 **kwargs,
2066) -> Resampler:
2067 """
    Return the appropriate resampler when resampling within a groupby.
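
    A minimal sketch of the public path that reaches this helper (values are
    illustrative):

    >>> import pandas as pd
    >>> df = pd.DataFrame(
    ...     {"key": ["a", "a", "b", "b"], "val": range(4)},
    ...     index=pd.date_range("2000", periods=4, freq="h"),
    ... )
    >>> r = df.groupby("key").resample("2h")  # DatetimeIndexResamplerGroupby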
2069 """
2070 # .resample uses 'on' similar to how .groupby uses 'key'
2071 tg = TimeGrouper(freq=rule, key=on, **kwargs)
2072 resampler = tg._get_resampler(groupby.obj, kind=kind)
2073 return resampler._get_resampler_for_grouping(
2074 groupby=groupby, include_groups=include_groups, key=tg.key
2075 )
2076
2077
2078class TimeGrouper(Grouper):
2079 """
2080 Custom groupby class for time-interval grouping.
2081
2082 Parameters
2083 ----------
    freq : pandas date offset or offset alias identifying the bin edges
    closed : {'left', 'right'}
        Closed end of the bin interval.
    label : {'left', 'right'}
        Interval boundary to use for labeling.
    convention : {'start', 'end', 'e', 's'}
        Which end of each period to use when the axis is a PeriodIndex.
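
    Examples
    --------
    An illustrative check: passing ``freq`` to ``pd.Grouper`` dispatches to
    this class.

    >>> import pandas as pd
    >>> type(pd.Grouper(freq="2h")).__name__
    'TimeGrouper'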
2089 """
2090
2091 _attributes = Grouper._attributes + (
2092 "closed",
2093 "label",
2094 "how",
2095 "kind",
2096 "convention",
2097 "origin",
2098 "offset",
2099 )
2100
2101 origin: TimeGrouperOrigin
2102
2103 def __init__(
2104 self,
2105 obj: Grouper | None = None,
2106 freq: Frequency = "Min",
2107 key: str | None = None,
2108 closed: Literal["left", "right"] | None = None,
2109 label: Literal["left", "right"] | None = None,
2110 how: str = "mean",
2111 axis: Axis = 0,
2112 fill_method=None,
2113 limit: int | None = None,
2114 kind: str | None = None,
2115 convention: Literal["start", "end", "e", "s"] | None = None,
2116 origin: Literal["epoch", "start", "start_day", "end", "end_day"]
2117 | TimestampConvertibleTypes = "start_day",
2118 offset: TimedeltaConvertibleTypes | None = None,
2119 group_keys: bool = False,
2120 **kwargs,
2121 ) -> None:
2122 # Check for correctness of the keyword arguments which would
2123 # otherwise silently use the default if misspelled
2124 if label not in {None, "left", "right"}:
2125 raise ValueError(f"Unsupported value {label} for `label`")
2126 if closed not in {None, "left", "right"}:
2127 raise ValueError(f"Unsupported value {closed} for `closed`")
2128 if convention not in {None, "start", "end", "e", "s"}:
2129 raise ValueError(f"Unsupported value {convention} for `convention`")
2130
2131 if (
2132 key is None
2133 and obj is not None
2134 and isinstance(obj.index, PeriodIndex) # type: ignore[attr-defined]
2135 or (
2136 key is not None
2137 and obj is not None
2138 and getattr(obj[key], "dtype", None) == "period" # type: ignore[index]
2139 )
2140 ):
2141 freq = to_offset(freq, is_period=True)
2142 else:
2143 freq = to_offset(freq)
2144
2145 end_types = {"ME", "YE", "QE", "BME", "BYE", "BQE", "W"}
2146 rule = freq.rule_code
2147 if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types):
2148 if closed is None:
2149 closed = "right"
2150 if label is None:
2151 label = "right"
2152 else:
2153 # The backward resample sets ``closed`` to ``'right'`` by default
2154 # since the last value should be considered as the edge point for
            # the last bin. When origin is "end" or "end_day", the value for a
2156 # specific ``Timestamp`` index stands for the resample result from
2157 # the current ``Timestamp`` minus ``freq`` to the current
2158 # ``Timestamp`` with a right close.
2159 if origin in ["end", "end_day"]:
2160 if closed is None:
2161 closed = "right"
2162 if label is None:
2163 label = "right"
2164 else:
2165 if closed is None:
2166 closed = "left"
2167 if label is None:
2168 label = "left"
2169
2170 self.closed = closed
2171 self.label = label
2172 self.kind = kind
2173 self.convention = convention if convention is not None else "e"
2174 self.how = how
2175 self.fill_method = fill_method
2176 self.limit = limit
2177 self.group_keys = group_keys
2178 self._arrow_dtype: ArrowDtype | None = None
2179
2180 if origin in ("epoch", "start", "start_day", "end", "end_day"):
2181 # error: Incompatible types in assignment (expression has type "Union[Union[
2182 # Timestamp, datetime, datetime64, signedinteger[_64Bit], float, str],
2183 # Literal['epoch', 'start', 'start_day', 'end', 'end_day']]", variable has
2184 # type "Union[Timestamp, Literal['epoch', 'start', 'start_day', 'end',
2185 # 'end_day']]")
2186 self.origin = origin # type: ignore[assignment]
2187 else:
2188 try:
2189 self.origin = Timestamp(origin)
2190 except (ValueError, TypeError) as err:
2191 raise ValueError(
2192 "'origin' should be equal to 'epoch', 'start', 'start_day', "
2193 "'end', 'end_day' or "
2194 f"should be a Timestamp convertible type. Got '{origin}' instead."
2195 ) from err
2196
2197 try:
2198 self.offset = Timedelta(offset) if offset is not None else None
2199 except (ValueError, TypeError) as err:
2200 raise ValueError(
2201 "'offset' should be a Timedelta convertible type. "
2202 f"Got '{offset}' instead."
2203 ) from err
2204
2205 # always sort time groupers
2206 kwargs["sort"] = True
2207
2208 super().__init__(freq=freq, key=key, axis=axis, **kwargs)
2209
2210 def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler:
2211 """
2212 Return my resampler or raise if we have an invalid axis.
2213
2214 Parameters
2215 ----------
2216 obj : Series or DataFrame
        kind : str, optional
            'period', 'timestamp' and 'timedelta' are valid
2219
2220 Returns
2221 -------
2222 Resampler
2223
2224 Raises
2225 ------
        TypeError
            If the axis is not a DatetimeIndex, TimedeltaIndex, or PeriodIndex.
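
        Examples
        --------
        Illustrative only:

        >>> import pandas as pd
        >>> s = pd.Series(range(3), index=pd.date_range("2000", periods=3, freq="D"))
        >>> type(s.resample("ME")).__name__
        'DatetimeIndexResampler'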
2227
2228 """
2229 _, ax, _ = self._set_grouper(obj, gpr_index=None)
2230 if isinstance(ax, DatetimeIndex):
2231 return DatetimeIndexResampler(
2232 obj,
2233 timegrouper=self,
2234 kind=kind,
2235 axis=self.axis,
2236 group_keys=self.group_keys,
2237 gpr_index=ax,
2238 )
2239 elif isinstance(ax, PeriodIndex) or kind == "period":
2240 if isinstance(ax, PeriodIndex):
2241 # GH#53481
2242 warnings.warn(
2243 "Resampling with a PeriodIndex is deprecated. "
2244 "Cast index to DatetimeIndex before resampling instead.",
2245 FutureWarning,
2246 stacklevel=find_stack_level(),
2247 )
2248 else:
2249 warnings.warn(
2250 "Resampling with kind='period' is deprecated. "
2251 "Use datetime paths instead.",
2252 FutureWarning,
2253 stacklevel=find_stack_level(),
2254 )
2255 return PeriodIndexResampler(
2256 obj,
2257 timegrouper=self,
2258 kind=kind,
2259 axis=self.axis,
2260 group_keys=self.group_keys,
2261 gpr_index=ax,
2262 )
2263 elif isinstance(ax, TimedeltaIndex):
2264 return TimedeltaIndexResampler(
2265 obj,
2266 timegrouper=self,
2267 axis=self.axis,
2268 group_keys=self.group_keys,
2269 gpr_index=ax,
2270 )
2271
2272 raise TypeError(
2273 "Only valid with DatetimeIndex, "
2274 "TimedeltaIndex or PeriodIndex, "
2275 f"but got an instance of '{type(ax).__name__}'"
2276 )
2277
2278 def _get_grouper(
2279 self, obj: NDFrameT, validate: bool = True
2280 ) -> tuple[BinGrouper, NDFrameT]:
2281 # create the resampler and return our binner
2282 r = self._get_resampler(obj)
2283 return r._grouper, cast(NDFrameT, r.obj)
2284
2285 def _get_time_bins(self, ax: DatetimeIndex):
2286 if not isinstance(ax, DatetimeIndex):
2287 raise TypeError(
2288 "axis must be a DatetimeIndex, but got "
2289 f"an instance of {type(ax).__name__}"
2290 )
2291
2292 if len(ax) == 0:
2293 binner = labels = DatetimeIndex(
2294 data=[], freq=self.freq, name=ax.name, dtype=ax.dtype
2295 )
2296 return binner, [], labels
2297
2298 first, last = _get_timestamp_range_edges(
2299 ax.min(),
2300 ax.max(),
2301 self.freq,
2302 unit=ax.unit,
2303 closed=self.closed,
2304 origin=self.origin,
2305 offset=self.offset,
2306 )
2307 # GH #12037
        # use first/last directly instead of calling replace() on them,
        # because replace() will swallow the nanosecond part; the last bin
        # may then end slightly before the true end if the end contains a
        # nanosecond part, leading to a `Values falls after last bin` error
2312 # GH 25758: If DST lands at midnight (e.g. 'America/Havana'), user feedback
2313 # has noted that ambiguous=True provides the most sensible result
2314 binner = labels = date_range(
2315 freq=self.freq,
2316 start=first,
2317 end=last,
2318 tz=ax.tz,
2319 name=ax.name,
2320 ambiguous=True,
2321 nonexistent="shift_forward",
2322 unit=ax.unit,
2323 )
2324
2325 ax_values = ax.asi8
2326 binner, bin_edges = self._adjust_bin_edges(binner, ax_values)
2327
2328 # general version, knowing nothing about relative frequencies
2329 bins = lib.generate_bins_dt64(
2330 ax_values, bin_edges, self.closed, hasnans=ax.hasnans
2331 )
2332
2333 if self.closed == "right":
2334 labels = binner
2335 if self.label == "right":
2336 labels = labels[1:]
2337 elif self.label == "right":
2338 labels = labels[1:]
2339
2340 if ax.hasnans:
2341 binner = binner.insert(0, NaT)
2342 labels = labels.insert(0, NaT)
2343
2344 # if we end up with more labels than bins
2345 # adjust the labels
2346 # GH4076
2347 if len(bins) < len(labels):
2348 labels = labels[: len(bins)]
2349
2350 return binner, bins, labels
2351
2352 def _adjust_bin_edges(
2353 self, binner: DatetimeIndex, ax_values: npt.NDArray[np.int64]
2354 ) -> tuple[DatetimeIndex, npt.NDArray[np.int64]]:
2355 # Some hacks for > daily data, see #1471, #1458, #1483
2356
2357 if self.freq.name in ("BME", "ME", "W") or self.freq.name.split("-")[0] in (
2358 "BQE",
2359 "BYE",
2360 "QE",
2361 "YE",
2362 "W",
2363 ):
2364 # If the right end-point is on the last day of the month, roll forwards
2365 # until the last moment of that day. Note that we only do this for offsets
2366 # which correspond to the end of a super-daily period - "month start", for
2367 # example, is excluded.
2368 if self.closed == "right":
2369 # GH 21459, GH 9119: Adjust the bins relative to the wall time
2370 edges_dti = binner.tz_localize(None)
2371 edges_dti = (
2372 edges_dti
2373 + Timedelta(days=1, unit=edges_dti.unit).as_unit(edges_dti.unit)
2374 - Timedelta(1, unit=edges_dti.unit).as_unit(edges_dti.unit)
2375 )
2376 bin_edges = edges_dti.tz_localize(binner.tz).asi8
2377 else:
2378 bin_edges = binner.asi8
2379
2380 # intraday values on last day
2381 if bin_edges[-2] > ax_values.max():
2382 bin_edges = bin_edges[:-1]
2383 binner = binner[:-1]
2384 else:
2385 bin_edges = binner.asi8
2386 return binner, bin_edges
2387
2388 def _get_time_delta_bins(self, ax: TimedeltaIndex):
2389 if not isinstance(ax, TimedeltaIndex):
2390 raise TypeError(
2391 "axis must be a TimedeltaIndex, but got "
2392 f"an instance of {type(ax).__name__}"
2393 )
2394
2395 if not isinstance(self.freq, Tick):
2396 # GH#51896
2397 raise ValueError(
2398 "Resampling on a TimedeltaIndex requires fixed-duration `freq`, "
2399 f"e.g. '24h' or '3D', not {self.freq}"
2400 )
2401
2402 if not len(ax):
2403 binner = labels = TimedeltaIndex(data=[], freq=self.freq, name=ax.name)
2404 return binner, [], labels
2405
2406 start, end = ax.min(), ax.max()
2407
2408 if self.closed == "right":
2409 end += self.freq
2410
2411 labels = binner = timedelta_range(
2412 start=start, end=end, freq=self.freq, name=ax.name
2413 )
2414
2415 end_stamps = labels
2416 if self.closed == "left":
2417 end_stamps += self.freq
2418
2419 bins = ax.searchsorted(end_stamps, side=self.closed)
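        # e.g. (illustrative): ax = ["0h", "1h", "2h"] with freq="2h" and
        # closed="left" gives binner/labels ["0h", "2h"], end_stamps
        # ["2h", "4h"], and bins = [2, 3].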
2420
2421 if self.offset:
2422 # GH 10530 & 31809
2423 labels += self.offset
2424
2425 return binner, bins, labels
2426
2427 def _get_time_period_bins(self, ax: DatetimeIndex):
2428 if not isinstance(ax, DatetimeIndex):
2429 raise TypeError(
2430 "axis must be a DatetimeIndex, but got "
2431 f"an instance of {type(ax).__name__}"
2432 )
2433
2434 freq = self.freq
2435
2436 if len(ax) == 0:
2437 binner = labels = PeriodIndex(
2438 data=[], freq=freq, name=ax.name, dtype=ax.dtype
2439 )
2440 return binner, [], labels
2441
        labels = binner = period_range(
            start=ax[0], end=ax[-1], freq=freq, name=ax.name
        )
2443
2444 end_stamps = (labels + freq).asfreq(freq, "s").to_timestamp()
2445 if ax.tz:
2446 end_stamps = end_stamps.tz_localize(ax.tz)
2447 bins = ax.searchsorted(end_stamps, side="left")
2448
2449 return binner, bins, labels
2450
2451 def _get_period_bins(self, ax: PeriodIndex):
2452 if not isinstance(ax, PeriodIndex):
2453 raise TypeError(
2454 "axis must be a PeriodIndex, but got "
2455 f"an instance of {type(ax).__name__}"
2456 )
2457
2458 memb = ax.asfreq(self.freq, how=self.convention)
2459
        # NaT handling as in pandas._libs.lib.generate_bins_dt64()
2461 nat_count = 0
2462 if memb.hasnans:
2463 # error: Incompatible types in assignment (expression has type
2464 # "bool_", variable has type "int") [assignment]
2465 nat_count = np.sum(memb._isnan) # type: ignore[assignment]
2466 memb = memb[~memb._isnan]
2467
2468 if not len(memb):
2469 # index contains no valid (non-NaT) values
2470 bins = np.array([], dtype=np.int64)
2471 binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name)
2472 if len(ax) > 0:
2473 # index is all NaT
2474 binner, bins, labels = _insert_nat_bin(binner, bins, labels, len(ax))
2475 return binner, bins, labels
2476
2477 freq_mult = self.freq.n
2478
2479 start = ax.min().asfreq(self.freq, how=self.convention)
2480 end = ax.max().asfreq(self.freq, how="end")
2481 bin_shift = 0
2482
2483 if isinstance(self.freq, Tick):
2484 # GH 23882 & 31809: get adjusted bin edge labels with 'origin'
2485 # and 'origin' support. This call only makes sense if the freq is a
2486 # Tick since offset and origin are only used in those cases.
2487 # Not doing this check could create an extra empty bin.
2488 p_start, end = _get_period_range_edges(
2489 start,
2490 end,
2491 self.freq,
2492 closed=self.closed,
2493 origin=self.origin,
2494 offset=self.offset,
2495 )
2496
2497 # Get offset for bin edge (not label edge) adjustment
2498 start_offset = Period(start, self.freq) - Period(p_start, self.freq)
2499 # error: Item "Period" of "Union[Period, Any]" has no attribute "n"
2500 bin_shift = start_offset.n % freq_mult # type: ignore[union-attr]
2501 start = p_start
2502
2503 labels = binner = period_range(
2504 start=start, end=end, freq=self.freq, name=ax.name
2505 )
2506
2507 i8 = memb.asi8
2508
2509 # when upsampling to subperiods, we need to generate enough bins
2510 expected_bins_count = len(binner) * freq_mult
2511 i8_extend = expected_bins_count - (i8[-1] - i8[0])
2512 rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult)
2513 rng += freq_mult
2514 # adjust bin edge indexes to account for base
2515 rng -= bin_shift
2516
2517 # Wrap in PeriodArray for PeriodArray.searchsorted
2518 prng = type(memb._data)(rng, dtype=memb.dtype)
2519 bins = memb.searchsorted(prng, side="left")
2520
2521 if nat_count > 0:
2522 binner, bins, labels = _insert_nat_bin(binner, bins, labels, nat_count)
2523
2524 return binner, bins, labels
2525
2526 def _set_grouper(
2527 self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None
2528 ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]:
2529 obj, ax, indexer = super()._set_grouper(obj, sort, gpr_index=gpr_index)
2530 if isinstance(ax.dtype, ArrowDtype) and ax.dtype.kind in "Mm":
2531 self._arrow_dtype = ax.dtype
2532 ax = Index(
2533 cast(ArrowExtensionArray, ax.array)._maybe_convert_datelike_array()
2534 )
2535 return obj, ax, indexer
2536
2537
2538def _take_new_index(
2539 obj: NDFrameT, indexer: npt.NDArray[np.intp], new_index: Index, axis: AxisInt = 0
2540) -> NDFrameT:
2541 if isinstance(obj, ABCSeries):
2542 new_values = algos.take_nd(obj._values, indexer)
2543 # error: Incompatible return value type (got "Series", expected "NDFrameT")
2544 return obj._constructor( # type: ignore[return-value]
2545 new_values, index=new_index, name=obj.name
2546 )
2547 elif isinstance(obj, ABCDataFrame):
2548 if axis == 1:
2549 raise NotImplementedError("axis 1 is not supported")
2550 new_mgr = obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1)
2551 # error: Incompatible return value type (got "DataFrame", expected "NDFrameT")
2552 return obj._constructor_from_mgr(new_mgr, axes=new_mgr.axes) # type: ignore[return-value]
2553 else:
2554 raise ValueError("'obj' should be either a Series or a DataFrame")
2555
2556
2557def _get_timestamp_range_edges(
2558 first: Timestamp,
2559 last: Timestamp,
2560 freq: BaseOffset,
2561 unit: str,
2562 closed: Literal["right", "left"] = "left",
2563 origin: TimeGrouperOrigin = "start_day",
2564 offset: Timedelta | None = None,
2565) -> tuple[Timestamp, Timestamp]:
2566 """
2567 Adjust the `first` Timestamp to the preceding Timestamp that resides on
2568 the provided offset. Adjust the `last` Timestamp to the following
2569 Timestamp that resides on the provided offset. Input Timestamps that
2570 already reside on the offset will be adjusted depending on the type of
2571 offset and the `closed` parameter.
2572
2573 Parameters
2574 ----------
2575 first : pd.Timestamp
2576 The beginning Timestamp of the range to be adjusted.
2577 last : pd.Timestamp
2578 The ending Timestamp of the range to be adjusted.
2579 freq : pd.DateOffset
2580 The dateoffset to which the Timestamps will be adjusted.
2581 closed : {'right', 'left'}, default "left"
2582 Which side of bin interval is closed.
2583 origin : {'epoch', 'start', 'start_day'} or Timestamp, default 'start_day'
2584 The timestamp on which to adjust the grouping. The timezone of origin must
2585 match the timezone of the index.
2586 If a timestamp is not used, these values are also supported:
2587
2588 - 'epoch': `origin` is 1970-01-01
2589 - 'start': `origin` is the first value of the timeseries
2590 - 'start_day': `origin` is the first day at midnight of the timeseries
2591 offset : pd.Timedelta, default is None
2592 An offset timedelta added to the origin.
2593
2594 Returns
2595 -------
2596 A tuple of length 2, containing the adjusted pd.Timestamp objects.
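
    Examples
    --------
    An illustrative check with arbitrary values:

    >>> import pandas as pd
    >>> _get_timestamp_range_edges(
    ...     pd.Timestamp("2000-10-01 23:30:00"),
    ...     pd.Timestamp("2000-10-02 00:30:00"),
    ...     pd.offsets.Minute(7),
    ...     unit="ns",
    ... )
    (Timestamp('2000-10-01 23:27:00'), Timestamp('2000-10-02 00:37:00'))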
2597 """
2598 if isinstance(freq, Tick):
2599 index_tz = first.tz
2600 if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None):
2601 raise ValueError("The origin must have the same timezone as the index.")
2602 if origin == "epoch":
2603 # set the epoch based on the timezone to have similar bins results when
2604 # resampling on the same kind of indexes on different timezones
2605 origin = Timestamp("1970-01-01", tz=index_tz)
2606
2607 if isinstance(freq, Day):
2608 # _adjust_dates_anchored assumes 'D' means 24h, but first/last
2609 # might contain a DST transition (23h, 24h, or 25h).
2610 # So "pretend" the dates are naive when adjusting the endpoints
2611 first = first.tz_localize(None)
2612 last = last.tz_localize(None)
2613 if isinstance(origin, Timestamp):
2614 origin = origin.tz_localize(None)
2615
2616 first, last = _adjust_dates_anchored(
2617 first, last, freq, closed=closed, origin=origin, offset=offset, unit=unit
2618 )
2619 if isinstance(freq, Day):
2620 first = first.tz_localize(index_tz)
2621 last = last.tz_localize(index_tz)
2622 else:
2623 first = first.normalize()
2624 last = last.normalize()
2625
2626 if closed == "left":
2627 first = Timestamp(freq.rollback(first))
2628 else:
2629 first = Timestamp(first - freq)
2630
2631 last = Timestamp(last + freq)
2632
2633 return first, last
2634
2635
2636def _get_period_range_edges(
2637 first: Period,
2638 last: Period,
2639 freq: BaseOffset,
2640 closed: Literal["right", "left"] = "left",
2641 origin: TimeGrouperOrigin = "start_day",
2642 offset: Timedelta | None = None,
2643) -> tuple[Period, Period]:
2644 """
2645 Adjust the provided `first` and `last` Periods to the respective Period of
2646 the given offset that encompasses them.
2647
2648 Parameters
2649 ----------
2650 first : pd.Period
2651 The beginning Period of the range to be adjusted.
2652 last : pd.Period
2653 The ending Period of the range to be adjusted.
2654 freq : pd.DateOffset
2655 The freq to which the Periods will be adjusted.
2656 closed : {'right', 'left'}, default "left"
2657 Which side of bin interval is closed.
2658 origin : {'epoch', 'start', 'start_day'}, Timestamp, default 'start_day'
2659 The timestamp on which to adjust the grouping. The timezone of origin must
2660 match the timezone of the index.
2661
2662 If a timestamp is not used, these values are also supported:
2663
2664 - 'epoch': `origin` is 1970-01-01
2665 - 'start': `origin` is the first value of the timeseries
2666 - 'start_day': `origin` is the first day at midnight of the timeseries
2667 offset : pd.Timedelta, default is None
2668 An offset timedelta added to the origin.
2669
2670 Returns
2671 -------
2672 A tuple of length 2, containing the adjusted pd.Period objects.
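
    Examples
    --------
    A minimal sketch with arbitrary daily periods:

    >>> import pandas as pd
    >>> _get_period_range_edges(
    ...     pd.Period("2000-01-02", freq="D"),
    ...     pd.Period("2000-01-05", freq="D"),
    ...     pd.offsets.Day(),
    ... )
    (Period('2000-01-02', 'D'), Period('2000-01-05', 'D'))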
2673 """
2674 if not all(isinstance(obj, Period) for obj in [first, last]):
2675 raise TypeError("'first' and 'last' must be instances of type Period")
2676
2677 # GH 23882
2678 first_ts = first.to_timestamp()
2679 last_ts = last.to_timestamp()
2680 adjust_first = not freq.is_on_offset(first_ts)
2681 adjust_last = freq.is_on_offset(last_ts)
2682
2683 first_ts, last_ts = _get_timestamp_range_edges(
2684 first_ts, last_ts, freq, unit="ns", closed=closed, origin=origin, offset=offset
2685 )
2686
2687 first = (first_ts + int(adjust_first) * freq).to_period(freq)
2688 last = (last_ts - int(adjust_last) * freq).to_period(freq)
2689 return first, last
2690
2691
2692def _insert_nat_bin(
2693 binner: PeriodIndex, bins: np.ndarray, labels: PeriodIndex, nat_count: int
2694) -> tuple[PeriodIndex, np.ndarray, PeriodIndex]:
    # NaT handling as in pandas._libs.lib.generate_bins_dt64()
2696 # shift bins by the number of NaT
2697 assert nat_count > 0
2698 bins += nat_count
2699 bins = np.insert(bins, 0, nat_count)
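    # e.g. (illustrative): bins [1, 2] with nat_count=1 become [1, 2, 3]; the
    # NaT values sort first, so the existing edges shift right by nat_count
    # and a leading bin ending at position nat_count is prepended.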
2700
2701 # Incompatible types in assignment (expression has type "Index", variable
2702 # has type "PeriodIndex")
2703 binner = binner.insert(0, NaT) # type: ignore[assignment]
2704 # Incompatible types in assignment (expression has type "Index", variable
2705 # has type "PeriodIndex")
2706 labels = labels.insert(0, NaT) # type: ignore[assignment]
2707 return binner, bins, labels
2708
2709
2710def _adjust_dates_anchored(
2711 first: Timestamp,
2712 last: Timestamp,
2713 freq: Tick,
2714 closed: Literal["right", "left"] = "right",
2715 origin: TimeGrouperOrigin = "start_day",
2716 offset: Timedelta | None = None,
2717 unit: str = "ns",
2718) -> tuple[Timestamp, Timestamp]:
    # First and last offsets should be calculated from the start day to fix an
    # error caused by resampling across multiple days when a one-day period is
    # not a multiple of the frequency. See GH 8683.
    # To handle frequencies that are not multiples of, or divisible by, a day,
    # we allow a fixed origin timestamp to be defined. See GH 31809.
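    # Worked example (illustrative): freq="7min" (420s), origin="start_day",
    # first="2000-10-01 23:30" -> 84600s since midnight, and 84600 % 420 ==
    # 180, so with closed="left" the first edge rolls back 3 minutes to
    # "2000-10-01 23:27".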
2724 first = first.as_unit(unit)
2725 last = last.as_unit(unit)
2726 if offset is not None:
2727 offset = offset.as_unit(unit)
2728
2729 freq_value = Timedelta(freq).as_unit(unit)._value
2730
2731 origin_timestamp = 0 # origin == "epoch"
2732 if origin == "start_day":
2733 origin_timestamp = first.normalize()._value
2734 elif origin == "start":
2735 origin_timestamp = first._value
2736 elif isinstance(origin, Timestamp):
2737 origin_timestamp = origin.as_unit(unit)._value
2738 elif origin in ["end", "end_day"]:
2739 origin_last = last if origin == "end" else last.ceil("D")
2740 sub_freq_times = (origin_last._value - first._value) // freq_value
2741 if closed == "left":
2742 sub_freq_times += 1
2743 first = origin_last - sub_freq_times * freq
2744 origin_timestamp = first._value
2745 origin_timestamp += offset._value if offset else 0
2746
    # GH 10117 & GH 19375. If first and last contain timezone information,
    # perform the calculation in UTC in order to avoid localizing on an
    # ambiguous or nonexistent time.
2750 first_tzinfo = first.tzinfo
2751 last_tzinfo = last.tzinfo
2752 if first_tzinfo is not None:
2753 first = first.tz_convert("UTC")
2754 if last_tzinfo is not None:
2755 last = last.tz_convert("UTC")
2756
2757 foffset = (first._value - origin_timestamp) % freq_value
2758 loffset = (last._value - origin_timestamp) % freq_value
2759
2760 if closed == "right":
2761 if foffset > 0:
2762 # roll back
2763 fresult_int = first._value - foffset
2764 else:
2765 fresult_int = first._value - freq_value
2766
2767 if loffset > 0:
2768 # roll forward
2769 lresult_int = last._value + (freq_value - loffset)
2770 else:
2771 # already the end of the road
2772 lresult_int = last._value
2773 else: # closed == 'left'
2774 if foffset > 0:
2775 fresult_int = first._value - foffset
2776 else:
2777 # start of the road
2778 fresult_int = first._value
2779
2780 if loffset > 0:
2781 # roll forward
2782 lresult_int = last._value + (freq_value - loffset)
2783 else:
2784 lresult_int = last._value + freq_value
2785 fresult = Timestamp(fresult_int, unit=unit)
2786 lresult = Timestamp(lresult_int, unit=unit)
2787 if first_tzinfo is not None:
2788 fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo)
2789 if last_tzinfo is not None:
2790 lresult = lresult.tz_localize("UTC").tz_convert(last_tzinfo)
2791 return fresult, lresult
2792
2793
2794def asfreq(
2795 obj: NDFrameT,
2796 freq,
2797 method=None,
2798 how=None,
2799 normalize: bool = False,
2800 fill_value=None,
2801) -> NDFrameT:
2802 """
2803 Utility frequency conversion method for Series/DataFrame.
2804
2805 See :meth:`pandas.NDFrame.asfreq` for full documentation.
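
    Examples
    --------
    A small illustrative example (values are arbitrary):

    >>> import pandas as pd
    >>> s = pd.Series([1, 2], index=pd.to_datetime(["2000-01-01", "2000-01-03"]))
    >>> asfreq(s, "D")
    2000-01-01    1.0
    2000-01-02    NaN
    2000-01-03    2.0
    Freq: D, dtype: float64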
2806 """
2807 if isinstance(obj.index, PeriodIndex):
2808 if method is not None:
2809 raise NotImplementedError("'method' argument is not supported")
2810
2811 if how is None:
2812 how = "E"
2813
2814 if isinstance(freq, BaseOffset):
2815 if hasattr(freq, "_period_dtype_code"):
2816 freq = freq_to_period_freqstr(freq.n, freq.name)
2817 else:
2818 raise ValueError(
2819 f"Invalid offset: '{freq.base}' for converting time series "
2820 f"with PeriodIndex."
2821 )
2822
2823 new_obj = obj.copy()
2824 new_obj.index = obj.index.asfreq(freq, how=how)
2825
    elif len(obj.index) == 0:
        new_obj = obj.copy()
        new_obj.index = _asfreq_compat(obj.index, freq)
2830 else:
2831 unit = None
2832 if isinstance(obj.index, DatetimeIndex):
2833 # TODO: should we disallow non-DatetimeIndex?
2834 unit = obj.index.unit
2835 dti = date_range(obj.index.min(), obj.index.max(), freq=freq, unit=unit)
2836 dti.name = obj.index.name
2837 new_obj = obj.reindex(dti, method=method, fill_value=fill_value)
2838 if normalize:
2839 new_obj.index = new_obj.index.normalize()
2840
2841 return new_obj
2842
2843
2844def _asfreq_compat(index: DatetimeIndex | PeriodIndex | TimedeltaIndex, freq):
2845 """
2846 Helper to mimic asfreq on (empty) DatetimeIndex and TimedeltaIndex.
2847
2848 Parameters
2849 ----------
2850 index : PeriodIndex, DatetimeIndex, or TimedeltaIndex
2851 freq : DateOffset
2852
2853 Returns
2854 -------
2855 same type as index
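
    Examples
    --------
    Illustrative only:

    >>> import pandas as pd
    >>> _asfreq_compat(pd.DatetimeIndex([]), freq="D")
    DatetimeIndex([], dtype='datetime64[ns]', freq='D')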
2856 """
2857 if len(index) != 0:
2858 # This should never be reached, always checked by the caller
2859 raise ValueError(
2860 "Can only set arbitrary freq for empty DatetimeIndex or TimedeltaIndex"
2861 )
2862 new_index: Index
2863 if isinstance(index, PeriodIndex):
2864 new_index = index.asfreq(freq=freq)
2865 elif isinstance(index, DatetimeIndex):
2866 new_index = DatetimeIndex([], dtype=index.dtype, freq=freq, name=index.name)
2867 elif isinstance(index, TimedeltaIndex):
2868 new_index = TimedeltaIndex([], dtype=index.dtype, freq=freq, name=index.name)
2869 else: # pragma: no cover
2870 raise TypeError(type(index))
2871 return new_index
2872
2873
2874def maybe_warn_args_and_kwargs(cls, kernel: str, args, kwargs) -> None:
2875 """
2876 Warn for deprecation of args and kwargs in resample functions.
2877
2878 Parameters
2879 ----------
2880 cls : type
2881 Class to warn about.
2882 kernel : str
2883 Operation name.
2884 args : tuple or None
2885 args passed by user. Will be None if and only if kernel does not have args.
2886 kwargs : dict or None
2887 kwargs passed by user. Will be None if and only if kernel does not have kwargs.
2888 """
2889 warn_args = args is not None and len(args) > 0
2890 warn_kwargs = kwargs is not None and len(kwargs) > 0
2891 if warn_args and warn_kwargs:
2892 msg = "args and kwargs"
2893 elif warn_args:
2894 msg = "args"
2895 elif warn_kwargs:
2896 msg = "kwargs"
2897 else:
2898 return
2899 warnings.warn(
2900 f"Passing additional {msg} to {cls.__name__}.{kernel} has "
2901 "no impact on the result and is deprecated. This will "
2902 "raise a TypeError in a future version of pandas.",
2903 category=FutureWarning,
2904 stacklevel=find_stack_level(),
2905 )
2906
2907
2908def _apply(
2909 grouped: GroupBy, how: Callable, *args, include_groups: bool, **kwargs
2910) -> DataFrame:
2911 # GH#7155 - rewrite warning to appear as if it came from `.resample`
2912 target_message = "DataFrameGroupBy.apply operated on the grouping columns"
2913 new_message = _apply_groupings_depr.format("DataFrameGroupBy", "resample")
2914 with rewrite_warning(
2915 target_message=target_message,
2916 target_category=DeprecationWarning,
2917 new_message=new_message,
2918 ):
2919 result = grouped.apply(how, *args, include_groups=include_groups, **kwargs)
2920 return result