1from __future__ import annotations
2
3from collections import abc
4from datetime import date
5from functools import partial
6from itertools import islice
7from typing import (
8 TYPE_CHECKING,
9 Callable,
10 TypedDict,
11 Union,
12 cast,
13 overload,
14)
15import warnings
16
17import numpy as np
18
19from pandas._libs import (
20 lib,
21 tslib,
22)
23from pandas._libs.tslibs import (
24 OutOfBoundsDatetime,
25 Timedelta,
26 Timestamp,
27 astype_overflowsafe,
28 is_supported_dtype,
29 timezones as libtimezones,
30)
31from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
32from pandas._libs.tslibs.parsing import (
33 DateParseError,
34 guess_datetime_format,
35)
36from pandas._libs.tslibs.strptime import array_strptime
37from pandas._typing import (
38 AnyArrayLike,
39 ArrayLike,
40 DateTimeErrorChoices,
41)
42from pandas.util._exceptions import find_stack_level
43
44from pandas.core.dtypes.common import (
45 ensure_object,
46 is_float,
47 is_integer,
48 is_integer_dtype,
49 is_list_like,
50 is_numeric_dtype,
51)
52from pandas.core.dtypes.dtypes import (
53 ArrowDtype,
54 DatetimeTZDtype,
55)
56from pandas.core.dtypes.generic import (
57 ABCDataFrame,
58 ABCSeries,
59)
60
61from pandas.arrays import (
62 DatetimeArray,
63 IntegerArray,
64 NumpyExtensionArray,
65)
66from pandas.core.algorithms import unique
67from pandas.core.arrays import ArrowExtensionArray
68from pandas.core.arrays.base import ExtensionArray
69from pandas.core.arrays.datetimes import (
70 maybe_convert_dtype,
71 objects_to_datetime64,
72 tz_to_dtype,
73)
74from pandas.core.construction import extract_array
75from pandas.core.indexes.base import Index
76from pandas.core.indexes.datetimes import DatetimeIndex
77
78if TYPE_CHECKING:
79 from collections.abc import Hashable
80
81 from pandas._libs.tslibs.nattype import NaTType
82 from pandas._libs.tslibs.timedeltas import UnitChoices
83
84 from pandas import (
85 DataFrame,
86 Series,
87 )
88
89# ---------------------------------------------------------------------
90# types used in annotations
91
# Anything to_datetime accepts as a 1-D collection of values.
ArrayConvertible = Union[list, tuple, AnyArrayLike]
# Scalar component types accepted inside dict/DataFrame columns.
Scalar = Union[float, str]
# A single value convertible to a datetime.
DatetimeScalar = Union[Scalar, date, np.datetime64]

# Union of the scalar and array-like cases handled by to_datetime.
DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible]

# One column of a dict/DataFrame-style datetime assembly (e.g. the "year" values).
DatetimeDictArg = Union[list[Scalar], tuple[Scalar, ...], AnyArrayLike]
99
100
class YearMonthDayDict(TypedDict, total=True):
    """Required keys for assembling datetimes from a dict/DataFrame-like."""

    # total=True: all three keys must be present.
    year: DatetimeDictArg
    month: DatetimeDictArg
    day: DatetimeDictArg
106
class FulldatetimeDict(YearMonthDayDict, total=False):
    """
    Optional time-component keys on top of the required year/month/day.

    Both singular and plural spellings are declared for hour/minute/second
    (e.g. "hour" and "hours"); sub-second parts use the unit abbreviations.
    """

    hour: DatetimeDictArg
    hours: DatetimeDictArg
    minute: DatetimeDictArg
    minutes: DatetimeDictArg
    second: DatetimeDictArg
    seconds: DatetimeDictArg
    ms: DatetimeDictArg
    us: DatetimeDictArg
    ns: DatetimeDictArg
118
# Dict-like inputs that to_datetime can assemble column-wise into datetimes.
DictConvertible = Union[FulldatetimeDict, "DataFrame"]
# Sequences of this length or shorter are never cached (see should_cache).
start_caching_at = 50
121
122
123# ---------------------------------------------------------------------
124
125
126def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None:
127 # Try to guess the format based on the first non-NaN element, return None if can't
128 if (first_non_null := tslib.first_non_null(arr)) != -1:
129 if type(first_non_nan_element := arr[first_non_null]) is str: # noqa: E721
130 # GH#32264 np.str_ object
131 guessed_format = guess_datetime_format(
132 first_non_nan_element, dayfirst=dayfirst
133 )
134 if guessed_format is not None:
135 return guessed_format
136 # If there are multiple non-null elements, warn about
137 # how parsing might not be consistent
138 if tslib.first_non_null(arr[first_non_null + 1 :]) != -1:
139 warnings.warn(
140 "Could not infer format, so each element will be parsed "
141 "individually, falling back to `dateutil`. To ensure parsing is "
142 "consistent and as-expected, please specify a format.",
143 UserWarning,
144 stacklevel=find_stack_level(),
145 )
146 return None
147
148
def should_cache(
    arg: ArrayConvertible, unique_share: float = 0.7, check_count: int | None = None
) -> bool:
    """
    Decide whether conversion results for `arg` should be cached.

    Caching pays off when the sampled prefix of `arg` contains enough
    duplicates: if the share of unique elements among the first
    `check_count` elements is at most `unique_share`, caching is enabled.

    Parameters
    ----------
    arg : listlike, tuple, 1-d array, Series
    unique_share : float, default 0.7
        Must satisfy 0 < unique_share < 1.
    check_count : int, optional
        Number of leading elements to sample; must satisfy
        0 <= check_count <= len(arg).

    Returns
    -------
    bool
        True when caching is worthwhile.

    Notes
    -----
    By default, sequences of at most 50 items are never cached; for up to
    5000 items, ten percent of the elements are sampled; beyond that, only
    the first 500 elements are checked.  All constants were chosen
    empirically.
    """
    if check_count is None:
        n = len(arg)
        # For tiny inputs the gain from caching is negligible.
        if n <= start_caching_at:
            return False
        check_count = n // 10 if n <= 5000 else 500
    else:
        assert (
            0 <= check_count <= len(arg)
        ), "check_count must be in next bounds: [0; len(arg)]"
        if check_count == 0:
            return False

    assert 0 < unique_share < 1, "unique_share must be in next bounds: (0; 1)"

    try:
        # We can't cache if the items are not hashable.
        sampled_unique = set(islice(arg, check_count))
    except TypeError:
        return False
    return len(sampled_unique) <= check_count * unique_share
207
208
209def _maybe_cache(
210 arg: ArrayConvertible,
211 format: str | None,
212 cache: bool,
213 convert_listlike: Callable,
214) -> Series:
215 """
216 Create a cache of unique dates from an array of dates
217
218 Parameters
219 ----------
220 arg : listlike, tuple, 1-d array, Series
221 format : string
222 Strftime format to parse time
223 cache : bool
224 True attempts to create a cache of converted values
225 convert_listlike : function
226 Conversion function to apply on dates
227
228 Returns
229 -------
230 cache_array : Series
231 Cache of converted, unique dates. Can be empty
232 """
233 from pandas import Series
234
235 cache_array = Series(dtype=object)
236
237 if cache:
238 # Perform a quicker unique check
239 if not should_cache(arg):
240 return cache_array
241
242 if not isinstance(arg, (np.ndarray, ExtensionArray, Index, ABCSeries)):
243 arg = np.array(arg)
244
245 unique_dates = unique(arg)
246 if len(unique_dates) < len(arg):
247 cache_dates = convert_listlike(unique_dates, format)
248 # GH#45319
249 try:
250 cache_array = Series(cache_dates, index=unique_dates, copy=False)
251 except OutOfBoundsDatetime:
252 return cache_array
253 # GH#39882 and GH#35888 in case of None and NaT we get duplicates
254 if not cache_array.index.is_unique:
255 cache_array = cache_array[~cache_array.index.duplicated()]
256 return cache_array
257
258
259def _box_as_indexlike(
260 dt_array: ArrayLike, utc: bool = False, name: Hashable | None = None
261) -> Index:
262 """
263 Properly boxes the ndarray of datetimes to DatetimeIndex
264 if it is possible or to generic Index instead
265
266 Parameters
267 ----------
268 dt_array: 1-d array
269 Array of datetimes to be wrapped in an Index.
270 utc : bool
271 Whether to convert/localize timestamps to UTC.
272 name : string, default None
273 Name for a resulting index
274
275 Returns
276 -------
277 result : datetime of converted dates
278 - DatetimeIndex if convertible to sole datetime64 type
279 - general Index otherwise
280 """
281
282 if lib.is_np_dtype(dt_array.dtype, "M"):
283 tz = "utc" if utc else None
284 return DatetimeIndex(dt_array, tz=tz, name=name)
285 return Index(dt_array, name=name, dtype=dt_array.dtype)
286
287
288def _convert_and_box_cache(
289 arg: DatetimeScalarOrArrayConvertible,
290 cache_array: Series,
291 name: Hashable | None = None,
292) -> Index:
293 """
294 Convert array of dates with a cache and wrap the result in an Index.
295
296 Parameters
297 ----------
298 arg : integer, float, string, datetime, list, tuple, 1-d array, Series
299 cache_array : Series
300 Cache of converted, unique dates
301 name : string, default None
302 Name for a DatetimeIndex
303
304 Returns
305 -------
306 result : Index-like of converted dates
307 """
308 from pandas import Series
309
310 result = Series(arg, dtype=cache_array.index.dtype).map(cache_array)
311 return _box_as_indexlike(result._values, utc=False, name=name)
312
313
def _convert_listlike_datetimes(
    arg,
    format: str | None,
    name: Hashable | None = None,
    utc: bool = False,
    unit: str | None = None,
    errors: DateTimeErrorChoices = "raise",
    dayfirst: bool | None = None,
    yearfirst: bool | None = None,
    exact: bool = True,
):
    """
    Helper function for to_datetime. Performs the conversions of 1D listlike
    of dates

    Parameters
    ----------
    arg : list, tuple, ndarray, Series, Index
        date to be parsed
    format : str or None
        strftime format to parse with; None triggers format inference,
        "mixed" requests per-element parsing
    name : object
        None or string for the Index name
    utc : bool
        Whether to convert/localize timestamps to UTC.
    unit : str
        None or string of the frequency of the passed data
    errors : str
        error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore'
    dayfirst : bool
        dayfirst parsing behavior from to_datetime
    yearfirst : bool
        yearfirst parsing behavior from to_datetime
    exact : bool, default True
        exact format matching behavior from to_datetime

    Returns
    -------
    Index-like of parsed dates
    """
    # Normalize python sequences / NumpyExtensionArray to plain ndarrays.
    if isinstance(arg, (list, tuple)):
        arg = np.array(arg, dtype="O")
    elif isinstance(arg, NumpyExtensionArray):
        arg = np.array(arg)

    arg_dtype = getattr(arg, "dtype", None)
    # these are shortcutable
    tz = "utc" if utc else None
    if isinstance(arg_dtype, DatetimeTZDtype):
        # Already tz-aware datetime64: at most box and/or convert to UTC.
        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            return DatetimeIndex(arg, tz=tz, name=name)
        if utc:
            arg = arg.tz_convert(None).tz_localize("utc")
        return arg

    elif isinstance(arg_dtype, ArrowDtype) and arg_dtype.type is Timestamp:
        # TODO: Combine with above if DTI/DTA supports Arrow timestamps
        if utc:
            # pyarrow uses UTC, not lowercase utc
            if isinstance(arg, Index):
                arg_array = cast(ArrowExtensionArray, arg.array)
                if arg_dtype.pyarrow_dtype.tz is not None:
                    arg_array = arg_array._dt_tz_convert("UTC")
                else:
                    arg_array = arg_array._dt_tz_localize("UTC")
                arg = Index(arg_array)
            else:
                # ArrowExtensionArray
                if arg_dtype.pyarrow_dtype.tz is not None:
                    arg = arg._dt_tz_convert("UTC")
                else:
                    arg = arg._dt_tz_localize("UTC")
        return arg

    elif lib.is_np_dtype(arg_dtype, "M"):
        # Naive datetime64 input.
        if not is_supported_dtype(arg_dtype):
            # We go to closest supported reso, i.e. "s"
            arg = astype_overflowsafe(
                # TODO: looks like we incorrectly raise with errors=="ignore"
                np.asarray(arg),
                np.dtype("M8[s]"),
                is_coerce=errors == "coerce",
            )

        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            return DatetimeIndex(arg, tz=tz, name=name)
        elif utc:
            # DatetimeArray, DatetimeIndex
            return arg.tz_localize("utc")

        return arg

    elif unit is not None:
        # Numeric input interpreted via 'unit' (e.g. seconds since epoch).
        if format is not None:
            raise ValueError("cannot specify both format and unit")
        return _to_datetime_with_unit(arg, unit, name, utc, errors)
    elif getattr(arg, "ndim", 1) > 1:
        raise TypeError(
            "arg must be a string, datetime, list, tuple, 1-d array, or Series"
        )

    # warn if passing timedelta64, raise for PeriodDtype
    # NB: this must come after unit transformation
    try:
        arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz))
    except TypeError:
        # Dtype cannot be converted to datetime; honor the errors policy.
        if errors == "coerce":
            npvalues = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg))
            return DatetimeIndex(npvalues, name=name)
        elif errors == "ignore":
            idx = Index(arg, name=name)
            return idx
        raise

    arg = ensure_object(arg)

    # Infer a format from the first non-null string when none was given.
    if format is None:
        format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

    # `format` could be inferred, or user didn't ask for mixed-format parsing.
    if format is not None and format != "mixed":
        return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)

    # No usable single format: parse object array element-wise.
    result, tz_parsed = objects_to_datetime64(
        arg,
        dayfirst=dayfirst,
        yearfirst=yearfirst,
        utc=utc,
        errors=errors,
        allow_object=True,
    )

    if tz_parsed is not None:
        # We can take a shortcut since the datetime64 numpy array
        # is in UTC
        out_unit = np.datetime_data(result.dtype)[0]
        dtype = cast(DatetimeTZDtype, tz_to_dtype(tz_parsed, out_unit))
        dt64_values = result.view(f"M8[{dtype.unit}]")
        dta = DatetimeArray._simple_new(dt64_values, dtype=dtype)
        return DatetimeIndex._simple_new(dta, name=name)

    return _box_as_indexlike(result, utc=utc, name=name)
454
455
456def _array_strptime_with_fallback(
457 arg,
458 name,
459 utc: bool,
460 fmt: str,
461 exact: bool,
462 errors: str,
463) -> Index:
464 """
465 Call array_strptime, with fallback behavior depending on 'errors'.
466 """
467 result, tz_out = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc)
468 if tz_out is not None:
469 unit = np.datetime_data(result.dtype)[0]
470 dtype = DatetimeTZDtype(tz=tz_out, unit=unit)
471 dta = DatetimeArray._simple_new(result, dtype=dtype)
472 if utc:
473 dta = dta.tz_convert("UTC")
474 return Index(dta, name=name)
475 elif result.dtype != object and utc:
476 unit = np.datetime_data(result.dtype)[0]
477 res = Index(result, dtype=f"M8[{unit}, UTC]", name=name)
478 return res
479 return Index(result, dtype=result.dtype, name=name)
480
481
def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
    """
    to_datetime specialized to the case where a 'unit' is passed.

    Parameters
    ----------
    arg : listlike
        Numeric (or object) values measured in `unit` since the epoch.
    unit : str
        Unit of the values, e.g. "D", "s", "ms", "us", "ns".
    name : Hashable
        Name for the resulting Index.
    utc : bool
        Whether to localize/convert the result to UTC.
    errors : str
        'raise', 'coerce' or 'ignore'.

    Returns
    -------
    Index
    """
    arg = extract_array(arg, extract_numpy=True)

    # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime
    # because it expects an ndarray argument
    if isinstance(arg, IntegerArray):
        arr = arg.astype(f"datetime64[{unit}]")
        tz_parsed = None
    else:
        arg = np.asarray(arg)

        if arg.dtype.kind in "iu":
            # Note we can't do "f" here because that could induce unwanted
            # rounding GH#14156, GH#20445
            arr = arg.astype(f"datetime64[{unit}]", copy=False)
            try:
                arr = astype_overflowsafe(arr, np.dtype("M8[ns]"), copy=False)
            except OutOfBoundsDatetime:
                if errors == "raise":
                    raise
                # Fall back to object-dtype parsing, which can coerce/ignore
                # the out-of-bounds values per the errors policy.
                arg = arg.astype(object)
                return _to_datetime_with_unit(arg, unit, name, utc, errors)
            tz_parsed = None

        elif arg.dtype.kind == "f":
            # Floats get fractional-unit handling; overflow must raise so
            # it can be translated into OutOfBoundsDatetime semantics.
            with np.errstate(over="raise"):
                try:
                    arr = cast_from_unit_vectorized(arg, unit=unit)
                except OutOfBoundsDatetime:
                    if errors != "raise":
                        # Retry through the object path to coerce/ignore.
                        return _to_datetime_with_unit(
                            arg.astype(object), unit, name, utc, errors
                        )
                    raise OutOfBoundsDatetime(
                        f"cannot convert input with unit '{unit}'"
                    )

            arr = arr.view("M8[ns]")
            tz_parsed = None
        else:
            # Object (or other) dtype: element-wise conversion in cython.
            arg = arg.astype(object, copy=False)
            arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)

    if errors == "ignore":
        # Index constructor _may_ infer to DatetimeIndex
        result = Index._with_infer(arr, name=name)
    else:
        result = DatetimeIndex(arr, name=name)

    if not isinstance(result, DatetimeIndex):
        # 'ignore' path produced a non-datetime Index; return it unchanged.
        return result

    # GH#23758: We may still need to localize the result with tz
    # GH#25546: Apply tz_parsed first (from arg), then tz (from caller)
    # result will be naive but in UTC
    result = result.tz_localize("UTC").tz_convert(tz_parsed)

    if utc:
        if result.tz is None:
            result = result.tz_localize("utc")
        else:
            result = result.tz_convert("utc")
    return result
548
549
550def _adjust_to_origin(arg, origin, unit):
551 """
552 Helper function for to_datetime.
553 Adjust input argument to the specified origin
554
555 Parameters
556 ----------
557 arg : list, tuple, ndarray, Series, Index
558 date to be adjusted
559 origin : 'julian' or Timestamp
560 origin offset for the arg
561 unit : str
562 passed unit from to_datetime, must be 'D'
563
564 Returns
565 -------
566 ndarray or scalar of adjusted date(s)
567 """
568 if origin == "julian":
569 original = arg
570 j0 = Timestamp(0).to_julian_date()
571 if unit != "D":
572 raise ValueError("unit must be 'D' for origin='julian'")
573 try:
574 arg = arg - j0
575 except TypeError as err:
576 raise ValueError(
577 "incompatible 'arg' type for given 'origin'='julian'"
578 ) from err
579
580 # preemptively check this for a nice range
581 j_max = Timestamp.max.to_julian_date() - j0
582 j_min = Timestamp.min.to_julian_date() - j0
583 if np.any(arg > j_max) or np.any(arg < j_min):
584 raise OutOfBoundsDatetime(
585 f"{original} is Out of Bounds for origin='julian'"
586 )
587 else:
588 # arg must be numeric
589 if not (
590 (is_integer(arg) or is_float(arg)) or is_numeric_dtype(np.asarray(arg))
591 ):
592 raise ValueError(
593 f"'{arg}' is not compatible with origin='{origin}'; "
594 "it must be numeric with a unit specified"
595 )
596
597 # we are going to offset back to unix / epoch time
598 try:
599 offset = Timestamp(origin, unit=unit)
600 except OutOfBoundsDatetime as err:
601 raise OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") from err
602 except ValueError as err:
603 raise ValueError(
604 f"origin {origin} cannot be converted to a Timestamp"
605 ) from err
606
607 if offset.tz is not None:
608 raise ValueError(f"origin offset {offset} must be tz-naive")
609 td_offset = offset - Timestamp(0)
610
611 # convert the offset to the unit of the arg
612 # this should be lossless in terms of precision
613 ioffset = td_offset // Timedelta(1, unit=unit)
614
615 # scalars & ndarray-like can handle the addition
616 if is_list_like(arg) and not isinstance(arg, (ABCSeries, Index, np.ndarray)):
617 arg = np.asarray(arg)
618 arg = arg + ioffset
619 return arg
620
621
# Overload: a single scalar (str/float/date/datetime64) parses to a Timestamp.
@overload
def to_datetime(
    arg: DatetimeScalar,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> Timestamp:
    ...


# Overload: Series or dict/DataFrame-like input returns a Series.
@overload
def to_datetime(
    arg: Series | DictConvertible,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> Series:
    ...


# Overload: list/tuple/Index/array-like input returns a DatetimeIndex.
@overload
def to_datetime(
    arg: list | tuple | Index | ArrayLike,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> DatetimeIndex:
    ...
671
672
673def to_datetime(
674 arg: DatetimeScalarOrArrayConvertible | DictConvertible,
675 errors: DateTimeErrorChoices = "raise",
676 dayfirst: bool = False,
677 yearfirst: bool = False,
678 utc: bool = False,
679 format: str | None = None,
680 exact: bool | lib.NoDefault = lib.no_default,
681 unit: str | None = None,
682 infer_datetime_format: lib.NoDefault | bool = lib.no_default,
683 origin: str = "unix",
684 cache: bool = True,
685) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None:
686 """
687 Convert argument to datetime.
688
689 This function converts a scalar, array-like, :class:`Series` or
690 :class:`DataFrame`/dict-like to a pandas datetime object.
691
692 Parameters
693 ----------
694 arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like
695 The object to convert to a datetime. If a :class:`DataFrame` is provided, the
696 method expects minimally the following columns: :const:`"year"`,
697 :const:`"month"`, :const:`"day"`. The column "year"
698 must be specified in 4-digit format.
699 errors : {'ignore', 'raise', 'coerce'}, default 'raise'
700 - If :const:`'raise'`, then invalid parsing will raise an exception.
701 - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`.
702 - If :const:`'ignore'`, then invalid parsing will return the input.
703 dayfirst : bool, default False
704 Specify a date parse order if `arg` is str or is list-like.
705 If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"`
706 is parsed as :const:`2012-11-10`.
707
708 .. warning::
709
710 ``dayfirst=True`` is not strict, but will prefer to parse
711 with day first.
712
713 yearfirst : bool, default False
714 Specify a date parse order if `arg` is str or is list-like.
715
716 - If :const:`True` parses dates with the year first, e.g.
717 :const:`"10/11/12"` is parsed as :const:`2010-11-12`.
    - If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst`
      takes precedence (same as :mod:`dateutil`).
720
721 .. warning::
722
723 ``yearfirst=True`` is not strict, but will prefer to parse
724 with year first.
725
726 utc : bool, default False
727 Control timezone-related parsing, localization and conversion.
728
729 - If :const:`True`, the function *always* returns a timezone-aware
730 UTC-localized :class:`Timestamp`, :class:`Series` or
731 :class:`DatetimeIndex`. To do this, timezone-naive inputs are
732 *localized* as UTC, while timezone-aware inputs are *converted* to UTC.
733
734 - If :const:`False` (default), inputs will not be coerced to UTC.
735 Timezone-naive inputs will remain naive, while timezone-aware ones
736 will keep their time offsets. Limitations exist for mixed
737 offsets (typically, daylight savings), see :ref:`Examples
738 <to_datetime_tz_examples>` section for details.
739
740 .. warning::
741
742 In a future version of pandas, parsing datetimes with mixed time
743 zones will raise an error unless `utc=True`.
744 Please specify `utc=True` to opt in to the new behaviour
745 and silence this warning. To create a `Series` with mixed offsets and
746 `object` dtype, please use `apply` and `datetime.datetime.strptime`.
747
748 See also: pandas general documentation about `timezone conversion and
749 localization
750 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
751 #time-zone-handling>`_.
752
753 format : str, default None
754 The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See
755 `strftime documentation
756 <https://docs.python.org/3/library/datetime.html
757 #strftime-and-strptime-behavior>`_ for more information on choices, though
758 note that :const:`"%f"` will parse all the way up to nanoseconds.
759 You can also pass:
760
761 - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
762 time string (not necessarily in exactly the same format);
763 - "mixed", to infer the format for each element individually. This is risky,
764 and you should probably use it along with `dayfirst`.
765
766 .. note::
767
768 If a :class:`DataFrame` is passed, then `format` has no effect.
769
770 exact : bool, default True
771 Control how `format` is used:
772
773 - If :const:`True`, require an exact `format` match.
774 - If :const:`False`, allow the `format` to match anywhere in the target
775 string.
776
777 Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``.
778 unit : str, default 'ns'
779 The unit of the arg (D,s,ms,us,ns) denote the unit, which is an
780 integer or float number. This will be based off the origin.
781 Example, with ``unit='ms'`` and ``origin='unix'``, this would calculate
782 the number of milliseconds to the unix epoch start.
783 infer_datetime_format : bool, default False
784 If :const:`True` and no `format` is given, attempt to infer the format
785 of the datetime strings based on the first non-NaN element,
786 and if it can be inferred, switch to a faster method of parsing them.
787 In some cases this can increase the parsing speed by ~5-10x.
788
789 .. deprecated:: 2.0.0
790 A strict version of this argument is now the default, passing it has
791 no effect.
792
793 origin : scalar, default 'unix'
794 Define the reference date. The numeric values would be parsed as number
795 of units (defined by `unit`) since this reference date.
796
797 - If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01.
798 - If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to
799 beginning of Julian Calendar. Julian day number :const:`0` is assigned
800 to the day starting at noon on January 1, 4713 BC.
    - If Timestamp convertible (Timestamp, dt.datetime, np.datetime64 or date
802 string), origin is set to Timestamp identified by origin.
803 - If a float or integer, origin is the difference
804 (in units determined by the ``unit`` argument) relative to 1970-01-01.
805 cache : bool, default True
806 If :const:`True`, use a cache of unique, converted dates to apply the
807 datetime conversion. May produce significant speed-up when parsing
808 duplicate date strings, especially ones with timezone offsets. The cache
809 is only used when there are at least 50 values. The presence of
810 out-of-bounds values will render the cache unusable and may slow down
811 parsing.
812
813 Returns
814 -------
815 datetime
816 If parsing succeeded.
817 Return type depends on input (types in parenthesis correspond to
818 fallback in case of unsuccessful timezone or out-of-range timestamp
819 parsing):
820
821 - scalar: :class:`Timestamp` (or :class:`datetime.datetime`)
822 - array-like: :class:`DatetimeIndex` (or :class:`Series` with
823 :class:`object` dtype containing :class:`datetime.datetime`)
824 - Series: :class:`Series` of :class:`datetime64` dtype (or
825 :class:`Series` of :class:`object` dtype containing
826 :class:`datetime.datetime`)
827 - DataFrame: :class:`Series` of :class:`datetime64` dtype (or
828 :class:`Series` of :class:`object` dtype containing
829 :class:`datetime.datetime`)
830
831 Raises
832 ------
833 ParserError
834 When parsing a date from string fails.
835 ValueError
836 When another datetime conversion error happens. For example when one
837 of 'year', 'month', day' columns is missing in a :class:`DataFrame`, or
838 when a Timezone-aware :class:`datetime.datetime` is found in an array-like
839 of mixed time offsets, and ``utc=False``.
840
841 See Also
842 --------
843 DataFrame.astype : Cast argument to a specified dtype.
844 to_timedelta : Convert argument to timedelta.
845 convert_dtypes : Convert dtypes.
846
847 Notes
848 -----
849
850 Many input types are supported, and lead to different output types:
851
852 - **scalars** can be int, float, str, datetime object (from stdlib :mod:`datetime`
853 module or :mod:`numpy`). They are converted to :class:`Timestamp` when
854 possible, otherwise they are converted to :class:`datetime.datetime`.
855 None/NaN/null scalars are converted to :const:`NaT`.
856
857 - **array-like** can contain int, float, str, datetime objects. They are
858 converted to :class:`DatetimeIndex` when possible, otherwise they are
859 converted to :class:`Index` with :class:`object` dtype, containing
860 :class:`datetime.datetime`. None/NaN/null entries are converted to
861 :const:`NaT` in both cases.
862
863 - **Series** are converted to :class:`Series` with :class:`datetime64`
864 dtype when possible, otherwise they are converted to :class:`Series` with
865 :class:`object` dtype, containing :class:`datetime.datetime`. None/NaN/null
866 entries are converted to :const:`NaT` in both cases.
867
868 - **DataFrame/dict-like** are converted to :class:`Series` with
869 :class:`datetime64` dtype. For each row a datetime is created from assembling
870 the various dataframe columns. Column keys can be common abbreviations
871 like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns']) or
872 plurals of the same.
873
874 The following causes are responsible for :class:`datetime.datetime` objects
875 being returned (possibly inside an :class:`Index` or a :class:`Series` with
876 :class:`object` dtype) instead of a proper pandas designated type
877 (:class:`Timestamp`, :class:`DatetimeIndex` or :class:`Series`
878 with :class:`datetime64` dtype):
879
880 - when any input element is before :const:`Timestamp.min` or after
881 :const:`Timestamp.max`, see `timestamp limitations
882 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
883 #timeseries-timestamp-limits>`_.
884
885 - when ``utc=False`` (default) and the input is an array-like or
886 :class:`Series` containing mixed naive/aware datetime, or aware with mixed
887 time offsets. Note that this happens in the (quite frequent) situation when
888 the timezone has a daylight savings policy. In that case you may wish to
889 use ``utc=True``.
890
891 Examples
892 --------
893
894 **Handling various input formats**
895
896 Assembling a datetime from multiple columns of a :class:`DataFrame`. The keys
897 can be common abbreviations like ['year', 'month', 'day', 'minute', 'second',
898 'ms', 'us', 'ns']) or plurals of the same
899
900 >>> df = pd.DataFrame({'year': [2015, 2016],
901 ... 'month': [2, 3],
902 ... 'day': [4, 5]})
903 >>> pd.to_datetime(df)
904 0 2015-02-04
905 1 2016-03-05
906 dtype: datetime64[ns]
907
908 Using a unix epoch time
909
910 >>> pd.to_datetime(1490195805, unit='s')
911 Timestamp('2017-03-22 15:16:45')
912 >>> pd.to_datetime(1490195805433502912, unit='ns')
913 Timestamp('2017-03-22 15:16:45.433502912')
914
915 .. warning:: For float arg, precision rounding might happen. To prevent
916 unexpected behavior use a fixed-width exact type.
917
918 Using a non-unix epoch origin
919
920 >>> pd.to_datetime([1, 2, 3], unit='D',
921 ... origin=pd.Timestamp('1960-01-01'))
922 DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'],
923 dtype='datetime64[ns]', freq=None)
924
925 **Differences with strptime behavior**
926
927 :const:`"%f"` will parse all the way up to nanoseconds.
928
929 >>> pd.to_datetime('2018-10-26 12:00:00.0000000011',
930 ... format='%Y-%m-%d %H:%M:%S.%f')
931 Timestamp('2018-10-26 12:00:00.000000001')
932
933 **Non-convertible date/times**
934
935 Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`,
936 in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`.
937
938 >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
939 NaT
940
941 .. _to_datetime_tz_examples:
942
943 **Timezones and time offsets**
944
945 The default behaviour (``utc=False``) is as follows:
946
947 - Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`:
948
949 >>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15'])
950 DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'],
951 dtype='datetime64[ns]', freq=None)
952
953 - Timezone-aware inputs *with constant time offset* are converted to
954 timezone-aware :class:`DatetimeIndex`:
955
956 >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500'])
957 DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'],
958 dtype='datetime64[ns, UTC-05:00]', freq=None)
959
960 - However, timezone-aware inputs *with mixed time offsets* (for example
961 issued from a timezone with daylight savings, such as Europe/Paris)
962 are **not successfully converted** to a :class:`DatetimeIndex`.
963 Parsing datetimes with mixed time zones will show a warning unless
964 `utc=True`. If you specify `utc=False` the warning below will be shown
965 and a simple :class:`Index` containing :class:`datetime.datetime`
966 objects will be returned:
967
968 >>> pd.to_datetime(['2020-10-25 02:00 +0200',
969 ... '2020-10-25 04:00 +0100']) # doctest: +SKIP
970 FutureWarning: In a future version of pandas, parsing datetimes with mixed
971 time zones will raise an error unless `utc=True`. Please specify `utc=True`
972 to opt in to the new behaviour and silence this warning. To create a `Series`
973 with mixed offsets and `object` dtype, please use `apply` and
974 `datetime.datetime.strptime`.
975 Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00],
976 dtype='object')
977
978 - A mix of timezone-aware and timezone-naive inputs is also converted to
979 a simple :class:`Index` containing :class:`datetime.datetime` objects:
980
981 >>> from datetime import datetime
982 >>> pd.to_datetime(["2020-01-01 01:00:00-01:00",
983 ... datetime(2020, 1, 1, 3, 0)]) # doctest: +SKIP
984 FutureWarning: In a future version of pandas, parsing datetimes with mixed
985 time zones will raise an error unless `utc=True`. Please specify `utc=True`
986 to opt in to the new behaviour and silence this warning. To create a `Series`
987 with mixed offsets and `object` dtype, please use `apply` and
988 `datetime.datetime.strptime`.
989 Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object')
990
991 |
992
993 Setting ``utc=True`` solves most of the above issues:
994
995 - Timezone-naive inputs are *localized* as UTC
996
997 >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True)
998 DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'],
999 dtype='datetime64[ns, UTC]', freq=None)
1000
1001 - Timezone-aware inputs are *converted* to UTC (the output represents the
1002 exact same datetime, but viewed from the UTC time offset `+00:00`).
1003
1004 >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'],
1005 ... utc=True)
1006 DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'],
1007 dtype='datetime64[ns, UTC]', freq=None)
1008
1009 - Inputs can contain both string or datetime, the above
1010 rules still apply
1011
1012 >>> pd.to_datetime(['2018-10-26 12:00', datetime(2020, 1, 1, 18)], utc=True)
1013 DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'],
1014 dtype='datetime64[ns, UTC]', freq=None)
1015 """
1016 if exact is not lib.no_default and format in {"mixed", "ISO8601"}:
1017 raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'")
1018 if infer_datetime_format is not lib.no_default:
1019 warnings.warn(
1020 "The argument 'infer_datetime_format' is deprecated and will "
1021 "be removed in a future version. "
1022 "A strict version of it is now the default, see "
1023 "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
1024 "You can safely remove this argument.",
1025 stacklevel=find_stack_level(),
1026 )
1027 if errors == "ignore":
1028 # GH#54467
1029 warnings.warn(
1030 "errors='ignore' is deprecated and will raise in a future version. "
1031 "Use to_datetime without passing `errors` and catch exceptions "
1032 "explicitly instead",
1033 FutureWarning,
1034 stacklevel=find_stack_level(),
1035 )
1036
1037 if arg is None:
1038 return None
1039
1040 if origin != "unix":
1041 arg = _adjust_to_origin(arg, origin, unit)
1042
1043 convert_listlike = partial(
1044 _convert_listlike_datetimes,
1045 utc=utc,
1046 unit=unit,
1047 dayfirst=dayfirst,
1048 yearfirst=yearfirst,
1049 errors=errors,
1050 exact=exact,
1051 )
1052 # pylint: disable-next=used-before-assignment
1053 result: Timestamp | NaTType | Series | Index
1054
1055 if isinstance(arg, Timestamp):
1056 result = arg
1057 if utc:
1058 if arg.tz is not None:
1059 result = arg.tz_convert("utc")
1060 else:
1061 result = arg.tz_localize("utc")
1062 elif isinstance(arg, ABCSeries):
1063 cache_array = _maybe_cache(arg, format, cache, convert_listlike)
1064 if not cache_array.empty:
1065 result = arg.map(cache_array)
1066 else:
1067 values = convert_listlike(arg._values, format)
1068 result = arg._constructor(values, index=arg.index, name=arg.name)
1069 elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)):
1070 result = _assemble_from_unit_mappings(arg, errors, utc)
1071 elif isinstance(arg, Index):
1072 cache_array = _maybe_cache(arg, format, cache, convert_listlike)
1073 if not cache_array.empty:
1074 result = _convert_and_box_cache(arg, cache_array, name=arg.name)
1075 else:
1076 result = convert_listlike(arg, format, name=arg.name)
1077 elif is_list_like(arg):
1078 try:
1079 # error: Argument 1 to "_maybe_cache" has incompatible type
1080 # "Union[float, str, datetime, List[Any], Tuple[Any, ...], ExtensionArray,
1081 # ndarray[Any, Any], Series]"; expected "Union[List[Any], Tuple[Any, ...],
1082 # Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series], Series]"
1083 argc = cast(
1084 Union[list, tuple, ExtensionArray, np.ndarray, "Series", Index], arg
1085 )
1086 cache_array = _maybe_cache(argc, format, cache, convert_listlike)
1087 except OutOfBoundsDatetime:
1088 # caching attempts to create a DatetimeIndex, which may raise
1089 # an OOB. If that's the desired behavior, then just reraise...
1090 if errors == "raise":
1091 raise
1092 # ... otherwise, continue without the cache.
1093 from pandas import Series
1094
1095 cache_array = Series([], dtype=object) # just an empty array
1096 if not cache_array.empty:
1097 result = _convert_and_box_cache(argc, cache_array)
1098 else:
1099 result = convert_listlike(argc, format)
1100 else:
1101 result = convert_listlike(np.array([arg]), format)[0]
1102 if isinstance(arg, bool) and isinstance(result, np.bool_):
1103 result = bool(result) # TODO: avoid this kludge.
1104
1105 # error: Incompatible return value type (got "Union[Timestamp, NaTType,
1106 # Series, Index]", expected "Union[DatetimeIndex, Series, float, str,
1107 # NaTType, None]")
1108 return result # type: ignore[return-value]
1109
1110
# mappings for assembling units
# Translation table used by _assemble_from_unit_mappings: maps the
# column-name aliases accepted when assembling datetimes from a
# DataFrame/dict (singular and plural spellings) onto the canonical
# unit codes used internally. Lookup falls back to a lower-cased match,
# except that a bare "m" stays case significant (see the helper).
_unit_map = {
    # calendar components keep their long names
    "year": "year",
    "years": "year",
    "month": "month",
    "months": "month",
    "day": "day",
    "days": "day",
    # sub-day components collapse to timedelta unit codes
    "hour": "h",
    "hours": "h",
    "minute": "m",
    "minutes": "m",
    "second": "s",
    "seconds": "s",
    "ms": "ms",
    "millisecond": "ms",
    "milliseconds": "ms",
    "us": "us",
    "microsecond": "us",
    "microseconds": "us",
    "ns": "ns",
    "nanosecond": "ns",
    "nanoseconds": "ns",
}
1135
1136
1137def _assemble_from_unit_mappings(arg, errors: DateTimeErrorChoices, utc: bool):
1138 """
1139 assemble the unit specified fields from the arg (DataFrame)
1140 Return a Series for actual parsing
1141
1142 Parameters
1143 ----------
1144 arg : DataFrame
1145 errors : {'ignore', 'raise', 'coerce'}, default 'raise'
1146
1147 - If :const:`'raise'`, then invalid parsing will raise an exception
1148 - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`
1149 - If :const:`'ignore'`, then invalid parsing will return the input
1150 utc : bool
1151 Whether to convert/localize timestamps to UTC.
1152
1153 Returns
1154 -------
1155 Series
1156 """
1157 from pandas import (
1158 DataFrame,
1159 to_numeric,
1160 to_timedelta,
1161 )
1162
1163 arg = DataFrame(arg)
1164 if not arg.columns.is_unique:
1165 raise ValueError("cannot assemble with duplicate keys")
1166
1167 # replace passed unit with _unit_map
1168 def f(value):
1169 if value in _unit_map:
1170 return _unit_map[value]
1171
1172 # m is case significant
1173 if value.lower() in _unit_map:
1174 return _unit_map[value.lower()]
1175
1176 return value
1177
1178 unit = {k: f(k) for k in arg.keys()}
1179 unit_rev = {v: k for k, v in unit.items()}
1180
1181 # we require at least Ymd
1182 required = ["year", "month", "day"]
1183 req = sorted(set(required) - set(unit_rev.keys()))
1184 if len(req):
1185 _required = ",".join(req)
1186 raise ValueError(
1187 "to assemble mappings requires at least that "
1188 f"[year, month, day] be specified: [{_required}] is missing"
1189 )
1190
1191 # keys we don't recognize
1192 excess = sorted(set(unit_rev.keys()) - set(_unit_map.values()))
1193 if len(excess):
1194 _excess = ",".join(excess)
1195 raise ValueError(
1196 f"extra keys have been passed to the datetime assemblage: [{_excess}]"
1197 )
1198
1199 def coerce(values):
1200 # we allow coercion to if errors allows
1201 values = to_numeric(values, errors=errors)
1202
1203 # prevent overflow in case of int8 or int16
1204 if is_integer_dtype(values.dtype):
1205 values = values.astype("int64", copy=False)
1206 return values
1207
1208 values = (
1209 coerce(arg[unit_rev["year"]]) * 10000
1210 + coerce(arg[unit_rev["month"]]) * 100
1211 + coerce(arg[unit_rev["day"]])
1212 )
1213 try:
1214 values = to_datetime(values, format="%Y%m%d", errors=errors, utc=utc)
1215 except (TypeError, ValueError) as err:
1216 raise ValueError(f"cannot assemble the datetimes: {err}") from err
1217
1218 units: list[UnitChoices] = ["h", "m", "s", "ms", "us", "ns"]
1219 for u in units:
1220 value = unit_rev.get(u)
1221 if value is not None and value in arg:
1222 try:
1223 values += to_timedelta(coerce(arg[value]), unit=u, errors=errors)
1224 except (TypeError, ValueError) as err:
1225 raise ValueError(
1226 f"cannot assemble the datetimes [{value}]: {err}"
1227 ) from err
1228 return values
1229
1230
# Explicit public API of this module (consumed by ``import *`` and by
# re-exports elsewhere in pandas); other names are implementation details.
__all__ = [
    "DateParseError",
    "should_cache",
    "to_datetime",
]