1from __future__ import annotations
2
3from collections import abc
4from datetime import datetime
5from functools import partial
6from itertools import islice
7from typing import (
8 TYPE_CHECKING,
9 Callable,
10 Hashable,
11 List,
12 Tuple,
13 TypedDict,
14 Union,
15 cast,
16 overload,
17)
18import warnings
19
20import numpy as np
21
22from pandas._libs import (
23 lib,
24 tslib,
25)
26from pandas._libs.tslibs import (
27 OutOfBoundsDatetime,
28 Timedelta,
29 Timestamp,
30 astype_overflowsafe,
31 get_unit_from_dtype,
32 iNaT,
33 is_supported_unit,
34 nat_strings,
35 parsing,
36 timezones as libtimezones,
37)
38from pandas._libs.tslibs.conversion import precision_from_unit
39from pandas._libs.tslibs.parsing import (
40 DateParseError,
41 guess_datetime_format,
42)
43from pandas._libs.tslibs.strptime import array_strptime
44from pandas._typing import (
45 AnyArrayLike,
46 ArrayLike,
47 DateTimeErrorChoices,
48 npt,
49)
50from pandas.util._exceptions import find_stack_level
51
52from pandas.core.dtypes.common import (
53 ensure_object,
54 is_datetime64_dtype,
55 is_datetime64tz_dtype,
56 is_float,
57 is_integer,
58 is_integer_dtype,
59 is_list_like,
60 is_numeric_dtype,
61 is_scalar,
62)
63from pandas.core.dtypes.generic import (
64 ABCDataFrame,
65 ABCSeries,
66)
67from pandas.core.dtypes.missing import notna
68
69from pandas.arrays import (
70 DatetimeArray,
71 IntegerArray,
72 PandasArray,
73)
74from pandas.core import algorithms
75from pandas.core.algorithms import unique
76from pandas.core.arrays.base import ExtensionArray
77from pandas.core.arrays.datetimes import (
78 maybe_convert_dtype,
79 objects_to_datetime64ns,
80 tz_to_dtype,
81)
82from pandas.core.construction import extract_array
83from pandas.core.indexes.base import Index
84from pandas.core.indexes.datetimes import DatetimeIndex
85
86if TYPE_CHECKING:
87 from pandas._libs.tslibs.nattype import NaTType
88 from pandas._libs.tslibs.timedeltas import UnitChoices
89
90 from pandas import (
91 DataFrame,
92 Series,
93 )
94
95# ---------------------------------------------------------------------
96# types used in annotations
97
# List-like containers accepted wherever to_datetime takes an array of dates.
ArrayConvertible = Union[List, Tuple, AnyArrayLike]
# A single date-like value: numeric epoch offset or date string.
Scalar = Union[float, str]
DatetimeScalar = Union[Scalar, datetime]

DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible]

# One column of dict/DataFrame-style input (e.g. the "year" values).
DatetimeDictArg = Union[List[Scalar], Tuple[Scalar, ...], AnyArrayLike]
106
class YearMonthDayDict(TypedDict, total=True):
    """Required datetime components for dict-style input to to_datetime."""

    year: DatetimeDictArg
    month: DatetimeDictArg
    day: DatetimeDictArg
111
112
class FulldatetimeDict(YearMonthDayDict, total=False):
    """
    Optional time components accepted on top of the required
    year/month/day keys; both singular and plural spellings
    (e.g. ``hour``/``hours``) are recognized.
    """

    hour: DatetimeDictArg
    hours: DatetimeDictArg
    minute: DatetimeDictArg
    minutes: DatetimeDictArg
    second: DatetimeDictArg
    seconds: DatetimeDictArg
    ms: DatetimeDictArg
    us: DatetimeDictArg
    ns: DatetimeDictArg
123
124
# Dict-like input whose keys are datetime components, or a DataFrame with
# such columns.
DictConvertible = Union[FulldatetimeDict, "DataFrame"]
# Minimum input length before caching of unique conversions is considered
# (see should_cache / _maybe_cache).
start_caching_at = 50
127
128
129# ---------------------------------------------------------------------
130
131
def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None:
    """
    Guess a strftime format from the first non-null element of ``arr``.

    Returns None when no format can be inferred.  If inference fails and
    more than one non-null element remains to be parsed, a UserWarning is
    emitted because element-wise fallback parsing may be inconsistent.
    """
    first_non_null = tslib.first_non_null(arr)
    if first_non_null == -1:
        # Nothing parseable at all.
        return None
    candidate = arr[first_non_null]
    # GH#32264 np.str_ object; only exact str is eligible for inference
    if type(candidate) is not str:
        return None
    guessed = guess_datetime_format(candidate, dayfirst=dayfirst)
    if guessed is not None:
        return guessed
    # If there are multiple non-null elements, warn about
    # how parsing might not be consistent
    if tslib.first_non_null(arr[first_non_null + 1 :]) != -1:
        warnings.warn(
            "Could not infer format, so each element will be parsed "
            "individually, falling back to `dateutil`. To ensure parsing is "
            "consistent and as-expected, please specify a format.",
            UserWarning,
            stacklevel=find_stack_level(),
        )
    return None
153
154
def should_cache(
    arg: ArrayConvertible, unique_share: float = 0.7, check_count: int | None = None
) -> bool:
    """
    Decide whether conversion results should be cached.

    Caching pays off when the share of unique values among the first
    `check_count` elements is below `unique_share`.

    Parameters
    ----------
    arg : listlike, tuple, 1-d array, Series
        Values that would be converted.
    unique_share : float, default 0.7
        Uniqueness threshold; must satisfy 0 < unique_share < 1.
    check_count : int, optional
        Number of leading elements to sample; must satisfy
        0 <= check_count <= len(arg).  When None, a size-dependent
        default is chosen.

    Returns
    -------
    bool
        True when caching is expected to be worthwhile.

    Notes
    -----
    Sequences with fewer than 50 items are never cached; for up to 5000
    elements a ten-percent sample is inspected; for longer sequences only
    the first 500 elements are checked.  All constants were chosen
    empirically.
    """
    if check_count is None:
        # Derive a sample size from the input length.
        if len(arg) <= start_caching_at:
            # in this case, the gain from caching is negligible
            return False
        check_count = len(arg) // 10 if len(arg) <= 5000 else 500
    else:
        assert (
            0 <= check_count <= len(arg)
        ), "check_count must be in next bounds: [0; len(arg)]"
        if check_count == 0:
            return False

    assert 0 < unique_share < 1, "unique_share must be in next bounds: (0; 1)"

    try:
        # We can't cache if the items are not hashable.
        sample_uniques = set(islice(arg, check_count))
    except TypeError:
        return False
    # Cache only when duplicates are common enough in the sample.
    return len(sample_uniques) <= check_count * unique_share
213
214
def _maybe_cache(
    arg: ArrayConvertible,
    format: str | None,
    cache: bool,
    convert_listlike: Callable,
) -> Series:
    """
    Build a cache mapping unique input dates to their converted values.

    Parameters
    ----------
    arg : listlike, tuple, 1-d array, Series
        Dates to be converted.
    format : string
        Strftime format to parse time.
    cache : bool
        True attempts to create a cache of converted values.
    convert_listlike : function
        Conversion function to apply on dates.

    Returns
    -------
    Series
        Cache of converted, unique dates (indexed by the original
        values).  May be empty when caching is disabled, not worthwhile,
        or not possible.
    """
    from pandas import Series

    cache_array = Series(dtype=object)

    if not cache:
        return cache_array
    # Perform a quicker unique check
    if not should_cache(arg):
        return cache_array

    unique_dates = unique(arg)
    if len(unique_dates) >= len(arg):
        # No duplicates, so a cache buys nothing.
        return cache_array
    cache_dates = convert_listlike(unique_dates, format)
    # GH#45319
    try:
        cache_array = Series(cache_dates, index=unique_dates, copy=False)
    except OutOfBoundsDatetime:
        return cache_array
    # GH#39882 and GH#35888 in case of None and NaT we get duplicates
    if not cache_array.index.is_unique:
        cache_array = cache_array[~cache_array.index.duplicated()]
    return cache_array
260
261
262def _box_as_indexlike(
263 dt_array: ArrayLike, utc: bool = False, name: Hashable = None
264) -> Index:
265 """
266 Properly boxes the ndarray of datetimes to DatetimeIndex
267 if it is possible or to generic Index instead
268
269 Parameters
270 ----------
271 dt_array: 1-d array
272 Array of datetimes to be wrapped in an Index.
273 utc : bool
274 Whether to convert/localize timestamps to UTC.
275 name : string, default None
276 Name for a resulting index
277
278 Returns
279 -------
280 result : datetime of converted dates
281 - DatetimeIndex if convertible to sole datetime64 type
282 - general Index otherwise
283 """
284
285 if is_datetime64_dtype(dt_array):
286 tz = "utc" if utc else None
287 return DatetimeIndex(dt_array, tz=tz, name=name)
288 return Index(dt_array, name=name, dtype=dt_array.dtype)
289
290
def _convert_and_box_cache(
    arg: DatetimeScalarOrArrayConvertible,
    cache_array: Series,
    name: Hashable | None = None,
) -> Index:
    """
    Map `arg` through a cache of already-converted dates and wrap the
    result in an Index.

    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
        Values to convert via the cache.
    cache_array : Series
        Cache of converted, unique dates (index = original values).
    name : string, default None
        Name for the resulting index.

    Returns
    -------
    Index
        Index-like of converted dates.
    """
    from pandas import Series

    mapped = Series(arg, dtype=cache_array.index.dtype).map(cache_array)
    return _box_as_indexlike(mapped._values, utc=False, name=name)
315
316
def _return_parsed_timezone_results(
    result: np.ndarray, timezones, utc: bool, name
) -> Index:
    """
    Box array_strptime results when a %z or %Z directive was passed.

    Parameters
    ----------
    result : ndarray[int64]
        int64 date representations of the dates.
    timezones : ndarray
        Timezone objects, one per parsed value.
    utc : bool
        Whether to convert/localize timestamps to UTC.
    name : string, default None
        Name for the resulting Index.

    Returns
    -------
    Index
        Index of tz-aware parsed dates.
    """
    boxed = np.empty(len(result), dtype=object)
    # Localize one timezone group at a time so every value keeps the
    # offset it was parsed with.
    for zone in unique(timezones):
        mask = timezones == zone
        group = DatetimeArray(result[mask]).tz_localize(zone)
        if utc:
            group = (
                group.tz_localize("utc")
                if group.tzinfo is None
                else group.tz_convert("utc")
            )
        boxed[mask] = group

    return Index(boxed, name=name)
350
351
def _convert_listlike_datetimes(
    arg,
    format: str | None,
    name: Hashable = None,
    utc: bool = False,
    unit: str | None = None,
    errors: DateTimeErrorChoices = "raise",
    dayfirst: bool | None = None,
    yearfirst: bool | None = None,
    exact: bool = True,
):
    """
    Helper function for to_datetime. Performs the conversions of 1D listlike
    of dates

    Parameters
    ----------
    arg : list, tuple, ndarray, Series, Index
        date to be parsed
    format : str or None
        strftime format to parse with; when None a format is guessed from
        the data, and "mixed" requests per-element parsing
    name : object
        None or string for the Index name
    utc : bool
        Whether to convert/localize timestamps to UTC.
    unit : str
        None or string of the frequency of the passed data
    errors : str
        error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore'
    dayfirst : bool
        dayfirst parsing behavior from to_datetime
    yearfirst : bool
        yearfirst parsing behavior from to_datetime
    exact : bool, default True
        exact format matching behavior from to_datetime

    Returns
    -------
    Index-like of parsed dates
    """
    # Normalize python sequences / PandasArray to an ndarray up front.
    if isinstance(arg, (list, tuple)):
        arg = np.array(arg, dtype="O")
    elif isinstance(arg, PandasArray):
        arg = np.array(arg)

    arg_dtype = getattr(arg, "dtype", None)
    # these are shortcutable
    tz = "utc" if utc else None
    # Shortcut 1: already tz-aware datetime64 — box/convert, no parsing.
    if is_datetime64tz_dtype(arg_dtype):
        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            return DatetimeIndex(arg, tz=tz, name=name)
        if utc:
            arg = arg.tz_convert(None).tz_localize("utc")
        return arg

    # Shortcut 2: naive datetime64 — cast to a supported resolution if
    # needed, then box (localizing when utc=True).
    elif is_datetime64_dtype(arg_dtype):
        arg_dtype = cast(np.dtype, arg_dtype)
        if not is_supported_unit(get_unit_from_dtype(arg_dtype)):
            # We go to closest supported reso, i.e. "s"
            arg = astype_overflowsafe(
                # TODO: looks like we incorrectly raise with errors=="ignore"
                np.asarray(arg),
                np.dtype("M8[s]"),
                is_coerce=errors == "coerce",
            )

        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            return DatetimeIndex(arg, tz=tz, name=name)
        elif utc:
            # DatetimeArray, DatetimeIndex
            return arg.tz_localize("utc")

        return arg

    # Shortcut 3: numeric epochs with an explicit unit take a dedicated path.
    elif unit is not None:
        if format is not None:
            raise ValueError("cannot specify both format and unit")
        return _to_datetime_with_unit(arg, unit, name, utc, errors)
    # Beyond this point only 1-D inputs are supported.
    elif getattr(arg, "ndim", 1) > 1:
        raise TypeError(
            "arg must be a string, datetime, list, tuple, 1-d array, or Series"
        )

    # warn if passing timedelta64, raise for PeriodDtype
    # NB: this must come after unit transformation
    try:
        arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz))
    except TypeError:
        # Incompatible dtype: honor the errors policy instead of raising.
        if errors == "coerce":
            npvalues = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg))
            return DatetimeIndex(npvalues, name=name)
        elif errors == "ignore":
            idx = Index(arg, name=name)
            return idx
        raise

    # Remaining inputs are parsed element-wise as objects.
    arg = ensure_object(arg)

    if format is None:
        # Try to infer a single format from the first non-null element.
        format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

    # `format` could be inferred, or user didn't ask for mixed-format parsing.
    if format is not None and format != "mixed":
        return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)

    # No single format available: parse each element individually.
    result, tz_parsed = objects_to_datetime64ns(
        arg,
        dayfirst=dayfirst,
        yearfirst=yearfirst,
        utc=utc,
        errors=errors,
        allow_object=True,
    )

    if tz_parsed is not None:
        # We can take a shortcut since the datetime64 numpy array
        # is in UTC
        dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed))
        return DatetimeIndex._simple_new(dta, name=name)

    return _box_as_indexlike(result, utc=utc, name=name)
471
472
def _array_strptime_with_fallback(
    arg,
    name,
    utc: bool,
    fmt: str,
    exact: bool,
    errors: str,
) -> Index:
    """
    Parse `arg` with array_strptime and box the result in an Index.

    When any element carried its own timezone (a %z or %Z directive in
    `fmt`), the tz-aware boxing path is taken; otherwise the parsed
    datetime64 values are boxed directly.
    """
    result, tzs = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc)
    has_parsed_tz = any(tz is not None for tz in tzs)
    if has_parsed_tz:
        return _return_parsed_timezone_results(result, tzs, utc, name)
    return _box_as_indexlike(result, utc=utc, name=name)
489
490
def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
    """
    to_datetime specialized to the case where a 'unit' is passed.

    Parameters
    ----------
    arg : listlike
        Values interpreted as multiples of `unit` since the epoch.
    unit : str
        Unit of the values in `arg` (e.g. 's', 'ms').
    name : Hashable
        Name for the resulting Index.
    utc : bool
        Whether to convert/localize the result to UTC.
    errors : str
        'raise', 'coerce' or 'ignore'.

    Returns
    -------
    Index
    """
    arg = extract_array(arg, extract_numpy=True)

    # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime
    # because it expects an ndarray argument
    if isinstance(arg, IntegerArray):
        # Nullable integers support a direct astype to datetime64.
        arr = arg.astype(f"datetime64[{unit}]")
        tz_parsed = None
    else:
        arg = np.asarray(arg)

        if arg.dtype.kind in ["i", "u"]:
            # Note we can't do "f" here because that could induce unwanted
            # rounding GH#14156, GH#20445
            arr = arg.astype(f"datetime64[{unit}]", copy=False)
            try:
                arr = astype_overflowsafe(arr, np.dtype("M8[ns]"), copy=False)
            except OutOfBoundsDatetime:
                if errors == "raise":
                    raise
                # Retry via the object path, which handles out-of-bounds
                # values according to `errors`.
                arg = arg.astype(object)
                return _to_datetime_with_unit(arg, unit, name, utc, errors)
            tz_parsed = None

        elif arg.dtype.kind == "f":
            # Scale floats to nanoseconds, masking NaN/iNaT first.
            mult, _ = precision_from_unit(unit)

            mask = np.isnan(arg) | (arg == iNaT)
            fvalues = (arg * mult).astype("f8", copy=False)
            fvalues[mask] = 0

            # Bounds-check before casting; fall back to the object path
            # unless errors == 'raise'.
            if (fvalues < Timestamp.min._value).any() or (
                fvalues > Timestamp.max._value
            ).any():
                if errors != "raise":
                    arg = arg.astype(object)
                    return _to_datetime_with_unit(arg, unit, name, utc, errors)
                raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")

            arr = fvalues.astype("M8[ns]", copy=False)
            arr[mask] = np.datetime64("NaT", "ns")

            tz_parsed = None
        else:
            # Mixed/object input: delegate to the cython converter.
            arg = arg.astype(object, copy=False)
            arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)

    if errors == "ignore":
        # Index constructor _may_ infer to DatetimeIndex
        result = Index._with_infer(arr, name=name)
    else:
        result = DatetimeIndex(arr, name=name)

    if not isinstance(result, DatetimeIndex):
        return result

    # GH#23758: We may still need to localize the result with tz
    # GH#25546: Apply tz_parsed first (from arg), then tz (from caller)
    # result will be naive but in UTC
    result = result.tz_localize("UTC").tz_convert(tz_parsed)

    if utc:
        if result.tz is None:
            result = result.tz_localize("utc")
        else:
            result = result.tz_convert("utc")
    return result
561
562
563def _adjust_to_origin(arg, origin, unit):
564 """
565 Helper function for to_datetime.
566 Adjust input argument to the specified origin
567
568 Parameters
569 ----------
570 arg : list, tuple, ndarray, Series, Index
571 date to be adjusted
572 origin : 'julian' or Timestamp
573 origin offset for the arg
574 unit : str
575 passed unit from to_datetime, must be 'D'
576
577 Returns
578 -------
579 ndarray or scalar of adjusted date(s)
580 """
581 if origin == "julian":
582 original = arg
583 j0 = Timestamp(0).to_julian_date()
584 if unit != "D":
585 raise ValueError("unit must be 'D' for origin='julian'")
586 try:
587 arg = arg - j0
588 except TypeError as err:
589 raise ValueError(
590 "incompatible 'arg' type for given 'origin'='julian'"
591 ) from err
592
593 # preemptively check this for a nice range
594 j_max = Timestamp.max.to_julian_date() - j0
595 j_min = Timestamp.min.to_julian_date() - j0
596 if np.any(arg > j_max) or np.any(arg < j_min):
597 raise OutOfBoundsDatetime(
598 f"{original} is Out of Bounds for origin='julian'"
599 )
600 else:
601 # arg must be numeric
602 if not (
603 (is_scalar(arg) and (is_integer(arg) or is_float(arg)))
604 or is_numeric_dtype(np.asarray(arg))
605 ):
606 raise ValueError(
607 f"'{arg}' is not compatible with origin='{origin}'; "
608 "it must be numeric with a unit specified"
609 )
610
611 # we are going to offset back to unix / epoch time
612 try:
613 offset = Timestamp(origin, unit=unit)
614 except OutOfBoundsDatetime as err:
615 raise OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") from err
616 except ValueError as err:
617 raise ValueError(
618 f"origin {origin} cannot be converted to a Timestamp"
619 ) from err
620
621 if offset.tz is not None:
622 raise ValueError(f"origin offset {offset} must be tz-naive")
623 td_offset = offset - Timestamp(0)
624
625 # convert the offset to the unit of the arg
626 # this should be lossless in terms of precision
627 ioffset = td_offset // Timedelta(1, unit=unit)
628
629 # scalars & ndarray-like can handle the addition
630 if is_list_like(arg) and not isinstance(arg, (ABCSeries, Index, np.ndarray)):
631 arg = np.asarray(arg)
632 arg = arg + ioffset
633 return arg
634
635
# Overloads for to_datetime: the return type mirrors the input type
# (scalar -> Timestamp, Series/dict -> Series, list-like -> DatetimeIndex).


@overload
def to_datetime(
    arg: DatetimeScalar,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> Timestamp:
    # scalar input -> scalar Timestamp
    ...


@overload
def to_datetime(
    arg: Series | DictConvertible,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> Series:
    # Series or dict/DataFrame input -> Series
    ...


@overload
def to_datetime(
    arg: list | tuple | Index | ArrayLike,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> DatetimeIndex:
    # list-like input -> DatetimeIndex
    ...
685
686
687def to_datetime(
688 arg: DatetimeScalarOrArrayConvertible | DictConvertible,
689 errors: DateTimeErrorChoices = "raise",
690 dayfirst: bool = False,
691 yearfirst: bool = False,
692 utc: bool = False,
693 format: str | None = None,
694 exact: bool | lib.NoDefault = lib.no_default,
695 unit: str | None = None,
696 infer_datetime_format: lib.NoDefault | bool = lib.no_default,
697 origin: str = "unix",
698 cache: bool = True,
699) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None:
700 """
701 Convert argument to datetime.
702
703 This function converts a scalar, array-like, :class:`Series` or
704 :class:`DataFrame`/dict-like to a pandas datetime object.
705
706 Parameters
707 ----------
708 arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like
709 The object to convert to a datetime. If a :class:`DataFrame` is provided, the
710 method expects minimally the following columns: :const:`"year"`,
711 :const:`"month"`, :const:`"day"`.
712 errors : {'ignore', 'raise', 'coerce'}, default 'raise'
713 - If :const:`'raise'`, then invalid parsing will raise an exception.
714 - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`.
715 - If :const:`'ignore'`, then invalid parsing will return the input.
716 dayfirst : bool, default False
717 Specify a date parse order if `arg` is str or is list-like.
718 If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"`
719 is parsed as :const:`2012-11-10`.
720
721 .. warning::
722
723 ``dayfirst=True`` is not strict, but will prefer to parse
724 with day first.
725
726 yearfirst : bool, default False
727 Specify a date parse order if `arg` is str or is list-like.
728
729 - If :const:`True` parses dates with the year first, e.g.
730 :const:`"10/11/12"` is parsed as :const:`2010-11-12`.
731 - If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst` is
732 preceded (same as :mod:`dateutil`).
733
734 .. warning::
735
736 ``yearfirst=True`` is not strict, but will prefer to parse
737 with year first.
738
739 utc : bool, default False
740 Control timezone-related parsing, localization and conversion.
741
742 - If :const:`True`, the function *always* returns a timezone-aware
743 UTC-localized :class:`Timestamp`, :class:`Series` or
744 :class:`DatetimeIndex`. To do this, timezone-naive inputs are
745 *localized* as UTC, while timezone-aware inputs are *converted* to UTC.
746
747 - If :const:`False` (default), inputs will not be coerced to UTC.
748 Timezone-naive inputs will remain naive, while timezone-aware ones
749 will keep their time offsets. Limitations exist for mixed
750 offsets (typically, daylight savings), see :ref:`Examples
751 <to_datetime_tz_examples>` section for details.
752
753 See also: pandas general documentation about `timezone conversion and
754 localization
755 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
756 #time-zone-handling>`_.
757
758 format : str, default None
759 The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See
760 `strftime documentation
761 <https://docs.python.org/3/library/datetime.html
762 #strftime-and-strptime-behavior>`_ for more information on choices, though
763 note that :const:`"%f"` will parse all the way up to nanoseconds.
764 You can also pass:
765
766 - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
767 time string (not necessarily in exactly the same format);
768 - "mixed", to infer the format for each element individually. This is risky,
769 and you should probably use it along with `dayfirst`.
770 exact : bool, default True
771 Control how `format` is used:
772
773 - If :const:`True`, require an exact `format` match.
774 - If :const:`False`, allow the `format` to match anywhere in the target
775 string.
776
777 Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``.
778 unit : str, default 'ns'
779 The unit of the arg (D,s,ms,us,ns) denote the unit, which is an
780 integer or float number. This will be based off the origin.
781 Example, with ``unit='ms'`` and ``origin='unix'``, this would calculate
782 the number of milliseconds to the unix epoch start.
783 infer_datetime_format : bool, default False
784 If :const:`True` and no `format` is given, attempt to infer the format
785 of the datetime strings based on the first non-NaN element,
786 and if it can be inferred, switch to a faster method of parsing them.
787 In some cases this can increase the parsing speed by ~5-10x.
788
789 .. deprecated:: 2.0.0
790 A strict version of this argument is now the default, passing it has
791 no effect.
792
793 origin : scalar, default 'unix'
794 Define the reference date. The numeric values would be parsed as number
795 of units (defined by `unit`) since this reference date.
796
797 - If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01.
798 - If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to
799 beginning of Julian Calendar. Julian day number :const:`0` is assigned
800 to the day starting at noon on January 1, 4713 BC.
    - If Timestamp convertible (Timestamp, dt.datetime, np.datetime64 or date
802 string), origin is set to Timestamp identified by origin.
803 - If a float or integer, origin is the millisecond difference
804 relative to 1970-01-01.
805 cache : bool, default True
806 If :const:`True`, use a cache of unique, converted dates to apply the
807 datetime conversion. May produce significant speed-up when parsing
808 duplicate date strings, especially ones with timezone offsets. The cache
809 is only used when there are at least 50 values. The presence of
810 out-of-bounds values will render the cache unusable and may slow down
811 parsing.
812
813 Returns
814 -------
815 datetime
816 If parsing succeeded.
817 Return type depends on input (types in parenthesis correspond to
818 fallback in case of unsuccessful timezone or out-of-range timestamp
819 parsing):
820
821 - scalar: :class:`Timestamp` (or :class:`datetime.datetime`)
822 - array-like: :class:`DatetimeIndex` (or :class:`Series` with
823 :class:`object` dtype containing :class:`datetime.datetime`)
824 - Series: :class:`Series` of :class:`datetime64` dtype (or
825 :class:`Series` of :class:`object` dtype containing
826 :class:`datetime.datetime`)
827 - DataFrame: :class:`Series` of :class:`datetime64` dtype (or
828 :class:`Series` of :class:`object` dtype containing
829 :class:`datetime.datetime`)
830
831 Raises
832 ------
833 ParserError
834 When parsing a date from string fails.
835 ValueError
836 When another datetime conversion error happens. For example when one
837 of 'year', 'month', day' columns is missing in a :class:`DataFrame`, or
838 when a Timezone-aware :class:`datetime.datetime` is found in an array-like
839 of mixed time offsets, and ``utc=False``.
840
841 See Also
842 --------
843 DataFrame.astype : Cast argument to a specified dtype.
844 to_timedelta : Convert argument to timedelta.
845 convert_dtypes : Convert dtypes.
846
847 Notes
848 -----
849
850 Many input types are supported, and lead to different output types:
851
852 - **scalars** can be int, float, str, datetime object (from stdlib :mod:`datetime`
853 module or :mod:`numpy`). They are converted to :class:`Timestamp` when
854 possible, otherwise they are converted to :class:`datetime.datetime`.
855 None/NaN/null scalars are converted to :const:`NaT`.
856
857 - **array-like** can contain int, float, str, datetime objects. They are
858 converted to :class:`DatetimeIndex` when possible, otherwise they are
859 converted to :class:`Index` with :class:`object` dtype, containing
860 :class:`datetime.datetime`. None/NaN/null entries are converted to
861 :const:`NaT` in both cases.
862
863 - **Series** are converted to :class:`Series` with :class:`datetime64`
864 dtype when possible, otherwise they are converted to :class:`Series` with
865 :class:`object` dtype, containing :class:`datetime.datetime`. None/NaN/null
866 entries are converted to :const:`NaT` in both cases.
867
868 - **DataFrame/dict-like** are converted to :class:`Series` with
869 :class:`datetime64` dtype. For each row a datetime is created from assembling
870 the various dataframe columns. Column keys can be common abbreviations
871 like [‘year’, ‘month’, ‘day’, ‘minute’, ‘second’, ‘ms’, ‘us’, ‘ns’]) or
872 plurals of the same.
873
874 The following causes are responsible for :class:`datetime.datetime` objects
875 being returned (possibly inside an :class:`Index` or a :class:`Series` with
876 :class:`object` dtype) instead of a proper pandas designated type
877 (:class:`Timestamp`, :class:`DatetimeIndex` or :class:`Series`
878 with :class:`datetime64` dtype):
879
880 - when any input element is before :const:`Timestamp.min` or after
881 :const:`Timestamp.max`, see `timestamp limitations
882 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
883 #timeseries-timestamp-limits>`_.
884
885 - when ``utc=False`` (default) and the input is an array-like or
886 :class:`Series` containing mixed naive/aware datetime, or aware with mixed
887 time offsets. Note that this happens in the (quite frequent) situation when
888 the timezone has a daylight savings policy. In that case you may wish to
889 use ``utc=True``.
890
891 Examples
892 --------
893
894 **Handling various input formats**
895
896 Assembling a datetime from multiple columns of a :class:`DataFrame`. The keys
897 can be common abbreviations like ['year', 'month', 'day', 'minute', 'second',
898 'ms', 'us', 'ns']) or plurals of the same
899
900 >>> df = pd.DataFrame({'year': [2015, 2016],
901 ... 'month': [2, 3],
902 ... 'day': [4, 5]})
903 >>> pd.to_datetime(df)
904 0 2015-02-04
905 1 2016-03-05
906 dtype: datetime64[ns]
907
908 Using a unix epoch time
909
910 >>> pd.to_datetime(1490195805, unit='s')
911 Timestamp('2017-03-22 15:16:45')
912 >>> pd.to_datetime(1490195805433502912, unit='ns')
913 Timestamp('2017-03-22 15:16:45.433502912')
914
915 .. warning:: For float arg, precision rounding might happen. To prevent
916 unexpected behavior use a fixed-width exact type.
917
918 Using a non-unix epoch origin
919
920 >>> pd.to_datetime([1, 2, 3], unit='D',
921 ... origin=pd.Timestamp('1960-01-01'))
922 DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'],
923 dtype='datetime64[ns]', freq=None)
924
925 **Differences with strptime behavior**
926
927 :const:`"%f"` will parse all the way up to nanoseconds.
928
929 >>> pd.to_datetime('2018-10-26 12:00:00.0000000011',
930 ... format='%Y-%m-%d %H:%M:%S.%f')
931 Timestamp('2018-10-26 12:00:00.000000001')
932
933 **Non-convertible date/times**
934
935 If a date does not meet the `timestamp limitations
936 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
937 #timeseries-timestamp-limits>`_, passing ``errors='ignore'``
938 will return the original input instead of raising any exception.
939
940 Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`,
941 in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`.
942
943 >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore')
944 '13000101'
945 >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
946 NaT
947
948 .. _to_datetime_tz_examples:
949
950 **Timezones and time offsets**
951
952 The default behaviour (``utc=False``) is as follows:
953
954 - Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`:
955
956 >>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15'])
957 DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'],
958 dtype='datetime64[ns]', freq=None)
959
960 - Timezone-aware inputs *with constant time offset* are converted to
961 timezone-aware :class:`DatetimeIndex`:
962
963 >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500'])
964 DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'],
965 dtype='datetime64[ns, UTC-05:00]', freq=None)
966
967 - However, timezone-aware inputs *with mixed time offsets* (for example
968 issued from a timezone with daylight savings, such as Europe/Paris)
969 are **not successfully converted** to a :class:`DatetimeIndex`. Instead a
970 simple :class:`Index` containing :class:`datetime.datetime` objects is
971 returned:
972
973 >>> pd.to_datetime(['2020-10-25 02:00 +0200', '2020-10-25 04:00 +0100'])
974 Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00],
975 dtype='object')
976
977 - A mix of timezone-aware and timezone-naive inputs is also converted to
978 a simple :class:`Index` containing :class:`datetime.datetime` objects:
979
980 >>> from datetime import datetime
981 >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)])
982 Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object')
983
984 |
985
986 Setting ``utc=True`` solves most of the above issues:
987
988 - Timezone-naive inputs are *localized* as UTC
989
990 >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True)
991 DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'],
992 dtype='datetime64[ns, UTC]', freq=None)
993
994 - Timezone-aware inputs are *converted* to UTC (the output represents the
995 exact same datetime, but viewed from the UTC time offset `+00:00`).
996
997 >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'],
998 ... utc=True)
999 DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'],
1000 dtype='datetime64[ns, UTC]', freq=None)
1001
1002 - Inputs can contain both string or datetime, the above
1003 rules still apply
1004
1005 >>> pd.to_datetime(['2018-10-26 12:00', datetime(2020, 1, 1, 18)], utc=True)
1006 DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'],
1007 dtype='datetime64[ns, UTC]', freq=None)
1008 """
1009 if exact is not lib.no_default and format in {"mixed", "ISO8601"}:
1010 raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'")
1011 if infer_datetime_format is not lib.no_default:
1012 warnings.warn(
1013 "The argument 'infer_datetime_format' is deprecated and will "
1014 "be removed in a future version. "
1015 "A strict version of it is now the default, see "
1016 "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
1017 "You can safely remove this argument.",
1018 stacklevel=find_stack_level(),
1019 )
1020 if arg is None:
1021 return None
1022
1023 if origin != "unix":
1024 arg = _adjust_to_origin(arg, origin, unit)
1025
1026 convert_listlike = partial(
1027 _convert_listlike_datetimes,
1028 utc=utc,
1029 unit=unit,
1030 dayfirst=dayfirst,
1031 yearfirst=yearfirst,
1032 errors=errors,
1033 exact=exact,
1034 )
1035 # pylint: disable-next=used-before-assignment
1036 result: Timestamp | NaTType | Series | Index
1037
1038 if isinstance(arg, Timestamp):
1039 result = arg
1040 if utc:
1041 if arg.tz is not None:
1042 result = arg.tz_convert("utc")
1043 else:
1044 result = arg.tz_localize("utc")
1045 elif isinstance(arg, ABCSeries):
1046 cache_array = _maybe_cache(arg, format, cache, convert_listlike)
1047 if not cache_array.empty:
1048 result = arg.map(cache_array)
1049 else:
1050 values = convert_listlike(arg._values, format)
1051 result = arg._constructor(values, index=arg.index, name=arg.name)
1052 elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)):
1053 result = _assemble_from_unit_mappings(arg, errors, utc)
1054 elif isinstance(arg, Index):
1055 cache_array = _maybe_cache(arg, format, cache, convert_listlike)
1056 if not cache_array.empty:
1057 result = _convert_and_box_cache(arg, cache_array, name=arg.name)
1058 else:
1059 result = convert_listlike(arg, format, name=arg.name)
1060 elif is_list_like(arg):
1061 try:
1062 # error: Argument 1 to "_maybe_cache" has incompatible type
1063 # "Union[float, str, datetime, List[Any], Tuple[Any, ...], ExtensionArray,
1064 # ndarray[Any, Any], Series]"; expected "Union[List[Any], Tuple[Any, ...],
1065 # Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series], Series]"
1066 argc = cast(
1067 Union[list, tuple, ExtensionArray, np.ndarray, "Series", Index], arg
1068 )
1069 cache_array = _maybe_cache(argc, format, cache, convert_listlike)
1070 except OutOfBoundsDatetime:
1071 # caching attempts to create a DatetimeIndex, which may raise
1072 # an OOB. If that's the desired behavior, then just reraise...
1073 if errors == "raise":
1074 raise
1075 # ... otherwise, continue without the cache.
1076 from pandas import Series
1077
1078 cache_array = Series([], dtype=object) # just an empty array
1079 if not cache_array.empty:
1080 result = _convert_and_box_cache(argc, cache_array)
1081 else:
1082 result = convert_listlike(argc, format)
1083 else:
1084 result = convert_listlike(np.array([arg]), format)[0]
1085 if isinstance(arg, bool) and isinstance(result, np.bool_):
1086 result = bool(result) # TODO: avoid this kludge.
1087
1088 # error: Incompatible return value type (got "Union[Timestamp, NaTType,
1089 # Series, Index]", expected "Union[DatetimeIndex, Series, float, str,
1090 # NaTType, None]")
1091 return result # type: ignore[return-value]
1092
1093
1094# mappings for assembling units
1095_unit_map = {
1096 "year": "year",
1097 "years": "year",
1098 "month": "month",
1099 "months": "month",
1100 "day": "day",
1101 "days": "day",
1102 "hour": "h",
1103 "hours": "h",
1104 "minute": "m",
1105 "minutes": "m",
1106 "second": "s",
1107 "seconds": "s",
1108 "ms": "ms",
1109 "millisecond": "ms",
1110 "milliseconds": "ms",
1111 "us": "us",
1112 "microsecond": "us",
1113 "microseconds": "us",
1114 "ns": "ns",
1115 "nanosecond": "ns",
1116 "nanoseconds": "ns",
1117}
1118
1119
1120def _assemble_from_unit_mappings(arg, errors: DateTimeErrorChoices, utc: bool):
1121 """
1122 assemble the unit specified fields from the arg (DataFrame)
1123 Return a Series for actual parsing
1124
1125 Parameters
1126 ----------
1127 arg : DataFrame
1128 errors : {'ignore', 'raise', 'coerce'}, default 'raise'
1129
1130 - If :const:`'raise'`, then invalid parsing will raise an exception
1131 - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`
1132 - If :const:`'ignore'`, then invalid parsing will return the input
1133 utc : bool
1134 Whether to convert/localize timestamps to UTC.
1135
1136 Returns
1137 -------
1138 Series
1139 """
1140 from pandas import (
1141 DataFrame,
1142 to_numeric,
1143 to_timedelta,
1144 )
1145
1146 arg = DataFrame(arg)
1147 if not arg.columns.is_unique:
1148 raise ValueError("cannot assemble with duplicate keys")
1149
1150 # replace passed unit with _unit_map
1151 def f(value):
1152 if value in _unit_map:
1153 return _unit_map[value]
1154
1155 # m is case significant
1156 if value.lower() in _unit_map:
1157 return _unit_map[value.lower()]
1158
1159 return value
1160
1161 unit = {k: f(k) for k in arg.keys()}
1162 unit_rev = {v: k for k, v in unit.items()}
1163
1164 # we require at least Ymd
1165 required = ["year", "month", "day"]
1166 req = sorted(set(required) - set(unit_rev.keys()))
1167 if len(req):
1168 _required = ",".join(req)
1169 raise ValueError(
1170 "to assemble mappings requires at least that "
1171 f"[year, month, day] be specified: [{_required}] is missing"
1172 )
1173
1174 # keys we don't recognize
1175 excess = sorted(set(unit_rev.keys()) - set(_unit_map.values()))
1176 if len(excess):
1177 _excess = ",".join(excess)
1178 raise ValueError(
1179 f"extra keys have been passed to the datetime assemblage: [{_excess}]"
1180 )
1181
1182 def coerce(values):
1183 # we allow coercion to if errors allows
1184 values = to_numeric(values, errors=errors)
1185
1186 # prevent overflow in case of int8 or int16
1187 if is_integer_dtype(values):
1188 values = values.astype("int64", copy=False)
1189 return values
1190
1191 values = (
1192 coerce(arg[unit_rev["year"]]) * 10000
1193 + coerce(arg[unit_rev["month"]]) * 100
1194 + coerce(arg[unit_rev["day"]])
1195 )
1196 try:
1197 values = to_datetime(values, format="%Y%m%d", errors=errors, utc=utc)
1198 except (TypeError, ValueError) as err:
1199 raise ValueError(f"cannot assemble the datetimes: {err}") from err
1200
1201 units: list[UnitChoices] = ["h", "m", "s", "ms", "us", "ns"]
1202 for u in units:
1203 value = unit_rev.get(u)
1204 if value is not None and value in arg:
1205 try:
1206 values += to_timedelta(coerce(arg[value]), unit=u, errors=errors)
1207 except (TypeError, ValueError) as err:
1208 raise ValueError(
1209 f"cannot assemble the datetimes [{value}]: {err}"
1210 ) from err
1211 return values
1212
1213
1214def _attempt_YYYYMMDD(arg: npt.NDArray[np.object_], errors: str) -> np.ndarray | None:
1215 """
1216 try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like,
1217 arg is a passed in as an object dtype, but could really be ints/strings
1218 with nan-like/or floats (e.g. with nan)
1219
1220 Parameters
1221 ----------
1222 arg : np.ndarray[object]
1223 errors : {'raise','ignore','coerce'}
1224 """
1225
1226 def calc(carg):
1227 # calculate the actual result
1228 carg = carg.astype(object, copy=False)
1229 parsed = parsing.try_parse_year_month_day(
1230 carg / 10000, carg / 100 % 100, carg % 100
1231 )
1232 return tslib.array_to_datetime(parsed, errors=errors)[0]
1233
1234 def calc_with_mask(carg, mask):
1235 result = np.empty(carg.shape, dtype="M8[ns]")
1236 iresult = result.view("i8")
1237 iresult[~mask] = iNaT
1238
1239 masked_result = calc(carg[mask].astype(np.float64).astype(np.int64))
1240 result[mask] = masked_result.astype("M8[ns]")
1241 return result
1242
1243 # try intlike / strings that are ints
1244 try:
1245 return calc(arg.astype(np.int64))
1246 except (ValueError, OverflowError, TypeError):
1247 pass
1248
1249 # a float with actual np.nan
1250 try:
1251 carg = arg.astype(np.float64)
1252 return calc_with_mask(carg, notna(carg))
1253 except (ValueError, OverflowError, TypeError):
1254 pass
1255
1256 # string with NaN-like
1257 try:
1258 # error: Argument 2 to "isin" has incompatible type "List[Any]"; expected
1259 # "Union[Union[ExtensionArray, ndarray], Index, Series]"
1260 mask = ~algorithms.isin(arg, list(nat_strings)) # type: ignore[arg-type]
1261 return calc_with_mask(arg, mask)
1262 except (ValueError, OverflowError, TypeError):
1263 pass
1264
1265 return None
1266
1267
# Names re-exported as this module's public surface.
__all__ = [
    "DateParseError",
    "should_cache",
    "to_datetime",
]