1from __future__ import annotations
2
3from collections import abc
4from datetime import datetime
5from functools import partial
6from itertools import islice
7from typing import (
8 TYPE_CHECKING,
9 Callable,
10 Hashable,
11 List,
12 Tuple,
13 TypedDict,
14 Union,
15 cast,
16 overload,
17)
18import warnings
19
20import numpy as np
21
22from pandas._libs import (
23 lib,
24 tslib,
25)
26from pandas._libs.tslibs import (
27 OutOfBoundsDatetime,
28 Timedelta,
29 Timestamp,
30 astype_overflowsafe,
31 get_unit_from_dtype,
32 iNaT,
33 is_supported_unit,
34 nat_strings,
35 parsing,
36 timezones as libtimezones,
37)
38from pandas._libs.tslibs.conversion import precision_from_unit
39from pandas._libs.tslibs.parsing import (
40 DateParseError,
41 guess_datetime_format,
42)
43from pandas._libs.tslibs.strptime import array_strptime
44from pandas._typing import (
45 AnyArrayLike,
46 ArrayLike,
47 DateTimeErrorChoices,
48 npt,
49)
50from pandas.util._exceptions import find_stack_level
51
52from pandas.core.dtypes.common import (
53 ensure_object,
54 is_datetime64_dtype,
55 is_datetime64tz_dtype,
56 is_float,
57 is_integer,
58 is_integer_dtype,
59 is_list_like,
60 is_numeric_dtype,
61 is_scalar,
62)
63from pandas.core.dtypes.generic import (
64 ABCDataFrame,
65 ABCSeries,
66)
67from pandas.core.dtypes.missing import notna
68
69from pandas.arrays import (
70 DatetimeArray,
71 IntegerArray,
72 PandasArray,
73)
74from pandas.core import algorithms
75from pandas.core.algorithms import unique
76from pandas.core.arrays.base import ExtensionArray
77from pandas.core.arrays.datetimes import (
78 maybe_convert_dtype,
79 objects_to_datetime64ns,
80 tz_to_dtype,
81)
82from pandas.core.construction import extract_array
83from pandas.core.indexes.base import Index
84from pandas.core.indexes.datetimes import DatetimeIndex
85
86if TYPE_CHECKING:
87 from pandas._libs.tslibs.nattype import NaTType
88 from pandas._libs.tslibs.timedeltas import UnitChoices
89
90 from pandas import (
91 DataFrame,
92 Series,
93 )
94
95# ---------------------------------------------------------------------
96# types used in annotations
97
# List-like containers accepted wherever to_datetime takes an array of dates.
ArrayConvertible = Union[List, Tuple, AnyArrayLike]
# A single date-like value: numeric epoch offset or date string.
Scalar = Union[float, str]
DatetimeScalar = Union[Scalar, datetime]

DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible]

# One column of dict/DataFrame-style input (e.g. the "year" values).
DatetimeDictArg = Union[List[Scalar], Tuple[Scalar, ...], AnyArrayLike]
106
class YearMonthDayDict(TypedDict, total=True):
    """Required datetime components for dict-style input to to_datetime."""

    year: DatetimeDictArg
    month: DatetimeDictArg
    day: DatetimeDictArg
111
112
class FulldatetimeDict(YearMonthDayDict, total=False):
    """
    Optional time components accepted on top of the required
    year/month/day keys; both singular and plural spellings
    (e.g. ``hour``/``hours``) are recognized.
    """

    hour: DatetimeDictArg
    hours: DatetimeDictArg
    minute: DatetimeDictArg
    minutes: DatetimeDictArg
    second: DatetimeDictArg
    seconds: DatetimeDictArg
    ms: DatetimeDictArg
    us: DatetimeDictArg
    ns: DatetimeDictArg
123
124
# Dict-like input whose keys are datetime components, or a DataFrame with
# such columns.
DictConvertible = Union[FulldatetimeDict, "DataFrame"]
# Minimum input length before caching of unique conversions is considered
# (see should_cache / _maybe_cache).
start_caching_at = 50
127
128
129# ---------------------------------------------------------------------
130
131
def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None:
    """
    Guess a strftime format from the first non-null element of ``arr``.

    Returns None when no format can be inferred.  If inference fails and
    more than one non-null element remains to be parsed, a UserWarning is
    emitted because element-wise fallback parsing may be inconsistent.
    """
    first_non_null = tslib.first_non_null(arr)
    if first_non_null == -1:
        # Nothing parseable at all.
        return None
    candidate = arr[first_non_null]
    # GH#32264 np.str_ object; only exact str is eligible for inference
    if type(candidate) is not str:
        return None
    guessed = guess_datetime_format(candidate, dayfirst=dayfirst)
    if guessed is not None:
        return guessed
    # If there are multiple non-null elements, warn about
    # how parsing might not be consistent
    if tslib.first_non_null(arr[first_non_null + 1 :]) != -1:
        warnings.warn(
            "Could not infer format, so each element will be parsed "
            "individually, falling back to `dateutil`. To ensure parsing is "
            "consistent and as-expected, please specify a format.",
            UserWarning,
            stacklevel=find_stack_level(),
        )
    return None
153
154
def should_cache(
    arg: ArrayConvertible, unique_share: float = 0.7, check_count: int | None = None
) -> bool:
    """
    Decide whether conversion results should be cached.

    Caching pays off when the share of unique values among the first
    `check_count` elements is below `unique_share`.

    Parameters
    ----------
    arg : listlike, tuple, 1-d array, Series
        Values that would be converted.
    unique_share : float, default 0.7
        Uniqueness threshold; must satisfy 0 < unique_share < 1.
    check_count : int, optional
        Number of leading elements to sample; must satisfy
        0 <= check_count <= len(arg).  When None, a size-dependent
        default is chosen.

    Returns
    -------
    bool
        True when caching is expected to be worthwhile.

    Notes
    -----
    Sequences with fewer than 50 items are never cached; for up to 5000
    elements a ten-percent sample is inspected; for longer sequences only
    the first 500 elements are checked.  All constants were chosen
    empirically.
    """
    if check_count is None:
        # Derive a sample size from the input length.
        if len(arg) <= start_caching_at:
            # in this case, the gain from caching is negligible
            return False
        check_count = len(arg) // 10 if len(arg) <= 5000 else 500
    else:
        assert (
            0 <= check_count <= len(arg)
        ), "check_count must be in next bounds: [0; len(arg)]"
        if check_count == 0:
            return False

    assert 0 < unique_share < 1, "unique_share must be in next bounds: (0; 1)"

    try:
        # We can't cache if the items are not hashable.
        sample_uniques = set(islice(arg, check_count))
    except TypeError:
        return False
    # Cache only when duplicates are common enough in the sample.
    return len(sample_uniques) <= check_count * unique_share
213
214
def _maybe_cache(
    arg: ArrayConvertible,
    format: str | None,
    cache: bool,
    convert_listlike: Callable,
) -> Series:
    """
    Build a cache mapping unique input dates to their converted values.

    Parameters
    ----------
    arg : listlike, tuple, 1-d array, Series
        Dates to be converted.
    format : string
        Strftime format to parse time.
    cache : bool
        True attempts to create a cache of converted values.
    convert_listlike : function
        Conversion function to apply on dates.

    Returns
    -------
    Series
        Cache of converted, unique dates (indexed by the original
        values).  May be empty when caching is disabled, not worthwhile,
        or not possible.
    """
    from pandas import Series

    cache_array = Series(dtype=object)

    if not cache:
        return cache_array
    # Perform a quicker unique check
    if not should_cache(arg):
        return cache_array

    unique_dates = unique(arg)
    if len(unique_dates) >= len(arg):
        # No duplicates, so a cache buys nothing.
        return cache_array
    cache_dates = convert_listlike(unique_dates, format)
    # GH#45319
    try:
        cache_array = Series(cache_dates, index=unique_dates, copy=False)
    except OutOfBoundsDatetime:
        return cache_array
    # GH#39882 and GH#35888 in case of None and NaT we get duplicates
    if not cache_array.index.is_unique:
        cache_array = cache_array[~cache_array.index.duplicated()]
    return cache_array
260
261
262def _box_as_indexlike(
263 dt_array: ArrayLike, utc: bool = False, name: Hashable = None
264) -> Index:
265 """
266 Properly boxes the ndarray of datetimes to DatetimeIndex
267 if it is possible or to generic Index instead
268
269 Parameters
270 ----------
271 dt_array: 1-d array
272 Array of datetimes to be wrapped in an Index.
273 utc : bool
274 Whether to convert/localize timestamps to UTC.
275 name : string, default None
276 Name for a resulting index
277
278 Returns
279 -------
280 result : datetime of converted dates
281 - DatetimeIndex if convertible to sole datetime64 type
282 - general Index otherwise
283 """
284
285 if is_datetime64_dtype(dt_array):
286 tz = "utc" if utc else None
287 return DatetimeIndex(dt_array, tz=tz, name=name)
288 return Index(dt_array, name=name, dtype=dt_array.dtype)
289
290
def _convert_and_box_cache(
    arg: DatetimeScalarOrArrayConvertible,
    cache_array: Series,
    name: Hashable | None = None,
) -> Index:
    """
    Map `arg` through a cache of already-converted dates and wrap the
    result in an Index.

    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
        Values to convert via the cache.
    cache_array : Series
        Cache of converted, unique dates (index = original values).
    name : string, default None
        Name for the resulting index.

    Returns
    -------
    Index
        Index-like of converted dates.
    """
    from pandas import Series

    mapped = Series(arg, dtype=cache_array.index.dtype).map(cache_array)
    return _box_as_indexlike(mapped._values, utc=False, name=name)
315
316
def _return_parsed_timezone_results(
    result: np.ndarray, timezones, utc: bool, name
) -> Index:
    """
    Box array_strptime results when a %z or %Z directive was passed.

    Parameters
    ----------
    result : ndarray[int64]
        int64 date representations of the dates.
    timezones : ndarray
        Timezone objects, one per parsed value.
    utc : bool
        Whether to convert/localize timestamps to UTC.
    name : string, default None
        Name for the resulting Index.

    Returns
    -------
    Index
        Index of tz-aware parsed dates.
    """
    boxed = np.empty(len(result), dtype=object)
    # Localize one timezone group at a time so every value keeps the
    # offset it was parsed with.
    for zone in unique(timezones):
        mask = timezones == zone
        group = DatetimeArray(result[mask]).tz_localize(zone)
        if utc:
            group = (
                group.tz_localize("utc")
                if group.tzinfo is None
                else group.tz_convert("utc")
            )
        boxed[mask] = group

    return Index(boxed, name=name)
350
351
def _convert_listlike_datetimes(
    arg,
    format: str | None,
    name: Hashable = None,
    utc: bool = False,
    unit: str | None = None,
    errors: DateTimeErrorChoices = "raise",
    dayfirst: bool | None = None,
    yearfirst: bool | None = None,
    exact: bool = True,
):
    """
    Helper function for to_datetime. Performs the conversions of 1D listlike
    of dates

    Parameters
    ----------
    arg : list, tuple, ndarray, Series, Index
        date to be parsed
    format : str or None
        strftime format to parse with; when None a format is guessed from
        the data, and "mixed" requests per-element parsing
    name : object
        None or string for the Index name
    utc : bool
        Whether to convert/localize timestamps to UTC.
    unit : str
        None or string of the frequency of the passed data
    errors : str
        error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore'
    dayfirst : bool
        dayfirst parsing behavior from to_datetime
    yearfirst : bool
        yearfirst parsing behavior from to_datetime
    exact : bool, default True
        exact format matching behavior from to_datetime

    Returns
    -------
    Index-like of parsed dates
    """
    # Normalize python sequences / PandasArray to an ndarray up front.
    if isinstance(arg, (list, tuple)):
        arg = np.array(arg, dtype="O")
    elif isinstance(arg, PandasArray):
        arg = np.array(arg)

    arg_dtype = getattr(arg, "dtype", None)
    # these are shortcutable
    tz = "utc" if utc else None
    # Shortcut 1: already tz-aware datetime64 — box/convert, no parsing.
    if is_datetime64tz_dtype(arg_dtype):
        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            return DatetimeIndex(arg, tz=tz, name=name)
        if utc:
            arg = arg.tz_convert(None).tz_localize("utc")
        return arg

    # Shortcut 2: naive datetime64 — cast to a supported resolution if
    # needed, then box (localizing when utc=True).
    elif is_datetime64_dtype(arg_dtype):
        arg_dtype = cast(np.dtype, arg_dtype)
        if not is_supported_unit(get_unit_from_dtype(arg_dtype)):
            # We go to closest supported reso, i.e. "s"
            arg = astype_overflowsafe(
                # TODO: looks like we incorrectly raise with errors=="ignore"
                np.asarray(arg),
                np.dtype("M8[s]"),
                is_coerce=errors == "coerce",
            )

        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            return DatetimeIndex(arg, tz=tz, name=name)
        elif utc:
            # DatetimeArray, DatetimeIndex
            return arg.tz_localize("utc")

        return arg

    # Shortcut 3: numeric epochs with an explicit unit take a dedicated path.
    elif unit is not None:
        if format is not None:
            raise ValueError("cannot specify both format and unit")
        return _to_datetime_with_unit(arg, unit, name, utc, errors)
    # Beyond this point only 1-D inputs are supported.
    elif getattr(arg, "ndim", 1) > 1:
        raise TypeError(
            "arg must be a string, datetime, list, tuple, 1-d array, or Series"
        )

    # warn if passing timedelta64, raise for PeriodDtype
    # NB: this must come after unit transformation
    try:
        arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz))
    except TypeError:
        # Incompatible dtype: honor the errors policy instead of raising.
        if errors == "coerce":
            npvalues = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg))
            return DatetimeIndex(npvalues, name=name)
        elif errors == "ignore":
            idx = Index(arg, name=name)
            return idx
        raise

    # Remaining inputs are parsed element-wise as objects.
    arg = ensure_object(arg)

    if format is None:
        # Try to infer a single format from the first non-null element.
        format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

    # `format` could be inferred, or user didn't ask for mixed-format parsing.
    if format is not None and format != "mixed":
        return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)

    # No single format available: parse each element individually.
    result, tz_parsed = objects_to_datetime64ns(
        arg,
        dayfirst=dayfirst,
        yearfirst=yearfirst,
        utc=utc,
        errors=errors,
        allow_object=True,
    )

    if tz_parsed is not None:
        # We can take a shortcut since the datetime64 numpy array
        # is in UTC
        dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed))
        return DatetimeIndex._simple_new(dta, name=name)

    return _box_as_indexlike(result, utc=utc, name=name)
471
472
def _array_strptime_with_fallback(
    arg,
    name,
    utc: bool,
    fmt: str,
    exact: bool,
    errors: str,
) -> Index:
    """
    Parse `arg` with array_strptime and box the result in an Index.

    When any element carried its own timezone (a %z or %Z directive in
    `fmt`), the tz-aware boxing path is taken; otherwise the parsed
    datetime64 values are boxed directly.
    """
    result, tzs = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc)
    has_parsed_tz = any(tz is not None for tz in tzs)
    if has_parsed_tz:
        return _return_parsed_timezone_results(result, tzs, utc, name)
    return _box_as_indexlike(result, utc=utc, name=name)
489
490
def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
    """
    to_datetime specialized to the case where a 'unit' is passed.

    Parameters
    ----------
    arg : listlike
        Values interpreted as multiples of `unit` since the epoch.
    unit : str
        Unit of the values in `arg` (e.g. 's', 'ms').
    name : Hashable
        Name for the resulting Index.
    utc : bool
        Whether to convert/localize the result to UTC.
    errors : str
        'raise', 'coerce' or 'ignore'.

    Returns
    -------
    Index
    """
    arg = extract_array(arg, extract_numpy=True)

    # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime
    # because it expects an ndarray argument
    if isinstance(arg, IntegerArray):
        # Nullable integers support a direct astype to datetime64.
        arr = arg.astype(f"datetime64[{unit}]")
        tz_parsed = None
    else:
        arg = np.asarray(arg)

        if arg.dtype.kind in ["i", "u"]:
            # Note we can't do "f" here because that could induce unwanted
            # rounding GH#14156, GH#20445
            arr = arg.astype(f"datetime64[{unit}]", copy=False)
            try:
                arr = astype_overflowsafe(arr, np.dtype("M8[ns]"), copy=False)
            except OutOfBoundsDatetime:
                if errors == "raise":
                    raise
                # Retry via the object path, which handles out-of-bounds
                # values according to `errors`.
                arg = arg.astype(object)
                return _to_datetime_with_unit(arg, unit, name, utc, errors)
            tz_parsed = None

        elif arg.dtype.kind == "f":
            # Scale floats to nanoseconds, masking NaN/iNaT first.
            mult, _ = precision_from_unit(unit)

            mask = np.isnan(arg) | (arg == iNaT)
            fvalues = (arg * mult).astype("f8", copy=False)
            fvalues[mask] = 0

            # Bounds-check before casting; fall back to the object path
            # unless errors == 'raise'.
            if (fvalues < Timestamp.min._value).any() or (
                fvalues > Timestamp.max._value
            ).any():
                if errors != "raise":
                    arg = arg.astype(object)
                    return _to_datetime_with_unit(arg, unit, name, utc, errors)
                raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")

            arr = fvalues.astype("M8[ns]", copy=False)
            arr[mask] = np.datetime64("NaT", "ns")

            tz_parsed = None
        else:
            # Mixed/object input: delegate to the cython converter.
            arg = arg.astype(object, copy=False)
            arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)

    if errors == "ignore":
        # Index constructor _may_ infer to DatetimeIndex
        result = Index._with_infer(arr, name=name)
    else:
        result = DatetimeIndex(arr, name=name)

    if not isinstance(result, DatetimeIndex):
        return result

    # GH#23758: We may still need to localize the result with tz
    # GH#25546: Apply tz_parsed first (from arg), then tz (from caller)
    # result will be naive but in UTC
    result = result.tz_localize("UTC").tz_convert(tz_parsed)

    if utc:
        if result.tz is None:
            result = result.tz_localize("utc")
        else:
            result = result.tz_convert("utc")
    return result
561
562
563def _adjust_to_origin(arg, origin, unit):
564 """
565 Helper function for to_datetime.
566 Adjust input argument to the specified origin
567
568 Parameters
569 ----------
570 arg : list, tuple, ndarray, Series, Index
571 date to be adjusted
572 origin : 'julian' or Timestamp
573 origin offset for the arg
574 unit : str
575 passed unit from to_datetime, must be 'D'
576
577 Returns
578 -------
579 ndarray or scalar of adjusted date(s)
580 """
581 if origin == "julian":
582 original = arg
583 j0 = Timestamp(0).to_julian_date()
584 if unit != "D":
585 raise ValueError("unit must be 'D' for origin='julian'")
586 try:
587 arg = arg - j0
588 except TypeError as err:
589 raise ValueError(
590 "incompatible 'arg' type for given 'origin'='julian'"
591 ) from err
592
593 # preemptively check this for a nice range
594 j_max = Timestamp.max.to_julian_date() - j0
595 j_min = Timestamp.min.to_julian_date() - j0
596 if np.any(arg > j_max) or np.any(arg < j_min):
597 raise OutOfBoundsDatetime(
598 f"{original} is Out of Bounds for origin='julian'"
599 )
600 else:
601 # arg must be numeric
602 if not (
603 (is_scalar(arg) and (is_integer(arg) or is_float(arg)))
604 or is_numeric_dtype(np.asarray(arg))
605 ):
606 raise ValueError(
607 f"'{arg}' is not compatible with origin='{origin}'; "
608 "it must be numeric with a unit specified"
609 )
610
611 # we are going to offset back to unix / epoch time
612 try:
613 offset = Timestamp(origin, unit=unit)
614 except OutOfBoundsDatetime as err:
615 raise OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") from err
616 except ValueError as err:
617 raise ValueError(
618 f"origin {origin} cannot be converted to a Timestamp"
619 ) from err
620
621 if offset.tz is not None:
622 raise ValueError(f"origin offset {offset} must be tz-naive")
623 td_offset = offset - Timestamp(0)
624
625 # convert the offset to the unit of the arg
626 # this should be lossless in terms of precision
627 ioffset = td_offset // Timedelta(1, unit=unit)
628
629 # scalars & ndarray-like can handle the addition
630 if is_list_like(arg) and not isinstance(arg, (ABCSeries, Index, np.ndarray)):
631 arg = np.asarray(arg)
632 arg = arg + ioffset
633 return arg
634
635
# Overloads for to_datetime: the return type mirrors the input type
# (scalar -> Timestamp, Series/dict -> Series, list-like -> DatetimeIndex).


@overload
def to_datetime(
    arg: DatetimeScalar,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> Timestamp:
    # scalar input -> scalar Timestamp
    ...


@overload
def to_datetime(
    arg: Series | DictConvertible,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> Series:
    # Series or dict/DataFrame input -> Series
    ...


@overload
def to_datetime(
    arg: list | tuple | Index | ArrayLike,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> DatetimeIndex:
    # list-like input -> DatetimeIndex
    ...
685
686
687def to_datetime(
688 arg: DatetimeScalarOrArrayConvertible | DictConvertible,
689 errors: DateTimeErrorChoices = "raise",
690 dayfirst: bool = False,
691 yearfirst: bool = False,
692 utc: bool = False,
693 format: str | None = None,
694 exact: bool | lib.NoDefault = lib.no_default,
695 unit: str | None = None,
696 infer_datetime_format: lib.NoDefault | bool = lib.no_default,
697 origin: str = "unix",
698 cache: bool = True,
699) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None:
700 """
701 Convert argument to datetime.
702
703 This function converts a scalar, array-like, :class:`Series` or
704 :class:`DataFrame`/dict-like to a pandas datetime object.
705
706 Parameters
707 ----------
708 arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like
709 The object to convert to a datetime. If a :class:`DataFrame` is provided, the
710 method expects minimally the following columns: :const:`"year"`,
711 :const:`"month"`, :const:`"day"`.
712 errors : {'ignore', 'raise', 'coerce'}, default 'raise'
713 - If :const:`'raise'`, then invalid parsing will raise an exception.
714 - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`.
715 - If :const:`'ignore'`, then invalid parsing will return the input.
716 dayfirst : bool, default False
717 Specify a date parse order if `arg` is str or is list-like.
718 If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"`
719 is parsed as :const:`2012-11-10`.
720
721 .. warning::
722
723 ``dayfirst=True`` is not strict, but will prefer to parse
724 with day first.
725
726 yearfirst : bool, default False
727 Specify a date parse order if `arg` is str or is list-like.
728
729 - If :const:`True` parses dates with the year first, e.g.
730 :const:`"10/11/12"` is parsed as :const:`2010-11-12`.
731 - If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst` is
732 preceded (same as :mod:`dateutil`).
733
734 .. warning::
735
736 ``yearfirst=True`` is not strict, but will prefer to parse
737 with year first.
738
739 utc : bool, default False
740 Control timezone-related parsing, localization and conversion.
741
742 - If :const:`True`, the function *always* returns a timezone-aware
743 UTC-localized :class:`Timestamp`, :class:`Series` or
744 :class:`DatetimeIndex`. To do this, timezone-naive inputs are
745 *localized* as UTC, while timezone-aware inputs are *converted* to UTC.
746
747 - If :const:`False` (default), inputs will not be coerced to UTC.
748 Timezone-naive inputs will remain naive, while timezone-aware ones
749 will keep their time offsets. Limitations exist for mixed
750 offsets (typically, daylight savings), see :ref:`Examples
751 <to_datetime_tz_examples>` section for details.
752
753 See also: pandas general documentation about `timezone conversion and
754 localization
755 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
756 #time-zone-handling>`_.
757
758 format : str, default None
759 The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See
760 `strftime documentation
761 <https://docs.python.org/3/library/datetime.html
762 #strftime-and-strptime-behavior>`_ for more information on choices, though
763 note that :const:`"%f"` will parse all the way up to nanoseconds.
764 You can also pass:
765
766 - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
767 time string (not necessarily in exactly the same format);
768 - "mixed", to infer the format for each element individually. This is risky,
769 and you should probably use it along with `dayfirst`.
770 exact : bool, default True
771 Control how `format` is used:
772
773 - If :const:`True`, require an exact `format` match.
774 - If :const:`False`, allow the `format` to match anywhere in the target
775 string.
776
777 Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``.
778 unit : str, default 'ns'
779 The unit of the arg (D,s,ms,us,ns) denote the unit, which is an
780 integer or float number. This will be based off the origin.
781 Example, with ``unit='ms'`` and ``origin='unix'``, this would calculate
782 the number of milliseconds to the unix epoch start.
783 infer_datetime_format : bool, default False
784 If :const:`True` and no `format` is given, attempt to infer the format
785 of the datetime strings based on the first non-NaN element,
786 and if it can be inferred, switch to a faster method of parsing them.
787 In some cases this can increase the parsing speed by ~5-10x.
788
789 .. deprecated:: 2.0.0
790 A strict version of this argument is now the default, passing it has
791 no effect.
792
793 origin : scalar, default 'unix'
794 Define the reference date. The numeric values would be parsed as number
795 of units (defined by `unit`) since this reference date.
796
797 - If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01.
798 - If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to
799 beginning of Julian Calendar. Julian day number :const:`0` is assigned
800 to the day starting at noon on January 1, 4713 BC.
    - If Timestamp convertible (Timestamp, dt.datetime, np.datetime64 or date
802 string), origin is set to Timestamp identified by origin.
803 - If a float or integer, origin is the millisecond difference
804 relative to 1970-01-01.
805 cache : bool, default True
806 If :const:`True`, use a cache of unique, converted dates to apply the
807 datetime conversion. May produce significant speed-up when parsing
808 duplicate date strings, especially ones with timezone offsets. The cache
809 is only used when there are at least 50 values. The presence of
810 out-of-bounds values will render the cache unusable and may slow down
811 parsing.
812
813 Returns
814 -------
815 datetime
816 If parsing succeeded.
817 Return type depends on input (types in parenthesis correspond to
818 fallback in case of unsuccessful timezone or out-of-range timestamp
819 parsing):
820
821 - scalar: :class:`Timestamp` (or :class:`datetime.datetime`)
822 - array-like: :class:`DatetimeIndex` (or :class:`Series` with
823 :class:`object` dtype containing :class:`datetime.datetime`)
824 - Series: :class:`Series` of :class:`datetime64` dtype (or
825 :class:`Series` of :class:`object` dtype containing
826 :class:`datetime.datetime`)
827 - DataFrame: :class:`Series` of :class:`datetime64` dtype (or
828 :class:`Series` of :class:`object` dtype containing
829 :class:`datetime.datetime`)
830
831 Raises
832 ------
833 ParserError
834 When parsing a date from string fails.
835 ValueError
836 When another datetime conversion error happens. For example when one
837 of 'year', 'month', day' columns is missing in a :class:`DataFrame`, or
838 when a Timezone-aware :class:`datetime.datetime` is found in an array-like
839 of mixed time offsets, and ``utc=False``.
840
841 See Also
842 --------
843 DataFrame.astype : Cast argument to a specified dtype.
844 to_timedelta : Convert argument to timedelta.
845 convert_dtypes : Convert dtypes.
846
847 Notes
848 -----
849
850 Many input types are supported, and lead to different output types:
851
852 - **scalars** can be int, float, str, datetime object (from stdlib :mod:`datetime`
853 module or :mod:`numpy`). They are converted to :class:`Timestamp` when
854 possible, otherwise they are converted to :class:`datetime.datetime`.
855 None/NaN/null scalars are converted to :const:`NaT`.
856
857 - **array-like** can contain int, float, str, datetime objects. They are
858 converted to :class:`DatetimeIndex` when possible, otherwise they are
859 converted to :class:`Index` with :class:`object` dtype, containing
860 :class:`datetime.datetime`. None/NaN/null entries are converted to
861 :const:`NaT` in both cases.
862
863 - **Series** are converted to :class:`Series` with :class:`datetime64`
864 dtype when possible, otherwise they are converted to :class:`Series` with
865 :class:`object` dtype, containing :class:`datetime.datetime`. None/NaN/null
866 entries are converted to :const:`NaT` in both cases.
867
868 - **DataFrame/dict-like** are converted to :class:`Series` with
869 :class:`datetime64` dtype. For each row a datetime is created from assembling
870 the various dataframe columns. Column keys can be common abbreviations
871 like [‘year’, ‘month’, ‘day’, ‘minute’, ‘second’, ‘ms’, ‘us’, ‘ns’]) or
872 plurals of the same.
873
874 The following causes are responsible for :class:`datetime.datetime` objects
875 being returned (possibly inside an :class:`Index` or a :class:`Series` with
876 :class:`object` dtype) instead of a proper pandas designated type
877 (:class:`Timestamp`, :class:`DatetimeIndex` or :class:`Series`
878 with :class:`datetime64` dtype):
879
880 - when any input element is before :const:`Timestamp.min` or after
881 :const:`Timestamp.max`, see `timestamp limitations
882 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
883 #timeseries-timestamp-limits>`_.
884
885 - when ``utc=False`` (default) and the input is an array-like or
886 :class:`Series` containing mixed naive/aware datetime, or aware with mixed
887 time offsets. Note that this happens in the (quite frequent) situation when
888 the timezone has a daylight savings policy. In that case you may wish to
889 use ``utc=True``.
890
891 Examples
892 --------
893
894 **Handling various input formats**
895
896 Assembling a datetime from multiple columns of a :class:`DataFrame`. The keys
897 can be common abbreviations like ['year', 'month', 'day', 'minute', 'second',
898 'ms', 'us', 'ns']) or plurals of the same
899
900 >>> df = pd.DataFrame({'year': [2015, 2016],
901 ... 'month': [2, 3],
902 ... 'day': [4, 5]})
903 >>> pd.to_datetime(df)
904 0 2015-02-04
905 1 2016-03-05
906 dtype: datetime64[ns]
907
908 Using a unix epoch time
909
910 >>> pd.to_datetime(1490195805, unit='s')
911 Timestamp('2017-03-22 15:16:45')
912 >>> pd.to_datetime(1490195805433502912, unit='ns')
913 Timestamp('2017-03-22 15:16:45.433502912')
914
915 .. warning:: For float arg, precision rounding might happen. To prevent
916 unexpected behavior use a fixed-width exact type.
917
918 Using a non-unix epoch origin
919
920 >>> pd.to_datetime([1, 2, 3], unit='D',
921 ... origin=pd.Timestamp('1960-01-01'))
922 DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'],
923 dtype='datetime64[ns]', freq=None)
924
925 **Differences with strptime behavior**
926
927 :const:`"%f"` will parse all the way up to nanoseconds.
928
929 >>> pd.to_datetime('2018-10-26 12:00:00.0000000011',
930 ... format='%Y-%m-%d %H:%M:%S.%f')
931 Timestamp('2018-10-26 12:00:00.000000001')
932
933 **Non-convertible date/times**
934
935 If a date does not meet the `timestamp limitations
936 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
937 #timeseries-timestamp-limits>`_, passing ``errors='ignore'``
938 will return the original input instead of raising any exception.
939
940 Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`,
941 in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`.
942
943 >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore')
944 '13000101'
945 >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
946 NaT
947
948 .. _to_datetime_tz_examples:
949
950 **Timezones and time offsets**
951
952 The default behaviour (``utc=False``) is as follows:
953
954 - Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`:
955
956 >>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15'])
957 DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'],
958 dtype='datetime64[ns]', freq=None)
959
960 - Timezone-aware inputs *with constant time offset* are converted to
961 timezone-aware :class:`DatetimeIndex`:
962
963 >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500'])
964 DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'],
965 dtype='datetime64[ns, UTC-05:00]', freq=None)
966
967 - However, timezone-aware inputs *with mixed time offsets* (for example
968 issued from a timezone with daylight savings, such as Europe/Paris)
969 are **not successfully converted** to a :class:`DatetimeIndex`. Instead a
970 simple :class:`Index` containing :class:`datetime.datetime` objects is
971 returned:
972
973 >>> pd.to_datetime(['2020-10-25 02:00 +0200', '2020-10-25 04:00 +0100'])
974 Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00],
975 dtype='object')
976
977 - A mix of timezone-aware and timezone-naive inputs is also converted to
978 a simple :class:`Index` containing :class:`datetime.datetime` objects:
979
980 >>> from datetime import datetime
981 >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)])
982 Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object')
983
984 |
985
986 Setting ``utc=True`` solves most of the above issues:
987
988 - Timezone-naive inputs are *localized* as UTC
989
990 >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True)
991 DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'],
992 dtype='datetime64[ns, UTC]', freq=None)
993
994 - Timezone-aware inputs are *converted* to UTC (the output represents the
995 exact same datetime, but viewed from the UTC time offset `+00:00`).
996
997 >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'],
998 ... utc=True)
999 DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'],
1000 dtype='datetime64[ns, UTC]', freq=None)
1001
1002 - Inputs can contain both string or datetime, the above
1003 rules still apply
1004
1005 >>> pd.to_datetime(['2018-10-26 12:00', datetime(2020, 1, 1, 18)], utc=True)
1006 DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'],
1007 dtype='datetime64[ns, UTC]', freq=None)
1008 """
1009 if exact is not lib.no_default and format in {"mixed", "ISO8601"}:
1010 raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'")
1011 if infer_datetime_format is not lib.no_default:
1012 warnings.warn(
1013 "The argument 'infer_datetime_format' is deprecated and will "
1014 "be removed in a future version. "
1015 "A strict version of it is now the default, see "
1016 "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
1017 "You can safely remove this argument.",
1018 stacklevel=find_stack_level(),
1019 )
1020 if arg is None:
1021 return None
1022
1023 if origin != "unix":
1024 arg = _adjust_to_origin(arg, origin, unit)
1025
1026 convert_listlike = partial(
1027 _convert_listlike_datetimes,
1028 utc=utc,
1029 unit=unit,
1030 dayfirst=dayfirst,
1031 yearfirst=yearfirst,
1032 errors=errors,
1033 exact=exact,
1034 )
1035 # pylint: disable-next=used-before-assignment
1036 result: Timestamp | NaTType | Series | Index
1037
1038 if isinstance(arg, Timestamp):
1039 result = arg
1040 if utc:
1041 if arg.tz is not None:
1042 result = arg.tz_convert("utc")
1043 else:
1044 result = arg.tz_localize("utc")
1045 elif isinstance(arg, ABCSeries):
1046 cache_array = _maybe_cache(arg, format, cache, convert_listlike)
1047 if not cache_array.empty:
1048 result = arg.map(cache_array)
1049 else:
1050 values = convert_listlike(arg._values, format)
1051 result = arg._constructor(values, index=arg.index, name=arg.name)
1052 elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)):
1053 result = _assemble_from_unit_mappings(arg, errors, utc)
1054 elif isinstance(arg, Index):
1055 cache_array = _maybe_cache(arg, format, cache, convert_listlike)
1056 if not cache_array.empty:
1057 result = _convert_and_box_cache(arg, cache_array, name=arg.name)
1058 else:
1059 result = convert_listlike(arg, format, name=arg.name)
1060 elif is_list_like(arg):
1061 try:
1062 # error: Argument 1 to "_maybe_cache" has incompatible type
1063 # "Union[float, str, datetime, List[Any], Tuple[Any, ...], ExtensionArray,
1064 # ndarray[Any, Any], Series]"; expected "Union[List[Any], Tuple[Any, ...],
1065 # Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series], Series]"
1066 argc = cast(
1067 Union[list, tuple, ExtensionArray, np.ndarray, "Series", Index], arg
1068 )
1069 cache_array = _maybe_cache(argc, format, cache, convert_listlike)
1070 except OutOfBoundsDatetime:
1071 # caching attempts to create a DatetimeIndex, which may raise
1072 # an OOB. If that's the desired behavior, then just reraise...
1073 if errors == "raise":
1074 raise
1075 # ... otherwise, continue without the cache.
1076 from pandas import Series
1077
1078 cache_array = Series([], dtype=object) # just an empty array
1079 if not cache_array.empty:
1080 result = _convert_and_box_cache(argc, cache_array)
1081 else:
1082 result = convert_listlike(argc, format)
1083 else:
1084 result = convert_listlike(np.array([arg]), format)[0]
1085 if isinstance(arg, bool) and isinstance(result, np.bool_):
1086 result = bool(result) # TODO: avoid this kludge.
1087
1088 # error: Incompatible return value type (got "Union[Timestamp, NaTType,
1089 # Series, Index]", expected "Union[DatetimeIndex, Series, float, str,
1090 # NaTType, None]")
1091 return result # type: ignore[return-value]
1092
1093
1094# mappings for assembling units
1095_unit_map = {
1096 "year": "year",
1097 "years": "year",
1098 "month": "month",
1099 "months": "month",
1100 "day": "day",
1101 "days": "day",
1102 "hour": "h",
1103 "hours": "h",
1104 "minute": "m",
1105 "minutes": "m",
1106 "second": "s",
1107 "seconds": "s",
1108 "ms": "ms",
1109 "millisecond": "ms",
1110 "milliseconds": "ms",
1111 "us": "us",
1112 "microsecond": "us",
1113 "microseconds": "us",
1114 "ns": "ns",
1115 "nanosecond": "ns",
1116 "nanoseconds": "ns",
1117}
1118
1119
1120def _assemble_from_unit_mappings(arg, errors: DateTimeErrorChoices, utc: bool):
1121 """
1122 assemble the unit specified fields from the arg (DataFrame)
1123 Return a Series for actual parsing
1124
1125 Parameters
1126 ----------
1127 arg : DataFrame
1128 errors : {'ignore', 'raise', 'coerce'}, default 'raise'
1129
1130 - If :const:`'raise'`, then invalid parsing will raise an exception
1131 - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`
1132 - If :const:`'ignore'`, then invalid parsing will return the input
1133 utc : bool
1134 Whether to convert/localize timestamps to UTC.
1135
1136 Returns
1137 -------
1138 Series
1139 """
1140 from pandas import (
1141 DataFrame,
1142 to_numeric,
1143 to_timedelta,
1144 )
1145
1146 arg = DataFrame(arg)
1147 if not arg.columns.is_unique:
1148 raise ValueError("cannot assemble with duplicate keys")
1149
1150 # replace passed unit with _unit_map
1151 def f(value):
1152 if value in _unit_map:
1153 return _unit_map[value]
1154
1155 # m is case significant
1156 if value.lower() in _unit_map:
1157 return _unit_map[value.lower()]
1158
1159 return value
1160
1161 unit = {k: f(k) for k in arg.keys()}
1162 unit_rev = {v: k for k, v in unit.items()}
1163
1164 # we require at least Ymd
1165 required = ["year", "month", "day"]
1166 req = sorted(set(required) - set(unit_rev.keys()))
1167 if len(req):
1168 _required = ",".join(req)
1169 raise ValueError(
1170 "to assemble mappings requires at least that "
1171 f"[year, month, day] be specified: [{_required}] is missing"
1172 )
1173
1174 # keys we don't recognize
1175 excess = sorted(set(unit_rev.keys()) - set(_unit_map.values()))
1176 if len(excess):
1177 _excess = ",".join(excess)
1178 raise ValueError(
1179 f"extra keys have been passed to the datetime assemblage: [{_excess}]"
1180 )
1181
1182 def coerce(values):
1183 # we allow coercion to if errors allows
1184 values = to_numeric(values, errors=errors)
1185
1186 # prevent overflow in case of int8 or int16
1187 if is_integer_dtype(values):
1188 values = values.astype("int64", copy=False)
1189 return values
1190
1191 values = (
1192 coerce(arg[unit_rev["year"]]) * 10000
1193 + coerce(arg[unit_rev["month"]]) * 100
1194 + coerce(arg[unit_rev["day"]])
1195 )
1196 try:
1197 values = to_datetime(values, format="%Y%m%d", errors=errors, utc=utc)
1198 except (TypeError, ValueError) as err:
1199 raise ValueError(f"cannot assemble the datetimes: {err}") from err
1200
1201 units: list[UnitChoices] = ["h", "m", "s", "ms", "us", "ns"]
1202 for u in units:
1203 value = unit_rev.get(u)
1204 if value is not None and value in arg:
1205 try:
1206 values += to_timedelta(coerce(arg[value]), unit=u, errors=errors)
1207 except (TypeError, ValueError) as err:
1208 raise ValueError(
1209 f"cannot assemble the datetimes [{value}]: {err}"
1210 ) from err
1211 return values
1212
1213
1214def _attempt_YYYYMMDD(arg: npt.NDArray[np.object_], errors: str) -> np.ndarray | None:
1215 """
1216 try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like,
1217 arg is a passed in as an object dtype, but could really be ints/strings
1218 with nan-like/or floats (e.g. with nan)
1219
1220 Parameters
1221 ----------
1222 arg : np.ndarray[object]
1223 errors : {'raise','ignore','coerce'}
1224 """
1225
1226 def calc(carg):
1227 # calculate the actual result
1228 carg = carg.astype(object, copy=False)
1229 parsed = parsing.try_parse_year_month_day(
1230 carg / 10000, carg / 100 % 100, carg % 100
1231 )
1232 return tslib.array_to_datetime(parsed, errors=errors)[0]
1233
1234 def calc_with_mask(carg, mask):
1235 result = np.empty(carg.shape, dtype="M8[ns]")
1236 iresult = result.view("i8")
1237 iresult[~mask] = iNaT
1238
1239 masked_result = calc(carg[mask].astype(np.float64).astype(np.int64))
1240 result[mask] = masked_result.astype("M8[ns]")
1241 return result
1242
1243 # try intlike / strings that are ints
1244 try:
1245 return calc(arg.astype(np.int64))
1246 except (ValueError, OverflowError, TypeError):
1247 pass
1248
1249 # a float with actual np.nan
1250 try:
1251 carg = arg.astype(np.float64)
1252 return calc_with_mask(carg, notna(carg))
1253 except (ValueError, OverflowError, TypeError):
1254 pass
1255
1256 # string with NaN-like
1257 try:
1258 # error: Argument 2 to "isin" has incompatible type "List[Any]"; expected
1259 # "Union[Union[ExtensionArray, ndarray], Index, Series]"
1260 mask = ~algorithms.isin(arg, list(nat_strings)) # type: ignore[arg-type]
1261 return calc_with_mask(arg, mask)
1262 except (ValueError, OverflowError, TypeError):
1263 pass
1264
1265 return None
1266
1267
# Names re-exported as this module's public surface.
__all__ = [
    "DateParseError",
    "should_cache",
    "to_datetime",
]