1from __future__ import annotations
2
3from collections import abc
4from datetime import date
5from functools import partial
6from itertools import islice
7from typing import (
8 TYPE_CHECKING,
9 Callable,
10 TypedDict,
11 Union,
12 cast,
13 overload,
14)
15import warnings
16
17import numpy as np
18
19from pandas._libs import (
20 lib,
21 tslib,
22)
23from pandas._libs.tslibs import (
24 OutOfBoundsDatetime,
25 Timedelta,
26 Timestamp,
27 astype_overflowsafe,
28 is_supported_dtype,
29 timezones as libtimezones,
30)
31from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
32from pandas._libs.tslibs.parsing import (
33 DateParseError,
34 guess_datetime_format,
35)
36from pandas._libs.tslibs.strptime import array_strptime
37from pandas._typing import (
38 AnyArrayLike,
39 ArrayLike,
40 DateTimeErrorChoices,
41)
42from pandas.util._exceptions import find_stack_level
43
44from pandas.core.dtypes.common import (
45 ensure_object,
46 is_float,
47 is_integer,
48 is_integer_dtype,
49 is_list_like,
50 is_numeric_dtype,
51)
52from pandas.core.dtypes.dtypes import (
53 ArrowDtype,
54 DatetimeTZDtype,
55)
56from pandas.core.dtypes.generic import (
57 ABCDataFrame,
58 ABCSeries,
59)
60
61from pandas.arrays import (
62 DatetimeArray,
63 IntegerArray,
64 NumpyExtensionArray,
65)
66from pandas.core.algorithms import unique
67from pandas.core.arrays import ArrowExtensionArray
68from pandas.core.arrays.base import ExtensionArray
69from pandas.core.arrays.datetimes import (
70 maybe_convert_dtype,
71 objects_to_datetime64,
72 tz_to_dtype,
73)
74from pandas.core.construction import extract_array
75from pandas.core.indexes.base import Index
76from pandas.core.indexes.datetimes import DatetimeIndex
77
78if TYPE_CHECKING:
79 from collections.abc import Hashable
80
81 from pandas._libs.tslibs.nattype import NaTType
82 from pandas._libs.tslibs.timedeltas import UnitChoices
83
84 from pandas import (
85 DataFrame,
86 Series,
87 )
88
89# ---------------------------------------------------------------------
90# types used in annotations
91
# Anything to_datetime accepts as a 1-D collection of values.
ArrayConvertible = Union[list, tuple, AnyArrayLike]
# Scalar component types accepted inside dict/DataFrame columns.
Scalar = Union[float, str]
# A single value convertible to a datetime.
DatetimeScalar = Union[Scalar, date, np.datetime64]

# Union of the scalar and array-like cases handled by to_datetime.
DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible]

# One column of a dict/DataFrame-style datetime assembly (e.g. the "year" values).
DatetimeDictArg = Union[list[Scalar], tuple[Scalar, ...], AnyArrayLike]
99
100
class YearMonthDayDict(TypedDict, total=True):
    """Required keys for assembling datetimes from a dict/DataFrame-like."""

    # total=True: all three keys must be present.
    year: DatetimeDictArg
    month: DatetimeDictArg
    day: DatetimeDictArg
106
class FulldatetimeDict(YearMonthDayDict, total=False):
    """
    Optional time-component keys on top of the required year/month/day.

    Both singular and plural spellings are declared for hour/minute/second
    (e.g. "hour" and "hours"); sub-second parts use the unit abbreviations.
    """

    hour: DatetimeDictArg
    hours: DatetimeDictArg
    minute: DatetimeDictArg
    minutes: DatetimeDictArg
    second: DatetimeDictArg
    seconds: DatetimeDictArg
    ms: DatetimeDictArg
    us: DatetimeDictArg
    ns: DatetimeDictArg
118
# Dict-like inputs that to_datetime can assemble column-wise into datetimes.
DictConvertible = Union[FulldatetimeDict, "DataFrame"]
# Sequences of this length or shorter are never cached (see should_cache).
start_caching_at = 50
121
122
123# ---------------------------------------------------------------------
124
125
126def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None:
127 # Try to guess the format based on the first non-NaN element, return None if can't
128 if (first_non_null := tslib.first_non_null(arr)) != -1:
129 if type(first_non_nan_element := arr[first_non_null]) is str: # noqa: E721
130 # GH#32264 np.str_ object
131 guessed_format = guess_datetime_format(
132 first_non_nan_element, dayfirst=dayfirst
133 )
134 if guessed_format is not None:
135 return guessed_format
136 # If there are multiple non-null elements, warn about
137 # how parsing might not be consistent
138 if tslib.first_non_null(arr[first_non_null + 1 :]) != -1:
139 warnings.warn(
140 "Could not infer format, so each element will be parsed "
141 "individually, falling back to `dateutil`. To ensure parsing is "
142 "consistent and as-expected, please specify a format.",
143 UserWarning,
144 stacklevel=find_stack_level(),
145 )
146 return None
147
148
def should_cache(
    arg: ArrayConvertible, unique_share: float = 0.7, check_count: int | None = None
) -> bool:
    """
    Decide whether conversion results for `arg` should be cached.

    Caching pays off when the sampled prefix of `arg` contains enough
    duplicates: if the share of unique elements among the first
    `check_count` elements is at most `unique_share`, caching is enabled.

    Parameters
    ----------
    arg : listlike, tuple, 1-d array, Series
    unique_share : float, default 0.7
        Must satisfy 0 < unique_share < 1.
    check_count : int, optional
        Number of leading elements to sample; must satisfy
        0 <= check_count <= len(arg).

    Returns
    -------
    bool
        True when caching is worthwhile.

    Notes
    -----
    By default, sequences of at most 50 items are never cached; for up to
    5000 items, ten percent of the elements are sampled; beyond that, only
    the first 500 elements are checked.  All constants were chosen
    empirically.
    """
    if check_count is None:
        n = len(arg)
        # For tiny inputs the gain from caching is negligible.
        if n <= start_caching_at:
            return False
        check_count = n // 10 if n <= 5000 else 500
    else:
        assert (
            0 <= check_count <= len(arg)
        ), "check_count must be in next bounds: [0; len(arg)]"
        if check_count == 0:
            return False

    assert 0 < unique_share < 1, "unique_share must be in next bounds: (0; 1)"

    try:
        # We can't cache if the items are not hashable.
        sampled_unique = set(islice(arg, check_count))
    except TypeError:
        return False
    return len(sampled_unique) <= check_count * unique_share
207
208
209def _maybe_cache(
210 arg: ArrayConvertible,
211 format: str | None,
212 cache: bool,
213 convert_listlike: Callable,
214) -> Series:
215 """
216 Create a cache of unique dates from an array of dates
217
218 Parameters
219 ----------
220 arg : listlike, tuple, 1-d array, Series
221 format : string
222 Strftime format to parse time
223 cache : bool
224 True attempts to create a cache of converted values
225 convert_listlike : function
226 Conversion function to apply on dates
227
228 Returns
229 -------
230 cache_array : Series
231 Cache of converted, unique dates. Can be empty
232 """
233 from pandas import Series
234
235 cache_array = Series(dtype=object)
236
237 if cache:
238 # Perform a quicker unique check
239 if not should_cache(arg):
240 return cache_array
241
242 if not isinstance(arg, (np.ndarray, ExtensionArray, Index, ABCSeries)):
243 arg = np.array(arg)
244
245 unique_dates = unique(arg)
246 if len(unique_dates) < len(arg):
247 cache_dates = convert_listlike(unique_dates, format)
248 # GH#45319
249 try:
250 cache_array = Series(cache_dates, index=unique_dates, copy=False)
251 except OutOfBoundsDatetime:
252 return cache_array
253 # GH#39882 and GH#35888 in case of None and NaT we get duplicates
254 if not cache_array.index.is_unique:
255 cache_array = cache_array[~cache_array.index.duplicated()]
256 return cache_array
257
258
259def _box_as_indexlike(
260 dt_array: ArrayLike, utc: bool = False, name: Hashable | None = None
261) -> Index:
262 """
263 Properly boxes the ndarray of datetimes to DatetimeIndex
264 if it is possible or to generic Index instead
265
266 Parameters
267 ----------
268 dt_array: 1-d array
269 Array of datetimes to be wrapped in an Index.
270 utc : bool
271 Whether to convert/localize timestamps to UTC.
272 name : string, default None
273 Name for a resulting index
274
275 Returns
276 -------
277 result : datetime of converted dates
278 - DatetimeIndex if convertible to sole datetime64 type
279 - general Index otherwise
280 """
281
282 if lib.is_np_dtype(dt_array.dtype, "M"):
283 tz = "utc" if utc else None
284 return DatetimeIndex(dt_array, tz=tz, name=name)
285 return Index(dt_array, name=name, dtype=dt_array.dtype)
286
287
288def _convert_and_box_cache(
289 arg: DatetimeScalarOrArrayConvertible,
290 cache_array: Series,
291 name: Hashable | None = None,
292) -> Index:
293 """
294 Convert array of dates with a cache and wrap the result in an Index.
295
296 Parameters
297 ----------
298 arg : integer, float, string, datetime, list, tuple, 1-d array, Series
299 cache_array : Series
300 Cache of converted, unique dates
301 name : string, default None
302 Name for a DatetimeIndex
303
304 Returns
305 -------
306 result : Index-like of converted dates
307 """
308 from pandas import Series
309
310 result = Series(arg, dtype=cache_array.index.dtype).map(cache_array)
311 return _box_as_indexlike(result._values, utc=False, name=name)
312
313
def _convert_listlike_datetimes(
    arg,
    format: str | None,
    name: Hashable | None = None,
    utc: bool = False,
    unit: str | None = None,
    errors: DateTimeErrorChoices = "raise",
    dayfirst: bool | None = None,
    yearfirst: bool | None = None,
    exact: bool = True,
):
    """
    Helper function for to_datetime. Performs the conversions of 1D listlike
    of dates

    Parameters
    ----------
    arg : list, tuple, ndarray, Series, Index
        date to be parsed
    format : str or None
        strftime format to parse with; None triggers format inference,
        "mixed" requests per-element parsing
    name : object
        None or string for the Index name
    utc : bool
        Whether to convert/localize timestamps to UTC.
    unit : str
        None or string of the frequency of the passed data
    errors : str
        error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore'
    dayfirst : bool
        dayfirst parsing behavior from to_datetime
    yearfirst : bool
        yearfirst parsing behavior from to_datetime
    exact : bool, default True
        exact format matching behavior from to_datetime

    Returns
    -------
    Index-like of parsed dates
    """
    # Normalize python sequences / NumpyExtensionArray to plain ndarrays.
    if isinstance(arg, (list, tuple)):
        arg = np.array(arg, dtype="O")
    elif isinstance(arg, NumpyExtensionArray):
        arg = np.array(arg)

    arg_dtype = getattr(arg, "dtype", None)
    # these are shortcutable
    tz = "utc" if utc else None
    if isinstance(arg_dtype, DatetimeTZDtype):
        # Already tz-aware datetime64: at most box and/or convert to UTC.
        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            return DatetimeIndex(arg, tz=tz, name=name)
        if utc:
            arg = arg.tz_convert(None).tz_localize("utc")
        return arg

    elif isinstance(arg_dtype, ArrowDtype) and arg_dtype.type is Timestamp:
        # TODO: Combine with above if DTI/DTA supports Arrow timestamps
        if utc:
            # pyarrow uses UTC, not lowercase utc
            if isinstance(arg, Index):
                arg_array = cast(ArrowExtensionArray, arg.array)
                if arg_dtype.pyarrow_dtype.tz is not None:
                    arg_array = arg_array._dt_tz_convert("UTC")
                else:
                    arg_array = arg_array._dt_tz_localize("UTC")
                arg = Index(arg_array)
            else:
                # ArrowExtensionArray
                if arg_dtype.pyarrow_dtype.tz is not None:
                    arg = arg._dt_tz_convert("UTC")
                else:
                    arg = arg._dt_tz_localize("UTC")
        return arg

    elif lib.is_np_dtype(arg_dtype, "M"):
        # Naive datetime64 input.
        if not is_supported_dtype(arg_dtype):
            # We go to closest supported reso, i.e. "s"
            arg = astype_overflowsafe(
                # TODO: looks like we incorrectly raise with errors=="ignore"
                np.asarray(arg),
                np.dtype("M8[s]"),
                is_coerce=errors == "coerce",
            )

        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            return DatetimeIndex(arg, tz=tz, name=name)
        elif utc:
            # DatetimeArray, DatetimeIndex
            return arg.tz_localize("utc")

        return arg

    elif unit is not None:
        # Numeric input interpreted via 'unit' (e.g. seconds since epoch).
        if format is not None:
            raise ValueError("cannot specify both format and unit")
        return _to_datetime_with_unit(arg, unit, name, utc, errors)
    elif getattr(arg, "ndim", 1) > 1:
        raise TypeError(
            "arg must be a string, datetime, list, tuple, 1-d array, or Series"
        )

    # warn if passing timedelta64, raise for PeriodDtype
    # NB: this must come after unit transformation
    try:
        arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz))
    except TypeError:
        # Dtype cannot be converted to datetime; honor the errors policy.
        if errors == "coerce":
            npvalues = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg))
            return DatetimeIndex(npvalues, name=name)
        elif errors == "ignore":
            idx = Index(arg, name=name)
            return idx
        raise

    arg = ensure_object(arg)

    # Infer a format from the first non-null string when none was given.
    if format is None:
        format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

    # `format` could be inferred, or user didn't ask for mixed-format parsing.
    if format is not None and format != "mixed":
        return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)

    # No usable single format: parse object array element-wise.
    result, tz_parsed = objects_to_datetime64(
        arg,
        dayfirst=dayfirst,
        yearfirst=yearfirst,
        utc=utc,
        errors=errors,
        allow_object=True,
    )

    if tz_parsed is not None:
        # We can take a shortcut since the datetime64 numpy array
        # is in UTC
        out_unit = np.datetime_data(result.dtype)[0]
        dtype = cast(DatetimeTZDtype, tz_to_dtype(tz_parsed, out_unit))
        dt64_values = result.view(f"M8[{dtype.unit}]")
        dta = DatetimeArray._simple_new(dt64_values, dtype=dtype)
        return DatetimeIndex._simple_new(dta, name=name)

    return _box_as_indexlike(result, utc=utc, name=name)
454
455
456def _array_strptime_with_fallback(
457 arg,
458 name,
459 utc: bool,
460 fmt: str,
461 exact: bool,
462 errors: str,
463) -> Index:
464 """
465 Call array_strptime, with fallback behavior depending on 'errors'.
466 """
467 result, tz_out = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc)
468 if tz_out is not None:
469 unit = np.datetime_data(result.dtype)[0]
470 dtype = DatetimeTZDtype(tz=tz_out, unit=unit)
471 dta = DatetimeArray._simple_new(result, dtype=dtype)
472 if utc:
473 dta = dta.tz_convert("UTC")
474 return Index(dta, name=name)
475 elif result.dtype != object and utc:
476 unit = np.datetime_data(result.dtype)[0]
477 res = Index(result, dtype=f"M8[{unit}, UTC]", name=name)
478 return res
479 return Index(result, dtype=result.dtype, name=name)
480
481
def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
    """
    to_datetime specialized to the case where a 'unit' is passed.

    Parameters
    ----------
    arg : listlike
        Numeric (or object) values measured in `unit` since the epoch.
    unit : str
        Unit of the values, e.g. "D", "s", "ms", "us", "ns".
    name : Hashable
        Name for the resulting Index.
    utc : bool
        Whether to localize/convert the result to UTC.
    errors : str
        'raise', 'coerce' or 'ignore'.

    Returns
    -------
    Index
    """
    arg = extract_array(arg, extract_numpy=True)

    # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime
    # because it expects an ndarray argument
    if isinstance(arg, IntegerArray):
        arr = arg.astype(f"datetime64[{unit}]")
        tz_parsed = None
    else:
        arg = np.asarray(arg)

        if arg.dtype.kind in "iu":
            # Note we can't do "f" here because that could induce unwanted
            # rounding GH#14156, GH#20445
            arr = arg.astype(f"datetime64[{unit}]", copy=False)
            try:
                arr = astype_overflowsafe(arr, np.dtype("M8[ns]"), copy=False)
            except OutOfBoundsDatetime:
                if errors == "raise":
                    raise
                # Fall back to object-dtype parsing, which can coerce/ignore
                # the out-of-bounds values per the errors policy.
                arg = arg.astype(object)
                return _to_datetime_with_unit(arg, unit, name, utc, errors)
            tz_parsed = None

        elif arg.dtype.kind == "f":
            # Floats get fractional-unit handling; overflow must raise so
            # it can be translated into OutOfBoundsDatetime semantics.
            with np.errstate(over="raise"):
                try:
                    arr = cast_from_unit_vectorized(arg, unit=unit)
                except OutOfBoundsDatetime:
                    if errors != "raise":
                        # Retry through the object path to coerce/ignore.
                        return _to_datetime_with_unit(
                            arg.astype(object), unit, name, utc, errors
                        )
                    raise OutOfBoundsDatetime(
                        f"cannot convert input with unit '{unit}'"
                    )

            arr = arr.view("M8[ns]")
            tz_parsed = None
        else:
            # Object (or other) dtype: element-wise conversion in cython.
            arg = arg.astype(object, copy=False)
            arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)

    if errors == "ignore":
        # Index constructor _may_ infer to DatetimeIndex
        result = Index._with_infer(arr, name=name)
    else:
        result = DatetimeIndex(arr, name=name)

    if not isinstance(result, DatetimeIndex):
        # 'ignore' path produced a non-datetime Index; return it unchanged.
        return result

    # GH#23758: We may still need to localize the result with tz
    # GH#25546: Apply tz_parsed first (from arg), then tz (from caller)
    # result will be naive but in UTC
    result = result.tz_localize("UTC").tz_convert(tz_parsed)

    if utc:
        if result.tz is None:
            result = result.tz_localize("utc")
        else:
            result = result.tz_convert("utc")
    return result
548
549
550def _adjust_to_origin(arg, origin, unit):
551 """
552 Helper function for to_datetime.
553 Adjust input argument to the specified origin
554
555 Parameters
556 ----------
557 arg : list, tuple, ndarray, Series, Index
558 date to be adjusted
559 origin : 'julian' or Timestamp
560 origin offset for the arg
561 unit : str
562 passed unit from to_datetime, must be 'D'
563
564 Returns
565 -------
566 ndarray or scalar of adjusted date(s)
567 """
568 if origin == "julian":
569 original = arg
570 j0 = Timestamp(0).to_julian_date()
571 if unit != "D":
572 raise ValueError("unit must be 'D' for origin='julian'")
573 try:
574 arg = arg - j0
575 except TypeError as err:
576 raise ValueError(
577 "incompatible 'arg' type for given 'origin'='julian'"
578 ) from err
579
580 # preemptively check this for a nice range
581 j_max = Timestamp.max.to_julian_date() - j0
582 j_min = Timestamp.min.to_julian_date() - j0
583 if np.any(arg > j_max) or np.any(arg < j_min):
584 raise OutOfBoundsDatetime(
585 f"{original} is Out of Bounds for origin='julian'"
586 )
587 else:
588 # arg must be numeric
589 if not (
590 (is_integer(arg) or is_float(arg)) or is_numeric_dtype(np.asarray(arg))
591 ):
592 raise ValueError(
593 f"'{arg}' is not compatible with origin='{origin}'; "
594 "it must be numeric with a unit specified"
595 )
596
597 # we are going to offset back to unix / epoch time
598 try:
599 offset = Timestamp(origin, unit=unit)
600 except OutOfBoundsDatetime as err:
601 raise OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") from err
602 except ValueError as err:
603 raise ValueError(
604 f"origin {origin} cannot be converted to a Timestamp"
605 ) from err
606
607 if offset.tz is not None:
608 raise ValueError(f"origin offset {offset} must be tz-naive")
609 td_offset = offset - Timestamp(0)
610
611 # convert the offset to the unit of the arg
612 # this should be lossless in terms of precision
613 ioffset = td_offset // Timedelta(1, unit=unit)
614
615 # scalars & ndarray-like can handle the addition
616 if is_list_like(arg) and not isinstance(arg, (ABCSeries, Index, np.ndarray)):
617 arg = np.asarray(arg)
618 arg = arg + ioffset
619 return arg
620
621
# Overload: a single scalar (str/float/date/datetime64) parses to a Timestamp.
@overload
def to_datetime(
    arg: DatetimeScalar,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> Timestamp:
    ...


# Overload: Series or dict/DataFrame-like input returns a Series.
@overload
def to_datetime(
    arg: Series | DictConvertible,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> Series:
    ...


# Overload: list/tuple/Index/array-like input returns a DatetimeIndex.
@overload
def to_datetime(
    arg: list | tuple | Index | ArrayLike,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> DatetimeIndex:
    ...
671
672
673def to_datetime(
674 arg: DatetimeScalarOrArrayConvertible | DictConvertible,
675 errors: DateTimeErrorChoices = "raise",
676 dayfirst: bool = False,
677 yearfirst: bool = False,
678 utc: bool = False,
679 format: str | None = None,
680 exact: bool | lib.NoDefault = lib.no_default,
681 unit: str | None = None,
682 infer_datetime_format: lib.NoDefault | bool = lib.no_default,
683 origin: str = "unix",
684 cache: bool = True,
685) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None:
686 """
687 Convert argument to datetime.
688
689 This function converts a scalar, array-like, :class:`Series` or
690 :class:`DataFrame`/dict-like to a pandas datetime object.
691
692 Parameters
693 ----------
694 arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like
695 The object to convert to a datetime. If a :class:`DataFrame` is provided, the
696 method expects minimally the following columns: :const:`"year"`,
697 :const:`"month"`, :const:`"day"`. The column "year"
698 must be specified in 4-digit format.
699 errors : {'ignore', 'raise', 'coerce'}, default 'raise'
700 - If :const:`'raise'`, then invalid parsing will raise an exception.
701 - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`.
702 - If :const:`'ignore'`, then invalid parsing will return the input.
703 dayfirst : bool, default False
704 Specify a date parse order if `arg` is str or is list-like.
705 If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"`
706 is parsed as :const:`2012-11-10`.
707
708 .. warning::
709
710 ``dayfirst=True`` is not strict, but will prefer to parse
711 with day first.
712
713 yearfirst : bool, default False
714 Specify a date parse order if `arg` is str or is list-like.
715
716 - If :const:`True` parses dates with the year first, e.g.
717 :const:`"10/11/12"` is parsed as :const:`2010-11-12`.
    - If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst`
      takes precedence (same as :mod:`dateutil`).
720
721 .. warning::
722
723 ``yearfirst=True`` is not strict, but will prefer to parse
724 with year first.
725
726 utc : bool, default False
727 Control timezone-related parsing, localization and conversion.
728
729 - If :const:`True`, the function *always* returns a timezone-aware
730 UTC-localized :class:`Timestamp`, :class:`Series` or
731 :class:`DatetimeIndex`. To do this, timezone-naive inputs are
732 *localized* as UTC, while timezone-aware inputs are *converted* to UTC.
733
734 - If :const:`False` (default), inputs will not be coerced to UTC.
735 Timezone-naive inputs will remain naive, while timezone-aware ones
736 will keep their time offsets. Limitations exist for mixed
737 offsets (typically, daylight savings), see :ref:`Examples
738 <to_datetime_tz_examples>` section for details.
739
740 .. warning::
741
742 In a future version of pandas, parsing datetimes with mixed time
743 zones will raise an error unless `utc=True`.
744 Please specify `utc=True` to opt in to the new behaviour
745 and silence this warning. To create a `Series` with mixed offsets and
746 `object` dtype, please use `apply` and `datetime.datetime.strptime`.
747
748 See also: pandas general documentation about `timezone conversion and
749 localization
750 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
751 #time-zone-handling>`_.
752
753 format : str, default None
754 The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See
755 `strftime documentation
756 <https://docs.python.org/3/library/datetime.html
757 #strftime-and-strptime-behavior>`_ for more information on choices, though
758 note that :const:`"%f"` will parse all the way up to nanoseconds.
759 You can also pass:
760
761 - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
762 time string (not necessarily in exactly the same format);
763 - "mixed", to infer the format for each element individually. This is risky,
764 and you should probably use it along with `dayfirst`.
765
766 .. note::
767
768 If a :class:`DataFrame` is passed, then `format` has no effect.
769
770 exact : bool, default True
771 Control how `format` is used:
772
773 - If :const:`True`, require an exact `format` match.
774 - If :const:`False`, allow the `format` to match anywhere in the target
775 string.
776
777 Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``.
778 unit : str, default 'ns'
779 The unit of the arg (D,s,ms,us,ns) denote the unit, which is an
780 integer or float number. This will be based off the origin.
781 Example, with ``unit='ms'`` and ``origin='unix'``, this would calculate
782 the number of milliseconds to the unix epoch start.
783 infer_datetime_format : bool, default False
784 If :const:`True` and no `format` is given, attempt to infer the format
785 of the datetime strings based on the first non-NaN element,
786 and if it can be inferred, switch to a faster method of parsing them.
787 In some cases this can increase the parsing speed by ~5-10x.
788
789 .. deprecated:: 2.0.0
790 A strict version of this argument is now the default, passing it has
791 no effect.
792
793 origin : scalar, default 'unix'
794 Define the reference date. The numeric values would be parsed as number
795 of units (defined by `unit`) since this reference date.
796
797 - If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01.
798 - If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to
799 beginning of Julian Calendar. Julian day number :const:`0` is assigned
800 to the day starting at noon on January 1, 4713 BC.
    - If Timestamp convertible (Timestamp, dt.datetime, np.datetime64 or date
802 string), origin is set to Timestamp identified by origin.
803 - If a float or integer, origin is the difference
804 (in units determined by the ``unit`` argument) relative to 1970-01-01.
805 cache : bool, default True
806 If :const:`True`, use a cache of unique, converted dates to apply the
807 datetime conversion. May produce significant speed-up when parsing
808 duplicate date strings, especially ones with timezone offsets. The cache
809 is only used when there are at least 50 values. The presence of
810 out-of-bounds values will render the cache unusable and may slow down
811 parsing.
812
813 Returns
814 -------
815 datetime
816 If parsing succeeded.
817 Return type depends on input (types in parenthesis correspond to
818 fallback in case of unsuccessful timezone or out-of-range timestamp
819 parsing):
820
821 - scalar: :class:`Timestamp` (or :class:`datetime.datetime`)
822 - array-like: :class:`DatetimeIndex` (or :class:`Series` with
823 :class:`object` dtype containing :class:`datetime.datetime`)
824 - Series: :class:`Series` of :class:`datetime64` dtype (or
825 :class:`Series` of :class:`object` dtype containing
826 :class:`datetime.datetime`)
827 - DataFrame: :class:`Series` of :class:`datetime64` dtype (or
828 :class:`Series` of :class:`object` dtype containing
829 :class:`datetime.datetime`)
830
831 Raises
832 ------
833 ParserError
834 When parsing a date from string fails.
835 ValueError
836 When another datetime conversion error happens. For example when one
837 of 'year', 'month', day' columns is missing in a :class:`DataFrame`, or
838 when a Timezone-aware :class:`datetime.datetime` is found in an array-like
839 of mixed time offsets, and ``utc=False``.
840
841 See Also
842 --------
843 DataFrame.astype : Cast argument to a specified dtype.
844 to_timedelta : Convert argument to timedelta.
845 convert_dtypes : Convert dtypes.
846
847 Notes
848 -----
849
850 Many input types are supported, and lead to different output types:
851
852 - **scalars** can be int, float, str, datetime object (from stdlib :mod:`datetime`
853 module or :mod:`numpy`). They are converted to :class:`Timestamp` when
854 possible, otherwise they are converted to :class:`datetime.datetime`.
855 None/NaN/null scalars are converted to :const:`NaT`.
856
857 - **array-like** can contain int, float, str, datetime objects. They are
858 converted to :class:`DatetimeIndex` when possible, otherwise they are
859 converted to :class:`Index` with :class:`object` dtype, containing
860 :class:`datetime.datetime`. None/NaN/null entries are converted to
861 :const:`NaT` in both cases.
862
863 - **Series** are converted to :class:`Series` with :class:`datetime64`
864 dtype when possible, otherwise they are converted to :class:`Series` with
865 :class:`object` dtype, containing :class:`datetime.datetime`. None/NaN/null
866 entries are converted to :const:`NaT` in both cases.
867
868 - **DataFrame/dict-like** are converted to :class:`Series` with
869 :class:`datetime64` dtype. For each row a datetime is created from assembling
870 the various dataframe columns. Column keys can be common abbreviations
871 like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns']) or
872 plurals of the same.
873
874 The following causes are responsible for :class:`datetime.datetime` objects
875 being returned (possibly inside an :class:`Index` or a :class:`Series` with
876 :class:`object` dtype) instead of a proper pandas designated type
877 (:class:`Timestamp`, :class:`DatetimeIndex` or :class:`Series`
878 with :class:`datetime64` dtype):
879
880 - when any input element is before :const:`Timestamp.min` or after
881 :const:`Timestamp.max`, see `timestamp limitations
882 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
883 #timeseries-timestamp-limits>`_.
884
885 - when ``utc=False`` (default) and the input is an array-like or
886 :class:`Series` containing mixed naive/aware datetime, or aware with mixed
887 time offsets. Note that this happens in the (quite frequent) situation when
888 the timezone has a daylight savings policy. In that case you may wish to
889 use ``utc=True``.
890
891 Examples
892 --------
893
894 **Handling various input formats**
895
896 Assembling a datetime from multiple columns of a :class:`DataFrame`. The keys
897 can be common abbreviations like ['year', 'month', 'day', 'minute', 'second',
898 'ms', 'us', 'ns']) or plurals of the same
899
900 >>> df = pd.DataFrame({'year': [2015, 2016],
901 ... 'month': [2, 3],
902 ... 'day': [4, 5]})
903 >>> pd.to_datetime(df)
904 0 2015-02-04
905 1 2016-03-05
906 dtype: datetime64[ns]
907
908 Using a unix epoch time
909
910 >>> pd.to_datetime(1490195805, unit='s')
911 Timestamp('2017-03-22 15:16:45')
912 >>> pd.to_datetime(1490195805433502912, unit='ns')
913 Timestamp('2017-03-22 15:16:45.433502912')
914
915 .. warning:: For float arg, precision rounding might happen. To prevent
916 unexpected behavior use a fixed-width exact type.
917
918 Using a non-unix epoch origin
919
920 >>> pd.to_datetime([1, 2, 3], unit='D',
921 ... origin=pd.Timestamp('1960-01-01'))
922 DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'],
923 dtype='datetime64[ns]', freq=None)
924
925 **Differences with strptime behavior**
926
927 :const:`"%f"` will parse all the way up to nanoseconds.
928
929 >>> pd.to_datetime('2018-10-26 12:00:00.0000000011',
930 ... format='%Y-%m-%d %H:%M:%S.%f')
931 Timestamp('2018-10-26 12:00:00.000000001')
932
933 **Non-convertible date/times**
934
935 Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`,
936 in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`.
937
938 >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
939 NaT
940
941 .. _to_datetime_tz_examples:
942
943 **Timezones and time offsets**
944
945 The default behaviour (``utc=False``) is as follows:
946
947 - Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`:
948
949 >>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15'])
950 DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'],
951 dtype='datetime64[ns]', freq=None)
952
953 - Timezone-aware inputs *with constant time offset* are converted to
954 timezone-aware :class:`DatetimeIndex`:
955
956 >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500'])
957 DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'],
958 dtype='datetime64[ns, UTC-05:00]', freq=None)
959
960 - However, timezone-aware inputs *with mixed time offsets* (for example
961 issued from a timezone with daylight savings, such as Europe/Paris)
962 are **not successfully converted** to a :class:`DatetimeIndex`.
963 Parsing datetimes with mixed time zones will show a warning unless
964 `utc=True`. If you specify `utc=False` the warning below will be shown
965 and a simple :class:`Index` containing :class:`datetime.datetime`
966 objects will be returned:
967
968 >>> pd.to_datetime(['2020-10-25 02:00 +0200',
969 ... '2020-10-25 04:00 +0100']) # doctest: +SKIP
970 FutureWarning: In a future version of pandas, parsing datetimes with mixed
971 time zones will raise an error unless `utc=True`. Please specify `utc=True`
972 to opt in to the new behaviour and silence this warning. To create a `Series`
973 with mixed offsets and `object` dtype, please use `apply` and
974 `datetime.datetime.strptime`.
975 Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00],
976 dtype='object')
977
978 - A mix of timezone-aware and timezone-naive inputs is also converted to
979 a simple :class:`Index` containing :class:`datetime.datetime` objects:
980
981 >>> from datetime import datetime
982 >>> pd.to_datetime(["2020-01-01 01:00:00-01:00",
983 ... datetime(2020, 1, 1, 3, 0)]) # doctest: +SKIP
984 FutureWarning: In a future version of pandas, parsing datetimes with mixed
985 time zones will raise an error unless `utc=True`. Please specify `utc=True`
986 to opt in to the new behaviour and silence this warning. To create a `Series`
987 with mixed offsets and `object` dtype, please use `apply` and
988 `datetime.datetime.strptime`.
989 Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object')
990
991 |
992
993 Setting ``utc=True`` solves most of the above issues:
994
995 - Timezone-naive inputs are *localized* as UTC
996
997 >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True)
998 DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'],
999 dtype='datetime64[ns, UTC]', freq=None)
1000
1001 - Timezone-aware inputs are *converted* to UTC (the output represents the
1002 exact same datetime, but viewed from the UTC time offset `+00:00`).
1003
1004 >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'],
1005 ... utc=True)
1006 DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'],
1007 dtype='datetime64[ns, UTC]', freq=None)
1008
1009 - Inputs can contain both string or datetime, the above
1010 rules still apply
1011
1012 >>> pd.to_datetime(['2018-10-26 12:00', datetime(2020, 1, 1, 18)], utc=True)
1013 DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'],
1014 dtype='datetime64[ns, UTC]', freq=None)
1015 """
1016 if exact is not lib.no_default and format in {"mixed", "ISO8601"}:
1017 raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'")
1018 if infer_datetime_format is not lib.no_default:
1019 warnings.warn(
1020 "The argument 'infer_datetime_format' is deprecated and will "
1021 "be removed in a future version. "
1022 "A strict version of it is now the default, see "
1023 "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
1024 "You can safely remove this argument.",
1025 stacklevel=find_stack_level(),
1026 )
1027 if errors == "ignore":
1028 # GH#54467
1029 warnings.warn(
1030 "errors='ignore' is deprecated and will raise in a future version. "
1031 "Use to_datetime without passing `errors` and catch exceptions "
1032 "explicitly instead",
1033 FutureWarning,
1034 stacklevel=find_stack_level(),
1035 )
1036
1037 if arg is None:
1038 return None
1039
1040 if origin != "unix":
1041 arg = _adjust_to_origin(arg, origin, unit)
1042
1043 convert_listlike = partial(
1044 _convert_listlike_datetimes,
1045 utc=utc,
1046 unit=unit,
1047 dayfirst=dayfirst,
1048 yearfirst=yearfirst,
1049 errors=errors,
1050 exact=exact,
1051 )
1052 # pylint: disable-next=used-before-assignment
1053 result: Timestamp | NaTType | Series | Index
1054
1055 if isinstance(arg, Timestamp):
1056 result = arg
1057 if utc:
1058 if arg.tz is not None:
1059 result = arg.tz_convert("utc")
1060 else:
1061 result = arg.tz_localize("utc")
1062 elif isinstance(arg, ABCSeries):
1063 cache_array = _maybe_cache(arg, format, cache, convert_listlike)
1064 if not cache_array.empty:
1065 result = arg.map(cache_array)
1066 else:
1067 values = convert_listlike(arg._values, format)
1068 result = arg._constructor(values, index=arg.index, name=arg.name)
1069 elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)):
1070 result = _assemble_from_unit_mappings(arg, errors, utc)
1071 elif isinstance(arg, Index):
1072 cache_array = _maybe_cache(arg, format, cache, convert_listlike)
1073 if not cache_array.empty:
1074 result = _convert_and_box_cache(arg, cache_array, name=arg.name)
1075 else:
1076 result = convert_listlike(arg, format, name=arg.name)
1077 elif is_list_like(arg):
1078 try:
1079 # error: Argument 1 to "_maybe_cache" has incompatible type
1080 # "Union[float, str, datetime, List[Any], Tuple[Any, ...], ExtensionArray,
1081 # ndarray[Any, Any], Series]"; expected "Union[List[Any], Tuple[Any, ...],
1082 # Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series], Series]"
1083 argc = cast(
1084 Union[list, tuple, ExtensionArray, np.ndarray, "Series", Index], arg
1085 )
1086 cache_array = _maybe_cache(argc, format, cache, convert_listlike)
1087 except OutOfBoundsDatetime:
1088 # caching attempts to create a DatetimeIndex, which may raise
1089 # an OOB. If that's the desired behavior, then just reraise...
1090 if errors == "raise":
1091 raise
1092 # ... otherwise, continue without the cache.
1093 from pandas import Series
1094
1095 cache_array = Series([], dtype=object) # just an empty array
1096 if not cache_array.empty:
1097 result = _convert_and_box_cache(argc, cache_array)
1098 else:
1099 result = convert_listlike(argc, format)
1100 else:
1101 result = convert_listlike(np.array([arg]), format)[0]
1102 if isinstance(arg, bool) and isinstance(result, np.bool_):
1103 result = bool(result) # TODO: avoid this kludge.
1104
1105 # error: Incompatible return value type (got "Union[Timestamp, NaTType,
1106 # Series, Index]", expected "Union[DatetimeIndex, Series, float, str,
1107 # NaTType, None]")
1108 return result # type: ignore[return-value]
1109
1110
# mappings for assembling units
# Translation table used by _assemble_from_unit_mappings: maps the
# column-name aliases accepted when assembling datetimes from a
# DataFrame/dict (singular and plural spellings) onto the canonical
# unit codes used internally. Lookup falls back to a lower-cased match,
# except that a bare "m" stays case significant (see the helper).
_unit_map = {
    # calendar components keep their long names
    "year": "year",
    "years": "year",
    "month": "month",
    "months": "month",
    "day": "day",
    "days": "day",
    # sub-day components collapse to timedelta unit codes
    "hour": "h",
    "hours": "h",
    "minute": "m",
    "minutes": "m",
    "second": "s",
    "seconds": "s",
    "ms": "ms",
    "millisecond": "ms",
    "milliseconds": "ms",
    "us": "us",
    "microsecond": "us",
    "microseconds": "us",
    "ns": "ns",
    "nanosecond": "ns",
    "nanoseconds": "ns",
}
1135
1136
1137def _assemble_from_unit_mappings(arg, errors: DateTimeErrorChoices, utc: bool):
1138 """
1139 assemble the unit specified fields from the arg (DataFrame)
1140 Return a Series for actual parsing
1141
1142 Parameters
1143 ----------
1144 arg : DataFrame
1145 errors : {'ignore', 'raise', 'coerce'}, default 'raise'
1146
1147 - If :const:`'raise'`, then invalid parsing will raise an exception
1148 - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`
1149 - If :const:`'ignore'`, then invalid parsing will return the input
1150 utc : bool
1151 Whether to convert/localize timestamps to UTC.
1152
1153 Returns
1154 -------
1155 Series
1156 """
1157 from pandas import (
1158 DataFrame,
1159 to_numeric,
1160 to_timedelta,
1161 )
1162
1163 arg = DataFrame(arg)
1164 if not arg.columns.is_unique:
1165 raise ValueError("cannot assemble with duplicate keys")
1166
1167 # replace passed unit with _unit_map
1168 def f(value):
1169 if value in _unit_map:
1170 return _unit_map[value]
1171
1172 # m is case significant
1173 if value.lower() in _unit_map:
1174 return _unit_map[value.lower()]
1175
1176 return value
1177
1178 unit = {k: f(k) for k in arg.keys()}
1179 unit_rev = {v: k for k, v in unit.items()}
1180
1181 # we require at least Ymd
1182 required = ["year", "month", "day"]
1183 req = sorted(set(required) - set(unit_rev.keys()))
1184 if len(req):
1185 _required = ",".join(req)
1186 raise ValueError(
1187 "to assemble mappings requires at least that "
1188 f"[year, month, day] be specified: [{_required}] is missing"
1189 )
1190
1191 # keys we don't recognize
1192 excess = sorted(set(unit_rev.keys()) - set(_unit_map.values()))
1193 if len(excess):
1194 _excess = ",".join(excess)
1195 raise ValueError(
1196 f"extra keys have been passed to the datetime assemblage: [{_excess}]"
1197 )
1198
1199 def coerce(values):
1200 # we allow coercion to if errors allows
1201 values = to_numeric(values, errors=errors)
1202
1203 # prevent overflow in case of int8 or int16
1204 if is_integer_dtype(values.dtype):
1205 values = values.astype("int64", copy=False)
1206 return values
1207
1208 values = (
1209 coerce(arg[unit_rev["year"]]) * 10000
1210 + coerce(arg[unit_rev["month"]]) * 100
1211 + coerce(arg[unit_rev["day"]])
1212 )
1213 try:
1214 values = to_datetime(values, format="%Y%m%d", errors=errors, utc=utc)
1215 except (TypeError, ValueError) as err:
1216 raise ValueError(f"cannot assemble the datetimes: {err}") from err
1217
1218 units: list[UnitChoices] = ["h", "m", "s", "ms", "us", "ns"]
1219 for u in units:
1220 value = unit_rev.get(u)
1221 if value is not None and value in arg:
1222 try:
1223 values += to_timedelta(coerce(arg[value]), unit=u, errors=errors)
1224 except (TypeError, ValueError) as err:
1225 raise ValueError(
1226 f"cannot assemble the datetimes [{value}]: {err}"
1227 ) from err
1228 return values
1229
1230
# Explicit public API of this module (consumed by ``import *`` and by
# re-exports elsewhere in pandas); other names are implementation details.
__all__ = [
    "DateParseError",
    "should_cache",
    "to_datetime",
]