Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/tools/datetimes.py: 54%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

351 statements  

1from __future__ import annotations 

2 

3from collections import abc 

4from datetime import date 

5from functools import partial 

6from itertools import islice 

7from typing import ( 

8 TYPE_CHECKING, 

9 Callable, 

10 TypedDict, 

11 Union, 

12 cast, 

13 overload, 

14) 

15import warnings 

16 

17import numpy as np 

18 

19from pandas._libs import ( 

20 lib, 

21 tslib, 

22) 

23from pandas._libs.tslibs import ( 

24 OutOfBoundsDatetime, 

25 Timedelta, 

26 Timestamp, 

27 astype_overflowsafe, 

28 is_supported_dtype, 

29 timezones as libtimezones, 

30) 

31from pandas._libs.tslibs.conversion import cast_from_unit_vectorized 

32from pandas._libs.tslibs.parsing import ( 

33 DateParseError, 

34 guess_datetime_format, 

35) 

36from pandas._libs.tslibs.strptime import array_strptime 

37from pandas._typing import ( 

38 AnyArrayLike, 

39 ArrayLike, 

40 DateTimeErrorChoices, 

41) 

42from pandas.util._exceptions import find_stack_level 

43 

44from pandas.core.dtypes.common import ( 

45 ensure_object, 

46 is_float, 

47 is_integer, 

48 is_integer_dtype, 

49 is_list_like, 

50 is_numeric_dtype, 

51) 

52from pandas.core.dtypes.dtypes import ( 

53 ArrowDtype, 

54 DatetimeTZDtype, 

55) 

56from pandas.core.dtypes.generic import ( 

57 ABCDataFrame, 

58 ABCSeries, 

59) 

60 

61from pandas.arrays import ( 

62 DatetimeArray, 

63 IntegerArray, 

64 NumpyExtensionArray, 

65) 

66from pandas.core.algorithms import unique 

67from pandas.core.arrays import ArrowExtensionArray 

68from pandas.core.arrays.base import ExtensionArray 

69from pandas.core.arrays.datetimes import ( 

70 maybe_convert_dtype, 

71 objects_to_datetime64, 

72 tz_to_dtype, 

73) 

74from pandas.core.construction import extract_array 

75from pandas.core.indexes.base import Index 

76from pandas.core.indexes.datetimes import DatetimeIndex 

77 

78if TYPE_CHECKING: 

79 from collections.abc import Hashable 

80 

81 from pandas._libs.tslibs.nattype import NaTType 

82 from pandas._libs.tslibs.timedeltas import UnitChoices 

83 

84 from pandas import ( 

85 DataFrame, 

86 Series, 

87 ) 

88 

# ---------------------------------------------------------------------
# types used in annotations

# Anything accepted as array-like input to to_datetime.
ArrayConvertible = Union[list, tuple, AnyArrayLike]
# Scalar types that may represent a single date/time value.
Scalar = Union[float, str]
DatetimeScalar = Union[Scalar, date, np.datetime64]

DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible]

# Acceptable value type for one key/column when assembling datetimes
# from a dict-like or DataFrame input.
DatetimeDictArg = Union[list[Scalar], tuple[Scalar, ...], AnyArrayLike]

99 

100 

class YearMonthDayDict(TypedDict, total=True):
    """Required keys when assembling datetimes from a dict-like input."""

    year: DatetimeDictArg
    month: DatetimeDictArg
    day: DatetimeDictArg

105 

106 

class FulldatetimeDict(YearMonthDayDict, total=False):
    """
    Optional time-component keys accepted alongside year/month/day.

    Singular and plural spellings are both accepted for hour/minute/second;
    'ms', 'us' and 'ns' carry sub-second components.
    """

    hour: DatetimeDictArg
    hours: DatetimeDictArg
    minute: DatetimeDictArg
    minutes: DatetimeDictArg
    second: DatetimeDictArg
    seconds: DatetimeDictArg
    ms: DatetimeDictArg
    us: DatetimeDictArg
    ns: DatetimeDictArg

117 

118 

# A dict-like (or DataFrame) that can be assembled into datetimes.
DictConvertible = Union[FulldatetimeDict, "DataFrame"]
# Inputs of this length or shorter are never cached (see should_cache).
start_caching_at = 50

121 

122 

123# --------------------------------------------------------------------- 

124 

125 

126def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None: 

127 # Try to guess the format based on the first non-NaN element, return None if can't 

128 if (first_non_null := tslib.first_non_null(arr)) != -1: 

129 if type(first_non_nan_element := arr[first_non_null]) is str: # noqa: E721 

130 # GH#32264 np.str_ object 

131 guessed_format = guess_datetime_format( 

132 first_non_nan_element, dayfirst=dayfirst 

133 ) 

134 if guessed_format is not None: 

135 return guessed_format 

136 # If there are multiple non-null elements, warn about 

137 # how parsing might not be consistent 

138 if tslib.first_non_null(arr[first_non_null + 1 :]) != -1: 

139 warnings.warn( 

140 "Could not infer format, so each element will be parsed " 

141 "individually, falling back to `dateutil`. To ensure parsing is " 

142 "consistent and as-expected, please specify a format.", 

143 UserWarning, 

144 stacklevel=find_stack_level(), 

145 ) 

146 return None 

147 

148 

def should_cache(
    arg: ArrayConvertible, unique_share: float = 0.7, check_count: int | None = None
) -> bool:
    """
    Decide whether caching the conversion of `arg` is worthwhile.

    Caching pays off when a sample of `check_count` leading elements
    contains at most ``unique_share * check_count`` unique values.

    Parameters
    ----------
    arg : listlike, tuple, 1-d array, Series
    unique_share : float, default 0.7
        Must satisfy 0 < unique_share < 1.
    check_count : int, optional
        Number of leading elements to sample; must satisfy
        0 <= check_count <= len(arg).

    Returns
    -------
    bool
        True if caching should be used.

    Notes
    -----
    By default, sequences of at most `start_caching_at` (50) items are never
    cached; for up to 5000 items the first ten percent is sampled; beyond
    that only the first 500 elements are checked. All constants were chosen
    empirically.
    """
    if check_count is None:
        # Tiny inputs: the bookkeeping cost outweighs any caching gain.
        if len(arg) <= start_caching_at:
            return False
        check_count = len(arg) // 10 if len(arg) <= 5000 else 500
    else:
        assert (
            0 <= check_count <= len(arg)
        ), "check_count must be in next bounds: [0; len(arg)]"
        if check_count == 0:
            return False

    assert 0 < unique_share < 1, "unique_share must be in next bounds: (0; 1)"

    try:
        # Unhashable items cannot be cached at all.
        sampled_uniques = set(islice(arg, check_count))
    except TypeError:
        return False
    return len(sampled_uniques) <= check_count * unique_share

207 

208 

209def _maybe_cache( 

210 arg: ArrayConvertible, 

211 format: str | None, 

212 cache: bool, 

213 convert_listlike: Callable, 

214) -> Series: 

215 """ 

216 Create a cache of unique dates from an array of dates 

217 

218 Parameters 

219 ---------- 

220 arg : listlike, tuple, 1-d array, Series 

221 format : string 

222 Strftime format to parse time 

223 cache : bool 

224 True attempts to create a cache of converted values 

225 convert_listlike : function 

226 Conversion function to apply on dates 

227 

228 Returns 

229 ------- 

230 cache_array : Series 

231 Cache of converted, unique dates. Can be empty 

232 """ 

233 from pandas import Series 

234 

235 cache_array = Series(dtype=object) 

236 

237 if cache: 

238 # Perform a quicker unique check 

239 if not should_cache(arg): 

240 return cache_array 

241 

242 if not isinstance(arg, (np.ndarray, ExtensionArray, Index, ABCSeries)): 

243 arg = np.array(arg) 

244 

245 unique_dates = unique(arg) 

246 if len(unique_dates) < len(arg): 

247 cache_dates = convert_listlike(unique_dates, format) 

248 # GH#45319 

249 try: 

250 cache_array = Series(cache_dates, index=unique_dates, copy=False) 

251 except OutOfBoundsDatetime: 

252 return cache_array 

253 # GH#39882 and GH#35888 in case of None and NaT we get duplicates 

254 if not cache_array.index.is_unique: 

255 cache_array = cache_array[~cache_array.index.duplicated()] 

256 return cache_array 

257 

258 

259def _box_as_indexlike( 

260 dt_array: ArrayLike, utc: bool = False, name: Hashable | None = None 

261) -> Index: 

262 """ 

263 Properly boxes the ndarray of datetimes to DatetimeIndex 

264 if it is possible or to generic Index instead 

265 

266 Parameters 

267 ---------- 

268 dt_array: 1-d array 

269 Array of datetimes to be wrapped in an Index. 

270 utc : bool 

271 Whether to convert/localize timestamps to UTC. 

272 name : string, default None 

273 Name for a resulting index 

274 

275 Returns 

276 ------- 

277 result : datetime of converted dates 

278 - DatetimeIndex if convertible to sole datetime64 type 

279 - general Index otherwise 

280 """ 

281 

282 if lib.is_np_dtype(dt_array.dtype, "M"): 

283 tz = "utc" if utc else None 

284 return DatetimeIndex(dt_array, tz=tz, name=name) 

285 return Index(dt_array, name=name, dtype=dt_array.dtype) 

286 

287 

def _convert_and_box_cache(
    arg: DatetimeScalarOrArrayConvertible,
    cache_array: Series,
    name: Hashable | None = None,
) -> Index:
    """
    Look up pre-converted dates in `cache_array` and box the result.

    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
    cache_array : Series
        Cache of converted, unique dates.
    name : string, default None
        Name for the resulting Index.

    Returns
    -------
    Index-like of converted dates.
    """
    from pandas import Series

    # Wrap the input with the same dtype as the cache keys so lookups hit.
    keys = Series(arg, dtype=cache_array.index.dtype)
    converted = keys.map(cache_array)
    return _box_as_indexlike(converted._values, utc=False, name=name)

312 

313 

def _convert_listlike_datetimes(
    arg,
    format: str | None,
    name: Hashable | None = None,
    utc: bool = False,
    unit: str | None = None,
    errors: DateTimeErrorChoices = "raise",
    dayfirst: bool | None = None,
    yearfirst: bool | None = None,
    exact: bool = True,
):
    """
    Helper function for to_datetime. Performs the conversions of 1D listlike
    of dates.

    Parameters
    ----------
    arg : list, tuple, ndarray, Series, Index
        date to be parsed
    format : str or None
        strftime format to parse with; None means try to infer it, and
        "mixed" means parse each element individually
    name : object
        None or string for the Index name
    utc : bool
        Whether to convert/localize timestamps to UTC.
    unit : str
        None or string of the frequency of the passed data
    errors : str
        error handling behaviors from to_datetime, 'raise', 'coerce', 'ignore'
    dayfirst : bool
        dayfirst parsing behavior from to_datetime
    yearfirst : bool
        yearfirst parsing behavior from to_datetime
    exact : bool, default True
        exact format matching behavior from to_datetime

    Returns
    -------
    Index-like of parsed dates
    """
    # Normalize python sequences / NumpyExtensionArray to plain ndarrays.
    if isinstance(arg, (list, tuple)):
        arg = np.array(arg, dtype="O")
    elif isinstance(arg, NumpyExtensionArray):
        arg = np.array(arg)

    arg_dtype = getattr(arg, "dtype", None)
    # these are shortcutable: already-datetime dtypes skip parsing entirely
    tz = "utc" if utc else None
    if isinstance(arg_dtype, DatetimeTZDtype):
        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            return DatetimeIndex(arg, tz=tz, name=name)
        if utc:
            arg = arg.tz_convert(None).tz_localize("utc")
        return arg

    elif isinstance(arg_dtype, ArrowDtype) and arg_dtype.type is Timestamp:
        # TODO: Combine with above if DTI/DTA supports Arrow timestamps
        if utc:
            # pyarrow uses UTC, not lowercase utc
            if isinstance(arg, Index):
                arg_array = cast(ArrowExtensionArray, arg.array)
                if arg_dtype.pyarrow_dtype.tz is not None:
                    arg_array = arg_array._dt_tz_convert("UTC")
                else:
                    arg_array = arg_array._dt_tz_localize("UTC")
                arg = Index(arg_array)
            else:
                # ArrowExtensionArray
                if arg_dtype.pyarrow_dtype.tz is not None:
                    arg = arg._dt_tz_convert("UTC")
                else:
                    arg = arg._dt_tz_localize("UTC")
        return arg

    elif lib.is_np_dtype(arg_dtype, "M"):
        if not is_supported_dtype(arg_dtype):
            # We go to closest supported reso, i.e. "s"
            arg = astype_overflowsafe(
                # TODO: looks like we incorrectly raise with errors=="ignore"
                np.asarray(arg),
                np.dtype("M8[s]"),
                is_coerce=errors == "coerce",
            )

        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            return DatetimeIndex(arg, tz=tz, name=name)
        elif utc:
            # DatetimeArray, DatetimeIndex
            return arg.tz_localize("utc")

        return arg

    elif unit is not None:
        if format is not None:
            raise ValueError("cannot specify both format and unit")
        return _to_datetime_with_unit(arg, unit, name, utc, errors)
    elif getattr(arg, "ndim", 1) > 1:
        raise TypeError(
            "arg must be a string, datetime, list, tuple, 1-d array, or Series"
        )

    # warn if passing timedelta64, raise for PeriodDtype
    # NB: this must come after unit transformation
    try:
        arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz))
    except TypeError:
        # Honor 'coerce'/'ignore' even for dtypes that cannot be converted.
        if errors == "coerce":
            npvalues = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg))
            return DatetimeIndex(npvalues, name=name)
        elif errors == "ignore":
            idx = Index(arg, name=name)
            return idx
        raise

    arg = ensure_object(arg)

    if format is None:
        format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

    # `format` could be inferred, or user didn't ask for mixed-format parsing.
    if format is not None and format != "mixed":
        return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)

    # Fall back to per-element object parsing.
    result, tz_parsed = objects_to_datetime64(
        arg,
        dayfirst=dayfirst,
        yearfirst=yearfirst,
        utc=utc,
        errors=errors,
        allow_object=True,
    )

    if tz_parsed is not None:
        # We can take a shortcut since the datetime64 numpy array
        # is in UTC
        out_unit = np.datetime_data(result.dtype)[0]
        dtype = cast(DatetimeTZDtype, tz_to_dtype(tz_parsed, out_unit))
        dt64_values = result.view(f"M8[{dtype.unit}]")
        dta = DatetimeArray._simple_new(dt64_values, dtype=dtype)
        return DatetimeIndex._simple_new(dta, name=name)

    return _box_as_indexlike(result, utc=utc, name=name)

454 

455 

def _array_strptime_with_fallback(
    arg,
    name,
    utc: bool,
    fmt: str,
    exact: bool,
    errors: str,
) -> Index:
    """
    Call array_strptime, with fallback behavior depending on 'errors'.

    Parameters
    ----------
    arg : ndarray[object]
        Values to be parsed (the caller has passed them through ensure_object).
    name : Hashable
        Name for the returned Index.
    utc : bool
        Whether to convert/localize timestamps to UTC.
    fmt : str
        strftime format to parse with.
    exact : bool
        Whether the format must match exactly.
    errors : str
        'raise', 'coerce' or 'ignore', forwarded to array_strptime.

    Returns
    -------
    Index
    """
    result, tz_out = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc)
    if tz_out is not None:
        # array_strptime reported a timezone: wrap the values as tz-aware.
        unit = np.datetime_data(result.dtype)[0]
        dtype = DatetimeTZDtype(tz=tz_out, unit=unit)
        dta = DatetimeArray._simple_new(result, dtype=dtype)
        if utc:
            dta = dta.tz_convert("UTC")
        return Index(dta, name=name)
    elif result.dtype != object and utc:
        # tz-naive datetime64 result but caller asked for UTC: attach UTC
        # through the dtype while keeping the parsed resolution.
        unit = np.datetime_data(result.dtype)[0]
        res = Index(result, dtype=f"M8[{unit}, UTC]", name=name)
        return res
    return Index(result, dtype=result.dtype, name=name)

480 

481 

def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
    """
    to_datetime specialized to the case where a 'unit' is passed.

    Integer/float inputs are interpreted as counts of `unit` since the epoch;
    out-of-bounds values fall back to object-dtype parsing (or raise when
    errors='raise').
    """
    arg = extract_array(arg, extract_numpy=True)

    # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime
    # because it expects an ndarray argument
    if isinstance(arg, IntegerArray):
        arr = arg.astype(f"datetime64[{unit}]")
        tz_parsed = None
    else:
        arg = np.asarray(arg)

        if arg.dtype.kind in "iu":
            # Note we can't do "f" here because that could induce unwanted
            # rounding GH#14156, GH#20445
            arr = arg.astype(f"datetime64[{unit}]", copy=False)
            try:
                arr = astype_overflowsafe(arr, np.dtype("M8[ns]"), copy=False)
            except OutOfBoundsDatetime:
                if errors == "raise":
                    raise
                # Retry via the object path, which honors 'coerce'/'ignore'.
                arg = arg.astype(object)
                return _to_datetime_with_unit(arg, unit, name, utc, errors)
            tz_parsed = None

        elif arg.dtype.kind == "f":
            with np.errstate(over="raise"):
                try:
                    arr = cast_from_unit_vectorized(arg, unit=unit)
                except OutOfBoundsDatetime:
                    if errors != "raise":
                        # Retry via the object path for 'coerce'/'ignore'.
                        return _to_datetime_with_unit(
                            arg.astype(object), unit, name, utc, errors
                        )
                    raise OutOfBoundsDatetime(
                        f"cannot convert input with unit '{unit}'"
                    )

            arr = arr.view("M8[ns]")
            tz_parsed = None
        else:
            # Object/string path; may detect a timezone in the values.
            arg = arg.astype(object, copy=False)
            arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)

    if errors == "ignore":
        # Index constructor _may_ infer to DatetimeIndex
        result = Index._with_infer(arr, name=name)
    else:
        result = DatetimeIndex(arr, name=name)

    if not isinstance(result, DatetimeIndex):
        return result

    # GH#23758: We may still need to localize the result with tz
    # GH#25546: Apply tz_parsed first (from arg), then tz (from caller)
    # result will be naive but in UTC
    result = result.tz_localize("UTC").tz_convert(tz_parsed)

    if utc:
        if result.tz is None:
            result = result.tz_localize("utc")
        else:
            result = result.tz_convert("utc")
    return result

548 

549 

550def _adjust_to_origin(arg, origin, unit): 

551 """ 

552 Helper function for to_datetime. 

553 Adjust input argument to the specified origin 

554 

555 Parameters 

556 ---------- 

557 arg : list, tuple, ndarray, Series, Index 

558 date to be adjusted 

559 origin : 'julian' or Timestamp 

560 origin offset for the arg 

561 unit : str 

562 passed unit from to_datetime, must be 'D' 

563 

564 Returns 

565 ------- 

566 ndarray or scalar of adjusted date(s) 

567 """ 

568 if origin == "julian": 

569 original = arg 

570 j0 = Timestamp(0).to_julian_date() 

571 if unit != "D": 

572 raise ValueError("unit must be 'D' for origin='julian'") 

573 try: 

574 arg = arg - j0 

575 except TypeError as err: 

576 raise ValueError( 

577 "incompatible 'arg' type for given 'origin'='julian'" 

578 ) from err 

579 

580 # preemptively check this for a nice range 

581 j_max = Timestamp.max.to_julian_date() - j0 

582 j_min = Timestamp.min.to_julian_date() - j0 

583 if np.any(arg > j_max) or np.any(arg < j_min): 

584 raise OutOfBoundsDatetime( 

585 f"{original} is Out of Bounds for origin='julian'" 

586 ) 

587 else: 

588 # arg must be numeric 

589 if not ( 

590 (is_integer(arg) or is_float(arg)) or is_numeric_dtype(np.asarray(arg)) 

591 ): 

592 raise ValueError( 

593 f"'{arg}' is not compatible with origin='{origin}'; " 

594 "it must be numeric with a unit specified" 

595 ) 

596 

597 # we are going to offset back to unix / epoch time 

598 try: 

599 offset = Timestamp(origin, unit=unit) 

600 except OutOfBoundsDatetime as err: 

601 raise OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") from err 

602 except ValueError as err: 

603 raise ValueError( 

604 f"origin {origin} cannot be converted to a Timestamp" 

605 ) from err 

606 

607 if offset.tz is not None: 

608 raise ValueError(f"origin offset {offset} must be tz-naive") 

609 td_offset = offset - Timestamp(0) 

610 

611 # convert the offset to the unit of the arg 

612 # this should be lossless in terms of precision 

613 ioffset = td_offset // Timedelta(1, unit=unit) 

614 

615 # scalars & ndarray-like can handle the addition 

616 if is_list_like(arg) and not isinstance(arg, (ABCSeries, Index, np.ndarray)): 

617 arg = np.asarray(arg) 

618 arg = arg + ioffset 

619 return arg 

620 

621 

# to_datetime is overloaded on the type of `arg` so type-checkers can refine
# the return type: scalar -> Timestamp, Series/dict-like -> Series,
# list/tuple/array-like -> DatetimeIndex. Full parameter documentation lives
# on the implementation below.
@overload
def to_datetime(
    arg: DatetimeScalar,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> Timestamp:
    ...


@overload
def to_datetime(
    arg: Series | DictConvertible,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> Series:
    ...


@overload
def to_datetime(
    arg: list | tuple | Index | ArrayLike,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> DatetimeIndex:
    ...

671 

672 

673def to_datetime( 

674 arg: DatetimeScalarOrArrayConvertible | DictConvertible, 

675 errors: DateTimeErrorChoices = "raise", 

676 dayfirst: bool = False, 

677 yearfirst: bool = False, 

678 utc: bool = False, 

679 format: str | None = None, 

680 exact: bool | lib.NoDefault = lib.no_default, 

681 unit: str | None = None, 

682 infer_datetime_format: lib.NoDefault | bool = lib.no_default, 

683 origin: str = "unix", 

684 cache: bool = True, 

685) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None: 

686 """ 

687 Convert argument to datetime. 

688 

689 This function converts a scalar, array-like, :class:`Series` or 

690 :class:`DataFrame`/dict-like to a pandas datetime object. 

691 

692 Parameters 

693 ---------- 

694 arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like 

695 The object to convert to a datetime. If a :class:`DataFrame` is provided, the 

696 method expects minimally the following columns: :const:`"year"`, 

697 :const:`"month"`, :const:`"day"`. The column "year" 

698 must be specified in 4-digit format. 

699 errors : {'ignore', 'raise', 'coerce'}, default 'raise' 

700 - If :const:`'raise'`, then invalid parsing will raise an exception. 

701 - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`. 

702 - If :const:`'ignore'`, then invalid parsing will return the input. 

703 dayfirst : bool, default False 

704 Specify a date parse order if `arg` is str or is list-like. 

705 If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"` 

706 is parsed as :const:`2012-11-10`. 

707 

708 .. warning:: 

709 

710 ``dayfirst=True`` is not strict, but will prefer to parse 

711 with day first. 

712 

713 yearfirst : bool, default False 

714 Specify a date parse order if `arg` is str or is list-like. 

715 

716 - If :const:`True` parses dates with the year first, e.g. 

717 :const:`"10/11/12"` is parsed as :const:`2010-11-12`. 

718 - If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst` is 

719 preceded (same as :mod:`dateutil`). 

720 

721 .. warning:: 

722 

723 ``yearfirst=True`` is not strict, but will prefer to parse 

724 with year first. 

725 

726 utc : bool, default False 

727 Control timezone-related parsing, localization and conversion. 

728 

729 - If :const:`True`, the function *always* returns a timezone-aware 

730 UTC-localized :class:`Timestamp`, :class:`Series` or 

731 :class:`DatetimeIndex`. To do this, timezone-naive inputs are 

732 *localized* as UTC, while timezone-aware inputs are *converted* to UTC. 

733 

734 - If :const:`False` (default), inputs will not be coerced to UTC. 

735 Timezone-naive inputs will remain naive, while timezone-aware ones 

736 will keep their time offsets. Limitations exist for mixed 

737 offsets (typically, daylight savings), see :ref:`Examples 

738 <to_datetime_tz_examples>` section for details. 

739 

740 .. warning:: 

741 

742 In a future version of pandas, parsing datetimes with mixed time 

743 zones will raise an error unless `utc=True`. 

744 Please specify `utc=True` to opt in to the new behaviour 

745 and silence this warning. To create a `Series` with mixed offsets and 

746 `object` dtype, please use `apply` and `datetime.datetime.strptime`. 

747 

748 See also: pandas general documentation about `timezone conversion and 

749 localization 

750 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html 

751 #time-zone-handling>`_. 

752 

753 format : str, default None 

754 The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See 

755 `strftime documentation 

756 <https://docs.python.org/3/library/datetime.html 

757 #strftime-and-strptime-behavior>`_ for more information on choices, though 

758 note that :const:`"%f"` will parse all the way up to nanoseconds. 

759 You can also pass: 

760 

761 - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_ 

762 time string (not necessarily in exactly the same format); 

763 - "mixed", to infer the format for each element individually. This is risky, 

764 and you should probably use it along with `dayfirst`. 

765 

766 .. note:: 

767 

768 If a :class:`DataFrame` is passed, then `format` has no effect. 

769 

770 exact : bool, default True 

771 Control how `format` is used: 

772 

773 - If :const:`True`, require an exact `format` match. 

774 - If :const:`False`, allow the `format` to match anywhere in the target 

775 string. 

776 

777 Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``. 

778 unit : str, default 'ns' 

779 The unit of the arg (D,s,ms,us,ns) denote the unit, which is an 

780 integer or float number. This will be based off the origin. 

781 Example, with ``unit='ms'`` and ``origin='unix'``, this would calculate 

782 the number of milliseconds to the unix epoch start. 

783 infer_datetime_format : bool, default False 

784 If :const:`True` and no `format` is given, attempt to infer the format 

785 of the datetime strings based on the first non-NaN element, 

786 and if it can be inferred, switch to a faster method of parsing them. 

787 In some cases this can increase the parsing speed by ~5-10x. 

788 

789 .. deprecated:: 2.0.0 

790 A strict version of this argument is now the default, passing it has 

791 no effect. 

792 

793 origin : scalar, default 'unix' 

794 Define the reference date. The numeric values would be parsed as number 

795 of units (defined by `unit`) since this reference date. 

796 

797 - If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01. 

798 - If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to 

799 beginning of Julian Calendar. Julian day number :const:`0` is assigned 

800 to the day starting at noon on January 1, 4713 BC. 

801 - If Timestamp convertible (Timestamp, dt.datetime, np.datetimt64 or date 

802 string), origin is set to Timestamp identified by origin. 

803 - If a float or integer, origin is the difference 

804 (in units determined by the ``unit`` argument) relative to 1970-01-01. 

805 cache : bool, default True 

806 If :const:`True`, use a cache of unique, converted dates to apply the 

807 datetime conversion. May produce significant speed-up when parsing 

808 duplicate date strings, especially ones with timezone offsets. The cache 

809 is only used when there are at least 50 values. The presence of 

810 out-of-bounds values will render the cache unusable and may slow down 

811 parsing. 

812 

813 Returns 

814 ------- 

815 datetime 

816 If parsing succeeded. 

817 Return type depends on input (types in parenthesis correspond to 

818 fallback in case of unsuccessful timezone or out-of-range timestamp 

819 parsing): 

820 

821 - scalar: :class:`Timestamp` (or :class:`datetime.datetime`) 

822 - array-like: :class:`DatetimeIndex` (or :class:`Series` with 

823 :class:`object` dtype containing :class:`datetime.datetime`) 

824 - Series: :class:`Series` of :class:`datetime64` dtype (or 

825 :class:`Series` of :class:`object` dtype containing 

826 :class:`datetime.datetime`) 

827 - DataFrame: :class:`Series` of :class:`datetime64` dtype (or 

828 :class:`Series` of :class:`object` dtype containing 

829 :class:`datetime.datetime`) 

830 

831 Raises 

832 ------ 

833 ParserError 

834 When parsing a date from string fails. 

835 ValueError 

836 When another datetime conversion error happens. For example when one 

837 of 'year', 'month', day' columns is missing in a :class:`DataFrame`, or 

838 when a Timezone-aware :class:`datetime.datetime` is found in an array-like 

839 of mixed time offsets, and ``utc=False``. 

840 

841 See Also 

842 -------- 

843 DataFrame.astype : Cast argument to a specified dtype. 

844 to_timedelta : Convert argument to timedelta. 

845 convert_dtypes : Convert dtypes. 

846 

847 Notes 

848 ----- 

849 

850 Many input types are supported, and lead to different output types: 

851 

852 - **scalars** can be int, float, str, datetime object (from stdlib :mod:`datetime` 

853 module or :mod:`numpy`). They are converted to :class:`Timestamp` when 

854 possible, otherwise they are converted to :class:`datetime.datetime`. 

855 None/NaN/null scalars are converted to :const:`NaT`. 

856 

857 - **array-like** can contain int, float, str, datetime objects. They are 

858 converted to :class:`DatetimeIndex` when possible, otherwise they are 

859 converted to :class:`Index` with :class:`object` dtype, containing 

860 :class:`datetime.datetime`. None/NaN/null entries are converted to 

861 :const:`NaT` in both cases. 

862 

863 - **Series** are converted to :class:`Series` with :class:`datetime64` 

864 dtype when possible, otherwise they are converted to :class:`Series` with 

865 :class:`object` dtype, containing :class:`datetime.datetime`. None/NaN/null 

866 entries are converted to :const:`NaT` in both cases. 

867 

868 - **DataFrame/dict-like** are converted to :class:`Series` with 

869 :class:`datetime64` dtype. For each row a datetime is created from assembling 

870 the various dataframe columns. Column keys can be common abbreviations 

871 like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns']) or 

872 plurals of the same. 

873 

874 The following causes are responsible for :class:`datetime.datetime` objects 

875 being returned (possibly inside an :class:`Index` or a :class:`Series` with 

876 :class:`object` dtype) instead of a proper pandas designated type 

877 (:class:`Timestamp`, :class:`DatetimeIndex` or :class:`Series` 

878 with :class:`datetime64` dtype): 

879 

880 - when any input element is before :const:`Timestamp.min` or after 

881 :const:`Timestamp.max`, see `timestamp limitations 

882 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html 

883 #timeseries-timestamp-limits>`_. 

884 

885 - when ``utc=False`` (default) and the input is an array-like or 

886 :class:`Series` containing mixed naive/aware datetime, or aware with mixed 

887 time offsets. Note that this happens in the (quite frequent) situation when 

888 the timezone has a daylight savings policy. In that case you may wish to 

889 use ``utc=True``. 

890 

891 Examples 

892 -------- 

893 

894 **Handling various input formats** 

895 

896 Assembling a datetime from multiple columns of a :class:`DataFrame`. The keys 

897 can be common abbreviations like ['year', 'month', 'day', 'minute', 'second', 

898 'ms', 'us', 'ns']) or plurals of the same 

899 

900 >>> df = pd.DataFrame({'year': [2015, 2016], 

901 ... 'month': [2, 3], 

902 ... 'day': [4, 5]}) 

903 >>> pd.to_datetime(df) 

904 0 2015-02-04 

905 1 2016-03-05 

906 dtype: datetime64[ns] 

907 

908 Using a unix epoch time 

909 

910 >>> pd.to_datetime(1490195805, unit='s') 

911 Timestamp('2017-03-22 15:16:45') 

912 >>> pd.to_datetime(1490195805433502912, unit='ns') 

913 Timestamp('2017-03-22 15:16:45.433502912') 

914 

915 .. warning:: For float arg, precision rounding might happen. To prevent 

916 unexpected behavior use a fixed-width exact type. 

917 

918 Using a non-unix epoch origin 

919 

920 >>> pd.to_datetime([1, 2, 3], unit='D', 

921 ... origin=pd.Timestamp('1960-01-01')) 

922 DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], 

923 dtype='datetime64[ns]', freq=None) 

924 

925 **Differences with strptime behavior** 

926 

927 :const:`"%f"` will parse all the way up to nanoseconds. 

928 

929 >>> pd.to_datetime('2018-10-26 12:00:00.0000000011', 

930 ... format='%Y-%m-%d %H:%M:%S.%f') 

931 Timestamp('2018-10-26 12:00:00.000000001') 

932 

933 **Non-convertible date/times** 

934 

935 Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`, 

936 in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. 

937 

938 >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') 

939 NaT 

940 

941 .. _to_datetime_tz_examples: 

942 

943 **Timezones and time offsets** 

944 

945 The default behaviour (``utc=False``) is as follows: 

946 

947 - Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`: 

948 

949 >>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15']) 

950 DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], 

951 dtype='datetime64[ns]', freq=None) 

952 

953 - Timezone-aware inputs *with constant time offset* are converted to 

954 timezone-aware :class:`DatetimeIndex`: 

955 

956 >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500']) 

957 DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], 

958 dtype='datetime64[ns, UTC-05:00]', freq=None) 

959 

960 - However, timezone-aware inputs *with mixed time offsets* (for example 

961 issued from a timezone with daylight savings, such as Europe/Paris) 

962 are **not successfully converted** to a :class:`DatetimeIndex`. 

963 Parsing datetimes with mixed time zones will show a warning unless 

964 `utc=True`. If you specify `utc=False` the warning below will be shown 

965 and a simple :class:`Index` containing :class:`datetime.datetime` 

966 objects will be returned: 

967 

968 >>> pd.to_datetime(['2020-10-25 02:00 +0200', 

969 ... '2020-10-25 04:00 +0100']) # doctest: +SKIP 

970 FutureWarning: In a future version of pandas, parsing datetimes with mixed 

971 time zones will raise an error unless `utc=True`. Please specify `utc=True` 

972 to opt in to the new behaviour and silence this warning. To create a `Series` 

973 with mixed offsets and `object` dtype, please use `apply` and 

974 `datetime.datetime.strptime`. 

975 Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00], 

976 dtype='object') 

977 

978 - A mix of timezone-aware and timezone-naive inputs is also converted to 

979 a simple :class:`Index` containing :class:`datetime.datetime` objects: 

980 

981 >>> from datetime import datetime 

982 >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", 

983 ... datetime(2020, 1, 1, 3, 0)]) # doctest: +SKIP 

984 FutureWarning: In a future version of pandas, parsing datetimes with mixed 

985 time zones will raise an error unless `utc=True`. Please specify `utc=True` 

986 to opt in to the new behaviour and silence this warning. To create a `Series` 

987 with mixed offsets and `object` dtype, please use `apply` and 

988 `datetime.datetime.strptime`. 

989 Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object') 

990 

991 | 

992 

993 Setting ``utc=True`` solves most of the above issues: 

994 

995 - Timezone-naive inputs are *localized* as UTC 

996 

997 >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True) 

998 DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'], 

999 dtype='datetime64[ns, UTC]', freq=None) 

1000 

1001 - Timezone-aware inputs are *converted* to UTC (the output represents the 

1002 exact same datetime, but viewed from the UTC time offset `+00:00`). 

1003 

1004 >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'], 

1005 ... utc=True) 

1006 DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], 

1007 dtype='datetime64[ns, UTC]', freq=None) 

1008 

1009 - Inputs can contain both string or datetime, the above 

1010 rules still apply 

1011 

1012 >>> pd.to_datetime(['2018-10-26 12:00', datetime(2020, 1, 1, 18)], utc=True) 

1013 DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], 

1014 dtype='datetime64[ns, UTC]', freq=None) 

1015 """ 

1016 if exact is not lib.no_default and format in {"mixed", "ISO8601"}: 

1017 raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'") 

1018 if infer_datetime_format is not lib.no_default: 

1019 warnings.warn( 

1020 "The argument 'infer_datetime_format' is deprecated and will " 

1021 "be removed in a future version. " 

1022 "A strict version of it is now the default, see " 

1023 "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " 

1024 "You can safely remove this argument.", 

1025 stacklevel=find_stack_level(), 

1026 ) 

1027 if errors == "ignore": 

1028 # GH#54467 

1029 warnings.warn( 

1030 "errors='ignore' is deprecated and will raise in a future version. " 

1031 "Use to_datetime without passing `errors` and catch exceptions " 

1032 "explicitly instead", 

1033 FutureWarning, 

1034 stacklevel=find_stack_level(), 

1035 ) 

1036 

1037 if arg is None: 

1038 return None 

1039 

1040 if origin != "unix": 

1041 arg = _adjust_to_origin(arg, origin, unit) 

1042 

1043 convert_listlike = partial( 

1044 _convert_listlike_datetimes, 

1045 utc=utc, 

1046 unit=unit, 

1047 dayfirst=dayfirst, 

1048 yearfirst=yearfirst, 

1049 errors=errors, 

1050 exact=exact, 

1051 ) 

1052 # pylint: disable-next=used-before-assignment 

1053 result: Timestamp | NaTType | Series | Index 

1054 

1055 if isinstance(arg, Timestamp): 

1056 result = arg 

1057 if utc: 

1058 if arg.tz is not None: 

1059 result = arg.tz_convert("utc") 

1060 else: 

1061 result = arg.tz_localize("utc") 

1062 elif isinstance(arg, ABCSeries): 

1063 cache_array = _maybe_cache(arg, format, cache, convert_listlike) 

1064 if not cache_array.empty: 

1065 result = arg.map(cache_array) 

1066 else: 

1067 values = convert_listlike(arg._values, format) 

1068 result = arg._constructor(values, index=arg.index, name=arg.name) 

1069 elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)): 

1070 result = _assemble_from_unit_mappings(arg, errors, utc) 

1071 elif isinstance(arg, Index): 

1072 cache_array = _maybe_cache(arg, format, cache, convert_listlike) 

1073 if not cache_array.empty: 

1074 result = _convert_and_box_cache(arg, cache_array, name=arg.name) 

1075 else: 

1076 result = convert_listlike(arg, format, name=arg.name) 

1077 elif is_list_like(arg): 

1078 try: 

1079 # error: Argument 1 to "_maybe_cache" has incompatible type 

1080 # "Union[float, str, datetime, List[Any], Tuple[Any, ...], ExtensionArray, 

1081 # ndarray[Any, Any], Series]"; expected "Union[List[Any], Tuple[Any, ...], 

1082 # Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series], Series]" 

1083 argc = cast( 

1084 Union[list, tuple, ExtensionArray, np.ndarray, "Series", Index], arg 

1085 ) 

1086 cache_array = _maybe_cache(argc, format, cache, convert_listlike) 

1087 except OutOfBoundsDatetime: 

1088 # caching attempts to create a DatetimeIndex, which may raise 

1089 # an OOB. If that's the desired behavior, then just reraise... 

1090 if errors == "raise": 

1091 raise 

1092 # ... otherwise, continue without the cache. 

1093 from pandas import Series 

1094 

1095 cache_array = Series([], dtype=object) # just an empty array 

1096 if not cache_array.empty: 

1097 result = _convert_and_box_cache(argc, cache_array) 

1098 else: 

1099 result = convert_listlike(argc, format) 

1100 else: 

1101 result = convert_listlike(np.array([arg]), format)[0] 

1102 if isinstance(arg, bool) and isinstance(result, np.bool_): 

1103 result = bool(result) # TODO: avoid this kludge. 

1104 

1105 # error: Incompatible return value type (got "Union[Timestamp, NaTType, 

1106 # Series, Index]", expected "Union[DatetimeIndex, Series, float, str, 

1107 # NaTType, None]") 

1108 return result # type: ignore[return-value] 

1109 

1110 

1111# mappings for assembling units 

1112_unit_map = { 

1113 "year": "year", 

1114 "years": "year", 

1115 "month": "month", 

1116 "months": "month", 

1117 "day": "day", 

1118 "days": "day", 

1119 "hour": "h", 

1120 "hours": "h", 

1121 "minute": "m", 

1122 "minutes": "m", 

1123 "second": "s", 

1124 "seconds": "s", 

1125 "ms": "ms", 

1126 "millisecond": "ms", 

1127 "milliseconds": "ms", 

1128 "us": "us", 

1129 "microsecond": "us", 

1130 "microseconds": "us", 

1131 "ns": "ns", 

1132 "nanosecond": "ns", 

1133 "nanoseconds": "ns", 

1134} 

1135 

1136 

1137def _assemble_from_unit_mappings(arg, errors: DateTimeErrorChoices, utc: bool): 

1138 """ 

1139 assemble the unit specified fields from the arg (DataFrame) 

1140 Return a Series for actual parsing 

1141 

1142 Parameters 

1143 ---------- 

1144 arg : DataFrame 

1145 errors : {'ignore', 'raise', 'coerce'}, default 'raise' 

1146 

1147 - If :const:`'raise'`, then invalid parsing will raise an exception 

1148 - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT` 

1149 - If :const:`'ignore'`, then invalid parsing will return the input 

1150 utc : bool 

1151 Whether to convert/localize timestamps to UTC. 

1152 

1153 Returns 

1154 ------- 

1155 Series 

1156 """ 

1157 from pandas import ( 

1158 DataFrame, 

1159 to_numeric, 

1160 to_timedelta, 

1161 ) 

1162 

1163 arg = DataFrame(arg) 

1164 if not arg.columns.is_unique: 

1165 raise ValueError("cannot assemble with duplicate keys") 

1166 

1167 # replace passed unit with _unit_map 

1168 def f(value): 

1169 if value in _unit_map: 

1170 return _unit_map[value] 

1171 

1172 # m is case significant 

1173 if value.lower() in _unit_map: 

1174 return _unit_map[value.lower()] 

1175 

1176 return value 

1177 

1178 unit = {k: f(k) for k in arg.keys()} 

1179 unit_rev = {v: k for k, v in unit.items()} 

1180 

1181 # we require at least Ymd 

1182 required = ["year", "month", "day"] 

1183 req = sorted(set(required) - set(unit_rev.keys())) 

1184 if len(req): 

1185 _required = ",".join(req) 

1186 raise ValueError( 

1187 "to assemble mappings requires at least that " 

1188 f"[year, month, day] be specified: [{_required}] is missing" 

1189 ) 

1190 

1191 # keys we don't recognize 

1192 excess = sorted(set(unit_rev.keys()) - set(_unit_map.values())) 

1193 if len(excess): 

1194 _excess = ",".join(excess) 

1195 raise ValueError( 

1196 f"extra keys have been passed to the datetime assemblage: [{_excess}]" 

1197 ) 

1198 

1199 def coerce(values): 

1200 # we allow coercion to if errors allows 

1201 values = to_numeric(values, errors=errors) 

1202 

1203 # prevent overflow in case of int8 or int16 

1204 if is_integer_dtype(values.dtype): 

1205 values = values.astype("int64", copy=False) 

1206 return values 

1207 

1208 values = ( 

1209 coerce(arg[unit_rev["year"]]) * 10000 

1210 + coerce(arg[unit_rev["month"]]) * 100 

1211 + coerce(arg[unit_rev["day"]]) 

1212 ) 

1213 try: 

1214 values = to_datetime(values, format="%Y%m%d", errors=errors, utc=utc) 

1215 except (TypeError, ValueError) as err: 

1216 raise ValueError(f"cannot assemble the datetimes: {err}") from err 

1217 

1218 units: list[UnitChoices] = ["h", "m", "s", "ms", "us", "ns"] 

1219 for u in units: 

1220 value = unit_rev.get(u) 

1221 if value is not None and value in arg: 

1222 try: 

1223 values += to_timedelta(coerce(arg[value]), unit=u, errors=errors) 

1224 except (TypeError, ValueError) as err: 

1225 raise ValueError( 

1226 f"cannot assemble the datetimes [{value}]: {err}" 

1227 ) from err 

1228 return values 

1229 

1230 

# Explicit public API of this module.
__all__ = [
    "DateParseError",
    "should_cache",
    "to_datetime",
]