Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/tools/datetimes.py: 19%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

364 statements  

1from __future__ import annotations 

2 

3from collections import abc 

4from datetime import datetime 

5from functools import partial 

6from itertools import islice 

7from typing import ( 

8 TYPE_CHECKING, 

9 Callable, 

10 Hashable, 

11 List, 

12 Tuple, 

13 TypedDict, 

14 Union, 

15 cast, 

16 overload, 

17) 

18import warnings 

19 

20import numpy as np 

21 

22from pandas._libs import ( 

23 lib, 

24 tslib, 

25) 

26from pandas._libs.tslibs import ( 

27 OutOfBoundsDatetime, 

28 Timedelta, 

29 Timestamp, 

30 astype_overflowsafe, 

31 get_unit_from_dtype, 

32 iNaT, 

33 is_supported_unit, 

34 nat_strings, 

35 parsing, 

36 timezones as libtimezones, 

37) 

38from pandas._libs.tslibs.conversion import precision_from_unit 

39from pandas._libs.tslibs.parsing import ( 

40 DateParseError, 

41 guess_datetime_format, 

42) 

43from pandas._libs.tslibs.strptime import array_strptime 

44from pandas._typing import ( 

45 AnyArrayLike, 

46 ArrayLike, 

47 DateTimeErrorChoices, 

48 npt, 

49) 

50from pandas.util._exceptions import find_stack_level 

51 

52from pandas.core.dtypes.common import ( 

53 ensure_object, 

54 is_datetime64_dtype, 

55 is_datetime64tz_dtype, 

56 is_float, 

57 is_integer, 

58 is_integer_dtype, 

59 is_list_like, 

60 is_numeric_dtype, 

61 is_scalar, 

62) 

63from pandas.core.dtypes.generic import ( 

64 ABCDataFrame, 

65 ABCSeries, 

66) 

67from pandas.core.dtypes.missing import notna 

68 

69from pandas.arrays import ( 

70 DatetimeArray, 

71 IntegerArray, 

72 PandasArray, 

73) 

74from pandas.core import algorithms 

75from pandas.core.algorithms import unique 

76from pandas.core.arrays.base import ExtensionArray 

77from pandas.core.arrays.datetimes import ( 

78 maybe_convert_dtype, 

79 objects_to_datetime64ns, 

80 tz_to_dtype, 

81) 

82from pandas.core.construction import extract_array 

83from pandas.core.indexes.base import Index 

84from pandas.core.indexes.datetimes import DatetimeIndex 

85 

86if TYPE_CHECKING: 

87 from pandas._libs.tslibs.nattype import NaTType 

88 from pandas._libs.tslibs.timedeltas import UnitChoices 

89 

90 from pandas import ( 

91 DataFrame, 

92 Series, 

93 ) 

94 

95# --------------------------------------------------------------------- 

96# types used in annotations 

97 

# Array-like containers accepted by to_datetime.
ArrayConvertible = Union[List, Tuple, AnyArrayLike]
# Scalar values accepted alongside datetime objects.
Scalar = Union[float, str]
DatetimeScalar = Union[Scalar, datetime]

# Any single value or container that to_datetime can convert.
DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible]

# Per-column values accepted when assembling datetimes from dict-like input.
DatetimeDictArg = Union[List[Scalar], Tuple[Scalar, ...], AnyArrayLike]

105 

106 

class YearMonthDayDict(TypedDict, total=True):
    """Required keys when assembling datetimes from dict-like input."""

    # total=True: all three keys are mandatory.
    year: DatetimeDictArg
    month: DatetimeDictArg
    day: DatetimeDictArg

111 

112 

class FulldatetimeDict(YearMonthDayDict, total=False):
    """Optional time-component keys on top of the required year/month/day."""

    # total=False: every key below may be omitted.  Both singular and
    # plural spellings are listed for hour/minute/second.
    hour: DatetimeDictArg
    hours: DatetimeDictArg
    minute: DatetimeDictArg
    minutes: DatetimeDictArg
    second: DatetimeDictArg
    seconds: DatetimeDictArg
    ms: DatetimeDictArg
    us: DatetimeDictArg
    ns: DatetimeDictArg

123 

124 

# Dict-like inputs accepted by to_datetime (a DataFrame of component columns
# or a plain dict with the same keys).
DictConvertible = Union[FulldatetimeDict, "DataFrame"]
# Minimum input length before caching of unique converted dates is even
# considered (see should_cache / _maybe_cache below).
start_caching_at = 50

127 

128 

129# --------------------------------------------------------------------- 

130 

131 

132def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None: 

133 # Try to guess the format based on the first non-NaN element, return None if can't 

134 if (first_non_null := tslib.first_non_null(arr)) != -1: 

135 if type(first_non_nan_element := arr[first_non_null]) is str: 

136 # GH#32264 np.str_ object 

137 guessed_format = guess_datetime_format( 

138 first_non_nan_element, dayfirst=dayfirst 

139 ) 

140 if guessed_format is not None: 

141 return guessed_format 

142 # If there are multiple non-null elements, warn about 

143 # how parsing might not be consistent 

144 if tslib.first_non_null(arr[first_non_null + 1 :]) != -1: 

145 warnings.warn( 

146 "Could not infer format, so each element will be parsed " 

147 "individually, falling back to `dateutil`. To ensure parsing is " 

148 "consistent and as-expected, please specify a format.", 

149 UserWarning, 

150 stacklevel=find_stack_level(), 

151 ) 

152 return None 

153 

154 

def should_cache(
    arg: ArrayConvertible, unique_share: float = 0.7, check_count: int | None = None
) -> bool:
    """
    Decides whether to do caching.

    If the percent of unique elements among `check_count` elements less
    than `unique_share * 100` then we can do caching.

    Parameters
    ----------
    arg: listlike, tuple, 1-d array, Series
    unique_share: float, default=0.7, optional
        0 < unique_share < 1
    check_count: int, optional
        0 <= check_count <= len(arg)

    Returns
    -------
    do_caching: bool

    Notes
    -----
    By default for a sequence of less than 50 items in size, we don't do
    caching; for the number of elements less than 5000, we take ten percent of
    all elements to check for a uniqueness share; if the sequence size is more
    than 5000, then we check only the first 500 elements.
    All constants were chosen empirically.
    """
    do_caching = True

    # default realization
    if check_count is None:
        # in this case, the gain from caching is negligible
        if len(arg) <= start_caching_at:
            return False

        if len(arg) <= 5000:
            check_count = len(arg) // 10
        else:
            check_count = 500
    else:
        assert (
            0 <= check_count <= len(arg)
        ), "check_count must be in next bounds: [0; len(arg)]"
        if check_count == 0:
            return False

    assert 0 < unique_share < 1, "unique_share must be in next bounds: (0; 1)"

    try:
        # We can't cache if the items are not hashable.
        unique_elements = set(islice(arg, check_count))
    except TypeError:
        return False
    if len(unique_elements) > check_count * unique_share:
        do_caching = False
    return do_caching

213 

214 

215def _maybe_cache( 

216 arg: ArrayConvertible, 

217 format: str | None, 

218 cache: bool, 

219 convert_listlike: Callable, 

220) -> Series: 

221 """ 

222 Create a cache of unique dates from an array of dates 

223 

224 Parameters 

225 ---------- 

226 arg : listlike, tuple, 1-d array, Series 

227 format : string 

228 Strftime format to parse time 

229 cache : bool 

230 True attempts to create a cache of converted values 

231 convert_listlike : function 

232 Conversion function to apply on dates 

233 

234 Returns 

235 ------- 

236 cache_array : Series 

237 Cache of converted, unique dates. Can be empty 

238 """ 

239 from pandas import Series 

240 

241 cache_array = Series(dtype=object) 

242 

243 if cache: 

244 # Perform a quicker unique check 

245 if not should_cache(arg): 

246 return cache_array 

247 

248 unique_dates = unique(arg) 

249 if len(unique_dates) < len(arg): 

250 cache_dates = convert_listlike(unique_dates, format) 

251 # GH#45319 

252 try: 

253 cache_array = Series(cache_dates, index=unique_dates, copy=False) 

254 except OutOfBoundsDatetime: 

255 return cache_array 

256 # GH#39882 and GH#35888 in case of None and NaT we get duplicates 

257 if not cache_array.index.is_unique: 

258 cache_array = cache_array[~cache_array.index.duplicated()] 

259 return cache_array 

260 

261 

262def _box_as_indexlike( 

263 dt_array: ArrayLike, utc: bool = False, name: Hashable = None 

264) -> Index: 

265 """ 

266 Properly boxes the ndarray of datetimes to DatetimeIndex 

267 if it is possible or to generic Index instead 

268 

269 Parameters 

270 ---------- 

271 dt_array: 1-d array 

272 Array of datetimes to be wrapped in an Index. 

273 utc : bool 

274 Whether to convert/localize timestamps to UTC. 

275 name : string, default None 

276 Name for a resulting index 

277 

278 Returns 

279 ------- 

280 result : datetime of converted dates 

281 - DatetimeIndex if convertible to sole datetime64 type 

282 - general Index otherwise 

283 """ 

284 

285 if is_datetime64_dtype(dt_array): 

286 tz = "utc" if utc else None 

287 return DatetimeIndex(dt_array, tz=tz, name=name) 

288 return Index(dt_array, name=name, dtype=dt_array.dtype) 

289 

290 

291def _convert_and_box_cache( 

292 arg: DatetimeScalarOrArrayConvertible, 

293 cache_array: Series, 

294 name: Hashable | None = None, 

295) -> Index: 

296 """ 

297 Convert array of dates with a cache and wrap the result in an Index. 

298 

299 Parameters 

300 ---------- 

301 arg : integer, float, string, datetime, list, tuple, 1-d array, Series 

302 cache_array : Series 

303 Cache of converted, unique dates 

304 name : string, default None 

305 Name for a DatetimeIndex 

306 

307 Returns 

308 ------- 

309 result : Index-like of converted dates 

310 """ 

311 from pandas import Series 

312 

313 result = Series(arg, dtype=cache_array.index.dtype).map(cache_array) 

314 return _box_as_indexlike(result._values, utc=False, name=name) 

315 

316 

def _return_parsed_timezone_results(
    result: np.ndarray, timezones, utc: bool, name
) -> Index:
    """
    Return results from array_strptime if a %z or %Z directive was passed.

    Parameters
    ----------
    result : ndarray[int64]
        int64 date representations of the dates
    timezones : ndarray
        pytz timezone objects
    utc : bool
        Whether to convert/localize timestamps to UTC.
    name : string, default None
        Name for a DatetimeIndex

    Returns
    -------
    tz_result : Index-like of parsed dates with timezone
    """
    tz_results = np.empty(len(result), dtype=object)
    # Group positions by parsed timezone so each group is localized in one
    # vectorized call instead of element-by-element.
    for zone in unique(timezones):
        mask = timezones == zone
        dta = DatetimeArray(result[mask]).tz_localize(zone)
        if utc:
            # Normalize to UTC: localize naive results, convert aware ones.
            if dta.tzinfo is None:
                dta = dta.tz_localize("utc")
            else:
                dta = dta.tz_convert("utc")
        tz_results[mask] = dta

    # object-dtype Index: entries may carry different timezones.
    return Index(tz_results, name=name)

350 

351 

def _convert_listlike_datetimes(
    arg,
    format: str | None,
    name: Hashable = None,
    utc: bool = False,
    unit: str | None = None,
    errors: DateTimeErrorChoices = "raise",
    dayfirst: bool | None = None,
    yearfirst: bool | None = None,
    exact: bool = True,
):
    """
    Helper function for to_datetime. Performs the conversions of 1D listlike
    of dates

    Parameters
    ----------
    arg : list, tuple, ndarray, Series, Index
        date to be parsed
    format : str or None
        strftime format, "ISO8601"/"mixed", or None to infer per-array
    name : object
        None or string for the Index name
    utc : bool
        Whether to convert/localize timestamps to UTC.
    unit : str
        None or string of the frequency of the passed data
    errors : str
        error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore'
    dayfirst : bool
        dayfirst parsing behavior from to_datetime
    yearfirst : bool
        yearfirst parsing behavior from to_datetime
    exact : bool, default True
        exact format matching behavior from to_datetime

    Returns
    -------
    Index-like of parsed dates
    """
    # Normalize python containers / PandasArray to ndarray up front.
    if isinstance(arg, (list, tuple)):
        arg = np.array(arg, dtype="O")
    elif isinstance(arg, PandasArray):
        arg = np.array(arg)

    arg_dtype = getattr(arg, "dtype", None)
    # these are shortcutable
    tz = "utc" if utc else None
    if is_datetime64tz_dtype(arg_dtype):
        # Already tz-aware datetime64: at most re-box / convert to UTC.
        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            return DatetimeIndex(arg, tz=tz, name=name)
        if utc:
            arg = arg.tz_convert(None).tz_localize("utc")
        return arg

    elif is_datetime64_dtype(arg_dtype):
        # Already naive datetime64: only resolution / boxing / UTC handling.
        arg_dtype = cast(np.dtype, arg_dtype)
        if not is_supported_unit(get_unit_from_dtype(arg_dtype)):
            # We go to closest supported reso, i.e. "s"
            arg = astype_overflowsafe(
                # TODO: looks like we incorrectly raise with errors=="ignore"
                np.asarray(arg),
                np.dtype("M8[s]"),
                is_coerce=errors == "coerce",
            )

        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            return DatetimeIndex(arg, tz=tz, name=name)
        elif utc:
            # DatetimeArray, DatetimeIndex
            return arg.tz_localize("utc")

        return arg

    elif unit is not None:
        # Numeric input interpreted in the given unit; mutually exclusive
        # with an explicit format.
        if format is not None:
            raise ValueError("cannot specify both format and unit")
        return _to_datetime_with_unit(arg, unit, name, utc, errors)
    elif getattr(arg, "ndim", 1) > 1:
        raise TypeError(
            "arg must be a string, datetime, list, tuple, 1-d array, or Series"
        )

    # warn if passing timedelta64, raise for PeriodDtype
    # NB: this must come after unit transformation
    try:
        arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz))
    except TypeError:
        # Unconvertible dtype: outcome depends on the errors policy.
        if errors == "coerce":
            npvalues = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg))
            return DatetimeIndex(npvalues, name=name)
        elif errors == "ignore":
            idx = Index(arg, name=name)
            return idx
        raise

    arg = ensure_object(arg)

    if format is None:
        # Infer a single format from the first non-null string, if possible.
        format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

    # `format` could be inferred, or user didn't ask for mixed-format parsing.
    if format is not None and format != "mixed":
        return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)

    # Fall back to per-element parsing of the object array.
    result, tz_parsed = objects_to_datetime64ns(
        arg,
        dayfirst=dayfirst,
        yearfirst=yearfirst,
        utc=utc,
        errors=errors,
        allow_object=True,
    )

    if tz_parsed is not None:
        # We can take a shortcut since the datetime64 numpy array
        # is in UTC
        dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed))
        return DatetimeIndex._simple_new(dta, name=name)

    return _box_as_indexlike(result, utc=utc, name=name)

471 

472 

473def _array_strptime_with_fallback( 

474 arg, 

475 name, 

476 utc: bool, 

477 fmt: str, 

478 exact: bool, 

479 errors: str, 

480) -> Index: 

481 """ 

482 Call array_strptime, with fallback behavior depending on 'errors'. 

483 """ 

484 result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc) 

485 if any(tz is not None for tz in timezones): 

486 return _return_parsed_timezone_results(result, timezones, utc, name) 

487 

488 return _box_as_indexlike(result, utc=utc, name=name) 

489 

490 

def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
    """
    to_datetime specialized to the case where a 'unit' is passed.
    """
    arg = extract_array(arg, extract_numpy=True)

    # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime
    # because it expects an ndarray argument
    if isinstance(arg, IntegerArray):
        # IntegerArray handles its own NA mask during astype.
        arr = arg.astype(f"datetime64[{unit}]")
        tz_parsed = None
    else:
        arg = np.asarray(arg)

        if arg.dtype.kind in ["i", "u"]:
            # Note we can't do "f" here because that could induce unwanted
            # rounding GH#14156, GH#20445
            arr = arg.astype(f"datetime64[{unit}]", copy=False)
            try:
                arr = astype_overflowsafe(arr, np.dtype("M8[ns]"), copy=False)
            except OutOfBoundsDatetime:
                if errors == "raise":
                    raise
                # Retry via the generic object path, which honors
                # errors='coerce'/'ignore' per element.
                arg = arg.astype(object)
                return _to_datetime_with_unit(arg, unit, name, utc, errors)
            tz_parsed = None

        elif arg.dtype.kind == "f":
            mult, _ = precision_from_unit(unit)

            # NaN and iNaT both mean missing; zero them out before scaling
            # so they can't trip the bounds check below.
            mask = np.isnan(arg) | (arg == iNaT)
            fvalues = (arg * mult).astype("f8", copy=False)
            fvalues[mask] = 0

            if (fvalues < Timestamp.min._value).any() or (
                fvalues > Timestamp.max._value
            ).any():
                if errors != "raise":
                    # Retry via the generic object path for coerce/ignore.
                    arg = arg.astype(object)
                    return _to_datetime_with_unit(arg, unit, name, utc, errors)
                raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")

            arr = fvalues.astype("M8[ns]", copy=False)
            # Restore missing entries as NaT.
            arr[mask] = np.datetime64("NaT", "ns")

            tz_parsed = None
        else:
            # Generic object path (strings, mixed, etc.).
            arg = arg.astype(object, copy=False)
            arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)

    if errors == "ignore":
        # Index constructor _may_ infer to DatetimeIndex
        result = Index._with_infer(arr, name=name)
    else:
        result = DatetimeIndex(arr, name=name)

    if not isinstance(result, DatetimeIndex):
        return result

    # GH#23758: We may still need to localize the result with tz
    # GH#25546: Apply tz_parsed first (from arg), then tz (from caller)
    # result will be naive but in UTC
    result = result.tz_localize("UTC").tz_convert(tz_parsed)

    if utc:
        if result.tz is None:
            result = result.tz_localize("utc")
        else:
            result = result.tz_convert("utc")
    return result

561 

562 

563def _adjust_to_origin(arg, origin, unit): 

564 """ 

565 Helper function for to_datetime. 

566 Adjust input argument to the specified origin 

567 

568 Parameters 

569 ---------- 

570 arg : list, tuple, ndarray, Series, Index 

571 date to be adjusted 

572 origin : 'julian' or Timestamp 

573 origin offset for the arg 

574 unit : str 

575 passed unit from to_datetime, must be 'D' 

576 

577 Returns 

578 ------- 

579 ndarray or scalar of adjusted date(s) 

580 """ 

581 if origin == "julian": 

582 original = arg 

583 j0 = Timestamp(0).to_julian_date() 

584 if unit != "D": 

585 raise ValueError("unit must be 'D' for origin='julian'") 

586 try: 

587 arg = arg - j0 

588 except TypeError as err: 

589 raise ValueError( 

590 "incompatible 'arg' type for given 'origin'='julian'" 

591 ) from err 

592 

593 # preemptively check this for a nice range 

594 j_max = Timestamp.max.to_julian_date() - j0 

595 j_min = Timestamp.min.to_julian_date() - j0 

596 if np.any(arg > j_max) or np.any(arg < j_min): 

597 raise OutOfBoundsDatetime( 

598 f"{original} is Out of Bounds for origin='julian'" 

599 ) 

600 else: 

601 # arg must be numeric 

602 if not ( 

603 (is_scalar(arg) and (is_integer(arg) or is_float(arg))) 

604 or is_numeric_dtype(np.asarray(arg)) 

605 ): 

606 raise ValueError( 

607 f"'{arg}' is not compatible with origin='{origin}'; " 

608 "it must be numeric with a unit specified" 

609 ) 

610 

611 # we are going to offset back to unix / epoch time 

612 try: 

613 offset = Timestamp(origin, unit=unit) 

614 except OutOfBoundsDatetime as err: 

615 raise OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") from err 

616 except ValueError as err: 

617 raise ValueError( 

618 f"origin {origin} cannot be converted to a Timestamp" 

619 ) from err 

620 

621 if offset.tz is not None: 

622 raise ValueError(f"origin offset {offset} must be tz-naive") 

623 td_offset = offset - Timestamp(0) 

624 

625 # convert the offset to the unit of the arg 

626 # this should be lossless in terms of precision 

627 ioffset = td_offset // Timedelta(1, unit=unit) 

628 

629 # scalars & ndarray-like can handle the addition 

630 if is_list_like(arg) and not isinstance(arg, (ABCSeries, Index, np.ndarray)): 

631 arg = np.asarray(arg) 

632 arg = arg + ioffset 

633 return arg 

634 

635 

# Overloads map to_datetime's input type to its return type:
# scalar -> Timestamp.
@overload
def to_datetime(
    arg: DatetimeScalar,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> Timestamp:
    ...


# Series or dict-like (DataFrame of components) -> Series.
@overload
def to_datetime(
    arg: Series | DictConvertible,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> Series:
    ...


# list/tuple/Index/array -> DatetimeIndex.
@overload
def to_datetime(
    arg: list | tuple | Index | ArrayLike,
    errors: DateTimeErrorChoices = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    format: str | None = ...,
    exact: bool = ...,
    unit: str | None = ...,
    infer_datetime_format: bool = ...,
    origin=...,
    cache: bool = ...,
) -> DatetimeIndex:
    ...

685 

686 

687def to_datetime( 

688 arg: DatetimeScalarOrArrayConvertible | DictConvertible, 

689 errors: DateTimeErrorChoices = "raise", 

690 dayfirst: bool = False, 

691 yearfirst: bool = False, 

692 utc: bool = False, 

693 format: str | None = None, 

694 exact: bool | lib.NoDefault = lib.no_default, 

695 unit: str | None = None, 

696 infer_datetime_format: lib.NoDefault | bool = lib.no_default, 

697 origin: str = "unix", 

698 cache: bool = True, 

699) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None: 

700 """ 

701 Convert argument to datetime. 

702 

703 This function converts a scalar, array-like, :class:`Series` or 

704 :class:`DataFrame`/dict-like to a pandas datetime object. 

705 

706 Parameters 

707 ---------- 

708 arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like 

709 The object to convert to a datetime. If a :class:`DataFrame` is provided, the 

710 method expects minimally the following columns: :const:`"year"`, 

711 :const:`"month"`, :const:`"day"`. 

712 errors : {'ignore', 'raise', 'coerce'}, default 'raise' 

713 - If :const:`'raise'`, then invalid parsing will raise an exception. 

714 - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`. 

715 - If :const:`'ignore'`, then invalid parsing will return the input. 

716 dayfirst : bool, default False 

717 Specify a date parse order if `arg` is str or is list-like. 

718 If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"` 

719 is parsed as :const:`2012-11-10`. 

720 

721 .. warning:: 

722 

723 ``dayfirst=True`` is not strict, but will prefer to parse 

724 with day first. 

725 

726 yearfirst : bool, default False 

727 Specify a date parse order if `arg` is str or is list-like. 

728 

729 - If :const:`True` parses dates with the year first, e.g. 

730 :const:`"10/11/12"` is parsed as :const:`2010-11-12`. 

731 - If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst` is 

732 preceded (same as :mod:`dateutil`). 

733 

734 .. warning:: 

735 

736 ``yearfirst=True`` is not strict, but will prefer to parse 

737 with year first. 

738 

739 utc : bool, default False 

740 Control timezone-related parsing, localization and conversion. 

741 

742 - If :const:`True`, the function *always* returns a timezone-aware 

743 UTC-localized :class:`Timestamp`, :class:`Series` or 

744 :class:`DatetimeIndex`. To do this, timezone-naive inputs are 

745 *localized* as UTC, while timezone-aware inputs are *converted* to UTC. 

746 

747 - If :const:`False` (default), inputs will not be coerced to UTC. 

748 Timezone-naive inputs will remain naive, while timezone-aware ones 

749 will keep their time offsets. Limitations exist for mixed 

750 offsets (typically, daylight savings), see :ref:`Examples 

751 <to_datetime_tz_examples>` section for details. 

752 

753 See also: pandas general documentation about `timezone conversion and 

754 localization 

755 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html 

756 #time-zone-handling>`_. 

757 

758 format : str, default None 

759 The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See 

760 `strftime documentation 

761 <https://docs.python.org/3/library/datetime.html 

762 #strftime-and-strptime-behavior>`_ for more information on choices, though 

763 note that :const:`"%f"` will parse all the way up to nanoseconds. 

764 You can also pass: 

765 

766 - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_ 

767 time string (not necessarily in exactly the same format); 

768 - "mixed", to infer the format for each element individually. This is risky, 

769 and you should probably use it along with `dayfirst`. 

770 exact : bool, default True 

771 Control how `format` is used: 

772 

773 - If :const:`True`, require an exact `format` match. 

774 - If :const:`False`, allow the `format` to match anywhere in the target 

775 string. 

776 

777 Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``. 

778 unit : str, default 'ns' 

779 The unit of the arg (D,s,ms,us,ns) denote the unit, which is an 

780 integer or float number. This will be based off the origin. 

781 Example, with ``unit='ms'`` and ``origin='unix'``, this would calculate 

782 the number of milliseconds to the unix epoch start. 

783 infer_datetime_format : bool, default False 

784 If :const:`True` and no `format` is given, attempt to infer the format 

785 of the datetime strings based on the first non-NaN element, 

786 and if it can be inferred, switch to a faster method of parsing them. 

787 In some cases this can increase the parsing speed by ~5-10x. 

788 

789 .. deprecated:: 2.0.0 

790 A strict version of this argument is now the default, passing it has 

791 no effect. 

792 

793 origin : scalar, default 'unix' 

794 Define the reference date. The numeric values would be parsed as number 

795 of units (defined by `unit`) since this reference date. 

796 

797 - If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01. 

798 - If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to 

799 beginning of Julian Calendar. Julian day number :const:`0` is assigned 

800 to the day starting at noon on January 1, 4713 BC. 

801 - If Timestamp convertible (Timestamp, dt.datetime, np.datetimt64 or date 

802 string), origin is set to Timestamp identified by origin. 

803 - If a float or integer, origin is the millisecond difference 

804 relative to 1970-01-01. 

805 cache : bool, default True 

806 If :const:`True`, use a cache of unique, converted dates to apply the 

807 datetime conversion. May produce significant speed-up when parsing 

808 duplicate date strings, especially ones with timezone offsets. The cache 

809 is only used when there are at least 50 values. The presence of 

810 out-of-bounds values will render the cache unusable and may slow down 

811 parsing. 

812 

813 Returns 

814 ------- 

815 datetime 

816 If parsing succeeded. 

817 Return type depends on input (types in parenthesis correspond to 

818 fallback in case of unsuccessful timezone or out-of-range timestamp 

819 parsing): 

820 

821 - scalar: :class:`Timestamp` (or :class:`datetime.datetime`) 

822 - array-like: :class:`DatetimeIndex` (or :class:`Series` with 

823 :class:`object` dtype containing :class:`datetime.datetime`) 

824 - Series: :class:`Series` of :class:`datetime64` dtype (or 

825 :class:`Series` of :class:`object` dtype containing 

826 :class:`datetime.datetime`) 

827 - DataFrame: :class:`Series` of :class:`datetime64` dtype (or 

828 :class:`Series` of :class:`object` dtype containing 

829 :class:`datetime.datetime`) 

830 

831 Raises 

832 ------ 

833 ParserError 

834 When parsing a date from string fails. 

835 ValueError 

836 When another datetime conversion error happens. For example when one 

837 of 'year', 'month', day' columns is missing in a :class:`DataFrame`, or 

838 when a Timezone-aware :class:`datetime.datetime` is found in an array-like 

839 of mixed time offsets, and ``utc=False``. 

840 

841 See Also 

842 -------- 

843 DataFrame.astype : Cast argument to a specified dtype. 

844 to_timedelta : Convert argument to timedelta. 

845 convert_dtypes : Convert dtypes. 

846 

847 Notes 

848 ----- 

849 

850 Many input types are supported, and lead to different output types: 

851 

852 - **scalars** can be int, float, str, datetime object (from stdlib :mod:`datetime` 

853 module or :mod:`numpy`). They are converted to :class:`Timestamp` when 

854 possible, otherwise they are converted to :class:`datetime.datetime`. 

855 None/NaN/null scalars are converted to :const:`NaT`. 

856 

857 - **array-like** can contain int, float, str, datetime objects. They are 

858 converted to :class:`DatetimeIndex` when possible, otherwise they are 

859 converted to :class:`Index` with :class:`object` dtype, containing 

860 :class:`datetime.datetime`. None/NaN/null entries are converted to 

861 :const:`NaT` in both cases. 

862 

863 - **Series** are converted to :class:`Series` with :class:`datetime64` 

864 dtype when possible, otherwise they are converted to :class:`Series` with 

865 :class:`object` dtype, containing :class:`datetime.datetime`. None/NaN/null 

866 entries are converted to :const:`NaT` in both cases. 

867 

868 - **DataFrame/dict-like** are converted to :class:`Series` with 

869 :class:`datetime64` dtype. For each row a datetime is created from assembling 

870 the various dataframe columns. Column keys can be common abbreviations 

871 like [‘year’, ‘month’, ‘day’, ‘minute’, ‘second’, ‘ms’, ‘us’, ‘ns’]) or 

872 plurals of the same. 

873 

874 The following causes are responsible for :class:`datetime.datetime` objects 

875 being returned (possibly inside an :class:`Index` or a :class:`Series` with 

876 :class:`object` dtype) instead of a proper pandas designated type 

877 (:class:`Timestamp`, :class:`DatetimeIndex` or :class:`Series` 

878 with :class:`datetime64` dtype): 

879 

880 - when any input element is before :const:`Timestamp.min` or after 

881 :const:`Timestamp.max`, see `timestamp limitations 

882 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html 

883 #timeseries-timestamp-limits>`_. 

884 

885 - when ``utc=False`` (default) and the input is an array-like or 

886 :class:`Series` containing mixed naive/aware datetime, or aware with mixed 

887 time offsets. Note that this happens in the (quite frequent) situation when 

888 the timezone has a daylight savings policy. In that case you may wish to 

889 use ``utc=True``. 

890 

891 Examples 

892 -------- 

893 

894 **Handling various input formats** 

895 

896 Assembling a datetime from multiple columns of a :class:`DataFrame`. The keys 

897 can be common abbreviations like ['year', 'month', 'day', 'minute', 'second', 

898 'ms', 'us', 'ns']) or plurals of the same 

899 

900 >>> df = pd.DataFrame({'year': [2015, 2016], 

901 ... 'month': [2, 3], 

902 ... 'day': [4, 5]}) 

903 >>> pd.to_datetime(df) 

904 0 2015-02-04 

905 1 2016-03-05 

906 dtype: datetime64[ns] 

907 

908 Using a unix epoch time 

909 

910 >>> pd.to_datetime(1490195805, unit='s') 

911 Timestamp('2017-03-22 15:16:45') 

912 >>> pd.to_datetime(1490195805433502912, unit='ns') 

913 Timestamp('2017-03-22 15:16:45.433502912') 

914 

915 .. warning:: For float arg, precision rounding might happen. To prevent 

916 unexpected behavior use a fixed-width exact type. 

917 

918 Using a non-unix epoch origin 

919 

920 >>> pd.to_datetime([1, 2, 3], unit='D', 

921 ... origin=pd.Timestamp('1960-01-01')) 

922 DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], 

923 dtype='datetime64[ns]', freq=None) 

924 

925 **Differences with strptime behavior** 

926 

927 :const:`"%f"` will parse all the way up to nanoseconds. 

928 

929 >>> pd.to_datetime('2018-10-26 12:00:00.0000000011', 

930 ... format='%Y-%m-%d %H:%M:%S.%f') 

931 Timestamp('2018-10-26 12:00:00.000000001') 

932 

933 **Non-convertible date/times** 

934 

935 If a date does not meet the `timestamp limitations 

936 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html 

937 #timeseries-timestamp-limits>`_, passing ``errors='ignore'`` 

938 will return the original input instead of raising any exception. 

939 

940 Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`, 

941 in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. 

942 

943 >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') 

944 '13000101' 

945 >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') 

946 NaT 

947 

948 .. _to_datetime_tz_examples: 

949 

950 **Timezones and time offsets** 

951 

952 The default behaviour (``utc=False``) is as follows: 

953 

954 - Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`: 

955 

956 >>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15']) 

957 DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], 

958 dtype='datetime64[ns]', freq=None) 

959 

960 - Timezone-aware inputs *with constant time offset* are converted to 

961 timezone-aware :class:`DatetimeIndex`: 

962 

963 >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500']) 

964 DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], 

965 dtype='datetime64[ns, UTC-05:00]', freq=None) 

966 

967 - However, timezone-aware inputs *with mixed time offsets* (for example 

968 issued from a timezone with daylight savings, such as Europe/Paris) 

969 are **not successfully converted** to a :class:`DatetimeIndex`. Instead a 

970 simple :class:`Index` containing :class:`datetime.datetime` objects is 

971 returned: 

972 

973 >>> pd.to_datetime(['2020-10-25 02:00 +0200', '2020-10-25 04:00 +0100']) 

974 Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00], 

975 dtype='object') 

976 

977 - A mix of timezone-aware and timezone-naive inputs is also converted to 

978 a simple :class:`Index` containing :class:`datetime.datetime` objects: 

979 

980 >>> from datetime import datetime 

981 >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)]) 

982 Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object') 

983 

984 | 

985 

986 Setting ``utc=True`` solves most of the above issues: 

987 

988 - Timezone-naive inputs are *localized* as UTC 

989 

990 >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True) 

991 DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'], 

992 dtype='datetime64[ns, UTC]', freq=None) 

993 

994 - Timezone-aware inputs are *converted* to UTC (the output represents the 

995 exact same datetime, but viewed from the UTC time offset `+00:00`). 

996 

997 >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'], 

998 ... utc=True) 

999 DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], 

1000 dtype='datetime64[ns, UTC]', freq=None) 

1001 

1002 - Inputs can contain both string or datetime, the above 

1003 rules still apply 

1004 

1005 >>> pd.to_datetime(['2018-10-26 12:00', datetime(2020, 1, 1, 18)], utc=True) 

1006 DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], 

1007 dtype='datetime64[ns, UTC]', freq=None) 

1008 """ 

1009 if exact is not lib.no_default and format in {"mixed", "ISO8601"}: 

1010 raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'") 

1011 if infer_datetime_format is not lib.no_default: 

1012 warnings.warn( 

1013 "The argument 'infer_datetime_format' is deprecated and will " 

1014 "be removed in a future version. " 

1015 "A strict version of it is now the default, see " 

1016 "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " 

1017 "You can safely remove this argument.", 

1018 stacklevel=find_stack_level(), 

1019 ) 

1020 if arg is None: 

1021 return None 

1022 

1023 if origin != "unix": 

1024 arg = _adjust_to_origin(arg, origin, unit) 

1025 

1026 convert_listlike = partial( 

1027 _convert_listlike_datetimes, 

1028 utc=utc, 

1029 unit=unit, 

1030 dayfirst=dayfirst, 

1031 yearfirst=yearfirst, 

1032 errors=errors, 

1033 exact=exact, 

1034 ) 

1035 # pylint: disable-next=used-before-assignment 

1036 result: Timestamp | NaTType | Series | Index 

1037 

1038 if isinstance(arg, Timestamp): 

1039 result = arg 

1040 if utc: 

1041 if arg.tz is not None: 

1042 result = arg.tz_convert("utc") 

1043 else: 

1044 result = arg.tz_localize("utc") 

1045 elif isinstance(arg, ABCSeries): 

1046 cache_array = _maybe_cache(arg, format, cache, convert_listlike) 

1047 if not cache_array.empty: 

1048 result = arg.map(cache_array) 

1049 else: 

1050 values = convert_listlike(arg._values, format) 

1051 result = arg._constructor(values, index=arg.index, name=arg.name) 

1052 elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)): 

1053 result = _assemble_from_unit_mappings(arg, errors, utc) 

1054 elif isinstance(arg, Index): 

1055 cache_array = _maybe_cache(arg, format, cache, convert_listlike) 

1056 if not cache_array.empty: 

1057 result = _convert_and_box_cache(arg, cache_array, name=arg.name) 

1058 else: 

1059 result = convert_listlike(arg, format, name=arg.name) 

1060 elif is_list_like(arg): 

1061 try: 

1062 # error: Argument 1 to "_maybe_cache" has incompatible type 

1063 # "Union[float, str, datetime, List[Any], Tuple[Any, ...], ExtensionArray, 

1064 # ndarray[Any, Any], Series]"; expected "Union[List[Any], Tuple[Any, ...], 

1065 # Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series], Series]" 

1066 argc = cast( 

1067 Union[list, tuple, ExtensionArray, np.ndarray, "Series", Index], arg 

1068 ) 

1069 cache_array = _maybe_cache(argc, format, cache, convert_listlike) 

1070 except OutOfBoundsDatetime: 

1071 # caching attempts to create a DatetimeIndex, which may raise 

1072 # an OOB. If that's the desired behavior, then just reraise... 

1073 if errors == "raise": 

1074 raise 

1075 # ... otherwise, continue without the cache. 

1076 from pandas import Series 

1077 

1078 cache_array = Series([], dtype=object) # just an empty array 

1079 if not cache_array.empty: 

1080 result = _convert_and_box_cache(argc, cache_array) 

1081 else: 

1082 result = convert_listlike(argc, format) 

1083 else: 

1084 result = convert_listlike(np.array([arg]), format)[0] 

1085 if isinstance(arg, bool) and isinstance(result, np.bool_): 

1086 result = bool(result) # TODO: avoid this kludge. 

1087 

1088 # error: Incompatible return value type (got "Union[Timestamp, NaTType, 

1089 # Series, Index]", expected "Union[DatetimeIndex, Series, float, str, 

1090 # NaTType, None]") 

1091 return result # type: ignore[return-value] 

1092 

1093 

# mappings for assembling units
#
# Every column spelling accepted by _assemble_from_unit_mappings (singular,
# plural, and abbreviated forms) mapped to its canonical unit label.
_unit_map = {
    alias: canonical
    for canonical, aliases in (
        ("year", ("year", "years")),
        ("month", ("month", "months")),
        ("day", ("day", "days")),
        ("h", ("hour", "hours")),
        ("m", ("minute", "minutes")),
        ("s", ("second", "seconds")),
        ("ms", ("ms", "millisecond", "milliseconds")),
        ("us", ("us", "microsecond", "microseconds")),
        ("ns", ("ns", "nanosecond", "nanoseconds")),
    )
    for alias in aliases
}

1118 

1119 

def _assemble_from_unit_mappings(arg, errors: DateTimeErrorChoices, utc: bool):
    """
    Assemble a datetime Series from unit-labelled columns of *arg*.

    Parameters
    ----------
    arg : DataFrame (or mapping convertible to one)
        Must supply at least year/month/day columns; may also carry
        hour/minute/second/ms/us/ns columns.  Plural and abbreviated
        spellings are accepted (see ``_unit_map``).
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'

        - If :const:`'raise'`, then invalid parsing will raise an exception
        - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`
        - If :const:`'ignore'`, then invalid parsing will return the input
    utc : bool
        Whether to convert/localize timestamps to UTC.

    Returns
    -------
    Series

    Raises
    ------
    ValueError
        On duplicate columns, missing year/month/day, unrecognized
        columns, or values that cannot be assembled into datetimes.
    """
    from pandas import (
        DataFrame,
        to_numeric,
        to_timedelta,
    )

    arg = DataFrame(arg)
    if not arg.columns.is_unique:
        raise ValueError("cannot assemble with duplicate keys")

    def canonical(key):
        # exact spelling first: "m" (minute) is case significant
        if key in _unit_map:
            return _unit_map[key]
        if key.lower() in _unit_map:
            return _unit_map[key.lower()]
        return key

    unit = {col: canonical(col) for col in arg.keys()}
    unit_rev = {canon: col for col, canon in unit.items()}

    # year/month/day are mandatory
    missing = sorted({"year", "month", "day"} - set(unit_rev.keys()))
    if missing:
        joined = ",".join(missing)
        raise ValueError(
            "to assemble mappings requires at least that "
            f"[year, month, day] be specified: [{joined}] is missing"
        )

    # reject columns we cannot interpret as a datetime unit
    unknown = sorted(set(unit_rev.keys()) - set(_unit_map.values()))
    if unknown:
        joined = ",".join(unknown)
        raise ValueError(
            f"extra keys have been passed to the datetime assemblage: [{joined}]"
        )

    def coerce(values):
        # we allow coercion to if errors allows
        values = to_numeric(values, errors=errors)

        # prevent overflow in case of int8 or int16
        if is_integer_dtype(values):
            values = values.astype("int64", copy=False)
        return values

    # pack Y/M/D into a single YYYYMMDD integer and parse that in one pass
    values = (
        coerce(arg[unit_rev["year"]]) * 10000
        + coerce(arg[unit_rev["month"]]) * 100
        + coerce(arg[unit_rev["day"]])
    )
    try:
        values = to_datetime(values, format="%Y%m%d", errors=errors, utc=utc)
    except (TypeError, ValueError) as err:
        raise ValueError(f"cannot assemble the datetimes: {err}") from err

    # fold any sub-daily components in as timedeltas
    for u in ("h", "m", "s", "ms", "us", "ns"):
        col = unit_rev.get(u)
        if col is not None and col in arg:
            try:
                values += to_timedelta(coerce(arg[col]), unit=u, errors=errors)
            except (TypeError, ValueError) as err:
                raise ValueError(
                    f"cannot assemble the datetimes [{col}]: {err}"
                ) from err
    return values

1212 

1213 

def _attempt_YYYYMMDD(arg: npt.NDArray[np.object_], errors: str) -> np.ndarray | None:
    """
    Try to parse *arg* as YYYYMMDD (%Y%m%d) integers.

    The array is object dtype but may really hold ints, int-like strings,
    floats with nan, or strings mixed with NaT-like sentinels; several
    conversion strategies are attempted in turn.

    Parameters
    ----------
    arg : np.ndarray[object]
    errors : {'raise','ignore','coerce'}

    Returns
    -------
    np.ndarray or None
        Parsed datetimes, or None when no strategy applied.
    """

    def parse_ints(int_arg):
        # split YYYYMMDD into year/month/day and hand off to the tslib parser
        int_arg = int_arg.astype(object, copy=False)
        parsed = parsing.try_parse_year_month_day(
            int_arg / 10000, int_arg / 100 % 100, int_arg % 100
        )
        return tslib.array_to_datetime(parsed, errors=errors)[0]

    def parse_masked(carg, mask):
        # parse only the valid positions; everything else becomes NaT
        out = np.empty(carg.shape, dtype="M8[ns]")
        out.view("i8")[~mask] = iNaT

        parsed = parse_ints(carg[mask].astype(np.float64).astype(np.int64))
        out[mask] = parsed.astype("M8[ns]")
        return out

    # 1) plain ints, or strings that are ints
    try:
        return parse_ints(arg.astype(np.int64))
    except (ValueError, OverflowError, TypeError):
        pass

    # 2) floats carrying actual np.nan
    try:
        as_float = arg.astype(np.float64)
        return parse_masked(as_float, notna(as_float))
    except (ValueError, OverflowError, TypeError):
        pass

    # 3) strings mixed with NaN-like sentinels
    try:
        # error: Argument 2 to "isin" has incompatible type "List[Any]"; expected
        # "Union[Union[ExtensionArray, ndarray], Index, Series]"
        valid = ~algorithms.isin(arg, list(nat_strings))  # type: ignore[arg-type]
        return parse_masked(arg, valid)
    except (ValueError, OverflowError, TypeError):
        pass

    return None

1266 

1267 

# Explicit public API of this module; limits what ``from ... import *``
# exposes to these three names.
__all__ = [
    "DateParseError",
    "should_cache",
    "to_datetime",
]