Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/arrow/parser.py: 82%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Provides the :class:`Arrow <arrow.parser.DateTimeParser>` class, a better way to parse datetime strings."""
3import re
4from datetime import datetime, timedelta, timezone
5from datetime import tzinfo as dt_tzinfo
6from functools import lru_cache
7from typing import (
8 Any,
9 ClassVar,
10 Dict,
11 Iterable,
12 List,
13 Literal,
14 Match,
15 Optional,
16 Pattern,
17 SupportsFloat,
18 SupportsInt,
19 Tuple,
20 TypedDict,
21 Union,
22 cast,
23 overload,
24)
26try:
27 from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
28except ImportError:
29 from backports.zoneinfo import ZoneInfo, ZoneInfoNotFoundError # type: ignore[no-redef]
31from arrow import locales
32from arrow.constants import DEFAULT_LOCALE
33from arrow.util import next_weekday, normalize_timestamp
class ParserError(ValueError):
    """Base error raised by this module when an input cannot be parsed.

    Notes:
        Derives from the built-in :class:`ValueError`, so code that already
        handles ``ValueError`` also handles parser failures.
    """
# ParserMatchError is the only error that _parse_multiformat() swallows while
# it tries each candidate format. Keeping it distinct from ParserError lets
# errors raised by _build_datetime() (for example an invalid day of year)
# reach the caller with their own message instead of being treated as
# "this format did not match".
class ParserMatchError(ParserError):
    """Raised when an input string does not match the requested format.

    Notes:
        Being a subclass of :class:`ParserError`, it is also caught by
        handlers written for ``ParserError`` or ``ValueError``.
    """
# Type of a single element (year, week or day) of a parsed ISO week date.
_WEEKDATE_ELEMENT = Union[str, bytes, SupportsInt, bytearray]

# Every formatting token the parser knows how to handle.
_FORMAT_TYPE = Literal[
    # year
    "YYYY", "YY",
    # month (numeric)
    "MM", "M",
    # day of year / day of month
    "DDDD", "DDD", "DD", "D",
    # hour (24h and 12h)
    "HH", "H", "hh", "h",
    # minute / second
    "mm", "m", "ss", "s",
    # timestamps
    "X", "x",
    # timezone
    "ZZZ", "ZZ", "Z",
    # sub-second / ISO week date
    "S", "W",
    # locale-dependent month names, ordinal day, weekday names and number
    "MMMM", "MMM", "Do", "dddd", "ddd", "d",
    # meridians
    "a", "A",
]
class _Parts(TypedDict, total=False):
    """Datetime components collected while parsing one input string.

    The dictionary is declared with ``total=False``: a key is present only if
    the corresponding token was parsed. :meth:`DateTimeParser._parse_token`
    fills it in and :meth:`DateTimeParser._build_datetime` turns it into a
    :class:`datetime.datetime`.

    :ivar year: The year.
    :ivar month: The month number.
    :ivar day_of_year: The ordinal day within the year (``DDD``/``DDDD``).
    :ivar day: The day of the month.
    :ivar hour: The hour as written in the input.
    :ivar minute: The minute.
    :ivar second: The second.
    :ivar microsecond: The microsecond, rounded from the sub-second digits.
    :ivar timestamp: A timestamp parsed from the ``X`` token.
    :ivar expanded_timestamp: An integer timestamp parsed from the ``x`` token.
    :ivar tzinfo: The timezone.
    :ivar am_pm: The meridian, normalised to ``"am"`` or ``"pm"``.
    :ivar day_of_week: The weekday index derived from a weekday name.
    :ivar weekdate: The ISO week date as ``(year, week, day)``; ``day`` may be ``None``.
    """

    year: int
    month: int
    day_of_year: int
    day: int
    hour: int
    minute: int
    second: int
    microsecond: int
    timestamp: float
    expanded_timestamp: int
    tzinfo: dt_tzinfo
    am_pm: Literal["am", "pm"]
    day_of_week: int
    weekdate: Tuple[_WEEKDATE_ELEMENT, _WEEKDATE_ELEMENT, Optional[_WEEKDATE_ELEMENT]]
class DateTimeParser:
    """Parse datetime strings into :class:`datetime.datetime` objects.

    Holds the regular expressions and helper methods that split an input
    string into tokens and assemble a datetime from them. The result is used
    internally by :class:`Arrow <arrow.arrow.Arrow>`.

    :param locale: the locale string
    :param cache_size: the size of the LRU cache used for regular expressions. Defaults to 0.

    """

    # Splits a format string into its individual tokens.
    _FORMAT_RE: ClassVar[Pattern[str]] = re.compile(
        r"(YYY?Y?|MM?M?M?|Do|DD?D?D?|d?d?d?d|HH?|hh?|mm?|ss?|S+|ZZ?Z?|a|A|x|X|W)"
    )
    # Finds bracket-escaped literal text such as "[at]" in a format string.
    _ESCAPE_RE: ClassVar[Pattern[str]] = re.compile(r"\[[^\[\]]*\]")

    # Building blocks for numeric tokens.
    _ONE_OR_TWO_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{1,2}")
    _ONE_OR_TWO_OR_THREE_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{1,3}")
    _ONE_OR_MORE_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d+")
    _TWO_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{2}")
    _THREE_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{3}")
    _FOUR_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{4}")

    # Timezone tokens: "Z" (no colon), "ZZ" (with colon) and "ZZZ" (name).
    _TZ_Z_RE: ClassVar[Pattern[str]] = re.compile(r"([\+\-])(\d{2})(?:(\d{2}))?|Z")
    _TZ_ZZ_RE: ClassVar[Pattern[str]] = re.compile(r"([\+\-])(\d{2})(?:\:(\d{2}))?|Z")
    _TZ_NAME_RE: ClassVar[Pattern[str]] = re.compile(r"\w[\w+\-/]+")

    # NOTE: timestamps cannot be parsed from natural language strings (by removing the ^...$) because it will
    # break cases like "15 Jul 2000" and a format list (see issue #447)
    _TIMESTAMP_RE: ClassVar[Pattern[str]] = re.compile(r"^\-?\d+\.?\d+$")
    _TIMESTAMP_EXPANDED_RE: ClassVar[Pattern[str]] = re.compile(r"^\-?\d+$")

    # Time portion of an ISO 8601-like string, basic or extended form.
    _TIME_RE: ClassVar[Pattern[str]] = re.compile(
        r"^(\d{2})(?:\:?(\d{2}))?(?:\:?(\d{2}))?(?:([\.\,])(\d+))?$"
    )
    # ISO week date, e.g. "2021-W41-2" or "2021W412".
    _WEEK_DATE_RE: ClassVar[Pattern[str]] = re.compile(
        r"(?P<year>\d{4})[\-]?W(?P<week>\d{2})[\-]?(?P<day>\d)?"
    )

    # Locale-independent token -> pattern table; __init__ copies it and adds
    # the locale-dependent tokens.
    _BASE_INPUT_RE_MAP: ClassVar[Dict[_FORMAT_TYPE, Pattern[str]]] = {
        "YYYY": _FOUR_DIGIT_RE,
        "YY": _TWO_DIGIT_RE,
        "MM": _TWO_DIGIT_RE,
        "M": _ONE_OR_TWO_DIGIT_RE,
        "DDDD": _THREE_DIGIT_RE,
        "DDD": _ONE_OR_TWO_OR_THREE_DIGIT_RE,
        "DD": _TWO_DIGIT_RE,
        "D": _ONE_OR_TWO_DIGIT_RE,
        "HH": _TWO_DIGIT_RE,
        "H": _ONE_OR_TWO_DIGIT_RE,
        "hh": _TWO_DIGIT_RE,
        "h": _ONE_OR_TWO_DIGIT_RE,
        "mm": _TWO_DIGIT_RE,
        "m": _ONE_OR_TWO_DIGIT_RE,
        "ss": _TWO_DIGIT_RE,
        "s": _ONE_OR_TWO_DIGIT_RE,
        "X": _TIMESTAMP_RE,
        "x": _TIMESTAMP_EXPANDED_RE,
        "ZZZ": _TZ_NAME_RE,
        "ZZ": _TZ_ZZ_RE,
        "Z": _TZ_Z_RE,
        "S": _ONE_OR_MORE_DIGIT_RE,
        "W": _WEEK_DATE_RE,
    }

    SEPARATORS: ClassVar[List[str]] = ["-", "/", "."]

    locale: locales.Locale
    _input_re_map: Dict[_FORMAT_TYPE, Pattern[str]]
209 def __init__(self, locale: str = DEFAULT_LOCALE, cache_size: int = 0) -> None:
210 """
211 Contains the regular expressions and functions to parse and split the input strings into tokens and eventually
212 produce a datetime that is used by :class:`Arrow <arrow.arrow.Arrow>` internally.
214 :param locale: the locale string
215 :type locale: str
216 :param cache_size: the size of the LRU cache used for regular expressions. Defaults to 0.
217 :type cache_size: int
218 """
219 self.locale = locales.get_locale(locale)
220 self._input_re_map = self._BASE_INPUT_RE_MAP.copy()
221 self._input_re_map.update(
222 {
223 "MMMM": self._generate_choice_re(
224 self.locale.month_names[1:], re.IGNORECASE
225 ),
226 "MMM": self._generate_choice_re(
227 self.locale.month_abbreviations[1:], re.IGNORECASE
228 ),
229 "Do": re.compile(self.locale.ordinal_day_re),
230 "dddd": self._generate_choice_re(
231 self.locale.day_names[1:], re.IGNORECASE
232 ),
233 "ddd": self._generate_choice_re(
234 self.locale.day_abbreviations[1:], re.IGNORECASE
235 ),
236 "d": re.compile(r"[1-7]"),
237 "a": self._generate_choice_re(
238 (self.locale.meridians["am"], self.locale.meridians["pm"])
239 ),
240 # note: 'A' token accepts both 'am/pm' and 'AM/PM' formats to
241 # ensure backwards compatibility of this token
242 "A": self._generate_choice_re(self.locale.meridians.values()),
243 }
244 )
245 if cache_size > 0:
246 self._generate_pattern_re = lru_cache(maxsize=cache_size)( # type: ignore
247 self._generate_pattern_re
248 )
250 # TODO: since we support more than ISO 8601, we should rename this function
251 # IDEA: break into multiple functions
252 def parse_iso(
253 self, datetime_string: str, normalize_whitespace: bool = False
254 ) -> datetime:
255 """
256 Parses a datetime string using a ISO 8601-like format.
258 :param datetime_string: The datetime string to parse.
259 :param normalize_whitespace: Whether to normalize whitespace in the datetime string (default is False).
260 :type datetime_string: str
261 :type normalize_whitespace: bool
262 :returns: The parsed datetime object.
263 :rtype: datetime
264 :raises ParserError: If the datetime string is not in a valid ISO 8601-like format.
266 Usage::
267 >>> import arrow.parser
268 >>> arrow.parser.DateTimeParser().parse_iso('2021-10-12T14:30:00')
269 datetime.datetime(2021, 10, 12, 14, 30)
271 """
272 if normalize_whitespace:
273 datetime_string = re.sub(r"\s+", " ", datetime_string.strip())
275 has_space_divider = " " in datetime_string
276 has_t_divider = "T" in datetime_string
278 num_spaces = datetime_string.count(" ")
279 if has_space_divider and num_spaces != 1 or has_t_divider and num_spaces > 0:
280 raise ParserError(
281 f"Expected an ISO 8601-like string, but was given {datetime_string!r}. "
282 "Try passing in a format string to resolve this."
283 )
285 has_time = has_space_divider or has_t_divider
286 has_tz = False
288 # date formats (ISO 8601 and others) to test against
289 # NOTE: YYYYMM is omitted to avoid confusion with YYMMDD (no longer part of ISO 8601, but is still often used)
290 formats = [
291 "YYYY-MM-DD",
292 "YYYY-M-DD",
293 "YYYY-M-D",
294 "YYYY/MM/DD",
295 "YYYY/M/DD",
296 "YYYY/M/D",
297 "YYYY.MM.DD",
298 "YYYY.M.DD",
299 "YYYY.M.D",
300 "YYYYMMDD",
301 "YYYY-DDDD",
302 "YYYYDDDD",
303 "YYYY-MM",
304 "YYYY/MM",
305 "YYYY.MM",
306 "YYYY",
307 "W",
308 ]
310 if has_time:
311 if has_space_divider:
312 date_string, time_string = datetime_string.split(" ", 1)
313 else:
314 date_string, time_string = datetime_string.split("T", 1)
316 time_parts = re.split(
317 r"[\+\-Z]", time_string, maxsplit=1, flags=re.IGNORECASE
318 )
320 time_components: Optional[Match[str]] = self._TIME_RE.match(time_parts[0])
322 if time_components is None:
323 raise ParserError(
324 "Invalid time component provided. "
325 "Please specify a format or provide a valid time component in the basic or extended ISO 8601 time format."
326 )
328 (
329 hours,
330 minutes,
331 seconds,
332 subseconds_sep,
333 subseconds,
334 ) = time_components.groups()
336 has_tz = len(time_parts) == 2
337 has_minutes = minutes is not None
338 has_seconds = seconds is not None
339 has_subseconds = subseconds is not None
341 is_basic_time_format = ":" not in time_parts[0]
342 tz_format = "Z"
344 # use 'ZZ' token instead since tz offset is present in non-basic format
345 if has_tz and ":" in time_parts[1]:
346 tz_format = "ZZ"
348 time_sep = "" if is_basic_time_format else ":"
350 if has_subseconds:
351 time_string = "HH{time_sep}mm{time_sep}ss{subseconds_sep}S".format(
352 time_sep=time_sep, subseconds_sep=subseconds_sep
353 )
354 elif has_seconds:
355 time_string = "HH{time_sep}mm{time_sep}ss".format(time_sep=time_sep)
356 elif has_minutes:
357 time_string = f"HH{time_sep}mm"
358 else:
359 time_string = "HH"
361 if has_space_divider:
362 formats = [f"{f} {time_string}" for f in formats]
363 else:
364 formats = [f"{f}T{time_string}" for f in formats]
366 if has_time and has_tz:
367 # Add "Z" or "ZZ" to the format strings to indicate to
368 # _parse_token() that a timezone needs to be parsed
369 formats = [f"{f}{tz_format}" for f in formats]
371 return self._parse_multiformat(datetime_string, formats)
373 def parse(
374 self,
375 datetime_string: str,
376 fmt: Union[List[str], str],
377 normalize_whitespace: bool = False,
378 ) -> datetime:
379 """
380 Parses a datetime string using a specified format.
382 :param datetime_string: The datetime string to parse.
383 :param fmt: The format string or list of format strings to use for parsing.
384 :param normalize_whitespace: Whether to normalize whitespace in the datetime string (default is False).
385 :type datetime_string: str
386 :type fmt: Union[List[str], str]
387 :type normalize_whitespace: bool
388 :returns: The parsed datetime object.
389 :rtype: datetime
390 :raises ParserMatchError: If the datetime string does not match the specified format.
392 Usage::
394 >>> import arrow.parser
395 >>> arrow.parser.DateTimeParser().parse('2021-10-12 14:30:00', 'YYYY-MM-DD HH:mm:ss')
396 datetime.datetime(2021, 10, 12, 14, 30)
399 """
400 if normalize_whitespace:
401 datetime_string = re.sub(r"\s+", " ", datetime_string)
403 if isinstance(fmt, list):
404 return self._parse_multiformat(datetime_string, fmt)
406 try:
407 fmt_tokens: List[_FORMAT_TYPE]
408 fmt_pattern_re: Pattern[str]
409 fmt_tokens, fmt_pattern_re = self._generate_pattern_re(fmt)
410 except re.error as e:
411 raise ParserMatchError(
412 f"Failed to generate regular expression pattern: {e}."
413 )
415 match = fmt_pattern_re.search(datetime_string)
417 if match is None:
418 raise ParserMatchError(
419 f"Failed to match {fmt!r} when parsing {datetime_string!r}."
420 )
422 parts: _Parts = {}
423 for token in fmt_tokens:
424 value: Union[Tuple[str, str, str], str]
425 if token == "Do":
426 value = match.group("value")
427 elif token == "W":
428 value = (match.group("year"), match.group("week"), match.group("day"))
429 else:
430 value = match.group(token)
432 if value is None:
433 raise ParserMatchError(
434 f"Unable to find a match group for the specified token {token!r}."
435 )
437 self._parse_token(token, value, parts) # type: ignore[arg-type]
439 return self._build_datetime(parts)
441 def _generate_pattern_re(self, fmt: str) -> Tuple[List[_FORMAT_TYPE], Pattern[str]]:
442 """
443 Generates a regular expression pattern from a format string.
445 :param fmt: The format string to convert into a regular expression pattern.
446 :type fmt: str
447 :returns: A tuple containing a list of format tokens and the corresponding regular expression pattern.
448 :rtype: Tuple[List[_FORMAT_TYPE], Pattern[str]]
449 :raises ParserError: If an unrecognized token is encountered in the format string.
450 """
451 # fmt is a string of tokens like 'YYYY-MM-DD'
452 # we construct a new string by replacing each
453 # token by its pattern:
454 # 'YYYY-MM-DD' -> '(?P<YYYY>\d{4})-(?P<MM>\d{2})-(?P<DD>\d{2})'
455 tokens: List[_FORMAT_TYPE] = []
456 offset = 0
458 # Escape all special RegEx chars
459 escaped_fmt = re.escape(fmt)
461 # Extract the bracketed expressions to be reinserted later.
462 escaped_fmt = re.sub(self._ESCAPE_RE, "#", escaped_fmt)
464 # Any number of S is the same as one.
465 # TODO: allow users to specify the number of digits to parse
466 escaped_fmt = re.sub(r"S+", "S", escaped_fmt)
468 escaped_data = re.findall(self._ESCAPE_RE, fmt)
470 fmt_pattern = escaped_fmt
472 for m in self._FORMAT_RE.finditer(escaped_fmt):
473 token: _FORMAT_TYPE = cast(_FORMAT_TYPE, m.group(0))
474 try:
475 input_re = self._input_re_map[token]
476 except KeyError:
477 raise ParserError(f"Unrecognized token {token!r}.")
478 input_pattern = f"(?P<{token}>{input_re.pattern})"
479 tokens.append(token)
480 # a pattern doesn't have the same length as the token
481 # it replaces! We keep the difference in the offset variable.
482 # This works because the string is scanned left-to-right and matches
483 # are returned in the order found by finditer.
484 fmt_pattern = (
485 fmt_pattern[: m.start() + offset]
486 + input_pattern
487 + fmt_pattern[m.end() + offset :]
488 )
489 offset += len(input_pattern) - (m.end() - m.start())
491 final_fmt_pattern = ""
492 split_fmt = fmt_pattern.split(r"\#")
494 # Due to the way Python splits, 'split_fmt' will always be longer
495 for i in range(len(split_fmt)):
496 final_fmt_pattern += split_fmt[i]
497 if i < len(escaped_data):
498 final_fmt_pattern += escaped_data[i][1:-1]
500 # Wrap final_fmt_pattern in a custom word boundary to strictly
501 # match the formatting pattern and filter out date and time formats
502 # that include junk such as: blah1998-09-12 blah, blah 1998-09-12blah,
503 # blah1998-09-12blah. The custom word boundary matches every character
504 # that is not a whitespace character to allow for searching for a date
505 # and time string in a natural language sentence. Therefore, searching
506 # for a string of the form YYYY-MM-DD in "blah 1998-09-12 blah" will
507 # work properly.
508 # Certain punctuation before or after the target pattern such as
509 # "1998-09-12," is permitted. For the full list of valid punctuation,
510 # see the documentation.
512 starting_word_boundary = (
513 r"(?<!\S\S)" # Don't have two consecutive non-whitespace characters. This ensures that we allow cases
514 # like .11.25.2019 but not 1.11.25.2019 (for pattern MM.DD.YYYY)
515 r"(?<![^\,\.\;\:\?\!\"\'\`\[\]\{\}\(\)<>\s])" # This is the list of punctuation that is ok before the
516 # pattern (i.e. "It can't not be these characters before the pattern")
517 r"(\b|^)"
518 # The \b is to block cases like 1201912 but allow 201912 for pattern YYYYMM. The ^ was necessary to allow a
519 # negative number through i.e. before epoch numbers
520 )
521 ending_word_boundary = (
522 r"(?=[\,\.\;\:\?\!\"\'\`\[\]\{\}\(\)\<\>]?" # Positive lookahead stating that these punctuation marks
523 # can appear after the pattern at most 1 time
524 r"(?!\S))" # Don't allow any non-whitespace character after the punctuation
525 )
526 bounded_fmt_pattern = r"{}{}{}".format(
527 starting_word_boundary, final_fmt_pattern, ending_word_boundary
528 )
530 return tokens, re.compile(bounded_fmt_pattern, flags=re.IGNORECASE)
532 @overload
533 def _parse_token(
534 self,
535 token: Literal[
536 "YYYY",
537 "YY",
538 "MM",
539 "M",
540 "DDDD",
541 "DDD",
542 "DD",
543 "D",
544 "Do",
545 "HH",
546 "hh",
547 "h",
548 "H",
549 "mm",
550 "m",
551 "ss",
552 "s",
553 "x",
554 ],
555 value: Union[str, bytes, SupportsInt, bytearray],
556 parts: _Parts,
557 ) -> None: ... # pragma: no cover
559 @overload
560 def _parse_token(
561 self,
562 token: Literal["X"],
563 value: Union[str, bytes, SupportsFloat, bytearray],
564 parts: _Parts,
565 ) -> None: ... # pragma: no cover
567 @overload
568 def _parse_token(
569 self,
570 token: Literal["MMMM", "MMM", "dddd", "ddd", "S"],
571 value: Union[str, bytes, bytearray],
572 parts: _Parts,
573 ) -> None: ... # pragma: no cover
575 @overload
576 def _parse_token(
577 self,
578 token: Literal["a", "A", "ZZZ", "ZZ", "Z"],
579 value: Union[str, bytes],
580 parts: _Parts,
581 ) -> None: ... # pragma: no cover
583 @overload
584 def _parse_token(
585 self,
586 token: Literal["W"],
587 value: Tuple[_WEEKDATE_ELEMENT, _WEEKDATE_ELEMENT, Optional[_WEEKDATE_ELEMENT]],
588 parts: _Parts,
589 ) -> None: ... # pragma: no cover
591 def _parse_token(
592 self,
593 token: Any,
594 value: Any,
595 parts: _Parts,
596 ) -> None:
597 """
598 Parse a token and its value, and update the `_Parts` dictionary with the parsed values.
600 The function supports several tokens, including "YYYY", "YY", "MMMM", "MMM", "MM", "M", "DDDD", "DDD", "DD", "D", "Do", "dddd", "ddd", "HH", "H", "mm", "m", "ss", "s", "S", "X", "x", "ZZZ", "ZZ", "Z", "a", "A", and "W". Each token is matched and the corresponding value is parsed and added to the `_Parts` dictionary.
602 :param token: The token to parse.
603 :type token: Any
604 :param value: The value of the token.
605 :type value: Any
606 :param parts: A dictionary to update with the parsed values.
607 :type parts: _Parts
608 :raises ParserMatchError: If the hour token value is not between 0 and 12 inclusive for tokens "a" or "A".
610 """
611 if token == "YYYY":
612 parts["year"] = int(value)
614 elif token == "YY":
615 value = int(value)
616 parts["year"] = 1900 + value if value > 68 else 2000 + value
618 elif token in ["MMMM", "MMM"]:
619 # FIXME: month_number() is nullable
620 parts["month"] = self.locale.month_number(value.lower()) # type: ignore[typeddict-item]
622 elif token in ["MM", "M"]:
623 parts["month"] = int(value)
625 elif token in ["DDDD", "DDD"]:
626 parts["day_of_year"] = int(value)
628 elif token in ["DD", "D"]:
629 parts["day"] = int(value)
631 elif token == "Do":
632 parts["day"] = int(value)
634 elif token == "dddd":
635 # locale day names are 1-indexed
636 day_of_week = [x.lower() for x in self.locale.day_names].index(
637 value.lower()
638 )
639 parts["day_of_week"] = day_of_week - 1
641 elif token == "ddd":
642 # locale day abbreviations are 1-indexed
643 day_of_week = [x.lower() for x in self.locale.day_abbreviations].index(
644 value.lower()
645 )
646 parts["day_of_week"] = day_of_week - 1
648 elif token.upper() in ["HH", "H"]:
649 parts["hour"] = int(value)
651 elif token in ["mm", "m"]:
652 parts["minute"] = int(value)
654 elif token in ["ss", "s"]:
655 parts["second"] = int(value)
657 elif token == "S":
658 # We have the *most significant* digits of an arbitrary-precision integer.
659 # We want the six most significant digits as an integer, rounded.
660 # IDEA: add nanosecond support somehow? Need datetime support for it first.
661 value = value.ljust(7, "0")
663 # floating-point (IEEE-754) defaults to half-to-even rounding
664 seventh_digit = int(value[6])
665 if seventh_digit == 5:
666 rounding = int(value[5]) % 2
667 elif seventh_digit > 5:
668 rounding = 1
669 else:
670 rounding = 0
672 parts["microsecond"] = int(value[:6]) + rounding
674 elif token == "X":
675 parts["timestamp"] = float(value)
677 elif token == "x":
678 parts["expanded_timestamp"] = int(value)
680 elif token in ["ZZZ", "ZZ", "Z"]:
681 parts["tzinfo"] = TzinfoParser.parse(value)
683 elif token in ["a", "A"]:
684 if value in (self.locale.meridians["am"], self.locale.meridians["AM"]):
685 parts["am_pm"] = "am"
686 if "hour" in parts and not 0 <= parts["hour"] <= 12:
687 raise ParserMatchError(
688 f"Hour token value must be between 0 and 12 inclusive for token {token!r}."
689 )
690 elif value in (self.locale.meridians["pm"], self.locale.meridians["PM"]):
691 parts["am_pm"] = "pm"
692 elif token == "W":
693 parts["weekdate"] = value
695 @staticmethod
696 def _build_datetime(parts: _Parts) -> datetime:
697 """
698 Build a datetime object from a dictionary of date parts.
700 :param parts: A dictionary containing the date parts extracted from a date string.
701 :type parts: dict
702 :return: A datetime object representing the date and time.
703 :rtype: datetime.datetime
704 """
705 weekdate = parts.get("weekdate")
707 if weekdate is not None:
708 year, week = int(weekdate[0]), int(weekdate[1])
710 if weekdate[2] is not None:
711 _day = int(weekdate[2])
712 else:
713 # day not given, default to 1
714 _day = 1
716 date_string = f"{year}-{week}-{_day}"
718 # tokens for ISO 8601 weekdates
719 dt = datetime.strptime(date_string, "%G-%V-%u")
721 parts["year"] = dt.year
722 parts["month"] = dt.month
723 parts["day"] = dt.day
725 timestamp = parts.get("timestamp")
727 if timestamp is not None:
728 return datetime.fromtimestamp(timestamp, tz=timezone.utc)
730 expanded_timestamp = parts.get("expanded_timestamp")
732 if expanded_timestamp is not None:
733 return datetime.fromtimestamp(
734 normalize_timestamp(expanded_timestamp),
735 tz=timezone.utc,
736 )
738 day_of_year = parts.get("day_of_year")
740 if day_of_year is not None:
741 _year = parts.get("year")
742 month = parts.get("month")
743 if _year is None:
744 raise ParserError(
745 "Year component is required with the DDD and DDDD tokens."
746 )
748 if month is not None:
749 raise ParserError(
750 "Month component is not allowed with the DDD and DDDD tokens."
751 )
753 date_string = f"{_year}-{day_of_year}"
754 try:
755 dt = datetime.strptime(date_string, "%Y-%j")
756 except ValueError:
757 raise ParserError(
758 f"The provided day of year {day_of_year!r} is invalid."
759 )
761 parts["year"] = dt.year
762 parts["month"] = dt.month
763 parts["day"] = dt.day
765 day_of_week: Optional[int] = parts.get("day_of_week")
766 day = parts.get("day")
768 # If day is passed, ignore day of week
769 if day_of_week is not None and day is None:
770 year = parts.get("year", 1970)
771 month = parts.get("month", 1)
772 day = 1
774 # dddd => first day of week after epoch
775 # dddd YYYY => first day of week in specified year
776 # dddd MM YYYY => first day of week in specified year and month
777 # dddd MM => first day after epoch in specified month
778 next_weekday_dt = next_weekday(datetime(year, month, day), day_of_week)
779 parts["year"] = next_weekday_dt.year
780 parts["month"] = next_weekday_dt.month
781 parts["day"] = next_weekday_dt.day
783 am_pm = parts.get("am_pm")
784 hour = parts.get("hour", 0)
786 if am_pm == "pm" and hour < 12:
787 hour += 12
788 elif am_pm == "am" and hour == 12:
789 hour = 0
791 # Support for midnight at the end of day
792 if hour == 24:
793 if parts.get("minute", 0) != 0:
794 raise ParserError("Midnight at the end of day must not contain minutes")
795 if parts.get("second", 0) != 0:
796 raise ParserError("Midnight at the end of day must not contain seconds")
797 if parts.get("microsecond", 0) != 0:
798 raise ParserError(
799 "Midnight at the end of day must not contain microseconds"
800 )
801 hour = 0
802 day_increment = 1
803 else:
804 day_increment = 0
806 # account for rounding up to 1000000
807 microsecond = parts.get("microsecond", 0)
808 if microsecond == 1000000:
809 microsecond = 0
810 second_increment = 1
811 else:
812 second_increment = 0
814 increment = timedelta(days=day_increment, seconds=second_increment)
816 return (
817 datetime(
818 year=parts.get("year", 1),
819 month=parts.get("month", 1),
820 day=parts.get("day", 1),
821 hour=hour,
822 minute=parts.get("minute", 0),
823 second=parts.get("second", 0),
824 microsecond=microsecond,
825 tzinfo=parts.get("tzinfo"),
826 )
827 + increment
828 )
830 def _parse_multiformat(self, string: str, formats: Iterable[str]) -> datetime:
831 """
832 Parse a date and time string using multiple formats.
834 Tries to parse the provided string with each format in the given `formats`
835 iterable, returning the resulting `datetime` object if a match is found. If no
836 format matches the string, a `ParserError` is raised.
838 :param string: The date and time string to parse.
839 :type string: str
840 :param formats: An iterable of date and time format strings to try, in order.
841 :type formats: Iterable[str]
842 :returns: The parsed date and time.
843 :rtype: datetime.datetime
844 :raises ParserError: If no format matches the input string.
845 """
846 _datetime: Optional[datetime] = None
848 for fmt in formats:
849 try:
850 _datetime = self.parse(string, fmt)
851 break
852 except ParserMatchError:
853 pass
855 if _datetime is None:
856 supported_formats = ", ".join(formats)
857 raise ParserError(
858 f"Could not match input {string!r} to any of the following formats: {supported_formats}."
859 )
861 return _datetime
863 # generates a capture group of choices separated by an OR operator
864 @staticmethod
865 def _generate_choice_re(
866 choices: Iterable[str], flags: Union[int, re.RegexFlag] = 0
867 ) -> Pattern[str]:
868 """
869 Generate a regular expression pattern that matches a choice from an iterable.
871 Takes an iterable of strings (`choices`) and returns a compiled regular expression
872 pattern that matches any of the choices. The pattern is created by joining the
873 choices with the '|' (OR) operator, which matches any of the enclosed patterns.
875 :param choices: An iterable of strings to match.
876 :type choices: Iterable[str]
877 :param flags: Optional regular expression flags. Default is 0.
878 :type flags: Union[int, re.RegexFlag], optional
879 :returns: A compiled regular expression pattern that matches any of the choices.
880 :rtype: re.Pattern[str]
881 """
882 return re.compile(r"({})".format("|".join(choices)), flags=flags)
class TzinfoParser:
    """
    Parser for timezone information.

    Accepts the literal ``"local"``, the UTC spellings ``"utc"``, ``"UTC"``
    and ``"Z"``, numeric offsets such as ``"+01:00"`` or ``"-0500"``, and
    otherwise treats the string as a key for :class:`zoneinfo.ZoneInfo`.
    """

    # Optional "(UTC" prefix, optional sign, two-digit hours and optional
    # two-digit minutes (with or without a colon). Anchored at the start only.
    _TZINFO_RE: ClassVar[Pattern[str]] = re.compile(
        r"^(?:\(UTC)*([\+\-])?(\d{2})(?:\:?(\d{2}))?"
    )

    @classmethod
    def parse(cls, tzinfo_string: str) -> dt_tzinfo:
        """
        Parse a timezone string and return a datetime timezone object.

        :param tzinfo_string: The timezone string to parse.
        :type tzinfo_string: str
        :returns: The parsed datetime timezone object.
        :rtype: datetime.timezone
        :raises ParserError: If the timezone string cannot be parsed. This
            includes numeric offsets that :class:`datetime.timezone` rejects
            and keys that :class:`zoneinfo.ZoneInfo` rejects.
        """
        tzinfo: Optional[dt_tzinfo] = None

        if tzinfo_string == "local":
            tzinfo = datetime.now().astimezone().tzinfo

        elif tzinfo_string in ["utc", "UTC", "Z"]:
            tzinfo = timezone.utc

        else:
            iso_match = cls._TZINFO_RE.match(tzinfo_string)

            if iso_match:
                sign: Optional[str]
                hours: str
                minutes: Union[str, int, None]
                sign, hours, minutes = iso_match.groups()
                seconds = int(hours) * 3600 + int(minutes or 0) * 60

                if sign == "-":
                    seconds *= -1

                try:
                    tzinfo = timezone(timedelta(seconds=seconds))
                except ValueError:
                    # timezone() rejects offsets of 24 hours or more (e.g.
                    # "+25:00"); report that as a parse failure below.
                    tzinfo = None

            else:
                try:
                    tzinfo = ZoneInfo(tzinfo_string)
                except (ZoneInfoNotFoundError, ValueError):
                    # Unknown key, or a key ZoneInfo refuses outright (such
                    # as an absolute or non-normalised path).
                    tzinfo = None

        if tzinfo is None:
            raise ParserError(f"Could not parse timezone expression {tzinfo_string!r}.")

        return tzinfo