1import calendar
2from collections import OrderedDict
3from datetime import datetime, timedelta, timezone
4from io import StringIO
5
6import pytz
7import regex as re
8
9from dateparser.utils import (
10 _get_missing_parts,
11 get_last_day_of_month,
12 get_next_leap_year,
13 get_previous_leap_year,
14 get_timezone_from_tz_string,
15 set_correct_day_from_settings,
16 set_correct_month_from_settings,
17)
18from dateparser.utils.strptime import strptime
19
# Matches a run of one or more non-digit characters; used to decide whether a
# string may be handed to the no-spaces parser.
NSP_COMPATIBLE = re.compile(r"\D+")
# Matches a lowercase meridian marker ("am" or "pm") anywhere in a token.
MERIDIAN = re.compile(r"am|pm")
# Matches a run of one to six digits (used for the fractional-seconds token).
MICROSECOND = re.compile(r"\d{1,6}")
# Matches a string consisting of eight digits from start to end.
EIGHT_DIGIT = re.compile(r"^\d{8}$")
# Matches "H:MM" / "HH:MM" with hours 0-23 and minutes 00-59.
HOUR_MINUTE_REGEX = re.compile(r"^([0-9]|0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]$")
25
26
def no_space_parser_eligibile(datestring):
    """Tell whether ``datestring`` may be handed to the no-spaces parser.

    A string is eligible when it contains no non-digit characters at all, or
    when its first run of non-digit characters is exactly a single ":".

    Returns:
        bool: True if eligible, False otherwise.
    """
    first_non_digit_run = NSP_COMPATIBLE.search(datestring)
    if not first_non_digit_run:
        # Digits only.
        return True
    return first_non_digit_run.group() == ":"
32
33
def get_unresolved_attrs(parser_object):
    """Split the date attributes of ``parser_object`` into set and unset ones.

    The attributes "year", "month" and "day" are inspected in that order. An
    attribute counts as unset when it is missing or is ``None``; any other
    value (including 0) counts as set.

    Returns:
        tuple: ``(seen, unseen)`` — two lists of attribute names.
    """
    seen, unseen = [], []
    for name in ("year", "month", "day"):
        bucket = unseen if getattr(parser_object, name, None) is None else seen
        bucket.append(name)
    return seen, unseen
44
45
# Maps each date-order code (a permutation of D, M, Y) to the matching
# sequence of strptime-style directives, using the two-digit-year "%y".
date_order_chart = {
    "DMY": "%d%m%y",
    "DYM": "%d%y%m",
    "MDY": "%m%d%y",
    "MYD": "%m%y%d",
    "YDM": "%y%d%m",
    "YMD": "%y%m%d",
}
54
55
def resolve_date_order(order, lst=None):
    """Translate a date-order code such as "DMY" into a concrete ordering.

    Args:
        order: One of "DMY", "DYM", "MDY", "MYD", "YDM", "YMD".
        lst: When truthy, return a list of component names; otherwise return
            the directive string stored in ``date_order_chart``.

    Returns:
        list | str: e.g. ``["day", "month", "year"]`` or ``"%d%m%y"``.

    Raises:
        KeyError: If ``order`` is not one of the known codes.
    """
    if not lst:
        return date_order_chart[order]

    day, month, year = "day", "month", "year"
    # Built per call, so every caller receives its own list.
    components_by_order = {
        "DMY": [day, month, year],
        "DYM": [day, year, month],
        "MDY": [month, day, year],
        "MYD": [month, year, day],
        "YDM": [year, day, month],
        "YMD": [year, month, day],
    }
    return components_by_order[order]
67
68
def _parse_absolute(datestring, settings, tz=None):
    """Parse ``datestring`` by delegating to ``_parser.parse``.

    All three arguments are forwarded unchanged and the delegate's result is
    returned as-is.
    """
    return _parser.parse(datestring, settings, tz)
71
72
def _parse_nospaces(datestring, settings, tz=None):
    """Parse ``datestring`` by delegating to ``_no_spaces_parser.parse``.

    ``tz`` is accepted but is not forwarded: only ``datestring`` and
    ``settings`` reach the delegate.
    """
    return _no_spaces_parser.parse(datestring, settings)
75
76
class _time_parser:
    """Callable that turns a time string into a time object.

    The candidate layouts in ``time_directives`` are tried in order and the
    first one that ``strptime`` accepts wins.
    """

    # Candidate layouts, tried top to bottom.
    time_directives = [
        "%H:%M:%S",
        "%I:%M:%S %p",
        "%H:%M",
        "%I:%M %p",
        "%I %p",
        "%H:%M:%S.%f",
        "%I:%M:%S.%f %p",
        "%H:%M %p",
    ]

    def __call__(self, timestring):
        """Parse ``timestring`` (surrounding whitespace is ignored).

        Returns:
            The ``.time()`` of the first successful ``strptime`` result.

        Raises:
            ValueError: If none of the layouts matches.
        """
        candidate = timestring.strip()
        for layout in self.time_directives:
            try:
                return strptime(candidate, layout).time()
            except ValueError:
                # Not this layout; try the next one.
                continue
        raise ValueError("%s does not seem to be a valid time string" % timestring)


# Shared module-level instance.
time_parser = _time_parser()
101
102
class _no_spaces_parser:
    """Parser for date strings written as one unbroken run of digits.

    Candidate layouts are tried in an order that depends on the configured
    date order; the first layout that ``strptime`` accepts and that yields a
    four-digit year wins.
    """

    # Date-only layouts: four-digit-year variants first, then two-digit ones.
    _dateformats = [
        "%Y%m%d",
        "%Y%d%m",
        "%m%Y%d",
        "%m%d%Y",
        "%d%Y%m",
        "%d%m%Y",
        "%y%m%d",
        "%y%d%m",
        "%m%y%d",
        "%m%d%y",
        "%d%y%m",
        "%d%m%y",
    ]

    # Date-time layouts placed at the very front of the "%m%d%y" candidate list.
    _preferred_formats = ["%Y%m%d%H%M", "%Y%m%d%H%M%S", "%Y%m%d%H%M%S.%f"]

    # Layouts tried, in this order, for inputs made of exactly eight digits.
    _preferred_formats_ordered_8_digit = [
        "%m%d%Y",
        "%d%m%Y",
        "%Y%m%d",
        "%Y%d%m",
        "%m%Y%d",
        "%d%Y%m",
    ]

    _timeformats = ["%H%M%S.%f", "%H%M%S", "%H%M", "%H"]

    # Directives whose presence in a layout decides the reported period.
    period = {"day": ["%d", "%H", "%M", "%S"], "month": ["%m"]}

    _default_order = resolve_date_order("MDY")

    def __init__(self, *args, **kwargs):
        """Build the per-order candidate lists (``date_formats``).

        Positional and keyword arguments are accepted but ignored.
        """
        # Every layout this parser knows: dates, date+time combinations, times.
        self._all = (
            self._dateformats
            + [x + y for x in self._dateformats for y in self._timeformats]
            + self._timeformats
        )

        def prioritised(order):
            # Stable sort: layouts that start with ``order`` (compared
            # case-insensitively, so "%Y" and "%y" both count) move to the
            # front; relative order is otherwise kept.
            return sorted(
                self._all,
                key=lambda fmt: fmt.lower().startswith(order),
                reverse=True,
            )

        self.date_formats = {
            "%m%d%y": self._preferred_formats + prioritised("%m%d%y"),
            "%m%y%d": prioritised("%m%y%d"),
            "%y%m%d": prioritised("%y%m%d"),
            "%y%d%m": prioritised("%y%d%m"),
            "%d%m%y": prioritised("%d%m%y"),
            "%d%y%m": prioritised("%d%y%m"),
        }

    @classmethod
    def _get_period(cls, format_string):
        """Return the period ("day", "month" or "year") implied by a layout.

        Period names are checked in alphabetical order, so "day" wins over
        "month"; "year" is returned when no listed directive is present.
        """
        for pname, pdrv in sorted(cls.period.items(), key=lambda x: x[0]):
            for drv in pdrv:
                if drv in format_string:
                    return pname
        return "year"

    @classmethod
    def _find_best_matching_date(cls, datestring):
        """Try the eight-digit layouts and return the first four-digit-year hit.

        Returns:
            tuple | None: ``(parsed_object, period)`` or ``None`` when no
            layout produces a year with exactly four digits.
        """
        for fmt in cls._preferred_formats_ordered_8_digit:
            try:
                dt = strptime(datestring, fmt), cls._get_period(fmt)
                if len(str(dt[0].year)) == 4:
                    return dt
            except Exception:
                # Best effort: a layout that does not fit is simply skipped.
                # ``Exception`` (not a bare ``except``) so KeyboardInterrupt
                # and SystemExit are no longer swallowed.
                pass
        return None

    @classmethod
    def parse(cls, datestring, settings):
        """Parse a digits-only (optionally colon-separated) date string.

        Args:
            datestring: The text to parse.
            settings: Object providing ``DATE_ORDER`` plus the attributes read
                by ``_check_strict_parsing``.

        Returns:
            tuple: ``(parsed_object, period)`` where ``parsed_object`` is what
            ``strptime`` returned and ``period`` is "day", "month" or "year".

        Raises:
            ValueError: If the string is not eligible, is empty once colons
                are removed, or matches no layout.
        """
        if not no_space_parser_eligibile(datestring):
            raise ValueError("Unable to parse date from: %s" % datestring)

        datestring = datestring.replace(":", "")
        if not datestring:
            raise ValueError("Empty string")
        tokens = tokenizer(datestring)
        if settings.DATE_ORDER:
            order = resolve_date_order(settings.DATE_ORDER)
        else:
            order = cls._default_order
        if EIGHT_DIGIT.match(datestring):
            dt = cls._find_best_matching_date(datestring)
            if dt is not None:
                return dt
        nsp = cls()
        ambiguous_date = None
        for token, _ in tokens.tokenize():
            for fmt in nsp.date_formats[order]:
                try:
                    dt = strptime(token, fmt), cls._get_period(fmt)
                    if len(str(dt[0].year)) < 4:
                        # Remember it as a fallback, but keep looking for a
                        # layout that yields a four-digit year.
                        ambiguous_date = dt
                        continue

                    missing = _get_missing_parts(fmt)
                    _check_strict_parsing(missing, settings)
                    return dt
                except Exception:
                    # Covers both "layout does not fit" and a strict-parsing
                    # rejection: either way the next layout is tried.
                    # ``Exception`` (not a bare ``except``) so
                    # KeyboardInterrupt and SystemExit propagate.
                    pass

        if ambiguous_date:
            return ambiguous_date
        raise ValueError("Unable to parse date from: %s" % datestring)
226
227
def _get_missing_error(missing):
    """Build the error message naming the date fields that were not found.

    Args:
        missing: Iterable of field-name strings.

    Returns:
        str: The message, with the field names joined by ", ".
    """
    field_list = ", ".join(missing)
    return "Fields missing from the date string: {}".format(field_list)
230
231
def _check_strict_parsing(missing, settings):
    """Reject a parse result that lacks fields the settings insist on.

    Args:
        missing: List of field names absent from the parsed string.
        settings: Object exposing ``STRICT_PARSING`` and ``REQUIRE_PARTS``.

    Raises:
        ValueError: If ``STRICT_PARSING`` is on and anything is missing, or if
            any entry of ``REQUIRE_PARTS`` is among the missing fields.
    """
    strict = settings.STRICT_PARSING
    if strict and missing:
        raise ValueError(_get_missing_error(missing))

    required = settings.REQUIRE_PARTS
    if not (required and missing):
        return

    # Report only the required parts that are actually missing.
    absent_required = [part for part in required if part in missing]
    if absent_required:
        raise ValueError(_get_missing_error(absent_required))
239
240
class _parser:
    """Token-based parser that assembles a ``datetime`` from a date string.

    The constructor walks the tokens and classifies each one as a time, a
    numeric date component, or an alphabetic component (month or weekday
    name). ``parse`` then builds the datetime, applies the settings-driven
    corrections and reports the period of the result.
    """

    # strptime directives tried for alphabetic tokens, in this order.
    alpha_directives = OrderedDict(
        [
            ("weekday", ["%A", "%a"]),
            ("month", ["%B", "%b"]),
        ]
    )

    # strptime directives tried for numeric tokens, per component.
    num_directives = {
        "month": ["%m"],
        "day": ["%d"],
        "year": ["%y", "%Y"],
    }

    def __init__(self, tokens, settings):
        """Classify ``tokens`` and populate day/month/year/time.

        Args:
            tokens: Iterable of ``(text, type)`` pairs as yielded by
                ``tokenizer.tokenize`` (type 0: digits/colon, 1: letters,
                2: anything else).
            settings: Settings object; ``DATE_ORDER`` is read here.

        Raises:
            ValueError: If a token cannot be interpreted (raised by
                ``_parse``).
        """
        self.settings = settings
        self.tokens = [(t[0].strip(), t[1]) for t in tokens]
        # Only digit (0) and letter (1) tokens, each with its index in
        # ``self.tokens`` so neighbouring separators can be inspected.
        self.filtered_tokens = [
            (t[0], t[1], i) for i, t in enumerate(self.tokens) if t[1] <= 1
        ]

        self.unset_tokens = []

        self.day = None
        self.month = None
        self.year = None
        self.time = None

        self.auto_order = []

        self._token_day = None
        self._token_month = None
        self._token_year = None
        self._token_time = None

        # Numeric components are tried in the configured date order.
        self.ordered_num_directives = OrderedDict(
            (k, self.num_directives[k])
            for k in (resolve_date_order(settings.DATE_ORDER, lst=True))
        )

        skip_index = []
        skip_component = None
        skip_tokens = ["t", "year", "hour", "minute"]

        for index, (token, token_type, original_index) in enumerate(
            self.filtered_tokens
        ):
            if index in skip_index:
                continue

            if token in skip_tokens:
                continue

            if self.time is None:
                meridian_index = index + 1

                try:
                    # try case where hours and minutes are separated by a period. Example: 13.20.
                    _is_before_period = self.tokens[original_index + 1][0] == "."
                    _is_after_period = (
                        original_index != 0
                        and self.tokens[original_index - 1][0] == "."
                    )

                    if _is_before_period and not _is_after_period:
                        index_next_token = index + 1
                        next_token = self.filtered_tokens[index_next_token][0]
                        index_in_tokens_for_next_token = self.filtered_tokens[
                            index_next_token
                        ][2]

                        next_token_is_last = (
                            index_next_token == len(self.filtered_tokens) - 1
                        )
                        if (
                            next_token_is_last
                            or self.tokens[index_in_tokens_for_next_token + 1][0] != "."
                        ):
                            new_token = token + ":" + next_token
                            if HOUR_MINUTE_REGEX.match(new_token):
                                token = new_token
                                skip_index.append(index + 1)
                                meridian_index += 1
                except Exception:
                    pass

                try:
                    microsecond = MICROSECOND.search(
                        self.filtered_tokens[index + 1][0]
                    ).group()
                    # Is after time token? raise ValueError if ':' can't be found:
                    token.index(":")
                    # Is after period? raise ValueError if '.' can't be found:
                    self.tokens[self.tokens.index((token, 0)) + 1][0].index(".")
                except Exception:
                    # Any failed lookup above means "no microsecond part".
                    microsecond = None

                if microsecond:
                    meridian_index += 1

                try:
                    meridian = MERIDIAN.search(
                        self.filtered_tokens[meridian_index][0]
                    ).group()
                except Exception:
                    # No token at that position, or it holds no am/pm marker.
                    meridian = None

                if any([":" in token, meridian, microsecond]):
                    if meridian and not microsecond:
                        self._token_time = "%s %s" % (token, meridian)
                        skip_index.append(meridian_index)
                    elif microsecond and not meridian:
                        self._token_time = "%s.%s" % (token, microsecond)
                        skip_index.append(index + 1)
                    elif meridian and microsecond:
                        self._token_time = "%s.%s %s" % (token, microsecond, meridian)
                        skip_index.append(index + 1)
                        skip_index.append(meridian_index)
                    else:
                        self._token_time = token
                    # Parsed lazily: ``_results`` calls this when it needs the time.
                    self.time = lambda: time_parser(self._token_time)
                    continue

            results = self._parse(token_type, token, skip_component=skip_component)
            for res in results:
                if len(token) == 4 and res[0] == "year":
                    # A four-character token was taken as the year; later
                    # numeric tokens must not be tried as a year again.
                    skip_component = "year"
                setattr(self, *res)

        # Hand numeric tokens that were displaced from their component to
        # whichever of year/month/day is still unset.
        _, unknown = get_unresolved_attrs(self)
        for attr in unknown:
            for token, token_type, _ in self.unset_tokens:
                if token_type == 0:
                    value = int(token)
                    # NOTE(review): this stores the bare token string, whereas
                    # ``_parse`` stores a ``(token, type)`` pair; left
                    # unchanged to preserve behaviour — confirm intent.
                    setattr(self, "_token_%s" % attr, token)
                    setattr(self, attr, value)

    def _get_period(self):
        """Return the period of the parsed date.

        "time" is returned only when ``RETURN_TIME_AS_PERIOD`` is set and a
        time was found; otherwise "day" when a time or day is set, else
        "month" or "year" for whichever is set first. When nothing is set,
        "day" is returned if ``_results()`` is truthy (implicitly ``None``
        otherwise).
        """
        if self.settings.RETURN_TIME_AS_PERIOD:
            if getattr(self, "time", None):
                return "time"

        for period in ["time", "day"]:
            if getattr(self, period, None):
                return "day"

        for period in ["month", "year"]:
            if getattr(self, period, None):
                return period

        if self._results():
            return "day"

    def _get_datetime_obj(self, **params):
        """Build ``datetime(**params)``, repairing an out-of-range day if possible.

        Raises:
            ValueError: The original error, when no repair applies.
        """
        try:
            return datetime(**params)
        except ValueError as e:
            error_text = str(e)
            error_msgs = ["day is out of range", "day must be in"]
            if error_msgs[0] in error_text or error_msgs[1] in error_text:
                if not (self._token_day or hasattr(self, "_token_weekday")):
                    # if day is not available put last day of the month
                    params["day"] = get_last_day_of_month(
                        params["year"], params["month"]
                    )
                    return datetime(**params)
                elif (
                    not self._token_year
                    and params["day"] == 29
                    and params["month"] == 2
                    and not calendar.isleap(params["year"])
                ):
                    # fix the year when year is not present and it is 29 of February
                    params["year"] = self._get_correct_leap_year(
                        self.settings.PREFER_DATES_FROM, params["year"]
                    )
                    return datetime(**params)
            # Bare raise re-raises ``e`` with its original traceback.
            raise

    def _get_correct_leap_year(self, prefer_dates_from, current_year):
        """Pick a leap year near ``current_year`` according to the preference.

        "future" gives the next leap year and "past" the previous one; any
        other value gives the closer of the two (the previous one on a tie).
        """
        if prefer_dates_from == "future":
            return get_next_leap_year(current_year)
        if prefer_dates_from == "past":
            return get_previous_leap_year(current_year)

        # Default case ('current_period'): return closer leap year
        next_leap_year = get_next_leap_year(current_year)
        previous_leap_year = get_previous_leap_year(current_year)
        next_leap_year_is_closer = (
            next_leap_year - current_year < current_year - previous_leap_year
        )
        return next_leap_year if next_leap_year_is_closer else previous_leap_year

    def _set_relative_base(self):
        """Set ``self.now`` from ``RELATIVE_BASE``, or the current naive UTC time."""
        self.now = self.settings.RELATIVE_BASE
        if not self.now:
            self.now = datetime.now(tz=timezone.utc).replace(tzinfo=None)

    def _get_datetime_obj_params(self):
        """Return datetime keyword arguments, filling gaps from ``self.now``.

        The time fields are always zero here; ``_results`` overrides them.
        """
        # ``now`` is only assigned by ``_set_relative_base``, so it may not
        # exist yet when this is called before ``_results``.
        if not getattr(self, "now", None):
            self._set_relative_base()

        params = {
            "day": self.day or self.now.day,
            "month": self.month or self.now.month,
            "year": self.year or self.now.year,
            "hour": 0,
            "minute": 0,
            "second": 0,
            "microsecond": 0,
        }
        return params

    def _get_date_obj(self, token, directive):
        """Parse ``token`` with a single strptime ``directive``."""
        return strptime(token, directive)

    def _results(self):
        """Combine the collected components into a datetime.

        Raises:
            ValueError: From the strict-parsing check, the deferred time
                parse, or datetime construction.
        """
        missing = [
            field for field in ("day", "month", "year") if not getattr(self, field)
        ]
        _check_strict_parsing(missing, self.settings)
        self._set_relative_base()

        time = self.time() if self.time is not None else None
        params = self._get_datetime_obj_params()

        if time:
            params.update(
                dict(
                    hour=time.hour,
                    minute=time.minute,
                    second=time.second,
                    microsecond=time.microsecond,
                )
            )

        return self._get_datetime_obj(**params)

    def _correct_for_time_frame(self, dateobj, tz):
        """Shift ``dateobj`` into the past or future per ``PREFER_DATES_FROM``.

        Handles four cases: a weekday name with no explicit date, a month
        without a year, a two-digit year, and a time with no date at all.

        Args:
            dateobj: The datetime produced by ``_results``.
            tz: Timezone object for the time-only comparison, or ``None`` to
                derive one from ``settings.TIMEZONE``.

        Returns:
            The adjusted datetime, with the same ``tzinfo`` as the input.
        """
        days = ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]

        token_weekday, _ = getattr(self, "_token_weekday", (None, None))

        if token_weekday and not (
            self._token_year or self._token_month or self._token_day
        ):
            day_index = calendar.weekday(dateobj.year, dateobj.month, dateobj.day)
            day = token_weekday[:3].lower()
            steps = 0
            if "future" in self.settings.PREFER_DATES_FROM:
                if days[day_index] == day:
                    steps = 7
                else:
                    while days[day_index] != day:
                        day_index = (day_index + 1) % 7
                        steps += 1
                delta = timedelta(days=steps)
            else:
                if days[day_index] == day:
                    if self.settings.PREFER_DATES_FROM == "past":
                        steps = 7
                    else:
                        steps = 0
                else:
                    while days[day_index] != day:
                        # Walks backwards; negative indices wrap around the
                        # list, so no modulo is needed here.
                        day_index -= 1
                        steps += 1
                delta = timedelta(days=-steps)

            dateobj = dateobj + delta

        # NOTE: If this assert fires, self.now needs to be made offset-aware in a similar
        # way that dateobj is temporarily made offset-aware.
        assert not (
            self.now.tzinfo is None and dateobj.tzinfo is not None
        ), "`self.now` doesn't have `tzinfo`. Review comment in code for details."

        # Store the original dateobj values so that upon subsequent parsing everything is not
        # treated as offset-aware if offset awareness is changed.
        original_dateobj = dateobj

        # Since date comparisons must be either offset-naive or offset-aware, normalize dateobj
        # to be offset-aware if one or the other is already offset-aware.
        if self.now.tzinfo is not None and dateobj.tzinfo is None:
            dateobj = pytz.utc.localize(dateobj)

        if self.month and not self.year:
            try:
                if self.now < dateobj:
                    if self.settings.PREFER_DATES_FROM == "past":
                        dateobj = dateobj.replace(year=dateobj.year - 1)
                else:
                    if self.settings.PREFER_DATES_FROM == "future":
                        dateobj = dateobj.replace(year=dateobj.year + 1)
            except ValueError:
                # Shifting 29 February by one year lands on a non-leap year.
                if dateobj.day == 29 and dateobj.month == 2:
                    valid_year = self._get_correct_leap_year(
                        self.settings.PREFER_DATES_FROM, dateobj.year
                    )
                    dateobj = dateobj.replace(year=valid_year)
                else:
                    raise

        if self._token_year and len(self._token_year[0]) == 2:
            # Two-digit year: move by a century if it fell on the wrong side.
            if self.now < dateobj:
                if "past" in self.settings.PREFER_DATES_FROM:
                    dateobj = dateobj.replace(year=dateobj.year - 100)
            else:
                if "future" in self.settings.PREFER_DATES_FROM:
                    dateobj = dateobj.replace(year=dateobj.year + 100)

        if self._token_time and not any(
            [
                self._token_year,
                self._token_month,
                self._token_day,
                hasattr(self, "_token_weekday"),
            ]
        ):
            # Convert dateobj to utc time to compare with self.now
            try:
                tz = tz or get_timezone_from_tz_string(self.settings.TIMEZONE)
                tz_offset = tz.utcoffset(dateobj)
            except (pytz.UnknownTimeZoneError, pytz.NonExistentTimeError):
                tz_offset = timedelta(hours=0)

            if "past" in self.settings.PREFER_DATES_FROM:
                if self.now < dateobj - tz_offset:
                    dateobj = dateobj + timedelta(days=-1)
            if "future" in self.settings.PREFER_DATES_FROM:
                if self.now > dateobj - tz_offset:
                    dateobj = dateobj + timedelta(days=1)

        # Reset dateobj to the original value, thus removing any offset awareness that may
        # have been set earlier.
        dateobj = dateobj.replace(tzinfo=original_dateobj.tzinfo)

        return dateobj

    def _correct_for_day(self, dateobj):
        """Apply the settings' day preference unless a day, weekday or time was parsed."""
        if (
            getattr(self, "_token_day", None)
            or getattr(self, "_token_weekday", None)
            or getattr(self, "_token_time", None)
        ):
            return dateobj

        dateobj = set_correct_day_from_settings(
            dateobj, self.settings, current_day=self.now.day
        )
        return dateobj

    def _correct_for_month(self, dateobj):
        """Apply the settings' month preference unless a month was parsed."""
        relative_base = getattr(self.settings, "RELATIVE_BASE", None)
        # Pass the month number when a base date is configured; otherwise the
        # raw value (e.g. ``None``) is passed through unchanged.
        relative_base_month = (
            relative_base.month if hasattr(relative_base, "month") else relative_base
        )

        if getattr(self, "_token_month", None):
            return dateobj

        dateobj = set_correct_month_from_settings(
            dateobj, self.settings, relative_base_month
        )
        return dateobj

    @classmethod
    def parse(cls, datestring, settings, tz=None):
        """Tokenize and parse ``datestring``.

        Returns:
            tuple: ``(dateobj, period)``.

        Raises:
            ValueError: If the string cannot be parsed.
        """
        tokens = tokenizer(datestring)
        po = cls(tokens.tokenize(), settings)
        dateobj = po._results()

        # correction for past, future if applicable
        dateobj = po._correct_for_time_frame(dateobj, tz)

        # correction for preference of month: beginning, current, end
        # must happen before day so that day is derived from the correct month
        dateobj = po._correct_for_month(dateobj)

        # correction for preference of day: beginning, current, end
        dateobj = po._correct_for_day(dateobj)

        period = po._get_period()

        return dateobj, period

    def _parse(self, type, token, skip_component=None):
        """Interpret one token as a date component.

        Args:
            type: Token type; 0 for numeric, 1 for alphabetic.
            token: The token text.
            skip_component: Name of a component that must not be tried.

        Returns:
            list: ``(attribute_name, value)`` pairs for the caller to set.

        Raises:
            ValueError: If the token fits no component.
        """

        def set_and_return(token, token_type, component, dateobj, skip_date_order=False):
            # Record which token produced the component, then report its value.
            if not skip_date_order:
                self.auto_order.append(component)
            setattr(self, "_token_%s" % component, (token, token_type))
            return [(component, getattr(dateobj, component))]

        def parse_number(token, skip_component=None):
            token_type = 0

            for component, directives in self.ordered_num_directives.items():
                if skip_component == component:
                    continue
                for directive in directives:
                    try:
                        do = self._get_date_obj(token, directive)
                        prev_value = getattr(self, component, None)
                        if not prev_value:
                            return set_and_return(token, token_type, component, do)
                        else:
                            # The component is already taken. If the token
                            # that took it does not fit this directive, move
                            # that token to ``unset_tokens`` and give the
                            # component to the current token instead.
                            try:
                                prev_token, prev_type = getattr(
                                    self, "_token_%s" % component
                                )
                                if prev_type == token_type:
                                    do = self._get_date_obj(prev_token, directive)
                            except ValueError:
                                self.unset_tokens.append(
                                    (prev_token, prev_type, component)
                                )
                                return set_and_return(token, token_type, component, do)
                    except ValueError:
                        pass

            raise ValueError("Unable to parse: %s" % token)

        def parse_alpha(token, skip_component=None):
            token_type = 1

            for component, directives in self.alpha_directives.items():
                if skip_component == component:
                    continue
                for directive in directives:
                    try:
                        do = self._get_date_obj(token, directive)
                        prev_value = getattr(self, component, None)
                        if not prev_value:
                            return set_and_return(
                                token, token_type, component, do, skip_date_order=True
                            )
                        elif component == "month":
                            # A month name arrived after a number was already
                            # taken as the month: that number becomes the day.
                            index = self.auto_order.index("month")
                            self.auto_order[index] = "day"
                            setattr(self, "_token_day", self._token_month)
                            setattr(self, "_token_month", (token, token_type))
                            return [
                                (component, getattr(do, component)),
                                ("day", prev_value),
                            ]
                    except Exception:
                        # Best effort: any failure means "try the next directive".
                        pass

            raise ValueError("Unable to parse: %s" % token)

        handlers = {0: parse_number, 1: parse_alpha}
        return handlers[type](token, skip_component)
696
697
class tokenizer:
    """Split a date string into runs of digits, letters and other characters.

    Token types: 0 for digit runs (":" counts as a digit), 1 for letter runs,
    2 for anything else.
    """

    digits = "0123456789:"
    letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

    def _isletter(self, tkn):
        """Return True if ``tkn`` occurs in ``letters``."""
        return tkn in self.letters

    def _isdigit(self, tkn):
        """Return True if ``tkn`` occurs in ``digits``."""
        return tkn in self.digits

    def __init__(self, ds):
        self.instream = StringIO(ds)

    def _switch(self, chara, charb):
        """Classify ``chara`` and tell whether ``charb`` starts a new token.

        Returns:
            tuple: ``(type_of_chara, token_boundary_between_them)``.
        """
        if self._isdigit(chara):
            kind = 0
            boundary = not self._isdigit(charb)
        elif self._isletter(chara):
            kind = 1
            boundary = not self._isletter(charb)
        else:
            kind = 2
            boundary = self._isdigit(charb) or self._isletter(charb)
        return kind, boundary

    def tokenize(self):
        """Yield ``(token, type)`` pairs read from the input stream.

        Raises:
            IndexError: When the stream is empty (there is no character to
                classify).
        """
        pending = ""
        # ``read(1)`` returns "" at end of stream, which ends the iteration.
        for char in iter(lambda: self.instream.read(1), ""):
            if not pending:
                pending = char
                continue

            kind, boundary = self._switch(pending[-1], char)
            if boundary:
                yield pending, kind
                pending = char
            else:
                pending += char

        # Flush the final token; its type depends only on its last character.
        kind, _ = self._switch(pending[-1], "")
        yield pending, kind