Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/dateparser/date.py: 79%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import collections
2import sys
3from collections.abc import Set
4from datetime import datetime, timedelta
6import regex as re
7from dateutil.relativedelta import relativedelta
8from tzlocal import get_localzone
10from dateparser.conf import apply_settings, check_settings
11from dateparser.custom_language_detection.language_mapping import map_languages
12from dateparser.date_parser import date_parser
13from dateparser.freshness_date_parser import freshness_date_parser
14from dateparser.languages.loader import LocaleDataLoader
15from dateparser.parser import _parse_absolute, _parse_nospaces
16from dateparser.timezone_parser import pop_tz_offset_from_string
17from dateparser.utils import (
18 apply_timezone_from_settings,
19 get_timezone_from_tz_string,
20 set_correct_day_from_settings,
21 set_correct_month_from_settings,
22)
# Characters that resemble an apostrophe. They are joined into
# RE_SANITIZE_APOSTROPHE below, which sanitize_date() uses to replace each
# of them with a plain ASCII "'".
APOSTROPHE_LOOK_ALIKE_CHARS = [
    "\N{RIGHT SINGLE QUOTATION MARK}",  # '\u2019'
    "\N{MODIFIER LETTER APOSTROPHE}",  # '\u02bc'
    "\N{MODIFIER LETTER TURNED COMMA}",  # '\u02bb'
    "\N{ARMENIAN APOSTROPHE}",  # '\u055a'
    "\N{LATIN SMALL LETTER SALTILLO}",  # '\ua78c'
    "\N{PRIME}",  # '\u2032'
    "\N{REVERSED PRIME}",  # '\u2035'
    "\N{MODIFIER LETTER PRIME}",  # '\u02b9'
    "\N{FULLWIDTH APOSTROPHE}",  # '\uff07'
]

# --- whitespace handling (used by sanitize_spaces) ---------------------------
# A non-breaking space; replaced by a regular space.
RE_NBSP = re.compile("\xa0", flags=re.UNICODE)
# Any run of whitespace; collapsed to a single space.
RE_SPACES = re.compile(r"\s+")
# Captures the text between leading AND trailing whitespace. The pattern only
# matches when whitespace is present on both ends of the string.
RE_TRIM_SPACES = re.compile(r"^\s+(\S.*?)\s+$")
# Captures the text that precedes any trailing colons (used by sanitize_date).
RE_TRIM_COLONS = re.compile(r"(\S.*?):*$")

# --- noise removal (used by sanitize_date) -----------------------------------
# Control whitespace, a few punctuation / directional / diacritic code points
# and the sequence ", <U+0432>" at a word boundary; each match is replaced by
# a space.
RE_SANITIZE_SKIP = re.compile(
    r"\t|\n|\r|\u00bb|,\s\u0432\b|\u200e|\xb7|\u200f|\u064e|\u064f", flags=re.M
)
# "<U+0433>." preceded by a non-word character or a digit; the preceding
# character is kept (group 1) and the abbreviation is dropped.
RE_SANITIZE_RUSSIAN = re.compile(r"([\W\d])\u0433\.", flags=re.I | re.U)
# Dotted numeric dates such as "1. 2. 2020." optionally followed by " u";
# sanitize_date() rewrites them as "1.2.2020 ".
RE_SANITIZE_CROATIAN = re.compile(
    r"(\d+)\.\s?(\d+)\.\s?(\d+)\.( u)?", flags=re.I | re.U
)
# A period that directly follows a character which is neither a digit nor
# whitespace (e.g. the dot of an abbreviation); such periods are removed.
RE_SANITIZE_PERIOD = re.compile(r"(?<=[^0-9\s])\.", flags=re.U)
# Everything up to and including the first "on:" plus following whitespace;
# only the remainder (group 1) is kept.
RE_SANITIZE_ON = re.compile(r"^.*?on:\s+(.*)")
# Alternation of every apostrophe look-alike listed above.
RE_SANITIZE_APOSTROPHE = re.compile("|".join(APOSTROPHE_LOOK_ALIKE_CHARS))

# --- UNIX timestamps (used by get_date_from_timestamp) -----------------------
# 10 digits of seconds, optionally followed by 3 digits of milliseconds and 3
# digits of microseconds. The negative lookahead only allows the number to be
# followed by "." or by the end of the string.
RE_SEARCH_TIMESTAMP = re.compile(r"^(\d{10})(\d{3})?(\d{3})?(?![^.])")
# Same as above but with a mandatory leading minus sign inside group 1.
RE_SEARCH_NEGATIVE_TIMESTAMP = re.compile(r"^([-]\d{10})(\d{3})?(\d{3})?(?![^.])")
def sanitize_spaces(date_string):
    """Normalise the whitespace of ``date_string`` and return the result.

    Three substitutions are applied in order:

    1. non-breaking spaces become regular spaces,
    2. every run of whitespace collapses to a single space,
    3. surrounding whitespace is removed when the string has whitespace on
       *both* ends (``RE_TRIM_SPACES`` requires a leading and a trailing run).

    :param date_string: the text to normalise.
    :type date_string: str
    :return: the normalised text.
    :rtype: str
    """
    without_nbsp = RE_NBSP.sub(" ", date_string)
    single_spaced = RE_SPACES.sub(" ", without_nbsp)
    return RE_TRIM_SPACES.sub(r"\1", single_spaced)
def date_range(begin, end, **kwargs):
    """Generate dates from ``begin`` (inclusive) towards ``end`` (exclusive).

    :param begin: first value to yield.
    :param end: upper bound; iteration stops once the running value is no
        longer smaller than ``end``.
    :param kwargs: keyword arguments forwarded to
        :class:`dateutil.relativedelta.relativedelta` to build the step.
        When omitted, the step is one day. Only the plural (relative) names
        are accepted; the singular names ``year``, ``month``, ``week``,
        ``day``, ``hour``, ``minute`` and ``second`` are rejected.
    :raises ValueError: if one of the rejected singular names is passed. As
        this is a generator, the error surfaces on the first iteration.
    :return: a generator of dates.
    """
    for singular_name in ("year", "month", "week", "day", "hour", "minute", "second"):
        if singular_name in kwargs:
            raise ValueError("Invalid argument: %s" % singular_name)

    if kwargs:
        step = relativedelta(**kwargs)
    else:
        step = relativedelta(days=1)

    current = begin
    while current < end:
        yield current
        current += step

    # handles edge-case when iterating months and last interval is < 30 days:
    # the running value overshot ``end`` but still lies in the same calendar
    # month, so ``end`` itself is emitted as the final element.
    stepping_by_months = kwargs.get("months", 0) > 0
    if stepping_by_months and (current.year, current.month) == (end.year, end.month):
        yield end
def get_intersecting_periods(low, high, period="day"):
    """Yield the start of every ``period`` that intersects ``[low, high)``.

    The first value is ``low`` aligned down to the beginning of its period;
    subsequent values advance by one period until ``high`` is reached.

    :param low: lower bound (a ``date`` or ``datetime``).
    :param high: upper bound; nothing is yielded when ``high <= low``.
    :param period: one of ``year``, ``month``, ``week``, ``day``, ``hour``,
        ``minute``, ``second`` or ``microsecond``.
    :type period: str
    :raises ValueError: if ``period`` is not one of the names above. As this
        is a generator, the error surfaces on the first iteration.
    :return: a generator of period start values.
    """
    supported_periods = (
        "year",
        "month",
        "week",
        "day",
        "hour",
        "minute",
        "second",
        "microsecond",
    )
    if period not in supported_periods:
        raise ValueError("Invalid period: {}".format(period))

    if high <= low:
        return

    step = relativedelta(**{period + "s": 1})

    cursor = low
    if isinstance(cursor, datetime):
        # Zero every time field that is finer than the requested period.
        finer_fields = []
        for field in ("microsecond", "second", "minute", "hour"):
            if field == period:
                break
            finer_fields.append(field)
        cursor = cursor.replace(**dict.fromkeys(finer_fields, 0))

    # Align the calendar part to the start of the week / month / year.
    if period == "week":
        cursor = cursor - timedelta(days=cursor.weekday())
    elif period == "month":
        cursor = cursor.replace(day=1)
    elif period == "year":
        cursor = cursor.replace(month=1, day=1)

    while cursor < high:
        yield cursor
        cursor += step
def sanitize_date(date_string):
    """Strip known noise from ``date_string`` before it is parsed.

    The clean-up steps run in a fixed order because later patterns rely on
    the whitespace produced by earlier ones.

    :param date_string: raw text that should contain a date.
    :type date_string: str
    :return: the cleaned text, without leading or trailing whitespace.
    :rtype: str
    """
    cleaned = RE_SANITIZE_SKIP.sub(" ", date_string)
    # remove 'г.' (Russian for year) but not in words
    cleaned = RE_SANITIZE_RUSSIAN.sub(r"\1 ", cleaned)
    # extra '.' and 'u' interferes with parsing relative fractional dates
    cleaned = RE_SANITIZE_CROATIAN.sub(r"\1.\2.\3 ", cleaned)
    cleaned = sanitize_spaces(cleaned)

    remaining_steps = (
        (RE_SANITIZE_PERIOD, ""),
        (RE_SANITIZE_ON, r"\1"),
        (RE_TRIM_COLONS, r"\1"),
        (RE_SANITIZE_APOSTROPHE, "'"),
    )
    for pattern, replacement in remaining_steps:
        cleaned = pattern.sub(replacement, cleaned)

    return cleaned.strip()
def get_date_from_timestamp(date_string, settings, negative=False):
    """Interpret ``date_string`` as a UNIX timestamp, if it looks like one.

    :param date_string: text to inspect; it has to start with 10 digits of
        seconds, optionally followed by 3 digits of milliseconds and 3 digits
        of microseconds.
    :type date_string: str
    :param settings: settings object, or ``None``. Its ``TIMEZONE`` selects
        the zone used to interpret the timestamp; the local zone is used when
        settings are missing, ``TIMEZONE`` is ``None`` or it contains
        ``"local"``.
    :param negative: when true, match timestamps with a leading minus sign
        instead of unsigned ones.
    :type negative: bool
    :return: a naive ``datetime`` post-processed by
        ``apply_timezone_from_settings``, or ``None`` when ``date_string``
        is not a timestamp.
    """
    pattern = RE_SEARCH_NEGATIVE_TIMESTAMP if negative else RE_SEARCH_TIMESTAMP
    match = pattern.search(date_string)
    if not match:
        return None

    use_local_zone = (
        settings is None
        or settings.TIMEZONE is None
        or "local" in settings.TIMEZONE.lower()
    )
    if use_local_zone:
        timezone = get_localzone()
    else:
        timezone = get_timezone_from_tz_string(settings.TIMEZONE)

    # Group 1 is always present; the two optional groups default to 0.
    seconds, millis, micros = (int(part or 0) for part in match.groups())

    localized = datetime.fromtimestamp(seconds, timezone)
    naive = localized.replace(microsecond=millis * 1000 + micros, tzinfo=None)
    return apply_timezone_from_settings(naive, settings)
def parse_with_formats(date_string, date_formats, settings):
    """Parse ``date_string`` with the first format of ``date_formats`` that fits.

    Fields that the matching format does not provide are filled in: the month
    and the day through ``set_correct_month_from_settings`` /
    ``set_correct_day_from_settings`` and the year with the current year.

    :param date_string: text to parse.
    :type date_string: str
    :param date_formats: iterable of ``strptime`` format strings, tried in
        order.
    :param settings: settings object handed to the helper functions.
    :returns: a :class:`DateData` whose ``period`` is ``'day'``, ``'month'``
        or ``'year'`` depending on the matching format; its ``date_obj`` is
        ``None`` (with period ``'day'``) when no format matches.
    """
    period = "day"
    for fmt in date_formats:
        try:
            parsed = datetime.strptime(date_string, fmt)
        except ValueError:
            continue

        lacks_month = not any(code in fmt for code in ["%m", "%b", "%B"])
        lacks_day = "%d" not in fmt

        # The period reflects the coarsest missing field.
        if lacks_month:
            period = "year"
            parsed = set_correct_month_from_settings(parsed, settings)
        elif lacks_day:
            period = "month"
        if lacks_day:
            parsed = set_correct_day_from_settings(parsed, settings)

        has_year = "%y" in fmt or "%Y" in fmt
        if not has_year:
            parsed = parsed.replace(year=datetime.today().year)

        parsed = apply_timezone_from_settings(parsed, settings)
        return DateData(date_obj=parsed, period=period)

    return DateData(date_obj=None, period=period)
class _DateLocaleParser:
    """Try the configured parsers on one date string for one locale.

    The parsers are run in the order given by ``settings.PARSERS``; the first
    one producing valid date data wins.
    """

    def __init__(self, locale, date_string, date_formats, settings=None):
        """
        :param locale: locale object used to translate ``date_string``.
        :param date_string: the (sanitized) text to parse.
        :param date_formats: ``None`` or a list, tuple or set of ``strptime``
            format strings for the ``custom-formats`` parser.
        :param settings: settings object read by the individual parsers.
        :raises TypeError: if ``date_formats`` is of an unsupported type.
        """
        self._settings = settings
        if not (date_formats is None or isinstance(date_formats, (list, tuple, Set))):
            raise TypeError("Date formats should be list, tuple or set of strings")

        self.locale = locale
        self.date_string = date_string
        self.date_formats = date_formats
        # Translations are computed lazily and cached, see the getters below.
        self._translated_date = None
        self._translated_date_with_formatting = None
        # Maps the names allowed in ``settings.PARSERS`` to their handlers.
        self._parsers = {
            "timestamp": self._try_timestamp,
            "negative-timestamp": self._try_negative_timestamp,
            "relative-time": self._try_freshness_parser,
            "custom-formats": self._try_given_formats,
            "absolute-time": self._try_absolute_parser,
            "no-spaces-time": self._try_nospaces_parser,
        }

    @classmethod
    def parse(cls, locale, date_string, date_formats=None, settings=None):
        """Build a parser for the arguments and return its result.

        :return: a ``DateData`` instance, or ``None`` if no parser succeeded.
        """
        instance = cls(locale, date_string, date_formats, settings)
        return instance._parse()

    def _parse(self):
        """Return the first valid result of the configured parsers, else ``None``."""
        for parser_name in self._settings.PARSERS:
            date_data = self._parsers[parser_name]()
            if self._is_valid_date_data(date_data):
                return date_data
        return None

    def _try_timestamp_parser(self, negative=False):
        """Wrap the timestamp lookup in ``DateData``.

        ``date_obj`` is ``None`` when the string is not a timestamp; such a
        result is rejected later by ``_is_valid_date_data``.
        """
        return DateData(
            date_obj=get_date_from_timestamp(
                self.date_string, self._settings, negative=negative
            ),
            period="time" if self._settings.RETURN_TIME_AS_PERIOD else "day",
        )

    def _try_timestamp(self):
        return self._try_timestamp_parser()

    def _try_negative_timestamp(self):
        return self._try_timestamp_parser(negative=True)

    def _try_freshness_parser(self):
        """Parse relative expressions; overflow and value errors mean "no match"."""
        try:
            return freshness_date_parser.get_date_data(
                self._get_translated_date(), self._settings
            )
        except (OverflowError, ValueError):
            return None

    def _try_absolute_parser(self):
        return self._try_parser(parse_method=_parse_absolute)

    def _try_nospaces_parser(self):
        return self._try_parser(parse_method=_parse_nospaces)

    def _try_parser(self, parse_method):
        """Run ``date_parser`` with ``parse_method`` on the translated string.

        When ``PREFER_LOCALE_DATE_ORDER`` is enabled and the user did not set
        ``DATE_ORDER`` explicitly, the locale's own date order is applied for
        the duration of the call.

        :return: ``DateData`` on success, ``None`` if parsing raised
            ``ValueError``.
        """
        original_order = self._settings.DATE_ORDER
        try:
            if (
                self._settings.PREFER_LOCALE_DATE_ORDER
                and "DATE_ORDER" not in self._settings._mod_settings
            ):
                self._settings.DATE_ORDER = self.locale.info.get(
                    "date_order", original_order
                )
            date_obj, period = date_parser.parse(
                self._get_translated_date(),
                parse_method=parse_method,
                settings=self._settings,
            )
            return DateData(
                date_obj=date_obj,
                period=period,
            )
        except ValueError:
            return None
        finally:
            # The settings object outlives this call, so the temporary
            # override must be undone on every exit path -- including
            # exceptions other than ValueError, which propagate to the caller.
            self._settings.DATE_ORDER = original_order

    def _try_given_formats(self):
        """Parse with the user supplied formats; ``None`` when there are none."""
        if not self.date_formats:
            return None

        return parse_with_formats(
            self._get_translated_date_with_formatting(),
            self.date_formats,
            settings=self._settings,
        )

    def _get_translated_date(self):
        """Return the translated date string, translating it on first use."""
        if self._translated_date is None:
            self._translated_date = self.locale.translate(
                self.date_string, keep_formatting=False, settings=self._settings
            )
        return self._translated_date

    def _get_translated_date_with_formatting(self):
        """Like ``_get_translated_date`` but with ``keep_formatting=True``."""
        if self._translated_date_with_formatting is None:
            self._translated_date_with_formatting = self.locale.translate(
                self.date_string, keep_formatting=True, settings=self._settings
            )
        return self._translated_date_with_formatting

    def _is_valid_date_data(self, date_data):
        """Tell whether ``date_data`` is a ``DateData`` with a usable date and period."""
        if not isinstance(date_data, DateData):
            return False
        if not date_data["date_obj"] or not date_data["period"]:
            return False
        if date_data["date_obj"] and not isinstance(date_data["date_obj"], datetime):
            return False
        if date_data["period"] not in ("time", "day", "week", "month", "year"):
            return False
        return True
class DateData:
    """
    Container for the outcome of a parse: the date object, the period
    (granularity) and the locale that produced it.

    Besides attribute access, the fields can be read and written with
    square brackets like a dict object.
    """

    def __init__(self, *, date_obj=None, period=None, locale=None):
        self.date_obj = date_obj
        self.period = period
        self.locale = locale

    def __getitem__(self, k):
        # Only existing attributes are valid keys.
        if hasattr(self, k):
            return getattr(self, k)
        raise KeyError(k)

    def __setitem__(self, k, v):
        # Item assignment never creates new attributes.
        if hasattr(self, k):
            setattr(self, k, v)
            return
        raise KeyError(k)

    def __repr__(self):
        rendered_fields = []
        for name, value in self.__dict__.items():
            rendered_fields.append("{}={}".format(name, value.__repr__()))

        return "{}({})".format(self.__class__.__name__, ", ".join(rendered_fields))
class DateDataParser:
    """
    Class which handles language detection, translation and subsequent generic parsing of
    string representing date and/or time.

    :param languages:
        A list of language codes, e.g. ['en', 'es', 'zh-Hant'].
        If locales are not given, languages and region are
        used to construct locales for translation.
    :type languages: list

    :param locales:
        A list of locale codes, e.g. ['fr-PF', 'qu-EC', 'af-NA'].
        The parser uses only these locales to translate date string.
    :type locales: list

    :param region:
        A region code, e.g. 'IN', '001', 'NE'.
        If locales are not given, languages and region are
        used to construct locales for translation.
    :type region: str

    :param try_previous_locales:
        If True, locales previously used to translate date are tried first.
    :type try_previous_locales: bool

    :param use_given_order:
        If True, locales are tried for translation of date string
        in the order in which they are given.
    :type use_given_order: bool

    :param settings:
        Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
    :type settings: dict

    :param detect_languages_function:
        A function for language detection that takes as input a `text` and a `confidence_threshold`,
        and returns a list of detected language codes.
        Note: this function is only used if ``languages`` and ``locales`` are not provided.
    :type detect_languages_function: function

    :return: A parser instance

    :raises:
        ``ValueError``: Unknown Language, ``TypeError``: Languages argument must be a list,
        ``SettingValidationError``: A provided setting is not valid.
    """

    # Shared loader, created on first use by ``_get_locale_loader``.
    locale_loader = None

    @apply_settings
    def __init__(
        self,
        languages=None,
        locales=None,
        region=None,
        try_previous_locales=False,
        use_given_order=False,
        settings=None,
        detect_languages_function=None,
    ):
        if languages is not None and not isinstance(languages, (list, tuple, Set)):
            raise TypeError(
                "languages argument must be a list (%r given)" % type(languages)
            )

        if locales is not None and not isinstance(locales, (list, tuple, Set)):
            raise TypeError(
                "locales argument must be a list (%r given)" % type(locales)
            )

        if region is not None and not isinstance(region, str):
            raise TypeError("region argument must be str (%r given)" % type(region))

        if not isinstance(try_previous_locales, bool):
            raise TypeError(
                "try_previous_locales argument must be a boolean (%r given)"
                % type(try_previous_locales)
            )

        if not isinstance(use_given_order, bool):
            raise TypeError(
                "use_given_order argument must be a boolean (%r given)"
                % type(use_given_order)
            )

        if not locales and not languages and use_given_order:
            raise ValueError(
                "locales or languages must be given if use_given_order is True"
            )

        check_settings(settings)

        self._settings = settings
        self.try_previous_locales = try_previous_locales
        self.use_given_order = use_given_order
        self.languages = list(languages) if languages else None
        self.locales = locales
        self.region = region
        self.detect_languages_function = detect_languages_function
        # Used as an ordered set: keys are locales that parsed successfully.
        self.previous_locales = collections.OrderedDict()

    def get_date_data(self, date_string, date_formats=None):
        """
        Parse string representing date and/or time in recognizable localized formats.
        Supports parsing multiple languages and timezones.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str
        :param date_formats:
            A list of format strings using directives as given
            `here <https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior>`_.
            The parser applies formats one by one, taking into account the detected languages.
        :type date_formats: list

        :return: a ``DateData`` object.

        :raises: ValueError - Unknown Language, TypeError - Input type must be str

        .. note:: *Period* values can be a 'day' (default), 'week', 'month', 'year', 'time'.

        *Period* represents the granularity of date parsed from the given string.

        In the example below, since no day information is present, the day is assumed to be current
        day ``16`` from *current date* (which is June 16, 2015, at the moment of writing this).
        Hence, the level of precision is ``month``:

            >>> DateDataParser().get_date_data('March 2015')
            DateData(date_obj=datetime.datetime(2015, 3, 16, 0, 0), period='month', locale='en')

        Similarly, for date strings with no day and month information present, level of precision
        is ``year`` and day ``16`` and month ``6`` are from *current_date*.

            >>> DateDataParser().get_date_data('2014')
            DateData(date_obj=datetime.datetime(2014, 6, 16, 0, 0), period='year', locale='en')

        Dates with time zone indications or UTC offsets are returned in UTC time unless
        specified using `Settings <https://dateparser.readthedocs.io/en/latest/settings.html#settings>`__.

            >>> DateDataParser().get_date_data('23 March 2000, 1:21 PM CET')
            DateData(date_obj=datetime.datetime(2000, 3, 23, 13, 21, tzinfo=<StaticTzInfo 'CET'>),
            period='day', locale='en')

        """
        if not isinstance(date_string, str):
            raise TypeError("Input type must be str")

        # First try the given formats on the raw, untranslated string.
        res = parse_with_formats(date_string, date_formats or [], self._settings)
        if res["date_obj"]:
            return res

        date_string = sanitize_date(date_string)

        for locale in self._get_applicable_locales(date_string):
            parsed_date = _DateLocaleParser.parse(
                locale, date_string, date_formats, settings=self._settings
            )
            if parsed_date:
                parsed_date["locale"] = locale.shortname
                if self.try_previous_locales:
                    self.previous_locales[locale] = None
                return parsed_date

        return DateData(date_obj=None, period="day", locale=None)

    def get_date_tuple(self, *args, **kwargs):
        """Same as :meth:`get_date_data` but returns the fields as a named tuple."""
        date_data = self.get_date_data(*args, **kwargs)
        fields = date_data.__dict__.keys()
        date_tuple = collections.namedtuple("DateData", fields)
        return date_tuple(**date_data.__dict__)

    def _get_applicable_locales(self, date_string):
        """Yield, lazily, the locales worth trying for ``date_string``.

        Order: previously successful locales (if ``try_previous_locales``),
        then the locales derived from ``languages`` / ``locales`` / ``region``,
        then the locales of ``settings.DEFAULT_LANGUAGES`` (unfiltered).
        Locales of the first two groups are yielded only when applicable to
        the string or to its variant without a timezone suffix; each locale is
        yielded at most once per group.
        """
        pop_tz_cache = []

        def date_strings():
            """A generator instead of a static list to avoid calling
            pop_tz_offset_from_string if the first locale matches on unmodified
            date_string.
            """
            yield date_string
            if not pop_tz_cache:
                stripped_date_string, _ = pop_tz_offset_from_string(
                    date_string, as_offset=False
                )
                if stripped_date_string == date_string:
                    stripped_date_string = None
                pop_tz_cache[:] = [stripped_date_string]
            (stripped_date_string,) = pop_tz_cache
            if stripped_date_string is not None:
                yield stripped_date_string

        def matches_any_variant(locale):
            # ``any`` stops at the first applicable variant. The caller parses
            # the same string regardless of which variant matched, so yielding
            # a locale once per variant would only repeat identical work (and
            # would force the timezone stripping even after a match).
            return any(
                self._is_applicable_locale(locale, variant)
                for variant in date_strings()
            )

        if self.try_previous_locales:
            for locale in self.previous_locales.keys():
                if matches_any_variant(locale):
                    yield locale

        if self.detect_languages_function and not self.languages and not self.locales:
            detected_languages = self.detect_languages_function(
                text=date_string,
                confidence_threshold=self._settings.LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD,
            )

            # NOTE(review): the detected languages are stored on the instance,
            # so a non-empty result suppresses detection for later strings
            # parsed with the same parser -- confirm this is intended.
            self.languages = map_languages(detected_languages)

        for locale in self._get_locale_loader().get_locales(
            languages=self.languages,
            locales=self.locales,
            region=self.region,
            use_given_order=self.use_given_order,
        ):
            if matches_any_variant(locale):
                yield locale

        if self._settings.DEFAULT_LANGUAGES:
            for locale in self._get_locale_loader().get_locales(
                languages=self._settings.DEFAULT_LANGUAGES,
                locales=None,
                region=self.region,
                use_given_order=self.use_given_order,
            ):
                yield locale

    def _is_applicable_locale(self, locale, date_string):
        """Ask ``locale`` whether it can handle ``date_string`` as given."""
        return locale.is_applicable(
            date_string,
            strip_timezone=False,  # it is stripped outside
            settings=self._settings,
        )

    @classmethod
    def _get_locale_loader(cls):
        """Return the class-level ``LocaleDataLoader``, creating it on first use."""
        if not cls.locale_loader:
            cls.locale_loader = LocaleDataLoader()
        return cls.locale_loader