Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/dateparser/date.py: 79%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

283 statements  

1import collections 

2import sys 

3from collections.abc import Set 

4from datetime import datetime, timedelta 

5 

6import regex as re 

7from dateutil.relativedelta import relativedelta 

8from tzlocal import get_localzone 

9 

10from dateparser.conf import apply_settings, check_settings 

11from dateparser.custom_language_detection.language_mapping import map_languages 

12from dateparser.date_parser import date_parser 

13from dateparser.freshness_date_parser import freshness_date_parser 

14from dateparser.languages.loader import LocaleDataLoader 

15from dateparser.parser import _parse_absolute, _parse_nospaces 

16from dateparser.timezone_parser import pop_tz_offset_from_string 

17from dateparser.utils import ( 

18 apply_timezone_from_settings, 

19 get_timezone_from_tz_string, 

20 set_correct_day_from_settings, 

21 set_correct_month_from_settings, 

22) 

23 

# Characters that merely look like an apostrophe. ``sanitize_date`` replaces
# every one of them with a plain ASCII "'" through RE_SANITIZE_APOSTROPHE.
APOSTROPHE_LOOK_ALIKE_CHARS = [
    "\u2019",  # RIGHT SINGLE QUOTATION MARK
    "\u02bc",  # MODIFIER LETTER APOSTROPHE
    "\u02bb",  # MODIFIER LETTER TURNED COMMA
    "\u055a",  # ARMENIAN APOSTROPHE
    "\ua78c",  # LATIN SMALL LETTER SALTILLO
    "\u2032",  # PRIME
    "\u2035",  # REVERSED PRIME
    "\u02b9",  # MODIFIER LETTER PRIME
    "\uff07",  # FULLWIDTH APOSTROPHE
]

# --- Whitespace handling ---------------------------------------------------
# A single non-breaking space.
RE_NBSP = re.compile("\xa0", flags=re.UNICODE)
# Any run of whitespace.
RE_SPACES = re.compile(r"\s+")
# Text wrapped in whitespace on BOTH sides; group 1 is the inner text.
RE_TRIM_SPACES = re.compile(r"^\s+(\S.*?)\s+$")
# Text followed by any number of trailing colons; group 1 is the text.
RE_TRIM_COLONS = re.compile(r"(\S.*?):*$")

# --- Noise removal ---------------------------------------------------------
# Tab, newline, carriage return, U+00BB, a comma + whitespace + U+0432 at a
# word boundary, U+200E, U+00B7, U+200F, U+064E and U+064F.
RE_SANITIZE_SKIP = re.compile(
    r"\t|\n|\r|\u00bb|,\s\u0432\b|\u200e|\xb7|\u200f|\u064e|\u064f", flags=re.M
)
# U+0433 followed by "." when preceded by a non-word character or a digit
# (group 1), i.e. the Russian year abbreviation outside of words.
RE_SANITIZE_RUSSIAN = re.compile(r"([\W\d])\u0433\.", flags=re.I | re.U)
# Three dot-terminated numbers ("d. m. y."), optionally followed by " u".
RE_SANITIZE_CROATIAN = re.compile(
    r"(\d+)\.\s?(\d+)\.\s?(\d+)\.( u)?", flags=re.I | re.U
)
# A "." whose preceding character is neither a digit nor whitespace.
RE_SANITIZE_PERIOD = re.compile(r"(?<=[^0-9\s])\.", flags=re.U)
# Everything up to and including the first "on:" plus following whitespace;
# group 1 is the remainder of the line.
RE_SANITIZE_ON = re.compile(r"^.*?on:\s+(.*)")
# Any single apostrophe look-alike character.
RE_SANITIZE_APOSTROPHE = re.compile("|".join(APOSTROPHE_LOOK_ALIKE_CHARS))

# --- Unix timestamps -------------------------------------------------------
# Ten digits of seconds, then optional three-digit millisecond and
# three-digit microsecond groups; the match must be followed by "." or the
# end of the string.
RE_SEARCH_TIMESTAMP = re.compile(r"^(\d{10})(\d{3})?(\d{3})?(?![^.])")
# Same layout as above with a leading "-" captured as part of group 1.
RE_SEARCH_NEGATIVE_TIMESTAMP = re.compile(r"^([-]\d{10})(\d{3})?(\d{3})?(?![^.])")

54 

55 

def sanitize_spaces(date_string):
    """Normalise the whitespace of ``date_string``.

    Three substitutions are applied in order: non-breaking spaces become
    ordinary spaces, every run of whitespace collapses to a single space,
    and ``RE_TRIM_SPACES`` removes surrounding whitespace (that pattern only
    matches when whitespace is present on both ends of the string).

    :param date_string: the text to clean.
    :type date_string: str
    :return: the cleaned text.
    :rtype: str
    """
    without_nbsp = RE_NBSP.sub(" ", date_string)
    single_spaced = RE_SPACES.sub(" ", without_nbsp)
    return RE_TRIM_SPACES.sub(r"\1", single_spaced)

61 

62 

def date_range(begin, end, **kwargs):
    """Yield successive dates from ``begin`` (inclusive) to ``end`` (exclusive).

    The step between two dates is ``relativedelta(**kwargs)``; without any
    keyword argument the step is one day.

    :param begin: first value to yield.
    :param end: upper bound; iteration stops once the current value is no
        longer smaller than it.
    :param kwargs: plural ``relativedelta`` arguments (``days=2``,
        ``months=1``, ...).
    :raises ValueError: if a singular argument name (``year``, ``month``,
        ``week``, ``day``, ``hour``, ``minute``, ``second``) is given. As
        this is a generator, the error surfaces on the first iteration.
    """
    # Singular names are rejected: relativedelta interprets them as absolute
    # values instead of offsets, which is never what a step should be.
    singular_names = ("year", "month", "week", "day", "hour", "minute", "second")
    offending = next((name for name in singular_names if name in kwargs), None)
    if offending is not None:
        raise ValueError("Invalid argument: %s" % offending)

    step = relativedelta(**kwargs) if kwargs else relativedelta(days=1)

    current = begin
    while current < end:
        yield current
        current += step

    # When stepping by months the loop can stop inside the month of ``end``
    # (last interval shorter than a full step); emit ``end`` in that case.
    stepping_by_months = kwargs.get("months", 0) > 0
    if stepping_by_months and (current.year, current.month) == (end.year, end.month):
        yield end

87 

88 

def get_intersecting_periods(low, high, period="day"):
    """Yield the start of every ``period`` that intersects ``[low, high)``.

    The first value is ``low`` aligned down to the start of its period; the
    following values advance by one period until ``high`` is reached.
    Nothing is yielded when ``high <= low``.

    :param low: lower bound of the range.
    :param high: upper bound of the range (exclusive).
    :param period: one of ``year``, ``month``, ``week``, ``day``, ``hour``,
        ``minute``, ``second`` or ``microsecond``.
    :type period: str
    :raises ValueError: if ``period`` is not one of the names above (raised
        on the first iteration, since this is a generator).
    """
    valid_periods = (
        "year",
        "month",
        "week",
        "day",
        "hour",
        "minute",
        "second",
        "microsecond",
    )
    if period not in valid_periods:
        raise ValueError("Invalid period: {}".format(period))

    if high <= low:
        return

    step = relativedelta(**{period + "s": 1})

    cursor = low
    if isinstance(cursor, datetime):
        # Zero every time field finer than the requested period; for periods
        # of a day or longer all four fields are cleared.
        time_fields = ("microsecond", "second", "minute", "hour")
        if period in time_fields:
            finer_fields = time_fields[: time_fields.index(period)]
        else:
            finer_fields = time_fields
        cursor = cursor.replace(**dict.fromkeys(finer_fields, 0))

    # Align to the start of the calendar unit for week/month/year.
    if period == "week":
        cursor = cursor - timedelta(days=cursor.weekday())
    elif period == "month":
        cursor = cursor.replace(day=1)
    elif period == "year":
        cursor = cursor.replace(month=1, day=1)

    while cursor < high:
        yield cursor
        cursor += step

129 

130 

def sanitize_date(date_string):
    """Strip noise from ``date_string`` before it is handed to the parsers.

    The substitutions run in a fixed order: skip-characters become spaces,
    the Russian year abbreviation and Croatian trailing dots are removed,
    whitespace is normalised, then stray periods, an ``on:`` prefix,
    trailing colons and apostrophe look-alikes are handled. The result is
    finally stripped of surrounding whitespace.

    :param date_string: raw text.
    :type date_string: str
    :return: the sanitised text.
    :rtype: str
    """
    cleaned = RE_SANITIZE_SKIP.sub(" ", date_string)
    # remove 'г.' (Russian for year) but not in words
    cleaned = RE_SANITIZE_RUSSIAN.sub(r"\1 ", cleaned)
    # extra '.' and 'u' interferes with parsing relative fractional dates
    cleaned = RE_SANITIZE_CROATIAN.sub(r"\1.\2.\3 ", cleaned)
    cleaned = sanitize_spaces(cleaned)

    remaining_steps = (
        (RE_SANITIZE_PERIOD, ""),
        (RE_SANITIZE_ON, r"\1"),
        (RE_TRIM_COLONS, r"\1"),
        (RE_SANITIZE_APOSTROPHE, "'"),
    )
    for pattern, replacement in remaining_steps:
        cleaned = pattern.sub(replacement, cleaned)

    return cleaned.strip()

146 

147 

def get_date_from_timestamp(date_string, settings, negative=False):
    """Interpret ``date_string`` as a Unix timestamp.

    The string must start with ten digits of seconds (preceded by ``-`` when
    ``negative`` is true), optionally followed by three digits of
    milliseconds and three digits of microseconds.

    :param date_string: candidate timestamp text.
    :type date_string: str
    :param settings: settings object, or ``None``; its ``TIMEZONE`` picks
        the zone used for the conversion.
    :param negative: match negative timestamps instead of positive ones.
    :type negative: bool
    :return: a datetime built from the timestamp and passed through
        ``apply_timezone_from_settings``, or ``None`` when the string is not
        a timestamp.
    """
    pattern = RE_SEARCH_NEGATIVE_TIMESTAMP if negative else RE_SEARCH_TIMESTAMP
    match = pattern.search(date_string)
    if not match:
        return None

    # If the timezone in settings is unset, or it's 'local', use the local
    # timezone; otherwise use the timezone given in settings.
    wants_local_zone = (
        settings is None
        or settings.TIMEZONE is None
        or "local" in settings.TIMEZONE.lower()
    )
    if wants_local_zone:
        timezone = get_localzone()
    else:
        timezone = get_timezone_from_tz_string(settings.TIMEZONE)

    seconds = int(match.group(1))
    millis = int(match.group(2) or 0)
    micros = int(match.group(3) or 0)

    # Convert in the chosen zone, then drop tzinfo so the wall-clock value is
    # naive before the settings-driven timezone handling is applied.
    in_zone = datetime.fromtimestamp(seconds, timezone)
    naive = in_zone.replace(microsecond=millis * 1000 + micros, tzinfo=None)
    return apply_timezone_from_settings(naive, settings)

175 

176 

def parse_with_formats(date_string, date_formats, settings):
    """Parse ``date_string`` with the first matching ``strptime`` format.

    Formats are tried in order. For the first one that parses:

    * a format without month directive (``%m``, ``%b``, ``%B``) gives period
      ``"year"`` and the month is filled in from ``settings``;
    * a format with a month but without ``%d`` gives period ``"month"``;
    * whenever ``%d`` is absent the day is filled in from ``settings``;
    * a format without ``%y``/``%Y`` gets the current year.

    :param date_string: text to parse.
    :type date_string: str
    :param date_formats: iterable of ``strptime`` format strings.
    :param settings: settings object forwarded to the helper functions.
    :return: a :class:`DateData` with ``date_obj`` and ``period`` set;
        ``date_obj`` is ``None`` (period ``"day"``) when no format matches.
    """
    period = "day"
    for date_format in date_formats:
        try:
            date_obj = datetime.strptime(date_string, date_format)
        except ValueError:
            continue

        has_month = any(
            directive in date_format for directive in ("%m", "%b", "%B")
        )
        has_day = "%d" in date_format
        has_year = "%y" in date_format or "%Y" in date_format

        if not has_month:
            period = "year"
            date_obj = set_correct_month_from_settings(date_obj, settings)
        elif not has_day:
            period = "month"

        if not has_day:
            date_obj = set_correct_day_from_settings(date_obj, settings)

        if not has_year:
            date_obj = date_obj.replace(year=datetime.today().year)

        date_obj = apply_timezone_from_settings(date_obj, settings)
        return DateData(date_obj=date_obj, period=period)

    return DateData(date_obj=None, period=period)

214 

215 

216class _DateLocaleParser: 

217 def __init__(self, locale, date_string, date_formats, settings=None): 

218 self._settings = settings 

219 if not (date_formats is None or isinstance(date_formats, (list, tuple, Set))): 

220 raise TypeError("Date formats should be list, tuple or set of strings") 

221 

222 self.locale = locale 

223 self.date_string = date_string 

224 self.date_formats = date_formats 

225 self._translated_date = None 

226 self._translated_date_with_formatting = None 

227 self._parsers = { 

228 "timestamp": self._try_timestamp, 

229 "negative-timestamp": self._try_negative_timestamp, 

230 "relative-time": self._try_freshness_parser, 

231 "custom-formats": self._try_given_formats, 

232 "absolute-time": self._try_absolute_parser, 

233 "no-spaces-time": self._try_nospaces_parser, 

234 } 

235 

236 @classmethod 

237 def parse(cls, locale, date_string, date_formats=None, settings=None): 

238 instance = cls(locale, date_string, date_formats, settings) 

239 return instance._parse() 

240 

241 def _parse(self): 

242 for parser_name in self._settings.PARSERS: 

243 date_data = self._parsers[parser_name]() 

244 if self._is_valid_date_data(date_data): 

245 return date_data 

246 else: 

247 return None 

248 

249 def _try_timestamp_parser(self, negative=False): 

250 return DateData( 

251 date_obj=get_date_from_timestamp( 

252 self.date_string, self._settings, negative=negative 

253 ), 

254 period="time" if self._settings.RETURN_TIME_AS_PERIOD else "day", 

255 ) 

256 

257 def _try_timestamp(self): 

258 return self._try_timestamp_parser() 

259 

260 def _try_negative_timestamp(self): 

261 return self._try_timestamp_parser(negative=True) 

262 

263 def _try_freshness_parser(self): 

264 try: 

265 return freshness_date_parser.get_date_data( 

266 self._get_translated_date(), self._settings 

267 ) 

268 except (OverflowError, ValueError): 

269 return None 

270 

271 def _try_absolute_parser(self): 

272 return self._try_parser(parse_method=_parse_absolute) 

273 

274 def _try_nospaces_parser(self): 

275 return self._try_parser(parse_method=_parse_nospaces) 

276 

277 def _try_parser(self, parse_method): 

278 _order = self._settings.DATE_ORDER 

279 try: 

280 if self._settings.PREFER_LOCALE_DATE_ORDER: 

281 if "DATE_ORDER" not in self._settings._mod_settings: 

282 self._settings.DATE_ORDER = self.locale.info.get( 

283 "date_order", _order 

284 ) 

285 date_obj, period = date_parser.parse( 

286 self._get_translated_date(), 

287 parse_method=parse_method, 

288 settings=self._settings, 

289 ) 

290 self._settings.DATE_ORDER = _order 

291 return DateData( 

292 date_obj=date_obj, 

293 period=period, 

294 ) 

295 except ValueError: 

296 self._settings.DATE_ORDER = _order 

297 return None 

298 

299 def _try_given_formats(self): 

300 if not self.date_formats: 

301 return 

302 

303 return parse_with_formats( 

304 self._get_translated_date_with_formatting(), 

305 self.date_formats, 

306 settings=self._settings, 

307 ) 

308 

309 def _get_translated_date(self): 

310 if self._translated_date is None: 

311 self._translated_date = self.locale.translate( 

312 self.date_string, keep_formatting=False, settings=self._settings 

313 ) 

314 return self._translated_date 

315 

316 def _get_translated_date_with_formatting(self): 

317 if self._translated_date_with_formatting is None: 

318 self._translated_date_with_formatting = self.locale.translate( 

319 self.date_string, keep_formatting=True, settings=self._settings 

320 ) 

321 return self._translated_date_with_formatting 

322 

323 def _is_valid_date_data(self, date_data): 

324 if not isinstance(date_data, DateData): 

325 return False 

326 if not date_data["date_obj"] or not date_data["period"]: 

327 return False 

328 if date_data["date_obj"] and not isinstance(date_data["date_obj"], datetime): 

329 return False 

330 if date_data["period"] not in ("time", "day", "week", "month", "year"): 

331 return False 

332 return True 

333 

334 

class DateData:
    """
    Container for the outcome of a parse: ``date_obj``, ``period`` and
    ``locale``. Besides attribute access, the fields can be read and written
    with square brackets like a dict; unknown names raise ``KeyError``.
    """

    def __init__(self, *, date_obj=None, period=None, locale=None):
        self.date_obj = date_obj
        self.period = period
        self.locale = locale

    def __getitem__(self, k):
        if hasattr(self, k):
            return getattr(self, k)
        raise KeyError(k)

    def __setitem__(self, k, v):
        if hasattr(self, k):
            setattr(self, k, v)
            return
        raise KeyError(k)

    def __repr__(self):
        rendered_fields = []
        for name, value in vars(self).items():
            rendered_fields.append("{}={}".format(name, value.__repr__()))
        return "{}({})".format(self.__class__.__name__, ", ".join(rendered_fields))

362 

363 

class DateDataParser:
    """
    Detects the language of a date/time string, translates it and parses it
    with the generic parsers.

    :param languages:
        A list of language codes, e.g. ['en', 'es', 'zh-Hant'].
        If locales are not given, languages and region are
        used to construct locales for translation.
    :type languages: list

    :param locales:
        A list of locale codes, e.g. ['fr-PF', 'qu-EC', 'af-NA'].
        The parser uses only these locales to translate date string.
    :type locales: list

    :param region:
        A region code, e.g. 'IN', '001', 'NE'.
        If locales are not given, languages and region are
        used to construct locales for translation.
    :type region: str

    :param try_previous_locales:
        If True, locales previously used to translate date are tried first.
    :type try_previous_locales: bool

    :param use_given_order:
        If True, locales are tried for translation of date string
        in the order in which they are given.
    :type use_given_order: bool

    :param settings:
        Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
    :type settings: dict

    :param detect_languages_function:
        A function for language detection that takes as input a `text` and a `confidence_threshold`,
        and returns a list of detected language codes.
        Note: this function is only used if ``languages`` and ``locales`` are not provided.
    :type detect_languages_function: function

    :return: A parser instance

    :raises:
        ``ValueError``: Unknown Language, ``TypeError``: Languages argument must be a list,
        ``SettingValidationError``: A provided setting is not valid.
    """

    # Shared LocaleDataLoader, created on first use by _get_locale_loader().
    locale_loader = None

    @apply_settings
    def __init__(
        self,
        languages=None,
        locales=None,
        region=None,
        try_previous_locales=False,
        use_given_order=False,
        settings=None,
        detect_languages_function=None,
    ):
        collection_types = (list, tuple, Set)

        if not (languages is None or isinstance(languages, collection_types)):
            raise TypeError(
                "languages argument must be a list (%r given)" % type(languages)
            )

        if not (locales is None or isinstance(locales, collection_types)):
            raise TypeError(
                "locales argument must be a list (%r given)" % type(locales)
            )

        if not (region is None or isinstance(region, str)):
            raise TypeError("region argument must be str (%r given)" % type(region))

        boolean_flags = (
            (
                try_previous_locales,
                "try_previous_locales argument must be a boolean (%r given)",
            ),
            (
                use_given_order,
                "use_given_order argument must be a boolean (%r given)",
            ),
        )
        for flag, message in boolean_flags:
            if not isinstance(flag, bool):
                raise TypeError(message % type(flag))

        # An explicit order only makes sense when there is something to order.
        if use_given_order and not locales and not languages:
            raise ValueError(
                "locales or languages must be given if use_given_order is True"
            )

        check_settings(settings)

        self._settings = settings
        self.try_previous_locales = try_previous_locales
        self.use_given_order = use_given_order
        self.languages = list(languages) if languages else None
        self.locales = locales
        self.region = region
        self.detect_languages_function = detect_languages_function
        # Used as an ordered set of the locales that already produced a date.
        self.previous_locales = collections.OrderedDict()

    def get_date_data(self, date_string, date_formats=None):
        """
        Parse string representing date and/or time in recognizable localized formats.
        Supports parsing multiple languages and timezones.

        The given ``date_formats`` are first tried on the raw string; if none
        matches, the string is sanitised and every applicable locale is tried
        in turn.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str
        :param date_formats:
            A list of format strings using directives as given
            `here <https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior>`_.
            The parser applies formats one by one, taking into account the detected languages.
        :type date_formats: list

        :return: a ``DateData`` object. When nothing could be parsed its
            ``date_obj`` is ``None``, its period ``'day'`` and its locale ``None``.

        :raises: ValueError - Unknown Language, TypeError - input is not a str

        .. note:: *Period* values can be a 'day' (default), 'week', 'month', 'year', 'time'.

        *Period* represents the granularity of date parsed from the given string.

        In the example below, since no day information is present, the day is assumed to be current
        day ``16`` from *current date* (which is June 16, 2015, at the moment of writing this).
        Hence, the level of precision is ``month``:

            >>> DateDataParser().get_date_data('March 2015')
            DateData(date_obj=datetime.datetime(2015, 3, 16, 0, 0), period='month', locale='en')

        Similarly, for date strings with no day and month information present, level of precision
        is ``year`` and day ``16`` and month ``6`` are from *current_date*.

            >>> DateDataParser().get_date_data('2014')
            DateData(date_obj=datetime.datetime(2014, 6, 16, 0, 0), period='year', locale='en')

        Dates with time zone indications or UTC offsets are returned in UTC time unless
        specified using `Settings <https://dateparser.readthedocs.io/en/latest/settings.html#settings>`__.

            >>> DateDataParser().get_date_data('23 March 2000, 1:21 PM CET')
            DateData(date_obj=datetime.datetime(2000, 3, 23, 13, 21, tzinfo=<StaticTzInfo 'CET'>),
            period='day', locale='en')

        """
        if not isinstance(date_string, str):
            raise TypeError("Input type must be str")

        # Fast path: explicit formats applied to the untouched input.
        direct_result = parse_with_formats(
            date_string, date_formats or [], self._settings
        )
        if direct_result["date_obj"]:
            return direct_result

        date_string = sanitize_date(date_string)

        for locale in self._get_applicable_locales(date_string):
            parsed_date = _DateLocaleParser.parse(
                locale, date_string, date_formats, settings=self._settings
            )
            if not parsed_date:
                continue
            parsed_date["locale"] = locale.shortname
            if self.try_previous_locales:
                self.previous_locales[locale] = None
            return parsed_date

        return DateData(date_obj=None, period="day", locale=None)

    def get_date_tuple(self, *args, **kwargs):
        """Same as :meth:`get_date_data`, returned as a ``DateData`` namedtuple."""
        attributes = self.get_date_data(*args, **kwargs).__dict__
        tuple_type = collections.namedtuple("DateData", attributes.keys())
        return tuple_type(**attributes)

    def _get_applicable_locales(self, date_string):
        """Yield the locales worth trying for ``date_string``.

        Order: previously successful locales (when ``try_previous_locales``
        is set), then the locales built from languages/locales/region, then
        the locales of ``settings.DEFAULT_LANGUAGES``. The latter are yielded
        without an applicability check.
        """
        not_computed = object()
        tz_stripped = not_computed

        def candidate_strings():
            """Yield ``date_string`` and, if different, its timezone-less form.

            Written as a generator so that pop_tz_offset_from_string is only
            called when the unmodified string did not end the search; its
            result is remembered across calls.
            """
            nonlocal tz_stripped
            yield date_string
            if tz_stripped is not_computed:
                without_tz, _ = pop_tz_offset_from_string(
                    date_string, as_offset=False
                )
                tz_stripped = None if without_tz == date_string else without_tz
            if tz_stripped is not None:
                yield tz_stripped

        def applicable_hits(locale):
            """Yield ``locale`` once per candidate string it applies to."""
            for candidate in candidate_strings():
                if self._is_applicable_locale(locale, candidate):
                    yield locale

        if self.try_previous_locales:
            for locale in self.previous_locales.keys():
                yield from applicable_hits(locale)

        if self.detect_languages_function and not self.languages and not self.locales:
            detected_languages = self.detect_languages_function(
                text=date_string,
                confidence_threshold=self._settings.LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD,
            )
            # NOTE(review): the detected languages are stored on the instance,
            # so once a non-empty result is stored later calls appear to skip
            # detection - confirm this is intended.
            self.languages = map_languages(detected_languages)

        configured_locales = self._get_locale_loader().get_locales(
            languages=self.languages,
            locales=self.locales,
            region=self.region,
            use_given_order=self.use_given_order,
        )
        for locale in configured_locales:
            yield from applicable_hits(locale)

        if self._settings.DEFAULT_LANGUAGES:
            fallback_locales = self._get_locale_loader().get_locales(
                languages=self._settings.DEFAULT_LANGUAGES,
                locales=None,
                region=self.region,
                use_given_order=self.use_given_order,
            )
            for locale in fallback_locales:
                yield locale

    def _is_applicable_locale(self, locale, date_string):
        """Ask ``locale`` whether it can handle ``date_string`` as given."""
        return locale.is_applicable(
            date_string,
            strip_timezone=False,  # it is stripped outside
            settings=self._settings,
        )

    @classmethod
    def _get_locale_loader(cls):
        """Return the class-level ``LocaleDataLoader``, creating it on first use."""
        if cls.locale_loader:
            return cls.locale_loader
        cls.locale_loader = LocaleDataLoader()
        return cls.locale_loader