Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/dateparser/languages/locale.py: 51%


from collections import OrderedDict
from itertools import chain

import regex as re
from dateutil import parser

from dateparser.timezone_parser import pop_tz_offset_from_string, word_is_tz
from dateparser.utils import combine_dicts, normalize_unicode

from .dictionary import ALWAYS_KEEP_TOKENS, Dictionary, NormalizedDictionary

NUMERAL_PATTERN = re.compile(r"(\d+)", re.U)

class Locale:
    """
    Class that deals with applicability and translation from a locale.

    :param shortname:
        A locale code, e.g. 'fr-PF', 'qu-EC', 'af-NA'.
    :type shortname: str

    :param language_info:
        Language info (translation data) of the language the locale belongs to.
    :type language_info: dict

    :return: A Locale instance
    """

    _dictionary = None
    _normalized_dictionary = None
    _simplifications = None
    _normalized_simplifications = None
    _splitters = None
    _wordchars = None
    _relative_translations = None
    _normalized_relative_translations = None
    _abbreviations = None
    _split_dictionary = None
    _wordchars_for_detection = None

    def __init__(self, shortname, language_info):
        self.shortname = shortname
        # Merge the language-wide data with any overrides defined for this
        # specific locale, then drop the raw "locale_specific" blob.
        locale_specific_info = language_info.get("locale_specific", {}).get(
            shortname, {}
        )
        self.info = combine_dicts(language_info, locale_specific_info)
        self.info.pop("locale_specific", None)
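
    # Usage sketch (hypothetical): a Locale is normally built by dateparser's
    # loader machinery rather than by hand, roughly like
    #
    #   locale = Locale("fr-PF", language_info=language_info)
    #
    # where language_info is the translation-data dict loaded for "fr".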

    def is_applicable(self, date_string, strip_timezone=False, settings=None):
        """
        Check if the locale is applicable for translating the date string.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str

        :param strip_timezone:
            If True, the timezone is stripped from the date string.
        :type strip_timezone: bool

        :return: True if the locale is applicable for the date string, False otherwise.
        """
        if strip_timezone:
            date_string, _ = pop_tz_offset_from_string(date_string, as_offset=False)

        date_string = self._translate_numerals(date_string)
        if settings.NORMALIZE:
            date_string = normalize_unicode(date_string)
        date_string = self._simplify(date_string, settings=settings)
        dictionary = self._get_dictionary(settings)
        date_tokens = dictionary.split(date_string)
        return dictionary.are_tokens_valid(date_tokens)
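
    # Applicability sketch (hypothetical values): given a settings object,
    # an "es" locale answering
    #
    #   locale.is_applicable("11 de enero de 2021", settings=settings)
    #
    # tokenizes the simplified string against the locale dictionary and
    # returns True only when every token is a known word or a number.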

    def count_applicability(self, text, strip_timezone=False, settings=None):
        if strip_timezone:
            text, _ = pop_tz_offset_from_string(text, as_offset=False)

        text = self._simplify(text, settings=settings)
        sentences = self._sentence_split(text, settings=settings)
        tokens = []
        for sent in sentences:
            tokens.extend(self._split(sent, keep_formatting=False, settings=settings))
        return self._count_words_present_in_the_dictionary(tokens, settings)

    def _count_words_present_in_the_dictionary(self, words, settings=None):
        dictionary = self.clean_dictionary(
            self._get_split_dictionary(settings=settings)
        )
        # Count unique tokens that translate to something (dict_cnt) versus
        # tokens that are known-but-empty or purely numeric (skip_cnt).
        dict_cnt = 0
        skip_cnt = 0
        for word in set(words):
            if word in dictionary:
                if dictionary[word]:
                    dict_cnt += 1
                else:
                    skip_cnt += 1
            elif word.isdigit():
                skip_cnt += 1
        return [dict_cnt, skip_cnt]

    @staticmethod
    def clean_dictionary(dictionary, threshold=2):
        # Drop entries whose keys are shorter than the threshold; such short
        # keys are too ambiguous to count as evidence for a language.
        del_keys = []
        for key in dictionary:
            if len(key) < threshold:
                del_keys.append(key)
        for del_key in del_keys:
            del dictionary[del_key]
        return dictionary
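
    # For example, clean_dictionary({"a": "year", "de": "", "mes": "month"})
    # returns {"de": "", "mes": "month"}: only the single-character key "a"
    # falls under the default threshold of 2.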

    def translate(self, date_string, keep_formatting=False, settings=None):
        """
        Translate the date string to its English equivalent.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str

        :param keep_formatting:
            If True, retain formatting of the date string after translation.
        :type keep_formatting: bool

        :return: The translated date string.
        """
        date_string = self._translate_numerals(date_string)
        if settings.NORMALIZE:
            date_string = normalize_unicode(date_string)
        date_string = self._simplify(date_string, settings=settings)
        dictionary = self._get_dictionary(settings)
        date_string_tokens = dictionary.split(date_string, keep_formatting)

        relative_translations = self._get_relative_translations(settings=settings)

        for i, word in enumerate(date_string_tokens):
            word = word.lower()
            for pattern, replacement in relative_translations.items():
                if pattern.match(word):
                    date_string_tokens[i] = pattern.sub(replacement, word)
                    break
            else:
                if word in dictionary:
                    fallback = word if keep_formatting and not word.isalpha() else ""
                    date_string_tokens[i] = dictionary[word] or fallback
        if "in" in date_string_tokens:
            date_string_tokens = self._clear_future_words(date_string_tokens)

        return self._join(
            list(filter(bool, date_string_tokens)),
            separator="" if keep_formatting else " ",
            settings=settings,
        )
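
    # Translation sketch (hypothetical values): for an "es" locale,
    #
    #   locale.translate("11 de enero de 2021", settings=settings)
    #
    # would yield something like "11 january 2021": relative expressions are
    # rewritten first, then remaining tokens are looked up in the dictionary.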

    def _translate_numerals(self, date_string):
        # Convert locale-specific decimal digits to ASCII: int() accepts any
        # Unicode decimal digits, and zfill() preserves leading zeros.
        date_string_tokens = NUMERAL_PATTERN.split(date_string)
        for i, token in enumerate(date_string_tokens):
            if token.isdecimal():
                date_string_tokens[i] = str(int(token)).zfill(len(token))
        return "".join(date_string_tokens)
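
    # For example, _translate_numerals("٠٥ مارس") returns "05 مارس": the
    # Arabic-Indic digits are parsed by int() and re-emitted as ASCII,
    # with zfill() restoring the leading zero.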

    def _get_relative_translations(self, settings=None):
        if settings.NORMALIZE:
            if self._normalized_relative_translations is None:
                self._normalized_relative_translations = (
                    self._generate_relative_translations(normalize=True)
                )
            return self._normalized_relative_translations
        else:
            if self._relative_translations is None:
                self._relative_translations = self._generate_relative_translations(
                    normalize=False
                )
            return self._relative_translations

    def _generate_relative_translations(self, normalize=False):
        relative_translations = self.info.get("relative-type-regex", {})
        relative_dictionary = OrderedDict()
        for key, value in relative_translations.items():
            if normalize:
                value = list(map(normalize_unicode, value))
            # Longest alternatives first so the regex prefers the most
            # specific match; the numeric group is renamed to "n" so
            # replacement templates can refer to it.
            pattern = "|".join(sorted(value, key=len, reverse=True))
            pattern = pattern.replace(r"(\d+", r"(?P<n>\d+")
            pattern = re.compile(
                r"^(?:{})$".format(pattern), re.UNICODE | re.IGNORECASE
            )
            relative_dictionary[pattern] = key
        return relative_dictionary
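
    # For example, a (hypothetical) locale entry
    #
    #   {"in \\1 day": ["dans (\\d+) jours?"]}
    #
    # compiles to ^(?:dans (?P<n>\d+) jours?)$ and substitutes matching
    # tokens with the English template "in \1 day".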

    def translate_search(self, search_string, settings=None):
        dashes = ["-", "——", "—", "~"]
        word_joint_unsupported_languages = ["zh", "ja"]
        sentences = self._sentence_split(search_string, settings=settings)
        dictionary = self._get_dictionary(settings=settings)
        translated = []
        original = []
        for sentence in sentences:
            original_tokens, simplified_tokens = self._simplify_split_align(
                sentence, settings=settings
            )
            translated_chunk = []
            original_chunk = []
            last_token_index = len(simplified_tokens) - 1
            skip_next_token = False
            for i, word in enumerate(simplified_tokens):
                next_word = simplified_tokens[i + 1] if i < last_token_index else ""
                current_and_next_joined = self._join_chunk(
                    [word, next_word], settings=settings
                )
                if skip_next_token:
                    skip_next_token = False
                    continue

                if word == "" or word == " ":
                    translated_chunk.append(word)
                    original_chunk.append(original_tokens[i])
                elif (
                    current_and_next_joined in dictionary
                    and word not in dashes
                    and self.shortname not in word_joint_unsupported_languages
                ):
                    # Two-token dictionary entries (e.g. multi-word month
                    # names) are translated as a unit.
                    translated_chunk.append(dictionary[current_and_next_joined])
                    original_chunk.append(
                        self._join_chunk(
                            [original_tokens[i], original_tokens[i + 1]],
                            settings=settings,
                        )
                    )
                    skip_next_token = True
                elif word in dictionary and word not in dashes:
                    translated_chunk.append(dictionary[word])
                    original_chunk.append(original_tokens[i])
                elif word.strip("()\"'{}[],.،") in dictionary and word not in dashes:
                    punct = word[len(word.strip("()\"'{}[],.،")) :]
                    if punct and dictionary[word.strip("()\"'{}[],.،")]:
                        translated_chunk.append(
                            dictionary[word.strip("()\"'{}[],.،")] + punct
                        )
                    else:
                        translated_chunk.append(dictionary[word.strip("()\"'{}[],.،")])
                    original_chunk.append(original_tokens[i])
                elif self._token_with_digits_is_ok(word):
                    translated_chunk.append(word)
                    original_chunk.append(original_tokens[i])
                # Use the original token because word_is_tz is case sensitive
                elif translated_chunk and word_is_tz(original_tokens[i]):
                    translated_chunk.append(word)
                    original_chunk.append(original_tokens[i])
                else:
                    # An untranslatable word ends the current date chunk.
                    if translated_chunk:
                        translated.append(translated_chunk)
                        translated_chunk = []
                        original.append(original_chunk)
                        original_chunk = []
            if translated_chunk:
                translated.append(translated_chunk)
                original.append(original_chunk)
        for i in range(len(translated)):
            if "in" in translated[i]:
                translated[i] = self._clear_future_words(translated[i])
            translated[i] = self._join_chunk(
                list(filter(bool, translated[i])), settings=settings
            )
            original[i] = self._join_chunk(
                list(filter(bool, original[i])), settings=settings
            )
        return translated, original
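
    # Search-translation sketch (hypothetical values): for a "de" locale,
    #
    #   locale.translate_search("Treffen am 3. Dezember, danach Urlaub",
    #                           settings=settings)
    #
    # returns two aligned lists: translated date chunks (e.g. "3. december")
    # and the original substrings they came from, so callers can locate each
    # date mention inside the source text.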

    def _get_abbreviations(self, settings):
        dictionary = self._get_dictionary(settings=settings)
        abbreviations = []
        if self._abbreviations is None:
            for item in dictionary:
                if item.endswith(".") and len(item) > 1:
                    abbreviations.append(item)
            self._abbreviations = abbreviations
        return self._abbreviations

    def _sentence_split(self, string, settings):
        abbreviations = self._get_abbreviations(settings=settings)
        digit_abbreviations = ["[0-9]"]  # numeric date with full stop
        abbreviation_string = ""

        for abbreviation in abbreviations:
            # Negative lookbehind: don't split after a known abbreviation.
            abbreviation_string += "(?<! " + abbreviation[:-1] + ")"
        if self.shortname in ["fi", "cs", "hu", "de", "da"]:
            for digit_abbreviation in digit_abbreviations:
                # Negative lookbehind: don't split after an ordinal digit.
                abbreviation_string += "(?<!" + digit_abbreviation + ")"

        splitters_dict = {
            1: r"[\.!?;…\r\n]+(?:\s|$)*",  # most European languages, Tagalog,
            # Hebrew, Georgian, Indonesian, Vietnamese
            2: r"[\.!?;…\r\n]+(\s*[¡¿]*|$)|[¡¿]+",  # Spanish
            3: r"[|!?;\r\n]+(?:\s|$)+",  # Hindi and Bangla
            4: r"[。…‥\.!?？！;\r\n]+(?:\s|$)+",  # Japanese and Chinese
            5: r"[\r\n]+",  # Thai
            6: r"[\r\n؟!\.…]+(?:\s|$)+",  # Arabic and Farsi
        }
        if "sentence_splitter_group" not in self.info:
            split_reg = abbreviation_string + splitters_dict[1]
            sentences = re.split(split_reg, string)
        else:
            split_reg = (
                abbreviation_string
                + splitters_dict[self.info["sentence_splitter_group"]]
            )
            sentences = re.split(split_reg, string)

        sentences = filter(None, sentences)
        return sentences
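
    # For example, with splitter group 1 a string like
    # "posted 5 jan. updated yesterday!" splits on "!" but not on the "."
    # after "jan", assuming "jan." is a dictionary abbreviation: it
    # contributes the negative lookbehind "(?<! jan)" to the split regex.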

    def _simplify_split_align(self, original, settings):
        # TODO: Switch to new split method.
        original_tokens = self._word_split(original, settings=settings)
        simplified_tokens = self._word_split(
            self._simplify(normalize_unicode(original), settings=settings),
            settings=settings,
        )
        if len(original_tokens) == len(simplified_tokens):
            return original_tokens, simplified_tokens

        elif len(original_tokens) < len(simplified_tokens):
            add_empty = False
            for i, token in enumerate(simplified_tokens):
                if i < len(original_tokens):
                    if token == normalize_unicode(original_tokens[i].lower()):
                        add_empty = False
                    else:
                        if not add_empty:
                            add_empty = True
                            continue
                        else:
                            original_tokens.insert(i, "")
                else:
                    original_tokens.insert(i, "")
        else:
            add_empty = False
            for i, token in enumerate(original_tokens):
                if i < len(simplified_tokens):
                    if normalize_unicode(token.lower()) == simplified_tokens[i]:
                        add_empty = False
                    else:
                        if not add_empty:
                            add_empty = True
                            continue
                        else:
                            simplified_tokens.insert(i, "")
                else:
                    simplified_tokens.insert(i, "")

        while len(original_tokens) != len(simplified_tokens):
            if len(original_tokens) > len(simplified_tokens):
                original_tokens.remove("")
            else:
                simplified_tokens.remove("")
        return original_tokens, simplified_tokens

    def _get_split_dictionary(self, settings):
        if self._split_dictionary is None:
            # Note: forces normalization on so the split dictionary is built
            # from the normalized word forms.
            settings.NORMALIZE = True
            dictionary = self._get_dictionary(settings=settings)
            self._split_dictionary = self._split_dict(dictionary)
        return self._split_dictionary

    def _split_dict(self, dictionary):
        newdict = {}
        for item in dictionary:
            if " " in item:
                items = item.split()
                for i in items:
                    newdict[i] = dictionary[item]
            else:
                newdict[item] = dictionary[item]
        return newdict
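
    # For example, _split_dict({"de la": "", "hoy": "0 day ago"}) returns
    # {"de": "", "la": "", "hoy": "0 day ago"}: multi-word keys are exploded
    # so each word can be matched on its own.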

    def _word_split(self, string, settings):
        if "no_word_spacing" in self.info:
            return self._split(string, keep_formatting=True, settings=settings)
        else:
            return string.split()

    def _split(self, date_string, keep_formatting, settings=None):
        tokens = [date_string]
        tokens = list(self._split_tokens_with_regex(tokens, r"(\d+)"))
        tokens = list(
            self._split_tokens_by_known_words(
                tokens, keep_formatting, settings=settings
            )
        )
        return tokens

    def _split_tokens_with_regex(self, tokens, regex):
        tokens = tokens[:]
        for i, token in enumerate(tokens):
            tokens[i] = re.split(regex, token)
        return filter(bool, chain.from_iterable(tokens))

    def _split_tokens_by_known_words(self, tokens, keep_formatting, settings=None):
        dictionary = self._get_dictionary(settings)
        for i, token in enumerate(tokens):
            tokens[i] = dictionary.split(token, keep_formatting)
        return list(chain.from_iterable(tokens))

    def _join_chunk(self, chunk, settings):
        if "no_word_spacing" in self.info:
            return self._join(chunk, separator="", settings=settings)
        else:
            return re.sub(r"\s{2,}", " ", " ".join(chunk))

    def _token_with_digits_is_ok(self, token):
        if "no_word_spacing" in self.info:
            return re.search(r"[\d\.:\-/]+", token) is not None
        else:
            return re.search(r"\d+", token) is not None

    def _simplify(self, date_string, settings=None):
        # Apply each locale simplification pattern in order, re-lowercasing
        # after every substitution since replacements may introduce capitals.
        date_string = date_string.lower()
        simplifications = self._get_simplifications(settings=settings)
        for simplification in simplifications:
            pattern, replacement = list(simplification.items())[0]
            date_string = pattern.sub(replacement, date_string).lower()
        return date_string

    def _get_simplifications(self, settings=None):
        # no_word_spacing is stored as the string "True"/"False" in the
        # locale data; eval() turns it into a boolean.
        no_word_spacing = eval(self.info.get("no_word_spacing", "False"))
        if settings.NORMALIZE:
            if self._normalized_simplifications is None:
                self._normalized_simplifications = []
                simplifications = self._generate_simplifications(normalize=True)
                for simplification in simplifications:
                    pattern, replacement = list(simplification.items())[0]
                    if not no_word_spacing:
                        pattern = r"(?<=\A|\W|_)%s(?=\Z|\W|_)" % pattern
                    pattern = re.compile(pattern, flags=re.I | re.U)
                    self._normalized_simplifications.append({pattern: replacement})
            return self._normalized_simplifications
        else:
            if self._simplifications is None:
                self._simplifications = []
                simplifications = self._generate_simplifications(normalize=False)
                for simplification in simplifications:
                    pattern, replacement = list(simplification.items())[0]
                    if not no_word_spacing:
                        pattern = r"(?<=\A|\W|_)%s(?=\Z|\W|_)" % pattern
                    pattern = re.compile(pattern, flags=re.I | re.U)
                    self._simplifications.append({pattern: replacement})
            return self._simplifications
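
    # For languages with word spacing, each simplification pattern is wrapped
    # in boundary assertions. A (hypothetical) pattern "(\d+)hr" becomes
    #
    #   (?<=\A|\W|_)(\d+)hr(?=\Z|\W|_)
    #
    # so it matches "5hr later" but not the middle of a longer word.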

    def _generate_simplifications(self, normalize=False):
        simplifications = []
        for simplification in self.info.get("simplifications", []):
            c_simplification = {}
            key, value = list(simplification.items())[0]
            if normalize:
                key = normalize_unicode(key)

            if isinstance(value, int):
                c_simplification[key] = str(value)
            else:
                c_simplification[key] = normalize_unicode(value) if normalize else value

            simplifications.append(c_simplification)
        return simplifications

    def _clear_future_words(self, words):
        # "in" only signals a future date when a freshness unit is present;
        # otherwise it is noise left over from translation and is dropped.
        freshness_words = {"day", "week", "month", "year", "hour", "minute", "second"}
        if set(words).isdisjoint(freshness_words):
            words.remove("in")
        return words
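
    # For example, _clear_future_words(["in", "2", "week"]) keeps "in", while
    # _clear_future_words(["in", "april"]) returns ["april"].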

    def _join(self, tokens, separator=" ", settings=None):
        if not tokens:
            return ""

        capturing_splitters = self._get_splitters(settings)["capturing"]
        joined = tokens[0]
        for i in range(1, len(tokens)):
            left, right = tokens[i - 1], tokens[i]
            if left not in capturing_splitters and right not in capturing_splitters:
                joined += separator
            joined += right

        return joined

    def _get_dictionary(self, settings=None):
        if not settings.NORMALIZE:
            if self._dictionary is None:
                self._generate_dictionary()
            self._dictionary._settings = settings
            return self._dictionary
        else:
            if self._normalized_dictionary is None:
                self._generate_normalized_dictionary()
            self._normalized_dictionary._settings = settings
            return self._normalized_dictionary

    def _get_wordchars(self, settings=None):
        if self._wordchars is None:
            self._set_wordchars(settings)
        return self._wordchars

    def _get_splitters(self, settings=None):
        if self._splitters is None:
            self._set_splitters(settings)
        return self._splitters

    def _set_splitters(self, settings=None):
        splitters = {
            # The ones that split the string only if they are not surrounded
            # by letters from both sides:
            "wordchars": set(),
            # The ones that are not filtered out from tokens after split:
            "capturing": set(),
        }
        splitters["capturing"] |= set(ALWAYS_KEEP_TOKENS)

        wordchars = self._get_wordchars(settings)
        skip = set(self.info.get("skip", [])) | splitters["capturing"]
        for token in skip:
            if not re.match(r"^\W+$", token, re.UNICODE):
                continue
            if token in wordchars:
                splitters["wordchars"].add(token)

        self._splitters = splitters

    def _set_wordchars(self, settings=None):
        wordchars = set()
        for word in self._get_dictionary(settings):
            if re.match(r"^[\W\d_]+$", word, re.UNICODE):
                continue
            for char in word:
                wordchars.add(char.lower())

        # Always treat the ASCII digits as word characters.
        self._wordchars = wordchars - {" "} | set("0123456789")

    def get_wordchars_for_detection(self, settings):
        if self._wordchars_for_detection is None:
            wordchars = set()
            for word in self._get_dictionary(settings):
                if re.match(r"^[\W\d_]+$", word, re.UNICODE):
                    continue
                for char in word:
                    wordchars.add(char.lower())
            # Exclude digits, common punctuation, space, and the letters of
            # "am"/"pm" markers, which are not distinctive for detection.
            self._wordchars_for_detection = wordchars - set("0123456789:()'qamp ")
        return self._wordchars_for_detection

    def _generate_dictionary(self, settings=None):
        self._dictionary = Dictionary(self.info, settings=settings)

    def _generate_normalized_dictionary(self, settings=None):
        self._normalized_dictionary = NormalizedDictionary(self.info, settings=settings)

    def to_parserinfo(self, base_cls=parser.parserinfo):
        attributes = {
            "JUMP": self.info.get("skip", []),
            "PERTAIN": self.info.get("pertain", []),
            "WEEKDAYS": [
                self.info["monday"],
                self.info["tuesday"],
                self.info["wednesday"],
                self.info["thursday"],
                self.info["friday"],
                self.info["saturday"],
                self.info["sunday"],
            ],
            "MONTHS": [
                self.info["january"],
                self.info["february"],
                self.info["march"],
                self.info["april"],
                self.info["may"],
                self.info["june"],
                self.info["july"],
                self.info["august"],
                self.info["september"],
                self.info["october"],
                self.info["november"],
                self.info["december"],
            ],
            "HMS": [self.info["hour"], self.info["minute"], self.info["second"]],
        }
        name = "{language}ParserInfo".format(language=self.info["name"])
        # type() takes no keyword arguments: the bases must be passed as a
        # tuple and the namespace as a positional dict.
        return type(name, (base_cls,), attributes)
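
# to_parserinfo sketch (hypothetical usage): the generated class can be fed
# straight to dateutil, e.g.
#
#   FrParserInfo = locale.to_parserinfo()
#   parser.parse("3 juillet 2021", parserinfo=FrParserInfo())
#
# assuming the locale's weekday and month lists carry the French names.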