from collections import OrderedDict
from itertools import chain

import regex as re
from dateutil import parser

from dateparser.timezone_parser import pop_tz_offset_from_string, word_is_tz
from dateparser.utils import combine_dicts, normalize_unicode

from .dictionary import ALWAYS_KEEP_TOKENS, Dictionary, NormalizedDictionary

NUMERAL_PATTERN = re.compile(r"(\d+)", re.U)


class Locale:
    """
    Class that deals with applicability and translation from a locale.

    :param shortname:
        A locale code, e.g. 'fr-PF', 'qu-EC', 'af-NA'.
    :type shortname: str

    :param language_info:
        Language info (translation data) of the language the locale belongs to.
    :type language_info: dict

    :return: A Locale instance
    """

    _dictionary = None
    _normalized_dictionary = None
    _simplifications = None
    _normalized_simplifications = None
    _splitters = None
    _wordchars = None
    _relative_translations = None
    _normalized_relative_translations = None
    _abbreviations = None
    _split_dictionary = None
    _wordchars_for_detection = None

    def __init__(self, shortname, language_info):
        self.shortname = shortname
        locale_specific_info = language_info.get("locale_specific", {}).get(
            shortname, {}
        )
        self.info = combine_dicts(language_info, locale_specific_info)
        self.info.pop("locale_specific", None)
    def is_applicable(self, date_string, strip_timezone=False, settings=None):
        """
        Check if the locale is applicable to translate the date string.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str

        :param strip_timezone:
            If True, the timezone is stripped from the date string before the check.
        :type strip_timezone: bool

        :return: True if the locale is applicable to the date string, False otherwise.
        """
        if strip_timezone:
            date_string, _ = pop_tz_offset_from_string(date_string, as_offset=False)

        date_string = self._translate_numerals(date_string)
        if settings.NORMALIZE:
            date_string = normalize_unicode(date_string)
        date_string = self._simplify(date_string, settings=settings)
        dictionary = self._get_dictionary(settings)
        date_tokens = dictionary.split(date_string)
        return dictionary.are_tokens_valid(date_tokens)
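
    # Illustrative usage (a sketch, not part of this module): Locale objects
    # are normally produced by dateparser's loader rather than built by hand,
    # and `settings` must expose a NORMALIZE attribute. The loader call below
    # is an assumption about the surrounding package, not a documented API.
    #
    #   >>> from dateparser.languages.loader import LocaleDataLoader
    #   >>> locale = list(LocaleDataLoader().get_locales(languages=["fr"]))[0]
    #   >>> locale.is_applicable("13 janvier 2018", settings=settings)
    #   True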

    def count_applicability(self, text, strip_timezone=False, settings=None):
        if strip_timezone:
            text, _ = pop_tz_offset_from_string(text, as_offset=False)

        text = self._simplify(text, settings=settings)
        sentences = self._sentence_split(text, settings=settings)
        tokens = []
        for sent in sentences:
            tokens.extend(self._split(sent, keep_formatting=False, settings=settings))
        return self._count_words_present_in_the_dictionary(tokens, settings)

    def _count_words_present_in_the_dictionary(self, words, settings=None):
        dictionary = self.clean_dictionary(
            self._get_split_dictionary(settings=settings)
        )
        dict_cnt = 0
        skip_cnt = 0
        for word in set(words):
            if word in dictionary:
                if dictionary[word]:
                    dict_cnt += 1
                else:
                    skip_cnt += 1
            elif word.isdigit():
                skip_cnt += 1
        return [dict_cnt, skip_cnt]

    @staticmethod
    def clean_dictionary(dictionary, threshold=2):
        del_keys = []
        for key in dictionary:
            if len(key) < threshold:
                del_keys.append(key)
        for del_key in del_keys:
            del dictionary[del_key]
        return dictionary
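
    # Sketch of clean_dictionary's effect (hypothetical mapping, not real
    # locale data): keys shorter than `threshold` characters are dropped in
    # place, filtering out one-letter noise before word counting.
    #
    #   >>> Locale.clean_dictionary({"j": "january", "on": "", "mardi": "tuesday"})
    #   {'on': '', 'mardi': 'tuesday'}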

    def translate(self, date_string, keep_formatting=False, settings=None):
        """
        Translate the date string to its English equivalent.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str

        :param keep_formatting:
            If True, retain formatting of the date string after translation.
        :type keep_formatting: bool

        :return: translated date string.
        """
        date_string = self._translate_numerals(date_string)
        if settings.NORMALIZE:
            date_string = normalize_unicode(date_string)
        date_string = self._simplify(date_string, settings=settings)
        dictionary = self._get_dictionary(settings)
        date_string_tokens = dictionary.split(date_string, keep_formatting)

        relative_translations = self._get_relative_translations(settings=settings)

        for i, word in enumerate(date_string_tokens):
            word = word.lower()
            for pattern, replacement in relative_translations.items():
                if pattern.match(word):
                    date_string_tokens[i] = pattern.sub(replacement, word)
                    break
            else:
                if word in dictionary:
                    fallback = word if keep_formatting and not word.isalpha() else ""
                    date_string_tokens[i] = dictionary[word] or fallback
        if "in" in date_string_tokens:
            date_string_tokens = self._clear_future_words(date_string_tokens)

        return self._join(
            list(filter(bool, date_string_tokens)),
            separator="" if keep_formatting else " ",
            settings=settings,
        )
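
    # Illustrative call (hedged; the exact output depends on the loaded
    # dictionary for the locale and on `settings`):
    #
    #   >>> locale.translate("13 janvier 2018", settings=settings)
    #   '13 january 2018'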

    def _translate_numerals(self, date_string):
        date_string_tokens = NUMERAL_PATTERN.split(date_string)
        for i, token in enumerate(date_string_tokens):
            if token.isdecimal():
                date_string_tokens[i] = str(int(token)).zfill(len(token))
        return "".join(date_string_tokens)

    def _get_relative_translations(self, settings=None):
        if settings.NORMALIZE:
            if self._normalized_relative_translations is None:
                self._normalized_relative_translations = (
                    self._generate_relative_translations(normalize=True)
                )
            return self._normalized_relative_translations
        else:
            if self._relative_translations is None:
                self._relative_translations = self._generate_relative_translations(
                    normalize=False
                )
            return self._relative_translations

    def _generate_relative_translations(self, normalize=False):
        relative_translations = self.info.get("relative-type-regex", {})
        relative_dictionary = OrderedDict()
        for key, value in relative_translations.items():
            if normalize:
                value = list(map(normalize_unicode, value))
            pattern = "|".join(sorted(value, key=len, reverse=True))
            pattern = pattern.replace(r"(\d+", r"(?P<n>\d+")
            pattern = re.compile(
                r"^(?:{})$".format(pattern), re.UNICODE | re.IGNORECASE
            )
            relative_dictionary[pattern] = key
        return relative_dictionary
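
    # Sketch with hypothetical locale data: an entry mapping the canonical
    # form "\1 day ago" to the source regexes ["hace (\\d+) días"] compiles
    # to the anchored pattern r"^(?:hace (?P<n>\d+) días)$"; translate() can
    # then rewrite "hace 3 días" into "3 day ago" via pattern.sub().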

    def translate_search(self, search_string, settings=None):
        dashes = ["-", "——", "—", "~"]
        word_joint_unsupported_languages = ["zh", "ja"]
        sentences = self._sentence_split(search_string, settings=settings)
        dictionary = self._get_dictionary(settings=settings)
        translated = []
        original = []
        for sentence in sentences:
            original_tokens, simplified_tokens = self._simplify_split_align(
                sentence, settings=settings
            )
            translated_chunk = []
            original_chunk = []
            last_token_index = len(simplified_tokens) - 1
            skip_next_token = False
            for i, word in enumerate(simplified_tokens):
                next_word = simplified_tokens[i + 1] if i < last_token_index else ""
                current_and_next_joined = self._join_chunk(
                    [word, next_word], settings=settings
                )
                if skip_next_token:
                    skip_next_token = False
                    continue

                if word == "" or word == " ":
                    translated_chunk.append(word)
                    original_chunk.append(original_tokens[i])
                elif (
                    current_and_next_joined in dictionary
                    and word not in dashes
                    and self.shortname not in word_joint_unsupported_languages
                ):
                    translated_chunk.append(dictionary[current_and_next_joined])
                    original_chunk.append(
                        self._join_chunk(
                            [original_tokens[i], original_tokens[i + 1]],
                            settings=settings,
                        )
                    )
                    skip_next_token = True
                elif word in dictionary and word not in dashes:
                    translated_chunk.append(dictionary[word])
                    original_chunk.append(original_tokens[i])
                elif word.strip("()\"'{}[],.،") in dictionary and word not in dashes:
                    stripped_word = word.strip("()\"'{}[],.،")
                    # Reattach the trailing punctuation that was stripped.
                    punct = word[len(stripped_word):]
                    if punct and dictionary[stripped_word]:
                        translated_chunk.append(dictionary[stripped_word] + punct)
                    else:
                        translated_chunk.append(dictionary[stripped_word])
                    original_chunk.append(original_tokens[i])
                elif self._token_with_digits_is_ok(word):
                    translated_chunk.append(word)
                    original_chunk.append(original_tokens[i])
                # Use the original token because word_is_tz is case sensitive.
                elif translated_chunk and word_is_tz(original_tokens[i]):
                    translated_chunk.append(word)
                    original_chunk.append(original_tokens[i])
                else:
                    if translated_chunk:
                        translated.append(translated_chunk)
                        translated_chunk = []
                        original.append(original_chunk)
                        original_chunk = []
            if translated_chunk:
                translated.append(translated_chunk)
                original.append(original_chunk)
        for i in range(len(translated)):
            if "in" in translated[i]:
                translated[i] = self._clear_future_words(translated[i])
            translated[i] = self._join_chunk(
                list(filter(bool, translated[i])), settings=settings
            )
            original[i] = self._join_chunk(
                list(filter(bool, original[i])), settings=settings
            )
        return translated, original
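
    # The return value is a pair of parallel lists: translated chunks and the
    # original substrings they were lifted from. A hedged sketch (the exact
    # chunking depends on the locale dictionary):
    #
    #   >>> locale.translate_search("Le 13 janvier 2018, il pleuvait", settings=settings)
    #   (['13 january 2018'], ['13 janvier 2018'])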

    def _get_abbreviations(self, settings):
        dictionary = self._get_dictionary(settings=settings)
        abbreviations = []
        if self._abbreviations is None:
            for item in dictionary:
                if item.endswith(".") and len(item) > 1:
                    abbreviations.append(item)
            self._abbreviations = abbreviations
        return self._abbreviations

    def _sentence_split(self, string, settings):
        abbreviations = self._get_abbreviations(settings=settings)
        digit_abbreviations = ["[0-9]"]  # numeric date with full stop
        abbreviation_string = ""

        for abbreviation in abbreviations:
            # Negative lookbehind so a known abbreviation does not end a sentence.
            abbreviation_string += "(?<! " + abbreviation[:-1] + ")"
        if self.shortname in ["fi", "cs", "hu", "de", "da"]:
            for digit_abbreviation in digit_abbreviations:
                # Negative lookbehind for digits (numeric dates with full stops).
                abbreviation_string += "(?<!" + digit_abbreviation + ")"

        splitters_dict = {
            1: r"[\.!?;…\r\n]+(?:\s|$)*",  # most European, Tagalog, Hebrew,
            # Georgian, Indonesian, Vietnamese
            2: r"[\.!?;…\r\n]+(\s*[¡¿]*|$)|[¡¿]+",  # Spanish
            3: r"[|!?;\r\n]+(?:\s|$)+",  # Hindi and Bangla
            4: r"[。…‥\.!??!;\r\n]+(?:\s|$)+",  # Japanese and Chinese
            5: r"[\r\n]+",  # Thai
            6: r"[\r\n؟!\.…]+(?:\s|$)+",  # Arabic and Farsi
        }
        if "sentence_splitter_group" not in self.info:
            split_reg = abbreviation_string + splitters_dict[1]
        else:
            split_reg = (
                abbreviation_string
                + splitters_dict[self.info["sentence_splitter_group"]]
            )
        sentences = re.split(split_reg, string)

        sentences = filter(None, sentences)
        return sentences
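
    # Splitting sketch for the default group (1), assuming "jan." is a known
    # abbreviation in the dictionary: the negative lookbehind built above
    # keeps the full stop in "jan." from ending a sentence, so
    # "Born 5 jan. 2001. Died later." yields ["Born 5 jan. 2001", "Died later"]
    # (the splitter consumes the terminal punctuation).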

    def _simplify_split_align(self, original, settings):
        # TODO: Switch to new split method.
        original_tokens = self._word_split(original, settings=settings)
        simplified_tokens = self._word_split(
            self._simplify(normalize_unicode(original), settings=settings),
            settings=settings,
        )
        if len(original_tokens) == len(simplified_tokens):
            return original_tokens, simplified_tokens

        elif len(original_tokens) < len(simplified_tokens):
            add_empty = False
            for i, token in enumerate(simplified_tokens):
                if i < len(original_tokens):
                    if token == normalize_unicode(original_tokens[i].lower()):
                        add_empty = False
                    else:
                        if not add_empty:
                            add_empty = True
                            continue
                        else:
                            original_tokens.insert(i, "")
                else:
                    original_tokens.insert(i, "")
        else:
            add_empty = False
            for i, token in enumerate(original_tokens):
                if i < len(simplified_tokens):
                    if normalize_unicode(token.lower()) == simplified_tokens[i]:
                        add_empty = False
                    else:
                        if not add_empty:
                            add_empty = True
                            continue
                        else:
                            simplified_tokens.insert(i, "")
                else:
                    simplified_tokens.insert(i, "")

        while len(original_tokens) != len(simplified_tokens):
            if len(original_tokens) > len(simplified_tokens):
                original_tokens.remove("")
            else:
                simplified_tokens.remove("")
        return original_tokens, simplified_tokens

    def _get_split_dictionary(self, settings):
        if self._split_dictionary is None:
            settings.NORMALIZE = True
            dictionary = self._get_dictionary(settings=settings)
            self._split_dictionary = self._split_dict(dictionary)
        return self._split_dictionary

    def _split_dict(self, dictionary):
        newdict = {}
        for item in dictionary:
            if " " in item:
                items = item.split()
                for i in items:
                    newdict[i] = dictionary[item]
            else:
                newdict[item] = dictionary[item]
        return newdict
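
    # Sketch of _split_dict with a hypothetical entry: a multi-word key is
    # exploded so that each word maps to the same translation.
    #
    #   >>> locale._split_dict({"de la tarde": "pm", "enero": "january"})
    #   {'de': 'pm', 'la': 'pm', 'tarde': 'pm', 'enero': 'january'}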

    def _word_split(self, string, settings):
        if "no_word_spacing" in self.info:
            return self._split(string, keep_formatting=True, settings=settings)
        else:
            return string.split()

    def _split(self, date_string, keep_formatting, settings=None):
        tokens = [date_string]
        tokens = list(self._split_tokens_with_regex(tokens, r"(\d+)"))
        tokens = list(
            self._split_tokens_by_known_words(
                tokens, keep_formatting, settings=settings
            )
        )
        return tokens

    def _split_tokens_with_regex(self, tokens, regex):
        tokens = tokens[:]
        for i, token in enumerate(tokens):
            tokens[i] = re.split(regex, token)
        return filter(bool, chain.from_iterable(tokens))

    def _split_tokens_by_known_words(self, tokens, keep_formatting, settings=None):
        dictionary = self._get_dictionary(settings)
        for i, token in enumerate(tokens):
            tokens[i] = dictionary.split(token, keep_formatting)
        return list(chain.from_iterable(tokens))

    def _join_chunk(self, chunk, settings):
        if "no_word_spacing" in self.info:
            return self._join(chunk, separator="", settings=settings)
        else:
            return re.sub(r"\s{2,}", " ", " ".join(chunk))

    def _token_with_digits_is_ok(self, token):
        if "no_word_spacing" in self.info:
            return re.search(r"[\d\.:\-/]+", token) is not None
        else:
            return re.search(r"\d+", token) is not None

    def _simplify(self, date_string, settings=None):
        date_string = date_string.lower()
        simplifications = self._get_simplifications(settings=settings)
        for simplification in simplifications:
            pattern, replacement = list(simplification.items())[0]
            date_string = pattern.sub(replacement, date_string).lower()
        return date_string
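
    # Simplification sketch with a hypothetical rule: if the compiled rules
    # contain the equivalent of {"mediodía": "12:00"}, then
    # _simplify("Al Mediodía") lower-cases the input and rewrites it to
    # "al 12:00" before dictionary translation runs.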

    def _get_simplifications(self, settings=None):
        no_word_spacing = eval(self.info.get("no_word_spacing", "False"))
        if settings.NORMALIZE:
            if self._normalized_simplifications is None:
                self._normalized_simplifications = []
                simplifications = self._generate_simplifications(normalize=True)
                for simplification in simplifications:
                    pattern, replacement = list(simplification.items())[0]
                    if not no_word_spacing:
                        pattern = r"(?<=\A|\W|_)%s(?=\Z|\W|_)" % pattern
                    pattern = re.compile(pattern, flags=re.I | re.U)
                    self._normalized_simplifications.append({pattern: replacement})
            return self._normalized_simplifications

        else:
            if self._simplifications is None:
                self._simplifications = []
                simplifications = self._generate_simplifications(normalize=False)
                for simplification in simplifications:
                    pattern, replacement = list(simplification.items())[0]
                    if not no_word_spacing:
                        pattern = r"(?<=\A|\W|_)%s(?=\Z|\W|_)" % pattern
                    pattern = re.compile(pattern, flags=re.I | re.U)
                    self._simplifications.append({pattern: replacement})
            return self._simplifications

    def _generate_simplifications(self, normalize=False):
        simplifications = []
        for simplification in self.info.get("simplifications", []):
            c_simplification = {}
            key, value = list(simplification.items())[0]
            if normalize:
                key = normalize_unicode(key)

            if isinstance(value, int):
                c_simplification[key] = str(value)
            else:
                c_simplification[key] = normalize_unicode(value) if normalize else value

            simplifications.append(c_simplification)
        return simplifications

    def _clear_future_words(self, words):
        freshness_words = {"day", "week", "month", "year", "hour", "minute", "second"}
        if set(words).isdisjoint(freshness_words):
            words.remove("in")
        return words
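
    # "in" is only meaningful for freshness dates ("in 2 days"); otherwise it
    # is noise ("in January"). A small sketch:
    #
    #   >>> locale._clear_future_words(["in", "2", "day"])
    #   ['in', '2', 'day']
    #   >>> locale._clear_future_words(["in", "january"])
    #   ['january']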

    def _join(self, tokens, separator=" ", settings=None):
        if not tokens:
            return ""

        capturing_splitters = self._get_splitters(settings)["capturing"]
        joined = tokens[0]
        for i in range(1, len(tokens)):
            left, right = tokens[i - 1], tokens[i]
            if left not in capturing_splitters and right not in capturing_splitters:
                joined += separator
            joined += right

        return joined
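
    # Join sketch: no separator is inserted next to a capturing splitter.
    # Assuming ":" is among the locale's capturing splitters,
    # ["10", ":", "30"] joins to "10:30", while ["10", "30"] joins to "10 30".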

    def _get_dictionary(self, settings=None):
        if not settings.NORMALIZE:
            if self._dictionary is None:
                self._generate_dictionary()
            self._dictionary._settings = settings
            return self._dictionary
        else:
            if self._normalized_dictionary is None:
                self._generate_normalized_dictionary()
            self._normalized_dictionary._settings = settings
            return self._normalized_dictionary

    def _get_wordchars(self, settings=None):
        if self._wordchars is None:
            self._set_wordchars(settings)
        return self._wordchars

    def _get_splitters(self, settings=None):
        if self._splitters is None:
            self._set_splitters(settings)
        return self._splitters

    def _set_splitters(self, settings=None):
        splitters = {
            # Splitters that split the string only when not surrounded by letters on both sides:
            "wordchars": set(),
            # Splitters that are not filtered out from tokens after splitting:
            "capturing": set(),
        }
        splitters["capturing"] |= set(ALWAYS_KEEP_TOKENS)

        wordchars = self._get_wordchars(settings)
        skip = set(self.info.get("skip", [])) | splitters["capturing"]
        for token in skip:
            if not re.match(r"^\W+$", token, re.UNICODE):
                continue
            if token in wordchars:
                splitters["wordchars"].add(token)

        self._splitters = splitters

    def _set_wordchars(self, settings=None):
        wordchars = set()
        for word in self._get_dictionary(settings):
            if re.match(r"^[\W\d_]+$", word, re.UNICODE):
                continue
            for char in word:
                wordchars.add(char.lower())

        self._wordchars = wordchars - {" "} | set("0123456789")

    def get_wordchars_for_detection(self, settings):
        if self._wordchars_for_detection is None:
            wordchars = set()
            for word in self._get_dictionary(settings):
                if re.match(r"^[\W\d_]+$", word, re.UNICODE):
                    continue
                for char in word:
                    wordchars.add(char.lower())
            # Remove characters that are not distinctive for language detection.
            self._wordchars_for_detection = wordchars - set("0123456789:()'qamp ")
        return self._wordchars_for_detection

    def _generate_dictionary(self, settings=None):
        self._dictionary = Dictionary(self.info, settings=settings)

    def _generate_normalized_dictionary(self, settings=None):
        self._normalized_dictionary = NormalizedDictionary(self.info, settings=settings)

    def to_parserinfo(self, base_cls=parser.parserinfo):
        attributes = {
            "JUMP": self.info.get("skip", []),
            "PERTAIN": self.info.get("pertain", []),
            "WEEKDAYS": [
                self.info["monday"],
                self.info["tuesday"],
                self.info["wednesday"],
                self.info["thursday"],
                self.info["friday"],
                self.info["saturday"],
                self.info["sunday"],
            ],
            "MONTHS": [
                self.info["january"],
                self.info["february"],
                self.info["march"],
                self.info["april"],
                self.info["may"],
                self.info["june"],
                self.info["july"],
                self.info["august"],
                self.info["september"],
                self.info["october"],
                self.info["november"],
                self.info["december"],
            ],
            "HMS": [self.info["hour"], self.info["minute"], self.info["second"]],
        }
        name = "{language}ParserInfo".format(language=self.info["name"])
        # type() takes no keyword arguments, and bases must be a tuple.
        return type(name, (base_cls,), attributes)
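
    # Usage sketch (hedged): the returned value is a parserinfo *class*, so
    # instantiate it before handing it to dateutil.
    #
    #   >>> FrenchParserInfo = locale.to_parserinfo()
    #   >>> parser.parse("13 janvier 2018", parserinfo=FrenchParserInfo())
    #   datetime.datetime(2018, 1, 13, 0, 0)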