from collections import OrderedDict
from itertools import chain

import regex as re
from dateutil import parser

from dateparser.timezone_parser import pop_tz_offset_from_string, word_is_tz
from dateparser.utils import combine_dicts, normalize_unicode

from .dictionary import ALWAYS_KEEP_TOKENS, Dictionary, NormalizedDictionary

NUMERAL_PATTERN = re.compile(r"(\d+)", re.U)


class Locale:
    """
    Class that deals with applicability and translation from a locale.

    :param shortname:
        A locale code, e.g. 'fr-PF', 'qu-EC', 'af-NA'.
    :type shortname: str

    :param language_info:
        Language info (translation data) of the language the locale belongs to.
    :type language_info: dict

    :return: A Locale instance
    """

    _dictionary = None
    _normalized_dictionary = None
    _simplifications = None
    _normalized_simplifications = None
    _splitters = None
    _wordchars = None
    _relative_translations = None
    _normalized_relative_translations = None
    _abbreviations = None
    _split_dictionary = None
    _wordchars_for_detection = None

    def __init__(self, shortname, language_info):
        self.shortname = shortname
        locale_specific_info = language_info.get("locale_specific", {}).get(
            shortname, {}
        )
        self.info = combine_dicts(language_info, locale_specific_info)
        self.info.pop("locale_specific", None)
    def is_applicable(self, date_string, strip_timezone=False, settings=None):
        """
        Check if the locale is applicable to translate the date string.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str

        :param strip_timezone:
            If True, the timezone is stripped from the date string before the check.
        :type strip_timezone: bool

        :return: True if the locale is applicable to the date string, False otherwise.
        """
        if strip_timezone:
            date_string, _ = pop_tz_offset_from_string(date_string, as_offset=False)

        date_string = self._translate_numerals(date_string)
        if settings.NORMALIZE:
            date_string = normalize_unicode(date_string)
        date_string = self._simplify(date_string, settings=settings)
        dictionary = self._get_dictionary(settings)
        date_tokens = dictionary.split(date_string)
        return dictionary.are_tokens_valid(date_tokens)
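
    # Illustrative usage (a sketch, not part of this module): Locale objects
    # are normally produced by dateparser's loader rather than built by hand,
    # and `settings` must expose a NORMALIZE attribute. The loader call below
    # is an assumption about the surrounding package, not a documented API.
    #
    #   >>> from dateparser.languages.loader import LocaleDataLoader
    #   >>> locale = list(LocaleDataLoader().get_locales(languages=["fr"]))[0]
    #   >>> locale.is_applicable("13 janvier 2018", settings=settings)
    #   True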

    def count_applicability(self, text, strip_timezone=False, settings=None):
        if strip_timezone:
            text, _ = pop_tz_offset_from_string(text, as_offset=False)

        text = self._simplify(text, settings=settings)
        sentences = self._sentence_split(text, settings=settings)
        tokens = []
        for sent in sentences:
            tokens.extend(self._split(sent, keep_formatting=False, settings=settings))
        return self._count_words_present_in_the_dictionary(tokens, settings)

    def _count_words_present_in_the_dictionary(self, words, settings=None):
        dictionary = self.clean_dictionary(
            self._get_split_dictionary(settings=settings)
        )
        dict_cnt = 0
        skip_cnt = 0
        for word in set(words):
            if word in dictionary:
                if dictionary[word]:
                    dict_cnt += 1
                else:
                    skip_cnt += 1
            elif word.isdigit():
                skip_cnt += 1
        return [dict_cnt, skip_cnt]

    @staticmethod
    def clean_dictionary(dictionary, threshold=2):
        del_keys = []
        for key in dictionary:
            if len(key) < threshold:
                del_keys.append(key)
        for del_key in del_keys:
            del dictionary[del_key]
        return dictionary
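
    # Sketch of clean_dictionary's effect (hypothetical mapping, not real
    # locale data): keys shorter than `threshold` characters are dropped in
    # place, filtering out one-letter noise before word counting.
    #
    #   >>> Locale.clean_dictionary({"j": "january", "on": "", "mardi": "tuesday"})
    #   {'on': '', 'mardi': 'tuesday'}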

    def translate(self, date_string, keep_formatting=False, settings=None):
        """
        Translate the date string to its English equivalent.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str

        :param keep_formatting:
            If True, retain formatting of the date string after translation.
        :type keep_formatting: bool

        :return: translated date string.
        """
        date_string = self._translate_numerals(date_string)
        if settings.NORMALIZE:
            date_string = normalize_unicode(date_string)
        date_string = self._simplify(date_string, settings=settings)
        dictionary = self._get_dictionary(settings)
        date_string_tokens = dictionary.split(date_string, keep_formatting)

        relative_translations = self._get_relative_translations(settings=settings)

        for i, word in enumerate(date_string_tokens):
            word = word.lower()
            for pattern, replacement in relative_translations.items():
                if pattern.match(word):
                    date_string_tokens[i] = pattern.sub(replacement, word)
                    break
            else:
                if word in dictionary:
                    fallback = word if keep_formatting and not word.isalpha() else ""
                    date_string_tokens[i] = dictionary[word] or fallback
        if "in" in date_string_tokens:
            date_string_tokens = self._clear_future_words(date_string_tokens)

        return self._join(
            list(filter(bool, date_string_tokens)),
            separator="" if keep_formatting else " ",
            settings=settings,
        )
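
    # Illustrative call (hedged; the exact output depends on the loaded
    # dictionary for the locale and on `settings`):
    #
    #   >>> locale.translate("13 janvier 2018", settings=settings)
    #   '13 january 2018'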

    def _translate_numerals(self, date_string):
        date_string_tokens = NUMERAL_PATTERN.split(date_string)
        for i, token in enumerate(date_string_tokens):
            if token.isdecimal():
                date_string_tokens[i] = str(int(token)).zfill(len(token))
        return "".join(date_string_tokens)

    def _get_relative_translations(self, settings=None):
        if settings.NORMALIZE:
            if self._normalized_relative_translations is None:
                self._normalized_relative_translations = (
                    self._generate_relative_translations(normalize=True)
                )
            return self._normalized_relative_translations
        else:
            if self._relative_translations is None:
                self._relative_translations = self._generate_relative_translations(
                    normalize=False
                )
            return self._relative_translations

    def _generate_relative_translations(self, normalize=False):
        relative_translations = self.info.get("relative-type-regex", {})
        relative_dictionary = OrderedDict()
        for key, value in relative_translations.items():
            if normalize:
                value = list(map(normalize_unicode, value))
            pattern = "|".join(sorted(value, key=len, reverse=True))
            pattern = pattern.replace(r"(\d+", r"(?P<n>\d+")
            pattern = re.compile(
                r"^(?:{})$".format(pattern), re.UNICODE | re.IGNORECASE
            )
            relative_dictionary[pattern] = key
        return relative_dictionary
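
    # Sketch with hypothetical locale data: an entry mapping the canonical
    # form "\1 day ago" to the source regexes ["hace (\\d+) días"] compiles
    # to the anchored pattern r"^(?:hace (?P<n>\d+) días)$"; translate() can
    # then rewrite "hace 3 días" into "3 day ago" via pattern.sub().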

    def translate_search(self, search_string, settings=None):
        dashes = ["-", "——", "—", "~"]
        word_joint_unsupported_languages = ["zh", "ja"]
        sentences = self._sentence_split(search_string, settings=settings)
        dictionary = self._get_dictionary(settings=settings)
        translated = []
        original = []
        for sentence in sentences:
            original_tokens, simplified_tokens = self._simplify_split_align(
                sentence, settings=settings
            )
            translated_chunk = []
            original_chunk = []
            last_token_index = len(simplified_tokens) - 1
            skip_next_token = False
            for i, word in enumerate(simplified_tokens):
                next_word = simplified_tokens[i + 1] if i < last_token_index else ""
                current_and_next_joined = self._join_chunk(
                    [word, next_word], settings=settings
                )
                if skip_next_token:
                    skip_next_token = False
                    continue

                if word == "" or word == " ":
                    translated_chunk.append(word)
                    original_chunk.append(original_tokens[i])
                elif (
                    current_and_next_joined in dictionary
                    and word not in dashes
                    and self.shortname not in word_joint_unsupported_languages
                ):
                    translated_chunk.append(dictionary[current_and_next_joined])
                    original_chunk.append(
                        self._join_chunk(
                            [original_tokens[i], original_tokens[i + 1]],
                            settings=settings,
                        )
                    )
                    skip_next_token = True
                elif word in dictionary and word not in dashes:
                    translated_chunk.append(dictionary[word])
                    original_chunk.append(original_tokens[i])
                elif word.strip("()\"'{}[],.،") in dictionary and word not in dashes:
                    stripped_word = word.strip("()\"'{}[],.،")
                    # Reattach the trailing punctuation that was stripped.
                    punct = word[len(stripped_word):]
                    if punct and dictionary[stripped_word]:
                        translated_chunk.append(dictionary[stripped_word] + punct)
                    else:
                        translated_chunk.append(dictionary[stripped_word])
                    original_chunk.append(original_tokens[i])
                elif self._token_with_digits_is_ok(word):
                    translated_chunk.append(word)
                    original_chunk.append(original_tokens[i])
                # Use the original token because word_is_tz is case sensitive.
                elif translated_chunk and word_is_tz(original_tokens[i]):
                    translated_chunk.append(word)
                    original_chunk.append(original_tokens[i])
                else:
                    if translated_chunk:
                        translated.append(translated_chunk)
                        translated_chunk = []
                        original.append(original_chunk)
                        original_chunk = []
            if translated_chunk:
                translated.append(translated_chunk)
                original.append(original_chunk)
        for i in range(len(translated)):
            if "in" in translated[i]:
                translated[i] = self._clear_future_words(translated[i])
            translated[i] = self._join_chunk(
                list(filter(bool, translated[i])), settings=settings
            )
            original[i] = self._join_chunk(
                list(filter(bool, original[i])), settings=settings
            )
        return translated, original
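
    # The return value is a pair of parallel lists: translated chunks and the
    # original substrings they were lifted from. A hedged sketch (the exact
    # chunking depends on the locale dictionary):
    #
    #   >>> locale.translate_search("Le 13 janvier 2018, il pleuvait", settings=settings)
    #   (['13 january 2018'], ['13 janvier 2018'])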

    def _get_abbreviations(self, settings):
        dictionary = self._get_dictionary(settings=settings)
        abbreviations = []
        if self._abbreviations is None:
            for item in dictionary:
                if item.endswith(".") and len(item) > 1:
                    abbreviations.append(item)
            self._abbreviations = abbreviations
        return self._abbreviations

    def _sentence_split(self, string, settings):
        abbreviations = self._get_abbreviations(settings=settings)
        digit_abbreviations = ["[0-9]"]  # numeric date with full stop
        abbreviation_string = ""

        for abbreviation in abbreviations:
            # Negative lookbehind so a known abbreviation does not end a sentence.
            abbreviation_string += "(?<! " + abbreviation[:-1] + ")"
        if self.shortname in ["fi", "cs", "hu", "de", "da"]:
            for digit_abbreviation in digit_abbreviations:
                # Negative lookbehind for digits (numeric dates with full stops).
                abbreviation_string += "(?<!" + digit_abbreviation + ")"

        splitters_dict = {
            1: r"[\.!?;…\r\n]+(?:\s|$)*",  # most European, Tagalog, Hebrew,
            # Georgian, Indonesian, Vietnamese
            2: r"[\.!?;…\r\n]+(\s*[¡¿]*|$)|[¡¿]+",  # Spanish
            3: r"[|!?;\r\n]+(?:\s|$)+",  # Hindi and Bangla
            4: r"[。…‥\.!??!;\r\n]+(?:\s|$)+",  # Japanese and Chinese
            5: r"[\r\n]+",  # Thai
            6: r"[\r\n؟!\.…]+(?:\s|$)+",  # Arabic and Farsi
        }
        if "sentence_splitter_group" not in self.info:
            split_reg = abbreviation_string + splitters_dict[1]
        else:
            split_reg = (
                abbreviation_string
                + splitters_dict[self.info["sentence_splitter_group"]]
            )
        sentences = re.split(split_reg, string)

        sentences = filter(None, sentences)
        return sentences
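
    # Splitting sketch for the default group (1), assuming "jan." is a known
    # abbreviation in the dictionary: the negative lookbehind built above
    # keeps the full stop in "jan." from ending a sentence, so
    # "Born 5 jan. 2001. Died later." yields ["Born 5 jan. 2001", "Died later"]
    # (the splitter consumes the terminal punctuation).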

    def _simplify_split_align(self, original, settings):
        # TODO: Switch to new split method.
        original_tokens = self._word_split(original, settings=settings)
        simplified_tokens = self._word_split(
            self._simplify(normalize_unicode(original), settings=settings),
            settings=settings,
        )
        if len(original_tokens) == len(simplified_tokens):
            return original_tokens, simplified_tokens

        elif len(original_tokens) < len(simplified_tokens):
            add_empty = False
            for i, token in enumerate(simplified_tokens):
                if i < len(original_tokens):
                    if token == normalize_unicode(original_tokens[i].lower()):
                        add_empty = False
                    else:
                        if not add_empty:
                            add_empty = True
                            continue
                        else:
                            original_tokens.insert(i, "")
                else:
                    original_tokens.insert(i, "")
        else:
            add_empty = False
            for i, token in enumerate(original_tokens):
                if i < len(simplified_tokens):
                    if normalize_unicode(token.lower()) == simplified_tokens[i]:
                        add_empty = False
                    else:
                        if not add_empty:
                            add_empty = True
                            continue
                        else:
                            simplified_tokens.insert(i, "")
                else:
                    simplified_tokens.insert(i, "")

        while len(original_tokens) != len(simplified_tokens):
            if len(original_tokens) > len(simplified_tokens):
                original_tokens.remove("")
            else:
                simplified_tokens.remove("")
        return original_tokens, simplified_tokens

    def _get_split_dictionary(self, settings):
        if self._split_dictionary is None:
            settings.NORMALIZE = True
            dictionary = self._get_dictionary(settings=settings)
            self._split_dictionary = self._split_dict(dictionary)
        return self._split_dictionary

    def _split_dict(self, dictionary):
        newdict = {}
        for item in dictionary:
            if " " in item:
                items = item.split()
                for i in items:
                    newdict[i] = dictionary[item]
            else:
                newdict[item] = dictionary[item]
        return newdict
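
    # Sketch of _split_dict with a hypothetical entry: a multi-word key is
    # exploded so that each word maps to the same translation.
    #
    #   >>> locale._split_dict({"de la tarde": "pm", "enero": "january"})
    #   {'de': 'pm', 'la': 'pm', 'tarde': 'pm', 'enero': 'january'}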

    def _word_split(self, string, settings):
        if "no_word_spacing" in self.info:
            return self._split(string, keep_formatting=True, settings=settings)
        else:
            return string.split()

    def _split(self, date_string, keep_formatting, settings=None):
        tokens = [date_string]
        tokens = list(self._split_tokens_with_regex(tokens, r"(\d+)"))
        tokens = list(
            self._split_tokens_by_known_words(
                tokens, keep_formatting, settings=settings
            )
        )
        return tokens

    def _split_tokens_with_regex(self, tokens, regex):
        tokens = tokens[:]
        for i, token in enumerate(tokens):
            tokens[i] = re.split(regex, token)
        return filter(bool, chain.from_iterable(tokens))

    def _split_tokens_by_known_words(self, tokens, keep_formatting, settings=None):
        dictionary = self._get_dictionary(settings)
        for i, token in enumerate(tokens):
            tokens[i] = dictionary.split(token, keep_formatting)
        return list(chain.from_iterable(tokens))

    def _join_chunk(self, chunk, settings):
        if "no_word_spacing" in self.info:
            return self._join(chunk, separator="", settings=settings)
        else:
            return re.sub(r"\s{2,}", " ", " ".join(chunk))

    def _token_with_digits_is_ok(self, token):
        if "no_word_spacing" in self.info:
            return re.search(r"[\d\.:\-/]+", token) is not None
        else:
            return re.search(r"\d+", token) is not None

    def _simplify(self, date_string, settings=None):
        date_string = date_string.lower()
        simplifications = self._get_simplifications(settings=settings)
        for simplification in simplifications:
            pattern, replacement = list(simplification.items())[0]
            date_string = pattern.sub(replacement, date_string).lower()
        return date_string
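
    # Simplification sketch with a hypothetical rule: if the compiled rules
    # contain the equivalent of {"mediodía": "12:00"}, then
    # _simplify("Al Mediodía") lower-cases the input and rewrites it to
    # "al 12:00" before dictionary translation runs.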

    def _get_simplifications(self, settings=None):
        no_word_spacing = eval(self.info.get("no_word_spacing", "False"))
        if settings.NORMALIZE:
            if self._normalized_simplifications is None:
                self._normalized_simplifications = []
                simplifications = self._generate_simplifications(normalize=True)
                for simplification in simplifications:
                    pattern, replacement = list(simplification.items())[0]
                    if not no_word_spacing:
                        pattern = r"(?<=\A|\W|_)%s(?=\Z|\W|_)" % pattern
                    pattern = re.compile(pattern, flags=re.I | re.U)
                    self._normalized_simplifications.append({pattern: replacement})
            return self._normalized_simplifications

        else:
            if self._simplifications is None:
                self._simplifications = []
                simplifications = self._generate_simplifications(normalize=False)
                for simplification in simplifications:
                    pattern, replacement = list(simplification.items())[0]
                    if not no_word_spacing:
                        pattern = r"(?<=\A|\W|_)%s(?=\Z|\W|_)" % pattern
                    pattern = re.compile(pattern, flags=re.I | re.U)
                    self._simplifications.append({pattern: replacement})
            return self._simplifications

    def _generate_simplifications(self, normalize=False):
        simplifications = []
        for simplification in self.info.get("simplifications", []):
            c_simplification = {}
            key, value = list(simplification.items())[0]
            if normalize:
                key = normalize_unicode(key)

            if isinstance(value, int):
                c_simplification[key] = str(value)
            else:
                c_simplification[key] = normalize_unicode(value) if normalize else value

            simplifications.append(c_simplification)
        return simplifications

    def _clear_future_words(self, words):
        freshness_words = {"day", "week", "month", "year", "hour", "minute", "second"}
        if set(words).isdisjoint(freshness_words):
            words.remove("in")
        return words
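
    # "in" is only meaningful for freshness dates ("in 2 days"); otherwise it
    # is noise ("in January"). A small sketch:
    #
    #   >>> locale._clear_future_words(["in", "2", "day"])
    #   ['in', '2', 'day']
    #   >>> locale._clear_future_words(["in", "january"])
    #   ['january']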

    def _join(self, tokens, separator=" ", settings=None):
        if not tokens:
            return ""

        capturing_splitters = self._get_splitters(settings)["capturing"]
        joined = tokens[0]
        for i in range(1, len(tokens)):
            left, right = tokens[i - 1], tokens[i]
            if left not in capturing_splitters and right not in capturing_splitters:
                joined += separator
            joined += right

        return joined
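
    # Join sketch: no separator is inserted next to a capturing splitter.
    # Assuming ":" is among the locale's capturing splitters,
    # ["10", ":", "30"] joins to "10:30", while ["10", "30"] joins to "10 30".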

    def _get_dictionary(self, settings=None):
        if not settings.NORMALIZE:
            if self._dictionary is None:
                self._generate_dictionary()
            self._dictionary._settings = settings
            return self._dictionary
        else:
            if self._normalized_dictionary is None:
                self._generate_normalized_dictionary()
            self._normalized_dictionary._settings = settings
            return self._normalized_dictionary

    def _get_wordchars(self, settings=None):
        if self._wordchars is None:
            self._set_wordchars(settings)
        return self._wordchars

    def _get_splitters(self, settings=None):
        if self._splitters is None:
            self._set_splitters(settings)
        return self._splitters

    def _set_splitters(self, settings=None):
        splitters = {
            # Splitters that split the string only when not surrounded by letters on both sides:
            "wordchars": set(),
            # Splitters that are not filtered out from tokens after splitting:
            "capturing": set(),
        }
        splitters["capturing"] |= set(ALWAYS_KEEP_TOKENS)

        wordchars = self._get_wordchars(settings)
        skip = set(self.info.get("skip", [])) | splitters["capturing"]
        for token in skip:
            if not re.match(r"^\W+$", token, re.UNICODE):
                continue
            if token in wordchars:
                splitters["wordchars"].add(token)

        self._splitters = splitters

    def _set_wordchars(self, settings=None):
        wordchars = set()
        for word in self._get_dictionary(settings):
            if re.match(r"^[\W\d_]+$", word, re.UNICODE):
                continue
            for char in word:
                wordchars.add(char.lower())

        self._wordchars = wordchars - {" "} | set("0123456789")

    def get_wordchars_for_detection(self, settings):
        if self._wordchars_for_detection is None:
            wordchars = set()
            for word in self._get_dictionary(settings):
                if re.match(r"^[\W\d_]+$", word, re.UNICODE):
                    continue
                for char in word:
                    wordchars.add(char.lower())
            # Remove characters that are not distinctive for language detection.
            self._wordchars_for_detection = wordchars - set("0123456789:()'qamp ")
        return self._wordchars_for_detection

    def _generate_dictionary(self, settings=None):
        self._dictionary = Dictionary(self.info, settings=settings)

    def _generate_normalized_dictionary(self, settings=None):
        self._normalized_dictionary = NormalizedDictionary(self.info, settings=settings)

    def to_parserinfo(self, base_cls=parser.parserinfo):
        attributes = {
            "JUMP": self.info.get("skip", []),
            "PERTAIN": self.info.get("pertain", []),
            "WEEKDAYS": [
                self.info["monday"],
                self.info["tuesday"],
                self.info["wednesday"],
                self.info["thursday"],
                self.info["friday"],
                self.info["saturday"],
                self.info["sunday"],
            ],
            "MONTHS": [
                self.info["january"],
                self.info["february"],
                self.info["march"],
                self.info["april"],
                self.info["may"],
                self.info["june"],
                self.info["july"],
                self.info["august"],
                self.info["september"],
                self.info["october"],
                self.info["november"],
                self.info["december"],
            ],
            "HMS": [self.info["hour"], self.info["minute"], self.info["second"]],
        }
        name = "{language}ParserInfo".format(language=self.info["name"])
        # type() takes no keyword arguments, and bases must be a tuple.
        return type(name, (base_cls,), attributes)
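
    # Usage sketch (hedged): the returned value is a parserinfo *class*, so
    # instantiate it before handing it to dateutil.
    #
    #   >>> FrenchParserInfo = locale.to_parserinfo()
    #   >>> parser.parse("13 janvier 2018", parserinfo=FrenchParserInfo())
    #   datetime.datetime(2018, 1, 13, 0, 0)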