1from itertools import chain, zip_longest
2from operator import methodcaller
3
4import regex as re
5
6from dateparser.utils import normalize_unicode
7
# Separator/punctuation tokens the parser always understands.
PARSER_HARDCODED_TOKENS = [":", ".", " ", "-", "/"]
# Tokens with a fixed, locale-independent meaning (meridiem and timezone markers).
PARSER_KNOWN_TOKENS = ["am", "pm", "UTC", "GMT", "Z"]
# Tokens kept when splitting even if they carry no word characters.
ALWAYS_KEEP_TOKENS = ["+"] + PARSER_HARDCODED_TOKENS
# Canonical English word tokens that locale data may provide translations for;
# translated words found in a date string are mapped back to these.
KNOWN_WORD_TOKENS = [
    "monday",
    "tuesday",
    "wednesday",
    "thursday",
    "friday",
    "saturday",
    "sunday",
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
    "decade",
    "year",
    "month",
    "week",
    "day",
    "hour",
    "minute",
    "second",
    "ago",
    "in",
    "am",
    "pm",
]

# Matches literal parentheses; used to strip them out of relative-date regex strings.
PARENTHESES_PATTERN = re.compile(r"[\(\)]")
# Captures runs of digits so numerals can be split into their own tokens.
NUMERAL_PATTERN = re.compile(r"(\d+)")
# Matches any token containing at least one word character other than underscore.
KEEP_TOKEN_PATTERN = re.compile(r"^.*[^\W_].*$", flags=re.U)
48
49
class UnknownTokenError(Exception):
    """Raised when a token in a date string cannot be resolved for the locale."""

    pass
52
53
class Dictionary:
    """
    Class that modifies and stores translations and handles splitting of date string.

    :param locale_info:
        Locale info (translation data) of the locale.
    :type locale_info: dict

    :param settings:
        Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
    :type settings: dict

    :return: a Dictionary instance.
    """

    # Class-level caches shared by all instances, keyed first by the settings
    # registry key and then by the locale name.
    _split_regex_cache = {}
    _sorted_words_cache = {}
    _split_relative_regex_cache = {}
    _sorted_relative_strings_cache = {}
    _match_relative_regex_cache = {}

    def __init__(self, locale_info, settings=None):
        dictionary = {}
        self._settings = settings
        self.info = locale_info

        # "skip" and "pertain" words translate to nothing (value None).
        if "skip" in locale_info:
            skip = map(methodcaller("lower"), locale_info["skip"])
            dictionary.update(zip_longest(skip, [], fillvalue=None))
        if "pertain" in locale_info:
            pertain = map(methodcaller("lower"), locale_info["pertain"])
            dictionary.update(zip_longest(pertain, [], fillvalue=None))
        # Map every locale translation of a known word back to its English form.
        for word in KNOWN_WORD_TOKENS:
            if word in locale_info:
                translations = map(methodcaller("lower"), locale_info[word])
                dictionary.update(zip_longest(translations, [], fillvalue=word))
        # Hardcoded separators and parser-known tokens map to themselves.
        dictionary.update(zip_longest(ALWAYS_KEEP_TOKENS, ALWAYS_KEEP_TOKENS))
        dictionary.update(
            zip_longest(
                map(methodcaller("lower"), PARSER_KNOWN_TOKENS), PARSER_KNOWN_TOKENS
            )
        )

        # Relative-date phrases map to their relative-type key (e.g. "in \1 day").
        relative_type = locale_info.get("relative-type", {})
        for key, value in relative_type.items():
            relative_translations = map(methodcaller("lower"), value)
            dictionary.update(zip_longest(relative_translations, [], fillvalue=key))

        self._dictionary = dictionary

        # Locale data stores this flag as the string "True"/"False". Compare
        # directly instead of eval()-ing text that comes from locale data files
        # (eval on external data is a code-execution hazard).
        no_word_spacing = locale_info.get("no_word_spacing", "False")
        self._no_word_spacing = no_word_spacing in (True, "True")

        relative_type_regex = locale_info.get("relative-type-regex", {})
        self._relative_strings = list(chain.from_iterable(relative_type_regex.values()))

    def __contains__(self, key):
        # Tokens configured to be skipped are treated as known.
        if key in self._settings.SKIP_TOKENS:
            return True
        return key in self._dictionary

    def __getitem__(self, key):
        # Skipped tokens translate to nothing, like "skip"/"pertain" words.
        if key in self._settings.SKIP_TOKENS:
            return None
        return self._dictionary[key]

    def __iter__(self):
        return chain(self._settings.SKIP_TOKENS, iter(self._dictionary))

    def are_tokens_valid(self, tokens):
        """
        Check if tokens are valid tokens for the locale.

        :param tokens:
            a list of string tokens.
        :type tokens: list

        :return: True if tokens are valid, False otherwise.
        """
        # A string made solely of separators carries no date information.
        has_only_keep_tokens = not set(tokens) - set(ALWAYS_KEEP_TOKENS)
        if has_only_keep_tokens:
            return False
        match_relative_regex = self._get_match_relative_regex_cache()
        # Every token must be a number, a relative-date phrase, or a known word.
        return all(
            token.isdigit() or match_relative_regex.match(token) or token in self
            for token in tokens
        )

    def split(self, string, keep_formatting=False):
        """
        Split the date string using translations in locale info.

        :param string:
            Date string to be split.
        :type string:
            str

        :param keep_formatting:
            If True, retain formatting of the date string.
        :type keep_formatting: bool

        :return: A list of string tokens formed after splitting the date string.
        """
        if not string:
            return string

        split_relative_regex = self._get_split_relative_regex_cache()
        match_relative_regex = self._get_match_relative_regex_cache()

        # First isolate whole relative-date phrases, then split the remaining
        # chunks word by word.
        tokens = split_relative_regex.split(string)

        for i, token in enumerate(tokens):
            if match_relative_regex.match(token):
                # Keep a matched relative phrase as a single token.
                tokens[i] = [token]
                continue
            tokens[i] = self._split_by_known_words(token, keep_formatting)

        return list(filter(bool, chain.from_iterable(tokens)))

    def _add_to_cache(self, value, cache):
        """Store *value* in *cache* under (registry_key, locale name), evicting
        the oldest registry-key entry if the configured cache size is exceeded."""
        cache.setdefault(self._settings.registry_key, {})[self.info["name"]] = value
        if (
            self._settings.CACHE_SIZE_LIMIT
            and len(cache) > self._settings.CACHE_SIZE_LIMIT
        ):
            # Dicts preserve insertion order, so the first key is the oldest.
            cache.pop(next(iter(cache)))

    def _split_by_known_words(self, string: str, keep_formatting: bool):
        """Repeatedly peel known words off *string*, splitting any unknown
        remainder by numerals. Returns the resulting list of tokens."""
        regex = self._get_split_regex_cache()
        splitted = []
        unknown = string

        while unknown:
            match = regex.match(string)

            if not match:
                # No known word left; keep the remainder (split by digits) if
                # it is worth capturing, then stop.
                curr_split = (
                    self._split_by_numerals(string, keep_formatting)
                    if self._should_capture(string, keep_formatting)
                    else []
                )
                unknown = ""
            else:
                # groups: text before the known word, the word, text after it.
                unparsed, known, unknown = match.groups()
                curr_split = (
                    [known] if self._should_capture(known, keep_formatting) else []
                )
                if unparsed and self._should_capture(unparsed, keep_formatting):
                    curr_split = (
                        self._split_by_numerals(unparsed, keep_formatting) + curr_split
                    )
                if unknown:
                    # Guard against no progress (string == unknown) to avoid
                    # looping forever on the same input.
                    string = unknown if string != unknown else ""

            splitted.extend(curr_split)
        return splitted

    def _split_by_numerals(self, string, keep_formatting):
        """Split *string* around runs of digits, keeping capture-worthy pieces."""
        return [
            token
            for token in NUMERAL_PATTERN.split(string)
            if self._should_capture(token, keep_formatting)
        ]

    def _should_capture(self, token, keep_formatting):
        """A token is kept when formatting is preserved, it is an always-keep
        token, or it contains at least one word character."""
        return (
            keep_formatting
            or token in ALWAYS_KEEP_TOKENS
            or KEEP_TOKEN_PATTERN.match(token)
        )

    def _get_sorted_words_from_cache(self):
        # Known words sorted longest-first so regex alternation prefers the
        # longest match.
        if (
            self._settings.registry_key not in self._sorted_words_cache
            or self.info["name"]
            not in self._sorted_words_cache[self._settings.registry_key]
        ):
            self._add_to_cache(
                cache=self._sorted_words_cache,
                value=sorted(self, key=len, reverse=True),
            )
        return self._sorted_words_cache[self._settings.registry_key][self.info["name"]]

    def _get_split_regex_cache(self):
        if (
            self._settings.registry_key not in self._split_regex_cache
            or self.info["name"]
            not in self._split_regex_cache[self._settings.registry_key]
        ):
            self._construct_split_regex()
        return self._split_regex_cache[self._settings.registry_key][self.info["name"]]

    def _construct_split_regex(self):
        # Words are literal text, so they are escaped before joining into an
        # alternation.
        known_words_group = "|".join(
            map(re.escape, self._get_sorted_words_from_cache())
        )
        if self._no_word_spacing:
            # Languages without word spacing: a known word may appear anywhere.
            regex = r"^(.*?)({})(.*)$".format(known_words_group)
        else:
            # Otherwise the known word must sit on a word boundary.
            regex = r"^(.*?(?:\A|\W|_|\d))({})((?:\Z|\W|_|\d).*)$".format(
                known_words_group
            )
        self._add_to_cache(
            cache=self._split_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )

    def _get_sorted_relative_strings_from_cache(self):
        # Relative strings, with parentheses stripped, sorted longest-first so
        # regex alternation prefers the longest match.
        if (
            self._settings.registry_key not in self._sorted_relative_strings_cache
            or self.info["name"]
            not in self._sorted_relative_strings_cache[self._settings.registry_key]
        ):
            self._add_to_cache(
                cache=self._sorted_relative_strings_cache,
                value=sorted(
                    [
                        PARENTHESES_PATTERN.sub("", key)
                        for key in self._relative_strings
                    ],
                    key=len,
                    reverse=True,
                ),
            )
        return self._sorted_relative_strings_cache[self._settings.registry_key][
            self.info["name"]
        ]

    def _get_split_relative_regex_cache(self):
        if (
            self._settings.registry_key not in self._split_relative_regex_cache
            or self.info["name"]
            not in self._split_relative_regex_cache[self._settings.registry_key]
        ):
            self._construct_split_relative_regex()
        return self._split_relative_regex_cache[self._settings.registry_key][
            self.info["name"]
        ]

    def _construct_split_relative_regex(self):
        # Relative strings are regex fragments themselves, so they are joined
        # without escaping.
        known_relative_strings_group = "|".join(
            self._get_sorted_relative_strings_from_cache()
        )
        if self._no_word_spacing:
            regex = "({})".format(known_relative_strings_group)
        else:
            regex = "(?<=(?:\\A|\\W|_))({})(?=(?:\\Z|\\W|_))".format(
                known_relative_strings_group
            )
        self._add_to_cache(
            cache=self._split_relative_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )

    def _get_match_relative_regex_cache(self):
        if (
            self._settings.registry_key not in self._match_relative_regex_cache
            or self.info["name"]
            not in self._match_relative_regex_cache[self._settings.registry_key]
        ):
            self._construct_match_relative_regex()
        return self._match_relative_regex_cache[self._settings.registry_key][
            self.info["name"]
        ]

    def _construct_match_relative_regex(self):
        # Full-token match: the whole token must be a known relative string.
        known_relative_strings_group = "|".join(
            self._get_sorted_relative_strings_from_cache()
        )
        regex = "^({})$".format(known_relative_strings_group)
        self._add_to_cache(
            cache=self._match_relative_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )
331
332
class NormalizedDictionary(Dictionary):
    """A :class:`Dictionary` whose keys and relative strings are unicode-normalized."""

    def __init__(self, locale_info, settings=None):
        super().__init__(locale_info, settings)
        self._normalize()

    def _normalize(self):
        # Build a new mapping keyed by normalized forms. When a normalized key
        # would collide with an existing raw key, defer it and only keep it if
        # it is a "skip"/"pertain" word.
        normalized_dict = {}
        deferred = []
        for raw_key, translation in self._dictionary.items():
            canonical = normalize_unicode(raw_key)
            if raw_key != canonical and canonical in self._dictionary:
                deferred.append(raw_key)
            else:
                normalized_dict[canonical] = translation
        skip_or_pertain = self.info.get("skip", []) + self.info.get("pertain", [])
        for raw_key in deferred:
            if raw_key in skip_or_pertain:
                normalized_dict[normalize_unicode(raw_key)] = self._dictionary[raw_key]
        self._dictionary = normalized_dict
        self._relative_strings = [
            normalize_unicode(item) for item in self._relative_strings
        ]