Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/dateparser/languages/dictionary.py: 99%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

155 statements  

1from itertools import chain, zip_longest 

2from operator import methodcaller 

3 

4import regex as re 

5 

6from dateparser.utils import normalize_unicode 

7 

# Separator tokens that are always kept when splitting a date string.
PARSER_HARDCODED_TOKENS = [":", ".", " ", "-", "/"]

# Tokens stored in the dictionary under their lowercase form and mapped back
# to the spelling given here.
PARSER_KNOWN_TOKENS = ["am", "pm", "UTC", "GMT", "Z"]

# "+" plus every hardcoded separator; a fresh list, not an alias.
ALWAYS_KEEP_TOKENS = ["+", *PARSER_HARDCODED_TOKENS]

# Locale-info keys whose translations are loaded into a Dictionary.
KNOWN_WORD_TOKENS = [
    # weekdays
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
    # months
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
    # time units
    "decade", "year", "month", "week", "day", "hour", "minute", "second",
    # relative markers and meridiem
    "ago", "in", "am", "pm",
]

# Matches a single opening or closing parenthesis.
PARENTHESES_PATTERN = re.compile(r"[\(\)]")

# Captures runs of digits so that ``split`` keeps them as separate tokens.
NUMERAL_PATTERN = re.compile(r"(\d+)")

# Matches any token containing at least one word character other than "_".
KEEP_TOKEN_PATTERN = re.compile(r"^.*[^\W_].*$", flags=re.UNICODE)

48 

49 

class UnknownTokenError(Exception):
    """Signal an unknown token.

    Nothing in this module raises it; presumably it is raised by callers.
    """

52 

53 

class Dictionary:
    """
    Class that modifies and stores translations and handles splitting of date string.

    :param locale_info:
        Locale info (translation data) of the locale.
    :type locale_info: dict

    :param settings:
        Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
        Although the default is ``None``, most methods read attributes of this
        object (``SKIP_TOKENS``, ``registry_key``, ``CACHE_SIZE_LIMIT``).
    :type settings: dict

    :return: a Dictionary instance.
    """

    # Class-level caches shared by every instance. Each one is laid out as
    # {settings.registry_key: {locale_info["name"]: cached value}}.
    _split_regex_cache = {}
    _sorted_words_cache = {}
    _split_relative_regex_cache = {}
    _sorted_relative_strings_cache = {}
    _match_relative_regex_cache = {}

    def __init__(self, locale_info, settings=None):
        lower = methodcaller("lower")
        dictionary = {}
        self._settings = settings
        self.info = locale_info

        # Words to skip / pertain map to None.
        for section in ("skip", "pertain"):
            if section in locale_info:
                dictionary.update(dict.fromkeys(map(lower, locale_info[section])))

        # Every translation of a known word maps back to that word.
        for word in KNOWN_WORD_TOKENS:
            if word in locale_info:
                dictionary.update(dict.fromkeys(map(lower, locale_info[word]), word))

        # Separator tokens map to themselves; parser tokens are keyed by their
        # lowercase form and map to their canonical spelling.
        dictionary.update(zip(ALWAYS_KEEP_TOKENS, ALWAYS_KEEP_TOKENS))
        dictionary.update(zip(map(lower, PARSER_KNOWN_TOKENS), PARSER_KNOWN_TOKENS))

        # Each relative-type translation maps to its relative-type key.
        for key, value in locale_info.get("relative-type", {}).items():
            dictionary.update(dict.fromkeys(map(lower, value), key))

        self._dictionary = dictionary

        no_word_spacing = locale_info.get("no_word_spacing", "False")
        # NOTE(review): eval() executes whatever string the locale data holds.
        # This is only acceptable while locale data is trusted; confirm before
        # feeding externally supplied locale info into this class.
        self._no_word_spacing = bool(eval(no_word_spacing))

        relative_type_regex = locale_info.get("relative-type-regex", {})
        self._relative_strings = list(chain.from_iterable(relative_type_regex.values()))

    def __contains__(self, key):
        """Return True if *key* is a skip token from settings or a dictionary key."""
        return key in self._settings.SKIP_TOKENS or key in self._dictionary

    def __getitem__(self, key):
        """Return the translation of *key*; skip tokens from settings give None.

        :raises KeyError: if *key* is neither a skip token nor a dictionary key.
        """
        if key in self._settings.SKIP_TOKENS:
            return None
        return self._dictionary[key]

    def __iter__(self):
        """Iterate over the skip tokens from settings, then the dictionary keys."""
        return chain(self._settings.SKIP_TOKENS, iter(self._dictionary))

    def are_tokens_valid(self, tokens):
        """
        Check if tokens are valid tokens for the locale.

        :param tokens:
            a list of string tokens.
        :type tokens: list

        :return: True if tokens are valid, False otherwise.
        """
        # A list made up solely of separator tokens (or an empty list) is
        # never considered valid.
        if not set(tokens) - set(ALWAYS_KEEP_TOKENS):
            return False

        match_relative_regex = self._get_match_relative_regex_cache()
        return all(
            token.isdigit() or match_relative_regex.match(token) or token in self
            for token in tokens
        )

    def split(self, string, keep_formatting=False):
        """
        Split the date string using translations in locale info.

        :param string:
            Date string to be split.
        :type string:
            str

        :param keep_formatting:
            If True, retain formatting of the date string.
        :type keep_formatting: bool

        :return: A list of string tokens formed after splitting the date string.
            A falsy *string* is returned unchanged.
        """
        if not string:
            return string

        split_relative_regex = self._get_split_relative_regex_cache()
        match_relative_regex = self._get_match_relative_regex_cache()

        tokens = []
        for chunk in split_relative_regex.split(string):
            if match_relative_regex.match(chunk):
                # A whole relative expression is kept as one token.
                tokens.append(chunk)
            else:
                tokens.extend(self._split_by_known_words(chunk, keep_formatting))

        # Drop empty tokens.
        return [token for token in tokens if token]

    def _add_to_cache(self, value, cache):
        """Store *value* in *cache* under this instance's registry key and locale name.

        When ``CACHE_SIZE_LIMIT`` is truthy and *cache* holds more registry
        keys than the limit, the first-inserted registry key is evicted.
        """
        cache.setdefault(self._settings.registry_key, {})[self.info["name"]] = value
        limit = self._settings.CACHE_SIZE_LIMIT
        if limit and len(cache) > limit:
            cache.pop(next(iter(cache)))

    def _fetch_cached(self, cache, build):
        """Return this instance's entry from *cache*, calling *build* first if absent.

        *build* is expected to populate *cache* (via ``_add_to_cache``).
        """
        registry_key = self._settings.registry_key
        if self.info["name"] not in cache.get(registry_key, {}):
            build()
        return cache[registry_key][self.info["name"]]

    def _split_by_known_words(self, string: str, keep_formatting: bool):
        """Split *string* around known words, returning the list of kept tokens."""
        regex = self._get_split_regex_cache()
        splitted = []
        unknown = string

        while unknown:
            match = regex.match(string)

            if not match:
                # No known word left: split what remains by numerals and stop.
                curr_split = (
                    self._split_by_numerals(string, keep_formatting)
                    if self._should_capture(string, keep_formatting)
                    else []
                )
                unknown = ""
            else:
                unparsed, known, unknown = match.groups()
                curr_split = (
                    [known] if self._should_capture(known, keep_formatting) else []
                )
                if unparsed and self._should_capture(unparsed, keep_formatting):
                    curr_split = (
                        self._split_by_numerals(unparsed, keep_formatting) + curr_split
                    )
                if unknown:
                    # Continue with the remainder; if no progress was made,
                    # feed an empty string so the next pass terminates.
                    string = unknown if string != unknown else ""

            splitted.extend(curr_split)
        return splitted

    def _split_by_numerals(self, string, keep_formatting):
        """Split *string* on digit runs, keeping only tokens worth capturing."""
        return [
            token
            for token in NUMERAL_PATTERN.split(string)
            if self._should_capture(token, keep_formatting)
        ]

    def _should_capture(self, token, keep_formatting):
        """Return a truthy value if *token* should be kept in the split output."""
        return (
            keep_formatting
            or token in ALWAYS_KEEP_TOKENS
            or KEEP_TOKEN_PATTERN.match(token)
        )

    def _get_sorted_words_from_cache(self):
        """Return all words of this dictionary, longest first (cached)."""

        def build():
            self._add_to_cache(
                cache=self._sorted_words_cache,
                value=sorted(self, key=len, reverse=True),
            )

        return self._fetch_cached(self._sorted_words_cache, build)

    def _get_split_regex_cache(self):
        """Return the compiled known-word split regex (cached)."""
        return self._fetch_cached(self._split_regex_cache, self._construct_split_regex)

    def _construct_split_regex(self):
        """Compile the known-word split regex and store it in the cache."""
        known_words_group = "|".join(
            map(re.escape, self._get_sorted_words_from_cache())
        )
        if self._no_word_spacing:
            regex = r"^(.*?)({})(.*)$".format(known_words_group)
        else:
            # Known words must be delimited by string edges, non-word
            # characters, underscores or digits.
            regex = r"^(.*?(?:\A|\W|_|\d))({})((?:\Z|\W|_|\d).*)$".format(
                known_words_group
            )
        self._add_to_cache(
            cache=self._split_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )

    def _get_sorted_relative_strings_from_cache(self):
        """Return the relative strings with parentheses removed, longest first (cached)."""

        def build():
            self._add_to_cache(
                cache=self._sorted_relative_strings_cache,
                value=sorted(
                    (PARENTHESES_PATTERN.sub("", key) for key in self._relative_strings),
                    key=len,
                    reverse=True,
                ),
            )

        return self._fetch_cached(self._sorted_relative_strings_cache, build)

    def _get_split_relative_regex_cache(self):
        """Return the compiled regex that splits on relative strings (cached)."""
        return self._fetch_cached(
            self._split_relative_regex_cache, self._construct_split_relative_regex
        )

    def _construct_split_relative_regex(self):
        """Compile the relative-string split regex and store it in the cache."""
        known_relative_strings_group = "|".join(
            self._get_sorted_relative_strings_from_cache()
        )
        if self._no_word_spacing:
            regex = "({})".format(known_relative_strings_group)
        else:
            regex = r"(?<=(?:\A|\W|_))({})(?=(?:\Z|\W|_))".format(
                known_relative_strings_group
            )
        self._add_to_cache(
            cache=self._split_relative_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )

    def _get_match_relative_regex_cache(self):
        """Return the compiled regex that matches a whole relative string (cached)."""
        return self._fetch_cached(
            self._match_relative_regex_cache, self._construct_match_relative_regex
        )

    def _construct_match_relative_regex(self):
        """Compile the whole-token relative-string regex and store it in the cache."""
        known_relative_strings_group = "|".join(
            self._get_sorted_relative_strings_from_cache()
        )
        regex = "^({})$".format(known_relative_strings_group)
        self._add_to_cache(
            cache=self._match_relative_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )

331 

332 

class NormalizedDictionary(Dictionary):
    """Dictionary whose keys and relative strings are run through ``normalize_unicode``."""

    def __init__(self, locale_info, settings=None):
        super().__init__(locale_info, settings)
        self._normalize()

    def _normalize(self):
        """Rebuild the dictionary with normalized keys and normalize the relative strings.

        A key whose normalized form differs from it and already exists as
        another key is a conflict; it only overrides that entry when the key
        is listed under "skip" or "pertain" in the locale info.
        """
        original = self._dictionary
        normalized_dict = {}
        clashes = []

        for word, translation in original.items():
            plain = normalize_unicode(word)
            if word == plain or plain not in original:
                normalized_dict[plain] = translation
            else:
                clashes.append(word)

        for word in clashes:
            plain = normalize_unicode(word)
            if word in (self.info.get("skip", []) + self.info.get("pertain", [])):
                normalized_dict[plain] = original[word]

        self._dictionary = normalized_dict
        self._relative_strings = [
            normalize_unicode(relative) for relative in self._relative_strings
        ]