1from itertools import chain, zip_longest
2from operator import methodcaller
3
4import regex as re
5
6from dateparser.utils import normalize_unicode
7
# Separator/punctuation tokens the parser always understands.
PARSER_HARDCODED_TOKENS = [":", ".", " ", "-", "/"]
# Tokens with a fixed, locale-independent meaning (meridiem and timezone markers).
PARSER_KNOWN_TOKENS = ["am", "pm", "UTC", "GMT", "Z"]
# Tokens kept when splitting even if they carry no word characters.
ALWAYS_KEEP_TOKENS = ["+"] + PARSER_HARDCODED_TOKENS
# Canonical English word tokens that locale data may provide translations for;
# translated words found in a date string are mapped back to these.
KNOWN_WORD_TOKENS = [
    "monday",
    "tuesday",
    "wednesday",
    "thursday",
    "friday",
    "saturday",
    "sunday",
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
    "decade",
    "year",
    "month",
    "week",
    "day",
    "hour",
    "minute",
    "second",
    "ago",
    "in",
    "am",
    "pm",
]

# Matches literal parentheses; used to strip them out of relative-date regex strings.
PARENTHESES_PATTERN = re.compile(r"[\(\)]")
# Captures runs of digits so numerals can be split into their own tokens.
NUMERAL_PATTERN = re.compile(r"(\d+)")
# Matches any token containing at least one word character other than underscore.
KEEP_TOKEN_PATTERN = re.compile(r"^.*[^\W_].*$", flags=re.U)
48
49
class UnknownTokenError(Exception):
    """Raised when a token in a date string cannot be resolved for the locale."""

    pass
52
53
class Dictionary:
    """
    Class that modifies and stores translations and handles splitting of date string.

    :param locale_info:
        Locale info (translation data) of the locale.
    :type locale_info: dict

    :param settings:
        Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
    :type settings: dict

    :return: a Dictionary instance.
    """

    # Class-level caches shared by all instances, keyed first by the settings
    # registry key and then by the locale name.
    _split_regex_cache = {}
    _sorted_words_cache = {}
    _split_relative_regex_cache = {}
    _sorted_relative_strings_cache = {}
    _match_relative_regex_cache = {}

    def __init__(self, locale_info, settings=None):
        dictionary = {}
        self._settings = settings
        self.info = locale_info

        # "skip" and "pertain" words translate to nothing (value None).
        if "skip" in locale_info:
            skip = map(methodcaller("lower"), locale_info["skip"])
            dictionary.update(zip_longest(skip, [], fillvalue=None))
        if "pertain" in locale_info:
            pertain = map(methodcaller("lower"), locale_info["pertain"])
            dictionary.update(zip_longest(pertain, [], fillvalue=None))
        # Map every locale translation of a known word back to its English form.
        for word in KNOWN_WORD_TOKENS:
            if word in locale_info:
                translations = map(methodcaller("lower"), locale_info[word])
                dictionary.update(zip_longest(translations, [], fillvalue=word))
        # Hardcoded separators and parser-known tokens map to themselves.
        dictionary.update(zip_longest(ALWAYS_KEEP_TOKENS, ALWAYS_KEEP_TOKENS))
        dictionary.update(
            zip_longest(
                map(methodcaller("lower"), PARSER_KNOWN_TOKENS), PARSER_KNOWN_TOKENS
            )
        )

        # Relative-date phrases map to their relative-type key (e.g. "in \1 day").
        relative_type = locale_info.get("relative-type", {})
        for key, value in relative_type.items():
            relative_translations = map(methodcaller("lower"), value)
            dictionary.update(zip_longest(relative_translations, [], fillvalue=key))

        self._dictionary = dictionary

        # Locale data stores this flag as the string "True"/"False". Compare
        # directly instead of eval()-ing text that comes from locale data files
        # (eval on external data is a code-execution hazard).
        no_word_spacing = locale_info.get("no_word_spacing", "False")
        self._no_word_spacing = no_word_spacing in (True, "True")

        relative_type_regex = locale_info.get("relative-type-regex", {})
        self._relative_strings = list(chain.from_iterable(relative_type_regex.values()))

    def __contains__(self, key):
        # Tokens configured to be skipped are treated as known.
        if key in self._settings.SKIP_TOKENS:
            return True
        return key in self._dictionary

    def __getitem__(self, key):
        # Skipped tokens translate to nothing, like "skip"/"pertain" words.
        if key in self._settings.SKIP_TOKENS:
            return None
        return self._dictionary[key]

    def __iter__(self):
        return chain(self._settings.SKIP_TOKENS, iter(self._dictionary))

    def are_tokens_valid(self, tokens):
        """
        Check if tokens are valid tokens for the locale.

        :param tokens:
            a list of string tokens.
        :type tokens: list

        :return: True if tokens are valid, False otherwise.
        """
        # A string made solely of separators carries no date information.
        has_only_keep_tokens = not set(tokens) - set(ALWAYS_KEEP_TOKENS)
        if has_only_keep_tokens:
            return False
        match_relative_regex = self._get_match_relative_regex_cache()
        # Every token must be a number, a relative-date phrase, or a known word.
        return all(
            token.isdigit() or match_relative_regex.match(token) or token in self
            for token in tokens
        )

    def split(self, string, keep_formatting=False):
        """
        Split the date string using translations in locale info.

        :param string:
            Date string to be split.
        :type string:
            str

        :param keep_formatting:
            If True, retain formatting of the date string.
        :type keep_formatting: bool

        :return: A list of string tokens formed after splitting the date string.
        """
        if not string:
            return string

        split_relative_regex = self._get_split_relative_regex_cache()
        match_relative_regex = self._get_match_relative_regex_cache()

        # First isolate whole relative-date phrases, then split the remaining
        # chunks word by word.
        tokens = split_relative_regex.split(string)

        for i, token in enumerate(tokens):
            if match_relative_regex.match(token):
                # Keep a matched relative phrase as a single token.
                tokens[i] = [token]
                continue
            tokens[i] = self._split_by_known_words(token, keep_formatting)

        return list(filter(bool, chain.from_iterable(tokens)))

    def _add_to_cache(self, value, cache):
        """Store *value* in *cache* under (registry_key, locale name), evicting
        the oldest registry-key entry if the configured cache size is exceeded."""
        cache.setdefault(self._settings.registry_key, {})[self.info["name"]] = value
        if (
            self._settings.CACHE_SIZE_LIMIT
            and len(cache) > self._settings.CACHE_SIZE_LIMIT
        ):
            # Dicts preserve insertion order, so the first key is the oldest.
            cache.pop(next(iter(cache)))

    def _split_by_known_words(self, string: str, keep_formatting: bool):
        """Repeatedly peel known words off *string*, splitting any unknown
        remainder by numerals. Returns the resulting list of tokens."""
        regex = self._get_split_regex_cache()
        splitted = []
        unknown = string

        while unknown:
            match = regex.match(string)

            if not match:
                # No known word left; keep the remainder (split by digits) if
                # it is worth capturing, then stop.
                curr_split = (
                    self._split_by_numerals(string, keep_formatting)
                    if self._should_capture(string, keep_formatting)
                    else []
                )
                unknown = ""
            else:
                # groups: text before the known word, the word, text after it.
                unparsed, known, unknown = match.groups()
                curr_split = (
                    [known] if self._should_capture(known, keep_formatting) else []
                )
                if unparsed and self._should_capture(unparsed, keep_formatting):
                    curr_split = (
                        self._split_by_numerals(unparsed, keep_formatting) + curr_split
                    )
                if unknown:
                    # Guard against no progress (string == unknown) to avoid
                    # looping forever on the same input.
                    string = unknown if string != unknown else ""

            splitted.extend(curr_split)
        return splitted

    def _split_by_numerals(self, string, keep_formatting):
        """Split *string* around runs of digits, keeping capture-worthy pieces."""
        return [
            token
            for token in NUMERAL_PATTERN.split(string)
            if self._should_capture(token, keep_formatting)
        ]

    def _should_capture(self, token, keep_formatting):
        """A token is kept when formatting is preserved, it is an always-keep
        token, or it contains at least one word character."""
        return (
            keep_formatting
            or token in ALWAYS_KEEP_TOKENS
            or KEEP_TOKEN_PATTERN.match(token)
        )

    def _get_sorted_words_from_cache(self):
        # Known words sorted longest-first so regex alternation prefers the
        # longest match.
        if (
            self._settings.registry_key not in self._sorted_words_cache
            or self.info["name"]
            not in self._sorted_words_cache[self._settings.registry_key]
        ):
            self._add_to_cache(
                cache=self._sorted_words_cache,
                value=sorted(self, key=len, reverse=True),
            )
        return self._sorted_words_cache[self._settings.registry_key][self.info["name"]]

    def _get_split_regex_cache(self):
        if (
            self._settings.registry_key not in self._split_regex_cache
            or self.info["name"]
            not in self._split_regex_cache[self._settings.registry_key]
        ):
            self._construct_split_regex()
        return self._split_regex_cache[self._settings.registry_key][self.info["name"]]

    def _construct_split_regex(self):
        # Words are literal text, so they are escaped before joining into an
        # alternation.
        known_words_group = "|".join(
            map(re.escape, self._get_sorted_words_from_cache())
        )
        if self._no_word_spacing:
            # Languages without word spacing: a known word may appear anywhere.
            regex = r"^(.*?)({})(.*)$".format(known_words_group)
        else:
            # Otherwise the known word must sit on a word boundary.
            regex = r"^(.*?(?:\A|\W|_|\d))({})((?:\Z|\W|_|\d).*)$".format(
                known_words_group
            )
        self._add_to_cache(
            cache=self._split_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )

    def _get_sorted_relative_strings_from_cache(self):
        # Relative strings, with parentheses stripped, sorted longest-first so
        # regex alternation prefers the longest match.
        if (
            self._settings.registry_key not in self._sorted_relative_strings_cache
            or self.info["name"]
            not in self._sorted_relative_strings_cache[self._settings.registry_key]
        ):
            self._add_to_cache(
                cache=self._sorted_relative_strings_cache,
                value=sorted(
                    [
                        PARENTHESES_PATTERN.sub("", key)
                        for key in self._relative_strings
                    ],
                    key=len,
                    reverse=True,
                ),
            )
        return self._sorted_relative_strings_cache[self._settings.registry_key][
            self.info["name"]
        ]

    def _get_split_relative_regex_cache(self):
        if (
            self._settings.registry_key not in self._split_relative_regex_cache
            or self.info["name"]
            not in self._split_relative_regex_cache[self._settings.registry_key]
        ):
            self._construct_split_relative_regex()
        return self._split_relative_regex_cache[self._settings.registry_key][
            self.info["name"]
        ]

    def _construct_split_relative_regex(self):
        # Relative strings are regex fragments themselves, so they are joined
        # without escaping.
        known_relative_strings_group = "|".join(
            self._get_sorted_relative_strings_from_cache()
        )
        if self._no_word_spacing:
            regex = "({})".format(known_relative_strings_group)
        else:
            regex = "(?<=(?:\\A|\\W|_))({})(?=(?:\\Z|\\W|_))".format(
                known_relative_strings_group
            )
        self._add_to_cache(
            cache=self._split_relative_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )

    def _get_match_relative_regex_cache(self):
        if (
            self._settings.registry_key not in self._match_relative_regex_cache
            or self.info["name"]
            not in self._match_relative_regex_cache[self._settings.registry_key]
        ):
            self._construct_match_relative_regex()
        return self._match_relative_regex_cache[self._settings.registry_key][
            self.info["name"]
        ]

    def _construct_match_relative_regex(self):
        # Full-token match: the whole token must be a known relative string.
        known_relative_strings_group = "|".join(
            self._get_sorted_relative_strings_from_cache()
        )
        regex = "^({})$".format(known_relative_strings_group)
        self._add_to_cache(
            cache=self._match_relative_regex_cache,
            value=re.compile(regex, re.UNICODE | re.IGNORECASE),
        )
331
332
class NormalizedDictionary(Dictionary):
    """A :class:`Dictionary` whose keys and relative strings are unicode-normalized."""

    def __init__(self, locale_info, settings=None):
        super().__init__(locale_info, settings)
        self._normalize()

    def _normalize(self):
        # Build a new mapping keyed by normalized forms. When a normalized key
        # would collide with an existing raw key, defer it and only keep it if
        # it is a "skip"/"pertain" word.
        normalized_dict = {}
        deferred = []
        for raw_key, translation in self._dictionary.items():
            canonical = normalize_unicode(raw_key)
            if raw_key != canonical and canonical in self._dictionary:
                deferred.append(raw_key)
            else:
                normalized_dict[canonical] = translation
        skip_or_pertain = self.info.get("skip", []) + self.info.get("pertain", [])
        for raw_key in deferred:
            if raw_key in skip_or_pertain:
                normalized_dict[normalize_unicode(raw_key)] = self._dictionary[raw_key]
        self._dictionary = normalized_dict
        self._relative_strings = [
            normalize_unicode(item) for item in self._relative_strings
        ]