# -*- coding: utf-8 -*-

from __future__ import print_function

import re
from collections import defaultdict, Counter
from functools import partial
from itertools import chain

from sacremoses.corpus import Perluniprops
from sacremoses.util import parallelize_preprocess, grouper

# Hack to enable Python2.7 to use encoding.
import sys

if sys.version_info[0] < 3:
    import io
    import warnings

    open = io.open
    warnings.warn(
        str(
            "You should really be using Python3!!! "
            "Tick tock, tick tock, https://pythonclock.org/"
        )
    )

perluniprops = Perluniprops()


class MosesTruecaser(object):
    """
    This is a Python port of the Moses Truecaser from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/train-truecaser.perl
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecase.perl
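
    A minimal usage sketch (the training filename is illustrative):

        >>> mtr = MosesTruecaser()
        >>> model = mtr.train_from_file("train.tok.en")  # doctest: +SKIP
        >>> tokens = mtr.truecase("THE ADVENTURES OF SHERLOCK HOLMES")  # doctest: +SKIP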
    """

    # Perl Unicode Properties character sets.
    Lowercase_Letter = str("".join(perluniprops.chars("Lowercase_Letter")))
    Uppercase_Letter = str("".join(perluniprops.chars("Uppercase_Letter")))
    Titlecase_Letter = str("".join(perluniprops.chars("Titlecase_Letter")))

    def __init__(self, load_from=None, is_asr=None, encoding="utf8"):
        """
        :param load_from: The filename of a pre-trained truecase model to load.
        :type load_from: str

        :param is_asr: A flag to indicate that the model is for ASR. ASR input
            has no case information, so make sure that the input is lowercased
            and that known tokens are recased, e.g. 'i' is uppercased to 'I'
            even though the lowercase form is known.
        :type is_asr: bool
        """
        # Initialize the object.
        super(MosesTruecaser, self).__init__()
        # Build a regex that matches any cased (lower/upper/titlecase) letter.
        self.SKIP_LETTERS_REGEX = re.compile(
            "[{}{}{}]".format(
                self.Lowercase_Letter, self.Uppercase_Letter, self.Titlecase_Letter
            )
        )

        self.XML_SPLIT_REGX = re.compile("(<.*(?<=>))(.*)((?=</)[^>]*>)")

        self.SENT_END = {".", ":", "?", "!"}
        self.DELAYED_SENT_START = {
            "(",
            "[",
            '"',
            "'",
            "&apos;",
            "&quot;",
            "&#91;",
            "&#93;",
        }

        self.encoding = encoding

        self.is_asr = is_asr
        if load_from:
            self.model = self._load_model(load_from)

    def learn_truecase_weights(self, tokens, possibly_use_first_token=False):
        """
        This function checks each token in a sentence and returns the
        appropriate weight of each surface token form.
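
        Returns a list of (lowercased_token, surface_token, weight) triples.
        For example (the sentence-initial 'The' is dropped because
        `possibly_use_first_token` is False by default):

            >>> mtr = MosesTruecaser()
            >>> mtr.learn_truecase_weights("The adventures of Sherlock Holmes".split())
            [('adventures', 'adventures', 1), ('of', 'of', 1), ('sherlock', 'Sherlock', 1), ('holmes', 'Holmes', 1)]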
        """
        # Keep track of first tokens in the sentence(s) of the line.
        is_first_word = True
        truecase_weights = []
        for i, token in enumerate(tokens):
            # Skip XML tags.
            if re.search(r"(<\S[^>]*>)", token):
                continue
            # Skip sentence start symbols.
            elif token in self.DELAYED_SENT_START:
                continue

            # Resets `is_first_word` after seeing sentence end symbols.
            if not is_first_word and token in self.SENT_END:
                is_first_word = True
                continue
            # Skips tokens with nothing to case.
            if not self.SKIP_LETTERS_REGEX.search(token):
                is_first_word = False
                continue

            # If it's not the first word,
            # then set the current word weight to 1.
            current_word_weight = 0
            if not is_first_word:
                current_word_weight = 1
            # Otherwise, check whether the user wants to optionally
            # use the first word.
            elif possibly_use_first_token:
                # Gated special handling of the first word of the sentence.
                # Check if the first character of the token is lowercase.
                if token[0].islower():
                    current_word_weight = 1
                elif i == 1:
                    current_word_weight = 0.1

            is_first_word = False

            if current_word_weight > 0:
                truecase_weights.append((token.lower(), token, current_word_weight))
        return truecase_weights

    def _train(
        self,
        document_iterator,
        save_to=None,
        possibly_use_first_token=False,
        processes=1,
        progress_bar=False,
    ):
        """
        :param document_iterator: The input document; each outer list is a sentence,
            each inner list is the list of tokens of that sentence.
        :type document_iterator: iter(list(str))

        :param possibly_use_first_token: When True, on the basis that the first
            word of a sentence is always capitalized: a) if a sentence-initial
            token is *not* capitalized, then it is counted, and b) if a capitalized
            sentence-initial token is the only token of the segment, then it is
            counted, but with only 10% of the weight of a normal token.
        :type possibly_use_first_token: bool

        :returns: A dictionary of the best, known and casing objects from `_casing_to_model()`.
        :rtype: {'best': dict, 'known': Counter, 'casing': defaultdict(Counter)}
        """
        casing = defaultdict(Counter)
        train_truecaser = partial(
            self.learn_truecase_weights,
            possibly_use_first_token=possibly_use_first_token,
        )
        token_weights = chain(
            *parallelize_preprocess(
                train_truecaser, document_iterator, processes, progress_bar=progress_bar
            )
        )
        # Collect the token_weights from every sentence.
        for lowercase_token, surface_token, weight in token_weights:
            casing[lowercase_token][surface_token] += weight

        # Save to file if specified.
        if save_to:
            self._save_model_from_casing(casing, save_to)
        return self._casing_to_model(casing)

    def train(
        self,
        documents,
        save_to=None,
        possibly_use_first_token=False,
        processes=1,
        progress_bar=False,
    ):
        """
        Default duck-type of _train(); accepts list(list(str)) as input documents.
        """
        self.model = None  # Clear the model first.
        self.model = self._train(
            documents,
            save_to,
            possibly_use_first_token,
            processes,
            progress_bar=progress_bar,
        )
        return self.model

    def train_from_file(
        self,
        filename,
        save_to=None,
        possibly_use_first_token=False,
        processes=1,
        progress_bar=False,
    ):
        """
        Duck-type of _train(); accepts a filename to read as an
        `iter(list(str))` object.
        """
        with open(filename, encoding=self.encoding) as fin:
            # Note: readlines() is eager; the generator only defers the splitting.
            document_iterator = (line.split() for line in fin.readlines())
            self.model = None  # Clear the model first.
            self.model = self._train(
                document_iterator,
                save_to,
                possibly_use_first_token,
                processes,
                progress_bar=progress_bar,
            )
            return self.model

    def train_from_file_object(
        self,
        file_object,
        save_to=None,
        possibly_use_first_token=False,
        processes=1,
        progress_bar=False,
    ):
        """
        Duck-type of _train(); accepts a file object to read as an
        `iter(list(str))` object.
        """
        # Note: readlines() is eager; the generator only defers the splitting.
        document_iterator = (line.split() for line in file_object.readlines())
        self.model = None  # Clear the model first.
        self.model = self._train(
            document_iterator,
            save_to,
            possibly_use_first_token,
            processes,
            progress_bar=progress_bar,
        )
        return self.model

    def truecase(self, text, return_str=False, use_known=False):
        """
        Truecase a single sentence / line of text.

        :param text: A single string, i.e. sentence text.
        :type text: str

        :param use_known: Use the known case if a word is a known word but not the first word.
        :type use_known: bool
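
        A usage sketch; the model filename is illustrative and the output
        depends on the trained model:

            >>> mtr = MosesTruecaser("big.truecasemodel")  # doctest: +SKIP
            >>> mtr.truecase("THE ADVENTURES OF SHERLOCK HOLMES", return_str=True)  # doctest: +SKIP
            'the adventures of Sherlock Holmes'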
        """
        check_model_message = str(
            "\nUse Truecaser.train() to train a model.\n"
            "Or use Truecaser('modelfile') to load a model."
        )
        assert hasattr(self, "model"), check_model_message
        # Keep track of first tokens in the sentence(s) of the line.
        is_first_word = True
        truecased_tokens = []
        tokens = self.split_xml(text)

        for i, token in enumerate(tokens):

            # Append XML tags and continue.
            if re.search(r"(<\S[^>]*>)", token):
                truecased_tokens.append(token)
                continue

            # Note: this shouldn't happen unless "|" is escaped as &#124;.
            # To make the truecaser resilient, keep any token that starts
            # with a pipe as it is.
            if token == "|" or token.startswith("|"):
                truecased_tokens.append(token)
                continue

            # Reads the word token and the factors separately.
            token, other_factors = re.search(r"^([^\|]+)(.*)", token).groups()

            # Lowercase the ASR tokens.
            if self.is_asr:
                token = token.lower()

            # The actual case replacement happens here.
            # "Most frequent" case of the word.
            best_case = self.model["best"].get(token.lower(), None)
            # Other known cases of the word.
            known_case = self.model["known"].get(token, None)
            if is_first_word and best_case:  # Truecase the sentence start.
                token = best_case
            elif known_case:  # Don't change known tokens.
                token = known_case if use_known else token
            elif best_case:
                # Truecase otherwise-unknown tokens, following
                # https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecase.perl#L66
                token = best_case
            # Else, it's an unknown word; don't change the word.
            # Concatenate the truecased `word` with the `other_factors`.
            token = token + other_factors
            # Adds the truecased word.
            truecased_tokens.append(token)

            # Resets sentence start if this token is an ending punctuation.
            is_first_word = token in self.SENT_END

            if token in self.DELAYED_SENT_START:
                is_first_word = False

        return " ".join(truecased_tokens) if return_str else truecased_tokens

    def truecase_file(self, filename, return_str=True):
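        """
        Truecase a file line by line, lazily yielding each truecased line.
        Assumes that `self.model` has already been trained or loaded.
        """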
        with open(filename, encoding=self.encoding) as fin:
            for line in fin:
                truecased_tokens = self.truecase(line.strip())
                # Yield the truecased line.
                yield " ".join(truecased_tokens) if return_str else truecased_tokens

    @staticmethod
    def split_xml(line):
        """
        Python port of the split_xml function in Moses' truecaser:
        https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecaser.perl

        :param line: The input string; should be tokenized, separated by spaces.
        :type line: str
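
        For example (note that non-final XML tokens keep a trailing space,
        a token hack unique to sacremoses):

            >>> MosesTruecaser.split_xml("<s> Hello world </s>")
            ['<s> ', 'Hello', 'world', '</s>']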
        """
        line = line.strip()
        tokens = []
        while line:
            # Assumes that an XML tag is always separated by a space.
            has_xml = re.search(r"^\s*(<\S[^>]*>)(.*)$", line)
            # Non-XML test.
            is_non_xml = re.search(r"^\s*([^\s<>]+)(.*)$", line)
            # '<' or '>' occurs in the word, but it's not an XML tag.
            xml_cognates = re.search(r"^\s*(\S+)(.*)$", line)
            if has_xml:
                potential_xml, line_next = has_xml.groups()
                # Exception for a factor that is an XML tag.
                if (
                    re.search(r"^\S", line)
                    and len(tokens) > 0
                    and re.search(r"\|$", tokens[-1])
                ):
                    tokens[-1] += potential_xml
                    # If it's a token with factors, join with the previous token.
                    is_factor = re.search(r"^(\|+)(.*)$", line_next)
                    if is_factor:
                        tokens[-1] += is_factor.group(1)
                        line_next = is_factor.group(2)
                else:
                    tokens.append(
                        potential_xml + " "
                    )  # Token hack, unique to sacremoses.
                line = line_next

            elif is_non_xml:
                tokens.append(is_non_xml.group(1))  # Token hack, unique to sacremoses.
                line = is_non_xml.group(2)
            elif xml_cognates:
                tokens.append(
                    xml_cognates.group(1)
                )  # Token hack, unique to sacremoses.
                line = xml_cognates.group(2)
            else:
                raise Exception("ERROR: unexpected input in split_xml: {}".format(line))
        if tokens:  # Guard against empty input lines.
            tokens[-1] = tokens[-1].strip()  # Token hack, unique to sacremoses.
        return tokens

    def _casing_to_model(self, casing):
        """
        Converts raw casing counts into a truecasing model.

        :returns: A dictionary of the best, known and casing objects.
        :rtype: {'best': dict, 'known': Counter, 'casing': defaultdict(Counter)}
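
        For example, given casing {"the": Counter({"the": 90, "The": 10})},
        the result has best == {"the": "the"} and known == Counter({"The": 1});
        the alternative casings are recorded in `known` without their counts.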
        """
        best = {}
        known = Counter()

        for token_lower in casing:
            tokens = casing[token_lower].most_common()
            # Set the most frequent case as the "best" case.
            best[token_lower] = tokens[0][0]
            # If it's an ASR model, throw away the alternative casings.
            if not self.is_asr:
                for token, count in tokens[1:]:
                    # Note: it is rather odd that the counts are thrown away, but
                    # this follows
                    # https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecase.perl#L34
                    known[token] += 1
        model = {"best": best, "known": known, "casing": casing}
        return model

    def save_model(self, filename):
        self._save_model_from_casing(self.model["casing"], filename)

    def _save_model_from_casing(self, casing, filename):
        """
        Outputs the truecaser model file in the same output format as
        https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/train-truecaser.perl

        :param casing: The dictionary of token counters from `train()`.
        :type casing: defaultdict(Counter)
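
        Each output line lists one lowercased type's casings by frequency, e.g.
        `the (90/100) The (10)`; the first (best) casing also shows the total
        count of the type.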
        """
        with open(filename, "w", encoding=self.encoding) as fout:
            for token in casing:
                total_token_count = sum(casing[token].values())
                tokens_counts = []
                for i, (surface_token, count) in enumerate(casing[token].most_common()):
                    if i == 0:
                        out_token = "{} ({}/{})".format(
                            surface_token, count, total_token_count
                        )
                    else:
                        out_token = "{} ({})".format(surface_token, count)
                    tokens_counts.append(out_token)
                print(" ".join(tokens_counts), end="\n", file=fout)

    def _load_model(self, filename):
        """
        Loads a pre-trained truecasing file.

        :returns: A dictionary of the best, known and casing objects from `_casing_to_model()`.
        :rtype: {'best': dict, 'known': Counter, 'casing': defaultdict(Counter)}
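
        For example, the line `the (90/100) The (10)` yields
        casing["the"] == Counter({"the": 90, "The": 10}).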
        """
        casing = defaultdict(Counter)
        with open(filename, encoding=self.encoding) as fin:
            for line in fin:
                line = line.strip().split()
                for token, count in grouper(line, 2):
                    count = count.split("/")[0].strip("()")
                    casing[token.lower()][token] = int(count)
        # Returns the best and known objects from `_casing_to_model()`.
        return self._casing_to_model(casing)


class MosesDetruecaser(object):
    def __init__(self):
        # Initialize the object.
        super(MosesDetruecaser, self).__init__()
        self.SENT_END = {".", ":", "?", "!"}
        self.DELAYED_SENT_START = {
            "(",
            "[",
            '"',
            "'",
            "&apos;",
            "&quot;",
            "&#91;",
            "&#93;",
        }

        # Some predefined tokens that will always be in lowercase.
        self.ALWAYS_LOWER = {
            "a",
            "after",
            "against",
            "al-.+",
            "and",
            "any",
            "as",
            "at",
            "be",
            "because",
            "between",
            "by",
            "during",
            "el-.+",
            "for",
            "from",
            "his",
            "in",
            "is",
            "its",
            "last",
            "not",
            "of",
            "off",
            "on",
            "than",
            "the",
            "their",
            "this",
            "to",
            "was",
            "were",
            "which",
            "will",
            "with",
        }

    def detruecase(self, text, is_headline=False, return_str=False):
        """
        Detruecase the translated text from a model that learnt from truecased
        tokens.

        :param text: A single string, i.e. sentence text.
        :type text: str
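
        For example:

            >>> md = MosesDetruecaser()
            >>> md.detruecase("the adventures of sherlock holmes", return_str=True)
            'The adventures of sherlock holmes'
            >>> md.detruecase("the adventures of sherlock holmes", is_headline=True, return_str=True)
            'The Adventures of Sherlock Holmes'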
        """
        # `cased_tokens` keeps track of the detruecased tokens.
        cased_tokens = []
        sentence_start = True
        # Capitalize a token if it's at the sentence start.
        for token in text.split():
            token = token[:1].upper() + token[1:] if sentence_start else token
            cased_tokens.append(token)
            if token in self.SENT_END:
                sentence_start = True
            elif token not in self.DELAYED_SENT_START:
                sentence_start = False
        # Check if it's a headline; if so, then use title case.
        if is_headline:
            cased_tokens = [
                token if token in self.ALWAYS_LOWER else token[:1].upper() + token[1:]
                for token in cased_tokens
            ]

        return " ".join(cased_tokens) if return_str else cased_tokens


__all__ = ["MosesTruecaser", "MosesDetruecaser"]