#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re

from sacremoses.corpus import Perluniprops
from sacremoses.corpus import NonbreakingPrefixes
from sacremoses.util import is_cjk
from sacremoses.indic import VIRAMAS, NUKTAS

perluniprops = Perluniprops()
nonbreaking_prefixes = NonbreakingPrefixes()


class MosesTokenizer(object):
    """
    This is a Python port of the Moses Tokenizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
    """

    # Perl Unicode Properties character sets.
    IsN = str("".join(perluniprops.chars("IsN")))
    IsAlnum = str(
        "".join(perluniprops.chars("IsAlnum")) + "".join(VIRAMAS) + "".join(NUKTAS)
    )
    IsSc = str("".join(perluniprops.chars("IsSc")))
    IsSo = str("".join(perluniprops.chars("IsSo")))
    IsAlpha = str(
        "".join(perluniprops.chars("IsAlpha")) + "".join(VIRAMAS) + "".join(NUKTAS)
    )
    IsLower = str("".join(perluniprops.chars("IsLower")))

    # Remove ASCII junk.
    DEDUPLICATE_SPACE = r"\s+", r" "
    ASCII_JUNK = r"[\000-\037]", r""

    # Neurotic Perl leading space, multi-space and trailing space chomp.
    # These regexes are kept for reference purposes and shouldn't be used!!
    MID_STRIP = r" +", r" "  # Use DEDUPLICATE_SPACE instead.
    LEFT_STRIP = r"^ ", r""  # Use text.lstrip() instead.
    RIGHT_STRIP = r" $", r""  # Use text.rstrip() instead.

    # Pad all "other" special characters not in IsAlnum.
    PAD_NOT_ISALNUM = r"([^{}\s\.'\`\,\-])".format(IsAlnum), r" \1 "

    # Splits all hyphens (regardless of circumstances), e.g.
    # 'foo-bar' -> 'foo @-@ bar'
    AGGRESSIVE_HYPHEN_SPLIT = (
        r"([{alphanum}])\-(?=[{alphanum}])".format(alphanum=IsAlnum),
        r"\1 @-@ ",
    )

    # Make multi-dots stay together.
    REPLACE_DOT_WITH_LITERALSTRING_1 = r"\.([\.]+)", r" DOTMULTI\1"
    REPLACE_DOT_WITH_LITERALSTRING_2 = r"DOTMULTI\.([^\.])", r"DOTDOTMULTI \1"
    REPLACE_DOT_WITH_LITERALSTRING_3 = r"DOTMULTI\.", "DOTDOTMULTI"

    # Separate out "," except if within numbers (5,300)
    # e.g. A,B,C,D,E > A , B,C , D,E
    # First application uses up B so rule can't see B,C
    # two-step version here may create extra spaces but these are removed later
    # will also space digit,letter or letter,digit forms (redundant with next section)
    COMMA_SEPARATE_1 = r"([^{}])[,]".format(IsN), r"\1 , "
    COMMA_SEPARATE_2 = r"[,]([^{}])".format(IsN), r" , \1"
    COMMA_SEPARATE_3 = r"([{}])[,]$".format(IsN), r"\1 , "

    # Attempt to get correct directional quotes.
    DIRECTIONAL_QUOTE_1 = r"^``", r"`` "
    DIRECTIONAL_QUOTE_2 = r'^"', r"`` "
    DIRECTIONAL_QUOTE_3 = r"^`([^`])", r"` \1"
    DIRECTIONAL_QUOTE_4 = r"^'", r"` "
    DIRECTIONAL_QUOTE_5 = r'([ ([{<])"', r"\1 `` "
    DIRECTIONAL_QUOTE_6 = r"([ ([{<])``", r"\1 `` "
    DIRECTIONAL_QUOTE_7 = r"([ ([{<])`([^`])", r"\1 ` \2"
    DIRECTIONAL_QUOTE_8 = r"([ ([{<])'", r"\1 ` "

    # Replace ... with _ELLIPSIS_
    REPLACE_ELLIPSIS = r"\.\.\.", r" _ELLIPSIS_ "
    # Restore _ELLIPSIS_ with ...
    RESTORE_ELLIPSIS = r"_ELLIPSIS_", r"..."

    # Pad , with trailing space except if within numbers, e.g. 5,300
    COMMA_1 = r"([^{numbers}])[,]([^{numbers}])".format(numbers=IsN), r"\1 , \2"
    COMMA_2 = r"([{numbers}])[,]([^{numbers}])".format(numbers=IsN), r"\1 , \2"
    COMMA_3 = r"([^{numbers}])[,]([{numbers}])".format(numbers=IsN), r"\1 , \2"

    # Pad unicode symbols with spaces.
    SYMBOLS = r"([;:@#\$%&{}{}])".format(IsSc, IsSo), r" \1 "

    # Separate out intra-token slashes. PTB tokenization doesn't do this, so
    # the tokens should be merged prior to parsing with a PTB-trained parser.
    # e.g. "and/or" -> "and @/@ or"
    INTRATOKEN_SLASHES = (
        r"([{alphanum}])\/([{alphanum}])".format(alphanum=IsAlnum),
        r"\1 @/@ \2",
    )

    # Splits final period at end of string.
    FINAL_PERIOD = r"""([^.])([.])([\]\)}>"']*) ?$""", r"\1 \2\3"
    # Pad all question marks and exclamation marks with spaces.
    PAD_QUESTION_EXCLAMATION_MARK = r"([?!])", r" \1 "

    # Handles parentheses, brackets and converts them to PTB symbols.
    PAD_PARENTHESIS = r"([\]\[\(\){}<>])", r" \1 "
    CONVERT_PARENTHESIS_1 = r"\(", "-LRB-"
    CONVERT_PARENTHESIS_2 = r"\)", "-RRB-"
    CONVERT_PARENTHESIS_3 = r"\[", "-LSB-"
    CONVERT_PARENTHESIS_4 = r"\]", "-RSB-"
    CONVERT_PARENTHESIS_5 = r"\{", "-LCB-"
    CONVERT_PARENTHESIS_6 = r"\}", "-RCB-"

    # Pads double dashes with spaces.
    PAD_DOUBLE_DASHES = r"--", " -- "

    # Adds spaces to start and end of string to simplify further regexps.
    PAD_START_OF_STR = r"^", " "
    PAD_END_OF_STR = r"$", " "

    # Converts double quotes to two single quotes and pad with spaces.
    CONVERT_DOUBLE_TO_SINGLE_QUOTES = r'"', " '' "
    # Handles single quote in possessives or close-single-quote.
    HANDLES_SINGLE_QUOTES = r"([^'])' ", r"\1 ' "

    # Pad apostrophe in possessive or close-single-quote.
    APOSTROPHE = r"([^'])'", r"\1 ' "

    # Prepend space on contraction apostrophe.
    CONTRACTION_1 = r"'([sSmMdD]) ", r" '\1 "
    CONTRACTION_2 = r"'ll ", r" 'll "
    CONTRACTION_3 = r"'re ", r" 're "
    CONTRACTION_4 = r"'ve ", r" 've "
    CONTRACTION_5 = r"n't ", r" n't "
    CONTRACTION_6 = r"'LL ", r" 'LL "
    CONTRACTION_7 = r"'RE ", r" 'RE "
    CONTRACTION_8 = r"'VE ", r" 'VE "
    CONTRACTION_9 = r"N'T ", r" N'T "

    # Informal Contractions.
    CONTRACTION_10 = r" ([Cc])annot ", r" \1an not "
    CONTRACTION_11 = r" ([Dd])'ye ", r" \1' ye "
    CONTRACTION_12 = r" ([Gg])imme ", r" \1im me "
    CONTRACTION_13 = r" ([Gg])onna ", r" \1on na "
    CONTRACTION_14 = r" ([Gg])otta ", r" \1ot ta "
    CONTRACTION_15 = r" ([Ll])emme ", r" \1em me "
    CONTRACTION_16 = r" ([Mm])ore'n ", r" \1ore 'n "
    CONTRACTION_17 = r" '([Tt])is ", r" '\1 is "
    CONTRACTION_18 = r" '([Tt])was ", r" '\1 was "
    CONTRACTION_19 = r" ([Ww])anna ", r" \1an na "

    # Clean out extra spaces.
    CLEAN_EXTRA_SPACE_1 = r"  *", r" "
    CLEAN_EXTRA_SPACE_2 = r"^ *", r""
    CLEAN_EXTRA_SPACE_3 = r" *$", r""

    # Neurotic Perl regexes to escape special characters.
    ESCAPE_AMPERSAND = r"&", r"&amp;"
    ESCAPE_PIPE = r"\|", r"&#124;"
    ESCAPE_LEFT_ANGLE_BRACKET = r"<", r"&lt;"
    ESCAPE_RIGHT_ANGLE_BRACKET = r">", r"&gt;"
    ESCAPE_SINGLE_QUOTE = r"\'", r"&apos;"
    ESCAPE_DOUBLE_QUOTE = r"\"", r"&quot;"
    ESCAPE_LEFT_SQUARE_BRACKET = r"\[", r"&#91;"
    ESCAPE_RIGHT_SQUARE_BRACKET = r"]", r"&#93;"

    EN_SPECIFIC_1 = r"([^{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
    EN_SPECIFIC_2 = (
        r"([^{alpha}{isn}])[']([{alpha}])".format(alpha=IsAlpha, isn=IsN),
        r"\1 ' \2",
    )
    EN_SPECIFIC_3 = r"([{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
    EN_SPECIFIC_4 = r"([{alpha}])[']([{alpha}])".format(alpha=IsAlpha), r"\1 '\2"
    EN_SPECIFIC_5 = r"([{isn}])[']([s])".format(isn=IsN), r"\1 '\2"

    ENGLISH_SPECIFIC_APOSTROPHE = [
        EN_SPECIFIC_1,
        EN_SPECIFIC_2,
        EN_SPECIFIC_3,
        EN_SPECIFIC_4,
        EN_SPECIFIC_5,
    ]
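
    # Illustrative effect of the English apostrophe rules above (a sketch, not a
    # recorded doctest): EN_SPECIFIC_4 keeps contractions attached to the
    # apostrophe, e.g. "don't" -> "don 't" and "the plant's" -> "the plant 's"
    # (before the optional XML-escaping step applied at the end of tokenize()).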

    FR_IT_SPECIFIC_1 = r"([^{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
    FR_IT_SPECIFIC_2 = r"([^{alpha}])[']([{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
    FR_IT_SPECIFIC_3 = r"([{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
    FR_IT_SPECIFIC_4 = r"([{alpha}])[']([{alpha}])".format(alpha=IsAlpha), r"\1' \2"

    FR_IT_SPECIFIC_APOSTROPHE = [
        FR_IT_SPECIFIC_1,
        FR_IT_SPECIFIC_2,
        FR_IT_SPECIFIC_3,
        FR_IT_SPECIFIC_4,
    ]

    NON_SPECIFIC_APOSTROPHE = r"\'", " ' "

    TRAILING_DOT_APOSTROPHE = r"\.' ?$", " . ' "

    BASIC_PROTECTED_PATTERN_1 = r"<\/?\S+\/?>"
    BASIC_PROTECTED_PATTERN_2 = r'<\S+( [a-zA-Z0-9]+\="?[^"]")+ ?\/?>'
    BASIC_PROTECTED_PATTERN_3 = r"<\S+( [a-zA-Z0-9]+\='?[^']')+ ?\/?>"
    BASIC_PROTECTED_PATTERN_4 = r"[\w\-\_\.]+\@([\w\-\_]+\.)+[a-zA-Z]{2,}"
    BASIC_PROTECTED_PATTERN_5 = r"(http[s]?|ftp):\/\/[^:\/\s]+(\/\w+)*\/[\w\-\.]+"

    MOSES_PENN_REGEXES_1 = [
        DEDUPLICATE_SPACE,
        ASCII_JUNK,
        DIRECTIONAL_QUOTE_1,
        DIRECTIONAL_QUOTE_2,
        DIRECTIONAL_QUOTE_3,
        DIRECTIONAL_QUOTE_4,
        DIRECTIONAL_QUOTE_5,
        DIRECTIONAL_QUOTE_6,
        DIRECTIONAL_QUOTE_7,
        DIRECTIONAL_QUOTE_8,
        REPLACE_ELLIPSIS,
        COMMA_1,
        COMMA_2,
        COMMA_3,
        SYMBOLS,
        INTRATOKEN_SLASHES,
        FINAL_PERIOD,
        PAD_QUESTION_EXCLAMATION_MARK,
        PAD_PARENTHESIS,
        CONVERT_PARENTHESIS_1,
        CONVERT_PARENTHESIS_2,
        CONVERT_PARENTHESIS_3,
        CONVERT_PARENTHESIS_4,
        CONVERT_PARENTHESIS_5,
        CONVERT_PARENTHESIS_6,
        PAD_DOUBLE_DASHES,
        PAD_START_OF_STR,
        PAD_END_OF_STR,
        CONVERT_DOUBLE_TO_SINGLE_QUOTES,
        HANDLES_SINGLE_QUOTES,
        APOSTROPHE,
        CONTRACTION_1,
        CONTRACTION_2,
        CONTRACTION_3,
        CONTRACTION_4,
        CONTRACTION_5,
        CONTRACTION_6,
        CONTRACTION_7,
        CONTRACTION_8,
        CONTRACTION_9,
        CONTRACTION_10,
        CONTRACTION_11,
        CONTRACTION_12,
        CONTRACTION_13,
        CONTRACTION_14,
        CONTRACTION_15,
        CONTRACTION_16,
        CONTRACTION_17,
        CONTRACTION_18,
        CONTRACTION_19,
    ]

    MOSES_PENN_REGEXES_2 = [
        RESTORE_ELLIPSIS,
        CLEAN_EXTRA_SPACE_1,
        CLEAN_EXTRA_SPACE_2,
        CLEAN_EXTRA_SPACE_3,
        ESCAPE_AMPERSAND,
        ESCAPE_PIPE,
        ESCAPE_LEFT_ANGLE_BRACKET,
        ESCAPE_RIGHT_ANGLE_BRACKET,
        ESCAPE_SINGLE_QUOTE,
        ESCAPE_DOUBLE_QUOTE,
    ]

    MOSES_ESCAPE_XML_REGEXES = [
        ESCAPE_AMPERSAND,
        ESCAPE_PIPE,
        ESCAPE_LEFT_ANGLE_BRACKET,
        ESCAPE_RIGHT_ANGLE_BRACKET,
        ESCAPE_SINGLE_QUOTE,
        ESCAPE_DOUBLE_QUOTE,
        ESCAPE_LEFT_SQUARE_BRACKET,
        ESCAPE_RIGHT_SQUARE_BRACKET,
    ]

    BASIC_PROTECTED_PATTERNS = [
        BASIC_PROTECTED_PATTERN_1,
        BASIC_PROTECTED_PATTERN_2,
        BASIC_PROTECTED_PATTERN_3,
        BASIC_PROTECTED_PATTERN_4,
        BASIC_PROTECTED_PATTERN_5,
    ]
    WEB_PROTECTED_PATTERNS = [
        r"((https?|ftp|rsync)://|www\.)[^ ]*",  # URLs
        r"[\w\-\_\.]+\@([\w\-\_]+\.)+[a-zA-Z]{2,}",  # Emails user@host.domain
        r"@[a-zA-Z0-9_]+",  # @handles such as Twitter/GitHub IDs
        r"#[a-zA-Z0-9_]+",  # #hashtags
        # TODO: emojis especially the multi codepoints
    ]
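
    # Illustrative use of the protected patterns (a sketch, not a recorded doctest):
    # passing this list as `protected_patterns` to `tokenize()` keeps URLs, e-mails,
    # @handles and #hashtags as single tokens, e.g.
    #
    #     >>> mt = MosesTokenizer(lang="en")
    #     >>> mt.tokenize("visit https://example.com now",
    #     ...             protected_patterns=MosesTokenizer.WEB_PROTECTED_PATTERNS)
    #     ['visit', 'https://example.com', 'now']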

    def __init__(self, lang="en", custom_nonbreaking_prefixes_file=None):
        # Initialize the object.
        super(MosesTokenizer, self).__init__()
        self.lang = lang

        # Initialize the language specific nonbreaking prefixes.
        self.NONBREAKING_PREFIXES = [
            _nbp.strip() for _nbp in nonbreaking_prefixes.words(lang)
        ]

        # Load custom nonbreaking prefixes file.
        if custom_nonbreaking_prefixes_file:
            self.NONBREAKING_PREFIXES = []
            with open(custom_nonbreaking_prefixes_file, "r") as fin:
                for line in fin:
                    line = line.strip()
                    if line and not line.startswith("#"):
                        if line not in self.NONBREAKING_PREFIXES:
                            self.NONBREAKING_PREFIXES.append(line)

        self.NUMERIC_ONLY_PREFIXES = [
            w.rpartition(" ")[0]
            for w in self.NONBREAKING_PREFIXES
            if self.has_numeric_only(w)
        ]
        # Add CJK characters to alpha and alnum.
        if self.lang in ["zh", "ja", "ko", "cjk"]:
            cjk_chars = ""
            if self.lang in ["ko", "cjk"]:
                cjk_chars += str("".join(perluniprops.chars("Hangul")))
            if self.lang in ["zh", "cjk"]:
                cjk_chars += str("".join(perluniprops.chars("Han")))
            if self.lang in ["ja", "cjk"]:
                cjk_chars += str("".join(perluniprops.chars("Hiragana")))
                cjk_chars += str("".join(perluniprops.chars("Katakana")))
                cjk_chars += str("".join(perluniprops.chars("Han")))
            self.IsAlpha += cjk_chars
            self.IsAlnum += cjk_chars
            # Overwrite the alnum regexes.
            self.PAD_NOT_ISALNUM = r"([^{}\s\.'\`\,\-])".format(self.IsAlnum), r" \1 "
            self.AGGRESSIVE_HYPHEN_SPLIT = (
                r"([{alphanum}])\-(?=[{alphanum}])".format(alphanum=self.IsAlnum),
                r"\1 @-@ ",
            )
            self.INTRATOKEN_SLASHES = (
                r"([{alphanum}])\/([{alphanum}])".format(alphanum=self.IsAlnum),
                r"\1 @/@ \2",
            )

    def replace_multidots(self, text):
        text = re.sub(r"\.([\.]+)", r" DOTMULTI\1", text)
        while re.search(r"DOTMULTI\.", text):
            text = re.sub(r"DOTMULTI\.([^\.])", r"DOTDOTMULTI \1", text)
            text = re.sub(r"DOTMULTI\.", "DOTDOTMULTI", text)
        return text

    def restore_multidots(self, text):
        while re.search(r"DOTDOTMULTI", text):
            text = re.sub(r"DOTDOTMULTI", r"DOTMULTI.", text)
        return re.sub(r"DOTMULTI", r".", text)
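
    # Illustrative round trip (a sketch, not a recorded doctest): the two helpers
    # above protect multi-dot runs from the other rules and later restore them,
    # e.g. replace_multidots("Wait...") is expected to yield "Wait DOTDOTDOTMULTI",
    # and restore_multidots() on that string brings back "Wait ...".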

    def islower(self, text):
        return not set(text).difference(set(self.IsLower))

    def isanyalpha(self, text):
        return any(set(text).intersection(set(self.IsAlpha)))

    def has_numeric_only(self, text):
        return bool(re.search(r"[\s]+(\#NUMERIC_ONLY\#)", text))

    def handles_nonbreaking_prefixes(self, text):
        # Splits the text into tokens to check for nonbreaking prefixes.
        tokens = text.split()
        num_tokens = len(tokens)
        for i, token in enumerate(tokens):
            # Checks if token ends with a fullstop.
            token_ends_with_period = re.search(r"^(\S+)\.$", token)
            if token_ends_with_period:
                prefix = token_ends_with_period.group(1)
                # Checks for 3 conditions if
                # i.   the prefix contains a fullstop and
                #      any char in the prefix is within the IsAlpha charset
                # ii.  the prefix is in the list of nonbreaking prefixes and
                #      does not contain #NUMERIC_ONLY#
                # iii. the token is not the last token and the
                #      next token starts with a lowercase character.
                if (
                    ("." in prefix and self.isanyalpha(prefix))
                    or (
                        prefix in self.NONBREAKING_PREFIXES
                        and prefix not in self.NUMERIC_ONLY_PREFIXES
                    )
                    or (
                        i != num_tokens - 1
                        and tokens[i + 1]
                        and self.islower(tokens[i + 1][0])
                    )
                ):
                    pass  # No change to the token.
                # Checks if the prefix is in NUMERIC_ONLY_PREFIXES
                # and ensures that the next word is a digit.
                elif (
                    prefix in self.NUMERIC_ONLY_PREFIXES
                    and (i + 1) < num_tokens
                    and re.search(r"^[0-9]+", tokens[i + 1])
                ):
                    pass  # No change to the token.
                else:  # Otherwise, adds a space after the tokens before a dot.
                    tokens[i] = prefix + " ."
        return " ".join(tokens)  # Stitch the tokens back.

    def escape_xml(self, text):
        for regexp, substitution in self.MOSES_ESCAPE_XML_REGEXES:
            text = re.sub(regexp, substitution, text)
        return text
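
    # For example (illustrative, given the escape table above),
    # escape_xml('foo & "bar" <baz>') is expected to return
    # 'foo &amp; &quot;bar&quot; &lt;baz&gt;'.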

    def penn_tokenize(self, text, return_str=False):
        """
        This is a Python port of the Penn treebank tokenizer adapted by the Moses
        machine translation community.
        """
        # Converts input string into unicode.
        text = str(text)
        # Perform a chain of regex substitutions using MOSES_PENN_REGEXES_1.
        for regexp, substitution in self.MOSES_PENN_REGEXES_1:
            text = re.sub(regexp, substitution, text)
        # Handles nonbreaking prefixes.
        text = self.handles_nonbreaking_prefixes(text)
        # Restore ellipsis, clean extra spaces, escape XML symbols.
        for regexp, substitution in self.MOSES_PENN_REGEXES_2:
            text = re.sub(regexp, substitution, text)
        return text if return_str else text.split()

    def tokenize(
        self,
        text,
        aggressive_dash_splits=False,
        return_str=False,
        escape=True,
        protected_patterns=None,
    ):
        """
        Python port of the Moses tokenizer.

        :param text: A single string, i.e. sentence text.
        :type text: str
        :param aggressive_dash_splits: Option to trigger dash split rules.
        :type aggressive_dash_splits: bool
        :param return_str: Return a single string instead of a list of tokens.
        :type return_str: bool
        :param escape: Escape special characters (e.g. &, <, >, ") as XML entities.
        :type escape: bool
        :param protected_patterns: Regex patterns whose matches are protected from tokenization.
        :type protected_patterns: list(str)
        """
        # Converts input string into unicode.
        text = str(text)
        # De-duplicate spaces and clean ASCII junk.
        for regexp, substitution in [self.DEDUPLICATE_SPACE, self.ASCII_JUNK]:
            text = re.sub(regexp, substitution, text)

        if protected_patterns:
            # Find the tokens that need to be protected.
            protected_tokens = [
                match.group()
                for protected_pattern in protected_patterns
                for match in re.finditer(protected_pattern, text, re.IGNORECASE)
            ]
            # Replace the protected tokens with placeholders.
            for i, token in enumerate(protected_tokens):
                substitution = "THISISPROTECTED" + str(i).zfill(3)
                text = text.replace(token, substitution)

        # Strip leading and trailing spaces.
        text = text.strip()
        # FIXME!!!
        """
        # For Finnish and Swedish, separate out all "other" special characters.
        if self.lang in ["fi", "sv"]:
            # In Finnish and Swedish, the colon can be used inside words
            # as an apostrophe-like character:
            # USA:n, 20:een, EU:ssa, USA:s, S:t
            regexp, substitution = self.FI_SV_COLON_APOSTROPHE
            text = re.sub(regexp, substitution, text)
            # If a colon is not immediately followed by lower-case characters,
            # separate it out anyway.
            regexp, substitution = self.FI_SV_COLON_NO_LOWER_FOLLOW
            text = re.sub(regexp, substitution, text)
        else:
        """
        # Separate special characters outside of IsAlnum character set.
        regexp, substitution = self.PAD_NOT_ISALNUM
        text = re.sub(regexp, substitution, text)
        # Aggressively splits dashes.
        if aggressive_dash_splits:
            regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
            text = re.sub(regexp, substitution, text)

        # Replaces multidots with "DOTDOTMULTI" literal strings.
        text = self.replace_multidots(text)

        # Separate out "," except if within numbers e.g. 5,300
        for regexp, substitution in [
            self.COMMA_SEPARATE_1,
            self.COMMA_SEPARATE_2,
            self.COMMA_SEPARATE_3,
        ]:
            text = re.sub(regexp, substitution, text)

        # (Language-specific) apostrophe tokenization.
        if self.lang == "en":
            for regexp, substitution in self.ENGLISH_SPECIFIC_APOSTROPHE:
                text = re.sub(regexp, substitution, text)
        elif self.lang in ["fr", "it"]:
            for regexp, substitution in self.FR_IT_SPECIFIC_APOSTROPHE:
                text = re.sub(regexp, substitution, text)
        # FIXME!!!
        ##elif self.lang == "so":
        ##    for regexp, substitution in self.SO_SPECIFIC_APOSTROPHE:
        ##        text = re.sub(regexp, substitution, text)
        else:
            regexp, substitution = self.NON_SPECIFIC_APOSTROPHE
            text = re.sub(regexp, substitution, text)

        # Handles nonbreaking prefixes.
        text = self.handles_nonbreaking_prefixes(text)
        # Cleans up extraneous spaces.
        regexp, substitution = self.DEDUPLICATE_SPACE
        text = re.sub(regexp, substitution, text).strip()
        # Split trailing ".'".
        regexp, substitution = self.TRAILING_DOT_APOSTROPHE
        text = re.sub(regexp, substitution, text)

        # Restore the protected tokens.
        if protected_patterns:
            for i, token in enumerate(protected_tokens):
                substitution = "THISISPROTECTED" + str(i).zfill(3)
                text = text.replace(substitution, token)

        # Restore multidots.
        text = self.restore_multidots(text)
        if escape:
            # Escape XML symbols.
            text = self.escape_xml(text)

        return text if return_str else text.split()
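
    # End-to-end sketch of `tokenize()` (illustrative, not a recorded doctest):
    #
    #     >>> mt = MosesTokenizer(lang="en")
    #     >>> mt.tokenize("Hello World, this is a test!", return_str=True)
    #     'Hello World , this is a test !'
    #     >>> mt.tokenize("a co-operation deal", aggressive_dash_splits=True)
    #     ['a', 'co', '@-@', 'operation', 'deal']
    #     >>> mt.tokenize("fish & chips")  # XML escaping is on by default.
    #     ['fish', '&amp;', 'chips']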


class MosesDetokenizer(object):
    """
    This is a Python port of the Moses Detokenizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl

    """

    # Perl Unicode Properties character sets.
    IsAlnum = str("".join(perluniprops.chars("IsAlnum")))
    IsAlpha = str("".join(perluniprops.chars("IsAlpha")))
    IsSc = str("".join(perluniprops.chars("IsSc")))

    AGGRESSIVE_HYPHEN_SPLIT = r" \@\-\@ ", r"-"

    # Merge multiple spaces.
    ONE_SPACE = re.compile(r" {2,}"), " "

    # Unescape special characters.
    UNESCAPE_FACTOR_SEPARATOR = r"&#124;", r"|"
    UNESCAPE_LEFT_ANGLE_BRACKET = r"&lt;", r"<"
    UNESCAPE_RIGHT_ANGLE_BRACKET = r"&gt;", r">"
    UNESCAPE_DOUBLE_QUOTE = r"&quot;", r'"'
    UNESCAPE_SINGLE_QUOTE = r"&apos;", r"'"
    UNESCAPE_SYNTAX_NONTERMINAL_LEFT = r"&#91;", r"["
    UNESCAPE_SYNTAX_NONTERMINAL_RIGHT = r"&#93;", r"]"
    UNESCAPE_AMPERSAND = r"&amp;", r"&"
    # The legacy regexes are used to support outputs from older Moses versions.
    UNESCAPE_FACTOR_SEPARATOR_LEGACY = r"&bar;", r"|"
    UNESCAPE_SYNTAX_NONTERMINAL_LEFT_LEGACY = r"&bra;", r"["
    UNESCAPE_SYNTAX_NONTERMINAL_RIGHT_LEGACY = r"&ket;", r"]"

    MOSES_UNESCAPE_XML_REGEXES = [
        UNESCAPE_FACTOR_SEPARATOR_LEGACY,
        UNESCAPE_FACTOR_SEPARATOR,
        UNESCAPE_LEFT_ANGLE_BRACKET,
        UNESCAPE_RIGHT_ANGLE_BRACKET,
        UNESCAPE_SYNTAX_NONTERMINAL_LEFT_LEGACY,
        UNESCAPE_SYNTAX_NONTERMINAL_RIGHT_LEGACY,
        UNESCAPE_DOUBLE_QUOTE,
        UNESCAPE_SINGLE_QUOTE,
        UNESCAPE_SYNTAX_NONTERMINAL_LEFT,
        UNESCAPE_SYNTAX_NONTERMINAL_RIGHT,
        UNESCAPE_AMPERSAND,
    ]

    FINNISH_MORPHSET_1 = [
        "N",
        "n",
        "A",
        "a",
        "\xc4",
        "\xe4",
        "ssa",
        "Ssa",
        "ss\xe4",
        "Ss\xe4",
        "sta",
        "st\xe4",
        "Sta",
        "St\xe4",
        "hun",
        "Hun",
        "hyn",
        "Hyn",
        "han",
        "Han",
        "h\xe4n",
        "H\xe4n",
        "h\xf6n",
        "H\xf6n",
        "un",
        "Un",
        "yn",
        "Yn",
        "an",
        "An",
        "\xe4n",
        "\xc4n",
        "\xf6n",
        "\xd6n",
        "seen",
        "Seen",
        "lla",
        "Lla",
        "ll\xe4",
        "Ll\xe4",
        "lta",
        "Lta",
        "lt\xe4",
        "Lt\xe4",
        "lle",
        "Lle",
        "ksi",
        "Ksi",
        "kse",
        "Kse",
        "tta",
        "Tta",
        "ine",
        "Ine",
    ]

    FINNISH_MORPHSET_2 = ["ni", "si", "mme", "nne", "nsa"]

    FINNISH_MORPHSET_3 = [
        "ko",
        "k\xf6",
        "han",
        "h\xe4n",
        "pa",
        "p\xe4",
        "kaan",
        "k\xe4\xe4n",
        "kin",
    ]

    FINNISH_REGEX = r"^({})({})?({})?$".format(
        str("|".join(FINNISH_MORPHSET_1)),
        str("|".join(FINNISH_MORPHSET_2)),
        str("|".join(FINNISH_MORPHSET_3)),
    )

    def __init__(self, lang="en"):
        super(MosesDetokenizer, self).__init__()
        self.lang = lang

    def unescape_xml(self, text):
        for regexp, substitution in self.MOSES_UNESCAPE_XML_REGEXES:
            text = re.sub(regexp, substitution, text)
        return text
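
    # For example (illustrative, given the unescape table above),
    # unescape_xml('foo &amp; &quot;bar&quot;') is expected to return 'foo & "bar"'.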

    def tokenize(self, tokens, return_str=True, unescape=True):
        """
        Python port of the Moses detokenizer.

        :param tokens: A list of strings, i.e. tokenized text.
        :type tokens: list(str)
        :return: str
        """
        # Convert the list of tokens into a string and pad it with spaces.
        text = r" {} ".format(" ".join(tokens))
        # Converts input string into unicode.
        text = str(text)
        # Detokenize the aggressive hyphen split.
        regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
        text = re.sub(regexp, substitution, text)
        if unescape:
            # Unescape the XML symbols.
            text = self.unescape_xml(text)
        # Keep track of no. of quotation marks.
        quote_counts = {"'": 0, '"': 0, "``": 0, "`": 0, "''": 0}

        # The *prepend_space* variable is used to control the "effects" of
        # detokenization as the function loops through the list of tokens and
        # changes the *prepend_space* accordingly as it sequentially checks
        # through the language specific and language independent conditions.
        prepend_space = " "
        detokenized_text = ""
        tokens = text.split()
        # Iterate through every token and apply language specific detokenization rule(s).
        # *skip_next* lets the Czech "-li" rule below consume the following dash token.
        skip_next = False
        for i, token in enumerate(tokens):
            if skip_next:
                skip_next = False
                continue
            # Check if the first char is CJK.
            if is_cjk(token[0]) and self.lang != "ko":
                # Perform left shift if this is a second consecutive CJK word.
                if i > 0 and is_cjk(tokens[i - 1][-1]):
                    detokenized_text += token
                # But do nothing special if this is a CJK word that doesn't follow a CJK word
                else:
                    detokenized_text += prepend_space + token
                prepend_space = " "
            # If it's a currency symbol.
            elif re.search(r"^[" + self.IsSc + r"\(\[\{\¿\¡]+$", token):
                # Perform right shift on currency and other random punctuation items
                detokenized_text += prepend_space + token
                prepend_space = ""

            elif re.search(r"^[\,\.\?\!\:\;\\\%\}\]\)]+$", token):
                # In French, these punctuation marks are preceded by a non-breaking space.
                if self.lang == "fr" and re.search(r"^[\?\!\:\;\\\%]$", token):
                    detokenized_text += " "
                # Perform left shift on punctuation items.
                detokenized_text += token
                prepend_space = " "

            elif (
                self.lang == "en"
                and i > 0
                and re.search(r"^['][{}]".format(self.IsAlpha), token)
            ):
                # and re.search('[{}]$'.format(self.IsAlnum), tokens[i-1])):
                # For English, left-shift the contraction.
                detokenized_text += token
                prepend_space = " "

            elif (
                self.lang == "cs"
                and i > 1
                and re.search(
                    r"^[0-9]+$", tokens[i - 2]
                )  # If the token two positions back is a number.
                and re.search(r"^[.,]$", tokens[i - 1])  # If the previous token is a dot or comma.
                and re.search(r"^[0-9]+$", token)
            ):  # If the current token is a number.
                # In Czech, left-shift floats that are decimal numbers.
                detokenized_text += token
                prepend_space = " "

            elif (
                self.lang in ["fr", "it", "ga"]
                and i <= len(tokens) - 2
                and re.search(r"[{}][']$".format(self.IsAlpha), token)
                and re.search(r"^[{}]".format(self.IsAlpha), tokens[i + 1])
            ):  # If the next token is alpha.
                # For French and Italian, right-shift the contraction.
                detokenized_text += prepend_space + token
                prepend_space = ""

            elif (
                self.lang == "cs"
                and i <= len(tokens) - 3
                and re.search(r"[{}][']$".format(self.IsAlpha), token)
                and re.search(r"^[-–]$", tokens[i + 1])
                and re.search(r"^li$|^mail.*", tokens[i + 2], re.IGNORECASE)
            ):  # In Perl, ($words[$i+2] =~ /^li$|^mail.*/i)
                # In Czech, right-shift "-li" and a few Czech dashed words (e.g. e-mail)
                detokenized_text += prepend_space + token + tokens[i + 1]
                skip_next = True  # Advance over the dash.
                prepend_space = ""

            # Combine punctuation smartly.
            elif re.search(r"""^[\'\"„“`]+$""", token):
                normalized_quo = token
                if re.search(r"^[„“”]+$", token):
                    normalized_quo = '"'
                quote_counts[normalized_quo] = quote_counts.get(normalized_quo, 0)

                if self.lang == "cs" and token == "„":
                    quote_counts[normalized_quo] = 0
                if self.lang == "cs" and token == "“":
                    quote_counts[normalized_quo] = 1

                if quote_counts[normalized_quo] % 2 == 0:
                    if (
                        self.lang == "en"
                        and token == "'"
                        and i > 0
                        and re.search(r"[s]$", tokens[i - 1])
                    ):
                        # Left shift on single quote for possessives ending
                        # in "s", e.g. "The Jones' house"
                        detokenized_text += token
                        prepend_space = " "
                    else:
                        # Right shift.
                        detokenized_text += prepend_space + token
                        prepend_space = ""
                        quote_counts[normalized_quo] += 1
                else:
                    # Left shift.
                    detokenized_text += token
                    prepend_space = " "
                    quote_counts[normalized_quo] += 1

            elif (
                self.lang == "fi"
                and re.search(r":$", tokens[i - 1])
                and re.search(self.FINNISH_REGEX, token)
            ):
                # Finnish : without intervening space if followed by case suffix
                # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
                detokenized_text += prepend_space + token
                prepend_space = " "

            else:
                detokenized_text += prepend_space + token
                prepend_space = " "

        # Merge multiple spaces.
        regexp, substitution = self.ONE_SPACE
        detokenized_text = re.sub(regexp, substitution, detokenized_text)
        # Remove leading and trailing spaces.
        detokenized_text = detokenized_text.strip()

        return detokenized_text if return_str else detokenized_text.split()

    def detokenize(self, tokens, return_str=True, unescape=True):
        """Duck-typing the abstract *tokenize()*."""
        return self.tokenize(tokens, return_str, unescape)


__all__ = ["MosesTokenizer", "MosesDetokenizer"]
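

# Minimal smoke test (an illustrative sketch, not part of the original module);
# it only runs when this file is executed directly:
if __name__ == "__main__":
    _tokenizer = MosesTokenizer(lang="en")
    _detokenizer = MosesDetokenizer(lang="en")
    _sentence = "Hello World, this is a test!"
    _tokens = _tokenizer.tokenize(_sentence)
    print(_tokens)
    # Detokenizing the tokens (with default unescaping) is expected to
    # round-trip the original sentence.
    print(_detokenizer.detokenize(_tokens))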