1#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
3
4import re
5import regex
6
7from itertools import chain
8
9
class MosesPunctNormalizer:
    """
    This is a Python port of the Moses punctuation normalizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl

    Every rule table below is a list of ``(pattern, replacement)`` pairs.
    The pattern is a regular expression and the pairs are applied in order
    with ``re.sub``, so the position of a rule inside its table matters.
    """

    EXTRA_WHITESPACE = [  # lines 21 - 30
        (r"\r", r""),
        (r"\(", r" ("),
        (r"\)", r") "),
        (r" +", r" "),
        (r"\) ([.!:?;,])", r")\g<1>"),
        (r"\( ", r"("),
        (r" \)", r")"),
        (r"(\d) %", r"\g<1>%"),
        (r" :", r":"),
        (r" ;", r";"),
    ]

    # NOTE: despite the name, __init__ adds these rules when ``penn`` is True.
    NORMALIZE_UNICODE_IF_NOT_PENN = [(r"`", r"'"), (r"''", r' " ')]  # lines 33 - 34

    NORMALIZE_UNICODE = [  # lines 37 - 50
        ("„", r'"'),
        ("“", r'"'),
        ("”", r'"'),
        ("–", r"-"),
        ("—", r" - "),
        (r" +", r" "),
        ("´", r"'"),
        ("([a-zA-Z])‘([a-zA-Z])", r"\g<1>'\g<2>"),
        ("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"),
        ("‘", r"'"),
        ("‚", r"'"),
        ("’", r"'"),
        (r"''", r'"'),
        ("´´", r'"'),
        ("…", r"..."),
    ]

    FRENCH_QUOTES = [  # lines 52 - 57
        ("\u00A0«\u00A0", r'"'),
        ("«\u00A0", r'"'),
        ("«", r'"'),
        ("\u00A0»\u00A0", r'"'),
        ("\u00A0»", r'"'),
        ("»", r'"'),
    ]

    HANDLE_PSEUDO_SPACES = [  # lines 59 - 67
        ("\u00A0%", r"%"),
        ("nº\u00A0", "nº "),
        ("\u00A0:", r":"),
        ("\u00A0ºC", " ºC"),
        ("\u00A0cm", r" cm"),
        ("\u00A0\\?", "?"),
        ("\u00A0\\!", "!"),
        ("\u00A0;", r";"),
        (",\u00A0", r", "),
        (r" +", r" "),
    ]

    EN_QUOTATION_FOLLOWED_BY_COMMA = [(r'"([,.]+)', r'\g<1>"')]

    DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA = [
        (r',"', r'",'),
        (r'(\.+)"(\s*[^<])', r'"\g<1>\g<2>'),  # don't fix period at end of sentence
    ]

    DE_ES_CZ_CS_FR = [
        ("(\\d)\u00A0(\\d)", r"\g<1>,\g<2>"),
    ]

    OTHER = [
        ("(\\d)\u00A0(\\d)", r"\g<1>.\g<2>"),
    ]

    # Regex substitutions from replace-unicode-punctuation.perl
    # https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    #
    # The full-width forms are spelled as \uFFxx escapes on purpose: written
    # as their ASCII look-alikes they are either no-ops ("," -> ","), invalid
    # regular expressions ("?", "(", ")") or match-everything patterns (".").
    REPLACE_UNICODE_PUNCTUATION = [
        ("\uFF0C", ","),  # FULLWIDTH COMMA
        (r"。\s*", ". "),
        ("、", ","),
        ("”", '"'),
        ("“", '"'),
        ("∶", ":"),
        ("\uFF1A", ":"),  # FULLWIDTH COLON
        ("\uFF1F", "?"),  # FULLWIDTH QUESTION MARK
        ("《", '"'),
        ("》", '"'),
        ("\uFF09", ")"),  # FULLWIDTH RIGHT PARENTHESIS
        ("\uFF01", "!"),  # FULLWIDTH EXCLAMATION MARK
        ("\uFF08", "("),  # FULLWIDTH LEFT PARENTHESIS
        ("\uFF1B", ";"),  # FULLWIDTH SEMICOLON
        ("」", '"'),
        ("「", '"'),
        ("\uFF10", "0"),  # FULLWIDTH DIGIT ZERO
        ("\uFF11", "1"),  # FULLWIDTH DIGIT ONE
        ("\uFF12", "2"),  # FULLWIDTH DIGIT TWO
        ("\uFF13", "3"),  # FULLWIDTH DIGIT THREE
        ("\uFF14", "4"),  # FULLWIDTH DIGIT FOUR
        ("\uFF15", "5"),  # FULLWIDTH DIGIT FIVE
        ("\uFF16", "6"),  # FULLWIDTH DIGIT SIX
        ("\uFF17", "7"),  # FULLWIDTH DIGIT SEVEN
        ("\uFF18", "8"),  # FULLWIDTH DIGIT EIGHT
        ("\uFF19", "9"),  # FULLWIDTH DIGIT NINE
        ("\uFF0E\\s*", ". "),  # FULLWIDTH FULL STOP plus trailing whitespace
        ("\uFF5E", "~"),  # FULLWIDTH TILDE
        ("’", "'"),
        ("…", "..."),
        ("━", "-"),
        ("〈", "<"),
        ("〉", ">"),
        ("【", "["),
        ("】", "]"),
        ("\uFF05", "%"),  # FULLWIDTH PERCENT SIGN
    ]

    def __init__(
        self,
        lang="en",
        penn=True,
        norm_quote_commas=True,
        norm_numbers=True,
        pre_replace_unicode_punct=False,
        post_remove_control_chars=False,
    ):
        """
        Build the ordered list of substitutions used by :meth:`normalize`.

        :param lang: The two-letter language code.
        :type lang: str
        :param penn: Normalize Penn Treebank style quotations.
        :type penn: bool
        :param norm_quote_commas: Normalize quotations and commas
        :type norm_quote_commas: bool
        :param norm_numbers: Normalize numbers
        :type norm_numbers: bool
        :param pre_replace_unicode_punct: Run :meth:`replace_unicode_punct`
            on the text before the normalization rules.
        :type pre_replace_unicode_punct: bool
        :param post_remove_control_chars: Run :meth:`remove_control_chars`
            on the text after the normalization rules.
        :type post_remove_control_chars: bool
        """
        rule_groups = [
            self.EXTRA_WHITESPACE,
            self.NORMALIZE_UNICODE,
            self.FRENCH_QUOTES,
            self.HANDLE_PSEUDO_SPACES,
        ]

        if penn:
            # Adds the penn substitutions after extra_whitespace regexes.
            # NOTE(review): the table is named "..._IF_NOT_PENN" yet it is
            # enabled when penn is True; kept as-is because it is the
            # default behaviour -- confirm the intended polarity.
            rule_groups.insert(1, self.NORMALIZE_UNICODE_IF_NOT_PENN)

        if norm_quote_commas:
            if lang == "en":
                rule_groups.append(self.EN_QUOTATION_FOLLOWED_BY_COMMA)
            elif lang in ("de", "es", "fr"):
                rule_groups.append(self.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA)

        if norm_numbers:
            if lang in ("de", "es", "cz", "cs", "fr"):
                rule_groups.append(self.DE_ES_CZ_CS_FR)
            else:
                rule_groups.append(self.OTHER)

        # Flat list of (pattern, replacement) pairs, in application order.
        self.substitutions = list(chain.from_iterable(rule_groups))

        self.pre_replace_unicode_punct = pre_replace_unicode_punct
        self.post_remove_control_chars = post_remove_control_chars

    def normalize(self, text):
        """
        Returns a string with normalized punctuation.

        :param text: The text to normalize; it is converted with ``str()``.
        :return: The normalized text with leading and trailing whitespace
            stripped.
        :rtype: str
        """
        # Convert once up front instead of on every substitution.
        text = str(text)

        # Optionally, replace unicode puncts BEFORE normalization.
        if self.pre_replace_unicode_punct:
            text = self.replace_unicode_punct(text)

        # Actual normalization.
        for regexp, substitution in self.substitutions:
            text = re.sub(regexp, substitution, text)

        # Optionally, remove control characters AFTER normalization.
        if self.post_remove_control_chars:
            text = self.remove_control_chars(text)

        return text.strip()

    def replace_unicode_punct(self, text):
        """
        Apply every rule in ``REPLACE_UNICODE_PUNCTUATION`` to the text.

        :param text: The text to process; it is converted with ``str()``.
        :return: The text after all substitutions have been applied in order.
        :rtype: str
        """
        text = str(text)
        for regexp, substitution in self.REPLACE_UNICODE_PUNCTUATION:
            text = re.sub(regexp, substitution, text)
        return text

    def remove_control_chars(self, text):
        """
        Delete every character matched by ``\\p{C}`` from the text.

        Uses the ``regex`` module imported at the top of this file, because
        the standard ``re`` module has no ``\\p{...}`` property classes.

        :param text: The string to clean.
        :return: The text with all ``\\p{C}`` matches removed.
        """
        return regex.sub(r"\p{C}", "", text)