Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/sacremoses/normalize.py: 84%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
4import re
5import regex
7from itertools import chain
class MosesPunctNormalizer:
    """
    Punctuation normalizer driven by ordered tables of regex substitutions.

    This is a Python port of the Moses punctuation normalizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl

    Every class-level table is a list of ``(pattern, replacement)`` pairs that
    are fed to ``re.sub`` one after another.  The order of the pairs matters:
    later rules operate on the output of earlier ones.
    """

    EXTRA_WHITESPACE = [  # lines 21 - 30
        (r"\r", r""),
        (r"\(", r" ("),
        (r"\)", r") "),
        (r" +", r" "),
        (r"\) ([.!:?;,])", r")\g<1>"),
        (r"\( ", r"("),
        (r" \)", r")"),
        (r"(\d) %", r"\g<1>%"),
        (r" :", r":"),
        (r" ;", r";"),
    ]

    NORMALIZE_UNICODE_IF_NOT_PENN = [(r"`", r"'"), (r"''", r' " ')]  # lines 33 - 34

    NORMALIZE_UNICODE = [  # lines 37 - 50
        ("„", r'"'),
        ("“", r'"'),
        ("”", r'"'),
        ("–", r"-"),
        ("—", r" - "),
        (r" +", r" "),
        ("´", r"'"),
        ("([a-zA-Z])‘([a-zA-Z])", r"\g<1>'\g<2>"),
        ("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"),
        ("‘", r"'"),
        ("‚", r"'"),
        ("’", r"'"),
        (r"''", r'"'),
        ("´´", r'"'),
        ("…", r"..."),
    ]

    FRENCH_QUOTES = [  # lines 52 - 57
        ("\u00A0«\u00A0", r'"'),
        ("«\u00A0", r'"'),
        ("«", r'"'),
        ("\u00A0»\u00A0", r'"'),
        ("\u00A0»", r'"'),
        ("»", r'"'),
    ]

    HANDLE_PSEUDO_SPACES = [  # lines 59 - 67
        ("\u00A0%", r"%"),
        ("nº\u00A0", "nº "),
        ("\u00A0:", r":"),
        ("\u00A0ºC", " ºC"),
        ("\u00A0cm", r" cm"),
        ("\u00A0\\?", "?"),
        ("\u00A0\\!", "!"),
        ("\u00A0;", r";"),
        (",\u00A0", r", "),
        (r" +", r" "),
    ]

    EN_QUOTATION_FOLLOWED_BY_COMMA = [(r'"([,.]+)', r'\g<1>"')]

    DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA = [
        (r',"', r'",'),
        (r'(\.+)"(\s*[^<])', r'"\g<1>\g<2>'),  # don't fix period at end of sentence
    ]

    DE_ES_CZ_CS_FR = [
        ("(\\d)\u00A0(\\d)", r"\g<1>,\g<2>"),
    ]

    OTHER = [
        ("(\\d)\u00A0(\\d)", r"\g<1>.\g<2>"),
    ]

    # Regex substitutions from replace-unicode-punctuation.perl
    # https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    #
    # The full-width source characters are spelled as \uXXXX escapes on
    # purpose: if they get folded to their ASCII look-alikes, the entries
    # degrade into no-ops ("," -> ",") or into invalid / over-matching regexes
    # ("?", "(", ")", r".\s*").
    REPLACE_UNICODE_PUNCTUATION = [
        ("\uff0c", ","),  # FULLWIDTH COMMA
        (r"。\s*", ". "),
        ("、", ","),
        ("”", '"'),
        ("“", '"'),
        ("∶", ":"),
        ("\uff1a", ":"),  # FULLWIDTH COLON
        ("\uff1f", "?"),  # FULLWIDTH QUESTION MARK
        ("《", '"'),
        ("》", '"'),
        ("\uff09", ")"),  # FULLWIDTH RIGHT PARENTHESIS
        ("\uff01", "!"),  # FULLWIDTH EXCLAMATION MARK
        ("\uff08", "("),  # FULLWIDTH LEFT PARENTHESIS
        ("\uff1b", ";"),  # FULLWIDTH SEMICOLON
        ("」", '"'),
        ("「", '"'),
        ("\uff10", "0"),  # FULLWIDTH DIGIT ZERO .. NINE
        ("\uff11", "1"),
        ("\uff12", "2"),
        ("\uff13", "3"),
        ("\uff14", "4"),
        ("\uff15", "5"),
        ("\uff16", "6"),
        ("\uff17", "7"),
        ("\uff18", "8"),
        ("\uff19", "9"),
        ("\uff0e\\s*", ". "),  # FULLWIDTH FULL STOP plus trailing whitespace
        ("\uff5e", "~"),  # FULLWIDTH TILDE
        ("’", "'"),
        ("…", "..."),
        ("━", "-"),
        ("〈", "<"),
        ("〉", ">"),
        ("【", "["),
        ("】", "]"),
        ("\uff05", "%"),  # FULLWIDTH PERCENT SIGN
    ]

    def __init__(
        self,
        lang="en",
        penn=True,
        norm_quote_commas=True,
        norm_numbers=True,
        pre_replace_unicode_punct=False,
        post_remove_control_chars=False,
    ):
        """
        Assemble the ordered substitution list used by :meth:`normalize`.

        :param lang: The two-letter language code.
        :type lang: str
        :param penn: When true, the backtick / doubled-apostrophe rules in
            ``NORMALIZE_UNICODE_IF_NOT_PENN`` are applied right after the
            whitespace rules.
        :type penn: bool
        :param norm_quote_commas: Normalize quotations and commas
            (rules exist for "en" and for "de", "es", "fr").
        :type norm_quote_commas: bool
        :param norm_numbers: Normalize numbers, i.e. replace a no-break space
            between two digits with "," for "de", "es", "cz", "cs", "fr" and
            with "." for every other language.
        :type norm_numbers: bool
        :param pre_replace_unicode_punct: Run :meth:`replace_unicode_punct`
            on the text before the normalization rules.
        :type pre_replace_unicode_punct: bool
        :param post_remove_control_chars: Run :meth:`remove_control_chars`
            on the text after the normalization rules.
        :type post_remove_control_chars: bool
        """
        rule_groups = [
            self.EXTRA_WHITESPACE,
            self.NORMALIZE_UNICODE,
            self.FRENCH_QUOTES,
            self.HANDLE_PSEUDO_SPACES,
        ]

        # NOTE(review): the group is named "..._IF_NOT_PENN" yet it is enabled
        # when ``penn`` is true.  Left untouched because flipping it would
        # change the default output; confirm the intended flag semantics
        # against the Perl script before altering.
        if penn:  # Adds the penn substitutions after extra_whitespace regexes.
            rule_groups.insert(1, self.NORMALIZE_UNICODE_IF_NOT_PENN)

        if norm_quote_commas:
            if lang == "en":
                rule_groups.append(self.EN_QUOTATION_FOLLOWED_BY_COMMA)
            elif lang in ("de", "es", "fr"):
                rule_groups.append(self.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA)

        if norm_numbers:
            if lang in ("de", "es", "cz", "cs", "fr"):
                rule_groups.append(self.DE_ES_CZ_CS_FR)
            else:
                rule_groups.append(self.OTHER)

        # Flatten the groups into one list of (pattern, replacement) pairs.
        self.substitutions = list(chain.from_iterable(rule_groups))

        self.pre_replace_unicode_punct = pre_replace_unicode_punct
        self.post_remove_control_chars = post_remove_control_chars

    def normalize(self, text):
        """
        Returns a string with normalized punctuation.

        :param text: The input; it is converted with ``str()`` first.
        :return: The text after all configured substitutions, with leading
            and trailing whitespace stripped.
        :rtype: str
        """
        # Convert once up front instead of on every substitution.
        text = str(text)

        # Optionally, replace unicode puncts BEFORE normalization.
        if self.pre_replace_unicode_punct:
            text = self.replace_unicode_punct(text)

        # Actual normalization.
        for regexp, substitution in self.substitutions:
            text = re.sub(regexp, substitution, text)

        # Optionally, remove control characters AFTER normalization.
        if self.post_remove_control_chars:
            text = self.remove_control_chars(text)

        return text.strip()

    def replace_unicode_punct(self, text):
        """
        Apply every rule in ``REPLACE_UNICODE_PUNCTUATION`` to ``text``.

        :param text: The input; it is converted with ``str()`` first.
        :return: The text with the listed punctuation and full-width digits
            replaced by their ASCII counterparts.
        :rtype: str
        """
        text = str(text)
        for regexp, substitution in self.REPLACE_UNICODE_PUNCTUATION:
            text = re.sub(regexp, substitution, text)
        return text

    def remove_control_chars(self, text):
        """
        Delete every character matched by the ``\\p{C}`` property class.

        Uses the third-party ``regex`` module because the standard ``re``
        module does not support ``\\p{...}`` property classes.
        """
        return regex.sub(r"\p{C}", "", text)