Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/sacremoses/normalize.py: 84%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

45 statements  

1#!/usr/bin/env python3 

2# -*- coding: utf-8 -*- 

3 

4import re 

5import regex 

6 

7from itertools import chain 

8 

9 

class MosesPunctNormalizer:
    """
    This is a Python port of the Moses punctuation normalizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl

    Every rule table below is a list of ``(regex_pattern, replacement)`` pairs
    that are applied in order with ``re.sub``.  The order is significant:
    later rules see the output of earlier ones.
    """

    EXTRA_WHITESPACE = [  # lines 21 - 30
        (r"\r", r""),
        (r"\(", r" ("),
        (r"\)", r") "),
        (r" +", r" "),
        (r"\) ([.!:?;,])", r")\g<1>"),
        (r"\( ", r"("),
        (r" \)", r")"),
        (r"(\d) %", r"\g<1>%"),
        (r" :", r":"),
        (r" ;", r";"),
    ]

    NORMALIZE_UNICODE_IF_NOT_PENN = [(r"`", r"'"), (r"''", r' " ')]  # lines 33 - 34

    NORMALIZE_UNICODE = [  # lines 37 - 50
        ("„", r'"'),
        ("“", r'"'),
        ("”", r'"'),
        ("–", r"-"),
        ("—", r" - "),
        (r" +", r" "),
        ("´", r"'"),
        ("([a-zA-Z])‘([a-zA-Z])", r"\g<1>'\g<2>"),
        ("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"),
        ("‘", r"'"),
        ("‚", r"'"),
        ("’", r"'"),
        (r"''", r'"'),
        ("´´", r'"'),
        ("…", r"..."),
    ]

    FRENCH_QUOTES = [  # lines 52 - 57
        ("\u00A0«\u00A0", r'"'),
        ("«\u00A0", r'"'),
        ("«", r'"'),
        ("\u00A0»\u00A0", r'"'),
        ("\u00A0»", r'"'),
        ("»", r'"'),
    ]

    HANDLE_PSEUDO_SPACES = [  # lines 59 - 67
        ("\u00A0%", r"%"),
        ("nº\u00A0", "nº "),
        ("\u00A0:", r":"),
        ("\u00A0ºC", " ºC"),
        ("\u00A0cm", r" cm"),
        ("\u00A0\\?", "?"),
        ("\u00A0\\!", "!"),
        ("\u00A0;", r";"),
        (",\u00A0", r", "),
        (r" +", r" "),
    ]

    EN_QUOTATION_FOLLOWED_BY_COMMA = [(r'"([,.]+)', r'\g<1>"')]

    DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA = [
        (r',"', r'",'),
        (r'(\.+)"(\s*[^<])', r'"\g<1>\g<2>'),  # don't fix period at end of sentence
    ]

    DE_ES_CZ_CS_FR = [
        ("(\\d)\u00A0(\\d)", r"\g<1>,\g<2>"),
    ]

    OTHER = [
        ("(\\d)\u00A0(\\d)", r"\g<1>.\g<2>"),
    ]

    # Regex substitutions from replace-unicode-punctuation.perl
    # https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    #
    # The fullwidth forms (U+FF01..U+FF5E) are written as escapes on purpose:
    # if they are ever folded to their ASCII look-alikes, "?", "(" and ")"
    # become invalid regular expressions and "." matches every character.
    REPLACE_UNICODE_PUNCTUATION = [
        ("\uFF0C", ","),  # FULLWIDTH COMMA
        (r"。\s*", ". "),
        ("、", ","),
        ("”", '"'),
        ("“", '"'),
        ("∶", ":"),
        ("\uFF1A", ":"),  # FULLWIDTH COLON
        ("\uFF1F", "?"),  # FULLWIDTH QUESTION MARK
        ("《", '"'),
        ("》", '"'),
        ("\uFF09", ")"),  # FULLWIDTH RIGHT PARENTHESIS
        ("\uFF01", "!"),  # FULLWIDTH EXCLAMATION MARK
        ("\uFF08", "("),  # FULLWIDTH LEFT PARENTHESIS
        ("\uFF1B", ";"),  # FULLWIDTH SEMICOLON
        ("」", '"'),
        ("「", '"'),
        ("\uFF10", "0"),  # FULLWIDTH DIGIT ZERO .. NINE
        ("\uFF11", "1"),
        ("\uFF12", "2"),
        ("\uFF13", "3"),
        ("\uFF14", "4"),
        ("\uFF15", "5"),
        ("\uFF16", "6"),
        ("\uFF17", "7"),
        ("\uFF18", "8"),
        ("\uFF19", "9"),
        ("\uFF0E\\s*", ". "),  # FULLWIDTH FULL STOP plus trailing whitespace
        ("\uFF5E", "~"),  # FULLWIDTH TILDE
        ("’", "'"),
        ("…", "..."),
        ("━", "-"),
        ("〈", "<"),
        ("〉", ">"),
        ("【", "["),
        ("】", "]"),
        ("\uFF05", "%"),  # FULLWIDTH PERCENT SIGN
    ]

    def __init__(
        self,
        lang="en",
        penn=True,
        norm_quote_commas=True,
        norm_numbers=True,
        pre_replace_unicode_punct=False,
        post_remove_control_chars=False,
    ):
        """
        :param lang: The two-letter language code.
        :type lang: str
        :param penn: Normalize Penn Treebank style quotations.
        :type penn: bool
        :param norm_quote_commas: Normalize quotations and commas
        :type norm_quote_commas: bool
        :param norm_numbers: Normalize numbers
        :type norm_numbers: bool
        :param pre_replace_unicode_punct: Run ``replace_unicode_punct`` on the
            input before the normalization rules.
        :type pre_replace_unicode_punct: bool
        :param post_remove_control_chars: Run ``remove_control_chars`` on the
            result after the normalization rules.
        :type post_remove_control_chars: bool
        """
        rule_groups = [self.EXTRA_WHITESPACE]

        # The penn substitutions go right after the extra_whitespace regexes.
        # NOTE(review): the table is named ..._IF_NOT_PENN yet it is enabled
        # when ``penn`` is truthy; behaviour is kept as-is here -- confirm
        # against the Perl script before changing it.
        if penn:
            rule_groups.append(self.NORMALIZE_UNICODE_IF_NOT_PENN)

        rule_groups.extend(
            [
                self.NORMALIZE_UNICODE,
                self.FRENCH_QUOTES,
                self.HANDLE_PSEUDO_SPACES,
            ]
        )

        if norm_quote_commas:
            if lang == "en":
                rule_groups.append(self.EN_QUOTATION_FOLLOWED_BY_COMMA)
            elif lang in ("de", "es", "fr"):
                rule_groups.append(self.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA)

        if norm_numbers:
            # A no-break space between digits becomes "," for these languages
            # and "." for every other language.
            if lang in ("de", "es", "cz", "cs", "fr"):
                rule_groups.append(self.DE_ES_CZ_CS_FR)
            else:
                rule_groups.append(self.OTHER)

        # Flat, ordered list of (pattern, replacement) pairs.
        self.substitutions = list(chain.from_iterable(rule_groups))

        self.pre_replace_unicode_punct = pre_replace_unicode_punct
        self.post_remove_control_chars = post_remove_control_chars

    def normalize(self, text):
        """
        Returns a string with normalized punctuation.

        :param text: The text to normalize; it is converted with ``str()``.
        :return: The normalized text with leading/trailing whitespace stripped.
        :rtype: str
        """
        # Optionally, replace unicode puncts BEFORE normalization.
        if self.pre_replace_unicode_punct:
            text = self.replace_unicode_punct(text)

        # Actual normalization; every rule sees the output of the previous one.
        text = str(text)
        for regexp, substitution in self.substitutions:
            text = re.sub(regexp, substitution, text)

        # Optionally, remove control characters AFTER normalization.
        if self.post_remove_control_chars:
            text = self.remove_control_chars(text)

        return text.strip()

    def replace_unicode_punct(self, text):
        """
        Applies the ``REPLACE_UNICODE_PUNCTUATION`` rules to ``text`` in order.

        :param text: The text to process; it is converted with ``str()``.
        :return: The text after all replacements.
        :rtype: str
        """
        text = str(text)
        for regexp, substitution in self.REPLACE_UNICODE_PUNCTUATION:
            text = re.sub(regexp, substitution, text)
        return text

    def remove_control_chars(self, text):
        """
        Removes every character matching ``\\p{C}`` (the Unicode "Other"
        general category) using the ``regex`` module.

        :param text: The text to clean.
        :type text: str
        :return: The text without the matched characters.
        :rtype: str
        """
        return regex.sub(r"\p{C}", "", text)