Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/sacremoses/normalize.py: 84%

1#!/usr/bin/env python3

2# -*- coding: utf-8 -*-

4import re

5import regex

7from itertools import chain

class MosesPunctNormalizer:
    """
    Python port of the Moses punctuation normalizer, see
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl

    Every class-level table below is a list of ``(pattern, replacement)``
    pairs. ``__init__`` picks the tables that apply to the requested
    options and flattens them into ``self.substitutions``; ``normalize``
    then feeds each pair to ``re.sub`` in list order, so the order of the
    entries matters.
    """

    # Whitespace clean-up around parentheses, percent signs, colons and
    # semicolons.
    EXTRA_WHITESPACE = [  # lines 21 - 30
        (r"\r", r""),
        (r"\(", r" ("),
        (r"\)", r") "),
        (r" +", r" "),
        (r"\) ([.!:?;,])", r")\g<1>"),
        (r"\( ", r"("),
        (r" \)", r")"),
        (r"(\d) %", r"\g<1>%"),
        (r" :", r":"),
        (r" ;", r";"),
    ]

    # Backtick / doubled-apostrophe rewrites; see the ``penn`` option of
    # ``__init__`` for when these are used.
    NORMALIZE_UNICODE_IF_NOT_PENN = [(r"`", r"'"), (r"''", r' " ')]  # lines 33 - 34

    # Typographic quotes, dashes and ellipsis mapped to ASCII.
    NORMALIZE_UNICODE = [  # lines 37 - 50
        ("„", r'"'),
        ("“", r'"'),
        ("”", r'"'),
        ("–", r"-"),
        ("—", r" - "),
        (r" +", r" "),
        ("´", r"'"),
        ("([a-zA-Z])‘([a-zA-Z])", r"\g<1>'\g<2>"),
        ("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"),
        ("‘", r"'"),
        ("‚", r"'"),
        ("’", r"'"),
        (r"''", r'"'),
        ("´´", r'"'),
        ("…", r"..."),
    ]

    # Guillemets, with or without adjacent no-break spaces (U+00A0).
    FRENCH_QUOTES = [  # lines 52 - 57
        ("\u00A0«\u00A0", r'"'),
        ("«\u00A0", r'"'),
        ("«", r'"'),
        ("\u00A0»\u00A0", r'"'),
        ("\u00A0»", r'"'),
        ("»", r'"'),
    ]

    # No-break spaces (U+00A0) next to punctuation, units and "nº".
    HANDLE_PSEUDO_SPACES = [  # lines 59 - 67
        ("\u00A0%", r"%"),
        ("nº\u00A0", "nº "),
        ("\u00A0:", r":"),
        ("\u00A0ºC", " ºC"),
        ("\u00A0cm", r" cm"),
        ("\u00A0\\?", "?"),
        ("\u00A0\\!", "!"),
        ("\u00A0;", r";"),
        (",\u00A0", r", "),
        (r" +", r" "),
    ]

    # English: move commas/periods that follow a double quote in front of it.
    EN_QUOTATION_FOLLOWED_BY_COMMA = [(r'"([,.]+)', r'\g<1>"')]

    # German/Spanish/French: move the double quote in front of the
    # comma/period instead.
    DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA = [
        (r',"', r'",'),
        (r'(\.+)"(\s*[^<])', r'"\g<1>\g<2>'),  # don't fix period at end of sentence
    ]

    # A no-break space between two digits becomes a comma ...
    DE_ES_CZ_CS_FR = [
        ("(\\d)\u00A0(\\d)", r"\g<1>,\g<2>"),
    ]

    # ... or a period for every other language code.
    OTHER = [
        ("(\\d)\u00A0(\\d)", r"\g<1>.\g<2>"),
    ]

    # Regex substitutions from replace-unicode-punctuation.perl
    # https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    REPLACE_UNICODE_PUNCTUATION = [
        ("，", ","),
        (r"。\s*", ". "),
        ("、", ","),
        ("”", '"'),
        ("“", '"'),
        ("∶", ":"),
        ("：", ":"),
        ("？", "?"),
        ("《", '"'),
        ("》", '"'),
        ("）", ")"),
        ("！", "!"),
        ("（", "("),
        ("；", ";"),
        ("」", '"'),
        ("「", '"'),
        ("０", "0"),
        ("１", "1"),
        ("２", "2"),
        ("３", "3"),
        ("４", "4"),
        ("５", "5"),
        ("６", "6"),
        ("７", "7"),
        ("８", "8"),
        ("９", "9"),
        (r"．\s*", ". "),
        ("～", "~"),
        ("’", "'"),
        ("…", "..."),
        ("━", "-"),
        ("〈", "<"),
        ("〉", ">"),
        ("【", "["),
        ("】", "]"),
        ("％", "%"),
    ]

    def __init__(
        self,
        lang="en",
        penn=True,
        norm_quote_commas=True,
        norm_numbers=True,
        pre_replace_unicode_punct=False,
        post_remove_control_chars=False,
    ):
        """
        Select the rule tables for the given options and flatten them into
        ``self.substitutions``.

        :param lang: The two-letter language code.
        :type lang: str
        :param penn: When truthy, the ``NORMALIZE_UNICODE_IF_NOT_PENN`` rules
            are applied right after the ``EXTRA_WHITESPACE`` rules.
        :type penn: bool
        :param norm_quote_commas: Normalize quotations and commas
            (rules exist for "en" and for "de"/"es"/"fr" only).
        :type norm_quote_commas: bool
        :param norm_numbers: Normalize numbers (no-break space between digits).
        :type norm_numbers: bool
        :param pre_replace_unicode_punct: When truthy, ``normalize`` runs
            ``replace_unicode_punct`` on the text before anything else.
        :type pre_replace_unicode_punct: bool
        :param post_remove_control_chars: When truthy, ``normalize`` runs
            ``remove_control_chars`` on the text after the substitutions.
        :type post_remove_control_chars: bool
        """
        # Rule groups in application order; flattened at the end.
        rule_groups = [self.EXTRA_WHITESPACE]

        # NOTE(review): the table name says "IF_NOT_PENN" yet it is enabled
        # when ``penn`` is truthy -- confirm against the Perl script before
        # touching this; changing it alters the default output.
        if penn:
            rule_groups.append(self.NORMALIZE_UNICODE_IF_NOT_PENN)

        rule_groups.extend(
            [
                self.NORMALIZE_UNICODE,
                self.FRENCH_QUOTES,
                self.HANDLE_PSEUDO_SPACES,
            ]
        )

        if norm_quote_commas:
            if lang == "en":
                rule_groups.append(self.EN_QUOTATION_FOLLOWED_BY_COMMA)
            elif lang in ("de", "es", "fr"):
                rule_groups.append(self.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA)

        if norm_numbers:
            uses_comma = lang in ("de", "es", "cz", "cs", "fr")
            rule_groups.append(self.DE_ES_CZ_CS_FR if uses_comma else self.OTHER)

        # One flat list of (pattern, replacement) pairs.
        self.substitutions = list(chain.from_iterable(rule_groups))

        self.pre_replace_unicode_punct = pre_replace_unicode_punct
        self.post_remove_control_chars = post_remove_control_chars

    def normalize(self, text):
        """
        Returns a string with normalized punctuation.

        The configured substitutions are applied in order and the result is
        stripped of leading/trailing whitespace.
        """
        # Optional pre-step: replace unicode puncts BEFORE normalization.
        if self.pre_replace_unicode_punct:
            text = self.replace_unicode_punct(text)

        # Actual normalization; the input is coerced with str() on each pass.
        result = text
        for pattern, replacement in self.substitutions:
            result = re.sub(pattern, replacement, str(result))

        # Optional post-step: remove control characters AFTER normalization.
        if self.post_remove_control_chars:
            result = self.remove_control_chars(result)

        return result.strip()

    def replace_unicode_punct(self, text):
        """
        Apply every ``REPLACE_UNICODE_PUNCTUATION`` rule, in order, to
        ``text`` and return the result.
        """
        replaced = text
        for pattern, replacement in self.REPLACE_UNICODE_PUNCTUATION:
            replaced = re.sub(pattern, replacement, str(replaced))
        return replaced

    def remove_control_chars(self, text):
        """
        Return ``text`` with every character matching ``\\p{C}`` removed
        (uses the third-party ``regex`` module for the property class).
        """
        control_char_pattern = r"\p{C}"
        return regex.sub(control_char_pattern, "", text)