#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re

from sacremoses.corpus import Perluniprops
from sacremoses.corpus import NonbreakingPrefixes
from sacremoses.util import is_cjk
from sacremoses.indic import VIRAMAS, NUKTAS

perluniprops = Perluniprops()
nonbreaking_prefixes = NonbreakingPrefixes()


class MosesTokenizer(object):
    """
    This is a Python port of the Moses Tokenizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
    """

    # Perl Unicode Properties character sets.
    IsN = str("".join(perluniprops.chars("IsN")))
    IsAlnum = str(
        "".join(perluniprops.chars("IsAlnum")) + "".join(VIRAMAS) + "".join(NUKTAS)
    )
    IsSc = str("".join(perluniprops.chars("IsSc")))
    IsSo = str("".join(perluniprops.chars("IsSo")))
    IsAlpha = str(
        "".join(perluniprops.chars("IsAlpha")) + "".join(VIRAMAS) + "".join(NUKTAS)
    )
    IsLower = str("".join(perluniprops.chars("IsLower")))

    # Remove ASCII junk.
    DEDUPLICATE_SPACE = r"\s+", r" "
    ASCII_JUNK = r"[\000-\037]", r""

    # Neurotic Perl leading space, multi-space and trailing space chomp.
    # These regexes are kept for reference purposes and shouldn't be used!!
    MID_STRIP = r" +", r" "  # Use DEDUPLICATE_SPACE instead.
    LEFT_STRIP = r"^ ", r""  # Use text.lstrip() instead.
    RIGHT_STRIP = r" $", r""  # Use text.rstrip() instead.

    # Pad all "other" special characters not in IsAlnum.
    PAD_NOT_ISALNUM = r"([^{}\s\.'\`\,\-])".format(IsAlnum), r" \1 "

    # Splits all hyphens (regardless of circumstances), e.g.
    # 'foo-bar' -> 'foo @-@ bar'
    AGGRESSIVE_HYPHEN_SPLIT = (
        r"([{alphanum}])\-(?=[{alphanum}])".format(alphanum=IsAlnum),
        r"\1 @-@ ",
    )

    # Make multi-dots stay together.
    REPLACE_DOT_WITH_LITERALSTRING_1 = r"\.([\.]+)", r" DOTMULTI\1"
    REPLACE_DOT_WITH_LITERALSTRING_2 = r"DOTMULTI\.([^\.])", r"DOTDOTMULTI \1"
    REPLACE_DOT_WITH_LITERALSTRING_3 = r"DOTMULTI\.", "DOTDOTMULTI"

    # Separate out "," except if within numbers (5,300)
    # e.g. A,B,C,D,E > A , B,C , D,E
    # First application uses up B so rule can't see B,C
    # two-step version here may create extra spaces but these are removed later
    # will also space digit,letter or letter,digit forms (redundant with next section)
    COMMA_SEPARATE_1 = r"([^{}])[,]".format(IsN), r"\1 , "
    COMMA_SEPARATE_2 = r"[,]([^{}])".format(IsN), r" , \1"
    COMMA_SEPARATE_3 = r"([{}])[,]$".format(IsN), r"\1 , "

    # Attempt to get correct directional quotes.
    DIRECTIONAL_QUOTE_1 = r"^``", r"`` "
    DIRECTIONAL_QUOTE_2 = r'^"', r"`` "
    DIRECTIONAL_QUOTE_3 = r"^`([^`])", r"` \1"
    DIRECTIONAL_QUOTE_4 = r"^'", r"` "
    DIRECTIONAL_QUOTE_5 = r'([ ([{<])"', r"\1 `` "
    DIRECTIONAL_QUOTE_6 = r"([ ([{<])``", r"\1 `` "
    DIRECTIONAL_QUOTE_7 = r"([ ([{<])`([^`])", r"\1 ` \2"
    DIRECTIONAL_QUOTE_8 = r"([ ([{<])'", r"\1 ` "

    # Replace ... with _ELLIPSIS_
    REPLACE_ELLIPSIS = r"\.\.\.", r" _ELLIPSIS_ "
    # Restore _ELLIPSIS_ with ...
    RESTORE_ELLIPSIS = r"_ELLIPSIS_", r"..."

    # Pad , with trailing space except if within numbers, e.g. 5,300
    COMMA_1 = r"([^{numbers}])[,]([^{numbers}])".format(numbers=IsN), r"\1 , \2"
    COMMA_2 = r"([{numbers}])[,]([^{numbers}])".format(numbers=IsN), r"\1 , \2"
    COMMA_3 = r"([^{numbers}])[,]([{numbers}])".format(numbers=IsN), r"\1 , \2"

    # Pad unicode symbols with spaces.
    SYMBOLS = r"([;:@#\$%&{}{}])".format(IsSc, IsSo), r" \1 "

    # Separate out intra-token slashes. PTB tokenization doesn't do this, so
    # the tokens should be merged prior to parsing with a PTB-trained parser.
    # e.g. "and/or" -> "and @/@ or"
    INTRATOKEN_SLASHES = (
        r"([{alphanum}])\/([{alphanum}])".format(alphanum=IsAlnum),
        r"\1 @/@ \2",
    )

    # Splits final period at end of string.
    FINAL_PERIOD = r"""([^.])([.])([\]\)}>"']*) ?$""", r"\1 \2\3"
    # Pad all question marks and exclamation marks with spaces.
    PAD_QUESTION_EXCLAMATION_MARK = r"([?!])", r" \1 "

    # Handles parentheses, brackets and converts them to PTB symbols.
    PAD_PARENTHESIS = r"([\]\[\(\){}<>])", r" \1 "
    CONVERT_PARENTHESIS_1 = r"\(", "-LRB-"
    CONVERT_PARENTHESIS_2 = r"\)", "-RRB-"
    CONVERT_PARENTHESIS_3 = r"\[", "-LSB-"
    CONVERT_PARENTHESIS_4 = r"\]", "-RSB-"
    CONVERT_PARENTHESIS_5 = r"\{", "-LCB-"
    CONVERT_PARENTHESIS_6 = r"\}", "-RCB-"

    # Pads double dashes with spaces.
    PAD_DOUBLE_DASHES = r"--", " -- "

    # Adds spaces to start and end of string to simplify further regexps.
    PAD_START_OF_STR = r"^", " "
    PAD_END_OF_STR = r"$", " "

    # Converts double quotes to two single quotes and pad with spaces.
    CONVERT_DOUBLE_TO_SINGLE_QUOTES = r'"', " '' "
    # Handles single quote in possessives or close-single-quote.
    HANDLES_SINGLE_QUOTES = r"([^'])' ", r"\1 ' "

    # Pad apostrophe in possessive or close-single-quote.
    APOSTROPHE = r"([^'])'", r"\1 ' "

    # Prepend space on contraction apostrophe.
    CONTRACTION_1 = r"'([sSmMdD]) ", r" '\1 "
    CONTRACTION_2 = r"'ll ", r" 'll "
    CONTRACTION_3 = r"'re ", r" 're "
    CONTRACTION_4 = r"'ve ", r" 've "
    CONTRACTION_5 = r"n't ", r" n't "
    CONTRACTION_6 = r"'LL ", r" 'LL "
    CONTRACTION_7 = r"'RE ", r" 'RE "
    CONTRACTION_8 = r"'VE ", r" 'VE "
    CONTRACTION_9 = r"N'T ", r" N'T "

    # Informal Contractions.
    CONTRACTION_10 = r" ([Cc])annot ", r" \1an not "
    CONTRACTION_11 = r" ([Dd])'ye ", r" \1' ye "
    CONTRACTION_12 = r" ([Gg])imme ", r" \1im me "
    CONTRACTION_13 = r" ([Gg])onna ", r" \1on na "
    CONTRACTION_14 = r" ([Gg])otta ", r" \1ot ta "
    CONTRACTION_15 = r" ([Ll])emme ", r" \1em me "
    CONTRACTION_16 = r" ([Mm])ore'n ", r" \1ore 'n "
    CONTRACTION_17 = r" '([Tt])is ", r" '\1 is "
    CONTRACTION_18 = r" '([Tt])was ", r" '\1 was "
    CONTRACTION_19 = r" ([Ww])anna ", r" \1an na "

    # Clean out extra spaces.
    CLEAN_EXTRA_SPACE_1 = r"  *", r" "
    CLEAN_EXTRA_SPACE_2 = r"^ *", r""
    CLEAN_EXTRA_SPACE_3 = r" *$", r""

    # Neurotic Perl regexes to escape special characters.
    ESCAPE_AMPERSAND = r"&", r"&amp;"
    ESCAPE_PIPE = r"\|", r"&#124;"
    ESCAPE_LEFT_ANGLE_BRACKET = r"<", r"&lt;"
    ESCAPE_RIGHT_ANGLE_BRACKET = r">", r"&gt;"
    ESCAPE_SINGLE_QUOTE = r"\'", r"&apos;"
    ESCAPE_DOUBLE_QUOTE = r"\"", r"&quot;"
    ESCAPE_LEFT_SQUARE_BRACKET = r"\[", r"&#91;"
    ESCAPE_RIGHT_SQUARE_BRACKET = r"]", r"&#93;"

    EN_SPECIFIC_1 = r"([^{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
    EN_SPECIFIC_2 = (
        r"([^{alpha}{isn}])[']([{alpha}])".format(alpha=IsAlpha, isn=IsN),
        r"\1 ' \2",
    )
    EN_SPECIFIC_3 = r"([{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
    EN_SPECIFIC_4 = r"([{alpha}])[']([{alpha}])".format(alpha=IsAlpha), r"\1 '\2"
    EN_SPECIFIC_5 = r"([{isn}])[']([s])".format(isn=IsN), r"\1 '\2"

    ENGLISH_SPECIFIC_APOSTROPHE = [
        EN_SPECIFIC_1,
        EN_SPECIFIC_2,
        EN_SPECIFIC_3,
        EN_SPECIFIC_4,
        EN_SPECIFIC_5,
    ]
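
    # Illustrative effect of the English apostrophe rules above (a sketch, not a
    # recorded doctest): EN_SPECIFIC_4 keeps contractions attached to the
    # apostrophe, e.g. "don't" -> "don 't" and "the plant's" -> "the plant 's"
    # (before the optional XML-escaping step applied at the end of tokenize()).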

    FR_IT_SPECIFIC_1 = r"([^{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
    FR_IT_SPECIFIC_2 = r"([^{alpha}])[']([{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
    FR_IT_SPECIFIC_3 = r"([{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
    FR_IT_SPECIFIC_4 = r"([{alpha}])[']([{alpha}])".format(alpha=IsAlpha), r"\1' \2"

    FR_IT_SPECIFIC_APOSTROPHE = [
        FR_IT_SPECIFIC_1,
        FR_IT_SPECIFIC_2,
        FR_IT_SPECIFIC_3,
        FR_IT_SPECIFIC_4,
    ]

    NON_SPECIFIC_APOSTROPHE = r"\'", " ' "

    TRAILING_DOT_APOSTROPHE = r"\.' ?$", " . ' "

    BASIC_PROTECTED_PATTERN_1 = r"<\/?\S+\/?>"
    BASIC_PROTECTED_PATTERN_2 = r'<\S+( [a-zA-Z0-9]+\="?[^"]")+ ?\/?>'
    BASIC_PROTECTED_PATTERN_3 = r"<\S+( [a-zA-Z0-9]+\='?[^']')+ ?\/?>"
    BASIC_PROTECTED_PATTERN_4 = r"[\w\-\_\.]+\@([\w\-\_]+\.)+[a-zA-Z]{2,}"
    BASIC_PROTECTED_PATTERN_5 = r"(http[s]?|ftp):\/\/[^:\/\s]+(\/\w+)*\/[\w\-\.]+"

    MOSES_PENN_REGEXES_1 = [
        DEDUPLICATE_SPACE,
        ASCII_JUNK,
        DIRECTIONAL_QUOTE_1,
        DIRECTIONAL_QUOTE_2,
        DIRECTIONAL_QUOTE_3,
        DIRECTIONAL_QUOTE_4,
        DIRECTIONAL_QUOTE_5,
        DIRECTIONAL_QUOTE_6,
        DIRECTIONAL_QUOTE_7,
        DIRECTIONAL_QUOTE_8,
        REPLACE_ELLIPSIS,
        COMMA_1,
        COMMA_2,
        COMMA_3,
        SYMBOLS,
        INTRATOKEN_SLASHES,
        FINAL_PERIOD,
        PAD_QUESTION_EXCLAMATION_MARK,
        PAD_PARENTHESIS,
        CONVERT_PARENTHESIS_1,
        CONVERT_PARENTHESIS_2,
        CONVERT_PARENTHESIS_3,
        CONVERT_PARENTHESIS_4,
        CONVERT_PARENTHESIS_5,
        CONVERT_PARENTHESIS_6,
        PAD_DOUBLE_DASHES,
        PAD_START_OF_STR,
        PAD_END_OF_STR,
        CONVERT_DOUBLE_TO_SINGLE_QUOTES,
        HANDLES_SINGLE_QUOTES,
        APOSTROPHE,
        CONTRACTION_1,
        CONTRACTION_2,
        CONTRACTION_3,
        CONTRACTION_4,
        CONTRACTION_5,
        CONTRACTION_6,
        CONTRACTION_7,
        CONTRACTION_8,
        CONTRACTION_9,
        CONTRACTION_10,
        CONTRACTION_11,
        CONTRACTION_12,
        CONTRACTION_13,
        CONTRACTION_14,
        CONTRACTION_15,
        CONTRACTION_16,
        CONTRACTION_17,
        CONTRACTION_18,
        CONTRACTION_19,
    ]

    MOSES_PENN_REGEXES_2 = [
        RESTORE_ELLIPSIS,
        CLEAN_EXTRA_SPACE_1,
        CLEAN_EXTRA_SPACE_2,
        CLEAN_EXTRA_SPACE_3,
        ESCAPE_AMPERSAND,
        ESCAPE_PIPE,
        ESCAPE_LEFT_ANGLE_BRACKET,
        ESCAPE_RIGHT_ANGLE_BRACKET,
        ESCAPE_SINGLE_QUOTE,
        ESCAPE_DOUBLE_QUOTE,
    ]

    MOSES_ESCAPE_XML_REGEXES = [
        ESCAPE_AMPERSAND,
        ESCAPE_PIPE,
        ESCAPE_LEFT_ANGLE_BRACKET,
        ESCAPE_RIGHT_ANGLE_BRACKET,
        ESCAPE_SINGLE_QUOTE,
        ESCAPE_DOUBLE_QUOTE,
        ESCAPE_LEFT_SQUARE_BRACKET,
        ESCAPE_RIGHT_SQUARE_BRACKET,
    ]

    BASIC_PROTECTED_PATTERNS = [
        BASIC_PROTECTED_PATTERN_1,
        BASIC_PROTECTED_PATTERN_2,
        BASIC_PROTECTED_PATTERN_3,
        BASIC_PROTECTED_PATTERN_4,
        BASIC_PROTECTED_PATTERN_5,
    ]
    WEB_PROTECTED_PATTERNS = [
        r"((https?|ftp|rsync)://|www\.)[^ ]*",  # URLs
        r"[\w\-\_\.]+\@([\w\-\_]+\.)+[a-zA-Z]{2,}",  # Emails user@host.domain
        r"@[a-zA-Z0-9_]+",  # @handles such as Twitter/GitHub IDs
        r"#[a-zA-Z0-9_]+",  # #hashtags
        # TODO: emojis especially the multi codepoints
    ]
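
    # Illustrative use of the protected patterns (a sketch, not a recorded doctest):
    # passing this list as `protected_patterns` to `tokenize()` keeps URLs, e-mails,
    # @handles and #hashtags as single tokens, e.g.
    #
    #     >>> mt = MosesTokenizer(lang="en")
    #     >>> mt.tokenize("visit https://example.com now",
    #     ...             protected_patterns=MosesTokenizer.WEB_PROTECTED_PATTERNS)
    #     ['visit', 'https://example.com', 'now']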

    def __init__(self, lang="en", custom_nonbreaking_prefixes_file=None):
        # Initialize the object.
        super(MosesTokenizer, self).__init__()
        self.lang = lang

        # Initialize the language specific nonbreaking prefixes.
        self.NONBREAKING_PREFIXES = [
            _nbp.strip() for _nbp in nonbreaking_prefixes.words(lang)
        ]

        # Load custom nonbreaking prefixes file.
        if custom_nonbreaking_prefixes_file:
            self.NONBREAKING_PREFIXES = []
            with open(custom_nonbreaking_prefixes_file, "r") as fin:
                for line in fin:
                    line = line.strip()
                    if line and not line.startswith("#"):
                        if line not in self.NONBREAKING_PREFIXES:
                            self.NONBREAKING_PREFIXES.append(line)

        self.NUMERIC_ONLY_PREFIXES = [
            w.rpartition(" ")[0]
            for w in self.NONBREAKING_PREFIXES
            if self.has_numeric_only(w)
        ]
        # Add CJK characters to alpha and alnum.
        if self.lang in ["zh", "ja", "ko", "cjk"]:
            cjk_chars = ""
            if self.lang in ["ko", "cjk"]:
                cjk_chars += str("".join(perluniprops.chars("Hangul")))
            if self.lang in ["zh", "cjk"]:
                cjk_chars += str("".join(perluniprops.chars("Han")))
            if self.lang in ["ja", "cjk"]:
                cjk_chars += str("".join(perluniprops.chars("Hiragana")))
                cjk_chars += str("".join(perluniprops.chars("Katakana")))
                cjk_chars += str("".join(perluniprops.chars("Han")))
            self.IsAlpha += cjk_chars
            self.IsAlnum += cjk_chars
            # Overwrite the alnum regexes.
            self.PAD_NOT_ISALNUM = r"([^{}\s\.'\`\,\-])".format(self.IsAlnum), r" \1 "
            self.AGGRESSIVE_HYPHEN_SPLIT = (
                r"([{alphanum}])\-(?=[{alphanum}])".format(alphanum=self.IsAlnum),
                r"\1 @-@ ",
            )
            self.INTRATOKEN_SLASHES = (
                r"([{alphanum}])\/([{alphanum}])".format(alphanum=self.IsAlnum),
                r"\1 @/@ \2",
            )

    def replace_multidots(self, text):
        text = re.sub(r"\.([\.]+)", r" DOTMULTI\1", text)
        while re.search(r"DOTMULTI\.", text):
            text = re.sub(r"DOTMULTI\.([^\.])", r"DOTDOTMULTI \1", text)
            text = re.sub(r"DOTMULTI\.", "DOTDOTMULTI", text)
        return text

    def restore_multidots(self, text):
        while re.search(r"DOTDOTMULTI", text):
            text = re.sub(r"DOTDOTMULTI", r"DOTMULTI.", text)
        return re.sub(r"DOTMULTI", r".", text)
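
    # Illustrative round trip (a sketch, not a recorded doctest): the two helpers
    # above protect multi-dot runs from the other rules and later restore them,
    # e.g. replace_multidots("Wait...") is expected to yield "Wait DOTDOTDOTMULTI",
    # and restore_multidots() on that string brings back "Wait ...".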

    def islower(self, text):
        return not set(text).difference(set(self.IsLower))

    def isanyalpha(self, text):
        return any(set(text).intersection(set(self.IsAlpha)))

    def has_numeric_only(self, text):
        return bool(re.search(r"[\s]+(\#NUMERIC_ONLY\#)", text))

    def handles_nonbreaking_prefixes(self, text):
        # Splits the text into tokens to check for nonbreaking prefixes.
        tokens = text.split()
        num_tokens = len(tokens)
        for i, token in enumerate(tokens):
            # Checks if token ends with a fullstop.
            token_ends_with_period = re.search(r"^(\S+)\.$", token)
            if token_ends_with_period:
                prefix = token_ends_with_period.group(1)
                # Checks for 3 conditions if
                # i.   the prefix contains a fullstop and
                #      any char in the prefix is within the IsAlpha charset
                # ii.  the prefix is in the list of nonbreaking prefixes and
                #      does not contain #NUMERIC_ONLY#
                # iii. the token is not the last token and the
                #      next token starts with a lowercase character.
                if (
                    ("." in prefix and self.isanyalpha(prefix))
                    or (
                        prefix in self.NONBREAKING_PREFIXES
                        and prefix not in self.NUMERIC_ONLY_PREFIXES
                    )
                    or (
                        i != num_tokens - 1
                        and tokens[i + 1]
                        and self.islower(tokens[i + 1][0])
                    )
                ):
                    pass  # No change to the token.
                # Checks if the prefix is in NUMERIC_ONLY_PREFIXES
                # and ensures that the next word is a digit.
                elif (
                    prefix in self.NUMERIC_ONLY_PREFIXES
                    and (i + 1) < num_tokens
                    and re.search(r"^[0-9]+", tokens[i + 1])
                ):
                    pass  # No change to the token.
                else:  # Otherwise, adds a space after the tokens before a dot.
                    tokens[i] = prefix + " ."
        return " ".join(tokens)  # Stitch the tokens back.

    def escape_xml(self, text):
        for regexp, substitution in self.MOSES_ESCAPE_XML_REGEXES:
            text = re.sub(regexp, substitution, text)
        return text
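
    # For example (illustrative, given the escape table above),
    # escape_xml('foo & "bar" <baz>') is expected to return
    # 'foo &amp; &quot;bar&quot; &lt;baz&gt;'.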

    def penn_tokenize(self, text, return_str=False):
        """
        This is a Python port of the Penn treebank tokenizer adapted by the Moses
        machine translation community.
        """
        # Converts input string into unicode.
        text = str(text)
        # Perform a chain of regex substitutions using MOSES_PENN_REGEXES_1.
        for regexp, substitution in self.MOSES_PENN_REGEXES_1:
            text = re.sub(regexp, substitution, text)
        # Handles nonbreaking prefixes.
        text = self.handles_nonbreaking_prefixes(text)
        # Restore ellipsis, clean extra spaces, escape XML symbols.
        for regexp, substitution in self.MOSES_PENN_REGEXES_2:
            text = re.sub(regexp, substitution, text)
        return text if return_str else text.split()

    def tokenize(
        self,
        text,
        aggressive_dash_splits=False,
        return_str=False,
        escape=True,
        protected_patterns=None,
    ):
        """
        Python port of the Moses tokenizer.

        :param text: A single string, i.e. sentence text.
        :type text: str
        :param aggressive_dash_splits: Option to trigger dash split rules.
        :type aggressive_dash_splits: bool
        :param return_str: Return a single string instead of a list of tokens.
        :type return_str: bool
        :param escape: Escape special characters (e.g. &, <, >, ") as XML entities.
        :type escape: bool
        :param protected_patterns: Regex patterns whose matches are protected from tokenization.
        :type protected_patterns: list(str)
        """
        # Converts input string into unicode.
        text = str(text)
        # De-duplicate spaces and clean ASCII junk.
        for regexp, substitution in [self.DEDUPLICATE_SPACE, self.ASCII_JUNK]:
            text = re.sub(regexp, substitution, text)

        if protected_patterns:
            # Find the tokens that need to be protected.
            protected_tokens = [
                match.group()
                for protected_pattern in protected_patterns
                for match in re.finditer(protected_pattern, text, re.IGNORECASE)
            ]
            # Replace the protected tokens with placeholders.
            for i, token in enumerate(protected_tokens):
                substitution = "THISISPROTECTED" + str(i).zfill(3)
                text = text.replace(token, substitution)

        # Strip leading and trailing spaces.
        text = text.strip()
        # FIXME!!!
        """
        # For Finnish and Swedish, separate out all "other" special characters.
        if self.lang in ["fi", "sv"]:
            # In Finnish and Swedish, the colon can be used inside words
            # as an apostrophe-like character:
            # USA:n, 20:een, EU:ssa, USA:s, S:t
            regexp, substitution = self.FI_SV_COLON_APOSTROPHE
            text = re.sub(regexp, substitution, text)
            # If a colon is not immediately followed by lower-case characters,
            # separate it out anyway.
            regexp, substitution = self.FI_SV_COLON_NO_LOWER_FOLLOW
            text = re.sub(regexp, substitution, text)
        else:
        """
        # Separate special characters outside of IsAlnum character set.
        regexp, substitution = self.PAD_NOT_ISALNUM
        text = re.sub(regexp, substitution, text)
        # Aggressively splits dashes.
        if aggressive_dash_splits:
            regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
            text = re.sub(regexp, substitution, text)

        # Replaces multidots with "DOTDOTMULTI" literal strings.
        text = self.replace_multidots(text)

        # Separate out "," except if within numbers e.g. 5,300
        for regexp, substitution in [
            self.COMMA_SEPARATE_1,
            self.COMMA_SEPARATE_2,
            self.COMMA_SEPARATE_3,
        ]:
            text = re.sub(regexp, substitution, text)

        # (Language-specific) apostrophe tokenization.
        if self.lang == "en":
            for regexp, substitution in self.ENGLISH_SPECIFIC_APOSTROPHE:
                text = re.sub(regexp, substitution, text)
        elif self.lang in ["fr", "it"]:
            for regexp, substitution in self.FR_IT_SPECIFIC_APOSTROPHE:
                text = re.sub(regexp, substitution, text)
        # FIXME!!!
        ##elif self.lang == "so":
        ##    for regexp, substitution in self.SO_SPECIFIC_APOSTROPHE:
        ##        text = re.sub(regexp, substitution, text)
        else:
            regexp, substitution = self.NON_SPECIFIC_APOSTROPHE
            text = re.sub(regexp, substitution, text)

        # Handles nonbreaking prefixes.
        text = self.handles_nonbreaking_prefixes(text)
        # Cleans up extraneous spaces.
        regexp, substitution = self.DEDUPLICATE_SPACE
        text = re.sub(regexp, substitution, text).strip()
        # Split trailing ".'".
        regexp, substitution = self.TRAILING_DOT_APOSTROPHE
        text = re.sub(regexp, substitution, text)

        # Restore the protected tokens.
        if protected_patterns:
            for i, token in enumerate(protected_tokens):
                substitution = "THISISPROTECTED" + str(i).zfill(3)
                text = text.replace(substitution, token)

        # Restore multidots.
        text = self.restore_multidots(text)
        if escape:
            # Escape XML symbols.
            text = self.escape_xml(text)

        return text if return_str else text.split()
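
    # End-to-end sketch of `tokenize()` (illustrative, not a recorded doctest):
    #
    #     >>> mt = MosesTokenizer(lang="en")
    #     >>> mt.tokenize("Hello World, this is a test!", return_str=True)
    #     'Hello World , this is a test !'
    #     >>> mt.tokenize("a co-operation deal", aggressive_dash_splits=True)
    #     ['a', 'co', '@-@', 'operation', 'deal']
    #     >>> mt.tokenize("fish & chips")  # XML escaping is on by default.
    #     ['fish', '&amp;', 'chips']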


class MosesDetokenizer(object):
    """
    This is a Python port of the Moses Detokenizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl

    """

    # Perl Unicode Properties character sets.
    IsAlnum = str("".join(perluniprops.chars("IsAlnum")))
    IsAlpha = str("".join(perluniprops.chars("IsAlpha")))
    IsSc = str("".join(perluniprops.chars("IsSc")))

    AGGRESSIVE_HYPHEN_SPLIT = r" \@\-\@ ", r"-"

    # Merge multiple spaces.
    ONE_SPACE = re.compile(r" {2,}"), " "

    # Unescape special characters.
    UNESCAPE_FACTOR_SEPARATOR = r"&#124;", r"|"
    UNESCAPE_LEFT_ANGLE_BRACKET = r"&lt;", r"<"
    UNESCAPE_RIGHT_ANGLE_BRACKET = r"&gt;", r">"
    UNESCAPE_DOUBLE_QUOTE = r"&quot;", r'"'
    UNESCAPE_SINGLE_QUOTE = r"&apos;", r"'"
    UNESCAPE_SYNTAX_NONTERMINAL_LEFT = r"&#91;", r"["
    UNESCAPE_SYNTAX_NONTERMINAL_RIGHT = r"&#93;", r"]"
    UNESCAPE_AMPERSAND = r"&amp;", r"&"
    # The legacy regexes are used to support outputs from older Moses versions.
    UNESCAPE_FACTOR_SEPARATOR_LEGACY = r"&bar;", r"|"
    UNESCAPE_SYNTAX_NONTERMINAL_LEFT_LEGACY = r"&bra;", r"["
    UNESCAPE_SYNTAX_NONTERMINAL_RIGHT_LEGACY = r"&ket;", r"]"

    MOSES_UNESCAPE_XML_REGEXES = [
        UNESCAPE_FACTOR_SEPARATOR_LEGACY,
        UNESCAPE_FACTOR_SEPARATOR,
        UNESCAPE_LEFT_ANGLE_BRACKET,
        UNESCAPE_RIGHT_ANGLE_BRACKET,
        UNESCAPE_SYNTAX_NONTERMINAL_LEFT_LEGACY,
        UNESCAPE_SYNTAX_NONTERMINAL_RIGHT_LEGACY,
        UNESCAPE_DOUBLE_QUOTE,
        UNESCAPE_SINGLE_QUOTE,
        UNESCAPE_SYNTAX_NONTERMINAL_LEFT,
        UNESCAPE_SYNTAX_NONTERMINAL_RIGHT,
        UNESCAPE_AMPERSAND,
    ]

    FINNISH_MORPHSET_1 = [
        "N",
        "n",
        "A",
        "a",
        "\xc4",
        "\xe4",
        "ssa",
        "Ssa",
        "ss\xe4",
        "Ss\xe4",
        "sta",
        "st\xe4",
        "Sta",
        "St\xe4",
        "hun",
        "Hun",
        "hyn",
        "Hyn",
        "han",
        "Han",
        "h\xe4n",
        "H\xe4n",
        "h\xf6n",
        "H\xf6n",
        "un",
        "Un",
        "yn",
        "Yn",
        "an",
        "An",
        "\xe4n",
        "\xc4n",
        "\xf6n",
        "\xd6n",
        "seen",
        "Seen",
        "lla",
        "Lla",
        "ll\xe4",
        "Ll\xe4",
        "lta",
        "Lta",
        "lt\xe4",
        "Lt\xe4",
        "lle",
        "Lle",
        "ksi",
        "Ksi",
        "kse",
        "Kse",
        "tta",
        "Tta",
        "ine",
        "Ine",
    ]

    FINNISH_MORPHSET_2 = ["ni", "si", "mme", "nne", "nsa"]

    FINNISH_MORPHSET_3 = [
        "ko",
        "k\xf6",
        "han",
        "h\xe4n",
        "pa",
        "p\xe4",
        "kaan",
        "k\xe4\xe4n",
        "kin",
    ]

    FINNISH_REGEX = r"^({})({})?({})?$".format(
        str("|".join(FINNISH_MORPHSET_1)),
        str("|".join(FINNISH_MORPHSET_2)),
        str("|".join(FINNISH_MORPHSET_3)),
    )

    def __init__(self, lang="en"):
        super(MosesDetokenizer, self).__init__()
        self.lang = lang

    def unescape_xml(self, text):
        for regexp, substitution in self.MOSES_UNESCAPE_XML_REGEXES:
            text = re.sub(regexp, substitution, text)
        return text
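
    # For example (illustrative, given the unescape table above),
    # unescape_xml('foo &amp; &quot;bar&quot;') is expected to return 'foo & "bar"'.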

    def tokenize(self, tokens, return_str=True, unescape=True):
        """
        Python port of the Moses detokenizer.

        :param tokens: A list of strings, i.e. tokenized text.
        :type tokens: list(str)
        :return: str
        """
        # Convert the list of tokens into a string and pad it with spaces.
        text = r" {} ".format(" ".join(tokens))
        # Converts input string into unicode.
        text = str(text)
        # Detokenize the aggressive hyphen split.
        regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
        text = re.sub(regexp, substitution, text)
        if unescape:
            # Unescape the XML symbols.
            text = self.unescape_xml(text)
        # Keep track of no. of quotation marks.
        quote_counts = {"'": 0, '"': 0, "``": 0, "`": 0, "''": 0}

        # The *prepend_space* variable is used to control the "effects" of
        # detokenization as the function loops through the list of tokens and
        # changes the *prepend_space* accordingly as it sequentially checks
        # through the language specific and language independent conditions.
        prepend_space = " "
        detokenized_text = ""
        tokens = text.split()
        # Iterate through every token and apply language specific detokenization rule(s).
        # *skip_next* lets the Czech "-li" rule below consume the following dash token.
        skip_next = False
        for i, token in enumerate(tokens):
            if skip_next:
                skip_next = False
                continue
            # Check if the first char is CJK.
            if is_cjk(token[0]) and self.lang != "ko":
                # Perform left shift if this is a second consecutive CJK word.
                if i > 0 and is_cjk(tokens[i - 1][-1]):
                    detokenized_text += token
                # But do nothing special if this is a CJK word that doesn't follow a CJK word
                else:
                    detokenized_text += prepend_space + token
                prepend_space = " "
            # If it's a currency symbol.
            elif re.search(r"^[" + self.IsSc + r"\(\[\{\¿\¡]+$", token):
                # Perform right shift on currency and other random punctuation items
                detokenized_text += prepend_space + token
                prepend_space = ""

            elif re.search(r"^[\,\.\?\!\:\;\\\%\}\]\)]+$", token):
                # In French, these punctuation marks are preceded by a non-breaking space.
                if self.lang == "fr" and re.search(r"^[\?\!\:\;\\\%]$", token):
                    detokenized_text += " "
                # Perform left shift on punctuation items.
                detokenized_text += token
                prepend_space = " "

            elif (
                self.lang == "en"
                and i > 0
                and re.search(r"^['][{}]".format(self.IsAlpha), token)
            ):
                # and re.search('[{}]$'.format(self.IsAlnum), tokens[i-1])):
                # For English, left-shift the contraction.
                detokenized_text += token
                prepend_space = " "

            elif (
                self.lang == "cs"
                and i > 1
                and re.search(
                    r"^[0-9]+$", tokens[i - 2]
                )  # If the token two positions back is a number.
                and re.search(r"^[.,]$", tokens[i - 1])  # If the previous token is a dot or comma.
                and re.search(r"^[0-9]+$", token)
            ):  # If the current token is a number.
                # In Czech, left-shift floats that are decimal numbers.
                detokenized_text += token
                prepend_space = " "

            elif (
                self.lang in ["fr", "it", "ga"]
                and i <= len(tokens) - 2
                and re.search(r"[{}][']$".format(self.IsAlpha), token)
                and re.search(r"^[{}]".format(self.IsAlpha), tokens[i + 1])
            ):  # If the next token is alpha.
                # For French and Italian, right-shift the contraction.
                detokenized_text += prepend_space + token
                prepend_space = ""

            elif (
                self.lang == "cs"
                and i <= len(tokens) - 3
                and re.search(r"[{}][']$".format(self.IsAlpha), token)
                and re.search(r"^[-–]$", tokens[i + 1])
                and re.search(r"^li$|^mail.*", tokens[i + 2], re.IGNORECASE)
            ):  # In Perl, ($words[$i+2] =~ /^li$|^mail.*/i)
                # In Czech, right-shift "-li" and a few Czech dashed words (e.g. e-mail)
                detokenized_text += prepend_space + token + tokens[i + 1]
                skip_next = True  # Advance over the dash.
                prepend_space = ""

            # Combine punctuation smartly.
            elif re.search(r"""^[\'\"„“`]+$""", token):
                normalized_quo = token
                if re.search(r"^[„“”]+$", token):
                    normalized_quo = '"'
                quote_counts[normalized_quo] = quote_counts.get(normalized_quo, 0)

                if self.lang == "cs" and token == "„":
                    quote_counts[normalized_quo] = 0
                if self.lang == "cs" and token == "“":
                    quote_counts[normalized_quo] = 1

                if quote_counts[normalized_quo] % 2 == 0:
                    if (
                        self.lang == "en"
                        and token == "'"
                        and i > 0
                        and re.search(r"[s]$", tokens[i - 1])
                    ):
                        # Left shift on single quote for possessives ending
                        # in "s", e.g. "The Jones' house"
                        detokenized_text += token
                        prepend_space = " "
                    else:
                        # Right shift.
                        detokenized_text += prepend_space + token
                        prepend_space = ""
                        quote_counts[normalized_quo] += 1
                else:
                    # Left shift.
                    detokenized_text += token
                    prepend_space = " "
                    quote_counts[normalized_quo] += 1

            elif (
                self.lang == "fi"
                and re.search(r":$", tokens[i - 1])
                and re.search(self.FINNISH_REGEX, token)
            ):
                # Finnish : without intervening space if followed by case suffix
                # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
                detokenized_text += prepend_space + token
                prepend_space = " "

            else:
                detokenized_text += prepend_space + token
                prepend_space = " "

        # Merge multiple spaces.
        regexp, substitution = self.ONE_SPACE
        detokenized_text = re.sub(regexp, substitution, detokenized_text)
        # Remove leading and trailing spaces.
        detokenized_text = detokenized_text.strip()

        return detokenized_text if return_str else detokenized_text.split()

    def detokenize(self, tokens, return_str=True, unescape=True):
        """Duck-typing the abstract *tokenize()*."""
        return self.tokenize(tokens, return_str, unescape)


__all__ = ["MosesTokenizer", "MosesDetokenizer"]
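

# Minimal smoke test (an illustrative sketch, not part of the original module);
# it only runs when this file is executed directly:
if __name__ == "__main__":
    _tokenizer = MosesTokenizer(lang="en")
    _detokenizer = MosesDetokenizer(lang="en")
    _sentence = "Hello World, this is a test!"
    _tokens = _tokenizer.tokenize(_sentence)
    print(_tokens)
    # Detokenizing the tokens (with default unescaping) is expected to
    # round-trip the original sentence.
    print(_detokenizer.detokenize(_tokens))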