
# -*- coding: utf-8 -*-

from __future__ import print_function

import re
from collections import defaultdict, Counter
from functools import partial
from itertools import chain

from sacremoses.corpus import Perluniprops
from sacremoses.util import parallelize_preprocess, grouper

# Hack to enable Python2.7 to use encoding.
import sys

if sys.version_info[0] < 3:
    import io
    import warnings

    open = io.open
    warnings.warn(
        str(
            "You should really be using Python3!!! "
            "Tick tock, tick tock, https://pythonclock.org/"
        )
    )

perluniprops = Perluniprops()


class MosesTruecaser(object):
    """
    This is a Python port of the Moses Truecaser from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/train-truecaser.perl
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecase.perl
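
    Example usage (an illustrative sketch, not a doctest; ``big.txt`` stands in
    for any pre-tokenized training corpus, one sentence per line)::

        mtr = MosesTruecaser()
        mtr.train_from_file('big.txt')
        mtr.truecase("THE ADVENTURES OF SHERLOCK HOLMES")
        # -> ['the', 'adventures', 'of', 'Sherlock', 'Holmes']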

36 """ 

37 

38 # Perl Unicode Properties character sets. 

39 Lowercase_Letter = str("".join(perluniprops.chars("Lowercase_Letter"))) 

40 Uppercase_Letter = str("".join(perluniprops.chars("Uppercase_Letter"))) 

41 Titlecase_Letter = str("".join(perluniprops.chars("Uppercase_Letter"))) 

    def __init__(self, load_from=None, is_asr=None, encoding="utf8"):
        """
        :param load_from: Path to a pre-trained truecase model file to load.
        :type load_from: str

        :param is_asr: A flag to indicate that the model is for ASR. ASR input
            has no case information, so make sure the input is lowercased and
            that known tokens are still cased, e.g. 'i' is uppercased to 'I'
            even though the lowercase form is known.
        :type is_asr: bool
        """
        # Initialize the object.
        super(MosesTruecaser, self).__init__()
        # Compile a regex of characters that carry case information.
        self.SKIP_LETTERS_REGEX = re.compile(
            "[{}{}{}]".format(
                self.Lowercase_Letter, self.Uppercase_Letter, self.Titlecase_Letter
            )
        )

        self.XML_SPLIT_REGX = re.compile("(<.*(?<=>))(.*)((?=</)[^>]*>)")

        self.SENT_END = {".", ":", "?", "!"}
        self.DELAYED_SENT_START = {
            "(",
            "[",
            '"',
            "'",
            "&apos;",
            "&quot;",
            "&#91;",
            "&#93;",
        }

        self.encoding = encoding

        self.is_asr = is_asr
        if load_from:
            self.model = self._load_model(load_from)

    def learn_truecase_weights(self, tokens, possibly_use_first_token=False):
        """
        This function checks each token in a sentence and returns the
        appropriate weight for each surface token form.
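
        For example (illustrative; with the default
        ``possibly_use_first_token=False``, the sentence-initial token carries
        no weight)::

            MosesTruecaser().learn_truecase_weights(
                "The Adventures of Sherlock Holmes".split())
            # -> [('adventures', 'Adventures', 1), ('of', 'of', 1),
            #     ('sherlock', 'Sherlock', 1), ('holmes', 'Holmes', 1)]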

86 """ 

87 # Keep track of first tokens in the sentence(s) of the line. 

88 is_first_word = True 

89 truecase_weights = [] 

90 for i, token in enumerate(tokens): 

91 # Skip XML tags. 

92 if re.search(r"(<\S[^>]*>)", token): 

93 continue 

94 # Skip if sentence start symbols. 

95 elif token in self.DELAYED_SENT_START: 

96 continue 

97 

98 # Resets the `is_first_word` after seeing sent end symbols. 

99 if not is_first_word and token in self.SENT_END: 

100 is_first_word = True 

101 continue 

102 # Skips tokens with nothing to case. 

103 if not self.SKIP_LETTERS_REGEX.search(token): 

104 is_first_word = False 

105 continue 

106 

107 # If it's not the first word, 

108 # then set the current word weight to 1. 

109 current_word_weight = 0 

110 if not is_first_word: 

111 current_word_weight = 1 

112 # Otherwise check whether user wants to optionally 

113 # use the first word. 

114 elif possibly_use_first_token: 

115 # Gated special handling of first word of sentence. 

116 # Check if first characer of token is lowercase. 

117 if token[0].islower(): 

118 current_word_weight = 1 

119 elif i == 1: 

120 current_word_weight = 0.1 

121 

122 is_first_word = False 

123 

124 if current_word_weight > 0: 

125 truecase_weights.append((token.lower(), token, current_word_weight)) 

126 return truecase_weights 

    def _train(
        self,
        document_iterator,
        save_to=None,
        possibly_use_first_token=False,
        processes=1,
        progress_bar=False,
    ):
        """
        :param document_iterator: The input document; each outer list is a sentence,
            the inner list is the list of tokens for each sentence.
        :type document_iterator: iter(list(str))

        :param possibly_use_first_token: When True, exploit the fact that the first
            word of a sentence is usually capitalized: a) if a sentence-initial
            token is *not* capitalized, then it is counted with full weight, and
            b) if a capitalized sentence-initial token is the only token of the
            segment, then it is counted, but with only 10% of the weight of a
            normal token.
        :type possibly_use_first_token: bool

        :returns: A dictionary of the best and known objects from `_casing_to_model()`
        :rtype: {'best': dict, 'known': Counter, 'casing': defaultdict(Counter)}
        """
        casing = defaultdict(Counter)
        train_truecaser = partial(
            self.learn_truecase_weights,
            possibly_use_first_token=possibly_use_first_token,
        )
        token_weights = chain(
            *parallelize_preprocess(
                train_truecaser, document_iterator, processes, progress_bar=progress_bar
            )
        )
        # Collect the token_weights from every sentence.
        for lowercase_token, surface_token, weight in token_weights:
            casing[lowercase_token][surface_token] += weight

        # Save to file if specified.
        if save_to:
            self._save_model_from_casing(casing, save_to)
        return self._casing_to_model(casing)

    def train(
        self,
        documents,
        save_to=None,
        possibly_use_first_token=False,
        processes=1,
        progress_bar=False,
    ):
        """
        Default duck-type of _train(); accepts list(list(str)) as input documents.
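
        For example (an illustrative sketch; the nested token lists are
        made-up sample data)::

            mtr = MosesTruecaser()
            model = mtr.train([["The", "Nile", "is", "long", "."],
                               ["Nile", "crocodiles", "are", "big", "."]])
            model["best"]["nile"]  # -> 'Nile'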

180 """ 

181 self.model = None # Clear the model first. 

182 self.model = self._train( 

183 documents, 

184 save_to, 

185 possibly_use_first_token, 

186 processes, 

187 progress_bar=progress_bar, 

188 ) 

189 return self.model 

    def train_from_file(
        self,
        filename,
        save_to=None,
        possibly_use_first_token=False,
        processes=1,
        progress_bar=False,
    ):
        """
        Duck-type of _train(); accepts a filename to read as an `iter(list(str))`
        object.
        """
        with open(filename, encoding=self.encoding) as fin:
            # Use a generator expression so this also works on Python 2.
            document_iterator = (line.split() for line in fin.readlines())
            self.model = None  # Clear the model first.
            self.model = self._train(
                document_iterator,
                save_to,
                possibly_use_first_token,
                processes,
                progress_bar=progress_bar,
            )
            return self.model

    def train_from_file_object(
        self,
        file_object,
        save_to=None,
        possibly_use_first_token=False,
        processes=1,
        progress_bar=False,
    ):
        """
        Duck-type of _train(); accepts a file object to read as an `iter(list(str))`
        object.
        """
        # Use a generator expression so this also works on Python 2.
        document_iterator = (line.split() for line in file_object.readlines())
        self.model = None  # Clear the model first.
        self.model = self._train(
            document_iterator,
            save_to,
            possibly_use_first_token,
            processes,
            progress_bar=progress_bar,
        )
        return self.model

    def truecase(self, text, return_str=False, use_known=False):
        """
        Truecase a single sentence / line of text.

        :param text: A single string, i.e. sentence text.
        :type text: str

        :param use_known: Use the known case if a word is a known word but not the first word.
        :type use_known: bool
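
        For example (an illustrative sketch, assuming ``self.model`` was
        trained on ordinary cased English text)::

            mtr.truecase("THE ADVENTURES OF SHERLOCK HOLMES", return_str=True)
            # -> 'the adventures of Sherlock Holmes'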

253 """ 

254 check_model_message = str( 

255 "\nUse Truecaser.train() to train a model.\n" 

256 "Or use Truecaser('modefile') to load a model." 

257 ) 

258 assert hasattr(self, "model"), check_model_message 

259 # Keep track of first tokens in the sentence(s) of the line. 

260 is_first_word = True 

261 truecased_tokens = [] 

262 tokens = self.split_xml(text) 

263 # best_cases = best_cases if best_cases else self.model['best'] 

264 # known_cases = known_cases if known_cases else self.model['known'] 

265 

        for i, token in enumerate(tokens):

            # Append XML tags and continue.
            if re.search(r"(<\S[^>]*>)", token):
                truecased_tokens.append(token)
                continue

            # Note: this shouldn't happen unless pipes are escaped as &#124;.
            # To make the truecaser resilient,
            # we'll keep any token starting with a pipe as it is.
            if token == "|" or token.startswith("|"):
                truecased_tokens.append(token)
                continue

            # Reads the word token and factors separately.
            token, other_factors = re.search(r"^([^\|]+)(.*)", token).groups()

            # Lowercase the ASR tokens.
            if self.is_asr:
                token = token.lower()

            # The actual case replacement happens here.
            # "Most frequent" case of the word.
            best_case = self.model["best"].get(token.lower(), None)
            # Other known cases of the word.
            known_case = self.model["known"].get(token, None)
            # If it's the start of sentence.
            if is_first_word and best_case:  # Truecase sentence start.
                token = best_case
            elif known_case:  # Don't change known tokens.
                token = known_case if use_known else token
            elif (
                best_case
            ):  # Truecase otherwise unknown tokens? Heh? From https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecase.perl#L66
                token = best_case
            # Else, it's an unknown word; don't change the word.
            # Concat the truecased `word` with the `other_factors`.
            token = token + other_factors
            # Adds the truecased word.
            truecased_tokens.append(token)

            # Reset the sentence-start flag on ending punctuation; delayed
            # sentence-start symbols leave the flag unchanged (as in Moses).
            if token in self.SENT_END:
                is_first_word = True
            elif token not in self.DELAYED_SENT_START:
                is_first_word = False

        return " ".join(truecased_tokens) if return_str else truecased_tokens

    def truecase_file(self, filename, return_str=True):
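        """
        Truecase a file, line by line, lazily yielding each truecased line.

        :param filename: Path of the plaintext file to truecase.
        :type filename: str
        """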

        with open(filename, encoding=self.encoding) as fin:
            for line in fin:
                truecased_tokens = self.truecase(line.strip())
                # Yield the truecased line.
                yield " ".join(truecased_tokens) if return_str else truecased_tokens

    @staticmethod
    def split_xml(line):
        """
        Python port of split_xml function in Moses' truecaser:
        https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecaser.perl

        :param line: Input string, should be tokenized, separated by space.
        :type line: str
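
        For example (illustrative; note that the trailing-space "token hack"
        keeps a space after non-final XML tags)::

            MosesTruecaser.split_xml('<b> hello world </b>')
            # -> ['<b> ', 'hello', 'world', '</b>']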

331 """ 

332 line = line.strip() 

333 tokens = [] 

334 while line: 

335 # Assumes that xml tag is always separated by space. 

336 has_xml = re.search(r"^\s*(<\S[^>]*>)(.*)$", line) 

337 # non-XML test. 

338 is_non_xml = re.search(r"^\s*([^\s<>]+)(.*)$", line) 

339 # '<' or '>' occurs in word, but it's not an XML tag 

340 xml_cognates = re.search(r"^\s*(\S+)(.*)$", line) 

341 if has_xml: 

342 potential_xml, line_next = has_xml.groups() 

343 # exception for factor that is an XML tag 

344 if ( 

345 re.search(r"^\S", line) 

346 and len(tokens) > 0 

347 and re.search(r"\|$", tokens[-1]) 

348 ): 

349 tokens[-1] += potential_xml 

350 # If it's a token with factors, join with the previous token. 

351 is_factor = re.search(r"^(\|+)(.*)$", line_next) 

352 if is_factor: 

353 tokens[-1] += is_factor.group(1) 

354 line_next = is_factor.group(2) 

355 else: 

356 tokens.append( 

357 potential_xml + " " 

358 ) # Token hack, unique to sacremoses. 

359 line = line_next 

360 

361 elif is_non_xml: 

362 tokens.append(is_non_xml.group(1)) # Token hack, unique to sacremoses. 

363 line = is_non_xml.group(2) 

364 elif xml_cognates: 

365 tokens.append( 

366 xml_cognates.group(1) 

367 ) # Token hack, unique to sacremoses. 

368 line = xml_cognates.group(2) 

369 else: 

370 raise Exception("ERROR: huh? {}".format(line)) 

371 tokens[-1] = tokens[-1].strip() # Token hack, unique to sacremoses. 

372 return tokens 

    def _casing_to_model(self, casing):
        """
        Converts the raw casing counts into a truecase model.

        :returns: A dictionary of the 'best' and 'known' objects, plus the raw
            'casing' counts.
        :rtype: {'best': dict, 'known': Counter, 'casing': defaultdict(Counter)}
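
        For example (illustrative)::

            casing = {"the": Counter({"the": 9, "The": 1})}
            self._casing_to_model(casing)
            # -> {"best": {"the": "the"}, "known": Counter({"The": 1}),
            #     "casing": casing}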

379 """ 

380 best = {} 

381 known = Counter() 

382 

383 for token_lower in casing: 

384 tokens = casing[token_lower].most_common() 

385 # Set the most frequent case as the "best" case. 

386 best[token_lower] = tokens[0][0] 

387 # If it's asr, throw away everything 

388 if not self.is_asr: 

389 for token, count in tokens[1:]: 

390 # Note: This is rather odd that the counts are thrown away... 

391 # from https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecase.perl#L34 

392 known[token] += 1 

393 model = {"best": best, "known": known, "casing": casing} 

394 return model 

    def save_model(self, filename):
        self._save_model_from_casing(self.model["casing"], filename)

    def _save_model_from_casing(self, casing, filename):
        """
        Outputs the truecaser model file in the same output format as
        https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/train-truecaser.perl

        :param casing: The dictionary of token counters from `_train()`.
        :type casing: defaultdict(Counter)
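
        Each output line lists every observed casing of one word type, most
        frequent first, e.g. (illustrative)::

            the (9/10) The (1)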

406 """ 

407 with open(filename, "w", encoding=self.encoding) as fout: 

408 for token in casing: 

409 total_token_count = sum(casing[token].values()) 

410 tokens_counts = [] 

411 for i, (token, count) in enumerate(casing[token].most_common()): 

412 if i == 0: 

413 out_token = "{} ({}/{})".format(token, count, total_token_count) 

414 else: 

415 out_token = "{} ({})".format(token, count, total_token_count) 

416 tokens_counts.append(out_token) 

417 print(" ".join(tokens_counts), end="\n", file=fout) 

    def _load_model(self, filename):
        """
        Loads a pre-trained truecasing file.

        :returns: A dictionary of the best and known objects from `_casing_to_model()`
        :rtype: {'best': dict, 'known': Counter, 'casing': defaultdict(Counter)}
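
        For example (illustrative), a model file line such as::

            the (9/10) The (1)

        is parsed into ``casing['the'] == Counter({'the': 9, 'The': 1})``.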

425 """ 

426 casing = defaultdict(Counter) 

427 with open(filename, encoding=self.encoding) as fin: 

428 for line in fin: 

429 line = line.strip().split() 

430 for token, count in grouper(line, 2): 

431 count = count.split("/")[0].strip("()") 

432 casing[token.lower()][token] = int(count) 

433 # Returns the best and known object from `_casing_to_model()` 

434 return self._casing_to_model(casing) 

435 

436 


class MosesDetruecaser(object):
    def __init__(self):
        # Initialize the object.
        super(MosesDetruecaser, self).__init__()
        self.SENT_END = {".", ":", "?", "!"}
        self.DELAYED_SENT_START = {
            "(",
            "[",
            '"',
            "'",
            "&apos;",
            "&quot;",
            "&#91;",
            "&#93;",
        }

        # Some predefined tokens that will always be in lowercase.
        self.ALWAYS_LOWER = {
            "a",
            "after",
            "against",
            "al-.+",
            "and",
            "any",
            "as",
            "at",
            "be",
            "because",
            "between",
            "by",
            "during",
            "el-.+",
            "for",
            "from",
            "his",
            "in",
            "is",
            "its",
            "last",
            "not",
            "of",
            "off",
            "on",
            "than",
            "the",
            "their",
            "this",
            "to",
            "was",
            "were",
            "which",
            "will",
            "with",
        }

    def detruecase(self, text, is_headline=False, return_str=False):
        """
        Detruecase the translated output of a model that was trained on
        truecased tokens.

        :param text: A single string, i.e. sentence text.
        :type text: str
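
        For example (illustrative)::

            MosesDetruecaser().detruecase("the adventures of sherlock holmes")
            # -> ['The', 'adventures', 'of', 'sherlock', 'holmes']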

499 """ 

500 # `cased_tokens` keep tracks of detruecased tokens. 

501 cased_tokens = [] 

502 sentence_start = True 

503 # Capitalize token if it's at the sentence start. 

504 for token in text.split(): 

505 token = token[:1].upper() + token[1:] if sentence_start else token 

506 cased_tokens.append(token) 

507 if token in self.SENT_END: 

508 sentence_start = True 

509 elif not token in self.DELAYED_SENT_START: 

510 sentence_start = False 

511 # Check if it's a headline, if so then use title case. 

512 if is_headline: 

513 cased_tokens = [ 

514 token if token in self.ALWAYS_LOWER else token[:1].upper() + token[1:] 

515 for token in cased_tokens 

516 ] 

517 

518 return " ".join(cased_tokens) if return_str else cased_tokens 

519 

520 

521__all__ = ["MosesTruecaser", "MosesDetruecaser"]