# -*- coding: utf-8 -*-

from __future__ import print_function

import re
from collections import defaultdict, Counter
from functools import partial
from itertools import chain

from sacremoses.corpus import Perluniprops
from sacremoses.util import parallelize_preprocess, grouper

# Hack to enable Python2.7 to use encoding.
import sys

if sys.version_info[0] < 3:
    import io
    import warnings

    open = io.open
    warnings.warn(
        str(
            "You should really be using Python3!!! "
            "Tick tock, tick tock, https://pythonclock.org/"
        )
    )

perluniprops = Perluniprops()


class MosesTruecaser(object):
    """
    This is a Python port of the Moses Truecaser from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/train-truecaser.perl
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecase.perl
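
    A minimal usage sketch (the training filename is illustrative):

        >>> mtr = MosesTruecaser()
        >>> model = mtr.train_from_file("train.tok.en")  # doctest: +SKIP
        >>> tokens = mtr.truecase("THE ADVENTURES OF SHERLOCK HOLMES")  # doctest: +SKIP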
    """

    # Perl Unicode Properties character sets.
    Lowercase_Letter = str("".join(perluniprops.chars("Lowercase_Letter")))
    Uppercase_Letter = str("".join(perluniprops.chars("Uppercase_Letter")))
    Titlecase_Letter = str("".join(perluniprops.chars("Titlecase_Letter")))

    def __init__(self, load_from=None, is_asr=None, encoding="utf8"):
        """
        :param load_from: The filename of a pre-trained truecase model to load.
        :type load_from: str

        :param is_asr: A flag to indicate that the model is for ASR. ASR input
            has no case information, so make sure that the input is lowercased
            and that known tokens are recased, e.g. 'i' is uppercased to 'I'
            even though the lowercase form is known.
        :type is_asr: bool
        """
        # Initialize the object.
        super(MosesTruecaser, self).__init__()
        # Build a regex that matches any cased (lower/upper/titlecase) letter.
        self.SKIP_LETTERS_REGEX = re.compile(
            "[{}{}{}]".format(
                self.Lowercase_Letter, self.Uppercase_Letter, self.Titlecase_Letter
            )
        )

        self.XML_SPLIT_REGX = re.compile("(<.*(?<=>))(.*)((?=</)[^>]*>)")

        self.SENT_END = {".", ":", "?", "!"}
        self.DELAYED_SENT_START = {
            "(",
            "[",
            '"',
            "'",
            "&apos;",
            "&quot;",
            "&#91;",
            "&#93;",
        }

        self.encoding = encoding

        self.is_asr = is_asr
        if load_from:
            self.model = self._load_model(load_from)

    def learn_truecase_weights(self, tokens, possibly_use_first_token=False):
        """
        This function checks each token in a sentence and returns the
        appropriate weight of each surface token form.
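
        Returns a list of (lowercased_token, surface_token, weight) triples.
        For example (the sentence-initial 'The' is dropped because
        `possibly_use_first_token` is False by default):

            >>> mtr = MosesTruecaser()
            >>> mtr.learn_truecase_weights("The adventures of Sherlock Holmes".split())
            [('adventures', 'adventures', 1), ('of', 'of', 1), ('sherlock', 'Sherlock', 1), ('holmes', 'Holmes', 1)]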
        """
        # Keep track of first tokens in the sentence(s) of the line.
        is_first_word = True
        truecase_weights = []
        for i, token in enumerate(tokens):
            # Skip XML tags.
            if re.search(r"(<\S[^>]*>)", token):
                continue
            # Skip sentence start symbols.
            elif token in self.DELAYED_SENT_START:
                continue

            # Resets `is_first_word` after seeing sentence end symbols.
            if not is_first_word and token in self.SENT_END:
                is_first_word = True
                continue
            # Skips tokens with nothing to case.
            if not self.SKIP_LETTERS_REGEX.search(token):
                is_first_word = False
                continue

            # If it's not the first word,
            # then set the current word weight to 1.
            current_word_weight = 0
            if not is_first_word:
                current_word_weight = 1
            # Otherwise, check whether the user wants to optionally
            # use the first word.
            elif possibly_use_first_token:
                # Gated special handling of the first word of the sentence.
                # Check if the first character of the token is lowercase.
                if token[0].islower():
                    current_word_weight = 1
                elif i == 1:
                    current_word_weight = 0.1

            is_first_word = False

            if current_word_weight > 0:
                truecase_weights.append((token.lower(), token, current_word_weight))
        return truecase_weights

    def _train(
        self,
        document_iterator,
        save_to=None,
        possibly_use_first_token=False,
        processes=1,
        progress_bar=False,
    ):
        """
        :param document_iterator: The input document; each outer list is a sentence,
            each inner list is the list of tokens of that sentence.
        :type document_iterator: iter(list(str))

        :param possibly_use_first_token: When True, on the basis that the first
            word of a sentence is always capitalized: a) if a sentence-initial
            token is *not* capitalized, then it is counted, and b) if a capitalized
            sentence-initial token is the only token of the segment, then it is
            counted, but with only 10% of the weight of a normal token.
        :type possibly_use_first_token: bool

        :returns: A dictionary of the best, known and casing objects from `_casing_to_model()`.
        :rtype: {'best': dict, 'known': Counter, 'casing': defaultdict(Counter)}
        """
        casing = defaultdict(Counter)
        train_truecaser = partial(
            self.learn_truecase_weights,
            possibly_use_first_token=possibly_use_first_token,
        )
        token_weights = chain(
            *parallelize_preprocess(
                train_truecaser, document_iterator, processes, progress_bar=progress_bar
            )
        )
        # Collect the token_weights from every sentence.
        for lowercase_token, surface_token, weight in token_weights:
            casing[lowercase_token][surface_token] += weight

        # Save to file if specified.
        if save_to:
            self._save_model_from_casing(casing, save_to)
        return self._casing_to_model(casing)

    def train(
        self,
        documents,
        save_to=None,
        possibly_use_first_token=False,
        processes=1,
        progress_bar=False,
    ):
        """
        Default duck-type of _train(); accepts list(list(str)) as input documents.
        """
        self.model = None  # Clear the model first.
        self.model = self._train(
            documents,
            save_to,
            possibly_use_first_token,
            processes,
            progress_bar=progress_bar,
        )
        return self.model

    def train_from_file(
        self,
        filename,
        save_to=None,
        possibly_use_first_token=False,
        processes=1,
        progress_bar=False,
    ):
        """
        Duck-type of _train(); accepts a filename to read as an
        `iter(list(str))` object.
        """
        with open(filename, encoding=self.encoding) as fin:
            # Note: readlines() is eager; the generator only defers the splitting.
            document_iterator = (line.split() for line in fin.readlines())
            self.model = None  # Clear the model first.
            self.model = self._train(
                document_iterator,
                save_to,
                possibly_use_first_token,
                processes,
                progress_bar=progress_bar,
            )
            return self.model

    def train_from_file_object(
        self,
        file_object,
        save_to=None,
        possibly_use_first_token=False,
        processes=1,
        progress_bar=False,
    ):
        """
        Duck-type of _train(); accepts a file object to read as an
        `iter(list(str))` object.
        """
        # Note: readlines() is eager; the generator only defers the splitting.
        document_iterator = (line.split() for line in file_object.readlines())
        self.model = None  # Clear the model first.
        self.model = self._train(
            document_iterator,
            save_to,
            possibly_use_first_token,
            processes,
            progress_bar=progress_bar,
        )
        return self.model

    def truecase(self, text, return_str=False, use_known=False):
        """
        Truecase a single sentence / line of text.

        :param text: A single string, i.e. sentence text.
        :type text: str

        :param use_known: Use the known case if a word is a known word but not the first word.
        :type use_known: bool
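
        A usage sketch; the model filename is illustrative and the output
        depends on the trained model:

            >>> mtr = MosesTruecaser("big.truecasemodel")  # doctest: +SKIP
            >>> mtr.truecase("THE ADVENTURES OF SHERLOCK HOLMES", return_str=True)  # doctest: +SKIP
            'the adventures of Sherlock Holmes'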
        """
        check_model_message = str(
            "\nUse Truecaser.train() to train a model.\n"
            "Or use Truecaser('modelfile') to load a model."
        )
        assert hasattr(self, "model"), check_model_message
        # Keep track of first tokens in the sentence(s) of the line.
        is_first_word = True
        truecased_tokens = []
        tokens = self.split_xml(text)

        for i, token in enumerate(tokens):

            # Append XML tags and continue.
            if re.search(r"(<\S[^>]*>)", token):
                truecased_tokens.append(token)
                continue

            # Note: this shouldn't happen unless "|" is escaped as &#124;.
            # To make the truecaser resilient, keep any token that starts
            # with a pipe as it is.
            if token == "|" or token.startswith("|"):
                truecased_tokens.append(token)
                continue

            # Reads the word token and the factors separately.
            token, other_factors = re.search(r"^([^\|]+)(.*)", token).groups()

            # Lowercase the ASR tokens.
            if self.is_asr:
                token = token.lower()

            # The actual case replacement happens here.
            # "Most frequent" case of the word.
            best_case = self.model["best"].get(token.lower(), None)
            # Other known cases of the word.
            known_case = self.model["known"].get(token, None)
            if is_first_word and best_case:  # Truecase the sentence start.
                token = best_case
            elif known_case:  # Don't change known tokens.
                token = known_case if use_known else token
            elif best_case:
                # Truecase otherwise-unknown tokens, following
                # https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecase.perl#L66
                token = best_case
            # Else, it's an unknown word; don't change the word.
            # Concatenate the truecased `word` with the `other_factors`.
            token = token + other_factors
            # Adds the truecased word.
            truecased_tokens.append(token)

            # Resets sentence start if this token is an ending punctuation.
            is_first_word = token in self.SENT_END

            if token in self.DELAYED_SENT_START:
                is_first_word = False

        return " ".join(truecased_tokens) if return_str else truecased_tokens

    def truecase_file(self, filename, return_str=True):
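        """
        Truecase a file line by line, lazily yielding each truecased line.
        Assumes that `self.model` has already been trained or loaded.
        """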
        with open(filename, encoding=self.encoding) as fin:
            for line in fin:
                truecased_tokens = self.truecase(line.strip())
                # Yield the truecased line.
                yield " ".join(truecased_tokens) if return_str else truecased_tokens

    @staticmethod
    def split_xml(line):
        """
        Python port of the split_xml function in Moses' truecaser:
        https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecaser.perl

        :param line: The input string; should be tokenized, separated by spaces.
        :type line: str
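
        For example (note that non-final XML tokens keep a trailing space,
        a token hack unique to sacremoses):

            >>> MosesTruecaser.split_xml("<s> Hello world </s>")
            ['<s> ', 'Hello', 'world', '</s>']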
        """
        line = line.strip()
        tokens = []
        while line:
            # Assumes that an XML tag is always separated by a space.
            has_xml = re.search(r"^\s*(<\S[^>]*>)(.*)$", line)
            # Non-XML test.
            is_non_xml = re.search(r"^\s*([^\s<>]+)(.*)$", line)
            # '<' or '>' occurs in the word, but it's not an XML tag.
            xml_cognates = re.search(r"^\s*(\S+)(.*)$", line)
            if has_xml:
                potential_xml, line_next = has_xml.groups()
                # Exception for a factor that is an XML tag.
                if (
                    re.search(r"^\S", line)
                    and len(tokens) > 0
                    and re.search(r"\|$", tokens[-1])
                ):
                    tokens[-1] += potential_xml
                    # If it's a token with factors, join with the previous token.
                    is_factor = re.search(r"^(\|+)(.*)$", line_next)
                    if is_factor:
                        tokens[-1] += is_factor.group(1)
                        line_next = is_factor.group(2)
                else:
                    tokens.append(
                        potential_xml + " "
                    )  # Token hack, unique to sacremoses.
                line = line_next

            elif is_non_xml:
                tokens.append(is_non_xml.group(1))  # Token hack, unique to sacremoses.
                line = is_non_xml.group(2)
            elif xml_cognates:
                tokens.append(
                    xml_cognates.group(1)
                )  # Token hack, unique to sacremoses.
                line = xml_cognates.group(2)
            else:
                raise Exception("ERROR: unexpected input in split_xml: {}".format(line))
        if tokens:  # Guard against empty input lines.
            tokens[-1] = tokens[-1].strip()  # Token hack, unique to sacremoses.
        return tokens

    def _casing_to_model(self, casing):
        """
        Converts raw casing counts into a truecasing model.

        :returns: A dictionary of the best, known and casing objects.
        :rtype: {'best': dict, 'known': Counter, 'casing': defaultdict(Counter)}
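
        For example, given casing {"the": Counter({"the": 90, "The": 10})},
        the result has best == {"the": "the"} and known == Counter({"The": 1});
        the alternative casings are recorded in `known` without their counts.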
        """
        best = {}
        known = Counter()

        for token_lower in casing:
            tokens = casing[token_lower].most_common()
            # Set the most frequent case as the "best" case.
            best[token_lower] = tokens[0][0]
            # If it's an ASR model, throw away the alternative casings.
            if not self.is_asr:
                for token, count in tokens[1:]:
                    # Note: it is rather odd that the counts are thrown away, but
                    # this follows
                    # https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecase.perl#L34
                    known[token] += 1
        model = {"best": best, "known": known, "casing": casing}
        return model

    def save_model(self, filename):
        self._save_model_from_casing(self.model["casing"], filename)

    def _save_model_from_casing(self, casing, filename):
        """
        Outputs the truecaser model file in the same output format as
        https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/train-truecaser.perl

        :param casing: The dictionary of token counters from `train()`.
        :type casing: defaultdict(Counter)
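
        Each output line lists one lowercased type's casings by frequency, e.g.
        `the (90/100) The (10)`; the first (best) casing also shows the total
        count of the type.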
        """
        with open(filename, "w", encoding=self.encoding) as fout:
            for token in casing:
                total_token_count = sum(casing[token].values())
                tokens_counts = []
                for i, (surface_token, count) in enumerate(casing[token].most_common()):
                    if i == 0:
                        out_token = "{} ({}/{})".format(
                            surface_token, count, total_token_count
                        )
                    else:
                        out_token = "{} ({})".format(surface_token, count)
                    tokens_counts.append(out_token)
                print(" ".join(tokens_counts), end="\n", file=fout)

    def _load_model(self, filename):
        """
        Loads a pre-trained truecasing file.

        :returns: A dictionary of the best, known and casing objects from `_casing_to_model()`.
        :rtype: {'best': dict, 'known': Counter, 'casing': defaultdict(Counter)}
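
        For example, the line `the (90/100) The (10)` yields
        casing["the"] == Counter({"the": 90, "The": 10}).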
        """
        casing = defaultdict(Counter)
        with open(filename, encoding=self.encoding) as fin:
            for line in fin:
                line = line.strip().split()
                for token, count in grouper(line, 2):
                    count = count.split("/")[0].strip("()")
                    casing[token.lower()][token] = int(count)
        # Returns the best and known objects from `_casing_to_model()`.
        return self._casing_to_model(casing)


class MosesDetruecaser(object):
    def __init__(self):
        # Initialize the object.
        super(MosesDetruecaser, self).__init__()
        self.SENT_END = {".", ":", "?", "!"}
        self.DELAYED_SENT_START = {
            "(",
            "[",
            '"',
            "'",
            "&apos;",
            "&quot;",
            "&#91;",
            "&#93;",
        }

        # Some predefined tokens that will always be in lowercase.
        self.ALWAYS_LOWER = {
            "a",
            "after",
            "against",
            "al-.+",
            "and",
            "any",
            "as",
            "at",
            "be",
            "because",
            "between",
            "by",
            "during",
            "el-.+",
            "for",
            "from",
            "his",
            "in",
            "is",
            "its",
            "last",
            "not",
            "of",
            "off",
            "on",
            "than",
            "the",
            "their",
            "this",
            "to",
            "was",
            "were",
            "which",
            "will",
            "with",
        }

    def detruecase(self, text, is_headline=False, return_str=False):
        """
        Detruecase the translated text from a model that learnt from truecased
        tokens.

        :param text: A single string, i.e. sentence text.
        :type text: str
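
        For example:

            >>> md = MosesDetruecaser()
            >>> md.detruecase("the adventures of sherlock holmes", return_str=True)
            'The adventures of sherlock holmes'
            >>> md.detruecase("the adventures of sherlock holmes", is_headline=True, return_str=True)
            'The Adventures of Sherlock Holmes'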
        """
        # `cased_tokens` keeps track of the detruecased tokens.
        cased_tokens = []
        sentence_start = True
        # Capitalize a token if it's at the sentence start.
        for token in text.split():
            token = token[:1].upper() + token[1:] if sentence_start else token
            cased_tokens.append(token)
            if token in self.SENT_END:
                sentence_start = True
            elif token not in self.DELAYED_SENT_START:
                sentence_start = False
        # Check if it's a headline; if so, then use title case.
        if is_headline:
            cased_tokens = [
                token if token in self.ALWAYS_LOWER else token[:1].upper() + token[1:]
                for token in cased_tokens
            ]

        return " ".join(cased_tokens) if return_str else cased_tokens


__all__ = ["MosesTruecaser", "MosesDetruecaser"]