Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/babel/messages/pofile.py: 34%

1"""

2babel.messages.pofile

3~~~~~~~~~~~~~~~~~~~~~

5Reading and writing of files in the ``gettext`` PO (portable object)

6format.

9:license: BSD, see LICENSE for more details.

10"""

12from __future__ import annotations

14import os

15import re

16from collections.abc import Iterable

17from typing import TYPE_CHECKING, Literal

19from babel.core import Locale

20from babel.messages.catalog import Catalog, Message

21from babel.util import TextWrapper

23if TYPE_CHECKING:

24 from typing import IO, AnyStr

26 from _typeshed import SupportsWrite

# Matches a backslash followed by one of the characters PO files escape:
# backslash, ``t``, ``r``, ``n`` or a double quote.
_unescape_re = re.compile(r'\\([\\trn"])')


def unescape(string: str) -> str:
    r"""Strip the surrounding quotes from a PO string and decode its escapes.

    The first and last character of `string` are always dropped (they are
    expected to be the enclosing double quotes); ``\n``, ``\t``, ``\r``,
    ``\\`` and ``\"`` sequences in between are replaced by the character
    they stand for.

    >>> print(unescape('"Say:\\n \\"hello, world!\\"\\n"'))
    Say:
     "hello, world!"
    <BLANKLINE>

    :param string: the quoted, escaped string to decode
    :return: the unquoted, unescaped text
    """
    body = string[1:-1]
    if "\\" not in string:
        # Nothing is escaped, so removing the quotes is all there is to do.
        return body

    # Letters that map to a control character; a backslash or a double
    # quote after the escape character simply stands for itself.
    control_chars = {'n': '\n', 't': '\t', 'r': '\r'}

    def _decode(match):
        escaped = match.group(1)
        return control_chars.get(escaped, escaped)

    return _unescape_re.sub(_decode, body)

def denormalize(string: str) -> str:
    r"""Turn a (possibly multi-line) PO string literal back into plain text.

    This is the inverse of `normalize`.

    >>> print(denormalize(r'''""
    ... "Say:\n"
    ... " \"hello, world!\"\n"'''))
    Say:
     "hello, world!"
    <BLANKLINE>

    >>> print(denormalize(r'''""
    ... "Say:\n"
    ... " \"Lorem ipsum dolor sit "
    ... "amet, consectetur adipisicing"
    ... " elit, \"\n"'''))
    Say:
     "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    <BLANKLINE>

    :param string: the string to denormalize
    :return: the concatenated, unescaped text
    """
    if '\n' not in string:
        # Single-line form: exactly one quoted string.
        return unescape(string)

    quoted_lines = string.splitlines()
    if string.startswith('""'):
        # The first line is skipped when the input starts with ``""``.
        del quoted_lines[0]
    return ''.join(unescape(quoted) for quoted in quoted_lines)

89def _extract_locations(line: str) -> list[str]:

90 """Extract locations from location comments.

92 Locations are extracted while properly handling First Strong

93 Isolate (U+2068) and Pop Directional Isolate (U+2069), used by

94 gettext to enclose filenames with spaces and tabs in their names.

95 """

96 if "\u2068" not in line and "\u2069" not in line:

97 return line.lstrip().split()

99 locations = []

100 location = ""

101 in_filename = False

102 for c in line:

103 if c == "\u2068":

104 if in_filename:

105 raise ValueError(

106 "location comment contains more First Strong Isolate "

107 "characters, than Pop Directional Isolate characters",

108 )

109 in_filename = True

110 continue

111 elif c == "\u2069":

112 if not in_filename:

113 raise ValueError(

114 "location comment contains more Pop Directional Isolate "

115 "characters, than First Strong Isolate characters",

116 )

117 in_filename = False

118 continue

119 elif c == " ":

120 if in_filename:

121 location += c

122 elif location:

123 locations.append(location)

124 location = ""

125 else:

126 location += c

127 else:

128 if location:

129 if in_filename:

130 raise ValueError(

131 "location comment contains more First Strong Isolate "

132 "characters, than Pop Directional Isolate characters",

133 )

134 locations.append(location)

135

136 return locations

137

138

class PoFileError(Exception):
    """Raised by the PO parser when it encounters an invalid po file.

    Besides the formatted message, the exception carries the catalog being
    filled, the offending line and the line number it was reported for.
    """

    def __init__(self, message: str, catalog: Catalog, line: str, lineno: int) -> None:
        description = f'{message} on {lineno}'
        super().__init__(description)
        self.lineno = lineno
        self.line = line
        self.catalog = catalog

147

148

149class _NormalizedString(list):

150 def __init__(self, *args: str) -> None:

151 super().__init__(map(str.strip, args))

152

153 def denormalize(self) -> str:

154 if not self:

155 return ""

156 return ''.join(map(unescape, self))

157

158

class PoFileParser:
    """Support class to read messages from a ``gettext`` PO (portable object) file
    and add them to a `Catalog`

    See `read_po` for simple cases.
    """

    def __init__(
        self,
        catalog: Catalog,
        ignore_obsolete: bool = False,
        abort_invalid: bool = False,
    ) -> None:
        """
        :param catalog: the catalog that parsed messages are added to
        :param ignore_obsolete: if true, obsolete (``#~``) messages are dropped
        :param abort_invalid: if true, invalid input raises `PoFileError`
                              instead of printing a warning
        """
        self.catalog = catalog
        self.ignore_obsolete = ignore_obsolete
        # Number of messages added so far.
        self.counter = 0
        # Zero-based line index of the ``msgid`` line of the current message.
        self.offset = 0
        self.abort_invalid = abort_invalid
        self._reset_message_state()

    def _reset_message_state(self) -> None:
        """Forget everything collected for the message currently being read."""
        self.messages = []
        self.translations = []
        self.locations = []
        self.flags = []
        self.user_comments = []
        self.auto_comments = []
        self.context = None
        self.obsolete = False
        self.in_msgid = False
        self.in_msgstr = False
        self.in_msgctxt = False

    def _add_message(self) -> None:
        """
        Add a message to the catalog based on the current parser state and
        clear the state ready to process the next message.
        """
        if len(self.messages) > 1:
            # Plural message: msgid and msgid_plural form a tuple, and the
            # translations are placed by index into a fixed-size tuple.
            msgid = tuple(m.denormalize() for m in self.messages)
            string = ['' for _ in range(self.catalog.num_plurals)]
            for idx, translation in sorted(self.translations):
                if idx >= self.catalog.num_plurals:
                    self._invalid_pofile(
                        "",
                        self.offset,
                        "msg has more translations than num_plurals of catalog",
                    )
                    continue
                string[idx] = translation.denormalize()
            string = tuple(string)
        else:
            msgid = self.messages[0].denormalize()
            string = self.translations[0][1].denormalize()
        msgctxt = self.context.denormalize() if self.context else None
        message = Message(
            msgid,
            string,
            self.locations,
            self.flags,
            self.auto_comments,
            self.user_comments,
            lineno=self.offset + 1,
            context=msgctxt,
        )
        if self.obsolete:
            if not self.ignore_obsolete:
                self.catalog.obsolete[self.catalog._key_for(msgid, msgctxt)] = message
        else:
            self.catalog[msgid] = message
        self.counter += 1
        self._reset_message_state()

    def _finish_current_message(self) -> None:
        """Flush the message being collected, if any, into the catalog.

        A message without any ``msgstr`` is reported as invalid and then
        stored with an empty translation.
        """
        if self.messages:
            if not self.translations:
                self._invalid_pofile(
                    "",
                    self.offset,
                    f"missing msgstr for msgid '{self.messages[0].denormalize()}'",
                )
                self.translations.append([0, _NormalizedString()])
            self._add_message()

    def _process_message_line(self, lineno, line, obsolete=False) -> None:
        """Dispatch a non-comment line to the keyword or continuation handler."""
        if not line:
            return
        if line[0] == '"':
            self._process_string_continuation_line(line, lineno)
        else:
            self._process_keyword_line(lineno, line, obsolete)

    @staticmethod
    def _parse_plural_index(idxarg: str) -> int | None:
        """Parse the ``N]`` tail of a ``msgstr[N]`` keyword.

        :param idxarg: the part of the keyword following the ``[``
        :return: the non-negative index, or `None` if the text is not a
                 well-formed index followed by a closing bracket
        """
        if not idxarg.endswith(']'):
            return None
        try:
            idx = int(idxarg[:-1])
        except ValueError:
            return None
        # A negative index would silently address plural slots from the end.
        return idx if idx >= 0 else None

    def _process_keyword_line(self, lineno, line, obsolete=False) -> None:
        """Handle a line that starts with a PO keyword.

        Recognised keywords are ``msgid``, ``msgid_plural``, ``msgctxt``,
        ``msgstr`` and ``msgstr[N]``; anything else is reported through
        `_invalid_pofile`.
        """
        keyword, _, arg = line.partition(' ')

        if keyword in ['msgid', 'msgctxt']:
            # Either keyword starts a new entry, so flush the previous one.
            self._finish_current_message()

        self.obsolete = obsolete

        # The line that has the msgid is stored as the offset of the msg
        # should this be the msgctxt if it has one?
        if keyword == 'msgid':
            self.offset = lineno

        if keyword in ['msgid', 'msgid_plural']:
            self.in_msgctxt = False
            self.in_msgid = True
            self.messages.append(_NormalizedString(arg))
            return

        if keyword == 'msgctxt':
            self.in_msgctxt = True
            self.context = _NormalizedString(arg)
            return

        if keyword == 'msgstr' or keyword.startswith('msgstr['):
            _, has_bracket, idxarg = keyword.partition('[')
            idx = self._parse_plural_index(idxarg) if has_bracket else 0
            if idx is None:
                # Malformed index (e.g. ``msgstr[x]`` or ``msgstr[1``).
                # Close every open string first so that continuation lines
                # belonging to this entry are reported rather than being
                # appended to an unrelated string, then report the problem
                # the same way as any other invalid line.
                self.in_msgid = False
                self.in_msgstr = False
                self.in_msgctxt = False
                self._invalid_pofile(line, lineno, "Invalid plural index in msgstr keyword")
                return
            self.in_msgid = False
            self.in_msgstr = True
            s = _NormalizedString(arg) if arg != '""' else _NormalizedString()
            self.translations.append([idx, s])
            return

        self._invalid_pofile(line, lineno, "Unknown or misformatted keyword")

    def _process_string_continuation_line(self, line, lineno) -> None:
        """Append a quoted continuation line to the string currently open."""
        if self.in_msgid:
            s = self.messages[-1]
        elif self.in_msgstr:
            s = self.translations[-1][1]
        elif self.in_msgctxt:
            s = self.context
        else:
            self._invalid_pofile(
                line,
                lineno,
                "Got line starting with \" but not in msgid, msgstr or msgctxt",
            )
            return
        # Only the `_NormalizedString` constructor strips its arguments;
        # `append` is the plain list method, so strip explicitly here.
        s.append(line.strip())

    def _process_comment(self, line) -> None:
        """Record a comment line (locations, flags, auto or user comment).

        May raise `ValueError` from `_extract_locations` for a malformed
        ``#:`` line.
        """
        self._finish_current_message()

        prefix = line[:2]
        if prefix == '#:':
            for location in _extract_locations(line[2:]):
                a, colon, b = location.rpartition(':')
                if colon:
                    try:
                        self.locations.append((a, int(b)))
                    except ValueError:
                        # Text after the last colon is not a line number;
                        # the location is skipped.
                        continue
                else:  # No line number specified
                    self.locations.append((location, None))
            return

        if prefix == '#,':
            self.flags.extend(flag.strip() for flag in line[2:].lstrip().split(','))
            return

        if prefix == '#.':
            # These are called auto-comments
            comment = line[2:].strip()
            if comment:  # Just check that we're not adding empty comments
                self.auto_comments.append(comment)
            return

        # These are called user comments
        self.user_comments.append(line[1:].strip())

    def parse(self, fileobj: IO[AnyStr] | Iterable[AnyStr]) -> None:
        """
        Reads from the file-like object (or iterable of string-likes) `fileobj`
        and adds any po file units found in it to the `Catalog`
        supplied to the constructor.

        All of the items in the iterable must be the same type; either `str`
        or `bytes` (decoded with the catalog charset), but not a mixture.
        """
        needs_decode = None

        for lineno, line in enumerate(fileobj):
            line = line.strip()
            if needs_decode is None:
                # If we don't yet know whether we need to decode,
                # let's find out now.
                needs_decode = not isinstance(line, str)
            if not line:
                continue
            if needs_decode:
                line = line.decode(self.catalog.charset)
            if line[0] == '#':
                if line[:2] == '#~':
                    # Obsolete entry: parse what follows the marker as a
                    # regular message line.
                    self._process_message_line(lineno, line[2:].lstrip(), obsolete=True)
                else:
                    try:
                        self._process_comment(line)
                    except ValueError as exc:
                        self._invalid_pofile(line, lineno, str(exc))
            else:
                self._process_message_line(lineno, line)

        self._finish_current_message()

        # No actual messages found, but there was some info in comments, from which
        # we'll construct an empty header message
        if not self.counter and (self.flags or self.user_comments or self.auto_comments):
            self.messages.append(_NormalizedString())
            self.translations.append([0, _NormalizedString()])
            self._add_message()

    def _invalid_pofile(self, line, lineno, msg) -> None:
        """Report invalid input.

        Raises `PoFileError` when `abort_invalid` is set; otherwise prints
        a two-line warning and returns so that parsing can continue.
        """
        assert isinstance(line, str)
        if self.abort_invalid:
            raise PoFileError(msg, self.catalog, line, lineno)
        print("WARNING:", msg)
        print(f"WARNING: Problem on line {lineno + 1}: {line!r}")

380

381

def read_po(
    fileobj: IO[AnyStr] | Iterable[AnyStr],
    locale: Locale | str | None = None,
    domain: str | None = None,
    ignore_obsolete: bool = False,
    charset: str | None = None,
    abort_invalid: bool = False,
) -> Catalog:
    """Parse a ``gettext`` PO (portable object) file into a new `Catalog`.

    The input may be a file-like object or any iterable of lines.

    >>> from datetime import datetime
    >>> from io import StringIO
    >>> buf = StringIO('''
    ... #: main.py:1
    ... #, fuzzy, python-format
    ... msgid "foo %(name)s"
    ... msgstr "quux %(name)s"
    ...
    ... # A user comment
    ... #. An auto comment
    ... #: main.py:3
    ... msgid "bar"
    ... msgid_plural "baz"
    ... msgstr[0] "bar"
    ... msgstr[1] "baaz"
    ... ''')
    >>> catalog = read_po(buf)
    >>> catalog.revision_date = datetime(2007, 4, 1)

    >>> for message in catalog:
    ...     if message.id:
    ...         print((message.id, message.string))
    ...         print(' ', (message.locations, sorted(list(message.flags))))
    ...         print(' ', (message.user_comments, message.auto_comments))
    ('foo %(name)s', 'quux %(name)s')
      ([('main.py', 1)], ['fuzzy', 'python-format'])
      ([], [])
    (('bar', 'baz'), ('bar', 'baaz'))
      ([('main.py', 3)], [])
      (['A user comment'], ['An auto comment'])

    .. versionadded:: 1.0
       Added support for explicit charset argument.

    :param fileobj: the file-like object (or iterable of lines) to read the PO file from
    :param locale: the locale identifier or `Locale` object, or `None`
                   if the catalog is not bound to a locale (which basically
                   means it's a template)
    :param domain: the message domain
    :param ignore_obsolete: whether to ignore obsolete messages in the input
    :param charset: the character set of the catalog.
    :param abort_invalid: abort read if po file is invalid
    :return: the catalog populated from `fileobj`
    """
    result = Catalog(locale=locale, domain=domain, charset=charset)
    PoFileParser(result, ignore_obsolete, abort_invalid=abort_invalid).parse(fileobj)
    return result

440

441

# Separators at which `normalize` may break a long line.  The pattern is a
# single capturing group so that `re.split` keeps the separators themselves.
WORD_SEP = re.compile(
    '('
    r'\s+|'  # any whitespace
    r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|'  # hyphenated words
    r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w)'  # em-dash
    ')',
)


def escape(string: str) -> str:
    r"""Escape the given string so that it can be included in double-quoted
    strings in ``PO`` files.

    Backslash, tab, carriage return, newline and double quote are replaced
    by their backslash escapes and the result is wrapped in double quotes.

    >>> escape('''Say:
    ...  "hello, world!"
    ... ''')
    '"Say:\\n \\"hello, world!\\"\\n"'

    :param string: the string to escape
    :return: the quoted, escaped string
    """
    # The backslash must be handled first so that the backslashes introduced
    # by the later replacements are not escaped a second time.
    return '"%s"' % string.replace('\\', '\\\\').replace('\t', '\\t').replace(
        '\r',
        '\\r',
    ).replace('\n', '\\n').replace('"', '\\"')


def normalize(string: str, prefix: str = '', width: int = 76) -> str:
    r"""Convert a string into a format that is appropriate for .po files.

    A string that fits on one line is simply escaped.  Otherwise the result
    starts with an empty ``""`` line followed by one quoted line per
    segment, where segments are the input's lines, further broken at
    `WORD_SEP` boundaries when wrapping is enabled.

    >>> print(normalize('''Say:
    ...  "hello, world!"
    ... ''', width=None))
    ""
    "Say:\n"
    " \"hello, world!\"\n"

    >>> print(normalize('''Say:
    ...  "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    ... ''', width=32))
    ""
    "Say:\n"
    " \"Lorem ipsum dolor sit "
    "amet, consectetur adipisicing"
    " elit, \"\n"

    :param string: the string to normalize
    :param prefix: a string that should be prepended to every line
    :param width: the maximum line width; use `None`, 0, or a negative number
                  to completely disable line wrapping
    :return: the normalized, quoted representation of `string`
    """
    if width and width > 0:
        prefixlen = len(prefix)
        lines = []
        for line in string.splitlines(True):
            if len(escape(line)) + prefixlen > width:
                # Too long: re-assemble the line chunk by chunk, popping
                # from the end of the reversed chunk list.
                chunks = WORD_SEP.split(line)
                chunks.reverse()
                while chunks:
                    buf = []
                    size = 2  # the two enclosing quotes
                    while chunks:
                        length = len(escape(chunks[-1])) - 2 + prefixlen
                        if size + length < width:
                            buf.append(chunks.pop())
                            size += length
                        else:
                            if not buf:
                                # handle long chunks by putting them on a
                                # separate line
                                buf.append(chunks.pop())
                            break
                    lines.append(''.join(buf))
            else:
                lines.append(line)
    else:
        lines = string.splitlines(True)

    if len(lines) <= 1:
        return escape(string)

    # `WORD_SEP.split` yields an empty final chunk when a line ends in a
    # separator.  If the chunk before it had to be forced onto its own
    # segment, that empty chunk becomes an empty trailing segment.  Drop it
    # as-is: every character of the input is already in the earlier
    # segments, so nothing (in particular no newline) may be added here.
    if not lines[-1]:
        del lines[-1]
    return '""\n' + '\n'.join([(prefix + escape(line)) for line in lines])

527

528

529def _enclose_filename_if_necessary(filename: str) -> str:

530 """Enclose filenames which include white spaces or tabs.

531

532 Do the same as gettext and enclose filenames which contain white

533 spaces or tabs with First Strong Isolate (U+2068) and Pop

534 Directional Isolate (U+2069).

535 """

536 if " " not in filename and "\t" not in filename:

537 return filename

538

539 if not filename.startswith("\u2068"):

540 filename = "\u2068" + filename

541 if not filename.endswith("\u2069"):

542 filename += "\u2069"

543 return filename

544

545

def write_po(
    fileobj: SupportsWrite[bytes],
    catalog: Catalog,
    width: int = 76,
    no_location: bool = False,
    omit_header: bool = False,
    sort_output: bool = False,
    sort_by_file: bool = False,
    ignore_obsolete: bool = False,
    include_previous: bool = False,
    include_lineno: bool = True,
) -> None:
    r"""Serialize a message catalog as a ``gettext`` PO (portable object)
    template file and write it to the provided file-like object.

    The text is produced by `generate_po` and encoded with the catalog's
    charset; characters the charset cannot represent are written as
    backslash escapes.

    >>> catalog = Catalog()
    >>> catalog.add('foo %(name)s', locations=[('main.py', 1)],
    ...             flags=('fuzzy',))
    <Message...>
    >>> catalog.add(('bar', 'baz'), locations=[('main.py', 3)])
    <Message...>
    >>> from io import BytesIO
    >>> buf = BytesIO()
    >>> write_po(buf, catalog, omit_header=True)
    >>> print(buf.getvalue().decode("utf8"))
    #: main.py:1
    #, fuzzy, python-format
    msgid "foo %(name)s"
    msgstr ""
    <BLANKLINE>
    #: main.py:3
    msgid "bar"
    msgid_plural "baz"
    msgstr[0] ""
    msgstr[1] ""
    <BLANKLINE>
    <BLANKLINE>

    :param fileobj: the file-like object to write to
    :param catalog: the `Catalog` instance
    :param width: the maximum line width for the generated output; use `None`,
                  0, or a negative number to completely disable line wrapping
    :param no_location: do not emit a location comment for every message
    :param omit_header: do not include the ``msgid ""`` entry at the top of the
                        output
    :param sort_output: whether to sort the messages in the output by msgid
    :param sort_by_file: whether to sort the messages in the output by their
                         locations
    :param ignore_obsolete: whether to ignore obsolete messages and not include
                            them in the output; by default they are included as
                            comments
    :param include_previous: include the old msgid as a comment when
                             updating the catalog
    :param include_lineno: include line number in the location comment
    """
    # ``sort_output`` takes precedence when both sorting flags are given.
    sort_by = "message" if sort_output else ("location" if sort_by_file else None)

    po_chunks = generate_po(
        catalog,
        ignore_obsolete=ignore_obsolete,
        include_lineno=include_lineno,
        include_previous=include_previous,
        no_location=no_location,
        omit_header=omit_header,
        sort_by=sort_by,
        width=width,
    )
    for chunk in po_chunks:
        if isinstance(chunk, str):
            chunk = chunk.encode(catalog.charset, 'backslashreplace')
        fileobj.write(chunk)

621

622

def generate_po(
    catalog: Catalog,
    *,
    ignore_obsolete: bool = False,
    include_lineno: bool = True,
    include_previous: bool = False,
    no_location: bool = False,
    omit_header: bool = False,
    sort_by: Literal["message", "location"] | None = None,
    width: int = 76,
) -> Iterable[str]:
    r"""Yield the text of a ``gettext`` PO (portable object) file piece by piece.

    The options have the same meaning as the parameters of `write_po()`;
    see there for a more detailed description.
    """
    wrapping = bool(width and width > 0)

    # xgettext always wraps comments even if --no-wrap is passed;
    # provide the same behaviour
    comment_wrapper = TextWrapper(width=width if wrapping else 76, break_long_words=False)
    header_wrapper = TextWrapper(width=width, subsequent_indent="# ", break_long_words=False)

    def _norm(text, prefix=''):
        # All strings are normalized with the caller's wrapping width.
        return normalize(text, prefix=prefix, width=width)

    def _format_comment(comment, prefix=''):
        for wrapped in comment_wrapper.wrap(comment):
            yield f"#{prefix} {wrapped.strip()}\n"

    def _format_message(message, prefix=''):
        plural = isinstance(message.id, (list, tuple))

        if message.context:
            yield f"{prefix}msgctxt {_norm(message.context, prefix)}\n"

        if not plural:
            yield f"{prefix}msgid {_norm(message.id, prefix)}\n"
            yield f"{prefix}msgstr {_norm(message.string or '', prefix)}\n"
            return

        yield f"{prefix}msgid {_norm(message.id[0], prefix)}\n"
        yield f"{prefix}msgid_plural {_norm(message.id[1], prefix)}\n"
        for idx in range(catalog.num_plurals):
            # Plural forms without a translation are written as empty strings.
            try:
                translated = message.string[idx]
            except IndexError:
                translated = ''
            yield f"{prefix}msgstr[{idx:d}] {_norm(translated, prefix)}\n"

    def _location_sort_key(entry):
        # sort locations by filename and lineno.
        # if there's no (non-zero) <int> as lineno, use `-1`.
        lineno = entry[1]
        return (entry[0], lineno if isinstance(lineno, int) and lineno else -1)

    def _render_location(filename, lineno):
        rendered = _enclose_filename_if_necessary(filename.replace(os.sep, '/'))
        if lineno and include_lineno:
            rendered = f"{rendered}:{lineno:d}"
        return rendered

    for message in _sort_messages(catalog, sort_by=sort_by):
        if not message.id:  # This is the header "message"
            if omit_header:
                continue
            comment_header = catalog.header_comment
            if wrapping:
                wrapped_header = []
                for header_line in comment_header.splitlines():
                    wrapped_header.extend(header_wrapper.wrap(header_line))
                comment_header = '\n'.join(wrapped_header)
            yield f"{comment_header}\n"

        for comment in message.user_comments:
            yield from _format_comment(comment)
        for comment in message.auto_comments:
            yield from _format_comment(comment, prefix='.')

        if not no_location:
            # if no sorting possible, leave unsorted.
            # (see issue #606)
            try:
                locations = sorted(message.locations, key=_location_sort_key)
            except TypeError:  # e.g. "TypeError: unorderable types: NoneType() < int()"
                locations = message.locations

            # De-duplicate while keeping first-seen order.
            unique_locs = dict.fromkeys(
                _render_location(filename, lineno) for filename, lineno in locations
            )
            yield from _format_comment(' '.join(unique_locs), prefix=':')

        if message.flags:
            yield f"#{', '.join(['', *sorted(message.flags)])}\n"

        if message.previous_id and include_previous:
            previous = message.previous_id
            yield from _format_comment(f'msgid {_norm(previous[0])}', prefix='|')
            if len(previous) > 1:
                yield from _format_comment(f'msgid_plural {_norm(previous[1])}', prefix='|')

        yield from _format_message(message)
        yield '\n'

    if ignore_obsolete:
        return

    # Obsolete messages are written with every message line prefixed by '#~ '.
    for message in _sort_messages(catalog.obsolete.values(), sort_by=sort_by):
        for comment in message.user_comments:
            yield from _format_comment(comment)
        yield from _format_message(message, prefix='#~ ')
        yield '\n'

732

733

734def _sort_messages(

735 messages: Iterable[Message],

736 sort_by: Literal["message", "location"] | None,

737) -> list[Message]:

738 """

739 Sort the given message iterable by the given criteria.

740

741 Always returns a list.

742

743 :param messages: An iterable of Messages.

744 :param sort_by: Sort by which criteria? Options are `message` and `location`.

745 :return: list[Message]

746 """

747 messages = list(messages)

748 if sort_by == "message":

749 messages.sort()

750 elif sort_by == "location":

751 messages.sort(key=lambda m: m.locations)

752 return messages