1"""
2 babel.messages.pofile
3 ~~~~~~~~~~~~~~~~~~~~~
4
5 Reading and writing of files in the ``gettext`` PO (portable object)
6 format.
7
8 :copyright: (c) 2013-2024 by the Babel Team.
9 :license: BSD, see LICENSE for more details.
10"""
11from __future__ import annotations
12
13import os
14import re
15from collections.abc import Iterable
16from typing import TYPE_CHECKING
17
18from babel.core import Locale
19from babel.messages.catalog import Catalog, Message
20from babel.util import _cmp, wraptext
21
22if TYPE_CHECKING:
23 from typing import IO, AnyStr
24
25 from _typeshed import SupportsWrite
26 from typing_extensions import Literal
27
28
def unescape(string: str) -> str:
    r"""Undo `escape`: drop the enclosing quotes and decode backslash escapes.

    >>> print(unescape('"Say:\\n \\"hello, world!\\"\\n"'))
    Say:
     "hello, world!"
    <BLANKLINE>

    The first and the last character of *string* are removed unconditionally
    (they are expected to be the surrounding double quotes).  In what remains,
    a backslash followed by ``n``, ``t`` or ``r`` becomes the matching control
    character, and an escaped backslash or double quote becomes the bare
    character.

    :param string: the quoted and escaped string to unescape
    """
    control_chars = {'n': '\n', 't': '\t', 'r': '\r'}

    def _decode(match):
        escaped = match.group(1)
        # Anything not in the table is a backslash or a double quote, which
        # stands for itself.
        return control_chars.get(escaped, escaped)

    inner = string[1:-1]
    return re.sub(r'\\([\\trn"])', _decode, inner)
50
51
def denormalize(string: str) -> str:
    r"""Turn the output of `normalize` back into the plain string.

    >>> print(denormalize(r'''""
    ... "Say:\n"
    ... " \"hello, world!\"\n"'''))
    Say:
     "hello, world!"
    <BLANKLINE>

    >>> print(denormalize(r'''""
    ... "Say:\n"
    ... " \"Lorem ipsum dolor sit "
    ... "amet, consectetur adipisicing"
    ... " elit, \"\n"'''))
    Say:
     "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    <BLANKLINE>

    A single-line value is simply unescaped.  A multi-line value is split
    into its lines, a leading ``""`` line is discarded, and the unescaped
    lines are concatenated.

    :param string: the string to denormalize
    """
    if '\n' not in string:
        return unescape(string)

    quoted_lines = string.splitlines()
    # Multi-line PO strings conventionally start with an empty "" line that
    # carries no content.
    if string.startswith('""'):
        quoted_lines = quoted_lines[1:]
    return ''.join(unescape(quoted) for quoted in quoted_lines)
81
82
class PoFileError(Exception):
    """Error raised by `PoFileParser` for an invalid PO file when aborting
    on invalid input is enabled.

    The exception text is ``"<message> on <lineno>"``.  The catalog being
    filled, the offending line and the line number are kept as the
    attributes ``catalog``, ``line`` and ``lineno``.
    """

    def __init__(self, message: str, catalog: Catalog, line: str, lineno: int) -> None:
        self.catalog, self.line, self.lineno = catalog, line, lineno
        super().__init__(f'{message} on {lineno}')
91
92
93class _NormalizedString:
94
95 def __init__(self, *args: str) -> None:
96 self._strs: list[str] = []
97 for arg in args:
98 self.append(arg)
99
100 def append(self, s: str) -> None:
101 self._strs.append(s.strip())
102
103 def denormalize(self) -> str:
104 return ''.join(map(unescape, self._strs))
105
106 def __bool__(self) -> bool:
107 return bool(self._strs)
108
109 def __repr__(self) -> str:
110 return os.linesep.join(self._strs)
111
112 def __cmp__(self, other: object) -> int:
113 if not other:
114 return 1
115
116 return _cmp(str(self), str(other))
117
118 def __gt__(self, other: object) -> bool:
119 return self.__cmp__(other) > 0
120
121 def __lt__(self, other: object) -> bool:
122 return self.__cmp__(other) < 0
123
124 def __ge__(self, other: object) -> bool:
125 return self.__cmp__(other) >= 0
126
127 def __le__(self, other: object) -> bool:
128 return self.__cmp__(other) <= 0
129
130 def __eq__(self, other: object) -> bool:
131 return self.__cmp__(other) == 0
132
133 def __ne__(self, other: object) -> bool:
134 return self.__cmp__(other) != 0
135
136
class PoFileParser:
    """Support class to read messages from a ``gettext`` PO (portable object) file
    and add them to a `Catalog`.

    The parser works line by line.  Comment lines and ``msgid``/``msgctxt``
    keyword lines close the entry that is currently being collected; keyword
    lines and quoted continuation lines extend the current entry.  Line
    numbers handled internally are zero-based (they come from ``enumerate``
    in `parse`); warnings print them one-based.

    Invalid input is reported through `_invalid_pofile`, which either raises
    `PoFileError` (``abort_invalid=True``) or prints a warning and lets
    parsing continue.

    See `read_po` for simple cases.
    """

    # Keywords that may start an entry line.  A keyword only matches when it
    # is followed by a space or ``[``, so ``msgid`` never claims a
    # ``msgid_plural`` line.
    _keywords = [
        'msgid',
        'msgstr',
        'msgctxt',
        'msgid_plural',
    ]

    def __init__(self, catalog: Catalog, ignore_obsolete: bool = False, abort_invalid: bool = False) -> None:
        """
        :param catalog: the catalog that parsed messages are added to
        :param ignore_obsolete: if true, obsolete (``#~``) entries are parsed
                                but not stored in ``catalog.obsolete``
        :param abort_invalid: if true, invalid input raises `PoFileError`
                              instead of printing a warning
        """
        self.catalog = catalog
        self.ignore_obsolete = ignore_obsolete
        # Number of entries handed to `_add_message` so far.
        self.counter = 0
        # Zero-based line index of the most recent ``msgid`` line.
        self.offset = 0
        self.abort_invalid = abort_invalid
        self._reset_message_state()

    def _reset_message_state(self) -> None:
        """Forget everything collected for the entry in progress."""
        self.messages = []
        self.translations = []
        self.locations = []
        self.flags = []
        self.user_comments = []
        self.auto_comments = []
        self.context = None
        self.obsolete = False
        self.in_msgid = False
        self.in_msgstr = False
        self.in_msgctxt = False

    def _add_message(self) -> None:
        """
        Add a message to the catalog based on the current parser state and
        clear the state ready to process the next message.

        An entry that has a ``msgid`` but no ``msgstr`` is reported through
        `_invalid_pofile`; when parsing continues it is stored with an empty
        translation.
        """
        self.translations.sort()
        is_plural = len(self.messages) > 1
        if is_plural:
            msgid = tuple(m.denormalize() for m in self.messages)
            num_plurals = self.catalog.num_plurals
            plural_strings = [''] * num_plurals
            for idx, translation in self.translations:
                if idx >= num_plurals:
                    self._invalid_pofile("", self.offset, "msg has more translations than num_plurals of catalog")
                    continue
                plural_strings[idx] = translation.denormalize()
            string = tuple(plural_strings)
        else:
            msgid = self.messages[0].denormalize()
            if self.translations:
                string = self.translations[0][1].denormalize()
            else:
                # Without this guard the lookup above would fail with a bare
                # IndexError that says nothing about the PO file.
                self._invalid_pofile("", self.offset, "msgid is not followed by a msgstr")
                string = ''
        msgctxt = self.context.denormalize() if self.context else None
        message = Message(msgid, string, list(self.locations), set(self.flags),
                          self.auto_comments, self.user_comments, lineno=self.offset + 1,
                          context=msgctxt)
        if self.obsolete:
            if not self.ignore_obsolete:
                self.catalog.obsolete[msgid] = message
        else:
            self.catalog[msgid] = message
        self.counter += 1
        self._reset_message_state()

    def _finish_current_message(self) -> None:
        """Flush the entry in progress to the catalog, if there is one."""
        if self.messages:
            self._add_message()

    def _process_message_line(self, lineno, line, obsolete=False) -> None:
        """Dispatch a non-comment line: quoted continuation or keyword line."""
        if line.startswith('"'):
            self._process_string_continuation_line(line, lineno)
        else:
            self._process_keyword_line(lineno, line, obsolete)

    def _process_keyword_line(self, lineno, line, obsolete=False) -> None:
        """Handle a line that starts with one of the PO keywords.

        :param lineno: zero-based index of the line in the input
        :param line: the stripped line (without any ``#~`` marker)
        :param obsolete: whether the line came from an obsolete (``#~``) entry
        """
        for keyword in self._keywords:
            if not line.startswith(keyword):
                continue
            arg = line[len(keyword):]
            if not arg:
                # A bare keyword can match no other keyword either, so report
                # it once and stop instead of also emitting the generic
                # "didn't match any expected keyword" message.
                self._invalid_pofile(line, lineno, "Keyword must be followed by a string")
                return
            if arg[0] in (' ', '['):
                break
        else:
            self._invalid_pofile(line, lineno, "Start of line didn't match any expected keyword.")
            return

        # Validate a plural index before touching any parser state, so a
        # malformed ``msgstr[...]`` line is reported instead of raising an
        # uncaught ValueError.
        translation_index = 0
        if keyword == 'msgstr' and arg.startswith('['):
            index_text, closing_bracket, arg = arg[1:].partition(']')
            try:
                translation_index = int(index_text) if closing_bracket else None
            except ValueError:
                translation_index = None
            if translation_index is None:
                self._invalid_pofile(line, lineno, "msgstr index must be an integer enclosed in square brackets")
                return

        if keyword in ['msgid', 'msgctxt']:
            self._finish_current_message()

        self.obsolete = obsolete

        # The line that has the msgid is stored as the offset of the msg
        # should this be the msgctxt if it has one?
        if keyword == 'msgid':
            self.offset = lineno

        if keyword in ['msgid', 'msgid_plural']:
            self.in_msgctxt = False
            self.in_msgid = True
            self.messages.append(_NormalizedString(arg))

        elif keyword == 'msgstr':
            self.in_msgid = False
            self.in_msgstr = True
            self.translations.append([translation_index, _NormalizedString(arg)])

        elif keyword == 'msgctxt':
            self.in_msgctxt = True
            self.context = _NormalizedString(arg)

    def _process_string_continuation_line(self, line, lineno) -> None:
        """Append a quoted continuation line to the string being collected."""
        if self.in_msgid:
            s = self.messages[-1]
        elif self.in_msgstr:
            s = self.translations[-1][1]
        elif self.in_msgctxt:
            s = self.context
        else:
            self._invalid_pofile(line, lineno, "Got line starting with \" but not in msgid, msgstr or msgctxt")
            return
        s.append(line)

    def _process_comment(self, line) -> None:
        """Handle a ``#`` comment line (anything except ``#~``).

        A comment closes the entry in progress.  ``#:`` lines add locations,
        ``#,`` lines add flags, ``#.`` lines add auto comments and every other
        comment is stored as a user comment.
        """
        self._finish_current_message()

        marker = line[1:2]
        if marker == ':':
            for location in line[2:].lstrip().split():
                pos = location.rfind(':')
                if pos < 0:
                    # No colon at all: a bare filename without line number.
                    self.locations.append((location, None))
                    continue
                try:
                    lineno = int(location[pos + 1:])
                except ValueError:
                    # Text after the last colon is not a number; the
                    # reference is skipped.
                    continue
                self.locations.append((location[:pos], lineno))
        elif marker == ',':
            for flag in line[2:].lstrip().split(','):
                self.flags.append(flag.strip())
        elif marker == '.':
            # These are called auto-comments
            comment = line[2:].strip()
            if comment:  # Just check that we're not adding empty comments
                self.auto_comments.append(comment)
        else:
            # These are called user comments
            self.user_comments.append(line[1:].strip())

    def parse(self, fileobj: IO[AnyStr] | Iterable[AnyStr]) -> None:
        """
        Reads from the file-like object `fileobj` and adds any po file
        units found in it to the `Catalog` supplied to the constructor.

        Lines may be ``str`` or ``bytes``; bytes are decoded with the
        catalog's charset.  Blank lines are skipped.
        """
        for lineno, line in enumerate(fileobj):
            line = line.strip()
            if not isinstance(line, str):
                line = line.decode(self.catalog.charset)
            if not line:
                continue
            if line.startswith('#'):
                if line[1:].startswith('~'):
                    # Obsolete entry: parse what follows the "#~" marker as a
                    # regular message line.
                    self._process_message_line(lineno, line[2:].lstrip(), obsolete=True)
                else:
                    self._process_comment(line)
            else:
                self._process_message_line(lineno, line)

        self._finish_current_message()

        # No actual messages found, but there was some info in comments, from which
        # we'll construct an empty header message
        if not self.counter and (self.flags or self.user_comments or self.auto_comments):
            self.messages.append(_NormalizedString('""'))
            self.translations.append([0, _NormalizedString('""')])
            self._add_message()

    def _invalid_pofile(self, line, lineno, msg) -> None:
        """Report invalid input.

        Raises `PoFileError` when ``abort_invalid`` is set; otherwise prints
        two warning lines (the message, then the one-based line number and
        the offending line) and returns so that parsing can continue.
        """
        assert isinstance(line, str)
        if self.abort_invalid:
            raise PoFileError(msg, self.catalog, line, lineno)
        print("WARNING:", msg)
        print(f"WARNING: Problem on line {lineno + 1}: {line!r}")
329
330
def read_po(
    fileobj: IO[AnyStr] | Iterable[AnyStr],
    locale: str | Locale | None = None,
    domain: str | None = None,
    ignore_obsolete: bool = False,
    charset: str | None = None,
    abort_invalid: bool = False,
) -> Catalog:
    """Parse a ``gettext`` PO (portable object) file, given as a file-like
    object or any iterable of lines, into a new `Catalog`.

    >>> from datetime import datetime
    >>> from io import StringIO
    >>> buf = StringIO('''
    ... #: main.py:1
    ... #, fuzzy, python-format
    ... msgid "foo %(name)s"
    ... msgstr "quux %(name)s"
    ...
    ... # A user comment
    ... #. An auto comment
    ... #: main.py:3
    ... msgid "bar"
    ... msgid_plural "baz"
    ... msgstr[0] "bar"
    ... msgstr[1] "baaz"
    ... ''')
    >>> catalog = read_po(buf)
    >>> catalog.revision_date = datetime(2007, 4, 1)

    >>> for message in catalog:
    ...     if message.id:
    ...         print((message.id, message.string))
    ...         print(' ', (message.locations, sorted(list(message.flags))))
    ...         print(' ', (message.user_comments, message.auto_comments))
    (u'foo %(name)s', u'quux %(name)s')
      ([(u'main.py', 1)], [u'fuzzy', u'python-format'])
      ([], [])
    ((u'bar', u'baz'), (u'bar', u'baaz'))
      ([(u'main.py', 3)], [])
      ([u'A user comment'], [u'An auto comment'])

    .. versionadded:: 1.0
       Added support for explicit charset argument.

    :param fileobj: the file-like object (or iterable of lines) to read the PO file from
    :param locale: the locale identifier or `Locale` object, or `None`
                   if the catalog is not bound to a locale (which basically
                   means it's a template)
    :param domain: the message domain
    :param ignore_obsolete: whether to ignore obsolete messages in the input
    :param charset: the character set of the catalog.
    :param abort_invalid: abort read if po file is invalid
    :return: the catalog filled with the messages found in the input
    """
    result = Catalog(locale=locale, domain=domain, charset=charset)
    PoFileParser(result, ignore_obsolete, abort_invalid=abort_invalid).parse(fileobj)
    return result
389
390
# Splits text into the chunks between which `normalize` may wrap a line.
# The whole alternation is one capturing group, so ``WORD_SEP.split`` keeps
# the separators in its result.
WORD_SEP = re.compile('(' + '|'.join((
    r'\s+',                                   # any whitespace
    r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])',   # hyphenated words
    r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w)',    # em-dash
)) + ')')
396
397
def escape(string: str) -> str:
    r"""Quote *string* so it can appear as a double-quoted string in a
    ``PO`` file.

    >>> escape('''Say:
    ...  "hello, world!"
    ... ''')
    '"Say:\\n \\"hello, world!\\"\\n"'

    Backslash, tab, carriage return, newline and the double quote are
    replaced by their backslash escapes, and the result is wrapped in double
    quotes.

    :param string: the string to escape
    """
    # A single translation pass is equivalent to replacing the backslash
    # first and the other characters afterwards: no replacement text contains
    # a character that still has to be escaped.
    replacements = str.maketrans({
        '\\': '\\\\',
        '\t': '\\t',
        '\r': '\\r',
        '\n': '\\n',
        '"': '\\"',
    })
    return '"' + string.translate(replacements) + '"'
414
415
def normalize(string: str, prefix: str = '', width: int = 76) -> str:
    r"""Render a string the way it is written in a .po file.

    >>> print(normalize('''Say:
    ...  "hello, world!"
    ... ''', width=None))
    ""
    "Say:\n"
    " \"hello, world!\"\n"

    >>> print(normalize('''Say:
    ...  "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    ... ''', width=32))
    ""
    "Say:\n"
    " \"Lorem ipsum dolor sit "
    "amet, consectetur adipisicing"
    " elit, \"\n"

    A value that ends up as at most one line is returned as a single escaped
    string.  Otherwise the result starts with an empty ``""`` line followed
    by one escaped string per line, each preceded by *prefix*.

    :param string: the string to normalize
    :param prefix: a string that should be prepended to every line
    :param width: the maximum line width; use `None`, 0, or a negative number
                  to completely disable line wrapping
    """
    wrapping = bool(width and width > 0)
    if not wrapping:
        lines = string.splitlines(True)
    else:
        prefixlen = len(prefix)
        lines = []
        for source_line in string.splitlines(True):
            if len(escape(source_line)) + prefixlen <= width:
                lines.append(source_line)
                continue

            # Too long: re-flow the line chunk by chunk.
            chunks = WORD_SEP.split(source_line)
            total = len(chunks)
            pos = 0
            while pos < total:
                start = pos
                size = 2  # the two enclosing double quotes
                while pos < total:
                    length = len(escape(chunks[pos])) - 2 + prefixlen
                    if size + length >= width:
                        if pos == start:
                            # handle long chunks by putting them on a
                            # separate line
                            pos += 1
                        break
                    size += length
                    pos += 1
                lines.append(''.join(chunks[start:pos]))

    if len(lines) <= 1:
        return escape(string)

    # Remove empty trailing line
    if not lines[-1]:
        lines.pop()
        lines[-1] += '\n'
    return '""\n' + '\n'.join(prefix + escape(line) for line in lines)
475
476
def write_po(
    fileobj: SupportsWrite[bytes],
    catalog: Catalog,
    width: int = 76,
    no_location: bool = False,
    omit_header: bool = False,
    sort_output: bool = False,
    sort_by_file: bool = False,
    ignore_obsolete: bool = False,
    include_previous: bool = False,
    include_lineno: bool = True,
) -> None:
    r"""Serialize a message catalog as a ``gettext`` PO (portable object)
    template file and write it, encoded, to the given file-like object.

    >>> catalog = Catalog()
    >>> catalog.add(u'foo %(name)s', locations=[('main.py', 1)],
    ...             flags=('fuzzy',))
    <Message...>
    >>> catalog.add((u'bar', u'baz'), locations=[('main.py', 3)])
    <Message...>
    >>> from io import BytesIO
    >>> buf = BytesIO()
    >>> write_po(buf, catalog, omit_header=True)
    >>> print(buf.getvalue().decode("utf8"))
    #: main.py:1
    #, fuzzy, python-format
    msgid "foo %(name)s"
    msgstr ""
    <BLANKLINE>
    #: main.py:3
    msgid "bar"
    msgid_plural "baz"
    msgstr[0] ""
    msgstr[1] ""
    <BLANKLINE>
    <BLANKLINE>

    The text comes from `generate_po`; each text chunk is encoded with the
    catalog's charset, using ``backslashreplace`` for characters the charset
    cannot represent.

    :param fileobj: the file-like object to write to
    :param catalog: the `Catalog` instance
    :param width: the maximum line width for the generated output; use `None`,
                  0, or a negative number to completely disable line wrapping
    :param no_location: do not emit a location comment for every message
    :param omit_header: do not include the ``msgid ""`` entry at the top of the
                        output
    :param sort_output: whether to sort the messages in the output by msgid
    :param sort_by_file: whether to sort the messages in the output by their
                         locations
    :param ignore_obsolete: whether to ignore obsolete messages and not include
                            them in the output; by default they are included as
                            comments
    :param include_previous: include the old msgid as a comment when
                             updating the catalog
    :param include_lineno: include line number in the location comment
    """
    # Sorting by message takes precedence when both flags are set.
    if sort_output:
        sort_by = "message"
    elif sort_by_file:
        sort_by = "location"
    else:
        sort_by = None

    chunks = generate_po(
        catalog,
        ignore_obsolete=ignore_obsolete,
        include_lineno=include_lineno,
        include_previous=include_previous,
        no_location=no_location,
        omit_header=omit_header,
        sort_by=sort_by,
        width=width,
    )
    for chunk in chunks:
        data = chunk.encode(catalog.charset, 'backslashreplace') if isinstance(chunk, str) else chunk
        fileobj.write(data)
552
553
def generate_po(
    catalog: Catalog,
    *,
    ignore_obsolete: bool = False,
    include_lineno: bool = True,
    include_previous: bool = False,
    no_location: bool = False,
    omit_header: bool = False,
    sort_by: Literal["message", "location"] | None = None,
    width: int = 76,
) -> Iterable[str]:
    r"""Yield the text chunks of a ``gettext`` PO (portable object) file
    for the given catalog.

    Each message produces its comment lines, its ``msgctxt``/``msgid``/
    ``msgstr`` lines and a blank separator line.  Obsolete messages follow
    at the end with a ``#~ `` prefix unless *ignore_obsolete* is set.

    See `write_po()` for a more detailed description.
    """
    wrapping = bool(width and width > 0)
    # xgettext always wraps comments even if --no-wrap is passed;
    # provide the same behaviour
    comment_width = width if wrapping else 76

    def _comment_lines(text, marker=''):
        # One "#<marker> ..." line per wrapped piece of the comment text.
        for piece in wraptext(text, comment_width):
            yield f"#{marker} {piece.strip()}\n"

    def _entry_lines(entry, prefix=''):
        def _norm(text):
            return normalize(text, prefix=prefix, width=width)

        if entry.context:
            yield f"{prefix}msgctxt {_norm(entry.context)}\n"

        if not isinstance(entry.id, (list, tuple)):
            yield f"{prefix}msgid {_norm(entry.id)}\n"
            yield f"{prefix}msgstr {_norm(entry.string or '')}\n"
            return

        yield f"{prefix}msgid {_norm(entry.id[0])}\n"
        yield f"{prefix}msgid_plural {_norm(entry.id[1])}\n"
        for idx in range(catalog.num_plurals):
            try:
                plural_string = entry.string[idx]
            except IndexError:
                # Fewer translations than plural forms: emit an empty one.
                plural_string = ''
            yield f"{prefix}msgstr[{idx:d}] {_norm(plural_string)}\n"

    def _location_sort_key(loc):
        # Anything that is not a truthy int line number sorts as -1.
        lineno = loc[1]
        return (loc[0], lineno if isinstance(lineno, int) and lineno else -1)

    for message in _sort_messages(catalog, sort_by=sort_by):
        if not message.id:  # This is the header "message"
            if omit_header:
                continue
            comment_header = catalog.header_comment
            if wrapping:
                comment_header = '\n'.join([
                    wrapped
                    for header_line in comment_header.splitlines()
                    for wrapped in wraptext(header_line, width=width, subsequent_indent='# ')
                ])
            yield f"{comment_header}\n"

        for comment in message.user_comments:
            yield from _comment_lines(comment)
        for comment in message.auto_comments:
            yield from _comment_lines(comment, marker='.')

        if not no_location:
            # sort locations by filename and lineno.
            # if there's no <int> as lineno, use `-1`.
            # if no sorting possible, leave unsorted.
            # (see issue #606)
            try:
                locations = sorted(message.locations, key=_location_sort_key)
            except TypeError:  # e.g. "TypeError: unorderable types: NoneType() < int()"
                locations = message.locations

            references = []
            for filename, lineno in locations:
                reference = filename.replace(os.sep, '/')
                if lineno and include_lineno:
                    reference = f"{reference}:{lineno:d}"
                # Keep each reference once, in first-seen order.
                if reference not in references:
                    references.append(reference)
            yield from _comment_lines(' '.join(references), marker=':')

        if message.flags:
            flag_text = ', '.join(['', *sorted(message.flags)])
            yield f"#{flag_text}\n"

        if message.previous_id and include_previous:
            previous_singular = normalize(message.previous_id[0], width=width)
            yield from _comment_lines(f'msgid {previous_singular}', marker='|')
            if len(message.previous_id) > 1:
                previous_plural = normalize(message.previous_id[1], width=width)
                yield from _comment_lines(f'msgid_plural {previous_plural}', marker='|')

        yield from _entry_lines(message)
        yield '\n'

    if ignore_obsolete:
        return

    for message in _sort_messages(catalog.obsolete.values(), sort_by=sort_by):
        for comment in message.user_comments:
            yield from _comment_lines(comment)
        yield from _entry_lines(message, prefix='#~ ')
        yield '\n'
658
659
660def _sort_messages(messages: Iterable[Message], sort_by: Literal["message", "location"] | None) -> list[Message]:
661 """
662 Sort the given message iterable by the given criteria.
663
664 Always returns a list.
665
666 :param messages: An iterable of Messages.
667 :param sort_by: Sort by which criteria? Options are `message` and `location`.
668 :return: list[Message]
669 """
670 messages = list(messages)
671 if sort_by == "message":
672 messages.sort()
673 elif sort_by == "location":
674 messages.sort(key=lambda m: m.locations)
675 return messages