1"""
2babel.messages.pofile
3~~~~~~~~~~~~~~~~~~~~~
4
5Reading and writing of files in the ``gettext`` PO (portable object)
6format.
7
8:copyright: (c) 2013-2025 by the Babel Team.
9:license: BSD, see LICENSE for more details.
10"""
11
12from __future__ import annotations
13
14import os
15import re
16from collections.abc import Iterable
17from typing import TYPE_CHECKING, Literal
18
19from babel.core import Locale
20from babel.messages.catalog import Catalog, Message
21from babel.util import TextWrapper
22
23if TYPE_CHECKING:
24 from typing import IO, AnyStr
25
26 from _typeshed import SupportsWrite
27
28
# Matches a backslash followed by one of the characters that `escape`
# generates an escape sequence for: backslash, ``t``, ``r``, ``n`` or ``"``.
_unescape_re = re.compile(r'\\([\\trn"])')


def unescape(string: str) -> str:
    r"""Undo `escape`: drop the enclosing quotes and decode escape sequences.

    The first and the last character of `string` are always removed; they
    are not checked to actually be double quotes.

    >>> print(unescape('"Say:\\n  \\"hello, world!\\"\\n"'))
    Say:
      "hello, world!"
    <BLANKLINE>

    :param string: the quoted string to unescape
    :return: the unquoted string with escape sequences replaced
    """
    body = string[1:-1]

    # Fast path: without any backslash there is nothing to decode.
    if "\\" not in string:
        return body

    control_chars = {'n': '\n', 't': '\t', 'r': '\r'}

    def decode(match):
        char = match.group(1)
        # A backslash or a double quote stands for itself.
        return control_chars.get(char, char)

    return _unescape_re.sub(decode, body)
57
58
def denormalize(string: str) -> str:
    r"""Turn the output of `normalize` back into the plain string.

    >>> print(denormalize(r'''""
    ... "Say:\n"
    ... "  \"hello, world!\"\n"'''))
    Say:
      "hello, world!"
    <BLANKLINE>

    >>> print(denormalize(r'''""
    ... "Say:\n"
    ... "  \"Lorem ipsum dolor sit "
    ... "amet, consectetur adipisicing"
    ... " elit, \"\n"'''))
    Say:
      "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    <BLANKLINE>

    :param string: the string to denormalize
    :return: the concatenation of the unescaped lines
    """
    # A value on a single line is just one quoted string.
    if '\n' not in string:
        return unescape(string)

    quoted_lines = string.splitlines()
    # When the text starts with ``""`` its first line is dropped before the
    # remaining lines are unescaped and glued together.
    if string.startswith('""'):
        quoted_lines = quoted_lines[1:]
    return ''.join(unescape(quoted) for quoted in quoted_lines)
87
88
89def _extract_locations(line: str) -> list[str]:
90 """Extract locations from location comments.
91
92 Locations are extracted while properly handling First Strong
93 Isolate (U+2068) and Pop Directional Isolate (U+2069), used by
94 gettext to enclose filenames with spaces and tabs in their names.
95 """
96 if "\u2068" not in line and "\u2069" not in line:
97 return line.lstrip().split()
98
99 locations = []
100 location = ""
101 in_filename = False
102 for c in line:
103 if c == "\u2068":
104 if in_filename:
105 raise ValueError(
106 "location comment contains more First Strong Isolate "
107 "characters, than Pop Directional Isolate characters",
108 )
109 in_filename = True
110 continue
111 elif c == "\u2069":
112 if not in_filename:
113 raise ValueError(
114 "location comment contains more Pop Directional Isolate "
115 "characters, than First Strong Isolate characters",
116 )
117 in_filename = False
118 continue
119 elif c == " ":
120 if in_filename:
121 location += c
122 elif location:
123 locations.append(location)
124 location = ""
125 else:
126 location += c
127 else:
128 if location:
129 if in_filename:
130 raise ValueError(
131 "location comment contains more First Strong Isolate "
132 "characters, than Pop Directional Isolate characters",
133 )
134 locations.append(location)
135
136 return locations
137
138
class PoFileError(Exception):
    """Exception raised by `PoFileParser` when an invalid po file is encountered."""

    def __init__(self, message: str, catalog: Catalog, line: str, lineno: int) -> None:
        """
        :param message: description of the problem
        :param catalog: the catalog that was being filled
        :param line: the offending line
        :param lineno: the line number reported by the parser; it is appended
                       to `message` as given
        """
        description = f'{message} on {lineno}'
        super().__init__(description)
        # Keep the context around so callers can inspect what went wrong.
        self.lineno = lineno
        self.line = line
        self.catalog = catalog
147
148
149class _NormalizedString(list):
150 def __init__(self, *args: str) -> None:
151 super().__init__(map(str.strip, args))
152
153 def denormalize(self) -> str:
154 if not self:
155 return ""
156 return ''.join(map(unescape, self))
157
158
class PoFileParser:
    """Support class to read messages from a ``gettext`` PO (portable object) file
    and add them to a `Catalog`.

    The parser is line oriented: `parse` hands every line to a comment or
    message handler, which accumulates the pieces of the current entry on the
    instance. When the next entry starts (or the input ends) the accumulated
    entry is turned into a `Message` and added to the catalog.

    See `read_po` for simple cases.
    """

    def __init__(
        self,
        catalog: Catalog,
        ignore_obsolete: bool = False,
        abort_invalid: bool = False,
    ) -> None:
        """
        :param catalog: the catalog that parsed messages are added to
        :param ignore_obsolete: if true, obsolete (``#~``) messages are not
                                stored in ``catalog.obsolete``
        :param abort_invalid: if true, invalid input raises `PoFileError`;
                              otherwise a warning is printed and parsing
                              continues
        """
        self.catalog = catalog
        self.ignore_obsolete = ignore_obsolete
        # Number of messages handled by `_add_message` so far.
        self.counter = 0
        # Zero-based index of the line holding the current entry's ``msgid``.
        self.offset = 0
        self.abort_invalid = abort_invalid
        self._reset_message_state()

    def _reset_message_state(self) -> None:
        """Clear everything that was accumulated for the current entry."""
        self.messages = []  # msgid and msgid_plural strings
        self.translations = []  # [plural index, string] pairs
        self.locations = []
        self.flags = []
        self.user_comments = []
        self.auto_comments = []
        self.context = None
        self.obsolete = False
        # Which string (if any) a continuation line currently extends.
        self.in_msgid = False
        self.in_msgstr = False
        self.in_msgctxt = False

    def _add_message(self) -> None:
        """
        Add a message to the catalog based on the current parser state and
        clear the state ready to process the next message.
        """
        if len(self.messages) > 1:
            # Plural entry: ids and translations become tuples.
            msgid = tuple(m.denormalize() for m in self.messages)
            string = ['' for _ in range(self.catalog.num_plurals)]
            for idx, translation in sorted(self.translations):
                if idx >= self.catalog.num_plurals:
                    self._invalid_pofile(
                        "",
                        self.offset,
                        "msg has more translations than num_plurals of catalog",
                    )
                    continue
                string[idx] = translation.denormalize()
            string = tuple(string)
        else:
            msgid = self.messages[0].denormalize()
            string = self.translations[0][1].denormalize()
        msgctxt = self.context.denormalize() if self.context else None
        message = Message(
            msgid,
            string,
            self.locations,
            self.flags,
            self.auto_comments,
            self.user_comments,
            # `offset` is zero-based, the message line number is one-based.
            lineno=self.offset + 1,
            context=msgctxt,
        )
        if self.obsolete:
            if not self.ignore_obsolete:
                self.catalog.obsolete[self.catalog._key_for(msgid, msgctxt)] = message
        else:
            self.catalog[msgid] = message
        self.counter += 1
        self._reset_message_state()

    def _finish_current_message(self) -> None:
        """Add the pending entry, if there is one, to the catalog.

        An entry without any ``msgstr`` is reported as invalid and then
        added with an empty translation.
        """
        if self.messages:
            if not self.translations:
                self._invalid_pofile(
                    "",
                    self.offset,
                    f"missing msgstr for msgid '{self.messages[0].denormalize()}'",
                )
                self.translations.append([0, _NormalizedString()])
            self._add_message()

    def _process_message_line(self, lineno, line, obsolete=False) -> None:
        """Dispatch a non-comment line to the keyword or continuation handler.

        :param lineno: zero-based index of the line in the input
        :param line: the line, without the ``#~`` marker for obsolete entries
        :param obsolete: whether the line belongs to an obsolete entry
        """
        if not line:
            return
        if line[0] == '"':
            self._process_string_continuation_line(line, lineno)
        else:
            self._process_keyword_line(lineno, line, obsolete)

    @staticmethod
    def _parse_plural_index(keyword: str) -> int | None:
        """Return the plural index of a ``msgstr`` keyword.

        :param keyword: ``msgstr`` or ``msgstr[N]``
        :return: ``0`` for a plain ``msgstr``, ``N`` for ``msgstr[N]``, or
                 `None` if the bracketed part is not a closed, non-negative
                 integer
        """
        _, has_bracket, idxarg = keyword.partition('[')
        if not has_bracket:
            return 0
        # Without this check the last digit of e.g. ``msgstr[12`` would be
        # cut off and the entry silently stored under the wrong index.
        if not idxarg.endswith(']'):
            return None
        try:
            idx = int(idxarg[:-1])
        except ValueError:
            return None
        # A negative index would address the plural forms from the end.
        return idx if idx >= 0 else None

    def _process_keyword_line(self, lineno, line, obsolete=False) -> None:
        """Handle a line that starts with a PO keyword.

        ``msgid`` and ``msgctxt`` start a new entry and therefore first
        finish the pending one. Lines with an unknown keyword or with a
        malformed ``msgstr[N]`` index are reported through
        `_invalid_pofile`.

        :param lineno: zero-based index of the line in the input
        :param line: the line, starting with the keyword
        :param obsolete: whether the line belongs to an obsolete entry
        """
        keyword, _, arg = line.partition(' ')

        if keyword in ['msgid', 'msgctxt']:
            self._finish_current_message()

        self.obsolete = obsolete

        # The line that has the msgid is stored as the offset of the msg
        # should this be the msgctxt if it has one?
        if keyword == 'msgid':
            self.offset = lineno

        if keyword in ['msgid', 'msgid_plural']:
            self.in_msgctxt = False
            self.in_msgid = True
            self.messages.append(_NormalizedString(arg))
            return

        if keyword == 'msgctxt':
            self.in_msgctxt = True
            self.context = _NormalizedString(arg)
            return

        if keyword == 'msgstr' or keyword.startswith('msgstr['):
            idx = self._parse_plural_index(keyword)
            if idx is None:
                # The line is dropped; make sure following continuation
                # lines are not attached to some other string.
                self.in_msgid = False
                self.in_msgstr = False
                self.in_msgctxt = False
                self._invalid_pofile(line, lineno, "Invalid plural index in msgstr keyword")
                return
            self.in_msgid = False
            self.in_msgstr = True
            s = _NormalizedString(arg) if arg != '""' else _NormalizedString()
            self.translations.append([idx, s])
            return

        self._invalid_pofile(line, lineno, "Unknown or misformatted keyword")

    def _process_string_continuation_line(self, line, lineno) -> None:
        """Append a line starting with ``"`` to the string being read.

        :param line: the continuation line
        :param lineno: zero-based index of the line in the input
        """
        if self.in_msgid:
            s = self.messages[-1]
        elif self.in_msgstr:
            s = self.translations[-1][1]
        elif self.in_msgctxt:
            s = self.context
        else:
            self._invalid_pofile(
                line,
                lineno,
                "Got line starting with \" but not in msgid, msgstr or msgctxt",
            )
            return
        # `_NormalizedString` only strips the items passed to its constructor,
        # so items appended later are stripped here.
        s.append(line.strip())

    def _process_comment(self, line) -> None:
        """Handle a comment line (anything starting with ``#`` except ``#~``).

        A comment belongs to the entry that follows it, so the pending entry
        is finished first.

        :param line: the complete comment line
        :raises ValueError: if a location comment has unbalanced isolate
                            characters (see `_extract_locations`)
        """
        self._finish_current_message()

        prefix = line[:2]
        if prefix == '#:':
            for location in _extract_locations(line[2:]):
                a, colon, b = location.rpartition(':')
                if colon:
                    try:
                        self.locations.append((a, int(b)))
                    except ValueError:
                        # Locations whose line number is not an integer
                        # are skipped.
                        continue
                else:  # No line number specified
                    self.locations.append((location, None))
            return

        if prefix == '#,':
            self.flags.extend(flag.strip() for flag in line[2:].lstrip().split(','))
            return

        if prefix == '#.':
            # These are called auto-comments
            comment = line[2:].strip()
            if comment:  # Just check that we're not adding empty comments
                self.auto_comments.append(comment)
            return

        # These are called user comments
        self.user_comments.append(line[1:].strip())

    def parse(self, fileobj: IO[AnyStr] | Iterable[AnyStr]) -> None:
        """
        Reads from the file-like object (or iterable of string-likes) `fileobj`
        and adds any po file units found in it to the `Catalog`
        supplied to the constructor.

        All of the items in the iterable must be the same type; either `str`
        or `bytes` (decoded with the catalog charset), but not a mixture.
        """
        needs_decode = None

        for lineno, line in enumerate(fileobj):
            line = line.strip()
            if needs_decode is None:
                # If we don't yet know whether we need to decode,
                # let's find out now.
                needs_decode = not isinstance(line, str)
            if not line:
                continue
            if needs_decode:
                line = line.decode(self.catalog.charset)
            if line[0] == '#':
                if line[:2] == '#~':
                    # Obsolete entry: parse what follows the marker like a
                    # regular message line.
                    self._process_message_line(lineno, line[2:].lstrip(), obsolete=True)
                else:
                    try:
                        self._process_comment(line)
                    except ValueError as exc:
                        self._invalid_pofile(line, lineno, str(exc))
            else:
                self._process_message_line(lineno, line)

        self._finish_current_message()

        # No actual messages found, but there was some info in comments, from which
        # we'll construct an empty header message
        if not self.counter and (self.flags or self.user_comments or self.auto_comments):
            self.messages.append(_NormalizedString())
            self.translations.append([0, _NormalizedString()])
            self._add_message()

    def _invalid_pofile(self, line, lineno, msg) -> None:
        """Report invalid input.

        :param line: the offending line (may be empty)
        :param lineno: zero-based index of the offending line
        :param msg: description of the problem
        :raises PoFileError: if the parser was created with ``abort_invalid``;
                             otherwise a warning is printed instead
        """
        assert isinstance(line, str)
        if self.abort_invalid:
            raise PoFileError(msg, self.catalog, line, lineno)
        print("WARNING:", msg)
        print(f"WARNING: Problem on line {lineno + 1}: {line!r}")
380
381
def read_po(
    fileobj: IO[AnyStr] | Iterable[AnyStr],
    locale: Locale | str | None = None,
    domain: str | None = None,
    ignore_obsolete: bool = False,
    charset: str | None = None,
    abort_invalid: bool = False,
) -> Catalog:
    """Parse a ``gettext`` PO (portable object) file, given as a file-like
    object or as an iterable of lines, into a new `Catalog`.

    >>> from datetime import datetime
    >>> from io import StringIO
    >>> buf = StringIO('''
    ... #: main.py:1
    ... #, fuzzy, python-format
    ... msgid "foo %(name)s"
    ... msgstr "quux %(name)s"
    ...
    ... # A user comment
    ... #. An auto comment
    ... #: main.py:3
    ... msgid "bar"
    ... msgid_plural "baz"
    ... msgstr[0] "bar"
    ... msgstr[1] "baaz"
    ... ''')
    >>> catalog = read_po(buf)
    >>> catalog.revision_date = datetime(2007, 4, 1)

    >>> for message in catalog:
    ...     if message.id:
    ...         print((message.id, message.string))
    ...         print(' ', (message.locations, sorted(list(message.flags))))
    ...         print(' ', (message.user_comments, message.auto_comments))
    ('foo %(name)s', 'quux %(name)s')
      ([('main.py', 1)], ['fuzzy', 'python-format'])
      ([], [])
    (('bar', 'baz'), ('bar', 'baaz'))
      ([('main.py', 3)], [])
      (['A user comment'], ['An auto comment'])

    .. versionadded:: 1.0
       Added support for explicit charset argument.

    :param fileobj: the file-like object (or iterable of lines) to read the PO file from
    :param locale: the locale identifier or `Locale` object, or `None`
                   if the catalog is not bound to a locale (which basically
                   means it's a template)
    :param domain: the message domain
    :param ignore_obsolete: whether to ignore obsolete messages in the input
    :param charset: the character set of the catalog.
    :param abort_invalid: abort read if po file is invalid
    :return: the catalog filled with the messages that were read
    """
    parser = PoFileParser(
        Catalog(locale=locale, domain=domain, charset=charset),
        ignore_obsolete,
        abort_invalid=abort_invalid,
    )
    parser.parse(fileobj)
    # The parser fills the very catalog object it was constructed with.
    return parser.catalog
440
441
# Pattern `normalize` splits over-long lines with to obtain the chunks it
# may wrap at. The whole pattern is one capturing group, so `re.split`
# returns the separators as chunks too and no text is lost when the chunks
# are joined again.
WORD_SEP = re.compile(
    '('
    r'\s+|'  # any whitespace
    r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|'  # hyphenated words
    r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w)'  # em-dash
    ')',
)
449
450
def escape(string: str) -> str:
    r"""Quote `string` for use as a double-quoted string in a ``PO`` file.

    Backslashes, tabs, carriage returns, newlines and double quotes are
    replaced by their backslash escape sequences and the result is wrapped
    in double quotes.

    >>> escape('''Say:
    ...   "hello, world!"
    ... ''')
    '"Say:\\n  \\"hello, world!\\"\\n"'

    :param string: the string to escape
    :return: the escaped string, including the enclosing quotes
    """
    # The backslash has to be handled first, otherwise the backslashes
    # introduced by the other replacements would be doubled as well.
    replacements = (
        ('\\', '\\\\'),
        ('\t', '\\t'),
        ('\r', '\\r'),
        ('\n', '\\n'),
        ('"', '\\"'),
    )
    escaped = string
    for raw, sequence in replacements:
        escaped = escaped.replace(raw, sequence)
    return '"%s"' % escaped
466
467
def normalize(string: str, prefix: str = '', width: int = 76) -> str:
    r"""Render a string the way it is written to .po files.

    A string that ends up as a single line is simply escaped. Otherwise the
    result starts with an empty ``""`` line followed by one escaped line per
    piece, each preceded by `prefix`.

    >>> print(normalize('''Say:
    ...   "hello, world!"
    ... ''', width=None))
    ""
    "Say:\n"
    "  \"hello, world!\"\n"

    >>> print(normalize('''Say:
    ...   "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    ... ''', width=32))
    ""
    "Say:\n"
    "  \"Lorem ipsum dolor sit "
    "amet, consectetur adipisicing"
    " elit, \"\n"

    :param string: the string to normalize
    :param prefix: a string that should be prepended to every line
    :param width: the maximum line width; use `None`, 0, or a negative number
                  to completely disable line wrapping
    """
    if not (width and width > 0):
        # Wrapping disabled: only break at the newlines of the input.
        pieces = string.splitlines(True)
    else:
        prefix_len = len(prefix)
        pieces = []
        for raw_line in string.splitlines(True):
            if len(escape(raw_line)) + prefix_len <= width:
                pieces.append(raw_line)
                continue

            # Too long: refill output lines chunk by chunk. The chunks are
            # reversed so the next one can be taken with ``pop()``.
            pending = WORD_SEP.split(raw_line)
            pending.reverse()
            while pending:
                current = []
                used = 2  # the two enclosing quotes
                while pending:
                    cost = len(escape(pending[-1])) - 2 + prefix_len
                    if used + cost >= width:
                        if not current:
                            # handle long chunks by putting them on a
                            # separate line
                            current.append(pending.pop())
                        break
                    current.append(pending.pop())
                    used += cost
                pieces.append(''.join(current))

    if len(pieces) <= 1:
        return escape(string)

    # Remove empty trailing line
    if pieces and not pieces[-1]:
        del pieces[-1]
        pieces[-1] += '\n'
    return '""\n' + '\n'.join(prefix + escape(piece) for piece in pieces)
527
528
529def _enclose_filename_if_necessary(filename: str) -> str:
530 """Enclose filenames which include white spaces or tabs.
531
532 Do the same as gettext and enclose filenames which contain white
533 spaces or tabs with First Strong Isolate (U+2068) and Pop
534 Directional Isolate (U+2069).
535 """
536 if " " not in filename and "\t" not in filename:
537 return filename
538
539 if not filename.startswith("\u2068"):
540 filename = "\u2068" + filename
541 if not filename.endswith("\u2069"):
542 filename += "\u2069"
543 return filename
544
545
def write_po(
    fileobj: SupportsWrite[bytes],
    catalog: Catalog,
    width: int = 76,
    no_location: bool = False,
    omit_header: bool = False,
    sort_output: bool = False,
    sort_by_file: bool = False,
    ignore_obsolete: bool = False,
    include_previous: bool = False,
    include_lineno: bool = True,
) -> None:
    r"""Serialize a message catalog as a ``gettext`` PO (portable object)
    template file and write it, encoded with the catalog charset, to the
    provided file-like object.

    >>> catalog = Catalog()
    >>> catalog.add('foo %(name)s', locations=[('main.py', 1)],
    ...             flags=('fuzzy',))
    <Message...>
    >>> catalog.add(('bar', 'baz'), locations=[('main.py', 3)])
    <Message...>
    >>> from io import BytesIO
    >>> buf = BytesIO()
    >>> write_po(buf, catalog, omit_header=True)
    >>> print(buf.getvalue().decode("utf8"))
    #: main.py:1
    #, fuzzy, python-format
    msgid "foo %(name)s"
    msgstr ""
    <BLANKLINE>
    #: main.py:3
    msgid "bar"
    msgid_plural "baz"
    msgstr[0] ""
    msgstr[1] ""
    <BLANKLINE>
    <BLANKLINE>

    :param fileobj: the file-like object to write to
    :param catalog: the `Catalog` instance
    :param width: the maximum line width for the generated output; use `None`,
                  0, or a negative number to completely disable line wrapping
    :param no_location: do not emit a location comment for every message
    :param omit_header: do not include the ``msgid ""`` entry at the top of the
                        output
    :param sort_output: whether to sort the messages in the output by msgid
    :param sort_by_file: whether to sort the messages in the output by their
                         locations
    :param ignore_obsolete: whether to ignore obsolete messages and not include
                            them in the output; by default they are included as
                            comments
    :param include_previous: include the old msgid as a comment when
                             updating the catalog
    :param include_lineno: include line number in the location comment
    """
    # Sorting by message takes precedence when both flags are given.
    sort_by = "message" if sort_output else ("location" if sort_by_file else None)

    po_lines = generate_po(
        catalog,
        ignore_obsolete=ignore_obsolete,
        include_lineno=include_lineno,
        include_previous=include_previous,
        no_location=no_location,
        omit_header=omit_header,
        sort_by=sort_by,
        width=width,
    )
    for chunk in po_lines:
        # Characters the charset cannot represent are written as backslash
        # escapes instead of failing.
        data = chunk.encode(catalog.charset, 'backslashreplace') if isinstance(chunk, str) else chunk
        fileobj.write(data)
621
622
def generate_po(
    catalog: Catalog,
    *,
    ignore_obsolete: bool = False,
    include_lineno: bool = True,
    include_previous: bool = False,
    no_location: bool = False,
    omit_header: bool = False,
    sort_by: Literal["message", "location"] | None = None,
    width: int = 76,
) -> Iterable[str]:
    r"""Generate the text of a ``gettext`` PO (portable object) file piece by piece.

    The regular messages of the catalog come first, each followed by an
    empty line; unless `ignore_obsolete` is set, the obsolete messages
    follow with every message line prefixed by ``#~``.

    See `write_po()` for a more detailed description.
    """
    wrap_enabled = bool(width and width > 0)

    # xgettext always wraps comments even if --no-wrap is passed;
    # provide the same behaviour
    comment_wrapper = TextWrapper(width=width if wrap_enabled else 76, break_long_words=False)
    header_wrapper = TextWrapper(width=width, subsequent_indent="# ", break_long_words=False)

    def _norm(text, prefix=''):
        # `normalize` with the width of this call baked in.
        return normalize(text, prefix=prefix, width=width)

    def _comment_lines(text, marker=''):
        # One ``#<marker> ...`` line per wrapped line of the comment.
        for wrapped in comment_wrapper.wrap(text):
            yield f"#{marker} {wrapped.strip()}\n"

    def _message_lines(message, prefix=''):
        # The msgctxt / msgid / msgstr lines of a message.
        if message.context:
            yield f"{prefix}msgctxt {_norm(message.context, prefix)}\n"

        if not isinstance(message.id, (list, tuple)):
            yield f"{prefix}msgid {_norm(message.id, prefix)}\n"
            yield f"{prefix}msgstr {_norm(message.string or '', prefix)}\n"
            return

        yield f"{prefix}msgid {_norm(message.id[0], prefix)}\n"
        yield f"{prefix}msgid_plural {_norm(message.id[1], prefix)}\n"
        for idx in range(catalog.num_plurals):
            try:
                translated = message.string[idx]
            except IndexError:
                # Fewer translations than plural forms: emit an empty one.
                translated = ''
            yield f"{prefix}msgstr[{idx:d}] {_norm(translated, prefix)}\n"

    def _location_sort_key(entry):
        # sort locations by filename and lineno.
        # if there's no <int> as lineno, use `-1`.
        lineno = entry[1]
        return entry[0], (lineno if isinstance(lineno, int) and lineno else -1)

    for message in _sort_messages(catalog, sort_by=sort_by):
        if not message.id:  # This is the header "message"
            if omit_header:
                continue
            header = catalog.header_comment
            if wrap_enabled:
                header = '\n'.join(
                    wrapped
                    for raw in header.splitlines()
                    for wrapped in header_wrapper.wrap(raw)
                )
            yield f"{header}\n"

        for comment in message.user_comments:
            yield from _comment_lines(comment)
        for comment in message.auto_comments:
            yield from _comment_lines(comment, '.')

        if not no_location:
            # if no sorting possible, leave unsorted.
            # (see issue #606)
            try:
                ordered = sorted(message.locations, key=_location_sort_key)
            except TypeError:  # e.g. "TypeError: unorderable types: NoneType() < int()"
                ordered = message.locations

            rendered = []
            for filename, lineno in ordered:
                entry = _enclose_filename_if_necessary(filename.replace(os.sep, '/'))
                if lineno and include_lineno:
                    entry = f"{entry}:{lineno:d}"
                rendered.append(entry)
            # `dict.fromkeys` drops duplicates and keeps the first occurrence
            # of every location in place.
            yield from _comment_lines(' '.join(dict.fromkeys(rendered)), ':')

        if message.flags:
            yield f"#, {', '.join(sorted(message.flags))}\n"

        if message.previous_id and include_previous:
            yield from _comment_lines(f'msgid {_norm(message.previous_id[0])}', '|')
            if len(message.previous_id) > 1:
                yield from _comment_lines(f'msgid_plural {_norm(message.previous_id[1])}', '|')

        yield from _message_lines(message)
        yield '\n'

    if ignore_obsolete:
        return

    for message in _sort_messages(catalog.obsolete.values(), sort_by=sort_by):
        for comment in message.user_comments:
            yield from _comment_lines(comment)
        yield from _message_lines(message, prefix='#~ ')
        yield '\n'
732
733
734def _sort_messages(
735 messages: Iterable[Message],
736 sort_by: Literal["message", "location"] | None,
737) -> list[Message]:
738 """
739 Sort the given message iterable by the given criteria.
740
741 Always returns a list.
742
743 :param messages: An iterable of Messages.
744 :param sort_by: Sort by which criteria? Options are `message` and `location`.
745 :return: list[Message]
746 """
747 messages = list(messages)
748 if sort_by == "message":
749 messages.sort()
750 elif sort_by == "location":
751 messages.sort(key=lambda m: m.locations)
752 return messages