1"""
2 babel.messages.pofile
3 ~~~~~~~~~~~~~~~~~~~~~
4
5 Reading and writing of files in the ``gettext`` PO (portable object)
6 format.
7
8 :copyright: (c) 2013-2025 by the Babel Team.
9 :license: BSD, see LICENSE for more details.
10"""
11from __future__ import annotations
12
13import os
14import re
15from collections.abc import Iterable
16from typing import TYPE_CHECKING, Literal
17
18from babel.core import Locale
19from babel.messages.catalog import Catalog, Message
20from babel.util import TextWrapper
21
22if TYPE_CHECKING:
23 from typing import IO, AnyStr
24
25 from _typeshed import SupportsWrite
26
27
# Matches a single backslash escape sequence; the escaped character
# (backslash, ``t``, ``r``, ``n`` or a double quote) is captured in group 1.
_unescape_re = re.compile(r'\\([\\trn"])')


def unescape(string: str) -> str:
    r"""Undo `escape`: drop the enclosing quotes and decode escape sequences.

    >>> print(unescape('"Say:\\n \\"hello, world!\\"\\n"'))
    Say:
     "hello, world!"
    <BLANKLINE>

    :param string: the quoted string to unescape; its first and last
                   characters are discarded without being inspected
    :return: the unescaped text
    """
    inner = string[1:-1]
    if "\\" not in string:
        # Nothing is escaped, so removing the quotes is all there is to do.
        return inner

    control_chars = {'n': '\n', 't': '\t', 'r': '\r'}

    def decode(match):
        char = match.group(1)
        # A backslash or a double quote simply stands for itself.
        return control_chars.get(char, char)

    return _unescape_re.sub(decode, inner)
55
56
def denormalize(string: str) -> str:
    r"""Turn the output of `normalize` back into the plain string.

    >>> print(denormalize(r'''""
    ... "Say:\n"
    ... " \"hello, world!\"\n"'''))
    Say:
     "hello, world!"
    <BLANKLINE>

    >>> print(denormalize(r'''""
    ... "Say:\n"
    ... " \"Lorem ipsum dolor sit "
    ... "amet, consectetur adipisicing"
    ... " elit, \"\n"'''))
    Say:
     "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    <BLANKLINE>

    :param string: the string to denormalize
    :return: the unescaped text, with all continuation lines joined
    """
    if '\n' not in string:
        # Single quoted string: nothing to join.
        return unescape(string)

    quoted_lines = string.splitlines()
    if string.startswith('""'):
        # The first line is skipped whenever the input starts with ``""``.
        quoted_lines = quoted_lines[1:]
    return ''.join(unescape(quoted) for quoted in quoted_lines)
85
86
87def _extract_locations(line: str) -> list[str]:
88 """Extract locations from location comments.
89
90 Locations are extracted while properly handling First Strong
91 Isolate (U+2068) and Pop Directional Isolate (U+2069), used by
92 gettext to enclose filenames with spaces and tabs in their names.
93 """
94 if "\u2068" not in line and "\u2069" not in line:
95 return line.lstrip().split()
96
97 locations = []
98 location = ""
99 in_filename = False
100 for c in line:
101 if c == "\u2068":
102 if in_filename:
103 raise ValueError("location comment contains more First Strong Isolate "
104 "characters, than Pop Directional Isolate characters")
105 in_filename = True
106 continue
107 elif c == "\u2069":
108 if not in_filename:
109 raise ValueError("location comment contains more Pop Directional Isolate "
110 "characters, than First Strong Isolate characters")
111 in_filename = False
112 continue
113 elif c == " ":
114 if in_filename:
115 location += c
116 elif location:
117 locations.append(location)
118 location = ""
119 else:
120 location += c
121 else:
122 if location:
123 if in_filename:
124 raise ValueError("location comment contains more First Strong Isolate "
125 "characters, than Pop Directional Isolate characters")
126 locations.append(location)
127
128 return locations
129
130
class PoFileError(Exception):
    """Error describing invalid content found while parsing a PO file.

    The exception text is ``"<message> on <lineno>"``; the individual pieces
    are also kept as attributes.
    """

    def __init__(self, message: str, catalog: Catalog, line: str, lineno: int) -> None:
        """
        :param message: description of the problem
        :param catalog: the catalog that was being filled
        :param line: the offending line
        :param lineno: the line number, exactly as supplied by the caller
        """
        self.catalog = catalog
        self.line = line
        self.lineno = lineno
        super().__init__(f'{message} on {lineno}')
139
140
141class _NormalizedString(list):
142 def __init__(self, *args: str) -> None:
143 super().__init__(map(str.strip, args))
144
145 def denormalize(self) -> str:
146 if not self:
147 return ""
148 return ''.join(map(unescape, self))
149
150
class PoFileParser:
    """Support class to read messages from a ``gettext`` PO (portable object) file
    and add them to a `Catalog`

    See `read_po` for simple cases.

    The parser is line driven: `parse` hands every non-empty line to one of
    the ``_process_*`` methods, which collect the pieces of the current
    message on the instance.  The collected pieces are turned into a
    `Message` when the next entry starts, when a comment is seen, or when the
    input ends.
    """

    def __init__(self, catalog: Catalog, ignore_obsolete: bool = False, abort_invalid: bool = False) -> None:
        """
        :param catalog: the catalog parsed messages are added to
        :param ignore_obsolete: whether to discard obsolete (``#~``) messages
                                instead of storing them in ``catalog.obsolete``
        :param abort_invalid: whether to raise `PoFileError` for invalid
                              input instead of printing a warning
        """
        self.catalog = catalog
        self.ignore_obsolete = ignore_obsolete
        self.counter = 0  # number of messages processed by `_add_message`
        self.offset = 0  # zero-based index of the line holding the current msgid
        self.abort_invalid = abort_invalid
        self._reset_message_state()

    def _reset_message_state(self) -> None:
        """Forget everything collected for the message being parsed."""
        self.messages = []  # msgid / msgid_plural strings
        self.translations = []  # ``[index, string]`` pairs from msgstr lines
        self.locations = []
        self.flags = []
        self.user_comments = []
        self.auto_comments = []
        self.context = None
        self.obsolete = False
        # Which string continuation lines currently belong to.
        self.in_msgid = False
        self.in_msgstr = False
        self.in_msgctxt = False

    def _add_message(self) -> None:
        """
        Add a message to the catalog based on the current parser state and
        clear the state ready to process the next message.
        """
        if len(self.messages) > 1:
            # Plural message: ids and translations are stored as tuples.
            msgid = tuple(m.denormalize() for m in self.messages)
            string = [''] * self.catalog.num_plurals
            for idx, translation in sorted(self.translations):
                if idx >= self.catalog.num_plurals:
                    self._invalid_pofile("", self.offset, "msg has more translations than num_plurals of catalog")
                    continue
                string[idx] = translation.denormalize()
            string = tuple(string)
        else:
            msgid = self.messages[0].denormalize()
            string = self.translations[0][1].denormalize()
        msgctxt = self.context.denormalize() if self.context else None
        message = Message(msgid, string, self.locations, self.flags,
                          self.auto_comments, self.user_comments, lineno=self.offset + 1,
                          context=msgctxt)
        if self.obsolete:
            if not self.ignore_obsolete:
                self.catalog.obsolete[self.catalog._key_for(msgid, msgctxt)] = message
        else:
            self.catalog[msgid] = message
        self.counter += 1
        self._reset_message_state()

    def _finish_current_message(self) -> None:
        """Flush the message collected so far, if there is one."""
        if self.messages:
            if not self.translations:
                self._invalid_pofile("", self.offset, f"missing msgstr for msgid '{self.messages[0].denormalize()}'")
                # Continue with an empty translation when not aborting.
                self.translations.append([0, _NormalizedString()])
            self._add_message()

    def _process_message_line(self, lineno, line, obsolete=False) -> None:
        """Dispatch a non-comment line to the keyword or continuation handler.

        :param lineno: zero-based index of the line in the input
        :param line: the stripped line (for obsolete entries, without ``#~``)
        :param obsolete: whether the line came from an obsolete entry
        """
        if not line:
            return
        if line[0] == '"':
            self._process_string_continuation_line(line, lineno)
        else:
            self._process_keyword_line(lineno, line, obsolete)

    @staticmethod
    def _parse_plural_index(idxarg: str) -> int | None:
        """Return the index from the ``N]`` tail of a ``msgstr[N]`` keyword.

        `None` is returned when the closing bracket is missing, the index is
        not an integer, or the index is negative.
        """
        if not idxarg.endswith(']'):
            return None
        try:
            idx = int(idxarg[:-1])
        except ValueError:
            return None
        return idx if idx >= 0 else None

    def _process_keyword_line(self, lineno, line, obsolete=False) -> None:
        """Handle a line starting with ``msgid``, ``msgctxt``, ``msgstr`` etc.

        Anything else, including a ``msgstr[...]`` keyword with a malformed
        index, is reported through `_invalid_pofile`.
        """
        keyword, _, arg = line.partition(' ')

        if keyword in ['msgid', 'msgctxt']:
            # A new entry begins, so the previous one is complete.
            self._finish_current_message()

        self.obsolete = obsolete

        # The line that has the msgid is stored as the offset of the msg
        # should this be the msgctxt if it has one?
        if keyword == 'msgid':
            self.offset = lineno

        if keyword in ['msgid', 'msgid_plural']:
            self.in_msgctxt = False
            self.in_msgid = True
            self.messages.append(_NormalizedString(arg))
            return

        if keyword == 'msgctxt':
            self.in_msgctxt = True
            self.context = _NormalizedString(arg)
            return

        if keyword == 'msgstr' or keyword.startswith('msgstr['):
            _, has_bracket, idxarg = keyword.partition('[')
            idx = self._parse_plural_index(idxarg) if has_bracket else 0
            if idx is None:
                # Stop tracking any string so that continuation lines of the
                # rejected entry are not attached to something else.
                self.in_msgid = self.in_msgstr = self.in_msgctxt = False
                self._invalid_pofile(line, lineno, "Unknown or misformatted keyword")
                return
            self.in_msgid = False
            self.in_msgstr = True
            s = _NormalizedString(arg) if arg != '""' else _NormalizedString()
            self.translations.append([idx, s])
            return

        self._invalid_pofile(line, lineno, "Unknown or misformatted keyword")

    def _process_string_continuation_line(self, line, lineno) -> None:
        """Append a quoted continuation line to the string being collected."""
        if self.in_msgid:
            s = self.messages[-1]
        elif self.in_msgstr:
            s = self.translations[-1][1]
        elif self.in_msgctxt:
            s = self.context
        else:
            self._invalid_pofile(line, lineno, "Got line starting with \" but not in msgid, msgstr or msgctxt")
            return
        # `list.append` bypasses the stripping done by the
        # `_NormalizedString` constructor, so strip here.
        s.append(line.strip())

    def _process_comment(self, line) -> None:
        """Handle a comment line (anything starting with ``#`` except ``#~``).

        :raises ValueError: propagated from `_extract_locations` for a
                            malformed location comment
        """
        # A comment always belongs to the next message.
        self._finish_current_message()

        prefix = line[:2]
        if prefix == '#:':
            for location in _extract_locations(line[2:]):
                a, colon, b = location.rpartition(':')
                if colon:
                    try:
                        self.locations.append((a, int(b)))
                    except ValueError:
                        # Text after the last colon is not a number: skip it.
                        continue
                else:  # No line number specified
                    self.locations.append((location, None))
            return

        if prefix == '#,':
            self.flags.extend(flag.strip() for flag in line[2:].lstrip().split(','))
            return

        if prefix == '#.':
            # These are called auto-comments
            comment = line[2:].strip()
            if comment:  # Just check that we're not adding empty comments
                self.auto_comments.append(comment)
            return

        # These are called user comments
        self.user_comments.append(line[1:].strip())

    def parse(self, fileobj: IO[AnyStr] | Iterable[AnyStr]) -> None:
        """
        Reads from the file-like object (or iterable of string-likes) `fileobj`
        and adds any po file units found in it to the `Catalog`
        supplied to the constructor.

        All of the items in the iterable must be the same type; either `str`
        or `bytes` (decoded with the catalog charset), but not a mixture.
        """
        needs_decode = None

        for lineno, line in enumerate(fileobj):
            line = line.strip()
            if needs_decode is None:
                # If we don't yet know whether we need to decode,
                # let's find out now.
                needs_decode = not isinstance(line, str)
            if not line:
                continue
            if needs_decode:
                line = line.decode(self.catalog.charset)
            if line[0] == '#':
                if line[:2] == '#~':
                    # Obsolete entry: parse what follows the marker as a
                    # regular message line.
                    self._process_message_line(lineno, line[2:].lstrip(), obsolete=True)
                else:
                    try:
                        self._process_comment(line)
                    except ValueError as exc:
                        self._invalid_pofile(line, lineno, str(exc))
            else:
                self._process_message_line(lineno, line)

        self._finish_current_message()

        # No actual messages found, but there was some info in comments, from which
        # we'll construct an empty header message
        if not self.counter and (self.flags or self.user_comments or self.auto_comments):
            self.messages.append(_NormalizedString())
            self.translations.append([0, _NormalizedString()])
            self._add_message()

    def _invalid_pofile(self, line, lineno, msg) -> None:
        """Report invalid input.

        Raises `PoFileError` when ``abort_invalid`` is set; otherwise prints
        two warning lines and returns.

        :param line: the offending line (may be empty)
        :param lineno: zero-based line index; the warning shows it one-based
        :param msg: description of the problem
        """
        assert isinstance(line, str)
        if self.abort_invalid:
            raise PoFileError(msg, self.catalog, line, lineno)
        print("WARNING:", msg)
        print(f"WARNING: Problem on line {lineno + 1}: {line!r}")
348
349
def read_po(
    fileobj: IO[AnyStr] | Iterable[AnyStr],
    locale: Locale | str | None = None,
    domain: str | None = None,
    ignore_obsolete: bool = False,
    charset: str | None = None,
    abort_invalid: bool = False,
) -> Catalog:
    """Parse a ``gettext`` PO (portable object) file into a new `Catalog`.

    The input may be a file-like object or any iterable of lines.

    >>> from datetime import datetime
    >>> from io import StringIO
    >>> buf = StringIO('''
    ... #: main.py:1
    ... #, fuzzy, python-format
    ... msgid "foo %(name)s"
    ... msgstr "quux %(name)s"
    ...
    ... # A user comment
    ... #. An auto comment
    ... #: main.py:3
    ... msgid "bar"
    ... msgid_plural "baz"
    ... msgstr[0] "bar"
    ... msgstr[1] "baaz"
    ... ''')
    >>> catalog = read_po(buf)
    >>> catalog.revision_date = datetime(2007, 4, 1)

    >>> for message in catalog:
    ...     if message.id:
    ...         print((message.id, message.string))
    ...         print(' ', (message.locations, sorted(list(message.flags))))
    ...         print(' ', (message.user_comments, message.auto_comments))
    ('foo %(name)s', 'quux %(name)s')
      ([('main.py', 1)], ['fuzzy', 'python-format'])
      ([], [])
    (('bar', 'baz'), ('bar', 'baaz'))
      ([('main.py', 3)], [])
      (['A user comment'], ['An auto comment'])

    .. versionadded:: 1.0
       Added support for explicit charset argument.

    :param fileobj: the file-like object (or iterable of lines) to read the PO file from
    :param locale: the locale identifier or `Locale` object, or `None`
                   if the catalog is not bound to a locale (which basically
                   means it's a template)
    :param domain: the message domain
    :param ignore_obsolete: whether to ignore obsolete messages in the input
    :param charset: the character set of the catalog.
    :param abort_invalid: abort read if po file is invalid
    :return: the catalog holding the messages that were read
    """
    result = Catalog(locale=locale, domain=domain, charset=charset)
    PoFileParser(
        result,
        ignore_obsolete=ignore_obsolete,
        abort_invalid=abort_invalid,
    ).parse(fileobj)
    return result
408
409
# Pattern `normalize` uses to split a line into chunks at which it may be
# wrapped.  The whole pattern is a single capturing group, so `re.split`
# keeps the separators in its result and no text is lost.
WORD_SEP = re.compile('('
                      r'\s+|'                                 # any whitespace
                      r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|'  # hyphenated words
                      r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w)'   # em-dash
                      ')')
415
416
# Maps every character that must be escaped inside a double-quoted PO string
# to its backslash escape sequence.
_ESCAPE_TABLE = str.maketrans({
    '\\': '\\\\',
    '\t': '\\t',
    '\r': '\\r',
    '\n': '\\n',
    '"': '\\"',
})


def escape(string: str) -> str:
    r"""Quote the given string for use in a ``PO`` file.

    Backslashes, tabs, carriage returns, newlines and double quotes are
    replaced by their escape sequences and the result is wrapped in double
    quotes.

    >>> escape('''Say:
    ...  "hello, world!"
    ... ''')
    '"Say:\\n \\"hello, world!\\"\\n"'

    :param string: the string to escape
    :return: the escaped string, including the enclosing double quotes
    """
    escaped = string.translate(_ESCAPE_TABLE)
    return '"' + escaped + '"'
433
434
def normalize(string: str, prefix: str = '', width: int | None = 76) -> str:
    r"""Convert a string into a format that is appropriate for .po files.

    A string that ends up on a single line is simply escaped.  Otherwise the
    result starts with an empty ``""`` line followed by one quoted line per
    piece, each preceded by `prefix`.

    >>> print(normalize('''Say:
    ...  "hello, world!"
    ... ''', width=None))
    ""
    "Say:\n"
    " \"hello, world!\"\n"

    >>> print(normalize('''Say:
    ...  "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    ... ''', width=32))
    ""
    "Say:\n"
    " \"Lorem ipsum dolor sit "
    "amet, consectetur adipisicing"
    " elit, \"\n"

    :param string: the string to normalize
    :param prefix: a string that should be prepended to every line
    :param width: the maximum line width; use `None`, 0, or a negative number
                  to completely disable line wrapping
    :return: the escaped (and possibly wrapped) representation of `string`
    """
    if width and width > 0:
        prefixlen = len(prefix)
        lines = []
        for line in string.splitlines(True):
            if len(escape(line)) + prefixlen > width:
                # Reversed so that `pop()` hands out the chunks in order.
                chunks = WORD_SEP.split(line)
                chunks.reverse()
                while chunks:
                    buf = []
                    size = 2  # the two enclosing quote characters
                    while chunks:
                        # NOTE: the prefix length is added for every chunk,
                        # so prefixed lines wrap earlier than strictly
                        # needed; kept as is to leave existing output alone.
                        length = len(escape(chunks[-1])) - 2 + prefixlen
                        if size + length < width:
                            buf.append(chunks.pop())
                            size += length
                        else:
                            if not buf:
                                # handle long chunks by putting them on a
                                # separate line
                                buf.append(chunks.pop())
                            break
                    lines.append(''.join(buf))
            else:
                lines.append(line)
    else:
        lines = string.splitlines(True)

    if len(lines) <= 1:
        return escape(string)

    # Wrapping can leave an empty final piece (the empty string `re.split`
    # returns after a trailing separator).  Drop it; it carries no text, so
    # nothing must be added to the preceding line in exchange.
    if not lines[-1]:
        del lines[-1]
    return '""\n' + '\n'.join(prefix + escape(line) for line in lines)
494
495
496def _enclose_filename_if_necessary(filename: str) -> str:
497 """Enclose filenames which include white spaces or tabs.
498
499 Do the same as gettext and enclose filenames which contain white
500 spaces or tabs with First Strong Isolate (U+2068) and Pop
501 Directional Isolate (U+2069).
502 """
503 if " " not in filename and "\t" not in filename:
504 return filename
505
506 if not filename.startswith("\u2068"):
507 filename = "\u2068" + filename
508 if not filename.endswith("\u2069"):
509 filename += "\u2069"
510 return filename
511
512
def write_po(
    fileobj: SupportsWrite[bytes],
    catalog: Catalog,
    width: int = 76,
    no_location: bool = False,
    omit_header: bool = False,
    sort_output: bool = False,
    sort_by_file: bool = False,
    ignore_obsolete: bool = False,
    include_previous: bool = False,
    include_lineno: bool = True,
) -> None:
    r"""Serialize a message catalog as a ``gettext`` PO (portable object)
    file and write it to the given file-like object.

    The text produced by `generate_po` is encoded with the catalog's charset;
    characters the charset cannot represent are written as backslash escapes.

    >>> catalog = Catalog()
    >>> catalog.add('foo %(name)s', locations=[('main.py', 1)],
    ...             flags=('fuzzy',))
    <Message...>
    >>> catalog.add(('bar', 'baz'), locations=[('main.py', 3)])
    <Message...>
    >>> from io import BytesIO
    >>> buf = BytesIO()
    >>> write_po(buf, catalog, omit_header=True)
    >>> print(buf.getvalue().decode("utf8"))
    #: main.py:1
    #, fuzzy, python-format
    msgid "foo %(name)s"
    msgstr ""
    <BLANKLINE>
    #: main.py:3
    msgid "bar"
    msgid_plural "baz"
    msgstr[0] ""
    msgstr[1] ""
    <BLANKLINE>
    <BLANKLINE>

    :param fileobj: the file-like object to write to
    :param catalog: the `Catalog` instance
    :param width: the maximum line width for the generated output; use `None`,
                  0, or a negative number to completely disable line wrapping
    :param no_location: do not emit a location comment for every message
    :param omit_header: do not include the ``msgid ""`` entry at the top of the
                        output
    :param sort_output: whether to sort the messages in the output by msgid
    :param sort_by_file: whether to sort the messages in the output by their
                         locations
    :param ignore_obsolete: whether to ignore obsolete messages and not include
                            them in the output; by default they are included as
                            comments
    :param include_previous: include the old msgid as a comment when
                             updating the catalog
    :param include_lineno: include line number in the location comment
    """
    # Sorting by msgid wins when both sort flags are given.
    if sort_output:
        sort_by = "message"
    elif sort_by_file:
        sort_by = "location"
    else:
        sort_by = None

    po_lines = generate_po(
        catalog,
        ignore_obsolete=ignore_obsolete,
        include_lineno=include_lineno,
        include_previous=include_previous,
        no_location=no_location,
        omit_header=omit_header,
        sort_by=sort_by,
        width=width,
    )
    for chunk in po_lines:
        if isinstance(chunk, str):
            chunk = chunk.encode(catalog.charset, 'backslashreplace')
        fileobj.write(chunk)
588
589
def generate_po(
    catalog: Catalog,
    *,
    ignore_obsolete: bool = False,
    include_lineno: bool = True,
    include_previous: bool = False,
    no_location: bool = False,
    omit_header: bool = False,
    sort_by: Literal["message", "location"] | None = None,
    width: int = 76,
) -> Iterable[str]:
    r"""Yield the text of a ``gettext`` PO (portable object) file piece by piece.

    Every yielded string ends with a newline.  See `write_po()` for a more
    detailed description of the parameters.
    """
    wrapping = bool(width and width > 0)

    # xgettext always wraps comments even if --no-wrap is passed;
    # provide the same behaviour
    comment_wrapper = TextWrapper(width=width if wrapping else 76, break_long_words=False)
    header_wrapper = TextWrapper(width=width, subsequent_indent="# ", break_long_words=False)

    def _format_comment(comment, prefix=''):
        """Yield `comment` wrapped into ``#<prefix> ...`` lines."""
        for wrapped in comment_wrapper.wrap(comment):
            yield f"#{prefix} {wrapped.strip()}\n"

    def _format_message(message, prefix=''):
        """Yield the msgctxt/msgid/msgstr lines of `message`."""

        def entry(keyword, text):
            return f"{prefix}{keyword} {normalize(text, prefix=prefix, width=width)}\n"

        if message.context:
            yield entry("msgctxt", message.context)

        if not isinstance(message.id, (list, tuple)):
            yield entry("msgid", message.id)
            yield entry("msgstr", message.string or '')
            return

        yield entry("msgid", message.id[0])
        yield entry("msgid_plural", message.id[1])
        for idx in range(catalog.num_plurals):
            try:
                translated = message.string[idx]
            except IndexError:
                # Fewer translations than plural forms: pad with empty ones.
                translated = ''
            yield entry(f"msgstr[{idx:d}]", translated)

    def _location_refs(message):
        """Return the de-duplicated location references of `message`."""
        # sort locations by filename and lineno.
        # if there's no <int> as lineno, use `-1`.
        # if no sorting possible, leave unsorted.
        # (see issue #606)
        try:
            ordered = sorted(
                message.locations,
                key=lambda loc: (loc[0], loc[1] if isinstance(loc[1], int) and loc[1] else -1),
            )
        except TypeError:  # e.g. "TypeError: unorderable types: NoneType() < int()"
            ordered = message.locations

        refs = []
        for filename, lineno in ordered:
            ref = _enclose_filename_if_necessary(filename.replace(os.sep, '/'))
            if lineno and include_lineno:
                ref = f"{ref}:{lineno:d}"
            refs.append(ref)
        # dict keys keep first-seen order, which removes duplicates stably.
        return list(dict.fromkeys(refs))

    for message in _sort_messages(catalog, sort_by=sort_by):
        if not message.id:  # This is the header "message"
            if omit_header:
                continue
            header = catalog.header_comment
            if wrapping:
                header = '\n'.join(
                    piece
                    for raw_line in header.splitlines()
                    for piece in header_wrapper.wrap(raw_line)
                )
            yield f"{header}\n"

        for comment in message.user_comments:
            yield from _format_comment(comment)
        for comment in message.auto_comments:
            yield from _format_comment(comment, prefix='.')

        if not no_location:
            yield from _format_comment(' '.join(_location_refs(message)), prefix=':')

        if message.flags:
            yield f"#{', '.join(['', *sorted(message.flags)])}\n"

        if message.previous_id and include_previous:
            previous = message.previous_id
            yield from _format_comment(
                f'msgid {normalize(previous[0], width=width)}',
                prefix='|',
            )
            if len(previous) > 1:
                previous_plural = normalize(previous[1], width=width)
                yield from _format_comment(f'msgid_plural {previous_plural}', prefix='|')

        yield from _format_message(message)
        yield '\n'

    if ignore_obsolete:
        return

    # Obsolete messages are emitted as ``#~`` comments after the live ones.
    for message in _sort_messages(catalog.obsolete.values(), sort_by=sort_by):
        for comment in message.user_comments:
            yield from _format_comment(comment)
        yield from _format_message(message, prefix='#~ ')
        yield '\n'
697
698
699def _sort_messages(messages: Iterable[Message], sort_by: Literal["message", "location"] | None) -> list[Message]:
700 """
701 Sort the given message iterable by the given criteria.
702
703 Always returns a list.
704
705 :param messages: An iterable of Messages.
706 :param sort_by: Sort by which criteria? Options are `message` and `location`.
707 :return: list[Message]
708 """
709 messages = list(messages)
710 if sort_by == "message":
711 messages.sort()
712 elif sort_by == "location":
713 messages.sort(key=lambda m: m.locations)
714 return messages