1# SPDX-FileCopyrightText: 2022 James R. Barlow
2# SPDX-License-Identifier: MPL-2.0
3
4"""Converters for XMP <-> DocumentInfo value transformation."""
5
6from __future__ import annotations
7
8from abc import ABC, abstractmethod
9from datetime import datetime
10from typing import Any, NamedTuple
11
12from pikepdf.models.metadata._constants import XMP_NS_DC, XMP_NS_PDF, XMP_NS_XMP
13from pikepdf.objects import Name, String
14
15
def encode_pdf_date(d: datetime) -> str:
    """Render a Python datetime as a PDF date string.

    The layout comes from the Adobe pdfmark manual:
        (D:YYYYMMDDHHmmSSOHH'mm')
    YYYY is the year, MM the month (01-12), DD the day (01-31), HH the hour
    (00-23), mm the minutes (00-59) and SS the seconds (00-59). What follows
    relates local time to GMT: O is + when local time is later than GMT and
    - when it is earlier, HH' is the absolute offset in hours and mm' the
    absolute offset in minutes. When no GMT information is given, the
    relation to GMT is considered unknown. In every case the leading fields
    express local time.

    The manual treats 'D:' and every field after the year as optional. This
    function always writes the full date and time, and always writes 'D:'
    because PDF/A requires it. The offset is written only when the datetime
    reports one, as sign, hours, an apostrophe, then minutes.
    """
    # %Y is not formatted consistently across platforms (see
    # https://bugs.python.org/issue13305 and underspecification in libc),
    # so the year is zero-padded by hand rather than left to strftime.
    pieces = [f"D:{d.year:04d}", d.strftime(r'%m%d%H%M%S')]
    offset = d.strftime('%z')  # empty string for a naive datetime
    if offset:
        pieces.append(f"{offset[0]}{offset[1:3]}'{offset[3:5]}")
    return "".join(pieces)
46
47
def decode_pdf_date(s: str) -> datetime:
    """Decode a pdfmark date to a Python datetime object.

    A pdfmark date is a string in a particular format, as described in
    :func:`encode_pdf_date`. The 'D:' prefix is optional, and UTC may be
    written as Z, Z00'00 or (incorrectly, but found in the wild) Z00'00'.

    Fields after the year may be omitted. A timestamp made only of digits
    that stops after the year, month, day, hour or minute is completed with
    the PDF defaults (01 for month and day, 00 for the time fields) before
    it is parsed, and an hours-only offset such as +05' is read as +05'00.

    A pikepdf String is accepted in place of a str. The result carries
    timezone information when the string has an offset and is naive
    otherwise. ValueError is raised when the string matches no supported
    layout.
    """
    if isinstance(s, String):
        s = str(s)
    t = s
    if t.startswith('D:'):
        t = t[2:]
    utcs = (
        "Z00'00'",  # Literal Z00'00', is incorrect but found in the wild
        "Z00'00",  # Correctly formatted UTC
        "Z",  # Alternate UTC
    )
    for utc in utcs:
        if t.endswith(utc):
            # Rewrite only the suffix that matched; str.replace would also
            # rewrite any earlier occurrence of the marker.
            t = t[: -len(utc)] + "+0000"
            break
    t = t.replace("'", "")  # Remove apos from PDF time strings

    # strptime accepts one-digit fields, so a truncated timestamp such as
    # 202001011230 would otherwise be read as 12:03:00 and 202011 as
    # 2020-01-01. Completing the omitted fields first makes every field two
    # digits wide, which leaves strptime only one way to read the string.
    tz_at = next((i for i, ch in enumerate(t) if ch in "+-"), len(t))
    stamp, offset = t[:tz_at], t[tz_at:]
    if (
        stamp.isascii()
        and stamp.isdigit()
        and len(stamp) in (4, 6, 8, 10, 12, 14)
    ):
        # Defaults for month, day, hour, minute and second, in that order;
        # the slice is empty for an already complete 14-digit timestamp.
        stamp += "0101000000"[len(stamp) - 4 :]
        hours = offset[1:]
        if len(offset) == 3 and hours.isascii() and hours.isdigit():
            offset += "00"  # offset given as hours only
        t = stamp + offset
    # Anything else (odd lengths, stray characters) is left untouched and
    # handled by the format loop exactly as before.

    date_formats = (
        r"%Y%m%d%H%M%S%z",  # Format with timezone
        r"%Y%m%d%H%M%S",  # Format without timezone
        r"%Y%m%d",  # Date only format
    )
    for date_format in date_formats:
        try:
            return datetime.strptime(t, date_format)
        except ValueError:
            continue
    raise ValueError(f"Date string does not match any known format: {s} (read as {t})")
81
82
class Converter(ABC):
    """Abstract two-way translator between DocumentInfo and XMP values.

    Concrete converters provide both directions as static methods.
    """

    @staticmethod
    @abstractmethod
    def xmp_from_docinfo(docinfo_val: str | None) -> Any:  # type: ignore
        """Return the XMP value that corresponds to a DocumentInfo string."""

    @staticmethod
    @abstractmethod
    def docinfo_from_xmp(xmp_val: Any) -> str | None:
        """Return the DocumentInfo value that corresponds to XMP metadata."""
95
96
class AuthorConverter(Converter):
    """Translate document authors between XMP and DocumentInfo."""

    @staticmethod
    def xmp_from_docinfo(docinfo_val: str | None) -> Any:  # type: ignore
        """Wrap the DocumentInfo author value in a one-element list."""
        return [docinfo_val]

    @staticmethod
    def docinfo_from_xmp(xmp_val):
        """Collapse XMP authors into a single DocumentInfo string.

        XMP can hold several authors while DocumentInfo holds one string,
        so the authors are joined with semi-colons. A plain string is
        returned unchanged, None and ``[None]`` give None, and None entries
        inside a longer collection are skipped.
        """
        if xmp_val is None:
            return None
        if isinstance(xmp_val, str):
            return xmp_val
        if xmp_val == [None]:
            return None
        named = [entry for entry in xmp_val if entry is not None]
        return '; '.join(named)
117
118
class DateConverter(Converter):
    """Translate dates between XMP and DocumentInfo."""

    @staticmethod
    def xmp_from_docinfo(docinfo_val):
        """Turn a DocumentInfo date into its XMP form.

        An empty string stays empty. A year-only (4 digit) or year-month
        (6 digit) value, with or without the 'D:' prefix, becomes ``YYYY``
        or ``YYYY-MM``. Everything else is decoded with
        :func:`decode_pdf_date` and returned in ISO 8601 format.
        """
        text = str(docinfo_val) if isinstance(docinfo_val, String) else docinfo_val
        if text == '':
            return ''
        digits = text[2:] if text.startswith('D:') else text
        if len(digits) == 4 and digits.isdigit():
            return digits
        if len(digits) == 6 and digits.isdigit():
            return f'{digits[:4]}-{digits[4:]}'
        return decode_pdf_date(text).isoformat()

    @staticmethod
    def docinfo_from_xmp(xmp_val):
        """Turn an XMP date into a DocumentInfo date string.

        A 4 or 7 character value without a 'T' (``YYYY`` or ``YYYY-MM``) is
        emitted as 'D:' followed by the value with its hyphens removed. Any
        other value is parsed as ISO 8601, with a trailing 'Z' read as
        +00:00, and encoded with :func:`encode_pdf_date`.
        """
        partial = len(xmp_val) in (4, 7) and 'T' not in xmp_val
        if partial:
            return 'D:' + xmp_val.replace("-", "")
        if xmp_val.endswith('Z'):
            iso_text = xmp_val[:-1] + '+00:00'
        else:
            iso_text = xmp_val
        return encode_pdf_date(datetime.fromisoformat(iso_text))
142
143
class DocinfoMapping(NamedTuple):
    """One row pairing an XMP property with its DocumentInfo key.

    The XMP property is identified by a namespace and a key. ``converter``
    is the converter class used for the pair, or None when the row has no
    converter.
    """

    # Namespace of the XMP property
    ns: str
    # Name of the XMP property inside that namespace
    key: str
    # DocumentInfo key paired with the XMP property
    name: Name
    # Converter class for this pair, or None
    converter: type[Converter] | None
151
152
# Pairs each XMP property (namespace + key) with a DocumentInfo key; rows whose
# converter is None have no converter class assigned.
DOCINFO_MAPPING: list[DocinfoMapping] = [
    DocinfoMapping(
        ns=XMP_NS_DC, key='creator', name=Name.Author, converter=AuthorConverter
    ),
    DocinfoMapping(
        ns=XMP_NS_DC, key='description', name=Name.Subject, converter=None
    ),
    DocinfoMapping(ns=XMP_NS_DC, key='title', name=Name.Title, converter=None),
    DocinfoMapping(
        ns=XMP_NS_PDF, key='Keywords', name=Name.Keywords, converter=None
    ),
    DocinfoMapping(
        ns=XMP_NS_PDF, key='Producer', name=Name.Producer, converter=None
    ),
    DocinfoMapping(
        ns=XMP_NS_XMP,
        key='CreateDate',
        name=Name.CreationDate,
        converter=DateConverter,
    ),
    DocinfoMapping(
        ns=XMP_NS_XMP, key='CreatorTool', name=Name.Creator, converter=None
    ),
    DocinfoMapping(
        ns=XMP_NS_XMP, key='ModifyDate', name=Name.ModDate, converter=DateConverter
    ),
]