1# SPDX-FileCopyrightText: 2022 James R. Barlow
2# SPDX-License-Identifier: MPL-2.0
3
4"""PdfMetadata - facade for XMP and DocumentInfo metadata."""
5
6from __future__ import annotations
7
8import logging
9from collections.abc import Iterator, MutableMapping
10from datetime import datetime, timezone
11from typing import TYPE_CHECKING, Any
12from warnings import warn
13
14from pikepdf._version import __version__ as pikepdf_version
15from pikepdf.models.metadata._constants import (
16 XMP_NS_PDF,
17 XMP_NS_PDFA_ID,
18 XMP_NS_PDFX_ID,
19 XMP_NS_XMP,
20 clean,
21)
22from pikepdf.models.metadata._converters import DOCINFO_MAPPING, DocinfoMapping
23from pikepdf.models.metadata._docinfo import DocinfoStore
24from pikepdf.models.metadata._xmp import XmpDocument
25from pikepdf.objects import Name, Stream
26
27if TYPE_CHECKING: # pragma: no cover
28 from lxml.etree import QName
29
30 from pikepdf import Pdf
31
32
33log = logging.getLogger(__name__)
34
35
class PdfMetadata(MutableMapping):
    """Read and edit the metadata associated with a PDF.

    The PDF specification contains two types of metadata, the newer XMP
    (Extensible Metadata Platform, XML-based) and older DocumentInformation
    dictionary. The PDF 2.0 specification removes the DocumentInformation
    dictionary.

    This primarily works with XMP metadata, but includes methods to generate
    XMP from DocumentInformation and will also coordinate updates to
    DocumentInformation so that the two are kept consistent.

    XMP metadata fields may be accessed using the full XML namespace URI or
    the short name. For example ``metadata['dc:description']``
    and ``metadata['{http://purl.org/dc/elements/1.1/}description']``
    both refer to the same field. Several common XML namespaces are registered
    automatically.

    See the XMP specification for details of allowable fields.

    To update metadata, use a with block. Writing or deleting a key outside
    of a with block raises ``RuntimeError``; changes are serialized back to
    the PDF when the with block exits without an exception.

    Example:
        >>> with pdf.open_metadata() as records:
        ...     records['dc:title'] = 'New Title'

    See Also:
        :meth:`pikepdf.Pdf.open_metadata`
    """

    # Keep DOCINFO_MAPPING at class level for backward compatibility
    DOCINFO_MAPPING: list[DocinfoMapping] = DOCINFO_MAPPING

    # Delegate namespace dicts to XmpDocument for backward compatibility
    NS: dict[str, str] = XmpDocument.NS
    REVERSE_NS: dict[str, str] = XmpDocument.REVERSE_NS

    def __init__(
        self,
        pdf: Pdf,
        pikepdf_mark: bool = True,
        sync_docinfo: bool = True,
        overwrite_invalid_xml: bool = True,
    ):
        """Construct PdfMetadata. Use Pdf.open_metadata() instead.

        Arguments:
            pdf: the PDF whose metadata is read and edited.
            pikepdf_mark: if True, ``xmp:MetadataDate`` and ``pdf:Producer``
                are overwritten when changes are applied.
            sync_docinfo: if True, the DocumentInfo dictionary is updated from
                XMP when changes are applied.
            overwrite_invalid_xml: forwarded unchanged to ``XmpDocument``.
        """
        self._pdf = pdf
        self.mark = pikepdf_mark
        self.sync_docinfo = sync_docinfo
        # Only True while inside a ``with`` block; gates all mutation.
        self._updating = False
        self._overwrite_invalid_xml = overwrite_invalid_xml

        # Initialize XmpDocument with PDF's XMP data
        self._xmp_doc = self._load_xmp()

        # Initialize DocinfoStore
        self._docinfo = DocinfoStore(pdf)

    def _load_xmp(self) -> XmpDocument:
        """Load XMP from PDF or create empty XmpDocument."""
        try:
            data = self._pdf.Root.Metadata.read_bytes()
        except AttributeError:
            # No /Metadata entry in the document catalog: start from empty XMP.
            data = b''

        return XmpDocument(
            data, overwrite_invalid_xml=self._overwrite_invalid_xml
        )

    def load_from_docinfo(
        self, docinfo, delete_missing: bool = False, raise_failure: bool = False
    ) -> None:
        """Populate the XMP metadata object with DocumentInfo.

        Arguments:
            docinfo: a DocumentInfo, e.g. pdf.docinfo
            delete_missing: if the entry is not in DocumentInfo, delete the
                equivalent from XMP
            raise_failure: if True, raise any failure to convert docinfo;
                otherwise warn and continue

        Raises:
            ValueError: if ``raise_failure`` is True and a field could not be
                converted, or a field has no XMP equivalent.
            RuntimeError: if the metadata is not open for editing.

        A few entries in the deprecated DocumentInfo dictionary are considered
        approximately equivalent to certain XMP records. This method copies
        those entries into the XMP metadata.
        """
        from lxml.etree import QName

        def warn_or_raise(msg, e=None):
            if raise_failure:
                raise ValueError(msg) from e
            warn(msg)

        for uri, shortkey, docinfo_name, converter in self.DOCINFO_MAPPING:
            qname = QName(uri, shortkey)
            # docinfo might be a dict or pikepdf.Dictionary, so lookup keys
            # by str(Name)
            val = docinfo.get(str(docinfo_name))
            if val is None:
                if delete_missing and qname in self:
                    del self[qname]
                continue
            try:
                val = str(val)
                if converter:
                    val = converter.xmp_from_docinfo(val)
                if not val:
                    # Nothing meaningful to copy; leave XMP untouched.
                    continue
                # applying_mark=True suppresses the "will be overwritten"
                # warning, since this write is initiated internally.
                self._setitem(qname, val, True)
            except (ValueError, AttributeError, NotImplementedError) as e:
                warn_or_raise(
                    f"The metadata field {docinfo_name} could not be copied to XMP", e
                )
        valid_docinfo_names = {
            str(docinfo_name) for _, _, docinfo_name, _ in self.DOCINFO_MAPPING
        }
        extra_docinfo_names = {str(k) for k in docinfo.keys()} - valid_docinfo_names
        for extra in extra_docinfo_names:
            warn_or_raise(
                f"The metadata field {extra} with value '{repr(docinfo.get(extra))}' "
                "has no XMP equivalent, so it was discarded",
            )

    def __enter__(self):
        """Open metadata for editing."""
        self._updating = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close metadata and apply changes.

        Changes are applied only when the with block exits without an
        exception; the exception, if any, is never suppressed.
        """
        try:
            if exc_type is None:
                self._apply_changes()
        finally:
            # Always leave editing mode, even if applying changes failed.
            self._updating = False

    def _update_docinfo(self):
        """Update the PDF's DocumentInfo dictionary to match XMP metadata.

        For each mapped field, the DocumentInfo entry is removed when the XMP
        record is absent or cannot be converted; otherwise it is set to the
        (converted) XMP value.

        The standard mapping is described here:
        https://www.pdfa.org/pdfa-metadata-xmp-rdf-dublin-core/

        Raises:
            ValueError: if a converter fails with an unexpected exception.
        """
        from lxml.etree import QName

        # Touch object to ensure it exists
        self._pdf.docinfo  # pylint: disable=pointless-statement
        for uri, element, docinfo_name, converter in self.DOCINFO_MAPPING:
            qname = QName(uri, element)
            try:
                value = self[qname]
            except KeyError:
                # No XMP record: remove the stale DocumentInfo counterpart.
                if docinfo_name in self._pdf.docinfo:
                    del self._pdf.docinfo[docinfo_name]
                continue
            if converter:
                try:
                    value = converter.docinfo_from_xmp(value)
                except ValueError:
                    warn(
                        f"The DocumentInfo field {docinfo_name} could not be "
                        "updated from XMP"
                    )
                    value = None
                except Exception as e:
                    raise ValueError(
                        "An error occurred while updating DocumentInfo field "
                        f"{docinfo_name} from XMP {qname} with value {value}"
                    ) from e
            if value is None:
                if docinfo_name in self._pdf.docinfo:
                    del self._pdf.docinfo[docinfo_name]
                continue
            self._docinfo.set(docinfo_name, clean(value))

    def _apply_changes(self):
        """Serialize our changes back to the PDF in memory.

        Depending how we are initialized, leave our metadata mark and producer.
        """
        from lxml.etree import QName

        if self.mark:
            # We were asked to mark the file as being edited by pikepdf
            self._setitem(
                QName(XMP_NS_XMP, 'MetadataDate'),
                datetime.now(timezone.utc).isoformat(),
                applying_mark=True,
            )
            self._setitem(
                QName(XMP_NS_PDF, 'Producer'),
                'pikepdf ' + pikepdf_version,
                applying_mark=True,
            )
        xml = self._xmp_doc.to_bytes()
        # Replace the catalog's /Metadata with a fresh stream holding the XMP.
        self._pdf.Root.Metadata = Stream(self._pdf, xml)
        self._pdf.Root.Metadata[Name.Type] = Name.Metadata
        self._pdf.Root.Metadata[Name.Subtype] = Name.XML
        if self.sync_docinfo:
            self._update_docinfo()

    @classmethod
    def _qname(cls, name: QName | str) -> str:
        """Convert name to an XML QName.

        e.g. pdf:Producer -> {http://ns.adobe.com/pdf/1.3/}Producer
        """
        return XmpDocument.qname(name)

    @classmethod
    def register_xml_namespace(cls, uri: str, prefix: str) -> None:
        """Register a new XML/XMP namespace.

        Arguments:
            uri: The long form of the namespace.
            prefix: The alias to use when interpreting XMP.
        """
        XmpDocument.register_xml_namespace(uri, prefix)

    def _prefix_from_uri(self, uriname: str) -> str:
        """Given a fully qualified XML name, find a prefix.

        e.g. {http://ns.adobe.com/pdf/1.3/}Producer -> pdf:Producer
        """
        return self._xmp_doc.prefix_from_uri(uriname)

    def __contains__(self, key: object) -> bool:  # type: ignore[override]
        """Test if XMP key is in metadata.

        Raises:
            TypeError: if key is neither a str nor a QName.
        """
        from lxml.etree import QName

        if not isinstance(key, (str, QName)):
            raise TypeError(f"{key!r} must be str or QName")
        return key in self._xmp_doc

    def __getitem__(self, key: str | QName) -> Any:
        """Retrieve XMP metadata for key."""
        return self._xmp_doc[key]

    def __iter__(self) -> Iterator[str]:
        """Iterate through XMP metadata attributes and nodes."""
        return iter(self._xmp_doc)

    def __len__(self) -> int:
        """Return number of items in metadata."""
        return len(self._xmp_doc)

    def _setitem(
        self,
        key: str | QName,
        val: set[str] | list[str] | str,
        applying_mark: bool = False,
    ) -> None:
        """Set an XMP key after checking that editing is allowed.

        Arguments:
            key: the XMP key, in short or fully qualified form.
            val: the value to store.
            applying_mark: True when the write is initiated internally, which
                suppresses the warning about overwritten fields.

        Raises:
            RuntimeError: if the metadata is not open for editing.
        """
        if not self._updating:
            raise RuntimeError("Metadata not opened for editing, use with block")

        qkey = self._qname(key)
        self._setitem_check_args(key, val, applying_mark, qkey)
        self._xmp_doc.set_value(key, val)

    def _setitem_check_args(
        self, key: str | QName, val: Any, applying_mark: bool, qkey: str
    ) -> None:
        """Log diagnostics about questionable writes; never blocks the write.

        Arguments:
            key: the key as supplied by the caller (used in messages).
            val: the value about to be stored.
            applying_mark: True when the write was initiated internally.
            qkey: the fully qualified form of ``key``.
        """
        if (
            self.mark
            and not applying_mark
            and qkey
            in (
                self._qname('xmp:MetadataDate'),
                self._qname('pdf:Producer'),
            )
        ):
            # Complain if user writes self[pdf:Producer] = ... because it will
            # be overwritten on save, unless applying_mark is set, in which
            # case the action was initiated internally
            log.warning(
                f"Update to {key} will be overwritten because metadata was opened "
                "with set_pikepdf_as_editor=True"
            )
        # Compare for equality: a parenthesized string without a trailing comma
        # is not a tuple, so ``in`` would perform a substring test instead.
        if isinstance(val, str) and qkey == self._qname('dc:creator'):
            log.error(f"{key} should be set to a list of strings")

    def __setitem__(self, key: str | QName, val: set[str] | list[str] | str) -> None:
        """Set XMP metadata key to value."""
        return self._setitem(key, val, False)

    def __delitem__(self, key: str | QName) -> None:
        """Delete item from XMP metadata.

        Raises:
            RuntimeError: if the metadata is not open for editing.
        """
        if not self._updating:
            raise RuntimeError("Metadata not opened for editing, use with block")
        del self._xmp_doc[key]

    @property
    def pdfa_status(self) -> str:
        """Return the PDF/A conformance level claimed by this PDF, or ''.

        A PDF may claim to be PDF/A compliant without this being true. Use an
        independent verifier such as veraPDF to test if a PDF is truly
        conformant.

        Returns:
            The conformance level of the PDF/A, or an empty string if the
            PDF does not claim PDF/A conformance. Possible valid values
            are: 1A, 1B, 2A, 2B, 2U, 3A, 3B, 3U. Note that ISO standard
            typically refers to PDF/A-1b for example, using lower case;
            this function returns the value as it appears in the PDF, which
            is uppercase.
        """
        from lxml.etree import QName

        key_part = QName(XMP_NS_PDFA_ID, 'part')
        key_conformance = QName(XMP_NS_PDFA_ID, 'conformance')
        try:
            # Both records must be present; a missing one means "no claim".
            return self[key_part] + self[key_conformance]
        except KeyError:
            return ''

    @property
    def pdfx_status(self) -> str:
        """Return the PDF/X conformance level claimed by this PDF, or ''.

        A PDF may claim to be PDF/X compliant without this being true. Use an
        independent verifier such as veraPDF to test if a PDF is truly
        conformant.

        Returns:
            The conformance level of the PDF/X, or an empty string if the
            PDF does not claim PDF/X conformance.
        """
        from lxml.etree import QName

        pdfx_version = QName(XMP_NS_PDFX_ID, 'GTS_PDFXVersion')
        try:
            return self[pdfx_version]
        except KeyError:
            return ''

    def __str__(self) -> str:
        """Convert XMP metadata to XML string."""
        return str(self._xmp_doc)

    # Backward compatibility methods for internal API access
    def _load(self) -> None:
        """No-op for backward compatibility.

        Previously this triggered lazy loading of XMP. Now XMP is loaded
        immediately in __init__.
        """

    def _get_rdf_root(self):
        """Get the rdf:RDF root element.

        Provided for backward compatibility with code that accesses
        internal XMP structure.
        """
        return self._xmp_doc._get_rdf_root()

    def _get_xml_bytes(self, xpacket: bool = True) -> bytes:
        """Serialize XMP to XML bytes.

        Provided for backward compatibility.
        """
        return self._xmp_doc.to_bytes(xpacket=xpacket)