1# SPDX-FileCopyrightText: 2022 James R. Barlow
2# SPDX-License-Identifier: MPL-2.0
3
4"""PdfMetadata - facade for XMP and DocumentInfo metadata."""
5
6from __future__ import annotations
7
8import logging
9from collections.abc import Iterator, MutableMapping
10from datetime import datetime, timezone
11from typing import TYPE_CHECKING, Any
12from warnings import warn
13
14from lxml.etree import QName
15
16from pikepdf._version import __version__ as pikepdf_version
17from pikepdf.models.metadata._constants import (
18 XMP_NS_PDF,
19 XMP_NS_PDFA_ID,
20 XMP_NS_PDFX_ID,
21 XMP_NS_XMP,
22 clean,
23)
24from pikepdf.models.metadata._converters import DOCINFO_MAPPING, DocinfoMapping
25from pikepdf.models.metadata._docinfo import DocinfoStore
26from pikepdf.models.metadata._xmp import XmpDocument
27from pikepdf.objects import Name, Stream
28
29if TYPE_CHECKING: # pragma: no cover
30 from pikepdf import Pdf
31
# Module-level logger named after this module; used below to report
# questionable metadata writes without raising.
log = logging.getLogger(__name__)
33
34
class PdfMetadata(MutableMapping):
    """Read and edit the metadata associated with a PDF.

    The PDF specification contains two types of metadata, the newer XMP
    (Extensible Metadata Platform, XML-based) and older DocumentInformation
    dictionary. The PDF 2.0 specification removes the DocumentInformation
    dictionary.

    This primarily works with XMP metadata, but includes methods to generate
    XMP from DocumentInformation and will also coordinate updates to
    DocumentInformation so that the two are kept consistent.

    XMP metadata fields may be accessed using the full XML namespace URI or
    the short name. For example ``metadata['dc:description']``
    and ``metadata['{http://purl.org/dc/elements/1.1/}description']``
    both refer to the same field. Several common XML namespaces are registered
    automatically.

    See the XMP specification for details of allowable fields.

    To update metadata, use a with block. Writes and deletes made outside a
    with block raise :class:`RuntimeError`.

    Example:
        >>> with pdf.open_metadata() as records:
        ...     records['dc:title'] = 'New Title'

    See Also:
        :meth:`pikepdf.Pdf.open_metadata`
    """

    # Keep DOCINFO_MAPPING at class level for backward compatibility
    DOCINFO_MAPPING: list[DocinfoMapping] = DOCINFO_MAPPING

    # Delegate namespace dicts to XmpDocument for backward compatibility
    NS: dict[str, str] = XmpDocument.NS
    REVERSE_NS: dict[str, str] = XmpDocument.REVERSE_NS

    def __init__(
        self,
        pdf: Pdf,
        pikepdf_mark: bool = True,
        sync_docinfo: bool = True,
        overwrite_invalid_xml: bool = True,
    ):
        """Construct PdfMetadata. Use Pdf.open_metadata() instead.

        Arguments:
            pdf: the PDF whose metadata is being read or edited.
            pikepdf_mark: if True, ``xmp:MetadataDate`` and ``pdf:Producer``
                are rewritten when changes are applied.
            sync_docinfo: if True, the DocumentInfo dictionary is updated
                from XMP when changes are applied.
            overwrite_invalid_xml: forwarded unchanged to ``XmpDocument``.
        """
        self._pdf = pdf
        self.mark = pikepdf_mark
        self.sync_docinfo = sync_docinfo
        self._updating = False
        self._overwrite_invalid_xml = overwrite_invalid_xml

        # Initialize XmpDocument with PDF's XMP data
        self._xmp_doc = self._load_xmp()

        # Initialize DocinfoStore
        self._docinfo = DocinfoStore(pdf)

    def _load_xmp(self) -> XmpDocument:
        """Load XMP from PDF or create empty XmpDocument."""
        try:
            data = self._pdf.Root.Metadata.read_bytes()
        except AttributeError:
            # Reading /Root/Metadata raised AttributeError (e.g. the entry
            # is absent): start from empty XMP data instead.
            data = b''

        return XmpDocument(data, overwrite_invalid_xml=self._overwrite_invalid_xml)

    def load_from_docinfo(
        self, docinfo, delete_missing: bool = False, raise_failure: bool = False
    ) -> None:
        """Populate the XMP metadata object with DocumentInfo.

        Arguments:
            docinfo: a DocumentInfo, e.g. pdf.docinfo
            delete_missing: if the entry is not in DocumentInfo, delete the
                equivalent from XMP
            raise_failure: if True, raise any failure to convert docinfo;
                otherwise warn and continue

        Raises:
            ValueError: if ``raise_failure`` is True and an entry cannot be
                converted, or an entry has no XMP equivalent.
            RuntimeError: if called outside a with block and an entry needs
                to be written or deleted.

        A few entries in the deprecated DocumentInfo dictionary are considered
        approximately equivalent to certain XMP records. This method copies
        those entries into the XMP metadata.
        """

        def warn_or_raise(msg, e=None):
            if raise_failure:
                raise ValueError(msg) from e
            warn(msg)

        for uri, shortkey, docinfo_name, converter in self.DOCINFO_MAPPING:
            qname = QName(uri, shortkey)
            # docinfo might be a dict or pikepdf.Dictionary, so lookup keys
            # by str(Name)
            val = docinfo.get(str(docinfo_name))
            if val is None:
                if delete_missing and qname in self:
                    del self[qname]
                continue
            try:
                val = str(val)
                if converter:
                    val = converter.xmp_from_docinfo(val)
                if not val:
                    continue
                # applying_mark=True: this write is initiated internally, so
                # the "will be overwritten" warning must not fire.
                self._setitem(qname, val, True)
            except (ValueError, AttributeError, NotImplementedError) as e:
                warn_or_raise(
                    f"The metadata field {docinfo_name} could not be copied to XMP", e
                )

        # Report DocumentInfo entries that have no mapping to XMP.
        valid_docinfo_names = {
            str(docinfo_name) for _, _, docinfo_name, _ in self.DOCINFO_MAPPING
        }
        extra_docinfo_names = {str(k) for k in docinfo.keys()} - valid_docinfo_names
        for extra in extra_docinfo_names:
            warn_or_raise(
                f"The metadata field {extra} with value '{repr(docinfo.get(extra))}' "
                "has no XMP equivalent, so it was discarded",
            )

    def __enter__(self) -> PdfMetadata:
        """Open metadata for editing."""
        self._updating = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Close metadata and apply changes.

        Changes are applied only when the with block exits without an
        exception. Editing mode is always switched off afterwards, and any
        exception from the block is propagated (nothing truthy is returned).
        """
        try:
            if exc_type is None:
                self._apply_changes()
        finally:
            self._updating = False

    def _update_docinfo(self) -> None:
        """Update the PDF's DocumentInfo dictionary to match XMP metadata.

        The standard mapping is described here:
            https://www.pdfa.org/pdfa-metadata-xmp-rdf-dublin-core/

        Raises:
            ValueError: if a converter fails with anything other than
                ``ValueError`` (which is only warned about).
        """
        # Touch object to ensure it exists
        self._pdf.docinfo  # pylint: disable=pointless-statement
        for uri, element, docinfo_name, converter in self.DOCINFO_MAPPING:
            qname = QName(uri, element)
            try:
                value = self[qname]
            except KeyError:
                # No XMP record: handled by the removal branch below.
                value = None
            else:
                if converter:
                    try:
                        value = converter.docinfo_from_xmp(value)
                    except ValueError:
                        warn(
                            f"The DocumentInfo field {docinfo_name} could not be "
                            "updated from XMP"
                        )
                        value = None
                    except Exception as e:
                        raise ValueError(
                            "An error occurred while updating DocumentInfo field "
                            f"{docinfo_name} from XMP {qname} with value {value}"
                        ) from e
            if value is None:
                # Nothing usable in XMP: make sure DocumentInfo has no stale
                # entry for this field.
                if docinfo_name in self._pdf.docinfo:
                    del self._pdf.docinfo[docinfo_name]
                continue
            self._docinfo.set(docinfo_name, clean(value))

    def _apply_changes(self) -> None:
        """Serialize our changes back to the PDF in memory.

        Depending how we are initialized, leave our metadata mark and producer.
        """
        if self.mark:
            # We were asked to mark the file as being edited by pikepdf
            self._setitem(
                QName(XMP_NS_XMP, 'MetadataDate'),
                datetime.now(timezone.utc).isoformat(),
                applying_mark=True,
            )
            self._setitem(
                QName(XMP_NS_PDF, 'Producer'),
                'pikepdf ' + pikepdf_version,
                applying_mark=True,
            )
        xml = self._xmp_doc.to_bytes()
        # Replace /Root/Metadata with a fresh stream holding the serialized XMP.
        self._pdf.Root.Metadata = Stream(self._pdf, xml)
        self._pdf.Root.Metadata[Name.Type] = Name.Metadata
        self._pdf.Root.Metadata[Name.Subtype] = Name.XML
        if self.sync_docinfo:
            self._update_docinfo()

    @classmethod
    def _qname(cls, name: QName | str) -> str:
        """Convert name to an XML QName.

        e.g. pdf:Producer -> {http://ns.adobe.com/pdf/1.3/}Producer
        """
        return XmpDocument.qname(name)

    @classmethod
    def register_xml_namespace(cls, uri: str, prefix: str) -> None:
        """Register a new XML/XMP namespace.

        Arguments:
            uri: The long form of the namespace.
            prefix: The alias to use when interpreting XMP.
        """
        XmpDocument.register_xml_namespace(uri, prefix)

    def _prefix_from_uri(self, uriname: str) -> str:
        """Given a fully qualified XML name, find a prefix.

        e.g. {http://ns.adobe.com/pdf/1.3/}Producer -> pdf:Producer
        """
        return self._xmp_doc.prefix_from_uri(uriname)

    def __contains__(self, key: object) -> bool:  # type: ignore[override]
        """Test if XMP key is in metadata.

        Raises:
            TypeError: if ``key`` is neither ``str`` nor ``QName``.
        """
        if not isinstance(key, (str, QName)):
            raise TypeError(f"{key!r} must be str or QName")
        return key in self._xmp_doc

    def __getitem__(self, key: str | QName) -> Any:
        """Retrieve XMP metadata for key."""
        return self._xmp_doc[key]

    def __iter__(self) -> Iterator[str]:
        """Iterate through XMP metadata attributes and nodes."""
        return iter(self._xmp_doc)

    def __len__(self) -> int:
        """Return number of items in metadata."""
        return len(self._xmp_doc)

    def _setitem(
        self,
        key: str | QName,
        val: set[str] | list[str] | str,
        applying_mark: bool = False,
    ) -> None:
        """Set an XMP value after checking that editing is permitted.

        Arguments:
            key: short name, fully qualified name, or QName of the field.
            val: value to store.
            applying_mark: True when the write is initiated internally, which
                suppresses the warning about overwritten marker fields.

        Raises:
            RuntimeError: if the metadata is not opened for editing.
        """
        if not self._updating:
            raise RuntimeError("Metadata not opened for editing, use with block")

        qkey = self._qname(key)
        self._setitem_check_args(key, val, applying_mark, qkey)
        self._xmp_doc.set_value(key, val)

    def _setitem_check_args(
        self, key: str | QName, val: Any, applying_mark: bool, qkey: str
    ) -> None:
        """Log a diagnostic for questionable writes; never raises or blocks.

        Arguments:
            key: the key as supplied by the caller (used in messages).
            val: the value being written.
            applying_mark: True when the write was initiated internally.
            qkey: ``key`` converted with :meth:`_qname`.
        """
        marker_keys = (
            self._qname('xmp:MetadataDate'),
            self._qname('pdf:Producer'),
        )
        if self.mark and not applying_mark and qkey in marker_keys:
            # Complain if user writes self[pdf:Producer] = ... because it will
            # be overwritten on save, unless applying_mark is set, in which
            # case the action was initiated internally
            log.warning(
                f"Update to {key} will be overwritten because metadata was opened "
                "with set_pikepdf_as_editor=True"
            )
        # Compare for equality: ``qkey in (some_str)`` is a substring test,
        # not tuple membership, because the parentheses do not make a tuple.
        if isinstance(val, str) and qkey == self._qname('dc:creator'):
            log.error(f"{key} should be set to a list of strings")

    def __setitem__(self, key: str | QName, val: set[str] | list[str] | str) -> None:
        """Set XMP metadata key to value."""
        return self._setitem(key, val, False)

    def __delitem__(self, key: str | QName) -> None:
        """Delete item from XMP metadata.

        Raises:
            RuntimeError: if the metadata is not opened for editing.
        """
        editing_open = self._updating
        if not editing_open:
            raise RuntimeError("Metadata not opened for editing, use with block")
        del self._xmp_doc[key]

    @property
    def pdfa_status(self) -> str:
        """Return the PDF/A conformance level claimed by this PDF, or ''.

        A PDF may claim to be PDF/A compliant without this being true. Use an
        independent verifier such as veraPDF to test if a PDF is truly
        conformant.

        Returns:
            The conformance level of the PDF/A, or an empty string if the
            PDF does not claim PDF/A conformance. Possible valid values
            are: 1A, 1B, 2A, 2B, 2U, 3A, 3B, 3U. Note that ISO standard
            typically refers to PDF/A-1b for example, using lower case;
            this function returns the value as it appears in the PDF, which
            is uppercase.
        """
        key_part = QName(XMP_NS_PDFA_ID, 'part')
        key_conformance = QName(XMP_NS_PDFA_ID, 'conformance')
        try:
            return self[key_part] + self[key_conformance]
        except KeyError:
            # Either record missing means no PDF/A claim is made.
            return ''

    @property
    def pdfx_status(self) -> str:
        """Return the PDF/X conformance level claimed by this PDF, or ''.

        A PDF may claim to be PDF/X compliant without this being true. Use an
        independent verifier such as veraPDF to test if a PDF is truly
        conformant.

        Returns:
            The conformance level of the PDF/X, or an empty string if the
            PDF does not claim PDF/X conformance.
        """
        pdfx_version = QName(XMP_NS_PDFX_ID, 'GTS_PDFXVersion')
        try:
            return self[pdfx_version]
        except KeyError:
            return ''

    def __str__(self) -> str:
        """Convert XMP metadata to XML string."""
        return str(self._xmp_doc)

    # Backward compatibility methods for internal API access
    def _load(self) -> None:
        """No-op for backward compatibility.

        Previously this triggered lazy loading of XMP. Now XMP is loaded
        immediately in __init__.
        """
        pass

    def _get_rdf_root(self):
        """Get the rdf:RDF root element.

        Provided for backward compatibility with code that accesses
        internal XMP structure.
        """
        return self._xmp_doc._get_rdf_root()

    def _get_xml_bytes(self, xpacket: bool = True) -> bytes:
        """Serialize XMP to XML bytes.

        Provided for backward compatibility.
        """
        return self._xmp_doc.to_bytes(xpacket=xpacket)