1# SPDX-FileCopyrightText: 2022 James R. Barlow
2# SPDX-License-Identifier: MPL-2.0
3
4"""PDF metadata handling."""
5
6from __future__ import annotations
7
8import logging
9import re
10from abc import ABC, abstractmethod
11from collections.abc import Iterable, Iterator, MutableMapping
12from datetime import datetime, timezone
13from functools import wraps
14from io import BytesIO
15from typing import TYPE_CHECKING, Any, Callable, NamedTuple
16from warnings import warn
17
18from lxml import etree
19from lxml.etree import QName, XMLSyntaxError, _Element
20
21from pikepdf._version import __version__ as pikepdf_version
22from pikepdf._xml import parse_xml
23from pikepdf.objects import Name, Stream, String
24
25if TYPE_CHECKING: # pragma: no cover
26 from pikepdf import Pdf
27
28
# Namespace URIs used in XMP metadata. Each constant below is paired with a
# short prefix in DEFAULT_NAMESPACES.
XMP_NS_DC = "http://purl.org/dc/elements/1.1/"
XMP_NS_PDF = "http://ns.adobe.com/pdf/1.3/"
XMP_NS_PDFA_ID = "http://www.aiim.org/pdfa/ns/id/"
XMP_NS_PDFA_EXTENSION = "http://www.aiim.org/pdfa/ns/extension/"
XMP_NS_PDFA_PROPERTY = "http://www.aiim.org/pdfa/ns/property#"
XMP_NS_PDFA_SCHEMA = "http://www.aiim.org/pdfa/ns/schema#"
XMP_NS_PDFUA_ID = "http://www.aiim.org/pdfua/ns/id/"
XMP_NS_PDFX_ID = "http://www.npes.org/pdfx/ns/id/"
XMP_NS_PHOTOSHOP = "http://ns.adobe.com/photoshop/1.0/"
XMP_NS_PRISM = "http://prismstandard.org/namespaces/basic/1.0/"
XMP_NS_PRISM2 = "http://prismstandard.org/namespaces/basic/2.0/"
XMP_NS_PRISM3 = "http://prismstandard.org/namespaces/basic/3.0/"
XMP_NS_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
XMP_NS_XMP = "http://ns.adobe.com/xap/1.0/"
XMP_NS_XMP_MM = "http://ns.adobe.com/xap/1.0/mm/"
XMP_NS_XMP_RIGHTS = "http://ns.adobe.com/xap/1.0/rights/"

# (namespace URI, prefix) pairs. Note the order: URI first, prefix second.
DEFAULT_NAMESPACES: list[tuple[str, str]] = [
    ('adobe:ns:meta/', 'x'),
    (XMP_NS_DC, 'dc'),
    (XMP_NS_PDF, 'pdf'),
    (XMP_NS_PDFA_ID, 'pdfaid'),
    (XMP_NS_PDFA_EXTENSION, 'pdfaExtension'),
    (XMP_NS_PDFA_PROPERTY, 'pdfaProperty'),
    (XMP_NS_PDFA_SCHEMA, 'pdfaSchema'),
    (XMP_NS_PDFUA_ID, 'pdfuaid'),
    (XMP_NS_PDFX_ID, 'pdfxid'),
    (XMP_NS_PHOTOSHOP, 'photoshop'),
    (XMP_NS_PRISM, 'prism'),
    (XMP_NS_PRISM2, 'prism2'),
    (XMP_NS_PRISM3, 'prism3'),
    (XMP_NS_RDF, 'rdf'),
    (XMP_NS_XMP, 'xmp'),
    (XMP_NS_XMP_MM, 'xmpMM'),
    (XMP_NS_XMP_RIGHTS, 'xmpRights'),
    ('http://crossref.org/crossmark/1.0/', 'crossmark'),
    ('http://www.niso.org/schemas/jav/1.0/', 'jav'),
    ('http://ns.adobe.com/pdfx/1.3/', 'pdfx'),
    ('http://www.niso.org/schemas/ali/1.0/', 'ali'),
]

# Register every default prefix with lxml at import time. The loop variables
# _uri and _prefix remain bound at module level after the loop finishes.
for _uri, _prefix in DEFAULT_NAMESPACES:
    etree.register_namespace(_prefix, _uri)
72
# This one should not be registered
XMP_NS_XML = "http://www.w3.org/XML/1998/namespace"

# Processing instruction written before the serialized XMP when an xpacket
# wrapper is requested (see PdfMetadata._get_xml_bytes).
XPACKET_BEGIN = b"""<?xpacket begin="\xef\xbb\xbf" id="W5M0MpCehiHzreSzNTczkc9d"?>\n"""

# Minimal XMP document with an empty rdf:RDF element; substituted for empty
# or unparseable metadata.
XMP_EMPTY = b"""<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">
 <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
 </rdf:RDF>
</x:xmpmeta>
"""

# Processing instruction written after the serialized XMP when an xpacket
# wrapper is requested.
XPACKET_END = b"""\n<?xpacket end="w"?>\n"""
85
86
class XmpContainer(NamedTuple):
    """Map XMP container object to suitable Python container."""

    # Local name of the RDF container element, e.g. 'Alt', 'Bag' or 'Seq'.
    rdf_type: str
    # Python type instantiated to hold the container's items.
    py_type: type
    # Unbound method called as insert_fn(container, item) to add one item.
    insert_fn: Callable[..., None]
93
94
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
96
97
class NeverRaise(Exception):
    """An exception that is never raised.

    Used as a placeholder in ``except`` clauses whose exception class is
    chosen at runtime: selecting this class makes the clause catch nothing.
    """
100
101
class AltList(list):
    """XMP AltList container.

    A ``list`` subclass with no added behavior; its type alone marks values
    that are serialized as ``rdf:Alt`` rather than ``rdf:Seq``.
    """
104
105
# Supported RDF containers. Order matters for isinstance() lookups: AltList is
# a list subclass, so 'Alt' must be listed before 'Seq'.
XMP_CONTAINERS = [
    XmpContainer('Alt', AltList, AltList.append),
    XmpContainer('Bag', set, set.add),
    XmpContainer('Seq', list, list.append),
]

# Fully qualified names of properties that are written as language
# alternatives (rdf:Alt) when set to a plain string.
LANG_ALTS = frozenset(
    [
        str(QName(XMP_NS_DC, 'title')),
        str(QName(XMP_NS_DC, 'description')),
        str(QName(XMP_NS_DC, 'rights')),
        str(QName(XMP_NS_XMP_RIGHTS, 'UsageTerms')),
    ]
)
120
# These are the illegal characters in XML 1.0. (XML 1.1 is a bit more permissive,
# but we'll be strict to ensure wider compatibility.)
re_xml_illegal_chars = re.compile(
    r"(?u)[^\x09\x0A\x0D\x20-\U0000D7FF\U0000E000-\U0000FFFD\U00010000-\U0010FFFF]"
)
# Byte-level counterpart: matches control bytes that XML 1.0 forbids, plus the
# literal character reference "&#0;" (a NUL smuggled in as an entity). The
# reference must be spelled in ASCII; a bytes literal cannot hold a non-ASCII
# character.
re_xml_illegal_bytes = re.compile(rb"[^\x09\x0A\x0D\x20-\xFF]|&#0;")
127
128# Might want to check re_xml_illegal_bytes for patterns such as:
129# br"&#(?:[0-9]|0[0-9]|1[0-9]|2[0-9]|3[0-1]
130# |x[0-9A-Fa-f]|x0[0-9A-Fa-f]|x1[0-9A-Fa-f]);"
131
132
def _parser_basic(xml: bytes):
    """Parse the XML bytes as-is, without any cleanup or recovery."""
    stream = BytesIO(xml)
    return parse_xml(stream)
135
136
def _parser_strip_illegal_bytes(xml: bytes):
    """Parse the XML after deleting every match of re_xml_illegal_bytes."""
    sanitized = re_xml_illegal_bytes.sub(b'', xml)
    return parse_xml(BytesIO(sanitized))
139
140
def _parser_recovery(xml: bytes):
    """Parse the XML with the parser's ``recover`` option enabled."""
    stream = BytesIO(xml)
    return parse_xml(stream, recover=True)
143
144
def _parser_replace_with_empty_xmp(_xml: bytes = b''):
    """Log a warning and return a freshly parsed empty XMP document.

    The argument is ignored; it exists so this function can be called like
    the other parsers.
    """
    log.warning("Error occurred parsing XMP, replacing with empty XMP.")
    empty_document = _parser_basic(XMP_EMPTY)
    return empty_document
148
149
150def _clean(s: str | Iterable[str], joiner: str = '; ') -> str:
151 """Ensure an object can safely be inserted in a XML tag body.
152
153 If we still have a non-str object at this point, the best option is to
154 join it, because it's apparently calling for a new node in a place that
155 isn't allowed in the spec or not supported.
156 """
157 if not isinstance(s, str):
158 if isinstance(s, Iterable):
159 warn(f"Merging elements of {s}")
160 if isinstance(s, set):
161 s = joiner.join(sorted(s))
162 else:
163 s = joiner.join(s)
164 else:
165 raise TypeError("object must be a string or iterable of strings")
166 return re_xml_illegal_chars.sub('', s)
167
168
def encode_pdf_date(d: datetime) -> str:
    """Encode Python datetime object as PDF date string.

    From Adobe pdfmark manual:
    (D:YYYYMMDDHHmmSSOHH'mm')
    D: is an optional prefix. YYYY is the year. All fields after the year are
    optional. MM is the month (01-12), DD is the day (01-31), HH is the
    hour (00-23), mm are the minutes (00-59), and SS are the seconds
    (00-59). The remainder of the string defines the relation of local
    time to GMT. O is either + for a positive difference (local time is
    later than GMT) or - (minus) for a negative difference. HH' is the
    absolute value of the offset from GMT in hours, and mm' is the
    absolute value of the offset in minutes. If no GMT information is
    specified, the relation between the specified time and GMT is
    considered unknown. Regardless of whether or not GMT
    information is specified, the remainder of the string should specify
    the local time.

    'D:' is required in PDF/A, so we always add it.
    """
    # %Y is not formatted consistently across platforms (see
    # https://bugs.python.org/issue13305), so zero-pad the year ourselves.
    year_part = f"D:{d.year:04d}"
    rest_part = d.strftime(r'%m%d%H%M%S')
    encoded = year_part + rest_part

    utc_offset = d.strftime('%z')
    if not utc_offset:
        # Naive datetime: no timezone suffix is emitted.
        return encoded

    sign = utc_offset[0]
    hours = utc_offset[1:3]
    minutes = utc_offset[3:5]
    return encoded + f"{sign}{hours}'{minutes}"
199
200
def decode_pdf_date(s: str) -> datetime:
    """Decode a pdfmark date to a Python datetime object.

    A pdfmark date is a string in a particular format, as described in
    :func:`encode_pdf_date`.

    Raises:
        ValueError: if the string matches none of the supported layouts.
    """
    if isinstance(s, String):
        s = str(s)

    # Drop the optional "D:" prefix.
    t = s[2:] if s.startswith('D:') else s

    # UTC spellings, longest first so the most specific suffix wins:
    #   Z00'00'  - incorrect but found in the wild
    #   Z00'00   - correctly formatted UTC
    #   Z        - alternate UTC
    for utc_suffix in ("Z00'00'", "Z00'00", "Z"):
        if t.endswith(utc_suffix):
            t = t.replace(utc_suffix, "+0000")
            break

    # Remove apos from PDF time strings
    t = t.replace("'", "")

    candidate_formats = (
        r"%Y%m%d%H%M%S%z",  # Format with timezone
        r"%Y%m%d%H%M%S",  # Format without timezone
        r"%Y%m%d",  # Date only format
    )
    for candidate in candidate_formats:
        try:
            parsed = datetime.strptime(t, candidate)
        except ValueError:
            continue
        return parsed
    raise ValueError(f"Date string does not match any known format: {s} (read as {t})")
234
235
class Converter(ABC):
    """XMP <-> DocumentInfo converter.

    Abstract interface: subclasses supply both static methods to translate a
    single value in each direction.
    """

    @staticmethod
    @abstractmethod
    def xmp_from_docinfo(docinfo_val: str | None) -> Any:  # type: ignore
        """Derive XMP metadata from a DocumentInfo string."""

    @staticmethod
    @abstractmethod
    def docinfo_from_xmp(xmp_val: Any) -> str | None:
        """Derive a DocumentInfo value from equivalent XMP metadata."""
248
249
class AuthorConverter(Converter):
    """Convert XMP document authors to DocumentInfo."""

    @staticmethod
    def xmp_from_docinfo(docinfo_val: str | None) -> Any:  # type: ignore
        """Wrap the single DocumentInfo author string in a one-item list."""
        authors = [docinfo_val]
        return authors

    @staticmethod
    def docinfo_from_xmp(xmp_val):
        """Derive DocumentInfo authors from XMP.

        XMP may hold several authors but DocumentInfo stores one string, so
        the authors are joined with semi-colons. A plain string is returned
        unchanged; ``None`` or ``[None]`` yields ``None``. ``None`` entries
        inside a longer collection are skipped.
        """
        if isinstance(xmp_val, str):
            return xmp_val
        if xmp_val is None or xmp_val == [None]:
            return None
        named_authors = [author for author in xmp_val if author is not None]
        return '; '.join(named_authors)
270
271
class DateConverter(Converter):
    """Convert XMP dates to DocumentInfo."""

    @staticmethod
    def xmp_from_docinfo(docinfo_val):
        """Convert a PDF date string to an ISO 8601 string for XMP.

        An empty string is passed through unchanged.
        """
        if docinfo_val == '':
            return ''
        parsed = decode_pdf_date(docinfo_val)
        return parsed.isoformat()

    @staticmethod
    def docinfo_from_xmp(xmp_val):
        """Convert an ISO 8601 XMP date to a PDF date string."""
        # Rewrite a trailing "Z" as an explicit offset before handing the
        # string to datetime.fromisoformat().
        iso_text = xmp_val[:-1] + '+00:00' if xmp_val.endswith('Z') else xmp_val
        return encode_pdf_date(datetime.fromisoformat(iso_text))
289
290
class DocinfoMapping(NamedTuple):
    """Map DocumentInfo keys to their XMP equivalents, along with converter."""

    # Namespace URI of the XMP property.
    ns: str
    # Local (unprefixed) name of the XMP property.
    key: str
    # Key of the equivalent entry in the DocumentInfo dictionary.
    name: Name
    # Converter class applied when copying values in either direction, or
    # None when the value is copied as-is.
    converter: type[Converter] | None
298
299
def ensure_loaded(fn):
    """Decorate a method so the XMP is loaded before the method runs.

    If ``self._xmp`` is falsy, ``self._load()`` is called first.

    TODO: Can this be removed? Why allow the uninit'ed state to even exist?
    """

    @wraps(fn)
    def load_then_call(self, *args, **kwargs):
        if self._xmp:
            return fn(self, *args, **kwargs)
        self._load()
        return fn(self, *args, **kwargs)

    return load_then_call
313
314
class PdfMetadata(MutableMapping):
    """Read and edit the metadata associated with a PDF.

    The PDF specification contains two types of metadata, the newer XMP
    (Extensible Metadata Platform, XML-based) and older DocumentInformation
    dictionary. The PDF 2.0 specification removes the DocumentInformation
    dictionary.

    This primarily works with XMP metadata, but includes methods to generate
    XMP from DocumentInformation and will also coordinate updates to
    DocumentInformation so that the two are kept consistent.

    XMP metadata fields may be accessed using the full XML namespace URI or
    the short name. For example ``metadata['dc:description']``
    and ``metadata['{http://purl.org/dc/elements/1.1/}description']``
    both refer to the same field. Several common XML namespaces are registered
    automatically.

    See the XMP specification for details of allowable fields.

    To update metadata, use a with block.

    Example:
        >>> with pdf.open_metadata() as records:
        ...     records['dc:title'] = 'New Title'

    See Also:
        :meth:`pikepdf.Pdf.open_metadata`
    """

    # Pairs of (XMP property, DocumentInfo key) treated as equivalent, with
    # the converter used to translate values between the two.
    DOCINFO_MAPPING: list[DocinfoMapping] = [
        DocinfoMapping(XMP_NS_DC, 'creator', Name.Author, AuthorConverter),
        DocinfoMapping(XMP_NS_DC, 'description', Name.Subject, None),
        DocinfoMapping(XMP_NS_DC, 'title', Name.Title, None),
        DocinfoMapping(XMP_NS_PDF, 'Keywords', Name.Keywords, None),
        DocinfoMapping(XMP_NS_PDF, 'Producer', Name.Producer, None),
        DocinfoMapping(XMP_NS_XMP, 'CreateDate', Name.CreationDate, DateConverter),
        DocinfoMapping(XMP_NS_XMP, 'CreatorTool', Name.Creator, None),
        DocinfoMapping(XMP_NS_XMP, 'ModifyDate', Name.ModDate, DateConverter),
    ]

    # prefix -> namespace URI
    NS: dict[str, str] = {prefix: uri for uri, prefix in DEFAULT_NAMESPACES}
    # namespace URI -> prefix
    REVERSE_NS: dict[str, str] = dict(DEFAULT_NAMESPACES)

    # Parsers tried in order when invalid XML may be overwritten; the last
    # one discards the input and returns an empty XMP document.
    _PARSERS_OVERWRITE_INVALID_XML: Iterable[Callable[[bytes], Any]] = [
        _parser_basic,
        _parser_strip_illegal_bytes,
        _parser_recovery,
        _parser_replace_with_empty_xmp,
    ]
    # Parser list used when invalid XML must not be overwritten.
    _PARSERS_STANDARD: Iterable[Callable[[bytes], Any]] = [_parser_basic]
366
367 @classmethod
368 def register_xml_namespace(cls, uri, prefix):
369 """Register a new XML/XMP namespace.
370
371 Arguments:
372 uri: The long form of the namespace.
373 prefix: The alias to use when interpreting XMP.
374 """
375 cls.NS[prefix] = uri
376 cls.REVERSE_NS[uri] = prefix
377 etree.register_namespace(_prefix, _uri)
378
379 def __init__(
380 self,
381 pdf: Pdf,
382 pikepdf_mark: bool = True,
383 sync_docinfo: bool = True,
384 overwrite_invalid_xml: bool = True,
385 ):
386 """Construct PdfMetadata. Use Pdf.open_metadata() instead."""
387 self._pdf = pdf
388 self.mark = pikepdf_mark
389 self.sync_docinfo = sync_docinfo
390 self._updating = False
391 self.overwrite_invalid_xml = overwrite_invalid_xml
392 self._xmp = None
393
394 def load_from_docinfo(
395 self, docinfo, delete_missing: bool = False, raise_failure: bool = False
396 ) -> None:
397 """Populate the XMP metadata object with DocumentInfo.
398
399 Arguments:
400 docinfo: a DocumentInfo, e.g pdf.docinfo
401 delete_missing: if the entry is not DocumentInfo, delete the equivalent
402 from XMP
403 raise_failure: if True, raise any failure to convert docinfo;
404 otherwise warn and continue
405
406 A few entries in the deprecated DocumentInfo dictionary are considered
407 approximately equivalent to certain XMP records. This method copies
408 those entries into the XMP metadata.
409 """
410
411 def warn_or_raise(msg, e=None):
412 if raise_failure:
413 raise ValueError(msg) from e
414 warn(msg)
415
416 for uri, shortkey, docinfo_name, converter in self.DOCINFO_MAPPING:
417 qname = QName(uri, shortkey)
418 # docinfo might be a dict or pikepdf.Dictionary, so lookup keys
419 # by str(Name)
420 val = docinfo.get(str(docinfo_name))
421 if val is None:
422 if delete_missing and qname in self:
423 del self[qname]
424 continue
425 try:
426 val = str(val)
427 if converter:
428 val = converter.xmp_from_docinfo(val)
429 if not val:
430 continue
431 self._setitem(qname, val, True)
432 except (ValueError, AttributeError, NotImplementedError) as e:
433 warn_or_raise(
434 f"The metadata field {docinfo_name} could not be copied to XMP", e
435 )
436 valid_docinfo_names = {
437 str(docinfo_name) for _, _, docinfo_name, _ in self.DOCINFO_MAPPING
438 }
439 extra_docinfo_names = {str(k) for k in docinfo.keys()} - valid_docinfo_names
440 for extra in extra_docinfo_names:
441 warn_or_raise(
442 f"The metadata field {extra} with value '{repr(docinfo.get(extra))}' "
443 "has no XMP equivalent, so it was discarded",
444 )
445
446 def _load(self) -> None:
447 try:
448 data = self._pdf.Root.Metadata.read_bytes()
449 except AttributeError:
450 data = b''
451 self._load_from(data)
452
    def _load_from(self, data: bytes) -> None:
        """Parse ``data`` as XMP and store the result in ``self._xmp``.

        When ``overwrite_invalid_xml`` is true, progressively more lenient
        parsers are tried and any remaining failure results in an empty XMP
        document. When it is false, only the basic parser is used and the
        ``except`` clauses below are disabled (they catch ``NeverRaise``), so
        parsing and validation errors propagate to the caller.
        """
        if data.strip() == b'':
            data = XMP_EMPTY  # on some platforms lxml chokes on empty documents

        parsers = (
            self._PARSERS_OVERWRITE_INVALID_XML
            if self.overwrite_invalid_xml
            else self._PARSERS_STANDARD
        )

        for parser in parsers:
            try:
                self._xmp = parser(data)
            except (
                XMLSyntaxError if self.overwrite_invalid_xml else NeverRaise  # type: ignore
            ) as e:
                # These two messages mean there is nothing worth recovering,
                # so skip the remaining parsers and start from empty XMP.
                # Any other syntax error falls through to the next parser.
                if str(e).startswith("Start tag expected, '<' not found") or str(
                    e
                ).startswith("Document is empty"):
                    self._xmp = _parser_replace_with_empty_xmp()
                    break
            else:
                # Parsed without error: stop trying parsers.
                break

        if self._xmp is not None:
            try:
                # Strip top-level processing instructions, then check that an
                # rdf:RDF root can be located (raises ValueError otherwise).
                pis = self._xmp.xpath('/processing-instruction()')
                for pi in pis:
                    etree.strip_tags(self._xmp, pi.tag)
                self._get_rdf_root()
            except (
                Exception  # pylint: disable=broad-except
                if self.overwrite_invalid_xml
                else NeverRaise
            ) as e:
                log.warning("Error occurred parsing XMP", exc_info=e)
                self._xmp = _parser_replace_with_empty_xmp()
        else:
            log.warning("Error occurred parsing XMP")
            self._xmp = _parser_replace_with_empty_xmp()
493
494 @ensure_loaded
495 def __enter__(self):
496 """Open metadata for editing."""
497 self._updating = True
498 return self
499
500 def __exit__(self, exc_type, exc_val, exc_tb):
501 """Close metadata and apply changes."""
502 try:
503 if exc_type is not None:
504 return
505 self._apply_changes()
506 finally:
507 self._updating = False
508
509 def _update_docinfo(self):
510 """Update the PDF's DocumentInfo dictionary to match XMP metadata.
511
512 The standard mapping is described here:
513 https://www.pdfa.org/pdfa-metadata-xmp-rdf-dublin-core/
514 """
515 # Touch object to ensure it exists
516 self._pdf.docinfo # pylint: disable=pointless-statement
517 for uri, element, docinfo_name, converter in self.DOCINFO_MAPPING:
518 qname = QName(uri, element)
519 try:
520 value = self[qname]
521 except KeyError:
522 if docinfo_name in self._pdf.docinfo:
523 del self._pdf.docinfo[docinfo_name]
524 continue
525 if converter:
526 try:
527 value = converter.docinfo_from_xmp(value)
528 except ValueError:
529 warn(
530 f"The DocumentInfo field {docinfo_name} could not be "
531 "updated from XMP"
532 )
533 value = None
534 except Exception as e:
535 raise ValueError(
536 "An error occurred while updating DocumentInfo field "
537 f"{docinfo_name} from XMP {qname} with value {value}"
538 ) from e
539 if value is None:
540 if docinfo_name in self._pdf.docinfo:
541 del self._pdf.docinfo[docinfo_name]
542 continue
543 value = _clean(value)
544 try:
545 # Try to save pure ASCII
546 self._pdf.docinfo[docinfo_name] = value.encode('ascii')
547 except UnicodeEncodeError:
548 # qpdf will serialize this as a UTF-16 with BOM string
549 self._pdf.docinfo[docinfo_name] = value
550
551 def _get_xml_bytes(self, xpacket=True):
552 data = BytesIO()
553 if xpacket:
554 data.write(XPACKET_BEGIN)
555 self._xmp.write(data, encoding='utf-8', pretty_print=True)
556 if xpacket:
557 data.write(XPACKET_END)
558 data.seek(0)
559 xml_bytes = data.read()
560 return xml_bytes
561
562 def _apply_changes(self):
563 """Serialize our changes back to the PDF in memory.
564
565 Depending how we are initialized, leave our metadata mark and producer.
566 """
567 if self.mark:
568 # We were asked to mark the file as being edited by pikepdf
569 self._setitem(
570 QName(XMP_NS_XMP, 'MetadataDate'),
571 datetime.now(timezone.utc).isoformat(),
572 applying_mark=True,
573 )
574 self._setitem(
575 QName(XMP_NS_PDF, 'Producer'),
576 'pikepdf ' + pikepdf_version,
577 applying_mark=True,
578 )
579 xml = self._get_xml_bytes()
580 self._pdf.Root.Metadata = Stream(self._pdf, xml)
581 self._pdf.Root.Metadata[Name.Type] = Name.Metadata
582 self._pdf.Root.Metadata[Name.Subtype] = Name.XML
583 if self.sync_docinfo:
584 self._update_docinfo()
585
586 @classmethod
587 def _qname(cls, name: QName | str) -> str:
588 """Convert name to an XML QName.
589
590 e.g. pdf:Producer -> {http://ns.adobe.com/pdf/1.3/}Producer
591 """
592 if isinstance(name, QName):
593 return str(name)
594 if not isinstance(name, str):
595 raise TypeError(f"{name} must be str")
596 if name == '':
597 return name
598 if name.startswith('{'):
599 return name
600 try:
601 prefix, tag = name.split(':', maxsplit=1)
602 except ValueError:
603 # If missing the namespace, it belongs in the default namespace.
604 # A tag such <xyz xmlns="http://example.com"> defines a default
605 # namespace of http://example.com for all enclosed tags that don't
606 # override the namespace with a colon prefix.
607 # XMP does not usually use the default namespace, so we can
608 # assume it's just blank. In practice a document that depends on
609 # defining a default namespace over some part of its content
610 # could introduce a collision.
611 # See: https://www.w3.org/TR/REC-xml-names/#dt-defaultNS
612 prefix, tag = '', name
613 uri = cls.NS.get(prefix, None)
614 return str(QName(uri, tag))
615
616 def _prefix_from_uri(self, uriname):
617 """Given a fully qualified XML name, find a prefix.
618
619 e.g. {http://ns.adobe.com/pdf/1.3/}Producer -> pdf:Producer
620 """
621 uripart, tag = uriname.split('}', maxsplit=1)
622 uri = uripart.replace('{', '')
623 return self.REVERSE_NS[uri] + ':' + tag
624
625 def _get_subelements(self, node: _Element) -> Any:
626 """Gather the sub-elements attached to a node.
627
628 Gather rdf:Bag and and rdf:Seq into set and list respectively. For
629 alternate languages values, take the first language only for
630 simplicity.
631 """
632 items = node.find('rdf:Alt', self.NS)
633 if items is not None:
634 try:
635 return items[0].text
636 except IndexError:
637 return ''
638
639 for xmlcontainer, container, insertfn in XMP_CONTAINERS:
640 items = node.find(f'rdf:{xmlcontainer}', self.NS)
641 if items is None:
642 continue
643 result = container()
644 for item in items:
645 insertfn(result, item.text)
646 return result
647 return ''
648
649 def _get_rdf_root(self) -> _Element:
650 assert self._xmp is not None
651 rdf = self._xmp.find('.//rdf:RDF', self.NS)
652 if rdf is None:
653 rdf = self._xmp.getroot()
654 if not rdf.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF':
655 raise ValueError("Metadata seems to be XML but not XMP")
656 return rdf
657
658 def _get_elements(
659 self, name: str | QName = ''
660 ) -> Iterator[tuple[_Element, str | bytes | None, Any, _Element]]:
661 """Get elements from XMP.
662
663 Core routine to find elements matching name within the XMP and yield
664 them.
665
666 For XMP spec 7.9.2.2, rdf:Description with property attributes,
667 we yield the node which will have the desired as one of its attributes.
668 qname is returned so that the node.attrib can be used to locate the
669 source.
670
671 For XMP spec 7.5, simple valued XMP properties, we yield the node,
672 None, and the value. For structure or array valued properties we gather
673 the elements. We ignore qualifiers.
674
675 Args:
676 name: a prefixed name or QName to look for within the
677 data section of the XMP; looks for all data keys if omitted
678
679 Yields:
680 tuple: (node, qname_attrib, value, parent_node)
681
682 """
683 qname = self._qname(name)
684 rdf = self._get_rdf_root()
685 for rdfdesc in rdf.findall('rdf:Description[@rdf:about=""]', self.NS):
686 if qname and qname in rdfdesc.keys():
687 yield (rdfdesc, qname, rdfdesc.get(qname), rdf)
688 elif not qname:
689 for k, v in rdfdesc.items():
690 if v:
691 yield (rdfdesc, k, v, rdf)
692 xpath = qname if name else '*'
693 for node in rdfdesc.findall(xpath, self.NS):
694 if node.text and node.text.strip():
695 yield (node, None, node.text, rdfdesc)
696 continue
697 values = self._get_subelements(node)
698 yield (node, None, values, rdfdesc)
699
700 def _get_element_values(self, name: str | QName = '') -> Iterator[Any]:
701 yield from (v[2] for v in self._get_elements(name))
702
703 @ensure_loaded
704 def __contains__(self, key: str | QName):
705 """Test if XMP key is in metadata."""
706 return any(self._get_element_values(key))
707
708 @ensure_loaded
709 def __getitem__(self, key: str | QName):
710 """Retrieve XMP metadata for key."""
711 try:
712 return next(self._get_element_values(key))
713 except StopIteration:
714 raise KeyError(key) from None
715
716 @ensure_loaded
717 def __iter__(self):
718 """Iterate through XMP metadata attributes and nodes."""
719 for node, attrib, _val, _parents in self._get_elements():
720 if attrib:
721 yield attrib
722 else:
723 yield node.tag
724
725 @ensure_loaded
726 def __len__(self):
727 """Return number of items in metadata."""
728 return len(list(iter(self)))
729
730 def _setitem(
731 self,
732 key: str | QName,
733 val: set[str] | list[str] | str,
734 applying_mark: bool = False,
735 ):
736 if not self._updating:
737 raise RuntimeError("Metadata not opened for editing, use with block")
738
739 qkey = self._qname(key)
740 self._setitem_check_args(key, val, applying_mark, qkey)
741
742 try:
743 # Update existing node
744 self._setitem_update(key, val, qkey)
745 except StopIteration:
746 # Insert a new node
747 self._setitem_insert(key, val)
748
749 def _setitem_check_args(self, key, val, applying_mark: bool, qkey: str) -> None:
750 if (
751 self.mark
752 and not applying_mark
753 and qkey
754 in (
755 self._qname('xmp:MetadataDate'),
756 self._qname('pdf:Producer'),
757 )
758 ):
759 # Complain if user writes self[pdf:Producer] = ... and because it will
760 # be overwritten on save, unless self._updating_mark, in which case
761 # the action was initiated internally
762 log.warning(
763 f"Update to {key} will be overwritten because metadata was opened "
764 "with set_pikepdf_as_editor=True"
765 )
766 if isinstance(val, str) and qkey in (self._qname('dc:creator')):
767 log.error(f"{key} should be set to a list of strings")
768
769 def _setitem_add_array(self, node, items: Iterable) -> None:
770 rdf_type = next(
771 c.rdf_type for c in XMP_CONTAINERS if isinstance(items, c.py_type)
772 )
773 seq = etree.SubElement(node, str(QName(XMP_NS_RDF, rdf_type)))
774 tag_attrib: dict[str, str] | None = None
775 if rdf_type == 'Alt':
776 tag_attrib = {str(QName(XMP_NS_XML, 'lang')): 'x-default'}
777 for item in items:
778 el = etree.SubElement(seq, str(QName(XMP_NS_RDF, 'li')), attrib=tag_attrib)
779 if item is not None:
780 inner_text: str | None = _clean(item)
781 if inner_text == '':
782 inner_text = None
783 el.text = inner_text
784
785 def _setitem_update(self, key, val, qkey):
786 # Locate existing node to replace
787 node, attrib, _oldval, _parent = next(self._get_elements(key))
788 if attrib:
789 if not isinstance(val, str):
790 if qkey == self._qname('dc:creator'):
791 # dc:creator incorrectly created as an attribute - we're
792 # replacing it anyway, so remove the old one
793 del node.attrib[qkey]
794 self._setitem_add_array(node, _clean(val))
795 else:
796 raise TypeError(f"Setting {key} to {val} with type {type(val)}")
797 else:
798 node.set(attrib, _clean(val))
799 elif isinstance(val, (list, set)):
800 for child in node.findall('*'):
801 node.remove(child)
802 self._setitem_add_array(node, val)
803 elif isinstance(val, str):
804 for child in node.findall('*'):
805 node.remove(child)
806 if str(self._qname(key)) in LANG_ALTS:
807 self._setitem_add_array(node, AltList([_clean(val)]))
808 else:
809 node.text = _clean(val)
810 else:
811 raise TypeError(f"Setting {key} to {val} with type {type(val)}")
812
813 def _setitem_insert(self, key, val):
814 rdf = self._get_rdf_root()
815 if str(self._qname(key)) in LANG_ALTS:
816 val = AltList([_clean(val)])
817 if isinstance(val, (list, set)):
818 rdfdesc = etree.SubElement(
819 rdf,
820 str(QName(XMP_NS_RDF, 'Description')),
821 attrib={str(QName(XMP_NS_RDF, 'about')): ''},
822 )
823 node = etree.SubElement(rdfdesc, self._qname(key))
824 self._setitem_add_array(node, val)
825 elif isinstance(val, str):
826 rdfdesc = rdf.find('rdf:Description[@rdf:about=""]', self.NS)
827 if rdfdesc is None:
828 rdfdesc = etree.SubElement(
829 rdf,
830 str(QName(XMP_NS_RDF, 'Description')),
831 attrib={str(QName(XMP_NS_RDF, 'about')): ''},
832 )
833 node = etree.SubElement(rdfdesc, self._qname(key))
834 node.text = _clean(val)
835 else:
836 raise TypeError(f"Setting {key} to {val} with type {type(val)}") from None
837
838 @ensure_loaded
839 def __setitem__(self, key: str | QName, val: set[str] | list[str] | str):
840 """Set XMP metadata key to value."""
841 return self._setitem(key, val, False)
842
843 @ensure_loaded
844 def __delitem__(self, key: str | QName):
845 """Delete item from XMP metadata."""
846 if not self._updating:
847 raise RuntimeError("Metadata not opened for editing, use with block")
848 try:
849 node, attrib, _oldval, parent = next(self._get_elements(key))
850 if attrib: # Inline
851 del node.attrib[attrib]
852 if (
853 len(node.attrib) == 1
854 and len(node) == 0
855 and QName(XMP_NS_RDF, 'about') in node.attrib.keys()
856 ):
857 # The only thing left on this node is rdf:about="", so remove it
858 parent.remove(node)
859 else:
860 parent.remove(node)
861 except StopIteration:
862 raise KeyError(key) from None
863
864 @property
865 def pdfa_status(self) -> str:
866 """Return the PDF/A conformance level claimed by this PDF, or False.
867
868 A PDF may claim to PDF/A compliant without this being true. Use an
869 independent verifier such as veraPDF to test if a PDF is truly
870 conformant.
871
872 Returns:
873 The conformance level of the PDF/A, or an empty string if the
874 PDF does not claim PDF/A conformance. Possible valid values
875 are: 1A, 1B, 2A, 2B, 2U, 3A, 3B, 3U.
876 """
877 # do same as @ensure_loaded - mypy can't handle decorated property
878 if not self._xmp:
879 self._load()
880
881 key_part = QName(XMP_NS_PDFA_ID, 'part')
882 key_conformance = QName(XMP_NS_PDFA_ID, 'conformance')
883 try:
884 return self[key_part] + self[key_conformance]
885 except KeyError:
886 return ''
887
888 @property
889 def pdfx_status(self) -> str:
890 """Return the PDF/X conformance level claimed by this PDF, or False.
891
892 A PDF may claim to PDF/X compliant without this being true. Use an
893 independent verifier such as veraPDF to test if a PDF is truly
894 conformant.
895
896 Returns:
897 The conformance level of the PDF/X, or an empty string if the
898 PDF does not claim PDF/X conformance.
899 """
900 # do same as @ensure_loaded - mypy can't handle decorated property
901 if not self._xmp:
902 self._load()
903
904 pdfx_version = QName(XMP_NS_PDFX_ID, 'GTS_PDFXVersion')
905 try:
906 return self[pdfx_version]
907 except KeyError:
908 return ''
909
910 @ensure_loaded
911 def __str__(self):
912 """Convert XMP metadata to XML string."""
913 return self._get_xml_bytes(xpacket=False).decode('utf-8')