Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/xmp.py: 34%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Anything related to Extensible Metadata Platform (XMP) metadata.
4https://en.wikipedia.org/wiki/Extensible_Metadata_Platform
5"""
7import datetime
8import decimal
9import re
10from collections.abc import Iterator
11from typing import (
12 Any,
13 Callable,
14 Optional,
15 TypeVar,
16 Union,
17 cast,
18)
19from xml.dom.expatbuilder import ExpatBuilderNS
20from xml.dom.minidom import Document
21from xml.dom.minidom import Element as XmlElement
22from xml.dom.xmlbuilder import Options
23from xml.parsers.expat import ExpatError, XMLParserType
25from ._protocols import XmpInformationProtocol
26from ._utils import StreamType, deprecate_with_replacement, deprecation_no_replacement
27from .errors import LimitReachedError, PdfReadError, XmpDocumentError
28from .generic import ContentStream, PdfObject
# Hard limits applied before/while parsing an XMP packet.
XMP_MAX_INPUT_LENGTH = 5_000_000
XMP_MAX_ELEMENT_COUNT = 100_000

# Namespace URIs of the schemas handled by this module.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# What is the PDFX namespace, you might ask?
# It is documented here:
# https://github.com/adobe/xmp-docs/raw/master/XMPSpecifications/XMPSpecificationPart3.pdf
# The namespace holds "custom metadata" properties, i.e. arbitrary metadata
# properties with no semantic or documented meaning.
#
# Elements in the namespace are key/value-style storage: the element name is
# the key and the element content is the value. Keys are turned into valid
# XML identifiers by replacing each invalid identifier character with \u2182
# followed by the Unicode hex ID of the original character. A key like
# "my car" therefore becomes "my\u21820020car".
#
# \u2182 is the Unicode character \u{ROMAN NUMERAL TEN THOUSAND}.
#
# The pdfx namespace should be avoided. A custom data schema with sensible
# XML elements could be used instead, as suggested by Adobe's own XMP
# documentation under "Extensibility of Schemas".
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# PDF/A
PDFAID_NAMESPACE = "http://www.aiim.org/pdfa/ns/id/"

# Internal mapping of namespace URI → prefix
_NAMESPACE_PREFIX_MAP = {
    uri: prefix
    for prefix, uri in (
        ("dc", DC_NAMESPACE),
        ("xmp", XMP_NAMESPACE),
        ("pdf", PDF_NAMESPACE),
        ("xmpMM", XMPMM_NAMESPACE),
        ("pdfaid", PDFAID_NAMESPACE),
        ("pdfx", PDFX_NAMESPACE),
    )
}
# Pattern for the date/time profile used by XMP: a year, optionally followed
# by month, day and a time of day. Every level is optional once the previous
# one is present.
#
# - The fractional-seconds separator is an escaped literal dot; an unescaped
#   dot would accept any character (e.g. "34x5") as part of the seconds.
# - The time zone designator is optional because XMP permits date-times
#   without one; otherwise the whole time part of such a value would be
#   silently discarded by the enclosing optional group.
iso8601 = re.compile(
    r"""
    (?P<year>[0-9]{4})
    (-
        (?P<month>[0-9]{2})
        (-
            (?P<day>[0-9]+)
            (T
                (?P<hour>[0-9]{2}):
                (?P<minute>[0-9]{2})
                (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})?
            )?
        )?
    )?
    """,
    re.VERBOSE,
)
# Generic type variable used to annotate ``_identity``: the value returned has
# the same type as the value passed in.
K = TypeVar("K")

# Minimal XMP template
# An XMP packet holding a single empty rdf:Description (rdf:about="") that
# declares the prefixes of all namespaces known to this module. It is the
# initial content used by ``XmpInformation.create()``.
_MINIMAL_XMP = f"""<?xpacket begin="\ufeff" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pypdf">
 <rdf:RDF xmlns:rdf="{RDF_NAMESPACE}">
 <rdf:Description rdf:about=""
 xmlns:dc="{DC_NAMESPACE}"
 xmlns:xmp="{XMP_NAMESPACE}"
 xmlns:pdf="{PDF_NAMESPACE}"
 xmlns:xmpMM="{XMPMM_NAMESPACE}"
 xmlns:pdfaid="{PDFAID_NAMESPACE}"
 xmlns:pdfx="{PDFX_NAMESPACE}">
 </rdf:Description>
 </rdf:RDF>
</x:xmpmeta>
<?xpacket end="r"?>"""
def _identity(value: K) -> K:
    """Return ``value`` unchanged; serves as the default (no-op) converter."""
    return value
def _converter_date(value: str) -> datetime.datetime:
    """
    Convert an XMP date string into a naive datetime expressed in UTC.

    Missing components default to the start of the period (month 1, day 1,
    00:00:00). A value without time zone designator is treated as UTC.

    Args:
        value: The date string, e.g. ``2021-04-28T12:23:34.123+02:00``.

    Returns:
        A timezone-naive datetime whose fields are in UTC.

    Raises:
        ValueError: If the value does not start with a four-digit year or a
            component is out of range for :class:`datetime.datetime`.
    """
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")
    year = int(matches.group("year"))
    month = int(matches.group("month") or "1")
    day = int(matches.group("day") or "1")
    hour = int(matches.group("hour") or "0")
    minute = int(matches.group("minute") or "0")

    # Split the (possibly fractional) seconds into whole seconds and
    # microseconds; Decimal avoids binary floating point rounding here.
    second = decimal.Decimal(matches.group("second") or "0")
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    microseconds = int((second - whole_seconds) * 1_000_000)

    dt = datetime.datetime(year, month, day, hour, minute, int(whole_seconds), microseconds)

    tzd = matches.group("tzd") or "Z"
    if tzd != "Z":
        # Take the sign from the designator itself instead of from the parsed
        # hour value: for offsets like "+00:30" the hour is 0 and carries no
        # sign, which previously applied the minutes in the wrong direction.
        offset_hours, offset_minutes = (int(part) for part in tzd[1:].split(":"))
        offset = datetime.timedelta(hours=offset_hours, minutes=offset_minutes)
        # local time = UTC + offset  =>  UTC = local time - offset
        dt = dt + offset if tzd[0] == "-" else dt - offset
    return dt
142def _format_datetime_utc(value: datetime.datetime) -> str:
143 """Format a datetime as UTC with trailing 'Z'.
145 - If the input is timezone-aware, convert to UTC first.
146 - If naive, assume UTC.
147 """
148 if value.tzinfo is not None and value.utcoffset() is not None:
149 value = value.astimezone(datetime.timezone.utc)
151 value = value.replace(tzinfo=None)
152 return value.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
def _generic_get(
    element: XmlElement, self: "XmpInformation", list_type: str, converter: Callable[[Any], Any] = _identity
) -> Optional[list[str]]:
    """
    Collect the converted ``rdf:li`` values of all RDF containers of one kind.

    Args:
        element: The property element to search in.
        self: The XmpInformation instance providing ``_get_text``.
        list_type: Local name of the RDF container, e.g. ``Bag`` or ``Seq``.
        converter: Applied to the text of every list item.

    Returns:
        The converted item values, or None if ``element`` contains no
        container of the requested kind.
    """
    containers = element.getElementsByTagNameNS(RDF_NAMESPACE, list_type)
    if not containers:
        return None
    return [
        converter(self._get_text(item))
        for container in containers
        for item in container.getElementsByTagNameNS(RDF_NAMESPACE, "li")
    ]
class _XmpBuilder(ExpatBuilderNS):
    """
    Namespace-aware DOM builder that rejects entity declarations and caps the number of elements.

    This is a stripped down and typed version inspired by what *defusedxml* does.

    Why do we need this? The default limits of *libexpat* used by Python only block exponential entity expansion,
    but not cases like quadratic entity expansion which can still cause quite some memory usage.
    """

    def __init__(self, options: Optional[Options] = None) -> None:
        super().__init__(options=options)
        # Number of start tags seen so far by this builder.
        self._element_count = 0

    def custom_entity_declaration_handler(
        self,
        entity_name: str,
        is_parameter_entity: bool,
        value: Optional[str],
        base: Optional[str],
        system_id: str,
        public_id: Optional[str],
        notation_name: Optional[str],
    ) -> None:
        """Abort parsing as soon as any entity declaration is encountered."""
        raise ExpatError(f"Forbidden entities: {entity_name!r}")

    def start_element_handler(self, name: str, attributes: list[str]) -> None:
        """Count the element, enforce the element limit, then build the node as usual."""
        seen = self._element_count + 1
        self._element_count = seen
        if seen > XMP_MAX_ELEMENT_COUNT:
            raise LimitReachedError(f"XMP metadata exceeds limit of {XMP_MAX_ELEMENT_COUNT} elements.")
        super().start_element_handler(name=name, attributes=attributes)

    def install(self, parser: XMLParserType) -> None:
        """Install the default handlers and then override the two customized ones."""
        super().install(parser)

        parser.StartElementHandler = self.start_element_handler
        parser.EntityDeclHandler = self.custom_entity_declaration_handler
class XmpInformation(XmpInformationProtocol, PdfObject):
    """
    An object that represents Extensible Metadata Platform (XMP) metadata.
    Usually accessed by :py:attr:`xmp_metadata()<pypdf.PdfReader.xmp_metadata>`.

    Property values are read from every ``rdf:Description`` whose ``rdf:about``
    is empty and are cached per namespace/name until the matching setter is used.
    Every setter immediately serializes the XML document back into the stream.

    Raises:
        PdfReadError: if XML is invalid or does not contain an ``rdf:RDF`` element
        LimitReachedError: if the stream is larger than ``XMP_MAX_INPUT_LENGTH``

    """

    def __init__(self, stream: ContentStream) -> None:
        self.stream = stream
        try:
            data = self.stream.get_data()
            if (length := len(data)) > XMP_MAX_INPUT_LENGTH:
                raise LimitReachedError(f"XMP stream size {length} exceeds limit of {XMP_MAX_INPUT_LENGTH}.")
            doc_root: Document = _XmpBuilder().parseString(data)
        except (AttributeError, ExpatError) as e:
            raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
        rdf_roots = doc_root.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")
        if not rdf_roots:
            # Well-formed XML without an rdf:RDF element is not usable XMP;
            # report it like any other invalid input instead of an IndexError.
            raise PdfReadError("XML in XmpInformation was invalid: no rdf:RDF element found")
        self.rdf_root: XmlElement = rdf_roots[0]
        # Parsed property values, keyed by namespace URI and then by property name.
        self.cache: dict[Any, Any] = {}

    @classmethod
    def create(cls) -> "XmpInformation":
        """
        Create a new XmpInformation object with minimal structure.

        Returns:
            A new XmpInformation instance with empty metadata fields.
        """
        stream = ContentStream(None, None)
        stream.set_data(_MINIMAL_XMP.encode("utf-8"))
        return cls(stream)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the underlying stream object to ``stream`` (deprecated)."""
        deprecate_with_replacement(
            "XmpInformation.write_to_stream",
            "PdfWriter.xmp_metadata",
            "6.0.0"
        )
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        self.stream.write_to_stream(stream)

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        """
        Yield all nodes holding the property ``name`` of ``namespace``.

        For every matching description, the attribute form (if present) is
        yielded first, followed by all descendant elements with that name.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                yield from desc.getElementsByTagNameNS(namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        """Yield all attributes and direct child nodes of matching descriptions that belong to ``namespace``."""
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr and attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _get_text(self, element: XmlElement) -> str:
        """Return the concatenated data of the direct text children of ``element``."""
        return "".join(
            child.data for child in element.childNodes if child.nodeType == child.TEXT_NODE
        )

    def _get_single_value(
        self,
        namespace: str,
        name: str,
        converter: Callable[[str], Any] = _identity,
    ) -> Optional[Any]:
        """Return the converted value of a simple property, or None if it is absent."""
        ns_cache = self.cache.get(namespace, {})
        # Membership test instead of truthiness, so that empty or missing
        # values are served from the cache as well.
        if name in ns_cache:
            return ns_cache[name]
        value = None
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                value = element.nodeValue
            else:
                value = self._get_text(element)
            break  # only the first occurrence is relevant
        if value is not None:
            value = converter(value)
        self.cache.setdefault(namespace, {})[name] = value
        return value

    def _getter_bag(self, namespace: str, name: str) -> Optional[list[str]]:
        """Return the values of an unordered array property (empty list if absent)."""
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return cast(list[str], ns_cache[name])
        retval: list[str] = []
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                # Attribute nodes cannot contain RDF containers and do not
                # offer getElementsByTagNameNS; use their plain value.
                retval.append(element.nodeValue)
            elif (bags := _generic_get(element, self, list_type="Bag")) is not None:
                retval.extend(bags)
            else:
                retval.append(self._get_text(element))
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    def _get_seq_values(
        self,
        namespace: str,
        name: str,
        converter: Callable[[Any], Any] = _identity,
    ) -> Optional[list[Any]]:
        """Return the converted values of an ordered array property (empty list if absent)."""
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return cast(list[Any], ns_cache[name])
        retval: list[Any] = []
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                # Attribute nodes cannot contain RDF containers and do not
                # offer getElementsByTagNameNS; use their plain value.
                retval.append(converter(element.nodeValue))
            elif (seqs := _generic_get(element, self, list_type="Seq", converter=converter)) is not None:
                retval.extend(seqs)
            elif (bags := _generic_get(element, self, list_type="Bag", converter=converter)) is not None:
                # See issue at https://github.com/py-pdf/pypdf/issues/3324
                # Some applications violate the XMP metadata standard regarding `dc:creator` which should
                # be an "ordered array" and thus a sequence, but use an unordered array (bag) instead.
                # This seems to stem from the fact that the original Dublin Core specification does indeed
                # use bags or direct values, while PDFs are expected to follow the XMP standard and ignore
                # the plain Dublin Core variant. For this reason, add a fallback here to deal with such
                # issues accordingly. The converter is applied here as well, so that all branches
                # return values of the same type.
                retval.extend(bags)
            else:
                retval.append(converter(self._get_text(element)))
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    def _get_langalt_values(self, namespace: str, name: str) -> Optional[dict[Any, Any]]:
        """Return a language-keyed dictionary for a language alternative property (empty dict if absent)."""
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return cast(dict[Any, Any], ns_cache[name])
        retval: dict[Any, Any] = {}
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                # The attribute form carries a single plain value without language.
                retval["x-default"] = element.nodeValue
                continue
            alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
            if alts:
                for alt in alts:
                    for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval[item.getAttribute("xml:lang")] = self._get_text(item)
            else:
                retval["x-default"] = self._get_text(element)
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    @property
    def dc_contributor(self) -> Optional[list[str]]:
        """Contributors to the resource (other than the authors)."""
        return self._getter_bag(DC_NAMESPACE, "contributor")

    @dc_contributor.setter
    def dc_contributor(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "contributor", values)

    @property
    def dc_coverage(self) -> Optional[str]:
        """Text describing the extent or scope of the resource."""
        return self._get_single_value(DC_NAMESPACE, "coverage")

    @dc_coverage.setter
    def dc_coverage(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "coverage", value)

    @property
    def dc_creator(self) -> Optional[list[str]]:
        """A sorted array of names of the authors of the resource, listed in order of precedence."""
        return self._get_seq_values(DC_NAMESPACE, "creator")

    @dc_creator.setter
    def dc_creator(self, values: Optional[list[str]]) -> None:
        self._set_seq_values(DC_NAMESPACE, "creator", values)

    @property
    def dc_date(self) -> Optional[list[datetime.datetime]]:
        """A sorted array of dates of significance to the resource. The dates and times are in UTC."""
        return self._get_seq_values(DC_NAMESPACE, "date", _converter_date)

    @dc_date.setter
    def dc_date(self, values: Optional[list[Union[str, datetime.datetime]]]) -> None:
        if values is None:
            self._set_seq_values(DC_NAMESPACE, "date", None)
        else:
            date_strings = [
                _format_datetime_utc(value) if isinstance(value, datetime.datetime) else str(value)
                for value in values
            ]
            self._set_seq_values(DC_NAMESPACE, "date", date_strings)

    @property
    def dc_description(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the content of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "description")

    @dc_description.setter
    def dc_description(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "description", values)

    @property
    def dc_format(self) -> Optional[str]:
        """The mime-type of the resource."""
        return self._get_single_value(DC_NAMESPACE, "format")

    @dc_format.setter
    def dc_format(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "format", value)

    @property
    def dc_identifier(self) -> Optional[str]:
        """Unique identifier of the resource."""
        return self._get_single_value(DC_NAMESPACE, "identifier")

    @dc_identifier.setter
    def dc_identifier(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "identifier", value)

    @property
    def dc_language(self) -> Optional[list[str]]:
        """An unordered array specifying the languages used in the resource."""
        return self._getter_bag(DC_NAMESPACE, "language")

    @dc_language.setter
    def dc_language(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "language", values)

    @property
    def dc_publisher(self) -> Optional[list[str]]:
        """An unordered array of publisher names."""
        return self._getter_bag(DC_NAMESPACE, "publisher")

    @dc_publisher.setter
    def dc_publisher(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "publisher", values)

    @property
    def dc_relation(self) -> Optional[list[str]]:
        """An unordered array of text descriptions of relationships to other documents."""
        return self._getter_bag(DC_NAMESPACE, "relation")

    @dc_relation.setter
    def dc_relation(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "relation", values)

    @property
    def dc_rights(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the rights the user has to this resource."""
        return self._get_langalt_values(DC_NAMESPACE, "rights")

    @dc_rights.setter
    def dc_rights(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "rights", values)

    @property
    def dc_source(self) -> Optional[str]:
        """Unique identifier of the work from which this resource was derived."""
        return self._get_single_value(DC_NAMESPACE, "source")

    @dc_source.setter
    def dc_source(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "source", value)

    @property
    def dc_subject(self) -> Optional[list[str]]:
        """An unordered array of descriptive phrases or keywords that specify the topic of the content."""
        return self._getter_bag(DC_NAMESPACE, "subject")

    @dc_subject.setter
    def dc_subject(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "subject", values)

    @property
    def dc_title(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of the title of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "title")

    @dc_title.setter
    def dc_title(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "title", values)

    @property
    def dc_type(self) -> Optional[list[str]]:
        """An unordered array of textual descriptions of the document type."""
        return self._getter_bag(DC_NAMESPACE, "type")

    @dc_type.setter
    def dc_type(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "type", values)

    @property
    def pdf_keywords(self) -> Optional[str]:
        """An unformatted text string representing document keywords."""
        return self._get_single_value(PDF_NAMESPACE, "Keywords")

    @pdf_keywords.setter
    def pdf_keywords(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Keywords", value)

    @property
    def pdf_pdfversion(self) -> Optional[str]:
        """The PDF file version, for example 1.0 or 1.3."""
        return self._get_single_value(PDF_NAMESPACE, "PDFVersion")

    @pdf_pdfversion.setter
    def pdf_pdfversion(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "PDFVersion", value)

    @property
    def pdf_producer(self) -> Optional[str]:
        """The name of the tool that saved the document as a PDF."""
        return self._get_single_value(PDF_NAMESPACE, "Producer")

    @pdf_producer.setter
    def pdf_producer(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Producer", value)

    @property
    def xmp_create_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was originally created. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "CreateDate", _converter_date)

    @xmp_create_date.setter
    def xmp_create_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_date_value("CreateDate", value)

    @property
    def xmp_modify_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was last modified. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "ModifyDate", _converter_date)

    @xmp_modify_date.setter
    def xmp_modify_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_date_value("ModifyDate", value)

    @property
    def xmp_metadata_date(self) -> Optional[datetime.datetime]:
        """The date and time that any metadata for this resource was last changed. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "MetadataDate", _converter_date)

    @xmp_metadata_date.setter
    def xmp_metadata_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_date_value("MetadataDate", value)

    @property
    def xmp_creator_tool(self) -> Optional[str]:
        """The name of the first known tool used to create the resource."""
        return self._get_single_value(XMP_NAMESPACE, "CreatorTool")

    @xmp_creator_tool.setter
    def xmp_creator_tool(self, value: Optional[str]) -> None:
        self._set_single_value(XMP_NAMESPACE, "CreatorTool", value)

    @property
    def xmpmm_document_id(self) -> Optional[str]:
        """The common identifier for all versions and renditions of this resource."""
        return self._get_single_value(XMPMM_NAMESPACE, "DocumentID")

    @xmpmm_document_id.setter
    def xmpmm_document_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "DocumentID", value)

    @property
    def xmpmm_instance_id(self) -> Optional[str]:
        """An identifier for a specific incarnation of a document, updated each time a file is saved."""
        return self._get_single_value(XMPMM_NAMESPACE, "InstanceID")

    @xmpmm_instance_id.setter
    def xmpmm_instance_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "InstanceID", value)

    @property
    def pdfaid_part(self) -> Optional[str]:
        """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3)."""
        return self._get_single_value(PDFAID_NAMESPACE, "part")

    @pdfaid_part.setter
    def pdfaid_part(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "part", value)

    @property
    def pdfaid_conformance(self) -> Optional[str]:
        """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U')."""
        return self._get_single_value(PDFAID_NAMESPACE, "conformance")

    @pdfaid_conformance.setter
    def pdfaid_conformance(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "conformance", value)

    @property
    def custom_properties(self) -> dict[Any, Any]:
        """
        Retrieve custom metadata properties defined in the undocumented pdfx
        metadata schema.

        Returns:
            A dictionary of key/value items for custom metadata properties.

        """
        if not hasattr(self, "_custom_properties"):
            self._custom_properties = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                key = node.localName
                while True:
                    # see documentation about PDFX_NAMESPACE earlier in file
                    idx = key.find("\u2182")
                    if idx == -1:
                        break
                    # Replace the marker and its four hex digits by the
                    # character they encode.
                    key = (
                        key[:idx]
                        + chr(int(key[idx + 1 : idx + 5], base=16))
                        + key[idx + 5 :]
                    )
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._get_text(node)
                self._custom_properties[key] = value
        return self._custom_properties

    def _require_document(self) -> Document:
        """Return the document owning ``rdf_root`` or raise XmpDocumentError if there is none."""
        doc = self.rdf_root.ownerDocument
        if doc is None:
            raise XmpDocumentError("XMP Document is None")
        return doc

    @staticmethod
    def _prefix_in_scope(scope: Any, prefix: str, namespace: str) -> bool:
        """Tell whether ``prefix`` is bound to ``namespace`` for children of ``scope``."""
        declaration = f"xmlns:{prefix}"
        node = scope
        while node is not None and node.nodeType == node.ELEMENT_NODE:
            if node.hasAttribute(declaration):
                # The nearest declaration wins, even if it binds another URI.
                return bool(node.getAttribute(declaration) == namespace)
            node = node.parentNode
        return False

    def _new_element(
        self, doc: Document, scope: XmlElement, namespace: str, prefix: str, local_name: str
    ) -> XmlElement:
        """
        Create the element ``prefix:local_name`` to be placed below ``scope``.

        The serializer does not emit namespace declarations on its own. If the
        prefix is not bound to ``namespace`` at ``scope``, declare it on the
        new element, as the written XML could not be parsed again otherwise.
        """
        elem = doc.createElementNS(namespace, f"{prefix}:{local_name}")
        if not self._prefix_in_scope(scope, prefix, namespace):
            elem.setAttribute(f"xmlns:{prefix}", namespace)
        return elem

    def _remove_property(self, namespace: str, name: str, about_uri: str = "") -> None:
        """
        Remove every representation of a property from all matching descriptions.

        The getters read from all descriptions with the given about URI,
        including nested elements and the attribute form, so all of them have
        to be removed for a new value to fully replace the old one.
        """
        descriptions = [
            desc
            for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description")
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri
        ]
        for desc in descriptions:
            for elem in list(desc.getElementsByTagNameNS(namespace, name)):
                # Matches may be nested deeper than the description itself, so
                # detach them from their real parent. The parent is None if
                # the element has already been removed via another description.
                parent = elem.parentNode
                if parent is not None:
                    parent.removeChild(elem)
            if existing_attr := desc.getAttributeNodeNS(namespace, name):
                desc.removeAttributeNode(existing_attr)

    def _get_or_create_description(self, about_uri: str = "") -> XmlElement:
        """Get or create an rdf:Description element with the given about URI."""
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                return desc

        doc = self._require_document()
        desc = self._new_element(doc, self.rdf_root, RDF_NAMESPACE, "rdf", "Description")
        desc.setAttributeNS(RDF_NAMESPACE, "rdf:about", about_uri)
        self.rdf_root.appendChild(desc)
        return desc

    def _clear_cache_entry(self, namespace: str, name: str) -> None:
        """Remove a cached value for a given namespace/name if present."""
        ns_cache = self.cache.get(namespace)
        if ns_cache is not None:
            ns_cache.pop(name, None)

    def _set_date_value(self, name: str, value: Optional[datetime.datetime]) -> None:
        """Set or remove a date property of the XMP namespace."""
        date_str = _format_datetime_utc(value) if value else None
        self._set_single_value(XMP_NAMESPACE, name, date_str)

    def _set_single_value(self, namespace: str, name: str, value: Optional[str]) -> None:
        """Set or remove a single metadata value."""
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()
        self._remove_property(namespace, name)

        if value is not None:
            doc = self._require_document()
            prefix = self._get_namespace_prefix(namespace)
            elem = self._new_element(doc, desc, namespace, prefix, name)
            elem.appendChild(doc.createTextNode(str(value)))
            desc.appendChild(elem)

        self._update_stream()

    def _set_array_values(
        self, namespace: str, name: str, container_type: str, values: Optional[list[str]]
    ) -> None:
        """Set or remove an array property stored in the RDF container ``container_type``."""
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()
        self._remove_property(namespace, name)

        if values:
            doc = self._require_document()
            prefix = self._get_namespace_prefix(namespace)
            elem = self._new_element(doc, desc, namespace, prefix, name)
            container = self._new_element(doc, desc, RDF_NAMESPACE, "rdf", container_type)

            for value in values:
                li = self._new_element(doc, desc, RDF_NAMESPACE, "rdf", "li")
                li.appendChild(doc.createTextNode(str(value)))
                container.appendChild(li)

            elem.appendChild(container)
            desc.appendChild(elem)

        self._update_stream()

    def _set_bag_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove bag values (unordered array)."""
        self._set_array_values(namespace, name, "Bag", values)

    def _set_seq_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove sequence values (ordered array)."""
        self._set_array_values(namespace, name, "Seq", values)

    def _set_langalt_values(self, namespace: str, name: str, values: Optional[dict[str, str]]) -> None:
        """Set or remove language alternative values."""
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()
        self._remove_property(namespace, name)

        if values:
            doc = self._require_document()
            prefix = self._get_namespace_prefix(namespace)
            elem = self._new_element(doc, desc, namespace, prefix, name)
            alt = self._new_element(doc, desc, RDF_NAMESPACE, "rdf", "Alt")

            for lang, value in values.items():
                li = self._new_element(doc, desc, RDF_NAMESPACE, "rdf", "li")
                li.setAttribute("xml:lang", lang)
                li.appendChild(doc.createTextNode(str(value)))
                alt.appendChild(li)

            elem.appendChild(alt)
            desc.appendChild(elem)

        self._update_stream()

    def _get_namespace_prefix(self, namespace: str) -> str:
        """Get the appropriate namespace prefix for a given namespace URI."""
        return _NAMESPACE_PREFIX_MAP.get(namespace, "unknown")

    def _update_stream(self) -> None:
        """Update the stream with the current XML content."""
        doc = self._require_document()
        xml_data = doc.toxml(encoding="utf-8")
        self.stream.set_data(xml_data)