Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/xmp.py: 34%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Anything related to Extensible Metadata Platform (XMP) metadata.
4https://en.wikipedia.org/wiki/Extensible_Metadata_Platform
5"""
7import datetime
8import decimal
9import re
10from collections.abc import Iterator
11from typing import (
12 Any,
13 Callable,
14 Optional,
15 TypeVar,
16 Union,
17 cast,
18)
19from xml.dom.expatbuilder import ExpatBuilderNS
20from xml.dom.minidom import Document
21from xml.dom.minidom import Element as XmlElement
22from xml.parsers.expat import ExpatError, XMLParserType
24from ._protocols import XmpInformationProtocol
25from ._utils import StreamType, deprecate_with_replacement, deprecation_no_replacement
26from .errors import PdfReadError, XmpDocumentError
27from .generic import ContentStream, PdfObject
29RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
30DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
31XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
32PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
33XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"
35# What is the PDFX namespace, you might ask?
36# It's documented here: https://github.com/adobe/xmp-docs/raw/master/XMPSpecifications/XMPSpecificationPart3.pdf
37# This namespace is used to place "custom metadata"
38# properties, which are arbitrary metadata properties with no semantic or
39# documented meaning.
40#
41# Elements in the namespace are key/value-style storage,
42# where the element name is the key and the content is the value. The keys
43# are transformed into valid XML identifiers by substituting an invalid
44# identifier character with \u2182 followed by the unicode hex ID of the
45# original character. A key like "my car" is therefore "my\u21820020car".
46#
47# \u2182 is the unicode character \u{ROMAN NUMERAL TEN THOUSAND}
48#
49# The pdfx namespace should be avoided.
50# A custom data schema and sensical XML elements could be used instead, as is
51# suggested by Adobe's own documentation on XMP under "Extensibility of
52# Schemas".
53PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"
55# PDF/A
56PDFAID_NAMESPACE = "http://www.aiim.org/pdfa/ns/id/"
58# Internal mapping of namespace URI → prefix
59_NAMESPACE_PREFIX_MAP = {
60 DC_NAMESPACE: "dc",
61 XMP_NAMESPACE: "xmp",
62 PDF_NAMESPACE: "pdf",
63 XMPMM_NAMESPACE: "xmpMM",
64 PDFAID_NAMESPACE: "pdfaid",
65 PDFX_NAMESPACE: "pdfx",
66}
# Date pattern accepted for XMP date values: a year, optionally followed by
# month, day and a time part. When a time part is present it must end with a
# time zone designator ("Z" or a "+hh:mm" / "-hh:mm" offset).
#
# The fractional-seconds separator is an escaped literal dot; an unescaped "."
# would accept any character there (e.g. "30x5") and hand a non-numeric
# "second" group to the caller.
iso8601 = re.compile(
    r"""
    (?P<year>[0-9]{4})
    (-
        (?P<month>[0-9]{2})
        (-
            (?P<day>[0-9]+)
            (T
                (?P<hour>[0-9]{2}):
                (?P<minute>[0-9]{2})
                (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
            )?
        )?
    )?
    """,
    re.VERBOSE,
)
88K = TypeVar("K")
90# Minimal XMP template
91_MINIMAL_XMP = f"""<?xpacket begin="\ufeff" id="W5M0MpCehiHzreSzNTczkc9d"?>
92<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pypdf">
93 <rdf:RDF xmlns:rdf="{RDF_NAMESPACE}">
94 <rdf:Description rdf:about=""
95 xmlns:dc="{DC_NAMESPACE}"
96 xmlns:xmp="{XMP_NAMESPACE}"
97 xmlns:pdf="{PDF_NAMESPACE}"
98 xmlns:xmpMM="{XMPMM_NAMESPACE}"
99 xmlns:pdfaid="{PDFAID_NAMESPACE}"
100 xmlns:pdfx="{PDFX_NAMESPACE}">
101 </rdf:Description>
102 </rdf:RDF>
103</x:xmpmeta>
104<?xpacket end="r"?>"""
def _identity(value: K) -> K:
    """Return ``value`` unchanged; used as the default (no-op) converter."""
    return value
def _converter_date(value: str) -> datetime.datetime:
    """
    Parse an XMP date string into a naive datetime expressed in UTC.

    Components missing from the string default to month 1, day 1 and 00:00:00.
    A time zone offset, if present, is removed by shifting the result to UTC;
    a missing designator is treated like "Z".

    Args:
        value: The date string, matched against the module-level ``iso8601``
            pattern (only a prefix of the string has to match).

    Returns:
        A datetime without ``tzinfo``.

    Raises:
        ValueError: If the string does not match the pattern, or if a matched
            component is not a valid number / calendar value.

    """
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")
    year = int(matches.group("year"))
    month = int(matches.group("month") or "1")
    day = int(matches.group("day") or "1")
    hour = int(matches.group("hour") or "0")
    minute = int(matches.group("minute") or "0")
    try:
        second = decimal.Decimal(matches.group("second") or "0")
    except decimal.InvalidOperation:
        # Keep the documented error type even if the pattern lets a
        # non-numeric seconds value through.
        raise ValueError(f"Invalid date format: {value}") from None

    # Split the (possibly fractional) seconds into whole seconds and microseconds.
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    microseconds = int((second - whole_seconds) * 1_000_000)

    dt = datetime.datetime(year, month, day, hour, minute, int(whole_seconds), microseconds)

    tzd = matches.group("tzd") or "Z"
    if tzd != "Z":
        # Take the direction from the sign character itself: int("+00") and
        # int("-00") are both 0, so the sign of the parsed hours cannot be used.
        offset_hours, offset_minutes = (int(part) for part in tzd[1:].split(":"))
        offset = datetime.timedelta(hours=offset_hours, minutes=offset_minutes)
        # Local time = UTC + offset, hence UTC = local time - offset.
        if tzd[0] == "-":
            dt = dt + offset
        else:
            dt = dt - offset
    return dt
138def _format_datetime_utc(value: datetime.datetime) -> str:
139 """Format a datetime as UTC with trailing 'Z'.
141 - If the input is timezone-aware, convert to UTC first.
142 - If naive, assume UTC.
143 """
144 if value.tzinfo is not None and value.utcoffset() is not None:
145 value = value.astimezone(datetime.timezone.utc)
147 value = value.replace(tzinfo=None)
148 return value.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
def _generic_get(
    element: XmlElement, self: "XmpInformation", list_type: str, converter: Callable[[Any], Any] = _identity
) -> Optional[list[str]]:
    """
    Collect the items of every RDF array of one kind found below ``element``.

    Args:
        element: The element to search.
        self: The owning object; its ``_get_text`` extracts each item's text.
        list_type: Local name of the RDF container to look for
            (the callers in this module pass "Bag" and "Seq").
        converter: Applied to the text of every ``rdf:li`` item.

    Returns:
        The converted items of all matching containers, in document order, or
        ``None`` if ``element`` contains no container of that kind.

    """
    arrays = element.getElementsByTagNameNS(RDF_NAMESPACE, list_type)
    if not arrays:
        return None
    return [
        converter(self._get_text(entry))
        for array in arrays
        for entry in array.getElementsByTagNameNS(RDF_NAMESPACE, "li")
    ]
166class _XmpBuilder(ExpatBuilderNS):
167 """
168 Custom XML parser denying all entity declarations.
170 This is a stripped down and typed version inspired by what *defusedxml* does.
172 Why do we need this? The default limits of *libexpat* used by Python only block exponential entity expansion,
173 but not cases like quadratic entity expansion which can still cause quite some memory usage.
174 """
176 def custom_entity_declaration_handler(
177 self,
178 entity_name: str,
179 is_parameter_entity: bool,
180 value: Optional[str],
181 base: Optional[str],
182 system_id: str,
183 public_id: Optional[str],
184 notation_name: Optional[str],
185 ) -> None:
186 raise ExpatError(f"Forbidden entities: {entity_name!r}")
188 def install(self, parser: XMLParserType) -> None:
189 super().install(parser)
191 parser.EntityDeclHandler = self.custom_entity_declaration_handler
class XmpInformation(XmpInformationProtocol, PdfObject):
    """
    An object that represents Extensible Metadata Platform (XMP) metadata.
    Usually accessed by :py:attr:`xmp_metadata()<pypdf.PdfReader.xmp_metadata>`.

    The XML packet held by ``stream`` is parsed once. Getters read from the
    parsed DOM and memoise their results in ``cache``; setters modify the DOM
    and serialise it back into ``stream``.

    Raises:
        PdfReadError: if XML is invalid or contains no ``rdf:RDF`` element

    """

    # One escaped character inside a pdfx key: U+2182 followed by four hex digits.
    _PDFX_ESCAPE = re.compile("\u2182([0-9A-Fa-f]{4})")

    def __init__(self, stream: ContentStream) -> None:
        self.stream = stream
        try:
            data = self.stream.get_data()
            doc_root: Document = _XmpBuilder().parseString(data)
        except (AttributeError, ExpatError) as e:
            raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
        rdf_roots = doc_root.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")
        if not rdf_roots:
            # Report a missing root the same way as any other invalid packet
            # instead of failing with a bare IndexError.
            raise PdfReadError("XML in XmpInformation was invalid: no rdf:RDF element found")
        self.rdf_root: XmlElement = rdf_roots[0]
        # namespace URI -> {property name -> converted value}
        self.cache: dict[Any, Any] = {}

    @classmethod
    def create(cls) -> "XmpInformation":
        """
        Create a new XmpInformation object with minimal structure.

        Returns:
            A new XmpInformation instance with empty metadata fields.
        """
        stream = ContentStream(None, None)
        stream.set_data(_MINIMAL_XMP.encode("utf-8"))
        return cls(stream)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Deprecated: write the underlying metadata stream to ``stream``."""
        deprecate_with_replacement(
            "XmpInformation.write_to_stream",
            "PdfWriter.xmp_metadata",
            "6.0.0"
        )
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        self.stream.write_to_stream(stream)

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        """
        Yield every node holding the property ``namespace``/``name``.

        For each ``rdf:Description`` whose ``rdf:about`` equals ``about_uri``,
        the attribute form (if any) is yielded first, followed by all matching
        descendant elements.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                yield from desc.getElementsByTagNameNS(namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        """
        Yield all attributes and direct children in ``namespace``.

        Only ``rdf:Description`` elements whose ``rdf:about`` equals
        ``about_uri`` are considered; attributes come before child nodes.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                for attr in desc.attributes.values():
                    if attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _get_text(self, element: XmlElement) -> str:
        """Concatenate the data of the direct text-node children of ``element``."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    def _get_node_value(self, node: Any) -> Any:
        """Return the value of an attribute node or the text of an element node."""
        if node.nodeType == node.ATTRIBUTE_NODE:
            return node.nodeValue
        return self._get_text(node)

    def _get_single_value(
        self,
        namespace: str,
        name: str,
        converter: Callable[[str], Any] = _identity,
    ) -> Optional[Any]:
        """
        Return the converted value of the first node holding the property.

        Returns:
            The converted value, or ``None`` if the property is absent.
        """
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cached
        value = None
        for element in self.get_element("", namespace, name):
            value = self._get_node_value(element)
            break
        if value is not None:
            value = converter(value)
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = value
        return value

    def _getter_bag(self, namespace: str, name: str) -> Optional[list[str]]:
        """
        Return the items of an unordered array (``rdf:Bag``) property.

        A node without an ``rdf:Bag`` contributes its plain text as one item.
        """
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cast(list[str], cached)
        retval: list[str] = []
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                # Attribute nodes cannot be searched for child containers;
                # take their value directly.
                retval.append(element.nodeValue)
            elif (bags := _generic_get(element, self, list_type="Bag")) is not None:
                retval.extend(bags)
            else:
                retval.append(self._get_text(element))
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = retval
        return retval

    def _get_seq_values(
        self,
        namespace: str,
        name: str,
        converter: Callable[[Any], Any] = _identity,
    ) -> Optional[list[Any]]:
        """
        Return the converted items of an ordered array (``rdf:Seq``) property.

        Falls back to ``rdf:Bag`` and then to the node's plain text when no
        ``rdf:Seq`` is present; ``converter`` is applied in every case.
        """
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cast(list[Any], cached)
        retval: list[Any] = []
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                # Attribute nodes cannot be searched for child containers.
                retval.append(converter(element.nodeValue))
            elif (seqs := _generic_get(element, self, list_type="Seq", converter=converter)) is not None:
                retval.extend(seqs)
            elif (bags := _generic_get(element, self, list_type="Bag", converter=converter)) is not None:
                # See issue at https://github.com/py-pdf/pypdf/issues/3324
                # Some applications violate the XMP metadata standard regarding `dc:creator` which should
                # be an "ordered array" and thus a sequence, but use an unordered array (bag) instead.
                # This seems to stem from the fact that the original Dublin Core specification does indeed
                # use bags or direct values, while PDFs are expected to follow the XMP standard and ignore
                # the plain Dublin Core variant. For this reason, add a fallback here to deal with such
                # issues accordingly.
                retval.extend(bags)
            else:
                retval.append(converter(self._get_text(element)))
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = retval
        return retval

    def _get_langalt_values(self, namespace: str, name: str) -> Optional[dict[Any, Any]]:
        """
        Return a language alternative (``rdf:Alt``) property as a dictionary.

        Keys are the ``xml:lang`` values of the items. A node without an
        ``rdf:Alt`` is stored under the key "x-default".
        """
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cast(dict[Any, Any], cached)
        retval: dict[Any, Any] = {}
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                # Attribute nodes cannot be searched for child containers.
                retval["x-default"] = element.nodeValue
                continue
            alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
            if alts:
                for alt in alts:
                    for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval[item.getAttribute("xml:lang")] = self._get_text(item)
            else:
                retval["x-default"] = self._get_text(element)
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = retval
        return retval

    @property
    def dc_contributor(self) -> Optional[list[str]]:
        """Contributors to the resource (other than the authors)."""
        return self._getter_bag(DC_NAMESPACE, "contributor")

    @dc_contributor.setter
    def dc_contributor(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "contributor", values)

    @property
    def dc_coverage(self) -> Optional[str]:
        """Text describing the extent or scope of the resource."""
        return self._get_single_value(DC_NAMESPACE, "coverage")

    @dc_coverage.setter
    def dc_coverage(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "coverage", value)

    @property
    def dc_creator(self) -> Optional[list[str]]:
        """A sorted array of names of the authors of the resource, listed in order of precedence."""
        return self._get_seq_values(DC_NAMESPACE, "creator")

    @dc_creator.setter
    def dc_creator(self, values: Optional[list[str]]) -> None:
        self._set_seq_values(DC_NAMESPACE, "creator", values)

    @property
    def dc_date(self) -> Optional[list[datetime.datetime]]:
        """A sorted array of dates of significance to the resource. The dates and times are in UTC."""
        return self._get_seq_values(DC_NAMESPACE, "date", _converter_date)

    @dc_date.setter
    def dc_date(self, values: Optional[list[Union[str, datetime.datetime]]]) -> None:
        if values is None:
            self._set_seq_values(DC_NAMESPACE, "date", None)
        else:
            # datetime objects are normalised to UTC; anything else is written as given.
            date_strings = [
                _format_datetime_utc(value) if isinstance(value, datetime.datetime) else str(value)
                for value in values
            ]
            self._set_seq_values(DC_NAMESPACE, "date", date_strings)

    @property
    def dc_description(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the content of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "description")

    @dc_description.setter
    def dc_description(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "description", values)

    @property
    def dc_format(self) -> Optional[str]:
        """The mime-type of the resource."""
        return self._get_single_value(DC_NAMESPACE, "format")

    @dc_format.setter
    def dc_format(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "format", value)

    @property
    def dc_identifier(self) -> Optional[str]:
        """Unique identifier of the resource."""
        return self._get_single_value(DC_NAMESPACE, "identifier")

    @dc_identifier.setter
    def dc_identifier(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "identifier", value)

    @property
    def dc_language(self) -> Optional[list[str]]:
        """An unordered array specifying the languages used in the resource."""
        return self._getter_bag(DC_NAMESPACE, "language")

    @dc_language.setter
    def dc_language(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "language", values)

    @property
    def dc_publisher(self) -> Optional[list[str]]:
        """An unordered array of publisher names."""
        return self._getter_bag(DC_NAMESPACE, "publisher")

    @dc_publisher.setter
    def dc_publisher(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "publisher", values)

    @property
    def dc_relation(self) -> Optional[list[str]]:
        """An unordered array of text descriptions of relationships to other documents."""
        return self._getter_bag(DC_NAMESPACE, "relation")

    @dc_relation.setter
    def dc_relation(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "relation", values)

    @property
    def dc_rights(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the rights the user has to this resource."""
        return self._get_langalt_values(DC_NAMESPACE, "rights")

    @dc_rights.setter
    def dc_rights(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "rights", values)

    @property
    def dc_source(self) -> Optional[str]:
        """Unique identifier of the work from which this resource was derived."""
        return self._get_single_value(DC_NAMESPACE, "source")

    @dc_source.setter
    def dc_source(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "source", value)

    @property
    def dc_subject(self) -> Optional[list[str]]:
        """An unordered array of descriptive phrases or keywords that specify the topic of the content."""
        return self._getter_bag(DC_NAMESPACE, "subject")

    @dc_subject.setter
    def dc_subject(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "subject", values)

    @property
    def dc_title(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of the title of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "title")

    @dc_title.setter
    def dc_title(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "title", values)

    @property
    def dc_type(self) -> Optional[list[str]]:
        """An unordered array of textual descriptions of the document type."""
        return self._getter_bag(DC_NAMESPACE, "type")

    @dc_type.setter
    def dc_type(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "type", values)

    @property
    def pdf_keywords(self) -> Optional[str]:
        """An unformatted text string representing document keywords."""
        return self._get_single_value(PDF_NAMESPACE, "Keywords")

    @pdf_keywords.setter
    def pdf_keywords(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Keywords", value)

    @property
    def pdf_pdfversion(self) -> Optional[str]:
        """The PDF file version, for example 1.0 or 1.3."""
        return self._get_single_value(PDF_NAMESPACE, "PDFVersion")

    @pdf_pdfversion.setter
    def pdf_pdfversion(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "PDFVersion", value)

    @property
    def pdf_producer(self) -> Optional[str]:
        """The name of the tool that saved the document as a PDF."""
        return self._get_single_value(PDF_NAMESPACE, "Producer")

    @pdf_producer.setter
    def pdf_producer(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Producer", value)

    @property
    def xmp_create_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was originally created. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "CreateDate", _converter_date)

    @xmp_create_date.setter
    def xmp_create_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_single_value(
            XMP_NAMESPACE, "CreateDate", _format_datetime_utc(value) if value else None
        )

    @property
    def xmp_modify_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was last modified. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "ModifyDate", _converter_date)

    @xmp_modify_date.setter
    def xmp_modify_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_single_value(
            XMP_NAMESPACE, "ModifyDate", _format_datetime_utc(value) if value else None
        )

    @property
    def xmp_metadata_date(self) -> Optional[datetime.datetime]:
        """The date and time that any metadata for this resource was last changed. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "MetadataDate", _converter_date)

    @xmp_metadata_date.setter
    def xmp_metadata_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_single_value(
            XMP_NAMESPACE, "MetadataDate", _format_datetime_utc(value) if value else None
        )

    @property
    def xmp_creator_tool(self) -> Optional[str]:
        """The name of the first known tool used to create the resource."""
        return self._get_single_value(XMP_NAMESPACE, "CreatorTool")

    @xmp_creator_tool.setter
    def xmp_creator_tool(self, value: Optional[str]) -> None:
        self._set_single_value(XMP_NAMESPACE, "CreatorTool", value)

    @property
    def xmpmm_document_id(self) -> Optional[str]:
        """The common identifier for all versions and renditions of this resource."""
        return self._get_single_value(XMPMM_NAMESPACE, "DocumentID")

    @xmpmm_document_id.setter
    def xmpmm_document_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "DocumentID", value)

    @property
    def xmpmm_instance_id(self) -> Optional[str]:
        """An identifier for a specific incarnation of a document, updated each time a file is saved."""
        return self._get_single_value(XMPMM_NAMESPACE, "InstanceID")

    @xmpmm_instance_id.setter
    def xmpmm_instance_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "InstanceID", value)

    @property
    def pdfaid_part(self) -> Optional[str]:
        """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3)."""
        return self._get_single_value(PDFAID_NAMESPACE, "part")

    @pdfaid_part.setter
    def pdfaid_part(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "part", value)

    @property
    def pdfaid_conformance(self) -> Optional[str]:
        """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U')."""
        return self._get_single_value(PDFAID_NAMESPACE, "conformance")

    @pdfaid_conformance.setter
    def pdfaid_conformance(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "conformance", value)

    @property
    def custom_properties(self) -> dict[Any, Any]:
        """
        Retrieve custom metadata properties defined in the undocumented pdfx
        metadata schema.

        Escaped characters in a key (U+2182 followed by four hex digits) are
        decoded; an escape marker not followed by four hex digits is kept
        verbatim instead of aborting the lookup. The result is computed once
        and then reused.

        Returns:
            A dictionary of key/value items for custom metadata properties.

        """
        if not hasattr(self, "_custom_properties"):
            properties: dict[Any, Any] = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                key = self._PDFX_ESCAPE.sub(
                    lambda match: chr(int(match.group(1), 16)), node.localName
                )
                properties[key] = self._get_node_value(node)
            self._custom_properties = properties
        return self._custom_properties

    def _get_or_create_description(self, about_uri: str = "") -> XmlElement:
        """Get or create an rdf:Description element with the given about URI."""
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                return desc

        desc = self._owner_document().createElementNS(RDF_NAMESPACE, "rdf:Description")
        desc.setAttributeNS(RDF_NAMESPACE, "rdf:about", about_uri)
        self.rdf_root.appendChild(desc)
        return desc

    def _clear_cache_entry(self, namespace: str, name: str) -> None:
        """Remove a cached value for a given namespace/name if present."""
        ns_cache = self.cache.get(namespace)
        if ns_cache and name in ns_cache:
            del ns_cache[name]

    def _owner_document(self) -> Document:
        """
        Return the DOM document that owns ``rdf_root``.

        Raises:
            XmpDocumentError: If ``rdf_root`` has no owner document.
        """
        doc = self.rdf_root.ownerDocument
        if doc is None:
            raise XmpDocumentError("XMP Document is None")
        return doc

    def _remove_property(self, desc: XmlElement, namespace: str, name: str) -> None:
        """
        Remove the element and attribute forms of a property.

        Besides ``desc`` itself, every ``rdf:Description`` that is a direct
        child of ``rdf_root`` and has the same ``rdf:about`` is cleaned, so no
        stale value survives in a sibling description. Only direct children of
        a description are removed: ``removeChild`` rejects deeper descendants.
        """
        about_uri = desc.getAttributeNS(RDF_NAMESPACE, "about")
        targets = [desc]
        for node in self.rdf_root.childNodes:
            if (
                node is not desc
                and node.nodeType == node.ELEMENT_NODE
                and node.namespaceURI == RDF_NAMESPACE
                and node.localName == "Description"
                and node.getAttributeNS(RDF_NAMESPACE, "about") == about_uri
            ):
                targets.append(node)

        for target in targets:
            # Iterate over a copy because children are removed while looping.
            for child in list(target.childNodes):
                if (
                    child.nodeType == child.ELEMENT_NODE
                    and child.namespaceURI == namespace
                    and child.localName == name
                ):
                    target.removeChild(child)
            existing_attr = target.getAttributeNodeNS(namespace, name)
            if existing_attr is not None:
                target.removeAttributeNode(existing_attr)

    def _build_array(self, container: str, items: list[tuple[Optional[str], Any]]) -> XmlElement:
        """
        Build an ``rdf:<container>`` element with one ``rdf:li`` per item.

        Each item is a ``(language, value)`` pair; ``xml:lang`` is only set
        when the language is not ``None``.
        """
        doc = self._owner_document()
        array = doc.createElementNS(RDF_NAMESPACE, f"rdf:{container}")
        for lang, value in items:
            li = doc.createElementNS(RDF_NAMESPACE, "rdf:li")
            if lang is not None:
                li.setAttribute("xml:lang", lang)
            li.appendChild(doc.createTextNode(str(value)))
            array.appendChild(li)
        return array

    def _write_property(self, namespace: str, name: str, content: Optional[Any]) -> None:
        """
        Replace a property with a new element wrapping ``content``.

        With ``content`` set to ``None`` the property is only removed. The
        cache entry is dropped and the stream is re-serialised either way.
        """
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()
        self._remove_property(desc, namespace, name)

        if content is not None:
            # NOTE(review): no xmlns declaration is added here; this appears to
            # rely on the prefix already being declared in the packet - confirm.
            prefix = self._get_namespace_prefix(namespace)
            elem = self._owner_document().createElementNS(namespace, f"{prefix}:{name}")
            elem.appendChild(content)
            desc.appendChild(elem)

        self._update_stream()

    def _set_single_value(self, namespace: str, name: str, value: Optional[str]) -> None:
        """Set or remove a single metadata value."""
        content = None
        if value is not None:
            content = self._owner_document().createTextNode(str(value))
        self._write_property(namespace, name, content)

    def _set_bag_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove bag values (unordered array)."""
        content = None
        if values:
            content = self._build_array("Bag", [(None, value) for value in values])
        self._write_property(namespace, name, content)

    def _set_seq_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove sequence values (ordered array)."""
        content = None
        if values:
            content = self._build_array("Seq", [(None, value) for value in values])
        self._write_property(namespace, name, content)

    def _set_langalt_values(self, namespace: str, name: str, values: Optional[dict[str, str]]) -> None:
        """Set or remove language alternative values."""
        content = None
        if values:
            content = self._build_array("Alt", list(values.items()))
        self._write_property(namespace, name, content)

    def _get_namespace_prefix(self, namespace: str) -> str:
        """
        Get the appropriate namespace prefix for a given namespace URI.

        Returns "unknown" for URIs missing from ``_NAMESPACE_PREFIX_MAP``.
        """
        return _NAMESPACE_PREFIX_MAP.get(namespace, "unknown")

    def _update_stream(self) -> None:
        """Update the stream with the current XML content."""
        xml_data = self._owner_document().toxml(encoding="utf-8")
        self.stream.set_data(xml_data)