Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/xmp.py: 33%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Anything related to Extensible Metadata Platform (XMP) metadata.
4https://en.wikipedia.org/wiki/Extensible_Metadata_Platform
5"""
7import datetime
8import decimal
9import re
10from collections.abc import Iterator
11from typing import (
12 Any,
13 Callable,
14 Optional,
15 TypeVar,
16 Union,
17)
18from xml.dom.minidom import Document, parseString
19from xml.dom.minidom import Element as XmlElement
20from xml.parsers.expat import ExpatError
22from ._protocols import XmpInformationProtocol
23from ._utils import StreamType, deprecate_with_replacement, deprecation_no_replacement
24from .errors import PdfReadError, XmpDocumentError
25from .generic import ContentStream, PdfObject
# Namespace URIs used to look up and create XMP properties.
# RDF syntax namespace (rdf:RDF, rdf:Description, rdf:Bag/Seq/Alt, rdf:li).
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
# Dublin Core elements (the ``dc_*`` properties).
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
# XMP basic schema (the ``xmp_*`` properties).
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
# Adobe PDF schema (the ``pdf_*`` properties).
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
# XMP media management schema (the ``xmpmm_*`` properties).
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"
# What is the PDFX namespace, you might ask?
# It's documented here: https://github.com/adobe/xmp-docs/raw/master/XMPSpecifications/XMPSpecificationPart3.pdf
# This namespace is used to place "custom metadata"
# properties, which are arbitrary metadata properties with no semantic or
# documented meaning.
#
# Elements in the namespace are key/value-style storage,
# where the element name is the key and the content is the value. The keys
# are transformed into valid XML identifiers by substituting an invalid
# identifier character with \u2182 followed by the unicode hex ID of the
# original character. A key like "my car" is therefore "my\u21820020car".
#
# \u2182 is the unicode character \N{ROMAN NUMERAL TEN THOUSAND}
#
# The pdfx namespace should be avoided.
# A custom data schema and sensible XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP under "Extensibility of
# Schemas".
# Namespace holding the free-form "custom metadata" properties described above.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# PDF/A
# Namespace of the PDF/A identification properties (``pdfaid_*``).
PDFAID_NAMESPACE = "http://www.aiim.org/pdfa/ns/id/"

# Internal mapping of namespace URI → prefix
# Consulted when new elements are created so that their qualified names use
# the conventional prefix for the namespace.
_NAMESPACE_PREFIX_MAP = {
    DC_NAMESPACE: "dc",
    XMP_NAMESPACE: "xmp",
    PDF_NAMESPACE: "pdf",
    XMPMM_NAMESPACE: "xmpMM",
    PDFAID_NAMESPACE: "pdfaid",
    PDFX_NAMESPACE: "pdfx",
}
# Pattern for the ISO 8601 subset used by XMP date values.
#
# Only the year is mandatory; month, day and the time part are progressively
# optional. When a time part is present it must carry a time zone designator
# ("Z" or "+hh:mm" / "-hh:mm"). The pattern is meant to be used with
# ``match()``, so trailing text that does not fit is left unmatched.
#
# The decimal separator of the seconds is an escaped, literal ".": an
# unescaped dot would accept any character (for example "00x5") and hand a
# string to the caller that cannot be converted to a number.
iso8601 = re.compile(
    r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """,
    re.VERBOSE,
)
# Generic type variable used to annotate pass-through helpers.
K = TypeVar("K")
# Minimal XMP template
# An XMP packet with a single, empty rdf:Description (rdf:about="") that
# declares the namespace prefixes listed below. It is the starting document
# for metadata created from scratch.
_MINIMAL_XMP = f"""<?xpacket begin="\ufeff" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pypdf">
 <rdf:RDF xmlns:rdf="{RDF_NAMESPACE}">
 <rdf:Description rdf:about=""
 xmlns:dc="{DC_NAMESPACE}"
 xmlns:xmp="{XMP_NAMESPACE}"
 xmlns:pdf="{PDF_NAMESPACE}"
 xmlns:xmpMM="{XMPMM_NAMESPACE}"
 xmlns:pdfaid="{PDFAID_NAMESPACE}"
 xmlns:pdfx="{PDFX_NAMESPACE}">
 </rdf:Description>
 </rdf:RDF>
</x:xmpmeta>
<?xpacket end="r"?>"""
def _identity(value: K) -> K:
    """Return ``value`` unchanged; used as the default (no-op) converter."""
    return value
def _converter_date(value: str) -> datetime.datetime:
    """
    Convert an XMP (ISO 8601 subset) date string to a naive UTC datetime.

    Missing month/day default to 1 and a missing time to midnight. A time
    zone offset, if present, is removed by shifting the result to UTC.

    Args:
        value: The date string, e.g. ``2020-01-31T12:30:00+02:00``.

    Returns:
        A datetime without ``tzinfo`` whose fields are expressed in UTC.

    Raises:
        ValueError: If ``value`` does not start with a four-digit year or
            the parsed fields do not form a valid date.

    """
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")
    year = int(matches.group("year"))
    month = int(matches.group("month") or "1")
    day = int(matches.group("day") or "1")
    hour = int(matches.group("hour") or "0")
    minute = int(matches.group("minute") or "0")

    # Split the (possibly fractional) seconds into whole seconds and
    # microseconds; Decimal avoids binary floating point rounding.
    second = decimal.Decimal(matches.group("second") or "0")
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    microseconds = int((second - whole_seconds) * 1_000_000)

    dt = datetime.datetime(year, month, day, hour, minute, int(whole_seconds), microseconds)

    tzd = matches.group("tzd") or "Z"
    if tzd != "Z":
        # Take the sign from the leading character, not from the parsed hour
        # value: an offset such as "+00:30" has zero hours, so the sign would
        # otherwise be lost and the minutes applied in the wrong direction.
        sign = -1 if tzd[0] == "-" else 1
        tzd_hours, tzd_minutes = (int(x) for x in tzd[1:].split(":"))
        offset = sign * datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
        # local time = UTC + offset  =>  UTC = local time - offset
        dt = dt - offset
    return dt
def _format_datetime_utc(value: datetime.datetime) -> str:
    """
    Render a datetime as a UTC timestamp string ending in 'Z'.

    Timezone-aware values are converted to UTC before formatting; naive
    values are taken to be UTC already.
    """
    is_aware = value.tzinfo is not None and value.utcoffset() is not None
    utc_value = value.astimezone(datetime.timezone.utc) if is_aware else value
    # Drop tzinfo so only the literal "Z" suffix describes the zone.
    naive_utc = utc_value.replace(tzinfo=None)
    return naive_utc.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
def _generic_get(
    element: XmlElement, self: "XmpInformation", list_type: str, converter: Callable[[Any], Any] = _identity
) -> Optional[list[Any]]:
    """
    Collect the ``rdf:li`` values of all RDF containers of one kind below a node.

    Args:
        element: The node to search in.
        self: The object providing ``_get_text`` to read an item's text.
        list_type: Local name of the RDF container element, e.g. "Bag" or "Seq".
        converter: Applied to the text of every item before it is collected.

    Returns:
        The converted item values in document order, or None if ``element``
        contains no container of the requested kind.

    """
    # Attribute nodes have no descendants and do not offer
    # getElementsByTagNameNS; report "no container" instead of failing.
    find_descendants = getattr(element, "getElementsByTagNameNS", None)
    if find_descendants is None:
        return None
    containers = find_descendants(RDF_NAMESPACE, list_type)
    if not containers:
        return None
    return [
        converter(self._get_text(item))
        for container in containers
        for item in container.getElementsByTagNameNS(RDF_NAMESPACE, "li")
    ]
class XmpInformation(XmpInformationProtocol, PdfObject):
    """
    An object that represents Extensible Metadata Platform (XMP) metadata.
    Usually accessed by :py:attr:`xmp_metadata()<pypdf.PdfReader.xmp_metadata>`.

    Property values are read from the ``rdf:Description`` whose ``rdf:about``
    is the empty string and are cached per namespace/name. Assigning to a
    property rewrites the XML and stores it back into the underlying stream.

    Raises:
      PdfReadError: if XML is invalid

    """

    def __init__(self, stream: ContentStream) -> None:
        self.stream = stream
        try:
            data = self.stream.get_data()
            doc_root: Document = parseString(data)  # noqa: S318
        except (AttributeError, ExpatError) as e:
            raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
        # The first rdf:RDF element is the root of all lookups.
        self.rdf_root: XmlElement = doc_root.getElementsByTagNameNS(
            RDF_NAMESPACE, "RDF"
        )[0]
        # namespace URI -> {property name -> already converted value}
        self.cache: dict[Any, Any] = {}

    @classmethod
    def create(cls) -> "XmpInformation":
        """
        Create a new XmpInformation object with minimal structure.

        Returns:
            A new XmpInformation instance with empty metadata fields.
        """
        stream = ContentStream(None, None)
        stream.set_data(_MINIMAL_XMP.encode("utf-8"))
        return cls(stream)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the underlying stream object to ``stream`` (deprecated)."""
        deprecate_with_replacement(
            "XmpInformation.write_to_stream",
            "PdfWriter.xmp_metadata",
            "6.0.0"
        )
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        self.stream.write_to_stream(stream)

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        """
        Yield every node storing the property ``namespace``/``name``.

        For each ``rdf:Description`` whose ``rdf:about`` equals ``about_uri``,
        the matching attribute (if any) is yielded first, followed by all
        matching descendant elements.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                yield from desc.getElementsByTagNameNS(namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        """
        Yield all attributes and direct children in ``namespace``.

        Only ``rdf:Description`` elements whose ``rdf:about`` equals
        ``about_uri`` are considered.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr and attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _get_text(self, element: XmlElement) -> str:
        """Return the concatenated data of the direct text children of ``element``."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    def _get_single_value(
        self,
        namespace: str,
        name: str,
        converter: Callable[[str], Any] = _identity,
    ) -> Optional[Any]:
        """
        Return the converted value of a simple property, or None if absent.

        Only the first node yielded by :meth:`get_element` is used.
        """
        ns_cache = self.cache.setdefault(namespace, {})
        # Membership test (not truthiness) so that falsy results such as ""
        # or None are served from the cache as well.
        if name in ns_cache:
            return ns_cache[name]
        value = None
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                value = element.nodeValue
            else:
                value = self._get_text(element)
            break
        if value is not None:
            value = converter(value)
        ns_cache[name] = value
        return value

    def _getter_bag(self, namespace: str, name: str) -> Optional[list[str]]:
        """
        Return the values of an unordered array (``rdf:Bag``) property.

        Nodes without an ``rdf:Bag`` contribute their plain text instead.
        """
        ns_cache = self.cache.setdefault(namespace, {})
        if name in ns_cache:
            return ns_cache[name]
        retval: list[str] = []
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                # Attributes cannot contain an rdf:Bag; use their value directly.
                retval.append(element.nodeValue)
            elif (bags := _generic_get(element, self, list_type="Bag")) is not None:
                retval.extend(bags)
            else:
                retval.append(self._get_text(element))
        ns_cache[name] = retval
        return retval

    def _get_seq_values(
        self,
        namespace: str,
        name: str,
        converter: Callable[[Any], Any] = _identity,
    ) -> Optional[list[Any]]:
        """
        Return the converted values of an ordered array (``rdf:Seq``) property.

        Falls back to ``rdf:Bag`` containers and finally to the plain text of
        the node; ``converter`` is applied in every case.
        """
        ns_cache = self.cache.setdefault(namespace, {})
        if name in ns_cache:
            return ns_cache[name]
        retval: list[Any] = []
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                # Attributes cannot contain an rdf:Seq; use their value directly.
                retval.append(converter(element.nodeValue))
            elif (seqs := _generic_get(element, self, list_type="Seq", converter=converter)) is not None:
                retval.extend(seqs)
            elif (bags := _generic_get(element, self, list_type="Bag", converter=converter)) is not None:
                # See issue at https://github.com/py-pdf/pypdf/issues/3324
                # Some applications violate the XMP metadata standard regarding `dc:creator` which should
                # be an "ordered array" and thus a sequence, but use an unordered array (bag) instead.
                # This seems to stem from the fact that the original Dublin Core specification does indeed
                # use bags or direct values, while PDFs are expected to follow the XMP standard and ignore
                # the plain Dublin Core variant. For this reason, add a fallback here to deal with such
                # issues accordingly.
                # The converter is passed along so that the element type of the
                # result does not depend on which container the producer used.
                retval.extend(bags)
            else:
                retval.append(converter(self._get_text(element)))
        ns_cache[name] = retval
        return retval

    def _get_langalt_values(self, namespace: str, name: str) -> Optional[dict[Any, Any]]:
        """
        Return a language alternative (``rdf:Alt``) property as a dictionary.

        Keys are the ``xml:lang`` values of the items. A node without an
        ``rdf:Alt`` is stored under the key ``"x-default"``.
        """
        ns_cache = self.cache.setdefault(namespace, {})
        if name in ns_cache:
            return ns_cache[name]
        retval: dict[Any, Any] = {}
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                # Attributes cannot contain an rdf:Alt; treat as default text.
                retval["x-default"] = element.nodeValue
                continue
            alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
            if alts:
                for alt in alts:
                    for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval[item.getAttribute("xml:lang")] = self._get_text(item)
            else:
                retval["x-default"] = self._get_text(element)
        ns_cache[name] = retval
        return retval

    @property
    def dc_contributor(self) -> Optional[list[str]]:
        """Contributors to the resource (other than the authors)."""
        return self._getter_bag(DC_NAMESPACE, "contributor")

    @dc_contributor.setter
    def dc_contributor(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "contributor", values)

    @property
    def dc_coverage(self) -> Optional[str]:
        """Text describing the extent or scope of the resource."""
        return self._get_single_value(DC_NAMESPACE, "coverage")

    @dc_coverage.setter
    def dc_coverage(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "coverage", value)

    @property
    def dc_creator(self) -> Optional[list[str]]:
        """A sorted array of names of the authors of the resource, listed in order of precedence."""
        return self._get_seq_values(DC_NAMESPACE, "creator")

    @dc_creator.setter
    def dc_creator(self, values: Optional[list[str]]) -> None:
        self._set_seq_values(DC_NAMESPACE, "creator", values)

    @property
    def dc_date(self) -> Optional[list[datetime.datetime]]:
        """A sorted array of dates of significance to the resource. The dates and times are in UTC."""
        return self._get_seq_values(DC_NAMESPACE, "date", _converter_date)

    @dc_date.setter
    def dc_date(self, values: Optional[list[Union[str, datetime.datetime]]]) -> None:
        if values is None:
            self._set_seq_values(DC_NAMESPACE, "date", None)
            return
        # datetime objects are normalized to UTC; anything else is stored as text.
        date_strings = [
            _format_datetime_utc(value) if isinstance(value, datetime.datetime) else str(value)
            for value in values
        ]
        self._set_seq_values(DC_NAMESPACE, "date", date_strings)

    @property
    def dc_description(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the content of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "description")

    @dc_description.setter
    def dc_description(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "description", values)

    @property
    def dc_format(self) -> Optional[str]:
        """The mime-type of the resource."""
        return self._get_single_value(DC_NAMESPACE, "format")

    @dc_format.setter
    def dc_format(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "format", value)

    @property
    def dc_identifier(self) -> Optional[str]:
        """Unique identifier of the resource."""
        return self._get_single_value(DC_NAMESPACE, "identifier")

    @dc_identifier.setter
    def dc_identifier(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "identifier", value)

    @property
    def dc_language(self) -> Optional[list[str]]:
        """An unordered array specifying the languages used in the resource."""
        return self._getter_bag(DC_NAMESPACE, "language")

    @dc_language.setter
    def dc_language(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "language", values)

    @property
    def dc_publisher(self) -> Optional[list[str]]:
        """An unordered array of publisher names."""
        return self._getter_bag(DC_NAMESPACE, "publisher")

    @dc_publisher.setter
    def dc_publisher(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "publisher", values)

    @property
    def dc_relation(self) -> Optional[list[str]]:
        """An unordered array of text descriptions of relationships to other documents."""
        return self._getter_bag(DC_NAMESPACE, "relation")

    @dc_relation.setter
    def dc_relation(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "relation", values)

    @property
    def dc_rights(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the rights the user has to this resource."""
        return self._get_langalt_values(DC_NAMESPACE, "rights")

    @dc_rights.setter
    def dc_rights(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "rights", values)

    @property
    def dc_source(self) -> Optional[str]:
        """Unique identifier of the work from which this resource was derived."""
        return self._get_single_value(DC_NAMESPACE, "source")

    @dc_source.setter
    def dc_source(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "source", value)

    @property
    def dc_subject(self) -> Optional[list[str]]:
        """An unordered array of descriptive phrases or keywords that specify the topic of the content."""
        return self._getter_bag(DC_NAMESPACE, "subject")

    @dc_subject.setter
    def dc_subject(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "subject", values)

    @property
    def dc_title(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of the title of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "title")

    @dc_title.setter
    def dc_title(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "title", values)

    @property
    def dc_type(self) -> Optional[list[str]]:
        """An unordered array of textual descriptions of the document type."""
        return self._getter_bag(DC_NAMESPACE, "type")

    @dc_type.setter
    def dc_type(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "type", values)

    @property
    def pdf_keywords(self) -> Optional[str]:
        """An unformatted text string representing document keywords."""
        return self._get_single_value(PDF_NAMESPACE, "Keywords")

    @pdf_keywords.setter
    def pdf_keywords(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Keywords", value)

    @property
    def pdf_pdfversion(self) -> Optional[str]:
        """The PDF file version, for example 1.0 or 1.3."""
        return self._get_single_value(PDF_NAMESPACE, "PDFVersion")

    @pdf_pdfversion.setter
    def pdf_pdfversion(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "PDFVersion", value)

    @property
    def pdf_producer(self) -> Optional[str]:
        """The name of the tool that saved the document as a PDF."""
        return self._get_single_value(PDF_NAMESPACE, "Producer")

    @pdf_producer.setter
    def pdf_producer(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Producer", value)

    @property
    def xmp_create_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was originally created. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "CreateDate", _converter_date)

    @xmp_create_date.setter
    def xmp_create_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_date_value(XMP_NAMESPACE, "CreateDate", value)

    @property
    def xmp_modify_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was last modified. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "ModifyDate", _converter_date)

    @xmp_modify_date.setter
    def xmp_modify_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_date_value(XMP_NAMESPACE, "ModifyDate", value)

    @property
    def xmp_metadata_date(self) -> Optional[datetime.datetime]:
        """The date and time that any metadata for this resource was last changed. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "MetadataDate", _converter_date)

    @xmp_metadata_date.setter
    def xmp_metadata_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_date_value(XMP_NAMESPACE, "MetadataDate", value)

    @property
    def xmp_creator_tool(self) -> Optional[str]:
        """The name of the first known tool used to create the resource."""
        return self._get_single_value(XMP_NAMESPACE, "CreatorTool")

    @xmp_creator_tool.setter
    def xmp_creator_tool(self, value: Optional[str]) -> None:
        self._set_single_value(XMP_NAMESPACE, "CreatorTool", value)

    @property
    def xmpmm_document_id(self) -> Optional[str]:
        """The common identifier for all versions and renditions of this resource."""
        return self._get_single_value(XMPMM_NAMESPACE, "DocumentID")

    @xmpmm_document_id.setter
    def xmpmm_document_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "DocumentID", value)

    @property
    def xmpmm_instance_id(self) -> Optional[str]:
        """An identifier for a specific incarnation of a document, updated each time a file is saved."""
        return self._get_single_value(XMPMM_NAMESPACE, "InstanceID")

    @xmpmm_instance_id.setter
    def xmpmm_instance_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "InstanceID", value)

    @property
    def pdfaid_part(self) -> Optional[str]:
        """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3)."""
        return self._get_single_value(PDFAID_NAMESPACE, "part")

    @pdfaid_part.setter
    def pdfaid_part(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "part", value)

    @property
    def pdfaid_conformance(self) -> Optional[str]:
        """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U')."""
        return self._get_single_value(PDFAID_NAMESPACE, "conformance")

    @pdfaid_conformance.setter
    def pdfaid_conformance(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "conformance", value)

    @property
    def custom_properties(self) -> dict[Any, Any]:
        """
        Retrieve custom metadata properties defined in the undocumented pdfx
        metadata schema.

        Returns:
            A dictionary of key/value items for custom metadata properties.

        """
        if not hasattr(self, "_custom_properties"):
            self._custom_properties = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                key = node.localName
                while True:
                    # see documentation about PDFX_NAMESPACE earlier in file
                    idx = key.find("\u2182")
                    if idx == -1:
                        break
                    # Replace the marker and the four hex digits after it by
                    # the character they encode.
                    key = (
                        key[:idx]
                        + chr(int(key[idx + 1 : idx + 5], base=16))
                        + key[idx + 5 :]
                    )
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._get_text(node)
                self._custom_properties[key] = value
        return self._custom_properties

    def _require_document(self) -> Document:
        """Return the document owning ``rdf_root`` or raise XmpDocumentError."""
        doc = self.rdf_root.ownerDocument
        if doc is None:
            raise XmpDocumentError("XMP Document is None")
        return doc

    def _get_or_create_description(self, about_uri: str = "") -> XmlElement:
        """Get or create an rdf:Description element with the given about URI."""
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                return desc

        doc = self._require_document()
        desc = doc.createElementNS(RDF_NAMESPACE, "rdf:Description")
        desc.setAttributeNS(RDF_NAMESPACE, "rdf:about", about_uri)
        self.rdf_root.appendChild(desc)
        return desc

    def _clear_cache_entry(self, namespace: str, name: str) -> None:
        """Remove a cached value for a given namespace/name if present."""
        ns_cache = self.cache.get(namespace)
        if ns_cache and name in ns_cache:
            del ns_cache[name]

    def _remove_existing(self, desc: XmlElement, namespace: str, name: str) -> None:
        """Remove every element and attribute storing ``namespace``/``name`` below ``desc``."""
        # getElementsByTagNameNS searches all descendants, so detach each match
        # from its actual parent; desc.removeChild() would raise for a match
        # that is not a direct child.
        for elem in list(desc.getElementsByTagNameNS(namespace, name)):
            parent = elem.parentNode
            if parent is not None:
                parent.removeChild(elem)

        existing_attr = desc.getAttributeNodeNS(namespace, name)
        if existing_attr is not None:
            desc.removeAttributeNode(existing_attr)

    def _new_property_element(self, namespace: str, name: str) -> XmlElement:
        """Create a detached ``prefix:name`` element for the given namespace."""
        doc = self._require_document()
        prefix = self._get_namespace_prefix(namespace)
        return doc.createElementNS(namespace, f"{prefix}:{name}")

    def _set_container_values(
        self,
        namespace: str,
        name: str,
        container_tag: str,
        items: Optional[list[tuple[Optional[str], Any]]],
    ) -> None:
        """
        Replace a property by an RDF container holding ``items``.

        Args:
            namespace: Namespace URI of the property.
            name: Local name of the property.
            container_tag: Qualified name of the container, e.g. "rdf:Bag".
            items: ``(xml:lang or None, value)`` pairs, one per ``rdf:li``.
                If empty or None, the property is only removed.

        """
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()
        self._remove_existing(desc, namespace, name)

        if items:
            doc = self._require_document()
            elem = self._new_property_element(namespace, name)
            container = doc.createElementNS(RDF_NAMESPACE, container_tag)

            for lang, value in items:
                li = doc.createElementNS(RDF_NAMESPACE, "rdf:li")
                if lang is not None:
                    li.setAttribute("xml:lang", lang)
                li.appendChild(doc.createTextNode(str(value)))
                container.appendChild(li)

            elem.appendChild(container)
            desc.appendChild(elem)

        self._update_stream()

    def _set_date_value(self, namespace: str, name: str, value: Optional[datetime.datetime]) -> None:
        """Store ``value`` as a UTC timestamp string; a falsy value removes the property."""
        date_str = _format_datetime_utc(value) if value else None
        self._set_single_value(namespace, name, date_str)

    def _set_single_value(self, namespace: str, name: str, value: Optional[str]) -> None:
        """Set or remove a single metadata value."""
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()
        self._remove_existing(desc, namespace, name)

        if value is not None:
            doc = self._require_document()
            elem = self._new_property_element(namespace, name)
            elem.appendChild(doc.createTextNode(str(value)))
            desc.appendChild(elem)

        self._update_stream()

    def _set_bag_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove bag values (unordered array)."""
        items = [(None, value) for value in values] if values else None
        self._set_container_values(namespace, name, "rdf:Bag", items)

    def _set_seq_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove sequence values (ordered array)."""
        items = [(None, value) for value in values] if values else None
        self._set_container_values(namespace, name, "rdf:Seq", items)

    def _set_langalt_values(self, namespace: str, name: str, values: Optional[dict[str, str]]) -> None:
        """Set or remove language alternative values."""
        items = list(values.items()) if values else None
        self._set_container_values(namespace, name, "rdf:Alt", items)

    def _get_namespace_prefix(self, namespace: str) -> str:
        """Get the appropriate namespace prefix for a given namespace URI."""
        return _NAMESPACE_PREFIX_MAP.get(namespace, "unknown")

    def _update_stream(self) -> None:
        """Update the stream with the current XML content."""
        doc = self._require_document()
        xml_data = doc.toxml(encoding="utf-8")
        self.stream.set_data(xml_data)