# Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/xmp.py: 42%
#
# Shortcuts on this page (coverage.py HTML report):
#   r m x   toggle line displays
#   j k     next/prev highlighted chunk
#   0 (zero) top of page
#   1 (one)  first highlighted chunk
"""
Anything related to Extensible Metadata Platform (XMP) metadata.

https://en.wikipedia.org/wiki/Extensible_Metadata_Platform
"""
import datetime
import decimal
import re
from collections.abc import Iterator
from typing import (
    Any,
    Callable,
    Optional,
    TypeVar,
    Union,
)
from xml.dom.minidom import Document, parseString
from xml.dom.minidom import Element as XmlElement
from xml.parsers.expat import ExpatError

from ._protocols import XmpInformationProtocol
from ._utils import StreamType, deprecate_with_replacement, deprecation_no_replacement
from .errors import PdfReadError
from .generic import ContentStream, PdfObject
# XML namespace URIs of the metadata schemas read by this module.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# What is the PDFX namespace, you might ask?
# It's documented here: https://github.com/adobe/xmp-docs/raw/master/XMPSpecifications/XMPSpecificationPart3.pdf
# This namespace is used to place "custom metadata"
# properties, which are arbitrary metadata properties with no semantic or
# documented meaning.
#
# Elements in the namespace are key/value-style storage,
# where the element name is the key and the content is the value. The keys
# are transformed into valid XML identifiers by substituting each invalid
# identifier character with U+2182 followed by the four-digit hexadecimal
# code point of the original character. A key like "my car" is therefore
# "my\u21820020car".
#
# U+2182 is the Unicode character ROMAN NUMERAL TEN THOUSAND.
#
# The pdfx namespace should be avoided.
# A custom data schema and meaningful XML elements could be used instead, as
# is suggested by Adobe's own documentation on XMP under "Extensibility of
# Schemas".
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# PDF/A
PDFAID_NAMESPACE = "http://www.aiim.org/pdfa/ns/id/"
# Pattern for the date forms accepted by ``_converter_date``: a four-digit
# year, optionally followed by "-MM", "-DD" and a "Thh:mm[:ss[.fraction]]"
# time that must end in a time zone designator ("Z" or "+hh:mm"/"-hh:mm").
# The pattern is applied with ``match`` and is not anchored at the end, so
# trailing text that does not fit is ignored.
iso8601 = re.compile(
    r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """,
    re.VERBOSE,
)


K = TypeVar("K")


def _identity(value: K) -> K:
    """Return ``value`` unchanged; used as the default (no-op) converter."""
    return value


def _converter_date(value: str) -> datetime.datetime:
    """
    Convert an XMP date string into a naive ``datetime`` expressed in UTC.

    Missing month/day default to 1, missing time fields default to 0 and a
    missing time zone designator is treated as "Z".

    Args:
        value: The date string, e.g. ``"2021-04-28T12:23:34.5+02:00"``.

    Returns:
        The parsed date and time with the UTC offset already applied.

    Raises:
        ValueError: If ``value`` does not start with a four-digit year, or
            if the parsed fields do not form a valid date/time.

    """
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")
    # NOTE(review): the pattern only accepts a time part that carries a time
    # zone designator; for "YYYY-MM-DDThh:mm:ss" without one, only the date
    # portion matches and the time falls back to midnight.
    year = int(matches.group("year"))
    month = int(matches.group("month") or "1")
    day = int(matches.group("day") or "1")
    hour = int(matches.group("hour") or "0")
    minute = int(matches.group("minute") or "0")

    # Split the (possibly fractional) seconds into whole seconds and
    # microseconds; Decimal avoids binary floating point rounding.
    second = decimal.Decimal(matches.group("second") or "0")
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    microseconds = int((second - whole_seconds) * 1_000_000)

    dt = datetime.datetime(
        year, month, day, hour, minute, int(whole_seconds), microseconds
    )

    tzd = matches.group("tzd") or "Z"
    if tzd != "Z":
        # Take the sign from the designator text itself: int("+00") and
        # int("-00") are both 0, so the sign cannot be recovered from the
        # parsed hour value for offsets smaller than one hour.
        sign = -1 if tzd[0] == "-" else 1
        tzd_hours, tzd_minutes = (int(part) for part in tzd[1:].split(":"))
        offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
        # Local time = UTC + offset, hence UTC = local time - offset.
        dt -= sign * offset
    return dt
def _generic_get(
    element: XmlElement, self: "XmpInformation", list_type: str, converter: Callable[[Any], Any] = _identity
) -> Optional[list[Any]]:
    """
    Collect the ``rdf:li`` values of every RDF container found below ``element``.

    Args:
        element: The XML element to search in.
        self: The ``XmpInformation`` instance whose ``_get_text`` is used to
            read the text of each item.
        list_type: Local name of the RDF container to look for, for example
            ``"Bag"`` or ``"Seq"``.
        converter: Callable applied to the text of each item.

    Returns:
        The converted item values in document order, or ``None`` if
        ``element`` contains no container of the requested type. A container
        without items yields an empty list.

    """
    containers = element.getElementsByTagNameNS(RDF_NAMESPACE, list_type)
    if not containers:
        return None
    return [
        converter(self._get_text(item))
        for container in containers
        for item in container.getElementsByTagNameNS(RDF_NAMESPACE, "li")
    ]
def _getter_bag(
    namespace: str, name: str
) -> Callable[["XmpInformation"], Optional[list[str]]]:
    """
    Build a getter for a property stored as an unordered array (``rdf:Bag``).

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.

    Returns:
        A function suitable for ``property()`` which returns the list of
        values and memoizes it in ``self.cache[namespace][name]``.

    """

    def get(self: "XmpInformation") -> Optional[list[str]]:
        ns_cache = self.cache.setdefault(namespace, {})
        # Test for the key rather than for truthiness: an empty result is a
        # valid cached value and must not trigger another scan of the DOM.
        if name in ns_cache:
            return ns_cache[name]
        retval: list[str] = []
        for element in self.get_element("", namespace, name):
            bags = _generic_get(element, self, list_type="Bag")
            if bags is not None:
                retval.extend(bags)
            else:
                # No rdf:Bag container: use the element's own text.
                retval.append(self._get_text(element))
        ns_cache[name] = retval
        return retval

    return get
def _getter_seq(
    namespace: str, name: str, converter: Callable[[Any], Any] = _identity
) -> Callable[["XmpInformation"], Optional[list[Any]]]:
    """
    Build a getter for a property stored as an ordered array (``rdf:Seq``).

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.
        converter: Callable applied to the text of every value.

    Returns:
        A function suitable for ``property()`` which returns the list of
        converted values and memoizes it in ``self.cache[namespace][name]``.

    """

    def get(self: "XmpInformation") -> Optional[list[Any]]:
        ns_cache = self.cache.setdefault(namespace, {})
        # Test for the key rather than for truthiness: an empty result is a
        # valid cached value and must not trigger another scan of the DOM.
        if name in ns_cache:
            return ns_cache[name]
        retval: list[Any] = []
        for element in self.get_element("", namespace, name):
            if (seqs := _generic_get(element, self, list_type="Seq", converter=converter)) is not None:
                retval.extend(seqs)
            elif (bags := _generic_get(element, self, list_type="Bag", converter=converter)) is not None:
                # See issue at https://github.com/py-pdf/pypdf/issues/3324
                # Some applications violate the XMP metadata standard regarding `dc:creator` which should
                # be an "ordered array" and thus a sequence, but use an unordered array (bag) instead.
                # This seems to stem from the fact that the original Dublin Core specification does indeed
                # use bags or direct values, while PDFs are expected to follow the XMP standard and ignore
                # the plain Dublin Core variant. For this reason, add a fallback here to deal with such
                # issues accordingly. The converter is applied here as well so that all three
                # branches return values of the same type.
                retval.extend(bags)
            else:
                # Neither container is present: use the element's own text.
                retval.append(converter(self._get_text(element)))
        ns_cache[name] = retval
        return retval

    return get
def _getter_langalt(
    namespace: str, name: str
) -> Callable[["XmpInformation"], Optional[dict[Any, Any]]]:
    """
    Build a getter for a language-alternative property (``rdf:Alt``).

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.

    Returns:
        A function suitable for ``property()`` which returns a dictionary
        mapping the ``xml:lang`` attribute of each ``rdf:li`` item to its
        text, memoized in ``self.cache[namespace][name]``. The text of an
        element without an ``rdf:Alt`` container is stored under the key
        ``"x-default"``.

    """

    def get(self: "XmpInformation") -> Optional[dict[Any, Any]]:
        ns_cache = self.cache.setdefault(namespace, {})
        # Test for the key rather than for truthiness: an empty result is a
        # valid cached value and must not trigger another scan of the DOM.
        if name in ns_cache:
            return ns_cache[name]
        retval: dict[Any, Any] = {}
        for element in self.get_element("", namespace, name):
            alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
            if alts:
                for alt in alts:
                    for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval[item.getAttribute("xml:lang")] = self._get_text(item)
            else:
                retval["x-default"] = self._get_text(element)
        ns_cache[name] = retval
        return retval

    return get
def _getter_single(
    namespace: str, name: str, converter: Callable[[str], Any] = _identity
) -> Callable[["XmpInformation"], Optional[Any]]:
    """
    Build a getter for a property that holds a single value.

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.
        converter: Callable applied to the raw string value.

    Returns:
        A function suitable for ``property()`` which returns the converted
        value of the first matching attribute or element, or ``None`` if
        there is none. The result is memoized in
        ``self.cache[namespace][name]``.

    """

    def get(self: "XmpInformation") -> Optional[Any]:
        ns_cache = self.cache.setdefault(namespace, {})
        # Test for the key rather than for truthiness so that ``None`` and
        # other falsy results (e.g. an empty string) are served from the
        # cache instead of being looked up and converted again.
        if name in ns_cache:
            return ns_cache[name]
        value = None
        # Only the first match is considered.
        element = next(iter(self.get_element("", namespace, name)), None)
        if element is not None:
            if element.nodeType == element.ATTRIBUTE_NODE:
                value = element.nodeValue
            else:
                value = self._get_text(element)
        if value is not None:
            value = converter(value)
        ns_cache[name] = value
        return value

    return get
class XmpInformation(XmpInformationProtocol, PdfObject):
    """
    An object that represents Extensible Metadata Platform (XMP) metadata.
    Usually accessed by :py:attr:`xmp_metadata()<pypdf.PdfReader.xmp_metadata>`.

    Raises:
      PdfReadError: if XML is invalid

    """

    def __init__(self, stream: ContentStream) -> None:
        self.stream = stream
        try:
            data = self.stream.get_data()
            doc_root: Document = parseString(data)  # noqa: S318
        except (AttributeError, ExpatError) as e:
            # Chain the original exception so the parser details stay
            # available in the traceback.
            raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
        # NOTE(review): a document without any rdf:RDF element makes the
        # index below raise IndexError rather than PdfReadError.
        self.rdf_root: XmlElement = doc_root.getElementsByTagNameNS(
            RDF_NAMESPACE, "RDF"
        )[0]
        # Memoized property values, keyed by namespace URI and then by name.
        self.cache: dict[Any, Any] = {}

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the underlying metadata stream to ``stream`` (deprecated).

        Args:
            stream: The output stream to write to.
            encryption_key: Deprecated; only triggers a deprecation notice.

        """
        deprecate_with_replacement(
            "XmpInformation.write_to_stream",
            "PdfWriter.xmp_metadata",
            "6.0.0"
        )
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        self.stream.write_to_stream(stream)

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        """
        Yield the nodes that hold the property ``namespace``/``name``.

        Only ``rdf:Description`` elements whose ``rdf:about`` attribute
        equals ``about_uri`` are considered. For each of them, the matching
        attribute node (if present) is yielded first, followed by all
        matching descendant elements.

        Args:
            about_uri: Required value of the ``rdf:about`` attribute.
            namespace: Namespace URI of the property.
            name: Local name of the property.

        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") != about_uri:
                continue
            attr = desc.getAttributeNodeNS(namespace, name)
            if attr is not None:
                yield attr
            yield from desc.getElementsByTagNameNS(namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        """
        Yield all attributes and direct children in ``namespace``.

        Only ``rdf:Description`` elements whose ``rdf:about`` attribute
        equals ``about_uri`` are considered. For each of them, the matching
        attribute nodes are yielded first, followed by the matching direct
        child nodes.

        Args:
            about_uri: Required value of the ``rdf:about`` attribute.
            namespace: Namespace URI the yielded nodes must belong to.

        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") != about_uri:
                continue
            for attr in desc.attributes.values():
                if attr and attr.namespaceURI == namespace:
                    yield attr
            for child in desc.childNodes:
                if child.namespaceURI == namespace:
                    yield child

    def _get_text(self, element: XmlElement) -> str:
        """Return the concatenated data of the direct text children of ``element``."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor"))
    """
    Contributors to the resource (other than the authors).

    An unsorted array of names.
    """

    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage"))
    """Text describing the extent or scope of the resource."""

    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator"))
    """A sorted array of names of the authors of the resource, listed in order
    of precedence."""

    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
    """
    A sorted array of dates (datetime.datetime instances) of significance to
    the resource.

    The dates and times are in UTC.
    """

    dc_description = property(_getter_langalt(DC_NAMESPACE, "description"))
    """A language-keyed dictionary of textual descriptions of the content of the
    resource."""

    dc_format = property(_getter_single(DC_NAMESPACE, "format"))
    """The mime-type of the resource."""

    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier"))
    """Unique identifier of the resource."""

    dc_language = property(_getter_bag(DC_NAMESPACE, "language"))
    """An unordered array specifying the languages used in the resource."""

    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher"))
    """An unordered array of publisher names."""

    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation"))
    """An unordered array of text descriptions of relationships to other
    documents."""

    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights"))
    """A language-keyed dictionary of textual descriptions of the rights the
    user has to this resource."""

    dc_source = property(_getter_single(DC_NAMESPACE, "source"))
    """Unique identifier of the work from which this resource was derived."""

    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject"))
    """An unordered array of descriptive phrases or keywords that specify the
    topic of the content of the resource."""

    dc_title = property(_getter_langalt(DC_NAMESPACE, "title"))
    """A language-keyed dictionary of the title of the resource."""

    dc_type = property(_getter_bag(DC_NAMESPACE, "type"))
    """An unordered array of textual descriptions of the document type."""

    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords"))
    """An unformatted text string representing document keywords."""

    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion"))
    """The PDF file version, for example 1.0 or 1.3."""

    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer"))
    """The name of the tool that saved the document as a PDF."""

    xmp_create_date = property(
        _getter_single(XMP_NAMESPACE, "CreateDate", _converter_date)
    )
    """
    The date and time the resource was originally created.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_modify_date = property(
        _getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date)
    )
    """
    The date and time the resource was last modified.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_metadata_date = property(
        _getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date)
    )
    """
    The date and time that any metadata for this resource was last changed.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_creator_tool = property(_getter_single(XMP_NAMESPACE, "CreatorTool"))
    """The name of the first known tool used to create the resource."""

    xmpmm_document_id = property(_getter_single(XMPMM_NAMESPACE, "DocumentID"))
    """The common identifier for all versions and renditions of this resource."""

    xmpmm_instance_id = property(_getter_single(XMPMM_NAMESPACE, "InstanceID"))
    """An identifier for a specific incarnation of a document, updated each
    time a file is saved."""

    pdfaid_part = property(_getter_single(PDFAID_NAMESPACE, "part"))
    """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3)."""

    pdfaid_conformance = property(_getter_single(PDFAID_NAMESPACE, "conformance"))
    """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U')."""

    @property
    def custom_properties(self) -> dict[Any, Any]:
        """
        Retrieve custom metadata properties defined in the undocumented pdfx
        metadata schema.

        Returns:
            A dictionary of key/value items for custom metadata properties.

        Raises:
            ValueError: If a U+2182 escape in a key is not followed by four
                hexadecimal digits.

        """
        if not hasattr(self, "_custom_properties"):
            # Build into a local dict and publish it only once complete, so
            # that a failure while decoding does not leave a partial result
            # behind to be returned by later accesses.
            properties: dict[Any, Any] = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                key = node.localName
                # Keys escape characters as U+2182 followed by four
                # hexadecimal digits (see the documentation of
                # PDFX_NAMESPACE). Continue searching *after* each decoded
                # character so that a decoded U+2182 is not mistaken for
                # the start of another escape sequence.
                search_from = 0
                while (idx := key.find("\u2182", search_from)) != -1:
                    key = (
                        key[:idx]
                        + chr(int(key[idx + 1 : idx + 5], base=16))
                        + key[idx + 5 :]
                    )
                    search_from = idx + 1
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._get_text(node)
                properties[key] = value
            self._custom_properties = properties
        return self._custom_properties