Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/xmp.py: 42%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Anything related to Extensible Metadata Platform (XMP) metadata.
4https://en.wikipedia.org/wiki/Extensible_Metadata_Platform
5"""
7import datetime
8import decimal
9import re
10from typing import (
11 Any,
12 Callable,
13 Dict,
14 Iterator,
15 List,
16 Optional,
17 TypeVar,
18 Union,
19)
20from xml.dom.minidom import Document, parseString
21from xml.dom.minidom import Element as XmlElement
22from xml.parsers.expat import ExpatError
24from ._protocols import XmpInformationProtocol
25from ._utils import StreamType, deprecate_no_replacement
26from .errors import PdfReadError
27from .generic import ContentStream, PdfObject
# Namespace URIs used to look up XMP elements and attributes.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# About the PDFX namespace:
# It is documented in
# https://github.com/adobe/xmp-docs/raw/master/XMPSpecifications/XMPSpecificationPart3.pdf
# and holds "custom metadata" properties: arbitrary metadata properties that
# have no semantic or documented meaning.
#
# Elements in this namespace act as key/value storage. The element name is the
# key and the element content is the value. To make a key a valid XML
# identifier, every character that is not allowed in an identifier is replaced
# by \u2182 followed by the unicode hex ID of the original character.
# The key "my car" is therefore stored as "my\u21820020car".
#
# \u2182 is the unicode character \u{ROMAN NUMERAL TEN THOUSAND}.
#
# Avoid the pdfx namespace where possible. Adobe's own XMP documentation
# ("Extensibility of Schemas") suggests using a custom data schema with
# sensible XML elements instead.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# PDF/A identification schema.
PDFAID_NAMESPACE = "http://www.aiim.org/pdfa/ns/id/"
# Pattern for the date formats accepted by ``_converter_date``: a year,
# optionally followed by month, day, and a time with a mandatory timezone
# designator. The decimal point of the fractional seconds is escaped so that
# only a literal "." is accepted there.
iso8601 = re.compile(
    r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """,
    re.VERBOSE,
)


K = TypeVar("K")


def _identity(value: K) -> K:
    """Return ``value`` unchanged (no-op converter)."""
    return value


def _converter_date(value: str) -> datetime.datetime:
    """
    Convert a date string matched by ``iso8601`` into a naive UTC datetime.

    Components that are absent from the string default to month 1, day 1 and
    time 00:00:00. A timezone designator other than ``Z`` is applied so that
    the returned value is expressed in UTC; the result carries no ``tzinfo``.

    Args:
        value: The date string, e.g. ``2020-01-31T12:30:45.5+01:00``.

    Returns:
        The parsed point in time as a naive ``datetime.datetime`` in UTC.

    Raises:
        ValueError: If ``value`` does not start with a recognizable date.

    """
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")
    year = int(matches.group("year"))
    month = int(matches.group("month") or "1")
    day = int(matches.group("day") or "1")
    hour = int(matches.group("hour") or "0")
    minute = int(matches.group("minute") or "0")

    # Split the (possibly fractional) seconds into whole seconds and
    # microseconds without going through binary floating point.
    second = decimal.Decimal(matches.group("second") or "0")
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    microseconds = int((second - whole_seconds) * 1_000_000)

    dt = datetime.datetime(
        year, month, day, hour, minute, int(whole_seconds), microseconds
    )

    tzd = matches.group("tzd") or "Z"
    if tzd != "Z":
        # Read the sign from the designator itself: int("+00") and int("-00")
        # are both 0, so the sign cannot be recovered from the parsed hours.
        sign = -1 if tzd[0] == "-" else 1
        tzd_hours, tzd_minutes = (int(x) for x in tzd[1:].split(":"))
        offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
        # local time = UTC + offset, hence UTC = local time - offset.
        dt = dt - sign * offset
    return dt
def _generic_get(
    element: XmlElement, self: "XmpInformation", list_type: str, converter: Callable[[Any], Any] = _identity
) -> Optional[List[str]]:
    """
    Collect the converted ``rdf:li`` values of all RDF containers in ``element``.

    Args:
        element: The XML element to search for containers.
        self: The metadata object whose ``_get_text`` extracts the item text.
        list_type: Local name of the RDF container element, e.g. ``Bag`` or ``Seq``.
        converter: Callable applied to the text of every item.

    Returns:
        The converted item values, or ``None`` if ``element`` contains no
        container of the requested type.

    """
    containers = element.getElementsByTagNameNS(RDF_NAMESPACE, list_type)
    if len(containers) == 0:
        return None
    return [
        converter(self._get_text(entry))
        for container in containers
        for entry in container.getElementsByTagNameNS(RDF_NAMESPACE, "li")
    ]
def _getter_bag(
    namespace: str, name: str
) -> Callable[["XmpInformation"], Optional[List[str]]]:
    """
    Create a getter for a property stored as an ``rdf:Bag`` or as plain text.

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.

    Returns:
        A function taking the metadata object and returning the list of
        values; the result is stored in the object's ``cache``.

    """

    def get(self: "XmpInformation") -> Optional[List[str]]:
        hit = self.cache.get(namespace, {}).get(name)
        if hit:
            return hit
        values: List[str] = []
        for node in self.get_element("", namespace, name):
            items = _generic_get(node, self, list_type="Bag")
            if items is None:
                # No bag inside: use the node's own text as a single value.
                values.append(self._get_text(node))
            else:
                values.extend(items)
        self.cache.setdefault(namespace, {})[name] = values
        return values

    return get
def _getter_seq(
    namespace: str, name: str, converter: Callable[[Any], Any] = _identity
) -> Callable[["XmpInformation"], Optional[List[Any]]]:
    """
    Create a getter for a property stored as an ``rdf:Seq``.

    Values found in an ``rdf:Bag`` or as plain element text are accepted as
    fallbacks. ``converter`` is applied to every value regardless of which of
    the three forms it was read from.

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.
        converter: Callable applied to the text of every value.

    Returns:
        A function taking the metadata object and returning the list of
        converted values; the result is stored in the object's ``cache``.

    """

    def get(self: "XmpInformation") -> Optional[List[Any]]:
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cached
        retval = []
        for element in self.get_element("", namespace, name):
            if (seqs := _generic_get(element, self, list_type="Seq", converter=converter)) is not None:
                retval.extend(seqs)
            elif (bags := _generic_get(element, self, list_type="Bag", converter=converter)) is not None:
                # See issue at https://github.com/py-pdf/pypdf/issues/3324
                # Some applications violate the XMP metadata standard regarding `dc:creator` which should
                # be an "ordered array" and thus a sequence, but use an unordered array (bag) instead.
                # This seems to stem from the fact that the original Dublin Core specification does indeed
                # use bags or direct values, while PDFs are expected to follow the XMP standard and ignore
                # the plain Dublin Core variant. For this reason, add a fallback here to deal with such
                # issues accordingly.
                # The converter is passed along so that the fallback yields the same value
                # types as the sequence and plain-text branches.
                retval.extend(bags)
            else:
                value = converter(self._get_text(element))
                retval.append(value)
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = retval
        return retval

    return get
def _getter_langalt(
    namespace: str, name: str
) -> Callable[["XmpInformation"], Optional[Dict[Any, Any]]]:
    """
    Create a getter for a language-alternative (``rdf:Alt``) property.

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.

    Returns:
        A function taking the metadata object and returning a dictionary that
        maps each item's ``xml:lang`` attribute to its text. Text found
        without an ``rdf:Alt`` container is stored under ``x-default``. The
        result is stored in the object's ``cache``.

    """

    def get(self: "XmpInformation") -> Optional[Dict[Any, Any]]:
        hit = self.cache.get(namespace, {}).get(name)
        if hit:
            return hit
        by_language = {}
        for node in self.get_element("", namespace, name):
            alternatives = node.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
            if len(alternatives) == 0:
                by_language["x-default"] = self._get_text(node)
                continue
            for alternative in alternatives:
                for entry in alternative.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                    by_language[entry.getAttribute("xml:lang")] = self._get_text(entry)
        self.cache.setdefault(namespace, {})[name] = by_language
        return by_language

    return get
def _getter_single(
    namespace: str, name: str, converter: Callable[[str], Any] = _identity
) -> Callable[["XmpInformation"], Optional[Any]]:
    """
    Create a getter for a property that holds a single value.

    Only the first node yielded for the property is considered; it may be an
    attribute node or an element.

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.
        converter: Callable applied to the raw string value.

    Returns:
        A function taking the metadata object and returning the converted
        value, or ``None`` if the property is absent. The result is stored in
        the object's ``cache``.

    """

    def get(self: "XmpInformation") -> Optional[Any]:
        hit = self.cache.get(namespace, {}).get(name)
        if hit:
            return hit
        result = None
        first = next(iter(self.get_element("", namespace, name)), None)
        if first is not None:
            if first.nodeType == first.ATTRIBUTE_NODE:
                result = first.nodeValue
            else:
                result = self._get_text(first)
        if result is not None:
            result = converter(result)
        self.cache.setdefault(namespace, {})[name] = result
        return result

    return get
class XmpInformation(XmpInformationProtocol, PdfObject):
    """
    An object that represents Extensible Metadata Platform (XMP) metadata.
    Usually accessed by :py:attr:`xmp_metadata()<pypdf.PdfReader.xmp_metadata>`.

    Raises:
      PdfReadError: if XML is invalid

    """

    def __init__(self, stream: ContentStream) -> None:
        """
        Parse the XMP packet held by ``stream``.

        Args:
            stream: The stream whose data contains the XMP XML.

        Raises:
            PdfReadError: If the data cannot be parsed as XML or contains no
                ``rdf:RDF`` element.

        """
        self.stream = stream
        try:
            data = self.stream.get_data()
            doc_root: Document = parseString(data)  # noqa: S318
        except (AttributeError, ExpatError) as e:
            raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
        rdf_roots = doc_root.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")
        if not rdf_roots:
            # Well-formed XML without an rdf:RDF element is still unusable;
            # report it through the documented error type instead of an
            # IndexError.
            raise PdfReadError(
                "XML in XmpInformation was invalid: no rdf:RDF element found"
            )
        self.rdf_root: XmlElement = rdf_roots[0]
        # Parsed property values, keyed by namespace and then property name.
        self.cache: Dict[Any, Any] = {}

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the underlying stream object to ``stream``.

        Args:
            stream: The output stream to write to.
            encryption_key: Deprecated; passing a value triggers the
                deprecation handling and is otherwise ignored here.

        """
        if encryption_key is not None:  # deprecated
            deprecate_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        self.stream.write_to_stream(stream)

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        """
        Yield the nodes that store the property ``name`` of ``namespace``.

        For every ``rdf:Description`` whose ``rdf:about`` attribute equals
        ``about_uri``, the matching attribute node (if present) is yielded
        first, followed by the matching elements below that description.

        Args:
            about_uri: Value the ``rdf:about`` attribute has to match.
            namespace: Namespace URI of the property.
            name: Local name of the property.

        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                yield from desc.getElementsByTagNameNS(namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        """
        Yield all attributes and direct children in ``namespace``.

        Only ``rdf:Description`` elements whose ``rdf:about`` attribute equals
        ``about_uri`` are inspected.

        Args:
            about_uri: Value the ``rdf:about`` attribute has to match.
            namespace: Namespace URI the yielded nodes belong to.

        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr and attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _get_text(self, element: XmlElement) -> str:
        """Return the concatenated data of the direct text-node children of ``element``."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor"))
    """
    Contributors to the resource (other than the authors).

    An unsorted array of names.
    """

    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage"))
    """Text describing the extent or scope of the resource."""

    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator"))
    """A sorted array of names of the authors of the resource, listed in order
    of precedence."""

    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
    """
    A sorted array of dates (datetime.datetime instances) of significance to
    the resource.

    The dates and times are in UTC.
    """

    dc_description = property(_getter_langalt(DC_NAMESPACE, "description"))
    """A language-keyed dictionary of textual descriptions of the content of the
    resource."""

    dc_format = property(_getter_single(DC_NAMESPACE, "format"))
    """The mime-type of the resource."""

    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier"))
    """Unique identifier of the resource."""

    dc_language = property(_getter_bag(DC_NAMESPACE, "language"))
    """An unordered array specifying the languages used in the resource."""

    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher"))
    """An unordered array of publisher names."""

    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation"))
    """An unordered array of text descriptions of relationships to other
    documents."""

    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights"))
    """A language-keyed dictionary of textual descriptions of the rights the
    user has to this resource."""

    dc_source = property(_getter_single(DC_NAMESPACE, "source"))
    """Unique identifier of the work from which this resource was derived."""

    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject"))
    """An unordered array of descriptive phrases or keywords that specify the
    topic of the content of the resource."""

    dc_title = property(_getter_langalt(DC_NAMESPACE, "title"))
    """A language-keyed dictionary of the title of the resource."""

    dc_type = property(_getter_bag(DC_NAMESPACE, "type"))
    """An unordered array of textual descriptions of the document type."""

    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords"))
    """An unformatted text string representing document keywords."""

    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion"))
    """The PDF file version, for example 1.0 or 1.3."""

    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer"))
    """The name of the tool that saved the document as a PDF."""

    xmp_create_date = property(
        _getter_single(XMP_NAMESPACE, "CreateDate", _converter_date)
    )
    """
    The date and time the resource was originally created.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_modify_date = property(
        _getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date)
    )
    """
    The date and time the resource was last modified.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_metadata_date = property(
        _getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date)
    )
    """
    The date and time that any metadata for this resource was last changed.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_creator_tool = property(_getter_single(XMP_NAMESPACE, "CreatorTool"))
    """The name of the first known tool used to create the resource."""

    xmpmm_document_id = property(_getter_single(XMPMM_NAMESPACE, "DocumentID"))
    """The common identifier for all versions and renditions of this resource."""

    xmpmm_instance_id = property(_getter_single(XMPMM_NAMESPACE, "InstanceID"))
    """An identifier for a specific incarnation of a document, updated each
    time a file is saved."""

    pdfaid_part = property(_getter_single(PDFAID_NAMESPACE, "part"))
    """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3)."""

    pdfaid_conformance = property(_getter_single(PDFAID_NAMESPACE, "conformance"))
    """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U')."""

    @property
    def custom_properties(self) -> Dict[Any, Any]:
        """
        Retrieve custom metadata properties defined in the undocumented pdfx
        metadata schema.

        Returns:
            A dictionary of key/value items for custom metadata properties.

        """
        if not hasattr(self, "_custom_properties"):
            # Collect into a local dict first so that a failure while decoding
            # does not leave a partially filled result cached on the instance.
            properties: Dict[Any, Any] = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                key = node.localName
                # see documentation about PDFX_NAMESPACE earlier in file
                idx = key.find("\u2182")
                while idx != -1:
                    key = (
                        key[:idx]
                        + chr(int(key[idx + 1 : idx + 5], base=16))
                        + key[idx + 5 :]
                    )
                    # Continue behind the decoded character: if it is itself
                    # \u2182 it must not be treated as another escape marker.
                    idx = key.find("\u2182", idx + 1)
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._get_text(node)
                properties[key] = value
            self._custom_properties = properties
        return self._custom_properties