Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/xmp.py: 42%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

231 statements  

1""" 

2Anything related to Extensible Metadata Platform (XMP) metadata. 

3 

4https://en.wikipedia.org/wiki/Extensible_Metadata_Platform 

5""" 

6 

7import datetime 

8import decimal 

9import re 

10from collections.abc import Iterator 

11from typing import ( 

12 Any, 

13 Callable, 

14 Optional, 

15 TypeVar, 

16 Union, 

17) 

18from xml.dom.minidom import Document, parseString 

19from xml.dom.minidom import Element as XmlElement 

20from xml.parsers.expat import ExpatError 

21 

22from ._protocols import XmpInformationProtocol 

23from ._utils import StreamType, deprecate_with_replacement, deprecation_no_replacement 

24from .errors import PdfReadError 

25from .generic import ContentStream, PdfObject 

26 

# Namespace URIs used to look up RDF containers and metadata properties.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# About the PDFX namespace
# ------------------------
# Documented in
# https://github.com/adobe/xmp-docs/raw/master/XMPSpecifications/XMPSpecificationPart3.pdf
#
# It holds "custom metadata" properties: arbitrary properties that carry no
# semantic or documented meaning. Its elements form a key/value store in which
# the element name is the key and the element content is the value.
#
# To make a key a valid XML identifier, every invalid identifier character is
# replaced by \u2182 (ROMAN NUMERAL TEN THOUSAND) followed by the unicode hex
# ID of the original character. The key "my car" is therefore stored as
# "my\u21820020car".
#
# The pdfx namespace should be avoided. Adobe's own XMP documentation
# ("Extensibility of Schemas") suggests using a custom data schema with
# sensible XML elements instead.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# PDF/A identification schema
PDFAID_NAMESPACE = "http://www.aiim.org/pdfa/ns/id/"

55 

# Pattern for the date strings found in XMP metadata:
# YYYY[-MM[-DD[Thh:mm[:ss[.fraction]]TZD]]]
#
# Once a time part ("T...") is present, the time zone designator (``Z`` or
# ``+hh:mm`` / ``-hh:mm``) is mandatory. The pattern is meant to be used with
# ``match()``, so trailing text that does not fit is simply left unmatched.
#
# The fractional-second separator is an escaped literal dot; an unescaped
# ``.`` would accept any character there (e.g. "12x5") and hand a
# non-numeric "seconds" value to the caller.
iso8601 = re.compile(
    r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """,
    re.VERBOSE,
)

74 

75 

76K = TypeVar("K") 

77 

78 

79def _identity(value: K) -> K: 

80 return value 

81 

82 

def _converter_date(value: str) -> datetime.datetime:
    """
    Convert a date string matched by ``iso8601`` into a naive UTC datetime.

    Components that are missing from the string fall back to defaults:
    month and day to 1, hour/minute/second to 0 and the time zone to UTC.
    A ``+hh:mm`` / ``-hh:mm`` offset is removed from the parsed value so that
    the result is expressed in UTC; the returned object carries no tzinfo.

    Args:
        value: The date string to convert.

    Returns:
        The corresponding ``datetime.datetime`` in UTC.

    Raises:
        ValueError: If the string does not match the expected format or
            describes an impossible date/time.

    """
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")

    year = int(matches.group("year"))
    month = int(matches.group("month") or "1")
    day = int(matches.group("day") or "1")
    hour = int(matches.group("hour") or "0")
    minute = int(matches.group("minute") or "0")
    try:
        second = decimal.Decimal(matches.group("second") or "0")
    except decimal.InvalidOperation as exc:
        # Report an unparsable seconds field the same way as any other
        # malformed date instead of leaking a decimal-specific error.
        raise ValueError(f"Invalid date format: {value}") from exc

    # Split the (possibly fractional) seconds into whole seconds and the
    # remaining fraction expressed in microseconds.
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    microseconds = int((second - whole_seconds) * 1_000_000)

    dt = datetime.datetime(
        year, month, day, hour, minute, int(whole_seconds), microseconds
    )

    tzd = matches.group("tzd") or "Z"
    if tzd != "Z":
        # Take the sign from the designator itself. Deriving it from the
        # parsed hour value breaks for offsets such as "+00:30", where the
        # hour is zero and the sign would be lost.
        sign = -1 if tzd[0] == "-" else 1
        tzd_hours, tzd_minutes = (int(part) for part in tzd[1:].split(":"))
        offset = sign * datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
        # local time = UTC + offset  =>  UTC = local time - offset
        dt = dt - offset
    return dt

108 

109 

def _generic_get(
    element: XmlElement, self: "XmpInformation", list_type: str, converter: Callable[[Any], Any] = _identity
) -> Optional[list[str]]:
    """
    Collect the items of all RDF containers of one kind below ``element``.

    Args:
        element: The XML element to search in.
        self: The XMP object whose ``_get_text`` extracts the item text.
        list_type: Local name of the RDF container element, e.g. ``"Bag"``
            or ``"Seq"``.
        converter: Callable applied to the text of every ``rdf:li`` item.

    Returns:
        ``None`` if ``element`` holds no container of the requested type,
        otherwise the converted items of all matching containers in
        document order (possibly an empty list).

    """
    containers = element.getElementsByTagNameNS(RDF_NAMESPACE, list_type)
    if not containers:
        return None
    return [
        converter(self._get_text(item))
        for container in containers
        for item in container.getElementsByTagNameNS(RDF_NAMESPACE, "li")
    ]

123 

124 

125def _getter_bag( 

126 namespace: str, name: str 

127) -> Callable[["XmpInformation"], Optional[list[str]]]: 

128 def get(self: "XmpInformation") -> Optional[list[str]]: 

129 cached = self.cache.get(namespace, {}).get(name) 

130 if cached: 

131 return cached 

132 retval: list[str] = [] 

133 for element in self.get_element("", namespace, name): 

134 if (bags := _generic_get(element, self, list_type="Bag")) is not None: 

135 retval.extend(bags) 

136 else: 

137 value = self._get_text(element) 

138 retval.append(value) 

139 ns_cache = self.cache.setdefault(namespace, {}) 

140 ns_cache[name] = retval 

141 return retval 

142 

143 return get 

144 

145 

def _getter_seq(
    namespace: str, name: str, converter: Callable[[Any], Any] = _identity
) -> Callable[["XmpInformation"], Optional[list[Any]]]:
    """
    Build a getter for a property stored as an ordered array (``rdf:Seq``).

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.
        converter: Callable applied to every extracted text value.

    Returns:
        A function taking the XMP object and returning the list of converted
        values. The result is stored in ``self.cache[namespace][name]`` and
        reused on later calls.

    """

    def get(self: "XmpInformation") -> Optional[list[Any]]:
        # Test for key presence rather than truthiness: an empty result is a
        # valid cached value and must not trigger another DOM traversal.
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return ns_cache[name]
        retval = []
        for element in self.get_element("", namespace, name):
            if (seqs := _generic_get(element, self, list_type="Seq", converter=converter)) is not None:
                retval.extend(seqs)
            elif (bags := _generic_get(element, self, list_type="Bag", converter=converter)) is not None:
                # See issue at https://github.com/py-pdf/pypdf/issues/3324
                # Some applications violate the XMP metadata standard regarding `dc:creator` which should
                # be an "ordered array" and thus a sequence, but use an unordered array (bag) instead.
                # This seems to stem from the fact that the original Dublin Core specification does indeed
                # use bags or direct values, while PDFs are expected to follow the XMP standard and ignore
                # the plain Dublin Core variant. For this reason, add a fallback here to deal with such
                # issues accordingly.
                #
                # The converter is applied here as well, so that the element type of the
                # result does not depend on which container the producer happened to use.
                retval.extend(bags)
            else:
                retval.append(converter(self._get_text(element)))
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    return get

174 

175 

176def _getter_langalt( 

177 namespace: str, name: str 

178) -> Callable[["XmpInformation"], Optional[dict[Any, Any]]]: 

179 def get(self: "XmpInformation") -> Optional[dict[Any, Any]]: 

180 cached = self.cache.get(namespace, {}).get(name) 

181 if cached: 

182 return cached 

183 retval = {} 

184 for element in self.get_element("", namespace, name): 

185 alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt") 

186 if len(alts): 

187 for alt in alts: 

188 for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"): 

189 value = self._get_text(item) 

190 retval[item.getAttribute("xml:lang")] = value 

191 else: 

192 retval["x-default"] = self._get_text(element) 

193 ns_cache = self.cache.setdefault(namespace, {}) 

194 ns_cache[name] = retval 

195 return retval 

196 

197 return get 

198 

199 

def _getter_single(
    namespace: str, name: str, converter: Callable[[str], Any] = _identity
) -> Callable[["XmpInformation"], Optional[Any]]:
    """
    Build a getter for a property holding a single value.

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.
        converter: Callable applied to the extracted text value.

    Returns:
        A function taking the XMP object and returning the converted value of
        the first matching attribute or element, or ``None`` if there is no
        match. The result is stored in ``self.cache[namespace][name]`` and
        reused on later calls.

    """

    def get(self: "XmpInformation") -> Optional[Any]:
        # Test for key presence rather than truthiness: ``None`` and empty
        # strings are valid cached values and must not trigger another DOM
        # traversal.
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return ns_cache[name]
        value = None
        # Only the first match is relevant for a single-valued property.
        element = next(iter(self.get_element("", namespace, name)), None)
        if element is not None:
            if element.nodeType == element.ATTRIBUTE_NODE:
                value = element.nodeValue
            else:
                value = self._get_text(element)
        if value is not None:
            value = converter(value)
        self.cache.setdefault(namespace, {})[name] = value
        return value

    return get

221 

222 

class XmpInformation(XmpInformationProtocol, PdfObject):
    """
    An object that represents Extensible Metadata Platform (XMP) metadata.
    Usually accessed by :py:attr:`xmp_metadata()<pypdf.PdfReader.xmp_metadata>`.

    The XML of the given stream is parsed once on construction. Property
    values are extracted lazily on first access and kept in ``cache``
    (namespace URI -> property name -> value).

    Raises:
      PdfReadError: if XML is invalid

    """

    def __init__(self, stream: ContentStream) -> None:
        self.stream = stream
        try:
            data = self.stream.get_data()
            doc_root: Document = parseString(data)  # noqa: S318
        except (AttributeError, ExpatError) as e:
            # Chain the original error so the parser details stay available.
            raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
        # The first rdf:RDF element is the root for all lookups. A document
        # without such an element raises IndexError here.
        self.rdf_root: XmlElement = doc_root.getElementsByTagNameNS(
            RDF_NAMESPACE, "RDF"
        )[0]
        self.cache: dict[Any, Any] = {}

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the underlying metadata stream to ``stream``.

        Deprecated in favour of ``PdfWriter.xmp_metadata``; the
        ``encryption_key`` parameter is deprecated without replacement.
        """
        deprecate_with_replacement(
            "XmpInformation.write_to_stream",
            "PdfWriter.xmp_metadata",
            "6.0.0"
        )
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        self.stream.write_to_stream(stream)

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        """
        Yield all nodes holding the property ``name`` of ``namespace``.

        Only ``rdf:Description`` elements whose ``rdf:about`` attribute equals
        ``about_uri`` are considered. For each of them, a matching attribute
        node is yielded first (if present), followed by all matching
        descendant elements.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                yield from desc.getElementsByTagNameNS(namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        """
        Yield all attributes and direct child nodes belonging to ``namespace``.

        Only ``rdf:Description`` elements whose ``rdf:about`` attribute equals
        ``about_uri`` are considered. For each of them, the matching
        attributes are yielded before the matching child nodes.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr and attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _get_text(self, element: XmlElement) -> str:
        """Return the concatenated data of the direct text children of ``element``."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor"))
    """
    Contributors to the resource (other than the authors).

    An unsorted array of names.
    """

    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage"))
    """Text describing the extent or scope of the resource."""

    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator"))
    """A sorted array of names of the authors of the resource, listed in order
    of precedence."""

    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
    """
    A sorted array of dates (datetime.datetime instances) of significance to
    the resource.

    The dates and times are in UTC.
    """

    dc_description = property(_getter_langalt(DC_NAMESPACE, "description"))
    """A language-keyed dictionary of textual descriptions of the content of the
    resource."""

    dc_format = property(_getter_single(DC_NAMESPACE, "format"))
    """The mime-type of the resource."""

    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier"))
    """Unique identifier of the resource."""

    dc_language = property(_getter_bag(DC_NAMESPACE, "language"))
    """An unordered array specifying the languages used in the resource."""

    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher"))
    """An unordered array of publisher names."""

    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation"))
    """An unordered array of text descriptions of relationships to other
    documents."""

    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights"))
    """A language-keyed dictionary of textual descriptions of the rights the
    user has to this resource."""

    dc_source = property(_getter_single(DC_NAMESPACE, "source"))
    """Unique identifier of the work from which this resource was derived."""

    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject"))
    """An unordered array of descriptive phrases or keywords that specify the
    topic of the content of the resource."""

    dc_title = property(_getter_langalt(DC_NAMESPACE, "title"))
    """A language-keyed dictionary of the title of the resource."""

    dc_type = property(_getter_bag(DC_NAMESPACE, "type"))
    """An unordered array of textual descriptions of the document type."""

    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords"))
    """An unformatted text string representing document keywords."""

    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion"))
    """The PDF file version, for example 1.0 or 1.3."""

    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer"))
    """The name of the tool that saved the document as a PDF."""

    xmp_create_date = property(
        _getter_single(XMP_NAMESPACE, "CreateDate", _converter_date)
    )
    """
    The date and time the resource was originally created.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_modify_date = property(
        _getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date)
    )
    """
    The date and time the resource was last modified.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_metadata_date = property(
        _getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date)
    )
    """
    The date and time that any metadata for this resource was last changed.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_creator_tool = property(_getter_single(XMP_NAMESPACE, "CreatorTool"))
    """The name of the first known tool used to create the resource."""

    xmpmm_document_id = property(_getter_single(XMPMM_NAMESPACE, "DocumentID"))
    """The common identifier for all versions and renditions of this resource."""

    xmpmm_instance_id = property(_getter_single(XMPMM_NAMESPACE, "InstanceID"))
    """An identifier for a specific incarnation of a document, updated each
    time a file is saved."""

    pdfaid_part = property(_getter_single(PDFAID_NAMESPACE, "part"))
    """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3)."""

    pdfaid_conformance = property(_getter_single(PDFAID_NAMESPACE, "conformance"))
    """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U')."""

    @property
    def custom_properties(self) -> dict[Any, Any]:
        """
        Retrieve custom metadata properties defined in the undocumented pdfx
        metadata schema.

        Keys are decoded by replacing every ``\\u2182`` followed by four hex
        digits with the character of that code point. The dictionary is
        computed on first access and reused afterwards.

        Returns:
            A dictionary of key/value items for custom metadata properties.

        Raises:
            ValueError: If an escape marker is not followed by valid hex digits.

        """
        if not hasattr(self, "_custom_properties"):
            # Collect into a local dictionary first so that a failure while
            # decoding does not leave a half-filled result memoised.
            properties: dict[Any, Any] = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                key = node.localName
                # see documentation about PDFX_NAMESPACE earlier in file
                idx = key.find("\u2182")
                while idx != -1:
                    key = (
                        key[:idx]
                        + chr(int(key[idx + 1 : idx + 5], base=16))
                        + key[idx + 5 :]
                    )
                    # Continue after the decoded character: it may itself be
                    # \u2182 and must not be treated as another escape marker.
                    idx = key.find("\u2182", idx + 1)
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._get_text(node)
                properties[key] = value
            self._custom_properties = properties
        return self._custom_properties