Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/xmp.py: 42%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

229 statements  

1""" 

2Anything related to Extensible Metadata Platform (XMP) metadata. 

3 

4https://en.wikipedia.org/wiki/Extensible_Metadata_Platform 

5""" 

6 

7import datetime 

8import decimal 

9import re 

10from typing import ( 

11 Any, 

12 Callable, 

13 Dict, 

14 Iterator, 

15 List, 

16 Optional, 

17 TypeVar, 

18 Union, 

19) 

20from xml.dom.minidom import Document, parseString 

21from xml.dom.minidom import Element as XmlElement 

22from xml.parsers.expat import ExpatError 

23 

24from ._protocols import XmpInformationProtocol 

25from ._utils import StreamType, deprecate_no_replacement 

26from .errors import PdfReadError 

27from .generic import ContentStream, PdfObject 

28 

# Namespace URIs used to look up XMP properties in the RDF document.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# PDFX is the namespace for "custom metadata" properties: arbitrary key/value
# pairs without any semantic or documented meaning. It is described in
# https://github.com/adobe/xmp-docs/raw/master/XMPSpecifications/XMPSpecificationPart3.pdf
#
# Each element name is a key and the element content is its value. To turn a
# key into a valid XML identifier, every invalid character is replaced with
# \u2182 (ROMAN NUMERAL TEN THOUSAND) followed by the Unicode hex ID of the
# original character, so the key "my car" is stored as "my\u21820020car".
#
# The pdfx namespace should be avoided. A custom data schema with sensible XML
# elements could be used instead, as Adobe's own XMP documentation suggests
# under "Extensibility of Schemas".
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# PDF/A identification schema.
PDFAID_NAMESPACE = "http://www.aiim.org/pdfa/ns/id/"

57 

58iso8601 = re.compile( 

59 """ 

60 (?P<year>[0-9]{4}) 

61 (- 

62 (?P<month>[0-9]{2}) 

63 (- 

64 (?P<day>[0-9]+) 

65 (T 

66 (?P<hour>[0-9]{2}): 

67 (?P<minute>[0-9]{2}) 

68 (:(?P<second>[0-9]{2}(.[0-9]+)?))? 

69 (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2}) 

70 )? 

71 )? 

72 )? 

73 """, 

74 re.VERBOSE, 

75) 

76 

77 

78K = TypeVar("K") 

79 

80 

81def _identity(value: K) -> K: 

82 return value 

83 

84 

85def _converter_date(value: str) -> datetime.datetime: 

86 matches = iso8601.match(value) 

87 if matches is None: 

88 raise ValueError(f"Invalid date format: {value}") 

89 year = int(matches.group("year")) 

90 month = int(matches.group("month") or "1") 

91 day = int(matches.group("day") or "1") 

92 hour = int(matches.group("hour") or "0") 

93 minute = int(matches.group("minute") or "0") 

94 second = decimal.Decimal(matches.group("second") or "0") 

95 seconds_dec = second.to_integral(decimal.ROUND_FLOOR) 

96 milliseconds_dec = (second - seconds_dec) * 1_000_000 

97 

98 seconds = int(seconds_dec) 

99 milliseconds = int(milliseconds_dec) 

100 

101 tzd = matches.group("tzd") or "Z" 

102 dt = datetime.datetime(year, month, day, hour, minute, seconds, milliseconds) 

103 if tzd != "Z": 

104 tzd_hours, tzd_minutes = (int(x) for x in tzd.split(":")) 

105 tzd_hours *= -1 

106 if tzd_hours < 0: 

107 tzd_minutes *= -1 

108 dt = dt + datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes) 

109 return dt 

110 

111 

def _generic_get(
    element: XmlElement, self: "XmpInformation", list_type: str, converter: Callable[[Any], Any] = _identity
) -> Optional[List[str]]:
    """
    Collect the converted ``rdf:li`` values of every RDF container of one kind.

    Args:
        element: The XML element to search below.
        self: The object whose ``_get_text`` extracts the text of each item.
        list_type: Local name of the RDF container element, e.g. "Bag" or "Seq".
        converter: Callable applied to the text of each ``rdf:li`` item.

    Returns:
        The converted values of all ``rdf:li`` items found inside the matching
        containers, or ``None`` if ``element`` has no such container.

    """
    containers = element.getElementsByTagNameNS(RDF_NAMESPACE, list_type)
    if not containers:
        return None
    return [
        converter(self._get_text(item))
        for container in containers
        for item in container.getElementsByTagNameNS(RDF_NAMESPACE, "li")
    ]

125 

126 

127def _getter_bag( 

128 namespace: str, name: str 

129) -> Callable[["XmpInformation"], Optional[List[str]]]: 

130 def get(self: "XmpInformation") -> Optional[List[str]]: 

131 cached = self.cache.get(namespace, {}).get(name) 

132 if cached: 

133 return cached 

134 retval: List[str] = [] 

135 for element in self.get_element("", namespace, name): 

136 if (bags := _generic_get(element, self, list_type="Bag")) is not None: 

137 retval.extend(bags) 

138 else: 

139 value = self._get_text(element) 

140 retval.append(value) 

141 ns_cache = self.cache.setdefault(namespace, {}) 

142 ns_cache[name] = retval 

143 return retval 

144 

145 return get 

146 

147 

def _getter_seq(
    namespace: str, name: str, converter: Callable[[Any], Any] = _identity
) -> Callable[["XmpInformation"], Optional[List[Any]]]:
    """
    Build a getter for a property stored as an ordered array (``rdf:Seq``).

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.
        converter: Callable applied to the text of every value.

    Returns:
        A function suitable for ``property()``. It returns the list of
        converted values of the property and stores that list in
        ``self.cache``.

    """

    def get(self: "XmpInformation") -> Optional[List[Any]]:
        ns_cache = self.cache.setdefault(namespace, {})
        # Check for the key rather than for a truthy value: an empty list is
        # a valid cached result and must not trigger another DOM walk.
        if name in ns_cache:
            return ns_cache[name]
        retval: List[Any] = []
        for element in self.get_element("", namespace, name):
            if (seqs := _generic_get(element, self, list_type="Seq", converter=converter)) is not None:
                retval.extend(seqs)
            elif (bags := _generic_get(element, self, list_type="Bag", converter=converter)) is not None:
                # See issue at https://github.com/py-pdf/pypdf/issues/3324
                # Some applications violate the XMP metadata standard regarding `dc:creator` which should
                # be an "ordered array" and thus a sequence, but use an unordered array (bag) instead.
                # This seems to stem from the fact that the original Dublin Core specification does indeed
                # use bags or direct values, while PDFs are expected to follow the XMP standard and ignore
                # the plain Dublin Core variant. For this reason, add a fallback here to deal with such
                # issues accordingly.
                # The converter is applied here as well so that every branch
                # yields values of the same type.
                retval.extend(bags)
            else:
                # Neither rdf:Seq nor rdf:Bag: use the element's own text.
                retval.append(converter(self._get_text(element)))
        ns_cache[name] = retval
        return retval

    return get

176 

177 

178def _getter_langalt( 

179 namespace: str, name: str 

180) -> Callable[["XmpInformation"], Optional[Dict[Any, Any]]]: 

181 def get(self: "XmpInformation") -> Optional[Dict[Any, Any]]: 

182 cached = self.cache.get(namespace, {}).get(name) 

183 if cached: 

184 return cached 

185 retval = {} 

186 for element in self.get_element("", namespace, name): 

187 alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt") 

188 if len(alts): 

189 for alt in alts: 

190 for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"): 

191 value = self._get_text(item) 

192 retval[item.getAttribute("xml:lang")] = value 

193 else: 

194 retval["x-default"] = self._get_text(element) 

195 ns_cache = self.cache.setdefault(namespace, {}) 

196 ns_cache[name] = retval 

197 return retval 

198 

199 return get 

200 

201 

def _getter_single(
    namespace: str, name: str, converter: Callable[[str], Any] = _identity
) -> Callable[["XmpInformation"], Optional[Any]]:
    """
    Build a getter for a property that holds a single value.

    Args:
        namespace: Namespace URI of the property.
        name: Local name of the property.
        converter: Callable applied to the raw string value.

    Returns:
        A function suitable for ``property()``. It returns the converted
        value of the first matching attribute or element, or ``None`` if the
        property is absent. The result is stored in ``self.cache``.

    """

    def get(self: "XmpInformation") -> Optional[Any]:
        ns_cache = self.cache.setdefault(namespace, {})
        # Check for the key rather than for a truthy value: ``None`` and ""
        # are valid cached results and must not trigger another DOM walk.
        if name in ns_cache:
            return ns_cache[name]
        value = None
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                value = element.nodeValue
            else:
                value = self._get_text(element)
            # Only the first match is used.
            break
        if value is not None:
            value = converter(value)
        ns_cache[name] = value
        return value

    return get

223 

224 

class XmpInformation(XmpInformationProtocol, PdfObject):
    """
    An object that represents Extensible Metadata Platform (XMP) metadata.
    Usually accessed by :py:attr:`xmp_metadata()<pypdf.PdfReader.xmp_metadata>`.

    Raises:
      PdfReadError: if XML is invalid or contains no ``rdf:RDF`` element

    """

    def __init__(self, stream: ContentStream) -> None:
        self.stream = stream
        try:
            data = self.stream.get_data()
            doc_root: Document = parseString(data)  # noqa: S318
        except (AttributeError, ExpatError) as e:
            raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
        rdf_roots = doc_root.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")
        if not rdf_roots:
            # Well-formed XML without an rdf:RDF element: report it as a read
            # error instead of letting an IndexError escape.
            raise PdfReadError(
                "XML in XmpInformation was invalid: no rdf:RDF element found"
            )
        self.rdf_root: XmlElement = rdf_roots[0]
        # Per-instance cache of parsed values: {namespace: {name: value}}.
        self.cache: Dict[Any, Any] = {}

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """
        Write the underlying metadata stream to ``stream``.

        Args:
            stream: The output stream to write to.
            encryption_key: Deprecated; passing anything other than ``None``
                triggers a deprecation notice. The value is not used.

        """
        if encryption_key is not None:  # deprecated
            deprecate_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        self.stream.write_to_stream(stream)

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        """
        Yield the nodes that hold the property ``namespace``/``name``.

        For every ``rdf:Description`` whose ``rdf:about`` equals ``about_uri``,
        the matching attribute node (if any) is yielded first, followed by all
        matching descendant elements.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                yield from desc.getElementsByTagNameNS(namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        """
        Yield every attribute and direct child node in ``namespace``.

        Only ``rdf:Description`` elements whose ``rdf:about`` equals
        ``about_uri`` are inspected; their attributes are yielded before
        their child nodes.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr and attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _get_text(self, element: XmlElement) -> str:
        """Return the concatenated data of the direct text-node children of ``element``."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    @staticmethod
    def _decode_pdfx_key(key: str) -> str:
        """
        Undo the pdfx key escaping: \u2182 followed by four hex digits becomes
        the character with that code point. Sequences that do not have this
        form are left untouched.
        """
        return re.sub(
            "\u2182([0-9A-Fa-f]{4})",
            lambda match: chr(int(match.group(1), 16)),
            key,
        )

    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor"))
    """
    Contributors to the resource (other than the authors).

    An unsorted array of names.
    """

    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage"))
    """Text describing the extent or scope of the resource."""

    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator"))
    """A sorted array of names of the authors of the resource, listed in order
    of precedence."""

    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
    """
    A sorted array of dates (datetime.datetime instances) of significance to
    the resource.

    The dates and times are in UTC.
    """

    dc_description = property(_getter_langalt(DC_NAMESPACE, "description"))
    """A language-keyed dictionary of textual descriptions of the content of the
    resource."""

    dc_format = property(_getter_single(DC_NAMESPACE, "format"))
    """The mime-type of the resource."""

    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier"))
    """Unique identifier of the resource."""

    dc_language = property(_getter_bag(DC_NAMESPACE, "language"))
    """An unordered array specifying the languages used in the resource."""

    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher"))
    """An unordered array of publisher names."""

    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation"))
    """An unordered array of text descriptions of relationships to other
    documents."""

    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights"))
    """A language-keyed dictionary of textual descriptions of the rights the
    user has to this resource."""

    dc_source = property(_getter_single(DC_NAMESPACE, "source"))
    """Unique identifier of the work from which this resource was derived."""

    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject"))
    """An unordered array of descriptive phrases or keywords that specify the
    topic of the content of the resource."""

    dc_title = property(_getter_langalt(DC_NAMESPACE, "title"))
    """A language-keyed dictionary of the title of the resource."""

    dc_type = property(_getter_bag(DC_NAMESPACE, "type"))
    """An unordered array of textual descriptions of the document type."""

    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords"))
    """An unformatted text string representing document keywords."""

    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion"))
    """The PDF file version, for example 1.0 or 1.3."""

    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer"))
    """The name of the tool that saved the document as a PDF."""

    xmp_create_date = property(
        _getter_single(XMP_NAMESPACE, "CreateDate", _converter_date)
    )
    """
    The date and time the resource was originally created.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_modify_date = property(
        _getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date)
    )
    """
    The date and time the resource was last modified.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_metadata_date = property(
        _getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date)
    )
    """
    The date and time that any metadata for this resource was last changed.

    The date and time are returned as a UTC datetime.datetime object.
    """

    xmp_creator_tool = property(_getter_single(XMP_NAMESPACE, "CreatorTool"))
    """The name of the first known tool used to create the resource."""

    xmpmm_document_id = property(_getter_single(XMPMM_NAMESPACE, "DocumentID"))
    """The common identifier for all versions and renditions of this resource."""

    xmpmm_instance_id = property(_getter_single(XMPMM_NAMESPACE, "InstanceID"))
    """An identifier for a specific incarnation of a document, updated each
    time a file is saved."""

    pdfaid_part = property(_getter_single(PDFAID_NAMESPACE, "part"))
    """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3)."""

    pdfaid_conformance = property(_getter_single(PDFAID_NAMESPACE, "conformance"))
    """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U')."""

    @property
    def custom_properties(self) -> Dict[Any, Any]:
        """
        Retrieve custom metadata properties defined in the undocumented pdfx
        metadata schema.

        Returns:
            A dictionary of key/value items for custom metadata properties.

        """
        # Built once on first access and then reused.
        if not hasattr(self, "_custom_properties"):
            self._custom_properties = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                # see documentation about PDFX_NAMESPACE earlier in file
                key = self._decode_pdfx_key(node.localName)
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._get_text(node)
                self._custom_properties[key] = value
        return self._custom_properties