Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/xmp.py: 34%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

481 statements  

1""" 

2Anything related to Extensible Metadata Platform (XMP) metadata. 

3 

4https://en.wikipedia.org/wiki/Extensible_Metadata_Platform 

5""" 

6 

7import datetime 

8import decimal 

9import re 

10from collections.abc import Iterator 

11from typing import ( 

12 Any, 

13 Callable, 

14 Optional, 

15 TypeVar, 

16 Union, 

17 cast, 

18) 

19from xml.dom.expatbuilder import ExpatBuilderNS 

20from xml.dom.minidom import Document 

21from xml.dom.minidom import Element as XmlElement 

22from xml.dom.xmlbuilder import Options 

23from xml.parsers.expat import ExpatError, XMLParserType 

24 

25from ._protocols import XmpInformationProtocol 

26from ._utils import StreamType, deprecate_with_replacement, deprecation_no_replacement 

27from .errors import LimitReachedError, PdfReadError, XmpDocumentError 

28from .generic import ContentStream, PdfObject 

29 

# Hard limits applied while reading an XMP packet.
XMP_MAX_INPUT_LENGTH = 5_000_000  # maximum size of the raw XMP stream data
XMP_MAX_ELEMENT_COUNT = 100_000  # maximum number of XML start tags per packet

RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# What is the PDFX namespace, you might ask?
# It's documented here: https://github.com/adobe/xmp-docs/raw/master/XMPSpecifications/XMPSpecificationPart3.pdf
# This namespace is used to place "custom metadata"
# properties, which are arbitrary metadata properties with no semantic or
# documented meaning.
#
# Elements in the namespace are key/value-style storage,
# where the element name is the key and the content is the value. The keys
# are transformed into valid XML identifiers by substituting an invalid
# identifier character with \u2182 followed by the unicode hex ID of the
# original character. A key like "my car" is therefore "my\u21820020car".
#
# \u2182 is the unicode character \u{ROMAN NUMERAL TEN THOUSAND}
#
# The pdfx namespace should be avoided.
# A custom data schema and sensical XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP under "Extensibility of
# Schemas".
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# PDF/A
PDFAID_NAMESPACE = "http://www.aiim.org/pdfa/ns/id/"

# Internal mapping of namespace URI → prefix
_NAMESPACE_PREFIX_MAP = {
    DC_NAMESPACE: "dc",
    XMP_NAMESPACE: "xmp",
    PDF_NAMESPACE: "pdf",
    XMPMM_NAMESPACE: "xmpMM",
    PDFAID_NAMESPACE: "pdfaid",
    PDFX_NAMESPACE: "pdfx",
}

# Date pattern: a year, optionally followed by month, day and a time of day
# with a mandatory time zone designator. The pattern is not anchored at the
# end, so callers using ``match`` accept trailing text after the longest
# valid prefix.
# The fractional-second separator must be a literal dot; it is escaped so
# that arbitrary characters (e.g. "05x25") are not accepted as seconds.
iso8601 = re.compile(
    r"""
    (?P<year>[0-9]{4})
    (-
        (?P<month>[0-9]{2})
        (-
            (?P<day>[0-9]+)
            (T
                (?P<hour>[0-9]{2}):
                (?P<minute>[0-9]{2})
                (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
            )?
        )?
    )?
    """,
    re.VERBOSE,
)


K = TypeVar("K")

# Minimal XMP template: an empty rdf:Description that already declares every
# namespace prefix listed in _NAMESPACE_PREFIX_MAP.
_MINIMAL_XMP = f"""<?xpacket begin="\ufeff" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pypdf">
    <rdf:RDF xmlns:rdf="{RDF_NAMESPACE}">
        <rdf:Description rdf:about=""
            xmlns:dc="{DC_NAMESPACE}"
            xmlns:xmp="{XMP_NAMESPACE}"
            xmlns:pdf="{PDF_NAMESPACE}"
            xmlns:xmpMM="{XMPMM_NAMESPACE}"
            xmlns:pdfaid="{PDFAID_NAMESPACE}"
            xmlns:pdfx="{PDFX_NAMESPACE}">
        </rdf:Description>
    </rdf:RDF>
</x:xmpmeta>
<?xpacket end="r"?>"""

109 

110 

def _identity(value: K) -> K:
    """Return ``value`` unchanged (the default no-op converter)."""
    return value

113 

114 

def _converter_date(value: str) -> datetime.datetime:
    """
    Convert a date string into a naive datetime normalised to UTC.

    Missing month/day default to 1, missing time fields to 0 and a missing
    time zone designator to ``Z``.

    Args:
        value: Date text that must start with a match of the module-level
            ``iso8601`` pattern.

    Returns:
        A timezone-naive ``datetime`` with the time zone offset already applied.

    Raises:
        ValueError: if ``value`` does not match the pattern, the seconds field
            is not a number, or a field is out of range for ``datetime``.
    """
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")
    fields = matches.groupdict()

    year = int(fields["year"])
    month = int(fields["month"] or "1")
    day = int(fields["day"] or "1")
    hour = int(fields["hour"] or "0")
    minute = int(fields["minute"] or "0")
    try:
        second = decimal.Decimal(fields["second"] or "0")
    except decimal.InvalidOperation as exc:
        # Keep the failure mode uniform: callers only need to handle ValueError.
        raise ValueError(f"Invalid date format: {value}") from exc

    # Split the seconds into an integral part and microseconds.
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    microseconds = int((second - whole_seconds) * 1_000_000)

    dt = datetime.datetime(year, month, day, hour, minute, int(whole_seconds), microseconds)

    tzd = fields["tzd"] or "Z"
    if tzd != "Z":
        # Take the sign from the designator itself, not from the parsed hour:
        # int("+00") and int("-00") are both 0, which would lose the sign for
        # offsets such as "+00:30".
        sign = -1 if tzd[0] == "-" else 1
        tzd_hours, tzd_minutes = (int(part) for part in tzd[1:].split(":"))
        offset = sign * datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
        # Local time minus its UTC offset yields UTC.
        dt = dt - offset
    return dt

140 

141 

def _format_datetime_utc(value: datetime.datetime) -> str:
    """
    Render ``value`` as ``YYYY-MM-DDTHH:MM:SS.ffffffZ``.

    Timezone-aware inputs are converted to UTC before formatting; naive
    inputs are taken to be UTC already and are formatted as they are.
    """
    # utcoffset() is None both for naive values and for tzinfo objects that
    # do not report an offset; only a real offset triggers a conversion.
    has_offset = value.utcoffset() is not None
    as_utc = value.astimezone(datetime.timezone.utc) if has_offset else value
    naive = as_utc.replace(tzinfo=None)
    return naive.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

153 

154 

def _generic_get(
    element: XmlElement, self: "XmpInformation", list_type: str, converter: Callable[[Any], Any] = _identity
) -> Optional[list[str]]:
    """
    Collect the converted ``rdf:li`` values of all RDF containers below ``element``.

    Args:
        element: Node whose descendants are searched.
        self: The XmpInformation instance used to extract the text of each item.
        list_type: Local name of the RDF container element to look for,
            e.g. ``"Bag"`` or ``"Seq"``.
        converter: Callable applied to the text of every item.

    Returns:
        ``None`` if ``element`` contains no such container, otherwise the list
        of converted item values (possibly empty).
    """
    containers = element.getElementsByTagNameNS(RDF_NAMESPACE, list_type)
    if not containers:
        return None
    return [
        converter(self._get_text(item))
        for container in containers
        for item in container.getElementsByTagNameNS(RDF_NAMESPACE, "li")
    ]

168 

169 

class _XmpBuilder(ExpatBuilderNS):
    """
    Custom XML parser denying all entity declarations.

    This is a stripped down and typed version inspired by what *defusedxml* does.

    Why do we need this? The default limits of *libexpat* used by Python only block exponential entity expansion,
    but not cases like quadratic entity expansion which can still cause quite some memory usage.

    In addition, the number of start tags handled by one builder is capped at
    ``XMP_MAX_ELEMENT_COUNT``.
    """

    def __init__(self, options: Optional[Options] = None) -> None:
        super().__init__(options=options)
        # Running total of start tags seen by this builder instance.
        self._element_count = 0

    def custom_entity_declaration_handler(
        self,
        entity_name: str,
        is_parameter_entity: bool,
        value: Optional[str],
        base: Optional[str],
        system_id: str,
        public_id: Optional[str],
        notation_name: Optional[str],
    ) -> None:
        """Reject every entity declaration by raising ``ExpatError``."""
        message = f"Forbidden entities: {entity_name!r}"
        raise ExpatError(message)

    def start_element_handler(self, name: str, attributes: list[str]) -> None:
        """Count the start tag, enforce the element limit, then build the node as usual."""
        seen = self._element_count + 1
        self._element_count = seen
        if seen > XMP_MAX_ELEMENT_COUNT:
            raise LimitReachedError(f"XMP metadata exceeds limit of {XMP_MAX_ELEMENT_COUNT} elements.")
        super().start_element_handler(name=name, attributes=attributes)

    def install(self, parser: XMLParserType) -> None:
        """Register the default handlers, then override the two guarded ones."""
        super().install(parser)

        # These assignments must come after super().install(), which sets its
        # own handlers on the parser.
        parser.StartElementHandler = self.start_element_handler
        parser.EntityDeclHandler = self.custom_entity_declaration_handler

207 

208 

class XmpInformation(XmpInformationProtocol, PdfObject):
    """
    An object that represents Extensible Metadata Platform (XMP) metadata.
    Usually accessed by :py:attr:`xmp_metadata()<pypdf.PdfReader.xmp_metadata>`.

    Property getters read from every ``rdf:Description`` whose ``rdf:about``
    is empty and cache their result; setters rewrite the DOM, drop the cached
    value and re-serialise the document into the underlying stream.

    Raises:
        PdfReadError: if XML is invalid

    """

    # Namespace URI of ``xmlns:*`` declaration attributes.
    _XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/"
    # pdfx element names escape invalid identifier characters as U+2182
    # followed by four hex digits (see the notes at PDFX_NAMESPACE).
    _PDFX_ESCAPE = re.compile("\u2182([0-9A-Fa-f]{4})")

    def __init__(self, stream: ContentStream) -> None:
        self.stream = stream
        try:
            data = self.stream.get_data()
            if (length := len(data)) > XMP_MAX_INPUT_LENGTH:
                raise LimitReachedError(f"XMP stream size {length} exceeds limit of {XMP_MAX_INPUT_LENGTH}.")
            doc_root: Document = _XmpBuilder().parseString(data)
        except (AttributeError, ExpatError) as e:
            raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
        rdf_roots = doc_root.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")
        if not rdf_roots:
            # Well-formed XML without an rdf:RDF element is not usable XMP;
            # report it like any other invalid packet instead of an IndexError.
            raise PdfReadError("XML in XmpInformation was invalid: no rdf:RDF element found")
        self.rdf_root: XmlElement = rdf_roots[0]
        # Two-level cache: namespace URI -> property name -> parsed value.
        self.cache: dict[Any, Any] = {}

    @classmethod
    def create(cls) -> "XmpInformation":
        """
        Create a new XmpInformation object with minimal structure.

        Returns:
            A new XmpInformation instance with empty metadata fields.
        """
        stream = ContentStream(None, None)
        stream.set_data(_MINIMAL_XMP.encode("utf-8"))
        return cls(stream)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Deprecated: write the underlying XMP stream object to ``stream``."""
        deprecate_with_replacement(
            "XmpInformation.write_to_stream",
            "PdfWriter.xmp_metadata",
            "6.0.0"
        )
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        self.stream.write_to_stream(stream)

    # ------------------------------------------------------------------
    # Reading
    # ------------------------------------------------------------------

    def _iter_descriptions(self, about_uri: str) -> Iterator[XmlElement]:
        """Yield every rdf:Description below the RDF root whose rdf:about equals ``about_uri``."""
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            # getAttributeNS returns "" for a missing attribute, so descriptions
            # without rdf:about match an empty about_uri as well.
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                yield desc

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        """Yield the attribute node and the descendant elements named ``namespace:name`` of each matching description."""
        for desc in self._iter_descriptions(about_uri):
            attr = desc.getAttributeNodeNS(namespace, name)
            if attr is not None:
                yield attr
            yield from desc.getElementsByTagNameNS(namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        """Yield all attributes and direct children in ``namespace`` of each matching description."""
        for desc in self._iter_descriptions(about_uri):
            attributes = desc.attributes
            for index in range(attributes.length):
                attr = attributes.item(index)
                if attr and attr.namespaceURI == namespace:
                    yield attr
            for child in desc.childNodes:
                if child.namespaceURI == namespace:
                    yield child

    def _get_text(self, element: XmlElement) -> str:
        """Return the concatenated data of the direct text-node children of ``element``."""
        return "".join(
            child.data for child in element.childNodes if child.nodeType == child.TEXT_NODE
        )

    def _get_single_value(
        self,
        namespace: str,
        name: str,
        converter: Callable[[str], Any] = _identity,
    ) -> Optional[Any]:
        """Return the converted value of the first ``namespace:name`` node, or ``None`` if absent."""
        ns_cache = self.cache.get(namespace, {})
        # Test for presence rather than truthiness so that cached empty
        # results are served from the cache too.
        if name in ns_cache:
            return ns_cache[name]
        value = None
        element = next(iter(self.get_element("", namespace, name)), None)
        if element is not None:
            if element.nodeType == element.ATTRIBUTE_NODE:
                value = element.nodeValue
            else:
                value = self._get_text(element)
            if value is not None:
                value = converter(value)
        self.cache.setdefault(namespace, {})[name] = value
        return value

    def _getter_bag(self, namespace: str, name: str) -> Optional[list[str]]:
        """Return the values of an unordered array property (plain values are accepted as well)."""
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return cast(list[str], ns_cache[name])
        retval: list[str] = []
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                # Attribute nodes cannot contain an rdf:Bag; use their value.
                retval.append(element.nodeValue)
                continue
            bags = _generic_get(element, self, list_type="Bag")
            if bags is not None:
                retval.extend(bags)
            else:
                retval.append(self._get_text(element))
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    def _get_seq_values(
        self,
        namespace: str,
        name: str,
        converter: Callable[[Any], Any] = _identity,
    ) -> Optional[list[Any]]:
        """Return the converted values of an ordered array property."""
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return cast(list[Any], ns_cache[name])
        retval: list[Any] = []
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                retval.append(converter(element.nodeValue))
                continue
            seqs = _generic_get(element, self, list_type="Seq", converter=converter)
            if seqs is not None:
                retval.extend(seqs)
                continue
            # See issue at https://github.com/py-pdf/pypdf/issues/3324
            # Some applications violate the XMP metadata standard regarding `dc:creator` which should
            # be an "ordered array" and thus a sequence, but use an unordered array (bag) instead.
            # This seems to stem from the fact that the original Dublin Core specification does indeed
            # use bags or direct values, while PDFs are expected to follow the XMP standard and ignore
            # the plain Dublin Core variant. For this reason, add a fallback here to deal with such
            # issues accordingly. The converter is applied here too, so that the element type of the
            # result does not depend on which container the producer used.
            bags = _generic_get(element, self, list_type="Bag", converter=converter)
            if bags is not None:
                retval.extend(bags)
            else:
                retval.append(converter(self._get_text(element)))
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    def _get_langalt_values(self, namespace: str, name: str) -> Optional[dict[Any, Any]]:
        """Return a language alternative property as a mapping of ``xml:lang`` to text."""
        ns_cache = self.cache.get(namespace, {})
        if name in ns_cache:
            return cast(dict[Any, Any], ns_cache[name])
        retval: dict[Any, Any] = {}
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                retval["x-default"] = element.nodeValue
                continue
            alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
            if alts:
                for alt in alts:
                    for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval[item.getAttribute("xml:lang")] = self._get_text(item)
            else:
                # A plain value without rdf:Alt is reported as the default language.
                retval["x-default"] = self._get_text(element)
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    # ------------------------------------------------------------------
    # Dublin Core
    # ------------------------------------------------------------------

    @property
    def dc_contributor(self) -> Optional[list[str]]:
        """Contributors to the resource (other than the authors)."""
        return self._getter_bag(DC_NAMESPACE, "contributor")

    @dc_contributor.setter
    def dc_contributor(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "contributor", values)

    @property
    def dc_coverage(self) -> Optional[str]:
        """Text describing the extent or scope of the resource."""
        return self._get_single_value(DC_NAMESPACE, "coverage")

    @dc_coverage.setter
    def dc_coverage(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "coverage", value)

    @property
    def dc_creator(self) -> Optional[list[str]]:
        """A sorted array of names of the authors of the resource, listed in order of precedence."""
        return self._get_seq_values(DC_NAMESPACE, "creator")

    @dc_creator.setter
    def dc_creator(self, values: Optional[list[str]]) -> None:
        self._set_seq_values(DC_NAMESPACE, "creator", values)

    @property
    def dc_date(self) -> Optional[list[datetime.datetime]]:
        """A sorted array of dates of significance to the resource. The dates and times are in UTC."""
        return self._get_seq_values(DC_NAMESPACE, "date", _converter_date)

    @dc_date.setter
    def dc_date(self, values: Optional[list[Union[str, datetime.datetime]]]) -> None:
        if values is None:
            self._set_seq_values(DC_NAMESPACE, "date", None)
            return
        # datetime objects are normalised to UTC text; anything else is stored as given.
        date_strings = [
            _format_datetime_utc(value) if isinstance(value, datetime.datetime) else str(value)
            for value in values
        ]
        self._set_seq_values(DC_NAMESPACE, "date", date_strings)

    @property
    def dc_description(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the content of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "description")

    @dc_description.setter
    def dc_description(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "description", values)

    @property
    def dc_format(self) -> Optional[str]:
        """The mime-type of the resource."""
        return self._get_single_value(DC_NAMESPACE, "format")

    @dc_format.setter
    def dc_format(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "format", value)

    @property
    def dc_identifier(self) -> Optional[str]:
        """Unique identifier of the resource."""
        return self._get_single_value(DC_NAMESPACE, "identifier")

    @dc_identifier.setter
    def dc_identifier(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "identifier", value)

    @property
    def dc_language(self) -> Optional[list[str]]:
        """An unordered array specifying the languages used in the resource."""
        return self._getter_bag(DC_NAMESPACE, "language")

    @dc_language.setter
    def dc_language(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "language", values)

    @property
    def dc_publisher(self) -> Optional[list[str]]:
        """An unordered array of publisher names."""
        return self._getter_bag(DC_NAMESPACE, "publisher")

    @dc_publisher.setter
    def dc_publisher(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "publisher", values)

    @property
    def dc_relation(self) -> Optional[list[str]]:
        """An unordered array of text descriptions of relationships to other documents."""
        return self._getter_bag(DC_NAMESPACE, "relation")

    @dc_relation.setter
    def dc_relation(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "relation", values)

    @property
    def dc_rights(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the rights the user has to this resource."""
        return self._get_langalt_values(DC_NAMESPACE, "rights")

    @dc_rights.setter
    def dc_rights(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "rights", values)

    @property
    def dc_source(self) -> Optional[str]:
        """Unique identifier of the work from which this resource was derived."""
        return self._get_single_value(DC_NAMESPACE, "source")

    @dc_source.setter
    def dc_source(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "source", value)

    @property
    def dc_subject(self) -> Optional[list[str]]:
        """An unordered array of descriptive phrases or keywords that specify the topic of the content."""
        return self._getter_bag(DC_NAMESPACE, "subject")

    @dc_subject.setter
    def dc_subject(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "subject", values)

    @property
    def dc_title(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of the title of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "title")

    @dc_title.setter
    def dc_title(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "title", values)

    @property
    def dc_type(self) -> Optional[list[str]]:
        """An unordered array of textual descriptions of the document type."""
        return self._getter_bag(DC_NAMESPACE, "type")

    @dc_type.setter
    def dc_type(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "type", values)

    # ------------------------------------------------------------------
    # Adobe PDF schema
    # ------------------------------------------------------------------

    @property
    def pdf_keywords(self) -> Optional[str]:
        """An unformatted text string representing document keywords."""
        return self._get_single_value(PDF_NAMESPACE, "Keywords")

    @pdf_keywords.setter
    def pdf_keywords(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Keywords", value)

    @property
    def pdf_pdfversion(self) -> Optional[str]:
        """The PDF file version, for example 1.0 or 1.3."""
        return self._get_single_value(PDF_NAMESPACE, "PDFVersion")

    @pdf_pdfversion.setter
    def pdf_pdfversion(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "PDFVersion", value)

    @property
    def pdf_producer(self) -> Optional[str]:
        """The name of the tool that saved the document as a PDF."""
        return self._get_single_value(PDF_NAMESPACE, "Producer")

    @pdf_producer.setter
    def pdf_producer(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Producer", value)

    # ------------------------------------------------------------------
    # XMP basic schema
    # ------------------------------------------------------------------

    def _set_date_value(self, name: str, value: Optional[datetime.datetime]) -> None:
        """Store ``value`` as UTC text in the XMP basic schema, or remove the property if it is falsy."""
        date_str = _format_datetime_utc(value) if value else None
        self._set_single_value(XMP_NAMESPACE, name, date_str)

    @property
    def xmp_create_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was originally created. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "CreateDate", _converter_date)

    @xmp_create_date.setter
    def xmp_create_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_date_value("CreateDate", value)

    @property
    def xmp_modify_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was last modified. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "ModifyDate", _converter_date)

    @xmp_modify_date.setter
    def xmp_modify_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_date_value("ModifyDate", value)

    @property
    def xmp_metadata_date(self) -> Optional[datetime.datetime]:
        """The date and time that any metadata for this resource was last changed. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "MetadataDate", _converter_date)

    @xmp_metadata_date.setter
    def xmp_metadata_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_date_value("MetadataDate", value)

    @property
    def xmp_creator_tool(self) -> Optional[str]:
        """The name of the first known tool used to create the resource."""
        return self._get_single_value(XMP_NAMESPACE, "CreatorTool")

    @xmp_creator_tool.setter
    def xmp_creator_tool(self, value: Optional[str]) -> None:
        self._set_single_value(XMP_NAMESPACE, "CreatorTool", value)

    # ------------------------------------------------------------------
    # XMP media management schema
    # ------------------------------------------------------------------

    @property
    def xmpmm_document_id(self) -> Optional[str]:
        """The common identifier for all versions and renditions of this resource."""
        return self._get_single_value(XMPMM_NAMESPACE, "DocumentID")

    @xmpmm_document_id.setter
    def xmpmm_document_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "DocumentID", value)

    @property
    def xmpmm_instance_id(self) -> Optional[str]:
        """An identifier for a specific incarnation of a document, updated each time a file is saved."""
        return self._get_single_value(XMPMM_NAMESPACE, "InstanceID")

    @xmpmm_instance_id.setter
    def xmpmm_instance_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "InstanceID", value)

    # ------------------------------------------------------------------
    # PDF/A identification schema
    # ------------------------------------------------------------------

    @property
    def pdfaid_part(self) -> Optional[str]:
        """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3)."""
        return self._get_single_value(PDFAID_NAMESPACE, "part")

    @pdfaid_part.setter
    def pdfaid_part(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "part", value)

    @property
    def pdfaid_conformance(self) -> Optional[str]:
        """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U')."""
        return self._get_single_value(PDFAID_NAMESPACE, "conformance")

    @pdfaid_conformance.setter
    def pdfaid_conformance(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "conformance", value)

    @property
    def custom_properties(self) -> dict[Any, Any]:
        """
        Retrieve custom metadata properties defined in the undocumented pdfx
        metadata schema.

        Returns:
            A dictionary of key/value items for custom metadata properties.

        """
        if not hasattr(self, "_custom_properties"):
            properties: dict[Any, Any] = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                # Decode every well-formed escape in a single pass (see the
                # documentation about PDFX_NAMESPACE earlier in the file).
                # Malformed escapes are left as they are, and decoded
                # characters are never re-interpreted as a new escape.
                key = self._PDFX_ESCAPE.sub(
                    lambda match: chr(int(match.group(1), base=16)), node.localName
                )
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._get_text(node)
                properties[key] = value
            # Publish only a fully built mapping so a failure cannot leave a
            # partially filled cache behind.
            self._custom_properties = properties
        return self._custom_properties

    # ------------------------------------------------------------------
    # Writing
    # ------------------------------------------------------------------

    def _require_document(self) -> Document:
        """Return the DOM document owning the RDF root.

        Raises:
            XmpDocumentError: if the RDF root is not attached to a document.
        """
        doc = self.rdf_root.ownerDocument
        if doc is None:
            raise XmpDocumentError("XMP Document is None")
        return doc

    def _ensure_namespace_declared(self, element: XmlElement, prefix: str, namespace: str) -> None:
        """
        Make sure ``prefix`` is bound to ``namespace`` at ``element``.

        minidom does not generate ``xmlns:*`` attributes for nodes built with
        ``createElementNS``; without an explicit declaration in scope the
        serialised packet would use an unbound prefix and fail to parse again.
        """
        attr_name = f"xmlns:{prefix}"
        node: Any = element
        while node is not None and node.nodeType == node.ELEMENT_NODE:
            if node.hasAttribute(attr_name):
                if node.getAttribute(attr_name) == namespace:
                    return  # already bound correctly by the element or an ancestor
                break  # bound to a different namespace: override locally
            node = node.parentNode
        element.setAttributeNS(self._XMLNS_NAMESPACE, attr_name, namespace)

    def _get_or_create_description(self, about_uri: str = "") -> XmlElement:
        """Get or create an rdf:Description element with the given about URI."""
        existing = next(self._iter_descriptions(about_uri), None)
        if existing is not None:
            return existing

        doc = self._require_document()
        desc = doc.createElementNS(RDF_NAMESPACE, "rdf:Description")
        desc.setAttributeNS(RDF_NAMESPACE, "rdf:about", about_uri)
        self.rdf_root.appendChild(desc)
        self._ensure_namespace_declared(desc, "rdf", RDF_NAMESPACE)
        return desc

    def _clear_cache_entry(self, namespace: str, name: str) -> None:
        """Remove a cached value for a given namespace/name if present."""
        self.cache.get(namespace, {}).pop(name, None)

    def _remove_property(self, namespace: str, name: str) -> None:
        """
        Remove every occurrence of ``namespace:name`` that the getters would read.

        All descriptions with an empty ``rdf:about`` are cleaned, not just the
        first one; otherwise stale values kept in another description would
        be merged into the result of the next read.
        """
        for desc in self._iter_descriptions(""):
            for elem in list(desc.getElementsByTagNameNS(namespace, name)):
                # Matches may be nested below the description, so detach each
                # one from its actual parent; removeChild() on the description
                # fails for nodes that are not its direct children.
                parent = elem.parentNode
                if parent is not None:
                    parent.removeChild(elem)
            attr = desc.getAttributeNodeNS(namespace, name)
            if attr is not None:
                desc.removeAttributeNode(attr)

    def _append_property(self, namespace: str, name: str, content: Any) -> None:
        """Append a new ``namespace:name`` element wrapping ``content`` to the default description."""
        desc = self._get_or_create_description()
        prefix = self._get_namespace_prefix(namespace)
        elem = self._require_document().createElementNS(namespace, f"{prefix}:{name}")
        elem.appendChild(content)
        desc.appendChild(elem)
        self._ensure_namespace_declared(elem, prefix, namespace)

    def _build_array(
        self, container_tag: str, values: list[Any], languages: Optional[list[Any]] = None
    ) -> XmlElement:
        """Build an ``rdf:<container_tag>`` element holding one ``rdf:li`` per value.

        If ``languages`` is given, it must parallel ``values`` and supplies the
        ``xml:lang`` attribute of each item.
        """
        doc = self._require_document()
        container = doc.createElementNS(RDF_NAMESPACE, f"rdf:{container_tag}")
        for index, value in enumerate(values):
            li = doc.createElementNS(RDF_NAMESPACE, "rdf:li")
            if languages is not None:
                li.setAttribute("xml:lang", languages[index])
            li.appendChild(doc.createTextNode(str(value)))
            container.appendChild(li)
        return container

    def _set_array_values(
        self,
        namespace: str,
        name: str,
        container_tag: str,
        values: Optional[list[Any]],
        languages: Optional[list[Any]] = None,
    ) -> None:
        """Replace ``namespace:name`` with an RDF array, or just remove it if ``values`` is empty."""
        self._clear_cache_entry(namespace, name)
        self._get_or_create_description()
        self._remove_property(namespace, name)

        if values:
            container = self._build_array(container_tag, values, languages)
            self._append_property(namespace, name, container)
            self._ensure_namespace_declared(container, "rdf", RDF_NAMESPACE)

        self._update_stream()

    def _set_single_value(self, namespace: str, name: str, value: Optional[str]) -> None:
        """Set or remove a single metadata value."""
        self._clear_cache_entry(namespace, name)
        self._get_or_create_description()
        self._remove_property(namespace, name)

        if value is not None:
            text_node = self._require_document().createTextNode(str(value))
            self._append_property(namespace, name, text_node)

        self._update_stream()

    def _set_bag_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove bag values (unordered array)."""
        self._set_array_values(namespace, name, "Bag", values)

    def _set_seq_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove sequence values (ordered array)."""
        self._set_array_values(namespace, name, "Seq", values)

    def _set_langalt_values(self, namespace: str, name: str, values: Optional[dict[str, str]]) -> None:
        """Set or remove language alternative values."""
        if values:
            self._set_array_values(namespace, name, "Alt", list(values.values()), list(values.keys()))
        else:
            self._set_array_values(namespace, name, "Alt", None)

    def _get_namespace_prefix(self, namespace: str) -> str:
        """Get the appropriate namespace prefix for a given namespace URI."""
        return _NAMESPACE_PREFIX_MAP.get(namespace, "unknown")

    def _update_stream(self) -> None:
        """Update the stream with the current XML content."""
        xml_data = self._require_document().toxml(encoding="utf-8")
        self.stream.set_data(xml_data)