Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/xmp.py: 34%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

467 statements  

1""" 

2Anything related to Extensible Metadata Platform (XMP) metadata. 

3 

4https://en.wikipedia.org/wiki/Extensible_Metadata_Platform 

5""" 

6 

7import datetime 

8import decimal 

9import re 

10from collections.abc import Iterator 

11from typing import ( 

12 Any, 

13 Callable, 

14 Optional, 

15 TypeVar, 

16 Union, 

17 cast, 

18) 

19from xml.dom.expatbuilder import ExpatBuilderNS 

20from xml.dom.minidom import Document 

21from xml.dom.minidom import Element as XmlElement 

22from xml.parsers.expat import ExpatError, XMLParserType 

23 

24from ._protocols import XmpInformationProtocol 

25from ._utils import StreamType, deprecate_with_replacement, deprecation_no_replacement 

26from .errors import PdfReadError, XmpDocumentError 

27from .generic import ContentStream, PdfObject 

28 

# Namespace URIs of the schemas this module reads and writes.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# The PDFX namespace holds "custom metadata": arbitrary key/value properties
# without any documented meaning. It is described in
# https://github.com/adobe/xmp-docs/raw/master/XMPSpecifications/XMPSpecificationPart3.pdf
#
# The element name is the key and the element content is the value. To make
# a key a valid XML identifier, every invalid character is replaced by
# \u2182 (ROMAN NUMERAL TEN THOUSAND) followed by the four-digit hexadecimal
# code point of the original character, so the key "my car" is stored as
# "my\u21820020car".
#
# New documents should avoid the pdfx namespace; Adobe's XMP documentation
# ("Extensibility of Schemas") suggests defining a custom schema with
# meaningful element names instead.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# PDF/A identification schema.
PDFAID_NAMESPACE = "http://www.aiim.org/pdfa/ns/id/"

# Namespace URI -> prefix used when new elements are created.
_NAMESPACE_PREFIX_MAP: dict[str, str] = {
    DC_NAMESPACE: "dc",
    XMP_NAMESPACE: "xmp",
    PDF_NAMESPACE: "pdf",
    XMPMM_NAMESPACE: "xmpMM",
    PDFAID_NAMESPACE: "pdfaid",
    PDFX_NAMESPACE: "pdfx",
}

67 

# Pattern for the (truncatable) date format used by XMP:
# YYYY[-MM[-DD[Thh:mm[:ss[.s...]]TZD]]].
# The fractional-seconds separator is an escaped, literal dot; an unescaped
# "." would accept any character there (for example "05x25") and hand a
# non-numeric "second" group to the caller.
iso8601 = re.compile(
    r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """,
    re.VERBOSE,
)

86 

87 

# Generic type variable used by the pass-through converter.
K = TypeVar("K")

# Smallest XMP packet that declares every namespace this module can write:
# a single empty rdf:Description inside rdf:RDF, wrapped in the xpacket
# processing instructions.
_MINIMAL_XMP = f"""<?xpacket begin="\ufeff" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pypdf">
    <rdf:RDF xmlns:rdf="{RDF_NAMESPACE}">
        <rdf:Description rdf:about=""
            xmlns:dc="{DC_NAMESPACE}"
            xmlns:xmp="{XMP_NAMESPACE}"
            xmlns:pdf="{PDF_NAMESPACE}"
            xmlns:xmpMM="{XMPMM_NAMESPACE}"
            xmlns:pdfaid="{PDFAID_NAMESPACE}"
            xmlns:pdfx="{PDFX_NAMESPACE}">
        </rdf:Description>
    </rdf:RDF>
</x:xmpmeta>
<?xpacket end="r"?>"""

105 

106 

def _identity(value: K) -> K:
    """Return *value* unchanged; the default, no-op converter."""
    return value

109 

110 

def _converter_date(value: str) -> datetime.datetime:
    """
    Parse an XMP date string into a naive datetime expressed in UTC.

    Missing components default to the start of the period (month 1, day 1,
    00:00:00). A missing time zone designator is treated as "Z".

    Args:
        value: Date string matching the module-level ``iso8601`` pattern.

    Returns:
        A naive ``datetime.datetime`` with the time zone offset already applied.

    Raises:
        ValueError: If the string does not match the expected format.

    """
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")
    year = int(matches.group("year"))
    month = int(matches.group("month") or "1")
    day = int(matches.group("day") or "1")
    hour = int(matches.group("hour") or "0")
    minute = int(matches.group("minute") or "0")

    # Split the (possibly fractional) seconds into whole seconds and microseconds.
    second = decimal.Decimal(matches.group("second") or "0")
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    microseconds = int((second - whole_seconds) * 1_000_000)

    dt = datetime.datetime(year, month, day, hour, minute, int(whole_seconds), microseconds)

    tzd = matches.group("tzd") or "Z"
    if tzd != "Z":
        # Read the sign from the leading character. Deriving it from the parsed
        # hour value loses it for offsets such as "+00:30", because int("+00") == 0.
        sign = -1 if tzd[0] == "-" else 1
        tzd_hours, tzd_minutes = (int(x) for x in tzd[1:].split(":"))
        offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
        # Local time = UTC + offset, hence UTC = local time - offset.
        dt = dt - sign * offset
    return dt

136 

137 

138def _format_datetime_utc(value: datetime.datetime) -> str: 

139 """Format a datetime as UTC with trailing 'Z'. 

140 

141 - If the input is timezone-aware, convert to UTC first. 

142 - If naive, assume UTC. 

143 """ 

144 if value.tzinfo is not None and value.utcoffset() is not None: 

145 value = value.astimezone(datetime.timezone.utc) 

146 

147 value = value.replace(tzinfo=None) 

148 return value.strftime("%Y-%m-%dT%H:%M:%S.%fZ") 

149 

150 

def _generic_get(
    element: XmlElement, self: "XmpInformation", list_type: str, converter: Callable[[Any], Any] = _identity
) -> Optional[list[str]]:
    """
    Collect the converted text of every rdf:li found in RDF containers.

    Args:
        element: Element searched for ``rdf:<list_type>`` containers.
        self: Object providing ``_get_text`` to extract the text of an item.
        list_type: Local name of the RDF container, e.g. "Bag" or "Seq".
        converter: Callable applied to the text of each item.

    Returns:
        The converted items of all matching containers, or None if *element*
        holds no container of the requested type.

    """
    containers = element.getElementsByTagNameNS(RDF_NAMESPACE, list_type)
    if not containers:
        return None
    return [
        converter(self._get_text(item))
        for container in containers
        for item in container.getElementsByTagNameNS(RDF_NAMESPACE, "li")
    ]

164 

165 

166class _XmpBuilder(ExpatBuilderNS): 

167 """ 

168 Custom XML parser denying all entity declarations. 

169 

170 This is a stripped down and typed version inspired by what *defusedxml* does. 

171 

172 Why do we need this? The default limits of *libexpat* used by Python only block exponential entity expansion, 

173 but not cases like quadratic entity expansion which can still cause quite some memory usage. 

174 """ 

175 

176 def custom_entity_declaration_handler( 

177 self, 

178 entity_name: str, 

179 is_parameter_entity: bool, 

180 value: Optional[str], 

181 base: Optional[str], 

182 system_id: str, 

183 public_id: Optional[str], 

184 notation_name: Optional[str], 

185 ) -> None: 

186 raise ExpatError(f"Forbidden entities: {entity_name!r}") 

187 

188 def install(self, parser: XMLParserType) -> None: 

189 super().install(parser) 

190 

191 parser.EntityDeclHandler = self.custom_entity_declaration_handler 

192 

193 

class XmpInformation(XmpInformationProtocol, PdfObject):
    """
    An object that represents Extensible Metadata Platform (XMP) metadata.
    Usually accessed by :py:attr:`xmp_metadata()<pypdf.PdfReader.xmp_metadata>`.

    Raises:
        PdfReadError: if XML is invalid

    """

    def __init__(self, stream: ContentStream) -> None:
        """
        Parse the XMP packet stored in *stream*.

        Args:
            stream: Stream whose data is the XMP XML document.

        Raises:
            PdfReadError: If the data cannot be parsed or contains no
                rdf:RDF element.

        """
        self.stream = stream
        try:
            data = self.stream.get_data()
            doc_root: Document = _XmpBuilder().parseString(data)
        except (AttributeError, ExpatError) as e:
            raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
        rdf_roots = doc_root.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")
        if not rdf_roots:
            # Well-formed XML without an rdf:RDF element is still not usable
            # XMP; report it like any other invalid document instead of
            # failing with a bare IndexError.
            raise PdfReadError("XML in XmpInformation was invalid: no rdf:RDF element found")
        self.rdf_root: XmlElement = rdf_roots[0]
        # Parsed values, keyed by namespace URI and then by property name.
        self.cache: dict[Any, Any] = {}

    @classmethod
    def create(cls) -> "XmpInformation":
        """
        Create a new XmpInformation object with minimal structure.

        Returns:
            A new XmpInformation instance with empty metadata fields.
        """
        stream = ContentStream(None, None)
        stream.set_data(_MINIMAL_XMP.encode("utf-8"))
        return cls(stream)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Deprecated: write the underlying XMP stream object to *stream*."""
        deprecate_with_replacement(
            "XmpInformation.write_to_stream",
            "PdfWriter.xmp_metadata",
            "6.0.0"
        )
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        self.stream.write_to_stream(stream)

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        """
        Yield the attribute and the elements named *name* in *namespace*.

        Only rdf:Description elements whose rdf:about equals *about_uri* are
        searched. A matching attribute is yielded before matching elements.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                yield from desc.getElementsByTagNameNS(namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        """
        Yield all attributes and direct children belonging to *namespace*.

        Only rdf:Description elements whose rdf:about equals *about_uri* are
        considered.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr and attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _get_text(self, element: XmlElement) -> str:
        """Return the concatenated data of the direct text children of *element*."""
        return "".join(
            child.data for child in element.childNodes if child.nodeType == child.TEXT_NODE
        )

    def _get_single_value(
        self,
        namespace: str,
        name: str,
        converter: Callable[[str], Any] = _identity,
    ) -> Optional[Any]:
        """
        Return the first value of a simple property, or None if it is absent.

        The value is read from an attribute or from element text, passed
        through *converter*, and stored in the cache.
        """
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cached
        value = None
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                value = element.nodeValue
            else:
                value = self._get_text(element)
            break  # only the first match is used
        if value is not None:
            value = converter(value)
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = value
        return value

    def _getter_bag(self, namespace: str, name: str) -> Optional[list[str]]:
        """Return the items of an unordered array (rdf:Bag) property."""
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cast(list[str], cached)
        retval: list[str] = []
        for element in self.get_element("", namespace, name):
            if (bags := _generic_get(element, self, list_type="Bag")) is not None:
                retval.extend(bags)
            else:
                # No container present: treat the element text as a single item.
                retval.append(self._get_text(element))
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = retval
        return retval

    def _get_seq_values(
        self,
        namespace: str,
        name: str,
        converter: Callable[[Any], Any] = _identity,
    ) -> Optional[list[Any]]:
        """Return the converted items of an ordered array (rdf:Seq) property."""
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cast(list[Any], cached)
        retval: list[Any] = []
        for element in self.get_element("", namespace, name):
            if (seqs := _generic_get(element, self, list_type="Seq", converter=converter)) is not None:
                retval.extend(seqs)
            elif (bags := _generic_get(element, self, list_type="Bag")) is not None:
                # See issue at https://github.com/py-pdf/pypdf/issues/3324
                # Some applications violate the XMP metadata standard regarding `dc:creator` which should
                # be an "ordered array" and thus a sequence, but use an unordered array (bag) instead.
                # This seems to stem from the fact that the original Dublin Core specification does indeed
                # use bags or direct values, while PDFs are expected to follow the XMP standard and ignore
                # the plain Dublin Core variant. For this reason, add a fallback here to deal with such
                # issues accordingly.
                retval.extend(bags)
            else:
                retval.append(converter(self._get_text(element)))
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = retval
        return retval

    def _get_langalt_values(self, namespace: str, name: str) -> Optional[dict[Any, Any]]:
        """Return a language alternative (rdf:Alt) property as a dict keyed by xml:lang."""
        cached = self.cache.get(namespace, {}).get(name)
        if cached:
            return cast(dict[Any, Any], cached)
        retval: dict[Any, Any] = {}
        for element in self.get_element("", namespace, name):
            alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
            if len(alts):
                for alt in alts:
                    for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval[item.getAttribute("xml:lang")] = self._get_text(item)
            else:
                # Plain text without rdf:Alt is stored as the default language.
                retval["x-default"] = self._get_text(element)
        ns_cache = self.cache.setdefault(namespace, {})
        ns_cache[name] = retval
        return retval

    @property
    def dc_contributor(self) -> Optional[list[str]]:
        """Contributors to the resource (other than the authors)."""
        return self._getter_bag(DC_NAMESPACE, "contributor")

    @dc_contributor.setter
    def dc_contributor(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "contributor", values)

    @property
    def dc_coverage(self) -> Optional[str]:
        """Text describing the extent or scope of the resource."""
        return self._get_single_value(DC_NAMESPACE, "coverage")

    @dc_coverage.setter
    def dc_coverage(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "coverage", value)

    @property
    def dc_creator(self) -> Optional[list[str]]:
        """A sorted array of names of the authors of the resource, listed in order of precedence."""
        return self._get_seq_values(DC_NAMESPACE, "creator")

    @dc_creator.setter
    def dc_creator(self, values: Optional[list[str]]) -> None:
        self._set_seq_values(DC_NAMESPACE, "creator", values)

    @property
    def dc_date(self) -> Optional[list[datetime.datetime]]:
        """A sorted array of dates of significance to the resource. The dates and times are in UTC."""
        return self._get_seq_values(DC_NAMESPACE, "date", _converter_date)

    @dc_date.setter
    def dc_date(self, values: Optional[list[Union[str, datetime.datetime]]]) -> None:
        if values is None:
            self._set_seq_values(DC_NAMESPACE, "date", None)
        else:
            # datetime objects are serialized as UTC; anything else is stored as given.
            date_strings = [
                _format_datetime_utc(value) if isinstance(value, datetime.datetime) else str(value)
                for value in values
            ]
            self._set_seq_values(DC_NAMESPACE, "date", date_strings)

    @property
    def dc_description(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the content of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "description")

    @dc_description.setter
    def dc_description(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "description", values)

    @property
    def dc_format(self) -> Optional[str]:
        """The mime-type of the resource."""
        return self._get_single_value(DC_NAMESPACE, "format")

    @dc_format.setter
    def dc_format(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "format", value)

    @property
    def dc_identifier(self) -> Optional[str]:
        """Unique identifier of the resource."""
        return self._get_single_value(DC_NAMESPACE, "identifier")

    @dc_identifier.setter
    def dc_identifier(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "identifier", value)

    @property
    def dc_language(self) -> Optional[list[str]]:
        """An unordered array specifying the languages used in the resource."""
        return self._getter_bag(DC_NAMESPACE, "language")

    @dc_language.setter
    def dc_language(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "language", values)

    @property
    def dc_publisher(self) -> Optional[list[str]]:
        """An unordered array of publisher names."""
        return self._getter_bag(DC_NAMESPACE, "publisher")

    @dc_publisher.setter
    def dc_publisher(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "publisher", values)

    @property
    def dc_relation(self) -> Optional[list[str]]:
        """An unordered array of text descriptions of relationships to other documents."""
        return self._getter_bag(DC_NAMESPACE, "relation")

    @dc_relation.setter
    def dc_relation(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "relation", values)

    @property
    def dc_rights(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the rights the user has to this resource."""
        return self._get_langalt_values(DC_NAMESPACE, "rights")

    @dc_rights.setter
    def dc_rights(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "rights", values)

    @property
    def dc_source(self) -> Optional[str]:
        """Unique identifier of the work from which this resource was derived."""
        return self._get_single_value(DC_NAMESPACE, "source")

    @dc_source.setter
    def dc_source(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "source", value)

    @property
    def dc_subject(self) -> Optional[list[str]]:
        """An unordered array of descriptive phrases or keywords that specify the topic of the content."""
        return self._getter_bag(DC_NAMESPACE, "subject")

    @dc_subject.setter
    def dc_subject(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "subject", values)

    @property
    def dc_title(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of the title of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "title")

    @dc_title.setter
    def dc_title(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "title", values)

    @property
    def dc_type(self) -> Optional[list[str]]:
        """An unordered array of textual descriptions of the document type."""
        return self._getter_bag(DC_NAMESPACE, "type")

    @dc_type.setter
    def dc_type(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "type", values)

    @property
    def pdf_keywords(self) -> Optional[str]:
        """An unformatted text string representing document keywords."""
        return self._get_single_value(PDF_NAMESPACE, "Keywords")

    @pdf_keywords.setter
    def pdf_keywords(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Keywords", value)

    @property
    def pdf_pdfversion(self) -> Optional[str]:
        """The PDF file version, for example 1.0 or 1.3."""
        return self._get_single_value(PDF_NAMESPACE, "PDFVersion")

    @pdf_pdfversion.setter
    def pdf_pdfversion(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "PDFVersion", value)

    @property
    def pdf_producer(self) -> Optional[str]:
        """The name of the tool that saved the document as a PDF."""
        return self._get_single_value(PDF_NAMESPACE, "Producer")

    @pdf_producer.setter
    def pdf_producer(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Producer", value)

    @property
    def xmp_create_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was originally created. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "CreateDate", _converter_date)

    @xmp_create_date.setter
    def xmp_create_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_date_value("CreateDate", value)

    @property
    def xmp_modify_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was last modified. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "ModifyDate", _converter_date)

    @xmp_modify_date.setter
    def xmp_modify_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_date_value("ModifyDate", value)

    @property
    def xmp_metadata_date(self) -> Optional[datetime.datetime]:
        """The date and time that any metadata for this resource was last changed. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "MetadataDate", _converter_date)

    @xmp_metadata_date.setter
    def xmp_metadata_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_date_value("MetadataDate", value)

    @property
    def xmp_creator_tool(self) -> Optional[str]:
        """The name of the first known tool used to create the resource."""
        return self._get_single_value(XMP_NAMESPACE, "CreatorTool")

    @xmp_creator_tool.setter
    def xmp_creator_tool(self, value: Optional[str]) -> None:
        self._set_single_value(XMP_NAMESPACE, "CreatorTool", value)

    @property
    def xmpmm_document_id(self) -> Optional[str]:
        """The common identifier for all versions and renditions of this resource."""
        return self._get_single_value(XMPMM_NAMESPACE, "DocumentID")

    @xmpmm_document_id.setter
    def xmpmm_document_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "DocumentID", value)

    @property
    def xmpmm_instance_id(self) -> Optional[str]:
        """An identifier for a specific incarnation of a document, updated each time a file is saved."""
        return self._get_single_value(XMPMM_NAMESPACE, "InstanceID")

    @xmpmm_instance_id.setter
    def xmpmm_instance_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "InstanceID", value)

    @property
    def pdfaid_part(self) -> Optional[str]:
        """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3)."""
        return self._get_single_value(PDFAID_NAMESPACE, "part")

    @pdfaid_part.setter
    def pdfaid_part(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "part", value)

    @property
    def pdfaid_conformance(self) -> Optional[str]:
        """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U')."""
        return self._get_single_value(PDFAID_NAMESPACE, "conformance")

    @pdfaid_conformance.setter
    def pdfaid_conformance(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "conformance", value)

    @property
    def custom_properties(self) -> dict[Any, Any]:
        """
        Retrieve custom metadata properties defined in the undocumented pdfx
        metadata schema.

        Returns:
            A dictionary of key/value items for custom metadata properties.

        """
        if not hasattr(self, "_custom_properties"):
            # Build into a local dict and publish it only once complete, so a
            # failure while decoding a key cannot leave a partial result cached.
            properties: dict[Any, Any] = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                key = node.localName
                while True:
                    # Undo the escaping used by the pdfx namespace: \u2182
                    # followed by four hex digits encodes one character.
                    idx = key.find("\u2182")
                    if idx == -1:
                        break
                    key = (
                        key[:idx]
                        + chr(int(key[idx + 1 : idx + 5], base=16))
                        + key[idx + 5 :]
                    )
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._get_text(node)
                properties[key] = value
            self._custom_properties = properties
        return self._custom_properties

    def _owner_document(self) -> Document:
        """
        Return the document owning the RDF root.

        Raises:
            XmpDocumentError: If the RDF root has no owner document.

        """
        doc = self.rdf_root.ownerDocument
        if doc is None:
            raise XmpDocumentError("XMP Document is None")
        return doc

    def _get_or_create_description(self, about_uri: str = "") -> XmlElement:
        """Get or create an rdf:Description element with the given about URI."""
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                return desc

        doc = self._owner_document()
        desc = doc.createElementNS(RDF_NAMESPACE, "rdf:Description")
        desc.setAttributeNS(RDF_NAMESPACE, "rdf:about", about_uri)
        self.rdf_root.appendChild(desc)
        return desc

    def _clear_cache_entry(self, namespace: str, name: str) -> None:
        """Remove a cached value for a given namespace/name if present."""
        ns_cache = self.cache.get(namespace)
        if ns_cache and name in ns_cache:
            del ns_cache[name]

    @staticmethod
    def _remove_existing_elements(desc: XmlElement, namespace: str, name: str) -> None:
        """Remove every element named *name* in *namespace* found below *desc*."""
        # getElementsByTagNameNS returns all matching descendants, not only
        # direct children. Each node is therefore detached through its own
        # parent; desc.removeChild() raises NotFoundErr for nested matches.
        for elem in list(desc.getElementsByTagNameNS(namespace, name)):
            parent = elem.parentNode
            if parent is not None:
                parent.removeChild(elem)

    def _set_date_value(self, name: str, value: Optional[datetime.datetime]) -> None:
        """Set or remove a date property of the XMP namespace, serialized as UTC."""
        date_str = _format_datetime_utc(value) if value else None
        self._set_single_value(XMP_NAMESPACE, name, date_str)

    def _set_single_value(self, namespace: str, name: str, value: Optional[str]) -> None:
        """Set or remove a single metadata value."""
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()

        self._remove_existing_elements(desc, namespace, name)

        # The property may also be stored in attribute form.
        if existing_attr := desc.getAttributeNodeNS(namespace, name):
            desc.removeAttributeNode(existing_attr)

        if value is not None:
            doc = self._owner_document()
            prefix = self._get_namespace_prefix(namespace)
            elem = doc.createElementNS(namespace, f"{prefix}:{name}")
            elem.appendChild(doc.createTextNode(str(value)))
            desc.appendChild(elem)

        self._update_stream()

    def _set_array_values(
        self, namespace: str, name: str, container_tag: str, values: Optional[list[str]]
    ) -> None:
        """
        Replace a property by an RDF array holding *values*.

        Args:
            namespace: Namespace URI of the property.
            name: Local name of the property.
            container_tag: Qualified name of the container, e.g. "rdf:Bag".
            values: Items to store; None or an empty list removes the property.

        """
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()

        self._remove_existing_elements(desc, namespace, name)

        if values:
            doc = self._owner_document()
            prefix = self._get_namespace_prefix(namespace)
            elem = doc.createElementNS(namespace, f"{prefix}:{name}")
            container = doc.createElementNS(RDF_NAMESPACE, container_tag)

            for value in values:
                li = doc.createElementNS(RDF_NAMESPACE, "rdf:li")
                li.appendChild(doc.createTextNode(str(value)))
                container.appendChild(li)

            elem.appendChild(container)
            desc.appendChild(elem)

        self._update_stream()

    def _set_bag_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove bag values (unordered array)."""
        self._set_array_values(namespace, name, "rdf:Bag", values)

    def _set_seq_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove sequence values (ordered array)."""
        self._set_array_values(namespace, name, "rdf:Seq", values)

    def _set_langalt_values(self, namespace: str, name: str, values: Optional[dict[str, str]]) -> None:
        """Set or remove language alternative values."""
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()

        self._remove_existing_elements(desc, namespace, name)

        if values:
            doc = self._owner_document()
            prefix = self._get_namespace_prefix(namespace)
            elem = doc.createElementNS(namespace, f"{prefix}:{name}")
            alt = doc.createElementNS(RDF_NAMESPACE, "rdf:Alt")

            for lang, value in values.items():
                li = doc.createElementNS(RDF_NAMESPACE, "rdf:li")
                li.setAttribute("xml:lang", lang)
                li.appendChild(doc.createTextNode(str(value)))
                alt.appendChild(li)

            elem.appendChild(alt)
            desc.appendChild(elem)

        self._update_stream()

    def _get_namespace_prefix(self, namespace: str) -> str:
        """Get the appropriate namespace prefix for a given namespace URI."""
        return _NAMESPACE_PREFIX_MAP.get(namespace, "unknown")

    def _update_stream(self) -> None:
        """Update the stream with the current XML content."""
        doc = self._owner_document()
        xml_data = doc.toxml(encoding="utf-8")
        self.stream.set_data(xml_data)