Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/xmp.py: 33%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

460 statements  

1""" 

2Anything related to Extensible Metadata Platform (XMP) metadata. 

3 

4https://en.wikipedia.org/wiki/Extensible_Metadata_Platform 

5""" 

6 

7import datetime 

8import decimal 

9import re 

10from collections.abc import Iterator 

11from typing import ( 

12 Any, 

13 Callable, 

14 Optional, 

15 TypeVar, 

16 Union, 

17) 

18from xml.dom.minidom import Document, parseString 

19from xml.dom.minidom import Element as XmlElement 

20from xml.parsers.expat import ExpatError 

21 

22from ._protocols import XmpInformationProtocol 

23from ._utils import StreamType, deprecate_with_replacement, deprecation_no_replacement 

24from .errors import PdfReadError, XmpDocumentError 

25from .generic import ContentStream, PdfObject 

26 

# Namespace URIs of the XML vocabularies this module reads and writes.
# RDF: the rdf:RDF root, rdf:Description containers and rdf:Bag/Seq/Alt/li arrays.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
# Dublin Core properties (exposed as the ``dc_*`` attributes).
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
# XMP basic properties (exposed as the ``xmp_*`` attributes).
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
# Adobe PDF properties (exposed as the ``pdf_*`` attributes).
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
# XMP media management properties (exposed as the ``xmpmm_*`` attributes).
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# What is the PDFX namespace, you might ask?
# It's documented here: https://github.com/adobe/xmp-docs/raw/master/XMPSpecifications/XMPSpecificationPart3.pdf
# This namespace is used to place "custom metadata"
# properties, which are arbitrary metadata properties with no semantic or
# documented meaning.
#
# Elements in the namespace are key/value-style storage,
# where the element name is the key and the content is the value. The keys
# are transformed into valid XML identifiers by substituting an invalid
# identifier character with \u2182 followed by the unicode hex ID of the
# original character. A key like "my car" is therefore "my\u21820020car".
#
# \u2182 is the unicode character \u{ROMAN NUMERAL TEN THOUSAND}
#
# The pdfx namespace should be avoided.
# A custom data schema and sensical XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP under "Extensibility of
# Schemas".
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# PDF/A identification properties (exposed as the ``pdfaid_*`` attributes).
PDFAID_NAMESPACE = "http://www.aiim.org/pdfa/ns/id/"

# Internal mapping of namespace URI → prefix.
# The prefix is used to build the qualified name ("prefix:name") of newly
# created property elements; these prefixes are the ones declared by the
# minimal XMP template defined further down in this module.
_NAMESPACE_PREFIX_MAP = {
    DC_NAMESPACE: "dc",
    XMP_NAMESPACE: "xmp",
    PDF_NAMESPACE: "pdf",
    XMPMM_NAMESPACE: "xmpMM",
    PDFAID_NAMESPACE: "pdfaid",
    PDFX_NAMESPACE: "pdfx",
}

65 

# Pattern for the date/time notations accepted in XMP values:
# YYYY, YYYY-MM, YYYY-MM-DD and YYYY-MM-DDThh:mm[:ss[.fraction]]TZD.
# Every part after the year is optional; named groups that did not take part
# in the match are ``None``.
#
# The pattern is a raw string and the fractional-seconds separator is an
# escaped literal dot: an unescaped ``.`` would accept any character there
# (e.g. "07x25"), which later fails when the seconds are converted to a number.
iso8601 = re.compile(
    r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
    """,
    re.VERBOSE,
)

84 

85 

# Generic type variable used to type the identity converter (same type in and out).
K = TypeVar("K")

# Minimal XMP template: an XMP packet holding a single empty rdf:Description
# (rdf:about="") that declares the prefixes of all namespaces this module can
# write, so properties can be added without further namespace declarations.
_MINIMAL_XMP = f"""<?xpacket begin="\ufeff" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pypdf">
 <rdf:RDF xmlns:rdf="{RDF_NAMESPACE}">
 <rdf:Description rdf:about=""
 xmlns:dc="{DC_NAMESPACE}"
 xmlns:xmp="{XMP_NAMESPACE}"
 xmlns:pdf="{PDF_NAMESPACE}"
 xmlns:xmpMM="{XMPMM_NAMESPACE}"
 xmlns:pdfaid="{PDFAID_NAMESPACE}"
 xmlns:pdfx="{PDFX_NAMESPACE}">
 </rdf:Description>
 </rdf:RDF>
</x:xmpmeta>
<?xpacket end="r"?>"""

103 

104 

105def _identity(value: K) -> K: 

106 return value 

107 

108 

def _converter_date(value: str) -> datetime.datetime:
    """
    Convert an XMP date string into a naive datetime expressed in UTC.

    Missing components default to the start of the period (month and day 1,
    time 00:00:00); a missing time zone designator is treated as UTC.

    Args:
        value: The date string, e.g. ``2021-03-04T05:06:07.25+01:00``.

    Returns:
        The corresponding point in time as a naive UTC datetime.

    Raises:
        ValueError: If ``value`` does not start with a four-digit year, or
            if the parsed components do not form a valid date.

    """
    matches = iso8601.match(value)
    if matches is None:
        raise ValueError(f"Invalid date format: {value}")
    year = int(matches.group("year"))
    month = int(matches.group("month") or "1")
    day = int(matches.group("day") or "1")
    hour = int(matches.group("hour") or "0")
    minute = int(matches.group("minute") or "0")

    # Split the (possibly fractional) seconds into whole seconds and
    # microseconds, using Decimal to avoid binary floating point artefacts.
    second = decimal.Decimal(matches.group("second") or "0")
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    microseconds = int((second - whole_seconds) * 1_000_000)

    dt = datetime.datetime(
        year, month, day, hour, minute, int(whole_seconds), microseconds
    )

    tzd = matches.group("tzd") or "Z"
    if tzd != "Z":
        # The sign is read from the designator itself rather than from the
        # parsed hour value: for "+00:30" the hours are 0, which carries no
        # sign, so deriving the direction from the integer would shift the
        # time the wrong way.
        tzd_hours, tzd_minutes = (int(part) for part in tzd[1:].split(":"))
        offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
        # Local time is UTC + offset, hence UTC is local time - offset.
        dt = dt - offset if tzd[0] == "+" else dt + offset
    return dt

134 

135 

136def _format_datetime_utc(value: datetime.datetime) -> str: 

137 """Format a datetime as UTC with trailing 'Z'. 

138 

139 - If the input is timezone-aware, convert to UTC first. 

140 - If naive, assume UTC. 

141 """ 

142 if value.tzinfo is not None and value.utcoffset() is not None: 

143 value = value.astimezone(datetime.timezone.utc) 

144 

145 value = value.replace(tzinfo=None) 

146 return value.strftime("%Y-%m-%dT%H:%M:%S.%fZ") 

147 

148 

def _generic_get(
    element: XmlElement, self: "XmpInformation", list_type: str, converter: Callable[[Any], Any] = _identity
) -> Optional[list[str]]:
    """
    Collect the items of all RDF arrays of one kind found below ``element``.

    Args:
        element: The XML element to search in.
        self: The XmpInformation instance whose ``_get_text`` is used to
            read the text of each item.
        list_type: Local name of the RDF container to look for
            (the callers in this module pass "Bag" or "Seq").
        converter: Applied to the text of every ``rdf:li`` item.

    Returns:
        The converted items of every matching container in document order,
        or None if ``element`` contains no such container at all.

    """
    containers = element.getElementsByTagNameNS(RDF_NAMESPACE, list_type)
    if not containers:
        return None
    return [
        converter(self._get_text(entry))
        for container in containers
        for entry in container.getElementsByTagNameNS(RDF_NAMESPACE, "li")
    ]

162 

163 

class XmpInformation(XmpInformationProtocol, PdfObject):
    """
    An object that represents Extensible Metadata Platform (XMP) metadata.
    Usually accessed by :py:attr:`xmp_metadata()<pypdf.PdfReader.xmp_metadata>`.

    Property values are read lazily from the parsed XML and memoised in
    ``self.cache`` (namespace → property name → value). Every setter drops
    the affected cache entry, rewrites the XML and stores the serialised
    document back into ``self.stream``.

    Raises:
        PdfReadError: if XML is invalid

    """

    # One escaped character inside a pdfx property name: U+2182 followed by
    # the four hexadecimal digits of the original code point.
    _PDFX_ESCAPE = re.compile("\u2182([0-9A-Fa-f]{4})")

    def __init__(self, stream: ContentStream) -> None:
        self.stream = stream
        try:
            data = self.stream.get_data()
            doc_root: Document = parseString(data)  # noqa: S318
        except (AttributeError, ExpatError) as e:
            raise PdfReadError(f"XML in XmpInformation was invalid: {e}") from e
        rdf_roots = doc_root.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")
        if not rdf_roots:
            # Well-formed XML without an rdf:RDF element is not usable XMP;
            # report it like any other invalid input instead of letting a
            # bare IndexError escape.
            raise PdfReadError(
                "XML in XmpInformation was invalid: no rdf:RDF element found"
            )
        self.rdf_root: XmlElement = rdf_roots[0]
        self.cache: dict[Any, Any] = {}

    @classmethod
    def create(cls) -> "XmpInformation":
        """
        Create a new XmpInformation object with minimal structure.

        Returns:
            A new XmpInformation instance with empty metadata fields.
        """
        stream = ContentStream(None, None)
        stream.set_data(_MINIMAL_XMP.encode("utf-8"))
        return cls(stream)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Deprecated: write the underlying metadata stream object to ``stream``."""
        deprecate_with_replacement(
            "XmpInformation.write_to_stream",
            "PdfWriter.xmp_metadata",
            "6.0.0"
        )
        if encryption_key is not None:  # deprecated
            deprecation_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        self.stream.write_to_stream(stream)

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        """
        Yield every occurrence of a property within the matching descriptions.

        For each rdf:Description whose rdf:about equals ``about_uri``, the
        attribute form of the property (if present) is yielded first,
        followed by all elements with that namespace and name below it.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                yield from desc.getElementsByTagNameNS(namespace, name)

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        """
        Yield all attributes and direct children in ``namespace``.

        Only rdf:Description elements whose rdf:about equals ``about_uri``
        are considered; attributes are yielded before child nodes.
        """
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                attributes = desc.attributes
                for index in range(attributes.length):
                    attr = attributes.item(index)
                    if attr and attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _get_text(self, element: XmlElement) -> str:
        """Return the concatenated data of the direct text children of ``element``."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    def _lookup_cache(self, namespace: str, name: str) -> tuple[bool, Any]:
        """
        Look up a memoised property value.

        Returns:
            ``(True, value)`` if an entry exists, ``(False, None)`` otherwise.
            Presence is checked explicitly so that falsy results (None, empty
            list or dict, empty string) count as cache hits as well instead
            of being recomputed on every access.
        """
        ns_cache = self.cache.get(namespace)
        if ns_cache is not None and name in ns_cache:
            return True, ns_cache[name]
        return False, None

    def _get_single_value(
        self,
        namespace: str,
        name: str,
        converter: Callable[[str], Any] = _identity,
    ) -> Optional[Any]:
        """Return the first occurrence of a simple property, converted, or None."""
        hit, cached = self._lookup_cache(namespace, name)
        if hit:
            return cached
        value = None
        # Only the first occurrence (attribute or element) is used.
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                value = element.nodeValue
            else:
                value = self._get_text(element)
            break
        if value is not None:
            value = converter(value)
        self.cache.setdefault(namespace, {})[name] = value
        return value

    def _getter_bag(self, namespace: str, name: str) -> Optional[list[str]]:
        """Return the items of an unordered array (rdf:Bag) property."""
        hit, cached = self._lookup_cache(namespace, name)
        if hit:
            return cached
        retval: list[str] = []
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                # Attribute nodes cannot hold an array and do not offer
                # getElementsByTagNameNS; take the attribute as plain value.
                retval.append(element.nodeValue)
            elif (bags := _generic_get(element, self, list_type="Bag")) is not None:
                retval.extend(bags)
            else:
                retval.append(self._get_text(element))
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    def _get_seq_values(
        self,
        namespace: str,
        name: str,
        converter: Callable[[Any], Any] = _identity,
    ) -> Optional[list[Any]]:
        """Return the converted items of an ordered array (rdf:Seq) property."""
        hit, cached = self._lookup_cache(namespace, name)
        if hit:
            return cached
        retval: list[Any] = []
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                # Attribute nodes cannot hold an array and do not offer
                # getElementsByTagNameNS; take the attribute as plain value.
                retval.append(converter(element.nodeValue))
            elif (seqs := _generic_get(element, self, list_type="Seq", converter=converter)) is not None:
                retval.extend(seqs)
            elif (bags := _generic_get(element, self, list_type="Bag")) is not None:
                # See issue at https://github.com/py-pdf/pypdf/issues/3324
                # Some applications violate the XMP metadata standard regarding `dc:creator` which should
                # be an "ordered array" and thus a sequence, but use an unordered array (bag) instead.
                # This seems to stem from the fact that the original Dublin Core specification does indeed
                # use bags or direct values, while PDFs are expected to follow the XMP standard and ignore
                # the plain Dublin Core variant. For this reason, add a fallback here to deal with such
                # issues accordingly.
                retval.extend(bags)
            else:
                retval.append(converter(self._get_text(element)))
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    def _get_langalt_values(self, namespace: str, name: str) -> Optional[dict[Any, Any]]:
        """Return a language alternative (rdf:Alt) property as ``{language: text}``."""
        hit, cached = self._lookup_cache(namespace, name)
        if hit:
            return cached
        retval: dict[Any, Any] = {}
        for element in self.get_element("", namespace, name):
            if element.nodeType == element.ATTRIBUTE_NODE:
                # Attribute nodes cannot hold alternatives and do not offer
                # getElementsByTagNameNS; treat the value as the default text.
                retval["x-default"] = element.nodeValue
                continue
            alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
            if alts:
                for alt in alts:
                    for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval[item.getAttribute("xml:lang")] = self._get_text(item)
            else:
                retval["x-default"] = self._get_text(element)
        self.cache.setdefault(namespace, {})[name] = retval
        return retval

    @property
    def dc_contributor(self) -> Optional[list[str]]:
        """Contributors to the resource (other than the authors)."""
        return self._getter_bag(DC_NAMESPACE, "contributor")

    @dc_contributor.setter
    def dc_contributor(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "contributor", values)

    @property
    def dc_coverage(self) -> Optional[str]:
        """Text describing the extent or scope of the resource."""
        return self._get_single_value(DC_NAMESPACE, "coverage")

    @dc_coverage.setter
    def dc_coverage(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "coverage", value)

    @property
    def dc_creator(self) -> Optional[list[str]]:
        """A sorted array of names of the authors of the resource, listed in order of precedence."""
        return self._get_seq_values(DC_NAMESPACE, "creator")

    @dc_creator.setter
    def dc_creator(self, values: Optional[list[str]]) -> None:
        self._set_seq_values(DC_NAMESPACE, "creator", values)

    @property
    def dc_date(self) -> Optional[list[datetime.datetime]]:
        """A sorted array of dates of significance to the resource. The dates and times are in UTC."""
        return self._get_seq_values(DC_NAMESPACE, "date", _converter_date)

    @dc_date.setter
    def dc_date(self, values: Optional[list[Union[str, datetime.datetime]]]) -> None:
        if values is None:
            self._set_seq_values(DC_NAMESPACE, "date", None)
        else:
            # datetime objects are serialised as UTC; anything else is
            # stored as its string representation.
            date_strings = [
                _format_datetime_utc(value)
                if isinstance(value, datetime.datetime)
                else str(value)
                for value in values
            ]
            self._set_seq_values(DC_NAMESPACE, "date", date_strings)

    @property
    def dc_description(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the content of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "description")

    @dc_description.setter
    def dc_description(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "description", values)

    @property
    def dc_format(self) -> Optional[str]:
        """The mime-type of the resource."""
        return self._get_single_value(DC_NAMESPACE, "format")

    @dc_format.setter
    def dc_format(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "format", value)

    @property
    def dc_identifier(self) -> Optional[str]:
        """Unique identifier of the resource."""
        return self._get_single_value(DC_NAMESPACE, "identifier")

    @dc_identifier.setter
    def dc_identifier(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "identifier", value)

    @property
    def dc_language(self) -> Optional[list[str]]:
        """An unordered array specifying the languages used in the resource."""
        return self._getter_bag(DC_NAMESPACE, "language")

    @dc_language.setter
    def dc_language(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "language", values)

    @property
    def dc_publisher(self) -> Optional[list[str]]:
        """An unordered array of publisher names."""
        return self._getter_bag(DC_NAMESPACE, "publisher")

    @dc_publisher.setter
    def dc_publisher(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "publisher", values)

    @property
    def dc_relation(self) -> Optional[list[str]]:
        """An unordered array of text descriptions of relationships to other documents."""
        return self._getter_bag(DC_NAMESPACE, "relation")

    @dc_relation.setter
    def dc_relation(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "relation", values)

    @property
    def dc_rights(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of textual descriptions of the rights the user has to this resource."""
        return self._get_langalt_values(DC_NAMESPACE, "rights")

    @dc_rights.setter
    def dc_rights(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "rights", values)

    @property
    def dc_source(self) -> Optional[str]:
        """Unique identifier of the work from which this resource was derived."""
        return self._get_single_value(DC_NAMESPACE, "source")

    @dc_source.setter
    def dc_source(self, value: Optional[str]) -> None:
        self._set_single_value(DC_NAMESPACE, "source", value)

    @property
    def dc_subject(self) -> Optional[list[str]]:
        """An unordered array of descriptive phrases or keywords that specify the topic of the content."""
        return self._getter_bag(DC_NAMESPACE, "subject")

    @dc_subject.setter
    def dc_subject(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "subject", values)

    @property
    def dc_title(self) -> Optional[dict[str, str]]:
        """A language-keyed dictionary of the title of the resource."""
        return self._get_langalt_values(DC_NAMESPACE, "title")

    @dc_title.setter
    def dc_title(self, values: Optional[dict[str, str]]) -> None:
        self._set_langalt_values(DC_NAMESPACE, "title", values)

    @property
    def dc_type(self) -> Optional[list[str]]:
        """An unordered array of textual descriptions of the document type."""
        return self._getter_bag(DC_NAMESPACE, "type")

    @dc_type.setter
    def dc_type(self, values: Optional[list[str]]) -> None:
        self._set_bag_values(DC_NAMESPACE, "type", values)

    @property
    def pdf_keywords(self) -> Optional[str]:
        """An unformatted text string representing document keywords."""
        return self._get_single_value(PDF_NAMESPACE, "Keywords")

    @pdf_keywords.setter
    def pdf_keywords(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Keywords", value)

    @property
    def pdf_pdfversion(self) -> Optional[str]:
        """The PDF file version, for example 1.0 or 1.3."""
        return self._get_single_value(PDF_NAMESPACE, "PDFVersion")

    @pdf_pdfversion.setter
    def pdf_pdfversion(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "PDFVersion", value)

    @property
    def pdf_producer(self) -> Optional[str]:
        """The name of the tool that saved the document as a PDF."""
        return self._get_single_value(PDF_NAMESPACE, "Producer")

    @pdf_producer.setter
    def pdf_producer(self, value: Optional[str]) -> None:
        self._set_single_value(PDF_NAMESPACE, "Producer", value)

    @property
    def xmp_create_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was originally created. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "CreateDate", _converter_date)

    @xmp_create_date.setter
    def xmp_create_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_date_value("CreateDate", value)

    @property
    def xmp_modify_date(self) -> Optional[datetime.datetime]:
        """The date and time the resource was last modified. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "ModifyDate", _converter_date)

    @xmp_modify_date.setter
    def xmp_modify_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_date_value("ModifyDate", value)

    @property
    def xmp_metadata_date(self) -> Optional[datetime.datetime]:
        """The date and time that any metadata for this resource was last changed. Returned as a UTC datetime object."""
        return self._get_single_value(XMP_NAMESPACE, "MetadataDate", _converter_date)

    @xmp_metadata_date.setter
    def xmp_metadata_date(self, value: Optional[datetime.datetime]) -> None:
        self._set_date_value("MetadataDate", value)

    @property
    def xmp_creator_tool(self) -> Optional[str]:
        """The name of the first known tool used to create the resource."""
        return self._get_single_value(XMP_NAMESPACE, "CreatorTool")

    @xmp_creator_tool.setter
    def xmp_creator_tool(self, value: Optional[str]) -> None:
        self._set_single_value(XMP_NAMESPACE, "CreatorTool", value)

    @property
    def xmpmm_document_id(self) -> Optional[str]:
        """The common identifier for all versions and renditions of this resource."""
        return self._get_single_value(XMPMM_NAMESPACE, "DocumentID")

    @xmpmm_document_id.setter
    def xmpmm_document_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "DocumentID", value)

    @property
    def xmpmm_instance_id(self) -> Optional[str]:
        """An identifier for a specific incarnation of a document, updated each time a file is saved."""
        return self._get_single_value(XMPMM_NAMESPACE, "InstanceID")

    @xmpmm_instance_id.setter
    def xmpmm_instance_id(self, value: Optional[str]) -> None:
        self._set_single_value(XMPMM_NAMESPACE, "InstanceID", value)

    @property
    def pdfaid_part(self) -> Optional[str]:
        """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3)."""
        return self._get_single_value(PDFAID_NAMESPACE, "part")

    @pdfaid_part.setter
    def pdfaid_part(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "part", value)

    @property
    def pdfaid_conformance(self) -> Optional[str]:
        """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U')."""
        return self._get_single_value(PDFAID_NAMESPACE, "conformance")

    @pdfaid_conformance.setter
    def pdfaid_conformance(self, value: Optional[str]) -> None:
        self._set_single_value(PDFAID_NAMESPACE, "conformance", value)

    @property
    def custom_properties(self) -> dict[Any, Any]:
        """
        Retrieve custom metadata properties defined in the undocumented pdfx
        metadata schema.

        The result is computed on first access and then reused.

        Returns:
            A dictionary of key/value items for custom metadata properties.

        """
        if not hasattr(self, "_custom_properties"):
            properties: dict[Any, Any] = {}
            for node in self.get_nodes_in_namespace("", PDFX_NAMESPACE):
                # see documentation about PDFX_NAMESPACE earlier in file
                key = self._decode_pdfx_key(node.localName)
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._get_text(node)
                properties[key] = value
            # Only store the mapping once it has been built completely.
            self._custom_properties = properties
        return self._custom_properties

    @classmethod
    def _decode_pdfx_key(cls, key: str) -> str:
        """
        Undo the escaping of characters in a pdfx property name.

        Each U+2182 followed by four hex digits is replaced by the character
        with that code point. The substitution is a single left-to-right
        pass, so decoded output is never interpreted as an escape again, and
        a U+2182 that is not followed by four hex digits is kept verbatim
        instead of raising.
        """
        return cls._PDFX_ESCAPE.sub(
            lambda match: chr(int(match.group(1), 16)), key
        )

    def _get_or_create_description(self, about_uri: str = "") -> XmlElement:
        """Get or create an rdf:Description element with the given about URI."""
        for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri:
                return desc

        doc = self.rdf_root.ownerDocument
        if doc is None:
            raise XmpDocumentError("XMP Document is None")
        desc = doc.createElementNS(RDF_NAMESPACE, "rdf:Description")
        desc.setAttributeNS(RDF_NAMESPACE, "rdf:about", about_uri)
        self.rdf_root.appendChild(desc)
        return desc

    def _clear_cache_entry(self, namespace: str, name: str) -> None:
        """Remove a cached value for a given namespace/name if present."""
        ns_cache = self.cache.get(namespace)
        if ns_cache:
            ns_cache.pop(name, None)

    @staticmethod
    def _is_prefix_bound(node: Any, prefix: str, namespace: str) -> bool:
        """
        Tell whether ``prefix`` is declared for ``namespace`` at ``node``.

        Walks from ``node`` up through its ancestor elements; the nearest
        ``xmlns:<prefix>`` declaration decides.
        """
        declaration = f"xmlns:{prefix}"
        while node is not None and node.nodeType == node.ELEMENT_NODE:
            if node.hasAttribute(declaration):
                return node.getAttribute(declaration) == namespace
            node = node.parentNode
        return False

    def _remove_property(self, desc: XmlElement, namespace: str, name: str) -> None:
        """
        Remove every stored form of a property.

        The getters collect values from all rdf:Description elements with an
        empty rdf:about, so besides ``desc`` the property is also removed
        from the other such descriptions directly below the RDF root;
        otherwise stale values would still be reported after an update.
        """
        targets = [desc]
        for other in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if (
                other is not desc
                and other.parentNode is self.rdf_root
                and other.getAttributeNS(RDF_NAMESPACE, "about") == ""
            ):
                targets.append(other)

        for target in targets:
            for elem in list(target.getElementsByTagNameNS(namespace, name)):
                # getElementsByTagNameNS also returns nested descendants;
                # removeChild() only accepts direct children, and nested
                # occurrences belong to other structures anyway.
                if elem.parentNode is target:
                    target.removeChild(elem)
            existing_attr = target.getAttributeNodeNS(namespace, name)
            if existing_attr is not None:
                target.removeAttributeNode(existing_attr)

    def _reset_property(
        self, namespace: str, name: str, create: bool
    ) -> Optional[tuple[Document, XmlElement]]:
        """
        Drop the cached and stored value of a property.

        Args:
            namespace: Namespace URI of the property.
            name: Local name of the property.
            create: Whether to append a fresh, empty property element.

        Returns:
            ``(document, new_element)`` if ``create`` is true, else None.

        Raises:
            XmpDocumentError: If an element has to be created but the RDF
                root has no owner document.
        """
        self._clear_cache_entry(namespace, name)
        desc = self._get_or_create_description()
        self._remove_property(desc, namespace, name)
        if not create:
            return None

        doc = self.rdf_root.ownerDocument
        if doc is None:
            raise XmpDocumentError("XMP Document is None")
        prefix = self._get_namespace_prefix(namespace)
        elem = doc.createElementNS(namespace, f"{prefix}:{name}")
        desc.appendChild(elem)
        # minidom does not emit namespace declarations on its own; add one
        # if the prefix is not already bound to this namespace in scope, as
        # the serialised XML would not be namespace-well-formed otherwise.
        if not self._is_prefix_bound(elem, prefix, namespace):
            elem.setAttribute(f"xmlns:{prefix}", namespace)
        return doc, elem

    def _set_array_values(
        self,
        namespace: str,
        name: str,
        container: str,
        items: list[tuple[Optional[str], Any]],
    ) -> None:
        """
        Replace a property by an RDF array, or remove it if ``items`` is empty.

        Args:
            namespace: Namespace URI of the property.
            name: Local name of the property.
            container: Local name of the RDF container ("Bag", "Seq" or "Alt").
            items: ``(language, value)`` pairs; a language of None omits the
                ``xml:lang`` attribute of the item.
        """
        fresh = self._reset_property(namespace, name, create=bool(items))
        if fresh is not None:
            doc, elem = fresh
            array = doc.createElementNS(RDF_NAMESPACE, f"rdf:{container}")
            for lang, value in items:
                li = doc.createElementNS(RDF_NAMESPACE, "rdf:li")
                if lang is not None:
                    li.setAttribute("xml:lang", lang)
                li.appendChild(doc.createTextNode(str(value)))
                array.appendChild(li)
            elem.appendChild(array)
            # The array elements use the "rdf" prefix; make sure it is bound.
            if not self._is_prefix_bound(elem, "rdf", RDF_NAMESPACE):
                elem.setAttribute("xmlns:rdf", RDF_NAMESPACE)
        self._update_stream()

    def _set_date_value(
        self, name: str, value: Optional[Union[str, datetime.datetime]]
    ) -> None:
        """
        Set or remove a date property of the XMP basic namespace.

        datetime objects are serialised as UTC; other non-empty values are
        stored as their string representation, matching the ``dc_date``
        setter. A falsy value removes the property.
        """
        if not value:
            date_str = None
        elif isinstance(value, datetime.datetime):
            date_str = _format_datetime_utc(value)
        else:
            date_str = str(value)
        self._set_single_value(XMP_NAMESPACE, name, date_str)

    def _set_single_value(self, namespace: str, name: str, value: Optional[str]) -> None:
        """Set or remove a single metadata value."""
        fresh = self._reset_property(namespace, name, create=value is not None)
        if fresh is not None:
            doc, elem = fresh
            elem.appendChild(doc.createTextNode(str(value)))
        self._update_stream()

    def _set_bag_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove bag values (unordered array)."""
        items = [(None, value) for value in values] if values else []
        self._set_array_values(namespace, name, "Bag", items)

    def _set_seq_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None:
        """Set or remove sequence values (ordered array)."""
        items = [(None, value) for value in values] if values else []
        self._set_array_values(namespace, name, "Seq", items)

    def _set_langalt_values(self, namespace: str, name: str, values: Optional[dict[str, str]]) -> None:
        """Set or remove language alternative values."""
        items = list(values.items()) if values else []
        self._set_array_values(namespace, name, "Alt", items)

    def _get_namespace_prefix(self, namespace: str) -> str:
        """Get the appropriate namespace prefix for a given namespace URI."""
        return _NAMESPACE_PREFIX_MAP.get(namespace, "unknown")

    def _update_stream(self) -> None:
        """Update the stream with the current XML content."""
        doc = self.rdf_root.ownerDocument
        if doc is None:
            raise XmpDocumentError("XMP Document is None")

        xml_data = doc.toxml(encoding="utf-8")
        self.stream.set_data(xml_data)