Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/metadata.py: 29%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

448 statements  

1# SPDX-FileCopyrightText: 2022 James R. Barlow 

2# SPDX-License-Identifier: MPL-2.0 

3 

4"""PDF metadata handling.""" 

5 

6from __future__ import annotations 

7 

8import logging 

9import re 

10from abc import ABC, abstractmethod 

11from collections.abc import Iterable, Iterator, MutableMapping 

12from datetime import datetime, timezone 

13from functools import wraps 

14from io import BytesIO 

15from typing import TYPE_CHECKING, Any, Callable, NamedTuple 

16from warnings import warn 

17 

18from lxml import etree 

19from lxml.etree import QName, XMLSyntaxError, _Element 

20 

21from pikepdf._version import __version__ as pikepdf_version 

22from pikepdf._xml import parse_xml 

23from pikepdf.objects import Name, Stream, String 

24 

25if TYPE_CHECKING: # pragma: no cover 

26 from pikepdf import Pdf 

27 

28 

# XML namespace URIs referenced by XMP metadata. Each one is paired with a
# short prefix in DEFAULT_NAMESPACES below.
XMP_NS_DC = "http://purl.org/dc/elements/1.1/"
XMP_NS_PDF = "http://ns.adobe.com/pdf/1.3/"
XMP_NS_PDFA_ID = "http://www.aiim.org/pdfa/ns/id/"
XMP_NS_PDFA_EXTENSION = "http://www.aiim.org/pdfa/ns/extension/"
XMP_NS_PDFA_PROPERTY = "http://www.aiim.org/pdfa/ns/property#"
XMP_NS_PDFA_SCHEMA = "http://www.aiim.org/pdfa/ns/schema#"
XMP_NS_PDFUA_ID = "http://www.aiim.org/pdfua/ns/id/"
XMP_NS_PDFX_ID = "http://www.npes.org/pdfx/ns/id/"
XMP_NS_PHOTOSHOP = "http://ns.adobe.com/photoshop/1.0/"
XMP_NS_PRISM = "http://prismstandard.org/namespaces/basic/1.0/"
XMP_NS_PRISM2 = "http://prismstandard.org/namespaces/basic/2.0/"
XMP_NS_PRISM3 = "http://prismstandard.org/namespaces/basic/3.0/"
XMP_NS_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
XMP_NS_XMP = "http://ns.adobe.com/xap/1.0/"
XMP_NS_XMP_MM = "http://ns.adobe.com/xap/1.0/mm/"
XMP_NS_XMP_RIGHTS = "http://ns.adobe.com/xap/1.0/rights/"

45 

# (namespace URI, prefix) pairs that are known out of the box.
# PdfMetadata.NS and PdfMetadata.REVERSE_NS are both derived from this list.
DEFAULT_NAMESPACES: list[tuple[str, str]] = [
    ('adobe:ns:meta/', 'x'),
    (XMP_NS_DC, 'dc'),
    (XMP_NS_PDF, 'pdf'),
    (XMP_NS_PDFA_ID, 'pdfaid'),
    (XMP_NS_PDFA_EXTENSION, 'pdfaExtension'),
    (XMP_NS_PDFA_PROPERTY, 'pdfaProperty'),
    (XMP_NS_PDFA_SCHEMA, 'pdfaSchema'),
    (XMP_NS_PDFUA_ID, 'pdfuaid'),
    (XMP_NS_PDFX_ID, 'pdfxid'),
    (XMP_NS_PHOTOSHOP, 'photoshop'),
    (XMP_NS_PRISM, 'prism'),
    (XMP_NS_PRISM2, 'prism2'),
    (XMP_NS_PRISM3, 'prism3'),
    (XMP_NS_RDF, 'rdf'),
    (XMP_NS_XMP, 'xmp'),
    (XMP_NS_XMP_MM, 'xmpMM'),
    (XMP_NS_XMP_RIGHTS, 'xmpRights'),
    ('http://crossref.org/crossmark/1.0/', 'crossmark'),
    ('http://www.niso.org/schemas/jav/1.0/', 'jav'),
    ('http://ns.adobe.com/pdfx/1.3/', 'pdfx'),
    ('http://www.niso.org/schemas/ali/1.0/', 'ali'),
]

# Import-time side effect: register every prefix/URI pair with lxml.
# NOTE: _uri and _prefix remain bound at module level after the loop ends,
# holding the last pair in the list.
for _uri, _prefix in DEFAULT_NAMESPACES:
    etree.register_namespace(_prefix, _uri)

72 

# This one should not be registered
XMP_NS_XML = "http://www.w3.org/XML/1998/namespace"

# Header written before the serialized XMP; the begin attribute holds the
# UTF-8 encoded byte order mark (EF BB BF).
XPACKET_BEGIN = b"""<?xpacket begin="\xef\xbb\xbf" id="W5M0MpCehiHzreSzNTczkc9d"?>\n"""

# Minimal XMP document containing an empty rdf:RDF element. It is substituted
# when the input is blank or cannot be parsed.
XMP_EMPTY = b"""<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">
 <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
 </rdf:RDF>
</x:xmpmeta>
"""

# Trailer written after the serialized XMP.
XPACKET_END = b"""\n<?xpacket end="w"?>\n"""

85 

86 

class XmpContainer(NamedTuple):
    """Map XMP container object to suitable Python container.

    One record ties an RDF container element name to the Python type used to
    hold its items and to the unbound method that adds one item to that type.
    """

    # Local name of the RDF container element, e.g. 'Alt', 'Bag' or 'Seq'
    rdf_type: str
    # Python type instantiated (with no arguments) to collect the items
    py_type: type
    # Called as insert_fn(container, item) to add a single item
    insert_fn: Callable[..., None]

93 

94 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

96 

97 

class NeverRaise(Exception):
    """An exception that is never raised.

    Used as a placeholder in ``except`` clauses that should catch nothing,
    so the exception type to catch can be selected by a conditional
    expression.
    """

100 

101 

class AltList(list):
    """XMP AltList container.

    A plain ``list`` subclass with no added behavior; the distinct type lets
    an rdf:Alt container be told apart from an rdf:Seq (plain ``list``) by
    ``isinstance`` checks.
    """

104 

105 

# RDF container kinds and the Python containers that represent them.
# Order matters for isinstance-based lookups: AltList is a list subclass, so
# the 'Alt' entry must come before the 'Seq' entry.
XMP_CONTAINERS = [
    XmpContainer('Alt', AltList, AltList.append),
    XmpContainer('Bag', set, set.add),
    XmpContainer('Seq', list, list.append),
]

111 

# Fully qualified names of the properties that are written as an rdf:Alt
# (language alternative) container when given a plain string value.
LANG_ALTS = frozenset(
    str(QName(namespace, local_name))
    for namespace, local_name in (
        (XMP_NS_DC, 'title'),
        (XMP_NS_DC, 'description'),
        (XMP_NS_DC, 'rights'),
        (XMP_NS_XMP_RIGHTS, 'UsageTerms'),
    )
)

120 

# These are the illegal characters in XML 1.0. (XML 1.1 is a bit more permissive,
# but we'll be strict to ensure wider compatibility.)
# Matches any single character outside tab, LF, CR and the ranges
# U+0020-U+D7FF, U+E000-U+FFFD, U+10000-U+10FFFF.
re_xml_illegal_chars = re.compile(
    r"(?u)[^\x09\x0A\x0D\x20-\U0000D7FF\U0000E000-\U0000FFFD\U00010000-\U0010FFFF]"
)
# Bytes-level counterpart: matches any byte other than tab, LF, CR or
# 0x20-0xFF, and also the literal character reference "&#0;".
re_xml_illegal_bytes = re.compile(rb"[^\x09\x0A\x0D\x20-\xFF]|&#0;")

# Might want to check re_xml_illegal_bytes for patterns such as:
# br"&#(?:[0-9]|0[0-9]|1[0-9]|2[0-9]|3[0-1]
# |x[0-9A-Fa-f]|x0[0-9A-Fa-f]|x1[0-9A-Fa-f]);"

131 

132 

def _parser_basic(xml: bytes):
    """Parse the XML bytes as they are, with no cleanup or recovery."""
    stream = BytesIO(xml)
    return parse_xml(stream)

135 

136 

def _parser_strip_illegal_bytes(xml: bytes):
    """Parse the XML after deleting everything re_xml_illegal_bytes matches."""
    sanitized = re_xml_illegal_bytes.sub(b'', xml)
    stream = BytesIO(sanitized)
    return parse_xml(stream)

139 

140 

def _parser_recovery(xml: bytes):
    """Parse the XML bytes with ``recover=True`` passed to the parser."""
    stream = BytesIO(xml)
    return parse_xml(stream, recover=True)

143 

144 

def _parser_replace_with_empty_xmp(_xml: bytes = b''):
    """Log a warning and return a parse of the empty XMP template.

    The argument is ignored; it exists so this function can be called like
    the other parsers.
    """
    log.warning("Error occurred parsing XMP, replacing with empty XMP.")
    empty_document = _parser_basic(XMP_EMPTY)
    return empty_document

148 

149 

def _clean(s: str | Iterable[str], joiner: str = '; ') -> str:
    """Return text that is safe to place in an XML tag body.

    A string is returned with XML-illegal characters removed. Any other
    iterable is merged into one string first (a warning is issued), since a
    non-str value at this point asks for a node where the spec does not
    allow one or where it is unsupported. Sets are sorted before merging.

    Args:
        s: a string, or an iterable of strings to merge.
        joiner: separator placed between merged elements.

    Raises:
        TypeError: if ``s`` is neither a string nor an iterable.
    """
    if isinstance(s, str):
        return re_xml_illegal_chars.sub('', s)
    if not isinstance(s, Iterable):
        raise TypeError("object must be a string or iterable of strings")

    warn(f"Merging elements of {s}")
    pieces = sorted(s) if isinstance(s, set) else s
    merged = joiner.join(pieces)
    return re_xml_illegal_chars.sub('', merged)

167 

168 

def encode_pdf_date(d: datetime) -> str:
    """Encode a Python datetime as a PDF date string.

    The Adobe pdfmark manual describes the format as:
        (D:YYYYMMDDHHmmSSOHH'mm')
    D: is an optional prefix. YYYY is the year. All fields after the year are
    optional. MM is the month (01-12), DD is the day (01-31), HH is the
    hour (00-23), mm are the minutes (00-59), and SS are the seconds
    (00-59). The remainder of the string defines the relation of local
    time to GMT. O is either + for a positive difference (local time is
    later than GMT) or - (minus) for a negative difference. HH' is the
    absolute value of the offset from GMT in hours, and mm' is the
    absolute value of the offset in minutes. If no GMT information is
    specified, the relation between the specified time and GMT is
    considered unknown. Regardless of whether or not GMT
    information is specified, the remainder of the string should specify
    the local time.

    'D:' is required in PDF/A, so it is always emitted.

    Args:
        d: the datetime to encode; a naive datetime yields no offset part.

    Returns:
        The encoded string, e.g. ``D:20200102030405+00'00``.
    """
    # %Y is not formatted consistently across platforms (see
    # https://bugs.python.org/issue13305 and underspecification in libc),
    # so the year is zero-padded explicitly.
    pieces = [f"D:{d.year:04d}", d.strftime(r'%m%d%H%M%S')]
    offset = d.strftime('%z')
    if offset:
        # offset looks like +HHMM; split it into sign, hours and minutes
        pieces.append(f"{offset[0]}{offset[1:3]}'{offset[3:5]}")
    return ''.join(pieces)

199 

200 

def decode_pdf_date(s: str) -> datetime:
    """Decode a pdfmark date to a Python datetime object.

    A pdfmark date is a string in a particular format, as described in
    :func:`encode_pdf_date`.

    Args:
        s: the date string, with or without the leading ``D:``. A
            ``String`` object is converted with ``str()`` first.

    Returns:
        The decoded datetime; it is timezone-aware only when the input
        carried an offset or a UTC marker.

    Raises:
        ValueError: if the string matches none of the supported layouts.
    """
    if isinstance(s, String):
        s = str(s)
    t = s
    if t.startswith('D:'):
        t = t[2:]
    # Longest marker first, so the most specific suffix wins.
    utcs = [
        "Z00'00'",  # Literal Z00'00', is incorrect but found in the wild
        "Z00'00",  # Correctly formatted UTC
        "Z",  # Alternate UTC
    ]
    for utc in utcs:
        if t.endswith(utc):
            # Swap only the trailing marker; str.replace would rewrite every
            # occurrence found anywhere in the string.
            t = t[: -len(utc)] + "+0000"
            break
    t = t.replace("'", "")  # Remove apos from PDF time strings

    date_formats = [
        r"%Y%m%d%H%M%S%z",  # Format with timezone
        r"%Y%m%d%H%M%S",  # Format without timezone
        r"%Y%m%d",  # Date only format
    ]
    for date_format in date_formats:
        try:
            return datetime.strptime(t, date_format)
        except ValueError:
            continue
    raise ValueError(f"Date string does not match any known format: {s} (read as {t})")

234 

235 

class Converter(ABC):
    """XMP <-> DocumentInfo converter.

    Abstract interface: subclasses supply a pair of static methods that
    translate one value in each direction.
    """

    @staticmethod
    @abstractmethod
    def xmp_from_docinfo(docinfo_val: str | None) -> Any:  # type: ignore
        """Derive XMP metadata from a DocumentInfo string."""

    @staticmethod
    @abstractmethod
    def docinfo_from_xmp(xmp_val: Any) -> str | None:
        """Derive a DocumentInfo value from equivalent XMP metadata."""

248 

249 

class AuthorConverter(Converter):
    """Translate document authors between XMP and DocumentInfo."""

    @staticmethod
    def xmp_from_docinfo(docinfo_val: str | None) -> Any:  # type: ignore
        """Wrap the single DocumentInfo author in a one-element list."""
        authors = [docinfo_val]
        return authors

    @staticmethod
    def docinfo_from_xmp(xmp_val):
        """Collapse XMP authors into one DocumentInfo string.

        XMP can hold several authors while DocumentInfo holds one string, so
        the authors are joined with semi-colons. A plain string is returned
        unchanged; ``None`` or ``[None]`` yields ``None``; ``None`` entries
        in a collection are skipped.
        """
        if isinstance(xmp_val, str):
            return xmp_val
        nothing_recorded = xmp_val is None or xmp_val == [None]
        if nothing_recorded:
            return None
        named_authors = [name for name in xmp_val if name is not None]
        return '; '.join(named_authors)

270 

271 

class DateConverter(Converter):
    """Translate dates between XMP (ISO 8601) and DocumentInfo (PDF date)."""

    @staticmethod
    def xmp_from_docinfo(docinfo_val):
        """Turn a PDF date string into ISO format; '' stays ''."""
        if docinfo_val != '':
            return decode_pdf_date(docinfo_val).isoformat()
        return ''

    @staticmethod
    def docinfo_from_xmp(xmp_val):
        """Turn an ISO format date into a PDF date string.

        A trailing 'Z' is rewritten as '+00:00' before parsing.
        """
        iso_text = xmp_val[:-1] + '+00:00' if xmp_val.endswith('Z') else xmp_val
        parsed = datetime.fromisoformat(iso_text)
        return encode_pdf_date(parsed)

289 

290 

class DocinfoMapping(NamedTuple):
    """Map DocumentInfo keys to their XMP equivalents, along with converter."""

    # Namespace URI of the XMP property
    ns: str
    # Local name of the XMP property within that namespace
    key: str
    # Key of the corresponding DocumentInfo entry
    name: Name
    # Converter class for the value, or None when the value is used as is
    converter: type[Converter] | None

298 

299 

def ensure_loaded(fn):
    """Decorate a method so the XMP is loaded before the method body runs.

    The wrapper calls ``self._load()`` whenever ``self._xmp`` is falsy, then
    forwards all arguments to the wrapped method and returns its result.

    TODO: Can this be removed? Why allow the uninit'ed state to even exist?
    """

    @wraps(fn)
    def load_then_call(self, *args, **kwargs):
        needs_loading = not self._xmp
        if needs_loading:
            self._load()
        return fn(self, *args, **kwargs)

    return load_then_call

313 

314 

class PdfMetadata(MutableMapping):
    """Read and edit the metadata associated with a PDF.

    The PDF specification contains two types of metadata, the newer XMP
    (Extensible Metadata Platform, XML-based) and older DocumentInformation
    dictionary. The PDF 2.0 specification removes the DocumentInformation
    dictionary.

    This primarily works with XMP metadata, but includes methods to generate
    XMP from DocumentInformation and will also coordinate updates to
    DocumentInformation so that the two are kept consistent.

    XMP metadata fields may be accessed using the full XML namespace URI or
    the short name. For example ``metadata['dc:description']``
    and ``metadata['{http://purl.org/dc/elements/1.1/}description']``
    both refer to the same field. Several common XML namespaces are registered
    automatically.

    See the XMP specification for details of allowable fields.

    To update metadata, use a with block.

    Example:
        >>> with pdf.open_metadata() as records:
        ...     records['dc:title'] = 'New Title'

    See Also:
        :meth:`pikepdf.Pdf.open_metadata`
    """

    # DocumentInfo entries and the XMP properties they correspond to
    DOCINFO_MAPPING: list[DocinfoMapping] = [
        DocinfoMapping(XMP_NS_DC, 'creator', Name.Author, AuthorConverter),
        DocinfoMapping(XMP_NS_DC, 'description', Name.Subject, None),
        DocinfoMapping(XMP_NS_DC, 'title', Name.Title, None),
        DocinfoMapping(XMP_NS_PDF, 'Keywords', Name.Keywords, None),
        DocinfoMapping(XMP_NS_PDF, 'Producer', Name.Producer, None),
        DocinfoMapping(XMP_NS_XMP, 'CreateDate', Name.CreationDate, DateConverter),
        DocinfoMapping(XMP_NS_XMP, 'CreatorTool', Name.Creator, None),
        DocinfoMapping(XMP_NS_XMP, 'ModifyDate', Name.ModDate, DateConverter),
    ]

    # prefix -> URI and URI -> prefix. Both dicts live on the class, so
    # entries added by register_xml_namespace are shared by all instances.
    NS: dict[str, str] = {prefix: uri for uri, prefix in DEFAULT_NAMESPACES}
    REVERSE_NS: dict[str, str] = dict(DEFAULT_NAMESPACES)

    # Parsers tried in order when invalid XML may be overwritten
    _PARSERS_OVERWRITE_INVALID_XML: Iterable[Callable[[bytes], Any]] = [
        _parser_basic,
        _parser_strip_illegal_bytes,
        _parser_recovery,
        _parser_replace_with_empty_xmp,
    ]
    # Strict mode: a single attempt, errors propagate
    _PARSERS_STANDARD: Iterable[Callable[[bytes], Any]] = [_parser_basic]

    @classmethod
    def register_xml_namespace(cls, uri, prefix):
        """Register a new XML/XMP namespace.

        Arguments:
            uri: The long form of the namespace.
            prefix: The alias to use when interpreting XMP.
        """
        cls.NS[prefix] = uri
        cls.REVERSE_NS[uri] = prefix
        # Register the caller's namespace. The module-level loop variables
        # _prefix/_uri only hold the last default namespace.
        etree.register_namespace(prefix, uri)

    def __init__(
        self,
        pdf: Pdf,
        pikepdf_mark: bool = True,
        sync_docinfo: bool = True,
        overwrite_invalid_xml: bool = True,
    ):
        """Construct PdfMetadata. Use Pdf.open_metadata() instead."""
        self._pdf = pdf
        self.mark = pikepdf_mark
        self.sync_docinfo = sync_docinfo
        self._updating = False
        self.overwrite_invalid_xml = overwrite_invalid_xml
        # Parsed lazily; see ensure_loaded and _load
        self._xmp = None

    def load_from_docinfo(
        self, docinfo, delete_missing: bool = False, raise_failure: bool = False
    ) -> None:
        """Populate the XMP metadata object with DocumentInfo.

        Arguments:
            docinfo: a DocumentInfo, e.g pdf.docinfo
            delete_missing: if the entry is not DocumentInfo, delete the equivalent
                from XMP
            raise_failure: if True, raise any failure to convert docinfo;
                otherwise warn and continue

        A few entries in the deprecated DocumentInfo dictionary are considered
        approximately equivalent to certain XMP records. This method copies
        those entries into the XMP metadata.
        """

        def warn_or_raise(msg, e=None):
            if raise_failure:
                raise ValueError(msg) from e
            warn(msg)

        for uri, shortkey, docinfo_name, converter in self.DOCINFO_MAPPING:
            qname = QName(uri, shortkey)
            # docinfo might be a dict or pikepdf.Dictionary, so lookup keys
            # by str(Name)
            val = docinfo.get(str(docinfo_name))
            if val is None:
                if delete_missing and qname in self:
                    del self[qname]
                continue
            try:
                val = str(val)
                if converter:
                    val = converter.xmp_from_docinfo(val)
                if not val:
                    continue
                self._setitem(qname, val, True)
            except (ValueError, AttributeError, NotImplementedError) as e:
                warn_or_raise(
                    f"The metadata field {docinfo_name} could not be copied to XMP", e
                )
        valid_docinfo_names = {
            str(docinfo_name) for _, _, docinfo_name, _ in self.DOCINFO_MAPPING
        }
        extra_docinfo_names = {str(k) for k in docinfo.keys()} - valid_docinfo_names
        for extra in extra_docinfo_names:
            warn_or_raise(
                f"The metadata field {extra} with value '{repr(docinfo.get(extra))}' "
                "has no XMP equivalent, so it was discarded",
            )

    def _load(self) -> None:
        """Read the metadata stream from the PDF root and parse it."""
        try:
            data = self._pdf.Root.Metadata.read_bytes()
        except AttributeError:
            # No metadata stream present; parse as an empty document
            data = b''
        self._load_from(data)

    def _load_from(self, data: bytes) -> None:
        """Parse ``data`` into ``self._xmp``.

        With ``overwrite_invalid_xml`` set, progressively more lenient
        parsers are tried and an empty XMP document is the final fallback;
        otherwise only the basic parser runs and its errors propagate.
        """
        if data.strip() == b'':
            data = XMP_EMPTY  # on some platforms lxml chokes on empty documents

        parsers = (
            self._PARSERS_OVERWRITE_INVALID_XML
            if self.overwrite_invalid_xml
            else self._PARSERS_STANDARD
        )

        for parser in parsers:
            try:
                self._xmp = parser(data)
            except (
                XMLSyntaxError if self.overwrite_invalid_xml else NeverRaise  # type: ignore
            ) as e:
                if str(e).startswith("Start tag expected, '<' not found") or str(
                    e
                ).startswith("Document is empty"):
                    self._xmp = _parser_replace_with_empty_xmp()
                    break
            else:
                # Parser succeeded; stop trying further parsers
                break

        if self._xmp is not None:
            try:
                pis = self._xmp.xpath('/processing-instruction()')
                for pi in pis:
                    etree.strip_tags(self._xmp, pi.tag)
                # Called for validation: raises if there is no rdf:RDF root
                self._get_rdf_root()
            except (
                Exception  # pylint: disable=broad-except
                if self.overwrite_invalid_xml
                else NeverRaise
            ) as e:
                log.warning("Error occurred parsing XMP", exc_info=e)
                self._xmp = _parser_replace_with_empty_xmp()
        else:
            log.warning("Error occurred parsing XMP")
            self._xmp = _parser_replace_with_empty_xmp()

    @ensure_loaded
    def __enter__(self):
        """Open metadata for editing."""
        self._updating = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close metadata and apply changes.

        Changes are discarded when the with block exits with an exception.
        """
        try:
            if exc_type is not None:
                return
            self._apply_changes()
        finally:
            self._updating = False

    def _update_docinfo(self):
        """Update the PDF's DocumentInfo dictionary to match XMP metadata.

        The standard mapping is described here:
            https://www.pdfa.org/pdfa-metadata-xmp-rdf-dublin-core/
        """
        # Touch object to ensure it exists
        self._pdf.docinfo  # pylint: disable=pointless-statement
        for uri, element, docinfo_name, converter in self.DOCINFO_MAPPING:
            qname = QName(uri, element)
            try:
                value = self[qname]
            except KeyError:
                # Not in XMP: remove the DocumentInfo counterpart as well
                if docinfo_name in self._pdf.docinfo:
                    del self._pdf.docinfo[docinfo_name]
                continue
            if converter:
                try:
                    value = converter.docinfo_from_xmp(value)
                except ValueError:
                    warn(
                        f"The DocumentInfo field {docinfo_name} could not be "
                        "updated from XMP"
                    )
                    value = None
                except Exception as e:
                    raise ValueError(
                        "An error occurred while updating DocumentInfo field "
                        f"{docinfo_name} from XMP {qname} with value {value}"
                    ) from e
            if value is None:
                if docinfo_name in self._pdf.docinfo:
                    del self._pdf.docinfo[docinfo_name]
                continue
            value = _clean(value)
            try:
                # Try to save pure ASCII
                self._pdf.docinfo[docinfo_name] = value.encode('ascii')
            except UnicodeEncodeError:
                # qpdf will serialize this as a UTF-16 with BOM string
                self._pdf.docinfo[docinfo_name] = value

    def _get_xml_bytes(self, xpacket=True):
        """Serialize the XMP tree to UTF-8 bytes.

        Arguments:
            xpacket: if True, wrap the XML in the xpacket header and trailer.
        """
        data = BytesIO()
        if xpacket:
            data.write(XPACKET_BEGIN)
        self._xmp.write(data, encoding='utf-8', pretty_print=True)
        if xpacket:
            data.write(XPACKET_END)
        data.seek(0)
        xml_bytes = data.read()
        return xml_bytes

    def _apply_changes(self):
        """Serialize our changes back to the PDF in memory.

        Depending how we are initialized, leave our metadata mark and producer.
        """
        if self.mark:
            # We were asked to mark the file as being edited by pikepdf
            self._setitem(
                QName(XMP_NS_XMP, 'MetadataDate'),
                datetime.now(timezone.utc).isoformat(),
                applying_mark=True,
            )
            self._setitem(
                QName(XMP_NS_PDF, 'Producer'),
                'pikepdf ' + pikepdf_version,
                applying_mark=True,
            )
        xml = self._get_xml_bytes()
        self._pdf.Root.Metadata = Stream(self._pdf, xml)
        self._pdf.Root.Metadata[Name.Type] = Name.Metadata
        self._pdf.Root.Metadata[Name.Subtype] = Name.XML
        if self.sync_docinfo:
            self._update_docinfo()

    @classmethod
    def _qname(cls, name: QName | str) -> str:
        """Convert name to an XML QName.

        e.g. pdf:Producer -> {http://ns.adobe.com/pdf/1.3/}Producer

        An empty string and a name already in ``{uri}tag`` form are returned
        unchanged.

        Raises:
            TypeError: if name is neither a QName nor a str.
        """
        if isinstance(name, QName):
            return str(name)
        if not isinstance(name, str):
            raise TypeError(f"{name} must be str")
        if name == '':
            return name
        if name.startswith('{'):
            return name
        try:
            prefix, tag = name.split(':', maxsplit=1)
        except ValueError:
            # If missing the namespace, it belongs in the default namespace.
            # A tag such <xyz xmlns="http://example.com"> defines a default
            # namespace of http://example.com for all enclosed tags that don't
            # override the namespace with a colon prefix.
            # XMP does not usually use the default namespace, so we can
            # assume it's just blank. In practice a document that depends on
            # defining a default namespace over some part of its content
            # could introduce a collision.
            # See: https://www.w3.org/TR/REC-xml-names/#dt-defaultNS
            prefix, tag = '', name
        uri = cls.NS.get(prefix, None)
        return str(QName(uri, tag))

    def _prefix_from_uri(self, uriname):
        """Given a fully qualified XML name, find a prefix.

        e.g. {http://ns.adobe.com/pdf/1.3/}Producer -> pdf:Producer

        Raises:
            KeyError: if the namespace URI has no registered prefix.
        """
        uripart, tag = uriname.split('}', maxsplit=1)
        uri = uripart.replace('{', '')
        return self.REVERSE_NS[uri] + ':' + tag

    def _get_subelements(self, node: _Element) -> Any:
        """Gather the sub-elements attached to a node.

        Gather rdf:Bag and rdf:Seq into set and list respectively. For
        alternate languages values, take the first language only for
        simplicity. Returns '' when the node holds no known container.
        """
        items = node.find('rdf:Alt', self.NS)
        if items is not None:
            try:
                return items[0].text
            except IndexError:
                return ''

        for xmlcontainer, container, insertfn in XMP_CONTAINERS:
            items = node.find(f'rdf:{xmlcontainer}', self.NS)
            if items is None:
                continue
            result = container()
            for item in items:
                insertfn(result, item.text)
            return result
        return ''

    def _get_rdf_root(self) -> _Element:
        """Return the rdf:RDF element of the parsed XMP.

        Raises:
            ValueError: if no rdf:RDF descendant exists and the document
                root is not rdf:RDF either.
        """
        assert self._xmp is not None
        rdf = self._xmp.find('.//rdf:RDF', self.NS)
        if rdf is None:
            rdf = self._xmp.getroot()
            if not rdf.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF':
                raise ValueError("Metadata seems to be XML but not XMP")
        return rdf

    def _get_elements(
        self, name: str | QName = ''
    ) -> Iterator[tuple[_Element, str | bytes | None, Any, _Element]]:
        """Get elements from XMP.

        Core routine to find elements matching name within the XMP and yield
        them.

        For XMP spec 7.9.2.2, rdf:Description with property attributes,
        we yield the node which will have the desired as one of its attributes.
        qname is returned so that the node.attrib can be used to locate the
        source.

        For XMP spec 7.5, simple valued XMP properties, we yield the node,
        None, and the value. For structure or array valued properties we gather
        the elements. We ignore qualifiers.

        Args:
            name: a prefixed name or QName to look for within the
                data section of the XMP; looks for all data keys if omitted

        Yields:
            tuple: (node, qname_attrib, value, parent_node)

        """
        qname = self._qname(name)
        rdf = self._get_rdf_root()
        for rdfdesc in rdf.findall('rdf:Description[@rdf:about=""]', self.NS):
            if qname and qname in rdfdesc.keys():
                yield (rdfdesc, qname, rdfdesc.get(qname), rdf)
            elif not qname:
                for k, v in rdfdesc.items():
                    if v:
                        yield (rdfdesc, k, v, rdf)
            xpath = qname if name else '*'
            for node in rdfdesc.findall(xpath, self.NS):
                if node.text and node.text.strip():
                    yield (node, None, node.text, rdfdesc)
                    continue
                values = self._get_subelements(node)
                yield (node, None, values, rdfdesc)

    def _get_element_values(self, name: str | QName = '') -> Iterator[Any]:
        """Yield only the value of each element matching name."""
        yield from (v[2] for v in self._get_elements(name))

    @ensure_loaded
    def __contains__(self, key: str | QName):
        """Test if XMP key is in metadata.

        A key whose values are all falsy (e.g. an empty string) is reported
        as absent, because the check is ``any()`` over the values.
        """
        return any(self._get_element_values(key))

    @ensure_loaded
    def __getitem__(self, key: str | QName):
        """Retrieve XMP metadata for key.

        Raises:
            KeyError: if no element matches key.
        """
        try:
            return next(self._get_element_values(key))
        except StopIteration:
            raise KeyError(key) from None

    @ensure_loaded
    def __iter__(self):
        """Iterate through XMP metadata attributes and nodes."""
        for node, attrib, _val, _parents in self._get_elements():
            if attrib:
                yield attrib
            else:
                yield node.tag

    @ensure_loaded
    def __len__(self):
        """Return number of items in metadata."""
        # iter(self) is passed rather than self: list() asks its argument for
        # a length hint, which would call back into __len__.
        return len(list(iter(self)))

    def _setitem(
        self,
        key: str | QName,
        val: set[str] | list[str] | str,
        applying_mark: bool = False,
    ):
        """Set key to val, updating an existing node or inserting a new one.

        Raises:
            RuntimeError: if the metadata is not open for editing.
        """
        if not self._updating:
            raise RuntimeError("Metadata not opened for editing, use with block")

        qkey = self._qname(key)
        self._setitem_check_args(key, val, applying_mark, qkey)

        try:
            # Update existing node
            self._setitem_update(key, val, qkey)
        except StopIteration:
            # Insert a new node
            self._setitem_insert(key, val)

    def _setitem_check_args(self, key, val, applying_mark: bool, qkey: str) -> None:
        """Log (without raising) when an assignment is likely a mistake."""
        if (
            self.mark
            and not applying_mark
            and qkey
            in (
                self._qname('xmp:MetadataDate'),
                self._qname('pdf:Producer'),
            )
        ):
            # Complain if user writes self[pdf:Producer] = ... and because it will
            # be overwritten on save, unless self._updating_mark, in which case
            # the action was initiated internally
            log.warning(
                f"Update to {key} will be overwritten because metadata was opened "
                "with set_pikepdf_as_editor=True"
            )
        # Compare for equality: "x in (y)" with a parenthesized string is a
        # substring test, not tuple membership.
        if isinstance(val, str) and qkey == self._qname('dc:creator'):
            log.error(f"{key} should be set to a list of strings")

    def _setitem_add_array(self, node, items: Iterable) -> None:
        """Append an rdf:Alt/Bag/Seq container holding items under node.

        The container kind is chosen from the Python type of items. If no
        entry in XMP_CONTAINERS matches, ``next()`` raises StopIteration.
        """
        rdf_type = next(
            c.rdf_type for c in XMP_CONTAINERS if isinstance(items, c.py_type)
        )
        seq = etree.SubElement(node, str(QName(XMP_NS_RDF, rdf_type)))
        tag_attrib: dict[str, str] | None = None
        if rdf_type == 'Alt':
            tag_attrib = {str(QName(XMP_NS_XML, 'lang')): 'x-default'}
        for item in items:
            el = etree.SubElement(seq, str(QName(XMP_NS_RDF, 'li')), attrib=tag_attrib)
            if item is not None:
                inner_text: str | None = _clean(item)
                if inner_text == '':
                    inner_text = None
                el.text = inner_text

    def _setitem_update(self, key, val, qkey):
        """Replace the value of the first existing element matching key.

        Raises:
            StopIteration: if no element matches; the caller then inserts.
            TypeError: if val has an unsupported type for the target.
        """
        # Locate existing node to replace
        node, attrib, _oldval, _parent = next(self._get_elements(key))
        if attrib:
            if not isinstance(val, str):
                if qkey == self._qname('dc:creator'):
                    # dc:creator incorrectly created as an attribute - we're
                    # replacing it anyway, so remove the old one
                    del node.attrib[qkey]
                    # NOTE(review): _clean(val) returns a str, which matches no
                    # container type, so _setitem_add_array raises
                    # StopIteration and _setitem falls back to
                    # _setitem_insert with the original val. Confirm whether
                    # this indirect path is intended before changing it.
                    self._setitem_add_array(node, _clean(val))
                else:
                    raise TypeError(f"Setting {key} to {val} with type {type(val)}")
            else:
                node.set(attrib, _clean(val))
        elif isinstance(val, (list, set)):
            for child in node.findall('*'):
                node.remove(child)
            self._setitem_add_array(node, val)
        elif isinstance(val, str):
            for child in node.findall('*'):
                node.remove(child)
            if str(self._qname(key)) in LANG_ALTS:
                self._setitem_add_array(node, AltList([_clean(val)]))
            else:
                node.text = _clean(val)
        else:
            raise TypeError(f"Setting {key} to {val} with type {type(val)}")

    def _setitem_insert(self, key, val):
        """Create a new element for key holding val.

        Raises:
            TypeError: if val is not a list, set or str.
        """
        rdf = self._get_rdf_root()
        if str(self._qname(key)) in LANG_ALTS:
            val = AltList([_clean(val)])
        if isinstance(val, (list, set)):
            rdfdesc = etree.SubElement(
                rdf,
                str(QName(XMP_NS_RDF, 'Description')),
                attrib={str(QName(XMP_NS_RDF, 'about')): ''},
            )
            node = etree.SubElement(rdfdesc, self._qname(key))
            self._setitem_add_array(node, val)
        elif isinstance(val, str):
            # Reuse the first rdf:Description if one exists
            rdfdesc = rdf.find('rdf:Description[@rdf:about=""]', self.NS)
            if rdfdesc is None:
                rdfdesc = etree.SubElement(
                    rdf,
                    str(QName(XMP_NS_RDF, 'Description')),
                    attrib={str(QName(XMP_NS_RDF, 'about')): ''},
                )
            node = etree.SubElement(rdfdesc, self._qname(key))
            node.text = _clean(val)
        else:
            raise TypeError(f"Setting {key} to {val} with type {type(val)}") from None

    @ensure_loaded
    def __setitem__(self, key: str | QName, val: set[str] | list[str] | str):
        """Set XMP metadata key to value."""
        return self._setitem(key, val, False)

    @ensure_loaded
    def __delitem__(self, key: str | QName):
        """Delete item from XMP metadata.

        Raises:
            RuntimeError: if the metadata is not open for editing.
            KeyError: if no element matches key.
        """
        if not self._updating:
            raise RuntimeError("Metadata not opened for editing, use with block")
        try:
            node, attrib, _oldval, parent = next(self._get_elements(key))
            if attrib:  # Inline
                del node.attrib[attrib]
                if (
                    len(node.attrib) == 1
                    and len(node) == 0
                    and QName(XMP_NS_RDF, 'about') in node.attrib.keys()
                ):
                    # The only thing left on this node is rdf:about="", so remove it
                    parent.remove(node)
            else:
                parent.remove(node)
        except StopIteration:
            raise KeyError(key) from None

    @property
    def pdfa_status(self) -> str:
        """Return the PDF/A conformance level claimed by this PDF, or ''.

        A PDF may claim to PDF/A compliant without this being true. Use an
        independent verifier such as veraPDF to test if a PDF is truly
        conformant.

        Returns:
            The conformance level of the PDF/A, or an empty string if the
            PDF does not claim PDF/A conformance. Possible valid values
            are: 1A, 1B, 2A, 2B, 2U, 3A, 3B, 3U.
        """
        # do same as @ensure_loaded - mypy can't handle decorated property
        if not self._xmp:
            self._load()

        key_part = QName(XMP_NS_PDFA_ID, 'part')
        key_conformance = QName(XMP_NS_PDFA_ID, 'conformance')
        try:
            return self[key_part] + self[key_conformance]
        except KeyError:
            return ''

    @property
    def pdfx_status(self) -> str:
        """Return the PDF/X conformance level claimed by this PDF, or ''.

        A PDF may claim to PDF/X compliant without this being true. Use an
        independent verifier such as veraPDF to test if a PDF is truly
        conformant.

        Returns:
            The conformance level of the PDF/X, or an empty string if the
            PDF does not claim PDF/X conformance.
        """
        # do same as @ensure_loaded - mypy can't handle decorated property
        if not self._xmp:
            self._load()

        pdfx_version = QName(XMP_NS_PDFX_ID, 'GTS_PDFXVersion')
        try:
            return self[pdfx_version]
        except KeyError:
            return ''

    @ensure_loaded
    def __str__(self):
        """Convert XMP metadata to XML string."""
        return self._get_xml_bytes(xpacket=False).decode('utf-8')