Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/metadata.py: 29%

1# SPDX-FileCopyrightText: 2022 James R. Barlow

2# SPDX-License-Identifier: MPL-2.0

4"""PDF metadata handling."""

6from __future__ import annotations

8import logging

9import re

10from abc import ABC, abstractmethod

11from collections.abc import Iterable, Iterator, MutableMapping

12from datetime import datetime, timezone

13from functools import wraps

14from io import BytesIO

15from typing import TYPE_CHECKING, Any, Callable, NamedTuple

16from warnings import warn

18from lxml import etree

19from lxml.etree import QName, XMLSyntaxError, _Element

21from pikepdf._version import __version__ as pikepdf_version

22from pikepdf._xml import parse_xml

23from pikepdf.objects import Name, Stream, String

25if TYPE_CHECKING: # pragma: no cover

26 from pikepdf import Pdf

# ---------------------------------------------------------------------------
# Namespace URIs used when reading and writing XMP.
# ---------------------------------------------------------------------------

# Dublin Core and Adobe PDF properties
XMP_NS_DC = "http://purl.org/dc/elements/1.1/"
XMP_NS_PDF = "http://ns.adobe.com/pdf/1.3/"

# PDF/A, PDF/UA and PDF/X identification schemas
XMP_NS_PDFA_ID = "http://www.aiim.org/pdfa/ns/id/"
XMP_NS_PDFA_EXTENSION = "http://www.aiim.org/pdfa/ns/extension/"
XMP_NS_PDFA_PROPERTY = "http://www.aiim.org/pdfa/ns/property#"
XMP_NS_PDFA_SCHEMA = "http://www.aiim.org/pdfa/ns/schema#"
XMP_NS_PDFUA_ID = "http://www.aiim.org/pdfua/ns/id/"
XMP_NS_PDFX_ID = "http://www.npes.org/pdfx/ns/id/"

# Photoshop and PRISM
XMP_NS_PHOTOSHOP = "http://ns.adobe.com/photoshop/1.0/"
XMP_NS_PRISM = "http://prismstandard.org/namespaces/basic/1.0/"
XMP_NS_PRISM2 = "http://prismstandard.org/namespaces/basic/2.0/"
XMP_NS_PRISM3 = "http://prismstandard.org/namespaces/basic/3.0/"

# RDF and the core XMP schemas
XMP_NS_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
XMP_NS_XMP = "http://ns.adobe.com/xap/1.0/"
XMP_NS_XMP_MM = "http://ns.adobe.com/xap/1.0/mm/"
XMP_NS_XMP_RIGHTS = "http://ns.adobe.com/xap/1.0/rights/"

# (namespace URI, short prefix) pairs. Note the order within each pair: the
# URI comes first and the prefix second.
DEFAULT_NAMESPACES: list[tuple[str, str]] = [
    ('adobe:ns:meta/', 'x'),
    (XMP_NS_DC, 'dc'),
    (XMP_NS_PDF, 'pdf'),
    (XMP_NS_PDFA_ID, 'pdfaid'),
    (XMP_NS_PDFA_EXTENSION, 'pdfaExtension'),
    (XMP_NS_PDFA_PROPERTY, 'pdfaProperty'),
    (XMP_NS_PDFA_SCHEMA, 'pdfaSchema'),
    (XMP_NS_PDFUA_ID, 'pdfuaid'),
    (XMP_NS_PDFX_ID, 'pdfxid'),
    (XMP_NS_PHOTOSHOP, 'photoshop'),
    (XMP_NS_PRISM, 'prism'),
    (XMP_NS_PRISM2, 'prism2'),
    (XMP_NS_PRISM3, 'prism3'),
    (XMP_NS_RDF, 'rdf'),
    (XMP_NS_XMP, 'xmp'),
    (XMP_NS_XMP_MM, 'xmpMM'),
    (XMP_NS_XMP_RIGHTS, 'xmpRights'),
    ('http://crossref.org/crossmark/1.0/', 'crossmark'),
    ('http://www.niso.org/schemas/jav/1.0/', 'jav'),
    ('http://ns.adobe.com/pdfx/1.3/', 'pdfx'),
    ('http://www.niso.org/schemas/ali/1.0/', 'ali'),
]

# Register each default (URI, prefix) pair with lxml at import time.
for _uri, _prefix in DEFAULT_NAMESPACES:
    etree.register_namespace(_prefix, _uri)

# This one should not be registered
XMP_NS_XML = "http://www.w3.org/XML/1998/namespace"

# Processing instruction written before the XMP document when it is
# serialized with an xpacket wrapper.
XPACKET_BEGIN = b'<?xpacket begin="\xef\xbb\xbf" id="W5M0MpCehiHzreSzNTczkc9d"?>\n'

# Minimal XMP document: an x:xmpmeta element holding an empty rdf:RDF element.
XMP_EMPTY = (
    b'<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="pikepdf">\n'
    b' <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">\n'
    b' </rdf:RDF>\n'
    b'</x:xmpmeta>\n'
)

# Processing instruction written after the XMP document.
XPACKET_END = b'\n<?xpacket end="w"?>\n'

class XmpContainer(NamedTuple):
    """Associate an RDF container kind with the Python type that models it."""

    # Local name of the RDF container element, e.g. 'Alt', 'Bag' or 'Seq'.
    rdf_type: str
    # Python class instantiated to hold the container's items.
    py_type: type
    # Callable invoked as insert_fn(container, item) to add one item.
    insert_fn: Callable[..., None]

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

class NeverRaise(Exception):
    """Placeholder exception type that no code raises.

    Useful in an ``except`` clause that must be syntactically present but
    should catch nothing.
    """

100

101

class AltList(list):
    """List subclass marking its contents as an XMP ``rdf:Alt`` array."""

104

105

# RDF container kinds paired with the Python type and insertion callable used
# for them. AltList is listed before list, so an AltList instance matches
# 'Alt' first in any isinstance() scan over this table.
XMP_CONTAINERS = [
    XmpContainer(rdf_type='Alt', py_type=AltList, insert_fn=AltList.append),
    XmpContainer(rdf_type='Bag', py_type=set, insert_fn=set.add),
    XmpContainer(rdf_type='Seq', py_type=list, insert_fn=list.append),
]

# Fully qualified names of the properties that are stored as language
# alternative (rdf:Alt) arrays.
LANG_ALTS = frozenset(
    str(QName(namespace, local_name))
    for namespace, local_name in (
        (XMP_NS_DC, 'title'),
        (XMP_NS_DC, 'description'),
        (XMP_NS_DC, 'rights'),
        (XMP_NS_XMP_RIGHTS, 'UsageTerms'),
    )
)

120

# These are the illegal characters in XML 1.0. (XML 1.1 is a bit more permissive,
# but we'll be strict to ensure wider compatibility.)
re_xml_illegal_chars = re.compile(
    r"(?u)[^\x09\x0A\x0D\x20-\U0000D7FF\U0000E000-\U0000FFFD\U00010000-\U0010FFFF]"
)

# Byte-level counterpart, applied to raw XML before parsing. The first
# alternative matches control bytes other than TAB, LF and CR; the second
# matches the literal character reference "&#0;", which names a character that
# XML 1.0 does not allow. (The pattern previously ended in a dangling "|",
# i.e. an empty alternative that matched nothing useful.)
re_xml_illegal_bytes = re.compile(rb"[^\x09\x0A\x0D\x20-\xFF]|&#0;")

# Might want to check re_xml_illegal_bytes for patterns such as:
# br"&#(?:[0-9]|0[0-9]|1[0-9]|2[0-9]|3[0-1]
# |x[0-9A-Fa-f]|x0[0-9A-Fa-f]|x1[0-9A-Fa-f]);"

131

132

def _parser_basic(xml: bytes):
    """Parse ``xml`` by handing it to ``parse_xml`` with default options."""
    stream = BytesIO(xml)
    return parse_xml(stream)

135

136

def _parser_strip_illegal_bytes(xml: bytes):
    """Parse ``xml`` after deleting every match of ``re_xml_illegal_bytes``."""
    scrubbed = re_xml_illegal_bytes.sub(b'', xml)
    return parse_xml(BytesIO(scrubbed))

139

140

def _parser_recovery(xml: bytes):
    """Parse ``xml`` by calling ``parse_xml`` with ``recover=True``."""
    stream = BytesIO(xml)
    return parse_xml(stream, recover=True)

143

144

def _parser_replace_with_empty_xmp(_xml: bytes = b''):
    """Log a warning, then return the parse of the built-in empty XMP.

    The ``_xml`` argument is accepted so the function has the same call
    signature as the other parsers; it is ignored.
    """
    log.warning("Error occurred parsing XMP, replacing with empty XMP.")
    empty_tree = _parser_basic(XMP_EMPTY)
    return empty_tree

148

149

def _clean(s: str | Iterable[str], joiner: str = '; ') -> str:
    """Return text that is safe to place in an XML tag body.

    A plain string is passed through unchanged apart from the removal of
    characters matching ``re_xml_illegal_chars``. Any other iterable is
    flattened into one string with ``joiner`` (a warning is issued), since a
    non-string here means a new node was requested somewhere the spec does not
    allow or that is not supported. Sets are sorted before joining.

    Raises:
        TypeError: if ``s`` is neither a string nor an iterable.
    """
    if isinstance(s, str):
        text = s
    elif isinstance(s, Iterable):
        warn(f"Merging elements of {s}")
        # Sets have no stable order, so sort them for a reproducible result.
        pieces = sorted(s) if isinstance(s, set) else s
        text = joiner.join(pieces)
    else:
        raise TypeError("object must be a string or iterable of strings")
    return re_xml_illegal_chars.sub('', text)

167

168

def encode_pdf_date(d: datetime) -> str:
    """Format a Python datetime as a PDF date string.

    The layout, from the Adobe pdfmark manual, is::

        (D:YYYYMMDDHHmmSSOHH'mm')

    ``D:`` is an optional prefix and YYYY is the year; every field after the
    year is optional. MM is the month (01-12), DD the day (01-31), HH the hour
    (00-23), mm the minutes (00-59) and SS the seconds (00-59). The rest of
    the string relates local time to GMT: O is ``+`` when local time is later
    than GMT and ``-`` when it is earlier, HH' is the absolute offset in hours
    and mm' the absolute offset in minutes. Without GMT information the
    relation to GMT is considered unknown. In either case the leading part of
    the string gives local time.

    ``D:`` is required in PDF/A, so it is always emitted.

    Args:
        d: the datetime to encode; may be naive or timezone-aware.

    Returns:
        The encoded date. A naive datetime produces no offset suffix.
    """
    offset = d.strftime('%z')
    # %Y is not formatted consistently (https://bugs.python.org/issue13305 and
    # underspecification in libc), so zero-pad the year explicitly.
    stamp = f"D:{d.year:04d}" + d.strftime(r'%m%d%H%M%S')
    if not offset:
        return stamp
    return f"{stamp}{offset[0]}{offset[1:3]}'{offset[3:5]}"

199

200

def decode_pdf_date(s: str) -> datetime:
    """Decode a pdfmark date to a Python datetime object.

    A pdfmark date is a string in a particular format, as described in
    :func:`encode_pdf_date`.

    Args:
        s: the date string; a ``String`` object is converted with ``str()``
            first. A leading ``D:`` is optional.

    Returns:
        The parsed datetime. It is timezone-aware when the input carries an
        offset or a UTC marker, naive otherwise.

    Raises:
        ValueError: if the string matches none of the supported layouts.
    """
    if isinstance(s, String):
        s = str(s)
    t = s
    if t.startswith('D:'):
        t = t[2:]
    utcs = [
        "Z00'00'",  # Literal Z00'00', is incorrect but found in the wild
        "Z00'00",  # Correctly formatted UTC
        "Z",  # Alternate UTC
    ]
    for utc in utcs:
        if t.endswith(utc):
            # Swap only the matched suffix; str.replace() would also rewrite
            # any earlier occurrence of the marker inside the string.
            t = t[: -len(utc)] + "+0000"
            break
    t = t.replace("'", "")  # Remove apos from PDF time strings

    date_formats = [
        r"%Y%m%d%H%M%S%z",  # Format with timezone
        r"%Y%m%d%H%M%S",  # Format without timezone
        r"%Y%m%d",  # Date only format
    ]
    for date_format in date_formats:
        try:
            return datetime.strptime(t, date_format)
        except ValueError:
            continue
    raise ValueError(f"Date string does not match any known format: {s} (read as {t})")

234

235

class Converter(ABC):
    """Interface for translating one value between XMP and DocumentInfo."""

    @staticmethod
    @abstractmethod
    def xmp_from_docinfo(docinfo_val: str | None) -> Any:  # type: ignore
        """Return the XMP value equivalent to a DocumentInfo string."""

    @staticmethod
    @abstractmethod
    def docinfo_from_xmp(xmp_val: Any) -> str | None:
        """Return the DocumentInfo value equivalent to an XMP value."""

248

249

class AuthorConverter(Converter):
    """Translate document authors between XMP and DocumentInfo."""

    @staticmethod
    def xmp_from_docinfo(docinfo_val: str | None) -> Any:  # type: ignore
        """Wrap the DocumentInfo author string in a one-element list."""
        authors = [docinfo_val]
        return authors

    @staticmethod
    def docinfo_from_xmp(xmp_val):
        """Collapse XMP authors into a single DocumentInfo string.

        XMP can hold several authors whereas DocumentInfo holds one string,
        so the authors are joined with semi-colons. A plain string is returned
        as is; ``None`` and ``[None]`` yield ``None``; ``None`` entries inside
        a longer collection are skipped.
        """
        if isinstance(xmp_val, str):
            return xmp_val
        nothing_to_report = xmp_val is None or xmp_val == [None]
        if nothing_to_report:
            return None
        named_authors = (name for name in xmp_val if name is not None)
        return '; '.join(named_authors)

270

271

class DateConverter(Converter):
    """Translate dates between XMP (ISO 8601) and DocumentInfo (PDF dates)."""

    @staticmethod
    def xmp_from_docinfo(docinfo_val):
        """Return the ISO 8601 form of a PDF date string.

        An empty string is passed through unchanged.
        """
        if docinfo_val == '':
            return ''
        parsed = decode_pdf_date(docinfo_val)
        return parsed.isoformat()

    @staticmethod
    def docinfo_from_xmp(xmp_val):
        """Return the PDF date string for an ISO 8601 XMP date."""
        # Spell a trailing 'Z' as an explicit UTC offset before parsing.
        iso_text = xmp_val[:-1] + '+00:00' if xmp_val.endswith('Z') else xmp_val
        return encode_pdf_date(datetime.fromisoformat(iso_text))

289

290

class DocinfoMapping(NamedTuple):
    """Tie a DocumentInfo key to its XMP counterpart and optional converter."""

    # Namespace URI of the XMP property.
    ns: str
    # Local name of the XMP property within that namespace.
    key: str
    # Name of the corresponding DocumentInfo entry.
    name: Name
    # Converter class for translating values, or None when none is needed.
    converter: type[Converter] | None

298

299

def ensure_loaded(fn):
    """Decorate a method so the XMP is loaded and parsed before it runs.

    The wrapper calls ``self._load()`` whenever ``self._xmp`` is falsy, then
    forwards all arguments to the wrapped method and returns its result.

    TODO: Can this be removed? Why allow the uninit'ed state to even exist?
    """

    @wraps(fn)
    def loading_wrapper(self, *positional, **named):
        needs_load = not self._xmp
        if needs_load:
            self._load()
        return fn(self, *positional, **named)

    return loading_wrapper

313

314

class PdfMetadata(MutableMapping):
    """Read and edit the metadata associated with a PDF.

    The PDF specification contain two types of metadata, the newer XMP
    (Extensible Metadata Platform, XML-based) and older DocumentInformation
    dictionary. The PDF 2.0 specification removes the DocumentInformation
    dictionary.

    This primarily works with XMP metadata, but includes methods to generate
    XMP from DocumentInformation and will also coordinate updates to
    DocumentInformation so that the two are kept consistent.

    XMP metadata fields may be accessed using the full XML namespace URI or
    the short name. For example ``metadata['dc:description']``
    and ``metadata['{http://purl.org/dc/elements/1.1/}description']``
    both refer to the same field. Several common XML namespaces are registered
    automatically.

    See the XMP specification for details of allowable fields.

    To update metadata, use a with block.

    Example:
        >>> with pdf.open_metadata() as records:
        ...     records['dc:title'] = 'New Title'

    See Also:
        :meth:`pikepdf.Pdf.open_metadata`
    """

    # XMP property <-> DocumentInfo key correspondences, with the converter
    # (if any) needed to translate values between the two.
    DOCINFO_MAPPING: list[DocinfoMapping] = [
        DocinfoMapping(XMP_NS_DC, 'creator', Name.Author, AuthorConverter),
        DocinfoMapping(XMP_NS_DC, 'description', Name.Subject, None),
        DocinfoMapping(XMP_NS_DC, 'title', Name.Title, None),
        DocinfoMapping(XMP_NS_PDF, 'Keywords', Name.Keywords, None),
        DocinfoMapping(XMP_NS_PDF, 'Producer', Name.Producer, None),
        DocinfoMapping(XMP_NS_XMP, 'CreateDate', Name.CreationDate, DateConverter),
        DocinfoMapping(XMP_NS_XMP, 'CreatorTool', Name.Creator, None),
        DocinfoMapping(XMP_NS_XMP, 'ModifyDate', Name.ModDate, DateConverter),
    ]

    # prefix -> URI, and URI -> prefix. Both are class-level and therefore
    # shared by every instance.
    NS: dict[str, str] = {prefix: uri for uri, prefix in DEFAULT_NAMESPACES}
    REVERSE_NS: dict[str, str] = dict(DEFAULT_NAMESPACES)

    # Parsers tried in order when invalid XML may be overwritten; the last one
    # discards the input and substitutes empty XMP.
    _PARSERS_OVERWRITE_INVALID_XML: Iterable[Callable[[bytes], Any]] = [
        _parser_basic,
        _parser_strip_illegal_bytes,
        _parser_recovery,
        _parser_replace_with_empty_xmp,
    ]
    # Parser used when invalid XML must be reported rather than repaired.
    _PARSERS_STANDARD: Iterable[Callable[[bytes], Any]] = [_parser_basic]

    @classmethod
    def register_xml_namespace(cls, uri, prefix):
        """Register a new XML/XMP namespace.

        Updates the class-level prefix and URI tables and registers the pair
        with lxml.

        Arguments:
            uri: The long form of the namespace.
            prefix: The alias to use when interpreting XMP.
        """
        cls.NS[prefix] = uri
        cls.REVERSE_NS[uri] = prefix
        # Register the caller's pair, not the module-level loop variables
        # left over from registering DEFAULT_NAMESPACES.
        etree.register_namespace(prefix, uri)

    def __init__(
        self,
        pdf: Pdf,
        pikepdf_mark: bool = True,
        sync_docinfo: bool = True,
        overwrite_invalid_xml: bool = True,
    ):
        """Construct PdfMetadata. Use Pdf.open_metadata() instead.

        Arguments:
            pdf: the PDF whose metadata is being accessed.
            pikepdf_mark: if True, the metadata date and producer are stamped
                when changes are applied.
            sync_docinfo: if True, DocumentInfo is updated from XMP when
                changes are applied.
            overwrite_invalid_xml: if True, unparseable XMP is repaired or
                replaced with empty XMP instead of raising.
        """
        self._pdf = pdf
        self.mark = pikepdf_mark
        self.sync_docinfo = sync_docinfo
        self._updating = False
        self.overwrite_invalid_xml = overwrite_invalid_xml
        # Parsed XMP; stays None until first use (see ensure_loaded).
        self._xmp = None

    def load_from_docinfo(
        self, docinfo, delete_missing: bool = False, raise_failure: bool = False
    ) -> None:
        """Populate the XMP metadata object with DocumentInfo.

        Arguments:
            docinfo: a DocumentInfo, e.g pdf.docinfo
            delete_missing: if the entry is not DocumentInfo, delete the equivalent
                from XMP
            raise_failure: if True, raise any failure to convert docinfo;
                otherwise warn and continue

        A few entries in the deprecated DocumentInfo dictionary are considered
        approximately equivalent to certain XMP records. This method copies
        those entries into the XMP metadata.
        """

        def warn_or_raise(msg, e=None):
            if raise_failure:
                raise ValueError(msg) from e
            warn(msg)

        for uri, shortkey, docinfo_name, converter in self.DOCINFO_MAPPING:
            qname = QName(uri, shortkey)
            # docinfo might be a dict or pikepdf.Dictionary, so lookup keys
            # by str(Name)
            val = docinfo.get(str(docinfo_name))
            if val is None:
                if delete_missing and qname in self:
                    del self[qname]
                continue
            try:
                val = str(val)
                if converter:
                    val = converter.xmp_from_docinfo(val)
                if not val:
                    # Nothing worth copying (empty after conversion).
                    continue
                self._setitem(qname, val, True)
            except (ValueError, AttributeError, NotImplementedError) as e:
                warn_or_raise(
                    f"The metadata field {docinfo_name} could not be copied to XMP", e
                )
        valid_docinfo_names = {
            str(docinfo_name) for _, _, docinfo_name, _ in self.DOCINFO_MAPPING
        }
        extra_docinfo_names = {str(k) for k in docinfo.keys()} - valid_docinfo_names
        for extra in extra_docinfo_names:
            warn_or_raise(
                f"The metadata field {extra} with value '{repr(docinfo.get(extra))}' "
                "has no XMP equivalent, so it was discarded",
            )

    def _load(self) -> None:
        """Read the PDF's metadata stream, if any, and parse it."""
        try:
            data = self._pdf.Root.Metadata.read_bytes()
        except AttributeError:
            # No readable metadata stream: treat as empty.
            data = b''
        self._load_from(data)

    def _load_from(self, data: bytes) -> None:
        """Parse ``data`` as XMP and store the result in ``self._xmp``.

        When ``overwrite_invalid_xml`` is set, progressively more lenient
        parsers are tried and, failing all else, empty XMP is substituted.
        Otherwise only the basic parser is used and its errors propagate.
        """
        if data.strip() == b'':
            data = XMP_EMPTY  # on some platforms lxml chokes on empty documents

        parsers = (
            self._PARSERS_OVERWRITE_INVALID_XML
            if self.overwrite_invalid_xml
            else self._PARSERS_STANDARD
        )

        for parser in parsers:
            try:
                self._xmp = parser(data)
            except (
                XMLSyntaxError if self.overwrite_invalid_xml else NeverRaise  # type: ignore
            ) as e:
                # Input that is not XML at all cannot be helped by the more
                # lenient parsers, so go straight to empty XMP. Any other
                # syntax error falls through to the next parser.
                if str(e).startswith("Start tag expected, '<' not found") or str(
                    e
                ).startswith("Document is empty"):
                    self._xmp = _parser_replace_with_empty_xmp()
                    break
            else:
                # Parsed without error.
                break

        if self._xmp is not None:
            try:
                pis = self._xmp.xpath('/processing-instruction()')
                for pi in pis:
                    etree.strip_tags(self._xmp, pi.tag)
                # Called for its validation side effect: raises if the
                # document has no rdf:RDF root.
                self._get_rdf_root()
            except (
                Exception  # pylint: disable=broad-except
                if self.overwrite_invalid_xml
                else NeverRaise
            ) as e:
                log.warning("Error occurred parsing XMP", exc_info=e)
                self._xmp = _parser_replace_with_empty_xmp()
        else:
            log.warning("Error occurred parsing XMP")
            self._xmp = _parser_replace_with_empty_xmp()

    @ensure_loaded
    def __enter__(self):
        """Open metadata for editing."""
        self._updating = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close metadata and apply changes.

        Changes are applied only when the with block exits without an
        exception; editing mode is switched off in either case.
        """
        try:
            if exc_type is not None:
                return
            self._apply_changes()
        finally:
            self._updating = False

    def _update_docinfo(self):
        """Update the PDF's DocumentInfo dictionary to match XMP metadata.

        The standard mapping is described here:
            https://www.pdfa.org/pdfa-metadata-xmp-rdf-dublin-core/
        """
        # Touch object to ensure it exists
        self._pdf.docinfo  # pylint: disable=pointless-statement
        for uri, element, docinfo_name, converter in self.DOCINFO_MAPPING:
            qname = QName(uri, element)
            try:
                value = self[qname]
            except KeyError:
                # Absent from XMP, so remove it from DocumentInfo as well.
                if docinfo_name in self._pdf.docinfo:
                    del self._pdf.docinfo[docinfo_name]
                continue
            if converter:
                try:
                    value = converter.docinfo_from_xmp(value)
                except ValueError:
                    warn(
                        f"The DocumentInfo field {docinfo_name} could not be "
                        "updated from XMP"
                    )
                    value = None
                except Exception as e:
                    raise ValueError(
                        "An error occurred while updating DocumentInfo field "
                        f"{docinfo_name} from XMP {qname} with value {value}"
                    ) from e
            if value is None:
                if docinfo_name in self._pdf.docinfo:
                    del self._pdf.docinfo[docinfo_name]
                continue
            value = _clean(value)
            try:
                # Try to save pure ASCII
                self._pdf.docinfo[docinfo_name] = value.encode('ascii')
            except UnicodeEncodeError:
                # qpdf will serialize this as a UTF-16 with BOM string
                self._pdf.docinfo[docinfo_name] = value

    def _get_xml_bytes(self, xpacket=True):
        """Serialize the XMP to UTF-8 bytes.

        Arguments:
            xpacket: if True, surround the XML with the xpacket begin and end
                processing instructions.
        """
        data = BytesIO()
        if xpacket:
            data.write(XPACKET_BEGIN)
        self._xmp.write(data, encoding='utf-8', pretty_print=True)
        if xpacket:
            data.write(XPACKET_END)
        data.seek(0)
        xml_bytes = data.read()
        return xml_bytes

    def _apply_changes(self):
        """Serialize our changes back to the PDF in memory.

        Depending how we are initialized, leave our metadata mark and producer.
        """
        if self.mark:
            # We were asked to mark the file as being edited by pikepdf
            self._setitem(
                QName(XMP_NS_XMP, 'MetadataDate'),
                datetime.now(timezone.utc).isoformat(),
                applying_mark=True,
            )
            self._setitem(
                QName(XMP_NS_PDF, 'Producer'),
                'pikepdf ' + pikepdf_version,
                applying_mark=True,
            )
        xml = self._get_xml_bytes()
        self._pdf.Root.Metadata = Stream(self._pdf, xml)
        self._pdf.Root.Metadata[Name.Type] = Name.Metadata
        self._pdf.Root.Metadata[Name.Subtype] = Name.XML
        if self.sync_docinfo:
            self._update_docinfo()

    @classmethod
    def _qname(cls, name: QName | str) -> str:
        """Convert name to an XML QName.

        e.g. pdf:Producer -> {http://ns.adobe.com/pdf/1.3/}Producer

        An empty string and names already in ``{uri}tag`` form are returned
        unchanged.

        Raises:
            TypeError: if ``name`` is neither a QName nor a str.
        """
        if isinstance(name, QName):
            return str(name)
        if not isinstance(name, str):
            raise TypeError(f"{name} must be str")
        if name == '':
            return name
        if name.startswith('{'):
            return name
        try:
            prefix, tag = name.split(':', maxsplit=1)
        except ValueError:
            # If missing the namespace, it belongs in the default namespace.
            # A tag such <xyz xmlns="http://example.com"> defines a default
            # namespace of http://example.com for all enclosed tags that don't
            # override the namespace with a colon prefix.
            # XMP does not usually use the default namespace, so we can
            # assume it's just blank. In practice a document that depends on
            # defining a default namespace over some part of its content
            # could introduce a collision.
            # See: https://www.w3.org/TR/REC-xml-names/#dt-defaultNS
            prefix, tag = '', name
        uri = cls.NS.get(prefix, None)
        return str(QName(uri, tag))

    def _prefix_from_uri(self, uriname):
        """Given a fully qualified XML name, find a prefix.

        e.g. {http://ns.adobe.com/pdf/1.3/}Producer -> pdf:Producer

        Raises:
            KeyError: if the namespace URI has no registered prefix.
        """
        uripart, tag = uriname.split('}', maxsplit=1)
        uri = uripart.replace('{', '')
        return self.REVERSE_NS[uri] + ':' + tag

    def _get_subelements(self, node: _Element) -> Any:
        """Gather the sub-elements attached to a node.

        Gather rdf:Bag and rdf:Seq into set and list respectively. For
        alternate languages values, take the first language only for
        simplicity.

        Returns ``''`` when the node holds an empty rdf:Alt or no recognized
        container at all.
        """
        items = node.find('rdf:Alt', self.NS)
        if items is not None:
            try:
                return items[0].text
            except IndexError:
                return ''

        for xmlcontainer, container, insertfn in XMP_CONTAINERS:
            items = node.find(f'rdf:{xmlcontainer}', self.NS)
            if items is None:
                continue
            result = container()
            for item in items:
                insertfn(result, item.text)
            return result
        return ''

    def _get_rdf_root(self) -> _Element:
        """Return the rdf:RDF element of the loaded XMP.

        Raises:
            ValueError: if the document contains no rdf:RDF element and its
                root element is not rdf:RDF either.
        """
        assert self._xmp is not None
        rdf = self._xmp.find('.//rdf:RDF', self.NS)
        if rdf is None:
            rdf = self._xmp.getroot()
            if not rdf.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF':
                raise ValueError("Metadata seems to be XML but not XMP")
        return rdf

    def _get_elements(
        self, name: str | QName = ''
    ) -> Iterator[tuple[_Element, str | bytes | None, Any, _Element]]:
        """Get elements from XMP.

        Core routine to find elements matching name within the XMP and yield
        them.

        For XMP spec 7.9.2.2, rdf:Description with property attributes,
        we yield the node which will have the desired as one of its attributes.
        qname is returned so that the node.attrib can be used to locate the
        source.

        For XMP spec 7.5, simple valued XMP properties, we yield the node,
        None, and the value. For structure or array valued properties we gather
        the elements. We ignore qualifiers.

        Args:
            name: a prefixed name or QName to look for within the
                data section of the XMP; looks for all data keys if omitted

        Yields:
            tuple: (node, qname_attrib, value, parent_node)

        """
        qname = self._qname(name)
        rdf = self._get_rdf_root()
        for rdfdesc in rdf.findall('rdf:Description[@rdf:about=""]', self.NS):
            if qname and qname in rdfdesc.keys():
                yield (rdfdesc, qname, rdfdesc.get(qname), rdf)
            elif not qname:
                for k, v in rdfdesc.items():
                    if v:
                        yield (rdfdesc, k, v, rdf)
            xpath = qname if name else '*'
            for node in rdfdesc.findall(xpath, self.NS):
                if node.text and node.text.strip():
                    yield (node, None, node.text, rdfdesc)
                    continue
                values = self._get_subelements(node)
                yield (node, None, values, rdfdesc)

    def _get_element_values(self, name: str | QName = '') -> Iterator[Any]:
        """Yield only the value of each element matching ``name``."""
        yield from (v[2] for v in self._get_elements(name))

    @ensure_loaded
    def __contains__(self, key: str | QName):
        """Test if XMP key is in metadata.

        True only when at least one matching element has a truthy value.
        """
        return any(self._get_element_values(key))

    @ensure_loaded
    def __getitem__(self, key: str | QName):
        """Retrieve XMP metadata for key.

        Raises:
            KeyError: if no element matches ``key``.
        """
        try:
            return next(self._get_element_values(key))
        except StopIteration:
            raise KeyError(key) from None

    @ensure_loaded
    def __iter__(self):
        """Iterate through XMP metadata attributes and nodes."""
        for node, attrib, _val, _parents in self._get_elements():
            if attrib:
                yield attrib
            else:
                yield node.tag

    @ensure_loaded
    def __len__(self):
        """Return number of items in metadata."""
        return len(list(iter(self)))

    def _setitem(
        self,
        key: str | QName,
        val: set[str] | list[str] | str,
        applying_mark: bool = False,
    ):
        """Set ``key`` to ``val``, updating an existing node or inserting one.

        Raises:
            RuntimeError: if the metadata has not been opened for editing.
        """
        if not self._updating:
            raise RuntimeError("Metadata not opened for editing, use with block")

        qkey = self._qname(key)
        self._setitem_check_args(key, val, applying_mark, qkey)

        try:
            # Update existing node
            self._setitem_update(key, val, qkey)
        except StopIteration:
            # Insert a new node
            self._setitem_insert(key, val)

    def _setitem_check_args(self, key, val, applying_mark: bool, qkey: str) -> None:
        """Log diagnostics about questionable assignments; never raises."""
        if (
            self.mark
            and not applying_mark
            and qkey
            in (
                self._qname('xmp:MetadataDate'),
                self._qname('pdf:Producer'),
            )
        ):
            # Complain if user writes self[pdf:Producer] = ... and because it will
            # be overwritten on save, unless applying_mark is set, in which case
            # the action was initiated internally
            log.warning(
                f"Update to {key} will be overwritten because metadata was opened "
                "with set_pikepdf_as_editor=True"
            )
        # Compare for equality: "qkey in (<str>)" would be a substring test,
        # since parentheses alone do not create a tuple.
        if isinstance(val, str) and qkey == self._qname('dc:creator'):
            log.error(f"{key} should be set to a list of strings")

    def _setitem_add_array(self, node, items: Iterable) -> None:
        """Append an RDF container holding ``items`` beneath ``node``.

        The container kind is chosen from the Python type of ``items``. Note
        that ``next()`` raises StopIteration when ``items`` is not an instance
        of any type listed in XMP_CONTAINERS.
        """
        rdf_type = next(
            c.rdf_type for c in XMP_CONTAINERS if isinstance(items, c.py_type)
        )
        seq = etree.SubElement(node, str(QName(XMP_NS_RDF, rdf_type)))
        tag_attrib: dict[str, str] | None = None
        if rdf_type == 'Alt':
            tag_attrib = {str(QName(XMP_NS_XML, 'lang')): 'x-default'}
        for item in items:
            el = etree.SubElement(seq, str(QName(XMP_NS_RDF, 'li')), attrib=tag_attrib)
            if item is not None:
                inner_text: str | None = _clean(item)
                if inner_text == '':
                    inner_text = None
                el.text = inner_text

    def _setitem_update(self, key, val, qkey):
        """Replace the value of the first existing element matching ``key``.

        Raises:
            StopIteration: if no element matches; the caller treats this as
                the signal to insert a new node.
            TypeError: if ``val`` has a type that cannot be stored here.
        """
        # Locate existing node to replace
        node, attrib, _oldval, _parent = next(self._get_elements(key))
        if attrib:
            if not isinstance(val, str):
                if qkey == self._qname('dc:creator'):
                    # dc:creator incorrectly created as an attribute - we're
                    # replacing it anyway, so remove the old one and insert a
                    # proper array-valued element. (Previously the list was
                    # flattened with _clean() first, which emitted a spurious
                    # warning and reached the insert path only through an
                    # incidental StopIteration.)
                    del node.attrib[qkey]
                    self._setitem_insert(key, val)
                else:
                    raise TypeError(f"Setting {key} to {val} with type {type(val)}")
            else:
                node.set(attrib, _clean(val))
        elif isinstance(val, (list, set)):
            for child in node.findall('*'):
                node.remove(child)
            self._setitem_add_array(node, val)
        elif isinstance(val, str):
            for child in node.findall('*'):
                node.remove(child)
            if str(self._qname(key)) in LANG_ALTS:
                self._setitem_add_array(node, AltList([_clean(val)]))
            else:
                node.text = _clean(val)
        else:
            raise TypeError(f"Setting {key} to {val} with type {type(val)}")

    def _setitem_insert(self, key, val):
        """Create a new element for ``key`` holding ``val``.

        Array values get a new rdf:Description of their own; string values are
        added to the first existing rdf:Description, creating one if needed.

        Raises:
            TypeError: if ``val`` is not a list, set or str.
        """
        rdf = self._get_rdf_root()
        if str(self._qname(key)) in LANG_ALTS:
            val = AltList([_clean(val)])
        if isinstance(val, (list, set)):
            rdfdesc = etree.SubElement(
                rdf,
                str(QName(XMP_NS_RDF, 'Description')),
                attrib={str(QName(XMP_NS_RDF, 'about')): ''},
            )
            node = etree.SubElement(rdfdesc, self._qname(key))
            self._setitem_add_array(node, val)
        elif isinstance(val, str):
            rdfdesc = rdf.find('rdf:Description[@rdf:about=""]', self.NS)
            if rdfdesc is None:
                rdfdesc = etree.SubElement(
                    rdf,
                    str(QName(XMP_NS_RDF, 'Description')),
                    attrib={str(QName(XMP_NS_RDF, 'about')): ''},
                )
            node = etree.SubElement(rdfdesc, self._qname(key))
            node.text = _clean(val)
        else:
            raise TypeError(f"Setting {key} to {val} with type {type(val)}") from None

    @ensure_loaded
    def __setitem__(self, key: str | QName, val: set[str] | list[str] | str):
        """Set XMP metadata key to value."""
        return self._setitem(key, val, False)

    @ensure_loaded
    def __delitem__(self, key: str | QName):
        """Delete item from XMP metadata.

        Raises:
            RuntimeError: if the metadata has not been opened for editing.
            KeyError: if no element matches ``key``.
        """
        if not self._updating:
            raise RuntimeError("Metadata not opened for editing, use with block")
        try:
            node, attrib, _oldval, parent = next(self._get_elements(key))
            if attrib:  # Inline
                del node.attrib[attrib]
                if (
                    len(node.attrib) == 1
                    and len(node) == 0
                    and QName(XMP_NS_RDF, 'about') in node.attrib.keys()
                ):
                    # The only thing left on this node is rdf:about="", so remove it
                    parent.remove(node)
            else:
                parent.remove(node)
        except StopIteration:
            raise KeyError(key) from None

    @property
    def pdfa_status(self) -> str:
        """Return the PDF/A conformance level claimed by this PDF, or ''.

        A PDF may claim to PDF/A compliant without this being true. Use an
        independent verifier such as veraPDF to test if a PDF is truly
        conformant.

        Returns:
            The conformance level of the PDF/A, or an empty string if the
            PDF does not claim PDF/A conformance. Possible valid values
            are: 1A, 1B, 2A, 2B, 2U, 3A, 3B, 3U.
        """
        # do same as @ensure_loaded - mypy can't handle decorated property
        if not self._xmp:
            self._load()

        key_part = QName(XMP_NS_PDFA_ID, 'part')
        key_conformance = QName(XMP_NS_PDFA_ID, 'conformance')
        try:
            return self[key_part] + self[key_conformance]
        except KeyError:
            return ''

    @property
    def pdfx_status(self) -> str:
        """Return the PDF/X conformance level claimed by this PDF, or ''.

        A PDF may claim to PDF/X compliant without this being true. Use an
        independent verifier such as veraPDF to test if a PDF is truly
        conformant.

        Returns:
            The conformance level of the PDF/X, or an empty string if the
            PDF does not claim PDF/X conformance.
        """
        # do same as @ensure_loaded - mypy can't handle decorated property
        if not self._xmp:
            self._load()

        pdfx_version = QName(XMP_NS_PDFX_ID, 'GTS_PDFXVersion')
        try:
            return self[pdfx_version]
        except KeyError:
            return ''

    @ensure_loaded
    def __str__(self):
        """Convert XMP metadata to XML string."""
        return self._get_xml_bytes(xpacket=False).decode('utf-8')