Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/metadata/

1# SPDX-FileCopyrightText: 2022 James R. Barlow

2# SPDX-License-Identifier: MPL-2.0

4"""Converters for XMP <-> DocumentInfo value transformation."""

6from __future__ import annotations

8from abc import ABC, abstractmethod

9from datetime import datetime

10from typing import Any, NamedTuple

12from pikepdf.models.metadata._constants import XMP_NS_DC, XMP_NS_PDF, XMP_NS_XMP

13from pikepdf.objects import Name, String

def encode_pdf_date(d: datetime) -> str:
    """Render a Python datetime as a PDF (pdfmark) date string.

    The Adobe pdfmark manual describes the layout as::

        (D:YYYYMMDDHHmmSSOHH'mm')

    where ``D:`` is an optional prefix and every field after the year is
    optional: MM month (01-12), DD day (01-31), HH hour (00-23), mm minutes
    (00-59), SS seconds (00-59). The trailing part relates local time to
    GMT: ``O`` is ``+`` when local time is later than GMT or ``-`` when it
    is earlier, followed by the absolute offset in hours and minutes. When
    no GMT information is given the relation to GMT is considered unknown;
    either way the leading fields express local time.

    ``D:`` is required in PDF/A, so it is always emitted here.

    Args:
        d: The datetime to encode. If it yields no ``%z`` offset, no
            timezone suffix is written.

    Returns:
        The encoded date, e.g. ``D:20200102030405-05'30``.
    """
    # %Y is not zero-padded consistently across platforms
    # (https://bugs.python.org/issue13305, underspecified in libc), so the
    # year is formatted by hand and strftime only supplies the other fields.
    stamp = d.strftime(r'%m%d%H%M%S')
    offset = d.strftime('%z')
    encoded = f"D:{d.year:04d}{stamp}"
    if not offset:
        return encoded
    # %z looks like "+HHMM"; PDF wants an apostrophe between hours and minutes.
    return f"{encoded}{offset[0]}{offset[1:3]}'{offset[3:5]}"

def decode_pdf_date(s: str) -> datetime:
    """Parse a pdfmark date string into a Python datetime.

    The expected layout is the one produced by :func:`encode_pdf_date`.
    A leading ``D:`` is optional, UTC may be written with a ``Z`` marker,
    and apostrophes inside the timezone offset are ignored.

    Args:
        s: The date string. A pikepdf ``String`` is converted with ``str()``
            before parsing.

    Returns:
        The parsed datetime; it carries a timezone only when the input
        specified one.

    Raises:
        ValueError: If the text matches none of the supported layouts.
    """
    if isinstance(s, String):
        s = str(s)
    text = s[2:] if s.startswith('D:') else s

    # Candidate UTC spellings, longest first so the most specific one wins:
    # "Z00'00'" is incorrect but found in the wild, "Z00'00" is the correctly
    # formatted form, and a bare "Z" is an alternate.
    utc_suffix = next(
        (
            candidate
            for candidate in ("Z00'00'", "Z00'00", "Z")
            if text.endswith(candidate)
        ),
        None,
    )
    if utc_suffix is not None:
        text = text.replace(utc_suffix, "+0000")

    # PDF separates offset hours/minutes with apostrophes; strptime's %z
    # does not accept them.
    text = text.replace("'", "")

    # Try the richest layout first: with timezone, without timezone, date only.
    for layout in (r"%Y%m%d%H%M%S%z", r"%Y%m%d%H%M%S", r"%Y%m%d"):
        try:
            return datetime.strptime(text, layout)
        except ValueError:
            pass
    raise ValueError(
        f"Date string does not match any known format: {s} (read as {text})"
    )

83class Converter(ABC):

84 """XMP <-> DocumentInfo converter."""

86 @staticmethod

87 @abstractmethod

88 def xmp_from_docinfo(docinfo_val: str | None) -> Any: # type: ignore

89 """Derive XMP metadata from a DocumentInfo string."""

91 @staticmethod

92 @abstractmethod

93 def docinfo_from_xmp(xmp_val: Any) -> str | None:

94 """Derive a DocumentInfo value from equivalent XMP metadata."""

class AuthorConverter(Converter):
    """Translate document authors between XMP and DocumentInfo."""

    @staticmethod
    def xmp_from_docinfo(docinfo_val: str | None) -> Any:  # type: ignore
        """Wrap the DocumentInfo author value in a one-element list for XMP."""
        return [docinfo_val]

    @staticmethod
    def docinfo_from_xmp(xmp_val):
        """Collapse XMP authors into a single DocumentInfo string.

        XMP can hold several author values whereas DocumentInfo stores one
        string, so multiple authors are joined with semi-colons.

        Returns the input unchanged when it is already a string, ``None``
        when there is no author (``None`` or ``[None]``), and otherwise the
        non-``None`` entries joined by ``'; '``.
        """
        if isinstance(xmp_val, str):
            return xmp_val
        if xmp_val is None or xmp_val == [None]:
            return None
        # Skip missing entries so they do not break the join.
        named_authors = [entry for entry in xmp_val if entry is not None]
        return '; '.join(named_authors)

117

118

class DateConverter(Converter):
    """Translate dates between XMP (ISO 8601 text) and DocumentInfo (PDF dates)."""

    @staticmethod
    def xmp_from_docinfo(docinfo_val):
        """Derive the XMP date text from a DocumentInfo date.

        An empty value stays empty. A digits-only partial date (``YYYY`` or
        ``YYYYMM``, with or without the ``D:`` prefix) becomes ``YYYY`` or
        ``YYYY-MM``. Anything else is parsed by ``decode_pdf_date`` and
        returned in ISO format.
        """
        if isinstance(docinfo_val, String):
            docinfo_val = str(docinfo_val)
        if docinfo_val == '':
            return ''
        bare = docinfo_val[2:] if docinfo_val.startswith('D:') else docinfo_val
        if bare.isdigit():
            if len(bare) == 4:
                return bare
            if len(bare) == 6:
                return f'{bare[:4]}-{bare[4:]}'
        # Full dates go through the general parser, given the original
        # (still prefixed) value.
        return decode_pdf_date(docinfo_val).isoformat()

    @staticmethod
    def docinfo_from_xmp(xmp_val):
        """Derive the DocumentInfo date string from an XMP date.

        A 4- or 7-character value without a ``T`` (e.g. ``YYYY`` or
        ``YYYY-MM``) is emitted as ``D:`` followed by the value with hyphens
        removed. Otherwise the value is parsed with
        ``datetime.fromisoformat`` and encoded by ``encode_pdf_date``.
        """
        if len(xmp_val) in (4, 7) and 'T' not in xmp_val:
            compact = xmp_val.replace("-", "")
            return f'D:{compact}'
        # Spell a trailing "Z" as an explicit zero offset before parsing.
        iso_text = xmp_val[:-1] + '+00:00' if xmp_val.endswith('Z') else xmp_val
        return encode_pdf_date(datetime.fromisoformat(iso_text))

142

143

class DocinfoMapping(NamedTuple):
    """Pair a DocumentInfo key with its XMP equivalent and optional converter."""

    # XMP namespace of the property
    ns: str
    # XMP property name within that namespace
    key: str
    # DocumentInfo dictionary key
    name: Name
    # Converter class for the value, or None when no converter is used
    converter: type[Converter] | None

151

152

# Table pairing each XMP property (namespace + key) with its DocumentInfo
# name and the converter class to use, if any.
DOCINFO_MAPPING: list[DocinfoMapping] = [
    DocinfoMapping(
        ns=XMP_NS_DC, key='creator', name=Name.Author, converter=AuthorConverter
    ),
    DocinfoMapping(
        ns=XMP_NS_DC, key='description', name=Name.Subject, converter=None
    ),
    DocinfoMapping(ns=XMP_NS_DC, key='title', name=Name.Title, converter=None),
    DocinfoMapping(
        ns=XMP_NS_PDF, key='Keywords', name=Name.Keywords, converter=None
    ),
    DocinfoMapping(
        ns=XMP_NS_PDF, key='Producer', name=Name.Producer, converter=None
    ),
    DocinfoMapping(
        ns=XMP_NS_XMP,
        key='CreateDate',
        name=Name.CreationDate,
        converter=DateConverter,
    ),
    DocinfoMapping(
        ns=XMP_NS_XMP, key='CreatorTool', name=Name.Creator, converter=None
    ),
    DocinfoMapping(
        ns=XMP_NS_XMP, key='ModifyDate', name=Name.ModDate, converter=DateConverter
    ),
]

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/metadata/_converters.py: 42%

77 statements