Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/models/metadata/_converters.py: 42%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

77 statements  

1# SPDX-FileCopyrightText: 2022 James R. Barlow 

2# SPDX-License-Identifier: MPL-2.0 

3 

4"""Converters for XMP <-> DocumentInfo value transformation.""" 

5 

6from __future__ import annotations 

7 

8from abc import ABC, abstractmethod 

9from datetime import datetime 

10from typing import Any, NamedTuple 

11 

12from pikepdf.models.metadata._constants import XMP_NS_DC, XMP_NS_PDF, XMP_NS_XMP 

13from pikepdf.objects import Name, String 

14 

15 

def encode_pdf_date(d: datetime) -> str:
    """Render a Python datetime as a PDF (pdfmark) date string.

    The Adobe pdfmark manual gives the layout as::

        (D:YYYYMMDDHHmmSSOHH'mm')

    ``D:`` is an optional prefix and ``YYYY`` is the year; every field after
    the year is optional. ``MM`` is the month (01-12), ``DD`` the day
    (01-31), ``HH`` the hour (00-23), ``mm`` the minutes (00-59) and ``SS``
    the seconds (00-59). The tail of the string relates local time to GMT:
    ``O`` is ``+`` when local time is later than GMT and ``-`` when it is
    earlier, ``HH'`` is the absolute offset in hours and ``mm'`` the absolute
    offset in minutes. When no GMT information is present the relation to
    GMT is considered unknown. Either way, the fields before the offset
    express local time.

    'D:' is required in PDF/A, so we always add it.

    Args:
        d: The datetime to encode. If ``d.strftime('%z')`` yields an offset
            it is appended; otherwise the result carries no offset.

    Returns:
        The encoded date, e.g. ``D:20200102030405+00'00``.
    """
    # %Y is not padded consistently across platforms
    # (https://bugs.python.org/issue13305, underspecified in libc), so the
    # year is zero-padded by hand instead of going through strftime.
    year = f"{d.year:04d}"
    remainder = d.strftime(r'%m%d%H%M%S')
    offset = d.strftime('%z')
    if not offset:
        return f"D:{year}{remainder}"
    # %z yields e.g. "+0530"; PDF wants an apostrophe between hours and minutes.
    sign, hours, minutes = offset[0], offset[1:3], offset[3:5]
    return f"D:{year}{remainder}{sign}{hours}'{minutes}"

46 

47 

def decode_pdf_date(s: str) -> datetime:
    """Decode a pdfmark date to a Python datetime object.

    A pdfmark date is a string in a particular format, as described in
    :func:`encode_pdf_date`. The leading ``D:`` is optional, and so is every
    field after the year, so ``D:2020``, ``D:202001021230`` and
    ``D:20200102030405+01'00'`` are all accepted.

    Args:
        s: The date string. A pikepdf ``String`` is converted with ``str()``
            first.

    Returns:
        A datetime that is timezone-aware when the string carried an offset
        (or a ``Z`` UTC marker) and naive otherwise. Omitted fields take
        the defaults supplied by ``datetime.strptime``.

    Raises:
        ValueError: If the string matches none of the supported layouts.
    """
    # Only consult the pikepdf String type for values that are not already str.
    if not isinstance(s, str) and isinstance(s, String):
        s = str(s)
    t = s[2:] if s.startswith('D:') else s

    utcs = (
        "Z00'00'",  # Literal Z00'00', is incorrect but found in the wild
        "Z00'00",  # Correctly formatted UTC
        "Z",  # Alternate UTC
    )
    for utc in utcs:
        if t.endswith(utc):
            # Rewrite the suffix only; str.replace would also touch any
            # earlier occurrence of the marker.
            t = t[: -len(utc)] + "+0000"
            break
    t = t.replace("'", "")  # Remove apos from PDF time strings

    # strptime lets a numeric field match a single digit, so feeding a
    # truncated date to the full format silently misreads it (12 digits
    # "...1230" becomes 12:03:00). When the digit run has an unambiguous
    # length, parse it with the format of exactly that length first.
    sign_at = next((i for i, ch in enumerate(t) if ch in "+-"), len(t))
    stamp, offset = t[:sign_at], t[sign_at:]
    exact_formats = {
        4: r"%Y",
        6: r"%Y%m",
        8: r"%Y%m%d",
        10: r"%Y%m%d%H",
        12: r"%Y%m%d%H%M",
        14: r"%Y%m%d%H%M%S",
    }
    exact = exact_formats.get(len(stamp)) if stamp.isdigit() else None
    if exact is not None:
        try:
            return datetime.strptime(t, exact + (r"%z" if offset else ""))
        except ValueError:
            pass  # Fall back to the lenient formats below

    # Lenient fallback: keeps accepting irregular strings that were
    # accepted before the exact-length pass existed.
    date_formats = (
        r"%Y%m%d%H%M%S%z",  # Format with timezone
        r"%Y%m%d%H%M%S",  # Format without timezone
        r"%Y%m%d",  # Date only format
    )
    for date_format in date_formats:
        try:
            return datetime.strptime(t, date_format)
        except ValueError:
            continue
    raise ValueError(f"Date string does not match any known format: {s} (read as {t})")

81 

82 

class Converter(ABC):
    """Abstract two-way translator between DocumentInfo and XMP values.

    Subclasses supply both directions as static methods, so a converter is
    used through its class and never needs to be instantiated.
    """

    @staticmethod
    @abstractmethod
    def xmp_from_docinfo(docinfo_val: str | None) -> Any:  # type: ignore
        """Return the XMP value that corresponds to a DocumentInfo string."""

    @staticmethod
    @abstractmethod
    def docinfo_from_xmp(xmp_val: Any) -> str | None:
        """Return the DocumentInfo value that corresponds to an XMP value."""

95 

96 

class AuthorConverter(Converter):
    """Translate author values between XMP and DocumentInfo."""

    @staticmethod
    def xmp_from_docinfo(docinfo_val: str | None) -> Any:  # type: ignore
        """Wrap the single DocumentInfo author in a one-element list."""
        return [docinfo_val]

    @staticmethod
    def docinfo_from_xmp(xmp_val):
        """Collapse XMP authors into one DocumentInfo string.

        XMP can carry several authors whereas DocumentInfo holds a single
        string, so the authors are joined with ``'; '``.

        A plain string is returned as-is. ``None`` and ``[None]`` both map
        to ``None``. For any other iterable, ``None`` entries are dropped
        before joining.
        """
        if isinstance(xmp_val, str):
            return xmp_val
        no_authors = xmp_val is None or xmp_val == [None]
        if no_authors:
            return None
        named = [entry for entry in xmp_val if entry is not None]
        return '; '.join(named)

117 

118 

class DateConverter(Converter):
    """Translate date values between XMP and DocumentInfo."""

    @staticmethod
    def xmp_from_docinfo(docinfo_val):
        """Turn a DocumentInfo date into its XMP (ISO 8601 style) form.

        An empty string is passed through unchanged. A value that, after
        dropping an optional ``D:`` prefix, is exactly four digits is
        returned as ``YYYY``; exactly six digits is returned as ``YYYY-MM``.
        Anything else goes through :func:`decode_pdf_date` and is rendered
        with ``datetime.isoformat()``.
        """
        if isinstance(docinfo_val, String):
            docinfo_val = str(docinfo_val)
        if docinfo_val == '':
            return ''
        if docinfo_val.startswith('D:'):
            bare = docinfo_val[2:]
        else:
            bare = docinfo_val
        # Year-only and year-month values are handled here rather than being
        # handed to decode_pdf_date.
        if len(bare) in (4, 6) and bare.isdigit():
            if len(bare) == 4:
                return bare
            return f'{bare[:4]}-{bare[4:]}'
        return decode_pdf_date(docinfo_val).isoformat()

    @staticmethod
    def docinfo_from_xmp(xmp_val):
        """Turn an XMP date into a DocumentInfo (PDF) date string.

        A value of length 4 or 7 without a ``T`` (such as ``YYYY`` or
        ``YYYY-MM``) is emitted as ``D:`` followed by the value with hyphens
        removed. Otherwise a trailing ``Z`` is rewritten to ``+00:00``, the
        value is parsed with ``datetime.fromisoformat`` and then encoded
        with :func:`encode_pdf_date`.
        """
        if len(xmp_val) in (4, 7) and 'T' not in xmp_val:
            compact = xmp_val.replace("-", "")
            return f'D:{compact}'
        iso_text = xmp_val[:-1] + '+00:00' if xmp_val.endswith('Z') else xmp_val
        parsed = datetime.fromisoformat(iso_text)
        return encode_pdf_date(parsed)

142 

143 

class DocinfoMapping(NamedTuple):
    """Pair one DocumentInfo key with its XMP equivalent and a converter."""

    # XMP namespace that holds the property
    ns: str
    # Property name inside that namespace
    key: str
    # DocumentInfo dictionary key
    name: Name
    # Converter class for values of this property, or None when there is none
    converter: type[Converter] | None

151 

152 

# Correspondence table between XMP properties and DocumentInfo keys.
# Rows whose converter is None have no converter class attached.
DOCINFO_MAPPING: list[DocinfoMapping] = [
    # Dublin Core namespace
    DocinfoMapping(
        ns=XMP_NS_DC, key='creator', name=Name.Author, converter=AuthorConverter
    ),
    DocinfoMapping(
        ns=XMP_NS_DC, key='description', name=Name.Subject, converter=None
    ),
    DocinfoMapping(ns=XMP_NS_DC, key='title', name=Name.Title, converter=None),
    # PDF namespace
    DocinfoMapping(
        ns=XMP_NS_PDF, key='Keywords', name=Name.Keywords, converter=None
    ),
    DocinfoMapping(
        ns=XMP_NS_PDF, key='Producer', name=Name.Producer, converter=None
    ),
    # XMP basic namespace
    DocinfoMapping(
        ns=XMP_NS_XMP,
        key='CreateDate',
        name=Name.CreationDate,
        converter=DateConverter,
    ),
    DocinfoMapping(
        ns=XMP_NS_XMP, key='CreatorTool', name=Name.Creator, converter=None
    ),
    DocinfoMapping(
        ns=XMP_NS_XMP, key='ModifyDate', name=Name.ModDate, converter=DateConverter
    ),
]