Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/codec.py: 42%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

62 statements  

1# SPDX-FileCopyrightText: 2022 James R. Barlow 

2# SPDX-License-Identifier: MPL-2.0 

3 

4"""Implement pdfdoc codec.""" 

5 

6from __future__ import annotations 

7 

8import codecs 

9import sys 

10from collections.abc import Container 

11from typing import Any 

12 

13from pikepdf._core import pdf_doc_to_utf8, utf8_to_pdf_doc 

14 

15if sys.version_info >= (3, 12): 

16 from collections.abc import Buffer 

17else: 

18 Buffer = Any 

19 

20# pylint: disable=redefined-builtin 

21 

# See PDF Reference Manual 1.7, Table D.2.
# The following builds the set of all Unicode code points that can be encoded
# in pdfdoc. Since pdfdoc is 8-bit, the vast majority of code points cannot be.

# Due to a bug, qpdf <= 10.5 and pikepdf < 5 had some inconsistencies around
# PdfDocEncoding.
PDFDOC_ENCODABLE = frozenset(
    {
        # Contiguous runs; range() excludes its stop value, so each stop is
        # one past the last code point that is included.
        *range(0x00, 0x18),
        *range(0x20, 0x7F),
        *range(0xA1, 0xAD),
        *range(0xAE, 0x100),
        # Spacing diacritics: breve, caron, circumflex, dot above,
        # double acute, ogonek, ring above, small tilde.
        0x02D8,
        0x02C7,
        0x02C6,
        0x02D9,
        0x02DD,
        0x02DB,
        0x02DA,
        0x02DC,
        # Individually listed code points above U+00FF.
        0x2022,
        0x2020,
        0x2021,
        0x2026,
        0x2014,
        0x2013,
        0x0192,
        0x2044,
        0x2039,
        0x203A,
        0x2212,
        0x2030,
        0x201E,
        0x201C,
        0x201D,
        0x2018,
        0x2019,
        0x201A,
        0x2122,
        0xFB01,
        0xFB02,
        0x0141,
        0x0152,
        0x0160,
        0x0178,
        0x017D,
        0x0131,
        0x0142,
        0x0153,
        0x0161,
        0x017E,
        0x20AC,
    }
)

69 

70 

71def _find_first_index(s: str, ordinals: Container[int]) -> int: 

72 for n, char in enumerate(s): 

73 if ord(char) not in ordinals: 

74 return n 

75 raise ValueError("couldn't find the unencodable character") # pragma: no cover 

76 

77 

def pdfdoc_encode(input: str, errors: str = 'strict') -> tuple[bytes, int]:
    """Encode a Python str as PdfDocEncoding bytes.

    Args:
        input: Text to encode.
        errors: Error policy applied when encoding fails: ``'strict'``,
            ``'ignore'`` or ``'replace'``.

    Returns:
        A ``(encoded_bytes, len(input))`` tuple, the shape Python's codec
        machinery expects.

    Raises:
        UnicodeEncodeError: With ``errors='strict'`` when encoding fails.
        LookupError: When encoding fails and *errors* is not one of the
            three supported policies.
    """
    # b'\xad' is used as the failure marker for every policy except
    # 'replace', where the marker doubles as the visible replacement.
    placeholder = b'\xad'
    if errors == 'replace':
        placeholder = b'?'

    ok, encoded = utf8_to_pdf_doc(input, placeholder)
    if ok or errors == 'replace':
        return encoded, len(input)
    if errors == 'ignore':
        # Drop the markers left where encoding failed.
        return encoded.replace(b'\xad', b''), len(input)
    if errors != 'strict':
        raise LookupError(errors)

    if input.startswith(('\xfe\xff', '\xff\xfe')):
        raise UnicodeEncodeError(
            'pdfdoc',
            input,
            0,
            2,
            "strings beginning with byte order marks cannot be encoded in pdfdoc",
        )

    # libqpdf doesn't report which character caused the failure, but the
    # exception needs a position, so take the first character that is not
    # in the encodable set as an educated guess.
    bad_at = _find_first_index(input, PDFDOC_ENCODABLE)
    raise UnicodeEncodeError(
        'pdfdoc',
        input,
        bad_at,
        bad_at + 1,
        "character cannot be represented in pdfdoc encoding",
    )

112 

113 

def pdfdoc_decode(input: Buffer, errors: str = 'strict') -> tuple[str, int]:
    """Convert PdfDoc-encoded input into a Python str.

    Args:
        input: PdfDocEncoding bytes. A ``memoryview`` is copied to ``bytes``
            before being handed to the decoder.
        errors: Error policy. ``'strict'`` raises on the first byte that has
            no Unicode mapping; ``'ignore'`` drops such bytes from the
            result; any other value leaves U+FFFD (the Unicode replacement
            character) in their place.

    Returns:
        A ``(text, len(input))`` tuple, the shape Python's codec machinery
        expects.

    Raises:
        UnicodeDecodeError: With ``errors='strict'`` when the decoded text
            contains U+FFFD.
    """
    if isinstance(input, memoryview):
        input = input.tobytes()
    s = pdf_doc_to_utf8(input)
    # U+FFFD in the decoded text is treated as the marker for a byte with no
    # Unicode mapping.
    if errors == 'strict':
        idx = s.find('\ufffd')
        if idx >= 0:
            # NOTE(review): using the str index as the byte offset assumes
            # one decoded character per input byte -- confirm against
            # pdf_doc_to_utf8.
            raise UnicodeDecodeError(
                'pdfdoc',
                input,
                idx,
                idx + 1,
                "no Unicode mapping is defined for this character",
            )
    elif errors == 'ignore':
        # Mirror pdfdoc_encode, which strips its failure markers under
        # 'ignore'; previously the markers were left in the output.
        s = s.replace('\ufffd', '')

    return s, len(input)

131 

132 

class PdfDocCodec(codecs.Codec):
    """Stateless codec for the PdfDocEncoding character map used inside PDFs.

    Both methods delegate to the module-level ``pdfdoc_encode`` and
    ``pdfdoc_decode`` functions.
    """

    def encode(self, input: str, errors: str = 'strict') -> tuple[bytes, int]:
        """Encode *input* to pdfdoc bytes; see ``pdfdoc_encode``.

        Returns the ``(bytes, length_consumed)`` tuple produced by
        ``pdfdoc_encode``.
        """
        return pdfdoc_encode(input, errors=errors)

    def decode(self, input: Buffer, errors: str = 'strict') -> tuple[str, int]:
        """Decode pdfdoc bytes to str; see ``pdfdoc_decode``.

        Returns the ``(str, length_consumed)`` tuple produced by
        ``pdfdoc_decode``.
        """
        return pdfdoc_decode(input, errors=errors)

143 

144 

class PdfDocStreamWriter(PdfDocCodec, codecs.StreamWriter):
    """Stream writer for PdfDocEncoding.

    Adds nothing of its own: encoding comes from PdfDocCodec and the stream
    handling from codecs.StreamWriter.
    """

147 

148 

class PdfDocStreamReader(PdfDocCodec, codecs.StreamReader):
    """Stream reader for PdfDocEncoding."""

    def decode(self, input: bytes, errors: str = 'strict') -> tuple[str, int]:
        """Decode a chunk read from the stream.

        Delegates explicitly to ``PdfDocCodec.decode`` and returns its
        ``(str, length_consumed)`` tuple unchanged.
        """
        decoded = PdfDocCodec.decode(self, input, errors)
        return decoded

155 

156 

class PdfDocIncrementalEncoder(codecs.IncrementalEncoder):
    """Implement PdfDocEncoding incremental encoder.

    No state is kept between calls: each chunk is passed to ``pdfdoc_encode``
    on its own.
    """

    def encode(self, input: str, final: bool = False) -> bytes:
        """Implement codecs.IncrementalEncoder.encode for pdfdoc.

        Args:
            input: Text chunk to encode.
            final: Accepted for interface compatibility; not used.

        Returns:
            The encoded bytes for this chunk.

        Raises:
            UnicodeEncodeError: If the chunk cannot be encoded and the
                encoder's error handler is ``'strict'``.
            LookupError: If the chunk cannot be encoded and the encoder's
                error handler is one ``pdfdoc_encode`` does not support.
        """
        # Use the handler chosen at construction time (stored on self.errors
        # by codecs.IncrementalEncoder.__init__, default 'strict') rather
        # than forcing 'strict', so e.g. errors='replace' is respected.
        return pdfdoc_encode(input, self.errors)[0]

163 

164 

class PdfDocIncrementalDecoder(codecs.IncrementalDecoder):
    """Implement PdfDocEncoding incremental decoder.

    No state is kept between calls: each chunk is passed to ``pdfdoc_decode``
    on its own.
    """

    def decode(self, input: Any, final: bool = False) -> str:  # type: ignore
        """Implement codecs.IncrementalDecoder.decode for pdfdoc.

        Args:
            input: Chunk of encoded data; converted with ``bytes(input)``
                before decoding.
            final: Accepted for interface compatibility; not used.

        Returns:
            The decoded text for this chunk.

        Raises:
            UnicodeDecodeError: If the chunk contains a byte with no Unicode
                mapping and the decoder's error handler is ``'strict'``.
        """
        # Use the handler chosen at construction time (stored on self.errors
        # by codecs.IncrementalDecoder.__init__, default 'strict') rather
        # than forcing 'strict', matching the stateless codec's behaviour.
        return pdfdoc_decode(bytes(input), self.errors)[0]

171 

172 

def find_pdfdoc(encoding: str) -> codecs.CodecInfo | None:
    """Codec search function for the pdfdoc encoding.

    Answers to both "pdfdoc" and "pdfdoc_pikepdf". Use "pdfdoc_pikepdf" if
    pikepdf's codec is specifically required: if another third party package
    also installs a codec named pdfdoc, whichever is imported first by Python
    will be registered and will service all encoding. Unfortunately, Python's
    codec infrastructure does not give a better mechanism for resolving
    conflicts.

    Args:
        encoding: The encoding name being looked up.

    Returns:
        A ``codecs.CodecInfo`` named after *encoding* when it is one of the
        two supported names, otherwise ``None``.
    """
    if encoding not in ('pdfdoc', 'pdfdoc_pikepdf'):
        return None  # pragma: no cover

    stateless = PdfDocCodec()
    return codecs.CodecInfo(
        encode=stateless.encode,
        decode=stateless.decode,
        streamreader=PdfDocStreamReader,
        streamwriter=PdfDocStreamWriter,
        incrementalencoder=PdfDocIncrementalEncoder,
        incrementaldecoder=PdfDocIncrementalDecoder,
        name=encoding,
    )

194 

195 

# Importing this module installs the search function, making the encoding
# names accepted by find_pdfdoc available to str.encode / bytes.decode.
codecs.register(find_pdfdoc)

# Names exported by ``from ... import *``.
__all__ = [
    'utf8_to_pdf_doc',
    'pdf_doc_to_utf8',
]