Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/codec.py: 42%

1# SPDX-FileCopyrightText: 2022 James R. Barlow

2# SPDX-License-Identifier: MPL-2.0

4"""Implement pdfdoc codec."""

6from __future__ import annotations

8import codecs

9import sys

10from collections.abc import Container

11from typing import Any

13from pikepdf._core import pdf_doc_to_utf8, utf8_to_pdf_doc

15if sys.version_info >= (3, 12):

16 from collections.abc import Buffer

17else:

18 Buffer = Any

20# pylint: disable=redefined-builtin

22# See PDF Reference Manual 1.7, Table D.2.

23# The following generates the set of all Unicode code points that can be encoded in

24# pdfdoc. Since pdfdoc is 8-bit, the vast majority of code points cannot be.

26# Due to a bug, qpdf <= 10.5 and pikepdf < 5 had some inconsistencies around

27# PdfDocEncoding.

# Unicode code points that have a pdfdoc representation. The set is built as
# a union of contiguous ranges plus the individual code points that live
# outside Latin-1.
PDFDOC_ENCODABLE = frozenset().union(
    # Low control characters and printable ASCII. 0x18-0x1F and 0x7F are
    # absent from these ranges.
    range(0x00, 0x17 + 1),
    range(0x20, 0x7E + 1),
    # Latin-1 supplement. 0xA0 and 0xAD are absent from these ranges.
    range(0xA1, 0xAC + 1),
    range(0xAE, 0xFF + 1),
    # Spacing accents (breve, caron, circumflex, dot above, double acute,
    # ogonek, ring above, small tilde).
    (0x02D8, 0x02C7, 0x02C6, 0x02D9, 0x02DD, 0x02DB, 0x02DA, 0x02DC),
    # Typographic punctuation and symbols (U+0192 is the florin sign).
    (
        0x2022,
        0x2020,
        0x2021,
        0x2026,
        0x2014,
        0x2013,
        0x0192,
        0x2044,
        0x2039,
        0x203A,
        0x2212,
        0x2030,
        0x201E,
        0x201C,
        0x201D,
        0x2018,
        0x2019,
        0x201A,
        0x2122,
    ),
    # fi and fl ligatures.
    (0xFB01, 0xFB02),
    # Latin Extended-A letters.
    (
        0x0141,
        0x0152,
        0x0160,
        0x0178,
        0x017D,
        0x0131,
        0x0142,
        0x0153,
        0x0161,
        0x017E,
    ),
    # Euro sign.
    (0x20AC,),
)

71def _find_first_index(s: str, ordinals: Container[int]) -> int:

72 for n, char in enumerate(s):

73 if ord(char) not in ordinals:

74 return n

75 raise ValueError("couldn't find the unencodable character") # pragma: no cover

def pdfdoc_encode(input: str, errors: str = 'strict') -> tuple[bytes, int]:
    """Encode a Python str as PdfDocEncoding bytes.

    Args:
        input: Text to encode.
        errors: Error handler name. ``'strict'``, ``'ignore'`` and
            ``'replace'`` are recognised; the handler is only consulted when
            the text cannot be encoded completely.

    Returns:
        A ``(encoded_bytes, characters_consumed)`` tuple, where the second
        item is always ``len(input)``.

    Raises:
        UnicodeEncodeError: In ``'strict'`` mode, when the text begins with a
            byte order mark or contains a character with no pdfdoc mapping.
        LookupError: When encoding fails and *errors* is not one of the
            recognised handler names.
    """
    # Unencodable characters come back as this placeholder byte: b'?' for
    # 'replace', b'\xad' for every other mode ('ignore' strips it below).
    if errors == 'replace':
        placeholder = b'?'
    else:
        placeholder = b'\xad'

    ok, encoded = utf8_to_pdf_doc(input, placeholder)
    consumed = len(input)

    if ok or errors == 'replace':
        return encoded, consumed
    if errors == 'ignore':
        return encoded.replace(b'\xad', b''), consumed
    if errors != 'strict':
        raise LookupError(errors)

    if input.startswith(('\xfe\xff', '\xff\xfe')):
        raise UnicodeEncodeError(
            'pdfdoc',
            input,
            0,
            2,
            "strings beginning with byte order marks cannot be encoded in pdfdoc",
        )

    # libqpdf does not report which character failed, but Python's exception
    # needs a position, so report the first character outside the encodable
    # set as an educated guess.
    bad_index = _find_first_index(input, PDFDOC_ENCODABLE)
    raise UnicodeEncodeError(
        'pdfdoc',
        input,
        bad_index,
        bad_index + 1,
        "character cannot be represented in pdfdoc encoding",
    )

112

113

def pdfdoc_decode(input: Buffer, errors: str = 'strict') -> tuple[str, int]:
    """Decode PdfDocEncoding bytes into a Python str.

    Args:
        input: Encoded data. A ``memoryview`` is copied to ``bytes`` first.
        errors: Error handler name. Only ``'strict'`` changes behaviour; any
            other value returns the text exactly as produced by the decoder.

    Returns:
        A ``(text, bytes_consumed)`` tuple, where the second item is the
        length of the input.

    Raises:
        UnicodeDecodeError: In ``'strict'`` mode, when the decoded text
            contains U+FFFD, i.e. a byte had no Unicode mapping.
    """
    raw = input.tobytes() if isinstance(input, memoryview) else input
    text = pdf_doc_to_utf8(raw)

    # The position of U+FFFD in the text is reported as the offending byte
    # offset.
    bad_index = text.find('\ufffd')
    if errors == 'strict' and bad_index >= 0:
        raise UnicodeDecodeError(
            'pdfdoc',
            raw,
            bad_index,
            bad_index + 1,
            "no Unicode mapping is defined for this character",
        )

    return text, len(raw)

131

132

class PdfDocCodec(codecs.Codec):
    """Codec object for PdfDocEncoding, the character map used inside PDFs.

    Both methods delegate to the module-level pdfdoc_encode and
    pdfdoc_decode functions.
    """

    def encode(self, input: str, errors: str = 'strict') -> tuple[bytes, int]:
        """Encode *input* to pdfdoc; return ``(bytes, characters_consumed)``."""
        encoded, consumed = pdfdoc_encode(input, errors)
        return encoded, consumed

    def decode(self, input: Buffer, errors: str = 'strict') -> tuple[str, int]:
        """Decode pdfdoc *input*; return ``(text, bytes_consumed)``."""
        text, consumed = pdfdoc_decode(input, errors)
        return text, consumed

143

144

class PdfDocStreamWriter(PdfDocCodec, codecs.StreamWriter):
    """Stream writer for PdfDocEncoding.

    Adds nothing of its own: encoding comes from PdfDocCodec and the stream
    handling from codecs.StreamWriter.
    """

147

148

class PdfDocStreamReader(PdfDocCodec, codecs.StreamReader):
    """Stream reader for PdfDocEncoding."""

    def decode(self, input: bytes, errors: str = 'strict') -> tuple[str, int]:
        """Decode a chunk read from the stream; return ``(text, bytes_consumed)``.

        Explicitly routes to PdfDocCodec.decode rather than relying on
        method resolution order.
        """
        text, consumed = PdfDocCodec.decode(self, input, errors)
        return text, consumed

155

156

class PdfDocIncrementalEncoder(codecs.IncrementalEncoder):
    """Implement PdfDocEncoding incremental encoder.

    No state is carried between calls: each chunk is handed to pdfdoc_encode
    on its own.
    """

    def encode(self, input: str, final: bool = False) -> bytes:
        """Implement codecs.IncrementalEncoder.encode for pdfdoc.

        Args:
            input: Text chunk to encode.
            final: Accepted for interface compatibility; unused because
                nothing is buffered between chunks.

        Returns:
            The chunk encoded as pdfdoc bytes.

        Raises:
            UnicodeEncodeError: If the chunk cannot be encoded and the
                encoder was created with ``errors='strict'`` (the default).
            LookupError: If the chunk cannot be encoded and the encoder was
                created with an error handler pdfdoc_encode does not support.
        """
        # Use the handler chosen at construction time, which
        # codecs.IncrementalEncoder.__init__ stores as self.errors, instead
        # of forcing 'strict'. Otherwise errors='replace'/'ignore' would be
        # silently disregarded.
        return pdfdoc_encode(input, self.errors)[0]

163

164

class PdfDocIncrementalDecoder(codecs.IncrementalDecoder):
    """Implement PdfDocEncoding incremental decoder.

    No state is carried between calls: each chunk is handed to pdfdoc_decode
    on its own.
    """

    def decode(self, input: Any, final: bool = False) -> str:  # type: ignore
        """Implement codecs.IncrementalDecoder.decode for pdfdoc.

        Args:
            input: Chunk of encoded data; it is converted with ``bytes()``
                before decoding.
            final: Accepted for interface compatibility; unused because
                nothing is buffered between chunks.

        Returns:
            The decoded text for this chunk.

        Raises:
            UnicodeDecodeError: If the chunk contains a byte with no Unicode
                mapping and the decoder was created with ``errors='strict'``
                (the default).
        """
        # Use the handler chosen at construction time, which
        # codecs.IncrementalDecoder.__init__ stores as self.errors, instead
        # of forcing 'strict', so the incremental API agrees with the
        # stateless decoder.
        return pdfdoc_decode(bytes(input), self.errors)[0]

171

172

def find_pdfdoc(encoding: str) -> codecs.CodecInfo | None:
    """Codec search function that resolves the pdfdoc codec names.

    Two names are served: "pdfdoc" and "pdfdoc_pikepdf". Ask for
    "pdfdoc_pikepdf" when pikepdf's implementation specifically is required.
    Should another third party package also install a codec called "pdfdoc",
    whichever is imported first by Python gets registered and services all
    encoding under that name; Python's codec infrastructure offers no better
    mechanism for resolving such conflicts.

    Args:
        encoding: Codec name being looked up.

    Returns:
        A ``codecs.CodecInfo`` named after *encoding* when it is one of the
        two supported names, otherwise ``None``.
    """
    if encoding not in ('pdfdoc', 'pdfdoc_pikepdf'):
        return None  # pragma: no cover

    stateless = PdfDocCodec()
    return codecs.CodecInfo(
        encode=stateless.encode,
        decode=stateless.decode,
        streamreader=PdfDocStreamReader,
        streamwriter=PdfDocStreamWriter,
        incrementalencoder=PdfDocIncrementalEncoder,
        incrementaldecoder=PdfDocIncrementalDecoder,
        name=encoding,
    )

194

195

# Public names of this module: the two conversion functions imported from
# pikepdf._core.
__all__ = ['utf8_to_pdf_doc', 'pdf_doc_to_utf8']

# Importing this module installs find_pdfdoc as a codec search function.
codecs.register(find_pdfdoc)