Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pikepdf/codec.py: 42%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# SPDX-FileCopyrightText: 2022 James R. Barlow
2# SPDX-License-Identifier: MPL-2.0
4"""Implement pdfdoc codec."""
6from __future__ import annotations
8import codecs
9import sys
10from collections.abc import Container
11from typing import Any
13from pikepdf._core import pdf_doc_to_utf8, utf8_to_pdf_doc
# collections.abc.Buffer is only importable on Python 3.12 and later; on older
# interpreters fall back to ``Any`` so annotations that mention ``Buffer`` still
# have a name to refer to.
if sys.version_info < (3, 12):
    Buffer = Any
else:
    from collections.abc import Buffer
20# pylint: disable=redefined-builtin
# Set of every Unicode code point that has a representation in pdfdoc
# (see PDF Reference Manual 1.7, Table D.2). pdfdoc is an 8-bit encoding, so
# almost all of Unicode is absent from this set.
#
# Historical note: due to a bug, qpdf <= 10.5 and pikepdf < 5 had some
# inconsistencies around PdfDocEncoding.
PDFDOC_ENCODABLE = frozenset(
    {
        # Code points 0x00-0x17 and 0x20-0x7E (upper bounds inclusive).
        *range(0x00, 0x18),
        *range(0x20, 0x7F),
        # Individually listed code points above U+00FF.
        0x2022,
        0x2020,
        0x2021,
        0x2026,
        0x2014,
        0x2013,
        0x0192,
        0x2044,
        0x2039,
        0x203A,
        0x2212,
        0x2030,
        0x201E,
        0x201C,
        0x201D,
        0x2018,
        0x2019,
        0x201A,
        0x2122,
        0xFB01,
        0xFB02,
        0x0141,
        0x0152,
        0x0160,
        0x0178,
        0x017D,
        0x0131,
        0x0142,
        0x0153,
        0x0161,
        0x017E,
        0x20AC,
        # Spacing modifier letters (U+02C6 .. U+02DD).
        0x02D8,
        0x02C7,
        0x02C6,
        0x02D9,
        0x02DD,
        0x02DB,
        0x02DA,
        0x02DC,
        # Code points 0xA1-0xAC and 0xAE-0xFF; 0xA0 and 0xAD are deliberately
        # left out.
        *range(0xA1, 0xAD),
        *range(0xAE, 0x100),
    }
)
71def _find_first_index(s: str, ordinals: Container[int]) -> int:
72 for n, char in enumerate(s):
73 if ord(char) not in ordinals:
74 return n
75 raise ValueError("couldn't find the unencodable character") # pragma: no cover
def pdfdoc_encode(input: str, errors: str = 'strict') -> tuple[bytes, int]:
    """Encode a Python str as PdfDocEncoding bytes.

    Args:
        input: Text to encode.
        errors: Error handler name. ``'strict'`` raises on failure, ``'replace'``
            returns the backend output produced with ``b'?'`` as the error marker,
            and ``'ignore'`` strips the ``b'\\xad'`` error marker from the output.

    Returns:
        A ``(encoded_bytes, len(input))`` tuple, as the codec machinery expects.

    Raises:
        UnicodeEncodeError: In strict mode when the input cannot be encoded.
        LookupError: When encoding fails and *errors* is not one of the
            handlers listed above.
    """
    # The marker is handed to utf8_to_pdf_doc; 0xAD is not a member of
    # PDFDOC_ENCODABLE, so (presumably) it cannot appear in genuine output and
    # is safe to strip afterwards for 'ignore'.
    if errors == 'replace':
        marker = b'?'
    else:
        marker = b'\xad'

    ok, encoded = utf8_to_pdf_doc(input, marker)
    if ok or errors == 'replace':
        return encoded, len(input)

    if errors == 'ignore':
        return encoded.replace(b'\xad', b''), len(input)

    if errors != 'strict':
        raise LookupError(errors)

    # Strict mode from here on: work out what to blame for the failure.
    if input.startswith(('\xfe\xff', '\xff\xfe')):
        raise UnicodeEncodeError(
            'pdfdoc',
            input,
            0,
            2,
            "strings beginning with byte order marks cannot be encoded in pdfdoc",
        )

    # libqpdf does not report which character caused the error, but Python's
    # exception needs a position, so make an educated guess: the first
    # character that is not in the table of encodable code points.
    bad_at = _find_first_index(input, PDFDOC_ENCODABLE)
    raise UnicodeEncodeError(
        'pdfdoc',
        input,
        bad_at,
        bad_at + 1,
        "character cannot be represented in pdfdoc encoding",
    )
def pdfdoc_decode(input: Buffer, errors: str = 'strict') -> tuple[str, int]:
    """Decode PdfDocEncoding bytes into a Python str.

    Args:
        input: The encoded data. A ``memoryview`` is copied to ``bytes`` first.
        errors: Error handler name. Only ``'strict'`` triggers any checking;
            with any other value the decoded text is returned as is.

    Returns:
        A ``(text, number_of_bytes_consumed)`` tuple.

    Raises:
        UnicodeDecodeError: In strict mode when the decoded text contains
            U+FFFD (the replacement character).
    """
    raw = input.tobytes() if isinstance(input, memoryview) else input
    text = pdf_doc_to_utf8(raw)

    if errors != 'strict':
        return text, len(raw)

    # The position of U+FFFD in the decoded text is reported as the offending
    # byte offset in the input.
    bad_at = text.find('\ufffd')
    if bad_at < 0:
        return text, len(raw)

    raise UnicodeDecodeError(
        'pdfdoc',
        raw,
        bad_at,
        bad_at + 1,
        "no Unicode mapping is defined for this character",
    )
class PdfDocCodec(codecs.Codec):
    """Stateless codec for the PdfDocEncoding character map used inside PDFs."""

    def encode(self, input: str, errors: str = 'strict') -> tuple[bytes, int]:
        """Encode *input* to pdfdoc bytes (the codecs.Codec.encode hook).

        Delegates to pdfdoc_encode and returns its ``(bytes, consumed)`` pair.
        """
        encoded, consumed = pdfdoc_encode(input, errors)
        return encoded, consumed

    def decode(self, input: Buffer, errors: str = 'strict') -> tuple[str, int]:
        """Decode pdfdoc bytes to str (the codecs.Codec.decode hook).

        Delegates to pdfdoc_decode and returns its ``(str, consumed)`` pair.
        """
        text, consumed = pdfdoc_decode(input, errors)
        return text, consumed
class PdfDocStreamWriter(PdfDocCodec, codecs.StreamWriter):
    """Stream writer for PdfDocEncoding.

    Adds nothing of its own: encoding comes from PdfDocCodec and the stream
    handling from codecs.StreamWriter.
    """
class PdfDocStreamReader(PdfDocCodec, codecs.StreamReader):
    """Stream reader for PdfDocEncoding."""

    def decode(self, input: bytes, errors: str = 'strict') -> tuple[str, int]:
        """Decode a chunk of pdfdoc bytes (the codecs.StreamReader.decode hook).

        PdfDocCodec is the next class in the method resolution order, so this
        forwards to PdfDocCodec.decode.
        """
        return super().decode(input, errors)
class PdfDocIncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder for PdfDocEncoding.

    No state is kept between calls: every chunk is encoded on its own.
    """

    def encode(self, input: str, final: bool = False) -> bytes:
        """Encode one chunk of text (the codecs.IncrementalEncoder.encode hook).

        Args:
            input: The chunk of text to encode.
            final: Unused; accepted for interface compatibility.

        Returns:
            The pdfdoc-encoded bytes for this chunk.

        Raises:
            UnicodeEncodeError: If the chunk cannot be encoded and the error
                handler is ``'strict'`` (the default).
            LookupError: If the chunk cannot be encoded and the error handler
                is one that pdfdoc_encode does not support.
        """
        # codecs.IncrementalEncoder.__init__ stores the requested handler on
        # self.errors. Honour it instead of forcing 'strict', so that e.g.
        # open(..., encoding='pdfdoc', errors='replace') behaves as requested.
        return pdfdoc_encode(input, self.errors)[0]
class PdfDocIncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder for PdfDocEncoding.

    No state is kept between calls: every chunk is decoded on its own.
    """

    def decode(self, input: Any, final: bool = False) -> str:  # type: ignore
        """Decode one chunk of bytes (the codecs.IncrementalDecoder.decode hook).

        Args:
            input: The chunk to decode; it is converted with ``bytes(input)``
                before decoding.
            final: Unused; accepted for interface compatibility.

        Returns:
            The decoded text for this chunk.

        Raises:
            UnicodeDecodeError: If the error handler is ``'strict'`` (the
                default) and the chunk contains a byte with no Unicode mapping.
        """
        # codecs.IncrementalDecoder.__init__ stores the requested handler on
        # self.errors. Honour it instead of forcing 'strict', so a caller that
        # asked for a lenient handler does not get an exception.
        return pdfdoc_decode(bytes(input), self.errors)[0]
def find_pdfdoc(encoding: str) -> codecs.CodecInfo | None:
    """Codec search function that resolves the pdfdoc codec names.

    Answers to both "pdfdoc" and "pdfdoc_pikepdf". Ask for "pdfdoc_pikepdf" when
    pikepdf's implementation specifically is required: if some other third
    party package also installs a codec called "pdfdoc", whichever one Python
    imports first gets registered and will service all encoding under that
    name. Python's codec infrastructure offers no better mechanism for
    resolving such conflicts.

    Args:
        encoding: The codec name being looked up.

    Returns:
        A CodecInfo named after *encoding* when it is one of the two supported
        names, otherwise None.
    """
    if encoding not in ('pdfdoc', 'pdfdoc_pikepdf'):
        return None  # pragma: no cover

    stateless = PdfDocCodec()
    return codecs.CodecInfo(
        encode=stateless.encode,
        decode=stateless.decode,
        streamreader=PdfDocStreamReader,
        streamwriter=PdfDocStreamWriter,
        incrementalencoder=PdfDocIncrementalEncoder,
        incrementaldecoder=PdfDocIncrementalDecoder,
        name=encoding,
    )
# Import-time side effect: install the search function so that the names
# handled by find_pdfdoc can be resolved through Python's codec registry.
codecs.register(find_pdfdoc)

# Names exported by ``from ... import *``: the two conversion functions
# imported from pikepdf._core.
__all__ = ['utf8_to_pdf_doc', 'pdf_doc_to_utf8']