1# Copyright (c) 2024, pypdf contributors
2# All rights reserved.
3#
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions are
6# met:
7#
8# * Redistributions of source code must retain the above copyright notice,
9# this list of conditions and the following disclaimer.
10# * Redistributions in binary form must reproduce the above copyright notice,
11# this list of conditions and the following disclaimer in the documentation
12# and/or other materials provided with the distribution.
13# * The name of the author may not be used to endorse or promote products
14# derived from this software without specific prior written permission.
15#
16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26# POSSIBILITY OF SUCH DAMAGE.
27
28import logging
29from io import BytesIO
30from typing import IO
31
32from .._utils import (
33 WHITESPACES,
34 WHITESPACES_AS_BYTES,
35 StreamType,
36 read_non_whitespace,
37)
38from ..errors import PdfReadError
39
40logger = logging.getLogger(__name__)
41
# Number of bytes requested per read while scanning for the end of the inline image data.
# An inline image should be used only for small images (4096 bytes or less),
# but allow twice this for cases where this has been exceeded.
BUFFER_SIZE = 8192
45
46
def extract_inline_AHx(stream: StreamType) -> bytes:
    """
    Extract HexEncoded stream from inline image.
    The stream will be moved onto the EI.

    Data is collected up to and including the ``>`` delimiter. If ``EI`` shows
    up before any ``>``, the bytes in front of it are returned instead, without
    the whitespace directly preceding ``EI``.

    Args:
        stream: Seekable binary stream positioned at the start of the image data.

    Returns:
        The encoded image data.

    Raises:
        PdfReadError: If the stream ends before ``>`` or ``EI`` is found, or if
            the data is not followed by ``EI`` and whitespace/end of stream.
    """
    data_out: bytes = b""
    # Read data until delimiter > and EI as backup.
    while True:
        data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
        if not data_buffered:
            raise PdfReadError("Unexpected end of stream")
        pos_tok = data_buffered.find(b">")
        if pos_tok >= 0:  # found >
            data_out += data_buffered[: pos_tok + 1]
            # Leave the stream right behind the delimiter.
            stream.seek(pos_tok + 1 - len(data_buffered), 1)
            break
        pos_ei = data_buffered.find(b"EI")
        if pos_ei >= 0:  # found EI
            # Drop the whitespace in front of EI. The index is kept inside the
            # buffer so that it can never become negative and slice from the end.
            data_end = pos_ei
            while data_end > 0 and data_buffered[data_end - 1 : data_end] in WHITESPACES:
                data_end -= 1
            data_out += data_buffered[:data_end]
            # Leave the stream on the E of EI.
            stream.seek(pos_ei - len(data_buffered), 1)
            break
        if len(data_buffered) <= 2:
            # Only the bytes kept back from the previous round (or less) are left:
            # nothing new can be consumed, so stop instead of re-reading them forever.
            raise PdfReadError("Unexpected end of stream")
        # Neither > nor EI found: keep the last two bytes for the next round
        # in case EI is split across two reads.
        data_out += data_buffered[:-2]
        stream.seek(-2, 1)

    ei_tok = read_non_whitespace(stream)
    ei_tok += stream.read(2)
    # Rewind by what has actually been read; this is less than three bytes
    # when EI is the very end of the stream.
    stream.seek(-len(ei_tok), 1)
    if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
        raise PdfReadError("EI stream not found")
    return data_out
86
87
def extract_inline_A85(stream: StreamType) -> bytes:
    """
    Extract A85 stream from inline image.
    The stream will be moved onto the EI.

    Data is collected up to and including the ``~>`` delimiter.

    Args:
        stream: Seekable binary stream positioned at the start of the image data.

    Returns:
        The encoded image data, ending with ``~>``.

    Raises:
        PdfReadError: If the stream ends before ``~>`` is found, or if the data
            is not followed by ``EI`` and whitespace/end of stream.
    """
    data_out: bytes = b""
    # Read data until delimiter ~>
    while True:
        data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
        if not data_buffered:
            raise PdfReadError("Unexpected end of stream")
        pos_tok = data_buffered.find(b"~>")
        if pos_tok >= 0:  # found!
            data_out += data_buffered[: pos_tok + 2]
            # Leave the stream right behind the delimiter.
            stream.seek(pos_tok + 2 - len(data_buffered), 1)
            break
        if len(data_buffered) <= 2:  # end of buffer
            # Only the bytes kept back from the previous round (or less) are left:
            # nothing new can be consumed, so stop instead of re-reading them forever.
            raise PdfReadError("Unexpected end of stream")
        # Keep the last two bytes for the next round in case the read ended
        # in the middle of ~>
        data_out += data_buffered[:-2]
        stream.seek(-2, 1)

    ei_tok = read_non_whitespace(stream)
    ei_tok += stream.read(2)
    # Rewind by what has actually been read; this is less than three bytes
    # when EI is the very end of the stream.
    stream.seek(-len(ei_tok), 1)
    if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
        raise PdfReadError("EI stream not found")
    return data_out
118
119
def _expect_ei_after_RL(stream: StreamType) -> None:
    """Skip whitespace, require ``EI`` plus whitespace/end of stream and leave the stream on the E."""
    ei_tok = read_non_whitespace(stream)
    ei_tok += stream.read(2)
    # Rewind by what has actually been read; this is less than three bytes
    # when EI is the very end of the stream.
    stream.seek(-len(ei_tok), 1)
    if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
        raise PdfReadError("EI stream not found")


def _extract_inline_RL_first_marker(stream: StreamType) -> bytes:
    """Return everything up to and including the first byte 128, which has to be followed by ``EI``."""
    data_out: bytes = b""
    # Read data until delimiter 128
    while True:
        data_buffered = stream.read(BUFFER_SIZE)
        if not data_buffered:
            raise PdfReadError("Unexpected end of stream")
        pos_tok = data_buffered.find(b"\x80")
        if pos_tok >= 0:  # found
            data_out += data_buffered[: pos_tok + 1]
            # Leave the stream right behind the delimiter.
            stream.seek(pos_tok + 1 - len(data_buffered), 1)
            break
        data_out += data_buffered
    _expect_ei_after_RL(stream)
    return data_out


def _extract_inline_RL_by_runs(stream: StreamType) -> bytes:
    """Walk the data run by run up to the length byte 128, which has to be followed by ``EI``."""
    data_out = BytesIO()
    while True:
        length_byte = stream.read(1)
        if not length_byte:
            raise PdfReadError("Unexpected end of stream")
        data_out.write(length_byte)
        length = length_byte[0]
        if length == 128:  # end of data
            break
        # A length byte below 128 is followed by length + 1 literal bytes,
        # a length byte above 128 by one single byte to repeat.
        payload_size = length + 1 if length < 128 else 1
        payload = stream.read(payload_size)
        if len(payload) < payload_size:
            raise PdfReadError("Unexpected end of stream")
        data_out.write(payload)
    _expect_ei_after_RL(stream)
    return data_out.getvalue()


def extract_inline_RL(stream: StreamType) -> bytes:
    """
    Extract RL (RunLengthDecode) stream from inline image.
    The stream will be moved onto the EI.

    The first byte 128 is tried as the end of the data. As 128 may also occur
    as literal or repeated data, this can fail for valid images; in this case
    the data is walked run by run instead, where 128 ends the data only when
    it is found in the place of a length byte.

    Args:
        stream: Seekable binary stream positioned at the start of the image data.

    Returns:
        The encoded image data, ending with the byte 128.

    Raises:
        PdfReadError: If neither of the two attempts finds data which is
            followed by ``EI``. The error of the first attempt is reported.
    """
    start = stream.tell()
    try:
        return _extract_inline_RL_first_marker(stream)
    except PdfReadError as first_marker_error:
        stream.seek(start, 0)
        try:
            return _extract_inline_RL_by_runs(stream)
        except PdfReadError:
            raise first_marker_error from None
144
145
def extract_inline_DCT(stream: StreamType) -> bytes:
    """
    Extract DCT (JPEG) stream from inline image.
    The stream will be moved onto the EI.

    Bytes in front of the first 0xFF are skipped. From there on, everything up
    to and including the end marker FF/D9 is copied.

    Args:
        stream: Seekable binary stream positioned at the start of the image data.

    Returns:
        The JPEG data, starting with the first 0xFF byte found.

    Raises:
        PdfReadError: If the stream ends before the end marker is found, or if
            the data is not followed by ``EI`` and whitespace/end of stream.
    """
    # Markers which are followed by a two byte big-endian length and a payload.
    segment_markers = (
        b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf"
        b"\xda\xdb\xdc\xdd\xde\xdf"
        b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe"
    )
    data_out = bytearray()
    # Read Blocks of data (ID/Size/data) up to ID=FF/D9
    # https://www.digicamsoft.com/itu/itu-t81-36.html
    notfirst = False
    while True:
        c = stream.read(1)
        if not c:
            raise PdfReadError("Unexpected end of stream")
        if notfirst or (c == b"\xff"):
            data_out += c
        if c != b"\xff":
            continue
        notfirst = True
        c = stream.read(1)
        if not c:
            raise PdfReadError("Unexpected end of stream")
        if c == b"\xff":
            # The second 0xFF may start a marker itself: read it again in the
            # next round, which is where it gets copied (exactly once).
            stream.seek(-1, 1)
            continue
        data_out += c
        if c == b"\xd9":  # end
            break
        # 0x00 (stuffing) and markers without payload need no further handling.
        if c in segment_markers:
            size_bytes = stream.read(2)
            if len(size_bytes) < 2:
                raise PdfReadError("Unexpected end of stream")
            data_out += size_bytes
            sz = size_bytes[0] * 256 + size_bytes[1]
            # The length includes its own two bytes. Never pass a negative size,
            # which would read everything up to the end of the stream.
            data_out += stream.read(max(sz - 2, 0))

    ei_tok = read_non_whitespace(stream)
    ei_tok += stream.read(2)
    # Rewind by what has actually been read; this is less than three bytes
    # when EI is the very end of the stream.
    stream.seek(-len(ei_tok), 1)
    if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
        raise PdfReadError("EI stream not found")
    return bytes(data_out)
186
187
def extract_inline_default(stream: StreamType) -> bytes:
    """
    Legacy method, used by default.

    Copies the data up to an ``EI`` which looks like the end of the image:
    ``EI`` has to be followed by whitespace, has to be either preceded by
    whitespace or followed by a token starting with ``Q`` or ``E``, and must
    not be followed by binary data. The stream is left on the E of this ``EI``.

    Args:
        stream: Seekable binary stream positioned at the start of the image data.

    Returns:
        The data in front of the accepted ``EI``.

    Raises:
        PdfReadError: If the stream ends before an acceptable ``EI`` is found.
    """
    stream_out = BytesIO()
    # Last byte copied so far, required to judge an E which is the first byte
    # of a buffer; empty as long as nothing has been copied.
    last_written = b""
    # Read the inline image, while checking for EI (End Image) operator.
    while True:
        data_buffered = stream.read(BUFFER_SIZE)
        if not data_buffered:
            raise PdfReadError("Unexpected end of stream")
        # We can not look straight for "EI" because it may not have been loaded in the buffer
        pos_ei = data_buffered.find(b"E")

        if pos_ei == -1:
            stream_out.write(data_buffered)
            last_written = data_buffered[-1:]
            continue

        # Byte in front of the E, independent of where the buffer starts.
        before_e = data_buffered[pos_ei - 1 : pos_ei] if pos_ei > 0 else last_written
        # Write out everything including E (the one from EI to be removed)
        stream_out.write(data_buffered[0 : pos_ei + 1])
        last_written = b"E"
        sav_pos_ei = stream_out.tell() - 1
        # Seek back in the stream to read the E next
        stream.seek(pos_ei + 1 - len(data_buffered), 1)
        saved_pos = stream.tell()
        # Check for End Image
        tok2 = stream.read(1)  # I of "EI"
        if tok2 != b"I":
            stream.seek(saved_pos, 0)
            continue
        tok3 = stream.read(1)  # possible space after "EI"
        if tok3 not in WHITESPACES:
            stream.seek(saved_pos, 0)
            continue
        while tok3 in WHITESPACES:
            tok3 = stream.read(1)
        if before_e not in WHITESPACES and tok3 not in {b"Q", b"E"}:  # for Q or EMC
            stream.seek(saved_pos, 0)
            continue
        if is_followed_by_binary_data(stream):
            # Inline image contains `EI ` sequence usually marking the end of it, but
            # is followed by binary data which does not make sense for the actual end.
            stream.seek(saved_pos, 0)
            continue
        # Data contains [\s]EI[\s](Q|EMC): 4 chars are sufficient
        # remove E(I) wrongly inserted earlier
        stream.seek(saved_pos - 1, 0)
        stream_out.truncate(sav_pos_ei)
        break

    return stream_out.getvalue()
238
239
def is_followed_by_binary_data(stream: IO[bytes], length: int = 10) -> bool:
    """
    Check if the next bytes of the stream look like binary image data or regular page content.

    This is just some heuristics due to the PDF specification being too imprecise about
    inline images containing the `EI` marker which would end an image. Starting with PDF 2.0,
    we finally get a mandatory length field, but with (proper) PDF 2.0 support being very limited
    everywhere, we should not expect to be able to remove such hacks in the near future - especially
    considering legacy documents as well.

    The actual implementation draws some inspiration from
    https://github.com/itext/itext-java/blob/9.1.0/kernel/src/main/java/com/itextpdf/kernel/pdf/canvas/parser/util/InlineImageParsingUtils.java

    Args:
        stream: Seekable binary stream. Its position is the same after the call.
        length: Number of bytes to look at.

    Returns:
        True if the next bytes look like binary data, False if they look like
        regular page content or if no bytes are left.
    """
    position = stream.tell()
    data = stream.read(length)
    stream.seek(position)
    if not data:
        return False
    operator_start = None
    operator_end = None

    for index, byte in enumerate(data):
        if byte < 32 and byte not in WHITESPACES_AS_BYTES:
            # This covers all characters not being displayable directly, although omitting whitespace
            # to allow for operator detection.
            return True
        is_whitespace = byte in WHITESPACES_AS_BYTES
        if operator_start is None and not is_whitespace:
            # Interpret all other non-whitespace characters as the start of an operation.
            operator_start = index
        if operator_start is not None and is_whitespace:
            # A whitespace stops an operation.
            # Assume that having an inline image with tons of whitespace is rather unlikely.
            operator_end = index
            break

    if operator_start is None:
        # Inline images should not have tons of whitespaces, which would lead to no operator start.
        return False
    if operator_end is None:
        # We probably are inside an operation. Use the number of bytes actually read,
        # which is less than the requested length close to the end of the stream.
        operator_end = len(data)
    operator = data[operator_start:operator_end]
    operator_length = len(operator)
    if operator.startswith(b"/") and operator_length > 1:
        # Name object.
        return False
    # A number may have a leading sign.
    unsigned = operator[1:] if operator[:1] in (b"+", b"-") else operator
    if unsigned.replace(b".", b"").isdigit():
        # Graphics operator, for example a move. A number (integer or float).
        return False
    # Usually, the operators inside a content stream should not have more than three characters,
    # especially after an inline image.
    return operator_length > 3