Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_image_inline.py: 8%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

176 statements  

1# Copyright (c) 2024, pypdf contributors 

2# All rights reserved. 

3# 

4# Redistribution and use in source and binary forms, with or without 

5# modification, are permitted provided that the following conditions are 

6# met: 

7# 

8# * Redistributions of source code must retain the above copyright notice, 

9# this list of conditions and the following disclaimer. 

10# * Redistributions in binary form must reproduce the above copyright notice, 

11# this list of conditions and the following disclaimer in the documentation 

12# and/or other materials provided with the distribution. 

13# * The name of the author may not be used to endorse or promote products 

14# derived from this software without specific prior written permission. 

15# 

16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

26# POSSIBILITY OF SUCH DAMAGE. 

27 

28import logging 

29from io import BytesIO 

30from typing import IO 

31 

32from .._utils import ( 

33 WHITESPACES, 

34 WHITESPACES_AS_BYTES, 

35 BinaryStreamType, 

36 StreamType, 

37 logger_warning, 

38 read_non_whitespace, 

39) 

40from ..errors import PdfReadError 

41 

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Number of bytes requested per read while scanning inline image data.
# Inline images are meant for small payloads (4096 bytes or less); twice
# that amount is used to tolerate files which exceed this recommendation.
BUFFER_SIZE = 2 * 4096

47 

48 

def _check_end_image_marker(stream: StreamType) -> bool:
    """
    Check whether the next token of the stream is the ``EI`` operator.

    The first byte is fetched through ``read_non_whitespace`` (imported
    helper; judging by its name it skips leading whitespace - confirm against
    its definition), then up to two more bytes are peeked. Afterwards the
    stream is rewound by the number of bytes that made up the peeked token.

    Args:
        stream: Seekable stream positioned at, or in the whitespace before,
            the expected end-of-image marker.

    Returns:
        True if the peeked bytes start with ``EI`` and the byte after it is
        either missing (end of stream) or contained in ``WHITESPACES``.
    """
    ei_tok = read_non_whitespace(stream)
    ei_tok += stream.read(2)
    # Rewind by what was actually read. Close to the end of the stream fewer
    # than three bytes are available; a fixed offset of -3 would then place
    # the stream in front of the marker instead of onto it.
    stream.seek(-len(ei_tok), 1)
    return ei_tok[:2] == b"EI" and (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES)

54 

55 

def extract_inline__ascii_hex_decode(stream: StreamType) -> bytes:
    """
    Extract HexEncoded stream from inline image.
    The stream will be moved onto the EI.

    The data is read in chunks until the ``>`` delimiter is found. If an
    ``EI`` shows up before any ``>``, it is used as a backup terminator and
    the whitespace in front of it is not part of the returned data.

    Args:
        stream: Seekable stream positioned at the start of the image data.

    Returns:
        The still hex-encoded image data, including the closing ``>`` if the
        data was terminated by it.

    Raises:
        PdfReadError: If the stream ends before a terminator is found or if
            the data is not followed by the ``EI`` operator.
    """
    data_out = bytearray()
    # Read data until delimiter > and EI as backup.
    while True:
        data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
        if not data_buffered:
            raise PdfReadError("Unexpected end of stream")
        pos_tok = data_buffered.find(b">")
        if pos_tok >= 0:  # found >
            data_out += data_buffered[: pos_tok + 1]
            # Place the stream right behind the delimiter.
            stream.seek(-len(data_buffered) + pos_tok + 1, 1)
            break
        pos_ei = data_buffered.find(b"EI")
        if pos_ei >= 0:  # found EI
            # Walk backwards over the whitespace in front of EI.
            stream.seek(-len(data_buffered) + pos_ei - 1, 1)
            c = stream.read(1)
            while c in WHITESPACES:
                stream.seek(-2, 1)
                c = stream.read(1)
                pos_ei -= 1
            # With EI at the very start of the buffer, the walk above leaves
            # the buffer and `pos_ei` turns negative. A negative slice bound
            # would copy nearly the whole buffer, thus clamp it to "no data".
            data_out += data_buffered[: max(pos_ei, 0)]
            break
        if len(data_buffered) <= 2:
            # Only the tail kept back by the previous iteration (or less) is
            # left and it holds no terminator. Checking for exactly two bytes
            # is not sufficient: skipped whitespace can shrink the tail to one
            # byte, and rewinding by two would then re-read it forever.
            raise PdfReadError("Unexpected end of stream")
        # Neither > nor EI found
        # Hold back the last two bytes, as a terminator might be split across reads.
        data_out += data_buffered[:-2]
        stream.seek(-2, 1)

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return bytes(data_out)

92 

93 

def extract_inline__ascii85_decode(stream: StreamType) -> bytes:
    """
    Extract A85 stream from inline image.
    The stream will be moved onto the EI.

    The data is read in chunks until the ``~>`` delimiter is found.

    Args:
        stream: Seekable stream positioned at the start of the image data.

    Returns:
        The still ASCII85-encoded image data, including the closing ``~>``.

    Raises:
        PdfReadError: If the stream ends before ``~>`` is found or if the
            data is not followed by the ``EI`` operator.
    """
    data_out = bytearray()
    # Read data until delimiter ~>
    while True:
        data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
        if not data_buffered:
            raise PdfReadError("Unexpected end of stream")
        pos_tok = data_buffered.find(b"~>")
        if pos_tok >= 0:  # found!
            data_out += data_buffered[: pos_tok + 2]
            # Place the stream right behind the delimiter.
            stream.seek(-len(data_buffered) + pos_tok + 2, 1)
            break
        if len(data_buffered) <= 2:  # end of buffer
            # Only the tail kept back by the previous iteration (or less) is
            # left and it holds no delimiter. Checking for exactly two bytes
            # is not sufficient: skipped whitespace can shrink the tail to one
            # byte, and rewinding by two would then re-read it forever.
            raise PdfReadError("Unexpected end of stream")
        # Hold back the last two bytes in case we are in the middle of ~>
        data_out += data_buffered[:-2]
        stream.seek(-2, 1)

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return bytes(data_out)

121 

122 

def extract_inline__run_length_decode(stream: StreamType) -> bytes:
    """
    Extract RL (RunLengthDecode) stream from inline image.
    The stream will be moved onto the EI.

    Chunks are copied until one contains the end-of-data byte ``0x80``. If
    ``EI`` follows that byte within the next three bytes, the data ends
    there. Otherwise a warning is logged and the first ``EI`` inside the
    current chunk is used as the end instead.

    Args:
        stream: Seekable stream positioned at the start of the image data.

    Returns:
        The still run-length-encoded image data.

    Raises:
        PdfReadError: If the stream ends before an end-of-data byte is found
            or if the data is not followed by the ``EI`` operator.
    """
    collected = bytearray()
    # Copy chunk by chunk until the EOD byte (128) shows up.
    while True:
        chunk = stream.read(BUFFER_SIZE)
        if not chunk:
            raise PdfReadError("Unexpected end of stream")
        eod_index = chunk.find(b"\x80")
        if eod_index < 0:
            # No end-of-data byte in this chunk: everything is image data.
            collected += chunk
            continue

        # Plain run-length decoding would stop at the first 0x80 (= 128), which
        # marks the EOD. However, there are files (issue #3517) with inline images
        # holding up to 51 EOD markers. Be resilient and fall back to looking for
        # `EI` in these cases. The fallback ignores the possibility of `EI` being
        # part of the image data, assuming that both issues at once are very
        # unlikely (and such an image stream would be broken anyway).
        # At most one whitespace character between EOD and `EI` is accepted for now.
        trailer = chunk[eod_index + 1 : eod_index + 4]
        if trailer.startswith(b"EI") or trailer.endswith(b"EI"):
            collected += chunk[: eod_index + 1]
            stream.seek(eod_index + 1 - len(chunk), 1)
            break

        logger_warning("Early EOD in RunLengthDecode of inline image, using fallback.", __name__)
        ei_index = chunk.find(b"EI")
        if ei_index > 0:
            collected += chunk[:ei_index]
            stream.seek(ei_index - 1 - len(chunk), 1)
        break

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return bytes(collected)

160 

161 

def extract_inline__dct_decode(stream: BinaryStreamType) -> bytes:
    """
    Extract DCT (JPEG) stream from inline image.
    The stream will be moved onto the EI.

    The data is walked marker by marker (``FF`` followed by a marker byte)
    until the end-of-image marker ``FF D9`` has been copied. Bytes in front
    of the first ``FF`` are skipped.

    Args:
        stream: Seekable binary stream positioned at the start of the image data.

    Returns:
        The image data from the first ``FF`` byte up to and including ``FF D9``.

    Raises:
        PdfReadError: If the stream ends prematurely, if a marker segment
            declares a length smaller than its own two-byte length field, or
            if the data is not followed by the ``EI`` operator.
    """
    def read(length: int) -> bytes:
        """Read exactly `length` bytes or raise `PdfReadError`."""
        if length < 0:
            # A negative size would read the remainder of the stream (or raise
            # a `ValueError` on some stream types). Keep the error type and
            # message which have been emitted for this case before.
            raise PdfReadError("Unexpected end of stream")
        # If 0 bytes are returned, and *size* was not 0, this indicates end of file.
        # If the object is in non-blocking mode and no bytes are available, `None` is returned.
        _result = stream.read(length)
        if _result is None or len(_result) != length:
            raise PdfReadError("Unexpected end of stream")
        return _result

    data_out = bytearray()
    # Read Blocks of data (ID/Size/data) up to ID=FF/D9
    # https://www.digicamsoft.com/itu/itu-t81-36.html
    not_first = False
    while True:
        c = read(1)
        if not_first or (c == b"\xff"):
            data_out += c
        if c != b"\xff":
            continue
        not_first = True
        c = read(1)
        if c == b"\xff":  # pragma: no cover
            # Fill byte: the second FF starts the next marker. Rewind so that
            # the next iteration handles (and stores) it; storing it here as
            # well would duplicate the byte in the output.
            stream.seek(-1, 1)
            continue
        data_out += c
        if c == b"\x00":  # stuffing
            pass
        elif c == b"\xd9":  # end
            break
        elif c in (
            b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf"
            b"\xda\xdb\xdc\xdd\xde\xdf"
            b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe"
        ):
            # Marker segment: two bytes of big-endian length (which includes
            # the length field itself), followed by the payload.
            c = read(2)
            data_out += c
            sz = c[0] * 256 + c[1]
            data_out += read(sz - 2)

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return bytes(data_out)

207 

208 

def extract_inline_default(stream: StreamType) -> bytes:
    """
    Legacy method, used by default.

    Copy image data until an ``EI`` is found which looks like the actual
    end-of-image operator: it has to be followed by whitespace, has to be
    either preceded by whitespace or followed by ``Q``/``E`` as the next
    non-whitespace byte, and must not be followed by binary data.

    Args:
        stream: Seekable stream positioned at the start of the image data.

    Returns:
        The image data in front of the accepted ``EI``.

    Raises:
        PdfReadError: If the stream ends before an ``EI`` has been accepted.
    """
    collected = BytesIO()
    # Copy the inline image while watching out for the EI (End Image) operator.
    while True:
        chunk = stream.read(BUFFER_SIZE)
        if not chunk:
            raise PdfReadError("Unexpected end of stream")
        # Only look for "E": the "I" might not have been loaded into the buffer yet.
        e_index = chunk.find(b"E")
        if e_index == -1:
            collected.write(chunk)
            continue

        # Copy everything up to and including the E; it is dropped again
        # further down if it turns out to belong to the real EI.
        collected.write(chunk[: e_index + 1])
        e_offset_in_output = collected.tell() - 1
        # Rewind the input to the byte right after the E.
        stream.seek(e_index + 1 - len(chunk), 1)
        resume_position = stream.tell()

        # Check for End Image
        if stream.read(1) != b"I":  # I of "EI"
            stream.seek(resume_position, 0)
            continue
        follower = stream.read(1)  # possible space after "EI"
        if follower not in WHITESPACES:
            stream.seek(resume_position, 0)
            continue
        while follower in WHITESPACES:
            follower = stream.read(1)
        preceded_by_whitespace = chunk[e_index - 1 : e_index] in WHITESPACES
        if not preceded_by_whitespace and follower not in {b"Q", b"E"}:  # for Q or EMC
            stream.seek(resume_position, 0)
            continue
        if is_followed_by_binary_data(stream):
            # The image contains an `EI ` sequence which usually marks its end, but
            # binary data follows, which does not make sense for the actual end.
            stream.seek(resume_position, 0)
            continue

        # Data contains [\s]EI[\s](Q|EMC): 4 chars are sufficient
        # Place the input onto the E and drop the E copied earlier.
        stream.seek(resume_position - 1, 0)
        collected.truncate(e_offset_in_output)
        break

    return collected.getvalue()

259 

260 

def is_followed_by_binary_data(stream: IO[bytes], length: int = 10) -> bool:
    """
    Check if the next bytes of the stream look like binary image data or regular page content.

    This is just some heuristics due to the PDF specification being too imprecise about
    inline images containing the `EI` marker which would end an image. Starting with PDF 2.0,
    we finally get a mandatory length field, but with (proper) PDF 2.0 support being very limited
    everywhere, we should not expect to be able to remove such hacks in the near future - especially
    considering legacy documents as well.

    The actual implementation draws some inspiration from
    https://github.com/itext/itext-java/blob/9.1.0/kernel/src/main/java/com/itextpdf/kernel/pdf/canvas/parser/util/InlineImageParsingUtils.java

    Args:
        stream: Seekable binary stream. Its position is restored after peeking.
        length: Maximum number of bytes to inspect.

    Returns:
        True if the inspected bytes look like binary data, False if they look
        like regular content stream data or if nothing could be read.
    """
    position = stream.tell()
    data = stream.read(length)
    stream.seek(position)
    if not data:
        return False
    operator_start = None
    operator_end = None

    for index, byte in enumerate(data):
        if byte < 32 and byte not in WHITESPACES_AS_BYTES:
            # This covers all characters not being displayable directly, although omitting whitespace
            # to allow for operator detection.
            return True
        is_whitespace = byte in WHITESPACES_AS_BYTES
        if operator_start is None and not is_whitespace:
            # Interpret all other non-whitespace characters as the start of an operation.
            operator_start = index
        if operator_start is not None and is_whitespace:
            # A whitespace stops an operation.
            # Assume that having an inline image with tons of whitespace is rather unlikely.
            operator_end = index
            break

    if operator_start is None:
        # Inline images should not have tons of whitespaces, which would lead to no operator start.
        return False
    if operator_end is None:
        # We probably are inside an operation.
        # Use the number of bytes actually read: near the end of the stream this can be
        # less than `length`, which would inflate the operator length and misclassify a
        # short trailing operator as binary data.
        operator_end = len(data)
    operator = data[operator_start:operator_end]
    operator_length = len(operator)
    if operator.startswith(b"/") and operator_length > 1:
        # Name object.
        return False
    if operator.replace(b".", b"").isdigit():
        # Graphics operator, for example a move. A number (integer or float).
        return False
    # Usually, the operators inside a content stream should not have more than three characters,
    # especially after an inline image.
    return operator_length > 3