Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_image_inline.py: 8%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

176 statements  

1# Copyright (c) 2024, pypdf contributors 

2# All rights reserved. 

3# 

4# Redistribution and use in source and binary forms, with or without 

5# modification, are permitted provided that the following conditions are 

6# met: 

7# 

8# * Redistributions of source code must retain the above copyright notice, 

9# this list of conditions and the following disclaimer. 

10# * Redistributions in binary form must reproduce the above copyright notice, 

11# this list of conditions and the following disclaimer in the documentation 

12# and/or other materials provided with the distribution. 

13# * The name of the author may not be used to endorse or promote products 

14# derived from this software without specific prior written permission. 

15# 

16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

26# POSSIBILITY OF SUCH DAMAGE. 

27 

28import logging 

29from io import BytesIO 

30from typing import IO 

31 

32from .._utils import ( 

33 WHITESPACES, 

34 WHITESPACES_AS_BYTES, 

35 StreamType, 

36 logger_warning, 

37 read_non_whitespace, 

38) 

39from ..errors import PdfReadError 

40 

41logger = logging.getLogger(__name__) 

42 

43# An inline image should be used only for small images (4096 bytes or less), 

44# but allow twice this for cases where this has been exceeded. 

45BUFFER_SIZE = 8192 

46 

47 

def _check_end_image_marker(stream: StreamType) -> bool:
    """
    Check whether the next token in the stream is the ``EI`` (end image) operator.

    The first byte is obtained through ``read_non_whitespace`` (presumably
    skipping leading whitespace), followed by up to two further bytes.
    Afterwards, the stream is moved back by exactly the number of bytes which
    form the inspected token, thus it is positioned onto the token again.

    Args:
        stream: The seekable stream to inspect.

    Returns:
        True if the token is ``EI`` followed by either a whitespace character
        or the end of the stream, False otherwise.
    """
    ei_tok = read_non_whitespace(stream)
    ei_tok += stream.read(2)
    # Near the end of the stream, less than three bytes might have been read
    # (the `EI` without a trailing whitespace is accepted below). Rewinding by
    # a fixed amount of three bytes would move the stream in front of the token
    # in this case, thus only rewind what has actually been consumed.
    stream.seek(-len(ei_tok), 1)
    return ei_tok[:2] == b"EI" and (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES)

53 

54 

def extract_inline__ascii_hex_decode(stream: StreamType) -> bytes:
    """
    Extract HexEncoded stream from inline image.
    The stream will be moved onto the EI.

    The data is read in chunks until the ``>`` delimiter is found. If an ``EI``
    shows up before any ``>``, it is used as a backup end marker.

    Args:
        stream: The seekable stream positioned at the start of the image data.

    Returns:
        The raw encoded data, including the trailing ``>`` if it has been found.

    Raises:
        PdfReadError: If the stream ends before a delimiter has been found or
            if the data is not followed by the ``EI`` operator.
    """
    data_out: bytes = b""
    # Read data until delimiter > and EI as backup.
    while True:
        data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
        if not data_buffered:
            raise PdfReadError("Unexpected end of stream")
        pos_tok = data_buffered.find(b">")
        if pos_tok >= 0:  # found >
            data_out += data_buffered[: pos_tok + 1]
            # Move the stream directly behind the delimiter.
            stream.seek(-len(data_buffered) + pos_tok + 1, 1)
            break
        pos_ei = data_buffered.find(b"EI")
        if pos_ei >= 0:  # found EI
            # Walk backwards over the whitespace in front of the EI.
            stream.seek(-len(data_buffered) + pos_ei - 1, 1)
            c = stream.read(1)
            while c in WHITESPACES:
                stream.seek(-2, 1)
                c = stream.read(1)
                pos_ei -= 1
            # If the buffer starts with the EI, the walk above leaves the buffer
            # and `pos_ei` gets negative. A negative slice end would wrongly copy
            # nearly the whole buffer (including the EI itself), thus clamp it.
            data_out += data_buffered[: max(pos_ei, 0)]
            break
        if len(data_buffered) == 2:
            # Only the two bytes kept back by the previous iteration are left.
            raise PdfReadError("Unexpected end of stream")
        # Neither > nor EI found
        # Keep the last two bytes for the next iteration, as they might be the
        # start of an `EI` which is split across two chunks.
        data_out += data_buffered[:-2]
        stream.seek(-2, 1)

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return data_out

91 

92 

def extract_inline__ascii85_decode(stream: StreamType) -> bytes:
    """
    Extract the ASCII85 encoded data of an inline image.

    Data is consumed chunk by chunk until the ``~>`` end-of-data marker shows up.
    On success, the stream is left directly behind the marker, which has to be
    followed by the ``EI`` operator.

    Args:
        stream: The seekable stream positioned at the start of the image data.

    Returns:
        The raw encoded data, including the trailing ``~>``.

    Raises:
        PdfReadError: If the stream ends before the marker has been found or
            if the data is not followed by the ``EI`` operator.
    """
    end_of_data = b"~>"
    pieces = []
    while True:
        window = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
        if not window:
            raise PdfReadError("Unexpected end of stream")

        marker_at = window.find(end_of_data)
        if marker_at != -1:
            consumed = marker_at + len(end_of_data)
            pieces.append(window[:consumed])
            # Put back everything which has been read behind the marker.
            stream.seek(consumed - len(window), 1)
            break

        if len(window) == 2:
            # Nothing but the two bytes carried over from the last round is left.
            raise PdfReadError("Unexpected end of stream")

        # The marker might be split across two chunks, thus carry over the tail.
        pieces.append(window[:-2])
        stream.seek(-2, 1)

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return b"".join(pieces)

120 

121 

def extract_inline__run_length_decode(stream: StreamType) -> bytes:
    """
    Extract the RunLengthDecode data of an inline image.

    Chunks are consumed until one of them contains the end-of-data byte ``0x80``.
    If this byte is not followed by ``EI`` (optionally after one further byte),
    a warning is emitted and the first ``EI`` inside the current chunk is used
    as the end of the data instead.

    Args:
        stream: The seekable stream positioned at the start of the image data.

    Returns:
        The raw encoded data.

    Raises:
        PdfReadError: If the stream ends before the end-of-data byte has been
            found or if the data is not followed by the ``EI`` operator.
    """
    pieces = []
    while True:
        window = stream.read(BUFFER_SIZE)
        if not window:
            raise PdfReadError("Unexpected end of stream")

        eod_at = window.find(b"\x80")
        if eod_at < 0:
            # No end-of-data byte yet, everything belongs to the image.
            pieces.append(window)
            continue

        # Regular run-length data ends at the first 0x80 byte (128 decimal).
        # There are broken files (see issue #3517) with dozens of such bytes
        # inside one inline image, thus verify that the `EI` marker follows.
        # At most one byte (usually a whitespace) is tolerated in between.
        trailer = window[eod_at + 1 : eod_at + 4]
        if trailer[:2] == b"EI" or trailer[-2:] == b"EI":
            pieces.append(window[: eod_at + 1])
            stream.seek(eod_at + 1 - len(window), 1)
        else:
            # The fallback does not handle an `EI` occurring inside the image
            # data itself; the stream is considered broken anyway in this case.
            logger_warning("Early EOD in RunLengthDecode of inline image, using fallback.", __name__)
            ei_at = window.find(b"EI")
            if ei_at > 0:
                pieces.append(window[:ei_at])
                # Move onto the byte in front of the `EI`.
                stream.seek(ei_at - 1 - len(window), 1)
        break

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return b"".join(pieces)

159 

160 

def extract_inline__dct_decode(stream: StreamType) -> bytes:
    """
    Extract DCT (JPEG) stream from inline image.
    The stream will be moved onto the EI.

    Everything in front of the first ``0xFF`` byte is dropped. Afterwards, all
    bytes are copied until the ``0xFF 0xD9`` marker has been read.

    Args:
        stream: The seekable stream positioned at the start of the image data.

    Returns:
        The JPEG data from the first ``0xFF`` byte up to and including the
        ``0xFF 0xD9`` marker.

    Raises:
        PdfReadError: If the stream ends prematurely or if the data is not
            followed by the ``EI`` operator.
    """
    def read(length: int) -> bytes:
        # If 0 bytes are returned, and *size* was not 0, this indicates end of file.
        # If the object is in non-blocking mode and no bytes are available, `None` is returned.
        _result = stream.read(length)
        if _result is None or len(_result) != length:
            raise PdfReadError("Unexpected end of stream")
        return _result

    # Collect into a mutable buffer, as the data is mostly read byte by byte.
    data_out = bytearray()
    # Read Blocks of data (ID/Size/data) up to ID=FF/D9
    # https://www.digicamsoft.com/itu/itu-t81-36.html
    not_first = False
    while True:
        c = read(1)
        if not_first or (c == b"\xff"):
            data_out += c
        if c != b"\xff":
            continue
        not_first = True
        c = read(1)
        if c == b"\xff":
            # Another 0xFF: Treat it as the start of the next marker by reading
            # it again. It will be recorded by the next iteration, thus do not
            # append it here as well - otherwise, the output would contain one
            # byte more than the input.
            stream.seek(-1, 1)  # pragma: no cover
            continue  # pragma: no cover
        data_out += c
        if c == b"\x00":  # stuffing
            pass
        elif c == b"\xd9":  # end
            break
        elif c in (
            b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf"
            b"\xda\xdb\xdc\xdd\xde\xdf"
            b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe"
        ):
            # These markers are followed by a two-byte big-endian size, which
            # includes the two size bytes themselves. Copy the whole segment.
            c = read(2)
            data_out += c
            sz = c[0] * 256 + c[1]
            data_out += read(sz - 2)

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return bytes(data_out)

206 

207 

def extract_inline_default(stream: StreamType) -> bytes:
    """
    Legacy method, used by default.

    Scan the stream for an ``EI`` operator which plausibly terminates the inline
    image and return everything in front of it. On success, the stream is left
    on the ``E`` of the accepted ``EI``.

    Args:
        stream: The seekable stream positioned at the start of the image data.

    Returns:
        The image data in front of the accepted ``EI`` operator.

    Raises:
        PdfReadError: If the stream ends before an acceptable ``EI`` operator
            has been found.
    """
    collected = BytesIO()
    while True:
        chunk = stream.read(BUFFER_SIZE)
        if not chunk:
            raise PdfReadError("Unexpected end of stream")

        # Only look for the "E": The "I" might not be part of this chunk anymore.
        e_index = chunk.find(b"E")
        if e_index == -1:
            collected.write(chunk)
            continue

        # Copy everything up to and including the "E". In case it turns out to
        # belong to the final EI, it is cut off again further below.
        collected.write(chunk[: e_index + 1])
        e_offset_in_output = collected.tell() - 1
        # Continue reading directly behind the "E".
        stream.seek(e_index + 1 - len(chunk), 1)
        resume_at = stream.tell()

        # The "I" of "EI" has to follow.
        if stream.read(1) != b"I":
            stream.seek(resume_at, 0)
            continue

        # "EI" has to be followed by at least one whitespace.
        following = stream.read(1)
        if following not in WHITESPACES:
            stream.seek(resume_at, 0)
            continue
        while following in WHITESPACES:
            following = stream.read(1)

        # Accept either whitespace in front of the EI or a following Q/EMC.
        preceded_by_whitespace = chunk[e_index - 1 : e_index] in WHITESPACES
        if not (preceded_by_whitespace or following in {b"Q", b"E"}):
            stream.seek(resume_at, 0)
            continue

        # An `EI ` sequence followed by binary data is part of the image itself
        # and cannot be its actual end.
        if is_followed_by_binary_data(stream):
            stream.seek(resume_at, 0)
            continue

        # Data contains [\s]EI[\s](Q|EMC): Move back onto the "E" and drop the
        # "E" which has been copied to the output before.
        stream.seek(resume_at - 1, 0)
        collected.truncate(e_offset_in_output)
        break

    return collected.getvalue()

258 

259 

def is_followed_by_binary_data(stream: IO[bytes], length: int = 10) -> bool:
    """
    Check if the next bytes of the stream look like binary image data or regular page content.

    This is just some heuristics due to the PDF specification being too imprecise about
    inline images containing the `EI` marker which would end an image. Starting with PDF 2.0,
    we finally get a mandatory length field, but with (proper) PDF 2.0 support being very limited
    everywhere, we should not expect to be able to remove such hacks in the near future - especially
    considering legacy documents as well.

    The actual implementation draws some inspiration from
    https://github.com/itext/itext-java/blob/9.1.0/kernel/src/main/java/com/itextpdf/kernel/pdf/canvas/parser/util/InlineImageParsingUtils.java

    Args:
        stream: The seekable stream to peek into. Its position is restored
            before returning.
        length: The maximum number of bytes to inspect.

    Returns:
        True if the upcoming bytes look like binary data, False if they look
        like regular content or if there is nothing left to read.
    """
    position = stream.tell()
    data = stream.read(length)
    stream.seek(position)
    if not data:
        return False
    operator_start = None
    operator_end = None

    for index, byte in enumerate(data):
        if byte < 32 and byte not in WHITESPACES_AS_BYTES:
            # This covers all characters not being displayable directly, although omitting whitespace
            # to allow for operator detection.
            return True
        is_whitespace = byte in WHITESPACES_AS_BYTES
        if operator_start is None and not is_whitespace:
            # Interpret all other non-whitespace characters as the start of an operation.
            operator_start = index
        if operator_start is not None and is_whitespace:
            # A whitespace stops an operation.
            # Assume that having an inline image with tons of whitespace is rather unlikely.
            operator_end = index
            break

    if operator_start is None:
        # Inline images should not have tons of whitespaces, which would lead to no operator start.
        return False
    if operator_end is None:
        # We probably are inside an operation.
        # Use the amount of data actually read instead of the requested length: Near the end of
        # the stream, fewer bytes are available and a short operator like `q` would otherwise
        # be reported with a too large length, wrongly classifying it as binary data.
        operator_end = len(data)
    operator_length = operator_end - operator_start
    operator = data[operator_start:operator_end]
    if operator.startswith(b"/") and operator_length > 1:
        # Name object.
        return False
    if operator.replace(b".", b"").isdigit():
        # Graphics operator, for example a move. A number (integer or float).
        return False
    if operator_length > 3:  # noqa: SIM103
        # Usually, the operators inside a content stream should not have more than three characters,
        # especially after an inline image.
        return True
    return False