Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/generic/_image_inline.py: 7%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

176 statements  

1# Copyright (c) 2024, pypdf contributors 

2# All rights reserved. 

3# 

4# Redistribution and use in source and binary forms, with or without 

5# modification, are permitted provided that the following conditions are 

6# met: 

7# 

8# * Redistributions of source code must retain the above copyright notice, 

9# this list of conditions and the following disclaimer. 

10# * Redistributions in binary form must reproduce the above copyright notice, 

11# this list of conditions and the following disclaimer in the documentation 

12# and/or other materials provided with the distribution. 

13# * The name of the author may not be used to endorse or promote products 

14# derived from this software without specific prior written permission. 

15# 

16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

17# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

18# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

19# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

20# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

21# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

22# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

23# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

24# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

25# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

26# POSSIBILITY OF SUCH DAMAGE. 

27 

28import logging 

29from io import BytesIO 

30from typing import IO 

31 

32from .._utils import ( 

33 WHITESPACES, 

34 WHITESPACES_AS_BYTES, 

35 StreamType, 

36 read_non_whitespace, 

37) 

38from ..errors import PdfReadError 

39 

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# An inline image should be used only for small images (4096 bytes or less),
# but allow twice this for cases where this has been exceeded.
# This is the number of bytes requested per ``stream.read()`` call by the
# chunked ``extract_inline_*`` readers in this module.
BUFFER_SIZE = 8192

45 

46 

def extract_inline_AHx(stream: StreamType) -> bytes:
    """
    Extract HexEncoded stream from inline image.
    The stream will be moved onto the EI.

    The data is consumed in chunks until the ``>`` delimiter is found; a
    bare ``EI`` token is accepted as a backup terminator when ``>`` is missing.

    Args:
        stream: Seekable binary stream positioned at the start of the image data.

    Returns:
        The encoded image data, including the closing ``>`` when it was found.

    Raises:
        PdfReadError: If the stream ends before a terminator is found, or if
            the data is not followed by an ``EI`` token.
    """
    data_out = bytearray()
    # Read data until delimiter > and EI as backup.
    while True:
        data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
        if not data_buffered:
            raise PdfReadError("Unexpected end of stream")
        pos_tok = data_buffered.find(b">")
        if pos_tok >= 0:  # found >
            data_out += data_buffered[: pos_tok + 1]
            # Rewind to the byte right after the delimiter.
            stream.seek(-len(data_buffered) + pos_tok + 1, 1)
            break
        pos_ei = data_buffered.find(b"EI")
        if pos_ei >= 0:  # found EI
            # Step back to the byte preceding "EI" and walk backwards over
            # any whitespace, so the stream ends up right after the data.
            stream.seek(-len(data_buffered) + pos_ei - 1, 1)
            c = stream.read(1)
            while c in WHITESPACES:
                stream.seek(-2, 1)
                c = stream.read(1)
                pos_ei -= 1
            # The walk may pass the start of this buffer (whitespace that was
            # skipped before it); clamp so that a negative index does not
            # slice from the end and append the "EI..." tail to the data.
            data_out += data_buffered[: max(pos_ei, 0)]
            break
        if len(data_buffered) <= 2:
            # Only the bytes retained from the previous pass (or fewer) are
            # left: the stream is exhausted. Checking "<=" instead of "=="
            # avoids looping forever when a single byte remains after
            # skipped whitespace.
            raise PdfReadError("Unexpected end of stream")
        # Neither > nor EI found: keep the last two bytes unread so that an
        # "EI" split across two chunks is still detected on the next pass.
        data_out += data_buffered[:-2]
        stream.seek(-2, 1)

    ei_tok = read_non_whitespace(stream)
    ei_tok += stream.read(2)
    stream.seek(-3, 1)
    if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
        raise PdfReadError("EI stream not found")
    return bytes(data_out)

86 

87 

def extract_inline_A85(stream: StreamType) -> bytes:
    """
    Extract A85 stream from inline image.
    The stream will be moved onto the EI.

    The data is consumed in chunks until the ``~>`` delimiter is found.

    Args:
        stream: Seekable binary stream positioned at the start of the image data.

    Returns:
        The encoded image data, including the closing ``~>``.

    Raises:
        PdfReadError: If the stream ends before ``~>`` is found, or if the
            data is not followed by an ``EI`` token.
    """
    data_out = bytearray()
    # Read data until delimiter ~>
    while True:
        data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
        if not data_buffered:
            raise PdfReadError("Unexpected end of stream")
        pos_tok = data_buffered.find(b"~>")
        if pos_tok >= 0:  # found!
            data_out += data_buffered[: pos_tok + 2]
            # Rewind to the byte right after the delimiter.
            stream.seek(-len(data_buffered) + pos_tok + 2, 1)
            break
        if len(data_buffered) <= 2:  # end of buffer
            # Only the bytes retained from the previous pass (or fewer) are
            # left: the stream is exhausted. Checking "<=" instead of "=="
            # avoids looping forever when a single byte remains after
            # skipped whitespace.
            raise PdfReadError("Unexpected end of stream")
        # Keep the last two bytes unread in case the chunk ended in the
        # middle of "~>".
        data_out += data_buffered[:-2]
        stream.seek(-2, 1)

    ei_tok = read_non_whitespace(stream)
    ei_tok += stream.read(2)
    stream.seek(-3, 1)
    if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
        raise PdfReadError("EI stream not found")
    return bytes(data_out)

118 

119 

def extract_inline_RL(stream: StreamType) -> bytes:
    """
    Extract RL (RunLengthDecode) stream from inline image.
    The stream will be moved onto the EI.

    Chunks are read until the first ``0x80`` byte is seen; everything up to
    and including that byte is returned.

    Raises:
        PdfReadError: If the stream ends before a ``0x80`` byte is found, or
            if the data is not followed by an ``EI`` token.
    """
    # NOTE(review): the search stops at the first 0x80 byte wherever it
    # occurs; presumably 0x80 can also appear as ordinary run data - confirm
    # against the RunLengthDecode specification.
    pieces = []
    while True:
        chunk = stream.read(BUFFER_SIZE)
        if not chunk:
            raise PdfReadError("Unexpected end of stream")
        head, marker, _ = chunk.partition(b"\x80")
        if marker:
            pieces.append(head + marker)
            # Rewind to the byte right after the 0x80 marker.
            stream.seek(len(head) + 1 - len(chunk), 1)
            break
        pieces.append(chunk)

    ei_tok = read_non_whitespace(stream) + stream.read(2)
    stream.seek(-3, 1)
    after_ei = ei_tok[2:3]
    if not ei_tok.startswith(b"EI") or (after_ei != b"" and after_ei not in WHITESPACES):
        raise PdfReadError("EI stream not found")
    return b"".join(pieces)

144 

145 

def extract_inline_DCT(stream: StreamType) -> bytes:
    """
    Extract DCT (JPEG) stream from inline image.
    The stream will be moved onto the EI.

    Bytes preceding the first ``0xFF`` are skipped. From there on, the data
    is copied marker by marker until the end marker ``FF D9`` has been read.

    Args:
        stream: Seekable binary stream positioned at the start of the image data.

    Returns:
        The JPEG data from the first ``0xFF`` up to and including ``FF D9``.

    Raises:
        PdfReadError: If the stream ends prematurely, a segment declares a
            length smaller than its own two-byte length field, or the data
            is not followed by an ``EI`` token.
    """
    # Markers that are followed by a two-byte big-endian segment length.
    segment_markers = (
        b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf"
        b"\xda\xdb\xdc\xdd\xde\xdf"
        b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe"
    )

    def read(length: int) -> bytes:
        # If 0 bytes are returned, and *size* was not 0, this indicates end of file.
        # If the object is in non-blocking mode and no bytes are available, `None` is returned.
        _result = stream.read(length)
        if _result is None or len(_result) != length:
            raise PdfReadError("Unexpected end of stream")
        return _result

    # A bytearray keeps the byte-by-byte accumulation below linear in time.
    data_out = bytearray()
    # Read Blocks of data (ID/Size/data) up to ID=FF/D9
    # https://www.digicamsoft.com/itu/itu-t81-36.html
    started = False
    while True:
        c = read(1)
        if c != b"\xff":
            if started:
                data_out += c
            continue
        started = True
        data_out += c
        marker = read(1)
        if marker == b"\xff":
            # Fill byte: un-read it so that it is handled (and stored exactly
            # once) as the start of the next marker.
            stream.seek(-1, 1)
            continue
        data_out += marker
        if marker == b"\xd9":  # end
            break
        if marker in segment_markers:
            size_field = read(2)
            size = int.from_bytes(size_field, "big")
            if size < 2:
                # The length counts its own two bytes; anything smaller would
                # turn into a negative read that consumes the rest of the stream.
                raise PdfReadError("Invalid JPEG segment length")
            data_out += size_field
            data_out += read(size - 2)
        # Every other marker (including 0x00 stuffing) carries no length field.

    ei_tok = read_non_whitespace(stream)
    ei_tok += stream.read(2)
    stream.seek(-3, 1)
    if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
        raise PdfReadError("EI stream not found")
    return bytes(data_out)

194 

195 

def extract_inline_default(stream: StreamType) -> bytes:
    """
    Legacy method, used by default.

    Copies the stream chunk by chunk while looking for an ``E`` that starts
    the ``EI`` (End Image) operator. A candidate is accepted only if it is
    followed by ``I`` and whitespace, has a plausible neighbourhood, and is
    not followed by what looks like binary data. On success the stream is
    positioned on the ``E`` of ``EI`` and the data before it is returned.

    Raises:
        PdfReadError: If the stream ends before an accepted ``EI`` is found.
    """

    def confirms_end_of_image(preceding: bytes) -> bool:
        # Expects the stream to be positioned right after a candidate "E";
        # *preceding* is the byte in front of that "E" within the current
        # chunk (empty when the "E" is the first byte of the chunk).
        if stream.read(1) != b"I":  # I of "EI"
            return False
        follower = stream.read(1)  # possible space after "EI"
        if follower not in WHITESPACES:
            return False
        while follower in WHITESPACES:
            follower = stream.read(1)
        if preceding not in WHITESPACES and follower not in {b"Q", b"E"}:  # for Q or EMC
            return False
        # Inline image contains `EI ` sequence usually marking the end of it, but
        # being followed by binary data does not make sense for the actual end.
        return not is_followed_by_binary_data(stream)

    image_data = BytesIO()
    # Read the inline image, while checking for EI (End Image) operator.
    while True:
        chunk = stream.read(BUFFER_SIZE)
        if not chunk:
            raise PdfReadError("Unexpected end of stream")
        # We can not look straight for "EI" because the "I" may not have been
        # loaded into this chunk.
        e_index = chunk.find(b"E")
        if e_index == -1:
            image_data.write(chunk)
            continue

        # Write out everything including E (removed again if it is the one from EI).
        image_data.write(chunk[: e_index + 1])
        e_offset_in_output = image_data.tell() - 1
        # Seek back in the stream so that the byte after E is read next.
        stream.seek(e_index + 1 - len(chunk), 1)
        resume_position = stream.tell()

        if not confirms_end_of_image(chunk[e_index - 1 : e_index]):
            # Not the end: resume scanning right after this E.
            stream.seek(resume_position, 0)
            continue

        # Data contains [\s]EI[\s](Q|EMC): 4 chars are sufficient
        # remove E(I) wrongly inserted earlier
        stream.seek(resume_position - 1, 0)
        image_data.truncate(e_offset_in_output)
        break

    return image_data.getvalue()

246 

247 

def is_followed_by_binary_data(stream: IO[bytes], length: int = 10) -> bool:
    """
    Check if the next bytes of the stream look like binary image data or regular page content.

    This is just some heuristics due to the PDF specification being too imprecise about
    inline images containing the `EI` marker which would end an image. Starting with PDF 2.0,
    we finally get a mandatory length field, but with (proper) PDF 2.0 support being very limited
    everywhere, we should not expect to be able to remove such hacks in the near future - especially
    considering legacy documents as well.

    The actual implementation draws some inspiration from
    https://github.com/itext/itext-java/blob/9.1.0/kernel/src/main/java/com/itextpdf/kernel/pdf/canvas/parser/util/InlineImageParsingUtils.java

    Args:
        stream: Seekable binary stream; its position is restored before returning.
        length: Maximum number of bytes to peek at.

    Returns:
        True if the peeked bytes look like binary data, False if they look
        like regular content (or if nothing is left to read).
    """
    position = stream.tell()
    data = stream.read(length)
    stream.seek(position)
    if not data:
        return False
    operator_start = None
    operator_end = None

    for index, byte in enumerate(data):
        if byte < 32 and byte not in WHITESPACES_AS_BYTES:
            # This covers all characters not being displayable directly, although omitting whitespace
            # to allow for operator detection.
            return True
        is_whitespace = byte in WHITESPACES_AS_BYTES
        if operator_start is None and not is_whitespace:
            # Interpret all other non-whitespace characters as the start of an operation.
            operator_start = index
        if operator_start is not None and is_whitespace:
            # A whitespace stops an operation.
            # Assume that having an inline image with tons of whitespace is rather unlikely.
            operator_end = index
            break

    if operator_start is None:
        # Inline images should not have tons of whitespaces, which would lead to no operator start.
        return False
    if operator_end is None:
        # We probably are inside an operation. Use the number of bytes actually
        # read rather than the requested *length*: close to the end of the
        # stream fewer bytes are available, and a short final operator must
        # not be measured as if it filled the whole window.
        operator_end = len(data)
    operator_length = operator_end - operator_start
    operator = data[operator_start:operator_end]
    if operator.startswith(b"/") and operator_length > 1:
        # Name object.
        return False
    if operator.replace(b".", b"").isdigit():
        # Graphics operator, for example a move. A number (integer or float).
        return False
    if operator_length > 3:  # noqa: SIM103
        # Usually, the operators inside a content stream should not have more than three characters,
        # especially after an inline image.
        return True
    return False