Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfdevice.py: 51%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

148 statements  

1import logging 

2from collections.abc import Iterable, Sequence 

3from typing import ( 

4 TYPE_CHECKING, 

5 BinaryIO, 

6 Optional, 

7 cast, 

8) 

9 

10from pdfminer import utils 

11from pdfminer.pdfcolor import PDFColorSpace 

12from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined 

13from pdfminer.pdfpage import PDFPage 

14from pdfminer.pdftypes import PDFStream 

15from pdfminer.psparser import PSLiteral 

16from pdfminer.utils import Matrix, PathSegment, Point, Rect 

17 

18if TYPE_CHECKING: 

19 from pdfminer.pdfinterp import ( 

20 PDFGraphicState, 

21 PDFResourceManager, 

22 PDFStackT, 

23 PDFTextState, 

24 ) 

25 

26 

# Operand sequence handed to the devices' ``render_string`` methods.  In this
# module numeric items shift the current text position and ``bytes`` items are
# decoded into character ids by the active font.
PDFTextSeq = Iterable[int | float | bytes]

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

30 

31 

class PDFDevice:
    """Base output device fed by PDFPageInterpreter.

    Every rendering hook defined here does nothing and returns ``None``;
    subclasses override the hooks they care about.  The device can be used as
    a context manager, in which case ``close()`` is called on exit.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
        """Remember the resource manager and start without a transformation matrix."""
        self.ctm: Matrix | None = None
        self.rsrcmgr = rsrcmgr

    def __repr__(self) -> str:
        """Return a fixed, human-readable tag for this device."""
        return "<PDFDevice>"

    def __enter__(self) -> "PDFDevice":
        """Enter the ``with`` block; the device itself is the managed object."""
        return self

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        """Close the device on leaving the ``with`` block (exceptions propagate)."""
        self.close()

    def close(self) -> None:
        """Release resources held by the device (nothing to do in the base class)."""

    def set_ctm(self, ctm: Matrix) -> None:
        """Store ``ctm`` as the device's current transformation matrix."""
        self.ctm = ctm

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook: a tagged section opens (no-op here)."""

    def end_tag(self) -> None:
        """Hook: the most recently opened tagged section closes (no-op here)."""

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook: a stand-alone tag is encountered (no-op here)."""

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Hook: processing of ``page`` starts (no-op here)."""

    def end_page(self, page: PDFPage) -> None:
        """Hook: processing of ``page`` is finished (no-op here)."""

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Hook: a figure called ``name`` starts (no-op here)."""

    def end_figure(self, name: str) -> None:
        """Hook: the figure called ``name`` ends (no-op here)."""

    def paint_path(
        self,
        graphicstate: "PDFGraphicState",
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Hook: a path is painted with the given stroke/fill flags (no-op here)."""

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Hook: the image ``name`` backed by ``stream`` is rendered (no-op here)."""

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Hook: the text operand sequence ``seq`` is shown (no-op here)."""

96 

97 

class PDFTextDevice(PDFDevice):
    """Device that walks a text operand sequence and reports each character.

    ``render_string`` derives the per-string parameters from the text state,
    then advances along the line (horizontally or vertically, depending on the
    font) calling ``render_char`` once per decoded character id.  Subclasses
    are expected to override ``render_char``; the version here renders nothing.
    """

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Render ``seq`` and store the resulting position in ``textstate.linematrix``.

        Requires ``set_ctm`` to have been called and ``textstate.font`` to be
        set; both are checked with ``assert``.
        """
        assert self.ctm is not None
        matrix = utils.mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        fontsize = textstate.fontsize
        # The text state's scaling is multiplied by 0.01 before use
        # (presumably it is stored as a percentage -- confirm in PDFTextState).
        scaling = textstate.scaling * 0.01
        charspace = textstate.charspace * scaling
        wordspace = textstate.wordspace * scaling
        rise = textstate.rise
        assert font is not None
        if font.is_multibyte():
            # Word spacing is never applied for multibyte fonts.
            wordspace = 0
        # Factor applied to numeric items of ``seq`` before they are
        # subtracted from the current position.
        dxscale = 0.001 * fontsize * scaling
        if font.is_vertical():
            render = self.render_string_vertical
        else:
            render = self.render_string_horizontal
        textstate.linematrix = render(
            seq,
            matrix,
            textstate.linematrix,
            font,
            fontsize,
            scaling,
            charspace,
            wordspace,
            rise,
            dxscale,
            ncs,
            graphicstate,
        )

    def render_string_horizontal(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Render ``seq`` advancing the x coordinate; return the final ``(x, y)``."""
        return self._render_string_along_axis(
            seq,
            matrix,
            pos,
            font,
            fontsize,
            scaling,
            charspace,
            wordspace,
            rise,
            dxscale,
            ncs,
            graphicstate,
            vertical=False,
        )

    def render_string_vertical(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Render ``seq`` advancing the y coordinate; return the final ``(x, y)``."""
        return self._render_string_along_axis(
            seq,
            matrix,
            pos,
            font,
            fontsize,
            scaling,
            charspace,
            wordspace,
            rise,
            dxscale,
            ncs,
            graphicstate,
            vertical=True,
        )

    def _render_string_along_axis(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
        *,
        vertical: bool,
    ) -> Point:
        """Shared loop behind the horizontal and vertical renderers.

        Exactly one coordinate of ``pos`` moves: y when ``vertical`` is true,
        x otherwise.  For each item of ``seq``:

        * a number is multiplied by ``dxscale`` and subtracted from the
          moving coordinate;
        * ``bytes`` are decoded by ``font`` and every character id is passed
          to ``render_char``, whose return value is added to the moving
          coordinate;
        * anything else is reported with a warning and skipped.

        ``charspace`` is added before every character except the first one
        rendered when no numeric item has been seen yet; ``wordspace`` is
        added after character id 32 when it is non-zero.

        Returns the final ``(x, y)`` position.
        """
        (x, y) = pos
        along = y if vertical else x
        if vertical:
            invalid_item_msg = (
                "Cannot render vertical string because %r is not a valid "
                "int, float or bytes."
            )
        else:
            invalid_item_msg = (
                "Cannot render horizontal string because "
                "%r is not a valid int, float or bytes."
            )
        needcharspace = False
        for obj in seq:
            if isinstance(obj, (int, float)):
                along -= obj * dxscale
                needcharspace = True
            elif isinstance(obj, bytes):
                for cid in font.decode(obj):
                    if needcharspace:
                        along += charspace
                    origin = (x, along) if vertical else (along, y)
                    along += self.render_char(
                        utils.translate_matrix(matrix, origin),
                        font,
                        fontsize,
                        scaling,
                        rise,
                        cid,
                        ncs,
                        graphicstate,
                    )
                    if cid == 32 and wordspace:
                        along += wordspace
                    needcharspace = True
            else:
                logger.warning(invalid_item_msg, obj)
        return (x, along) if vertical else (along, y)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> float:
        """Render one character id and return how far the position advances.

        The callers add the return value to the moving coordinate.  This base
        implementation renders nothing and returns 0.
        """
        return 0

253 

254 

class TagExtractor(PDFDevice):
    """Device that writes pages, tags and their text as XML-like markup.

    Each page is wrapped in a ``<page ...>`` element, each opened tag becomes
    an element named after the tag, and shown text is written between them.
    Everything is encoded with ``codec`` and written to ``outfp``.
    """

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        outfp: BinaryIO,
        codec: str = "utf-8",
    ) -> None:
        """Create an extractor writing to the binary stream ``outfp``.

        ``codec`` names the encoding applied to every string before it is
        written.
        """
        super().__init__(rsrcmgr)
        self.outfp = outfp
        self.codec = codec
        # Number used for the ``id`` attribute of the next ``<page>`` element;
        # incremented by ``end_page``.
        self.pageno = 0
        # Tags opened by ``begin_tag`` that are still waiting for ``end_tag``.
        self._stack: list[PSLiteral] = []

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Write the text of ``seq``, passed through ``utils.enc``.

        Items that are neither ``str`` nor ``bytes`` are skipped, and
        character ids for which the font raises ``PDFUnicodeNotDefined`` are
        left out of the output.
        """
        font = textstate.font
        assert font is not None
        pieces: list[str] = []
        for obj in seq:
            if isinstance(obj, str):
                obj = utils.make_compat_bytes(obj)
            if not isinstance(obj, bytes):
                continue
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    # Best effort: a character without a Unicode mapping is
                    # dropped rather than aborting the extraction.
                    pass
        self._write(utils.enc("".join(pieces)))

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Write the opening ``<page>`` element for ``page``."""
        output = (
            f'<page id="{self.pageno}" bbox="{utils.bbox2str(page.mediabox)}" '
            f'rotate="{page.rotate}">'
        )
        self._write(output)

    def end_page(self, page: PDFPage) -> None:
        """Write the closing ``</page>`` line and advance the page counter."""
        self._write("</page>\n")
        self.pageno += 1

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening element for ``tag`` and push it on the tag stack.

        When ``props`` is a dict, its items are written as attributes in
        sorted order.  Attribute values are passed through ``utils.enc``, the
        same helper already applied to attribute names, tag names and text, so
        that a value cannot break out of its quoted attribute
        (NOTE(review): assumes ``utils.enc`` escapes quotes and angle
        brackets -- confirm against ``pdfminer.utils``).
        """
        attrs = ""
        if isinstance(props, dict):
            attrs = "".join(
                f' {utils.enc(k)}="{utils.enc(utils.make_compat_str(v))}"'
                for (k, v) in sorted(props.items())
            )
        out_s = f"<{utils.enc(cast(str, tag.name))}{attrs}>"
        self._write(out_s)
        self._stack.append(tag)

    def end_tag(self) -> None:
        """Write the closing element for the most recently opened tag.

        Asserts that a tag is open; the assertion message is the current page
        number.
        """
        assert self._stack, str(self.pageno)
        tag = self._stack.pop(-1)
        out_s = f"</{utils.enc(cast(str, tag.name))}>"
        self._write(out_s)

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write a stand-alone tag: opening element only, nothing left on the stack."""
        self.begin_tag(tag, props)
        # ``begin_tag`` pushed the tag; remove it again because no matching
        # ``end_tag`` call follows for a stand-alone tag.
        self._stack.pop(-1)

    def _write(self, s: str) -> None:
        """Encode ``s`` with the configured codec and write it to ``outfp``."""
        self.outfp.write(s.encode(self.codec))