Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfdevice.py: 64%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

147 statements  

1import logging 

2from typing import ( 

3 TYPE_CHECKING, 

4 BinaryIO, 

5 Iterable, 

6 List, 

7 Optional, 

8 Sequence, 

9 Union, 

10 cast, 

11) 

12 

13from pdfminer import utils 

14from pdfminer.pdfcolor import PDFColorSpace 

15from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined 

16from pdfminer.pdfpage import PDFPage 

17from pdfminer.pdftypes import PDFStream 

18from pdfminer.psparser import PSLiteral 

19from pdfminer.utils import Matrix, PathSegment, Point, Rect 

20 

21if TYPE_CHECKING: 

22 from pdfminer.pdfinterp import ( 

23 PDFGraphicState, 

24 PDFResourceManager, 

25 PDFStackT, 

26 PDFTextState, 

27 ) 

28 

29 

# One element of a text sequence handed to ``render_string``: the renderers in
# this module treat numbers as position adjustments and pass bytes to the
# font's ``decode`` method to obtain character ids.
_TextItem = Union[int, float, bytes]
PDFTextSeq = Iterable[_TextItem]

# Logger shared by everything in this module, named after the module itself.
logger = logging.getLogger(__name__)

33 

34 

class PDFDevice:
    """Base device that turns PDFPageInterpreter output into the needed output.

    Every rendering hook defined here does nothing and returns ``None``; a
    subclass overrides the hooks it wants to react to. An instance can be
    used as a context manager, in which case ``close()`` runs on exit.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
        # No transformation matrix is known until set_ctm() is called.
        self.ctm: Optional[Matrix] = None
        self.rsrcmgr = rsrcmgr

    def __repr__(self) -> str:
        return "<PDFDevice>"

    def __enter__(self) -> "PDFDevice":
        return self

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        # Nothing is returned, so an exception raised inside the ``with``
        # body is not suppressed.
        self.close()

    def close(self) -> None:
        """Close the device; the base class has nothing to release."""

    def set_ctm(self, ctm: Matrix) -> None:
        """Remember ``ctm`` as the device's current transformation matrix."""
        self.ctm = ctm

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook for the start of a tag; no-op in the base class."""

    def end_tag(self) -> None:
        """Hook for the end of a tag; no-op in the base class."""

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook for a stand-alone tag; no-op in the base class."""

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Hook for the start of a page; no-op in the base class."""

    def end_page(self, page: PDFPage) -> None:
        """Hook for the end of a page; no-op in the base class."""

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Hook for the start of a figure; no-op in the base class."""

    def end_figure(self, name: str) -> None:
        """Hook for the end of a figure; no-op in the base class."""

    def paint_path(
        self,
        graphicstate: "PDFGraphicState",
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Hook for painting a path; no-op in the base class."""

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Hook for rendering an image stream; no-op in the base class."""

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Hook for rendering a text sequence; no-op in the base class."""

99 

100 

class PDFTextDevice(PDFDevice):
    """Device that walks a text sequence and calls ``render_char`` per character id.

    ``render_string`` derives the spacing values from the text state and then
    delegates to the horizontal or vertical renderer depending on the font.
    """

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Render ``seq`` and store the resulting position in ``textstate.linematrix``.

        Requires ``self.ctm`` and ``textstate.font`` to be set (both asserted).
        """
        assert self.ctm is not None
        text_matrix = utils.mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        size = textstate.fontsize
        # textstate.scaling is multiplied by 0.01 before being applied to the
        # character and word spacing and to the adjustment scale below.
        hscale = textstate.scaling * 0.01
        char_gap = textstate.charspace * hscale
        word_gap = textstate.wordspace * hscale
        rise = textstate.rise
        assert font is not None
        if font.is_multibyte():
            # Word spacing is dropped for multibyte fonts.
            word_gap = 0
        adjust_scale = 0.001 * size * hscale
        if font.is_vertical():
            render_line = self.render_string_vertical
        else:
            render_line = self.render_string_horizontal
        textstate.linematrix = render_line(
            seq,
            text_matrix,
            textstate.linematrix,
            font,
            size,
            hscale,
            char_gap,
            word_gap,
            rise,
            adjust_scale,
            ncs,
            graphicstate,
        )

    def render_string_horizontal(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Render ``seq`` advancing along x and return the final ``(x, y)``.

        Numbers in ``seq`` move x back by ``number * dxscale``; bytes are
        decoded by ``font`` and each character id is passed to
        ``render_char``, whose return value is added to x. Any other item is
        skipped with a warning.
        """
        cur_x, cur_y = pos
        # True once anything has been processed, so that ``charspace`` is
        # added before every character except the very first item.
        pending_charspace = False
        for obj in seq:
            if isinstance(obj, (int, float)):
                cur_x -= obj * dxscale
                pending_charspace = True
                continue
            if not isinstance(obj, bytes):
                logger.warning(
                    f"Cannot render horizontal string because {obj!r} is not a valid int, float or bytes."
                )
                continue
            for cid in font.decode(obj):
                if pending_charspace:
                    cur_x += charspace
                glyph_matrix = utils.translate_matrix(matrix, (cur_x, cur_y))
                cur_x += self.render_char(
                    glyph_matrix,
                    font,
                    fontsize,
                    scaling,
                    rise,
                    cid,
                    ncs,
                    graphicstate,
                )
                # Character id 32 additionally receives the word spacing.
                if wordspace and cid == 32:
                    cur_x += wordspace
                pending_charspace = True
        return (cur_x, cur_y)

    def render_string_vertical(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Render ``seq`` advancing along y and return the final ``(x, y)``.

        Same procedure as ``render_string_horizontal`` except that every
        adjustment and advance is applied to y instead of x.
        """
        cur_x, cur_y = pos
        # True once anything has been processed, so that ``charspace`` is
        # added before every character except the very first item.
        pending_charspace = False
        for obj in seq:
            if isinstance(obj, (int, float)):
                cur_y -= obj * dxscale
                pending_charspace = True
                continue
            if not isinstance(obj, bytes):
                logger.warning(
                    f"Cannot render vertical string because {obj!r} is not a valid int, float or bytes."
                )
                continue
            for cid in font.decode(obj):
                if pending_charspace:
                    cur_y += charspace
                glyph_matrix = utils.translate_matrix(matrix, (cur_x, cur_y))
                cur_y += self.render_char(
                    glyph_matrix,
                    font,
                    fontsize,
                    scaling,
                    rise,
                    cid,
                    ncs,
                    graphicstate,
                )
                # Character id 32 additionally receives the word spacing.
                if wordspace and cid == 32:
                    cur_y += wordspace
                pending_charspace = True
        return (cur_x, cur_y)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> float:
        """Render one character id and return the amount to advance by.

        The string renderers add the returned value to the current position.
        This base implementation renders nothing and returns 0.
        """
        return 0

252 

253 

class TagExtractor(PDFDevice):
    """Device that writes pages, tags and decoded text to ``outfp`` as markup.

    Output is produced as ``str`` and encoded with ``codec`` right before it
    is written, so ``outfp`` must accept bytes.
    """

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        outfp: BinaryIO,
        codec: str = "utf-8",
    ) -> None:
        super().__init__(rsrcmgr)
        self.outfp = outfp
        self.codec = codec
        # Number used for the ``id`` attribute of the next <page> element.
        self.pageno = 0
        # Tags opened by begin_tag() and not yet closed by end_tag().
        self._stack: List[PSLiteral] = []

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Write the escaped text of ``seq``, decoded with the current font.

        Items that are neither ``str`` nor ``bytes`` are ignored, as are
        character ids for which the font raises ``PDFUnicodeNotDefined``.
        """
        font = textstate.font
        assert font is not None
        # Collect the pieces and join once instead of growing a string with
        # ``+=`` for every character.
        pieces: List[str] = []
        for obj in seq:
            if isinstance(obj, str):
                obj = utils.make_compat_bytes(obj)
            if not isinstance(obj, bytes):
                continue
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    # No Unicode mapping for this character id: leave it out.
                    continue
        self._write(utils.enc("".join(pieces)))

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Write the opening ``<page>`` element for ``page``."""
        output = '<page id="%s" bbox="%s" rotate="%d">' % (
            self.pageno,
            utils.bbox2str(page.mediabox),
            page.rotate,
        )
        self._write(output)

    def end_page(self, page: PDFPage) -> None:
        """Write the closing ``</page>`` and advance the page counter."""
        self._write("</page>\n")
        self.pageno += 1

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening element for ``tag`` and push it on the tag stack.

        When ``props`` is a dict its items become attributes, sorted by key.
        Both names and values are escaped so that a value containing a
        quote, ``<`` or ``&`` cannot break the surrounding markup.
        """
        attrs = ""
        if isinstance(props, dict):
            attrs = "".join(
                f' {utils.enc(k)}="{utils.enc(utils.make_compat_str(v))}"'
                for (k, v) in sorted(props.items())
            )
        out_s = f"<{utils.enc(cast(str, tag.name))}{attrs}>"
        self._write(out_s)
        self._stack.append(tag)

    def end_tag(self) -> None:
        """Pop the most recently opened tag and write its closing element.

        Asserts that a tag is open; the assertion message is the current
        page number.
        """
        assert self._stack, str(self.pageno)
        tag = self._stack.pop()
        out_s = "</%s>" % utils.enc(cast(str, tag.name))
        self._write(out_s)

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening element for ``tag`` without leaving it on the stack.

        No closing element is written for it.
        """
        self.begin_tag(tag, props)
        self._stack.pop()

    def _write(self, s: str) -> None:
        """Encode ``s`` with ``self.codec`` and write it to ``self.outfp``."""
        self.outfp.write(s.encode(self.codec))