Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfdevice.py: 63%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

141 statements  

1from typing import ( 

2 TYPE_CHECKING, 

3 BinaryIO, 

4 Iterable, 

5 List, 

6 Optional, 

7 Sequence, 

8 Union, 

9 cast, 

10) 

11 

12from pdfminer import utils 

13from pdfminer.pdfcolor import PDFColorSpace 

14from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined 

15from pdfminer.pdfpage import PDFPage 

16from pdfminer.pdftypes import PDFStream 

17from pdfminer.psparser import PSLiteral 

18from pdfminer.utils import Matrix, PathSegment, Point, Rect 

19 

20if TYPE_CHECKING: 

21 from pdfminer.pdfinterp import ( 

22 PDFGraphicState, 

23 PDFResourceManager, 

24 PDFStackT, 

25 PDFTextState, 

26 ) 

27 

28 

# Items of a text sequence handed to ``render_string``: numbers are position
# adjustments, byte strings are passed to the font's ``decode``.
PDFTextSeq = Iterable[Union[int, float, bytes]]

30 

31 

class PDFDevice:
    """Translate the output of PDFPageInterpreter to the output that is needed.

    Every rendering hook defined here is a no-op; subclasses override the
    hooks they need. The device can be used as a context manager, in which
    case ``close()`` is called on exit.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
        """Store the resource manager; the CTM starts out unset."""
        self.rsrcmgr = rsrcmgr
        # Remains None until set_ctm() is called.
        self.ctm: Optional[Matrix] = None

    def __repr__(self) -> str:
        return "<PDFDevice>"

    def __enter__(self) -> "PDFDevice":
        return self

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        # Always close, whether or not the with-body raised.
        self.close()

    def close(self) -> None:
        """Release the device. No-op in the base class."""

    def set_ctm(self, ctm: Matrix) -> None:
        """Remember ``ctm`` as the current transformation matrix."""
        self.ctm = ctm

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook for the start of a tagged section. No-op in the base class."""

    def end_tag(self) -> None:
        """Hook for the end of a tagged section. No-op in the base class."""

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook for a stand-alone tag. No-op in the base class."""

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Hook for the start of a page. No-op in the base class."""

    def end_page(self, page: PDFPage) -> None:
        """Hook for the end of a page. No-op in the base class."""

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Hook for the start of a figure. No-op in the base class."""

    def end_figure(self, name: str) -> None:
        """Hook for the end of a figure. No-op in the base class."""

    def paint_path(
        self,
        graphicstate: "PDFGraphicState",
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Hook for painting a path. No-op in the base class."""

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Hook for rendering an image stream. No-op in the base class."""

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Hook for rendering a text sequence. No-op in the base class."""

96 

97 

class PDFTextDevice(PDFDevice):
    """Device that walks a text sequence and passes each cid to ``render_char``.

    ``render_char`` returns 0 here; subclasses are expected to override it and
    return the amount by which the text position should advance.
    """

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Render ``seq`` and store the resulting position on the text state.

        The text matrix is multiplied with the CTM, spacing values are scaled
        by ``textstate.scaling * 0.01``, and the position returned by the
        horizontal or vertical renderer is written back to
        ``textstate.linematrix``.

        Requires ``set_ctm()`` to have been called and ``textstate.font`` to
        be set (both are asserted).
        """
        assert self.ctm is not None
        matrix = utils.mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        fontsize = textstate.fontsize
        scaling = textstate.scaling * 0.01
        charspace = textstate.charspace * scaling
        wordspace = textstate.wordspace * scaling
        rise = textstate.rise
        assert font is not None
        if font.is_multibyte():
            # Word spacing is not applied for multibyte fonts.
            wordspace = 0
        dxscale = 0.001 * fontsize * scaling
        # Both renderers take the same arguments; choose one, call it once.
        if font.is_vertical():
            render = self.render_string_vertical
        else:
            render = self.render_string_horizontal
        textstate.linematrix = render(
            seq,
            matrix,
            textstate.linematrix,
            font,
            fontsize,
            scaling,
            charspace,
            wordspace,
            rise,
            dxscale,
            ncs,
            graphicstate,
        )

    def render_string_horizontal(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Render ``seq`` advancing along x and return the final ``(x, y)``.

        Numeric items move x back by ``item * dxscale``. Any other item is
        decoded with ``font.decode`` and each cid is passed to
        ``render_char``, whose return value is added to x.
        """
        (x, y) = pos
        # False only until the first item has been handled, so character
        # spacing is never inserted in front of the very first glyph.
        needcharspace = False
        for obj in seq:
            if isinstance(obj, (int, float)):
                x -= obj * dxscale
                needcharspace = True
            else:
                for cid in font.decode(obj):
                    if needcharspace:
                        x += charspace
                    x += self.render_char(
                        utils.translate_matrix(matrix, (x, y)),
                        font,
                        fontsize,
                        scaling,
                        rise,
                        cid,
                        ncs,
                        graphicstate,
                    )
                    # cid 32 is presumably the space character code.
                    if cid == 32 and wordspace:
                        x += wordspace
                    needcharspace = True
        return (x, y)

    def render_string_vertical(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Render ``seq`` advancing along y and return the final ``(x, y)``.

        Same procedure as ``render_string_horizontal`` with every adjustment
        applied to y instead of x.
        """
        (x, y) = pos
        # False only until the first item has been handled, so character
        # spacing is never inserted in front of the very first glyph.
        needcharspace = False
        for obj in seq:
            if isinstance(obj, (int, float)):
                y -= obj * dxscale
                needcharspace = True
            else:
                for cid in font.decode(obj):
                    if needcharspace:
                        y += charspace
                    y += self.render_char(
                        utils.translate_matrix(matrix, (x, y)),
                        font,
                        fontsize,
                        scaling,
                        rise,
                        cid,
                        ncs,
                        graphicstate,
                    )
                    # cid 32 is presumably the space character code.
                    if cid == 32 and wordspace:
                        y += wordspace
                    needcharspace = True
        return (x, y)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> float:
        """Render one cid and return the advance to add to the text position.

        The base implementation draws nothing and returns 0.
        """
        return 0

241 

242 

class TagExtractor(PDFDevice):
    """Device that writes page/tag markup and decoded text to a binary stream.

    Everything is written through ``_write``, which encodes with ``codec``.
    """

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        outfp: BinaryIO,
        codec: str = "utf-8",
    ) -> None:
        """Set up the device.

        :param rsrcmgr: resource manager forwarded to ``PDFDevice``.
        :param outfp: binary stream that receives the encoded output.
        :param codec: text encoding used for everything written to ``outfp``.
        """
        super().__init__(rsrcmgr)
        self.outfp = outfp
        self.codec = codec
        self.pageno = 0
        # Tags opened by begin_tag() and not yet closed by end_tag().
        self._stack: List[PSLiteral] = []

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Decode the byte strings in ``seq`` and write the escaped text.

        Items that are not ``bytes`` (after converting ``str`` items with
        ``utils.make_compat_bytes``) are ignored, as are cids for which the
        font raises ``PDFUnicodeNotDefined``.
        """
        font = textstate.font
        assert font is not None
        # Collect pieces and join once instead of growing a string with +=.
        pieces: List[str] = []
        for obj in seq:
            if isinstance(obj, str):
                obj = utils.make_compat_bytes(obj)
            if not isinstance(obj, bytes):
                continue
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    # No Unicode mapping for this cid: leave it out.
                    pass
        self._write(utils.enc("".join(pieces)))

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Write the opening ``<page>`` element for ``page``."""
        output = '<page id="%s" bbox="%s" rotate="%d">' % (
            self.pageno,
            utils.bbox2str(page.mediabox),
            page.rotate,
        )
        self._write(output)

    def end_page(self, page: PDFPage) -> None:
        """Write the closing ``</page>`` line and advance the page counter."""
        self._write("</page>\n")
        self.pageno += 1

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening tag and push ``tag`` on the stack.

        When ``props`` is a dict its items become attributes, sorted by key.
        """
        s = ""
        if isinstance(props, dict):
            # The value is escaped with utils.enc just like the key and the
            # tag name, so markup characters in a value cannot break the
            # attribute.
            s = "".join(
                f' {utils.enc(k)}="{utils.enc(utils.make_compat_str(v))}"'
                for (k, v) in sorted(props.items())
            )
        out_s = f"<{utils.enc(cast(str, tag.name))}{s}>"
        self._write(out_s)
        self._stack.append(tag)

    def end_tag(self) -> None:
        """Pop the most recently opened tag and write its closing tag.

        Asserts that a tag is open; the assertion message is the page number.
        """
        assert self._stack, str(self.pageno)
        tag = self._stack.pop(-1)
        out_s = "</%s>" % utils.enc(cast(str, tag.name))
        self._write(out_s)

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening tag for a stand-alone tag.

        The tag is removed from the stack straight away, so no closing tag is
        ever written for it.
        """
        self.begin_tag(tag, props)
        self._stack.pop(-1)

    def _write(self, s: str) -> None:
        """Encode ``s`` with the configured codec and write it to ``outfp``."""
        self.outfp.write(s.encode(self.codec))