Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfdevice.py: 54%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

148 statements  

1import logging 

2from collections.abc import Iterable, Sequence 

3from typing import ( 

4 TYPE_CHECKING, 

5 BinaryIO, 

6 Optional, 

7 cast, 

8) 

9 

10from pdfminer import utils 

11from pdfminer.pdfcolor import PDFColorSpace 

12from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined 

13from pdfminer.pdfpage import PDFPage 

14from pdfminer.pdftypes import PDFStream 

15from pdfminer.psparser import PSLiteral 

16from pdfminer.utils import Matrix, PathSegment, Point, Rect 

17 

18if TYPE_CHECKING: 

19 from pdfminer.pdfinterp import ( 

20 PDFGraphicState, 

21 PDFResourceManager, 

22 PDFStackT, 

23 PDFTextState, 

24 ) 

25 

26 

# Items of a text sequence as consumed by the devices in this module:
# numeric items shift the current position, bytes items are decoded by the
# active font into character ids.
PDFTextSeq = Iterable[int | float | bytes]

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

30 

31 

class PDFDevice:
    """Translate the output of PDFPageInterpreter to the output that is needed

    Apart from ``set_ctm`` and the context-manager protocol, every method of
    this base class is a hook that does nothing; subclasses override the
    hooks they care about.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
        """Remember the resource manager; no CTM is known yet."""
        # Stays None until set_ctm() stores a matrix.
        self.ctm: Matrix | None = None
        self.rsrcmgr = rsrcmgr

    def __repr__(self) -> str:
        """Return a fixed, state-independent representation."""
        return "<PDFDevice>"

    def __enter__(self) -> "PDFDevice":
        """Enter a ``with`` block, yielding the device itself."""
        return self

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        """Close the device when the ``with`` block ends.

        Returns None, so an exception raised inside the block propagates.
        """
        self.close()

    def close(self) -> None:
        """Release the device; the base class has nothing to release."""

    def set_ctm(self, ctm: Matrix) -> None:
        """Store ``ctm`` as the device's current transformation matrix."""
        self.ctm = ctm

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook for the start of a tag; does nothing in the base class."""

    def end_tag(self) -> None:
        """Hook for the end of a tag; does nothing in the base class."""

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook for a standalone tag; does nothing in the base class."""

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Hook for the start of a page; does nothing in the base class."""

    def end_page(self, page: PDFPage) -> None:
        """Hook for the end of a page; does nothing in the base class."""

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Hook for the start of a figure; does nothing in the base class."""

    def end_figure(self, name: str) -> None:
        """Hook for the end of a figure; does nothing in the base class."""

    def paint_path(
        self,
        graphicstate: "PDFGraphicState",
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Hook for painting a path; does nothing in the base class."""

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Hook for rendering an image; does nothing in the base class."""

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Hook for rendering a text sequence; does nothing in the base class."""

96 

97 

class PDFTextDevice(PDFDevice):
    """Device that walks a text sequence one character id at a time.

    ``render_string`` derives the spacing parameters from the text state and
    hands the sequence to the horizontal or vertical walker. Each walker calls
    ``render_char`` once per character id and advances the position by the
    value it returns. ``render_char`` returns 0 here; subclasses are
    presumably expected to override it.
    """

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Render ``seq`` and store the resulting position on the text state.

        Requires ``set_ctm`` to have been called and ``textstate.font`` to be
        set; both are checked with assertions. On return,
        ``textstate.linematrix`` holds the position reported by the walker.
        """
        assert self.ctm is not None
        device_matrix = utils.mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        fontsize = textstate.fontsize
        # textstate.scaling is multiplied by 0.01 before use.
        scaling = textstate.scaling * 0.01
        charspace = textstate.charspace * scaling
        wordspace = textstate.wordspace * scaling
        rise = textstate.rise
        assert font is not None
        # Word spacing is switched off entirely for multibyte fonts.
        if font.is_multibyte():
            wordspace = 0
        # Factor applied to the numeric items of seq by the walkers.
        dxscale = 0.001 * fontsize * scaling
        walker = (
            self.render_string_vertical
            if font.is_vertical()
            else self.render_string_horizontal
        )
        textstate.linematrix = walker(
            seq,
            device_matrix,
            textstate.linematrix,
            font,
            fontsize,
            scaling,
            charspace,
            wordspace,
            rise,
            dxscale,
            ncs,
            graphicstate,
        )

    def render_string_horizontal(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Walk ``seq`` along the x axis and return the final ``(x, y)``.

        Numeric items move x back by ``item * dxscale``. Bytes items are
        decoded by ``font`` and each character id is passed to
        ``render_char``, whose return value is added to x. Any other item is
        logged as a warning and skipped.
        """
        cur_x, cur_y = pos
        # Becomes True after the first item; from then on charspace is added
        # before every character.
        pending_charspace = False
        for obj in seq:
            if isinstance(obj, bytes):
                for cid in font.decode(obj):
                    if pending_charspace:
                        cur_x += charspace
                    char_matrix = utils.translate_matrix(matrix, (cur_x, cur_y))
                    cur_x += self.render_char(
                        char_matrix,
                        font,
                        fontsize,
                        scaling,
                        rise,
                        cid,
                        ncs,
                        graphicstate,
                    )
                    # Character id 32 additionally receives the word spacing.
                    if wordspace and cid == 32:
                        cur_x += wordspace
                    pending_charspace = True
                continue
            if isinstance(obj, (int, float)):
                cur_x -= obj * dxscale
                pending_charspace = True
                continue
            logger.warning(
                f"Cannot render horizontal string because "
                f"{obj!r} is not a valid int, float or bytes."
            )
        return (cur_x, cur_y)

    def render_string_vertical(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Walk ``seq`` along the y axis and return the final ``(x, y)``.

        Mirrors ``render_string_horizontal`` with every position update
        applied to y instead of x.
        """
        cur_x, cur_y = pos
        # Becomes True after the first item; from then on charspace is added
        # before every character.
        pending_charspace = False
        for obj in seq:
            if isinstance(obj, bytes):
                for cid in font.decode(obj):
                    if pending_charspace:
                        cur_y += charspace
                    char_matrix = utils.translate_matrix(matrix, (cur_x, cur_y))
                    cur_y += self.render_char(
                        char_matrix,
                        font,
                        fontsize,
                        scaling,
                        rise,
                        cid,
                        ncs,
                        graphicstate,
                    )
                    # Character id 32 additionally receives the word spacing.
                    if wordspace and cid == 32:
                        cur_y += wordspace
                    pending_charspace = True
                continue
            if isinstance(obj, (int, float)):
                cur_y -= obj * dxscale
                pending_charspace = True
                continue
            logger.warning(
                f"Cannot render vertical string because {obj!r} is not a valid "
                f"int, float or bytes."
            )
        return (cur_x, cur_y)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> float:
        """Render one character id and return how far to advance.

        The base implementation renders nothing and reports an advance of 0.
        """
        return 0

251 

252 

class TagExtractor(PDFDevice):
    """Device that writes page, tag and text events to ``outfp`` as markup.

    Every string is encoded with ``codec`` before being written. Open tags
    are tracked on an internal stack so that ``end_tag`` can emit the
    matching closing tag.
    """

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        outfp: BinaryIO,
        codec: str = "utf-8",
    ) -> None:
        """Store the output stream and codec; start at page 0, no open tags."""
        super().__init__(rsrcmgr)
        self.outfp = outfp
        self.codec = codec
        self.pageno = 0
        self._stack: list[PSLiteral] = []

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Decode the text items of ``seq`` and write them, escaped.

        ``str`` items are first converted with ``utils.make_compat_bytes``;
        items that are not bytes afterwards (e.g. numbers) are ignored.
        Character ids the font cannot map to text are dropped.
        """
        font = textstate.font
        assert font is not None
        # Collect the pieces and join once instead of growing a string with +=.
        pieces: list[str] = []
        for obj in seq:
            if isinstance(obj, str):
                obj = utils.make_compat_bytes(obj)
            if not isinstance(obj, bytes):
                continue
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    # Deliberate best effort: skip ids without a text mapping.
                    pass
        self._write(utils.enc("".join(pieces)))

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Write the opening ``<page>`` tag with id, bbox and rotation."""
        output = (
            f'<page id="{self.pageno}" bbox="{utils.bbox2str(page.mediabox)}" '
            f'rotate="{page.rotate}">'
        )
        self._write(output)

    def end_page(self, page: PDFPage) -> None:
        """Write the closing ``</page>`` tag and advance the page counter."""
        self._write("</page>\n")
        self.pageno += 1

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening tag for ``tag`` and push it on the tag stack.

        When ``props`` is a dict, its items are written as attributes in
        sorted order. Attribute values come from the document being
        processed, so they are passed through ``utils.enc`` just like the
        keys, tag names and text; otherwise a value containing ``"``, ``<``
        or ``&`` would corrupt the markup.
        NOTE(review): assumes ``utils.enc`` escapes double quotes - confirm.
        """
        attrs = ""
        if isinstance(props, dict):
            attrs = "".join(
                f' {utils.enc(k)}="{utils.enc(utils.make_compat_str(v))}"'
                for (k, v) in sorted(props.items())
            )
        out_s = f"<{utils.enc(cast(str, tag.name))}{attrs}>"
        self._write(out_s)
        self._stack.append(tag)

    def end_tag(self) -> None:
        """Pop the most recently opened tag and write its closing tag.

        Asserts that a tag is open; the assertion message is the current
        page number.
        """
        assert self._stack, str(self.pageno)
        tag = self._stack.pop()
        out_s = f"</{utils.enc(cast(str, tag.name))}>"
        self._write(out_s)

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening tag for ``tag`` without leaving it on the stack.

        No closing tag is written for it.
        """
        self.begin_tag(tag, props)
        # begin_tag pushed the tag; remove it again since nothing will close it.
        self._stack.pop()

    def _write(self, s: str) -> None:
        """Encode ``s`` with the configured codec and write it to ``outfp``."""
        self.outfp.write(s.encode(self.codec))