Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfdevice.py: 51%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2from collections.abc import Iterable, Sequence
3from typing import (
4 TYPE_CHECKING,
5 BinaryIO,
6 Optional,
7 cast,
8)
10from pdfminer import utils
11from pdfminer.pdfcolor import PDFColorSpace
12from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined
13from pdfminer.pdfpage import PDFPage
14from pdfminer.pdftypes import PDFStream
15from pdfminer.psparser import PSLiteral
16from pdfminer.utils import Matrix, PathSegment, Point, Rect
18if TYPE_CHECKING:
19 from pdfminer.pdfinterp import (
20 PDFGraphicState,
21 PDFResourceManager,
22 PDFStackT,
23 PDFTextState,
24 )
# Type of the ``seq`` argument accepted by the ``render_string*`` methods in
# this module: an iterable whose items are numbers (int/float) or byte strings.
PDFTextSeq = Iterable[int | float | bytes]

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class PDFDevice:
    """Base device that receives the output of PDFPageInterpreter.

    Apart from storing the resource manager and the matrix handed to
    ``set_ctm``, every hook defined here does nothing and returns ``None``.
    The class can be used as a context manager; leaving the ``with`` block
    calls ``close()``.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
        """Remember the resource manager; no matrix is set yet."""
        self.rsrcmgr = rsrcmgr
        # Stays None until set_ctm() is called.
        self.ctm: Matrix | None = None

    def __repr__(self) -> str:
        """Return the fixed string ``"<PDFDevice>"``."""
        return "<PDFDevice>"

    def __enter__(self) -> "PDFDevice":
        """Enter the context manager and hand back this device."""
        return self

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        """Leave the context manager by calling ``close()``.

        Returns ``None``, so an exception raised inside the ``with`` block
        is not suppressed.
        """
        self.close()

    def close(self) -> None:
        """Release the device. No-op in this base class."""

    def set_ctm(self, ctm: Matrix) -> None:
        """Store ``ctm`` on the device as ``self.ctm``."""
        self.ctm = ctm

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook for the start of a tag. No-op in this base class."""

    def end_tag(self) -> None:
        """Hook for the end of a tag. No-op in this base class."""

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook for a stand-alone tag. No-op in this base class."""

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Hook for the start of a page. No-op in this base class."""

    def end_page(self, page: PDFPage) -> None:
        """Hook for the end of a page. No-op in this base class."""

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Hook for the start of a figure. No-op in this base class."""

    def end_figure(self, name: str) -> None:
        """Hook for the end of a figure. No-op in this base class."""

    def paint_path(
        self,
        graphicstate: "PDFGraphicState",
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Hook for painting a path. No-op in this base class."""

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Hook for rendering an image. No-op in this base class."""

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Hook for rendering a text sequence. No-op in this base class."""
class PDFTextDevice(PDFDevice):
    """Device that steps through a text sequence one character id at a time.

    ``render_string`` derives the per-string parameters from the text state
    and hands the sequence to the horizontal or vertical renderer, which in
    turn calls ``render_char`` for every character id. ``render_char`` returns
    ``0`` here; subclasses override it to do the actual rendering and report
    how far the position should advance.
    """

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Render ``seq`` and store the resulting position in the text state.

        Requires ``self.ctm`` and ``textstate.font`` to be set (both are
        asserted). The position returned by the renderer is written back to
        ``textstate.linematrix``.
        """
        assert self.ctm is not None
        text_matrix = utils.mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        fontsize = textstate.fontsize
        # The stored scaling is multiplied by 0.01 before use
        # (presumably it is kept as a percentage -- confirm in PDFTextState).
        hscale = textstate.scaling * 0.01
        char_gap = textstate.charspace * hscale
        word_gap = textstate.wordspace * hscale
        rise = textstate.rise
        assert font is not None
        # Word spacing is switched off entirely for multibyte fonts.
        if font.is_multibyte():
            word_gap = 0
        dxscale = 0.001 * fontsize * hscale

        # Both renderers take the same positional arguments, so pick one and
        # call it once.
        if font.is_vertical():
            render_run = self.render_string_vertical
        else:
            render_run = self.render_string_horizontal
        textstate.linematrix = render_run(
            seq,
            text_matrix,
            textstate.linematrix,
            font,
            fontsize,
            hscale,
            char_gap,
            word_gap,
            rise,
            dxscale,
            ncs,
            graphicstate,
        )

    def render_string_horizontal(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Render ``seq`` while advancing along x; return the final ``(x, y)``.

        Numbers in ``seq`` move x back by ``number * dxscale``. Byte strings
        are decoded by the font into character ids, each passed to
        ``render_char`` whose return value is added to x. Items of any other
        type are logged with a warning and skipped.
        """
        x, y = pos
        # False only until the first number or character has been processed;
        # from then on every character is preceded by ``charspace``.
        pending_charspace = False
        for item in seq:
            if isinstance(item, (int, float)):
                x -= item * dxscale
                pending_charspace = True
                continue
            if not isinstance(item, bytes):
                logger.warning(
                    "Cannot render horizontal string because "
                    "%r is not a valid int, float or bytes.",
                    item,
                )
                continue
            for cid in font.decode(item):
                if pending_charspace:
                    x += charspace
                glyph_matrix = utils.translate_matrix(matrix, (x, y))
                x += self.render_char(
                    glyph_matrix,
                    font,
                    fontsize,
                    scaling,
                    rise,
                    cid,
                    ncs,
                    graphicstate,
                )
                # Character id 32 additionally gets the word spacing.
                if cid == 32 and wordspace:
                    x += wordspace
                pending_charspace = True
        return (x, y)

    def render_string_vertical(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Render ``seq`` while advancing along y; return the final ``(x, y)``.

        Same procedure as ``render_string_horizontal`` except that every
        adjustment is applied to y instead of x.
        """
        x, y = pos
        # False only until the first number or character has been processed;
        # from then on every character is preceded by ``charspace``.
        pending_charspace = False
        for item in seq:
            if isinstance(item, (int, float)):
                y -= item * dxscale
                pending_charspace = True
                continue
            if not isinstance(item, bytes):
                logger.warning(
                    "Cannot render vertical string because %r is not a valid "
                    "int, float or bytes.",
                    item,
                )
                continue
            for cid in font.decode(item):
                if pending_charspace:
                    y += charspace
                glyph_matrix = utils.translate_matrix(matrix, (x, y))
                y += self.render_char(
                    glyph_matrix,
                    font,
                    fontsize,
                    scaling,
                    rise,
                    cid,
                    ncs,
                    graphicstate,
                )
                # Character id 32 additionally gets the word spacing.
                if cid == 32 and wordspace:
                    y += wordspace
                pending_charspace = True
        return (x, y)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> float:
        """Render one character id and return how far to advance.

        This base implementation renders nothing and returns ``0``.
        """
        return 0
class TagExtractor(PDFDevice):
    """Device that writes pages, tags and text as markup to a binary stream.

    Each page is wrapped in a ``<page ...>`` element, every tag passed to
    ``begin_tag``/``end_tag`` becomes an opening/closing element, and text
    handed to ``render_string`` is written in between. Everything is encoded
    with ``codec`` before being written to ``outfp``.
    """

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        outfp: BinaryIO,
        codec: str = "utf-8",
    ) -> None:
        """Set up the extractor.

        :param rsrcmgr: resource manager, passed on to ``PDFDevice``.
        :param outfp: binary stream that receives the encoded output.
        :param codec: text encoding used for everything written to ``outfp``.
        """
        super().__init__(rsrcmgr)
        self.outfp = outfp
        self.codec = codec
        # Counter used for the ``id`` attribute of ``<page>``; bumped in end_page().
        self.pageno = 0
        # Tags opened by begin_tag() and not yet closed by end_tag().
        self._stack: list[PSLiteral] = []

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Write the text contained in ``seq``, escaped via ``utils.enc``.

        ``str`` items are first converted with ``utils.make_compat_bytes``;
        items that are not bytes after that (e.g. numbers) are ignored.
        Character ids for which the font raises ``PDFUnicodeNotDefined`` are
        skipped. ``ncs`` and ``graphicstate`` are not used.
        """
        font = textstate.font
        assert font is not None
        # Collect the pieces and join once instead of growing a string with +=.
        pieces: list[str] = []
        for obj in seq:
            if isinstance(obj, str):
                obj = utils.make_compat_bytes(obj)
            if not isinstance(obj, bytes):
                continue
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    # No text mapping for this character id: leave it out.
                    pass
        self._write(utils.enc("".join(pieces)))

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Write the opening ``<page>`` element. ``ctm`` is not used."""
        output = (
            f'<page id="{self.pageno}" bbox="{utils.bbox2str(page.mediabox)}" '
            f'rotate="{page.rotate}">'
        )
        self._write(output)

    def end_page(self, page: PDFPage) -> None:
        """Write the closing ``</page>`` element and advance the page counter."""
        self._write("</page>\n")
        self.pageno += 1

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening element for ``tag`` and push it on the tag stack.

        When ``props`` is a dict, its items are written as attributes in
        sorted order. Attribute values are passed through ``utils.enc`` --
        the same escaping already applied to keys, tag names and text -- so
        that a value containing markup characters cannot break out of the
        surrounding double quotes.
        """
        attrs = ""
        if isinstance(props, dict):
            attrs = "".join(
                f' {utils.enc(k)}="{utils.enc(utils.make_compat_str(v))}"'
                for (k, v) in sorted(props.items())
            )
        out_s = f"<{utils.enc(cast(str, tag.name))}{attrs}>"
        self._write(out_s)
        self._stack.append(tag)

    def end_tag(self) -> None:
        """Write the closing element for the most recently opened tag.

        :raises AssertionError: if no tag is open; the message is the current
            page number.
        """
        # Raised explicitly rather than via ``assert`` so the check is still
        # performed when Python runs with -O.
        if not self._stack:
            raise AssertionError(str(self.pageno))
        tag = self._stack.pop(-1)
        out_s = f"</{utils.enc(cast(str, tag.name))}>"
        self._write(out_s)

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening element for ``tag`` without leaving it open.

        The tag is taken off the stack again right away, so no closing
        element is ever written for it.
        """
        self.begin_tag(tag, props)
        self._stack.pop(-1)

    def _write(self, s: str) -> None:
        """Encode ``s`` with ``self.codec`` and write it to ``self.outfp``."""
        self.outfp.write(s.encode(self.codec))