Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfdevice.py: 54%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2from collections.abc import Iterable, Sequence
3from typing import (
4 TYPE_CHECKING,
5 BinaryIO,
6 Optional,
7 cast,
8)
10from pdfminer import utils
11from pdfminer.pdfcolor import PDFColorSpace
12from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined
13from pdfminer.pdfpage import PDFPage
14from pdfminer.pdftypes import PDFStream
15from pdfminer.psparser import PSLiteral
16from pdfminer.utils import Matrix, PathSegment, Point, Rect
18if TYPE_CHECKING:
19 from pdfminer.pdfinterp import (
20 PDFGraphicState,
21 PDFResourceManager,
22 PDFStackT,
23 PDFTextState,
24 )
27PDFTextSeq = Iterable[int | float | bytes]
29logger = logging.getLogger(__name__)
class PDFDevice:
    """Translate the output of PDFPageInterpreter into the output that is needed.

    This base class stores the resource manager and the current
    transformation matrix; every rendering hook is a no-op that returns
    ``None``. Subclasses override the hooks they care about.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
        """Keep a reference to *rsrcmgr* and start with no CTM set."""
        self.ctm: Matrix | None = None
        self.rsrcmgr = rsrcmgr

    def __repr__(self) -> str:
        """Return the fixed string ``"<PDFDevice>"``."""
        return "<PDFDevice>"

    def __enter__(self) -> "PDFDevice":
        """Enter a ``with`` block; the device itself is the context value."""
        return self

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        """Close the device on leaving a ``with`` block.

        Returns ``None``, so an exception raised inside the block is not
        suppressed.
        """
        self.close()

    def close(self) -> None:
        """Release the device. The base implementation does nothing."""

    def set_ctm(self, ctm: Matrix) -> None:
        """Store *ctm* as the current transformation matrix."""
        self.ctm = ctm

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook for the start of a tagged section. No-op here."""

    def end_tag(self) -> None:
        """Hook for the end of a tagged section. No-op here."""

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook for a stand-alone tag. No-op here."""

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Hook called when a page starts. No-op here."""

    def end_page(self, page: PDFPage) -> None:
        """Hook called when a page ends. No-op here."""

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Hook called when a figure starts. No-op here."""

    def end_figure(self, name: str) -> None:
        """Hook called when a figure ends. No-op here."""

    def paint_path(
        self,
        graphicstate: "PDFGraphicState",
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Hook for painting *path* with the given flags. No-op here."""

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Hook for rendering the image *stream* called *name*. No-op here."""

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Hook for rendering the text operand sequence *seq*. No-op here."""
class PDFTextDevice(PDFDevice):
    """Device that walks a text operand sequence one character at a time.

    ``render_string`` derives the spacing values from the text state and
    hands the sequence to the horizontal or vertical walker. Each walker
    calls ``render_char`` per character id and advances the position by
    whatever ``render_char`` returns.
    """

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Render *seq* and store the resulting position in the text state.

        The text matrix is combined with the device CTM, the scaling
        percentage is turned into a factor, and character/word spacing are
        multiplied by that factor. Word spacing is forced to 0 for
        multibyte fonts. ``textstate.linematrix`` is overwritten with the
        position returned by the walker.

        Raises:
            AssertionError: if no CTM has been set or the text state has
                no font.
        """
        assert self.ctm is not None
        text_matrix = utils.mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        size = textstate.fontsize
        h_scale = textstate.scaling * 0.01  # percentage -> factor
        char_gap = textstate.charspace * h_scale
        word_gap = textstate.wordspace * h_scale
        text_rise = textstate.rise
        assert font is not None
        if font.is_multibyte():
            word_gap = 0
        # Factor applied to the numeric entries of seq by the walkers.
        adjust_unit = 0.001 * size * h_scale

        if font.is_vertical():
            walker = self.render_string_vertical
        else:
            walker = self.render_string_horizontal
        textstate.linematrix = walker(
            seq,
            text_matrix,
            textstate.linematrix,
            font,
            size,
            h_scale,
            char_gap,
            word_gap,
            text_rise,
            adjust_unit,
            ncs,
            graphicstate,
        )

    def render_string_horizontal(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Walk *seq* along the x axis and return the final ``(x, y)``.

        Numbers move x back by ``number * dxscale``. Byte strings are
        decoded by *font* into character ids; each id is rendered with
        ``render_char`` and x advances by its return value. *charspace* is
        added before a character whenever a number or another character
        preceded it in this sequence. *wordspace*, when non-zero, is added
        after character id 32. Any other element type is skipped with a
        warning.
        """
        cur_x, cur_y = pos
        pad_next = False
        for item in seq:
            if isinstance(item, (int, float)):
                cur_x -= item * dxscale
                pad_next = True
                continue
            if not isinstance(item, bytes):
                logger.warning(
                    "Cannot render horizontal string because "
                    f"{item!r} is not a valid int, float or bytes."
                )
                continue
            for cid in font.decode(item):
                if pad_next:
                    cur_x += charspace
                char_matrix = utils.translate_matrix(matrix, (cur_x, cur_y))
                cur_x += self.render_char(
                    char_matrix,
                    font,
                    fontsize,
                    scaling,
                    rise,
                    cid,
                    ncs,
                    graphicstate,
                )
                if cid == 32 and wordspace:
                    cur_x += wordspace
                pad_next = True
        return (cur_x, cur_y)

    def render_string_vertical(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Walk *seq* along the y axis and return the final ``(x, y)``.

        Same procedure as ``render_string_horizontal`` except that every
        adjustment (numeric shift, character spacing, the advance returned
        by ``render_char`` and word spacing) is applied to y instead of x.
        """
        cur_x, cur_y = pos
        pad_next = False
        for item in seq:
            if isinstance(item, (int, float)):
                cur_y -= item * dxscale
                pad_next = True
                continue
            if not isinstance(item, bytes):
                logger.warning(
                    f"Cannot render vertical string because {item!r} is not a "
                    "valid int, float or bytes."
                )
                continue
            for cid in font.decode(item):
                if pad_next:
                    cur_y += charspace
                char_matrix = utils.translate_matrix(matrix, (cur_x, cur_y))
                cur_y += self.render_char(
                    char_matrix,
                    font,
                    fontsize,
                    scaling,
                    rise,
                    cid,
                    ncs,
                    graphicstate,
                )
                if cid == 32 and wordspace:
                    cur_y += wordspace
                pad_next = True
        return (cur_x, cur_y)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> float:
        """Render one character id and return how far to advance.

        The base implementation renders nothing and returns 0, so the
        walkers only move by the spacing values.
        """
        return 0
class TagExtractor(PDFDevice):
    """Device that writes pages, tags and decoded text as markup to a binary file.

    Every piece of output goes through ``_write``, which encodes it with
    ``codec`` before writing to ``outfp``.
    """

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        outfp: BinaryIO,
        codec: str = "utf-8",
    ) -> None:
        """Set up the output target.

        Args:
            rsrcmgr: resource manager passed on to ``PDFDevice``.
            outfp: binary file object that receives the encoded markup.
            codec: text encoding used for everything written to *outfp*.
        """
        PDFDevice.__init__(self, rsrcmgr)
        self.outfp = outfp
        self.codec = codec
        self.pageno = 0  # incremented by end_page
        self._stack: list[PSLiteral] = []  # tags opened and not yet closed

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Decode the byte strings in *seq* and write the escaped text.

        ``str`` elements are first converted with
        ``utils.make_compat_bytes``; elements that are not bytes after that
        (the numeric entries) are ignored. Character ids without a Unicode
        mapping are dropped.

        Raises:
            AssertionError: if the text state has no font.
        """
        font = textstate.font
        assert font is not None
        pieces: list[str] = []
        for obj in seq:
            if isinstance(obj, str):
                obj = utils.make_compat_bytes(obj)
            if not isinstance(obj, bytes):
                continue
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    # Deliberate best effort: skip characters that cannot
                    # be mapped to Unicode.
                    continue
        # Join once instead of growing a string character by character.
        self._write(utils.enc("".join(pieces)))

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Write the opening ``<page>`` tag with id, bbox and rotation.

        *ctm* is not used by this device.
        """
        output = (
            f'<page id="{self.pageno}" bbox="{utils.bbox2str(page.mediabox)}" '
            f'rotate="{page.rotate}">'
        )
        self._write(output)

    def end_page(self, page: PDFPage) -> None:
        """Write the closing ``</page>`` tag and advance the page counter."""
        self._write("</page>\n")
        self.pageno += 1

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening tag for *tag* and push it on the tag stack.

        When *props* is a dict, its items are emitted as attributes in
        sorted order. Attribute values come from the PDF being processed,
        so they are escaped with ``utils.enc`` (the helper already used for
        the tag name and for text content); otherwise a value containing a
        quote or angle bracket would break the generated markup.
        """
        attrs = ""
        if isinstance(props, dict):
            attrs = "".join(
                f' {utils.enc(k)}="{utils.enc(utils.make_compat_str(v))}"'
                for (k, v) in sorted(props.items())
            )
        self._write(f"<{utils.enc(cast(str, tag.name))}{attrs}>")
        self._stack.append(tag)

    def end_tag(self) -> None:
        """Write the closing tag for the most recently opened tag.

        Raises:
            AssertionError: if no tag is open; the message is the current
                page number.
        """
        assert self._stack, str(self.pageno)
        tag = self._stack.pop(-1)
        self._write(f"</{utils.enc(cast(str, tag.name))}>")

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write a stand-alone tag: emit it like ``begin_tag``, then forget it.

        The tag is popped straight off the stack so a later ``end_tag``
        does not close it.
        NOTE(review): no closing tag is written for it, so the output
        contains an unclosed element - confirm this is intended.
        """
        self.begin_tag(tag, props)
        self._stack.pop(-1)

    def _write(self, s: str) -> None:
        """Encode *s* with the configured codec and write it to ``outfp``."""
        self.outfp.write(s.encode(self.codec))