Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfdevice.py: 64%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2from typing import (
3 TYPE_CHECKING,
4 BinaryIO,
5 Iterable,
6 List,
7 Optional,
8 Sequence,
9 Union,
10 cast,
11)
13from pdfminer import utils
14from pdfminer.pdfcolor import PDFColorSpace
15from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined
16from pdfminer.pdfpage import PDFPage
17from pdfminer.pdftypes import PDFStream
18from pdfminer.psparser import PSLiteral
19from pdfminer.utils import Matrix, PathSegment, Point, Rect
21if TYPE_CHECKING:
22 from pdfminer.pdfinterp import (
23 PDFGraphicState,
24 PDFResourceManager,
25 PDFStackT,
26 PDFTextState,
27 )
# Type of the ``seq`` argument taken by the render_string* methods below:
# int/float items offset the running text position, bytes items are decoded
# with ``font.decode()``; anything else is reported with a warning.
30PDFTextSeq = Iterable[Union[int, float, bytes]]
# Module-level logger, used by PDFTextDevice to warn about unrenderable items.
32logger = logging.getLogger(__name__)
class PDFDevice:
    """Translate the output of PDFPageInterpreter to the output that is needed

    Every rendering hook on this base class is a no-op; subclasses override
    the ones they care about. The device can be used as a context manager,
    in which case ``close()`` is called on exit.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
        """Remember the resource manager and start without a CTM."""
        self.rsrcmgr = rsrcmgr
        # Stays None until set_ctm() is called.
        self.ctm: Optional[Matrix] = None

    def __repr__(self) -> str:
        return "<PDFDevice>"

    def __enter__(self) -> "PDFDevice":
        """Return the device itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        """Close the device when the ``with`` block ends (exceptions propagate)."""
        self.close()

    def close(self) -> None:
        """Release resources held by the device; nothing to do in the base class."""

    def set_ctm(self, ctm: Matrix) -> None:
        """Store ``ctm`` as the device's current transformation matrix."""
        self.ctm = ctm

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook for the start of a tagged section; base implementation does nothing."""

    def end_tag(self) -> None:
        """Hook for the end of a tagged section; base implementation does nothing."""

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook for a stand-alone tag; base implementation does nothing."""

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Hook for the start of a page; base implementation does nothing."""

    def end_page(self, page: PDFPage) -> None:
        """Hook for the end of a page; base implementation does nothing."""

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Hook for the start of a figure; base implementation does nothing."""

    def end_figure(self, name: str) -> None:
        """Hook for the end of a figure; base implementation does nothing."""

    def paint_path(
        self,
        graphicstate: "PDFGraphicState",
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Hook for painting a path; base implementation does nothing."""

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Hook for rendering an image stream; base implementation does nothing."""

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Hook for rendering a text sequence; base implementation does nothing."""
class PDFTextDevice(PDFDevice):
    """Device that walks a text sequence glyph by glyph.

    ``render_string`` derives the spacing parameters from the text state and
    hands the sequence to a horizontal or vertical walker, depending on the
    font. Each walker calls ``render_char`` per decoded character id and adds
    its return value to the running position; the base ``render_char``
    returns 0, so subclasses override it to do real work.
    """

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Render ``seq`` and store the resulting position in ``textstate.linematrix``.

        Requires ``set_ctm()`` to have been called and ``textstate.font`` to
        be set (both are asserted).
        """
        assert self.ctm is not None
        combined = utils.mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        size = textstate.fontsize
        # The text state stores scaling as a percentage-like value; use a factor.
        factor = textstate.scaling * 0.01
        char_gap = textstate.charspace * factor
        word_gap = textstate.wordspace * factor
        rise = textstate.rise
        assert font is not None
        if font.is_multibyte():
            # Word spacing is disabled for multibyte fonts.
            word_gap = 0
        # Converts numeric items of the sequence into position offsets.
        dxscale = 0.001 * size * factor

        if font.is_vertical():
            walker = self.render_string_vertical
        else:
            walker = self.render_string_horizontal
        textstate.linematrix = walker(
            seq,
            combined,
            textstate.linematrix,
            font,
            size,
            factor,
            char_gap,
            word_gap,
            rise,
            dxscale,
            ncs,
            graphicstate,
        )

    def render_string_horizontal(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Walk ``seq`` advancing along x and return the final ``(x, y)``.

        Numbers move x back by ``number * dxscale``; bytes are decoded with
        the font and each character id is passed to ``render_char``. Items of
        any other type are skipped with a warning.
        """
        x, y = pos
        needcharspace = False
        for obj in seq:
            if isinstance(obj, (int, float)):
                x -= obj * dxscale
                needcharspace = True
                continue
            if not isinstance(obj, bytes):
                logger.warning(
                    f"Cannot render horizontal string because {obj!r} is not a valid int, float or bytes."
                )
                continue
            for cid in font.decode(obj):
                # Character spacing goes before every glyph except the very
                # first one that is not preceded by a numeric adjustment.
                if needcharspace:
                    x += charspace
                advance = self.render_char(
                    utils.translate_matrix(matrix, (x, y)),
                    font,
                    fontsize,
                    scaling,
                    rise,
                    cid,
                    ncs,
                    graphicstate,
                )
                x += advance
                # Character id 32 additionally receives the word spacing.
                if cid == 32 and wordspace:
                    x += wordspace
                needcharspace = True
        return (x, y)

    def render_string_vertical(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Walk ``seq`` advancing along y and return the final ``(x, y)``.

        Mirrors ``render_string_horizontal`` with every adjustment applied to
        y instead of x.
        """
        x, y = pos
        needcharspace = False
        for obj in seq:
            if isinstance(obj, (int, float)):
                y -= obj * dxscale
                needcharspace = True
                continue
            if not isinstance(obj, bytes):
                logger.warning(
                    f"Cannot render vertical string because {obj!r} is not a valid int, float or bytes."
                )
                continue
            for cid in font.decode(obj):
                if needcharspace:
                    y += charspace
                advance = self.render_char(
                    utils.translate_matrix(matrix, (x, y)),
                    font,
                    fontsize,
                    scaling,
                    rise,
                    cid,
                    ncs,
                    graphicstate,
                )
                y += advance
                if cid == 32 and wordspace:
                    y += wordspace
                needcharspace = True
        return (x, y)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> float:
        """Render one character id and return how far the position advances.

        The base implementation renders nothing and reports an advance of 0.
        """
        return 0
class TagExtractor(PDFDevice):
    """Device that writes tags and decoded text as markup to a binary stream.

    Pages are wrapped in ``<page ...>``/``</page>``, tagged sections in
    ``<name ...>``/``</name>``, and text is emitted between them. Everything
    is encoded with ``codec`` before being written to ``outfp``.
    """

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        outfp: BinaryIO,
        codec: str = "utf-8",
    ) -> None:
        """Set up the extractor.

        :param rsrcmgr: resource manager handed to the base device.
        :param outfp: binary stream that receives the encoded markup.
        :param codec: text encoding used for everything written to ``outfp``.
        """
        super().__init__(rsrcmgr)
        self.outfp = outfp
        self.codec = codec
        # Zero-based counter, incremented in end_page().
        self.pageno = 0
        # Tags opened by begin_tag() and not yet closed by end_tag().
        self._stack: List[PSLiteral] = []

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Decode the byte strings in ``seq`` and write the resulting text.

        Non-bytes items (the numeric adjustments) are ignored, and character
        ids the font cannot map to Unicode are silently dropped.
        """
        font = textstate.font
        assert font is not None
        # Collect pieces and join once instead of growing a string with +=.
        pieces: List[str] = []
        for obj in seq:
            if isinstance(obj, str):
                obj = utils.make_compat_bytes(obj)
            if not isinstance(obj, bytes):
                continue
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    # Deliberate best effort: skip glyphs without a mapping.
                    pass
        self._write(utils.enc("".join(pieces)))

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Write the opening ``<page>`` tag with id, media box and rotation."""
        output = '<page id="%s" bbox="%s" rotate="%d">' % (
            self.pageno,
            utils.bbox2str(page.mediabox),
            page.rotate,
        )
        self._write(output)

    def end_page(self, page: PDFPage) -> None:
        """Write the closing ``</page>`` tag and advance the page counter."""
        self._write("</page>\n")
        self.pageno += 1

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening tag for ``tag`` and push it on the open-tag stack.

        When ``props`` is a dict, its items are written as attributes in
        sorted key order; any other ``props`` value is ignored.
        """
        attrs = ""
        if isinstance(props, dict):
            # Values are passed through utils.enc() just like the names, the
            # tag name and the text are, so a value containing a quote or
            # markup character cannot terminate the attribute early.
            attrs = "".join(
                f' {utils.enc(k)}="{utils.enc(utils.make_compat_str(v))}"'
                for (k, v) in sorted(props.items())
            )
        out_s = f"<{utils.enc(cast(str, tag.name))}{attrs}>"
        self._write(out_s)
        self._stack.append(tag)

    def end_tag(self) -> None:
        """Write the closing tag for the most recently opened tag.

        Asserts that a tag is open; the assertion message is the page number.
        """
        assert self._stack, str(self.pageno)
        tag = self._stack.pop(-1)
        out_s = f"</{utils.enc(cast(str, tag.name))}>"
        self._write(out_s)

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening tag for a stand-alone ``tag``.

        The tag is pushed by ``begin_tag`` and popped again right away, so it
        never stays on the stack and no closing tag is written for it.
        """
        self.begin_tag(tag, props)
        self._stack.pop(-1)

    def _write(self, s: str) -> None:
        """Encode ``s`` with the configured codec and write it to ``outfp``."""
        self.outfp.write(s.encode(self.codec))