Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfdevice.py: 63%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from typing import (
2 TYPE_CHECKING,
3 BinaryIO,
4 Iterable,
5 List,
6 Optional,
7 Sequence,
8 Union,
9 cast,
10)
12from pdfminer import utils
13from pdfminer.pdfcolor import PDFColorSpace
14from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined
15from pdfminer.pdfpage import PDFPage
16from pdfminer.pdftypes import PDFStream
17from pdfminer.psparser import PSLiteral
18from pdfminer.utils import Matrix, PathSegment, Point, Rect
20if TYPE_CHECKING:
21 from pdfminer.pdfinterp import (
22 PDFGraphicState,
23 PDFResourceManager,
24 PDFStackT,
25 PDFTextState,
26 )
# Sequence handed to the ``render_string`` methods. The renderers in this
# module treat int/float elements as position adjustments and pass every
# other element (bytes) to ``font.decode``.
PDFTextSeq = Iterable[Union[int, float, bytes]]
class PDFDevice:
    """Translate the output of PDFPageInterpreter to the output that is needed.

    Every rendering callback defined here is a no-op that returns ``None``;
    subclasses override the callbacks they need. An instance can be used as
    a context manager, in which case ``close()`` is invoked on exit.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
        """Remember the resource manager; no CTM is known until ``set_ctm``."""
        self.rsrcmgr = rsrcmgr
        self.ctm: Optional[Matrix] = None

    def __repr__(self) -> str:
        """Return the fixed tag ``<PDFDevice>``."""
        return "<PDFDevice>"

    def __enter__(self) -> "PDFDevice":
        """Enter the ``with`` block, yielding the device itself."""
        return self

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        """Close the device when the ``with`` block ends.

        Returns ``None``, so an exception raised inside the block is not
        suppressed.
        """
        self.close()

    def close(self) -> None:
        """Do nothing; subclasses may override."""

    def set_ctm(self, ctm: Matrix) -> None:
        """Store ``ctm`` on the device as ``self.ctm``."""
        self.ctm = ctm

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Do nothing; subclasses may override."""

    def end_tag(self) -> None:
        """Do nothing; subclasses may override."""

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Do nothing; subclasses may override."""

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Do nothing; subclasses may override."""

    def end_page(self, page: PDFPage) -> None:
        """Do nothing; subclasses may override."""

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Do nothing; subclasses may override."""

    def end_figure(self, name: str) -> None:
        """Do nothing; subclasses may override."""

    def paint_path(
        self,
        graphicstate: "PDFGraphicState",
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Do nothing; subclasses may override."""

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Do nothing; subclasses may override."""

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Do nothing; subclasses may override."""
class PDFTextDevice(PDFDevice):
    """Device that walks a text sequence one decoded character at a time.

    ``render_string`` derives spacing values from the text state, delegates
    to the horizontal or vertical renderer, and stores the position they
    return back into ``textstate.linematrix``. ``render_char`` returns 0 in
    this class; the renderers add its return value to the running position.
    """

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Render ``seq`` and update ``textstate.linematrix`` in place.

        Requires ``self.ctm`` and ``textstate.font`` to be set (asserted).
        """
        assert self.ctm is not None
        combined = utils.mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        fontsize = textstate.fontsize
        # presumably textstate.scaling is a percentage - TODO confirm
        h_scale = textstate.scaling * 0.01
        char_gap = textstate.charspace * h_scale
        word_gap = textstate.wordspace * h_scale
        rise = textstate.rise
        assert font is not None
        if font.is_multibyte():
            # Word spacing is disabled for multibyte fonts.
            word_gap = 0
        # Factor applied to the numeric entries of ``seq``.
        adjust_scale = 0.001 * fontsize * h_scale
        renderer = (
            self.render_string_vertical
            if font.is_vertical()
            else self.render_string_horizontal
        )
        textstate.linematrix = renderer(
            seq,
            combined,
            textstate.linematrix,
            font,
            fontsize,
            h_scale,
            char_gap,
            word_gap,
            rise,
            adjust_scale,
            ncs,
            graphicstate,
        )

    def render_string_horizontal(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Render ``seq`` advancing along x; return the final ``(x, y)``.

        Numeric items move x back by ``item * dxscale``. Other items are
        decoded by ``font`` and each resulting cid is passed to
        ``render_char``, whose return value is added to x.
        """
        cursor, fixed_y = pos
        # Character spacing is inserted before every cid except the first
        # one of a sequence that starts with a string.
        gap_pending = False
        for item in seq:
            if isinstance(item, (int, float)):
                cursor -= item * dxscale
                gap_pending = True
                continue
            for cid in font.decode(item):
                if gap_pending:
                    cursor += charspace
                glyph_matrix = utils.translate_matrix(matrix, (cursor, fixed_y))
                cursor += self.render_char(
                    glyph_matrix,
                    font,
                    fontsize,
                    scaling,
                    rise,
                    cid,
                    ncs,
                    graphicstate,
                )
                # cid 32 is presumably the space character - TODO confirm
                if cid == 32 and wordspace:
                    cursor += wordspace
                gap_pending = True
        return (cursor, fixed_y)

    def render_string_vertical(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Render ``seq`` advancing along y; return the final ``(x, y)``.

        Mirrors ``render_string_horizontal`` with y as the moving
        coordinate and x held constant.
        """
        fixed_x, cursor = pos
        gap_pending = False
        for item in seq:
            if isinstance(item, (int, float)):
                cursor -= item * dxscale
                gap_pending = True
                continue
            for cid in font.decode(item):
                if gap_pending:
                    cursor += charspace
                glyph_matrix = utils.translate_matrix(matrix, (fixed_x, cursor))
                cursor += self.render_char(
                    glyph_matrix,
                    font,
                    fontsize,
                    scaling,
                    rise,
                    cid,
                    ncs,
                    graphicstate,
                )
                # cid 32 is presumably the space character - TODO confirm
                if cid == 32 and wordspace:
                    cursor += wordspace
                gap_pending = True
        return (fixed_x, cursor)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> float:
        """Return 0; the renderers add this value to the running position."""
        return 0
class TagExtractor(PDFDevice):
    """Device that writes pages, tags and text as markup to a binary stream.

    Pages are wrapped in ``<page ...>``/``</page>`` and tags are written as
    ``<name attr="value">``/``</name>``. Everything is encoded with
    ``codec`` before being written to ``outfp``.
    """

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        outfp: BinaryIO,
        codec: str = "utf-8",
    ) -> None:
        """Set up the output stream, codec, page counter and tag stack."""
        PDFDevice.__init__(self, rsrcmgr)
        self.outfp = outfp
        self.codec = codec
        # Index written as the ``id`` of the next <page>; bumped in end_page.
        self.pageno = 0
        # Tags opened by begin_tag and not yet closed by end_tag.
        self._stack: List[PSLiteral] = []

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Decode the string items of ``seq`` and write them as encoded text.

        Items that are neither ``str`` nor ``bytes`` are skipped, as are
        cids for which the font raises ``PDFUnicodeNotDefined``.
        """
        font = textstate.font
        assert font is not None
        pieces: List[str] = []
        for obj in seq:
            if isinstance(obj, str):
                obj = utils.make_compat_bytes(obj)
            if not isinstance(obj, bytes):
                continue
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    # Deliberate best effort: drop characters the font
                    # cannot map instead of aborting the extraction.
                    continue
        # Join once rather than growing a string with repeated ``+=``.
        self._write(utils.enc("".join(pieces)))

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Write the opening ``<page>`` tag with id, bbox and rotation."""
        output = '<page id="%s" bbox="%s" rotate="%d">' % (
            self.pageno,
            utils.bbox2str(page.mediabox),
            page.rotate,
        )
        self._write(output)

    def end_page(self, page: PDFPage) -> None:
        """Write the closing ``</page>`` tag and advance the page counter."""
        self._write("</page>\n")
        self.pageno += 1

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening tag for ``tag`` and push it on the tag stack.

        When ``props`` is a dict, its items are written as attributes in
        sorted order. Any other ``props`` value is ignored.
        """
        attrs = ""
        if isinstance(props, dict):
            # Values are passed through utils.enc like keys, tag names and
            # text, so that a value containing quotes or angle brackets
            # cannot break the emitted markup (assumes utils.enc escapes
            # those characters - confirm against utils).
            attrs = "".join(
                f' {utils.enc(k)}="{utils.enc(utils.make_compat_str(v))}"'
                for (k, v) in sorted(props.items())
            )
        out_s = f"<{utils.enc(cast(str, tag.name))}{attrs}>"
        self._write(out_s)
        self._stack.append(tag)

    def end_tag(self) -> None:
        """Pop the most recently opened tag and write its closing tag.

        Asserts that a tag is open; the assertion message is the current
        page number.
        """
        assert self._stack, str(self.pageno)
        tag = self._stack.pop(-1)
        out_s = "</%s>" % utils.enc(cast(str, tag.name))
        self._write(out_s)

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening tag for ``tag`` without leaving it on the stack.

        NOTE(review): no closing tag is written for it - confirm that an
        unclosed element is the intended output.
        """
        self.begin_tag(tag, props)
        # begin_tag pushed the tag; remove it because no end_tag will follow.
        self._stack.pop(-1)

    def _write(self, s: str) -> None:
        """Encode ``s`` with the configured codec and write it to ``outfp``."""
        self.outfp.write(s.encode(self.codec))