Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfinterp.py: 94%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2import re
3from io import BytesIO
4from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
6from pdfminer import settings
7from pdfminer.casting import safe_float
8from pdfminer.cmapdb import CMap, CMapBase, CMapDB
9from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
10from pdfminer.pdfdevice import PDFDevice, PDFTextSeq
11from pdfminer.pdfexceptions import PDFException, PDFValueError
12from pdfminer.pdffont import (
13 PDFCIDFont,
14 PDFFont,
15 PDFFontError,
16 PDFTrueTypeFont,
17 PDFType1Font,
18 PDFType3Font,
19)
20from pdfminer.pdfpage import PDFPage
21from pdfminer.pdftypes import (
22 LITERALS_ASCII85_DECODE,
23 PDFObjRef,
24 PDFStream,
25 dict_value,
26 list_value,
27 resolve1,
28 stream_value,
29)
30from pdfminer.psexceptions import PSEOF, PSTypeError
31from pdfminer.psparser import (
32 KWD,
33 LIT,
34 PSKeyword,
35 PSLiteral,
36 PSStackParser,
37 PSStackType,
38 keyword_name,
39 literal_name,
40)
41from pdfminer.utils import (
42 MATRIX_IDENTITY,
43 Matrix,
44 PathSegment,
45 Point,
46 Rect,
47 choplist,
48 mult_matrix,
49)
# Module-level logger named after this module; used for the debug traces below.
51log = logging.getLogger(__name__)
class PDFResourceError(PDFException):
    """PDFException subclass that adds no behavior of its own.

    It is not raised anywhere in the visible part of this module.
    """
class PDFInterpreterError(PDFException):
    """PDFException subclass raised by PDFPageInterpreter in strict mode.

    It adds no behavior of its own. The interpreter raises it, when
    ``settings.STRICT`` is set, for undefined color spaces, fonts and
    XObjects, for text shown without a font, and for unknown operators.
    """
# Name literals that this module compares against by identity (`is` / `is not`):
# PDF and Text in get_procset, Font in get_font, Form and Image in do_Do.
62LITERAL_PDF = LIT("PDF")
63LITERAL_TEXT = LIT("Text")
64LITERAL_FONT = LIT("Font")
65LITERAL_FORM = LIT("Form")
66LITERAL_IMAGE = LIT("Image")
class PDFTextState:
    """Mutable record of the text-state values tracked by the interpreter.

    It holds the font, font size, character and word spacing, horizontal
    scaling, leading, render mode and rise, together with ``matrix`` and
    ``linematrix``, which :meth:`reset` re-initializes.
    """

    matrix: Matrix
    linematrix: Point

    def __init__(self) -> None:
        """Create a state with no font, 100 scaling and every other value 0."""
        self.font: Optional[PDFFont] = None
        self.fontsize: float = 0
        self.charspace: float = 0
        self.wordspace: float = 0
        self.scaling: float = 100
        self.leading: float = 0
        self.render: int = 0
        self.rise: float = 0
        # reset() is what assigns self.matrix and self.linematrix.
        self.reset()

    def __repr__(self) -> str:
        """Return a debugging string that lists every field with ``repr``."""
        return (
            f"<PDFTextState: font={self.font!r}, fontsize={self.fontsize!r}, "
            f"charspace={self.charspace!r}, "
            f"wordspace={self.wordspace!r}, scaling={self.scaling!r}, "
            f"leading={self.leading!r}, render={self.render!r}, "
            f"rise={self.rise!r}, "
            f"matrix={self.matrix!r}, linematrix={self.linematrix!r}>"
        )

    def copy(self) -> "PDFTextState":
        """Return a new PDFTextState whose fields reference the same values."""
        clone = PDFTextState()
        for field in (
            "font",
            "fontsize",
            "charspace",
            "wordspace",
            "scaling",
            "leading",
            "render",
            "rise",
            "matrix",
            "linematrix",
        ):
            setattr(clone, field, getattr(self, field))
        return clone

    def reset(self) -> None:
        """Set ``matrix`` to MATRIX_IDENTITY and ``linematrix`` to (0, 0)."""
        self.linematrix = (0, 0)
        self.matrix = MATRIX_IDENTITY
# Type of the color values stored in PDFGraphicState.scolor and .ncolor.
124Color = Union[
125 float, # Greyscale
126 Tuple[float, float, float], # R, G, B
127 Tuple[float, float, float, float], # C, M, Y, K
128]
class PDFGraphicState:
    """Mutable record of the graphics-state values set by the interpreter.

    It stores the line width, cap, join, miter limit, dash pattern,
    rendering intent, flatness and the stroking / non-stroking colors.
    """

    def __init__(self) -> None:
        """Create a state with line width 0 and every other field None."""
        self.linewidth: float = 0
        self.linecap: Optional[object] = None
        self.linejoin: Optional[object] = None
        self.miterlimit: Optional[object] = None
        self.dash: Optional[Tuple[object, object]] = None
        self.intent: Optional[object] = None
        self.flatness: Optional[object] = None

        # Color used by stroking operators.
        self.scolor: Optional[Color] = None

        # Color used by non-stroking operators.
        self.ncolor: Optional[Color] = None

    def copy(self) -> "PDFGraphicState":
        """Return a new PDFGraphicState whose fields reference the same values."""
        duplicate = PDFGraphicState()
        for field in (
            "linewidth",
            "linecap",
            "linejoin",
            "miterlimit",
            "dash",
            "intent",
            "flatness",
            "scolor",
            "ncolor",
        ):
            setattr(duplicate, field, getattr(self, field))
        return duplicate

    def __repr__(self) -> str:
        """Return a debugging string that lists every field with ``repr``."""
        return (
            f"<PDFGraphicState: linewidth={self.linewidth!r}, "
            f"linecap={self.linecap!r}, linejoin={self.linejoin!r}, "
            f" miterlimit={self.miterlimit!r}, dash={self.dash!r}, "
            f"intent={self.intent!r}, flatness={self.flatness!r}, "
            f" stroking color={self.scolor!r}, "
            f"non stroking color={self.ncolor!r}>"
        )
class PDFResourceManager:
    """Repository of shared resources.

    ResourceManager facilitates reuse of shared resources
    such as fonts and images so that large objects are not
    allocated multiple times.
    """

    def __init__(self, caching: bool = True) -> None:
        """Start with an empty font cache.

        :param caching: when false, fonts created by :meth:`get_font` are
            never stored in the cache.
        """
        self._cached_fonts: Dict[object, PDFFont] = {}
        self.caching = caching

    def get_procset(self, procs: Sequence[object]) -> None:
        """Walk ``procs`` without acting on any entry.

        Each entry is compared with the /PDF and /Text literals, but both
        outcomes are ignored; the method has no effect beyond iterating.
        """
        for proc in procs:
            if proc is not LITERAL_PDF and proc is not LITERAL_TEXT:
                continue

    def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase:
        """Look ``cmapname`` up in CMapDB.

        :param strict: when true, ``CMapDB.CMapNotFound`` propagates;
            otherwise an empty ``CMap`` is returned for unknown names.
        """
        try:
            return CMapDB.get_cmap(cmapname)
        except CMapDB.CMapNotFound:
            if not strict:
                return CMap()
            raise

    def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
        """Return the font for ``spec``, reusing a cached instance if possible.

        :param objid: cache key; a falsy value disables both cache lookup
            and cache storage for this call.
        :param spec: font dictionary. Its ``Subtype`` selects the font class
            and defaults to ``Type1`` when absent (non-strict mode only).
        :raises PDFFontError: in strict mode, when ``Type`` is not /Font,
            ``Subtype`` is missing, or ``Subtype`` is not recognized.
        """
        # Guard clause: a cache hit needs no further work.
        if objid and objid in self._cached_fonts:
            return self._cached_fonts[objid]

        log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
        if settings.STRICT and spec["Type"] is not LITERAL_FONT:
            raise PDFFontError("Type is not /Font")

        # Work out which kind of font object to create.
        if "Subtype" in spec:
            subtype = literal_name(spec["Subtype"])
        elif settings.STRICT:
            raise PDFFontError("Font Subtype is not specified.")
        else:
            subtype = "Type1"

        if subtype == "Type0":
            # Type0 Font: build the first descendant font, carrying over the
            # parent's Encoding / ToUnicode entries when present.
            descendants = list_value(spec["DescendantFonts"])
            assert descendants
            subspec = dict_value(descendants[0]).copy()
            for key in ("Encoding", "ToUnicode"):
                if key in spec:
                    subspec[key] = resolve1(spec[key])
            font = self.get_font(None, subspec)
        elif subtype in ("CIDFontType0", "CIDFontType2"):
            # CID Font
            font = PDFCIDFont(self, spec)
        elif subtype == "Type3":
            # Type3 Font
            font = PDFType3Font(self, spec)
        elif subtype == "TrueType":
            # TrueType Font
            font = PDFTrueTypeFont(self, spec)
        elif subtype in ("Type1", "MMType1"):
            # Type1 Font
            font = PDFType1Font(self, spec)
        else:
            if settings.STRICT:
                raise PDFFontError("Invalid Font spec: %r" % spec)
            font = PDFType1Font(self, spec)  # this is so wrong!

        if objid and self.caching:
            self._cached_fonts[objid] = font
        return font
class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
    """Stack parser that reads a sequence of content streams as one input.

    The streams are opened lazily, one after another, by :meth:`fillfp`.
    Inline images (``BI ... ID ... EI``) are turned into a ``PDFStream``
    object pushed onto the parser stack by :meth:`do_keyword`.
    """

    def __init__(self, streams: Sequence[object]) -> None:
        """Remember ``streams``; nothing is read until parsing starts."""
        self.streams = streams
        self.istream = 0
        # PSStackParser.__init__(fp=None) is safe only because we've overloaded
        # all the methods that would attempt to access self.fp without first
        # calling self.fillfp().
        PSStackParser.__init__(self, None)  # type: ignore[arg-type]

    def fillfp(self) -> None:
        """Make sure ``self.fp`` is open, advancing to the next stream.

        :raises PSEOF: when no current file is open and all streams have
            already been consumed.
        """
        if not self.fp:
            if self.istream < len(self.streams):
                strm = stream_value(self.streams[self.istream])
                self.istream += 1
            else:
                raise PSEOF("Unexpected EOF, file truncated?")
            self.fp = BytesIO(strm.get_data())

    def seek(self, pos: int) -> None:
        """Seek to ``pos`` after ensuring a stream is open."""
        self.fillfp()
        PSStackParser.seek(self, pos)

    def fillbuf(self) -> None:
        """Refill ``self.buf`` once it is exhausted.

        Streams that yield no data are dropped (``self.fp`` reset to None)
        so that the next :meth:`fillfp` call moves on to the following one.
        """
        if self.charpos < len(self.buf):
            return
        while True:
            self.fillfp()
            self.bufpos = self.fp.tell()
            self.buf = self.fp.read(self.BUFSIZ)
            if self.buf:
                break
            self.fp = None  # type: ignore[assignment]
        self.charpos = 0

    def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]:
        """Read inline-image data starting at ``pos`` up to ``target``.

        The scan ends once ``target`` has been matched byte by byte and is
        followed by a whitespace byte.

        :return: ``(pos, data)`` where ``data`` excludes the terminator, the
            whitespace byte after it, and one trailing CR, LF or CRLF.
        """
        self.seek(pos)
        # i counts how many bytes of target (plus the trailing whitespace)
        # have been matched so far; 0 means "searching for target[0]".
        i = 0
        data = b""
        while i <= len(target):
            self.fillbuf()
            if i:
                ci = self.buf[self.charpos]
                c = bytes((ci,))
                data += c
                self.charpos += 1
                if (len(target) <= i and c.isspace()) or (
                    i < len(target) and c == (bytes((target[i],)))
                ):
                    i += 1
                else:
                    i = 0
            else:
                try:
                    j = self.buf.index(target[0], self.charpos)
                    data += self.buf[self.charpos : j + 1]
                    self.charpos = j + 1
                    i = 1
                except ValueError:
                    # First target byte not in this buffer: keep it all.
                    data += self.buf[self.charpos :]
                    self.charpos = len(self.buf)
        data = data[: -(len(target) + 1)]  # strip the last part
        data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
        return (pos, data)

    def flush(self) -> None:
        """Move everything on the parser stack to the results."""
        self.add_results(*self.popall())

    KEYWORD_BI = KWD(b"BI")
    KEYWORD_ID = KWD(b"ID")
    KEYWORD_EI = KWD(b"EI")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle a keyword token, assembling inline images on the way.

        ``BI`` opens an "inline" context; ``ID`` closes it, builds the image
        dictionary from the collected key/value pairs, reads the image data
        and pushes a ``PDFStream`` (followed by the ``EI`` keyword when the
        data was terminated by ``EI``). Any other keyword is pushed as is.

        :raises PSTypeError: in strict mode only; otherwise the malformed
            inline image is skipped.
        """
        if token is self.KEYWORD_BI:
            # inline image within a content stream
            self.start_type(pos, "inline")
        elif token is self.KEYWORD_ID:
            try:
                (_, objs) = self.end_type("inline")
                if len(objs) % 2 != 0:
                    error_msg = f"Invalid dictionary construct: {objs!r}"
                    raise PSTypeError(error_msg)
                d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
                eos = b"EI"
                filters = d.get("F")
                if isinstance(filters, PSLiteral):
                    filters = [filters]
                # Only a non-empty array can name a first filter. An empty
                # array or a non-array value previously raised an uncaught
                # IndexError / TypeError here.
                if (
                    isinstance(filters, (list, tuple))
                    and filters
                    and filters[0] in LITERALS_ASCII85_DECODE
                ):
                    eos = b"~>"
                (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
                if eos != b"EI":  # it may be necessary for decoding
                    data += eos
                obj = PDFStream(d, data)
                self.push((pos, obj))
                if eos == b"EI":  # otherwise it is still in the stream
                    self.push((pos, self.KEYWORD_EI))
            except PSTypeError:
                if settings.STRICT:
                    raise
        else:
            self.push((pos, token))
# Element type of PDFPageInterpreter.argstack and of the operands handed to
# the interpreter's do_* operator methods.
357PDFStackT = PSStackType[PDFStream]
358"""Types that may appear on the PDF argument stack."""
class PDFPageInterpreter:
    """Processor for the content of a PDF page

    Reference: PDF Reference, Appendix A, Operator Summary

    Every ``do_<name>`` method implements one content-stream operator.
    :meth:`execute` finds the method by name (``*`` becomes ``_a``, ``"``
    becomes ``_w`` and ``'`` becomes ``_q``) and pops as many operands from
    the argument stack as the method has positional parameters besides
    ``self``, so the signatures of these methods are part of the dispatch.
    """

    def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice) -> None:
        """Bind the interpreter to a resource manager and an output device."""
        self.rsrcmgr = rsrcmgr
        self.device = device

    def dup(self) -> "PDFPageInterpreter":
        """Return a new interpreter sharing this one's manager and device."""
        return self.__class__(self.rsrcmgr, self.device)

    def init_resources(self, resources: Dict[object, object]) -> None:
        """Prepare the fonts and XObjects listed in the Resource attribute."""
        self.resources = resources
        self.fontmap: Dict[object, PDFFont] = {}
        self.xobjmap = {}
        self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
            # A color space is either a bare name or an array whose first
            # element is the family name.
            if isinstance(spec, list):
                name = literal_name(spec[0])
            else:
                name = literal_name(spec)
            if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2:
                # Component count comes from the ICC stream's N entry.
                return PDFColorSpace(name, stream_value(spec[1])["N"])
            elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2:
                # Component count is the length of the names array.
                return PDFColorSpace(name, len(list_value(spec[1])))
            else:
                return PREDEFINED_COLORSPACE.get(name)

        for k, v in dict_value(resources).items():
            log.debug("Resource: %r: %r", k, v)
            if k == "Font":
                for fontid, spec in dict_value(v).items():
                    objid = None
                    if isinstance(spec, PDFObjRef):
                        # Indirect references give the font cache its key.
                        objid = spec.objid
                    spec = dict_value(spec)
                    self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
            elif k == "ColorSpace":
                for csid, spec in dict_value(v).items():
                    colorspace = get_colorspace(resolve1(spec))
                    if colorspace is not None:
                        self.csmap[csid] = colorspace
            elif k == "ProcSet":
                self.rsrcmgr.get_procset(list_value(v))
            elif k == "XObject":
                for xobjid, xobjstrm in dict_value(v).items():
                    self.xobjmap[xobjid] = xobjstrm

    def init_state(self, ctm: Matrix) -> None:
        """Initialize the text and graphic states for rendering a page."""
        # gstack: stack for graphical states.
        self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = []
        self.ctm = ctm
        self.device.set_ctm(self.ctm)
        self.textstate = PDFTextState()
        self.graphicstate = PDFGraphicState()
        self.curpath: List[PathSegment] = []
        # argstack: stack for command arguments.
        self.argstack: List[PDFStackT] = []
        # set some global states.
        self.scs: Optional[PDFColorSpace] = None
        self.ncs: Optional[PDFColorSpace] = None
        if self.csmap:
            # Default both color spaces to the first entry of csmap.
            self.scs = self.ncs = next(iter(self.csmap.values()))

    def push(self, obj: PDFStackT) -> None:
        """Push an operand onto the argument stack."""
        self.argstack.append(obj)

    def pop(self, n: int) -> List[PDFStackT]:
        """Remove and return the last ``n`` operands, in stack order.

        Fewer than ``n`` items are returned when the stack is shorter.
        """
        if n == 0:
            return []
        x = self.argstack[-n:]
        self.argstack = self.argstack[:-n]
        return x

    def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
        """Return the CTM with copies of the text and graphic states."""
        return (self.ctm, self.textstate.copy(), self.graphicstate.copy())

    def set_current_state(
        self,
        state: Tuple[Matrix, PDFTextState, PDFGraphicState],
    ) -> None:
        """Install ``state`` and forward its CTM to the device."""
        (self.ctm, self.textstate, self.graphicstate) = state
        self.device.set_ctm(self.ctm)

    def do_q(self) -> None:
        """Save graphics state"""
        self.gstack.append(self.get_current_state())

    def do_Q(self) -> None:
        """Restore graphics state (ignored when nothing was saved)"""
        if self.gstack:
            self.set_current_state(self.gstack.pop())

    def do_cm(
        self,
        a1: PDFStackT,
        b1: PDFStackT,
        c1: PDFStackT,
        d1: PDFStackT,
        e1: PDFStackT,
        f1: PDFStackT,
    ) -> None:
        """Concatenate matrix to current transformation matrix"""
        self.ctm = mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm)
        self.device.set_ctm(self.ctm)

    def do_w(self, linewidth: PDFStackT) -> None:
        """Set line width"""
        self.graphicstate.linewidth = cast(float, linewidth)

    def do_J(self, linecap: PDFStackT) -> None:
        """Set line cap style"""
        self.graphicstate.linecap = linecap

    def do_j(self, linejoin: PDFStackT) -> None:
        """Set line join style"""
        self.graphicstate.linejoin = linejoin

    def do_M(self, miterlimit: PDFStackT) -> None:
        """Set miter limit"""
        self.graphicstate.miterlimit = miterlimit

    def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None:
        """Set line dash pattern"""
        self.graphicstate.dash = (dash, phase)

    def do_ri(self, intent: PDFStackT) -> None:
        """Set color rendering intent"""
        self.graphicstate.intent = intent

    def do_i(self, flatness: PDFStackT) -> None:
        """Set flatness tolerance"""
        self.graphicstate.flatness = flatness

    def do_gs(self, name: PDFStackT) -> None:
        """Set parameters from graphics state parameter dictionary"""
        # TODO

    def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
        """Begin new subpath"""
        self.curpath.append(("m", cast(float, x), cast(float, y)))

    def do_l(self, x: PDFStackT, y: PDFStackT) -> None:
        """Append straight line segment to path"""
        self.curpath.append(("l", cast(float, x), cast(float, y)))

    def do_c(
        self,
        x1: PDFStackT,
        y1: PDFStackT,
        x2: PDFStackT,
        y2: PDFStackT,
        x3: PDFStackT,
        y3: PDFStackT,
    ) -> None:
        """Append curved segment to path (three control points)"""
        self.curpath.append(
            (
                "c",
                cast(float, x1),
                cast(float, y1),
                cast(float, x2),
                cast(float, y2),
                cast(float, x3),
                cast(float, y3),
            ),
        )

    def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
        """Append curved segment to path (initial point replicated)"""
        self.curpath.append(
            ("v", cast(float, x2), cast(float, y2), cast(float, x3), cast(float, y3)),
        )

    def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
        """Append curved segment to path (final point replicated)"""
        self.curpath.append(
            ("y", cast(float, x1), cast(float, y1), cast(float, x3), cast(float, y3)),
        )

    def do_h(self) -> None:
        """Close subpath"""
        self.curpath.append(("h",))

    def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None:
        """Append rectangle to path"""
        x = cast(float, x)
        y = cast(float, y)
        w = cast(float, w)
        h = cast(float, h)
        self.curpath.append(("m", x, y))
        self.curpath.append(("l", x + w, y))
        self.curpath.append(("l", x + w, y + h))
        self.curpath.append(("l", x, y + h))
        self.curpath.append(("h",))

    def do_S(self) -> None:
        """Stroke path"""
        self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
        self.curpath = []

    def do_s(self) -> None:
        """Close and stroke path"""
        self.do_h()
        self.do_S()

    def do_f(self) -> None:
        """Fill path using nonzero winding number rule"""
        self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
        self.curpath = []

    def do_F(self) -> None:
        """Fill path using nonzero winding number rule (obsolete)

        ``F`` is the obsolete spelling of ``f``, so it must paint and clear
        the current path as well; otherwise the path would leak into the
        next painting operator.
        """
        self.do_f()

    def do_f_a(self) -> None:
        """Fill path using even-odd rule"""
        self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
        self.curpath = []

    def do_B(self) -> None:
        """Fill and stroke path using nonzero winding number rule"""
        self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
        self.curpath = []

    def do_B_a(self) -> None:
        """Fill and stroke path using even-odd rule"""
        self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
        self.curpath = []

    def do_b(self) -> None:
        """Close, fill, and stroke path using nonzero winding number rule"""
        self.do_h()
        self.do_B()

    def do_b_a(self) -> None:
        """Close, fill, and stroke path using even-odd rule"""
        self.do_h()
        self.do_B_a()

    def do_n(self) -> None:
        """End path without filling or stroking"""
        self.curpath = []

    def do_W(self) -> None:
        """Set clipping path using nonzero winding number rule"""

    def do_W_a(self) -> None:
        """Set clipping path using even-odd rule"""

    def do_CS(self, name: PDFStackT) -> None:
        """Set color space for stroking operations

        Introduced in PDF 1.1

        :raises PDFInterpreterError: in strict mode, when ``name`` is not
            in the color-space map; otherwise the request is ignored.
        """
        try:
            self.scs = self.csmap[literal_name(name)]
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError("Undefined ColorSpace: %r" % name)

    def do_cs(self, name: PDFStackT) -> None:
        """Set color space for nonstroking operations

        :raises PDFInterpreterError: in strict mode, when ``name`` is not
            in the color-space map; otherwise the request is ignored.
        """
        try:
            self.ncs = self.csmap[literal_name(name)]
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError("Undefined ColorSpace: %r" % name)

    def do_G(self, gray: PDFStackT) -> None:
        """Set gray level for stroking operations"""
        self.graphicstate.scolor = cast(float, gray)
        self.scs = self.csmap["DeviceGray"]

    def do_g(self, gray: PDFStackT) -> None:
        """Set gray level for nonstroking operations"""
        self.graphicstate.ncolor = cast(float, gray)
        self.ncs = self.csmap["DeviceGray"]

    def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
        """Set RGB color for stroking operations"""
        self.graphicstate.scolor = (cast(float, r), cast(float, g), cast(float, b))
        self.scs = self.csmap["DeviceRGB"]

    def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
        """Set RGB color for nonstroking operations"""
        self.graphicstate.ncolor = (cast(float, r), cast(float, g), cast(float, b))
        self.ncs = self.csmap["DeviceRGB"]

    def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
        """Set CMYK color for stroking operations"""
        self.graphicstate.scolor = (
            cast(float, c),
            cast(float, m),
            cast(float, y),
            cast(float, k),
        )
        self.scs = self.csmap["DeviceCMYK"]

    def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
        """Set CMYK color for nonstroking operations"""
        self.graphicstate.ncolor = (
            cast(float, c),
            cast(float, m),
            cast(float, y),
            cast(float, k),
        )
        self.ncs = self.csmap["DeviceCMYK"]

    def do_SCN(self) -> None:
        """Set color for stroking operations.

        Pops as many operands as the stroking color space has components
        (one when no color space is set in non-strict mode).
        """
        if self.scs:
            n = self.scs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError("No colorspace specified!")
            n = 1
        self.graphicstate.scolor = cast(Color, self.pop(n))

    def do_scn(self) -> None:
        """Set color for nonstroking operations

        Pops as many operands as the nonstroking color space has components
        (one when no color space is set in non-strict mode).
        """
        if self.ncs:
            n = self.ncs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError("No colorspace specified!")
            n = 1
        self.graphicstate.ncolor = cast(Color, self.pop(n))

    def do_SC(self) -> None:
        """Set color for stroking operations"""
        self.do_SCN()

    def do_sc(self) -> None:
        """Set color for nonstroking operations"""
        self.do_scn()

    def do_sh(self, name: object) -> None:
        """Paint area defined by shading pattern"""

    def do_BT(self) -> None:
        """Begin text object

        Initializing the text matrix, Tm, and the text line matrix, Tlm, to
        the identity matrix. Text objects cannot be nested; a second BT cannot
        appear before an ET.
        """
        self.textstate.reset()

    def do_ET(self) -> None:
        """End a text object"""

    def do_BX(self) -> None:
        """Begin compatibility section"""

    def do_EX(self) -> None:
        """End compatibility section"""

    def do_MP(self, tag: PDFStackT) -> None:
        """Define marked-content point"""
        self.device.do_tag(cast(PSLiteral, tag))

    def do_DP(self, tag: PDFStackT, props: PDFStackT) -> None:
        """Define marked-content point with property list"""
        self.device.do_tag(cast(PSLiteral, tag), props)

    def do_BMC(self, tag: PDFStackT) -> None:
        """Begin marked-content sequence"""
        self.device.begin_tag(cast(PSLiteral, tag))

    def do_BDC(self, tag: PDFStackT, props: PDFStackT) -> None:
        """Begin marked-content sequence with property list"""
        self.device.begin_tag(cast(PSLiteral, tag), props)

    def do_EMC(self) -> None:
        """End marked-content sequence"""
        self.device.end_tag()

    def do_Tc(self, space: PDFStackT) -> None:
        """Set character spacing.

        Character spacing is used by the Tj, TJ, and ' operators.

        :param space: a number expressed in unscaled text space units.
        """
        self.textstate.charspace = cast(float, space)

    def do_Tw(self, space: PDFStackT) -> None:
        """Set the word spacing.

        Word spacing is used by the Tj, TJ, and ' operators.

        :param space: a number expressed in unscaled text space units
        """
        self.textstate.wordspace = cast(float, space)

    def do_Tz(self, scale: PDFStackT) -> None:
        """Set the horizontal scaling.

        :param scale: is a number specifying the percentage of the normal width
        """
        self.textstate.scaling = cast(float, scale)

    def do_TL(self, leading: PDFStackT) -> None:
        """Set the text leading.

        Text leading is used only by the T*, ', and " operators.
        The value is stored negated, which is the sign :meth:`do_T_a`
        applies directly.

        :param leading: a number expressed in unscaled text space units
        """
        self.textstate.leading = -cast(float, leading)

    def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None:
        """Set the text font

        :param fontid: the name of a font resource in the Font subdictionary
            of the current resource dictionary
        :param fontsize: size is a number representing a scale factor.
        :raises PDFInterpreterError: in strict mode, when ``fontid`` is not
            in the font map; otherwise a font built from an empty spec is
            used instead.
        """
        try:
            self.textstate.font = self.fontmap[literal_name(fontid)]
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError("Undefined Font id: %r" % fontid)
            self.textstate.font = self.rsrcmgr.get_font(None, {})
        self.textstate.fontsize = cast(float, fontsize)

    def do_Tr(self, render: PDFStackT) -> None:
        """Set the text rendering mode"""
        self.textstate.render = cast(int, render)

    def do_Ts(self, rise: PDFStackT) -> None:
        """Set the text rise

        :param rise: a number expressed in unscaled text space units
        """
        self.textstate.rise = cast(float, rise)

    def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None:
        """Move to the start of the next line

        Offset from the start of the current line by (tx , ty).

        :raises PDFValueError: in strict mode, when either offset cannot be
            converted to a float; otherwise the matrix is left unchanged.
        """
        tx_ = safe_float(tx)
        ty_ = safe_float(ty)
        if tx_ is not None and ty_ is not None:
            (a, b, c, d, e, f) = self.textstate.matrix
            e_new = tx_ * a + ty_ * c + e
            f_new = tx_ * b + ty_ * d + f
            self.textstate.matrix = (a, b, c, d, e_new, f_new)

        elif settings.STRICT:
            raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for Td")

        self.textstate.linematrix = (0, 0)

    def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None:
        """Move to the start of the next line.

        offset from the start of the current line by (tx , ty). As a side effect, this
        operator sets the leading parameter in the text state.

        :raises PDFValueError: in strict mode, when either offset cannot be
            converted to a float; otherwise the matrix is left unchanged.
        """
        tx_ = safe_float(tx)
        ty_ = safe_float(ty)

        if tx_ is not None and ty_ is not None:
            (a, b, c, d, e, f) = self.textstate.matrix
            e_new = tx_ * a + ty_ * c + e
            f_new = tx_ * b + ty_ * d + f
            self.textstate.matrix = (a, b, c, d, e_new, f_new)

        elif settings.STRICT:
            # Must be an f-string, as in do_Td; the plain literal used to
            # emit "{tx}, {ty}" verbatim.
            raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for TD")

        if ty_ is not None:
            # Leading is updated even when tx was invalid.
            self.textstate.leading = ty_

        self.textstate.linematrix = (0, 0)

    def do_Tm(
        self,
        a: PDFStackT,
        b: PDFStackT,
        c: PDFStackT,
        d: PDFStackT,
        e: PDFStackT,
        f: PDFStackT,
    ) -> None:
        """Set text matrix and text line matrix"""
        self.textstate.matrix = cast(Matrix, (a, b, c, d, e, f))
        self.textstate.linematrix = (0, 0)

    def do_T_a(self) -> None:
        """Move to start of next text line"""
        (a, b, c, d, e, f) = self.textstate.matrix
        self.textstate.matrix = (
            a,
            b,
            c,
            d,
            self.textstate.leading * c + e,
            self.textstate.leading * d + f,
        )
        self.textstate.linematrix = (0, 0)

    def do_TJ(self, seq: PDFStackT) -> None:
        """Show text, allowing individual glyph positioning

        :raises PDFInterpreterError: in strict mode, when no font is set;
            otherwise the text is silently skipped.
        """
        if self.textstate.font is None:
            if settings.STRICT:
                raise PDFInterpreterError("No font specified!")
            return
        assert self.ncs is not None
        self.device.render_string(
            self.textstate,
            cast(PDFTextSeq, seq),
            self.ncs,
            self.graphicstate.copy(),
        )

    def do_Tj(self, s: PDFStackT) -> None:
        """Show text"""
        self.do_TJ([s])

    def do__q(self, s: PDFStackT) -> None:
        """Move to next line and show text

        The ' (single quote) operator.
        """
        self.do_T_a()
        self.do_TJ([s])

    def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None:
        """Set word and character spacing, move to next line, and show text

        The " (double quote) operator. It is defined as ``aw Tw ac Tc s '``,
        so the line move is delegated to :meth:`do__q`.
        """
        self.do_Tw(aw)
        self.do_Tc(ac)
        self.do__q(s)

    def do_BI(self) -> None:
        """Begin inline image object"""

    def do_ID(self) -> None:
        """Begin inline image data"""

    def do_EI(self, obj: PDFStackT) -> None:
        """End inline image object

        Renders ``obj`` as a figure when it is a PDFStream that has both
        ``W`` and ``H`` entries; anything else is ignored.
        """
        if isinstance(obj, PDFStream) and "W" in obj and "H" in obj:
            iobjid = str(id(obj))
            self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
            self.device.render_image(iobjid, obj)
            self.device.end_figure(iobjid)

    def do_Do(self, xobjid_arg: PDFStackT) -> None:
        """Invoke named XObject

        Form XObjects with a ``BBox`` are rendered by a duplicate
        interpreter; image XObjects with ``Width`` and ``Height`` are passed
        to the device; everything else is ignored.

        :raises PDFInterpreterError: in strict mode, when the name is not in
            the XObject map; otherwise the operator is ignored.
        """
        xobjid = literal_name(xobjid_arg)
        try:
            xobj = stream_value(self.xobjmap[xobjid])
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
            return
        log.debug("Processing xobj: %r", xobj)
        subtype = xobj.get("Subtype")
        if subtype is LITERAL_FORM and "BBox" in xobj:
            interpreter = self.dup()
            bbox = cast(Rect, list_value(xobj["BBox"]))
            matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
            # According to PDF reference 1.7 section 4.9.1, XObjects in
            # earlier PDFs (prior to v1.2) use the page's Resources entry
            # instead of having their own Resources entry.
            xobjres = xobj.get("Resources")
            if xobjres:
                resources = dict_value(xobjres)
            else:
                resources = self.resources.copy()
            self.device.begin_figure(xobjid, bbox, matrix)
            interpreter.render_contents(
                resources,
                [xobj],
                ctm=mult_matrix(matrix, self.ctm),
            )
            self.device.end_figure(xobjid)
        elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
            self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
            self.device.render_image(xobjid, xobj)
            self.device.end_figure(xobjid)
        else:
            # unsupported xobject type.
            pass

    def process_page(self, page: PDFPage) -> None:
        """Render one page to the device.

        The initial CTM is derived from the page's media box and its
        ``rotate`` value (90, 180 or 270; any other value means no rotation).
        """
        log.debug("Processing page: %r", page)
        (x0, y0, x1, y1) = page.mediabox
        if page.rotate == 90:
            ctm = (0, -1, 1, 0, -y0, x1)
        elif page.rotate == 180:
            ctm = (-1, 0, 0, -1, x1, y1)
        elif page.rotate == 270:
            ctm = (0, 1, -1, 0, y1, -x0)
        else:
            ctm = (1, 0, 0, 1, -x0, -y0)
        self.device.begin_page(page, ctm)
        self.render_contents(page.resources, page.contents, ctm=ctm)
        self.device.end_page(page)

    def render_contents(
        self,
        resources: Dict[object, object],
        streams: Sequence[object],
        ctm: Matrix = MATRIX_IDENTITY,
    ) -> None:
        """Render the content streams.

        This method may be called recursively.
        """
        log.debug(
            "render_contents: resources=%r, streams=%r, ctm=%r",
            resources,
            streams,
            ctm,
        )
        self.init_resources(resources)
        self.init_state(ctm)
        self.execute(list_value(streams))

    def execute(self, streams: Sequence[object]) -> None:
        """Parse ``streams`` and dispatch every operator to its ``do_`` method.

        Non-keyword objects are pushed as operands. An operator whose
        operands are incomplete is skipped.

        :raises PDFInterpreterError: in strict mode, for an operator that
            has no ``do_`` method; otherwise such operators are ignored.
        """
        try:
            parser = PDFContentParser(streams)
        except PSEOF:
            # empty page
            return
        while True:
            try:
                (_, obj) = parser.nextobject()
            except PSEOF:
                break
            if isinstance(obj, PSKeyword):
                name = keyword_name(obj)
                # Operators containing *, " or ' are not valid identifiers,
                # so they are mapped to _a, _w and _q suffixes.
                method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
                    "'",
                    "_q",
                )
                if hasattr(self, method):
                    func = getattr(self, method)
                    # Operand count = positional parameters minus self.
                    nargs = func.__code__.co_argcount - 1
                    if nargs:
                        args = self.pop(nargs)
                        log.debug("exec: %s %r", name, args)
                        if len(args) == nargs:
                            func(*args)
                    else:
                        log.debug("exec: %s", name)
                        func()
                elif settings.STRICT:
                    error_msg = "Unknown operator: %r" % name
                    raise PDFInterpreterError(error_msg)
            else:
                self.push(obj)