Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfinterp.py: 96%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2import re
3from io import BytesIO
4from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
6from pdfminer import settings
7from pdfminer.casting import safe_cmyk, safe_float, safe_int, safe_matrix, safe_rgb
8from pdfminer.cmapdb import CMap, CMapBase, CMapDB
9from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
10from pdfminer.pdfdevice import PDFDevice, PDFTextSeq
11from pdfminer.pdfexceptions import PDFException, PDFValueError
12from pdfminer.pdffont import (
13 PDFCIDFont,
14 PDFFont,
15 PDFFontError,
16 PDFTrueTypeFont,
17 PDFType1Font,
18 PDFType3Font,
19)
20from pdfminer.pdfpage import PDFPage
21from pdfminer.pdftypes import (
22 LITERALS_ASCII85_DECODE,
23 PDFObjRef,
24 PDFStream,
25 dict_value,
26 list_value,
27 resolve1,
28 stream_value,
29)
30from pdfminer.psexceptions import PSEOF, PSTypeError
31from pdfminer.psparser import (
32 KWD,
33 LIT,
34 PSKeyword,
35 PSLiteral,
36 PSStackParser,
37 PSStackType,
38 keyword_name,
39 literal_name,
40)
41from pdfminer.utils import (
42 MATRIX_IDENTITY,
43 Matrix,
44 PathSegment,
45 Point,
46 Rect,
47 choplist,
48 mult_matrix,
49)
51log = logging.getLogger(__name__)
54class PDFResourceError(PDFException):
55 pass
58class PDFInterpreterError(PDFException):
59 pass
62LITERAL_PDF = LIT("PDF")
63LITERAL_TEXT = LIT("Text")
64LITERAL_FONT = LIT("Font")
65LITERAL_FORM = LIT("Form")
66LITERAL_IMAGE = LIT("Image")
69class PDFTextState:
70 matrix: Matrix
71 linematrix: Point
73 def __init__(self) -> None:
74 self.font: Optional[PDFFont] = None
75 self.fontsize: float = 0
76 self.charspace: float = 0
77 self.wordspace: float = 0
78 self.scaling: float = 100
79 self.leading: float = 0
80 self.render: int = 0
81 self.rise: float = 0
82 self.reset()
83 # self.matrix is set
84 # self.linematrix is set
86 def __repr__(self) -> str:
87 return (
88 "<PDFTextState: font=%r, fontsize=%r, charspace=%r, "
89 "wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, "
90 "matrix=%r, linematrix=%r>"
91 % (
92 self.font,
93 self.fontsize,
94 self.charspace,
95 self.wordspace,
96 self.scaling,
97 self.leading,
98 self.render,
99 self.rise,
100 self.matrix,
101 self.linematrix,
102 )
103 )
105 def copy(self) -> "PDFTextState":
106 obj = PDFTextState()
107 obj.font = self.font
108 obj.fontsize = self.fontsize
109 obj.charspace = self.charspace
110 obj.wordspace = self.wordspace
111 obj.scaling = self.scaling
112 obj.leading = self.leading
113 obj.render = self.render
114 obj.rise = self.rise
115 obj.matrix = self.matrix
116 obj.linematrix = self.linematrix
117 return obj
119 def reset(self) -> None:
120 self.matrix = MATRIX_IDENTITY
121 self.linematrix = (0, 0)
124Color = Union[
125 float, # Greyscale
126 Tuple[float, float, float], # R, G, B
127 Tuple[float, float, float, float], # C, M, Y, K
128]
131class PDFGraphicState:
132 def __init__(self) -> None:
133 self.linewidth: float = 0
134 self.linecap: Optional[object] = None
135 self.linejoin: Optional[object] = None
136 self.miterlimit: Optional[object] = None
137 self.dash: Optional[Tuple[object, object]] = None
138 self.intent: Optional[object] = None
139 self.flatness: Optional[object] = None
141 # stroking color
142 self.scolor: Color = 0
143 self.scs: PDFColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]
145 # non stroking color
146 self.ncolor: Color = 0
147 self.ncs: PDFColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]
149 def copy(self) -> "PDFGraphicState":
150 obj = PDFGraphicState()
151 obj.linewidth = self.linewidth
152 obj.linecap = self.linecap
153 obj.linejoin = self.linejoin
154 obj.miterlimit = self.miterlimit
155 obj.dash = self.dash
156 obj.intent = self.intent
157 obj.flatness = self.flatness
158 obj.scolor = self.scolor
159 obj.scs = self.scs
160 obj.ncolor = self.ncolor
161 obj.ncs = self.ncs
162 return obj
164 def __repr__(self) -> str:
165 return (
166 "<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, "
167 " miterlimit=%r, dash=%r, intent=%r, flatness=%r, "
168 " stroking color=%r, non stroking color=%r>"
169 % (
170 self.linewidth,
171 self.linecap,
172 self.linejoin,
173 self.miterlimit,
174 self.dash,
175 self.intent,
176 self.flatness,
177 self.scolor,
178 self.ncolor,
179 )
180 )
183class PDFResourceManager:
184 """Repository of shared resources.
186 ResourceManager facilitates reuse of shared resources
187 such as fonts and images so that large objects are not
188 allocated multiple times.
189 """
191 def __init__(self, caching: bool = True) -> None:
192 self.caching = caching
193 self._cached_fonts: Dict[object, PDFFont] = {}
195 def get_procset(self, procs: Sequence[object]) -> None:
196 for proc in procs:
197 if proc is LITERAL_PDF or proc is LITERAL_TEXT:
198 pass
199 else:
200 pass
202 def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase:
203 try:
204 return CMapDB.get_cmap(cmapname)
205 except CMapDB.CMapNotFound:
206 if strict:
207 raise
208 return CMap()
210 def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
211 if objid and objid in self._cached_fonts:
212 font = self._cached_fonts[objid]
213 else:
214 log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
215 if settings.STRICT:
216 if spec["Type"] is not LITERAL_FONT:
217 raise PDFFontError("Type is not /Font")
218 # Create a Font object.
219 if "Subtype" in spec:
220 subtype = literal_name(spec["Subtype"])
221 else:
222 if settings.STRICT:
223 raise PDFFontError("Font Subtype is not specified.")
224 subtype = "Type1"
225 if subtype in ("Type1", "MMType1"):
226 # Type1 Font
227 font = PDFType1Font(self, spec)
228 elif subtype == "TrueType":
229 # TrueType Font
230 font = PDFTrueTypeFont(self, spec)
231 elif subtype == "Type3":
232 # Type3 Font
233 font = PDFType3Font(self, spec)
234 elif subtype in ("CIDFontType0", "CIDFontType2"):
235 # CID Font
236 font = PDFCIDFont(self, spec)
237 elif subtype == "Type0":
238 # Type0 Font
239 dfonts = list_value(spec["DescendantFonts"])
240 assert dfonts
241 subspec = dict_value(dfonts[0]).copy()
242 for k in ("Encoding", "ToUnicode"):
243 if k in spec:
244 subspec[k] = resolve1(spec[k])
245 font = self.get_font(None, subspec)
246 else:
247 if settings.STRICT:
248 raise PDFFontError("Invalid Font spec: %r" % spec)
249 font = PDFType1Font(self, spec) # this is so wrong!
250 if objid and self.caching:
251 self._cached_fonts[objid] = font
252 return font
255class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
256 def __init__(self, streams: Sequence[object]) -> None:
257 self.streams = streams
258 self.istream = 0
259 # PSStackParser.__init__(fp=None) is safe only because we've overloaded
260 # all the methods that would attempt to access self.fp without first
261 # calling self.fillfp().
262 PSStackParser.__init__(self, None) # type: ignore[arg-type]
264 def fillfp(self) -> None:
265 if not self.fp:
266 if self.istream < len(self.streams):
267 strm = stream_value(self.streams[self.istream])
268 self.istream += 1
269 else:
270 raise PSEOF("Unexpected EOF, file truncated?")
271 self.fp = BytesIO(strm.get_data())
273 def seek(self, pos: int) -> None:
274 self.fillfp()
275 PSStackParser.seek(self, pos)
277 def fillbuf(self) -> None:
278 if self.charpos < len(self.buf):
279 return
280 while 1:
281 self.fillfp()
282 self.bufpos = self.fp.tell()
283 self.buf = self.fp.read(self.BUFSIZ)
284 if self.buf:
285 break
286 self.fp = None # type: ignore[assignment]
287 self.charpos = 0
289 def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]:
290 self.seek(pos)
291 i = 0
292 data = b""
293 while i <= len(target):
294 self.fillbuf()
295 if i:
296 ci = self.buf[self.charpos]
297 c = bytes((ci,))
298 data += c
299 self.charpos += 1
300 if (
301 len(target) <= i
302 and c.isspace()
303 or i < len(target)
304 and c == (bytes((target[i],)))
305 ):
306 i += 1
307 else:
308 i = 0
309 else:
310 try:
311 j = self.buf.index(target[0], self.charpos)
312 data += self.buf[self.charpos : j + 1]
313 self.charpos = j + 1
314 i = 1
315 except ValueError:
316 data += self.buf[self.charpos :]
317 self.charpos = len(self.buf)
318 data = data[: -(len(target) + 1)] # strip the last part
319 data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
320 return (pos, data)
322 def flush(self) -> None:
323 self.add_results(*self.popall())
325 KEYWORD_BI = KWD(b"BI")
326 KEYWORD_ID = KWD(b"ID")
327 KEYWORD_EI = KWD(b"EI")
329 def do_keyword(self, pos: int, token: PSKeyword) -> None:
330 if token is self.KEYWORD_BI:
331 # inline image within a content stream
332 self.start_type(pos, "inline")
333 elif token is self.KEYWORD_ID:
334 try:
335 (_, objs) = self.end_type("inline")
336 if len(objs) % 2 != 0:
337 error_msg = f"Invalid dictionary construct: {objs!r}"
338 raise PSTypeError(error_msg)
339 d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
340 eos = b"EI"
341 filter = d.get("F", None)
342 if filter is not None:
343 if isinstance(filter, PSLiteral):
344 filter = [filter]
345 if filter[0] in LITERALS_ASCII85_DECODE:
346 eos = b"~>"
347 (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
348 if eos != b"EI": # it may be necessary for decoding
349 data += eos
350 obj = PDFStream(d, data)
351 self.push((pos, obj))
352 if eos == b"EI": # otherwise it is still in the stream
353 self.push((pos, self.KEYWORD_EI))
354 except PSTypeError:
355 if settings.STRICT:
356 raise
357 else:
358 self.push((pos, token))
361PDFStackT = PSStackType[PDFStream]
362"""Types that may appear on the PDF argument stack."""
365class PDFPageInterpreter:
366 """Processor for the content of a PDF page
368 Reference: PDF Reference, Appendix A, Operator Summary
369 """
371 def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice) -> None:
372 self.rsrcmgr = rsrcmgr
373 self.device = device
375 def dup(self) -> "PDFPageInterpreter":
376 return self.__class__(self.rsrcmgr, self.device)
378 def init_resources(self, resources: Dict[object, object]) -> None:
379 """Prepare the fonts and XObjects listed in the Resource attribute."""
380 self.resources = resources
381 self.fontmap: Dict[object, PDFFont] = {}
382 self.xobjmap = {}
383 self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
384 if not resources:
385 return
387 def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
388 if isinstance(spec, list):
389 name = literal_name(spec[0])
390 else:
391 name = literal_name(spec)
392 if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2:
393 return PDFColorSpace(name, stream_value(spec[1])["N"])
394 elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2:
395 return PDFColorSpace(name, len(list_value(spec[1])))
396 else:
397 return PREDEFINED_COLORSPACE.get(name)
399 for k, v in dict_value(resources).items():
400 log.debug("Resource: %r: %r", k, v)
401 if k == "Font":
402 for fontid, spec in dict_value(v).items():
403 objid = None
404 if isinstance(spec, PDFObjRef):
405 objid = spec.objid
406 spec = dict_value(spec)
407 self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
408 elif k == "ColorSpace":
409 for csid, spec in dict_value(v).items():
410 colorspace = get_colorspace(resolve1(spec))
411 if colorspace is not None:
412 self.csmap[csid] = colorspace
413 elif k == "ProcSet":
414 self.rsrcmgr.get_procset(list_value(v))
415 elif k == "XObject":
416 for xobjid, xobjstrm in dict_value(v).items():
417 self.xobjmap[xobjid] = xobjstrm
419 def init_state(self, ctm: Matrix) -> None:
420 """Initialize the text and graphic states for rendering a page."""
421 # gstack: stack for graphical states.
422 self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = []
423 self.ctm = ctm
424 self.device.set_ctm(self.ctm)
425 self.textstate = PDFTextState()
426 self.graphicstate = PDFGraphicState()
427 self.curpath: List[PathSegment] = []
428 # argstack: stack for command arguments.
429 self.argstack: List[PDFStackT] = []
431 def push(self, obj: PDFStackT) -> None:
432 self.argstack.append(obj)
434 def pop(self, n: int) -> List[PDFStackT]:
435 if n == 0:
436 return []
437 x = self.argstack[-n:]
438 self.argstack = self.argstack[:-n]
439 return x
441 def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
442 return (self.ctm, self.textstate.copy(), self.graphicstate.copy())
444 def set_current_state(
445 self,
446 state: Tuple[Matrix, PDFTextState, PDFGraphicState],
447 ) -> None:
448 (self.ctm, self.textstate, self.graphicstate) = state
449 self.device.set_ctm(self.ctm)
451 def do_q(self) -> None:
452 """Save graphics state"""
453 self.gstack.append(self.get_current_state())
455 def do_Q(self) -> None:
456 """Restore graphics state"""
457 if self.gstack:
458 self.set_current_state(self.gstack.pop())
460 def do_cm(
461 self,
462 a1: PDFStackT,
463 b1: PDFStackT,
464 c1: PDFStackT,
465 d1: PDFStackT,
466 e1: PDFStackT,
467 f1: PDFStackT,
468 ) -> None:
469 """Concatenate matrix to current transformation matrix"""
470 matrix = safe_matrix(a1, b1, c1, d1, e1, f1)
472 if matrix is None:
473 log.warning(
474 f"Cannot concatenate matrix to current transformation matrix because not all values in {(a1, b1, c1, d1, e1, f1)!r} can be parsed as floats"
475 )
476 else:
477 self.ctm = mult_matrix(matrix, self.ctm)
478 self.device.set_ctm(self.ctm)
480 def do_w(self, linewidth: PDFStackT) -> None:
481 """Set line width"""
482 linewidth_f = safe_float(linewidth)
483 if linewidth_f is None:
484 log.warning(
485 f"Cannot set line width because {linewidth!r} is an invalid float value"
486 )
487 else:
488 self.graphicstate.linewidth = linewidth_f
490 def do_J(self, linecap: PDFStackT) -> None:
491 """Set line cap style"""
492 self.graphicstate.linecap = linecap
494 def do_j(self, linejoin: PDFStackT) -> None:
495 """Set line join style"""
496 self.graphicstate.linejoin = linejoin
498 def do_M(self, miterlimit: PDFStackT) -> None:
499 """Set miter limit"""
500 self.graphicstate.miterlimit = miterlimit
502 def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None:
503 """Set line dash pattern"""
504 self.graphicstate.dash = (dash, phase)
506 def do_ri(self, intent: PDFStackT) -> None:
507 """Set color rendering intent"""
508 self.graphicstate.intent = intent
510 def do_i(self, flatness: PDFStackT) -> None:
511 """Set flatness tolerance"""
512 self.graphicstate.flatness = flatness
514 def do_gs(self, name: PDFStackT) -> None:
515 """Set parameters from graphics state parameter dictionary"""
516 # TODO
518 def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
519 """Begin new subpath"""
520 x_f = safe_float(x)
521 y_f = safe_float(y)
523 if x_f is None or y_f is None:
524 point = ("m", x, y)
525 log.warning(
526 f"Cannot start new subpath because not all values in {point!r} can be parsed as floats"
527 )
528 else:
529 point = ("m", x_f, y_f)
530 self.curpath.append(point)
532 def do_l(self, x: PDFStackT, y: PDFStackT) -> None:
533 """Append straight line segment to path"""
534 x_f = safe_float(x)
535 y_f = safe_float(y)
536 if x_f is None or y_f is None:
537 point = ("l", x, y)
538 log.warning(
539 f"Cannot append straight line segment to path because not all values in {point!r} can be parsed as floats"
540 )
541 else:
542 point = ("l", x_f, y_f)
543 self.curpath.append(point)
545 def do_c(
546 self,
547 x1: PDFStackT,
548 y1: PDFStackT,
549 x2: PDFStackT,
550 y2: PDFStackT,
551 x3: PDFStackT,
552 y3: PDFStackT,
553 ) -> None:
554 """Append curved segment to path (three control points)"""
555 x1_f = safe_float(x1)
556 y1_f = safe_float(y1)
557 x2_f = safe_float(x2)
558 y2_f = safe_float(y2)
559 x3_f = safe_float(x3)
560 y3_f = safe_float(y3)
561 if (
562 x1_f is None
563 or y1_f is None
564 or x2_f is None
565 or y2_f is None
566 or x3_f is None
567 or y3_f is None
568 ):
569 point = ("c", x1, y1, x2, y2, x3, y3)
570 log.warning(
571 f"Cannot append curved segment to path because not all values in {point!r} can be parsed as floats"
572 )
573 else:
574 point = ("c", x1_f, y1_f, x2_f, y2_f, x3_f, y3_f)
575 self.curpath.append(point)
577 def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
578 """Append curved segment to path (initial point replicated)"""
579 x2_f = safe_float(x2)
580 y2_f = safe_float(y2)
581 x3_f = safe_float(x3)
582 y3_f = safe_float(y3)
583 if x2_f is None or y2_f is None or x3_f is None or y3_f is None:
584 point = ("v", x2, y2, x3, y3)
585 log.warning(
586 f"Cannot append curved segment to path because not all values in {point!r} can be parsed as floats"
587 )
588 else:
589 point = ("v", x2_f, y2_f, x3_f, y3_f)
590 self.curpath.append(point)
592 def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
593 """Append curved segment to path (final point replicated)"""
594 x1_f = safe_float(x1)
595 y1_f = safe_float(y1)
596 x3_f = safe_float(x3)
597 y3_f = safe_float(y3)
598 if x1_f is None or y1_f is None or x3_f is None or y3_f is None:
599 point = ("y", x1, y1, x3, y3)
600 log.warning(
601 f"Cannot append curved segment to path because not all values in {point!r} can be parsed as floats"
602 )
603 else:
604 point = ("y", x1_f, y1_f, x3_f, y3_f)
605 self.curpath.append(point)
607 def do_h(self) -> None:
608 """Close subpath"""
609 self.curpath.append(("h",))
611 def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None:
612 """Append rectangle to path"""
613 x_f = safe_float(x)
614 y_f = safe_float(y)
615 w_f = safe_float(w)
616 h_f = safe_float(h)
618 if x_f is None or y_f is None or w_f is None or h_f is None:
619 values = (x, y, w, h)
620 log.warning(
621 f"Cannot append rectangle to path because not all values in {values!r} can be parsed as floats"
622 )
623 else:
624 self.curpath.append(("m", x_f, y_f))
625 self.curpath.append(("l", x_f + w_f, y_f))
626 self.curpath.append(("l", x_f + w_f, y_f + h_f))
627 self.curpath.append(("l", x_f, y_f + h_f))
628 self.curpath.append(("h",))
630 def do_S(self) -> None:
631 """Stroke path"""
632 self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
633 self.curpath = []
635 def do_s(self) -> None:
636 """Close and stroke path"""
637 self.do_h()
638 self.do_S()
640 def do_f(self) -> None:
641 """Fill path using nonzero winding number rule"""
642 self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
643 self.curpath = []
645 def do_F(self) -> None:
646 """Fill path using nonzero winding number rule (obsolete)"""
648 def do_f_a(self) -> None:
649 """Fill path using even-odd rule"""
650 self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
651 self.curpath = []
653 def do_B(self) -> None:
654 """Fill and stroke path using nonzero winding number rule"""
655 self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
656 self.curpath = []
658 def do_B_a(self) -> None:
659 """Fill and stroke path using even-odd rule"""
660 self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
661 self.curpath = []
663 def do_b(self) -> None:
664 """Close, fill, and stroke path using nonzero winding number rule"""
665 self.do_h()
666 self.do_B()
668 def do_b_a(self) -> None:
669 """Close, fill, and stroke path using even-odd rule"""
670 self.do_h()
671 self.do_B_a()
673 def do_n(self) -> None:
674 """End path without filling or stroking"""
675 self.curpath = []
677 def do_W(self) -> None:
678 """Set clipping path using nonzero winding number rule"""
680 def do_W_a(self) -> None:
681 """Set clipping path using even-odd rule"""
683 def do_CS(self, name: PDFStackT) -> None:
684 """Set color space for stroking operations
686 Introduced in PDF 1.1
687 """
688 try:
689 self.graphicstate.scs = self.csmap[literal_name(name)]
690 except KeyError:
691 if settings.STRICT:
692 raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
694 def do_cs(self, name: PDFStackT) -> None:
695 """Set color space for nonstroking operations"""
696 try:
697 self.graphicstate.ncs = self.csmap[literal_name(name)]
698 except KeyError:
699 if settings.STRICT:
700 raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
702 def do_G(self, gray: PDFStackT) -> None:
703 """Set gray level for stroking operations"""
704 gray_f = safe_float(gray)
706 if gray_f is None:
707 log.warning(
708 f"Cannot set gray level because {gray!r} is an invalid float value"
709 )
710 else:
711 self.graphicstate.scolor = gray_f
712 self.graphicstate.scs = self.csmap["DeviceGray"]
714 def do_g(self, gray: PDFStackT) -> None:
715 """Set gray level for nonstroking operations"""
716 gray_f = safe_float(gray)
718 if gray_f is None:
719 log.warning(
720 f"Cannot set gray level because {gray!r} is an invalid float value"
721 )
722 else:
723 self.graphicstate.ncolor = gray_f
724 self.graphicstate.ncs = self.csmap["DeviceGray"]
726 def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
727 """Set RGB color for stroking operations"""
728 rgb = safe_rgb(r, g, b)
730 if rgb is None:
731 log.warning(
732 f"Cannot set RGB stroke color because not all values in {(r, g, b)!r} can be parsed as floats"
733 )
734 else:
735 self.graphicstate.scolor = rgb
736 self.graphicstate.scs = self.csmap["DeviceRGB"]
738 def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
739 """Set RGB color for nonstroking operations"""
740 rgb = safe_rgb(r, g, b)
742 if rgb is None:
743 log.warning(
744 f"Cannot set RGB non-stroke color because not all values in {(r, g, b)!r} can be parsed as floats"
745 )
746 else:
747 self.graphicstate.ncolor = rgb
748 self.graphicstate.ncs = self.csmap["DeviceRGB"]
750 def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
751 """Set CMYK color for stroking operations"""
752 cmyk = safe_cmyk(c, m, y, k)
754 if cmyk is None:
755 log.warning(
756 f"Cannot set CMYK stroke color because not all values in {(c, m, y, k)!r} can be parsed as floats"
757 )
758 else:
759 self.graphicstate.scolor = cmyk
760 self.graphicstate.scs = self.csmap["DeviceCMYK"]
762 def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
763 """Set CMYK color for nonstroking operations"""
764 cmyk = safe_cmyk(c, m, y, k)
766 if cmyk is None:
767 log.warning(
768 f"Cannot set CMYK non-stroke color because not all values in {(c, m, y, k)!r} can be parsed as floats"
769 )
770 else:
771 self.graphicstate.ncolor = cmyk
772 self.graphicstate.ncs = self.csmap["DeviceCMYK"]
774 def do_SCN(self) -> None:
775 """Set color for stroking operations."""
776 n = self.graphicstate.scs.ncomponents
778 components = self.pop(n)
779 if len(components) != n:
780 log.warning(
781 f"Cannot set stroke color because expected {n} components but got {components!r}"
782 )
784 elif len(components) == 1:
785 gray = components[0]
786 gray_f = safe_float(gray)
787 if gray_f is None:
788 log.warning(
789 f"Cannot set gray stroke color because {gray!r} is an invalid float value"
790 )
791 else:
792 self.graphicstate.scolor = gray_f
794 elif len(components) == 3:
795 rgb = safe_rgb(*components)
797 if rgb is None:
798 log.warning(
799 f"Cannot set RGB stroke color because components {components!r} cannot be parsed as RGB"
800 )
801 else:
802 self.graphicstate.scolor = rgb
804 elif len(components) == 4:
805 cmyk = safe_cmyk(*components)
807 if cmyk is None:
808 log.warning(
809 f"Cannot set CMYK stroke color because components {components!r} cannot be parsed as CMYK"
810 )
811 else:
812 self.graphicstate.scolor = cmyk
814 else:
815 log.warning(
816 f"Cannot set stroke color because {len(components)} components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported"
817 )
819 def do_scn(self) -> None:
820 """Set color for nonstroking operations"""
821 n = self.graphicstate.ncs.ncomponents
823 components = self.pop(n)
824 if len(components) != n:
825 log.warning(
826 f"Cannot set non-stroke color because expected {n} components but got {components!r}"
827 )
829 elif len(components) == 1:
830 gray = components[0]
831 gray_f = safe_float(gray)
832 if gray_f is None:
833 log.warning(
834 f"Cannot set gray non-stroke color because {gray!r} is an invalid float value"
835 )
836 else:
837 self.graphicstate.ncolor = gray_f
839 elif len(components) == 3:
840 rgb = safe_rgb(*components)
842 if rgb is None:
843 log.warning(
844 f"Cannot set RGB non-stroke color because components {components!r} cannot be parsed as RGB"
845 )
846 else:
847 self.graphicstate.ncolor = rgb
849 elif len(components) == 4:
850 cmyk = safe_cmyk(*components)
852 if cmyk is None:
853 log.warning(
854 f"Cannot set CMYK non-stroke color because components {components!r} cannot be parsed as CMYK"
855 )
856 else:
857 self.graphicstate.ncolor = cmyk
859 else:
860 log.warning(
861 f"Cannot set non-stroke color because {len(components)} components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported"
862 )
864 def do_SC(self) -> None:
865 """Set color for stroking operations"""
866 self.do_SCN()
868 def do_sc(self) -> None:
869 """Set color for nonstroking operations"""
870 self.do_scn()
872 def do_sh(self, name: object) -> None:
873 """Paint area defined by shading pattern"""
875 def do_BT(self) -> None:
876 """Begin text object
878 Initializing the text matrix, Tm, and the text line matrix, Tlm, to
879 the identity matrix. Text objects cannot be nested; a second BT cannot
880 appear before an ET.
881 """
882 self.textstate.reset()
884 def do_ET(self) -> None:
885 """End a text object"""
887 def do_BX(self) -> None:
888 """Begin compatibility section"""
890 def do_EX(self) -> None:
891 """End compatibility section"""
893 def do_MP(self, tag: PDFStackT) -> None:
894 """Define marked-content point"""
895 if isinstance(tag, PSLiteral):
896 self.device.do_tag(tag)
897 else:
898 log.warning(
899 f"Cannot define marked-content point because {tag!r} is not a PSLiteral"
900 )
902 def do_DP(self, tag: PDFStackT, props: PDFStackT) -> None:
903 """Define marked-content point with property list"""
904 if isinstance(tag, PSLiteral):
905 self.device.do_tag(tag, props)
906 else:
907 log.warning(
908 f"Cannot define marked-content point with property list because {tag!r} is not a PSLiteral"
909 )
911 def do_BMC(self, tag: PDFStackT) -> None:
912 """Begin marked-content sequence"""
913 if isinstance(tag, PSLiteral):
914 self.device.begin_tag(tag)
915 else:
916 log.warning(
917 f"Cannot begin marked-content sequence because {tag!r} is not a PSLiteral"
918 )
920 def do_BDC(self, tag: PDFStackT, props: PDFStackT) -> None:
921 """Begin marked-content sequence with property list"""
922 if isinstance(tag, PSLiteral):
923 self.device.begin_tag(tag, props)
924 else:
925 log.warning(
926 f"Cannot begin marked-content sequence with property list because {tag!r} is not a PSLiteral"
927 )
929 def do_EMC(self) -> None:
930 """End marked-content sequence"""
931 self.device.end_tag()
933 def do_Tc(self, space: PDFStackT) -> None:
934 """Set character spacing.
936 Character spacing is used by the Tj, TJ, and ' operators.
938 :param space: a number expressed in unscaled text space units.
939 """
940 charspace = safe_float(space)
941 if charspace is None:
942 log.warning(
943 f"Could not set character spacing because {space!r} is an invalid float value"
944 )
945 else:
946 self.textstate.charspace = charspace
948 def do_Tw(self, space: PDFStackT) -> None:
949 """Set the word spacing.
951 Word spacing is used by the Tj, TJ, and ' operators.
953 :param space: a number expressed in unscaled text space units
954 """
955 wordspace = safe_float(space)
956 if wordspace is None:
957 log.warning(
958 f"Could not set word spacing becuase {space!r} is an invalid float value"
959 )
960 else:
961 self.textstate.wordspace = wordspace
963 def do_Tz(self, scale: PDFStackT) -> None:
964 """Set the horizontal scaling.
966 :param scale: is a number specifying the percentage of the normal width
967 """
968 scale_f = safe_float(scale)
970 if scale_f is None:
971 log.warning(
972 f"Could not set horizontal scaling because {scale!r} is an invalid float value"
973 )
974 else:
975 self.textstate.scaling = scale_f
977 def do_TL(self, leading: PDFStackT) -> None:
978 """Set the text leading.
980 Text leading is used only by the T*, ', and " operators.
982 :param leading: a number expressed in unscaled text space units
983 """
984 leading_f = safe_float(leading)
985 if leading_f is None:
986 log.warning(
987 f"Could not set text leading because {leading!r} is an invalid float value"
988 )
989 else:
990 self.textstate.leading = -leading_f
992 def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None:
993 """Set the text font
995 :param fontid: the name of a font resource in the Font subdictionary
996 of the current resource dictionary
997 :param fontsize: size is a number representing a scale factor.
998 """
999 try:
1000 self.textstate.font = self.fontmap[literal_name(fontid)]
1001 except KeyError:
1002 if settings.STRICT:
1003 raise PDFInterpreterError("Undefined Font id: %r" % fontid)
1004 self.textstate.font = self.rsrcmgr.get_font(None, {})
1006 fontsize_f = safe_float(fontsize)
1007 if fontsize_f is None:
1008 log.warning(
1009 f"Could not set text font because {fontsize!r} is an invalid float value"
1010 )
1011 else:
1012 self.textstate.fontsize = fontsize_f
1014 def do_Tr(self, render: PDFStackT) -> None:
1015 """Set the text rendering mode"""
1016 render_i = safe_int(render)
1018 if render_i is None:
1019 log.warning(
1020 f"Could not set text rendering mode because {render!r} is an invalid int value"
1021 )
1022 else:
1023 self.textstate.render = render_i
1025 def do_Ts(self, rise: PDFStackT) -> None:
1026 """Set the text rise
1028 :param rise: a number expressed in unscaled text space units
1029 """
1030 rise_f = safe_float(rise)
1032 if rise_f is None:
1033 log.warning(
1034 f"Could not set text rise because {rise!r} is an invalid float value"
1035 )
1036 else:
1037 self.textstate.rise = rise_f
1039 def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None:
1040 """Move to the start of the next line
1042 Offset from the start of the current line by (tx , ty).
1043 """
1044 tx_ = safe_float(tx)
1045 ty_ = safe_float(ty)
1046 if tx_ is not None and ty_ is not None:
1047 (a, b, c, d, e, f) = self.textstate.matrix
1048 e_new = tx_ * a + ty_ * c + e
1049 f_new = tx_ * b + ty_ * d + f
1050 self.textstate.matrix = (a, b, c, d, e_new, f_new)
1052 elif settings.STRICT:
1053 raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for Td")
1055 self.textstate.linematrix = (0, 0)
1057 def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None:
1058 """Move to the start of the next line.
1060 offset from the start of the current line by (tx , ty). As a side effect, this
1061 operator sets the leading parameter in the text state.
1062 """
1063 tx_ = safe_float(tx)
1064 ty_ = safe_float(ty)
1066 if tx_ is not None and ty_ is not None:
1067 (a, b, c, d, e, f) = self.textstate.matrix
1068 e_new = tx_ * a + ty_ * c + e
1069 f_new = tx_ * b + ty_ * d + f
1070 self.textstate.matrix = (a, b, c, d, e_new, f_new)
1072 elif settings.STRICT:
1073 raise PDFValueError("Invalid offset ({tx}, {ty}) for TD")
1075 if ty_ is not None:
1076 self.textstate.leading = ty_
1078 self.textstate.linematrix = (0, 0)
1080 def do_Tm(
1081 self,
1082 a: PDFStackT,
1083 b: PDFStackT,
1084 c: PDFStackT,
1085 d: PDFStackT,
1086 e: PDFStackT,
1087 f: PDFStackT,
1088 ) -> None:
1089 """Set text matrix and text line matrix"""
1090 values = (a, b, c, d, e, f)
1091 matrix = safe_matrix(*values)
1093 if matrix is None:
1094 log.warning(
1095 f"Could not set text matrix because not all values in {values!r} can be parsed as floats"
1096 )
1097 else:
1098 self.textstate.matrix = matrix
1099 self.textstate.linematrix = (0, 0)
1101 def do_T_a(self) -> None:
1102 """Move to start of next text line"""
1103 (a, b, c, d, e, f) = self.textstate.matrix
1104 self.textstate.matrix = (
1105 a,
1106 b,
1107 c,
1108 d,
1109 self.textstate.leading * c + e,
1110 self.textstate.leading * d + f,
1111 )
1112 self.textstate.linematrix = (0, 0)
1114 def do_TJ(self, seq: PDFStackT) -> None:
1115 """Show text, allowing individual glyph positioning"""
1116 if self.textstate.font is None:
1117 if settings.STRICT:
1118 raise PDFInterpreterError("No font specified!")
1119 return
1120 self.device.render_string(
1121 self.textstate,
1122 cast(PDFTextSeq, seq),
1123 self.graphicstate.ncs,
1124 self.graphicstate.copy(),
1125 )
1127 def do_Tj(self, s: PDFStackT) -> None:
1128 """Show text"""
1129 self.do_TJ([s])
1131 def do__q(self, s: PDFStackT) -> None:
1132 """Move to next line and show text
1134 The ' (single quote) operator.
1135 """
1136 self.do_T_a()
1137 self.do_TJ([s])
1139 def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None:
1140 """Set word and character spacing, move to next line, and show text
1142 The " (double quote) operator.
1143 """
1144 self.do_Tw(aw)
1145 self.do_Tc(ac)
1146 self.do_TJ([s])
1148 def do_BI(self) -> None:
1149 """Begin inline image object"""
1151 def do_ID(self) -> None:
1152 """Begin inline image data"""
1154 def do_EI(self, obj: PDFStackT) -> None:
1155 """End inline image object"""
1156 if isinstance(obj, PDFStream) and "W" in obj and "H" in obj:
1157 iobjid = str(id(obj))
1158 self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
1159 self.device.render_image(iobjid, obj)
1160 self.device.end_figure(iobjid)
1162 def do_Do(self, xobjid_arg: PDFStackT) -> None:
1163 """Invoke named XObject"""
1164 xobjid = literal_name(xobjid_arg)
1165 try:
1166 xobj = stream_value(self.xobjmap[xobjid])
1167 except KeyError:
1168 if settings.STRICT:
1169 raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
1170 return
1171 log.debug("Processing xobj: %r", xobj)
1172 subtype = xobj.get("Subtype")
1173 if subtype is LITERAL_FORM and "BBox" in xobj:
1174 interpreter = self.dup()
1175 bbox = cast(Rect, list_value(xobj["BBox"]))
1176 matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
1177 # According to PDF reference 1.7 section 4.9.1, XObjects in
1178 # earlier PDFs (prior to v1.2) use the page's Resources entry
1179 # instead of having their own Resources entry.
1180 xobjres = xobj.get("Resources")
1181 if xobjres:
1182 resources = dict_value(xobjres)
1183 else:
1184 resources = self.resources.copy()
1185 self.device.begin_figure(xobjid, bbox, matrix)
1186 interpreter.render_contents(
1187 resources,
1188 [xobj],
1189 ctm=mult_matrix(matrix, self.ctm),
1190 )
1191 self.device.end_figure(xobjid)
1192 elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
1193 self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
1194 self.device.render_image(xobjid, xobj)
1195 self.device.end_figure(xobjid)
1196 else:
1197 # unsupported xobject type.
1198 pass
1200 def process_page(self, page: PDFPage) -> None:
1201 log.debug("Processing page: %r", page)
1202 (x0, y0, x1, y1) = page.mediabox
1203 if page.rotate == 90:
1204 ctm = (0, -1, 1, 0, -y0, x1)
1205 elif page.rotate == 180:
1206 ctm = (-1, 0, 0, -1, x1, y1)
1207 elif page.rotate == 270:
1208 ctm = (0, 1, -1, 0, y1, -x0)
1209 else:
1210 ctm = (1, 0, 0, 1, -x0, -y0)
1211 self.device.begin_page(page, ctm)
1212 self.render_contents(page.resources, page.contents, ctm=ctm)
1213 self.device.end_page(page)
1215 def render_contents(
1216 self,
1217 resources: Dict[object, object],
1218 streams: Sequence[object],
1219 ctm: Matrix = MATRIX_IDENTITY,
1220 ) -> None:
1221 """Render the content streams.
1223 This method may be called recursively.
1224 """
1225 log.debug(
1226 "render_contents: resources=%r, streams=%r, ctm=%r",
1227 resources,
1228 streams,
1229 ctm,
1230 )
1231 self.init_resources(resources)
1232 self.init_state(ctm)
1233 self.execute(list_value(streams))
1235 def execute(self, streams: Sequence[object]) -> None:
1236 try:
1237 parser = PDFContentParser(streams)
1238 except PSEOF:
1239 # empty page
1240 return
1241 while True:
1242 try:
1243 (_, obj) = parser.nextobject()
1244 except PSEOF:
1245 break
1246 if isinstance(obj, PSKeyword):
1247 name = keyword_name(obj)
1248 method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
1249 "'",
1250 "_q",
1251 )
1252 if hasattr(self, method):
1253 func = getattr(self, method)
1254 nargs = func.__code__.co_argcount - 1
1255 if nargs:
1256 args = self.pop(nargs)
1257 log.debug("exec: %s %r", name, args)
1258 if len(args) == nargs:
1259 func(*args)
1260 else:
1261 log.debug("exec: %s", name)
1262 func()
1263 elif settings.STRICT:
1264 error_msg = "Unknown operator: %r" % name
1265 raise PDFInterpreterError(error_msg)
1266 else:
1267 self.push(obj)