Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfinterp.py: 95%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2import re
3from io import BytesIO
4from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
6from pdfminer import settings
7from pdfminer.casting import safe_cmyk, safe_float, safe_int, safe_matrix, safe_rgb
8from pdfminer.cmapdb import CMap, CMapBase, CMapDB
9from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
10from pdfminer.pdfdevice import PDFDevice, PDFTextSeq
11from pdfminer.pdfexceptions import PDFException, PDFValueError
12from pdfminer.pdffont import (
13 PDFCIDFont,
14 PDFFont,
15 PDFFontError,
16 PDFTrueTypeFont,
17 PDFType1Font,
18 PDFType3Font,
19)
20from pdfminer.pdfpage import PDFPage
21from pdfminer.pdftypes import (
22 LITERALS_ASCII85_DECODE,
23 PDFObjRef,
24 PDFStream,
25 dict_value,
26 list_value,
27 resolve1,
28 stream_value,
29)
30from pdfminer.psexceptions import PSEOF, PSTypeError
31from pdfminer.psparser import (
32 KWD,
33 LIT,
34 PSKeyword,
35 PSLiteral,
36 PSStackParser,
37 PSStackType,
38 keyword_name,
39 literal_name,
40)
41from pdfminer.utils import (
42 MATRIX_IDENTITY,
43 Matrix,
44 PathSegment,
45 Point,
46 Rect,
47 choplist,
48 mult_matrix,
49)
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
class PDFResourceError(PDFException):
    """PDFException subclass for resource-related errors."""
class PDFInterpreterError(PDFException):
    """PDFException subclass raised by the page interpreter in strict mode."""
# Literal-name constants built with LIT(); code in this module compares
# against them by identity (`is` / `is not`).
LITERAL_PDF = LIT("PDF")
LITERAL_TEXT = LIT("Text")
LITERAL_FONT = LIT("Font")
LITERAL_FORM = LIT("Form")
LITERAL_IMAGE = LIT("Image")
class PDFTextState:
    """Container for the text parameters tracked by the page interpreter.

    Holds the selected font and size, spacing, scaling, leading, rendering
    mode and rise values, plus ``matrix`` and ``linematrix``.
    """

    matrix: Matrix
    linematrix: Point

    def __init__(self) -> None:
        """Start with no font, zeroed parameters (scaling 100) and reset matrices."""
        self.font: Optional[PDFFont] = None
        self.fontsize: float = 0
        self.charspace: float = 0
        self.wordspace: float = 0
        self.scaling: float = 100
        self.leading: float = 0
        self.render: int = 0
        self.rise: float = 0
        # reset() is what assigns self.matrix and self.linematrix.
        self.reset()

    def __repr__(self) -> str:
        """Return a debugging string listing each field with its repr."""
        template = (
            "<PDFTextState: font=%r, fontsize=%r, charspace=%r, "
            "wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, "
            "matrix=%r, linematrix=%r>"
        )
        fields = (
            self.font,
            self.fontsize,
            self.charspace,
            self.wordspace,
            self.scaling,
            self.leading,
            self.render,
            self.rise,
            self.matrix,
            self.linematrix,
        )
        return template % fields

    def copy(self) -> "PDFTextState":
        """Return a new PDFTextState carrying the same field values."""
        clone = PDFTextState()
        for attr in (
            "font",
            "fontsize",
            "charspace",
            "wordspace",
            "scaling",
            "leading",
            "render",
            "rise",
            "matrix",
            "linematrix",
        ):
            setattr(clone, attr, getattr(self, attr))
        return clone

    def reset(self) -> None:
        """Set the matrix to MATRIX_IDENTITY and the line position to (0, 0)."""
        self.matrix, self.linematrix = MATRIX_IDENTITY, (0, 0)
# A color value is a single float or a 3- or 4-tuple of floats, depending on
# the color model noted next to each alternative.
Color = Union[
    float,  # Greyscale
    Tuple[float, float, float],  # R, G, B
    Tuple[float, float, float, float],  # C, M, Y, K
]
class PDFGraphicState:
    """Container for the graphics parameters tracked by the page interpreter.

    Stores the line style operands, rendering intent, flatness, and the
    stroking / non-stroking colors together with their color spaces.
    """

    def __init__(self) -> None:
        """Start with line width 0, unset line style and DeviceGray color 0."""
        self.linewidth: float = 0
        self.linecap: Optional[object] = None
        self.linejoin: Optional[object] = None
        self.miterlimit: Optional[object] = None
        self.dash: Optional[Tuple[object, object]] = None
        self.intent: Optional[object] = None
        self.flatness: Optional[object] = None

        # stroking color
        self.scolor: Color = 0
        self.scs: PDFColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]

        # non stroking color
        self.ncolor: Color = 0
        self.ncs: PDFColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]

    def copy(self) -> "PDFGraphicState":
        """Return a new PDFGraphicState carrying the same field values.

        The color spaces are copied along with the colors; otherwise a copy
        would fall back to DeviceGray while keeping colors that belong to a
        different color space.
        """
        obj = PDFGraphicState()
        obj.linewidth = self.linewidth
        obj.linecap = self.linecap
        obj.linejoin = self.linejoin
        obj.miterlimit = self.miterlimit
        obj.dash = self.dash
        obj.intent = self.intent
        obj.flatness = self.flatness
        obj.scolor = self.scolor
        obj.scs = self.scs
        obj.ncolor = self.ncolor
        obj.ncs = self.ncs
        return obj

    def __repr__(self) -> str:
        """Return a debugging string with the line style fields and both colors."""
        return (
            "<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, "
            " miterlimit=%r, dash=%r, intent=%r, flatness=%r, "
            " stroking color=%r, non stroking color=%r>"
            % (
                self.linewidth,
                self.linecap,
                self.linejoin,
                self.miterlimit,
                self.dash,
                self.intent,
                self.flatness,
                self.scolor,
                self.ncolor,
            )
        )
class PDFResourceManager:
    """Shared-resource repository.

    Creates font objects from their specification dictionaries and, when
    caching is enabled, keeps them keyed by object id so the same font is
    not built more than once.
    """

    def __init__(self, caching: bool = True) -> None:
        """Remember the caching flag and start with an empty font cache."""
        self._cached_fonts: Dict[object, PDFFont] = {}
        self.caching = caching

    def get_procset(self, procs: Sequence[object]) -> None:
        """Walk the procedure-set names; no entry triggers any action."""
        for proc in procs:
            if proc is not LITERAL_PDF and proc is not LITERAL_TEXT:
                pass

    def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase:
        """Look up a CMap by name.

        When the CMap is not found, an empty ``CMap()`` is returned unless
        ``strict`` is true, in which case the lookup error propagates.
        """
        try:
            found = CMapDB.get_cmap(cmapname)
        except CMapDB.CMapNotFound:
            if not strict:
                return CMap()
            raise
        return found

    def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
        """Return the font for ``spec``, reusing a cached one when possible.

        :param objid: cache key; a falsy value disables cache lookup and
            storage for this call.
        :param spec: font specification dictionary.
        :raises PDFFontError: in strict mode, when the type is not /Font, the
            subtype is missing, or the subtype is not recognised.
        """
        if objid and objid in self._cached_fonts:
            return self._cached_fonts[objid]

        log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
        if settings.STRICT and spec["Type"] is not LITERAL_FONT:
            raise PDFFontError("Type is not /Font")

        # Work out the subtype; outside strict mode a missing one means Type1.
        if "Subtype" in spec:
            subtype = literal_name(spec["Subtype"])
        elif settings.STRICT:
            raise PDFFontError("Font Subtype is not specified.")
        else:
            subtype = "Type1"

        if subtype in ("Type1", "MMType1"):
            font = PDFType1Font(self, spec)
        elif subtype == "TrueType":
            font = PDFTrueTypeFont(self, spec)
        elif subtype == "Type3":
            font = PDFType3Font(self, spec)
        elif subtype in ("CIDFontType0", "CIDFontType2"):
            font = PDFCIDFont(self, spec)
        elif subtype == "Type0":
            # Composite font: build the first descendant, carrying over the
            # parent's Encoding / ToUnicode entries when present.
            descendants = list_value(spec["DescendantFonts"])
            assert descendants
            subspec = dict_value(descendants[0]).copy()
            for key in ("Encoding", "ToUnicode"):
                if key in spec:
                    subspec[key] = resolve1(spec[key])
            font = self.get_font(None, subspec)
        else:
            if settings.STRICT:
                raise PDFFontError("Invalid Font spec: %r" % spec)
            font = PDFType1Font(self, spec)  # this is so wrong!

        if objid and self.caching:
            self._cached_fonts[objid] = font
        return font
class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
    """Stack parser that reads a sequence of content streams as one input.

    Streams are opened one after another as each is exhausted, and inline
    images (BI ... ID ... EI) are turned into PDFStream objects.
    """

    def __init__(self, streams: Sequence[object]) -> None:
        """Store the streams; nothing is opened until data is first needed."""
        self.streams = streams
        # Index of the next stream to open in fillfp().
        self.istream = 0
        # PSStackParser.__init__(fp=None) is safe only because we've overloaded
        # all the methods that would attempt to access self.fp without first
        # calling self.fillfp().
        PSStackParser.__init__(self, None)  # type: ignore[arg-type]

    def fillfp(self) -> None:
        """Open the next stream as ``self.fp`` if no stream is currently open.

        :raises PSEOF: when every stream has already been consumed.
        """
        if not self.fp:
            if self.istream < len(self.streams):
                strm = stream_value(self.streams[self.istream])
                self.istream += 1
            else:
                raise PSEOF("Unexpected EOF, file truncated?")
            self.fp = BytesIO(strm.get_data())

    def seek(self, pos: int) -> None:
        """Make sure a stream is open, then seek within it."""
        self.fillfp()
        PSStackParser.seek(self, pos)

    def fillbuf(self) -> None:
        """Refill ``self.buf`` once the current buffer has been fully read."""
        if self.charpos < len(self.buf):
            return
        while 1:
            self.fillfp()
            self.bufpos = self.fp.tell()
            self.buf = self.fp.read(self.BUFSIZ)
            if self.buf:
                break
            # Current stream is exhausted: drop it so fillfp() opens the next.
            self.fp = None  # type: ignore[assignment]
        self.charpos = 0

    def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]:
        """Read raw bytes from ``pos`` up to ``target`` followed by whitespace.

        :param pos: position to seek to before reading.
        :param target: end-of-data marker to look for.
        :return: ``(pos, data)`` where ``data`` excludes the marker, the
            whitespace byte after it, and one trailing CR, LF or CRLF.
        """
        self.seek(pos)
        # i is the match state: 0 = searching for target[0]; 1..len(target) =
        # that many marker bytes matched; len(target) + 1 = marker plus a
        # following whitespace byte seen, which ends the loop.
        i = 0
        data = b""
        while i <= len(target):
            self.fillbuf()
            if i:
                # Mid-match: consume one byte and advance or restart.
                ci = self.buf[self.charpos]
                c = bytes((ci,))
                data += c
                self.charpos += 1
                if (
                    len(target) <= i
                    and c.isspace()
                    or i < len(target)
                    and c == (bytes((target[i],)))
                ):
                    i += 1
                else:
                    i = 0
            else:
                # Not matching: jump to the next occurrence of the first
                # marker byte, or take the rest of the buffer if there is none.
                try:
                    j = self.buf.index(target[0], self.charpos)
                    data += self.buf[self.charpos : j + 1]
                    self.charpos = j + 1
                    i = 1
                except ValueError:
                    data += self.buf[self.charpos :]
                    self.charpos = len(self.buf)
        data = data[: -(len(target) + 1)]  # strip the last part
        data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
        return (pos, data)

    def flush(self) -> None:
        """Move everything on the parse stack to the results."""
        self.add_results(*self.popall())

    # Keywords that delimit an inline image.
    KEYWORD_BI = KWD(b"BI")
    KEYWORD_ID = KWD(b"ID")
    KEYWORD_EI = KWD(b"EI")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle a keyword token, assembling inline images on BI / ID.

        BI opens an "inline" context. ID closes it, builds the image
        dictionary from the collected key/value pairs, reads the image data
        and pushes a PDFStream. Any other keyword is pushed unchanged.

        :raises PSTypeError: in strict mode only; otherwise it is swallowed.
        """
        if token is self.KEYWORD_BI:
            # inline image within a content stream
            self.start_type(pos, "inline")
        elif token is self.KEYWORD_ID:
            try:
                (_, objs) = self.end_type("inline")
                if len(objs) % 2 != 0:
                    error_msg = f"Invalid dictionary construct: {objs!r}"
                    raise PSTypeError(error_msg)
                d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
                eos = b"EI"
                filter = d.get("F", None)
                if filter is not None:
                    if isinstance(filter, PSLiteral):
                        filter = [filter]
                    # When the first filter is ASCII85, the data ends at "~>".
                    if filter[0] in LITERALS_ASCII85_DECODE:
                        eos = b"~>"
                (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
                if eos != b"EI":  # it may be necessary for decoding
                    data += eos
                obj = PDFStream(d, data)
                self.push((pos, obj))
                if eos == b"EI":  # otherwise it is still in the stream
                    self.push((pos, self.KEYWORD_EI))
            except PSTypeError:
                if settings.STRICT:
                    raise
        else:
            self.push((pos, token))
# Alias used to annotate operator arguments and the interpreter's argstack.
PDFStackT = PSStackType[PDFStream]
"""Types that may appear on the PDF argument stack."""
363class PDFPageInterpreter:
364 """Processor for the content of a PDF page
366 Reference: PDF Reference, Appendix A, Operator Summary
367 """
369 def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice) -> None:
370 self.rsrcmgr = rsrcmgr
371 self.device = device
373 def dup(self) -> "PDFPageInterpreter":
374 return self.__class__(self.rsrcmgr, self.device)
376 def init_resources(self, resources: Dict[object, object]) -> None:
377 """Prepare the fonts and XObjects listed in the Resource attribute."""
378 self.resources = resources
379 self.fontmap: Dict[object, PDFFont] = {}
380 self.xobjmap = {}
381 self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
382 if not resources:
383 return
385 def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
386 if isinstance(spec, list):
387 name = literal_name(spec[0])
388 else:
389 name = literal_name(spec)
390 if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2:
391 return PDFColorSpace(name, stream_value(spec[1])["N"])
392 elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2:
393 return PDFColorSpace(name, len(list_value(spec[1])))
394 else:
395 return PREDEFINED_COLORSPACE.get(name)
397 for k, v in dict_value(resources).items():
398 log.debug("Resource: %r: %r", k, v)
399 if k == "Font":
400 for fontid, spec in dict_value(v).items():
401 objid = None
402 if isinstance(spec, PDFObjRef):
403 objid = spec.objid
404 spec = dict_value(spec)
405 self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
406 elif k == "ColorSpace":
407 for csid, spec in dict_value(v).items():
408 colorspace = get_colorspace(resolve1(spec))
409 if colorspace is not None:
410 self.csmap[csid] = colorspace
411 elif k == "ProcSet":
412 self.rsrcmgr.get_procset(list_value(v))
413 elif k == "XObject":
414 for xobjid, xobjstrm in dict_value(v).items():
415 self.xobjmap[xobjid] = xobjstrm
417 def init_state(self, ctm: Matrix) -> None:
418 """Initialize the text and graphic states for rendering a page."""
419 # gstack: stack for graphical states.
420 self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = []
421 self.ctm = ctm
422 self.device.set_ctm(self.ctm)
423 self.textstate = PDFTextState()
424 self.graphicstate = PDFGraphicState()
425 self.curpath: List[PathSegment] = []
426 # argstack: stack for command arguments.
427 self.argstack: List[PDFStackT] = []
429 def push(self, obj: PDFStackT) -> None:
430 self.argstack.append(obj)
432 def pop(self, n: int) -> List[PDFStackT]:
433 if n == 0:
434 return []
435 x = self.argstack[-n:]
436 self.argstack = self.argstack[:-n]
437 return x
439 def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
440 return (self.ctm, self.textstate.copy(), self.graphicstate.copy())
442 def set_current_state(
443 self,
444 state: Tuple[Matrix, PDFTextState, PDFGraphicState],
445 ) -> None:
446 (self.ctm, self.textstate, self.graphicstate) = state
447 self.device.set_ctm(self.ctm)
449 def do_q(self) -> None:
450 """Save graphics state"""
451 self.gstack.append(self.get_current_state())
453 def do_Q(self) -> None:
454 """Restore graphics state"""
455 if self.gstack:
456 self.set_current_state(self.gstack.pop())
458 def do_cm(
459 self,
460 a1: PDFStackT,
461 b1: PDFStackT,
462 c1: PDFStackT,
463 d1: PDFStackT,
464 e1: PDFStackT,
465 f1: PDFStackT,
466 ) -> None:
467 """Concatenate matrix to current transformation matrix"""
468 matrix = safe_matrix(a1, b1, c1, d1, e1, f1)
470 if matrix is None:
471 log.warning(
472 f"Cannot concatenate matrix to current transformation matrix because not all values in {(a1, b1, c1, d1, e1, f1)!r} can be parsed as floats"
473 )
474 else:
475 self.ctm = mult_matrix(matrix, self.ctm)
476 self.device.set_ctm(self.ctm)
478 def do_w(self, linewidth: PDFStackT) -> None:
479 """Set line width"""
480 linewidth_f = safe_float(linewidth)
481 if linewidth_f is None:
482 log.warning(
483 f"Cannot set line width because {linewidth!r} is an invalid float value"
484 )
485 else:
486 self.graphicstate.linewidth = linewidth_f
488 def do_J(self, linecap: PDFStackT) -> None:
489 """Set line cap style"""
490 self.graphicstate.linecap = linecap
492 def do_j(self, linejoin: PDFStackT) -> None:
493 """Set line join style"""
494 self.graphicstate.linejoin = linejoin
496 def do_M(self, miterlimit: PDFStackT) -> None:
497 """Set miter limit"""
498 self.graphicstate.miterlimit = miterlimit
500 def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None:
501 """Set line dash pattern"""
502 self.graphicstate.dash = (dash, phase)
504 def do_ri(self, intent: PDFStackT) -> None:
505 """Set color rendering intent"""
506 self.graphicstate.intent = intent
508 def do_i(self, flatness: PDFStackT) -> None:
509 """Set flatness tolerance"""
510 self.graphicstate.flatness = flatness
512 def do_gs(self, name: PDFStackT) -> None:
513 """Set parameters from graphics state parameter dictionary"""
514 # TODO
516 def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
517 """Begin new subpath"""
518 x_f = safe_float(x)
519 y_f = safe_float(y)
521 if x_f is None or y_f is None:
522 point = ("m", x, y)
523 log.warning(
524 f"Cannot start new subpath because not all values in {point!r} can be parsed as floats"
525 )
526 else:
527 point = ("m", x_f, y_f)
528 self.curpath.append(point)
530 def do_l(self, x: PDFStackT, y: PDFStackT) -> None:
531 """Append straight line segment to path"""
532 x_f = safe_float(x)
533 y_f = safe_float(y)
534 if x_f is None or y_f is None:
535 point = ("l", x, y)
536 log.warning(
537 f"Cannot append straight line segment to path because not all values in {point!r} can be parsed as floats"
538 )
539 else:
540 point = ("l", x_f, y_f)
541 self.curpath.append(point)
543 def do_c(
544 self,
545 x1: PDFStackT,
546 y1: PDFStackT,
547 x2: PDFStackT,
548 y2: PDFStackT,
549 x3: PDFStackT,
550 y3: PDFStackT,
551 ) -> None:
552 """Append curved segment to path (three control points)"""
553 x1_f = safe_float(x1)
554 y1_f = safe_float(y1)
555 x2_f = safe_float(x2)
556 y2_f = safe_float(y2)
557 x3_f = safe_float(x3)
558 y3_f = safe_float(y3)
559 if (
560 x1_f is None
561 or y1_f is None
562 or x2_f is None
563 or y2_f is None
564 or x3_f is None
565 or y3_f is None
566 ):
567 point = ("c", x1, y1, x2, y2, x3, y3)
568 log.warning(
569 f"Cannot append curved segment to path because not all values in {point!r} can be parsed as floats"
570 )
571 else:
572 point = ("c", x1_f, y1_f, x2_f, y2_f, x3_f, y3_f)
573 self.curpath.append(point)
575 def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
576 """Append curved segment to path (initial point replicated)"""
577 x2_f = safe_float(x2)
578 y2_f = safe_float(y2)
579 x3_f = safe_float(x3)
580 y3_f = safe_float(y3)
581 if x2_f is None or y2_f is None or x3_f is None or y3_f is None:
582 point = ("v", x2, y2, x3, y3)
583 log.warning(
584 f"Cannot append curved segment to path because not all values in {point!r} can be parsed as floats"
585 )
586 else:
587 point = ("v", x2_f, y2_f, x3_f, y3_f)
588 self.curpath.append(point)
590 def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
591 """Append curved segment to path (final point replicated)"""
592 x1_f = safe_float(x1)
593 y1_f = safe_float(y1)
594 x3_f = safe_float(x3)
595 y3_f = safe_float(y3)
596 if x1_f is None or y1_f is None or x3_f is None or y3_f is None:
597 point = ("y", x1, y1, x3, y3)
598 log.warning(
599 f"Cannot append curved segment to path because not all values in {point!r} can be parsed as floats"
600 )
601 else:
602 point = ("y", x1_f, y1_f, x3_f, y3_f)
603 self.curpath.append(point)
605 def do_h(self) -> None:
606 """Close subpath"""
607 self.curpath.append(("h",))
609 def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None:
610 """Append rectangle to path"""
611 x_f = safe_float(x)
612 y_f = safe_float(y)
613 w_f = safe_float(w)
614 h_f = safe_float(h)
616 if x_f is None or y_f is None or w_f is None or h_f is None:
617 values = (x, y, w, h)
618 log.warning(
619 f"Cannot append rectangle to path because not all values in {values!r} can be parsed as floats"
620 )
621 else:
622 self.curpath.append(("m", x_f, y_f))
623 self.curpath.append(("l", x_f + w_f, y_f))
624 self.curpath.append(("l", x_f + w_f, y_f + h_f))
625 self.curpath.append(("l", x_f, y_f + h_f))
626 self.curpath.append(("h",))
628 def do_S(self) -> None:
629 """Stroke path"""
630 self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
631 self.curpath = []
633 def do_s(self) -> None:
634 """Close and stroke path"""
635 self.do_h()
636 self.do_S()
638 def do_f(self) -> None:
639 """Fill path using nonzero winding number rule"""
640 self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
641 self.curpath = []
643 def do_F(self) -> None:
644 """Fill path using nonzero winding number rule (obsolete)"""
646 def do_f_a(self) -> None:
647 """Fill path using even-odd rule"""
648 self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
649 self.curpath = []
651 def do_B(self) -> None:
652 """Fill and stroke path using nonzero winding number rule"""
653 self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
654 self.curpath = []
656 def do_B_a(self) -> None:
657 """Fill and stroke path using even-odd rule"""
658 self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
659 self.curpath = []
661 def do_b(self) -> None:
662 """Close, fill, and stroke path using nonzero winding number rule"""
663 self.do_h()
664 self.do_B()
666 def do_b_a(self) -> None:
667 """Close, fill, and stroke path using even-odd rule"""
668 self.do_h()
669 self.do_B_a()
671 def do_n(self) -> None:
672 """End path without filling or stroking"""
673 self.curpath = []
675 def do_W(self) -> None:
676 """Set clipping path using nonzero winding number rule"""
678 def do_W_a(self) -> None:
679 """Set clipping path using even-odd rule"""
681 def do_CS(self, name: PDFStackT) -> None:
682 """Set color space for stroking operations
684 Introduced in PDF 1.1
685 """
686 try:
687 self.graphicstate.scs = self.csmap[literal_name(name)]
688 except KeyError:
689 if settings.STRICT:
690 raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
692 def do_cs(self, name: PDFStackT) -> None:
693 """Set color space for nonstroking operations"""
694 try:
695 self.graphicstate.ncs = self.csmap[literal_name(name)]
696 except KeyError:
697 if settings.STRICT:
698 raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
700 def do_G(self, gray: PDFStackT) -> None:
701 """Set gray level for stroking operations"""
702 gray_f = safe_float(gray)
704 if gray_f is None:
705 log.warning(
706 f"Cannot set gray level because {gray!r} is an invalid float value"
707 )
708 else:
709 self.graphicstate.scolor = gray_f
710 self.graphicstate.scs = self.csmap["DeviceGray"]
712 def do_g(self, gray: PDFStackT) -> None:
713 """Set gray level for nonstroking operations"""
714 gray_f = safe_float(gray)
716 if gray_f is None:
717 log.warning(
718 f"Cannot set gray level because {gray!r} is an invalid float value"
719 )
720 else:
721 self.graphicstate.ncolor = gray_f
722 self.graphicstate.ncs = self.csmap["DeviceGray"]
724 def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
725 """Set RGB color for stroking operations"""
726 rgb = safe_rgb(r, g, b)
728 if rgb is None:
729 log.warning(
730 f"Cannot set RGB stroke color because not all values in {(r, g, b)!r} can be parsed as floats"
731 )
732 else:
733 self.graphicstate.scolor = rgb
734 self.graphicstate.scs = self.csmap["DeviceRGB"]
736 def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
737 """Set RGB color for nonstroking operations"""
738 rgb = safe_rgb(r, g, b)
740 if rgb is None:
741 log.warning(
742 f"Cannot set RGB non-stroke color because not all values in {(r, g, b)!r} can be parsed as floats"
743 )
744 else:
745 self.graphicstate.ncolor = rgb
746 self.graphicstate.ncs = self.csmap["DeviceRGB"]
748 def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
749 """Set CMYK color for stroking operations"""
750 cmyk = safe_cmyk(c, m, y, k)
752 if cmyk is None:
753 log.warning(
754 f"Cannot set CMYK stroke color because not all values in {(c, m, y, k)!r} can be parsed as floats"
755 )
756 else:
757 self.graphicstate.scolor = cmyk
758 self.graphicstate.scs = self.csmap["DeviceCMYK"]
760 def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
761 """Set CMYK color for nonstroking operations"""
762 cmyk = safe_cmyk(c, m, y, k)
764 if cmyk is None:
765 log.warning(
766 f"Cannot set CMYK non-stroke color because not all values in {(c, m, y, k)!r} can be parsed as floats"
767 )
768 else:
769 self.graphicstate.ncolor = cmyk
770 self.graphicstate.ncs = self.csmap["DeviceCMYK"]
772 def do_SCN(self) -> None:
773 """Set color for stroking operations."""
774 n = self.graphicstate.scs.ncomponents
776 components = self.pop(n)
777 if len(components) != n:
778 log.warning(
779 f"Cannot set stroke color because expected {n} components but got {components:!r}"
780 )
782 elif len(components) == 1:
783 gray = components[0]
784 gray_f = safe_float(gray)
785 if gray_f is None:
786 log.warning(
787 f"Cannot set gray stroke color because {gray!r} is an invalid float value"
788 )
789 else:
790 self.graphicstate.scolor = gray_f
792 elif len(components) == 3:
793 rgb = safe_rgb(*components)
795 if rgb is None:
796 log.warning(
797 f"Cannot set RGB stroke color because components {components!r} cannot be parsed as RGB"
798 )
799 else:
800 self.graphicstate.scolor = rgb
802 elif len(components) == 4:
803 cmyk = safe_cmyk(*components)
805 if cmyk is None:
806 log.warning(
807 f"Cannot set CMYK stroke color because components {components!r} cannot be parsed as CMYK"
808 )
809 else:
810 self.graphicstate.scolor = cmyk
812 else:
813 log.warning(
814 f"Cannot set stroke color because {len(components)} components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported"
815 )
817 def do_scn(self) -> None:
818 """Set color for nonstroking operations"""
819 n = self.graphicstate.ncs.ncomponents
821 components = self.pop(n)
822 if len(components) != n:
823 log.warning(
824 f"Cannot set non-stroke color because expected {n} components but got {components:!r}"
825 )
827 elif len(components) == 1:
828 gray = components[0]
829 gray_f = safe_float(gray)
830 if gray_f is None:
831 log.warning(
832 f"Cannot set gray non-stroke color because {gray!r} is an invalid float value"
833 )
834 else:
835 self.graphicstate.ncolor = gray_f
837 elif len(components) == 3:
838 rgb = safe_rgb(*components)
840 if rgb is None:
841 log.warning(
842 f"Cannot set RGB non-stroke color because components {components!r} cannot be parsed as RGB"
843 )
844 else:
845 self.graphicstate.ncolor = rgb
847 elif len(components) == 4:
848 cmyk = safe_cmyk(*components)
850 if cmyk is None:
851 log.warning(
852 f"Cannot set CMYK non-stroke color because components {components!r} cannot be parsed as CMYK"
853 )
854 else:
855 self.graphicstate.ncolor = cmyk
857 else:
858 log.warning(
859 f"Cannot set non-stroke color because {len(components)} components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported"
860 )
862 def do_SC(self) -> None:
863 """Set color for stroking operations"""
864 self.do_SCN()
866 def do_sc(self) -> None:
867 """Set color for nonstroking operations"""
868 self.do_scn()
870 def do_sh(self, name: object) -> None:
871 """Paint area defined by shading pattern"""
873 def do_BT(self) -> None:
874 """Begin text object
876 Initializing the text matrix, Tm, and the text line matrix, Tlm, to
877 the identity matrix. Text objects cannot be nested; a second BT cannot
878 appear before an ET.
879 """
880 self.textstate.reset()
882 def do_ET(self) -> None:
883 """End a text object"""
885 def do_BX(self) -> None:
886 """Begin compatibility section"""
888 def do_EX(self) -> None:
889 """End compatibility section"""
891 def do_MP(self, tag: PDFStackT) -> None:
892 """Define marked-content point"""
893 if isinstance(tag, PSLiteral):
894 self.device.do_tag(tag)
895 else:
896 log.warning(
897 f"Cannot define marked-content point because {tag!r} is not a PSLiteral"
898 )
900 def do_DP(self, tag: PDFStackT, props: PDFStackT) -> None:
901 """Define marked-content point with property list"""
902 if isinstance(tag, PSLiteral):
903 self.device.do_tag(tag, props)
904 else:
905 log.warning(
906 f"Cannot define marked-content point with property list because {tag!r} is not a PSLiteral"
907 )
909 def do_BMC(self, tag: PDFStackT) -> None:
910 """Begin marked-content sequence"""
911 if isinstance(tag, PSLiteral):
912 self.device.begin_tag(tag)
913 else:
914 log.warning(
915 f"Cannot begin marked-content sequence because {tag!r} is not a PSLiteral"
916 )
918 def do_BDC(self, tag: PDFStackT, props: PDFStackT) -> None:
919 """Begin marked-content sequence with property list"""
920 if isinstance(tag, PSLiteral):
921 self.device.begin_tag(tag, props)
922 else:
923 log.warning(
924 f"Cannot begin marked-content sequence with property list because {tag!r} is not a PSLiteral"
925 )
927 def do_EMC(self) -> None:
928 """End marked-content sequence"""
929 self.device.end_tag()
931 def do_Tc(self, space: PDFStackT) -> None:
932 """Set character spacing.
934 Character spacing is used by the Tj, TJ, and ' operators.
936 :param space: a number expressed in unscaled text space units.
937 """
938 charspace = safe_float(space)
939 if charspace is None:
940 log.warning(
941 f"Could not set character spacing because {space!r} is an invalid float value"
942 )
943 else:
944 self.textstate.charspace = charspace
946 def do_Tw(self, space: PDFStackT) -> None:
947 """Set the word spacing.
949 Word spacing is used by the Tj, TJ, and ' operators.
951 :param space: a number expressed in unscaled text space units
952 """
953 wordspace = safe_float(space)
954 if wordspace is None:
955 log.warning(
956 f"Could not set word spacing becuase {space!r} is an invalid float value"
957 )
958 else:
959 self.textstate.wordspace = wordspace
961 def do_Tz(self, scale: PDFStackT) -> None:
962 """Set the horizontal scaling.
964 :param scale: is a number specifying the percentage of the normal width
965 """
966 scale_f = safe_float(scale)
968 if scale_f is None:
969 log.warning(
970 f"Could not set horizontal scaling because {scale!r} is an invalid float value"
971 )
972 else:
973 self.textstate.scaling = scale_f
975 def do_TL(self, leading: PDFStackT) -> None:
976 """Set the text leading.
978 Text leading is used only by the T*, ', and " operators.
980 :param leading: a number expressed in unscaled text space units
981 """
982 leading_f = safe_float(leading)
983 if leading_f is None:
984 log.warning(
985 f"Could not set text leading because {leading!r} is an invalid float value"
986 )
987 else:
988 self.textstate.leading = -leading_f
990 def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None:
991 """Set the text font
993 :param fontid: the name of a font resource in the Font subdictionary
994 of the current resource dictionary
995 :param fontsize: size is a number representing a scale factor.
996 """
997 try:
998 self.textstate.font = self.fontmap[literal_name(fontid)]
999 except KeyError:
1000 if settings.STRICT:
1001 raise PDFInterpreterError("Undefined Font id: %r" % fontid)
1002 self.textstate.font = self.rsrcmgr.get_font(None, {})
1004 fontsize_f = safe_float(fontsize)
1005 if fontsize_f is None:
1006 log.warning(
1007 f"Could not set text font because {fontsize!r} is an invalid float value"
1008 )
1009 else:
1010 self.textstate.fontsize = fontsize_f
1012 def do_Tr(self, render: PDFStackT) -> None:
1013 """Set the text rendering mode"""
1014 render_i = safe_int(render)
1016 if render_i is None:
1017 log.warning(
1018 f"Could not set text rendering mode because {render!r} is an invalid int value"
1019 )
1020 else:
1021 self.textstate.render = render_i
1023 def do_Ts(self, rise: PDFStackT) -> None:
1024 """Set the text rise
1026 :param rise: a number expressed in unscaled text space units
1027 """
1028 rise_f = safe_float(rise)
1030 if rise_f is None:
1031 log.warning(
1032 f"Could not set text rise because {rise!r} is an invalid float value"
1033 )
1034 else:
1035 self.textstate.rise = rise_f
1037 def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None:
1038 """Move to the start of the next line
1040 Offset from the start of the current line by (tx , ty).
1041 """
1042 tx_ = safe_float(tx)
1043 ty_ = safe_float(ty)
1044 if tx_ is not None and ty_ is not None:
1045 (a, b, c, d, e, f) = self.textstate.matrix
1046 e_new = tx_ * a + ty_ * c + e
1047 f_new = tx_ * b + ty_ * d + f
1048 self.textstate.matrix = (a, b, c, d, e_new, f_new)
1050 elif settings.STRICT:
1051 raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for Td")
1053 self.textstate.linematrix = (0, 0)
1055 def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None:
1056 """Move to the start of the next line.
1058 offset from the start of the current line by (tx , ty). As a side effect, this
1059 operator sets the leading parameter in the text state.
1060 """
1061 tx_ = safe_float(tx)
1062 ty_ = safe_float(ty)
1064 if tx_ is not None and ty_ is not None:
1065 (a, b, c, d, e, f) = self.textstate.matrix
1066 e_new = tx_ * a + ty_ * c + e
1067 f_new = tx_ * b + ty_ * d + f
1068 self.textstate.matrix = (a, b, c, d, e_new, f_new)
1070 elif settings.STRICT:
1071 raise PDFValueError("Invalid offset ({tx}, {ty}) for TD")
1073 if ty_ is not None:
1074 self.textstate.leading = ty_
1076 self.textstate.linematrix = (0, 0)
1078 def do_Tm(
1079 self,
1080 a: PDFStackT,
1081 b: PDFStackT,
1082 c: PDFStackT,
1083 d: PDFStackT,
1084 e: PDFStackT,
1085 f: PDFStackT,
1086 ) -> None:
1087 """Set text matrix and text line matrix"""
1088 values = (a, b, c, d, e, f)
1089 matrix = safe_matrix(*values)
1091 if matrix is None:
1092 log.warning(
1093 f"Could not set text matrix because not all values in {values!r} can be parsed as floats"
1094 )
1095 else:
1096 self.textstate.matrix = matrix
1097 self.textstate.linematrix = (0, 0)
def do_T_a(self) -> None:
    """Move to the start of the next text line (the T* operator).

    The translation part of the text matrix is advanced by the stored
    leading along the matrix's second axis (c, d), and the text line matrix
    is reset to (0, 0).
    """
    state = self.textstate
    (a, b, c, d, e, f) = state.matrix
    leading = state.leading
    state.matrix = (a, b, c, d, leading * c + e, leading * d + f)
    state.linematrix = (0, 0)
def do_TJ(self, seq: PDFStackT) -> None:
    """Show text, allowing individual glyph positioning.

    The sequence is handed to the device together with the current text
    state, the non-stroking colour space and a copy of the graphics state.
    When no font is selected nothing is rendered; with settings.STRICT
    enabled a PDFInterpreterError is raised instead.
    """
    if self.textstate.font is not None:
        self.device.render_string(
            self.textstate,
            cast(PDFTextSeq, seq),
            self.graphicstate.ncs,
            self.graphicstate.copy(),
        )
        return

    if settings.STRICT:
        raise PDFInterpreterError("No font specified!")
def do_Tj(self, s: PDFStackT) -> None:
    """Show a single text string by delegating to do_TJ with a one-item list."""
    single_item_seq = [s]
    self.do_TJ(single_item_seq)
def do__q(self, s: PDFStackT) -> None:
    """Move to the next line, then show the text string.

    Implements the ' (single quote) operator: a line advance via do_T_a
    followed by do_TJ with the string wrapped in a one-item list.
    """
    self.do_T_a()
    single_item_seq = [s]
    self.do_TJ(single_item_seq)
def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None:
    """Set word and character spacing, move to next line, and show text.

    The " (double quote) operator. The PDF specification defines it as
    having the same effect as ``aw Tw  ac Tc  string '``, so after the two
    spacing parameters are set the text position is advanced to the next
    line before the string is shown.
    """
    self.do_Tw(aw)
    self.do_Tc(ac)
    # Advance to the next line, exactly like the ' operator does; without
    # this the string would be drawn on the current line.
    self.do_T_a()
    self.do_TJ([s])
def do_BI(self) -> None:
    """Begin inline image object.

    This handler has no body of its own and performs no work.
    """
    return None
def do_ID(self) -> None:
    """Begin inline image data.

    This handler has no body of its own and performs no work.
    """
    return None
def do_EI(self, obj: PDFStackT) -> None:
    """End inline image object.

    When the operand is a PDFStream carrying both "W" and "H" entries it is
    rendered as an image inside a unit-square figure; anything else is
    ignored.
    """
    if not (isinstance(obj, PDFStream) and "W" in obj and "H" in obj):
        return

    # Inline images have no name, so the object's identity serves as the id.
    figure_id = str(id(obj))
    self.device.begin_figure(figure_id, (0, 0, 1, 1), MATRIX_IDENTITY)
    self.device.render_image(figure_id, obj)
    self.device.end_figure(figure_id)
def do_Do(self, xobjid_arg: PDFStackT) -> None:
    """Invoke a named XObject.

    The name is looked up in self.xobjmap. An unknown name is ignored, or
    raises PDFInterpreterError when settings.STRICT is enabled. Form
    XObjects that have a BBox are rendered by a duplicate interpreter inside
    a figure; image XObjects that have Width and Height are rendered inside
    a unit-square figure. Any other XObject is skipped.
    """
    xobjid = literal_name(xobjid_arg)
    try:
        xobj = stream_value(self.xobjmap[xobjid])
    except KeyError:
        if not settings.STRICT:
            return
        raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)

    log.debug("Processing xobj: %r", xobj)
    subtype = xobj.get("Subtype")

    if subtype is LITERAL_FORM and "BBox" in xobj:
        interpreter = self.dup()
        bbox = cast(Rect, list_value(xobj["BBox"]))
        matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
        # According to PDF reference 1.7 section 4.9.1, XObjects in
        # earlier PDFs (prior to v1.2) use the page's Resources entry
        # instead of having their own Resources entry.
        xobjres = xobj.get("Resources")
        resources = dict_value(xobjres) if xobjres else self.resources.copy()
        self.device.begin_figure(xobjid, bbox, matrix)
        form_ctm = mult_matrix(matrix, self.ctm)
        interpreter.render_contents(resources, [xobj], ctm=form_ctm)
        self.device.end_figure(xobjid)
        return

    if subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
        self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
        self.device.render_image(xobjid, xobj)
        self.device.end_figure(xobjid)

    # Any other XObject type is unsupported and silently ignored.
def process_page(self, page: PDFPage) -> None:
    """Render one page on the device.

    An initial transformation matrix is chosen from the page's rotate value
    (90, 180 or 270; anything else is treated as unrotated) and its
    mediabox. The page contents are then rendered between the device's
    begin_page and end_page calls.
    """
    log.debug("Processing page: %r", page)
    (x0, y0, x1, y1) = page.mediabox
    rotation = page.rotate

    if rotation == 90:
        ctm = (0, -1, 1, 0, -y0, x1)
    elif rotation == 180:
        ctm = (-1, 0, 0, -1, x1, y1)
    elif rotation == 270:
        ctm = (0, 1, -1, 0, y1, -x0)
    else:
        # No rotation: only shift the mediabox origin to (0, 0).
        ctm = (1, 0, 0, 1, -x0, -y0)

    device = self.device
    device.begin_page(page, ctm)
    self.render_contents(page.resources, page.contents, ctm=ctm)
    device.end_page(page)
def render_contents(
    self,
    resources: Dict[object, object],
    streams: Sequence[object],
    ctm: Matrix = MATRIX_IDENTITY,
) -> None:
    """Render the content streams.

    Resources and interpreter state are (re)initialised from the arguments
    before the streams are executed. This method may be called recursively.
    """
    log.debug(
        "render_contents: resources=%r, streams=%r, ctm=%r", resources, streams, ctm
    )
    self.init_resources(resources)
    self.init_state(ctm)
    stream_list = list_value(streams)
    self.execute(stream_list)
def execute(self, streams: Sequence[object]) -> None:
    """Parse the content streams and dispatch every operator found.

    Non-keyword objects are pushed as operands. A keyword is mapped to a
    ``do_<name>`` method (with ``*``, ``"`` and ``'`` rewritten to ``_a``,
    ``_w`` and ``_q``); the method's operands are popped and it is called
    only if enough operands were available. Unknown operators are ignored
    unless settings.STRICT is enabled, in which case PDFInterpreterError is
    raised.
    """
    try:
        parser = PDFContentParser(streams)
    except PSEOF:
        # empty page
        return

    while True:
        try:
            (_, token) = parser.nextobject()
        except PSEOF:
            break

        if not isinstance(token, PSKeyword):
            # Operand: keep it until an operator consumes it.
            self.push(token)
            continue

        name = keyword_name(token)
        method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
            "'",
            "_q",
        )

        if not hasattr(self, method):
            if settings.STRICT:
                error_msg = "Unknown operator: %r" % name
                raise PDFInterpreterError(error_msg)
            continue

        func = getattr(self, method)
        # Operand count is derived from the handler's signature, minus self.
        nargs = func.__code__.co_argcount - 1
        if not nargs:
            log.debug("exec: %s", name)
            func()
            continue

        args = self.pop(nargs)
        log.debug("exec: %s %r", name, args)
        # Too few operands were available: skip the operator.
        if len(args) == nargs:
            func(*args)