Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfinterp.py: 85%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2import re
3from collections.abc import Mapping, Sequence
4from io import BytesIO
5from typing import Union, cast
7from pdfminer import settings
8from pdfminer.casting import safe_cmyk, safe_float, safe_int, safe_matrix, safe_rgb
9from pdfminer.cmapdb import CMap, CMapBase, CMapDB
10from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
11from pdfminer.pdfdevice import PDFDevice, PDFTextSeq
12from pdfminer.pdfexceptions import PDFException, PDFValueError
13from pdfminer.pdffont import (
14 PDFCIDFont,
15 PDFFont,
16 PDFFontError,
17 PDFTrueTypeFont,
18 PDFType1Font,
19 PDFType3Font,
20)
21from pdfminer.pdfpage import PDFPage
22from pdfminer.pdftypes import (
23 LITERALS_ASCII85_DECODE,
24 PDFObjRef,
25 PDFStream,
26 dict_value,
27 int_value,
28 list_value,
29 resolve1,
30 stream_value,
31)
32from pdfminer.psexceptions import PSEOF, PSTypeError
33from pdfminer.psparser import (
34 KWD,
35 LIT,
36 PSKeyword,
37 PSLiteral,
38 PSStackParser,
39 PSStackType,
40 keyword_name,
41 literal_name,
42)
43from pdfminer.utils import (
44 MATRIX_IDENTITY,
45 Matrix,
46 PathSegment,
47 Point,
48 Rect,
49 choplist,
50 mult_matrix,
51)
53log = logging.getLogger(__name__)
class PDFResourceError(PDFException):
    """Exception category for problems with PDF resources."""
class PDFInterpreterError(PDFException):
    """Raised by the page interpreter, e.g. for an undefined color space
    when strict mode is enabled.
    """
# PDF name objects used in this module; they are compared with `is`.
LITERAL_PDF, LITERAL_TEXT, LITERAL_FONT, LITERAL_FORM, LITERAL_IMAGE = (
    LIT(_name) for _name in ("PDF", "Text", "Font", "Form", "Image")
)
class PDFTextState:
    """Mutable record of the text parameters used while interpreting a page.

    Holds the current font and the numeric text parameters plus the text
    matrix and the text line position, which reset() restores to their
    initial values.
    """

    matrix: Matrix
    linematrix: Point

    # Attribute names in the order used by copy() and __repr__().
    _FIELD_NAMES = (
        "font",
        "fontsize",
        "charspace",
        "wordspace",
        "scaling",
        "leading",
        "render",
        "rise",
        "matrix",
        "linematrix",
    )

    def __init__(self) -> None:
        self.font: PDFFont | None = None
        self.fontsize: float = 0
        self.charspace: float = 0
        self.wordspace: float = 0
        self.scaling: float = 100
        self.leading: float = 0
        self.render: int = 0
        self.rise: float = 0
        # reset() assigns self.matrix and self.linematrix.
        self.reset()

    def __repr__(self) -> str:
        """Return a debug string listing every field as name=repr(value)."""
        listing = ", ".join(
            f"{name}={getattr(self, name)!r}" for name in self._FIELD_NAMES
        )
        return f"<PDFTextState: {listing}>"

    def copy(self) -> "PDFTextState":
        """Return a new PDFTextState carrying the same field values."""
        duplicate = PDFTextState()
        for name in self._FIELD_NAMES:
            setattr(duplicate, name, getattr(self, name))
        return duplicate

    def reset(self) -> None:
        """Set the text matrix to identity and the line position to (0, 0)."""
        self.linematrix = (0, 0)
        self.matrix = MATRIX_IDENTITY
# A standard color is a gray level (one float), an RGB triple or a CMYK
# quadruple. It is used on its own or as the base of an uncolored pattern.
StandardColor = Union[
    float,
    tuple[float, float, float],
    tuple[float, float, float, float],
]

# Any color value, patterns included:
#   - a standard color (gray, RGB, CMYK)
#   - a pattern name (colored pattern, PaintType=1)
#   - a (base_color, pattern_name) pair (uncolored pattern, PaintType=2)
Color = Union[StandardColor, str, tuple[StandardColor, str]]
class PDFGraphicState:
    """Mutable record of line-drawing parameters and current colors."""

    def __init__(self) -> None:
        self.linewidth: float = 0
        self.linecap: object | None = None
        self.linejoin: object | None = None
        self.miterlimit: object | None = None
        self.dash: tuple[object, object] | None = None
        self.intent: object | None = None
        self.flatness: object | None = None

        default_space = PREDEFINED_COLORSPACE["DeviceGray"]

        # Stroking color and its color space.
        self.scolor: Color = 0
        self.scs: PDFColorSpace = default_space

        # Non-stroking color and its color space.
        self.ncolor: Color = 0
        self.ncs: PDFColorSpace = default_space

    def copy(self) -> "PDFGraphicState":
        """Return a new PDFGraphicState carrying the same attribute values."""
        duplicate = PDFGraphicState()
        for name in (
            "linewidth",
            "linecap",
            "linejoin",
            "miterlimit",
            "dash",
            "intent",
            "flatness",
            "scolor",
            "scs",
            "ncolor",
            "ncs",
        ):
            setattr(duplicate, name, getattr(self, name))
        return duplicate

    def __repr__(self) -> str:
        """Return a debug string; the color spaces themselves are not shown."""
        shown = (
            ("linewidth", self.linewidth),
            ("linecap", self.linecap),
            ("linejoin", self.linejoin),
            ("miterlimit", self.miterlimit),
            ("dash", self.dash),
            ("intent", self.intent),
            ("flatness", self.flatness),
            ("stroking color", self.scolor),
            ("non stroking color", self.ncolor),
        )
        listing = ", ".join(f"{label}={value!r}" for label, value in shown)
        return f"<PDFGraphicState: {listing}>"
class PDFResourceManager:
    """Repository of shared resources.

    Fonts built by get_font() are remembered by object id (when caching is
    enabled) so the same font object is returned on later requests instead
    of being constructed again.
    """

    def __init__(self, caching: bool = True) -> None:
        self._cached_fonts: dict[object, PDFFont] = {}
        self.caching = caching

    def get_procset(self, procs: Sequence[object]) -> None:
        """Iterate over the ProcSet entries; no entry triggers any action."""
        for _proc in procs:
            # Neither /PDF, /Text nor any other procedure set is acted upon.
            continue

    def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase:
        """Return the CMap registered under cmapname.

        When the name is unknown, CMapDB.CMapNotFound propagates if strict
        is true; otherwise an empty CMap is returned.
        """
        try:
            found = CMapDB.get_cmap(cmapname)
        except CMapDB.CMapNotFound:
            if not strict:
                return CMap()
            raise
        return found

    def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
        """Return the font for spec, reusing a cached one when objid is known.

        :param objid: cache key; a falsy value disables both lookup and storage
        :param spec: font dictionary
        :raises PDFFontError: in strict mode, for a non-/Font Type, a missing
            Subtype or an unknown Subtype
        """
        if objid and objid in self._cached_fonts:
            return self._cached_fonts[objid]

        log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
        if settings.STRICT and spec["Type"] is not LITERAL_FONT:
            raise PDFFontError("Type is not /Font")
        font = self._create_font(spec)
        if objid and self.caching:
            self._cached_fonts[objid] = font
        return font

    def _create_font(self, spec: Mapping[str, object]) -> PDFFont:
        """Build a new font object according to the Subtype entry of spec."""
        if "Subtype" in spec:
            subtype = literal_name(spec["Subtype"])
        elif settings.STRICT:
            raise PDFFontError("Font Subtype is not specified.")
        else:
            subtype = "Type1"

        if subtype in ("Type1", "MMType1"):
            return PDFType1Font(self, spec)
        if subtype == "TrueType":
            return PDFTrueTypeFont(self, spec)
        if subtype == "Type3":
            return PDFType3Font(self, spec)
        if subtype in ("CIDFontType0", "CIDFontType2"):
            return PDFCIDFont(self, spec)
        if subtype == "Type0":
            # Composite font: build from the first descendant, carrying over
            # the parent's Encoding / ToUnicode entries when present.
            dfonts = list_value(spec["DescendantFonts"])
            assert dfonts
            subspec = dict_value(dfonts[0]).copy()
            for key in ("Encoding", "ToUnicode"):
                if key in spec:
                    subspec[key] = resolve1(spec[key])
            return self.get_font(None, subspec)

        if settings.STRICT:
            raise PDFFontError(f"Invalid Font spec: {spec!r}")
        return PDFType1Font(self, spec)  # this is so wrong!
class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
    """Parser that reads several content streams as one continuous input.

    The streams are opened lazily, one after another (see fillfp), and
    inline images (BI ... ID ... EI) are converted into PDFStream objects.
    """

    def __init__(self, streams: Sequence[object]) -> None:
        self.streams = streams
        self.istream = 0
        # PSStackParser.__init__(fp=None) is safe only because we've overloaded
        # all the methods that would attempt to access self.fp without first
        # calling self.fillfp().
        PSStackParser.__init__(self, None)  # type: ignore[arg-type]

    def fillfp(self) -> bool:
        """Open the next stream if no stream is currently open.

        :return: True when a new stream was opened, False otherwise
        :raises PSEOF: when no stream is open and none is left
        """
        if not self.fp:
            if self.istream < len(self.streams):
                strm = stream_value(self.streams[self.istream])
                self.istream += 1
            else:
                raise PSEOF("Unexpected EOF, file truncated?")
            self.fp = BytesIO(strm.get_data())
            return True
        return False

    def seek(self, pos: int) -> None:
        """Seek within the current stream, opening one first if necessary."""
        self.fillfp()
        PSStackParser.seek(self, pos)

    def fillbuf(self) -> bool:
        """Refill the read buffer once it has been consumed.

        :return: the result of the last fillfp() call made while refilling;
            False when the buffer still had unread data
        """
        if self.charpos < len(self.buf):
            return False
        new_stream = False
        while 1:
            new_stream = self.fillfp()
            self.bufpos = self.fp.tell()
            self.buf = self.fp.read(self.BUFSIZ)
            if self.buf:
                break
            # Current stream is exhausted: drop it so fillfp opens the next.
            self.fp = None  # type: ignore[assignment]
        self.charpos = 0
        return new_stream

    def get_inline_data(self, pos: int, target: bytes = b"EI") -> tuple[int, bytes]:
        """Read raw bytes from pos up to target followed by a whitespace byte.

        :param pos: position to start reading from
        :param target: end marker to search for
        :return: (pos, data) with the marker, the whitespace byte after it
            and one trailing end-of-line sequence removed from data
        """
        self.seek(pos)
        # i counts how many bytes of "target + whitespace" have been matched.
        i = 0
        data = b""
        while i <= len(target):
            self.fillbuf()
            if i:
                ci = self.buf[self.charpos]
                c = bytes((ci,))
                data += c
                self.charpos += 1
                if (len(target) <= i and c.isspace()) or (
                    i < len(target) and c == (bytes((target[i],)))
                ):
                    i += 1
                else:
                    i = 0
            else:
                try:
                    # Jump straight to the next candidate first byte.
                    j = self.buf.index(target[0], self.charpos)
                    data += self.buf[self.charpos : j + 1]
                    self.charpos = j + 1
                    i = 1
                except ValueError:
                    data += self.buf[self.charpos :]
                    self.charpos = len(self.buf)
        data = data[: -(len(target) + 1)]  # strip the last part
        data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
        return (pos, data)

    def flush(self) -> None:
        """Move everything on the parser stack to the results."""
        self.add_results(*self.popall())

    KEYWORD_BI = KWD(b"BI")
    KEYWORD_ID = KWD(b"ID")
    KEYWORD_EI = KWD(b"EI")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle a keyword token, assembling inline images.

        BI starts collecting the image dictionary; ID ends it, reads the
        image data and pushes a PDFStream (plus an EI keyword when EI was
        the end marker). Every other keyword is pushed unchanged.

        :raises PSTypeError: in strict mode, when the inline image
            dictionary is malformed
        """
        if token is self.KEYWORD_BI:
            # inline image within a content stream
            self.start_type(pos, "inline")
        elif token is self.KEYWORD_ID:
            try:
                (_, objs) = self.end_type("inline")
                if len(objs) % 2 != 0:
                    error_msg = f"Invalid dictionary construct: {objs!r}"
                    raise PSTypeError(error_msg)
                d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
                eos = b"EI"
                filters = d.get("F")
                if isinstance(filters, PSLiteral):
                    filters = [filters]
                # Only a non-empty filter array can select ASCII85; an empty
                # or non-array value is ignored instead of crashing on [0].
                if (
                    isinstance(filters, list)
                    and filters
                    and filters[0] in LITERALS_ASCII85_DECODE
                ):
                    eos = b"~>"
                (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
                if eos != b"EI":  # it may be necessary for decoding
                    data += eos
                obj = PDFStream(d, data)
                self.push((pos, obj))
                if eos == b"EI":  # otherwise it is still in the stream
                    self.push((pos, self.KEYWORD_EI))
            except PSTypeError:
                if settings.STRICT:
                    raise
        else:
            self.push((pos, token))
364# Types that may appear on the PDF argument stack.
365PDFStackT = PSStackType[PDFStream]
368class PDFPageInterpreter:
369 """Processor for the content of a PDF page
371 Reference: PDF Reference, Appendix A, Operator Summary
372 """
def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice) -> None:
    """Bind the interpreter to a resource manager and an output device."""
    self.rsrcmgr, self.device = rsrcmgr, device
    # IDs of the streams this interpreter is executing right now; used to
    # detect circular references.
    self.stream_ids: set[int] = set()
    # IDs of the streams being executed by the parent interpreters in the
    # call stack.
    self.parent_stream_ids: set[int] = set()
def dup(self) -> "PDFPageInterpreter":
    """Return a fresh interpreter of the same class sharing the resource
    manager and the device.
    """
    factory = self.__class__
    return factory(self.rsrcmgr, self.device)
def subinterp(self) -> "PDFPageInterpreter":
    """Create a sub-interpreter for processing nested content streams.

    In contrast to dup(), the returned interpreter inherits the stream ID
    bookkeeping: its parent_stream_ids receives both this interpreter's
    parent_stream_ids and its stream_ids, so circular references can be
    detected across nested Form XObject invocations.
    """
    child = self.dup()
    child.parent_stream_ids.update(self.parent_stream_ids, self.stream_ids)
    return child
def init_resources(self, resources: dict[object, object]) -> None:
    """Prepare the fonts and XObjects listed in the Resource attribute.

    Rebuilds fontmap, xobjmap and csmap (the latter starting from a copy of
    the predefined color spaces) and fills them from the Font, ColorSpace
    and XObject entries of resources. ProcSet entries are passed to the
    resource manager.

    :param resources: resource dictionary; a falsy value leaves the maps
        in their initial state
    """
    self.resources = resources
    self.fontmap: dict[object, PDFFont] = {}
    self.xobjmap: dict[object, object] = {}
    self.csmap: dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
    if not resources:
        return

    def get_colorspace(spec: object) -> PDFColorSpace | None:
        """Translate a color space spec; None when it is not recognized."""
        if isinstance(spec, list):
            if not spec:
                # Malformed empty array: there is no family name to read,
                # so treat it like any other unrecognized color space.
                return None
            name = literal_name(spec[0])
        else:
            name = literal_name(spec)
        if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2:
            return PDFColorSpace(name, int_value(stream_value(spec[1])["N"]))
        elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2:
            return PDFColorSpace(name, len(list_value(spec[1])))
        else:
            return PREDEFINED_COLORSPACE.get(name)

    for k, v in dict_value(resources).items():
        log.debug("Resource: %r: %r", k, v)
        if k == "Font":
            for fontid, spec in dict_value(v).items():
                # Only indirect references have an object id to cache by.
                objid = None
                if isinstance(spec, PDFObjRef):
                    objid = spec.objid
                spec = dict_value(spec)
                self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
        elif k == "ColorSpace":
            for csid, spec in dict_value(v).items():
                colorspace = get_colorspace(resolve1(spec))
                if colorspace is not None:
                    self.csmap[csid] = colorspace
        elif k == "ProcSet":
            self.rsrcmgr.get_procset(list_value(v))
        elif k == "XObject":
            for xobjid, xobjstrm in dict_value(v).items():
                self.xobjmap[xobjid] = xobjstrm
def init_state(self, ctm: Matrix) -> None:
    """Initialize the text and graphic states for rendering a page.

    :param ctm: initial current transformation matrix, also handed to the
        device
    """
    # Stack of saved (ctm, text state, graphic state) snapshots.
    self.gstack: list[tuple[Matrix, PDFTextState, PDFGraphicState]] = list()
    self.ctm = ctm
    self.device.set_ctm(self.ctm)
    self.textstate, self.graphicstate = PDFTextState(), PDFGraphicState()
    self.curpath: list[PathSegment] = list()
    # Operands collected for the next operator.
    self.argstack: list[PDFStackT] = list()
def push(self, obj: PDFStackT) -> None:
    """Put obj on top of the argument stack."""
    self.argstack.extend((obj,))
def pop(self, n: int) -> list[PDFStackT]:
    """Remove and return the top n entries of the argument stack.

    The entries keep their stack order. Fewer than n entries are returned
    when the stack is shorter than n.
    """
    if n == 0:
        return []
    remaining, taken = self.argstack[:-n], self.argstack[-n:]
    self.argstack = remaining
    return taken
def get_current_state(self) -> tuple[Matrix, PDFTextState, PDFGraphicState]:
    """Return (ctm, copy of the text state, copy of the graphic state)."""
    ctm = self.ctm
    text_snapshot = self.textstate.copy()
    graphic_snapshot = self.graphicstate.copy()
    return (ctm, text_snapshot, graphic_snapshot)
def set_current_state(
    self,
    state: tuple[Matrix, PDFTextState, PDFGraphicState],
) -> None:
    """Install a (ctm, text state, graphic state) triple and hand the ctm
    to the device.
    """
    ctm, textstate, graphicstate = state
    self.ctm = ctm
    self.textstate = textstate
    self.graphicstate = graphicstate
    self.device.set_ctm(self.ctm)
def do_q(self) -> None:
    """Save graphics state (push a snapshot onto the state stack)."""
    snapshot = self.get_current_state()
    self.gstack.append(snapshot)
def do_Q(self) -> None:
    """Restore graphics state; does nothing when no state was saved."""
    if not self.gstack:
        return
    self.set_current_state(self.gstack.pop())
def do_cm(
    self,
    a1: PDFStackT,
    b1: PDFStackT,
    c1: PDFStackT,
    d1: PDFStackT,
    e1: PDFStackT,
    f1: PDFStackT,
) -> None:
    """Concatenate matrix to current transformation matrix.

    Operands that cannot be converted to a matrix are reported with a
    warning and the ctm is left unchanged.
    """
    operands = (a1, b1, c1, d1, e1, f1)
    matrix = safe_matrix(*operands)
    if matrix is not None:
        self.ctm = mult_matrix(matrix, self.ctm)
        self.device.set_ctm(self.ctm)
        return

    log.warning(
        "Cannot concatenate matrix to current transformation matrix "
        "because not all values in %r can be parsed as floats",
        operands,
    )
def do_w(self, linewidth: PDFStackT) -> None:
    """Set line width.

    The stored width is the operand multiplied by the length of the first
    two ctm components taken as a vector. An operand that is not a valid
    float only produces a warning.
    """
    width = safe_float(linewidth)
    if width is not None:
        a, b = self.ctm[0], self.ctm[1]
        scale = (a**2 + b**2) ** 0.5
        self.graphicstate.linewidth = width * scale
        return

    log.warning(
        "Cannot set line width because %r is an invalid float value",
        linewidth,
    )
def do_J(self, linecap: PDFStackT) -> None:
    """Set line cap style (the operand is stored unchanged)."""
    state = self.graphicstate
    state.linecap = linecap
def do_j(self, linejoin: PDFStackT) -> None:
    """Set line join style (the operand is stored unchanged)."""
    state = self.graphicstate
    state.linejoin = linejoin
def do_M(self, miterlimit: PDFStackT) -> None:
    """Set miter limit (the operand is stored unchanged)."""
    state = self.graphicstate
    state.miterlimit = miterlimit
def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None:
    """Set line dash pattern, stored as the pair (dash, phase)."""
    pattern = (dash, phase)
    self.graphicstate.dash = pattern
def do_ri(self, intent: PDFStackT) -> None:
    """Set color rendering intent (the operand is stored unchanged)."""
    state = self.graphicstate
    state.intent = intent
def do_i(self, flatness: PDFStackT) -> None:
    """Set flatness tolerance (the operand is stored unchanged)."""
    state = self.graphicstate
    state.flatness = flatness
def do_gs(self, name: PDFStackT) -> None:
    """Set parameters from graphics state parameter dictionary.

    Not implemented: the operand is accepted and ignored.
    """
    # TODO
    return None
def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
    """Begin new subpath.

    Appends an ("m", x, y) segment to the current path; coordinates that
    are not valid floats only produce a warning.
    """
    coords = (safe_float(x), safe_float(y))
    if None not in coords:
        self.curpath.append(("m", *coords))
        return

    log.warning(
        "Cannot start new subpath because not all values "
        "in %r can be parsed as floats",
        ("m", x, y),
    )
def do_l(self, x: PDFStackT, y: PDFStackT) -> None:
    """Append straight line segment to path.

    Appends an ("l", x, y) segment to the current path; coordinates that
    are not valid floats only produce a warning.
    """
    coords = (safe_float(x), safe_float(y))
    if None not in coords:
        self.curpath.append(("l", *coords))
        return

    log.warning(
        "Cannot append straight line segment to path "
        "because not all values in %r can be parsed as floats",
        ("l", x, y),
    )
def do_c(
    self,
    x1: PDFStackT,
    y1: PDFStackT,
    x2: PDFStackT,
    y2: PDFStackT,
    x3: PDFStackT,
    y3: PDFStackT,
) -> None:
    """Append curved segment to path (three control points).

    Appends a ("c", x1, y1, x2, y2, x3, y3) segment; operands that are not
    valid floats only produce a warning.
    """
    raw = (x1, y1, x2, y2, x3, y3)
    parsed = tuple(safe_float(value) for value in raw)
    if all(value is not None for value in parsed):
        self.curpath.append(("c", *parsed))
        return

    log.warning(
        "Cannot append curved segment to path "
        "because not all values in %r can be parsed as floats",
        ("c", *raw),
    )
def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
    """Append curved segment to path (initial point replicated).

    Appends a ("v", x2, y2, x3, y3) segment; operands that are not valid
    floats only produce a warning.
    """
    raw = (x2, y2, x3, y3)
    parsed = tuple(safe_float(value) for value in raw)
    if all(value is not None for value in parsed):
        self.curpath.append(("v", *parsed))
        return

    log.warning(
        "Cannot append curved segment to path "
        "because not all values in %r can be parsed as floats",
        ("v", *raw),
    )
def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
    """Append curved segment to path (final point replicated).

    Appends a ("y", x1, y1, x3, y3) segment; operands that are not valid
    floats only produce a warning.
    """
    raw = (x1, y1, x3, y3)
    parsed = tuple(safe_float(value) for value in raw)
    if all(value is not None for value in parsed):
        self.curpath.append(("y", *parsed))
        return

    log.warning(
        "Cannot append curved segment to path "
        "because not all values in %r can be parsed as floats",
        ("y", *raw),
    )
def do_h(self) -> None:
    """Close subpath by appending an ("h",) segment to the current path."""
    close_segment = ("h",)
    self.curpath.extend([close_segment])
def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None:
    """Append rectangle to path.

    The rectangle is added as a move, three lines and a close segment.
    Operands that are not valid floats only produce a warning.
    """
    x_f, y_f, w_f, h_f = (safe_float(value) for value in (x, y, w, h))
    if x_f is None or y_f is None or w_f is None or h_f is None:
        log.warning(
            "Cannot append rectangle to path "
            "because not all values in %r can be parsed as floats",
            (x, y, w, h),
        )
        return

    right, top = x_f + w_f, y_f + h_f
    self.curpath.extend(
        [
            ("m", x_f, y_f),
            ("l", right, y_f),
            ("l", right, top),
            ("l", x_f, top),
            ("h",),
        ]
    )
def do_S(self) -> None:
    """Stroke path, then start over with an empty current path."""
    stroke, fill, evenodd = True, False, False
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd, self.curpath)
    self.curpath = list()
def do_s(self) -> None:
    """Close and stroke path (do_h followed by do_S)."""
    for step in (self.do_h, self.do_S):
        step()
def do_f(self) -> None:
    """Fill path using nonzero winding number rule, then start over with
    an empty current path.
    """
    stroke, fill, evenodd = False, True, False
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd, self.curpath)
    self.curpath = list()
def do_F(self) -> None:
    """Fill path using nonzero winding number rule (obsolete).

    The PDF specification defines F as equivalent to f and keeps it only
    for compatibility, so this delegates to do_f(): the path is painted
    and the current path is cleared instead of silently accumulating.
    """
    self.do_f()
def do_f_a(self) -> None:
    """Fill path using even-odd rule, then start over with an empty
    current path.
    """
    stroke, fill, evenodd = False, True, True
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd, self.curpath)
    self.curpath = list()
def do_B(self) -> None:
    """Fill and stroke path using nonzero winding number rule, then start
    over with an empty current path.
    """
    stroke, fill, evenodd = True, True, False
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd, self.curpath)
    self.curpath = list()
def do_B_a(self) -> None:
    """Fill and stroke path using even-odd rule, then start over with an
    empty current path.
    """
    stroke, fill, evenodd = True, True, True
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd, self.curpath)
    self.curpath = list()
def do_b(self) -> None:
    """Close, fill, and stroke path using nonzero winding number rule
    (do_h followed by do_B).
    """
    for step in (self.do_h, self.do_B):
        step()
def do_b_a(self) -> None:
    """Close, fill, and stroke path using even-odd rule
    (do_h followed by do_B_a).
    """
    for step in (self.do_h, self.do_B_a):
        step()
def do_n(self) -> None:
    """End path without filling or stroking (the current path is
    replaced by a new empty list).
    """
    self.curpath = list()
def do_W(self) -> None:
    """Set clipping path using nonzero winding number rule.

    Accepted but has no effect in this interpreter.
    """
    return None
def do_W_a(self) -> None:
    """Set clipping path using even-odd rule.

    Accepted but has no effect in this interpreter.
    """
    return None
def do_CS(self, name: PDFStackT) -> None:
    """Set color space for stroking operations

    Introduced in PDF 1.1

    :raises PDFInterpreterError: in strict mode, when the name is not a
        known color space; otherwise an unknown name is ignored
    """
    try:
        colorspace = self.csmap[literal_name(name)]
    except KeyError as err:
        if settings.STRICT:
            raise PDFInterpreterError(f"Undefined ColorSpace: {name!r}") from err
    else:
        self.graphicstate.scs = colorspace
def do_cs(self, name: PDFStackT) -> None:
    """Set color space for nonstroking operations

    :raises PDFInterpreterError: in strict mode, when the name is not a
        known color space; otherwise an unknown name is ignored
    """
    try:
        colorspace = self.csmap[literal_name(name)]
    except KeyError as err:
        if settings.STRICT:
            raise PDFInterpreterError(f"Undefined ColorSpace: {name!r}") from err
    else:
        self.graphicstate.ncs = colorspace
def do_G(self, gray: PDFStackT) -> None:
    """Set gray level for stroking operations.

    Also switches the stroking color space to DeviceGray. An operand that
    is not a valid float only produces a warning.
    """
    level = safe_float(gray)
    if level is not None:
        self.graphicstate.scolor = level
        self.graphicstate.scs = self.csmap["DeviceGray"]
        return

    log.warning(
        "Cannot set gray level because %r is an invalid float value",
        gray,
    )
def do_g(self, gray: PDFStackT) -> None:
    """Set gray level for nonstroking operations.

    Also switches the non-stroking color space to DeviceGray. An operand
    that is not a valid float only produces a warning.
    """
    level = safe_float(gray)
    if level is not None:
        self.graphicstate.ncolor = level
        self.graphicstate.ncs = self.csmap["DeviceGray"]
        return

    log.warning(
        "Cannot set gray level because %r is an invalid float value",
        gray,
    )
def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
    """Set RGB color for stroking operations.

    Also switches the stroking color space to DeviceRGB. Operands that
    cannot be parsed only produce a warning.
    """
    color = safe_rgb(r, g, b)
    if color is not None:
        self.graphicstate.scolor = color
        self.graphicstate.scs = self.csmap["DeviceRGB"]
        return

    log.warning(
        "Cannot set RGB stroke color "
        "because not all values in %r can be parsed as floats",
        (r, g, b),
    )
def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
    """Set RGB color for nonstroking operations.

    Also switches the non-stroking color space to DeviceRGB. Operands that
    cannot be parsed only produce a warning.
    """
    color = safe_rgb(r, g, b)
    if color is not None:
        self.graphicstate.ncolor = color
        self.graphicstate.ncs = self.csmap["DeviceRGB"]
        return

    log.warning(
        "Cannot set RGB non-stroke color "
        "because not all values in %r can be parsed as floats",
        (r, g, b),
    )
def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
    """Set CMYK color for stroking operations.

    Also switches the stroking color space to DeviceCMYK. Operands that
    cannot be parsed only produce a warning.
    """
    color = safe_cmyk(c, m, y, k)
    if color is not None:
        self.graphicstate.scolor = color
        self.graphicstate.scs = self.csmap["DeviceCMYK"]
        return

    log.warning(
        "Cannot set CMYK stroke color "
        "because not all values in %r can be parsed as floats",
        (c, m, y, k),
    )
def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
    """Set CMYK color for nonstroking operations.

    Also switches the non-stroking color space to DeviceCMYK. Operands
    that cannot be parsed only produce a warning.
    """
    color = safe_cmyk(c, m, y, k)
    if color is not None:
        self.graphicstate.ncolor = color
        self.graphicstate.ncs = self.csmap["DeviceCMYK"]
        return

    log.warning(
        "Cannot set CMYK non-stroke color "
        "because not all values in %r can be parsed as floats",
        (c, m, y, k),
    )
def _parse_color_components(
    self, components: list[PDFStackT], context: str
) -> StandardColor | None:
    """Turn raw operands into a gray, RGB or CMYK color.

    Args:
        components: operands; 1, 3 and 4 entries are understood
        context: wording inserted into warnings (e.g. "stroke")

    Returns:
        A float for gray, a tuple for RGB/CMYK, or None (after logging a
        warning) when the operands are invalid or their count is
        unsupported.
    """
    count = len(components)

    if count == 1:
        (raw_gray,) = components
        gray = safe_float(raw_gray)
        if gray is None:
            log.warning(
                "Cannot set %s color: %r is an invalid float value",
                context,
                raw_gray,
            )
        return gray

    if count == 3:
        rgb = safe_rgb(*components)
        if rgb is None:
            log.warning(
                "Cannot set %s color: components %r cannot be parsed as RGB",
                context,
                components,
            )
        return rgb

    if count == 4:
        cmyk = safe_cmyk(*components)
        if cmyk is None:
            log.warning(
                "Cannot set %s color: components %r cannot be parsed as CMYK",
                context,
                components,
            )
        return cmyk

    log.warning(
        "Cannot set %s color: %d components specified, "
        "but only 1 (grayscale), 3 (RGB), and 4 (CMYK) are supported",
        context,
        count,
    )
    return None
def do_SCN(self) -> None:
    """Set color for stroking operations.

    Pops as many operands as the stroking color space has components.
    Outside the Pattern color space they are parsed as gray/RGB/CMYK.
    Inside it (ISO 32000-1:2008 4.5.5, ISO 32000-2:2020 8.7.3):
    - a single operand is the pattern name (colored pattern, PaintType=1)
    - several operands are base color components followed by the pattern
      name (uncolored pattern, PaintType=2)
    Invalid operands produce a warning and leave the color unchanged.
    """
    colorspace = self.graphicstate.scs
    expected = colorspace.ncomponents
    components = self.pop(expected)

    if len(components) != expected:
        log.warning(
            "Cannot set stroke color because expected %d components but got %r",
            expected,
            components,
        )
        return

    if colorspace.name != "Pattern":
        # Standard colors (gray, RGB, CMYK) - common case
        color = self._parse_color_components(components, "stroke")
        if color is not None:
            self.graphicstate.scolor = color
        return

    if not components:
        return

    # Pattern color space: the last operand is always the pattern name.
    *base_components, pattern_component = components
    if not isinstance(pattern_component, PSLiteral):
        log.warning(
            "Pattern color space requires name object (PSLiteral), "
            "got %s: %r. "
            "Per ISO 32000 8.7.3.2, colored patterns use syntax '/name SCN'. "
            "Per ISO 32000 8.7.3.3, uncolored patterns use "
            "syntax 'c1...cn /name SCN'.",
            type(pattern_component).__name__,
            pattern_component,
        )
        return

    pattern_name = literal_name(pattern_component)

    if not base_components:
        # Colored tiling pattern (PaintType=1): just the pattern name.
        self.graphicstate.scolor = pattern_name
        log.debug("Set stroke pattern (colored): %s", pattern_name)
        return

    # Uncolored tiling pattern (PaintType=2): base color + pattern name.
    base_color = self._parse_color_components(
        base_components, "stroke (uncolored pattern)"
    )
    if base_color is None:
        return

    self.graphicstate.scolor = (base_color, pattern_name)
    log.debug("Set stroke pattern (uncolored): %s + %s", base_color, pattern_name)
def do_scn(self) -> None:
    """Set color for nonstroking operations.

    Pops as many operands as the non-stroking color space has components.
    Outside the Pattern color space they are parsed as gray/RGB/CMYK.
    Inside it (ISO 32000-1:2008 4.5.5, ISO 32000-2:2020 8.7.3):
    - a single operand is the pattern name (colored pattern, PaintType=1)
    - several operands are base color components followed by the pattern
      name (uncolored pattern, PaintType=2)
    Invalid operands produce a warning and leave the color unchanged.
    """
    colorspace = self.graphicstate.ncs
    expected = colorspace.ncomponents
    components = self.pop(expected)

    if len(components) != expected:
        log.warning(
            "Cannot set non-stroke color because expected %d components but got %r",
            expected,
            components,
        )
        return

    if colorspace.name != "Pattern":
        # Standard colors (gray, RGB, CMYK) - common case
        color = self._parse_color_components(components, "non-stroke")
        if color is not None:
            self.graphicstate.ncolor = color
        return

    if not components:
        return

    # Pattern color space: the last operand is always the pattern name.
    *base_components, pattern_component = components
    if not isinstance(pattern_component, PSLiteral):
        log.warning(
            "Pattern color space requires name object (PSLiteral), "
            "got %s: %r. "
            "Per ISO 32000 8.7.3.2, colored patterns use syntax '/name scn'. "
            "Per ISO 32000 8.7.3.3, uncolored patterns use "
            "syntax 'c1...cn /name scn'.",
            type(pattern_component).__name__,
            pattern_component,
        )
        return

    pattern_name = literal_name(pattern_component)

    if not base_components:
        # Colored tiling pattern (PaintType=1): just the pattern name.
        self.graphicstate.ncolor = pattern_name
        log.debug("Set non-stroke pattern (colored): %s", pattern_name)
        return

    # Uncolored tiling pattern (PaintType=2): base color + pattern name.
    base_color = self._parse_color_components(
        base_components, "non-stroke (uncolored pattern)"
    )
    if base_color is None:
        return

    self.graphicstate.ncolor = (base_color, pattern_name)
    log.debug(
        "Set non-stroke pattern (uncolored): %s + %s",
        base_color,
        pattern_name,
    )
def do_SC(self) -> None:
    """Set color for stroking operations (handled by do_SCN)."""
    handler = self.do_SCN
    handler()
def do_sc(self) -> None:
    """Set color for nonstroking operations (handled by do_scn)."""
    handler = self.do_scn
    handler()
def do_sh(self, name: object) -> None:
    """Paint area defined by shading pattern.

    Accepted but has no effect in this interpreter.
    """
    return None
def do_BT(self) -> None:
    """Begin text object

    Resets the text state, which puts the text matrix (Tm) and the text
    line matrix (Tlm) back to their initial values. Text objects cannot be
    nested; a second BT cannot appear before an ET.
    """
    text_state = self.textstate
    text_state.reset()
def do_ET(self) -> None:
    """End a text object (no action is taken)."""
    return None
def do_BX(self) -> None:
    """Begin compatibility section (no action is taken)."""
    return None
def do_EX(self) -> None:
    """End compatibility section (no action is taken)."""
    return None
def do_MP(self, tag: PDFStackT) -> None:
    """Define marked-content point.

    The tag is forwarded to the device; a tag that is not a PSLiteral
    only produces a warning.
    """
    if not isinstance(tag, PSLiteral):
        log.warning(
            "Cannot define marked-content point because %r is not a PSLiteral",
            tag,
        )
        return
    self.device.do_tag(tag)
def do_DP(self, tag: PDFStackT, props: PDFStackT) -> None:
    """Define marked-content point with property list.

    Tag and properties are forwarded to the device; a tag that is not a
    PSLiteral only produces a warning.
    """
    if not isinstance(tag, PSLiteral):
        log.warning(
            "Cannot define marked-content point with property list "
            "because %r is not a PSLiteral",
            tag,
        )
        return
    self.device.do_tag(tag, props)
def do_BMC(self, tag: PDFStackT) -> None:
    """Begin marked-content sequence.

    The tag is forwarded to the device; a tag that is not a PSLiteral
    only produces a warning.
    """
    if not isinstance(tag, PSLiteral):
        log.warning(
            "Cannot begin marked-content sequence because %r is not a PSLiteral",
            tag,
        )
        return
    self.device.begin_tag(tag)
def do_BDC(self, tag: PDFStackT, props: PDFStackT) -> None:
    """Begin marked-content sequence with property list.

    Tag and properties are forwarded to the device; a tag that is not a
    PSLiteral only produces a warning.
    """
    if not isinstance(tag, PSLiteral):
        log.warning(
            "Cannot begin marked-content sequence with property list "
            "because %r is not a PSLiteral",
            tag,
        )
        return
    self.device.begin_tag(tag, props)
def do_EMC(self) -> None:
    """End marked-content sequence (forwarded to the device)."""
    device = self.device
    device.end_tag()
def do_Tc(self, space: PDFStackT) -> None:
    """Set character spacing.

    Character spacing is used by the Tj, TJ, and ' operators. A value that
    is not a valid float only produces a warning.

    :param space: a number expressed in unscaled text space units.
    """
    charspace = safe_float(space)
    if charspace is not None:
        self.textstate.charspace = charspace
        return

    log.warning(
        "Could not set character spacing because %r is an invalid float value",
        space,
    )
def do_Tw(self, space: PDFStackT) -> None:
    """Handle the ``Tw`` operator (set word spacing).

    Word spacing is used by the Tj, TJ, and ' operators. When the operand
    cannot be converted by ``safe_float`` the text state is left untouched
    and a warning is logged.

    :param space: a number expressed in unscaled text space units
    """
    parsed = safe_float(space)
    if parsed is not None:
        self.textstate.wordspace = parsed
        return
    log.warning(
        "Could not set word spacing because %r is an invalid float value",
        space,
    )
def do_Tz(self, scale: PDFStackT) -> None:
    """Handle the ``Tz`` operator (set horizontal scaling).

    When the operand cannot be converted by ``safe_float`` the text state
    is left untouched and a warning is logged.

    :param scale: is a number specifying the percentage of the normal width
    """
    parsed = safe_float(scale)
    if parsed is not None:
        self.textstate.scaling = parsed
        return
    log.warning(
        "Could not set horizontal scaling because %r is an invalid float value",
        scale,
    )
def do_TL(self, leading: PDFStackT) -> None:
    """Handle the ``TL`` operator (set the text leading).

    Text leading is used only by the T*, ', and " operators. The value is
    stored on the text state with its sign inverted. When the operand
    cannot be converted by ``safe_float`` the text state is left untouched
    and a warning is logged.

    :param leading: a number expressed in unscaled text space units
    """
    parsed = safe_float(leading)
    if parsed is not None:
        # Stored negated: the text state keeps leading with inverted sign.
        self.textstate.leading = -parsed
        return
    log.warning(
        "Could not set text leading because %r is an invalid float value",
        leading,
    )
def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None:
    """Handle the ``Tf`` operator (select the text font and size).

    The font is looked up in ``self.fontmap`` by the literal name of
    ``fontid``. An unknown id raises in strict mode; otherwise the font
    returned by ``self.rsrcmgr.get_font(None, {})`` is used instead. A
    size that ``safe_float`` cannot convert is reported with a warning and
    the previous font size is kept.

    :param fontid: the name of a font resource in the Font subdictionary
        of the current resource dictionary
    :param fontsize: size is a number representing a scale factor.
    :raises PDFInterpreterError: in strict mode, when ``fontid`` is not
        present in the font map
    """
    try:
        font = self.fontmap[literal_name(fontid)]
    except KeyError as err:
        if settings.STRICT:
            raise PDFInterpreterError(f"Undefined Font id: {fontid!r}") from err
        font = self.rsrcmgr.get_font(None, {})
    self.textstate.font = font

    size = safe_float(fontsize)
    if size is not None:
        self.textstate.fontsize = size
        return
    log.warning(
        "Could not set text font because %r is an invalid float value",
        fontsize,
    )
def do_Tr(self, render: PDFStackT) -> None:
    """Handle the ``Tr`` operator (set the text rendering mode).

    When the operand cannot be converted by ``safe_int`` the text state is
    left untouched and a warning is logged.

    :param render: the rendering mode operand
    """
    mode = safe_int(render)
    if mode is not None:
        self.textstate.render = mode
        return
    log.warning(
        "Could not set text rendering mode because %r is an invalid int value",
        render,
    )
def do_Ts(self, rise: PDFStackT) -> None:
    """Handle the ``Ts`` operator (set the text rise).

    When the operand cannot be converted by ``safe_float`` the text state
    is left untouched and a warning is logged.

    :param rise: a number expressed in unscaled text space units
    """
    parsed = safe_float(rise)
    if parsed is not None:
        self.textstate.rise = parsed
        return
    log.warning(
        "Could not set text rise because %r is an invalid float value",
        rise,
    )
def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None:
    """Handle the ``Td`` operator (move to the start of the next line).

    The translation part of the text matrix is offset by (tx, ty)
    transformed through the matrix's linear part. Invalid operands leave
    the matrix unchanged (or raise in strict mode). The line matrix is
    reset to ``(0, 0)`` whenever the method returns normally.

    :param tx: horizontal offset from the start of the current line
    :param ty: vertical offset from the start of the current line
    :raises PDFValueError: in strict mode, when either offset is not a
        valid float
    """
    dx = safe_float(tx)
    dy = safe_float(ty)
    if dx is None or dy is None:
        if settings.STRICT:
            raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for Td")
    else:
        a, b, c, d, e, f = self.textstate.matrix
        self.textstate.matrix = (
            a,
            b,
            c,
            d,
            dx * a + dy * c + e,
            dx * b + dy * d + f,
        )

    self.textstate.linematrix = (0, 0)
def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None:
    """Handle the ``TD`` operator (move to the next line and set leading).

    Offset from the start of the current line by (tx, ty). As a side
    effect, this operator sets the leading parameter in the text state.
    Invalid operands leave the text matrix unchanged (or raise in strict
    mode). The line matrix is reset to ``(0, 0)`` whenever the method
    returns normally.

    :param tx: horizontal offset from the start of the current line
    :param ty: vertical offset from the start of the current line; also
        becomes the stored leading when it is a valid float
    :raises PDFValueError: in strict mode, when either offset is not a
        valid float
    """
    tx_ = safe_float(tx)
    ty_ = safe_float(ty)

    if tx_ is not None and ty_ is not None:
        (a, b, c, d, e, f) = self.textstate.matrix
        e_new = tx_ * a + ty_ * c + e
        f_new = tx_ * b + ty_ * d + f
        self.textstate.matrix = (a, b, c, d, e_new, f_new)
    elif settings.STRICT:
        # Must be an f-string so the offending operands are reported,
        # formatted the same way as the Td error message.
        raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for TD")

    if ty_ is not None:
        # do_TL stores the leading negated, so storing ty unchanged here
        # corresponds to a leading of -ty.
        self.textstate.leading = ty_

    self.textstate.linematrix = (0, 0)
def do_Tm(
    self,
    a: PDFStackT,
    b: PDFStackT,
    c: PDFStackT,
    d: PDFStackT,
    e: PDFStackT,
    f: PDFStackT,
) -> None:
    """Handle the ``Tm`` operator (set text matrix and text line matrix).

    The six operands are converted with ``safe_matrix``. On success the
    text matrix is replaced and the line matrix reset to ``(0, 0)``; on
    failure nothing is changed and a warning is logged.

    :param a: first matrix component
    :param b: second matrix component
    :param c: third matrix component
    :param d: fourth matrix component
    :param e: fifth matrix component
    :param f: sixth matrix component
    """
    operands = (a, b, c, d, e, f)
    parsed = safe_matrix(*operands)

    if parsed is not None:
        self.textstate.matrix = parsed
        self.textstate.linematrix = (0, 0)
        return

    log.warning(
        "Could not set text matrix because "
        "not all values in %r can be parsed as floats",
        operands,
    )
def do_T_a(self) -> None:
    """Handle the ``T*`` operator (move to the start of the next line).

    The translation part of the text matrix is advanced by the stored
    leading along the matrix's second row, and the line matrix is reset
    to ``(0, 0)``.
    """
    state = self.textstate
    a, b, c, d, e, f = state.matrix
    leading = state.leading
    state.matrix = (a, b, c, d, leading * c + e, leading * d + f)
    state.linematrix = (0, 0)
def do_TJ(self, seq: PDFStackT) -> None:
    """Handle the ``TJ`` operator (show text with glyph positioning).

    The sequence is handed to the device's ``render_string`` together
    with the text state, the graphic state's ``ncs`` and a copy of the
    graphic state. Without a selected font nothing is rendered.

    :param seq: the text sequence operand
    :raises PDFInterpreterError: in strict mode, when no font is set
    """
    if self.textstate.font is not None:
        self.device.render_string(
            self.textstate,
            cast(PDFTextSeq, seq),
            self.graphicstate.ncs,
            self.graphicstate.copy(),
        )
    elif settings.STRICT:
        raise PDFInterpreterError("No font specified!")
def do_Tj(self, s: PDFStackT) -> None:
    """Handle the ``Tj`` operator (show a text string).

    Implemented as ``TJ`` applied to a one-element sequence.

    :param s: the string operand to show
    """
    single_item_seq = [s]
    self.do_TJ(single_item_seq)
def do__q(self, s: PDFStackT) -> None:
    """Handle the ``'`` (single quote) operator.

    Moves to the next line (as ``T*`` does) and then shows the string
    (as ``TJ`` does with a one-element sequence).

    :param s: the string operand to show
    """
    self.do_T_a()
    single_item_seq = [s]
    self.do_TJ(single_item_seq)
def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None:
    """Set word and character spacing, move to next line, and show text

    The " (double quote) operator. It is defined as the equivalent of
    ``aw Tw ac Tc string '``, so after updating both spacings the text
    position must advance to the next line before the string is shown.

    :param aw: word spacing, handled as by the ``Tw`` operator
    :param ac: character spacing, handled as by the ``Tc`` operator
    :param s: the string operand to show
    """
    self.do_Tw(aw)
    self.do_Tc(ac)
    # Move to the next line first, exactly as the ' operator does;
    # without this the string is drawn on the current line.
    self.do_T_a()
    self.do_TJ([s])
def do_BI(self) -> None:
    """Handle the ``BI`` operator (begin an inline image object).

    This method performs no work.
    """
def do_ID(self) -> None:
    """Handle the ``ID`` operator (begin inline image data).

    This method performs no work.
    """
def do_EI(self, obj: PDFStackT) -> None:
    """Handle the ``EI`` operator (end an inline image object).

    When the operand is a ``PDFStream`` carrying both a ``W`` and an ``H``
    entry, it is rendered as an image inside a unit-square figure named
    after the object's ``id()``. Any other operand is ignored.

    :param obj: the inline image operand
    """
    if not isinstance(obj, PDFStream):
        return
    if "W" not in obj or "H" not in obj:
        return

    figure_name = str(id(obj))
    self.device.begin_figure(figure_name, (0, 0, 1, 1), MATRIX_IDENTITY)
    self.device.render_image(figure_name, obj)
    self.device.end_figure(figure_name)
def do_Do(self, xobjid_arg: PDFStackT) -> None:
    """Handle the ``Do`` operator (invoke a named XObject).

    The XObject is looked up in ``self.xobjmap``. A Form XObject with a
    ``BBox`` is rendered through a sub-interpreter inside a figure; an
    Image XObject with ``Width`` and ``Height`` is rendered as an image
    inside a unit-square figure. Anything else is ignored.

    :param xobjid_arg: the XObject name operand
    :raises PDFInterpreterError: in strict mode, when the name is not
        present in the XObject map
    """
    xobjid = literal_name(xobjid_arg)
    try:
        xobj = stream_value(self.xobjmap[xobjid])
    except KeyError as err:
        if not settings.STRICT:
            return
        raise PDFInterpreterError(f"Undefined xobject id: {xobjid!r}") from err

    log.debug("Processing xobj: %r", xobj)
    kind = xobj.get("Subtype")

    if kind is LITERAL_FORM and "BBox" in xobj:
        child = self.subinterp()
        bbox = cast(Rect, list_value(xobj["BBox"]))
        matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
        # According to PDF reference 1.7 section 4.9.1, XObjects in
        # earlier PDFs (prior to v1.2) use the page's Resources entry
        # instead of having their own Resources entry.
        own_resources = xobj.get("Resources")
        if own_resources:
            resources = dict_value(own_resources)
        else:
            resources = self.resources.copy()
        self.device.begin_figure(xobjid, bbox, matrix)
        child.render_contents(
            resources,
            [xobj],
            ctm=mult_matrix(matrix, self.ctm),
        )
        self.device.end_figure(xobjid)
        return

    if kind is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
        self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
        self.device.render_image(xobjid, xobj)
        self.device.end_figure(xobjid)

    # Any other XObject subtype is unsupported and ignored.
def process_page(self, page: PDFPage) -> None:
    """Render one page on the device.

    A current transformation matrix is chosen from the page's media box
    and its ``rotate`` value (90, 180 and 270 get dedicated matrices;
    every other value gets a plain translation by ``(-x0, -y0)``). The
    page contents are then rendered between the device's ``begin_page``
    and ``end_page`` calls.

    :param page: the page to process
    """
    log.debug("Processing page: %r", page)
    x0, y0, x1, y1 = page.mediabox
    rotation = page.rotate
    if rotation == 90:
        ctm = (0, -1, 1, 0, -y0, x1)
    elif rotation == 180:
        ctm = (-1, 0, 0, -1, x1, y1)
    elif rotation == 270:
        ctm = (0, 1, -1, 0, y1, -x0)
    else:
        ctm = (1, 0, 0, 1, -x0, -y0)

    device = self.device
    device.begin_page(page, ctm)
    self.render_contents(page.resources, page.contents, ctm=ctm)
    device.end_page(page)
def render_contents(
    self,
    resources: dict[object, object],
    streams: Sequence[object],
    ctm: Matrix = MATRIX_IDENTITY,
) -> None:
    """Render a set of content streams.

    Resources and interpreter state are initialised first, then the
    streams are executed. This method may be called recursively.

    :param resources: the resource dictionary passed to ``init_resources``
    :param streams: the content streams to execute
    :param ctm: the matrix passed to ``init_state``
    """
    log.debug(
        "render_contents: resources=%r, streams=%r, ctm=%r",
        resources,
        streams,
        ctm,
    )
    self.init_resources(resources)
    self.init_state(ctm)
    # Resolved only after the state is initialised, as in the original order.
    stream_list = list_value(streams)
    self.execute(stream_list)
def execute(self, streams: Sequence[object]) -> None:
    """Parse the content streams and dispatch each operator to ``do_*``.

    Circular references in content streams (including Form XObjects) are
    detected and skipped: a stream whose object id is already listed in
    ``self.parent_stream_ids`` is not executed, which prevents infinite
    recursion (CWE-835). Streams without an object id cannot be tracked
    and are skipped with a warning.

    Non-keyword objects are pushed on the operand stack. A keyword is
    mapped to a method name (``*`` -> ``_a``, ``"`` -> ``_w``,
    ``'`` -> ``_q``) and called with as many popped operands as the
    method declares; if fewer operands are available the call is skipped.

    :param streams: the content streams to execute
    :raises PDFInterpreterError: in strict mode, on an unknown operator
    """
    runnable: list[PDFStream] = []
    self.stream_ids.clear()
    for candidate in streams:
        stream = stream_value(candidate)
        objid = stream.objid
        if objid is None:
            # Inline streams without object IDs can't be tracked for circular refs
            log.warning(
                "Execute called on non-indirect object (inline image?) %r", stream
            )
            continue
        if objid in self.parent_stream_ids:
            log.warning(
                "Refusing to execute circular reference to content stream %d",
                objid,
            )
            continue
        runnable.append(stream)
        self.stream_ids.add(objid)

    try:
        parser = PDFContentParser(runnable)
    except PSEOF:
        # empty page
        return

    while True:
        try:
            _, token = parser.nextobject()
        except PSEOF:
            break

        if not isinstance(token, PSKeyword):
            # Operand: keep it until an operator consumes it.
            self.push(token)
            continue

        name = keyword_name(token)
        mangled = name.replace("*", "_a").replace('"', "_w").replace("'", "_q")
        method = f"do_{mangled}"
        if not hasattr(self, method):
            if settings.STRICT:
                error_msg = f"Unknown operator: {name!r}"
                raise PDFInterpreterError(error_msg)
            continue

        func = getattr(self, method)
        # Operand count is the handler's positional arity minus ``self``.
        nargs = func.__code__.co_argcount - 1
        if not nargs:
            log.debug("exec: %s", name)
            func()
            continue

        args = self.pop(nargs)
        log.debug("exec: %s %r", name, args)
        if len(args) == nargs:
            func(*args)