Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfinterp.py: 70%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2import re
3from collections.abc import Mapping, Sequence
4from io import BytesIO
5from typing import Union, cast
7from pdfminer import settings
8from pdfminer.casting import safe_cmyk, safe_float, safe_int, safe_matrix, safe_rgb
9from pdfminer.cmapdb import CMap, CMapBase, CMapDB
10from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
11from pdfminer.pdfdevice import PDFDevice, PDFTextSeq
12from pdfminer.pdfexceptions import PDFException, PDFValueError
13from pdfminer.pdffont import (
14 PDFCIDFont,
15 PDFFont,
16 PDFFontError,
17 PDFTrueTypeFont,
18 PDFType1Font,
19 PDFType3Font,
20)
21from pdfminer.pdfpage import PDFPage
22from pdfminer.pdftypes import (
23 LITERALS_ASCII85_DECODE,
24 PDFObjRef,
25 PDFStream,
26 dict_value,
27 list_value,
28 resolve1,
29 stream_value,
30)
31from pdfminer.psexceptions import PSEOF, PSTypeError
32from pdfminer.psparser import (
33 KWD,
34 LIT,
35 PSKeyword,
36 PSLiteral,
37 PSStackParser,
38 PSStackType,
39 keyword_name,
40 literal_name,
41)
42from pdfminer.utils import (
43 MATRIX_IDENTITY,
44 Matrix,
45 PathSegment,
46 Point,
47 Rect,
48 choplist,
49 mult_matrix,
50)
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
class PDFResourceError(PDFException):
    """PDFException subclass for resource-related errors."""
class PDFInterpreterError(PDFException):
    """PDFException subclass raised by the page interpreter.

    Within this module it is raised, in strict mode, when a content stream
    names a color space that is not defined.
    """
# Interned PostScript literals (names) that this module compares by identity
# (``is``), e.g. LITERAL_FONT against a font dictionary's "Type" entry.
LITERAL_PDF = LIT("PDF")
LITERAL_TEXT = LIT("Text")
LITERAL_FONT = LIT("Font")
LITERAL_FORM = LIT("Form")
LITERAL_IMAGE = LIT("Image")
class PDFTextState:
    """Mutable record of the interpreter's text-state parameters.

    Besides the selected font and the numeric text parameters, it carries
    the text matrix (``matrix``) and the text line matrix (``linematrix``),
    both of which are (re)initialised by :meth:`reset`.
    """

    matrix: Matrix
    linematrix: Point

    def __init__(self) -> None:
        """Create a text state with default parameters and reset matrices."""
        self.font: PDFFont | None = None
        self.fontsize: float = 0
        self.charspace: float = 0
        self.wordspace: float = 0
        self.scaling: float = 100
        self.leading: float = 0
        self.render: int = 0
        self.rise: float = 0
        # reset() is what assigns self.matrix and self.linematrix.
        self.reset()

    def __repr__(self) -> str:
        """Return a debug representation listing every text-state field."""
        shown = (
            "font",
            "fontsize",
            "charspace",
            "wordspace",
            "scaling",
            "leading",
            "render",
            "rise",
            "matrix",
            "linematrix",
        )
        body = ", ".join(f"{name}={getattr(self, name)!r}" for name in shown)
        return f"<PDFTextState: {body}>"

    def copy(self) -> "PDFTextState":
        """Return a new PDFTextState carrying the same field values."""
        duplicate = PDFTextState()
        for name in (
            "font",
            "fontsize",
            "charspace",
            "wordspace",
            "scaling",
            "leading",
            "render",
            "rise",
            "matrix",
            "linematrix",
        ):
            setattr(duplicate, name, getattr(self, name))
        return duplicate

    def reset(self) -> None:
        """Set the text matrix to identity and the line matrix to (0, 0)."""
        self.matrix, self.linematrix = MATRIX_IDENTITY, (0, 0)
# Standard color types (used standalone or as the base color of an
# uncolored pattern).
StandardColor = Union[
    float,  # Greyscale
    tuple[float, float, float],  # R, G, B
    tuple[float, float, float, float],  # C, M, Y, K
]

# Complete color type including patterns. This is the type of the
# ``scolor`` / ``ncolor`` attributes of PDFGraphicState.
Color = Union[
    StandardColor,  # Standard colors (gray, RGB, CMYK)
    str,  # Pattern name (colored pattern, PaintType=1)
    tuple[
        StandardColor, str
    ],  # (base_color, pattern_name) (uncolored pattern, PaintType=2)
]
class PDFGraphicState:
    """Mutable record of the interpreter's graphics-state parameters.

    Stores the line-drawing parameters plus the current stroking and
    non-stroking colors together with their color spaces.
    """

    def __init__(self) -> None:
        """Create a graphics state with default parameters."""
        self.linewidth: float = 0
        self.linecap: object | None = None
        self.linejoin: object | None = None
        self.miterlimit: object | None = None
        self.dash: tuple[object, object] | None = None
        self.intent: object | None = None
        self.flatness: object | None = None

        # Stroking color and its color space.
        self.scolor: Color = 0
        self.scs: PDFColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]

        # Non-stroking color and its color space.
        self.ncolor: Color = 0
        self.ncs: PDFColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]

    def copy(self) -> "PDFGraphicState":
        """Return a new PDFGraphicState carrying the same field values."""
        duplicate = PDFGraphicState()
        for name in (
            "linewidth",
            "linecap",
            "linejoin",
            "miterlimit",
            "dash",
            "intent",
            "flatness",
            "scolor",
            "scs",
            "ncolor",
            "ncs",
        ):
            setattr(duplicate, name, getattr(self, name))
        return duplicate

    def __repr__(self) -> str:
        """Return a debug representation (color spaces are not included)."""
        labelled = (
            ("linewidth", self.linewidth),
            ("linecap", self.linecap),
            ("linejoin", self.linejoin),
            ("miterlimit", self.miterlimit),
            ("dash", self.dash),
            ("intent", self.intent),
            ("flatness", self.flatness),
            ("stroking color", self.scolor),
            ("non stroking color", self.ncolor),
        )
        joined = ", ".join(f"{label}={value!r}" for label, value in labelled)
        return f"<PDFGraphicState: {joined}>"
class PDFResourceManager:
    """Repository of shared resources.

    ResourceManager facilitates reuse of shared resources
    such as fonts and images so that large objects are not
    allocated multiple times.
    """

    def __init__(self, caching: bool = True) -> None:
        """Create a manager; ``caching`` controls whether fonts are stored."""
        self.caching = caching
        self._cached_fonts: dict[object, PDFFont] = {}

    def get_procset(self, procs: Sequence[object]) -> None:
        """Walk the procedure-set list; no entry triggers any action."""
        for _ in procs:
            # Neither /PDF, /Text nor any other procset needs handling.
            continue

    def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase:
        """Look up a CMap by name.

        :param cmapname: name passed to ``CMapDB.get_cmap``.
        :param strict: when true, ``CMapDB.CMapNotFound`` is re-raised;
            otherwise an empty ``CMap`` is returned in its place.
        """
        try:
            cmap = CMapDB.get_cmap(cmapname)
        except CMapDB.CMapNotFound:
            if strict:
                raise
            cmap = CMap()
        return cmap

    def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
        """Return the font described by ``spec``, using the cache if possible.

        :param objid: key for the font cache; a falsy value bypasses it.
        :param spec: font dictionary; its "Subtype" selects the font class.
        :raises PDFFontError: only when ``settings.STRICT`` is set, for a
            wrong "Type", a missing "Subtype" or an unknown "Subtype".
        """
        if objid and objid in self._cached_fonts:
            return self._cached_fonts[objid]

        log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
        if settings.STRICT and spec["Type"] is not LITERAL_FONT:
            raise PDFFontError("Type is not /Font")

        # Work out which kind of font object to create.
        if "Subtype" in spec:
            subtype = literal_name(spec["Subtype"])
        elif settings.STRICT:
            raise PDFFontError("Font Subtype is not specified.")
        else:
            subtype = "Type1"

        if subtype == "Type0":
            # Composite font: build the first descendant font, carrying over
            # the parent's Encoding / ToUnicode entries when present.
            descendants = list_value(spec["DescendantFonts"])
            assert descendants
            subspec = dict_value(descendants[0]).copy()
            for key in ("Encoding", "ToUnicode"):
                if key in spec:
                    subspec[key] = resolve1(spec[key])
            font = self.get_font(None, subspec)
        elif subtype in ("Type1", "MMType1"):
            font = PDFType1Font(self, spec)
        elif subtype == "TrueType":
            font = PDFTrueTypeFont(self, spec)
        elif subtype == "Type3":
            font = PDFType3Font(self, spec)
        elif subtype in ("CIDFontType0", "CIDFontType2"):
            font = PDFCIDFont(self, spec)
        else:
            if settings.STRICT:
                raise PDFFontError(f"Invalid Font spec: {spec!r}")
            font = PDFType1Font(self, spec)  # this is so wrong!

        if objid and self.caching:
            self._cached_fonts[objid] = font
        return font
class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
    """Stack parser that reads a sequence of content streams as one input.

    The streams in ``self.streams`` are opened one after another as the
    buffer runs dry. Inline images (``BI ... ID ... EI``) are converted
    into ``PDFStream`` objects and pushed onto the parser stack.
    """

    def __init__(self, streams: Sequence[object]) -> None:
        """Remember ``streams`` and start before the first one."""
        self.streams = streams
        # Index of the next entry of self.streams to open.
        self.istream = 0
        # PSStackParser.__init__(fp=None) is safe only because we've overloaded
        # all the methods that would attempt to access self.fp without first
        # calling self.fillfp().
        PSStackParser.__init__(self, None)  # type: ignore[arg-type]

    def fillfp(self) -> bool:
        """Make sure ``self.fp`` is open, opening the next stream if needed.

        :return: True if a new stream was opened, False if one was open.
        :raises PSEOF: when no stream is open and none are left.
        """
        if not self.fp:
            if self.istream < len(self.streams):
                strm = stream_value(self.streams[self.istream])
                self.istream += 1
            else:
                raise PSEOF("Unexpected EOF, file truncated?")
            self.fp = BytesIO(strm.get_data())
            return True
        return False

    def seek(self, pos: int) -> None:
        """Seek to ``pos`` after making sure a stream is open."""
        self.fillfp()
        PSStackParser.seek(self, pos)

    def fillbuf(self) -> bool:
        """Refill ``self.buf`` once it has been fully consumed.

        :return: False if the buffer still holds unread bytes; otherwise the
            result of the last ``fillfp()`` call made while refilling.
        """
        if self.charpos < len(self.buf):
            return False
        new_stream = False
        while 1:
            new_stream = self.fillfp()
            self.bufpos = self.fp.tell()
            self.buf = self.fp.read(self.BUFSIZ)
            if self.buf:
                break
            # Current stream is exhausted: drop it so that the next fillfp()
            # call opens the following one (or raises PSEOF).
            self.fp = None  # type: ignore[assignment]
        self.charpos = 0
        return new_stream

    def get_inline_data(self, pos: int, target: bytes = b"EI") -> tuple[int, bytes]:
        """Collect raw bytes from ``pos`` up to the ``target`` end marker.

        Scanning stops once ``target`` followed by one whitespace byte has
        been read; that marker and whitespace byte, plus one trailing end of
        line, are removed from the returned data.

        :return: ``(pos, data)`` with ``pos`` unchanged.
        """
        self.seek(pos)
        # i counts how many bytes of "target + whitespace" have been matched.
        i = 0
        data = b""
        while i <= len(target):
            self.fillbuf()
            if i:
                # Part of the marker is matched: check the next single byte.
                ci = self.buf[self.charpos]
                c = bytes((ci,))
                data += c
                self.charpos += 1
                if (len(target) <= i and c.isspace()) or (
                    i < len(target) and c == (bytes((target[i],)))
                ):
                    i += 1
                else:
                    i = 0
            else:
                # Nothing matched yet: jump to the next occurrence of the
                # marker's first byte, or consume the rest of the buffer.
                try:
                    j = self.buf.index(target[0], self.charpos)
                    data += self.buf[self.charpos : j + 1]
                    self.charpos = j + 1
                    i = 1
                except ValueError:
                    data += self.buf[self.charpos :]
                    self.charpos = len(self.buf)
        data = data[: -(len(target) + 1)]  # strip the last part
        data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
        return (pos, data)

    def flush(self) -> None:
        """Move everything on the parser stack to the results."""
        self.add_results(*self.popall())

    # Keywords delimiting an inline image; compared by identity in do_keyword.
    KEYWORD_BI = KWD(b"BI")
    KEYWORD_ID = KWD(b"ID")
    KEYWORD_EI = KWD(b"EI")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle a keyword token, assembling inline images on the way.

        ``BI`` opens an "inline" context. ``ID`` closes it, builds the image
        dictionary from the collected key/value pairs, reads the image data
        and pushes a ``PDFStream``. Any other keyword is pushed unchanged.

        :raises PSTypeError: only when ``settings.STRICT`` is set; otherwise
            a PSTypeError raised while handling ``ID`` is swallowed.
        """
        if token is self.KEYWORD_BI:
            # inline image within a content stream
            self.start_type(pos, "inline")
        elif token is self.KEYWORD_ID:
            try:
                (_, objs) = self.end_type("inline")
                if len(objs) % 2 != 0:
                    error_msg = f"Invalid dictionary construct: {objs!r}"
                    raise PSTypeError(error_msg)
                d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
                eos = b"EI"
                filter = d.get("F")
                if filter is not None:
                    if isinstance(filter, PSLiteral):
                        filter = [filter]
                    # When the first filter is ASCII85, scan for the "~>"
                    # marker instead of "EI".
                    if filter[0] in LITERALS_ASCII85_DECODE:
                        eos = b"~>"
                (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
                if eos != b"EI":  # it may be necessary for decoding
                    data += eos
                obj = PDFStream(d, data)
                self.push((pos, obj))
                if eos == b"EI":  # otherwise it is still in the stream
                    self.push((pos, self.KEYWORD_EI))
            except PSTypeError:
                if settings.STRICT:
                    raise
        else:
            self.push((pos, token))
# Types that may appear on the PDF argument stack: PSStackType
# parameterised with PDFStream.
PDFStackT = PSStackType[PDFStream]
367class PDFPageInterpreter:
368 """Processor for the content of a PDF page
370 Reference: PDF Reference, Appendix A, Operator Summary
371 """
def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice) -> None:
    """Bind the interpreter to a resource manager and an output device."""
    self.rsrcmgr, self.device = rsrcmgr, device
    # IDs of the streams currently being executed by this interpreter; used
    # to detect circular references.
    self.stream_ids: set[int] = set()
    # IDs of the streams being executed by parent interpreters in the call
    # stack.
    self.parent_stream_ids: set[int] = set()
def dup(self) -> "PDFPageInterpreter":
    """Return a new interpreter of the same class sharing rsrcmgr and device."""
    interpreter_cls = self.__class__
    return interpreter_cls(self.rsrcmgr, self.device)
def subinterp(self) -> "PDFPageInterpreter":
    """Create a sub-interpreter for processing nested content streams.

    Used when invoking Form XObjects. In contrast to dup(), the new
    interpreter inherits the stream IDs tracked by this interpreter and by
    its parents, so circular references can be detected across nested
    XObject invocations.
    """
    child = self.dup()
    child.parent_stream_ids.update(self.parent_stream_ids, self.stream_ids)
    return child
def init_resources(self, resources: dict[object, object]) -> None:
    """Prepare the fonts, color spaces and XObjects named in ``resources``.

    Resets ``fontmap``, ``xobjmap`` and ``csmap`` (the latter to a copy of
    the predefined color spaces) and then fills them from the "Font",
    "ColorSpace" and "XObject" entries of the resource dictionary.
    """
    self.resources = resources
    self.fontmap: dict[object, PDFFont] = {}
    self.xobjmap = {}
    self.csmap: dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
    if not resources:
        return

    def get_colorspace(spec: object) -> PDFColorSpace | None:
        # A color space is either a bare name or an array whose first item
        # is the name and whose second item parameterises it.
        is_array = isinstance(spec, list)
        name = literal_name(spec[0] if is_array else spec)
        has_params = is_array and len(spec) >= 2
        if has_params and name == "ICCBased":
            return PDFColorSpace(name, stream_value(spec[1])["N"])
        if has_params and name == "DeviceN":
            return PDFColorSpace(name, len(list_value(spec[1])))
        return PREDEFINED_COLORSPACE.get(name)

    for category, value in dict_value(resources).items():
        log.debug("Resource: %r: %r", category, value)
        if category == "Font":
            for fontid, spec in dict_value(value).items():
                # Only indirect references carry an object id to cache on.
                objid = spec.objid if isinstance(spec, PDFObjRef) else None
                self.fontmap[fontid] = self.rsrcmgr.get_font(
                    objid, dict_value(spec)
                )
        elif category == "ColorSpace":
            for csid, spec in dict_value(value).items():
                colorspace = get_colorspace(resolve1(spec))
                if colorspace is not None:
                    self.csmap[csid] = colorspace
        elif category == "ProcSet":
            self.rsrcmgr.get_procset(list_value(value))
        elif category == "XObject":
            self.xobjmap.update(dict_value(value))
def init_state(self, ctm: Matrix) -> None:
    """Initialize the text and graphic states for rendering a page.

    :param ctm: initial current transformation matrix; it is also handed to
        the device via ``set_ctm``.
    """
    # Stack of saved (ctm, text state, graphic state) triples.
    self.gstack: list[tuple[Matrix, PDFTextState, PDFGraphicState]] = []
    self.ctm = ctm
    self.device.set_ctm(self.ctm)
    self.textstate = PDFTextState()
    self.graphicstate = PDFGraphicState()
    # Path segments collected since the last painting operator.
    self.curpath: list[PathSegment] = []
    # Stack of operands waiting for their operator.
    self.argstack: list[PDFStackT] = []
def push(self, obj: PDFStackT) -> None:
    """Append ``obj`` to the top of the argument stack."""
    stack = self.argstack
    stack.append(obj)
def pop(self, n: int) -> list[PDFStackT]:
    """Remove and return the top ``n`` items of the argument stack.

    The items come back in stack order (oldest first). ``n == 0`` returns an
    empty list and leaves the stack untouched.
    """
    if n == 0:
        return []
    remaining, popped = self.argstack[:-n], self.argstack[-n:]
    self.argstack = remaining
    return popped
def get_current_state(self) -> tuple[Matrix, PDFTextState, PDFGraphicState]:
    """Return the CTM plus copies of the text and graphic states."""
    text_snapshot = self.textstate.copy()
    graphic_snapshot = self.graphicstate.copy()
    return (self.ctm, text_snapshot, graphic_snapshot)
def set_current_state(
    self,
    state: tuple[Matrix, PDFTextState, PDFGraphicState],
) -> None:
    """Install a saved (ctm, text state, graphic state) triple.

    The restored CTM is also pushed to the device via ``set_ctm``.
    """
    ctm, textstate, graphicstate = state
    self.ctm = ctm
    self.textstate = textstate
    self.graphicstate = graphicstate
    self.device.set_ctm(self.ctm)
def do_q(self) -> None:
    """Save graphics state (operator ``q``) on the graphics-state stack."""
    snapshot = self.get_current_state()
    self.gstack.append(snapshot)
def do_Q(self) -> None:
    """Restore graphics state (operator ``Q``).

    Does nothing when the graphics-state stack is empty.
    """
    if not self.gstack:
        return
    self.set_current_state(self.gstack.pop())
def do_cm(
    self,
    a1: PDFStackT,
    b1: PDFStackT,
    c1: PDFStackT,
    d1: PDFStackT,
    e1: PDFStackT,
    f1: PDFStackT,
) -> None:
    """Concatenate matrix to current transformation matrix.

    The six operands are converted with ``safe_matrix``; if that fails a
    warning is logged and the CTM is left unchanged.
    """
    operands = (a1, b1, c1, d1, e1, f1)
    matrix = safe_matrix(*operands)
    if matrix is not None:
        self.ctm = mult_matrix(matrix, self.ctm)
        self.device.set_ctm(self.ctm)
        return

    log.warning(
        "Cannot concatenate matrix to current transformation matrix "
        "because not all values in %r can be parsed as floats",
        operands,
    )
def do_w(self, linewidth: PDFStackT) -> None:
    """Set line width.

    The stored width is the operand multiplied by the length of the vector
    formed by the first two CTM entries. An operand that cannot be parsed
    as a float is reported with a warning and otherwise ignored.
    """
    linewidth_f = safe_float(linewidth)
    if linewidth_f is not None:
        scale = (self.ctm[0] ** 2 + self.ctm[1] ** 2) ** 0.5
        self.graphicstate.linewidth = linewidth_f * scale
        return

    log.warning(
        "Cannot set line width because %r is an invalid float value",
        linewidth,
    )
def do_J(self, linecap: PDFStackT) -> None:
    """Set line cap style; the operand is stored unvalidated."""
    state = self.graphicstate
    state.linecap = linecap
def do_j(self, linejoin: PDFStackT) -> None:
    """Set line join style; the operand is stored unvalidated."""
    state = self.graphicstate
    state.linejoin = linejoin
def do_M(self, miterlimit: PDFStackT) -> None:
    """Set miter limit; the operand is stored unvalidated."""
    state = self.graphicstate
    state.miterlimit = miterlimit
def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None:
    """Set line dash pattern, stored as the pair ``(dash, phase)``."""
    pattern = (dash, phase)
    self.graphicstate.dash = pattern
def do_ri(self, intent: PDFStackT) -> None:
    """Set color rendering intent; the operand is stored unvalidated."""
    state = self.graphicstate
    state.intent = intent
def do_i(self, flatness: PDFStackT) -> None:
    """Set flatness tolerance; the operand is stored unvalidated."""
    state = self.graphicstate
    state.flatness = flatness
def do_gs(self, name: PDFStackT) -> None:
    """Set parameters from graphics state parameter dictionary.

    Not implemented yet: the operand is accepted and ignored.
    """
    # TODO
def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
    """Begin new subpath.

    Appends an ``("m", x, y)`` segment to the current path. If either
    operand cannot be parsed as a float, a warning is logged and nothing is
    appended.
    """
    x_f, y_f = safe_float(x), safe_float(y)
    if x_f is not None and y_f is not None:
        self.curpath.append(("m", x_f, y_f))
        return

    log.warning(
        "Cannot start new subpath because not all values "
        "in %r can be parsed as floats",
        ("m", x, y),
    )
def do_l(self, x: PDFStackT, y: PDFStackT) -> None:
    """Append straight line segment to path.

    Appends an ``("l", x, y)`` segment to the current path. If either
    operand cannot be parsed as a float, a warning is logged and nothing is
    appended.
    """
    x_f, y_f = safe_float(x), safe_float(y)
    if x_f is not None and y_f is not None:
        self.curpath.append(("l", x_f, y_f))
        return

    log.warning(
        "Cannot append straight line segment to path "
        "because not all values in %r can be parsed as floats",
        ("l", x, y),
    )
def do_c(
    self,
    x1: PDFStackT,
    y1: PDFStackT,
    x2: PDFStackT,
    y2: PDFStackT,
    x3: PDFStackT,
    y3: PDFStackT,
) -> None:
    """Append curved segment to path (three control points).

    Appends a ``("c", x1, y1, x2, y2, x3, y3)`` segment. If any operand
    cannot be parsed as a float, a warning is logged and nothing is
    appended.
    """
    raw = (x1, y1, x2, y2, x3, y3)
    parsed = tuple(safe_float(value) for value in raw)
    if any(value is None for value in parsed):
        log.warning(
            "Cannot append curved segment to path "
            "because not all values in %r can be parsed as floats",
            ("c", *raw),
        )
        return

    self.curpath.append(("c", *parsed))
def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
    """Append curved segment to path (initial point replicated).

    Appends a ``("v", x2, y2, x3, y3)`` segment. If any operand cannot be
    parsed as a float, a warning is logged and nothing is appended.
    """
    raw = (x2, y2, x3, y3)
    parsed = tuple(safe_float(value) for value in raw)
    if any(value is None for value in parsed):
        log.warning(
            "Cannot append curved segment to path "
            "because not all values in %r can be parsed as floats",
            ("v", *raw),
        )
        return

    self.curpath.append(("v", *parsed))
def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
    """Append curved segment to path (final point replicated).

    Appends a ``("y", x1, y1, x3, y3)`` segment. If any operand cannot be
    parsed as a float, a warning is logged and nothing is appended.
    """
    raw = (x1, y1, x3, y3)
    parsed = tuple(safe_float(value) for value in raw)
    if any(value is None for value in parsed):
        log.warning(
            "Cannot append curved segment to path "
            "because not all values in %r can be parsed as floats",
            ("y", *raw),
        )
        return

    self.curpath.append(("y", *parsed))
def do_h(self) -> None:
    """Close subpath by appending an ``("h",)`` segment to the path."""
    close_segment = ("h",)
    self.curpath.append(close_segment)
def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None:
    """Append rectangle to path.

    The rectangle is added as one move, three lines and a closing ``h``
    segment. If any operand cannot be parsed as a float, a warning is
    logged and nothing is appended.
    """
    x_f, y_f = safe_float(x), safe_float(y)
    w_f, h_f = safe_float(w), safe_float(h)

    if None in (x_f, y_f, w_f, h_f):
        log.warning(
            "Cannot append rectangle to path "
            "because not all values in %r can be parsed as floats",
            (x, y, w, h),
        )
        return

    self.curpath.extend(
        (
            ("m", x_f, y_f),
            ("l", x_f + w_f, y_f),
            ("l", x_f + w_f, y_f + h_f),
            ("l", x_f, y_f + h_f),
            ("h",),
        )
    )
def do_S(self) -> None:
    """Stroke path.

    Hands the current path to the device, then starts a new empty path.
    """
    finished_path = self.curpath
    self.device.paint_path(self.graphicstate, True, False, False, finished_path)
    self.curpath = []
def do_s(self) -> None:
    """Close and stroke path (``h`` followed by ``S``)."""
    for step in (self.do_h, self.do_S):
        step()
def do_f(self) -> None:
    """Fill path using nonzero winding number rule.

    Hands the current path to the device, then starts a new empty path.
    """
    finished_path = self.curpath
    self.device.paint_path(self.graphicstate, False, True, False, finished_path)
    self.curpath = []
def do_F(self) -> None:
    """Fill path using nonzero winding number rule (obsolete).

    The PDF specification defines ``F`` as equivalent to ``f`` and keeps it
    only for compatibility, so this delegates to :meth:`do_f`. Without the
    delegation the path would neither be painted nor cleared, and its
    segments would leak into the next painting operator.
    """
    self.do_f()
def do_f_a(self) -> None:
    """Fill path using even-odd rule.

    Hands the current path to the device, then starts a new empty path.
    """
    finished_path = self.curpath
    self.device.paint_path(self.graphicstate, False, True, True, finished_path)
    self.curpath = []
def do_B(self) -> None:
    """Fill and stroke path using nonzero winding number rule.

    Hands the current path to the device, then starts a new empty path.
    """
    finished_path = self.curpath
    self.device.paint_path(self.graphicstate, True, True, False, finished_path)
    self.curpath = []
def do_B_a(self) -> None:
    """Fill and stroke path using even-odd rule.

    Hands the current path to the device, then starts a new empty path.
    """
    finished_path = self.curpath
    self.device.paint_path(self.graphicstate, True, True, True, finished_path)
    self.curpath = []
def do_b(self) -> None:
    """Close, fill, and stroke path using nonzero winding number rule."""
    for step in (self.do_h, self.do_B):
        step()
def do_b_a(self) -> None:
    """Close, fill, and stroke path using even-odd rule."""
    for step in (self.do_h, self.do_B_a):
        step()
def do_n(self) -> None:
    """End path without filling or stroking; the path is discarded."""
    self.curpath = list()
def do_W(self) -> None:
    """Set clipping path using nonzero winding number rule.

    Accepted but has no effect in this interpreter.
    """
def do_W_a(self) -> None:
    """Set clipping path using even-odd rule.

    Accepted but has no effect in this interpreter.
    """
def do_CS(self, name: PDFStackT) -> None:
    """Set color space for stroking operations.

    Introduced in PDF 1.1

    :raises PDFInterpreterError: in strict mode, when ``name`` is not a
        known color space; otherwise an unknown name is silently ignored.
    """
    try:
        colorspace = self.csmap[literal_name(name)]
    except KeyError as err:
        if settings.STRICT:
            raise PDFInterpreterError(f"Undefined ColorSpace: {name!r}") from err
        return
    self.graphicstate.scs = colorspace
def do_cs(self, name: PDFStackT) -> None:
    """Set color space for nonstroking operations.

    :raises PDFInterpreterError: in strict mode, when ``name`` is not a
        known color space; otherwise an unknown name is silently ignored.
    """
    try:
        colorspace = self.csmap[literal_name(name)]
    except KeyError as err:
        if settings.STRICT:
            raise PDFInterpreterError(f"Undefined ColorSpace: {name!r}") from err
        return
    self.graphicstate.ncs = colorspace
def do_G(self, gray: PDFStackT) -> None:
    """Set gray level for stroking operations.

    Also switches the stroking color space to DeviceGray. An operand that
    cannot be parsed as a float is reported with a warning and ignored.
    """
    gray_f = safe_float(gray)
    if gray_f is not None:
        self.graphicstate.scolor = gray_f
        self.graphicstate.scs = self.csmap["DeviceGray"]
        return

    log.warning(
        "Cannot set gray level because %r is an invalid float value",
        gray,
    )
def do_g(self, gray: PDFStackT) -> None:
    """Set gray level for nonstroking operations.

    Also switches the non-stroking color space to DeviceGray. An operand
    that cannot be parsed as a float is reported with a warning and ignored.
    """
    gray_f = safe_float(gray)
    if gray_f is not None:
        self.graphicstate.ncolor = gray_f
        self.graphicstate.ncs = self.csmap["DeviceGray"]
        return

    log.warning(
        "Cannot set gray level because %r is an invalid float value",
        gray,
    )
def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
    """Set RGB color for stroking operations.

    Also switches the stroking color space to DeviceRGB. Operands that
    ``safe_rgb`` rejects are reported with a warning and ignored.
    """
    rgb = safe_rgb(r, g, b)
    if rgb is not None:
        self.graphicstate.scolor = rgb
        self.graphicstate.scs = self.csmap["DeviceRGB"]
        return

    log.warning(
        "Cannot set RGB stroke color "
        "because not all values in %r can be parsed as floats",
        (r, g, b),
    )
def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
    """Set RGB color for nonstroking operations.

    Also switches the non-stroking color space to DeviceRGB. Operands that
    ``safe_rgb`` rejects are reported with a warning and ignored.
    """
    rgb = safe_rgb(r, g, b)
    if rgb is not None:
        self.graphicstate.ncolor = rgb
        self.graphicstate.ncs = self.csmap["DeviceRGB"]
        return

    log.warning(
        "Cannot set RGB non-stroke color "
        "because not all values in %r can be parsed as floats",
        (r, g, b),
    )
def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
    """Set CMYK color for stroking operations.

    Also switches the stroking color space to DeviceCMYK. Operands that
    ``safe_cmyk`` rejects are reported with a warning and ignored.
    """
    cmyk = safe_cmyk(c, m, y, k)
    if cmyk is not None:
        self.graphicstate.scolor = cmyk
        self.graphicstate.scs = self.csmap["DeviceCMYK"]
        return

    log.warning(
        "Cannot set CMYK stroke color "
        "because not all values in %r can be parsed as floats",
        (c, m, y, k),
    )
def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
    """Set CMYK color for nonstroking operations.

    Also switches the non-stroking color space to DeviceCMYK. Operands that
    ``safe_cmyk`` rejects are reported with a warning and ignored.
    """
    cmyk = safe_cmyk(c, m, y, k)
    if cmyk is not None:
        self.graphicstate.ncolor = cmyk
        self.graphicstate.ncs = self.csmap["DeviceCMYK"]
        return

    log.warning(
        "Cannot set CMYK non-stroke color "
        "because not all values in %r can be parsed as floats",
        (c, m, y, k),
    )
def _parse_color_components(
    self, components: list[PDFStackT], context: str
) -> StandardColor | None:
    """Parse color components into StandardColor (gray, RGB, or CMYK).

    Args:
        components: List of 1, 3, or 4 numeric color components
        context: Description for error messages (e.g., "stroke", "non-stroke")

    Returns:
        Parsed color (float for gray, tuple for RGB/CMYK) or None if invalid.
        Every failure is reported through a logged warning.
    """
    count = len(components)

    if count not in (1, 3, 4):
        log.warning(
            "Cannot set %s color: %d components specified, "
            "but only 1 (grayscale), 3 (RGB), and 4 (CMYK) are supported",
            context,
            count,
        )
        return None

    if count == 1:
        gray = safe_float(components[0])
        if gray is None:
            log.warning(
                "Cannot set %s color: %r is an invalid float value",
                context,
                components[0],
            )
        return gray

    if count == 3:
        rgb = safe_rgb(*components)
        if rgb is None:
            log.warning(
                "Cannot set %s color: components %r cannot be parsed as RGB",
                context,
                components,
            )
        return rgb

    # Only the four-component case is left.
    cmyk = safe_cmyk(*components)
    if cmyk is None:
        log.warning(
            "Cannot set %s color: components %r cannot be parsed as CMYK",
            context,
            components,
        )
    return cmyk
def do_SCN(self) -> None:
    """Set color for stroking operations.

    Pops as many operands as the stroking color space has components.

    Handles Pattern color spaces per ISO 32000-1:2008 4.5.5 (PDF 1.7)
    and ISO 32000-2:2020 8.7.3 (PDF 2.0):
    - Colored patterns (PaintType=1): single operand (pattern name)
    - Uncolored patterns (PaintType=2): n+1 operands (colors + pattern name)
    """
    colorspace = self.graphicstate.scs
    expected = colorspace.ncomponents

    operands = self.pop(expected)
    if len(operands) != expected:
        log.warning(
            "Cannot set stroke color because expected %d components but got %r",
            expected,
            operands,
        )
        return

    if colorspace.name != "Pattern":
        # Standard colors (gray, RGB, CMYK) - the common case.
        color = self._parse_color_components(operands, "stroke")
        if color is not None:
            self.graphicstate.scolor = color
        return

    if not operands:
        return

    # Pattern color space (ISO 32000 8.7.3.2-3): the last operand is always
    # the pattern name, anything before it is the base color.
    *base_operands, pattern_operand = operands

    # Per spec the pattern name must be a name object (PSLiteral).
    if not isinstance(pattern_operand, PSLiteral):
        log.warning(
            "Pattern color space requires name object (PSLiteral), "
            "got %s: %r. "
            "Per ISO 32000 8.7.3.2, colored patterns use syntax '/name SCN'. "
            "Per ISO 32000 8.7.3.3, uncolored patterns use "
            "syntax 'c1...cn /name SCN'.",
            type(pattern_operand).__name__,
            pattern_operand,
        )
        return

    pattern_name = literal_name(pattern_operand)

    if not base_operands:
        # Colored tiling pattern (PaintType=1): just the pattern name.
        self.graphicstate.scolor = pattern_name
        log.debug("Set stroke pattern (colored): %s", pattern_name)
        return

    # Uncolored tiling pattern (PaintType=2): base color + pattern name.
    base_color = self._parse_color_components(
        base_operands, "stroke (uncolored pattern)"
    )
    if base_color is None:
        return

    # Stored as the pair (base_color, pattern_name).
    self.graphicstate.scolor = (base_color, pattern_name)
    log.debug("Set stroke pattern (uncolored): %s + %s", base_color, pattern_name)
def do_scn(self) -> None:
    """Set color for nonstroking operations.

    Pops as many operands as the non-stroking color space has components.

    Handles Pattern color spaces per ISO 32000-1:2008 4.5.5 (PDF 1.7)
    and ISO 32000-2:2020 §8.7.3 (PDF 2.0):
    - Colored patterns (PaintType=1): single operand (pattern name)
    - Uncolored patterns (PaintType=2): n+1 operands (colors + pattern name)
    """
    colorspace = self.graphicstate.ncs
    expected = colorspace.ncomponents

    operands = self.pop(expected)
    if len(operands) != expected:
        log.warning(
            "Cannot set non-stroke color because expected %d components but got %r",
            expected,
            operands,
        )
        return

    if colorspace.name != "Pattern":
        # Standard colors (gray, RGB, CMYK) - the common case.
        color = self._parse_color_components(operands, "non-stroke")
        if color is not None:
            self.graphicstate.ncolor = color
        return

    if not operands:
        return

    # Pattern color space (ISO 32000 8.7.3.2-3): the last operand is always
    # the pattern name, anything before it is the base color.
    *base_operands, pattern_operand = operands

    # Per spec the pattern name must be a name object (PSLiteral).
    if not isinstance(pattern_operand, PSLiteral):
        log.warning(
            "Pattern color space requires name object (PSLiteral), "
            "got %s: %r. "
            "Per ISO 32000 8.7.3.2, colored patterns use syntax '/name scn'. "
            "Per ISO 32000 8.7.3.3, uncolored patterns use "
            "syntax 'c1...cn /name scn'.",
            type(pattern_operand).__name__,
            pattern_operand,
        )
        return

    pattern_name = literal_name(pattern_operand)

    if not base_operands:
        # Colored tiling pattern (PaintType=1): just the pattern name.
        self.graphicstate.ncolor = pattern_name
        log.debug("Set non-stroke pattern (colored): %s", pattern_name)
        return

    # Uncolored tiling pattern (PaintType=2): base color + pattern name.
    base_color = self._parse_color_components(
        base_operands, "non-stroke (uncolored pattern)"
    )
    if base_color is None:
        return

    # Stored as the pair (base_color, pattern_name).
    self.graphicstate.ncolor = (base_color, pattern_name)
    log.debug(
        "Set non-stroke pattern (uncolored): %s + %s",
        base_color,
        pattern_name,
    )
def do_SC(self) -> None:
    """Set color for stroking operations (handled exactly like ``SCN``)."""
    handler = self.do_SCN
    handler()
def do_sc(self) -> None:
    """Set color for nonstroking operations (handled exactly like ``scn``)."""
    handler = self.do_scn
    handler()
def do_sh(self, name: object) -> None:
    """Paint area defined by shading pattern.

    Accepted but has no effect in this interpreter.
    """
def do_BT(self) -> None:
    """Begin text object.

    Resets the text state, which initializes the text matrix, Tm, and the
    text line matrix, Tlm. Text objects cannot be nested; a second BT
    cannot appear before an ET.
    """
    state = self.textstate
    state.reset()
def do_ET(self) -> None:
    """End a text object.

    Accepted but has no effect in this interpreter.
    """
def do_BX(self) -> None:
    """Begin compatibility section.

    Accepted but has no effect in this interpreter.
    """
def do_EX(self) -> None:
    """End compatibility section.

    Accepted but has no effect in this interpreter.
    """
def do_MP(self, tag: PDFStackT) -> None:
    """Define marked-content point.

    The tag is forwarded to the device; a tag that is not a PSLiteral is
    reported with a warning and ignored.
    """
    if not isinstance(tag, PSLiteral):
        log.warning(
            "Cannot define marked-content point because %r is not a PSLiteral",
            tag,
        )
        return
    self.device.do_tag(tag)
def do_DP(self, tag: PDFStackT, props: PDFStackT) -> None:
    """Define marked-content point with property list.

    Tag and properties are forwarded to the device; a tag that is not a
    PSLiteral is reported with a warning and ignored.
    """
    if not isinstance(tag, PSLiteral):
        log.warning(
            "Cannot define marked-content point with property list "
            "because %r is not a PSLiteral",
            tag,
        )
        return
    self.device.do_tag(tag, props)
def do_BMC(self, tag: PDFStackT) -> None:
    """Begin marked-content sequence.

    The tag is forwarded to the device; a tag that is not a PSLiteral is
    reported with a warning and ignored.
    """
    if not isinstance(tag, PSLiteral):
        log.warning(
            "Cannot begin marked-content sequence because %r is not a PSLiteral",
            tag,
        )
        return
    self.device.begin_tag(tag)
def do_BDC(self, tag: PDFStackT, props: PDFStackT) -> None:
    """Begin marked-content sequence with property list.

    Tag and properties are forwarded to the device; a tag that is not a
    PSLiteral is reported with a warning and ignored.
    """
    if not isinstance(tag, PSLiteral):
        log.warning(
            "Cannot begin marked-content sequence with property list "
            "because %r is not a PSLiteral",
            tag,
        )
        return
    self.device.begin_tag(tag, props)
def do_EMC(self) -> None:
    """End marked-content sequence by notifying the device."""
    device = self.device
    device.end_tag()
def do_Tc(self, space: PDFStackT) -> None:
    """Set character spacing.

    Character spacing is used by the Tj, TJ, and ' operators. A value that
    cannot be parsed as a float is reported with a warning and ignored.

    :param space: a number expressed in unscaled text space units.
    """
    charspace = safe_float(space)
    if charspace is not None:
        self.textstate.charspace = charspace
        return

    log.warning(
        "Could not set character spacing because %r is an invalid float value",
        space,
    )
def do_Tw(self, space: PDFStackT) -> None:
    """Set the word spacing.

    Word spacing is used by the Tj, TJ, and ' operators.

    :param space: a number expressed in unscaled text space units. When it
        cannot be converted by ``safe_float`` a warning is logged and the
        current word spacing is left unchanged.
    """
    value = safe_float(space)
    if value is not None:
        self.textstate.wordspace = value
        return
    log.warning(
        "Could not set word spacing because %r is an invalid float value",
        space,
    )
def do_Tz(self, scale: PDFStackT) -> None:
    """Set the horizontal scaling.

    :param scale: a number specifying the percentage of the normal width.
        When it cannot be converted by ``safe_float`` a warning is logged
        and the current scaling is left unchanged.
    """
    value = safe_float(scale)
    if value is not None:
        self.textstate.scaling = value
        return
    log.warning(
        "Could not set horizontal scaling because %r is an invalid float value",
        scale,
    )
def do_TL(self, leading: PDFStackT) -> None:
    """Set the text leading.

    Text leading is used only by the T*, ', and " operators.

    :param leading: a number expressed in unscaled text space units. The
        value is stored negated in ``self.textstate.leading``. When it cannot
        be converted by ``safe_float`` a warning is logged and the current
        leading is left unchanged.
    """
    value = safe_float(leading)
    if value is not None:
        # The text state keeps the leading with its sign flipped.
        self.textstate.leading = -value
        return
    log.warning(
        "Could not set text leading because %r is an invalid float value",
        leading,
    )
def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None:
    """Set the text font and font size.

    :param fontid: the name of a font resource in the Font subdictionary
        of the current resource dictionary.
    :param fontsize: a number representing a scale factor. An unparsable
        value is logged as a warning and the current size is kept.
    :raises PDFInterpreterError: in strict mode, when ``fontid`` is not
        present in ``self.fontmap``.
    """
    try:
        font = self.fontmap[literal_name(fontid)]
    except KeyError as err:
        if settings.STRICT:
            raise PDFInterpreterError(f"Undefined Font id: {fontid!r}") from err
        # Lenient mode: fall back to whatever the resource manager returns
        # for an empty font specification.
        font = self.rsrcmgr.get_font(None, {})
    self.textstate.font = font

    size = safe_float(fontsize)
    if size is not None:
        self.textstate.fontsize = size
        return
    log.warning(
        "Could not set text font because %r is an invalid float value",
        fontsize,
    )
def do_Tr(self, render: PDFStackT) -> None:
    """Set the text rendering mode.

    :param render: the rendering mode. When it cannot be converted by
        ``safe_int`` a warning is logged and the current mode is kept.
    """
    mode = safe_int(render)
    if mode is not None:
        self.textstate.render = mode
        return
    log.warning(
        "Could not set text rendering mode because %r is an invalid int value",
        render,
    )
def do_Ts(self, rise: PDFStackT) -> None:
    """Set the text rise.

    :param rise: a number expressed in unscaled text space units. When it
        cannot be converted by ``safe_float`` a warning is logged and the
        current rise is left unchanged.
    """
    value = safe_float(rise)
    if value is not None:
        self.textstate.rise = value
        return
    log.warning(
        "Could not set text rise because %r is an invalid float value",
        rise,
    )
def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None:
    """Move to the start of the next line.

    The new position is offset from the start of the current line by
    (tx, ty). The text line matrix is reset to ``(0, 0)`` in every case.

    :param tx: horizontal offset.
    :param ty: vertical offset.
    :raises PDFValueError: in strict mode, when either offset cannot be
        converted by ``safe_float``. In lenient mode the text matrix is
        simply left untouched.
    """
    dx = safe_float(tx)
    dy = safe_float(ty)
    if dx is None or dy is None:
        if settings.STRICT:
            raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for Td")
    else:
        a, b, c, d, e, f = self.textstate.matrix
        # Translate the origin of the text matrix by (dx, dy) in text space.
        self.textstate.matrix = (
            a,
            b,
            c,
            d,
            dx * a + dy * c + e,
            dx * b + dy * d + f,
        )

    self.textstate.linematrix = (0, 0)
def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None:
    """Move to the start of the next line and set the leading.

    The new position is offset from the start of the current line by
    (tx, ty). As a side effect, this operator sets the leading parameter in
    the text state: ``self.textstate.leading`` becomes ``ty`` whenever ``ty``
    is a valid number, even if ``tx`` is not. The text line matrix is reset
    to ``(0, 0)`` in every case.

    :param tx: horizontal offset.
    :param ty: vertical offset.
    :raises PDFValueError: in strict mode, when either offset cannot be
        converted by ``safe_float``. In lenient mode the text matrix is
        simply left untouched.
    """
    tx_ = safe_float(tx)
    ty_ = safe_float(ty)

    if tx_ is not None and ty_ is not None:
        (a, b, c, d, e, f) = self.textstate.matrix
        # Translate the origin of the text matrix by (tx, ty) in text space.
        e_new = tx_ * a + ty_ * c + e
        f_new = tx_ * b + ty_ * d + f
        self.textstate.matrix = (a, b, c, d, e_new, f_new)

    elif settings.STRICT:
        # Must be an f-string so the offending operands appear in the message.
        raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for TD")

    if ty_ is not None:
        self.textstate.leading = ty_

    self.textstate.linematrix = (0, 0)
def do_Tm(
    self,
    a: PDFStackT,
    b: PDFStackT,
    c: PDFStackT,
    d: PDFStackT,
    e: PDFStackT,
    f: PDFStackT,
) -> None:
    """Set the text matrix and reset the text line matrix.

    The six operands are converted with ``safe_matrix``. On success the
    result becomes the text matrix and the text line matrix is reset to
    ``(0, 0)``; otherwise a warning is logged and both are left unchanged.
    """
    operands = (a, b, c, d, e, f)
    parsed = safe_matrix(*operands)

    if parsed is not None:
        self.textstate.matrix = parsed
        self.textstate.linematrix = (0, 0)
        return

    log.warning(
        "Could not set text matrix because "
        "not all values in %r can be parsed as floats",
        operands,
    )
def do_T_a(self) -> None:
    """Move to the start of the next text line (the ``T*`` operator).

    The translation part of the text matrix is shifted by the stored
    leading along the matrix's second axis, and the text line matrix is
    reset to ``(0, 0)``.
    """
    state = self.textstate
    a, b, c, d, e, f = state.matrix
    lead = state.leading
    state.matrix = (a, b, c, d, lead * c + e, lead * d + f)
    state.linematrix = (0, 0)
def do_TJ(self, seq: PDFStackT) -> None:
    """Show text, allowing individual glyph positioning.

    :param seq: the text sequence handed to ``self.device.render_string``
        together with the text state, the current ``ncs`` of the graphics
        state and a copy of the graphics state.
    :raises PDFInterpreterError: in strict mode, when no font is set. In
        lenient mode the operator is silently skipped.
    """
    if self.textstate.font is not None:
        self.device.render_string(
            self.textstate,
            cast(PDFTextSeq, seq),
            self.graphicstate.ncs,
            self.graphicstate.copy(),
        )
        return
    if settings.STRICT:
        raise PDFInterpreterError("No font specified!")
def do_Tj(self, s: PDFStackT) -> None:
    """Show a single text string.

    :param s: the string operand; it is wrapped in a one-element list and
        handed to ``do_TJ``.
    """
    sequence = [s]
    self.do_TJ(sequence)
def do__q(self, s: PDFStackT) -> None:
    """Move to the next line and show a text string.

    Implements the ' (single quote) operator: ``do_T_a`` is run first, then
    the string is shown through ``do_TJ``.

    :param s: the string operand.
    """
    self.do_T_a()
    sequence = [s]
    self.do_TJ(sequence)
def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None:
    """Set word and character spacing, move to next line, and show text.

    Implements the " (double quote) operator, which has the same effect as
    the operator sequence ``aw Tw  ac Tc  string '``.

    :param aw: the word spacing, applied through ``do_Tw``.
    :param ac: the character spacing, applied through ``do_Tc``.
    :param s: the string operand to show on the next line.
    """
    self.do_Tw(aw)
    self.do_Tc(ac)
    # Delegate to the ' operator so the move to the next line is performed
    # before the string is shown.
    self.do__q(s)
def do_BI(self) -> None:
    """Begin an inline image object.

    This implementation performs no action.
    """
    return None
def do_ID(self) -> None:
    """Begin inline image data.

    This implementation performs no action.
    """
    return None
def do_EI(self, obj: PDFStackT) -> None:
    """End an inline image object.

    :param obj: the operand for the inline image. It is rendered only when
        it is a ``PDFStream`` containing both the ``W`` and ``H`` keys;
        anything else is ignored.
    """
    if not isinstance(obj, PDFStream):
        return
    if "W" not in obj or "H" not in obj:
        return
    # The figure is named after the Python object identity of the stream.
    figure_id = str(id(obj))
    device = self.device
    device.begin_figure(figure_id, (0, 0, 1, 1), MATRIX_IDENTITY)
    device.render_image(figure_id, obj)
    device.end_figure(figure_id)
def do_Do(self, xobjid_arg: PDFStackT) -> None:
    """Invoke a named XObject.

    Form XObjects that carry a ``BBox`` are rendered through a
    sub-interpreter; image XObjects that carry ``Width`` and ``Height`` are
    passed to the device as images. Every other XObject is ignored.

    :param xobjid_arg: the XObject name operand, resolved through
        ``self.xobjmap``.
    :raises PDFInterpreterError: in strict mode, when the name is not in
        ``self.xobjmap``. In lenient mode the operator is skipped.
    """
    xobjid = literal_name(xobjid_arg)
    try:
        xobj = stream_value(self.xobjmap[xobjid])
    except KeyError as err:
        if settings.STRICT:
            raise PDFInterpreterError(f"Undefined xobject id: {xobjid!r}") from err
        return
    log.debug("Processing xobj: %r", xobj)
    subtype = xobj.get("Subtype")

    if subtype is LITERAL_FORM and "BBox" in xobj:
        interpreter = self.subinterp()
        bbox = cast(Rect, list_value(xobj["BBox"]))
        matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
        # According to PDF reference 1.7 section 4.9.1, XObjects in
        # earlier PDFs (prior to v1.2) use the page's Resources entry
        # instead of having their own Resources entry.
        xobjres = xobj.get("Resources")
        if xobjres:
            resources = dict_value(xobjres)
        else:
            resources = self.resources.copy()
        self.device.begin_figure(xobjid, bbox, matrix)
        interpreter.render_contents(
            resources,
            [xobj],
            ctm=mult_matrix(matrix, self.ctm),
        )
        self.device.end_figure(xobjid)
        return

    if subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
        self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
        self.device.render_image(xobjid, xobj)
        self.device.end_figure(xobjid)
        return

    # Any other XObject type is unsupported and silently skipped.
def process_page(self, page: PDFPage) -> None:
    """Render one page on the device.

    An initial transformation matrix is derived from the page's media box
    and its ``rotate`` value (90, 180 and 270 are handled specially; any
    other value is treated as unrotated). The page contents are then
    rendered between ``begin_page`` and ``end_page`` calls on the device.

    :param page: the page to process.
    """
    log.debug("Processing page: %r", page)
    x0, y0, x1, y1 = page.mediabox
    rotation = page.rotate
    if rotation == 90:
        ctm = (0, -1, 1, 0, -y0, x1)
    elif rotation == 180:
        ctm = (-1, 0, 0, -1, x1, y1)
    elif rotation == 270:
        ctm = (0, 1, -1, 0, y1, -x0)
    else:
        ctm = (1, 0, 0, 1, -x0, -y0)
    device = self.device
    device.begin_page(page, ctm)
    self.render_contents(page.resources, page.contents, ctm=ctm)
    device.end_page(page)
def render_contents(
    self,
    resources: dict[object, object],
    streams: Sequence[object],
    ctm: Matrix = MATRIX_IDENTITY,
) -> None:
    """Render a sequence of content streams.

    The resources and interpreter state are initialised first, then the
    streams are executed. This method may be called recursively.

    :param resources: the resource dictionary passed to ``init_resources``.
    :param streams: the content streams to execute.
    :param ctm: the transformation matrix passed to ``init_state``.
    """
    log.debug(
        "render_contents: resources=%r, streams=%r, ctm=%r",
        resources,
        streams,
        ctm,
    )
    self.init_resources(resources)
    self.init_state(ctm)
    content_streams = list_value(streams)
    self.execute(content_streams)
def execute(self, streams: Sequence[object]) -> None:
    """Interpret the operators found in the given content streams.

    Stream selection: ``self.stream_ids`` is cleared and refilled with the
    object ids of the streams that are actually run. A stream without an
    object id is skipped with a warning, and so is a stream whose id is in
    ``self.parent_stream_ids``; the latter prevents infinite recursion
    through circular references (CWE-835), including Form XObjects.

    Dispatch: every keyword token is mapped to a ``do_<name>`` method, with
    ``*`` spelled ``_a``, the double quote spelled ``_w`` and the single
    quote spelled ``_q``. The number of operands is taken from the method's
    positional argument count (minus ``self``) and popped from the operand
    stack; the method is not called when fewer operands are available.
    Every non-keyword token is pushed onto the operand stack.

    :param streams: the content streams to execute.
    :raises PDFInterpreterError: in strict mode, for an operator that has
        no matching ``do_<name>`` method.
    """
    runnable: list[PDFStream] = []
    self.stream_ids.clear()
    for item in streams:
        stream = stream_value(item)
        if stream.objid is None:
            # Inline streams without object IDs can't be tracked for circular refs
            log.warning(
                "Execute called on non-indirect object (inline image?) %r", stream
            )
            continue
        if stream.objid in self.parent_stream_ids:
            log.warning(
                "Refusing to execute circular reference to content stream %d",
                stream.objid,
            )
            continue
        runnable.append(stream)
        self.stream_ids.add(stream.objid)

    try:
        parser = PDFContentParser(runnable)
    except PSEOF:
        # empty page
        return

    while True:
        try:
            _, token = parser.nextobject()
        except PSEOF:
            break

        if not isinstance(token, PSKeyword):
            # Operand: keep it on the stack until an operator consumes it.
            self.push(token)
            continue

        name = keyword_name(token)
        suffix = name.replace("*", "_a").replace('"', "_w").replace("'", "_q")
        method = f"do_{suffix}"

        if not hasattr(self, method):
            if settings.STRICT:
                error_msg = f"Unknown operator: {name!r}"
                raise PDFInterpreterError(error_msg)
            continue

        func = getattr(self, method)
        # Positional parameters of the handler, not counting ``self``.
        nargs = func.__code__.co_argcount - 1
        if not nargs:
            log.debug("exec: %s", name)
            func()
            continue

        args = self.pop(nargs)
        log.debug("exec: %s %r", name, args)
        if len(args) == nargs:
            func(*args)