Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfinterp.py: 85%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2import re
3from collections.abc import Mapping, Sequence
4from io import BytesIO
5from typing import Union, cast
7from pdfminer import settings
8from pdfminer.casting import safe_cmyk, safe_float, safe_int, safe_matrix, safe_rgb
9from pdfminer.cmapdb import CMap, CMapBase, CMapDB
10from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
11from pdfminer.pdfdevice import PDFDevice, PDFTextSeq
12from pdfminer.pdfexceptions import PDFException, PDFValueError
13from pdfminer.pdffont import (
14 PDFCIDFont,
15 PDFFont,
16 PDFFontError,
17 PDFTrueTypeFont,
18 PDFType1Font,
19 PDFType3Font,
20)
21from pdfminer.pdfpage import PDFPage
22from pdfminer.pdftypes import (
23 LITERALS_ASCII85_DECODE,
24 PDFObjRef,
25 PDFStream,
26 dict_value,
27 list_value,
28 resolve1,
29 stream_value,
30)
31from pdfminer.psexceptions import PSEOF, PSTypeError
32from pdfminer.psparser import (
33 KWD,
34 LIT,
35 PSKeyword,
36 PSLiteral,
37 PSStackParser,
38 PSStackType,
39 keyword_name,
40 literal_name,
41)
42from pdfminer.utils import (
43 MATRIX_IDENTITY,
44 Matrix,
45 PathSegment,
46 Point,
47 Rect,
48 choplist,
49 mult_matrix,
50)
# Logger shared by every class and function in this module.
log = logging.getLogger(__name__)
class PDFResourceError(PDFException):
    """PDFException subclass for resource errors; adds no behaviour of its own."""
class PDFInterpreterError(PDFException):
    """PDFException subclass raised by the page interpreter.

    In this module it is raised (only when ``settings.STRICT`` is set) for
    undefined colour-space names and undefined font ids.
    """
# Name literals built with LIT(); this module compares several of them
# with ``is`` (see get_procset and get_font).
LITERAL_PDF, LITERAL_TEXT, LITERAL_FONT, LITERAL_FORM, LITERAL_IMAGE = (
    LIT(name) for name in ("PDF", "Text", "Font", "Form", "Image")
)
class PDFTextState:
    """Holds the text-state values tracked by the page interpreter.

    ``matrix`` and ``linematrix`` are assigned by :meth:`reset`, which the
    constructor calls; every other attribute starts from the default given
    in ``__init__``.
    """

    matrix: Matrix
    linematrix: Point

    def __init__(self) -> None:
        """Initialise all text-state attributes to their defaults."""
        self.font: PDFFont | None = None
        self.fontsize: float = 0
        self.charspace: float = 0
        self.wordspace: float = 0
        self.scaling: float = 100
        self.leading: float = 0
        self.render: int = 0
        self.rise: float = 0
        # reset() is what assigns self.matrix and self.linematrix.
        self.reset()

    def __repr__(self) -> str:
        """Return a debug string listing every attribute with its repr."""
        return (
            f"<PDFTextState: font={self.font!r}, "
            f"fontsize={self.fontsize!r}, "
            f"charspace={self.charspace!r}, "
            f"wordspace={self.wordspace!r}, "
            f"scaling={self.scaling!r}, "
            f"leading={self.leading!r}, "
            f"render={self.render!r}, "
            f"rise={self.rise!r}, "
            f"matrix={self.matrix!r}, "
            f"linematrix={self.linematrix!r}>"
        )

    def copy(self) -> "PDFTextState":
        """Return a new PDFTextState carrying the same attribute values."""
        duplicate = PDFTextState()
        # Same attributes, same order as they are declared in __init__/reset.
        for attr in (
            "font",
            "fontsize",
            "charspace",
            "wordspace",
            "scaling",
            "leading",
            "render",
            "rise",
            "matrix",
            "linematrix",
        ):
            setattr(duplicate, attr, getattr(self, attr))
        return duplicate

    def reset(self) -> None:
        """Set ``matrix`` to MATRIX_IDENTITY and ``linematrix`` to ``(0, 0)``."""
        self.linematrix = (0, 0)
        self.matrix = MATRIX_IDENTITY
# A plain (non-pattern) colour value. It is also the base colour of an
# uncolored pattern:
#   float            -> greyscale
#   3-tuple of float -> R, G, B
#   4-tuple of float -> C, M, Y, K
StandardColor = Union[
    float,
    tuple[float, float, float],
    tuple[float, float, float, float],
]

# Every colour value the graphics state may hold:
#   StandardColor           -> gray / RGB / CMYK
#   str                     -> pattern name (colored pattern, PaintType=1)
#   (StandardColor, str)    -> (base_color, pattern_name)
#                              (uncolored pattern, PaintType=2)
Color = Union[StandardColor, str, tuple[StandardColor, str]]
class PDFGraphicState:
    """Holds the graphics-state values tracked by the page interpreter."""

    def __init__(self) -> None:
        """Initialise line parameters to empty defaults and colours to gray 0."""
        self.linewidth: float = 0
        self.linecap: object | None = None
        self.linejoin: object | None = None
        self.miterlimit: object | None = None
        self.dash: tuple[object, object] | None = None
        self.intent: object | None = None
        self.flatness: object | None = None

        # Stroking colour and its colour space.
        self.scolor: Color = 0
        self.scs: PDFColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]

        # Non-stroking colour and its colour space.
        self.ncolor: Color = 0
        self.ncs: PDFColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]

    def copy(self) -> "PDFGraphicState":
        """Return a new PDFGraphicState carrying the same attribute values."""
        duplicate = PDFGraphicState()
        for attr in (
            "linewidth",
            "linecap",
            "linejoin",
            "miterlimit",
            "dash",
            "intent",
            "flatness",
            "scolor",
            "scs",
            "ncolor",
            "ncs",
        ):
            setattr(duplicate, attr, getattr(self, attr))
        return duplicate

    def __repr__(self) -> str:
        """Return a debug string; the colour spaces (scs/ncs) are not included."""
        return (
            f"<PDFGraphicState: "
            f"linewidth={self.linewidth!r}, "
            f"linecap={self.linecap!r}, "
            f"linejoin={self.linejoin!r}, "
            f"miterlimit={self.miterlimit!r}, "
            f"dash={self.dash!r}, "
            f"intent={self.intent!r}, "
            f"flatness={self.flatness!r}, "
            f"stroking color={self.scolor!r}, "
            f"non stroking color={self.ncolor!r}>"
        )
class PDFResourceManager:
    """Repository of shared resources.

    ResourceManager facilitates reuse of shared resources
    such as fonts and images so that large objects are not
    allocated multiple times.
    """

    def __init__(self, caching: bool = True) -> None:
        """Create a manager; ``caching`` controls whether fonts are memoised."""
        self._cached_fonts: dict[object, PDFFont] = {}
        self.caching = caching

    def get_procset(self, procs: Sequence[object]) -> None:
        """Walk the ProcSet entries; no entry currently triggers any action."""
        for proc in procs:
            if proc is not LITERAL_PDF and proc is not LITERAL_TEXT:
                pass
            else:
                pass

    def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase:
        """Look up a CMap by name.

        :param cmapname: name passed to ``CMapDB.get_cmap``.
        :param strict: when true, ``CMapDB.CMapNotFound`` is re-raised;
            otherwise an empty ``CMap`` is returned in its place.
        """
        try:
            return CMapDB.get_cmap(cmapname)
        except CMapDB.CMapNotFound:
            if not strict:
                return CMap()
            raise

    def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
        """Return the font for ``spec``, reusing a cached one when possible.

        :param objid: cache key (the object id of the font); a falsy value
            disables both the cache lookup and the cache store.
        :param spec: the font dictionary.
        :raises PDFFontError: only when ``settings.STRICT`` is set and the
            spec has the wrong Type, no Subtype, or an unknown Subtype.
        """
        if objid and objid in self._cached_fonts:
            return self._cached_fonts[objid]

        log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
        if settings.STRICT and spec["Type"] is not LITERAL_FONT:
            raise PDFFontError("Type is not /Font")

        # Work out which font class to build.
        if "Subtype" not in spec:
            if settings.STRICT:
                raise PDFFontError("Font Subtype is not specified.")
            subtype = "Type1"
        else:
            subtype = literal_name(spec["Subtype"])

        if subtype in ("Type1", "MMType1"):
            font = PDFType1Font(self, spec)
        elif subtype == "TrueType":
            font = PDFTrueTypeFont(self, spec)
        elif subtype == "Type3":
            font = PDFType3Font(self, spec)
        elif subtype in ("CIDFontType0", "CIDFontType2"):
            font = PDFCIDFont(self, spec)
        elif subtype == "Type0":
            # Composite font: build from the first descendant font, carrying
            # over the parent's Encoding / ToUnicode entries when present.
            dfonts = list_value(spec["DescendantFonts"])
            assert dfonts
            subspec = dict_value(dfonts[0]).copy()
            for key in ("Encoding", "ToUnicode"):
                if key in spec:
                    subspec[key] = resolve1(spec[key])
            font = self.get_font(None, subspec)
        else:
            if settings.STRICT:
                raise PDFFontError(f"Invalid Font spec: {spec!r}")
            # Unknown subtype in non-strict mode: fall back to Type1.
            font = PDFType1Font(self, spec)  # this is so wrong!

        if objid and self.caching:
            self._cached_fonts[objid] = font
        return font
class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
    """Stack parser that reads a sequence of content streams as one input.

    The streams are opened lazily, one after another, by :meth:`fillfp`.
    Inline images (``BI ... ID ... EI``) are turned into ``PDFStream``
    objects by :meth:`do_keyword`.
    """

    def __init__(self, streams: Sequence[object]) -> None:
        """Remember ``streams``; nothing is read until parsing starts."""
        self.streams = streams
        self.istream = 0
        # PSStackParser.__init__(fp=None) is safe only because we've overloaded
        # all the methods that would attempt to access self.fp without first
        # calling self.fillfp().
        PSStackParser.__init__(self, None)  # type: ignore[arg-type]

    def fillfp(self) -> bool:
        """Open the next stream if no stream is currently open.

        :return: True when a new stream was opened, False when one was
            already open.
        :raises PSEOF: when there is no further stream to open.
        """
        if not self.fp:
            if self.istream < len(self.streams):
                strm = stream_value(self.streams[self.istream])
                self.istream += 1
            else:
                raise PSEOF("Unexpected EOF, file truncated?")
            self.fp = BytesIO(strm.get_data())
            return True
        return False

    def seek(self, pos: int) -> None:
        """Make sure a stream is open, then seek within it."""
        self.fillfp()
        PSStackParser.seek(self, pos)

    def fillbuf(self) -> bool:
        """Refill the buffer once it is exhausted.

        Streams that yield no data are skipped by closing them and opening
        the next one.

        :return: False if the buffer still had unread data; otherwise the
            result of the last :meth:`fillfp` call.
        """
        if self.charpos < len(self.buf):
            return False
        new_stream = False
        while True:
            new_stream = self.fillfp()
            self.bufpos = self.fp.tell()
            self.buf = self.fp.read(self.BUFSIZ)
            if self.buf:
                break
            # Current stream is exhausted; the next fillfp() opens the next one.
            self.fp = None  # type: ignore[assignment]
        self.charpos = 0
        return new_stream

    def get_inline_data(self, pos: int, target: bytes = b"EI") -> tuple[int, bytes]:
        """Read raw inline-image bytes up to ``target`` followed by whitespace.

        :param pos: position to seek to before reading.
        :param target: end-of-data marker.
        :return: ``(pos, data)`` where ``data`` excludes the marker, the
            whitespace byte after it, and one trailing end-of-line sequence.
        """
        self.seek(pos)
        # i counts how many bytes of "target + one whitespace" are matched.
        i = 0
        data = b""
        while i <= len(target):
            self.fillbuf()
            if i:
                ci = self.buf[self.charpos]
                c = bytes((ci,))
                data += c
                self.charpos += 1
                if (len(target) <= i and c.isspace()) or (
                    i < len(target) and c == (bytes((target[i],)))
                ):
                    i += 1
                else:
                    i = 0
            else:
                try:
                    # Jump straight to the next candidate first marker byte.
                    j = self.buf.index(target[0], self.charpos)
                    data += self.buf[self.charpos : j + 1]
                    self.charpos = j + 1
                    i = 1
                except ValueError:
                    data += self.buf[self.charpos :]
                    self.charpos = len(self.buf)
        data = data[: -(len(target) + 1)]  # strip the last part
        data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
        return (pos, data)

    def flush(self) -> None:
        """Move everything on the parser stack to the results."""
        self.add_results(*self.popall())

    KEYWORD_BI = KWD(b"BI")
    KEYWORD_ID = KWD(b"ID")
    KEYWORD_EI = KWD(b"EI")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle a keyword token, assembling inline images on the way.

        ``BI`` opens an "inline" context, ``ID`` closes it, builds the image
        dictionary and pushes a ``PDFStream`` holding the raw image data.
        Every other keyword is pushed unchanged.

        :raises PSTypeError: only when ``settings.STRICT`` is set and the
            inline-image dictionary is malformed.
        """
        if token is self.KEYWORD_BI:
            # inline image within a content stream
            self.start_type(pos, "inline")
        elif token is self.KEYWORD_ID:
            try:
                (_, objs) = self.end_type("inline")
                if len(objs) % 2 != 0:
                    error_msg = f"Invalid dictionary construct: {objs!r}"
                    raise PSTypeError(error_msg)
                d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
                eos = b"EI"
                inline_filter = d.get("F")
                if isinstance(inline_filter, PSLiteral):
                    inline_filter = [inline_filter]
                # Only look at the first filter when there is one: an empty
                # array or a non-sequence value must not raise
                # IndexError/TypeError, which the handler below would miss.
                if (
                    isinstance(inline_filter, (list, tuple))
                    and inline_filter
                    and inline_filter[0] in LITERALS_ASCII85_DECODE
                ):
                    eos = b"~>"
                (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
                if eos != b"EI":  # it may be necessary for decoding
                    data += eos
                obj = PDFStream(d, data)
                self.push((pos, obj))
                if eos == b"EI":  # otherwise it is still in the stream
                    self.push((pos, self.KEYWORD_EI))
            except PSTypeError:
                if settings.STRICT:
                    raise
        else:
            self.push((pos, token))
# Type of a single entry on the interpreter's argument stack.
PDFStackT = PSStackType[PDFStream]
367class PDFPageInterpreter:
368 """Processor for the content of a PDF page
370 Reference: PDF Reference, Appendix A, Operator Summary
371 """
def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice) -> None:
    """Bind the interpreter to a resource manager and an output device."""
    self.device = device
    self.rsrcmgr = rsrcmgr
    # Stream ids this interpreter is currently executing (used to detect
    # circular references).
    self.stream_ids: set[int] = set()
    # Stream ids inherited from interpreters further up the call stack.
    self.parent_stream_ids: set[int] = set()
def dup(self) -> "PDFPageInterpreter":
    """Return a fresh interpreter of the same class, sharing rsrcmgr and device."""
    factory = self.__class__
    return factory(self.rsrcmgr, self.device)
def subinterp(self) -> "PDFPageInterpreter":
    """Create a sub-interpreter for processing nested content streams.

    Like dup(), but the new interpreter's ``parent_stream_ids`` also receive
    this interpreter's ``parent_stream_ids`` and ``stream_ids``, so circular
    references can be detected across nested XObject invocations.
    """
    child = self.dup()
    for inherited in (self.parent_stream_ids, self.stream_ids):
        child.parent_stream_ids.update(inherited)
    return child
def init_resources(self, resources: dict[object, object]) -> None:
    """Prepare the fonts and XObjects listed in the Resource attribute.

    Resets ``fontmap``, ``xobjmap`` and ``csmap`` (the latter to a copy of
    the predefined colour spaces) and then fills them from the "Font",
    "ColorSpace" and "XObject" entries. "ProcSet" is passed to the
    resource manager.
    """
    self.resources = resources
    self.fontmap: dict[object, PDFFont] = {}
    self.xobjmap = {}
    self.csmap: dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
    if not resources:
        return

    def get_colorspace(spec: object) -> PDFColorSpace | None:
        # A spec is either a bare name or a list whose first item is the name.
        is_list = isinstance(spec, list)
        name = literal_name(spec[0] if is_list else spec)
        has_params = is_list and len(spec) >= 2
        if has_params and name == "ICCBased":
            return PDFColorSpace(name, stream_value(spec[1])["N"])
        if has_params and name == "DeviceN":
            return PDFColorSpace(name, len(list_value(spec[1])))
        return PREDEFINED_COLORSPACE.get(name)

    for k, v in dict_value(resources).items():
        log.debug("Resource: %r: %r", k, v)
        if k == "Font":
            for fontid, spec in dict_value(v).items():
                # Only indirect references carry an object id usable as cache key.
                objid = spec.objid if isinstance(spec, PDFObjRef) else None
                self.fontmap[fontid] = self.rsrcmgr.get_font(objid, dict_value(spec))
        elif k == "ColorSpace":
            for csid, spec in dict_value(v).items():
                colorspace = get_colorspace(resolve1(spec))
                if colorspace is None:
                    continue
                self.csmap[csid] = colorspace
        elif k == "ProcSet":
            self.rsrcmgr.get_procset(list_value(v))
        elif k == "XObject":
            self.xobjmap.update(dict_value(v))
def init_state(self, ctm: Matrix) -> None:
    """Initialize the text and graphic states for rendering a page.

    :param ctm: initial current transformation matrix; it is also handed
        to the device via ``set_ctm``.
    """
    # Saved (ctm, text state, graphic state) triples, see do_q / do_Q.
    self.gstack: list[tuple[Matrix, PDFTextState, PDFGraphicState]] = []
    self.ctm = ctm
    self.device.set_ctm(self.ctm)
    self.textstate = PDFTextState()
    self.graphicstate = PDFGraphicState()
    # Path segments collected since the last painting operator.
    self.curpath: list[PathSegment] = []
    # Operands waiting for the next operator.
    self.argstack: list[PDFStackT] = []
def push(self, obj: PDFStackT) -> None:
    """Append ``obj`` to the argument stack."""
    stack = self.argstack
    stack.append(obj)
def pop(self, n: int) -> list[PDFStackT]:
    """Remove and return the last ``n`` entries of the argument stack.

    ``n == 0`` returns an empty list and leaves the stack alone. When fewer
    than ``n`` entries exist, all of them are returned.
    """
    if n == 0:
        return []
    popped, self.argstack = self.argstack[-n:], self.argstack[:-n]
    return popped
def get_current_state(self) -> tuple[Matrix, PDFTextState, PDFGraphicState]:
    """Return ``(ctm, copy of text state, copy of graphic state)``."""
    ctm = self.ctm
    text_snapshot = self.textstate.copy()
    graphic_snapshot = self.graphicstate.copy()
    return (ctm, text_snapshot, graphic_snapshot)
def set_current_state(
    self,
    state: tuple[Matrix, PDFTextState, PDFGraphicState],
) -> None:
    """Install a ``(ctm, text state, graphic state)`` triple and update the device CTM."""
    ctm, textstate, graphicstate = state
    self.ctm = ctm
    self.textstate = textstate
    self.graphicstate = graphicstate
    self.device.set_ctm(self.ctm)
def do_q(self) -> None:
    """Save graphics state

    Pushes the result of get_current_state() onto ``gstack``.
    """
    snapshot = self.get_current_state()
    self.gstack.append(snapshot)
def do_Q(self) -> None:
    """Restore graphics state

    Pops the most recent entry of ``gstack`` and installs it; does nothing
    when the stack is empty.
    """
    if not self.gstack:
        return
    self.set_current_state(self.gstack.pop())
def do_cm(
    self,
    a1: PDFStackT,
    b1: PDFStackT,
    c1: PDFStackT,
    d1: PDFStackT,
    e1: PDFStackT,
    f1: PDFStackT,
) -> None:
    """Concatenate matrix to current transformation matrix

    When safe_matrix cannot build a matrix from the operands, a warning is
    logged and the CTM is left unchanged.
    """
    matrix = safe_matrix(a1, b1, c1, d1, e1, f1)

    if matrix is not None:
        self.ctm = mult_matrix(matrix, self.ctm)
        self.device.set_ctm(self.ctm)
        return

    log.warning(
        "Cannot concatenate matrix to current transformation matrix "
        f"because not all values in {(a1, b1, c1, d1, e1, f1)!r} "
        "can be parsed as floats"
    )
def do_w(self, linewidth: PDFStackT) -> None:
    """Set line width

    The operand is converted with safe_float and multiplied by the length of
    the CTM's first two components before being stored. An unparsable
    operand only logs a warning.
    """
    linewidth_f = safe_float(linewidth)
    if linewidth_f is None:
        log.warning(
            f"Cannot set line width because {linewidth!r} is an invalid float value"
        )
        return
    scale = (self.ctm[0] ** 2 + self.ctm[1] ** 2) ** 0.5
    self.graphicstate.linewidth = linewidth_f * scale
def do_J(self, linecap: PDFStackT) -> None:
    """Set line cap style

    The operand is stored unchanged on the graphics state.
    """
    state = self.graphicstate
    state.linecap = linecap
def do_j(self, linejoin: PDFStackT) -> None:
    """Set line join style

    The operand is stored unchanged on the graphics state.
    """
    state = self.graphicstate
    state.linejoin = linejoin
def do_M(self, miterlimit: PDFStackT) -> None:
    """Set miter limit

    The operand is stored unchanged on the graphics state.
    """
    state = self.graphicstate
    state.miterlimit = miterlimit
def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None:
    """Set line dash pattern

    Both operands are stored unchanged as the pair ``(dash, phase)``.
    """
    pattern = (dash, phase)
    self.graphicstate.dash = pattern
def do_ri(self, intent: PDFStackT) -> None:
    """Set color rendering intent

    The operand is stored unchanged on the graphics state.
    """
    state = self.graphicstate
    state.intent = intent
def do_i(self, flatness: PDFStackT) -> None:
    """Set flatness tolerance

    The operand is stored unchanged on the graphics state.
    """
    state = self.graphicstate
    state.flatness = flatness
def do_gs(self, name: PDFStackT) -> None:
    """Set parameters from graphics state parameter dictionary

    Not implemented yet: the operand is accepted and ignored.
    """
    # TODO
def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
    """Begin new subpath

    Appends ``("m", x, y)`` to the current path; operands that cannot be
    converted by safe_float only produce a warning.
    """
    x_f, y_f = safe_float(x), safe_float(y)

    if x_f is None or y_f is None:
        point = ("m", x, y)
        log.warning(
            "Cannot start new subpath because not all values "
            f"in {point!r} can be parsed as floats"
        )
        return

    self.curpath.append(("m", x_f, y_f))
def do_l(self, x: PDFStackT, y: PDFStackT) -> None:
    """Append straight line segment to path

    Appends ``("l", x, y)`` to the current path; operands that cannot be
    converted by safe_float only produce a warning.
    """
    x_f, y_f = safe_float(x), safe_float(y)
    if x_f is None or y_f is None:
        point = ("l", x, y)
        log.warning(
            "Cannot append straight line segment to path "
            f"because not all values in {point!r} can be parsed as floats"
        )
        return
    self.curpath.append(("l", x_f, y_f))
def do_c(
    self,
    x1: PDFStackT,
    y1: PDFStackT,
    x2: PDFStackT,
    y2: PDFStackT,
    x3: PDFStackT,
    y3: PDFStackT,
) -> None:
    """Append curved segment to path (three control points)

    Appends ``("c", x1, y1, x2, y2, x3, y3)`` with every operand converted
    by safe_float; if any conversion fails only a warning is logged.
    """
    coords = [safe_float(value) for value in (x1, y1, x2, y2, x3, y3)]
    if any(value is None for value in coords):
        point = ("c", x1, y1, x2, y2, x3, y3)
        log.warning(
            "Cannot append curved segment to path "
            f"because not all values in {point!r} can be parsed as floats"
        )
        return
    self.curpath.append(("c", *coords))
def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
    """Append curved segment to path (initial point replicated)

    Appends ``("v", x2, y2, x3, y3)`` with every operand converted by
    safe_float; if any conversion fails only a warning is logged.
    """
    coords = [safe_float(value) for value in (x2, y2, x3, y3)]
    if any(value is None for value in coords):
        point = ("v", x2, y2, x3, y3)
        log.warning(
            "Cannot append curved segment to path "
            f"because not all values in {point!r} can be parsed as floats"
        )
        return
    self.curpath.append(("v", *coords))
def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
    """Append curved segment to path (final point replicated)

    Appends ``("y", x1, y1, x3, y3)`` with every operand converted by
    safe_float; if any conversion fails only a warning is logged.
    """
    coords = [safe_float(value) for value in (x1, y1, x3, y3)]
    if any(value is None for value in coords):
        point = ("y", x1, y1, x3, y3)
        log.warning(
            "Cannot append curved segment to path "
            f"because not all values in {point!r} can be parsed as floats"
        )
        return
    self.curpath.append(("y", *coords))
def do_h(self) -> None:
    """Close subpath

    Appends the one-element segment ``("h",)`` to the current path.
    """
    close_segment = ("h",)
    self.curpath.append(close_segment)
def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None:
    """Append rectangle to path

    The rectangle is added as one "m", three "l" and a closing "h" segment.
    If any operand cannot be converted by safe_float, only a warning is
    logged and the path is left unchanged.
    """
    x_f, y_f, w_f, h_f = (safe_float(value) for value in (x, y, w, h))

    if x_f is None or y_f is None or w_f is None or h_f is None:
        values = (x, y, w, h)
        log.warning(
            "Cannot append rectangle to path "
            f"because not all values in {values!r} can be parsed as floats"
        )
        return

    for segment in (
        ("m", x_f, y_f),
        ("l", x_f + w_f, y_f),
        ("l", x_f + w_f, y_f + h_f),
        ("l", x_f, y_f + h_f),
        ("h",),
    ):
        self.curpath.append(segment)
def do_S(self) -> None:
    """Stroke path

    Hands the current path to the device (stroke only) and starts a new,
    empty path.
    """
    stroke, fill, evenodd = True, False, False
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd, self.curpath)
    self.curpath = []
def do_s(self) -> None:
    """Close and stroke path

    Equivalent to running the "h" operator followed by "S".
    """
    self.do_h()  # close the subpath first
    self.do_S()  # then stroke it
def do_f(self) -> None:
    """Fill path using nonzero winding number rule

    Hands the current path to the device (fill only, not even-odd) and
    starts a new, empty path.
    """
    stroke, fill, evenodd = False, True, False
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd, self.curpath)
    self.curpath = []
def do_F(self) -> None:
    """Fill path using nonzero winding number rule (obsolete)

    ``F`` is the obsolete spelling of ``f`` and the PDF specification
    defines the two as equivalent, so this delegates to do_f(). Without the
    delegation the path would be neither painted nor cleared, and its
    segments would leak into the next painted path.
    """
    self.do_f()
def do_f_a(self) -> None:
    """Fill path using even-odd rule

    Hands the current path to the device (fill only, even-odd) and starts
    a new, empty path.
    """
    stroke, fill, evenodd = False, True, True
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd, self.curpath)
    self.curpath = []
def do_B(self) -> None:
    """Fill and stroke path using nonzero winding number rule

    Hands the current path to the device (stroke and fill, not even-odd)
    and starts a new, empty path.
    """
    stroke, fill, evenodd = True, True, False
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd, self.curpath)
    self.curpath = []
def do_B_a(self) -> None:
    """Fill and stroke path using even-odd rule

    Hands the current path to the device (stroke and fill, even-odd) and
    starts a new, empty path.
    """
    stroke, fill, evenodd = True, True, True
    self.device.paint_path(self.graphicstate, stroke, fill, evenodd, self.curpath)
    self.curpath = []
def do_b(self) -> None:
    """Close, fill, and stroke path using nonzero winding number rule

    Equivalent to running the "h" operator followed by "B".
    """
    self.do_h()  # close the subpath first
    self.do_B()  # then fill and stroke it
def do_b_a(self) -> None:
    """Close, fill, and stroke path using even-odd rule

    Equivalent to running the "h" operator followed by "B*" (do_B_a).
    """
    self.do_h()  # close the subpath first
    self.do_B_a()  # then fill (even-odd) and stroke it
def do_n(self) -> None:
    """End path without filling or stroking

    The collected path is dropped by starting a new, empty one.
    """
    self.curpath = list()
def do_W(self) -> None:
    """Set clipping path using nonzero winding number rule

    Accepted but ignored: this method has no body.
    """
def do_W_a(self) -> None:
    """Set clipping path using even-odd rule

    Accepted but ignored: this method has no body.
    """
def do_CS(self, name: PDFStackT) -> None:
    """Set color space for stroking operations

    Introduced in PDF 1.1

    :raises PDFInterpreterError: only when ``settings.STRICT`` is set and
        the name is not in ``csmap``; otherwise an unknown name is ignored.
    """
    try:
        colorspace = self.csmap[literal_name(name)]
        self.graphicstate.scs = colorspace
    except KeyError as err:
        if not settings.STRICT:
            return
        raise PDFInterpreterError(f"Undefined ColorSpace: {name!r}") from err
def do_cs(self, name: PDFStackT) -> None:
    """Set color space for nonstroking operations

    :raises PDFInterpreterError: only when ``settings.STRICT`` is set and
        the name is not in ``csmap``; otherwise an unknown name is ignored.
    """
    try:
        colorspace = self.csmap[literal_name(name)]
        self.graphicstate.ncs = colorspace
    except KeyError as err:
        if not settings.STRICT:
            return
        raise PDFInterpreterError(f"Undefined ColorSpace: {name!r}") from err
def do_G(self, gray: PDFStackT) -> None:
    """Set gray level for stroking operations

    Also switches the stroking colour space to "DeviceGray". An operand
    that safe_float cannot convert only produces a warning.
    """
    gray_f = safe_float(gray)

    if gray_f is None:
        log.warning(
            f"Cannot set gray level because {gray!r} is an invalid float value"
        )
        return

    self.graphicstate.scolor = gray_f
    self.graphicstate.scs = self.csmap["DeviceGray"]
def do_g(self, gray: PDFStackT) -> None:
    """Set gray level for nonstroking operations

    Also switches the non-stroking colour space to "DeviceGray". An operand
    that safe_float cannot convert only produces a warning.
    """
    gray_f = safe_float(gray)

    if gray_f is None:
        log.warning(
            f"Cannot set gray level because {gray!r} is an invalid float value"
        )
        return

    self.graphicstate.ncolor = gray_f
    self.graphicstate.ncs = self.csmap["DeviceGray"]
def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
    """Set RGB color for stroking operations

    Also switches the stroking colour space to "DeviceRGB". Operands that
    safe_rgb rejects only produce a warning.
    """
    rgb = safe_rgb(r, g, b)

    if rgb is None:
        log.warning(
            "Cannot set RGB stroke color "
            f"because not all values in {(r, g, b)!r} can be parsed as floats"
        )
        return

    self.graphicstate.scolor = rgb
    self.graphicstate.scs = self.csmap["DeviceRGB"]
def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
    """Set RGB color for nonstroking operations

    Also switches the non-stroking colour space to "DeviceRGB". Operands
    that safe_rgb rejects only produce a warning.
    """
    rgb = safe_rgb(r, g, b)

    if rgb is None:
        log.warning(
            "Cannot set RGB non-stroke color "
            f"because not all values in {(r, g, b)!r} can be parsed as floats"
        )
        return

    self.graphicstate.ncolor = rgb
    self.graphicstate.ncs = self.csmap["DeviceRGB"]
def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
    """Set CMYK color for stroking operations

    Also switches the stroking colour space to "DeviceCMYK". Operands that
    safe_cmyk rejects only produce a warning.
    """
    cmyk = safe_cmyk(c, m, y, k)

    if cmyk is None:
        log.warning(
            "Cannot set CMYK stroke color "
            f"because not all values in {(c, m, y, k)!r} can be parsed as floats"
        )
        return

    self.graphicstate.scolor = cmyk
    self.graphicstate.scs = self.csmap["DeviceCMYK"]
def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
    """Set CMYK color for nonstroking operations

    Also switches the non-stroking colour space to "DeviceCMYK". Operands
    that safe_cmyk rejects only produce a warning.
    """
    cmyk = safe_cmyk(c, m, y, k)

    if cmyk is None:
        log.warning(
            "Cannot set CMYK non-stroke color "
            f"because not all values in {(c, m, y, k)!r} can be parsed as floats"
        )
        return

    self.graphicstate.ncolor = cmyk
    self.graphicstate.ncs = self.csmap["DeviceCMYK"]
def _parse_color_components(
    self, components: list[PDFStackT], context: str
) -> StandardColor | None:
    """Turn 1, 3 or 4 operands into a gray, RGB or CMYK colour value.

    Args:
        components: List of 1, 3, or 4 numeric color components
        context: Description for error messages (e.g., "stroke", "non-stroke")

    Returns:
        Parsed color (float for gray, tuple for RGB/CMYK) or None if invalid.
        Every None result is accompanied by a logged warning.
    """
    count = len(components)

    if count == 1:
        gray = safe_float(components[0])
        if gray is None:
            log.warning(
                f"Cannot set {context} color: "
                f"{components[0]!r} is an invalid float value"
            )
        return gray

    if count == 3:
        rgb = safe_rgb(*components)
        if rgb is None:
            log.warning(
                f"Cannot set {context} color: "
                f"components {components!r} cannot be parsed as RGB"
            )
        return rgb

    if count == 4:
        cmyk = safe_cmyk(*components)
        if cmyk is None:
            log.warning(
                f"Cannot set {context} color: "
                f"components {components!r} cannot be parsed as CMYK"
            )
        return cmyk

    # Any other operand count is unsupported.
    log.warning(
        f"Cannot set {context} color: "
        f"{len(components)} components specified, "
        "but only 1 (grayscale), 3 (RGB), and 4 (CMYK) are supported"
    )
    return None
def do_SCN(self) -> None:
    """Set color for stroking operations.

    Pops as many operands as the stroking colour space has components.
    For the "Pattern" colour space (ISO 32000-1:2008 4.5.5 / ISO
    32000-2:2020 8.7.3) the last operand must be a name:
    - Colored patterns (PaintType=1): single operand (pattern name)
    - Uncolored patterns (PaintType=2): n+1 operands (colors + pattern name)

    Invalid input is reported with a warning and leaves the colour unchanged.
    """
    n = self.graphicstate.scs.ncomponents
    components = self.pop(n)

    if len(components) != n:
        log.warning(
            "Cannot set stroke color because "
            f"expected {n} components but got {components!r}"
        )
        return

    if self.graphicstate.scs.name != "Pattern":
        # Standard colors (gray, RGB, CMYK) - common case
        color = self._parse_color_components(components, "stroke")
        if color is not None:
            self.graphicstate.scolor = color
        return

    if len(components) < 1:
        return

    # Pattern color space: the last operand is always the pattern name and
    # must be a name object (PSLiteral).
    pattern_component = components[-1]
    if not isinstance(pattern_component, PSLiteral):
        log.warning(
            f"Pattern color space requires name object (PSLiteral), "
            f"got {type(pattern_component).__name__}: {pattern_component!r}. "
            "Per ISO 32000 8.7.3.2, colored patterns use syntax '/name SCN'. "
            "Per ISO 32000 8.7.3.3, uncolored patterns use "
            "syntax 'c1...cn /name SCN'."
        )
        return

    pattern_name = literal_name(pattern_component)

    if len(components) > 1:
        # Uncolored tiling pattern (PaintType=2): base colour + pattern name,
        # stored as the tuple (base_color, pattern_name).
        base_color = self._parse_color_components(
            components[:-1], "stroke (uncolored pattern)"
        )
        if base_color is None:
            return
        self.graphicstate.scolor = (base_color, pattern_name)
        log.debug(
            f"Set stroke pattern (uncolored): {base_color} + {pattern_name}"
        )
        return

    # Colored tiling pattern (PaintType=1): just the pattern name.
    self.graphicstate.scolor = pattern_name
    log.debug(f"Set stroke pattern (colored): {pattern_name}")
def do_scn(self) -> None:
    """Set color for nonstroking operations.

    Pops as many operands as the non-stroking colour space has components.
    For the "Pattern" colour space (ISO 32000-1:2008 4.5.5 / ISO
    32000-2:2020 8.7.3) the last operand must be a name:
    - Colored patterns (PaintType=1): single operand (pattern name)
    - Uncolored patterns (PaintType=2): n+1 operands (colors + pattern name)

    Invalid input is reported with a warning and leaves the colour unchanged.
    """
    n = self.graphicstate.ncs.ncomponents
    components = self.pop(n)

    if len(components) != n:
        log.warning(
            "Cannot set non-stroke color because "
            f"expected {n} components but got {components!r}"
        )
        return

    if self.graphicstate.ncs.name != "Pattern":
        # Standard colors (gray, RGB, CMYK) - common case
        color = self._parse_color_components(components, "non-stroke")
        if color is not None:
            self.graphicstate.ncolor = color
        return

    if len(components) < 1:
        return

    # Pattern color space: the last operand is always the pattern name and
    # must be a name object (PSLiteral).
    pattern_component = components[-1]
    if not isinstance(pattern_component, PSLiteral):
        log.warning(
            f"Pattern color space requires name object (PSLiteral), "
            f"got {type(pattern_component).__name__}: {pattern_component!r}. "
            "Per ISO 32000 8.7.3.2, colored patterns use syntax '/name scn'. "
            "Per ISO 32000 8.7.3.3, uncolored patterns use "
            "syntax 'c1...cn /name scn'."
        )
        return

    pattern_name = literal_name(pattern_component)

    if len(components) > 1:
        # Uncolored tiling pattern (PaintType=2): base colour + pattern name,
        # stored as the tuple (base_color, pattern_name).
        base_color = self._parse_color_components(
            components[:-1], "non-stroke (uncolored pattern)"
        )
        if base_color is None:
            return
        self.graphicstate.ncolor = (base_color, pattern_name)
        log.debug(
            f"Set non-stroke pattern (uncolored): {base_color} + {pattern_name}"
        )
        return

    # Colored tiling pattern (PaintType=1): just the pattern name.
    self.graphicstate.ncolor = pattern_name
    log.debug(f"Set non-stroke pattern (colored): {pattern_name}")
def do_SC(self) -> None:
    """Set color for stroking operations

    Handled exactly like the SCN operator.
    """
    handler = self.do_SCN
    handler()
def do_sc(self) -> None:
    """Set color for nonstroking operations

    Handled exactly like the scn operator.
    """
    handler = self.do_scn
    handler()
def do_sh(self, name: object) -> None:
    """Paint area defined by shading pattern

    Accepted but ignored: this method has no body.
    """
def do_BT(self) -> None:
    """Begin text object

    Resets the text state, which initialises the text matrix (Tm) and the
    text line matrix (Tlm). Text objects cannot be nested; a second BT
    cannot appear before an ET.
    """
    state = self.textstate
    state.reset()
def do_ET(self) -> None:
    """End a text object

    Nothing needs to happen here: this method has no body.
    """
def do_BX(self) -> None:
    """Begin compatibility section

    Accepted but ignored: this method has no body.
    """
def do_EX(self) -> None:
    """End compatibility section

    Accepted but ignored: this method has no body.
    """
def do_MP(self, tag: PDFStackT) -> None:
    """Define marked-content point

    Forwards the tag to ``device.do_tag``; a tag that is not a PSLiteral
    only produces a warning.
    """
    if not isinstance(tag, PSLiteral):
        log.warning(
            f"Cannot define marked-content point because {tag!r} is not a PSLiteral"
        )
        return
    self.device.do_tag(tag)
def do_DP(self, tag: PDFStackT, props: PDFStackT) -> None:
    """Define marked-content point with property list

    Forwards tag and props to ``device.do_tag``; a tag that is not a
    PSLiteral only produces a warning.
    """
    if not isinstance(tag, PSLiteral):
        log.warning(
            "Cannot define marked-content point with property list "
            f"because {tag!r} is not a PSLiteral"
        )
        return
    self.device.do_tag(tag, props)
def do_BMC(self, tag: PDFStackT) -> None:
    """Begin marked-content sequence

    Forwards the tag to ``device.begin_tag``; a tag that is not a PSLiteral
    only produces a warning.
    """
    if not isinstance(tag, PSLiteral):
        log.warning(
            "Cannot begin marked-content sequence because "
            f"{tag!r} is not a PSLiteral"
        )
        return
    self.device.begin_tag(tag)
def do_BDC(self, tag: PDFStackT, props: PDFStackT) -> None:
    """Begin marked-content sequence with property list

    Forwards tag and props to ``device.begin_tag``; a tag that is not a
    PSLiteral only produces a warning.
    """
    if not isinstance(tag, PSLiteral):
        log.warning(
            f"Cannot begin marked-content sequence with property list "
            f"because {tag!r} is not a PSLiteral"
        )
        return
    self.device.begin_tag(tag, props)
def do_EMC(self) -> None:
    """End marked-content sequence

    Forwarded to ``device.end_tag``.
    """
    device = self.device
    device.end_tag()
def do_Tc(self, space: PDFStackT) -> None:
    """Set character spacing.

    Character spacing is used by the Tj, TJ, and ' operators. A value that
    safe_float cannot convert only produces a warning.

    :param space: a number expressed in unscaled text space units.
    """
    charspace = safe_float(space)
    if charspace is not None:
        self.textstate.charspace = charspace
        return
    log.warning(
        "Could not set character spacing because "
        f"{space!r} is an invalid float value"
    )
def do_Tw(self, space: PDFStackT) -> None:
    """Set the word spacing in the text state.

    Word spacing is used by the Tj, TJ, and ' operators.

    :param space: a number expressed in unscaled text space units. When it
        cannot be converted to a float the text state is left untouched and
        a warning is logged.
    """
    parsed = safe_float(space)
    if parsed is not None:
        self.textstate.wordspace = parsed
        return
    log.warning(
        "Could not set word spacing because "
        f"{space!r} is an invalid float value"
    )
def do_Tz(self, scale: PDFStackT) -> None:
    """Set the horizontal scaling in the text state.

    :param scale: a number specifying the percentage of the normal width.
        When it cannot be converted to a float the text state is left
        untouched and a warning is logged.
    """
    parsed = safe_float(scale)
    if parsed is not None:
        self.textstate.scaling = parsed
        return
    log.warning(
        "Could not set horizontal scaling because "
        f"{scale!r} is an invalid float value"
    )
def do_TL(self, leading: PDFStackT) -> None:
    """Set the text leading in the text state.

    Text leading is used only by the T*, ', and " operators.

    :param leading: a number expressed in unscaled text space units. The
        text state stores the negated value. When the operand cannot be
        converted to a float a warning is logged and nothing is stored.
    """
    parsed = safe_float(leading)
    if parsed is not None:
        # The text state keeps the leading with its sign flipped.
        self.textstate.leading = -parsed
        return
    log.warning(
        "Could not set text leading because "
        f"{leading!r} is an invalid float value"
    )
def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None:
    """Set the text font and font size.

    :param fontid: the name of a font resource in the Font subdictionary
        of the current resource dictionary
    :param fontsize: a number representing a scale factor.
    :raises PDFInterpreterError: in strict mode, when the font id is not
        present in the font map.
    """
    try:
        font = self.fontmap[literal_name(fontid)]
    except KeyError as err:
        if settings.STRICT:
            raise PDFInterpreterError(f"Undefined Font id: {fontid!r}") from err
        # Non-strict mode: ask the resource manager for a font built
        # from an empty specification instead of failing.
        font = self.rsrcmgr.get_font(None, {})
    self.textstate.font = font

    size = safe_float(fontsize)
    if size is not None:
        self.textstate.fontsize = size
        return
    log.warning(
        f"Could not set text font because "
        f"{fontsize!r} is an invalid float value"
    )
def do_Tr(self, render: PDFStackT) -> None:
    """Set the text rendering mode in the text state.

    :param render: the rendering mode operand. When it cannot be converted
        to an int the text state is left untouched and a warning is logged.
    """
    mode = safe_int(render)
    if mode is not None:
        self.textstate.render = mode
        return
    log.warning(
        "Could not set text rendering mode because "
        f"{render!r} is an invalid int value"
    )
def do_Ts(self, rise: PDFStackT) -> None:
    """Set the text rise in the text state.

    :param rise: a number expressed in unscaled text space units. When it
        cannot be converted to a float the text state is left untouched and
        a warning is logged.
    """
    parsed = safe_float(rise)
    if parsed is not None:
        self.textstate.rise = parsed
        return
    log.warning(
        f"Could not set text rise because {rise!r} is an invalid float value"
    )
def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None:
    """Move to the start of the next line.

    Offset from the start of the current line by (tx , ty).

    :raises PDFValueError: in strict mode, when either offset cannot be
        converted to a float.
    """
    dx = safe_float(tx)
    dy = safe_float(ty)
    if dx is None or dy is None:
        if settings.STRICT:
            raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for Td")
    else:
        a, b, c, d, e, f = self.textstate.matrix
        # Only the translation components of the text matrix change.
        self.textstate.matrix = (
            a,
            b,
            c,
            d,
            dx * a + dy * c + e,
            dx * b + dy * d + f,
        )
    # The line matrix is reset even when the offsets were unusable.
    self.textstate.linematrix = (0, 0)
def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None:
    """Move to the start of the next line and set the leading.

    Offset from the start of the current line by (tx , ty). As a side
    effect, this operator sets the leading parameter in the text state.

    :param tx: horizontal offset operand.
    :param ty: vertical offset operand; also stored as the text leading
        when it can be converted to a float.
    :raises PDFValueError: in strict mode, when either offset cannot be
        converted to a float.
    """
    dx = safe_float(tx)
    dy = safe_float(ty)

    if dx is not None and dy is not None:
        a, b, c, d, e, f = self.textstate.matrix
        # Only the translation components of the text matrix change.
        self.textstate.matrix = (
            a,
            b,
            c,
            d,
            dx * a + dy * c + e,
            dx * b + dy * d + f,
        )
    elif settings.STRICT:
        # The message must be an f-string, otherwise the placeholders are
        # emitted literally instead of showing the offending operands.
        raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for TD")

    if dy is not None:
        self.textstate.leading = dy

    # The line matrix is reset even when the offsets were unusable.
    self.textstate.linematrix = (0, 0)
def do_Tm(
    self,
    a: PDFStackT,
    b: PDFStackT,
    c: PDFStackT,
    d: PDFStackT,
    e: PDFStackT,
    f: PDFStackT,
) -> None:
    """Set the text matrix and reset the text line matrix.

    The six operands are converted together; if any of them cannot be
    parsed, a warning is logged and the text state is left unchanged.
    """
    operands = (a, b, c, d, e, f)
    parsed = safe_matrix(*operands)
    if parsed is not None:
        self.textstate.matrix = parsed
        self.textstate.linematrix = (0, 0)
        return
    log.warning(
        f"Could not set text matrix because "
        f"not all values in {operands!r} can be parsed as floats"
    )
def do_T_a(self) -> None:
    """Move to the start of the next text line.

    The translation components of the text matrix are shifted by the
    current leading, and the text line matrix is reset to (0, 0).
    """
    state = self.textstate
    a, b, c, d, e, f = state.matrix
    lead = state.leading
    state.matrix = (a, b, c, d, lead * c + e, lead * d + f)
    state.linematrix = (0, 0)
def do_TJ(self, seq: PDFStackT) -> None:
    """Show text, allowing individual glyph positioning.

    :param seq: the text sequence handed to the device for rendering.
    :raises PDFInterpreterError: in strict mode, when no font has been set.
    """
    if self.textstate.font is not None:
        self.device.render_string(
            self.textstate,
            cast(PDFTextSeq, seq),
            self.graphicstate.ncs,
            self.graphicstate.copy(),
        )
    elif settings.STRICT:
        raise PDFInterpreterError("No font specified!")
def do_Tj(self, s: PDFStackT) -> None:
    """Show a single text string by delegating to the TJ handler."""
    single_item_seq = [s]
    self.do_TJ(single_item_seq)
def do__q(self, s: PDFStackT) -> None:
    """Move to the next line, then show the text string.

    The ' (single quote) operator.
    """
    self.do_T_a()
    text_seq = [s]
    self.do_TJ(text_seq)
def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None:
    """Set word and character spacing, move to next line, and show text.

    The " (double quote) operator. It behaves like the operator sequence
    ``aw Tw ac Tc string '``.

    :param aw: the word spacing operand, handled by ``do_Tw``.
    :param ac: the character spacing operand, handled by ``do_Tc``.
    :param s: the text string to show.
    """
    self.do_Tw(aw)
    self.do_Tc(ac)
    # The move to the next line was previously missing, so the text was
    # drawn on the current line instead of the following one.
    self.do_T_a()
    self.do_TJ([s])
def do_BI(self) -> None:
    """Begin inline image object.

    The operator takes no operands and this handler performs no work.
    """
    return None
def do_ID(self) -> None:
    """Begin inline image data.

    The operator takes no operands and this handler performs no work.
    """
    return None
def do_EI(self, obj: PDFStackT) -> None:
    """End inline image object and render it through the device.

    :param obj: the inline image; it is rendered only when it is a
        PDFStream containing both the "W" and "H" keys, otherwise it is
        ignored.
    """
    if not isinstance(obj, PDFStream):
        return
    if "W" not in obj or "H" not in obj:
        return
    # The Python object identity serves as the figure name.
    figure_name = str(id(obj))
    device = self.device
    device.begin_figure(figure_name, (0, 0, 1, 1), MATRIX_IDENTITY)
    device.render_image(figure_name, obj)
    device.end_figure(figure_name)
def do_Do(self, xobjid_arg: PDFStackT) -> None:
    """Invoke the named XObject.

    Form XObjects that carry a BBox are rendered by a sub-interpreter;
    Image XObjects that carry Width and Height are handed to the device.
    Any other XObject is ignored.

    :param xobjid_arg: the name of the XObject to look up in the XObject map.
    :raises PDFInterpreterError: in strict mode, when the name is unknown.
    """
    xobjid = literal_name(xobjid_arg)
    try:
        xobj = stream_value(self.xobjmap[xobjid])
    except KeyError as err:
        if settings.STRICT:
            raise PDFInterpreterError(f"Undefined xobject id: {xobjid!r}") from err
        return
    log.debug("Processing xobj: %r", xobj)
    subtype = xobj.get("Subtype")

    if subtype is LITERAL_FORM and "BBox" in xobj:
        interpreter = self.subinterp()
        bbox = cast(Rect, list_value(xobj["BBox"]))
        matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
        # According to PDF reference 1.7 section 4.9.1, XObjects in
        # earlier PDFs (prior to v1.2) use the page's Resources entry
        # instead of having their own Resources entry.
        xobjres = xobj.get("Resources")
        if xobjres:
            resources = dict_value(xobjres)
        else:
            resources = self.resources.copy()
        self.device.begin_figure(xobjid, bbox, matrix)
        interpreter.render_contents(
            resources,
            [xobj],
            ctm=mult_matrix(matrix, self.ctm),
        )
        self.device.end_figure(xobjid)
        return

    if subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
        self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
        self.device.render_image(xobjid, xobj)
        self.device.end_figure(xobjid)
    # Any other XObject type is unsupported and silently skipped.
def process_page(self, page: PDFPage) -> None:
    """Render one page through the device.

    The initial transformation matrix is chosen from the page's rotation
    (90, 180 or 270; any other value is treated as unrotated) and its
    media box, then the page contents are rendered between the device's
    begin_page and end_page calls.
    """
    log.debug("Processing page: %r", page)
    x0, y0, x1, y1 = page.mediabox
    rotation = page.rotate
    if rotation == 90:
        ctm = (0, -1, 1, 0, -y0, x1)
    elif rotation == 180:
        ctm = (-1, 0, 0, -1, x1, y1)
    elif rotation == 270:
        ctm = (0, 1, -1, 0, y1, -x0)
    else:
        ctm = (1, 0, 0, 1, -x0, -y0)
    device = self.device
    device.begin_page(page, ctm)
    self.render_contents(page.resources, page.contents, ctm=ctm)
    device.end_page(page)
def render_contents(
    self,
    resources: dict[object, object],
    streams: Sequence[object],
    ctm: Matrix = MATRIX_IDENTITY,
) -> None:
    """Render the content streams.

    Resources and interpreter state are initialised first, then the
    streams are executed. This method may be called recursively.

    :param resources: the resource dictionary to initialise from.
    :param streams: the content streams to execute.
    :param ctm: the initial transformation matrix.
    """
    log.debug(
        "render_contents: resources=%r, streams=%r, ctm=%r",
        resources,
        streams,
        ctm,
    )
    self.init_resources(resources)
    self.init_state(ctm)
    stream_list = list_value(streams)
    self.execute(stream_list)
def execute(self, streams: Sequence[object]) -> None:
    """Parse the given content streams and dispatch each operator.

    Operands are pushed onto the stack; each keyword is mapped to a
    ``do_*`` method, which is called with as many popped operands as it
    declares parameters. Unknown operators raise PDFInterpreterError in
    strict mode and are skipped otherwise.
    """
    # Detect and prevent circular references in content streams
    # (including Form XObjects).
    # We track stream IDs being executed in the current interpreter and
    # all parent interpreters. If a stream is already being processed
    # in the call stack, we skip
    # it to prevent infinite recursion (CWE-835 vulnerability).
    runnable: list[PDFStream] = []
    self.stream_ids.clear()
    for candidate in streams:
        stream = stream_value(candidate)
        if stream.objid is None:
            # Inline streams without object IDs can't be tracked for circular refs
            log.warning(
                "Execute called on non-indirect object (inline image?) %r", stream
            )
            continue
        if stream.objid in self.parent_stream_ids:
            log.warning(
                "Refusing to execute circular reference to content stream %d",
                stream.objid,
            )
            continue
        runnable.append(stream)
        self.stream_ids.add(stream.objid)

    try:
        parser = PDFContentParser(runnable)
    except PSEOF:
        # empty page
        return

    while True:
        try:
            (_, token) = parser.nextobject()
        except PSEOF:
            break

        if not isinstance(token, PSKeyword):
            # Anything that is not an operator is an operand.
            self.push(token)
            continue

        name = keyword_name(token)
        # Operators containing *, " or ' are mapped to valid identifiers.
        suffix = name.replace("*", "_a").replace('"', "_w").replace("'", "_q")
        method = f"do_{suffix}"

        if not hasattr(self, method):
            if settings.STRICT:
                error_msg = f"Unknown operator: {name!r}"
                raise PDFInterpreterError(error_msg)
            continue

        func = getattr(self, method)
        # Number of operands = declared positional parameters minus self.
        nargs = func.__code__.co_argcount - 1
        if not nargs:
            log.debug("exec: %s", name)
            func()
            continue

        args = self.pop(nargs)
        log.debug("exec: %s %r", name, args)
        # The operator is skipped when the stack held too few operands.
        if len(args) == nargs:
            func(*args)