1# SPDX-FileCopyrightText: 2022 James R. Barlow
2# SPDX-License-Identifier: MPL-2.0
3
4"""Content stream parsing."""
5
6from __future__ import annotations
7
8from collections.abc import Collection
9from typing import TYPE_CHECKING, Union, cast
10
11from pikepdf._core import (
12 ContentStreamInlineImage,
13 ContentStreamInstruction,
14 Object,
15 ObjectType,
16 Page,
17 PdfError,
18 _unparse_content_stream,
19)
20from pikepdf.objects import Operator
21
22if TYPE_CHECKING:
23 from pikepdf.models.image import PdfInlineImage
24
# Legacy instruction shape (the operand-operator tuples of pikepdf 2.x):
# a collection of operands paired with an operator, i.e. (operands, operator).
_OldContentStreamOperands = Collection[Union[Object, 'PdfInlineImage']]
_OldContentStreamInstructions = tuple[_OldContentStreamOperands, Operator]

# Element type of the list returned by parse_content_stream().
ContentStreamInstructions = Union[ContentStreamInstruction, ContentStreamInlineImage]

# Element type accepted by unparse_content_stream(): either the current
# instruction objects or the legacy (operands, operator) tuples.
UnparseableContentStreamInstructions = Union[
    ContentStreamInstructions, _OldContentStreamInstructions
]
34
35
class PdfParsingError(Exception):
    """Error when parsing a PDF content stream.

    Args:
        message: Description of the failure. If omitted or empty, a default
            message is generated.
        line: Optional line reference for the failure. It is included in the
            default message when supplied, and is always stored on the
            exception as the ``line`` attribute.
    """

    def __init__(self, message=None, line=None):
        if not message:
            # Only mention the line when one was actually supplied, so the
            # default message never reads "... at line None".
            message = "Error encoding content stream"
            if line is not None:
                message += f" at line {line}"
        super().__init__(message)
        self.line = line
44
45
def parse_content_stream(
    page_or_stream: Object | Page, operators: str = ''
) -> list[ContentStreamInstructions]:
    """Parse a PDF content stream into a sequence of instructions.

    A PDF content stream is a list of instructions that describe where to render
    the text and graphics in a PDF. This is the starting point for analyzing
    PDFs.

    If the input is a page and page.Contents is an array, then the content
    stream is automatically treated as one coalesced stream.

    Each instruction contains at least one operator and zero or more operands.

    This function does not have anything to do with opening a PDF file itself or
    processing data from a whole PDF. It is for processing a specific object inside
    a PDF that is already opened.

    Args:
        page_or_stream: A page object, or the content
            stream attached to another object such as a Form XObject.
        operators: A space-separated string of operators to whitelist.
            For example 'q Q cm Do' will return only operators
            that pertain to drawing images. Use 'BI ID EI' for inline images.
            All other operators and associated tokens are ignored. If blank,
            all tokens are accepted.

    Returns:
        The parsed instructions, in the order produced by the parser.

    Raises:
        TypeError: If ``page_or_stream`` is not a ``pikepdf.Object`` or
            ``pikepdf.Page``, if it is an object that is neither a stream nor
            a page, or if the parser reports that the contents are not a
            stream or an array.
        PdfError: Any other parser error is re-raised unchanged.

    Example:
        >>> with pikepdf.Pdf.open("../tests/resources/pal-1bit-trivial.pdf") as pdf:
        ...     page = pdf.pages[0]
        ...     for operands, command in pikepdf.parse_content_stream(page):
        ...         print(command)
        q
        cm
        Do
        Q

    .. versionchanged:: 3.0
        Returns a list of ``ContentStreamInstructions`` instead of a list
        of (operand, operator) tuples. The returned items are duck-type compatible
        with the previous returned items.
    """
    if not isinstance(page_or_stream, (Object, Page)):
        raise TypeError("stream must be a pikepdf.Object or pikepdf.Page")

    if (
        isinstance(page_or_stream, Object)
        and page_or_stream._type_code != ObjectType.stream
        and page_or_stream.get('/Type') != '/Page'
    ):
        raise TypeError("parse_content_stream called on page or stream object")

    # From here on work with the underlying object, whichever form was passed.
    if isinstance(page_or_stream, Page):
        page_or_stream = page_or_stream.obj

    try:
        if page_or_stream.get('/Type') == '/Page':
            page = page_or_stream
            instructions = cast(
                list[ContentStreamInstructions],
                page._parse_page_contents_grouped(operators),
            )
        else:
            stream = page_or_stream
            instructions = cast(
                list[ContentStreamInstructions],
                Object._parse_stream_grouped(stream, operators),
            )
    except PdfError as e:
        # Translate the one parser complaint that really means "wrong kind of
        # object" into the TypeError the other input checks raise.
        if 'supposed to be a stream or an array' in str(e):
            raise TypeError("parse_content_stream called on non-stream Object") from e
        # Bare raise re-raises the active exception as-is; "raise e from e"
        # would make the exception its own __cause__.
        raise

    return instructions
120
121
def unparse_content_stream(
    instructions: Collection[UnparseableContentStreamInstructions],
) -> bytes:
    """Serialize a collection of instructions into content stream bytes.

    This is the inverse of :func:`parse_content_stream()`: the given
    instructions are written out as binary data that can be stored in a PDF.
    In PDF syntax the operands come first and the operator follows them.

    Args:
        instructions: collection of instructions, for example the list
            returned by :func:`parse_content_stream()`

    Returns:
        A binary content stream, suitable for attaching to a Pdf.
        To attach to a Pdf, use :meth:`Pdf.make_stream()`.

    Raises:
        PdfParsingError: If serialization fails with a ``ValueError``,
            ``TypeError`` or ``RuntimeError``; the original exception is
            chained as the cause.

    .. versionchanged:: 3.0
        Now accept collections that contain any mixture of
        ``ContentStreamInstruction``, ``ContentStreamInlineImage``, and the older
        operand-operator tuples from pikepdf 2.x.
    """
    try:
        data = _unparse_content_stream(instructions)
    except (ValueError, TypeError, RuntimeError) as exc:
        # Wrap the low-level failure in this module's own error type.
        failure = PdfParsingError(
            "While unparsing a content stream, an error occurred"
        )
        raise failure from exc
    return data