1# SPDX-FileCopyrightText: 2025 @rakurtz
2# SPDX-FileCopyrightText: 2025 James R. Barlow
3# SPDX-License-Identifier: MPL-2.0
4
5"""Parsing the matrixes in a PDF file."""
6
7from __future__ import annotations
8
9from logging import getLogger
10
11from pikepdf._core import Matrix, Page
12from pikepdf.models._content_stream import parse_content_stream
13from pikepdf.objects import Operator
14
# Name the logger after the module's dotted import path rather than its file
# path, so that it is part of the package's logger hierarchy and honours the
# levels/handlers an application configures for the package.
logger = getLogger(__name__)

# Content stream operators that modify or depend on the CTM
# (Current Transformation Matrix).
OPERATOR_CM = Operator('cm')  # "Concatenate Matrix": changes the CTM
OPERATOR_DO = Operator('Do')  # "Do": draws the named object (e.g. an image)
OPERATOR_STACK = Operator('q')  # Stores the CTM to a stack
OPERATOR_POP = Operator('Q')  # Restores the previous CTM
22
23
class MatrixStack:
    """Keeps track of the CTM (current transformation matrix) of a content stream.

    The stack starts out holding only the initial matrix, which is the identity
    matrix unless another one is supplied. The top of the stack is the CTM.

    The 'cm' (concatenate matrix) operator changes the CTM:
        CTM = CTM x CM   (both being 3x3 matrixes)

    The 'q' operator saves the CTM: the current CTM is pushed onto the stack a
    second time, and subsequent 'cm' operators only change that top entry.
        --> 'q 1 0 0 1 0 0 cm'
        --> Push the CTM again and change the new top entry via 'cm'

    The 'Q' operator discards the top entry, which makes the previously saved
    CTM the current one again.

    Error handling:
        1. Popping when only one entry is left resets the CTM to the initial
           matrix
        2. Multiplying with invalid operands sets the CTM to invalid
        3. Multiplying an invalid CTM with a valid CM leaves the CTM invalid
        4. Stacking an invalid CTM pushes another invalid entry onto the stack
        --> All operations with an invalid CTM result in an invalid CTM
        --> The CTM is valid again once all invalid entries are popped off
    """

    def __init__(self, initial_matrix: Matrix = Matrix.identity()) -> None:
        """Create a stack whose only entry is `initial_matrix`."""
        self._initial_matrix = initial_matrix
        # `None` entries represent an invalid CTM.
        self._stack: list[Matrix | None] = [initial_matrix]

    def stack(self):
        """Push the current CTM onto the stack once more.

        The entry is pushed as-is (valid or invalid); no new matrix object is
        created here.
        """
        top = self._stack[-1]
        self._stack.append(top)

    def pop(self):
        """Discard the current CTM and fall back to the one saved before it.

        The stack never underflows: if only a single entry is left, the stack
        is reset to hold just the initial matrix instead. Some PDFs contain
        invalid content streams with more 'Q' than 'q' operators, and the
        initial matrix is the safe fallback for them.
        """
        depth = len(self._stack)
        assert depth >= 1, "can't be empty"
        if depth > 1:
            self._stack.pop()
            return
        # Would underflow: start over from the initial matrix.
        self._stack = [self._initial_matrix]

    def multiply(self, matrix: Matrix):
        """Replace the CTM with `CTM @ matrix`; nothing is returned.

        An invalid CTM stays invalid, so the call is a no-op in that case.
        """
        current = self._stack[-1]
        if current is not None:
            self._stack[-1] = current @ matrix

    def invalidate_current_transformation_matrix(self):
        """Mark the current CTM as invalid after an invalid CM was encountered.

        The CTM reads as `None` until the invalid entry is popped off the stack.
        """
        self._stack[-1] = None

    @property
    def ctm(self) -> Matrix | None:
        """The current transformation matrix, or `None` if it is invalid."""
        return self._stack[-1]
89
90
def get_objects_with_ctm(
    page: Page, initial_matrix: Matrix = Matrix.identity()
) -> list[tuple[str, Matrix]]:
    """Determines the current transformation matrix (CTM) for each drawn object.

    Parses the content stream of `page`, tracks the CTM through the `q`, `Q`
    and `cm` operators and records the CTM in effect at every `Do` operator.

    Args:
        page: The page whose content stream is parsed.
        initial_matrix: The CTM in effect before the first instruction.

    Returns:
        A list of `(name, ctm)` tuples in content stream order, where `name`
        is the string form of the first operand of the `Do` operator.
        Objects drawn while the CTM is invalid (i.e. after a malformed `cm`
        that has not been popped off yet) are filtered out, as are malformed
        `Do` operators that have no operand at all.
    """
    objects_with_ctm: list[tuple[str, Matrix]] = []
    matrix_stack = MatrixStack(initial_matrix)
    for inst in parse_content_stream(page):
        operator, operands = inst.operator, inst.operands
        if operator == OPERATOR_STACK:
            matrix_stack.stack()

        elif operator == OPERATOR_POP:
            matrix_stack.pop()

        elif operator == OPERATOR_CM:
            try:
                matrix_stack.multiply(Matrix(*operands))
            # NOTE(review): TypeError is what the original code handled;
            # ValueError is caught as well in case the Matrix constructor
            # rejects some malformed operands that way -- confirm against the
            # Matrix constructor.
            except (TypeError, ValueError):
                logger.debug("malformed operands for `cm` operator: %s", operands)
                matrix_stack.invalidate_current_transformation_matrix()

        elif operator == OPERATOR_DO:
            try:
                name = str(operands[0])  # Name of the image (or other object)
            except IndexError:
                # A `Do` without an operand is malformed; skip it instead of
                # aborting the whole page, like the other error cases.
                logger.debug("skipping `Do` operator without an operand")
                continue

            ctm = matrix_stack.ctm
            if ctm is None:
                logger.debug(
                    "skipping `Do` operator due to invalid CTM for object: %s",
                    name,
                )
            else:
                # The matrix object is stored as-is (no copy is made).
                objects_with_ctm.append((name, ctm))

    return objects_with_ctm