1"""manage the PDF transform stack during "layout" mode text extraction"""
2
3from collections import ChainMap, Counter
4from collections import ChainMap as ChainMapType
5from collections import Counter as CounterType
6from collections.abc import MutableMapping
7from typing import Any, Union
8
9from ...errors import PdfReadError
10from .. import mult
11from ._font import Font
12from ._text_state_params import TextStateParams
13
# Type of the whole transform stack: a ChainMap whose layers use int keys 0-5
# for the a/b/c/d/e/f matrix values and str keys for the 'is_text' /
# 'is_render' flags (see TextStateManager.new_transform).
TextStateManagerChainMapType = ChainMapType[Union[int, str], Union[float, bool]]
# Type of a single transform layer as returned by TextStateManager.new_transform.
TextStateManagerDictType = MutableMapping[Union[int, str], Union[float, bool]]
16
17
class TextStateManager:
    """
    Tracks the current text state including cm/tm/trm transformation matrices.

    Attributes:
        transform_stack (ChainMap): ChainMap of cm/tm/trm transformation
            matrices; ``maps[0]`` is the most recently added transform.
        q_queue (Counter[int]): number of cm transforms added at each q
            nesting level (keyed by the levels stored in ``q_depth``).
        q_depth (list[int]): stack of q operator nesting levels; starts as [0].
        Tc (float): character spacing
        Tw (float): word spacing
        Tz (float): horizontal scaling
        TL (float): leading
        Ts (float): text rise
        font_stack (list[tuple[Font | None, int | float]]): (font, font_size)
            pairs saved by ``add_q`` and restored by ``remove_q``.
        font (Font | None): font object; None until ``set_font`` is called
        font_size (int | float): font size

    """

    def __init__(self) -> None:
        self.transform_stack: TextStateManagerChainMapType = ChainMap(
            self.new_transform()
        )
        self.q_queue: CounterType[int] = Counter()
        self.q_depth = [0]
        self.Tc: float = 0.0
        self.Tw: float = 0.0
        self.Tz: float = 100.0
        self.TL: float = 0.0
        self.Ts: float = 0.0
        self.font_stack: list[tuple[Union[Font, None], Union[int, float]]] = []
        self.font: Union[Font, None] = None
        self.font_size: Union[int, float] = 0

    def set_state_param(self, op: bytes, value: Union[float, list[Any]]) -> None:
        """
        Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators.

        Args:
            op: operator read from PDF stream as bytes. No action is taken
                for unsupported operators (see supported operators above).
            value (float | List[Any]): new parameter value. If a list,
                value[0] is used; an empty list leaves the parameter unchanged.

        """
        if op not in (b"Tc", b"Tz", b"Tw", b"TL", b"Ts"):
            return
        if isinstance(value, list):
            if not value:
                # Operator without operands: keep the previous value rather
                # than failing with an IndexError.
                return
            value = value[0]
        # The attribute names match the operator names exactly.
        setattr(self, op.decode(), value)

    def set_font(self, font: Font, size: float) -> None:
        """
        Set the current font and font_size.

        Args:
            font (Font): a layout mode Font
            size (float): font size

        """
        self.font = font
        self.font_size = size

    def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams:
        """
        Create a TextStateParams instance to display a text string.

        Bytes values are decoded implicitly: with ``font.encoding`` as a codec
        name when it is a str, otherwise by looking each byte up in
        ``font.encoding`` (bytes missing from it are decoded individually).
        If that raises a Unicode error the bytes are decoded as UTF-8 with
        replacement characters. Each decoded character is then mapped through
        ``font.char_map``. Str values are used unchanged.

        Args:
            value (str | bytes): text to associate with the captured state.

        Raises:
            PdfReadError: if font not set (no Tf operator in incoming pdf content stream)

        Returns:
            TextStateParams: current text state parameters

        """
        if not isinstance(self.font, Font):
            raise PdfReadError(
                "font not set: is PDF missing a Tf operator?"
            )  # pragma: no cover
        if isinstance(value, bytes):
            try:
                if isinstance(self.font.encoding, str):
                    txt = value.decode(self.font.encoding, "surrogatepass")
                else:
                    txt = "".join(
                        self.font.encoding[x]
                        if x in self.font.encoding
                        else bytes((x,)).decode()
                        for x in value
                    )
            except (UnicodeEncodeError, UnicodeDecodeError):
                txt = value.decode("utf-8", "replace")
            # Characters without a char_map entry are kept as-is.
            txt = "".join(self.font.char_map.get(x, x) for x in txt)
        else:
            txt = value
        return TextStateParams(
            txt,
            self.font,
            self.font_size,
            self.Tc,
            self.Tw,
            self.Tz,
            self.TL,
            self.Ts,
            self.effective_transform,
        )

    @staticmethod
    def raw_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
    ) -> dict[int, float]:
        """Only a/b/c/d/e/f matrix params, stored as floats under keys 0-5"""
        return dict(zip(range(6), map(float, (_a, _b, _c, _d, _e, _f))))

    @staticmethod
    def new_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
        is_text: bool = False,
        is_render: bool = False,
    ) -> TextStateManagerDictType:
        """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys"""
        result: Any = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f)
        result.update({"is_text": is_text, "is_render": is_render})
        return result

    def reset_tm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_text==True or is_render==True"""
        # Only the leading (most recent) text/render layers are dropped; the
        # loop stops at the first layer with neither flag set.
        while (
            self.transform_stack.maps[0]["is_text"]
            or self.transform_stack.maps[0]["is_render"]
        ):
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def reset_trm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_render==True"""
        # Stops at the first layer that is not a render transform.
        while self.transform_stack.maps[0]["is_render"]:
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def remove_q(self) -> TextStateManagerChainMapType:
        """
        Rewind to stack prior state after closing a 'q' with internal 'cm' ops.

        Restores the font/font_size saved by the matching ``add_q``, drops all
        text/render transforms and then the cm transforms recorded for the
        nesting level being closed. When there is no saved state (``remove_q``
        called without a prior ``add_q``) the call is a no-op.

        Returns:
            The updated transform stack.

        """
        if not self.font_stack:
            # Presumably an unbalanced 'Q' in a malformed content stream:
            # nothing was saved, so leave the state untouched instead of
            # raising IndexError.
            return self.transform_stack
        self.font, self.font_size = self.font_stack.pop()
        self.transform_stack = self.reset_tm()
        # Number of cm layers pushed at the level being closed (0 if none).
        cm_count = self.q_queue.pop(self.q_depth.pop(), 0)
        self.transform_stack.maps = self.transform_stack.maps[cm_count:]
        return self.transform_stack

    def add_q(self) -> None:
        """Save the current font/font_size and add another level to q_depth"""
        self.font_stack.append((self.font, self.font_size))
        self.q_depth.append(len(self.q_depth))

    def add_cm(self, *args: Any) -> TextStateManagerChainMapType:
        """
        Concatenate an additional transform matrix.

        Any text/render transforms on top of the stack are cleared first, and
        the cm is counted against the current q nesting level so that
        ``remove_q`` can undo it.

        Args:
            *args: positional arguments forwarded to ``new_transform``.

        Returns:
            The updated transform stack.

        """
        self.transform_stack = self.reset_tm()
        self.q_queue.update(self.q_depth[-1:])
        self.transform_stack = self.transform_stack.new_child(self.new_transform(*args))
        return self.transform_stack

    def _complete_matrix(self, operands: list[float]) -> list[float]:
        """Adds a, b, c, and d to an "e/f only" operand set (e.g Td)"""
        if len(operands) == 2:  # this is a Td operator or equivalent
            operands = [1.0, 0.0, 0.0, 1.0, *operands]
        return operands

    def add_tm(self, operands: list[float]) -> TextStateManagerChainMapType:
        """Append a text transform matrix (layer flagged is_text=True)"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    def add_trm(self, operands: list[float]) -> TextStateManagerChainMapType:
        """Append a text rendering transform matrix (is_text and is_render set)"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True, is_render=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    @property
    def effective_transform(self) -> list[float]:
        """
        Current effective transform accounting for cm, tm, and trm transforms.

        Returns:
            The six a/b/c/d/e/f values obtained by multiplying the newest
            transform with every older layer in turn.

        """
        newest = self.transform_stack.maps[0]
        # Read only keys 0-5 so the 'is_text'/'is_render' flags never end up
        # in the result, even when the stack holds a single layer.
        eff_transform: list[float] = [float(newest[key]) for key in range(6)]
        for transform in self.transform_stack.maps[1:]:
            eff_transform = mult(eff_transform, transform)  # type: ignore[arg-type]  # dict has int keys 0-5
        return eff_transform