1"""manage the PDF transform stack during "layout" mode text extraction"""
2
3from collections import ChainMap, Counter
4from typing import Any, Dict, List, MutableMapping, Tuple, Union
5from typing import ChainMap as ChainMapType
6from typing import Counter as CounterType
7
8from ...errors import PdfReadError
9from .. import mult
10from ._font import Font
11from ._text_state_params import TextStateParams
12
# Type of the layered transform stack: a ChainMap whose layers use integer keys
# for matrix entries (float values) and string keys for flags (bool values).
TextStateManagerChainMapType = ChainMapType[Union[int, str], Union[float, bool]]
# Type of a single transform layer, as produced by TextStateManager.new_transform:
# integer keys 0-5 plus the "is_text" and "is_render" flag keys.
TextStateManagerDictType = MutableMapping[Union[int, str], Union[float, bool]]
15
16
class TextStateManager:
    """
    Tracks the current text state including cm/tm/trm transformation matrices.

    Attributes:
        transform_stack (ChainMap): ChainMap of cm/tm/trm transformation
            matrices; the most recently added transform is ``maps[0]``
        q_queue (Counter[int]): number of cm operators seen at each q
            nesting level (keyed by nesting level)
        q_depth (List[int]): list of q operator nesting levels
        Tc (float): character spacing
        Tw (float): word spacing
        Tz (float): horizontal scaling
        TL (float): leading
        Ts (float): text rise
        font_stack (List[Tuple[Font | None, int | float]]): (font, font_size)
            pairs saved by ``add_q`` and restored by ``remove_q``
        font (Font | None): font object
        font_size (int | float): font size

    """

    def __init__(self) -> None:
        self.transform_stack: TextStateManagerChainMapType = ChainMap(
            self.new_transform()
        )
        self.q_queue: CounterType[int] = Counter()
        self.q_depth = [0]
        self.Tc: float = 0.0
        self.Tw: float = 0.0
        self.Tz: float = 100.0
        self.TL: float = 0.0
        self.Ts: float = 0.0
        self.font_stack: List[Tuple[Union[Font, None], Union[int, float]]] = []
        self.font: Union[Font, None] = None
        self.font_size: Union[int, float] = 0

    def set_state_param(self, op: bytes, value: Union[float, List[Any]]) -> None:
        """
        Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators.

        Args:
            op: operator read from PDF stream as bytes. No action is taken
                for unsupported operators (see supported operators above).
            value (float | List[Any]): new parameter value. If a list,
                value[0] is used.

        """
        if op not in (b"Tc", b"Tz", b"Tw", b"TL", b"Ts"):
            return
        # The operator name doubles as the attribute name (e.g. b"Tc" -> self.Tc).
        setattr(self, op.decode(), value[0] if isinstance(value, list) else value)

    def set_font(self, font: Font, size: float) -> None:
        """
        Set the current font and font_size.

        Args:
            font (Font): a layout mode Font
            size (float): font size

        """
        self.font = font
        self.font_size = size

    def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams:
        """
        Create a TextStateParams instance to display a text string. Type[bytes] values
        will be decoded implicitly.

        Bytes are decoded with ``font.encoding`` when it is a codec name (str);
        otherwise ``font.encoding`` is used as a lookup from byte value to text.
        If that raises a Unicode error, the bytes are decoded as UTF-8 with
        replacement characters. Each decoded character is then translated
        through ``font.char_map``. Str values are used unchanged.

        Args:
            value (str | bytes): text to associate with the captured state.

        Raises:
            PdfReadError: if font not set (no Tf operator in incoming pdf content stream)

        Returns:
            TextStateParams: current text state parameters

        """
        if not isinstance(self.font, Font):
            raise PdfReadError(
                "font not set: is PDF missing a Tf operator?"
            )  # pragma: no cover
        if isinstance(value, bytes):
            try:
                if isinstance(self.font.encoding, str):
                    txt = value.decode(self.font.encoding, "surrogatepass")
                else:
                    # Byte values missing from the encoding table are decoded
                    # individually with the default codec.
                    txt = "".join(
                        self.font.encoding[x]
                        if x in self.font.encoding
                        else bytes((x,)).decode()
                        for x in value
                    )
            except (UnicodeEncodeError, UnicodeDecodeError):
                txt = value.decode("utf-8", "replace")
            txt = "".join(self.font.char_map.get(x, x) for x in txt)
        else:
            txt = value
        return TextStateParams(
            txt,
            self.font,
            self.font_size,
            self.Tc,
            self.Tw,
            self.Tz,
            self.TL,
            self.Ts,
            self.effective_transform,
        )

    @staticmethod
    def raw_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
    ) -> Dict[int, float]:
        """Only a/b/c/d/e/f matrix params, stored as floats under keys 0-5"""
        return dict(zip(range(6), map(float, (_a, _b, _c, _d, _e, _f))))

    @staticmethod
    def new_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
        is_text: bool = False,
        is_render: bool = False,
    ) -> TextStateManagerDictType:
        """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys"""
        result: Any = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f)
        result.update({"is_text": is_text, "is_render": is_render})
        return result

    def reset_tm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_text==True or is_render==True"""
        while (
            self.transform_stack.maps[0]["is_text"]
            or self.transform_stack.maps[0]["is_render"]
        ):
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def reset_trm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_render==True"""
        while self.transform_stack.maps[0]["is_render"]:
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def remove_q(self) -> TextStateManagerChainMapType:
        """
        Rewind to stack prior state after closing a 'q' with internal 'cm' ops.

        Restores the font and font size saved by the matching ``add_q``, clears
        any text transforms, and drops the cm transforms that were added at the
        nesting level being closed.

        A 'Q' without a matching 'q' (nothing saved in ``font_stack``) is
        ignored and the current state is left untouched instead of raising
        IndexError.

        Returns:
            TextStateManagerChainMapType: the updated transform stack

        """
        if not self.font_stack:
            # Unbalanced 'Q': there is no saved state to restore.
            return self.transform_stack
        self.font, self.font_size = self.font_stack.pop()
        self.transform_stack = self.reset_tm()
        # Number of cm layers pushed at the level being closed (0 if none).
        cm_count = self.q_queue.pop(self.q_depth.pop(), 0)
        self.transform_stack.maps = self.transform_stack.maps[cm_count:]
        return self.transform_stack

    def add_q(self) -> None:
        """Save the current font/font_size and add another level to q_depth"""
        self.font_stack.append((self.font, self.font_size))
        self.q_depth.append(len(self.q_depth))

    def add_cm(self, *args: Any) -> TextStateManagerChainMapType:
        """
        Concatenate an additional transform matrix.

        Text transforms are cleared first, the cm is counted against the
        current q nesting level, and the new matrix becomes the top layer.

        Args:
            *args: matrix params forwarded to ``new_transform``

        Returns:
            TextStateManagerChainMapType: the updated transform stack

        """
        self.transform_stack = self.reset_tm()
        # The slice keeps this safe even if q_depth is empty.
        self.q_queue.update(self.q_depth[-1:])
        self.transform_stack = self.transform_stack.new_child(self.new_transform(*args))
        return self.transform_stack

    def _complete_matrix(self, operands: List[float]) -> List[float]:
        """Adds a, b, c, and d to an "e/f only" operand set (e.g Td)"""
        if len(operands) == 2:  # this is a Td operator or equivalent
            operands = [1.0, 0.0, 0.0, 1.0, *operands]
        return operands

    def add_tm(self, operands: List[float]) -> TextStateManagerChainMapType:
        """Append a text transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    def add_trm(self, operands: List[float]) -> TextStateManagerChainMapType:
        """Append a text rendering transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True, is_render=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    @property
    def effective_transform(self) -> List[float]:
        """
        Current effective transform accounting for cm, tm, and trm transforms.

        Returns:
            List[float]: the six a/b/c/d/e/f values obtained by multiplying
            the layers of the transform stack from newest to oldest.

        """
        layers = self.transform_stack.maps
        # Read the matrix entries by key so the 'is_text'/'is_render' flags
        # never end up in the result, even when only one layer is present.
        eff_transform: List[float] = [layers[0][idx] for idx in range(6)]
        for transform in layers[1:]:
            eff_transform = mult(eff_transform, transform)  # type: ignore[arg-type]  # dict has int keys 0-5
        return eff_transform