1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3#
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are
8# met:
9#
10# * Redistributions of source code must retain the above copyright notice,
11# this list of conditions and the following disclaimer.
12# * Redistributions in binary form must reproduce the above copyright notice,
13# this list of conditions and the following disclaimer in the documentation
14# and/or other materials provided with the distribution.
15# * The name of the author may not be used to endorse or promote products
16# derived from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28# POSSIBILITY OF SUCH DAMAGE.
29
30import math
31from typing import Any, Callable, Optional, Union
32
33from .._font import Font, FontDescriptor
34from ..generic import DictionaryObject, TextStringObject
35from . import OrientationNotFoundError, crlf_space_check, get_display_str, get_text_operands, mult
36
37
class TextExtraction:
    """
    A class to handle PDF text extraction operations.

    This class encapsulates all the state and operations needed for extracting
    text from PDF content streams, replacing the nested functions and nonlocal
    variables in the original implementation.

    Usage: call :meth:`initialize_extraction`, then feed each
    ``(operator, operands)`` pair of the content stream to
    :meth:`process_operation`. Flushed text accumulates in ``output``; text
    that is still being built is held in ``text``.
    """

    # Operators after which `process_operation` runs
    # `_post_process_text_operation` on the pending text.
    _POSITIONING_OPERATORS = frozenset((b"Td", b"Tm", b"T*", b"Tj"))

    @staticmethod
    def _identity_matrix() -> list[float]:
        """Return a fresh 6-element identity matrix ``[a, b, c, d, e, f]``."""
        return [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

    def __init__(self) -> None:
        """Create an extractor with identity matrices and a placeholder font."""
        self._font_width_maps: dict[str, tuple[dict[Any, float], str, float]] = {}

        # Text extraction state variables
        self.cm_matrix: list[float] = self._identity_matrix()
        self.tm_matrix: list[float] = self._identity_matrix()
        # Entries pushed by `q` and popped by `Q`:
        # (cm_matrix, font_resource, font, font_size, char_scale, space_scale, TL)
        self.cm_stack: list[
            tuple[
                list[float],
                Optional[DictionaryObject],
                Font,
                float,
                float,
                float,
                float,
            ]
        ] = []

        # Store the last modified matrices; can be an intermediate position
        self.cm_prev: list[float] = self._identity_matrix()
        self.tm_prev: list[float] = self._identity_matrix()

        # Store the position at the beginning of building the text
        self.memo_cm: list[float] = self._identity_matrix()
        self.memo_tm: list[float] = self._identity_matrix()

        self.char_scale = 1.0
        self.space_scale = 1.0
        self._space_width: float = 500.0  # will be set correctly at first Tf
        self._actual_str_size: dict[str, float] = {
            "str_widths": 0.0,
            "str_height": 0.0,
        }  # will be set to string length calculation result
        self.TL = 0.0
        self.font_size = 12.0  # initial value, just in case no Tf sets a size

        # Text extraction variables
        self.text: str = ""
        self.output: str = ""
        self.rtl_dir: bool = False  # right-to-left
        self.font_resource: Optional[DictionaryObject] = None
        self.font = Font(
            name="NotInitialized",
            sub_type="Unknown",
            encoding="charmap",
            font_descriptor=FontDescriptor(),
        )
        self.orientations: tuple[int, ...] = (0, 90, 180, 270)
        self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None
        self.font_resources: dict[str, DictionaryObject] = {}
        self.fonts: dict[str, Font] = {}

        # Dispatch table: content-stream operator -> bound handler. Handlers
        # return a string width (float) or None.
        self.operation_handlers: dict[bytes, Callable[[list[Any]], Optional[float]]] = {
            b"BT": self._handle_bt,
            b"ET": self._handle_et,
            b"q": self._handle_save_graphics_state,
            b"Q": self._handle_restore_graphics_state,
            b"cm": self._handle_cm,
            b"Tz": self._handle_tz,
            b"Tw": self._handle_tw,
            b"TL": self._handle_tl,
            b"Tf": self._handle_tf,
            b"Td": self._handle_td,
            b"Tm": self._handle_tm,
            b"T*": self._handle_t_star,
            b"Tj": self._handle_tj_operation,
        }

    def initialize_extraction(
        self,
        orientations: tuple[int, ...] = (0, 90, 180, 270),
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
        font_resources: Optional[dict[str, DictionaryObject]] = None,
        fonts: Optional[dict[str, Font]] = None
    ) -> None:
        """
        Initialize the extractor with extraction parameters.

        Args:
            orientations: Orientations forwarded to the text helpers.
            visitor_text: Optional callback invoked as
                ``visitor_text(text, memo_cm, memo_tm, font_resource, font_size)``
                whenever accumulated text is flushed.
            font_resources: Mapping used by ``Tf`` to look up the font resource
                dictionary by font name; ``None`` means an empty mapping.
            fonts: Mapping used by ``Tf`` to look up the ``Font`` by font name;
                ``None`` means an empty mapping.

        Only ``text``, ``output`` and ``rtl_dir`` are reset here; matrices, the
        graphics-state stack and font state keep their current values.
        """
        self.orientations = orientations
        self.visitor_text = visitor_text
        # `is None` (not `or`) so that a caller-supplied empty dict is kept.
        self.font_resources = {} if font_resources is None else font_resources
        self.fonts = {} if fonts is None else fonts

        # Reset state
        self.text = ""
        self.output = ""
        self.rtl_dir = False

    def compute_str_widths(self, str_widths: float) -> float:
        """Return ``str_widths`` divided by 1000."""
        return str_widths / 1000

    def process_operation(self, operator: bytes, operands: list[Any]) -> None:
        """
        Dispatch one content-stream operation to its handler.

        Operators without an entry in ``operation_handlers`` are ignored.
        After ``Td``, ``Tm``, ``T*`` and ``Tj`` the handler's return value
        (``None`` is treated as ``0.0``) is passed to
        :meth:`_post_process_text_operation`.
        """
        handler = self.operation_handlers.get(operator)
        if handler is None:
            return
        str_widths = handler(operands)

        # Post-process operations that affect text positioning
        if operator in self._POSITIONING_OPERATORS:
            self._post_process_text_operation(str_widths or 0.0)

    def _post_process_text_operation(self, str_widths: float) -> None:
        """
        Handle common post-processing for text positioning operations.

        Delegates to ``crlf_space_check`` and stores the text, output and
        previous matrices it returns. When the resulting text is empty, the
        memo matrices are moved to the current position.
        """
        try:
            self.text, self.output, self.cm_prev, self.tm_prev = crlf_space_check(
                self.text,
                (self.cm_prev, self.tm_prev),
                (self.cm_matrix, self.tm_matrix),
                (self.memo_cm, self.memo_tm),
                self.font_resource,
                self.orientations,
                self.output,
                self.font_size,
                self.visitor_text,
                str_widths,
                self.compute_str_widths(self.font_size * self._space_width),
                self._actual_str_size["str_height"],
            )
        except OrientationNotFoundError:
            # Best effort: leave the state untouched.
            # NOTE(review): presumably raised for text whose orientation is
            # not in self.orientations - confirm against crlf_space_check.
            return
        if self.text == "":
            self.memo_cm = self.cm_matrix.copy()
            self.memo_tm = self.tm_matrix.copy()

    def _handle_tj(
        self,
        text: str,
        operands: list[Union[str, TextStringObject]],
        cm_matrix: list[float],
        tm_matrix: list[float],
        font_resource: Optional[DictionaryObject],
        font: Font,
        orientations: tuple[int, ...],
        font_size: float,
        rtl_dir: bool,
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
        actual_str_size: dict[str, float],
    ) -> tuple[str, bool, dict[str, float]]:
        """
        Append the text shown by a ``Tj`` operation and update the size record.

        The operands are resolved by ``get_text_operands``. If it reports plain
        string operands, they are appended to ``text`` directly and their width
        is summed per character (``font.space_width`` for ``" "``,
        ``font.text_width`` otherwise); if not, ``get_display_str`` builds
        the text, direction and width.

        Returns:
            ``(text, rtl_dir, actual_str_size)``. ``actual_str_size`` is the
            same dict that was passed in, mutated in place: ``"str_widths"``
            is increased by ``width * font_size`` and ``"str_height"`` is set
            to ``font_size``.
        """
        text_operands, is_str_operands = get_text_operands(
            operands, cm_matrix, tm_matrix, font, orientations
        )
        if is_str_operands:
            text += text_operands
            font_widths = sum(
                font.space_width if char == " " else font.text_width(char)
                for char in text_operands
            )
        else:
            text, rtl_dir, font_widths = get_display_str(
                text,
                cm_matrix,
                tm_matrix,  # text matrix
                font_resource,
                font,
                text_operands,
                font_size,
                rtl_dir,
                visitor_text,
            )
        actual_str_size["str_widths"] += font_widths * font_size
        actual_str_size["str_height"] = font_size
        return text, rtl_dir, actual_str_size

    def _flush_text(self) -> None:
        """
        Flush accumulated text to output and call visitor if present.

        The visitor is called even when the accumulated text is empty.
        Afterwards ``text`` is cleared and the memo matrices are set to copies
        of the current matrices.
        """
        self.output += self.text
        if self.visitor_text is not None:
            self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size)
        self.text = ""
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()

    def _consume_str_widths(self) -> float:
        """Return the pending ``str_widths`` via ``compute_str_widths`` and reset it to 0."""
        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
        self._actual_str_size["str_widths"] = 0.0
        return str_widths

    # Operation handlers

    def _handle_bt(self, operands: list[Any]) -> None:
        """Handle BT (Begin Text) operation - Table 5.4 page 405."""
        # Reset the text matrix first so the flush memoizes the reset matrix.
        self.tm_matrix = self._identity_matrix()
        self._flush_text()

    def _handle_et(self, operands: list[Any]) -> None:
        """Handle ET (End Text) operation - Table 5.4 page 405."""
        self._flush_text()

    def _handle_save_graphics_state(self, operands: list[Any]) -> None:
        """Handle q (Save graphics state) operation - Table 4.7 page 219."""
        self.cm_stack.append(
            (
                self.cm_matrix,
                self.font_resource,
                self.font,
                self.font_size,
                self.char_scale,
                self.space_scale,
                self.TL,
            )
        )

    def _handle_restore_graphics_state(self, operands: list[Any]) -> None:
        """
        Handle Q (Restore graphics state) operation - Table 4.7 page 219.

        If there is nothing to restore (unbalanced ``Q``), only the current
        matrix is reset to identity; the other state is left as it is.
        """
        try:
            (
                self.cm_matrix,
                self.font_resource,
                self.font,
                self.font_size,
                self.char_scale,
                self.space_scale,
                self.TL,
            ) = self.cm_stack.pop()
        except IndexError:  # empty stack
            self.cm_matrix = self._identity_matrix()

    def _handle_cm(self, operands: list[Any]) -> None:
        """
        Handle cm (Modify current matrix) operation - Table 4.7 page 219.

        Pending text is flushed with the old matrix, then the first six
        operands are combined with the current matrix through ``mult``.
        """
        self._flush_text()
        try:
            self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix)
        except Exception:
            # Deliberately broad best effort: malformed operands, or whatever
            # `mult` raises for them, fall back to the identity matrix.
            self.cm_matrix = self._identity_matrix()
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()

    def _handle_tz(self, operands: list[Any]) -> None:
        """Handle Tz (Set horizontal text scaling) operation - Table 5.2 page 398."""
        self.char_scale = float(operands[0]) / 100 if operands else 1.0

    def _handle_tw(self, operands: list[Any]) -> None:
        """Handle Tw (Set word spacing) operation - Table 5.2 page 398."""
        self.space_scale = 1.0 + float(operands[0] if operands else 0.0)

    def _handle_tl(self, operands: list[Any]) -> None:
        """
        Handle TL (Set Text Leading) operation - Table 5.2 page 398.

        The operand (0.0 if missing) is multiplied by the current font size
        and by a scale factor derived from the text matrix.
        """
        scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2)
        self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x

    def _handle_tf(self, operands: list[Any]) -> None:
        """
        Handle Tf (Set font size) operation - Table 5.2 page 398.

        Non-empty pending text is flushed first. The font named by
        ``operands[0]`` is looked up in ``font_resources`` and ``fonts``; if it
        is missing (or the operand itself is missing or unusable), an "Unknown"
        fallback font is used. ``operands[1]`` becomes the font size; if it is
        missing or not a number, the previous size is kept.
        """
        if self.text:
            self._flush_text()
        else:
            # Nothing to emit (the visitor is not called for empty text), but
            # the memo still has to follow the current position.
            self.memo_cm = self.cm_matrix.copy()
            self.memo_tm = self.tm_matrix.copy()
        try:
            font_name = operands[0]
            self.font_resource = self.font_resources[font_name]
            self.font = self.fonts[font_name]
        except (KeyError, IndexError, TypeError):  # font not found / no usable operand
            self.font_resource = None
            font_descriptor = FontDescriptor()
            self.font = Font(
                "Unknown",
                space_width=250,
                encoding=dict.fromkeys(range(256), "�"),
                font_descriptor=font_descriptor,
                character_map={},
                character_widths=font_descriptor.character_widths
            )

        self._space_width = self.font.space_width / 2  # Actually the width of _half_ a space...
        try:
            self.font_size = float(operands[1])
        except (IndexError, TypeError, ValueError, OverflowError):
            pass  # keep previous size

    def _handle_td(self, operands: list[Any]) -> float:
        """
        Handle Td (Move text position) operation - Table 5.5 page 406.

        Returns the string width accumulated since the last positioning
        operation (see :meth:`_consume_str_widths`).
        """
        # A special case is a translating only tm:
        # tm = [1, 0, 0, 1, e, f]
        # i.e. tm[4] += tx, tm[5] += ty.
        tx, ty = float(operands[0]), float(operands[1])
        self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
        self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
        return self._consume_str_widths()

    def _handle_tm(self, operands: list[Any]) -> float:
        """
        Handle Tm (Set text matrix) operation - Table 5.5 page 406.

        Returns the string width accumulated since the last positioning
        operation (see :meth:`_consume_str_widths`).
        """
        self.tm_matrix = [float(operand) for operand in operands[:6]]
        return self._consume_str_widths()

    def _handle_t_star(self, operands: list[Any]) -> float:
        """
        Handle T* (Move to next line) operation - Table 5.5 page 406.

        Moves the text position by the current leading ``TL`` and returns the
        string width accumulated since the last positioning operation.
        """
        self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
        self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
        return self._consume_str_widths()

    def _handle_tj_operation(self, operands: list[Any]) -> float:
        """Handle Tj (Show text) operation - Table 5.5 page 406."""
        self.text, self.rtl_dir, self._actual_str_size = self._handle_tj(
            self.text,
            operands,
            self.cm_matrix,
            self.tm_matrix,
            self.font_resource,
            self.font,
            self.orientations,
            self.font_size,
            self.rtl_dir,
            self.visitor_text,
            self._actual_str_size,
        )
        return 0.0  # str_widths will be handled in post-processing