# Copyright (c) 2006, Mathieu Fenniak
# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
29
30import math
31from typing import Any, Callable, Dict, List, Optional, Tuple, Union
32
33from .._cmap import build_font_width_map, compute_font_width, get_actual_str_key
34from ..generic import DictionaryObject, TextStringObject
35from . import OrientationNotFoundError, crlf_space_check, get_display_str, get_text_operands, mult
36
37
class TextExtraction:
    """
    A class to handle PDF text extraction operations.

    This class encapsulates all the state and operations needed for extracting
    text from PDF content streams, replacing the nested functions and nonlocal
    variables in the original implementation.

    Typical use: call :meth:`initialize_extraction` once, then feed every
    content-stream operator to :meth:`process_operation`. Extracted text
    accumulates in ``self.text`` (pending fragment) and ``self.output``
    (flushed text).
    """

    # Operators after which process_operation runs the CR/LF and space check.
    _POSITIONING_OPERATORS = frozenset({b"Td", b"Tm", b"T*", b"Tj"})

    def __init__(self) -> None:
        # Per-font cache: font resource name ->
        # (width map, space character key, width used for a space).
        self._font_width_maps: Dict[str, Tuple[Dict[Any, float], str, float]] = {}

        # Text extraction state variables
        self.cm_matrix: List[float] = self._identity_matrix()
        self.tm_matrix: List[float] = self._identity_matrix()
        # Stack of states saved by ``q`` and restored by ``Q``:
        # (cm_matrix, cmap, font_size, char_scale, space_scale, _space_width, TL)
        self.cm_stack: List[
            Tuple[
                List[float],
                Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
                float,
                float,
                float,
                float,
                float,
            ]
        ] = []

        # Store the last modified matrices; can be an intermediate position
        self.cm_prev: List[float] = self._identity_matrix()
        self.tm_prev: List[float] = self._identity_matrix()

        # Store the position at the beginning of building the text
        self.memo_cm: List[float] = self._identity_matrix()
        self.memo_tm: List[float] = self._identity_matrix()

        self.char_scale = 1.0
        self.space_scale = 1.0
        self._space_width: float = 500.0  # will be set correctly at first Tf
        self._actual_str_size: Dict[str, float] = {
            "str_widths": 0.0,
            "space_width": 0.0,
            "str_height": 0.0,
        }  # will be set to string length calculation result
        self.TL = 0.0
        self.font_size = 12.0  # init just in case of

        # Text extraction variables
        self.text: str = ""
        self.output: str = ""
        self.rtl_dir: bool = False  # right-to-left
        self.cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]] = (
            "charmap",
            {},
            "NotInitialized",
            None,
        )  # (encoding, CMAP, font resource name, font)
        self.orientations: Tuple[int, ...] = (0, 90, 180, 270)
        self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None
        self.cmaps: Dict[str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject]] = {}

        # Dispatch table: content-stream operator -> bound handler method.
        self.operation_handlers: Dict[bytes, Callable[[List[Any]], Optional[float]]] = {
            b"BT": self._handle_bt,
            b"ET": self._handle_et,
            b"q": self._handle_save_graphics_state,
            b"Q": self._handle_restore_graphics_state,
            b"cm": self._handle_cm,
            b"Tz": self._handle_tz,
            b"Tw": self._handle_tw,
            b"TL": self._handle_tl,
            b"Tf": self._handle_tf,
            b"Td": self._handle_td,
            b"Tm": self._handle_tm,
            b"T*": self._handle_t_star,
            b"Tj": self._handle_tj_operation,
        }

    @staticmethod
    def _identity_matrix() -> List[float]:
        """Return a fresh 6-element identity matrix ``[1, 0, 0, 1, 0, 0]``."""
        return [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

    def initialize_extraction(
        self,
        orientations: Tuple[int, ...] = (0, 90, 180, 270),
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
        cmaps: Optional[
            Dict[str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject]]
        ] = None,
    ) -> None:
        """
        Initialize the extractor with extraction parameters.

        Args:
            orientations: Orientations forwarded to the text helpers.
            visitor_text: Optional callback invoked with
                ``(text, memo_cm, memo_tm, font_dict, font_size)`` whenever
                a text fragment is flushed.
            cmaps: Font resource name -> character-map tuple; ``None`` is
                treated as an empty mapping.

        Only ``text``, ``output`` and ``rtl_dir`` are reset here; matrices,
        the graphics-state stack and the font-width cache are left as they are.
        """
        self.orientations = orientations
        self.visitor_text = visitor_text
        self.cmaps = cmaps or {}

        # Reset state
        self.text = ""
        self.output = ""
        self.rtl_dir = False

    def compute_str_widths(self, str_widths: float) -> float:
        """Return ``str_widths`` divided by 1000."""
        return str_widths / 1000

    def process_operation(self, operator: bytes, operands: List[Any]) -> None:
        """
        Dispatch one content-stream operator to its handler.

        Operators without a registered handler are ignored. After ``Td``,
        ``Tm``, ``T*`` and ``Tj`` the positioning post-processing runs with the
        width returned by the handler (``0.0`` when the handler returned a
        falsy value).
        """
        handler = self.operation_handlers.get(operator)
        if handler is None:
            return
        str_widths = handler(operands)

        # Post-process operations that affect text positioning
        if operator in self._POSITIONING_OPERATORS:
            self._post_process_text_operation(str_widths or 0.0)

    def _post_process_text_operation(self, str_widths: float) -> None:
        """
        Handle common post-processing for text positioning operations.

        Delegates to ``crlf_space_check`` and stores the text, output and
        previous matrices it returns. If that call raises
        ``OrientationNotFoundError`` the state is left untouched.
        """
        try:
            self.text, self.output, self.cm_prev, self.tm_prev = crlf_space_check(
                self.text,
                (self.cm_prev, self.tm_prev),
                (self.cm_matrix, self.tm_matrix),
                (self.memo_cm, self.memo_tm),
                self.cmap,
                self.orientations,
                self.output,
                self.font_size,
                self.visitor_text,
                str_widths,
                self.compute_str_widths(self._actual_str_size["space_width"]),
                self._actual_str_size["str_height"],
            )
        except OrientationNotFoundError:
            # Deliberate best effort: skip the check and keep the current state.
            return
        if self.text == "":
            # No pending text: the next fragment starts at the current position.
            self.memo_cm = self.cm_matrix.copy()
            self.memo_tm = self.tm_matrix.copy()

    def _get_actual_font_widths(
        self,
        cmap: Tuple[
            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
        ],
        text_operands: str,
        font_size: float,
        space_width: float,
    ) -> Tuple[float, float, float]:
        """
        Compute the width of ``text_operands`` for the font described by ``cmap``.

        The width map, space character and space width are built once per font
        resource name (``cmap[2]``) and cached in ``self._font_width_maps``.

        Returns:
            ``(summed character widths * font_size, space_width * font_size,
            font_size)``. The second element uses the ``space_width``
            argument, not the cached per-font space width.
        """
        font_name: str = cmap[2]
        if font_name not in self._font_width_maps:
            if cmap[3] is None:
                # No font dictionary: only a default width is known.
                font_width_map: Dict[Any, float] = {}
                space_char = " "
                actual_space_width: float = space_width
                font_width_map["default"] = actual_space_width * 2
            else:
                space_char = get_actual_str_key(" ", cmap[0], cmap[1])
                font_width_map = build_font_width_map(cmap[3], space_width * 2)
                actual_space_width = compute_font_width(font_width_map, space_char)
            if actual_space_width == 0:
                actual_space_width = space_width
            self._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width)
        font_width_map, space_char, actual_space_width = self._font_width_maps[font_name]

        # Explicit accumulation (not sum()) keeps plain left-to-right float
        # addition on every Python version.
        font_widths: float = 0
        for char in text_operands:
            if char == space_char:
                font_widths += actual_space_width
            else:
                font_widths += compute_font_width(font_width_map, char)
        return (font_widths * font_size, space_width * font_size, font_size)

    def _handle_tj(
        self,
        text: str,
        operands: List[Union[str, TextStringObject]],
        cm_matrix: List[float],
        tm_matrix: List[float],
        cmap: Tuple[
            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
        ],
        orientations: Tuple[int, ...],
        font_size: float,
        rtl_dir: bool,
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
        space_width: float,
        actual_str_size: Dict[str, float],
    ) -> Tuple[str, bool, Dict[str, float]]:
        """
        Append the text shown by a ``Tj`` operator to ``text``.

        The operands are decoded with ``get_text_operands``. When it reports
        string operands they are appended directly; otherwise
        ``get_display_str`` builds the new text and right-to-left flag.

        ``actual_str_size`` is updated in place: ``space_width`` and
        ``str_height`` are overwritten and ``str_widths`` is incremented.

        Returns:
            ``(text, rtl_dir, actual_str_size)``; the dict is the same object
            that was passed in.
        """
        text_operands, is_str_operands = get_text_operands(
            operands, cm_matrix, tm_matrix, cmap, orientations)
        if is_str_operands:
            text += text_operands
        else:
            text, rtl_dir = get_display_str(
                text,
                cm_matrix,
                tm_matrix,  # text matrix
                cmap,
                text_operands,
                font_size,
                rtl_dir,
                visitor_text,
            )
        font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = (
            self._get_actual_font_widths(cmap, text_operands, font_size, space_width))
        actual_str_size["str_widths"] += font_widths

        return text, rtl_dir, actual_str_size

    def _flush_text(self) -> None:
        """Flush accumulated text to output and call visitor if present."""
        self.output += self.text
        if self.visitor_text is not None:
            self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
        self.text = ""
        # The next fragment starts at the current position.
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()

    def _consume_str_widths(self) -> float:
        """Return the accumulated string width (scaled by 1/1000) and reset it."""
        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
        self._actual_str_size["str_widths"] = 0.0
        return str_widths

    # Operation handlers

    def _handle_bt(self, operands: List[Any]) -> None:
        """Handle BT (Begin Text) operation - Table 5.4 page 405."""
        # Reset the text matrix first so the flush memoizes the new matrix.
        self.tm_matrix = self._identity_matrix()
        self._flush_text()

    def _handle_et(self, operands: List[Any]) -> None:
        """Handle ET (End Text) operation - Table 5.4 page 405."""
        self._flush_text()

    def _handle_save_graphics_state(self, operands: List[Any]) -> None:
        """
        Handle q (Save graphics state) operation - Table 4.7 page 219.

        Pushes a snapshot of the matrix, cmap, font size, scales, space width
        and leading. The matrix is copied so later in-place changes to
        ``self.cm_matrix`` cannot alter the saved state.
        """
        self.cm_stack.append(
            (
                self.cm_matrix.copy(),
                self.cmap,
                self.font_size,
                self.char_scale,
                self.space_scale,
                self._space_width,
                self.TL,
            )
        )

    def _handle_restore_graphics_state(self, operands: List[Any]) -> None:
        """
        Handle Q (Restore graphics state) operation - Table 4.7 page 219.

        Restores the most recently saved state. An unbalanced ``Q`` (empty
        stack) only resets the current matrix to identity.
        """
        try:
            (
                self.cm_matrix,
                self.cmap,
                self.font_size,
                self.char_scale,
                self.space_scale,
                self._space_width,
                self.TL,
            ) = self.cm_stack.pop()
        except IndexError:  # pop from an empty stack
            self.cm_matrix = self._identity_matrix()

    def _handle_cm(self, operands: List[Any]) -> None:
        """
        Handle cm (Modify current matrix) operation - Table 4.7 page 219.

        Pending text is flushed before the matrix changes. Operands that
        cannot be converted or multiplied reset the matrix to identity.
        """
        self._flush_text()
        try:
            self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix)
        except (IndexError, TypeError, ValueError):
            self.cm_matrix = self._identity_matrix()
        # Re-memoize: the flush above recorded the matrix from before the update.
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()

    def _handle_tz(self, operands: List[Any]) -> None:
        """Handle Tz (Set horizontal text scaling) operation - Table 5.2 page 398."""
        self.char_scale = float(operands[0]) / 100 if operands else 1.0

    def _handle_tw(self, operands: List[Any]) -> None:
        """Handle Tw (Set word spacing) operation - Table 5.2 page 398."""
        self.space_scale = 1.0 + float(operands[0] if operands else 0.0)

    def _handle_tl(self, operands: List[Any]) -> None:
        """Handle TL (Set Text Leading) operation - Table 5.2 page 398."""
        # Scale factor taken from the a and c components of the text matrix.
        scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2)
        self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x

    def _handle_tf(self, operands: List[Any]) -> None:
        """
        Handle Tf (Set font size) operation - Table 5.2 page 398.

        Flushes pending text, then selects the character map for the font
        resource named by ``operands[0]``. A font missing from ``self.cmaps``
        (or a missing operand) selects the unknown-font map with a
        ``"???"``-prefixed name. The font size comes from ``operands[1]``;
        when it is missing or not numeric the previous size is kept.
        """
        if self.text != "":
            self._flush_text()

        # Kept function-local as in the original, which noted it avoids a
        # circular import.
        from .._cmap import unknown_char_map  # noqa: PLC0415

        font_name = operands[0] if operands else ""
        try:
            # char_map_tuple: font_type,
            #                 float(sp_width / 2),
            #                 encoding,
            #                 map_dict,
            #                 font_dict (describes the font)
            char_map_tuple = self.cmaps[font_name]
        except KeyError:  # font not found
            self.cmap = (
                unknown_char_map[2],
                unknown_char_map[3],
                f"???{font_name}",
                None,
            )
            self._space_width = unknown_char_map[1]
        else:
            # current cmap: encoding,
            #               map_dict,
            #               font resource name (internal name, not the real font name),
            #               font_dict
            self.cmap = (
                char_map_tuple[2],
                char_map_tuple[3],
                font_name,
                char_map_tuple[4],
            )
            self._space_width = char_map_tuple[1]

        try:
            self.font_size = float(operands[1])
        except (IndexError, TypeError, ValueError):
            pass  # keep previous size

    def _handle_td(self, operands: List[Any]) -> float:
        """
        Handle Td (Move text position) operation - Table 5.5 page 406.

        With fewer than two operands the text matrix is left unchanged.

        Returns:
            The string width accumulated since the last positioning operator.
        """
        # A special case is a translating only tm:
        # tm = [1, 0, 0, 1, e, f]
        # i.e. tm[4] += tx, tm[5] += ty.
        if len(operands) >= 2:
            tx, ty = float(operands[0]), float(operands[1])
            self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
            self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
        return self._consume_str_widths()

    def _handle_tm(self, operands: List[Any]) -> float:
        """
        Handle Tm (Set text matrix) operation - Table 5.5 page 406.

        With fewer than six operands the text matrix is reset to identity, so
        later handlers can always index all six components.

        Returns:
            The string width accumulated since the last positioning operator.
        """
        if len(operands) < 6:
            self.tm_matrix = self._identity_matrix()
        else:
            self.tm_matrix = [float(operand) for operand in operands[:6]]
        return self._consume_str_widths()

    def _handle_t_star(self, operands: List[Any]) -> float:
        """
        Handle T* (Move to next line) operation - Table 5.5 page 406.

        Returns:
            The string width accumulated since the last positioning operator.
        """
        self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
        self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
        return self._consume_str_widths()

    def _handle_tj_operation(self, operands: List[Any]) -> float:
        """Handle Tj (Show text) operation - Table 5.5 page 406."""
        self.text, self.rtl_dir, self._actual_str_size = self._handle_tj(
            self.text,
            operands,
            self.cm_matrix,
            self.tm_matrix,
            self.cmap,
            self.orientations,
            self.font_size,
            self.rtl_dir,
            self.visitor_text,
            self._space_width,
            self._actual_str_size,
        )
        return 0.0  # str_widths will be handled in post-processing