1# Copyright (c) 2006, Mathieu Fenniak
2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3#
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are
8# met:
9#
10# * Redistributions of source code must retain the above copyright notice,
11# this list of conditions and the following disclaimer.
12# * Redistributions in binary form must reproduce the above copyright notice,
13# this list of conditions and the following disclaimer in the documentation
14# and/or other materials provided with the distribution.
15# * The name of the author may not be used to endorse or promote products
16# derived from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28# POSSIBILITY OF SUCH DAMAGE.
29
30import math
31from typing import Any, Callable, Dict, List, Optional, Tuple, Union
32
33from .._cmap import build_font_width_map, compute_font_width, get_actual_str_key
34from ..generic import DictionaryObject, TextStringObject
35from . import OrientationNotFoundError, crlf_space_check, get_display_str, get_text_operands, mult
36
37
class TextExtraction:
    """
    A class to handle PDF text extraction operations.

    This class encapsulates all the state and operations needed for extracting
    text from PDF content streams, replacing the nested functions and nonlocal
    variables in the original implementation.

    Usage as far as this class is concerned: call ``initialize_extraction``,
    feed each content-stream operation to ``process_operation``, then read
    ``output``. Pending text in ``text`` is appended to ``output`` when an
    operator flushes it (``BT``, ``ET``, ``cm``, ``Tf``); ``crlf_space_check``
    also returns updated ``text``/``output`` values.
    """

    def __init__(self) -> None:
        # Per-font cache: font resource name -> (width map, space char, space width)
        self._font_width_maps: Dict[str, Tuple[Dict[Any, float], str, float]] = {}

        # Text extraction state variables
        self.cm_matrix: List[float] = self._identity_matrix()
        self.tm_matrix: List[float] = self._identity_matrix()
        # Graphics state stack filled by ``q`` and consumed by ``Q``:
        # (cm_matrix, cmap, font_size, char_scale, space_scale, _space_width, TL)
        self.cm_stack: List[
            Tuple[
                List[float],
                Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
                float,
                float,
                float,
                float,
                float,
            ]
        ] = []

        # Store the last modified matrices; can be an intermediate position
        self.cm_prev: List[float] = self._identity_matrix()
        self.tm_prev: List[float] = self._identity_matrix()

        # Store the position at the beginning of building the text
        self.memo_cm: List[float] = self._identity_matrix()
        self.memo_tm: List[float] = self._identity_matrix()

        self.char_scale = 1.0
        self.space_scale = 1.0
        self._space_width: float = 500.0  # will be set correctly at first Tf
        self._actual_str_size: Dict[str, float] = {
            "str_widths": 0.0,
            "space_width": 0.0,
            "str_height": 0.0,
        }  # will be set to string length calculation result
        self.TL = 0.0
        self.font_size = 12.0  # init just in case of

        # Text extraction variables
        self.text: str = ""
        self.output: str = ""
        self.rtl_dir: bool = False  # right-to-left
        self.cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]] = (
            "charmap",
            {},
            "NotInitialized",
            None,
        )  # (encoding, CMAP, font resource name, font)
        self.orientations: Tuple[int, ...] = (0, 90, 180, 270)
        self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None
        self.cmaps: Dict[str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject]] = {}

    @staticmethod
    def _identity_matrix() -> List[float]:
        """Return a fresh identity matrix; fresh because ``tm_matrix`` is mutated in place."""
        return [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

    def initialize_extraction(
        self,
        orientations: Tuple[int, ...] = (0, 90, 180, 270),
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
        cmaps: Optional[
            Dict[str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject]]
        ] = None,
    ) -> None:
        """
        Initialize the extractor with extraction parameters.

        Args:
            orientations: Orientations handed on to the text helpers
                (``get_text_operands`` and ``crlf_space_check``).
            visitor_text: Optional callback invoked on every flush with
                ``(text, memo_cm, memo_tm, font_dict, font_size)``.
            cmaps: Mapping from font resource name to
                ``(font_type, sp_width / 2, encoding, map_dict, font_dict)``;
                ``None`` is treated as an empty mapping.

        Only the text buffers, the right-to-left flag and the font width cache
        are reset; matrices, the graphics state stack and the current font are
        left untouched.
        """
        self.orientations = orientations
        self.visitor_text = visitor_text
        self.cmaps = cmaps or {}
        # The width cache is keyed by font resource name, which is only
        # meaningful for one set of cmaps: drop it so that a reused extractor
        # does not apply widths of a previous font with the same name.
        self._font_width_maps = {}

        # Reset state
        self.text = ""
        self.output = ""
        self.rtl_dir = False

    def compute_str_widths(self, str_widths: float) -> float:
        """Return ``str_widths`` divided by 1000."""
        return str_widths / 1000

    def _flush_text(self) -> None:
        """Append the pending text to ``output``, notify the visitor and clear the buffer."""
        self.output += self.text
        if self.visitor_text is not None:
            # The visitor is called even when the pending text is empty.
            self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
        self.text = ""

    def _mark_text_start(self) -> None:
        """Remember the current matrices as the position where the next text starts."""
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()

    def _take_pending_str_width(self) -> float:
        """Return the accumulated string width (scaled) and reset the accumulator."""
        width = self.compute_str_widths(self._actual_str_size["str_widths"])
        self._actual_str_size["str_widths"] = 0.0
        return width

    def _select_font(self, font_name: Any) -> None:
        """Make ``font_name`` the current font, or a placeholder if it is not in ``cmaps``."""
        # Import here to avoid circular imports
        from .._cmap import unknown_char_map  # noqa: PLC0415

        try:
            # char_map_tuple: font_type,
            #                 float(sp_width / 2),
            #                 encoding,
            #                 map_dict,
            #                 font_dict (describes the font)
            char_map_tuple = self.cmaps[font_name]
        except KeyError:  # font not found
            self.cmap = (
                unknown_char_map[2],
                unknown_char_map[3],
                f"???{font_name}",
                None,
            )
            self._space_width = unknown_char_map[1]
            return
        # current cmap: encoding,
        #               map_dict,
        #               font resource name (internal name, not the real font name),
        #               font_dict
        self.cmap = (
            char_map_tuple[2],
            char_map_tuple[3],
            font_name,
            char_map_tuple[4],
        )
        self._space_width = char_map_tuple[1]

    def process_operation(self, operator: bytes, operands: List[Any]) -> None:
        """
        Update the extraction state for one content-stream operation.

        Handled operators are ``BT``, ``ET``, ``q``, ``Q``, ``cm``, ``Tz``,
        ``Tw``, ``TL``, ``Tf``, ``Td``, ``Tm``, ``T*`` and ``Tj``; any other
        operator is ignored. After ``Td``, ``Tm``, ``T*`` and ``Tj`` the state
        is passed through ``crlf_space_check``.

        Args:
            operator: The operator as bytes, e.g. ``b"Tj"``.
            operands: The operands preceding the operator.
        """
        str_widths: float = 0.0

        # Table 5.4 page 405
        if operator == b"BT":  # Begin Text
            self.tm_matrix = self._identity_matrix()
            self._flush_text()
            self._mark_text_start()
            return
        if operator == b"ET":  # End Text
            self._flush_text()
            self._mark_text_start()

        # Table 4.7 "Graphics state operators", page 219
        # cm_matrix calculation is reserved for later
        elif operator == b"q":  # Save graphics state
            self.cm_stack.append(
                (
                    self.cm_matrix,
                    self.cmap,
                    self.font_size,
                    self.char_scale,
                    self.space_scale,
                    self._space_width,
                    self.TL,
                )
            )
        elif operator == b"Q":  # Restore graphics state
            try:
                (
                    self.cm_matrix,
                    self.cmap,
                    self.font_size,
                    self.char_scale,
                    self.space_scale,
                    self._space_width,
                    self.TL,
                ) = self.cm_stack.pop()
            except IndexError:
                # ``Q`` without a matching ``q``: fall back to the identity matrix.
                self.cm_matrix = self._identity_matrix()
        elif operator == b"cm":  # Modify current matrix
            self._flush_text()
            try:
                self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix)
            except Exception:
                # Best effort on malformed operands: reset instead of failing.
                self.cm_matrix = self._identity_matrix()
            self._mark_text_start()

        # Table 5.2 page 398
        elif operator == b"Tz":  # Set horizontal text scaling
            self.char_scale = float(operands[0]) / 100 if operands else 1.0
        elif operator == b"Tw":  # Set word spacing
            self.space_scale = 1.0 + float(operands[0] if operands else 0.0)
        elif operator == b"TL":  # Set Text Leading
            scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2)
            self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x
        elif operator == b"Tf":  # Set font size
            if self.text != "":
                self._flush_text()
            self._mark_text_start()
            if operands:
                self._select_font(operands[0])
            # Without operands the current font is kept, in the same way the
            # previous size is kept below when the size operand is unusable.
            try:
                self.font_size = float(operands[1])
            except Exception:
                pass  # keep previous size
        # Table 5.5 page 406
        elif operator == b"Td":  # Move text position
            # A special case is a translating only tm:
            # tm = [1, 0, 0, 1, e, f]
            # i.e. tm[4] += tx, tm[5] += ty.
            tx, ty = float(operands[0]), float(operands[1])
            self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
            self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
            str_widths = self._take_pending_str_width()
        elif operator == b"Tm":  # Set text matrix
            self.tm_matrix = [float(operand) for operand in operands[:6]]
            str_widths = self._take_pending_str_width()
        elif operator == b"T*":  # Move to next line
            self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
            self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
            str_widths = self._take_pending_str_width()
        elif operator == b"Tj":  # Show text
            self.text, self.rtl_dir, self._actual_str_size = self._handle_tj(
                self.text,
                operands,
                self.cm_matrix,
                self.tm_matrix,
                self.cmap,
                self.orientations,
                self.font_size,
                self.rtl_dir,
                self.visitor_text,
                self._space_width,
                self._actual_str_size,
            )
        else:
            return

        if operator in {b"Td", b"Tm", b"T*", b"Tj"}:
            try:
                self.text, self.output, self.cm_prev, self.tm_prev = crlf_space_check(
                    self.text,
                    (self.cm_prev, self.tm_prev),
                    (self.cm_matrix, self.tm_matrix),
                    (self.memo_cm, self.memo_tm),
                    self.cmap,
                    self.orientations,
                    self.output,
                    self.font_size,
                    self.visitor_text,
                    str_widths,
                    self.compute_str_widths(self._actual_str_size["space_width"]),
                    self._actual_str_size["str_height"],
                )
                if self.text == "":
                    # Nothing pending any more: the next text starts here.
                    self._mark_text_start()
            except OrientationNotFoundError:
                return

    def _get_actual_font_widths(
        self,
        cmap: Tuple[
            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
        ],
        text_operands: str,
        font_size: float,
        space_width: float,
    ) -> Tuple[float, float, float]:
        """
        Compute the size of ``text_operands`` for the font described by ``cmap``.

        The width map, space character and space width of a font are computed
        on first use and cached under the font resource name ``cmap[2]``.

        Args:
            cmap: ``(encoding, map_dict, font resource name, font_dict)``.
            text_operands: The characters to measure.
            font_size: Factor applied to every returned value.
            space_width: Space width used when the font has no font dictionary
                or yields a zero-width space.

        Returns:
            ``(summed character widths * font_size, space_width * font_size,
            font_size)``.
        """
        font_widths: float = 0
        font_name: str = cmap[2]
        cached = self._font_width_maps.get(font_name)
        if cached is None:
            if cmap[3] is None:
                # No font dictionary: every character gets the default width.
                space_char = " "
                actual_space_width: float = space_width
                font_width_map: Dict[Any, float] = {"default": actual_space_width * 2}
            else:
                space_char = get_actual_str_key(" ", cmap[0], cmap[1])
                font_width_map = build_font_width_map(cmap[3], space_width * 2)
                actual_space_width = compute_font_width(font_width_map, space_char)
            if actual_space_width == 0:
                actual_space_width = space_width
            cached = (font_width_map, space_char, actual_space_width)
            self._font_width_maps[font_name] = cached
        font_width_map, space_char, actual_space_width = cached

        if text_operands:
            for char in text_operands:
                if char == space_char:
                    font_widths += actual_space_width
                else:
                    font_widths += compute_font_width(font_width_map, char)
        return (font_widths * font_size, space_width * font_size, font_size)

    def _handle_tj(
        self,
        text: str,
        operands: List[Union[str, TextStringObject]],
        cm_matrix: List[float],
        tm_matrix: List[float],
        cmap: Tuple[
            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
        ],
        orientations: Tuple[int, ...],
        font_size: float,
        rtl_dir: bool,
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
        space_width: float,
        actual_str_size: Dict[str, float],
    ) -> Tuple[str, bool, Dict[str, float]]:
        """
        Handle a ``Tj`` (show text) operation.

        The operands are converted with ``get_text_operands``. The result is
        either appended to ``text`` directly or merged through
        ``get_display_str``, which may also change ``rtl_dir``. The size of the
        shown string is then recorded in ``actual_str_size``.

        Returns:
            ``(text, rtl_dir, actual_str_size)``. ``actual_str_size`` is the
            dictionary that was passed in, updated in place: ``space_width``
            and ``str_height`` are overwritten, ``str_widths`` is increased.
        """
        text_operands, is_str_operands = get_text_operands(
            operands, cm_matrix, tm_matrix, cmap, orientations)
        if is_str_operands:
            text += text_operands
        else:
            text, rtl_dir = get_display_str(
                text,
                cm_matrix,
                tm_matrix,  # text matrix
                cmap,
                text_operands,
                font_size,
                rtl_dir,
                visitor_text,
            )
        font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = (
            self._get_actual_font_widths(cmap, text_operands, font_size, space_width))
        actual_str_size["str_widths"] += font_widths

        return text, rtl_dir, actual_str_size