Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_text_extractor.py: 16%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

164 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import math 

31from typing import Any, Callable, Dict, List, Optional, Tuple, Union 

32 

33from .._cmap import build_font_width_map, compute_font_width, get_actual_str_key 

34from ..generic import DictionaryObject, TextStringObject 

35from . import OrientationNotFoundError, crlf_space_check, get_display_str, get_text_operands, mult 

36 

37 

38class TextExtraction: 

39 """ 

40 A class to handle PDF text extraction operations. 

41 

42 This class encapsulates all the state and operations needed for extracting 

43 text from PDF content streams, replacing the nested functions and nonlocal 

44 variables in the original implementation. 

45 """ 

46 

def __init__(self) -> None:
    """Create an extractor with default state and the operator dispatch table."""
    # Identity matrix; every attribute below receives its own list copy so
    # that in-place edits of one matrix never show up in another.
    unit = (1.0, 0.0, 0.0, 1.0, 0.0, 0.0)

    # Cache keyed by font resource name; each entry holds
    # (width map, space character, space width).
    self._font_width_maps: Dict[str, Tuple[Dict[Any, float], str, float]] = {}

    # Text extraction state variables
    self.cm_matrix: List[float] = list(unit)
    self.tm_matrix: List[float] = list(unit)
    # Saved graphics states. Each entry is
    # (cm_matrix, cmap, font_size, char_scale, space_scale, _space_width, TL).
    self.cm_stack: List[
        Tuple[
            List[float],
            Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
            float,
            float,
            float,
            float,
            float,
        ]
    ] = []

    # Store the last modified matrices; can be an intermediate position
    self.cm_prev: List[float] = list(unit)
    self.tm_prev: List[float] = list(unit)

    # Store the position at the beginning of building the text
    self.memo_cm: List[float] = list(unit)
    self.memo_tm: List[float] = list(unit)

    self.char_scale = 1.0
    self.space_scale = 1.0
    self._space_width: float = 500.0  # will be set correctly at first Tf
    # Will be set to the string length calculation result.
    self._actual_str_size: Dict[str, float] = dict(
        str_widths=0.0,
        space_width=0.0,
        str_height=0.0,
    )
    self.TL = 0.0
    self.font_size = 12.0  # placeholder used until a Tf operator sets the size

    # Text extraction variables
    self.text: str = ""
    self.output: str = ""
    self.rtl_dir: bool = False  # right-to-left
    # (encoding, CMAP, font resource name, font)
    self.cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]] = (
        "charmap",
        {},
        "NotInitialized",
        None,
    )
    self.orientations: Tuple[int, ...] = (0, 90, 180, 270)
    self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None
    self.cmaps: Dict[str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject]] = {}

    # Content-stream operator -> bound handler method.
    self.operation_handlers = {
        b"BT": self._handle_bt,
        b"ET": self._handle_et,
        b"q": self._handle_save_graphics_state,
        b"Q": self._handle_restore_graphics_state,
        b"cm": self._handle_cm,
        b"Tz": self._handle_tz,
        b"Tw": self._handle_tw,
        b"TL": self._handle_tl,
        b"Tf": self._handle_tf,
        b"Td": self._handle_td,
        b"Tm": self._handle_tm,
        b"T*": self._handle_t_star,
        b"Tj": self._handle_tj_operation,
    }

113 

def initialize_extraction(
    self,
    orientations: Tuple[int, ...] = (0, 90, 180, 270),
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
    cmaps: Optional[
        Dict[str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject]]
    ] = None,
) -> None:
    """Store the extraction parameters and clear the text buffers.

    Args:
        orientations: Stored as ``self.orientations``.
        visitor_text: Optional callback stored as ``self.visitor_text``.
        cmaps: Font character maps stored as ``self.cmaps``; a falsy value
            (``None`` or an empty mapping) is replaced by a new empty dict.
    """
    # Extraction parameters.
    self.orientations = orientations
    self.visitor_text = visitor_text
    self.cmaps = cmaps if cmaps else {}

    # Reset state: only the text buffers and the direction flag are cleared.
    self.text = self.output = ""
    self.rtl_dir = False

131 

def compute_str_widths(self, str_widths: float) -> float:
    """Scale a width down by a factor of 1000.

    Args:
        str_widths: The width value to scale.

    Returns:
        ``str_widths`` divided by 1000.
    """
    divisor = 1000
    return str_widths / divisor

134 

def process_operation(self, operator: bytes, operands: List[Any]) -> None:
    """Dispatch one operator to its registered handler.

    Operators missing from ``self.operation_handlers`` are ignored.

    Args:
        operator: The operator, e.g. ``b"Tj"``.
        operands: The operands passed on to the handler.
    """
    handler = self.operation_handlers.get(operator)
    if handler is None:
        return

    str_widths = handler(operands)

    # Post-process operations that affect text positioning. Handlers that
    # return None (or 0) contribute a width of 0.0.
    if operator in {b"Td", b"Tm", b"T*", b"Tj"}:
        self._post_process_text_operation(str_widths or 0.0)

143 

def _post_process_text_operation(self, str_widths: float) -> None:
    """Run the shared follow-up step for text positioning operations.

    The buffered text, the previous/current/remembered matrix pairs and the
    current font information are handed to ``crlf_space_check``; the text,
    output and previous matrices it returns are stored back on the instance.
    If the text buffer is empty afterwards, the remembered matrices are
    replaced by copies of the current ones.

    Args:
        str_widths: Width value forwarded unchanged to ``crlf_space_check``.
    """
    try:
        checked = crlf_space_check(
            self.text,
            (self.cm_prev, self.tm_prev),
            (self.cm_matrix, self.tm_matrix),
            (self.memo_cm, self.memo_tm),
            self.cmap,
            self.orientations,
            self.output,
            self.font_size,
            self.visitor_text,
            str_widths,
            self.compute_str_widths(self._actual_str_size["space_width"]),
            self._actual_str_size["str_height"],
        )
        self.text, self.output, self.cm_prev, self.tm_prev = checked
    except OrientationNotFoundError:
        # Ignored on purpose: none of the attributes above are reassigned.
        return

    if self.text == "":
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()

166 

def _get_actual_font_widths(
    self,
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    text_operands: str,
    font_size: float,
    space_width: float,
) -> Tuple[float, float, float]:
    """Measure a string with the width data of the font named in ``cmap``.

    Width data is built once per font resource name (``cmap[2]``) and kept
    in ``self._font_width_maps``; later calls for the same name reuse it,
    whatever ``space_width`` they pass.

    Args:
        cmap: ``(encoding, map, font resource name, font)``.
        text_operands: The characters to measure.
        font_size: Factor applied to every returned width.
        space_width: Space width used when the font gives none (or zero).

    Returns:
        ``(summed character widths * font_size, space_width * font_size,
        font_size)``.
    """
    font_name: str = cmap[2]
    cache = self._font_width_maps
    if font_name not in cache:
        font = cmap[3]
        if font is None:
            # No font object: a space has the given width, every other
            # character falls back to the "default" entry (twice that).
            space_char = " "
            actual_space_width: float = space_width
            font_width_map: Dict[Any, float] = {"default": actual_space_width * 2}
        else:
            space_char = get_actual_str_key(" ", cmap[0], cmap[1])
            font_width_map = build_font_width_map(font, space_width * 2)
            actual_space_width = compute_font_width(font_width_map, space_char)
        if actual_space_width == 0:
            actual_space_width = space_width
        cache[font_name] = (font_width_map, space_char, actual_space_width)
    font_width_map, space_char, actual_space_width = cache[font_name]

    font_widths: float = 0
    # A falsy operand contributes nothing.
    for char in text_operands or "":
        font_widths += (
            actual_space_width
            if char == space_char
            else compute_font_width(font_width_map, char)
        )
    return (font_widths * font_size, space_width * font_size, font_size)

202 

def _handle_tj(
    self,
    text: str,
    operands: List[Union[str, TextStringObject]],
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    space_width: float,
    actual_str_size: Dict[str, float],
) -> Tuple[str, bool, Dict[str, float]]:
    """Add the text of a show-text operand to ``text`` and track its width.

    ``get_text_operands`` resolves the operand. When it reports plain string
    operands the result is appended to ``text`` directly; otherwise
    ``get_display_str`` builds the new text and direction flag. In both
    cases the operand is measured and ``actual_str_size`` is updated in
    place.

    Returns:
        ``(text, rtl_dir, actual_str_size)`` where ``actual_str_size`` is
        the same dict object that was passed in.
    """
    text_operands, is_str_operands = get_text_operands(
        operands, cm_matrix, tm_matrix, cmap, orientations
    )
    if not is_str_operands:
        text, rtl_dir = get_display_str(
            text,
            cm_matrix,
            tm_matrix,  # text matrix
            cmap,
            text_operands,
            font_size,
            rtl_dir,
            visitor_text,
        )
    else:
        text += text_operands

    font_widths, space, height = self._get_actual_font_widths(
        cmap, text_operands, font_size, space_width
    )
    actual_str_size["space_width"] = space
    actual_str_size["str_height"] = height
    # Widths add up across calls; Td, Tm and T* reset the total.
    actual_str_size["str_widths"] += font_widths

    return text, rtl_dir, actual_str_size

239 

def _flush_text(self) -> None:
    """Move the buffered text to ``output`` and notify the visitor, if any.

    The visitor is called even when the buffer is empty. Afterwards the
    buffer is cleared and the remembered matrices become copies of the
    current ones.
    """
    pending = self.text
    self.output += pending
    notify = self.visitor_text
    if notify is not None:
        # The visitor receives the matrices remembered before this flush.
        notify(pending, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
    self.text = ""
    self.memo_cm, self.memo_tm = self.cm_matrix.copy(), self.tm_matrix.copy()

248 

249 # Operation handlers 

250 

def _handle_bt(self, operands: List[Any]) -> None:
    """Handle BT (Begin Text) operation - Table 5.4 page 405.

    The text matrix is reset to identity first, so the flush that follows
    remembers the reset matrix. ``operands`` is not used.
    """
    identity = (1.0, 0.0, 0.0, 1.0, 0.0, 0.0)
    self.tm_matrix = list(identity)
    self._flush_text()

255 

def _handle_et(self, operands: List[Any]) -> None:
    """Handle ET (End Text) operation - Table 5.4 page 405.

    Flushes whatever text is buffered; ``operands`` is not used.
    """
    flush = self._flush_text
    flush()

259 

def _handle_save_graphics_state(self, operands: List[Any]) -> None:
    """Handle q (Save graphics state) operation - Table 4.7 page 219.

    Pushes the current matrix, character map, font size, scales, space
    width and leading onto ``self.cm_stack``. The values are stored as they
    are, without copying. ``operands`` is not used.
    """
    snapshot = (
        self.cm_matrix,
        self.cmap,
        self.font_size,
        self.char_scale,
        self.space_scale,
        self._space_width,
        self.TL,
    )
    self.cm_stack.append(snapshot)

273 

def _handle_restore_graphics_state(self, operands: List[Any]) -> None:
    """Handle Q (Restore graphics state) operation - Table 4.7 page 219.

    Pops the most recently saved state from ``self.cm_stack`` and restores
    the matrix, character map, font size, scales, space width and leading
    from it. ``operands`` is not used.

    If the stack is empty (a ``Q`` without a matching ``q``), only the
    current matrix is reset to identity; every other attribute keeps its
    value.
    """
    try:
        saved = self.cm_stack.pop()
    except IndexError:
        # Only an empty stack is tolerated here; any other error would point
        # at a programming mistake and must not be hidden.
        self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        return
    (
        self.cm_matrix,
        self.cmap,
        self.font_size,
        self.char_scale,
        self.space_scale,
        self._space_width,
        self.TL,
    ) = saved

288 

def _handle_cm(self, operands: List[Any]) -> None:
    """Handle cm (Modify current matrix) operation - Table 4.7 page 219.

    The buffered text is moved to ``output`` (and reported to the visitor)
    before the matrix changes. The first six operands are then combined
    with the current matrix through ``mult``; if anything in that step
    fails, the matrix falls back to identity. Finally the remembered
    matrices become copies of the new current ones.
    """
    pending = self.text
    self.output += pending
    if self.visitor_text is not None:
        self.visitor_text(pending, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
    self.text = ""

    try:
        change = [float(operand) for operand in operands[:6]]
        updated = mult(change, self.cm_matrix)
    except Exception:
        # Best effort: e.g. operands that are not numbers reset the matrix.
        updated = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    self.cm_matrix = updated

    self.memo_cm = self.cm_matrix.copy()
    self.memo_tm = self.tm_matrix.copy()

301 

def _handle_tz(self, operands: List[Any]) -> None:
    """Handle Tz (Set horizontal text scaling) operation - Table 5.2 page 398.

    The first operand is a percentage and is stored as a factor in
    ``self.char_scale``; without operands the factor is 1.0.
    """
    if operands:
        self.char_scale = float(operands[0]) / 100
    else:
        self.char_scale = 1.0

305 

def _handle_tw(self, operands: List[Any]) -> None:
    """Handle Tw (Set word spacing) operation - Table 5.2 page 398.

    Stores ``1.0 + first operand`` in ``self.space_scale``; without
    operands the stored value is 1.0.
    """
    spacing = operands[0] if operands else 0.0
    self.space_scale = 1.0 + float(spacing)

309 

def _handle_tl(self, operands: List[Any]) -> None:
    """Handle TL (Set Text Leading) operation - Table 5.2 page 398.

    ``self.TL`` becomes the first operand (0.0 without operands) multiplied
    by the current font size and by a scale taken from elements 0 and 2 of
    the text matrix.
    """
    first, third = self.tm_matrix[0], self.tm_matrix[2]
    scale_x = math.sqrt(first ** 2 + third ** 2)
    leading = float(operands[0] if operands else 0.0)
    self.TL = leading * self.font_size * scale_x

314 

def _handle_tf(self, operands: List[Any]) -> None:
    """Handle Tf (Set font size) operation - Table 5.2 page 398.

    Buffered text is moved to ``output`` first (the visitor is only called
    when the buffer is not empty), then the character map and space width
    for the font named by ``operands[0]`` are selected and the font size is
    taken from ``operands[1]``.

    A font name missing from ``self.cmaps`` selects ``unknown_char_map``
    and a resource name prefixed with ``???``. A font size that cannot be
    converted leaves the previous size untouched.
    """
    pending = self.text
    if pending != "":
        self.output += pending
        if self.visitor_text is not None:
            self.visitor_text(pending, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
    self.text = ""
    self.memo_cm = self.cm_matrix.copy()
    self.memo_tm = self.tm_matrix.copy()

    try:
        # Import here to avoid circular imports
        from .._cmap import unknown_char_map  # noqa: PLC0415

        # Entry layout in self.cmaps:
        #   [0] font_type, [1] float(sp_width / 2), [2] encoding,
        #   [3] map_dict, [4] font_dict (describes the font)
        entry = self.cmaps[operands[0]]
        # Current cmap layout:
        #   encoding, map_dict,
        #   font resource name (internal name, not the real font name),
        #   font_dict
        self.cmap = (entry[2], entry[3], operands[0], entry[4])
        self._space_width = entry[1]
    except KeyError:  # font not found
        self.cmap = (
            unknown_char_map[2],
            unknown_char_map[3],
            f"???{operands[0]}",
            None,
        )
        self._space_width = unknown_char_map[1]

    try:
        self.font_size = float(operands[1])
    except Exception:
        pass  # keep previous size

357 

def _handle_td(self, operands: List[Any]) -> float:
    """Handle Td (Move text position) operation - Table 5.5 page 406.

    Translates the text matrix in place by ``(operands[0], operands[1])``,
    expressed through the matrix itself.

    Returns:
        The string width accumulated so far, passed through
        ``compute_str_widths``; the accumulator is reset to 0.0.
    """
    tx = float(operands[0])
    ty = float(operands[1])
    matrix = self.tm_matrix
    # A special case is a translating only tm:
    # tm = [1, 0, 0, 1, e, f]
    # i.e. tm[4] += tx, tm[5] += ty.
    matrix[4] += tx * matrix[0] + ty * matrix[2]
    matrix[5] += tx * matrix[1] + ty * matrix[3]

    sizes = self._actual_str_size
    str_widths = self.compute_str_widths(sizes["str_widths"])
    sizes["str_widths"] = 0.0
    return str_widths

369 

def _handle_tm(self, operands: List[Any]) -> float:
    """Handle Tm (Set text matrix) operation - Table 5.5 page 406.

    Replaces the text matrix with a new list built from the first six
    operands converted to float. Fewer than six operands produce a
    correspondingly shorter list.

    Returns:
        The string width accumulated so far, passed through
        ``compute_str_widths``; the accumulator is reset to 0.0.
    """
    self.tm_matrix = list(map(float, operands[:6]))

    sizes = self._actual_str_size
    str_widths = self.compute_str_widths(sizes["str_widths"])
    sizes["str_widths"] = 0.0
    return str_widths

376 

def _handle_t_star(self, operands: List[Any]) -> float:
    """Handle T* (Move to next line) operation - Table 5.5 page 406.

    Shifts the text matrix in place by the current leading ``self.TL``
    along the matrix's second row. ``operands`` is not used.

    Returns:
        The string width accumulated so far, passed through
        ``compute_str_widths``; the accumulator is reset to 0.0.
    """
    matrix = self.tm_matrix
    leading = self.TL
    matrix[4] -= leading * matrix[2]
    matrix[5] -= leading * matrix[3]

    sizes = self._actual_str_size
    str_widths = self.compute_str_widths(sizes["str_widths"])
    sizes["str_widths"] = 0.0
    return str_widths

384 

def _handle_tj_operation(self, operands: List[Any]) -> float:
    """Handle Tj (Show text) operation - Table 5.5 page 406.

    Forwards the current extraction state to ``_handle_tj`` and stores the
    text, direction flag and size record it returns.

    Returns:
        Always 0.0; str_widths will be handled in post-processing.
    """
    result = self._handle_tj(
        self.text,
        operands,
        self.cm_matrix,
        self.tm_matrix,
        self.cmap,
        self.orientations,
        self.font_size,
        self.rtl_dir,
        self.visitor_text,
        self._space_width,
        self._actual_str_size,
    )
    self.text, self.rtl_dir, self._actual_str_size = result
    return 0.0