Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_text_extractor.py: 18%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

143 statements  

# Copyright (c) 2006, Mathieu Fenniak
# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

29 

30import math 

31from typing import Any, Callable, Optional, Union 

32 

33from .._font import Font, FontDescriptor 

34from ..generic import DictionaryObject, TextStringObject 

35from . import OrientationNotFoundError, crlf_space_check, get_display_str, get_text_operands, mult 

36 

37 

class TextExtraction:
    """
    A class to handle PDF text extraction operations.

    This class encapsulates all the state and operations needed for extracting
    text from PDF content streams, replacing the nested functions and nonlocal
    variables in the original implementation.

    Content-stream operations are fed one at a time to ``process_operation``,
    which dispatches on the operator through ``operation_handlers``. Text is
    collected in ``text`` and appended to ``output`` whenever it is flushed.
    """

    # Operators after which the line-break/space check is run.
    _TEXT_POSITIONING_OPERATORS = frozenset({b"Td", b"Tm", b"T*", b"Tj"})

    @staticmethod
    def _identity_matrix() -> list[float]:
        """Return a new identity matrix in ``[a, b, c, d, e, f]`` form."""
        return [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

    def __init__(self) -> None:
        """Set every piece of extraction state to its default value."""
        self._font_width_maps: dict[str, tuple[dict[Any, float], str, float]] = {}

        # Text extraction state variables
        self.cm_matrix: list[float] = self._identity_matrix()
        self.tm_matrix: list[float] = self._identity_matrix()
        # One entry per unmatched ``q``; see _handle_save_graphics_state
        # for the field order.
        self.cm_stack: list[
            tuple[
                list[float],
                Optional[DictionaryObject],
                Font,
                float,
                float,
                float,
                float,
            ]
        ] = []

        # Store the last modified matrices; can be an intermediate position
        self.cm_prev: list[float] = self._identity_matrix()
        self.tm_prev: list[float] = self._identity_matrix()

        # Store the position at the beginning of building the text
        self.memo_cm: list[float] = self._identity_matrix()
        self.memo_tm: list[float] = self._identity_matrix()

        self.char_scale = 1.0
        self.space_scale = 1.0
        self._space_width: float = 500.0  # will be set correctly at first Tf
        self._actual_str_size: dict[str, float] = {
            "str_widths": 0.0,
            "str_height": 0.0,
        }  # will be set to string length calculation result
        self.TL = 0.0
        self.font_size = 12.0  # init just in case of

        # Text extraction variables
        self.text: str = ""
        self.output: str = ""
        self.rtl_dir: bool = False  # right-to-left
        self.font_resource: Optional[DictionaryObject] = None
        # Placeholder font used until the first Tf operation selects one.
        self.font = Font(
            name="NotInitialized",
            sub_type="Unknown",
            encoding="charmap",
            font_descriptor=FontDescriptor(),
        )
        self.orientations: tuple[int, ...] = (0, 90, 180, 270)
        self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None
        self.font_resources: dict[str, DictionaryObject] = {}
        self.fonts: dict[str, Font] = {}

        # Operator -> bound handler. Operators missing from this table are
        # ignored by process_operation.
        self.operation_handlers = {
            b"BT": self._handle_bt,
            b"ET": self._handle_et,
            b"q": self._handle_save_graphics_state,
            b"Q": self._handle_restore_graphics_state,
            b"cm": self._handle_cm,
            b"Tz": self._handle_tz,
            b"Tw": self._handle_tw,
            b"TL": self._handle_tl,
            b"Tf": self._handle_tf,
            b"Td": self._handle_td,
            b"Tm": self._handle_tm,
            b"T*": self._handle_t_star,
            b"Tj": self._handle_tj_operation,
        }

    def initialize_extraction(
        self,
        orientations: tuple[int, ...] = (0, 90, 180, 270),
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
        font_resources: Optional[dict[str, DictionaryObject]] = None,
        fonts: Optional[dict[str, Font]] = None,
    ) -> None:
        """
        Initialize the extractor with extraction parameters.

        Args:
            orientations: Stored as-is and forwarded to the helper functions
                that take an ``orientations`` argument.
            visitor_text: Optional callback invoked with
                ``(text, cm_matrix, tm_matrix, font_resource, font_size)``
                whenever collected text is flushed.
            font_resources: Mapping looked up by the first Tf operand; a
                falsy value is replaced by an empty dict.
            fonts: Mapping looked up by the first Tf operand; a falsy value
                is replaced by an empty dict.

        Only ``text``, ``output`` and ``rtl_dir`` are reset here; matrices,
        the graphics-state stack and font state keep their current values.
        """
        self.orientations = orientations
        self.visitor_text = visitor_text
        self.font_resources = font_resources or {}
        self.fonts = fonts or {}

        # Reset state
        self.text = ""
        self.output = ""
        self.rtl_dir = False

    def compute_str_widths(self, str_widths: float) -> float:
        """Return ``str_widths`` divided by 1000."""
        return str_widths / 1000

    def process_operation(self, operator: bytes, operands: list[Any]) -> None:
        """
        Dispatch one content-stream operation to its handler.

        Operators without an entry in ``operation_handlers`` are ignored.
        After Td, Tm, T* and Tj the positioning post-processing is run with
        the width returned by the handler (``0.0`` when it returned nothing).
        """
        handler = self.operation_handlers.get(operator)
        if handler is None:
            return
        str_widths = handler(operands)

        # Post-process operations that affect text positioning
        if operator in self._TEXT_POSITIONING_OPERATORS:
            self._post_process_text_operation(str_widths or 0.0)

    def _post_process_text_operation(self, str_widths: float) -> None:
        """
        Handle common post-processing for text positioning operations.

        Delegates to ``crlf_space_check`` and stores the text, output and
        previous matrices it returns. When no text is pending afterwards,
        the memo matrices are re-anchored to the current matrices.
        """
        try:
            self.text, self.output, self.cm_prev, self.tm_prev = crlf_space_check(
                self.text,
                (self.cm_prev, self.tm_prev),
                (self.cm_matrix, self.tm_matrix),
                (self.memo_cm, self.memo_tm),
                self.font_resource,
                self.orientations,
                self.output,
                self.font_size,
                self.visitor_text,
                str_widths,
                self.compute_str_widths(self.font_size * self._space_width),
                self._actual_str_size["str_height"],
            )
            if self.text == "":
                self.memo_cm = self.cm_matrix.copy()
                self.memo_tm = self.tm_matrix.copy()
        except OrientationNotFoundError:
            # Deliberately ignored: state is left exactly as it was.
            pass

    def _handle_tj(
        self,
        text: str,
        operands: list[Union[str, TextStringObject]],
        cm_matrix: list[float],
        tm_matrix: list[float],
        font_resource: Optional[DictionaryObject],
        font: Font,
        orientations: tuple[int, ...],
        font_size: float,
        rtl_dir: bool,
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
        actual_str_size: dict[str, float],
    ) -> tuple[str, bool, dict[str, float]]:
        """
        Append the text of a Tj operation and account for its size.

        Returns:
            ``(text, rtl_dir, actual_str_size)``. ``actual_str_size`` is the
            same dict that was passed in, updated in place: ``"str_widths"``
            grows by the computed width times ``font_size`` and
            ``"str_height"`` is set to ``font_size``.
        """
        text_operands, is_str_operands = get_text_operands(
            operands, cm_matrix, tm_matrix, font, orientations
        )
        if is_str_operands:
            text += text_operands
            # Spaces use the font's space width; other characters are measured.
            font_widths = sum(
                font.space_width if x == " " else font.text_width(x)
                for x in text_operands
            )
        else:
            text, rtl_dir, font_widths = get_display_str(
                text,
                cm_matrix,
                tm_matrix,  # text matrix
                font_resource,
                font,
                text_operands,
                font_size,
                rtl_dir,
                visitor_text,
            )
        actual_str_size["str_widths"] += font_widths * font_size
        actual_str_size["str_height"] = font_size
        return text, rtl_dir, actual_str_size

    def _flush_text(self) -> None:
        """Flush accumulated text to output and call visitor if present."""
        self.output += self.text
        if self.visitor_text is not None:
            self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size)
        self.text = ""
        # The next piece of text starts at the current position.
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()

    def _consume_str_widths(self) -> float:
        """Return the accumulated string width (scaled) and reset it to zero."""
        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
        self._actual_str_size["str_widths"] = 0.0
        return str_widths

    # Operation handlers

    def _handle_bt(self, operands: list[Any]) -> None:
        """Handle BT (Begin Text) operation - Table 5.4 page 405."""
        self.tm_matrix = self._identity_matrix()
        self._flush_text()

    def _handle_et(self, operands: list[Any]) -> None:
        """Handle ET (End Text) operation - Table 5.4 page 405."""
        self._flush_text()

    def _handle_save_graphics_state(self, operands: list[Any]) -> None:
        """Handle q (Save graphics state) operation - Table 4.7 page 219."""
        self.cm_stack.append(
            (
                # Copy so that later in-place edits of cm_matrix cannot
                # alter the saved snapshot.
                self.cm_matrix.copy(),
                self.font_resource,
                self.font,
                self.font_size,
                self.char_scale,
                self.space_scale,
                self.TL,
            )
        )

    def _handle_restore_graphics_state(self, operands: list[Any]) -> None:
        """
        Handle Q (Restore graphics state) operation - Table 4.7 page 219.

        An unbalanced Q (empty stack) only resets ``cm_matrix`` to identity.
        """
        try:
            (
                self.cm_matrix,
                self.font_resource,
                self.font,
                self.font_size,
                self.char_scale,
                self.space_scale,
                self.TL,
            ) = self.cm_stack.pop()
        except IndexError:  # Q without a matching q
            self.cm_matrix = self._identity_matrix()

    def _handle_cm(self, operands: list[Any]) -> None:
        """
        Handle cm (Modify current matrix) operation - Table 4.7 page 219.

        Pending text is flushed first, so it is reported with the matrices
        that were in effect while it was collected.
        """
        self._flush_text()
        try:
            self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix)
        except Exception:
            # Best effort: unusable operands fall back to identity.
            self.cm_matrix = self._identity_matrix()
        # Re-anchor the memo to the updated matrix.
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()

    def _handle_tz(self, operands: list[Any]) -> None:
        """Handle Tz (Set horizontal text scaling) operation - Table 5.2 page 398."""
        self.char_scale = float(operands[0]) / 100 if operands else 1.0

    def _handle_tw(self, operands: list[Any]) -> None:
        """Handle Tw (Set word spacing) operation - Table 5.2 page 398."""
        self.space_scale = 1.0 + float(operands[0] if operands else 0.0)

    def _handle_tl(self, operands: list[Any]) -> None:
        """
        Handle TL (Set Text Leading) operation - Table 5.2 page 398.

        The stored value is the operand (``0.0`` when missing) multiplied by
        the current font size and by ``sqrt(tm[0]**2 + tm[2]**2)``.
        """
        scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2)
        self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x

    def _handle_tf(self, operands: list[Any]) -> None:
        """
        Handle Tf (Set font size) operation - Table 5.2 page 398.

        Pending text is emitted first. ``operands[0]`` selects the font from
        ``font_resources``/``fonts``; an unknown name switches to a
        placeholder font. ``operands[1]`` is the new font size; when it is
        missing or not convertible, the previous size is kept.
        """
        if self.text != "":
            self.output += self.text  # .translate(cmap)
            # Unlike _flush_text, the visitor is only called for non-empty text.
            if self.visitor_text is not None:
                self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.font_resource, self.font_size)
        self.text = ""
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()
        try:
            self.font_resource = self.font_resources[operands[0]]
            self.font = self.fonts[operands[0]]
        except KeyError:  # font not found
            self.font_resource = None
            font_descriptor = FontDescriptor()
            self.font = Font(
                "Unknown",
                space_width=250,
                encoding=dict.fromkeys(range(256), "�"),
                font_descriptor=font_descriptor,
                character_map={},
                character_widths=font_descriptor.character_widths
            )

        self._space_width = self.font.space_width / 2  # Actually the width of _half_ a space...
        try:
            self.font_size = float(operands[1])
        except Exception:
            pass  # keep previous size

    def _handle_td(self, operands: list[Any]) -> float:
        """
        Handle Td (Move text position) operation - Table 5.5 page 406.

        Returns:
            The string width accumulated since the last positioning
            operation, as scaled by ``compute_str_widths``.
        """
        # A special case is a translating only tm:
        # tm = [1, 0, 0, 1, e, f]
        # i.e. tm[4] += tx, tm[5] += ty.
        tx, ty = float(operands[0]), float(operands[1])
        self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
        self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
        return self._consume_str_widths()

    def _handle_tm(self, operands: list[Any]) -> float:
        """
        Handle Tm (Set text matrix) operation - Table 5.5 page 406.

        Returns:
            The string width accumulated since the last positioning
            operation, as scaled by ``compute_str_widths``.
        """
        self.tm_matrix = [float(operand) for operand in operands[:6]]
        return self._consume_str_widths()

    def _handle_t_star(self, operands: list[Any]) -> float:
        """
        Handle T* (Move to next line) operation - Table 5.5 page 406.

        Returns:
            The string width accumulated since the last positioning
            operation, as scaled by ``compute_str_widths``.
        """
        self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
        self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
        return self._consume_str_widths()

    def _handle_tj_operation(self, operands: list[Any]) -> float:
        """Handle Tj (Show text) operation - Table 5.5 page 406."""
        self.text, self.rtl_dir, self._actual_str_size = self._handle_tj(
            self.text,
            operands,
            self.cm_matrix,
            self.tm_matrix,
            self.font_resource,
            self.font,
            self.orientations,
            self.font_size,
            self.rtl_dir,
            self.visitor_text,
            self._actual_str_size,
        )
        return 0.0  # str_widths will be handled in post-processing