Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_text_extractor.py: 8%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

160 statements  

1# Copyright (c) 2006, Mathieu Fenniak 

2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

3# 

4# All rights reserved. 

5# 

6# Redistribution and use in source and binary forms, with or without 

7# modification, are permitted provided that the following conditions are 

8# met: 

9# 

10# * Redistributions of source code must retain the above copyright notice, 

11# this list of conditions and the following disclaimer. 

12# * Redistributions in binary form must reproduce the above copyright notice, 

13# this list of conditions and the following disclaimer in the documentation 

14# and/or other materials provided with the distribution. 

15# * The name of the author may not be used to endorse or promote products 

16# derived from this software without specific prior written permission. 

17# 

18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

28# POSSIBILITY OF SUCH DAMAGE. 

29 

30import math 

31from typing import Any, Callable, Dict, List, Optional, Tuple, Union 

32 

33from .._cmap import build_font_width_map, compute_font_width, get_actual_str_key 

34from ..generic import DictionaryObject, TextStringObject 

35from . import OrientationNotFoundError, crlf_space_check, get_display_str, get_text_operands, mult 

36 

37 

class TextExtraction:
    """
    A class to handle PDF text extraction operations.

    This class encapsulates all the state and operations needed for extracting
    text from PDF content streams, replacing the nested functions and nonlocal
    variables in the original implementation.

    Operators are fed one at a time to :meth:`process_operation`. Text that has
    been flushed accumulates in ``output``; text that is still being built is
    kept in ``text``.
    """

    def __init__(self) -> None:
        """Create an extractor with identity matrices and default text state."""
        # Cache filled by _get_actual_font_widths:
        # font resource name -> (width map, space character, space width).
        self._font_width_maps: Dict[str, Tuple[Dict[Any, float], str, float]] = {}

        # Text extraction state variables
        self.cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        self.tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        # Stack used by the q/Q operators. Each entry is
        # (cm_matrix, cmap, font_size, char_scale, space_scale, _space_width, TL).
        self.cm_stack: List[
            Tuple[
                List[float],
                Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
                float,
                float,
                float,
                float,
                float,
            ]
        ] = []

        # Store the last modified matrices; can be an intermediate position
        self.cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        self.tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

        # Store the position at the beginning of building the text
        self.memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        self.memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

        self.char_scale = 1.0
        self.space_scale = 1.0
        self._space_width: float = 500.0  # will be set correctly at first Tf
        self._actual_str_size: Dict[str, float] = {
            "str_widths": 0.0,
            "space_width": 0.0,
            "str_height": 0.0,
        }  # will be set to string length calculation result
        self.TL = 0.0
        self.font_size = 12.0  # default used until a Tf operator provides a size

        # Text extraction variables
        self.text: str = ""
        self.output: str = ""
        self.rtl_dir: bool = False  # right-to-left
        self.cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]] = (
            "charmap",
            {},
            "NotInitialized",
            None,
        )  # (encoding, CMAP, font resource name, font)
        self.orientations: Tuple[int, ...] = (0, 90, 180, 270)
        # Called as visitor_text(text, memo_cm, memo_tm, font dictionary, font_size)
        self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None
        # font resource name -> (font_type, space width / 2, encoding, map_dict, font_dict)
        self.cmaps: Dict[str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject]] = {}

    def initialize_extraction(
        self,
        orientations: Tuple[int, ...] = (0, 90, 180, 270),
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
        cmaps: Optional[
            Dict[str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject]]
        ] = None,
    ) -> None:
        """
        Initialize the extractor with extraction parameters.

        Args:
            orientations: Orientations forwarded to the text helpers.
            visitor_text: Optional callback invoked whenever text is flushed.
            cmaps: Character maps by font resource name; ``None`` (or an empty
                mapping) results in a fresh empty dictionary.

        The text buffers, the right-to-left flag and the per-font width cache
        are reset. Matrices, the graphics-state stack and the current font are
        left untouched.
        """
        self.orientations = orientations
        self.visitor_text = visitor_text
        self.cmaps = cmaps or {}

        # Reset state
        self.text = ""
        self.output = ""
        self.rtl_dir = False
        # The cached widths are keyed by font resource name and were computed
        # from the previous ``cmaps``; the same name may now describe another
        # font, so the cache must not outlive the table it was built from.
        self._font_width_maps = {}

    def compute_str_widths(self, str_widths: float) -> float:
        """
        Return ``str_widths`` divided by 1000.

        Presumably converts widths expressed in thousandths of a unit (the
        usual PDF glyph-space convention) - confirm against the callers.
        """
        return str_widths / 1000

    def _emit_text(self) -> None:
        """Append the pending text to ``output``, notify the visitor, clear it."""
        self.output += self.text
        if self.visitor_text is not None:
            self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
        self.text = ""

    def _remember_position(self) -> None:
        """Snapshot the current matrices as the start position of the next text."""
        self.memo_cm = self.cm_matrix.copy()
        self.memo_tm = self.tm_matrix.copy()

    def _take_str_widths(self) -> float:
        """Return the accumulated string width (scaled) and reset the accumulator."""
        widths = self.compute_str_widths(self._actual_str_size["str_widths"])
        self._actual_str_size["str_widths"] = 0.0
        return widths

    def _select_font(self, operands: List[Any]) -> None:
        """Apply the operands of a ``Tf`` operator: font resource name, then size."""
        font_name = operands[0]
        try:
            # char_map_tuple: font_type,
            #                 float(sp_width / 2),
            #                 encoding,
            #                 map_dict,
            #                 font_dict (describes the font)
            char_map_tuple = self.cmaps[font_name]
        except KeyError:  # font not found
            # Import here to avoid circular imports
            from .._cmap import unknown_char_map  # noqa: PLC0415

            self.cmap = (
                unknown_char_map[2],
                unknown_char_map[3],
                f"???{font_name}",
                None,
            )
            self._space_width = unknown_char_map[1]
        else:
            # current cmap: encoding,
            #               map_dict,
            #               font resource name (internal name, not the real font name),
            #               font_dict
            self.cmap = (
                char_map_tuple[2],
                char_map_tuple[3],
                font_name,
                char_map_tuple[4],
            )
            self._space_width = char_map_tuple[1]
        try:
            self.font_size = float(operands[1])
        except Exception:
            # Deliberately lenient: a missing or unusable size operand
            # keeps the previous size.
            pass

    def process_operation(self, operator: bytes, operands: List[Any]) -> None:
        """
        Update the extraction state for one content-stream operator.

        Handled operators: ``BT``, ``ET``, ``q``, ``Q``, ``cm``, ``Tz``, ``Tw``,
        ``TL``, ``Tf``, ``Td``, ``Tm``, ``T*`` and ``Tj``. Any other operator is
        ignored.

        Args:
            operator: The operator name as bytes.
            operands: The operands that preceded the operator.
        """
        str_widths: float = 0.0

        # Table 5.4 page 405
        if operator == b"BT":  # Begin Text
            self.tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
            # Flush text:
            self._emit_text()
            self._remember_position()
            return
        if operator == b"ET":  # End Text
            # Flush text:
            self._emit_text()
            self._remember_position()

        # Table 4.7 "Graphics state operators", page 219
        # cm_matrix calculation is reserved for later
        elif operator == b"q":  # Save graphics state
            self.cm_stack.append(
                (
                    self.cm_matrix,
                    self.cmap,
                    self.font_size,
                    self.char_scale,
                    self.space_scale,
                    self._space_width,
                    self.TL,
                )
            )
        elif operator == b"Q":  # Restore graphics state
            if self.cm_stack:
                (
                    self.cm_matrix,
                    self.cmap,
                    self.font_size,
                    self.char_scale,
                    self.space_scale,
                    self._space_width,
                    self.TL,
                ) = self.cm_stack.pop()
            else:
                # Unbalanced Q: nothing to restore, fall back to the identity matrix.
                self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        elif operator == b"cm":  # Modify current matrix
            self._emit_text()
            try:
                self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix)
            except Exception:
                # Deliberately lenient: unusable operands reset the matrix.
                self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
            # The start position is recorded after the matrix has changed.
            self._remember_position()

        # Table 5.2 page 398
        elif operator == b"Tz":  # Set horizontal text scaling
            self.char_scale = float(operands[0]) / 100 if operands else 1.0
        elif operator == b"Tw":  # Set word spacing
            self.space_scale = 1.0 + float(operands[0] if operands else 0.0)
        elif operator == b"TL":  # Set Text Leading
            scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2)
            self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x
        elif operator == b"Tf":  # Set font size
            # Unlike BT/ET/cm, the visitor is only notified when text is pending.
            if self.text != "":
                self._emit_text()
            self._remember_position()
            self._select_font(operands)
        # Table 5.5 page 406
        elif operator == b"Td":  # Move text position
            # A special case is a translating only tm:
            # tm = [1, 0, 0, 1, e, f]
            # i.e. tm[4] += tx, tm[5] += ty.
            tx, ty = float(operands[0]), float(operands[1])
            self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
            self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
            str_widths = self._take_str_widths()
        elif operator == b"Tm":  # Set text matrix
            self.tm_matrix = [float(operand) for operand in operands[:6]]
            str_widths = self._take_str_widths()
        elif operator == b"T*":  # Move to next line
            self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
            self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
            str_widths = self._take_str_widths()
        elif operator == b"Tj":  # Show text
            self.text, self.rtl_dir, self._actual_str_size = self._handle_tj(
                self.text,
                operands,
                self.cm_matrix,
                self.tm_matrix,
                self.cmap,
                self.orientations,
                self.font_size,
                self.rtl_dir,
                self.visitor_text,
                self._space_width,
                self._actual_str_size,
            )
        else:
            return

        if operator in {b"Td", b"Tm", b"T*", b"Tj"}:
            # crlf_space_check is a package helper; it receives the previous,
            # current and memorised matrices plus the measured widths and
            # returns the updated text, output and "previous" matrices.
            try:
                self.text, self.output, self.cm_prev, self.tm_prev = crlf_space_check(
                    self.text,
                    (self.cm_prev, self.tm_prev),
                    (self.cm_matrix, self.tm_matrix),
                    (self.memo_cm, self.memo_tm),
                    self.cmap,
                    self.orientations,
                    self.output,
                    self.font_size,
                    self.visitor_text,
                    str_widths,
                    self.compute_str_widths(self._actual_str_size["space_width"]),
                    self._actual_str_size["str_height"],
                )
                if self.text == "":
                    self._remember_position()
            except OrientationNotFoundError:
                return

    def _get_actual_font_widths(
        self,
        cmap: Tuple[
            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
        ],
        text_operands: str,
        font_size: float,
        space_width: float,
    ) -> Tuple[float, float, float]:
        """
        Measure ``text_operands`` with the font described by ``cmap``.

        The width map, space character and space width of a font are computed
        on first use and cached under the font resource name (``cmap[2]``).

        Args:
            cmap: (encoding, CMAP, font resource name, font dictionary or None).
            text_operands: The characters to measure.
            font_size: Factor applied to every returned value.
            space_width: Space width used when the font provides none.

        Returns:
            ``(summed width * font_size, space_width * font_size, font_size)``.
            The second element uses the ``space_width`` argument, not the
            cached per-font space width.
        """
        font_name: str = cmap[2]
        if font_name not in self._font_width_maps:
            font_width_map: Dict[Any, float]
            if cmap[3] is None:
                # No font dictionary: only a default width can be provided.
                space_char = " "
                actual_space_width: float = space_width
                font_width_map = {"default": actual_space_width * 2}
            else:
                space_char = get_actual_str_key(" ", cmap[0], cmap[1])
                font_width_map = build_font_width_map(cmap[3], space_width * 2)
                actual_space_width = compute_font_width(font_width_map, space_char)
            if actual_space_width == 0:
                actual_space_width = space_width
            self._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width)
        font_width_map, space_char, actual_space_width = self._font_width_maps[font_name]

        font_widths: float = 0
        if text_operands:
            for char in text_operands:
                if char == space_char:
                    font_widths += actual_space_width
                else:
                    font_widths += compute_font_width(font_width_map, char)
        return (font_widths * font_size, space_width * font_size, font_size)

    def _handle_tj(
        self,
        text: str,
        operands: List[Union[str, TextStringObject]],
        cm_matrix: List[float],
        tm_matrix: List[float],
        cmap: Tuple[
            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
        ],
        orientations: Tuple[int, ...],
        font_size: float,
        rtl_dir: bool,
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
        space_width: float,
        actual_str_size: Dict[str, float],
    ) -> Tuple[str, bool, Dict[str, float]]:
        """
        Process the operands of a ``Tj`` (show text) operator.

        The operands are resolved with ``get_text_operands``. When that helper
        reports string operands they are appended to ``text`` as they are;
        otherwise ``get_display_str`` produces the new text and direction flag.
        The measured width is then added to ``actual_str_size["str_widths"]``
        and the ``"space_width"`` and ``"str_height"`` entries are replaced.

        Returns:
            ``(text, rtl_dir, actual_str_size)``; ``actual_str_size`` is the
            dictionary that was passed in, updated in place.
        """
        text_operands, is_str_operands = get_text_operands(
            operands, cm_matrix, tm_matrix, cmap, orientations)
        if is_str_operands:
            text += text_operands
        else:
            text, rtl_dir = get_display_str(
                text,
                cm_matrix,
                tm_matrix,  # text matrix
                cmap,
                text_operands,
                font_size,
                rtl_dir,
                visitor_text,
            )
        font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = (
            self._get_actual_font_widths(cmap, text_operands, font_size, space_width))
        actual_str_size["str_widths"] += font_widths

        return text, rtl_dir, actual_str_size

353 visitor_text, 

354 ) 

355 font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = ( 

356 self._get_actual_font_widths(cmap, text_operands, font_size, space_width)) 

357 actual_str_size["str_widths"] += font_widths 

358 

359 return text, rtl_dir, actual_str_size