Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/__init__.py: 14%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

111 statements  

1""" 

2Code related to text extraction. 

3 

4Some parts are still in _page.py. In doubt, they will stay there. 

5""" 

6 

7import math 

8from typing import Any, Callable, Dict, List, Optional, Tuple, Union 

9 

10from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding 

11 

# Bounds (inclusive) of an extra, user-defined code-point range that
# get_display_str() treats as right-to-left. With the default of -1 for both
# bounds no additional range is added. Changed through set_custom_rtl().
CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
# Extra code points that get_display_str() inserts in the current writing
# direction instead of switching direction. Changed through set_custom_rtl().
CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
# NOTE(review): not referenced in the part of the module visible here;
# presumably consumed by layout-mode extraction elsewhere — confirm.
LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5

16 

17 

class OrientationNotFoundError(Exception):
    """Raised when the orientation of a text matrix is not one of the requested orientations."""

20 

21 

def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, List[int], None] = None,
) -> Tuple[int, int, List[int]]:
    """
    Change the Right-To-Left and special characters custom parameters.

    Args:
        _min: The new minimum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            If set to an integer, it is used as the code point directly.
            If set to a one-character string, it is converted to its Unicode
            code point with ``ord``.
            The default value is -1, which sets no additional range to be converted.
        _max: The new maximum value for the range of custom characters that will
            be written right to left.
            If set to ``None``, the value will not be changed.
            If set to an integer, it is used as the code point directly.
            If set to a one-character string, it is converted to its Unicode
            code point with ``ord``.
            The default value is -1, which sets no additional range to be converted.
        specials: The new list of special characters to be inserted in the
            current insertion order.
            If set to ``None``, the current value will not be changed.
            If set to a string, it will be converted to a list of Unicode
            code points (one per character).
            If set to a list, a copy of it is stored.
            The default value is an empty list.

    Returns:
        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.

    Raises:
        TypeError: If ``_min`` or ``_max`` is a string whose length is not 1
            (raised by ``ord``).

    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
    if isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    elif isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    if isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    elif isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    if isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
    elif isinstance(specials, list):
        # Keep a private copy: mutating the caller's list afterwards must not
        # silently change the module-wide setting.
        CUSTOM_RTL_SPECIAL_CHARS = list(specials)
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS

66 

67 

def mult(m: List[float], n: List[float]) -> List[float]:
    """
    Multiply two transformation matrices given in 6-element form.

    Each argument ``[a, b, c, d, e, f]`` stands for the 3x3 matrix
    ``[[a, b, 0], [c, d, 0], [e, f, 1]]``; the result is ``m x n`` in the
    same 6-element form.

    Args:
        m: Left-hand matrix (at least 6 elements).
        n: Right-hand matrix (at least 6 elements).

    Returns:
        A new 6-element list holding the product.
    """
    m_a, m_b, m_c, m_d, m_e, m_f = m[0], m[1], m[2], m[3], m[4], m[5]
    n_a, n_b, n_c, n_d, n_e, n_f = n[0], n[1], n[2], n[3], n[4], n[5]

    # Upper-left 2x2 part (rotation / scale / skew).
    linear = [
        m_a * n_a + m_b * n_c,
        m_a * n_b + m_b * n_d,
        m_c * n_a + m_d * n_c,
        m_c * n_b + m_d * n_d,
    ]
    # Last row: translation of m mapped through n, plus n's own translation.
    translation = [
        m_e * n_a + m_f * n_c + n_e,
        m_e * n_b + m_f * n_d + n_f,
    ]
    return linear + translation

77 

78 

def orient(m: List[float]) -> int:
    """
    Classify a 6-element transformation matrix into one of four orientations.

    Args:
        m: Matrix in ``[a, b, c, d, e, f]`` form; only ``m[3]`` and ``m[1]``
            are inspected.

    Returns:
        ``0`` if ``m[3]`` is clearly positive, ``180`` if it is clearly
        negative; otherwise ``90`` when ``m[1]`` is positive and ``270``
        in every remaining case.
    """
    tolerance = 1e-6  # values of m[3] within +/- tolerance count as zero
    d_component = m[3]
    if d_component > tolerance:
        return 0
    if d_component < -tolerance:
        return 180
    return 90 if m[1] > 0 else 270

87 

88 

def crlf_space_check(
    text: str,
    cmtm_prev: Tuple[List[float], List[float]],
    cmtm_matrix: Tuple[List[float], List[float]],
    memo_cmtm: Tuple[List[float], List[float]],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    output: str,
    font_size: float,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    str_widths: float,
    spacewidth: float,
    str_height: float,
) -> Tuple[str, str, List[float], List[float]]:
    """
    Decide whether a line break or a space must be inserted before new text.

    The displacement between the previous and the current combined
    (text matrix x current matrix) position is compared against thresholds
    derived from the previous string's size:

    * If the displacement across the writing direction exceeds 80 % of the
      smaller of ``str_height * scale_prev_y`` and ``font_size * scale_y``
      and ``output + text`` does not already end with a newline, ``text``
      plus ``"\\n"`` is moved to ``output`` (and reported to
      ``visitor_text``), and ``text`` is reset.
    * Otherwise, if the displacement along the writing direction is at least
      ``(spacewidth + str_widths) * scale_prev_x`` and ``output + text``
      does not already end with a space, a space is appended to ``text``.

    Args:
        text: Text collected but not yet moved to ``output``.
        cmtm_prev: ``(cm, tm)`` matrices of the previous text position.
        cmtm_matrix: ``(cm, tm)`` matrices of the current text position.
        memo_cmtm: ``(cm, tm)`` matrices passed through to ``visitor_text``.
        cmap: Character-map tuple; only ``cmap[3]`` is used here (forwarded
            to ``visitor_text``).
        orientations: Orientations (in degrees) that are accepted.
        output: Text emitted so far.
        font_size: Current font size.
        visitor_text: Optional callback receiving
            ``(text, cm, tm, cmap[3], font_size)`` when a line is flushed.
        str_widths: Width of the previous string.
        spacewidth: Width of a space character.
        str_height: Height of the previous string.

    Returns:
        ``(text, output, cm_prev, tm_prev)`` where the two matrices are
        copies of the current ``cm`` and ``tm`` matrices.

    Raises:
        OrientationNotFoundError: If the current orientation is not in
            ``orientations``.
    """
    cm_prev = cmtm_prev[0]
    tm_prev = cmtm_prev[1]
    cm_matrix = cmtm_matrix[0]
    tm_matrix = cmtm_matrix[1]
    memo_cm = memo_cmtm[0]
    memo_tm = memo_cmtm[1]

    m_prev = mult(tm_prev, cm_prev)
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    delta_x = m[4] - m_prev[4]
    delta_y = m[5] - m_prev[5]
    # Table 108 of the 1.7 reference ("Text positioning operators")
    scale_prev_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2)
    scale_prev_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2)
    scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2)

    if orientation not in orientations:
        raise OrientationNotFoundError
    if orientation in (0, 180):
        moved_height: float = delta_y
        moved_width: float = delta_x
    elif orientation in (90, 270):
        # Rotated text: the roles of the x and y displacement are swapped.
        moved_height = delta_x
        moved_width = delta_y
    try:
        if abs(moved_height) > 0.8 * min(str_height * scale_prev_y, font_size * scale_y):
            if (output + text)[-1] != "\n":
                output += text + "\n"
                if visitor_text is not None:
                    visitor_text(
                        text + "\n",
                        memo_cm,
                        memo_tm,
                        cmap[3],
                        font_size,
                    )
                text = ""
        elif (
            (moved_width >= (spacewidth + str_widths) * scale_prev_x)
            and (output + text)[-1] != " "
        ):
            text += " "
    except Exception:
        # Best effort: ``(output + text)[-1]`` raises IndexError while nothing
        # has been extracted yet, in which case no separator is inserted.
        # Exceptions raised by ``visitor_text`` are swallowed here as well.
        pass
    tm_prev = tm_matrix.copy()
    cm_prev = cm_matrix.copy()
    return text, output, cm_prev, tm_prev

154 

155 

def get_text_operands(
    operands: List[Union[str, TextStringObject]],
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...]
) -> Tuple[str, bool]:
    """
    Convert the first operand of a text-showing operator into a string.

    Args:
        operands: Operands of the operator; only ``operands[0]`` is read.
        cm_matrix: Current transformation matrix (6 elements).
        tm_matrix: Current text matrix (6 elements).
        cmap: Character-map tuple; ``cmap[0]`` is either a codec name or a
            mapping from byte value to string used to decode non-``str``
            operands.
        orientations: Orientations (in degrees) for which text is extracted.

    Returns:
        ``(text, is_str_operands)``. ``text`` is empty when the orientation
        of ``tm_matrix x cm_matrix`` is not in ``orientations`` or when
        ``operands`` is empty. ``is_str_operands`` is ``True`` only when the
        operand already was a ``str`` and was returned unchanged.
    """
    t: str = ""
    is_str_operands = False
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    if orientation in orientations and len(operands) > 0:
        operand = operands[0]
        if isinstance(operand, str):
            t = operand
            is_str_operands = True
        else:
            # The operand is not a str on this path, so it is decoded as a
            # byte sequence using the font encoding.
            tt: bytes = operand
            encoding = cmap[0]
            if isinstance(encoding, str):
                try:
                    t = tt.decode(encoding, "surrogatepass")  # apply str encoding
                except Exception:
                    # the data does not match the expectation,
                    # we use the alternative ;
                    # text extraction may not be good
                    t = tt.decode(
                        "utf-16-be" if encoding == "charmap" else "charmap",
                        "surrogatepass",
                    )  # apply str encoding
            else:  # apply dict encoding
                # NOTE(review): for a byte >= 0x80 missing from the mapping,
                # ``bytes((x,)).decode()`` raises UnicodeDecodeError (UTF-8
                # default) — confirm whether that can occur in practice.
                t = "".join(
                    [encoding[x] if x in encoding else bytes((x,)).decode() for x in tt]
                )
    return (t, is_str_operands)

196 

197 

def get_display_str(
    text: str,
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    text_operands: str,
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]]
) -> Tuple[str, bool]:
    """
    Append the characters of ``text_operands`` to ``text`` in display order.

    Each character is first translated through ``cmap[1]`` (falling back to
    the character itself) and then classified by code point:

    * direction-neutral characters are added in the current direction
      (prepended when ``rtl_dir`` is true, appended otherwise);
    * right-to-left characters are prepended, switching to RTL if needed;
    * every other character is appended, switching to LTR if needed.

    On every direction switch the text collected so far is handed to
    ``visitor_text`` (when given) and the buffer is restarted.

    Args:
        text: Text collected so far for the current run.
        cm_matrix: Current transformation matrix, forwarded to ``visitor_text``.
        tm_matrix: Current text matrix, forwarded to ``visitor_text``.
        cmap: Character-map tuple; ``cmap[1]`` is the character translation
            map and ``cmap[3]`` is forwarded to ``visitor_text``.
        text_operands: Characters to add.
        font_size: Current font size, forwarded to ``visitor_text``.
        rtl_dir: Whether the current writing direction is right-to-left.
        visitor_text: Optional callback receiving
            ``(text, cm_matrix, tm_matrix, cmap[3], font_size)``.

    Returns:
        ``(text, rtl_dir)`` with the updated buffer and writing direction.
    """
    # "\u0590 - \u08FF \uFB50 - \uFDFF"
    for x in [cmap[1].get(x, x) for x in text_operands]:
        # x can be a sequence of bytes ; ex: habibi.pdf
        # A multi-character replacement is given code 1, which puts it in the
        # direction-neutral class below.
        if len(x) == 1:
            xx = ord(x)
        else:
            xx = 1
        # fmt: off
        if (
            # cases where the current inserting order is kept
            (xx <= 0x2F)                        # controls, space and punctuation below the digits
            or 0x3A <= xx <= 0x40               # ":" to "@" (the digits 0x30-0x39 are not included)
            or 0x2000 <= xx <= 0x206F           # General Punctuation block
            or 0x20A0 <= xx <= 0x21FF           # currency symbols to arrows (0x2070-0x209F super/subscripts are not included)
            or xx in CUSTOM_RTL_SPECIAL_CHARS   # user-defined, see set_custom_rtl()
        ):
            text = x + text if rtl_dir else text + x
        elif (  # right-to-left characters set
            0x0590 <= xx <= 0x08FF              # Hebrew, Arabic and neighbouring RTL blocks
            or 0xFB1D <= xx <= 0xFDFF           # Hebrew / Arabic presentation forms A
            or 0xFE70 <= xx <= 0xFEFF           # Arabic presentation forms B
            or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX  # user-defined range
        ):
            if not rtl_dir:
                rtl_dir = True
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                # NOTE(review): the buffer is restarted on a direction switch,
                # so the earlier characters are only delivered through
                # visitor_text — confirm callers do not expect them back.
                text = ""
            text = x + text
        else:  # left-to-right
            if rtl_dir:
                rtl_dir = False
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                text = ""
            text = text + x
        # fmt: on
    return text, rtl_dir