Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/__init__.py: 15%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

114 statements  

1""" 

2Code related to text extraction. 

3 

4Some parts are still in _page.py. In doubt, they will stay there. 

5""" 

6 

7import math 

8from typing import Any, Callable, Optional, Union 

9 

10from .._font import Font 

11from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding 

12 

# Inclusive range of character codes that get_display_str treats as
# right-to-left in addition to its built-in ranges. Both bounds are changed
# through set_custom_rtl. With both bounds at -1 no character code can fall
# inside the range, so no additional range is active.
CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
# Character codes that get_display_str inserts following the current writing
# direction without switching it. Changed through set_custom_rtl.
CUSTOM_RTL_SPECIAL_CHARS: list[int] = []
# NOTE(review): not referenced in the visible part of this module; presumably
# a threshold, counted in space widths, used by layout-mode extraction -
# confirm against the code that reads it.
LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5

17 

18 

class OrientationNotFoundError(Exception):
    """Raised when the orientation of a text run is not among the requested ones."""

21 

22 

def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, list[int], None] = None,
) -> tuple[int, int, list[int]]:
    """
    Update the module-level custom Right-To-Left (RTL) parameters.

    Every argument is optional. An argument left to ``None`` (or given with
    an unsupported type) keeps the corresponding setting unchanged.

    Args:
        _min: Lower bound of the custom range of characters that will be
            written right to left.
            An integer is stored as is; a one-character string is stored as
            its code point (``ord``).
            The module default is -1, which enables no additional range.
        _max: Upper bound of the custom range of characters that will be
            written right to left.
            An integer is stored as is; a one-character string is stored as
            its code point (``ord``).
            The module default is -1, which enables no additional range.
        specials: Special characters to be inserted in the current
            insertion order.
            A string is converted to the list of the code points of its
            characters; a list is stored as is (the same list object, not
            a copy).
            The module default is an empty list.

    Returns:
        A tuple containing the resulting values of ``CUSTOM_RTL_MIN``,
        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.

    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS

    # Lower bound: strings go through ord(), integers are taken verbatim.
    if isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    elif isinstance(_min, int):
        CUSTOM_RTL_MIN = _min

    # Upper bound: same conversion rules as the lower bound.
    if isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    elif isinstance(_max, int):
        CUSTOM_RTL_MAX = _max

    # Special characters: a list is kept as the very same object.
    if isinstance(specials, list):
        CUSTOM_RTL_SPECIAL_CHARS = specials
    elif isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = list(map(ord, specials))

    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS

67 

68 

def mult(m: list[float], n: list[float]) -> list[float]:
    """
    Multiply two transformation matrices given in six-number form.

    A list ``[a, b, c, d, e, f]`` stands for the 3x3 matrix
    ``[[a, b, 0], [c, d, 0], [e, f, 1]]``; the result is ``m x n`` in the
    same six-number form.

    Args:
        m: Left operand; indices 0 to 5 are read.
        n: Right operand; indices 0 to 5 are read.

    Returns:
        A new list of six numbers; neither input is modified.

    """
    # Rows 0 and 2 of the flat list start the two rows of the 2x2 linear part.
    linear = [
        m[row] * n[col] + m[row + 1] * n[col + 2]
        for row in (0, 2)
        for col in (0, 1)
    ]
    # The translation row of m is transformed by n, then n's own translation
    # is added.
    translation = [m[4] * n[col] + m[5] * n[col + 2] + n[col + 4] for col in (0, 1)]
    return linear + translation

78 

79 

def orient(m: list[float]) -> int:
    """
    Classify a transformation matrix into one of four orientations.

    Args:
        m: Matrix in six-number form; only ``m[3]`` and, when ``m[3]`` is
            within ``1e-6`` of zero, ``m[1]`` are inspected.

    Returns:
        0 if ``m[3]`` is clearly positive, 180 if it is clearly negative;
        otherwise 90 if ``m[1]`` is positive and 270 if it is not.

    """
    vertical_scale = m[3]
    # A noticeable m[3] decides between upright and upside-down text.
    if abs(vertical_scale) > 1e-6:
        return 0 if vertical_scale > 0 else 180
    # m[3] is (almost) zero: the sign of m[1] picks the quarter turn.
    return 90 if m[1] > 0 else 270

88 

89 

def crlf_space_check(
    text: str,
    cmtm_prev: tuple[list[float], list[float]],
    cmtm_matrix: tuple[list[float], list[float]],
    memo_cmtm: tuple[list[float], list[float]],
    font_resource: Optional[DictionaryObject],
    orientations: tuple[int, ...],
    output: str,
    font_size: float,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    str_widths: float,
    spacewidth: float,
    str_height: float,
) -> tuple[str, str, list[float], list[float]]:
    """
    Insert a line break or a space depending on how far the text position moved.

    The previous and the current position are each obtained by multiplying
    the text matrix with the current transformation matrix; the difference
    of their translation parts is the movement that gets evaluated.

    Args:
        text: Pending text that has not been appended to ``output`` yet.
        cmtm_prev: ``(cm, tm)`` matrices of the previous position.
        cmtm_matrix: ``(cm, tm)`` matrices of the current position.
        memo_cmtm: ``(cm, tm)`` matrices handed to ``visitor_text`` when the
            pending text is flushed.
        font_resource: Passed through to ``visitor_text``.
        orientations: Orientations (as returned by ``orient``) to accept.
        output: Text collected so far.
        font_size: Current font size; passed to ``visitor_text`` and used
            for the line-break threshold.
        visitor_text: Optional callback invoked with the flushed text,
            ``memo_cmtm[0]``, ``memo_cmtm[1]``, ``font_resource`` and
            ``font_size``.
        str_widths: Width term added to ``spacewidth`` for the space threshold.
        spacewidth: Width of a space, used for the space threshold.
        str_height: Height term used for the line-break threshold.

    Returns:
        A tuple ``(text, output, cm_prev, tm_prev)`` where the two matrices
        are copies of the current ``cm`` and ``tm`` matrices.

    Raises:
        OrientationNotFoundError: If the orientation of the current position
            is not in ``orientations``.

    """
    cm_prev = cmtm_prev[0]
    tm_prev = cmtm_prev[1]
    cm_matrix = cmtm_matrix[0]
    tm_matrix = cmtm_matrix[1]
    memo_cm = memo_cmtm[0]
    memo_tm = memo_cmtm[1]

    m_prev = mult(tm_prev, cm_prev)
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    delta_x = m[4] - m_prev[4]
    delta_y = m[5] - m_prev[5]
    # Table 108 of the 1.7 reference ("Text positioning operators")
    scale_prev_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2)
    scale_prev_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2)
    scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2)

    if orientation not in orientations:
        raise OrientationNotFoundError
    if orientation in (0, 180):
        moved_height: float = delta_y
        moved_width: float = delta_x
    else:
        # orient() only yields 0, 90, 180 or 270, so this is the 90/270 case:
        # for quarter turns the roles of the two axes are swapped.
        moved_height = delta_x
        moved_width = delta_y
    try:
        if abs(moved_height) > 0.8 * min(str_height * scale_prev_y, font_size * scale_y):
            # Moved to another line: flush the pending text with a newline,
            # unless what has been collected already ends with one.
            if (output + text)[-1] != "\n":
                output += text + "\n"
                if visitor_text is not None:
                    visitor_text(
                        text + "\n",
                        memo_cm,
                        memo_tm,
                        font_resource,
                        font_size,
                    )
                text = ""
        elif (
            (moved_width >= (spacewidth + str_widths) * scale_prev_x)
            and (output + text)[-1] != " "
        ):
            # Same line, but the gap is at least one space wide.
            text += " "
    except Exception:
        # Best effort: ``(output + text)[-1]`` raises IndexError while nothing
        # has been collected yet. Any other error raised in the block above,
        # including one from ``visitor_text``, is swallowed here as well.
        pass
    tm_prev = tm_matrix.copy()
    cm_prev = cm_matrix.copy()
    return text, output, cm_prev, tm_prev

153 

154 

def get_text_operands(
    operands: list[Union[str, TextStringObject]],
    cm_matrix: list[float],
    tm_matrix: list[float],
    font: Font,
    orientations: tuple[int, ...]
) -> tuple[str, bool]:
    """
    Convert the first operand of a text-showing operation to a string.

    Args:
        operands: Operands of the operation; only ``operands[0]`` is used.
        cm_matrix: Current transformation matrix.
        tm_matrix: Current text matrix.
        font: Font whose ``encoding`` (a codec name or a mapping from byte
            value to string) is used to decode non-string operands.
        orientations: Orientations (as returned by ``orient``) to accept.

    Returns:
        A tuple ``(text, is_str_operands)``. ``text`` is empty when the
        orientation is not accepted or there is no operand.
        ``is_str_operands`` is ``True`` only when ``operands[0]`` already
        was a ``str`` and has been returned unchanged.

    """
    orientation = orient(mult(tm_matrix, cm_matrix))
    if orientation not in orientations or len(operands) == 0:
        return ("", False)

    operand = operands[0]
    if isinstance(operand, str):
        # Already text: nothing to decode.
        return (operand, True)

    # Not a str, so the operand is raw bytes to be decoded with the font.
    raw: bytes = operand
    encoding = font.encoding
    if isinstance(encoding, str):
        try:
            decoded = raw.decode(encoding, "surrogatepass")  # apply str encoding
        except Exception:
            # the data does not match the expectation,
            # we use the alternative ;
            # text extraction may not be good
            decoded = raw.decode(
                "utf-16-be" if encoding == "charmap" else "charmap",
                "surrogatepass",
            )  # apply str encoding
        return (decoded, False)

    # apply dict encoding
    # NOTE(review): for a byte missing from the mapping, bytes.decode() uses
    # UTF-8 and raises UnicodeDecodeError for values >= 0x80 - confirm whether
    # a different fallback is wanted.
    decoded = "".join(
        [encoding[x] if x in encoding else bytes((x,)).decode() for x in raw]
    )
    return (decoded, False)

193 

194 

def get_display_str(
    text: str,
    cm_matrix: list[float],
    tm_matrix: list[float],
    font_resource: Optional[DictionaryObject],
    font: Font,
    text_operands: str,
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]]
) -> tuple[str, bool, float]:
    """
    Append the operand characters to ``text`` honouring the writing direction.

    Each character of ``text_operands`` is first translated through
    ``font.character_map``. Characters of the right-to-left ranges are
    prepended, left-to-right characters are appended, and direction-neutral
    characters follow whatever direction is currently active. When the
    direction switches, the text collected so far is handed to
    ``visitor_text`` (if given) and restarted from empty.

    Args:
        text: Text collected so far for the current direction.
        cm_matrix: Passed through to ``visitor_text``.
        tm_matrix: Passed through to ``visitor_text``.
        font_resource: Passed through to ``visitor_text``.
        font: Provides ``character_map``, ``space_width`` and ``text_width``.
        text_operands: Characters to add.
        font_size: Passed through to ``visitor_text``.
        rtl_dir: Whether the current direction is right-to-left.
        visitor_text: Optional callback invoked with the collected text on
            each direction switch.

    Returns:
        A tuple ``(text, rtl_dir, widths)``: the updated text, the direction
        active after the last character, and the summed width of all
        characters that were added.

    """
    # "\u0590 - \u08FF \uFB50 - \uFDFF"
    total_width: float = 0.0
    glyphs = [font.character_map.get(char, char) for char in text_operands]
    for glyph in glyphs:
        # A mapped value can hold several characters (ex: habibi.pdf); such a
        # sequence gets the code 1 and is thereby handled as neutral.
        code = ord(glyph) if len(glyph) == 1 else 1
        # fmt: off
        keeps_direction = (
            code <= 0x2F                         # up to "/": punctuation...
            or 0x3A <= code <= 0x40              # ":" to "@"; digits 0x30-0x39 are not included
            or 0x2000 <= code <= 0x206F          # upper punctuation
            or 0x20A0 <= code <= 0x21FF          # skips 0x2070-0x209F (indices/exponents)
            or code in CUSTOM_RTL_SPECIAL_CHARS  # customized
        )
        # fmt: on
        if not keeps_direction:
            is_rtl_char = (
                0x0590 <= code <= 0x08FF
                or 0xFB1D <= code <= 0xFDFF
                or 0xFE70 <= code <= 0xFEFF
                or CUSTOM_RTL_MIN <= code <= CUSTOM_RTL_MAX
            )
            if is_rtl_char != bool(rtl_dir):
                # Direction switch: flush what was collected and start anew.
                rtl_dir = is_rtl_char
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, font_resource, font_size)
                text = ""
        # Right-to-left text grows at the front, left-to-right at the back.
        text = glyph + text if rtl_dir else text + glyph
        total_width += font.space_width if glyph == " " else font.text_width(glyph)
    return text, rtl_dir, total_width