Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 9%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

187 statements  

1import binascii 

2from binascii import Error as BinasciiError 

3from binascii import unhexlify 

4from math import ceil 

5from typing import Any, Union, cast 

6 

7from ._codecs import adobe_glyphs, charset_encoding 

8from ._utils import logger_error, logger_warning 

9from .generic import ( 

10 DecodedStreamObject, 

11 DictionaryObject, 

12 NullObject, 

13 StreamObject, 

14 is_null_or_none, 

15) 

16 

# Python codec used to decode text strings for the predefined CMaps that can
# be named directly in a font's /Encoding entry (ISO 32000-1, Table 118).
# Every CMap is listed with both its "-H" and "-V" name; both share one codec.
_predefined_cmap: dict[str, str] = {
    "/Identity-H": "utf-16-be",
    "/Identity-V": "utf-16-be",
    "/GB-EUC-H": "gbk",
    "/GB-EUC-V": "gbk",
    "/GBpc-EUC-H": "gb2312",
    "/GBpc-EUC-V": "gb2312",
    "/GBK-EUC-H": "gbk",
    "/GBK-EUC-V": "gbk",
    "/GBK2K-H": "gb18030",
    "/GBK2K-V": "gb18030",
    "/ETen-B5-H": "cp950",
    "/ETen-B5-V": "cp950",
    "/ETenms-B5-H": "cp950",
    "/ETenms-B5-V": "cp950",
    "/UniCNS-UTF16-H": "utf-16-be",
    "/UniCNS-UTF16-V": "utf-16-be",
    # UniGB-UTF16 strings are UTF-16BE encoded; GB 18030-2000 is only the
    # character repertoire covered by the CMap, not the byte encoding.
    "/UniGB-UTF16-H": "utf-16-be",
    "/UniGB-UTF16-V": "utf-16-be",
    # UCS2 in code
}

38 

39 

def get_encoding(
    ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Build the decoding information for the font dictionary ``ft``.

    Args:
        ft: The font dictionary to inspect.

    Returns:
        A pair ``(encoding, map_dict)``: the result of ``_parse_encoding``
        (a codec name or a code -> text table) and the mapping produced by
        ``_parse_to_unicode``.

    """
    encoding = _parse_encoding(ft)
    map_dict, cmap_codes = _parse_to_unicode(ft)

    # A codec name is expected to be an identity translation already.
    if not isinstance(encoding, dict):
        return encoding, map_dict

    # PDF ref 1.7 §5.9.1, 1st bullet: when the cmap is not empty the encoding
    # should be discarded. Here this is done by turning the encoding into the
    # identity for every single-byte code the cmap defines.
    encoding.update({code: chr(code) for code in cmap_codes if code <= 255})
    return encoding, map_dict

56 

57 

def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, dict[int, str]]:
    """
    Derive the character encoding of the font dictionary ``ft``.

    Args:
        ft: The font dictionary; its "/Encoding" and "/BaseFont" entries are
            read.

    Returns:
        Either a codec name (``str``) or a table mapping character codes to
        text, built from an entry of ``charset_encoding`` with any
        "/Differences" applied.

    """
    encoding: Union[str, list[str], dict[int, str]] = []
    if "/Encoding" not in ft:
        # Without an explicit encoding, fall back to the table registered for
        # the base font, if any.
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding

    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # already done : enc = NameObject.unnumber(enc.encode()).decode()
        # for #xx decoding
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        elif "-UCS2-" in enc:
            encoding = "utf-16-be"
        else:
            # Unknown name: report it and hand it back unchanged.
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding: Any = None
        try:
            base_encoding = enc["/BaseEncoding"]
            encoding = charset_encoding[cast(str, base_encoding)].copy()
        except Exception:
            # Report the base encoding that could not be resolved (not the
            # still-empty local table) and fall back to the standard one.
            logger_error(
                f"Advanced encoding {base_encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()

    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # "/Differences" alternates a starting code (int) with the glyph names
        # assigned to consecutive codes from there on.
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(o, int):
                x = o
                continue
            # Codes outside the table are ignored; a negative code must not
            # wrap around and overwrite entries counted from the end.
            if 0 <= x < len(encoding):
                try:
                    encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    # Unknown glyph name: keep the name itself.
                    encoding[x] = o  # type: ignore
            x += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding

115 

116 

def _parse_to_unicode(
    ft: DictionaryObject
) -> tuple[dict[Any, Any], list[int]]:
    """
    Read the "/ToUnicode" CMap of the font dictionary ``ft``.

    Args:
        ft: The font dictionary to inspect.

    Returns:
        A pair ``(map_dict, int_entry)``. ``map_dict`` holds all translation
        codes, with ``map_dict[-1]`` being the number of bytes to convert;
        ``int_entry`` lists the cmap keys as integers, used to correct the
        encoding.

    """
    map_dict: dict[Any, Any] = {}
    int_entry: list[int] = []

    if "/ToUnicode" not in ft:
        if ft.get("/Subtype", "") != "/Type1":
            return {}, []
        # Type1 fonts may still describe their encoding in the font program.
        return _type1_alternative(ft, map_dict, int_entry)

    # Parser state threaded through the CMap, one line at a time.
    in_bfrange: bool = False
    in_bfchar: bool = False
    # tuple = (current_char, remaining size) ; cf #1285 for example of file
    pending_range: Union[None, tuple[int, int]] = None

    for raw_line in prepare_cm(ft).split(b"\n"):
        in_bfrange, in_bfchar, pending_range = process_cm_line(
            raw_line.strip(b" \t"),
            in_bfrange,
            in_bfchar,
            pending_range,
            map_dict,
            int_entry,
        )

    return map_dict, int_entry

148 

149 

def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the "/ToUnicode" CMap of ``ft`` normalised for line-wise parsing.

    The section keywords are isolated on their own lines, the angle brackets
    around hex strings are removed (an empty ``<>`` becomes ``.``) and square
    brackets are surrounded by spaces.

    Args:
        ft: The font dictionary; must contain "/ToUnicode".

    Returns:
        The rewritten CMap as bytes.

    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, tu).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # Stand-in CMap for a non-stream entry. The original note here says
        # the full range 0000-FFFF will be processed.
        # NOTE(review): the literal range below is <0000>-<0001>; confirm.
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()

    # PDFs printed to PDF from Word may lack line breaks, so every section
    # keyword is forced onto a line of its own before splitting.
    cm = cm.strip()
    for keyword in (b"beginbfchar", b"endbfchar", b"beginbfrange", b"endbfrange"):
        cm = cm.replace(keyword, b"\n" + keyword + b"\n")
    # Text between << and >> is not used, but keep a way to find it back.
    cm = cm.replace(b"<<", b"\n{\n").replace(b">>", b"\n}\n")

    # Rewrite each "<hex digits>" as "hexdigits " so tokens split on spaces.
    pieces = cm.split(b"<")
    for index, piece in enumerate(pieces):
        closing = piece.find(b">")
        if closing < 0:
            continue
        if closing == 0:
            # Empty string: stash a placeholder that parse_bfchar recognises,
            # see https://github.com/py-pdf/pypdf/issues/1111
            hex_digits = b"."
        else:
            hex_digits = piece[:closing].replace(b" ", b"")
        pieces[index] = hex_digits + b" " + piece[closing + 1 :]

    joined = b" ".join(pieces)
    return (
        joined.replace(b"[", b" [ ")
        .replace(b"]", b" ]\n ")
        .replace(b"\r", b"\n")
    )

189 

190 

def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, tuple[int, int]],
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
    """
    Handle one line of a prepared CMap and return the updated parser state.

    Args:
        line: The line to handle.
        process_rg: Whether the parser is inside a bfrange section.
        process_char: Whether the parser is inside a bfchar section.
        multiline_rg: State of a bfrange list continuing over several lines,
            or None.
        map_dict: Mapping updated in place by the section parsers.
        int_entry: List of integer codes extended in place by the parsers.

    Returns:
        The new ``(process_rg, process_char, multiline_rg)`` triple.

    """
    # Blank lines and comments (first byte 37 = "%") leave the state untouched.
    if line == b"" or line[0] == 37:
        return process_rg, process_char, multiline_rg

    line = line.replace(b"\t", b" ")

    # Section markers only toggle a flag; the first marker found wins.
    section_markers = (
        (b"beginbfrange", True, True),
        (b"endbfrange", True, False),
        (b"beginbfchar", False, True),
        (b"endbfchar", False, False),
    )
    for marker, is_range_marker, active in section_markers:
        if marker not in line:
            continue
        if is_range_marker:
            process_rg = active
        else:
            process_char = active
        return process_rg, process_char, multiline_rg

    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg

218 

219 

def _bfrange_parse_int(token: bytes) -> int:
    """Parse a hexadecimal token, reporting failure as ``binascii.Error``."""
    try:
        return int(token, 16)
    except ValueError as exc:
        raise BinasciiError(f"invalid hexadecimal number {token!r}") from exc


def _bfrange_decode_hex(hex_digits: bytes, codec: str) -> str:
    """Decode hex digits to text, reporting failure as ``binascii.Error``."""
    try:
        return unhexlify(hex_digits).decode(codec, "surrogatepass")
    except UnicodeDecodeError as exc:
        # E.g. an odd number of bytes handed to the UTF-16BE decoder.
        raise BinasciiError(
            f"cannot decode {hex_digits!r} as {codec}: {exc.reason}"
        ) from exc


def parse_bfrange(
    line: bytes,
    map_dict: dict[Any, Any],
    int_entry: list[int],
    multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
    """
    Parse one line of a bfrange section into ``map_dict`` and ``int_entry``.

    The line holds space separated hexadecimal tokens without angle brackets,
    in one of three shapes:

    * ``first last target``: consecutive codes map to consecutive targets;
    * ``first last [ target ...``: each code gets its own target; the list
      may continue on following lines until ``]``;
    * ``target ... ]``: continuation of such a list (``multiline_rg`` set).

    Args:
        line: The line to parse.
        map_dict: Mapping from decoded source code to target text, updated in
            place. ``map_dict[-1]`` stores the byte width of the source codes.
        int_entry: Receives the integer value of every source code added.
        multiline_rg: ``(next_code, last_code)`` of a list left open by a
            previous line, or None when the line starts a new entry.

    Returns:
        None when the entry is complete, otherwise the ``(next_code,
        last_code)`` pair to pass in with the next line.

    Raises:
        binascii.Error: If the line is malformed: too few tokens, a token
            that is not valid hexadecimal, or bytes that cannot be decoded.
            Entries handled before the failure remain in ``map_dict``.

    """
    tokens = [token for token in line.split(b" ") if token]

    if multiline_rg is not None:
        # Both range bounds were read on an earlier line.
        code, last_code = multiline_rg
        targets = tokens
        is_list = True
    else:
        if len(tokens) < 3:
            raise BinasciiError(
                f"bfrange entry needs at least 3 tokens, got {len(tokens)}"
            )
        code = _bfrange_parse_int(tokens[0])
        last_code = _bfrange_parse_int(tokens[1])
        # Width in bytes of the source codes, taken from the longer bound.
        map_dict[-1] = ceil(max(len(tokens[0]), len(tokens[1])) / 2)
        is_list = tokens[2] == b"["
        targets = tokens[3:] if is_list else tokens[2:3]

    code_fmt = b"%%0%dX" % (map_dict[-1] * 2)
    code_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"

    if is_list:
        for target in targets:
            if target == b"]":
                return None
            text = _bfrange_decode_hex(target, "utf-16-be")
            map_dict[_bfrange_decode_hex(code_fmt % code, code_codec)] = text
            int_entry.append(code)
            code += 1
        # No closing bracket yet: the list continues on the next line.
        return code, last_code

    # Case without list: targets are consecutive, starting at the given one.
    # NOTE(review): the number of entries created here is bounded only by the
    # range stated in the file.
    target_code = _bfrange_parse_int(targets[0])
    target_fmt = b"%%0%dX" % max(4, len(targets[0]))
    while code <= last_code:
        text = _bfrange_decode_hex(target_fmt % target_code, "utf-16-be")
        map_dict[_bfrange_decode_hex(code_fmt % code, code_codec)] = text
        int_entry.append(code)
        code += 1
        target_code += 1
    return None

278 

279 

def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
    """
    Parse one line of a bfchar section into ``map_dict`` and ``int_entry``.

    The line is read as space separated ``source target`` pairs of
    hexadecimal tokens without angle brackets; a trailing unpaired token is
    ignored. A target of ``.`` is the placeholder for an empty string.

    Args:
        line: The line to parse.
        map_dict: Mapping from decoded source code to target text, updated in
            place. ``map_dict[-1]`` is set to the byte width of the first
            source code of the line.
        int_entry: Receives the integer value of every source code added.

    """
    tokens = [token for token in line.split(b" ") if token]
    if not tokens:
        return
    map_dict[-1] = len(tokens[0]) // 2
    code_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"

    for source, target in zip(tokens[0::2], tokens[1::2]):
        map_to = ""
        # placeholder means empty string
        if target != b".":
            try:
                # Fewer than four hex digits cannot be UTF-16BE, so such a
                # target is decoded byte-wise.
                map_to = unhexlify(target).decode(
                    "charmap" if len(target) < 4 else "utf-16-be", "surrogatepass"
                )
            except (BinasciiError, UnicodeDecodeError) as exception:
                # A broken target maps to the empty string.
                logger_warning(f"Got invalid hex string: {exception!s} ({target!r})", __name__)
        try:
            map_from = unhexlify(source).decode(code_codec, "surrogatepass")
            code = int(source, 16)
        except ValueError as exception:
            # binascii.Error and UnicodeDecodeError are ValueError subclasses.
            # A broken source code only invalidates its own pair.
            logger_warning(
                f"Skipping bfchar entry with invalid source code {source!r}: {exception!s}",
                __name__,
            )
            continue
        map_dict[map_from] = map_to
        int_entry.append(code)

300 

301 

def _type1_alternative(
    ft: DictionaryObject,
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[dict[Any, Any], list[int]]:
    """
    Fill the mapping from the clear-text encoding of an embedded font file.

    The "/FontFile" stream of the font descriptor is scanned for
    ``dup <code> <glyph name> put`` lines following "/Encoding". Each glyph
    name is resolved through ``adobe_glyphs`` or, for ``/uniXXXX`` names,
    from the hexadecimal code point.

    Args:
        ft: The font dictionary.
        map_dict: Mapping from ``chr(code)`` to text, updated in place.
        int_entry: Receives every character code added to ``map_dict``.

    Returns:
        The same ``map_dict`` and ``int_entry`` objects. They are returned
        unchanged when there is no usable font file or encoding section.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    sections = txt.split(b"/Encoding")
    if len(sections) < 2:
        # No encoding in the clear part: nothing to learn from this font.
        return map_dict, int_entry
    txt = sections[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        if len(words) < 3:
            # Truncated entry: the code or the glyph name is missing.
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
            key = chr(i)
        except (ValueError, OverflowError):
            # Not a number, or not usable as a character code.
            continue
        try:
            v = adobe_glyphs[words[2].decode()]
        except UnicodeDecodeError:
            continue
        except KeyError:
            if not words[2].startswith(b"/uni"):
                continue
            try:
                v = chr(int(words[2][4:], 16))
            except (ValueError, OverflowError):
                continue
        map_dict[key] = v
        int_entry.append(i)
    return map_dict, int_entry