Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 10%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

202 statements  

1import binascii 

2from binascii import Error as BinasciiError 

3from binascii import unhexlify 

4from math import ceil 

5from typing import Any, Union, cast 

6 

7from ._codecs import adobe_glyphs, charset_encoding 

8from ._utils import logger_error, logger_warning 

9from .errors import LimitReachedError 

10from .generic import ( 

11 DecodedStreamObject, 

12 DictionaryObject, 

13 NullObject, 

14 StreamObject, 

15 is_null_or_none, 

16) 

17 

# Predefined CMap names mapped to the Python codec able to decode them.
# Every predefined CMap exists in a horizontal (-H) and a vertical (-V)
# writing-mode variant sharing the same byte-level encoding, so the pairs
# are generated from a single base table.
_predefined_cmap: dict[str, str] = {
    f"{base}-{mode}": codec
    for base, codec in (
        ("/Identity", "utf-16-be"),
        ("/GB-EUC", "gbk"),
        ("/GBpc-EUC", "gb2312"),
        ("/GBK-EUC", "gbk"),
        ("/GBK2K", "gb18030"),
        ("/ETen-B5", "cp950"),
        ("/ETenms-B5", "cp950"),
        ("/UniCNS-UTF16", "utf-16-be"),
        ("/UniGB-UTF16", "gb18030"),
    )
    for mode in ("H", "V")
}
# UCS2 is handled in code (see _parse_encoding)

39 

40 

def get_encoding(
    ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Compute the character encoding and the /ToUnicode map of a font.

    Args:
        ft: Font dictionary.

    Returns:
        Tuple of (encoding, map_dict): *encoding* is either the name of a
        Python codec or a code->char dict, *map_dict* translates raw
        character codes to unicode strings (map_dict[-1] holds the number
        of bytes per code).
    """
    encoding = _parse_encoding(ft)
    map_dict, int_entry = _parse_to_unicode(ft)

    # PDF ref 1.7 §5.9.1, 1st bullet: when the cmap is not empty, the
    # encoding has to be discarded for the codes it covers — realized here
    # as an identity translation for those characters.
    # When encoding is a string it is expected to already be an identity
    # translation, so nothing to do.
    if isinstance(encoding, dict):
        encoding.update({code: chr(code) for code in int_entry if code <= 255})

    return encoding, map_dict

57 

58 

def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, dict[int, str]]:
    """
    Determine the character encoding of a font.

    Args:
        ft: Font dictionary.

    Returns:
        Either the name of a Python codec (e.g. "utf-16-be", "charmap")
        or a dict mapping the 256 single-byte codes to characters.
    """
    encoding: Union[str, list[str], dict[int, str]] = []
    if "/Encoding" not in ft:
        # No explicit encoding: fall back to the base font's known charset
        # when available, otherwise decode bytes 1:1 with "charmap".
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding
    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # /Encoding is a name: a known charset, a predefined CMap, or a
        # UCS2 CMap (decoded as UTF-16-BE).
        try:
            # already done : enc = NameObject.unnumber(enc.encode()).decode()
            # for #xx decoding
            if enc in charset_encoding:
                encoding = charset_encoding[enc].copy()
            elif enc in _predefined_cmap:
                encoding = _predefined_cmap[enc]
            elif "-UCS2-" in enc:
                encoding = "utf-16-be"
            else:
                raise Exception("not found")
        except Exception:
            # Unknown name: keep it as-is so the caller can still report it.
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        try:
            encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
        except Exception:
            logger_error(
                f"Advanced encoding {encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        # Encoding dictionary without /BaseEncoding (or a null object).
        encoding = charset_encoding["/StandardEncoding"].copy()
    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences alternates an int (the next code to patch) with
        # glyph names replacing the charset entries from that code on.
        # At this point encoding is a 256-entry list (copied above).
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o, str):
                try:
                    if x < len(encoding):
                        encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    # Glyph name unknown to adobe_glyphs: store the raw name.
                    encoding[x] = o  # type: ignore
                x += 1
    if isinstance(encoding, list):
        # Normalize list form into the code->char dict returned to callers.
        encoding = dict(zip(range(256), encoding))
    return encoding

116 

117 

def _parse_to_unicode(
    ft: DictionaryObject
) -> tuple[dict[Any, Any], list[int]]:
    """
    Extract the /ToUnicode translation map of a font.

    Args:
        ft: Font dictionary.

    Returns:
        Tuple of (map_dict, int_entry): *map_dict* translates character
        codes to unicode strings, with map_dict[-1] holding the number of
        bytes per code; *int_entry* lists the mapped codes as ints.
    """
    # Stores all translation codes; map_dict[-1] will hold the number of
    # bytes to convert per character code.
    map_dict: dict[Any, Any] = {}
    # List of the cmap keys as ints, used later to correct the encoding.
    int_entry: list[int] = []

    if "/ToUnicode" not in ft:
        # Type1 fonts may still expose an encoding in their font program.
        if ft.get("/Subtype", "") == "/Type1":
            return _type1_alternative(ft, map_dict, int_entry)
        return {}, []

    # Parser state: (inside-bfrange, inside-bfchar, open multi-line range),
    # the last element being (current_char, last_char) for a bfrange whose
    # destination list spans several lines; cf #1285 for an example file.
    state: tuple[bool, bool, Union[None, tuple[int, int]]] = (False, False, None)
    for raw_line in prepare_cm(ft).split(b"\n"):
        state = process_cm_line(
            raw_line.strip(b" \t"), *state, map_dict, int_entry
        )

    return map_dict, int_entry

149 

150 

def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Extract and normalize the /ToUnicode CMap of *ft* so it can be parsed
    line by line: section keywords on their own lines and blank-free hex
    tokens each followed by a single space.

    Args:
        ft: Font dictionary carrying the /ToUnicode entry.

    Returns:
        The normalized CMap content as bytes.
    """
    to_unicode = ft["/ToUnicode"]
    cm: bytes
    if isinstance(to_unicode, StreamObject):
        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
    else:  # if (to_unicode is None) or cast(str, to_unicode).startswith("/Identity"):
        # No stream available: process the full range 0000-FFFF as identity.
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()
    # Re-insert the line breaks that PDFs printed to PDF from Word tend to
    # drop, so that every section keyword sits on its own line.
    cm = (
        cm.strip()
        .replace(b"beginbfchar", b"\nbeginbfchar\n")
        .replace(b"endbfchar", b"\nendbfchar\n")
        .replace(b"beginbfrange", b"\nbeginbfrange\n")
        .replace(b"endbfrange", b"\nendbfrange\n")
        .replace(b"<<", b"\n{\n")  # text between << and >> not used but
        .replace(b">>", b"\n}\n")  # some solution to find it back
    )
    segments = cm.split(b"<")
    for idx, segment in enumerate(segments):
        close = segment.find(b">")
        if close < 0:
            continue
        if close == 0:
            # String is empty: stash a placeholder here (consumed later by
            # parse_bfchar); see https://github.com/py-pdf/pypdf/issues/1111
            token = b"."
        else:
            # Remove blanks inside the hex token itself.
            token = segment[:close].replace(b" ", b"")
        segments[idx] = token + b" " + segment[close + 1:]
    return (
        (b" ".join(segments))
        .replace(b"[", b" [ ")
        .replace(b"]", b" ]\n ")
        .replace(b"\r", b"\n")
    )

190 

191 

def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, tuple[int, int]],
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
    """
    Process a single normalized CMap line, updating map_dict / int_entry.

    Args:
        line: One line produced by prepare_cm, stripped of blanks.
        process_rg: True while inside a bfrange section.
        process_char: True while inside a bfchar section.
        multiline_rg: Open multi-line range state, or None.
        map_dict: Translation map being filled.
        int_entry: List of mapped codes being filled.

    Returns:
        The updated (process_rg, process_char, multiline_rg) state.
    """
    # Ignore empty lines and comments (37 == ord("%")).
    if line == b"" or line[0] == 37:
        return process_rg, process_char, multiline_rg
    line = line.replace(b"\t", b" ")
    # Section delimiters toggle the parser state.
    if b"beginbfrange" in line:
        return True, process_char, multiline_rg
    if b"endbfrange" in line:
        return False, process_char, multiline_rg
    if b"beginbfchar" in line:
        return process_rg, True, multiline_rg
    if b"endbfchar" in line:
        return process_rg, False, multiline_rg
    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            # Invalid hex content: keep going with the next line.
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg

219 

220 

# Usual values should be up to 65_536; anything far beyond that points at a
# corrupt or maliciously crafted /ToUnicode map.
MAPPING_DICTIONARY_SIZE_LIMIT = 100_000


def _check_mapping_size(size: int) -> None:
    """Raise LimitReachedError when *size* exceeds the /ToUnicode limit."""
    if size <= MAPPING_DICTIONARY_SIZE_LIMIT:
        return
    raise LimitReachedError(
        f"Maximum /ToUnicode size limit reached: {size} > {MAPPING_DICTIONARY_SIZE_LIMIT}."
    )

228 

229 

def parse_bfrange(
    line: bytes,
    map_dict: dict[Any, Any],
    int_entry: list[int],
    multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
    """
    Parse one line of a bfrange section into map_dict / int_entry.

    Args:
        line: Normalized line (hex tokens already cleaned by prepare_cm).
        map_dict: Translation map being filled; map_dict[-1] holds the
            number of bytes per source character code.
        int_entry: List of the source codes added, as ints.
        multiline_rg: (current_char, last_char) when a bracketed range
            started on a previous line is still open, else None.

    Returns:
        None when the range is complete on this line, otherwise the
        (current_char, last_char) state to pass back with the next line
        (cf #1285 for an example of such a file).

    Raises:
        LimitReachedError: when the map grows beyond
            MAPPING_DICTIONARY_SIZE_LIMIT.
        binascii.Error: on invalid hex content (caught by the caller).
    """
    lst = [x for x in line.split(b" ") if x]
    closure_found = False
    entry_count = len(int_entry)
    _check_mapping_size(entry_count)
    if multiline_rg is not None:
        # Continuation of a bracketed destination list: the <start> <end>
        # bounds are not repeated on the current line.
        fmt = b"%%0%dX" % (map_dict[-1] * 2)  # e.g. b"%04X" for 2-byte codes
        a = multiline_rg[0]  # a, b not in the current line
        b = multiline_rg[1]
        for sq in lst:
            if sq == b"]":
                closure_found = True
                break
            entry_count += 1
            _check_mapping_size(entry_count)
            # 1-byte source codes decode with charmap, wider as UTF-16-BE.
            map_dict[
                unhexlify(fmt % a).decode(
                    "charmap" if map_dict[-1] == 1 else "utf-16-be",
                    "surrogatepass",
                )
            ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
            int_entry.append(a)
            a += 1
    else:
        a = int(lst[0], 16)
        b = int(lst[1], 16)
        # Bytes per source code, deduced from the widest hex operand and
        # published through map_dict[-1] for the text-extraction stage.
        nbi = max(len(lst[0]), len(lst[1]))
        map_dict[-1] = ceil(nbi / 2)
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        if lst[2] == b"[":
            # Form: <start> <end> [ <dst> <dst> ... ] — one destination per
            # source code; the list may spill over to the next line(s).
            for sq in lst[3:]:
                if sq == b"]":
                    closure_found = True
                    break
                entry_count += 1
                _check_mapping_size(entry_count)
                map_dict[
                    unhexlify(fmt % a).decode(
                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
                        "surrogatepass",
                    )
                ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
                int_entry.append(a)
                a += 1
        else:  # case without list
            # Form: <start> <end> <dst> — destinations are the consecutive
            # values starting at <dst>.
            c = int(lst[2], 16)
            fmt2 = b"%%0%dX" % max(4, len(lst[2]))
            closure_found = True
            range_size = max(0, b - a + 1)
            _check_mapping_size(entry_count + range_size)  # This can be checked beforehand.
            while a <= b:
                map_dict[
                    unhexlify(fmt % a).decode(
                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
                        "surrogatepass",
                    )
                ] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
                int_entry.append(a)
                a += 1
                c += 1
    return None if closure_found else (a, b)

296 

297 

def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
    """
    Parse one line of a bfchar section into map_dict / int_entry.

    Args:
        line: Normalized line of alternating <src> <dst> hex tokens.
        map_dict: Translation map being filled; map_dict[-1] is set to the
            number of bytes per source code.
        int_entry: List of the source codes added, as ints.

    Raises:
        LimitReachedError: when the map grows beyond
            MAPPING_DICTIONARY_SIZE_LIMIT.
    """
    tokens = [t for t in line.split(b" ") if t]
    _check_mapping_size(len(int_entry) + len(tokens) // 2)
    # Bytes per source code, taken from the first hex token of the line.
    map_dict[-1] = len(tokens[0]) // 2
    # Consume the tokens pairwise; a trailing unpaired token is ignored.
    for src, dst in zip(tokens[::2], tokens[1::2]):
        translated = ""
        # The b"." placeholder (inserted by prepare_cm) means empty string.
        if dst != b".":
            try:
                translated = unhexlify(dst).decode(
                    "charmap" if len(dst) < 4 else "utf-16-be", "surrogatepass"
                )
            except BinasciiError as exception:
                logger_warning(f"Got invalid hex string: {exception!s} ({dst!r})", __name__)
        map_dict[
            unhexlify(src).decode(
                "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
            )
        ] = translated
        int_entry.append(int(src, 16))

320 

321 

def _type1_alternative(
    ft: DictionaryObject,
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[dict[Any, Any], list[int]]:
    """
    Build a /ToUnicode-like mapping for a Type1 font from the clear-text
    /Encoding array embedded in its font program (/FontFile).

    Args:
        ft: Font dictionary (expected /Subtype /Type1).
        map_dict: Translation map being filled (char -> unicode string).
        int_entry: List of character codes, filled alongside map_dict.

    Returns:
        The (map_dict, int_entry) pair, possibly unchanged when no font
        file or no embedded encoding is available.
    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    if b"/Encoding" not in txt:
        # Font program relies on a predefined encoding (e.g.
        # StandardEncoding): nothing to extract. Without this guard the
        # split below raised IndexError.
        return map_dict, int_entry
    txt = txt.split(b"/Encoding")[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        # Expected shape: dup <code> /<glyphname> put
        # Guard against truncated lines (previously raised IndexError).
        if len(words) < 3 or (len(words) > 3 and words[3] != b"put"):
            continue
        try:
            i = int(words[1])
        except ValueError:  # pragma: no cover
            continue
        try:
            v = adobe_glyphs[words[2].decode()]
        except KeyError:
            # /uniXXXX glyph names carry the code point directly.
            if words[2].startswith(b"/uni"):
                try:
                    v = chr(int(words[2][4:], 16))
                except ValueError:  # pragma: no cover
                    continue
            else:
                continue
        map_dict[chr(i)] = v
        int_entry.append(i)
    return map_dict, int_entry
358 return map_dict, int_entry