Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 10%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

203 statements  

1import binascii 

2from binascii import Error as BinasciiError 

3from binascii import unhexlify 

4from math import ceil 

5from typing import Any, Union, cast 

6 

7from ._codecs import adobe_glyphs, charset_encoding 

8from ._utils import logger_error, logger_warning 

9from .errors import LimitReachedError 

10from .generic import ( 

11 DecodedStreamObject, 

12 DictionaryObject, 

13 NullObject, 

14 StreamObject, 

15 is_null_or_none, 

16) 

17 

18_predefined_cmap: dict[str, str] = { 

19 "/Identity-H": "utf-16-be", 

20 "/Identity-V": "utf-16-be", 

21 "/GB-EUC-H": "gbk", 

22 "/GB-EUC-V": "gbk", 

23 "/GBpc-EUC-H": "gb2312", 

24 "/GBpc-EUC-V": "gb2312", 

25 "/GBK-EUC-H": "gbk", 

26 "/GBK-EUC-V": "gbk", 

27 "/GBK2K-H": "gb18030", 

28 "/GBK2K-V": "gb18030", 

29 "/ETen-B5-H": "cp950", 

30 "/ETen-B5-V": "cp950", 

31 "/ETenms-B5-H": "cp950", 

32 "/ETenms-B5-V": "cp950", 

33 "/UniCNS-UTF16-H": "utf-16-be", 

34 "/UniCNS-UTF16-V": "utf-16-be", 

35 "/UniGB-UTF16-H": "gb18030", 

36 "/UniGB-UTF16-V": "gb18030", 

37 # Japanese CMaps (PDF Reference 1.7, Appendix H) 

38 "/90ms-RKSJ-H": "cp932", # Shift-JIS (JIS X 0208-1990), horizontal 

39 "/90ms-RKSJ-V": "cp932", # Shift-JIS (JIS X 0208-1990), vertical 

40 "/UniJIS-UTF16-H": "utf-16-be", # Unicode UTF-16BE -> JIS, horizontal 

41 "/UniJIS-UTF16-V": "utf-16-be", # Unicode UTF-16BE -> JIS, vertical 

42 # UCS2 in code 

43} 

44 

45 

def get_encoding(
    ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Return the encoding and the /ToUnicode translation map of a font.

    Args:
        ft: The font dictionary.

    Returns:
        A tuple ``(encoding, map_dict)`` where ``encoding`` is the result of
        ``_parse_encoding`` and ``map_dict`` the mapping produced by
        ``_parse_to_unicode``.

    """
    encoding = _parse_encoding(ft)
    map_dict, int_entry = _parse_to_unicode(ft)

    # PDF ref 1.7 §5.9.1, 1st bullet: when the cmap is not empty the encoding
    # must be discarded. Here this is done by turning every single-byte code
    # known to the cmap into an identity entry of the encoding table.
    # A string encoding is expected to already be an identity translation.
    if isinstance(encoding, dict):
        encoding.update({code: chr(code) for code in int_entry if code <= 255})

    return encoding, map_dict

62 

63 

def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, dict[int, str]]:
    """
    Determine the character encoding declared by a font dictionary.

    Args:
        ft: The font dictionary to inspect.

    Returns:
        Either a string (a Python codec name, or the unrecognised encoding
        name itself) or a table mapping each character code to the text it
        stands for. Glyph names missing from ``adobe_glyphs`` are stored
        unchanged in that table.

    """
    encoding: Union[str, list[str], dict[int, str]] = []
    if "/Encoding" not in ft:
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding

    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # already done : enc = NameObject.unnumber(enc.encode()).decode()
        # for #xx decoding
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        elif "-UCS2-" in enc:
            encoding = "utf-16-be"
        else:
            logger_error("Advanced encoding %(encoding)s not implemented yet", source=__name__, encoding=enc)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding: Any = None
        try:
            base_encoding = enc["/BaseEncoding"]
            encoding = charset_encoding[cast(str, base_encoding)].copy()
        except Exception:
            # Best effort: report the base encoding that could not be used
            # and continue with the standard encoding.
            logger_error(
                "Advanced encoding %(encoding)s not implemented yet",
                source=__name__, encoding=base_encoding
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()

    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences alternates integer codes and glyph names: an integer
        # sets the current code, each following name is assigned to the
        # current code, which is then incremented.
        # The cast only serves the type checker; the value is just iterated.
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(o, int):
                x = o
                continue
            # Codes outside the table are ignored. Rejecting negative codes
            # avoids overwriting slots counted from the end of the list and
            # the IndexError raised for codes below -len(encoding).
            if 0 <= x < len(encoding):
                try:
                    encoding[x] = adobe_glyphs[o]  # type: ignore[index]
                except Exception:
                    # unknown glyph name: keep the name itself
                    encoding[x] = o  # type: ignore[index]
            x += 1

    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding

121 

122 

def _parse_to_unicode(
    ft: DictionaryObject
) -> tuple[dict[Any, Any], list[int]]:
    """
    Build the translation table described by the /ToUnicode entry of a font.

    Args:
        ft: The font dictionary.

    Returns:
        A tuple ``(map_dict, int_entry)``: the translation table and the list
        of its source codes as integers (used to correct the encoding).
        Without /ToUnicode, Type1 fonts are delegated to
        ``_type1_alternative``; any other font yields ``({}, [])``.

    """
    # Translation table; while parsing, key -1 holds the number of bytes
    # of the source codes.
    map_dict: dict[Any, Any] = {}
    int_entry: list[int] = []

    if "/ToUnicode" in ft:
        in_range: bool = False
        in_char: bool = False
        # Carries the state of a bfrange list spread over several lines;
        # cf #1285 for an example of such a file.
        pending_range: Union[None, tuple[int, int]] = None
        for raw_line in prepare_cm(ft).split(b"\n"):
            in_range, in_char, pending_range = process_cm_line(
                raw_line.strip(b" \t"),
                in_range,
                in_char,
                pending_range,
                map_dict,
                int_entry,
            )
        # The -1 key only stored the code length temporarily: do not expose it.
        map_dict.pop(-1, None)
        return map_dict, int_entry

    if ft.get("/Subtype", "") == "/Type1":
        return _type1_alternative(ft, map_dict, int_entry)
    return {}, []

156 

157 

def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the /ToUnicode CMap of a font, normalised for line-based parsing.

    The keywords ``beginbfchar``/``endbfchar``/``beginbfrange``/``endbfrange``
    are isolated on their own lines, ``<<``/``>>`` become ``{``/``}`` lines,
    the angle brackets around hex strings are removed (spaces inside them are
    dropped and an empty ``<>`` becomes the placeholder ``.``), ``[`` and ``]``
    are surrounded by spaces and carriage returns become line feeds.

    Args:
        ft: The font dictionary; it must contain "/ToUnicode".

    Returns:
        The prepared CMap data.

    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # minimal 2-byte range; only codes 0000-0001 are listed explicitly
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()

    # PDFs printed from Word may lack line returns around the keywords,
    # so they are enforced here before the data is split into lines.
    cm = cm.strip()
    for needle, substitute in (
        (b"beginbfchar", b"\nbeginbfchar\n"),
        (b"endbfchar", b"\nendbfchar\n"),
        (b"beginbfrange", b"\nbeginbfrange\n"),
        (b"endbfrange", b"\nendbfrange\n"),
        # text between << and >> is not used, but stays recognisable
        (b"<<", b"\n{\n"),
        (b">>", b"\n}\n"),
    ):
        cm = cm.replace(needle, substitute)

    chunks = []
    for chunk in cm.split(b"<"):
        hex_part, closing, remainder = chunk.partition(b">")
        if closing:
            # An empty string gets a placeholder, see
            # https://github.com/py-pdf/pypdf/issues/1111
            digits = hex_part.replace(b" ", b"") if hex_part else b"."
            chunk = digits + b" " + remainder
        chunks.append(chunk)

    cm = b" ".join(chunks)
    cm = cm.replace(b"[", b" [ ")
    cm = cm.replace(b"]", b" ]\n ")
    cm = cm.replace(b"\r", b"\n")
    return cm

197 

198 

def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, tuple[int, int]],
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
    """
    Handle one line of a prepared CMap and return the updated parser state.

    Args:
        line: The line to handle.
        process_rg: Whether the parser is inside a bfrange section.
        process_char: Whether the parser is inside a bfchar section.
        multiline_rg: State of a bfrange list continued from previous lines.
        map_dict: Translation table, updated in place.
        int_entry: Source codes as integers, updated in place.

    Returns:
        The new ``(process_rg, process_char, multiline_rg)`` triple.

    """
    # Empty lines and comments (starting with %) leave the state untouched.
    if not line or line.startswith(b"%"):
        return process_rg, process_char, multiline_rg

    line = line.replace(b"\t", b" ")

    # Section markers only toggle the matching flag.
    if b"beginbfrange" in line:
        return True, process_char, multiline_rg
    if b"endbfrange" in line:
        return False, process_char, multiline_rg
    if b"beginbfchar" in line:
        return process_rg, True, multiline_rg
    if b"endbfchar" in line:
        return process_rg, False, multiline_rg

    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            logger_warning("Skipping broken line %(line)r: %(error)s", source=__name__, line=line, error=error)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg

226 

227 

228# Usual values should be up to 65_536. 

229MAPPING_DICTIONARY_SIZE_LIMIT = 100_000 

230 

231 

232def _check_mapping_size(size: int) -> None: 

233 if size > MAPPING_DICTIONARY_SIZE_LIMIT: 

234 raise LimitReachedError(f"Maximum /ToUnicode size limit reached: {size} > {MAPPING_DICTIONARY_SIZE_LIMIT}.") 

235 

236 

def parse_bfrange(
    line: bytes,
    map_dict: dict[Any, Any],
    int_entry: list[int],
    multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
    """
    Parse one line of a bfrange section into the translation table.

    Args:
        line: The prepared line (hex strings without angle brackets).
        map_dict: Translation table, updated in place; key -1 stores the
            number of bytes of the source codes.
        int_entry: Source codes as integers, updated in place.
        multiline_rg: ``None`` for a new range, otherwise the
            ``(next source code, last source code)`` state returned for a
            bracketed list which was not closed on the previous line.

    Returns:
        ``None`` once the range is complete, otherwise the
        ``(next source code, last source code)`` state for the next line.

    Raises:
        LimitReachedError: If the mapping would exceed the size limit.

    """
    tokens = [token for token in line.split(b" ") if token]
    total = len(int_entry)
    _check_mapping_size(total)

    if multiline_rg is not None:
        # Continuation of a list: the codes are not in the current line.
        code_length = map_dict[-1]
        code, last_code = multiline_rg
        destinations = tokens
    else:
        code, last_code = int(tokens[0], 16), int(tokens[1], 16)
        code_length = ceil(max(len(tokens[0]), len(tokens[1])) / 2)
        map_dict[-1] = code_length
        destinations = None

    source_format = b"%%0%dX" % (code_length * 2)
    source_codec = "charmap" if code_length == 1 else "utf-16-be"

    if destinations is None:
        target = tokens[2]
        if target != b"[":
            # Case without list: consecutive destinations starting at target.
            destination = int(target, 16)
            destination_format = b"%%0%dX" % max(4, len(target))
            span = max(0, last_code - code + 1)
            _check_mapping_size(total + span)  # This can be checked beforehand.
            for offset in range(span):
                text = unhexlify(destination_format % (destination + offset)).decode(
                    "utf-16-be", "surrogatepass"
                )
                key = unhexlify(source_format % (code + offset)).decode(
                    source_codec, "surrogatepass"
                )
                map_dict[key] = text
                int_entry.append(code + offset)
            return None
        destinations = tokens[3:]

    # Bracketed list: one destination per source code, until "]".
    for token in destinations:
        if token == b"]":
            return None
        total += 1
        _check_mapping_size(total)
        text = unhexlify(token).decode("utf-16-be", "surrogatepass")
        key = unhexlify(source_format % code).decode(source_codec, "surrogatepass")
        map_dict[key] = text
        int_entry.append(code)
        code += 1

    # No closing bracket yet: the list continues on the next line.
    return code, last_code

303 

304 

def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
    """
    Parse one line of a bfchar section into the translation table.

    The line holds ``source destination`` pairs of hex strings (without angle
    brackets); a trailing token without partner is ignored.

    Args:
        line: The prepared line.
        map_dict: Translation table, updated in place; key -1 is set to the
            number of bytes of the first source code of the line.
        int_entry: Source codes as integers, updated in place.

    Raises:
        LimitReachedError: If the mapping would exceed the size limit.

    """
    lst = [x for x in line.split(b" ") if x]
    if not lst:
        return
    new_count = len(lst) // 2
    _check_mapping_size(len(int_entry) + new_count)  # This can be checked beforehand.
    map_dict[-1] = len(lst[0]) // 2
    source_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"

    # Walk over the (source, destination) pairs without re-slicing the list.
    for source, destination in zip(lst[::2], lst[1::2]):
        map_to = ""
        # "." is the placeholder prepare_cm substitutes for an empty string
        if destination != b".":
            try:
                map_to = unhexlify(destination).decode(
                    "charmap" if len(destination) < 4 else "utf-16-be", "surrogatepass"
                )
            except BinasciiError as exception:
                logger_warning(
                    "Got invalid hex string: %(exception)s (%(lst_value)r)",
                    source=__name__,
                    exception=exception,
                    lst_value=destination,
                )
        try:
            key = unhexlify(source).decode(source_codec, "surrogatepass")
        except BinasciiError as exception:
            # A source code which is not valid hex (odd length, the "."
            # placeholder, ...) cannot be mapped: skip this pair only.
            logger_warning(
                "Got invalid hex string: %(exception)s (%(lst_value)r)",
                source=__name__,
                exception=exception,
                lst_value=source,
            )
            continue
        map_dict[key] = map_to
        int_entry.append(int(source, 16))

332 

333 

def _type1_alternative(
    ft: DictionaryObject,
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[dict[Any, Any], list[int]]:
    """
    Build the translation table from the encoding embedded in a Type1 font.

    The clear-text part of the /FontFile stream (everything before
    ``eexec``) is searched for ``dup <code> <glyph> put`` lines following
    ``/Encoding``. Lines which cannot be interpreted are skipped.

    Args:
        ft: The font dictionary.
        map_dict: Translation table, updated in place.
        int_entry: Source codes as integers, updated in place.

    Returns:
        The ``(map_dict, int_entry)`` pair received as arguments.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    sections = txt.split(b"/Encoding")
    if len(sections) < 2:
        # no encoding in the clear part: nothing to extract
        return map_dict, int_entry
    txt = sections[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        if len(words) < 3:
            # code or glyph name missing
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
            # chr() rejects negative and too large codes
            key = chr(i)
        except (ValueError, OverflowError):  # pragma: no cover
            continue
        glyph = words[2]
        try:
            v = adobe_glyphs[glyph.decode()]
        except (KeyError, UnicodeDecodeError):
            # unknown glyph name: only the /uniXXXX form can be resolved
            if not glyph.startswith(b"/uni"):
                continue
            try:
                v = chr(int(glyph[4:], 16))
            except (ValueError, OverflowError):  # pragma: no cover
                continue
        map_dict[key] = v
        int_entry.append(i)
    return map_dict, int_entry