Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 9%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

278 statements  

1import binascii 

2from binascii import unhexlify 

3from math import ceil 

4from typing import Any, Dict, List, Tuple, Union, cast 

5 

6from ._codecs import adobe_glyphs, charset_encoding 

7from ._utils import logger_error, logger_warning 

8from .generic import ( 

9 ArrayObject, 

10 DecodedStreamObject, 

11 DictionaryObject, 

12 NullObject, 

13 StreamObject, 

14 is_null_or_none, 

15) 

16 

17 

# code freely inspired from @twiggy ; see #711
def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any], DictionaryObject]:
    """
    Look up a font by name and gather what is needed to decode its text.

    Args:
        font_name: Key of the font inside the ``/Font`` resource dictionary.
        space_width: Fallback space width, forwarded unchanged to
            :func:`build_char_map_from_dict`.
        obj: Object exposing ``/Resources`` -> ``/Font`` (an XObject or a Page).

    Returns:
        A 5-tuple: the four values produced by
        :func:`build_char_map_from_dict` (font sub-type, half space width,
        encoding, character map) followed by the font dictionary itself.

    """
    fonts = obj["/Resources"]["/Font"]  # type: ignore
    font_dict: DictionaryObject = fonts[font_name]  # type: ignore
    subtype, half_space, encoding, char_map = build_char_map_from_dict(
        space_width, font_dict
    )
    return subtype, half_space, encoding, char_map, font_dict

40 

41 

def build_char_map_from_dict(
    space_width: float, ft: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Gather decoding information from a font dictionary.

    Args:
        space_width: Fallback space width; it is doubled and handed to
            :func:`build_font_width_map` as the default font width.
        ft: Font dictionary.

    Returns:
        A 4-tuple ``(font sub-type, half space width, encoding, character map)``.
        The half space width is the value :func:`compute_space_width` finds
        for the font's space character, divided by two.

    """
    subtype = cast(str, ft["/Subtype"].get_object())
    encoding, char_map = get_encoding(ft)

    # Key under which this font stores the space character.
    space_key = get_actual_str_key(" ", encoding, char_map)
    width_map = build_font_width_map(ft, space_width * 2.0)
    space = compute_space_width(width_map, space_key)

    return subtype, space / 2.0, encoding, char_map

72 

73 

# Fallback used when data is missing (e.g. the font definition is absent):
# sub-type "Unknown", a 9999 width, every one of the 256 single-byte codes
# mapped to the replacement character, and an empty character map.
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
    "Unknown",
    9999,
    {code: "�" for code in range(256)},
    {},
)

81 

82 

83_predefined_cmap: Dict[str, str] = { 

84 "/Identity-H": "utf-16-be", 

85 "/Identity-V": "utf-16-be", 

86 "/GB-EUC-H": "gbk", 

87 "/GB-EUC-V": "gbk", 

88 "/GBpc-EUC-H": "gb2312", 

89 "/GBpc-EUC-V": "gb2312", 

90 "/GBK-EUC-H": "gbk", 

91 "/GBK-EUC-V": "gbk", 

92 "/GBK2K-H": "gb18030", 

93 "/GBK2K-V": "gb18030", 

94 "/ETen-B5-H": "cp950", 

95 "/ETen-B5-V": "cp950", 

96 "/ETenms-B5-H": "cp950", 

97 "/ETenms-B5-V": "cp950", 

98 "/UniCNS-UTF16-H": "utf-16-be", 

99 "/UniCNS-UTF16-V": "utf-16-be", 

100 "/UniGB-UTF16-H": "gb18030", 

101 "/UniGB-UTF16-V": "gb18030", 

102 # UCS2 in code 

103} 

104 

105# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz 

106_default_fonts_space_width: Dict[str, int] = { 

107 "/Courier": 600, 

108 "/Courier-Bold": 600, 

109 "/Courier-BoldOblique": 600, 

110 "/Courier-Oblique": 600, 

111 "/Helvetica": 278, 

112 "/Helvetica-Bold": 278, 

113 "/Helvetica-BoldOblique": 278, 

114 "/Helvetica-Oblique": 278, 

115 "/Helvetica-Narrow": 228, 

116 "/Helvetica-NarrowBold": 228, 

117 "/Helvetica-NarrowBoldOblique": 228, 

118 "/Helvetica-NarrowOblique": 228, 

119 "/Times-Roman": 250, 

120 "/Times-Bold": 250, 

121 "/Times-BoldItalic": 250, 

122 "/Times-Italic": 250, 

123 "/Symbol": 250, 

124 "/ZapfDingbats": 278, 

125} 

126 

127 

def get_encoding(
    ft: DictionaryObject
) -> Tuple[Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Return the encoding and the ToUnicode character map of a font.

    Args:
        ft: Font dictionary.

    Returns:
        ``(encoding, map_dict)`` where ``encoding`` comes from
        :func:`_parse_encoding` and ``map_dict`` from :func:`_parse_to_unicode`.

    """
    encoding = _parse_encoding(ft)
    char_map, cmap_codes = _parse_to_unicode(ft)

    # Apply rule from PDF ref 1.7 §5.9.1, 1st bullet:
    #   if cmap not empty encoding should be discarded
    #   (here transformed into identity for those characters)
    # If encoding is a string, it is expected to be an identity translation.
    if isinstance(encoding, dict):
        encoding.update(
            {code: chr(code) for code in cmap_codes if code <= 255}
        )

    return encoding, char_map

144 

145 

def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, Dict[int, str]]:
    """
    Derive the character encoding of a font from its dictionary.

    Args:
        ft: Font dictionary.

    Returns:
        Either a codec name (``str``) or a mapping from character code to
        string. Without ``/Encoding`` the result is the ``charset_encoding``
        table for ``/BaseFont`` when there is one, else ``"charmap"``.
        An unsupported encoding name is logged and returned unchanged.

    """
    encoding: Union[str, List[str], Dict[int, str]]
    if "/Encoding" not in ft:
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            return dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        return "charmap"

    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # already done : enc = NameObject.unnumber(enc.encode()).decode()
        # for #xx decoding
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        elif "-UCS2-" in enc:
            encoding = "utf-16-be"
        else:
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding: Any = None
        try:
            base_encoding = enc["/BaseEncoding"]
            encoding = charset_encoding[cast(str, base_encoding)].copy()
        except Exception:
            # Report the base encoding that could not be resolved, then fall
            # back to the standard encoding so extraction can continue.
            logger_error(
                f"Advanced encoding {base_encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()

    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences alternates integers (the next code to assign) with
        # glyph names assigned to consecutive codes from there.
        code: int = 0
        item: Union[int, str]
        for item in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(item, int):
                code = item
                continue
            if code < len(encoding):
                try:
                    glyph = adobe_glyphs[item]
                except Exception:
                    # unknown glyph name: keep the name itself
                    glyph = item
                encoding[code] = glyph  # type: ignore
            code += 1

    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding

203 

204 

def _parse_to_unicode(
    ft: DictionaryObject
) -> Tuple[Dict[Any, Any], List[int]]:
    """
    Build the character map described by the font's ``/ToUnicode`` entry.

    Args:
        ft: Font dictionary.

    Returns:
        ``(map_dict, int_entry)``. ``map_dict`` stores all translation codes,
        and ``map_dict[-1]`` holds the number of bytes to convert.
        ``int_entry`` lists the cmap keys as integers, used to correct the
        encoding. Without ``/ToUnicode`` both are empty unless the font
        sub-type is ``/Type1``, in which case :func:`_type1_alternative`
        provides them.

    """
    char_map: Dict[Any, Any] = {}
    codes: List[int] = []

    if "/ToUnicode" not in ft:
        if ft.get("/Subtype", "") == "/Type1":
            return _type1_alternative(ft, char_map, codes)
        return {}, []

    in_bfrange: bool = False
    in_bfchar: bool = False
    # tuple = (current_char, remaining size) ; cf #1285 for example of file
    pending_range: Union[None, Tuple[int, int]] = None
    for raw_line in prepare_cm(ft).split(b"\n"):
        in_bfrange, in_bfchar, pending_range = process_cm_line(
            raw_line.strip(b" \t"),
            in_bfrange,
            in_bfchar,
            pending_range,
            char_map,
            codes,
        )

    return char_map, codes

236 

237 

def get_actual_str_key(
    value_char: str, encoding: Union[str, Dict[int, str]], map_dict: Dict[Any, Any]
) -> str:
    """
    Find the key under which a character is stored for a font.

    Args:
        value_char: Character to look for among the mapped values.
        encoding: Encoding of the font; searched when it is a dictionary.
        map_dict: Character map; searched when ``encoding`` is not a dictionary.

    Returns:
        For a dictionary encoding, ``chr(code)`` of the last code mapped to
        ``value_char``; otherwise the last ``map_dict`` key mapped to it.
        ``value_char`` itself when nothing matches.

    """
    found = value_char
    if isinstance(encoding, dict):
        for code, mapped in encoding.items():
            if mapped == value_char:
                found = chr(code)
    else:
        for key, mapped in map_dict.items():
            if mapped == value_char:
                found = key
    return found

247 

248 

def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the font's ToUnicode CMap as bytes, normalized for line parsing.

    Section keywords are isolated on their own lines, ``<<`` / ``>>`` become
    ``{`` / ``}`` lines, the angle brackets around hex strings are dropped
    (spaces inside them removed, an empty ``<>`` replaced by ``.``), square
    brackets are surrounded by spaces and ``\\r`` is turned into ``\\n``.

    Args:
        ft: Font dictionary containing ``/ToUnicode``.

    Returns:
        The normalized CMap data.

    """
    to_unicode = ft["/ToUnicode"]
    cm: bytes
    if isinstance(to_unicode, StreamObject):
        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # fallback CMap holding a single bfrange entry
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()

    # we need to prepare cm before due to missing return line in pdf printed
    # to pdf from word
    cm = cm.strip()
    for keyword in (b"beginbfchar", b"endbfchar", b"beginbfrange", b"endbfrange"):
        cm = cm.replace(keyword, b"\n" + keyword + b"\n")
    # text between << and >> not used but some solution to find it back
    cm = cm.replace(b"<<", b"\n{\n").replace(b">>", b"\n}\n")

    pieces = []
    for piece in cm.split(b"<"):
        hex_part, closing, rest = piece.partition(b">")
        if closing:
            # string is empty: stash a placeholder here
            # see https://github.com/py-pdf/pypdf/issues/1111
            content = hex_part.replace(b" ", b"") if hex_part else b"."
            piece = content + b" " + rest
        pieces.append(piece)

    normalized = b" ".join(pieces)
    normalized = normalized.replace(b"[", b" [ ")
    normalized = normalized.replace(b"]", b" ]\n ")
    return normalized.replace(b"\r", b"\n")

288 

289 

def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, Tuple[int, int]],
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
    """
    Handle one line of a prepared CMap and update the parser state.

    Args:
        line: Line to process; empty lines and lines starting with ``%``
            are ignored.
        process_rg: Whether the parser is inside a ``bfrange`` section.
        process_char: Whether the parser is inside a ``bfchar`` section.
        multiline_rg: State of a ``bfrange`` list continuing over several
            lines, as returned by :func:`parse_bfrange`.
        map_dict: Character map, updated in place.
        int_entry: List of mapped codes, updated in place.

    Returns:
        The updated ``(process_rg, process_char, multiline_rg)``.

    """
    if line == b"" or line.startswith(b"%"):
        return process_rg, process_char, multiline_rg
    line = line.replace(b"\t", b" ")
    if b"beginbfrange" in line:
        process_rg = True
    elif b"endbfrange" in line:
        process_rg = False
    elif b"beginbfchar" in line:
        process_char = True
    elif b"endbfchar" in line:
        process_char = False
    elif process_rg or process_char:
        # A broken hex string must not abort the whole CMap: both section
        # parsers are guarded the same way and the offending line is skipped.
        try:
            if process_rg:
                multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
            else:
                parse_bfchar(line, map_dict, int_entry)
        except binascii.Error as error:
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    return process_rg, process_char, multiline_rg

317 

318 

def parse_bfrange(
    line: bytes,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
    multiline_rg: Union[None, Tuple[int, int]],
) -> Union[None, Tuple[int, int]]:
    """
    Parse one line of a ``bfrange`` section into the character map.

    The line holds space-separated hex tokens: either ``first last dest``
    (consecutive codes map to consecutive destinations) or
    ``first last [ dest1 dest2 ... ]`` where the list may continue on
    following lines until ``]``.

    Args:
        line: Line to parse.
        map_dict: Character map, updated in place; ``map_dict[-1]`` receives
            the source code size in bytes when a new range starts.
        int_entry: List of mapped source codes, extended in place.
        multiline_rg: ``None`` when the line starts a new range, else the
            ``(next code, last code)`` state of an unfinished list.

    Returns:
        ``None`` once the range is complete, otherwise the
        ``(next code, last code)`` state to pass along with the next line.

    """
    tokens = [token for token in line.split(b" ") if token]

    def source_key(value: int) -> str:
        # Render the code with the width of the source codes and decode it.
        return unhexlify(key_format % value).decode(key_codec, "surrogatepass")

    if multiline_rg is not None:
        # first and last codes are not in the current line
        key_format = b"%%0%dX" % (map_dict[-1] * 2)
        code = multiline_rg[0]
        last_code = multiline_rg[1]
        key_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"
        destinations = tokens
    else:
        code = int(tokens[0], 16)
        last_code = int(tokens[1], 16)
        longest = max(len(tokens[0]), len(tokens[1]))
        map_dict[-1] = ceil(longest / 2)
        key_format = b"%%0%dX" % (map_dict[-1] * 2)
        key_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"
        if tokens[2] != b"[":
            # case without list
            destination = int(tokens[2], 16)
            destination_format = b"%%0%dX" % max(4, len(tokens[2]))
            for current in range(code, last_code + 1):
                target = unhexlify(destination_format % destination).decode(
                    "utf-16-be", "surrogatepass"
                )
                map_dict[source_key(current)] = target
                int_entry.append(current)
                destination += 1
            return None
        destinations = tokens[3:]

    for token in destinations:
        if token == b"]":
            return None
        target = unhexlify(token).decode("utf-16-be", "surrogatepass")
        map_dict[source_key(code)] = target
        int_entry.append(code)
        code += 1
    return code, last_code

377 

378 

def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
    """
    Parse one line of a ``bfchar`` section into the character map.

    The line holds space-separated ``source destination`` hex token pairs;
    a trailing unpaired token is ignored, and so is a line without tokens.

    Args:
        line: Line to parse.
        map_dict: Character map, updated in place; ``map_dict[-1]`` is set to
            the size in bytes of the first source code of the line.
        int_entry: List of mapped source codes, extended in place.

    Raises:
        binascii.Error: If a token is not a valid hex string.

    """
    tokens = [token for token in line.split(b" ") if token]
    if not tokens:
        # nothing to parse; leave the map (including map_dict[-1]) untouched
        return
    code_size = len(tokens[0]) // 2
    map_dict[-1] = code_size
    key_codec = "charmap" if code_size == 1 else "utf-16-be"
    # Walk the (source, destination) pairs without re-slicing the token list.
    for source, destination in zip(tokens[0::2], tokens[1::2]):
        map_to = ""
        # the "." placeholder means empty string
        if destination != b".":
            map_to = unhexlify(destination).decode(
                "charmap" if len(destination) < 4 else "utf-16-be", "surrogatepass"
            )
        map_dict[unhexlify(source).decode(key_codec, "surrogatepass")] = map_to
        int_entry.append(int(source, 16))

396 

397 

def build_font_width_map(
    ft: DictionaryObject, default_font_width: float
) -> Dict[Any, float]:
    """
    Collect the character widths declared by a font.

    Args:
        ft: Font dictionary.
        default_font_width: Default width used when the font gives none;
            replaced by twice the ``_default_fonts_space_width`` entry when
            ``/BaseFont`` is listed there.

    Returns:
        Mapping from character (``chr(code)``) to width, plus a ``"default"``
        entry holding the width to use for characters that are not listed.

    """
    width_map: Dict[Any, float] = {}
    try:
        base_font = cast(str, ft["/BaseFont"].get_object())
        default_font_width = _default_fonts_space_width[base_font] * 2.0
    except KeyError:
        pass

    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
        # §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
        # Widths for a CIDFont are defined using the DW and W entries.
        # DW2 and W2 are for vertical use. Vertical type is not implemented.
        cid_font = ft["/DescendantFonts"][0].get_object()  # type: ignore
        if "/DW" in cid_font:
            width_map["default"] = cast(float, cid_font["/DW"].get_object())
        else:
            width_map["default"] = default_font_width
        entries = cid_font["/W"].get_object() if "/W" in cid_font else []

        # Walk the /W array with a cursor; each group takes 3 or 2 entries.
        cursor = 0
        while cursor < len(entries):
            head = entries[cursor]
            first_code = head if isinstance(head, int) else head.get_object()
            second = entries[cursor + 1].get_object()
            if isinstance(second, int):
                # C_first C_last same_W
                width = entries[cursor + 2].get_object()
                cursor += 3
                if isinstance(width, (int, float)):
                    for code in range(first_code, second + 1):
                        width_map[chr(code)] = width
                else:
                    logger_warning(f"Expected numeric value for width, got {width}. Ignoring it.", __name__)
            elif isinstance(second, list):
                # Starting_C [W1 W2 ... Wn]
                code = first_code
                for entry in second:
                    width = entry.get_object()
                    width_map[chr(code)] = width
                    code += 1
                cursor += 2
            else:
                logger_warning(
                    "unknown widths : \n" + (cid_font["/W"]).__repr__(),
                    __name__,
                )
                break
    elif "/Widths" in ft:
        simple_widths = cast(ArrayObject, ft["/Widths"].get_object())
        if "/FontDescriptor" in ft and "/MissingWidth" in cast(
            DictionaryObject, ft["/FontDescriptor"]
        ):
            width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
        else:
            # will consider width of char as avg(width)
            total = 0
            positive_count = 0
            for raw_width in simple_widths:
                value = raw_width.get_object()
                if value > 0:
                    total += value
                    positive_count += 1
            width_map["default"] = total / max(1, positive_count)
        first_char = cast(int, ft["/FirstChar"])
        last_char = cast(int, ft["/LastChar"])
        for code in range(first_char, last_char + 1):
            try:
                width_map[chr(code)] = simple_widths[code - first_char].get_object()
            except (IndexError, KeyError):
                # The PDF structure is invalid. The array is too small
                # for the specified font width.
                pass

    if is_null_or_none(width_map.get("default")):
        width_map["default"] = default_font_width or 0.0
    return width_map

478 

479 

def compute_space_width(
    font_width_map: Dict[Any, float], space_char: str
) -> float:
    """
    Return the width to use for the space character.

    Args:
        font_width_map: Widths per character, with a ``"default"`` entry.
        space_char: Key of the space character in ``font_width_map``.

    Returns:
        The width stored for ``space_char``; when it is missing or zero,
        half of the ``"default"`` width.

    """
    try:
        width = font_width_map[space_char]
        usable = not (width == 0)
    except (KeyError, ValueError):
        usable = False
    if usable:
        return width
    # if using default we consider space will be only half size
    return font_width_map["default"] / 2.0

493 

494 

def compute_font_width(
    font_width_map: Dict[Any, float],
    char: str
) -> float:
    """
    Return the width of a character.

    Args:
        font_width_map: Widths per character, with a ``"default"`` entry.
        char: Character to look up.

    Returns:
        The width stored for ``char``, or the ``"default"`` width when the
        character is not listed.

    """
    try:
        return font_width_map[char]
    except KeyError:
        return font_width_map["default"]

508 

509 

def _type1_alternative(
    ft: DictionaryObject,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[Dict[Any, Any], List[int]]:
    """
    Fill the character map from the encoding vector of an embedded font file.

    The clear-text part of ``/FontDescriptor`` -> ``/FontFile`` is scanned
    for ``dup <code> <glyph name> put`` lines following ``/Encoding``.
    Missing data and malformed lines are skipped: this is a best-effort
    fallback and must not raise on broken font programs.

    Args:
        ft: Font dictionary.
        map_dict: Character map, updated in place.
        int_entry: List of mapped codes, extended in place.

    Returns:
        The same ``map_dict`` and ``int_entry`` objects.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    sections = txt.split(b"/Encoding")
    if len(sections) < 2:
        # no encoding vector in the clear part: nothing to extract
        return map_dict, int_entry
    txt = sections[1]  # to get the encoding part
    for li in txt.replace(b"\r", b"\n").split(b"\n"):
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        if len(words) < 3:
            # truncated entry: code or glyph name is missing
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
            key = chr(i)
        except (ValueError, OverflowError):
            # not a number, or not a valid code point
            continue
        glyph_name = words[2]
        try:
            v = adobe_glyphs[glyph_name.decode()]
        except (KeyError, UnicodeDecodeError):
            if not glyph_name.startswith(b"/uni"):
                continue
            # "/uniXXXX" names carry the code point in hexadecimal
            try:
                v = chr(int(glyph_name[4:], 16))
            except (ValueError, OverflowError):
                continue
        map_dict[key] = v
        int_entry.append(i)
    return map_dict, int_entry