Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 9%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

282 statements  

1import binascii 

2from binascii import Error as BinasciiError 

3from binascii import unhexlify 

4from math import ceil 

5from typing import Any, Dict, List, Tuple, Union, cast 

6 

7from ._codecs import adobe_glyphs, charset_encoding 

8from ._utils import logger_error, logger_warning 

9from .generic import ( 

10 ArrayObject, 

11 DecodedStreamObject, 

12 DictionaryObject, 

13 NullObject, 

14 StreamObject, 

15 is_null_or_none, 

16) 

17 

18 

19# code freely inspired from @twiggy ; see #711 

def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any], DictionaryObject]:
    """
    Resolve a font by name and describe how its text should be decoded.

    Args:
        font_name: key of the font inside ``obj["/Resources"]["/Font"]``.
        space_width: fallback space width, forwarded to
            :func:`build_char_map_from_dict`.
        obj: object (XObject or Page) that carries a ``/Resources`` dictionary.

    Returns:
        The four values produced by :func:`build_char_map_from_dict`
        (font sub-type, half space width, encoding, character map) followed
        by the font dictionary that was looked up.

    """
    font_dict: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
    # Append the font dictionary to the 4-tuple computed from it.
    return (*build_char_map_from_dict(space_width, font_dict), font_dict)

41 

42 

def build_char_map_from_dict(
    space_width: float, ft: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Describe how text drawn with the given font dictionary is decoded.

    Args:
        space_width: default space width if no data is found
            (normally half the width of a character).
        ft: font dictionary.

    Returns:
        ``(font sub-type, half space width, encoding, character map)``.
        The half space width is the computed space width divided by two.

    """
    subtype = cast(str, ft["/Subtype"].get_object())
    encoding, map_dict = get_encoding(ft)

    # Find which key stands for " " in this font, then measure it.
    space_key = get_actual_str_key(" ", encoding, map_dict)
    width_map = build_font_width_map(ft, space_width * 2.0)
    full_space_width = compute_space_width(width_map, space_key)

    return subtype, full_space_width / 2.0, encoding, map_dict

73 

74 

# Fallback used when data is missing, e.g. the font definition is absent.
# Typed like the tuple returned by build_char_map_from_dict; every one of the
# 256 single-byte codes maps to the replacement character.
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
    "Unknown",
    9999,
    {code: "�" for code in range(256)},
    {},
)

82 

83 

84_predefined_cmap: Dict[str, str] = { 

85 "/Identity-H": "utf-16-be", 

86 "/Identity-V": "utf-16-be", 

87 "/GB-EUC-H": "gbk", 

88 "/GB-EUC-V": "gbk", 

89 "/GBpc-EUC-H": "gb2312", 

90 "/GBpc-EUC-V": "gb2312", 

91 "/GBK-EUC-H": "gbk", 

92 "/GBK-EUC-V": "gbk", 

93 "/GBK2K-H": "gb18030", 

94 "/GBK2K-V": "gb18030", 

95 "/ETen-B5-H": "cp950", 

96 "/ETen-B5-V": "cp950", 

97 "/ETenms-B5-H": "cp950", 

98 "/ETenms-B5-V": "cp950", 

99 "/UniCNS-UTF16-H": "utf-16-be", 

100 "/UniCNS-UTF16-V": "utf-16-be", 

101 "/UniGB-UTF16-H": "gb18030", 

102 "/UniGB-UTF16-V": "gb18030", 

103 # UCS2 in code 

104} 

105 

106# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz 

107_default_fonts_space_width: Dict[str, int] = { 

108 "/Courier": 600, 

109 "/Courier-Bold": 600, 

110 "/Courier-BoldOblique": 600, 

111 "/Courier-Oblique": 600, 

112 "/Helvetica": 278, 

113 "/Helvetica-Bold": 278, 

114 "/Helvetica-BoldOblique": 278, 

115 "/Helvetica-Oblique": 278, 

116 "/Helvetica-Narrow": 228, 

117 "/Helvetica-NarrowBold": 228, 

118 "/Helvetica-NarrowBoldOblique": 228, 

119 "/Helvetica-NarrowOblique": 228, 

120 "/Times-Roman": 250, 

121 "/Times-Bold": 250, 

122 "/Times-BoldItalic": 250, 

123 "/Times-Italic": 250, 

124 "/Symbol": 250, 

125 "/ZapfDingbats": 278, 

126} 

127 

128 

def get_encoding(
    ft: DictionaryObject
) -> Tuple[Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Return the encoding and the ToUnicode character map of a font.

    Args:
        ft: font dictionary.

    Returns:
        ``(encoding, map_dict)`` where ``encoding`` comes from
        :func:`_parse_encoding` and ``map_dict`` from
        :func:`_parse_to_unicode`.

    """
    encoding = _parse_encoding(ft)
    map_dict, cmap_codes = _parse_to_unicode(ft)

    # If encoding is a string, it is expected to be an identity translation.
    if not isinstance(encoding, dict):
        return encoding, map_dict

    # Apply rule from PDF ref 1.7 §5.9.1, 1st bullet:
    # if cmap not empty encoding should be discarded
    # (here transformed into identity for those characters)
    encoding.update({code: chr(code) for code in cmap_codes if code <= 255})
    return encoding, map_dict

145 

146 

def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, Dict[int, str]]:
    """
    Work out the character encoding declared by a font dictionary.

    Args:
        ft: font dictionary.

    Returns:
        Either a codec name (``str``) or a table mapping each single-byte
        code to its character.

        * Without ``/Encoding``: the table registered for ``/BaseFont`` in
          ``charset_encoding`` if there is one, else ``"charmap"``.
        * ``/Encoding`` is a name: the matching ``charset_encoding`` table,
          the ``_predefined_cmap`` codec, ``"utf-16-be"`` for ``-UCS2-``
          names, or (after logging an error) the name itself.
        * ``/Encoding`` is a dictionary: its ``/BaseEncoding`` table
          (``/StandardEncoding`` when missing or unknown) patched with the
          ``/Differences`` array.
        * Anything else: the ``/StandardEncoding`` table.

    """
    encoding: Union[str, List[str], Dict[int, str]] = []
    if "/Encoding" not in ft:
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            return dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        return "charmap"

    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # already done : enc = NameObject.unnumber(enc.encode()).decode()
        # for #xx decoding
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        elif "-UCS2-" in enc:
            encoding = "utf-16-be"
        else:
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding: Any = None
        try:
            base_encoding = enc["/BaseEncoding"]
            encoding = charset_encoding[cast(str, base_encoding)].copy()
        except Exception:
            # Report the name that could not be resolved; `encoding` itself
            # is still the empty placeholder at this point.
            logger_error(
                f"Advanced encoding {base_encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()

    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences is a flat array: an integer sets the current code,
        # each following name is assigned to consecutive codes.
        code: int = 0
        item: Union[int, str]
        for item in cast(ArrayObject, enc["/Differences"]):
            if isinstance(item, int):
                code = item
                continue
            # Codes outside the table are skipped; a negative code must not
            # wrap around and overwrite entries at the end of the table.
            if 0 <= code < len(encoding):
                try:
                    encoding[code] = adobe_glyphs[item]  # type: ignore
                except Exception:
                    # unknown glyph name: keep the name itself
                    encoding[code] = item  # type: ignore
            code += 1

    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding

204 

205 

def _parse_to_unicode(
    ft: DictionaryObject
) -> Tuple[Dict[Any, Any], List[int]]:
    """
    Build the character map described by the font's ``/ToUnicode`` entry.

    Args:
        ft: font dictionary.

    Returns:
        ``(map_dict, int_entry)``. ``map_dict`` stores all translation
        codes, with ``map_dict[-1]`` holding the number of bytes to convert;
        ``int_entry`` lists the cmap keys as integers so the encoding can be
        corrected by the caller.

        Without ``/ToUnicode``, a ``/Type1`` font is delegated to
        :func:`_type1_alternative`; any other font yields ``({}, [])``.

    """
    map_dict: Dict[Any, Any] = {}
    int_entry: List[int] = []

    if "/ToUnicode" not in ft:
        if ft.get("/Subtype", "") == "/Type1":
            return _type1_alternative(ft, map_dict, int_entry)
        return {}, []

    # Parser state carried from one cmap line to the next.
    in_bfrange: bool = False
    in_bfchar: bool = False
    # tuple = (current_char, remaining size) ; cf #1285 for example of file
    pending_range: Union[None, Tuple[int, int]] = None

    for raw_line in prepare_cm(ft).split(b"\n"):
        in_bfrange, in_bfchar, pending_range = process_cm_line(
            raw_line.strip(b" \t"),
            in_bfrange,
            in_bfchar,
            pending_range,
            map_dict,
            int_entry,
        )

    return map_dict, int_entry

237 

238 

def get_actual_str_key(
    value_char: str, encoding: Union[str, Dict[int, str]], map_dict: Dict[Any, Any]
) -> str:
    """
    Find the key under which ``value_char`` is stored for a font.

    Args:
        value_char: character to look for among the mapped values.
        encoding: encoding table, or a codec name.
        map_dict: character map, searched only when ``encoding`` is not a
            table.

    Returns:
        With an encoding table: ``chr(code)`` of the code mapped to
        ``value_char``. Otherwise: the ``map_dict`` key mapped to
        ``value_char``. If several entries match, the last one wins; if none
        match, ``value_char`` itself is returned.

    """
    if isinstance(encoding, dict):
        matches = [
            chr(code) for code, char in encoding.items() if char == value_char
        ]
    else:
        matches = [key for key, char in map_dict.items() if char == value_char]
    return matches[-1] if matches else value_char

248 

249 

def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Normalise the ``/ToUnicode`` cmap so it can be parsed line by line.

    Args:
        ft: font dictionary; its ``/ToUnicode`` entry is read.

    Returns:
        The cmap as bytes where the ``bfchar``/``bfrange`` keywords sit on
        their own lines, ``<...>`` hex strings have lost their angle
        brackets and inner spaces, ``[`` and ``]`` are space-separated
        tokens and carriage returns are turned into newlines.

    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, tu).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # not a stream: fall back to a built-in bfrange section
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()

    # we need to prepare cm before due to missing return line in pdf printed
    # to pdf from word
    cm = cm.strip()
    for keyword in (b"beginbfchar", b"endbfchar", b"beginbfrange", b"endbfrange"):
        cm = cm.replace(keyword, b"\n" + keyword + b"\n")
    # text between << and >> not used but some solution to find it back
    cm = cm.replace(b"<<", b"\n{\n").replace(b">>", b"\n}\n")

    pieces = cm.split(b"<")
    for index, piece in enumerate(pieces):
        closing = piece.find(b">")
        if closing < 0:
            continue
        if closing == 0:
            # string is empty: stash a placeholder here (parse_bfchar turns
            # it back into an empty string)
            # see https://github.com/py-pdf/pypdf/issues/1111
            hex_string = b"."
        else:
            hex_string = piece[:closing].replace(b" ", b"")
        pieces[index] = hex_string + b" " + piece[closing + 1 :]

    joined = b" ".join(pieces)
    return joined.replace(b"[", b" [ ").replace(b"]", b" ]\n ").replace(b"\r", b"\n")

289 

290 

def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, Tuple[int, int]],
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
    """
    Handle one line of a prepared cmap and return the updated parser state.

    Args:
        line: cmap line, already stripped by the caller.
        process_rg: True while inside a ``beginbfrange``/``endbfrange`` section.
        process_char: True while inside a ``beginbfchar``/``endbfchar`` section.
        multiline_rg: pending state of a range list spanning several lines.
        map_dict: character map, filled in place.
        int_entry: integer cmap keys, filled in place.

    Returns:
        ``(process_rg, process_char, multiline_rg)`` after this line.

    """
    # Empty lines and comments leave the state untouched.
    if not line or line.startswith(b"%"):
        return process_rg, process_char, multiline_rg

    line = line.replace(b"\t", b" ")

    # Section markers only flip the matching flag.
    if b"beginbfrange" in line:
        return True, process_char, multiline_rg
    if b"endbfrange" in line:
        return False, process_char, multiline_rg
    if b"beginbfchar" in line:
        return process_rg, True, multiline_rg
    if b"endbfchar" in line:
        return process_rg, False, multiline_rg

    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg

318 

319 

def parse_bfrange(
    line: bytes,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
    multiline_rg: Union[None, Tuple[int, int]],
) -> Union[None, Tuple[int, int]]:
    """
    Parse one line of a ``bfrange`` section into ``map_dict``.

    Two line shapes are handled: ``first last start`` (consecutive
    destinations) and ``first last [ d1 d2 ... ]`` (explicit destinations,
    possibly continued on following lines).

    Args:
        line: space-separated tokens of the line (hex strings without ``<>``).
        map_dict: character map, filled in place; ``map_dict[-1]`` is set to
            the number of bytes of the source codes.
        int_entry: receives every source code that was mapped.
        multiline_rg: ``(next source code, last source code)`` when the line
            continues a list opened on an earlier line, else ``None``.

    Returns:
        ``None`` once the range is complete, otherwise
        ``(next source code, last source code)`` to resume on the next line.

    """
    tokens = [token for token in line.split(b" ") if token]

    if multiline_rg is None:
        code = int(tokens[0], 16)
        last = int(tokens[1], 16)
        map_dict[-1] = ceil(max(len(tokens[0]), len(tokens[1])) / 2)
        key_format = b"%%0%dX" % (map_dict[-1] * 2)
        listed = tokens[3:] if tokens[2] == b"[" else None
    else:
        # code and last are not in the current line
        key_format = b"%%0%dX" % (map_dict[-1] * 2)
        code, last = multiline_rg
        listed = tokens

    key_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"

    def source_key(value: int) -> str:
        # Source codes are stored under their decoded (str) form.
        return unhexlify(key_format % value).decode(key_codec, "surrogatepass")

    if listed is None:  # case without list
        target = int(tokens[2], 16)
        target_format = b"%%0%dX" % max(4, len(tokens[2]))
        while code <= last:
            map_dict[source_key(code)] = unhexlify(target_format % target).decode(
                "utf-16-be", "surrogatepass"
            )
            int_entry.append(code)
            code += 1
            target += 1
        return None

    for token in listed:
        if token == b"]":
            return None
        map_dict[source_key(code)] = unhexlify(token).decode(
            "utf-16-be", "surrogatepass"
        )
        int_entry.append(code)
        code += 1
    # closing bracket not seen yet: the list goes on
    return code, last

378 

379 

def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
    """
    Parse one line of a ``bfchar`` section into ``map_dict``.

    The line is a sequence of ``source destination`` hex-string pairs; a
    trailing token without a partner is ignored.

    Args:
        line: space-separated tokens of the line (hex strings without ``<>``).
        map_dict: character map, filled in place; ``map_dict[-1]`` is set to
            the number of bytes of the first source code.
        int_entry: receives every source code that was mapped.

    """
    tokens = [token for token in line.split(b" ") if token]
    map_dict[-1] = len(tokens[0]) // 2
    key_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"

    for source, destination in zip(tokens[0::2], tokens[1::2]):
        # placeholder (set by prepare_cm) means empty string
        if destination == b".":
            map_to = ""
        else:
            try:
                map_to = unhexlify(destination).decode(
                    "charmap" if len(destination) < 4 else "utf-16-be",
                    "surrogatepass",
                )
            except BinasciiError as exception:
                logger_warning(f"Got invalid hex string: {exception!s} ({destination!r})", __name__)
                map_to = ""
        map_dict[unhexlify(source).decode(key_codec, "surrogatepass")] = map_to
        int_entry.append(int(source, 16))

400 

401 

def build_font_width_map(
    ft: DictionaryObject, default_font_width: float
) -> Dict[Any, float]:
    """
    Collect the glyph widths declared by a font dictionary.

    Args:
        ft: font dictionary.
        default_font_width: width to fall back on; replaced by twice the
            known space width when ``/BaseFont`` is listed in
            ``_default_fonts_space_width``.

    Returns:
        A map from character (``chr`` of the character code) to width, plus
        a ``"default"`` entry for characters without an explicit width.

    """
    font_width_map: Dict[Any, float] = {}
    try:
        base_font = cast(str, ft["/BaseFont"].get_object())
        default_font_width = _default_fonts_space_width[base_font] * 2.0
    except KeyError:
        pass

    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
        # §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
        # Widths for a CIDFont are defined using the DW and W entries.
        # DW2 and W2 are for vertical use. Vertical type is not implemented.
        cid_font = ft["/DescendantFonts"][0].get_object()  # type: ignore
        font_width_map["default"] = (
            cast(float, cid_font["/DW"].get_object())
            if "/DW" in cid_font
            else default_font_width
        )
        w = cid_font["/W"].get_object() if "/W" in cid_font else []

        # Walk the /W array with a cursor; each record is 3 or 2 items long.
        pos = 0
        while pos < len(w):
            head = w[pos]
            first = head if isinstance(head, int) else head.get_object()
            second = w[pos + 1].get_object()
            if isinstance(second, int):
                # C_first C_last same_W
                width = w[pos + 2].get_object()
                pos += 3
                if isinstance(width, (int, float)):
                    for c_code in range(first, second + 1):
                        font_width_map[chr(c_code)] = width
                else:
                    logger_warning(f"Expected numeric value for width, got {width}. Ignoring it.", __name__)
            elif isinstance(second, list):
                # Starting_C [W1 W2 ... Wn]
                for offset, entry in enumerate(second):
                    font_width_map[chr(first + offset)] = entry.get_object()
                pos += 2
            else:
                logger_warning(
                    "unknown widths : \n" + (cid_font["/W"]).__repr__(),
                    __name__,
                )
                break
    elif "/Widths" in ft:
        w = cast(ArrayObject, ft["/Widths"].get_object())
        if "/FontDescriptor" in ft and "/MissingWidth" in cast(
            DictionaryObject, ft["/FontDescriptor"]
        ):
            font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
        else:
            # will consider width of char as avg(width)
            positive = [
                value
                for value in (entry.get_object() for entry in w)
                if value > 0
            ]
            font_width_map["default"] = sum(positive) / max(1, len(positive))

        first_char = cast(int, ft["/FirstChar"])
        last_char = cast(int, ft["/LastChar"])
        for offset in range(last_char - first_char + 1):
            try:
                font_width_map[chr(first_char + offset)] = w[offset].get_object()
            except (IndexError, KeyError):
                # The PDF structure is invalid. The array is too small
                # for the specified font width.
                pass

    if is_null_or_none(font_width_map.get("default")):
        font_width_map["default"] = default_font_width if default_font_width else 0.0
    return font_width_map

482 

483 

def compute_space_width(
    font_width_map: Dict[Any, float], space_char: str
) -> float:
    """
    Return the width to use for a space in the given font.

    Args:
        font_width_map: character-to-width map with a ``"default"`` entry.
        space_char: key standing for the space character in that map.

    Returns:
        The width stored for ``space_char`` when it exists and is not zero;
        otherwise half of the ``"default"`` width.

    """
    if space_char in font_width_map:
        sp_width = font_width_map[space_char]
        if not sp_width == 0:
            return sp_width
    # if using default we consider space will be only half size
    return font_width_map["default"] / 2.0

497 

498 

def compute_font_width(
    font_width_map: Dict[Any, float],
    char: str
) -> float:
    """
    Return the width of ``char`` in the given font.

    Args:
        font_width_map: character-to-width map with a ``"default"`` entry.
        char: character to look up.

    Returns:
        The width stored for ``char``, or the ``"default"`` width when the
        character has no entry.

    """
    if char in font_width_map:
        return font_width_map[char]
    return font_width_map["default"]

512 

513 

def _type1_alternative(
    ft: DictionaryObject,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[Dict[Any, Any], List[int]]:
    """
    Build a character map from the embedded program of a Type1 font.

    The clear-text part of ``/FontDescriptor/FontFile`` (everything before
    ``eexec``) is scanned, after its ``/Encoding`` keyword, for lines of the
    form ``dup <code> <glyph name> put``.

    Args:
        ft: font dictionary.
        map_dict: character map, filled in place.
        int_entry: receives every character code that was mapped.

    Returns:
        ``(map_dict, int_entry)``. Both are returned unchanged when the font
        has no ``/FontDescriptor``, no usable ``/FontFile``, or no
        ``/Encoding`` section in the clear-text part. Malformed ``dup``
        lines and unknown glyph names are skipped.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    sections = txt.split(b"/Encoding")
    if len(sections) < 2:
        # no /Encoding in the clear part: nothing to extract
        return map_dict, int_entry
    txt = sections[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        if len(words) < 3:
            # truncated entry: code and glyph name are both required
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
        except ValueError:  # pragma: no cover
            continue
        try:
            v = adobe_glyphs[words[2].decode()]
        except UnicodeDecodeError:
            # glyph name is not valid text: cannot be looked up
            continue
        except KeyError:
            if not words[2].startswith(b"/uni"):
                continue
            # "/uniXXXX" names carry the code point in hexadecimal
            try:
                v = chr(int(words[2][4:], 16))
            except ValueError:  # pragma: no cover
                continue
        map_dict[chr(i)] = v
        int_entry.append(i)
    return map_dict, int_entry