Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_cmap.py: 9%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

285 statements  

1import binascii 

2from binascii import Error as BinasciiError 

3from binascii import unhexlify 

4from math import ceil 

5from typing import Any, Union, cast 

6 

7from ._codecs import adobe_glyphs, charset_encoding 

8from ._codecs.core_fontmetrics import CORE_FONT_METRICS 

9from ._utils import logger_error, logger_warning 

10from .generic import ( 

11 ArrayObject, 

12 DecodedStreamObject, 

13 DictionaryObject, 

14 NullObject, 

15 StreamObject, 

16 is_null_or_none, 

17) 

18 

19 

20# code freely inspired from @twiggy ; see #711 

def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> tuple[str, float, Union[str, dict[int, str]], dict[Any, Any], DictionaryObject]:
    """
    Look up a font by name in the resources of ``obj`` and describe it.

    Args:
        font_name: key of the font inside ``obj["/Resources"]["/Font"]``.
        space_width: fallback space width, used when the font provides no
            width data.
        obj: object carrying a ``/Resources`` dictionary (an XObject or a Page).

    Returns:
        A 5-tuple: the four values produced by ``build_char_map_from_dict``
        (font sub-type, half-space width, encoding, character map) followed
        by the font dictionary they were computed from.

    """
    font_dict: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
    return (*build_char_map_from_dict(space_width, font_dict), font_dict)

42 

43 

def build_char_map_from_dict(
    space_width: float, ft: DictionaryObject
) -> tuple[str, float, Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Describe a font given its font dictionary.

    Args:
        space_width: default space width if no data is found
            (normally half the width of a character).
        ft: font dictionary.

    Returns:
        A 4-tuple ``(font sub-type, half-space width, encoding, character map)``.
        The half-space width is 50% of the space width computed for the font.

    """
    subtype = cast(str, ft["/Subtype"].get_object())
    encoding, char_map = get_encoding(ft)

    # The width table is keyed by the font's own representation of a space,
    # which is not necessarily the literal " " character.
    space_key = get_actual_str_key(" ", encoding, char_map)
    # The caller passes a half width; the width table expects a full one.
    widths = build_font_width_map(ft, space_width * 2.0)
    full_space_width = compute_space_width(widths, space_key)

    return subtype, full_space_width / 2.0, encoding, char_map

74 

75 

# Fallback used when data is missing, e.g. when the font definition is absent.
# Same tuple layout as the return value of build_char_map_from_dict.
unknown_char_map: tuple[str, float, Union[str, dict[int, str]], dict[Any, Any]] = (
    "Unknown",  # font sub-type
    9999,  # half-space width
    {code: "�" for code in range(256)},  # every code maps to U+FFFD
    {},  # empty character map
)

83 

84 

85_predefined_cmap: dict[str, str] = { 

86 "/Identity-H": "utf-16-be", 

87 "/Identity-V": "utf-16-be", 

88 "/GB-EUC-H": "gbk", 

89 "/GB-EUC-V": "gbk", 

90 "/GBpc-EUC-H": "gb2312", 

91 "/GBpc-EUC-V": "gb2312", 

92 "/GBK-EUC-H": "gbk", 

93 "/GBK-EUC-V": "gbk", 

94 "/GBK2K-H": "gb18030", 

95 "/GBK2K-V": "gb18030", 

96 "/ETen-B5-H": "cp950", 

97 "/ETen-B5-V": "cp950", 

98 "/ETenms-B5-H": "cp950", 

99 "/ETenms-B5-V": "cp950", 

100 "/UniCNS-UTF16-H": "utf-16-be", 

101 "/UniCNS-UTF16-V": "utf-16-be", 

102 "/UniGB-UTF16-H": "gb18030", 

103 "/UniGB-UTF16-V": "gb18030", 

104 # UCS2 in code 

105} 

106 

107 

def get_encoding(
    ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
    """
    Compute the encoding and the character map of a font.

    Args:
        ft: font dictionary.

    Returns:
        ``(encoding, map_dict)`` where ``encoding`` comes from
        ``_parse_encoding`` and ``map_dict`` from ``_parse_to_unicode``.

    """
    encoding = _parse_encoding(ft)
    map_dict, cmap_codes = _parse_to_unicode(ft)

    # PDF ref 1.7 §5.9.1, 1st bullet: when the cmap is not empty the encoding
    # should be discarded. Here the codes covered by the cmap are turned into
    # an identity translation instead. A string encoding is expected to be an
    # identity translation already, so only dict encodings are patched.
    if isinstance(encoding, dict):
        encoding.update({code: chr(code) for code in cmap_codes if code <= 255})

    return encoding, map_dict

124 

125 

def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, dict[int, str]]:
    """
    Derive the encoding of a font from its ``/Encoding`` entry.

    Args:
        ft: font dictionary.

    Returns:
        Either a codec/encoding name (``str``) or a dictionary mapping
        character codes (0-255) to characters. An unsupported encoding name
        is logged and returned unchanged.

    """
    encoding: Union[str, list[str], dict[int, str]] = []
    if "/Encoding" not in ft:
        # No explicit encoding: use the table registered for the base font
        # if there is one, otherwise fall back to "charmap".
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding
    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        # already done : enc = NameObject.unnumber(enc.encode()).decode()
        # for #xx decoding
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        elif "-UCS2-" in enc:
            encoding = "utf-16-be"
        else:
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding: Any = None
        try:
            base_encoding = enc["/BaseEncoding"]
            encoding = charset_encoding[cast(str, base_encoding)].copy()
        except Exception:
            # Deliberately broad: whatever goes wrong with the base encoding,
            # report its name and continue with the standard encoding.
            logger_error(
                f"Advanced encoding {base_encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()
    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences is a flat sequence: an integer sets the current code,
        # each following name is assigned to consecutive codes.
        x: int = 0
        o: Union[int, str]
        for o in cast(ArrayObject, enc["/Differences"]):
            if isinstance(o, int):
                x = o
                continue
            # Codes outside the table are skipped; a negative code must not
            # silently index from the end of the list.
            if 0 <= x < len(encoding):
                try:
                    encoding[x] = adobe_glyphs[o]  # type: ignore
                except (KeyError, TypeError):
                    # Unknown glyph name: keep the name itself.
                    encoding[x] = o  # type: ignore
            x += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding

183 

184 

def _parse_to_unicode(
    ft: DictionaryObject
) -> tuple[dict[Any, Any], list[int]]:
    """
    Build the character map described by the ``/ToUnicode`` entry of a font.

    Args:
        ft: font dictionary.

    Returns:
        ``(map_dict, int_entry)``. ``map_dict`` holds all the translations,
        plus, under the key ``-1``, the number of bytes to convert.
        ``int_entry`` lists the cmap keys as integers, used to correct the
        encoding.

    """
    map_dict: dict[Any, Any] = {}
    int_entry: list[int] = []

    if "/ToUnicode" not in ft:
        if ft.get("/Subtype", "") == "/Type1":
            return _type1_alternative(ft, map_dict, int_entry)
        return {}, []

    in_range: bool = False
    in_char: bool = False
    # (current_char, remaining size) of a bfrange list spread over several
    # lines ; cf #1285 for example of file
    pending_range: Union[None, tuple[int, int]] = None
    for raw_line in prepare_cm(ft).split(b"\n"):
        in_range, in_char, pending_range = process_cm_line(
            raw_line.strip(b" \t"),
            in_range,
            in_char,
            pending_range,
            map_dict,
            int_entry,
        )

    return map_dict, int_entry

216 

217 

def get_actual_str_key(
    value_char: str, encoding: Union[str, dict[int, str]], map_dict: dict[Any, Any]
) -> str:
    """
    Find the key under which ``value_char`` is stored for a font.

    With a dictionary encoding, the encoding is searched and the matching
    code is returned as a character. Otherwise ``map_dict`` is searched and
    the matching key is returned as is. When several entries match, the last
    one wins; when none matches, ``value_char`` itself is returned.
    """
    if isinstance(encoding, dict):
        matches = [chr(code) for code, char in encoding.items() if char == value_char]
    else:
        matches = [source for source, char in map_dict.items() if char == value_char]
    if matches:
        return matches[-1]
    return value_char

227 

228 

def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the ``/ToUnicode`` cmap of a font, normalised for line-based parsing.

    The section keywords are isolated on their own lines, the angle brackets
    around hex strings are removed (spaces inside them are dropped and an
    empty string becomes the placeholder ``.``), square brackets are
    surrounded by spaces and carriage returns become line feeds.

    Args:
        ft: font dictionary containing a ``/ToUnicode`` entry.

    Returns:
        The normalised cmap as bytes.

    """
    to_unicode = ft["/ToUnicode"]
    data: bytes
    if isinstance(to_unicode, StreamObject):
        data = cast(DecodedStreamObject, to_unicode).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # not a stream: use a built-in bfrange section instead
        data = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(data, str):
        data = data.encode()

    # The keywords must be isolated beforehand because return lines are
    # missing in pdf printed to pdf from word.
    data = data.strip()
    for keyword in (b"beginbfchar", b"endbfchar", b"beginbfrange", b"endbfrange"):
        data = data.replace(keyword, b"\n" + keyword + b"\n")
    # text between << and >> is not used, but keep a way to find it back
    data = data.replace(b"<<", b"\n{\n").replace(b">>", b"\n}\n")

    chunks = data.split(b"<")
    for index, chunk in enumerate(chunks):
        closing = chunk.find(b">")
        if closing < 0:
            continue
        if closing == 0:
            # empty string: stash a placeholder which the bfchar parsing
            # recognises ; see https://github.com/py-pdf/pypdf/issues/1111
            hex_digits = b"."
        else:
            hex_digits = chunk[:closing].replace(b" ", b"")
        chunks[index] = hex_digits + b" " + chunk[closing + 1 :]

    joined = b" ".join(chunks)
    return joined.replace(b"[", b" [ ").replace(b"]", b" ]\n ").replace(b"\r", b"\n")

268 

269 

def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, tuple[int, int]],
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
    """
    Process one line of a prepared cmap and return the updated parser state.

    Args:
        line: the line to process.
        process_rg: True while inside a ``beginbfrange``/``endbfrange`` section.
        process_char: True while inside a ``beginbfchar``/``endbfchar`` section.
        multiline_rg: state of a bfrange list continued over several lines,
            or None.
        map_dict: character map, updated in place.
        int_entry: list of the cmap keys as integers, updated in place.

    Returns:
        The new ``(process_rg, process_char, multiline_rg)`` state.

    """
    # Empty lines and comment lines leave the state untouched.
    if not line or line.startswith(b"%"):
        return process_rg, process_char, multiline_rg

    line = line.replace(b"\t", b" ")

    # Section markers only toggle the matching flag.
    if b"beginbfrange" in line:
        return True, process_char, multiline_rg
    if b"endbfrange" in line:
        return False, process_char, multiline_rg
    if b"beginbfchar" in line:
        return process_rg, True, multiline_rg
    if b"endbfchar" in line:
        return process_rg, False, multiline_rg

    # Content lines: the range section takes precedence over the char one.
    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg

297 

298 

def parse_bfrange(
    line: bytes,
    map_dict: dict[Any, Any],
    int_entry: list[int],
    multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
    """
    Parse one line of a ``bfrange`` section into ``map_dict``.

    Two forms are handled: ``first last start`` (consecutive targets) and
    ``first last [ t1 t2 ... ]`` (explicit targets), the latter possibly
    continued on following lines.

    Args:
        line: line to parse, items separated by spaces.
        map_dict: character map, updated in place; ``map_dict[-1]`` holds
            the number of bytes of the source codes.
        int_entry: list of the source codes as integers, updated in place.
        multiline_rg: ``(current_char, last_char)`` when this line continues
            a list opened on a previous line, else None.

    Returns:
        None when the range is complete, otherwise the
        ``(current_char, last_char)`` state to pass with the next line.

    """
    items = [item for item in line.split(b" ") if item]

    def source_key(code: int, hex_format: bytes) -> str:
        # Source codes are single bytes (charmap) or UTF-16-BE code units.
        return unhexlify(hex_format % code).decode(
            "charmap" if map_dict[-1] == 1 else "utf-16-be",
            "surrogatepass",
        )

    def store(code: int, hex_format: bytes, target: str) -> None:
        map_dict[source_key(code, hex_format)] = target
        int_entry.append(code)

    def store_listed(code: int, hex_format: bytes, targets: list[bytes]) -> tuple[int, bool]:
        # Assign the listed targets to consecutive codes; stop at "]".
        for hex_target in targets:
            if hex_target == b"]":
                return code, True
            store(code, hex_format, unhexlify(hex_target).decode("utf-16-be", "surrogatepass"))
            code += 1
        return code, False

    if multiline_rg is not None:
        # first and last codes are not in the current line
        key_format = b"%%0%dX" % (map_dict[-1] * 2)
        current = multiline_rg[0]
        last = multiline_rg[1]
        current, closed = store_listed(current, key_format, items)
    else:
        current = int(items[0], 16)
        last = int(items[1], 16)
        digits = max(len(items[0]), len(items[1]))
        map_dict[-1] = ceil(digits / 2)
        key_format = b"%%0%dX" % (map_dict[-1] * 2)
        if items[2] == b"[":
            current, closed = store_listed(current, key_format, items[3:])
        else:  # case without list
            target_code = int(items[2], 16)
            target_format = b"%%0%dX" % max(4, len(items[2]))
            closed = True
            while current <= last:
                store(
                    current,
                    key_format,
                    unhexlify(target_format % target_code).decode("utf-16-be", "surrogatepass"),
                )
                current += 1
                target_code += 1
    return None if closed else (current, last)

357 

358 

def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
    """
    Parse one line of a ``bfchar`` section into ``map_dict``.

    The line is a sequence of ``source target`` hex pairs; a trailing
    unpaired item is ignored.

    Args:
        line: line to parse, items separated by spaces.
        map_dict: character map, updated in place; ``map_dict[-1]`` is set
            to the number of bytes of the first source code.
        int_entry: list of the source codes as integers, updated in place.

    """
    items = [item for item in line.split(b" ") if item]
    map_dict[-1] = len(items[0]) // 2
    for hex_source, hex_target in zip(items[0::2], items[1::2]):
        target = ""
        # the placeholder "." stands for an empty string
        if hex_target != b".":
            try:
                target = unhexlify(hex_target).decode(
                    "charmap" if len(hex_target) < 4 else "utf-16-be", "surrogatepass"
                )
            except BinasciiError as exception:
                # an invalid target is reported and mapped to the empty string
                logger_warning(f"Got invalid hex string: {exception!s} ({hex_target!r})", __name__)
        source = unhexlify(hex_source).decode(
            "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
        )
        map_dict[source] = target
        int_entry.append(int(hex_source, 16))

379 

380 

def build_font_width_map(
    ft: DictionaryObject, default_font_width: float
) -> dict[Any, float]:
    """
    Build the table of character widths of a font.

    Three sources are used, in this order of preference: the ``/DW`` and
    ``/W`` entries of the first descendant font, the ``/Widths`` array with
    ``/FirstChar``/``/LastChar``, and the built-in core font metrics selected
    by ``/BaseFont``.

    Args:
        ft: font dictionary.
        default_font_width: default width used for a font with descendant
            fonts which has no ``/DW`` entry and is not a known core font.

    Returns:
        A new dictionary mapping characters to widths, with the additional
        key ``"default"`` holding the width of unlisted characters. The
        shared core font metrics are never modified.

    """
    font_width_map: dict[Any, float] = {}
    st: int = 0
    en: int = 0
    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
        # §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
        # Widths for a CIDFont are defined using the DW and W entries.
        # DW2 and W2 are for vertical use. Vertical type is not implemented.
        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
        if "/DW" in ft1:
            font_width_map["default"] = cast(float, ft1["/DW"].get_object())
        else:
            # A missing /BaseFont is handled like an unknown font name.
            font_name = str(ft["/BaseFont"] if "/BaseFont" in ft else "").removeprefix("/")
            if font_name in CORE_FONT_METRICS:
                # This applies to test_tounicode_is_identity, which has a CID CourierNew font that
                # apparently does not specify the width of a space.
                font_width_map["default"] = CORE_FONT_METRICS[font_name].character_widths[" "] * 2
            else:
                font_width_map["default"] = default_font_width
        w = ft1["/W"].get_object() if "/W" in ft1 else []
        while len(w) > 0:
            st = w[0] if isinstance(w[0], int) else w[0].get_object()
            second = w[1].get_object()
            if isinstance(second, int):
                # C_first C_last same_W
                en = second
                width = w[2].get_object()
                if not isinstance(width, (int, float)):
                    logger_warning(f"Expected numeric value for width, got {width}. Ignoring it.", __name__)
                    w = w[3:]
                    continue
                for c_code in range(st, en + 1):
                    font_width_map[chr(c_code)] = width
                w = w[3:]
            elif isinstance(second, list):
                # Starting_C [W1 W2 ... Wn]
                c_code = st
                for ww in second:
                    width = ww.get_object()
                    font_width_map[chr(c_code)] = width
                    c_code += 1
                w = w[2:]
            else:
                logger_warning(
                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
                    __name__,
                )
                break
    elif "/Widths" in ft:
        w = cast(ArrayObject, ft["/Widths"].get_object())
        if "/FontDescriptor" in ft and "/MissingWidth" in cast(
            DictionaryObject, ft["/FontDescriptor"]
        ):
            font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
        else:
            # will consider width of char as avg(width), ignoring the
            # entries which are not strictly positive
            m = 0
            cpt = 0
            for xx in w:
                xx = xx.get_object()
                if xx > 0:
                    m += xx
                    cpt += 1
            font_width_map["default"] = m / max(1, cpt)
        st = cast(int, ft["/FirstChar"])
        en = cast(int, ft["/LastChar"])
        for c_code in range(st, en + 1):
            try:
                width = w[c_code - st].get_object()
                font_width_map[chr(c_code)] = width
            except (IndexError, KeyError):
                # The PDF structure is invalid. The array is too small
                # for the specified font width.
                pass
    else:
        # A missing /BaseFont is handled like an unknown font name.
        font_name = str(ft["/BaseFont"] if "/BaseFont" in ft else "").removeprefix("/")
        if font_name in CORE_FONT_METRICS:
            # Copy the table: "default" is added below and must not leak
            # into the shared core font metrics.
            font_width_map = dict(
                cast(dict[str, float], CORE_FONT_METRICS[font_name].character_widths)
            )
            font_width_map["default"] = font_width_map[" "] * 2
    if is_null_or_none(font_width_map.get("default")):
        font_width_map["default"] = 0
    return font_width_map

468 

469 

def compute_space_width(
    font_width_map: dict[Any, float], space_char: str
) -> float:
    """
    Return the width of a space for a font.

    Args:
        font_width_map: widths per character, with a ``"default"`` entry.
        space_char: key of the space character in ``font_width_map``.

    Returns:
        The width stored for ``space_char``; when it is missing or zero,
        half of the ``"default"`` width.

    """
    width = font_width_map.get(space_char, 0)
    if width == 0:
        # if using default we consider space will be only half size
        return font_width_map["default"] / 2.0
    return width

483 

484 

def compute_font_width(
    font_width_map: dict[Any, float],
    char: str
) -> float:
    """
    Return the width of a character for a font.

    Args:
        font_width_map: widths per character, with a ``"default"`` entry.
        char: character to look up.

    Returns:
        The width stored for ``char``, or the ``"default"`` width when the
        character is not listed.

    """
    if char in font_width_map:
        return font_width_map[char]
    return font_width_map["default"]

498 

499 

def _type1_alternative(
    ft: DictionaryObject,
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[dict[Any, Any], list[int]]:
    """
    Build a character map from the font program of a Type1 font.

    The clear-text part of the ``/FontFile`` stream is scanned for the
    ``dup <code> <glyph name> put`` lines following ``/Encoding``. Each glyph
    name is translated through ``adobe_glyphs`` or, for ``/uniXXXX`` names,
    through its hexadecimal code point. Lines which cannot be interpreted are
    skipped.

    Args:
        ft: font dictionary.
        map_dict: character map, updated in place.
        int_entry: list of the mapped codes as integers, updated in place.

    Returns:
        ``(map_dict, int_entry)``, left unchanged when the font has no
        ``/FontDescriptor``, no ``/FontFile`` or no ``/Encoding`` section.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    sections = txt.split(b"/Encoding")
    if len(sections) < 2:
        # no encoding part in the font program: nothing to extract
        return map_dict, int_entry
    txt = sections[1]  # to get the encoding part
    for li in txt.replace(b"\r", b"\n").split(b"\n"):
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        if len(words) < 3:
            # a code and a glyph name are both required
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
            key = chr(i)
        except (ValueError, OverflowError):
            # not a number, or not a valid code point
            continue
        glyph_name = words[2]
        try:
            v = adobe_glyphs[glyph_name.decode()]
        except (KeyError, UnicodeDecodeError):
            if not glyph_name.startswith(b"/uni"):
                continue
            try:
                v = chr(int(glyph_name[4:], 16))
            except (ValueError, OverflowError):
                continue
        map_dict[key] = v
        int_entry.append(i)
    return map_dict, int_entry