Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/cmapdb.py: 61%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

356 statements  

1"""Adobe character mapping (CMap) support. 

2 

3CMaps provide the mapping between character codes and Unicode 

4code-points to character ids (CIDs). 

5 

6More information is available on: 

7 

8 https://github.com/adobe-type-tools/cmap-resources 

9 

10""" 

11 

12import contextlib 

13import gzip 

14import json 

15import logging 

16import os 

17import os.path 

18import struct 

19import sys 

20from collections.abc import Iterable, Iterator, MutableMapping 

21from typing import ( 

22 Any, 

23 BinaryIO, 

24 ClassVar, 

25 TextIO, 

26 Union, 

27 cast, 

28) 

29 

30from pdfminer.encodingdb import name2unicode 

31from pdfminer.pdfexceptions import PDFException, PDFTypeError 

32from pdfminer.psexceptions import PSEOF, PSSyntaxError 

33from pdfminer.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name 

34from pdfminer.utils import choplist, nunpack 

35 

# Module-level logger, named after this module so that applications can
# configure its output through the standard logging hierarchy.
log = logging.getLogger(__name__)

37 

38 

class CMapError(PDFException):
    """Base class for the CMap-related errors raised by this module."""

41 

42 

class CMapBase:
    """Common interface shared by the character maps defined in this module.

    Keyword arguments passed to the constructor are kept in ``attrs``. The
    ``add_code2cid``, ``add_cid2unichr`` and ``use_cmap`` hooks do nothing
    here, and ``decode`` raises ``NotImplementedError``; subclasses override
    whichever of them they support.
    """

    debug = 0

    def __init__(self, **kwargs: object) -> None:
        # Keep our own copy of the keyword arguments as the attribute table.
        self.attrs: MutableMapping[str, object] = dict(kwargs)

    def is_vertical(self) -> bool:
        """Return True when the ``WMode`` attribute compares unequal to 0.

        A missing ``WMode`` is treated as 0.
        """
        wmode = self.attrs.get("WMode", 0)
        return wmode != 0

    def set_attr(self, k: str, v: object) -> None:
        """Store ``v`` under the attribute name ``k``."""
        self.attrs[k] = v

    def add_code2cid(self, code: str, cid: int) -> None:
        """Hook for registering a code-to-CID entry; a no-op in this class."""

    def add_cid2unichr(self, cid: int, code: PSLiteral | bytes | int) -> None:
        """Hook for registering a CID-to-text entry; a no-op in this class."""

    def use_cmap(self, cmap: "CMapBase") -> None:
        """Hook for importing another map's entries; a no-op in this class."""

    def decode(self, code: bytes) -> Iterable[int]:
        """Decode ``code`` into CIDs; not implemented in this class.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

66 

67 

class CMap(CMapBase):
    """Character map backed by the nested ``code2cid`` dictionary.

    Each key of ``code2cid`` is one byte value. ``decode`` and ``dump`` treat
    an ``int`` value as a CID and any other value as a nested dictionary keyed
    by the following byte.
    """

    def __init__(self, **kwargs: str | int) -> None:
        CMapBase.__init__(self, **kwargs)
        self.code2cid: dict[int, object] = {}

    def __repr__(self) -> str:
        return f"<CMap: {self.attrs.get('CMapName')}>"

    def use_cmap(self, cmap: CMapBase) -> None:
        """Copy every entry of ``cmap.code2cid`` into this map's table.

        Nested dictionaries are duplicated rather than shared; a nested
        dictionary already stored here under the same key is replaced.
        """
        assert isinstance(cmap, CMap), str(type(cmap))

        def clone_into(target: dict[int, object], source: dict[int, object]) -> None:
            for key, value in source.items():
                if not isinstance(value, dict):
                    target[key] = value
                    continue
                branch: dict[int, object] = {}
                target[key] = branch
                clone_into(branch, value)

        clone_into(self.code2cid, cmap.code2cid)

    def decode(self, code: bytes) -> Iterator[int]:
        """Yield the CIDs found by walking ``code`` through ``code2cid``.

        The walk returns to the top-level table after every CID that is
        yielded and whenever a byte has no entry at the current level.
        """
        log.debug("decode: %r, %r", self, code)
        node = self.code2cid
        for byte in code:
            if byte not in node:
                # Unknown byte: drop the partial match and start over.
                node = self.code2cid
                continue
            target = node[byte]
            if isinstance(target, int):
                yield target
                node = self.code2cid
            else:
                node = cast(dict[int, object], target)

    def dump(
        self,
        out: TextIO = sys.stdout,
        code2cid: dict[int, object] | None = None,
        code: tuple[int, ...] = (),
    ) -> None:
        """Write one ``code (...) = cid N`` line per mapping to ``out``.

        Args:
            out: text stream that receives the listing.
            code2cid: table to list; when None the listing starts at
                ``self.code2cid`` and ``code`` is reset to ``()``.
            code: byte values leading to ``code2cid`` (used by the recursion).
        """
        if code2cid is None:
            code2cid, code = self.code2cid, ()
        for key, value in sorted(code2cid.items()):
            path = (*code, key)
            if not isinstance(value, int):
                self.dump(out=out, code2cid=cast(dict[int, object], value), code=path)
                continue
            out.write(f"code {path!r} = cid {value}\n")

119 

120 

class IdentityCMap(CMapBase):
    """CMap whose CIDs are the big-endian unsigned 16-bit values of the input."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Unpack ``code`` two bytes at a time; a trailing odd byte is ignored."""
        pairs = len(code) // 2
        if not pairs:
            return ()
        return struct.unpack(f">{pairs}H", code[: pairs * 2])

128 

129 

class IdentityCMapByte(IdentityCMap):
    """CMap whose CIDs are the individual byte values of the input."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Unpack every byte of ``code`` as one unsigned 8-bit CID."""
        count = len(code)
        return struct.unpack(f">{count}B", code[:count]) if count else ()

137 

138 

class UnicodeMap(CMapBase):
    """Map from CIDs to text, stored in the ``cid2unichr`` dictionary."""

    def __init__(self, **kwargs: str | int) -> None:
        CMapBase.__init__(self, **kwargs)
        self.cid2unichr: dict[int, str] = {}

    def __repr__(self) -> str:
        return f"<UnicodeMap: {self.attrs.get('CMapName')}>"

    def get_unichr(self, cid: int) -> str:
        """Return the text stored for ``cid``.

        Raises:
            KeyError: if ``cid`` has no entry in ``cid2unichr``.
        """
        log.debug("get_unichr: %r, %r", self, cid)
        return self.cid2unichr[cid]

    def dump(self, out: TextIO = sys.stdout) -> None:
        """Write one ``cid N = unicode '...'`` line per entry, sorted, to ``out``."""
        for cid, text in sorted(self.cid2unichr.items()):
            out.write(f"cid {cid} = unicode {text!r}\n")

154 

155 

class IdentityUnicodeMap(UnicodeMap):
    """Unicode map in which a CID is itself the Unicode code point."""

    def get_unichr(self, cid: int) -> str:
        """Interpret character id as unicode codepoint"""
        log.debug("get_unichr: %r, %r", self, cid)
        character = chr(cid)
        return character

161 

162 

class FileCMap(CMap):
    """CMap whose ``code2cid`` table is filled one entry at a time."""

    def add_code2cid(self, code: str, cid: int) -> None:
        """Map the character sequence ``code`` to ``cid``.

        Every character except the last selects (creating it when missing) a
        nested dictionary keyed by ``ord`` of that character; the last
        character keys the CID itself.
        """
        assert isinstance(code, str) and isinstance(cid, int), str(
            (type(code), type(cid)),
        )
        node = self.code2cid
        for char in code[:-1]:
            key = ord(char)
            if key not in node:
                fresh: dict[int, object] = {}
                node[key] = fresh
            node = cast(dict[int, object], node[key])
        node[ord(code[-1])] = cid

179 

180 

class FileUnicodeMap(UnicodeMap):
    """Unicode map whose ``cid2unichr`` table is filled one entry at a time."""

    def add_cid2unichr(self, cid: int, code: PSLiteral | bytes | int) -> None:
        """Record the text for ``cid``, derived from ``code`` by its type.

        Args:
            cid: character id to map.
            code: a ``PSLiteral`` (looked up as an Adobe glyph name), ``bytes``
                (decoded as UTF-16BE, ignoring errors) or an ``int`` (passed
                to ``chr``).

        Raises:
            PDFTypeError: if ``code`` is none of the accepted types.
        """
        assert isinstance(cid, int), str(type(cid))
        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
            assert isinstance(code.name, str)
            text = name2unicode(code.name)
        elif isinstance(code, bytes):
            # Interpret as UTF-16BE.
            text = code.decode("UTF-16BE", "ignore")
        elif isinstance(code, int):
            text = chr(code)
        else:
            raise PDFTypeError(code)

        # A0 = non-breaking space, some weird fonts can have a collision on a cid here.
        # An existing plain-space entry is kept rather than overwritten.
        nbsp_collision = text == "\u00a0" and self.cid2unichr.get(cid) == " "
        if not nbsp_collision:
            self.cid2unichr[cid] = text

200 

201 

class PyCMap(CMap):
    """CMap that takes its table from a preloaded data object."""

    def __init__(self, name: str, module: Any) -> None:
        """Adopt ``module.CODE2CID``; set ``WMode`` to 1 if ``module.IS_VERTICAL``.

        Args:
            name: stored as the ``CMapName`` attribute.
            module: object exposing ``CODE2CID`` and ``IS_VERTICAL``.
        """
        super().__init__(CMapName=name)
        # The table is used as-is, not copied.
        self.code2cid = module.CODE2CID
        vertical = module.IS_VERTICAL
        if vertical:
            self.attrs["WMode"] = 1

208 

209 

class PyUnicodeMap(UnicodeMap):
    """Unicode map that takes its table from a preloaded data object."""

    def __init__(self, name: str, module: Any, vertical: bool) -> None:
        """Adopt the horizontal or vertical table of ``module``.

        Args:
            name: stored as the ``CMapName`` attribute.
            module: object exposing ``CID2UNICHR_H`` and ``CID2UNICHR_V``;
                only the attribute for the requested direction is read.
            vertical: when true, use ``CID2UNICHR_V`` and set ``WMode`` to 1.
        """
        super().__init__(CMapName=name)
        if not vertical:
            self.cid2unichr = module.CID2UNICHR_H
            return
        self.cid2unichr = module.CID2UNICHR_V
        self.attrs["WMode"] = 1

218 

219 

class CMapDB:
    """Loader for the packaged CMap data files, with class-level caches."""

    # Loaded maps, keyed by the name they were requested under.
    _cmap_cache: ClassVar[dict[str, PyCMap]] = {}
    # Per name: [horizontal map, vertical map], indexed by the ``vertical`` flag.
    _umap_cache: ClassVar[dict[str, list[PyUnicodeMap]]] = {}

    class CMapNotFound(CMapError):
        """Raised by ``_load_data`` when no data file matches the name."""

    @staticmethod
    def _convert_code2cid_keys(
        d: Union[dict[str, object], int],
    ) -> Union[dict[int, object], int]:
        """Recursively convert string keys to integers in CODE2CID dictionaries.

        A non-dictionary argument is returned unchanged. Keys that ``int``
        rejects are kept as they are.
        """
        if not isinstance(d, dict):
            return d
        converted: dict[int, object] = {}
        for raw_key, value in d.items():
            try:
                key = int(raw_key)
            except (ValueError, TypeError):
                key = raw_key  # type: ignore[assignment]
            converted[key] = (
                CMapDB._convert_code2cid_keys(value)
                if isinstance(value, dict)
                else value
            )
        return converted

    @classmethod
    def _load_data(cls, name: str) -> type[Any]:
        """Load ``<name>.json.gz`` and return its fields as class attributes.

        The directory named by the ``CMAP_PATH`` environment variable
        (default ``/usr/share/pdfminer/``) is searched first, then the
        ``cmap`` directory next to this module.

        Raises:
            CMapDB.CMapNotFound: if neither directory holds the file.
        """
        name = name.replace("\0", "")
        log.debug("loading: %r", name)
        cmap_paths = (
            os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
            os.path.join(os.path.dirname(__file__), "cmap"),
        )

        for directory in cmap_paths:
            # Load JSON format (secure)
            json_path = os.path.join(directory, f"{name}.json.gz")
            resolved_json_path = os.path.realpath(json_path)
            resolved_directory = os.path.realpath(directory)

            # Only accept a file that, once resolved, still lives inside the
            # directory being searched, and that actually exists.
            inside = resolved_json_path.startswith(resolved_directory + os.sep)
            if not (inside and os.path.exists(resolved_json_path)):
                continue

            log.debug("loading JSON: %r", json_path)
            with gzip.open(resolved_json_path, "rt", encoding="utf-8") as gzfile:
                data: dict[str, Any] = json.load(gzfile)
            # JSON object keys are strings; the CID tables need integer keys.
            for table in ("CID2UNICHR_H", "CID2UNICHR_V"):
                if table in data:
                    data[table] = {int(k): v for k, v in data[table].items()}
            # CODE2CID may also have numeric keys that need conversion
            if data.get("CODE2CID"):
                data["CODE2CID"] = cls._convert_code2cid_keys(data["CODE2CID"])
            return type(str(name), (), data)

        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name: str) -> CMapBase:
        """Return the CMap called ``name``.

        The four identity names produce a new identity map on every call;
        any other name is loaded once and then served from the cache.

        Raises:
            CMapDB.CMapNotFound: if ``name`` has no data file.
        """
        identity_maps: dict[str, tuple[type[CMapBase], int]] = {
            "Identity-H": (IdentityCMap, 0),
            "Identity-V": (IdentityCMap, 1),
            "OneByteIdentityH": (IdentityCMapByte, 0),
            "OneByteIdentityV": (IdentityCMapByte, 1),
        }
        if name in identity_maps:
            factory, wmode = identity_maps[name]
            return factory(WMode=wmode)

        cached = cls._cmap_cache.get(name)
        if cached is not None:
            return cached
        cmap = PyCMap(name, cls._load_data(name))
        cls._cmap_cache[name] = cmap
        return cmap

    @classmethod
    def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
        """Return the Unicode map for ``name`` in the requested direction.

        The data file ``to-unicode-<name>`` is loaded on first use and both
        the horizontal and the vertical map are cached together.

        Raises:
            CMapDB.CMapNotFound: if the data file cannot be found.
        """
        pair = cls._umap_cache.get(name)
        if pair is None:
            data = cls._load_data(f"to-unicode-{name}")
            pair = [PyUnicodeMap(name, data, v) for v in (False, True)]
            cls._umap_cache[name] = pair
        return pair[vertical]

314 

315 

class CMapParser(PSStackParser[PSKeyword]):
    """Stack parser that feeds the operators of a CMap stream into ``cmap``.

    Operands accumulate on the parser stack; ``do_keyword`` consumes them
    when an operator keyword arrives and forwards the result to ``cmap``.
    Entries that do not have the expected types are skipped with a warning
    that is logged once per distinct message.
    """

    def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # some ToUnicode maps don't have "begincmap" keyword.
        self._in_cmap = True
        # Messages already reported through _warn_once.
        self._warnings: set[str] = set()

    def run(self) -> None:
        """Run the parser; a ``PSEOF`` raised by ``nextobject`` is suppressed."""
        with contextlib.suppress(PSEOF):
            self.nextobject()

    KEYWORD_BEGINCMAP = KWD(b"begincmap")
    KEYWORD_ENDCMAP = KWD(b"endcmap")
    KEYWORD_USECMAP = KWD(b"usecmap")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
    KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
    KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
    KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
    KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
    KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
    KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
    KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
    KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
    KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
    KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
    KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")

    # Keywords whose only effect is to discard the operands collected so far.
    _DISCARD_KEYWORDS = (
        KEYWORD_BEGINCODESPACERANGE,
        KEYWORD_ENDCODESPACERANGE,
        KEYWORD_BEGINCIDRANGE,
        KEYWORD_BEGINCIDCHAR,
        KEYWORD_BEGINBFRANGE,
        KEYWORD_BEGINBFCHAR,
        KEYWORD_BEGINNOTDEFRANGE,
        KEYWORD_ENDNOTDEFRANGE,
    )

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """ToUnicode CMaps

        See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.

        Dispatch on ``token``: the CMap operators handled below consume the
        operand stack; any other keyword is pushed back onto the stack as
        ``(pos, token)``. Everything except ``begincmap``/``endcmap`` is
        ignored while the parser is outside a CMap.
        """
        if token is self.KEYWORD_BEGINCMAP:
            self._in_cmap = True
            self.popall()
            return
        if token is self.KEYWORD_ENDCMAP:
            self._in_cmap = False
            return

        if not self._in_cmap:
            return

        if token is self.KEYWORD_DEF:
            # A malformed key is skipped on purpose: parsing is best-effort.
            with contextlib.suppress(PSSyntaxError):
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            return

        if token is self.KEYWORD_USECMAP:
            # A malformed or unknown base CMap is skipped on purpose.
            with contextlib.suppress(PSSyntaxError, CMapDB.CMapNotFound):
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            return

        if any(token is keyword for keyword in self._DISCARD_KEYWORDS):
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDRANGE:
            self._add_cid_ranges([obj for (_, obj) in self.popall()])
            return

        if token is self.KEYWORD_ENDCIDCHAR:
            objs = [obj for (_, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(code, bytes) and isinstance(cid, int):
                    self.cmap.add_cid2unichr(cid, code)
            return

        if token is self.KEYWORD_ENDBFRANGE:
            self._add_bf_ranges([obj for (_, obj) in self.popall()])
            return

        if token is self.KEYWORD_ENDBFCHAR:
            objs = [obj for (_, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(cid, bytes) and isinstance(code, bytes):
                    self.cmap.add_cid2unichr(nunpack(cid), code)
            return

        self.push((pos, token))

    def _add_cid_ranges(self, objs: list[Any]) -> None:
        """Process the ``(start, end, cid)`` triples of an ``endcidrange``."""
        for start_byte, end_byte, cid in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object of begincidrange is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object of begincidrange is not a byte.")
                continue
            if not isinstance(cid, int):
                # The check is for an integer, so the message says so.
                self._warn_once("The cid object of begincidrange is not an integer.")
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once(
                    "The start and end byte of begincidrange have "
                    "different lengths.",
                )
                continue
            # Only the last (up to) four bytes vary across the range; whatever
            # precedes them must be identical in both bounds.
            start_prefix = start_byte[:-4]
            end_prefix = end_byte[:-4]
            if start_prefix != end_prefix:
                self._warn_once(
                    "The prefix of the start and end byte of "
                    "begincidrange are not the same.",
                )
                continue
            svar = start_byte[-4:]
            evar = end_byte[-4:]
            start = nunpack(svar)
            end = nunpack(evar)
            vlen = len(svar)
            for i in range(end - start + 1):
                # Rebuild the code at offset i with the same width as the bounds.
                x = start_prefix + struct.pack(">L", start + i)[-vlen:]
                self.cmap.add_cid2unichr(cid + i, x)

    def _add_bf_ranges(self, objs: list[Any]) -> None:
        """Process the ``(start, end, destination)`` triples of an ``endbfrange``.

        The destination is either a list with one value per code in the range
        or a byte string whose last (up to) four bytes are incremented along
        the range.
        """
        for start_byte, end_byte, code in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object is not a byte.")
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once("The start and end byte have different lengths.")
                continue
            start = nunpack(start_byte)
            end = nunpack(end_byte)
            if isinstance(code, list):
                if len(code) != end - start + 1:
                    self._warn_once(
                        "The difference between the start and end "
                        "offsets does not match the code length.",
                    )
                # On a length mismatch only the pairs that exist are mapped.
                for cid, unicode_value in zip(
                    range(start, end + 1), code, strict=False
                ):
                    self.cmap.add_cid2unichr(cid, unicode_value)
            elif isinstance(code, bytes):
                var = code[-4:]
                base = nunpack(var)
                prefix = code[:-4]
                vlen = len(var)
                for i in range(end - start + 1):
                    x = prefix + struct.pack(">L", base + i)[-vlen:]
                    self.cmap.add_cid2unichr(start + i, x)
            else:
                # The destination comes from the PDF, so a wrong type is
                # reported and skipped like the other malformed entries
                # instead of aborting the whole parse with an assertion.
                self._warn_once(
                    "The code object of beginbfrange is neither a byte "
                    "string nor a list.",
                )
                continue

    def _warn_once(self, msg: str) -> None:
        """Warn once for each unique message"""
        if msg in self._warnings:
            return
        self._warnings.add(msg)
        base_msg = (
            "Ignoring (part of) ToUnicode map because the PDF data "
            "does not conform to the format. This could result in "
            "(cid) values in the output. "
        )
        log.warning(base_msg + msg)