Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/cmapdb.py: 79%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

338 statements  

1"""Adobe character mapping (CMap) support. 

2 

3CMaps provide the mapping between character codes and Unicode 

4code-points to character ids (CIDs). 

5 

6More information is available on: 

7 

8 https://github.com/adobe-type-tools/cmap-resources 

9 

10""" 

11 

12import gzip 

13import logging 

14import os 

15import os.path 

16import pickle as pickle 

17import struct 

18import sys 

19from typing import ( 

20 Any, 

21 BinaryIO, 

22 Dict, 

23 Iterable, 

24 Iterator, 

25 List, 

26 MutableMapping, 

27 Optional, 

28 Set, 

29 TextIO, 

30 Tuple, 

31 Union, 

32 cast, 

33) 

34 

35from pdfminer.encodingdb import name2unicode 

36from pdfminer.pdfexceptions import PDFException, PDFTypeError 

37from pdfminer.psexceptions import PSEOF, PSSyntaxError 

38from pdfminer.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name 

39from pdfminer.utils import choplist, nunpack 

40 

41log = logging.getLogger(__name__) 

42 

43 

class CMapError(PDFException):
    """Root of the CMap-specific exceptions declared in this module."""

46 

47 

class CMapBase:
    """Shared base for the character-map classes in this module.

    Stores a mutable attribute mapping and declares hook methods that do
    nothing here; ``decode`` has no default implementation.
    """

    debug = 0

    def __init__(self, **kwargs: object) -> None:
        # Take a private copy of the keyword arguments as the attribute store.
        self.attrs: MutableMapping[str, object] = dict(kwargs)

    def is_vertical(self) -> bool:
        """Return True when the ``WMode`` attribute is set and is not 0."""
        wmode = self.attrs.get("WMode", 0)
        return wmode != 0

    def set_attr(self, k: str, v: object) -> None:
        """Store ``v`` under key ``k`` in the attribute mapping."""
        self.attrs[k] = v

    def add_code2cid(self, code: str, cid: int) -> None:
        """Hook for registering a code -> CID entry; a no-op in the base."""

    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
        """Hook for registering a CID -> unicode entry; a no-op in the base."""

    def use_cmap(self, cmap: "CMapBase") -> None:
        """Hook for importing another map's entries; a no-op in the base."""

    def decode(self, code: bytes) -> Iterable[int]:
        """Translate ``code`` into CIDs; subclasses must provide this."""
        raise NotImplementedError

71 

72 

class CMap(CMapBase):
    """Character map kept as a nested dict.

    ``code2cid`` maps an integer key either to an ``int`` CID (a leaf) or to
    another dict of the same shape (one more code byte to consume).
    """

    def __init__(self, **kwargs: Union[str, int]) -> None:
        super().__init__(**kwargs)
        self.code2cid: Dict[int, object] = {}

    def __repr__(self) -> str:
        cmap_name = self.attrs.get("CMapName")
        return "<CMap: %s>" % cmap_name

    def use_cmap(self, cmap: CMapBase) -> None:
        """Deep-copy every entry of ``cmap.code2cid`` into this map.

        Keys already present in this map are overwritten, including whole
        nested sub-tables.
        """
        assert isinstance(cmap, CMap), str(type(cmap))

        def clone_into(
            target: Dict[int, object],
            source: Dict[int, object],
        ) -> None:
            for key, value in source.items():
                if not isinstance(value, dict):
                    target[key] = value
                    continue
                # Nested table: install a fresh dict first, then fill it.
                branch: Dict[int, object] = {}
                target[key] = branch
                clone_into(branch, value)

        clone_into(self.code2cid, cmap.code2cid)

    def decode(self, code: bytes) -> Iterator[int]:
        """Yield the CIDs found by walking ``code`` through the nested table.

        A byte that has no entry at the current level resets the walk to the
        top-level table without yielding anything.
        """
        log.debug("decode: %r, %r", self, code)
        node = self.code2cid
        for byte in code:
            if byte not in node:
                node = self.code2cid
                continue
            entry = node[byte]
            if isinstance(entry, int):
                yield entry
                node = self.code2cid
            else:
                node = cast(Dict[int, object], entry)

    def dump(
        self,
        out: TextIO = sys.stdout,
        code2cid: Optional[Dict[int, object]] = None,
        code: Tuple[int, ...] = (),
    ) -> None:
        """Write one ``code ... = cid ...`` line per leaf, in sorted key order.

        ``code2cid`` and ``code`` carry the current sub-table and the key
        path leading to it during recursion; when ``code2cid`` is omitted the
        dump starts from the top-level table with an empty path.
        """
        if code2cid is None:
            code2cid = self.code2cid
            code = ()
        for key, value in sorted(code2cid.items()):
            path = code + (key,)
            if not isinstance(value, int):
                self.dump(out=out, code2cid=cast(Dict[int, object], value), code=path)
                continue
            out.write("code %r = cid %d\n" % (path, value))

124 

125 

class IdentityCMap(CMapBase):
    """CMap whose CIDs are the big-endian 16-bit values of the input itself."""

    def decode(self, code: bytes) -> Tuple[int, ...]:
        """Split ``code`` into unsigned big-endian 2-byte integers.

        Args:
            code: Raw character-code bytes.

        Returns:
            One integer per complete 2-byte pair, or an empty tuple when
            ``code`` holds fewer than two bytes. A trailing odd byte is
            ignored.
        """
        n = len(code) // 2
        if not n:
            return ()
        # struct.unpack requires the buffer to match the format size exactly
        # and raised struct.error for odd-length input; unpack_from only
        # needs the buffer to be at least that large, so the dangling byte is
        # skipped, matching the floor division above.
        return struct.unpack_from(">%dH" % n, code)

133 

134 

class IdentityCMapByte(IdentityCMap):
    """Identity CMap variant that treats every single byte as one CID."""

    def decode(self, code: bytes) -> Tuple[int, ...]:
        """Return each byte of ``code`` as an unsigned integer, in order."""
        count = len(code)
        return struct.unpack(">%dB" % count, code) if count else ()

142 

143 

class UnicodeMap(CMapBase):
    """Map from CID to a unicode string, held in ``cid2unichr``."""

    def __init__(self, **kwargs: Union[str, int]) -> None:
        super().__init__(**kwargs)
        self.cid2unichr: Dict[int, str] = {}

    def __repr__(self) -> str:
        cmap_name = self.attrs.get("CMapName")
        return "<UnicodeMap: %s>" % cmap_name

    def get_unichr(self, cid: int) -> str:
        """Return the string stored for ``cid``; a missing cid raises KeyError."""
        log.debug("get_unichr: %r, %r", self, cid)
        return self.cid2unichr[cid]

    def dump(self, out: TextIO = sys.stdout) -> None:
        """Write one ``cid ... = unicode ...`` line per entry, sorted by cid."""
        table = self.cid2unichr
        for cid in sorted(table):
            out.write("cid %d = unicode %r\n" % (cid, table[cid]))

159 

160 

class IdentityUnicodeMap(UnicodeMap):
    """Unicode map in which the CID is itself the unicode code point."""

    def get_unichr(self, cid: int) -> str:
        """Interpret character id as unicode codepoint"""
        log.debug("get_unichr: %r, %r", self, cid)
        character = chr(cid)
        return character

166 

167 

class FileCMap(CMap):
    """CMap whose nested table is built one entry at a time."""

    def add_code2cid(self, code: str, cid: int) -> None:
        """Insert ``cid`` at the path spelled by the characters of ``code``.

        Every character except the last selects (creating it when absent) a
        nested table keyed by ``ord(char)``; the last character's ordinal
        becomes the key that holds ``cid``.
        """
        assert isinstance(code, str) and isinstance(cid, int), str(
            (type(code), type(cid)),
        )
        node = self.code2cid
        for char in code[:-1]:
            key = ord(char)
            if key not in node:
                node[key] = {}
            node = cast(Dict[int, object], node[key])
        node[ord(code[-1])] = cid

184 

185 

class FileUnicodeMap(UnicodeMap):
    """Unicode map whose entries are added one CID at a time."""

    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
        """Record the unicode string for ``cid``.

        ``code`` may be a ``PSLiteral`` (resolved through ``name2unicode``),
        ``bytes`` (decoded as UTF-16BE, ignoring errors) or an ``int``
        (converted with ``chr``). Any other type raises ``PDFTypeError``.
        """
        assert isinstance(cid, int), str(type(cid))
        if not isinstance(code, (PSLiteral, bytes, int)):
            raise PDFTypeError(code)

        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
            assert isinstance(code.name, str)
            text = name2unicode(code.name)
        elif isinstance(code, bytes):
            # Interpret as UTF-16BE.
            text = code.decode("UTF-16BE", "ignore")
        else:
            text = chr(code)

        # A0 = non-breaking space, some weird fonts can have a collision on a cid here.
        # An existing plain-space mapping is kept rather than overwritten.
        shadows_space = text == "\u00a0" and self.cid2unichr.get(cid) == " "
        if not shadows_space:
            self.cid2unichr[cid] = text

205 

206 

class PyCMap(CMap):
    """CMap backed by the ``CODE2CID`` table of a loaded data object."""

    def __init__(self, name: str, module: Any) -> None:
        CMap.__init__(self, CMapName=name)
        # Use the module's table directly; it is not copied.
        self.code2cid = module.CODE2CID
        vertical = module.IS_VERTICAL
        if vertical:
            self.attrs["WMode"] = 1

213 

214 

class PyUnicodeMap(UnicodeMap):
    """Unicode map backed by a loaded data object's CID2UNICHR tables."""

    def __init__(self, name: str, module: Any, vertical: bool) -> None:
        UnicodeMap.__init__(self, CMapName=name)
        if not vertical:
            self.cid2unichr = module.CID2UNICHR_H
            return
        # Vertical writing mode: use the V table and flag it in the attrs.
        self.cid2unichr = module.CID2UNICHR_V
        self.attrs["WMode"] = 1

223 

224 

class CMapDB:
    """Loader and class-level cache for the pickled CMap data files."""

    _cmap_cache: Dict[str, PyCMap] = {}
    _umap_cache: Dict[str, List[PyUnicodeMap]] = {}

    class CMapNotFound(CMapError):
        """Raised when no data file for the requested name can be found."""

    @classmethod
    def _load_data(cls, name: str) -> Any:
        """Load ``<name>.pickle.gz`` and return its contents as a class.

        The directory named by the ``CMAP_PATH`` environment variable
        (default ``/usr/share/pdfminer/``) is searched first, then the
        ``cmap`` directory next to this file. NUL characters are stripped
        from ``name`` before it is used.

        Raises:
            CMapDB.CMapNotFound: no candidate file exists inside either
                directory.
        """
        name = name.replace("\0", "")
        filename = "%s.pickle.gz" % name
        log.debug("loading: %r", name)
        search_dirs = (
            os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
            os.path.join(os.path.dirname(__file__), "cmap"),
        )
        for directory in search_dirs:
            # Resolve paths to prevent directory traversal
            resolved_path = os.path.realpath(os.path.join(directory, filename))
            resolved_directory = os.path.realpath(directory)
            # Only accept a file that really lives inside the directory.
            is_inside = resolved_path.startswith(resolved_directory + os.sep)
            if is_inside and os.path.exists(resolved_path):
                # NOTE: the file content is unpickled, so these directories
                # must only contain trusted data.
                with gzip.open(resolved_path) as gzfile:
                    return type(str(name), (), pickle.loads(gzfile.read()))
        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name: str) -> CMapBase:
        """Return the CMap called ``name``.

        The four identity names produce a new identity map on every call;
        any other name is loaded through ``_load_data`` once and then served
        from ``_cmap_cache``.
        """
        identity_maps = {
            "Identity-H": (IdentityCMap, 0),
            "Identity-V": (IdentityCMap, 1),
            "OneByteIdentityH": (IdentityCMapByte, 0),
            "OneByteIdentityV": (IdentityCMapByte, 1),
        }
        if name in identity_maps:
            factory, wmode = identity_maps[name]
            return factory(WMode=wmode)
        if name in cls._cmap_cache:
            return cls._cmap_cache[name]
        data = cls._load_data(name)
        cmap = PyCMap(name, data)
        cls._cmap_cache[name] = cmap
        return cmap

    @classmethod
    def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
        """Return the horizontal or vertical unicode map for ``name``.

        Both variants are built from the ``to-unicode-<name>`` data on first
        use and cached as a two-item list indexed by ``vertical``.
        """
        if name not in cls._umap_cache:
            data = cls._load_data("to-unicode-%s" % name)
            cls._umap_cache[name] = [
                PyUnicodeMap(name, data, v) for v in (False, True)
            ]
        return cls._umap_cache[name][vertical]

284 

285 

class CMapParser(PSStackParser[PSKeyword]):
    """Stack parser that applies the operators of a CMap stream to a map.

    Operands are collected on the parser stack; when an ``end...`` keyword
    arrives, the collected operands are validated and forwarded to the
    ``cmap`` object given to the constructor. Malformed entries are skipped
    with a warning that is logged once per distinct message.
    """

    def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # some ToUnicode maps don't have "begincmap" keyword.
        self._in_cmap = True
        # Messages already emitted by _warn_once().
        self._warnings: Set[str] = set()

    def run(self) -> None:
        """Call ``nextobject()`` and treat end of input (PSEOF) as normal."""
        try:
            self.nextobject()
        except PSEOF:
            pass

    KEYWORD_BEGINCMAP = KWD(b"begincmap")
    KEYWORD_ENDCMAP = KWD(b"endcmap")
    KEYWORD_USECMAP = KWD(b"usecmap")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
    KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
    KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
    KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
    KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
    KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
    KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
    KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
    KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
    KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
    KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
    KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """ToUnicode CMaps

        See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.

        Handles one keyword token. ``begin...`` keywords clear the operand
        stack, ``end...`` keywords consume it, and any keyword not recognised
        here is pushed onto the stack as an operand. Everything except
        ``begincmap``/``endcmap`` is ignored while outside a cmap section.
        """
        if token is self.KEYWORD_BEGINCMAP:
            self._in_cmap = True
            self.popall()
            return

        elif token is self.KEYWORD_ENDCMAP:
            self._in_cmap = False
            return

        if not self._in_cmap:
            return

        if token is self.KEYWORD_DEF:
            # "<name> <value> def" stores an attribute on the map.
            try:
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            except PSSyntaxError:
                pass
            return

        if token is self.KEYWORD_USECMAP:
            # "<name> usecmap" imports another map; unknown names are ignored.
            try:
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            except PSSyntaxError:
                pass
            except CMapDB.CMapNotFound:
                pass
            return

        if token is self.KEYWORD_BEGINCODESPACERANGE:
            self.popall()
            return
        if token is self.KEYWORD_ENDCODESPACERANGE:
            self.popall()
            return

        if token is self.KEYWORD_BEGINCIDRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDRANGE:
            objs = [obj for (__, obj) in self.popall()]
            for start_byte, end_byte, cid in choplist(3, objs):
                if not isinstance(start_byte, bytes):
                    self._warn_once("The start object of begincidrange is not a byte.")
                    continue
                if not isinstance(end_byte, bytes):
                    self._warn_once("The end object of begincidrange is not a byte.")
                    continue
                if not isinstance(cid, int):
                    # The check is for an int, so the message says so.
                    self._warn_once(
                        "The cid object of begincidrange is not an integer.",
                    )
                    continue
                if len(start_byte) != len(end_byte):
                    self._warn_once(
                        "The start and end byte of begincidrange have "
                        "different lengths.",
                    )
                    continue
                # Only the last (up to) four bytes vary across the range; the
                # bytes before them must be identical in start and end.
                start_prefix = start_byte[:-4]
                end_prefix = end_byte[:-4]
                if start_prefix != end_prefix:
                    self._warn_once(
                        "The prefix of the start and end byte of "
                        "begincidrange are not the same.",
                    )
                    continue
                svar = start_byte[-4:]
                evar = end_byte[-4:]
                start = nunpack(svar)
                end = nunpack(evar)
                vlen = len(svar)
                for i in range(end - start + 1):
                    x = start_prefix + struct.pack(">L", start + i)[-vlen:]
                    self.cmap.add_cid2unichr(cid + i, x)
            return

        if token is self.KEYWORD_BEGINCIDCHAR:
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDCHAR:
            objs = [obj for (__, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                # Pairs with unexpected operand types are skipped silently.
                if isinstance(code, bytes) and isinstance(cid, int):
                    self.cmap.add_cid2unichr(cid, code)
            return

        if token is self.KEYWORD_BEGINBFRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDBFRANGE:
            objs = [obj for (__, obj) in self.popall()]
            for start_byte, end_byte, code in choplist(3, objs):
                if not isinstance(start_byte, bytes):
                    self._warn_once("The start object is not a byte.")
                    continue
                if not isinstance(end_byte, bytes):
                    self._warn_once("The end object is not a byte.")
                    continue
                if len(start_byte) != len(end_byte):
                    self._warn_once("The start and end byte have different lengths.")
                    continue
                start = nunpack(start_byte)
                end = nunpack(end_byte)
                if isinstance(code, list):
                    # Explicit list: one destination per source code. A
                    # length mismatch is reported, and zip() then maps only
                    # as many entries as both sides provide.
                    if len(code) != end - start + 1:
                        self._warn_once(
                            "The difference between the start and end "
                            "offsets does not match the code length.",
                        )
                    for cid, unicode_value in zip(range(start, end + 1), code):
                        self.cmap.add_cid2unichr(cid, unicode_value)
                elif isinstance(code, bytes):
                    # Single base value: its last (up to) four bytes are
                    # incremented once per source code in the range.
                    var = code[-4:]
                    base = nunpack(var)
                    prefix = code[:-4]
                    vlen = len(var)
                    for i in range(end - start + 1):
                        x = prefix + struct.pack(">L", base + i)[-vlen:]
                        self.cmap.add_cid2unichr(start + i, x)
                else:
                    # Malformed operand from the PDF: warn and skip like the
                    # other validation failures instead of asserting, which
                    # would abort parsing (and is stripped under -O).
                    self._warn_once(
                        "The code object of beginbfrange is neither a byte "
                        "string nor a list.",
                    )
                    continue
            return

        if token is self.KEYWORD_BEGINBFCHAR:
            self.popall()
            return

        if token is self.KEYWORD_ENDBFCHAR:
            objs = [obj for (__, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                # Both operands must be byte strings; others are skipped.
                if isinstance(cid, bytes) and isinstance(code, bytes):
                    self.cmap.add_cid2unichr(nunpack(cid), code)
            return

        if token is self.KEYWORD_BEGINNOTDEFRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDNOTDEFRANGE:
            self.popall()
            return

        # Not an operator handled above: keep it on the stack as an operand.
        self.push((pos, token))

    def _warn_once(self, msg: str) -> None:
        """Warn once for each unique message"""
        if msg not in self._warnings:
            self._warnings.add(msg)
            base_msg = (
                "Ignoring (part of) ToUnicode map because the PDF data "
                "does not conform to the format. This could result in "
                "(cid) values in the output. "
            )
            log.warning(base_msg + msg)