Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfminer/cmapdb.py: 25%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

336 statements  

1"""Adobe character mapping (CMap) support. 

2 

3CMaps provide the mapping between character codes and Unicode 

4code-points to character ids (CIDs). 

5 

6More information is available on: 

7 

8 https://github.com/adobe-type-tools/cmap-resources 

9 

10""" 

11 

12import contextlib 

13import gzip 

14import logging 

15import os 

16import os.path 

17import pickle as pickle 

18import struct 

19import sys 

20from collections.abc import Iterable, Iterator, MutableMapping 

21from typing import ( 

22 Any, 

23 BinaryIO, 

24 ClassVar, 

25 TextIO, 

26 cast, 

27) 

28 

29from pdfminer.encodingdb import name2unicode 

30from pdfminer.pdfexceptions import PDFException, PDFTypeError 

31from pdfminer.psexceptions import PSEOF, PSSyntaxError 

32from pdfminer.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name 

33from pdfminer.utils import choplist, nunpack 

34 

35log = logging.getLogger(__name__) 

36 

37 

class CMapError(PDFException):
    """Exception type for CMap problems; ``CMapDB.CMapNotFound`` derives from it."""

40 

41 

class CMapBase:
    """Common interface for the CMap and unicode-map classes in this module.

    Keyword arguments given to the constructor are kept in ``attrs``. The
    ``add_code2cid``, ``add_cid2unichr`` and ``use_cmap`` hooks do nothing
    here, and ``decode`` raises ``NotImplementedError``.
    """

    debug = 0

    def __init__(self, **kwargs: object) -> None:
        # Store a private copy of the attributes.
        self.attrs: MutableMapping[str, object] = dict(kwargs)

    def is_vertical(self) -> bool:
        """Return True when the ``WMode`` attribute is set and differs from 0."""
        wmode = self.attrs.get("WMode", 0)
        return wmode != 0

    def set_attr(self, k: str, v: object) -> None:
        """Store attribute ``k`` with value ``v``, replacing any earlier value."""
        self.attrs[k] = v

    def add_code2cid(self, code: str, cid: int) -> None:
        """Hook for recording a code -> cid entry; a no-op in this base class."""

    def add_cid2unichr(self, cid: int, code: PSLiteral | bytes | int) -> None:
        """Hook for recording a cid -> unicode entry; a no-op in this base class."""

    def use_cmap(self, cmap: "CMapBase") -> None:
        """Hook for importing another map's entries; a no-op in this base class."""

    def decode(self, code: bytes) -> Iterable[int]:
        """Decode ``code`` into cids; not implemented in this base class."""
        raise NotImplementedError

65 

66 

class CMap(CMapBase):
    """Code -> cid map stored in ``code2cid`` as nested dicts keyed by byte value.

    An ``int`` value in the nested structure is a cid; any other value is
    treated as the dict for the next byte of a multi-byte code.
    """

    def __init__(self, **kwargs: str | int) -> None:
        CMapBase.__init__(self, **kwargs)
        self.code2cid: dict[int, object] = {}

    def __repr__(self) -> str:
        return f"<CMap: {self.attrs.get('CMapName')}>"

    def use_cmap(self, cmap: CMapBase) -> None:
        """Copy every entry of ``cmap.code2cid`` into this map.

        Nested dicts are duplicated rather than shared, so later changes to
        this map do not touch ``cmap``.
        """
        assert isinstance(cmap, CMap), str(type(cmap))

        def clone(tree: dict[int, object]) -> dict[int, object]:
            # Rebuild each nested dict; leaf values are reused as-is.
            return {
                key: clone(value) if isinstance(value, dict) else value
                for key, value in tree.items()
            }

        self.code2cid.update(clone(cmap.code2cid))

    def decode(self, code: bytes) -> Iterator[int]:
        """Yield the cid of each code found while walking ``code`` byte by byte.

        A byte with no entry at the current position resets the walk to the
        top of the map and is otherwise skipped.
        """
        log.debug("decode: %r, %r", self, code)
        node = self.code2cid
        for byte in code:
            if byte not in node:
                node = self.code2cid
                continue
            entry = node[byte]
            if isinstance(entry, int):
                yield entry
                node = self.code2cid
            else:
                # Not a cid yet: descend to the table for the next byte.
                node = cast(dict[int, object], entry)

    def dump(
        self,
        out: TextIO = sys.stdout,
        code2cid: dict[int, object] | None = None,
        code: tuple[int, ...] = (),
    ) -> None:
        """Write one ``code (...) = cid N`` line per entry, in sorted key order.

        ``code2cid`` and ``code`` carry the current sub-table and byte prefix
        during recursion; callers normally leave them at their defaults.
        """
        if code2cid is None:
            # Top-level call: start at the root with an empty prefix.
            code2cid, code = self.code2cid, ()
        for byte, entry in sorted(code2cid.items()):
            path = (*code, byte)
            if not isinstance(entry, int):
                self.dump(out=out, code2cid=cast(dict[int, object], entry), code=path)
                continue
            out.write(f"code {path!r} = cid {entry}\n")

118 

119 

class IdentityCMap(CMapBase):
    """CMap that reads each pair of bytes as one big-endian 16-bit value."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Return the unsigned 16-bit values in ``code``; a trailing odd byte is dropped."""
        pairs = len(code) // 2
        if not pairs:
            return ()
        return struct.unpack(f">{pairs}H", code[: pairs * 2])

127 

128 

class IdentityCMapByte(IdentityCMap):
    """Identity CMap variant in which every single byte is its own value."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Return the value of each byte in ``code`` as a tuple of ints."""
        count = len(code)
        if not count:
            return ()
        return struct.unpack(f">{count}B", code[:count])

136 

137 

class UnicodeMap(CMapBase):
    """Map from character ids (cids) to unicode strings, held in ``cid2unichr``."""

    def __init__(self, **kwargs: str | int) -> None:
        CMapBase.__init__(self, **kwargs)
        self.cid2unichr: dict[int, str] = {}

    def __repr__(self) -> str:
        return f"<UnicodeMap: {self.attrs.get('CMapName')}>"

    def get_unichr(self, cid: int) -> str:
        """Return the string stored for ``cid``; lookup errors propagate to the caller."""
        log.debug("get_unichr: %r, %r", self, cid)
        return self.cid2unichr[cid]

    def dump(self, out: TextIO = sys.stdout) -> None:
        """Write one ``cid N = unicode '...'`` line per entry, in sorted order."""
        for cid, text in sorted(self.cid2unichr.items()):
            out.write(f"cid {cid} = unicode {text!r}\n")

153 

154 

class IdentityUnicodeMap(UnicodeMap):
    """Unicode map that treats the cid itself as the unicode code point."""

    def get_unichr(self, cid: int) -> str:
        """Interpret character id as unicode codepoint"""
        log.debug("get_unichr: %r, %r", self, cid)
        character = chr(cid)
        return character

160 

161 

class FileCMap(CMap):
    """CMap whose code -> cid entries are added one at a time."""

    def add_code2cid(self, code: str, cid: int) -> None:
        """Store ``cid`` under ``code``, one nesting level per character.

        Each character of ``code`` is converted with ``ord`` and used as the
        key at its level; the last character's key holds the cid.
        """
        assert isinstance(code, str) and isinstance(cid, int), str(
            (type(code), type(cid)),
        )
        node = self.code2cid
        # Walk (creating as needed) one nested dict per leading character.
        for char in code[:-1]:
            node = cast(dict[int, object], node.setdefault(ord(char), {}))
        node[ord(code[-1])] = cid

178 

179 

class FileUnicodeMap(UnicodeMap):
    """Unicode map whose cid -> unicode entries are added one at a time."""

    def add_cid2unichr(self, cid: int, code: PSLiteral | bytes | int) -> None:
        """Record the unicode text for ``cid``.

        ``code`` is converted according to its type: a ``PSLiteral`` is
        looked up as an Adobe glyph name, ``bytes`` are decoded as UTF-16BE
        (undecodable input is ignored) and an ``int`` is passed to ``chr``.
        Any other type raises ``PDFTypeError``.
        """
        assert isinstance(cid, int), str(type(cid))
        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
            assert isinstance(code.name, str)
            text = name2unicode(code.name)
        elif isinstance(code, bytes):
            # Interpret as UTF-16BE.
            text = code.decode("UTF-16BE", "ignore")
        elif isinstance(code, int):
            text = chr(code)
        else:
            raise PDFTypeError(code)

        # A0 = non-breaking space, some weird fonts can have a collision on a
        # cid here: keep an existing plain space rather than overwrite it.
        if text != "\u00a0" or self.cid2unichr.get(cid) != " ":
            self.cid2unichr[cid] = text

199 

200 

class PyCMap(CMap):
    """CMap that takes its table from the ``CODE2CID`` attribute of ``module``."""

    def __init__(self, name: str, module: Any) -> None:
        super().__init__(CMapName=name)
        # The table object is used directly, not copied.
        self.code2cid = module.CODE2CID
        vertical = module.IS_VERTICAL
        if vertical:
            self.attrs["WMode"] = 1

207 

208 

class PyUnicodeMap(UnicodeMap):
    """Unicode map that takes its table from a loaded data ``module``.

    ``CID2UNICHR_V`` is used (and ``WMode`` set to 1) when ``vertical`` is
    true, ``CID2UNICHR_H`` otherwise.
    """

    def __init__(self, name: str, module: Any, vertical: bool) -> None:
        super().__init__(CMapName=name)
        if not vertical:
            self.cid2unichr = module.CID2UNICHR_H
            return
        self.cid2unichr = module.CID2UNICHR_V
        self.attrs["WMode"] = 1

217 

218 

class CMapDB:
    """Loader for named CMaps and unicode maps, with class-level caches."""

    _cmap_cache: ClassVar[dict[str, PyCMap]] = {}
    _umap_cache: ClassVar[dict[str, list[PyUnicodeMap]]] = {}

    class CMapNotFound(CMapError):
        """Raised by ``_load_data`` when no data file is found for a name."""

    @classmethod
    def _load_data(cls, name: str) -> Any:
        """Load ``<name>.pickle.gz`` and return its contents as a class object.

        The directory from the ``CMAP_PATH`` environment variable (default
        ``/usr/share/pdfminer/``) is searched first, then the ``cmap``
        directory next to this module. The unpickled mapping becomes the
        attribute namespace of a new class named ``name``.

        Raises:
            CMapDB.CMapNotFound: no candidate file exists in either directory.
        """
        name = name.replace("\0", "")
        filename = f"{name}.pickle.gz"
        log.debug("loading: %r", name)
        search_dirs = (
            os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
            os.path.join(os.path.dirname(__file__), "cmap"),
        )
        for directory in search_dirs:
            # Resolve both paths so a name containing "../" or a symlink
            # cannot point outside the directory being searched.
            base = os.path.realpath(directory)
            candidate = os.path.realpath(os.path.join(directory, filename))
            if not candidate.startswith(base + os.sep):
                continue
            if not os.path.exists(candidate):
                continue
            # NOTE(review): pickle.loads runs on the file contents, so the
            # files in these directories must be trusted.
            with gzip.open(candidate) as gzfile:
                payload = pickle.loads(gzfile.read())
            return type(str(name), (), payload)
        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name: str) -> CMapBase:
        """Return the CMap called ``name``.

        The four identity names produce a fresh identity map on every call;
        any other name is loaded from disk once and then served from the
        cache.
        """
        identity_maps = {
            "Identity-H": (IdentityCMap, 0),
            "Identity-V": (IdentityCMap, 1),
            "OneByteIdentityH": (IdentityCMapByte, 0),
            "OneByteIdentityV": (IdentityCMapByte, 1),
        }
        if name in identity_maps:
            factory, wmode = identity_maps[name]
            return factory(WMode=wmode)
        if name in cls._cmap_cache:
            return cls._cmap_cache[name]
        cmap = PyCMap(name, cls._load_data(name))
        cls._cmap_cache[name] = cmap
        return cmap

    @classmethod
    def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
        """Return the horizontal or vertical unicode map for ``name``.

        Both variants are built from ``to-unicode-<name>`` on first use and
        cached as a two-element list indexed by ``vertical``.
        """
        if name not in cls._umap_cache:
            data = cls._load_data(f"to-unicode-{name}")
            cls._umap_cache[name] = [
                PyUnicodeMap(name, data, is_vertical) for is_vertical in (False, True)
            ]
        return cls._umap_cache[name][vertical]

275 

276 

class CMapParser(PSStackParser[PSKeyword]):
    """Parse CMap / ToUnicode operators and record the mappings on ``cmap``.

    Operands accumulate on the parser stack; each ``end...`` keyword consumes
    everything currently on the stack. Malformed entries are skipped, with one
    warning logged per distinct message.
    """

    def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # some ToUnicode maps don't have "begincmap" keyword.
        self._in_cmap = True
        # Messages already logged by _warn_once.
        self._warnings: set[str] = set()

    def run(self) -> None:
        """Parse the input; a ``PSEOF`` from the parser ends the run quietly."""
        with contextlib.suppress(PSEOF):
            self.nextobject()

    KEYWORD_BEGINCMAP = KWD(b"begincmap")
    KEYWORD_ENDCMAP = KWD(b"endcmap")
    KEYWORD_USECMAP = KWD(b"usecmap")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
    KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
    KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
    KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
    KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
    KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
    KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
    KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
    KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
    KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
    KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
    KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """ToUnicode CMaps

        See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.

        ``begincmap`` / ``endcmap`` switch processing on and off; while it is
        off every other keyword is ignored. Keywords not handled here are
        pushed back on the stack as operands.
        """
        if token is self.KEYWORD_BEGINCMAP:
            self._in_cmap = True
            self.popall()
            return
        if token is self.KEYWORD_ENDCMAP:
            self._in_cmap = False
            return
        if not self._in_cmap:
            return

        if token is self.KEYWORD_DEF:
            # "<name> <value> def": store the pair as a map attribute.
            with contextlib.suppress(PSSyntaxError):
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            return

        if token is self.KEYWORD_USECMAP:
            # A missing or malformed base CMap is ignored.
            with contextlib.suppress(PSSyntaxError, CMapDB.CMapNotFound):
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            return

        # These keywords only clear the stack: the "begin" markers drop
        # whatever preceded the section, and the codespace / notdef range
        # contents are not used.
        stack_clearing = (
            self.KEYWORD_BEGINCODESPACERANGE,
            self.KEYWORD_ENDCODESPACERANGE,
            self.KEYWORD_BEGINCIDRANGE,
            self.KEYWORD_BEGINCIDCHAR,
            self.KEYWORD_BEGINBFRANGE,
            self.KEYWORD_BEGINBFCHAR,
            self.KEYWORD_BEGINNOTDEFRANGE,
            self.KEYWORD_ENDNOTDEFRANGE,
        )
        if any(token is keyword for keyword in stack_clearing):
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDRANGE:
            self._add_cid_ranges([obj for (__, obj) in self.popall()])
            return

        if token is self.KEYWORD_ENDCIDCHAR:
            self._add_cid_chars([obj for (__, obj) in self.popall()])
            return

        if token is self.KEYWORD_ENDBFRANGE:
            self._add_bf_ranges([obj for (__, obj) in self.popall()])
            return

        if token is self.KEYWORD_ENDBFCHAR:
            self._add_bf_chars([obj for (__, obj) in self.popall()])
            return

        self.push((pos, token))

    def _add_cid_ranges(self, objs: list[object]) -> None:
        """Record each (start, end, cid) triple collected before ``endcidrange``."""
        for start_byte, end_byte, cid in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object of begincidrange is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object of begincidrange is not a byte.")
                continue
            if not isinstance(cid, int):
                # The check is for an integer, so the message says so.
                self._warn_once("The cid object of begincidrange is not an integer.")
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once(
                    "The start and end byte of begincidrange have "
                    "different lengths.",
                )
                continue
            # Only the last (up to) four bytes vary across the range; anything
            # before them must be identical in start and end.
            start_prefix = start_byte[:-4]
            end_prefix = end_byte[:-4]
            if start_prefix != end_prefix:
                self._warn_once(
                    "The prefix of the start and end byte of "
                    "begincidrange are not the same.",
                )
                continue
            svar = start_byte[-4:]
            evar = end_byte[-4:]
            start = nunpack(svar)
            end = nunpack(evar)
            vlen = len(svar)
            for i in range(end - start + 1):
                # Re-encode the counter at the width of the original bytes.
                x = start_prefix + struct.pack(">L", start + i)[-vlen:]
                self.cmap.add_cid2unichr(cid + i, x)

    def _add_cid_chars(self, objs: list[object]) -> None:
        """Record each (cid, code) pair collected before ``endcidchar``."""
        for cid, code in choplist(2, objs):
            if isinstance(code, bytes) and isinstance(cid, int):
                self.cmap.add_cid2unichr(cid, code)

    def _add_bf_ranges(self, objs: list[object]) -> None:
        """Record each (start, end, code) triple collected before ``endbfrange``.

        ``code`` is either a list with one destination per source code, or a
        byte string whose last (up to) four bytes are incremented across the
        range.
        """
        for start_byte, end_byte, code in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object is not a byte.")
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once("The start and end byte have different lengths.")
                continue
            start = nunpack(start_byte)
            end = nunpack(end_byte)
            if isinstance(code, list):
                if len(code) != end - start + 1:
                    self._warn_once(
                        "The difference between the start and end "
                        "offsets does not match the code length.",
                    )
                # A length mismatch is tolerated: pair up as many as exist.
                for cid, unicode_value in zip(
                    range(start, end + 1), code, strict=False
                ):
                    self.cmap.add_cid2unichr(cid, unicode_value)
                continue
            if not isinstance(code, bytes):
                # Malformed input is skipped with a warning, like the checks
                # above, instead of failing an assert (which is also removed
                # under ``python -O``).
                self._warn_once(
                    "The code object is neither a byte string nor an array.",
                )
                continue
            var = code[-4:]
            base = nunpack(var)
            prefix = code[:-4]
            vlen = len(var)
            for i in range(end - start + 1):
                # NOTE(review): struct.pack(">L", ...) only accepts values
                # below 2**32; a range running past that would raise
                # struct.error.
                x = prefix + struct.pack(">L", base + i)[-vlen:]
                self.cmap.add_cid2unichr(start + i, x)

    def _add_bf_chars(self, objs: list[object]) -> None:
        """Record each (source code, destination) pair collected before ``endbfchar``."""
        for cid, code in choplist(2, objs):
            if isinstance(cid, bytes) and isinstance(code, bytes):
                self.cmap.add_cid2unichr(nunpack(cid), code)

    def _warn_once(self, msg: str) -> None:
        """Warn once for each unique message"""
        if msg in self._warnings:
            return
        self._warnings.add(msg)
        base_msg = (
            "Ignoring (part of) ToUnicode map because the PDF data "
            "does not conform to the format. This could result in "
            "(cid) values in the output. "
        )
        log.warning(base_msg + msg)

463 base_msg = ( 

464 "Ignoring (part of) ToUnicode map because the PDF data " 

465 "does not conform to the format. This could result in " 

466 "(cid) values in the output. " 

467 ) 

468 log.warning(base_msg + msg)