Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/cmapdb.py: 79%

"""Adobe character mapping (CMap) support.

CMaps provide the mapping between character codes and Unicode
code-points to character ids (CIDs).

More information is available on:

  https://github.com/adobe-type-tools/cmap-resources

"""

12import gzip

13import logging

14import os

15import os.path

16import pickle as pickle

17import struct

18import sys

19from typing import (

20 Any,

21 BinaryIO,

22 Dict,

23 Iterable,

24 Iterator,

25 List,

26 MutableMapping,

27 Optional,

28 Set,

29 TextIO,

30 Tuple,

31 Union,

32 cast,

33)

35from pdfminer.encodingdb import name2unicode

36from pdfminer.pdfexceptions import PDFException, PDFTypeError

37from pdfminer.psexceptions import PSEOF, PSSyntaxError

38from pdfminer.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name

39from pdfminer.utils import choplist, nunpack

# Module-level logger shared by every class in this file.
log = logging.getLogger(__name__)

class CMapError(PDFException):
    """Base class for CMap-related errors (e.g. ``CMapDB.CMapNotFound``)."""

class CMapBase:
    """Common base of all character maps.

    Holds a mutable attribute mapping (``attrs``) and declares the hooks that
    concrete maps override.  Every hook except :meth:`decode` is a no-op here.
    """

    debug = 0

    def __init__(self, **kwargs: object) -> None:
        """Store the given keyword arguments as the map's attributes."""
        self.attrs: MutableMapping[str, object] = kwargs.copy()

    def is_vertical(self) -> bool:
        """Return True when the ``WMode`` attribute is present and non-zero."""
        return self.attrs.get("WMode", 0) != 0

    def set_attr(self, k: str, v: object) -> None:
        """Set attribute ``k`` to ``v``, replacing any previous value."""
        self.attrs[k] = v

    def add_code2cid(self, code: str, cid: int) -> None:
        """Hook for registering a code -> CID mapping; ignored by default."""

    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
        """Hook for registering a CID -> unicode mapping; ignored by default."""

    def use_cmap(self, cmap: "CMapBase") -> None:
        """Hook for importing another map's entries; ignored by default."""

    def decode(self, code: bytes) -> Iterable[int]:
        """Translate ``code`` into CIDs.

        Raises:
            NotImplementedError: always; subclasses provide the decoding.
        """
        raise NotImplementedError

class CMap(CMapBase):
    """Character map backed by a nested lookup table.

    ``code2cid`` is keyed by one code byte per nesting level; an ``int`` value
    is a CID, any other value is the table for the next byte.
    """

    def __init__(self, **kwargs: Union[str, int]) -> None:
        """Store the attributes and start with an empty ``code2cid`` table."""
        CMapBase.__init__(self, **kwargs)
        self.code2cid: Dict[int, object] = {}

    def __repr__(self) -> str:
        return "<CMap: %s>" % self.attrs.get("CMapName")

    def use_cmap(self, cmap: CMapBase) -> None:
        """Copy every entry of ``cmap.code2cid`` into this map's table.

        Nested tables are copied recursively into fresh dicts, so the two maps
        do not share sub-tables afterwards.  An existing entry under the same
        key is replaced.
        """
        assert isinstance(cmap, CMap), str(type(cmap))

        def copy(dst: Dict[int, object], src: Dict[int, object]) -> None:
            for k, v in src.items():
                if isinstance(v, dict):
                    d: Dict[int, object] = {}
                    dst[k] = d
                    copy(d, v)
                else:
                    dst[k] = v

        copy(self.code2cid, cmap.code2cid)

    def decode(self, code: bytes) -> Iterator[int]:
        """Yield the CIDs found by walking ``code`` through the table.

        Each byte descends one level.  Reaching an ``int`` yields it and
        restarts from the root; a byte with no entry at the current level is
        dropped and the walk also restarts from the root.
        """
        log.debug("decode: %r, %r", self, code)
        d = self.code2cid
        for i in code:
            if i in d:
                x = d[i]
                if isinstance(x, int):
                    yield x
                    d = self.code2cid
                else:
                    d = cast(Dict[int, object], x)
            else:
                d = self.code2cid

    def dump(
        self,
        out: TextIO = sys.stdout,
        code2cid: Optional[Dict[int, object]] = None,
        code: Tuple[int, ...] = (),
    ) -> None:
        """Write one ``code (...) = cid N`` line per mapping to ``out``.

        ``code2cid`` and ``code`` carry the current sub-table and the byte
        prefix leading to it during recursion; callers normally omit them.
        """
        if code2cid is None:
            code2cid = self.code2cid
            code = ()
        for k, v in sorted(code2cid.items()):
            c = code + (k,)
            if isinstance(v, int):
                out.write("code %r = cid %d\n" % (c, v))
            else:
                self.dump(out=out, code2cid=cast(Dict[int, object], v), code=c)

124

125

class IdentityCMap(CMapBase):
    """Map in which every two-byte big-endian code is its own CID."""

    def decode(self, code: bytes) -> Tuple[int, ...]:
        """Return ``code`` unpacked as big-endian unsigned 16-bit integers.

        A trailing odd byte is ignored; fewer than two bytes yield ``()``.
        """
        n = len(code) // 2
        if n:
            # Slice to an even length: struct.unpack requires the buffer size
            # to match the format exactly and raises struct.error otherwise.
            return struct.unpack(">%dH" % n, code[: 2 * n])
        else:
            return ()

133

134

class IdentityCMapByte(IdentityCMap):
    """Map in which every single byte of the code is its own CID."""

    def decode(self, code: bytes) -> Tuple[int, ...]:
        """Return the bytes of ``code`` as a tuple of unsigned integers."""
        n = len(code)
        if n:
            return struct.unpack(">%dB" % n, code)
        else:
            return ()

142

143

class UnicodeMap(CMapBase):
    """Map from CIDs to unicode strings, stored in ``cid2unichr``."""

    def __init__(self, **kwargs: Union[str, int]) -> None:
        """Store the attributes and start with an empty ``cid2unichr`` table."""
        CMapBase.__init__(self, **kwargs)
        self.cid2unichr: Dict[int, str] = {}

    def __repr__(self) -> str:
        return "<UnicodeMap: %s>" % self.attrs.get("CMapName")

    def get_unichr(self, cid: int) -> str:
        """Return the unicode string registered for ``cid``.

        Raises:
            KeyError: if ``cid`` has no entry in ``cid2unichr``.
        """
        log.debug("get_unichr: %r, %r", self, cid)
        return self.cid2unichr[cid]

    def dump(self, out: TextIO = sys.stdout) -> None:
        """Write one ``cid N = unicode '...'`` line per entry, sorted by CID."""
        for k, v in sorted(self.cid2unichr.items()):
            out.write("cid %d = unicode %r\n" % (k, v))

159

160

class IdentityUnicodeMap(UnicodeMap):
    """Unicode map that treats the CID itself as the code point."""

    def get_unichr(self, cid: int) -> str:
        """Interpret character id as unicode codepoint"""
        log.debug("get_unichr: %r, %r", self, cid)
        return chr(cid)

166

167

class FileCMap(CMap):
    """``CMap`` whose table is filled one mapping at a time."""

    def add_code2cid(self, code: str, cid: int) -> None:
        """Register ``cid`` for the character sequence ``code``.

        Every character but the last selects (creating it if missing) one
        nesting level of ``code2cid`` by its ordinal; the last character's
        ordinal becomes the key that holds ``cid``.
        """
        assert isinstance(code, str) and isinstance(cid, int), str(
            (type(code), type(cid)),
        )
        d = self.code2cid
        for c in code[:-1]:
            ci = ord(c)
            if ci in d:
                d = cast(Dict[int, object], d[ci])
            else:
                t: Dict[int, object] = {}
                d[ci] = t
                d = t
        ci = ord(code[-1])
        d[ci] = cid

184

185

class FileUnicodeMap(UnicodeMap):
    """``UnicodeMap`` whose table is filled one mapping at a time."""

    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
        """Register the unicode string for ``cid``.

        ``code`` is converted according to its type: a ``PSLiteral`` is looked
        up as a glyph name, ``bytes`` are decoded as UTF-16BE (undecodable
        parts are dropped) and an ``int`` is taken as a code point.

        Raises:
            PDFTypeError: if ``code`` is none of the supported types.
        """
        assert isinstance(cid, int), str(type(cid))
        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
            assert isinstance(code.name, str)
            unichr = name2unicode(code.name)
        elif isinstance(code, bytes):
            # Interpret as UTF-16BE.
            unichr = code.decode("UTF-16BE", "ignore")
        elif isinstance(code, int):
            unichr = chr(code)
        else:
            raise PDFTypeError(code)

        # A0 = non-breaking space, some weird fonts can have a collision on a cid here.
        # Keep an already registered plain space rather than overwrite it.
        if unichr == "\u00a0" and self.cid2unichr.get(cid) == " ":
            return
        self.cid2unichr[cid] = unichr

205

206

class PyCMap(CMap):
    """``CMap`` that takes its table from a pre-built data object."""

    def __init__(self, name: str, module: Any) -> None:
        """Adopt ``module.CODE2CID`` as the table.

        ``WMode`` is set to 1 when ``module.IS_VERTICAL`` is truthy.  The
        table object is used as-is, not copied.
        """
        super().__init__(CMapName=name)
        self.code2cid = module.CODE2CID
        if module.IS_VERTICAL:
            self.attrs["WMode"] = 1

213

214

class PyUnicodeMap(UnicodeMap):
    """``UnicodeMap`` that takes its table from a pre-built data object."""

    def __init__(self, name: str, module: Any, vertical: bool) -> None:
        """Adopt the vertical or horizontal table of ``module``.

        With ``vertical`` true, ``module.CID2UNICHR_V`` is used and ``WMode``
        is set to 1; otherwise ``module.CID2UNICHR_H`` is used.  The table
        object is used as-is, not copied.
        """
        super().__init__(CMapName=name)
        if vertical:
            self.cid2unichr = module.CID2UNICHR_V
            self.attrs["WMode"] = 1
        else:
            self.cid2unichr = module.CID2UNICHR_H

223

224

class CMapDB:
    """Loader and class-level cache for the pickled CMap data files."""

    # Both caches live on the class, so they are shared by all users.
    _cmap_cache: Dict[str, PyCMap] = {}
    # Per name: [horizontal map, vertical map], indexed by the ``vertical`` flag.
    _umap_cache: Dict[str, List[PyUnicodeMap]] = {}

    class CMapNotFound(CMapError):
        """Raised when no data file exists for the requested CMap name."""

    @classmethod
    def _load_data(cls, name: str) -> Any:
        """Load ``<name>.pickle.gz`` and return its content as a class object.

        The file is searched first in the directory named by the ``CMAP_PATH``
        environment variable (default ``/usr/share/pdfminer/``), then in the
        ``cmap`` directory next to this module.  The unpickled mapping becomes
        the attribute dict of a new class named ``name``.

        Raises:
            CMapDB.CMapNotFound: if neither directory contains the file.
        """
        name = name.replace("\0", "")
        filename = "%s.pickle.gz" % name
        log.debug("loading: %r", name)
        cmap_paths = (
            os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
            os.path.join(os.path.dirname(__file__), "cmap"),
        )
        for directory in cmap_paths:
            path = os.path.join(directory, filename)
            # Resolve paths to prevent directory traversal
            resolved_path = os.path.realpath(path)
            resolved_directory = os.path.realpath(directory)
            # Check if resolved path is within the intended directory
            if not resolved_path.startswith(resolved_directory + os.sep):
                continue
            if os.path.exists(resolved_path):
                # NOTE(security): unpickling runs arbitrary code from the file.
                # The containment check above limits loading to the configured
                # directories, so those directories must be trusted.
                with gzip.open(resolved_path) as gzfile:
                    return type(str(name), (), pickle.loads(gzfile.read()))
        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name: str) -> CMapBase:
        """Return the CMap called ``name``.

        The four identity names produce a new identity map on every call; any
        other name is loaded from its data file once and cached.

        Raises:
            CMapDB.CMapNotFound: if the data file for ``name`` is missing.
        """
        if name == "Identity-H":
            return IdentityCMap(WMode=0)
        elif name == "Identity-V":
            return IdentityCMap(WMode=1)
        elif name == "OneByteIdentityH":
            return IdentityCMapByte(WMode=0)
        elif name == "OneByteIdentityV":
            return IdentityCMapByte(WMode=1)
        try:
            return cls._cmap_cache[name]
        except KeyError:
            pass
        data = cls._load_data(name)
        cls._cmap_cache[name] = cmap = PyCMap(name, data)
        return cmap

    @classmethod
    def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
        """Return the unicode map ``to-unicode-<name>`` for one writing mode.

        Both the horizontal and the vertical map are built and cached on the
        first request for ``name``.

        Raises:
            CMapDB.CMapNotFound: if the data file is missing.
        """
        try:
            return cls._umap_cache[name][vertical]
        except KeyError:
            pass
        data = cls._load_data("to-unicode-%s" % name)
        cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)]
        return cls._umap_cache[name][vertical]

284

285

class CMapParser(PSStackParser[PSKeyword]):
    """Stack parser that feeds the operators of a CMap stream into a map.

    Operands accumulate on the parser stack; each recognised keyword consumes
    them and calls the matching hook of ``self.cmap``.
    """

    def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
        """Parse ``fp`` and deliver the mappings found to ``cmap``."""
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # some ToUnicode maps don't have "begincmap" keyword.
        self._in_cmap = True
        # Messages already logged, so each is reported only once per parser.
        self._warnings: Set[str] = set()

    def run(self) -> None:
        """Run the parser; reaching the end of the input is not an error."""
        try:
            self.nextobject()
        except PSEOF:
            pass

    KEYWORD_BEGINCMAP = KWD(b"begincmap")
    KEYWORD_ENDCMAP = KWD(b"endcmap")
    KEYWORD_USECMAP = KWD(b"usecmap")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
    KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
    KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
    KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
    KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
    KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
    KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
    KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
    KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
    KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
    KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
    KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """ToUnicode CMaps

        See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.

        Every ``begin...`` keyword only clears the stack so that the matching
        ``end...`` keyword sees exactly the operands of its section.  Keywords
        that are not recognised are pushed onto the stack like operands.
        Malformed entries are skipped with a one-time warning.
        """
        if token is self.KEYWORD_BEGINCMAP:
            self._in_cmap = True
            self.popall()
            return

        elif token is self.KEYWORD_ENDCMAP:
            self._in_cmap = False
            return

        # Everything outside begincmap/endcmap is ignored.
        if not self._in_cmap:
            return

        if token is self.KEYWORD_DEF:
            try:
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            except PSSyntaxError:
                pass
            return

        if token is self.KEYWORD_USECMAP:
            try:
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            except PSSyntaxError:
                pass
            except CMapDB.CMapNotFound:
                pass
            return

        if token is self.KEYWORD_BEGINCODESPACERANGE:
            self.popall()
            return
        if token is self.KEYWORD_ENDCODESPACERANGE:
            # The code space ranges themselves are discarded.
            self.popall()
            return

        if token is self.KEYWORD_BEGINCIDRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDRANGE:
            objs = [obj for (_, obj) in self.popall()]
            for start_byte, end_byte, cid in choplist(3, objs):
                if not isinstance(start_byte, bytes):
                    self._warn_once("The start object of begincidrange is not a byte.")
                    continue
                if not isinstance(end_byte, bytes):
                    self._warn_once("The end object of begincidrange is not a byte.")
                    continue
                if not isinstance(cid, int):
                    self._warn_once(
                        "The cid object of begincidrange is not an integer.",
                    )
                    continue
                if len(start_byte) != len(end_byte):
                    self._warn_once(
                        "The start and end byte of begincidrange have "
                        "different lengths.",
                    )
                    continue
                # Only the last (up to) four bytes may vary across the range;
                # whatever precedes them must be identical for both ends.
                start_prefix = start_byte[:-4]
                end_prefix = end_byte[:-4]
                if start_prefix != end_prefix:
                    self._warn_once(
                        "The prefix of the start and end byte of "
                        "begincidrange are not the same.",
                    )
                    continue
                svar = start_byte[-4:]
                evar = end_byte[-4:]
                start = nunpack(svar)
                end = nunpack(evar)
                vlen = len(svar)
                for i in range(end - start + 1):
                    # Pack as 4 bytes, then keep as many as the input had.
                    x = start_prefix + struct.pack(">L", start + i)[-vlen:]
                    self.cmap.add_cid2unichr(cid + i, x)
            return

        if token is self.KEYWORD_BEGINCIDCHAR:
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDCHAR:
            objs = [obj for (_, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(code, bytes) and isinstance(cid, int):
                    self.cmap.add_cid2unichr(cid, code)
            return

        if token is self.KEYWORD_BEGINBFRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDBFRANGE:
            objs = [obj for (_, obj) in self.popall()]
            for start_byte, end_byte, code in choplist(3, objs):
                if not isinstance(start_byte, bytes):
                    self._warn_once("The start object is not a byte.")
                    continue
                if not isinstance(end_byte, bytes):
                    self._warn_once("The end object is not a byte.")
                    continue
                if len(start_byte) != len(end_byte):
                    self._warn_once("The start and end byte have different lengths.")
                    continue
                start = nunpack(start_byte)
                end = nunpack(end_byte)
                if isinstance(code, list):
                    # One explicit destination per source code.
                    if len(code) != end - start + 1:
                        self._warn_once(
                            "The difference between the start and end "
                            "offsets does not match the code length.",
                        )
                    # zip() stops at the shorter side, so a mismatch maps
                    # only the codes that have a counterpart.
                    for cid, unicode_value in zip(range(start, end + 1), code):
                        self.cmap.add_cid2unichr(cid, unicode_value)
                elif isinstance(code, bytes):
                    # A single destination that is incremented along the range.
                    var = code[-4:]
                    base = nunpack(var)
                    prefix = code[:-4]
                    vlen = len(var)
                    for i in range(end - start + 1):
                        x = prefix + struct.pack(">L", base + i)[-vlen:]
                        self.cmap.add_cid2unichr(start + i, x)
                else:
                    # Malformed input: skip the entry like the checks above
                    # instead of aborting the whole parse.
                    self._warn_once(
                        "The code object is neither a list nor a byte.",
                    )
                    continue
            return

        if token is self.KEYWORD_BEGINBFCHAR:
            self.popall()
            return

        if token is self.KEYWORD_ENDBFCHAR:
            objs = [obj for (_, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(cid, bytes) and isinstance(code, bytes):
                    self.cmap.add_cid2unichr(nunpack(cid), code)
            return

        if token is self.KEYWORD_BEGINNOTDEFRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDNOTDEFRANGE:
            # The notdef ranges themselves are discarded.
            self.popall()
            return

        self.push((pos, token))

    def _warn_once(self, msg: str) -> None:
        """Warn once for each unique message"""
        if msg not in self._warnings:
            self._warnings.add(msg)
            base_msg = (
                "Ignoring (part of) ToUnicode map because the PDF data "
                "does not conform to the format. This could result in "
                "(cid) values in the output. "
            )
            log.warning(base_msg + msg)