Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfminer/cmapdb.py: 25%

1"""Adobe character mapping (CMap) support.

3CMaps provide the mapping between character codes and Unicode

4code-points to character ids (CIDs).

6More information is available on:

8 https://github.com/adobe-type-tools/cmap-resources

10"""

12import contextlib

13import gzip

14import logging

15import os

16import os.path

17import pickle as pickle

18import struct

19import sys

20from collections.abc import Iterable, Iterator, MutableMapping

21from typing import (

22 Any,

23 BinaryIO,

24 ClassVar,

25 TextIO,

26 cast,

27)

29from pdfminer.encodingdb import name2unicode

30from pdfminer.pdfexceptions import PDFException, PDFTypeError

31from pdfminer.psexceptions import PSEOF, PSSyntaxError

32from pdfminer.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name

33from pdfminer.utils import choplist, nunpack

# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)

class CMapError(PDFException):
    """Exception type for CMap-related failures raised by this module."""

class CMapBase:
    """Common base for character maps.

    Holds the attribute mapping (for example ``WMode``) and supplies hook
    methods that do nothing here; subclasses override the ones they need.
    """

    debug = 0

    def __init__(self, **kwargs: object) -> None:
        """Store a private copy of the keyword arguments as ``attrs``."""
        self.attrs: MutableMapping[str, object] = dict(kwargs)

    def is_vertical(self) -> bool:
        """Return True when the ``WMode`` attribute is present and non-zero."""
        wmode = self.attrs.get("WMode", 0)
        return wmode != 0

    def set_attr(self, k: str, v: object) -> None:
        """Set attribute *k* to *v*, replacing any previous value."""
        self.attrs[k] = v

    def add_code2cid(self, code: str, cid: int) -> None:
        """Hook for registering a code -> CID entry; does nothing here."""

    def add_cid2unichr(self, cid: int, code: PSLiteral | bytes | int) -> None:
        """Hook for registering a CID -> unicode entry; does nothing here."""

    def use_cmap(self, cmap: "CMapBase") -> None:
        """Hook for importing another map's entries; does nothing here."""

    def decode(self, code: bytes) -> Iterable[int]:
        """Translate *code* into CIDs.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

class CMap(CMapBase):
    """Character map storing code -> CID entries as nested dictionaries.

    ``code2cid`` is keyed by byte value; a value is either an ``int`` (the
    CID for the byte sequence ending here) or another dictionary for the
    next byte of a longer code.
    """

    def __init__(self, **kwargs: str | int) -> None:
        CMapBase.__init__(self, **kwargs)
        self.code2cid: dict[int, object] = {}

    def __repr__(self) -> str:
        return f"<CMap: {self.attrs.get('CMapName')}>"

    def use_cmap(self, cmap: CMapBase) -> None:
        """Copy every entry of *cmap* into this map.

        Nested dictionaries are rebuilt rather than shared, and an existing
        entry under the same key is replaced.
        """
        assert isinstance(cmap, CMap), str(type(cmap))

        def clone_into(target: dict[int, object], source: dict[int, object]) -> None:
            for key, value in source.items():
                if not isinstance(value, dict):
                    target[key] = value
                    continue
                # Fresh sub-dictionary so the two maps never share nodes.
                branch: dict[int, object] = {}
                target[key] = branch
                clone_into(branch, value)

        clone_into(self.code2cid, cmap.code2cid)

    def decode(self, code: bytes) -> Iterator[int]:
        """Yield the CID for each byte sequence of *code* found in the map.

        A byte with no entry at the current position is dropped and matching
        restarts from the top of the map.
        """
        log.debug("decode: %r, %r", self, code)
        node = self.code2cid
        for byte in code:
            if byte not in node:
                # Unknown byte: drop it and restart from the top level.
                node = self.code2cid
                continue
            entry = node[byte]
            if isinstance(entry, int):
                yield entry
                node = self.code2cid
            else:
                # Partial match: descend one level for the next byte.
                node = cast(dict[int, object], entry)

    def dump(
        self,
        out: TextIO = sys.stdout,
        code2cid: dict[int, object] | None = None,
        code: tuple[int, ...] = (),
    ) -> None:
        """Write one ``code (...) = cid N`` line per entry to *out*, in key order.

        *code2cid* and *code* carry the current sub-dictionary and byte prefix
        during recursion; when *code2cid* is None the dump starts from the
        top of this map with an empty prefix.
        """
        if code2cid is None:
            code2cid, code = self.code2cid, ()
        for key, value in sorted(code2cid.items()):
            path = (*code, key)
            if not isinstance(value, int):
                self.dump(out=out, code2cid=cast(dict[int, object], value), code=path)
                continue
            out.write(f"code {path!r} = cid {value}\n")

118

119

class IdentityCMap(CMapBase):
    """Character map that reads each pair of bytes as one big-endian CID."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Unpack *code* as unsigned 16-bit big-endian integers.

        A trailing odd byte is ignored; input shorter than two bytes gives
        an empty tuple.
        """
        pairs = len(code) // 2
        if not pairs:
            return ()
        return struct.unpack(f">{pairs}H", code[: pairs * 2])

127

128

class IdentityCMapByte(IdentityCMap):
    """Character map that reads every single byte as one CID."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Unpack *code* as unsigned bytes; empty input gives an empty tuple."""
        size = len(code)
        if not size:
            return ()
        return struct.unpack(f">{size}B", code[:size])

136

137

class UnicodeMap(CMapBase):
    """Character map from CID to unicode string, kept in ``cid2unichr``."""

    def __init__(self, **kwargs: str | int) -> None:
        CMapBase.__init__(self, **kwargs)
        self.cid2unichr: dict[int, str] = {}

    def __repr__(self) -> str:
        return f"<UnicodeMap: {self.attrs.get('CMapName')}>"

    def get_unichr(self, cid: int) -> str:
        """Return the unicode string stored for *cid*.

        Raises:
            KeyError: if *cid* has no entry.
        """
        log.debug("get_unichr: %r, %r", self, cid)
        text = self.cid2unichr[cid]
        return text

    def dump(self, out: TextIO = sys.stdout) -> None:
        """Write one ``cid N = unicode '...'`` line per entry to *out*, by CID."""
        entries = sorted(self.cid2unichr.items())
        for cid, text in entries:
            out.write(f"cid {cid} = unicode {text!r}\n")

153

154

class IdentityUnicodeMap(UnicodeMap):
    """Unicode map that treats the CID itself as the code point."""

    def get_unichr(self, cid: int) -> str:
        """Interpret character id as unicode codepoint"""
        log.debug("get_unichr: %r, %r", self, cid)
        character = chr(cid)
        return character

160

161

class FileCMap(CMap):
    """CMap whose nested ``code2cid`` table is filled one entry at a time."""

    def add_code2cid(self, code: str, cid: int) -> None:
        """Register *cid* for the character sequence *code*.

        Every character but the last selects (creating it if needed) a nested
        dictionary keyed by ``ord`` of that character; the last character's
        ``ord`` is the key under which *cid* is stored.
        """
        assert isinstance(code, str) and isinstance(cid, int), str(
            (type(code), type(cid)),
        )
        node = self.code2cid
        for char in code[:-1]:
            key = ord(char)
            if key not in node:
                node[key] = {}
            node = cast(dict[int, object], node[key])
        node[ord(code[-1])] = cid

178

179

class FileUnicodeMap(UnicodeMap):
    """Unicode map whose entries are added one at a time from parsed values."""

    def add_cid2unichr(self, cid: int, code: PSLiteral | bytes | int) -> None:
        """Record the unicode text for *cid*.

        *code* is converted according to its type: a ``PSLiteral`` is looked
        up as a glyph name, ``bytes`` are decoded as UTF-16BE (undecodable
        parts ignored), and an ``int`` is taken as a code point.

        Raises:
            PDFTypeError: if *code* is none of the supported types.
        """
        assert isinstance(cid, int), str(type(cid))
        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
            assert isinstance(code.name, str)
            text = name2unicode(code.name)
        elif isinstance(code, bytes):
            # Interpret as UTF-16BE.
            text = code.decode("UTF-16BE", "ignore")
        elif isinstance(code, int):
            text = chr(code)
        else:
            raise PDFTypeError(code)

        # A0 = non-breaking space, some weird fonts can have a collision on a
        # cid here: an existing plain space is kept rather than overwritten.
        if text != "\u00a0" or self.cid2unichr.get(cid) != " ":
            self.cid2unichr[cid] = text

199

200

class PyCMap(CMap):
    """CMap backed by the ``CODE2CID`` table of an already-loaded data object."""

    def __init__(self, name: str, module: Any) -> None:
        """Adopt ``module.CODE2CID``; set ``WMode`` to 1 if ``module.IS_VERTICAL``."""
        super().__init__(CMapName=name)
        # The table is referenced, not copied.
        self.code2cid = module.CODE2CID
        vertical = module.IS_VERTICAL
        if vertical:
            self.attrs["WMode"] = 1

207

208

class PyUnicodeMap(UnicodeMap):
    """Unicode map backed by a table of an already-loaded data object."""

    def __init__(self, name: str, module: Any, vertical: bool) -> None:
        """Adopt ``CID2UNICHR_V`` (and set ``WMode`` to 1) or ``CID2UNICHR_H``."""
        super().__init__(CMapName=name)
        if not vertical:
            self.cid2unichr = module.CID2UNICHR_H
            return
        self.cid2unichr = module.CID2UNICHR_V
        self.attrs["WMode"] = 1

217

218

class CMapDB:
    """Loader and class-level cache for predefined CMaps and unicode maps."""

    # Loaded maps, keyed by the name they were requested under.
    _cmap_cache: ClassVar[dict[str, PyCMap]] = {}
    # Per name: [horizontal map, vertical map], indexed by the `vertical` flag.
    _umap_cache: ClassVar[dict[str, list[PyUnicodeMap]]] = {}

    class CMapNotFound(CMapError):
        """Raised when no data file for the requested name can be found."""

    @classmethod
    def _load_data(cls, name: str) -> Any:
        """Load ``<name>.pickle.gz`` and return its content as a new type.

        The file is searched for in the directory named by the ``CMAP_PATH``
        environment variable (default ``/usr/share/pdfminer/``) and then in
        the ``cmap`` directory next to this file. NUL characters are removed
        from *name* first.

        Raises:
            CMapDB.CMapNotFound: if neither directory contains the file.
        """
        name = name.replace("\0", "")
        filename = f"{name}.pickle.gz"
        log.debug("loading: %r", name)
        search_dirs = (
            os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
            os.path.join(os.path.dirname(__file__), "cmap"),
        )
        for directory in search_dirs:
            # Resolve paths to prevent directory traversal
            base = os.path.realpath(directory)
            candidate = os.path.realpath(os.path.join(directory, filename))
            # Only accept a candidate that resolves inside the directory.
            inside = candidate.startswith(base + os.sep)
            if inside and os.path.exists(candidate):
                # NOTE(review): pickle.loads can execute arbitrary code if the
                # file is attacker-controlled; the containment check above only
                # limits which file is opened, not what it contains.
                with gzip.open(candidate) as gzfile:
                    return type(str(name), (), pickle.loads(gzfile.read()))
        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name: str) -> CMapBase:
        """Return the CMap called *name*.

        The four identity names produce a new identity map on every call;
        any other name is loaded from disk once and cached.

        Raises:
            CMapDB.CMapNotFound: if the data file cannot be found.
        """
        identity_maps = {
            "Identity-H": (IdentityCMap, 0),
            "Identity-V": (IdentityCMap, 1),
            "OneByteIdentityH": (IdentityCMapByte, 0),
            "OneByteIdentityV": (IdentityCMapByte, 1),
        }
        builtin = identity_maps.get(name)
        if builtin is not None:
            factory, wmode = builtin
            return factory(WMode=wmode)

        cached = cls._cmap_cache.get(name)
        if cached is not None:
            return cached
        loaded = PyCMap(name, cls._load_data(name))
        cls._cmap_cache[name] = loaded
        return loaded

    @classmethod
    def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
        """Return the unicode map for *name* in the requested writing direction.

        Both directions are built and cached together the first time *name*
        is requested; the data is loaded under ``to-unicode-<name>``.

        Raises:
            CMapDB.CMapNotFound: if the data file cannot be found.
        """
        if name not in cls._umap_cache:
            data = cls._load_data(f"to-unicode-{name}")
            cls._umap_cache[name] = [
                PyUnicodeMap(name, data, v) for v in (False, True)
            ]
        return cls._umap_cache[name][vertical]

275

276

class CMapParser(PSStackParser[PSKeyword]):
    """Stack parser that applies CMap operators to a CMap object.

    Operands accumulate on the parser stack; when a CMap keyword arrives,
    ``do_keyword`` consumes them and forwards the resulting attributes or
    mappings to ``self.cmap``. Malformed operands are skipped with a
    once-per-message warning.
    """

    def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # some ToUnicode maps don't have "begincmap" keyword.
        self._in_cmap = True
        # Messages already reported through _warn_once.
        self._warnings: set[str] = set()

    def run(self) -> None:
        """Parse the stream; reaching the end of input (PSEOF) is not an error."""
        with contextlib.suppress(PSEOF):
            self.nextobject()

    KEYWORD_BEGINCMAP = KWD(b"begincmap")
    KEYWORD_ENDCMAP = KWD(b"endcmap")
    KEYWORD_USECMAP = KWD(b"usecmap")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
    KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
    KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
    KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
    KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
    KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
    KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
    KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
    KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
    KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
    KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
    KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """ToUnicode CMaps

        See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.

        Handles one keyword: CMap operators consume the operands currently on
        the stack, any other keyword is pushed onto the stack. Between
        ``endcmap`` and the next ``begincmap`` every keyword is ignored.
        """
        if token is self.KEYWORD_BEGINCMAP:
            self._in_cmap = True
            self.popall()
            return

        if token is self.KEYWORD_ENDCMAP:
            self._in_cmap = False
            return

        if not self._in_cmap:
            return

        if token is self.KEYWORD_DEF:
            # Best effort: a syntactically bad definition is skipped.
            with contextlib.suppress(PSSyntaxError):
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            return

        if token is self.KEYWORD_USECMAP:
            # Best effort: a bad or unknown referenced CMap is skipped.
            with contextlib.suppress(PSSyntaxError, CMapDB.CMapNotFound):
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            return

        # Code space ranges are not used; their operands are discarded.
        if token is self.KEYWORD_BEGINCODESPACERANGE:
            self.popall()
            return
        if token is self.KEYWORD_ENDCODESPACERANGE:
            self.popall()
            return

        if token is self.KEYWORD_BEGINCIDRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDRANGE:
            self._add_cid_ranges([obj for (_, obj) in self.popall()])
            return

        if token is self.KEYWORD_BEGINCIDCHAR:
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDCHAR:
            objs = [obj for (_, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(code, bytes) and isinstance(cid, int):
                    self.cmap.add_cid2unichr(cid, code)
            return

        if token is self.KEYWORD_BEGINBFRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDBFRANGE:
            self._add_bf_ranges([obj for (_, obj) in self.popall()])
            return

        if token is self.KEYWORD_BEGINBFCHAR:
            self.popall()
            return

        if token is self.KEYWORD_ENDBFCHAR:
            objs = [obj for (_, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(cid, bytes) and isinstance(code, bytes):
                    self.cmap.add_cid2unichr(nunpack(cid), code)
            return

        # Notdef ranges are not used; their operands are discarded.
        if token is self.KEYWORD_BEGINNOTDEFRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDNOTDEFRANGE:
            self.popall()
            return

        self.push((pos, token))

    def _add_cid_ranges(self, objs: list[Any]) -> None:
        """Apply the (start, end, cid) triples collected before ``endcidrange``."""
        for start_byte, end_byte, cid in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object of begincidrange is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object of begincidrange is not a byte.")
                continue
            if not isinstance(cid, int):
                self._warn_once(
                    "The cid object of begincidrange is not an integer.",
                )
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once(
                    "The start and end byte of begincidrange have "
                    "different lengths.",
                )
                continue
            # Only the last four bytes vary; anything before them must match.
            start_prefix = start_byte[:-4]
            end_prefix = end_byte[:-4]
            if start_prefix != end_prefix:
                self._warn_once(
                    "The prefix of the start and end byte of "
                    "begincidrange are not the same.",
                )
                continue
            svar = start_byte[-4:]
            evar = end_byte[-4:]
            start = nunpack(svar)
            end = nunpack(evar)
            vlen = len(svar)
            for i in range(end - start + 1):
                # Re-encode the counter and keep as many bytes as the operand had.
                x = start_prefix + struct.pack(">L", start + i)[-vlen:]
                self.cmap.add_cid2unichr(cid + i, x)

    def _add_bf_ranges(self, objs: list[Any]) -> None:
        """Apply the (start, end, code) triples collected before ``endbfrange``."""
        for start_byte, end_byte, code in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object is not a byte.")
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once("The start and end byte have different lengths.")
                continue
            start = nunpack(start_byte)
            end = nunpack(end_byte)
            if isinstance(code, list):
                # One explicit destination per source code.
                if len(code) != end - start + 1:
                    self._warn_once(
                        "The difference between the start and end "
                        "offsets does not match the code length.",
                    )
                for cid, unicode_value in zip(
                    range(start, end + 1), code, strict=False
                ):
                    self.cmap.add_cid2unichr(cid, unicode_value)
            elif isinstance(code, bytes):
                # A single destination whose last four bytes are incremented.
                var = code[-4:]
                base = nunpack(var)
                prefix = code[:-4]
                vlen = len(var)
                for i in range(end - start + 1):
                    x = prefix + struct.pack(">L", base + i)[-vlen:]
                    self.cmap.add_cid2unichr(start + i, x)
            else:
                # Malformed input: skip the triple like the other bad operands
                # instead of failing an assertion on untrusted data.
                self._warn_once(
                    "The code object of beginbfrange is not a byte or a list.",
                )

    def _warn_once(self, msg: str) -> None:
        """Warn once for each unique message"""
        if msg in self._warnings:
            return
        self._warnings.add(msg)
        base_msg = (
            "Ignoring (part of) ToUnicode map because the PDF data "
            "does not conform to the format. This could result in "
            "(cid) values in the output. "
        )
        log.warning(base_msg + msg)

468 log.warning(base_msg + msg)