Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/cmapdb.py: 61%

"""Adobe character mapping (CMap) support.

CMaps map character codes to character ids (CIDs); the Unicode maps in
this module map CIDs to Unicode text.

More information is available on:

  https://github.com/adobe-type-tools/cmap-resources

"""

12import contextlib

13import gzip

14import json

15import logging

16import os

17import os.path

18import struct

19import sys

20from collections.abc import Iterable, Iterator, MutableMapping

21from typing import (

22 Any,

23 BinaryIO,

24 ClassVar,

25 TextIO,

26 Union,

27 cast,

28)

30from pdfminer.encodingdb import name2unicode

31from pdfminer.pdfexceptions import PDFException, PDFTypeError

32from pdfminer.psexceptions import PSEOF, PSSyntaxError

33from pdfminer.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name

34from pdfminer.utils import choplist, nunpack

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

class CMapError(PDFException):
    """Base class for the CMap-related errors defined in this module."""

class CMapBase:
    """Shared base of all character maps defined in this module.

    Holds a table of attributes (for example ``WMode``) and declares the
    hooks a CMap parser calls; the hooks do nothing here and ``decode`` is
    left for subclasses.
    """

    debug = 0

    def __init__(self, **kwargs: object) -> None:
        # Store an independent copy of the keyword arguments as attributes.
        self.attrs: MutableMapping[str, object] = dict(kwargs)

    def is_vertical(self) -> bool:
        """Tell whether the ``WMode`` attribute is present and not equal to 0."""
        writing_mode = self.attrs.get("WMode", 0)
        return writing_mode != 0

    def set_attr(self, k: str, v: object) -> None:
        """Store ``v`` under key ``k``, replacing any earlier value."""
        self.attrs[k] = v

    def add_code2cid(self, code: str, cid: int) -> None:
        """Hook for registering a code-to-CID entry; a no-op in this class."""

    def add_cid2unichr(self, cid: int, code: PSLiteral | bytes | int) -> None:
        """Hook for registering a CID-to-text entry; a no-op in this class."""

    def use_cmap(self, cmap: "CMapBase") -> None:
        """Hook for inheriting entries from ``cmap``; a no-op in this class."""

    def decode(self, code: bytes) -> Iterable[int]:
        """Translate ``code`` into CIDs; subclasses must provide this."""
        raise NotImplementedError

class CMap(CMapBase):
    """Character map backed by the nested ``code2cid`` lookup table.

    Every level of ``code2cid`` is keyed by one byte value. A value is
    either an ``int`` (the CID for the bytes consumed so far) or another
    dict holding the continuation for the next byte.
    """

    def __init__(self, **kwargs: str | int) -> None:
        CMapBase.__init__(self, **kwargs)
        self.code2cid: dict[int, object] = {}

    def __repr__(self) -> str:
        cmap_name = self.attrs.get("CMapName")
        return "<CMap: {}>".format(cmap_name)

    def use_cmap(self, cmap: CMapBase) -> None:
        """Copy all entries of ``cmap.code2cid`` into this map.

        Nested tables are duplicated rather than shared; an entry already
        present under the same key is overwritten.
        """
        assert isinstance(cmap, CMap), str(type(cmap))

        def clone_into(target: dict[int, object], source: dict[int, object]) -> None:
            for byte, entry in source.items():
                if not isinstance(entry, dict):
                    target[byte] = entry
                    continue
                # Build a fresh sub-table so the two maps never share nodes.
                branch: dict[int, object] = {}
                target[byte] = branch
                clone_into(branch, entry)

        clone_into(self.code2cid, cmap.code2cid)

    def decode(self, code: bytes) -> Iterator[int]:
        """Yield the CID of each byte sequence of ``code`` found in the table.

        A byte with no entry at the current level resets the walk to the
        top-level table without producing a CID.
        """
        log.debug("decode: %r, %r", self, code)
        node = self.code2cid
        for byte in code:
            if byte not in node:
                node = self.code2cid
                continue
            entry = node[byte]
            if isinstance(entry, int):
                yield entry
                # A complete code was matched; start over for the next one.
                node = self.code2cid
            else:
                node = cast(dict[int, object], entry)

    def dump(
        self,
        out: TextIO = sys.stdout,
        code2cid: dict[int, object] | None = None,
        code: tuple[int, ...] = (),
    ) -> None:
        """Write one ``code ... = cid ...`` line per entry, in sorted key order.

        ``code2cid`` and ``code`` carry the sub-table and the byte prefix
        during recursion; when ``code2cid`` is omitted the dump starts at
        the top-level table with an empty prefix.
        """
        if code2cid is None:
            code2cid, code = self.code2cid, ()
        for byte, entry in sorted(code2cid.items()):
            path = (*code, byte)
            if isinstance(entry, int):
                out.write(f"code {path!r} = cid {entry}\n")
                continue
            self.dump(out=out, code2cid=cast(dict[int, object], entry), code=path)

119

120

class IdentityCMap(CMapBase):
    """CMap whose CIDs are the big-endian 16-bit values of the input itself."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Unpack ``code`` as consecutive unsigned 16-bit big-endian integers.

        A trailing odd byte is ignored; fewer than two bytes give ``()``.
        """
        pair_count = len(code) // 2
        if not pair_count:
            return ()
        return struct.unpack(f">{pair_count}H", code[: pair_count * 2])

128

129

class IdentityCMapByte(IdentityCMap):
    """Identity CMap in which every single input byte is its own CID."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Unpack ``code`` as unsigned bytes; empty input gives ``()``."""
        size = len(code)
        if not size:
            return ()
        return struct.unpack(f">{size}B", code[:size])

137

138

class UnicodeMap(CMapBase):
    """Character map from CIDs to text, stored in ``cid2unichr``."""

    def __init__(self, **kwargs: str | int) -> None:
        CMapBase.__init__(self, **kwargs)
        self.cid2unichr: dict[int, str] = {}

    def __repr__(self) -> str:
        cmap_name = self.attrs.get("CMapName")
        return "<UnicodeMap: {}>".format(cmap_name)

    def get_unichr(self, cid: int) -> str:
        """Return the text stored for ``cid``; an unknown CID raises KeyError."""
        log.debug("get_unichr: %r, %r", self, cid)
        return self.cid2unichr[cid]

    def dump(self, out: TextIO = sys.stdout) -> None:
        """Write one ``cid ... = unicode ...`` line per entry, sorted by CID."""
        table = self.cid2unichr
        for cid in sorted(table):
            out.write(f"cid {cid} = unicode {table[cid]!r}\n")

154

155

class IdentityUnicodeMap(UnicodeMap):
    """Unicode map that treats every CID as a Unicode code point."""

    def get_unichr(self, cid: int) -> str:
        """Interpret character id as unicode codepoint"""
        log.debug("get_unichr: %r, %r", self, cid)
        character = chr(cid)
        return character

161

162

class FileCMap(CMap):
    """CMap whose ``code2cid`` table is filled one entry at a time."""

    def add_code2cid(self, code: str, cid: int) -> None:
        """Store ``cid`` under the code points of ``code``.

        All characters but the last select (and create when missing) the
        nested tables; the last character keys the CID itself.
        """
        assert isinstance(code, str) and isinstance(cid, int), str(
            (type(code), type(cid)),
        )
        node = self.code2cid
        for char in code[:-1]:
            # Descend into the existing sub-table or attach a new empty one.
            node = cast(dict[int, object], node.setdefault(ord(char), {}))
        node[ord(code[-1])] = cid

179

180

class FileUnicodeMap(UnicodeMap):
    """Unicode map whose ``cid2unichr`` table is filled one entry at a time."""

    def add_cid2unichr(self, cid: int, code: PSLiteral | bytes | int) -> None:
        """Record the text for ``cid``.

        ``code`` may be a PSLiteral (looked up as an Adobe glyph name),
        bytes (decoded as UTF-16BE, ignoring undecodable parts) or an int
        (taken as a code point).

        Raises:
            PDFTypeError: if ``code`` is none of the supported types.
        """
        assert isinstance(cid, int), str(type(cid))
        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
            assert isinstance(code.name, str)
            text = name2unicode(code.name)
        elif isinstance(code, bytes):
            # Interpret as UTF-16BE.
            text = code.decode("UTF-16BE", "ignore")
        elif isinstance(code, int):
            text = chr(code)
        else:
            raise PDFTypeError(code)

        # A0 = non-breaking space, some weird fonts can have a collision on a
        # cid here: an existing plain-space mapping is not overwritten.
        shadows_space = text == "\u00a0" and self.cid2unichr.get(cid) == " "
        if not shadows_space:
            self.cid2unichr[cid] = text

200

201

class PyCMap(CMap):
    """CMap that takes its table from an already loaded data object."""

    def __init__(self, name: str, module: Any) -> None:
        """Adopt ``module.CODE2CID`` and honour ``module.IS_VERTICAL``.

        The table is referenced, not copied.
        """
        super().__init__(CMapName=name)
        self.code2cid = module.CODE2CID
        vertical = module.IS_VERTICAL
        if vertical:
            # A truthy IS_VERTICAL marks the map as vertical writing mode.
            self.attrs["WMode"] = 1

208

209

class PyUnicodeMap(UnicodeMap):
    """Unicode map that takes its table from an already loaded data object."""

    def __init__(self, name: str, module: Any, vertical: bool) -> None:
        """Adopt ``CID2UNICHR_V`` or ``CID2UNICHR_H`` depending on ``vertical``.

        The chosen table is referenced, not copied; the vertical variant
        additionally sets ``WMode`` to 1.
        """
        super().__init__(CMapName=name)
        if not vertical:
            self.cid2unichr = module.CID2UNICHR_H
            return
        self.cid2unichr = module.CID2UNICHR_V
        self.attrs["WMode"] = 1

218

219

class CMapDB:
    """Loader and class-level cache for CMap data files."""

    # Both caches live on the class, so every caller shares them.
    _cmap_cache: ClassVar[dict[str, PyCMap]] = {}
    # Per name: index 0 holds the horizontal map, index 1 the vertical one.
    _umap_cache: ClassVar[dict[str, list[PyUnicodeMap]]] = {}

    class CMapNotFound(CMapError):
        """Raised when no data file for the requested name can be loaded."""

    @staticmethod
    def _convert_code2cid_keys(
        d: Union[dict[str, object], int],
    ) -> Union[dict[int, object], int]:
        """Recursively convert string keys to integers in CODE2CID dictionaries.

        Non-dict input is handed back untouched; keys that ``int()`` rejects
        are kept as they are.
        """
        if not isinstance(d, dict):
            return d
        converted: dict[int, object] = {}
        for raw_key, value in d.items():
            try:
                key = int(raw_key)
            except (ValueError, TypeError):
                key = raw_key  # type: ignore[assignment]
            # The recursive call returns non-dict values unchanged, so leaves
            # and nested tables can share a single assignment.
            converted[key] = CMapDB._convert_code2cid_keys(value)  # type: ignore[arg-type]
        return converted

    @classmethod
    def _load_data(cls, name: str) -> type[Any]:
        """Load ``<name>.json.gz`` and wrap its content in an ad-hoc class.

        The directory named by ``CMAP_PATH`` (default
        ``/usr/share/pdfminer/``) is searched first, then the ``cmap``
        directory next to this module. A candidate is only used when its
        resolved path stays inside the resolved search directory.

        Raises:
            CMapDB.CMapNotFound: if no search directory yields the file.
        """
        name = name.replace("\0", "")
        log.debug("loading: %r", name)
        json_filename = f"{name}.json.gz"
        search_dirs = (
            os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
            os.path.join(os.path.dirname(__file__), "cmap"),
        )

        for directory in search_dirs:
            # Load JSON format (secure)
            json_path = os.path.join(directory, json_filename)
            resolved_file = os.path.realpath(json_path)
            resolved_dir = os.path.realpath(directory)

            # Skip candidates that resolve outside the search directory or
            # that do not exist; existence is only probed for contained paths.
            contained = resolved_file.startswith(resolved_dir + os.sep)
            if not contained or not os.path.exists(resolved_file):
                continue

            log.debug("loading JSON: %r", json_path)
            with gzip.open(resolved_file, "rt", encoding="utf-8") as gzfile:
                data: dict[str, Any] = json.load(gzfile)

            # Convert string keys to integers for CID mappings
            for table in ("CID2UNICHR_H", "CID2UNICHR_V"):
                if table in data:
                    data[table] = {int(k): v for k, v in data[table].items()}
            # CODE2CID may also have numeric keys that need conversion
            if data.get("CODE2CID"):
                data["CODE2CID"] = cls._convert_code2cid_keys(data["CODE2CID"])
            return type(str(name), (), data)

        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name: str) -> CMapBase:
        """Return the CMap called ``name``.

        The four identity names produce a new identity map on every call;
        any other name is loaded from its data file once and then served
        from the cache.
        """
        identity_maps = {
            "Identity-H": (IdentityCMap, 0),
            "Identity-V": (IdentityCMap, 1),
            "OneByteIdentityH": (IdentityCMapByte, 0),
            "OneByteIdentityV": (IdentityCMapByte, 1),
        }
        if name in identity_maps:
            factory, wmode = identity_maps[name]
            return factory(WMode=wmode)
        if name in cls._cmap_cache:
            return cls._cmap_cache[name]
        cmap = PyCMap(name, cls._load_data(name))
        cls._cmap_cache[name] = cmap
        return cmap

    @classmethod
    def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
        """Return the horizontal or vertical Unicode map called ``name``.

        Both variants are built from ``to-unicode-<name>`` on first use and
        cached together.
        """
        if name not in cls._umap_cache:
            data = cls._load_data(f"to-unicode-{name}")
            cls._umap_cache[name] = [
                PyUnicodeMap(name, data, False),
                PyUnicodeMap(name, data, True),
            ]
        # ``vertical`` doubles as the list index (False -> 0, True -> 1).
        return cls._umap_cache[name][vertical]

314

315

class CMapParser(PSStackParser[PSKeyword]):
    """PostScript-style parser that feeds CMap operators into a ``CMapBase``.

    Operands pile up on the parser stack; each ``end...`` keyword consumes
    them and forwards the resulting entries through
    ``cmap.add_cid2unichr``. Malformed range entries are skipped with a
    warning that is logged once per distinct message; malformed
    single-character entries are skipped silently.
    """

    KEYWORD_BEGINCMAP = KWD(b"begincmap")
    KEYWORD_ENDCMAP = KWD(b"endcmap")
    KEYWORD_USECMAP = KWD(b"usecmap")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
    KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
    KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
    KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
    KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
    KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
    KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
    KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
    KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
    KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
    KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
    KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")

    # Keywords whose only effect is to discard the operands gathered so far.
    _STACK_RESET_KEYWORDS = (
        KEYWORD_BEGINCODESPACERANGE,
        KEYWORD_ENDCODESPACERANGE,
        KEYWORD_BEGINCIDRANGE,
        KEYWORD_BEGINCIDCHAR,
        KEYWORD_BEGINBFRANGE,
        KEYWORD_BEGINBFCHAR,
        KEYWORD_BEGINNOTDEFRANGE,
        KEYWORD_ENDNOTDEFRANGE,
    )

    def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # some ToUnicode maps don't have "begincmap" keyword.
        self._in_cmap = True
        # Messages already logged by _warn_once.
        self._warnings: set[str] = set()

    def run(self) -> None:
        """Parse the stream; a PSEOF raised by the parser ends the run quietly."""
        with contextlib.suppress(PSEOF):
            self.nextobject()

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """ToUnicode CMaps

        See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.

        Dispatch on ``token``: CMap operators are executed against the
        operands on the stack, anything else is pushed as an operand.
        Everything except ``begincmap`` / ``endcmap`` is ignored while the
        parser is outside a cmap section.
        """
        if token is self.KEYWORD_BEGINCMAP:
            self._in_cmap = True
            self.popall()
            return

        if token is self.KEYWORD_ENDCMAP:
            self._in_cmap = False
            return

        if not self._in_cmap:
            return

        if token is self.KEYWORD_DEF:
            # Best effort: a definition that fails to parse is ignored.
            with contextlib.suppress(PSSyntaxError):
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            return

        if token is self.KEYWORD_USECMAP:
            # Best effort: an unparsable or unknown base CMap is ignored.
            with contextlib.suppress(PSSyntaxError, CMapDB.CMapNotFound):
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            return

        # Keywords are compared by identity, as everywhere else in this method.
        if any(token is keyword for keyword in self._STACK_RESET_KEYWORDS):
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDRANGE:
            self._add_cid_ranges([obj for (_, obj) in self.popall()])
            return

        if token is self.KEYWORD_ENDCIDCHAR:
            objs = [obj for (_, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(code, bytes) and isinstance(cid, int):
                    self.cmap.add_cid2unichr(cid, code)
            return

        if token is self.KEYWORD_ENDBFRANGE:
            self._add_bf_ranges([obj for (_, obj) in self.popall()])
            return

        if token is self.KEYWORD_ENDBFCHAR:
            objs = [obj for (_, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(cid, bytes) and isinstance(code, bytes):
                    self.cmap.add_cid2unichr(nunpack(cid), code)
            return

        self.push((pos, token))

    @staticmethod
    def _pack_tail(value: int, width: int) -> bytes:
        """Return the lowest ``width`` bytes of ``value`` in big-endian order.

        ``width`` is the length of a slice of at most four bytes (0-4).
        The value is reduced modulo 2**32 first so that an incremented
        counter can never make ``struct.pack`` raise ``struct.error``. The
        slice start is computed explicitly because ``packed[-0:]`` would
        return all four bytes instead of none.
        """
        return struct.pack(">L", value & 0xFFFFFFFF)[4 - width :]

    def _add_cid_ranges(self, objs: list[Any]) -> None:
        """Register every (start, end, cid) triple of an ``endcidrange`` section."""
        for start_byte, end_byte, cid in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object of begincidrange is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object of begincidrange is not a byte.")
                continue
            if not isinstance(cid, int):
                self._warn_once("The cid object of begincidrange is not an integer.")
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once(
                    "The start and end byte of begincidrange have "
                    "different lengths.",
                )
                continue
            # Only the last four bytes vary inside a range; whatever comes
            # before them must be identical for both ends.
            start_prefix = start_byte[:-4]
            end_prefix = end_byte[:-4]
            if start_prefix != end_prefix:
                self._warn_once(
                    "The prefix of the start and end byte of "
                    "begincidrange are not the same.",
                )
                continue
            svar = start_byte[-4:]
            evar = end_byte[-4:]
            start = nunpack(svar)
            end = nunpack(evar)
            vlen = len(svar)
            for i in range(end - start + 1):
                x = start_prefix + self._pack_tail(start + i, vlen)
                self.cmap.add_cid2unichr(cid + i, x)

    def _add_bf_ranges(self, objs: list[Any]) -> None:
        """Register every (start, end, code) triple of an ``endbfrange`` section.

        ``code`` is either a list with one destination per source code or a
        byte string whose last (up to four) bytes are incremented for each
        consecutive source code.
        """
        for start_byte, end_byte, code in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object is not a byte.")
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once("The start and end byte have different lengths.")
                continue
            start = nunpack(start_byte)
            end = nunpack(end_byte)
            if isinstance(code, list):
                if len(code) != end - start + 1:
                    self._warn_once(
                        "The difference between the start and end "
                        "offsets does not match the code length.",
                    )
                # On a length mismatch the shorter side decides how many
                # entries are registered.
                for cid, unicode_value in zip(
                    range(start, end + 1), code, strict=False
                ):
                    self.cmap.add_cid2unichr(cid, unicode_value)
            elif isinstance(code, bytes):
                var = code[-4:]
                base = nunpack(var)
                prefix = code[:-4]
                vlen = len(var)
                for i in range(end - start + 1):
                    x = prefix + self._pack_tail(base + i, vlen)
                    self.cmap.add_cid2unichr(start + i, x)
            else:
                # The data comes from the PDF, so a wrong operand type is
                # reported and skipped instead of being asserted on.
                self._warn_once(
                    "The code object of beginbfrange is not a byte or a list.",
                )

    def _warn_once(self, msg: str) -> None:
        """Warn once for each unique message"""
        if msg not in self._warnings:
            self._warnings.add(msg)
            base_msg = (
                "Ignoring (part of) ToUnicode map because the PDF data "
                "does not conform to the format. This could result in "
                "(cid) values in the output. "
            )
            log.warning(base_msg + msg)