Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/cmapdb.py: 61%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Adobe character mapping (CMap) support.
3CMaps provide the mapping from character codes and Unicode
4code points to character ids (CIDs).
6More information is available on:
8 https://github.com/adobe-type-tools/cmap-resources
10"""
12import contextlib
13import gzip
14import json
15import logging
16import os
17import os.path
18import struct
19import sys
20from collections.abc import Iterable, Iterator, MutableMapping
21from typing import (
22 Any,
23 BinaryIO,
24 ClassVar,
25 TextIO,
26 Union,
27 cast,
28)
30from pdfminer.encodingdb import name2unicode
31from pdfminer.pdfexceptions import PDFException, PDFTypeError
32from pdfminer.psexceptions import PSEOF, PSSyntaxError
33from pdfminer.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name
34from pdfminer.utils import choplist, nunpack
# Module-level logger; the classes below use it for debug and warning output.
log = logging.getLogger(__name__)
class CMapError(PDFException):
    """Base exception for CMap handling; ``CMapDB.CMapNotFound`` derives from it."""
class CMapBase:
    """Common base of all character maps.

    Holds a table of CMap attributes (for example ``WMode``) and defines the
    hooks that concrete maps override. Every hook except ``decode`` is a
    no-op here; ``decode`` must be provided by subclasses.
    """

    debug = 0

    def __init__(self, **kwargs: object) -> None:
        # Own copy of the keyword arguments, used as the attribute table.
        self.attrs: MutableMapping[str, object] = dict(kwargs)

    def is_vertical(self) -> bool:
        """Return True when the ``WMode`` attribute is present and non-zero."""
        wmode = self.attrs.get("WMode", 0)
        return wmode != 0

    def set_attr(self, k: str, v: object) -> None:
        """Store attribute ``k`` with value ``v``, replacing any previous value."""
        self.attrs[k] = v

    def add_code2cid(self, code: str, cid: int) -> None:
        """Hook for registering a code -> CID mapping; ignored by the base class."""
        return None

    def add_cid2unichr(self, cid: int, code: "PSLiteral | bytes | int") -> None:
        """Hook for registering a CID -> unicode mapping; ignored by the base class."""
        return None

    def use_cmap(self, cmap: "CMapBase") -> None:
        """Hook for importing another map's entries; ignored by the base class."""
        return None

    def decode(self, code: bytes) -> Iterable[int]:
        """Translate ``code`` into CIDs. Subclasses must override this."""
        raise NotImplementedError
class CMap(CMapBase):
    """Character map backed by a nested dictionary of byte values.

    ``code2cid`` maps each byte of a character code to either a CID (an
    ``int`` leaf) or a further dictionary for the following byte.
    """

    def __init__(self, **kwargs: str | int) -> None:
        super().__init__(**kwargs)
        self.code2cid: dict[int, object] = {}

    def __repr__(self) -> str:
        return f"<CMap: {self.attrs.get('CMapName')}>"

    def use_cmap(self, cmap: CMapBase) -> None:
        """Copy every entry of ``cmap`` into this map.

        Nested tables are duplicated rather than shared. A nested table
        coming from ``cmap`` replaces whatever this map held under that key.
        """
        assert isinstance(cmap, CMap), str(type(cmap))

        def merge(target: dict[int, object], source: dict[int, object]) -> None:
            for key, value in source.items():
                if not isinstance(value, dict):
                    target[key] = value
                    continue
                branch: dict[int, object] = {}
                target[key] = branch
                merge(branch, value)

        merge(self.code2cid, cmap.code2cid)

    def decode(self, code: bytes) -> Iterator[int]:
        """Yield the CID of each character code found in ``code``.

        Bytes are followed through the nested table. A byte with no entry
        at the current level is skipped and the walk restarts from the top.
        """
        log.debug("decode: %r, %r", self, code)
        node = self.code2cid
        for byte in code:
            if byte not in node:
                # Unknown byte: drop the partial code and start over.
                node = self.code2cid
                continue
            entry = node[byte]
            if isinstance(entry, int):
                yield entry
                node = self.code2cid
            else:
                node = cast(dict[int, object], entry)

    def dump(
        self,
        out: TextIO = sys.stdout,
        code2cid: dict[int, object] | None = None,
        code: tuple[int, ...] = (),
    ) -> None:
        """Write one ``code ... = cid ...`` line per mapping, in sorted key order.

        ``code2cid`` and ``code`` carry the sub-table and the byte prefix
        during recursion. When ``code2cid`` is omitted the whole map is dumped.
        """
        if code2cid is None:
            code2cid, code = self.code2cid, ()
        for byte, entry in sorted(code2cid.items()):
            path = (*code, byte)
            if not isinstance(entry, int):
                self.dump(out=out, code2cid=cast(dict[int, object], entry), code=path)
            else:
                out.write(f"code {path!r} = cid {entry}\n")
class IdentityCMap(CMapBase):
    """Identity map: each big-endian 16-bit code is its own CID."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Unpack ``code`` as big-endian unsigned shorts.

        A trailing odd byte is ignored; fewer than two bytes yields ``()``.
        """
        count = len(code) // 2
        if not count:
            return ()
        return struct.unpack(f">{count}H", code[: count * 2])
class IdentityCMapByte(IdentityCMap):
    """Identity map for single-byte codes: each byte is its own CID."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Unpack ``code`` as unsigned bytes; empty input yields ``()``."""
        count = len(code)
        if not count:
            return ()
        return struct.unpack(f">{count}B", code)
class UnicodeMap(CMapBase):
    """Map from CIDs to unicode strings, stored in ``cid2unichr``."""

    def __init__(self, **kwargs: str | int) -> None:
        super().__init__(**kwargs)
        self.cid2unichr: dict[int, str] = {}

    def __repr__(self) -> str:
        return f"<UnicodeMap: {self.attrs.get('CMapName')}>"

    def get_unichr(self, cid: int) -> str:
        """Return the string mapped to ``cid``; raises ``KeyError`` if unmapped."""
        log.debug("get_unichr: %r, %r", self, cid)
        return self.cid2unichr[cid]

    def dump(self, out: TextIO = sys.stdout) -> None:
        """Write one ``cid ... = unicode ...`` line per entry, ordered by CID."""
        table = self.cid2unichr
        for cid in sorted(table):
            out.write(f"cid {cid} = unicode {table[cid]!r}\n")
class IdentityUnicodeMap(UnicodeMap):
    """Unicode map in which every CID is taken to be a unicode code point."""

    def get_unichr(self, cid: int) -> str:
        """Interpret character id as unicode codepoint"""
        log.debug("get_unichr: %r, %r", self, cid)
        character = chr(cid)
        return character
class FileCMap(CMap):
    """``CMap`` that can be filled one code at a time."""

    def add_code2cid(self, code: str, cid: int) -> None:
        """Register ``cid`` for the character code ``code``.

        Every character of ``code`` except the last selects (creating it when
        missing) one level of the nested ``code2cid`` table. The last
        character is the key under which ``cid`` is stored.
        """
        assert isinstance(code, str) and isinstance(cid, int), str(
            (type(code), type(cid)),
        )
        node = self.code2cid
        for char in code[:-1]:
            key = ord(char)
            if key not in node:
                node[key] = {}
            node = cast(dict[int, object], node[key])
        leaf_key = ord(code[-1])
        node[leaf_key] = cid
class FileUnicodeMap(UnicodeMap):
    """``UnicodeMap`` that can be filled one CID at a time."""

    def add_cid2unichr(self, cid: int, code: PSLiteral | bytes | int) -> None:
        """Register the unicode string for ``cid``.

        ``code`` may be a ``PSLiteral`` (looked up as an Adobe glyph name),
        ``bytes`` (decoded as UTF-16BE, undecodable parts dropped) or an
        ``int`` (taken as a code point). Any other type raises
        ``PDFTypeError``.
        """
        assert isinstance(cid, int), str(type(cid))
        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
            assert isinstance(code.name, str)
            text = name2unicode(code.name)
        elif isinstance(code, bytes):
            # Interpret as UTF-16BE.
            text = code.decode("UTF-16BE", "ignore")
        elif isinstance(code, int):
            text = chr(code)
        else:
            raise PDFTypeError(code)

        # A0 = non-breaking space, some weird fonts can have a collision on a cid here.
        # Keep an existing plain-space mapping rather than replacing it.
        keeps_plain_space = text == "\u00a0" and self.cid2unichr.get(cid) == " "
        if not keeps_plain_space:
            self.cid2unichr[cid] = text
class PyCMap(CMap):
    """``CMap`` whose table comes from an already loaded data object.

    ``module`` must expose ``CODE2CID`` (used directly as the code table,
    not copied) and ``IS_VERTICAL``.
    """

    def __init__(self, name: str, module: Any) -> None:
        super().__init__(CMapName=name)
        self.code2cid = module.CODE2CID
        vertical = module.IS_VERTICAL
        if vertical:
            self.attrs["WMode"] = 1
class PyUnicodeMap(UnicodeMap):
    """``UnicodeMap`` whose table comes from an already loaded data object.

    ``module`` must expose ``CID2UNICHR_V`` (used when ``vertical`` is true)
    or ``CID2UNICHR_H`` (otherwise). The chosen table is used directly.
    """

    def __init__(self, name: str, module: Any, vertical: bool) -> None:
        super().__init__(CMapName=name)
        if not vertical:
            self.cid2unichr = module.CID2UNICHR_H
            return
        self.cid2unichr = module.CID2UNICHR_V
        self.attrs["WMode"] = 1
class CMapDB:
    """Loads CMap data files by name and caches the resulting maps.

    Both caches are class attributes, so they are shared by every caller.
    """

    _cmap_cache: ClassVar[dict[str, PyCMap]] = {}
    _umap_cache: ClassVar[dict[str, list[PyUnicodeMap]]] = {}

    class CMapNotFound(CMapError):
        """Raised by ``_load_data`` when no data file is found for a name."""

    @staticmethod
    def _convert_code2cid_keys(
        d: Union[dict[str, object], int],
    ) -> Union[dict[int, object], int]:
        """Recursively convert string keys to integers in CODE2CID dictionaries.

        Non-dict values are returned unchanged. Keys that ``int()`` rejects
        are kept as they are.
        """
        if not isinstance(d, dict):
            return d
        converted: dict[int, object] = {}
        for raw_key, value in d.items():
            try:
                key = int(raw_key)
            except (ValueError, TypeError):
                key = raw_key  # type: ignore[assignment]
            # Leaves come back untouched, nested tables come back converted.
            converted[key] = CMapDB._convert_code2cid_keys(
                value,  # type: ignore[arg-type]
            )
        return converted

    @classmethod
    def _load_data(cls, name: str) -> type[Any]:
        """Load ``<name>.json.gz`` and return its content as a class object.

        The directory named by the ``CMAP_PATH`` environment variable
        (default ``/usr/share/pdfminer/``) is tried first, then the ``cmap``
        directory next to this module. NUL characters are removed from
        ``name``. A candidate is only opened when its resolved path lies
        inside the resolved search directory.

        Raises:
            CMapDB.CMapNotFound: no search directory contains the file.
        """
        name = name.replace("\0", "")
        log.debug("loading: %r", name)
        json_filename = f"{name}.json.gz"
        search_dirs = (
            os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
            os.path.join(os.path.dirname(__file__), "cmap"),
        )

        for directory in search_dirs:
            # Load JSON format (secure)
            json_path = os.path.join(directory, json_filename)
            resolved_json_path = os.path.realpath(json_path)
            resolved_directory = os.path.realpath(directory)

            # Check if resolved path is within the intended directory
            contained = resolved_json_path.startswith(resolved_directory + os.sep)
            if not (contained and os.path.exists(resolved_json_path)):
                continue

            log.debug("loading JSON: %r", json_path)
            with gzip.open(resolved_json_path, "rt", encoding="utf-8") as gzfile:
                data: dict[str, Any] = json.load(gzfile)
                # JSON object keys are strings; the CID tables need int keys.
                for table in ("CID2UNICHR_H", "CID2UNICHR_V"):
                    if table in data:
                        data[table] = {
                            int(cid): char for cid, char in data[table].items()
                        }
                # CODE2CID may also have numeric keys that need conversion
                if data.get("CODE2CID"):
                    data["CODE2CID"] = cls._convert_code2cid_keys(data["CODE2CID"])
                return type(str(name), (), data)

        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name: str) -> CMapBase:
        """Return the map called ``name``.

        The four identity names produce a fresh identity map on every call.
        Any other name is loaded through ``_load_data`` once and then served
        from the cache.
        """
        identity_maps = {
            "Identity-H": (IdentityCMap, 0),
            "Identity-V": (IdentityCMap, 1),
            "OneByteIdentityH": (IdentityCMapByte, 0),
            "OneByteIdentityV": (IdentityCMapByte, 1),
        }
        if name in identity_maps:
            factory, wmode = identity_maps[name]
            return factory(WMode=wmode)
        if name in cls._cmap_cache:
            return cls._cmap_cache[name]
        data = cls._load_data(name)
        cmap = PyCMap(name, data)
        cls._cmap_cache[name] = cmap
        return cmap

    @classmethod
    def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
        """Return the unicode map for ``name`` in the requested writing mode.

        Data comes from the resource ``to-unicode-<name>``. The horizontal
        and vertical maps are built together and cached as a two-item list
        indexed by ``vertical``.
        """
        if name in cls._umap_cache:
            return cls._umap_cache[name][vertical]
        data = cls._load_data(f"to-unicode-{name}")
        cls._umap_cache[name] = [
            PyUnicodeMap(name, data, False),
            PyUnicodeMap(name, data, True),
        ]
        return cls._umap_cache[name][vertical]
class CMapParser(PSStackParser[PSKeyword]):
    """Stack parser that feeds the operators of a CMap stream into a map.

    Operands accumulate on the parser stack; each recognised keyword consumes
    them and forwards the result to ``self.cmap``. Malformed range operands
    are skipped with a warning that is logged once per distinct message.
    """

    def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
        """Create a parser that reads from ``fp`` and populates ``cmap``."""
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # some ToUnicode maps don't have "begincmap" keyword.
        self._in_cmap = True
        # Messages already logged by _warn_once.
        self._warnings: set[str] = set()

    def run(self) -> None:
        """Call ``nextobject()``, treating ``PSEOF`` as the end of parsing."""
        with contextlib.suppress(PSEOF):
            self.nextobject()

    KEYWORD_BEGINCMAP = KWD(b"begincmap")
    KEYWORD_ENDCMAP = KWD(b"endcmap")
    KEYWORD_USECMAP = KWD(b"usecmap")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
    KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
    KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
    KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
    KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
    KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
    KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
    KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
    KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
    KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
    KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
    KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """ToUnicode CMaps

        See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.

        Dispatches on ``token``. ``begin*`` keywords clear the operand stack,
        the matching ``end*`` keywords consume the operands collected since.
        Keywords seen after ``endcmap`` (and before the next ``begincmap``)
        are ignored. Unrecognised keywords are pushed back on the stack as
        ``(pos, token)``.
        """
        if token is self.KEYWORD_BEGINCMAP:
            self._in_cmap = True
            self.popall()
            return

        elif token is self.KEYWORD_ENDCMAP:
            self._in_cmap = False
            return

        if not self._in_cmap:
            return

        if token is self.KEYWORD_DEF:
            # "<name> <value> def" sets an attribute on the map.
            try:
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            except PSSyntaxError:
                pass
            return

        if token is self.KEYWORD_USECMAP:
            # "<name> usecmap" imports another map; unknown names are ignored.
            try:
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            except PSSyntaxError:
                pass
            except CMapDB.CMapNotFound:
                pass
            return

        if token is self.KEYWORD_BEGINCODESPACERANGE:
            self.popall()
            return
        if token is self.KEYWORD_ENDCODESPACERANGE:
            self.popall()
            return

        if token is self.KEYWORD_BEGINCIDRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDRANGE:
            # Operands come in (start, end, cid) triples.
            objs = [obj for (__, obj) in self.popall()]
            for start_byte, end_byte, cid in choplist(3, objs):
                if not isinstance(start_byte, bytes):
                    self._warn_once("The start object of begincidrange is not a byte.")
                    continue
                if not isinstance(end_byte, bytes):
                    self._warn_once("The end object of begincidrange is not a byte.")
                    continue
                if not isinstance(cid, int):
                    self._warn_once(
                        "The cid object of begincidrange is not an integer.",
                    )
                    continue
                if len(start_byte) != len(end_byte):
                    self._warn_once(
                        "The start and end byte of begincidrange have "
                        "different lengths.",
                    )
                    continue
                # Only the last four bytes vary over the range; anything
                # before them must be the same in start and end.
                start_prefix = start_byte[:-4]
                end_prefix = end_byte[:-4]
                if start_prefix != end_prefix:
                    self._warn_once(
                        "The prefix of the start and end byte of "
                        "begincidrange are not the same.",
                    )
                    continue
                svar = start_byte[-4:]
                evar = end_byte[-4:]
                start = nunpack(svar)
                end = nunpack(evar)
                vlen = len(svar)
                for i in range(end - start + 1):
                    # Re-encode the counter at the width of the original code.
                    x = start_prefix + struct.pack(">L", start + i)[-vlen:]
                    self.cmap.add_cid2unichr(cid + i, x)
            return

        if token is self.KEYWORD_BEGINCIDCHAR:
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDCHAR:
            # Operands come in (cid, code) pairs; malformed pairs are skipped.
            objs = [obj for (__, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(code, bytes) and isinstance(cid, int):
                    self.cmap.add_cid2unichr(cid, code)
            return

        if token is self.KEYWORD_BEGINBFRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDBFRANGE:
            # Operands come in (start, end, code) triples, where code is
            # either a list with one value per CID or a single byte string
            # that is incremented across the range.
            objs = [obj for (__, obj) in self.popall()]
            for start_byte, end_byte, code in choplist(3, objs):
                if not isinstance(start_byte, bytes):
                    self._warn_once("The start object is not a byte.")
                    continue
                if not isinstance(end_byte, bytes):
                    self._warn_once("The end object is not a byte.")
                    continue
                if len(start_byte) != len(end_byte):
                    self._warn_once("The start and end byte have different lengths.")
                    continue
                start = nunpack(start_byte)
                end = nunpack(end_byte)
                if isinstance(code, list):
                    if len(code) != end - start + 1:
                        self._warn_once(
                            "The difference between the start and end "
                            "offsets does not match the code length.",
                        )
                    # On a length mismatch only the overlapping part is mapped.
                    for cid, unicode_value in zip(
                        range(start, end + 1), code, strict=False
                    ):
                        self.cmap.add_cid2unichr(cid, unicode_value)
                elif isinstance(code, bytes):
                    var = code[-4:]
                    base = nunpack(var)
                    prefix = code[:-4]
                    vlen = len(var)
                    for i in range(end - start + 1):
                        x = prefix + struct.pack(">L", base + i)[-vlen:]
                        self.cmap.add_cid2unichr(start + i, x)
                else:
                    # The operand comes from the PDF, so validate it like the
                    # other operands instead of asserting: an assert would
                    # abort the parse, and is stripped entirely under -O.
                    self._warn_once(
                        "The code object of beginbfrange is neither a list "
                        "nor a byte.",
                    )
                    continue
            return

        if token is self.KEYWORD_BEGINBFCHAR:
            self.popall()
            return

        if token is self.KEYWORD_ENDBFCHAR:
            # Operands come in (cid, code) pairs of byte strings.
            objs = [obj for (__, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(cid, bytes) and isinstance(code, bytes):
                    self.cmap.add_cid2unichr(nunpack(cid), code)
            return

        if token is self.KEYWORD_BEGINNOTDEFRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDNOTDEFRANGE:
            self.popall()
            return

        self.push((pos, token))

    def _warn_once(self, msg: str) -> None:
        """Warn once for each unique message"""
        if msg not in self._warnings:
            self._warnings.add(msg)
            base_msg = (
                "Ignoring (part of) ToUnicode map because the PDF data "
                "does not conform to the format. This could result in "
                "(cid) values in the output. "
            )
            log.warning(base_msg + msg)