Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfminer/cmapdb.py: 25%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Adobe character mapping (CMap) support.
3CMaps provide the mapping between character codes and Unicode
4code-points to character ids (CIDs).
6More information is available on:
8 https://github.com/adobe-type-tools/cmap-resources
10"""
12import contextlib
13import gzip
14import logging
15import os
16import os.path
17import pickle as pickle
18import struct
19import sys
20from collections.abc import Iterable, Iterator, MutableMapping
21from typing import (
22 Any,
23 BinaryIO,
24 ClassVar,
25 TextIO,
26 cast,
27)
29from pdfminer.encodingdb import name2unicode
30from pdfminer.pdfexceptions import PDFException, PDFTypeError
31from pdfminer.psexceptions import PSEOF, PSSyntaxError
32from pdfminer.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name
33from pdfminer.utils import choplist, nunpack
35log = logging.getLogger(__name__)
class CMapError(PDFException):
    """Base class for the CMap-related errors defined in this module."""
class CMapBase:
    """Common interface of the character-map classes.

    Keyword arguments passed to the constructor are kept in ``attrs``.
    The ``add_*`` and ``use_cmap`` hooks are no-ops here, and ``decode``
    has to be supplied by a subclass.
    """

    debug = 0

    def __init__(self, **kwargs: object) -> None:
        """Store a private copy of the keyword arguments as attributes."""
        self.attrs: MutableMapping[str, object] = dict(kwargs)

    def is_vertical(self) -> bool:
        """Return whether the ``WMode`` attribute is set to a non-zero value."""
        writing_mode = self.attrs.get("WMode", 0)
        return writing_mode != 0

    def set_attr(self, k: str, v: object) -> None:
        """Set attribute *k* to *v*, replacing any previous value."""
        self.attrs[k] = v

    def add_code2cid(self, code: str, cid: int) -> None:
        """Hook for registering a code -> CID entry; ignored by default."""

    def add_cid2unichr(self, cid: int, code: PSLiteral | bytes | int) -> None:
        """Hook for registering a CID -> unicode entry; ignored by default."""

    def use_cmap(self, cmap: "CMapBase") -> None:
        """Hook for importing the entries of *cmap*; ignored by default."""

    def decode(self, code: bytes) -> Iterable[int]:
        """Translate *code* into CIDs.

        Raises:
            NotImplementedError: always; subclasses provide the decoding.
        """
        raise NotImplementedError
class CMap(CMapBase):
    """Character map backed by a nested ``code2cid`` lookup table.

    Every level of ``code2cid`` is keyed by a single byte value. A value is
    either an ``int`` CID, which ends a code, or another dict that handles
    the next byte of a multi-byte code.
    """

    def __init__(self, **kwargs: str | int) -> None:
        """Initialise the attributes and start with an empty table."""
        super().__init__(**kwargs)
        self.code2cid: dict[int, object] = {}

    def __repr__(self) -> str:
        cmap_name = self.attrs.get("CMapName")
        return f"<CMap: {cmap_name}>"

    def use_cmap(self, cmap: CMapBase) -> None:
        """Deep-copy the lookup table of *cmap* into this map.

        Entries already present under the same top-level key are replaced,
        nested tables included.
        """
        assert isinstance(cmap, CMap), str(type(cmap))

        # Explicit work list instead of recursion: each item pairs a table
        # to fill with the table it is filled from.
        pending: list[tuple[dict[int, object], dict[int, object]]] = [
            (self.code2cid, cmap.code2cid),
        ]
        while pending:
            target, source = pending.pop()
            for key, value in source.items():
                if not isinstance(value, dict):
                    target[key] = value
                    continue
                child: dict[int, object] = {}
                target[key] = child
                pending.append((child, value))

    def decode(self, code: bytes) -> Iterator[int]:
        """Yield the CID of every code found while walking *code* bytewise.

        A byte with no entry at the current level resets the walk to the
        top-level table without yielding anything.
        """
        log.debug("decode: %r, %r", self, code)
        node = self.code2cid
        for byte in code:
            if byte not in node:
                node = self.code2cid
                continue
            entry = node[byte]
            if isinstance(entry, int):
                yield entry
                node = self.code2cid
            else:
                node = cast(dict[int, object], entry)

    def dump(
        self,
        out: TextIO = sys.stdout,
        code2cid: dict[int, object] | None = None,
        code: tuple[int, ...] = (),
    ) -> None:
        """Write one ``code (...) = cid N`` line per entry to *out*.

        Entries are visited in ascending key order at every level. When
        *code2cid* is omitted the whole table is dumped and *code* is reset
        to the empty prefix.
        """
        if code2cid is None:
            code2cid, code = self.code2cid, ()
        for key in sorted(code2cid):
            value = code2cid[key]
            path = (*code, key)
            if isinstance(value, int):
                out.write(f"code {path!r} = cid {value}\n")
                continue
            self.dump(out=out, code2cid=cast(dict[int, object], value), code=path)
class IdentityCMap(CMapBase):
    """CMap that reads every two bytes as one big-endian 16-bit value."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Unpack *code* into unsigned 16-bit integers.

        A trailing odd byte is dropped; fewer than two bytes give ``()``.
        """
        pairs = len(code) // 2
        if not pairs:
            return ()
        return struct.unpack(f">{pairs}H", code[: pairs * 2])
class IdentityCMapByte(IdentityCMap):
    """Identity CMap variant that reads every single byte as one value."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Unpack *code* into unsigned 8-bit integers; empty input gives ``()``."""
        count = len(code)
        if not count:
            return ()
        return struct.unpack(f">{count}B", code[:count])
class UnicodeMap(CMapBase):
    """Map from character id (CID) to unicode text, held in ``cid2unichr``."""

    def __init__(self, **kwargs: str | int) -> None:
        """Initialise the attributes and start with an empty mapping."""
        super().__init__(**kwargs)
        self.cid2unichr: dict[int, str] = {}

    def __repr__(self) -> str:
        cmap_name = self.attrs.get("CMapName")
        return f"<UnicodeMap: {cmap_name}>"

    def get_unichr(self, cid: int) -> str:
        """Return the text stored for *cid*.

        Raises:
            KeyError: if *cid* has no entry.
        """
        log.debug("get_unichr: %r, %r", self, cid)
        return self.cid2unichr[cid]

    def dump(self, out: TextIO = sys.stdout) -> None:
        """Write one ``cid N = unicode '...'`` line per entry, sorted by CID."""
        for cid in sorted(self.cid2unichr):
            text = self.cid2unichr[cid]
            out.write(f"cid {cid} = unicode {text!r}\n")
class IdentityUnicodeMap(UnicodeMap):
    """Unicode map that treats each character id as a code point."""

    def get_unichr(self, cid: int) -> str:
        """Interpret character id as unicode codepoint"""
        log.debug("get_unichr: %r, %r", self, cid)
        character = chr(cid)
        return character
class FileCMap(CMap):
    """CMap whose lookup table is filled one code at a time."""

    def add_code2cid(self, code: str, cid: int) -> None:
        """Register *cid* for *code*.

        Every character of *code* except the last selects (creating it when
        missing) one nested level of ``code2cid``, keyed by ``ord`` of the
        character; the last character is the key under which *cid* is stored.
        """
        assert isinstance(code, str) and isinstance(cid, int), str(
            (type(code), type(cid)),
        )
        node = self.code2cid
        for char in code[:-1]:
            key = ord(char)
            if key not in node:
                node[key] = {}
            node = cast(dict[int, object], node[key])
        node[ord(code[-1])] = cid
class FileUnicodeMap(UnicodeMap):
    """Unicode map whose entries are added one CID at a time."""

    def add_cid2unichr(self, cid: int, code: PSLiteral | bytes | int) -> None:
        """Store the text described by *code* under *cid*.

        *code* is converted according to its type: a ``PSLiteral`` is looked
        up as an Adobe glyph name, ``bytes`` are decoded as UTF-16BE
        (ignoring undecodable data) and an ``int`` is taken as a code point.

        Raises:
            PDFTypeError: if *code* has any other type.
        """
        assert isinstance(cid, int), str(type(cid))
        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
            assert isinstance(code.name, str)
            text = name2unicode(code.name)
        elif isinstance(code, bytes):
            # Interpret as UTF-16BE.
            text = code.decode("UTF-16BE", "ignore")
        elif isinstance(code, int):
            text = chr(code)
        else:
            raise PDFTypeError(code)

        # A0 = non-breaking space, some weird fonts can have a collision on a
        # cid here: keep an already stored plain space in that case.
        nbsp_over_space = text == "\u00a0" and self.cid2unichr.get(cid) == " "
        if not nbsp_over_space:
            self.cid2unichr[cid] = text
class PyCMap(CMap):
    """CMap whose lookup table is taken from a loaded data object."""

    def __init__(self, name: str, module: Any) -> None:
        """Adopt ``module.CODE2CID`` as the table of the CMap called *name*.

        ``WMode`` is set to 1 when ``module.IS_VERTICAL`` is truthy.
        """
        super().__init__(CMapName=name)
        self.code2cid = module.CODE2CID
        vertical = module.IS_VERTICAL
        if vertical:
            self.attrs["WMode"] = 1
class PyUnicodeMap(UnicodeMap):
    """Unicode map whose mapping is taken from a loaded data object."""

    def __init__(self, name: str, module: Any, vertical: bool) -> None:
        """Adopt the horizontal or vertical mapping of *module*.

        With *vertical* set, ``module.CID2UNICHR_V`` is used and ``WMode``
        becomes 1; otherwise ``module.CID2UNICHR_H`` is used.
        """
        super().__init__(CMapName=name)
        if not vertical:
            self.cid2unichr = module.CID2UNICHR_H
            return
        self.cid2unichr = module.CID2UNICHR_V
        self.attrs["WMode"] = 1
class CMapDB:
    """Loads CMap data files by name and caches the resulting map objects."""

    _cmap_cache: ClassVar[dict[str, PyCMap]] = {}
    _umap_cache: ClassVar[dict[str, list[PyUnicodeMap]]] = {}

    class CMapNotFound(CMapError):
        """Raised when no data file exists for the requested name."""

    @classmethod
    def _load_data(cls, name: str) -> Any:
        """Load ``<name>.pickle.gz`` from the first directory that has it.

        The directory from the ``CMAP_PATH`` environment variable (default
        ``/usr/share/pdfminer/``) is searched before the ``cmap`` directory
        next to this module. The unpickled dict becomes the namespace of a
        new class named after *name*.

        Raises:
            CMapDB.CMapNotFound: if no search directory contains the file.
        """
        name = name.replace("\0", "")
        filename = f"{name}.pickle.gz"
        log.debug("loading: %r", name)
        search_dirs = (
            os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
            os.path.join(os.path.dirname(__file__), "cmap"),
        )
        for directory in search_dirs:
            # Resolve both paths so that a name containing ".." or a symlink
            # cannot make us read a file outside the search directory.
            real_dir = os.path.realpath(directory)
            real_file = os.path.realpath(os.path.join(directory, filename))
            if not real_file.startswith(real_dir + os.sep):
                continue
            if not os.path.exists(real_file):
                continue
            with gzip.open(real_file) as gzfile:
                # SECURITY NOTE(review): pickle.loads executes arbitrary code
                # from the file. The search directories (including the one
                # named by CMAP_PATH) must only hold trusted data.
                namespace = pickle.loads(gzfile.read())
            return type(str(name), (), namespace)
        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name: str) -> CMapBase:
        """Return the CMap called *name*.

        The four identity names produce a fresh identity CMap on every call;
        any other name is loaded once and then served from the cache.
        """
        identity_maps: dict[str, tuple[type[CMapBase], int]] = {
            "Identity-H": (IdentityCMap, 0),
            "Identity-V": (IdentityCMap, 1),
            "OneByteIdentityH": (IdentityCMapByte, 0),
            "OneByteIdentityV": (IdentityCMapByte, 1),
        }
        if name in identity_maps:
            factory, wmode = identity_maps[name]
            return factory(WMode=wmode)

        if name not in cls._cmap_cache:
            cls._cmap_cache[name] = PyCMap(name, cls._load_data(name))
        return cls._cmap_cache[name]

    @classmethod
    def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
        """Return the horizontal or vertical unicode map called *name*.

        Both variants are built from the ``to-unicode-<name>`` data on first
        use and cached as a two-item list indexed by *vertical*.
        """
        if name not in cls._umap_cache:
            data = cls._load_data(f"to-unicode-{name}")
            cls._umap_cache[name] = [
                PyUnicodeMap(name, data, is_vertical) for is_vertical in (False, True)
            ]
        return cls._umap_cache[name][vertical]
class CMapParser(PSStackParser[PSKeyword]):
    """Parser that feeds the entries of a CMap stream into a ``CMapBase``.

    Objects are collected on the parser stack until an ``end...`` keyword is
    seen; the collected operands are then passed to the target map through
    ``set_attr``, ``use_cmap`` or ``add_cid2unichr``. Malformed entries are
    skipped with a warning that is logged once per distinct message.
    """

    def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
        """Prepare to parse the stream *fp* into *cmap*."""
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # some ToUnicode maps don't have "begincmap" keyword.
        self._in_cmap = True
        # Messages already logged by _warn_once.
        self._warnings: set[str] = set()

    def run(self) -> None:
        """Parse the stream; reaching the end of the data is not an error."""
        with contextlib.suppress(PSEOF):
            self.nextobject()

    KEYWORD_BEGINCMAP = KWD(b"begincmap")
    KEYWORD_ENDCMAP = KWD(b"endcmap")
    KEYWORD_USECMAP = KWD(b"usecmap")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
    KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
    KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
    KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
    KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
    KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
    KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
    KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
    KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
    KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
    KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
    KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")

    # Keywords whose only effect is to discard whatever is on the stack.
    _DISCARD_KEYWORDS = (
        KEYWORD_BEGINCODESPACERANGE,
        KEYWORD_ENDCODESPACERANGE,
        KEYWORD_BEGINCIDRANGE,
        KEYWORD_BEGINCIDCHAR,
        KEYWORD_BEGINBFRANGE,
        KEYWORD_BEGINBFCHAR,
        KEYWORD_BEGINNOTDEFRANGE,
        KEYWORD_ENDNOTDEFRANGE,
    )

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """ToUnicode CMaps

        See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.

        Dispatches on *token*. Keywords that this parser does not handle
        are pushed on the stack together with *pos*. Everything except
        ``begincmap``/``endcmap`` is ignored while outside a cmap section.
        """
        if token is self.KEYWORD_BEGINCMAP:
            self._in_cmap = True
            self.popall()
            return
        if token is self.KEYWORD_ENDCMAP:
            self._in_cmap = False
            return
        if not self._in_cmap:
            return

        if token is self.KEYWORD_DEF:
            # Best effort: a definition that cannot be read is skipped.
            with contextlib.suppress(PSSyntaxError):
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            return

        if token is self.KEYWORD_USECMAP:
            # Best effort: an unreadable or unknown base CMap is skipped.
            with contextlib.suppress(PSSyntaxError, CMapDB.CMapNotFound):
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            return

        if any(token is keyword for keyword in self._DISCARD_KEYWORDS):
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDRANGE:
            self._add_cid_ranges([obj for (__, obj) in self.popall()])
            return

        if token is self.KEYWORD_ENDCIDCHAR:
            objs = [obj for (__, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(code, bytes) and isinstance(cid, int):
                    self.cmap.add_cid2unichr(cid, code)
            return

        if token is self.KEYWORD_ENDBFRANGE:
            self._add_bf_ranges([obj for (__, obj) in self.popall()])
            return

        if token is self.KEYWORD_ENDBFCHAR:
            objs = [obj for (__, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(cid, bytes) and isinstance(code, bytes):
                    self.cmap.add_cid2unichr(nunpack(cid), code)
            return

        self.push((pos, token))

    @staticmethod
    def _pack_tail(value: int, width: int) -> bytes:
        """Return the last *width* bytes of *value* packed as big-endian uint32.

        The slice start is computed explicitly because ``packed[-0:]`` would
        return all four bytes instead of none when *width* is 0.
        """
        packed = struct.pack(">L", value)
        return packed[len(packed) - width :]

    def _add_cid_ranges(self, objs: Iterable[Any]) -> None:
        """Handle the ``(start, end, cid)`` triples of an ``endcidrange``."""
        for start_byte, end_byte, cid in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object of begincidrange is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object of begincidrange is not a byte.")
                continue
            if not isinstance(cid, int):
                self._warn_once(
                    "The cid object of begincidrange is not an integer.",
                )
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once(
                    "The start and end byte of begincidrange have "
                    "different lengths.",
                )
                continue
            # Only the last (up to) four bytes vary across the range; the
            # bytes in front of them must be identical for start and end.
            start_prefix = start_byte[:-4]
            end_prefix = end_byte[:-4]
            if start_prefix != end_prefix:
                self._warn_once(
                    "The prefix of the start and end byte of "
                    "begincidrange are not the same.",
                )
                continue
            svar = start_byte[-4:]
            evar = end_byte[-4:]
            start = nunpack(svar)
            end = nunpack(evar)
            vlen = len(svar)
            for i in range(end - start + 1):
                x = start_prefix + self._pack_tail(start + i, vlen)
                self.cmap.add_cid2unichr(cid + i, x)

    def _add_bf_ranges(self, objs: Iterable[Any]) -> None:
        """Handle the ``(start, end, code)`` triples of an ``endbfrange``.

        *code* is either a list with one destination per source code, or a
        byte string whose last (up to) four bytes are incremented for each
        successive source code.
        """
        for start_byte, end_byte, code in choplist(3, objs):
            if not isinstance(start_byte, bytes):
                self._warn_once("The start object is not a byte.")
                continue
            if not isinstance(end_byte, bytes):
                self._warn_once("The end object is not a byte.")
                continue
            if len(start_byte) != len(end_byte):
                self._warn_once("The start and end byte have different lengths.")
                continue
            start = nunpack(start_byte)
            end = nunpack(end_byte)
            if isinstance(code, list):
                if len(code) != end - start + 1:
                    self._warn_once(
                        "The difference between the start and end "
                        "offsets does not match the code length.",
                    )
                # Deliberately non-strict: map as many codes as are available.
                for cid, unicode_value in zip(
                    range(start, end + 1), code, strict=False
                ):
                    self.cmap.add_cid2unichr(cid, unicode_value)
            elif isinstance(code, bytes):
                var = code[-4:]
                base = nunpack(var)
                prefix = code[:-4]
                vlen = len(var)
                for i in range(end - start + 1):
                    x = prefix + self._pack_tail(base + i, vlen)
                    self.cmap.add_cid2unichr(start + i, x)
            else:
                # Malformed input must not abort parsing with an assertion.
                self._warn_once(
                    "The code object of beginbfrange is neither a list "
                    "nor a byte.",
                )

    def _warn_once(self, msg: str) -> None:
        """Warn once for each unique message"""
        if msg in self._warnings:
            return
        self._warnings.add(msg)
        base_msg = (
            "Ignoring (part of) ToUnicode map because the PDF data "
            "does not conform to the format. This could result in "
            "(cid) values in the output. "
        )
        log.warning(base_msg + msg)