Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/cmapdb.py: 69%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
"""Adobe character mapping (CMap) support.

CMaps provide the mapping between character codes and Unicode
code-points to character ids (CIDs).

More information is available on:

  https://github.com/adobe-type-tools/cmap-resources

"""
12import gzip
13import logging
14import os
15import os.path
16import pickle as pickle
17import struct
18import sys
19from typing import (
20 Any,
21 BinaryIO,
22 Dict,
23 Iterable,
24 Iterator,
25 List,
26 MutableMapping,
27 Optional,
28 Set,
29 TextIO,
30 Tuple,
31 Union,
32 cast,
33)
35from pdfminer.encodingdb import name2unicode
36from pdfminer.pdfexceptions import PDFException, PDFTypeError
37from pdfminer.psexceptions import PSEOF, PSSyntaxError
38from pdfminer.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name
39from pdfminer.utils import choplist, nunpack
41log = logging.getLogger(__name__)
class CMapError(PDFException):
    """Base class for CMap-related errors defined in this module."""
class CMapBase:
    """Interface shared by every character map in this module.

    Keyword arguments given at construction become the initial entries of
    ``attrs``.  The ``add_*`` and ``use_cmap`` hooks do nothing here, and
    ``decode`` has to be supplied by a subclass.
    """

    debug = 0

    def __init__(self, **kwargs: object) -> None:
        # Keep our own dict so later set_attr calls only affect this map.
        self.attrs: MutableMapping[str, object] = dict(kwargs)

    def is_vertical(self) -> bool:
        """Return True when the "WMode" attribute is anything other than 0."""
        wmode = self.attrs.get("WMode", 0)
        return wmode != 0

    def set_attr(self, k: str, v: object) -> None:
        """Store ``v`` under attribute name ``k``, replacing any earlier value."""
        self.attrs[k] = v

    def add_code2cid(self, code: str, cid: int) -> None:
        """Hook for registering a code -> CID entry; does nothing here."""

    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
        """Hook for registering a CID -> unicode entry; does nothing here."""

    def use_cmap(self, cmap: "CMapBase") -> None:
        """Hook for importing the entries of another map; does nothing here."""

    def decode(self, code: bytes) -> Iterable[int]:
        """Translate ``code`` into CIDs; subclasses must override this."""
        raise NotImplementedError
class CMap(CMapBase):
    """Character map backed by the nested ``code2cid`` lookup table.

    Every level of ``code2cid`` is keyed by a single byte value.  A value is
    either an ``int`` (the CID for the byte sequence that led there) or a
    further dict that is consulted with the next byte.
    """

    def __init__(self, **kwargs: Union[str, int]) -> None:
        CMapBase.__init__(self, **kwargs)
        self.code2cid: Dict[int, object] = {}

    def __repr__(self) -> str:
        return "<CMap: %s>" % self.attrs.get("CMapName")

    def use_cmap(self, cmap: CMapBase) -> None:
        """Copy every entry of ``cmap.code2cid`` into this map's table.

        Nested dicts are rebuilt rather than shared, so later changes to this
        map do not leak into ``cmap``.  A nested dict coming from ``cmap``
        replaces whatever this map held under the same key.
        """
        assert isinstance(cmap, CMap), str(type(cmap))

        # Explicit work list of (destination, source) dict pairs still to copy.
        pending: List[Tuple[Dict[int, object], Dict[int, object]]] = [
            (self.code2cid, cmap.code2cid),
        ]
        while pending:
            target, source = pending.pop()
            for key, value in source.items():
                if isinstance(value, dict):
                    branch: Dict[int, object] = {}
                    target[key] = branch
                    pending.append((branch, value))
                else:
                    target[key] = value

    def decode(self, code: bytes) -> Iterator[int]:
        """Yield the CIDs found while walking ``code`` byte by byte.

        A byte with no entry at the current level is dropped and the walk
        restarts from the top of the table.
        """
        log.debug("decode: %r, %r", self, code)
        node = self.code2cid
        for byte in code:
            if byte not in node:
                # Unknown byte: abandon the partial sequence.
                node = self.code2cid
                continue
            entry = node[byte]
            if isinstance(entry, int):
                yield entry
                node = self.code2cid
            else:
                # Intermediate level: descend and wait for the next byte.
                node = cast(Dict[int, object], entry)

    def dump(
        self,
        out: TextIO = sys.stdout,
        code2cid: Optional[Dict[int, object]] = None,
        code: Tuple[int, ...] = (),
    ) -> None:
        """Write one ``code (...) = cid N`` line per entry, in key order.

        ``code2cid`` and ``code`` are used by the recursion to carry the
        current sub-table and the byte prefix that led to it; when
        ``code2cid`` is omitted the whole table is dumped with an empty prefix.
        """
        if code2cid is None:
            code2cid = self.code2cid
            code = ()
        for key in sorted(code2cid):
            value = code2cid[key]
            sequence = code + (key,)
            if isinstance(value, int):
                out.write("code %r = cid %d\n" % (sequence, value))
            else:
                self.dump(
                    out=out,
                    code2cid=cast(Dict[int, object], value),
                    code=sequence,
                )
class IdentityCMap(CMapBase):
    """CMap whose CIDs are the big-endian 16-bit values of the input bytes."""

    def decode(self, code: bytes) -> Tuple[int, ...]:
        """Interpret ``code`` as consecutive big-endian unsigned 16-bit CIDs.

        :param code: raw character codes.
        :return: one CID per complete 2-byte pair; an empty tuple when
            ``code`` holds fewer than two bytes.  A dangling odd trailing
            byte is ignored.
        """
        n = len(code) // 2
        if not n:
            return ()
        # struct.unpack requires the buffer size to match the format exactly,
        # so an odd-length input would raise struct.error; decode only the
        # complete pairs instead of failing on the dangling byte.
        return struct.unpack(">%dH" % n, code[: n * 2])
class IdentityCMapByte(IdentityCMap):
    """Identity map in which every single input byte is its own CID."""

    def decode(self, code: bytes) -> Tuple[int, ...]:
        """Return the unsigned value of each byte of ``code`` as a tuple."""
        length = len(code)
        return struct.unpack(">%dB" % length, code) if length else ()
class UnicodeMap(CMapBase):
    """Map from CIDs to unicode strings, stored in ``cid2unichr``."""

    def __init__(self, **kwargs: Union[str, int]) -> None:
        CMapBase.__init__(self, **kwargs)
        self.cid2unichr: Dict[int, str] = {}

    def __repr__(self) -> str:
        return "<UnicodeMap: %s>" % self.attrs.get("CMapName")

    def get_unichr(self, cid: int) -> str:
        """Return the string stored for ``cid``; KeyError if there is none."""
        log.debug("get_unichr: %r, %r", self, cid)
        table = self.cid2unichr
        return table[cid]

    def dump(self, out: TextIO = sys.stdout) -> None:
        """Write one ``cid N = unicode '...'`` line per entry, ordered by CID."""
        table = self.cid2unichr
        for cid in sorted(table):
            out.write("cid %d = unicode %r\n" % (cid, table[cid]))
class IdentityUnicodeMap(UnicodeMap):
    """Unicode map that treats every CID as a unicode code point."""

    def get_unichr(self, cid: int) -> str:
        """Return the character whose code point equals ``cid``."""
        log.debug("get_unichr: %r, %r", self, cid)
        codepoint = cid
        return chr(codepoint)
class FileCMap(CMap):
    """CMap whose ``code2cid`` table is filled one entry at a time."""

    def add_code2cid(self, code: str, cid: int) -> None:
        """Register ``cid`` for the character sequence ``code``.

        Every character except the last selects (creating it when absent) one
        nested level of ``code2cid``, keyed by the character's ordinal; the
        last character's ordinal is the key under which ``cid`` is stored.
        """
        assert isinstance(code, str) and isinstance(cid, int), str(
            (type(code), type(cid)),
        )
        node = self.code2cid
        for char in code[:-1]:
            key = ord(char)
            if key not in node:
                node[key] = {}
            node = cast(Dict[int, object], node[key])
        leaf_key = ord(code[-1])
        node[leaf_key] = cid
class FileUnicodeMap(UnicodeMap):
    """Unicode map whose ``cid2unichr`` table is filled one entry at a time."""

    def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
        """Record the unicode text for ``cid``.

        ``code`` is converted according to its type: a ``PSLiteral`` is looked
        up as a glyph name, ``bytes`` are decoded as UTF-16BE (undecodable
        parts are ignored) and an ``int`` is taken as a code point.  Any other
        type raises ``PDFTypeError``.
        """
        assert isinstance(cid, int), str(type(cid))
        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
            assert isinstance(code.name, str)
            text = name2unicode(code.name)
        elif isinstance(code, bytes):
            # Interpret as UTF-16BE.
            text = code.decode("UTF-16BE", "ignore")
        elif isinstance(code, int):
            text = chr(code)
        else:
            raise PDFTypeError(code)

        # A0 = non-breaking space.  Some odd fonts map it onto a cid that
        # already holds a plain space; in that case keep the plain space.
        nbsp_collision = text == "\u00a0" and self.cid2unichr.get(cid) == " "
        if not nbsp_collision:
            self.cid2unichr[cid] = text
class PyCMap(CMap):
    """CMap that takes its table from an already loaded data object."""

    def __init__(self, name: str, module: Any) -> None:
        """Adopt ``module.CODE2CID`` as the table and honour ``module.IS_VERTICAL``.

        The table object is used as-is, not copied.
        """
        super().__init__(CMapName=name)
        self.code2cid = module.CODE2CID
        vertical = module.IS_VERTICAL
        if vertical:
            self.attrs["WMode"] = 1
class PyUnicodeMap(UnicodeMap):
    """Unicode map that takes its table from an already loaded data object."""

    def __init__(self, name: str, module: Any, vertical: bool) -> None:
        """Adopt ``module.CID2UNICHR_V`` or ``module.CID2UNICHR_H``.

        The vertical table is chosen, and "WMode" set to 1, when ``vertical``
        is true; otherwise the horizontal table is used and "WMode" is left
        unset.  The table object is used as-is, not copied.
        """
        super().__init__(CMapName=name)
        if not vertical:
            self.cid2unichr = module.CID2UNICHR_H
            return
        self.cid2unichr = module.CID2UNICHR_V
        self.attrs["WMode"] = 1
class CMapDB:
    """Loader and class-level cache for CMaps stored as ``*.pickle.gz`` files."""

    # Both caches live on the class, so they are shared by all callers.
    _cmap_cache: Dict[str, PyCMap] = {}
    _umap_cache: Dict[str, List[PyUnicodeMap]] = {}

    class CMapNotFound(CMapError):
        """Raised when no data file can be found for a CMap name."""

    @classmethod
    def _load_data(cls, name: str) -> Any:
        """Load ``<name>.pickle.gz`` from the first search directory holding it.

        The directories searched are ``$CMAP_PATH`` (default
        ``/usr/share/pdfminer/``) and the ``cmap`` directory next to this
        module, in that order.

        :param name: CMap name; NUL characters are stripped first.
        :return: a new class named ``name`` whose attributes are the entries
            of the unpickled dict.
        :raises CMapDB.CMapNotFound: if no file exists for ``name`` inside
            the search directories.
        """
        name = name.replace("\0", "")
        filename = "%s.pickle.gz" % name
        log.debug("loading: %r", name)
        cmap_paths = (
            os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
            os.path.join(os.path.dirname(__file__), "cmap"),
        )
        for directory in cmap_paths:
            # The name can originate from a parsed CMap stream ("usecmap"), so
            # it must not be able to select a file outside the search
            # directory via "../" components or an absolute path.
            root = os.path.join(os.path.abspath(directory), "")
            path = os.path.abspath(os.path.join(directory, filename))
            if not path.startswith(root):
                continue
            if os.path.exists(path):
                # SECURITY: pickle.loads executes arbitrary code from the
                # file; the CMap directories must only hold trusted data.
                with gzip.open(path) as gzfile:
                    return type(str(name), (), pickle.loads(gzfile.read()))
        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name: str) -> CMapBase:
        """Return the CMap called ``name``.

        The four identity maps are built on every call; any other map is
        loaded from disk once and then served from the class-level cache.

        :raises CMapDB.CMapNotFound: if ``name`` has no data file.
        """
        if name == "Identity-H":
            return IdentityCMap(WMode=0)
        elif name == "Identity-V":
            return IdentityCMap(WMode=1)
        elif name == "OneByteIdentityH":
            return IdentityCMapByte(WMode=0)
        elif name == "OneByteIdentityV":
            return IdentityCMapByte(WMode=1)
        try:
            return cls._cmap_cache[name]
        except KeyError:
            pass
        data = cls._load_data(name)
        cls._cmap_cache[name] = cmap = PyCMap(name, data)
        return cmap

    @classmethod
    def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
        """Return the unicode map ``to-unicode-<name>`` for one writing mode.

        Both the horizontal and the vertical variant are built on the first
        request and cached as a two-element list indexed by ``vertical``.

        :raises CMapDB.CMapNotFound: if the map has no data file.
        """
        try:
            return cls._umap_cache[name][vertical]
        except KeyError:
            pass
        data = cls._load_data("to-unicode-%s" % name)
        cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)]
        return cls._umap_cache[name][vertical]
class CMapParser(PSStackParser[PSKeyword]):
    """Parser that applies the operators of a CMap stream to a ``CMapBase``.

    Operands accumulate on the parser stack inherited from ``PSStackParser``;
    when an ``end...`` keyword arrives they are taken off the stack and handed
    to the target map.  Malformed entries are skipped with a warning that is
    logged once per distinct message.
    """

    def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
        """Create a parser that reads ``fp`` and populates ``cmap``."""
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # some ToUnicode maps don't have "begincmap" keyword.
        self._in_cmap = True
        # Messages that _warn_once has already logged.
        self._warnings: Set[str] = set()

    def run(self) -> None:
        """Call ``nextobject`` and treat ``PSEOF`` as normal termination."""
        try:
            self.nextobject()
        except PSEOF:
            pass

    KEYWORD_BEGINCMAP = KWD(b"begincmap")
    KEYWORD_ENDCMAP = KWD(b"endcmap")
    KEYWORD_USECMAP = KWD(b"usecmap")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
    KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
    KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
    KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
    KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
    KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
    KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
    KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
    KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
    KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
    KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
    KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """ToUnicode CMaps

        See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.

        Handle one keyword: the ``begin...`` keywords clear the operand
        stack, the ``end...`` keywords consume the operands collected since
        then, and any keyword not listed above is pushed back onto the stack
        as an operand.  Everything except ``begincmap``/``endcmap`` is ignored
        while outside a cmap section.

        :param pos: position of the token, stored with it when it is pushed.
        :param token: the keyword to handle.
        """
        if token is self.KEYWORD_BEGINCMAP:
            self._in_cmap = True
            self.popall()
            return

        elif token is self.KEYWORD_ENDCMAP:
            self._in_cmap = False
            return

        if not self._in_cmap:
            return

        if token is self.KEYWORD_DEF:
            # "<key> <value> def" sets an attribute on the target map.
            try:
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            except PSSyntaxError:
                pass
            return

        if token is self.KEYWORD_USECMAP:
            # "<name> usecmap" imports another predefined CMap; names that
            # cannot be resolved are ignored.
            try:
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            except PSSyntaxError:
                pass
            except CMapDB.CMapNotFound:
                pass
            return

        if token is self.KEYWORD_BEGINCODESPACERANGE:
            self.popall()
            return
        if token is self.KEYWORD_ENDCODESPACERANGE:
            # Code space ranges are discarded.
            self.popall()
            return

        if token is self.KEYWORD_BEGINCIDRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDRANGE:
            objs = [obj for (__, obj) in self.popall()]
            for start_byte, end_byte, cid in choplist(3, objs):
                if not isinstance(start_byte, bytes):
                    self._warn_once("The start object of begincidrange is not a byte.")
                    continue
                if not isinstance(end_byte, bytes):
                    self._warn_once("The end object of begincidrange is not a byte.")
                    continue
                if not isinstance(cid, int):
                    self._warn_once(
                        "The cid object of begincidrange is not an integer.",
                    )
                    continue
                if len(start_byte) != len(end_byte):
                    self._warn_once(
                        "The start and end byte of begincidrange have "
                        "different lengths.",
                    )
                    continue
                # Only the last (up to) four bytes vary across the range; the
                # leading bytes must be identical at both ends.
                start_prefix = start_byte[:-4]
                end_prefix = end_byte[:-4]
                if start_prefix != end_prefix:
                    self._warn_once(
                        "The prefix of the start and end byte of "
                        "begincidrange are not the same.",
                    )
                    continue
                svar = start_byte[-4:]
                evar = end_byte[-4:]
                start = nunpack(svar)
                end = nunpack(evar)
                vlen = len(svar)
                for i in range(end - start + 1):
                    # Re-encode the running value with the width of the input.
                    x = start_prefix + struct.pack(">L", start + i)[-vlen:]
                    self.cmap.add_cid2unichr(cid + i, x)
            return

        if token is self.KEYWORD_BEGINCIDCHAR:
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDCHAR:
            objs = [obj for (__, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                # Pairs of the wrong types are skipped silently.
                if isinstance(code, bytes) and isinstance(cid, int):
                    self.cmap.add_cid2unichr(cid, code)
            return

        if token is self.KEYWORD_BEGINBFRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDBFRANGE:
            objs = [obj for (__, obj) in self.popall()]
            for start_byte, end_byte, code in choplist(3, objs):
                if not isinstance(start_byte, bytes):
                    self._warn_once("The start object is not a byte.")
                    continue
                if not isinstance(end_byte, bytes):
                    self._warn_once("The end object is not a byte.")
                    continue
                if len(start_byte) != len(end_byte):
                    self._warn_once("The start and end byte have different lengths.")
                    continue
                start = nunpack(start_byte)
                end = nunpack(end_byte)
                if isinstance(code, list):
                    # Explicit list: one destination per source code.  On a
                    # length mismatch, warn and map as many as zip pairs up.
                    if len(code) != end - start + 1:
                        self._warn_once(
                            "The difference between the start and end "
                            "offsets does not match the code length.",
                        )
                    for cid, unicode_value in zip(range(start, end + 1), code):
                        self.cmap.add_cid2unichr(cid, unicode_value)
                elif isinstance(code, bytes):
                    # Single destination: increment its last (up to) four
                    # bytes for each successive source code.
                    var = code[-4:]
                    base = nunpack(var)
                    prefix = code[:-4]
                    vlen = len(var)
                    for i in range(end - start + 1):
                        x = prefix + struct.pack(">L", base + i)[-vlen:]
                        self.cmap.add_cid2unichr(start + i, x)
                else:
                    # Malformed destination: skip the entry with a warning
                    # instead of aborting the whole parse on an assertion.
                    self._warn_once(
                        "The code object of beginbfrange is neither a list "
                        "nor a byte.",
                    )
            return

        if token is self.KEYWORD_BEGINBFCHAR:
            self.popall()
            return

        if token is self.KEYWORD_ENDBFCHAR:
            objs = [obj for (__, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                # Pairs of the wrong types are skipped silently.
                if isinstance(cid, bytes) and isinstance(code, bytes):
                    self.cmap.add_cid2unichr(nunpack(cid), code)
            return

        if token is self.KEYWORD_BEGINNOTDEFRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDNOTDEFRANGE:
            # notdef ranges are discarded.
            self.popall()
            return

        # Not a keyword handled above: keep it on the stack as an operand.
        self.push((pos, token))

    def _warn_once(self, msg: str) -> None:
        """Warn once for each unique message"""
        if msg not in self._warnings:
            self._warnings.add(msg)
            base_msg = (
                "Ignoring (part of) ToUnicode map because the PDF data "
                "does not conform to the format. This could result in "
                "(cid) values in the output. "
            )
            log.warning(base_msg + msg)