Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdffont.py: 59%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2import struct
3from io import BytesIO
4from typing import (
5 TYPE_CHECKING,
6 Any,
7 BinaryIO,
8 Dict,
9 Iterable,
10 Iterator,
11 List,
12 Mapping,
13 Optional,
14 Tuple,
15 Union,
16 cast,
17)
19from pdfminer import settings
20from pdfminer.cmapdb import (
21 CMap,
22 CMapBase,
23 CMapDB,
24 CMapParser,
25 FileUnicodeMap,
26 IdentityUnicodeMap,
27 UnicodeMap,
28)
29from pdfminer.encodingdb import EncodingDB, name2unicode
30from pdfminer.fontmetrics import FONT_METRICS
31from pdfminer.pdfexceptions import PDFException, PDFKeyError, PDFValueError
32from pdfminer.pdftypes import (
33 PDFStream,
34 dict_value,
35 int_value,
36 list_value,
37 num_value,
38 resolve1,
39 resolve_all,
40 stream_value,
41)
42from pdfminer.psexceptions import PSEOF
43from pdfminer.psparser import (
44 KWD,
45 LIT,
46 PSKeyword,
47 PSLiteral,
48 PSStackParser,
49 literal_name,
50)
51from pdfminer.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack
53if TYPE_CHECKING:
54 from pdfminer.pdfinterp import PDFResourceManager
56log = logging.getLogger(__name__)
def get_widths(seq: Iterable[object]) -> Dict[int, float]:
    """Build a mapping of character widths for horizontal writing.

    Two layouts are recognised inside *seq*:

    * ``first [w0 w1 ...]`` -- consecutive ids starting at ``first`` receive
      the listed widths;
    * ``first last w`` -- every id in ``first..last`` (inclusive) receives
      ``w``.

    Items that are neither lists nor numbers are ignored.

    :param seq: flat sequence of numbers and lists
    :returns: mapping of character id to width
    """
    result: Dict[int, float] = {}
    pending: List[float] = []
    for item in seq:
        if isinstance(item, list):
            if not pending:
                # A list with no preceding start id carries no information.
                continue
            start = cast(int, pending[-1])
            result.update(
                (start + offset, width) for offset, width in enumerate(item)
            )
            pending = []
            continue
        if not isinstance(item, (int, float)):  # == utils.isnumber(item)
            continue
        pending.append(item)
        if len(pending) < 3:
            continue
        first, last, width = pending
        for cid in range(cast(int, first), cast(int, last) + 1):
            result[cid] = width
        pending = []
    return result
def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    Two layouts are recognised inside *seq*:

    * ``first [w vx vy  w vx vy ...]`` -- the list is consumed three values
      at a time (via ``choplist``) and assigned to consecutive ids starting
      at ``first``;
    * ``first last w vx vy`` -- every id in ``first..last`` (inclusive)
      receives the same ``(w, (vx, vy))`` entry.

    Items that are neither lists nor numbers are ignored.

    :param seq: flat sequence of numbers and lists
    :returns: mapping of character id to ``(w, (vx, vy))``
    """
    result: Dict[int, Tuple[float, Point]] = {}
    pending: List[float] = []
    for item in seq:
        if isinstance(item, list):
            if pending:
                start = cast(int, pending[-1])
                for offset, (width, vx, vy) in enumerate(choplist(3, item)):
                    result[start + offset] = (width, (vx, vy))
                pending = []
            continue
        if not isinstance(item, (int, float)):  # == utils.isnumber(item)
            continue
        pending.append(item)
        if len(pending) != 5:
            continue
        first, last, width, vx, vy = pending
        for cid in range(cast(int, first), cast(int, last) + 1):
            result[cid] = (width, (vx, vy))
        pending = []
    return result
class FontMetricsDB:
    """Lookup helper over the imported ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname: str) -> Tuple[Dict[str, object], Dict[str, int]]:
        """Return the ``(descriptor, widths)`` entry stored for *fontname*.

        :raises KeyError: if ``FONT_METRICS`` has no entry for *fontname*
        """
        metrics = FONT_METRICS[fontname]
        return metrics
# int here means that we're not extending PSStackParser with additional types.
class Type1FontHeaderParser(PSStackParser[int]):
    """Extract the ``code -> character`` encoding from a Type1 font header.

    Only the ``put`` keyword is acted upon: each ``<int> <literal> put``
    found on the stack is emitted as a ``(code, glyph name)`` result.
    """

    KEYWORD_BEGIN = KWD(b"begin")
    KEYWORD_END = KWD(b"end")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_PUT = KWD(b"put")
    KEYWORD_DICT = KWD(b"dict")
    KEYWORD_ARRAY = KWD(b"array")
    KEYWORD_READONLY = KWD(b"readonly")
    KEYWORD_FOR = KWD(b"for")

    def __init__(self, data: BinaryIO) -> None:
        super().__init__(data)
        self._cid2unicode: Dict[int, str] = {}

    def get_encoding(self) -> Dict[int, str]:
        """Parse the font encoding.

        Every ``(code, glyph name)`` pair produced by the parser is passed
        through ``name2unicode``. Names that ``name2unicode`` rejects with a
        ``KeyError`` are logged at debug level and left out of the result.
        Parsing stops when the underlying parser signals ``PSEOF``.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode characters
        """
        mapping = self._cid2unicode
        while True:
            try:
                (cid, name) = self.nextobject()
            except PSEOF:
                return mapping
            try:
                mapping[cid] = name2unicode(cast(str, name))
            except KeyError as e:
                log.debug(str(e))

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle a keyword token; only ``put`` produces a result."""
        if token is not self.KEYWORD_PUT:
            return
        ((_, key), (_, value)) = self.pop(2)
        if not isinstance(key, int) or not isinstance(value, PSLiteral):
            return
        self.add_results((key, literal_name(value)))
# Text fragment for each 4-bit nibble of a packed real-number operand, indexed
# by nibble value. getdict() concatenates the fragments and passes the result
# to float(). Nibble 15 ends the number and is handled by getdict() itself, so
# it has no entry here; index 13 is None and getdict() asserts that it is
# never selected.
NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-")

# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
    "DLIdent-H": "Identity-H",
    "DLIdent-V": "Identity-V",
}
def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]:
    """Decode DICT data into ``{operator: [operands...]}``.

    The data is read byte by byte. A lead byte of 21 or less is an operator:
    it takes every operand collected since the previous operator. Any other
    lead byte starts an operand:

    * ``30``        -- packed real number, two nibbles per byte, looked up in
      ``NIBBLES`` and terminated by nibble 15;
    * ``32..246``   -- one-byte integer (``b0 - 139``);
    * ``247..250``  -- two-byte positive integer;
    * ``251..254``  -- two-byte negative integer;
    * ``28``        -- signed 16-bit integer in the following two bytes;
    * anything else -- signed 32-bit integer in the following four bytes.

    Operands that are not followed by an operator are discarded. If the same
    operator occurs twice the later operand list wins.

    :param data: raw DICT bytes
    :returns: mapping of operator byte to its operand list
    """
    result: Dict[int, List[Union[float, int]]] = {}
    stream = BytesIO(data)
    operands: List[Union[float, int]] = []
    while True:
        lead = stream.read(1)
        if not lead:
            return result
        b0 = ord(lead)
        if b0 <= 21:
            # Operator: hand over the operands gathered so far.
            result[b0] = operands
            operands = []
            continue
        value: Union[float, int]
        if b0 == 30:
            text = ""
            done = False
            while not done:
                packed = ord(stream.read(1))
                for nib in (packed >> 4, packed & 15):
                    if nib == 15:
                        done = True
                        continue
                    piece = NIBBLES[nib]
                    assert piece is not None
                    text += piece
            value = float(text)
        elif 32 <= b0 <= 246:
            value = b0 - 139
        else:
            b1 = ord(stream.read(1))
            if 247 <= b0 <= 250:
                value = ((b0 - 247) << 8) + b1 + 108
            elif 251 <= b0 <= 254:
                value = -((b0 - 251) << 8) - b1 - 108
            else:
                b2 = ord(stream.read(1))
                if b1 >= 128:
                    # The leading byte carries the sign.
                    b1 -= 256
                if b0 == 28:
                    value = (b1 << 8) | b2
                else:
                    (low,) = struct.unpack(">H", stream.read(2))
                    value = (b1 << 24) | (b2 << 16) | low
        operands.append(value)
class CFFFont:
    """Reader for the lookup tables of a CFF (Compact Font Format) program.

    Construction parses the header, the Name / Top DICT / String / Global
    Subr INDEXes, the CharStrings INDEX, the encoding and the charset, and
    exposes them as attributes:

    * ``name_index``, ``dict_index``, ``string_index``, ``subr_index``,
      ``charstring`` -- ``INDEX`` objects;
    * ``top_dict`` -- decoded first Top DICT (see ``getdict``);
    * ``nglyphs`` -- number of entries in the CharStrings INDEX;
    * ``code2gid`` / ``gid2code`` -- tables built from the encoding data;
    * ``name2gid`` / ``gid2name`` -- glyph id <-> glyph name, from the charset.

    Charset formats 1 and 2 are read as ranges of ``(SID first, nLeft)`` that
    name consecutive glyph ids starting at 1, which yields the same kind of
    mapping as the format 0 branch (a plain list of 16-bit SIDs).

    NOTE(review): charset/encoding offsets 0, 1 and 2 in the Top DICT may
    denote predefined tables rather than file offsets; they are still treated
    as offsets here -- confirm against the CFF specification before relying
    on the result for such fonts.
    """

    # Predefined strings; string ids below len(STANDARD_STRINGS) index this
    # tuple, larger ids index the font's own String INDEX (see getstr()).
    STANDARD_STRINGS = (
        ".notdef",
        "space", "exclam", "quotedbl", "numbersign",
        "dollar", "percent", "ampersand", "quoteright",
        "parenleft", "parenright", "asterisk", "plus",
        "comma", "hyphen", "period", "slash",
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
        "colon", "semicolon", "less", "equal", "greater", "question", "at",
        "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
        "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
        "bracketleft", "backslash", "bracketright",
        "asciicircum", "underscore", "quoteleft",
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
        "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
        "braceleft", "bar", "braceright", "asciitilde",
        "exclamdown", "cent", "sterling", "fraction",
        "yen", "florin", "section", "currency",
        "quotesingle", "quotedblleft", "guillemotleft",
        "guilsinglleft", "guilsinglright",
        "fi", "fl", "endash", "dagger",
        "daggerdbl", "periodcentered", "paragraph", "bullet",
        "quotesinglbase", "quotedblbase", "quotedblright", "guillemotright",
        "ellipsis", "perthousand", "questiondown",
        "grave", "acute", "circumflex", "tilde", "macron", "breve",
        "dotaccent", "dieresis", "ring", "cedilla", "hungarumlaut",
        "ogonek", "caron",
        "emdash",
        "AE", "ordfeminine", "Lslash", "Oslash", "OE", "ordmasculine",
        "ae", "dotlessi", "lslash", "oslash", "oe",
        "germandbls",
        "onesuperior", "logicalnot", "mu", "trademark", "Eth", "onehalf",
        "plusminus", "Thorn", "onequarter", "divide", "brokenbar", "degree",
        "thorn", "threequarters", "twosuperior", "registered", "minus", "eth",
        "multiply", "threesuperior", "copyright",
        "Aacute", "Acircumflex", "Adieresis", "Agrave", "Aring", "Atilde",
        "Ccedilla",
        "Eacute", "Ecircumflex", "Edieresis", "Egrave",
        "Iacute", "Icircumflex", "Idieresis", "Igrave",
        "Ntilde",
        "Oacute", "Ocircumflex", "Odieresis", "Ograve", "Otilde",
        "Scaron",
        "Uacute", "Ucircumflex", "Udieresis", "Ugrave",
        "Yacute", "Ydieresis", "Zcaron",
        "aacute", "acircumflex", "adieresis", "agrave", "aring", "atilde",
        "ccedilla",
        "eacute", "ecircumflex", "edieresis", "egrave",
        "iacute", "icircumflex", "idieresis", "igrave",
        "ntilde",
        "oacute", "ocircumflex", "odieresis", "ograve", "otilde",
        "scaron",
        "uacute", "ucircumflex", "udieresis", "ugrave",
        "yacute", "ydieresis", "zcaron",
        "exclamsmall", "Hungarumlautsmall", "dollaroldstyle", "dollarsuperior",
        "ampersandsmall", "Acutesmall", "parenleftsuperior",
        "parenrightsuperior",
        "twodotenleader", "onedotenleader",
        "zerooldstyle", "oneoldstyle", "twooldstyle", "threeoldstyle",
        "fouroldstyle", "fiveoldstyle", "sixoldstyle", "sevenoldstyle",
        "eightoldstyle", "nineoldstyle",
        "commasuperior", "threequartersemdash", "periodsuperior",
        "questionsmall",
        "asuperior", "bsuperior", "centsuperior", "dsuperior", "esuperior",
        "isuperior", "lsuperior", "msuperior", "nsuperior", "osuperior",
        "rsuperior", "ssuperior", "tsuperior",
        "ff", "ffi", "ffl",
        "parenleftinferior", "parenrightinferior", "Circumflexsmall",
        "hyphensuperior", "Gravesmall",
        "Asmall", "Bsmall", "Csmall", "Dsmall", "Esmall", "Fsmall", "Gsmall",
        "Hsmall", "Ismall", "Jsmall", "Ksmall", "Lsmall", "Msmall", "Nsmall",
        "Osmall", "Psmall", "Qsmall", "Rsmall", "Ssmall", "Tsmall", "Usmall",
        "Vsmall", "Wsmall", "Xsmall", "Ysmall", "Zsmall",
        "colonmonetary", "onefitted", "rupiah",
        "Tildesmall", "exclamdownsmall", "centoldstyle", "Lslashsmall",
        "Scaronsmall", "Zcaronsmall", "Dieresissmall", "Brevesmall",
        "Caronsmall", "Dotaccentsmall", "Macronsmall", "figuredash",
        "hypheninferior", "Ogoneksmall", "Ringsmall", "Cedillasmall",
        "questiondownsmall",
        "oneeighth", "threeeighths", "fiveeighths", "seveneighths",
        "onethird", "twothirds",
        "zerosuperior", "foursuperior", "fivesuperior", "sixsuperior",
        "sevensuperior", "eightsuperior", "ninesuperior",
        "zeroinferior", "oneinferior", "twoinferior", "threeinferior",
        "fourinferior", "fiveinferior", "sixinferior", "seveninferior",
        "eightinferior", "nineinferior",
        "centinferior", "dollarinferior", "periodinferior", "commainferior",
        "Agravesmall", "Aacutesmall", "Acircumflexsmall", "Atildesmall",
        "Adieresissmall", "Aringsmall", "AEsmall", "Ccedillasmall",
        "Egravesmall", "Eacutesmall", "Ecircumflexsmall", "Edieresissmall",
        "Igravesmall", "Iacutesmall", "Icircumflexsmall", "Idieresissmall",
        "Ethsmall", "Ntildesmall",
        "Ogravesmall", "Oacutesmall", "Ocircumflexsmall", "Otildesmall",
        "Odieresissmall", "OEsmall", "Oslashsmall",
        "Ugravesmall", "Uacutesmall", "Ucircumflexsmall", "Udieresissmall",
        "Yacutesmall", "Thornsmall", "Ydieresissmall",
        "001.000", "001.001", "001.002", "001.003",
        "Black", "Bold", "Book", "Light",
        "Medium", "Regular", "Roman", "Semibold",
    )

    class INDEX:
        """Random-access view over one INDEX structure of the font stream.

        The constructor reads the INDEX header at the current position of
        *fp* and leaves *fp* positioned just past the INDEX data.
        """

        def __init__(self, fp: BinaryIO) -> None:
            self.fp = fp
            self.offsets: List[int] = []
            (count,) = struct.unpack(">H", self.fp.read(2))
            if count == 0:
                # An empty INDEX consists of the count field only: there is
                # no offSize byte and no offset array to consume.
                self.base = self.fp.tell() - 1
                self.offsets.append(1)
                return
            (offsize,) = struct.unpack("B", self.fp.read(1))
            for _ in range(count + 1):
                self.offsets.append(nunpack(self.fp.read(offsize)))
            # Stored offsets are relative to the byte before the object data.
            self.base = self.fp.tell() - 1
            self.fp.seek(self.base + self.offsets[-1])

        def __repr__(self) -> str:
            return "<INDEX: size=%d>" % len(self)

        def __len__(self) -> int:
            return len(self.offsets) - 1

        def __getitem__(self, i: int) -> bytes:
            """Return the raw bytes of object *i* (moves the file position)."""
            self.fp.seek(self.base + self.offsets[i])
            return self.fp.read(self.offsets[i + 1] - self.offsets[i])

        def __iter__(self) -> Iterator[bytes]:
            return iter(self[i] for i in range(len(self)))

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Parse the font program readable from *fp*.

        :param name: font name, stored as ``self.name``
        :param fp: seekable binary stream positioned at the CFF header
        :raises PDFValueError: on an unsupported encoding or charset format
        :raises struct.error: if the data ends before a fixed-size field
        """
        self.name = name
        self.fp = fp
        # Header: major, minor, header size, absolute-offset size.
        (_major, _minor, hdrsize, _offsize) = struct.unpack("BBBB", self.fp.read(4))
        if hdrsize > 4:
            # Skip header bytes beyond the four that were just read.
            self.fp.read(hdrsize - 4)
        # Name INDEX
        self.name_index = self.INDEX(self.fp)
        # Top DICT INDEX
        self.dict_index = self.INDEX(self.fp)
        # String INDEX
        self.string_index = self.INDEX(self.fp)
        # Global Subr INDEX
        self.subr_index = self.INDEX(self.fp)
        # Top DICT DATA
        self.top_dict = getdict(self.dict_index[0])
        (charset_pos,) = self.top_dict.get(15, [0])
        (encoding_pos,) = self.top_dict.get(16, [0])
        (charstring_pos,) = self.top_dict.get(17, [0])
        # CharStrings
        self.fp.seek(cast(int, charstring_pos))
        self.charstring = self.INDEX(self.fp)
        self.nglyphs = len(self.charstring)
        self._read_encoding(cast(int, encoding_pos))
        self._read_charset(cast(int, charset_pos))

    def _read_encoding(self, pos: int) -> None:
        """Fill ``code2gid`` / ``gid2code`` from the encoding table at *pos*."""
        # NOTE(review): in both formats the stored byte values are treated as
        # glyph ids and their running position as the character code. This
        # looks inverted relative to the CFF specification; it is kept
        # unchanged here -- confirm before relying on these two tables.
        self.code2gid: Dict[int, int] = {}
        self.gid2code: Dict[int, int] = {}
        self.fp.seek(pos)
        fmt = self.fp.read(1)
        if fmt == b"\x00":
            # Format 0: a count followed by that many single bytes.
            (n,) = struct.unpack("B", self.fp.read(1))
            for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))):
                self.code2gid[code] = gid
                self.gid2code[gid] = code
        elif fmt == b"\x01":
            # Format 1: a count followed by (first, nleft) byte pairs.
            (n,) = struct.unpack("B", self.fp.read(1))
            code = 0
            for _ in range(n):
                (first, nleft) = struct.unpack("BB", self.fp.read(2))
                for gid in range(first, first + nleft + 1):
                    self.code2gid[code] = gid
                    self.gid2code[gid] = code
                    code += 1
        else:
            raise PDFValueError("unsupported encoding format: %r" % fmt)

    def _read_charset(self, pos: int) -> None:
        """Fill ``name2gid`` / ``gid2name`` from the charset table at *pos*.

        Glyph 0 is never listed in a charset, so naming starts at glyph 1 and
        stops once ``nglyphs - 1`` glyphs have been named.
        """
        self.name2gid: Dict[Union[str, bytes], int] = {}
        self.gid2name: Dict[int, Union[str, bytes]] = {}
        self.fp.seek(pos)
        fmt = self.fp.read(1)
        if fmt == b"\x00":
            # Format 0: one 16-bit SID per glyph.
            n = max(self.nglyphs - 1, 0)
            sids = cast(
                Tuple[int, ...],
                struct.unpack(">" + "H" * n, self.fp.read(2 * n)),
            )
            for gid, sid in enumerate(sids, start=1):
                self._name_glyph(gid, sid)
        elif fmt in (b"\x01", b"\x02"):
            # Formats 1 and 2: ranges of (SID first, nLeft) naming consecutive
            # glyphs. nLeft is one byte in format 1 and two bytes in format 2.
            # There is no range count; ranges continue until every glyph has
            # a name.
            range_fmt = ">HB" if fmt == b"\x01" else ">HH"
            range_size = struct.calcsize(range_fmt)
            gid = 1
            while gid < self.nglyphs:
                (first, nleft) = struct.unpack(range_fmt, self.fp.read(range_size))
                for sid in range(first, first + nleft + 1):
                    if gid >= self.nglyphs:
                        break
                    self._name_glyph(gid, sid)
                    gid += 1
        else:
            raise PDFValueError("unsupported charset format: %r" % fmt)

    def _name_glyph(self, gid: int, sid: int) -> None:
        """Record that glyph *gid* is named by string id *sid*."""
        sidname = self.getstr(sid)
        self.name2gid[sidname] = gid
        self.gid2name[gid] = sidname

    def getstr(self, sid: int) -> Union[str, bytes]:
        """Resolve string id *sid* to its string.

        Ids below ``len(STANDARD_STRINGS)`` resolve to that tuple's ``str``
        entries; larger ids are looked up in the font's String INDEX and come
        back as raw ``bytes``.
        """
        # This returns str for one of the STANDARD_STRINGS but bytes otherwise,
        # and appears to be a needless source of type complexity.
        if sid < len(self.STANDARD_STRINGS):
            return self.STANDARD_STRINGS[sid]
        return self.string_index[sid - len(self.STANDARD_STRINGS)]
class TrueTypeFont:
    """Minimal TrueType reader used to derive a glyph-id -> Unicode map.

    The constructor reads the table directory into ``self.tables``;
    ``create_unicode_map`` then decodes the Unicode ``cmap`` subtables.
    """

    class CMapNotFound(PDFException):
        """Raised when the font offers no usable Unicode ``cmap`` data."""

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Read the table directory.

        :param name: font name, stored as ``self.name``
        :param fp: seekable binary stream positioned at the start of the font
        """
        self.name = name
        self.fp = fp
        # table tag -> (offset, length)
        self.tables: Dict[bytes, Tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = cast(
                Tuple[int, int, int, int],
                struct.unpack(">HHHH", fp.read(8)),
            )
            for _ in range(ntables):
                (name_bytes, _tsum, offset, length) = cast(
                    Tuple[bytes, int, int, int],
                    struct.unpack(">4sLLL", fp.read(16)),
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a ``FileUnicodeMap`` from the font's Unicode cmap subtables.

        Only subtables for platform 0, or platform 3 with encoding 1 or 10,
        are considered, and of those only formats 0, 2 and 4 are decoded.
        Subtables in any other format are skipped, so one unsupported
        subtable does not prevent the remaining ones from being used.

        :raises TrueTypeFont.CMapNotFound: if there is no ``cmap`` table or
            no subtable produced any mapping
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, _length) = self.tables[b"cmap"]
        fp = self.fp
        fp.seek(base_offset)
        (_version, nsubtables) = cast(
            Tuple[int, int], struct.unpack(">HH", fp.read(4))
        )
        subtables: List[Tuple[int, int, int]] = [
            cast(Tuple[int, int, int], struct.unpack(">HHL", fp.read(8)))
            for _ in range(nsubtables)
        ]
        char2gid: Dict[int, int] = {}
        # Only supports subtable type 0, 2 and 4.
        for platform_id, encoding_id, st_offset in subtables:
            # Skip non-Unicode cmaps.
            # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
            if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
                continue
            fp.seek(base_offset + st_offset)
            (fmttype, _fmtlen, _fmtlang) = cast(
                Tuple[int, int, int],
                struct.unpack(">HHH", fp.read(6)),
            )
            if fmttype == 0:
                char2gid.update(
                    enumerate(
                        cast(Tuple[int, ...], struct.unpack(">256B", fp.read(256))),
                    ),
                )
            elif fmttype == 2:
                self._read_cmap_format2(fp, char2gid)
            elif fmttype == 4:
                self._read_cmap_format4(fp, char2gid)
            else:
                # Other formats are not decoded here; fall through to the
                # next subtable instead of aborting the whole font.
                log.debug("Skipping unsupported cmap subtable format %r", fmttype)
        if not char2gid:
            raise TrueTypeFont.CMapNotFound
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map

    @staticmethod
    def _read_cmap_format2(fp: BinaryIO, char2gid: Dict[int, int]) -> None:
        """Decode a format 2 subtable body at *fp* into *char2gid*."""
        subheaderkeys = cast(
            Tuple[int, ...],
            struct.unpack(">256H", fp.read(512)),
        )
        # Sub-header index -> the (last) first byte that selects it.
        firstbytes = [0] * 8192
        for i, k in enumerate(subheaderkeys):
            firstbytes[k // 8] = i
        nhdrs = max(subheaderkeys) // 8 + 1
        hdrs: List[Tuple[int, int, int, int, int]] = []
        for i in range(nhdrs):
            (firstcode, entcount, delta, offset) = cast(
                Tuple[int, int, int, int],
                struct.unpack(">HHhH", fp.read(8)),
            )
            # The range offset is counted from its own field, which is the
            # last two bytes just read.
            hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
        for i, firstcode, entcount, delta, pos in hdrs:
            if not entcount:
                continue
            first = firstcode + (firstbytes[i] << 8)
            fp.seek(pos)
            for c in range(entcount):
                gid = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                if gid:
                    gid += delta
                char2gid[first + c] = gid

    @staticmethod
    def _read_cmap_format4(fp: BinaryIO, char2gid: Dict[int, int]) -> None:
        """Decode a format 4 subtable body at *fp* into *char2gid*."""
        (segcount, _1, _2, _3) = cast(
            Tuple[int, int, int, int],
            struct.unpack(">HHHH", fp.read(8)),
        )
        segcount //= 2  # the field stores segCount * 2
        ecs = cast(
            Tuple[int, ...],
            struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
        )
        fp.read(2)  # skip the two bytes between the end and start code arrays
        scs = cast(
            Tuple[int, ...],
            struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
        )
        idds = cast(
            Tuple[int, ...],
            struct.unpack(">%dh" % segcount, fp.read(2 * segcount)),
        )
        pos = fp.tell()  # start of the idRangeOffset array
        idrs = cast(
            Tuple[int, ...],
            struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
        )
        for seg, (ec, sc, idd, idr) in enumerate(zip(ecs, scs, idds, idrs)):
            if idr:
                # idRangeOffset is relative to the location of the segment's
                # own entry in the idRangeOffset array, not to the array start.
                fp.seek(pos + 2 * seg + idr)
                for c in range(sc, ec + 1):
                    b = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                    # A stored 0 means "missing glyph": idDelta is only
                    # applied to non-zero entries.
                    char2gid[c] = (b + idd) & 0xFFFF if b else 0
            else:
                for c in range(sc, ec + 1):
                    char2gid[c] = (c + idd) & 0xFFFF
class PDFFontError(PDFException):
    """Base error for font handling problems raised by this module."""
class PDFUnicodeNotDefined(PDFFontError):
    """Raised by ``to_unichr`` when a character id has no Unicode mapping."""
# Encoding name that PDFSimpleFont falls back to when a font spec has no
# "Encoding" entry, or when an encoding dict has no "BaseEncoding".
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
# Literal for the "Type1C" name.
LITERAL_TYPE1C = LIT("Type1C")

# Font widths are maintained in a dict type that maps from *either* unicode
# chars or integer character IDs.
FontWidthDict = Union[Dict[int, float], Dict[str, float]]
class PDFFont:
    """Common state and metric helpers shared by all font classes.

    Subclasses must provide ``to_unichr``; the base implementation raises
    ``NotImplementedError``.
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: Optional[float] = None,
    ) -> None:
        """Initialise the font from its descriptor and width table.

        :param descriptor: font descriptor; missing entries default to 0
            (``FontName`` defaults to ``"unknown"``, ``FontBBox`` to zeros)
        :param widths: width table keyed by character id or by character
        :param default_width: width for characters missing from *widths*;
            when ``None`` the descriptor's ``MissingWidth`` is used
        """
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)

        fontname = resolve1(descriptor.get("FontName", "unknown"))
        self.fontname = fontname
        if isinstance(fontname, PSLiteral):
            self.fontname = literal_name(fontname)

        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))

        fallback = default_width
        if fallback is None:
            fallback = num_value(descriptor.get("MissingWidth", 0))
        self.default_width = resolve1(fallback)

        self.leading = num_value(descriptor.get("Leading", 0))
        raw_bbox = descriptor.get("FontBBox", (0, 0, 0, 0))
        self.bbox = cast(Rect, list_value(resolve_all(raw_bbox)))
        self.hscale = self.vscale = 0.001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        """Return False: the base font is laid out horizontally."""
        return False

    def is_multibyte(self) -> bool:
        """Return False: the base font uses one byte per character code."""
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split *bytes* into character ids, one per byte."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self) -> float:
        """Scaled bounding-box width.

        When the bounding box has zero width the negated default width is
        used instead.
        """
        span = self.bbox[2] - self.bbox[0]
        if span == 0:
            span = -self.default_width
        return span * self.hscale

    def get_height(self) -> float:
        """Scaled bounding-box height, or ascent minus descent if it is zero."""
        span = self.bbox[3] - self.bbox[1]
        if span == 0:
            span = self.ascent - self.descent
        return span * self.vscale

    def char_width(self, cid: int) -> float:
        """Return the scaled width of character *cid*."""
        # Because character widths may be mapping either IDs or strings,
        # we try to lookup the character ID first, then its str equivalent.
        try:
            width = cast(Dict[int, float], self.widths)[cid]
        except KeyError:
            by_char = cast(Dict[str, float], self.widths)
            try:
                width = by_char[self.to_unichr(cid)]
            except (KeyError, PDFUnicodeNotDefined):
                width = self.default_width
        return width * self.hscale

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Return the summed ``char_width`` of every character id in *s*."""
        total = 0
        for cid in self.decode(s):
            total = total + self.char_width(cid)
        return total

    def to_unichr(self, cid: int) -> str:
        """Map *cid* to its Unicode string; implemented by subclasses."""
        raise NotImplementedError
class PDFSimpleFont(PDFFont):
    """Single-byte font whose characters are resolved through an encoding.

    ``cid2unicode`` holds the table obtained from ``EncodingDB``;
    ``unicode_map`` holds the parsed ``ToUnicode`` CMap when the font spec
    supplies one, otherwise ``None``.
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        spec: Mapping[str, Any],
    ) -> None:
        # Font encoding is specified either by a name of
        # built-in encoding or a dictionary that describes
        # the differences.
        encoding = (
            resolve1(spec["Encoding"])
            if "Encoding" in spec
            else LITERAL_STANDARD_ENCODING
        )
        if not isinstance(encoding, dict):
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
        else:
            base_name = literal_name(
                encoding.get("BaseEncoding", LITERAL_STANDARD_ENCODING)
            )
            differences = list_value(encoding.get("Differences", []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)

        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            strm = stream_value(spec["ToUnicode"])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
        super().__init__(descriptor, widths)

    def to_unichr(self, cid: int) -> str:
        """Map *cid* to Unicode, preferring ``ToUnicode`` over the encoding.

        :raises PDFUnicodeNotDefined: if neither table knows *cid*
        """
        umap = self.unicode_map
        if umap:
            try:
                return umap.get_unichr(cid)
            except KeyError:
                # Not in the ToUnicode map: fall back to the encoding table.
                pass
        try:
            return self.cid2unicode[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)
class PDFType1Font(PDFSimpleFont):
    """Type 1 font.

    Metrics come from ``FontMetricsDB`` when the base font name is known
    there, and otherwise from the font spec itself.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from *spec* (*rsrcmgr* is not used here).

        :raises PDFFontError: in strict mode, if ``BaseFont`` is missing
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"

        widths: FontWidthDict
        try:
            (descriptor, int_widths) = FontMetricsDB.get_metrics(self.basefont)
        except KeyError:
            # No stored metrics for this name: read them from the spec.
            # The spec's LastChar entry is not consulted.
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            firstchar = int_value(spec.get("FirstChar", 0))
            width_list = list_value(spec.get("Widths", [0] * 256))
            widths = {}
            for index, raw_width in enumerate(width_list):
                widths[index + firstchar] = resolve1(raw_width)
        else:
            widths = cast(Dict[str, float], int_widths)  # implicit int->float
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        if "Encoding" in spec or "FontFile" not in descriptor:
            return
        # try to recover the missing encoding info from the font file.
        self.fontfile = stream_value(descriptor.get("FontFile"))
        length1 = int_value(self.fontfile["Length1"])
        header = self.fontfile.get_data()[:length1]
        self.cid2unicode = Type1FontHeaderParser(BytesIO(header)).get_encoding()

    def __repr__(self) -> str:
        template = "<PDFType1Font: basefont=%r>"
        return template % self.basefont
class PDFTrueTypeFont(PDFType1Font):
    """TrueType font; construction is inherited from ``PDFType1Font``."""

    def __repr__(self) -> str:
        template = "<PDFTrueTypeFont: basefont=%r>"
        return template % self.basefont
class PDFType3Font(PDFSimpleFont):
    """Type 3 font: metrics are taken from the spec and scaled by FontMatrix."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from *spec* (*rsrcmgr* is not used here).

        Without a ``FontDescriptor`` a minimal descriptor is synthesised from
        the spec's ``FontBBox``. Ascent and descent are then taken from the
        bounding box, and the scale factors from ``FontMatrix``.
        """
        firstchar = int_value(spec.get("FirstChar", 0))
        # The spec's LastChar entry is not consulted.
        width_list = list_value(spec.get("Widths", [0] * 256))
        widths = {}
        for index, width in enumerate(width_list):
            widths[index + firstchar] = width

        descriptor: Mapping[str, Any]
        if "FontDescriptor" not in spec:
            descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
        else:
            descriptor = dict_value(spec["FontDescriptor"])
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix"))))
        (_, bottom, _, top) = self.bbox
        self.descent = bottom
        self.ascent = top
        (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))

    def __repr__(self) -> str:
        return "<PDFType3Font>"
class PDFCIDFont(PDFFont):
    """CID-keyed (multi-byte) font.

    Character codes are decoded through ``self.cmap``; character ids are
    mapped to Unicode through ``self.unicode_map`` when one could be found.
    """

    default_disp: Union[float, Tuple[Optional[float], float]]

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        spec: Mapping[str, Any],
        strict: bool = settings.STRICT,
    ) -> None:
        """Build the font from *spec* (*rsrcmgr* is not used here).

        :param strict: raise ``PDFFontError`` for a missing ``BaseFont`` or
            ``FontDescriptor`` instead of substituting a default
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if strict:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"

        info = dict_value(spec.get("CIDSystemInfo", {}))
        self.cidsysteminfo = info
        cid_registry = resolve1(info.get("Registry", b"unknown")).decode("latin1")
        cid_ordering = resolve1(info.get("Ordering", b"unknown")).decode("latin1")
        self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
        self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec["FontDescriptor"])
        except KeyError:
            if strict:
                raise PDFFontError("FontDescriptor is missing")
            descriptor = {}

        ttf = None
        if "FontFile2" in descriptor:
            self.fontfile = stream_value(descriptor.get("FontFile2"))
            ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))

        # Unicode map selection: an explicit ToUnicode entry wins, then the
        # embedded TrueType cmap for Adobe-Identity / Adobe-UCS, and finally
        # the CMapDB entry for this CID coding.
        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            if isinstance(spec["ToUnicode"], PDFStream):
                strm = stream_value(spec["ToUnicode"])
                self.unicode_map = FileUnicodeMap()
                CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
            else:
                cmap_name = literal_name(spec["ToUnicode"])
                encoding = literal_name(spec["Encoding"])
                if any(
                    "Identity" in text
                    for text in (cid_ordering, cmap_name, encoding)
                ):
                    self.unicode_map = IdentityUnicodeMap()
        elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
            if ttf:
                try:
                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding,
                    self.cmap.is_vertical(),
                )
            except CMapDB.CMapNotFound:
                pass

        self.vertical = self.cmap.is_vertical()
        if not self.vertical:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get("W", [])))
            default_width = spec.get("DW", 1000)
        else:
            # writing mode: vertical
            widths2 = get_widths2(list_value(spec.get("W2", [])))
            self.disps = {
                cid: (dx, dy) for (cid, (_, (dx, dy))) in widths2.items()
            }
            (default_vy, default_w) = resolve1(spec.get("DW2", [880, -1000]))
            self.default_disp = (None, default_vy)
            widths = {cid: width for (cid, (width, _)) in widths2.items()}
            default_width = default_w
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.

        When the cmap cannot be found, strict mode raises ``PDFFontError``
        and non-strict mode returns an empty ``CMap``.
        """
        cmap_name = self._get_cmap_name(spec, strict)
        try:
            found = CMapDB.get_cmap(cmap_name)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e)
            return CMap()
        return found

    @staticmethod
    def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
        """Get cmap name from font specification"""
        cmap_name = "unknown"  # default value

        try:
            spec_encoding = spec["Encoding"]
            if hasattr(spec_encoding, "name"):
                cmap_name = literal_name(spec_encoding)
            else:
                cmap_name = literal_name(spec_encoding["CMapName"])
        except KeyError:
            if strict:
                raise PDFFontError("Encoding is unspecified")

        if type(cmap_name) is PDFStream:  # type: ignore[comparison-overlap]
            # literal_name() handed back a stream: take the name from it.
            cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
            if "CMapName" in cmap_name_stream:
                cmap_name = cmap_name_stream.get("CMapName").name
            elif strict:
                raise PDFFontError("CMapName unspecified for encoding")

        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

    def __repr__(self) -> str:
        return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"

    def is_vertical(self) -> bool:
        """Return the writing mode reported by the cmap at construction."""
        return self.vertical

    def is_multibyte(self) -> bool:
        """Return True: CID fonts may use several bytes per character."""
        return True

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split *bytes* into character ids using the font's cmap."""
        return self.cmap.decode(bytes)

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid: int) -> str:
        """Map *cid* to Unicode through ``unicode_map``.

        :raises PDFUnicodeNotDefined: if there is no map or it lacks *cid*
        """
        umap = self.unicode_map
        try:
            if umap:
                return umap.get_unichr(cid)
            raise PDFKeyError(cid)
        except KeyError:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)