Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdffont.py: 67%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2import struct
3from io import BytesIO
4from typing import (
5 TYPE_CHECKING,
6 Any,
7 BinaryIO,
8 Dict,
9 Iterable,
10 Iterator,
11 List,
12 Mapping,
13 Optional,
14 Tuple,
15 Union,
16 cast,
17)
19from pdfminer import settings
20from pdfminer.casting import safe_float, safe_rect_list
21from pdfminer.cmapdb import (
22 CMap,
23 CMapBase,
24 CMapDB,
25 CMapParser,
26 FileUnicodeMap,
27 IdentityUnicodeMap,
28 UnicodeMap,
29)
30from pdfminer.encodingdb import EncodingDB, name2unicode
31from pdfminer.fontmetrics import FONT_METRICS
32from pdfminer.pdfexceptions import PDFException, PDFKeyError, PDFValueError
33from pdfminer.pdftypes import (
34 PDFStream,
35 dict_value,
36 int_value,
37 list_value,
38 num_value,
39 resolve1,
40 resolve_all,
41 stream_value,
42)
43from pdfminer.psexceptions import PSEOF
44from pdfminer.psparser import (
45 KWD,
46 LIT,
47 PSKeyword,
48 PSLiteral,
49 PSStackParser,
50 literal_name,
51)
52from pdfminer.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack
54if TYPE_CHECKING:
55 from pdfminer.pdfinterp import PDFResourceManager
57log = logging.getLogger(__name__)
def get_widths(seq: Iterable[object]) -> Dict[Union[str, int], float]:
    """Build a mapping of character widths for horizontal writing.

    Every item of ``seq`` is passed through ``resolve1`` first. Two layouts
    are recognised:

    * ``c [w1 w2 ...]`` -- a number followed by a list: the list entries are
      assigned to consecutive codes starting at ``c``.
    * ``c_first c_last w`` -- three numbers: ``w`` is assigned to every code in
      the inclusive range, provided both bounds are ints.

    Anything that is neither a number nor a list is skipped with a warning.

    :param seq: the flat sequence of numbers and lists to interpret
    :returns: mapping of character code to width
    """
    table: Dict[int, float] = {}
    pending: List[float] = []
    for item in seq:
        item = resolve1(item)

        if isinstance(item, list):
            # A list only has meaning when a start code precedes it.
            if pending:
                start = cast(int, pending[-1])
                for offset, width in enumerate(item):
                    table[start + offset] = width
                pending = []
            continue

        if not isinstance(item, (int, float)):  # == not utils.isnumber(item)
            log.warning(
                f"Skipping invalid font width specification for {item} because it is not a number or a list"
            )
            continue

        pending.append(item)
        if len(pending) != 3:
            continue

        # Three plain numbers in a row form a "first last width" triple.
        first, last, width = pending
        pending = []
        if isinstance(first, int) and isinstance(last, int):
            for code in range(first, last + 1):
                table[code] = width
        else:
            log.warning(
                f"Skipping invalid font width specification for {first} to {last} because either of them is not an int"
            )
    return cast(Dict[Union[str, int], float], table)
def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    Mirrors :func:`get_widths`, but every entry carries three numbers
    ``(w, vx, vy)``. Two layouts are recognised:

    * ``c [w1 vx1 vy1 w2 vx2 vy2 ...]`` -- the list is cut into groups of
      three which are assigned to consecutive codes starting at ``c``.
    * ``c_first c_last w vx vy`` -- five numbers: the triple is assigned to
      every code in the inclusive range.

    Each item is passed through ``resolve1`` before inspection, the same way
    :func:`get_widths` does. Ranges whose bounds are not ints, and items that
    are neither numbers nor lists, are skipped with a warning instead of
    raising.

    :param seq: the flat sequence of numbers and lists to interpret
    :returns: mapping of character code to ``(w, (vx, vy))``
    """
    widths: Dict[int, Tuple[float, Point]] = {}
    r: List[float] = []
    for v in seq:
        v = resolve1(v)
        if isinstance(v, list):
            if r:
                char1 = r[-1]
                for i, (w, vx, vy) in enumerate(choplist(3, v)):
                    widths[cast(int, char1) + i] = (w, (vx, vy))
                r = []
        elif isinstance(v, (int, float)):  # == utils.isnumber(v)
            r.append(v)
            if len(r) == 5:
                (char1, char2, w, vx, vy) = r
                # range() below needs real ints; a float bound would raise
                # TypeError, so reject the entry the way get_widths does.
                if isinstance(char1, int) and isinstance(char2, int):
                    for i in range(char1, char2 + 1):
                        widths[i] = (w, (vx, vy))
                else:
                    log.warning(
                        f"Skipping invalid font width specification for {char1} to {char2} because either of them is not an int"
                    )
                r = []
        else:
            log.warning(
                f"Skipping invalid font width specification for {v} because it is not a number or a list"
            )
    return widths
class FontMetricsDB:
    """Thin lookup facade over the imported ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname: str) -> Tuple[Dict[str, object], Dict[str, int]]:
        """Return the ``FONT_METRICS`` entry stored under ``fontname``.

        The lookup is a plain subscript, so whatever the table raises for an
        unknown name propagates to the caller.
        """
        metrics = FONT_METRICS[fontname]
        return metrics
118# int here means that we're not extending PSStackParser with additional types.
class Type1FontHeaderParser(PSStackParser[int]):
    """Stack parser that collects ``code -> glyph name`` pairs from ``put`` keywords."""

    KEYWORD_BEGIN = KWD(b"begin")
    KEYWORD_END = KWD(b"end")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_PUT = KWD(b"put")
    KEYWORD_DICT = KWD(b"dict")
    KEYWORD_ARRAY = KWD(b"array")
    KEYWORD_READONLY = KWD(b"readonly")
    KEYWORD_FOR = KWD(b"for")

    def __init__(self, data: BinaryIO) -> None:
        super().__init__(data)
        # Filled in by get_encoding(): character id -> unicode text.
        self._cid2unicode: Dict[int, str] = {}

    def get_encoding(self) -> Dict[int, str]:
        """Parse the font encoding.

        The Type1 font encoding maps character codes to character names. These
        character names could either be standard Adobe glyph names, or
        character names associated with custom CharStrings for this font. A
        CharString is a sequence of operations that describe how the character
        should be drawn. Names that ``name2unicode`` rejects with ``KeyError``
        are logged at debug level and left out of the result.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns: mapping of character identifiers (cid's) to unicode characters
        """
        mapping = self._cid2unicode
        while True:
            try:
                cid, glyph_name = self.nextobject()
            except PSEOF:
                # Input exhausted: everything collected so far is the result.
                return mapping
            try:
                mapping[cid] = name2unicode(cast(str, glyph_name))
            except KeyError as exc:
                log.debug(str(exc))

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Record a ``(code, name)`` result whenever a ``put`` keyword is seen.

        Only ``put`` is acted on. The two topmost stack entries are popped and
        kept when they are an int key and a ``PSLiteral`` value.
        """
        if token is not self.KEYWORD_PUT:
            return
        (_, key), (_, value) = self.pop(2)
        if isinstance(key, int) and isinstance(value, PSLiteral):
            self.add_results((key, literal_name(value)))
# Text fragment for each nibble value 0x0-0xe of a packed real number, as
# consumed by getdict() below. Index 0xd carries no text (None). The value 0xf
# has no entry because getdict() treats it as the terminator before indexing.
NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-")

# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
    "DLIdent-H": "Identity-H",
    "DLIdent-V": "Identity-V",
}
def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]:
    """Parse CFF DICT data into a mapping of operator to operand list.

    The data is a stream of operands followed by the operator they belong
    to. Operands are collected on a stack; when an operator byte (``<= 21``)
    is met, the stack is stored under that operator and a new stack is begun.

    Byte 12 is the escape that introduces a two-byte operator. Such operators
    are stored under the key ``(12 << 8) | second_byte`` so they can never
    collide with, or overwrite, a one-byte operator. If the data ends right
    after the escape byte, the incomplete operator is dropped.

    :param data: raw DICT bytes
    :returns: mapping of operator code to its list of numeric operands
    """
    d: Dict[int, List[Union[float, int]]] = {}
    fp = BytesIO(data)
    stack: List[Union[float, int]] = []
    while True:
        c = fp.read(1)
        if not c:
            break
        b0 = ord(c)
        if b0 <= 21:
            key = b0
            if b0 == 12:
                # Escape: the operator is identified by the following byte.
                c = fp.read(1)
                if not c:
                    break
                key = (12 << 8) | ord(c)
            d[key] = stack
            stack = []
            continue
        if b0 == 30:
            # Real number: packed nibbles, ended by the first 0xf nibble.
            digits: List[str] = []
            done = False
            while not done:
                b = ord(fp.read(1))
                for n in (b >> 4, b & 15):
                    if n == 15:
                        # Anything after the terminator in this byte is padding.
                        done = True
                        break
                    nibble = NIBBLES[n]
                    assert nibble is not None
                    digits.append(nibble)
            value: Union[float, int] = float("".join(digits))
        elif 32 <= b0 <= 246:
            # Single-byte integer.
            value = b0 - 139
        else:
            b1 = ord(fp.read(1))
            if 247 <= b0 <= 250:
                # Two-byte positive integer.
                value = ((b0 - 247) << 8) + b1 + 108
            elif 251 <= b0 <= 254:
                # Two-byte negative integer.
                value = -((b0 - 251) << 8) - b1 - 108
            else:
                b2 = ord(fp.read(1))
                # The leading byte carries the sign of the multi-byte forms.
                if b1 >= 128:
                    b1 -= 256
                if b0 == 28:
                    # Three-byte form: signed 16-bit integer.
                    value = b1 << 8 | b2
                else:
                    # Five-byte form: signed 32-bit integer.
                    value = b1 << 24 | b2 << 16 | struct.unpack(">H", fp.read(2))[0]
        stack.append(value)
    return d
221class CFFFont:
222 STANDARD_STRINGS = (
223 ".notdef",
224 "space",
225 "exclam",
226 "quotedbl",
227 "numbersign",
228 "dollar",
229 "percent",
230 "ampersand",
231 "quoteright",
232 "parenleft",
233 "parenright",
234 "asterisk",
235 "plus",
236 "comma",
237 "hyphen",
238 "period",
239 "slash",
240 "zero",
241 "one",
242 "two",
243 "three",
244 "four",
245 "five",
246 "six",
247 "seven",
248 "eight",
249 "nine",
250 "colon",
251 "semicolon",
252 "less",
253 "equal",
254 "greater",
255 "question",
256 "at",
257 "A",
258 "B",
259 "C",
260 "D",
261 "E",
262 "F",
263 "G",
264 "H",
265 "I",
266 "J",
267 "K",
268 "L",
269 "M",
270 "N",
271 "O",
272 "P",
273 "Q",
274 "R",
275 "S",
276 "T",
277 "U",
278 "V",
279 "W",
280 "X",
281 "Y",
282 "Z",
283 "bracketleft",
284 "backslash",
285 "bracketright",
286 "asciicircum",
287 "underscore",
288 "quoteleft",
289 "a",
290 "b",
291 "c",
292 "d",
293 "e",
294 "f",
295 "g",
296 "h",
297 "i",
298 "j",
299 "k",
300 "l",
301 "m",
302 "n",
303 "o",
304 "p",
305 "q",
306 "r",
307 "s",
308 "t",
309 "u",
310 "v",
311 "w",
312 "x",
313 "y",
314 "z",
315 "braceleft",
316 "bar",
317 "braceright",
318 "asciitilde",
319 "exclamdown",
320 "cent",
321 "sterling",
322 "fraction",
323 "yen",
324 "florin",
325 "section",
326 "currency",
327 "quotesingle",
328 "quotedblleft",
329 "guillemotleft",
330 "guilsinglleft",
331 "guilsinglright",
332 "fi",
333 "fl",
334 "endash",
335 "dagger",
336 "daggerdbl",
337 "periodcentered",
338 "paragraph",
339 "bullet",
340 "quotesinglbase",
341 "quotedblbase",
342 "quotedblright",
343 "guillemotright",
344 "ellipsis",
345 "perthousand",
346 "questiondown",
347 "grave",
348 "acute",
349 "circumflex",
350 "tilde",
351 "macron",
352 "breve",
353 "dotaccent",
354 "dieresis",
355 "ring",
356 "cedilla",
357 "hungarumlaut",
358 "ogonek",
359 "caron",
360 "emdash",
361 "AE",
362 "ordfeminine",
363 "Lslash",
364 "Oslash",
365 "OE",
366 "ordmasculine",
367 "ae",
368 "dotlessi",
369 "lslash",
370 "oslash",
371 "oe",
372 "germandbls",
373 "onesuperior",
374 "logicalnot",
375 "mu",
376 "trademark",
377 "Eth",
378 "onehalf",
379 "plusminus",
380 "Thorn",
381 "onequarter",
382 "divide",
383 "brokenbar",
384 "degree",
385 "thorn",
386 "threequarters",
387 "twosuperior",
388 "registered",
389 "minus",
390 "eth",
391 "multiply",
392 "threesuperior",
393 "copyright",
394 "Aacute",
395 "Acircumflex",
396 "Adieresis",
397 "Agrave",
398 "Aring",
399 "Atilde",
400 "Ccedilla",
401 "Eacute",
402 "Ecircumflex",
403 "Edieresis",
404 "Egrave",
405 "Iacute",
406 "Icircumflex",
407 "Idieresis",
408 "Igrave",
409 "Ntilde",
410 "Oacute",
411 "Ocircumflex",
412 "Odieresis",
413 "Ograve",
414 "Otilde",
415 "Scaron",
416 "Uacute",
417 "Ucircumflex",
418 "Udieresis",
419 "Ugrave",
420 "Yacute",
421 "Ydieresis",
422 "Zcaron",
423 "aacute",
424 "acircumflex",
425 "adieresis",
426 "agrave",
427 "aring",
428 "atilde",
429 "ccedilla",
430 "eacute",
431 "ecircumflex",
432 "edieresis",
433 "egrave",
434 "iacute",
435 "icircumflex",
436 "idieresis",
437 "igrave",
438 "ntilde",
439 "oacute",
440 "ocircumflex",
441 "odieresis",
442 "ograve",
443 "otilde",
444 "scaron",
445 "uacute",
446 "ucircumflex",
447 "udieresis",
448 "ugrave",
449 "yacute",
450 "ydieresis",
451 "zcaron",
452 "exclamsmall",
453 "Hungarumlautsmall",
454 "dollaroldstyle",
455 "dollarsuperior",
456 "ampersandsmall",
457 "Acutesmall",
458 "parenleftsuperior",
459 "parenrightsuperior",
460 "twodotenleader",
461 "onedotenleader",
462 "zerooldstyle",
463 "oneoldstyle",
464 "twooldstyle",
465 "threeoldstyle",
466 "fouroldstyle",
467 "fiveoldstyle",
468 "sixoldstyle",
469 "sevenoldstyle",
470 "eightoldstyle",
471 "nineoldstyle",
472 "commasuperior",
473 "threequartersemdash",
474 "periodsuperior",
475 "questionsmall",
476 "asuperior",
477 "bsuperior",
478 "centsuperior",
479 "dsuperior",
480 "esuperior",
481 "isuperior",
482 "lsuperior",
483 "msuperior",
484 "nsuperior",
485 "osuperior",
486 "rsuperior",
487 "ssuperior",
488 "tsuperior",
489 "ff",
490 "ffi",
491 "ffl",
492 "parenleftinferior",
493 "parenrightinferior",
494 "Circumflexsmall",
495 "hyphensuperior",
496 "Gravesmall",
497 "Asmall",
498 "Bsmall",
499 "Csmall",
500 "Dsmall",
501 "Esmall",
502 "Fsmall",
503 "Gsmall",
504 "Hsmall",
505 "Ismall",
506 "Jsmall",
507 "Ksmall",
508 "Lsmall",
509 "Msmall",
510 "Nsmall",
511 "Osmall",
512 "Psmall",
513 "Qsmall",
514 "Rsmall",
515 "Ssmall",
516 "Tsmall",
517 "Usmall",
518 "Vsmall",
519 "Wsmall",
520 "Xsmall",
521 "Ysmall",
522 "Zsmall",
523 "colonmonetary",
524 "onefitted",
525 "rupiah",
526 "Tildesmall",
527 "exclamdownsmall",
528 "centoldstyle",
529 "Lslashsmall",
530 "Scaronsmall",
531 "Zcaronsmall",
532 "Dieresissmall",
533 "Brevesmall",
534 "Caronsmall",
535 "Dotaccentsmall",
536 "Macronsmall",
537 "figuredash",
538 "hypheninferior",
539 "Ogoneksmall",
540 "Ringsmall",
541 "Cedillasmall",
542 "questiondownsmall",
543 "oneeighth",
544 "threeeighths",
545 "fiveeighths",
546 "seveneighths",
547 "onethird",
548 "twothirds",
549 "zerosuperior",
550 "foursuperior",
551 "fivesuperior",
552 "sixsuperior",
553 "sevensuperior",
554 "eightsuperior",
555 "ninesuperior",
556 "zeroinferior",
557 "oneinferior",
558 "twoinferior",
559 "threeinferior",
560 "fourinferior",
561 "fiveinferior",
562 "sixinferior",
563 "seveninferior",
564 "eightinferior",
565 "nineinferior",
566 "centinferior",
567 "dollarinferior",
568 "periodinferior",
569 "commainferior",
570 "Agravesmall",
571 "Aacutesmall",
572 "Acircumflexsmall",
573 "Atildesmall",
574 "Adieresissmall",
575 "Aringsmall",
576 "AEsmall",
577 "Ccedillasmall",
578 "Egravesmall",
579 "Eacutesmall",
580 "Ecircumflexsmall",
581 "Edieresissmall",
582 "Igravesmall",
583 "Iacutesmall",
584 "Icircumflexsmall",
585 "Idieresissmall",
586 "Ethsmall",
587 "Ntildesmall",
588 "Ogravesmall",
589 "Oacutesmall",
590 "Ocircumflexsmall",
591 "Otildesmall",
592 "Odieresissmall",
593 "OEsmall",
594 "Oslashsmall",
595 "Ugravesmall",
596 "Uacutesmall",
597 "Ucircumflexsmall",
598 "Udieresissmall",
599 "Yacutesmall",
600 "Thornsmall",
601 "Ydieresissmall",
602 "001.000",
603 "001.001",
604 "001.002",
605 "001.003",
606 "Black",
607 "Bold",
608 "Book",
609 "Light",
610 "Medium",
611 "Regular",
612 "Roman",
613 "Semibold",
614 )
616 class INDEX:
617 def __init__(self, fp: BinaryIO) -> None:
618 self.fp = fp
619 self.offsets: List[int] = []
620 (count, offsize) = struct.unpack(">HB", self.fp.read(3))
621 for i in range(count + 1):
622 self.offsets.append(nunpack(self.fp.read(offsize)))
623 self.base = self.fp.tell() - 1
624 self.fp.seek(self.base + self.offsets[-1])
626 def __repr__(self) -> str:
627 return "<INDEX: size=%d>" % len(self)
629 def __len__(self) -> int:
630 return len(self.offsets) - 1
632 def __getitem__(self, i: int) -> bytes:
633 self.fp.seek(self.base + self.offsets[i])
634 return self.fp.read(self.offsets[i + 1] - self.offsets[i])
636 def __iter__(self) -> Iterator[bytes]:
637 return iter(self[i] for i in range(len(self)))
639 def __init__(self, name: str, fp: BinaryIO) -> None:
640 self.name = name
641 self.fp = fp
642 # Header
643 (_major, _minor, hdrsize, offsize) = struct.unpack("BBBB", self.fp.read(4))
644 self.fp.read(hdrsize - 4)
645 # Name INDEX
646 self.name_index = self.INDEX(self.fp)
647 # Top DICT INDEX
648 self.dict_index = self.INDEX(self.fp)
649 # String INDEX
650 self.string_index = self.INDEX(self.fp)
651 # Global Subr INDEX
652 self.subr_index = self.INDEX(self.fp)
653 # Top DICT DATA
654 self.top_dict = getdict(self.dict_index[0])
655 (charset_pos,) = self.top_dict.get(15, [0])
656 (encoding_pos,) = self.top_dict.get(16, [0])
657 (charstring_pos,) = self.top_dict.get(17, [0])
658 # CharStrings
659 self.fp.seek(cast(int, charstring_pos))
660 self.charstring = self.INDEX(self.fp)
661 self.nglyphs = len(self.charstring)
662 # Encodings
663 self.code2gid = {}
664 self.gid2code = {}
665 self.fp.seek(cast(int, encoding_pos))
666 format = self.fp.read(1)
667 if format == b"\x00":
668 # Format 0
669 (n,) = struct.unpack("B", self.fp.read(1))
670 for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))):
671 self.code2gid[code] = gid
672 self.gid2code[gid] = code
673 elif format == b"\x01":
674 # Format 1
675 (n,) = struct.unpack("B", self.fp.read(1))
676 code = 0
677 for i in range(n):
678 (first, nleft) = struct.unpack("BB", self.fp.read(2))
679 for gid in range(first, first + nleft + 1):
680 self.code2gid[code] = gid
681 self.gid2code[gid] = code
682 code += 1
683 else:
684 raise PDFValueError("unsupported encoding format: %r" % format)
685 # Charsets
686 self.name2gid = {}
687 self.gid2name = {}
688 self.fp.seek(cast(int, charset_pos))
689 format = self.fp.read(1)
690 if format == b"\x00":
691 # Format 0
692 n = self.nglyphs - 1
693 for gid, sid in enumerate(
694 cast(
695 Tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n))
696 ),
697 ):
698 gid += 1
699 sidname = self.getstr(sid)
700 self.name2gid[sidname] = gid
701 self.gid2name[gid] = sidname
702 elif format == b"\x01":
703 # Format 1
704 (n,) = struct.unpack("B", self.fp.read(1))
705 sid = 0
706 for i in range(n):
707 (first, nleft) = struct.unpack("BB", self.fp.read(2))
708 for gid in range(first, first + nleft + 1):
709 sidname = self.getstr(sid)
710 self.name2gid[sidname] = gid
711 self.gid2name[gid] = sidname
712 sid += 1
713 elif format == b"\x02":
714 # Format 2
715 assert False, str(("Unhandled", format))
716 else:
717 raise PDFValueError("unsupported charset format: %r" % format)
719 def getstr(self, sid: int) -> Union[str, bytes]:
720 # This returns str for one of the STANDARD_STRINGS but bytes otherwise,
721 # and appears to be a needless source of type complexity.
722 if sid < len(self.STANDARD_STRINGS):
723 return self.STANDARD_STRINGS[sid]
724 return self.string_index[sid - len(self.STANDARD_STRINGS)]
class TrueTypeFont:
    """Reader for the table directory and ``cmap`` table of a TrueType font program."""

    class CMapNotFound(PDFException):
        """Raised when the font yields no usable character-to-glyph mapping."""

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Read the table directory from ``fp``.

        :param name: stored as ``self.name``
        :param fp: seekable binary stream positioned at the start of the font
        """
        self.name = name
        self.fp = fp
        # table tag -> (offset, length)
        self.tables: Dict[bytes, Tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = cast(
                Tuple[int, int, int, int],
                struct.unpack(">HHHH", fp.read(8)),
            )
            for _ in range(ntables):
                (name_bytes, tsum, offset, length) = cast(
                    Tuple[bytes, int, int, int],
                    struct.unpack(">4sLLL", fp.read(16)),
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a glyph-id to character map from the font's Unicode cmap subtables.

        Subtable formats 0, 2 and 4 are decoded; subtables in any other
        format are skipped.

        :raises TrueTypeFont.CMapNotFound: if there is no ``cmap`` table or no
            supported Unicode subtable produced any mapping
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables[b"cmap"]
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = cast(Tuple[int, int], struct.unpack(">HH", fp.read(4)))
        subtables: List[Tuple[int, int, int]] = []
        for i in range(nsubtables):
            subtables.append(
                cast(Tuple[int, int, int], struct.unpack(">HHL", fp.read(8))),
            )
        char2gid: Dict[int, int] = {}
        # Only supports subtable type 0, 2 and 4.
        for platform_id, encoding_id, st_offset in subtables:
            # Skip non-Unicode cmaps.
            # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
            if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
                continue
            fp.seek(base_offset + st_offset)
            (fmttype, fmtlen, fmtlang) = cast(
                Tuple[int, int, int],
                struct.unpack(">HHH", fp.read(6)),
            )
            if fmttype == 0:
                char2gid.update(
                    enumerate(
                        cast(Tuple[int, ...], struct.unpack(">256B", fp.read(256))),
                    ),
                )
            elif fmttype == 2:
                subheaderkeys = cast(
                    Tuple[int, ...],
                    struct.unpack(">256H", fp.read(512)),
                )
                # Invert the key table: subheader index -> high byte using it.
                firstbytes = [0] * 8192
                for i, k in enumerate(subheaderkeys):
                    firstbytes[k // 8] = i
                nhdrs = max(subheaderkeys) // 8 + 1
                hdrs: List[Tuple[int, int, int, int, int]] = []
                for i in range(nhdrs):
                    (firstcode, entcount, delta, offset) = cast(
                        Tuple[int, int, int, int],
                        struct.unpack(">HHhH", fp.read(8)),
                    )
                    # The range offset is relative to its own 2-byte field,
                    # which is the last thing just read.
                    hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
                for i, firstcode, entcount, delta, pos in hdrs:
                    if not entcount:
                        continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        gid = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                        if gid:
                            gid += delta
                        char2gid[first + c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = cast(
                    Tuple[int, int, int, int],
                    struct.unpack(">HHHH", fp.read(8)),
                )
                segcount //= 2
                ecs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                fp.read(2)  # reservedPad
                scs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                idds = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dh" % segcount, fp.read(2 * segcount)),
                )
                pos = fp.tell()
                idrs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                for seg, (ec, sc, idd, idr) in enumerate(zip(ecs, scs, idds, idrs)):
                    if idr:
                        # idRangeOffset counts from the location of this
                        # segment's own idRangeOffset entry, i.e. pos + 2*seg,
                        # not from the start of the idRangeOffset array.
                        fp.seek(pos + 2 * seg + idr)
                        for c in range(sc, ec + 1):
                            b = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                            # A stored 0 means "missing glyph" and must not
                            # have idDelta applied.
                            char2gid[c] = (b + idd) & 0xFFFF if b else 0
                    else:
                        for c in range(sc, ec + 1):
                            char2gid[c] = (c + idd) & 0xFFFF
            else:
                # Other formats are not decoded. Skip the subtable so that
                # another one may still supply a mapping; if none does, the
                # CMapNotFound below reports it.
                log.debug("Skipping unsupported cmap subtable format %r", fmttype)
                continue
        if not char2gid:
            raise TrueTypeFont.CMapNotFound
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map
class PDFFontError(PDFException):
    """Error raised by the font classes in this module, e.g. for a missing BaseFont."""
class PDFUnicodeNotDefined(PDFFontError):
    """Raised by ``to_unichr`` when a character id has no unicode mapping."""
# Literal used as the encoding when a font spec names none
# (see PDFSimpleFont.__init__).
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
LITERAL_TYPE1C = LIT("Type1C")

# Font widths are maintained in a dict type that maps from *either* unicode
# chars or integer character IDs.
FontWidthDict = Dict[Union[int, str], float]
class PDFFont:
    """Base class holding the descriptor-derived metrics and width table of a font.

    Subclasses supply ``to_unichr`` and may override the writing-mode and
    decoding hooks.
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: Optional[float] = None,
    ) -> None:
        """Read the common metrics out of ``descriptor``.

        :param descriptor: font descriptor mapping; missing entries default to
            0 (``"unknown"`` for the font name)
        :param widths: width table keyed by character id or unicode string;
            stored after ``resolve_all``
        :param default_width: width for characters absent from ``widths``;
            when None the descriptor's ``MissingWidth`` (default 0) is used
        """
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)
        self.fontname = resolve1(descriptor.get("FontName", "unknown"))
        if isinstance(self.fontname, PSLiteral):
            self.fontname = literal_name(self.fontname)
        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))
        if default_width is None:
            self.default_width = num_value(descriptor.get("MissingWidth", 0))
        else:
            self.default_width = default_width
        self.default_width = resolve1(self.default_width)
        self.leading = num_value(descriptor.get("Leading", 0))
        self.bbox = self._parse_bbox(descriptor)
        # Scale factors applied to widths and heights by the getters below.
        self.hscale = self.vscale = 0.001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        return False

    def is_multibyte(self) -> bool:
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split a byte string into character ids, one per byte."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self) -> float:
        """Width of the font bounding box, scaled by ``hscale``.

        Falls back to the negated default width when the box has zero width.
        """
        w = self.bbox[2] - self.bbox[0]
        if w == 0:
            # NOTE(review): the negation is long-standing behaviour and is
            # kept as is; the reason for the sign is not visible here.
            w = -self.default_width
        return w * self.hscale

    def get_height(self) -> float:
        """Height of the font bounding box, scaled by ``vscale``.

        Falls back to ``ascent - descent`` when the box has zero height.
        """
        h = self.bbox[3] - self.bbox[1]
        if h == 0:
            h = self.ascent - self.descent
        return h * self.vscale

    def char_width(self, cid: int) -> float:
        """Return the scaled width of character ``cid``."""
        # Because character widths may be mapping either IDs or strings,
        # we try to lookup the character ID first, then its str equivalent.
        cid_width = safe_float(self.widths.get(cid))
        if cid_width is not None:
            return cid_width * self.hscale

        try:
            str_cid = self.to_unichr(cid)
            cid_width = safe_float(self.widths.get(str_cid))
            if cid_width is not None:
                return cid_width * self.hscale

        except PDFUnicodeNotDefined:
            pass

        return self.default_width * self.hscale

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Sum of ``char_width`` over every character id decoded from ``s``."""
        return sum(self.char_width(cid) for cid in self.decode(s))

    def to_unichr(self, cid: int) -> str:
        """Map a character id to unicode text; implemented by subclasses."""
        raise NotImplementedError

    @staticmethod
    def _parse_bbox(descriptor: Mapping[str, Any]) -> Rect:
        """Parse FontBBox from the fonts descriptor

        Returns an all-zero box, after logging a warning, when the entry
        cannot be read as four floats.
        """
        font_bbox = resolve_all(descriptor.get("FontBBox"))
        bbox = safe_rect_list(font_bbox)
        if bbox is None:
            log.warning(
                f"Could not get FontBBox from font descriptor because {font_bbox!r} cannot be parsed as 4 floats"
            )
            return 0.0, 0.0, 0.0, 0.0
        return bbox
class PDFSimpleFont(PDFFont):
    """Font whose character ids map to unicode through an encoding table."""

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        spec: Mapping[str, Any],
    ) -> None:
        """Set up ``cid2unicode`` and the optional ToUnicode map from ``spec``.

        :param descriptor: font descriptor, forwarded to ``PDFFont``
        :param widths: width table, forwarded to ``PDFFont``
        :param spec: font dictionary; ``Encoding`` and ``ToUnicode`` are read
        """
        # The encoding is either the name of a built-in encoding or a
        # dictionary describing differences from a base encoding.
        encoding = (
            resolve1(spec["Encoding"])
            if "Encoding" in spec
            else LITERAL_STANDARD_ENCODING
        )
        if isinstance(encoding, dict):
            base_name = literal_name(
                encoding.get("BaseEncoding", LITERAL_STANDARD_ENCODING)
            )
            differences = list_value(encoding.get("Differences", []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)
        else:
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))

        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            tounicode = stream_value(spec["ToUnicode"])
            self.unicode_map = FileUnicodeMap()
            cmap_source = BytesIO(tounicode.get_data())
            CMapParser(self.unicode_map, cmap_source).run()

        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode text for ``cid``.

        The ToUnicode map, when present, is consulted first; the encoding
        table is the fallback.

        :raises PDFUnicodeNotDefined: if neither source knows ``cid``
        """
        umap = self.unicode_map
        if umap:
            try:
                return umap.get_unichr(cid)
            except KeyError:
                # Not in the ToUnicode map: fall back to the encoding table.
                pass
        try:
            return self.cid2unicode[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)
class PDFType1Font(PDFSimpleFont):
    """Simple font that takes metrics from ``FontMetricsDB`` when its BaseFont is listed there."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from its font dictionary ``spec``.

        :param rsrcmgr: accepted for signature compatibility; not read here
        :param spec: font dictionary
        :raises PDFFontError: if ``BaseFont`` is missing and
            ``settings.STRICT`` is set
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"

        widths: FontWidthDict
        try:
            (descriptor, builtin_widths) = FontMetricsDB.get_metrics(self.basefont)
            widths = cast(
                Dict[Union[str, int], float], builtin_widths
            )  # implicit int->float
        except KeyError:
            # No built-in metrics: use the descriptor and Widths array of the
            # spec itself, numbering the widths from FirstChar.
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            first_code = int_value(spec.get("FirstChar", 0))
            raw_widths = list_value(spec.get("Widths", [0] * 256))
            widths = {
                code: resolve1(width)
                for (code, width) in enumerate(raw_widths, first_code)
            }
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        if "Encoding" in spec or "FontFile" not in descriptor:
            return
        # try to recover the missing encoding info from the font file.
        self.fontfile = stream_value(descriptor.get("FontFile"))
        header_length = int_value(self.fontfile["Length1"])
        header = self.fontfile.get_data()[:header_length]
        self.cid2unicode = Type1FontHeaderParser(BytesIO(header)).get_encoding()

    def __repr__(self) -> str:
        return "<PDFType1Font: basefont=%r>" % self.basefont
class PDFTrueTypeFont(PDFType1Font):
    """Same construction as ``PDFType1Font``; only the ``repr`` text differs."""

    def __repr__(self) -> str:
        basefont = self.basefont
        return "<PDFTrueTypeFont: basefont=%r>" % basefont
class PDFType3Font(PDFSimpleFont):
    """Simple font whose scale and vertical extents come from its own FontMatrix and bbox."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from its font dictionary ``spec``.

        :param rsrcmgr: accepted for signature compatibility; not read here
        :param spec: font dictionary; ``FontBBox`` is required when there is
            no ``FontDescriptor``
        """
        first_code = int_value(spec.get("FirstChar", 0))
        raw_widths = list_value(spec.get("Widths", [0] * 256))
        widths: Dict[Union[str, int], float] = {
            code: width for (code, width) in enumerate(raw_widths, first_code)
        }
        descriptor = (
            dict_value(spec["FontDescriptor"])
            if "FontDescriptor" in spec
            else {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
        )
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix"))))
        # Descent and ascent are taken from the bbox's 2nd and 4th values,
        # replacing what the descriptor supplied.
        (_, bbox_low, _, bbox_high) = self.bbox
        self.descent = bbox_low
        self.ascent = bbox_high
        # Scales come from the font matrix applied to the unit vector.
        (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))

    def __repr__(self) -> str:
        return "<PDFType3Font>"
class PDFCIDFont(PDFFont):
    """Multi-byte font whose byte strings are decoded to character ids by a CMap."""

    default_disp: Union[float, Tuple[Optional[float], float]]

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        spec: Mapping[str, Any],
        strict: bool = settings.STRICT,
    ) -> None:
        """Build the font from its font dictionary ``spec``.

        :param rsrcmgr: accepted for signature compatibility; not read here
        :param spec: font dictionary
        :param strict: when true, a missing ``BaseFont``, ``FontDescriptor``
            or encoding raises ``PDFFontError`` instead of falling back to a
            default
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if strict:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"
        self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
        cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode(
            "latin1",
        )
        cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode(
            "latin1",
        )
        self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
        self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec["FontDescriptor"])
        except KeyError:
            if strict:
                raise PDFFontError("FontDescriptor is missing")
            descriptor = {}
        ttf = None
        if "FontFile2" in descriptor:
            self.fontfile = stream_value(descriptor.get("FontFile2"))
            ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))
        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            if isinstance(spec["ToUnicode"], PDFStream):
                strm = stream_value(spec["ToUnicode"])
                self.unicode_map = FileUnicodeMap()
                CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
            else:
                cmap_name = literal_name(spec["ToUnicode"])
                # A missing Encoding is tolerated elsewhere in non-strict
                # mode (see _get_cmap_name), so it must not raise a bare
                # KeyError here either.
                encoding = literal_name(spec["Encoding"]) if "Encoding" in spec else ""
                if (
                    "Identity" in cid_ordering
                    or "Identity" in cmap_name
                    or "Identity" in encoding
                ):
                    self.unicode_map = IdentityUnicodeMap()
        elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
            if ttf:
                try:
                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding,
                    self.cmap.is_vertical(),
                )
            except CMapDB.CMapNotFound:
                pass

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            widths2 = get_widths2(list_value(spec.get("W2", [])))
            self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()}
            (vy, w) = resolve1(spec.get("DW2", [880, -1000]))
            self.default_disp = (None, vy)
            widths: Dict[Union[str, int], float] = {
                cid: w for (cid, (w, _)) in widths2.items()
            }
            default_width = w
        else:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get("W", [])))
            default_width = spec.get("DW", 1000)
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.

        An unknown cmap name raises ``PDFFontError`` when ``strict`` is set;
        otherwise an empty ``CMap`` is returned.
        """
        cmap_name = self._get_cmap_name(spec, strict)

        try:
            return CMapDB.get_cmap(cmap_name)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e)
            return CMap()

    @staticmethod
    def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
        """Get cmap name from font specification

        Returns ``"unknown"`` when no encoding is given and ``strict`` is
        false. Names listed in ``IDENTITY_ENCODER`` are translated before
        being returned.
        """
        cmap_name = "unknown"  # default value

        try:
            spec_encoding = spec["Encoding"]
            if hasattr(spec_encoding, "name"):
                cmap_name = literal_name(spec["Encoding"])
            else:
                cmap_name = literal_name(spec_encoding["CMapName"])
        except KeyError:
            if strict:
                raise PDFFontError("Encoding is unspecified")

        if type(cmap_name) is PDFStream:  # type: ignore[comparison-overlap]
            cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
            if "CMapName" in cmap_name_stream:
                cmap_name = cmap_name_stream.get("CMapName").name
            elif strict:
                raise PDFFontError("CMapName unspecified for encoding")

        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

    def __repr__(self) -> str:
        return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"

    def is_vertical(self) -> bool:
        return self.vertical

    def is_multibyte(self) -> bool:
        return True

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Decode a byte string into character ids using the font's cmap."""
        return self.cmap.decode(bytes)

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode text for ``cid``.

        :raises PDFUnicodeNotDefined: if there is no unicode map or it has no
            entry for ``cid``
        """
        try:
            if not self.unicode_map:
                raise PDFKeyError(cid)
            return self.unicode_map.get_unichr(cid)
        except KeyError:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)