Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdffont.py: 45%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import contextlib
2import logging
3import struct
4from collections.abc import Iterable, Iterator, Mapping
5from io import BytesIO
6from typing import (
7 TYPE_CHECKING,
8 Any,
9 BinaryIO,
10 cast,
11)
13from pdfminer import settings
14from pdfminer.casting import safe_float, safe_rect_list
15from pdfminer.cmapdb import (
16 CMap,
17 CMapBase,
18 CMapDB,
19 CMapParser,
20 FileUnicodeMap,
21 IdentityUnicodeMap,
22 UnicodeMap,
23)
24from pdfminer.encodingdb import EncodingDB, name2unicode
25from pdfminer.fontmetrics import FONT_METRICS
26from pdfminer.pdfexceptions import PDFException, PDFKeyError, PDFValueError
27from pdfminer.pdftypes import (
28 PDFStream,
29 dict_value,
30 int_value,
31 list_value,
32 num_value,
33 resolve1,
34 resolve_all,
35 stream_value,
36)
37from pdfminer.psexceptions import PSEOF
38from pdfminer.psparser import (
39 KWD,
40 LIT,
41 PSKeyword,
42 PSLiteral,
43 PSStackParser,
44 literal_name,
45)
46from pdfminer.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack
48if TYPE_CHECKING:
49 from pdfminer.pdfinterp import PDFResourceManager
51log = logging.getLogger(__name__)
def get_widths(seq: Iterable[object]) -> dict[str | int, float]:
    """Build a mapping of character widths for horizontal writing.

    Numbers are buffered until either a list arrives (the last buffered
    number is the first code and the list holds one width per consecutive
    code) or three numbers have been collected (``first last width``).
    Anything that is neither a number nor a list is logged and ignored.
    """
    table: dict[int, float] = {}
    pending: list[float] = []
    for item in seq:
        item = resolve1(item)
        if isinstance(item, list):
            if not pending:
                continue
            start = cast(int, pending[-1])
            table.update((start + offset, width) for offset, width in enumerate(item))
            pending = []
            continue
        if not isinstance(item, (int, float)):  # == not utils.isnumber(item)
            log.warning(
                f"Skipping invalid font width specification for {item} "
                f"because it is not a number or a list"
            )
            continue
        pending.append(item)
        if len(pending) < 3:
            continue
        first, last, width = pending
        pending = []
        if isinstance(first, int) and isinstance(last, int):
            table.update(dict.fromkeys(range(first, last + 1), width))
        else:
            log.warning(
                f"Skipping invalid font width specification for {first} to "
                f"{last} because either of them is not an int"
            )
    return cast(dict[str | int, float], table)
def get_widths2(seq: Iterable[object]) -> dict[int, tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    Mirrors :func:`get_widths`, but each entry carries ``(w, (vx, vy))``:
    numbers are buffered until a list arrives (last buffered number is the
    first code; the list is consumed in groups of three via ``choplist``) or
    until five numbers ``first last w vx vy`` have been collected.

    Elements are passed through ``resolve1`` first, and range
    specifications whose bounds are not ints are logged and skipped instead
    of raising ``TypeError`` from ``range()``.
    """
    widths: dict[int, tuple[float, Point]] = {}
    r: list[float] = []
    for v in seq:
        v = resolve1(v)
        if isinstance(v, list):
            if r:
                char1 = r[-1]
                for i, (w, vx, vy) in enumerate(choplist(3, v)):
                    widths[cast(int, char1) + i] = (w, (vx, vy))
                r = []
        elif isinstance(v, (int, float)):  # == utils.isnumber(v)
            r.append(v)
            if len(r) == 5:
                (char1, char2, w, vx, vy) = r
                if isinstance(char1, int) and isinstance(char2, int):
                    for i in range(char1, char2 + 1):
                        widths[i] = (w, (vx, vy))
                else:
                    log.warning(
                        f"Skipping invalid font width specification for {char1} to "
                        f"{char2} because either of them is not an int"
                    )
                r = []
        else:
            log.warning(
                f"Skipping invalid font width specification for {v} "
                f"because it is not a number or a list"
            )
    return widths
class FontMetricsDB:
    """Thin lookup facade over the ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname: str) -> tuple[dict[str, object], dict[str, int]]:
        """Return the ``FONT_METRICS`` entry stored under *fontname*.

        The lookup is a plain subscript, so whatever ``FONT_METRICS`` raises
        for an unknown name propagates (callers in this file catch
        ``KeyError``).
        """
        entry = FONT_METRICS[fontname]
        return entry
# The ``int`` type argument means PSStackParser is not extended with
# additional token types.
class Type1FontHeaderParser(PSStackParser[int]):
    """Collect ``<code> /<name> put`` pairs from a Type 1 font header."""

    KEYWORD_BEGIN = KWD(b"begin")
    KEYWORD_END = KWD(b"end")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_PUT = KWD(b"put")
    KEYWORD_DICT = KWD(b"dict")
    KEYWORD_ARRAY = KWD(b"array")
    KEYWORD_READONLY = KWD(b"readonly")
    KEYWORD_FOR = KWD(b"for")

    def __init__(self, data: BinaryIO) -> None:
        PSStackParser.__init__(self, data)
        # Filled in by get_encoding(): character code -> unicode string.
        self._cid2unicode: dict[int, str] = {}

    def get_encoding(self) -> dict[int, str]:
        """Parse the font encoding.

        Every ``(code, name)`` pair produced by the parser is translated
        with ``name2unicode``. Names for which that lookup raises
        ``KeyError`` (for example names of custom CharStrings rather than
        standard Adobe glyph names) are logged at debug level and left out
        of the result.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode characters
        """
        mapping = self._cid2unicode
        while True:
            try:
                code, glyph_name = self.nextobject()
            except PSEOF:
                return mapping
            try:
                mapping[code] = name2unicode(cast(str, glyph_name))
            except KeyError as exc:
                log.debug(str(exc))

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """On ``put``, emit ``(code, name)`` if the two popped operands fit."""
        if token is not self.KEYWORD_PUT:
            return
        (_, code), (_, glyph) = self.pop(2)
        if not isinstance(code, int):
            return
        if isinstance(glyph, PSLiteral):
            self.add_results((code, literal_name(glyph)))
161NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-")
163# Mapping of cmap names. Original cmap name is kept if not in the mapping.
164# (missing reference for why DLIdent is mapped to Identity)
165IDENTITY_ENCODER = {
166 "DLIdent-H": "Identity-H",
167 "DLIdent-V": "Identity-V",
168}
171def getdict(data: bytes) -> dict[int, list[float | int]]:
172 d: dict[int, list[float | int]] = {}
173 fp = BytesIO(data)
174 stack: list[float | int] = []
175 while 1:
176 c = fp.read(1)
177 if not c:
178 break
179 b0 = ord(c)
180 if b0 <= 21:
181 d[b0] = stack
182 stack = []
183 continue
184 if b0 == 30:
185 s = ""
186 loop = True
187 while loop:
188 b = ord(fp.read(1))
189 for n in (b >> 4, b & 15):
190 if n == 15:
191 loop = False
192 else:
193 nibble = NIBBLES[n]
194 assert nibble is not None
195 s += nibble
196 value = float(s)
197 elif b0 >= 32 and b0 <= 246:
198 value = b0 - 139
199 else:
200 b1 = ord(fp.read(1))
201 if b0 >= 247 and b0 <= 250:
202 value = ((b0 - 247) << 8) + b1 + 108
203 elif b0 >= 251 and b0 <= 254:
204 value = -((b0 - 251) << 8) - b1 - 108
205 else:
206 b2 = ord(fp.read(1))
207 if b1 >= 128:
208 b1 -= 256
209 if b0 == 28:
210 value = b1 << 8 | b2
211 else:
212 value = b1 << 24 | b2 << 16 | struct.unpack(">H", fp.read(2))[0]
213 stack.append(value)
214 return d
217class CFFFont:
218 STANDARD_STRINGS = (
219 ".notdef",
220 "space",
221 "exclam",
222 "quotedbl",
223 "numbersign",
224 "dollar",
225 "percent",
226 "ampersand",
227 "quoteright",
228 "parenleft",
229 "parenright",
230 "asterisk",
231 "plus",
232 "comma",
233 "hyphen",
234 "period",
235 "slash",
236 "zero",
237 "one",
238 "two",
239 "three",
240 "four",
241 "five",
242 "six",
243 "seven",
244 "eight",
245 "nine",
246 "colon",
247 "semicolon",
248 "less",
249 "equal",
250 "greater",
251 "question",
252 "at",
253 "A",
254 "B",
255 "C",
256 "D",
257 "E",
258 "F",
259 "G",
260 "H",
261 "I",
262 "J",
263 "K",
264 "L",
265 "M",
266 "N",
267 "O",
268 "P",
269 "Q",
270 "R",
271 "S",
272 "T",
273 "U",
274 "V",
275 "W",
276 "X",
277 "Y",
278 "Z",
279 "bracketleft",
280 "backslash",
281 "bracketright",
282 "asciicircum",
283 "underscore",
284 "quoteleft",
285 "a",
286 "b",
287 "c",
288 "d",
289 "e",
290 "f",
291 "g",
292 "h",
293 "i",
294 "j",
295 "k",
296 "l",
297 "m",
298 "n",
299 "o",
300 "p",
301 "q",
302 "r",
303 "s",
304 "t",
305 "u",
306 "v",
307 "w",
308 "x",
309 "y",
310 "z",
311 "braceleft",
312 "bar",
313 "braceright",
314 "asciitilde",
315 "exclamdown",
316 "cent",
317 "sterling",
318 "fraction",
319 "yen",
320 "florin",
321 "section",
322 "currency",
323 "quotesingle",
324 "quotedblleft",
325 "guillemotleft",
326 "guilsinglleft",
327 "guilsinglright",
328 "fi",
329 "fl",
330 "endash",
331 "dagger",
332 "daggerdbl",
333 "periodcentered",
334 "paragraph",
335 "bullet",
336 "quotesinglbase",
337 "quotedblbase",
338 "quotedblright",
339 "guillemotright",
340 "ellipsis",
341 "perthousand",
342 "questiondown",
343 "grave",
344 "acute",
345 "circumflex",
346 "tilde",
347 "macron",
348 "breve",
349 "dotaccent",
350 "dieresis",
351 "ring",
352 "cedilla",
353 "hungarumlaut",
354 "ogonek",
355 "caron",
356 "emdash",
357 "AE",
358 "ordfeminine",
359 "Lslash",
360 "Oslash",
361 "OE",
362 "ordmasculine",
363 "ae",
364 "dotlessi",
365 "lslash",
366 "oslash",
367 "oe",
368 "germandbls",
369 "onesuperior",
370 "logicalnot",
371 "mu",
372 "trademark",
373 "Eth",
374 "onehalf",
375 "plusminus",
376 "Thorn",
377 "onequarter",
378 "divide",
379 "brokenbar",
380 "degree",
381 "thorn",
382 "threequarters",
383 "twosuperior",
384 "registered",
385 "minus",
386 "eth",
387 "multiply",
388 "threesuperior",
389 "copyright",
390 "Aacute",
391 "Acircumflex",
392 "Adieresis",
393 "Agrave",
394 "Aring",
395 "Atilde",
396 "Ccedilla",
397 "Eacute",
398 "Ecircumflex",
399 "Edieresis",
400 "Egrave",
401 "Iacute",
402 "Icircumflex",
403 "Idieresis",
404 "Igrave",
405 "Ntilde",
406 "Oacute",
407 "Ocircumflex",
408 "Odieresis",
409 "Ograve",
410 "Otilde",
411 "Scaron",
412 "Uacute",
413 "Ucircumflex",
414 "Udieresis",
415 "Ugrave",
416 "Yacute",
417 "Ydieresis",
418 "Zcaron",
419 "aacute",
420 "acircumflex",
421 "adieresis",
422 "agrave",
423 "aring",
424 "atilde",
425 "ccedilla",
426 "eacute",
427 "ecircumflex",
428 "edieresis",
429 "egrave",
430 "iacute",
431 "icircumflex",
432 "idieresis",
433 "igrave",
434 "ntilde",
435 "oacute",
436 "ocircumflex",
437 "odieresis",
438 "ograve",
439 "otilde",
440 "scaron",
441 "uacute",
442 "ucircumflex",
443 "udieresis",
444 "ugrave",
445 "yacute",
446 "ydieresis",
447 "zcaron",
448 "exclamsmall",
449 "Hungarumlautsmall",
450 "dollaroldstyle",
451 "dollarsuperior",
452 "ampersandsmall",
453 "Acutesmall",
454 "parenleftsuperior",
455 "parenrightsuperior",
456 "twodotenleader",
457 "onedotenleader",
458 "zerooldstyle",
459 "oneoldstyle",
460 "twooldstyle",
461 "threeoldstyle",
462 "fouroldstyle",
463 "fiveoldstyle",
464 "sixoldstyle",
465 "sevenoldstyle",
466 "eightoldstyle",
467 "nineoldstyle",
468 "commasuperior",
469 "threequartersemdash",
470 "periodsuperior",
471 "questionsmall",
472 "asuperior",
473 "bsuperior",
474 "centsuperior",
475 "dsuperior",
476 "esuperior",
477 "isuperior",
478 "lsuperior",
479 "msuperior",
480 "nsuperior",
481 "osuperior",
482 "rsuperior",
483 "ssuperior",
484 "tsuperior",
485 "ff",
486 "ffi",
487 "ffl",
488 "parenleftinferior",
489 "parenrightinferior",
490 "Circumflexsmall",
491 "hyphensuperior",
492 "Gravesmall",
493 "Asmall",
494 "Bsmall",
495 "Csmall",
496 "Dsmall",
497 "Esmall",
498 "Fsmall",
499 "Gsmall",
500 "Hsmall",
501 "Ismall",
502 "Jsmall",
503 "Ksmall",
504 "Lsmall",
505 "Msmall",
506 "Nsmall",
507 "Osmall",
508 "Psmall",
509 "Qsmall",
510 "Rsmall",
511 "Ssmall",
512 "Tsmall",
513 "Usmall",
514 "Vsmall",
515 "Wsmall",
516 "Xsmall",
517 "Ysmall",
518 "Zsmall",
519 "colonmonetary",
520 "onefitted",
521 "rupiah",
522 "Tildesmall",
523 "exclamdownsmall",
524 "centoldstyle",
525 "Lslashsmall",
526 "Scaronsmall",
527 "Zcaronsmall",
528 "Dieresissmall",
529 "Brevesmall",
530 "Caronsmall",
531 "Dotaccentsmall",
532 "Macronsmall",
533 "figuredash",
534 "hypheninferior",
535 "Ogoneksmall",
536 "Ringsmall",
537 "Cedillasmall",
538 "questiondownsmall",
539 "oneeighth",
540 "threeeighths",
541 "fiveeighths",
542 "seveneighths",
543 "onethird",
544 "twothirds",
545 "zerosuperior",
546 "foursuperior",
547 "fivesuperior",
548 "sixsuperior",
549 "sevensuperior",
550 "eightsuperior",
551 "ninesuperior",
552 "zeroinferior",
553 "oneinferior",
554 "twoinferior",
555 "threeinferior",
556 "fourinferior",
557 "fiveinferior",
558 "sixinferior",
559 "seveninferior",
560 "eightinferior",
561 "nineinferior",
562 "centinferior",
563 "dollarinferior",
564 "periodinferior",
565 "commainferior",
566 "Agravesmall",
567 "Aacutesmall",
568 "Acircumflexsmall",
569 "Atildesmall",
570 "Adieresissmall",
571 "Aringsmall",
572 "AEsmall",
573 "Ccedillasmall",
574 "Egravesmall",
575 "Eacutesmall",
576 "Ecircumflexsmall",
577 "Edieresissmall",
578 "Igravesmall",
579 "Iacutesmall",
580 "Icircumflexsmall",
581 "Idieresissmall",
582 "Ethsmall",
583 "Ntildesmall",
584 "Ogravesmall",
585 "Oacutesmall",
586 "Ocircumflexsmall",
587 "Otildesmall",
588 "Odieresissmall",
589 "OEsmall",
590 "Oslashsmall",
591 "Ugravesmall",
592 "Uacutesmall",
593 "Ucircumflexsmall",
594 "Udieresissmall",
595 "Yacutesmall",
596 "Thornsmall",
597 "Ydieresissmall",
598 "001.000",
599 "001.001",
600 "001.002",
601 "001.003",
602 "Black",
603 "Bold",
604 "Book",
605 "Light",
606 "Medium",
607 "Regular",
608 "Roman",
609 "Semibold",
610 )
612 class INDEX:
613 def __init__(self, fp: BinaryIO) -> None:
614 self.fp = fp
615 self.offsets: list[int] = []
616 (count, offsize) = struct.unpack(">HB", self.fp.read(3))
617 for _i in range(count + 1):
618 self.offsets.append(nunpack(self.fp.read(offsize)))
619 self.base = self.fp.tell() - 1
620 self.fp.seek(self.base + self.offsets[-1])
622 def __repr__(self) -> str:
623 return f"<INDEX: size={len(self)}>"
625 def __len__(self) -> int:
626 return len(self.offsets) - 1
628 def __getitem__(self, i: int) -> bytes:
629 self.fp.seek(self.base + self.offsets[i])
630 return self.fp.read(self.offsets[i + 1] - self.offsets[i])
632 def __iter__(self) -> Iterator[bytes]:
633 return iter(self[i] for i in range(len(self)))
635 def __init__(self, name: str, fp: BinaryIO) -> None:
636 self.name = name
637 self.fp = fp
638 # Header
639 (_major, _minor, hdrsize, _offsize) = struct.unpack("BBBB", self.fp.read(4))
640 self.fp.read(hdrsize - 4)
641 # Name INDEX
642 self.name_index = self.INDEX(self.fp)
643 # Top DICT INDEX
644 self.dict_index = self.INDEX(self.fp)
645 # String INDEX
646 self.string_index = self.INDEX(self.fp)
647 # Global Subr INDEX
648 self.subr_index = self.INDEX(self.fp)
649 # Top DICT DATA
650 self.top_dict = getdict(self.dict_index[0])
651 (charset_pos,) = self.top_dict.get(15, [0])
652 (encoding_pos,) = self.top_dict.get(16, [0])
653 (charstring_pos,) = self.top_dict.get(17, [0])
654 # CharStrings
655 self.fp.seek(cast(int, charstring_pos))
656 self.charstring = self.INDEX(self.fp)
657 self.nglyphs = len(self.charstring)
658 # Encodings
659 self.code2gid = {}
660 self.gid2code = {}
661 self.fp.seek(cast(int, encoding_pos))
662 format = self.fp.read(1)
663 if format == b"\x00":
664 # Format 0
665 (n,) = struct.unpack("B", self.fp.read(1))
666 for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))):
667 self.code2gid[code] = gid
668 self.gid2code[gid] = code
669 elif format == b"\x01":
670 # Format 1
671 (n,) = struct.unpack("B", self.fp.read(1))
672 code = 0
673 for _i in range(n):
674 (first, nleft) = struct.unpack("BB", self.fp.read(2))
675 for gid in range(first, first + nleft + 1):
676 self.code2gid[code] = gid
677 self.gid2code[gid] = code
678 code += 1
679 else:
680 raise PDFValueError(f"unsupported encoding format: {format!r}")
681 # Charsets
682 self.name2gid = {}
683 self.gid2name = {}
684 self.fp.seek(cast(int, charset_pos))
685 format = self.fp.read(1)
686 if format == b"\x00":
687 # Format 0
688 n = self.nglyphs - 1
689 for gid, sid in enumerate(
690 cast(
691 tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n))
692 ),
693 ):
694 gid += 1
695 sidname = self.getstr(sid)
696 self.name2gid[sidname] = gid
697 self.gid2name[gid] = sidname
698 elif format == b"\x01":
699 # Format 1
700 (n,) = struct.unpack("B", self.fp.read(1))
701 sid = 0
702 for _i in range(n):
703 (first, nleft) = struct.unpack("BB", self.fp.read(2))
704 for gid in range(first, first + nleft + 1):
705 sidname = self.getstr(sid)
706 self.name2gid[sidname] = gid
707 self.gid2name[gid] = sidname
708 sid += 1
709 elif format == b"\x02":
710 # Format 2
711 raise AssertionError(str(("Unhandled", format)))
712 else:
713 raise PDFValueError(f"unsupported charset format: {format!r}")
715 def getstr(self, sid: int) -> str | bytes:
716 # This returns str for one of the STANDARD_STRINGS but bytes otherwise,
717 # and appears to be a needless source of type complexity.
718 if sid < len(self.STANDARD_STRINGS):
719 return self.STANDARD_STRINGS[sid]
720 return self.string_index[sid - len(self.STANDARD_STRINGS)]
class TrueTypeFont:
    """Reader for the table directory and ``cmap`` table of a TrueType font."""

    class CMapNotFound(PDFException):
        """Raised when the font has no usable Unicode ``cmap`` subtable."""

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Read the table directory from *fp*.

        :param name: font name, stored as ``self.name``.
        :param fp: seekable binary stream positioned at the start of the font.
        """
        self.name = name
        self.fp = fp
        # table tag -> (offset, length)
        self.tables: dict[bytes, tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = cast(
                tuple[int, int, int, int],
                struct.unpack(">HHHH", fp.read(8)),
            )
            for _ in range(ntables):
                (name_bytes, _tsum, offset, length) = cast(
                    tuple[bytes, int, int, int],
                    struct.unpack(">4sLLL", fp.read(16)),
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a glyph-id -> character map from the Unicode cmap subtables.

        Subtable formats 0, 2 and 4 are decoded. Subtables in any other
        format are logged and skipped, so one unsupported subtable no longer
        aborts the whole lookup.

        :raises TrueTypeFont.CMapNotFound: if there is no ``cmap`` table or
            no supported Unicode subtable yielded any mapping.
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, _length) = self.tables[b"cmap"]
        fp = self.fp
        fp.seek(base_offset)
        (_version, nsubtables) = cast(tuple[int, int], struct.unpack(">HH", fp.read(4)))
        subtables: list[tuple[int, int, int]] = []
        for _i in range(nsubtables):
            subtables.append(
                cast(tuple[int, int, int], struct.unpack(">HHL", fp.read(8))),
            )
        char2gid: dict[int, int] = {}
        # Only supports subtable type 0, 2 and 4.
        for platform_id, encoding_id, st_offset in subtables:
            # Skip non-Unicode cmaps.
            # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
            if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
                continue
            fp.seek(base_offset + st_offset)
            (fmttype, _fmtlen, _fmtlang) = cast(
                tuple[int, int, int],
                struct.unpack(">HHH", fp.read(6)),
            )
            if fmttype == 0:
                char2gid.update(
                    enumerate(
                        cast(tuple[int, ...], struct.unpack(">256B", fp.read(256))),
                    ),
                )
            elif fmttype == 2:
                subheaderkeys = cast(
                    tuple[int, ...],
                    struct.unpack(">256H", fp.read(512)),
                )
                # Map each sub-header index back to the high byte selecting it.
                firstbytes = [0] * 8192
                for i, k in enumerate(subheaderkeys):
                    firstbytes[k // 8] = i
                nhdrs = max(subheaderkeys) // 8 + 1
                hdrs: list[tuple[int, int, int, int, int]] = []
                for i in range(nhdrs):
                    (firstcode, entcount, delta, offset) = cast(
                        tuple[int, int, int, int],
                        struct.unpack(">HHhH", fp.read(8)),
                    )
                    # idRangeOffset is relative to its own position, i.e. the
                    # last two bytes of the sub-header just read.
                    hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
                for i, firstcode, entcount, delta, pos in hdrs:
                    if not entcount:
                        continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        gid = cast(tuple[int], struct.unpack(">H", fp.read(2)))[0]
                        if gid:
                            # idDelta arithmetic is modulo 65536.
                            gid = (gid + delta) & 0xFFFF
                        char2gid[first + c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = cast(
                    tuple[int, int, int, int],
                    struct.unpack(">HHHH", fp.read(8)),
                )
                segcount //= 2
                ecs = cast(
                    tuple[int, ...],
                    struct.unpack(f">{segcount}H", fp.read(2 * segcount)),
                )
                fp.read(2)  # reservedPad
                scs = cast(
                    tuple[int, ...],
                    struct.unpack(f">{segcount}H", fp.read(2 * segcount)),
                )
                idds = cast(
                    tuple[int, ...],
                    struct.unpack(f">{segcount}h", fp.read(2 * segcount)),
                )
                pos = fp.tell()
                idrs = cast(
                    tuple[int, ...],
                    struct.unpack(f">{segcount}H", fp.read(2 * segcount)),
                )
                for seg, (ec, sc, idd, idr) in enumerate(
                    zip(ecs, scs, idds, idrs, strict=False)
                ):
                    if idr:
                        # idRangeOffset[seg] is an offset from the location of
                        # that very entry, not from the start of the array.
                        fp.seek(pos + 2 * seg + idr)
                        for c in range(sc, ec + 1):
                            b = cast(tuple[int], struct.unpack(">H", fp.read(2)))[0]
                            # 0 means "missing glyph"; idDelta applies only
                            # to real glyph ids.
                            char2gid[c] = (b + idd) & 0xFFFF if b else 0
                    else:
                        for c in range(sc, ec + 1):
                            char2gid[c] = (c + idd) & 0xFFFF
            else:
                log.warning(f"Skipping unsupported cmap subtable format {fmttype}")
                continue
        if not char2gid:
            raise TrueTypeFont.CMapNotFound
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map
class PDFFontError(PDFException):
    """Base class for the font-related errors raised in this module."""
class PDFUnicodeNotDefined(PDFFontError):
    """Raised by ``to_unichr`` when a character id has no unicode mapping."""
# Encoding literal used by PDFSimpleFont as the default when a font does not
# specify one (TrueType fonts default to WinAnsiEncoding instead).
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
# Literal for the "Type1C" name; not referenced elsewhere in this module.
LITERAL_TYPE1C = LIT("Type1C")

# Font widths are maintained in a dict type that maps from *either* unicode
# chars or integer character IDs.
FontWidthDict = dict[int | str, float]
class PDFFont:
    """Common metrics and width lookup shared by all font classes here.

    Subclasses must implement :meth:`to_unichr`.
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: float | None = None,
    ) -> None:
        """Read the metrics from *descriptor* and store the width table.

        :param descriptor: font descriptor dictionary.
        :param widths: width table keyed by character id or unicode string.
        :param default_width: width for characters missing from *widths*;
            when ``None`` the descriptor's ``MissingWidth`` (default 0) is used.
        """
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)
        fontname = resolve1(descriptor.get("FontName", "unknown"))
        self.fontname = (
            literal_name(fontname) if isinstance(fontname, PSLiteral) else fontname
        )
        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))
        fallback_width = (
            num_value(descriptor.get("MissingWidth", 0))
            if default_width is None
            else default_width
        )
        self.default_width = resolve1(fallback_width)
        self.leading = num_value(descriptor.get("Leading", 0))
        self.bbox = self._parse_bbox(descriptor)
        # Glyph-space to text-space scale factors.
        self.hscale = 0.001
        self.vscale = 0.001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        return False

    def is_multibyte(self) -> bool:
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split *bytes* into character ids, one per byte."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.vscale * self.ascent

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.vscale * self.descent

    def get_width(self) -> float:
        """Bounding-box width scaled by ``hscale``.

        When the bounding box has zero width, the negated default width is
        used instead.
        """
        left, right = self.bbox[0], self.bbox[2]
        extent = right - left
        return (extent if extent != 0 else -self.default_width) * self.hscale

    def get_height(self) -> float:
        """Bounding-box height scaled by ``vscale``.

        When the bounding box has zero height, ``ascent - descent`` is used.
        """
        bottom, top = self.bbox[1], self.bbox[3]
        extent = top - bottom
        return (extent if extent != 0 else self.ascent - self.descent) * self.vscale

    def char_width(self, cid: int) -> float:
        """Return the scaled width of character *cid*.

        Because the width table may be keyed by ids or by strings, the id is
        tried first, then its unicode string; the default width is the last
        resort (also when the id has no unicode equivalent).
        """
        width = safe_float(self.widths.get(cid))
        if width is None:
            try:
                width = safe_float(self.widths.get(self.to_unichr(cid)))
            except PDFUnicodeNotDefined:
                width = None
        if width is None:
            width = self.default_width
        return width * self.hscale

    def char_disp(self, cid: int) -> float | tuple[float | None, float]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Sum of :meth:`char_width` over the character ids decoded from *s*."""
        return sum(map(self.char_width, self.decode(s)))

    def to_unichr(self, cid: int) -> str:
        raise NotImplementedError

    @staticmethod
    def _parse_bbox(descriptor: Mapping[str, Any]) -> Rect:
        """Parse FontBBox from the fonts descriptor"""
        raw_bbox = resolve_all(descriptor.get("FontBBox"))
        parsed = safe_rect_list(raw_bbox)
        if parsed is not None:
            return parsed
        log.warning(
            f"Could not get FontBBox from font descriptor because "
            f"{raw_bbox!r} cannot be parsed as 4 floats"
        )
        return 0.0, 0.0, 0.0, 0.0
class PDFSimpleFont(PDFFont):
    """Single-byte font whose codes map to unicode via an encoding table."""

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        spec: Mapping[str, Any],
    ) -> None:
        """Set up ``cid2unicode`` and the optional ``unicode_map`` from *spec*.

        The font encoding is given either as the name of a built-in encoding
        or as a dictionary describing differences from a base encoding.
        """
        # PDF spec: TrueType fonts without Encoding default to WinAnsiEncoding
        is_truetype = literal_name(spec.get("Subtype")) == "TrueType"
        default_encoding = (
            LIT("WinAnsiEncoding") if is_truetype else LITERAL_STANDARD_ENCODING
        )

        encoding = resolve1(spec["Encoding"]) if "Encoding" in spec else default_encoding

        if not isinstance(encoding, dict):
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
        else:
            base_name = literal_name(encoding.get("BaseEncoding", default_encoding))
            differences = list_value(encoding.get("Differences", []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)

        self.unicode_map: UnicodeMap | None = None
        if "ToUnicode" in spec:
            to_unicode_stream = stream_value(spec["ToUnicode"])
            self.unicode_map = FileUnicodeMap()
            source = BytesIO(to_unicode_stream.get_data())
            CMapParser(self.unicode_map, source).run()
        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid: int) -> str:
        """Map *cid* to unicode, preferring ``unicode_map`` over the encoding.

        :raises PDFUnicodeNotDefined: if neither table knows *cid*.
        """
        if self.unicode_map:
            with contextlib.suppress(KeyError):
                return self.unicode_map.get_unichr(cid)
        try:
            return self.cid2unicode[cid]
        except KeyError as err:
            raise PDFUnicodeNotDefined(None, cid) from err
class PDFType1Font(PDFSimpleFont):
    """Type 1 font, using built-in metrics when the base font is known."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Initialise from the font dictionary *spec*.

        Metrics come from ``FontMetricsDB`` when it has an entry for the base
        font; otherwise from ``FontDescriptor``/``FirstChar``/``Widths``.

        :raises PDFFontError: in strict mode when ``BaseFont`` is missing.
        """
        try:
            basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing") from None
            basefont = "unknown"
        self.basefont = basefont

        widths: FontWidthDict
        try:
            metrics = FontMetricsDB.get_metrics(self.basefont)
        except KeyError:
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            firstchar = int_value(spec.get("FirstChar", 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            width_list = list_value(spec.get("Widths", [0] * 256))
            widths = {
                code: resolve1(width)
                for code, width in enumerate(width_list, start=firstchar)
            }
        else:
            (descriptor, int_widths) = metrics
            widths = cast(dict[str | int, float], int_widths)  # implicit int->float
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        needs_recovery = "Encoding" not in spec and "FontFile" in descriptor
        if needs_recovery:
            # try to recover the missing encoding info from the font file.
            self.fontfile = stream_value(descriptor.get("FontFile"))
            length1 = int_value(self.fontfile["Length1"])
            header = self.fontfile.get_data()[:length1]
            self.cid2unicode = Type1FontHeaderParser(BytesIO(header)).get_encoding()

    def __repr__(self) -> str:
        return f"<PDFType1Font: basefont={self.basefont!r}>"
class PDFTrueTypeFont(PDFType1Font):
    """TrueType simple font; initialised exactly like ``PDFType1Font``."""

    def __repr__(self) -> str:
        return f"<PDFTrueTypeFont: basefont={self.basefont!r}>"
class PDFType3Font(PDFSimpleFont):
    """Type 3 font: metrics come from the font dictionary and its FontMatrix."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Initialise from the font dictionary *spec*.

        Without a ``FontDescriptor`` a minimal descriptor is synthesised from
        the font's own ``FontBBox``. Ascent and descent are taken from the
        bounding box, and the scale factors from ``FontMatrix``.
        """
        firstchar = int_value(spec.get("FirstChar", 0))
        # lastchar = int_value(spec.get('LastChar', 0))
        width_list = list_value(spec.get("Widths", [0] * 256))
        widths: dict[str | int, float] = dict(enumerate(width_list, start=firstchar))
        descriptor = (
            dict_value(spec["FontDescriptor"])
            if "FontDescriptor" in spec
            else {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
        )
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix"))))
        (_x0, bottom, _x1, top) = self.bbox
        self.descent = bottom
        self.ascent = top
        scale = apply_matrix_norm(self.matrix, (1, 1))
        (self.hscale, self.vscale) = scale

    def __repr__(self) -> str:
        return "<PDFType3Font>"
class PDFCIDFont(PDFFont):
    """Multi-byte (CID-keyed) font."""

    # Displacement used for cids without an explicit entry in ``disps``:
    # 0 for horizontal writing, ``(None, vy)`` for vertical writing.
    default_disp: float | tuple[float | None, float]

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        spec: Mapping[str, Any],
        strict: bool = settings.STRICT,
    ) -> None:
        """Initialise from the font dictionary *spec*.

        :param rsrcmgr: resource manager (not used by this constructor).
        :param spec: font dictionary.
        :param strict: when true, missing ``BaseFont``/``FontDescriptor`` or
            an unknown cmap raise :class:`PDFFontError` instead of falling
            back to defaults.
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if strict:
                raise PDFFontError("BaseFont is missing") from None
            self.basefont = "unknown"
        self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
        cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode(
            "latin1",
        )
        cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode(
            "latin1",
        )
        self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
        self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec["FontDescriptor"])
        except KeyError:
            if strict:
                raise PDFFontError("FontDescriptor is missing") from None
            descriptor = {}
        ttf = None
        if "FontFile2" in descriptor:
            self.fontfile = stream_value(descriptor.get("FontFile2"))
            ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))
        self.unicode_map: UnicodeMap | None = None
        if "ToUnicode" in spec:
            if isinstance(spec["ToUnicode"], PDFStream):
                strm = stream_value(spec["ToUnicode"])
                self.unicode_map = FileUnicodeMap()
                CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
            else:
                cmap_name = literal_name(spec["ToUnicode"])
                # /Encoding may be absent; that must not surface as a bare
                # KeyError here, so fall back to an empty name.
                encoding = literal_name(spec["Encoding"]) if "Encoding" in spec else ""
                if (
                    "Identity" in cid_ordering
                    or "Identity" in cmap_name
                    or "Identity" in encoding
                ):
                    self.unicode_map = IdentityUnicodeMap()
        elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
            if ttf:
                with contextlib.suppress(TrueTypeFont.CMapNotFound):
                    self.unicode_map = ttf.create_unicode_map()
        else:
            with contextlib.suppress(CMapDB.CMapNotFound):
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding,
                    self.cmap.is_vertical(),
                )

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            widths2 = get_widths2(list_value(spec.get("W2", [])))
            self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()}
            (vy, w) = resolve1(spec.get("DW2", [880, -1000]))
            self.default_disp = (None, vy)
            widths: dict[str | int, float] = {
                cid: w for (cid, (w, _)) in widths2.items()
            }
            default_width = w
        else:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get("W", [])))
            default_width = spec.get("DW", 1000)
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.

        :raises PDFFontError: in strict mode when the cmap cannot be found;
            otherwise an empty ``CMap`` is returned.
        """
        cmap_name = self._get_cmap_name(spec, strict)

        try:
            return CMapDB.get_cmap(cmap_name)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e) from e
            return CMap()

    @staticmethod
    def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
        """Get cmap name from font specification

        Returns ``"unknown"`` when no name can be determined in non-strict
        mode. Names listed in ``IDENTITY_ENCODER`` are translated.
        """
        cmap_name = "unknown"  # default value

        try:
            spec_encoding = spec["Encoding"]
            if hasattr(spec_encoding, "name"):
                cmap_name = literal_name(spec["Encoding"])
            else:
                cmap_name = literal_name(spec_encoding["CMapName"])
        except KeyError:
            if strict:
                raise PDFFontError("Encoding is unspecified") from None

        if isinstance(cmap_name, PDFStream):
            cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
            if "CMapName" in cmap_name_stream:
                cmap_name = cmap_name_stream.get("CMapName").name
            elif strict:
                raise PDFFontError("CMapName unspecified for encoding")

        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

    def __repr__(self) -> str:
        return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"

    def is_vertical(self) -> bool:
        return self.vertical

    def is_multibyte(self) -> bool:
        return True

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split *bytes* into character ids using the font's cmap."""
        return self.cmap.decode(bytes)

    def char_disp(self, cid: int) -> float | tuple[float | None, float]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid: int) -> str:
        """Map *cid* to unicode through ``unicode_map``.

        :raises PDFUnicodeNotDefined: if there is no unicode map or it has
            no entry for *cid*.
        """
        try:
            if not self.unicode_map:
                raise PDFKeyError(cid)
            return self.unicode_map.get_unichr(cid)
        except KeyError as err:
            raise PDFUnicodeNotDefined(self.cidcoding, cid) from err