Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdffont.py: 59%

1import logging

2import struct

3from io import BytesIO

4from typing import (

5 TYPE_CHECKING,

6 Any,

7 BinaryIO,

8 Dict,

9 Iterable,

10 Iterator,

11 List,

12 Mapping,

13 Optional,

14 Tuple,

15 Union,

16 cast,

17)

19from pdfminer import settings

20from pdfminer.cmapdb import (

21 CMap,

22 CMapBase,

23 CMapDB,

24 CMapParser,

25 FileUnicodeMap,

26 IdentityUnicodeMap,

27 UnicodeMap,

28)

29from pdfminer.encodingdb import EncodingDB, name2unicode

30from pdfminer.fontmetrics import FONT_METRICS

31from pdfminer.pdfexceptions import PDFException, PDFKeyError, PDFValueError

32from pdfminer.pdftypes import (

33 PDFStream,

34 dict_value,

35 int_value,

36 list_value,

37 num_value,

38 resolve1,

39 resolve_all,

40 stream_value,

41)

42from pdfminer.psexceptions import PSEOF

43from pdfminer.psparser import (

44 KWD,

45 LIT,

46 PSKeyword,

47 PSLiteral,

48 PSStackParser,

49 literal_name,

50)

51from pdfminer.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack

53if TYPE_CHECKING:

54 from pdfminer.pdfinterp import PDFResourceManager

56log = logging.getLogger(__name__)

def get_widths(seq: Iterable[object]) -> Dict[int, float]:
    """Build a mapping of character widths for horizontal writing.

    The sequence is consumed left to right. Two shapes are recognised:

    * ``first [w0 w1 ...]`` -- consecutive codes starting at ``first`` get
      the listed widths (``first`` is the most recent pending number).
    * ``first last w`` -- every code in ``first..last`` (inclusive) gets ``w``.

    Items that are neither lists nor numbers are ignored, and a list that is
    not preceded by a number contributes nothing.

    :param seq: flat sequence of numbers and lists of widths
    :returns: mapping of character code to width
    """
    result: Dict[int, float] = {}
    pending: List[float] = []
    for item in seq:
        if isinstance(item, list):
            if not pending:
                continue
            # The last number seen is the starting code for this run.
            start = cast(int, pending[-1])
            result.update(
                (start + offset, width) for offset, width in enumerate(item)
            )
            pending = []
        elif isinstance(item, (int, float)):  # == utils.isnumber(item)
            pending.append(item)
            if len(pending) != 3:
                continue
            first, last, width = pending
            codes = range(cast(int, first), cast(int, last) + 1)
            result.update(dict.fromkeys(codes, width))
            pending = []
    return result

def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    The sequence is consumed left to right. Two shapes are recognised:

    * ``first [w vx vy  w vx vy ...]`` -- the list is chopped into triples
      and assigned to consecutive codes starting at ``first`` (the most
      recent pending number).
    * ``first last w vx vy`` -- every code in ``first..last`` (inclusive)
      gets the same ``(w, (vx, vy))`` entry.

    Items that are neither lists nor numbers are ignored.

    :param seq: flat sequence of numbers and lists of metrics
    :returns: mapping of character code to ``(w, (vx, vy))``
    """
    result: Dict[int, Tuple[float, Point]] = {}
    pending: List[float] = []
    for item in seq:
        if isinstance(item, list):
            if not pending:
                continue
            # The last number seen is the starting code for this run.
            start = cast(int, pending[-1])
            for offset, (w, vx, vy) in enumerate(choplist(3, item)):
                result[start + offset] = (w, (vx, vy))
            pending = []
        elif isinstance(item, (int, float)):  # == utils.isnumber(item)
            pending.append(item)
            if len(pending) != 5:
                continue
            first, last, w, vx, vy = pending
            entry = (w, (vx, vy))
            for code in range(cast(int, first), cast(int, last) + 1):
                result[code] = entry
            pending = []
    return result

100

class FontMetricsDB:
    """Lookup facade over the ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname: str) -> Tuple[Dict[str, object], Dict[str, int]]:
        """Return the ``(descriptor, widths)`` entry stored for *fontname*.

        :raises KeyError: if *fontname* has no entry in ``FONT_METRICS``
        """
        metrics = FONT_METRICS[fontname]
        return metrics

105

106

# int here means that we're not extending PSStackParser with additional types.
class Type1FontHeaderParser(PSStackParser[int]):
    """Extract the encoding from the header of a Type 1 font program.

    Every ``<int> <literal name> put`` sequence found in the data is turned
    into a ``(code, name)`` result; :meth:`get_encoding` then translates the
    names to unicode.
    """

    KEYWORD_BEGIN = KWD(b"begin")
    KEYWORD_END = KWD(b"end")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_PUT = KWD(b"put")
    KEYWORD_DICT = KWD(b"dict")
    KEYWORD_ARRAY = KWD(b"array")
    KEYWORD_READONLY = KWD(b"readonly")
    KEYWORD_FOR = KWD(b"for")

    def __init__(self, data: BinaryIO) -> None:
        PSStackParser.__init__(self, data)
        self._cid2unicode: Dict[int, str] = {}

    def get_encoding(self) -> Dict[int, str]:
        """Parse the font encoding.

        The Type1 font encoding maps character codes to character names. These
        character names could either be standard Adobe glyph names, or
        character names associated with custom CharStrings for this font. A
        CharString is a sequence of operations that describe how the character
        should be drawn. Names that ``name2unicode`` cannot translate (it
        raises ``KeyError``) are logged at debug level and left out of the
        mapping.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode characters
        """
        while True:
            try:
                (cid, name) = self.nextobject()
            except PSEOF:
                break
            try:
                self._cid2unicode[cid] = name2unicode(cast(str, name))
            except KeyError as e:
                # Lazy %-formatting: the message is only built when debug
                # logging is enabled.
                log.debug("%s", e)
        return self._cid2unicode

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Record ``<int> <name> put`` as a ``(code, name)`` result.

        All other keywords are ignored.
        """
        if token is not self.KEYWORD_PUT:
            return
        operands = self.pop(2)
        if len(operands) != 2:
            # Malformed font program: ``put`` without two operands on the
            # stack. Skip it instead of failing on the unpacking below.
            return
        ((_, key), (_, value)) = operands
        if isinstance(key, int) and isinstance(value, PSLiteral):
            self.add_results((key, literal_name(value)))

152

153

# Nibble values used by the real-number operand of a CFF DICT (see
# _read_dict_operand). Index 13 is reserved; nibble 15 terminates the number
# and therefore has no entry.
NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-")

# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
    "DLIdent-H": "Identity-H",
    "DLIdent-V": "Identity-V",
}


def _read_exact(fp: BinaryIO, size: int) -> bytes:
    """Read exactly *size* bytes from *fp* or raise ``EOFError``."""
    chunk = fp.read(size)
    if len(chunk) != size:
        raise EOFError("truncated CFF DICT operand")
    return chunk


def _read_dict_operand(fp: BinaryIO, b0: int) -> Union[float, int]:
    """Decode one DICT operand whose first byte *b0* was already consumed.

    :raises EOFError: if the data ends in the middle of the operand
    :raises ValueError: if a real number is malformed
    """
    if b0 == 30:
        # Real number: a run of 4-bit nibbles terminated by nibble 0xf.
        digits: List[str] = []
        more = True
        while more:
            b = _read_exact(fp, 1)[0]
            for n in (b >> 4, b & 15):
                if n == 15:
                    more = False
                    continue
                nibble = NIBBLES[n]
                if nibble is None:
                    raise ValueError("reserved nibble in CFF real number")
                digits.append(nibble)
        return float("".join(digits))
    if 32 <= b0 <= 246:
        # Single-byte integer.
        return b0 - 139
    b1 = _read_exact(fp, 1)[0]
    if 247 <= b0 <= 250:
        # Two-byte positive integer.
        return ((b0 - 247) << 8) + b1 + 108
    if 251 <= b0 <= 254:
        # Two-byte negative integer.
        return -((b0 - 251) << 8) - b1 - 108
    b2 = _read_exact(fp, 1)[0]
    if b1 >= 128:
        # The leading payload byte carries the sign.
        b1 -= 256
    if b0 == 28:
        # 16-bit signed integer.
        return b1 << 8 | b2
    # 32-bit signed integer (b0 == 29; other lead bytes are decoded the same
    # way, as before).
    return b1 << 24 | b2 << 16 | struct.unpack(">H", _read_exact(fp, 2))[0]


def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]:
    """Decode CFF DICT data into a mapping of operator to operand list.

    Operands are collected until an operator byte (value <= 21) is seen; the
    operator then receives every operand gathered since the previous
    operator.

    If *data* ends in the middle of an operand, decoding stops and the
    operators decoded so far are returned (previously this surfaced as a
    ``TypeError`` or ``struct.error``).

    :param data: raw DICT bytes
    :returns: mapping of operator byte to its operands
    :raises ValueError: if a real-number operand is malformed
    """
    d: Dict[int, List[Union[float, int]]] = {}
    fp = BytesIO(data)
    stack: List[Union[float, int]] = []
    while True:
        c = fp.read(1)
        if not c:
            break
        b0 = c[0]
        if b0 <= 21:
            d[b0] = stack
            stack = []
            continue
        try:
            stack.append(_read_dict_operand(fp, b0))
        except EOFError:
            # Truncated data: keep what was decoded so far.
            break
    return d

208

209

class CFFFont:
    """Reader for the tables of a CFF (Compact Font Format) font program.

    After construction the instance exposes the parsed INDEX structures, the
    Top DICT (``top_dict``) and the code/glyph/name lookup tables
    ``code2gid``, ``gid2code``, ``name2gid`` and ``gid2name``.
    """

    # Predefined strings; string identifiers (SIDs) below
    # len(STANDARD_STRINGS) refer to this table (see getstr).
    STANDARD_STRINGS = (
        ".notdef",
        "space",
        "exclam",
        "quotedbl",
        "numbersign",
        "dollar",
        "percent",
        "ampersand",
        "quoteright",
        "parenleft",
        "parenright",
        "asterisk",
        "plus",
        "comma",
        "hyphen",
        "period",
        "slash",
        "zero",
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "colon",
        "semicolon",
        "less",
        "equal",
        "greater",
        "question",
        "at",
        "A",
        "B",
        "C",
        "D",
        "E",
        "F",
        "G",
        "H",
        "I",
        "J",
        "K",
        "L",
        "M",
        "N",
        "O",
        "P",
        "Q",
        "R",
        "S",
        "T",
        "U",
        "V",
        "W",
        "X",
        "Y",
        "Z",
        "bracketleft",
        "backslash",
        "bracketright",
        "asciicircum",
        "underscore",
        "quoteleft",
        "a",
        "b",
        "c",
        "d",
        "e",
        "f",
        "g",
        "h",
        "i",
        "j",
        "k",
        "l",
        "m",
        "n",
        "o",
        "p",
        "q",
        "r",
        "s",
        "t",
        "u",
        "v",
        "w",
        "x",
        "y",
        "z",
        "braceleft",
        "bar",
        "braceright",
        "asciitilde",
        "exclamdown",
        "cent",
        "sterling",
        "fraction",
        "yen",
        "florin",
        "section",
        "currency",
        "quotesingle",
        "quotedblleft",
        "guillemotleft",
        "guilsinglleft",
        "guilsinglright",
        "fi",
        "fl",
        "endash",
        "dagger",
        "daggerdbl",
        "periodcentered",
        "paragraph",
        "bullet",
        "quotesinglbase",
        "quotedblbase",
        "quotedblright",
        "guillemotright",
        "ellipsis",
        "perthousand",
        "questiondown",
        "grave",
        "acute",
        "circumflex",
        "tilde",
        "macron",
        "breve",
        "dotaccent",
        "dieresis",
        "ring",
        "cedilla",
        "hungarumlaut",
        "ogonek",
        "caron",
        "emdash",
        "AE",
        "ordfeminine",
        "Lslash",
        "Oslash",
        "OE",
        "ordmasculine",
        "ae",
        "dotlessi",
        "lslash",
        "oslash",
        "oe",
        "germandbls",
        "onesuperior",
        "logicalnot",
        "mu",
        "trademark",
        "Eth",
        "onehalf",
        "plusminus",
        "Thorn",
        "onequarter",
        "divide",
        "brokenbar",
        "degree",
        "thorn",
        "threequarters",
        "twosuperior",
        "registered",
        "minus",
        "eth",
        "multiply",
        "threesuperior",
        "copyright",
        "Aacute",
        "Acircumflex",
        "Adieresis",
        "Agrave",
        "Aring",
        "Atilde",
        "Ccedilla",
        "Eacute",
        "Ecircumflex",
        "Edieresis",
        "Egrave",
        "Iacute",
        "Icircumflex",
        "Idieresis",
        "Igrave",
        "Ntilde",
        "Oacute",
        "Ocircumflex",
        "Odieresis",
        "Ograve",
        "Otilde",
        "Scaron",
        "Uacute",
        "Ucircumflex",
        "Udieresis",
        "Ugrave",
        "Yacute",
        "Ydieresis",
        "Zcaron",
        "aacute",
        "acircumflex",
        "adieresis",
        "agrave",
        "aring",
        "atilde",
        "ccedilla",
        "eacute",
        "ecircumflex",
        "edieresis",
        "egrave",
        "iacute",
        "icircumflex",
        "idieresis",
        "igrave",
        "ntilde",
        "oacute",
        "ocircumflex",
        "odieresis",
        "ograve",
        "otilde",
        "scaron",
        "uacute",
        "ucircumflex",
        "udieresis",
        "ugrave",
        "yacute",
        "ydieresis",
        "zcaron",
        "exclamsmall",
        "Hungarumlautsmall",
        "dollaroldstyle",
        "dollarsuperior",
        "ampersandsmall",
        "Acutesmall",
        "parenleftsuperior",
        "parenrightsuperior",
        "twodotenleader",
        "onedotenleader",
        "zerooldstyle",
        "oneoldstyle",
        "twooldstyle",
        "threeoldstyle",
        "fouroldstyle",
        "fiveoldstyle",
        "sixoldstyle",
        "sevenoldstyle",
        "eightoldstyle",
        "nineoldstyle",
        "commasuperior",
        "threequartersemdash",
        "periodsuperior",
        "questionsmall",
        "asuperior",
        "bsuperior",
        "centsuperior",
        "dsuperior",
        "esuperior",
        "isuperior",
        "lsuperior",
        "msuperior",
        "nsuperior",
        "osuperior",
        "rsuperior",
        "ssuperior",
        "tsuperior",
        "ff",
        "ffi",
        "ffl",
        "parenleftinferior",
        "parenrightinferior",
        "Circumflexsmall",
        "hyphensuperior",
        "Gravesmall",
        "Asmall",
        "Bsmall",
        "Csmall",
        "Dsmall",
        "Esmall",
        "Fsmall",
        "Gsmall",
        "Hsmall",
        "Ismall",
        "Jsmall",
        "Ksmall",
        "Lsmall",
        "Msmall",
        "Nsmall",
        "Osmall",
        "Psmall",
        "Qsmall",
        "Rsmall",
        "Ssmall",
        "Tsmall",
        "Usmall",
        "Vsmall",
        "Wsmall",
        "Xsmall",
        "Ysmall",
        "Zsmall",
        "colonmonetary",
        "onefitted",
        "rupiah",
        "Tildesmall",
        "exclamdownsmall",
        "centoldstyle",
        "Lslashsmall",
        "Scaronsmall",
        "Zcaronsmall",
        "Dieresissmall",
        "Brevesmall",
        "Caronsmall",
        "Dotaccentsmall",
        "Macronsmall",
        "figuredash",
        "hypheninferior",
        "Ogoneksmall",
        "Ringsmall",
        "Cedillasmall",
        "questiondownsmall",
        "oneeighth",
        "threeeighths",
        "fiveeighths",
        "seveneighths",
        "onethird",
        "twothirds",
        "zerosuperior",
        "foursuperior",
        "fivesuperior",
        "sixsuperior",
        "sevensuperior",
        "eightsuperior",
        "ninesuperior",
        "zeroinferior",
        "oneinferior",
        "twoinferior",
        "threeinferior",
        "fourinferior",
        "fiveinferior",
        "sixinferior",
        "seveninferior",
        "eightinferior",
        "nineinferior",
        "centinferior",
        "dollarinferior",
        "periodinferior",
        "commainferior",
        "Agravesmall",
        "Aacutesmall",
        "Acircumflexsmall",
        "Atildesmall",
        "Adieresissmall",
        "Aringsmall",
        "AEsmall",
        "Ccedillasmall",
        "Egravesmall",
        "Eacutesmall",
        "Ecircumflexsmall",
        "Edieresissmall",
        "Igravesmall",
        "Iacutesmall",
        "Icircumflexsmall",
        "Idieresissmall",
        "Ethsmall",
        "Ntildesmall",
        "Ogravesmall",
        "Oacutesmall",
        "Ocircumflexsmall",
        "Otildesmall",
        "Odieresissmall",
        "OEsmall",
        "Oslashsmall",
        "Ugravesmall",
        "Uacutesmall",
        "Ucircumflexsmall",
        "Udieresissmall",
        "Yacutesmall",
        "Thornsmall",
        "Ydieresissmall",
        "001.000",
        "001.001",
        "001.002",
        "001.003",
        "Black",
        "Bold",
        "Book",
        "Light",
        "Medium",
        "Regular",
        "Roman",
        "Semibold",
    )

    class INDEX:
        """Lazy view over one CFF INDEX structure.

        Construction reads the count and offset array from ``fp`` and leaves
        the file positioned just past the INDEX data; items are read from
        ``fp`` on demand.
        """

        def __init__(self, fp: BinaryIO) -> None:
            self.fp = fp
            self.offsets: List[int] = []
            (count,) = struct.unpack(">H", self.fp.read(2))
            if count == 0:
                # An empty INDEX consists of the 2-byte count only: there is
                # no offSize byte and no offset array, so nothing more may be
                # consumed. A single offset keeps len() == 0.
                self.offsets.append(1)
                self.base = self.fp.tell() - 1
                return
            (offsize,) = struct.unpack("B", self.fp.read(1))
            for _ in range(count + 1):
                self.offsets.append(nunpack(self.fp.read(offsize)))
            # Offsets are relative to the byte preceding the object data.
            self.base = self.fp.tell() - 1
            # Skip the object data so the next structure can be read.
            self.fp.seek(self.base + self.offsets[-1])

        def __repr__(self) -> str:
            return "<INDEX: size=%d>" % len(self)

        def __len__(self) -> int:
            return len(self.offsets) - 1

        def __getitem__(self, i: int) -> bytes:
            self.fp.seek(self.base + self.offsets[i])
            return self.fp.read(self.offsets[i + 1] - self.offsets[i])

        def __iter__(self) -> Iterator[bytes]:
            return iter(self[i] for i in range(len(self)))

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Parse the CFF data available from *fp*.

        :param name: font name, stored as ``self.name``
        :param fp: seekable binary stream positioned at the CFF header
        :raises PDFValueError: if the encoding or charset format is not
            supported
        """
        self.name = name
        self.fp = fp
        # Header
        (_major, _minor, hdrsize, offsize) = struct.unpack("BBBB", self.fp.read(4))
        # Skip the rest of the header. Clamp at zero: a negative size would
        # make read() consume the whole stream.
        self.fp.read(max(hdrsize - 4, 0))
        # Name INDEX
        self.name_index = self.INDEX(self.fp)
        # Top DICT INDEX
        self.dict_index = self.INDEX(self.fp)
        # String INDEX
        self.string_index = self.INDEX(self.fp)
        # Global Subr INDEX
        self.subr_index = self.INDEX(self.fp)
        # Top DICT DATA
        self.top_dict = getdict(self.dict_index[0])
        (charset_pos,) = self.top_dict.get(15, [0])
        (encoding_pos,) = self.top_dict.get(16, [0])
        (charstring_pos,) = self.top_dict.get(17, [0])
        # CharStrings
        self.fp.seek(cast(int, charstring_pos))
        self.charstring = self.INDEX(self.fp)
        self.nglyphs = len(self.charstring)
        # Encodings
        self.code2gid = {}
        self.gid2code = {}
        self.fp.seek(cast(int, encoding_pos))
        fmt = self.fp.read(1)
        if fmt == b"\x00":
            # Format 0
            (n,) = struct.unpack("B", self.fp.read(1))
            for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))):
                self.code2gid[code] = gid
                self.gid2code[gid] = code
        elif fmt == b"\x01":
            # Format 1
            (n,) = struct.unpack("B", self.fp.read(1))
            code = 0
            for _ in range(n):
                (first, nleft) = struct.unpack("BB", self.fp.read(2))
                for gid in range(first, first + nleft + 1):
                    self.code2gid[code] = gid
                    self.gid2code[gid] = code
                    code += 1
        else:
            raise PDFValueError("unsupported encoding format: %r" % fmt)
        # Charsets
        self.name2gid = {}
        self.gid2name = {}
        self.fp.seek(cast(int, charset_pos))
        fmt = self.fp.read(1)
        if fmt == b"\x00":
            # Format 0: one SID per glyph; glyph 0 is not listed.
            # Clamp at zero so a font without glyphs cannot produce a
            # negative read size.
            n = max(self.nglyphs - 1, 0)
            sids = cast(
                Tuple[int, ...],
                struct.unpack(">" + "H" * n, self.fp.read(2 * n)),
            )
            for gid, sid in enumerate(sids, start=1):
                sidname = self.getstr(sid)
                self.name2gid[sidname] = gid
                self.gid2name[gid] = sidname
        elif fmt == b"\x01":
            # Format 1
            # NOTE(review): the CFF specification describes charset format 1
            # as (first SID: 2 bytes, nLeft: 1 byte) ranges without a leading
            # count byte. The layout read here differs -- confirm against
            # real fonts before relying on these tables.
            (n,) = struct.unpack("B", self.fp.read(1))
            sid = 0
            for _ in range(n):
                (first, nleft) = struct.unpack("BB", self.fp.read(2))
                for gid in range(first, first + nleft + 1):
                    sidname = self.getstr(sid)
                    self.name2gid[sidname] = gid
                    self.gid2name[gid] = sidname
                    sid += 1
        else:
            # Format 2 is not implemented and is reported like any other
            # unsupported format; an assert would vanish under ``python -O``.
            raise PDFValueError("unsupported charset format: %r" % fmt)

    def getstr(self, sid: int) -> Union[str, bytes]:
        """Return the string for string identifier *sid*.

        This returns str for one of the STANDARD_STRINGS but bytes otherwise
        (the raw entry of the font's String INDEX), which is a needless
        source of type complexity for callers.
        """
        if sid < len(self.STANDARD_STRINGS):
            return self.STANDARD_STRINGS[sid]
        return self.string_index[sid - len(self.STANDARD_STRINGS)]

714

715

class TrueTypeFont:
    """Minimal TrueType reader used to derive a glyph-to-unicode map.

    The constructor reads the table directory into ``self.tables`` (table
    tag -> ``(offset, length)``); :meth:`create_unicode_map` then decodes the
    ``cmap`` table.
    """

    class CMapNotFound(PDFException):
        pass

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Read the font type tag and the table directory from *fp*.

        :param name: font name, stored as ``self.name``
        :param fp: seekable binary stream positioned at the start of the font
        """
        self.name = name
        self.fp = fp
        self.tables: Dict[bytes, Tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = cast(
                Tuple[int, int, int, int],
                struct.unpack(">HHHH", fp.read(8)),
            )
            for _ in range(ntables):
                (name_bytes, _tsum, offset, length) = cast(
                    Tuple[bytes, int, int, int],
                    struct.unpack(">4sLLL", fp.read(16)),
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a glyph-id -> unicode map from the font's ``cmap`` table.

        Only Unicode subtables (platform 0, or platform 3 with encoding 1 or
        10) in format 0, 2 or 4 are decoded. Subtables in any other format
        are skipped so that a supported subtable of the same font can still
        be used.

        :raises TrueTypeFont.CMapNotFound: if the font has no ``cmap`` table
            or no supported subtable produced a mapping
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, _length) = self.tables[b"cmap"]
        fp = self.fp
        fp.seek(base_offset)
        (_version, nsubtables) = cast(Tuple[int, int], struct.unpack(">HH", fp.read(4)))
        subtables: List[Tuple[int, int, int]] = [
            cast(Tuple[int, int, int], struct.unpack(">HHL", fp.read(8)))
            for _ in range(nsubtables)
        ]
        char2gid: Dict[int, int] = {}
        # Only supports subtable type 0, 2 and 4.
        for platform_id, encoding_id, st_offset in subtables:
            # Skip non-Unicode cmaps.
            # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
            if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
                continue
            fp.seek(base_offset + st_offset)
            (fmttype, _fmtlen, _fmtlang) = cast(
                Tuple[int, int, int],
                struct.unpack(">HHH", fp.read(6)),
            )
            if fmttype == 0:
                # Byte encoding table: 256 one-byte glyph ids.
                char2gid.update(
                    enumerate(
                        cast(Tuple[int, ...], struct.unpack(">256B", fp.read(256))),
                    ),
                )
            elif fmttype == 2:
                # High-byte mapping through sub-headers.
                subheaderkeys = cast(
                    Tuple[int, ...],
                    struct.unpack(">256H", fp.read(512)),
                )
                firstbytes = [0] * 8192
                for i, k in enumerate(subheaderkeys):
                    firstbytes[k // 8] = i
                nhdrs = max(subheaderkeys) // 8 + 1
                hdrs: List[Tuple[int, int, int, int, int]] = []
                for i in range(nhdrs):
                    (firstcode, entcount, delta, offset) = cast(
                        Tuple[int, int, int, int],
                        struct.unpack(">HHhH", fp.read(8)),
                    )
                    hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
                for i, firstcode, entcount, delta, pos in hdrs:
                    if not entcount:
                        continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        gid = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                        if gid:
                            gid += delta
                        char2gid[first + c] = gid
            elif fmttype == 4:
                # Segment mapping to delta values.
                (segcount, _1, _2, _3) = cast(
                    Tuple[int, int, int, int],
                    struct.unpack(">HHHH", fp.read(8)),
                )
                segcount //= 2  # the field stores segCount * 2
                ecs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                fp.read(2)  # reserved padding
                scs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                idds = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dh" % segcount, fp.read(2 * segcount)),
                )
                pos = fp.tell()  # address of idRangeOffset[0]
                idrs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                for seg, (ec, sc, idd, idr) in enumerate(zip(ecs, scs, idds, idrs)):
                    if idr:
                        # idRangeOffset counts from the position of this
                        # segment's own idRangeOffset entry, not from the
                        # start of the idRangeOffset array.
                        fp.seek(pos + 2 * seg + idr)
                        for c in range(sc, ec + 1):
                            b = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                            # A stored index of 0 means "missing glyph";
                            # idDelta only applies to non-zero indices.
                            char2gid[c] = (b + idd) & 0xFFFF if b else 0
                    else:
                        for c in range(sc, ec + 1):
                            char2gid[c] = (c + idd) & 0xFFFF
            else:
                # Unsupported subtable format: skip it rather than abort, so
                # that other subtables of the same font can still be used.
                log.debug("Skipping unsupported cmap subtable format %r", fmttype)
        if not char2gid:
            raise TrueTypeFont.CMapNotFound
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map

840

841

class PDFFontError(PDFException):
    """Base error for problems found while loading or using a PDF font."""

844

845

class PDFUnicodeNotDefined(PDFFontError):
    """Raised by ``to_unichr`` when a character id has no unicode mapping."""

848

849

# Literal names compared against / used as defaults for font dictionaries.
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
LITERAL_TYPE1C = LIT("Type1C")

# Font widths are maintained in a dict type that maps from *either* unicode
# chars or integer character IDs.
FontWidthDict = Union[Dict[int, float], Dict[str, float]]

856

857

class PDFFont:
    """Base class holding the descriptor metrics shared by all font kinds.

    Subclasses must implement :meth:`to_unichr`; the other methods provide
    defaults for a horizontal, single-byte font.
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: Optional[float] = None,
    ) -> None:
        """Read the metrics from *descriptor*.

        :param descriptor: font descriptor dictionary
        :param widths: width table, keyed by character id or unicode string
        :param default_width: width for characters missing from *widths*;
            when ``None`` the descriptor's ``MissingWidth`` (default 0) is
            used
        """
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)

        fontname = resolve1(descriptor.get("FontName", "unknown"))
        self.fontname = (
            literal_name(fontname) if isinstance(fontname, PSLiteral) else fontname
        )

        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))

        fallback_width = (
            num_value(descriptor.get("MissingWidth", 0))
            if default_width is None
            else default_width
        )
        self.default_width = resolve1(fallback_width)

        self.leading = num_value(descriptor.get("Leading", 0))
        raw_bbox = descriptor.get("FontBBox", (0, 0, 0, 0))
        self.bbox = cast(Rect, list_value(resolve_all(raw_bbox)))

        # Scale factors applied to every metric returned by the getters.
        self.hscale = 0.001
        self.vscale = 0.001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        """Return whether the font uses vertical writing mode (never, here)."""
        return False

    def is_multibyte(self) -> bool:
        """Return whether character codes may span several bytes (never, here)."""
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split *bytes* into character ids, one per byte."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.vscale * self.ascent

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.vscale * self.descent

    def get_width(self) -> float:
        """Scaled width of the font bounding box.

        Falls back to the negated default width when the box has no width.
        """
        box_width = self.bbox[2] - self.bbox[0]
        if box_width == 0:
            box_width = -self.default_width
        return box_width * self.hscale

    def get_height(self) -> float:
        """Scaled height of the font bounding box.

        Falls back to ``ascent - descent`` when the box has no height.
        """
        box_height = self.bbox[3] - self.bbox[1]
        if box_height == 0:
            box_height = self.ascent - self.descent
        return box_height * self.vscale

    def char_width(self, cid: int) -> float:
        """Return the scaled width of character *cid*.

        Because character widths may be mapping either IDs or strings, the
        character ID is looked up first, then its str equivalent; the default
        width is used when neither is present.
        """
        try:
            width = cast(Dict[int, float], self.widths)[cid]
        except KeyError:
            try:
                width = cast(Dict[str, float], self.widths)[self.to_unichr(cid)]
            except (KeyError, PDFUnicodeNotDefined):
                width = self.default_width
        return width * self.hscale

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Return the summed scaled width of every character decoded from *s*."""
        return sum(map(self.char_width, self.decode(s)))

    def to_unichr(self, cid: int) -> str:
        """Translate *cid* to unicode; must be provided by subclasses."""
        raise NotImplementedError

946

947

class PDFSimpleFont(PDFFont):
    """Single-byte font whose codes are translated through an encoding table
    and, when the font provides one, a ToUnicode CMap."""

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        spec: Mapping[str, Any],
    ) -> None:
        """Set up ``cid2unicode`` / ``unicode_map`` and the base metrics.

        :param descriptor: font descriptor dictionary
        :param widths: width table handed on to :class:`PDFFont`
        :param spec: font dictionary; ``Encoding`` and ``ToUnicode`` are read
        """
        # Font encoding is specified either by a name of
        # built-in encoding or a dictionary that describes
        # the differences.
        encoding = (
            resolve1(spec["Encoding"])
            if "Encoding" in spec
            else LITERAL_STANDARD_ENCODING
        )
        if isinstance(encoding, dict):
            base_name = literal_name(
                encoding.get("BaseEncoding", LITERAL_STANDARD_ENCODING)
            )
            differences = list_value(encoding.get("Differences", []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)
        else:
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))

        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            tounicode = stream_value(spec["ToUnicode"])
            self.unicode_map = FileUnicodeMap()
            source = BytesIO(tounicode.get_data())
            CMapParser(self.unicode_map, source).run()

        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid: int) -> str:
        """Translate *cid*, preferring the ToUnicode map over the encoding.

        :raises PDFUnicodeNotDefined: if neither table knows *cid*
        """
        unicode_map = self.unicode_map
        if unicode_map:
            try:
                return unicode_map.get_unichr(cid)
            except KeyError:
                # Not in the ToUnicode map: fall back to the encoding table.
                pass
        try:
            return self.cid2unicode[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)

985

986

class PDFType1Font(PDFSimpleFont):
    """Type 1 font; metrics come from ``FontMetricsDB`` when the base font is
    listed there, otherwise from the font dictionary itself."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from its dictionary *spec*.

        :param rsrcmgr: resource manager (not used by this constructor)
        :param spec: font dictionary
        :raises PDFFontError: in strict mode, if ``BaseFont`` is missing
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"

        widths: FontWidthDict
        try:
            (descriptor, known_widths) = FontMetricsDB.get_metrics(self.basefont)
            widths = cast(Dict[str, float], known_widths)  # implicit int->float
        except KeyError:
            # No stored metrics for this base font: use the dictionary's own
            # descriptor and widths array.
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            firstchar = int_value(spec.get("FirstChar", 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            width_list = list_value(spec.get("Widths", [0] * 256))
            widths = {
                offset + firstchar: resolve1(width)
                for (offset, width) in enumerate(width_list)
            }

        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        if "Encoding" in spec or "FontFile" not in descriptor:
            return
        # try to recover the missing encoding info from the font file.
        self.fontfile = stream_value(descriptor.get("FontFile"))
        length1 = int_value(self.fontfile["Length1"])
        header = self.fontfile.get_data()[:length1]
        self.cid2unicode = Type1FontHeaderParser(BytesIO(header)).get_encoding()

    def __repr__(self) -> str:
        return "<PDFType1Font: basefont=%r>" % self.basefont

1017

1018

class PDFTrueTypeFont(PDFType1Font):
    """TrueType simple font; differs from :class:`PDFType1Font` only in repr."""

    def __repr__(self) -> str:
        return "<PDFTrueTypeFont: basefont=%r>" % self.basefont

1022

1023

class PDFType3Font(PDFSimpleFont):
    """Type 3 font; ascent, descent and scale are derived from the font's
    bounding box and ``FontMatrix`` instead of the descriptor."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from its dictionary *spec*.

        :param rsrcmgr: resource manager (not used by this constructor)
        :param spec: font dictionary
        """
        firstchar = int_value(spec.get("FirstChar", 0))
        # lastchar = int_value(spec.get('LastChar', 0))
        width_list = list_value(spec.get("Widths", [0] * 256))
        widths = {
            offset + firstchar: width for (offset, width) in enumerate(width_list)
        }

        if "FontDescriptor" not in spec:
            # No descriptor: synthesise one from the font's own bounding box.
            descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
        else:
            descriptor = dict_value(spec["FontDescriptor"])

        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        matrix_values = list_value(spec.get("FontMatrix"))
        self.matrix = cast(Matrix, tuple(matrix_values))
        # Vertical extent is taken from the bounding box.
        (_, self.descent, _, self.ascent) = self.bbox
        (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))

    def __repr__(self) -> str:
        return "<PDFType3Font>"

1041

1042

class PDFCIDFont(PDFFont):
    """CID-keyed (multi-byte) font.

    Character codes are decoded through a CMap (``self.cmap``) and
    translated to unicode through ``self.unicode_map`` when one could be
    determined.
    """

    default_disp: Union[float, Tuple[Optional[float], float]]

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        spec: Mapping[str, Any],
        strict: bool = settings.STRICT,
    ) -> None:
        """Build the font from its dictionary *spec*.

        :param rsrcmgr: resource manager (not used by this constructor)
        :param spec: font dictionary
        :param strict: raise :class:`PDFFontError` for missing ``BaseFont``,
            ``FontDescriptor`` or encoding information instead of falling
            back to defaults
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if strict:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"
        self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
        cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode(
            "latin1",
        )
        cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode(
            "latin1",
        )
        self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
        self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec["FontDescriptor"])
        except KeyError:
            if strict:
                raise PDFFontError("FontDescriptor is missing")
            descriptor = {}
        ttf = None
        if "FontFile2" in descriptor:
            self.fontfile = stream_value(descriptor.get("FontFile2"))
            ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))
        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            if isinstance(spec["ToUnicode"], PDFStream):
                strm = stream_value(spec["ToUnicode"])
                self.unicode_map = FileUnicodeMap()
                CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
            else:
                cmap_name = literal_name(spec["ToUnicode"])
                # A missing /Encoding is tolerated elsewhere in non-strict
                # mode (see _get_cmap_name), so it must not raise a bare
                # KeyError here; treat it as "no name".
                encoding = (
                    literal_name(spec["Encoding"]) if "Encoding" in spec else ""
                )
                if (
                    "Identity" in cid_ordering
                    or "Identity" in cmap_name
                    or "Identity" in encoding
                ):
                    self.unicode_map = IdentityUnicodeMap()
        elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
            if ttf:
                try:
                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    # No usable cmap in the embedded font: leave unmapped.
                    pass
        else:
            try:
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding,
                    self.cmap.is_vertical(),
                )
            except CMapDB.CMapNotFound:
                # No predefined unicode map for this coding: leave unmapped.
                pass

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            widths2 = get_widths2(list_value(spec.get("W2", [])))
            self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()}
            (vy, w) = resolve1(spec.get("DW2", [880, -1000]))
            self.default_disp = (None, vy)
            # The comprehension has its own ``w``; the DW2 value bound above
            # is what ends up as the default width.
            widths = {cid: w for (cid, (w, _)) in widths2.items()}
            default_width = w
        else:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get("W", [])))
            default_width = spec.get("DW", 1000)
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.

        :returns: the named CMap, or an empty ``CMap`` when it is unknown
            and *strict* is false
        :raises PDFFontError: in strict mode, if the CMap cannot be found
        """
        cmap_name = self._get_cmap_name(spec, strict)

        try:
            return CMapDB.get_cmap(cmap_name)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e)
            return CMap()

    @staticmethod
    def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
        """Get cmap name from font specification

        Returns ``"unknown"`` when no ``Encoding`` is present and *strict* is
        false. Names listed in ``IDENTITY_ENCODER`` are replaced by their
        mapped value.

        :raises PDFFontError: in strict mode, if the encoding or its
            ``CMapName`` is unspecified
        """
        cmap_name = "unknown"  # default value

        try:
            spec_encoding = spec["Encoding"]
            if hasattr(spec_encoding, "name"):
                cmap_name = literal_name(spec["Encoding"])
            else:
                cmap_name = literal_name(spec_encoding["CMapName"])
        except KeyError:
            if strict:
                raise PDFFontError("Encoding is unspecified")

        # NOTE(review): cmap_name comes from literal_name() or the "unknown"
        # default above, so this branch looks unreachable -- confirm before
        # removing it.
        if type(cmap_name) is PDFStream:  # type: ignore[comparison-overlap]
            cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
            if "CMapName" in cmap_name_stream:
                cmap_name = cmap_name_stream.get("CMapName").name
            elif strict:
                raise PDFFontError("CMapName unspecified for encoding")

        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

    def __repr__(self) -> str:
        return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"

    def is_vertical(self) -> bool:
        """Return whether the font's CMap selects vertical writing mode."""
        return self.vertical

    def is_multibyte(self) -> bool:
        """Return True: character codes may span several bytes."""
        return True

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split *bytes* into character ids using the font's CMap."""
        return self.cmap.decode(bytes)

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid: int) -> str:
        """Translate *cid* to unicode through ``self.unicode_map``.

        :raises PDFUnicodeNotDefined: if there is no unicode map or it does
            not contain *cid*
        """
        try:
            if not self.unicode_map:
                raise PDFKeyError(cid)
            return self.unicode_map.get_unichr(cid)
        except KeyError:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)

1189 raise PDFUnicodeNotDefined(self.cidcoding, cid)