Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdffont.py: 28%

1import contextlib

2import logging

3import struct

4from collections.abc import Iterable, Iterator, Mapping

5from io import BytesIO

6from typing import (

7 TYPE_CHECKING,

8 Any,

9 BinaryIO,

10 cast,

11)

13from pdfminer import settings

14from pdfminer.casting import safe_float, safe_rect_list

15from pdfminer.cmapdb import (

16 CMap,

17 CMapBase,

18 CMapDB,

19 CMapParser,

20 FileUnicodeMap,

21 IdentityUnicodeMap,

22 UnicodeMap,

23)

24from pdfminer.encodingdb import EncodingDB, name2unicode

25from pdfminer.fontmetrics import FONT_METRICS

26from pdfminer.pdfexceptions import PDFException, PDFKeyError, PDFValueError

27from pdfminer.pdftypes import (

28 PDFStream,

29 dict_value,

30 int_value,

31 list_value,

32 num_value,

33 resolve1,

34 resolve_all,

35 stream_value,

36)

37from pdfminer.psexceptions import PSEOF

38from pdfminer.psparser import (

39 KWD,

40 LIT,

41 PSKeyword,

42 PSLiteral,

43 PSStackParser,

44 literal_name,

45)

46from pdfminer.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack

48if TYPE_CHECKING:

49 from pdfminer.pdfinterp import PDFResourceManager

51log = logging.getLogger(__name__)

def get_widths(seq: Iterable[object]) -> dict[str | int, float]:
    """Build a mapping of character widths for horizontal writing.

    Two layouts are recognised while scanning *seq*: a number followed by a
    list (the list entries are assigned to consecutive codes starting at that
    number) and three consecutive numbers ``first last width`` (the width is
    assigned to every code in the inclusive range). Each element is passed
    through ``resolve1`` before it is inspected.

    :param seq: elements of a width array
    :returns: mapping of character code to width
    """
    widths: dict[int, float] = {}
    pending: list[float] = []
    for item in seq:
        item = resolve1(item)
        if isinstance(item, list):
            # A list only carries meaning when a start code precedes it.
            if pending:
                start = pending[-1]
                for offset, width in enumerate(item):
                    widths[cast(int, start) + offset] = width
                pending = []
            continue
        if not isinstance(item, (int, float)):  # == not utils.isnumber(item)
            log.warning(
                "Skipping invalid font width specification for %s "
                "because it is not a number or a list",
                item,
            )
            continue
        pending.append(item)
        if len(pending) != 3:
            continue
        (first, last, width) = pending
        pending = []
        if isinstance(first, int) and isinstance(last, int):
            for code in range(first, last + 1):
                widths[code] = width
        else:
            log.warning(
                "Skipping invalid font width specification for %s to "
                "%s because either of them is not an int",
                first,
                last,
            )
    return cast(dict[str | int, float], widths)

def get_widths2(seq: Iterable[object]) -> dict[int, tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    Two layouts are recognised while scanning *seq*: a number followed by a
    list (the list is consumed three values at a time through ``choplist``
    and each ``(w, vx, vy)`` group is assigned to consecutive codes starting
    at that number) and five consecutive numbers ``first last w vx vy`` (the
    metrics are assigned to every code in the inclusive range).

    Each element is passed through ``resolve1`` first, and a range whose
    bounds are not both ints is logged and skipped, mirroring
    ``get_widths``; previously such a range raised ``TypeError`` from
    ``range()``.

    :param seq: elements of a vertical metrics array
    :returns: mapping of character code to ``(w, (vx, vy))``
    """
    widths: dict[int, tuple[float, Point]] = {}
    r: list[float] = []
    for v in seq:
        # Elements may be indirect references, exactly as in get_widths().
        v = resolve1(v)
        if isinstance(v, list):
            if r:
                char1 = r[-1]
                for i, (w, vx, vy) in enumerate(choplist(3, v)):
                    widths[cast(int, char1) + i] = (w, (vx, vy))
                r = []
        elif isinstance(v, (int, float)):  # == utils.isnumber(v)
            r.append(v)
            if len(r) == 5:
                (char1, char2, w, vx, vy) = r
                if isinstance(char1, int) and isinstance(char2, int):
                    for i in range(char1, char2 + 1):
                        widths[i] = (w, (vx, vy))
                else:
                    # range() only accepts ints; skip instead of crashing.
                    log.warning(
                        "Skipping invalid font width specification for %s to "
                        "%s because either of them is not an int",
                        char1,
                        char2,
                    )
                r = []
    return widths

109

110

class FontMetricsDB:
    """Name-based lookup into the ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname: str) -> tuple[dict[str, object], dict[str, int]]:
        """Return the ``FONT_METRICS`` entry stored under *fontname*.

        The lookup is a plain subscript, so whatever ``FONT_METRICS`` raises
        for an unknown name reaches the caller (PDFType1Font in this file
        handles ``KeyError``).
        """
        metrics = FONT_METRICS[fontname]
        return metrics

115

116

117# int here means that we're not extending PSStackParser with additional types.

class Type1FontHeaderParser(PSStackParser[int]):
    """Stack parser that collects ``<int> <literal> put`` pairs from a header."""

    KEYWORD_BEGIN = KWD(b"begin")
    KEYWORD_END = KWD(b"end")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_PUT = KWD(b"put")
    KEYWORD_DICT = KWD(b"dict")
    KEYWORD_ARRAY = KWD(b"array")
    KEYWORD_READONLY = KWD(b"readonly")
    KEYWORD_FOR = KWD(b"for")

    def __init__(self, data: BinaryIO) -> None:
        PSStackParser.__init__(self, data)
        # Filled in by get_encoding(); maps character code to unicode text.
        self._cid2unicode: dict[int, str] = {}

    def get_encoding(self) -> dict[int, str]:
        """Parse the font encoding.

        The Type1 font encoding maps character codes to character names.
        Every ``(code, name)`` pair produced by the parser is converted with
        ``name2unicode``; a name for which that conversion raises
        ``KeyError`` is logged at debug level and left out of the result.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode characters
        """
        mapping = self._cid2unicode
        while True:
            try:
                (code, glyph_name) = self.nextobject()
            except PSEOF:
                # End of input: everything collected so far is the result.
                return mapping
            try:
                mapping[code] = name2unicode(cast(str, glyph_name))
            except KeyError as err:
                log.debug(str(err))

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Record the two topmost stack entries when a ``put`` keyword is seen.

        The pair is only kept when the key is an int and the value is a
        literal; every other keyword is ignored.
        """
        if token is not self.KEYWORD_PUT:
            return
        ((_, key), (_, value)) = self.pop(2)
        if isinstance(key, int) and isinstance(value, PSLiteral):
            self.add_results((key, literal_name(value)))

162

163

# Text contributed by each nibble value (0-14) while getdict() decodes a real
# number operand. Index 13 is None (getdict() asserts it is never looked up);
# nibble 15 has no entry because getdict() handles it as the terminator.
NIBBLES = (
    "0",
    "1",
    "2",
    "3",
    "4",
    "5",
    "6",
    "7",
    "8",
    "9",
    ".",
    "e",
    "e-",
    None,
    "-",
)

# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
    "DLIdent-H": "Identity-H",
    "DLIdent-V": "Identity-V",
}

172

173

174def getdict(data: bytes) -> dict[int, list[float | int]]:

175 d: dict[int, list[float | int]] = {}

176 fp = BytesIO(data)

177 stack: list[float | int] = []

178 while 1:

179 c = fp.read(1)

180 if not c:

181 break

182 b0 = ord(c)

183 if b0 <= 21:

184 d[b0] = stack

185 stack = []

186 continue

187 if b0 == 30:

188 s = ""

189 loop = True

190 while loop:

191 b = ord(fp.read(1))

192 for n in (b >> 4, b & 15):

193 if n == 15:

194 loop = False

195 else:

196 nibble = NIBBLES[n]

197 assert nibble is not None

198 s += nibble

199 value = float(s)

200 elif b0 >= 32 and b0 <= 246:

201 value = b0 - 139

202 else:

203 b1 = ord(fp.read(1))

204 if b0 >= 247 and b0 <= 250:

205 value = ((b0 - 247) << 8) + b1 + 108

206 elif b0 >= 251 and b0 <= 254:

207 value = -((b0 - 251) << 8) - b1 - 108

208 else:

209 b2 = ord(fp.read(1))

210 if b1 >= 128:

211 b1 -= 256

212 if b0 == 28:

213 value = b1 << 8 | b2

214 else:

215 value = b1 << 24 | b2 << 16 | struct.unpack(">H", fp.read(2))[0]

216 stack.append(value)

217 return d

218

219

220class CFFFont:

221 STANDARD_STRINGS = (

222 ".notdef",

223 "space",

224 "exclam",

225 "quotedbl",

226 "numbersign",

227 "dollar",

228 "percent",

229 "ampersand",

230 "quoteright",

231 "parenleft",

232 "parenright",

233 "asterisk",

234 "plus",

235 "comma",

236 "hyphen",

237 "period",

238 "slash",

239 "zero",

240 "one",

241 "two",

242 "three",

243 "four",

244 "five",

245 "six",

246 "seven",

247 "eight",

248 "nine",

249 "colon",

250 "semicolon",

251 "less",

252 "equal",

253 "greater",

254 "question",

255 "at",

256 "A",

257 "B",

258 "C",

259 "D",

260 "E",

261 "F",

262 "G",

263 "H",

264 "I",

265 "J",

266 "K",

267 "L",

268 "M",

269 "N",

270 "O",

271 "P",

272 "Q",

273 "R",

274 "S",

275 "T",

276 "U",

277 "V",

278 "W",

279 "X",

280 "Y",

281 "Z",

282 "bracketleft",

283 "backslash",

284 "bracketright",

285 "asciicircum",

286 "underscore",

287 "quoteleft",

288 "a",

289 "b",

290 "c",

291 "d",

292 "e",

293 "f",

294 "g",

295 "h",

296 "i",

297 "j",

298 "k",

299 "l",

300 "m",

301 "n",

302 "o",

303 "p",

304 "q",

305 "r",

306 "s",

307 "t",

308 "u",

309 "v",

310 "w",

311 "x",

312 "y",

313 "z",

314 "braceleft",

315 "bar",

316 "braceright",

317 "asciitilde",

318 "exclamdown",

319 "cent",

320 "sterling",

321 "fraction",

322 "yen",

323 "florin",

324 "section",

325 "currency",

326 "quotesingle",

327 "quotedblleft",

328 "guillemotleft",

329 "guilsinglleft",

330 "guilsinglright",

331 "fi",

332 "fl",

333 "endash",

334 "dagger",

335 "daggerdbl",

336 "periodcentered",

337 "paragraph",

338 "bullet",

339 "quotesinglbase",

340 "quotedblbase",

341 "quotedblright",

342 "guillemotright",

343 "ellipsis",

344 "perthousand",

345 "questiondown",

346 "grave",

347 "acute",

348 "circumflex",

349 "tilde",

350 "macron",

351 "breve",

352 "dotaccent",

353 "dieresis",

354 "ring",

355 "cedilla",

356 "hungarumlaut",

357 "ogonek",

358 "caron",

359 "emdash",

360 "AE",

361 "ordfeminine",

362 "Lslash",

363 "Oslash",

364 "OE",

365 "ordmasculine",

366 "ae",

367 "dotlessi",

368 "lslash",

369 "oslash",

370 "oe",

371 "germandbls",

372 "onesuperior",

373 "logicalnot",

374 "mu",

375 "trademark",

376 "Eth",

377 "onehalf",

378 "plusminus",

379 "Thorn",

380 "onequarter",

381 "divide",

382 "brokenbar",

383 "degree",

384 "thorn",

385 "threequarters",

386 "twosuperior",

387 "registered",

388 "minus",

389 "eth",

390 "multiply",

391 "threesuperior",

392 "copyright",

393 "Aacute",

394 "Acircumflex",

395 "Adieresis",

396 "Agrave",

397 "Aring",

398 "Atilde",

399 "Ccedilla",

400 "Eacute",

401 "Ecircumflex",

402 "Edieresis",

403 "Egrave",

404 "Iacute",

405 "Icircumflex",

406 "Idieresis",

407 "Igrave",

408 "Ntilde",

409 "Oacute",

410 "Ocircumflex",

411 "Odieresis",

412 "Ograve",

413 "Otilde",

414 "Scaron",

415 "Uacute",

416 "Ucircumflex",

417 "Udieresis",

418 "Ugrave",

419 "Yacute",

420 "Ydieresis",

421 "Zcaron",

422 "aacute",

423 "acircumflex",

424 "adieresis",

425 "agrave",

426 "aring",

427 "atilde",

428 "ccedilla",

429 "eacute",

430 "ecircumflex",

431 "edieresis",

432 "egrave",

433 "iacute",

434 "icircumflex",

435 "idieresis",

436 "igrave",

437 "ntilde",

438 "oacute",

439 "ocircumflex",

440 "odieresis",

441 "ograve",

442 "otilde",

443 "scaron",

444 "uacute",

445 "ucircumflex",

446 "udieresis",

447 "ugrave",

448 "yacute",

449 "ydieresis",

450 "zcaron",

451 "exclamsmall",

452 "Hungarumlautsmall",

453 "dollaroldstyle",

454 "dollarsuperior",

455 "ampersandsmall",

456 "Acutesmall",

457 "parenleftsuperior",

458 "parenrightsuperior",

459 "twodotenleader",

460 "onedotenleader",

461 "zerooldstyle",

462 "oneoldstyle",

463 "twooldstyle",

464 "threeoldstyle",

465 "fouroldstyle",

466 "fiveoldstyle",

467 "sixoldstyle",

468 "sevenoldstyle",

469 "eightoldstyle",

470 "nineoldstyle",

471 "commasuperior",

472 "threequartersemdash",

473 "periodsuperior",

474 "questionsmall",

475 "asuperior",

476 "bsuperior",

477 "centsuperior",

478 "dsuperior",

479 "esuperior",

480 "isuperior",

481 "lsuperior",

482 "msuperior",

483 "nsuperior",

484 "osuperior",

485 "rsuperior",

486 "ssuperior",

487 "tsuperior",

488 "ff",

489 "ffi",

490 "ffl",

491 "parenleftinferior",

492 "parenrightinferior",

493 "Circumflexsmall",

494 "hyphensuperior",

495 "Gravesmall",

496 "Asmall",

497 "Bsmall",

498 "Csmall",

499 "Dsmall",

500 "Esmall",

501 "Fsmall",

502 "Gsmall",

503 "Hsmall",

504 "Ismall",

505 "Jsmall",

506 "Ksmall",

507 "Lsmall",

508 "Msmall",

509 "Nsmall",

510 "Osmall",

511 "Psmall",

512 "Qsmall",

513 "Rsmall",

514 "Ssmall",

515 "Tsmall",

516 "Usmall",

517 "Vsmall",

518 "Wsmall",

519 "Xsmall",

520 "Ysmall",

521 "Zsmall",

522 "colonmonetary",

523 "onefitted",

524 "rupiah",

525 "Tildesmall",

526 "exclamdownsmall",

527 "centoldstyle",

528 "Lslashsmall",

529 "Scaronsmall",

530 "Zcaronsmall",

531 "Dieresissmall",

532 "Brevesmall",

533 "Caronsmall",

534 "Dotaccentsmall",

535 "Macronsmall",

536 "figuredash",

537 "hypheninferior",

538 "Ogoneksmall",

539 "Ringsmall",

540 "Cedillasmall",

541 "questiondownsmall",

542 "oneeighth",

543 "threeeighths",

544 "fiveeighths",

545 "seveneighths",

546 "onethird",

547 "twothirds",

548 "zerosuperior",

549 "foursuperior",

550 "fivesuperior",

551 "sixsuperior",

552 "sevensuperior",

553 "eightsuperior",

554 "ninesuperior",

555 "zeroinferior",

556 "oneinferior",

557 "twoinferior",

558 "threeinferior",

559 "fourinferior",

560 "fiveinferior",

561 "sixinferior",

562 "seveninferior",

563 "eightinferior",

564 "nineinferior",

565 "centinferior",

566 "dollarinferior",

567 "periodinferior",

568 "commainferior",

569 "Agravesmall",

570 "Aacutesmall",

571 "Acircumflexsmall",

572 "Atildesmall",

573 "Adieresissmall",

574 "Aringsmall",

575 "AEsmall",

576 "Ccedillasmall",

577 "Egravesmall",

578 "Eacutesmall",

579 "Ecircumflexsmall",

580 "Edieresissmall",

581 "Igravesmall",

582 "Iacutesmall",

583 "Icircumflexsmall",

584 "Idieresissmall",

585 "Ethsmall",

586 "Ntildesmall",

587 "Ogravesmall",

588 "Oacutesmall",

589 "Ocircumflexsmall",

590 "Otildesmall",

591 "Odieresissmall",

592 "OEsmall",

593 "Oslashsmall",

594 "Ugravesmall",

595 "Uacutesmall",

596 "Ucircumflexsmall",

597 "Udieresissmall",

598 "Yacutesmall",

599 "Thornsmall",

600 "Ydieresissmall",

601 "001.000",

602 "001.001",

603 "001.002",

604 "001.003",

605 "Black",

606 "Bold",

607 "Book",

608 "Light",

609 "Medium",

610 "Regular",

611 "Roman",

612 "Semibold",

613 )

614

615 class INDEX:

616 def __init__(self, fp: BinaryIO) -> None:

617 self.fp = fp

618 self.offsets: list[int] = []

619 (count, offsize) = struct.unpack(">HB", self.fp.read(3))

620 for _i in range(count + 1):

621 self.offsets.append(nunpack(self.fp.read(offsize)))

622 self.base = self.fp.tell() - 1

623 self.fp.seek(self.base + self.offsets[-1])

624

625 def __repr__(self) -> str:

626 return f"<INDEX: size={len(self)}>"

627

628 def __len__(self) -> int:

629 return len(self.offsets) - 1

630

631 def __getitem__(self, i: int) -> bytes:

632 self.fp.seek(self.base + self.offsets[i])

633 return self.fp.read(self.offsets[i + 1] - self.offsets[i])

634

635 def __iter__(self) -> Iterator[bytes]:

636 return iter(self[i] for i in range(len(self)))

637

638 def __init__(self, name: str, fp: BinaryIO) -> None:

639 self.name = name

640 self.fp = fp

641 # Header

642 (_major, _minor, hdrsize, _offsize) = struct.unpack("BBBB", self.fp.read(4))

643 self.fp.read(hdrsize - 4)

644 # Name INDEX

645 self.name_index = self.INDEX(self.fp)

646 # Top DICT INDEX

647 self.dict_index = self.INDEX(self.fp)

648 # String INDEX

649 self.string_index = self.INDEX(self.fp)

650 # Global Subr INDEX

651 self.subr_index = self.INDEX(self.fp)

652 # Top DICT DATA

653 self.top_dict = getdict(self.dict_index[0])

654 (charset_pos,) = self.top_dict.get(15, [0])

655 (encoding_pos,) = self.top_dict.get(16, [0])

656 (charstring_pos,) = self.top_dict.get(17, [0])

657 # CharStrings

658 self.fp.seek(cast(int, charstring_pos))

659 self.charstring = self.INDEX(self.fp)

660 self.nglyphs = len(self.charstring)

661 # Encodings

662 self.code2gid = {}

663 self.gid2code = {}

664 self.fp.seek(cast(int, encoding_pos))

665 format = self.fp.read(1)

666 if format == b"\x00":

667 # Format 0

668 (n,) = struct.unpack("B", self.fp.read(1))

669 for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))):

670 self.code2gid[code] = gid

671 self.gid2code[gid] = code

672 elif format == b"\x01":

673 # Format 1

674 (n,) = struct.unpack("B", self.fp.read(1))

675 code = 0

676 for _i in range(n):

677 (first, nleft) = struct.unpack("BB", self.fp.read(2))

678 for gid in range(first, first + nleft + 1):

679 self.code2gid[code] = gid

680 self.gid2code[gid] = code

681 code += 1

682 else:

683 raise PDFValueError(f"unsupported encoding format: {format!r}")

684 # Charsets

685 self.name2gid = {}

686 self.gid2name = {}

687 self.fp.seek(cast(int, charset_pos))

688 format = self.fp.read(1)

689 if format == b"\x00":

690 # Format 0

691 n = self.nglyphs - 1

692 for gid, sid in enumerate(

693 cast(

694 tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n))

695 ),

696 ):

697 gid += 1

698 sidname = self.getstr(sid)

699 self.name2gid[sidname] = gid

700 self.gid2name[gid] = sidname

701 elif format == b"\x01":

702 # Format 1

703 (n,) = struct.unpack("B", self.fp.read(1))

704 sid = 0

705 for _i in range(n):

706 (first, nleft) = struct.unpack("BB", self.fp.read(2))

707 for gid in range(first, first + nleft + 1):

708 sidname = self.getstr(sid)

709 self.name2gid[sidname] = gid

710 self.gid2name[gid] = sidname

711 sid += 1

712 elif format == b"\x02":

713 # Format 2

714 raise AssertionError(str(("Unhandled", format)))

715 else:

716 raise PDFValueError(f"unsupported charset format: {format!r}")

717

718 def getstr(self, sid: int) -> str | bytes:

719 # This returns str for one of the STANDARD_STRINGS but bytes otherwise,

720 # and appears to be a needless source of type complexity.

721 if sid < len(self.STANDARD_STRINGS):

722 return self.STANDARD_STRINGS[sid]

723 return self.string_index[sid - len(self.STANDARD_STRINGS)]

724

725

class TrueTypeFont:
    """Reader for the table directory and ``cmap`` table of a TrueType font."""

    class CMapNotFound(PDFException):
        """Raised when no character-to-glyph mapping could be built."""

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Read the table directory from *fp*.

        :param name: font name, stored as-is
        :param fp: seekable binary stream positioned at the start of the font
        """
        self.name = name
        self.fp = fp
        # Table tag -> (offset, length) as stored in the directory.
        self.tables: dict[bytes, tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = struct.unpack(">HHHH", fp.read(8))
            for _ in range(ntables):
                (name_bytes, _tsum, offset, length) = struct.unpack(
                    ">4sLLL", fp.read(16)
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a glyph-id to character map from the font's Unicode cmaps.

        :returns: map populated through ``add_cid2unichr(gid, char)``
        :raises TrueTypeFont.CMapNotFound: if there is no ``cmap`` table or
            no mapping was collected from it
        :raises AssertionError: for a selected subtable whose format is not
            0, 2, 4, 6, 10 or 12
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, _length) = self.tables[b"cmap"]
        fp = self.fp
        fp.seek(base_offset)
        (_version, nsubtables) = cast(tuple[int, int], struct.unpack(">HH", fp.read(4)))
        subtables: list[tuple[int, int, int]] = []
        for _i in range(nsubtables):
            subtables.append(struct.unpack(">HHL", fp.read(8)))
        char2gid: dict[int, int] = {}
        # Supports subtable type 0, 2, 4, 6, 10 and 12.
        for platform_id, encoding_id, st_offset in subtables:
            # Skip non-Unicode cmaps.
            # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
            if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
                continue
            fp.seek(base_offset + st_offset)
            (fmttype,) = struct.unpack(">H", fp.read(2))
            if fmttype == 0:
                self.parse_cmap_format_0(fp, char2gid)
            elif fmttype == 2:
                self.parse_cmap_format_2(fp, char2gid)
            elif fmttype == 4:
                self.parse_cmap_format_4(fp, char2gid)
            elif fmttype == 6:
                self.parse_cmap_format_6(fp, char2gid)
            elif fmttype == 10:
                self.parse_cmap_format_10(fp, char2gid)
            elif fmttype == 12:
                self.parse_cmap_format_12(fp, char2gid)
            else:
                raise AssertionError(str(("Unhandled", fmttype)))
        if not char2gid:
            raise TrueTypeFont.CMapNotFound
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map

    def parse_cmap_format_0(self, fp: BinaryIO, char2gid: dict[int, int]) -> None:
        """Parse cmap subtable format 0: 256 one-byte glyph ids, one per code."""
        fmtlen, fmtlang = struct.unpack(">HH", fp.read(4))
        log.debug("parse_cmap_format: fmtlen=%s, fmtlang=%s", fmtlen, fmtlang)
        char2gid.update(enumerate(struct.unpack(">256B", fp.read(256))))

    def parse_cmap_format_2(self, fp: BinaryIO, char2gid: dict[int, int]) -> None:
        """Parse cmap subtable format 2 (sub-headers selected by high byte)."""
        fmtlen, fmtlang = struct.unpack(">HH", fp.read(4))
        log.debug("parse_cmap_format: fmtlen=%s, fmtlang=%s", fmtlen, fmtlang)
        subheaderkeys = struct.unpack(">256H", fp.read(512))
        # Reverse lookup: sub-header index -> the (last) byte that selects it.
        firstbytes = [0] * 8192
        for i, k in enumerate(subheaderkeys):
            firstbytes[k // 8] = i
        nhdrs = max(subheaderkeys) // 8 + 1
        hdrs: list[tuple[int, int, int, int, int]] = []
        for i in range(nhdrs):
            (firstcode, entcount, delta, offset) = struct.unpack(">HHhH", fp.read(8))
            # The offset is applied from the position of the offset field
            # itself, i.e. two bytes before the current stream position.
            hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
        for i, firstcode, entcount, delta, pos in hdrs:
            if not entcount:
                continue
            first = firstcode + (firstbytes[i] << 8)
            fp.seek(pos)
            for c in range(entcount):
                gid = struct.unpack(">H", fp.read(2))[0]
                # The delta is only applied to non-zero glyph ids.
                if gid:
                    gid += delta
                char2gid[first + c] = gid

    def parse_cmap_format_4(self, fp: BinaryIO, char2gid: dict[int, int]) -> None:
        """Parse cmap subtable format 4 (segment mapping to delta values).

        For a segment with a non-zero ``idRangeOffset`` the glyph ids are
        read from the glyph id array. That offset counts from the position
        of the segment's own ``idRangeOffset`` entry, and a stored value of
        0 denotes the missing glyph and is kept as 0 instead of having
        ``idDelta`` added to it.
        """
        fmtlen, fmtlang = struct.unpack(">HH", fp.read(4))
        log.debug("parse_cmap_format: fmtlen=%s, fmtlang=%s", fmtlen, fmtlang)
        (segcount, _1, _2, _3) = struct.unpack(">HHHH", fp.read(8))
        segcount //= 2
        ecs = struct.unpack(f">{segcount}H", fp.read(2 * segcount))
        fp.read(2)  # two bytes between the end codes and the start codes
        scs = struct.unpack(f">{segcount}H", fp.read(2 * segcount))
        idds = struct.unpack(f">{segcount}h", fp.read(2 * segcount))
        pos = fp.tell()
        idrs = struct.unpack(f">{segcount}H", fp.read(2 * segcount))
        segments = zip(ecs, scs, idds, idrs, strict=False)
        for seg, (ec, sc, idd, idr) in enumerate(segments):
            if idr:
                # pos + 2 * seg is where this segment's idRangeOffset entry
                # lives; the offset is relative to that entry.
                fp.seek(pos + 2 * seg + idr)
                for c in range(sc, ec + 1):
                    b = struct.unpack(">H", fp.read(2))[0]
                    char2gid[c] = (b + idd) & 0xFFFF if b else 0
            else:
                for c in range(sc, ec + 1):
                    char2gid[c] = (c + idd) & 0xFFFF

    def parse_cmap_format_6(self, fp: BinaryIO, char2gid: dict[int, int]) -> None:
        """Parse cmap subtable format 6: glyph ids for one run of codes."""
        fmtlen, fmtlang = struct.unpack(">HH", fp.read(4))
        log.debug("parse_cmap_format: fmtlen=%s, fmtlang=%s", fmtlen, fmtlang)
        firstcode, entcount = struct.unpack(">HH", fp.read(4))
        gids = struct.unpack(f">{entcount}H", fp.read(2 * entcount))
        for i, gid in enumerate(gids):
            char2gid[firstcode + i] = gid

    def parse_cmap_format_10(self, fp: BinaryIO, char2gid: dict[int, int]) -> None:
        """Parse cmap subtable format 10: format 6 with 32-bit code fields."""
        rsv, fmtlen, fmtlang = struct.unpack(">HII", fp.read(10))
        log.debug(
            "parse_cmap_format: rsv=%s, fmtlen=%s, fmtlang=%s", rsv, fmtlen, fmtlang
        )
        startcode, numchars = struct.unpack(">II", fp.read(8))
        gids = struct.unpack(f">{numchars}H", fp.read(2 * numchars))
        for i, gid in enumerate(gids):
            char2gid[startcode + i] = gid

    def parse_cmap_format_12(self, fp: BinaryIO, char2gid: dict[int, int]) -> None:
        """Parse cmap subtable format 12: groups of consecutive codes and gids."""
        rsv, fmtlen, fmtlang = struct.unpack(">HII", fp.read(10))
        log.debug(
            "parse_cmap_format: rsv=%s, fmtlen=%s, fmtlang=%s", rsv, fmtlen, fmtlang
        )
        numgroups = struct.unpack(">I", fp.read(4))[0]
        for _i in range(numgroups):
            sc, ec, sgid = struct.unpack(">III", fp.read(12))
            # Codes sc..ec map to glyph ids sgid, sgid + 1, ...
            for code in range(sc, ec + 1):
                char2gid[code] = sgid
                sgid += 1

873

874

class PDFFontError(PDFException):
    """Font-related error, e.g. a required entry missing from a font spec."""

877

878

class PDFUnicodeNotDefined(PDFFontError):
    """Raised by ``to_unichr`` when a character id has no unicode mapping."""

881

882

# Interned literals used when comparing or defaulting names in font specs.
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
LITERAL_TYPE1C = LIT("Type1C")

# Width tables are keyed by *either* integer character ids or unicode
# characters, so both key types are allowed.
FontWidthDict = dict[int | str, float]

889

890

class PDFFont:
    """Base class holding descriptor-derived metrics and a width table."""

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: float | None = None,
    ) -> None:
        """Read the font metrics from *descriptor*.

        :param descriptor: font descriptor entries
        :param widths: width table keyed by character id or unicode text
        :param default_width: width used for characters without an entry;
            when None, the descriptor's ``MissingWidth`` (default 0) is used
        """
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)
        fontname = resolve1(descriptor.get("FontName", "unknown"))
        self.fontname = fontname
        if isinstance(fontname, PSLiteral):
            self.fontname = literal_name(fontname)
        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))
        self.default_width = (
            num_value(descriptor.get("MissingWidth", 0))
            if default_width is None
            else default_width
        )
        self.default_width = resolve1(self.default_width)
        self.leading = num_value(descriptor.get("Leading", 0))
        self.bbox = self._parse_bbox(descriptor)
        self.hscale = self.vscale = 0.001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        """Always False here; subclasses may override."""
        return False

    def is_multibyte(self) -> bool:
        """Always False here; subclasses may override."""
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Return the character ids of a byte string, one per byte."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.vscale * self.ascent

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.vscale * self.descent

    def get_width(self) -> float:
        """Scaled bounding-box width, or the negated default width if it is 0."""
        extent = self.bbox[2] - self.bbox[0]
        if extent == 0:
            extent = -self.default_width
        return extent * self.hscale

    def get_height(self) -> float:
        """Scaled bounding-box height, or ascent minus descent if it is 0."""
        extent = self.bbox[3] - self.bbox[1]
        if extent == 0:
            extent = self.ascent - self.descent
        return extent * self.vscale

    def char_width(self, cid: int) -> float:
        """Return the scaled width of character *cid*.

        Because the width table may be keyed by ids or by strings, the id is
        looked up first and its unicode equivalent second; the default width
        is the last resort.
        """
        width = safe_float(self.widths.get(cid))
        if width is None:
            try:
                width = safe_float(self.widths.get(self.to_unichr(cid)))
            except PDFUnicodeNotDefined:
                width = None
        if width is None:
            width = self.default_width
        return width * self.hscale

    def char_disp(self, cid: int) -> float | tuple[float | None, float]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Sum of ``char_width`` over the character ids decoded from *s*."""
        return sum(map(self.char_width, self.decode(s)))

    def to_unichr(self, cid: int) -> str:
        """Map a character id to unicode text; must be provided by subclasses."""
        raise NotImplementedError

    @staticmethod
    def _parse_bbox(descriptor: Mapping[str, Any]) -> Rect:
        """Parse FontBBox from the fonts descriptor"""
        font_bbox = resolve_all(descriptor.get("FontBBox"))
        bbox = safe_rect_list(font_bbox)
        if bbox is not None:
            return bbox
        log.warning(
            "Could not get FontBBox from font descriptor because "
            "%r cannot be parsed as 4 floats",
            font_bbox,
        )
        return 0.0, 0.0, 0.0, 0.0

996

997

class PDFSimpleFont(PDFFont):
    """Font whose character ids are mapped through an encoding table."""

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        spec: Mapping[str, Any],
    ) -> None:
        """Set up the encoding table and the optional ToUnicode map.

        The font encoding is specified either by the name of a built-in
        encoding or by a dictionary that describes the differences.
        """
        if literal_name(spec.get("Subtype")) == "TrueType":
            # PDF spec: TrueType fonts without Encoding default to WinAnsiEncoding
            default_encoding = LIT("WinAnsiEncoding")
        else:
            default_encoding = LITERAL_STANDARD_ENCODING

        encoding = resolve1(spec["Encoding"]) if "Encoding" in spec else default_encoding

        if isinstance(encoding, dict):
            base_name = literal_name(encoding.get("BaseEncoding", default_encoding))
            differences = list_value(encoding.get("Differences", []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)
        else:
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))

        self.unicode_map: UnicodeMap | None = None
        if "ToUnicode" in spec:
            strm = stream_value(spec["ToUnicode"])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode text for *cid*.

        The ToUnicode map, when present, is consulted first; the encoding
        table is the fallback.

        :raises PDFUnicodeNotDefined: if neither source has an entry
        """
        if self.unicode_map:
            with contextlib.suppress(KeyError):
                return self.unicode_map.get_unichr(cid)
        try:
            return self.cid2unicode[cid]
        except KeyError as err:
            raise PDFUnicodeNotDefined(None, cid) from err

1042

1043

class PDFType1Font(PDFSimpleFont):
    """Simple font described by a Type1 font spec."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from *spec*.

        Metrics come from ``FontMetricsDB`` when the base font name is known
        there; otherwise the descriptor and widths are read from *spec*.

        :raises PDFFontError: if ``BaseFont`` is missing and
            ``settings.STRICT`` is set
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing") from None
            self.basefont = "unknown"

        widths: FontWidthDict
        try:
            (descriptor, int_widths) = FontMetricsDB.get_metrics(self.basefont)
            widths = cast(dict[str | int, float], int_widths)  # implicit int->float
        except KeyError:
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            firstchar = int_value(spec.get("FirstChar", 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            width_list = list_value(spec.get("Widths", [0] * 256))
            widths = {}
            for position, raw_width in enumerate(width_list):
                widths[position + firstchar] = resolve1(raw_width)
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        if "Encoding" in spec or "FontFile" not in descriptor:
            return
        # try to recover the missing encoding info from the font file.
        self.fontfile = stream_value(descriptor.get("FontFile"))
        length1 = int_value(self.fontfile["Length1"])
        header = self.fontfile.get_data()[:length1]
        self.cid2unicode = Type1FontHeaderParser(BytesIO(header)).get_encoding()

    def __repr__(self) -> str:
        return f"<PDFType1Font: basefont={self.basefont!r}>"

1074

1075

class PDFTrueTypeFont(PDFType1Font):
    """Same construction as PDFType1Font; only the repr differs."""

    def __repr__(self) -> str:
        basefont = self.basefont
        return f"<PDFTrueTypeFont: basefont={basefont!r}>"

1079

1080

class PDFType3Font(PDFSimpleFont):
    """Simple font whose scale and vertical metrics come from the spec itself."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from *spec*.

        Without a ``FontDescriptor`` a minimal descriptor is assembled from
        the spec's ``FontBBox``. After the base initialisation, descent and
        ascent are taken from the bounding box and the scales from
        ``FontMatrix``.
        """
        firstchar = int_value(spec.get("FirstChar", 0))
        # lastchar = int_value(spec.get('LastChar', 0))
        width_list = list_value(spec.get("Widths", [0] * 256))
        widths: dict[str | int, float] = {
            position + firstchar: width for (position, width) in enumerate(width_list)
        }
        if "FontDescriptor" not in spec:
            descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
        else:
            descriptor = dict_value(spec["FontDescriptor"])
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        matrix_values = list_value(spec.get("FontMatrix"))
        self.matrix = cast(Matrix, tuple(matrix_values))
        # The bounding box overrides the descriptor's descent and ascent.
        (_, self.descent, _, self.ascent) = self.bbox
        (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))

    def __repr__(self) -> str:
        return "<PDFType3Font>"

1100

1101

1102class PDFCIDFont(PDFFont):

1103 default_disp: float | tuple[float | None, float]

1104

def __init__(
    self,
    rsrcmgr: "PDFResourceManager",
    spec: Mapping[str, Any],
    strict: bool = settings.STRICT,
) -> None:
    """Initialise a CID-keyed font from its font dictionary.

    Args:
        rsrcmgr: Accepted for interface compatibility; not read here.
        spec: Font dictionary. The entries read are ``BaseFont``,
            ``CIDSystemInfo``, ``Encoding``, ``FontDescriptor``,
            ``ToUnicode`` and, depending on the writing mode,
            ``W``/``DW`` or ``W2``/``DW2``.
        strict: If true, a missing ``BaseFont`` or ``FontDescriptor``
            raises instead of falling back to a placeholder. The flag is
            also forwarded to ``get_cmap_from_spec``.

    Raises:
        PDFFontError: In strict mode, when ``BaseFont`` or
            ``FontDescriptor`` is absent.
    """

    def cid_info_text(value: object) -> str:
        # Registry/Ordering are read as byte strings; a damaged file may
        # hold a name or another object instead, which must not crash
        # with AttributeError on ``.decode``.
        value = resolve1(value)
        if isinstance(value, bytes):
            return value.decode("latin1")
        if isinstance(value, PSLiteral):
            return literal_name(value)
        return str(value)

    try:
        self.basefont = literal_name(spec["BaseFont"])
    except KeyError:
        if strict:
            raise PDFFontError("BaseFont is missing") from None
        self.basefont = "unknown"

    self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
    cid_registry = cid_info_text(self.cidsysteminfo.get("Registry", b"unknown"))
    cid_ordering = cid_info_text(self.cidsysteminfo.get("Ordering", b"unknown"))
    self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
    self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)

    try:
        descriptor = dict_value(spec["FontDescriptor"])
    except KeyError:
        if strict:
            raise PDFFontError("FontDescriptor is missing") from None
        descriptor = {}

    # An embedded TrueType program, when present, can supply a unicode map
    # for the Adobe-Identity / Adobe-UCS codings handled below.
    ttf = None
    if "FontFile2" in descriptor:
        self.fontfile = stream_value(descriptor.get("FontFile2"))
        ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))

    self.unicode_map: UnicodeMap | None = None
    if "ToUnicode" in spec:
        # Follow an indirect reference first so that a referenced stream is
        # recognised as a stream rather than treated as a name.
        to_unicode = resolve1(spec["ToUnicode"])
        if isinstance(to_unicode, PDFStream):
            strm = stream_value(to_unicode)
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
        else:
            cmap_name = literal_name(to_unicode)
            # Encoding may legitimately be absent in non-strict mode (the
            # cmap lookup above already tolerated that), so do not index
            # the dictionary unconditionally here.
            if "Encoding" in spec:
                encoding = literal_name(resolve1(spec["Encoding"]))
            else:
                encoding = ""
            if (
                "Identity" in cid_ordering
                or "Identity" in cmap_name
                or "Identity" in encoding
            ):
                self.unicode_map = IdentityUnicodeMap()
    elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
        if ttf:
            with contextlib.suppress(TrueTypeFont.CMapNotFound):
                self.unicode_map = ttf.create_unicode_map()
    else:
        with contextlib.suppress(CMapDB.CMapNotFound):
            self.unicode_map = CMapDB.get_unicode_map(
                self.cidcoding,
                self.cmap.is_vertical(),
            )

    self.vertical = self.cmap.is_vertical()
    if self.vertical:
        # writing mode: vertical
        widths2 = get_widths2(list_value(spec.get("W2", [])))
        self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()}
        (default_vy, default_w) = resolve1(spec.get("DW2", [880, -1000]))
        self.default_disp = (None, default_vy)
        widths: dict[str | int, float] = {
            cid: cid_w for (cid, (cid_w, _)) in widths2.items()
        }
        default_width = default_w
    else:
        # writing mode: horizontal
        self.disps = {}
        self.default_disp = 0
        widths = get_widths(list_value(spec.get("W", [])))
        # DW may be stored as an indirect reference; resolve it so the
        # base class receives a plain value.
        default_width = resolve1(spec.get("DW", 1000))
    PDFFont.__init__(self, descriptor, widths, default_width=default_width)

1181

def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
    """Look up the CMap that the font specification refers to.

    Some PDFs do not give the encoding type directly on ``Encoding``;
    they store it under ``CMapName`` inside ``spec['Encoding']`` instead.
    The horizontal/vertical variants also appear under several spellings,
    e.g. 'DLIdent-H/V', 'OneByteIdentityH/V', 'Identity-H/V'.

    If the name is not known to ``CMapDB``, strict mode raises
    ``PDFFontError``; otherwise an empty ``CMap`` is returned.
    """
    name = self._get_cmap_name(spec, strict)

    try:
        found = CMapDB.get_cmap(name)
    except CMapDB.CMapNotFound as missing:
        if not strict:
            return CMap()
        raise PDFFontError(missing) from missing
    return found

1199

@staticmethod
def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
    """Get cmap name from font specification.

    ``spec['Encoding']`` is either a name (an object with a ``name``
    attribute) or a dictionary/stream carrying a ``CMapName`` entry.
    When neither form yields a name, strict mode raises and non-strict
    mode keeps the default ``"unknown"``.

    Returns:
        The value stored in ``IDENTITY_ENCODER`` for the name if it is a
        key there, otherwise the name unchanged.

    Raises:
        PDFFontError: In strict mode, when the encoding or its
            ``CMapName`` cannot be determined.
    """
    cmap_name = "unknown"  # default value

    try:
        # Follow an indirect reference so the checks below see the real
        # object instead of the reference wrapper.
        spec_encoding = resolve1(spec["Encoding"])
        if hasattr(spec_encoding, "name"):
            cmap_name = literal_name(spec_encoding)
        elif isinstance(spec_encoding, (Mapping, PDFStream)):
            cmap_name = literal_name(spec_encoding["CMapName"])
        else:
            # Neither a name nor a keyed object (e.g. a number or list in
            # a damaged file): subscripting would raise TypeError, so
            # treat it like a missing entry instead.
            raise KeyError("Encoding")
    except KeyError:
        if strict:
            raise PDFFontError("Encoding is unspecified") from None

    # NOTE(review): kept from the original in case literal_name() can hand
    # back a stream object unchanged; confirm whether this still triggers.
    if isinstance(cmap_name, PDFStream):
        cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
        if "CMapName" in cmap_name_stream:
            cmap_name = cmap_name_stream.get("CMapName").name
        elif strict:
            raise PDFFontError("CMapName unspecified for encoding")

    return IDENTITY_ENCODER.get(cmap_name, cmap_name)

1223

def __repr__(self) -> str:
    """Return a debug string showing the base font name and the CID coding."""
    basefont, cidcoding = self.basefont, self.cidcoding
    return f"<PDFCIDFont: basefont={basefont!r}, cidcoding={cidcoding!r}>"

1226

def is_vertical(self) -> bool:
    """Return the ``vertical`` flag stored on this font."""
    vertical = self.vertical
    return vertical

1229

def is_multibyte(self) -> bool:
    """Always return True for this font class."""
    multibyte = True
    return multibyte

1232

def decode(self, bytes: bytes) -> Iterable[int]:
    """Pass the byte string to this font's CMap and return what it decodes."""
    # The parameter name shadows the builtin but is part of the public
    # signature, so it is kept as is.
    cmap_decode = self.cmap.decode
    return cmap_decode(bytes)

1235

1236 def char_disp(self, cid: int) -> float | tuple[float | None, float]:

1237 """Returns an integer for horizontal fonts, a tuple for vertical fonts."""

1238 return self.disps.get(cid, self.default_disp)

1239

def to_unichr(self, cid: int) -> str:
    """Translate ``cid`` to text through the font's unicode map.

    Raises:
        PDFUnicodeNotDefined: When the map lookup raises ``KeyError``.
            Presumably also when no unicode map is set, assuming
            ``PDFKeyError`` derives from ``KeyError`` (verify).
    """
    try:
        if self.unicode_map:
            return self.unicode_map.get_unichr(cid)
        raise PDFKeyError(cid)
    except KeyError as lookup_error:
        raise PDFUnicodeNotDefined(self.cidcoding, cid) from lookup_error

1246 raise PDFUnicodeNotDefined(self.cidcoding, cid) from err