Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdffont.py: 45%

1import contextlib

2import logging

3import struct

4from collections.abc import Iterable, Iterator, Mapping

5from io import BytesIO

6from typing import (

7 TYPE_CHECKING,

8 Any,

9 BinaryIO,

10 cast,

11)

13from pdfminer import settings

14from pdfminer.casting import safe_float, safe_rect_list

15from pdfminer.cmapdb import (

16 CMap,

17 CMapBase,

18 CMapDB,

19 CMapParser,

20 FileUnicodeMap,

21 IdentityUnicodeMap,

22 UnicodeMap,

23)

24from pdfminer.encodingdb import EncodingDB, name2unicode

25from pdfminer.fontmetrics import FONT_METRICS

26from pdfminer.pdfexceptions import PDFException, PDFKeyError, PDFValueError

27from pdfminer.pdftypes import (

28 PDFStream,

29 dict_value,

30 int_value,

31 list_value,

32 num_value,

33 resolve1,

34 resolve_all,

35 stream_value,

36)

37from pdfminer.psexceptions import PSEOF

38from pdfminer.psparser import (

39 KWD,

40 LIT,

41 PSKeyword,

42 PSLiteral,

43 PSStackParser,

44 literal_name,

45)

46from pdfminer.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack

48if TYPE_CHECKING:

49 from pdfminer.pdfinterp import PDFResourceManager

51log = logging.getLogger(__name__)

def get_widths(seq: Iterable[object]) -> dict[str | int, float]:
    """Build a mapping of character widths for horizontal writing.

    Each item of *seq* is resolved with ``resolve1`` and then interpreted:

    * numbers are buffered;
    * a list following at least one buffered number assigns its entries to
      consecutive codes starting at the most recently buffered number;
    * three buffered numbers ``first last width`` assign ``width`` to every
      code in ``first..last`` (inclusive), provided both bounds are ints;
    * anything else is logged and skipped.

    :returns: mapping of character code to width.
    """
    widths: dict[int, float] = {}
    pending: list[float] = []
    for v in seq:
        v = resolve1(v)

        if isinstance(v, list):
            if pending:
                start = cast(int, pending[-1])
                widths.update((start + offset, width) for offset, width in enumerate(v))
                pending = []
            continue

        if not isinstance(v, (int, float)):  # == not utils.isnumber(v)
            log.warning(
                f"Skipping invalid font width specification for {v} "
                f"because it is not a number or a list"
            )
            continue

        pending.append(v)
        if len(pending) != 3:
            continue

        # A complete "first last width" triple has been collected.
        (char1, char2, w) = pending
        pending = []
        if isinstance(char1, int) and isinstance(char2, int):
            widths.update(dict.fromkeys(range(char1, char2 + 1), w))
        else:
            log.warning(
                f"Skipping invalid font width specification for {char1} to "
                f"{char2} because either of them is not an int"
            )
    return cast(dict[str | int, float], widths)

def get_widths2(seq: Iterable[object]) -> dict[int, tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    Each item of *seq* is resolved with ``resolve1`` (as ``get_widths`` does,
    so that indirect references inside the array are not silently dropped)
    and then interpreted:

    * numbers are buffered;
    * a list following at least one buffered number is consumed in groups of
      three ``(w, vx, vy)`` that are assigned to consecutive codes starting at
      the most recently buffered number;
    * five buffered numbers ``first last w vx vy`` assign ``(w, (vx, vy))``
      to every code in ``first..last`` (inclusive). If either bound is not an
      int the range is logged and skipped, mirroring ``get_widths``, instead
      of failing inside ``range``.

    :returns: mapping of character code to ``(width, (vx, vy))``.
    """
    widths: dict[int, tuple[float, Point]] = {}
    r: list[float] = []
    for v in seq:
        v = resolve1(v)
        if isinstance(v, list):
            if r:
                char1 = r[-1]
                for i, (w, vx, vy) in enumerate(choplist(3, v)):
                    widths[cast(int, char1) + i] = (w, (vx, vy))
                r = []
        elif isinstance(v, (int, float)):  # == utils.isnumber(v)
            r.append(v)
            if len(r) == 5:
                (char1, char2, w, vx, vy) = r
                if isinstance(char1, int) and isinstance(char2, int):
                    for i in range(char1, char2 + 1):
                        widths[i] = (w, (vx, vy))
                else:
                    log.warning(
                        f"Skipping invalid vertical font width specification "
                        f"for {char1} to {char2} because either of them is "
                        f"not an int"
                    )
                r = []
    return widths

106

107

class FontMetricsDB:
    """Lookup facade over the ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname: str) -> tuple[dict[str, object], dict[str, int]]:
        """Return the ``(descriptor, widths)`` entry stored for *fontname*.

        The lookup error for an unknown name is propagated unchanged
        (``PDFType1Font`` relies on catching ``KeyError`` from this call).
        """
        metrics = FONT_METRICS[fontname]
        return metrics

112

113

114# int here means that we're not extending PSStackParser with additional types.

class Type1FontHeaderParser(PSStackParser[int]):
    """Extract the code -> character mapping from a Type1 font header.

    Only the ``put`` keyword is acted upon: each ``<int> <literal> put`` found
    on the stack is emitted as a ``(code, glyph name)`` result.
    """

    KEYWORD_BEGIN = KWD(b"begin")
    KEYWORD_END = KWD(b"end")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_PUT = KWD(b"put")
    KEYWORD_DICT = KWD(b"dict")
    KEYWORD_ARRAY = KWD(b"array")
    KEYWORD_READONLY = KWD(b"readonly")
    KEYWORD_FOR = KWD(b"for")

    def __init__(self, data: BinaryIO) -> None:
        PSStackParser.__init__(self, data)
        # Filled in by get_encoding(): character code -> unicode string.
        self._cid2unicode: dict[int, str] = {}

    def get_encoding(self) -> dict[int, str]:
        """Parse the font encoding.

        The Type1 font encoding maps character codes to character names. These
        character names could either be standard Adobe glyph names, or
        character names associated with custom CharStrings for this font. A
        CharString is a sequence of operations that describe how the character
        should be drawn. Names that ``name2unicode`` rejects with ``KeyError``
        are logged at debug level and left out of the mapping.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode characters
        """
        mapping = self._cid2unicode
        while True:
            try:
                (cid, name) = self.nextobject()
            except PSEOF:
                # End of the header data: nothing more to collect.
                return mapping
            try:
                mapping[cid] = name2unicode(cast(str, name))
            except KeyError as e:
                log.debug(str(e))

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Emit a ``(code, name)`` result for every well-formed ``put``."""
        if token is not self.KEYWORD_PUT:
            return
        ((_, key), (_, value)) = self.pop(2)
        if not isinstance(key, int):
            return
        if not isinstance(value, PSLiteral):
            return
        self.add_results((key, literal_name(value)))

159

160

# Text fragments for the 4-bit nibbles of a packed real number, indexed by
# nibble value 0..14 (used by getdict). Index 13 holds None, which getdict
# rejects with an assertion; nibble 15 terminates the number and therefore
# has no entry here.
NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-")

# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
    "DLIdent-H": "Identity-H",
    "DLIdent-V": "Identity-V",
}

169

170

171def getdict(data: bytes) -> dict[int, list[float | int]]:

172 d: dict[int, list[float | int]] = {}

173 fp = BytesIO(data)

174 stack: list[float | int] = []

175 while 1:

176 c = fp.read(1)

177 if not c:

178 break

179 b0 = ord(c)

180 if b0 <= 21:

181 d[b0] = stack

182 stack = []

183 continue

184 if b0 == 30:

185 s = ""

186 loop = True

187 while loop:

188 b = ord(fp.read(1))

189 for n in (b >> 4, b & 15):

190 if n == 15:

191 loop = False

192 else:

193 nibble = NIBBLES[n]

194 assert nibble is not None

195 s += nibble

196 value = float(s)

197 elif b0 >= 32 and b0 <= 246:

198 value = b0 - 139

199 else:

200 b1 = ord(fp.read(1))

201 if b0 >= 247 and b0 <= 250:

202 value = ((b0 - 247) << 8) + b1 + 108

203 elif b0 >= 251 and b0 <= 254:

204 value = -((b0 - 251) << 8) - b1 - 108

205 else:

206 b2 = ord(fp.read(1))

207 if b1 >= 128:

208 b1 -= 256

209 if b0 == 28:

210 value = b1 << 8 | b2

211 else:

212 value = b1 << 24 | b2 << 16 | struct.unpack(">H", fp.read(2))[0]

213 stack.append(value)

214 return d

215

216

217class CFFFont:

218 STANDARD_STRINGS = (

219 ".notdef",

220 "space",

221 "exclam",

222 "quotedbl",

223 "numbersign",

224 "dollar",

225 "percent",

226 "ampersand",

227 "quoteright",

228 "parenleft",

229 "parenright",

230 "asterisk",

231 "plus",

232 "comma",

233 "hyphen",

234 "period",

235 "slash",

236 "zero",

237 "one",

238 "two",

239 "three",

240 "four",

241 "five",

242 "six",

243 "seven",

244 "eight",

245 "nine",

246 "colon",

247 "semicolon",

248 "less",

249 "equal",

250 "greater",

251 "question",

252 "at",

253 "A",

254 "B",

255 "C",

256 "D",

257 "E",

258 "F",

259 "G",

260 "H",

261 "I",

262 "J",

263 "K",

264 "L",

265 "M",

266 "N",

267 "O",

268 "P",

269 "Q",

270 "R",

271 "S",

272 "T",

273 "U",

274 "V",

275 "W",

276 "X",

277 "Y",

278 "Z",

279 "bracketleft",

280 "backslash",

281 "bracketright",

282 "asciicircum",

283 "underscore",

284 "quoteleft",

285 "a",

286 "b",

287 "c",

288 "d",

289 "e",

290 "f",

291 "g",

292 "h",

293 "i",

294 "j",

295 "k",

296 "l",

297 "m",

298 "n",

299 "o",

300 "p",

301 "q",

302 "r",

303 "s",

304 "t",

305 "u",

306 "v",

307 "w",

308 "x",

309 "y",

310 "z",

311 "braceleft",

312 "bar",

313 "braceright",

314 "asciitilde",

315 "exclamdown",

316 "cent",

317 "sterling",

318 "fraction",

319 "yen",

320 "florin",

321 "section",

322 "currency",

323 "quotesingle",

324 "quotedblleft",

325 "guillemotleft",

326 "guilsinglleft",

327 "guilsinglright",

328 "fi",

329 "fl",

330 "endash",

331 "dagger",

332 "daggerdbl",

333 "periodcentered",

334 "paragraph",

335 "bullet",

336 "quotesinglbase",

337 "quotedblbase",

338 "quotedblright",

339 "guillemotright",

340 "ellipsis",

341 "perthousand",

342 "questiondown",

343 "grave",

344 "acute",

345 "circumflex",

346 "tilde",

347 "macron",

348 "breve",

349 "dotaccent",

350 "dieresis",

351 "ring",

352 "cedilla",

353 "hungarumlaut",

354 "ogonek",

355 "caron",

356 "emdash",

357 "AE",

358 "ordfeminine",

359 "Lslash",

360 "Oslash",

361 "OE",

362 "ordmasculine",

363 "ae",

364 "dotlessi",

365 "lslash",

366 "oslash",

367 "oe",

368 "germandbls",

369 "onesuperior",

370 "logicalnot",

371 "mu",

372 "trademark",

373 "Eth",

374 "onehalf",

375 "plusminus",

376 "Thorn",

377 "onequarter",

378 "divide",

379 "brokenbar",

380 "degree",

381 "thorn",

382 "threequarters",

383 "twosuperior",

384 "registered",

385 "minus",

386 "eth",

387 "multiply",

388 "threesuperior",

389 "copyright",

390 "Aacute",

391 "Acircumflex",

392 "Adieresis",

393 "Agrave",

394 "Aring",

395 "Atilde",

396 "Ccedilla",

397 "Eacute",

398 "Ecircumflex",

399 "Edieresis",

400 "Egrave",

401 "Iacute",

402 "Icircumflex",

403 "Idieresis",

404 "Igrave",

405 "Ntilde",

406 "Oacute",

407 "Ocircumflex",

408 "Odieresis",

409 "Ograve",

410 "Otilde",

411 "Scaron",

412 "Uacute",

413 "Ucircumflex",

414 "Udieresis",

415 "Ugrave",

416 "Yacute",

417 "Ydieresis",

418 "Zcaron",

419 "aacute",

420 "acircumflex",

421 "adieresis",

422 "agrave",

423 "aring",

424 "atilde",

425 "ccedilla",

426 "eacute",

427 "ecircumflex",

428 "edieresis",

429 "egrave",

430 "iacute",

431 "icircumflex",

432 "idieresis",

433 "igrave",

434 "ntilde",

435 "oacute",

436 "ocircumflex",

437 "odieresis",

438 "ograve",

439 "otilde",

440 "scaron",

441 "uacute",

442 "ucircumflex",

443 "udieresis",

444 "ugrave",

445 "yacute",

446 "ydieresis",

447 "zcaron",

448 "exclamsmall",

449 "Hungarumlautsmall",

450 "dollaroldstyle",

451 "dollarsuperior",

452 "ampersandsmall",

453 "Acutesmall",

454 "parenleftsuperior",

455 "parenrightsuperior",

456 "twodotenleader",

457 "onedotenleader",

458 "zerooldstyle",

459 "oneoldstyle",

460 "twooldstyle",

461 "threeoldstyle",

462 "fouroldstyle",

463 "fiveoldstyle",

464 "sixoldstyle",

465 "sevenoldstyle",

466 "eightoldstyle",

467 "nineoldstyle",

468 "commasuperior",

469 "threequartersemdash",

470 "periodsuperior",

471 "questionsmall",

472 "asuperior",

473 "bsuperior",

474 "centsuperior",

475 "dsuperior",

476 "esuperior",

477 "isuperior",

478 "lsuperior",

479 "msuperior",

480 "nsuperior",

481 "osuperior",

482 "rsuperior",

483 "ssuperior",

484 "tsuperior",

485 "ff",

486 "ffi",

487 "ffl",

488 "parenleftinferior",

489 "parenrightinferior",

490 "Circumflexsmall",

491 "hyphensuperior",

492 "Gravesmall",

493 "Asmall",

494 "Bsmall",

495 "Csmall",

496 "Dsmall",

497 "Esmall",

498 "Fsmall",

499 "Gsmall",

500 "Hsmall",

501 "Ismall",

502 "Jsmall",

503 "Ksmall",

504 "Lsmall",

505 "Msmall",

506 "Nsmall",

507 "Osmall",

508 "Psmall",

509 "Qsmall",

510 "Rsmall",

511 "Ssmall",

512 "Tsmall",

513 "Usmall",

514 "Vsmall",

515 "Wsmall",

516 "Xsmall",

517 "Ysmall",

518 "Zsmall",

519 "colonmonetary",

520 "onefitted",

521 "rupiah",

522 "Tildesmall",

523 "exclamdownsmall",

524 "centoldstyle",

525 "Lslashsmall",

526 "Scaronsmall",

527 "Zcaronsmall",

528 "Dieresissmall",

529 "Brevesmall",

530 "Caronsmall",

531 "Dotaccentsmall",

532 "Macronsmall",

533 "figuredash",

534 "hypheninferior",

535 "Ogoneksmall",

536 "Ringsmall",

537 "Cedillasmall",

538 "questiondownsmall",

539 "oneeighth",

540 "threeeighths",

541 "fiveeighths",

542 "seveneighths",

543 "onethird",

544 "twothirds",

545 "zerosuperior",

546 "foursuperior",

547 "fivesuperior",

548 "sixsuperior",

549 "sevensuperior",

550 "eightsuperior",

551 "ninesuperior",

552 "zeroinferior",

553 "oneinferior",

554 "twoinferior",

555 "threeinferior",

556 "fourinferior",

557 "fiveinferior",

558 "sixinferior",

559 "seveninferior",

560 "eightinferior",

561 "nineinferior",

562 "centinferior",

563 "dollarinferior",

564 "periodinferior",

565 "commainferior",

566 "Agravesmall",

567 "Aacutesmall",

568 "Acircumflexsmall",

569 "Atildesmall",

570 "Adieresissmall",

571 "Aringsmall",

572 "AEsmall",

573 "Ccedillasmall",

574 "Egravesmall",

575 "Eacutesmall",

576 "Ecircumflexsmall",

577 "Edieresissmall",

578 "Igravesmall",

579 "Iacutesmall",

580 "Icircumflexsmall",

581 "Idieresissmall",

582 "Ethsmall",

583 "Ntildesmall",

584 "Ogravesmall",

585 "Oacutesmall",

586 "Ocircumflexsmall",

587 "Otildesmall",

588 "Odieresissmall",

589 "OEsmall",

590 "Oslashsmall",

591 "Ugravesmall",

592 "Uacutesmall",

593 "Ucircumflexsmall",

594 "Udieresissmall",

595 "Yacutesmall",

596 "Thornsmall",

597 "Ydieresissmall",

598 "001.000",

599 "001.001",

600 "001.002",

601 "001.003",

602 "Black",

603 "Bold",

604 "Book",

605 "Light",

606 "Medium",

607 "Regular",

608 "Roman",

609 "Semibold",

610 )

611

612 class INDEX:

613 def __init__(self, fp: BinaryIO) -> None:

614 self.fp = fp

615 self.offsets: list[int] = []

616 (count, offsize) = struct.unpack(">HB", self.fp.read(3))

617 for _i in range(count + 1):

618 self.offsets.append(nunpack(self.fp.read(offsize)))

619 self.base = self.fp.tell() - 1

620 self.fp.seek(self.base + self.offsets[-1])

621

622 def __repr__(self) -> str:

623 return f"<INDEX: size={len(self)}>"

624

625 def __len__(self) -> int:

626 return len(self.offsets) - 1

627

628 def __getitem__(self, i: int) -> bytes:

629 self.fp.seek(self.base + self.offsets[i])

630 return self.fp.read(self.offsets[i + 1] - self.offsets[i])

631

632 def __iter__(self) -> Iterator[bytes]:

633 return iter(self[i] for i in range(len(self)))

634

635 def __init__(self, name: str, fp: BinaryIO) -> None:

636 self.name = name

637 self.fp = fp

638 # Header

639 (_major, _minor, hdrsize, _offsize) = struct.unpack("BBBB", self.fp.read(4))

640 self.fp.read(hdrsize - 4)

641 # Name INDEX

642 self.name_index = self.INDEX(self.fp)

643 # Top DICT INDEX

644 self.dict_index = self.INDEX(self.fp)

645 # String INDEX

646 self.string_index = self.INDEX(self.fp)

647 # Global Subr INDEX

648 self.subr_index = self.INDEX(self.fp)

649 # Top DICT DATA

650 self.top_dict = getdict(self.dict_index[0])

651 (charset_pos,) = self.top_dict.get(15, [0])

652 (encoding_pos,) = self.top_dict.get(16, [0])

653 (charstring_pos,) = self.top_dict.get(17, [0])

654 # CharStrings

655 self.fp.seek(cast(int, charstring_pos))

656 self.charstring = self.INDEX(self.fp)

657 self.nglyphs = len(self.charstring)

658 # Encodings

659 self.code2gid = {}

660 self.gid2code = {}

661 self.fp.seek(cast(int, encoding_pos))

662 format = self.fp.read(1)

663 if format == b"\x00":

664 # Format 0

665 (n,) = struct.unpack("B", self.fp.read(1))

666 for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))):

667 self.code2gid[code] = gid

668 self.gid2code[gid] = code

669 elif format == b"\x01":

670 # Format 1

671 (n,) = struct.unpack("B", self.fp.read(1))

672 code = 0

673 for _i in range(n):

674 (first, nleft) = struct.unpack("BB", self.fp.read(2))

675 for gid in range(first, first + nleft + 1):

676 self.code2gid[code] = gid

677 self.gid2code[gid] = code

678 code += 1

679 else:

680 raise PDFValueError(f"unsupported encoding format: {format!r}")

681 # Charsets

682 self.name2gid = {}

683 self.gid2name = {}

684 self.fp.seek(cast(int, charset_pos))

685 format = self.fp.read(1)

686 if format == b"\x00":

687 # Format 0

688 n = self.nglyphs - 1

689 for gid, sid in enumerate(

690 cast(

691 tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n))

692 ),

693 ):

694 gid += 1

695 sidname = self.getstr(sid)

696 self.name2gid[sidname] = gid

697 self.gid2name[gid] = sidname

698 elif format == b"\x01":

699 # Format 1

700 (n,) = struct.unpack("B", self.fp.read(1))

701 sid = 0

702 for _i in range(n):

703 (first, nleft) = struct.unpack("BB", self.fp.read(2))

704 for gid in range(first, first + nleft + 1):

705 sidname = self.getstr(sid)

706 self.name2gid[sidname] = gid

707 self.gid2name[gid] = sidname

708 sid += 1

709 elif format == b"\x02":

710 # Format 2

711 raise AssertionError(str(("Unhandled", format)))

712 else:

713 raise PDFValueError(f"unsupported charset format: {format!r}")

714

715 def getstr(self, sid: int) -> str | bytes:

716 # This returns str for one of the STANDARD_STRINGS but bytes otherwise,

717 # and appears to be a needless source of type complexity.

718 if sid < len(self.STANDARD_STRINGS):

719 return self.STANDARD_STRINGS[sid]

720 return self.string_index[sid - len(self.STANDARD_STRINGS)]

721

722

class TrueTypeFont:
    """Reader for the table directory and ``cmap`` table of a TrueType font."""

    class CMapNotFound(PDFException):
        """Raised when no usable character-to-glyph mapping could be read."""

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Read the table directory from *fp*.

        ``self.tables`` maps each 4-byte table tag to ``(offset, length)``.
        Truncated input is tolerated: the tables read so far are kept.
        """
        self.name = name
        self.fp = fp
        self.tables: dict[bytes, tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = cast(
                tuple[int, int, int, int],
                struct.unpack(">HHHH", fp.read(8)),
            )
            for _ in range(ntables):
                (name_bytes, _tsum, offset, length) = cast(
                    tuple[bytes, int, int, int],
                    struct.unpack(">4sLLL", fp.read(16)),
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a glyph-id -> unicode map from the font's ``cmap`` table.

        Only Unicode subtables (platform 0, or platform 3 with encoding 1 or
        10) in format 0, 2 or 4 are used. Subtables in any other format are
        skipped, so one unsupported subtable no longer prevents the use of a
        supported one.

        :raises TrueTypeFont.CMapNotFound: if there is no ``cmap`` table or no
            supported subtable produced any mapping.
        :raises struct.error: if the table data is truncated.
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, _length) = self.tables[b"cmap"]
        fp = self.fp
        fp.seek(base_offset)
        (_version, nsubtables) = cast(tuple[int, int], struct.unpack(">HH", fp.read(4)))
        subtables: list[tuple[int, int, int]] = []
        for _i in range(nsubtables):
            subtables.append(
                cast(tuple[int, int, int], struct.unpack(">HHL", fp.read(8))),
            )
        char2gid: dict[int, int] = {}
        # Only supports subtable type 0, 2 and 4.
        for platform_id, encoding_id, st_offset in subtables:
            # Skip non-Unicode cmaps.
            # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
            if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
                continue
            fp.seek(base_offset + st_offset)
            (fmttype, _fmtlen, _fmtlang) = cast(
                tuple[int, int, int],
                struct.unpack(">HHH", fp.read(6)),
            )
            if fmttype == 0:
                char2gid.update(
                    enumerate(
                        cast(tuple[int, ...], struct.unpack(">256B", fp.read(256))),
                    ),
                )
            elif fmttype == 2:
                subheaderkeys = cast(
                    tuple[int, ...],
                    struct.unpack(">256H", fp.read(512)),
                )
                firstbytes = [0] * 8192
                for i, k in enumerate(subheaderkeys):
                    firstbytes[k // 8] = i
                nhdrs = max(subheaderkeys) // 8 + 1
                hdrs: list[tuple[int, int, int, int, int]] = []
                for i in range(nhdrs):
                    (firstcode, entcount, delta, offset) = cast(
                        tuple[int, int, int, int],
                        struct.unpack(">HHhH", fp.read(8)),
                    )
                    # The range offset is relative to its own field, which is
                    # the last two bytes just read.
                    hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
                for i, firstcode, entcount, delta, pos in hdrs:
                    if not entcount:
                        continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        gid = cast(tuple[int], struct.unpack(">H", fp.read(2)))[0]
                        if gid:
                            gid += delta
                        char2gid[first + c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = cast(
                    tuple[int, int, int, int],
                    struct.unpack(">HHHH", fp.read(8)),
                )
                # The stored value is twice the segment count.
                segcount //= 2
                ecs = cast(
                    tuple[int, ...],
                    struct.unpack(f">{segcount}H", fp.read(2 * segcount)),
                )
                fp.read(2)  # reserved padding between end and start codes
                scs = cast(
                    tuple[int, ...],
                    struct.unpack(f">{segcount}H", fp.read(2 * segcount)),
                )
                idds = cast(
                    tuple[int, ...],
                    struct.unpack(f">{segcount}h", fp.read(2 * segcount)),
                )
                # File position of the first idRangeOffset entry.
                pos = fp.tell()
                idrs = cast(
                    tuple[int, ...],
                    struct.unpack(f">{segcount}H", fp.read(2 * segcount)),
                )
                for seg, (ec, sc, idd, idr) in enumerate(
                    zip(ecs, scs, idds, idrs, strict=False),
                ):
                    if idr:
                        # idRangeOffset is relative to the location of its own
                        # entry (pos + 2 * seg), not to the start of the array.
                        fp.seek(pos + 2 * seg + idr)
                        for c in range(sc, ec + 1):
                            b = cast(tuple[int], struct.unpack(">H", fp.read(2)))[0]
                            # A stored glyph index of 0 means "missing glyph";
                            # the delta only applies to non-zero entries.
                            char2gid[c] = (b + idd) & 0xFFFF if b else 0
                    else:
                        for c in range(sc, ec + 1):
                            char2gid[c] = (c + idd) & 0xFFFF
            else:
                # Unsupported subtable format: ignore it and try the others.
                log.debug("Skipping unsupported cmap subtable format %r", fmttype)
                continue
        if not char2gid:
            raise TrueTypeFont.CMapNotFound
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map

847

848

class PDFFontError(PDFException):
    """Base class for font-related errors raised in this module."""

851

852

class PDFUnicodeNotDefined(PDFFontError):
    """Raised by ``to_unichr`` when a character id has no unicode mapping."""

855

856

# Encoding name PDFSimpleFont falls back to when a non-TrueType font has no
# /Encoding entry.
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
# Not referenced in the part of the file visible here.
LITERAL_TYPE1C = LIT("Type1C")

# Font widths are maintained in a dict type that maps from *either* unicode
# chars or integer character IDs.
FontWidthDict = dict[int | str, float]

863

864

class PDFFont:
    """Common metrics and width lookup shared by all font classes.

    Subclasses provide ``to_unichr`` and may override ``decode``,
    ``is_vertical``, ``is_multibyte`` and ``char_disp``.
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: float | None = None,
    ) -> None:
        """Read the font metrics from *descriptor*.

        :param descriptor: font descriptor dictionary.
        :param widths: widths keyed by character id or unicode string.
        :param default_width: width for characters missing from *widths*;
            when ``None`` the descriptor's ``MissingWidth`` (default 0) is
            used.
        """
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)

        fontname = resolve1(descriptor.get("FontName", "unknown"))
        self.fontname = fontname
        if isinstance(fontname, PSLiteral):
            self.fontname = literal_name(fontname)

        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))

        fallback_width = (
            num_value(descriptor.get("MissingWidth", 0))
            if default_width is None
            else default_width
        )
        self.default_width = resolve1(fallback_width)

        self.leading = num_value(descriptor.get("Leading", 0))
        self.bbox = self._parse_bbox(descriptor)
        # Metrics are scaled by 1/1000 unless a subclass overrides the scale.
        self.hscale = self.vscale = 0.001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        return False

    def is_multibyte(self) -> bool:
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split a byte string into character ids (one per byte here)."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self) -> float:
        """Scaled bounding-box width; negated default width if the box is empty."""
        width = self.bbox[2] - self.bbox[0]
        if width == 0:
            width = -self.default_width
        return width * self.hscale

    def get_height(self) -> float:
        """Scaled bounding-box height; ascent minus descent if the box is empty."""
        height = self.bbox[3] - self.bbox[1]
        if height == 0:
            height = self.ascent - self.descent
        return height * self.vscale

    def char_width(self, cid: int) -> float:
        """Return the scaled width of character *cid*.

        Falls back to ``default_width`` when neither the id nor its unicode
        string has a usable entry in ``widths``.
        """
        # Because character widths may be mapping either IDs or strings,
        # we try to lookup the character ID first, then its str equivalent.
        width = safe_float(self.widths.get(cid))
        if width is not None:
            return width * self.hscale

        try:
            unichr = self.to_unichr(cid)
            width = safe_float(self.widths.get(unichr))
        except PDFUnicodeNotDefined:
            width = None

        if width is not None:
            return width * self.hscale
        return self.default_width * self.hscale

    def char_disp(self, cid: int) -> float | tuple[float | None, float]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Sum of ``char_width`` over every character id decoded from *s*."""
        return sum(map(self.char_width, self.decode(s)))

    def to_unichr(self, cid: int) -> str:
        raise NotImplementedError

    @staticmethod
    def _parse_bbox(descriptor: Mapping[str, Any]) -> Rect:
        """Parse FontBBox from the fonts descriptor"""
        font_bbox = resolve_all(descriptor.get("FontBBox"))
        bbox = safe_rect_list(font_bbox)
        if bbox is not None:
            return bbox
        log.warning(
            f"Could not get FontBBox from font descriptor because "
            f"{font_bbox!r} cannot be parsed as 4 floats"
        )
        return 0.0, 0.0, 0.0, 0.0

969

970

class PDFSimpleFont(PDFFont):
    """Single-byte font whose codes are mapped through an encoding table."""

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        spec: Mapping[str, Any],
    ) -> None:
        """Set up ``cid2unicode`` and the optional ``unicode_map``.

        :param descriptor: font descriptor dictionary.
        :param widths: widths keyed by character id or unicode string.
        :param spec: font dictionary; ``Subtype``, ``Encoding`` and
            ``ToUnicode`` are consulted.
        """
        # Font encoding is specified either by a name of
        # built-in encoding or a dictionary that describes
        # the differences.

        # PDF spec: TrueType fonts without Encoding default to WinAnsiEncoding
        is_truetype = literal_name(spec.get("Subtype")) == "TrueType"
        default_encoding = (
            LIT("WinAnsiEncoding") if is_truetype else LITERAL_STANDARD_ENCODING
        )

        encoding = resolve1(spec["Encoding"]) if "Encoding" in spec else default_encoding

        if not isinstance(encoding, dict):
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
        else:
            base_name = literal_name(encoding.get("BaseEncoding", default_encoding))
            differences = list_value(encoding.get("Differences", []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)

        self.unicode_map: UnicodeMap | None = None
        if "ToUnicode" in spec:
            tounicode = stream_value(spec["ToUnicode"])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, BytesIO(tounicode.get_data())).run()
        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid: int) -> str:
        """Map *cid* to unicode, preferring ``unicode_map`` over the encoding.

        :raises PDFUnicodeNotDefined: if neither source knows *cid*.
        """
        if self.unicode_map:
            try:
                return self.unicode_map.get_unichr(cid)
            except KeyError:
                # Fall through to the encoding table below.
                pass
        try:
            return self.cid2unicode[cid]
        except KeyError as err:
            raise PDFUnicodeNotDefined(None, cid) from err

1015

1016

class PDFType1Font(PDFSimpleFont):
    """Type1 font; metrics come from ``FontMetricsDB`` when the name is known."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from its dictionary *spec*.

        :raises PDFFontError: in strict mode, when ``BaseFont`` is missing.
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing") from None
            self.basefont = "unknown"

        widths: FontWidthDict
        try:
            metrics = FontMetricsDB.get_metrics(self.basefont)
        except KeyError:
            # No stored metrics for this name: use what the font dictionary says.
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            firstchar = int_value(spec.get("FirstChar", 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            width_list = list_value(spec.get("Widths", [0] * 256))
            widths = {
                code: resolve1(w) for (code, w) in enumerate(width_list, firstchar)
            }
        else:
            (descriptor, int_widths) = metrics
            widths = cast(dict[str | int, float], int_widths)  # implicit int->float

        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        if "Encoding" in spec or "FontFile" not in descriptor:
            return
        # try to recover the missing encoding info from the font file.
        self.fontfile = stream_value(descriptor.get("FontFile"))
        length1 = int_value(self.fontfile["Length1"])
        header = self.fontfile.get_data()[:length1]
        parser = Type1FontHeaderParser(BytesIO(header))
        self.cid2unicode = parser.get_encoding()

    def __repr__(self) -> str:
        return f"<PDFType1Font: basefont={self.basefont!r}>"

1047

1048

class PDFTrueTypeFont(PDFType1Font):
    """TrueType simple font; differs from ``PDFType1Font`` only in its repr."""

    def __repr__(self) -> str:
        return f"<PDFTrueTypeFont: basefont={self.basefont!r}>"

1052

1053

class PDFType3Font(PDFSimpleFont):
    """Type3 font; scale and vertical metrics come from the font dictionary."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from its dictionary *spec*.

        Without a ``FontDescriptor`` entry a minimal descriptor is synthesised
        from ``spec["FontBBox"]``. After the base initialisation, descent and
        ascent are taken from the bounding box and the scales from
        ``FontMatrix``.
        """
        firstchar = int_value(spec.get("FirstChar", 0))
        # lastchar = int_value(spec.get('LastChar', 0))
        width_list = list_value(spec.get("Widths", [0] * 256))
        widths: dict[str | int, float] = dict(enumerate(width_list, start=firstchar))

        descriptor = (
            dict_value(spec["FontDescriptor"])
            if "FontDescriptor" in spec
            else {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
        )
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix"))))
        (_, y0, _, y1) = self.bbox
        self.descent = y0
        self.ascent = y1
        (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))

    def __repr__(self) -> str:
        return "<PDFType3Font>"

1073

1074

class PDFCIDFont(PDFFont):
    """Multi-byte font whose bytes are decoded to character ids by a CMap."""

    default_disp: float | tuple[float | None, float]

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        spec: Mapping[str, Any],
        strict: bool = settings.STRICT,
    ) -> None:
        """Build the font from its dictionary *spec*.

        :param strict: when true, a missing ``BaseFont`` or ``FontDescriptor``
            (and the cmap problems reported by ``get_cmap_from_spec``) raise
            ``PDFFontError`` instead of falling back to defaults.
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if strict:
                raise PDFFontError("BaseFont is missing") from None
            self.basefont = "unknown"

        self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
        sysinfo = self.cidsysteminfo
        cid_registry = resolve1(sysinfo.get("Registry", b"unknown")).decode("latin1")
        cid_ordering = resolve1(sysinfo.get("Ordering", b"unknown")).decode("latin1")
        self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
        self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec["FontDescriptor"])
        except KeyError:
            if strict:
                raise PDFFontError("FontDescriptor is missing") from None
            descriptor = {}

        ttf = None
        if "FontFile2" in descriptor:
            self.fontfile = stream_value(descriptor.get("FontFile2"))
            ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))

        # Pick the source of the cid -> unicode mapping.
        self.unicode_map: UnicodeMap | None = None
        if "ToUnicode" in spec:
            if isinstance(spec["ToUnicode"], PDFStream):
                tounicode = stream_value(spec["ToUnicode"])
                self.unicode_map = FileUnicodeMap()
                CMapParser(self.unicode_map, BytesIO(tounicode.get_data())).run()
            else:
                cmap_name = literal_name(spec["ToUnicode"])
                encoding = literal_name(spec["Encoding"])
                candidates = (cid_ordering, cmap_name, encoding)
                if any("Identity" in text for text in candidates):
                    self.unicode_map = IdentityUnicodeMap()
        elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
            if ttf:
                with contextlib.suppress(TrueTypeFont.CMapNotFound):
                    self.unicode_map = ttf.create_unicode_map()
        else:
            with contextlib.suppress(CMapDB.CMapNotFound):
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding,
                    self.cmap.is_vertical(),
                )

        self.vertical = self.cmap.is_vertical()
        widths: dict[str | int, float]
        if not self.vertical:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get("W", [])))
            default_width = spec.get("DW", 1000)
        else:
            # writing mode: vertical
            widths2 = get_widths2(list_value(spec.get("W2", [])))
            self.disps = {
                cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()
            }
            (default_vy, default_w) = resolve1(spec.get("DW2", [880, -1000]))
            self.default_disp = (None, default_vy)
            widths = {cid: width for (cid, (width, _)) in widths2.items()}
            default_width = default_w
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.

        An unknown cmap yields an empty ``CMap`` unless *strict* is set, in
        which case ``PDFFontError`` is raised.
        """
        cmap_name = self._get_cmap_name(spec, strict)

        try:
            cmap = CMapDB.get_cmap(cmap_name)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e) from e
            return CMap()
        return cmap

    @staticmethod
    def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
        """Get cmap name from font specification"""
        cmap_name = "unknown"  # default value

        try:
            spec_encoding = spec["Encoding"]
            # Objects exposing ``name`` are used directly; anything else is
            # expected to carry the name under its ``CMapName`` key.
            name_source = (
                spec_encoding
                if hasattr(spec_encoding, "name")
                else spec_encoding["CMapName"]
            )
            cmap_name = literal_name(name_source)
        except KeyError:
            if strict:
                raise PDFFontError("Encoding is unspecified") from None

        if type(cmap_name) is PDFStream:  # type: ignore[comparison-overlap]
            cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
            if "CMapName" in cmap_name_stream:
                cmap_name = cmap_name_stream.get("CMapName").name
            elif strict:
                raise PDFFontError("CMapName unspecified for encoding")

        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

    def __repr__(self) -> str:
        return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"

    def is_vertical(self) -> bool:
        return self.vertical

    def is_multibyte(self) -> bool:
        return True

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split a byte string into character ids using the font's cmap."""
        return self.cmap.decode(bytes)

    def char_disp(self, cid: int) -> float | tuple[float | None, float]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid: int) -> str:
        """Map *cid* to unicode through ``unicode_map``.

        :raises PDFUnicodeNotDefined: if there is no unicode map or it does
            not know *cid*.
        """
        try:
            if not self.unicode_map:
                raise PDFKeyError(cid)
            return self.unicode_map.get_unichr(cid)
        except KeyError as err:
            raise PDFUnicodeNotDefined(self.cidcoding, cid) from err