Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdffont.py: 67%

1import logging

2import struct

3from io import BytesIO

4from typing import (

5 TYPE_CHECKING,

6 Any,

7 BinaryIO,

8 Dict,

9 Iterable,

10 Iterator,

11 List,

12 Mapping,

13 Optional,

14 Tuple,

15 Union,

16 cast,

17)

19from pdfminer import settings

20from pdfminer.casting import safe_float, safe_rect_list

21from pdfminer.cmapdb import (

22 CMap,

23 CMapBase,

24 CMapDB,

25 CMapParser,

26 FileUnicodeMap,

27 IdentityUnicodeMap,

28 UnicodeMap,

29)

30from pdfminer.encodingdb import EncodingDB, name2unicode

31from pdfminer.fontmetrics import FONT_METRICS

32from pdfminer.pdfexceptions import PDFException, PDFKeyError, PDFValueError

33from pdfminer.pdftypes import (

34 PDFStream,

35 dict_value,

36 int_value,

37 list_value,

38 num_value,

39 resolve1,

40 resolve_all,

41 stream_value,

42)

43from pdfminer.psexceptions import PSEOF

44from pdfminer.psparser import (

45 KWD,

46 LIT,

47 PSKeyword,

48 PSLiteral,

49 PSStackParser,

50 literal_name,

51)

52from pdfminer.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack

54if TYPE_CHECKING:

55 from pdfminer.pdfinterp import PDFResourceManager

57log = logging.getLogger(__name__)

def get_widths(seq: Iterable[object]) -> Dict[Union[str, int], float]:
    """Build a mapping of character widths for horizontal writing.

    Two layouts are recognised while walking *seq* (every element is first
    passed through ``resolve1``):

    * a number followed by a list: the list holds the widths of consecutive
      character codes starting at that number;
    * three numbers ``first last width``: every code in ``first..last``
      (inclusive) receives ``width``.

    Anything that is neither a number nor a list is reported through
    ``log.warning`` and skipped.

    :param seq: the raw width specification.
    :returns: mapping from character code to width.
    """
    table: Dict[int, float] = {}
    pending: List[float] = []
    for item in seq:
        item = resolve1(item)

        if isinstance(item, list):
            # A list is only meaningful right after its starting code.
            if pending:
                start = pending[-1]
                for offset, width in enumerate(item):
                    table[cast(int, start) + offset] = width
                pending = []
            continue

        if not isinstance(item, (int, float)):  # == not utils.isnumber(item)
            log.warning(
                f"Skipping invalid font width specification for {item} because it is not a number or a list"
            )
            continue

        pending.append(item)
        if len(pending) != 3:
            continue

        # Three numbers collected: "first last width".
        (first, last, width) = pending
        pending = []
        if isinstance(first, int) and isinstance(last, int):
            for code in range(cast(int, first), cast(int, last) + 1):
                table[code] = width
        else:
            log.warning(
                f"Skipping invalid font width specification for {first} to {last} because either of them is not an int"
            )
    return cast(Dict[Union[str, int], float], table)

def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    Two layouts are recognised while walking *seq*:

    * a number followed by a list: the list is consumed three values at a
      time (``w vx vy``) for consecutive character codes starting at that
      number;
    * five numbers ``first last w vx vy``: every code in ``first..last``
      (inclusive) receives ``(w, (vx, vy))``.

    Mirrors :func:`get_widths`: every element is passed through ``resolve1``
    before it is classified, and a range whose bounds are not both ``int`` is
    logged and skipped instead of letting ``range()`` raise ``TypeError``.

    :param seq: the raw vertical metrics specification.
    :returns: mapping from character code to ``(width, (vx, vy))``.
    """
    widths: Dict[int, Tuple[float, Point]] = {}
    r: List[float] = []
    for v in seq:
        # Same treatment as get_widths: elements may need resolving first.
        v = resolve1(v)
        if isinstance(v, list):
            if r:
                char1 = r[-1]
                for i, (w, vx, vy) in enumerate(choplist(3, v)):
                    widths[cast(int, char1) + i] = (w, (vx, vy))
                r = []
        elif isinstance(v, (int, float)):  # == utils.isnumber(v)
            r.append(v)
            if len(r) == 5:
                (char1, char2, w, vx, vy) = r
                if isinstance(char1, int) and isinstance(char2, int):
                    for i in range(char1, char2 + 1):
                        widths[i] = (w, (vx, vy))
                else:
                    log.warning(
                        f"Skipping invalid font width specification for {char1} to {char2} because either of them is not an int"
                    )
                r = []
    return widths

110

111

class FontMetricsDB:
    """Thin lookup facade over the ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname: str) -> Tuple[Dict[str, object], Dict[str, int]]:
        """Return the ``FONT_METRICS`` entry stored under *fontname*.

        The entry is a ``(descriptor, widths)`` pair. No fallback is applied:
        whatever the table lookup raises for an unknown name propagates to
        the caller (``PDFType1Font`` handles ``KeyError``).
        """
        metrics = FONT_METRICS[fontname]
        return metrics

116

117

# int here means that we're not extending PSStackParser with additional types.
class Type1FontHeaderParser(PSStackParser[int]):
    """Extracts the code-to-name encoding from the header of a Type1 font."""

    KEYWORD_BEGIN = KWD(b"begin")
    KEYWORD_END = KWD(b"end")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_PUT = KWD(b"put")
    KEYWORD_DICT = KWD(b"dict")
    KEYWORD_ARRAY = KWD(b"array")
    KEYWORD_READONLY = KWD(b"readonly")
    KEYWORD_FOR = KWD(b"for")

    def __init__(self, data: BinaryIO) -> None:
        """Start parsing *data* with an empty cid-to-unicode table."""
        super().__init__(data)
        self._cid2unicode: Dict[int, str] = {}

    def get_encoding(self) -> Dict[int, str]:
        """Parse the font encoding.

        A Type1 encoding maps character codes to character names. A name is
        either a standard Adobe glyph name or a name bound to a custom
        CharString of this font (a CharString being the sequence of drawing
        operations for one character). Currently, this function returns ''
        (empty string) for character names that are associated with a
        CharStrings.

        Names that ``name2unicode`` rejects with ``KeyError`` are logged at
        debug level and left out of the result.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode characters
        """
        while True:
            try:
                cid, glyph_name = self.nextobject()
            except PSEOF:
                # Input exhausted: everything collected so far is the result.
                break
            try:
                unicode_text = name2unicode(cast(str, glyph_name))
            except KeyError as e:
                log.debug(str(e))
            else:
                self._cid2unicode[cid] = unicode_text
        return self._cid2unicode

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """React to a keyword token; only ``put`` has an effect.

        On ``put`` the two topmost stack entries are popped and, when they
        are an ``int`` key and a ``PSLiteral`` value, the pair
        ``(key, literal_name(value))`` is added to the parser results.
        """
        if token is not self.KEYWORD_PUT:
            return
        ((_, key), (_, value)) = self.pop(2)
        if not isinstance(key, int):
            return
        if isinstance(value, PSLiteral):
            self.add_results((key, literal_name(value)))

163

164

# Text contributed by each 4-bit nibble of a real-number operand, indexed by
# the nibble value; ``getdict`` concatenates the pieces and feeds the result
# to ``float``. Index 13 has no text (None); nibble 15 is handled by
# ``getdict`` itself as the terminator and therefore has no entry.
NIBBLES = (*"0123456789.", "e", "e-", None, "-")

# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = dict(
    [
        ("DLIdent-H", "Identity-H"),
        ("DLIdent-V", "Identity-V"),
    ]
)

173

174

def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]:
    """Decode a byte string of operands and operators into a dictionary.

    Bytes are consumed left to right. A byte ``<= 21`` is an operator: the
    operands gathered since the previous operator are stored under it and the
    operand list starts afresh. Every other leading byte introduces one
    operand:

    * ``30``       -> real number packed as nibbles (see ``NIBBLES``);
    * ``32..246``  -> ``b0 - 139``;
    * ``247..250`` -> ``((b0 - 247) << 8) + b1 + 108``;
    * ``251..254`` -> ``-((b0 - 251) << 8) - b1 - 108``;
    * ``28``       -> signed 16-bit big-endian integer;
    * otherwise    -> signed 32-bit big-endian integer.

    Operands that are not followed by an operator are dropped.

    :param data: the encoded dictionary bytes.
    :returns: mapping from operator byte to its list of operands.
    """
    result: Dict[int, List[Union[float, int]]] = {}
    stream = BytesIO(data)
    operands: List[Union[float, int]] = []
    while True:
        head = stream.read(1)
        if not head:
            break
        b0 = head[0]

        if b0 <= 21:
            # Operator: flush the pending operands under it.
            result[b0] = operands
            operands = []
            continue

        operand: Union[float, int]
        if b0 == 30:
            # Real number: two nibbles per byte until a 0xF nibble is seen.
            text = ""
            finished = False
            while not finished:
                packed = ord(stream.read(1))
                for nib in (packed >> 4, packed & 15):
                    if nib == 15:
                        finished = True
                    else:
                        piece = NIBBLES[nib]
                        assert piece is not None
                        text += piece
            operand = float(text)
        elif 32 <= b0 <= 246:
            operand = b0 - 139
        else:
            b1 = ord(stream.read(1))
            if 247 <= b0 <= 250:
                operand = ((b0 - 247) << 8) + b1 + 108
            elif 251 <= b0 <= 254:
                operand = -((b0 - 251) << 8) - b1 - 108
            else:
                b2 = ord(stream.read(1))
                if b1 >= 128:
                    # The leading byte carries the sign.
                    b1 -= 256
                if b0 == 28:
                    operand = b1 << 8 | b2
                else:
                    (low_word,) = struct.unpack(">H", stream.read(2))
                    operand = b1 << 24 | b2 << 16 | low_word
        operands.append(operand)
    return result

219

220

class CFFFont:
    """Reader for the naming/encoding tables of a Compact Font Format font.

    Construction parses the header, the Name, Top DICT, String and Global
    Subr INDEXes, the CharStrings INDEX, and then the encoding and charset
    referenced by the first Top DICT (operators 16 and 15).

    Attributes set by ``__init__``:

    * ``name_index``, ``dict_index``, ``string_index``, ``subr_index``,
      ``charstring`` -- the parsed INDEX structures;
    * ``top_dict`` -- the first Top DICT decoded by ``getdict``;
    * ``nglyphs`` -- number of entries of the CharStrings INDEX;
    * ``code2gid`` / ``gid2code`` -- character code <-> glyph id;
    * ``name2gid`` / ``gid2name`` -- glyph name <-> glyph id.

    Layouts follow Adobe Technical Note #5176 (The Compact Font Format
    Specification).
    """

    # The standard strings, indexed by string id (SID 0..390).
    STANDARD_STRINGS = (
        ".notdef", "space", "exclam", "quotedbl", "numbersign", "dollar", "percent",
        "ampersand", "quoteright", "parenleft", "parenright", "asterisk", "plus",
        "comma", "hyphen", "period", "slash",
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
        "colon", "semicolon", "less", "equal", "greater", "question", "at",
        "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
        "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
        "bracketleft", "backslash", "bracketright", "asciicircum", "underscore",
        "quoteleft",
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
        "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
        "braceleft", "bar", "braceright", "asciitilde",
        "exclamdown", "cent", "sterling", "fraction", "yen", "florin", "section",
        "currency",
        "quotesingle", "quotedblleft", "guillemotleft", "guilsinglleft",
        "guilsinglright",
        "fi", "fl", "endash", "dagger", "daggerdbl", "periodcentered", "paragraph",
        "bullet",
        "quotesinglbase", "quotedblbase", "quotedblright", "guillemotright",
        "ellipsis", "perthousand", "questiondown",
        "grave", "acute", "circumflex", "tilde", "macron", "breve", "dotaccent",
        "dieresis",
        "ring", "cedilla", "hungarumlaut", "ogonek", "caron", "emdash",
        "AE", "ordfeminine", "Lslash", "Oslash", "OE", "ordmasculine",
        "ae", "dotlessi", "lslash", "oslash", "oe", "germandbls",
        "onesuperior", "logicalnot", "mu", "trademark", "Eth", "onehalf",
        "plusminus",
        "Thorn", "onequarter", "divide", "brokenbar", "degree", "thorn",
        "threequarters",
        "twosuperior", "registered", "minus", "eth", "multiply", "threesuperior",
        "copyright",
        "Aacute", "Acircumflex", "Adieresis", "Agrave", "Aring", "Atilde",
        "Ccedilla",
        "Eacute", "Ecircumflex", "Edieresis", "Egrave",
        "Iacute", "Icircumflex", "Idieresis", "Igrave",
        "Ntilde", "Oacute", "Ocircumflex", "Odieresis", "Ograve", "Otilde",
        "Scaron",
        "Uacute", "Ucircumflex", "Udieresis", "Ugrave", "Yacute", "Ydieresis",
        "Zcaron",
        "aacute", "acircumflex", "adieresis", "agrave", "aring", "atilde",
        "ccedilla",
        "eacute", "ecircumflex", "edieresis", "egrave",
        "iacute", "icircumflex", "idieresis", "igrave",
        "ntilde", "oacute", "ocircumflex", "odieresis", "ograve", "otilde",
        "scaron",
        "uacute", "ucircumflex", "udieresis", "ugrave", "yacute", "ydieresis",
        "zcaron",
        "exclamsmall", "Hungarumlautsmall", "dollaroldstyle", "dollarsuperior",
        "ampersandsmall", "Acutesmall",
        "parenleftsuperior", "parenrightsuperior", "twodotenleader",
        "onedotenleader",
        "zerooldstyle", "oneoldstyle", "twooldstyle", "threeoldstyle",
        "fouroldstyle", "fiveoldstyle", "sixoldstyle", "sevenoldstyle",
        "eightoldstyle", "nineoldstyle",
        "commasuperior", "threequartersemdash", "periodsuperior", "questionsmall",
        "asuperior", "bsuperior", "centsuperior", "dsuperior", "esuperior",
        "isuperior", "lsuperior", "msuperior", "nsuperior", "osuperior",
        "rsuperior", "ssuperior", "tsuperior",
        "ff", "ffi", "ffl", "parenleftinferior", "parenrightinferior",
        "Circumflexsmall", "hyphensuperior", "Gravesmall",
        "Asmall", "Bsmall", "Csmall", "Dsmall", "Esmall", "Fsmall", "Gsmall",
        "Hsmall", "Ismall", "Jsmall", "Ksmall", "Lsmall", "Msmall", "Nsmall",
        "Osmall", "Psmall", "Qsmall", "Rsmall", "Ssmall", "Tsmall", "Usmall",
        "Vsmall", "Wsmall", "Xsmall", "Ysmall", "Zsmall",
        "colonmonetary", "onefitted", "rupiah", "Tildesmall", "exclamdownsmall",
        "centoldstyle",
        "Lslashsmall", "Scaronsmall", "Zcaronsmall", "Dieresissmall", "Brevesmall",
        "Caronsmall", "Dotaccentsmall", "Macronsmall",
        "figuredash", "hypheninferior", "Ogoneksmall", "Ringsmall", "Cedillasmall",
        "questiondownsmall",
        "oneeighth", "threeeighths", "fiveeighths", "seveneighths", "onethird",
        "twothirds",
        "zerosuperior", "foursuperior", "fivesuperior", "sixsuperior",
        "sevensuperior", "eightsuperior", "ninesuperior",
        "zeroinferior", "oneinferior", "twoinferior", "threeinferior",
        "fourinferior", "fiveinferior", "sixinferior", "seveninferior",
        "eightinferior", "nineinferior",
        "centinferior", "dollarinferior", "periodinferior", "commainferior",
        "Agravesmall", "Aacutesmall", "Acircumflexsmall", "Atildesmall",
        "Adieresissmall", "Aringsmall", "AEsmall", "Ccedillasmall",
        "Egravesmall", "Eacutesmall", "Ecircumflexsmall", "Edieresissmall",
        "Igravesmall", "Iacutesmall", "Icircumflexsmall", "Idieresissmall",
        "Ethsmall", "Ntildesmall", "Ogravesmall", "Oacutesmall",
        "Ocircumflexsmall", "Otildesmall", "Odieresissmall", "OEsmall",
        "Oslashsmall",
        "Ugravesmall", "Uacutesmall", "Ucircumflexsmall", "Udieresissmall",
        "Yacutesmall", "Thornsmall", "Ydieresissmall",
        "001.000", "001.001", "001.002", "001.003",
        "Black", "Bold", "Book", "Light", "Medium", "Regular", "Roman", "Semibold",
    )

    # Number of glyphs in the predefined ISOAdobe charset (SIDs 0..228).
    _ISOADOBE_NGLYPHS = 229

    class INDEX:
        """One INDEX structure: a counted array of variable-length byte strings.

        The constructor reads the count and offset array from *fp* and leaves
        the stream positioned just past the object data, so consecutive
        INDEXes can be read back to back.
        """

        def __init__(self, fp: BinaryIO) -> None:
            self.fp = fp
            self.offsets: List[int] = []
            (count,) = struct.unpack(">H", self.fp.read(2))
            if count == 0:
                # An empty INDEX consists of the 2-byte count only: there is
                # no offSize byte and no offset array to consume. A single
                # sentinel offset keeps len() == 0 and the base arithmetic
                # below consistent with the non-empty case.
                self.offsets.append(1)
                self.base = self.fp.tell() - 1
                return
            (offsize,) = struct.unpack("B", self.fp.read(1))
            for _ in range(count + 1):
                self.offsets.append(int.from_bytes(self.fp.read(offsize), "big"))
            # Offsets are 1-based relative to the byte preceding the data.
            self.base = self.fp.tell() - 1
            self.fp.seek(self.base + self.offsets[-1])

        def __repr__(self) -> str:
            return "<INDEX: size=%d>" % len(self)

        def __len__(self) -> int:
            return len(self.offsets) - 1

        def __getitem__(self, i: int) -> bytes:
            """Return the raw bytes of object *i* (moves the stream position)."""
            self.fp.seek(self.base + self.offsets[i])
            return self.fp.read(self.offsets[i + 1] - self.offsets[i])

        def __iter__(self) -> Iterator[bytes]:
            return iter(self[i] for i in range(len(self)))

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Parse the font program readable from *fp*.

        :param name: stored as ``self.name``; not otherwise interpreted.
        :param fp: seekable binary stream positioned at the font header.
        :raises PDFValueError: for an encoding or charset table whose format
            byte is not supported.
        :raises struct.error: when the data is truncated.
        """
        self.name = name
        self.fp = fp
        # Header
        (_major, _minor, hdrsize, _offsize) = struct.unpack("BBBB", self.fp.read(4))
        self.fp.read(hdrsize - 4)
        # Name INDEX
        self.name_index = self.INDEX(self.fp)
        # Top DICT INDEX
        self.dict_index = self.INDEX(self.fp)
        # String INDEX
        self.string_index = self.INDEX(self.fp)
        # Global Subr INDEX
        self.subr_index = self.INDEX(self.fp)
        # Top DICT DATA
        self.top_dict = getdict(self.dict_index[0])
        (charset_pos,) = self.top_dict.get(15, [0])
        (encoding_pos,) = self.top_dict.get(16, [0])
        (charstring_pos,) = self.top_dict.get(17, [0])
        # CharStrings
        self.fp.seek(cast(int, charstring_pos))
        self.charstring = self.INDEX(self.fp)
        self.nglyphs = len(self.charstring)
        # Encodings
        (self.code2gid, self.gid2code) = self._read_encoding(cast(int, encoding_pos))
        # Charsets
        (self.name2gid, self.gid2name) = self._read_charset(cast(int, charset_pos))

    def _read_encoding(self, pos: int) -> Tuple[Dict[int, int], Dict[int, int]]:
        """Read the encoding table at *pos*; return ``(code2gid, gid2code)``.

        Glyph 0 (.notdef) is never encoded, so the first listed code belongs
        to glyph 1.
        """
        code2gid: Dict[int, int] = {}
        gid2code: Dict[int, int] = {}
        if pos in (0, 1):
            # 0 and 1 are ids of the predefined Standard / Expert encodings,
            # not file offsets. Their tables are not available here, so both
            # maps stay empty rather than being filled from header bytes.
            return code2gid, gid2code
        self.fp.seek(pos)
        fmt = self.fp.read(1)
        if fmt == b"\x00":
            # Format 0: nCodes, then one code per glyph starting at glyph 1.
            (n,) = struct.unpack("B", self.fp.read(1))
            codes = struct.unpack("B" * n, self.fp.read(n))
            for gid, code in enumerate(codes, start=1):
                code2gid[code] = gid
                gid2code[gid] = code
        elif fmt == b"\x01":
            # Format 1: nRanges, then (first code, nLeft) ranges; glyph ids
            # are assigned consecutively across the ranges.
            (n,) = struct.unpack("B", self.fp.read(1))
            gid = 1
            for _ in range(n):
                (first, nleft) = struct.unpack("BB", self.fp.read(2))
                for code in range(first, first + nleft + 1):
                    code2gid[code] = gid
                    gid2code[gid] = code
                    gid += 1
        else:
            raise PDFValueError("unsupported encoding format: %r" % fmt)
        return code2gid, gid2code

    def _read_charset(
        self, pos: int
    ) -> Tuple[Dict[Union[str, bytes], int], Dict[int, Union[str, bytes]]]:
        """Read the charset at *pos*; return ``(name2gid, gid2name)``.

        Glyph 0 (.notdef) is implicit and therefore absent from both maps.
        """
        name2gid: Dict[Union[str, bytes], int] = {}
        gid2name: Dict[int, Union[str, bytes]] = {}

        def assign(gid: int, sid: int) -> None:
            glyph_name = self.getstr(sid)
            name2gid[glyph_name] = gid
            gid2name[gid] = glyph_name

        if pos == 0:
            # Predefined ISOAdobe charset: glyph id equals string id.
            for gid in range(1, min(self.nglyphs, self._ISOADOBE_NGLYPHS)):
                assign(gid, gid)
            return name2gid, gid2name
        if pos in (1, 2):
            # Predefined Expert / ExpertSubset charsets: their SID tables are
            # not available here, so no glyph names are produced.
            log.warning("Predefined CFF charset %r is not supported", pos)
            return name2gid, gid2name

        self.fp.seek(pos)
        fmt = self.fp.read(1)
        if fmt == b"\x00":
            # Format 0: one 2-byte SID per glyph, starting at glyph 1.
            n = max(self.nglyphs - 1, 0)
            sids = struct.unpack(">%dH" % n, self.fp.read(2 * n))
            for gid, sid in enumerate(sids, start=1):
                assign(gid, sid)
        elif fmt in (b"\x01", b"\x02"):
            # Formats 1 and 2: (first SID, nLeft) ranges with a 1-byte
            # (format 1) or 2-byte (format 2) nLeft. There is no range count;
            # ranges are read until every glyph has a name.
            range_fmt = ">HB" if fmt == b"\x01" else ">HH"
            range_size = struct.calcsize(range_fmt)
            gid = 1
            while gid < self.nglyphs:
                (first, nleft) = struct.unpack(range_fmt, self.fp.read(range_size))
                for sid in range(first, first + nleft + 1):
                    if gid >= self.nglyphs:
                        break
                    assign(gid, sid)
                    gid += 1
        else:
            raise PDFValueError("unsupported charset format: %r" % fmt)
        return name2gid, gid2name

    def getstr(self, sid: int) -> Union[str, bytes]:
        """Return the string for string id *sid*.

        Ids below ``len(STANDARD_STRINGS)`` come from the built-in table;
        larger ids index the font's own String INDEX.
        """
        # This returns str for one of the STANDARD_STRINGS but bytes otherwise,
        # and appears to be a needless source of type complexity.
        if sid < len(self.STANDARD_STRINGS):
            return self.STANDARD_STRINGS[sid]
        return self.string_index[sid - len(self.STANDARD_STRINGS)]

725

726

class TrueTypeFont:
    """Reads the table directory of a TrueType font and decodes its ``cmap``.

    Attributes:
      * ``name`` -- the name passed to the constructor (stored only);
      * ``fp`` -- the font stream;
      * ``fonttype`` -- the first four bytes of the stream;
      * ``tables`` -- table tag -> ``(offset, length)``.
    """

    class CMapNotFound(PDFException):
        """Raised when no usable ``cmap`` data can be extracted."""

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Read the table directory from *fp*.

        A truncated directory does not raise: whatever entries were read
        before the data ran out are kept.
        """
        self.name = name
        self.fp = fp
        self.tables: Dict[bytes, Tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = cast(
                Tuple[int, int, int, int],
                struct.unpack(">HHHH", fp.read(8)),
            )
            for _ in range(ntables):
                (name_bytes, _tsum, offset, length) = cast(
                    Tuple[bytes, int, int, int],
                    struct.unpack(">4sLLL", fp.read(16)),
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a glyph-id -> character map from the Unicode cmap subtables.

        Only subtables with platform id 0, or platform id 3 with encoding id
        1 or 10, are considered, and of those only formats 0, 2 and 4 are
        decoded. Subtables in any other format are skipped so that a font
        carrying an additional unsupported subtable still yields the
        mappings of the supported ones.

        :raises TrueTypeFont.CMapNotFound: if the font has no ``cmap`` table
            or no character could be mapped.
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, _length) = self.tables[b"cmap"]
        fp = self.fp
        fp.seek(base_offset)
        (_version, nsubtables) = cast(Tuple[int, int], struct.unpack(">HH", fp.read(4)))
        subtables: List[Tuple[int, int, int]] = []
        for _ in range(nsubtables):
            subtables.append(
                cast(Tuple[int, int, int], struct.unpack(">HHL", fp.read(8))),
            )
        char2gid: Dict[int, int] = {}
        # Only supports subtable type 0, 2 and 4.
        for platform_id, encoding_id, st_offset in subtables:
            # Skip non-Unicode cmaps.
            # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
            if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
                continue
            fp.seek(base_offset + st_offset)
            (fmttype, _fmtlen, _fmtlang) = cast(
                Tuple[int, int, int],
                struct.unpack(">HHH", fp.read(6)),
            )
            if fmttype == 0:
                char2gid.update(
                    enumerate(
                        cast(Tuple[int, ...], struct.unpack(">256B", fp.read(256))),
                    ),
                )
            elif fmttype == 2:
                subheaderkeys = cast(
                    Tuple[int, ...],
                    struct.unpack(">256H", fp.read(512)),
                )
                firstbytes = [0] * 8192
                for i, k in enumerate(subheaderkeys):
                    firstbytes[k // 8] = i
                nhdrs = max(subheaderkeys) // 8 + 1
                hdrs: List[Tuple[int, int, int, int, int]] = []
                for i in range(nhdrs):
                    (firstcode, entcount, delta, offset) = cast(
                        Tuple[int, int, int, int],
                        struct.unpack(">HHhH", fp.read(8)),
                    )
                    # The range offset counts from its own field, which is
                    # the last two bytes just read.
                    hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
                for i, firstcode, entcount, delta, pos in hdrs:
                    if not entcount:
                        continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        gid = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                        if gid:
                            gid += delta
                        char2gid[first + c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = cast(
                    Tuple[int, int, int, int],
                    struct.unpack(">HHHH", fp.read(8)),
                )
                segcount //= 2
                ecs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                fp.read(2)  # reserved padding between end codes and start codes
                scs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                idds = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dh" % segcount, fp.read(2 * segcount)),
                )
                pos = fp.tell()  # address of idRangeOffset[0]
                idrs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                for seg, (ec, sc, idd, idr) in enumerate(zip(ecs, scs, idds, idrs)):
                    if idr:
                        # idRangeOffset[seg] is measured from the address of
                        # idRangeOffset[seg] itself, i.e. pos + 2 * seg.
                        fp.seek(pos + 2 * seg + idr)
                        for c in range(sc, ec + 1):
                            b = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                            # A stored 0 denotes the missing glyph; idDelta
                            # applies only to non-zero entries.
                            char2gid[c] = (b + idd) & 0xFFFF if b else 0
                    else:
                        for c in range(sc, ec + 1):
                            char2gid[c] = (c + idd) & 0xFFFF
            else:
                # Unsupported subtable format: skip it instead of aborting,
                # so other subtables can still contribute and the caller's
                # CMapNotFound handling applies when none does.
                log.debug("Skipping unhandled cmap subtable format %r", fmttype)
                continue
        if not char2gid:
            raise TrueTypeFont.CMapNotFound
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map

851

852

class PDFFontError(PDFException):
    """Raised by the font classes in this module for unusable font data.

    Examples in this file: a missing ``BaseFont`` or ``FontDescriptor`` in
    strict mode, an unspecified encoding, or a CMap that cannot be found.
    """

855

856

class PDFUnicodeNotDefined(PDFFontError):
    """Raised by ``to_unichr`` when a character id has no unicode mapping."""

859

860

# Interned literal used as the fallback encoding name of simple fonts.
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
# Interned literal for the name "Type1C".
LITERAL_TYPE1C = LIT("Type1C")

# Font widths are kept in a dict whose keys are *either* integer character
# IDs or unicode chars.
FontWidthDict = Dict[Union[int, str], float]

867

868

class PDFFont:
    """Base class carrying the metrics common to all font kinds in this module.

    Subclasses supply ``to_unichr`` and, where needed, override ``decode``,
    ``is_vertical``, ``is_multibyte`` and ``char_disp``.
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: Optional[float] = None,
    ) -> None:
        """Initialise the metrics from a font descriptor.

        :param descriptor: font descriptor; ``FontName``, ``Flags``,
            ``Ascent``, ``Descent``, ``ItalicAngle``, ``MissingWidth``,
            ``Leading`` and ``FontBBox`` are read, each with a default.
        :param widths: character widths keyed by character id or by unicode
            string; stored after ``resolve_all``.
        :param default_width: width for characters missing from *widths*;
            when ``None`` the descriptor's ``MissingWidth`` (default 0) is
            used.
        """
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)
        self.fontname = resolve1(descriptor.get("FontName", "unknown"))
        if isinstance(self.fontname, PSLiteral):
            self.fontname = literal_name(self.fontname)
        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))
        if default_width is None:
            self.default_width = num_value(descriptor.get("MissingWidth", 0))
        else:
            self.default_width = default_width
        # A caller-supplied default may still need resolving.
        self.default_width = resolve1(self.default_width)
        self.leading = num_value(descriptor.get("Leading", 0))
        self.bbox = self._parse_bbox(descriptor)
        # Scale factors applied to widths and heights by the getters below.
        self.hscale = self.vscale = 0.001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        """Return False; subclasses with vertical writing override this."""
        return False

    def is_multibyte(self) -> bool:
        """Return False; multi-byte fonts override this."""
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split *bytes* into character ids, one per byte."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self) -> float:
        """Return the scaled width of the font bounding box.

        NOTE(review): a zero-width box falls back to the *negated* default
        width; the reason for the sign is not evident from this file.
        """
        w = self.bbox[2] - self.bbox[0]
        if w == 0:
            w = -self.default_width
        return w * self.hscale

    def get_height(self) -> float:
        """Return the scaled bounding-box height, or ascent - descent if zero."""
        h = self.bbox[3] - self.bbox[1]
        if h == 0:
            h = self.ascent - self.descent
        return h * self.vscale

    def char_width(self, cid: int) -> float:
        """Return the scaled width of character *cid*.

        Falls back to the default width when neither the id nor its unicode
        string has a usable entry in ``self.widths``.
        """
        # Because character widths may be mapping either IDs or strings,
        # we try to lookup the character ID first, then its str equivalent.
        cid_width = safe_float(self.widths.get(cid))
        if cid_width is not None:
            return cid_width * self.hscale

        try:
            str_cid = self.to_unichr(cid)
            cid_width = safe_float(self.widths.get(str_cid))
            if cid_width is not None:
                return cid_width * self.hscale

        except PDFUnicodeNotDefined:
            pass

        return self.default_width * self.hscale

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Return the summed ``char_width`` of every character decoded from *s*."""
        return sum(self.char_width(cid) for cid in self.decode(s))

    def to_unichr(self, cid: int) -> str:
        """Map *cid* to its unicode string; must be provided by subclasses."""
        raise NotImplementedError

    @staticmethod
    def _parse_bbox(descriptor: Mapping[str, Any]) -> Rect:
        """Parse FontBBox from the fonts descriptor

        Returns an all-zero box (and logs a warning) when the entry is
        missing or cannot be read as four floats.
        """
        font_bbox = resolve_all(descriptor.get("FontBBox"))
        bbox = safe_rect_list(font_bbox)
        if bbox is None:
            log.warning(
                f"Could not get FontBBox from font descriptor because {font_bbox!r} cannot be parsed as 4 floats"
            )
            return 0.0, 0.0, 0.0, 0.0
        return bbox

972

973

class PDFSimpleFont(PDFFont):
    """A font whose characters are single bytes mapped through an encoding."""

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        spec: Mapping[str, Any],
    ) -> None:
        """Set up the code-to-unicode tables, then the base metrics.

        :param descriptor: font descriptor, forwarded to ``PDFFont``.
        :param widths: character widths, forwarded to ``PDFFont``.
        :param spec: font dictionary; ``Encoding`` and ``ToUnicode`` are read.
        """
        # The encoding is given either as the name of a built-in encoding
        # or as a dictionary describing differences from a base encoding;
        # without an entry the standard encoding applies.
        encoding = (
            resolve1(spec["Encoding"])
            if "Encoding" in spec
            else LITERAL_STANDARD_ENCODING
        )
        if not isinstance(encoding, dict):
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
        else:
            base_name = literal_name(
                encoding.get("BaseEncoding", LITERAL_STANDARD_ENCODING)
            )
            differences = list_value(encoding.get("Differences", []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)

        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            to_unicode = stream_value(spec["ToUnicode"])
            self.unicode_map = FileUnicodeMap()
            parser = CMapParser(self.unicode_map, BytesIO(to_unicode.get_data()))
            parser.run()
        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode string for *cid*.

        The ``ToUnicode`` map, when present, is consulted first; the
        encoding table is the fallback.

        :raises PDFUnicodeNotDefined: if neither source knows *cid*.
        """
        umap = self.unicode_map
        if umap:
            try:
                return umap.get_unichr(cid)
            except KeyError:
                # Not in the ToUnicode map: fall back to the encoding.
                pass
        try:
            text = self.cid2unicode[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)
        return text

1011

1012

class PDFType1Font(PDFSimpleFont):
    """Simple font described by a Type1 font dictionary."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from *spec*.

        Metrics come from ``FontMetricsDB`` when the base font name is known
        there; otherwise from the ``FontDescriptor``, ``FirstChar`` and
        ``Widths`` entries of *spec*.

        :param rsrcmgr: accepted but not used by this constructor.
        :param spec: the font dictionary.
        :raises PDFFontError: in strict mode, when ``BaseFont`` is missing.
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"

        widths: FontWidthDict
        try:
            metrics = FontMetricsDB.get_metrics(self.basefont)
        except KeyError:
            # No stored metrics for this name: use the font dictionary.
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            firstchar = int_value(spec.get("FirstChar", 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            width_list = list_value(spec.get("Widths", [0] * 256))
            widths = {
                index + firstchar: resolve1(width)
                for (index, width) in enumerate(width_list)
            }
        else:
            (descriptor, int_widths) = metrics
            # implicit int->float
            widths = cast(Dict[Union[str, int], float], int_widths)

        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        if "Encoding" in spec or "FontFile" not in descriptor:
            return
        # try to recover the missing encoding info from the font file.
        self.fontfile = stream_value(descriptor.get("FontFile"))
        length1 = int_value(self.fontfile["Length1"])
        header = self.fontfile.get_data()[:length1]
        self.cid2unicode = Type1FontHeaderParser(BytesIO(header)).get_encoding()

    def __repr__(self) -> str:
        return "<PDFType1Font: basefont=%r>" % self.basefont

1045

1046

class PDFTrueTypeFont(PDFType1Font):
    """Same construction as ``PDFType1Font``; only the repr differs."""

    def __repr__(self) -> str:
        template = "<PDFTrueTypeFont: basefont=%r>"
        return template % self.basefont

1050

1051

class PDFType3Font(PDFSimpleFont):
    """Simple font described by a Type3 font dictionary."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from *spec*.

        Without a ``FontDescriptor`` a minimal descriptor is synthesised from
        the dictionary's ``FontBBox``. After the base initialisation, ascent
        and descent are taken from the bounding box and the scale factors
        from ``FontMatrix``.

        :param rsrcmgr: accepted but not used by this constructor.
        :param spec: the font dictionary.
        """
        firstchar = int_value(spec.get("FirstChar", 0))
        # lastchar = int_value(spec.get('LastChar', 0))
        width_list = list_value(spec.get("Widths", [0] * 256))
        widths: Dict[Union[str, int], float] = {}
        for (index, width) in enumerate(width_list):
            widths[index + firstchar] = width

        if "FontDescriptor" not in spec:
            descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
        else:
            descriptor = dict_value(spec["FontDescriptor"])

        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix"))))
        # Bounding box layout: (x0, y0, x1, y1) -> descent is y0, ascent y1.
        (_, self.descent, _, self.ascent) = self.bbox
        (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))

    def __repr__(self) -> str:
        return "<PDFType3Font>"

1071

1072

class PDFCIDFont(PDFFont):
    """Multi-byte font whose character ids are produced by a CMap."""

    default_disp: Union[float, Tuple[Optional[float], float]]

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        spec: Mapping[str, Any],
        strict: bool = settings.STRICT,
    ) -> None:
        """Build the font from *spec*.

        :param rsrcmgr: accepted but not used by this constructor.
        :param spec: the font dictionary.
        :param strict: when true, a missing ``BaseFont`` or
            ``FontDescriptor`` (and, via the helpers, an unknown CMap or
            unspecified encoding) raises ``PDFFontError`` instead of being
            replaced by a default.
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if strict:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"

        self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
        system_info = self.cidsysteminfo
        cid_registry = resolve1(system_info.get("Registry", b"unknown")).decode(
            "latin1"
        )
        cid_ordering = resolve1(system_info.get("Ordering", b"unknown")).decode(
            "latin1"
        )
        self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
        self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec["FontDescriptor"])
        except KeyError:
            if strict:
                raise PDFFontError("FontDescriptor is missing")
            descriptor = {}

        embedded_ttf = None
        if "FontFile2" in descriptor:
            self.fontfile = stream_value(descriptor.get("FontFile2"))
            embedded_ttf = TrueTypeFont(
                self.basefont, BytesIO(self.fontfile.get_data())
            )

        # Pick the unicode map: an explicit ToUnicode entry wins, then the
        # embedded TrueType cmap for Adobe-Identity/Adobe-UCS, then CMapDB.
        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            if isinstance(spec["ToUnicode"], PDFStream):
                to_unicode = stream_value(spec["ToUnicode"])
                self.unicode_map = FileUnicodeMap()
                CMapParser(self.unicode_map, BytesIO(to_unicode.get_data())).run()
            else:
                cmap_name = literal_name(spec["ToUnicode"])
                encoding = literal_name(spec["Encoding"])
                if any(
                    "Identity" in candidate
                    for candidate in (cid_ordering, cmap_name, encoding)
                ):
                    self.unicode_map = IdentityUnicodeMap()
        elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
            if embedded_ttf is not None:
                try:
                    self.unicode_map = embedded_ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding,
                    self.cmap.is_vertical(),
                )
            except CMapDB.CMapNotFound:
                pass

        self.vertical = self.cmap.is_vertical()
        widths: Dict[Union[str, int], float]
        if not self.vertical:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get("W", [])))
            default_width = spec.get("DW", 1000)
        else:
            # writing mode: vertical
            widths2 = get_widths2(list_value(spec.get("W2", [])))
            self.disps = {
                cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()
            }
            (default_vy, default_w) = resolve1(spec.get("DW2", [880, -1000]))
            self.default_disp = (None, default_vy)
            widths = {cid: width for (cid, (width, _)) in widths2.items()}
            default_width = default_w
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.

        An unknown CMap raises ``PDFFontError`` in strict mode and yields an
        empty ``CMap()`` otherwise.
        """
        wanted = self._get_cmap_name(spec, strict)

        try:
            found = CMapDB.get_cmap(wanted)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e)
            return CMap()
        return found

    @staticmethod
    def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
        """Get cmap name from font specification"""
        cmap_name = "unknown"  # default value

        try:
            spec_encoding = spec["Encoding"]
            if not hasattr(spec_encoding, "name"):
                cmap_name = literal_name(spec_encoding["CMapName"])
            else:
                cmap_name = literal_name(spec["Encoding"])
        except KeyError:
            if strict:
                raise PDFFontError("Encoding is unspecified")

        # The name lookup above may have handed back a stream; in that case
        # the actual name is the stream's own CMapName entry.
        if type(cmap_name) is PDFStream:  # type: ignore[comparison-overlap]
            cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
            if "CMapName" in cmap_name_stream:
                cmap_name = cmap_name_stream.get("CMapName").name
            elif strict:
                raise PDFFontError("CMapName unspecified for encoding")

        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

    def __repr__(self) -> str:
        return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"

    def is_vertical(self) -> bool:
        """Return the writing mode reported by the CMap at construction."""
        return self.vertical

    def is_multibyte(self) -> bool:
        """Return True: this font is always treated as multi-byte."""
        return True

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split *bytes* into character ids using the font's CMap."""
        return self.cmap.decode(bytes)

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode string for *cid*.

        :raises PDFUnicodeNotDefined: if there is no unicode map or it does
            not know *cid*.
        """
        umap = self.unicode_map
        try:
            if umap:
                return umap.get_unichr(cid)
            raise PDFKeyError(cid)
        except KeyError:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)

1201

1202 def is_vertical(self) -> bool:

1203 return self.vertical

1204

1205 def is_multibyte(self) -> bool:

1206 return True

1207

1208 def decode(self, bytes: bytes) -> Iterable[int]:

1209 return self.cmap.decode(bytes)

1210

1211 def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:

1212 """Returns an integer for horizontal fonts, a tuple for vertical fonts."""

1213 return self.disps.get(cid, self.default_disp)

1214

1215 def to_unichr(self, cid: int) -> str:

1216 try:

1217 if not self.unicode_map:

1218 raise PDFKeyError(cid)

1219 return self.unicode_map.get_unichr(cid)

1220 except KeyError:

1221 raise PDFUnicodeNotDefined(self.cidcoding, cid)