Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdffont.py: 45%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

535 statements  

1import contextlib 

2import logging 

3import struct 

4from collections.abc import Iterable, Iterator, Mapping 

5from io import BytesIO 

6from typing import ( 

7 TYPE_CHECKING, 

8 Any, 

9 BinaryIO, 

10 cast, 

11) 

12 

13from pdfminer import settings 

14from pdfminer.casting import safe_float, safe_rect_list 

15from pdfminer.cmapdb import ( 

16 CMap, 

17 CMapBase, 

18 CMapDB, 

19 CMapParser, 

20 FileUnicodeMap, 

21 IdentityUnicodeMap, 

22 UnicodeMap, 

23) 

24from pdfminer.encodingdb import EncodingDB, name2unicode 

25from pdfminer.fontmetrics import FONT_METRICS 

26from pdfminer.pdfexceptions import PDFException, PDFKeyError, PDFValueError 

27from pdfminer.pdftypes import ( 

28 PDFStream, 

29 dict_value, 

30 int_value, 

31 list_value, 

32 num_value, 

33 resolve1, 

34 resolve_all, 

35 stream_value, 

36) 

37from pdfminer.psexceptions import PSEOF 

38from pdfminer.psparser import ( 

39 KWD, 

40 LIT, 

41 PSKeyword, 

42 PSLiteral, 

43 PSStackParser, 

44 literal_name, 

45) 

46from pdfminer.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack 

47 

48if TYPE_CHECKING: 

49 from pdfminer.pdfinterp import PDFResourceManager 

50 

51log = logging.getLogger(__name__) 

52 

53 

def get_widths(seq: Iterable[object]) -> dict[str | int, float]:
    """Build a mapping of character widths for horizontal writing.

    Each element of *seq* is passed through ``resolve1`` and then handled as
    follows:

    * a list that follows at least one number assigns its items, in order, to
      consecutive codes starting at the most recent number;
    * once three numbers have accumulated they are read as
      ``(first, last, width)`` and every code in ``first..last`` (inclusive)
      gets ``width``;
    * anything else is reported with a warning and ignored.

    :param seq: the raw width specification to walk.
    :returns: mapping from character code to width.
    """
    widths: dict[int, float] = {}
    pending: list[float] = []  # numbers seen since the last completed entry
    for item in seq:
        item = resolve1(item)

        if isinstance(item, list):
            if pending:
                start = pending[-1]
                for offset, width in enumerate(item):
                    widths[cast(int, start) + offset] = width
                pending = []
            continue

        if not isinstance(item, (int, float)):  # == not utils.isnumber(item)
            log.warning(
                f"Skipping invalid font width specification for {item} "
                f"because it is not a number or a list"
            )
            continue

        pending.append(item)
        if len(pending) != 3:
            continue

        (first, last, width) = pending
        pending = []
        if isinstance(first, int) and isinstance(last, int):
            for code in range(first, last + 1):
                widths[code] = width
        else:
            log.warning(
                f"Skipping invalid font width specification for {first} to "
                f"{last} because either of them is not an int"
            )
    return cast(dict[str | int, float], widths)

85 

86 

def get_widths2(seq: Iterable[object]) -> dict[int, tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    Mirrors ``get_widths`` but every entry carries three values
    ``(w, vx, vy)``:

    * a list that follows at least one number is cut into triples with
      ``choplist(3, ...)`` and assigned to consecutive codes starting at the
      most recent number;
    * once five numbers have accumulated they are read as
      ``(first, last, w, vx, vy)`` and applied to every code in
      ``first..last`` (inclusive).

    Elements are passed through ``resolve1`` first, exactly as ``get_widths``
    does, so indirect references are honoured instead of being dropped.
    Ranges whose bounds are not ints, and elements that are neither a number
    nor a list, are skipped with a warning instead of raising or vanishing
    silently.

    :param seq: the raw vertical-metrics specification to walk.
    :returns: mapping from character code to ``(w, (vx, vy))``.
    """
    widths: dict[int, tuple[float, Point]] = {}
    r: list[float] = []
    for v in seq:
        v = resolve1(v)
        if isinstance(v, list):
            if r:
                char1 = r[-1]
                for i, (w, vx, vy) in enumerate(choplist(3, v)):
                    widths[cast(int, char1) + i] = (w, (vx, vy))
                r = []
        elif isinstance(v, (int, float)):  # == utils.isnumber(v)
            r.append(v)
            if len(r) == 5:
                (char1, char2, w, vx, vy) = r
                if isinstance(char1, int) and isinstance(char2, int):
                    for i in range(char1, char2 + 1):
                        widths[i] = (w, (vx, vy))
                else:
                    # range() would raise TypeError on a float bound.
                    log.warning(
                        f"Skipping invalid font width specification for {char1} to "
                        f"{char2} because either of them is not an int"
                    )
                r = []
        else:
            log.warning(
                f"Skipping invalid font width specification for {v} "
                f"because it is not a number or a list"
            )
    return widths

106 

107 

class FontMetricsDB:
    """Lookup facade over the imported ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname: str) -> tuple[dict[str, object], dict[str, int]]:
        """Return the ``(descriptor, widths)`` pair stored under *fontname*.

        The lookup is a plain subscript of ``FONT_METRICS``; ``PDFType1Font``
        in this file relies on it raising ``KeyError`` for unknown names.
        """
        return FONT_METRICS[fontname]

112 

113 

114# int here means that we're not extending PSStackParser with additional types. 

class Type1FontHeaderParser(PSStackParser[int]):
    """Stack parser that collects ``<code> /<name> put`` pairs from a font header.

    ``do_keyword`` reacts only to the ``put`` keyword; each integer/literal
    pair found on the stack at that point is emitted as a result, and
    ``get_encoding`` turns those results into a code-to-unicode mapping.
    """

    KEYWORD_BEGIN = KWD(b"begin")
    KEYWORD_END = KWD(b"end")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_PUT = KWD(b"put")
    KEYWORD_DICT = KWD(b"dict")
    KEYWORD_ARRAY = KWD(b"array")
    KEYWORD_READONLY = KWD(b"readonly")
    KEYWORD_FOR = KWD(b"for")

    def __init__(self, data: BinaryIO) -> None:
        super().__init__(data)
        self._cid2unicode: dict[int, str] = {}

    def get_encoding(self) -> dict[int, str]:
        """Parse the font encoding.

        Consumes ``(cid, name)`` results from ``nextobject()`` until ``PSEOF``
        and converts each name with ``name2unicode``. Names for which
        ``name2unicode`` raises ``KeyError`` (for instance names that are not
        standard glyph names) are logged at debug level and left out of the
        mapping.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode characters
        """
        mapping = self._cid2unicode
        while True:
            try:
                code, glyph_name = self.nextobject()
            except PSEOF:
                return mapping
            try:
                mapping[code] = name2unicode(cast(str, glyph_name))
            except KeyError as exc:
                log.debug(str(exc))

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """On ``put``, pop two stack entries and emit ``(int, name)`` if well-typed."""
        if token is not self.KEYWORD_PUT:
            return
        (_, key), (_, value) = self.pop(2)
        if isinstance(key, int) and isinstance(value, PSLiteral):
            self.add_results((key, literal_name(value)))

159 

160 

# Text emitted for each 4-bit nibble value (index 0..14) while getdict()
# decodes a real-number operand (leading byte 30). Index 13 has no text
# (None); nibble value 15 ends the number and is handled by getdict() itself,
# so it has no entry here.
NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-")

# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
    "DLIdent-H": "Identity-H",
    "DLIdent-V": "Identity-V",
}

169 

170 

171def getdict(data: bytes) -> dict[int, list[float | int]]: 

172 d: dict[int, list[float | int]] = {} 

173 fp = BytesIO(data) 

174 stack: list[float | int] = [] 

175 while 1: 

176 c = fp.read(1) 

177 if not c: 

178 break 

179 b0 = ord(c) 

180 if b0 <= 21: 

181 d[b0] = stack 

182 stack = [] 

183 continue 

184 if b0 == 30: 

185 s = "" 

186 loop = True 

187 while loop: 

188 b = ord(fp.read(1)) 

189 for n in (b >> 4, b & 15): 

190 if n == 15: 

191 loop = False 

192 else: 

193 nibble = NIBBLES[n] 

194 assert nibble is not None 

195 s += nibble 

196 value = float(s) 

197 elif b0 >= 32 and b0 <= 246: 

198 value = b0 - 139 

199 else: 

200 b1 = ord(fp.read(1)) 

201 if b0 >= 247 and b0 <= 250: 

202 value = ((b0 - 247) << 8) + b1 + 108 

203 elif b0 >= 251 and b0 <= 254: 

204 value = -((b0 - 251) << 8) - b1 - 108 

205 else: 

206 b2 = ord(fp.read(1)) 

207 if b1 >= 128: 

208 b1 -= 256 

209 if b0 == 28: 

210 value = b1 << 8 | b2 

211 else: 

212 value = b1 << 24 | b2 << 16 | struct.unpack(">H", fp.read(2))[0] 

213 stack.append(value) 

214 return d 

215 

216 

217class CFFFont: 

218 STANDARD_STRINGS = ( 

219 ".notdef", 

220 "space", 

221 "exclam", 

222 "quotedbl", 

223 "numbersign", 

224 "dollar", 

225 "percent", 

226 "ampersand", 

227 "quoteright", 

228 "parenleft", 

229 "parenright", 

230 "asterisk", 

231 "plus", 

232 "comma", 

233 "hyphen", 

234 "period", 

235 "slash", 

236 "zero", 

237 "one", 

238 "two", 

239 "three", 

240 "four", 

241 "five", 

242 "six", 

243 "seven", 

244 "eight", 

245 "nine", 

246 "colon", 

247 "semicolon", 

248 "less", 

249 "equal", 

250 "greater", 

251 "question", 

252 "at", 

253 "A", 

254 "B", 

255 "C", 

256 "D", 

257 "E", 

258 "F", 

259 "G", 

260 "H", 

261 "I", 

262 "J", 

263 "K", 

264 "L", 

265 "M", 

266 "N", 

267 "O", 

268 "P", 

269 "Q", 

270 "R", 

271 "S", 

272 "T", 

273 "U", 

274 "V", 

275 "W", 

276 "X", 

277 "Y", 

278 "Z", 

279 "bracketleft", 

280 "backslash", 

281 "bracketright", 

282 "asciicircum", 

283 "underscore", 

284 "quoteleft", 

285 "a", 

286 "b", 

287 "c", 

288 "d", 

289 "e", 

290 "f", 

291 "g", 

292 "h", 

293 "i", 

294 "j", 

295 "k", 

296 "l", 

297 "m", 

298 "n", 

299 "o", 

300 "p", 

301 "q", 

302 "r", 

303 "s", 

304 "t", 

305 "u", 

306 "v", 

307 "w", 

308 "x", 

309 "y", 

310 "z", 

311 "braceleft", 

312 "bar", 

313 "braceright", 

314 "asciitilde", 

315 "exclamdown", 

316 "cent", 

317 "sterling", 

318 "fraction", 

319 "yen", 

320 "florin", 

321 "section", 

322 "currency", 

323 "quotesingle", 

324 "quotedblleft", 

325 "guillemotleft", 

326 "guilsinglleft", 

327 "guilsinglright", 

328 "fi", 

329 "fl", 

330 "endash", 

331 "dagger", 

332 "daggerdbl", 

333 "periodcentered", 

334 "paragraph", 

335 "bullet", 

336 "quotesinglbase", 

337 "quotedblbase", 

338 "quotedblright", 

339 "guillemotright", 

340 "ellipsis", 

341 "perthousand", 

342 "questiondown", 

343 "grave", 

344 "acute", 

345 "circumflex", 

346 "tilde", 

347 "macron", 

348 "breve", 

349 "dotaccent", 

350 "dieresis", 

351 "ring", 

352 "cedilla", 

353 "hungarumlaut", 

354 "ogonek", 

355 "caron", 

356 "emdash", 

357 "AE", 

358 "ordfeminine", 

359 "Lslash", 

360 "Oslash", 

361 "OE", 

362 "ordmasculine", 

363 "ae", 

364 "dotlessi", 

365 "lslash", 

366 "oslash", 

367 "oe", 

368 "germandbls", 

369 "onesuperior", 

370 "logicalnot", 

371 "mu", 

372 "trademark", 

373 "Eth", 

374 "onehalf", 

375 "plusminus", 

376 "Thorn", 

377 "onequarter", 

378 "divide", 

379 "brokenbar", 

380 "degree", 

381 "thorn", 

382 "threequarters", 

383 "twosuperior", 

384 "registered", 

385 "minus", 

386 "eth", 

387 "multiply", 

388 "threesuperior", 

389 "copyright", 

390 "Aacute", 

391 "Acircumflex", 

392 "Adieresis", 

393 "Agrave", 

394 "Aring", 

395 "Atilde", 

396 "Ccedilla", 

397 "Eacute", 

398 "Ecircumflex", 

399 "Edieresis", 

400 "Egrave", 

401 "Iacute", 

402 "Icircumflex", 

403 "Idieresis", 

404 "Igrave", 

405 "Ntilde", 

406 "Oacute", 

407 "Ocircumflex", 

408 "Odieresis", 

409 "Ograve", 

410 "Otilde", 

411 "Scaron", 

412 "Uacute", 

413 "Ucircumflex", 

414 "Udieresis", 

415 "Ugrave", 

416 "Yacute", 

417 "Ydieresis", 

418 "Zcaron", 

419 "aacute", 

420 "acircumflex", 

421 "adieresis", 

422 "agrave", 

423 "aring", 

424 "atilde", 

425 "ccedilla", 

426 "eacute", 

427 "ecircumflex", 

428 "edieresis", 

429 "egrave", 

430 "iacute", 

431 "icircumflex", 

432 "idieresis", 

433 "igrave", 

434 "ntilde", 

435 "oacute", 

436 "ocircumflex", 

437 "odieresis", 

438 "ograve", 

439 "otilde", 

440 "scaron", 

441 "uacute", 

442 "ucircumflex", 

443 "udieresis", 

444 "ugrave", 

445 "yacute", 

446 "ydieresis", 

447 "zcaron", 

448 "exclamsmall", 

449 "Hungarumlautsmall", 

450 "dollaroldstyle", 

451 "dollarsuperior", 

452 "ampersandsmall", 

453 "Acutesmall", 

454 "parenleftsuperior", 

455 "parenrightsuperior", 

456 "twodotenleader", 

457 "onedotenleader", 

458 "zerooldstyle", 

459 "oneoldstyle", 

460 "twooldstyle", 

461 "threeoldstyle", 

462 "fouroldstyle", 

463 "fiveoldstyle", 

464 "sixoldstyle", 

465 "sevenoldstyle", 

466 "eightoldstyle", 

467 "nineoldstyle", 

468 "commasuperior", 

469 "threequartersemdash", 

470 "periodsuperior", 

471 "questionsmall", 

472 "asuperior", 

473 "bsuperior", 

474 "centsuperior", 

475 "dsuperior", 

476 "esuperior", 

477 "isuperior", 

478 "lsuperior", 

479 "msuperior", 

480 "nsuperior", 

481 "osuperior", 

482 "rsuperior", 

483 "ssuperior", 

484 "tsuperior", 

485 "ff", 

486 "ffi", 

487 "ffl", 

488 "parenleftinferior", 

489 "parenrightinferior", 

490 "Circumflexsmall", 

491 "hyphensuperior", 

492 "Gravesmall", 

493 "Asmall", 

494 "Bsmall", 

495 "Csmall", 

496 "Dsmall", 

497 "Esmall", 

498 "Fsmall", 

499 "Gsmall", 

500 "Hsmall", 

501 "Ismall", 

502 "Jsmall", 

503 "Ksmall", 

504 "Lsmall", 

505 "Msmall", 

506 "Nsmall", 

507 "Osmall", 

508 "Psmall", 

509 "Qsmall", 

510 "Rsmall", 

511 "Ssmall", 

512 "Tsmall", 

513 "Usmall", 

514 "Vsmall", 

515 "Wsmall", 

516 "Xsmall", 

517 "Ysmall", 

518 "Zsmall", 

519 "colonmonetary", 

520 "onefitted", 

521 "rupiah", 

522 "Tildesmall", 

523 "exclamdownsmall", 

524 "centoldstyle", 

525 "Lslashsmall", 

526 "Scaronsmall", 

527 "Zcaronsmall", 

528 "Dieresissmall", 

529 "Brevesmall", 

530 "Caronsmall", 

531 "Dotaccentsmall", 

532 "Macronsmall", 

533 "figuredash", 

534 "hypheninferior", 

535 "Ogoneksmall", 

536 "Ringsmall", 

537 "Cedillasmall", 

538 "questiondownsmall", 

539 "oneeighth", 

540 "threeeighths", 

541 "fiveeighths", 

542 "seveneighths", 

543 "onethird", 

544 "twothirds", 

545 "zerosuperior", 

546 "foursuperior", 

547 "fivesuperior", 

548 "sixsuperior", 

549 "sevensuperior", 

550 "eightsuperior", 

551 "ninesuperior", 

552 "zeroinferior", 

553 "oneinferior", 

554 "twoinferior", 

555 "threeinferior", 

556 "fourinferior", 

557 "fiveinferior", 

558 "sixinferior", 

559 "seveninferior", 

560 "eightinferior", 

561 "nineinferior", 

562 "centinferior", 

563 "dollarinferior", 

564 "periodinferior", 

565 "commainferior", 

566 "Agravesmall", 

567 "Aacutesmall", 

568 "Acircumflexsmall", 

569 "Atildesmall", 

570 "Adieresissmall", 

571 "Aringsmall", 

572 "AEsmall", 

573 "Ccedillasmall", 

574 "Egravesmall", 

575 "Eacutesmall", 

576 "Ecircumflexsmall", 

577 "Edieresissmall", 

578 "Igravesmall", 

579 "Iacutesmall", 

580 "Icircumflexsmall", 

581 "Idieresissmall", 

582 "Ethsmall", 

583 "Ntildesmall", 

584 "Ogravesmall", 

585 "Oacutesmall", 

586 "Ocircumflexsmall", 

587 "Otildesmall", 

588 "Odieresissmall", 

589 "OEsmall", 

590 "Oslashsmall", 

591 "Ugravesmall", 

592 "Uacutesmall", 

593 "Ucircumflexsmall", 

594 "Udieresissmall", 

595 "Yacutesmall", 

596 "Thornsmall", 

597 "Ydieresissmall", 

598 "001.000", 

599 "001.001", 

600 "001.002", 

601 "001.003", 

602 "Black", 

603 "Bold", 

604 "Book", 

605 "Light", 

606 "Medium", 

607 "Regular", 

608 "Roman", 

609 "Semibold", 

610 ) 

611 

612 class INDEX: 

613 def __init__(self, fp: BinaryIO) -> None: 

614 self.fp = fp 

615 self.offsets: list[int] = [] 

616 (count, offsize) = struct.unpack(">HB", self.fp.read(3)) 

617 for _i in range(count + 1): 

618 self.offsets.append(nunpack(self.fp.read(offsize))) 

619 self.base = self.fp.tell() - 1 

620 self.fp.seek(self.base + self.offsets[-1]) 

621 

622 def __repr__(self) -> str: 

623 return f"<INDEX: size={len(self)}>" 

624 

625 def __len__(self) -> int: 

626 return len(self.offsets) - 1 

627 

628 def __getitem__(self, i: int) -> bytes: 

629 self.fp.seek(self.base + self.offsets[i]) 

630 return self.fp.read(self.offsets[i + 1] - self.offsets[i]) 

631 

632 def __iter__(self) -> Iterator[bytes]: 

633 return iter(self[i] for i in range(len(self))) 

634 

635 def __init__(self, name: str, fp: BinaryIO) -> None: 

636 self.name = name 

637 self.fp = fp 

638 # Header 

639 (_major, _minor, hdrsize, _offsize) = struct.unpack("BBBB", self.fp.read(4)) 

640 self.fp.read(hdrsize - 4) 

641 # Name INDEX 

642 self.name_index = self.INDEX(self.fp) 

643 # Top DICT INDEX 

644 self.dict_index = self.INDEX(self.fp) 

645 # String INDEX 

646 self.string_index = self.INDEX(self.fp) 

647 # Global Subr INDEX 

648 self.subr_index = self.INDEX(self.fp) 

649 # Top DICT DATA 

650 self.top_dict = getdict(self.dict_index[0]) 

651 (charset_pos,) = self.top_dict.get(15, [0]) 

652 (encoding_pos,) = self.top_dict.get(16, [0]) 

653 (charstring_pos,) = self.top_dict.get(17, [0]) 

654 # CharStrings 

655 self.fp.seek(cast(int, charstring_pos)) 

656 self.charstring = self.INDEX(self.fp) 

657 self.nglyphs = len(self.charstring) 

658 # Encodings 

659 self.code2gid = {} 

660 self.gid2code = {} 

661 self.fp.seek(cast(int, encoding_pos)) 

662 format = self.fp.read(1) 

663 if format == b"\x00": 

664 # Format 0 

665 (n,) = struct.unpack("B", self.fp.read(1)) 

666 for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))): 

667 self.code2gid[code] = gid 

668 self.gid2code[gid] = code 

669 elif format == b"\x01": 

670 # Format 1 

671 (n,) = struct.unpack("B", self.fp.read(1)) 

672 code = 0 

673 for _i in range(n): 

674 (first, nleft) = struct.unpack("BB", self.fp.read(2)) 

675 for gid in range(first, first + nleft + 1): 

676 self.code2gid[code] = gid 

677 self.gid2code[gid] = code 

678 code += 1 

679 else: 

680 raise PDFValueError(f"unsupported encoding format: {format!r}") 

681 # Charsets 

682 self.name2gid = {} 

683 self.gid2name = {} 

684 self.fp.seek(cast(int, charset_pos)) 

685 format = self.fp.read(1) 

686 if format == b"\x00": 

687 # Format 0 

688 n = self.nglyphs - 1 

689 for gid, sid in enumerate( 

690 cast( 

691 tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n)) 

692 ), 

693 ): 

694 gid += 1 

695 sidname = self.getstr(sid) 

696 self.name2gid[sidname] = gid 

697 self.gid2name[gid] = sidname 

698 elif format == b"\x01": 

699 # Format 1 

700 (n,) = struct.unpack("B", self.fp.read(1)) 

701 sid = 0 

702 for _i in range(n): 

703 (first, nleft) = struct.unpack("BB", self.fp.read(2)) 

704 for gid in range(first, first + nleft + 1): 

705 sidname = self.getstr(sid) 

706 self.name2gid[sidname] = gid 

707 self.gid2name[gid] = sidname 

708 sid += 1 

709 elif format == b"\x02": 

710 # Format 2 

711 raise AssertionError(str(("Unhandled", format))) 

712 else: 

713 raise PDFValueError(f"unsupported charset format: {format!r}") 

714 

715 def getstr(self, sid: int) -> str | bytes: 

716 # This returns str for one of the STANDARD_STRINGS but bytes otherwise, 

717 # and appears to be a needless source of type complexity. 

718 if sid < len(self.STANDARD_STRINGS): 

719 return self.STANDARD_STRINGS[sid] 

720 return self.string_index[sid - len(self.STANDARD_STRINGS)] 

721 

722 

class TrueTypeFont:
    """Reader for the table directory and ``cmap`` table of a TrueType font."""

    class CMapNotFound(PDFException):
        """Raised when no character-to-glyph mapping could be obtained."""

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Read the table directory from *fp* into ``self.tables``.

        :param name: stored as ``self.name``.
        :param fp: seekable binary stream positioned at the start of the font.
        """
        self.name = name
        self.fp = fp
        # table tag -> (offset, length)
        self.tables: dict[bytes, tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = cast(
                tuple[int, int, int, int],
                struct.unpack(">HHHH", fp.read(8)),
            )
            for _ in range(ntables):
                (name_bytes, _tsum, offset, length) = cast(
                    tuple[bytes, int, int, int],
                    struct.unpack(">4sLLL", fp.read(16)),
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a glyph-id -> character map from the Unicode cmap subtables.

        Only subtables for platform 0, or platform 3 with encoding 1 or 10,
        are read, and only formats 0, 2 and 4 are decoded. Subtables in any
        other format are skipped, so one unsupported subtable no longer
        discards the mappings of the supported ones.

        :raises TrueTypeFont.CMapNotFound: if there is no ``cmap`` table or no
            supported subtable produced a mapping.
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, _length) = self.tables[b"cmap"]
        fp = self.fp
        fp.seek(base_offset)
        (_version, nsubtables) = cast(tuple[int, int], struct.unpack(">HH", fp.read(4)))
        subtables: list[tuple[int, int, int]] = []
        for _i in range(nsubtables):
            subtables.append(
                cast(tuple[int, int, int], struct.unpack(">HHL", fp.read(8))),
            )
        char2gid: dict[int, int] = {}
        readers = {
            0: self._read_cmap_format0,
            2: self._read_cmap_format2,
            4: self._read_cmap_format4,
        }
        for platform_id, encoding_id, st_offset in subtables:
            # Skip non-Unicode cmaps.
            # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
            if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
                continue
            fp.seek(base_offset + st_offset)
            (fmttype, _fmtlen, _fmtlang) = cast(
                tuple[int, int, int],
                struct.unpack(">HHH", fp.read(6)),
            )
            reader = readers.get(fmttype)
            if reader is None:
                log.debug(f"Skipping unsupported cmap subtable format {fmttype}")
                continue
            reader(fp, char2gid)
        if not char2gid:
            raise TrueTypeFont.CMapNotFound
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map

    @staticmethod
    def _read_cmap_format0(fp: BinaryIO, char2gid: dict[int, int]) -> None:
        """Format 0: 256 one-byte glyph ids indexed by character code."""
        char2gid.update(
            enumerate(
                cast(tuple[int, ...], struct.unpack(">256B", fp.read(256))),
            ),
        )

    @staticmethod
    def _read_cmap_format2(fp: BinaryIO, char2gid: dict[int, int]) -> None:
        """Format 2: sub-headers selected by the high byte of the code."""
        subheaderkeys = cast(
            tuple[int, ...],
            struct.unpack(">256H", fp.read(512)),
        )
        # Keys are sub-header byte offsets (8 bytes each); remember which
        # first byte points at each sub-header.
        firstbytes = [0] * 8192
        for i, k in enumerate(subheaderkeys):
            firstbytes[k // 8] = i
        nhdrs = max(subheaderkeys) // 8 + 1
        hdrs: list[tuple[int, int, int, int, int]] = []
        for i in range(nhdrs):
            (firstcode, entcount, delta, offset) = cast(
                tuple[int, int, int, int],
                struct.unpack(">HHhH", fp.read(8)),
            )
            # The offset is relative to the position of its own field, which
            # is the last 2 bytes of the sub-header just read.
            hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
        for i, firstcode, entcount, delta, pos in hdrs:
            if not entcount:
                continue
            first = firstcode + (firstbytes[i] << 8)
            fp.seek(pos)
            for c in range(entcount):
                gid = cast(tuple[int], struct.unpack(">H", fp.read(2)))[0]
                if gid:
                    gid += delta
                char2gid[first + c] = gid

    @staticmethod
    def _read_cmap_format4(fp: BinaryIO, char2gid: dict[int, int]) -> None:
        """Format 4: segments of contiguous character codes."""
        (segcount, _1, _2, _3) = cast(
            tuple[int, int, int, int],
            struct.unpack(">HHHH", fp.read(8)),
        )
        segcount //= 2  # the field stores segCount * 2
        ecs = cast(
            tuple[int, ...],
            struct.unpack(f">{segcount}H", fp.read(2 * segcount)),
        )
        fp.read(2)  # reservedPad
        scs = cast(
            tuple[int, ...],
            struct.unpack(f">{segcount}H", fp.read(2 * segcount)),
        )
        idds = cast(
            tuple[int, ...],
            struct.unpack(f">{segcount}h", fp.read(2 * segcount)),
        )
        pos = fp.tell()  # file position of idRangeOffset[0]
        idrs = cast(
            tuple[int, ...],
            struct.unpack(f">{segcount}H", fp.read(2 * segcount)),
        )
        segments = zip(ecs, scs, idds, idrs, strict=False)
        for seg, (ec, sc, idd, idr) in enumerate(segments):
            if idr:
                # idRangeOffset is relative to the address of this segment's
                # own idRangeOffset entry, i.e. pos + 2 * seg, not to the
                # start of the array.
                fp.seek(pos + 2 * seg + idr)
                for c in range(sc, ec + 1):
                    b = cast(tuple[int], struct.unpack(">H", fp.read(2)))[0]
                    # A stored 0 means "missing glyph"; idDelta is only
                    # applied to non-zero values.
                    char2gid[c] = (b + idd) & 0xFFFF if b else 0
            else:
                for c in range(sc, ec + 1):
                    char2gid[c] = (c + idd) & 0xFFFF

847 

848 

class PDFFontError(PDFException):
    """Raised by the font classes in this module for invalid font specifications."""

    pass

851 

852 

class PDFUnicodeNotDefined(PDFFontError):
    """Raised by ``to_unichr`` when a character id has no unicode mapping."""

    pass

855 

856 

# Name literal used as the default encoding by PDFSimpleFont.
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
# Name literal for "Type1C"; not referenced elsewhere in the visible part of
# this module.
LITERAL_TYPE1C = LIT("Type1C")

# Font widths are maintained in a dict type that maps from *either* unicode
# chars or integer character IDs.
FontWidthDict = dict[int | str, float]

863 

864 

class PDFFont:
    """Base class holding descriptor-derived metrics shared by all fonts.

    Subclasses must provide ``to_unichr``; the remaining methods have
    defaults suitable for a horizontal single-byte font.
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: float | None = None,
    ) -> None:
        """Read the common metrics out of *descriptor*.

        :param descriptor: font descriptor; missing keys fall back to 0
            (``"unknown"`` for ``FontName``).
        :param widths: width table, resolved with ``resolve_all``.
        :param default_width: width used for unlisted characters; when
            ``None`` the descriptor's ``MissingWidth`` is used.
        """
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)

        fontname = resolve1(descriptor.get("FontName", "unknown"))
        self.fontname = fontname
        if isinstance(fontname, PSLiteral):
            self.fontname = literal_name(fontname)

        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))

        if default_width is not None:
            self.default_width = default_width
        else:
            self.default_width = num_value(descriptor.get("MissingWidth", 0))
        self.default_width = resolve1(self.default_width)

        self.leading = num_value(descriptor.get("Leading", 0))
        self.bbox = self._parse_bbox(descriptor)
        # Scale from glyph units to text space units.
        self.hscale = self.vscale = 0.001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        self.descent = -self.descent if self.descent > 0 else self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        """Return False: the base font writes horizontally."""
        return False

    def is_multibyte(self) -> bool:
        """Return False: the base font uses one byte per character."""
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split *bytes* into character ids, one per byte."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.vscale * self.ascent

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.vscale * self.descent

    def get_width(self) -> float:
        """Return the scaled bounding-box width.

        A zero-width box falls back to the negated default width.
        NOTE(review): the negation is kept as found; confirm it is intended.
        """
        extent = self.bbox[2] - self.bbox[0]
        if extent == 0:
            extent = -self.default_width
        return extent * self.hscale

    def get_height(self) -> float:
        """Return the scaled bounding-box height, or ascent - descent if zero."""
        extent = self.bbox[3] - self.bbox[1]
        if extent == 0:
            extent = self.ascent - self.descent
        return extent * self.vscale

    def char_width(self, cid: int) -> float:
        """Return the scaled width of character *cid*.

        The width table is consulted by character id first, then by the
        character's unicode string; ``default_width`` is the final fallback.
        """
        # Because character widths may be mapping either IDs or strings,
        # we try to lookup the character ID first, then its str equivalent.
        width = safe_float(self.widths.get(cid))
        if width is None:
            try:
                width = safe_float(self.widths.get(self.to_unichr(cid)))
            except PDFUnicodeNotDefined:
                width = None
        if width is None:
            width = self.default_width
        return width * self.hscale

    def char_disp(self, cid: int) -> float | tuple[float | None, float]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Return the summed ``char_width`` of every character decoded from *s*."""
        return sum(map(self.char_width, self.decode(s)))

    def to_unichr(self, cid: int) -> str:
        """Map *cid* to its unicode string; must be provided by subclasses."""
        raise NotImplementedError

    @staticmethod
    def _parse_bbox(descriptor: Mapping[str, Any]) -> Rect:
        """Parse FontBBox from the fonts descriptor"""
        font_bbox = resolve_all(descriptor.get("FontBBox"))
        parsed = safe_rect_list(font_bbox)
        if parsed is not None:
            return parsed
        log.warning(
            f"Could not get FontBBox from font descriptor because "
            f"{font_bbox!r} cannot be parsed as 4 floats"
        )
        return 0.0, 0.0, 0.0, 0.0

969 

970 

class PDFSimpleFont(PDFFont):
    """Single-byte font whose codes map to unicode through an encoding.

    ``to_unichr`` prefers the ``ToUnicode`` CMap when the font has one and
    falls back to the encoding table otherwise.
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        spec: Mapping[str, Any],
    ) -> None:
        """Build the code-to-unicode tables from *spec*, then the base metrics.

        :param descriptor: forwarded to ``PDFFont.__init__``.
        :param widths: forwarded to ``PDFFont.__init__``.
        :param spec: font dictionary; ``Subtype``, ``Encoding`` and
            ``ToUnicode`` are read here.
        """
        # Font encoding is specified either by a name of
        # built-in encoding or a dictionary that describes
        # the differences.
        if literal_name(spec.get("Subtype")) == "TrueType":
            # PDF spec: TrueType fonts without Encoding default to WinAnsiEncoding
            default_encoding = LIT("WinAnsiEncoding")
        else:
            default_encoding = LITERAL_STANDARD_ENCODING

        if "Encoding" in spec:
            encoding = resolve1(spec["Encoding"])
        else:
            encoding = default_encoding

        if isinstance(encoding, dict):
            base_name = literal_name(encoding.get("BaseEncoding", default_encoding))
            differences = list_value(encoding.get("Differences", []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)
        else:
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))

        self.unicode_map: UnicodeMap | None = None
        if "ToUnicode" in spec:
            to_unicode = stream_value(spec["ToUnicode"])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, BytesIO(to_unicode.get_data())).run()
        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode string for *cid*.

        :raises PDFUnicodeNotDefined: if neither the ToUnicode map nor the
            encoding table knows *cid*.
        """
        if self.unicode_map:
            with contextlib.suppress(KeyError):
                return self.unicode_map.get_unichr(cid)
        try:
            return self.cid2unicode[cid]
        except KeyError as err:
            raise PDFUnicodeNotDefined(None, cid) from err

1015 

1016 

class PDFType1Font(PDFSimpleFont):
    """Type 1 simple font.

    Metrics come from ``FontMetricsDB`` when the base font name is known
    there, otherwise from the font dictionary itself.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Load the font described by *spec*.

        :param rsrcmgr: not used by this constructor.
        :param spec: font dictionary.
        :raises PDFFontError: if ``BaseFont`` is missing and
            ``settings.STRICT`` is set.
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing") from None
            self.basefont = "unknown"

        widths: FontWidthDict
        try:
            (descriptor, int_widths) = FontMetricsDB.get_metrics(self.basefont)
        except KeyError:
            # No bundled metrics: take descriptor and widths from the spec.
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            firstchar = int_value(spec.get("FirstChar", 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            width_list = list_value(spec.get("Widths", [0] * 256))
            widths = {
                code: resolve1(width)
                for code, width in enumerate(width_list, start=firstchar)
            }
        else:
            widths = cast(dict[str | int, float], int_widths)  # implicit int->float
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        if "Encoding" in spec or "FontFile" not in descriptor:
            return
        # try to recover the missing encoding info from the font file.
        self.fontfile = stream_value(descriptor.get("FontFile"))
        length1 = int_value(self.fontfile["Length1"])
        header = self.fontfile.get_data()[:length1]
        self.cid2unicode = Type1FontHeaderParser(BytesIO(header)).get_encoding()

    def __repr__(self) -> str:
        return f"<PDFType1Font: basefont={self.basefont!r}>"

1047 

1048 

class PDFTrueTypeFont(PDFType1Font):
    """TrueType simple font; inherits all loading from PDFType1Font and only
    overrides the repr."""

    def __repr__(self) -> str:
        return f"<PDFTrueTypeFont: basefont={self.basefont!r}>"

1052 

1053 

class PDFType3Font(PDFSimpleFont):
    """Type 3 simple font.

    Ascent and descent are taken from the font bounding box, and the
    horizontal/vertical scales from the font matrix.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Load the font described by *spec*.

        :param rsrcmgr: not used by this constructor.
        :param spec: font dictionary; ``FontBBox`` is required when there is
            no ``FontDescriptor``.
        """
        firstchar = int_value(spec.get("FirstChar", 0))
        # lastchar = int_value(spec.get('LastChar', 0))
        width_list = list_value(spec.get("Widths", [0] * 256))
        widths: dict[str | int, float] = dict(enumerate(width_list, start=firstchar))

        descriptor = (
            dict_value(spec["FontDescriptor"])
            if "FontDescriptor" in spec
            else {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
        )
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix"))))
        # Vertical extent comes straight from the bounding box.
        _x0, y0, _x1, y1 = self.bbox
        self.descent = y0
        self.ascent = y1
        scales = apply_matrix_norm(self.matrix, (1, 1))
        (self.hscale, self.vscale) = scales

    def __repr__(self) -> str:
        return "<PDFType3Font>"

1073 

1074 

class PDFCIDFont(PDFFont):
    """Multi-byte (CID-keyed) font.

    Bytes are decoded to character ids through a CMap, and character ids are
    mapped to unicode through ``self.unicode_map`` when one could be
    determined.
    """

    # Displacement returned by char_disp for characters without an entry in
    # self.disps: 0 for horizontal fonts, (None, vy) for vertical fonts.
    default_disp: float | tuple[float | None, float]

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        spec: Mapping[str, Any],
        strict: bool = settings.STRICT,
    ) -> None:
        """Load the font described by *spec*.

        :param rsrcmgr: not used by this constructor.
        :param spec: font dictionary.
        :param strict: when true, a missing ``BaseFont``, ``FontDescriptor``,
            encoding or CMap raises ``PDFFontError`` instead of falling back
            to a default.
        :raises PDFFontError: see *strict*.
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if strict:
                raise PDFFontError("BaseFont is missing") from None
            self.basefont = "unknown"
        self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
        cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode(
            "latin1",
        )
        cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode(
            "latin1",
        )
        self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
        self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec["FontDescriptor"])
        except KeyError:
            if strict:
                raise PDFFontError("FontDescriptor is missing") from None
            descriptor = {}
        ttf = None
        if "FontFile2" in descriptor:
            self.fontfile = stream_value(descriptor.get("FontFile2"))
            ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))
        self.unicode_map: UnicodeMap | None = None
        if "ToUnicode" in spec:
            if isinstance(spec["ToUnicode"], PDFStream):
                strm = stream_value(spec["ToUnicode"])
                self.unicode_map = FileUnicodeMap()
                CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
            else:
                cmap_name = literal_name(spec["ToUnicode"])
                # Encoding may be absent here: in non-strict mode
                # _get_cmap_name() tolerates that, so do not index spec
                # unconditionally. An absent Encoding contributes nothing.
                encoding = (
                    literal_name(spec["Encoding"]) if "Encoding" in spec else ""
                )
                if (
                    "Identity" in cid_ordering
                    or "Identity" in cmap_name
                    or "Identity" in encoding
                ):
                    self.unicode_map = IdentityUnicodeMap()
        elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
            if ttf:
                with contextlib.suppress(TrueTypeFont.CMapNotFound):
                    self.unicode_map = ttf.create_unicode_map()
        else:
            with contextlib.suppress(CMapDB.CMapNotFound):
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding,
                    self.cmap.is_vertical(),
                )

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            widths2 = get_widths2(list_value(spec.get("W2", [])))
            self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()}
            (vy, w) = resolve1(spec.get("DW2", [880, -1000]))
            self.default_disp = (None, vy)
            widths: dict[str | int, float] = {
                cid: w for (cid, (w, _)) in widths2.items()
            }
            # w here is the second DW2 value; the comprehension's own w does
            # not leak out of it.
            default_width = w
        else:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get("W", [])))
            default_width = spec.get("DW", 1000)
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.

        :raises PDFFontError: if *strict* and the CMap cannot be found.
        :returns: the CMap, or an empty ``CMap()`` if it is not found and
            *strict* is false.
        """
        cmap_name = self._get_cmap_name(spec, strict)

        try:
            return CMapDB.get_cmap(cmap_name)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e) from e
            return CMap()

    @staticmethod
    def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
        """Get cmap name from font specification"""
        cmap_name = "unknown"  # default value

        try:
            spec_encoding = spec["Encoding"]
            if hasattr(spec_encoding, "name"):
                cmap_name = literal_name(spec["Encoding"])
            else:
                cmap_name = literal_name(spec_encoding["CMapName"])
        except KeyError:
            if strict:
                raise PDFFontError("Encoding is unspecified") from None

        # NOTE(review): cmap_name comes from literal_name() or the "unknown"
        # default above, so this branch may be unreachable; confirm.
        if type(cmap_name) is PDFStream:  # type: ignore[comparison-overlap]
            cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
            if "CMapName" in cmap_name_stream:
                cmap_name = cmap_name_stream.get("CMapName").name
            elif strict:
                raise PDFFontError("CMapName unspecified for encoding")

        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

    def __repr__(self) -> str:
        return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"

    def is_vertical(self) -> bool:
        """Return whether the font's CMap selects vertical writing."""
        return self.vertical

    def is_multibyte(self) -> bool:
        """Return True: CID fonts may use several bytes per character."""
        return True

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split *bytes* into character ids using the font's CMap."""
        return self.cmap.decode(bytes)

    def char_disp(self, cid: int) -> float | tuple[float | None, float]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode string for *cid*.

        :raises PDFUnicodeNotDefined: if there is no unicode map or it has no
            entry for *cid*.
        """
        try:
            if not self.unicode_map:
                raise PDFKeyError(cid)
            return self.unicode_map.get_unichr(cid)
        except KeyError as err:
            raise PDFUnicodeNotDefined(self.cidcoding, cid) from err