Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdffont.py: 67%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

534 statements  

1import logging 

2import struct 

3from io import BytesIO 

4from typing import ( 

5 TYPE_CHECKING, 

6 Any, 

7 BinaryIO, 

8 Dict, 

9 Iterable, 

10 Iterator, 

11 List, 

12 Mapping, 

13 Optional, 

14 Tuple, 

15 Union, 

16 cast, 

17) 

18 

19from pdfminer import settings 

20from pdfminer.casting import safe_float, safe_rect_list 

21from pdfminer.cmapdb import ( 

22 CMap, 

23 CMapBase, 

24 CMapDB, 

25 CMapParser, 

26 FileUnicodeMap, 

27 IdentityUnicodeMap, 

28 UnicodeMap, 

29) 

30from pdfminer.encodingdb import EncodingDB, name2unicode 

31from pdfminer.fontmetrics import FONT_METRICS 

32from pdfminer.pdfexceptions import PDFException, PDFKeyError, PDFValueError 

33from pdfminer.pdftypes import ( 

34 PDFStream, 

35 dict_value, 

36 int_value, 

37 list_value, 

38 num_value, 

39 resolve1, 

40 resolve_all, 

41 stream_value, 

42) 

43from pdfminer.psexceptions import PSEOF 

44from pdfminer.psparser import ( 

45 KWD, 

46 LIT, 

47 PSKeyword, 

48 PSLiteral, 

49 PSStackParser, 

50 literal_name, 

51) 

52from pdfminer.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack 

53 

54if TYPE_CHECKING: 

55 from pdfminer.pdfinterp import PDFResourceManager 

56 

57log = logging.getLogger(__name__) 

58 

59 

def get_widths(seq: Iterable[object]) -> Dict[Union[str, int], float]:
    """Collect horizontal character widths from a width-specification sequence.

    Every element is passed through ``resolve1`` first.  Two layouts are
    recognised:

    * ``c [w0 w1 ...]`` -- a number followed by a list: the i-th list entry is
      stored under the key ``c + i``.
    * ``c_first c_last w`` -- three numbers: ``w`` is stored for every code
      from ``c_first`` to ``c_last`` inclusive, provided both bounds are ints.

    Elements that are neither numbers nor lists, and three-number groups with
    non-int bounds, are reported through ``log.warning`` and skipped.

    :param seq: the raw width specification
    :returns: mapping of character code to width
    """
    result: Dict[int, float] = {}
    pending: List[float] = []
    for item in seq:
        item = resolve1(item)

        if isinstance(item, list):
            if not pending:
                # A list that is not preceded by a starting code is ignored.
                continue
            start = cast(int, pending[-1])
            result.update(
                (start + offset, width) for offset, width in enumerate(item)
            )
            pending = []
            continue

        if not isinstance(item, (int, float)):  # == not utils.isnumber(item)
            log.warning(
                f"Skipping invalid font width specification for {item} because it is not a number or a list"
            )
            continue

        pending.append(item)
        if len(pending) < 3:
            continue

        # Three plain numbers in a row: first code, last code, shared width.
        (char1, char2, w) = pending
        pending = []
        if isinstance(char1, int) and isinstance(char2, int):
            result.update(dict.fromkeys(range(char1, char2 + 1), w))
        else:
            log.warning(
                f"Skipping invalid font width specification for {char1} to {char2} because either of them is not an int"
            )
    return cast(Dict[Union[str, int], float], result)

89 

90 

def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    Mirrors :func:`get_widths`, but every entry carries three values: a width
    ``w`` and a vector ``(vx, vy)``.  Two layouts are recognised:

    * ``c [w0 vx0 vy0 w1 vx1 vy1 ...]`` -- a number followed by a list; the
      list is split with ``choplist(3, ...)`` and the i-th group is stored
      under the key ``c + i``.
    * ``c_first c_last w vx vy`` -- five numbers; the same entry is stored for
      every code from ``c_first`` to ``c_last`` inclusive.

    Every element is passed through ``resolve1`` first, as ``get_widths``
    does.  Invalid elements are logged and skipped instead of raising.

    :param seq: the raw vertical width specification
    :returns: mapping of character code to ``(w, (vx, vy))``
    """
    widths: Dict[int, Tuple[float, Point]] = {}
    r: List[float] = []
    for v in seq:
        # Entries may be indirect references, exactly as in get_widths().
        v = resolve1(v)
        if isinstance(v, list):
            if r:
                char1 = r[-1]
                for i, (w, vx, vy) in enumerate(choplist(3, v)):
                    widths[cast(int, char1) + i] = (w, (vx, vy))
                r = []
        elif isinstance(v, (int, float)):  # == utils.isnumber(v)
            r.append(v)
            if len(r) == 5:
                (char1, char2, w, vx, vy) = r
                if isinstance(char1, int) and isinstance(char2, int):
                    for i in range(char1, char2 + 1):
                        widths[i] = (w, (vx, vy))
                else:
                    # range() would raise TypeError on float bounds.
                    log.warning(
                        f"Skipping invalid font width specification for {char1} to {char2} because either of them is not an int"
                    )
                r = []
        else:
            log.warning(
                f"Skipping invalid font width specification for {v} because it is not a number or a list"
            )
    return widths

110 

111 

class FontMetricsDB:
    """Lookup facade over the ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname: str) -> Tuple[Dict[str, object], Dict[str, int]]:
        """Return the ``FONT_METRICS`` entry stored under ``fontname``.

        The lookup is a plain subscript, so whatever ``FONT_METRICS`` raises
        for an unknown name propagates to the caller.
        """
        metrics = FONT_METRICS[fontname]
        return metrics

116 

117 

# The ``int`` type argument means PSStackParser is not extended with any
# additional token types here.
class Type1FontHeaderParser(PSStackParser[int]):
    """Stack parser that collects ``<code> <name> put`` pairs from a font header."""

    KEYWORD_BEGIN = KWD(b"begin")
    KEYWORD_END = KWD(b"end")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_PUT = KWD(b"put")
    KEYWORD_DICT = KWD(b"dict")
    KEYWORD_ARRAY = KWD(b"array")
    KEYWORD_READONLY = KWD(b"readonly")
    KEYWORD_FOR = KWD(b"for")

    def __init__(self, data: BinaryIO) -> None:
        PSStackParser.__init__(self, data)
        # Filled in by get_encoding(): character code -> unicode string.
        self._cid2unicode: Dict[int, str] = {}

    def get_encoding(self) -> Dict[int, str]:
        """Parse the font encoding.

        The Type1 font encoding maps character codes to character names.
        These names are either standard Adobe glyph names or names tied to
        custom CharStrings of this font (a CharString is a sequence of
        operations describing how the character is drawn).  Names that
        ``name2unicode`` cannot translate raise ``KeyError`` there; such
        entries are logged at debug level and left out of the result.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode characters
        """
        mapping = self._cid2unicode
        while True:
            try:
                entry = self.nextobject()
            except PSEOF:
                # End of input: everything collected so far is the answer.
                return mapping
            (cid, name) = entry
            try:
                mapping[cid] = name2unicode(cast(str, name))
            except KeyError as e:
                log.debug(str(e))

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Record an ``(int, literal)`` pair whenever a ``put`` keyword is seen."""
        if token is not self.KEYWORD_PUT:
            return
        ((_, key), (_, value)) = self.pop(2)
        if isinstance(key, int) and isinstance(value, PSLiteral):
            self.add_results((key, literal_name(value)))

163 

164 

# Text fragment for each 4-bit nibble of a packed real number, indexed by the
# nibble value.  Entry 13 has no text (None); nibble 15 is the terminator and
# therefore has no entry at all.
NIBBLES = (*"0123456789", ".", "e", "e-", None, "-")

# Mapping of cmap names; a cmap name that is not a key here is kept as-is.
# (There is no recorded reference for why DLIdent is mapped to Identity.)
IDENTITY_ENCODER = {
    f"DLIdent-{mode}": f"Identity-{mode}" for mode in ("H", "V")
}

173 

174 

def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]:
    """Parse DICT data into a mapping of operator byte to operand list.

    The data is a stream of operands followed by an operator.  Operands are
    collected on a stack; when a byte ``<= 21`` (an operator) is read, the
    stack is stored under that byte and a fresh stack is started.  Operands
    left over without a trailing operator are discarded.

    Operand encodings, selected by the first byte ``b0``:

    * ``30``       packed real number, two nibbles per byte, ended by nibble 15
    * ``32..246``  small integer ``b0 - 139``
    * ``247..250`` positive two-byte integer
    * ``251..254`` negative two-byte integer
    * ``28``       signed 16-bit integer
    * otherwise    signed 32-bit integer

    NOTE(review): byte 12 is stored like any other operator; if it is meant to
    introduce a two-byte operator, the second byte is not consumed here --
    confirm against the CFF specification.

    :param data: raw DICT bytes
    :returns: ``{operator: [operands, ...]}``
    :raises PDFValueError: if the data ends in the middle of an operand or a
        real number is malformed
    """
    d: Dict[int, List[Union[float, int]]] = {}
    fp = BytesIO(data)
    stack: List[Union[float, int]] = []

    def read_exact(n: int) -> bytes:
        # Fail with a clear error instead of ord(b"") / struct.error later on.
        chunk = fp.read(n)
        if len(chunk) != n:
            raise PDFValueError("truncated DICT data: operand is incomplete")
        return chunk

    while True:
        c = fp.read(1)
        if not c:
            break
        b0 = c[0]
        if b0 <= 21:
            d[b0] = stack
            stack = []
            continue
        value: Union[float, int]
        if b0 == 30:
            digits: List[str] = []
            done = False
            while not done:
                b = read_exact(1)[0]
                for n in (b >> 4, b & 15):
                    if n == 15:
                        # Terminator: anything after it in this byte is padding.
                        done = True
                        break
                    nibble = NIBBLES[n]
                    if nibble is None:
                        raise PDFValueError(
                            "invalid nibble %d in real number of DICT data" % n
                        )
                    digits.append(nibble)
            s = "".join(digits)
            try:
                value = float(s)
            except ValueError as err:
                raise PDFValueError(
                    "invalid real number %r in DICT data" % s
                ) from err
        elif 32 <= b0 <= 246:
            value = b0 - 139
        else:
            b1 = read_exact(1)[0]
            if 247 <= b0 <= 250:
                value = ((b0 - 247) << 8) + b1 + 108
            elif 251 <= b0 <= 254:
                value = -((b0 - 251) << 8) - b1 - 108
            else:
                b2 = read_exact(1)[0]
                if b1 >= 128:
                    # The leading byte carries the sign of the integer.
                    b1 -= 256
                if b0 == 28:
                    value = b1 << 8 | b2
                else:
                    value = b1 << 24 | b2 << 16 | struct.unpack(">H", read_exact(2))[0]
        stack.append(value)
    return d

219 

220 

221class CFFFont: 

222 STANDARD_STRINGS = ( 

223 ".notdef", 

224 "space", 

225 "exclam", 

226 "quotedbl", 

227 "numbersign", 

228 "dollar", 

229 "percent", 

230 "ampersand", 

231 "quoteright", 

232 "parenleft", 

233 "parenright", 

234 "asterisk", 

235 "plus", 

236 "comma", 

237 "hyphen", 

238 "period", 

239 "slash", 

240 "zero", 

241 "one", 

242 "two", 

243 "three", 

244 "four", 

245 "five", 

246 "six", 

247 "seven", 

248 "eight", 

249 "nine", 

250 "colon", 

251 "semicolon", 

252 "less", 

253 "equal", 

254 "greater", 

255 "question", 

256 "at", 

257 "A", 

258 "B", 

259 "C", 

260 "D", 

261 "E", 

262 "F", 

263 "G", 

264 "H", 

265 "I", 

266 "J", 

267 "K", 

268 "L", 

269 "M", 

270 "N", 

271 "O", 

272 "P", 

273 "Q", 

274 "R", 

275 "S", 

276 "T", 

277 "U", 

278 "V", 

279 "W", 

280 "X", 

281 "Y", 

282 "Z", 

283 "bracketleft", 

284 "backslash", 

285 "bracketright", 

286 "asciicircum", 

287 "underscore", 

288 "quoteleft", 

289 "a", 

290 "b", 

291 "c", 

292 "d", 

293 "e", 

294 "f", 

295 "g", 

296 "h", 

297 "i", 

298 "j", 

299 "k", 

300 "l", 

301 "m", 

302 "n", 

303 "o", 

304 "p", 

305 "q", 

306 "r", 

307 "s", 

308 "t", 

309 "u", 

310 "v", 

311 "w", 

312 "x", 

313 "y", 

314 "z", 

315 "braceleft", 

316 "bar", 

317 "braceright", 

318 "asciitilde", 

319 "exclamdown", 

320 "cent", 

321 "sterling", 

322 "fraction", 

323 "yen", 

324 "florin", 

325 "section", 

326 "currency", 

327 "quotesingle", 

328 "quotedblleft", 

329 "guillemotleft", 

330 "guilsinglleft", 

331 "guilsinglright", 

332 "fi", 

333 "fl", 

334 "endash", 

335 "dagger", 

336 "daggerdbl", 

337 "periodcentered", 

338 "paragraph", 

339 "bullet", 

340 "quotesinglbase", 

341 "quotedblbase", 

342 "quotedblright", 

343 "guillemotright", 

344 "ellipsis", 

345 "perthousand", 

346 "questiondown", 

347 "grave", 

348 "acute", 

349 "circumflex", 

350 "tilde", 

351 "macron", 

352 "breve", 

353 "dotaccent", 

354 "dieresis", 

355 "ring", 

356 "cedilla", 

357 "hungarumlaut", 

358 "ogonek", 

359 "caron", 

360 "emdash", 

361 "AE", 

362 "ordfeminine", 

363 "Lslash", 

364 "Oslash", 

365 "OE", 

366 "ordmasculine", 

367 "ae", 

368 "dotlessi", 

369 "lslash", 

370 "oslash", 

371 "oe", 

372 "germandbls", 

373 "onesuperior", 

374 "logicalnot", 

375 "mu", 

376 "trademark", 

377 "Eth", 

378 "onehalf", 

379 "plusminus", 

380 "Thorn", 

381 "onequarter", 

382 "divide", 

383 "brokenbar", 

384 "degree", 

385 "thorn", 

386 "threequarters", 

387 "twosuperior", 

388 "registered", 

389 "minus", 

390 "eth", 

391 "multiply", 

392 "threesuperior", 

393 "copyright", 

394 "Aacute", 

395 "Acircumflex", 

396 "Adieresis", 

397 "Agrave", 

398 "Aring", 

399 "Atilde", 

400 "Ccedilla", 

401 "Eacute", 

402 "Ecircumflex", 

403 "Edieresis", 

404 "Egrave", 

405 "Iacute", 

406 "Icircumflex", 

407 "Idieresis", 

408 "Igrave", 

409 "Ntilde", 

410 "Oacute", 

411 "Ocircumflex", 

412 "Odieresis", 

413 "Ograve", 

414 "Otilde", 

415 "Scaron", 

416 "Uacute", 

417 "Ucircumflex", 

418 "Udieresis", 

419 "Ugrave", 

420 "Yacute", 

421 "Ydieresis", 

422 "Zcaron", 

423 "aacute", 

424 "acircumflex", 

425 "adieresis", 

426 "agrave", 

427 "aring", 

428 "atilde", 

429 "ccedilla", 

430 "eacute", 

431 "ecircumflex", 

432 "edieresis", 

433 "egrave", 

434 "iacute", 

435 "icircumflex", 

436 "idieresis", 

437 "igrave", 

438 "ntilde", 

439 "oacute", 

440 "ocircumflex", 

441 "odieresis", 

442 "ograve", 

443 "otilde", 

444 "scaron", 

445 "uacute", 

446 "ucircumflex", 

447 "udieresis", 

448 "ugrave", 

449 "yacute", 

450 "ydieresis", 

451 "zcaron", 

452 "exclamsmall", 

453 "Hungarumlautsmall", 

454 "dollaroldstyle", 

455 "dollarsuperior", 

456 "ampersandsmall", 

457 "Acutesmall", 

458 "parenleftsuperior", 

459 "parenrightsuperior", 

460 "twodotenleader", 

461 "onedotenleader", 

462 "zerooldstyle", 

463 "oneoldstyle", 

464 "twooldstyle", 

465 "threeoldstyle", 

466 "fouroldstyle", 

467 "fiveoldstyle", 

468 "sixoldstyle", 

469 "sevenoldstyle", 

470 "eightoldstyle", 

471 "nineoldstyle", 

472 "commasuperior", 

473 "threequartersemdash", 

474 "periodsuperior", 

475 "questionsmall", 

476 "asuperior", 

477 "bsuperior", 

478 "centsuperior", 

479 "dsuperior", 

480 "esuperior", 

481 "isuperior", 

482 "lsuperior", 

483 "msuperior", 

484 "nsuperior", 

485 "osuperior", 

486 "rsuperior", 

487 "ssuperior", 

488 "tsuperior", 

489 "ff", 

490 "ffi", 

491 "ffl", 

492 "parenleftinferior", 

493 "parenrightinferior", 

494 "Circumflexsmall", 

495 "hyphensuperior", 

496 "Gravesmall", 

497 "Asmall", 

498 "Bsmall", 

499 "Csmall", 

500 "Dsmall", 

501 "Esmall", 

502 "Fsmall", 

503 "Gsmall", 

504 "Hsmall", 

505 "Ismall", 

506 "Jsmall", 

507 "Ksmall", 

508 "Lsmall", 

509 "Msmall", 

510 "Nsmall", 

511 "Osmall", 

512 "Psmall", 

513 "Qsmall", 

514 "Rsmall", 

515 "Ssmall", 

516 "Tsmall", 

517 "Usmall", 

518 "Vsmall", 

519 "Wsmall", 

520 "Xsmall", 

521 "Ysmall", 

522 "Zsmall", 

523 "colonmonetary", 

524 "onefitted", 

525 "rupiah", 

526 "Tildesmall", 

527 "exclamdownsmall", 

528 "centoldstyle", 

529 "Lslashsmall", 

530 "Scaronsmall", 

531 "Zcaronsmall", 

532 "Dieresissmall", 

533 "Brevesmall", 

534 "Caronsmall", 

535 "Dotaccentsmall", 

536 "Macronsmall", 

537 "figuredash", 

538 "hypheninferior", 

539 "Ogoneksmall", 

540 "Ringsmall", 

541 "Cedillasmall", 

542 "questiondownsmall", 

543 "oneeighth", 

544 "threeeighths", 

545 "fiveeighths", 

546 "seveneighths", 

547 "onethird", 

548 "twothirds", 

549 "zerosuperior", 

550 "foursuperior", 

551 "fivesuperior", 

552 "sixsuperior", 

553 "sevensuperior", 

554 "eightsuperior", 

555 "ninesuperior", 

556 "zeroinferior", 

557 "oneinferior", 

558 "twoinferior", 

559 "threeinferior", 

560 "fourinferior", 

561 "fiveinferior", 

562 "sixinferior", 

563 "seveninferior", 

564 "eightinferior", 

565 "nineinferior", 

566 "centinferior", 

567 "dollarinferior", 

568 "periodinferior", 

569 "commainferior", 

570 "Agravesmall", 

571 "Aacutesmall", 

572 "Acircumflexsmall", 

573 "Atildesmall", 

574 "Adieresissmall", 

575 "Aringsmall", 

576 "AEsmall", 

577 "Ccedillasmall", 

578 "Egravesmall", 

579 "Eacutesmall", 

580 "Ecircumflexsmall", 

581 "Edieresissmall", 

582 "Igravesmall", 

583 "Iacutesmall", 

584 "Icircumflexsmall", 

585 "Idieresissmall", 

586 "Ethsmall", 

587 "Ntildesmall", 

588 "Ogravesmall", 

589 "Oacutesmall", 

590 "Ocircumflexsmall", 

591 "Otildesmall", 

592 "Odieresissmall", 

593 "OEsmall", 

594 "Oslashsmall", 

595 "Ugravesmall", 

596 "Uacutesmall", 

597 "Ucircumflexsmall", 

598 "Udieresissmall", 

599 "Yacutesmall", 

600 "Thornsmall", 

601 "Ydieresissmall", 

602 "001.000", 

603 "001.001", 

604 "001.002", 

605 "001.003", 

606 "Black", 

607 "Bold", 

608 "Book", 

609 "Light", 

610 "Medium", 

611 "Regular", 

612 "Roman", 

613 "Semibold", 

614 ) 

615 

616 class INDEX: 

617 def __init__(self, fp: BinaryIO) -> None: 

618 self.fp = fp 

619 self.offsets: List[int] = [] 

620 (count, offsize) = struct.unpack(">HB", self.fp.read(3)) 

621 for i in range(count + 1): 

622 self.offsets.append(nunpack(self.fp.read(offsize))) 

623 self.base = self.fp.tell() - 1 

624 self.fp.seek(self.base + self.offsets[-1]) 

625 

626 def __repr__(self) -> str: 

627 return "<INDEX: size=%d>" % len(self) 

628 

629 def __len__(self) -> int: 

630 return len(self.offsets) - 1 

631 

632 def __getitem__(self, i: int) -> bytes: 

633 self.fp.seek(self.base + self.offsets[i]) 

634 return self.fp.read(self.offsets[i + 1] - self.offsets[i]) 

635 

636 def __iter__(self) -> Iterator[bytes]: 

637 return iter(self[i] for i in range(len(self))) 

638 

639 def __init__(self, name: str, fp: BinaryIO) -> None: 

640 self.name = name 

641 self.fp = fp 

642 # Header 

643 (_major, _minor, hdrsize, offsize) = struct.unpack("BBBB", self.fp.read(4)) 

644 self.fp.read(hdrsize - 4) 

645 # Name INDEX 

646 self.name_index = self.INDEX(self.fp) 

647 # Top DICT INDEX 

648 self.dict_index = self.INDEX(self.fp) 

649 # String INDEX 

650 self.string_index = self.INDEX(self.fp) 

651 # Global Subr INDEX 

652 self.subr_index = self.INDEX(self.fp) 

653 # Top DICT DATA 

654 self.top_dict = getdict(self.dict_index[0]) 

655 (charset_pos,) = self.top_dict.get(15, [0]) 

656 (encoding_pos,) = self.top_dict.get(16, [0]) 

657 (charstring_pos,) = self.top_dict.get(17, [0]) 

658 # CharStrings 

659 self.fp.seek(cast(int, charstring_pos)) 

660 self.charstring = self.INDEX(self.fp) 

661 self.nglyphs = len(self.charstring) 

662 # Encodings 

663 self.code2gid = {} 

664 self.gid2code = {} 

665 self.fp.seek(cast(int, encoding_pos)) 

666 format = self.fp.read(1) 

667 if format == b"\x00": 

668 # Format 0 

669 (n,) = struct.unpack("B", self.fp.read(1)) 

670 for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))): 

671 self.code2gid[code] = gid 

672 self.gid2code[gid] = code 

673 elif format == b"\x01": 

674 # Format 1 

675 (n,) = struct.unpack("B", self.fp.read(1)) 

676 code = 0 

677 for i in range(n): 

678 (first, nleft) = struct.unpack("BB", self.fp.read(2)) 

679 for gid in range(first, first + nleft + 1): 

680 self.code2gid[code] = gid 

681 self.gid2code[gid] = code 

682 code += 1 

683 else: 

684 raise PDFValueError("unsupported encoding format: %r" % format) 

685 # Charsets 

686 self.name2gid = {} 

687 self.gid2name = {} 

688 self.fp.seek(cast(int, charset_pos)) 

689 format = self.fp.read(1) 

690 if format == b"\x00": 

691 # Format 0 

692 n = self.nglyphs - 1 

693 for gid, sid in enumerate( 

694 cast( 

695 Tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n)) 

696 ), 

697 ): 

698 gid += 1 

699 sidname = self.getstr(sid) 

700 self.name2gid[sidname] = gid 

701 self.gid2name[gid] = sidname 

702 elif format == b"\x01": 

703 # Format 1 

704 (n,) = struct.unpack("B", self.fp.read(1)) 

705 sid = 0 

706 for i in range(n): 

707 (first, nleft) = struct.unpack("BB", self.fp.read(2)) 

708 for gid in range(first, first + nleft + 1): 

709 sidname = self.getstr(sid) 

710 self.name2gid[sidname] = gid 

711 self.gid2name[gid] = sidname 

712 sid += 1 

713 elif format == b"\x02": 

714 # Format 2 

715 assert False, str(("Unhandled", format)) 

716 else: 

717 raise PDFValueError("unsupported charset format: %r" % format) 

718 

719 def getstr(self, sid: int) -> Union[str, bytes]: 

720 # This returns str for one of the STANDARD_STRINGS but bytes otherwise, 

721 # and appears to be a needless source of type complexity. 

722 if sid < len(self.STANDARD_STRINGS): 

723 return self.STANDARD_STRINGS[sid] 

724 return self.string_index[sid - len(self.STANDARD_STRINGS)] 

725 

726 

class TrueTypeFont:
    """Reader for the table directory and ``cmap`` table of a font program."""

    class CMapNotFound(PDFException):
        """Raised when no character-to-glyph mapping could be extracted."""

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Read the table directory into ``self.tables``.

        :param name: font name, stored as ``self.name``
        :param fp: seekable binary stream holding the font program
        """
        self.name = name
        self.fp = fp
        # table tag -> (offset, length)
        self.tables: Dict[bytes, Tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = cast(
                Tuple[int, int, int, int],
                struct.unpack(">HHHH", fp.read(8)),
            )
            for _ in range(ntables):
                (name_bytes, _tsum, offset, length) = cast(
                    Tuple[bytes, int, int, int],
                    struct.unpack(">4sLLL", fp.read(16)),
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a glyph-id to character map from the ``cmap`` table.

        Only Unicode subtables (platform 0, or platform 3 with encoding 1 or
        10) in formats 0, 2 and 4 are read.  Subtables in other formats are
        skipped, and truncated data ends the affected subtable early; in both
        cases whatever was decoded elsewhere is still used.

        :raises TrueTypeFont.CMapNotFound: if there is no ``cmap`` table or
            no mapping could be decoded from it
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, _length) = self.tables[b"cmap"]
        fp = self.fp
        subtables: List[Tuple[int, int, int]] = []
        try:
            fp.seek(base_offset)
            (_version, nsubtables) = cast(
                Tuple[int, int], struct.unpack(">HH", fp.read(4))
            )
            for _ in range(nsubtables):
                subtables.append(
                    cast(Tuple[int, int, int], struct.unpack(">HHL", fp.read(8))),
                )
        except struct.error:
            # Same best-effort policy as __init__: keep the records read so far.
            log.warning("Truncated cmap header in TrueType font %r", self.name)
        char2gid: Dict[int, int] = {}
        # Only supports subtable type 0, 2 and 4.
        for platform_id, encoding_id, st_offset in subtables:
            # Skip non-Unicode cmaps.
            # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
            if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
                continue
            try:
                fp.seek(base_offset + st_offset)
                (fmttype, _fmtlen, _fmtlang) = cast(
                    Tuple[int, int, int],
                    struct.unpack(">HHH", fp.read(6)),
                )
                if fmttype == 0:
                    self._read_cmap_format0(fp, char2gid)
                elif fmttype == 2:
                    self._read_cmap_format2(fp, char2gid)
                elif fmttype == 4:
                    self._read_cmap_format4(fp, char2gid)
                else:
                    # Other formats exist in valid fonts; skip them instead
                    # of aborting the whole font with an assertion.
                    log.debug("Skipping unsupported cmap subtable format %r", fmttype)
            except struct.error:
                log.warning(
                    "Truncated cmap subtable in TrueType font %r", self.name
                )
        if not char2gid:
            raise TrueTypeFont.CMapNotFound
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map

    @staticmethod
    def _read_cmap_format0(fp: BinaryIO, char2gid: Dict[int, int]) -> None:
        """Decode a format 0 subtable: 256 one-byte glyph ids indexed by code."""
        char2gid.update(
            enumerate(
                cast(Tuple[int, ...], struct.unpack(">256B", fp.read(256))),
            ),
        )

    @staticmethod
    def _read_cmap_format2(fp: BinaryIO, char2gid: Dict[int, int]) -> None:
        """Decode a format 2 subtable (sub-headers selected by the high byte)."""
        subheaderkeys = cast(
            Tuple[int, ...],
            struct.unpack(">256H", fp.read(512)),
        )
        # Sub-header index -> high byte that selects it.
        firstbytes = [0] * 8192
        for i, k in enumerate(subheaderkeys):
            firstbytes[k // 8] = i
        nhdrs = max(subheaderkeys) // 8 + 1
        hdrs: List[Tuple[int, int, int, int, int]] = []
        for i in range(nhdrs):
            (firstcode, entcount, delta, offset) = cast(
                Tuple[int, int, int, int],
                struct.unpack(">HHhH", fp.read(8)),
            )
            # The offset is measured from its own field, which ended 2 bytes ago.
            hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
        for i, firstcode, entcount, delta, pos in hdrs:
            if not entcount:
                continue
            first = firstcode + (firstbytes[i] << 8)
            fp.seek(pos)
            for c in range(entcount):
                gid = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                if gid:
                    gid += delta
                char2gid[first + c] = gid

    @staticmethod
    def _read_cmap_format4(fp: BinaryIO, char2gid: Dict[int, int]) -> None:
        """Decode a format 4 subtable (segment mapping to delta values)."""
        (segcount, _1, _2, _3) = cast(
            Tuple[int, int, int, int],
            struct.unpack(">HHHH", fp.read(8)),
        )
        segcount //= 2
        ecs = cast(
            Tuple[int, ...],
            struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
        )
        fp.read(2)  # reserved padding between the end codes and start codes
        scs = cast(
            Tuple[int, ...],
            struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
        )
        idds = cast(
            Tuple[int, ...],
            struct.unpack(">%dh" % segcount, fp.read(2 * segcount)),
        )
        pos = fp.tell()
        idrs = cast(
            Tuple[int, ...],
            struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
        )
        for seg, (ec, sc, idd, idr) in enumerate(zip(ecs, scs, idds, idrs)):
            if idr:
                # idRangeOffset counts from the location of this segment's
                # own idRangeOffset entry, not from the start of the array.
                fp.seek(pos + 2 * seg + idr)
                for c in range(sc, ec + 1):
                    b = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                    # 0 marks a missing glyph; the delta applies only to
                    # real glyph ids.
                    char2gid[c] = (b + idd) & 0xFFFF if b else 0
            else:
                for c in range(sc, ec + 1):
                    char2gid[c] = (c + idd) & 0xFFFF

851 

852 

class PDFFontError(PDFException):
    """Error raised by the font classes, e.g. when a required entry is missing."""

855 

856 

class PDFUnicodeNotDefined(PDFFontError):
    """Raised by ``to_unichr`` when a character id has no unicode mapping."""

859 

860 

# Encoding literal that PDFSimpleFont falls back to when a font has no
# explicit Encoding / BaseEncoding entry.
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
LITERAL_TYPE1C = LIT("Type1C")

# Width tables may be keyed by *either* an integer character ID or a unicode
# character, hence the union key type.
FontWidthDict = Dict[Union[int, str], float]

867 

868 

class PDFFont:
    """Base class holding the descriptor metrics and width table of a font."""

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: Optional[float] = None,
    ) -> None:
        """Read font metrics from ``descriptor``.

        :param descriptor: font descriptor; missing entries default to 0
            (``FontName`` defaults to ``"unknown"``)
        :param widths: width table, passed through ``resolve_all``
        :param default_width: width for characters absent from ``widths``;
            when ``None`` the descriptor's ``MissingWidth`` is used
        """
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)
        self.fontname = resolve1(descriptor.get("FontName", "unknown"))
        if isinstance(self.fontname, PSLiteral):
            self.fontname = literal_name(self.fontname)
        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))
        if default_width is None:
            self.default_width = num_value(descriptor.get("MissingWidth", 0))
        else:
            self.default_width = default_width
        self.default_width = resolve1(self.default_width)
        self.leading = num_value(descriptor.get("Leading", 0))
        self.bbox = self._parse_bbox(descriptor)
        # Factors applied to widths and vertical metrics by the getters below.
        self.hscale = self.vscale = 0.001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        """Return whether the font is written vertically; always False here."""
        return False

    def is_multibyte(self) -> bool:
        """Return whether character codes span several bytes; always False here."""
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split a byte string into character ids, one per byte."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self) -> float:
        """Return the scaled bounding-box width.

        When the bounding box has zero width, the negated default width is
        used instead.
        """
        w = self.bbox[2] - self.bbox[0]
        if w == 0:
            w = -self.default_width
        return w * self.hscale

    def get_height(self) -> float:
        """Return the scaled bounding-box height.

        When the bounding box has zero height, ``ascent - descent`` is used
        instead.
        """
        h = self.bbox[3] - self.bbox[1]
        if h == 0:
            h = self.ascent - self.descent
        return h * self.vscale

    def char_width(self, cid: int) -> float:
        """Return the scaled width of character ``cid``.

        Falls back to the default width when neither the id nor its unicode
        equivalent has a usable entry in the width table.
        """
        # Because character widths may be mapping either IDs or strings,
        # we try to lookup the character ID first, then its str equivalent.
        cid_width = safe_float(self.widths.get(cid))
        if cid_width is not None:
            return cid_width * self.hscale

        try:
            str_cid = self.to_unichr(cid)
            cid_width = safe_float(self.widths.get(str_cid))
            if cid_width is not None:
                return cid_width * self.hscale

        except PDFUnicodeNotDefined:
            pass

        return self.default_width * self.hscale

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Return the summed ``char_width`` of every character id decoded from ``s``."""
        return sum(self.char_width(cid) for cid in self.decode(s))

    def to_unichr(self, cid: int) -> str:
        """Map a character id to unicode text; implemented by subclasses."""
        raise NotImplementedError

    @staticmethod
    def _parse_bbox(descriptor: Mapping[str, Any]) -> Rect:
        """Parse FontBBox from the fonts descriptor

        Returns an all-zero rectangle, after logging a warning, when
        ``safe_rect_list`` cannot turn the entry into a rectangle.
        """
        font_bbox = resolve_all(descriptor.get("FontBBox"))
        bbox = safe_rect_list(font_bbox)
        if bbox is None:
            log.warning(
                f"Could not get FontBBox from font descriptor because {font_bbox!r} cannot be parsed as 4 floats"
            )
            return 0.0, 0.0, 0.0, 0.0
        return bbox

972 

973 

class PDFSimpleFont(PDFFont):
    """Font whose character ids are mapped through an encoding table."""

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        spec: Mapping[str, Any],
    ) -> None:
        """Set up ``cid2unicode`` and the optional ``unicode_map`` from ``spec``.

        :param descriptor: font descriptor handed on to ``PDFFont``
        :param widths: width table handed on to ``PDFFont``
        :param spec: font dictionary; its ``Encoding`` and ``ToUnicode``
            entries are read here
        """
        # The encoding is given either as the name of a built-in encoding or
        # as a dictionary describing the differences from a base encoding.
        # Without an Encoding entry the standard encoding is assumed.
        encoding = (
            resolve1(spec["Encoding"])
            if "Encoding" in spec
            else LITERAL_STANDARD_ENCODING
        )
        if not isinstance(encoding, dict):
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
        else:
            base_name = literal_name(
                encoding.get("BaseEncoding", LITERAL_STANDARD_ENCODING)
            )
            differences = list_value(encoding.get("Differences", []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)

        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            to_unicode = stream_value(spec["ToUnicode"])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, BytesIO(to_unicode.get_data())).run()
        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode text for ``cid``.

        The ``ToUnicode`` map, when present, is consulted first; the encoding
        table is the fallback.

        :raises PDFUnicodeNotDefined: if neither source knows ``cid``
        """
        if self.unicode_map:
            try:
                return self.unicode_map.get_unichr(cid)
            except KeyError:
                # Not covered by ToUnicode: fall through to the encoding table.
                pass
        try:
            return self.cid2unicode[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)

1011 

1012 

class PDFType1Font(PDFSimpleFont):
    """Simple font whose metrics come from ``FontMetricsDB`` or from its spec."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from its dictionary ``spec``.

        Metrics registered in ``FontMetricsDB`` under the base font name take
        precedence; otherwise the descriptor and widths are read from
        ``spec``.  When ``spec`` has no ``Encoding`` but the descriptor has a
        ``FontFile``, the encoding is recovered from the font file header.

        :raises PDFFontError: in strict mode, if ``BaseFont`` is missing
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"

        widths: FontWidthDict
        try:
            metrics = FontMetricsDB.get_metrics(self.basefont)
        except KeyError:
            # No registered metrics for this name: read them from the spec.
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            firstchar = int_value(spec.get("FirstChar", 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            width_list = list_value(spec.get("Widths", [0] * 256))
            widths = {}
            for index, raw_width in enumerate(width_list):
                widths[index + firstchar] = resolve1(raw_width)
        else:
            (descriptor, int_widths) = metrics
            # implicit int->float
            widths = cast(Dict[Union[str, int], float], int_widths)

        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        if "Encoding" in spec or "FontFile" not in descriptor:
            return
        # try to recover the missing encoding info from the font file.
        self.fontfile = stream_value(descriptor.get("FontFile"))
        length1 = int_value(self.fontfile["Length1"])
        header = self.fontfile.get_data()[:length1]
        self.cid2unicode = Type1FontHeaderParser(BytesIO(header)).get_encoding()

    def __repr__(self) -> str:
        return f"<PDFType1Font: basefont={self.basefont!r}>"

1045 

1046 

class PDFTrueTypeFont(PDFType1Font):
    """Same behaviour as ``PDFType1Font``; only the ``repr`` differs."""

    def __repr__(self) -> str:
        return f"<PDFTrueTypeFont: basefont={self.basefont!r}>"

1050 

1051 

class PDFType3Font(PDFSimpleFont):
    """Simple font whose scale factors are derived from its ``FontMatrix``."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Build the font from its dictionary ``spec``.

        Widths start at ``FirstChar``.  Without a ``FontDescriptor`` a
        minimal descriptor is synthesised from the spec's ``FontBBox``.
        Ascent and descent are then taken from the bounding box, and the
        horizontal/vertical scales from applying ``FontMatrix`` to ``(1, 1)``.
        """
        firstchar = int_value(spec.get("FirstChar", 0))
        # lastchar = int_value(spec.get('LastChar', 0))
        width_list = list_value(spec.get("Widths", [0] * 256))
        widths: Dict[Union[str, int], float] = {}
        for index, width in enumerate(width_list):
            widths[index + firstchar] = width

        descriptor = (
            dict_value(spec["FontDescriptor"])
            if "FontDescriptor" in spec
            else {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
        )
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        matrix_entries = list_value(spec.get("FontMatrix"))
        self.matrix = cast(Matrix, tuple(matrix_entries))
        # The bounding box overrides the descriptor's Descent / Ascent.
        (_x0, y0, _x1, y1) = self.bbox
        self.descent = y0
        self.ascent = y1
        (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))

    def __repr__(self) -> str:
        return "<PDFType3Font>"

1071 

1072 

class PDFCIDFont(PDFFont):
    """Multi-byte font whose character ids are decoded through a CMap."""

    default_disp: Union[float, Tuple[Optional[float], float]]

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        spec: Mapping[str, Any],
        strict: bool = settings.STRICT,
    ) -> None:
        """Build the font from its dictionary ``spec``.

        Sets up, in order: the base font name, the ``Registry-Ordering``
        coding string, the CMap, the unicode map (from ``ToUnicode``, from an
        embedded TrueType ``cmap`` table, or from ``CMapDB``) and finally the
        horizontal or vertical width tables.

        :param rsrcmgr: resource manager (not used by this constructor)
        :param spec: font dictionary
        :param strict: raise ``PDFFontError`` for missing ``BaseFont``,
            ``FontDescriptor`` or encoding instead of using defaults
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if strict:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"
        self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
        cid_registry = self._decode_cid_info(
            self.cidsysteminfo.get("Registry", b"unknown")
        )
        cid_ordering = self._decode_cid_info(
            self.cidsysteminfo.get("Ordering", b"unknown")
        )
        self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
        self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec["FontDescriptor"])
        except KeyError:
            if strict:
                raise PDFFontError("FontDescriptor is missing")
            descriptor = {}
        ttf = None
        if "FontFile2" in descriptor:
            self.fontfile = stream_value(descriptor.get("FontFile2"))
            ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))
        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            if isinstance(spec["ToUnicode"], PDFStream):
                strm = stream_value(spec["ToUnicode"])
                self.unicode_map = FileUnicodeMap()
                CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
            else:
                cmap_name = literal_name(spec["ToUnicode"])
                # In non-strict mode a font without an Encoding entry gets
                # this far (see _get_cmap_name), so do not index blindly.
                encoding = (
                    literal_name(spec["Encoding"]) if "Encoding" in spec else ""
                )
                if (
                    "Identity" in cid_ordering
                    or "Identity" in cmap_name
                    or "Identity" in encoding
                ):
                    self.unicode_map = IdentityUnicodeMap()
        elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
            if ttf:
                try:
                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding,
                    self.cmap.is_vertical(),
                )
            except CMapDB.CMapNotFound:
                pass

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            widths2 = get_widths2(list_value(spec.get("W2", [])))
            self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()}
            (vy, w) = resolve1(spec.get("DW2", [880, -1000]))
            self.default_disp = (None, vy)
            widths: Dict[Union[str, int], float] = {
                cid: w for (cid, (w, _)) in widths2.items()
            }
            default_width = w
        else:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get("W", [])))
            default_width = spec.get("DW", 1000)
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    @staticmethod
    def _decode_cid_info(value: object) -> str:
        """Turn a ``CIDSystemInfo`` entry into text without assuming it is bytes."""
        value = resolve1(value)
        if isinstance(value, bytes):
            return value.decode("latin1")
        if isinstance(value, PSLiteral):
            return literal_name(value)
        # Anything else would have failed on .decode(); use its text form.
        return str(value)

    def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.

        :raises PDFFontError: in strict mode, if the cmap cannot be found
        """
        cmap_name = self._get_cmap_name(spec, strict)

        try:
            return CMapDB.get_cmap(cmap_name)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e)
            # Non-strict: continue with an empty CMap.
            return CMap()

    @staticmethod
    def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
        """Get cmap name from font specification

        Returns ``"unknown"`` when no encoding is given and ``strict`` is
        false.  Names listed in ``IDENTITY_ENCODER`` are translated.
        """
        cmap_name = "unknown"  # default value

        try:
            spec_encoding = spec["Encoding"]
            if hasattr(spec_encoding, "name"):
                cmap_name = literal_name(spec["Encoding"])
            else:
                cmap_name = literal_name(spec_encoding["CMapName"])
        except KeyError:
            if strict:
                raise PDFFontError("Encoding is unspecified")

        if type(cmap_name) is PDFStream:  # type: ignore[comparison-overlap]
            cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
            if "CMapName" in cmap_name_stream:
                cmap_name = cmap_name_stream.get("CMapName").name
            elif strict:
                raise PDFFontError("CMapName unspecified for encoding")

        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

    def __repr__(self) -> str:
        return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"

    def is_vertical(self) -> bool:
        """Return the writing mode reported by the CMap at construction time."""
        return self.vertical

    def is_multibyte(self) -> bool:
        """Return whether character codes span several bytes; always True here."""
        return True

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split a byte string into character ids using the CMap."""
        return self.cmap.decode(bytes)

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode text for ``cid``.

        :raises PDFUnicodeNotDefined: if there is no unicode map or it does
            not know ``cid``
        """
        try:
            if not self.unicode_map:
                raise PDFKeyError(cid)
            return self.unicode_map.get_unichr(cid)
        except KeyError:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)