Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdffont.py: 59%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

516 statements  

1import logging 

2import struct 

3from io import BytesIO 

4from typing import ( 

5 TYPE_CHECKING, 

6 Any, 

7 BinaryIO, 

8 Dict, 

9 Iterable, 

10 Iterator, 

11 List, 

12 Mapping, 

13 Optional, 

14 Tuple, 

15 Union, 

16 cast, 

17) 

18 

19from pdfminer import settings 

20from pdfminer.cmapdb import ( 

21 CMap, 

22 CMapBase, 

23 CMapDB, 

24 CMapParser, 

25 FileUnicodeMap, 

26 IdentityUnicodeMap, 

27 UnicodeMap, 

28) 

29from pdfminer.encodingdb import EncodingDB, name2unicode 

30from pdfminer.fontmetrics import FONT_METRICS 

31from pdfminer.pdfexceptions import PDFException, PDFKeyError, PDFValueError 

32from pdfminer.pdftypes import ( 

33 PDFStream, 

34 dict_value, 

35 int_value, 

36 list_value, 

37 num_value, 

38 resolve1, 

39 resolve_all, 

40 stream_value, 

41) 

42from pdfminer.psexceptions import PSEOF 

43from pdfminer.psparser import ( 

44 KWD, 

45 LIT, 

46 PSKeyword, 

47 PSLiteral, 

48 PSStackParser, 

49 literal_name, 

50) 

51from pdfminer.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack 

52 

53if TYPE_CHECKING: 

54 from pdfminer.pdfinterp import PDFResourceManager 

55 

56log = logging.getLogger(__name__) 

57 

58 

def get_widths(seq: Iterable[object]) -> Dict[int, float]:
    """Build a mapping of character widths for horizontal writing.

    ``seq`` is scanned left to right and understood in two shapes:

    * ``c [w1 w2 ...]`` - the widths in the list are assigned to consecutive
      character ids starting at ``c`` (the last number seen before the list);
    * ``c_first c_last w`` - every id in the inclusive range gets width ``w``.

    Items that are neither lists nor numbers are ignored.
    """
    width_by_cid: Dict[int, float] = {}
    pending: List[float] = []
    for item in seq:
        if isinstance(item, list):
            if not pending:
                # A list with no preceding start id carries no information.
                continue
            start = pending[-1]
            for offset, width in enumerate(item):
                width_by_cid[cast(int, start) + offset] = width
            pending = []
            continue
        if not isinstance(item, (int, float)):  # == utils.isnumber(item)
            continue
        pending.append(item)
        if len(pending) != 3:
            continue
        (first, last, width) = pending
        for cid in range(cast(int, first), cast(int, last) + 1):
            width_by_cid[cid] = width
        pending = []
    return width_by_cid

78 

79 

def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    ``seq`` is scanned left to right and understood in two shapes:

    * ``c [w vx vy  w vx vy ...]`` - the list is consumed three values at a
      time (via ``choplist``) and assigned to consecutive ids starting at
      ``c`` (the last number seen before the list);
    * ``c_first c_last w vx vy`` - every id in the inclusive range gets the
      same ``(w, (vx, vy))`` entry.

    Items that are neither lists nor numbers are ignored.
    """
    metrics_by_cid: Dict[int, Tuple[float, Point]] = {}
    pending: List[float] = []
    for item in seq:
        if isinstance(item, list):
            if not pending:
                continue
            start = pending[-1]
            for offset, (width, vx, vy) in enumerate(choplist(3, item)):
                metrics_by_cid[cast(int, start) + offset] = (width, (vx, vy))
            pending = []
            continue
        if not isinstance(item, (int, float)):  # == utils.isnumber(item)
            continue
        pending.append(item)
        if len(pending) != 5:
            continue
        (first, last, width, vx, vy) = pending
        for cid in range(cast(int, first), cast(int, last) + 1):
            metrics_by_cid[cid] = (width, (vx, vy))
        pending = []
    return metrics_by_cid

99 

100 

class FontMetricsDB:
    """Lookup facade over the module-level ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname: str) -> Tuple[Dict[str, object], Dict[str, int]]:
        """Return the ``FONT_METRICS`` entry stored under ``fontname``.

        Whatever the table raises for an unknown name is propagated unchanged.
        """
        entry = FONT_METRICS[fontname]
        return entry

105 

106 

107# int here means that we're not extending PSStackParser with additional types. 

class Type1FontHeaderParser(PSStackParser[int]):
    """Stack parser that collects ``<code> <name> put`` pairs from a Type1 header."""

    KEYWORD_BEGIN = KWD(b"begin")
    KEYWORD_END = KWD(b"end")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_PUT = KWD(b"put")
    KEYWORD_DICT = KWD(b"dict")
    KEYWORD_ARRAY = KWD(b"array")
    KEYWORD_READONLY = KWD(b"readonly")
    KEYWORD_FOR = KWD(b"for")

    def __init__(self, data: BinaryIO) -> None:
        PSStackParser.__init__(self, data)
        # Filled in by get_encoding(): character id -> unicode string.
        self._cid2unicode: Dict[int, str] = {}

    def get_encoding(self) -> Dict[int, str]:
        """Parse the font encoding.

        The Type1 font encoding maps character codes to character names. These
        character names could either be standard Adobe glyph names, or
        character names associated with custom CharStrings for this font. A
        CharString is a sequence of operations that describe how the character
        should be drawn. Names that ``name2unicode`` rejects with ``KeyError``
        are logged at debug level and left out of the result.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode characters
        """
        while True:
            try:
                (cid, glyph_name) = self.nextobject()
            except PSEOF:
                # End of the header data: everything parsable has been seen.
                break
            try:
                self._cid2unicode[cid] = name2unicode(cast(str, glyph_name))
            except KeyError as err:
                log.debug(str(err))
        return self._cid2unicode

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Emit a ``(code, name)`` result for each well-formed ``put``."""
        if token is not self.KEYWORD_PUT:
            return
        ((_, key), (_, value)) = self.pop(2)
        if isinstance(key, int) and isinstance(value, PSLiteral):
            self.add_results((key, literal_name(value)))

152 

153 

# Text for each nibble of a CFF DICT real-number operand. Index 13 is
# reserved (None); nibble 15 terminates the number and is handled by the
# parser, so it has no entry here.
NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-")

# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
    "DLIdent-H": "Identity-H",
    "DLIdent-V": "Identity-V",
}


def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]:
    """Decode CFF DICT data into ``{operator: [operands...]}``.

    Operands are pushed on a stack until an operator byte (0-21) is met; the
    operator then takes the whole stack as its operand list.

    * One-byte operators are stored under their byte value.
    * Two-byte operators (escape byte 12 followed by a second byte ``b1``)
      are stored under ``(12 << 8) | b1`` so that the second byte is never
      misread as an operand or as another operator.
    * Parsing is best effort: if the data ends in the middle of an operand
      or of a two-byte operator, decoding stops and the entries completed so
      far are returned. Operands without a following operator are dropped.

    :param data: raw DICT bytes.
    :returns: mapping of operator key to its operand list.
    :raises ValueError: if a real-number operand contains the reserved
        nibble or does not form a valid float literal.
    """
    d: Dict[int, List[Union[float, int]]] = {}
    fp = BytesIO(data)
    stack: List[Union[float, int]] = []

    def read_exact(size: int) -> bytes:
        # Short reads mean truncated data; signal it so the loop can stop.
        chunk = fp.read(size)
        if len(chunk) != size:
            raise EOFError
        return chunk

    try:
        while True:
            c = fp.read(1)
            if not c:
                break
            b0 = c[0]
            if b0 <= 21:
                key = b0
                if b0 == 12:
                    # Escape: the real operator is identified by the next byte.
                    key = (12 << 8) | read_exact(1)[0]
                d[key] = stack
                stack = []
                continue
            value: Union[float, int]
            if b0 == 30:
                # Real number: packed nibbles, terminated by nibble 0xf.
                digits: List[str] = []
                done = False
                while not done:
                    b = read_exact(1)[0]
                    for n in (b >> 4, b & 15):
                        if n == 15:
                            done = True
                        else:
                            nibble = NIBBLES[n]
                            if nibble is None:
                                raise ValueError(
                                    "reserved nibble in CFF real number operand",
                                )
                            digits.append(nibble)
                value = float("".join(digits))
            elif 32 <= b0 <= 246:
                value = b0 - 139
            else:
                b1 = read_exact(1)[0]
                if 247 <= b0 <= 250:
                    value = ((b0 - 247) << 8) + b1 + 108
                elif 251 <= b0 <= 254:
                    value = -((b0 - 251) << 8) - b1 - 108
                else:
                    b2 = read_exact(1)[0]
                    if b1 >= 128:
                        # Leading byte carries the sign of the integer.
                        b1 -= 256
                    if b0 == 28:
                        value = b1 << 8 | b2
                    else:
                        (low,) = struct.unpack(">H", read_exact(2))
                        value = b1 << 24 | b2 << 16 | low
            stack.append(value)
    except EOFError:
        # Truncated input: keep what was decoded before the cut-off.
        pass
    return d

208 

209 

210class CFFFont: 

211 STANDARD_STRINGS = ( 

212 ".notdef", 

213 "space", 

214 "exclam", 

215 "quotedbl", 

216 "numbersign", 

217 "dollar", 

218 "percent", 

219 "ampersand", 

220 "quoteright", 

221 "parenleft", 

222 "parenright", 

223 "asterisk", 

224 "plus", 

225 "comma", 

226 "hyphen", 

227 "period", 

228 "slash", 

229 "zero", 

230 "one", 

231 "two", 

232 "three", 

233 "four", 

234 "five", 

235 "six", 

236 "seven", 

237 "eight", 

238 "nine", 

239 "colon", 

240 "semicolon", 

241 "less", 

242 "equal", 

243 "greater", 

244 "question", 

245 "at", 

246 "A", 

247 "B", 

248 "C", 

249 "D", 

250 "E", 

251 "F", 

252 "G", 

253 "H", 

254 "I", 

255 "J", 

256 "K", 

257 "L", 

258 "M", 

259 "N", 

260 "O", 

261 "P", 

262 "Q", 

263 "R", 

264 "S", 

265 "T", 

266 "U", 

267 "V", 

268 "W", 

269 "X", 

270 "Y", 

271 "Z", 

272 "bracketleft", 

273 "backslash", 

274 "bracketright", 

275 "asciicircum", 

276 "underscore", 

277 "quoteleft", 

278 "a", 

279 "b", 

280 "c", 

281 "d", 

282 "e", 

283 "f", 

284 "g", 

285 "h", 

286 "i", 

287 "j", 

288 "k", 

289 "l", 

290 "m", 

291 "n", 

292 "o", 

293 "p", 

294 "q", 

295 "r", 

296 "s", 

297 "t", 

298 "u", 

299 "v", 

300 "w", 

301 "x", 

302 "y", 

303 "z", 

304 "braceleft", 

305 "bar", 

306 "braceright", 

307 "asciitilde", 

308 "exclamdown", 

309 "cent", 

310 "sterling", 

311 "fraction", 

312 "yen", 

313 "florin", 

314 "section", 

315 "currency", 

316 "quotesingle", 

317 "quotedblleft", 

318 "guillemotleft", 

319 "guilsinglleft", 

320 "guilsinglright", 

321 "fi", 

322 "fl", 

323 "endash", 

324 "dagger", 

325 "daggerdbl", 

326 "periodcentered", 

327 "paragraph", 

328 "bullet", 

329 "quotesinglbase", 

330 "quotedblbase", 

331 "quotedblright", 

332 "guillemotright", 

333 "ellipsis", 

334 "perthousand", 

335 "questiondown", 

336 "grave", 

337 "acute", 

338 "circumflex", 

339 "tilde", 

340 "macron", 

341 "breve", 

342 "dotaccent", 

343 "dieresis", 

344 "ring", 

345 "cedilla", 

346 "hungarumlaut", 

347 "ogonek", 

348 "caron", 

349 "emdash", 

350 "AE", 

351 "ordfeminine", 

352 "Lslash", 

353 "Oslash", 

354 "OE", 

355 "ordmasculine", 

356 "ae", 

357 "dotlessi", 

358 "lslash", 

359 "oslash", 

360 "oe", 

361 "germandbls", 

362 "onesuperior", 

363 "logicalnot", 

364 "mu", 

365 "trademark", 

366 "Eth", 

367 "onehalf", 

368 "plusminus", 

369 "Thorn", 

370 "onequarter", 

371 "divide", 

372 "brokenbar", 

373 "degree", 

374 "thorn", 

375 "threequarters", 

376 "twosuperior", 

377 "registered", 

378 "minus", 

379 "eth", 

380 "multiply", 

381 "threesuperior", 

382 "copyright", 

383 "Aacute", 

384 "Acircumflex", 

385 "Adieresis", 

386 "Agrave", 

387 "Aring", 

388 "Atilde", 

389 "Ccedilla", 

390 "Eacute", 

391 "Ecircumflex", 

392 "Edieresis", 

393 "Egrave", 

394 "Iacute", 

395 "Icircumflex", 

396 "Idieresis", 

397 "Igrave", 

398 "Ntilde", 

399 "Oacute", 

400 "Ocircumflex", 

401 "Odieresis", 

402 "Ograve", 

403 "Otilde", 

404 "Scaron", 

405 "Uacute", 

406 "Ucircumflex", 

407 "Udieresis", 

408 "Ugrave", 

409 "Yacute", 

410 "Ydieresis", 

411 "Zcaron", 

412 "aacute", 

413 "acircumflex", 

414 "adieresis", 

415 "agrave", 

416 "aring", 

417 "atilde", 

418 "ccedilla", 

419 "eacute", 

420 "ecircumflex", 

421 "edieresis", 

422 "egrave", 

423 "iacute", 

424 "icircumflex", 

425 "idieresis", 

426 "igrave", 

427 "ntilde", 

428 "oacute", 

429 "ocircumflex", 

430 "odieresis", 

431 "ograve", 

432 "otilde", 

433 "scaron", 

434 "uacute", 

435 "ucircumflex", 

436 "udieresis", 

437 "ugrave", 

438 "yacute", 

439 "ydieresis", 

440 "zcaron", 

441 "exclamsmall", 

442 "Hungarumlautsmall", 

443 "dollaroldstyle", 

444 "dollarsuperior", 

445 "ampersandsmall", 

446 "Acutesmall", 

447 "parenleftsuperior", 

448 "parenrightsuperior", 

449 "twodotenleader", 

450 "onedotenleader", 

451 "zerooldstyle", 

452 "oneoldstyle", 

453 "twooldstyle", 

454 "threeoldstyle", 

455 "fouroldstyle", 

456 "fiveoldstyle", 

457 "sixoldstyle", 

458 "sevenoldstyle", 

459 "eightoldstyle", 

460 "nineoldstyle", 

461 "commasuperior", 

462 "threequartersemdash", 

463 "periodsuperior", 

464 "questionsmall", 

465 "asuperior", 

466 "bsuperior", 

467 "centsuperior", 

468 "dsuperior", 

469 "esuperior", 

470 "isuperior", 

471 "lsuperior", 

472 "msuperior", 

473 "nsuperior", 

474 "osuperior", 

475 "rsuperior", 

476 "ssuperior", 

477 "tsuperior", 

478 "ff", 

479 "ffi", 

480 "ffl", 

481 "parenleftinferior", 

482 "parenrightinferior", 

483 "Circumflexsmall", 

484 "hyphensuperior", 

485 "Gravesmall", 

486 "Asmall", 

487 "Bsmall", 

488 "Csmall", 

489 "Dsmall", 

490 "Esmall", 

491 "Fsmall", 

492 "Gsmall", 

493 "Hsmall", 

494 "Ismall", 

495 "Jsmall", 

496 "Ksmall", 

497 "Lsmall", 

498 "Msmall", 

499 "Nsmall", 

500 "Osmall", 

501 "Psmall", 

502 "Qsmall", 

503 "Rsmall", 

504 "Ssmall", 

505 "Tsmall", 

506 "Usmall", 

507 "Vsmall", 

508 "Wsmall", 

509 "Xsmall", 

510 "Ysmall", 

511 "Zsmall", 

512 "colonmonetary", 

513 "onefitted", 

514 "rupiah", 

515 "Tildesmall", 

516 "exclamdownsmall", 

517 "centoldstyle", 

518 "Lslashsmall", 

519 "Scaronsmall", 

520 "Zcaronsmall", 

521 "Dieresissmall", 

522 "Brevesmall", 

523 "Caronsmall", 

524 "Dotaccentsmall", 

525 "Macronsmall", 

526 "figuredash", 

527 "hypheninferior", 

528 "Ogoneksmall", 

529 "Ringsmall", 

530 "Cedillasmall", 

531 "questiondownsmall", 

532 "oneeighth", 

533 "threeeighths", 

534 "fiveeighths", 

535 "seveneighths", 

536 "onethird", 

537 "twothirds", 

538 "zerosuperior", 

539 "foursuperior", 

540 "fivesuperior", 

541 "sixsuperior", 

542 "sevensuperior", 

543 "eightsuperior", 

544 "ninesuperior", 

545 "zeroinferior", 

546 "oneinferior", 

547 "twoinferior", 

548 "threeinferior", 

549 "fourinferior", 

550 "fiveinferior", 

551 "sixinferior", 

552 "seveninferior", 

553 "eightinferior", 

554 "nineinferior", 

555 "centinferior", 

556 "dollarinferior", 

557 "periodinferior", 

558 "commainferior", 

559 "Agravesmall", 

560 "Aacutesmall", 

561 "Acircumflexsmall", 

562 "Atildesmall", 

563 "Adieresissmall", 

564 "Aringsmall", 

565 "AEsmall", 

566 "Ccedillasmall", 

567 "Egravesmall", 

568 "Eacutesmall", 

569 "Ecircumflexsmall", 

570 "Edieresissmall", 

571 "Igravesmall", 

572 "Iacutesmall", 

573 "Icircumflexsmall", 

574 "Idieresissmall", 

575 "Ethsmall", 

576 "Ntildesmall", 

577 "Ogravesmall", 

578 "Oacutesmall", 

579 "Ocircumflexsmall", 

580 "Otildesmall", 

581 "Odieresissmall", 

582 "OEsmall", 

583 "Oslashsmall", 

584 "Ugravesmall", 

585 "Uacutesmall", 

586 "Ucircumflexsmall", 

587 "Udieresissmall", 

588 "Yacutesmall", 

589 "Thornsmall", 

590 "Ydieresissmall", 

591 "001.000", 

592 "001.001", 

593 "001.002", 

594 "001.003", 

595 "Black", 

596 "Bold", 

597 "Book", 

598 "Light", 

599 "Medium", 

600 "Regular", 

601 "Roman", 

602 "Semibold", 

603 ) 

604 

605 class INDEX: 

606 def __init__(self, fp: BinaryIO) -> None: 

607 self.fp = fp 

608 self.offsets: List[int] = [] 

609 (count, offsize) = struct.unpack(">HB", self.fp.read(3)) 

610 for i in range(count + 1): 

611 self.offsets.append(nunpack(self.fp.read(offsize))) 

612 self.base = self.fp.tell() - 1 

613 self.fp.seek(self.base + self.offsets[-1]) 

614 

615 def __repr__(self) -> str: 

616 return "<INDEX: size=%d>" % len(self) 

617 

618 def __len__(self) -> int: 

619 return len(self.offsets) - 1 

620 

621 def __getitem__(self, i: int) -> bytes: 

622 self.fp.seek(self.base + self.offsets[i]) 

623 return self.fp.read(self.offsets[i + 1] - self.offsets[i]) 

624 

625 def __iter__(self) -> Iterator[bytes]: 

626 return iter(self[i] for i in range(len(self))) 

627 

628 def __init__(self, name: str, fp: BinaryIO) -> None: 

629 self.name = name 

630 self.fp = fp 

631 # Header 

632 (_major, _minor, hdrsize, offsize) = struct.unpack("BBBB", self.fp.read(4)) 

633 self.fp.read(hdrsize - 4) 

634 # Name INDEX 

635 self.name_index = self.INDEX(self.fp) 

636 # Top DICT INDEX 

637 self.dict_index = self.INDEX(self.fp) 

638 # String INDEX 

639 self.string_index = self.INDEX(self.fp) 

640 # Global Subr INDEX 

641 self.subr_index = self.INDEX(self.fp) 

642 # Top DICT DATA 

643 self.top_dict = getdict(self.dict_index[0]) 

644 (charset_pos,) = self.top_dict.get(15, [0]) 

645 (encoding_pos,) = self.top_dict.get(16, [0]) 

646 (charstring_pos,) = self.top_dict.get(17, [0]) 

647 # CharStrings 

648 self.fp.seek(cast(int, charstring_pos)) 

649 self.charstring = self.INDEX(self.fp) 

650 self.nglyphs = len(self.charstring) 

651 # Encodings 

652 self.code2gid = {} 

653 self.gid2code = {} 

654 self.fp.seek(cast(int, encoding_pos)) 

655 format = self.fp.read(1) 

656 if format == b"\x00": 

657 # Format 0 

658 (n,) = struct.unpack("B", self.fp.read(1)) 

659 for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))): 

660 self.code2gid[code] = gid 

661 self.gid2code[gid] = code 

662 elif format == b"\x01": 

663 # Format 1 

664 (n,) = struct.unpack("B", self.fp.read(1)) 

665 code = 0 

666 for i in range(n): 

667 (first, nleft) = struct.unpack("BB", self.fp.read(2)) 

668 for gid in range(first, first + nleft + 1): 

669 self.code2gid[code] = gid 

670 self.gid2code[gid] = code 

671 code += 1 

672 else: 

673 raise PDFValueError("unsupported encoding format: %r" % format) 

674 # Charsets 

675 self.name2gid = {} 

676 self.gid2name = {} 

677 self.fp.seek(cast(int, charset_pos)) 

678 format = self.fp.read(1) 

679 if format == b"\x00": 

680 # Format 0 

681 n = self.nglyphs - 1 

682 for gid, sid in enumerate( 

683 cast( 

684 Tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n)) 

685 ), 

686 ): 

687 gid += 1 

688 sidname = self.getstr(sid) 

689 self.name2gid[sidname] = gid 

690 self.gid2name[gid] = sidname 

691 elif format == b"\x01": 

692 # Format 1 

693 (n,) = struct.unpack("B", self.fp.read(1)) 

694 sid = 0 

695 for i in range(n): 

696 (first, nleft) = struct.unpack("BB", self.fp.read(2)) 

697 for gid in range(first, first + nleft + 1): 

698 sidname = self.getstr(sid) 

699 self.name2gid[sidname] = gid 

700 self.gid2name[gid] = sidname 

701 sid += 1 

702 elif format == b"\x02": 

703 # Format 2 

704 assert False, str(("Unhandled", format)) 

705 else: 

706 raise PDFValueError("unsupported charset format: %r" % format) 

707 

708 def getstr(self, sid: int) -> Union[str, bytes]: 

709 # This returns str for one of the STANDARD_STRINGS but bytes otherwise, 

710 # and appears to be a needless source of type complexity. 

711 if sid < len(self.STANDARD_STRINGS): 

712 return self.STANDARD_STRINGS[sid] 

713 return self.string_index[sid - len(self.STANDARD_STRINGS)] 

714 

715 

class TrueTypeFont:
    """Reader for the table directory and ``cmap`` table of a TrueType font.

    Only what is needed to derive a glyph-id to character mapping is parsed.
    """

    class CMapNotFound(PDFException):
        """Raised when the font yields no usable Unicode character map."""

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Read the table directory from ``fp``.

        :param name: font name, stored as-is.
        :param fp: seekable binary stream positioned at the start of the font.
        """
        self.name = name
        self.fp = fp
        # Table tag -> (offset, length), as listed in the table directory.
        self.tables: Dict[bytes, Tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = cast(
                Tuple[int, int, int, int],
                struct.unpack(">HHHH", fp.read(8)),
            )
            for _ in range(ntables):
                (name_bytes, _tsum, offset, length) = cast(
                    Tuple[bytes, int, int, int],
                    struct.unpack(">4sLLL", fp.read(16)),
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a glyph-id to character map from the font's ``cmap`` table.

        Only Unicode subtables (platform 0, or platform 3 with encoding 1 or
        10) in format 0, 2 or 4 are read. Subtables in any other format are
        skipped, so one unsupported subtable does not hide a usable one.

        :raises TrueTypeFont.CMapNotFound: if there is no ``cmap`` table or
            no supported subtable produced any mapping.
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, _length) = self.tables[b"cmap"]
        fp = self.fp
        fp.seek(base_offset)
        (_version, nsubtables) = cast(
            Tuple[int, int],
            struct.unpack(">HH", fp.read(4)),
        )
        subtables: List[Tuple[int, int, int]] = []
        for _ in range(nsubtables):
            subtables.append(
                cast(Tuple[int, int, int], struct.unpack(">HHL", fp.read(8))),
            )
        char2gid: Dict[int, int] = {}
        # Only supports subtable type 0, 2 and 4.
        for platform_id, encoding_id, st_offset in subtables:
            # Skip non-Unicode cmaps.
            # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
            if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
                continue
            fp.seek(base_offset + st_offset)
            (fmttype, _fmtlen, _fmtlang) = cast(
                Tuple[int, int, int],
                struct.unpack(">HHH", fp.read(6)),
            )
            if fmttype == 0:
                char2gid.update(
                    enumerate(
                        cast(Tuple[int, ...], struct.unpack(">256B", fp.read(256))),
                    ),
                )
            elif fmttype == 2:
                subheaderkeys = cast(
                    Tuple[int, ...],
                    struct.unpack(">256H", fp.read(512)),
                )
                firstbytes = [0] * 8192
                for i, k in enumerate(subheaderkeys):
                    firstbytes[k // 8] = i
                nhdrs = max(subheaderkeys) // 8 + 1
                hdrs: List[Tuple[int, int, int, int, int]] = []
                for i in range(nhdrs):
                    (firstcode, entcount, delta, offset) = cast(
                        Tuple[int, int, int, int],
                        struct.unpack(">HHhH", fp.read(8)),
                    )
                    # The range offset counts from its own field, which is
                    # the last two bytes just read.
                    hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
                for i, firstcode, entcount, delta, pos in hdrs:
                    if not entcount:
                        continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        gid = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                        if gid:
                            gid += delta
                        char2gid[first + c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = cast(
                    Tuple[int, int, int, int],
                    struct.unpack(">HHHH", fp.read(8)),
                )
                segcount //= 2
                ecs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                fp.read(2)  # reservedPad
                scs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                idds = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dh" % segcount, fp.read(2 * segcount)),
                )
                pos = fp.tell()
                idrs = cast(
                    Tuple[int, ...],
                    struct.unpack(">%dH" % segcount, fp.read(2 * segcount)),
                )
                for seg, (ec, sc, idd, idr) in enumerate(zip(ecs, scs, idds, idrs)):
                    if idr:
                        # idRangeOffset counts from the location of this
                        # segment's own idRangeOffset entry (pos + 2 * seg),
                        # not from the start of the array.
                        fp.seek(pos + 2 * seg + idr)
                        for c in range(sc, ec + 1):
                            b = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0]
                            # A stored 0 means "missing glyph"; idDelta only
                            # applies to non-zero glyph indices.
                            char2gid[c] = (b + idd) & 0xFFFF if b else 0
                    else:
                        for c in range(sc, ec + 1):
                            char2gid[c] = (c + idd) & 0xFFFF
            else:
                # Other formats (6, 12, ...) are not implemented. Skip the
                # subtable rather than abort: another subtable may be usable.
                log.debug("Skipping unsupported cmap subtable format %r", fmttype)
                continue
        if not char2gid:
            raise TrueTypeFont.CMapNotFound
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map

840 

841 

class PDFFontError(PDFException):
    """Base class for the font-related errors raised in this module."""

844 

845 

class PDFUnicodeNotDefined(PDFFontError):
    """Raised by ``to_unichr`` when a character id has no unicode mapping."""

848 

849 

# Encoding name used by PDFSimpleFont when a font spec has no "Encoding"
# entry, and as the default "BaseEncoding" of an encoding dictionary.
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
# Not referenced in the visible part of this module.
LITERAL_TYPE1C = LIT("Type1C")

# Font widths are maintained in a dict type that maps from *either* unicode
# chars or integer character IDs.
FontWidthDict = Union[Dict[int, float], Dict[str, float]]

856 

857 

class PDFFont:
    """Common state and metrics shared by all font kinds in this module.

    Subclasses are expected to provide ``to_unichr``; the base version raises
    ``NotImplementedError``.
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: Optional[float] = None,
    ) -> None:
        """Populate metrics from a font descriptor.

        :param descriptor: font descriptor mapping; missing entries fall
            back to the defaults visible below.
        :param widths: per-character widths keyed by character id or by
            unicode string.
        :param default_width: width for characters absent from ``widths``;
            when ``None`` the descriptor's ``MissingWidth`` (default 0) is used.
        """
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)

        fontname = resolve1(descriptor.get("FontName", "unknown"))
        self.fontname = (
            literal_name(fontname) if isinstance(fontname, PSLiteral) else fontname
        )

        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))

        if default_width is None:
            default_width = num_value(descriptor.get("MissingWidth", 0))
        self.default_width = resolve1(default_width)

        self.leading = num_value(descriptor.get("Leading", 0))
        raw_bbox = descriptor.get("FontBBox", (0, 0, 0, 0))
        self.bbox = cast(Rect, list_value(resolve_all(raw_bbox)))
        self.hscale = self.vscale = 0.001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        """Base fonts report horizontal writing."""
        return False

    def is_multibyte(self) -> bool:
        """Base fonts report single-byte character codes."""
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split a byte string into character ids, one id per byte."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self) -> float:
        """Scaled bounding-box width; a zero-width box yields the negated default width."""
        box_width = self.bbox[2] - self.bbox[0]
        if box_width == 0:
            box_width = -self.default_width
        return box_width * self.hscale

    def get_height(self) -> float:
        """Scaled bounding-box height; a zero-height box yields ascent minus descent."""
        box_height = self.bbox[3] - self.bbox[1]
        if box_height == 0:
            box_height = self.ascent - self.descent
        return box_height * self.vscale

    def char_width(self, cid: int) -> float:
        """Scaled width of character ``cid``, or the scaled default width."""
        # Because character widths may be mapping either IDs or strings,
        # we try to lookup the character ID first, then its str equivalent.
        try:
            by_cid = cast(Dict[int, float], self.widths)
            return by_cid[cid] * self.hscale
        except KeyError:
            by_char = cast(Dict[str, float], self.widths)
            try:
                return by_char[self.to_unichr(cid)] * self.hscale
            except (KeyError, PDFUnicodeNotDefined):
                return self.default_width * self.hscale

    def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Sum of ``char_width`` over every character id decoded from ``s``."""
        return sum(map(self.char_width, self.decode(s)))

    def to_unichr(self, cid: int) -> str:
        """Map a character id to unicode text; must be overridden."""
        raise NotImplementedError

946 

947 

class PDFSimpleFont(PDFFont):
    """Font whose character ids are mapped to unicode through an encoding
    table, optionally overridden by a ``ToUnicode`` CMap stream."""

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        spec: Mapping[str, Any],
    ) -> None:
        """Set up ``cid2unicode`` and ``unicode_map`` from ``spec``, then
        initialise the ``PDFFont`` base with ``descriptor`` and ``widths``."""
        # Font encoding is specified either by a name of
        # built-in encoding or a dictionary that describes
        # the differences.
        encoding = (
            resolve1(spec["Encoding"])
            if "Encoding" in spec
            else LITERAL_STANDARD_ENCODING
        )
        if isinstance(encoding, dict):
            base_name = literal_name(
                encoding.get("BaseEncoding", LITERAL_STANDARD_ENCODING),
            )
            differences = list_value(encoding.get("Differences", []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)
        else:
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))

        self.unicode_map: Optional[UnicodeMap] = None
        if "ToUnicode" in spec:
            to_unicode_stream = stream_value(spec["ToUnicode"])
            self.unicode_map = FileUnicodeMap()
            cmap_source = BytesIO(to_unicode_stream.get_data())
            CMapParser(self.unicode_map, cmap_source).run()
        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode text for ``cid``.

        The ``ToUnicode`` map, when present, is consulted first; the
        encoding table is the fallback.

        :raises PDFUnicodeNotDefined: if neither source knows ``cid``.
        """
        mapping = self.unicode_map
        if mapping:
            try:
                return mapping.get_unichr(cid)
            except KeyError:
                # Not covered by ToUnicode: fall through to the encoding.
                pass
        try:
            return self.cid2unicode[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)

985 

986 

class PDFType1Font(PDFSimpleFont):
    """Simple font that takes its metrics from the built-in metrics table
    when the base font name is known there, and from the spec otherwise."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Initialise from the font dictionary ``spec``.

        :raises PDFFontError: if ``BaseFont`` is missing and
            ``settings.STRICT`` is set.
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"

        widths: FontWidthDict
        try:
            (descriptor, builtin_widths) = FontMetricsDB.get_metrics(self.basefont)
            widths = cast(Dict[str, float], builtin_widths)  # implicit int->float
        except KeyError:
            # No built-in metrics: read descriptor and widths from the spec.
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            firstchar = int_value(spec.get("FirstChar", 0))
            width_list = list_value(spec.get("Widths", [0] * 256))
            widths = {
                offset + firstchar: resolve1(raw_width)
                for (offset, raw_width) in enumerate(width_list)
            }
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        has_embedded_file = "FontFile" in descriptor
        if "Encoding" not in spec and has_embedded_file:
            # try to recover the missing encoding info from the font file.
            self.fontfile = stream_value(descriptor.get("FontFile"))
            length1 = int_value(self.fontfile["Length1"])
            header = self.fontfile.get_data()[:length1]
            self.cid2unicode = Type1FontHeaderParser(BytesIO(header)).get_encoding()

    def __repr__(self) -> str:
        return "<PDFType1Font: basefont=%r>" % self.basefont

1017 

1018 

class PDFTrueTypeFont(PDFType1Font):
    """Same behaviour as ``PDFType1Font``; only the ``repr`` text differs."""

    def __repr__(self) -> str:
        basefont = self.basefont
        return "<PDFTrueTypeFont: basefont=%r>" % basefont

1022 

1023 

class PDFType3Font(PDFSimpleFont):
    """Simple font whose scale and vertical metrics come from the font
    dictionary's ``FontMatrix`` and bounding box."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Initialise from the font dictionary ``spec``.

        When ``spec`` has no ``FontDescriptor`` a minimal descriptor is
        synthesised from ``spec["FontBBox"]``.
        """
        firstchar = int_value(spec.get("FirstChar", 0))
        width_list = list_value(spec.get("Widths", [0] * 256))
        widths = {}
        for offset, width in enumerate(width_list):
            widths[offset + firstchar] = width

        descriptor = (
            dict_value(spec["FontDescriptor"])
            if "FontDescriptor" in spec
            else {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
        )
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        matrix_values = list_value(spec.get("FontMatrix"))
        self.matrix = cast(Matrix, tuple(matrix_values))
        # Descent and ascent are taken from the bounding box, replacing the
        # values derived from the descriptor.
        (_, self.descent, _, self.ascent) = self.bbox
        scale = apply_matrix_norm(self.matrix, (1, 1))
        (self.hscale, self.vscale) = scale

    def __repr__(self) -> str:
        return "<PDFType3Font>"

1041 

1042 

1043class PDFCIDFont(PDFFont): 

1044 default_disp: Union[float, Tuple[Optional[float], float]] 

1045 

1046 def __init__( 

1047 self, 

1048 rsrcmgr: "PDFResourceManager", 

1049 spec: Mapping[str, Any], 

1050 strict: bool = settings.STRICT, 

1051 ) -> None: 

1052 try: 

1053 self.basefont = literal_name(spec["BaseFont"]) 

1054 except KeyError: 

1055 if strict: 

1056 raise PDFFontError("BaseFont is missing") 

1057 self.basefont = "unknown" 

1058 self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {})) 

1059 cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode( 

1060 "latin1", 

1061 ) 

1062 cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode( 

1063 "latin1", 

1064 ) 

1065 self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}" 

1066 self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict) 

1067 

1068 try: 

1069 descriptor = dict_value(spec["FontDescriptor"]) 

1070 except KeyError: 

1071 if strict: 

1072 raise PDFFontError("FontDescriptor is missing") 

1073 descriptor = {} 

1074 ttf = None 

1075 if "FontFile2" in descriptor: 

1076 self.fontfile = stream_value(descriptor.get("FontFile2")) 

1077 ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data())) 

1078 self.unicode_map: Optional[UnicodeMap] = None 

1079 if "ToUnicode" in spec: 

1080 if isinstance(spec["ToUnicode"], PDFStream): 

1081 strm = stream_value(spec["ToUnicode"]) 

1082 self.unicode_map = FileUnicodeMap() 

1083 CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() 

1084 else: 

1085 cmap_name = literal_name(spec["ToUnicode"]) 

1086 encoding = literal_name(spec["Encoding"]) 

1087 if ( 

1088 "Identity" in cid_ordering 

1089 or "Identity" in cmap_name 

1090 or "Identity" in encoding 

1091 ): 

1092 self.unicode_map = IdentityUnicodeMap() 

1093 elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"): 

1094 if ttf: 

1095 try: 

1096 self.unicode_map = ttf.create_unicode_map() 

1097 except TrueTypeFont.CMapNotFound: 

1098 pass 

1099 else: 

1100 try: 

1101 self.unicode_map = CMapDB.get_unicode_map( 

1102 self.cidcoding, 

1103 self.cmap.is_vertical(), 

1104 ) 

1105 except CMapDB.CMapNotFound: 

1106 pass 

1107 

1108 self.vertical = self.cmap.is_vertical() 

1109 if self.vertical: 

1110 # writing mode: vertical 

1111 widths2 = get_widths2(list_value(spec.get("W2", []))) 

1112 self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()} 

1113 (vy, w) = resolve1(spec.get("DW2", [880, -1000])) 

1114 self.default_disp = (None, vy) 

1115 widths = {cid: w for (cid, (w, _)) in widths2.items()} 

1116 default_width = w 

1117 else: 

1118 # writing mode: horizontal 

1119 self.disps = {} 

1120 self.default_disp = 0 

1121 widths = get_widths(list_value(spec.get("W", []))) 

1122 default_width = spec.get("DW", 1000) 

1123 PDFFont.__init__(self, descriptor, widths, default_width=default_width) 

1124 

1125 def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase: 

1126 """Get cmap from font specification 

1127 

1128 For certain PDFs, Encoding Type isn't mentioned as an attribute of 

1129 Encoding but as an attribute of CMapName, where CMapName is an 

1130 attribute of spec['Encoding']. 

1131 The horizontal/vertical modes are mentioned with different name 

1132 such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'. 

1133 """ 

1134 cmap_name = self._get_cmap_name(spec, strict) 

1135 

1136 try: 

1137 return CMapDB.get_cmap(cmap_name) 

1138 except CMapDB.CMapNotFound as e: 

1139 if strict: 

1140 raise PDFFontError(e) 

1141 return CMap() 

1142 

1143 @staticmethod 

1144 def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str: 

1145 """Get cmap name from font specification""" 

1146 cmap_name = "unknown" # default value 

1147 

1148 try: 

1149 spec_encoding = spec["Encoding"] 

1150 if hasattr(spec_encoding, "name"): 

1151 cmap_name = literal_name(spec["Encoding"]) 

1152 else: 

1153 cmap_name = literal_name(spec_encoding["CMapName"]) 

1154 except KeyError: 

1155 if strict: 

1156 raise PDFFontError("Encoding is unspecified") 

1157 

1158 if type(cmap_name) is PDFStream: # type: ignore[comparison-overlap] 

1159 cmap_name_stream: PDFStream = cast(PDFStream, cmap_name) 

1160 if "CMapName" in cmap_name_stream: 

1161 cmap_name = cmap_name_stream.get("CMapName").name 

1162 elif strict: 

1163 raise PDFFontError("CMapName unspecified for encoding") 

1164 

1165 return IDENTITY_ENCODER.get(cmap_name, cmap_name) 

1166 

1167 def __repr__(self) -> str: 

1168 return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>" 

1169 

1170 def is_vertical(self) -> bool: 

1171 return self.vertical 

1172 

1173 def is_multibyte(self) -> bool: 

1174 return True 

1175 

1176 def decode(self, bytes: bytes) -> Iterable[int]: 

1177 return self.cmap.decode(bytes) 

1178 

1179 def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]: 

1180 """Returns an integer for horizontal fonts, a tuple for vertical fonts.""" 

1181 return self.disps.get(cid, self.default_disp) 

1182 

1183 def to_unichr(self, cid: int) -> str: 

1184 try: 

1185 if not self.unicode_map: 

1186 raise PDFKeyError(cid) 

1187 return self.unicode_map.get_unichr(cid) 

1188 except KeyError: 

1189 raise PDFUnicodeNotDefined(self.cidcoding, cid)