Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdffont.py: 28%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

576 statements  

1import contextlib 

2import logging 

3import struct 

4from collections.abc import Iterable, Iterator, Mapping 

5from io import BytesIO 

6from typing import ( 

7 TYPE_CHECKING, 

8 Any, 

9 BinaryIO, 

10 cast, 

11) 

12 

13from pdfminer import settings 

14from pdfminer.casting import safe_float, safe_rect_list 

15from pdfminer.cmapdb import ( 

16 CMap, 

17 CMapBase, 

18 CMapDB, 

19 CMapParser, 

20 FileUnicodeMap, 

21 IdentityUnicodeMap, 

22 UnicodeMap, 

23) 

24from pdfminer.encodingdb import EncodingDB, name2unicode 

25from pdfminer.fontmetrics import FONT_METRICS 

26from pdfminer.pdfexceptions import PDFException, PDFKeyError, PDFValueError 

27from pdfminer.pdftypes import ( 

28 PDFStream, 

29 dict_value, 

30 int_value, 

31 list_value, 

32 num_value, 

33 resolve1, 

34 resolve_all, 

35 stream_value, 

36) 

37from pdfminer.psexceptions import PSEOF 

38from pdfminer.psparser import ( 

39 KWD, 

40 LIT, 

41 PSKeyword, 

42 PSLiteral, 

43 PSStackParser, 

44 literal_name, 

45) 

46from pdfminer.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack 

47 

48if TYPE_CHECKING: 

49 from pdfminer.pdfinterp import PDFResourceManager 

50 

51log = logging.getLogger(__name__) 

52 

53 

def get_widths(seq: Iterable[object]) -> dict[str | int, float]:
    """Build a mapping of character widths for horizontal writing.

    Two layouts are recognised while walking *seq* (each entry is passed
    through ``resolve1`` first):

    * ``first [w0 w1 ...]`` - the list assigns consecutive widths starting
      at the number that immediately precedes it;
    * ``first last w`` - three numbers assign the width ``w`` to every code
      in the inclusive range ``first..last``.

    Entries that are neither numbers nor lists are skipped with a warning,
    as are ranges whose bounds are not both ``int``.
    """
    widths: dict[int, float] = {}
    pending: list[float] = []
    for item in seq:
        item = resolve1(item)

        if isinstance(item, list):
            if pending:
                # The number right before the list is the first char code.
                start = pending[-1]
                for offset, width in enumerate(item):
                    widths[cast(int, start) + offset] = width
                pending = []
            continue

        if not isinstance(item, (int, float)):  # == not utils.isnumber(item)
            log.warning(
                "Skipping invalid font width specification for %s "
                "because it is not a number or a list",
                item,
            )
            continue

        pending.append(item)
        if len(pending) != 3:
            continue

        (first, last, width) = pending
        if isinstance(first, int) and isinstance(last, int):
            for code in range(first, last + 1):
                widths[code] = width
        else:
            log.warning(
                "Skipping invalid font width specification for %s to "
                "%s because either of them is not an int",
                first,
                last,
            )
        pending = []
    return cast(dict[str | int, float], widths)

88 

89 

def get_widths2(seq: Iterable[object]) -> dict[int, tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    Two layouts are recognised while walking *seq*:

    * ``first [w vx vy  w vx vy ...]`` - the list is consumed in chunks of
      three, assigned to consecutive codes starting at the number that
      immediately precedes the list;
    * ``first last w vx vy`` - five numbers assign ``(w, (vx, vy))`` to
      every code in the inclusive range ``first..last``.

    Mirrors ``get_widths``: every entry is passed through ``resolve1``, and
    invalid entries are skipped with a warning instead of raising.

    :returns: mapping of character code to ``(w, (vx, vy))``
    """
    widths: dict[int, tuple[float, Point]] = {}
    r: list[float] = []
    for v in seq:
        v = resolve1(v)
        if isinstance(v, list):
            if r:
                char1 = r[-1]
                for i, (w, vx, vy) in enumerate(choplist(3, v)):
                    widths[cast(int, char1) + i] = (w, (vx, vy))
                r = []
        elif isinstance(v, (int, float)):  # == utils.isnumber(v)
            r.append(v)
            if len(r) == 5:
                (char1, char2, w, vx, vy) = r
                if isinstance(char1, int) and isinstance(char2, int):
                    for i in range(char1, char2 + 1):
                        widths[i] = (w, (vx, vy))
                else:
                    # range() would raise TypeError on a float bound.
                    log.warning(
                        "Skipping invalid font width specification for %s to "
                        "%s because either of them is not an int",
                        char1,
                        char2,
                    )
                r = []
        else:
            log.warning(
                "Skipping invalid font width specification for %s "
                "because it is not a number or a list",
                v,
            )
    return widths

109 

110 

class FontMetricsDB:
    """Lookup facade over the ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname: str) -> tuple[dict[str, object], dict[str, int]]:
        """Return the ``FONT_METRICS`` entry stored under *fontname*.

        The lookup is a plain subscript, so a missing name propagates
        whatever ``FONT_METRICS`` raises for an unknown key.
        """
        metrics = FONT_METRICS[fontname]
        return metrics

115 

116 

117# int here means that we're not extending PSStackParser with additional types. 

class Type1FontHeaderParser(PSStackParser[int]):
    """Extract the encoding table from the header of a Type1 font program."""

    KEYWORD_BEGIN = KWD(b"begin")
    KEYWORD_END = KWD(b"end")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_PUT = KWD(b"put")
    KEYWORD_DICT = KWD(b"dict")
    KEYWORD_ARRAY = KWD(b"array")
    KEYWORD_READONLY = KWD(b"readonly")
    KEYWORD_FOR = KWD(b"for")

    def __init__(self, data: BinaryIO) -> None:
        PSStackParser.__init__(self, data)
        self._cid2unicode: dict[int, str] = {}

    def get_encoding(self) -> dict[int, str]:
        """Parse the font encoding.

        The Type1 font encoding maps character codes to character names. These
        character names could either be standard Adobe glyph names, or
        character names associated with custom CharStrings for this font. A
        CharString is a sequence of operations that describe how the character
        should be drawn. Names that ``name2unicode`` rejects with ``KeyError``
        are logged at debug level and left out of the mapping.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode characters
        """
        while True:
            try:
                (cid, name) = self.nextobject()
            except PSEOF:
                break
            try:
                self._cid2unicode[cid] = name2unicode(cast(str, name))
            except KeyError as e:
                log.debug(str(e))
        return self._cid2unicode

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Record ``<int> <literal> put`` pairs as ``(code, glyph name)``."""
        if token is self.KEYWORD_PUT:
            operands = self.pop(2)
            # A malformed header can issue ``put`` with fewer than two
            # operands available; unpacking would then raise ValueError.
            if len(operands) != 2:
                return
            ((_, key), (_, value)) = operands
            if isinstance(key, int) and isinstance(value, PSLiteral):
                self.add_results((key, literal_name(value)))

162 

163 

164NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-") 

165 

166# Mapping of cmap names. Original cmap name is kept if not in the mapping. 

167# (missing reference for why DLIdent is mapped to Identity) 

168IDENTITY_ENCODER = { 

169 "DLIdent-H": "Identity-H", 

170 "DLIdent-V": "Identity-V", 

171} 

172 

173 

174def getdict(data: bytes) -> dict[int, list[float | int]]: 

175 d: dict[int, list[float | int]] = {} 

176 fp = BytesIO(data) 

177 stack: list[float | int] = [] 

178 while 1: 

179 c = fp.read(1) 

180 if not c: 

181 break 

182 b0 = ord(c) 

183 if b0 <= 21: 

184 d[b0] = stack 

185 stack = [] 

186 continue 

187 if b0 == 30: 

188 s = "" 

189 loop = True 

190 while loop: 

191 b = ord(fp.read(1)) 

192 for n in (b >> 4, b & 15): 

193 if n == 15: 

194 loop = False 

195 else: 

196 nibble = NIBBLES[n] 

197 assert nibble is not None 

198 s += nibble 

199 value = float(s) 

200 elif b0 >= 32 and b0 <= 246: 

201 value = b0 - 139 

202 else: 

203 b1 = ord(fp.read(1)) 

204 if b0 >= 247 and b0 <= 250: 

205 value = ((b0 - 247) << 8) + b1 + 108 

206 elif b0 >= 251 and b0 <= 254: 

207 value = -((b0 - 251) << 8) - b1 - 108 

208 else: 

209 b2 = ord(fp.read(1)) 

210 if b1 >= 128: 

211 b1 -= 256 

212 if b0 == 28: 

213 value = b1 << 8 | b2 

214 else: 

215 value = b1 << 24 | b2 << 16 | struct.unpack(">H", fp.read(2))[0] 

216 stack.append(value) 

217 return d 

218 

219 

class CFFFont:
    """Reader for the tables of a Compact Font Format (CFF) font program.

    The constructor reads the header, the four leading INDEX structures, the
    Top DICT, and then builds the ``code2gid``/``gid2code`` and
    ``name2gid``/``gid2name`` mappings.
    """

    # fmt: off
    STANDARD_STRINGS = (
        ".notdef", "space", "exclam", "quotedbl", "numbersign", "dollar",
        "percent", "ampersand", "quoteright", "parenleft", "parenright",
        "asterisk", "plus", "comma", "hyphen", "period", "slash",
        "zero", "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "colon", "semicolon", "less", "equal", "greater",
        "question", "at",
        "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
        "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
        "bracketleft", "backslash", "bracketright", "asciicircum",
        "underscore", "quoteleft",
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
        "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
        "braceleft", "bar", "braceright", "asciitilde", "exclamdown", "cent",
        "sterling", "fraction", "yen", "florin", "section", "currency",
        "quotesingle", "quotedblleft", "guillemotleft", "guilsinglleft",
        "guilsinglright", "fi", "fl", "endash", "dagger", "daggerdbl",
        "periodcentered", "paragraph", "bullet", "quotesinglbase",
        "quotedblbase", "quotedblright", "guillemotright", "ellipsis",
        "perthousand", "questiondown", "grave", "acute", "circumflex",
        "tilde", "macron", "breve", "dotaccent", "dieresis", "ring",
        "cedilla", "hungarumlaut", "ogonek", "caron", "emdash", "AE",
        "ordfeminine", "Lslash", "Oslash", "OE", "ordmasculine", "ae",
        "dotlessi", "lslash", "oslash", "oe", "germandbls", "onesuperior",
        "logicalnot", "mu", "trademark", "Eth", "onehalf", "plusminus",
        "Thorn", "onequarter", "divide", "brokenbar", "degree", "thorn",
        "threequarters", "twosuperior", "registered", "minus", "eth",
        "multiply", "threesuperior", "copyright",
        "Aacute", "Acircumflex", "Adieresis", "Agrave", "Aring", "Atilde",
        "Ccedilla", "Eacute", "Ecircumflex", "Edieresis", "Egrave", "Iacute",
        "Icircumflex", "Idieresis", "Igrave", "Ntilde", "Oacute",
        "Ocircumflex", "Odieresis", "Ograve", "Otilde", "Scaron", "Uacute",
        "Ucircumflex", "Udieresis", "Ugrave", "Yacute", "Ydieresis", "Zcaron",
        "aacute", "acircumflex", "adieresis", "agrave", "aring", "atilde",
        "ccedilla", "eacute", "ecircumflex", "edieresis", "egrave", "iacute",
        "icircumflex", "idieresis", "igrave", "ntilde", "oacute",
        "ocircumflex", "odieresis", "ograve", "otilde", "scaron", "uacute",
        "ucircumflex", "udieresis", "ugrave", "yacute", "ydieresis", "zcaron",
        "exclamsmall", "Hungarumlautsmall", "dollaroldstyle",
        "dollarsuperior", "ampersandsmall", "Acutesmall",
        "parenleftsuperior", "parenrightsuperior", "twodotenleader",
        "onedotenleader", "zerooldstyle", "oneoldstyle", "twooldstyle",
        "threeoldstyle", "fouroldstyle", "fiveoldstyle", "sixoldstyle",
        "sevenoldstyle", "eightoldstyle", "nineoldstyle", "commasuperior",
        "threequartersemdash", "periodsuperior", "questionsmall",
        "asuperior", "bsuperior", "centsuperior", "dsuperior", "esuperior",
        "isuperior", "lsuperior", "msuperior", "nsuperior", "osuperior",
        "rsuperior", "ssuperior", "tsuperior", "ff", "ffi", "ffl",
        "parenleftinferior", "parenrightinferior", "Circumflexsmall",
        "hyphensuperior", "Gravesmall",
        "Asmall", "Bsmall", "Csmall", "Dsmall", "Esmall", "Fsmall", "Gsmall",
        "Hsmall", "Ismall", "Jsmall", "Ksmall", "Lsmall", "Msmall", "Nsmall",
        "Osmall", "Psmall", "Qsmall", "Rsmall", "Ssmall", "Tsmall", "Usmall",
        "Vsmall", "Wsmall", "Xsmall", "Ysmall", "Zsmall",
        "colonmonetary", "onefitted", "rupiah", "Tildesmall",
        "exclamdownsmall", "centoldstyle", "Lslashsmall", "Scaronsmall",
        "Zcaronsmall", "Dieresissmall", "Brevesmall", "Caronsmall",
        "Dotaccentsmall", "Macronsmall", "figuredash", "hypheninferior",
        "Ogoneksmall", "Ringsmall", "Cedillasmall", "questiondownsmall",
        "oneeighth", "threeeighths", "fiveeighths", "seveneighths",
        "onethird", "twothirds", "zerosuperior", "foursuperior",
        "fivesuperior", "sixsuperior", "sevensuperior", "eightsuperior",
        "ninesuperior", "zeroinferior", "oneinferior", "twoinferior",
        "threeinferior", "fourinferior", "fiveinferior", "sixinferior",
        "seveninferior", "eightinferior", "nineinferior", "centinferior",
        "dollarinfer" "ior", "periodinferior", "commainferior",
        "Agravesmall", "Aacutesmall", "Acircumflexsmall", "Atildesmall",
        "Adieresissmall", "Aringsmall", "AEsmall", "Ccedillasmall",
        "Egravesmall", "Eacutesmall", "Ecircumflexsmall", "Edieresissmall",
        "Igravesmall", "Iacutesmall", "Icircumflexsmall", "Idieresissmall",
        "Ethsmall", "Ntildesmall", "Ogravesmall", "Oacutesmall",
        "Ocircumflexsmall", "Otildesmall", "Odieresissmall", "OEsmall",
        "Oslashsmall", "Ugravesmall", "Uacutesmall", "Ucircumflexsmall",
        "Udieresissmall", "Yacutesmall", "Thornsmall", "Ydieresissmall",
        "001.000", "001.001", "001.002", "001.003",
        "Black", "Bold", "Book", "Light", "Medium", "Regular", "Roman",
        "Semibold",
    )
    # fmt: on

    class INDEX:
        """Random-access view of one CFF INDEX structure read from ``fp``.

        On construction the offset array is read and the file position is
        left just past the end of the INDEX data.
        """

        def __init__(self, fp: BinaryIO) -> None:
            self.fp = fp
            self.offsets: list[int] = []
            (count,) = struct.unpack(">H", self.fp.read(2))
            if count == 0:
                # An empty INDEX consists of the count field only: there is
                # no offSize byte and no offset array to consume.
                self.base = self.fp.tell() - 1
                return
            (offsize,) = struct.unpack("B", self.fp.read(1))
            for _i in range(count + 1):
                self.offsets.append(nunpack(self.fp.read(offsize)))
            # Offsets are relative to the byte preceding the object data.
            self.base = self.fp.tell() - 1
            self.fp.seek(self.base + self.offsets[-1])

        def __repr__(self) -> str:
            return f"<INDEX: size={len(self)}>"

        def __len__(self) -> int:
            # Never negative, even for an empty INDEX with no offsets.
            return max(len(self.offsets) - 1, 0)

        def __getitem__(self, i: int) -> bytes:
            self.fp.seek(self.base + self.offsets[i])
            return self.fp.read(self.offsets[i + 1] - self.offsets[i])

        def __iter__(self) -> Iterator[bytes]:
            return iter(self[i] for i in range(len(self)))

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Parse the font program available from *fp*.

        :param name: font name, stored as ``self.name``
        :param fp: seekable binary stream positioned at the CFF header
        :raises PDFValueError: for an unsupported encoding or charset format
        :raises AssertionError: for charset format 2, which is not handled
        """
        self.name = name
        self.fp = fp
        # Header
        (_major, _minor, hdrsize, _offsize) = struct.unpack("BBBB", self.fp.read(4))
        # Skip the rest of the header; guard against hdrsize < 4, which
        # would otherwise turn into a negative (read-everything) size.
        self.fp.read(max(hdrsize - 4, 0))
        # Name INDEX
        self.name_index = self.INDEX(self.fp)
        # Top DICT INDEX
        self.dict_index = self.INDEX(self.fp)
        # String INDEX
        self.string_index = self.INDEX(self.fp)
        # Global Subr INDEX
        self.subr_index = self.INDEX(self.fp)
        # Top DICT DATA
        self.top_dict = getdict(self.dict_index[0])
        (charset_pos,) = self.top_dict.get(15, [0])
        (encoding_pos,) = self.top_dict.get(16, [0])
        (charstring_pos,) = self.top_dict.get(17, [0])
        # CharStrings
        self.fp.seek(cast(int, charstring_pos))
        self.charstring = self.INDEX(self.fp)
        self.nglyphs = len(self.charstring)
        # Encodings
        # NOTE(review): the roles of code and gid below look swapped relative
        # to the CFF specification - confirm before relying on these maps.
        self.code2gid = {}
        self.gid2code = {}
        self.fp.seek(cast(int, encoding_pos))
        fmt = self.fp.read(1)
        if fmt == b"\x00":
            # Format 0
            (n,) = struct.unpack("B", self.fp.read(1))
            for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))):
                self.code2gid[code] = gid
                self.gid2code[gid] = code
        elif fmt == b"\x01":
            # Format 1
            (n,) = struct.unpack("B", self.fp.read(1))
            code = 0
            for _i in range(n):
                (first, nleft) = struct.unpack("BB", self.fp.read(2))
                for gid in range(first, first + nleft + 1):
                    self.code2gid[code] = gid
                    self.gid2code[gid] = code
                    code += 1
        else:
            raise PDFValueError(f"unsupported encoding format: {fmt!r}")
        # Charsets
        self.name2gid = {}
        self.gid2name = {}
        self.fp.seek(cast(int, charset_pos))
        fmt = self.fp.read(1)
        if fmt == b"\x00":
            # Format 0: one SID per glyph, glyph 0 is implicit.
            n = self.nglyphs - 1
            for gid, sid in enumerate(
                cast(
                    tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n))
                ),
            ):
                gid += 1
                sidname = self.getstr(sid)
                self.name2gid[sidname] = gid
                self.gid2name[gid] = sidname
        elif fmt == b"\x01":
            # Format 1
            # NOTE(review): this reads a range count and one-byte range
            # starts, which does not appear to match the CFF specification's
            # charset format 1 layout - confirm against the spec.
            (n,) = struct.unpack("B", self.fp.read(1))
            sid = 0
            for _i in range(n):
                (first, nleft) = struct.unpack("BB", self.fp.read(2))
                for gid in range(first, first + nleft + 1):
                    sidname = self.getstr(sid)
                    self.name2gid[sidname] = gid
                    self.gid2name[gid] = sidname
                    sid += 1
        elif fmt == b"\x02":
            # Format 2
            raise AssertionError(str(("Unhandled", fmt)))
        else:
            raise PDFValueError(f"unsupported charset format: {fmt!r}")

    def getstr(self, sid: int) -> str | bytes:
        """Return the string for string id *sid*.

        Ids below ``len(STANDARD_STRINGS)`` yield the predefined ``str``;
        larger ids are looked up in the font's String INDEX and come back
        as raw ``bytes``.
        """
        # This returns str for one of the STANDARD_STRINGS but bytes otherwise,
        # and appears to be a needless source of type complexity.
        if sid < len(self.STANDARD_STRINGS):
            return self.STANDARD_STRINGS[sid]
        return self.string_index[sid - len(self.STANDARD_STRINGS)]

724 

725 

class TrueTypeFont:
    """Minimal TrueType reader: table directory plus ``cmap`` decoding."""

    class CMapNotFound(PDFException):
        """Raised when no usable Unicode ``cmap`` data is available."""

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Read the table directory from *fp*.

        A truncated directory is tolerated: whatever table records could be
        read are kept in ``self.tables`` (tag -> ``(offset, length)``).
        """
        self.name = name
        self.fp = fp
        self.tables: dict[bytes, tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = struct.unpack(">HHHH", fp.read(8))
            for _ in range(ntables):
                (name_bytes, _tsum, offset, length) = struct.unpack(
                    ">4sLLL", fp.read(16)
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a glyph-id -> unicode map from the font's ``cmap`` table.

        Only Unicode subtables are considered (platform 0, or platform 3
        with encoding 1 or 10). Subtables in an unsupported format are
        skipped with a warning so that the remaining ones can still be used.

        :raises TrueTypeFont.CMapNotFound: if there is no ``cmap`` table or
            no supported subtable produced any mapping
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, _length) = self.tables[b"cmap"]
        fp = self.fp
        fp.seek(base_offset)
        (_version, nsubtables) = cast(tuple[int, int], struct.unpack(">HH", fp.read(4)))
        subtables: list[tuple[int, int, int]] = []
        for _i in range(nsubtables):
            subtables.append(struct.unpack(">HHL", fp.read(8)))
        char2gid: dict[int, int] = {}
        # Supports subtable type 0, 2, 4, 6, 10 and 12.
        parsers = {
            0: self.parse_cmap_format_0,
            2: self.parse_cmap_format_2,
            4: self.parse_cmap_format_4,
            6: self.parse_cmap_format_6,
            10: self.parse_cmap_format_10,
            12: self.parse_cmap_format_12,
        }
        for platform_id, encoding_id, st_offset in subtables:
            # Skip non-Unicode cmaps.
            # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
            if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
                continue
            fp.seek(base_offset + st_offset)
            (fmttype,) = struct.unpack(">H", fp.read(2))
            parse = parsers.get(fmttype)
            if parse is None:
                # An unknown format must not abort the whole map; other
                # subtables may still provide usable data.
                log.warning("Skipping unsupported cmap subtable format %s", fmttype)
                continue
            parse(fp, char2gid)
        if not char2gid:
            raise TrueTypeFont.CMapNotFound
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid.items():
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map

    def parse_cmap_format_0(self, fp: BinaryIO, char2gid: dict[int, int]) -> None:
        """Parse cmap subtable format 0"""
        fmtlen, fmtlang = struct.unpack(">HH", fp.read(4))
        log.debug("parse_cmap_format: fmtlen=%s, fmtlang=%s", fmtlen, fmtlang)
        char2gid.update(enumerate(struct.unpack(">256B", fp.read(256))))

    def parse_cmap_format_2(self, fp: BinaryIO, char2gid: dict[int, int]) -> None:
        """Parse cmap subtable format 2"""
        fmtlen, fmtlang = struct.unpack(">HH", fp.read(4))
        log.debug("parse_cmap_format: fmtlen=%s, fmtlang=%s", fmtlen, fmtlang)
        subheaderkeys = struct.unpack(">256H", fp.read(512))
        firstbytes = [0] * 8192
        for i, k in enumerate(subheaderkeys):
            firstbytes[k // 8] = i
        nhdrs = max(subheaderkeys) // 8 + 1
        hdrs: list[tuple[int, int, int, int, int]] = []
        for i in range(nhdrs):
            (firstcode, entcount, delta, offset) = struct.unpack(">HHhH", fp.read(8))
            hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
        for i, firstcode, entcount, delta, pos in hdrs:
            if not entcount:
                continue
            first = firstcode + (firstbytes[i] << 8)
            fp.seek(pos)
            for c in range(entcount):
                gid = struct.unpack(">H", fp.read(2))[0]
                if gid:
                    gid += delta
                char2gid[first + c] = gid

    def parse_cmap_format_4(self, fp: BinaryIO, char2gid: dict[int, int]) -> None:
        """Parse cmap subtable format 4"""
        fmtlen, fmtlang = struct.unpack(">HH", fp.read(4))
        log.debug("parse_cmap_format: fmtlen=%s, fmtlang=%s", fmtlen, fmtlang)
        (segcount, _1, _2, _3) = struct.unpack(">HHHH", fp.read(8))
        segcount //= 2
        ecs = struct.unpack(f">{segcount}H", fp.read(2 * segcount))
        fp.read(2)  # reservedPad
        scs = struct.unpack(f">{segcount}H", fp.read(2 * segcount))
        idds = struct.unpack(f">{segcount}h", fp.read(2 * segcount))
        pos = fp.tell()
        idrs = struct.unpack(f">{segcount}H", fp.read(2 * segcount))
        for seg, (ec, sc, idd, idr) in enumerate(
            zip(ecs, scs, idds, idrs, strict=False)
        ):
            if idr:
                # idRangeOffset is relative to the position of its own
                # entry in the idRangeOffset array, not to the array start.
                fp.seek(pos + 2 * seg + idr)
                for c in range(sc, ec + 1):
                    b = struct.unpack(">H", fp.read(2))[0]
                    # A stored glyph index of 0 means "missing glyph" and
                    # must not have idDelta applied.
                    char2gid[c] = (b + idd) & 0xFFFF if b else 0
            else:
                for c in range(sc, ec + 1):
                    char2gid[c] = (c + idd) & 0xFFFF

    def parse_cmap_format_6(self, fp: BinaryIO, char2gid: dict[int, int]) -> None:
        """Parse cmap subtable format 6"""
        fmtlen, fmtlang = struct.unpack(">HH", fp.read(4))
        log.debug("parse_cmap_format: fmtlen=%s, fmtlang=%s", fmtlen, fmtlang)
        firstcode, entcount = struct.unpack(">HH", fp.read(4))
        gids = struct.unpack(f">{entcount}H", fp.read(2 * entcount))
        for i, gid in enumerate(gids):
            char2gid[firstcode + i] = gid

    def parse_cmap_format_10(self, fp: BinaryIO, char2gid: dict[int, int]) -> None:
        """Parse cmap subtable format 10"""
        rsv, fmtlen, fmtlang = struct.unpack(">HII", fp.read(10))
        log.debug(
            "parse_cmap_format: rsv=%s, fmtlen=%s, fmtlang=%s", rsv, fmtlen, fmtlang
        )
        startcode, numchars = struct.unpack(">II", fp.read(8))
        gids = struct.unpack(f">{numchars}H", fp.read(2 * numchars))
        for i, gid in enumerate(gids):
            char2gid[startcode + i] = gid

    def parse_cmap_format_12(self, fp: BinaryIO, char2gid: dict[int, int]) -> None:
        """Parse cmap subtable format 12"""
        rsv, fmtlen, fmtlang = struct.unpack(">HII", fp.read(10))
        log.debug(
            "parse_cmap_format: rsv=%s, fmtlen=%s, fmtlang=%s", rsv, fmtlen, fmtlang
        )
        numgroups = struct.unpack(">I", fp.read(4))[0]
        for _i in range(numgroups):
            sc, ec, sgid = struct.unpack(">III", fp.read(12))
            for code in range(sc, ec + 1):
                char2gid[code] = sgid
                sgid += 1

873 

874 

class PDFFontError(PDFException):
    """Base class for the font-related errors raised in this module."""

877 

878 

class PDFUnicodeNotDefined(PDFFontError):
    """Raised when a character id has no unicode equivalent in a font."""

881 

882 

LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
LITERAL_TYPE1C = LIT("Type1C")

# Font widths are maintained in a dict type that maps from *either* unicode
# chars or integer character IDs.
FontWidthDict = dict[int | str, float]

889 

890 

class PDFFont:
    """Common base for the font classes: descriptor metrics and widths.

    Subclasses are expected to provide ``to_unichr``; the base version
    raises ``NotImplementedError``.
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: float | None = None,
    ) -> None:
        """Read the metrics out of *descriptor*.

        :param descriptor: font descriptor mapping
        :param widths: widths keyed by character id or unicode string
        :param default_width: fallback width; when ``None`` the descriptor's
            ``MissingWidth`` (default 0) is used instead
        """
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)

        fontname = resolve1(descriptor.get("FontName", "unknown"))
        if isinstance(fontname, PSLiteral):
            fontname = literal_name(fontname)
        self.fontname = fontname

        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))

        if default_width is None:
            default_width = num_value(descriptor.get("MissingWidth", 0))
        self.default_width = resolve1(default_width)

        self.leading = num_value(descriptor.get("Leading", 0))
        self.bbox = self._parse_bbox(descriptor)
        self.hscale = self.vscale = 0.001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        """The base font is never vertical."""
        return False

    def is_multibyte(self) -> bool:
        """The base font is never multibyte."""
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Turn a byte string into character ids, one per byte."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self) -> float:
        """Scaled bbox width; a zero-width bbox yields the negated default width."""
        (x0, _, x1, _) = (self.bbox[0], None, self.bbox[2], None)
        width = x1 - x0
        if width == 0:
            width = -self.default_width
        return width * self.hscale

    def get_height(self) -> float:
        """Scaled bbox height; a zero-height bbox yields ascent minus descent."""
        height = self.bbox[3] - self.bbox[1]
        if height == 0:
            height = self.ascent - self.descent
        return height * self.vscale

    def char_width(self, cid: int) -> float:
        """Scaled width of *cid*, falling back to the default width."""
        # Because character widths may be mapping either IDs or strings,
        # we try to lookup the character ID first, then its str equivalent.
        by_id = safe_float(self.widths.get(cid))
        if by_id is not None:
            return by_id * self.hscale

        try:
            by_str = safe_float(self.widths.get(self.to_unichr(cid)))
        except PDFUnicodeNotDefined:
            by_str = None
        if by_str is not None:
            return by_str * self.hscale

        return self.default_width * self.hscale

    def char_disp(self, cid: int) -> float | tuple[float | None, float]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Sum of ``char_width`` over every character id decoded from *s*."""
        return sum(self.char_width(cid) for cid in self.decode(s))

    def to_unichr(self, cid: int) -> str:
        raise NotImplementedError

    @staticmethod
    def _parse_bbox(descriptor: Mapping[str, Any]) -> Rect:
        """Parse FontBBox from the fonts descriptor"""
        font_bbox = resolve_all(descriptor.get("FontBBox"))
        bbox = safe_rect_list(font_bbox)
        if bbox is not None:
            return bbox
        log.warning(
            "Could not get FontBBox from font descriptor because "
            "%r cannot be parsed as 4 floats",
            font_bbox,
        )
        return 0.0, 0.0, 0.0, 0.0

996 

997 

class PDFSimpleFont(PDFFont):
    """Font whose character ids map to unicode through an encoding table,
    optionally overridden by a ``ToUnicode`` CMap."""

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        spec: Mapping[str, Any],
    ) -> None:
        # Font encoding is specified either by a name of
        # built-in encoding or a dictionary that describes
        # the differences.

        if literal_name(spec.get("Subtype")) == "TrueType":
            # PDF spec: TrueType fonts without Encoding default to WinAnsiEncoding
            fallback = LIT("WinAnsiEncoding")
        else:
            fallback = LITERAL_STANDARD_ENCODING

        encoding = resolve1(spec["Encoding"]) if "Encoding" in spec else fallback

        if isinstance(encoding, dict):
            base_name = literal_name(encoding.get("BaseEncoding", fallback))
            differences = list_value(encoding.get("Differences", []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)
        else:
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))

        self.unicode_map: UnicodeMap | None = None
        if "ToUnicode" in spec:
            to_unicode = stream_value(spec["ToUnicode"])
            self.unicode_map = FileUnicodeMap()
            CMapParser(self.unicode_map, BytesIO(to_unicode.get_data())).run()
        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode string for *cid*.

        The ``ToUnicode`` map is consulted first when present; the encoding
        table is the fallback.

        :raises PDFUnicodeNotDefined: if neither source knows *cid*
        """
        if self.unicode_map:
            try:
                return self.unicode_map.get_unichr(cid)
            except KeyError:
                pass
        try:
            return self.cid2unicode[cid]
        except KeyError as err:
            raise PDFUnicodeNotDefined(None, cid) from err

1042 

1043 

class PDFType1Font(PDFSimpleFont):
    """Simple font whose metrics come from ``FontMetricsDB`` when the base
    font name is known there, and from the font dictionary otherwise."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing") from None
            self.basefont = "unknown"

        widths: FontWidthDict
        try:
            (descriptor, builtin_widths) = FontMetricsDB.get_metrics(self.basefont)
            widths = cast(dict[str | int, float], builtin_widths)  # implicit int->float
        except KeyError:
            # No built-in metrics: read them from the font dictionary.
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            first_code = int_value(spec.get("FirstChar", 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            raw_widths = list_value(spec.get("Widths", [0] * 256))
            widths = {}
            for index, raw in enumerate(raw_widths):
                widths[index + first_code] = resolve1(raw)
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        if "Encoding" in spec or "FontFile" not in descriptor:
            return
        # try to recover the missing encoding info from the font file.
        self.fontfile = stream_value(descriptor.get("FontFile"))
        header_length = int_value(self.fontfile["Length1"])
        header = self.fontfile.get_data()[:header_length]
        self.cid2unicode = Type1FontHeaderParser(BytesIO(header)).get_encoding()

    def __repr__(self) -> str:
        return f"<PDFType1Font: basefont={self.basefont!r}>"

1074 

1075 

class PDFTrueTypeFont(PDFType1Font):
    """Same behaviour as ``PDFType1Font``; only the ``repr`` differs."""

    def __repr__(self) -> str:
        return f"<PDFTrueTypeFont: basefont={self.basefont!r}>"

1079 

1080 

class PDFType3Font(PDFSimpleFont):
    """Simple font whose scale factors are derived from its ``FontMatrix``
    and whose ascent/descent are taken from the bounding box."""

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        first_code = int_value(spec.get("FirstChar", 0))
        # lastchar = int_value(spec.get('LastChar', 0))
        raw_widths = list_value(spec.get("Widths", [0] * 256))
        widths: dict[str | int, float] = {}
        for index, width in enumerate(raw_widths):
            widths[index + first_code] = width

        if "FontDescriptor" not in spec:
            # Synthesize a minimal descriptor from the font dictionary.
            descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
        else:
            descriptor = dict_value(spec["FontDescriptor"])
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

        self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix"))))
        # Vertical extent comes straight from the bounding box.
        (_, self.descent, _, self.ascent) = self.bbox
        (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))

    def __repr__(self) -> str:
        return "<PDFType3Font>"

1100 

1101 

class PDFCIDFont(PDFFont):
    """Composite (CID-keyed) font built from a font dictionary.

    The constructor derives three things from the dictionary:

    * ``cmap`` - maps the bytes of a string to character ids (CIDs);
    * ``unicode_map`` - maps CIDs to text, or ``None`` when no mapping could
      be determined;
    * width / displacement tables for horizontal or vertical writing.
    """

    # 0 for horizontal fonts, ``(None, vy)`` for vertical fonts.
    default_disp: float | tuple[float | None, float]

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        spec: Mapping[str, Any],
        strict: bool = settings.STRICT,
    ) -> None:
        """Initialise the font from its dictionary ``spec``.

        Args:
            rsrcmgr: Not used by this constructor.
            spec: The font dictionary.
            strict: When true, a missing ``BaseFont``, ``FontDescriptor`` or
                ``Encoding`` (or an unknown CMap) raises ``PDFFontError``;
                otherwise a fallback value is used. The default is read from
                ``settings.STRICT`` once, when the class body is executed.

        Raises:
            PDFFontError: In strict mode, for the missing entries listed above.
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if strict:
                raise PDFFontError("BaseFont is missing") from None
            self.basefont = "unknown"
        self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
        cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode(
            "latin1",
        )
        cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode(
            "latin1",
        )
        # "<registry>-<ordering>", e.g. "Adobe-Identity"; used as a lookup key below.
        self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
        self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec["FontDescriptor"])
        except KeyError:
            if strict:
                raise PDFFontError("FontDescriptor is missing") from None
            descriptor = {}
        ttf = None
        if "FontFile2" in descriptor:
            self.fontfile = stream_value(descriptor.get("FontFile2"))
            ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))
        self.unicode_map: UnicodeMap | None = None
        if "ToUnicode" in spec:
            if isinstance(spec["ToUnicode"], PDFStream):
                # Embedded ToUnicode CMap: parse it into a fresh map.
                strm = stream_value(spec["ToUnicode"])
                self.unicode_map = FileUnicodeMap()
                CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
            else:
                cmap_name = literal_name(spec["ToUnicode"])
                # "Encoding" can be absent in non-strict mode (the cmap lookup
                # above tolerates that), so do not index it unconditionally;
                # an empty name simply never matches "Identity".
                encoding = (
                    literal_name(spec["Encoding"]) if "Encoding" in spec else ""
                )
                if (
                    "Identity" in cid_ordering
                    or "Identity" in cmap_name
                    or "Identity" in encoding
                ):
                    self.unicode_map = IdentityUnicodeMap()
        elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
            # No ToUnicode entry: fall back to the embedded TrueType font's
            # own character map, if there is one.
            if ttf:
                with contextlib.suppress(TrueTypeFont.CMapNotFound):
                    self.unicode_map = ttf.create_unicode_map()
        else:
            # Otherwise try a predefined map for this registry/ordering;
            # leave unicode_map as None when none is found.
            with contextlib.suppress(CMapDB.CMapNotFound):
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding,
                    self.cmap.is_vertical(),
                )

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            widths2 = get_widths2(list_value(spec.get("W2", [])))
            self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()}
            # DW2 is a pair: the first element becomes the default vertical
            # displacement component, the second the default width.
            (vy, w) = resolve1(spec.get("DW2", [880, -1000]))
            self.default_disp = (None, vy)
            widths: dict[str | int, float] = {
                cid: w for (cid, (w, _)) in widths2.items()
            }
            default_width = w
        else:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get("W", [])))
            default_width = spec.get("DW", 1000)
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.

        Returns the CMap registered under the resolved name. When the name is
        unknown, raises ``PDFFontError`` in strict mode and returns an empty
        ``CMap()`` otherwise.
        """
        cmap_name = self._get_cmap_name(spec, strict)

        try:
            return CMapDB.get_cmap(cmap_name)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e) from e
            return CMap()

    @staticmethod
    def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
        """Get cmap name from font specification

        Falls back to ``"unknown"`` when ``Encoding`` (or its ``CMapName``)
        is missing and ``strict`` is false; raises ``PDFFontError`` in that
        case when ``strict`` is true. The result is passed through
        ``IDENTITY_ENCODER`` so that aliases map to a canonical name.
        """
        cmap_name = "unknown"  # default value

        try:
            spec_encoding = spec["Encoding"]
            if hasattr(spec_encoding, "name"):
                # Encoding is itself a name object.
                cmap_name = literal_name(spec["Encoding"])
            else:
                # Encoding is a dictionary-like object carrying a CMapName.
                cmap_name = literal_name(spec_encoding["CMapName"])
        except KeyError:
            if strict:
                raise PDFFontError("Encoding is unspecified") from None

        # Exact-type check kept as in the original code.
        if type(cmap_name) is PDFStream:  # type: ignore[comparison-overlap]
            cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
            if "CMapName" in cmap_name_stream:
                cmap_name = cmap_name_stream.get("CMapName").name
            elif strict:
                raise PDFFontError("CMapName unspecified for encoding")

        # Unrecognised names are returned unchanged.
        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

    def __repr__(self) -> str:
        """Return a debug representation with the base font and CID coding."""
        return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"

    def is_vertical(self) -> bool:
        """Return the writing mode flag computed from the cmap at construction."""
        return self.vertical

    def is_multibyte(self) -> bool:
        """Return ``True``: this font class always reports itself as multibyte."""
        return True

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Decode a byte string into character ids by delegating to the cmap."""
        return self.cmap.decode(bytes)

    def char_disp(self, cid: int) -> float | tuple[float | None, float]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts.

        Looks ``cid`` up in ``self.disps`` and falls back to
        ``self.default_disp`` (0 when horizontal, ``(None, vy)`` when vertical).
        """
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid: int) -> str:
        """Return the text for ``cid`` according to ``self.unicode_map``.

        Raises:
            PDFUnicodeNotDefined: If the font has no unicode map, or the map
                raises ``KeyError`` for ``cid``.
        """
        try:
            if not self.unicode_map:
                # Routed through the handler below; this relies on PDFKeyError
                # being a KeyError subclass (presumably - it is defined elsewhere).
                raise PDFKeyError(cid)
            return self.unicode_map.get_unichr(cid)
        except KeyError as err:
            raise PDFUnicodeNotDefined(self.cidcoding, cid) from err