Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_font.py: 27%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

255 statements  

1from __future__ import annotations 

2 

3from collections.abc import Sequence 

4from dataclasses import dataclass, field 

5from typing import TYPE_CHECKING, Any, cast 

6 

7from pypdf.generic import ArrayObject, DictionaryObject, NameObject, NumberObject, StreamObject 

8 

9from ._cmap import get_encoding 

10from ._codecs.adobe_glyphs import adobe_glyphs 

11from ._utils import logger_warning 

12from .constants import FontFlags 

13from .errors import PdfReadError 

14 

15if TYPE_CHECKING: 

16 from io import BytesIO 

17 

18 from fontTools.ttLib.tables._h_e_a_d import table__h_e_a_d 

19 from fontTools.ttLib.tables._p_o_s_t import table__p_o_s_t 

20 from fontTools.ttLib.tables.O_S_2f_2 import table_O_S_2f_2 

21 

22try: 

23 from fontTools.ttLib import TTFont 

24 HAS_FONTTOOLS = True 

25except ImportError: 

26 HAS_FONTTOOLS = False 

27 

28 

# Values from TrueType font tables that are inspected when deriving PDF font flags.

# Bit masks signalling an italic face
HEADER_MACSTYLE_ITALIC = 0x02  # tested against the "head" table's macStyle
OS2_FSSELECTION_ITALIC = 0x01  # tested against the "OS/2" table's fsSelection

# Values compared against the "OS/2" table's panose.bFamilyType
OS2_PANOSE_BFAMILYTYPE_SCRIPT = 3
OS2_PANOSE_BFAMILYTYPE_DECORATIVE = 4
OS2_PANOSE_BFAMILYTYPE_PICTORIAL = 5

# Value compared against the "OS/2" table's panose.bProportion
OS2_PANOSE_BPROPORTION_MONOSPACED = 9

# Values compared against the high byte of the "OS/2" table's sFamilyClass
OS2_SFAMILYSCLASS_SCRIPTS = 10
OS2_SFAMILYSCLASS_SYMBOLIC = 12

38 

39 

@dataclass(frozen=True)
class FontDescriptor:
    """
    Immutable counterpart of the FontDescriptor dictionary from the PDF
    specification, holding descriptive as well as metric information.

    Every field has a fallback value, so an instance can be created from
    partial data. The fallbacks are derived from the mean values of the
    14 core fonts, rounded to 100.
    """

    # Descriptive information
    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    # Metric information
    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(
        default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0)
    )

    # Stream holding the font program, when one is available
    font_file: StreamObject | None = None

62 

63 

@dataclass(frozen=True)
class CoreFontMetrics:
    """Pairs a font descriptor with a mapping of characters to widths."""

    # Descriptive and metric information of the font
    font_descriptor: FontDescriptor
    # Width of each character, keyed by the character
    character_widths: dict[str, int]

68 

69 

@dataclass
class Font:
    """
    A font object for use during text extraction and for producing
    text appearance streams.

    Attributes:
        name: Font name, derived from font["/BaseFont"]
        character_map: The font's character map
        encoding: Font encoding
        sub_type: The font type, such as Type1, TrueType, or Type3.
        font_descriptor: Descriptive and metric information about the font
        character_widths: A mapping of characters to widths
        space_width: The width of a space, or an approximation
        interpretable: Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    name: str
    encoding: str | dict[int, str]
    character_map: dict[Any, Any] = field(default_factory=dict)
    sub_type: str = "Unknown"
    font_descriptor: FontDescriptor = field(default_factory=FontDescriptor)
    character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500})
    space_width: float | int = 250
    interpretable: bool = True

    @staticmethod
    def _collect_tt_t1_character_widths(
        pdf_font_dict: DictionaryObject,
        char_map: dict[Any, Any],
        encoding: str | dict[int, str],
        current_widths: dict[str, int]
    ) -> None:
        """
        Parse a TrueType or Type1 font's /Widths array and update character widths.

        Args:
            pdf_font_dict: The font dictionary; must contain "/Widths".
            char_map: Mapping from raw (decoded) characters to Unicode characters.
            encoding: Either a codec name or a mapping of character codes to characters.
            current_widths: Updated in place with the widths that were found.

        """
        widths_array = cast(ArrayObject, pdf_font_dict["/Widths"])
        first_char = pdf_font_dict.get("/FirstChar", 0)
        if not isinstance(encoding, str):
            # This means that encoding is a dict, mapping character codes to characters.
            # Codes missing from the mapping fall back to their plain code point.
            current_widths.update({
                encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(widths_array)
            })
            return

        # We map the character code directly to the character
        # using the string encoding
        for idx, width in enumerate(widths_array):
            # Often "idx == 0" will denote the .notdef character, but we add it anyway
            char_code = idx + first_char  # This is a raw code
            # Get the "raw" character or byte representation
            raw_char = bytes([char_code]).decode(encoding, "surrogatepass")
            # Translate raw_char to the REAL Unicode character using the char_map
            unicode_char = char_map.get(raw_char)
            if unicode_char:
                current_widths[unicode_char] = int(width)
            else:
                current_widths[raw_char] = int(width)

    @staticmethod
    def _collect_cid_character_widths(
        d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """
        Parse the /W array from a DescendantFont dictionary and update character widths.

        Malformed or truncated width definitions are reported through
        ``logger_warning`` and skipped instead of raising.

        Args:
            d_font: The descendant font dictionary; "/W" is optional.
            char_map: The character map. Only string keys are considered.
            current_widths: Updated in place with the widths that were found.

        """
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str)
        }
        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        skip_count = 0
        _w = d_font.get("/W", [])
        entry_count = len(_w)
        for idx, w_entry in enumerate(_w):
            w_entry = w_entry.get_object()
            if skip_count:
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):
                # We should never get here due to skip_count above. But
                # sometimes we do.
                logger_warning(f"Expected numeric value for width, got {w_entry}. Ignoring it.", __name__)
                continue

            # Look ahead without running past the end of the array: a truncated
            # definition must end up in the warning branch below rather than
            # raising an IndexError.
            w_next_entry = _w[idx + 1].get_object() if idx + 1 < entry_count else None

            # check for format (1): `int [int int int int ...]`
            if isinstance(w_next_entry, Sequence):
                start_idx, width_list = w_entry, w_next_entry
                current_widths.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in zip(
                            range(
                                cast(int, start_idx),
                                cast(int, start_idx) + len(width_list),
                                1,
                            ),
                            width_list,
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 1
                continue

            # check for format (2): `int int int`
            # The third element is only resolved if the second one is numeric.
            w_third_entry = None
            if isinstance(w_next_entry, (int, float)) and idx + 2 < entry_count:
                w_third_entry = _w[idx + 2].get_object()
            if isinstance(w_third_entry, (int, float)):
                start_idx, stop_idx, const_width = w_entry, w_next_entry, w_third_entry
                current_widths.update(
                    {
                        ord_map[_cidx]: const_width
                        for _cidx in range(
                            cast(int, start_idx), cast(int, stop_idx + 1), 1
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 2
            else:
                # This handles unexpected element types as well as the case of out of bounds
                # (reaching the end of the width definitions while expecting more elements).
                logger_warning(
                    f"Invalid font width definition. Last element: {w_entry}.",
                    __name__
                )

    @staticmethod
    def _add_default_width(current_widths: dict[str, int], flags: int) -> None:
        """
        Store a fallback width under the "default" key of ``current_widths``.

        An empty mapping gets 500. Otherwise, a non-zero space width is used
        (once for fixed pitch fonts, twice for all others). Without a usable
        space width, the average of all positive widths is taken, or 500 if
        there are none.

        Args:
            current_widths: The width mapping, updated in place.
            flags: The font flags, used to detect fixed pitch fonts.

        """
        if not current_widths:
            current_widths["default"] = 500
            return

        if " " in current_widths and current_widths[" "] != 0:
            # Setting default to once or twice the space width, depending on fixed pitch
            if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
                current_widths["default"] = current_widths[" "]
                return

            current_widths["default"] = int(2 * current_widths[" "])
            return

        # Use the average width of existing glyph widths
        valid_widths = [w for w in current_widths.values() if w > 0]
        current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500

    @staticmethod
    def _add_space_width(character_widths: dict[str, int], flags: int) -> int:
        """
        Return the width of a space, approximating it if it is missing or zero.

        The approximation is the default width for fixed pitch fonts and half
        the default width otherwise, so ``character_widths`` has to provide a
        "default" entry in these cases. The mapping itself is not modified.

        Args:
            character_widths: The width mapping to read from.
            flags: The font flags, used to detect fixed pitch fonts.

        Returns:
            The space width.

        """
        space_width = character_widths.get(" ", 0)
        if space_width != 0:
            return space_width

        if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
            return character_widths["default"]

        return character_widths["default"] // 2

    @staticmethod
    def _parse_font_descriptor(font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
        """
        Translate a PDF font descriptor dictionary into ``FontDescriptor`` keyword arguments.

        Keys missing from the dictionary are omitted from the result.

        Args:
            font_descriptor_obj: The font descriptor dictionary.

        Returns:
            The keyword arguments that could be collected.

        Raises:
            PdfReadError: If more than one font file entry is present.

        """
        font_descriptor_kwargs: dict[Any, Any] = {}
        for source_key, target_key in [
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox")
        ]:
            if source_key in font_descriptor_obj:
                font_descriptor_kwargs[target_key] = font_descriptor_obj[source_key]
        # Handle missing bbox gracefully - PDFs may have fonts without valid bounding boxes
        if "bbox" in font_descriptor_kwargs:
            bbox_tuple = tuple(map(float, font_descriptor_kwargs["bbox"]))
            assert len(bbox_tuple) == 4, bbox_tuple
            font_descriptor_kwargs["bbox"] = bbox_tuple

        # Find the binary stream for this font if there is one
        for source_key in ["/FontFile", "/FontFile2", "/FontFile3"]:
            if source_key in font_descriptor_obj:
                if "font_file" in font_descriptor_kwargs:
                    raise PdfReadError(f"More than one /FontFile found in {font_descriptor_obj}")

                try:
                    font_file = font_descriptor_obj[source_key].get_object()
                    font_descriptor_kwargs["font_file"] = font_file
                except PdfReadError as e:
                    # A font file which cannot be resolved is not fatal.
                    logger_warning(f"Failed to get {source_key!r} in {font_descriptor_obj}: {e}", __name__)
        return font_descriptor_kwargs

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
    ) -> Font:
        """
        Create a font from a PDF font resource dictionary.

        Args:
            pdf_font_dict: The font dictionary.

        Returns:
            The font, with widths and descriptor populated as far as the
            dictionary allows and defaults used for everything else.

        """
        from pypdf._codecs.core_font_metrics import CORE_FONT_METRICS  # noqa: PLC0415

        # Can collect base_font, name and encoding directly from font resource
        name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        sub_type = pdf_font_dict.get("/Subtype", "Unknown").removeprefix("/")
        encoding, character_map = get_encoding(pdf_font_dict)
        font_descriptor = None
        character_widths: dict[str, int] = {}
        interpretable = True

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
            # reliably converted into character codes unless all named chars
            # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
            # PDF 1.7 standard.
            if sub_type == "Type3" and "/ToUnicode" not in pdf_font_dict:
                interpretable = all(
                    cname in adobe_glyphs
                    for cname in pdf_font_dict.get("/CharProcs") or []
                )
            if interpretable:  # Save some overhead if font is not interpretable
                if "/Widths" in pdf_font_dict:
                    cls._collect_tt_t1_character_widths(
                        pdf_font_dict, character_map, encoding, character_widths
                    )
                elif name in CORE_FONT_METRICS:
                    font_descriptor = CORE_FONT_METRICS[name].font_descriptor
                    # Work on a copy: the "default" entry written further down is specific
                    # to this font and must not end up in the shared core font metrics.
                    character_widths = dict(CORE_FONT_METRICS[name].character_widths)
                if "/FontDescriptor" in pdf_font_dict:
                    font_descriptor_obj = pdf_font_dict.get("/FontDescriptor", DictionaryObject()).get_object()
                    if "/MissingWidth" in font_descriptor_obj:
                        character_widths["default"] = cast(int, font_descriptor_obj["/MissingWidth"].get_object())
                    font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))
                elif "/FontBBox" in pdf_font_dict:
                    # For Type3 without Font Descriptor but with FontBBox, see Table 110 in the PDF specification 2.0
                    bbox_tuple = tuple(map(float, cast(ArrayObject, pdf_font_dict["/FontBBox"])))
                    assert len(bbox_tuple) == 4, bbox_tuple
                    font_descriptor = FontDescriptor(name=name, bbox=bbox_tuple)

        else:
            # Composite font or CID font - CID fonts have a /W array mapping character codes
            # to widths stashed in /DescendantFonts. No need to test for /DescendantFonts though,
            # because all other fonts have already been dealt with.
            d_font: DictionaryObject
            for d_font_idx, d_font in enumerate(
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
            ):
                d_font = cast(DictionaryObject, d_font.get_object())
                # Store the resolved dictionary back into the array
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font
                cls._collect_cid_character_widths(
                    d_font, character_map, character_widths
                )
                if "/DW" in d_font:
                    character_widths["default"] = cast(int, d_font["/DW"].get_object())
                font_descriptor_obj = d_font.get("/FontDescriptor", DictionaryObject()).get_object()
                font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))

        if not font_descriptor:
            font_descriptor = FontDescriptor(name=name)

        # A missing or zero default width is replaced by an estimate
        if character_widths.get("default", 0) == 0:
            cls._add_default_width(character_widths, font_descriptor.flags)

        space_width = cls._add_space_width(character_widths, font_descriptor.flags)

        return cls(
            name=name,
            sub_type=sub_type,
            encoding=encoding,
            font_descriptor=font_descriptor,
            character_map=character_map,
            character_widths=character_widths,
            space_width=space_width,
            interpretable=interpretable
        )

    @staticmethod
    def _font_flags_from_truetype_font_tables(
        header: table__h_e_a_d,
        postscript: table__p_o_s_t | None,
        os2: table_O_S_2f_2 | None
    ) -> int:
        """
        Derive the PDF font flags from TrueType font tables.

        Args:
            header: The "head" table.
            postscript: The "post" table, if the font has one.
            os2: The "OS/2" table, if the font has one.

        Returns:
            The combined font flags.

        """
        # Get the font flags
        if os2:
            # Both values are only read further down when os2 is available.
            panose = os2.panose
            # sFamilyClass is a two-byte field. The high byte describes the family class, whereas the low
            # byte only describes the subclass. We only need the high byte, hence the bit shift below:
            family_class = os2.sFamilyClass >> 8
        flags: int = 0

        # ITALIC
        if header.macStyle & HEADER_MACSTYLE_ITALIC or (os2 and os2.fsSelection & OS2_FSSELECTION_ITALIC):
            flags |= FontFlags.ITALIC
        if postscript:
            italic_angle = postscript.italicAngle
            if italic_angle != 0.0:
                flags |= FontFlags.ITALIC

        # FIXED_PITCH
        if (
            (os2 and panose.bProportion == OS2_PANOSE_BPROPORTION_MONOSPACED) or
            (postscript and postscript.isFixedPitch > 0)  # Actually 1, but originally (older versions of the TTF
        ):                                                # specification) any non-zero value signified monospace.
            flags |= FontFlags.FIXED_PITCH

        # SCRIPT
        if os2 and (
            family_class == OS2_SFAMILYSCLASS_SCRIPTS or panose.bFamilyType == OS2_PANOSE_BFAMILYTYPE_SCRIPT
        ):
            flags |= FontFlags.SCRIPT

        # SERIF
        if os2 and (
            2 <= panose.bSerifStyle <= 10
            or 1 <= family_class <= 5 or family_class == 7  # 6 is reserved, all 8 and above are not serif
        ):
            flags |= FontFlags.SERIF

        # SYMBOLIC
        if os2 and (
            family_class == OS2_SFAMILYSCLASS_SYMBOLIC or
            panose.bFamilyType in {OS2_PANOSE_BFAMILYTYPE_DECORATIVE, OS2_PANOSE_BFAMILYTYPE_PICTORIAL}
        ):
            flags |= FontFlags.SYMBOLIC
        else:
            flags |= FontFlags.NONSYMBOLIC

        return flags

    @classmethod
    def from_truetype_font_file(cls, font_file: BytesIO) -> Font:
        """
        Create a font from an in-memory TrueType font file.

        Args:
            font_file: The font file data.

        Returns:
            The font, keyed by glyph ID, with all metrics scaled to 1000 units per em.

        Raises:
            ImportError: If the 'fontTools' library is not installed.
            PdfReadError: If the font file does not have a cmap table.

        """
        if not HAS_FONTTOOLS:
            raise ImportError("The 'fontTools' library is required to use 'from_truetype_font_file'")
        with TTFont(font_file) as tt_font_object:
            # See Chapter 6 of the TrueType reference manual for the definition of the head, OS/2 and post tables:
            # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6head.html
            # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6OS2.html
            # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6post.html
            header = tt_font_object["head"]
            horizontal_header = tt_font_object["hhea"]
            metrics = tt_font_object["hmtx"].metrics

            # Collect additional font tables to derive font information
            postscript = tt_font_object.get("post", None)
            os2 = tt_font_object.get("OS/2", None)

            # Get the scaling factor to convert font file's units per em to PDF's 1000 units per em
            units_per_em = header.unitsPerEm
            scale_factor = 1000.0 / units_per_em

            # Get the font descriptor
            font_descriptor_kwargs: dict[Any, Any] = {}
            names = tt_font_object.get("name", None)
            if names:
                font_descriptor_kwargs["name"] = names.getBestFullName()
                font_descriptor_kwargs["family"] = names.getBestFamilyName()
                font_descriptor_kwargs["weight"] = names.getBestSubFamilyName()
            font_descriptor_kwargs["ascent"] = int(round(horizontal_header.ascent * scale_factor, 0))
            font_descriptor_kwargs["descent"] = int(round(horizontal_header.descent * scale_factor, 0))
            if os2:
                try:
                    font_descriptor_kwargs["cap_height"] = int(round(os2.sCapHeight * scale_factor, 0))
                    font_descriptor_kwargs["x_height"] = int(round(os2.sxHeight * scale_factor, 0))
                except AttributeError:
                    # The table does not provide these heights; keep the FontDescriptor defaults.
                    pass

            font_descriptor_kwargs["flags"] = cls._font_flags_from_truetype_font_tables(header, postscript, os2)

            font_descriptor_kwargs["bbox"] = (
                round(header.xMin * scale_factor, 0),
                round(header.yMin * scale_factor, 0),
                round(header.xMax * scale_factor, 0),
                round(header.yMax * scale_factor, 0)
            )

            # Keep the raw font file data as a stream
            font_file_data = StreamObject()
            font_file_raw_bytes = font_file.getvalue()
            font_file_data.set_data(font_file_raw_bytes)
            font_file_data.update({NameObject("/Length1"): NumberObject(len(font_file_raw_bytes))})
            font_descriptor_kwargs["font_file"] = font_file_data

            font_descriptor = FontDescriptor(**font_descriptor_kwargs)
            encoding = "utf_16_be"  # Assume unicode

            character_widths: dict[str, int] = {}
            character_map: dict[str, str] = {}

            glyph_order = tt_font_object.getGlyphOrder()
            # Note that one glyph can be mapped to multiple unicode code points. However, buildReversedMin()
            # creates a dictionary mapping glyphs to the minimum Unicode codepoint.
            tt_font_cmap_table = tt_font_object.get("cmap")
            if tt_font_cmap_table:
                reverse_cmap = tt_font_cmap_table.buildReversedMin()
                for glyph in glyph_order:
                    char_code = reverse_cmap.get(glyph)
                    if char_code is None:
                        # Glyphs without a code point cannot be addressed by text.
                        continue
                    char = chr(char_code)
                    gid = tt_font_object.getGlyphID(glyph)
                    # The following is to comply with how font_glyph_byte_map works in _appearance_stream.py
                    gid_bytes = gid.to_bytes(2, "big")
                    gid_key_string = gid_bytes.decode("utf-16-be", "surrogatepass")
                    character_map[gid_key_string] = char
                    character_widths[gid_key_string] = int(round(metrics[glyph][0] * scale_factor, 0))
            else:
                raise PdfReadError("Font file does not have a cmap table")

            cls._add_default_width(character_widths, font_descriptor_kwargs["flags"])
            space_width = cls._add_space_width(character_widths, font_descriptor_kwargs["flags"])

            return cls(
                name=font_descriptor.name,
                sub_type="TrueType",
                encoding=encoding,
                font_descriptor=font_descriptor,
                character_map=character_map,
                character_widths=character_widths,
                space_width=space_width,
                interpretable=True
            )

    def as_font_resource(self) -> DictionaryObject:
        """
        Return a font resource dictionary for this font.

        Returns:
            A Type1 font dictionary with WinAnsiEncoding, named after this font.

        """
        # For now, this returns a font resource that only works with the 14 Adobe Core fonts.
        return (
            DictionaryObject({
                NameObject("/Subtype"): NameObject("/Type1"),
                NameObject("/Name"): NameObject(f"/{self.name}"),
                NameObject("/Type"): NameObject("/Font"),
                NameObject("/BaseFont"): NameObject(f"/{self.name}"),
                NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
            })
        )

    def text_width(self, text: str = "") -> float:
        """
        Sum of character widths specified in PDF font for the supplied text.

        Characters without a dedicated width use the "default" width.

        Args:
            text: The text to measure.

        Returns:
            The total width; 0.0 for empty text.

        """
        widths = self.character_widths
        return sum(
            (widths.get(char, widths["default"]) for char in text), 0.0
        )