Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_font.py: 27%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from __future__ import annotations
3from collections.abc import Sequence
4from dataclasses import dataclass, field
5from typing import TYPE_CHECKING, Any, cast
7from pypdf.generic import ArrayObject, DictionaryObject, NameObject, NumberObject, StreamObject
9from ._cmap import get_encoding
10from ._codecs.adobe_glyphs import adobe_glyphs
11from ._utils import logger_warning
12from .constants import FontFlags
13from .errors import PdfReadError
15if TYPE_CHECKING:
16 from io import BytesIO
18 from fontTools.ttLib.tables._h_e_a_d import table__h_e_a_d
19 from fontTools.ttLib.tables._p_o_s_t import table__p_o_s_t
20 from fontTools.ttLib.tables.O_S_2f_2 import table_O_S_2f_2
22try:
23 from fontTools.ttLib import TTFont
24 HAS_FONTTOOLS = True
25except ImportError:
26 HAS_FONTTOOLS = False
# Some constants from truetype font tables that we use.
# They are consumed by Font._font_flags_from_truetype_font_tables to derive
# the PDF font flags from the "head" and "OS/2" tables.
HEADER_MACSTYLE_ITALIC = 0x02  # Bit mask tested against the "head" table's macStyle field
OS2_FSSELECTION_ITALIC = 0x01  # Bit mask tested against the "OS/2" table's fsSelection field
OS2_PANOSE_BFAMILYTYPE_SCRIPT = 3  # Compared with panose.bFamilyType (marks the font as script)
OS2_PANOSE_BFAMILYTYPE_DECORATIVE = 4  # Compared with panose.bFamilyType (marks the font as symbolic)
OS2_PANOSE_BFAMILYTYPE_PICTORIAL = 5  # Compared with panose.bFamilyType (marks the font as symbolic)
OS2_PANOSE_BPROPORTION_MONOSPACED = 9  # Compared with panose.bProportion (marks the font as fixed pitch)
OS2_SFAMILYSCLASS_SCRIPTS = 10  # Compared with the high byte of sFamilyClass (marks the font as script)
OS2_SFAMILYSCLASS_SYMBOLIC = 12  # Compared with the high byte of sFamilyClass (marks the font as symbolic)
@dataclass(frozen=True)
class FontDescriptor:
    """
    Represents the FontDescriptor dictionary as defined in the PDF specification.
    This contains both descriptive and metric information.

    The defaults are derived from the mean values of the 14 core fonts, rounded
    to 100.

    The dataclass is frozen, so the fields cannot be reassigned after creation.
    """

    # Descriptive information; Font._parse_font_descriptor fills these from
    # /FontName, /FontFamily and /FontWeight.
    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    # Metric information; Font._parse_font_descriptor fills these from /Ascent,
    # /Descent, /CapHeight, /XHeight, /ItalicAngle, /Flags and /FontBBox.
    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))
    # The font's binary stream (/FontFile, /FontFile2 or /FontFile3), if there is one
    font_file: StreamObject | None = None
@dataclass(frozen=True)
class CoreFontMetrics:
    """
    Bundles a font descriptor with a mapping of characters to widths.

    Font.from_font_resource reads both attributes from the entries of
    CORE_FONT_METRICS when a font resource has no /Widths array but its name
    is a key of that table.
    """

    font_descriptor: FontDescriptor  # Descriptive and metric information of the font
    character_widths: dict[str, int]  # Mapping of characters to widths
@dataclass
class Font:
    """
    A font object for use during text extraction and for producing
    text appearance streams.

    Attributes:
        name: Font name, derived from font["/BaseFont"]
        encoding: Font encoding, either the name of a codec or a mapping of
            character codes to characters
        character_map: The font's character map
        sub_type: The font type, such as Type1, TrueType, or Type3.
        font_descriptor: Descriptive and metric information about the font
        character_widths: A mapping of characters to widths. The key
            "default" holds the width used for characters without an entry.
        space_width: The width of a space, or an approximation
        interpretable: Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    name: str
    encoding: str | dict[int, str]
    character_map: dict[Any, Any] = field(default_factory=dict)
    sub_type: str = "Unknown"
    font_descriptor: FontDescriptor = field(default_factory=FontDescriptor)
    character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500})
    space_width: float | int = 250
    interpretable: bool = True

    @staticmethod
    def _collect_tt_t1_character_widths(
        pdf_font_dict: DictionaryObject,
        char_map: dict[Any, Any],
        encoding: str | dict[int, str],
        current_widths: dict[str, int]
    ) -> None:
        """
        Parses a TrueType or Type1 font's /Widths array from a font dictionary
        and updates character widths.

        Args:
            pdf_font_dict: The font dictionary; must contain "/Widths".
                "/FirstChar" is optional and defaults to 0.
            char_map: Mapping of raw characters to the characters they represent.
            encoding: Either a codec name or a mapping of character codes
                to characters.
            current_widths: Mapping of characters to widths; updated in place.

        """
        widths_array = cast(ArrayObject, pdf_font_dict["/Widths"])
        first_char = pdf_font_dict.get("/FirstChar", 0)
        if not isinstance(encoding, str):
            # This means that encoding is a dict. Codes without an entry fall
            # back to the character with the same code point.
            current_widths.update({
                encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(widths_array)
            })
            return

        # We map the character code directly to the character
        # using the string encoding
        for idx, width in enumerate(widths_array):
            # Often "idx == 0" will denote the .notdef character, but we add it anyway
            char_code = idx + first_char  # This is a raw code
            # Get the "raw" character or byte representation
            # NOTE(review): bytes([char_code]) raises ValueError for codes outside 0..255.
            raw_char = bytes([char_code]).decode(encoding, "surrogatepass")
            # Translate raw_char to the REAL Unicode character using the char_map
            unicode_char = char_map.get(raw_char)
            if unicode_char:
                current_widths[unicode_char] = int(width)
            else:
                current_widths[raw_char] = int(width)

    @staticmethod
    def _collect_cid_character_widths(
        d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """
        Parses the /W array from a DescendantFont dictionary and updates character widths.

        A /W array that ends in the middle of a definition is reported with a
        warning; the incomplete trailing definition is ignored.

        Args:
            d_font: The descendant font dictionary. A missing "/W" entry is
                treated as an empty array.
            char_map: Character map; only entries with string keys are used.
            current_widths: Mapping of characters to widths; updated in place.

        """
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str)
        }
        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        skip_count = 0
        _w = d_font.get("/W", [])
        entry_count = len(_w)
        for idx, w_entry in enumerate(_w):
            w_entry = w_entry.get_object()
            if skip_count:
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):
                # We should never get here due to skip_count above. But
                # sometimes we do.
                logger_warning(f"Expected numeric value for width, got {w_entry}. Ignoring it.", __name__)
                continue
            # Only look ahead while still inside the array. A missing follow-up
            # entry yields None, which matches neither format and therefore
            # ends up in the warning branch below instead of raising IndexError.
            w_next_entry = _w[idx + 1].get_object() if idx + 1 < entry_count else None
            # check for format (1): `int [int int int int ...]`
            if isinstance(w_next_entry, Sequence):
                start_idx, width_list = w_entry, w_next_entry
                current_widths.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in zip(
                            range(
                                cast(int, start_idx),
                                cast(int, start_idx) + len(width_list),
                                1,
                            ),
                            width_list,
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 1
            # check for format (2): `int int int`
            elif (
                isinstance(w_next_entry, (int, float))
                and idx + 2 < entry_count
                and isinstance(_w[idx + 2].get_object(), (int, float))
            ):
                start_idx, stop_idx, const_width = (
                    w_entry,
                    w_next_entry,
                    _w[idx + 2].get_object(),
                )
                current_widths.update(
                    {
                        ord_map[_cidx]: const_width
                        for _cidx in range(
                            cast(int, start_idx), cast(int, stop_idx + 1), 1
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 2
            else:
                # This handles the case of out of bounds (reaching the end of the width definitions
                # while expecting more elements).
                logger_warning(
                    f"Invalid font width definition. Last element: {w_entry}.",
                    __name__
                )

    @staticmethod
    def _add_default_width(current_widths: dict[str, int], flags: int) -> None:
        """
        Sets the "default" entry of the supplied widths in place.

        The value is 500 for an empty mapping. Otherwise, a non-zero space
        width is used once (fixed pitch) or doubled. Without a usable space
        width, the integer average of all positive widths is used, or 500 if
        there are none.

        Args:
            current_widths: Mapping of characters to widths; updated in place.
            flags: The font flags; only the fixed pitch flag is evaluated.

        """
        if not current_widths:
            current_widths["default"] = 500
            return

        if " " in current_widths and current_widths[" "] != 0:
            # Setting default to once or twice the space width, depending on fixed pitch
            if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
                current_widths["default"] = current_widths[" "]
                return

            current_widths["default"] = int(2 * current_widths[" "])
            return

        # Use the average width of existing glyph widths
        valid_widths = [w for w in current_widths.values() if w > 0]
        current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500

    @staticmethod
    def _add_space_width(character_widths: dict[str, int], flags: int) -> int:
        """
        Returns the width of a space for the supplied widths.

        A non-zero width stored for " " is returned as is. Otherwise the value
        is derived from the "default" entry: the full default width for fixed
        pitch fonts and half of it for all others.

        Args:
            character_widths: Mapping of characters to widths. The "default"
                key must exist if there is no non-zero width for " ".
            flags: The font flags; only the fixed pitch flag is evaluated.

        Returns:
            The width of a space, or an approximation.

        """
        space_width = character_widths.get(" ", 0)
        if space_width != 0:
            return space_width

        if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
            return character_widths["default"]

        return character_widths["default"] // 2

    @staticmethod
    def _parse_font_descriptor(font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
        """
        Translates a /FontDescriptor dictionary into keyword arguments for FontDescriptor.

        Only the keys present in the dictionary are returned, so FontDescriptor
        applies its own defaults for everything else.

        Args:
            font_descriptor_obj: The /FontDescriptor dictionary of a font.

        Returns:
            Keyword arguments for creating a FontDescriptor.

        Raises:
            PdfReadError: If more than one of /FontFile, /FontFile2 and
                /FontFile3 could be retrieved.

        """
        font_descriptor_kwargs: dict[Any, Any] = {}
        for source_key, target_key in [
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox")
        ]:
            if source_key in font_descriptor_obj:
                font_descriptor_kwargs[target_key] = font_descriptor_obj[source_key]
        # Handle missing bbox gracefully - PDFs may have fonts without valid bounding boxes
        if "bbox" in font_descriptor_kwargs:
            bbox_tuple = tuple(map(float, font_descriptor_kwargs["bbox"]))
            assert len(bbox_tuple) == 4, bbox_tuple
            font_descriptor_kwargs["bbox"] = bbox_tuple

        # Find the binary stream for this font if there is one
        for source_key in ["/FontFile", "/FontFile2", "/FontFile3"]:
            if source_key in font_descriptor_obj:
                if "font_file" in font_descriptor_kwargs:
                    raise PdfReadError(f"More than one /FontFile found in {font_descriptor_obj}")

                try:
                    font_file = font_descriptor_obj[source_key].get_object()
                    font_descriptor_kwargs["font_file"] = font_file
                except PdfReadError as e:
                    # A font file which cannot be read is not fatal; continue without it.
                    logger_warning(f"Failed to get {source_key!r} in {font_descriptor_obj}: {e}", __name__)
        return font_descriptor_kwargs

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
    ) -> Font:
        """
        Creates a Font from a PDF font resource dictionary.

        Type1, MMType1, TrueType and Type3 fonts take their widths from
        /Widths or, if missing, from the core font metrics. All other fonts
        are treated as composite fonts with widths in /DescendantFonts.

        Args:
            pdf_font_dict: The font resource dictionary.

        Returns:
            The font with its encoding, character map, descriptor and widths.

        """
        from pypdf._codecs.core_font_metrics import CORE_FONT_METRICS  # noqa: PLC0415

        # Can collect base_font, name and encoding directly from font resource
        name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        sub_type = pdf_font_dict.get("/Subtype", "Unknown").removeprefix("/")
        encoding, character_map = get_encoding(pdf_font_dict)
        font_descriptor = None
        character_widths: dict[str, int] = {}
        interpretable = True

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
            # reliably converted into character codes unless all named chars
            # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
            # PDF 1.7 standard.
            if sub_type == "Type3" and "/ToUnicode" not in pdf_font_dict:
                interpretable = all(
                    cname in adobe_glyphs
                    for cname in pdf_font_dict.get("/CharProcs") or []
                )
            if interpretable:  # Save some overhead if font is not interpretable
                if "/Widths" in pdf_font_dict:
                    cls._collect_tt_t1_character_widths(
                        pdf_font_dict, character_map, encoding, character_widths
                    )
                elif name in CORE_FONT_METRICS:
                    font_descriptor = CORE_FONT_METRICS[name].font_descriptor
                    # Work on a copy: the "default" entry is written below, which
                    # must not leak into the shared core font metrics.
                    character_widths = dict(CORE_FONT_METRICS[name].character_widths)
                if "/FontDescriptor" in pdf_font_dict:
                    font_descriptor_obj = pdf_font_dict.get("/FontDescriptor", DictionaryObject()).get_object()
                    if "/MissingWidth" in font_descriptor_obj:
                        character_widths["default"] = cast(int, font_descriptor_obj["/MissingWidth"].get_object())
                    font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))
                elif "/FontBBox" in pdf_font_dict:
                    # For Type3 without Font Descriptor but with FontBBox, see Table 110 in the PDF specification 2.0
                    bbox_tuple = tuple(map(float, cast(ArrayObject, pdf_font_dict["/FontBBox"])))
                    assert len(bbox_tuple) == 4, bbox_tuple
                    font_descriptor = FontDescriptor(name=name, bbox=bbox_tuple)

        else:
            # Composite font or CID font - CID fonts have a /W array mapping character codes
            # to widths stashed in /DescendantFonts. No need to test for /DescendantFonts though,
            # because all other fonts have already been dealt with.
            d_font: DictionaryObject
            for d_font_idx, d_font in enumerate(
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
            ):
                d_font = cast(DictionaryObject, d_font.get_object())
                # Store the resolved dictionary back into the array
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font
                cls._collect_cid_character_widths(
                    d_font, character_map, character_widths
                )
                if "/DW" in d_font:
                    character_widths["default"] = cast(int, d_font["/DW"].get_object())
                font_descriptor_obj = d_font.get("/FontDescriptor", DictionaryObject()).get_object()
                font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))

        if not font_descriptor:
            font_descriptor = FontDescriptor(name=name)

        # A missing or zero default width is replaced by a derived one
        if character_widths.get("default", 0) == 0:
            cls._add_default_width(character_widths, font_descriptor.flags)

        space_width = cls._add_space_width(character_widths, font_descriptor.flags)

        return cls(
            name=name,
            sub_type=sub_type,
            encoding=encoding,
            font_descriptor=font_descriptor,
            character_map=character_map,
            character_widths=character_widths,
            space_width=space_width,
            interpretable=interpretable
        )

    @staticmethod
    def _font_flags_from_truetype_font_tables(
        header: table__h_e_a_d,
        postscript: table__p_o_s_t,
        os2: table_O_S_2f_2
    ) -> int:
        """
        Derives the PDF font flags from TrueType font tables.

        Args:
            header: The "head" table.
            postscript: The "post" table; may be falsy if the font has none.
            os2: The "OS/2" table; may be falsy if the font has none.

        Returns:
            The combined font flags. Either the symbolic or the nonsymbolic
            flag is always set.

        """
        # Get the font flags
        if os2:
            panose = os2.panose
            # sFamilyClass is a two-byte field. The high byte describes the family class, whereas the low
            # byte only describes the subclass. We only need the high byte, hence the bit shift below:
            family_class = os2.sFamilyClass >> 8
        flags: int = 0

        # ITALIC
        if header.macStyle & HEADER_MACSTYLE_ITALIC or (os2 and os2.fsSelection & OS2_FSSELECTION_ITALIC):
            flags |= FontFlags.ITALIC
        if postscript:
            italic_angle = postscript.italicAngle
            if italic_angle != 0.0:
                flags |= FontFlags.ITALIC

        # FIXED_PITCH
        if (
            (os2 and panose.bProportion == OS2_PANOSE_BPROPORTION_MONOSPACED) or
            (postscript and postscript.isFixedPitch > 0)  # Actually 1, but originally (older versions of the TTF
        ):                                                # specification) any non-zero value signified monospace.
            flags |= FontFlags.FIXED_PITCH

        # SCRIPT
        if os2 and (
            family_class == OS2_SFAMILYSCLASS_SCRIPTS or panose.bFamilyType == OS2_PANOSE_BFAMILYTYPE_SCRIPT
        ):
            flags |= FontFlags.SCRIPT

        # SERIF
        if os2 and (
            2 <= panose.bSerifStyle <= 10
            or 1 <= family_class <= 5 or family_class == 7  # 6 is reserved, all 8 and above are not serif
        ):
            flags |= FontFlags.SERIF

        # SYMBOLIC
        if os2 and (
            family_class == OS2_SFAMILYSCLASS_SYMBOLIC or
            panose.bFamilyType in {OS2_PANOSE_BFAMILYTYPE_DECORATIVE, OS2_PANOSE_BFAMILYTYPE_PICTORIAL}
        ):
            flags |= FontFlags.SYMBOLIC
        else:
            flags |= FontFlags.NONSYMBOLIC

        return flags

    @classmethod
    def from_truetype_font_file(cls, font_file: BytesIO) -> Font:
        """
        Creates a Font from the binary data of a TrueType font file.

        All metrics are scaled from the font's units per em to the 1000 units
        per em used by PDF. The complete file is attached to the font
        descriptor as its font file stream.

        Args:
            font_file: The TrueType font file.

        Returns:
            The font, using the "utf_16_be" encoding and glyph IDs as keys
            of the character map and the character widths.

        Raises:
            ImportError: If fontTools is not available.
            PdfReadError: If the font file does not have a cmap table.

        """
        if not HAS_FONTTOOLS:
            raise ImportError("The 'fontTools' library is required to use 'from_truetype_font_file'")
        with TTFont(font_file) as tt_font_object:
            # See Chapter 6 of the TrueType reference manual for the definition of the head, OS/2 and post tables:
            # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6head.html
            # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6OS2.html
            # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6post.html
            header = tt_font_object["head"]
            horizontal_header = tt_font_object["hhea"]
            metrics = tt_font_object["hmtx"].metrics

            # Collect additional font tables to derive font information
            postscript = tt_font_object.get("post", None)
            os2 = tt_font_object.get("OS/2", None)

            # Get the scaling factor to convert font file's units per em to PDF's 1000 units per em
            units_per_em = header.unitsPerEm
            scale_factor = 1000.0 / units_per_em

            # Get the font descriptor
            font_descriptor_kwargs: dict[Any, Any] = {}
            names = tt_font_object.get("name", None)
            if names:
                font_descriptor_kwargs["name"] = names.getBestFullName()
                font_descriptor_kwargs["family"] = names.getBestFamilyName()
                font_descriptor_kwargs["weight"] = names.getBestSubFamilyName()
            font_descriptor_kwargs["ascent"] = int(round(horizontal_header.ascent * scale_factor, 0))
            font_descriptor_kwargs["descent"] = int(round(horizontal_header.descent * scale_factor, 0))
            if os2:
                try:
                    font_descriptor_kwargs["cap_height"] = int(round(os2.sCapHeight * scale_factor, 0))
                    font_descriptor_kwargs["x_height"] = int(round(os2.sxHeight * scale_factor, 0))
                except AttributeError:
                    # The table does not provide these fields; keep the FontDescriptor defaults.
                    pass

            font_descriptor_kwargs["flags"] = cls._font_flags_from_truetype_font_tables(header, postscript, os2)

            font_descriptor_kwargs["bbox"] = (
                round(header.xMin * scale_factor, 0),
                round(header.yMin * scale_factor, 0),
                round(header.xMax * scale_factor, 0),
                round(header.yMax * scale_factor, 0)
            )

            font_file_data = StreamObject()
            font_file_raw_bytes = font_file.getvalue()
            font_file_data.set_data(font_file_raw_bytes)
            font_file_data.update({NameObject("/Length1"): NumberObject(len(font_file_raw_bytes))})
            font_descriptor_kwargs["font_file"] = font_file_data

            font_descriptor = FontDescriptor(**font_descriptor_kwargs)
            encoding = "utf_16_be"  # Assume unicode

            character_widths: dict[str, int] = {}
            character_map: dict[str, str] = {}

            glyph_order = tt_font_object.getGlyphOrder()
            # Note that one glyph can be mapped to multiple unicode code points. However, buildReversedMin()
            # creates a dictionary mapping glyphs to the minimum Unicode codepoint.
            tt_font_cmap_table = tt_font_object.get("cmap")
            if tt_font_cmap_table:
                reverse_cmap = tt_font_cmap_table.buildReversedMin()
                for glyph in glyph_order:
                    char_code = reverse_cmap.get(glyph)
                    if char_code is None:
                        # Glyphs without a code point cannot be addressed by text
                        continue
                    char = chr(char_code)
                    gid = tt_font_object.getGlyphID(glyph)
                    # The following is to comply with how font_glyph_byte_map works in _appearance_stream.py
                    gid_bytes = gid.to_bytes(2, "big")
                    gid_key_string = gid_bytes.decode("utf-16-be", "surrogatepass")
                    character_map[gid_key_string] = char
                    character_widths[gid_key_string] = int(round(metrics[glyph][0] * scale_factor, 0))
            else:
                raise PdfReadError("Font file does not have a cmap table")

            cls._add_default_width(character_widths, font_descriptor_kwargs["flags"])
            space_width = cls._add_space_width(character_widths, font_descriptor_kwargs["flags"])

            return cls(
                name=font_descriptor.name,
                sub_type="TrueType",
                encoding=encoding,
                font_descriptor=font_descriptor,
                character_map=character_map,
                character_widths=character_widths,
                space_width=space_width,
                interpretable=True
            )

    def as_font_resource(self) -> DictionaryObject:
        """
        Returns a font resource dictionary for this font.

        The result is always a Type1 font with WinAnsiEncoding, with the
        font's name used for both /Name and /BaseFont.
        """
        # For now, this returns a font resource that only works with the 14 Adobe Core fonts.
        return (
            DictionaryObject({
                NameObject("/Subtype"): NameObject("/Type1"),
                NameObject("/Name"): NameObject(f"/{self.name}"),
                NameObject("/Type"): NameObject("/Font"),
                NameObject("/BaseFont"): NameObject(f"/{self.name}"),
                NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
            })
        )

    def text_width(self, text: str = "") -> float:
        """
        Sum of character widths specified in PDF font for the supplied text.

        Characters without a width of their own use the "default" width.
        """
        return sum(
            (self.character_widths.get(char, self.character_widths["default"]) for char in text), 0.0
        )