Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_font.py: 33%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from collections.abc import Sequence
2from dataclasses import dataclass, field
3from typing import Any, Optional, Union, cast
5from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject
7from .errors import ParseError
@dataclass(frozen=True)
class FontDescriptor:
    """
    Represents the FontDescriptor dictionary as defined in the PDF specification.
    This contains both descriptive and metric information.

    The defaults are derived from the mean values of the 14 core fonts, rounded
    to 100.
    """

    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))

    # Maps a character (or the special key "default") to its width.
    character_widths: dict[str, int] = field(default_factory=dict)

    @staticmethod
    def _parse_font_descriptor(font_kwargs: dict[str, Any], font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
        """
        Copy the known entries of a /FontDescriptor dictionary into ``font_kwargs``.

        Args:
            font_kwargs: Constructor keyword arguments collected so far; updated
                in place.
            font_descriptor_obj: The font descriptor dictionary, or an indirect
                reference to it.

        Returns:
            The same ``font_kwargs`` dictionary. When a bounding box is present
            it is normalised to a tuple of four floats.

        Raises:
            ParseError: If the bounding box does not have exactly four entries.
        """
        font_descriptor_dict: DictionaryObject = (
            font_descriptor_obj.get_object()
            if isinstance(font_descriptor_obj, IndirectObject)
            else font_descriptor_obj
        )
        for source_key, target_key in [
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox"),
        ]:
            if source_key in font_descriptor_dict:
                font_kwargs[target_key] = font_descriptor_dict[source_key]
        # /FontBBox is nominally a required key, but from_font_resource passes
        # an empty dictionary when a font has no descriptor, and malformed
        # files may omit the key. Leave "bbox" unset in that case so that the
        # dataclass default is used instead of failing with a KeyError.
        if "bbox" in font_kwargs:
            bbox_tuple = tuple(map(float, font_kwargs["bbox"]))
            # Explicit check instead of `assert`, which is removed under -O.
            if len(bbox_tuple) != 4:
                raise ParseError(f"Invalid font bounding box: {bbox_tuple}")
            font_kwargs["bbox"] = bbox_tuple
        return font_kwargs

    @staticmethod
    def _collect_cid_character_widths(
        d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """
        Parses the /W array from a DescendantFont dictionary and updates character widths.

        Args:
            d_font: The descendant font dictionary; its "/W" entry is read.
            char_map: Character map; only its string keys are used, keyed here
                by their code point.
            current_widths: Width table that is updated in place.

        Raises:
            ParseError: If a start index is followed by something that is
                neither a width list nor a ``stop width`` pair.
            IndexError: If the array ends while more elements are expected.
        """
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str)
        }
        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        skip_count = 0
        _w = d_font.get("/W", [])
        for idx, w_entry in enumerate(_w):
            w_entry = w_entry.get_object()
            if skip_count:
                # This element was already consumed as part of an earlier definition.
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):  # pragma: no cover
                # We should never get here due to skip_count above. Add a
                # warning and or use reader's "strict" to force an ex???
                continue
            # check for format (1): `int [int int int int ...]`
            w_next_entry = _w[idx + 1].get_object()
            if isinstance(w_next_entry, Sequence):
                # int() rather than typing.cast(): cast() is a no-op at runtime,
                # so an index written as a real number (e.g. `45.0`) would
                # otherwise reach range()/enumerate() as a float and fail.
                for _cidx, _width in enumerate(w_next_entry, start=int(w_entry)):
                    if _cidx in ord_map:
                        current_widths[ord_map[_cidx]] = _width
                skip_count = 1
            # check for format (2): `int int int`
            elif isinstance(w_next_entry, (int, float)) and isinstance(
                _w[idx + 2].get_object(), (int, float)
            ):
                const_width = _w[idx + 2].get_object()
                # The stop index is inclusive, hence the + 1.
                for _cidx in range(int(w_entry), int(w_next_entry) + 1):
                    if _cidx in ord_map:
                        current_widths[ord_map[_cidx]] = const_width
                skip_count = 2
            else:
                # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
                # while expecting more elements). This raises an IndexError which is sufficient.
                raise ParseError(
                    f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
                )  # pragma: no cover

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
        encoding: Optional[Union[str, dict[int, str]]] = None,
        char_map: Optional[dict[Any, Any]] = None
    ) -> "FontDescriptor":
        """
        Build a FontDescriptor from a PDF font resource dictionary.

        The lookup order is:

        1. Type1, MMType1, TrueType and Type3 fonts that carry a
           /FontDescriptor: widths come from /Widths (when the encoding is a
           dictionary), metrics from the descriptor.
        2. Fonts whose /BaseFont name is a key of ``CORE_FONT_METRICS``: that
           entry is returned as is.
        3. Fonts with /DescendantFonts: widths come from each descendant's /W
           array, metrics from each descendant's /FontDescriptor.
        4. Otherwise a descriptor with the default values.

        Args:
            pdf_font_dict: The font resource dictionary. Entries of its
                /DescendantFonts array are replaced in place by their
                resolved objects.
            encoding: Encoding of the font; looked up with ``get_encoding``
                unless both ``encoding`` and ``char_map`` are supplied.
            char_map: Character map of the font; see ``encoding``.

        Returns:
            The font descriptor for the given font resource.
        """
        from pypdf._cmap import get_encoding  # noqa: PLC0415
        from pypdf._codecs.core_fontmetrics import CORE_FONT_METRICS  # noqa: PLC0415

        # Prioritize information from the PDF font dictionary
        font_name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        font_kwargs: dict[str, Any] = {"character_widths": {}}

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            if "/FontDescriptor" in pdf_font_dict:
                # Collect character widths - TrueType and Type1 fonts
                # have a /Widths array mapping character codes to widths
                if not (encoding and char_map):
                    encoding, char_map = get_encoding(pdf_font_dict)
                if isinstance(encoding, dict) and "/Widths" in pdf_font_dict:
                    # /Widths is indexed relative to /FirstChar.
                    first_char = pdf_font_dict.get("/FirstChar", 0)
                    font_kwargs["character_widths"] = {
                        encoding.get(idx + first_char, chr(idx + first_char)): width
                        for idx, width in enumerate(cast(ArrayObject, pdf_font_dict["/Widths"]))
                    }
                # Collect font descriptor
                font_kwargs = cls._parse_font_descriptor(
                    font_kwargs, pdf_font_dict.get("/FontDescriptor", DictionaryObject())
                )
                return cls(**font_kwargs)

            if font_name in CORE_FONT_METRICS:
                return CORE_FONT_METRICS[font_name]

        # Composite font or CID font
        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts
        if "/DescendantFonts" in pdf_font_dict:
            if not (encoding and char_map):
                encoding, char_map = get_encoding(pdf_font_dict)
            d_font: DictionaryObject
            for d_font_idx, d_font in enumerate(
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
            ):
                d_font = cast(DictionaryObject, d_font.get_object())
                # Store the resolved object back into the array.
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font
                # Collect character widths
                cls._collect_cid_character_widths(
                    d_font, char_map, font_kwargs["character_widths"]
                )
                # Collect font descriptor
                font_kwargs = cls._parse_font_descriptor(
                    font_kwargs, d_font.get("/FontDescriptor", DictionaryObject())
                )

        return cls(**font_kwargs)

    def text_width(self, text: str) -> float:
        """
        Sum of character widths specified in PDF font for the supplied text.

        Characters without an entry in ``character_widths`` use the width
        stored under the key "default", or 0 if there is none.
        """
        widths = self.character_widths
        # The fallback is the same for every character, so look it up once.
        fallback_width = widths.get("default", 0)
        return sum((widths.get(char, fallback_width) for char in text), 0.0)