Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_font.py: 32%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from collections.abc import Sequence
2from dataclasses import dataclass, field
3from typing import Any, Union, cast
5from pypdf.generic import ArrayObject, DictionaryObject, NameObject
7from ._cmap import get_encoding
8from ._codecs.adobe_glyphs import adobe_glyphs
9from ._utils import logger_warning
10from .constants import FontFlags
@dataclass(frozen=True)
class FontDescriptor:
    """
    Immutable representation of a PDF FontDescriptor dictionary.

    Bundles the descriptive entries (name, family, weight) with the metric
    entries (ascent, descent, heights, italic angle, flags, bounding box).

    Every field has a default so a descriptor can be created from a partial
    dictionary. The defaults are derived from the mean values of the 14 core
    fonts, rounded to 100.
    """

    # Descriptive entries
    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    # Metric entries
    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Upright, i.e. non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(
        default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0)
    )
@dataclass(frozen=True)
class CoreFontMetrics:
    """
    Immutable pairing of a font descriptor with a table of character widths.

    Attributes:
        font_descriptor: The descriptive and metric information of the font.
        character_widths: Mapping of a character to its width.
    """

    font_descriptor: FontDescriptor
    character_widths: dict[str, int]
@dataclass
class Font:
    """
    A font object for use during text extraction and for producing
    text appearance streams.

    Attributes:
        name: Font name, derived from font["/BaseFont"]
        encoding: Font encoding, either a codec name or a mapping of
            character codes to characters
        character_map: The font's character map
        sub_type: The font type, such as Type1, TrueType, or Type3.
        font_descriptor: Descriptive and metric information about the font
        character_widths: A mapping of characters to widths. The special key
            "default" holds the width used for characters without an entry.
        space_width: The width of a space, or an approximation
        interpretable: Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    name: str
    encoding: Union[str, dict[int, str]]
    character_map: dict[Any, Any] = field(default_factory=dict)
    sub_type: str = "Unknown"
    font_descriptor: FontDescriptor = field(default_factory=FontDescriptor)
    character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500})
    space_width: Union[float, int] = 250
    interpretable: bool = True

    @staticmethod
    def _collect_tt_t1_character_widths(
        pdf_font_dict: DictionaryObject,
        char_map: dict[Any, Any],
        encoding: Union[str, dict[int, str]],
        current_widths: dict[str, int]
    ) -> None:
        """
        Parse a TrueType or Type1 font's /Widths array and update character widths.

        Args:
            pdf_font_dict: The font dictionary; must contain "/Widths".
            char_map: Mapping of raw (decoded) characters to Unicode characters.
            encoding: Either a codec name used to decode single-byte character
                codes, or a mapping of character codes to characters.
            current_widths: Mapping of characters to widths; updated in place.

        """
        widths_array = cast(ArrayObject, pdf_font_dict["/Widths"])
        first_char = pdf_font_dict.get("/FirstChar", 0)
        if not isinstance(encoding, str):
            # This means that encoding is a dict: look the character up by its
            # code and fall back to the code point itself.
            current_widths.update({
                encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(widths_array)
            })
            return

        # We map the character code directly to the character
        # using the string encoding
        skipped = 0
        for idx, width in enumerate(widths_array):
            # Often "idx == 0" will denote the .notdef character, but we add it anyway
            char_code = idx + first_char  # This is a raw code
            if not 0 <= char_code <= 255:
                # bytes([char_code]) only accepts single-byte values. A broken
                # /FirstChar or an over-long /Widths array must not abort the
                # whole font.
                skipped += 1
                continue
            # Get the "raw" character or byte representation
            raw_char = bytes([char_code]).decode(encoding, "surrogatepass")
            # Translate raw_char to the REAL Unicode character using the char_map;
            # keep the raw character when no (non-empty) translation exists.
            unicode_char = char_map.get(raw_char)
            current_widths[unicode_char or raw_char] = int(width)
        if skipped:
            logger_warning(
                f"Ignoring {skipped} width(s) for character codes outside of the range 0-255.",
                __name__
            )

    @staticmethod
    def _collect_cid_character_widths(
        d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """
        Parse the /W array from a DescendantFont dictionary and update character widths.

        Args:
            d_font: The descendant font dictionary; "/W" is optional.
            char_map: Mapping of raw characters to Unicode characters. Only
                single-character string keys are considered.
            current_widths: Mapping of characters to widths; updated in place.

        """
        # ord() only accepts single characters, so any other key is ignored
        # instead of raising a TypeError.
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str) and len(_target) == 1
        }
        _w = d_font.get("/W", [])

        def entry_at(position: int) -> Any:
            """Return the resolved /W entry at position, or None past the end."""
            try:
                return _w[position].get_object()
            except IndexError:
                return None

        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        skip_count = 0
        for idx, w_entry in enumerate(_w):
            w_entry = w_entry.get_object()
            if skip_count:
                # This entry has been consumed by a previous definition
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):
                # We should never get here due to skip_count above. But
                # sometimes we do.
                logger_warning(f"Expected numeric value for width, got {w_entry}. Ignoring it.", __name__)
                continue

            w_next_entry = entry_at(idx + 1)
            # check for format (1): `int [int int int int ...]`
            if isinstance(w_next_entry, Sequence):
                # int() because range arithmetic needs a real integer even if
                # the PDF stores the index as a float.
                start_idx = int(w_entry)
                for offset, _width in enumerate(w_next_entry):
                    _cidx = start_idx + offset
                    if _cidx in ord_map:
                        current_widths[ord_map[_cidx]] = _width
                skip_count = 1
                continue

            # check for format (2): `int int int`
            # The third entry is only looked at when the second one is numeric.
            const_width = entry_at(idx + 2) if isinstance(w_next_entry, (int, float)) else None
            if isinstance(const_width, (int, float)):
                start_idx, stop_idx = int(w_entry), int(w_next_entry)
                if stop_idx - start_idx >= len(ord_map):
                    # The range is larger than the character map: walk the map
                    # instead, so that a bogus huge range cannot stall parsing.
                    matching = [_cidx for _cidx in ord_map if start_idx <= _cidx <= stop_idx]
                else:
                    matching = [_cidx for _cidx in range(start_idx, stop_idx + 1) if _cidx in ord_map]
                for _cidx in matching:
                    current_widths[ord_map[_cidx]] = const_width
                skip_count = 2
            else:
                # This handles the case of out of bounds (reaching the end of the width definitions
                # while expecting more elements).
                logger_warning(
                    f"Invalid font width definition. Last element: {w_entry}.",
                    __name__
                )

    @staticmethod
    def _add_default_width(current_widths: dict[str, int], flags: int) -> None:
        """
        Set the "default" width, used for characters without an explicit width.

        Args:
            current_widths: Mapping of characters to widths; updated in place.
            flags: The font descriptor flags; only the fixed pitch flag is evaluated.

        """
        if not current_widths:
            current_widths["default"] = 500
            return

        if " " in current_widths and current_widths[" "] != 0:
            # Setting default to once or twice the space width, depending on fixed pitch
            if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
                current_widths["default"] = current_widths[" "]
                return

            current_widths["default"] = int(2 * current_widths[" "])
            return

        # Use the average width of existing glyph widths
        valid_widths = [w for w in current_widths.values() if w > 0]
        current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500

    @staticmethod
    def _parse_font_descriptor(font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
        """
        Translate a PDF font descriptor dictionary into FontDescriptor keyword arguments.

        Args:
            font_descriptor_obj: The "/FontDescriptor" dictionary.

        Returns:
            The values of all known keys present in the dictionary, keyed by
            the corresponding FontDescriptor field name. A bounding box is
            converted to a tuple of four floats.

        """
        font_descriptor_kwargs: dict[Any, Any] = {}
        for source_key, target_key in [
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox")
        ]:
            if source_key in font_descriptor_obj:
                font_descriptor_kwargs[target_key] = font_descriptor_obj[source_key]
        # Handle missing bbox gracefully - PDFs may have fonts without valid bounding boxes
        if "bbox" in font_descriptor_kwargs:
            bbox_tuple = tuple(map(float, font_descriptor_kwargs["bbox"]))
            assert len(bbox_tuple) == 4, bbox_tuple
            font_descriptor_kwargs["bbox"] = bbox_tuple
        return font_descriptor_kwargs

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
    ) -> "Font":
        """
        Create a Font from a PDF font resource dictionary.

        Args:
            pdf_font_dict: The font dictionary as found in the resources.

        Returns:
            The Font with name, sub type, encoding, character map, font
            descriptor and character widths collected from the dictionary.
            Missing information is replaced by defaults.

        """
        from pypdf._codecs.core_font_metrics import CORE_FONT_METRICS  # noqa: PLC0415

        # Can collect base_font, name and encoding directly from font resource
        name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        sub_type = pdf_font_dict.get("/Subtype", "Unknown").removeprefix("/")
        encoding, character_map = get_encoding(pdf_font_dict)
        font_descriptor = None
        character_widths: dict[str, int] = {}
        interpretable = True

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
            # reliably converted into character codes unless all named chars
            # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
            # PDF 1.7 standard.
            if sub_type == "Type3" and "/ToUnicode" not in pdf_font_dict:
                interpretable = all(
                    cname in adobe_glyphs
                    for cname in pdf_font_dict.get("/CharProcs") or []
                )
            if interpretable:  # Save some overhead if font is not interpretable
                if "/Widths" in pdf_font_dict:
                    cls._collect_tt_t1_character_widths(
                        pdf_font_dict, character_map, encoding, character_widths
                    )
                elif name in CORE_FONT_METRICS:
                    font_descriptor = CORE_FONT_METRICS[name].font_descriptor
                    # Copy: the widths are modified below ("default") and are
                    # exposed on the mutable Font, so the shared core font
                    # table must not be aliased.
                    character_widths = dict(CORE_FONT_METRICS[name].character_widths)
                if "/FontDescriptor" in pdf_font_dict:
                    font_descriptor_obj = pdf_font_dict.get("/FontDescriptor", DictionaryObject()).get_object()
                    if "/MissingWidth" in font_descriptor_obj:
                        character_widths["default"] = cast(int, font_descriptor_obj["/MissingWidth"].get_object())
                    font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))
                elif "/FontBBox" in pdf_font_dict:
                    # For Type3 without Font Descriptor but with FontBBox, see Table 110 in the PDF specification 2.0
                    bbox_tuple = tuple(map(float, cast(ArrayObject, pdf_font_dict["/FontBBox"])))
                    assert len(bbox_tuple) == 4, bbox_tuple
                    font_descriptor = FontDescriptor(name=name, bbox=bbox_tuple)

        elif "/DescendantFonts" in pdf_font_dict:
            # Composite font or CID font - CID fonts have a /W array mapping character codes
            # to widths stashed in /DescendantFonts.
            descendant_fonts = cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
            d_font: DictionaryObject
            for d_font_idx, d_font in enumerate(descendant_fonts):
                d_font = cast(DictionaryObject, d_font.get_object())
                # Store the resolved dictionary in place of a possible reference
                descendant_fonts[d_font_idx] = d_font
                cls._collect_cid_character_widths(
                    d_font, character_map, character_widths
                )
                if "/DW" in d_font:
                    character_widths["default"] = cast(int, d_font["/DW"].get_object())
                font_descriptor_obj = d_font.get("/FontDescriptor", DictionaryObject()).get_object()
                font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))

        else:
            # Neither a known simple font type nor a composite font, e.g. a
            # font dictionary without /Subtype. Fall back to default metrics
            # instead of failing on the missing /DescendantFonts.
            logger_warning(
                f"Font {name} of type {sub_type} has no /DescendantFonts. Using default metrics.",
                __name__
            )

        if not font_descriptor:
            font_descriptor = FontDescriptor(name=name)

        if character_widths.get("default", 0) == 0:
            cls._add_default_width(character_widths, font_descriptor.flags)
        space_width = character_widths.get(" ", 0)
        if space_width == 0:
            # Approximate the space: full default width for fixed pitch fonts,
            # half of it otherwise.
            if (font_descriptor.flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
                space_width = character_widths["default"]
            else:
                space_width = character_widths["default"] // 2

        return cls(
            name=name,
            sub_type=sub_type,
            encoding=encoding,
            font_descriptor=font_descriptor,
            character_map=character_map,
            character_widths=character_widths,
            space_width=space_width,
            interpretable=interpretable
        )

    def as_font_resource(self) -> DictionaryObject:
        """
        Return a font resource dictionary for this font.

        The dictionary always declares a Type1 font using WinAnsiEncoding and
        uses the font name for both "/Name" and "/BaseFont".
        """
        # For now, this returns a font resource that only works with the 14 Adobe Core fonts.
        return (
            DictionaryObject({
                NameObject("/Subtype"): NameObject("/Type1"),
                NameObject("/Name"): NameObject(f"/{self.name}"),
                NameObject("/Type"): NameObject("/Font"),
                NameObject("/BaseFont"): NameObject(f"/{self.name}"),
                NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
            })
        )

    def text_width(self, text: str = "") -> float:
        """
        Sum of character widths specified in PDF font for the supplied text.

        Characters without an explicit width count with the "default" width.
        """
        widths = self.character_widths
        return sum(
            (widths.get(char, widths["default"]) for char in text), 0.0
        )