1"""Font constants and classes for "layout" mode text operations"""
2
3from collections.abc import Sequence
4from dataclasses import dataclass, field
5from typing import Any, Union, cast
6
7from ..._codecs import adobe_glyphs
8from ...errors import ParseError
9from ...generic import IndirectObject
10from ._font_widths import STANDARD_WIDTHS
11
12
@dataclass
class Font:
    """
    A font object formatted for use during "layout" mode text extraction

    Attributes:
        subtype (str): font subtype
        space_width (int | float): width of a space character
        encoding (str | Dict[int, str]): font encoding
        char_map (dict): character map
        font_dictionary (dict): font dictionary
        width_map (Dict[str, int]): mapping of characters to widths. Not an
            ``__init__`` argument; it is populated by ``__post_init__``.
        interpretable (bool): Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    subtype: str
    space_width: Union[int, float]
    encoding: Union[str, dict[int, str]]
    char_map: dict[Any, Any]
    font_dictionary: dict[Any, Any]
    width_map: dict[str, int] = field(default_factory=dict, init=False)
    interpretable: bool = True

    def __post_init__(self) -> None:
        """
        Derive ``interpretable`` and ``width_map`` from the font dictionary.

        Widths are taken, in order, from a simple font's /Widths array, from
        the /W arrays of any /DescendantFonts, and, only if nothing was found,
        from the STANDARD_WIDTHS table keyed by /BaseFont prefix.

        Raises:
            ParseError: If a /W array holds a sequence of entries that matches
                neither of the two supported width formats.
            IndexError: If a /W array ends while more entries are expected.
        """
        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
        # reliably converted into character codes unless all named chars
        # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
        # PDF 1.7 standard.
        if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary:
            self.interpretable = all(
                cname in adobe_glyphs
                for cname in self.font_dictionary.get("/CharProcs") or []
            )

        if not self.interpretable:  # save some overhead if font is not interpretable
            return

        # TrueType fonts have a /Widths array mapping character codes to widths
        if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary:
            first_char = self.font_dictionary.get("/FirstChar", 0)
            self.width_map = {
                self.encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(self.font_dictionary["/Widths"])
            }

        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts
        if "/DescendantFonts" in self.font_dictionary:
            # Code point -> mapped character. This does not depend on the
            # descendant font, so build it once. Only single-character string
            # keys are usable: ord() raises TypeError for any other length, and
            # non-string keys are not character codes.
            ord_map = {
                ord(_target): _surrogate
                for _target, _surrogate in self.char_map.items()
                if isinstance(_target, str) and len(_target) == 1
            }
            d_font: dict[Any, Any]
            for d_font_idx, d_font in enumerate(
                self.font_dictionary["/DescendantFonts"]
            ):
                while isinstance(d_font, IndirectObject):
                    d_font = d_font.get_object()
                # Store the resolved object back so it is not resolved again.
                self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font
                # /W width definitions have two valid formats which can be mixed and matched:
                #   (1) A character start index followed by a list of widths, e.g.
                #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
                #   (2) A character start index, a character stop index, and a width, e.g.
                #       `45 65 500` applies width 500 to characters 45-65.
                skip_count = 0  # entries already consumed by the previous definition
                _w = d_font.get("/W", [])
                for idx, w_entry in enumerate(_w):
                    w_entry = w_entry.get_object()
                    if skip_count:
                        skip_count -= 1
                        continue
                    if not isinstance(w_entry, (int, float)):  # pragma: no cover
                        # We should never get here due to skip_count above. Add a
                        # warning and or use reader's "strict" to force an ex???
                        continue
                    # check for format (1): `int [int int int int ...]`
                    w_next_entry = _w[idx + 1].get_object()
                    if isinstance(w_next_entry, Sequence):
                        start_idx, width_list = w_entry, w_next_entry
                        self.width_map.update(
                            {
                                ord_map[_cidx]: _width
                                for _cidx, _width in zip(
                                    range(
                                        cast(int, start_idx),
                                        cast(int, start_idx) + len(width_list),
                                        1,
                                    ),
                                    width_list,
                                )
                                if _cidx in ord_map
                            }
                        )
                        skip_count = 1
                    # check for format (2): `int int int`
                    elif isinstance(w_next_entry, (int, float)) and isinstance(
                        _w[idx + 2].get_object(), (int, float)
                    ):
                        start_idx, stop_idx, const_width = (
                            w_entry,
                            w_next_entry,
                            _w[idx + 2].get_object(),
                        )
                        self.width_map.update(
                            {
                                ord_map[_cidx]: const_width
                                for _cidx in range(
                                    cast(int, start_idx), cast(int, stop_idx + 1), 1
                                )
                                if _cidx in ord_map
                            }
                        )
                        skip_count = 2
                    else:
                        # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
                        # while expecting more elements). This raises an IndexError which is sufficient.
                        raise ParseError(
                            f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
                        )  # pragma: no cover

        # Fall back to the built-in width tables when the PDF supplied no widths.
        if not self.width_map and "/BaseFont" in self.font_dictionary:
            for key in STANDARD_WIDTHS:
                if self.font_dictionary["/BaseFont"].startswith(f"/{key}"):
                    self.width_map = STANDARD_WIDTHS[key]
                    break

    def word_width(self, word: str) -> float:
        """
        Sum of character widths specified in PDF font for the supplied word

        Characters missing from ``width_map`` are counted as twice
        ``space_width``. An empty word yields ``0.0``.
        """
        fallback_width = self.space_width * 2
        return sum(
            (self.width_map.get(char, fallback_width) for char in word), 0.0
        )

    @staticmethod
    def to_dict(font_instance: "Font") -> dict[str, Any]:
        """
        Dataclass to dict for json.dumps serialization.

        The returned dict maps each dataclass field name to the instance's
        current attribute value; values are not copied.
        """
        return {
            k: getattr(font_instance, k) for k in font_instance.__dataclass_fields__
        }