1"""Font constants and classes for "layout" mode text operations"""
2
3from dataclasses import dataclass, field
4from typing import Any, Dict, Sequence, Union, cast
5
6from ..._codecs import adobe_glyphs
7from ...errors import ParseError
8from ...generic import IndirectObject
9from ._font_widths import STANDARD_WIDTHS
10
11
@dataclass
class Font:
    """
    A font object formatted for use during "layout" mode text extraction

    Attributes:
        subtype (str): font subtype
        space_width (int | float): width of a space character
        encoding (str | Dict[int, str]): font encoding
        char_map (dict): character map
        font_dictionary (dict): font dictionary
        width_map (Dict[str, int]): mapping of characters to widths. Not an
            ``__init__`` argument; it is populated by ``__post_init__``.
        interpretable (bool): Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    subtype: str
    space_width: Union[int, float]
    encoding: Union[str, Dict[int, str]]
    char_map: Dict[Any, Any]
    font_dictionary: Dict[Any, Any]
    width_map: Dict[str, int] = field(default_factory=dict, init=False)
    interpretable: bool = True

    def __post_init__(self) -> None:
        """
        Derive ``interpretable`` and ``width_map`` from the font dictionary.

        Width sources are consulted in this order:

        1. ``/Widths`` (with ``/FirstChar``), only when ``encoding`` is a dict.
        2. The ``/W`` array of every entry in ``/DescendantFonts``.
        3. ``STANDARD_WIDTHS``, matched on the ``/BaseFont`` prefix, only when
           the previous steps left ``width_map`` empty.

        Raises:
            ParseError: if a ``/W`` array entry matches neither valid format.
            IndexError: if a ``/W`` array ends while more elements are expected.

        """
        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
        # reliably converted into character codes unless all named chars
        # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
        # PDF 1.7 standard.
        if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary:
            self.interpretable = all(
                cname in adobe_glyphs
                for cname in self.font_dictionary.get("/CharProcs") or []
            )

        if not self.interpretable:  # save some overhead if font is not interpretable
            return

        # TrueType fonts have a /Widths array mapping character codes to widths
        if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary:
            first_char = self.font_dictionary.get("/FirstChar", 0)
            self.width_map = {
                self.encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(self.font_dictionary["/Widths"])
            }

        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts
        if "/DescendantFonts" in self.font_dictionary:
            # Code point -> mapped text. Built once because it depends only on
            # char_map, which is not modified while the widths are loaded.
            # Keys that are not exactly one character are skipped: ord() would
            # raise TypeError on them and a single /W character code could
            # never address them anyway.
            ord_map = {
                ord(_target): _surrogate
                for _target, _surrogate in self.char_map.items()
                if isinstance(_target, str) and len(_target) == 1
            }
            d_font: Dict[Any, Any]
            for d_font_idx, d_font in enumerate(
                self.font_dictionary["/DescendantFonts"]
            ):
                while isinstance(d_font, IndirectObject):
                    d_font = d_font.get_object()
                # Store the resolved object back into the font dictionary.
                self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font
                self._load_cid_widths(d_font.get("/W", []), ord_map)

        # Fall back to the bundled width tables when nothing was found above.
        if not self.width_map and "/BaseFont" in self.font_dictionary:
            for key in STANDARD_WIDTHS:
                if self.font_dictionary["/BaseFont"].startswith(f"/{key}"):
                    self.width_map = STANDARD_WIDTHS[key]
                    break

    def _load_cid_widths(self, _w: Any, ord_map: Dict[int, Any]) -> None:
        """
        Merge the widths of one descendant font's ``/W`` array into ``width_map``.

        /W width definitions have two valid formats which can be mixed and matched:

        (1) A character start index followed by a list of widths, e.g.
            `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        (2) A character start index, a character stop index, and a width, e.g.
            `45 65 500` applies width 500 to characters 45-65.

        Args:
            _w: the ``/W`` array; every element must offer ``get_object()``.
            ord_map: code point -> ``width_map`` key. Character codes missing
                from this mapping are ignored.

        Raises:
            ParseError: if an entry matches neither format.
            IndexError: if the array ends while more elements are expected.

        """
        skip_count = 0
        for idx, w_entry in enumerate(_w):
            w_entry = w_entry.get_object()
            if skip_count:
                # This element was already consumed as part of a previous entry.
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):  # pragma: no cover
                # We should never get here due to skip_count above. Consider
                # emitting a warning, or raising when the reader is "strict".
                continue
            # check for format (1): `int [int int int int ...]`
            w_next_entry = _w[idx + 1].get_object()
            if isinstance(w_next_entry, Sequence):
                start_idx, width_list = w_entry, w_next_entry
                self.width_map.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in zip(
                            range(
                                cast(int, start_idx),
                                cast(int, start_idx) + len(width_list),
                                1,
                            ),
                            width_list,
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 1
            # check for format (2): `int int int`
            elif isinstance(w_next_entry, (int, float)) and isinstance(
                _w[idx + 2].get_object(), (int, float)
            ):
                start_idx, stop_idx, const_width = (
                    w_entry,
                    w_next_entry,
                    _w[idx + 2].get_object(),
                )
                self.width_map.update(
                    {
                        ord_map[_cidx]: const_width
                        for _cidx in range(
                            cast(int, start_idx), cast(int, stop_idx + 1), 1
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 2
            else:
                # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
                # while expecting more elements). This raises an IndexError which is sufficient.
                raise ParseError(
                    f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
                )  # pragma: no cover

    def word_width(self, word: str) -> float:
        """
        Sum of character widths specified in PDF font for the supplied word

        Characters missing from ``width_map`` count as twice ``space_width``.
        An empty word yields ``0.0``.
        """
        fallback_width = self.space_width * 2
        return sum(
            (self.width_map.get(char, fallback_width) for char in word), 0.0
        )

    @staticmethod
    def to_dict(font_instance: "Font") -> Dict[str, Any]:
        """
        Dataclass to dict for json.dumps serialization.

        The result is shallow: values are the instance's own attribute
        objects, not copies.
        """
        return {
            k: getattr(font_instance, k) for k in font_instance.__dataclass_fields__
        }