1"""Font constants and classes for "layout" mode text operations"""
2
3from dataclasses import dataclass, field
4from typing import Any, Union
5
6from ..._codecs import adobe_glyphs
7from ..._font import FontDescriptor
8from ...generic import DictionaryObject
9
10
@dataclass
class Font:
    """
    A font object formatted for use during "layout" mode text extraction

    Attributes:
        subtype (str): font subtype
        space_width (int | float): width of a space character
        encoding (str | dict[int, str]): font encoding
        char_map (dict): character map
        font_dictionary (DictionaryObject): font dictionary
        font_descriptor (FontDescriptor): font metrics, including a mapping of
            characters to widths. Not an ``__init__`` argument: it starts as a
            default-constructed ``FontDescriptor`` and is replaced in
            ``__post_init__`` only when the font is interpretable.
        interpretable (bool): Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    subtype: str
    space_width: Union[int, float]
    encoding: Union[str, dict[int, str]]
    char_map: dict[Any, Any]
    font_dictionary: DictionaryObject
    font_descriptor: FontDescriptor = field(default_factory=FontDescriptor, init=False)
    interpretable: bool = True

    def __post_init__(self) -> None:
        """Decide whether the font is interpretable and, if so, load its metrics."""
        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
        # reliably converted into character codes unless all named chars
        # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
        # PDF 1.7 standard.
        if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary:
            # A missing (or empty) /CharProcs entry yields an empty iterable,
            # for which all() is True.
            self.interpretable = all(
                cname in adobe_glyphs
                for cname in self.font_dictionary.get("/CharProcs") or []
            )

        if not self.interpretable:  # save some overhead if font is not interpretable
            return

        self.font_descriptor = FontDescriptor.from_font_resource(
            self.font_dictionary, self.encoding, self.char_map
        )

    def word_width(self, word: str) -> float:
        """
        Sum of character widths specified in PDF font for the supplied word

        Characters that have no entry in the font descriptor's width mapping
        are counted as twice the space width.

        Args:
            word: the word to measure

        Returns:
            The summed width as a float (0.0 for an empty word).
        """
        widths = self.font_descriptor.character_widths
        # Loop-invariant fallback for characters without a known width.
        fallback_width = self.space_width * 2
        return sum((widths.get(char, fallback_width) for char in word), 0.0)

    @staticmethod
    def to_dict(font_instance: "Font") -> dict[str, Any]:
        """
        Dataclass to dict for json.dumps serialization.

        Args:
            font_instance: the Font to convert

        Returns:
            A mapping of every dataclass field name to the instance's current
            attribute value. The values are returned as-is (no recursive
            conversion is performed here).
        """
        return {
            k: getattr(font_instance, k) for k in font_instance.__dataclass_fields__
        }