1"""Font constants and classes for "layout" mode text operations""" 
    2 
    3from collections.abc import Sequence 
    4from dataclasses import dataclass, field 
    5from typing import Any, Union, cast 
    6 
    7from ..._codecs import adobe_glyphs 
    8from ...errors import ParseError 
    9from ...generic import IndirectObject 
    10from ._font_widths import STANDARD_WIDTHS 
    11 
    12 
    13@dataclass 
    14class Font: 
    15    """ 
    16    A font object formatted for use during "layout" mode text extraction 
    17 
    18    Attributes: 
    19        subtype (str): font subtype 
    20        space_width (int | float): width of a space character 
    21        encoding (str | Dict[int, str]): font encoding 
    22        char_map (dict): character map 
    23        font_dictionary (dict): font dictionary 
    24        width_map (Dict[str, int]): mapping of characters to widths 
    25        interpretable (bool): Default True. If False, the font glyphs cannot 
    26            be translated to characters, e.g. Type3 fonts that do not define 
    27            a '/ToUnicode' mapping. 
    28 
    29    """ 
    30 
    31    subtype: str 
    32    space_width: Union[int, float] 
    33    encoding: Union[str, dict[int, str]] 
    34    char_map: dict[Any, Any] 
    35    font_dictionary: dict[Any, Any] 
    36    width_map: dict[str, int] = field(default_factory=dict, init=False) 
    37    interpretable: bool = True 
    38 
    39    def __post_init__(self) -> None: 
    40        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be 
    41        # reliably converted into character codes unless all named chars 
    42        # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the 
    43        # PDF 1.7 standard. 
    44        if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary: 
    45            self.interpretable = all( 
    46                cname in adobe_glyphs 
    47                for cname in self.font_dictionary.get("/CharProcs") or [] 
    48            ) 
    49 
    50        if not self.interpretable:  # save some overhead if font is not interpretable 
    51            return 
    52 
    53        # TrueType fonts have a /Widths array mapping character codes to widths 
    54        if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary: 
    55            first_char = self.font_dictionary.get("/FirstChar", 0) 
    56            self.width_map = { 
    57                self.encoding.get(idx + first_char, chr(idx + first_char)): width 
    58                for idx, width in enumerate(self.font_dictionary["/Widths"]) 
    59            } 
    60 
    61        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts 
    62        if "/DescendantFonts" in self.font_dictionary: 
    63            d_font: dict[Any, Any] 
    64            for d_font_idx, d_font in enumerate( 
    65                self.font_dictionary["/DescendantFonts"] 
    66            ): 
    67                while isinstance(d_font, IndirectObject): 
    68                    d_font = d_font.get_object() 
    69                self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font 
    70                ord_map = { 
    71                    ord(_target): _surrogate 
    72                    for _target, _surrogate in self.char_map.items() 
    73                    if isinstance(_target, str) 
    74                } 
    75                # /W width definitions have two valid formats which can be mixed and matched: 
    76                #   (1) A character start index followed by a list of widths, e.g. 
    77                #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47. 
    78                #   (2) A character start index, a character stop index, and a width, e.g. 
    79                #       `45 65 500` applies width 500 to characters 45-65. 
    80                skip_count = 0 
    81                _w = d_font.get("/W", []) 
    82                for idx, w_entry in enumerate(_w): 
    83                    w_entry = w_entry.get_object() 
    84                    if skip_count: 
    85                        skip_count -= 1 
    86                        continue 
    87                    if not isinstance(w_entry, (int, float)):  # pragma: no cover 
    88                        # We should never get here due to skip_count above. Add a 
    89                        # warning and or use reader's "strict" to force an ex??? 
    90                        continue 
    91                    # check for format (1): `int [int int int int ...]` 
    92                    w_next_entry = _w[idx + 1].get_object() 
    93                    if isinstance(w_next_entry, Sequence): 
    94                        start_idx, width_list = w_entry, w_next_entry 
    95                        self.width_map.update( 
    96                            { 
    97                                ord_map[_cidx]: _width 
    98                                for _cidx, _width in zip( 
    99                                    range( 
    100                                        cast(int, start_idx), 
    101                                        cast(int, start_idx) + len(width_list), 
    102                                        1, 
    103                                    ), 
    104                                    width_list, 
    105                                ) 
    106                                if _cidx in ord_map 
    107                            } 
    108                        ) 
    109                        skip_count = 1 
    110                    # check for format (2): `int int int` 
    111                    elif isinstance(w_next_entry, (int, float)) and isinstance( 
    112                        _w[idx + 2].get_object(), (int, float) 
    113                    ): 
    114                        start_idx, stop_idx, const_width = ( 
    115                            w_entry, 
    116                            w_next_entry, 
    117                            _w[idx + 2].get_object(), 
    118                        ) 
    119                        self.width_map.update( 
    120                            { 
    121                                ord_map[_cidx]: const_width 
    122                                for _cidx in range( 
    123                                    cast(int, start_idx), cast(int, stop_idx + 1), 1 
    124                                ) 
    125                                if _cidx in ord_map 
    126                            } 
    127                        ) 
    128                        skip_count = 2 
    129                    else: 
    130                        # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions 
    131                        # while expecting more elements). This raises an IndexError which is sufficient. 
    132                        raise ParseError( 
    133                            f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}" 
    134                        )  # pragma: no cover 
    135 
    136        if not self.width_map and "/BaseFont" in self.font_dictionary: 
    137            for key in STANDARD_WIDTHS: 
    138                if self.font_dictionary["/BaseFont"].startswith(f"/{key}"): 
    139                    self.width_map = STANDARD_WIDTHS[key] 
    140                    break 
    141 
    142    def word_width(self, word: str) -> float: 
    143        """Sum of character widths specified in PDF font for the supplied word""" 
    144        return sum( 
    145            [self.width_map.get(char, self.space_width * 2) for char in word], 0.0 
    146        ) 
    147 
    148    @staticmethod 
    149    def to_dict(font_instance: "Font") -> dict[str, Any]: 
    150        """Dataclass to dict for json.dumps serialization.""" 
    151        return { 
    152            k: getattr(font_instance, k) for k in font_instance.__dataclass_fields__ 
    153        }