1"""manage the PDF transform stack during "layout" mode text extraction""" 
    2 
    3from collections import ChainMap, Counter 
    4from collections import ChainMap as ChainMapType 
    5from collections import Counter as CounterType 
    6from collections.abc import MutableMapping 
    7from typing import Any, Union 
    8 
    9from ...errors import PdfReadError 
    10from .. import mult 
    11from ._font import Font 
    12from ._text_state_params import TextStateParams 
    13 
# Type of the transform stack: a ChainMap whose int keys 0-5 hold the
# a/b/c/d/e/f matrix values and whose str keys "is_text" / "is_render" hold
# bool flags (see TextStateManager.raw_transform / new_transform).
TextStateManagerChainMapType = ChainMapType[Union[int, str], Union[float, bool]]
# Type of a single transform mapping as built by TextStateManager.new_transform.
TextStateManagerDictType = MutableMapping[Union[int, str], Union[float, bool]]
    16 
    17 
    18class TextStateManager: 
    19    """ 
    20    Tracks the current text state including cm/tm/trm transformation matrices. 
    21 
    22    Attributes: 
    23        transform_stack (ChainMap): ChainMap of cm/tm transformation matrices 
    24        q_queue (Counter[int]): Counter of q operators 
    25        q_depth (List[int]): list of q operator nesting levels 
    26        Tc (float): character spacing 
    27        Tw (float): word spacing 
    28        Tz (int): horizontal scaling 
    29        TL (float): leading 
    30        Ts (float): text rise 
    31        font (Font): font object 
    32        font_size (int | float): font size 
    33 
    34    """ 
    35 
    36    def __init__(self) -> None: 
    37        self.transform_stack: TextStateManagerChainMapType = ChainMap( 
    38            self.new_transform() 
    39        ) 
    40        self.q_queue: CounterType[int] = Counter() 
    41        self.q_depth = [0] 
    42        self.Tc: float = 0.0 
    43        self.Tw: float = 0.0 
    44        self.Tz: float = 100.0 
    45        self.TL: float = 0.0 
    46        self.Ts: float = 0.0 
    47        self.font_stack: list[tuple[Union[Font, None], Union[int, float]]] = [] 
    48        self.font: Union[Font, None] = None 
    49        self.font_size: Union[int, float] = 0 
    50 
    51    def set_state_param(self, op: bytes, value: Union[float, list[Any]]) -> None: 
    52        """ 
    53        Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators. 
    54 
    55        Args: 
    56            op: operator read from PDF stream as bytes. No action is taken 
    57                for unsupported operators (see supported operators above). 
    58            value (float | List[Any]): new parameter value. If a list, 
    59                value[0] is used. 
    60 
    61        """ 
    62        if op not in [b"Tc", b"Tz", b"Tw", b"TL", b"Ts"]: 
    63            return 
    64        self.__setattr__(op.decode(), value[0] if isinstance(value, list) else value) 
    65 
    66    def set_font(self, font: Font, size: float) -> None: 
    67        """ 
    68        Set the current font and font_size. 
    69 
    70        Args: 
    71            font (Font): a layout mode Font 
    72            size (float): font size 
    73 
    74        """ 
    75        self.font = font 
    76        self.font_size = size 
    77 
    78    def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams: 
    79        """ 
    80        Create a TextStateParams instance to display a text string. Type[bytes] values 
    81        will be decoded implicitly. 
    82 
    83        Args: 
    84            value (str | bytes): text to associate with the captured state. 
    85 
    86        Raises: 
    87            PdfReadError: if font not set (no Tf operator in incoming pdf content stream) 
    88 
    89        Returns: 
    90            TextStateParams: current text state parameters 
    91 
    92        """ 
    93        if not isinstance(self.font, Font): 
    94            raise PdfReadError( 
    95                "font not set: is PDF missing a Tf operator?" 
    96            )  # pragma: no cover 
    97        if isinstance(value, bytes): 
    98            try: 
    99                if isinstance(self.font.encoding, str): 
    100                    txt = value.decode(self.font.encoding, "surrogatepass") 
    101                else: 
    102                    txt = "".join( 
    103                        self.font.encoding[x] 
    104                        if x in self.font.encoding 
    105                        else bytes((x,)).decode() 
    106                        for x in value 
    107                    ) 
    108            except (UnicodeEncodeError, UnicodeDecodeError): 
    109                txt = value.decode("utf-8", "replace") 
    110            txt = "".join( 
    111                self.font.char_map.get(x, x) for x in txt 
    112            ) 
    113        else: 
    114            txt = value 
    115        return TextStateParams( 
    116            txt, 
    117            self.font, 
    118            self.font_size, 
    119            self.Tc, 
    120            self.Tw, 
    121            self.Tz, 
    122            self.TL, 
    123            self.Ts, 
    124            self.effective_transform, 
    125        ) 
    126 
    127    @staticmethod 
    128    def raw_transform( 
    129        _a: float = 1.0, 
    130        _b: float = 0.0, 
    131        _c: float = 0.0, 
    132        _d: float = 1.0, 
    133        _e: float = 0.0, 
    134        _f: float = 0.0, 
    135    ) -> dict[int, float]: 
    136        """Only a/b/c/d/e/f matrix params""" 
    137        return dict(zip(range(6), map(float, (_a, _b, _c, _d, _e, _f)))) 
    138 
    139    @staticmethod 
    140    def new_transform( 
    141        _a: float = 1.0, 
    142        _b: float = 0.0, 
    143        _c: float = 0.0, 
    144        _d: float = 1.0, 
    145        _e: float = 0.0, 
    146        _f: float = 0.0, 
    147        is_text: bool = False, 
    148        is_render: bool = False, 
    149    ) -> TextStateManagerDictType: 
    150        """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys""" 
    151        result: Any = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f) 
    152        result.update({"is_text": is_text, "is_render": is_render}) 
    153        return result 
    154 
    155    def reset_tm(self) -> TextStateManagerChainMapType: 
    156        """Clear all transforms from chainmap having is_text==True or is_render==True""" 
    157        while ( 
    158            self.transform_stack.maps[0]["is_text"] 
    159            or self.transform_stack.maps[0]["is_render"] 
    160        ): 
    161            self.transform_stack = self.transform_stack.parents 
    162        return self.transform_stack 
    163 
    164    def reset_trm(self) -> TextStateManagerChainMapType: 
    165        """Clear all transforms from chainmap having is_render==True""" 
    166        while self.transform_stack.maps[0]["is_render"]: 
    167            self.transform_stack = self.transform_stack.parents 
    168        return self.transform_stack 
    169 
    170    def remove_q(self) -> TextStateManagerChainMapType: 
    171        """Rewind to stack prior state after closing a 'q' with internal 'cm' ops""" 
    172        self.font, self.font_size = self.font_stack.pop(-1) 
    173        self.transform_stack = self.reset_tm() 
    174        self.transform_stack.maps = self.transform_stack.maps[ 
    175            self.q_queue.pop(self.q_depth.pop(), 0) : 
    176        ] 
    177        return self.transform_stack 
    178 
    179    def add_q(self) -> None: 
    180        """Add another level to q_queue""" 
    181        self.font_stack.append((self.font, self.font_size)) 
    182        self.q_depth.append(len(self.q_depth)) 
    183 
    184    def add_cm(self, *args: Any) -> TextStateManagerChainMapType: 
    185        """Concatenate an additional transform matrix""" 
    186        self.transform_stack = self.reset_tm() 
    187        self.q_queue.update(self.q_depth[-1:]) 
    188        self.transform_stack = self.transform_stack.new_child(self.new_transform(*args)) 
    189        return self.transform_stack 
    190 
    191    def _complete_matrix(self, operands: list[float]) -> list[float]: 
    192        """Adds a, b, c, and d to an "e/f only" operand set (e.g Td)""" 
    193        if len(operands) == 2:  # this is a Td operator or equivalent 
    194            operands = [1.0, 0.0, 0.0, 1.0, *operands] 
    195        return operands 
    196 
    197    def add_tm(self, operands: list[float]) -> TextStateManagerChainMapType: 
    198        """Append a text transform matrix""" 
    199        self.transform_stack = self.transform_stack.new_child( 
    200            self.new_transform(  # type: ignore[misc] 
    201                *self._complete_matrix(operands), is_text=True  # type: ignore[arg-type] 
    202            ) 
    203        ) 
    204        return self.transform_stack 
    205 
    206    def add_trm(self, operands: list[float]) -> TextStateManagerChainMapType: 
    207        """Append a text rendering transform matrix""" 
    208        self.transform_stack = self.transform_stack.new_child( 
    209            self.new_transform(  # type: ignore[misc] 
    210                *self._complete_matrix(operands), is_text=True, is_render=True  # type: ignore[arg-type] 
    211            ) 
    212        ) 
    213        return self.transform_stack 
    214 
    215    @property 
    216    def effective_transform(self) -> list[float]: 
    217        """Current effective transform accounting for cm, tm, and trm transforms""" 
    218        eff_transform = [*self.transform_stack.maps[0].values()] 
    219        for transform in self.transform_stack.maps[1:]: 
    220            eff_transform = mult(eff_transform, transform)  # type: ignore[arg-type]  # dict has int keys 0-5 
    221        return eff_transform