1# Copyright (c) 2006, Mathieu Fenniak 
    2# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 
    3# 
    4# All rights reserved. 
    5# 
    6# Redistribution and use in source and binary forms, with or without 
    7# modification, are permitted provided that the following conditions are 
    8# met: 
    9# 
    10# * Redistributions of source code must retain the above copyright notice, 
    11# this list of conditions and the following disclaimer. 
    12# * Redistributions in binary form must reproduce the above copyright notice, 
    13# this list of conditions and the following disclaimer in the documentation 
    14# and/or other materials provided with the distribution. 
    15# * The name of the author may not be used to endorse or promote products 
    16# derived from this software without specific prior written permission. 
    17# 
    18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
    19# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
    20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
    21# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
    22# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
    23# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
    24# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
    25# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
    26# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
    27# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
    28# POSSIBILITY OF SUCH DAMAGE. 
    29 
    30import math 
    31from typing import Any, Callable, Optional, Union 
    32 
    33from .._cmap import build_font_width_map, compute_font_width, get_actual_str_key 
    34from ..generic import DictionaryObject, TextStringObject 
    35from . import OrientationNotFoundError, crlf_space_check, get_display_str, get_text_operands, mult 
    36 
    37 
    38class TextExtraction: 
    39    """ 
    40    A class to handle PDF text extraction operations. 
    41 
    42    This class encapsulates all the state and operations needed for extracting 
    43    text from PDF content streams, replacing the nested functions and nonlocal 
    44    variables in the original implementation. 
    45    """ 
    46 
    47    def __init__(self) -> None: 
    48        self._font_width_maps: dict[str, tuple[dict[Any, float], str, float]] = {} 
    49 
    50        # Text extraction state variables 
    51        self.cm_matrix: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 
    52        self.tm_matrix: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 
    53        self.cm_stack: list[ 
    54            tuple[ 
    55                list[float], 
    56                tuple[Union[str, dict[int, str]], dict[str, str], str, Optional[DictionaryObject]], 
    57                float, 
    58                float, 
    59                float, 
    60                float, 
    61                float, 
    62            ] 
    63        ] = [] 
    64 
    65        # Store the last modified matrices; can be an intermediate position 
    66        self.cm_prev: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 
    67        self.tm_prev: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 
    68 
    69        # Store the position at the beginning of building the text 
    70        self.memo_cm: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 
    71        self.memo_tm: list[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 
    72 
    73        self.char_scale = 1.0 
    74        self.space_scale = 1.0 
    75        self._space_width: float = 500.0  # will be set correctly at first Tf 
    76        self._actual_str_size: dict[str, float] = { 
    77            "str_widths": 0.0, 
    78            "space_width": 0.0, 
    79            "str_height": 0.0, 
    80        }  # will be set to string length calculation result 
    81        self.TL = 0.0 
    82        self.font_size = 12.0  # init just in case of 
    83 
    84        # Text extraction variables 
    85        self.text: str = "" 
    86        self.output: str = "" 
    87        self.rtl_dir: bool = False  # right-to-left 
    88        self.cmap: tuple[Union[str, dict[int, str]], dict[str, str], str, Optional[DictionaryObject]] = ( 
    89            "charmap", 
    90            {}, 
    91            "NotInitialized", 
    92            None, 
    93        )  # (encoding, CMAP, font resource name, font) 
    94        self.orientations: tuple[int, ...] = (0, 90, 180, 270) 
    95        self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None 
    96        self.cmaps: dict[str, tuple[str, float, Union[str, dict[int, str]], dict[str, str], DictionaryObject]] = {} 
    97 
    98        self.operation_handlers = { 
    99            b"BT": self._handle_bt, 
    100            b"ET": self._handle_et, 
    101            b"q": self._handle_save_graphics_state, 
    102            b"Q": self._handle_restore_graphics_state, 
    103            b"cm": self._handle_cm, 
    104            b"Tz": self._handle_tz, 
    105            b"Tw": self._handle_tw, 
    106            b"TL": self._handle_tl, 
    107            b"Tf": self._handle_tf, 
    108            b"Td": self._handle_td, 
    109            b"Tm": self._handle_tm, 
    110            b"T*": self._handle_t_star, 
    111            b"Tj": self._handle_tj_operation, 
    112        } 
    113 
    114    def initialize_extraction( 
    115        self, 
    116        orientations: tuple[int, ...] = (0, 90, 180, 270), 
    117        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, 
    118        cmaps: Optional[ 
    119            dict[str, tuple[str, float, Union[str, dict[int, str]], dict[str, str], DictionaryObject]] 
    120        ] = None, 
    121    ) -> None: 
    122        """Initialize the extractor with extraction parameters.""" 
    123        self.orientations = orientations 
    124        self.visitor_text = visitor_text 
    125        self.cmaps = cmaps or {} 
    126 
    127        # Reset state 
    128        self.text = "" 
    129        self.output = "" 
    130        self.rtl_dir = False 
    131 
    132    def compute_str_widths(self, str_widths: float) -> float: 
    133        return str_widths / 1000 
    134 
    135    def process_operation(self, operator: bytes, operands: list[Any]) -> None: 
    136        if operator in self.operation_handlers: 
    137            handler = self.operation_handlers[operator] 
    138            str_widths = handler(operands) 
    139 
    140            # Post-process operations that affect text positioning 
    141            if operator in {b"Td", b"Tm", b"T*", b"Tj"}: 
    142                self._post_process_text_operation(str_widths or 0.0) 
    143 
    def _post_process_text_operation(self, str_widths: float) -> None:
        """
        Run ``crlf_space_check`` after a text positioning operation.

        The call receives the current text and output, the previous, current
        and memorised matrix pairs, the character map, orientations, font
        size, visitor and the width/height figures. Its four results replace
        ``text``, ``output``, ``cm_prev`` and ``tm_prev``. When the resulting
        text is empty, the memorised matrices are moved to the current ones.

        If ``OrientationNotFoundError`` is raised, the state is left untouched.
        """
        previous_matrices = (self.cm_prev, self.tm_prev)
        current_matrices = (self.cm_matrix, self.tm_matrix)
        memo_matrices = (self.memo_cm, self.memo_tm)
        space_width = self.compute_str_widths(self._actual_str_size["space_width"])
        str_height = self._actual_str_size["str_height"]

        try:
            checked = crlf_space_check(
                self.text,
                previous_matrices,
                current_matrices,
                memo_matrices,
                self.cmap,
                self.orientations,
                self.output,
                self.font_size,
                self.visitor_text,
                str_widths,
                space_width,
                str_height,
            )
        except OrientationNotFoundError:
            return  # deliberately ignored

        self.text, self.output, self.cm_prev, self.tm_prev = checked
        if self.text == "":
            # Nothing pending: the next text run starts at the current position
            self.memo_cm = self.cm_matrix.copy()
            self.memo_tm = self.tm_matrix.copy()
    166 
    167    def _get_actual_font_widths( 
    168        self, 
    169        cmap: tuple[ 
    170            Union[str, dict[int, str]], dict[str, str], str, Optional[DictionaryObject] 
    171        ], 
    172        text_operands: str, 
    173        font_size: float, 
    174        space_width: float, 
    175    ) -> tuple[float, float, float]: 
    176        font_widths: float = 0 
    177        font_name: str = cmap[2] 
    178        if font_name not in self._font_width_maps: 
    179            if cmap[3] is None: 
    180                font_width_map: dict[Any, float] = {} 
    181                space_char = " " 
    182                actual_space_width: float = space_width 
    183                font_width_map["default"] = actual_space_width * 2 
    184            else: 
    185                space_char = get_actual_str_key(" ", cmap[0], cmap[1]) 
    186                font_width_map = build_font_width_map(cmap[3], space_width * 2) 
    187                actual_space_width = compute_font_width(font_width_map, space_char) 
    188            if actual_space_width == 0: 
    189                actual_space_width = space_width 
    190            self._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width) 
    191        font_width_map = self._font_width_maps[font_name][0] 
    192        space_char = self._font_width_maps[font_name][1] 
    193        actual_space_width = self._font_width_maps[font_name][2] 
    194 
    195        if text_operands: 
    196            for char in text_operands: 
    197                if char == space_char: 
    198                    font_widths += actual_space_width 
    199                    continue 
    200                font_widths += compute_font_width(font_width_map, char) 
    201        return (font_widths * font_size, space_width * font_size, font_size) 
    202 
    def _handle_tj(
        self,
        text: str,
        operands: list[Union[str, TextStringObject]],
        cm_matrix: list[float],
        tm_matrix: list[float],
        cmap: tuple[
            Union[str, dict[int, str]], dict[str, str], str, Optional[DictionaryObject]
        ],
        orientations: tuple[int, ...],
        font_size: float,
        rtl_dir: bool,
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
        space_width: float,
        actual_str_size: dict[str, float],
    ) -> tuple[str, bool, dict[str, float]]:
        """
        Add the text of a show-text operation to ``text`` and track its width.

        ``get_text_operands`` resolves the operands. When it reports string
        operands, the result is appended to ``text`` as is; otherwise
        ``get_display_str`` produces the new text and direction flag.

        ``actual_str_size`` is updated in place: ``space_width`` and
        ``str_height`` are overwritten and ``str_widths`` is incremented.

        Returns:
            ``(text, rtl_dir, actual_str_size)``; the dict is the very object
            that was passed in.

        """
        text_operands, is_str_operands = get_text_operands(
            operands, cm_matrix, tm_matrix, cmap, orientations
        )
        if not is_str_operands:
            text, rtl_dir = get_display_str(
                text,
                cm_matrix,
                tm_matrix,  # text matrix
                cmap,
                text_operands,
                font_size,
                rtl_dir,
                visitor_text,
            )
        else:
            text += text_operands

        widths, scaled_space_width, str_height = self._get_actual_font_widths(
            cmap, text_operands, font_size, space_width
        )
        actual_str_size["space_width"] = scaled_space_width
        actual_str_size["str_height"] = str_height
        actual_str_size["str_widths"] += widths

        return text, rtl_dir, actual_str_size
    239 
    240    def _flush_text(self) -> None: 
    241        """Flush accumulated text to output and call visitor if present.""" 
    242        self.output += self.text 
    243        if self.visitor_text is not None: 
    244            self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size) 
    245        self.text = "" 
    246        self.memo_cm = self.cm_matrix.copy() 
    247        self.memo_tm = self.tm_matrix.copy() 
    248 
    249    # Operation handlers 
    250 
    251    def _handle_bt(self, operands: list[Any]) -> None: 
    252        """Handle BT (Begin Text) operation - Table 5.4 page 405.""" 
    253        self.tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 
    254        self._flush_text() 
    255 
    256    def _handle_et(self, operands: list[Any]) -> None: 
    257        """Handle ET (End Text) operation - Table 5.4 page 405.""" 
    258        self._flush_text() 
    259 
    260    def _handle_save_graphics_state(self, operands: list[Any]) -> None: 
    261        """Handle q (Save graphics state) operation - Table 4.7 page 219.""" 
    262        self.cm_stack.append( 
    263            ( 
    264                self.cm_matrix, 
    265                self.cmap, 
    266                self.font_size, 
    267                self.char_scale, 
    268                self.space_scale, 
    269                self._space_width, 
    270                self.TL, 
    271            ) 
    272        ) 
    273 
    274    def _handle_restore_graphics_state(self, operands: list[Any]) -> None: 
    275        """Handle Q (Restore graphics state) operation - Table 4.7 page 219.""" 
    276        try: 
    277            ( 
    278                self.cm_matrix, 
    279                self.cmap, 
    280                self.font_size, 
    281                self.char_scale, 
    282                self.space_scale, 
    283                self._space_width, 
    284                self.TL, 
    285            ) = self.cm_stack.pop() 
    286        except Exception: 
    287            self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 
    288 
    289    def _handle_cm(self, operands: list[Any]) -> None: 
    290        """Handle cm (Modify current matrix) operation - Table 4.7 page 219.""" 
    291        self.output += self.text 
    292        if self.visitor_text is not None: 
    293            self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size) 
    294        self.text = "" 
    295        try: 
    296            self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix) 
    297        except Exception: 
    298            self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] 
    299        self.memo_cm = self.cm_matrix.copy() 
    300        self.memo_tm = self.tm_matrix.copy() 
    301 
    302    def _handle_tz(self, operands: list[Any]) -> None: 
    303        """Handle Tz (Set horizontal text scaling) operation - Table 5.2 page 398.""" 
    304        self.char_scale = float(operands[0]) / 100 if operands else 1.0 
    305 
    306    def _handle_tw(self, operands: list[Any]) -> None: 
    307        """Handle Tw (Set word spacing) operation - Table 5.2 page 398.""" 
    308        self.space_scale = 1.0 + float(operands[0] if operands else 0.0) 
    309 
    310    def _handle_tl(self, operands: list[Any]) -> None: 
    311        """Handle TL (Set Text Leading) operation - Table 5.2 page 398.""" 
    312        scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2) 
    313        self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x 
    314 
    def _handle_tf(self, operands: list[Any]) -> None:
        """
        Handle Tf (Set font size) operation - Table 5.2 page 398.

        Pending text is flushed first (the visitor is only called when there
        is text to flush). The font named by ``operands[0]`` is then looked up
        in ``cmaps`` to set the active character map and space width; an
        unknown font falls back to ``unknown_char_map`` and gets a
        ``"???"``-prefixed resource name. Finally ``operands[1]`` becomes the
        font size; if it cannot be converted, the previous size is kept.
        """
        if self.text != "":
            self._flush_text()
        else:
            # Nothing to emit; only memorise the current matrices.
            self.memo_cm = self.cm_matrix.copy()
            self.memo_tm = self.tm_matrix.copy()

        # Import here to avoid circular imports
        from .._cmap import unknown_char_map  # noqa: PLC0415

        font_key = operands[0]
        try:
            # Layout: (font_type, float(sp_width / 2), encoding, map_dict,
            #          font_dict (describes the font))
            char_map_tuple = self.cmaps[font_key]
        except KeyError:  # font not found
            self.cmap = (
                unknown_char_map[2],
                unknown_char_map[3],
                f"???{font_key}",
                None,
            )
            self._space_width = unknown_char_map[1]
        else:
            # Current cmap layout: (encoding, map_dict,
            #   font resource name (internal name, not the real font name),
            #   font_dict)
            self.cmap = (
                char_map_tuple[2],
                char_map_tuple[3],
                font_key,
                char_map_tuple[4],
            )
            self._space_width = char_map_tuple[1]

        try:
            self.font_size = float(operands[1])
        except Exception:
            pass  # keep previous size
    357 
    358    def _handle_td(self, operands: list[Any]) -> float: 
    359        """Handle Td (Move text position) operation - Table 5.5 page 406.""" 
    360        # A special case is a translating only tm: 
    361        # tm = [1, 0, 0, 1, e, f] 
    362        # i.e. tm[4] += tx, tm[5] += ty. 
    363        tx, ty = float(operands[0]), float(operands[1]) 
    364        self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2] 
    365        self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3] 
    366        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) 
    367        self._actual_str_size["str_widths"] = 0.0 
    368        return str_widths 
    369 
    370    def _handle_tm(self, operands: list[Any]) -> float: 
    371        """Handle Tm (Set text matrix) operation - Table 5.5 page 406.""" 
    372        self.tm_matrix = [float(operand) for operand in operands[:6]] 
    373        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) 
    374        self._actual_str_size["str_widths"] = 0.0 
    375        return str_widths 
    376 
    377    def _handle_t_star(self, operands: list[Any]) -> float: 
    378        """Handle T* (Move to next line) operation - Table 5.5 page 406.""" 
    379        self.tm_matrix[4] -= self.TL * self.tm_matrix[2] 
    380        self.tm_matrix[5] -= self.TL * self.tm_matrix[3] 
    381        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) 
    382        self._actual_str_size["str_widths"] = 0.0 
    383        return str_widths 
    384 
    def _handle_tj_operation(self, operands: list[Any]) -> float:
        """
        Handle Tj (Show text) operation - Table 5.5 page 406.

        Passes the current extraction state to ``_handle_tj`` and stores the
        text, direction flag and size record it returns.

        Returns:
            Always ``0.0``; the accumulated string width is consumed later by
            the positioning handlers.

        """
        shown_text, direction, sizes = self._handle_tj(
            self.text,
            operands,
            self.cm_matrix,
            self.tm_matrix,
            self.cmap,
            self.orientations,
            self.font_size,
            self.rtl_dir,
            self.visitor_text,
            self._space_width,
            self._actual_str_size,
        )
        self.text = shown_text
        self.rtl_dir = direction
        self._actual_str_size = sizes
        return 0.0  # str_widths will be handled in post-processing