1""" 
    2Code related to text extraction. 
    3 
    4Some parts are still in _page.py. In doubt, they will stay there. 
    5""" 
    6 
    7import math 
    8from typing import Any, Callable, Optional, Union 
    9 
    10from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding 
    11 
# Inclusive bounds of an additional, user-defined range of character codes
# that get_display_str treats as right-to-left. With the default -1/-1 no
# character code can fall inside the range. Changed through set_custom_rtl.
CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
# Character codes that get_display_str inserts in the current writing
# direction (without switching direction). Changed through set_custom_rtl.
CUSTOM_RTL_SPECIAL_CHARS: list[int] = []
# NOTE(review): not referenced anywhere in this module; presumably consumed
# by the layout-mode extraction code - confirm against its users.
LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5
    16 
    17 
    18class OrientationNotFoundError(Exception): 
    19    pass 
    20 
    21 
    22def set_custom_rtl( 
    23    _min: Union[str, int, None] = None, 
    24    _max: Union[str, int, None] = None, 
    25    specials: Union[str, list[int], None] = None, 
    26) -> tuple[int, int, list[int]]: 
    27    """ 
    28    Change the Right-To-Left and special characters custom parameters. 
    29 
    30    Args: 
    31        _min: The new minimum value for the range of custom characters that 
    32            will be written right to left. 
    33            If set to ``None``, the value will not be changed. 
    34            If set to an integer or string, it will be converted to its ASCII code. 
    35            The default value is -1, which sets no additional range to be converted. 
    36        _max: The new maximum value for the range of custom characters that will 
    37            be written right to left. 
    38            If set to ``None``, the value will not be changed. 
    39            If set to an integer or string, it will be converted to its ASCII code. 
    40            The default value is -1, which sets no additional range to be converted. 
    41        specials: The new list of special characters to be inserted in the 
    42            current insertion order. 
    43            If set to ``None``, the current value will not be changed. 
    44            If set to a string, it will be converted to a list of ASCII codes. 
    45            The default value is an empty list. 
    46 
    47    Returns: 
    48        A tuple containing the new values for ``CUSTOM_RTL_MIN``, 
    49        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``. 
    50 
    51    """ 
    52    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS 
    53    if isinstance(_min, int): 
    54        CUSTOM_RTL_MIN = _min 
    55    elif isinstance(_min, str): 
    56        CUSTOM_RTL_MIN = ord(_min) 
    57    if isinstance(_max, int): 
    58        CUSTOM_RTL_MAX = _max 
    59    elif isinstance(_max, str): 
    60        CUSTOM_RTL_MAX = ord(_max) 
    61    if isinstance(specials, str): 
    62        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials] 
    63    elif isinstance(specials, list): 
    64        CUSTOM_RTL_SPECIAL_CHARS = specials 
    65    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS 
    66 
    67 
    68def mult(m: list[float], n: list[float]) -> list[float]: 
    69    return [ 
    70        m[0] * n[0] + m[1] * n[2], 
    71        m[0] * n[1] + m[1] * n[3], 
    72        m[2] * n[0] + m[3] * n[2], 
    73        m[2] * n[1] + m[3] * n[3], 
    74        m[4] * n[0] + m[5] * n[2] + n[4], 
    75        m[4] * n[1] + m[5] * n[3] + n[5], 
    76    ] 
    77 
    78 
    79def orient(m: list[float]) -> int: 
    80    if m[3] > 1e-6: 
    81        return 0 
    82    if m[3] < -1e-6: 
    83        return 180 
    84    if m[1] > 0: 
    85        return 90 
    86    return 270 
    87 
    88 
    89def crlf_space_check( 
    90    text: str, 
    91    cmtm_prev: tuple[list[float], list[float]], 
    92    cmtm_matrix: tuple[list[float], list[float]], 
    93    memo_cmtm: tuple[list[float], list[float]], 
    94    cmap: tuple[ 
    95        Union[str, dict[int, str]], dict[str, str], str, Optional[DictionaryObject] 
    96    ], 
    97    orientations: tuple[int, ...], 
    98    output: str, 
    99    font_size: float, 
    100    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], 
    101    str_widths: float, 
    102    spacewidth: float, 
    103    str_height: float, 
    104) -> tuple[str, str, list[float], list[float]]: 
    105    cm_prev = cmtm_prev[0] 
    106    tm_prev = cmtm_prev[1] 
    107    cm_matrix = cmtm_matrix[0] 
    108    tm_matrix = cmtm_matrix[1] 
    109    memo_cm = memo_cmtm[0] 
    110    memo_tm = memo_cmtm[1] 
    111 
    112    m_prev = mult(tm_prev, cm_prev) 
    113    m = mult(tm_matrix, cm_matrix) 
    114    orientation = orient(m) 
    115    delta_x = m[4] - m_prev[4] 
    116    delta_y = m[5] - m_prev[5] 
    117    # Table 108 of the 1.7 reference ("Text positioning operators") 
    118    scale_prev_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2) 
    119    scale_prev_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2) 
    120    scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2) 
    121    cm_prev = m 
    122 
    123    if orientation not in orientations: 
    124        raise OrientationNotFoundError 
    125    if orientation in (0, 180): 
    126        moved_height: float = delta_y 
    127        moved_width: float = delta_x 
    128    elif orientation in (90, 270): 
    129        moved_height = delta_x 
    130        moved_width = delta_y 
    131    try: 
    132        if abs(moved_height) > 0.8 * min(str_height * scale_prev_y, font_size * scale_y): 
    133            if (output + text)[-1] != "\n": 
    134                output += text + "\n" 
    135                if visitor_text is not None: 
    136                    visitor_text( 
    137                        text + "\n", 
    138                        memo_cm, 
    139                        memo_tm, 
    140                        cmap[3], 
    141                        font_size, 
    142                    ) 
    143                text = "" 
    144        elif ( 
    145            (moved_width >= (spacewidth + str_widths) * scale_prev_x) 
    146            and (output + text)[-1] != " " 
    147        ): 
    148            text += " " 
    149    except Exception: 
    150        pass 
    151    tm_prev = tm_matrix.copy() 
    152    cm_prev = cm_matrix.copy() 
    153    return text, output, cm_prev, tm_prev 
    154 
    155 
    156def get_text_operands( 
    157    operands: list[Union[str, TextStringObject]], 
    158    cm_matrix: list[float], 
    159    tm_matrix: list[float], 
    160    cmap: tuple[ 
    161        Union[str, dict[int, str]], dict[str, str], str, Optional[DictionaryObject] 
    162    ], 
    163    orientations: tuple[int, ...] 
    164) -> tuple[str, bool]: 
    165    t: str = "" 
    166    is_str_operands = False 
    167    m = mult(tm_matrix, cm_matrix) 
    168    orientation = orient(m) 
    169    if orientation in orientations and len(operands) > 0: 
    170        if isinstance(operands[0], str): 
    171            t = operands[0] 
    172            is_str_operands = True 
    173        else: 
    174            t = "" 
    175            tt: bytes = ( 
    176                encode_pdfdocencoding(operands[0]) 
    177                if isinstance(operands[0], str) 
    178                else operands[0] 
    179            ) 
    180            if isinstance(cmap[0], str): 
    181                try: 
    182                    t = tt.decode(cmap[0], "surrogatepass")  # apply str encoding 
    183                except Exception: 
    184                    # the data does not match the expectation, 
    185                    # we use the alternative ; 
    186                    # text extraction may not be good 
    187                    t = tt.decode( 
    188                        "utf-16-be" if cmap[0] == "charmap" else "charmap", 
    189                        "surrogatepass", 
    190                    )  # apply str encoding 
    191            else:  # apply dict encoding 
    192                t = "".join( 
    193                    [cmap[0][x] if x in cmap[0] else bytes((x,)).decode() for x in tt] 
    194                ) 
    195    return (t, is_str_operands) 
    196 
    197 
    198def get_display_str( 
    199    text: str, 
    200    cm_matrix: list[float], 
    201    tm_matrix: list[float], 
    202    cmap: tuple[ 
    203        Union[str, dict[int, str]], dict[str, str], str, Optional[DictionaryObject] 
    204    ], 
    205    text_operands: str, 
    206    font_size: float, 
    207    rtl_dir: bool, 
    208    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] 
    209) -> tuple[str, bool]: 
    210    # "\u0590 - \u08FF \uFB50 - \uFDFF" 
    211    for x in [cmap[1].get(x, x) for x in text_operands]: 
    212        # x can be a sequence of bytes ; ex: habibi.pdf 
    213        if len(x) == 1: 
    214            xx = ord(x) 
    215        else: 
    216            xx = 1 
    217        # fmt: off 
    218        if ( 
    219            # cases where the current inserting order is kept 
    220            (xx <= 0x2F)                        # punctuations but... 
    221            or 0x3A <= xx <= 0x40               # numbers (x30-39) 
    222            or 0x2000 <= xx <= 0x206F           # upper punctuations.. 
    223            or 0x20A0 <= xx <= 0x21FF           # but (numbers) indices/exponents 
    224            or xx in CUSTOM_RTL_SPECIAL_CHARS   # customized.... 
    225        ): 
    226            text = x + text if rtl_dir else text + x 
    227        elif (  # right-to-left characters set 
    228            0x0590 <= xx <= 0x08FF 
    229            or 0xFB1D <= xx <= 0xFDFF 
    230            or 0xFE70 <= xx <= 0xFEFF 
    231            or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX 
    232        ): 
    233            if not rtl_dir: 
    234                rtl_dir = True 
    235                if visitor_text is not None: 
    236                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) 
    237                text = "" 
    238            text = x + text 
    239        else:  # left-to-right 
    240            if rtl_dir: 
    241                rtl_dir = False 
    242                if visitor_text is not None: 
    243                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) 
    244                text = "" 
    245            text = text + x 
    246        # fmt: on 
    247    return text, rtl_dir