1"""
2Code related to text extraction.
3
4Some parts are still in _page.py. In doubt, they will stay there.
5"""
6
7import math
8from typing import Any, Callable, Optional, Union
9
10from .._font import Font
11from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding
12
# Inclusive bounds of an additional, user-defined range of character codes
# that get_display_str() treats as right-to-left. With the default -1/-1 the
# range matches no character. Change the values through set_custom_rtl().
CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
# Character codes that get_display_str() inserts in the current writing
# direction instead of letting them switch the direction. Empty by default.
CUSTOM_RTL_SPECIAL_CHARS: list[int] = []
# NOTE(review): not referenced in this part of the module; judging by the
# name it is a number of space widths used by the layout mode - confirm.
LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5
17
18
class OrientationNotFoundError(Exception):
    """
    Signal that text is drawn in an orientation that was not requested.

    Raised by :func:`crlf_space_check` when the orientation computed from the
    current matrices is not part of the accepted ``orientations``.
    """
21
22
def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, list[int], None] = None,
) -> tuple[int, int, list[int]]:
    """
    Change the Right-To-Left and special characters custom parameters.

    Args:
        _min: The new minimum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            An integer is used as is; a single-character string is converted
            to its Unicode code point with :func:`ord`.
            The default value is -1, which sets no additional range to be converted.
        _max: The new maximum value for the range of custom characters that will
            be written right to left.
            If set to ``None``, the value will not be changed.
            An integer is used as is; a single-character string is converted
            to its Unicode code point with :func:`ord`.
            The default value is -1, which sets no additional range to be converted.
        specials: The new list of special characters to be inserted in the
            current insertion order.
            If set to ``None``, the current value will not be changed.
            If set to a string, it will be converted to the list of the
            Unicode code points of its characters.
            If set to a list, a copy of it is stored.
            The default value is an empty list.

    Returns:
        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.

    Raises:
        TypeError: If ``_min`` or ``_max`` is a string that is not exactly
            one character long (raised by :func:`ord`).

    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
    if isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    elif isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    if isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    elif isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    if isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
    elif isinstance(specials, list):
        # Store a copy: keeping the caller's list object would let any later
        # mutation of that list silently change the module-wide setting.
        CUSTOM_RTL_SPECIAL_CHARS = list(specials)
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
67
68
def mult(m: list[float], n: list[float]) -> list[float]:
    """
    Multiply two transformation matrices given in 6-element form.

    A list ``[a, b, c, d, e, f]`` stands for the 3x3 matrix
    ``[[a, b, 0], [c, d, 0], [e, f, 1]]``; the result is ``m x n`` written
    in the same 6-element form.

    Args:
        m: Left-hand matrix; elements 0 to 5 are read.
        n: Right-hand matrix; elements 0 to 5 are read.

    Returns:
        A new 6-element list holding the product.

    """
    m_a, m_b, m_c, m_d, m_e, m_f = m[0], m[1], m[2], m[3], m[4], m[5]
    n_a, n_b, n_c, n_d, n_e, n_f = n[0], n[1], n[2], n[3], n[4], n[5]

    # The first two rows only see the linear part of ``n`` ...
    product = [
        m_a * n_a + m_b * n_c,
        m_a * n_b + m_b * n_d,
        m_c * n_a + m_d * n_c,
        m_c * n_b + m_d * n_d,
    ]
    # ... while the translation row additionally picks up n's translation.
    product.append(m_e * n_a + m_f * n_c + n_e)
    product.append(m_e * n_b + m_f * n_d + n_f)
    return product
78
79
def orient(m: list[float]) -> int:
    """
    Classify a transformation matrix into one of four orientations.

    Args:
        m: Matrix in 6-element form; only ``m[3]`` and ``m[1]`` are read.

    Returns:
        ``0`` if ``m[3]`` is above ``1e-6``, ``180`` if it is below ``-1e-6``;
        otherwise ``90`` if ``m[1]`` is positive and ``270`` if it is not.

    """
    vertical_scale = m[3]
    if abs(vertical_scale) > 1e-6:
        return 0 if vertical_scale > 0 else 180
    # m[3] is (almost) zero: the sign of m[1] decides between 90 and 270.
    return 90 if m[1] > 0 else 270
88
89
def crlf_space_check(
    text: str,
    cmtm_prev: tuple[list[float], list[float]],
    cmtm_matrix: tuple[list[float], list[float]],
    memo_cmtm: tuple[list[float], list[float]],
    font_resource: Optional[DictionaryObject],
    orientations: tuple[int, ...],
    output: str,
    font_size: float,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    str_widths: float,
    spacewidth: float,
    str_height: float,
) -> tuple[str, str, list[float], list[float]]:
    """
    Insert a line break or a space if the text position moved far enough.

    The position obtained from the current ``(cm, tm)`` matrices is compared
    with the one obtained from the previous pair. Depending on the
    orientation, the displacement is split into a "height" and a "width"
    component:

    * if the height component exceeds 80 % of the smaller of
      ``str_height * scale_prev_y`` and ``font_size * scale_y``, the pending
      ``text`` plus ``"\\n"`` is moved to ``output`` (and reported to
      ``visitor_text``), unless the buffer already ends with ``"\\n"``;
    * otherwise, if the width component is at least
      ``(spacewidth + str_widths) * scale_prev_x``, a single space is
      appended to ``text``, unless the buffer already ends with a space.

    Nothing is inserted while ``output + text`` is still empty.

    Args:
        text: Pending text that has not been moved to ``output`` yet.
        cmtm_prev: ``(cm, tm)`` matrices of the previous position.
        cmtm_matrix: ``(cm, tm)`` matrices of the current position.
        memo_cmtm: ``(cm, tm)`` matrices handed to ``visitor_text`` together
            with a flushed line.
        font_resource: Passed through unchanged to ``visitor_text``.
        orientations: Orientations (as returned by :func:`orient`) that are
            accepted.
        output: Text that has already been emitted.
        font_size: Font size; scaled by the current text matrix for the
            line-break threshold and passed to ``visitor_text``.
        visitor_text: Optional callback invoked with
            ``(text, cm, tm, font_resource, font_size)`` for a flushed line.
        str_widths: Width added to ``spacewidth`` for the space threshold.
        spacewidth: Width of a space used for the space threshold.
        str_height: Height scaled by the previous text matrix for the
            line-break threshold.

    Returns:
        ``(text, output, cm_prev, tm_prev)`` where ``cm_prev`` and
        ``tm_prev`` are copies of the current matrices, to be used as the
        previous ones on the next call.

    Raises:
        OrientationNotFoundError: If the current orientation is not in
            ``orientations``.
        Exception: Whatever ``visitor_text`` raises is propagated; it is no
            longer swallowed, which used to leave the flushed line both in
            ``output`` and in ``text``.

    """
    cm_prev = cmtm_prev[0]
    tm_prev = cmtm_prev[1]
    cm_matrix = cmtm_matrix[0]
    tm_matrix = cmtm_matrix[1]
    memo_cm = memo_cmtm[0]
    memo_tm = memo_cmtm[1]

    m_prev = mult(tm_prev, cm_prev)
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    if orientation not in orientations:
        raise OrientationNotFoundError

    delta_x = m[4] - m_prev[4]
    delta_y = m[5] - m_prev[5]
    # Table 108 of the 1.7 reference ("Text positioning operators")
    scale_prev_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2)
    scale_prev_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2)
    scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2)

    # orient() only returns 0, 90, 180 or 270: for 90/270 the roles of the
    # two axes are swapped.
    if orientation in (0, 180):
        moved_height: float = delta_y
        moved_width: float = delta_x
    else:
        moved_height = delta_x
        moved_width = delta_y

    # Last character written so far; "" while nothing has been written, in
    # which case no separator is ever inserted.
    last_char = (output + text)[-1:]

    if abs(moved_height) > 0.8 * min(str_height * scale_prev_y, font_size * scale_y):
        if last_char not in ("", "\n"):
            output += text + "\n"
            if visitor_text is not None:
                visitor_text(
                    text + "\n",
                    memo_cm,
                    memo_tm,
                    font_resource,
                    font_size,
                )
            text = ""
    elif (
        moved_width >= (spacewidth + str_widths) * scale_prev_x
        and last_char not in ("", " ")
    ):
        text += " "

    return text, output, cm_matrix.copy(), tm_matrix.copy()
153
154
def get_text_operands(
    operands: list[Union[str, TextStringObject]],
    cm_matrix: list[float],
    tm_matrix: list[float],
    font: Font,
    orientations: tuple[int, ...]
) -> tuple[str, bool]:
    """
    Return the first operand as text, decoding it with the font if needed.

    Args:
        operands: Operands of the operator being processed; only the first
            entry is used.
        cm_matrix: Matrix combined with ``tm_matrix`` to find the orientation.
        tm_matrix: Matrix combined with ``cm_matrix`` to find the orientation.
        font: Font whose ``encoding`` (a codec name or a mapping from byte
            value to text) is used to decode a non-``str`` operand.
        orientations: Orientations (as returned by :func:`orient`) for which
            text is extracted.

    Returns:
        ``(text, is_str_operands)``. ``text`` is empty when the orientation is
        not requested or ``operands`` is empty. ``is_str_operands`` is ``True``
        only when the first operand already was a ``str`` and has been
        returned unchanged.

    """
    orientation = orient(mult(tm_matrix, cm_matrix))
    if orientation not in orientations or not operands:
        return "", False

    operand = operands[0]
    if isinstance(operand, str):
        return operand, True

    # Anything else is handled as a bytes-like value that must be decoded.
    raw = operand
    if isinstance(font.encoding, str):
        try:
            decoded = raw.decode(font.encoding, "surrogatepass")  # apply str encoding
        except Exception:
            # the data does not match the expectation,
            # we use the alternative ;
            # text extraction may not be good
            decoded = raw.decode(
                "utf-16-be" if font.encoding == "charmap" else "charmap",
                "surrogatepass",
            )  # apply str encoding
        return decoded, False

    # apply dict encoding
    # A byte without an entry falls back to the character with the same code
    # point. chr() gives the same result as the former one-byte UTF-8 decode
    # for values below 0x80 and, unlike it, does not raise for 0x80-0xFF.
    decoded = "".join(
        font.encoding[x] if x in font.encoding else chr(x) for x in raw
    )
    return decoded, False
193
194
def get_display_str(
    text: str,
    cm_matrix: list[float],
    tm_matrix: list[float],
    font_resource: Optional[DictionaryObject],
    font: Font,
    text_operands: str,
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]]
) -> tuple[str, bool, float]:
    """
    Merge ``text_operands`` into ``text`` according to the writing direction.

    Each character of ``text_operands`` is first translated through
    ``font.character_map`` (characters without an entry are kept as is) and
    then handled according to its code point:

    * direction-neutral characters (code point up to 0x2F, 0x3A-0x40,
      0x2000-0x206F, 0x20A0-0x21FF, or listed in
      ``CUSTOM_RTL_SPECIAL_CHARS``) are inserted following the current
      direction: prepended if ``rtl_dir`` is set, appended otherwise;
    * right-to-left characters (0x0590-0x08FF, 0xFB1D-0xFDFF, 0xFE70-0xFEFF,
      or within ``CUSTOM_RTL_MIN``..``CUSTOM_RTL_MAX``) are prepended;
    * every other character is appended.

    A translation that is not exactly one character long is handled with
    code point 1, i.e. as a direction-neutral character.

    Args:
        text: Text collected so far.
        cm_matrix: Passed through unchanged to ``visitor_text``.
        tm_matrix: Passed through unchanged to ``visitor_text``.
        font_resource: Passed through unchanged to ``visitor_text``.
        font: Provides ``character_map``, ``space_width`` and ``text_width``.
        text_operands: Characters to merge into ``text``.
        font_size: Passed through unchanged to ``visitor_text``.
        rtl_dir: ``True`` if the current writing direction is right to left.
        visitor_text: Optional callback invoked with
            ``(text, cm_matrix, tm_matrix, font_resource, font_size)`` each
            time the writing direction switches.

    Returns:
        ``(text, rtl_dir, widths)``: the updated text, the writing direction
        after the last character, and the sum of the widths of the translated
        characters (``font.space_width`` for ``" "``, ``font.text_width(x)``
        otherwise).

    """
    # "\u0590 - \u08FF \uFB50 - \uFDFF"
    widths: float = 0.0
    for x in [font.character_map.get(x, x) for x in text_operands]:
        # x can be a sequence of bytes ; ex: habibi.pdf
        if len(x) == 1:
            xx = ord(x)
        else:
            # multi-character translation: 1 selects the neutral branch below
            xx = 1
        # fmt: off
        if (
            # cases where the current inserting order is kept
            (xx <= 0x2F)  # punctuations but...
            or 0x3A <= xx <= 0x40  # numbers (x30-39)
            or 0x2000 <= xx <= 0x206F  # upper punctuations..
            or 0x20A0 <= xx <= 0x21FF  # but (numbers) indices/exponents
            or xx in CUSTOM_RTL_SPECIAL_CHARS  # customized....
        ):
            text = x + text if rtl_dir else text + x
        elif (  # right-to-left characters set
            0x0590 <= xx <= 0x08FF
            or 0xFB1D <= xx <= 0xFDFF
            or 0xFE70 <= xx <= 0xFEFF
            or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX
        ):
            if not rtl_dir:
                # switching from left-to-right to right-to-left
                rtl_dir = True
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, font_resource, font_size)
                # NOTE(review): the pending text is cleared on every direction
                # switch, so it is only seen by visitor_text and is not part
                # of the returned string - confirm callers expect this.
                text = ""
            text = x + text
        else:  # left-to-right
            if rtl_dir:
                # switching from right-to-left to left-to-right
                rtl_dir = False
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, font_resource, font_size)
                # NOTE(review): same as above, the pending text is dropped
                # from the returned string - confirm.
                text = ""
            text = text + x
        # the width is accumulated for every character, whatever its direction
        widths += font.space_width if x == " " else font.text_width(x)
        # fmt: on
    return text, rtl_dir, widths