1"""
2Code related to text extraction.
3
Some parts are still in _page.py. When in doubt, they will stay there.
5"""
6
7import math
8from typing import Any, Callable, Dict, List, Optional, Tuple, Union
9
10from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding
11
# Inclusive range of character codes that get_display_str() additionally
# treats as right-to-left. With both bounds at -1 the range matches no
# character. Changed at runtime through set_custom_rtl().
CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
# Character codes that get_display_str() inserts in the current insertion
# order (like punctuation) instead of switching the writing direction.
CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
# NOTE(review): not referenced in the visible part of this file; presumably a
# threshold, counted in space widths, used by layout-mode extraction to start
# a new BT group -- TODO confirm against the code that reads it.
LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5
16
17
class OrientationNotFoundError(Exception):
    """Raised when a text orientation is not among the requested ones."""
20
21
def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, List[int], None] = None,
) -> Tuple[int, int, List[int]]:
    """
    Change the Right-To-Left and special characters custom parameters.

    Args:
        _min: The new minimum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            If set to an integer, it is used as the character code as is.
            If set to a single-character string, it is converted to its
            Unicode code point with ``ord()``.
            The default value is -1, which sets no additional range to be
            converted.
        _max: The new maximum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            If set to an integer, it is used as the character code as is.
            If set to a single-character string, it is converted to its
            Unicode code point with ``ord()``.
            The default value is -1, which sets no additional range to be
            converted.
        specials: The new list of special characters to be inserted in the
            current insertion order.
            If set to ``None``, the current value will not be changed.
            If set to a string, it will be converted to a list of the code
            points of its characters.
            If set to a list, a copy of it is stored.
            The default value is an empty list.

    Returns:
        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.

    Raises:
        TypeError: If ``_min`` or ``_max`` is a string whose length is not 1
            (raised by ``ord()``).

    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
    if isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    elif isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    if isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    elif isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    if isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
    elif isinstance(specials, list):
        # Store a copy: keeping the caller's list object would let later
        # mutations of that list silently change the module configuration.
        CUSTOM_RTL_SPECIAL_CHARS = list(specials)
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
66
67
def mult(m: List[float], n: List[float]) -> List[float]:
    """
    Multiply two 2D transformation matrices given in six-element form.

    Each argument ``[a, b, c, d, e, f]`` stands for the 3x3 matrix
    ``[[a, b, 0], [c, d, 0], [e, f, 1]]``.

    Args:
        m: The left-hand matrix.
        n: The right-hand matrix.

    Returns:
        The product ``m x n`` as a new six-element list.

    """
    m_a, m_b, m_c, m_d, m_e, m_f = m[0], m[1], m[2], m[3], m[4], m[5]
    n_a, n_b, n_c, n_d, n_e, n_f = n[0], n[1], n[2], n[3], n[4], n[5]

    product = []
    # Linear part: rows (a, b) and (c, d) of m against the columns of n.
    for left, right in ((m_a, m_b), (m_c, m_d)):
        product.append(left * n_a + right * n_c)
        product.append(left * n_b + right * n_d)
    # Translation row: transformed by n's linear part, then shifted by n's
    # own translation.
    product.append(m_e * n_a + m_f * n_c + n_e)
    product.append(m_e * n_b + m_f * n_d + n_f)
    return product
77
78
def orient(m: List[float]) -> int:
    """
    Derive a text orientation in degrees from a transformation matrix.

    Args:
        m: A transformation matrix in six-element form; only ``m[3]`` and
            ``m[1]`` are inspected.

    Returns:
        0 if ``m[3]`` is above 1e-6, 180 if it is below -1e-6; otherwise
        90 if ``m[1]`` is positive, else 270.

    """
    m3 = m[3]
    if abs(m3) > 1e-6:
        # m[3] is clearly non-zero: its sign separates upright from
        # upside-down.
        return 0 if m3 > 0 else 180
    # m[3] is (almost) zero: fall back to the sign of m[1].
    return 90 if m[1] > 0 else 270
87
88
def crlf_space_check(
    text: str,
    cmtm_prev: Tuple[List[float], List[float]],
    cmtm_matrix: Tuple[List[float], List[float]],
    memo_cmtm: Tuple[List[float], List[float]],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    output: str,
    font_size: float,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    str_widths: float,
    spacewidth: float,
    str_height: float,
) -> Tuple[str, str, List[float], List[float]]:
    """
    Insert a line break or a space depending on how far the text moved.

    The previous and current positions are obtained by combining each text
    matrix with its current transformation matrix. The displacement between
    them decides whether the pending ``text`` is terminated by a newline and
    flushed into ``output``, or extended by a single space.

    Args:
        text: The pending text that has not been added to ``output`` yet.
        cmtm_prev: ``(cm, tm)`` matrices of the previous position.
        cmtm_matrix: ``(cm, tm)`` matrices of the current position.
        memo_cmtm: ``(cm, tm)`` matrices handed to ``visitor_text`` when the
            pending text is flushed.
        cmap: Character map tuple; only ``cmap[3]`` is used here and it is
            passed through to ``visitor_text``.
        orientations: The orientations (in degrees) that are accepted.
        output: The text accumulated so far.
        font_size: The current font size.
        visitor_text: Optional callback invoked with
            ``(text, cm, tm, cmap[3], font_size)`` when a line is flushed.
        str_widths: Width of the previously drawn string.
        spacewidth: Width of a space character.
        str_height: Height of the previously drawn string.

    Returns:
        A tuple ``(text, output, cm_prev, tm_prev)`` where the last two
        items are copies of the current ``cm`` and ``tm`` matrices.

    Raises:
        OrientationNotFoundError: If the orientation of the current position
            is not in ``orientations``.

    """
    cm_prev = cmtm_prev[0]
    tm_prev = cmtm_prev[1]
    cm_matrix = cmtm_matrix[0]
    tm_matrix = cmtm_matrix[1]
    memo_cm = memo_cmtm[0]
    memo_tm = memo_cmtm[1]

    m_prev = mult(tm_prev, cm_prev)
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    delta_x = m[4] - m_prev[4]
    delta_y = m[5] - m_prev[5]
    # Table 108 of the 1.7 reference ("Text positioning operators")
    scale_prev_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2)
    scale_prev_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2)
    scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2)

    if orientation not in orientations:
        raise OrientationNotFoundError
    if orientation in (0, 180):
        moved_height: float = delta_y
        moved_width: float = delta_x
    else:
        # 90 or 270 (orient() returns no other value): the axes are swapped.
        moved_height = delta_x
        moved_width = delta_y
    try:
        # Nothing is inserted while no character has been collected at all;
        # "pending" being empty is therefore checked explicitly below.
        pending = output + text
        if abs(moved_height) > 0.8 * min(
            str_height * scale_prev_y, font_size * scale_y
        ):
            # Moved by more than 80% of a line height: start a new line,
            # unless the collected text already ends with one.
            if pending and pending[-1] != "\n":
                output += text + "\n"
                if visitor_text is not None:
                    visitor_text(
                        text + "\n",
                        memo_cm,
                        memo_tm,
                        cmap[3],
                        font_size,
                    )
                text = ""
        elif (
            pending
            and (moved_width >= (spacewidth + str_widths) * scale_prev_x)
            and pending[-1] != " "
        ):
            # Moved past the previous string by at least one space width.
            text += " "
    except Exception:
        # Deliberately best-effort: a failure while measuring or notifying
        # the visitor must not abort the extraction; text and output keep
        # whatever state they had when the failure occurred.
        pass
    tm_prev = tm_matrix.copy()
    cm_prev = cm_matrix.copy()
    return text, output, cm_prev, tm_prev
154
155
def get_text_operands(
    operands: List[Union[str, TextStringObject]],
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...]
) -> Tuple[str, bool]:
    """
    Decode the first operand of a text-showing operation into a string.

    Args:
        operands: The operands of the operation; only ``operands[0]`` is
            used.
        cm_matrix: The current transformation matrix.
        tm_matrix: The current text matrix.
        cmap: Character map tuple; ``cmap[0]`` is either the name of a codec
            or a dictionary mapping byte values to strings.
        orientations: The orientations (in degrees) that are accepted.

    Returns:
        A tuple ``(text, is_str_operands)``. ``text`` is empty when the
        orientation is not accepted or there is no operand.
        ``is_str_operands`` is ``True`` only when the operand already was a
        ``str`` and has been returned unchanged.

    """
    t: str = ""
    is_str_operands = False
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    if orientation in orientations and len(operands) > 0:
        operand = operands[0]
        if isinstance(operand, str):
            t = operand
            is_str_operands = True
        else:
            # Not a str, so the operand is used as the raw bytes to decode.
            tt: bytes = operand
            encoding = cmap[0]
            if isinstance(encoding, str):
                try:
                    t = tt.decode(encoding, "surrogatepass")  # apply str encoding
                except Exception:
                    # the data does not match the expectation,
                    # we use the alternative ;
                    # text extraction may not be good
                    t = tt.decode(
                        "utf-16-be" if encoding == "charmap" else "charmap",
                        "surrogatepass",
                    )  # apply str encoding
            else:  # apply dict encoding
                # Bytes missing from the map fall back to the character with
                # the same code. chr() is used instead of decoding the single
                # byte as UTF-8, which raises for every value >= 0x80.
                t = "".join(
                    [encoding[x] if x in encoding else chr(x) for x in tt]
                )
    return (t, is_str_operands)
196
197
def get_display_str(
    text: str,
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    text_operands: str,
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]]
) -> Tuple[str, bool]:
    """
    Append characters to ``text`` in display order.

    Every character of ``text_operands`` is first mapped through ``cmap[1]``
    (unmapped characters are kept as they are). Right-to-left characters are
    prepended to ``text``, left-to-right characters are appended, and
    direction-neutral characters follow the current direction. Whenever the
    direction switches, the text collected so far is passed to
    ``visitor_text`` (if given) and ``text`` starts again from empty.

    Args:
        text: The text collected so far for the current direction.
        cm_matrix: Current transformation matrix, passed to ``visitor_text``.
        tm_matrix: Current text matrix, passed to ``visitor_text``.
        cmap: Character map tuple; ``cmap[1]`` maps characters and
            ``cmap[3]`` is passed to ``visitor_text``.
        text_operands: The characters to add.
        font_size: Current font size, passed to ``visitor_text``.
        rtl_dir: Whether the current writing direction is right to left.
        visitor_text: Optional callback invoked with
            ``(text, cm_matrix, tm_matrix, cmap[3], font_size)`` on every
            direction switch.

    Returns:
        A tuple ``(text, rtl_dir)`` with the updated text and direction.

    """
    # "\u0590 - \u08FF \uFB50 - \uFDFF"
    mapped = [cmap[1].get(char, char) for char in text_operands]
    for glyph in mapped:
        # A mapped value can be a sequence instead of a single character
        # (ex: habibi.pdf); such values are handled as code 1, which keeps
        # the current insertion order.
        code = ord(glyph) if len(glyph) == 1 else 1

        # fmt: off
        keeps_order = (
            code <= 0x2F                        # controls and punctuation below the digits
            or 0x3A <= code <= 0x40             # punctuation right after the digits (x30-39)
            or 0x2000 <= code <= 0x206F         # general punctuation
            or 0x20A0 <= code <= 0x21FF         # currency, letter-like symbols, arrows...
            or code in CUSTOM_RTL_SPECIAL_CHARS  # user-defined
        )
        # fmt: on
        if keeps_order:
            text = glyph + text if rtl_dir else text + glyph
            continue

        # fmt: off
        char_is_rtl = (
            0x0590 <= code <= 0x08FF
            or 0xFB1D <= code <= 0xFDFF
            or 0xFE70 <= code <= 0xFEFF
            or CUSTOM_RTL_MIN <= code <= CUSTOM_RTL_MAX
        )
        # fmt: on
        if char_is_rtl != bool(rtl_dir):
            # The direction switches: hand over what has been collected and
            # start a new run.
            rtl_dir = char_is_rtl
            if visitor_text is not None:
                visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
            text = ""
        text = glyph + text if char_is_rtl else text + glyph
    return text, rtl_dir