Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py

1"""Extract PDF text preserving the layout of the source PDF"""

3from itertools import groupby

4from math import ceil

5from pathlib import Path

6from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, TypedDict

8from ..._utils import logger_warning

9from .. import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS

10from ._font import Font

11from ._text_state_manager import TextStateManager

12from ._text_state_params import TextStateParams

class BTGroup(TypedDict):
    """
    A single line of text rendered inside one BT/ET operator pair.

    When several text show operations render on the same line, their text is
    merged into one BTGroup dict.

    Keys:
        tx: x coordinate of first character in BTGroup
        ty: y coordinate of first character in BTGroup
        font_size: nominal font size
        font_height: effective font height
        text: rendered text
        displaced_tx: x coordinate of last character in BTGroup
        flip_sort: -1 if page is upside down, else 1

    """

    # position of the first character
    tx: float
    ty: float
    # font metrics: nominal size and effective height
    font_size: float
    font_height: float
    # rendered content and the x coordinate where it ends
    text: str
    displaced_tx: float
    # sort direction multiplier: -1 if page is upside down, else 1
    flip_sort: Literal[-1, 1]

def bt_group(tj_op: TextStateParams, rendered_text: str, dispaced_tx: float) -> BTGroup:
    """
    Build a BTGroup from a text show operation, its rendered text, and the
    x coordinate at which that text ends.

    Args:
        tj_op (TextStateParams): TextStateParams instance supplying position,
            font metrics and vertical-flip state
        rendered_text (str): rendered text
        dispaced_tx (float): x coordinate of last character in BTGroup

    Returns:
        BTGroup: dict describing the rendered line

    """
    # flip_sort is -1 when the text state reports a vertical flip, else 1
    sort_direction: Literal[-1, 1] = -1 if tj_op.flip_vertical else 1
    return BTGroup(
        tx=tj_op.tx,
        ty=tj_op.ty,
        font_size=tj_op.font_size,
        font_height=tj_op.font_height,
        text=rendered_text,
        displaced_tx=dispaced_tx,
        flip_sort=sort_direction,
    )

def recurs_to_target_op(
    ops: Iterator[Tuple[List[Any], bytes]],
    text_state_mgr: TextStateManager,
    end_target: Literal[b"Q", b"ET"],
    fonts: Dict[str, Font],
    strip_rotated: bool = True,
) -> Tuple[List[BTGroup], List[TextStateParams]]:
    """
    Recurse operators between BT/ET and/or q/Q operators managing the transform
    stack and capturing text positioning and rendering data.

    The same ``ops`` iterator is handed to every nested call, so a nested call
    consumes operators up to (and including) its own end target before the
    caller's loop resumes.

    Args:
        ops: iterator of operators in content stream
        text_state_mgr: a TextStateManager instance
        end_target: Either b"Q" (ends b"q" op) or b"ET" (ends b"BT" op)
        fonts: font dictionary as returned by PageObject._layout_mode_fonts()
        strip_rotated: when True, text show operations flagged as rotated are
            skipped while building BTGroups at b"ET". Defaults to True.

    Returns:
        tuple: list of BTGroup dicts + list of TextStateParams dataclass instances.

    """
    # 1 entry per line of text rendered within each BT/ET operation.
    bt_groups: List[BTGroup] = []

    # 1 entry per text show operator (Tj/TJ/'/")
    tj_ops: List[TextStateParams] = []

    if end_target == b"Q":
        # add new q level. cm's added at this level will be popped at next b'Q'
        text_state_mgr.add_q()

    for operands, op in ops:
        # The loop is broken by the end target, or exits normally when there are no more ops.
        if op == end_target:
            if op == b"Q":
                text_state_mgr.remove_q()
            if op == b"ET":
                # NOTE(review): this early return happens before reset_tm() is
                # called -- confirm that skipping the reset is intended when
                # the BT/ET pair contained no text show operators.
                if not tj_ops:
                    return bt_groups, tj_ops
                _text = ""
                bt_idx = 0  # idx of first tj in this bt group
                last_displaced_tx = tj_ops[bt_idx].displaced_tx
                last_ty = tj_ops[bt_idx].ty
                for _idx, _tj in enumerate(
                    tj_ops
                ):  # ... build text from new Tj operators
                    if strip_rotated and _tj.rotated:
                        continue
                    if not _tj.font.interpretable:  # generates warning
                        continue
                    # if the y position of the text is greater than the font height, assume
                    # the text is on a new line and start a new group
                    if abs(_tj.ty - last_ty) > _tj.font_height:
                        # whitespace-only text is discarded rather than emitted
                        if _text.strip():
                            bt_groups.append(
                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
                            )
                        bt_idx = _idx
                        _text = ""

                    # if the x position of the text is less than the last x position by
                    # more than 5 spaces widths, assume the text order should be flipped
                    # and start a new group
                    if (
                        last_displaced_tx - _tj.tx
                        > _tj.space_tx * LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
                    ):
                        if _text.strip():
                            bt_groups.append(
                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
                            )
                        bt_idx = _idx
                        last_displaced_tx = _tj.displaced_tx
                        _text = ""

                    # calculate excess x translation based on ending tx of previous Tj.
                    # multiply by bool (_idx != bt_idx) to ensure spaces aren't double
                    # applied to the first tj of a BTGroup in fixed_width_page().
                    excess_tx = round(_tj.tx - last_displaced_tx, 3) * (_idx != bt_idx)
                    # space_tx could be 0 if either Tz or font_size was 0 for this _tj.
                    spaces = int(excess_tx // _tj.space_tx) if _tj.space_tx else 0
                    new_text = f'{" " * spaces}{_tj.txt}'

                    last_ty = _tj.ty
                    _text = f"{_text}{new_text}"
                    last_displaced_tx = _tj.displaced_tx
                # flush whatever text is still pending as the final group
                if _text:
                    bt_groups.append(bt_group(tj_ops[bt_idx], _text, last_displaced_tx))
                text_state_mgr.reset_tm()
            break
        if op == b"q":
            # nested graphics state: recurse until the matching b"Q"
            bts, tjs = recurs_to_target_op(
                ops, text_state_mgr, b"Q", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"cm":
            text_state_mgr.add_cm(*operands)
        elif op == b"BT":
            # nested text object: recurse until the matching b"ET"
            bts, tjs = recurs_to_target_op(
                ops, text_state_mgr, b"ET", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"Tj":
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b"TJ":
            _tj = text_state_mgr.text_state_params()
            for tj_op in operands[0]:
                # bytes elements are shown as text; every other element is
                # passed on as a TD_offset displacement
                if isinstance(tj_op, bytes):
                    _tj = text_state_mgr.text_state_params(tj_op)
                    tj_ops.append(_tj)
                else:
                    text_state_mgr.add_trm(_tj.displacement_matrix(TD_offset=tj_op))
        elif op == b"'":
            # move down by the current TL value, then show the text
            text_state_mgr.reset_trm()
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b'"':
            # set Tw and Tc from the first two operands, move down by the
            # current TL value, then show the third operand
            text_state_mgr.reset_trm()
            text_state_mgr.set_state_param(b"Tw", operands[0])
            text_state_mgr.set_state_param(b"Tc", operands[1])
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[2]))
        elif op in (b"Td", b"Tm", b"TD", b"T*"):
            text_state_mgr.reset_trm()
            if op == b"Tm":
                # Tm replaces the text matrix, so drop the existing one first
                text_state_mgr.reset_tm()
            elif op == b"TD":
                # TD also stores the negated y operand as TL
                text_state_mgr.set_state_param(b"TL", -operands[1])
            elif op == b"T*":
                # T* carries no operands: synthesize a move down by TL
                operands = [0, -text_state_mgr.TL]
            text_state_mgr.add_tm(operands)
        elif op == b"Tf":
            text_state_mgr.set_font(fonts[operands[0]], operands[1])
        else:  # handle Tc, Tw, Tz, TL, and Ts operators
            text_state_mgr.set_state_param(op, operands)
    else:
        # for/else: only reached when ops ran out without hitting end_target
        logger_warning(
            f"Unbalanced target operations, expected {end_target!r}.",
            __name__,
        )
    return bt_groups, tj_ops

205

206

def y_coordinate_groups(
    bt_groups: List[BTGroup], debug_path: Optional[Path] = None
) -> Dict[int, List[BTGroup]]:
    """
    Group text operations by rendered y coordinate, i.e. the line number.

    Groups are keyed by ``int(ty * flip_sort)`` in first-seen order and each
    line is sorted by ``tx``. Neighbouring lines whose keys differ by less than
    the smaller of their font heights are merged, unless both render text at
    the same integer x position.

    Args:
        bt_groups: list of dicts as returned by text_show_operations()
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        Dict[int, List[BTGroup]]: dict of lists of text rendered by each BT operator
            keyed by y coordinate. Empty when ``bt_groups`` is empty.

    """
    ty_groups: Dict[int, List[BTGroup]] = {}
    for ty, grp in groupby(
        bt_groups, key=lambda bt_grp: int(bt_grp["ty"] * bt_grp["flip_sort"])
    ):
        # groupby only groups *consecutive* items; accumulate so a y key that
        # reappears later extends the existing line instead of replacing it.
        ty_groups.setdefault(ty, []).extend(grp)
    for line_grps in ty_groups.values():
        line_grps.sort(key=lambda x: x["tx"])

    # nothing to merge (and no "first" key to fetch) when there is no input
    if ty_groups:
        # combine groups whose y coordinates differ by less than the effective font height
        # (accounts for mixed fonts and other minor oddities)
        last_ty = next(iter(ty_groups))
        last_txs = {int(_t["tx"]) for _t in ty_groups[last_ty] if _t["text"].strip()}
        # iterate over a snapshot of the keys because merged keys are popped
        for ty in list(ty_groups)[1:]:
            fsz = min(ty_groups[_y][0]["font_height"] for _y in (ty, last_ty))
            txs = {int(_t["tx"]) for _t in ty_groups[ty] if _t["text"].strip()}
            # prevent merge if both groups are rendering in the same x position.
            no_text_overlap = not (txs & last_txs)
            offset_less_than_font_height = abs(ty - last_ty) < fsz
            if no_text_overlap and offset_less_than_font_height:
                ty_groups[last_ty] = sorted(
                    ty_groups.pop(ty) + ty_groups[last_ty], key=lambda x: x["tx"]
                )
                last_txs |= txs
            else:
                last_ty = ty
                last_txs = txs

    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        debug_path.joinpath("bt_groups.json").write_text(
            json.dumps(ty_groups, indent=2, default=str), "utf-8"
        )
    return ty_groups

253

254

def text_show_operations(
    ops: Iterator[Tuple[List[Any], bytes]],
    fonts: Dict[str, Font],
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
) -> List[BTGroup]:
    """
    Collect the text rendered by every BT/ET operator pair in a content stream.

    The returned groups are left aligned (the smallest ``tx`` is subtracted
    from every ``tx``/``displaced_tx``) and sorted by
    ``(ty * flip_sort, -tx)`` in descending order.

    Args:
        ops (Iterator[Tuple[List, bytes]]): iterator of operators in content stream
        fonts (Dict[str, Font]): font dictionary
        strip_rotated: Removes text if rotated w.r.t. to the page. Defaults to True.
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        List[BTGroup]: list of dicts of text rendered by each BT operator

    """
    state_mgr = TextStateManager()  # transformation stack manager
    bt_groups: List[BTGroup] = []  # BT operator dict
    tj_ops: List[TextStateParams] = []  # Tj/TJ operator data

    for operands, op in ops:
        if op == b"Tf":
            state_mgr.set_font(fonts[operands[0]], operands[1])
        elif op in (b"BT", b"q"):
            # the recursive call consumes ops up to its matching end operator
            end_target: Literal[b"Q", b"ET"] = b"ET" if op == b"BT" else b"Q"
            nested_bts, nested_tjs = recurs_to_target_op(
                ops, state_mgr, end_target, fonts, strip_rotated
            )
            bt_groups.extend(nested_bts)
            tj_ops.extend(nested_tjs)
        else:  # set Tc, Tw, Tz, TL, and Ts if required. ignores all other ops
            state_mgr.set_state_param(op, operands)

    if any(tj.rotated for tj in tj_ops):
        rotated_msg = (
            "Rotated text discovered. Output will be incomplete."
            if strip_rotated
            else "Rotated text discovered. Layout will be degraded."
        )
        logger_warning(rotated_msg, __name__)
    if any(not tj.font.interpretable for tj in tj_ops):
        logger_warning(
            "PDF contains an uninterpretable font. Output will be incomplete.", __name__
        )

    # left align the data, i.e. decrement all tx values by min(tx)
    min_x = min((grp["tx"] for grp in bt_groups), default=0.0)
    ordered = sorted(
        bt_groups, key=lambda x: (x["ty"] * x["flip_sort"], -x["tx"]), reverse=True
    )
    bt_groups = [
        dict(ogrp, tx=ogrp["tx"] - min_x, displaced_tx=ogrp["displaced_tx"] - min_x)  # type: ignore[misc]
        for ogrp in ordered
    ]

    if debug_path:  # pragma: no cover
        import json  # noqa: PLC0415

        bts_file = debug_path.joinpath("bts.json")
        bts_file.write_text(json.dumps(bt_groups, indent=2, default=str), "utf-8")
        tjs_file = debug_path.joinpath("tjs.json")
        tjs_file.write_text(
            json.dumps(
                tj_ops, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
            ),
            "utf-8",
        )
    return bt_groups

325

326

def fixed_char_width(bt_groups: List[BTGroup], scale_weight: float = 1.25) -> float:
    """
    Calculate average character width weighted by the length of the rendered
    text in each sample for conversion to fixed-width layout.

    Args:
        bt_groups (List[BTGroup]): List of dicts of text rendered by each
            BT operator
        scale_weight (float): multiplier applied to each sample's text length
            before it is used as both divisor and weight. Defaults to 1.25.

    Returns:
        float: fixed character width, or 0.0 when there is nothing to measure
            (no groups, only empty text, or a zero ``scale_weight``).

    """
    char_widths = []
    for _bt in bt_groups:
        _len = len(_bt["text"]) * scale_weight
        # a zero-length sample has no per-character width and carries no
        # weight; skipping it avoids dividing by zero
        if not _len:
            continue
        char_widths.append(((_bt["displaced_tx"] - _bt["tx"]) / _len, _len))
    total_len = sum(_l for _, _l in char_widths)
    if not total_len:
        return 0.0
    return sum(_w * _l for _w, _l in char_widths) / total_len

345

346

def fixed_width_page(
    ty_groups: Dict[int, List[BTGroup]],
    char_width: float,
    space_vertically: bool,
    font_height_weight: float,
) -> str:
    """
    Generate page text from text operations grouped by rendered y coordinate.

    Args:
        ty_groups: dict of text show ops as returned by y_coordinate_groups()
        char_width: fixed character width. When 0, no horizontal padding is
            inserted and the text of each line is simply concatenated.
        space_vertically: include blank lines inferred from y distance + font height.
        font_height_weight: multiplier for font height when calculating blank lines.
            When the weighted font height is 0, no blank lines are inserted.

    Returns:
        str: page text in a fixed width format that closely adheres to the rendered
        layout in the source pdf.

    """
    lines: List[str] = []
    last_y_coord = 0
    for y_coord, line_data in ty_groups.items():
        if space_vertically and lines:
            # guard the whole divisor: either factor being 0 would otherwise
            # raise ZeroDivisionError
            line_height = line_data[0]["font_height"] * font_height_weight
            blank_lines = (
                0
                if line_height == 0
                else int(abs(y_coord - last_y_coord) / line_height) - 1
            )
            # a negative count multiplies to an empty list, i.e. adds nothing
            lines.extend([""] * blank_lines)

        line = ""
        last_disp = 0.0
        for bt_op in line_data:
            # target column of this text; 0 when char_width is 0 so that the
            # floor division below cannot raise
            offset = int(bt_op["tx"] // char_width) if char_width else 0
            # pad only when this text starts to the right of where the
            # previous text ended (the bool zeroes the padding otherwise)
            spaces = (offset - len(line)) * (ceil(last_disp) < int(bt_op["tx"]))
            line = f"{line}{' ' * spaces}{bt_op['text']}"
            last_disp = bt_op["displaced_tx"]

        # leading blank lines are dropped; once any line exists, keep them all
        if line.strip() or lines:
            # replace characters with code points 14..31 by a space
            lines.append(
                "".join(c if ord(c) < 14 or ord(c) > 31 else " " for c in line)
            )
            last_y_coord = y_coord
    return "\n".join(ln.rstrip() for ln in lines if space_vertically or ln.strip())

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py: 14%

166 statements