Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfplumber/utils/text.py: 16%

1import inspect

2import itertools

3import logging

4import re

5import string

6from operator import itemgetter

7from typing import (

8 Any,

9 Callable,

10 Dict,

11 Generator,

12 List,

13 Match,

14 Optional,

15 Pattern,

16 Tuple,

17 Union,

18)

20from .._typing import T_bbox, T_dir, T_num, T_obj, T_obj_iter, T_obj_list

21from .clustering import cluster_objects

22from .generic import to_list

23from .geometry import objects_to_bbox

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Fallback tolerances, used when a caller does not pass its own.
DEFAULT_X_TOLERANCE = DEFAULT_Y_TOLERANCE = 3

# Fallback densities; layout distances are divided by these to obtain
# character-column and line counts (see WordMap.to_textmap).
DEFAULT_X_DENSITY = 7.25
DEFAULT_Y_DENSITY = 13

# Fallback directions: lines stack top-to-bottom, characters run left-to-right.
DEFAULT_LINE_DIR: T_dir = "ttb"
DEFAULT_CHAR_DIR: T_dir = "ltr"

# Ligature characters and the plain letters substituted for them when
# ligature expansion is enabled.
LIGATURES = {
    "ﬀ": "ff",
    "ﬃ": "ffi",
    "ﬄ": "ffl",
    "ﬁ": "fi",
    "ﬂ": "fl",
    "ﬆ": "st",
    "ﬅ": "st",
}

def get_line_cluster_key(line_dir: T_dir) -> Callable[[T_obj], T_num]:
    """
    Return the function that yields an object's line-clustering value.

    The returned callable maps an object to `top` for "ttb", negated
    `bottom` for "btt", `x0` for "ltr" and negated `x1` for "rtl".
    Any other `line_dir` raises `KeyError`.
    """
    attr, negate = {
        "ttb": ("top", False),
        "btt": ("bottom", True),
        "ltr": ("x0", False),
        "rtl": ("x1", True),
    }[line_dir]

    if negate:
        return lambda obj: -obj[attr]
    return lambda obj: obj[attr]

def get_char_sort_key(char_dir: T_dir) -> Callable[[T_obj], Tuple[T_num, T_num]]:
    """
    Return the function that yields an object's within-line sort key.

    The returned callable maps an object to a 2-tuple of numbers chosen by
    `char_dir`; sorting ascending on that tuple orders objects along the
    reading direction. Any other `char_dir` raises `KeyError`.
    """

    def top_to_bottom(obj: T_obj) -> Tuple[T_num, T_num]:
        return (obj["top"], obj["bottom"])

    def bottom_to_top(obj: T_obj) -> Tuple[T_num, T_num]:
        return (-(obj["top"] + obj["height"]), -obj["top"])

    def left_to_right(obj: T_obj) -> Tuple[T_num, T_num]:
        # Both elements are x0, so the second never breaks a tie.
        return (obj["x0"], obj["x0"])

    def right_to_left(obj: T_obj) -> Tuple[T_num, T_num]:
        return (-obj["x1"], -obj["x0"])

    sort_keys = {
        "ttb": top_to_bottom,
        "btt": bottom_to_top,
        "ltr": left_to_right,
        "rtl": right_to_left,
    }
    return sort_keys[char_dir]

# For each direction, the getter that picks the starting edge out of a
# 4-tuple bounding box indexed as (x0, top, x1, bottom).
BBOX_ORIGIN_KEYS = {
    direction: itemgetter(index)
    for direction, index in (("ttb", 1), ("btt", 3), ("ltr", 0), ("rtl", 2))
}

# For each direction, the getter that picks the matching edge out of an
# object dict. Its keys also serve as the set of valid direction names.
POSITION_KEYS = {
    direction: itemgetter(attr)
    for direction, attr in (
        ("ttb", "top"),
        ("btt", "bottom"),
        ("ltr", "x0"),
        ("rtl", "x1"),
    )
}

def validate_directions(line_dir: T_dir, char_dir: T_dir, suffix: str = "") -> None:
    """
    Raise `ValueError` unless `line_dir` and `char_dir` form a usable pair.

    Each must be one of the keys of `POSITION_KEYS`, and the two must not
    lie on the same axis. `suffix` is appended to the parameter names in
    the error messages (e.g. "_render").
    """
    valid_dirs = set(POSITION_KEYS.keys())

    for label, value in (("line_dir", line_dir), ("char_dir", char_dir)):
        if value not in valid_dirs:
            raise ValueError(
                f"{label}{suffix} must be one of {valid_dirs}, not {value}"
            )

    # Directions on the same axis are spelled with the same letters
    # ("ttb"/"btt", "ltr"/"rtl"), so comparing letter sets detects them.
    if set(line_dir) == set(char_dir):
        raise ValueError(
            f"line_dir{suffix}={line_dir} is incompatible "
            f"with char_dir{suffix}={char_dir}"
        )

class TextMap:
    """
    A TextMap maps each unicode character in the text to an individual `char`
    object (or, in the case of layout-implied whitespace, `None`).

    Attributes set by the constructor:

    - `tuples`: the list of `(character, char-object-or-None)` pairs.
    - `line_dir_render` / `char_dir_render`: directions used by `to_string`.
    - `as_string`: the rendered text, computed once at construction.
    """

    def __init__(
        self,
        tuples: List[Tuple[str, Optional[T_obj]]],
        line_dir_render: T_dir,
        char_dir_render: T_dir,
    ) -> None:
        """
        Store the tuples and render directions, then render `as_string`.

        Raises `ValueError` (via `validate_directions`) if the two render
        directions are invalid or incompatible.
        """
        validate_directions(line_dir_render, char_dir_render, "_render")
        self.tuples = tuples
        self.line_dir_render = line_dir_render
        self.char_dir_render = char_dir_render
        self.as_string = self.to_string()

    def to_string(self) -> str:
        """
        Render the tuples' characters as one string, honoring the render
        directions.

        For the ("ltr" chars, "ttb" lines) case the characters are joined
        as-is. Otherwise the text is split on newlines; lines are reversed
        in order for "btt"/"rtl" line directions, each line is reversed for
        "rtl" characters, and for "ltr"/"rtl" line directions the lines are
        padded to equal length and transposed so each line becomes a column.
        """
        cd = self.char_dir_render
        ld = self.line_dir_render

        base = "".join(map(itemgetter(0), self.tuples))

        if cd == "ltr" and ld == "ttb":
            return base

        lines = base.split("\n")
        if ld in ("btt", "rtl"):
            lines = list(reversed(lines))

        if cd == "rtl":
            lines = ["".join(reversed(line)) for line in lines]

        if ld not in ("rtl", "ltr"):
            return "\n".join(lines)

        # Lines run horizontally across the page: pad them to a common
        # length (on the left for "btt", otherwise on the right), then
        # transpose so that every line is emitted as a column.
        max_line_length = max(map(len, lines))
        if cd == "btt":
            lines = [line.rjust(max_line_length) for line in lines]
        else:
            lines = [line.ljust(max_line_length) for line in lines]

        return "\n".join(
            "".join(line[i] for line in lines) for i in range(max_line_length)
        )

    def match_to_dict(
        self,
        m: Match[str],
        main_group: int = 0,
        return_groups: bool = True,
        return_chars: bool = True,
    ) -> Dict[str, Any]:
        """
        Convert a regex match against `as_string` into a result dict.

        The dict holds the matched `text` of `main_group` and the bounding
        box (`x0`, `top`, `x1`, `bottom`) of the non-`None` char objects in
        the matched span. `groups` (all regex groups) and `chars` (the char
        objects) are included when the corresponding flag is true.
        """
        # NOTE(review): the match offsets index `as_string`, but are applied
        # to `self.tuples`; the two only share an ordering when `to_string`
        # returns the characters unmodified ("ltr" chars / "ttb" lines).
        subset = self.tuples[m.start(main_group) : m.end(main_group)]
        chars = [c for (text, c) in subset if c is not None]
        x0, top, x1, bottom = objects_to_bbox(chars)

        result = {
            "text": m.group(main_group),
            "x0": x0,
            "top": top,
            "x1": x1,
            "bottom": bottom,
        }

        if return_groups:
            result["groups"] = m.groups()

        if return_chars:
            result["chars"] = chars

        return result

    def search(
        self,
        pattern: Union[str, Pattern[str]],
        regex: bool = True,
        case: bool = True,
        return_groups: bool = True,
        return_chars: bool = True,
        main_group: int = 0,
    ) -> List[Dict[str, Any]]:
        """
        Find every match of `pattern` in `as_string`.

        `pattern` is either a string or a compiled pattern. For strings,
        `regex=False` escapes the pattern so it is matched literally and
        `case=False` makes the search case-insensitive. Passing either of
        those as `False` together with a compiled pattern raises
        `ValueError`.

        Returns one `match_to_dict` result per match whose `main_group`
        contains non-whitespace text.
        """
        if isinstance(pattern, Pattern):
            if regex is False:
                raise ValueError(
                    "Cannot pass a compiled search pattern *and* regex=False together."
                )
            if case is False:
                raise ValueError(
                    "Cannot pass a compiled search pattern *and* case=False together."
                )
            compiled = pattern
        else:
            if regex is False:
                pattern = re.escape(pattern)

            flags = re.I if case is False else 0
            compiled = re.compile(pattern, flags)

        # Drop zero-length matches (can happen, e.g., with optional patterns
        # in regexes) and whitespace-only matches. `group()` returns None
        # when `main_group` did not take part in the match; treat that the
        # same as an empty match rather than failing on `None.strip()`.
        return [
            self.match_to_dict(
                m,
                return_groups=return_groups,
                return_chars=return_chars,
                main_group=main_group,
            )
            for m in compiled.finditer(self.as_string)
            if (m.group(main_group) or "").strip()
        ]

    def extract_text_lines(
        self, strip: bool = True, return_chars: bool = True
    ) -> List[Dict[str, Any]]:
        """
        Return one result dict per non-blank line of `as_string`.

        `strip` is analogous to Python's `str.strip()` method, and returns
        `text` attributes without their surrounding spaces. Only relevant
        when the TextMap was created with `layout` = True.

        Setting `return_chars` to False will exclude the individual
        character objects from the returned text-line dicts.
        """
        if strip:
            pat = r" *([^\n]+?) *(\n|$)"
        else:
            pat = r"([^\n]+)"

        return self.search(
            pat, main_group=1, return_chars=return_chars, return_groups=False
        )

231

232

class WordMap:
    """
    A WordMap maps words->chars.

    `tuples` is a list of `(word, chars)` pairs, where `word` is a word
    object and `chars` is the list of char objects it was built from.
    """

    def __init__(self, tuples: List[Tuple[T_obj, T_obj_list]]) -> None:
        self.tuples = tuples

    def to_textmap(
        self,
        layout: bool = False,
        layout_width: T_num = 0,
        layout_height: T_num = 0,
        layout_width_chars: int = 0,
        layout_height_chars: int = 0,
        layout_bbox: T_bbox = (0, 0, 0, 0),
        x_density: T_num = DEFAULT_X_DENSITY,
        y_density: T_num = DEFAULT_Y_DENSITY,
        x_shift: T_num = 0,
        y_shift: T_num = 0,
        y_tolerance: T_num = DEFAULT_Y_TOLERANCE,
        line_dir: T_dir = DEFAULT_LINE_DIR,
        char_dir: T_dir = DEFAULT_CHAR_DIR,
        line_dir_rotated: Optional[T_dir] = None,
        char_dir_rotated: Optional[T_dir] = None,
        char_dir_render: Optional[T_dir] = None,
        line_dir_render: Optional[T_dir] = None,
        use_text_flow: bool = False,
        presorted: bool = False,
        expand_ligatures: bool = True,
    ) -> TextMap:
        """
        Given a list of (word, chars) tuples (i.e., a WordMap), return a list of
        (char-text, char) tuples (i.e., a TextMap) that can be used to mimic
        the structural layout of the text on the page(s), using the following
        approach for top-to-bottom, left-to-right text:

        - Sort the words by their line position, unless `presorted` or
          `use_text_flow` is set.

        - Cluster the words into lines (taking `y_tolerance` into account),
          and iterate through them.

        - With `layout` set, for each line divide its distance from the
          layout origin (adjusted by `y_shift`) by `y_density` to calculate
          the minimum number of newlines that should come before it. Append
          that number of newlines *minus* the number of newlines already
          appended, with a minimum of one (zero for the first line).

        - Then, within each line, iterate through each word. Divide the
          word's distance from the layout origin (adjusted by `x_shift`) by
          `x_density` to calculate the minimum number of characters that
          should come before this word. Append that number of spaces *minus*
          the number of characters already on the line, with a minimum of
          one (zero for the first word). Then append the word's characters,
          expanding ligatures if `expand_ligatures` is set.

        - At the termination of each line, add more spaces if necessary to
          mimic `layout_width`.

        - Finally, add newlines to the end if necessary to mimic
          `layout_height`.

        For other line/character directions (e.g., bottom-to-top,
        right-to-left), these steps are adjusted.

        Raises `ValueError` when, with `layout` set, both `layout_width` and
        `layout_width_chars` (or both `layout_height` and
        `layout_height_chars`) are given.
        """
        _textmap: List[Tuple[str, Optional[T_obj]]] = []

        if not len(self.tuples):
            return TextMap(
                _textmap,
                line_dir_render=line_dir_render or line_dir,
                char_dir_render=char_dir_render or char_dir,
            )

        expansions = LIGATURES if expand_ligatures else {}

        if layout:
            if layout_width_chars:
                if layout_width:
                    raise ValueError(
                        "`layout_width` and `layout_width_chars` cannot both be set."
                    )
            else:
                layout_width_chars = int(round(layout_width / x_density))

            if layout_height_chars:
                if layout_height:
                    raise ValueError(
                        "`layout_height` and `layout_height_chars` cannot both be set."
                    )
            else:
                layout_height_chars = int(round(layout_height / y_density))

            blank_line = [(" ", None)] * layout_width_chars
        else:
            blank_line = []

        num_newlines = 0
        keep_given_order = presorted or use_text_flow

        line_cluster_key = get_line_cluster_key(line_dir)
        char_sort_key = get_char_sort_key(char_dir)

        line_position_key = POSITION_KEYS[line_dir]
        char_position_key = POSITION_KEYS[char_dir]

        y_origin = BBOX_ORIGIN_KEYS[line_dir](layout_bbox)
        x_origin = BBOX_ORIGIN_KEYS[char_dir](layout_bbox)

        # Distances are measured away from the origin edge, so directions
        # that run toward smaller coordinates flip the sign.
        line_sign = -1 if line_dir in ["btt", "rtl"] else 1
        char_sign = -1 if char_dir in ["btt", "rtl"] else 1

        words_sorted_line_dir = (
            self.tuples
            if keep_given_order
            else sorted(self.tuples, key=lambda x: line_cluster_key(x[0]))
        )

        tuples_by_line = cluster_objects(
            words_sorted_line_dir,
            lambda x: line_cluster_key(x[0]),
            y_tolerance,
            preserve_order=keep_given_order,
        )

        for i, line_tuples in enumerate(tuples_by_line):
            if layout:
                line_position = line_position_key(line_tuples[0][0])
                y_dist_raw = line_position - (y_origin + y_shift)
                y_dist = y_dist_raw * line_sign / y_density
            else:
                y_dist = 0

            num_newlines_prepend = max(
                # At least one newline, unless this is the first line
                int(i > 0),
                # ... or as many as needed to get the imputed "distance" from the top
                round(y_dist) - num_newlines,
            )

            for _ in range(num_newlines_prepend):
                # A newline directly after another newline (or at the very
                # start) denotes an empty line; fill it with blanks first.
                if not len(_textmap) or _textmap[-1][0] == "\n":
                    _textmap += blank_line
                _textmap.append(("\n", None))

            num_newlines += num_newlines_prepend

            line_len = 0

            line_tuples_sorted = (
                line_tuples
                if keep_given_order
                else sorted(line_tuples, key=lambda x: char_sort_key(x[0]))
            )

            for word, chars in line_tuples_sorted:
                if layout:
                    char_position = char_position_key(word)
                    x_dist_raw = char_position - (x_origin + x_shift)
                    x_dist = x_dist_raw * char_sign / x_density
                else:
                    x_dist = 0

                # One separating space between words (none before the first
                # word), or enough spaces to reach the imputed column.
                num_spaces_prepend = max(min(1, line_len), round(x_dist) - line_len)
                _textmap += [(" ", None)] * num_spaces_prepend
                line_len += num_spaces_prepend

                for c in chars:
                    letters = expansions.get(c["text"], c["text"])
                    for letter in letters:
                        # Every expanded letter points back at the same char.
                        _textmap.append((letter, c))
                        line_len += 1

            # Append spaces at end of line
            if layout:
                _textmap += [(" ", None)] * (layout_width_chars - line_len)

        # Append blank lines at end of text
        if layout:
            num_newlines_append = layout_height_chars - (num_newlines + 1)
            for i in range(num_newlines_append):
                if i > 0:
                    _textmap += blank_line
                _textmap.append(("\n", None))

            # Remove terminal newline. `_textmap` can still be empty here
            # if none of the words carried any chars, so check before
            # indexing.
            if _textmap and _textmap[-1] == ("\n", None):
                _textmap = _textmap[:-1]

        return TextMap(
            _textmap,
            line_dir_render=line_dir_render or line_dir,
            char_dir_render=char_dir_render or char_dir,
        )

421

422

class WordExtractor:
    """
    Groups char objects into lines and words according to the configured
    tolerances and reading directions.
    """

    def __init__(
        self,
        x_tolerance: T_num = DEFAULT_X_TOLERANCE,
        y_tolerance: T_num = DEFAULT_Y_TOLERANCE,
        x_tolerance_ratio: Union[int, float, None] = None,
        y_tolerance_ratio: Union[int, float, None] = None,
        keep_blank_chars: bool = False,
        use_text_flow: bool = False,
        vertical_ttb: bool = True,  # Should vertical words be read top-to-bottom?
        horizontal_ltr: bool = True,  # Should words be read left-to-right?
        line_dir: T_dir = DEFAULT_LINE_DIR,
        char_dir: T_dir = DEFAULT_CHAR_DIR,
        line_dir_rotated: Optional[T_dir] = None,
        char_dir_rotated: Optional[T_dir] = None,
        extra_attrs: Optional[List[str]] = None,
        split_at_punctuation: Union[bool, str] = False,
        expand_ligatures: bool = True,
    ):
        """
        Store the extraction settings.

        - `x_tolerance_ratio` / `y_tolerance_ratio`, when not None, replace
          the fixed tolerances with the ratio times the previous char's
          `size` during word splitting.
        - `vertical_ttb` / `horizontal_ltr` are deprecated; passing False
          logs a warning.
        - `line_dir_rotated` / `char_dir_rotated` default to `char_dir` /
          `line_dir` respectively.
        - `split_at_punctuation` is True (use `string.punctuation`), a
          string of characters to split at, or falsy (no splitting).

        Raises `ValueError` (via `validate_directions`) for an invalid or
        incompatible direction pair.
        """
        self.x_tolerance = x_tolerance
        self.y_tolerance = y_tolerance
        self.x_tolerance_ratio = x_tolerance_ratio
        self.y_tolerance_ratio = y_tolerance_ratio
        self.keep_blank_chars = keep_blank_chars
        self.use_text_flow = use_text_flow
        self.horizontal_ltr = horizontal_ltr
        self.vertical_ttb = vertical_ttb
        if vertical_ttb is False:
            logger.warning(
                "vertical_ttb is deprecated and will be removed;"
                " use line_dir/char_dir instead."
            )
        if horizontal_ltr is False:
            logger.warning(
                "horizontal_ltr is deprecated and will be removed;"
                " use line_dir/char_dir instead."
            )
        self.line_dir = line_dir
        self.char_dir = char_dir
        # Default is to "flip" the directions for rotated text
        self.line_dir_rotated = line_dir_rotated or char_dir
        self.char_dir_rotated = char_dir_rotated or line_dir
        validate_directions(self.line_dir, self.char_dir)
        validate_directions(self.line_dir_rotated, self.char_dir_rotated, "_rotated")
        self.extra_attrs = [] if extra_attrs is None else extra_attrs

        # Note: string.punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        self.split_at_punctuation = (
            string.punctuation
            if split_at_punctuation is True
            else (split_at_punctuation or "")
        )

        self.expansions = LIGATURES if expand_ligatures else {}

    def get_char_dir(self, upright: int) -> T_dir:
        """
        Return the character direction for upright or rotated text.

        The deprecated `vertical_ttb=False` / `horizontal_ltr=False` flags
        take precedence and force "btt" (rotated) / "rtl" (upright).
        """
        # Note: This can be simplified and reincorporated into .merge_chars and
        # .iter_chars_to_lines once .vertical_ttb and .horizontal_ltr
        # deprecation is complete.
        if not upright and not self.vertical_ttb:
            return "btt"

        elif upright and not self.horizontal_ltr:
            return "rtl"

        return self.char_dir if upright else self.char_dir_rotated

    def merge_chars(self, ordered_chars: T_obj_list) -> T_obj:
        """
        Build a word object from a non-empty, ordered list of chars.

        The word's text is the chars' text joined together (with ligatures
        expanded if enabled), its box is the chars' bounding box, and
        `upright`, the `doctop` offset and every `extra_attrs` value are
        taken from the first char.
        """
        x0, top, x1, bottom = objects_to_bbox(ordered_chars)
        first = ordered_chars[0]
        doctop_adj = first["doctop"] - first["top"]
        upright = first["upright"]
        char_dir = self.get_char_dir(upright)

        word = {
            "text": "".join(
                self.expansions.get(c["text"], c["text"]) for c in ordered_chars
            ),
            "x0": x0,
            "x1": x1,
            "top": top,
            "doctop": top + doctop_adj,
            "bottom": bottom,
            "upright": upright,
            "height": bottom - top,
            "width": x1 - x0,
            "direction": char_dir,
        }

        for key in self.extra_attrs:
            word[key] = first[key]

        return word

    def char_begins_new_word(
        self,
        prev_char: T_obj,
        curr_char: T_obj,
        direction: T_dir,
        x_tolerance: T_num,
        y_tolerance: T_num,
    ) -> bool:
        """
        Decide whether `curr_char` starts a new word after `prev_char`.

        `direction` is the reading direction of the characters:

        - For "ltr"/"rtl" the *intra*line distance is measured along x
          (with `x_tolerance`) and the *inter*line distance along `top`
          (with `y_tolerance`).
        - For "ttb"/"btt" the roles are flipped: intraline distance is
          measured along top/bottom (with `y_tolerance`) and interline
          distance along `x0` (with `x_tolerance`).

        The *intra*line distance is measured from the *end* of the previous
        character to the *beginning* of the current character, while the
        *inter*line distance is measured between the two characters' line
        positions (`top`, or `x0` for vertical text), not between facing
        edges. Successive lines' bounding boxes often overlap slightly, and
        that overlap should not be read as the two being on the same line.

        A new word begins when the current char starts before the previous
        char's start, starts more than the intraline tolerance past the
        previous char's end, or sits more than the interline tolerance away
        on the other axis.
        """
        # Coordinates are negated for "rtl"/"btt" so that, in every case,
        # reading order corresponds to increasing values.
        if direction in ("ltr", "rtl"):
            intra_tol = x_tolerance
            inter_tol = y_tolerance
            prev_line_pos = prev_char["top"]
            curr_line_pos = curr_char["top"]
            if direction == "ltr":
                prev_start = prev_char["x0"]
                prev_end = prev_char["x1"]
                curr_start = curr_char["x0"]
            else:
                prev_start = -prev_char["x1"]
                prev_end = -prev_char["x0"]
                curr_start = -curr_char["x1"]

        else:
            intra_tol = y_tolerance
            inter_tol = x_tolerance
            prev_line_pos = prev_char["x0"]
            curr_line_pos = curr_char["x0"]
            if direction == "ttb":
                prev_start = prev_char["top"]
                prev_end = prev_char["bottom"]
                curr_start = curr_char["top"]
            else:
                prev_start = -prev_char["bottom"]
                prev_end = -prev_char["top"]
                curr_start = -curr_char["bottom"]

        return bool(
            # Intraline test
            (curr_start < prev_start)
            or (curr_start > prev_end + intra_tol)
            # Interline test
            or abs(curr_line_pos - prev_line_pos) > inter_tol
        )

    def iter_chars_to_words(
        self,
        ordered_chars: T_obj_iter,
        direction: T_dir,
    ) -> Generator[T_obj_list, None, None]:
        """
        Split an ordered run of chars into words, yielding each word as a
        list of chars.

        Whitespace chars end the current word and are dropped unless
        `keep_blank_chars` is set; chars listed in `split_at_punctuation`
        become single-char words; otherwise `char_begins_new_word` decides
        whether a char extends the current word or starts the next one.
        """
        current_word: T_obj_list = []

        def start_next_word(
            new_char: Optional[T_obj],
        ) -> Generator[T_obj_list, None, None]:
            # Emit the word in progress (if any), then begin a new one that
            # is empty or seeded with `new_char`.
            nonlocal current_word

            if current_word:
                yield current_word

            current_word = [] if new_char is None else [new_char]

        xt = self.x_tolerance
        xtr = self.x_tolerance_ratio
        yt = self.y_tolerance
        ytr = self.y_tolerance_ratio

        for char in ordered_chars:
            text = char["text"]

            if not self.keep_blank_chars and text.isspace():
                yield from start_next_word(None)

            # `in` on strings is a substring test, and the empty string is a
            # substring of every string (including ""). Require non-empty
            # text so a char with empty text is not treated as punctuation.
            elif text and text in self.split_at_punctuation:
                yield from start_next_word(char)
                yield from start_next_word(None)

            elif current_word and self.char_begins_new_word(
                current_word[-1],
                char,
                direction,
                x_tolerance=(xt if xtr is None else xtr * current_word[-1]["size"]),
                y_tolerance=(yt if ytr is None else ytr * current_word[-1]["size"]),
            ):
                yield from start_next_word(char)

            else:
                current_word.append(char)

        # Finally, after all chars processed
        if current_word:
            yield current_word

    def iter_chars_to_lines(
        self, chars: T_obj_iter
    ) -> Generator[Tuple[T_obj_list, T_dir], None, None]:
        """
        Cluster a non-empty group of chars into lines.

        Yields `(chars_sorted, char_dir)` per line, with the chars sorted
        along `char_dir`. Directions are chosen from the first char's
        `upright` value, so all chars are expected to share it.
        """
        chars = list(chars)
        upright = chars[0]["upright"]
        line_dir = self.line_dir if upright else self.line_dir_rotated
        char_dir = self.get_char_dir(upright)

        line_cluster_key = get_line_cluster_key(line_dir)
        char_sort_key = get_char_sort_key(char_dir)

        # Cluster by line, using the tolerance of the axis lines stack along
        subclusters = cluster_objects(
            chars,
            line_cluster_key,
            (self.y_tolerance if line_dir in ("ttb", "btt") else self.x_tolerance),
        )

        for sc in subclusters:
            # Sort within line
            chars_sorted = sorted(sc, key=char_sort_key)
            yield (chars_sorted, char_dir)

    def iter_extract_tuples(
        self, chars: T_obj_iter
    ) -> Generator[Tuple[T_obj, T_obj_list], None, None]:
        """
        Yield `(word, word_chars)` for every word found in `chars`.

        Consecutive chars sharing the same `upright` and `extra_attrs`
        values are processed together. With `use_text_flow` the chars are
        kept in their given order as a single line; otherwise they are
        clustered into lines and sorted first.
        """
        grouping_key = itemgetter("upright", *self.extra_attrs)
        grouped_chars = itertools.groupby(chars, grouping_key)

        for keyvals, char_group in grouped_chars:
            # NOTE(review): in text-flow mode the split direction is always
            # `self.char_dir`, while `merge_chars` reports the direction
            # from `get_char_dir(upright)` - confirm this is intended for
            # rotated text.
            line_groups = (
                [(char_group, self.char_dir)]
                if self.use_text_flow
                else self.iter_chars_to_lines(char_group)
            )
            for line_chars, direction in line_groups:
                for word_chars in self.iter_chars_to_words(line_chars, direction):
                    yield (self.merge_chars(word_chars), word_chars)

    def extract_wordmap(self, chars: T_obj_iter) -> WordMap:
        """Return a `WordMap` of every `(word, word_chars)` pair in `chars`."""
        return WordMap(list(self.iter_extract_tuples(chars)))

    def extract_words(
        self, chars: T_obj_list, return_chars: bool = False
    ) -> T_obj_list:
        """
        Return the list of word objects found in `chars`.

        With `return_chars` set, each returned word is a copy that also
        carries its source chars under the `"chars"` key.
        """
        if return_chars:
            return [
                {**word, "chars": word_chars}
                for word, word_chars in self.iter_extract_tuples(chars)
            ]
        return [word for word, _ in self.iter_extract_tuples(chars)]

693

694

def extract_words(
    chars: T_obj_list, return_chars: bool = False, **kwargs: Any
) -> T_obj_list:
    """
    Extract word objects from `chars` with a one-off `WordExtractor`.

    All extra keyword arguments are forwarded to the `WordExtractor`
    constructor; `return_chars` is forwarded to its `extract_words` method.
    """
    extractor = WordExtractor(**kwargs)
    return extractor.extract_words(chars, return_chars)

699

700

# Parameter names of `WordMap.to_textmap` and of the `WordExtractor`
# constructor. They are used to route a shared **kwargs dict to whichever
# of the two accepts each key. `TEXTMAP_KWARGS` is taken from the plain
# function, so it also contains "self".
TEXTMAP_KWARGS = inspect.signature(WordMap.to_textmap).parameters.keys()
WORD_EXTRACTOR_KWARGS = inspect.signature(WordExtractor).parameters.keys()

703

704

def chars_to_textmap(chars: T_obj_list, **kwargs: Any) -> TextMap:
    """
    Build a `TextMap` directly from char objects.

    The chars are grouped into words by a `WordExtractor` and the words are
    laid out by `WordMap.to_textmap`; each receives the subset of `kwargs`
    it accepts. `presorted` is always forced to True, and `layout_bbox`
    falls back to the bounding box of `chars` when missing or falsy.
    """
    kwargs["layout_bbox"] = kwargs.get("layout_bbox") or objects_to_bbox(chars)
    kwargs["presorted"] = True

    extractor_kwargs = {
        name: value for name, value in kwargs.items() if name in WORD_EXTRACTOR_KWARGS
    }
    textmap_kwargs = {
        name: value for name, value in kwargs.items() if name in TEXTMAP_KWARGS
    }

    wordmap = WordExtractor(**extractor_kwargs).extract_wordmap(chars)
    return wordmap.to_textmap(**textmap_kwargs)

721

722

def extract_text(
    chars: T_obj_list,
    line_dir_render: Optional[T_dir] = None,
    char_dir_render: Optional[T_dir] = None,
    **kwargs: Any,
) -> str:
    """
    Return the text of `chars` as a single string.

    With `layout` truthy in `kwargs`, the text is produced by
    `chars_to_textmap` (all `kwargs` are forwarded). Otherwise the chars
    are grouped into words, the words are clustered into lines, words are
    joined with single spaces and lines with newlines.

    `line_dir_render` / `char_dir_render` control how the result is
    rendered and default to the extraction directions. An empty `chars`
    yields `""`.
    """
    chars = to_list(chars)
    if len(chars) == 0:
        return ""

    if kwargs.get("layout"):
        textmap_kwargs = {
            **kwargs,
            "line_dir_render": line_dir_render,
            "char_dir_render": char_dir_render,
        }
        return chars_to_textmap(chars, **textmap_kwargs).as_string

    extractor = WordExtractor(
        **{k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
    )
    words = extractor.extract_words(chars)

    line_dir_render = line_dir_render or extractor.line_dir
    char_dir_render = char_dir_render or extractor.char_dir

    line_cluster_key = get_line_cluster_key(extractor.line_dir)

    x_tolerance = kwargs.get("x_tolerance", DEFAULT_X_TOLERANCE)
    y_tolerance = kwargs.get("y_tolerance", DEFAULT_Y_TOLERANCE)

    # The clustering key comes from the *extraction* line direction, so the
    # tolerance must be picked for that same axis. Picking it by the render
    # direction would apply the wrong axis's tolerance whenever the two
    # orientations differ.
    line_tolerance = (
        y_tolerance if extractor.line_dir in ("ttb", "btt") else x_tolerance
    )

    lines = cluster_objects(words, line_cluster_key, line_tolerance)

    text = "\n".join(" ".join(word["text"] for word in line) for line in lines)

    # No char objects are attached here; the TextMap is only used to apply
    # the render directions to the text.
    return TextMap(
        [(char, None) for char in text],
        line_dir_render=line_dir_render,
        char_dir_render=char_dir_render,
    ).as_string

769

770

def collate_line(
    line_chars: T_obj_list,
    tolerance: T_num = DEFAULT_X_TOLERANCE,
) -> str:
    """
    Join the chars of one line into a string, ordered by `x0`.

    A single space is inserted before any char whose `x0` lies more than
    `tolerance` past the previous char's `x1`.
    """
    pieces = []
    prev_x1 = None
    for char in sorted(line_chars, key=itemgetter("x0")):
        gap_exceeds = prev_x1 is not None and char["x0"] > (prev_x1 + tolerance)
        if gap_exceeds:
            pieces.append(" ")
        prev_x1 = char["x1"]
        pieces.append(char["text"])
    return "".join(pieces)

783

784

def extract_text_simple(
    chars: T_obj_list,
    x_tolerance: T_num = DEFAULT_X_TOLERANCE,
    y_tolerance: T_num = DEFAULT_Y_TOLERANCE,
) -> str:
    """
    Return the text of `chars` using a simple line-by-line approach.

    Chars are clustered into lines by `doctop` (within `y_tolerance`);
    each line is collated with `collate_line` using `x_tolerance`, and the
    lines are joined with newlines.
    """
    lines = cluster_objects(chars, itemgetter("doctop"), y_tolerance)
    collated = [collate_line(line, x_tolerance) for line in lines]
    return "\n".join(collated)

792

793

def dedupe_chars(
    chars: T_obj_list,
    tolerance: T_num = 1,
    extra_attrs: Optional[Tuple[str, ...]] = ("fontname", "size"),
) -> T_obj_list:
    """
    Removes duplicate chars — those sharing the same text and positioning
    (within `tolerance`) as other characters in the set. Use `extra_attrs`
    to be more restrictive with the properties shared by the matching chars.

    The surviving chars are returned in the order in which they appear in
    `chars`.
    """
    group_key = itemgetter("upright", "text", *(extra_attrs or ()))
    by_doctop = itemgetter("doctop")
    by_x0 = itemgetter("x0")
    position = itemgetter("doctop", "x0")

    survivors = []
    # Chars can only be duplicates of each other if they agree on the
    # grouping attributes; within such a group, cluster by doctop and then
    # by x0, and keep the first char (lowest doctop, then x0) per cluster.
    same_attrs = itertools.groupby(sorted(chars, key=group_key), key=group_key)
    for _, members in same_attrs:
        for row in cluster_objects(list(members), by_doctop, tolerance):
            for cell in cluster_objects(row, by_x0, tolerance):
                survivors.append(min(cell, key=position))

    # Restore the input ordering.
    return sorted(survivors, key=chars.index)