Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfplumber/utils/text.py: 16%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

315 statements  

1import inspect 

2import itertools 

3import logging 

4import re 

5import string 

6from operator import itemgetter 

7from typing import ( 

8 Any, 

9 Callable, 

10 Dict, 

11 Generator, 

12 List, 

13 Match, 

14 Optional, 

15 Pattern, 

16 Tuple, 

17 Union, 

18) 

19 

20from .._typing import T_bbox, T_dir, T_num, T_obj, T_obj_iter, T_obj_list 

21from .clustering import cluster_objects 

22from .generic import to_list 

23from .geometry import objects_to_bbox 

24 

# Module-level logger; WordExtractor uses it to emit deprecation warnings.
logger = logging.getLogger(__name__)

# Default distance tolerances used when grouping chars into words and lines.
DEFAULT_X_TOLERANCE = 3
DEFAULT_Y_TOLERANCE = 3
# Default layout densities: WordMap.to_textmap divides page distances by
# these to obtain a number of output characters (x) or output lines (y).
DEFAULT_X_DENSITY = 7.25
DEFAULT_Y_DENSITY = 13
# Default directions: lines progress top-to-bottom, and characters within a
# line progress left-to-right.
DEFAULT_LINE_DIR: T_dir = "ttb"
DEFAULT_CHAR_DIR: T_dir = "ltr"

33 

# Maps single-code-point typographic ligatures (Unicode "Alphabetic
# Presentation Forms", U+FB00-U+FB06) to the plain letters they stand for.
# The keys are written as escapes on purpose: if this file is ever passed
# through Unicode normalization, literal ligature characters collapse into
# their own expansions, which silently turns every entry into a no-op.
LIGATURES = {
    "\ufb00": "ff",  # LATIN SMALL LIGATURE FF
    "\ufb03": "ffi",  # LATIN SMALL LIGATURE FFI
    "\ufb04": "ffl",  # LATIN SMALL LIGATURE FFL
    "\ufb01": "fi",  # LATIN SMALL LIGATURE FI
    "\ufb02": "fl",  # LATIN SMALL LIGATURE FL
    "\ufb06": "st",  # LATIN SMALL LIGATURE ST
    "\ufb05": "st",  # LATIN SMALL LIGATURE LONG S T
}

43 

44 

def get_line_cluster_key(line_dir: T_dir) -> Callable[[T_obj], T_num]:
    """
    Return the function used to place an object along the line direction.

    The returned callable maps an object to one number: its "top" or "x0"
    for "ttb"/"ltr", and its negated "bottom" or "x1" for "btt"/"rtl", so
    that values always grow in the direction lines progress. Any other
    `line_dir` raises `KeyError`.
    """

    def by_top(obj):
        return obj["top"]

    def by_negated_bottom(obj):
        return -obj["bottom"]

    def by_x0(obj):
        return obj["x0"]

    def by_negated_x1(obj):
        return -obj["x1"]

    key_for_direction = {
        "ttb": by_top,
        "btt": by_negated_bottom,
        "ltr": by_x0,
        "rtl": by_negated_x1,
    }
    return key_for_direction[line_dir]

52 

53 

def get_char_sort_key(char_dir: T_dir) -> Callable[[T_obj], Tuple[T_num, T_num]]:
    """
    Return the function used to order objects within a line.

    The returned callable maps an object to a two-number tuple that sorts
    ascending in the direction characters progress; coordinates are negated
    for the reversed directions ("btt", "rtl"). Any other `char_dir` raises
    `KeyError`.
    """

    def top_to_bottom(obj):
        return (obj["top"], obj["bottom"])

    def bottom_to_top(obj):
        # The first element is the negated bottom edge, computed here as
        # top + height rather than read from "bottom".
        return (-(obj["top"] + obj["height"]), -obj["top"])

    def left_to_right(obj):
        # Both elements are x0; the tuple shape matches the other keys.
        return (obj["x0"], obj["x0"])

    def right_to_left(obj):
        return (-obj["x1"], -obj["x0"])

    key_for_direction = {
        "ttb": top_to_bottom,
        "btt": bottom_to_top,
        "ltr": left_to_right,
        "rtl": right_to_left,
    }
    return key_for_direction[char_dir]

61 

62 

# For each direction, the getter that picks the starting edge out of an
# (x0, top, x1, bottom) bounding-box tuple.
BBOX_ORIGIN_KEYS = {
    direction: itemgetter(bbox_index)
    for direction, bbox_index in (("ttb", 1), ("btt", 3), ("ltr", 0), ("rtl", 2))
}

# For each direction, the getter that picks the matching edge out of an
# object dict. The keys of this table also define the set of valid directions.
POSITION_KEYS = {
    direction: itemgetter(attr)
    for direction, attr in (
        ("ttb", "top"),
        ("btt", "bottom"),
        ("ltr", "x0"),
        ("rtl", "x1"),
    )
}

76 

77 

def validate_directions(line_dir: T_dir, char_dir: T_dir, suffix: str = "") -> None:
    """
    Check that `line_dir` and `char_dir` form a usable pair of directions.

    `suffix` is appended to the parameter names in error messages (e.g.
    "_render"), so the message names the argument the caller actually used.

    Raises `ValueError` when either direction is not one of the keys of
    `POSITION_KEYS`, or when both directions lie on the same axis.
    """
    valid_dirs = set(POSITION_KEYS)

    if line_dir not in valid_dirs:
        message = f"line_dir{suffix} must be one of {valid_dirs}, not {line_dir}"
        raise ValueError(message)

    if char_dir not in valid_dirs:
        message = f"char_dir{suffix} must be one of {valid_dirs}, not {char_dir}"
        raise ValueError(message)

    # The two codes of one axis are spelled with the same letters ("ttb" and
    # "btt"; "ltr" and "rtl"), so equal letter sets mean "same axis".
    same_axis = set(line_dir) == set(char_dir)
    if same_axis:
        message = (
            f"line_dir{suffix}={line_dir} is incompatible "
            f"with char_dir{suffix}={char_dir}"
        )
        raise ValueError(message)

93 

94 

class TextMap:
    """
    A TextMap maps each unicode character in the text to an individual `char`
    object (or, in the case of layout-implied whitespace, `None`).

    `tuples` holds the `(text, char-or-None)` pairs; `as_string` is the text
    rendered once, at construction time, by `to_string()`.
    """

    def __init__(
        self,
        tuples: List[Tuple[str, Optional[T_obj]]],
        line_dir_render: T_dir,
        char_dir_render: T_dir,
    ) -> None:
        """
        Store the tuples and render directions, then pre-render `as_string`.

        Raises `ValueError` (from `validate_directions`) when the render
        directions are invalid or incompatible with each other.
        """
        validate_directions(line_dir_render, char_dir_render, "_render")
        self.tuples = tuples
        self.line_dir_render = line_dir_render
        self.char_dir_render = char_dir_render
        self.as_string = self.to_string()

    def to_string(self) -> str:
        """
        Render the tuples' text according to the render directions.

        For the default pair (chars "ltr", lines "ttb") the text pieces are
        simply concatenated. Otherwise the text is split on newlines and the
        lines are reordered, reversed, and/or transposed as needed.
        """
        cd = self.char_dir_render
        ld = self.line_dir_render

        base = "".join(map(itemgetter(0), self.tuples))

        if cd == "ltr" and ld == "ttb":
            return base

        lines = base.split("\n")

        # Lines that progress bottom-to-top or right-to-left are emitted in
        # reverse order.
        if ld in ("btt", "rtl"):
            lines = list(reversed(lines))

        if cd == "rtl":
            lines = ["".join(reversed(line)) for line in lines]

        if ld in ("rtl", "ltr"):
            # Lines progress horizontally, so each line becomes a column of
            # the output: pad all lines to equal length, then transpose.
            max_line_length = max(map(len, lines))
            if cd == "btt":
                # Pad at the front so the text ends on the last output row.
                lines = [(" " * (max_line_length - len(line))) + line for line in lines]
            else:
                lines = [line + (" " * (max_line_length - len(line))) for line in lines]
            return "\n".join(
                "".join(line[i] for line in lines) for i in range(max_line_length)
            )

        return "\n".join(lines)

    def match_to_dict(
        self,
        m: Match[str],
        main_group: int = 0,
        return_groups: bool = True,
        return_chars: bool = True,
    ) -> Dict[str, Any]:
        """
        Convert a regex match on `as_string` into a dict describing it.

        The result has the matched "text" of `main_group` and the bounding
        box ("x0", "top", "x1", "bottom") of the non-`None` chars covered by
        that group; "groups" and "chars" are added when requested.
        """
        # NOTE(review): the match offsets index into `as_string` but are used
        # to slice `self.tuples`; for render directions other than ltr/ttb,
        # `to_string` rearranges the text, so the two may not line up — confirm.
        subset = self.tuples[m.start(main_group) : m.end(main_group)]
        chars = [c for (text, c) in subset if c is not None]
        x0, top, x1, bottom = objects_to_bbox(chars)

        result = {
            "text": m.group(main_group),
            "x0": x0,
            "top": top,
            "x1": x1,
            "bottom": bottom,
        }

        if return_groups:
            result["groups"] = m.groups()

        if return_chars:
            result["chars"] = chars

        return result

    def search(
        self,
        pattern: Union[str, Pattern[str]],
        regex: bool = True,
        case: bool = True,
        return_groups: bool = True,
        return_chars: bool = True,
        main_group: int = 0,
    ) -> List[Dict[str, Any]]:
        """
        Find all matches of `pattern` in `as_string`.

        `pattern` is a regex string, a literal string (with `regex=False`),
        or a compiled pattern. `case=False` makes string patterns
        case-insensitive. `main_group` selects the group whose text and
        position are reported. Returns one `match_to_dict` result per match.

        Raises `ValueError` when a compiled pattern is combined with
        `regex=False` or `case=False`.
        """
        if isinstance(pattern, Pattern):
            if regex is False:
                raise ValueError(
                    "Cannot pass a compiled search pattern *and* regex=False together."
                )
            if case is False:
                raise ValueError(
                    "Cannot pass a compiled search pattern *and* case=False together."
                )
            compiled = pattern
        else:
            if regex is False:
                pattern = re.escape(pattern)

            flags = re.I if case is False else 0
            compiled = re.compile(pattern, flags)

        def has_visible_text(m: Match[str]) -> bool:
            # Drops zero-length matches (can happen, e.g., with optional
            # patterns in regexes) and whitespace-only matches. A group that
            # did not participate in the match is reported as None by
            # `Match.group`; it is treated the same way instead of crashing.
            text = m.group(main_group)
            return text is not None and bool(text.strip())

        return [
            self.match_to_dict(
                m,
                return_groups=return_groups,
                return_chars=return_chars,
                main_group=main_group,
            )
            for m in compiled.finditer(self.as_string)
            if has_visible_text(m)
        ]

    def extract_text_lines(
        self, strip: bool = True, return_chars: bool = True
    ) -> List[Dict[str, Any]]:
        """
        `strip` is analogous to Python's `str.strip()` method, and returns
        `text` attributes without their surrounding whitespace. Only
        relevant when the relevant TextMap is created with `layout` = True

        Setting `return_chars` to False will exclude the individual
        character objects from the returned text-line dicts.
        """
        if strip:
            # Group 1 is the line's content without leading/trailing spaces.
            pat = r" *([^\n]+?) *(\n|$)"
        else:
            pat = r"([^\n]+)"

        return self.search(
            pat, main_group=1, return_chars=return_chars, return_groups=False
        )

231 

232 

class WordMap:
    """
    A WordMap maps words->chars.

    `tuples` is a list of `(word, chars)` pairs, where `word` is a word
    object and `chars` is the list of char objects it was built from.
    """

    def __init__(self, tuples: List[Tuple[T_obj, T_obj_list]]) -> None:
        self.tuples = tuples

    def to_textmap(
        self,
        layout: bool = False,
        layout_width: T_num = 0,
        layout_height: T_num = 0,
        layout_width_chars: int = 0,
        layout_height_chars: int = 0,
        layout_bbox: T_bbox = (0, 0, 0, 0),
        x_density: T_num = DEFAULT_X_DENSITY,
        y_density: T_num = DEFAULT_Y_DENSITY,
        x_shift: T_num = 0,
        y_shift: T_num = 0,
        y_tolerance: T_num = DEFAULT_Y_TOLERANCE,
        line_dir: T_dir = DEFAULT_LINE_DIR,
        char_dir: T_dir = DEFAULT_CHAR_DIR,
        line_dir_rotated: Optional[T_dir] = None,
        char_dir_rotated: Optional[T_dir] = None,
        char_dir_render: Optional[T_dir] = None,
        line_dir_render: Optional[T_dir] = None,
        use_text_flow: bool = False,
        presorted: bool = False,
        expand_ligatures: bool = True,
    ) -> TextMap:
        """
        Given a list of (word, chars) tuples (i.e., a WordMap), return a list of
        (char-text, char) tuples (i.e., a TextMap) that can be used to mimic
        the structural layout of the text on the page(s), using the following
        approach for top-to-bottom, left-to-right text:

        - Sort the words by (top, x0) if not already sorted.

        - Cluster the words by top (taking `y_tolerance` into account), and
          iterate through them.

        - For each cluster, divide (top - y_shift) by `y_density` to calculate
          the minimum number of newlines that should come before this cluster.
          Append that number of newlines *minus* the number of newlines already
          appended, with a minimum of one.

        - Then for each cluster, iterate through each word in it. Divide each
          word's x0, minus `x_shift`, by `x_density` to calculate the minimum
          number of characters that should come before this cluster. Append that
          number of spaces *minus* the number of characters and spaces already
          appended, with a minimum of one. Then append the word's text.

        - At the termination of each line, add more spaces if necessary to
          mimic `layout_width`.

        - Finally, add newlines to the end if necessary to mimic to
          `layout_height`.

        For other line/character directions (e.g., bottom-to-top,
        right-to-left), these steps are adjusted.

        Notes on parameters:

        - `layout_width`/`layout_height` are page distances and are converted
          to character counts via the densities; `layout_width_chars`/
          `layout_height_chars` give the counts directly. Setting both forms
          for the same dimension raises `ValueError`.
        - `layout_bbox` supplies the origin that positions are measured from.
        - `presorted` or `use_text_flow` keeps the incoming word order
          instead of sorting.
        - `line_dir_render`/`char_dir_render` default to `line_dir`/`char_dir`.
        - `line_dir_rotated` and `char_dir_rotated` are accepted but not used
          by this method.
        """
        _textmap: List[Tuple[str, Optional[T_obj]]] = []

        if not len(self.tuples):
            return TextMap(
                _textmap,
                line_dir_render=line_dir_render or line_dir,
                char_dir_render=char_dir_render or char_dir,
            )

        expansions = LIGATURES if expand_ligatures else {}

        if layout:
            if layout_width_chars:
                if layout_width:
                    raise ValueError(
                        "`layout_width` and `layout_width_chars` cannot both be set."
                    )
            else:
                layout_width_chars = int(round(layout_width / x_density))

            if layout_height_chars:
                if layout_height:
                    raise ValueError(
                        "`layout_height` and `layout_height_chars` cannot both be set."
                    )
            else:
                layout_height_chars = int(round(layout_height / y_density))

            blank_line = [(" ", None)] * layout_width_chars
        else:
            blank_line = []

        num_newlines = 0

        line_cluster_key = get_line_cluster_key(line_dir)
        char_sort_key = get_char_sort_key(char_dir)

        line_position_key = POSITION_KEYS[line_dir]
        char_position_key = POSITION_KEYS[char_dir]

        # Names say x/y, but these follow the char/line directions, which
        # need not be horizontal/vertical respectively.
        y_origin = BBOX_ORIGIN_KEYS[line_dir](layout_bbox)
        x_origin = BBOX_ORIGIN_KEYS[char_dir](layout_bbox)

        words_sorted_line_dir = (
            self.tuples
            if presorted or use_text_flow
            else sorted(self.tuples, key=lambda x: line_cluster_key(x[0]))
        )

        tuples_by_line = cluster_objects(
            words_sorted_line_dir,
            lambda x: line_cluster_key(x[0]),
            y_tolerance,
            preserve_order=presorted or use_text_flow,
        )

        for i, line_tuples in enumerate(tuples_by_line):
            if layout:
                line_position = line_position_key(line_tuples[0][0])
                y_dist_raw = line_position - (y_origin + y_shift)
                # Reversed directions measure distance back from the origin.
                adj = -1 if line_dir in ["btt", "rtl"] else 1
                y_dist = y_dist_raw * adj / y_density
            else:
                y_dist = 0
            num_newlines_prepend = max(
                # At least one newline, unless this is the first line
                int(i > 0),
                # ... or as many as needed to get the imputed "distance" from the top
                round(y_dist) - num_newlines,
            )

            for _ in range(num_newlines_prepend):
                # A newline directly after another newline (or at the very
                # start) closes an empty line, so fill that line first.
                if not len(_textmap) or _textmap[-1][0] == "\n":
                    _textmap += blank_line
                _textmap.append(("\n", None))

            num_newlines += num_newlines_prepend

            line_len = 0

            line_tuples_sorted = (
                line_tuples
                if presorted or use_text_flow
                else sorted(line_tuples, key=lambda x: char_sort_key(x[0]))
            )

            for word, chars in line_tuples_sorted:
                if layout:
                    char_position = char_position_key(word)
                    x_dist_raw = char_position - (x_origin + x_shift)
                    adj = -1 if char_dir in ["btt", "rtl"] else 1
                    x_dist = x_dist_raw * adj / x_density
                else:
                    x_dist = 0

                # One separating space unless this is the first word on the
                # line, or as many as the layout position calls for.
                num_spaces_prepend = max(min(1, line_len), round(x_dist) - line_len)
                _textmap += [(" ", None)] * num_spaces_prepend
                line_len += num_spaces_prepend

                for c in chars:
                    letters = expansions.get(c["text"], c["text"])
                    # An expanded ligature yields several letters that all
                    # point back to the same char object.
                    for letter in letters:
                        _textmap.append((letter, c))
                        line_len += 1

            # Append spaces at end of line
            if layout:
                _textmap += [(" ", None)] * (layout_width_chars - line_len)

        # Append blank lines at end of text
        if layout:
            num_newlines_append = layout_height_chars - (num_newlines + 1)
            for n in range(num_newlines_append):
                if n > 0:
                    _textmap += blank_line
                _textmap.append(("\n", None))

            # Remove terminal newline. The emptiness check covers the case
            # where nothing at all was emitted (e.g., words without chars
            # and a zero-sized layout), which would otherwise raise IndexError.
            if _textmap and _textmap[-1] == ("\n", None):
                _textmap = _textmap[:-1]

        return TextMap(
            _textmap,
            line_dir_render=line_dir_render or line_dir,
            char_dir_render=char_dir_render or char_dir,
        )

421 

422 

class WordExtractor:
    """
    Groups `char` objects into lines and words according to configurable
    tolerances and reading directions.
    """

    def __init__(
        self,
        x_tolerance: T_num = DEFAULT_X_TOLERANCE,
        y_tolerance: T_num = DEFAULT_Y_TOLERANCE,
        x_tolerance_ratio: Union[int, float, None] = None,
        y_tolerance_ratio: Union[int, float, None] = None,
        keep_blank_chars: bool = False,
        use_text_flow: bool = False,
        vertical_ttb: bool = True,  # Should vertical words be read top-to-bottom?
        horizontal_ltr: bool = True,  # Should words be read left-to-right?
        line_dir: T_dir = DEFAULT_LINE_DIR,
        char_dir: T_dir = DEFAULT_CHAR_DIR,
        line_dir_rotated: Optional[T_dir] = None,
        char_dir_rotated: Optional[T_dir] = None,
        extra_attrs: Optional[List[str]] = None,
        split_at_punctuation: Union[bool, str] = False,
        expand_ligatures: bool = True,
    ):
        """
        Store the extraction settings.

        Passing `vertical_ttb=False` or `horizontal_ltr=False` logs a
        deprecation warning. Raises `ValueError` (from `validate_directions`)
        when the upright or rotated direction pair is invalid.
        """
        self.x_tolerance = x_tolerance
        self.y_tolerance = y_tolerance
        self.x_tolerance_ratio = x_tolerance_ratio
        self.y_tolerance_ratio = y_tolerance_ratio
        self.keep_blank_chars = keep_blank_chars
        self.use_text_flow = use_text_flow
        self.horizontal_ltr = horizontal_ltr
        self.vertical_ttb = vertical_ttb

        if vertical_ttb is False:
            logger.warning(
                "vertical_ttb is deprecated and will be removed;"
                " use line_dir/char_dir instead."
            )
        if horizontal_ltr is False:
            logger.warning(
                "horizontal_ltr is deprecated and will be removed;"
                " use line_dir/char_dir instead."
            )

        self.line_dir, self.char_dir = line_dir, char_dir
        # Unless given explicitly, rotated text swaps the two directions.
        self.line_dir_rotated = line_dir_rotated if line_dir_rotated else char_dir
        self.char_dir_rotated = char_dir_rotated if char_dir_rotated else line_dir
        validate_directions(self.line_dir, self.char_dir)
        validate_directions(self.line_dir_rotated, self.char_dir_rotated, "_rotated")

        self.extra_attrs = extra_attrs if extra_attrs is not None else []

        # Note: string.punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        if split_at_punctuation is True:
            self.split_at_punctuation = string.punctuation
        else:
            self.split_at_punctuation = split_at_punctuation or ""

        self.expansions = LIGATURES if expand_ligatures else {}

    def get_char_dir(self, upright: int) -> T_dir:
        """
        Return the character direction to use for upright or rotated text.

        The deprecated `horizontal_ltr` / `vertical_ttb` flags, when falsy,
        override the configured directions with "rtl" / "btt".
        """
        # Note: This can be simplified and reincorporated into .merge_chars and
        # .iter_chars_to_lines once .vertical_ttb and .horizontal_ltr
        # deprecation is complete.
        if upright:
            return self.char_dir if self.horizontal_ltr else "rtl"
        return self.char_dir_rotated if self.vertical_ttb else "btt"

    def merge_chars(self, ordered_chars: T_obj_list) -> T_obj:
        """
        Build a single word object from an ordered list of chars.

        The word's box is the bounding box of the chars; "upright", the
        doctop offset, and every key in `extra_attrs` are taken from the
        first char.
        """
        x0, top, x1, bottom = objects_to_bbox(ordered_chars)
        first = ordered_chars[0]
        doctop_offset = first["doctop"] - first["top"]
        upright = first["upright"]
        direction = self.get_char_dir(upright)

        pieces = [self.expansions.get(c["text"], c["text"]) for c in ordered_chars]

        word = {
            "text": "".join(pieces),
            "x0": x0,
            "x1": x1,
            "top": top,
            "doctop": top + doctop_offset,
            "bottom": bottom,
            "upright": upright,
            "height": bottom - top,
            "width": x1 - x0,
            "direction": direction,
        }
        word.update((attr, first[attr]) for attr in self.extra_attrs)
        return word

    def char_begins_new_word(
        self,
        prev_char: T_obj,
        curr_char: T_obj,
        direction: T_dir,
        x_tolerance: T_num,
        y_tolerance: T_num,
    ) -> bool:
        """
        Decide whether `curr_char` starts a new word after `prev_char`.

        `direction` selects which attributes are compared and in which
        order. Two tests are applied:

        - *Intra*line: the current char starts before the start of the
          previous char, or more than the tolerance past the previous
          char's end (measured end-of-previous to start-of-current).
        - *Inter*line: the chars' line positions ("top" for horizontal
          text, "x0" for vertical text) differ by more than the tolerance.
          This is measured start-to-start because the boxes of successive
          lines often overlap slightly.

        `x_tolerance` and `y_tolerance` are named for horizontal text; for
        vertical directions their roles are swapped.
        """
        # Note: Due to the grouping step earlier in the process,
        # curr_char["upright"] will always equal prev_char["upright"].
        if direction in ("ltr", "rtl"):
            intraline_tol, interline_tol = x_tolerance, y_tolerance
            near, far, line_attr = "x0", "x1", "top"
        else:
            intraline_tol, interline_tol = y_tolerance, x_tolerance
            near, far, line_attr = "top", "bottom", "x0"

        prev_line = prev_char[line_attr]
        curr_line = curr_char[line_attr]

        if direction in ("ltr", "ttb"):
            prev_start = prev_char[near]
            prev_end = prev_char[far]
            curr_start = curr_char[near]
        else:
            # Reversed reading order: negate and swap the edges so the same
            # comparisons below apply.
            prev_start = -prev_char[far]
            prev_end = -prev_char[near]
            curr_start = -curr_char[far]

        # Intraline test
        if curr_start < prev_start:
            return True
        if curr_start > prev_end + intraline_tol:
            return True
        # Interline test
        return bool(abs(curr_line - prev_line) > interline_tol)

    def iter_chars_to_words(
        self,
        ordered_chars: T_obj_iter,
        direction: T_dir,
    ) -> Generator[T_obj_list, None, None]:
        """
        Split an ordered stream of chars into words, yielding each word as a
        list of chars.
        """
        pending: T_obj_list = []

        abs_x_tol = self.x_tolerance
        abs_y_tol = self.y_tolerance
        x_ratio = self.x_tolerance_ratio
        y_ratio = self.y_tolerance_ratio

        for char in ordered_chars:
            text = char["text"]

            if not self.keep_blank_chars and text.isspace():
                # Whitespace ends the current word and is itself dropped.
                if pending:
                    yield pending
                pending = []
                continue

            # NOTE(review): this is a substring test, so an empty "text"
            # also lands here — confirm that is intended.
            if text in self.split_at_punctuation:
                # Punctuation ends the current word and forms its own word.
                if pending:
                    yield pending
                yield [char]
                pending = []
                continue

            if pending:
                prev = pending[-1]
                # A ratio, when set, scales with the previous char's size and
                # takes precedence over the absolute tolerance.
                x_tol = abs_x_tol if x_ratio is None else x_ratio * prev["size"]
                y_tol = abs_y_tol if y_ratio is None else y_ratio * prev["size"]
                if self.char_begins_new_word(
                    prev, char, direction, x_tolerance=x_tol, y_tolerance=y_tol
                ):
                    yield pending
                    pending = [char]
                    continue

            pending.append(char)

        # Finally, after all chars processed
        if pending:
            yield pending

    def iter_chars_to_lines(
        self, chars: T_obj_iter
    ) -> Generator[Tuple[T_obj_list, T_dir], None, None]:
        """
        Cluster chars into lines, yielding `(sorted_chars, char_dir)` pairs.

        Directions are chosen from the "upright" value of the first char.
        """
        chars = list(chars)
        upright = chars[0]["upright"]
        line_dir = self.line_dir if upright else self.line_dir_rotated
        char_dir = self.get_char_dir(upright)

        cluster_key = get_line_cluster_key(line_dir)
        sort_key = get_char_sort_key(char_dir)

        # Lines stacked vertically are separated using the y tolerance,
        # lines laid side by side using the x tolerance.
        if line_dir in ("ttb", "btt"):
            line_tolerance = self.y_tolerance
        else:
            line_tolerance = self.x_tolerance

        for line_cluster in cluster_objects(chars, cluster_key, line_tolerance):
            yield (sorted(line_cluster, key=sort_key), char_dir)

    def iter_extract_tuples(
        self, chars: T_obj_iter
    ) -> Generator[Tuple[T_obj, T_obj_list], None, None]:
        """
        Yield `(word, word_chars)` pairs for the given chars.

        Consecutive chars are first grouped by "upright" plus `extra_attrs`.
        With `use_text_flow`, each group is treated as one line in its
        incoming order; otherwise it is clustered into lines first.
        """
        group_key = itemgetter("upright", *self.extra_attrs)

        for _, char_group in itertools.groupby(chars, group_key):
            if self.use_text_flow:
                line_groups = [(char_group, self.char_dir)]
            else:
                line_groups = self.iter_chars_to_lines(char_group)

            for line_chars, direction in line_groups:
                for word_chars in self.iter_chars_to_words(line_chars, direction):
                    yield (self.merge_chars(word_chars), word_chars)

    def extract_wordmap(self, chars: T_obj_iter) -> WordMap:
        """Return a `WordMap` holding every extracted (word, chars) pair."""
        pairs = list(self.iter_extract_tuples(chars))
        return WordMap(pairs)

    def extract_words(
        self, chars: T_obj_list, return_chars: bool = False
    ) -> T_obj_list:
        """
        Return the extracted word objects as a list.

        With `return_chars`, each returned word is a copy that also carries
        its chars under the "chars" key.
        """
        pairs = self.iter_extract_tuples(chars)
        if not return_chars:
            return [word for word, _ in pairs]
        return [{**word, "chars": word_chars} for word, word_chars in pairs]

693 

694 

def extract_words(
    chars: T_obj_list, return_chars: bool = False, **kwargs: Any
) -> T_obj_list:
    """
    Extract words from `chars` with a one-off `WordExtractor`.

    All keyword arguments are passed to the `WordExtractor` constructor;
    `return_chars` is passed on to `WordExtractor.extract_words`.
    """
    extractor = WordExtractor(**kwargs)
    return extractor.extract_words(chars, return_chars)

699 

700 

# Parameter names of WordMap.to_textmap and of the WordExtractor constructor,
# read from their signatures. Functions below use them to split one shared
# **kwargs dict between the two callees. TEXTMAP_KWARGS is taken from the
# plain function, so it also contains "self".
TEXTMAP_KWARGS = inspect.signature(WordMap.to_textmap).parameters.keys()
WORD_EXTRACTOR_KWARGS = inspect.signature(WordExtractor).parameters.keys()

703 

704 

def chars_to_textmap(chars: T_obj_list, **kwargs: Any) -> TextMap:
    """
    Extract words from `chars` and lay them out as a `TextMap`.

    Keyword arguments are routed by name to the `WordExtractor` constructor
    and to `WordMap.to_textmap`. `presorted` is always forced to True, and a
    missing or falsy `layout_bbox` is replaced by the bounding box of `chars`.
    """
    bbox = kwargs.get("layout_bbox") or objects_to_bbox(chars)
    kwargs["presorted"] = True
    kwargs["layout_bbox"] = bbox

    def select(names: Any) -> Dict[str, Any]:
        # Keep only the supplied kwargs that the target callee accepts.
        return {name: kwargs[name] for name in names if name in kwargs}

    extractor = WordExtractor(**select(WORD_EXTRACTOR_KWARGS))
    wordmap = extractor.extract_wordmap(chars)
    return wordmap.to_textmap(**select(TEXTMAP_KWARGS))

721 

722 

def extract_text(
    chars: T_obj_list,
    line_dir_render: Optional[T_dir] = None,
    char_dir_render: Optional[T_dir] = None,
    **kwargs: Any,
) -> str:
    """
    Return the text of `chars` as a single string.

    With a truthy `layout` keyword, the text is laid out via
    `chars_to_textmap` (all keyword arguments are forwarded). Otherwise the
    chars are grouped into words, the words are clustered into lines, words
    are joined with single spaces and lines with newlines.

    `line_dir_render` / `char_dir_render` control how the result is rendered
    and default to the extraction directions. Returns "" for empty input.
    """
    chars = to_list(chars)
    if len(chars) == 0:
        return ""

    if kwargs.get("layout"):
        textmap_kwargs = {
            **kwargs,
            **{"line_dir_render": line_dir_render, "char_dir_render": char_dir_render},
        }
        return chars_to_textmap(chars, **textmap_kwargs).as_string

    extractor = WordExtractor(
        **{k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
    )
    words = extractor.extract_words(chars)

    line_dir_render = line_dir_render or extractor.line_dir
    char_dir_render = char_dir_render or extractor.char_dir

    line_cluster_key = get_line_cluster_key(extractor.line_dir)

    # The clustering key measures position along the *extraction* line
    # direction, so the tolerance must be chosen by that same direction
    # (as WordExtractor.iter_chars_to_lines does), not by the render
    # direction, which may lie on the other axis.
    if extractor.line_dir in ("ttb", "btt"):
        line_tolerance = extractor.y_tolerance
    else:
        line_tolerance = extractor.x_tolerance

    lines = cluster_objects(words, line_cluster_key, line_tolerance)

    text = "\n".join(" ".join(word["text"] for word in line) for line in lines)

    # No char objects are attached here: only the rendered string is needed.
    return TextMap(
        [(char, None) for char in text],
        line_dir_render=line_dir_render,
        char_dir_render=char_dir_render,
    ).as_string

769 

770 

def collate_line(
    line_chars: T_obj_list,
    tolerance: T_num = DEFAULT_X_TOLERANCE,
) -> str:
    """
    Join the chars of one line into a string, ordered by "x0".

    A single space is inserted wherever a char starts more than `tolerance`
    beyond the "x1" of the char before it.
    """
    pieces = []
    prev_x1 = None
    for char in sorted(line_chars, key=itemgetter("x0")):
        has_gap = prev_x1 is not None and char["x0"] > prev_x1 + tolerance
        if has_gap:
            pieces.append(" ")
        prev_x1 = char["x1"]
        pieces.append(char["text"])
    return "".join(pieces)

783 

784 

def extract_text_simple(
    chars: T_obj_list,
    x_tolerance: T_num = DEFAULT_X_TOLERANCE,
    y_tolerance: T_num = DEFAULT_Y_TOLERANCE,
) -> str:
    """
    Return the text of `chars` using a simple two-step approach: cluster the
    chars into lines by "doctop" (within `y_tolerance`), then collate each
    line with `collate_line` (using `x_tolerance`) and join lines with
    newlines.
    """
    lines = cluster_objects(chars, itemgetter("doctop"), y_tolerance)
    collated = [collate_line(line, x_tolerance) for line in lines]
    return "\n".join(collated)

792 

793 

def dedupe_chars(
    chars: T_obj_list,
    tolerance: T_num = 1,
    extra_attrs: Optional[Tuple[str, ...]] = ("fontname", "size"),
) -> T_obj_list:
    """
    Removes duplicate chars — those sharing the same text and positioning
    (within `tolerance`) as other characters in the set. Use `extra_attrs`
    to be more restrictive with the properties shared by the matching chars.

    The surviving chars are returned in the order they appear in `chars`.
    """
    group_key = itemgetter("upright", "text", *(extra_attrs or ()))
    position = itemgetter("doctop", "x0")

    survivors = []
    same_kind_groups = itertools.groupby(sorted(chars, key=group_key), key=group_key)
    for _, same_kind in same_kind_groups:
        # Chars that agree on the grouping attributes are duplicates when
        # they also fall into the same vertical and horizontal cluster.
        rows = cluster_objects(list(same_kind), itemgetter("doctop"), tolerance)
        for row in rows:
            for cell in cluster_objects(row, itemgetter("x0"), tolerance):
                # Keep one representative: the first by (doctop, x0).
                survivors.append(sorted(cell, key=position)[0])

    return sorted(survivors, key=chars.index)