1import inspect
2import itertools
3import logging
4import re
5import string
6from operator import itemgetter
7from typing import (
8 Any,
9 Callable,
10 Dict,
11 Generator,
12 List,
13 Match,
14 Optional,
15 Pattern,
16 Tuple,
17 Union,
18)
19
20from .._typing import T_bbox, T_dir, T_num, T_obj, T_obj_iter, T_obj_list
21from .clustering import cluster_objects
22from .generic import to_list
23from .geometry import objects_to_bbox
24
# Module-level logger; WordExtractor uses it to emit deprecation warnings.
logger = logging.getLogger(__name__)

# Default distance tolerances (x / y) used as keyword defaults throughout
# this module when grouping characters into words and lines.
DEFAULT_Y_TOLERANCE = 3
DEFAULT_X_TOLERANCE = 3

# Default divisors that WordMap.to_textmap applies to x / y distances to
# turn them into counts of spaces / newlines in layout mode.
DEFAULT_Y_DENSITY = 13
DEFAULT_X_DENSITY = 7.25

# Default reading directions: lines stack top-to-bottom, characters run
# left-to-right.
DEFAULT_CHAR_DIR: T_dir = "ltr"
DEFAULT_LINE_DIR: T_dir = "ttb"
33
# Maps each single-codepoint Latin ligature from the Unicode "Alphabetic
# Presentation Forms" block (U+FB00-U+FB06) to the plain letters it stands
# for. The keys are written as escapes so that editors, copy/paste, or
# Unicode normalisation cannot silently turn them into their ASCII
# expansions (which would make every lookup a no-op).
LIGATURES = {
    "\ufb00": "ff",  # LATIN SMALL LIGATURE FF
    "\ufb03": "ffi",  # LATIN SMALL LIGATURE FFI
    "\ufb04": "ffl",  # LATIN SMALL LIGATURE FFL
    "\ufb01": "fi",  # LATIN SMALL LIGATURE FI
    "\ufb02": "fl",  # LATIN SMALL LIGATURE FL
    "\ufb06": "st",  # LATIN SMALL LIGATURE ST
    "\ufb05": "st",  # LATIN SMALL LIGATURE LONG S T
}
43
44
def get_line_cluster_key(line_dir: T_dir) -> Callable[[T_obj], T_num]:
    """
    Return the function used to place an object along the line axis.

    The returned callable reads one coordinate from an object dict; for the
    "btt" and "rtl" directions the coordinate is negated so that ascending
    order follows the reading direction. An unknown `line_dir` raises
    KeyError.
    """

    def from_top(obj):  # type: ignore[no-untyped-def]
        return obj["top"]

    def from_bottom(obj):  # type: ignore[no-untyped-def]
        return -obj["bottom"]

    def from_left(obj):  # type: ignore[no-untyped-def]
        return obj["x0"]

    def from_right(obj):  # type: ignore[no-untyped-def]
        return -obj["x1"]

    key_by_direction = {
        "ttb": from_top,
        "btt": from_bottom,
        "ltr": from_left,
        "rtl": from_right,
    }
    return key_by_direction[line_dir]
52
53
def get_char_sort_key(char_dir: T_dir) -> Callable[[T_obj], Tuple[T_num, T_num]]:
    """
    Return the two-element sort key used to order objects within a line.

    Values are negated for "btt" and "rtl" so that an ascending sort follows
    the reading direction. An unknown `char_dir` raises KeyError.
    """

    def top_down(obj):  # type: ignore[no-untyped-def]
        return (obj["top"], obj["bottom"])

    def bottom_up(obj):  # type: ignore[no-untyped-def]
        # top + height is used here rather than the "bottom" attribute.
        return (-(obj["top"] + obj["height"]), -obj["top"])

    def left_to_right(obj):  # type: ignore[no-untyped-def]
        # NOTE(review): both elements are x0, so the second never breaks a
        # tie; the other directions pair two different edges. Confirm this
        # is intentional before changing it, since it affects tie ordering.
        return (obj["x0"], obj["x0"])

    def right_to_left(obj):  # type: ignore[no-untyped-def]
        return (-obj["x1"], -obj["x0"])

    key_by_direction = {
        "ttb": top_down,
        "btt": bottom_up,
        "ltr": left_to_right,
        "rtl": right_to_left,
    }
    return key_by_direction[char_dir]
61
62
# For each direction, the getter that picks the starting edge out of a bbox
# tuple. The indices correspond to the (x0, top, x1, bottom) order in which
# this module unpacks bounding boxes.
BBOX_ORIGIN_KEYS = {
    direction: itemgetter(index)
    for direction, index in (("ttb", 1), ("btt", 3), ("ltr", 0), ("rtl", 2))
}

# For each direction, the getter that reads the matching starting edge from
# an object dict. Its keys also define the set of valid direction names.
POSITION_KEYS = {
    direction: itemgetter(attr)
    for direction, attr in (
        ("ttb", "top"),
        ("btt", "bottom"),
        ("ltr", "x0"),
        ("rtl", "x1"),
    )
}
76
77
def validate_directions(line_dir: T_dir, char_dir: T_dir, suffix: str = "") -> None:
    """
    Check that a (line direction, character direction) pair is usable.

    `suffix` is appended to the parameter names in the error messages (for
    example "_render"), so the message names the argument the caller passed.

    Raises ValueError when either direction is not a key of POSITION_KEYS, or
    when both directions run along the same axis.
    """
    valid_dirs = set(POSITION_KEYS.keys())

    line_dir_known = line_dir in valid_dirs
    if not line_dir_known:
        raise ValueError(
            f"line_dir{suffix} must be one of {valid_dirs}, not {line_dir}"
        )

    char_dir_known = char_dir in valid_dirs
    if not char_dir_known:
        raise ValueError(
            f"char_dir{suffix} must be one of {valid_dirs}, not {char_dir}"
        )

    # "ttb"/"btt" are spelled with the letters {t, b} and "ltr"/"rtl" with
    # {l, t, r}, so equal letter sets mean both directions share one axis.
    line_letters = set(line_dir)
    char_letters = set(char_dir)
    if line_letters == char_letters:
        raise ValueError(
            f"line_dir{suffix}={line_dir} is incompatible "
            f"with char_dir{suffix}={char_dir}"
        )
93
94
class TextMap:
    """
    A TextMap maps each unicode character in the text to an individual `char`
    object (or, in the case of layout-implied whitespace, `None`).

    `tuples` holds the (character, char-object-or-None) pairs, and
    `as_string` is the text rendered once, at construction time, by
    `to_string()`.
    """

    def __init__(
        self,
        tuples: List[Tuple[str, Optional[T_obj]]],
        line_dir_render: T_dir,
        char_dir_render: T_dir,
    ) -> None:
        """
        Store the pairs and render directions, then cache the rendered text.

        Raises ValueError (from `validate_directions`) if the two render
        directions are invalid or incompatible.
        """
        validate_directions(line_dir_render, char_dir_render, "_render")
        self.tuples = tuples
        self.line_dir_render = line_dir_render
        self.char_dir_render = char_dir_render
        self.as_string = self.to_string()

    def to_string(self) -> str:
        """
        Render the characters as one string, honoring the render directions.

        For the default ("ltr" characters, "ttb" lines) the characters are
        simply concatenated. Otherwise the text is split on newlines and the
        lines are reversed, mirrored and/or transposed as required.
        """
        cd = self.char_dir_render
        ld = self.line_dir_render

        base = "".join(pair[0] for pair in self.tuples)

        if cd == "ltr" and ld == "ttb":
            return base

        lines = base.split("\n")
        if ld in ("btt", "rtl"):
            lines.reverse()

        if cd == "rtl":
            lines = [line[::-1] for line in lines]

        if ld not in ("rtl", "ltr"):
            return "\n".join(lines)

        # Lines run horizontally, so each input line becomes an output
        # column: pad every line to a common length, then transpose.
        max_line_length = max(map(len, lines))
        if cd == "btt":
            # Pad on the left so the line ends stay aligned.
            padded = [line.rjust(max_line_length) for line in lines]
        else:
            padded = [line.ljust(max_line_length) for line in lines]
        return "\n".join("".join(column) for column in zip(*padded))

    def match_to_dict(
        self,
        m: Match[str],
        main_group: int = 0,
        return_groups: bool = True,
        return_chars: bool = True,
    ) -> Dict[str, Any]:
        """
        Convert a regex match on `as_string` into a result dict.

        The dict holds the matched "text" of `main_group` and the bounding
        box ("x0", "top", "x1", "bottom") of the non-None char objects that
        the group spans; "groups" and "chars" are added on request.
        """
        # Match offsets are used directly as indices into self.tuples.
        subset = self.tuples[m.start(main_group) : m.end(main_group)]
        chars = [c for (text, c) in subset if c is not None]
        x0, top, x1, bottom = objects_to_bbox(chars)

        result = {
            "text": m.group(main_group),
            "x0": x0,
            "top": top,
            "x1": x1,
            "bottom": bottom,
        }

        if return_groups:
            result["groups"] = m.groups()

        if return_chars:
            result["chars"] = chars

        return result

    def search(
        self,
        pattern: Union[str, Pattern[str]],
        regex: bool = True,
        case: bool = True,
        return_groups: bool = True,
        return_chars: bool = True,
        main_group: int = 0,
    ) -> List[Dict[str, Any]]:
        """
        Find every match of `pattern` in `as_string`.

        `pattern` may be a string or a compiled pattern. With `regex=False`
        a string pattern is matched literally; with `case=False` it is
        matched case-insensitively. Neither option can be combined with a
        compiled pattern (ValueError).

        Matches whose `main_group` is empty, whitespace-only, or did not
        participate in the match are dropped. Each remaining match is
        returned as a dict built by `match_to_dict`.
        """
        if isinstance(pattern, Pattern):
            if regex is False:
                raise ValueError(
                    "Cannot pass a compiled search pattern *and* regex=False together."
                )
            if case is False:
                raise ValueError(
                    "Cannot pass a compiled search pattern *and* case=False together."
                )
            compiled = pattern
        else:
            if regex is False:
                pattern = re.escape(pattern)

            flags = re.I if case is False else 0
            compiled = re.compile(pattern, flags)

        def has_visible_text(m: Match[str]) -> bool:
            # Remove zero-length matches (can happen, e.g., with optional
            # patterns in regexes) and whitespace-only matches. A group that
            # did not participate yields None, which is treated as empty
            # instead of crashing on `.strip()`.
            return bool((m.group(main_group) or "").strip())

        return [
            self.match_to_dict(
                m,
                return_groups=return_groups,
                return_chars=return_chars,
                main_group=main_group,
            )
            for m in compiled.finditer(self.as_string)
            if has_visible_text(m)
        ]

    def extract_text_lines(
        self, strip: bool = True, return_chars: bool = True
    ) -> List[Dict[str, Any]]:
        """
        `strip` is analogous to Python's `str.strip()` method, and returns
        `text` attributes without their surrounding whitespace. Only
        relevant when the relevant TextMap is created with `layout` = True

        Setting `return_chars` to False will exclude the individual
        character objects from the returned text-line dicts.
        """
        if strip:
            pat = r" *([^\n]+?) *(\n|$)"
        else:
            pat = r"([^\n]+)"

        return self.search(
            pat, main_group=1, return_chars=return_chars, return_groups=False
        )
231
232
class WordMap:
    """
    A WordMap maps words->chars.

    `tuples` is a list of (word, chars) pairs, where `chars` is the list of
    char objects that make up `word`.
    """

    def __init__(self, tuples: List[Tuple[T_obj, T_obj_list]]) -> None:
        self.tuples = tuples

    def to_textmap(
        self,
        layout: bool = False,
        layout_width: T_num = 0,
        layout_height: T_num = 0,
        layout_width_chars: int = 0,
        layout_height_chars: int = 0,
        layout_bbox: T_bbox = (0, 0, 0, 0),
        x_density: T_num = DEFAULT_X_DENSITY,
        y_density: T_num = DEFAULT_Y_DENSITY,
        x_shift: T_num = 0,
        y_shift: T_num = 0,
        y_tolerance: T_num = DEFAULT_Y_TOLERANCE,
        line_dir: T_dir = DEFAULT_LINE_DIR,
        char_dir: T_dir = DEFAULT_CHAR_DIR,
        line_dir_rotated: Optional[T_dir] = None,
        char_dir_rotated: Optional[T_dir] = None,
        char_dir_render: Optional[T_dir] = None,
        line_dir_render: Optional[T_dir] = None,
        use_text_flow: bool = False,
        presorted: bool = False,
        expand_ligatures: bool = True,
    ) -> TextMap:
        """
        Given a list of (word, chars) tuples (i.e., a WordMap), return a list of
        (char-text, char) tuples (i.e., a TextMap) that can be used to mimic
        the structural layout of the text on the page(s), using the following
        approach for top-to-bottom, left-to-right text:

        - Sort the words by (top, x0) if not already sorted.

        - Cluster the words by top (taking `y_tolerance` into account), and
          iterate through them.

        - For each cluster, divide (top - y_shift) by `y_density` to calculate
          the minimum number of newlines that should come before this cluster.
          Append that number of newlines *minus* the number of newlines already
          appended, with a minimum of one.

        - Then for each cluster, iterate through each word in it. Divide each
          word's x0, minus `x_shift`, by `x_density` to calculate the minimum
          number of characters that should come before this cluster. Append that
          number of spaces *minus* the number of characters and spaces already
          appended, with a minimum of one. Then append the word's text.

        - At the termination of each line, add more spaces if necessary to
          mimic `layout_width`.

        - Finally, add newlines to the end if necessary to mimic
          `layout_height`.

        For other line/character directions (e.g., bottom-to-top,
        right-to-left), these steps are adjusted.

        `layout_width`/`layout_width_chars` (and the height equivalents) are
        mutually exclusive; passing both raises ValueError. The
        `line_dir_rotated` and `char_dir_rotated` parameters are accepted but
        not used by this method.
        """
        render_line_dir = line_dir_render or line_dir
        render_char_dir = char_dir_render or char_dir

        _textmap: List[Tuple[str, Optional[T_obj]]] = []

        if not self.tuples:
            return TextMap(
                _textmap,
                line_dir_render=render_line_dir,
                char_dir_render=render_char_dir,
            )

        expansions = LIGATURES if expand_ligatures else {}

        if layout:
            if layout_width_chars:
                if layout_width:
                    raise ValueError(
                        "`layout_width` and `layout_width_chars` cannot both be set."
                    )
            else:
                layout_width_chars = int(round(layout_width / x_density))

            if layout_height_chars:
                if layout_height:
                    raise ValueError(
                        "`layout_height` and `layout_height_chars` cannot both be set."
                    )
            else:
                layout_height_chars = int(round(layout_height / y_density))

            blank_line = [(" ", None)] * layout_width_chars
        else:
            blank_line = []

        num_newlines = 0

        line_cluster_key = get_line_cluster_key(line_dir)
        char_sort_key = get_char_sort_key(char_dir)

        line_position_key = POSITION_KEYS[line_dir]
        char_position_key = POSITION_KEYS[char_dir]

        y_origin = BBOX_ORIGIN_KEYS[line_dir](layout_bbox)
        x_origin = BBOX_ORIGIN_KEYS[char_dir](layout_bbox)

        # Distances are measured away from the origin edge, so they are
        # negated for the directions that run toward smaller coordinates.
        line_sign = -1 if line_dir in ["btt", "rtl"] else 1
        char_sign = -1 if char_dir in ["btt", "rtl"] else 1

        keep_input_order = presorted or use_text_flow

        words_sorted_line_dir = (
            self.tuples
            if keep_input_order
            else sorted(self.tuples, key=lambda x: line_cluster_key(x[0]))
        )

        tuples_by_line = cluster_objects(
            words_sorted_line_dir,
            lambda x: line_cluster_key(x[0]),
            y_tolerance,
            preserve_order=keep_input_order,
        )

        for line_index, line_tuples in enumerate(tuples_by_line):
            if layout:
                line_position = line_position_key(line_tuples[0][0])
                y_dist_raw = line_position - (y_origin + y_shift)
                y_dist = y_dist_raw * line_sign / y_density
            else:
                y_dist = 0
            num_newlines_prepend = max(
                # At least one newline, unless this is the first line
                int(line_index > 0),
                # ... or as many as needed to get the imputed "distance" from the top
                round(y_dist) - num_newlines,
            )

            for _ in range(num_newlines_prepend):
                # A newline directly after another newline (or at the very
                # start) represents an empty line: fill it with blanks first.
                if not len(_textmap) or _textmap[-1][0] == "\n":
                    _textmap += blank_line
                _textmap.append(("\n", None))

            num_newlines += num_newlines_prepend

            line_len = 0

            line_tuples_sorted = (
                line_tuples
                if keep_input_order
                else sorted(line_tuples, key=lambda x: char_sort_key(x[0]))
            )

            for word, chars in line_tuples_sorted:
                if layout:
                    char_position = char_position_key(word)
                    x_dist_raw = char_position - (x_origin + x_shift)
                    x_dist = x_dist_raw * char_sign / x_density
                else:
                    x_dist = 0

                # At least one space between words (none before the first),
                # or as many as needed to reach the imputed column.
                num_spaces_prepend = max(min(1, line_len), round(x_dist) - line_len)
                _textmap += [(" ", None)] * num_spaces_prepend
                line_len += num_spaces_prepend

                for c in chars:
                    # An expanded ligature contributes several letters, each
                    # mapped back to the same char object.
                    letters = expansions.get(c["text"], c["text"])
                    for letter in letters:
                        _textmap.append((letter, c))
                        line_len += 1

            # Append spaces at end of line
            if layout:
                _textmap += [(" ", None)] * (layout_width_chars - line_len)

        # Append blank lines at end of text
        if layout:
            num_newlines_append = layout_height_chars - (num_newlines + 1)
            for n in range(num_newlines_append):
                if n > 0:
                    _textmap += blank_line
                _textmap.append(("\n", None))

            # Remove terminal newline
            if _textmap and _textmap[-1] == ("\n", None):
                _textmap.pop()

        return TextMap(
            _textmap,
            line_dir_render=render_line_dir,
            char_dir_render=render_char_dir,
        )
421
422
class WordExtractor:
    """
    Groups char objects into words.

    Characters are grouped by their "upright" flag (plus any `extra_attrs`),
    clustered into lines, sorted within each line, and then split into words
    wherever `char_begins_new_word` reports a break, at blank characters
    (unless `keep_blank_chars`), and at punctuation if
    `split_at_punctuation` is set.
    """

    def __init__(
        self,
        x_tolerance: T_num = DEFAULT_X_TOLERANCE,
        y_tolerance: T_num = DEFAULT_Y_TOLERANCE,
        x_tolerance_ratio: Union[int, float, None] = None,
        y_tolerance_ratio: Union[int, float, None] = None,
        keep_blank_chars: bool = False,
        use_text_flow: bool = False,
        vertical_ttb: bool = True,  # Should vertical words be read top-to-bottom?
        horizontal_ltr: bool = True,  # Should words be read left-to-right?
        line_dir: T_dir = DEFAULT_LINE_DIR,
        char_dir: T_dir = DEFAULT_CHAR_DIR,
        line_dir_rotated: Optional[T_dir] = None,
        char_dir_rotated: Optional[T_dir] = None,
        extra_attrs: Optional[List[str]] = None,
        split_at_punctuation: Union[bool, str] = False,
        expand_ligatures: bool = True,
    ):
        """
        Store the extraction settings.

        Logs a deprecation warning if `vertical_ttb` or `horizontal_ltr` is
        False. Raises ValueError (from `validate_directions`) if the upright
        or the rotated direction pair is invalid.
        """
        self.x_tolerance = x_tolerance
        self.y_tolerance = y_tolerance
        self.x_tolerance_ratio = x_tolerance_ratio
        self.y_tolerance_ratio = y_tolerance_ratio
        self.keep_blank_chars = keep_blank_chars
        self.use_text_flow = use_text_flow
        self.horizontal_ltr = horizontal_ltr
        self.vertical_ttb = vertical_ttb
        if vertical_ttb is False:
            logger.warning(
                "vertical_ttb is deprecated and will be removed;"
                " use line_dir/char_dir instead."
            )
        if horizontal_ltr is False:
            logger.warning(
                "horizontal_ltr is deprecated and will be removed;"
                " use line_dir/char_dir instead."
            )
        self.line_dir = line_dir
        self.char_dir = char_dir
        # Default is to "flip" the directions for rotated text
        self.line_dir_rotated = line_dir_rotated or char_dir
        self.char_dir_rotated = char_dir_rotated or line_dir
        validate_directions(self.line_dir, self.char_dir)
        validate_directions(self.line_dir_rotated, self.char_dir_rotated, "_rotated")
        self.extra_attrs = [] if extra_attrs is None else extra_attrs

        # Note: string.punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        # True selects that full set; a string selects its own characters;
        # any falsy value disables punctuation splitting (empty string).
        self.split_at_punctuation = (
            string.punctuation
            if split_at_punctuation is True
            else (split_at_punctuation or "")
        )

        self.expansions = LIGATURES if expand_ligatures else {}

    def get_char_dir(self, upright: int) -> T_dir:
        """
        Return the character direction to use for upright or rotated text.

        The deprecated `vertical_ttb` / `horizontal_ltr` flags, when False,
        take precedence over `char_dir_rotated` / `char_dir`.
        """
        # Note: This can be simplified and reincorporated into .merge_chars and
        # .iter_chars_to_lines once .vertical_ttb and .horizontal_ltr
        # deprecation is complete.
        if not upright and not self.vertical_ttb:
            return "btt"

        elif upright and not self.horizontal_ltr:
            return "rtl"

        return self.char_dir if upright else self.char_dir_rotated

    def merge_chars(self, ordered_chars: T_obj_list) -> T_obj:
        """
        Build a word dict from the (non-empty) ordered chars that form it.

        The word's text is the concatenation of the chars' texts (with
        ligatures expanded if enabled), its box is the chars' bounding box,
        and "upright", the doctop offset and every `extra_attrs` value are
        taken from the first char.
        """
        first_char = ordered_chars[0]
        x0, top, x1, bottom = objects_to_bbox(ordered_chars)
        doctop_adj = first_char["doctop"] - first_char["top"]
        upright = first_char["upright"]
        char_dir = self.get_char_dir(upright)

        text = "".join(
            self.expansions.get(c["text"], c["text"]) for c in ordered_chars
        )

        word = {
            "text": text,
            "x0": x0,
            "x1": x1,
            "top": top,
            "doctop": top + doctop_adj,
            "bottom": bottom,
            "upright": upright,
            "height": bottom - top,
            "width": x1 - x0,
            "direction": char_dir,
        }

        for key in self.extra_attrs:
            word[key] = first_char[key]

        return word

    def char_begins_new_word(
        self,
        prev_char: T_obj,
        curr_char: T_obj,
        direction: T_dir,
        x_tolerance: T_num,
        y_tolerance: T_num,
    ) -> bool:
        """This method determines whether `curr_char` represents the
        beginning of a new word, based on:

        - `direction`, the reading direction of the characters ("ltr",
          "rtl", "ttb" or "btt"), which selects the attributes to compare
          and the orientation of the comparison
        - The x0, top, x1, and bottom attributes of prev_char and
          curr_char
        - The `x_tolerance` and `y_tolerance` arguments. Note: In this
          case, x/y refer to those directions for horizontal text. For
          vertical text ("ttb"/"btt"), they are flipped. A more accurate
          terminology might be "*intra*line character distance tolerance"
          and "*inter*line character distance tolerance"

        An important note: The *intra*line distance is measured from the
        *end* of the previous character to the *beginning* of the current
        character, while the *inter*line distance is measured from the
        *top* of the previous character to the *top* of the next
        character (x0 to x0 for vertical text). The reasons for this are
        partly repository-historical, and partly logical, as successive
        text lines' bounding boxes often overlap slightly (and we don't
        want that overlap to be interpreted as the two lines being the
        same line).
        """
        # For the reversed directions ("rtl", "btt") the coordinates are
        # negated, so one set of comparisons serves all four directions.
        if direction in ("ltr", "rtl"):
            intraline_tol = x_tolerance
            interline_tol = y_tolerance
            prev_line_pos = prev_char["top"]
            curr_line_pos = curr_char["top"]
            if direction == "ltr":
                prev_start = prev_char["x0"]
                prev_end = prev_char["x1"]
                curr_start = curr_char["x0"]
            else:
                prev_start = -prev_char["x1"]
                prev_end = -prev_char["x0"]
                curr_start = -curr_char["x1"]
        else:
            intraline_tol = y_tolerance
            interline_tol = x_tolerance
            prev_line_pos = prev_char["x0"]
            curr_line_pos = curr_char["x0"]
            if direction == "ttb":
                prev_start = prev_char["top"]
                prev_end = prev_char["bottom"]
                curr_start = curr_char["top"]
            else:
                prev_start = -prev_char["bottom"]
                prev_end = -prev_char["top"]
                curr_start = -curr_char["bottom"]

        # Intraline test: the current char starts before the previous one
        # does, or too far past its end.
        steps_backward = curr_start < prev_start
        gap_too_wide = curr_start > prev_end + intraline_tol
        # Interline test: the two chars are offset across the line axis.
        off_line = abs(curr_line_pos - prev_line_pos) > interline_tol

        return bool(steps_backward or gap_too_wide or off_line)

    def iter_chars_to_words(
        self,
        ordered_chars: T_obj_iter,
        direction: T_dir,
    ) -> Generator[T_obj_list, None, None]:
        """
        Split an ordered run of chars into words, yielding each word as a
        list of chars.

        A blank char ends the current word and is dropped (unless
        `keep_blank_chars`); a punctuation char from `split_at_punctuation`
        becomes a word of its own; otherwise `char_begins_new_word` decides
        whether the char extends the current word or starts the next one.
        When a tolerance ratio is set, the tolerance is that ratio times the
        previous char's "size".
        """
        xt = self.x_tolerance
        xtr = self.x_tolerance_ratio
        yt = self.y_tolerance
        ytr = self.y_tolerance_ratio
        punctuation = self.split_at_punctuation

        current_word: T_obj_list = []

        for char in ordered_chars:
            text = char["text"]

            if not self.keep_blank_chars and text.isspace():
                if current_word:
                    yield current_word
                current_word = []

            # `in` on strings is a substring test and "" is a substring of
            # every string (including ""), so empty text must be excluded
            # explicitly or it would always count as punctuation.
            elif text and text in punctuation:
                if current_word:
                    yield current_word
                yield [char]
                current_word = []

            elif current_word and self.char_begins_new_word(
                current_word[-1],
                char,
                direction,
                x_tolerance=(xt if xtr is None else xtr * current_word[-1]["size"]),
                y_tolerance=(yt if ytr is None else ytr * current_word[-1]["size"]),
            ):
                yield current_word
                current_word = [char]

            else:
                current_word.append(char)

        # Finally, after all chars processed
        if current_word:
            yield current_word

    def iter_chars_to_lines(
        self, chars: T_obj_iter
    ) -> Generator[Tuple[T_obj_list, T_dir], None, None]:
        """
        Cluster chars into lines and yield (sorted line chars, char direction).

        The directions are chosen from the first char's "upright" flag, so
        `chars` must be non-empty and is expected to share one flag value.
        """
        chars = list(chars)
        upright = chars[0]["upright"]
        line_dir = self.line_dir if upright else self.line_dir_rotated
        char_dir = self.get_char_dir(upright)

        line_cluster_key = get_line_cluster_key(line_dir)
        char_sort_key = get_char_sort_key(char_dir)

        # Lines stacked vertically are separated along y; otherwise along x.
        line_tolerance = (
            self.y_tolerance if line_dir in ("ttb", "btt") else self.x_tolerance
        )

        # Cluster by line
        subclusters = cluster_objects(chars, line_cluster_key, line_tolerance)

        for sc in subclusters:
            # Sort within line
            yield (sorted(sc, key=char_sort_key), char_dir)

    def iter_extract_tuples(
        self, chars: T_obj_iter
    ) -> Generator[Tuple[T_obj, T_obj_list], None, None]:
        """
        Yield a (word, word_chars) pair for every word found in `chars`.

        Consecutive chars sharing "upright" and all `extra_attrs` values are
        processed together. With `use_text_flow`, each such run is treated
        as a single line in its given order.
        """
        grouping_key = itemgetter("upright", *self.extra_attrs)

        for _, char_group in itertools.groupby(chars, grouping_key):
            if self.use_text_flow:
                # NOTE(review): self.char_dir is used here for every group,
                # including non-upright ones - confirm this is intended.
                line_groups = [(char_group, self.char_dir)]
            else:
                line_groups = self.iter_chars_to_lines(char_group)

            for line_chars, direction in line_groups:
                for word_chars in self.iter_chars_to_words(line_chars, direction):
                    yield (self.merge_chars(word_chars), word_chars)

    def extract_wordmap(self, chars: T_obj_iter) -> WordMap:
        """Return the extracted (word, chars) pairs wrapped in a WordMap."""
        return WordMap(list(self.iter_extract_tuples(chars)))

    def extract_words(
        self, chars: T_obj_list, return_chars: bool = False
    ) -> T_obj_list:
        """
        Return the list of word dicts found in `chars`.

        With `return_chars=True`, each word dict also carries its
        constituent chars under the "chars" key.
        """
        if return_chars:
            return [
                {**word, "chars": word_chars}
                for word, word_chars in self.iter_extract_tuples(chars)
            ]
        return [word for word, _ in self.iter_extract_tuples(chars)]
693
694
def extract_words(
    chars: T_obj_list, return_chars: bool = False, **kwargs: Any
) -> T_obj_list:
    """
    Convenience wrapper: build a WordExtractor from `kwargs` and return the
    words it extracts from `chars`.
    """
    extractor = WordExtractor(**kwargs)
    return extractor.extract_words(chars, return_chars)
699
700
# Parameter names of WordExtractor(...) and WordMap.to_textmap(...), taken
# from their signatures. They are used to route a shared **kwargs dict to
# whichever of the two callables accepts each key.
WORD_EXTRACTOR_KWARGS = inspect.signature(WordExtractor).parameters.keys()
TEXTMAP_KWARGS = inspect.signature(WordMap.to_textmap).parameters.keys()
703
704
def chars_to_textmap(chars: T_obj_list, **kwargs: Any) -> TextMap:
    """
    Extract words from `chars` and lay them out as a TextMap.

    `kwargs` may mix WordExtractor and WordMap.to_textmap options; each
    callable receives only the keys present in its own signature.
    `presorted` is always forced to True, and `layout_bbox` falls back to
    the bounding box of `chars` when missing or falsy.
    """
    kwargs["presorted"] = True
    kwargs["layout_bbox"] = kwargs.get("layout_bbox") or objects_to_bbox(chars)

    extractor_kwargs = {k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
    textmap_kwargs = {k: kwargs[k] for k in TEXTMAP_KWARGS if k in kwargs}

    wordmap = WordExtractor(**extractor_kwargs).extract_wordmap(chars)
    return wordmap.to_textmap(**textmap_kwargs)
721
722
def extract_text(
    chars: T_obj_list,
    line_dir_render: Optional[T_dir] = None,
    char_dir_render: Optional[T_dir] = None,
    **kwargs: Any,
) -> str:
    """
    Return the text contained in `chars` as a single string.

    With `layout=True` in `kwargs`, the text is produced by
    `chars_to_textmap`. Otherwise the chars are grouped into words, the
    words are clustered into lines, and the result joins words with spaces
    and lines with newlines.

    `line_dir_render` / `char_dir_render` control how the final string is
    oriented and default to the extraction directions. An empty `chars`
    yields "".
    """
    chars = to_list(chars)
    if len(chars) == 0:
        return ""

    if kwargs.get("layout"):
        textmap_kwargs = {
            **kwargs,
            "line_dir_render": line_dir_render,
            "char_dir_render": char_dir_render,
        }
        return chars_to_textmap(chars, **textmap_kwargs).as_string

    extractor = WordExtractor(
        **{k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
    )
    words = extractor.extract_words(chars)

    line_dir_render = line_dir_render or extractor.line_dir
    char_dir_render = char_dir_render or extractor.char_dir

    line_cluster_key = get_line_cluster_key(extractor.line_dir)

    x_tolerance = kwargs.get("x_tolerance", DEFAULT_X_TOLERANCE)
    y_tolerance = kwargs.get("y_tolerance", DEFAULT_Y_TOLERANCE)

    # The clustering key is derived from the *extraction* line direction, so
    # the tolerance must be picked for that same axis (not the render
    # direction, which only affects how the final string is oriented).
    line_tolerance = (
        y_tolerance if extractor.line_dir in ("ttb", "btt") else x_tolerance
    )

    lines = cluster_objects(words, line_cluster_key, line_tolerance)

    text = "\n".join(" ".join(word["text"] for word in line) for line in lines)

    # No char objects are attached here; the TextMap is used only to render
    # the string in the requested directions.
    return TextMap(
        [(char, None) for char in text],
        line_dir_render=line_dir_render,
        char_dir_render=char_dir_render,
    ).as_string
769
770
def collate_line(
    line_chars: T_obj_list,
    tolerance: T_num = DEFAULT_X_TOLERANCE,
) -> str:
    """
    Join the chars of one line, ordered by x0, into a string.

    A single space is inserted before any char whose x0 lies more than
    `tolerance` beyond the previous char's x1.
    """
    pieces: List[str] = []
    prev_x1 = None
    for char in sorted(line_chars, key=itemgetter("x0")):
        gap_exceeds_tolerance = (
            prev_x1 is not None and char["x0"] > (prev_x1 + tolerance)
        )
        if gap_exceeds_tolerance:
            pieces.append(" ")
        prev_x1 = char["x1"]
        pieces.append(char["text"])
    return "".join(pieces)
783
784
def extract_text_simple(
    chars: T_obj_list,
    x_tolerance: T_num = DEFAULT_X_TOLERANCE,
    y_tolerance: T_num = DEFAULT_Y_TOLERANCE,
) -> str:
    """
    Extract text by clustering chars into lines on "doctop" (within
    `y_tolerance`) and collating each line with `collate_line` (using
    `x_tolerance`). Lines are joined with newlines.
    """
    line_clusters = cluster_objects(chars, itemgetter("doctop"), y_tolerance)
    collated_lines = [collate_line(line, x_tolerance) for line in line_clusters]
    return "\n".join(collated_lines)
792
793
def dedupe_chars(
    chars: T_obj_list,
    tolerance: T_num = 1,
    extra_attrs: Optional[Tuple[str, ...]] = ("fontname", "size"),
) -> T_obj_list:
    """
    Removes duplicate chars — those sharing the same text and positioning
    (within `tolerance`) as other characters in the set. Use extra_attrs to
    be more restrictive with the properties shared by the matching chars.

    Chars are considered duplicates only if they agree on "upright", "text"
    and every attribute in `extra_attrs`, and fall into the same "doctop"
    and "x0" clusters. One char is kept per cluster (the first when ordered
    by (doctop, x0)), and the survivors are returned in the order in which
    they appear in `chars`.
    """
    key = itemgetter(*("upright", "text"), *(extra_attrs or tuple()))
    pos_key = itemgetter("doctop", "x0")

    def yield_unique_chars(chars: T_obj_list) -> Generator[T_obj, None, None]:
        # groupby only merges adjacent items, hence the sort on the same key.
        sorted_chars = sorted(chars, key=key)
        for grp, grp_chars in itertools.groupby(sorted_chars, key=key):
            for y_cluster in cluster_objects(
                list(grp_chars), itemgetter("doctop"), tolerance
            ):
                for x_cluster in cluster_objects(
                    y_cluster, itemgetter("x0"), tolerance
                ):
                    yield sorted(x_cluster, key=pos_key)[0]

    # Record each object's first position once, keyed by identity, so that
    # restoring the input order does not need a linear, equality-based
    # `chars.index` scan per surviving char (quadratic on large pages).
    first_position: Dict[int, int] = {}
    for position, char in enumerate(chars):
        first_position.setdefault(id(char), position)

    def original_position(char: T_obj) -> int:
        try:
            return first_position[id(char)]
        except KeyError:
            # Not one of the input objects (e.g. a copy): fall back to the
            # equality-based lookup.
            return chars.index(char)

    return sorted(yield_unique_chars(chars), key=original_position)