Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfplumber/container.py: 60%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import csv
2import json
3from io import StringIO
4from itertools import chain
5from typing import Any, Dict, List, Optional, Set, TextIO
7from . import utils
8from ._typing import T_obj, T_obj_list
9from .convert import CSV_COLS_REQUIRED, CSV_COLS_TO_PREPEND, Serializer
class Container:
    """Common base for anything that exposes objects grouped by type name.

    Subclasses are expected to provide ``pages``, ``objects`` and ``to_dict``;
    the accessors, edge helpers and exporters below are all built on those.
    """

    # Attribute names that ``flush_cache`` removes when called without arguments.
    cached_properties = ["_rect_edges", "_curve_edges", "_edges", "_objects"]

    @property
    def pages(self) -> Optional[List[Any]]:  # pragma: nocover
        """Return the contained pages, or ``None``; must be overridden."""
        raise NotImplementedError

    @property
    def objects(self) -> Dict[str, T_obj_list]:  # pragma: nocover
        """Return a mapping of object-type name to object list; must be overridden."""
        raise NotImplementedError

    def to_dict(
        self, object_types: Optional[List[str]] = None
    ) -> Dict[str, Any]:  # pragma: nocover
        """Return a dict representation of this container; must be overridden."""
        raise NotImplementedError

    def flush_cache(self, properties: Optional[List[str]] = None) -> None:
        """Delete cached attributes from this instance.

        Args:
            properties: Attribute names to remove. When ``None``, the names in
                ``cached_properties`` are used. Names that are not currently
                set on the instance are skipped silently.
        """
        names = self.cached_properties if properties is None else properties
        for name in names:
            if hasattr(self, name):
                delattr(self, name)

    @property
    def rects(self) -> T_obj_list:
        """Objects stored under ``"rect"`` (empty list if absent)."""
        return self.objects.get("rect", [])

    @property
    def lines(self) -> T_obj_list:
        """Objects stored under ``"line"`` (empty list if absent)."""
        return self.objects.get("line", [])

    @property
    def curves(self) -> T_obj_list:
        """Objects stored under ``"curve"`` (empty list if absent)."""
        return self.objects.get("curve", [])

    @property
    def images(self) -> T_obj_list:
        """Objects stored under ``"image"`` (empty list if absent)."""
        return self.objects.get("image", [])

    @property
    def chars(self) -> T_obj_list:
        """Objects stored under ``"char"`` (empty list if absent)."""
        return self.objects.get("char", [])

    @property
    def textboxverticals(self) -> T_obj_list:
        """Objects stored under ``"textboxvertical"`` (empty list if absent)."""
        return self.objects.get("textboxvertical", [])

    @property
    def textboxhorizontals(self) -> T_obj_list:
        """Objects stored under ``"textboxhorizontal"`` (empty list if absent)."""
        return self.objects.get("textboxhorizontal", [])

    @property
    def textlineverticals(self) -> T_obj_list:
        """Objects stored under ``"textlinevertical"`` (empty list if absent)."""
        return self.objects.get("textlinevertical", [])

    @property
    def textlinehorizontals(self) -> T_obj_list:
        """Objects stored under ``"textlinehorizontal"`` (empty list if absent)."""
        return self.objects.get("textlinehorizontal", [])

    @property
    def rect_edges(self) -> T_obj_list:
        """Edges produced by ``utils.rect_to_edges`` for every rect, flattened.

        The result is stored on ``_rect_edges`` and reused until flushed.
        """
        if hasattr(self, "_rect_edges"):
            return self._rect_edges
        self._rect_edges: T_obj_list = list(
            chain.from_iterable(utils.rect_to_edges(r) for r in self.rects)
        )
        return self._rect_edges

    @property
    def curve_edges(self) -> T_obj_list:
        """Edges produced by ``utils.curve_to_edges`` for every curve, flattened.

        The result is stored on ``_curve_edges`` and reused until flushed.
        """
        if hasattr(self, "_curve_edges"):
            return self._curve_edges
        self._curve_edges: T_obj_list = list(
            chain.from_iterable(utils.curve_to_edges(c) for c in self.curves)
        )
        return self._curve_edges

    @property
    def edges(self) -> T_obj_list:
        """All edges: line edges, then rect edges, then curve edges.

        The combined list is stored on ``_edges`` and reused until flushed.
        """
        if hasattr(self, "_edges"):
            return self._edges
        line_edges = [utils.line_to_edge(ln) for ln in self.lines]
        self._edges: T_obj_list = line_edges + self.rect_edges + self.curve_edges
        return self._edges

    @property
    def horizontal_edges(self) -> T_obj_list:
        """Edges whose ``"orientation"`` equals ``"h"``."""
        return [e for e in self.edges if e["orientation"] == "h"]

    @property
    def vertical_edges(self) -> T_obj_list:
        """Edges whose ``"orientation"`` equals ``"v"``."""
        return [e for e in self.edges if e["orientation"] == "v"]

    def to_json(
        self,
        stream: Optional[TextIO] = None,
        object_types: Optional[List[str]] = None,
        include_attrs: Optional[List[str]] = None,
        exclude_attrs: Optional[List[str]] = None,
        precision: Optional[int] = None,
        indent: Optional[int] = None,
    ) -> Optional[str]:
        """Serialize ``to_dict(object_types)`` as JSON.

        Args:
            stream: Text stream to write to. When ``None`` the JSON text is
                returned instead.
            object_types: Forwarded to ``to_dict``.
            include_attrs: Forwarded to ``Serializer``.
            exclude_attrs: Forwarded to ``Serializer``.
            precision: Forwarded to ``Serializer``.
            indent: Forwarded to ``json.dumps`` / ``json.dump``.

        Returns:
            The JSON string when ``stream`` is ``None``; otherwise ``None``.
        """
        data = self.to_dict(object_types)

        serialized = Serializer(
            precision=precision,
            include_attrs=include_attrs,
            exclude_attrs=exclude_attrs,
        ).serialize(data)

        if stream is None:
            return json.dumps(serialized, indent=indent)

        json.dump(serialized, stream, indent=indent)
        return None

    def to_csv(
        self,
        stream: Optional[TextIO] = None,
        object_types: Optional[List[str]] = None,
        precision: Optional[int] = None,
        include_attrs: Optional[List[str]] = None,
        exclude_attrs: Optional[List[str]] = None,
    ) -> Optional[str]:
        """Write the objects of every page as CSV rows.

        Args:
            stream: Text stream to write to. When ``None`` an in-memory buffer
                is used and its contents are returned.
            object_types: Type names to export. When ``None``, every key of
                ``self.objects`` plus ``"annot"`` is used. For each name ``t``
                the attribute ``t + "s"`` is read from each page.
            precision: Forwarded to ``Serializer``.
            include_attrs: Forwarded to ``Serializer``.
            exclude_attrs: Forwarded to ``Serializer``.

        Returns:
            The CSV text when ``stream`` is ``None``; otherwise ``None``.
        """
        to_string = stream is None
        if stream is None:
            stream = StringIO()

        if object_types is None:
            object_types = list(self.objects.keys()) + ["annot"]

        serialized = []
        fields: Set[str] = set()

        # A container without pages is exported as if it were a single page.
        pages = [self] if self.pages is None else self.pages

        serializer = Serializer(
            precision=precision,
            include_attrs=include_attrs,
            exclude_attrs=exclude_attrs,
        )
        for page in pages:
            for t in object_types:
                objs = getattr(page, t + "s")
                if not objs:
                    continue
                serialized.extend(serializer.serialize(objs))
                # Column names come from the first object of each type only;
                # keys whose value is exactly a dict are left out.
                fields.update(k for k, v in objs[0].items() if type(v) is not dict)

        # Required columns first, then the "prepend" columns, then the rest
        # alphabetically; only the non-required ones pass through attr_filter.
        reserved = set(CSV_COLS_REQUIRED + CSV_COLS_TO_PREPEND)
        non_req_cols = CSV_COLS_TO_PREPEND + sorted(fields - reserved)
        cols = CSV_COLS_REQUIRED + list(filter(serializer.attr_filter, non_req_cols))

        w = csv.DictWriter(
            stream,
            fieldnames=cols,
            extrasaction="ignore",
            quoting=csv.QUOTE_MINIMAL,
            escapechar="\\",
        )
        w.writeheader()
        w.writerows(serialized)

        if not to_string:
            return None

        stream.seek(0)
        return stream.read()