Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfplumber/container.py: 60%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

109 statements  

import csv
import json
from io import StringIO
from itertools import chain
from typing import Any, Dict, List, Optional, Set, TextIO

from . import utils
from ._typing import T_obj, T_obj_list
from .convert import CSV_COLS_REQUIRED, CSV_COLS_TO_PREPEND, Serializer



class Container(object):
    """Base class providing per-type accessors, edge helpers and exporters.

    Subclasses must implement ``pages``, ``objects`` and ``to_dict``; every
    other member of this class is derived from those three.
    """

    # Attribute names removed by ``flush_cache()`` when no explicit list is
    # passed.
    # NOTE(review): "_objects" is never assigned in this class -- presumably a
    # subclass caches its ``objects`` mapping under that name; confirm.
    cached_properties = ["_rect_edges", "_curve_edges", "_edges", "_objects"]

    @property
    def pages(self) -> Optional[List[Any]]:  # pragma: nocover
        """Return the pages held by this container.

        ``to_csv()`` treats a ``None`` value as "this container is itself the
        only page to export".

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    @property
    def objects(self) -> Dict[str, T_obj_list]:  # pragma: nocover
        """Return the mapping from object-type name to the objects of that type.

        The accessors below read the keys ``"rect"``, ``"line"``, ``"curve"``,
        ``"image"``, ``"char"``, ``"textboxvertical"``, ``"textboxhorizontal"``,
        ``"textlinevertical"`` and ``"textlinehorizontal"`` from it.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def to_dict(
        self, object_types: Optional[List[str]] = None
    ) -> Dict[str, Any]:  # pragma: nocover
        """Return a dict representation of this container (input to ``to_json``).

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def flush_cache(self, properties: Optional[List[str]] = None) -> None:
        """Delete cached attributes so they are recomputed on next access.

        Args:
            properties: Attribute names to delete. When ``None``, the names in
                ``cached_properties`` are used. Names that are not currently
                set on the instance are skipped.
        """
        names = self.cached_properties if properties is None else properties
        for name in names:
            if hasattr(self, name):
                delattr(self, name)

    @property
    def rects(self) -> T_obj_list:
        """Return ``objects["rect"]``, or an empty list if the key is absent."""
        return self.objects.get("rect", [])

    @property
    def lines(self) -> T_obj_list:
        """Return ``objects["line"]``, or an empty list if the key is absent."""
        return self.objects.get("line", [])

    @property
    def curves(self) -> T_obj_list:
        """Return ``objects["curve"]``, or an empty list if the key is absent."""
        return self.objects.get("curve", [])

    @property
    def images(self) -> T_obj_list:
        """Return ``objects["image"]``, or an empty list if the key is absent."""
        return self.objects.get("image", [])

    @property
    def chars(self) -> T_obj_list:
        """Return ``objects["char"]``, or an empty list if the key is absent."""
        return self.objects.get("char", [])

    @property
    def textboxverticals(self) -> T_obj_list:
        """Return ``objects["textboxvertical"]``, or an empty list if absent."""
        return self.objects.get("textboxvertical", [])

    @property
    def textboxhorizontals(self) -> T_obj_list:
        """Return ``objects["textboxhorizontal"]``, or an empty list if absent."""
        return self.objects.get("textboxhorizontal", [])

    @property
    def textlineverticals(self) -> T_obj_list:
        """Return ``objects["textlinevertical"]``, or an empty list if absent."""
        return self.objects.get("textlinevertical", [])

    @property
    def textlinehorizontals(self) -> T_obj_list:
        """Return ``objects["textlinehorizontal"]``, or an empty list if absent."""
        return self.objects.get("textlinehorizontal", [])

    @property
    def rect_edges(self) -> T_obj_list:
        """Return the edges of every rect, flattened into a single list.

        Each rect is converted with ``utils.rect_to_edges``. The result is
        cached on ``_rect_edges`` until ``flush_cache()`` removes it.
        """
        if not hasattr(self, "_rect_edges"):
            per_rect = (utils.rect_to_edges(rect) for rect in self.rects)
            self._rect_edges: T_obj_list = list(chain.from_iterable(per_rect))
        return self._rect_edges

    @property
    def curve_edges(self) -> T_obj_list:
        """Return the edges of every curve, flattened into a single list.

        Each curve is converted with ``utils.curve_to_edges``. The result is
        cached on ``_curve_edges`` until ``flush_cache()`` removes it.
        """
        if not hasattr(self, "_curve_edges"):
            per_curve = (utils.curve_to_edges(curve) for curve in self.curves)
            self._curve_edges: T_obj_list = list(chain.from_iterable(per_curve))
        return self._curve_edges

    @property
    def edges(self) -> T_obj_list:
        """Return line edges, then rect edges, then curve edges, as one list.

        Lines are converted with ``utils.line_to_edge``. The combined list is
        cached on ``_edges`` until ``flush_cache()`` removes it.
        """
        if not hasattr(self, "_edges"):
            line_edges = [utils.line_to_edge(line) for line in self.lines]
            self._edges: T_obj_list = line_edges + self.rect_edges + self.curve_edges
        return self._edges

    @property
    def horizontal_edges(self) -> T_obj_list:
        """Return the edges whose ``"orientation"`` value equals ``"h"``."""

        def is_horizontal(edge: T_obj) -> bool:
            return bool(edge["orientation"] == "h")

        return [edge for edge in self.edges if is_horizontal(edge)]

    @property
    def vertical_edges(self) -> T_obj_list:
        """Return the edges whose ``"orientation"`` value equals ``"v"``."""

        def is_vertical(edge: T_obj) -> bool:
            return bool(edge["orientation"] == "v")

        return [edge for edge in self.edges if is_vertical(edge)]

    def to_json(
        self,
        stream: Optional[TextIO] = None,
        object_types: Optional[List[str]] = None,
        include_attrs: Optional[List[str]] = None,
        exclude_attrs: Optional[List[str]] = None,
        precision: Optional[int] = None,
        indent: Optional[int] = None,
    ) -> Optional[str]:
        """Serialize ``to_dict(object_types)`` as JSON.

        Args:
            stream: Text stream to write to. When ``None`` the JSON text is
                returned instead.
            object_types: Passed through to ``to_dict``.
            include_attrs: Passed through to ``Serializer``.
            exclude_attrs: Passed through to ``Serializer``.
            precision: Passed through to ``Serializer``.
            indent: Passed through to ``json.dumps`` / ``json.dump``.

        Returns:
            The JSON text when ``stream`` is ``None``; otherwise ``None``.
        """
        data = self.to_dict(object_types)

        serialized = Serializer(
            precision=precision,
            include_attrs=include_attrs,
            exclude_attrs=exclude_attrs,
        ).serialize(data)

        if stream is None:
            return json.dumps(serialized, indent=indent)

        json.dump(serialized, stream, indent=indent)
        return None

    def to_csv(
        self,
        stream: Optional[TextIO] = None,
        object_types: Optional[List[str]] = None,
        precision: Optional[int] = None,
        include_attrs: Optional[List[str]] = None,
        exclude_attrs: Optional[List[str]] = None,
    ) -> Optional[str]:
        """Write one CSV row per object, across all pages and object types.

        Args:
            stream: Text stream to write to. When ``None`` the CSV text is
                returned instead.
            object_types: Type names to export; for each name ``t`` the
                attribute ``t + "s"`` is read from every page. Defaults to
                every key of ``self.objects`` plus ``"annot"``.
            precision: Passed through to ``Serializer``.
            include_attrs: Passed through to ``Serializer``.
            exclude_attrs: Passed through to ``Serializer``.

        Returns:
            The CSV text when ``stream`` is ``None``; otherwise ``None``.
        """
        # Write into an in-memory buffer when the caller supplied no stream.
        out: TextIO = StringIO() if stream is None else stream

        if object_types is None:
            object_types = list(self.objects.keys()) + ["annot"]

        # A container without pages is exported as if it were a single page.
        pages = [self] if self.pages is None else self.pages

        serializer = Serializer(
            precision=precision,
            include_attrs=include_attrs,
            exclude_attrs=exclude_attrs,
        )

        serialized: List[Any] = []
        fields: Set[str] = set()
        for page in pages:
            for object_type in object_types:
                objs = getattr(page, object_type + "s")
                if not objs:
                    continue
                serialized += serializer.serialize(objs)
                # Columns are discovered from the first object of each
                # non-empty group only; keys holding a plain dict are not
                # offered as columns.
                fields.update(k for k, v in objs[0].items() if type(v) is not dict)

        # Column order: required columns, then the "prepend" columns, then
        # every other discovered field alphabetically. Only the non-required
        # columns pass through the serializer's attribute filter.
        reserved = set(CSV_COLS_REQUIRED + CSV_COLS_TO_PREPEND)
        non_req_cols = CSV_COLS_TO_PREPEND + sorted(fields - reserved)
        cols = CSV_COLS_REQUIRED + list(filter(serializer.attr_filter, non_req_cols))

        writer = csv.DictWriter(
            out,
            fieldnames=cols,
            # Row keys that did not become a column are dropped, not an error.
            extrasaction="ignore",
            quoting=csv.QUOTE_MINIMAL,
            escapechar="\\",
        )
        writer.writeheader()
        writer.writerows(serialized)

        if stream is not None:
            return None

        out.seek(0)
        return out.read()