Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfplumber/structure.py: 15%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

310 statements  

1import itertools 

2import logging 

3import re 

4from collections import deque 

5from dataclasses import asdict, dataclass, field 

6from typing import ( 

7 TYPE_CHECKING, 

8 Any, 

9 Callable, 

10 Dict, 

11 Iterable, 

12 Iterator, 

13 List, 

14 Optional, 

15 Pattern, 

16 Tuple, 

17 Union, 

18) 

19 

20from pdfminer.data_structures import NumberTree 

21from pdfminer.pdfparser import PDFParser 

22from pdfminer.pdftypes import PDFObjRef, resolve1 

23from pdfminer.psparser import PSLiteral 

24 

25from ._typing import T_bbox, T_obj 

26from .utils import decode_text, geometry 

27 

28logger = logging.getLogger(__name__) 

29 

30 

31if TYPE_CHECKING: # pragma: nocover 

32 from .page import Page 

33 from .pdf import PDF 

34 

35 

36MatchFunc = Callable[["PDFStructElement"], bool] 

37 

38 

def _find_all(
    elements: Iterable["PDFStructElement"],
    matcher: Union[str, Pattern[str], MatchFunc],
) -> Iterator["PDFStructElement"]:
    """
    Depth-first search shared by `find_all()` on trees and elements.

    Yields, in pre-order, each of `elements` and each of their
    descendants for which the matcher succeeds.  A `str` matcher is
    compared for equality with the element's `type`, a compiled regular
    expression is applied to the `type` with `match()`, and anything
    else is called with the element itself.
    """
    if isinstance(matcher, str):

        def accepts(node: "PDFStructElement") -> bool:
            """Accept elements whose name equals `matcher`."""
            return node.type == matcher

    elif isinstance(matcher, re.Pattern):

        def accepts(node: "PDFStructElement") -> bool:
            """Accept elements whose name matches the regular expression."""
            return matcher.match(node.type)  # type: ignore

    else:
        accepts = matcher  # type: ignore

    # Explicit stack rather than recursion: the element to visit next
    # always sits at the end of the list.
    pending = list(elements)
    pending.reverse()
    while pending:
        node = pending.pop()
        if accepts(node):
            yield node
        # Push the children reversed so the first child is popped first.
        pending.extend(reversed(node.children))

67 

68 

class Findable:
    """Mixin providing `find()` and `find_all()` for any object that
    exposes a `children` list, so the search code is written only once"""

    children: List["PDFStructElement"]

    def find_all(
        self, matcher: Union[str, Pattern[str], MatchFunc]
    ) -> Iterator["PDFStructElement"]:
        """Lazily iterate, depth-first, over the matching descendants.

        `matcher` may be an element name, a compiled regular
        expression, or a function that receives a `PDFStructElement`
        and returns `True` when that element should be included.
        """
        matches = _find_all(self.children, matcher)
        return matches

    def find(
        self, matcher: Union[str, Pattern[str], MatchFunc]
    ) -> Optional["PDFStructElement"]:
        """Return the first matching descendant, or `None` if none match.

        `matcher` may be an element name, a compiled regular
        expression, or a function that receives a `PDFStructElement`
        and returns `True` when that element should be included.
        """
        return next(_find_all(self.children, matcher), None)

99 

100 

@dataclass
class PDFStructElement(Findable):
    """A single node of the parsed structure tree."""

    # Element name (e.g. a tag such as "P"); may be the empty string.
    type: str
    revision: Optional[int]
    id: Optional[str]
    lang: Optional[str]
    alt_text: Optional[str]
    actual_text: Optional[str]
    title: Optional[str]
    # Page the element belongs to, when that is known.
    page_number: Optional[int]
    attributes: Dict[str, Any] = field(default_factory=dict)
    # Marked-content IDs attached directly to this element.
    mcids: List[int] = field(default_factory=list)
    children: List["PDFStructElement"] = field(default_factory=list)

    def __iter__(self) -> Iterator["PDFStructElement"]:
        return iter(self.children)

    def all_mcids(self) -> Iterator[Tuple[Optional[int], int]]:
        """Yield `(page_number, mcid)` for every MCID in this element
        and in all of its descendants.

        The page number is that of the element owning the MCID and may
        be `None`.
        """
        # Pre-order walk starting at this element, so that MCIDs come
        # out in the same order as the elements that own them.
        stack = [self]
        while stack:
            node = stack.pop()
            for mcid in node.mcids:
                yield node.page_number, mcid
            stack.extend(reversed(node.children))

    def to_dict(self) -> Dict[str, Any]:
        """Return a dict representation with empty entries removed.

        Entries whose value is `None`, an empty list or an empty dict
        are dropped, both here and in every nested child.
        """
        result = asdict(self)
        # The visiting order is irrelevant when pruning, so a plain
        # list serves as the worklist.
        todo = [result]
        while todo:
            node = todo.pop()
            for key, value in list(node.items()):
                if value is None or value == [] or value == {}:
                    del node[key]
            if "children" in node:
                todo.extend(node["children"])
        return result

145 

146 

class StructTreeMissing(ValueError):
    """Raised when a structure tree is requested for a PDF that has none."""

149 

150 

class PDFStructTree(Findable):
    """Parse the structure tree of a PDF.

    The constructor takes a `pdfplumber.PDF` and optionally a
    `pdfplumber.Page`.  To avoid creating the entire tree for a large
    document it is recommended to provide a page.

    This class creates a representation of the portion of the
    structure tree that reaches marked content sections, either for a
    single page, or for the whole document.  Note that this is slightly
    different from the behaviour of other PDF libraries which will
    also include structure elements with no content.

    If the PDF has no structure, the constructor will raise
    `StructTreeMissing`.

    """

    page: Optional["Page"]

    def __init__(self, doc: "PDF", page: Optional["Page"] = None):
        """Build the tree for `page`, or for all of `doc` if no page is given.

        Raises `StructTreeMissing` if the document catalog has no
        `StructTreeRoot` entry.
        """
        self.doc = doc.doc
        if "StructTreeRoot" not in self.doc.catalog:
            raise StructTreeMissing("PDF has no structure")
        self.root = resolve1(self.doc.catalog["StructTreeRoot"])
        self.role_map = resolve1(self.root.get("RoleMap", {}))
        self.class_map = resolve1(self.root.get("ClassMap", {}))
        self.children: List[PDFStructElement] = []

        # If we have a specific page then we will work backwards from
        # its ParentTree - this is because structure elements could
        # span multiple pages, and the "Pg" attribute is *optional*,
        # so this is the approved way to get a page's structure...
        if page is not None:
            self.page = page
            self.pages = {page.page_number: page}
            self.page_dict = None
            # ...EXCEPT that the ParentTree is sometimes missing, in which
            # case we fall back to the non-approved way.
            parent_tree_obj = self.root.get("ParentTree")
            if parent_tree_obj is None:
                self._parse_struct_tree()
            else:
                parent_tree = NumberTree(parent_tree_obj)
                # If there is no marked content in the structure tree for
                # this page (which can happen even when there is a
                # structure tree) then there is no `StructParents`.
                # Note however that if there are XObjects in a page,
                # *they* may have `StructParent` (not `StructParents`)
                if "StructParents" not in self.page.page_obj.attrs:
                    return
                parent_id = resolve1(self.page.page_obj.attrs["StructParents"])
                # NumberTree should have a `get` method like it does in pdf.js...
                # Supplying a default keeps a missing key from leaking
                # `StopIteration` out of the constructor.
                parent_ref = next(
                    (array for num, array in parent_tree.values if num == parent_id),
                    None,
                )
                parent_array = resolve1(parent_ref)
                if not isinstance(parent_array, list):
                    # Inconsistent file: the page claims a parent tree
                    # entry that is absent or is not an array.  Treat it
                    # like a page without marked content.
                    logger.warning(
                        "No usable parent tree entry for StructParents %s",
                        parent_id,
                    )
                    return
                self._parse_parent_tree(parent_array)
        else:
            self.page = None
            # Overhead of creating pages shouldn't be too bad we hope!
            self.pages = {page.page_number: page for page in doc.pages}
            self.page_dict = {
                page.page_obj.pageid: page.page_number for page in self.pages.values()
            }
            self._parse_struct_tree()

    def _root_kids(self) -> List[Any]:
        """Return the immediate children ("K") of the structure tree root.

        The entry is optional, and may be a single object rather than
        an array (it's in the spec), so the result is normalised to a
        list; a missing, null or otherwise unusable entry gives `[]`.
        """
        if "K" not in self.root:
            return []
        kids = resolve1(self.root["K"])
        if isinstance(kids, dict):
            # Keep the unresolved value so that its repr() can be used
            # as the lookup key, as for every other element.
            return [self.root["K"]]
        if isinstance(kids, list):
            return kids
        return []

    def _make_attributes(
        self, obj: Dict[str, Any], revision: Optional[int]
    ) -> Dict[str, Any]:
        """Merge the "C" (class) and "A" (attribute) entries of a
        structure element `obj` into a single flat dict.

        `revision` is the element's revision number, used to decide
        which revision-tagged attribute objects apply.
        """
        attr_obj_list: List[Any] = []
        for key in "C", "A":
            if key not in obj:
                continue
            attr_obj = resolve1(obj[key])
            # It could be a list of attribute objects (why?)
            if isinstance(attr_obj, list):
                attr_obj_list.extend(attr_obj)
            else:
                attr_obj_list.append(attr_obj)
        attr_objs = []
        prev_obj = None
        for aref in attr_obj_list:
            # If we find a revision number, which might "follow the
            # revision object" (the spec is not clear about what this
            # should look like but it implies they are simply adjacent
            # in a flat array), then use it to decide whether to take
            # the previous object...
            if isinstance(aref, int):
                if aref == revision and prev_obj is not None:
                    attr_objs.append(prev_obj)
                prev_obj = None
            else:
                if prev_obj is not None:
                    attr_objs.append(prev_obj)
                prev_obj = resolve1(aref)
        if prev_obj is not None:
            attr_objs.append(prev_obj)
        # Now merge all the attribute objects in the collected to a
        # single set (again, the spec doesn't really explain this but
        # does say that attributes in /A supersede those in /C)
        attr: Dict[str, Any] = {}
        for attr_obj in attr_objs:
            if isinstance(attr_obj, PSLiteral):
                key = decode_text(attr_obj.name)
                if key not in self.class_map:
                    logger.warning("Unknown attribute class %s", key)
                    continue
                # A class may be stored as an indirect reference.
                attr_obj = resolve1(self.class_map[key])
            # A class can map to an array of attribute objects as well
            # as to a single one.
            members = attr_obj if isinstance(attr_obj, list) else [attr_obj]
            for member in members:
                member = resolve1(member)
                for k, v in member.items():
                    if isinstance(v, PSLiteral):
                        attr[k] = decode_text(v.name)
                    else:
                        attr[k] = v
        return attr

    def _make_element(self, obj: Any) -> Tuple[Optional[PDFStructElement], List[Any]]:
        """Create a `PDFStructElement` from a structure element dict.

        Returns the element together with its still unresolved list of
        children ("K"); marked-content IDs and child elements are
        attached to the element later.
        """
        # We hopefully caught these earlier
        assert "MCID" not in obj, "Uncaught MCR: %s" % obj
        assert "Obj" not in obj, "Uncaught OBJR: %s" % obj
        # Get page number if necessary
        page_number = None
        if self.page_dict is not None and "Pg" in obj:
            page_objid = obj["Pg"].objid
            assert page_objid in self.page_dict, "Object on unparsed page: %s" % obj
            page_number = self.page_dict[page_objid]
        obj_tag = ""
        if "S" in obj:
            obj_tag = decode_text(obj["S"].name)
            if obj_tag in self.role_map:
                obj_tag = decode_text(self.role_map[obj_tag].name)
        children = resolve1(obj["K"]) if "K" in obj else []
        if isinstance(children, int):  # ugh... isinstance...
            children = [children]
        elif isinstance(children, dict):  # a single object.. ugh...
            children = [obj["K"]]
        elif not isinstance(children, list):
            # A null (or otherwise unexpected) "K" means no children.
            children = []
        revision = obj.get("R")
        attributes = self._make_attributes(obj, revision)

        def text_entry(key: str) -> Optional[str]:
            """Decode an optional text entry of the element dict."""
            return decode_text(resolve1(obj[key])) if key in obj else None

        element = PDFStructElement(
            type=obj_tag,
            id=text_entry("ID"),
            page_number=page_number,
            revision=revision,
            lang=text_entry("Lang"),
            title=text_entry("T"),
            alt_text=text_entry("Alt"),
            actual_text=text_entry("ActualText"),
            attributes=attributes,
        )
        return element, children

    def _parse_parent_tree(self, parent_array: List[Any]) -> None:
        """Populate the structure tree using the leaves of the parent tree for
        a given page."""
        # First walk backwards from the leaves to the root, tracking references
        d = deque(parent_array)
        s: Dict[str, Any] = {}
        found_root = False
        while d:
            ref = d.popleft()
            # In the case where an MCID is not associated with any
            # structure, there will be a "null" in the parent tree.
            # Depending on where it was parsed it shows up either as
            # `None` or as the `null` keyword.
            if ref is None or ref == PDFParser.KEYWORD_NULL:
                continue
            if repr(ref) in s:
                continue
            obj = resolve1(ref)
            if obj is None:
                # Dangling reference: nothing to record.
                continue
            # This is required! It's in the spec!
            if "Type" in obj and decode_text(obj["Type"].name) == "StructTreeRoot":
                found_root = True
            else:
                # We hope that these are actual elements and not
                # references or marked-content sections...
                element, children = self._make_element(obj)
                # We have no page tree so we assume this page was parsed
                assert element is not None
                s[repr(ref)] = element, children
                d.append(obj["P"])
        if not s:
            # Nothing on the page is attached to a structure element
            # (e.g. the parent array holds only nulls): empty tree.
            return
        # If we didn't reach the root something is quite wrong!
        assert found_root
        self._resolve_children(s)

    def on_parsed_page(self, obj: Dict[str, Any]) -> bool:
        """Tell whether `obj` belongs to a page that is being parsed.

        Objects without a "Pg" entry are always accepted.
        """
        if "Pg" not in obj:
            return True
        page_objid = obj["Pg"].objid
        if self.page_dict is not None:
            return page_objid in self.page_dict
        if self.page is not None:
            return bool(page_objid == self.page.page_obj.pageid)
        return True

    def _parse_struct_tree(self) -> None:
        """Populate the structure tree starting from the root, skipping
        unparsed pages and empty elements."""
        root = self._root_kids()
        d = deque(root)
        s: Dict[str, Any] = {}
        while d:
            ref = d.popleft()
            # In case the tree is actually a DAG and not a tree...
            if repr(ref) in s:  # pragma: nocover (shouldn't happen)
                continue
            obj = resolve1(ref)
            # Deref top-level OBJR skipping refs to unparsed pages
            if isinstance(obj, dict) and "Obj" in obj:
                if not self.on_parsed_page(obj):
                    continue
                ref = obj["Obj"]
                obj = resolve1(ref)
            if obj is None:
                # Dangling reference: there is no element to build.
                continue
            element, children = self._make_element(obj)
            # Similar to above, delay resolving the children to avoid
            # tree-recursion.
            s[repr(ref)] = element, children
            for child in children:
                obj = resolve1(child)
                if isinstance(obj, dict):
                    if not self.on_parsed_page(obj):
                        continue
                    if "Obj" in obj:
                        child = obj["Obj"]
                    elif "MCID" in obj:
                        continue
                if isinstance(child, PDFObjRef):
                    d.append(child)

        # Traverse depth-first, removing empty elements (unsure how to
        # do this non-recursively)
        def prune(elements: List[Any]) -> List[Any]:
            next_elements = []
            for ref in elements:
                obj = resolve1(ref)
                if isinstance(ref, int):
                    next_elements.append(ref)
                    continue
                elif isinstance(obj, dict):
                    if not self.on_parsed_page(obj):
                        continue
                    if "MCID" in obj:
                        next_elements.append(obj["MCID"])
                        continue
                    elif "Obj" in obj:
                        ref = obj["Obj"]
                key = repr(ref)
                if key not in s:
                    # Never recorded (not an element reference), or
                    # already pruned through another parent.
                    continue
                element, children = s[key]
                children = prune(children)
                # See assertions below
                if element is None or not children:
                    del s[key]
                else:
                    s[key] = element, children
                    next_elements.append(ref)
            return next_elements

        prune(root)
        self._resolve_children(s)

    def _resolve_children(self, seen: Dict[str, Any]) -> None:
        """Resolve children starting from the tree root based on references we
        saw when traversing the structure tree.

        `seen` maps `repr()` of an element reference to a tuple of the
        element and its list of children.  Each element is attached to
        at most one parent, so the result is a tree even if the file
        contains shared or cyclic references.
        """
        root = self._root_kids()
        self.children = []
        # Keys of the elements that already have a place in the tree.
        attached = set()
        # Create top-level self.children
        parsed_root = []
        for ref in root:
            obj = resolve1(ref)
            if isinstance(obj, dict) and "Obj" in obj:
                if not self.on_parsed_page(obj):
                    continue
                ref = obj["Obj"]
            key = repr(ref)
            if key in seen and key not in attached:
                attached.add(key)
                parsed_root.append(ref)
        d = deque(parsed_root)
        while d:
            ref = d.popleft()
            element, children = seen[repr(ref)]
            assert element is not None, "Unparsed element"
            for child in children:
                obj = resolve1(child)
                if isinstance(obj, int):
                    element.mcids.append(obj)
                elif isinstance(obj, dict):
                    # Skip out-of-page MCIDS and OBJRs
                    if not self.on_parsed_page(obj):
                        continue
                    if "MCID" in obj:
                        element.mcids.append(obj["MCID"])
                    elif "Obj" in obj:
                        child = obj["Obj"]
                # NOTE: if, not elif, in case of OBJR above
                if isinstance(child, PDFObjRef):
                    key = repr(child)
                    if key in attached:
                        # Expanding it again would duplicate its MCIDs,
                        # or never terminate if the references cycle.
                        continue
                    child_element, _ = seen.get(key, (None, None))
                    if child_element is not None:
                        attached.add(key)
                        element.children.append(child_element)
                        d.append(child)
        self.children = [seen[repr(ref)][0] for ref in parsed_root]

    def __iter__(self) -> Iterator[PDFStructElement]:
        return iter(self.children)

    def element_bbox(self, el: PDFStructElement) -> T_bbox:
        """Get the bounding box for an element for visual debugging.

        The element's "BBox" attribute is used when it has one and its
        page is known; otherwise the box is computed from the page
        objects carrying the element's MCIDs.  Raises `IndexError` if
        the element was cropped away or no such objects are found.
        """
        page = None
        if self.page is not None:
            page = self.page
        elif el.page_number is not None:
            page = self.pages[el.page_number]
        bbox = el.attributes.get("BBox", None)
        if page is not None and bbox is not None:
            from .page import CroppedPage, _invert_box, _normalize_box

            # Use secret knowledge of CroppedPage (cannot use
            # page.height because it is the *cropped* dimension, but
            # cropping does not actually translate coordinates)
            bbox = _invert_box(
                _normalize_box(bbox), page.mediabox[3] - page.mediabox[1]
            )
            # Use more secret knowledge of CroppedPage
            if isinstance(page, CroppedPage):
                rect = geometry.bbox_to_rect(bbox)
                rects = page._crop_fn([rect])
                if not rects:
                    raise IndexError("Element no longer on page")
                return geometry.obj_to_bbox(rects[0])
            # Not sure why mypy complains here
            return bbox  # type: ignore

        mcid_objs = []
        for page_number, mcid in el.all_mcids():
            objects: Iterable[T_obj]
            if page_number is None:
                if page is not None:
                    objects = itertools.chain.from_iterable(page.objects.values())
                else:
                    objects = []  # pragma: nocover
            else:
                objects = itertools.chain.from_iterable(
                    self.pages[page_number].objects.values()
                )
            # `.get` so that an object lacking the key counts as a
            # non-match instead of raising.
            mcid_objs.extend(c for c in objects if c.get("mcid") == mcid)
        if not mcid_objs:
            raise IndexError("No objects found")  # pragma: nocover
        return geometry.objects_to_bbox(mcid_objs)