Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pdfplumber/structure.py: 15%

1import itertools

2import logging

3import re

4from collections import deque

5from dataclasses import asdict, dataclass, field

6from typing import (

7 TYPE_CHECKING,

8 Any,

9 Callable,

10 Dict,

11 Iterable,

12 Iterator,

13 List,

14 Optional,

15 Pattern,

16 Tuple,

17 Union,

18)

20from pdfminer.data_structures import NumberTree

21from pdfminer.pdfparser import PDFParser

22from pdfminer.pdftypes import PDFObjRef, resolve1

23from pdfminer.psparser import PSLiteral

25from ._typing import T_bbox, T_obj

26from .utils import decode_text, geometry

28logger = logging.getLogger(__name__)

31if TYPE_CHECKING: # pragma: nocover

32 from .page import Page

33 from .pdf import PDF

# Signature of a user-supplied predicate accepted by find() / find_all().
MatchFunc = Callable[["PDFStructElement"], bool]


def _find_all(
    elements: Iterable["PDFStructElement"],
    matcher: Union[str, Pattern[str], MatchFunc],
) -> Iterator["PDFStructElement"]:
    """
    Common code for `find_all()` in trees and elements.

    Lazily yields every element among `elements` and their descendants
    that `matcher` accepts, in depth-first order (an element is visited
    before its children, and children are visited in list order).

    `matcher` is one of:

    - a `str`: matches elements whose `type` is equal to it;
    - a compiled regular expression: matches elements whose `type` the
      pattern's `match()` accepts (i.e. anchored at the start only);
    - any other object: called with each element, truthy means "match".
    """

    def match_tag(x: "PDFStructElement") -> bool:
        """Match an element name."""
        return x.type == matcher

    def match_regex(x: "PDFStructElement") -> bool:
        """Match an element name by regular expression."""
        # Return an actual bool as the annotation promises, rather than
        # leaking the `re.Match` object (or None).
        return matcher.match(x.type) is not None  # type: ignore

    if isinstance(matcher, str):
        match_func = match_tag
    elif isinstance(matcher, re.Pattern):
        match_func = match_regex
    else:
        match_func = matcher  # type: ignore
    d = deque(elements)
    while d:
        el = d.popleft()
        if match_func(el):
            yield el
        # Push the children on the *front* of the queue, keeping their
        # order, so that the traversal is depth-first.
        d.extendleft(reversed(el.children))

class Findable:
    """Mixin providing `find()` and `find_all()` over `self.children`,
    shared by the structure tree and its elements."""

    # Direct children searched (together with their descendants)
    children: List["PDFStructElement"]

    def find_all(
        self, matcher: Union[str, Pattern[str], MatchFunc]
    ) -> Iterator["PDFStructElement"]:
        """Iterate depth-first over matching elements in subtree.

        `matcher` may be an element name, a regular expression, or a
        function that takes a `PDFStructElement` and returns `True`
        when the element matches.
        """
        matches = _find_all(self.children, matcher)
        return matches

    def find(
        self, matcher: Union[str, Pattern[str], MatchFunc]
    ) -> Optional["PDFStructElement"]:
        """Find the first matching element in subtree.

        `matcher` may be an element name, a regular expression, or a
        function that takes a `PDFStructElement` and returns `True`
        when the element matches.  Returns `None` if nothing matches.
        """
        return next(_find_all(self.children, matcher), None)

100

@dataclass
class PDFStructElement(Findable):
    """One node of the structure tree: its type, optional metadata,
    the marked-content IDs it holds directly, and its child elements."""

    type: str
    revision: Optional[int]
    id: Optional[str]
    lang: Optional[str]
    alt_text: Optional[str]
    actual_text: Optional[str]
    title: Optional[str]
    page_number: Optional[int]
    attributes: Dict[str, Any] = field(default_factory=dict)
    mcids: List[int] = field(default_factory=list)
    children: List["PDFStructElement"] = field(default_factory=list)

    def __iter__(self) -> Iterator["PDFStructElement"]:
        """Iterate over the direct children of this element."""
        return iter(self.children)

    def all_mcids(self) -> Iterator[Tuple[Optional[int], int]]:
        """Yield `(page_number, mcid)` for every MCID held by this
        element and by all of its descendants.

        The page number is that of the element holding the MCID and
        may be `None`.
        """
        # Depth-first, this element first, so ordering is preserved
        pending = deque([self])
        while pending:
            node = pending.popleft()
            for mcid in node.mcids:
                yield node.page_number, mcid
            pending.extendleft(reversed(node.children))

    def to_dict(self) -> Dict[str, Any]:
        """Return a compacted dict representation.

        Keys whose value is `None`, an empty list or an empty dict are
        dropped, at every level of the `children` hierarchy.
        """
        result = asdict(self)
        # Visiting order is irrelevant for pruning, so use a stack
        pending = [result]
        while pending:
            node = pending.pop()
            empty_keys = [
                k for k, v in node.items() if v is None or v == [] or v == {}
            ]
            for k in empty_keys:
                del node[k]
            pending.extend(node.get("children", ()))
        return result

145

146

class StructTreeMissing(ValueError):
    """Raised when a PDF has no structure tree."""

149

150

class PDFStructTree(Findable):
    """Parse the structure tree of a PDF.

    The constructor takes a `pdfplumber.PDF` and optionally a
    `pdfplumber.Page`.  To avoid creating the entire tree for a large
    document it is recommended to provide a page.

    This class creates a representation of the portion of the
    structure tree that reaches marked content sections, either for a
    single page, or for the whole document.  Note that this is slightly
    different from the behaviour of other PDF libraries which will
    also include structure elements with no content.

    If the PDF has no structure, the constructor will raise
    `StructTreeMissing`.  A structure tree root without any kids, or a
    page whose `StructParents` index has no entry in the parent tree,
    produces an empty tree instead of an error.

    """

    page: Optional["Page"]

    def __init__(self, doc: "PDF", page: Optional["Page"] = None):
        """Build the tree for `page` if given, else for all of `doc`.

        Raises `StructTreeMissing` if the document catalog has no
        `StructTreeRoot`.
        """
        self.doc = doc.doc
        if "StructTreeRoot" not in self.doc.catalog:
            raise StructTreeMissing("PDF has no structure")
        self.root = resolve1(self.doc.catalog["StructTreeRoot"])
        self.role_map = resolve1(self.root.get("RoleMap", {}))
        self.class_map = resolve1(self.root.get("ClassMap", {}))
        self.children: List[PDFStructElement] = []

        # If we have a specific page then we will work backwards from
        # its ParentTree - this is because structure elements could
        # span multiple pages, and the "Pg" attribute is *optional*,
        # so this is the approved way to get a page's structure...
        if page is not None:
            self.page = page
            self.pages = {page.page_number: page}
            self.page_dict = None
            # ...EXCEPT that the ParentTree is sometimes missing, in which
            # case we fall back to the non-approved way.
            parent_tree_obj = self.root.get("ParentTree")
            if parent_tree_obj is None:
                self._parse_struct_tree()
            else:
                parent_tree = NumberTree(parent_tree_obj)
                # If there is no marked content in the structure tree for
                # this page (which can happen even when there is a
                # structure tree) then there is no `StructParents`.
                # Note however that if there are XObjects in a page,
                # *they* may have `StructParent` (not `StructParents`)
                if "StructParents" not in self.page.page_obj.attrs:
                    return
                parent_id = self.page.page_obj.attrs["StructParents"]
                # NumberTree should have a `get` method like it does in pdf.js...
                # Use a default so that an index missing from the parent
                # tree does not leak a StopIteration out of the constructor.
                parent_array = next(
                    (array for num, array in parent_tree.values if num == parent_id),
                    None,
                )
                if parent_array is None:
                    logger.warning(
                        "No ParentTree entry for StructParents %r", parent_id
                    )
                    return
                self._parse_parent_tree(resolve1(parent_array))
        else:
            self.page = None
            # Overhead of creating pages shouldn't be too bad we hope!
            self.pages = {p.page_number: p for p in doc.pages}
            self.page_dict = {
                p.page_obj.pageid: p.page_number for p in self.pages.values()
            }
            self._parse_struct_tree()

    def _root_kids(self) -> List[Any]:
        """Return the kids (`K`) of the structure tree root as a list.

        `K` is optional; when it is absent an empty list is returned.
        When it is a single dictionary, the (unresolved) entry is
        wrapped in a one-element list.
        """
        kids_ref = self.root.get("K")
        if kids_ref is None:
            return []
        kids = resolve1(kids_ref)
        if kids is None:
            return []
        # It could just be a single object ... it's in the spec (argh)
        if isinstance(kids, dict):
            return [kids_ref]
        return kids

    def _make_attributes(
        self, obj: Dict[str, Any], revision: Optional[int]
    ) -> Dict[str, Any]:
        """Merge the `C` (class) and `A` (attribute) entries of a
        structure element dictionary into a single attribute dict.

        `revision` is the element's revision number, used to select
        among attribute objects that are followed by a revision number.
        Name values are decoded to `str`; other values are kept as is.
        """
        attr_obj_list = []
        for key in "C", "A":
            if key not in obj:
                continue
            entry = resolve1(obj[key])
            # It could be a list of attribute objects (why?)
            if isinstance(entry, list):
                attr_obj_list.extend(entry)
            else:
                attr_obj_list.append(entry)
        attr_objs = []
        prev_obj = None
        for aref in attr_obj_list:
            # If we find a revision number, which might "follow the
            # revision object" (the spec is not clear about what this
            # should look like but it implies they are simply adjacent
            # in a flat array), then use it to decide whether to take
            # the previous object...
            if isinstance(aref, int):
                if aref == revision and prev_obj is not None:
                    attr_objs.append(prev_obj)
                prev_obj = None
            else:
                if prev_obj is not None:
                    attr_objs.append(prev_obj)
                prev_obj = resolve1(aref)
        if prev_obj is not None:
            attr_objs.append(prev_obj)
        # Now merge all the attribute objects in the collected to a
        # single set (again, the spec doesn't really explain this but
        # does say that attributes in /A supersede those in /C)
        attr: Dict[str, Any] = {}
        for attr_obj in attr_objs:
            if isinstance(attr_obj, PSLiteral):
                key = decode_text(attr_obj.name)
                if key not in self.class_map:
                    logger.warning("Unknown attribute class %s", key)
                    continue
                # The class map entry may be an indirect reference
                attr_obj = resolve1(self.class_map[key])
            for k, v in attr_obj.items():
                if isinstance(v, PSLiteral):
                    attr[k] = decode_text(v.name)
                else:
                    attr[k] = v
        return attr

    def _make_element(self, obj: Any) -> Tuple[Optional[PDFStructElement], List[Any]]:
        """Create a `PDFStructElement` from a structure element
        dictionary.

        Returns the element (with no children or MCIDs attached yet)
        and the list of its raw, unresolved kids.
        """
        # We hopefully caught these earlier
        assert "MCID" not in obj, "Uncaught MCR: %s" % obj
        assert "Obj" not in obj, "Uncaught OBJR: %s" % obj
        # Get page number if necessary
        page_number = None
        if self.page_dict is not None and "Pg" in obj:
            page_objid = obj["Pg"].objid
            assert page_objid in self.page_dict, "Object on unparsed page: %s" % obj
            page_number = self.page_dict[page_objid]
        obj_tag = ""
        if "S" in obj:
            obj_tag = decode_text(obj["S"].name)
            # Map a custom structure type to its role, if one is declared
            if obj_tag in self.role_map:
                obj_tag = decode_text(self.role_map[obj_tag].name)
        children = resolve1(obj["K"]) if "K" in obj else []
        if isinstance(children, int):  # ugh... isinstance...
            children = [children]
        elif isinstance(children, dict):  # a single object.. ugh...
            children = [obj["K"]]
        revision = obj.get("R")
        attributes = self._make_attributes(obj, revision)
        element_id = decode_text(resolve1(obj["ID"])) if "ID" in obj else None
        title = decode_text(resolve1(obj["T"])) if "T" in obj else None
        lang = decode_text(resolve1(obj["Lang"])) if "Lang" in obj else None
        alt_text = decode_text(resolve1(obj["Alt"])) if "Alt" in obj else None
        actual_text = (
            decode_text(resolve1(obj["ActualText"])) if "ActualText" in obj else None
        )
        element = PDFStructElement(
            type=obj_tag,
            id=element_id,
            page_number=page_number,
            revision=revision,
            lang=lang,
            title=title,
            alt_text=alt_text,
            actual_text=actual_text,
            attributes=attributes,
        )
        return element, children

    def _parse_parent_tree(self, parent_array: List[Any]) -> None:
        """Populate the structure tree using the leaves of the parent tree for
        a given page."""
        # First walk backwards from the leaves to the root, tracking references
        d = deque(parent_array)
        s = {}
        found_root = False
        while d:
            ref = d.popleft()
            # In the case where an MCID is not associated with any
            # structure, there will be a "null" in the parent tree.
            # NOTE(review): it may show up either as the `null` keyword
            # or as None, presumably depending on which parser read
            # the array - skip both.
            if ref is None or ref == PDFParser.KEYWORD_NULL:
                continue
            if repr(ref) in s:
                continue
            obj = resolve1(ref)
            # This is required! It's in the spec!
            if "Type" in obj and decode_text(obj["Type"].name) == "StructTreeRoot":
                found_root = True
            else:
                # We hope that these are actual elements and not
                # references or marked-content sections...
                element, children = self._make_element(obj)
                # We have no page tree so we assume this page was parsed
                assert element is not None
                s[repr(ref)] = element, children
                d.append(obj["P"])
        # If we didn't reach the root something is quite wrong!
        assert found_root
        self._resolve_children(s)

    def on_parsed_page(self, obj: Dict[str, Any]) -> bool:
        """Tell whether `obj` belongs to a page we are parsing.

        Objects with no `Pg` entry are always accepted.
        """
        if "Pg" not in obj:
            return True
        page_objid = obj["Pg"].objid
        if self.page_dict is not None:
            return page_objid in self.page_dict
        if self.page is not None:
            return page_objid == self.page.page_obj.pageid
        return True

    def _parse_struct_tree(self) -> None:
        """Populate the structure tree starting from the root, skipping
        unparsed pages and empty elements."""
        root = self._root_kids()
        d = deque(root)
        s = {}
        while d:
            ref = d.popleft()
            # In case the tree is actually a DAG and not a tree...
            if repr(ref) in s:  # pragma: nocover (shouldn't happen)
                continue
            obj = resolve1(ref)
            # Deref top-level OBJR skipping refs to unparsed pages
            if isinstance(obj, dict) and "Obj" in obj:
                if not self.on_parsed_page(obj):
                    continue
                ref = obj["Obj"]
                obj = resolve1(ref)
            element, children = self._make_element(obj)
            # Similar to above, delay resolving the children to avoid
            # tree-recursion.
            s[repr(ref)] = element, children
            for child in children:
                obj = resolve1(child)
                if isinstance(obj, dict):
                    if not self.on_parsed_page(obj):
                        continue
                    if "Obj" in obj:
                        child = obj["Obj"]
                    elif "MCID" in obj:
                        continue
                if isinstance(child, PDFObjRef):
                    d.append(child)

        # Traverse depth-first, removing empty elements (unsure how to
        # do this non-recursively)
        def prune(elements: List[Any]) -> List[Any]:
            next_elements = []
            for ref in elements:
                obj = resolve1(ref)
                if isinstance(ref, int):
                    next_elements.append(ref)
                    continue
                elif isinstance(obj, dict):
                    if not self.on_parsed_page(obj):
                        continue
                    if "MCID" in obj:
                        next_elements.append(obj["MCID"])
                        continue
                    elif "Obj" in obj:
                        ref = obj["Obj"]
                element, children = s[repr(ref)]
                children = prune(children)
                # See assertions below
                if element is None or not children:
                    del s[repr(ref)]
                else:
                    s[repr(ref)] = element, children
                    next_elements.append(ref)
            return next_elements

        prune(root)
        self._resolve_children(s)

    def _resolve_children(self, seen: Dict[str, Any]) -> None:
        """Resolve children starting from the tree root based on references we
        saw when traversing the structure tree.

        `seen` maps `repr(ref)` to an `(element, raw_children)` pair.
        On return `self.children` holds the top-level elements, with
        their `children` and `mcids` lists filled in.
        """
        root = self._root_kids()
        self.children = []
        # Create top-level self.children
        parsed_root = []
        for ref in root:
            obj = resolve1(ref)
            if isinstance(obj, dict) and "Obj" in obj:
                if not self.on_parsed_page(obj):
                    continue
                ref = obj["Obj"]
            if repr(ref) in seen:
                parsed_root.append(ref)
        d = deque(parsed_root)
        while d:
            ref = d.popleft()
            element, children = seen[repr(ref)]
            assert element is not None, "Unparsed element"
            for child in children:
                obj = resolve1(child)
                if isinstance(obj, int):
                    element.mcids.append(obj)
                elif isinstance(obj, dict):
                    # Skip out-of-page MCIDS and OBJRs
                    if not self.on_parsed_page(obj):
                        continue
                    if "MCID" in obj:
                        element.mcids.append(obj["MCID"])
                    elif "Obj" in obj:
                        child = obj["Obj"]
                # NOTE: if, not elif, in case of OBJR above
                if isinstance(child, PDFObjRef):
                    child_element, _ = seen.get(repr(child), (None, None))
                    if child_element is not None:
                        element.children.append(child_element)
                        d.append(child)
        self.children = [seen[repr(ref)][0] for ref in parsed_root]

    def __iter__(self) -> Iterator[PDFStructElement]:
        """Iterate over the top-level elements of the tree."""
        return iter(self.children)

    def element_bbox(self, el: PDFStructElement) -> T_bbox:
        """Get the bounding box for an element for visual debugging.

        Uses the element's `BBox` attribute when it has one and its
        page is known; otherwise the box is computed from the page
        objects whose `mcid` is one of the element's MCIDs.

        Raises `IndexError` if the element's `BBox` does not survive
        the page's crop, or if no object matches any of its MCIDs.
        """
        page = None
        if self.page is not None:
            page = self.page
        elif el.page_number is not None:
            page = self.pages[el.page_number]
        bbox = el.attributes.get("BBox", None)
        if page is not None and bbox is not None:
            from .page import CroppedPage, _invert_box, _normalize_box

            # Use secret knowledge of CroppedPage (cannot use
            # page.height because it is the *cropped* dimension, but
            # cropping does not actually translate coordinates)
            bbox = _invert_box(
                _normalize_box(bbox), page.mediabox[3] - page.mediabox[1]
            )
            # Use more secret knowledge of CroppedPage
            if isinstance(page, CroppedPage):
                rect = geometry.bbox_to_rect(bbox)
                rects = page._crop_fn([rect])
                if not rects:
                    raise IndexError("Element no longer on page")
                return geometry.obj_to_bbox(rects[0])
            else:
                # Not sure why mypy complains here
                return bbox  # type: ignore
        else:
            mcid_objs = []
            for page_number, mcid in el.all_mcids():
                objects: Iterable[T_obj]
                if page_number is None:
                    if page is not None:
                        objects = itertools.chain.from_iterable(page.objects.values())
                    else:
                        objects = []  # pragma: nocover
                else:
                    objects = itertools.chain.from_iterable(
                        self.pages[page_number].objects.values()
                    )
                mcid_objs.extend(c for c in objects if c["mcid"] == mcid)
            if not mcid_objs:
                raise IndexError("No objects found")  # pragma: nocover
            return geometry.objects_to_bbox(mcid_objs)