Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_doc

7# Redistribution and use in source and binary forms, with or without

8# modification, are permitted provided that the following conditions are

9# met:

10#

11# * Redistributions of source code must retain the above copyright notice,

12# this list of conditions and the following disclaimer.

13# * Redistributions in binary form must reproduce the above copyright notice,

14# this list of conditions and the following disclaimer in the documentation

15# and/or other materials provided with the distribution.

16# * The name of the author may not be used to endorse or promote products

17# derived from this software without specific prior written permission.

18#

19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

20# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

21# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

22# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

23# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

24# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

25# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

26# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

27# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

28# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

29# POSSIBILITY OF SUCH DAMAGE.

31import struct

32import zlib

33from abc import abstractmethod

34from datetime import datetime

35from typing import (

36 Any,

37 Dict,

38 Generator,

39 Iterable,

40 Iterator,

41 List,

42 Mapping,

43 Optional,

44 Tuple,

45 Union,

46 cast,

47)

49from ._encryption import Encryption

50from ._page import PageObject, _VirtualList

51from ._page_labels import index2label as page_index2page_label

52from ._utils import (

53 deprecate_with_replacement,

54 logger_warning,

55 parse_iso8824_date,

56)

57from .constants import CatalogAttributes as CA

58from .constants import CatalogDictionary as CD

59from .constants import (

60 CheckboxRadioButtonAttributes,

61 GoToActionArguments,

62 UserAccessPermissions,

63)

64from .constants import Core as CO

65from .constants import DocumentInformationAttributes as DI

66from .constants import FieldDictionaryAttributes as FA

67from .constants import PageAttributes as PG

68from .constants import PagesAttributes as PA

69from .errors import PdfReadError, PyPdfError

70from .generic import (

71 ArrayObject,

72 BooleanObject,

73 ByteStringObject,

74 Destination,

75 DictionaryObject,

76 EncodedStreamObject,

77 Field,

78 Fit,

79 FloatObject,

80 IndirectObject,

81 NameObject,

82 NullObject,

83 NumberObject,

84 PdfObject,

85 TextStringObject,

86 TreeObject,

87 ViewerPreferences,

88 create_string_object,

89 is_null_or_none,

90)

91from .generic._files import EmbeddedFile

92from .types import OutlineType, PagemodeType

93from .xmp import XmpInformation

def convert_to_int(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]:
    """
    Decode a byte string as a big-endian signed 64-bit integer.

    The input is left-padded with eight zero bytes and only the trailing
    eight bytes are decoded, so inputs shorter than eight bytes always give
    a non-negative result.

    Args:
        d: The raw bytes to decode.
        size: The declared width in bytes; it is only compared against the
            maximum of 8 and is not checked against ``len(d)``.

    Returns:
        The decoded integer.

    Raises:
        PdfReadError: If ``size`` is greater than 8.
    """
    if size > 8:
        raise PdfReadError("Invalid size in convert_to_int")
    # Pad on the left, then keep exactly the eight bytes ">q" requires.
    padded = (b"\x00" * 8 + d)[-8:]
    (value,) = struct.unpack(">q", padded)
    return value

102

103

class DocumentInformation(DictionaryObject):
    """
    Dictionary holding the basic metadata entries of a PDF document.

    Instances are made available through
    :py:class:`PdfReader.metadata<pypdf.PdfReader.metadata>`.

    Every textual entry is exposed through *two* properties, e.g. ``author``
    and ``author_raw``. The plain property returns text (or ``None``), which
    makes it suitable for display. The ``*_raw`` property returns the stored
    entry unchanged; that can be a ``ByteStringObject`` when the text
    encoding could not be decoded, so callers of the raw variant have to be
    prepared for it.
    """

    def __init__(self) -> None:
        super().__init__()

    def _get_text(self, key: str) -> Optional[str]:
        """
        Return the entry stored under ``key`` as text.

        A ``TextStringObject`` is returned as is, a ``ByteStringObject`` is
        passed through ``str()``; anything else (including a missing entry)
        gives ``None``.
        """
        value = self.get(key, None)
        if isinstance(value, TextStringObject):
            return value
        return str(value) if isinstance(value, ByteStringObject) else None

    @property
    def title(self) -> Optional[str]:
        """
        Read-only access to the document's title.

        Returns ``None`` when the title entry is missing or falsy. Otherwise
        the text form of the entry is returned, falling back to the resolved
        object of the entry when no (non-empty) text form is available.
        """
        if not self.get(DI.TITLE):
            return None
        return self._get_text(DI.TITLE) or self.get(DI.TITLE).get_object()  # type: ignore

    @property
    def title_raw(self) -> Optional[str]:
        """Unprocessed title entry; may be a ``ByteStringObject``."""
        return self.get(DI.TITLE)

    @property
    def author(self) -> Optional[str]:
        """
        Read-only access to the document's author.

        Returns the author as text, or ``None`` if it is not specified.
        """
        return self._get_text(DI.AUTHOR)

    @property
    def author_raw(self) -> Optional[str]:
        """Unprocessed author entry; may be a ``ByteStringObject``."""
        return self.get(DI.AUTHOR)

    @property
    def subject(self) -> Optional[str]:
        """
        Read-only access to the document's subject.

        Returns the subject as text, or ``None`` if it is not specified.
        """
        return self._get_text(DI.SUBJECT)

    @property
    def subject_raw(self) -> Optional[str]:
        """Unprocessed subject entry; may be a ``ByteStringObject``."""
        return self.get(DI.SUBJECT)

    @property
    def creator(self) -> Optional[str]:
        """
        Read-only access to the document's creator.

        For a document converted to PDF from another format, this names the
        application (e.g. OpenOffice) that produced the original document.
        Returns the creator as text, or ``None`` if it is not specified.
        """
        return self._get_text(DI.CREATOR)

    @property
    def creator_raw(self) -> Optional[str]:
        """Unprocessed creator entry; may be a ``ByteStringObject``."""
        return self.get(DI.CREATOR)

    @property
    def producer(self) -> Optional[str]:
        """
        Read-only access to the document's producer.

        For a document converted to PDF from another format, this names the
        application (for example, macOS Quartz) that performed the
        conversion. Returns the producer as text, or ``None`` if it is not
        specified.
        """
        return self._get_text(DI.PRODUCER)

    @property
    def producer_raw(self) -> Optional[str]:
        """Unprocessed producer entry; may be a ``ByteStringObject``."""
        return self.get(DI.PRODUCER)

    @property
    def creation_date(self) -> Optional[datetime]:
        """
        Read-only access to the document's creation date.

        The text form of the entry is handed to ``parse_iso8824_date``.
        """
        return parse_iso8824_date(self._get_text(DI.CREATION_DATE))

    @property
    def creation_date_raw(self) -> Optional[str]:
        """
        Unprocessed creation date entry; may be a ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the
        suffix is the offset from UTC.
        """
        return self.get(DI.CREATION_DATE)

    @property
    def modification_date(self) -> Optional[datetime]:
        """
        Read-only access to the document's modification date.

        This is the date and time the document was most recently modified;
        the text form of the entry is handed to ``parse_iso8824_date``.
        """
        return parse_iso8824_date(self._get_text(DI.MOD_DATE))

    @property
    def modification_date_raw(self) -> Optional[str]:
        """
        Unprocessed modification date entry; may be a ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the
        suffix is the offset from UTC.
        """
        return self.get(DI.MOD_DATE)

    @property
    def keywords(self) -> Optional[str]:
        """
        Read-only access to the document's keywords.

        Returns the keywords as text, or ``None`` if they are not specified.
        """
        return self._get_text(DI.KEYWORDS)

    @property
    def keywords_raw(self) -> Optional[str]:
        """Unprocessed keywords entry; may be a ``ByteStringObject``."""
        return self.get(DI.KEYWORDS)

262

263

264class PdfDocCommon:

265 """

266 Common functions from PdfWriter and PdfReader objects.

267

268 This root class is strongly abstracted.

269 """

270

271 strict: bool = False # default

272

273 flattened_pages: Optional[List[PageObject]] = None

274

275 _encryption: Optional[Encryption] = None

276

277 _readonly: bool = False

278

    @property
    @abstractmethod
    def root_object(self) -> DictionaryObject:
        """
        Abstract property to be supplied by concrete subclasses.

        Within this class the returned dictionary is used as the document
        catalog: entries such as ``/Pages`` and ``/OpenAction`` are looked
        up in it.
        """
        ...  # pragma: no cover

283

    @property
    @abstractmethod
    def pdf_header(self) -> str:
        """
        Abstract property to be supplied by concrete subclasses.

        NOTE(review): presumably the ``%PDF-x.y`` header line of the
        document; the exact content is not visible here - confirm against
        the implementations.
        """
        ...  # pragma: no cover

288

    @abstractmethod
    def get_object(
        self, indirect_reference: Union[int, IndirectObject]
    ) -> Optional[PdfObject]:
        """
        Abstract method to be supplied by concrete subclasses.

        Args:
            indirect_reference: An integer or an ``IndirectObject``
                (presumably an object number or a reference to resolve -
                confirm against the implementations).

        Returns:
            A ``PdfObject`` or ``None``.
        """
        ...  # pragma: no cover

294

    @abstractmethod
    def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject:
        """
        Abstract method to be supplied by concrete subclasses.

        ``viewer_preferences`` calls it with an object's own indirect
        reference and the object that should take its place.
        NOTE(review): presumably stores ``obj`` under ``indirect`` and
        returns the stored object - confirm against the implementations.
        """
        ...  # pragma: no cover

298

    @property
    @abstractmethod
    def _info(self) -> Optional[DictionaryObject]:
        """
        Abstract property to be supplied by concrete subclasses.

        ``metadata`` copies the returned dictionary into a
        ``DocumentInformation`` and returns ``None`` when this is ``None``.
        """
        ...  # pragma: no cover

303

304 @property

305 def metadata(self) -> Optional[DocumentInformation]:

306 """

307 Retrieve the PDF file's document information dictionary, if it exists.

308

309 Note that some PDF files use metadata streams instead of document

310 information dictionaries, and these metadata streams will not be

311 accessed by this function.

312 """

313 retval = DocumentInformation()

314 if self._info is None:

315 return None

316 retval.update(self._info)

317 return retval

318

    @property
    def xmp_metadata(self) -> Optional[XmpInformation]:
        """
        Placeholder property: the body is empty, so this implementation
        evaluates to ``None``.

        NOTE(review): not marked abstract; presumably overridden by the
        concrete reader/writer classes - confirm.
        """
        ...  # pragma: no cover

322

323 @property

324 def viewer_preferences(self) -> Optional[ViewerPreferences]:

325 """Returns the existing ViewerPreferences as an overloaded dictionary."""

326 o = self.root_object.get(CD.VIEWER_PREFERENCES, None)

327 if o is None:

328 return None

329 o = o.get_object()

330 if not isinstance(o, ViewerPreferences):

331 o = ViewerPreferences(o)

332 if hasattr(o, "indirect_reference") and o.indirect_reference is not None:

333 self._replace_object(o.indirect_reference, o)

334 else:

335 self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = o

336 return o

337

338 def get_num_pages(self) -> int:

339 """

340 Calculate the number of pages in this PDF file.

341

342 Returns:

343 The number of pages of the parsed PDF file.

344

345 Raises:

346 PdfReadError: If restrictions prevent this action.

347

348 """

349 # Flattened pages will not work on an encrypted PDF;

350 # the PDF file's page count is used in this case. Otherwise,

351 # the original method (flattened page count) is used.

352 if self.is_encrypted:

353 return self.root_object["/Pages"]["/Count"] # type: ignore

354 if self.flattened_pages is None:

355 self._flatten(self._readonly)

356 assert self.flattened_pages is not None

357 return len(self.flattened_pages)

358

359 def get_page(self, page_number: int) -> PageObject:

360 """

361 Retrieve a page by number from this PDF file.

362 Most of the time ``.pages[page_number]`` is preferred.

363

364 Args:

365 page_number: The page number to retrieve

366 (pages begin at zero)

367

368 Returns:

369 A :class:`PageObject<pypdf._page.PageObject>` instance.

370

371 """

372 if self.flattened_pages is None:

373 self._flatten(self._readonly)

374 assert self.flattened_pages is not None, "hint for mypy"

375 return self.flattened_pages[page_number]

376

    def _get_page_in_node(
        self,
        page_number: int,
    ) -> Tuple[DictionaryObject, int]:
        """
        Retrieve the node and position within the /Kids containing the page.
        If page_number is greater than the number of pages, it returns the top node, -1.

        Args:
            page_number: Zero-based index of the page to locate.

        Returns:
            A tuple of the node whose ``/Kids`` contains the page and the
            index of the page within that ``/Kids`` array.

        Raises:
            PyPdfError: If the walk through a node's kids finishes without
                locating the page.
        """
        top = cast(DictionaryObject, self.root_object["/Pages"])

        def recursive_call(
            node: DictionaryObject, mi: int
        ) -> Tuple[Optional[PdfObject], int]:
            # ``mi`` is the index of the first page located at/under ``node``.
            # Return value: (found node, index) when the page was located,
            # or (None, next first-page index) when it is not under ``node``.
            ma = cast(int, node.get("/Count", 1))  # default 1 for /Page types
            if node["/Type"] == "/Page":
                if page_number == mi:
                    # -1 tells the caller that ``node`` itself is the page
                    return node, -1
                return None, mi + 1
            if (page_number - mi) >= ma:  # not in nodes below
                if node == top:
                    # page_number is beyond the last page of the document
                    return top, -1
                return None, mi + ma
            for idx, kid in enumerate(cast(ArrayObject, node["/Kids"])):
                kid = cast(DictionaryObject, kid.get_object())
                n, i = recursive_call(kid, mi)
                if n is not None:  # page has just been found ...
                    if i < 0:  # ... just below!
                        return node, idx
                    # ... at lower levels
                    return n, i
                # not under this kid: continue counting from where it ended
                mi = i
            raise PyPdfError("Unexpectedly cannot find the node.")

        node, idx = recursive_call(top, 0)
        assert isinstance(node, DictionaryObject), "mypy"
        return node, idx

413

414 @property

415 def named_destinations(self) -> Dict[str, Destination]:

416 """A read-only dictionary which maps names to destinations."""

417 return self._get_named_destinations()

418

    def get_named_dest_root(self) -> ArrayObject:
        """
        Return the ``/Names`` array of the catalog's named-destination tree.

        When the catalog already has ``/Names`` -> ``/Dests`` dictionaries,
        the existing array is returned (an empty one is created and attached
        if the ``/Dests`` dictionary has none). Missing dictionaries are
        only created when the object provides ``_add_object``; otherwise a
        detached, empty array is returned.

        Returns:
            The array holding the named destinations.
        """
        named_dest = ArrayObject()
        if CA.NAMES in self.root_object and isinstance(
            self.root_object[CA.NAMES], DictionaryObject
        ):
            names = cast(DictionaryObject, self.root_object[CA.NAMES])
            if CA.DESTS in names and isinstance(names[CA.DESTS], DictionaryObject):
                # §3.6.3 Name Dictionary (PDF spec 1.7)
                dests = cast(DictionaryObject, names[CA.DESTS])
                dests_ref = dests.indirect_reference
                if CA.NAMES in dests:
                    # §7.9.6, entries in a name tree node dictionary
                    named_dest = cast(ArrayObject, dests[CA.NAMES])
                else:
                    named_dest = ArrayObject()
                    dests[NameObject(CA.NAMES)] = named_dest
            elif hasattr(self, "_add_object"):
                # /Names exists but has no usable /Dests: create and link it
                dests = DictionaryObject()
                dests_ref = self._add_object(dests)
                names[NameObject(CA.DESTS)] = dests_ref
                dests[NameObject(CA.NAMES)] = named_dest

        elif hasattr(self, "_add_object"):
            # no usable /Names in the catalog: create the whole chain
            names = DictionaryObject()
            names_ref = self._add_object(names)
            self.root_object[NameObject(CA.NAMES)] = names_ref
            dests = DictionaryObject()
            dests_ref = self._add_object(dests)
            names[NameObject(CA.DESTS)] = dests_ref
            dests[NameObject(CA.NAMES)] = named_dest

        return named_dest

451

452 ## common

    def _get_named_destinations(
        self,
        tree: Union[TreeObject, None] = None,
        retval: Optional[Dict[str, Destination]] = None,
    ) -> Dict[str, Destination]:
        """
        Retrieve the named destinations present in the document.

        Args:
            tree: The current tree.
            retval: The previously retrieved destinations for nested calls.

        Returns:
            A dictionary which maps names to destinations.

        """
        if retval is None:
            # top-level call: start a fresh result and locate the tree
            retval = {}
            catalog = self.root_object

            # get the name tree
            if CA.DESTS in catalog:
                tree = cast(TreeObject, catalog[CA.DESTS])
            elif CA.NAMES in catalog:
                names = cast(DictionaryObject, catalog[CA.NAMES])
                if CA.DESTS in names:
                    tree = cast(TreeObject, names[CA.DESTS])

        if is_null_or_none(tree):
            return retval
        assert tree is not None, "mypy"

        if PA.KIDS in tree:
            # recurse down the tree
            for kid in cast(ArrayObject, tree[PA.KIDS]):
                self._get_named_destinations(kid.get_object(), retval)
        # §7.9.6, entries in a name tree node dictionary
        elif CA.NAMES in tree:  # /Kids and /Names are exclusives (§7.9.6)
            names = cast(DictionaryObject, tree[CA.NAMES])
            # the entry is walked by index as alternating key, value items
            i = 0
            while i < len(names):
                original_key = names[i].get_object()
                i += 1
                if not isinstance(original_key, (bytes, str)):
                    # not a usable key: skip this item only
                    continue
                key = str(original_key)
                try:
                    value = names[i].get_object()
                except IndexError:
                    # trailing key without a value
                    break
                i += 1
                if isinstance(value, DictionaryObject):
                    # dictionary values carry the destination under "/D"
                    if "/D" in value:
                        value = value["/D"]
                    else:
                        continue
                dest = self._build_destination(key, value)
                if dest is not None:
                    retval[key] = dest
        else:  # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF 1.1)
            for k__, v__ in tree.items():
                val = v__.get_object()
                if isinstance(val, DictionaryObject):
                    if "/D" in val:
                        val = val["/D"].get_object()
                    else:
                        continue
                dest = self._build_destination(k__, val)
                if dest is not None:
                    retval[k__] = dest
        return retval

524

525 # A select group of relevant field attributes. For the complete list,

526 # see §12.3.2 of the PDF 1.7 or PDF 2.0 specification.

527

    def get_fields(
        self,
        tree: Optional[TreeObject] = None,
        retval: Optional[Dict[Any, Any]] = None,
        fileobj: Optional[Any] = None,
        stack: Optional[List[PdfObject]] = None,
    ) -> Optional[Dict[str, Any]]:
        """
        Extract field data if this PDF contains interactive form fields.

        The *tree*, *retval*, *stack* parameters are for recursive use.

        Args:
            tree: Current object to parse.
            retval: In-progress list of fields.
            fileobj: A file object (usually a text file) to write
                a report to on all interactive form fields found.
            stack: List of already parsed objects.

        Returns:
            A dictionary where each key is a field name, and each
            value is a :class:`Field<pypdf.generic.Field>` object. By
            default, the mapping name is used for keys.
            ``None`` if form data could not be located.

        """
        field_attributes = FA.attributes_dict()
        field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict())
        if retval is None:
            # top-level call: any passed-in ``tree``/``stack`` is replaced
            retval = {}
            catalog = self.root_object
            stack = []
            # get the AcroForm tree
            if CD.ACRO_FORM in catalog:
                tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])
            else:
                return None
        if tree is None:
            return retval
        # recursive callers must pass ``stack`` together with ``retval``
        assert stack is not None
        if "/Fields" in tree:
            fields = cast(ArrayObject, tree["/Fields"])
            for f in fields:
                field = f.get_object()
                self._build_field(field, retval, fileobj, field_attributes, stack)
        elif any(attr in tree for attr in field_attributes):
            # Tree is a field
            self._build_field(tree, retval, fileobj, field_attributes, stack)
        return retval

577

578 def _get_qualified_field_name(self, parent: DictionaryObject) -> str:

579 if "/TM" in parent:

580 return cast(str, parent["/TM"])

581 if "/Parent" in parent:

582 return (

583 self._get_qualified_field_name(

584 cast(DictionaryObject, parent["/Parent"])

585 )

586 + "."

587 + cast(str, parent.get("/T", ""))

588 )

589 return cast(str, parent.get("/T", ""))

590

    def _build_field(
        self,
        field: Union[TreeObject, DictionaryObject],
        retval: Dict[Any, Any],
        fileobj: Any,
        field_attributes: Any,
        stack: List[PdfObject],
    ) -> None:
        """
        Register one field in ``retval`` and then process its kids.

        Objects with neither ``/T`` nor ``/TM`` are ignored. For choice
        fields and buttons an extra ``/_States_`` entry listing the possible
        states is added to the stored ``Field``.

        Args:
            field: The field dictionary to register.
            retval: Mapping of qualified field name to ``Field``; updated
                in place.
            fileobj: Optional report target; when truthy, the field is
                written to it followed by a newline.
            field_attributes: Attribute-to-label mapping forwarded to
                ``_write_field``.
            stack: Already parsed objects, forwarded to ``_check_kids``.
        """
        if all(attr not in field for attr in ("/T", "/TM")):
            return
        key = self._get_qualified_field_name(field)
        if fileobj:
            self._write_field(fileobj, field, field_attributes)
            fileobj.write("\n")
        retval[key] = Field(field)
        obj = retval[key].indirect_reference.get_object()  # to get the full object
        if obj.get(FA.FT, "") == "/Ch":
            # choice field: states are taken from the options entry
            retval[key][NameObject("/_States_")] = obj[NameObject(FA.Opt)]
        if obj.get(FA.FT, "") == "/Btn" and "/AP" in obj:
            # Checkbox
            retval[key][NameObject("/_States_")] = ArrayObject(
                list(obj["/AP"]["/N"].keys())
            )
            if "/Off" not in retval[key]["/_States_"]:
                retval[key][NameObject("/_States_")].append(NameObject("/Off"))
        elif obj.get(FA.FT, "") == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0:
            # radio group: collect the distinct appearance states of all kids
            states: List[str] = []
            retval[key][NameObject("/_States_")] = ArrayObject(states)
            for k in obj.get(FA.Kids, {}):
                k = k.get_object()
                for s in list(k["/AP"]["/N"].keys()):
                    if s not in states:
                        states.append(s)
                retval[key][NameObject("/_States_")] = ArrayObject(states)
            if (
                obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0
                and "/Off" in retval[key]["/_States_"]
            ):
                # NoToggleToOff is set: "/Off" is not offered as a state
                del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")]
        # at last for order
        self._check_kids(field, retval, fileobj, stack)

632

633 def _check_kids(

634 self,

635 tree: Union[TreeObject, DictionaryObject],

636 retval: Any,

637 fileobj: Any,

638 stack: List[PdfObject],

639 ) -> None:

640 if tree in stack:

641 logger_warning(

642 f"{self._get_qualified_field_name(tree)} already parsed", __name__

643 )

644 return

645 stack.append(tree)

646 if PA.KIDS in tree:

647 # recurse down the tree

648 for kid in tree[PA.KIDS]: # type: ignore

649 kid = kid.get_object()

650 self.get_fields(kid, retval, fileobj, stack)

651

652 def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None:

653 field_attributes_tuple = FA.attributes()

654 field_attributes_tuple = (

655 field_attributes_tuple + CheckboxRadioButtonAttributes.attributes()

656 )

657

658 for attr in field_attributes_tuple:

659 if attr in (

660 FA.Kids,

661 FA.AA,

662 ):

663 continue

664 attr_name = field_attributes[attr]

665 try:

666 if attr == FA.FT:

667 # Make the field type value clearer

668 types = {

669 "/Btn": "Button",

670 "/Tx": "Text",

671 "/Ch": "Choice",

672 "/Sig": "Signature",

673 }

674 if field[attr] in types:

675 fileobj.write(f"{attr_name}: {types[field[attr]]}\n")

676 elif attr == FA.Parent:

677 # Let's just write the name of the parent

678 try:

679 name = field[attr][FA.TM]

680 except KeyError:

681 name = field[attr][FA.T]

682 fileobj.write(f"{attr_name}: {name}\n")

683 else:

684 fileobj.write(f"{attr_name}: {field[attr]}\n")

685 except KeyError:

686 # Field attribute is N/A or unknown, so don't write anything

687 pass

688

689 def get_form_text_fields(self, full_qualified_name: bool = False) -> Dict[str, Any]:

690 """

691 Retrieve form fields from the document with textual data.

692

693 Args:

694 full_qualified_name: to get full name

695

696 Returns:

697 A dictionary. The key is the name of the form field,

698 the value is the content of the field.

699

700 If the document contains multiple form fields with the same name, the

701 second and following will get the suffix .2, .3, ...

702

703 """

704

705 def indexed_key(k: str, fields: Dict[Any, Any]) -> str:

706 if k not in fields:

707 return k

708 return (

709 k

710 + "."

711 + str(sum(1 for kk in fields if kk.startswith(k + ".")) + 2)

712 )

713

714 # Retrieve document form fields

715 formfields = self.get_fields()

716 if formfields is None:

717 return {}

718 ff = {}

719 for field, value in formfields.items():

720 if value.get("/FT") == "/Tx":

721 if full_qualified_name:

722 ff[field] = value.get("/V")

723 else:

724 ff[indexed_key(cast(str, value["/T"]), ff)] = value.get("/V")

725 return ff

726

    def get_pages_showing_field(
        self, field: Union[Field, PdfObject, IndirectObject]
    ) -> List[PageObject]:
        """
        Provides list of pages where the field is called.

        Args:
            field: Field Object, PdfObject or IndirectObject referencing a Field

        Returns:
            List of pages:
                - Empty list:
                    The field has no widgets attached
                    (either hidden field or ancestor field).
                - Single page list:
                    Page where the widget is present
                    (most common).
                - Multi-page list:
                    Field with multiple kids widgets
                    (example: radio buttons, field repeated on multiple pages).

        Raises:
            ValueError: If ``field`` cannot be resolved through its
                ``indirect_reference``, or if no ``/FT`` entry is found on
                the field or any of its parents.

        """

        def _get_inherited(obj: DictionaryObject, key: str) -> Any:
            # look up ``key`` on ``obj``, then up the ``/Parent`` chain
            if key in obj:
                return obj[key]
            if "/Parent" in obj:
                return _get_inherited(
                    cast(DictionaryObject, obj["/Parent"].get_object()), key
                )
            return None

        try:
            # to cope with all types
            field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
        except Exception as exc:
            raise ValueError("Field type is invalid") from exc
        if is_null_or_none(_get_inherited(field, "/FT")):
            raise ValueError("Field is not valid")
        ret = []
        if field.get("/Subtype", "") == "/Widget":
            # the field is its own widget
            if "/P" in field:
                ret = [field["/P"].get_object()]
            else:
                # no page entry: search the pages' annotations for the widget
                ret = [
                    p
                    for p in self.pages
                    if field.indirect_reference in p.get("/Annots", "")
                ]
        else:
            kids = field.get("/Kids", ())
            for k in kids:
                k = k.get_object()
                if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k):
                    # Kid that is just a widget, not a field:
                    if "/P" in k:
                        ret += [k["/P"].get_object()]
                    else:
                        ret += [
                            p
                            for p in self.pages
                            if k.indirect_reference in p.get("/Annots", "")
                        ]
        # anything that is not already a PageObject is mapped to one through
        # its page number
        return [
            x
            if isinstance(x, PageObject)
            else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)])  # type: ignore
            for x in ret
        ]

796

797 @property

798 def open_destination(

799 self,

800 ) -> Union[None, Destination, TextStringObject, ByteStringObject]:

801 """

802 Property to access the opening destination (``/OpenAction`` entry in

803 the PDF catalog). It returns ``None`` if the entry does not exist

804 or is not set.

805

806 Raises:

807 Exception: If a destination is invalid.

808

809 """

810 if "/OpenAction" not in self.root_object:

811 return None

812 oa: Any = self.root_object["/OpenAction"]

813 if isinstance(oa, bytes): # pragma: no cover

814 oa = oa.decode()

815 if isinstance(oa, str):

816 return create_string_object(oa)

817 if isinstance(oa, ArrayObject):

818 try:

819 page, typ, *array = oa

820 fit = Fit(typ, tuple(array))

821 return Destination("OpenAction", page, fit)

822 except Exception as exc:

823 raise Exception(f"Invalid Destination {oa}: {exc}")

824 else:

825 return None

826

    @open_destination.setter
    def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
        # This implementation offers no way to assign the opening
        # destination: every assignment raises NotImplementedError.
        raise NotImplementedError("No setter for open_destination")

830

831 @property

832 def outline(self) -> OutlineType:

833 """

834 Read-only property for the outline present in the document

835 (i.e., a collection of 'outline items' which are also known as

836 'bookmarks').

837 """

838 return self._get_outline()

839

    def _get_outline(
        self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None
    ) -> OutlineType:
        """
        Build the outline as a nested list of outline items.

        Args:
            node: First outline item of the level to process; determined
                from the catalog on the top-level call.
            outline: List to append to; ``None`` marks the top-level call.

        Returns:
            The list of outline items; the children of an item follow it
            as a nested list.
        """
        if outline is None:
            # top-level call: start from the catalog's outline dictionary
            outline = []
            catalog = self.root_object

            # get the outline dictionary and named destinations
            if CO.OUTLINES in catalog:
                lines = cast(DictionaryObject, catalog[CO.OUTLINES])

                if isinstance(lines, NullObject):
                    return outline

                # §12.3.3 Document outline, entries in the outline dictionary
                if not is_null_or_none(lines) and "/First" in lines:
                    node = cast(DictionaryObject, lines["/First"])
            # cached for _build_outline_item, which resolves named
            # destinations through self._named_destinations
            self._named_destinations = self._get_named_destinations()

        if node is None:
            return outline

        # see if there are any more outline items
        while True:
            outline_obj = self._build_outline_item(node)
            if outline_obj:
                outline.append(outline_obj)

            # check for sub-outline
            if "/First" in node:
                sub_outline: List[Any] = []
                self._get_outline(cast(DictionaryObject, node["/First"]), sub_outline)
                if sub_outline:
                    outline.append(sub_outline)

            if "/Next" not in node:
                break
            # move on to the next sibling on this level
            node = cast(DictionaryObject, node["/Next"])

        return outline

880

881 @property

882 def threads(self) -> Optional[ArrayObject]:

883 """

884 Read-only property for the list of threads.

885

886 See §12.4.3 from the PDF 1.7 or 2.0 specification.

887

888 It is an array of dictionaries with "/F" (the first bead in the thread)

889 and "/I" (a thread information dictionary containing information about

890 the thread, such as its title, author, and creation date) properties or

891 None if there are no articles.

892

893 Since PDF 2.0 it can also contain an indirect reference to a metadata

894 stream containing information about the thread, such as its title,

895 author, and creation date.

896 """

897 catalog = self.root_object

898 if CO.THREADS in catalog:

899 return cast("ArrayObject", catalog[CO.THREADS])

900 return None

901

    @abstractmethod
    def _get_page_number_by_indirect(
        self, indirect_reference: Union[None, int, NullObject, IndirectObject]
    ) -> Optional[int]:
        """
        Abstract method to be supplied by concrete subclasses.

        ``get_page_number`` and ``get_destination_page_number`` delegate to
        it with a page's or destination's reference and return its result
        as the page number (``None`` when the page is not found).
        """
        ...  # pragma: no cover

907

908 def get_page_number(self, page: PageObject) -> Optional[int]:

909 """

910 Retrieve page number of a given PageObject.

911

912 Args:

913 page: The page to get page number. Should be

914 an instance of :class:`PageObject<pypdf._page.PageObject>`

915

916 Returns:

917 The page number or None if page is not found

918

919 """

920 return self._get_page_number_by_indirect(page.indirect_reference)

921

922 def get_destination_page_number(self, destination: Destination) -> Optional[int]:

923 """

924 Retrieve page number of a given Destination object.

925

926 Args:

927 destination: The destination to get page number.

928

929 Returns:

930 The page number or None if page is not found

931

932 """

933 return self._get_page_number_by_indirect(destination.page)

934

935 def _build_destination(

936 self,

937 title: str,

938 array: Optional[

939 List[

940 Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject]

941 ]

942 ],

943 ) -> Destination:

944 page, typ = None, None

945 # handle outline items with missing or invalid destination

946 if (

947 isinstance(array, (NullObject, str))

948 or (isinstance(array, ArrayObject) and len(array) == 0)

949 or array is None

950 ):

951 page = NullObject()

952 return Destination(title, page, Fit.fit())

953 page, typ, *array = array # type: ignore

954 try:

955 return Destination(title, page, Fit(fit_type=typ, fit_args=array)) # type: ignore

956 except PdfReadError:

957 logger_warning(f"Unknown destination: {title} {array}", __name__)

958 if self.strict:

959 raise

960 # create a link to first Page

961 tmp = self.pages[0].indirect_reference

962 indirect_reference = NullObject() if tmp is None else tmp

963 return Destination(title, indirect_reference, Fit.fit())

964

    def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]:
        """
        Convert an outline item dictionary into a ``Destination``.

        The destination is taken from a ``/GoTo`` action under ``/A`` or
        from ``/Dest``; a string destination is resolved through
        ``self._named_destinations``. Color (``/C``), style (``/F``), child
        count (``/Count``) and an ``/%is_open%`` flag are copied onto the
        result, which also gets ``node`` attached.

        Args:
            node: The outline item dictionary.

        Returns:
            The destination built for the outline item.

        Raises:
            PdfReadError: In strict mode, if ``/Title`` is missing, a
                ``/GoTo`` action has no ``/D`` entry, or the destination
                has an unexpected type.
        """
        dest, title, outline_item = None, None, None

        # title required for valid outline
        # §12.3.3, entries in an outline item dictionary
        try:
            title = cast("str", node["/Title"])
        except KeyError:
            if self.strict:
                raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}")
            # non-strict mode: fall back to an empty title
            title = ""

        if "/A" in node:
            # Action, PDF 1.7 and PDF 2.0 §12.6 (only type GoTo supported)
            action = cast(DictionaryObject, node["/A"])
            action_type = cast(NameObject, action[GoToActionArguments.S])
            if action_type == "/GoTo":
                if GoToActionArguments.D in action:
                    dest = action[GoToActionArguments.D]
                elif self.strict:
                    raise PdfReadError(f"Outline Action Missing /D attribute: {node!r}")
        elif "/Dest" in node:
            # Destination, PDF 1.7 and PDF 2.0 §12.3.2
            dest = node["/Dest"]
            # if array was referenced in another object, will be a dict w/ key "/D"
            if isinstance(dest, DictionaryObject) and "/D" in dest:
                dest = dest["/D"]

        if isinstance(dest, ArrayObject):
            outline_item = self._build_destination(title, dest)
        elif isinstance(dest, str):
            # named destination, addresses NameObject Issue #193
            # TODO: Keep named destination instead of replacing it?
            try:
                outline_item = self._build_destination(
                    title, self._named_destinations[dest].dest_array
                )
            except KeyError:
                # named destination not found in Name Dict
                outline_item = self._build_destination(title, None)
        elif dest is None:
            # outline item not required to have destination or action
            # PDFv1.7 Table 153
            outline_item = self._build_destination(title, dest)
        else:
            if self.strict:
                raise PdfReadError(f"Unexpected destination {dest!r}")
            logger_warning(
                f"Removed unexpected destination {dest!r} from destination",
                __name__,
            )
            outline_item = self._build_destination(title, None)

        # if outline item created, add color, format, and child count if present
        if outline_item:
            if "/C" in node:
                # Color of outline item font in (R, G, B) with values ranging 0.0-1.0
                outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"])  # type: ignore
            if "/F" in node:
                # specifies style characteristics bold and/or italic
                # with 1=italic, 2=bold, 3=both
                outline_item[NameObject("/F")] = node["/F"]
            if "/Count" in node:
                # absolute value = num. visible children
                # with positive = open/unfolded, negative = closed/folded
                outline_item[NameObject("/Count")] = node["/Count"]
            # if count is 0 we will consider it as open (to have available is_open)
            outline_item[NameObject("/%is_open%")] = BooleanObject(
                node.get("/Count", 0) >= 0
            )
        # keep a handle on the source dictionary
        outline_item.node = node
        try:
            outline_item.indirect_reference = node.indirect_reference
        except AttributeError:
            # node carries no indirect reference: leave it unset
            pass
        return outline_item

1041

@property
def pages(self) -> List[PageObject]:
    """
    List-like view over the document's :class:`PageObject<pypdf._page.PageObject>` items.

    A single page or a range of pages can be retrieved through this property.

    Note:
        For PdfWriter only: a page or a range of pages can also be removed
        from the list with the ``del`` operator. Only the page entry itself
        is removed, because the objects beneath it may be used elsewhere.
        To drop them completely - if nothing else uses them - write to a
        buffer/temporary file and load the result into a new PdfWriter.

    """
    virtual_pages = _VirtualList(self.get_num_pages, self.get_page)
    return virtual_pages  # type: ignore

1058

@property
def page_labels(self) -> List[str]:
    """
    Labels of all pages in this document, in page order.

    This property is read-only.
    """
    page_count = len(self.pages)
    labels = []
    for page_index in range(page_count):
        labels.append(page_index2page_label(self, page_index))
    return labels

1068

@property
def page_layout(self) -> Optional[str]:
    """
    Return the page layout stored in the document catalog, or None if unset.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    try:
        layout = self.root_object[CD.PAGE_LAYOUT]
    except KeyError:
        # the catalog has no page layout entry
        return None
    return cast(NameObject, layout)

1096

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Return the ``/PageMode`` entry of the document catalog, or None if unset.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    try:
        mode = self.root_object["/PageMode"]
    except KeyError:
        # the catalog has no /PageMode entry
        return None
    return mode  # type: ignore

1122

def _flatten(
    self,
    list_only: bool = False,
    pages: Union[None, DictionaryObject, PageObject] = None,
    inherit: Optional[Dict[str, Any]] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Process the document pages to ease searching.

    Attributes of a page may inherit from ancestor nodes
    in the page tree. Flattening means moving
    any inheritance data into descendant nodes,
    effectively removing the inheritance dependency.

    Note: It is distinct from another use of "flattening" applied to PDFs.
    Flattening a PDF also means combining all the contents into one single layer
    and making the file less editable.

    Args:
        list_only: Will only list the pages within _flatten_pages.
        pages: Page tree node to process. When None, the ``/Pages`` entry
            of the document catalog is used and ``flattened_pages`` is
            reset to an empty list.
        inherit: Inheritable attributes collected from the ancestors of
            ``pages``. The mapping passed in is never modified.
        indirect_reference: Used recursively to flatten the /Pages object.

    Raises:
        PdfReadError: If the catalog's ``/Pages`` entry is not a dictionary
            or the recursion limit is hit while descending the tree.

    """
    inheritable_page_attributes = (
        NameObject(PG.RESOURCES),
        NameObject(PG.MEDIABOX),
        NameObject(PG.CROPBOX),
        NameObject(PG.ROTATE),
    )
    if inherit is None:
        inherit = {}
    if pages is None:
        # Fix issue 327: set flattened_pages attribute only for
        # decrypted file
        catalog = self.root_object
        pages = catalog.get("/Pages").get_object()  # type: ignore
        if not isinstance(pages, DictionaryObject):
            raise PdfReadError("Invalid object in /Pages")
        self.flattened_pages = []

    if PA.TYPE in pages:
        t = cast(str, pages[PA.TYPE])
    # if the page tree node has no /Type, consider as a page if /Kids is also missing
    elif PA.KIDS not in pages:
        t = "/Page"
    else:
        t = "/Pages"

    if t == "/Pages":
        # Work on a private copy: attributes set by this node must reach
        # only its own descendants. Mutating the caller's dict would leak
        # them into later siblings of this node.
        inherit = dict(inherit)
        for attr in inheritable_page_attributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        for page in cast(ArrayObject, pages[PA.KIDS]):
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirect_reference"] = page
            obj = page.get_object()
            if obj:
                # damaged file may have invalid child in /Pages
                try:
                    self._flatten(list_only, obj, inherit, **addt)
                except RecursionError:
                    raise PdfReadError(
                        "Maximum recursion depth reached during page flattening."
                    )
    elif t == "/Page":
        for attr_in, value in inherit.items():
            # if the page has its own value, it does not inherit the
            # parent's value
            if attr_in not in pages:
                pages[attr_in] = value
        page_obj = PageObject(self, indirect_reference)
        if not list_only:
            page_obj.update(pages)

        # TODO: Could flattened_pages be None at this point?
        self.flattened_pages.append(page_obj)  # type: ignore

1203

def remove_page(
    self,
    page: Union[int, PageObject, IndirectObject],
    clean: bool = False,
) -> None:
    """
    Delete a single page from the pages list.

    A warning is logged and nothing is removed when the page cannot be
    resolved or its index is out of range.

    Args:
        page:
            * :class:`int`: Page number to be removed.
            * :class:`~pypdf._page.PageObject`: page to be removed. If the page appears many times
              only the first one will be removed.
            * :class:`~pypdf.generic.IndirectObject`: Reference to page to be removed.

        clean: replace PageObject with NullObject to prevent annotations
            or destinations to reference a detached page.

    """
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    assert self.flattened_pages is not None

    # resolve an indirect reference to the page it points at
    if isinstance(page, IndirectObject):
        resolved = page.get_object()
        if not isinstance(resolved, PageObject):
            logger_warning("IndirectObject is not referencing a page", __name__)
            return
        page = resolved

    # translate a page object into its position in the flattened list
    if not isinstance(page, int):
        try:
            page = self.flattened_pages.index(page)
        except ValueError:
            logger_warning("Cannot find page in pages", __name__)
            return
    if page < 0 or page >= len(self.flattened_pages):
        logger_warning("Page number is out of range", __name__)
        return

    # remember the reference before the entry disappears from the list
    reference = self.pages[page].indirect_reference
    del self.pages[page]
    if clean and reference is not None:
        self._replace_object(reference, NullObject())

1247

def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
    """
    Development helper that resolves an indirect object by its numbers.

    Same as ``generic.IndirectObject(num, gen, self).get_object()``.

    Args:
        num: The object number of the indirect object.
        gen: The generation number of the indirect object.

    Returns:
        A PdfObject

    """
    reference = IndirectObject(num, gen, self)
    return reference.get_object()

1263

def decode_permissions(
    self, permissions_code: int
) -> Dict[str, bool]:  # pragma: no cover
    """
    Decode an integer permissions code into a name -> allowed mapping.

    Deprecated: ``deprecate_with_replacement`` is called first, naming
    ``user_access_permissions`` as the replacement.
    """
    deprecate_with_replacement(
        old_name="decode_permissions",
        new_name="user_access_permissions",
        removed_in="5.0.0",
    )

    flag_by_name = {
        "print": UserAccessPermissions.PRINT,
        "modify": UserAccessPermissions.MODIFY,
        "copy": UserAccessPermissions.EXTRACT,
        "annotations": UserAccessPermissions.ADD_OR_MODIFY,
        "forms": UserAccessPermissions.FILL_FORM_FIELDS,
        # Do not fix typo, as part of official, but deprecated API.
        "accessability": UserAccessPermissions.EXTRACT_TEXT_AND_GRAPHICS,
        "assemble": UserAccessPermissions.ASSEMBLE_DOC,
        "print_high_quality": UserAccessPermissions.PRINT_TO_REPRESENTATION,
    }

    decoded = {}
    for label, flag in flag_by_name.items():
        # a permission is granted when its bit is set in the code
        decoded[label] = (permissions_code & flag) != 0
    return decoded

1290

@property
def user_access_permissions(self) -> Optional[UserAccessPermissions]:
    """User access permissions of an encrypted document; None when the document is not encrypted."""
    return (
        None
        if self._encryption is None
        else UserAccessPermissions(self._encryption.P)
    )

1297

@property
@abstractmethod
def is_encrypted(self) -> bool:
    """
    Whether this PDF file is encrypted (read-only, boolean).

    Once true, the value stays true even after the
    :meth:`decrypt()<pypdf.PdfReader.decrypt>` method has been called.
    """
    ...  # pragma: no cover

1308

@property
def xfa(self) -> Optional[Dict[str, Any]]:
    """
    Collect the XFA entries of the document's ``/AcroForm``.

    The ``/XFA`` array is read as consecutive (tag, stream) pairs. For
    every pair whose second item is an indirect reference resolving to a
    truthy object, that object's ``_data`` is zlib-decompressed and
    stored under the tag. A trailing tag without a partner is ignored
    instead of raising ``StopIteration``.

    Returns:
        None if the catalog has no ``/AcroForm`` entry or it is falsy;
        otherwise a dictionary mapping tags to decompressed bytes (empty
        when there is no ``/XFA`` entry).

    """
    retval: Dict[str, Any] = {}
    catalog = self.root_object

    if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
        return None

    tree = cast("TreeObject", catalog["/AcroForm"])

    if "/XFA" in tree:
        fields = cast("ArrayObject", tree["/XFA"])
        entries = iter(fields)
        # zipping one iterator with itself yields consecutive pairs and
        # silently stops on an unpaired last element
        for tag, value in zip(entries, entries):
            if isinstance(value, IndirectObject):
                field = cast("Optional[EncodedStreamObject]", value.get_object())
                if field:
                    retval[tag] = zlib.decompress(field._data)
    return retval

1332

@property
def attachments(self) -> Mapping[str, List[bytes]]:
    """Lazy mapping from each attachment filename to its content."""
    lazy_entries = {}
    for attachment_name in self._list_attachments():
        # store (callable, argument); LazyDict invokes it on item access
        lazy_entries[attachment_name] = (self._get_attachment_list, attachment_name)
    return LazyDict(lazy_entries)

1342

@property
def attachment_list(self) -> Generator[EmbeddedFile, None, None]:
    """Generator over the attachment objects loaded from the document catalog."""
    catalog = self.root_object
    yield from EmbeddedFile._load(catalog)

1347

1348 def _list_attachments(self) -> List[str]:

1349 """

1350 Retrieves the list of filenames of file attachments.

1351

1352 Returns:

1353 list of filenames

1354

1355 """

1356 names = []

1357 for entry in self.attachment_list:

1358 names.append(entry.name)

1359 if (name := entry.alternative_name) != entry.name and name:

1360 names.append(name)

1361 return names

1362

1363 def _get_attachment_list(self, name: str) -> List[bytes]:

1364 out = self._get_attachments(name)[name]

1365 if isinstance(out, list):

1366 return out

1367 return [out]

1368

1369 def _get_attachments(

1370 self, filename: Optional[str] = None

1371 ) -> Dict[str, Union[bytes, List[bytes]]]:

1372 """

1373 Retrieves all or selected file attachments of the PDF as a dictionary of file names

1374 and the file data as a bytestring.

1375

1376 Args:

1377 filename: If filename is None, then a dictionary of all attachments

1378 will be returned, where the key is the filename and the value

1379 is the content. Otherwise, a dictionary with just a single key

1380 - the filename - and its content will be returned.

1381

1382 Returns:

1383 dictionary of filename -> Union[bytestring or List[ByteString]]

1384 If the filename exists multiple times a list of the different versions will be provided.

1385

1386 """

1387 attachments: Dict[str, Union[bytes, List[bytes]]] = {}

1388 for entry in self.attachment_list:

1389 names = set()

1390 alternative_name = entry.alternative_name

1391 if filename is not None:

1392 if filename in {entry.name, alternative_name}:

1393 name = entry.name if filename == entry.name else alternative_name

1394 names.add(name)

1395 else:

1396 continue

1397 else:

1398 names = {entry.name, alternative_name}

1399

1400 for name in names:

1401 if name is None:

1402 continue

1403 if name in attachments:

1404 if not isinstance(attachments[name], list):

1405 attachments[name] = [attachments[name]] # type:ignore

1406 attachments[name].append(entry.content) # type:ignore

1407 else:

1408 attachments[name] = entry.content

1409 return attachments

1410

1411 @abstractmethod

1412 def _repr_mimebundle_(

1413 self,

1414 include: Union[None, Iterable[str]] = None,

1415 exclude: Union[None, Iterable[str]] = None,

1416 ) -> Dict[str, Any]:

1417 """

1418 Integration into Jupyter Notebooks.

1419

1420 This method returns a dictionary that maps a mime-type to its

1421 representation.

1422

1423 .. seealso::

1424

1425 https://ipython.readthedocs.io/en/stable/config/integrating.html

1426 """

1427 ... # pragma: no cover

1428

1429

class LazyDict(Mapping[Any, Any]):
    """
    Read-only mapping whose values are computed on access.

    Every stored value is a ``(callable, argument)`` pair; looking up a
    key calls the callable with the argument and returns the result.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        # accepts exactly what dict() accepts
        self._raw_dict = dict(*args, **kwargs)

    def __getitem__(self, key: str) -> Any:
        producer, argument = self._raw_dict[key]
        return producer(argument)

    def __iter__(self) -> Iterator[Any]:
        # iterating yields the keys only, so no value is computed
        return iter(self._raw_dict)

    def __len__(self) -> int:
        return len(self._raw_dict)

    def __str__(self) -> str:
        key_names = list(self.keys())
        return f"LazyDict(keys={key_names})"

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_doc_common.py: 22%

643 statements