Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_doc

7# Redistribution and use in source and binary forms, with or without

8# modification, are permitted provided that the following conditions are

9# met:

10#

11# * Redistributions of source code must retain the above copyright notice,

12# this list of conditions and the following disclaimer.

13# * Redistributions in binary form must reproduce the above copyright notice,

14# this list of conditions and the following disclaimer in the documentation

15# and/or other materials provided with the distribution.

16# * The name of the author may not be used to endorse or promote products

17# derived from this software without specific prior written permission.

18#

19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

20# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

21# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

22# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

23# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

24# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

25# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

26# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

27# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

28# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

29# POSSIBILITY OF SUCH DAMAGE.

31import struct

32import zlib

33from abc import abstractmethod

34from collections.abc import Generator, Iterable, Iterator, Mapping

35from datetime import datetime

36from typing import (

37 Any,

38 Optional,

39 Union,

40 cast,

41)

43from ._encryption import Encryption

44from ._page import PageObject, _VirtualList

45from ._page_labels import index2label as page_index2page_label

46from ._utils import (

47 deprecation_with_replacement,

48 logger_warning,

49 parse_iso8824_date,

50)

51from .constants import CatalogAttributes as CA

52from .constants import CatalogDictionary as CD

53from .constants import (

54 CheckboxRadioButtonAttributes,

55 GoToActionArguments,

56 PagesAttributes,

57 UserAccessPermissions,

58)

59from .constants import Core as CO

60from .constants import DocumentInformationAttributes as DI

61from .constants import FieldDictionaryAttributes as FA

62from .constants import PageAttributes as PG

63from .errors import PdfReadError, PyPdfError

64from .generic import (

65 ArrayObject,

66 BooleanObject,

67 ByteStringObject,

68 Destination,

69 DictionaryObject,

70 EncodedStreamObject,

71 Field,

72 Fit,

73 FloatObject,

74 IndirectObject,

75 NameObject,

76 NullObject,

77 NumberObject,

78 PdfObject,

79 TextStringObject,

80 TreeObject,

81 ViewerPreferences,

82 create_string_object,

83 is_null_or_none,

84)

85from .generic._files import EmbeddedFile

86from .types import OutlineType, PagemodeType

87from .xmp import XmpInformation

def convert_to_int(d: bytes, size: int) -> Union[int, tuple[Any, ...]]:
    """
    Decode a big-endian byte string into a signed integer.

    The data is left-padded with zero bytes and only the last eight bytes
    are decoded, as a signed 64-bit big-endian value.

    Args:
        d: The raw bytes to decode.
        size: The declared width in bytes; it is only validated, the
            decoding itself always works on the padded data.

    Returns:
        The decoded integer.

    Raises:
        PdfReadError: If ``size`` is larger than 8.

    """
    if size > 8:
        raise PdfReadError("Invalid size in convert_to_int")
    # Zero-pad on the left, then keep exactly the eight trailing bytes.
    padded = (bytes(8) + d)[-8:]
    (value,) = struct.unpack(">q", padded)
    return value

class DocumentInformation(DictionaryObject):
    """
    Dictionary holding the basic document metadata provided in a PDF file.

    Instances are exposed through
    :py:class:`PdfReader.metadata<pypdf.PdfReader.metadata>`.

    Every textual entry comes as a pair of properties, for example ``author``
    and ``author_raw``. The plain property only yields text (or ``None``),
    which makes it the right choice for display purposes. The ``*_raw``
    property hands back the stored entry unchanged, which may be a
    ``ByteStringObject`` when pypdf could not decode the text encoding, so
    callers of the raw variants need to be more careful.
    """

    def __init__(self) -> None:
        super().__init__()

    def _get_text(self, key: str) -> Optional[str]:
        """
        Look up ``key`` and return it as text.

        A ``TextStringObject`` is returned as is, a ``ByteStringObject`` is
        converted with ``str()``; any other (or missing) value gives ``None``.
        """
        value = self.get(key, None)
        if isinstance(value, ByteStringObject):
            return str(value)
        return value if isinstance(value, TextStringObject) else None

    @property
    def title(self) -> Optional[str]:
        """
        Read-only access to the document's title.

        Returns ``None`` if the title entry is missing or falsy. Otherwise
        the decoded text is returned, falling back to the resolved object
        behind the entry when no text could be obtained directly.
        """
        if not self.get(DI.TITLE):
            return None
        return self._get_text(DI.TITLE) or self.get(DI.TITLE).get_object()  # type: ignore

    @property
    def title_raw(self) -> Optional[str]:
        """Title entry exactly as stored; can be a ``ByteStringObject``."""
        return self.get(DI.TITLE)

    @property
    def author(self) -> Optional[str]:
        """
        Read-only access to the document's author.

        Returns the text, or ``None`` if the author is not specified.
        """
        return self._get_text(DI.AUTHOR)

    @property
    def author_raw(self) -> Optional[str]:
        """Author entry exactly as stored; can be a ``ByteStringObject``."""
        return self.get(DI.AUTHOR)

    @property
    def subject(self) -> Optional[str]:
        """
        Read-only access to the document's subject.

        Returns the text, or ``None`` if the subject is not specified.
        """
        return self._get_text(DI.SUBJECT)

    @property
    def subject_raw(self) -> Optional[str]:
        """Subject entry exactly as stored; can be a ``ByteStringObject``."""
        return self.get(DI.SUBJECT)

    @property
    def creator(self) -> Optional[str]:
        """
        Read-only access to the document's creator.

        For documents converted to PDF from another format, this names the
        application (e.g. OpenOffice) that created the original document.
        Returns the text, or ``None`` if the creator is not specified.
        """
        return self._get_text(DI.CREATOR)

    @property
    def creator_raw(self) -> Optional[str]:
        """Creator entry exactly as stored; can be a ``ByteStringObject``."""
        return self.get(DI.CREATOR)

    @property
    def producer(self) -> Optional[str]:
        """
        Read-only access to the document's producer.

        For documents converted to PDF from another format, this names the
        application (for example, macOS Quartz) that performed the
        conversion. Returns the text, or ``None`` if the producer is not
        specified.
        """
        return self._get_text(DI.PRODUCER)

    @property
    def producer_raw(self) -> Optional[str]:
        """Producer entry exactly as stored; can be a ``ByteStringObject``."""
        return self.get(DI.PRODUCER)

    @property
    def creation_date(self) -> Optional[datetime]:
        """Read-only access to the document's creation date, parsed."""
        raw_text = self._get_text(DI.CREATION_DATE)
        return parse_iso8824_date(raw_text)

    @property
    def creation_date_raw(self) -> Optional[str]:
        """
        Creation date entry exactly as stored; can be a ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.CREATION_DATE)

    @property
    def modification_date(self) -> Optional[datetime]:
        """
        Read-only access to the document's modification date, parsed.

        This is the date and time the document was most recently modified.
        """
        raw_text = self._get_text(DI.MOD_DATE)
        return parse_iso8824_date(raw_text)

    @property
    def modification_date_raw(self) -> Optional[str]:
        """
        Modification date entry exactly as stored; can be a
        ``ByteStringObject``.

        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
        is the offset from UTC.
        """
        return self.get(DI.MOD_DATE)

    @property
    def keywords(self) -> Optional[str]:
        """
        Read-only access to the document's keywords.

        Returns the text, or ``None`` if keywords are not specified.
        """
        return self._get_text(DI.KEYWORDS)

    @property
    def keywords_raw(self) -> Optional[str]:
        """Keywords entry exactly as stored; can be a ``ByteStringObject``."""
        return self.get(DI.KEYWORDS)

256

257

class PdfDocCommon:
    """
    Common functions from PdfWriter and PdfReader objects.

    This root class is strongly abstracted.
    """

    # When true, some recoverable problems are re-raised instead of only being
    # logged (see the ``self.strict`` checks in the destination/outline builders).
    strict: bool = False  # default

    # Page list used by ``get_page``/``get_num_pages``; while it is ``None``
    # those methods call ``_flatten`` first.
    flattened_pages: Optional[list[PageObject]] = None

    # Encryption handler; ``None`` by default.
    # NOTE(review): presumably set by subclasses for encrypted files - confirm.
    _encryption: Optional[Encryption] = None

    # Forwarded as the argument of ``_flatten`` when the page list is built.
    _readonly: bool = False

272

273 @property

274 @abstractmethod

def root_object(self) -> DictionaryObject:
    """
    Return the root dictionary of the document (to be provided by subclasses).

    Within this class it is used as the document catalog, e.g. to look up
    ``/Pages``, the name dictionaries and ``/OpenAction``.
    """
    ...  # pragma: no cover

277

278 @property

279 @abstractmethod

def pdf_header(self) -> str:
    """
    Return the PDF header as a string (to be provided by subclasses).

    NOTE(review): presumably the ``%PDF-x.y`` marker of the file - confirm
    against the implementing classes.
    """
    ...  # pragma: no cover

282

283 @abstractmethod

def get_object(
    self, indirect_reference: Union[int, IndirectObject]
) -> Optional[PdfObject]:
    """
    Look up an object of the document (to be provided by subclasses).

    Args:
        indirect_reference: An integer or an ``IndirectObject`` identifying
            the object. NOTE(review): the integer is presumably the object
            number - confirm against the implementing classes.

    Returns:
        The matching ``PdfObject``, or ``None``.

    """
    ...  # pragma: no cover

288

289 @abstractmethod

def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject:
    """
    Replace the object behind ``indirect`` with ``obj`` (to be provided by subclasses).

    Used by ``viewer_preferences`` to store the converted preferences object
    under its existing indirect reference.
    """
    ...  # pragma: no cover

292

293 @property

294 @abstractmethod

def _info(self) -> Optional[DictionaryObject]:
    """
    Return the dictionary backing ``metadata`` (to be provided by subclasses).

    ``None`` signals that no such dictionary is available; ``metadata`` then
    returns ``None`` as well.
    """
    ...  # pragma: no cover

297

298 @property

def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve the PDF file's document information dictionary, if it exists.

    Note that some PDF files use metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function.

    Returns:
        A new ``DocumentInformation`` filled with the entries of ``_info``,
        or ``None`` if ``_info`` is ``None``.

    """
    # Read the (subclass-provided) property only once and bail out before
    # allocating anything when there is no information dictionary.
    info = self._info
    if info is None:
        return None
    retval = DocumentInformation()
    retval.update(info)
    return retval

312

313 @property

def xmp_metadata(self) -> Optional[XmpInformation]:
    """
    XMP metadata of the document.

    The body here is only a placeholder (``...``).
    NOTE(review): presumably overridden by the reader/writer classes - confirm.
    """
    ...  # pragma: no cover

316

317 @property

def viewer_preferences(self) -> Optional[ViewerPreferences]:
    """
    Return the existing ViewerPreferences as an overloaded dictionary.

    If the stored object is not yet a ``ViewerPreferences`` instance, it is
    wrapped in one and the wrapper is written back: through
    ``_replace_object`` when it carries an indirect reference, otherwise
    directly into the root object.

    Returns:
        The ``ViewerPreferences`` object, or ``None`` if the root object has
        no such entry.

    """
    raw = self.root_object.get(CD.VIEWER_PREFERENCES, None)
    if raw is None:
        return None
    prefs = raw.get_object()
    if isinstance(prefs, ViewerPreferences):
        return prefs

    prefs = ViewerPreferences(prefs)
    reference = getattr(prefs, "indirect_reference", None)
    if reference is None:
        self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = prefs
    else:
        self._replace_object(reference, prefs)
    return prefs

331

def get_num_pages(self) -> int:
    """
    Calculate the number of pages in this PDF file.

    Returns:
        The number of pages of the parsed PDF file.

    Raises:
        PdfReadError: If restrictions prevent this action.

    """
    # Flattening the pages does not work on an encrypted PDF, so the count
    # stored in the page tree root is reported in that case.
    if self.is_encrypted:
        page_tree_root = self.root_object["/Pages"]
        return page_tree_root["/Count"]  # type: ignore
    # Otherwise count the flattened pages, building the list on first use.
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    pages = self.flattened_pages
    assert pages is not None
    return len(pages)

352

def get_page(self, page_number: int) -> PageObject:
    """
    Retrieve a page by number from this PDF file.
    Most of the time ``.pages[page_number]`` is preferred.

    Args:
        page_number: The page number to retrieve
            (pages begin at zero)

    Returns:
        A :class:`PageObject<pypdf._page.PageObject>` instance.

    """
    # Build the flattened page list on first use.
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    pages = self.flattened_pages
    assert pages is not None, "hint for mypy"
    return pages[page_number]

370

def _get_page_in_node(
    self,
    page_number: int,
) -> tuple[DictionaryObject, int]:
    """
    Retrieve the node and position within the /Kids containing the page.
    If page_number is greater than the number of pages, it returns the top node, -1.

    Args:
        page_number: Zero-based index of the page to locate.

    Returns:
        A tuple of the node whose ``/Kids`` holds the page and the index of
        the page inside that ``/Kids`` array.

    Raises:
        PyPdfError: If the page tree is walked without finding the node.

    """
    top = cast(DictionaryObject, self.root_object["/Pages"])

    def recursive_call(
        node: DictionaryObject, mi: int
    ) -> tuple[Optional[PdfObject], int]:
        # ``mi`` is the index of the first page covered by ``node``.
        # The result is either ``(found_node, index)`` or
        # ``(None, index_of_first_page_after_node)``.
        ma = cast(int, node.get("/Count", 1))  # default 1 for /Page types
        if node["/Type"] == "/Page":
            if page_number == mi:
                # -1 tells the caller that the page is one of its direct kids
                return node, -1
            return None, mi + 1
        if (page_number - mi) >= ma:  # not in nodes below
            if node == top:
                return top, -1
            return None, mi + ma
        for idx, kid in enumerate(cast(ArrayObject, node["/Kids"])):
            kid = cast(DictionaryObject, kid.get_object())
            n, i = recursive_call(kid, mi)
            if n is not None:  # page has just been found ...
                if i < 0:  # ... just below!
                    return node, idx
                # ... at lower levels
                return n, i
            # not found below this kid: continue with the updated page offset
            mi = i
        raise PyPdfError("Unexpectedly cannot find the node.")

    node, idx = recursive_call(top, 0)
    assert isinstance(node, DictionaryObject), "mypy"
    return node, idx

407

408 @property

def named_destinations(self) -> dict[str, Destination]:
    """
    A read-only dictionary which maps names to destinations.

    The mapping is computed by ``_get_named_destinations`` on every access.
    """
    return self._get_named_destinations()

412

def get_named_dest_root(self) -> ArrayObject:
    """
    Return the ``/Names`` array of the named-destinations tree.

    The array is searched under ``/Names`` -> ``/Dests`` of the root object.
    Missing levels are created and registered when the object provides
    ``_add_object``; otherwise a detached empty array is returned.

    Returns:
        The array holding the named destinations.

    """
    named_dest = ArrayObject()
    can_add = hasattr(self, "_add_object")
    root = self.root_object

    def attach_new_dests(names: DictionaryObject, target: ArrayObject) -> None:
        # Create a /Dests dictionary holding ``target`` and link it into ``names``.
        dests = DictionaryObject()
        names[NameObject(CA.DESTS)] = self._add_object(dests)
        dests[NameObject(CA.NAMES)] = target

    if CA.NAMES in root and isinstance(root[CA.NAMES], DictionaryObject):
        names = cast(DictionaryObject, root[CA.NAMES])
        if CA.DESTS in names and isinstance(names[CA.DESTS], DictionaryObject):
            # §3.6.3 Name Dictionary (PDF spec 1.7)
            dests = cast(DictionaryObject, names[CA.DESTS])
            if CA.NAMES in dests:
                # §7.9.6, entries in a name tree node dictionary
                named_dest = cast(ArrayObject, dests[CA.NAMES])
            else:
                # reuse the (still empty) array allocated above
                dests[NameObject(CA.NAMES)] = named_dest
        elif can_add:
            attach_new_dests(names, named_dest)
    elif can_add:
        names = DictionaryObject()
        root[NameObject(CA.NAMES)] = self._add_object(names)
        attach_new_dests(names, named_dest)

    return named_dest

445

446 ## common

def _get_named_destinations(
    self,
    tree: Union[TreeObject, None] = None,
    retval: Optional[dict[str, Destination]] = None,
) -> dict[str, Destination]:
    """
    Retrieve the named destinations present in the document.

    Args:
        tree: The current tree.
        retval: The previously retrieved destinations for nested calls.

    Returns:
        A dictionary which maps names to destinations.

    """
    if retval is None:
        # Top-level call: start a fresh result and locate the tree via the catalog.
        retval = {}
        catalog = self.root_object

        # get the name tree
        if CA.DESTS in catalog:
            tree = cast(TreeObject, catalog[CA.DESTS])
        elif CA.NAMES in catalog:
            names = cast(DictionaryObject, catalog[CA.NAMES])
            if CA.DESTS in names:
                tree = cast(TreeObject, names[CA.DESTS])

    if is_null_or_none(tree):
        return retval
    assert tree is not None, "mypy"

    if PagesAttributes.KIDS in tree:
        # recurse down the tree
        for kid in cast(ArrayObject, tree[PagesAttributes.KIDS]):
            self._get_named_destinations(kid.get_object(), retval)
    # §7.9.6, entries in a name tree node dictionary
    elif CA.NAMES in tree:  # /Kids and /Names are exclusives (§7.9.6)
        # ``names`` is accessed by position below: entries are read as
        # consecutive (key, value) pairs.
        names = cast(DictionaryObject, tree[CA.NAMES])
        i = 0
        while i < len(names):
            original_key = names[i].get_object()
            i += 1
            if not isinstance(original_key, (bytes, str)):
                # not a usable key: only this single entry is skipped
                continue
            key = str(original_key)
            try:
                value = names[i].get_object()
            except IndexError:
                # trailing key without a value
                break
            i += 1
            if isinstance(value, DictionaryObject):
                if "/D" in value:
                    value = value["/D"]
                else:
                    continue
            dest = self._build_destination(key, value)
            if dest is not None:
                retval[key] = dest
    else:  # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF 1.1)
        for k__, v__ in tree.items():
            val = v__.get_object()
            if isinstance(val, DictionaryObject):
                if "/D" in val:
                    val = val["/D"].get_object()
                else:
                    continue
            dest = self._build_destination(k__, val)
            if dest is not None:
                retval[k__] = dest
    return retval

518

519 # A select group of relevant field attributes. For the complete list,

520 # see §12.3.2 of the PDF 1.7 or PDF 2.0 specification.

521

def get_fields(
    self,
    tree: Optional[TreeObject] = None,
    retval: Optional[dict[Any, Any]] = None,
    fileobj: Optional[Any] = None,
    stack: Optional[list[PdfObject]] = None,
) -> Optional[dict[str, Any]]:
    """
    Extract field data if this PDF contains interactive form fields.

    The *tree*, *retval*, *stack* parameters are for recursive use.

    Args:
        tree: Current object to parse.
        retval: In-progress list of fields.
        fileobj: A file object (usually a text file) to write
            a report to on all interactive form fields found.
        stack: List of already parsed objects.

    Returns:
        A dictionary where each key is a field name, and each
        value is a :class:`Field<pypdf.generic.Field>` object. By
        default, the mapping name is used for keys.
        ``None`` if form data could not be located.

    """
    field_attributes = FA.attributes_dict()
    field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict())

    if retval is None:
        # Top-level call: start from the AcroForm entry of the catalog.
        catalog = self.root_object
        if CD.ACRO_FORM not in catalog:
            return None
        retval = {}
        stack = []
        tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])

    if tree is None:
        return retval
    assert stack is not None

    if "/Fields" in tree:
        for entry in cast(ArrayObject, tree["/Fields"]):
            resolved = entry.get_object()
            self._build_field(resolved, retval, fileobj, field_attributes, stack)
    elif any(attr in tree for attr in field_attributes):
        # The tree itself is a field.
        self._build_field(tree, retval, fileobj, field_attributes, stack)
    return retval

571

def _get_qualified_field_name(self, parent: DictionaryObject) -> str:
    """
    Build the qualified name of a field.

    A ``/TM`` entry is returned as is. Otherwise the ``/T`` values (empty
    string when absent) along the ``/Parent`` chain are joined with dots,
    outermost ancestor first.
    """
    if "/TM" in parent:
        return cast(str, parent["/TM"])
    if "/Parent" not in parent:
        return cast(str, parent.get("/T", ""))
    ancestors = self._get_qualified_field_name(
        cast(DictionaryObject, parent["/Parent"])
    )
    own_name = cast(str, parent.get("/T", ""))
    return ancestors + "." + own_name

584

def _build_field(
    self,
    field: Union[TreeObject, DictionaryObject],
    retval: dict[Any, Any],
    fileobj: Any,
    field_attributes: Any,
    stack: list[PdfObject],
) -> None:
    """
    Register one form field in ``retval`` and descend into its kids.

    Objects having neither ``/T`` nor ``/TM`` are ignored. For choice
    fields, checkboxes and radio buttons a ``/_States_`` entry listing the
    possible states is added to the stored ``Field``.

    Args:
        field: The field dictionary to process.
        retval: Mapping of qualified field name to ``Field``; updated in place.
        fileobj: Optional report target; when truthy, the field is written
            to it followed by a blank line.
        field_attributes: Mapping of attribute key to display name, passed
            on to ``_write_field``.
        stack: Already visited objects, passed on to ``_check_kids``.

    """
    if all(attr not in field for attr in ("/T", "/TM")):
        return
    key = self._get_qualified_field_name(field)
    if fileobj:
        self._write_field(fileobj, field, field_attributes)
        fileobj.write("\n")
    retval[key] = Field(field)
    entry = retval[key]
    obj = entry.indirect_reference.get_object()  # to get the full object
    states_key = NameObject("/_States_")
    field_type = obj.get(FA.FT, "")

    # /Opt is optional for choice fields: only expose states when present
    # instead of failing with a KeyError.
    if field_type == "/Ch" and FA.Opt in obj:
        entry[states_key] = obj[NameObject(FA.Opt)]

    if field_type == "/Btn" and "/AP" in obj:
        # Checkbox
        entry[states_key] = ArrayObject(list(obj["/AP"]["/N"].keys()))
        if "/Off" not in entry[states_key]:
            entry[states_key].append(NameObject("/Off"))
    elif field_type == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0:
        # Radio buttons: collect the distinct states of all kids, in order.
        states: list[str] = []
        for kid in obj.get(FA.Kids, {}):
            kid = kid.get_object()
            for state in kid["/AP"]["/N"].keys():
                if state not in states:
                    states.append(state)
        entry[states_key] = ArrayObject(states)
        if (
            obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0
            and "/Off" in entry[states_key]
        ):
            del entry[states_key][entry[states_key].index("/Off")]
    # at last for order
    self._check_kids(field, retval, fileobj, stack)

626

def _check_kids(
    self,
    tree: Union[TreeObject, DictionaryObject],
    retval: Any,
    fileobj: Any,
    stack: list[PdfObject],
) -> None:
    """
    Process the ``/Kids`` of ``tree`` through ``get_fields``.

    ``stack`` records the objects already handled; an object found there a
    second time is reported with a warning and not processed again.
    """
    if tree in stack:
        already_seen = self._get_qualified_field_name(tree)
        logger_warning(f"{already_seen} already parsed", __name__)
        return
    stack.append(tree)
    if PagesAttributes.KIDS not in tree:
        return
    # recurse down the tree
    for child in tree[PagesAttributes.KIDS]:  # type: ignore
        self.get_fields(child.get_object(), retval, fileobj, stack)

645

def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None:
    """
    Write a ``name: value`` line per known attribute of ``field``.

    Args:
        fileobj: Object with a ``write`` method receiving the report lines.
        field: The field whose attributes are reported.
        field_attributes: Mapping of attribute key to the display name used
            in the report.

    The kids and additional-actions attributes are skipped. The field type
    is written with a readable label (unknown types are not written), the
    parent is written by its ``/TM`` or else ``/T`` name, and attributes
    missing from the field are silently left out.
    """
    readable_types = {
        "/Btn": "Button",
        "/Tx": "Text",
        "/Ch": "Choice",
        "/Sig": "Signature",
    }
    skipped = (FA.Kids, FA.AA)
    all_attributes = FA.attributes() + CheckboxRadioButtonAttributes.attributes()

    for attr in all_attributes:
        if attr in skipped:
            continue
        attr_name = field_attributes[attr]
        try:
            if attr == FA.FT:
                # Make the field type value clearer
                if field[attr] in readable_types:
                    fileobj.write(f"{attr_name}: {readable_types[field[attr]]}\n")
            elif attr == FA.Parent:
                # Only the name of the parent is written
                parent = field[attr]
                try:
                    name = parent[FA.TM]
                except KeyError:
                    name = parent[FA.T]
                fileobj.write(f"{attr_name}: {name}\n")
            else:
                fileobj.write(f"{attr_name}: {field[attr]}\n")
        except KeyError:
            # Field attribute is N/A or unknown, so don't write anything
            pass

682

def get_form_text_fields(self, full_qualified_name: bool = False) -> dict[str, Any]:
    """
    Retrieve form fields from the document with textual data.

    Args:
        full_qualified_name: to get full name

    Returns:
        A dictionary. The key is the name of the form field,
        the value is the content of the field.

        If the document contains multiple form fields with the same name, the
        second and following will get the suffix .2, .3, ...

    """

    def indexed_key(k: str, fields: dict[Any, Any]) -> str:
        # First use of a name keeps it; later uses get ".2", ".3", ...
        if k in fields:
            prefix = k + "."
            already_numbered = sum(1 for existing in fields if existing.startswith(prefix))
            return prefix + str(already_numbered + 2)
        return k

    # Retrieve document form fields
    formfields = self.get_fields()
    if formfields is None:
        return {}

    text_fields = {}
    for qualified_name, field in formfields.items():
        if field.get("/FT") != "/Tx":
            continue
        if full_qualified_name:
            name = qualified_name
        else:
            name = indexed_key(cast(str, field["/T"]), text_fields)
        text_fields[name] = field.get("/V")
    return text_fields

720

def get_pages_showing_field(
    self, field: Union[Field, PdfObject, IndirectObject]
) -> list[PageObject]:
    """
    Provides list of pages where the field is called.

    Args:
        field: Field Object, PdfObject or IndirectObject referencing a Field

    Returns:
        List of pages:
            - Empty list:
                The field has no widgets attached
                (either hidden field or ancestor field).
            - Single page list:
                Page where the widget is present
                (most common).
            - Multi-page list:
                Field with multiple kids widgets
                (example: radio buttons, field repeated on multiple pages).

    Raises:
        ValueError: If the field cannot be resolved, or if no ``/FT`` entry
            is found on it or on its ancestors.

    """

    def _get_inherited(obj: DictionaryObject, key: str) -> Any:
        # Look the key up on the object itself, then along its /Parent chain.
        if key in obj:
            return obj[key]
        if "/Parent" not in obj:
            return None
        parent = cast(DictionaryObject, obj["/Parent"].get_object())
        return _get_inherited(parent, key)

    def _pages_of_widget(widget: Any) -> list[Any]:
        # Use the widget's own /P entry when present, else scan all pages.
        if "/P" in widget:
            return [widget["/P"].get_object()]
        return [
            page
            for page in self.pages
            if widget.indirect_reference in page.get("/Annots", "")
        ]

    try:
        # to cope with all types
        field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
    except Exception as exc:
        raise ValueError("Field type is invalid") from exc
    if is_null_or_none(_get_inherited(field, "/FT")):
        raise ValueError("Field is not valid")

    found = []
    if field.get("/Subtype", "") == "/Widget":
        found = _pages_of_widget(field)
    else:
        for kid in field.get("/Kids", ()):
            kid = kid.get_object()
            # Only kids that are plain widgets (not fields of their own) count.
            if (kid.get("/Subtype", "") == "/Widget") and ("/T" not in kid):
                found += _pages_of_widget(kid)

    result = []
    for candidate in found:
        if isinstance(candidate, PageObject):
            result.append(candidate)
        else:
            index = self._get_page_number_by_indirect(candidate.indirect_reference)
            result.append(self.pages[index])  # type: ignore
    return result

790

791 @property

def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """
    Property to access the opening destination (``/OpenAction`` entry in
    the PDF catalog). It returns ``None`` if the entry does not exist
    or is not set.

    A string entry is returned as a string object, an array entry is turned
    into a ``Destination`` named ``"OpenAction"``; any other kind of entry
    yields ``None``.

    Raises:
        Exception: If a destination is invalid.

    """
    if "/OpenAction" not in self.root_object:
        return None
    oa: Any = self.root_object["/OpenAction"]
    if isinstance(oa, bytes):  # pragma: no cover
        oa = oa.decode()
    if isinstance(oa, str):
        return create_string_object(oa)
    if isinstance(oa, ArrayObject):
        try:
            page, typ, *array = oa
            fit = Fit(typ, tuple(array))
            return Destination("OpenAction", page, fit)
        except Exception as exc:
            # Chain the original error explicitly so it is reported as the cause.
            raise Exception(f"Invalid Destination {oa}: {exc}") from exc
    return None

820

821 @open_destination.setter

def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    """
    Setter placeholder: assigning ``open_destination`` is not supported here.

    Raises:
        NotImplementedError: Always.

    """
    raise NotImplementedError("No setter for open_destination")

824

825 @property

def outline(self) -> OutlineType:
    """
    Read-only property for the outline present in the document
    (i.e., a collection of 'outline items' which are also known as
    'bookmarks').

    The outline is rebuilt by ``_get_outline`` on every access.
    """
    return self._get_outline()

833

def _get_outline(
    self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None
) -> OutlineType:
    """
    Collect the outline items starting at ``node``.

    Args:
        node: First outline item of the level to read; on the initial call
            it is taken from the ``/First`` entry of the catalog's outline
            dictionary.
        outline: List receiving the items; ``None`` marks the initial call.

    Returns:
        The list of outline items, where the children of an item follow it
        as a nested list.

    """
    if outline is None:
        # Initial call: locate the first item and load the named destinations.
        outline = []
        catalog = self.root_object

        if CO.OUTLINES in catalog:
            lines = cast(DictionaryObject, catalog[CO.OUTLINES])

            if isinstance(lines, NullObject):
                return outline

            # §12.3.3 Document outline, entries in the outline dictionary
            if not is_null_or_none(lines) and "/First" in lines:
                node = cast(DictionaryObject, lines["/First"])
        self._named_destinations = self._get_named_destinations()

    if node is None:
        return outline

    # Walk the siblings through their /Next links.
    current = node
    while True:
        item = self._build_outline_item(current)
        if item:
            outline.append(item)

        # Children of the current item go into a nested list.
        if "/First" in current:
            children: list[Any] = []
            self._get_outline(cast(DictionaryObject, current["/First"]), children)
            if children:
                outline.append(children)

        if "/Next" not in current:
            return outline
        current = cast(DictionaryObject, current["/Next"])

874

875 @property

def threads(self) -> Optional[ArrayObject]:
    """
    Read-only property for the list of threads.

    See §12.4.3 from the PDF 1.7 or 2.0 specification.

    The value is an array of dictionaries with "/F" (the first bead in the
    thread) and "/I" (a thread information dictionary containing information
    about the thread, such as its title, author, and creation date)
    properties, or None if there are no articles.

    Since PDF 2.0 it can also contain an indirect reference to a metadata
    stream containing information about the thread, such as its title,
    author, and creation date.
    """
    catalog = self.root_object
    if CO.THREADS not in catalog:
        return None
    return cast("ArrayObject", catalog[CO.THREADS])

895

896 @abstractmethod

def _get_page_number_by_indirect(
    self, indirect_reference: Union[None, int, NullObject, IndirectObject]
) -> Optional[int]:
    """
    Map a page reference to its page number (to be provided by subclasses).

    Args:
        indirect_reference: Reference identifying the page.

    Returns:
        The page number, or ``None``.
        NOTE(review): presumably ``None`` means "page not found" - confirm
        against the implementing classes.

    """
    ...  # pragma: no cover

901

def get_page_number(self, page: PageObject) -> Optional[int]:
    """
    Retrieve page number of a given PageObject.

    The lookup is done with the page's ``indirect_reference``.

    Args:
        page: The page to get page number. Should be
            an instance of :class:`PageObject<pypdf._page.PageObject>`

    Returns:
        The page number or None if page is not found

    """
    return self._get_page_number_by_indirect(page.indirect_reference)

915

def get_destination_page_number(self, destination: Destination) -> Optional[int]:
    """
    Retrieve page number of a given Destination object.

    The lookup is done with the destination's ``page`` attribute.

    Args:
        destination: The destination to get page number.

    Returns:
        The page number or None if page is not found

    """
    return self._get_page_number_by_indirect(destination.page)

928

def _build_destination(
    self,
    title: str,
    array: Optional[
        list[
            Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject]
        ]
    ],
) -> Destination:
    """
    Create a ``Destination`` named ``title`` from a destination array.

    Args:
        title: Title given to the destination.
        array: Page, fit type and fit arguments, in that order. ``None``, a
            null object, a string or an empty array is treated as a missing
            destination.

    Returns:
        The destination. A missing destination points to a null page with
        the default fit. If the fit cannot be built and ``strict`` is off,
        the destination points to the first page instead.

    Raises:
        PdfReadError: If the fit cannot be built and ``strict`` is on.

    """
    # handle outline items with missing or invalid destination
    is_missing = (
        array is None
        or isinstance(array, (NullObject, str))
        or (isinstance(array, ArrayObject) and len(array) == 0)
    )
    if is_missing:
        return Destination(title, NullObject(), Fit.fit())

    page, typ, *fit_args = array  # type: ignore
    try:
        return Destination(title, page, Fit(fit_type=typ, fit_args=fit_args))  # type: ignore
    except PdfReadError:
        logger_warning(f"Unknown destination: {title} {fit_args}", __name__)
        if self.strict:
            raise
        # create a link to first Page
        first_page_ref = self.pages[0].indirect_reference
        target = first_page_ref if first_page_ref is not None else NullObject()
        return Destination(title, target, Fit.fit())

958

def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]:
    """
    Convert one outline item dictionary into a ``Destination``.

    The destination is taken from a ``/A`` GoTo action or from ``/Dest``.
    Named destinations are resolved through ``self._named_destinations``.
    Color (``/C``), style (``/F``) and child count (``/Count``) are copied
    over when present, and ``/%is_open%`` is derived from the count.

    Args:
        node: The outline item dictionary.

    Returns:
        The destination built for this item.

    Raises:
        PdfReadError: In strict mode, if ``/Title`` is missing, if a GoTo
            action has no ``/D``, or if the destination has an unexpected type.

    """
    dest, title, outline_item = None, None, None

    # title required for valid outline
    # §12.3.3, entries in an outline item dictionary
    try:
        title = cast("str", node["/Title"])
    except KeyError:
        if self.strict:
            raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}")
        # non-strict mode: fall back to an empty title
        title = ""

    if "/A" in node:
        # Action, PDF 1.7 and PDF 2.0 §12.6 (only type GoTo supported)
        action = cast(DictionaryObject, node["/A"])
        action_type = cast(NameObject, action[GoToActionArguments.S])
        if action_type == "/GoTo":
            if GoToActionArguments.D in action:
                dest = action[GoToActionArguments.D]
            elif self.strict:
                raise PdfReadError(f"Outline Action Missing /D attribute: {node!r}")
    elif "/Dest" in node:
        # Destination, PDF 1.7 and PDF 2.0 §12.3.2
        dest = node["/Dest"]
        # if array was referenced in another object, will be a dict w/ key "/D"
        if isinstance(dest, DictionaryObject) and "/D" in dest:
            dest = dest["/D"]

    if isinstance(dest, ArrayObject):
        outline_item = self._build_destination(title, dest)
    elif isinstance(dest, str):
        # named destination, addresses NameObject Issue #193
        # TODO: Keep named destination instead of replacing it?
        try:
            outline_item = self._build_destination(
                title, self._named_destinations[dest].dest_array
            )
        except KeyError:
            # named destination not found in Name Dict
            outline_item = self._build_destination(title, None)
    elif dest is None:
        # outline item not required to have destination or action
        # PDFv1.7 Table 153
        outline_item = self._build_destination(title, dest)
    else:
        if self.strict:
            raise PdfReadError(f"Unexpected destination {dest!r}")
        logger_warning(
            f"Removed unexpected destination {dest!r} from destination",
            __name__,
        )
        outline_item = self._build_destination(title, None)

    # if outline item created, add color, format, and child count if present
    if outline_item:
        if "/C" in node:
            # Color of outline item font in (R, G, B) with values ranging 0.0-1.0
            outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"])  # type: ignore
        if "/F" in node:
            # specifies style characteristics bold and/or italic
            # with 1=italic, 2=bold, 3=both
            outline_item[NameObject("/F")] = node["/F"]
        if "/Count" in node:
            # absolute value = num. visible children
            # with positive = open/unfolded, negative = closed/folded
            outline_item[NameObject("/Count")] = node["/Count"]
        # if count is 0 we will consider it as open (to have available is_open)
        outline_item[NameObject("/%is_open%")] = BooleanObject(
            node.get("/Count", 0) >= 0
        )
    # keep a link back to the source node on the built item
    outline_item.node = node
    try:
        outline_item.indirect_reference = node.indirect_reference
    except AttributeError:
        # the node provides no indirect reference: nothing to copy
        pass
    return outline_item

1035

@property
def pages(self) -> list[PageObject]:
    """
    List-like view over the :class:`PageObject<pypdf._page.PageObject>` items
    of this document; a single page or a range of pages can be retrieved.

    Note:
        For PdfWriter only: a page or a range of pages can be removed from
        the list with the ``del`` operator. Remember that only the page
        entry is removed, because the objects beneath it may be used
        elsewhere. To get rid of them completely - provided nothing else
        uses them - write to a buffer/temporary file and load the result
        into a new PdfWriter.

    """
    virtual_pages = _VirtualList(self.get_num_pages, self.get_page)
    return virtual_pages  # type: ignore

1052

@property
def page_labels(self) -> list[str]:
    """
    Labels of all pages of this document, in page order.

    This property is read-only.
    """
    page_count = len(self.pages)
    labels = []
    for index in range(page_count):
        labels.append(page_index2page_label(self, index))
    return labels

1062

@property
def page_layout(self) -> Optional[str]:
    """
    Page layout stored in the document catalog, or None when it is not set.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    try:
        layout = self.root_object[CD.PAGE_LAYOUT]
    except KeyError:
        # The catalog carries no page layout entry.
        return None
    return cast(NameObject, layout)

1090

@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Page mode stored in the document catalog, or None when it is not set.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    try:
        mode = self.root_object["/PageMode"]
    except KeyError:
        # The catalog carries no /PageMode entry.
        return None
    return mode  # type: ignore

1116

def _flatten(
    self,
    list_only: bool = False,
    pages: Union[None, DictionaryObject, PageObject] = None,
    inherit: Optional[dict[str, Any]] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Process the document pages to ease searching.

    Attributes of a page may inherit from ancestor nodes
    in the page tree. Flattening means moving
    any inheritance data into descendant nodes,
    effectively removing the inheritance dependency.

    Note: It is distinct from another use of "flattening" applied to PDFs.
    Flattening a PDF also means combining all the contents into one single layer
    and making the file less editable.

    Args:
        list_only: Will only list the pages within _flatten_pages.
        pages: The page tree node to process. When None, processing starts
            at the ``/Pages`` entry of the document catalog and
            ``self.flattened_pages`` is reset to an empty list.
        inherit: Inheritable attributes collected from the ancestors of
            ``pages``. The mapping passed in is never modified.
        indirect_reference: Used recursively to flatten the /Pages object.

    Raises:
        PdfReadError: If the catalog ``/Pages`` entry is not a dictionary,
            or if the recursion limit is hit while descending the tree.

    """
    inheritable_page_attributes = (
        NameObject(PG.RESOURCES),
        NameObject(PG.MEDIABOX),
        NameObject(PG.CROPBOX),
        NameObject(PG.ROTATE),
    )
    if inherit is None:
        inherit = {}
    if pages is None:
        # Fix issue 327: set flattened_pages attribute only for
        # decrypted file
        catalog = self.root_object
        pages = catalog.get("/Pages").get_object()  # type: ignore
        if not isinstance(pages, DictionaryObject):
            raise PdfReadError("Invalid object in /Pages")
        self.flattened_pages = []

    if PagesAttributes.TYPE in pages:
        t = cast(str, pages[PagesAttributes.TYPE])
    # if the page tree node has no /Type, consider as a page if /Kids is also missing
    elif PagesAttributes.KIDS not in pages:
        t = "/Page"
    else:
        t = "/Pages"

    if t == "/Pages":
        # Work on a private copy: the attributes defined by this node must
        # only reach its own descendants. Updating the caller's mapping in
        # place would leak them into sibling subtrees processed afterwards.
        inherit = dict(inherit)
        for attr in inheritable_page_attributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        for page in cast(ArrayObject, pages[PagesAttributes.KIDS]):
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirect_reference"] = page
            obj = page.get_object()
            if obj:
                # damaged file may have invalid child in /Pages
                try:
                    self._flatten(list_only, obj, inherit, **addt)
                except RecursionError:
                    raise PdfReadError(
                        "Maximum recursion depth reached during page flattening."
                    )
    elif t == "/Page":
        for attr_in, value in inherit.items():
            # if the page has its own value, it does not inherit the
            # parent's value
            if attr_in not in pages:
                pages[attr_in] = value
        page_obj = PageObject(self, indirect_reference)
        if not list_only:
            page_obj.update(pages)

        # TODO: Could flattened_pages be None at this point?
        self.flattened_pages.append(page_obj)  # type: ignore

1197

def remove_page(
    self,
    page: Union[int, PageObject, IndirectObject],
    clean: bool = False,
) -> None:
    """
    Remove page from pages list.

    A warning is logged and nothing is removed when the page cannot be
    resolved or its number is out of range.

    Args:
        page:
            * :class:`int`: Page number to be removed.
            * :class:`~pypdf._page.PageObject`: page to be removed. If the page appears many times
              only the first one will be removed.
            * :class:`~pypdf.generic.IndirectObject`: Reference to page to be removed.

        clean: replace PageObject with NullObject to prevent annotations
            or destinations to reference a detached page.

    """
    if self.flattened_pages is None:
        self._flatten(self._readonly)
    assert self.flattened_pages is not None
    flat_pages = self.flattened_pages

    target = page
    if isinstance(target, IndirectObject):
        resolved = target.get_object()
        if not isinstance(resolved, PageObject):
            logger_warning("IndirectObject is not referencing a page", __name__)
            return
        target = resolved

    if isinstance(target, int):
        index = target
    else:
        # Page objects are located by their position in the flattened list.
        try:
            index = flat_pages.index(target)
        except ValueError:
            logger_warning("Cannot find page in pages", __name__)
            return
    if index < 0 or index >= len(flat_pages):
        logger_warning("Page number is out of range", __name__)
        return

    # Fetch the reference before deleting the entry; it is needed for cleaning.
    reference = self.pages[index].indirect_reference
    del self.pages[index]
    if clean and reference is not None:
        self._replace_object(reference, NullObject())

1241

def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
    """
    Development shortcut for resolving an indirect object.

    Same as ``generic.IndirectObject(num, gen, self).get_object()``.

    Args:
        num: The object number of the indirect object.
        gen: The generation number of the indirect object.

    Returns:
        A PdfObject

    """
    reference = IndirectObject(num, gen, self)
    return reference.get_object()

1257

def decode_permissions(
    self, permissions_code: int
) -> dict[str, bool]:  # pragma: no cover
    """
    Translate an integer permission code into a name -> allowed mapping.

    Deprecated in favour of ``user_access_permissions``.
    """
    deprecation_with_replacement(
        old_name="decode_permissions",
        new_name="user_access_permissions",
        removed_in="5.0.0",
    )

    flags_by_name = (
        ("print", UserAccessPermissions.PRINT),
        ("modify", UserAccessPermissions.MODIFY),
        ("copy", UserAccessPermissions.EXTRACT),
        ("annotations", UserAccessPermissions.ADD_OR_MODIFY),
        ("forms", UserAccessPermissions.FILL_FORM_FIELDS),
        # Do not fix typo, as part of official, but deprecated API.
        ("accessability", UserAccessPermissions.EXTRACT_TEXT_AND_GRAPHICS),
        ("assemble", UserAccessPermissions.ASSEMBLE_DOC),
        ("print_high_quality", UserAccessPermissions.PRINT_TO_REPRESENTATION),
    )

    allowed: dict[str, bool] = {}
    for label, flag in flags_by_name:
        # A permission is granted when its bit is set in the code.
        allowed[label] = (permissions_code & flag) != 0
    return allowed

1284

@property
def user_access_permissions(self) -> Optional[UserAccessPermissions]:
    """User access permissions of an encrypted document; None if not encrypted."""
    encryption = self._encryption
    return None if encryption is None else UserAccessPermissions(encryption.P)

1291

@property
@abstractmethod
def is_encrypted(self) -> bool:
    """
    Whether this PDF file is encrypted (read-only, abstract).

    Note that a true value stays true even after
    :meth:`decrypt()<pypdf.PdfReader.decrypt>` has been called.
    """
    ...  # pragma: no cover

1302

@property
def xfa(self) -> Optional[dict[str, Any]]:
    """
    Decompressed XFA packets stored in the document's ``/AcroForm``.

    The ``/XFA`` entry is read as a flat array of alternating items: a tag
    followed by its value. Only values that are indirect references are
    used; the referenced object's ``_data`` is decompressed with zlib.

    Returns:
        None when the catalog has no ``/AcroForm`` entry or the entry is
        falsy. Otherwise a dictionary mapping each tag to the decompressed
        data; it is empty when there is no ``/XFA`` entry.

    """
    catalog = self.root_object

    if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
        return None

    tree = cast(TreeObject, catalog["/AcroForm"])
    retval: dict[str, Any] = {}

    if "/XFA" in tree:
        fields = cast(ArrayObject, tree["/XFA"])
        items = iter(fields)
        # Zipping the iterator with itself yields (tag, value) pairs. A
        # dangling tag at the end of an odd-length array is dropped instead
        # of letting StopIteration escape from the property.
        for tag, value in zip(items, items):
            if isinstance(value, IndirectObject):
                field = cast(Optional[EncodedStreamObject], value.get_object())
                if field:
                    retval[tag] = zlib.decompress(field._data)
    return retval

1326

@property
def attachments(self) -> Mapping[str, list[bytes]]:
    """Mapping of attachment filenames to their content."""
    # Each value is a (loader, argument) pair; the content is fetched by
    # the mapping only when the corresponding key is accessed.
    loader = self._get_attachment_list
    pending = {}
    for filename in self._list_attachments():
        pending[filename] = (loader, filename)
    return LazyDict(pending)

1336

@property
def attachment_list(self) -> Generator[EmbeddedFile, None, None]:
    """Iterable of attachment objects."""
    embedded_files = EmbeddedFile._load(self.root_object)
    yield from embedded_files

1341

1342 def _list_attachments(self) -> list[str]:

1343 """

1344 Retrieves the list of filenames of file attachments.

1345

1346 Returns:

1347 list of filenames

1348

1349 """

1350 names = []

1351 for entry in self.attachment_list:

1352 names.append(entry.name)

1353 if (name := entry.alternative_name) != entry.name and name:

1354 names.append(name)

1355 return names

1356

1357 def _get_attachment_list(self, name: str) -> list[bytes]:

1358 out = self._get_attachments(name)[name]

1359 if isinstance(out, list):

1360 return out

1361 return [out]

1362

1363 def _get_attachments(

1364 self, filename: Optional[str] = None

1365 ) -> dict[str, Union[bytes, list[bytes]]]:

1366 """

1367 Retrieves all or selected file attachments of the PDF as a dictionary of file names

1368 and the file data as a bytestring.

1369

1370 Args:

1371 filename: If filename is None, then a dictionary of all attachments

1372 will be returned, where the key is the filename and the value

1373 is the content. Otherwise, a dictionary with just a single key

1374 - the filename - and its content will be returned.

1375

1376 Returns:

1377 dictionary of filename -> Union[bytestring or List[ByteString]]

1378 If the filename exists multiple times a list of the different versions will be provided.

1379

1380 """

1381 attachments: dict[str, Union[bytes, list[bytes]]] = {}

1382 for entry in self.attachment_list:

1383 names = set()

1384 alternative_name = entry.alternative_name

1385 if filename is not None:

1386 if filename in {entry.name, alternative_name}:

1387 name = entry.name if filename == entry.name else alternative_name

1388 names.add(name)

1389 else:

1390 continue

1391 else:

1392 names = {entry.name, alternative_name}

1393

1394 for name in names:

1395 if name is None:

1396 continue

1397 if name in attachments:

1398 if not isinstance(attachments[name], list):

1399 attachments[name] = [attachments[name]] # type:ignore

1400 attachments[name].append(entry.content) # type:ignore

1401 else:

1402 attachments[name] = entry.content

1403 return attachments

1404

1405 @abstractmethod

1406 def _repr_mimebundle_(

1407 self,

1408 include: Union[None, Iterable[str]] = None,

1409 exclude: Union[None, Iterable[str]] = None,

1410 ) -> dict[str, Any]:

1411 """

1412 Integration into Jupyter Notebooks.

1413

1414 This method returns a dictionary that maps a mime-type to its

1415 representation.

1416

1417 .. seealso::

1418

1419 https://ipython.readthedocs.io/en/stable/config/integrating.html

1420 """

1421 ... # pragma: no cover

1422

1423

class LazyDict(Mapping[Any, Any]):
    """
    Read-only mapping whose values are computed on access.

    Every stored value is a ``(function, argument)`` pair; looking up a key
    calls ``function(argument)`` and returns the result.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        # Accepts the same arguments as the dict constructor.
        self._raw_dict = dict(*args, **kwargs)

    def __getitem__(self, key: str) -> Any:
        loader, argument = self._raw_dict[key]
        return loader(argument)

    def __iter__(self) -> Iterator[Any]:
        return iter(self._raw_dict)

    def __len__(self) -> int:
        return len(self._raw_dict)

    def __str__(self) -> str:
        # Only the keys are shown, so no value gets computed here.
        key_list = list(self.keys())
        return f"LazyDict(keys={key_list})"

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pypdf/_doc_common.py: 22%

643 statements