Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfpage.py: 97%

1import itertools

2import logging

3from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple

5from pdfminer import settings

6from pdfminer.pdfdocument import (

7 PDFDocument,

8 PDFNoPageLabels,

9 PDFTextExtractionNotAllowed,

10)

11from pdfminer.pdfexceptions import PDFObjectNotFound, PDFValueError

12from pdfminer.pdfparser import PDFParser

13from pdfminer.pdftypes import dict_value, int_value, list_value, resolve1

14from pdfminer.psparser import LIT

15from pdfminer.utils import parse_rect

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Predefined literals for the /Type values "Page" and "Pages".
# The page-tree traversal in PDFPage compares against these with `is`.
LITERAL_PAGE = LIT("Page")
LITERAL_PAGES = LIT("Pages")

class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes
    ----------
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: a dictionary of resources used by the page.
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page.
      rotate: the page rotation (in degree).
      annots: the page annotations.
      beads: a chain that represents natural reading order.
      label: the page's label (typically, the logical page number).

    """

    # Attributes that a page takes over from its parent node in the page
    # tree when it does not define them itself (see create_pages).
    INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}

    # Page size used when a page has no MediaBox at all: US Letter, in points.
    _DEFAULT_MEDIABOX = (0.0, 0.0, 612.0, 792.0)

    def __init__(
        self,
        doc: PDFDocument,
        pageid: object,
        attrs: object,
        label: Optional[str],
    ) -> None:
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        label: page label string.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.label = label
        self.lastmod = resolve1(self.attrs.get("LastModified"))
        self.resources: Dict[object, object] = resolve1(
            self.attrs.get("Resources", dict()),
        )

        # Resolve the MediaBox value itself before iterating over it, the same
        # way CropBox is resolved below, so that an indirectly referenced
        # array is handled. Each element is then resolved individually.
        raw_mediabox = resolve1(self.attrs.get("MediaBox"))
        if raw_mediabox is None:
            # A missing MediaBox used to surface as a bare KeyError; fall back
            # to a default page size so the remaining attributes stay usable.
            log.warning(
                "MediaBox missing from /Page (and not inherited), "
                "defaulting to US Letter",
            )
            self.mediabox = self._DEFAULT_MEDIABOX
        else:
            mediabox_params: List[Any] = [
                resolve1(mediabox_param) for mediabox_param in raw_mediabox
            ]
            self.mediabox = parse_rect(mediabox_params)

        # The crop box defaults to the media box; an unparsable CropBox is
        # deliberately ignored rather than treated as an error.
        self.cropbox = self.mediabox
        if "CropBox" in self.attrs:
            try:
                self.cropbox = parse_rect(resolve1(self.attrs["CropBox"]))
            except PDFValueError:
                pass

        # Normalise the rotation into the range [0, 360).
        self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
        self.annots = self.attrs.get("Annots")
        self.beads = self.attrs.get("B")

        # Contents may be absent, a single object or a list; always store a list.
        if "Contents" in self.attrs:
            contents = resolve1(self.attrs["Contents"])
        else:
            contents = []
        if not isinstance(contents, list):
            contents = [contents]
        self.contents: List[object] = contents

    def __repr__(self) -> str:
        return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

    @classmethod
    def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
        """Yield a PDFPage for every page found in *document*.

        The page tree rooted at the catalog's "Pages" entry is walked depth
        first. If that yields no page at all (including when "Pages" is
        missing), every object listed in the document's xrefs is scanned for
        dictionaries whose "Type" is the Page literal instead.

        Each page receives the next value from document.get_page_labels(),
        or None when the document raises PDFNoPageLabels.
        """

        def depth_first_search(
            obj: Any,
            parent: Dict[str, Any],
            visited: Optional[Set[Any]] = None,
        ) -> Iterator[Tuple[int, Dict[Any, Dict[Any, Any]]]]:
            """Yield (object id, attribute dict) for each page below *obj*."""
            if isinstance(obj, int):
                object_id = obj
                object_properties = dict_value(document.getobj(object_id)).copy()
            else:
                # This looks broken. obj.objid means obj could be either
                # PDFObjRef or PDFStream, but neither is valid for dict_value.
                object_id = obj.objid  # type: ignore[attr-defined]
                object_properties = dict_value(obj).copy()

            # Avoid recursion errors by keeping track of visited nodes
            if visited is None:
                visited = set()
            if object_id in visited:
                return
            visited.add(object_id)

            # Copy inheritable attributes down from the parent node unless
            # this node defines them itself.
            for k, v in parent.items():
                if k in cls.INHERITABLE_ATTRS and k not in object_properties:
                    object_properties[k] = v

            object_type = object_properties.get("Type")
            if object_type is None and not settings.STRICT:  # See #64
                object_type = object_properties.get("type")

            if object_type is LITERAL_PAGES and "Kids" in object_properties:
                log.debug("Pages: Kids=%r", object_properties["Kids"])
                for child in list_value(object_properties["Kids"]):
                    yield from depth_first_search(child, object_properties, visited)

            elif object_type is LITERAL_PAGE:
                log.debug("Page: %r", object_properties)
                yield (object_id, object_properties)

        try:
            page_labels: Iterator[Optional[str]] = document.get_page_labels()
        except PDFNoPageLabels:
            page_labels = itertools.repeat(None)

        # Tracks whether the page tree produced at least one page.
        pages = False
        if "Pages" in document.catalog:
            objects = depth_first_search(document.catalog["Pages"], document.catalog)
            for objid, tree in objects:
                yield cls(document, objid, tree, next(page_labels))
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
                            yield cls(document, objid, obj, next(page_labels))
                    except PDFObjectNotFound:
                        pass

    @classmethod
    def get_pages(
        cls,
        fp: BinaryIO,
        pagenos: Optional[Container[int]] = None,
        maxpages: int = 0,
        password: str = "",
        caching: bool = True,
        check_extractable: bool = False,
    ) -> Iterator["PDFPage"]:
        """Parse the PDF in *fp* and yield its pages.

        fp: binary file object handed to PDFParser.
        pagenos: zero-based page indices to yield; a falsy value (None or an
            empty container) yields every page.
        maxpages: 0 means no limit; otherwise iteration stops right after
            yielding a page whose zero-based index + 1 is >= maxpages.
        password: forwarded to PDFDocument.
        caching: forwarded to PDFDocument.
        check_extractable: if true, raise PDFTextExtractionNotAllowed when
            the document is not marked as extractable; otherwise only log a
            warning and proceed.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction.
        # If not, warn the user and proceed.
        if not doc.is_extractable:
            if check_extractable:
                # An f-string formats fp correctly even if it is a tuple,
                # which the "%" operator would try to unpack.
                raise PDFTextExtractionNotAllowed(
                    f"Text extraction is not allowed: {fp!r}",
                )
            # Pass fp as a logger argument so formatting happens lazily.
            log.warning(
                "The PDF %r contains a metadata field "
                "indicating that it should not allow "
                "text extraction. Ignoring this field "
                "and proceeding. Use the check_extractable "
                "if you want to raise an error in this case",
                fp,
            )
        # Process each page contained in the document.
        for pageno, page in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno + 1:
                break