Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfparser.py: 95%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2from io import BytesIO
3from typing import TYPE_CHECKING, BinaryIO, Union
5from pdfminer import settings
6from pdfminer.casting import safe_int
7from pdfminer.pdfexceptions import PDFException
8from pdfminer.pdftypes import PDFObjRef, PDFStream, dict_value, int_value
9from pdfminer.psexceptions import PSEOF
10from pdfminer.psparser import KWD, PSKeyword, PSStackParser
12if TYPE_CHECKING:
13 from pdfminer.pdfdocument import PDFDocument
15log = logging.getLogger(__name__)
class PDFSyntaxError(PDFException):
    """Raised when the parser meets PDF syntax it cannot accept."""
# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None
class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
    """PDFParser fetches PDF objects from a file stream.

    It can handle indirect references by referring to
    a PDF document set by the set_document method.
    It also reads XRefs at the end of every PDF file.

    Typical usage:
      parser = PDFParser(fp)
      parser.read_xref()
      parser.read_xref(fallback=True) # optional
      parser.set_document(doc)
      parser.seek(offset)
      parser.nextobject()

    NOTE(review): ``read_xref`` is not defined on this class in the code
    visible here -- confirm that the usage example is still current.
    """

    def __init__(self, fp: BinaryIO) -> None:
        """Create a parser reading from the binary file object ``fp``."""
        PSStackParser.__init__(self, fp)
        # Document handed to every PDFObjRef and used to decipher streams;
        # stays None until set_document() is called.
        self.doc: PDFDocument | None = None
        # When True, /Length is ignored and the stream body is collected by
        # scanning line by line for the ``endstream`` marker.
        self.fallback = False

    def set_document(self, doc: "PDFDocument") -> None:
        """Associates the parser with a PDFDocument object."""
        self.doc = doc

    KEYWORD_R = KWD(b"R")
    KEYWORD_NULL = KWD(b"null")
    KEYWORD_ENDOBJ = KWD(b"endobj")
    KEYWORD_STREAM = KWD(b"stream")
    KEYWORD_XREF = KWD(b"xref")
    KEYWORD_STARTXREF = KWD(b"startxref")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handles PDF-related keywords.

        Args:
            pos: Byte offset at which ``token`` was found.
            token: The keyword to process.

        Raises:
            PDFSyntaxError: If ``stream`` is not preceded by exactly one
                stack entry, or (only when ``settings.STRICT`` is set) if
                /Length is missing or negative, or the file ends inside a
                stream.
        """
        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            self.add_results(*self.pop(1))

        elif token is self.KEYWORD_ENDOBJ:
            # Emit the four topmost stack entries as results (presumably
            # object id, generation number, ``obj`` keyword and the object).
            self.add_results(*self.pop(4))

        elif token is self.KEYWORD_NULL:
            # null object
            self.push((pos, None))

        elif token is self.KEYWORD_R:
            # reference to indirect object; silently ignored when fewer than
            # two operands are on the stack or the object id is not an int.
            if len(self.curstack) >= 2:
                (_, _object_id), _ = self.pop(2)
                object_id = safe_int(_object_id)
                if object_id is not None:
                    obj = PDFObjRef(self.doc, object_id)
                    self.push((pos, obj))

        elif token is self.KEYWORD_STREAM:
            # stream object: the entry just before the keyword must be the
            # stream dictionary.
            popped_data = self.pop(1)
            try:
                ((_, dic),) = popped_data
            except ValueError as err:
                raise PDFSyntaxError(
                    f"Invalid stream dictionary: {popped_data}"
                ) from err

            dic = dict_value(dic)
            objlen = 0
            if not self.fallback:
                try:
                    objlen = int_value(dic["Length"])
                except KeyError as err:
                    if settings.STRICT:
                        raise PDFSyntaxError(
                            f"/Length is undefined: {dic!r}"
                        ) from err
                if objlen < 0:
                    # A negative size would make fp.read() consume the rest
                    # of the file and the seek below move backwards; treat
                    # it like a missing /Length instead.
                    if settings.STRICT:
                        raise PDFSyntaxError(f"/Length is negative: {objlen}")
                    objlen = 0
            self.seek(pos)
            try:
                (_, line) = self.nextline()  # 'stream'
            except PSEOF as err:
                if settings.STRICT:
                    raise PDFSyntaxError("Unexpected EOF") from err
                return
            # The stream body starts right after the line holding 'stream'.
            pos += len(line)
            self.fp.seek(pos)
            data = bytearray(self.fp.read(objlen))
            self.seek(pos + objlen)
            # Scan forward for 'endstream'.  objlen always grows by the bytes
            # scanned; they are appended to data only in fallback mode.
            while True:
                try:
                    (_linepos, line) = self.nextline()
                except PSEOF as err:
                    if settings.STRICT:
                        raise PDFSyntaxError("Unexpected EOF") from err
                    break
                if b"endstream" in line:
                    i = line.index(b"endstream")
                    objlen += i
                    if self.fallback:
                        data += line[:i]
                    break
                objlen += len(line)
                if self.fallback:
                    data += line
            self.seek(pos + objlen)
            # XXX limit objlen not to exceed object boundary
            log.debug(
                "Stream: pos=%d, objlen=%d, dic=%r, data=%r...",
                pos,
                objlen,
                dic,
                data[:10],
            )
            # The decipher callback comes from the associated document, so
            # set_document() must have been called before a stream is parsed.
            assert self.doc is not None
            stream = PDFStream(dic, bytes(data), self.doc.decipher)
            self.push((pos, stream))

        else:
            # others
            self.push((pos, token))
class PDFStreamParser(PDFParser):
    """PDFStreamParser is used to parse PDF content streams
    that are contained in each page and have instructions
    for rendering the page. A reference to a PDF document is
    needed because a PDF content stream can also have
    indirect references to other objects in the same document.
    """

    def __init__(self, data: bytes) -> None:
        """Create a parser over the in-memory content stream ``data``."""
        PDFParser.__init__(self, BytesIO(data))

    def flush(self) -> None:
        """Pop every entry off the stack and hand them to add_results()."""
        self.add_results(*self.popall())

    KEYWORD_OBJ = KWD(b"obj")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle a keyword found at byte offset ``pos`` in the stream.

        ``R`` builds an indirect object reference from the two preceding
        stack entries; ``obj``/``endobj`` are dropped (or rejected in strict
        mode); every other keyword is pushed onto the stack unchanged.

        Raises:
            PDFSyntaxError: If ``obj`` or ``endobj`` appears while
                ``settings.STRICT`` is set.
        """
        if token is self.KEYWORD_R:
            # reference to indirect object; ignored when fewer than two
            # operands are on the stack, since unpacking pop(2) would
            # otherwise fail with a ValueError on malformed streams.
            if len(self.curstack) >= 2:
                (_, _object_id), _ = self.pop(2)
                object_id = safe_int(_object_id)
                if object_id is not None:
                    obj = PDFObjRef(self.doc, object_id)
                    self.push((pos, obj))
            return

        elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
            if settings.STRICT:
                # See PDF Spec 3.4.6: Only the object values are stored in the
                # stream; the obj and endobj keywords are not used.
                raise PDFSyntaxError("Keyword endobj found in stream")
            return

        # others
        self.push((pos, token))