Coverage for /pythoncovmergedfiles/medio/medio/src/pdfminer.six/pdfminer/pdfparser.py: 95%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

103 statements  

1import logging 

2from io import BytesIO 

3from typing import TYPE_CHECKING, BinaryIO, Optional, Union 

4 

5from pdfminer import settings 

6from pdfminer.casting import safe_int 

7from pdfminer.pdfexceptions import PDFException 

8from pdfminer.pdftypes import PDFObjRef, PDFStream, dict_value, int_value 

9from pdfminer.psexceptions import PSEOF 

10from pdfminer.psparser import KWD, PSKeyword, PSStackParser 

11 

12if TYPE_CHECKING: 

13 from pdfminer.pdfdocument import PDFDocument 

14 

15log = logging.getLogger(__name__) 

16 

17 

class PDFSyntaxError(PDFException):
    """Error raised by the PDF parsers in this module for malformed input.

    Within this module it is raised (in strict mode) for a stream with no
    /Length, for an unexpected end of file while reading a stream, and for
    ``obj``/``endobj`` keywords found inside a stream.
    """

20 

21 

22# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None 

class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
    """PDFParser fetches PDF objects from a file stream.

    It can handle indirect references by referring to
    a PDF document set by the set_document method.
    It also reads XRefs at the end of every PDF file.

    Typical usage:
      parser = PDFParser(fp)
      parser.read_xref()
      parser.read_xref(fallback=True) # optional
      parser.set_document(doc)
      parser.seek(offset)
      parser.nextobject()

    NOTE(review): ``read_xref`` is not defined by this class itself; confirm
    the usage example above against the base class / PDFDocument.
    """

    # Keywords that get special treatment in do_keyword().
    KEYWORD_R = KWD(b"R")
    KEYWORD_NULL = KWD(b"null")
    KEYWORD_ENDOBJ = KWD(b"endobj")
    KEYWORD_STREAM = KWD(b"stream")
    KEYWORD_XREF = KWD(b"xref")
    KEYWORD_STARTXREF = KWD(b"startxref")

    def __init__(self, fp: BinaryIO) -> None:
        """Create a parser reading from the binary file object ``fp``."""
        PSStackParser.__init__(self, fp)
        # Document used to build PDFObjRef objects and to obtain the
        # decipher callable for streams; set through set_document().
        self.doc: Optional[PDFDocument] = None
        # When True, /Length is ignored and stream data is collected by
        # scanning for the "endstream" marker instead.
        self.fallback = False

    def set_document(self, doc: "PDFDocument") -> None:
        """Associates the parser with a PDFDocument object."""
        self.doc = doc

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handles PDF-related keywords.

        ``pos`` is the position associated with ``token`` in the input.

        * ``xref`` / ``startxref``: the top stack entry is moved to the results.
        * ``endobj``: the top four stack entries are moved to the results.
        * ``null``: ``(pos, None)`` is pushed.
        * ``R``: when at least two entries are on the stack they are popped
          and, if the first one converts with ``safe_int``, a ``PDFObjRef``
          is pushed. Otherwise the keyword is dropped.
        * ``stream``: the preceding dictionary is popped, the stream body is
          read and a ``PDFStream`` is pushed.
        * anything else is pushed unchanged.

        Raises:
            PDFSyntaxError: only when ``settings.STRICT`` is set, if a stream
                has no /Length, has a negative /Length, or the file ends
                before the stream is complete.
        """
        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            self.add_results(*self.pop(1))

        elif token is self.KEYWORD_ENDOBJ:
            self.add_results(*self.pop(4))

        elif token is self.KEYWORD_NULL:
            # null object
            self.push((pos, None))

        elif token is self.KEYWORD_R:
            # reference to indirect object; silently ignored when the two
            # operands are not both present on the stack
            if len(self.curstack) >= 2:
                (_, _object_id), _ = self.pop(2)
                object_id = safe_int(_object_id)
                if object_id is not None:
                    obj = PDFObjRef(self.doc, object_id)
                    self.push((pos, obj))

        elif token is self.KEYWORD_STREAM:
            # stream object
            ((_, dic),) = self.pop(1)
            dic = dict_value(dic)
            objlen = 0
            if not self.fallback:
                try:
                    objlen = int_value(dic["Length"])
                except KeyError:
                    if settings.STRICT:
                        raise PDFSyntaxError(f"/Length is undefined: {dic!r}")
                if objlen < 0:
                    # A negative size would make fp.read() return everything
                    # up to EOF and the following seek go backwards. Treat it
                    # like an unknown length instead.
                    if settings.STRICT:
                        raise PDFSyntaxError(f"/Length is negative: {objlen!r}")
                    objlen = 0
            self.seek(pos)
            try:
                (_, line) = self.nextline()  # 'stream'
            except PSEOF:
                if settings.STRICT:
                    raise PDFSyntaxError("Unexpected EOF")
                return
            # The stream data starts right after the line holding "stream".
            pos += len(line)
            self.fp.seek(pos)
            data = bytearray(self.fp.read(objlen))
            self.seek(pos + objlen)
            # Scan forward, line by line, until "endstream" shows up. objlen
            # always grows by what was skipped; the skipped bytes are added
            # to data only in fallback mode.
            while True:
                try:
                    (linepos, line) = self.nextline()
                except PSEOF:
                    if settings.STRICT:
                        raise PDFSyntaxError("Unexpected EOF")
                    break
                i = line.find(b"endstream")
                if i != -1:
                    objlen += i
                    if self.fallback:
                        data += line[:i]
                    break
                objlen += len(line)
                if self.fallback:
                    data += line
            self.seek(pos + objlen)
            # XXX limit objlen not to exceed object boundary
            log.debug(
                "Stream: pos=%d, objlen=%d, dic=%r, data=%r...",
                pos,
                objlen,
                dic,
                data[:10],
            )
            assert self.doc is not None
            stream = PDFStream(dic, bytes(data), self.doc.decipher)
            self.push((pos, stream))

        else:
            # others
            self.push((pos, token))

130 

131 

class PDFStreamParser(PDFParser):
    """PDFStreamParser is used to parse PDF content streams
    that are contained in each page and have instructions
    for rendering the page. A reference to a PDF document is
    needed because a PDF content stream can also have
    indirect references to other objects in the same document.
    """

    KEYWORD_OBJ = KWD(b"obj")

    def __init__(self, data: bytes) -> None:
        """Create a parser over the in-memory stream contents ``data``."""
        PDFParser.__init__(self, BytesIO(data))

    def flush(self) -> None:
        """Move everything currently on the stack to the results."""
        self.add_results(*self.popall())

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle a keyword found at ``pos`` inside the stream.

        * ``R``: when at least two entries are on the stack they are popped
          and, if the first one converts with ``safe_int``, a ``PDFObjRef``
          is pushed. The keyword itself is never pushed.
        * ``obj`` / ``endobj``: dropped, or rejected in strict mode.
        * anything else is pushed unchanged.

        Raises:
            PDFSyntaxError: when ``settings.STRICT`` is set and an ``obj`` or
                ``endobj`` keyword is met.
        """
        if token is self.KEYWORD_R:
            # reference to indirect object
            # Same guard as PDFParser.do_keyword: a stray "R" with fewer than
            # two operands is ignored instead of failing on tuple unpacking.
            if len(self.curstack) >= 2:
                (_, _object_id), _ = self.pop(2)
                object_id = safe_int(_object_id)
                if object_id is not None:
                    obj = PDFObjRef(self.doc, object_id)
                    self.push((pos, obj))
            return

        if token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
            if settings.STRICT:
                # See PDF Spec 3.4.6: Only the object values are stored in the
                # stream; the obj and endobj keywords are not used.
                raise PDFSyntaxError("Keyword endobj found in stream")
            return

        # others
        self.push((pos, token))