/src/xpdf-4.06/xpdf/Parser.cc
Line | Count | Source |
1 | | //======================================================================== |
2 | | // |
3 | | // Parser.cc |
4 | | // |
5 | | // Copyright 1996-2003 Glyph & Cog, LLC |
6 | | // |
7 | | //======================================================================== |
8 | | |
9 | | #include <aconf.h> |
10 | | |
11 | | #include <stddef.h> |
12 | | #include <string.h> |
13 | | #include "gmempp.h" |
14 | | #include "Object.h" |
15 | | #include "Array.h" |
16 | | #include "Dict.h" |
17 | | #include "Decrypt.h" |
18 | | #include "Parser.h" |
19 | | #include "XRef.h" |
20 | | #include "Error.h" |
21 | | |
22 | 562k | Parser::Parser(XRef *xrefA, Lexer *lexerA, GBool allowStreamsA) { |
23 | 562k | xref = xrefA; |
24 | 562k | lexer = lexerA; |
25 | 562k | inlineImg = 0; |
26 | 562k | allowStreams = allowStreamsA; |
27 | 562k | lexer->getObj(&buf1); |
28 | 562k | lexer->getObj(&buf2); |
29 | 562k | } |
30 | | |
31 | 562k | Parser::~Parser() { |
32 | 562k | buf1.free(); |
33 | 562k | buf2.free(); |
34 | 562k | delete lexer; |
35 | 562k | } |
36 | | |
37 | | Object *Parser::getObj(Object *obj, GBool simpleOnly, |
38 | | Guchar *fileKey, |
39 | | CryptAlgorithm encAlgorithm, int keyLength, |
40 | 248M | int objNum, int objGen, int recursion) { |
41 | 248M | char *key; |
42 | 248M | Stream *str; |
43 | 248M | Object obj2; |
44 | 248M | int num; |
45 | 248M | DecryptStream *decrypt; |
46 | 248M | GString *s, *s2; |
47 | 248M | int c; |
48 | | |
49 | | // refill buffer after inline image data |
50 | 248M | if (inlineImg == 2) { |
51 | 111k | buf1.free(); |
52 | 111k | buf2.free(); |
53 | 111k | lexer->getObj(&buf1); |
54 | 111k | lexer->getObj(&buf2); |
55 | 111k | inlineImg = 0; |
56 | 111k | } |
57 | | |
58 | | // array |
59 | 248M | if (!simpleOnly && recursion < objectRecursionLimit && buf1.isCmd("[")) { |
60 | 1.49M | shift(); |
61 | 1.49M | obj->initArray(xref); |
62 | 34.0M | while (!buf1.isCmd("]") && !buf1.isEOF()) |
63 | 32.5M | obj->arrayAdd(getObj(&obj2, gFalse, fileKey, encAlgorithm, keyLength, |
64 | 32.5M | objNum, objGen, recursion + 1)); |
65 | 1.49M | if (buf1.isEOF()) |
66 | 1.11M | error(errSyntaxError, getPos(), "End of file inside array"); |
67 | 1.49M | shift(); |
68 | | |
69 | | // dictionary or stream |
70 | 246M | } else if (!simpleOnly && recursion < objectRecursionLimit && |
71 | 57.3M | buf1.isCmd("<<")) { |
72 | 599k | shift(); |
73 | 599k | obj->initDict(xref); |
74 | 6.57M | while (!buf1.isCmd(">>") && !buf1.isEOF()) { |
75 | 5.98M | if (!buf1.isName()) { |
76 | 3.39M | error(errSyntaxError, getPos(), |
77 | 3.39M | "Dictionary key must be a name object"); |
78 | 3.39M | shift(); |
79 | 3.39M | } else { |
80 | 2.58M | key = copyString(buf1.getName()); |
81 | 2.58M | shift(); |
82 | 2.58M | if (buf1.isEOF() || buf1.isError()) { |
83 | 11.9k | gfree(key); |
84 | 11.9k | break; |
85 | 11.9k | } |
86 | 2.57M | obj->dictAdd(key, getObj(&obj2, gFalse, |
87 | 2.57M | fileKey, encAlgorithm, keyLength, |
88 | 2.57M | objNum, objGen, recursion + 1)); |
89 | 2.57M | } |
90 | 5.98M | } |
91 | 599k | if (buf1.isEOF()) |
92 | 41.2k | error(errSyntaxError, getPos(), "End of file inside dictionary"); |
93 | | // stream objects are not allowed inside content streams or |
94 | | // object streams |
95 | 599k | if (allowStreams && buf2.isCmd("stream")) { |
96 | 161k | if ((str = makeStream(obj, fileKey, encAlgorithm, keyLength, |
97 | 161k | objNum, objGen, recursion + 1))) { |
98 | 153k | obj->initStream(str); |
99 | 153k | } else { |
100 | 8.86k | obj->free(); |
101 | 8.86k | obj->initError(); |
102 | 8.86k | } |
103 | 437k | } else { |
104 | 437k | shift(); |
105 | 437k | } |
106 | | |
107 | | // indirect reference or integer |
108 | 246M | } else if (buf1.isInt()) { |
109 | 28.9M | num = buf1.getInt(); |
110 | 28.9M | shift(); |
111 | 28.9M | if (buf1.isInt() && buf2.isCmd("R")) { |
112 | 868k | int gen = buf1.getInt(); |
113 | 868k | if (num >= 0 && gen >= 0) { |
114 | 866k | obj->initRef(num, gen); |
115 | 866k | } else { |
116 | 1.82k | error(errSyntaxError, getPos(), |
117 | 1.82k | "Negative number or generation in indirect reference"); |
118 | 1.82k | obj->initError(); |
119 | 1.82k | } |
120 | 868k | shift(); |
121 | 868k | shift(); |
122 | 28.1M | } else { |
123 | 28.1M | obj->initInt(num); |
124 | 28.1M | } |
125 | | |
126 | | // string |
127 | 217M | } else if (buf1.isString() && fileKey) { |
128 | 13.5k | s = buf1.getString(); |
129 | 13.5k | s2 = new GString(); |
130 | 13.5k | obj2.initNull(); |
131 | 13.5k | decrypt = new DecryptStream(new MemStream(s->getCString(), 0, |
132 | 13.5k | s->getLength(), &obj2), |
133 | 13.5k | fileKey, encAlgorithm, keyLength, |
134 | 13.5k | objNum, objGen); |
135 | 13.5k | decrypt->reset(); |
136 | 340k | while ((c = decrypt->getChar()) != EOF) { |
137 | 326k | s2->append((char)c); |
138 | 326k | } |
139 | 13.5k | delete decrypt; |
140 | 13.5k | obj->initString(s2); |
141 | 13.5k | shift(); |
142 | | |
143 | | // simple object |
144 | 217M | } else { |
145 | 217M | buf1.copy(obj); |
146 | 217M | shift(); |
147 | 217M | } |
148 | | |
149 | 248M | return obj; |
150 | 248M | } |
151 | | |
152 | | Stream *Parser::makeStream(Object *dict, Guchar *fileKey, |
153 | | CryptAlgorithm encAlgorithm, int keyLength, |
154 | 161k | int objNum, int objGen, int recursion) { |
155 | | // get stream start position |
156 | 161k | lexer->skipToNextLine(); |
157 | 161k | Stream *curStr = lexer->getStream(); |
158 | 161k | if (!curStr) { |
159 | 904 | return NULL; |
160 | 904 | } |
161 | 161k | GFileOffset pos = curStr->getPos(); |
162 | | |
163 | 161k | GBool haveLength = gFalse; |
164 | 161k | GFileOffset length = 0; |
165 | 161k | GFileOffset endPos; |
166 | | |
167 | | // check for length in damaged file |
168 | 161k | if (xref && xref->getStreamEnd(pos, &endPos)) { |
169 | 122k | length = endPos - pos; |
170 | 122k | haveLength = gTrue; |
171 | | |
172 | | // get length from the stream object |
173 | 122k | } else { |
174 | 38.7k | Object obj; |
175 | 38.7k | dict->dictLookup("Length", &obj, recursion); |
176 | 38.7k | if (obj.isInt()) { |
177 | 28.1k | length = (GFileOffset)(Guint)obj.getInt(); |
178 | 28.1k | haveLength = gTrue; |
179 | 28.1k | } else { |
180 | 10.6k | error(errSyntaxError, getPos(), |
181 | 10.6k | "Missing or invalid 'Length' attribute in stream"); |
182 | 10.6k | } |
183 | 38.7k | obj.free(); |
184 | 38.7k | } |
185 | | |
186 | | // in badly damaged PDF files, we can run off the end of the input |
187 | | // stream immediately after the "stream" token |
188 | 161k | if (!lexer->getStream()) { |
189 | 0 | return NULL; |
190 | 0 | } |
191 | | |
192 | | // copy the base stream (Lexer will free stream objects when it gets |
193 | | // to end of stream -- which can happen in the shift() calls below) |
194 | 161k | BaseStream *baseStr = |
195 | 161k | (BaseStream *)lexer->getStream()->getBaseStream()->copy(); |
196 | | |
197 | | // 'Length' attribute is missing -- search for 'endstream' |
198 | 161k | if (!haveLength) { |
199 | 10.6k | GBool foundEndstream = gFalse; |
200 | 10.6k | char endstreamBuf[8]; |
201 | 10.6k | if ((curStr = lexer->getStream())) { |
202 | 10.6k | int c; |
203 | 5.16M | while ((c = curStr->getChar()) != EOF) { |
204 | 5.15M | if (c == 'e' && |
205 | 125k | curStr->getBlock(endstreamBuf, 8) == 8 && |
206 | 123k | !memcmp(endstreamBuf, "ndstream", 8)) { |
207 | 2.66k | length = curStr->getPos() - 9 - pos; |
208 | 2.66k | foundEndstream = gTrue; |
209 | 2.66k | break; |
210 | 2.66k | } |
211 | 5.15M | } |
212 | 10.6k | } |
213 | 10.6k | if (!foundEndstream) { |
214 | 7.96k | error(errSyntaxError, getPos(), "Couldn't find 'endstream' for stream"); |
215 | 7.96k | delete baseStr; |
216 | 7.96k | return NULL; |
217 | 7.96k | } |
218 | 10.6k | } |
219 | | |
220 | | // make new base stream |
221 | 153k | Stream *str = baseStr->makeSubStream(pos, gTrue, length, dict); |
222 | | |
223 | | // look for the 'endstream' marker |
224 | 153k | if (haveLength) { |
225 | | // skip over stream data |
226 | 150k | lexer->setPos(pos + length); |
227 | | |
228 | | // check for 'endstream' |
229 | | // NB: we never reuse the Parser object to parse objects after a |
230 | | // stream, and we could (if the PDF file is damaged) be in the |
231 | | // middle of binary data at this point, so we check the stream |
232 | | // data directly for 'endstream', rather than calling shift() to |
233 | | // parse objects |
234 | 150k | GBool foundEndstream = gFalse; |
235 | 150k | char endstreamBuf[8]; |
236 | 150k | if ((curStr = lexer->getStream())) { |
237 | | // skip up to 100 whitespace chars |
238 | 150k | int c; |
239 | 163k | for (int i = 0; i < 100; ++i) { |
240 | 163k | c = curStr->getChar(); |
241 | 163k | if (!Lexer::isSpace(c)) { |
242 | 150k | break; |
243 | 150k | } |
244 | 163k | } |
245 | 150k | if (c == 'e') { |
246 | 125k | if (curStr->getBlock(endstreamBuf, 8) == 8 && |
247 | 125k | !memcmp(endstreamBuf, "ndstream", 8)) { |
248 | 124k | foundEndstream = gTrue; |
249 | 124k | } |
250 | 125k | } |
251 | 150k | } |
252 | 150k | if (!foundEndstream) { |
253 | 25.9k | error(errSyntaxError, getPos(), "Missing 'endstream'"); |
254 | | // kludge for broken PDF files: just add 5k to the length, and |
255 | | // hope it's enough |
256 | | // (dict is now owned by str, so we need to copy it before deleting str) |
257 | 25.9k | Object obj; |
258 | 25.9k | dict->copy(&obj); |
259 | 25.9k | delete str; |
260 | 25.9k | length += 5000; |
261 | 25.9k | str = baseStr->makeSubStream(pos, gTrue, length, &obj); |
262 | 25.9k | } |
263 | 150k | } |
264 | | |
265 | | // free the copied base stream |
266 | 153k | delete baseStr; |
267 | | |
268 | | // handle decryption |
269 | 153k | if (fileKey) { |
270 | | // the 'Crypt' filter is used to mark unencrypted metadata streams |
271 | | //~ this should also check for an empty DecodeParams entry |
272 | 1.75k | GBool encrypted = gTrue; |
273 | 1.75k | Object obj; |
274 | 1.75k | dict->dictLookup("Filter", &obj, recursion); |
275 | 1.75k | if (obj.isName("Crypt")) { |
276 | 1 | encrypted = gFalse; |
277 | 1.75k | } else if (obj.isArray() && obj.arrayGetLength() >= 1) { |
278 | 18 | Object obj2; |
279 | 18 | if (obj.arrayGet(0, &obj2)->isName("Crypt")) { |
280 | 0 | encrypted = gFalse; |
281 | 0 | } |
282 | 18 | obj2.free(); |
283 | 18 | } |
284 | 1.75k | obj.free(); |
285 | 1.75k | if (encrypted) { |
286 | 1.75k | str = new DecryptStream(str, fileKey, encAlgorithm, keyLength, |
287 | 1.75k | objNum, objGen); |
288 | 1.75k | } |
289 | 1.75k | } |
290 | | |
291 | | // get filters |
292 | 153k | str = str->addFilters(dict, recursion); |
293 | | |
294 | 153k | return str; |
295 | 161k | } |
296 | | |
297 | 257M | void Parser::shift() { |
298 | 257M | if (inlineImg > 0) { |
299 | 146k | if (inlineImg < 2) { |
300 | 128k | ++inlineImg; |
301 | 128k | } else { |
302 | | // in a damaged content stream, if 'ID' shows up in the middle |
303 | | // of a dictionary, we need to reset |
304 | 17.6k | inlineImg = 0; |
305 | 17.6k | } |
306 | 257M | } else if (buf2.isCmd("ID")) { |
307 | 128k | lexer->skipChar(); // skip char after 'ID' command |
308 | 128k | inlineImg = 1; |
309 | 128k | } |
310 | 257M | buf1.free(); |
311 | 257M | buf1 = buf2; |
312 | 257M | if (inlineImg > 0) // don't buffer inline image data |
313 | 257k | buf2.initNull(); |
314 | 257M | else |
315 | 257M | lexer->getObj(&buf2); |
316 | 257M | } |