/src/xpdf-4.06/xpdf/Parser.cc

Source
//========================================================================
//
// Parser.cc
//
// Copyright 1996-2003 Glyph & Cog, LLC
//
//========================================================================

#include <aconf.h>

#include <stddef.h>
#include <string.h>
#include "gmempp.h"
#include "Object.h"
#include "Array.h"
#include "Dict.h"
#include "Decrypt.h"
#include "Parser.h"
#include "XRef.h"
#include "Error.h"

// Set up a parser that pulls tokens from <lexerA>.  <xrefA> is stored
// and later handed to the arrays and dictionaries this parser builds;
// <allowStreamsA> controls whether a dictionary followed by the
// "stream" keyword is turned into a stream object.  The lexer is
// deleted by the destructor.
Parser::Parser(XRef *xrefA, Lexer *lexerA, GBool allowStreamsA)
  : xref(xrefA),
    lexer(lexerA),
    allowStreams(allowStreamsA),
    inlineImg(0) {
  // prime the two-token look-ahead window
  lexer->getObj(&buf1);
  lexer->getObj(&buf2);
}

// Release the two look-ahead tokens and destroy the lexer that was
// passed to the constructor.
Parser::~Parser() {
  // drop whatever is still held in the look-ahead buffers
  buf1.free();
  buf2.free();
  // the parser is responsible for deleting its lexer
  delete lexer;
}

// Parse the next object from the token stream into <obj> and return
// <obj>.
//
// If <simpleOnly> is set, or <recursion> has reached
// objectRecursionLimit, arrays and dictionaries are not descended
// into: their opening tokens are handled by the later branches like
// any other token.  When <fileKey> is non-NULL, string tokens are
// passed through a DecryptStream built from <fileKey>, <encAlgorithm>,
// <keyLength>, <objNum> and <objGen>; the same values are forwarded
// to makeStream() for stream objects and to nested getObj() calls.
Object *Parser::getObj(Object *obj, GBool simpleOnly,
           Guchar *fileKey,
           CryptAlgorithm encAlgorithm, int keyLength,
           int objNum, int objGen, int recursion) {
  // the look-ahead window is stale once inline image data has been
  // passed: throw it away and read two fresh tokens
  if (inlineImg == 2) {
    buf1.free();
    buf2.free();
    lexer->getObj(&buf1);
    lexer->getObj(&buf2);
    inlineImg = 0;
  }

  // arrays and dictionaries are only descended into when the caller
  // permits it and the nesting depth is still under the limit
  const GBool compositeOk = !simpleOnly && recursion < objectRecursionLimit;

  // ----- array -----
  if (compositeOk && buf1.isCmd("[")) {
    Object elem;
    shift();
    obj->initArray(xref);
    for (;;) {
      if (buf1.isCmd("]") || buf1.isEOF()) {
        break;
      }
      getObj(&elem, gFalse, fileKey, encAlgorithm, keyLength,
             objNum, objGen, recursion + 1);
      obj->arrayAdd(&elem);
    }
    if (buf1.isEOF()) {
      error(errSyntaxError, getPos(), "End of file inside array");
    }
    shift();
    return obj;
  }

  // ----- dictionary, possibly followed by stream data -----
  if (compositeOk && buf1.isCmd("<<")) {
    Object value;
    shift();
    obj->initDict(xref);
    while (!buf1.isCmd(">>") && !buf1.isEOF()) {
      if (buf1.isName()) {
        char *entryName = copyString(buf1.getName());
        shift();
        if (buf1.isEOF() || buf1.isError()) {
          // no value follows the key: discard the key and stop
          gfree(entryName);
          break;
        }
        getObj(&value, gFalse, fileKey, encAlgorithm, keyLength,
               objNum, objGen, recursion + 1);
        obj->dictAdd(entryName, &value);
      } else {
        // skip the offending token and keep going
        error(errSyntaxError, getPos(),
              "Dictionary key must be a name object");
        shift();
      }
    }
    if (buf1.isEOF()) {
      error(errSyntaxError, getPos(), "End of file inside dictionary");
    }

    // stream objects are not allowed inside content streams or
    // object streams
    if (!(allowStreams && buf2.isCmd("stream"))) {
      shift();
      return obj;
    }
    Stream *body = makeStream(obj, fileKey, encAlgorithm, keyLength,
                              objNum, objGen, recursion + 1);
    if (body) {
      obj->initStream(body);
    } else {
      obj->free();
      obj->initError();
    }
    return obj;
  }

  // ----- integer, or "num gen R" indirect reference -----
  if (buf1.isInt()) {
    const int first = buf1.getInt();
    shift();
    if (!(buf1.isInt() && buf2.isCmd("R"))) {
      obj->initInt(first);
      return obj;
    }
    const int gen = buf1.getInt();
    if (first < 0 || gen < 0) {
      error(errSyntaxError, getPos(),
            "Negative number or generation in indirect reference");
      obj->initError();
    } else {
      obj->initRef(first, gen);
    }
    // consume the generation number and the 'R'
    shift();
    shift();
    return obj;
  }

  // ----- string in an encrypted file -----
  if (fileKey && buf1.isString()) {
    Object noDict;
    GString *cipherText = buf1.getString();
    GString *plainText = new GString();
    noDict.initNull();
    DecryptStream *decoder =
        new DecryptStream(new MemStream(cipherText->getCString(), 0,
                                        cipherText->getLength(), &noDict),
                          fileKey, encAlgorithm, keyLength,
                          objNum, objGen);
    decoder->reset();
    for (int ch = decoder->getChar(); ch != EOF; ch = decoder->getChar()) {
      plainText->append((char)ch);
    }
    delete decoder;
    obj->initString(plainText);
    shift();
    return obj;
  }

  // ----- anything else: hand back a copy of the current token -----
  buf1.copy(obj);
  shift();
  return obj;
}

// Build the Stream for a stream object whose dictionary has already
// been parsed into <dict>.
//
// The lexer is advanced to the start of the next input line, and that
// position is taken as the start of the stream data.  The data length
// is taken from, in order: the stream end reported by
// xref->getStreamEnd(), the integer 'Length' entry of <dict>, or --
// if neither is available -- a scan of the input for the 'endstream'
// keyword.  When a length was available but 'endstream' does not
// follow the data, 5000 bytes are added to the length as a kludge for
// broken files.
//
// If <fileKey> is non-NULL the stream is wrapped in a DecryptStream
// (built from <fileKey>, <encAlgorithm>, <keyLength>, <objNum>,
// <objGen>) unless 'Filter' is the name 'Crypt' or an array whose
// first element is 'Crypt'.  Finally the filters described by <dict>
// are added.  <recursion> is passed through to the dictionary lookups
// and to addFilters().
//
// Returns the new stream, or NULL if the lexer has no input stream or
// no 'endstream' could be found for a stream without a usable length.
Stream *Parser::makeStream(Object *dict, Guchar *fileKey,
         CryptAlgorithm encAlgorithm, int keyLength,
         int objNum, int objGen, int recursion) {
  // get stream start position
  lexer->skipToNextLine();
  Stream *curStr = lexer->getStream();
  if (!curStr) {
    return NULL;
  }
  GFileOffset pos = curStr->getPos();

  GBool haveLength = gFalse;
  GFileOffset length = 0;
  GFileOffset endPos;

  // check for length in damaged file
  if (xref && xref->getStreamEnd(pos, &endPos)) {
    length = endPos - pos;
    haveLength = gTrue;

  // get length from the stream object
  } else {
    Object obj;
    dict->dictLookup("Length", &obj, recursion);
    if (obj.isInt()) {
      length = (GFileOffset)(Guint)obj.getInt();
      haveLength = gTrue;
    } else {
      error(errSyntaxError, getPos(),
            "Missing or invalid 'Length' attribute in stream");
    }
    obj.free();
  }

  // in badly damaged PDF files, we can run off the end of the input
  // stream immediately after the "stream" token
  if (!lexer->getStream()) {
    return NULL;
  }

  // copy the base stream (Lexer will free stream objects when it gets
  // to end of stream -- which can happen in the shift() calls below)
  BaseStream *baseStr =
      (BaseStream *)lexer->getStream()->getBaseStream()->copy();

  // 'Length' attribute is missing -- search for 'endstream'
  if (!haveLength) {
    GBool foundEndstream = gFalse;
    if ((curStr = lexer->getStream())) {
      // Match the keyword one byte at a time.  Reading a fixed-size
      // block after every 'e' would throw away the bytes of a failed
      // match, and with them the start of a real 'endstream' that
      // begins inside that block (e.g. data ending in "e\nendstream").
      static const char endstreamKw[] = "endstream";
      const int endstreamLen = (int)sizeof(endstreamKw) - 1;
      int matched = 0;      // number of keyword chars matched so far
      int c;
      while ((c = curStr->getChar()) != EOF) {
        if (c == endstreamKw[matched]) {
          if (++matched == endstreamLen) {
            // the stream data ends where the keyword starts
            length = curStr->getPos() - endstreamLen - pos;
            foundEndstream = gTrue;
            break;
          }
        } else if (matched == 7 && c == 'n') {
          // "endstre" followed by 'n': the trailing "en" is itself
          // the start of the keyword
          matched = 2;
        } else {
          // the only other way a failed match can overlap the keyword
          // is if the current char is its first letter
          matched = (c == 'e') ? 1 : 0;
        }
      }
    }
    if (!foundEndstream) {
      error(errSyntaxError, getPos(), "Couldn't find 'endstream' for stream");
      delete baseStr;
      return NULL;
    }
  }

  // make new base stream
  Stream *str = baseStr->makeSubStream(pos, gTrue, length, dict);

  // look for the 'endstream' marker
  if (haveLength) {
    // skip over stream data
    lexer->setPos(pos + length);

    // check for 'endstream'
    // NB: we never reuse the Parser object to parse objects after a
    // stream, and we could (if the PDF file is damaged) be in the
    // middle of binary data at this point, so we check the stream
    // data directly for 'endstream', rather than calling shift() to
    // parse objects
    GBool foundEndstream = gFalse;
    char endstreamBuf[8];
    if ((curStr = lexer->getStream())) {
      // skip up to 100 whitespace chars
      int c;
      for (int i = 0; i < 100; ++i) {
        c = curStr->getChar();
        if (!Lexer::isSpace(c)) {
          break;
        }
      }
      if (c == 'e') {
        if (curStr->getBlock(endstreamBuf, 8) == 8 &&
            !memcmp(endstreamBuf, "ndstream", 8)) {
          foundEndstream = gTrue;
        }
      }
    }
    if (!foundEndstream) {
      error(errSyntaxError, getPos(), "Missing 'endstream'");
      // kludge for broken PDF files: just add 5k to the length, and
      // hope it's enough
      // (dict is now owned by str, so we need to copy it before deleting str)
      Object obj;
      dict->copy(&obj);
      delete str;
      length += 5000;
      str = baseStr->makeSubStream(pos, gTrue, length, &obj);
    }
  }

  // free the copied base stream
  delete baseStr;

  // handle decryption
  if (fileKey) {
    // the 'Crypt' filter is used to mark unencrypted metadata streams
    //~ this should also check for an empty DecodeParams entry
    GBool encrypted = gTrue;
    Object obj;
    dict->dictLookup("Filter", &obj, recursion);
    if (obj.isName("Crypt")) {
      encrypted = gFalse;
    } else if (obj.isArray() && obj.arrayGetLength() >= 1) {
      Object obj2;
      if (obj.arrayGet(0, &obj2)->isName("Crypt")) {
        encrypted = gFalse;
      }
      obj2.free();
    }
    obj.free();
    if (encrypted) {
      str = new DecryptStream(str, fileKey, encAlgorithm, keyLength,
                              objNum, objGen);
    }
  }

  // get filters
  str = str->addFilters(dict, recursion);

  return str;
}

// Advance the two-token look-ahead window: buf2 moves into buf1 and a
// new token is read into buf2.
//
// inlineImg tracks progress past an 'ID' command: it becomes 1 on the
// shift that finds 'ID' in buf2, 2 on the following shift, and is
// reset to 0 by a further shift.  While it is non-zero, buf2 is set
// to null instead of being read from the lexer.
void Parser::shift() {
  if (inlineImg <= 0) {
    if (buf2.isCmd("ID")) {
      lexer->skipChar();    // skip char after 'ID' command
      inlineImg = 1;
    }
  } else if (inlineImg >= 2) {
    // in a damaged content stream, if 'ID' shows up in the middle
    // of a dictionary, we need to reset
    inlineImg = 0;
  } else {
    ++inlineImg;
  }

  // slide the window forward by one token
  buf1.free();
  buf1 = buf2;

  if (inlineImg <= 0) {
    lexer->getObj(&buf2);
  } else {
    // don't buffer inline image data
    buf2.initNull();
  }
}

Coverage Report

Created: 2026-03-31 07:04

Line	Count	Source
1		//========================================================================
2		//
3		// Parser.cc
4		//
5		// Copyright 1996-2003 Glyph & Cog, LLC
6		//
7		//========================================================================
8
9		#include <aconf.h>
10
11		#include <stddef.h>
12		#include <string.h>
13		#include "gmempp.h"
14		#include "Object.h"
15		#include "Array.h"
16		#include "Dict.h"
17		#include "Decrypt.h"
18		#include "Parser.h"
19		#include "XRef.h"
20		#include "Error.h"
21
22	562k	Parser::Parser(XRef xrefA, Lexer lexerA, GBool allowStreamsA) {
23	562k	xref = xrefA;
24	562k	lexer = lexerA;
25	562k	inlineImg = 0;
26	562k	allowStreams = allowStreamsA;
27	562k	lexer->getObj(&buf1);
28	562k	lexer->getObj(&buf2);
29	562k	}
30
31	562k	Parser::~Parser() {
32	562k	buf1.free();
33	562k	buf2.free();
34	562k	delete lexer;
35	562k	}
36
37		Object Parser::getObj(Object obj, GBool simpleOnly,
38		Guchar *fileKey,
39		CryptAlgorithm encAlgorithm, int keyLength,
40	248M	int objNum, int objGen, int recursion) {
41	248M	char *key;
42	248M	Stream *str;
43	248M	Object obj2;
44	248M	int num;
45	248M	DecryptStream *decrypt;
46	248M	GString s, s2;
47	248M	int c;
48
49		// refill buffer after inline image data
50	248M	if (inlineImg == 2) {
51	111k	buf1.free();
52	111k	buf2.free();
53	111k	lexer->getObj(&buf1);
54	111k	lexer->getObj(&buf2);
55	111k	inlineImg = 0;
56	111k	}
57
58		// array
59	248M	if (!simpleOnly && recursion < objectRecursionLimit && buf1.isCmd("[")) {
60	1.49M	shift();
61	1.49M	obj->initArray(xref);
62	34.0M	while (!buf1.isCmd("]") && !buf1.isEOF())
63	32.5M	obj->arrayAdd(getObj(&obj2, gFalse, fileKey, encAlgorithm, keyLength,
64	32.5M	objNum, objGen, recursion + 1));
65	1.49M	if (buf1.isEOF())
66	1.11M	error(errSyntaxError, getPos(), "End of file inside array");
67	1.49M	shift();
68
69		// dictionary or stream
70	246M	} else if (!simpleOnly && recursion < objectRecursionLimit &&
71	57.3M	buf1.isCmd("<<")) {
72	599k	shift();
73	599k	obj->initDict(xref);
74	6.57M	while (!buf1.isCmd(">>") && !buf1.isEOF()) {
75	5.98M	if (!buf1.isName()) {
76	3.39M	error(errSyntaxError, getPos(),
77	3.39M	"Dictionary key must be a name object");
78	3.39M	shift();
79	3.39M	} else {
80	2.58M	key = copyString(buf1.getName());
81	2.58M	shift();
82	2.58M	if (buf1.isEOF() \|\| buf1.isError()) {
83	11.9k	gfree(key);
84	11.9k	break;
85	11.9k	}
86	2.57M	obj->dictAdd(key, getObj(&obj2, gFalse,
87	2.57M	fileKey, encAlgorithm, keyLength,
88	2.57M	objNum, objGen, recursion + 1));
89	2.57M	}
90	5.98M	}
91	599k	if (buf1.isEOF())
92	41.2k	error(errSyntaxError, getPos(), "End of file inside dictionary");
93		// stream objects are not allowed inside content streams or
94		// object streams
95	599k	if (allowStreams && buf2.isCmd("stream")) {
96	161k	if ((str = makeStream(obj, fileKey, encAlgorithm, keyLength,
97	161k	objNum, objGen, recursion + 1))) {
98	153k	obj->initStream(str);
99	153k	} else {
100	8.86k	obj->free();
101	8.86k	obj->initError();
102	8.86k	}
103	437k	} else {
104	437k	shift();
105	437k	}
106
107		// indirect reference or integer
108	246M	} else if (buf1.isInt()) {
109	28.9M	num = buf1.getInt();
110	28.9M	shift();
111	28.9M	if (buf1.isInt() && buf2.isCmd("R")) {
112	868k	int gen = buf1.getInt();
113	868k	if (num >= 0 && gen >= 0) {
114	866k	obj->initRef(num, gen);
115	866k	} else {
116	1.82k	error(errSyntaxError, getPos(),
117	1.82k	"Negative number or generation in indirect reference");
118	1.82k	obj->initError();
119	1.82k	}
120	868k	shift();
121	868k	shift();
122	28.1M	} else {
123	28.1M	obj->initInt(num);
124	28.1M	}
125
126		// string
127	217M	} else if (buf1.isString() && fileKey) {
128	13.5k	s = buf1.getString();
129	13.5k	s2 = new GString();
130	13.5k	obj2.initNull();
131	13.5k	decrypt = new DecryptStream(new MemStream(s->getCString(), 0,
132	13.5k	s->getLength(), &obj2),
133	13.5k	fileKey, encAlgorithm, keyLength,
134	13.5k	objNum, objGen);
135	13.5k	decrypt->reset();
136	340k	while ((c = decrypt->getChar()) != EOF) {
137	326k	s2->append((char)c);
138	326k	}
139	13.5k	delete decrypt;
140	13.5k	obj->initString(s2);
141	13.5k	shift();
142
143		// simple object
144	217M	} else {
145	217M	buf1.copy(obj);
146	217M	shift();
147	217M	}
148
149	248M	return obj;
150	248M	}
151
152		Stream Parser::makeStream(Object dict, Guchar *fileKey,
153		CryptAlgorithm encAlgorithm, int keyLength,
154	161k	int objNum, int objGen, int recursion) {
155		// get stream start position
156	161k	lexer->skipToNextLine();
157	161k	Stream *curStr = lexer->getStream();
158	161k	if (!curStr) {
159	904	return NULL;
160	904	}
161	161k	GFileOffset pos = curStr->getPos();
162
163	161k	GBool haveLength = gFalse;
164	161k	GFileOffset length = 0;
165	161k	GFileOffset endPos;
166
167		// check for length in damaged file
168	161k	if (xref && xref->getStreamEnd(pos, &endPos)) {
169	122k	length = endPos - pos;
170	122k	haveLength = gTrue;
171
172		// get length from the stream object
173	122k	} else {
174	38.7k	Object obj;
175	38.7k	dict->dictLookup("Length", &obj, recursion);
176	38.7k	if (obj.isInt()) {
177	28.1k	length = (GFileOffset)(Guint)obj.getInt();
178	28.1k	haveLength = gTrue;
179	28.1k	} else {
180	10.6k	error(errSyntaxError, getPos(),
181	10.6k	"Missing or invalid 'Length' attribute in stream");
182	10.6k	}
183	38.7k	obj.free();
184	38.7k	}
185
186		// in badly damaged PDF files, we can run off the end of the input
187		// stream immediately after the "stream" token
188	161k	if (!lexer->getStream()) {
189	0	return NULL;
190	0	}
191
192		// copy the base stream (Lexer will free stream objects when it gets
193		// to end of stream -- which can happen in the shift() calls below)
194	161k	BaseStream *baseStr =
195	161k	(BaseStream *)lexer->getStream()->getBaseStream()->copy();
196
197		// 'Length' attribute is missing -- search for 'endstream'
198	161k	if (!haveLength) {
199	10.6k	GBool foundEndstream = gFalse;
200	10.6k	char endstreamBuf[8];
201	10.6k	if ((curStr = lexer->getStream())) {
202	10.6k	int c;
203	5.16M	while ((c = curStr->getChar()) != EOF) {
204	5.15M	if (c == 'e' &&
205	125k	curStr->getBlock(endstreamBuf, 8) == 8 &&
206	123k	!memcmp(endstreamBuf, "ndstream", 8)) {
207	2.66k	length = curStr->getPos() - 9 - pos;
208	2.66k	foundEndstream = gTrue;
209	2.66k	break;
210	2.66k	}
211	5.15M	}
212	10.6k	}
213	10.6k	if (!foundEndstream) {
214	7.96k	error(errSyntaxError, getPos(), "Couldn't find 'endstream' for stream");
215	7.96k	delete baseStr;
216	7.96k	return NULL;
217	7.96k	}
218	10.6k	}
219
220		// make new base stream
221	153k	Stream *str = baseStr->makeSubStream(pos, gTrue, length, dict);
222
223		// look for the 'endstream' marker
224	153k	if (haveLength) {
225		// skip over stream data
226	150k	lexer->setPos(pos + length);
227
228		// check for 'endstream'
229		// NB: we never reuse the Parser object to parse objects after a
230		// stream, and we could (if the PDF file is damaged) be in the
231		// middle of binary data at this point, so we check the stream
232		// data directly for 'endstream', rather than calling shift() to
233		// parse objects
234	150k	GBool foundEndstream = gFalse;
235	150k	char endstreamBuf[8];
236	150k	if ((curStr = lexer->getStream())) {
237		// skip up to 100 whitespace chars
238	150k	int c;
239	163k	for (int i = 0; i < 100; ++i) {
240	163k	c = curStr->getChar();
241	163k	if (!Lexer::isSpace(c)) {
242	150k	break;
243	150k	}
244	163k	}
245	150k	if (c == 'e') {
246	125k	if (curStr->getBlock(endstreamBuf, 8) == 8 &&
247	125k	!memcmp(endstreamBuf, "ndstream", 8)) {
248	124k	foundEndstream = gTrue;
249	124k	}
250	125k	}
251	150k	}
252	150k	if (!foundEndstream) {
253	25.9k	error(errSyntaxError, getPos(), "Missing 'endstream'");
254		// kludge for broken PDF files: just add 5k to the length, and
255		// hope it's enough
256		// (dict is now owned by str, so we need to copy it before deleting str)
257	25.9k	Object obj;
258	25.9k	dict->copy(&obj);
259	25.9k	delete str;
260	25.9k	length += 5000;
261	25.9k	str = baseStr->makeSubStream(pos, gTrue, length, &obj);
262	25.9k	}
263	150k	}
264
265		// free the copied base stream
266	153k	delete baseStr;
267
268		// handle decryption
269	153k	if (fileKey) {
270		// the 'Crypt' filter is used to mark unencrypted metadata streams
271		//~ this should also check for an empty DecodeParams entry
272	1.75k	GBool encrypted = gTrue;
273	1.75k	Object obj;
274	1.75k	dict->dictLookup("Filter", &obj, recursion);
275	1.75k	if (obj.isName("Crypt")) {
276	1	encrypted = gFalse;
277	1.75k	} else if (obj.isArray() && obj.arrayGetLength() >= 1) {
278	18	Object obj2;
279	18	if (obj.arrayGet(0, &obj2)->isName("Crypt")) {
280	0	encrypted = gFalse;
281	0	}
282	18	obj2.free();
283	18	}
284	1.75k	obj.free();
285	1.75k	if (encrypted) {
286	1.75k	str = new DecryptStream(str, fileKey, encAlgorithm, keyLength,
287	1.75k	objNum, objGen);
288	1.75k	}
289	1.75k	}
290
291		// get filters
292	153k	str = str->addFilters(dict, recursion);
293
294	153k	return str;
295	161k	}
296
297	257M	void Parser::shift() {
298	257M	if (inlineImg > 0) {
299	146k	if (inlineImg < 2) {
300	128k	++inlineImg;
301	128k	} else {
302		// in a damaged content stream, if 'ID' shows up in the middle
303		// of a dictionary, we need to reset
304	17.6k	inlineImg = 0;
305	17.6k	}
306	257M	} else if (buf2.isCmd("ID")) {
307	128k	lexer->skipChar(); // skip char after 'ID' command
308	128k	inlineImg = 1;
309	128k	}
310	257M	buf1.free();
311	257M	buf1 = buf2;
312	257M	if (inlineImg > 0) // don't buffer inline image data
313	257k	buf2.initNull();
314	257M	else
315	257M	lexer->getObj(&buf2);
316	257M	}