/src/xpdf-4.06/xpdf/Lexer.cc

Source
//========================================================================
//
// Lexer.cc
//
// Copyright 1996-2003 Glyph & Cog, LLC
//
//========================================================================

#include <aconf.h>

#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <ctype.h>
#include "gmempp.h"
#include "Lexer.h"
#include "Error.h"

//------------------------------------------------------------------------

// Character classification table used by the lexer, indexed by byte
// value (0x00-0xff); each row below covers sixteen consecutive values.
//
// A '1' in this array means the character is white space.  A '1' or
// '2' means the character ends a name or command.
//
// Entries set to 1: 0x00 (NUL), 0x09 (TAB), 0x0a (LF), 0x0c (FF),
//                   0x0d (CR), 0x20 (space).
// Entries set to 2: '%', '(', ')', '/', '<', '>', '[', ']', '{', '}'.
// Every other entry is 0 (an ordinary character).
static char specialChars[256] = {
  1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,   // 0x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 1x
  1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2,   // 2x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,   // 3x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 4x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 5x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 6x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 7x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 8x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 9x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ax
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // bx
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // cx
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // dx
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ex
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    // fx
};

//------------------------------------------------------------------------
// Lexer
//------------------------------------------------------------------------

// Construct a lexer that reads from the single stream <str>.  The
// lexer builds its own one-element stream array, which the destructor
// deletes (freeArray is set).
Lexer::Lexer(XRef *xref, Stream *str) {
  Object streamCopy;

  // bookkeeping: start at the first stream, and own the array
  freeArray = gTrue;
  strPtr = 0;
  streams = new Array(xref);

  // wrap the stream in curStr and put a copy of that object in the array
  curStr.initStream(str);
  streams->add(curStr.copy(&streamCopy));

  curStr.streamReset();
}

// Construct a lexer from <obj>.  If <obj> is a stream, a private
// one-element array holding a copy of it is created (and later deleted
// by the destructor).  Otherwise <obj> is treated as an array and its
// Array is used directly without taking ownership.  Reading starts at
// the first array element, if there is one.
Lexer::Lexer(XRef *xref, Object *obj) {
  Object streamCopy;

  strPtr = 0;
  if (!obj->isStream()) {
    // borrowed array: must not be deleted by this lexer
    freeArray = gFalse;
    streams = obj->getArray();
  } else {
    freeArray = gTrue;
    streams = new Array(xref);
    streams->add(obj->copy(&streamCopy));
  }

  // an empty array leaves curStr untouched
  if (streams->getLength() <= 0) {
    return;
  }
  streams->get(strPtr, &curStr);
  curStr.streamReset();
}

// Close and release the current stream object (if any), then delete
// the stream array when this lexer allocated it.
Lexer::~Lexer() {
  const GBool haveStream = !curStr.isNone();

  if (haveStream) {
    curStr.streamClose();
    curStr.free();
  }
  if (!freeArray) {
    // the array belongs to the caller
    return;
  }
  delete streams;
}

// Consume and return the next character of the input.  When the
// current stream is exhausted it is closed and freed, and reading
// continues with the next entry of the stream array.  Returns EOF
// once there is no current stream left.
int Lexer::getChar() {
  for (;;) {
    if (curStr.isNone()) {
      return EOF;
    }

    const int ch = curStr.streamGetChar();
    if (ch != EOF) {
      return ch;
    }

    // current stream is exhausted -- release it and move on to the
    // next one, if any (presumably free() leaves curStr in the "none"
    // state, which is what ends this loop after the last stream)
    curStr.streamClose();
    curStr.free();
    if (++strPtr < streams->getLength()) {
      streams->get(strPtr, &curStr);
      curStr.streamReset();
    }
  }
}

// Peek at the next character of the current stream via
// streamLookChar().  Returns EOF when there is no current stream;
// unlike getChar(), this never advances to the next stream in the
// array.
int Lexer::lookChar() {
  return curStr.isNone() ? EOF : curStr.streamLookChar();
}

// Read the next token from the input and store it in <obj>.
//
// Leading white space and '%' comments (which run up to the next CR
// or LF) are skipped first.  The token is then classified by its
// first character:
//   digit, '+', '-', '.'   -> integer or real object
//   '('                    -> literal string object
//   '/'                    -> name object (or an error object if a
//                             "#00" escape was seen)
//   '[', ']', "<<", ">>"   -> command object holding the punctuation
//   '<' (not "<<")         -> hex string object
//   ')', '{', '}', lone '>' -> error object
//   anything else          -> "true"/"false" give a boolean, "null"
//                             gives a null object, any other text
//                             gives a command object
// If the input ends before a token starts, an EOF object is stored.
//
// Returns <obj>.
Object *Lexer::getObj(Object *obj) {
  char *p;
  int c, c2;
  GBool comment, neg, doubleMinus, done, invalid;
  int numParen, nErrors;
  unsigned int xi;
  double xf, scale;
  GString *s;
  int n, m;

  // skip whitespace and comments
  comment = gFalse;
  while (1) {
    if ((c = getChar()) == EOF) {
      return obj->initEOF();
    }
    if (comment) {
      // a comment ends at the next end-of-line character
      if (c == '\r' || c == '\n') {
        comment = gFalse;
      }
    } else if (c == '%') {
      comment = gTrue;
    } else if (specialChars[c] != 1) {
      break;
    }
  }

  // start reading token
  switch (c) {

  // number
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
  case '+': case '-': case '.':
    // Adobe's number lexer has some "interesting" behavior:
    // "--123" is interpreted as 0
    // "--123.4" is interpreted as -123.4 [I've seen this in the wild]
    // "50-100" is interpreted as 50 [I've seen this in the wild]
    // "50--100" is interpreted as 50
    // "50-100.0" is an error -- but older versions of Acrobat may
    //   have interpreted it as 50100.0 (?)
    // "50--100.0" is an error -- but older versions of Acrobat may
    //   have interpreted it as 50100.0 (?)
    // "50.0-100" is interpreted as 50.0 (or maybe 50.0100?)
    // "50.0--100" is interpreted as 50.0 (or maybe 50.0100?)
    // "-50-100" is interpreted as -50
    // "-" is interpreted as 0
    // "-." is interpreted as 0.0
    //
    // The integer value is accumulated in an unsigned int: the number
    // of digits in the input is unbounded, and overflowing a signed
    // int would be undefined behavior.  Unsigned arithmetic wraps
    // modulo 2^N instead; the value is converted to int only once,
    // when the object is initialized.
    neg = gFalse;
    doubleMinus = gFalse;
    xi = 0;
    xf = 0;
    if (c == '+') {
      // just ignore it
    } else if (c == '-') {
      neg = gTrue;
      if (lookChar() == '-') {
        doubleMinus = gTrue;
        // swallow all but the last of the repeated minus signs
        do {
          getChar();
        } while (lookChar() == '-');
      }
    } else if (c == '.') {
      goto doReal;
    } else {
      xi = (unsigned int)(c - '0');
      xf = xi;
    }
    // integer part: xi and xf are accumulated in parallel, because a
    // '.' may still turn this into a real number
    while (1) {
      c = lookChar();
      if (isdigit(c)) {
        getChar();
        xi = xi * 10u + (unsigned int)(c - '0');
        // stop growing the real value once it is at least 1e20
        if (xf < 1e20) {
          xf = xf * 10 + (c - '0');
        }
      } else if (c == '.') {
        getChar();
        goto doReal;
      } else {
        break;
      }
    }
    // discard any trailing "-digits" garbage (e.g. the "-100" in "50-100")
    while ((c = lookChar()) == '-' || isdigit(c)) {
      getChar();
    }
    if (neg) {
      // negate in unsigned arithmetic so that no value can overflow
      xi = 0u - xi;
    }
    if (doubleMinus) {
      xi = 0;
    }
    // values above INT_MAX wrap on conversion (implementation-defined
    // before C++20, modular on all mainstream compilers)
    obj->initInt((int)xi);
    break;
  doReal:
    // fractional part: each digit contributes digit * 10^-k
    scale = 0.1;
    while (1) {
      c = lookChar();
      if (c == '-') {
        // a minus sign inside the fraction is reported and skipped
        error(errSyntaxWarning, getPos(), "Badly formatted number");
        getChar();
        continue;
      }
      if (!isdigit(c)) {
        break;
      }
      getChar();
      xf = xf + scale * (c - '0');
      scale *= 0.1;
    }
    // the loop above only exits on a character that is neither '-'
    // nor a digit, so there is no trailing garbage left to discard here
    if (neg) {
      xf = -xf;
    }
    obj->initReal(xf);
    break;

  // string
  case '(':
    // Characters are collected in tokBuf; each time tokBuf fills up,
    // its contents are flushed into the GString <s>.
    p = tokBuf;
    n = 0;
    numParen = 1;
    done = gFalse;
    s = NULL;
    do {
      // c2 is the character to store for this step; EOF means "none"
      c2 = EOF;
      switch (c = getChar()) {

      case EOF:
        error(errSyntaxError, getPos(), "Unterminated string");
        done = gTrue;
        break;

      case '(':
        // nested, balanced parentheses are part of the string
        ++numParen;
        c2 = c;
        break;

      case ')':
        if (--numParen == 0) {
          done = gTrue;
        } else {
          c2 = c;
        }
        break;

      case '\r':
        // The PDF spec says that any literal end-of-line sequence
        // (LF, CR, CR+LF) is translated to a single LF char.
        c = lookChar();
        if (c == '\n') {
          getChar();
        }
        c2 = '\n';
        break;

      case '\\':
        switch (c = getChar()) {
        case 'n':
          c2 = '\n';
          break;
        case 'r':
          c2 = '\r';
          break;
        case 't':
          c2 = '\t';
          break;
        case 'b':
          c2 = '\b';
          break;
        case 'f':
          c2 = '\f';
          break;
        case '\\':
        case '(':
        case ')':
          c2 = c;
          break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
          // octal escape: one to three octal digits
          c2 = c - '0';
          c = lookChar();
          if (c >= '0' && c <= '7') {
            getChar();
            c2 = (c2 << 3) + (c - '0');
            c = lookChar();
            if (c >= '0' && c <= '7') {
              getChar();
              c2 = (c2 << 3) + (c - '0');
            }
          }
          break;
        case '\r':
          // backslash + end-of-line: line continuation, stores nothing
          c = lookChar();
          if (c == '\n') {
            getChar();
          }
          break;
        case '\n':
          break;
        case EOF:
          error(errSyntaxError, getPos(), "Unterminated string");
          done = gTrue;
          break;
        default:
          // unknown escape: the backslash is dropped, the char is kept
          c2 = c;
          break;
        }
        break;

      default:
        c2 = c;
        break;
      }

      if (c2 != EOF) {
        if (n == tokBufSize) {
          // tokBuf is full -- flush it into the string
          if (!s) {
            s = new GString(tokBuf, tokBufSize);
          } else {
            s->append(tokBuf, tokBufSize);
          }
          p = tokBuf;
          n = 0;
        }
        *p++ = (char)c2;
        ++n;
      }
    } while (!done);
    // flush whatever is left in tokBuf
    if (!s) {
      s = new GString(tokBuf, n);
    } else {
      s->append(tokBuf, n);
    }
    obj->initString(s);
    break;

  // name
  case '/':
    // The first tokBufSize-1 characters go into tokBuf; once the name
    // reaches tokBufSize characters, it continues in the GString <s>.
    p = tokBuf;
    n = 0;
    s = NULL;
    invalid = gFalse;
    while ((c = lookChar()) != EOF && !specialChars[c]) {
      getChar();
      if (c == '#') {
        // "#xx" hex escape -- first digit
        c2 = lookChar();
        if (c2 >= '0' && c2 <= '9') {
          c = c2 - '0';
        } else if (c2 >= 'A' && c2 <= 'F') {
          c = c2 - 'A' + 10;
        } else if (c2 >= 'a' && c2 <= 'f') {
          c = c2 - 'a' + 10;
        } else {
          // c is still '#' here, so the '#' is stored literally
          error(errSyntaxError, getPos(), "Invalid hex escape in name");
          goto notEscChar;
        }
        getChar();
        // second digit
        c2 = lookChar();
        if (c2 >= '0' && c2 <= '9') {
          c = (c << 4) + (c2 - '0');
        } else if (c2 >= 'A' && c2 <= 'F') {
          c = (c << 4) + (c2 - 'A' + 10);
        } else if (c2 >= 'a' && c2 <= 'f') {
          c = (c << 4) + (c2 - 'a' + 10);
        } else {
          // c holds the value of the first hex digit (0..15) here, and
          // that value is what gets stored
          error(errSyntaxError, getPos(), "Invalid hex escape in name");
          goto notEscChar;
        }
        getChar();
        if (c == 0) {
          // "#00" -- the whole name is rejected below
          invalid = gTrue;
        }
      }
     notEscChar:
      // the PDF spec claims that names are limited to 127 chars, but
      // Distiller 8 will produce longer names, and Acrobat 8 will
      // accept longer names
      ++n;
      if (n < tokBufSize) {
        *p++ = (char)c;
      } else if (n == tokBufSize) {
        // tokBuf just became full: move the name into a GString
        *p = (char)c;
        s = new GString(tokBuf, n);
      } else {
        s->append((char)c);
      }
    }
    if (invalid) {
      error(errSyntaxError, getPos(), "Null character in name");
      obj->initError();
      if (s) {
        delete s;
      }
    } else if (n < tokBufSize) {
      *p = '\0';
      obj->initName(tokBuf);
    } else {
      // s is deleted right after the call, so initName presumably
      // makes its own copy of the characters
      obj->initName(s->getCString());
      delete s;
    }
    break;

  // array punctuation
  case '[':
  case ']':
    tokBuf[0] = (char)c;
    tokBuf[1] = '\0';
    obj->initCmd(tokBuf);
    break;

  // hex string or dict punctuation
  case '<':
    c = lookChar();

    // dict punctuation
    if (c == '<') {
      getChar();
      tokBuf[0] = tokBuf[1] = '<';
      tokBuf[2] = '\0';
      obj->initCmd(tokBuf);

    // hex string
    } else {
      // m counts the hex digits collected for the current byte (0 or
      // 1); c2 accumulates the byte value.  Bytes are collected in
      // tokBuf and flushed into <s> whenever tokBuf fills up.
      p = tokBuf;
      m = n = 0;
      c2 = 0;
      s = NULL;
      nErrors = 0;
      // give up after 100 illegal characters
      while (nErrors < 100) {
        c = getChar();
        if (c == '>') {
          break;
        } else if (c == EOF) {
          error(errSyntaxError, getPos(), "Unterminated hex string");
          break;
        } else if (specialChars[c] != 1) {
          // not white space: treat it as a hex digit -- an illegal
          // character is reported and then counted as a zero digit
          c2 = c2 << 4;
          if (c >= '0' && c <= '9') {
            c2 += c - '0';
          } else if (c >= 'A' && c <= 'F') {
            c2 += c - 'A' + 10;
          } else if (c >= 'a' && c <= 'f') {
            c2 += c - 'a' + 10;
          } else {
            error(errSyntaxError, getPos(),
                  "Illegal character <{0:02x}> in hex string", c);
            ++nErrors;
          }
          if (++m == 2) {
            if (n == tokBufSize) {
              if (!s) {
                s = new GString(tokBuf, tokBufSize);
              } else {
                s->append(tokBuf, tokBufSize);
              }
              p = tokBuf;
              n = 0;
            }
            *p++ = (char)c2;
            ++n;
            c2 = 0;
            m = 0;
          }
        }
      }
      if (!s) {
        s = new GString(tokBuf, n);
      } else {
        s->append(tokBuf, n);
      }
      if (m == 1) {
        // odd number of digits: the last digit is the high nibble
        s->append((char)(c2 << 4));
      }
      obj->initString(s);
    }
    break;

  // dict punctuation
  case '>':
    c = lookChar();
    if (c == '>') {
      getChar();
      tokBuf[0] = tokBuf[1] = '>';
      tokBuf[2] = '\0';
      obj->initCmd(tokBuf);
    } else {
      error(errSyntaxError, getPos(), "Illegal character '>'");
      obj->initError();
    }
    break;

  // error
  case ')':
  case '{':
  case '}':
    error(errSyntaxError, getPos(), "Illegal character '{0:c}'", c);
    obj->initError();
    break;

  // command
  default:
    p = tokBuf;
    *p++ = (char)c;
    n = 1;
    while ((c = lookChar()) != EOF && !specialChars[c]) {
      getChar();
      if (++n == tokBufSize) {
        // the character that would overflow tokBuf has been consumed
        // but is not stored; the token is cut off at this point
        error(errSyntaxError, getPos(), "Command token too long");
        break;
      }
      *p++ = (char)c;
    }
    *p = '\0';
    if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) {
      obj->initBool(gTrue);
    } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) {
      obj->initBool(gFalse);
    } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) {
      obj->initNull();
    } else {
      obj->initCmd(tokBuf);
    }
    break;
  }

  return obj;
}

// Discard input up to and including the next end-of-line sequence
// (LF, CR, or CR+LF), or up to EOF if no end-of-line is found.
void Lexer::skipToNextLine() {
  int ch;

  // consume characters until a line terminator or the end of input
  do {
    ch = getChar();
  } while (ch != EOF && ch != '\n' && ch != '\r');

  // treat CR+LF as a single end-of-line
  if (ch == '\r' && lookChar() == '\n') {
    getChar();
  }
}

// Consume and discard characters until getChar() reports EOF.
void Lexer::skipToEOF() {
  int ch;

  do {
    ch = getChar();
  } while (ch != EOF);
}

// Return true if <c> is a byte value (0..0xff) that specialChars
// marks as white space; values outside that range are never white
// space.
GBool Lexer::isSpace(int c) {
  if (c < 0 || c > 0xff) {
    return gFalse;
  }
  return specialChars[c] == 1;
}

Coverage Report

Created: 2026-06-22 07:14

Line	Count	Source
1		//========================================================================
2		//
3		// Lexer.cc
4		//
5		// Copyright 1996-2003 Glyph & Cog, LLC
6		//
7		//========================================================================
8
9		#include <aconf.h>
10
11		#include <stdlib.h>
12		#include <stddef.h>
13		#include <string.h>
14		#include <ctype.h>
15		#include "gmempp.h"
16		#include "Lexer.h"
17		#include "Error.h"
18
19		//------------------------------------------------------------------------
20
21		// A '1' in this array means the character is white space. A '1' or
22		// '2' means the character ends a name or command.
23		static char specialChars[256] = {
24		1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, // 0x
25		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
26		1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, // 2x
27		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, // 3x
28		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4x
29		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 5x
30		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6x
31		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 7x
32		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
33		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
34		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ax
35		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // bx
36		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // cx
37		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // dx
38		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ex
39		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // fx
40		};
41
42		//------------------------------------------------------------------------
43		// Lexer
44		//------------------------------------------------------------------------
45
46	658k	Lexer::Lexer(XRef xref, Stream str) {
47	658k	Object obj;
48
49	658k	curStr.initStream(str);
50	658k	streams = new Array(xref);
51	658k	streams->add(curStr.copy(&obj));
52	658k	strPtr = 0;
53	658k	freeArray = gTrue;
54	658k	curStr.streamReset();
55	658k	}
56
57	102k	Lexer::Lexer(XRef xref, Object obj) {
58	102k	Object obj2;
59
60	102k	if (obj->isStream()) {
61	101k	streams = new Array(xref);
62	101k	freeArray = gTrue;
63	101k	streams->add(obj->copy(&obj2));
64	101k	} else {
65	774	streams = obj->getArray();
66	774	freeArray = gFalse;
67	774	}
68	102k	strPtr = 0;
69	102k	if (streams->getLength() > 0) {
70	102k	streams->get(strPtr, &curStr);
71	102k	curStr.streamReset();
72	102k	}
73	102k	}
74
75	760k	Lexer::~Lexer() {
76	760k	if (!curStr.isNone()) {
77	318k	curStr.streamClose();
78	318k	curStr.free();
79	318k	}
80	760k	if (freeArray) {
81	759k	delete streams;
82	759k	}
83	760k	}
84
85	1.88G	int Lexer::getChar() {
86	1.88G	int c;
87
88	1.88G	c = EOF;
89	1.88G	while (!curStr.isNone() && (c = curStr.streamGetChar()) == EOF) {
90	442k	curStr.streamClose();
91	442k	curStr.free();
92	442k	++strPtr;
93	442k	if (strPtr < streams->getLength()) {
94	1.12k	streams->get(strPtr, &curStr);
95	1.12k	curStr.streamReset();
96	1.12k	}
97	442k	}
98	1.88G	return c;
99	1.88G	}
100
101	525M	int Lexer::lookChar() {
102	525M	if (curStr.isNone()) {
103	2.65k	return EOF;
104	2.65k	}
105	525M	return curStr.streamLookChar();
106	525M	}
107
108	261M	Object Lexer::getObj(Object obj) {
109	261M	char *p;
110	261M	int c, c2;
111	261M	GBool comment, neg, doubleMinus, done, invalid;
112	261M	int numParen, nErrors;
113	261M	int xi;
114	261M	double xf, scale;
115	261M	GString *s;
116	261M	int n, m;
117
118		// skip whitespace and comments
119	261M	comment = gFalse;
120	953M	while (1) {
121	953M	if ((c = getChar()) == EOF) {
122	143M	return obj->initEOF();
123	143M	}
124	809M	if (comment) {
125	54.1M	if (c == '\r' \|\| c == '\n')
126	671k	comment = gFalse;
127	755M	} else if (c == '%') {
128	675k	comment = gTrue;
129	754M	} else if (specialChars[c] != 1) {
130	118M	break;
131	118M	}
132	809M	}
133
134		// start reading token
135	118M	switch (c) {
136
137		// number
138	21.3M	case '0': case '1': case '2': case '3': case '4':
139	31.8M	case '5': case '6': case '7': case '8': case '9':
140	34.7M	case '+': case '-': case '.':
141		// Adobe's number lexer has some "interesting" behavior:
142		// "--123" is interpreted as 0
143		// "--123.4" is interpreted as -123.4 [I've seen this in the wild]
144		// "50-100" is interpreted as 50 [I've seen this in the wild]
145		// "50--100" is interpreted as 50
146		// "50-100.0" is an error -- but older versions of Acrobat may
147		// have interpreted it as 50100.0 (?)
148		// "50--100.0" is an error -- but older versions of Acrobat may
149		// have interpreted it as 50100.0 (?)
150		// "50.0-100" is interpreted as 50.0 (or maybe 50.0100?)
151		// "50.0--100" is interpreted as 50.0 (or maybe 50.0100?)
152		// "-50-100" is interpreted as -50
153		// "-" is interpreted as 0
154		// "-." is interpreted as 0.0
155	34.7M	neg = gFalse;
156	34.7M	doubleMinus = gFalse;
157	34.7M	xf = xi = 0;
158	34.7M	if (c == '+') {
159		// just ignore it
160	34.7M	} else if (c == '-') {
161	1.80M	neg = gTrue;
162	1.80M	if (lookChar() == '-') {
163	20.9k	doubleMinus = gTrue;
164	35.3k	do {
165	35.3k	getChar();
166	35.3k	} while (lookChar() == '-');
167	20.9k	}
168	32.9M	} else if (c == '.') {
169	1.04M	goto doReal;
170	31.8M	} else {
171	31.8M	xf = xi = c - '0';
172	31.8M	}
173	84.7M	while (1) {
174	84.7M	c = lookChar();
175	84.7M	if (isdigit(c)) {
176	51.0M	getChar();
177	51.0M	xi = xi * 10 + (c - '0');
178	51.0M	if (xf < 1e20) {
179	43.0M	xf = xf * 10 + (c - '0');
180	43.0M	}
181	51.0M	} else if (c == '.') {
182	4.84M	getChar();
183	4.84M	goto doReal;
184	28.8M	} else {
185	28.8M	break;
186	28.8M	}
187	84.7M	}
188	29.2M	while ((c = lookChar()) == '-' \|\| isdigit(c)) {
189	324k	getChar();
190	324k	}
191	28.8M	if (neg) {
192	958k	xi = -xi;
193	958k	}
194	28.8M	if (doubleMinus) {
195	17.4k	xi = 0;
196	17.4k	}
197	28.8M	obj->initInt(xi);
198	28.8M	break;
199	5.89M	doReal:
200	5.89M	scale = 0.1;
201	18.1M	while (1) {
202	18.1M	c = lookChar();
203	18.1M	if (c == '-') {
204	36.2k	error(errSyntaxWarning, getPos(), "Badly formatted number");
205	36.2k	getChar();
206	36.2k	continue;
207	36.2k	}
208	18.0M	if (!isdigit(c)) {
209	5.89M	break;
210	5.89M	}
211	12.1M	getChar();
212	12.1M	xf = xf + scale * (c - '0');
213	12.1M	scale *= 0.1;
214	12.1M	}
215	5.89M	while ((c = lookChar()) == '-' \|\| isdigit(c)) {
216	0	getChar();
217	0	}
218	5.89M	if (neg) {
219	850k	xf = -xf;
220	850k	}
221	5.89M	obj->initReal(xf);
222	5.89M	break;
223
224		// string
225	722k	case '(':
226	722k	p = tokBuf;
227	722k	n = 0;
228	722k	numParen = 1;
229	722k	done = gFalse;
230	722k	s = NULL;
231	437M	do {
232	437M	c2 = EOF;
233	437M	switch (c = getChar()) {
234
235	21.8k	case EOF:
236	21.8k	error(errSyntaxError, getPos(), "Unterminated string");
237	21.8k	done = gTrue;
238	21.8k	break;
239
240	1.12M	case '(':
241	1.12M	++numParen;
242	1.12M	c2 = c;
243	1.12M	break;
244
245	1.55M	case ')':
246	1.55M	if (--numParen == 0) {
247	700k	done = gTrue;
248	858k	} else {
249	858k	c2 = c;
250	858k	}
251	1.55M	break;
252
253	8.40M	case '\r':
254		// The PDF spec says that any literal end-of-line sequence
255		// (LF, CR, CR+LF) is translated to a single LF char.
256	8.40M	c = lookChar();
257	8.40M	if (c == '\n') {
258	167k	getChar();
259	167k	}
260	8.40M	c2 = '\n';
261	8.40M	break;
262
263	541k	case '\\':
264	541k	switch (c = getChar()) {
265	6.75k	case 'n':
266	6.75k	c2 = '\n';
267	6.75k	break;
268	67.6k	case 'r':
269	67.6k	c2 = '\r';
270	67.6k	break;
271	24.8k	case 't':
272	24.8k	c2 = '\t';
273	24.8k	break;
274	2.16k	case 'b':
275	2.16k	c2 = '\b';
276	2.16k	break;
277	813	case 'f':
278	813	c2 = '\f';
279	813	break;
280	36.9k	case '\\':
281	76.8k	case '(':
282	120k	case ')':
283	120k	c2 = c;
284	120k	break;
285	90.7k	case '0': case '1': case '2': case '3':
286	126k	case '4': case '5': case '6': case '7':
287	126k	c2 = c - '0';
288	126k	c = lookChar();
289	126k	if (c >= '0' && c <= '7') {
290	87.3k	getChar();
291	87.3k	c2 = (c2 << 3) + (c - '0');
292	87.3k	c = lookChar();
293	87.3k	if (c >= '0' && c <= '7') {
294	72.0k	getChar();
295	72.0k	c2 = (c2 << 3) + (c - '0');
296	72.0k	}
297	87.3k	}
298	126k	break;
299	3.85k	case '\r':
300	3.85k	c = lookChar();
301	3.85k	if (c == '\n') {
302	1.14k	getChar();
303	1.14k	}
304	3.85k	break;
305	8.22k	case '\n':
306	8.22k	break;
307	383	case EOF:
308	383	error(errSyntaxError, getPos(), "Unterminated string");
309	383	done = gTrue;
310	383	break;
311	179k	default:
312	179k	c2 = c;
313	179k	break;
314	541k	}
315	541k	break;
316
317	425M	default:
318	425M	c2 = c;
319	425M	break;
320	437M	}
321
322	437M	if (c2 != EOF) {
323	436M	if (n == tokBufSize) {
324	3.26M	if (!s)
325	106k	s = new GString(tokBuf, tokBufSize);
326	3.16M	else
327	3.16M	s->append(tokBuf, tokBufSize);
328	3.26M	p = tokBuf;
329	3.26M	n = 0;
330	3.26M	}
331	436M	*p++ = (char)c2;
332	436M	++n;
333	436M	}
334	437M	} while (!done);
335	722k	if (!s)
336	616k	s = new GString(tokBuf, n);
337	106k	else
338	106k	s->append(tokBuf, n);
339	722k	obj->initString(s);
340	722k	break;
341
342		// name
343	10.5M	case '/':
344	10.5M	p = tokBuf;
345	10.5M	n = 0;
346	10.5M	s = NULL;
347	10.5M	invalid = gFalse;
348	57.6M	while ((c = lookChar()) != EOF && !specialChars[c]) {
349	47.0M	getChar();
350	47.0M	if (c == '#') {
351	840k	c2 = lookChar();
352	840k	if (c2 >= '0' && c2 <= '9') {
353	2.41k	c = c2 - '0';
354	838k	} else if (c2 >= 'A' && c2 <= 'F') {
355	47.6k	c = c2 - 'A' + 10;
356	790k	} else if (c2 >= 'a' && c2 <= 'f') {
357	98.6k	c = c2 - 'a' + 10;
358	692k	} else {
359	692k	error(errSyntaxError, getPos(), "Invalid hex escape in name");
360	692k	goto notEscChar;
361	692k	}
362	148k	getChar();
363	148k	c2 = lookChar();
364	148k	if (c2 >= '0' && c2 <= '9') {
365	27.8k	c = (c << 4) + (c2 - '0');
366	120k	} else if (c2 >= 'A' && c2 <= 'F') {
367	47.2k	c = (c << 4) + (c2 - 'A' + 10);
368	73.7k	} else if (c2 >= 'a' && c2 <= 'f') {
369	1.05k	c = (c << 4) + (c2 - 'a' + 10);
370	72.6k	} else {
371	72.6k	error(errSyntaxError, getPos(), "Invalid hex escape in name");
372	72.6k	goto notEscChar;
373	72.6k	}
374	76.0k	getChar();
375	76.0k	if (c == 0) {
376	501	invalid = gTrue;
377	501	}
378	76.0k	}
379	47.0M	notEscChar:
380		// the PDF spec claims that names are limited to 127 chars, but
381		// Distiller 8 will produce longer names, and Acrobat 8 will
382		// accept longer names
383	47.0M	++n;
384	47.0M	if (n < tokBufSize) {
385	41.7M	*p++ = (char)c;
386	41.7M	} else if (n == tokBufSize) {
387	9.24k	*p = (char)c;
388	9.24k	s = new GString(tokBuf, n);
389	5.31M	} else {
390	5.31M	s->append((char)c);
391	5.31M	}
392	47.0M	}
393	10.5M	if (invalid) {
394	501	error(errSyntaxError, getPos(), "Null character in name");
395	501	obj->initError();
396	501	if (s) {
397	142	delete s;
398	142	}
399	10.5M	} else if (n < tokBufSize) {
400	10.5M	*p = '\0';
401	10.5M	obj->initName(tokBuf);
402	10.5M	} else {
403	9.09k	obj->initName(s->getCString());
404	9.09k	delete s;
405	9.09k	}
406	10.5M	break;
407
408		// array punctuation
409	10.3M	case '[':
410	11.1M	case ']':
411	11.1M	tokBuf[0] = (char)c;
412	11.1M	tokBuf[1] = '\0';
413	11.1M	obj->initCmd(tokBuf);
414	11.1M	break;
415
416		// hex string or dict punctuation
417	19.0M	case '<':
418	19.0M	c = lookChar();
419
420		// dict punctuation
421	19.0M	if (c == '<') {
422	3.82M	getChar();
423	3.82M	tokBuf[0] = tokBuf[1] = '<';
424	3.82M	tokBuf[2] = '\0';
425	3.82M	obj->initCmd(tokBuf);
426
427		// hex string
428	15.2M	} else {
429	15.2M	p = tokBuf;
430	15.2M	m = n = 0;
431	15.2M	c2 = 0;
432	15.2M	s = NULL;
433	15.2M	nErrors = 0;
434	102M	while (nErrors < 100) {
435	102M	c = getChar();
436	102M	if (c == '>') {
437	15.0M	break;
438	87.3M	} else if (c == EOF) {
439	6.81k	error(errSyntaxError, getPos(), "Unterminated hex string");
440	6.81k	break;
441	87.3M	} else if (specialChars[c] != 1) {
442	69.6M	c2 = c2 << 4;
443	69.6M	if (c >= '0' && c <= '9') {
444	9.99M	c2 += c - '0';
445	59.6M	} else if (c >= 'A' && c <= 'F') {
446	1.13M	c2 += c - 'A' + 10;
447	58.4M	} else if (c >= 'a' && c <= 'f') {
448	21.1M	c2 += c - 'a' + 10;
449	37.3M	} else {
450	37.3M	error(errSyntaxError, getPos(),
451	37.3M	"Illegal character <{0:02x}> in hex string", c);
452	37.3M	++nErrors;
453	37.3M	}
454	69.6M	if (++m == 2) {
455	27.5M	if (n == tokBufSize) {
456	11.9k	if (!s)
457	8.50k	s = new GString(tokBuf, tokBufSize);
458	3.47k	else
459	3.47k	s->append(tokBuf, tokBufSize);
460	11.9k	p = tokBuf;
461	11.9k	n = 0;
462	11.9k	}
463	27.5M	*p++ = (char)c2;
464	27.5M	++n;
465	27.5M	c2 = 0;
466	27.5M	m = 0;
467	27.5M	}
468	69.6M	}
469	102M	}
470	15.2M	if (!s)
471	15.2M	s = new GString(tokBuf, n);
472	8.50k	else
473	8.50k	s->append(tokBuf, n);
474	15.2M	if (m == 1)
475	14.6M	s->append((char)(c2 << 4));
476	15.2M	obj->initString(s);
477	15.2M	}
478	19.0M	break;
479
480		// dict punctuation
481	4.98M	case '>':
482	4.98M	c = lookChar();
483	4.98M	if (c == '>') {
484	2.46M	getChar();
485	2.46M	tokBuf[0] = tokBuf[1] = '>';
486	2.46M	tokBuf[2] = '\0';
487	2.46M	obj->initCmd(tokBuf);
488	2.52M	} else {
489	2.52M	error(errSyntaxError, getPos(), "Illegal character '>'");
490	2.52M	obj->initError();
491	2.52M	}
492	4.98M	break;
493
494		// error
495	1.55M	case ')':
496	2.20M	case '{':
497	2.34M	case '}':
498	2.34M	error(errSyntaxError, getPos(), "Illegal character '{0:c}'", c);
499	2.34M	obj->initError();
500	2.34M	break;
501
502		// command
503	34.4M	default:
504	34.4M	p = tokBuf;
505	34.4M	*p++ = (char)c;
506	34.4M	n = 1;
507	294M	while ((c = lookChar()) != EOF && !specialChars[c]) {
508	260M	getChar();
509	260M	if (++n == tokBufSize) {
510	1.07M	error(errSyntaxError, getPos(), "Command token too long");
511	1.07M	break;
512	1.07M	}
513	259M	*p++ = (char)c;
514	259M	}
515	34.4M	*p = '\0';
516	34.4M	if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) {
517	153k	obj->initBool(gTrue);
518	34.2M	} else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) {
519	9.82k	obj->initBool(gFalse);
520	34.2M	} else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) {
521	16.4k	obj->initNull();
522	34.2M	} else {
523	34.2M	obj->initCmd(tokBuf);
524	34.2M	}
525	34.4M	break;
526	118M	}
527
528	118M	return obj;
529	118M	}
530
531	198k	void Lexer::skipToNextLine() {
532	198k	int c;
533
534	611k	while (1) {
535	611k	c = getChar();
536	611k	if (c == EOF \|\| c == '\n') {
537	97.9k	return;
538	97.9k	}
539	513k	if (c == '\r') {
540	100k	if ((c = lookChar()) == '\n') {
541	89.2k	getChar();
542	89.2k	}
543	100k	return;
544	100k	}
545	513k	}
546	198k	}
547
548	61.3k	void Lexer::skipToEOF() {
549	3.33M	while (getChar() != EOF) ;
550	61.3k	}
551
552	196M	GBool Lexer::isSpace(int c) {
553	196M	return c >= 0 && c <= 0xff && specialChars[c] == 1;
554	196M	}