/src/mupdf/source/pdf/pdf-lex.c

Source (jump to first uncovered line)
// Copyright (C) 2004-2021 Artifex Software, Inc.
//
// This file is part of MuPDF.
//
// MuPDF is free software: you can redistribute it and/or modify it under the
// terms of the GNU Affero General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
// details.
//
// You should have received a copy of the GNU Affero General Public License
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
//
// Alternative licensing terms are available from the licensor.
// For commercial licensing, see <https://www.artifex.com/> or contact
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
// CA 94129, USA, for further information.

#include "mupdf/fitz.h"
#include "mupdf/pdf.h"

#include <string.h>

/*
 * Case-label helper macros.
 *
 * Each macro expands to a list of character constants joined by ":case",
 * with no leading "case" keyword and no trailing colon. They are therefore
 * written as "case IS_WHITE:" inside a switch statement, which expands to
 * one case label per listed character.
 */

/* Bytes that may start a number: a sign, a decimal point or a digit. */
#define IS_NUMBER \
  '+':case'-':case'.':case'0':case'1':case'2':case'3':\
  case'4':case'5':case'6':case'7':case'8':case'9'
/* Whitespace bytes: NUL, TAB, LF, FF, CR and SPACE. */
#define IS_WHITE \
  '\x00':case'\x09':case'\x0a':case'\x0c':case'\x0d':case'\x20'
/* Hexadecimal digits in either letter case. */
#define IS_HEX \
  '0':case'1':case'2':case'3':case'4':case'5':case'6':\
  case'7':case'8':case'9':case'A':case'B':case'C':\
  case'D':case'E':case'F':case'a':case'b':case'c':\
  case'd':case'e':case'f'
/* Delimiter bytes that terminate names, keywords and numbers. */
#define IS_DELIM \
  '(':case')':case'<':case'>':case'[':case']':case'{':\
  case'}':case'/':case'%'

/* Decimal digits. */
#define RANGE_0_9 \
  '0':case'1':case'2':case'3':case'4':case'5':\
  case'6':case'7':case'8':case'9'
/* Lower-case hexadecimal letters. */
#define RANGE_a_f \
  'a':case'b':case'c':case'd':case'e':case'f'
/* Upper-case hexadecimal letters. */
#define RANGE_A_F \
  'A':case'B':case'C':case'D':case'E':case'F'
/* Octal digits. */
#define RANGE_0_7 \
  '0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'

/*
 * lex_byte: read one byte from the stream for the lexer.
 *
 * Uncomment the define below to build the tracing variant, which echoes
 * every byte the lexer reads to fz_stdout before returning it. Without
 * the define, lex_byte is a plain alias for fz_read_byte.
 */
/* #define DUMP_LEXER_STREAM */
#ifdef DUMP_LEXER_STREAM
static inline int lex_byte(fz_context *ctx, fz_stream *stm)
{
  int c = fz_read_byte(ctx, stm);

  /* Printable ASCII is echoed verbatim; EOF and everything else is
   * written in an angle-bracketed form so the dump stays readable. */
  if (c == EOF)
    fz_write_printf(ctx, fz_stdout(ctx), "<EOF>");
  else if (c >= 32 && c < 128)
    fz_write_printf(ctx, fz_stdout(ctx), "%c", c);
  else
    fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", c);
  return c;
}
#else
#define lex_byte(C,S) fz_read_byte(C,S)
#endif

/*
 * Return 1 when ch is one of the six whitespace bytes recognised by the
 * lexer (NUL, TAB, LF, FF, CR, SPACE), and 0 for anything else.
 */
static inline int iswhite(int ch)
{
  switch (ch)
  {
  case 0x00: /* NUL */
  case 0x09: /* TAB */
  case 0x0a: /* LF */
  case 0x0c: /* FF */
  case 0x0d: /* CR */
  case 0x20: /* SPACE */
    return 1;
  default:
    return 0;
  }
}

/*
 * Return 1 when ch lies in the printable ASCII range (space through
 * tilde, inclusive), and 0 otherwise.
 */
static inline int fz_isprint(int ch)
{
  if (ch < 0x20)
    return 0;
  if (ch > 0x7e)
    return 0;
  return 1;
}

/*
 * Convert one hexadecimal digit (either letter case) to its value 0..15.
 * Any byte that is not a hexadecimal digit yields 0.
 */
static inline int unhex(int ch)
{
  int value = 0;

  if ('0' <= ch && ch <= '9')
    value = ch - '0';
  else if ('A' <= ch && ch <= 'F')
    value = 10 + (ch - 'A');
  else if ('a' <= ch && ch <= 'f')
    value = 10 + (ch - 'a');

  return value;
}

/*
 * Consume a run of whitespace bytes from the stream. The first
 * non-whitespace byte is pushed back so the caller sees it next; nothing
 * is pushed back when the run ends at EOF.
 */
static void
lex_white(fz_context *ctx, fz_stream *f)
{
  int ch = lex_byte(ctx, f);

  /* The range test is a cheap pre-filter before the full comparison. */
  while (ch <= 32 && iswhite(ch))
    ch = lex_byte(ctx, f);

  if (ch == EOF)
    return;
  fz_unread_byte(ctx, f);
}

/*
 * Skip the remainder of a comment: read and discard bytes up to and
 * including the first LF or CR, or until EOF is reached.
 */
static void
lex_comment(fz_context *ctx, fz_stream *f)
{
  for (;;)
  {
    int ch = lex_byte(ctx, f);

    if (ch == '\012' || ch == '\015' || ch == EOF)
      return;
  }
}

/*
 * Fast(ish) but inaccurate strtof, with Adobe overflow handling.
 *
 * Accepts any number of leading '-' signs (any one of them makes the
 * result negative), followed by any number of '+' signs, then decimal
 * digits with an optional fractional part after a '.'. Parsing stops at
 * the first byte that does not fit; there is no error reporting.
 *
 * The integer part is accumulated modulo 2^N (N = width of unsigned int)
 * and then reinterpreted as a signed int, so oversized integer parts wrap
 * around instead of saturating.
 */
static float acrobat_compatible_atof(char *s)
{
  int neg = 0;
  unsigned int u = 0;
  int i;

  while (*s == '-')
  {
    neg = 1;
    ++s;
  }
  while (*s == '+')
  {
    ++s;
  }

  while (*s >= '0' && *s <= '9')
  {
    /* We deliberately ignore overflow here; the upstream note says
     * tests showed Acrobat handling overflows the same way.
     * The accumulation is done in unsigned arithmetic because
     * wrapping a signed int is undefined behaviour in C, whereas
     * unsigned wraparound is well defined.
     */
    u = u * 10u + (unsigned int)(*s - '0');
    ++s;
  }

  /* Out-of-range values convert in an implementation-defined way; on
   * two's complement targets this is the usual wraparound. */
  i = (int)u;

  if (*s == '.')
  {
    float v = i;
    float n = 0;
    float d = 1;
    ++s;
    while (*s >= '0' && *s <= '9')
    {
      n = 10 * n + (*s - '0');
      d = 10 * d;
      ++s;
    }
    v += n / d;
    return neg ? -v : v;
  }
  else
  {
    /* Negate before the signed conversion so that INT_MIN does not
     * trigger a signed overflow. */
    return neg ? (float)(int)(0u - u) : (float)i;
  }
}

/*
 * Fast but inaccurate atoi.
 *
 * Accepts any number of leading '-' signs (any one of them makes the
 * result negative), followed by any number of '+' signs, then decimal
 * digits. Parsing stops at the first non-digit. Values that do not fit
 * in an int wrap around rather than saturating.
 */
static int fast_atoi(char *s)
{
  int neg = 0;
  unsigned int u = 0;

  while (*s == '-')
  {
    neg = 1;
    ++s;
  }
  while (*s == '+')
  {
    ++s;
  }

  while (*s >= '0' && *s <= '9')
  {
    /* We deliberately ignore overflow here. The sum is kept in an
     * unsigned variable because signed overflow is undefined
     * behaviour, while unsigned arithmetic wraps by definition. */
    u = u * 10u + (unsigned int)(*s - '0');
    ++s;
  }

  /* Negate while still unsigned so that a magnitude of INT_MAX + 1
   * yields INT_MIN instead of overflowing. The final conversion of an
   * out-of-range value is implementation-defined (wraparound on two's
   * complement targets). */
  return neg ? (int)(0u - u) : (int)u;
}

/*
 * Lex a numeric token.
 *
 * c is the first byte of the token, already read by the caller (a sign,
 * a '.' or a digit). Further bytes are read from f and collected in
 * buf->scratch until whitespace, a delimiter, EOF, or a full buffer ends
 * the token. A terminating whitespace/delimiter byte is pushed back.
 *
 * Returns:
 *   PDF_TOK_KEYWORD if the token contained a second '.' or any byte that
 *                   is not part of a number; buf->scratch holds the text.
 *   PDF_TOK_REAL    if a '.' was seen; the value is stored in buf->f.
 *   PDF_TOK_INT     otherwise; the value is stored in buf->i.
 */
static int
lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
{
  char *s = buf->scratch;
  char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */
  char *isreal = (c == '.' ? s : NULL); /* position of the most recent '.' */
  int neg = (c == '-');
  int isbad = 0;

  *s++ = c;

  c = lex_byte(ctx, f);

  /* skip extra '-' signs at start of number */
  if (neg)
  {
    while (c == '-')
      c = lex_byte(ctx, f);
  }

  while (s < e)
  {
    switch (c)
    {
    case IS_WHITE:
    case IS_DELIM:
      fz_unread_byte(ctx, f);
      goto end;
    case EOF:
      goto end;
    case '.':
      if (isreal)
        isbad = 1;
      isreal = s;
      *s++ = c;
      break;
    case '-':
      /* Bug 703248: Some PDFs (particularly those
       * generated by google docs) apparently have
       * numbers like 0.000000000000-5684342 in them.
       * We'll stop our interpretation at the -, but
       * keep reading to skip over the trailing
       * digits so they aren't parsed later. */
      *s++ = '\0';
      break;
    case RANGE_0_9:
      *s++ = c;
      break;
    default:
      isbad = 1;
      *s++ = c;
      break;
    }
    c = lex_byte(ctx, f);
  }

  /* The scratch buffer filled up before the token ended. The byte in c
   * has been read but not stored; push it back so that it is lexed as
   * part of the next token instead of being silently dropped. */
  if (c != EOF)
    fz_unread_byte(ctx, f);

end:
  *s = '\0';
  if (isbad)
    return PDF_TOK_KEYWORD;
  if (isreal)
  {
    /* We'd like to use the fastest possible atof
     * routine, but we'd rather match acrobats
     * handling of broken numbers. As such, we
     * spot common broken cases and call an
     * acrobat compatible routine where required. */
    /* NOTE(review): neg is only ever 0 or 1 in this function, so the
     * "neg > 1" test cannot be true as written; in practice only a
     * decimal point at offset 10 or later selects the Acrobat
     * compatible path. Left unchanged pending confirmation of the
     * intended behaviour. */
    if (neg > 1 || isreal - buf->scratch >= 10)
      buf->f = acrobat_compatible_atof(buf->scratch);
    else
      buf->f = fz_atof(buf->scratch);
    return PDF_TOK_REAL;
  }
  else
  {
    buf->i = fast_atoi(buf->scratch);
    return PDF_TOK_INT;
  }
}

/*
 * Lex the characters of a name or keyword into lb->scratch.
 *
 * Bytes are read from f until whitespace, a delimiter or EOF; a
 * terminating whitespace/delimiter byte is pushed back. "#xx" sequences
 * with two hexadecimal digits are decoded to a single byte, except for
 * "#00", which is kept literally. At most 127 bytes are stored: longer
 * names are truncated with a warning and the excess input is read and
 * discarded. On return lb->scratch is zero terminated and lb->len holds
 * the number of bytes stored.
 */
static void
lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
  /* s is the write position; it becomes NULL once the name has been
   * truncated, after which input is consumed but no longer stored. */
  char *s = lb->scratch;
  char *e = s + fz_minz(127, lb->size);
  int c;

  while (1)
  {
    if (s == e)
    {
      if (e - lb->scratch < 127)
      {
        /* The buffer itself is smaller than the name limit: enlarge it
         * and re-base the pointers on the (possibly moved) storage. */
        s += pdf_lexbuf_grow(ctx, lb);
        e = lb->scratch + fz_minz(127, lb->size);
      }
      else
      {
        /* truncate names that are too long */
        fz_warn(ctx, "name is too long");
        *s = 0;
        lb->len = s - lb->scratch;
        s = NULL;
      }
    }
    c = lex_byte(ctx, f);
    switch (c)
    {
    case IS_WHITE:
    case IS_DELIM:
      fz_unread_byte(ctx, f);
      goto end;
    case EOF:
      goto end;
    case '#':
    {
      /* Try to decode a two digit hex escape. Each digit is peeked
       * first and only consumed once it is known to be valid. */
      int hex[2];
      int i;
      for (i = 0; i < 2; i++)
      {
        c = fz_peek_byte(ctx, f);
        switch (c)
        {
        case RANGE_0_9:
          /* "#00" would put a zero byte in the name; treat it as
           * an invalid escape instead. */
          if (i == 1 && c == '0' && hex[0] == 0)
            goto illegal;
          hex[i] = lex_byte(ctx, f) - '0';
          break;
        case RANGE_a_f:
          hex[i] = lex_byte(ctx, f) - 'a' + 10;
          break;
        case RANGE_A_F:
          hex[i] = lex_byte(ctx, f) - 'A' + 10;
          break;
        default:
        case EOF:
          goto illegal;
        }
      }
      if (s) *s++ = (hex[0] << 4) + hex[1];
      break;
illegal:
      /* Not a valid escape: keep the '#' literally. If the first digit
       * was already consumed, push it back so that it is lexed as an
       * ordinary character on the next iteration. */
      if (i == 1)
        fz_unread_byte(ctx, f);
      if (s) *s++ = '#';
      continue;
    }
    default:
      if (s) *s++ = c;
      break;
    }
  }
end:
  /* When the name was truncated (s == NULL) the terminator and length
   * were already written at the point of truncation. */
  if (s)
  {
    *s = '\0';
    lb->len = s - lb->scratch;
  }
}

/*
 * Lex a literal string; the opening '(' has already been consumed.
 *
 * Bytes are copied into lb->scratch (grown on demand) until the matching
 * ')' is found. Nested unescaped parentheses are kept and tracked for
 * balance. Backslash escapes are decoded: \n \r \t \b \f, one to three
 * octal digits, and a backslash followed by a line ending (LF, CR or
 * CR LF), which stores nothing. Any other escaped byte is stored as
 * itself.
 *
 * Returns PDF_TOK_STRING with lb->len set to the decoded length, or
 * PDF_TOK_ERROR if EOF is reached first. The result is not zero
 * terminated.
 */
static int
lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
  char *out = lb->scratch;
  char *limit = out + lb->size;
  int depth = 1;
  int ch;

  for (;;)
  {
    /* At most one byte is stored per iteration, so one free slot is
     * all that needs to be guaranteed here. */
    if (out == limit)
    {
      out += pdf_lexbuf_grow(ctx, lb);
      limit = lb->scratch + lb->size;
    }

    ch = lex_byte(ctx, f);
    if (ch == EOF)
      return PDF_TOK_ERROR;

    if (ch == ')')
    {
      depth--;
      if (depth == 0)
        break;
    }
    else if (ch == '(')
    {
      depth++;
    }
    else if (ch == '\\')
    {
      ch = lex_byte(ctx, f);
      if (ch == EOF)
        return PDF_TOK_ERROR;

      /* Escaped line endings produce no output at all. */
      if (ch == '\n')
        continue;
      if (ch == '\r')
      {
        int after = lex_byte(ctx, f);
        if (after != '\n' && after != EOF)
          fz_unread_byte(ctx, f);
        continue;
      }

      if (ch >= '0' && ch <= '7')
      {
        /* Up to three octal digits; the first non-octal byte is
         * pushed back for normal processing. */
        int value = ch - '0';
        int digit = lex_byte(ctx, f);
        if (digit >= '0' && digit <= '7')
        {
          value = value * 8 + (digit - '0');
          digit = lex_byte(ctx, f);
          if (digit >= '0' && digit <= '7')
            value = value * 8 + (digit - '0');
          else if (digit != EOF)
            fz_unread_byte(ctx, f);
        }
        else if (digit != EOF)
          fz_unread_byte(ctx, f);
        ch = value;
      }
      else
      {
        switch (ch)
        {
        case 'n': ch = '\n'; break;
        case 'r': ch = '\r'; break;
        case 't': ch = '\t'; break;
        case 'b': ch = '\b'; break;
        case 'f': ch = '\f'; break;
        default:
          /* '(', ')', '\\' and unknown escapes stand for themselves. */
          break;
        }
      }
    }

    *out++ = ch;
  }

  lb->len = out - lb->scratch;
  return PDF_TOK_STRING;
}

/*
 * Lex a hexadecimal string; the opening '<' has already been consumed.
 *
 * Pairs of hex digits are combined into bytes in lb->scratch (grown on
 * demand) until '>' is read. Whitespace is skipped. A non-hex byte
 * triggers a warning and then counts as a digit of value zero. If the
 * string ends after an odd number of digits, the missing low nibble is
 * taken to be zero.
 *
 * Returns PDF_TOK_STRING with lb->len set to the number of bytes
 * produced, or PDF_TOK_ERROR if EOF is reached before '>'.
 */
static int
lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
  char *out = lb->scratch;
  char *limit = out + lb->size;
  int high = 0;      /* value of the pending high nibble */
  int pending = 0;   /* non-zero while a high nibble awaits its partner */
  int ch;

  for (;;)
  {
    if (out == limit)
    {
      out += pdf_lexbuf_grow(ctx, lb);
      limit = lb->scratch + lb->size;
    }

    ch = lex_byte(ctx, f);

    if (ch == EOF)
      return PDF_TOK_ERROR;

    if (ch == '>')
    {
      if (pending)
        *out++ = high * 16; /* pad truncated string with '0' */
      break;
    }

    if (iswhite(ch))
      continue;

    if (!((ch >= '0' && ch <= '9') ||
        (ch >= 'A' && ch <= 'F') ||
        (ch >= 'a' && ch <= 'f')))
      fz_warn(ctx, "invalid character in hex string");

    if (pending)
      *out++ = high * 16 + unhex(ch);
    else
      high = unhex(ch);
    pending = !pending;
  }

  lb->len = out - lb->scratch;
  return PDF_TOK_STRING;
}

/*
 * Classify a zero terminated keyword.
 *
 * Returns the dedicated token for one of the reserved words listed in
 * the table below. Any other text yields PDF_TOK_KEYWORD when every
 * byte is printable ASCII, and PDF_TOK_ERROR otherwise.
 */
static pdf_token
pdf_token_from_keyword(char *key)
{
  static const struct
  {
    const char *word;
    pdf_token tok;
  } reserved[] = {
    { "R", PDF_TOK_R },
    { "true", PDF_TOK_TRUE },
    { "trailer", PDF_TOK_TRAILER },
    { "false", PDF_TOK_FALSE },
    { "null", PDF_TOK_NULL },
    { "newobj", PDF_TOK_NEWOBJ },
    { "obj", PDF_TOK_OBJ },
    { "endobj", PDF_TOK_ENDOBJ },
    { "endstream", PDF_TOK_ENDSTREAM },
    { "stream", PDF_TOK_STREAM },
    { "startxref", PDF_TOK_STARTXREF },
    { "xref", PDF_TOK_XREF },
  };
  size_t k;
  const char *p;

  for (k = 0; k < sizeof reserved / sizeof reserved[0]; k++)
  {
    /* Compare the first byte before paying for a full strcmp. */
    if (reserved[k].word[0] == key[0] && strcmp(key, reserved[k].word) == 0)
      return reserved[k].tok;
  }

  for (p = key; *p != '\0'; p++)
  {
    if (!fz_isprint(*p))
      return PDF_TOK_ERROR;
  }

  return PDF_TOK_KEYWORD;
}

/*
 * Prepare a lexer buffer for use: point scratch at the embedded buffer,
 * record size as both the current and the base capacity, and reset the
 * stored length to zero. ctx is not used.
 */
void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size)
{
  lb->base_size = size;
  lb->size = lb->base_size;
  lb->len = 0;
  lb->scratch = lb->buffer;
}

/*
 * Release any storage owned by a lexer buffer. A NULL lb is accepted.
 * scratch is freed only when the buffer has grown beyond its base size;
 * otherwise it still points at the embedded buffer and nothing is freed.
 */
void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb)
{
  if (!lb)
    return;
  if (lb->size == lb->base_size)
    return;
  fz_free(ctx, lb->scratch);
}

/*
 * Double the capacity of a lexer buffer.
 *
 * The first growth moves the contents from the embedded buffer into a
 * newly allocated block; later growths resize that block. Returns the
 * offset between the new and the old scratch pointer, which callers add
 * to any pointer they hold into the old storage.
 */
ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb)
{
  char *before = lb->scratch;
  size_t doubled = lb->size * 2;
  int on_heap = (lb->size != lb->base_size);

  if (on_heap)
  {
    lb->scratch = fz_realloc(ctx, lb->scratch, doubled);
  }
  else
  {
    /* Still using the embedded buffer: allocate and copy it over. */
    lb->scratch = Memento_label(fz_malloc(ctx, doubled), "pdf_lexbuf");
    memcpy(lb->scratch, lb->buffer, lb->size);
  }

  lb->size = doubled;
  return lb->scratch - before;
}

/*
 * Read the next token from the stream.
 *
 * Whitespace and comments are skipped. Names, strings, hex strings and
 * keywords leave their text in buf->scratch; numbers leave their value
 * in buf->i or buf->f. A stray ')' or a lone '>' yields PDF_TOK_ERROR,
 * and end of input yields PDF_TOK_EOF.
 */
pdf_token
pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
  for (;;)
  {
    int next;
    int c = lex_byte(ctx, f);

    if (c == EOF)
      return PDF_TOK_EOF;

    switch (c)
    {
    /* Nothing to report for these: go round again. */
    case IS_WHITE:
      lex_white(ctx, f);
      continue;
    case '%':
      lex_comment(ctx, f);
      continue;

    /* Single byte tokens. */
    case '[':
      return PDF_TOK_OPEN_ARRAY;
    case ']':
      return PDF_TOK_CLOSE_ARRAY;
    case '{':
      return PDF_TOK_OPEN_BRACE;
    case '}':
      return PDF_TOK_CLOSE_BRACE;
    case ')':
      return PDF_TOK_ERROR;

    /* "<<" opens a dictionary, a single '<' starts a hex string. */
    case '<':
      next = lex_byte(ctx, f);
      if (next == '<')
        return PDF_TOK_OPEN_DICT;
      if (next != EOF)
        fz_unread_byte(ctx, f);
      return lex_hex_string(ctx, f, buf);

    /* ">>" closes a dictionary, a single '>' is an error here. */
    case '>':
      next = lex_byte(ctx, f);
      if (next == '>')
        return PDF_TOK_CLOSE_DICT;
      if (next != EOF)
        fz_unread_byte(ctx, f);
      return PDF_TOK_ERROR;

    case '(':
      return lex_string(ctx, f, buf);
    case '/':
      lex_name(ctx, f, buf);
      return PDF_TOK_NAME;
    case IS_NUMBER:
      return lex_number(ctx, f, buf, c);

    default: /* isregular: !isdelim && !iswhite && c != EOF */
      /* Re-read the byte as the first character of a keyword. */
      fz_unread_byte(ctx, f);
      lex_name(ctx, f, buf);
      return pdf_token_from_keyword(buf->scratch);
    }
  }
}

/*
 * Read the next token from the stream, with string syntax disabled.
 *
 * Works like pdf_lex except that '(' and ')' and a '<' that does not
 * start "<<" all yield PDF_TOK_ERROR instead of being lexed as literal
 * or hexadecimal strings.
 */
pdf_token
pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
  int c;

  for (;;)
  {
    c = lex_byte(ctx, f);
    switch (c)
    {
    case IS_WHITE:
      lex_white(ctx, f);
      continue;
    case '%':
      lex_comment(ctx, f);
      continue;

    case EOF:
      return PDF_TOK_EOF;

    case '(':
    case ')':
      return PDF_TOK_ERROR; /* no strings allowed */

    case '[':
      return PDF_TOK_OPEN_ARRAY;
    case ']':
      return PDF_TOK_CLOSE_ARRAY;
    case '{':
      return PDF_TOK_OPEN_BRACE;
    case '}':
      return PDF_TOK_CLOSE_BRACE;

    case '<':
    case '>':
    {
      /* Only the doubled forms "<<" and ">>" are valid here; a single
       * angle bracket is an error (no hex strings allowed). */
      int opening = (c == '<');
      int second = lex_byte(ctx, f);

      if (second == c)
        return opening ? PDF_TOK_OPEN_DICT : PDF_TOK_CLOSE_DICT;
      if (second != EOF)
        fz_unread_byte(ctx, f);
      return PDF_TOK_ERROR;
    }

    case '/':
      lex_name(ctx, f, buf);
      return PDF_TOK_NAME;

    case IS_NUMBER:
      return lex_number(ctx, f, buf, c);

    default: /* isregular: !isdelim && !iswhite && c != EOF */
      fz_unread_byte(ctx, f);
      lex_name(ctx, f, buf);
      return pdf_token_from_keyword(buf->scratch);
    }
  }
}

/*
 * Append the textual form of a token to fzbuf.
 *
 * tok selects the representation; buf supplies the token's payload
 * (scratch/len for names, strings and unrecognised tokens, i for
 * integers, f for reals). For strings the scratch buffer is grown if
 * necessary so that a zero terminator can be written after the data.
 */
void pdf_append_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
{
  if (tok == PDF_TOK_NAME)
  {
    fz_append_printf(ctx, fzbuf, "/%s", buf->scratch);
  }
  else if (tok == PDF_TOK_STRING)
  {
    /* Make room for, and write, the terminator the callee relies on. */
    if (buf->len >= buf->size)
      pdf_lexbuf_grow(ctx, buf);
    buf->scratch[buf->len] = 0;
    fz_append_pdf_string(ctx, fzbuf, buf->scratch);
  }
  else if (tok == PDF_TOK_OPEN_DICT)
  {
    fz_append_string(ctx, fzbuf, "<<");
  }
  else if (tok == PDF_TOK_CLOSE_DICT)
  {
    fz_append_string(ctx, fzbuf, ">>");
  }
  else if (tok == PDF_TOK_OPEN_ARRAY)
  {
    fz_append_byte(ctx, fzbuf, '[');
  }
  else if (tok == PDF_TOK_CLOSE_ARRAY)
  {
    fz_append_byte(ctx, fzbuf, ']');
  }
  else if (tok == PDF_TOK_OPEN_BRACE)
  {
    fz_append_byte(ctx, fzbuf, '{');
  }
  else if (tok == PDF_TOK_CLOSE_BRACE)
  {
    fz_append_byte(ctx, fzbuf, '}');
  }
  else if (tok == PDF_TOK_INT)
  {
    fz_append_printf(ctx, fzbuf, "%ld", buf->i);
  }
  else if (tok == PDF_TOK_REAL)
  {
    fz_append_printf(ctx, fzbuf, "%g", buf->f);
  }
  else
  {
    /* Everything else is copied through as raw scratch bytes. */
    fz_append_data(ctx, fzbuf, buf->scratch, buf->len);
  }
}

Coverage Report

Created: 2023-06-07 06:20

Line	Count	Source (jump to first uncovered line)
1		// Copyright (C) 2004-2021 Artifex Software, Inc.
2		//
3		// This file is part of MuPDF.
4		//
5		// MuPDF is free software: you can redistribute it and/or modify it under the
6		// terms of the GNU Affero General Public License as published by the Free
7		// Software Foundation, either version 3 of the License, or (at your option)
8		// any later version.
9		//
10		// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11		// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12		// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13		// details.
14		//
15		// You should have received a copy of the GNU Affero General Public License
16		// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17		//
18		// Alternative licensing terms are available from the licensor.
19		// For commercial licensing, see <https://www.artifex.com/> or contact
20		// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21		// CA 94129, USA, for further information.
22
23		#include "mupdf/fitz.h"
24		#include "mupdf/pdf.h"
25
26		#include <string.h>
27
28		#define IS_NUMBER \
29	36.6M	'+':case'-':case'.':case'0':case'1':case'2':case'3':\
30	46.8M	case'4':case'5':case'6':case'7':case'8':case'9'
31		#define IS_WHITE \
32	156M	'\x00':case'\x09':case'\x0a':case'\x0c':case'\x0d':case'\x20'
33		#define IS_HEX \
34	39.1M	'0':case'1':case'2':case'3':case'4':case'5':case'6':\
35	41.9M	case'7':case'8':case'9':case'A':case'B':case'C':\
36	44.0M	case'D':case'E':case'F':case'a':case'b':case'c':\
37	45.3M	case'd':case'e':case'f'
38		#define IS_DELIM \
39	78.5M	'(':case')':case'<':case'>':case'[':case']':case'{':\
40	85.6M	case'}':case'/':case'%'
41
42		#define RANGE_0_9 \
43	75.8M	'0':case'1':case'2':case'3':case'4':case'5':\
44	116M	case'6':case'7':case'8':case'9'
45		#define RANGE_a_f \
46	18.0k	'a':case'b':case'c':case'd':case'e':case'f'
47		#define RANGE_A_F \
48	8.89k	'A':case'B':case'C':case'D':case'E':case'F'
49		#define RANGE_0_7 \
50	139k	'0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'
51
52		/* #define DUMP_LEXER_STREAM */
53		#ifdef DUMP_LEXER_STREAM
54		static inline int lex_byte(fz_context ctx, fz_stream stm)
55		{
56		int c = fz_read_byte(ctx, stm);
57
58		if (c == EOF)
59		fz_write_printf(ctx, fz_stdout(ctx), "<EOF>");
60		else if (c >= 32 && c < 128)
61		fz_write_printf(ctx, fz_stdout(ctx), "%c", c);
62		else
63		fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", c);
64		return c;
65		}
66		#else
67	1.31G	#define lex_byte(C,S) fz_read_byte(C,S)
68		#endif
69
70		static inline int iswhite(int ch)
71	16.5M	{
72	16.5M	return
73	16.5M	ch == '\000' \|\|
74	16.5M	ch == '\011' \|\|
75	16.5M	ch == '\012' \|\|
76	16.5M	ch == '\014' \|\|
77	16.5M	ch == '\015' \|\|
78	16.5M	ch == '\040';
79	16.5M	}
80
81		static inline int fz_isprint(int ch)
82	25.1M	{
83	25.1M	return ch >= ' ' && ch <= '~';
84	25.1M	}
85
86		static inline int unhex(int ch)
87	45.3M	{
88	45.3M	if (ch >= '0' && ch <= '9') return ch - '0';
89	35.6M	if (ch >= 'A' && ch <= 'F') return ch - 'A' + 0xA;
90	33.4M	if (ch >= 'a' && ch <= 'f') return ch - 'a' + 0xA;
91	31.1M	return 0;
92	33.4M	}
93
94		static void
95		lex_white(fz_context ctx, fz_stream f)
96	77.8M	{
97	77.8M	int c;
98	93.9M	do {
99	93.9M	c = lex_byte(ctx, f);
100	93.9M	} while ((c <= 32) && (iswhite(c)));
101	77.8M	if (c != EOF)
102	77.8M	fz_unread_byte(ctx, f);
103	77.8M	}
104
105		static void
106		lex_comment(fz_context ctx, fz_stream f)
107	306k	{
108	306k	int c;
109	152M	do {
110	152M	c = lex_byte(ctx, f);
111	152M	} while ((c != '\012') && (c != '\015') && (c != EOF));
112	306k	}
113
114		/* Fast(ish) but inaccurate strtof, with Adobe overflow handling. */
115		static float acrobat_compatible_atof(char *s)
116	11.8k	{
117	11.8k	int neg = 0;
118	11.8k	int i = 0;
119
120	12.4k	while (*s == '-')
121	619	{
122	619	neg = 1;
123	619	++s;
124	619	}
125	12.4k	while (*s == '+')
126	637	{
127	637	++s;
128	637	}
129
130	199k	while (s >= '0' && s <= '9')
131	187k	{
132		/* We deliberately ignore overflow here.
133		* Tests show that Acrobat handles * overflows in exactly the same way we do:
134		* 123450000000000000000678 is read as 678.
135		*/
136	187k	i = i * 10 + (*s - '0');
137	187k	++s;
138	187k	}
139
140	11.8k	if (*s == '.')
141	11.0k	{
142	11.0k	float v = i;
143	11.0k	float n = 0;
144	11.0k	float d = 1;
145	11.0k	++s;
146	40.2k	while (s >= '0' && s <= '9')
147	29.1k	{
148	29.1k	n = 10 * n + (*s - '0');
149	29.1k	d = 10 * d;
150	29.1k	++s;
151	29.1k	}
152	11.0k	v += n / d;
153	11.0k	return neg ? -v : v;
154	11.0k	}
155	751	else
156	751	{
157	751	return neg ? -i : i;
158	751	}
159	11.8k	}
160
161		/* Fast but inaccurate atoi. */
162		static int fast_atoi(char *s)
163	32.5M	{
164	32.5M	int neg = 0;
165	32.5M	int i = 0;
166
167	34.0M	while (*s == '-')
168	1.47M	{
169	1.47M	neg = 1;
170	1.47M	++s;
171	1.47M	}
172	32.5M	while (*s == '+')
173	2.80k	{
174	2.80k	++s;
175	2.80k	}
176
177	111M	while (s >= '0' && s <= '9')
178	79.3M	{
179		/* We deliberately ignore overflow here. */
180	79.3M	i = i * 10 + (*s - '0');
181	79.3M	++s;
182	79.3M	}
183
184	32.5M	return neg ? -i : i;
185	32.5M	}
186
187		static int
188		lex_number(fz_context ctx, fz_stream f, pdf_lexbuf *buf, int c)
189	46.8M	{
190	46.8M	char *s = buf->scratch;
191	46.8M	char e = buf->scratch + buf->size - 1; / leave space for zero terminator */
192	46.8M	char *isreal = (c == '.' ? s : NULL);
193	46.8M	int neg = (c == '-');
194	46.8M	int isbad = 0;
195
196	46.8M	*s++ = c;
197
198	46.8M	c = lex_byte(ctx, f);
199
200		/* skip extra '-' signs at start of number */
201	46.8M	if (neg)
202	2.96M	{
203	2.98M	while (c == '-')
204	17.0k	c = lex_byte(ctx, f);
205	2.96M	}
206
207	180M	while (s < e)
208	180M	{
209	180M	switch (c)
210	180M	{
211	48.5M	case IS_WHITE:
212	48.5M	case IS_DELIM:
213	46.8M	fz_unread_byte(ctx, f);
214	46.8M	goto end;
215	6.98k	case EOF:
216	6.98k	goto end;
217	13.7M	case '.':
218	13.7M	if (isreal)
219	256k	isbad = 1;
220	13.7M	isreal = s;
221	13.7M	*s++ = c;
222	13.7M	break;
223	97.9k	case '-':
224		/* Bug 703248: Some PDFs (particularly those
225		* generated by google docs) apparently have
226		* numbers like 0.000000000000-5684342 in them.
227		* We'll stop our interpretation at the -, but
228		* keep reading to skip over the trailing
229		* digits so they aren't parsed later. */
230	97.9k	*s++ = '\0';
231	97.9k	break;
232	116M	case RANGE_0_9:
233	116M	*s++ = c;
234	116M	break;
235	2.60M	default:
236	2.60M	isbad = 1;
237	2.60M	*s++ = c;
238	2.60M	break;
239	180M	}
240	133M	c = lex_byte(ctx, f);
241	133M	}
242
243	46.8M	end:
244	46.8M	*s = '\0';
245	46.8M	if (isbad)
246	772k	return PDF_TOK_KEYWORD;
247	46.0M	if (isreal)
248	13.4M	{
249		/* We'd like to use the fastest possible atof
250		* routine, but we'd rather match acrobats
251		* handling of broken numbers. As such, we
252		* spot common broken cases and call an
253		* acrobat compatible routine where required. */
254	13.4M	if (neg > 1 \|\| isreal - buf->scratch >= 10)
255	11.8k	buf->f = acrobat_compatible_atof(buf->scratch);
256	13.4M	else
257	13.4M	buf->f = fz_atof(buf->scratch);
258	13.4M	return PDF_TOK_REAL;
259	13.4M	}
260	32.5M	else
261	32.5M	{
262	32.5M	buf->i = fast_atoi(buf->scratch);
263	32.5M	return PDF_TOK_INT;
264	32.5M	}
265	46.0M	}
266
267		static void
268		lex_name(fz_context ctx, fz_stream f, pdf_lexbuf *lb)
269	38.8M	{
270	38.8M	char *s = lb->scratch;
271	38.8M	char *e = s + fz_minz(127, lb->size);
272	38.8M	int c;
273
274	294M	while (1)
275	294M	{
276	294M	if (s == e)
277	11.5k	{
278	11.5k	if (e - lb->scratch < 127)
279	0	{
280	0	s += pdf_lexbuf_grow(ctx, lb);
281	0	e = lb->scratch + fz_minz(127, lb->size);
282	0	}
283	11.5k	else
284	11.5k	{
285		/* truncate names that are too long */
286	11.5k	fz_warn(ctx, "name is too long");
287	11.5k	*s = 0;
288	11.5k	lb->len = s - lb->scratch;
289	11.5k	s = NULL;
290	11.5k	}
291	11.5k	}
292	294M	c = lex_byte(ctx, f);
293	294M	switch (c)
294	294M	{
295	78.9M	case IS_WHITE:
296	78.9M	case IS_DELIM:
297	38.8M	fz_unread_byte(ctx, f);
298	38.8M	goto end;
299	12.1k	case EOF:
300	12.1k	goto end;
301	112k	case '#':
302	112k	{
303	112k	int hex[2];
304	112k	int i;
305	183k	for (i = 0; i < 2; i++)
306	153k	{
307	153k	c = fz_peek_byte(ctx, f);
308	153k	switch (c)
309	153k	{
310	325k	case RANGE_0_9:
311	325k	if (i == 1 && c == '0' && hex[0] == 0)
312	40	goto illegal;
313	44.3k	hex[i] = lex_byte(ctx, f) - '0';
314	44.3k	break;
315	18.0k	case RANGE_a_f:
316	18.0k	hex[i] = lex_byte(ctx, f) - 'a' + 10;
317	18.0k	break;
318	8.89k	case RANGE_A_F:
319	8.89k	hex[i] = lex_byte(ctx, f) - 'A' + 10;
320	8.89k	break;
321	82.4k	default:
322	82.5k	case EOF:
323	82.5k	goto illegal;
324	153k	}
325	153k	}
326	29.8k	if (s) *s++ = (hex[0] << 4) + hex[1];
327	29.8k	break;
328	82.5k	illegal:
329	82.5k	if (i == 1)
330	11.6k	fz_unread_byte(ctx, f);
331	82.5k	if (s) *s++ = '#';
332	82.5k	continue;
333	112k	}
334	255M	default:
335	255M	if (s) *s++ = c;
336	255M	break;
337	294M	}
338	294M	}
339	38.8M	end:
340	38.8M	if (s)
341	38.8M	{
342	38.8M	*s = '\0';
343	38.8M	lb->len = s - lb->scratch;
344	38.8M	}
345	38.8M	}
346
347		static int
348		lex_string(fz_context ctx, fz_stream f, pdf_lexbuf *lb)
349	1.75M	{
350	1.75M	char *s = lb->scratch;
351	1.75M	char *e = s + lb->size;
352	1.75M	int bal = 1;
353	1.75M	int oct;
354	1.75M	int c;
355
356	359M	while (1)
357	359M	{
358	359M	if (s == e)
359	5.74k	{
360	5.74k	s += pdf_lexbuf_grow(ctx, lb);
361	5.74k	e = lb->scratch + lb->size;
362	5.74k	}
363	359M	c = lex_byte(ctx, f);
364	359M	switch (c)
365	359M	{
366	5.25k	case EOF:
367	5.25k	return PDF_TOK_ERROR;
368	488k	case '(':
369	488k	bal++;
370	488k	*s++ = c;
371	488k	break;
372	2.18M	case ')':
373	2.18M	bal --;
374	2.18M	if (bal == 0)
375	1.74M	goto end;
376	436k	*s++ = c;
377	436k	break;
378	597k	case '\\':
379	597k	c = lex_byte(ctx, f);
380	597k	switch (c)
381	597k	{
382	8	case EOF:
383	8	return PDF_TOK_ERROR;
384	5.54k	case 'n':
385	5.54k	*s++ = '\n';
386	5.54k	break;
387	7.51k	case 'r':
388	7.51k	*s++ = '\r';
389	7.51k	break;
390	4.70k	case 't':
391	4.70k	*s++ = '\t';
392	4.70k	break;
393	2.12k	case 'b':
394	2.12k	*s++ = '\b';
395	2.12k	break;
396	2.22k	case 'f':
397	2.22k	*s++ = '\f';
398	2.22k	break;
399	21.8k	case '(':
400	21.8k	*s++ = '(';
401	21.8k	break;
402	21.6k	case ')':
403	21.6k	*s++ = ')';
404	21.6k	break;
405	72.3k	case '\\':
406	72.3k	*s++ = '\\';
407	72.3k	break;
408	139k	case RANGE_0_7:
409	139k	oct = c - '0';
410	139k	c = lex_byte(ctx, f);
411	139k	if (c >= '0' && c <= '7')
412	134k	{
413	134k	oct = oct * 8 + (c - '0');
414	134k	c = lex_byte(ctx, f);
415	134k	if (c >= '0' && c <= '7')
416	133k	oct = oct * 8 + (c - '0');
417	1.12k	else if (c != EOF)
418	1.12k	fz_unread_byte(ctx, f);
419	134k	}
420	5.76k	else if (c != EOF)
421	5.71k	fz_unread_byte(ctx, f);
422	139k	*s++ = oct;
423	139k	break;
424	384	case '\n':
425	384	break;
426	2.53k	case '\r':
427	2.53k	c = lex_byte(ctx, f);
428	2.53k	if ((c != '\n') && (c != EOF))
429	2.45k	fz_unread_byte(ctx, f);
430	2.53k	break;
431	317k	default:
432	317k	*s++ = c;
433	597k	}
434	597k	break;
435	356M	default:
436	356M	*s++ = c;
437	356M	break;
438	359M	}
439	359M	}
440	1.74M	end:
441	1.74M	lb->len = s - lb->scratch;
442	1.74M	return PDF_TOK_STRING;
443	1.75M	}
444
445		static int
446		lex_hex_string(fz_context ctx, fz_stream f, pdf_lexbuf *lb)
447	1.28M	{
448	1.28M	char *s = lb->scratch;
449	1.28M	char *e = s + lb->size;
450	1.28M	int a = 0, x = 0;
451	1.28M	int c;
452
453	51.4M	while (1)
454	51.4M	{
455	51.4M	if (s == e)
456	1.69k	{
457	1.69k	s += pdf_lexbuf_grow(ctx, lb);
458	1.69k	e = lb->scratch + lb->size;
459	1.69k	}
460	51.4M	c = lex_byte(ctx, f);
461	51.4M	switch (c)
462	51.4M	{
463	4.77M	case IS_WHITE:
464	4.77M	break;
465	31.1M	default:
466	31.1M	fz_warn(ctx, "invalid character in hex string");
467		/* fall through */
468	45.3M	case IS_HEX:
469	45.3M	if (x)
470	22.6M	{
471	22.6M	s++ = a 16 + unhex(c);
472	22.6M	x = !x;
473	22.6M	}
474	22.7M	else
475	22.7M	{
476	22.7M	a = unhex(c);
477	22.7M	x = !x;
478	22.7M	}
479	45.3M	break;
480	1.28M	case '>':
481	1.28M	if (x)
482	91.6k	{
483	91.6k	s++ = a 16; /* pad truncated string with '0' */
484	91.6k	}
485	1.28M	goto end;
486	4.06k	case EOF:
487	4.06k	return PDF_TOK_ERROR;
488	51.4M	}
489	51.4M	}
490	1.28M	end:
491	1.28M	lb->len = s - lb->scratch;
492	1.28M	return PDF_TOK_STRING;
493	1.28M	}
494
495		static pdf_token
496		pdf_token_from_keyword(char *key)
497	25.4M	{
498	25.4M	switch (*key)
499	25.4M	{
500	6.15M	case 'R':
501	6.15M	if (!strcmp(key, "R")) return PDF_TOK_R;
502	45.1k	break;
503	208k	case 't':
504	208k	if (!strcmp(key, "true")) return PDF_TOK_TRUE;
505	65.4k	if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER;
506	60.5k	break;
507	338k	case 'f':
508	338k	if (!strcmp(key, "false")) return PDF_TOK_FALSE;
509	215k	break;
510	506k	case 'n':
511	506k	if (!strcmp(key, "null")) return PDF_TOK_NULL;
512	387k	if (!strcmp(key, "newobj")) return PDF_TOK_NEWOBJ;
513	387k	break;
514	635k	case 'o':
515	635k	if (!strcmp(key, "obj")) return PDF_TOK_OBJ;
516	13.7k	break;
517	518k	case 'e':
518	518k	if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ;
519	136k	if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM;
520	76.1k	break;
521	375k	case 's':
522	375k	if (!strcmp(key, "stream")) return PDF_TOK_STREAM;
523	152k	if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF;
524	147k	break;
525	147k	case 'x':
526	14.2k	if (!strcmp(key, "xref")) return PDF_TOK_XREF;
527	10.6k	break;
528	25.4M	}
529
530	35.8M	while (*key)
531	25.1M	{
532	25.1M	if (!fz_isprint(*key))
533	6.90M	return PDF_TOK_ERROR;
534	18.2M	++key;
535	18.2M	}
536
537	10.7M	return PDF_TOK_KEYWORD;
538	17.6M	}
539
540		void pdf_lexbuf_init(fz_context ctx, pdf_lexbuf lb, int size)
541	78.0k	{
542	78.0k	lb->size = lb->base_size = size;
543	78.0k	lb->len = 0;
544	78.0k	lb->scratch = &lb->buffer[0];
545	78.0k	}
546
547		void pdf_lexbuf_fin(fz_context ctx, pdf_lexbuf lb)
548	78.0k	{
549	78.0k	if (lb && lb->size != lb->base_size)
550	2.33k	fz_free(ctx, lb->scratch);
551	78.0k	}
552
553		ptrdiff_t pdf_lexbuf_grow(fz_context ctx, pdf_lexbuf lb)
554	7.44k	{
555	7.44k	char *old = lb->scratch;
556	7.44k	size_t newsize = lb->size * 2;
557	7.44k	if (lb->size == lb->base_size)
558	2.33k	{
559	2.33k	lb->scratch = Memento_label(fz_malloc(ctx, newsize), "pdf_lexbuf");
560	2.33k	memcpy(lb->scratch, lb->buffer, lb->size);
561	2.33k	}
562	5.10k	else
563	5.10k	{
564	5.10k	lb->scratch = fz_realloc(ctx, lb->scratch, newsize);
565	5.10k	}
566	7.44k	lb->size = newsize;
567	7.44k	return lb->scratch - old;
568	7.44k	}
569
570		pdf_token
571		pdf_lex(fz_context ctx, fz_stream f, pdf_lexbuf *buf)
572	91.8M	{
573	167M	while (1)
574	167M	{
575	167M	int c = lex_byte(ctx, f);
576	167M	switch (c)
577	167M	{
578	95.7k	case EOF:
579	95.7k	return PDF_TOK_EOF;
580	75.3M	case IS_WHITE:
581	75.3M	lex_white(ctx, f);
582	75.3M	break;
583	254k	case '%':
584	254k	lex_comment(ctx, f);
585	254k	break;
586	13.2M	case '/':
587	13.2M	lex_name(ctx, f, buf);
588	13.2M	return PDF_TOK_NAME;
589	1.75M	case '(':
590	1.75M	return lex_string(ctx, f, buf);
591	47.1k	case ')':
592	47.1k	return PDF_TOK_ERROR;
593	2.80M	case '<':
594	2.80M	c = lex_byte(ctx, f);
595	2.80M	if (c == '<')
596	1.51M	return PDF_TOK_OPEN_DICT;
597	1.28M	if (c != EOF)
598	1.28M	fz_unread_byte(ctx, f);
599	1.28M	return lex_hex_string(ctx, f, buf);
600	1.52M	case '>':
601	1.52M	c = lex_byte(ctx, f);
602	1.52M	if (c == '>')
603	1.38M	return PDF_TOK_CLOSE_DICT;
604	141k	if (c != EOF)
605	140k	fz_unread_byte(ctx, f);
606	141k	return PDF_TOK_ERROR;
607	1.42M	case '[':
608	1.42M	return PDF_TOK_OPEN_ARRAY;
609	1.35M	case ']':
610	1.35M	return PDF_TOK_CLOSE_ARRAY;
611	39.6k	case '{':
612	39.6k	return PDF_TOK_OPEN_BRACE;
613	56.1k	case '}':
614	56.1k	return PDF_TOK_CLOSE_BRACE;
615	45.4M	case IS_NUMBER:
616	45.4M	return lex_number(ctx, f, buf, c);
617	24.0M	default: /* isregular: !isdelim && !iswhite && c != EOF */
618	24.0M	fz_unread_byte(ctx, f);
619	24.0M	lex_name(ctx, f, buf);
620	24.0M	return pdf_token_from_keyword(buf->scratch);
621	167M	}
622	167M	}
623	91.8M	}
624
625		pdf_token
626		pdf_lex_no_string(fz_context ctx, fz_stream f, pdf_lexbuf *buf)
627	3.35M	{
628	5.91M	while (1)
629	5.91M	{
630	5.91M	int c = lex_byte(ctx, f);
631	5.91M	switch (c)
632	5.91M	{
633	3.58k	case EOF:
634	3.58k	return PDF_TOK_EOF;
635	2.49M	case IS_WHITE:
636	2.49M	lex_white(ctx, f);
637	2.49M	break;
638	52.0k	case '%':
639	52.0k	lex_comment(ctx, f);
640	52.0k	break;
641	95.6k	case '/':
642	95.6k	lex_name(ctx, f, buf);
643	95.6k	return PDF_TOK_NAME;
644	40.8k	case '(':
645	40.8k	return PDF_TOK_ERROR; /* no strings allowed */
646	41.2k	case ')':
647	41.2k	return PDF_TOK_ERROR; /* no strings allowed */
648	59.3k	case '<':
649	59.3k	c = lex_byte(ctx, f);
650	59.3k	if (c == '<')
651	8.19k	return PDF_TOK_OPEN_DICT;
652	51.1k	if (c != EOF)
653	51.1k	fz_unread_byte(ctx, f);
654	51.1k	return PDF_TOK_ERROR; /* no strings allowed */
655	93.7k	case '>':
656	93.7k	c = lex_byte(ctx, f);
657	93.7k	if (c == '>')
658	10.5k	return PDF_TOK_CLOSE_DICT;
659	83.2k	if (c != EOF)
660	83.2k	fz_unread_byte(ctx, f);
661	83.2k	return PDF_TOK_ERROR;
662	74.1k	case '[':
663	74.1k	return PDF_TOK_OPEN_ARRAY;
664	68.8k	case ']':
665	68.8k	return PDF_TOK_CLOSE_ARRAY;
666	40.0k	case '{':
667	40.0k	return PDF_TOK_OPEN_BRACE;
668	36.6k	case '}':
669	36.6k	return PDF_TOK_CLOSE_BRACE;
670	1.38M	case IS_NUMBER:
671	1.38M	return lex_number(ctx, f, buf, c);
672	1.41M	default: /* isregular: !isdelim && !iswhite && c != EOF */
673	1.41M	fz_unread_byte(ctx, f);
674	1.41M	lex_name(ctx, f, buf);
675	1.41M	return pdf_token_from_keyword(buf->scratch);
676	5.91M	}
677	5.91M	}
678	3.35M	}
679
680		void pdf_append_token(fz_context ctx, fz_buffer fzbuf, int tok, pdf_lexbuf *buf)
681	0	{
682	0	switch (tok)
683	0	{
684	0	case PDF_TOK_NAME:
685	0	fz_append_printf(ctx, fzbuf, "/%s", buf->scratch);
686	0	break;
687	0	case PDF_TOK_STRING:
688	0	if (buf->len >= buf->size)
689	0	pdf_lexbuf_grow(ctx, buf);
690	0	buf->scratch[buf->len] = 0;
691	0	fz_append_pdf_string(ctx, fzbuf, buf->scratch);
692	0	break;
693	0	case PDF_TOK_OPEN_DICT:
694	0	fz_append_string(ctx, fzbuf, "<<");
695	0	break;
696	0	case PDF_TOK_CLOSE_DICT:
697	0	fz_append_string(ctx, fzbuf, ">>");
698	0	break;
699	0	case PDF_TOK_OPEN_ARRAY:
700	0	fz_append_byte(ctx, fzbuf, '[');
701	0	break;
702	0	case PDF_TOK_CLOSE_ARRAY:
703	0	fz_append_byte(ctx, fzbuf, ']');
704	0	break;
705	0	case PDF_TOK_OPEN_BRACE:
706	0	fz_append_byte(ctx, fzbuf, '{');
707	0	break;
708	0	case PDF_TOK_CLOSE_BRACE:
709	0	fz_append_byte(ctx, fzbuf, '}');
710	0	break;
711	0	case PDF_TOK_INT:
712	0	fz_append_printf(ctx, fzbuf, "%ld", buf->i);
713	0	break;
714	0	case PDF_TOK_REAL:
715	0	fz_append_printf(ctx, fzbuf, "%g", buf->f);
716	0	break;
717	0	default:
718	0	fz_append_data(ctx, fzbuf, buf->scratch, buf->len);
719	0	break;
720	0	}
721	0	}