/src/mupdf/source/pdf/pdf-lex.c

Source
// Copyright (C) 2004-2024 Artifex Software, Inc.
//
// This file is part of MuPDF.
//
// MuPDF is free software: you can redistribute it and/or modify it under the
// terms of the GNU Affero General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
// details.
//
// You should have received a copy of the GNU Affero General Public License
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
//
// Alternative licensing terms are available from the licensor.
// For commercial licensing, see <https://www.artifex.com/> or contact
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
// CA 94129, USA, for further information.

#include "mupdf/fitz.h"
#include "mupdf/pdf.h"

#include <string.h>

/*
 * Case-label lists.
 *
 * Each macro below expands to a run of character constants joined by
 * ":case", so writing "case IS_WHITE:" inside a switch yields one case
 * label per listed character.
 */

/* Sign, decimal point and decimal digits: bytes for which pdf_lex() and
 * pdf_lex_no_string() hand over to lex_number(). */
#define IS_NUMBER \
  '+':case'-':case'.':case'0':case'1':case'2':case'3':\
  case'4':case'5':case'6':case'7':case'8':case'9'
/* NUL, tab, line feed, form feed, carriage return and space. */
#define IS_WHITE \
  '\x00':case'\x09':case'\x0a':case'\x0c':case'\x0d':case'\x20'
/* Hexadecimal digits, upper and lower case. */
#define IS_HEX \
  '0':case'1':case'2':case'3':case'4':case'5':case'6':\
  case'7':case'8':case'9':case'A':case'B':case'C':\
  case'D':case'E':case'F':case'a':case'b':case'c':\
  case'd':case'e':case'f'
/* Delimiter characters: ( ) < > [ ] { } / and %. The number and name
 * lexers stop (and un-read the byte) when they meet one of these. */
#define IS_DELIM \
  '(':case')':case'<':case'>':case'[':case']':case'{':\
  case'}':case'/':case'%'

/* Decimal digits '0'..'9'. */
#define RANGE_0_9 \
  '0':case'1':case'2':case'3':case'4':case'5':\
  case'6':case'7':case'8':case'9'
/* Lower-case hex letters 'a'..'f'. */
#define RANGE_a_f \
  'a':case'b':case'c':case'd':case'e':case'f'
/* Upper-case hex letters 'A'..'F'. */
#define RANGE_A_F \
  'A':case'B':case'C':case'D':case'E':case'F'
/* Octal digits '0'..'7'. */
#define RANGE_0_7 \
  '0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'

/*
 * lex_byte: fetch the next byte of the stream on behalf of the lexer.
 *
 * Enable the define below to build a tracing variant that echoes every
 * byte it reads to fz_stdout(): bytes in the range 32..127 are written
 * verbatim, other bytes as <xx> in hex, and end of stream as <EOF>.
 * Without the define, lex_byte is simply an alias for fz_read_byte().
 */
/* #define DUMP_LEXER_STREAM */
#ifdef DUMP_LEXER_STREAM
static inline int lex_byte(fz_context *ctx, fz_stream *stm)
{
  int c = fz_read_byte(ctx, stm);

  /* Trace the byte, then hand it back unchanged. */
  if (c == EOF)
    fz_write_printf(ctx, fz_stdout(ctx), "<EOF>");
  else if (c >= 32 && c < 128)
    fz_write_printf(ctx, fz_stdout(ctx), "%c", c);
  else
    fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", c);
  return c;
}
#else
/* Non-tracing build: read straight from the stream. */
#define lex_byte(C,S) fz_read_byte(C,S)
#endif

/*
 * Return 1 if ch is one of the six white-space bytes recognised by the
 * lexer (NUL, tab, line feed, form feed, carriage return, space),
 * otherwise 0.
 */
static inline int iswhite(int ch)
{
  switch (ch)
  {
  case '\0':
  case '\t':
  case '\n':
  case '\f':
  case '\r':
  case ' ':
    return 1;
  default:
    return 0;
  }
}

/*
 * Return 1 if ch lies in the printable ASCII range ' ' .. '~'
 * (inclusive), otherwise 0.
 */
static inline int fz_isprint(int ch)
{
  if (ch < ' ')
    return 0;
  return ch <= '~';
}

/*
 * Convert one hexadecimal digit character to its value (0..15).
 * Any character that is not a hex digit yields 0.
 */
static inline int unhex(int ch)
{
  int digit = 0;

  if ('0' <= ch && ch <= '9')
    digit = ch - '0';
  else if ('a' <= ch && ch <= 'f')
    digit = 10 + (ch - 'a');
  else if ('A' <= ch && ch <= 'F')
    digit = 10 + (ch - 'A');

  return digit;
}

/*
 * Skip a run of white-space bytes. The first byte that is not white
 * space is handed back to the stream with fz_unread_byte(); nothing is
 * handed back when the run ends at EOF.
 */
static void
lex_white(fz_context *ctx, fz_stream *f)
{
  int ch;

  for (;;)
  {
    ch = lex_byte(ctx, f);
    /* Cheap range test first; iswhite() is only consulted for ch <= 32. */
    if (ch > 32 || !iswhite(ch))
      break;
  }

  if (ch == EOF)
    return;
  fz_unread_byte(ctx, f);
}

/*
 * Discard the remainder of a comment: bytes are consumed up to and
 * including the first line feed or carriage return, or until EOF.
 */
static void
lex_comment(fz_context *ctx, fz_stream *f)
{
  for (;;)
  {
    int ch = lex_byte(ctx, f);

    if (ch == '\012' || ch == '\015' || ch == EOF)
      return;
  }
}

/*
 * Fast(ish) but inaccurate strtof, with Adobe overflow handling.
 *
 * Accepted syntax: any number of leading '-' (the result is negated if
 * at least one is present), then any number of '+', then decimal
 * digits, then optionally '.' followed by more decimal digits.
 * Parsing stops at the first character that does not fit.
 *
 * Overflow of the integer part is deliberately not detected: the value
 * simply wraps. The upstream note says tests show Acrobat behaving the
 * same way, giving 123450000000000000000678 being read as 678 as an
 * example (not re-verified here). The wrap is performed in unsigned
 * arithmetic, where it is well defined, rather than in signed
 * arithmetic, where overflow is undefined behaviour.
 *
 * Returns the parsed value as a float.
 */
static float acrobat_compatible_atof(const char *s)
{
  int neg = 0;
  unsigned int acc = 0;
  int i;

  while (*s == '-')
  {
    neg = 1;
    ++s;
  }
  while (*s == '+')
  {
    ++s;
  }

  while (*s >= '0' && *s <= '9')
  {
    /* Wraps modulo UINT_MAX+1 on overflow; see the header comment. */
    acc = acc * 10u + (unsigned int)(*s - '0');
    ++s;
  }
  /* Reinterpret the wrapped value as signed (implementation-defined
   * for values above INT_MAX; modular on two's complement targets). */
  i = (int)acc;

  if (*s == '.')
  {
    float v = (float)i;
    float n = 0;
    float d = 1;
    ++s;
    while (*s >= '0' && *s <= '9')
    {
      /* Stop accumulating once another factor of ten could push d
       * past the largest finite float (about 3.4e38). Without this
       * guard a long fraction turns n and/or d into infinity and
       * the result into NaN or a lost fraction. Digits this far out
       * cannot affect a float result anyway; they are still
       * consumed. */
      if (d < 3e37f)
      {
        n = 10 * n + (*s - '0');
        d = 10 * d;
      }
      ++s;
    }
    v += n / d;
    return neg ? -v : v;
  }
  else
  {
    /* Negate in floating point so that the most negative int cannot
     * overflow. */
    return neg ? -(float)i : (float)i;
  }
}

/*
 * Fast but inaccurate atoi.
 *
 * Accepted syntax: any number of leading '-' (the result is negated if
 * at least one is present), then any number of '+', then decimal
 * digits. Parsing stops at the first character that does not fit; an
 * input with no digits yields 0.
 *
 * Overflow is deliberately ignored: the value wraps modulo 2^64. The
 * accumulation and the negation are done in uint64_t, where wrapping
 * is well defined, instead of in int64_t, where overflow (including
 * negating INT64_MIN) is undefined behaviour.
 */
static int64_t fast_atoi(const char *s)
{
  int neg = 0;
  uint64_t acc = 0;

  while (*s == '-')
  {
    neg = 1;
    ++s;
  }
  while (*s == '+')
  {
    ++s;
  }

  while (*s >= '0' && *s <= '9')
  {
    /* We deliberately ignore overflow here. */
    acc = acc * 10u + (uint64_t)(*s - '0');
    ++s;
  }

  if (neg)
    acc = (uint64_t)0 - acc;

  /* Reinterpret as signed (implementation-defined for values above
   * INT64_MAX; modular on two's complement targets). */
  return (int64_t)acc;
}

/*
 * Lex a number whose first byte, c, has already been read by the caller.
 *
 * The token text is collected in buf->scratch (NUL terminated). The
 * return value says how it was interpreted:
 *   PDF_TOK_INT     - value stored in buf->i
 *   PDF_TOK_REAL    - value stored in buf->f
 *   PDF_TOK_KEYWORD - the text contained a second '.' or a character
 *                     that is not part of a number; only the text in
 *                     buf->scratch is meaningful
 *
 * Lexing stops at white space, a delimiter (handed back to the stream)
 * or EOF, or when the scratch buffer is full.
 */
static int
lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
{
  char *s = buf->scratch;
  char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */
  char *isreal = (c == '.' ? s : NULL); /* position of the most recent '.', if any */
  int neg = (c == '-');
  int isbad = 0;

  *s++ = c;

  c = lex_byte(ctx, f);

  /* skip extra '-' signs at start of number */
  if (neg)
  {
    while (c == '-')
      c = lex_byte(ctx, f);
  }

  while (s < e)
  {
    switch (c)
    {
    case IS_WHITE:
    case IS_DELIM:
      fz_unread_byte(ctx, f);
      goto end;
    case EOF:
      goto end;
    case '.':
      /* A second decimal point means this is not a number. */
      if (isreal)
        isbad = 1;
      isreal = s;
      *s++ = c;
      break;
    case '-':
      /* Bug 703248: Some PDFs (particularly those
       * generated by google docs) apparently have
       * numbers like 0.000000000000-5684342 in them.
       * We'll stop our interpretation at the -, but
       * keep reading to skip over the trailing
       * digits so they aren't parsed later. */
      *s++ = '\0';
      break;
    case RANGE_0_9:
      *s++ = c;
      break;
    default:
      isbad = 1;
      *s++ = c;
      break;
    }
    c = lex_byte(ctx, f);
  }

  /* The scratch buffer is full. c holds a look-ahead byte that has been
   * read but neither stored nor examined; hand it back so that it is
   * not silently dropped from the input. */
  if (c != EOF)
    fz_unread_byte(ctx, f);

end:
  *s = '\0';
  if (isbad)
    return PDF_TOK_KEYWORD;
  if (isreal)
  {
    /* We'd like to use the fastest possible atof
     * routine, but we'd rather match acrobats
     * handling of broken numbers. As such, we
     * spot common broken cases and call an
     * acrobat compatible routine where required.
     *
     * The second test fires when ten or more characters precede the
     * last decimal point.
     * NOTE(review): neg is only ever 0 or 1 in this function, so
     * "neg > 1" can never be true as written. It looks as if it was
     * meant to count the extra '-' signs skipped above - confirm the
     * intent before changing it. */
    if (neg > 1 || isreal - buf->scratch >= 10)
      buf->f = acrobat_compatible_atof(buf->scratch);
    else
      buf->f = fz_atof(buf->scratch);
    return PDF_TOK_REAL;
  }
  else
  {
    buf->i = fast_atoi(buf->scratch);
    return PDF_TOK_INT;
  }
}

/*
 * Lex a name (or keyword) into lb->scratch.
 *
 * Bytes are collected until white space, a delimiter or EOF; a
 * terminating white-space/delimiter byte is handed back to the stream.
 * A '#' followed by two hex digits is decoded to the byte it encodes;
 * a malformed escape (or "#00") is kept as a literal '#'.
 *
 * At most 127 bytes are stored. Longer names are truncated with a
 * warning, and the rest of the name is consumed and discarded.
 * On return lb->scratch is NUL terminated and lb->len holds the number
 * of bytes stored.
 */
static void
lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
  char *s = lb->scratch;
  char *e = s + fz_minz(127, lb->size);
  int c;

  while (1)
  {
    if (s == e)
    {
      if (e - lb->scratch < 127)
      {
        /* The buffer itself is smaller than the 127 byte cap: enlarge
         * it. pdf_lexbuf_grow returns how far scratch moved, which is
         * used to re-base s. */
        s += pdf_lexbuf_grow(ctx, lb);
        e = lb->scratch + fz_minz(127, lb->size);
      }
      else
      {
        /* truncate names that are too long */
        fz_warn(ctx, "name is too long");
        /* NOTE(review): this store lands on scratch[127], which is only
         * in bounds if lb->size > 127 - confirm against callers. */
        *s = 0;
        lb->len = s - lb->scratch;
        /* From here on s == NULL marks "truncated": every later store
         * is skipped while the remaining bytes are still consumed. */
        s = NULL;
      }
    }
    c = lex_byte(ctx, f);
    switch (c)
    {
    case IS_WHITE:
    case IS_DELIM:
      fz_unread_byte(ctx, f);
      goto end;
    case EOF:
      goto end;
    case '#':
    {
      /* Hex escape: peek before reading so that a non-hex byte is left
       * in the stream. */
      int hex[2];
      int i;
      for (i = 0; i < 2; i++)
      {
        c = fz_peek_byte(ctx, f);
        switch (c)
        {
        case RANGE_0_9:
          /* "#00" would decode to a NUL byte; treat it as illegal. */
          if (i == 1 && c == '0' && hex[0] == 0)
            goto illegal;
          hex[i] = lex_byte(ctx, f) - '0';
          break;
        case RANGE_a_f:
          hex[i] = lex_byte(ctx, f) - 'a' + 10;
          break;
        case RANGE_A_F:
          hex[i] = lex_byte(ctx, f) - 'A' + 10;
          break;
        default:
          goto illegal;
        case EOF:
          goto illegal_eof;
        }
      }
      if (s) *s++ = (hex[0] << 4) + hex[1];
      break;
illegal:
      /* If the first hex digit was already consumed, hand it back so it
       * is lexed as an ordinary character after the literal '#'. */
      if (i == 1)
        fz_unread_byte(ctx, f);
illegal_eof:
      if (s) *s++ = '#';
      /* Back to the top of the loop so the buffer check runs again. */
      continue;
    }
    default:
      if (s) *s++ = c;
      break;
    }
  }
end:
  /* When the name was truncated (s == NULL) the terminator and lb->len
   * were already written at the truncation point. */
  if (s)
  {
    *s = '\0';
    lb->len = s - lb->scratch;
  }
}

/*
 * Lex a literal string; the opening '(' has already been consumed.
 *
 * Decoded bytes are written to lb->scratch (grown on demand) and their
 * count to lb->len; no terminator is appended. Nested, unescaped
 * parentheses are kept and balanced. Backslash escapes are decoded,
 * a backslash followed by an end-of-line is dropped, and an unescaped
 * end-of-line (\n, \r or \r\n) is stored as a single 0x0a byte.
 *
 * Returns PDF_TOK_STRING, or PDF_TOK_ERROR if EOF is reached before the
 * closing ')'.
 */
static int
lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
  char *out = lb->scratch;
  char *limit = out + lb->size;
  int depth = 1;
  int value;
  int more;
  int c;

  for (;;)
  {
    /* Each pass stores at most one byte, so one check per pass is enough. */
    if (out == limit)
    {
      out += pdf_lexbuf_grow(ctx, lb);
      limit = lb->scratch + lb->size;
    }

    c = lex_byte(ctx, f);

    if (c == EOF)
      return PDF_TOK_ERROR;

    if (c == ')')
    {
      if (--depth == 0)
        break;
      *out++ = c;
      continue;
    }

    if (c == '(')
    {
      ++depth;
      *out++ = c;
      continue;
    }

    /* Bug 708256: PDF 32000-1 says that an unescaped \n, \r or \r\n
     * inside a string is to be read as a single 0x0a byte. */
    if (c == '\n')
    {
      *out++ = 0x0a;
      continue;
    }
    if (c == '\r')
    {
      *out++ = 0x0a;
      c = lex_byte(ctx, f);
      if (c != '\n' && c != EOF)
        fz_unread_byte(ctx, f);
      continue;
    }

    if (c != '\\')
    {
      *out++ = c;
      continue;
    }

    /* Backslash escape. */
    c = lex_byte(ctx, f);
    if (c == EOF)
      return PDF_TOK_ERROR;

    if (c >= '0' && c <= '7')
    {
      /* One to three octal digits; the first non-octal byte is
       * handed back. */
      value = c - '0';
      for (more = 0; more < 2; more++)
      {
        c = lex_byte(ctx, f);
        if (c < '0' || c > '7')
        {
          if (c != EOF)
            fz_unread_byte(ctx, f);
          break;
        }
        value = value * 8 + (c - '0');
      }
      *out++ = value;
    }
    else if (c == '\n')
    {
      /* Escaped line feed: line continuation, nothing stored. */
    }
    else if (c == '\r')
    {
      /* Escaped carriage return, optionally followed by a line feed:
       * line continuation, nothing stored. */
      c = lex_byte(ctx, f);
      if (c != '\n' && c != EOF)
        fz_unread_byte(ctx, f);
    }
    else
    {
      /* Named control escapes; any other escaped byte stands for
       * itself (this covers '(', ')' and '\\'). */
      switch (c)
      {
      case 'n': c = '\n'; break;
      case 'r': c = '\r'; break;
      case 't': c = '\t'; break;
      case 'b': c = '\b'; break;
      case 'f': c = '\f'; break;
      default: break;
      }
      *out++ = c;
    }
  }

  lb->len = out - lb->scratch;
  return PDF_TOK_STRING;
}

/*
 * Lex a hexadecimal string; the opening '<' has already been consumed.
 *
 * Pairs of hex digits are packed into bytes in lb->scratch (grown on
 * demand) and the byte count is stored in lb->len. White space is
 * skipped. Any other non-hex byte produces a warning and is then
 * treated like a hex digit (unhex() gives it the value 0). An odd
 * trailing digit is padded with a zero low nibble.
 *
 * Returns PDF_TOK_STRING, or PDF_TOK_ERROR if EOF is reached before the
 * closing '>'.
 */
static int
lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
  char *out = lb->scratch;
  char *limit = out + lb->size;
  int high = 0;       /* value of the pending high nibble */
  int have_high = 0;  /* non-zero while a high nibble is pending */
  int c;

  for (;;)
  {
    if (out == limit)
    {
      out += pdf_lexbuf_grow(ctx, lb);
      limit = lb->scratch + lb->size;
    }

    c = lex_byte(ctx, f);

    if (c == EOF)
      return PDF_TOK_ERROR;
    if (c == '>')
      break;

    switch (c)
    {
    case IS_WHITE:
      continue;
    case IS_HEX:
      break;
    default:
      fz_warn(ctx, "invalid character in hex string");
      break;
    }

    if (have_high)
      *out++ = high * 16 + unhex(c);
    else
      high = unhex(c);
    have_high = !have_high;
  }

  /* pad truncated string with '0' */
  if (have_high)
    *out++ = high * 16;

  lb->len = out - lb->scratch;
  return PDF_TOK_STRING;
}

/*
 * Classify a NUL-terminated keyword.
 *
 * Returns the dedicated token for one of the reserved words listed
 * below (exact match only). Otherwise returns PDF_TOK_KEYWORD if every
 * character is printable according to fz_isprint(), and PDF_TOK_ERROR
 * if any character is not.
 */
static pdf_token
pdf_token_from_keyword(char *key)
{
  static const struct
  {
    const char *word;
    pdf_token tok;
  } reserved[] = {
    { "R", PDF_TOK_R },
    { "true", PDF_TOK_TRUE },
    { "trailer", PDF_TOK_TRAILER },
    { "false", PDF_TOK_FALSE },
    { "null", PDF_TOK_NULL },
    { "newobj", PDF_TOK_NEWOBJ },
    { "obj", PDF_TOK_OBJ },
    { "endobj", PDF_TOK_ENDOBJ },
    { "endstream", PDF_TOK_ENDSTREAM },
    { "stream", PDF_TOK_STREAM },
    { "startxref", PDF_TOK_STARTXREF },
    { "xref", PDF_TOK_XREF },
  };
  size_t k;
  const char *p;

  for (k = 0; k < sizeof reserved / sizeof reserved[0]; k++)
  {
    /* Compare the first character before paying for a full strcmp. */
    if (reserved[k].word[0] == *key && strcmp(key, reserved[k].word) == 0)
      return reserved[k].tok;
  }

  for (p = key; *p != '\0'; p++)
  {
    if (!fz_isprint(*p))
      return PDF_TOK_ERROR;
  }

  return PDF_TOK_KEYWORD;
}

/*
 * Prepare a lex buffer for use: scratch points at the buffer embedded
 * in the structure, the length is zero, and size and base_size are
 * both set to the given size.
 */
void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size)
{
  lb->scratch = lb->buffer;
  lb->len = 0;
  lb->base_size = size;
  lb->size = lb->base_size;
}

/*
 * Release a lex buffer. scratch is freed only when size differs from
 * base_size, which is how pdf_lexbuf_grow() leaves a buffer it has
 * moved to allocated storage. A NULL lb is accepted.
 */
void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb)
{
  if (lb == NULL)
    return;
  if (lb->size == lb->base_size)
    return;
  fz_free(ctx, lb->scratch);
}

/*
 * Double the capacity of a lex buffer.
 *
 * On the first growth (size still equal to base_size) new storage is
 * allocated and the contents of the embedded buffer are copied into
 * it; later growths reallocate the existing storage.
 *
 * Returns the distance between the new and the old scratch pointer, so
 * that callers can re-base pointers into the buffer by adding it.
 */
ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb)
{
  char *before = lb->scratch;
  size_t doubled = lb->size * 2;
  int already_allocated = (lb->size != lb->base_size);

  if (already_allocated)
  {
    lb->scratch = fz_realloc(ctx, lb->scratch, doubled);
  }
  else
  {
    lb->scratch = Memento_label(fz_malloc(ctx, doubled), "pdf_lexbuf");
    memcpy(lb->scratch, lb->buffer, lb->size);
  }

  lb->size = doubled;
  return lb->scratch - before;
}

/*
 * Read the next token from the stream.
 *
 * White space and comments are skipped. Names, strings, hex strings,
 * numbers and keywords leave their text/value in buf; see the
 * individual lex_* helpers. Returns PDF_TOK_EOF at end of input and
 * PDF_TOK_ERROR for a stray ')' or a '>' that is not part of ">>".
 */
pdf_token
pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
  int ch;
  int next;

  for (;;)
  {
    ch = lex_byte(ctx, f);
    switch (ch)
    {
    /* Nothing to report for these two: skip and read on. */
    case IS_WHITE:
      lex_white(ctx, f);
      continue;
    case '%':
      lex_comment(ctx, f);
      continue;

    case EOF:
      return PDF_TOK_EOF;

    /* Single-character tokens. */
    case '[':
      return PDF_TOK_OPEN_ARRAY;
    case ']':
      return PDF_TOK_CLOSE_ARRAY;
    case '{':
      return PDF_TOK_OPEN_BRACE;
    case '}':
      return PDF_TOK_CLOSE_BRACE;
    case ')':
      return PDF_TOK_ERROR;

    /* "<<" opens a dictionary; a lone '<' opens a hex string. */
    case '<':
      next = lex_byte(ctx, f);
      if (next == '<')
        return PDF_TOK_OPEN_DICT;
      if (next != EOF)
        fz_unread_byte(ctx, f);
      return lex_hex_string(ctx, f, buf);

    /* ">>" closes a dictionary; a lone '>' is an error. */
    case '>':
      next = lex_byte(ctx, f);
      if (next == '>')
        return PDF_TOK_CLOSE_DICT;
      if (next != EOF)
        fz_unread_byte(ctx, f);
      return PDF_TOK_ERROR;

    case '/':
      lex_name(ctx, f, buf);
      return PDF_TOK_NAME;
    case '(':
      return lex_string(ctx, f, buf);
    case IS_NUMBER:
      return lex_number(ctx, f, buf, ch);

    default: /* isregular: !isdelim && !iswhite && c != EOF */
      /* Hand the byte back so lex_name() collects the whole word. */
      fz_unread_byte(ctx, f);
      lex_name(ctx, f, buf);
      return pdf_token_from_keyword(buf->scratch);
    }
  }
}

/*
 * Read the next token from the stream, with strings disallowed.
 *
 * Identical to pdf_lex() except that '(' and a '<' that does not start
 * "<<" yield PDF_TOK_ERROR instead of being lexed as a string or hex
 * string.
 */
pdf_token
pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
  int ch;
  int next;

  for (;;)
  {
    ch = lex_byte(ctx, f);
    switch (ch)
    {
    /* Nothing to report for these two: skip and read on. */
    case IS_WHITE:
      lex_white(ctx, f);
      continue;
    case '%':
      lex_comment(ctx, f);
      continue;

    case EOF:
      return PDF_TOK_EOF;

    /* Single-character tokens. */
    case '[':
      return PDF_TOK_OPEN_ARRAY;
    case ']':
      return PDF_TOK_CLOSE_ARRAY;
    case '{':
      return PDF_TOK_OPEN_BRACE;
    case '}':
      return PDF_TOK_CLOSE_BRACE;

    case '(':
      return PDF_TOK_ERROR; /* no strings allowed */
    case ')':
      return PDF_TOK_ERROR; /* no strings allowed */

    /* "<<" opens a dictionary; a lone '<' would be a hex string. */
    case '<':
      next = lex_byte(ctx, f);
      if (next == '<')
        return PDF_TOK_OPEN_DICT;
      if (next != EOF)
        fz_unread_byte(ctx, f);
      return PDF_TOK_ERROR; /* no strings allowed */

    /* ">>" closes a dictionary; a lone '>' is an error. */
    case '>':
      next = lex_byte(ctx, f);
      if (next == '>')
        return PDF_TOK_CLOSE_DICT;
      if (next != EOF)
        fz_unread_byte(ctx, f);
      return PDF_TOK_ERROR;

    case '/':
      lex_name(ctx, f, buf);
      return PDF_TOK_NAME;
    case IS_NUMBER:
      return lex_number(ctx, f, buf, ch);

    default: /* isregular: !isdelim && !iswhite && c != EOF */
      /* Hand the byte back so lex_name() collects the whole word. */
      fz_unread_byte(ctx, f);
      lex_name(ctx, f, buf);
      return pdf_token_from_keyword(buf->scratch);
    }
  }
}

/*
 * Append the textual form of a token to fzbuf.
 *
 * tok selects the form; buf supplies the text (scratch/len) or value
 * (i/f) for tokens that carry one. Tokens without a dedicated form are
 * written as the raw buf->len bytes of buf->scratch.
 */
void pdf_append_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
{
  if (tok == PDF_TOK_NAME)
  {
    fz_append_printf(ctx, fzbuf, "/%s", buf->scratch);
  }
  else if (tok == PDF_TOK_STRING)
  {
    /* The string lexers do not terminate scratch; make room for a
     * terminator if the buffer is full, then add one. */
    if (buf->len >= buf->size)
      pdf_lexbuf_grow(ctx, buf);
    buf->scratch[buf->len] = 0;
    fz_append_pdf_string(ctx, fzbuf, buf->scratch);
  }
  else if (tok == PDF_TOK_OPEN_DICT)
  {
    fz_append_string(ctx, fzbuf, "<<");
  }
  else if (tok == PDF_TOK_CLOSE_DICT)
  {
    fz_append_string(ctx, fzbuf, ">>");
  }
  else if (tok == PDF_TOK_OPEN_ARRAY)
  {
    fz_append_byte(ctx, fzbuf, '[');
  }
  else if (tok == PDF_TOK_CLOSE_ARRAY)
  {
    fz_append_byte(ctx, fzbuf, ']');
  }
  else if (tok == PDF_TOK_OPEN_BRACE)
  {
    fz_append_byte(ctx, fzbuf, '{');
  }
  else if (tok == PDF_TOK_CLOSE_BRACE)
  {
    fz_append_byte(ctx, fzbuf, '}');
  }
  else if (tok == PDF_TOK_INT)
  {
    fz_append_printf(ctx, fzbuf, "%ld", buf->i);
  }
  else if (tok == PDF_TOK_REAL)
  {
    fz_append_printf(ctx, fzbuf, "%g", buf->f);
  }
  else
  {
    fz_append_data(ctx, fzbuf, buf->scratch, buf->len);
  }
}

Coverage Report

Created: 2025-12-31 07:06

Line	Count	Source
1		// Copyright (C) 2004-2024 Artifex Software, Inc.
2		//
3		// This file is part of MuPDF.
4		//
5		// MuPDF is free software: you can redistribute it and/or modify it under the
6		// terms of the GNU Affero General Public License as published by the Free
7		// Software Foundation, either version 3 of the License, or (at your option)
8		// any later version.
9		//
10		// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11		// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12		// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13		// details.
14		//
15		// You should have received a copy of the GNU Affero General Public License
16		// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17		//
18		// Alternative licensing terms are available from the licensor.
19		// For commercial licensing, see <https://www.artifex.com/> or contact
20		// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21		// CA 94129, USA, for further information.
22
23		#include "mupdf/fitz.h"
24		#include "mupdf/pdf.h"
25
26		#include <string.h>
27
28		#define IS_NUMBER \
29	201k	'+':case'-':case'.':case'0':case'1':case'2':case'3':\
30	264k	case'4':case'5':case'6':case'7':case'8':case'9'
31		#define IS_WHITE \
32	1.18M	'\x00':case'\x09':case'\x0a':case'\x0c':case'\x0d':case'\x20'
33		#define IS_HEX \
34	209k	'0':case'1':case'2':case'3':case'4':case'5':case'6':\
35	264k	case'7':case'8':case'9':case'A':case'B':case'C':\
36	293k	case'D':case'E':case'F':case'a':case'b':case'c':\
37	293k	case'd':case'e':case'f'
38		#define IS_DELIM \
39	578k	'(':case')':case'<':case'>':case'[':case']':case'{':\
40	592k	case'}':case'/':case'%'
41
42		#define RANGE_0_9 \
43	474k	'0':case'1':case'2':case'3':case'4':case'5':\
44	584k	case'6':case'7':case'8':case'9'
45		#define RANGE_a_f \
46	59	'a':case'b':case'c':case'd':case'e':case'f'
47		#define RANGE_A_F \
48	26	'A':case'B':case'C':case'D':case'E':case'F'
49		#define RANGE_0_7 \
50	0	'0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'
51
52		/* #define DUMP_LEXER_STREAM */
53		#ifdef DUMP_LEXER_STREAM
54		static inline int lex_byte(fz_context ctx, fz_stream stm)
55		{
56		int c = fz_read_byte(ctx, stm);
57
58		if (c == EOF)
59		fz_write_printf(ctx, fz_stdout(ctx), "<EOF>");
60		else if (c >= 32 && c < 128)
61		fz_write_printf(ctx, fz_stdout(ctx), "%c", c);
62		else
63		fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", c);
64		return c;
65		}
66		#else
67	8.20M	#define lex_byte(C,S) fz_read_byte(C,S)
68		#endif
69
70		static inline int iswhite(int ch)
71	411k	{
72	411k	return
73	411k	ch == '\000' \|\|
74	59.8k	ch == '\011' \|\|
75	59.7k	ch == '\012' \|\|
76	15.7k	ch == '\014' \|\|
77	15.3k	ch == '\015' \|\|
78	14.3k	ch == '\040';
79	411k	}
80
81		static inline int fz_isprint(int ch)
82	2.16M	{
83	2.16M	return ch >= ' ' && ch <= '~';
84	2.16M	}
85
86		static inline int unhex(int ch)
87	293k	{
88	293k	if (ch >= '0' && ch <= '9') return ch - '0';
89	52.8k	if (ch >= 'A' && ch <= 'F') return ch - 'A' + 0xA;
90	0	if (ch >= 'a' && ch <= 'f') return ch - 'a' + 0xA;
91	0	return 0;
92	0	}
93
94		static void
95		lex_white(fz_context ctx, fz_stream f)
96	631k	{
97	631k	int c;
98	1.03M	do {
99	1.03M	c = lex_byte(ctx, f);
100	1.03M	} while ((c <= 32) && (iswhite(c)));
101	631k	if (c != EOF)
102	631k	fz_unread_byte(ctx, f);
103	631k	}
104
105		static void
106		lex_comment(fz_context ctx, fz_stream f)
107	1.96k	{
108	1.96k	int c;
109	759k	do {
110	759k	c = lex_byte(ctx, f);
111	759k	} while ((c != '\012') && (c != '\015') && (c != EOF));
112	1.96k	}
113
114		/* Fast(ish) but inaccurate strtof, with Adobe overflow handling. */
115		static float acrobat_compatible_atof(char *s)
116	0	{
117	0	int neg = 0;
118	0	int i = 0;
119
120	0	while (*s == '-')
121	0	{
122	0	neg = 1;
123	0	++s;
124	0	}
125	0	while (*s == '+')
126	0	{
127	0	++s;
128	0	}
129
130	0	while (s >= '0' && s <= '9')
131	0	{
132		/* We deliberately ignore overflow here.
133		* Tests show that Acrobat handles * overflows in exactly the same way we do:
134		* 123450000000000000000678 is read as 678.
135		*/
136	0	i = i * 10 + (*s - '0');
137	0	++s;
138	0	}
139
140	0	if (*s == '.')
141	0	{
142	0	float v = i;
143	0	float n = 0;
144	0	float d = 1;
145	0	++s;
146	0	while (s >= '0' && s <= '9')
147	0	{
148	0	n = 10 * n + (*s - '0');
149	0	d = 10 * d;
150	0	++s;
151	0	}
152	0	v += n / d;
153	0	return neg ? -v : v;
154	0	}
155	0	else
156	0	{
157	0	return neg ? -i : i;
158	0	}
159	0	}
160
161		/* Fast but inaccurate atoi. */
162		static int64_t fast_atoi(char *s)
163	150k	{
164	150k	int neg = 0;
165	150k	int64_t i = 0;
166
167	151k	while (*s == '-')
168	351	{
169	351	neg = 1;
170	351	++s;
171	351	}
172	150k	while (*s == '+')
173	5	{
174	5	++s;
175	5	}
176
177	491k	while (s >= '0' && s <= '9')
178	341k	{
179		/* We deliberately ignore overflow here. */
180	341k	i = i * 10 + (*s - '0');
181	341k	++s;
182	341k	}
183
184	150k	return neg ? -i : i;
185	150k	}
186
187		static int
188		lex_number(fz_context ctx, fz_stream f, pdf_lexbuf *buf, int c)
189	264k	{
190	264k	char *s = buf->scratch;
191	264k	char e = buf->scratch + buf->size - 1; / leave space for zero terminator */
192	264k	char *isreal = (c == '.' ? s : NULL);
193	264k	int neg = (c == '-');
194	264k	int isbad = 0;
195
196	264k	*s++ = c;
197
198	264k	c = lex_byte(ctx, f);
199
200		/* skip extra '-' signs at start of number */
201	264k	if (neg)
202	675	{
203	709	while (c == '-')
204	34	c = lex_byte(ctx, f);
205	675	}
206
207	1.36M	while (s < e)
208	1.36M	{
209	1.36M	switch (c)
210	1.36M	{
211	427k	case IS_WHITE:
212	427k	case IS_DELIM:
213	264k	fz_unread_byte(ctx, f);
214	264k	goto end;
215	1	case EOF:
216	1	goto end;
217	310k	case '.':
218	310k	if (isreal)
219	202k	isbad = 1;
220	310k	isreal = s;
221	310k	*s++ = c;
222	310k	break;
223	4.37k	case '-':
224		/* Bug 703248: Some PDFs (particularly those
225		* generated by google docs) apparently have
226		* numbers like 0.000000000000-5684342 in them.
227		* We'll stop our interpretation at the -, but
228		* keep reading to skip over the trailing
229		* digits so they aren't parsed later. */
230	4.37k	*s++ = '\0';
231	4.37k	break;
232	582k	case RANGE_0_9:
233	582k	*s++ = c;
234	582k	break;
235	201k	default:
236	201k	isbad = 1;
237	201k	*s++ = c;
238	201k	break;
239	1.36M	}
240	1.09M	c = lex_byte(ctx, f);
241	1.09M	}
242
243	264k	end:
244	264k	*s = '\0';
245	264k	if (isbad)
246	106k	return PDF_TOK_KEYWORD;
247	157k	if (isreal)
248	6.49k	{
249		/* We'd like to use the fastest possible atof
250		* routine, but we'd rather match acrobats
251		* handling of broken numbers. As such, we
252		* spot common broken cases and call an
253		* acrobat compatible routine where required. */
254	6.49k	if (neg > 1 \|\| isreal - buf->scratch >= 10)
255	0	buf->f = acrobat_compatible_atof(buf->scratch);
256	6.49k	else
257	6.49k	buf->f = fz_atof(buf->scratch);
258	6.49k	return PDF_TOK_REAL;
259	6.49k	}
260	150k	else
261	150k	{
262	150k	buf->i = fast_atoi(buf->scratch);
263	150k	return PDF_TOK_INT;
264	150k	}
265	157k	}
266
267		static void
268		lex_name(fz_context ctx, fz_stream f, pdf_lexbuf *lb)
269	327k	{
270	327k	char *s = lb->scratch;
271	327k	char *e = s + fz_minz(127, lb->size);
272	327k	int c;
273
274	3.25M	while (1)
275	3.25M	{
276	3.25M	if (s == e)
277	20	{
278	20	if (e - lb->scratch < 127)
279	0	{
280	0	s += pdf_lexbuf_grow(ctx, lb);
281	0	e = lb->scratch + fz_minz(127, lb->size);
282	0	}
283	20	else
284	20	{
285		/* truncate names that are too long */
286	20	fz_warn(ctx, "name is too long");
287	20	*s = 0;
288	20	lb->len = s - lb->scratch;
289	20	s = NULL;
290	20	}
291	20	}
292	3.25M	c = lex_byte(ctx, f);
293	3.25M	switch (c)
294	3.25M	{
295	1.12M	case IS_WHITE:
296	1.12M	case IS_DELIM:
297	327k	fz_unread_byte(ctx, f);
298	327k	goto end;
299	2	case EOF:
300	2	goto end;
301	12.3k	case '#':
302	12.3k	{
303	12.3k	int hex[2];
304	12.3k	int i;
305	13.7k	for (i = 0; i < 2; i++)
306	13.7k	{
307	13.7k	c = fz_peek_byte(ctx, f);
308	13.7k	switch (c)
309	13.7k	{
310	12.0k	case RANGE_0_9:
311	12.0k	if (i == 1 && c == '0' && hex[0] == 0)
312	0	goto illegal;
313	1.35k	hex[i] = lex_byte(ctx, f) - '0';
314	1.35k	break;
315	59	case RANGE_a_f:
316	59	hex[i] = lex_byte(ctx, f) - 'a' + 10;
317	59	break;
318	26	case RANGE_A_F:
319	26	hex[i] = lex_byte(ctx, f) - 'A' + 10;
320	26	break;
321	12.3k	default:
322	12.3k	goto illegal;
323	12.3k	case EOF:
324	0	goto illegal_eof;
325	13.7k	}
326	13.7k	}
327	15	if (s) *s++ = (hex[0] << 4) + hex[1];
328	15	break;
329	12.3k	illegal:
330	12.3k	if (i == 1)
331	1.40k	fz_unread_byte(ctx, f);
332	12.3k	illegal_eof:
333	12.3k	if (s) *s++ = '#';
334	12.3k	continue;
335	12.3k	}
336	2.91M	default:
337	2.91M	if (s) *s++ = c;
338	2.91M	break;
339	3.25M	}
340	3.25M	}
341	327k	end:
342	327k	if (s)
343	327k	{
344	327k	*s = '\0';
345	327k	lb->len = s - lb->scratch;
346	327k	}
347	327k	}
348
349		static int
350		lex_string(fz_context ctx, fz_stream f, pdf_lexbuf *lb)
351	289	{
352	289	char *s = lb->scratch;
353	289	char *e = s + lb->size;
354	289	int bal = 1;
355	289	int oct;
356	289	int c;
357
358	3.81k	while (1)
359	3.81k	{
360	3.81k	if (s == e)
361	4	{
362	4	s += pdf_lexbuf_grow(ctx, lb);
363	4	e = lb->scratch + lb->size;
364	4	}
365	3.81k	c = lex_byte(ctx, f);
366	3.81k	switch (c)
367	3.81k	{
368	2	case EOF:
369	2	return PDF_TOK_ERROR;
370	94	case '(':
371	94	bal++;
372	94	*s++ = c;
373	94	break;
374	339	case ')':
375	339	bal --;
376	339	if (bal == 0)
377	287	goto end;
378	52	*s++ = c;
379	52	break;
380	2	case '\\':
381	2	c = lex_byte(ctx, f);
382	2	switch (c)
383	2	{
384	0	case EOF:
385	0	return PDF_TOK_ERROR;
386	0	case 'n':
387	0	*s++ = '\n';
388	0	break;
389	0	case 'r':
390	0	*s++ = '\r';
391	0	break;
392	0	case 't':
393	0	*s++ = '\t';
394	0	break;
395	0	case 'b':
396	0	*s++ = '\b';
397	0	break;
398	0	case 'f':
399	0	*s++ = '\f';
400	0	break;
401	1	case '(':
402	1	*s++ = '(';
403	1	break;
404	1	case ')':
405	1	*s++ = ')';
406	1	break;
407	0	case '\\':
408	0	*s++ = '\\';
409	0	break;
410	0	case RANGE_0_7:
411	0	oct = c - '0';
412	0	c = lex_byte(ctx, f);
413	0	if (c >= '0' && c <= '7')
414	0	{
415	0	oct = oct * 8 + (c - '0');
416	0	c = lex_byte(ctx, f);
417	0	if (c >= '0' && c <= '7')
418	0	oct = oct * 8 + (c - '0');
419	0	else if (c != EOF)
420	0	fz_unread_byte(ctx, f);
421	0	}
422	0	else if (c != EOF)
423	0	fz_unread_byte(ctx, f);
424	0	*s++ = oct;
425	0	break;
426	0	case '\n':
427	0	break;
428	0	case '\r':
429	0	c = lex_byte(ctx, f);
430	0	if ((c != '\n') && (c != EOF))
431	0	fz_unread_byte(ctx, f);
432	0	break;
433	0	default:
434	0	*s++ = c;
435	2	}
436	2	break;
437		/* Bug 708256: PDF 32000-1 says that any occurence of \n, \r, or \r\n in a
438		* (unless escaped with a '\') should be interpreted as a single 0x0a byte. */
439	13	case '\n':
440	13	*s++ = 0x0a;
441	13	break;
442	77	case '\r':
443	77	*s++ = 0x0a;
444	77	c = lex_byte(ctx, f);
445	77	if ((c != '\n') && (c != EOF))
446	3	fz_unread_byte(ctx, f);
447	77	break;
448	3.28k	default:
449	3.28k	*s++ = c;
450	3.28k	break;
451	3.81k	}
452	3.81k	}
453	287	end:
454	287	lb->len = s - lb->scratch;
455	287	return PDF_TOK_STRING;
456	289	}
457
458		static int
459		lex_hex_string(fz_context ctx, fz_stream f, pdf_lexbuf *lb)
460	66.9k	{
461	66.9k	char *s = lb->scratch;
462	66.9k	char *e = s + lb->size;
463	66.9k	int a = 0, x = 0;
464	66.9k	int c;
465
466	360k	while (1)
467	360k	{
468	360k	if (s == e)
469	0	{
470	0	s += pdf_lexbuf_grow(ctx, lb);
471	0	e = lb->scratch + lb->size;
472	0	}
473	360k	c = lex_byte(ctx, f);
474	360k	switch (c)
475	360k	{
476	0	case IS_WHITE:
477	0	break;
478	0	default:
479	0	fz_warn(ctx, "invalid character in hex string");
480		/* fall through */
481	293k	case IS_HEX:
482	293k	if (x)
483	146k	{
484	146k	s++ = a 16 + unhex(c);
485	146k	x = !x;
486	146k	}
487	146k	else
488	146k	{
489	146k	a = unhex(c);
490	146k	x = !x;
491	146k	}
492	293k	break;
493	66.9k	case '>':
494	66.9k	if (x)
495	1	{
496	1	s++ = a 16; /* pad truncated string with '0' */
497	1	}
498	66.9k	goto end;
499	0	case EOF:
500	0	return PDF_TOK_ERROR;
501	360k	}
502	360k	}
503	66.9k	end:
504	66.9k	lb->len = s - lb->scratch;
505	66.9k	return PDF_TOK_STRING;
506	66.9k	}
507
508		static pdf_token
509		pdf_token_from_keyword(char *key)
510	282k	{
511	282k	switch (*key)
512	282k	{
513	22.1k	case 'R':
514	22.1k	if (!strcmp(key, "R")) return PDF_TOK_R;
515	132	break;
516	10.0k	case 't':
517	10.0k	if (!strcmp(key, "true")) return PDF_TOK_TRUE;
518	10.0k	if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER;
519	10.0k	break;
520	10.0k	case 'f':
521	2.67k	if (!strcmp(key, "false")) return PDF_TOK_FALSE;
522	2.26k	break;
523	5.98k	case 'n':
524	5.98k	if (!strcmp(key, "null")) return PDF_TOK_NULL;
525	5.98k	if (!strcmp(key, "newobj")) return PDF_TOK_NEWOBJ;
526	5.98k	break;
527	8.37k	case 'o':
528	8.37k	if (!strcmp(key, "obj")) return PDF_TOK_OBJ;
529	7.01k	break;
530	11.8k	case 'e':
531	11.8k	if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ;
532	10.8k	if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM;
533	10.5k	break;
534	17.3k	case 's':
535	17.3k	if (!strcmp(key, "stream")) return PDF_TOK_STREAM;
536	16.8k	if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF;
537	16.8k	break;
538	16.8k	case 'x':
539	758	if (!strcmp(key, "xref")) return PDF_TOK_XREF;
540	758	break;
541	282k	}
542
543	2.37M	while (*key)
544	2.16M	{
545	2.16M	if (!fz_isprint(*key))
546	43.8k	return PDF_TOK_ERROR;
547	2.12M	++key;
548	2.12M	}
549
550	213k	return PDF_TOK_KEYWORD;
551	257k	}
552
553		void pdf_lexbuf_init(fz_context ctx, pdf_lexbuf lb, int size)
554	27	{
555	27	lb->size = lb->base_size = size;
556	27	lb->len = 0;
557	27	lb->scratch = &lb->buffer[0];
558	27	}
559
560		void pdf_lexbuf_fin(fz_context ctx, pdf_lexbuf lb)
561	27	{
562	27	if (lb && lb->size != lb->base_size)
563	2	fz_free(ctx, lb->scratch);
564	27	}
565
566		ptrdiff_t pdf_lexbuf_grow(fz_context ctx, pdf_lexbuf lb)
567	4	{
568	4	char *old = lb->scratch;
569	4	size_t newsize = lb->size * 2;
570	4	if (lb->size == lb->base_size)
571	2	{
572	2	lb->scratch = Memento_label(fz_malloc(ctx, newsize), "pdf_lexbuf");
573	2	memcpy(lb->scratch, lb->buffer, lb->size);
574	2	}
575	2	else
576	2	{
577	2	lb->scratch = fz_realloc(ctx, lb->scratch, newsize);
578	2	}
579	4	lb->size = newsize;
580	4	return lb->scratch - old;
581	4	}
582
583		pdf_token
584		pdf_lex(fz_context ctx, fz_stream f, pdf_lexbuf *buf)
585	242k	{
586	466k	while (1)
587	466k	{
588	466k	int c = lex_byte(ctx, f);
589	466k	switch (c)
590	466k	{
591	30	case EOF:
592	30	return PDF_TOK_EOF;
593	223k	case IS_WHITE:
594	223k	lex_white(ctx, f);
595	223k	break;
596	34	case '%':
597	34	lex_comment(ctx, f);
598	34	break;
599	39.8k	case '/':
600	39.8k	lex_name(ctx, f, buf);
601	39.8k	return PDF_TOK_NAME;
602	289	case '(':
603	289	return lex_string(ctx, f, buf);
604	213	case ')':
605	213	return PDF_TOK_ERROR;
606	69.9k	case '<':
607	69.9k	c = lex_byte(ctx, f);
608	69.9k	if (c == '<')
609	3.03k	return PDF_TOK_OPEN_DICT;
610	66.9k	if (c != EOF)
611	66.9k	fz_unread_byte(ctx, f);
612	66.9k	return lex_hex_string(ctx, f, buf);
613	2.64k	case '>':
614	2.64k	c = lex_byte(ctx, f);
615	2.64k	if (c == '>')
616	2.64k	return PDF_TOK_CLOSE_DICT;
617	0	if (c != EOF)
618	0	fz_unread_byte(ctx, f);
619	0	return PDF_TOK_ERROR;
620	1.79k	case '[':
621	1.79k	return PDF_TOK_OPEN_ARRAY;
622	1.75k	case ']':
623	1.75k	return PDF_TOK_CLOSE_ARRAY;
624	0	case '{':
625	0	return PDF_TOK_OPEN_BRACE;
626	0	case '}':
627	0	return PDF_TOK_CLOSE_BRACE;
628	99.7k	case IS_NUMBER:
629	99.7k	return lex_number(ctx, f, buf, c);
630	25.7k	default: /* isregular: !isdelim && !iswhite && c != EOF */
631	25.7k	fz_unread_byte(ctx, f);
632	25.7k	lex_name(ctx, f, buf);
633	25.7k	return pdf_token_from_keyword(buf->scratch);
634	466k	}
635	466k	}
636	242k	}
637
638		pdf_token
639		pdf_lex_no_string(fz_context ctx, fz_stream f, pdf_lexbuf *buf)
640	462k	{
641	872k	while (1)
642	872k	{
643	872k	int c = lex_byte(ctx, f);
644	872k	switch (c)
645	872k	{
646	5	case EOF:
647	5	return PDF_TOK_EOF;
648	407k	case IS_WHITE:
649	407k	lex_white(ctx, f);
650	407k	break;
651	1.93k	case '%':
652	1.93k	lex_comment(ctx, f);
653	1.93k	break;
654	5.19k	case '/':
655	5.19k	lex_name(ctx, f, buf);
656	5.19k	return PDF_TOK_NAME;
657	1.92k	case '(':
658	1.92k	return PDF_TOK_ERROR; /* no strings allowed */
659	1.90k	case ')':
660	1.90k	return PDF_TOK_ERROR; /* no strings allowed */
661	4.58k	case '<':
662	4.58k	c = lex_byte(ctx, f);
663	4.58k	if (c == '<')
664	19	return PDF_TOK_OPEN_DICT;
665	4.56k	if (c != EOF)
666	4.56k	fz_unread_byte(ctx, f);
667	4.56k	return PDF_TOK_ERROR; /* no strings allowed */
668	6.35k	case '>':
669	6.35k	c = lex_byte(ctx, f);
670	6.35k	if (c == '>')
671	72	return PDF_TOK_CLOSE_DICT;
672	6.28k	if (c != EOF)
673	6.28k	fz_unread_byte(ctx, f);
674	6.28k	return PDF_TOK_ERROR;
675	8.73k	case '[':
676	8.73k	return PDF_TOK_OPEN_ARRAY;
677	8.82k	case ']':
678	8.82k	return PDF_TOK_CLOSE_ARRAY;
679	1.84k	case '{':
680	1.84k	return PDF_TOK_OPEN_BRACE;
681	1.82k	case '}':
682	1.82k	return PDF_TOK_CLOSE_BRACE;
683	164k	case IS_NUMBER:
684	164k	return lex_number(ctx, f, buf, c);
685	257k	default: /* isregular: !isdelim && !iswhite && c != EOF */
686	257k	fz_unread_byte(ctx, f);
687	257k	lex_name(ctx, f, buf);
688	257k	return pdf_token_from_keyword(buf->scratch);
689	872k	}
690	872k	}
691	462k	}
692
693		void pdf_append_token(fz_context ctx, fz_buffer fzbuf, int tok, pdf_lexbuf *buf)
694	0	{
695	0	switch (tok)
696	0	{
697	0	case PDF_TOK_NAME:
698	0	fz_append_printf(ctx, fzbuf, "/%s", buf->scratch);
699	0	break;
700	0	case PDF_TOK_STRING:
701	0	if (buf->len >= buf->size)
702	0	pdf_lexbuf_grow(ctx, buf);
703	0	buf->scratch[buf->len] = 0;
704	0	fz_append_pdf_string(ctx, fzbuf, buf->scratch);
705	0	break;
706	0	case PDF_TOK_OPEN_DICT:
707	0	fz_append_string(ctx, fzbuf, "<<");
708	0	break;
709	0	case PDF_TOK_CLOSE_DICT:
710	0	fz_append_string(ctx, fzbuf, ">>");
711	0	break;
712	0	case PDF_TOK_OPEN_ARRAY:
713	0	fz_append_byte(ctx, fzbuf, '[');
714	0	break;
715	0	case PDF_TOK_CLOSE_ARRAY:
716	0	fz_append_byte(ctx, fzbuf, ']');
717	0	break;
718	0	case PDF_TOK_OPEN_BRACE:
719	0	fz_append_byte(ctx, fzbuf, '{');
720	0	break;
721	0	case PDF_TOK_CLOSE_BRACE:
722	0	fz_append_byte(ctx, fzbuf, '}');
723	0	break;
724	0	case PDF_TOK_INT:
725	0	fz_append_printf(ctx, fzbuf, "%ld", buf->i);
726	0	break;
727	0	case PDF_TOK_REAL:
728	0	fz_append_printf(ctx, fzbuf, "%g", buf->f);
729	0	break;
730	0	default:
731	0	fz_append_data(ctx, fzbuf, buf->scratch, buf->len);
732	0	break;
733	0	}
734	0	}