/src/mupdf/source/pdf/pdf-lex.c

Source (jump to first uncovered line)
// Copyright (C) 2004-2024 Artifex Software, Inc.
//
// This file is part of MuPDF.
//
// MuPDF is free software: you can redistribute it and/or modify it under the
// terms of the GNU Affero General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
// details.
//
// You should have received a copy of the GNU Affero General Public License
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
//
// Alternative licensing terms are available from the licensor.
// For commercial licensing, see <https://www.artifex.com/> or contact
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
// CA 94129, USA, for further information.

#include "mupdf/fitz.h"
#include "mupdf/pdf.h"

#include <string.h>

/*
 * Case-label helper macros.
 *
 * Each macro expands to a run of character constants joined by `:case`,
 * with no leading `case` keyword and no trailing colon. They are therefore
 * only usable as switch labels, written as e.g. `case IS_WHITE:`.
 */

/* Bytes that may start a number: sign, decimal point, or decimal digit. */
#define IS_NUMBER \
  '+':case'-':case'.':case'0':case'1':case'2':case'3':\
  case'4':case'5':case'6':case'7':case'8':case'9'
/* White-space bytes: NUL, TAB, LF, FF, CR and SPACE. */
#define IS_WHITE \
  '\x00':case'\x09':case'\x0a':case'\x0c':case'\x0d':case'\x20'
/* Hexadecimal digits in either letter case. */
#define IS_HEX \
  '0':case'1':case'2':case'3':case'4':case'5':case'6':\
  case'7':case'8':case'9':case'A':case'B':case'C':\
  case'D':case'E':case'F':case'a':case'b':case'c':\
  case'd':case'e':case'f'
/* Delimiter bytes that terminate names, keywords and numbers. */
#define IS_DELIM \
  '(':case')':case'<':case'>':case'[':case']':case'{':\
  case'}':case'/':case'%'

/* Decimal digits. */
#define RANGE_0_9 \
  '0':case'1':case'2':case'3':case'4':case'5':\
  case'6':case'7':case'8':case'9'
/* Lower-case hexadecimal letters. */
#define RANGE_a_f \
  'a':case'b':case'c':case'd':case'e':case'f'
/* Upper-case hexadecimal letters. */
#define RANGE_A_F \
  'A':case'B':case'C':case'D':case'E':case'F'
/* Octal digits. */
#define RANGE_0_7 \
  '0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'

/*
 * lex_byte reads the next byte from a stream on behalf of the lexer.
 *
 * Normally it is a plain macro alias for fz_read_byte. Enable the define
 * below to replace it with a function that also echoes every byte read to
 * stdout (see the function body for the exact output format).
 */
/* #define DUMP_LEXER_STREAM */
#ifdef DUMP_LEXER_STREAM
static inline int lex_byte(fz_context *ctx, fz_stream *stm)
{
  int c = fz_read_byte(ctx, stm);

  /* Echo: "<EOF>" at end of input, the character itself for 32..127,
   * and "<xx>" (two hex digits) for everything else. */
  if (c == EOF)
    fz_write_printf(ctx, fz_stdout(ctx), "<EOF>");
  else if (c >= 32 && c < 128)
    fz_write_printf(ctx, fz_stdout(ctx), "%c", c);
  else
    fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", c);
  return c;
}
#else
#define lex_byte(C,S) fz_read_byte(C,S)
#endif

/* Return 1 if ch is one of the six white-space bytes
 * (NUL, TAB, LF, FF, CR, SPACE), otherwise 0. */
static inline int iswhite(int ch)
{
  switch (ch)
  {
  case 0x00: /* NUL */
  case 0x09: /* TAB */
  case 0x0a: /* LF */
  case 0x0c: /* FF */
  case 0x0d: /* CR */
  case 0x20: /* SPACE */
    return 1;
  default:
    return 0;
  }
}

/* Return 1 if ch lies in the printable ASCII range ' ' .. '~', otherwise 0. */
static inline int fz_isprint(int ch)
{
  if (ch < ' ')
    return 0;
  if (ch > '~')
    return 0;
  return 1;
}

/* Convert one hexadecimal digit (either letter case) to its value 0..15.
 * Any other input yields 0. */
static inline int unhex(int ch)
{
  int value = 0;

  if ('0' <= ch && ch <= '9')
    value = ch - '0';
  else if ('A' <= ch && ch <= 'F')
    value = 10 + (ch - 'A');
  else if ('a' <= ch && ch <= 'f')
    value = 10 + (ch - 'a');

  return value;
}

/* Consume a run of white-space bytes. The first non-white byte is pushed
 * back onto the stream, unless the run ended at EOF. */
static void
lex_white(fz_context *ctx, fz_stream *f)
{
  for (;;)
  {
    int ch = lex_byte(ctx, f);

    if (ch > 32 || !iswhite(ch))
    {
      if (ch != EOF)
        fz_unread_byte(ctx, f);
      return;
    }
  }
}

/* Discard bytes up to and including the first LF or CR, or until EOF. */
static void
lex_comment(fz_context *ctx, fz_stream *f)
{
  for (;;)
  {
    int ch = lex_byte(ctx, f);

    if (ch == '\012' || ch == '\015' || ch == EOF)
      return;
  }
}

/*
 * Fast(ish) but inaccurate strtof, with Adobe overflow handling.
 *
 * Accepts any number of leading '-' (one or more makes the result negative),
 * then any number of '+', then decimal digits, then an optional '.' followed
 * by more decimal digits. Parsing stops at the first byte that does not fit.
 *
 * The integer part deliberately wraps around on overflow rather than
 * saturating. The wraparound is done in unsigned arithmetic so that it is
 * well defined instead of relying on signed overflow.
 */
static float acrobat_compatible_atof(char *s)
{
  /* 10^38 is the largest power of ten below FLT_MAX for an IEEE single
   * precision float (assumed here). Accumulating more fractional digits
   * than this would overflow the divisor to infinity and turn the
   * fraction into 0 or NaN. */
  enum { MAX_FRACTION_DIGITS = 38 };
  int neg = 0;
  unsigned int i = 0;

  while (*s == '-')
  {
    neg = 1;
    ++s;
  }
  while (*s == '+')
  {
    ++s;
  }

  while (*s >= '0' && *s <= '9')
  {
    /* We deliberately ignore overflow here: the value simply wraps.
     * Tests show that Acrobat handles overflows in the same way. */
    i = i * 10u + (unsigned int)(*s - '0');
    ++s;
  }

  if (*s == '.')
  {
    /* Reinterpret the wrapped value as a signed int, as before. */
    float v = (float)(int)i;
    float n = 0;
    float d = 1;
    int digits = 0;
    ++s;
    while (*s >= '0' && *s <= '9')
    {
      /* Digits beyond the cap are far below float precision;
       * skip over them without touching n or d. */
      if (digits < MAX_FRACTION_DIGITS)
      {
        n = 10 * n + (*s - '0');
        d = 10 * d;
        ++digits;
      }
      ++s;
    }
    v += n / d;
    return neg ? -v : v;
  }
  else
  {
    /* Negate in unsigned arithmetic so that the most negative value
     * does not overflow. */
    return (float)(int)(neg ? 0u - i : i);
  }
}

/*
 * Fast but inaccurate atoi.
 *
 * Accepts any number of leading '-' (one or more makes the result negative),
 * then any number of '+', then decimal digits. Parsing stops at the first
 * non-digit. Values that do not fit in 64 bits wrap around; the arithmetic
 * is done in uint64_t so that the wraparound is well defined rather than
 * signed-overflow undefined behaviour.
 */
static int64_t fast_atoi(char *s)
{
  int neg = 0;
  uint64_t i = 0;

  while (*s == '-')
  {
    neg = 1;
    ++s;
  }
  while (*s == '+')
  {
    ++s;
  }

  while (*s >= '0' && *s <= '9')
  {
    /* We deliberately ignore overflow here: the value wraps modulo 2^64. */
    i = i * 10u + (uint64_t)(*s - '0');
    ++s;
  }

  /* Negate while still unsigned so that INT64_MIN round-trips safely. */
  return (int64_t)(neg ? (uint64_t)0 - i : i);
}

/*
 * Lex a numeric token whose first byte, c, has already been read by the
 * caller (one of '+', '-', '.' or a digit).
 *
 * The token text is collected in buf->scratch and NUL-terminated. Lexing
 * stops at white space or a delimiter (which is pushed back), at EOF, or
 * when the scratch buffer is full.
 *
 * Returns:
 *   PDF_TOK_KEYWORD if the token contained a non-numeric byte or a second '.';
 *   PDF_TOK_REAL with the value in buf->f if it contained a '.';
 *   PDF_TOK_INT with the value in buf->i otherwise.
 */
static int
lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
{
  char *s = buf->scratch;
  char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */
  /* Position of the most recent '.', or NULL if none has been seen. */
  char *isreal = (c == '.' ? s : NULL);
  int neg = (c == '-');
  int isbad = 0;

  *s++ = c;

  c = lex_byte(ctx, f);

  /* skip extra '-' signs at start of number */
  if (neg)
  {
    while (c == '-')
      c = lex_byte(ctx, f);
  }

  while (s < e)
  {
    switch (c)
    {
    case IS_WHITE:
    case IS_DELIM:
      fz_unread_byte(ctx, f);
      goto end;
    case EOF:
      goto end;
    case '.':
      if (isreal)
        isbad = 1;
      isreal = s;
      *s++ = c;
      break;
    case '-':
      /* Bug 703248: Some PDFs (particularly those
       * generated by google docs) apparently have
       * numbers like 0.000000000000-5684342 in them.
       * We'll stop our interpretation at the -, but
       * keep reading to skip over the trailing
       * digits so they aren't parsed later. */
      *s++ = '\0';
      break;
    case RANGE_0_9:
      *s++ = c;
      break;
    default:
      isbad = 1;
      *s++ = c;
      break;
    }
    c = lex_byte(ctx, f);
  }

  /* The scratch buffer filled up before the token ended. The byte now in c
   * has been read but not stored; push it back so that it is not silently
   * lost from the input. */
  if (c != EOF)
    fz_unread_byte(ctx, f);

end:
  *s = '\0';
  if (isbad)
    return PDF_TOK_KEYWORD;
  if (isreal)
  {
    /* We'd like to use the fastest possible atof
     * routine, but we'd rather match acrobats
     * handling of broken numbers. As such, we
     * spot common broken cases and call an
     * acrobat compatible routine where required.
     *
     * The only such case detected here is ten or more bytes before
     * the decimal point. (neg is only ever 0 or 1, so the former
     * "neg > 1" test could never fire and has been dropped.) */
    if (isreal - buf->scratch >= 10)
      buf->f = acrobat_compatible_atof(buf->scratch);
    else
      buf->f = fz_atof(buf->scratch);
    return PDF_TOK_REAL;
  }
  else
  {
    buf->i = fast_atoi(buf->scratch);
    return PDF_TOK_INT;
  }
}

/*
 * Lex the body of a name or keyword into lb->scratch.
 *
 * Bytes are collected until white space or a delimiter (which is pushed
 * back) or EOF. A '#' followed by two hex digits is decoded into a single
 * byte, except for "#00"; an incomplete or rejected escape leaves a literal
 * '#' in the output. At most 127 bytes are kept: once that limit is hit a
 * warning is issued and the rest of the token is read and discarded.
 *
 * On return lb->scratch is NUL-terminated and lb->len holds its length.
 */
static void
lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
  char *s = lb->scratch;
  /* End of the usable area: 127 bytes, or the buffer size if smaller. */
  char *e = s + fz_minz(127, lb->size);
  int c;

  while (1)
  {
    if (s == e)
    {
      if (e - lb->scratch < 127)
      {
        /* The buffer, not the name limit, is what ran out: grow it
         * and rebase s by the amount the storage moved. */
        s += pdf_lexbuf_grow(ctx, lb);
        e = lb->scratch + fz_minz(127, lb->size);
      }
      else
      {
        /* truncate names that are too long */
        fz_warn(ctx, "name is too long");
        *s = 0;
        lb->len = s - lb->scratch;
        /* From here on s == NULL means "keep reading but store nothing". */
        s = NULL;
      }
    }
    c = lex_byte(ctx, f);
    switch (c)
    {
    case IS_WHITE:
    case IS_DELIM:
      fz_unread_byte(ctx, f);
      goto end;
    case EOF:
      goto end;
    case '#':
    {
      int hex[2];
      int i;
      for (i = 0; i < 2; i++)
      {
        /* Peek first so that a non-hex byte is left in the stream. */
        c = fz_peek_byte(ctx, f);
        switch (c)
        {
        case RANGE_0_9:
          /* "#00" would decode to a NUL byte; reject it. */
          if (i == 1 && c == '0' && hex[0] == 0)
            goto illegal;
          hex[i] = lex_byte(ctx, f) - '0';
          break;
        case RANGE_a_f:
          hex[i] = lex_byte(ctx, f) - 'a' + 10;
          break;
        case RANGE_A_F:
          hex[i] = lex_byte(ctx, f) - 'A' + 10;
          break;
        default:
          goto illegal;
        case EOF:
          goto illegal_eof;
        }
      }
      if (s) *s++ = (hex[0] << 4) + hex[1];
      break;
illegal:
      /* The first hex digit was already consumed; push it back so it is
       * lexed as an ordinary character after the literal '#'. */
      if (i == 1)
        fz_unread_byte(ctx, f);
illegal_eof:
      if (s) *s++ = '#';
      continue;
    }
    default:
      if (s) *s++ = c;
      break;
    }
  }
end:
  /* If the name was truncated (s == NULL), the terminator and lb->len
   * were already written at the point of truncation. */
  if (s)
  {
    *s = '\0';
    lb->len = s - lb->scratch;
  }
}

/*
 * Lex a literal string; the opening '(' has already been consumed.
 *
 * Nested unescaped parentheses are balanced, backslash escapes are decoded,
 * and unescaped end-of-line sequences are normalised to a single 0x0a. The
 * scratch buffer is grown as needed.
 *
 * Returns PDF_TOK_STRING with the decoded bytes in lb->scratch and their
 * count in lb->len (no NUL terminator is written here), or PDF_TOK_ERROR if
 * EOF is reached before the closing ')' (lb->len is not updated then).
 */
static int
lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
  char *s = lb->scratch;
  char *e = s + lb->size;
  /* Parenthesis nesting depth; starts at 1 for the already-read '('. */
  int bal = 1;
  int oct;
  int c;

  while (1)
  {
    /* Every iteration stores at most one byte, so one check suffices. */
    if (s == e)
    {
      s += pdf_lexbuf_grow(ctx, lb);
      e = lb->scratch + lb->size;
    }
    c = lex_byte(ctx, f);
    switch (c)
    {
    case EOF:
      return PDF_TOK_ERROR;
    case '(':
      bal++;
      *s++ = c;
      break;
    case ')':
      bal --;
      if (bal == 0)
        goto end;
      *s++ = c;
      break;
    case '\\':
      c = lex_byte(ctx, f);
      switch (c)
      {
      case EOF:
        return PDF_TOK_ERROR;
      case 'n':
        *s++ = '\n';
        break;
      case 'r':
        *s++ = '\r';
        break;
      case 't':
        *s++ = '\t';
        break;
      case 'b':
        *s++ = '\b';
        break;
      case 'f':
        *s++ = '\f';
        break;
      case '(':
        *s++ = '(';
        break;
      case ')':
        *s++ = ')';
        break;
      case '\\':
        *s++ = '\\';
        break;
      case RANGE_0_7:
        /* Octal escape of one to three digits; a following byte that
         * is not an octal digit is pushed back. */
        oct = c - '0';
        c = lex_byte(ctx, f);
        if (c >= '0' && c <= '7')
        {
          oct = oct * 8 + (c - '0');
          c = lex_byte(ctx, f);
          if (c >= '0' && c <= '7')
            oct = oct * 8 + (c - '0');
          else if (c != EOF)
            fz_unread_byte(ctx, f);
        }
        else if (c != EOF)
          fz_unread_byte(ctx, f);
        *s++ = oct;
        break;
      /* Backslash followed by an end-of-line (LF, CR or CR LF) is a
       * line continuation: nothing is stored. */
      case '\n':
        break;
      case '\r':
        c = lex_byte(ctx, f);
        if ((c != '\n') && (c != EOF))
          fz_unread_byte(ctx, f);
        break;
      default:
        /* Unknown escape: the backslash is dropped and the byte kept. */
        *s++ = c;
      }
      break;
    /* Bug 708256: PDF 32000-1 says that any occurrence of \n, \r, or \r\n in a
     * string (unless escaped with a '\') should be interpreted as a single 0x0a byte. */
    case '\n':
      *s++ = 0x0a;
      break;
    case '\r':
      *s++ = 0x0a;
      c = lex_byte(ctx, f);
      if ((c != '\n') && (c != EOF))
        fz_unread_byte(ctx, f);
      break;
    default:
      *s++ = c;
      break;
    }
  }
end:
  lb->len = s - lb->scratch;
  return PDF_TOK_STRING;
}

/*
 * Lex a hexadecimal string; the opening '<' has already been consumed.
 *
 * Pairs of hex digits are packed into bytes in lb->scratch, which is grown
 * as needed. White space is skipped. Any other non-hex byte produces a
 * warning and is then treated as a hex digit (unhex maps it to 0).
 *
 * Returns PDF_TOK_STRING with the byte count in lb->len (no NUL terminator
 * is written here), or PDF_TOK_ERROR if EOF is reached before '>'.
 */
static int
lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
  char *s = lb->scratch;
  char *e = s + lb->size;
  /* a: pending high nibble; x: non-zero while a high nibble is pending. */
  int a = 0, x = 0;
  int c;

  while (1)
  {
    if (s == e)
    {
      s += pdf_lexbuf_grow(ctx, lb);
      e = lb->scratch + lb->size;
    }
    c = lex_byte(ctx, f);
    switch (c)
    {
    case IS_WHITE:
      break;
    /* default is deliberately placed before the hex case so that, after
     * warning, an invalid byte falls through and is handled as a digit. */
    default:
      fz_warn(ctx, "invalid character in hex string");
      /* fall through */
    case IS_HEX:
      if (x)
      {
        *s++ = a * 16 + unhex(c);
        x = !x;
      }
      else
      {
        a = unhex(c);
        x = !x;
      }
      break;
    case '>':
      if (x)
      {
        *s++ = a * 16; /* pad truncated string with '0' */
      }
      goto end;
    case EOF:
      return PDF_TOK_ERROR;
    }
  }
end:
  lb->len = s - lb->scratch;
  return PDF_TOK_STRING;
}

/*
 * Classify a NUL-terminated keyword.
 *
 * Reserved words map to their dedicated tokens. Anything else is
 * PDF_TOK_KEYWORD if every byte is printable ASCII, and PDF_TOK_ERROR
 * otherwise.
 */
static pdf_token
pdf_token_from_keyword(char *key)
{
  /* Text after the first character. It is only examined in branches where
   * key[0] is a letter, so it never reads past the terminator. */
  const char *tail = key + 1;
  const char *p;

  switch (key[0])
  {
  case 'R':
    if (tail[0] == '\0') return PDF_TOK_R;
    break;
  case 'e':
    if (strcmp(tail, "ndobj") == 0) return PDF_TOK_ENDOBJ;
    if (strcmp(tail, "ndstream") == 0) return PDF_TOK_ENDSTREAM;
    break;
  case 'f':
    if (strcmp(tail, "alse") == 0) return PDF_TOK_FALSE;
    break;
  case 'n':
    if (strcmp(tail, "ull") == 0) return PDF_TOK_NULL;
    if (strcmp(tail, "ewobj") == 0) return PDF_TOK_NEWOBJ;
    break;
  case 'o':
    if (strcmp(tail, "bj") == 0) return PDF_TOK_OBJ;
    break;
  case 's':
    if (strcmp(tail, "tream") == 0) return PDF_TOK_STREAM;
    if (strcmp(tail, "tartxref") == 0) return PDF_TOK_STARTXREF;
    break;
  case 't':
    if (strcmp(tail, "rue") == 0) return PDF_TOK_TRUE;
    if (strcmp(tail, "railer") == 0) return PDF_TOK_TRAILER;
    break;
  case 'x':
    if (strcmp(tail, "ref") == 0) return PDF_TOK_XREF;
    break;
  default:
    break;
  }

  /* Not a reserved word: accept it only if it is entirely printable. */
  for (p = key; *p != '\0'; ++p)
  {
    if (!fz_isprint(*p))
      return PDF_TOK_ERROR;
  }

  return PDF_TOK_KEYWORD;
}

/* Prepare a lexer buffer for use: point scratch at the embedded buffer,
 * record size as both the base and current capacity, and clear the length. */
void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size)
{
  lb->scratch = &lb->buffer[0];
  lb->len = 0;
  lb->base_size = size;
  lb->size = lb->base_size;
}

/* Release the scratch storage of a lexer buffer, but only if it has been
 * grown beyond its base size. A NULL lb is ignored. */
void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb)
{
  if (!lb)
    return;
  if (lb->size == lb->base_size)
    return; /* never grown: scratch was not separately allocated */
  fz_free(ctx, lb->scratch);
}

/*
 * Double the capacity of a lexer buffer.
 *
 * The first growth (size still equal to base_size) allocates fresh storage
 * and copies the contents of the embedded buffer into it; later growths
 * reallocate the existing storage.
 *
 * Returns the distance from the old scratch pointer to the new one, so
 * that callers can rebase pointers they hold into the buffer.
 */
ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb)
{
  char *before = lb->scratch;
  size_t newsize = lb->size * 2;
  int first_growth = (lb->size == lb->base_size);

  if (!first_growth)
  {
    lb->scratch = fz_realloc(ctx, lb->scratch, newsize);
  }
  else
  {
    lb->scratch = Memento_label(fz_malloc(ctx, newsize), "pdf_lexbuf");
    memcpy(lb->scratch, lb->buffer, lb->size);
  }

  lb->size = newsize;
  return lb->scratch - before;
}

/*
 * Read the next token from f.
 *
 * White space and comments are skipped. Names, keywords, strings and
 * numbers leave their payload in buf. Returns PDF_TOK_EOF at end of input
 * and PDF_TOK_ERROR for a stray ')' or a lone '>'.
 */
pdf_token
pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
  for (;;)
  {
    int next;
    int ch = lex_byte(ctx, f);

    switch (ch)
    {
    /* Skippable input: go round again for the next byte. */
    case IS_WHITE:
      lex_white(ctx, f);
      continue;
    case '%':
      lex_comment(ctx, f);
      continue;

    case EOF:
      return PDF_TOK_EOF;

    /* Single-byte tokens. */
    case '[':
      return PDF_TOK_OPEN_ARRAY;
    case ']':
      return PDF_TOK_CLOSE_ARRAY;
    case '{':
      return PDF_TOK_OPEN_BRACE;
    case '}':
      return PDF_TOK_CLOSE_BRACE;
    case ')':
      return PDF_TOK_ERROR;

    /* "<<" opens a dictionary; a single '<' starts a hex string. */
    case '<':
      next = lex_byte(ctx, f);
      if (next == '<')
        return PDF_TOK_OPEN_DICT;
      if (next != EOF)
        fz_unread_byte(ctx, f);
      return lex_hex_string(ctx, f, buf);

    /* ">>" closes a dictionary; a single '>' is an error. */
    case '>':
      next = lex_byte(ctx, f);
      if (next == '>')
        return PDF_TOK_CLOSE_DICT;
      if (next != EOF)
        fz_unread_byte(ctx, f);
      return PDF_TOK_ERROR;

    case '(':
      return lex_string(ctx, f, buf);
    case '/':
      lex_name(ctx, f, buf);
      return PDF_TOK_NAME;
    case IS_NUMBER:
      return lex_number(ctx, f, buf, ch);

    default: /* isregular: !isdelim && !iswhite && c != EOF */
      /* Put the first byte back so lex_name sees the whole keyword. */
      fz_unread_byte(ctx, f);
      lex_name(ctx, f, buf);
      return pdf_token_from_keyword(buf->scratch);
    }
  }
}

/*
 * Read the next token from f, refusing to lex strings.
 *
 * Behaves like a tokenizer for names, keywords, numbers and structural
 * delimiters, but '(' , ')' and a single '<' all yield PDF_TOK_ERROR
 * instead of starting a literal or hex string.
 */
pdf_token
pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
  for (;;)
  {
    int next;
    int ch = lex_byte(ctx, f);

    switch (ch)
    {
    /* Skippable input: go round again for the next byte. */
    case IS_WHITE:
      lex_white(ctx, f);
      continue;
    case '%':
      lex_comment(ctx, f);
      continue;

    case EOF:
      return PDF_TOK_EOF;

    /* Single-byte tokens. */
    case '[':
      return PDF_TOK_OPEN_ARRAY;
    case ']':
      return PDF_TOK_CLOSE_ARRAY;
    case '{':
      return PDF_TOK_OPEN_BRACE;
    case '}':
      return PDF_TOK_CLOSE_BRACE;

    case '(':
      return PDF_TOK_ERROR; /* no strings allowed */
    case ')':
      return PDF_TOK_ERROR; /* no strings allowed */

    /* "<<" opens a dictionary; a single '<' would be a hex string. */
    case '<':
      next = lex_byte(ctx, f);
      if (next == '<')
        return PDF_TOK_OPEN_DICT;
      if (next != EOF)
        fz_unread_byte(ctx, f);
      return PDF_TOK_ERROR; /* no strings allowed */

    /* ">>" closes a dictionary; a single '>' is an error. */
    case '>':
      next = lex_byte(ctx, f);
      if (next == '>')
        return PDF_TOK_CLOSE_DICT;
      if (next != EOF)
        fz_unread_byte(ctx, f);
      return PDF_TOK_ERROR;

    case '/':
      lex_name(ctx, f, buf);
      return PDF_TOK_NAME;
    case IS_NUMBER:
      return lex_number(ctx, f, buf, ch);

    default: /* isregular: !isdelim && !iswhite && c != EOF */
      /* Put the first byte back so lex_name sees the whole keyword. */
      fz_unread_byte(ctx, f);
      lex_name(ctx, f, buf);
      return pdf_token_from_keyword(buf->scratch);
    }
  }
}

/*
 * Append the textual form of a token to fzbuf.
 *
 * tok selects the form; buf supplies the payload (scratch/len for names,
 * strings and fallback tokens, i for integers, f for reals).
 */
void pdf_append_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
{
  switch (tok)
  {
  case PDF_TOK_NAME:
    /* Written as '/' plus scratch as a C string. */
    fz_append_printf(ctx, fzbuf, "/%s", buf->scratch);
    break;
  case PDF_TOK_STRING:
    /* Make sure there is room for, then write, a NUL terminator at
     * scratch[len] before handing the buffer over as a C string. */
    if (buf->len >= buf->size)
      pdf_lexbuf_grow(ctx, buf);
    buf->scratch[buf->len] = 0;
    /* NOTE(review): only a char pointer is passed, so a string containing
     * an embedded NUL byte is presumably cut short here -- confirm. */
    fz_append_pdf_string(ctx, fzbuf, buf->scratch);
    break;
  case PDF_TOK_OPEN_DICT:
    fz_append_string(ctx, fzbuf, "<<");
    break;
  case PDF_TOK_CLOSE_DICT:
    fz_append_string(ctx, fzbuf, ">>");
    break;
  case PDF_TOK_OPEN_ARRAY:
    fz_append_byte(ctx, fzbuf, '[');
    break;
  case PDF_TOK_CLOSE_ARRAY:
    fz_append_byte(ctx, fzbuf, ']');
    break;
  case PDF_TOK_OPEN_BRACE:
    fz_append_byte(ctx, fzbuf, '{');
    break;
  case PDF_TOK_CLOSE_BRACE:
    fz_append_byte(ctx, fzbuf, '}');
    break;
  case PDF_TOK_INT:
    /* NOTE(review): buf->i is filled from an int64_t in this file; this
     * relies on the formatter treating %ld as a 64-bit value -- confirm. */
    fz_append_printf(ctx, fzbuf, "%ld", buf->i);
    break;
  case PDF_TOK_REAL:
    fz_append_printf(ctx, fzbuf, "%g", buf->f);
    break;
  default:
    /* Everything else: copy the len raw bytes held in scratch. */
    fz_append_data(ctx, fzbuf, buf->scratch, buf->len);
    break;
  }
}

Coverage Report

Created: 2025-09-04 06:50

Line	Count	Source (jump to first uncovered line)
1		// Copyright (C) 2004-2024 Artifex Software, Inc.
2		//
3		// This file is part of MuPDF.
4		//
5		// MuPDF is free software: you can redistribute it and/or modify it under the
6		// terms of the GNU Affero General Public License as published by the Free
7		// Software Foundation, either version 3 of the License, or (at your option)
8		// any later version.
9		//
10		// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11		// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12		// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13		// details.
14		//
15		// You should have received a copy of the GNU Affero General Public License
16		// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17		//
18		// Alternative licensing terms are available from the licensor.
19		// For commercial licensing, see <https://www.artifex.com/> or contact
20		// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21		// CA 94129, USA, for further information.
22
23		#include "mupdf/fitz.h"
24		#include "mupdf/pdf.h"
25
26		#include <string.h>
27
28		#define IS_NUMBER \
29	101k	'+':case'-':case'.':case'0':case'1':case'2':case'3':\
30	160k	case'4':case'5':case'6':case'7':case'8':case'9'
31		#define IS_WHITE \
32	807k	'\x00':case'\x09':case'\x0a':case'\x0c':case'\x0d':case'\x20'
33		#define IS_HEX \
34	209k	'0':case'1':case'2':case'3':case'4':case'5':case'6':\
35	264k	case'7':case'8':case'9':case'A':case'B':case'C':\
36	293k	case'D':case'E':case'F':case'a':case'b':case'c':\
37	293k	case'd':case'e':case'f'
38		#define IS_DELIM \
39	405k	'(':case')':case'<':case'>':case'[':case']':case'{':\
40	425k	case'}':case'/':case'%'
41
42		#define RANGE_0_9 \
43	131k	'0':case'1':case'2':case'3':case'4':case'5':\
44	205k	case'6':case'7':case'8':case'9'
45		#define RANGE_a_f \
46	120	'a':case'b':case'c':case'd':case'e':case'f'
47		#define RANGE_A_F \
48	91	'A':case'B':case'C':case'D':case'E':case'F'
49		#define RANGE_0_7 \
50	0	'0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'
51
52		/* #define DUMP_LEXER_STREAM */
53		#ifdef DUMP_LEXER_STREAM
54		static inline int lex_byte(fz_context ctx, fz_stream stm)
55		{
56		int c = fz_read_byte(ctx, stm);
57
58		if (c == EOF)
59		fz_write_printf(ctx, fz_stdout(ctx), "<EOF>");
60		else if (c >= 32 && c < 128)
61		fz_write_printf(ctx, fz_stdout(ctx), "%c", c);
62		else
63		fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", c);
64		return c;
65		}
66		#else
67	5.68M	#define lex_byte(C,S) fz_read_byte(C,S)
68		#endif
69
70		static inline int iswhite(int ch)
71	405k	{
72	405k	return
73	405k	ch == '\000' \|\|
74	405k	ch == '\011' \|\|
75	405k	ch == '\012' \|\|
76	405k	ch == '\014' \|\|
77	405k	ch == '\015' \|\|
78	405k	ch == '\040';
79	405k	}
80
81		static inline int fz_isprint(int ch)
82	283k	{
83	283k	return ch >= ' ' && ch <= '~';
84	283k	}
85
86		static inline int unhex(int ch)
87	293k	{
88	293k	if (ch >= '0' && ch <= '9') return ch - '0';
89	52.7k	if (ch >= 'A' && ch <= 'F') return ch - 'A' + 0xA;
90	0	if (ch >= 'a' && ch <= 'f') return ch - 'a' + 0xA;
91	0	return 0;
92	0	}
93
94		static void
95		lex_white(fz_context ctx, fz_stream f)
96	441k	{
97	441k	int c;
98	838k	do {
99	838k	c = lex_byte(ctx, f);
100	838k	} while ((c <= 32) && (iswhite(c)));
101	441k	if (c != EOF)
102	441k	fz_unread_byte(ctx, f);
103	441k	}
104
105		static void
106		lex_comment(fz_context ctx, fz_stream f)
107	4.67k	{
108	4.67k	int c;
109	1.10M	do {
110	1.10M	c = lex_byte(ctx, f);
111	1.10M	} while ((c != '\012') && (c != '\015') && (c != EOF));
112	4.67k	}
113
114		/* Fast(ish) but inaccurate strtof, with Adobe overflow handling. */
115		static float acrobat_compatible_atof(char *s)
116	0	{
117	0	int neg = 0;
118	0	int i = 0;
119
120	0	while (*s == '-')
121	0	{
122	0	neg = 1;
123	0	++s;
124	0	}
125	0	while (*s == '+')
126	0	{
127	0	++s;
128	0	}
129
130	0	while (s >= '0' && s <= '9')
131	0	{
132		/* We deliberately ignore overflow here.
133		* Tests show that Acrobat handles * overflows in exactly the same way we do:
134		* 123450000000000000000678 is read as 678.
135		*/
136	0	i = i * 10 + (*s - '0');
137	0	++s;
138	0	}
139
140	0	if (*s == '.')
141	0	{
142	0	float v = i;
143	0	float n = 0;
144	0	float d = 1;
145	0	++s;
146	0	while (s >= '0' && s <= '9')
147	0	{
148	0	n = 10 * n + (*s - '0');
149	0	d = 10 * d;
150	0	++s;
151	0	}
152	0	v += n / d;
153	0	return neg ? -v : v;
154	0	}
155	0	else
156	0	{
157	0	return neg ? -i : i;
158	0	}
159	0	}
160
161		/* Fast but inaccurate atoi. */
162		static int64_t fast_atoi(char *s)
163	150k	{
164	150k	int neg = 0;
165	150k	int64_t i = 0;
166
167	150k	while (*s == '-')
168	340	{
169	340	neg = 1;
170	340	++s;
171	340	}
172	150k	while (*s == '+')
173	16	{
174	16	++s;
175	16	}
176
177	489k	while (s >= '0' && s <= '9')
178	339k	{
179		/* We deliberately ignore overflow here. */
180	339k	i = i * 10 + (*s - '0');
181	339k	++s;
182	339k	}
183
184	150k	return neg ? -i : i;
185	150k	}
186
187		static int
188		lex_number(fz_context ctx, fz_stream f, pdf_lexbuf *buf, int c)
189	160k	{
190	160k	char *s = buf->scratch;
191	160k	char e = buf->scratch + buf->size - 1; / leave space for zero terminator */
192	160k	char *isreal = (c == '.' ? s : NULL);
193	160k	int neg = (c == '-');
194	160k	int isbad = 0;
195
196	160k	*s++ = c;
197
198	160k	c = lex_byte(ctx, f);
199
200		/* skip extra '-' signs at start of number */
201	160k	if (neg)
202	802	{
203	804	while (c == '-')
204	2	c = lex_byte(ctx, f);
205	802	}
206
207	417k	while (s < e)
208	417k	{
209	417k	switch (c)
210	417k	{
211	291k	case IS_WHITE:
212	291k	case IS_DELIM:
213	160k	fz_unread_byte(ctx, f);
214	160k	goto end;
215	0	case EOF:
216	0	goto end;
217	2.38k	case '.':
218	2.38k	if (isreal)
219	89	isbad = 1;
220	2.38k	isreal = s;
221	2.38k	*s++ = c;
222	2.38k	break;
223	236	case '-':
224		/* Bug 703248: Some PDFs (particularly those
225		* generated by google docs) apparently have
226		* numbers like 0.000000000000-5684342 in them.
227		* We'll stop our interpretation at the -, but
228		* keep reading to skip over the trailing
229		* digits so they aren't parsed later. */
230	236	*s++ = '\0';
231	236	break;
232	204k	case RANGE_0_9:
233	204k	*s++ = c;
234	204k	break;
235	50.6k	default:
236	50.6k	isbad = 1;
237	50.6k	*s++ = c;
238	50.6k	break;
239	417k	}
240	257k	c = lex_byte(ctx, f);
241	257k	}
242
243	160k	end:
244	160k	*s = '\0';
245	160k	if (isbad)
246	3.51k	return PDF_TOK_KEYWORD;
247	156k	if (isreal)
248	6.43k	{
249		/* We'd like to use the fastest possible atof
250		* routine, but we'd rather match acrobats
251		* handling of broken numbers. As such, we
252		* spot common broken cases and call an
253		* acrobat compatible routine where required. */
254	6.43k	if (neg > 1 \|\| isreal - buf->scratch >= 10)
255	0	buf->f = acrobat_compatible_atof(buf->scratch);
256	6.43k	else
257	6.43k	buf->f = fz_atof(buf->scratch);
258	6.43k	return PDF_TOK_REAL;
259	6.43k	}
260	150k	else
261	150k	{
262	150k	buf->i = fast_atoi(buf->scratch);
263	150k	return PDF_TOK_INT;
264	150k	}
265	156k	}
266
267		static void
268		lex_name(fz_context ctx, fz_stream f, pdf_lexbuf *lb)
269	265k	{
270	265k	char *s = lb->scratch;
271	265k	char *e = s + fz_minz(127, lb->size);
272	265k	int c;
273
274	1.87M	while (1)
275	1.87M	{
276	1.87M	if (s == e)
277	30	{
278	30	if (e - lb->scratch < 127)
279	0	{
280	0	s += pdf_lexbuf_grow(ctx, lb);
281	0	e = lb->scratch + fz_minz(127, lb->size);
282	0	}
283	30	else
284	30	{
285		/* truncate names that are too long */
286	30	fz_warn(ctx, "name is too long");
287	30	*s = 0;
288	30	lb->len = s - lb->scratch;
289	30	s = NULL;
290	30	}
291	30	}
292	1.87M	c = lex_byte(ctx, f);
293	1.87M	switch (c)
294	1.87M	{
295	823k	case IS_WHITE:
296	823k	case IS_DELIM:
297	265k	fz_unread_byte(ctx, f);
298	265k	goto end;
299	2	case EOF:
300	2	goto end;
301	6.64k	case '#':
302	6.64k	{
303	6.64k	int hex[2];
304	6.64k	int i;
305	8.29k	for (i = 0; i < 2; i++)
306	8.27k	{
307	8.27k	c = fz_peek_byte(ctx, f);
308	8.27k	switch (c)
309	8.27k	{
310	12.4k	case RANGE_0_9:
311	12.4k	if (i == 1 && c == '0' && hex[0] == 0)
312	0	goto illegal;
313	1.44k	hex[i] = lex_byte(ctx, f) - '0';
314	1.44k	break;
315	120	case RANGE_a_f:
316	120	hex[i] = lex_byte(ctx, f) - 'a' + 10;
317	120	break;
318	91	case RANGE_A_F:
319	91	hex[i] = lex_byte(ctx, f) - 'A' + 10;
320	91	break;
321	6.62k	default:
322	6.62k	goto illegal;
323	6.62k	case EOF:
324	0	goto illegal_eof;
325	8.27k	}
326	8.27k	}
327	21	if (s) *s++ = (hex[0] << 4) + hex[1];
328	21	break;
329	6.62k	illegal:
330	6.62k	if (i == 1)
331	1.61k	fz_unread_byte(ctx, f);
332	6.62k	illegal_eof:
333	6.62k	if (s) *s++ = '#';
334	6.62k	continue;
335	6.62k	}
336	1.60M	default:
337	1.60M	if (s) *s++ = c;
338	1.60M	break;
339	1.87M	}
340	1.87M	}
341	265k	end:
342	265k	if (s)
343	265k	{
344	265k	*s = '\0';
345	265k	lb->len = s - lb->scratch;
346	265k	}
347	265k	}
348
349		static int
350		lex_string(fz_context ctx, fz_stream f, pdf_lexbuf *lb)
351	289	{
352	289	char *s = lb->scratch;
353	289	char *e = s + lb->size;
354	289	int bal = 1;
355	289	int oct;
356	289	int c;
357
358	3.81k	while (1)
359	3.81k	{
360	3.81k	if (s == e)
361	4	{
362	4	s += pdf_lexbuf_grow(ctx, lb);
363	4	e = lb->scratch + lb->size;
364	4	}
365	3.81k	c = lex_byte(ctx, f);
366	3.81k	switch (c)
367	3.81k	{
368	2	case EOF:
369	2	return PDF_TOK_ERROR;
370	94	case '(':
371	94	bal++;
372	94	*s++ = c;
373	94	break;
374	339	case ')':
375	339	bal --;
376	339	if (bal == 0)
377	287	goto end;
378	52	*s++ = c;
379	52	break;
380	2	case '\\':
381	2	c = lex_byte(ctx, f);
382	2	switch (c)
383	2	{
384	0	case EOF:
385	0	return PDF_TOK_ERROR;
386	0	case 'n':
387	0	*s++ = '\n';
388	0	break;
389	0	case 'r':
390	0	*s++ = '\r';
391	0	break;
392	0	case 't':
393	0	*s++ = '\t';
394	0	break;
395	0	case 'b':
396	0	*s++ = '\b';
397	0	break;
398	0	case 'f':
399	0	*s++ = '\f';
400	0	break;
401	1	case '(':
402	1	*s++ = '(';
403	1	break;
404	1	case ')':
405	1	*s++ = ')';
406	1	break;
407	0	case '\\':
408	0	*s++ = '\\';
409	0	break;
410	0	case RANGE_0_7:
411	0	oct = c - '0';
412	0	c = lex_byte(ctx, f);
413	0	if (c >= '0' && c <= '7')
414	0	{
415	0	oct = oct * 8 + (c - '0');
416	0	c = lex_byte(ctx, f);
417	0	if (c >= '0' && c <= '7')
418	0	oct = oct * 8 + (c - '0');
419	0	else if (c != EOF)
420	0	fz_unread_byte(ctx, f);
421	0	}
422	0	else if (c != EOF)
423	0	fz_unread_byte(ctx, f);
424	0	*s++ = oct;
425	0	break;
426	0	case '\n':
427	0	break;
428	0	case '\r':
429	0	c = lex_byte(ctx, f);
430	0	if ((c != '\n') && (c != EOF))
431	0	fz_unread_byte(ctx, f);
432	0	break;
433	0	default:
434	0	*s++ = c;
435	2	}
436	2	break;
437		/* Bug 708256: PDF 32000-1 says that any occurence of \n, \r, or \r\n in a
438		* (unless escaped with a '\') should be interpreted as a single 0x0a byte. */
439	13	case '\n':
440	13	*s++ = 0x0a;
441	13	break;
442	77	case '\r':
443	77	*s++ = 0x0a;
444	77	c = lex_byte(ctx, f);
445	77	if ((c != '\n') && (c != EOF))
446	3	fz_unread_byte(ctx, f);
447	77	break;
448	3.28k	default:
449	3.28k	*s++ = c;
450	3.28k	break;
451	3.81k	}
452	3.81k	}
453	287	end:
454	287	lb->len = s - lb->scratch;
455	287	return PDF_TOK_STRING;
456	289	}
457
458		static int
459		lex_hex_string(fz_context ctx, fz_stream f, pdf_lexbuf *lb)
460	66.8k	{
461	66.8k	char *s = lb->scratch;
462	66.8k	char *e = s + lb->size;
463	66.8k	int a = 0, x = 0;
464	66.8k	int c;
465
466	360k	while (1)
467	360k	{
468	360k	if (s == e)
469	0	{
470	0	s += pdf_lexbuf_grow(ctx, lb);
471	0	e = lb->scratch + lb->size;
472	0	}
473	360k	c = lex_byte(ctx, f);
474	360k	switch (c)
475	360k	{
476	0	case IS_WHITE:
477	0	break;
478	0	default:
479	0	fz_warn(ctx, "invalid character in hex string");
480		/* fall through */
481	293k	case IS_HEX:
482	293k	if (x)
483	146k	{
484	146k	s++ = a 16 + unhex(c);
485	146k	x = !x;
486	146k	}
487	146k	else
488	146k	{
489	146k	a = unhex(c);
490	146k	x = !x;
491	146k	}
492	293k	break;
493	66.8k	case '>':
494	66.8k	if (x)
495	1	{
496	1	s++ = a 16; /* pad truncated string with '0' */
497	1	}
498	66.8k	goto end;
499	0	case EOF:
500	0	return PDF_TOK_ERROR;
501	360k	}
502	360k	}
503	66.8k	end:
504	66.8k	lb->len = s - lb->scratch;
505	66.8k	return PDF_TOK_STRING;
506	66.8k	}
507
508		static pdf_token
509		pdf_token_from_keyword(char *key)
510	218k	{
511	218k	switch (*key)
512	218k	{
513	22.1k	case 'R':
514	22.1k	if (!strcmp(key, "R")) return PDF_TOK_R;
515	265	break;
516	6.83k	case 't':
517	6.83k	if (!strcmp(key, "true")) return PDF_TOK_TRUE;
518	6.80k	if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER;
519	6.80k	break;
520	6.80k	case 'f':
521	1.35k	if (!strcmp(key, "false")) return PDF_TOK_FALSE;
522	946	break;
523	4.66k	case 'n':
524	4.66k	if (!strcmp(key, "null")) return PDF_TOK_NULL;
525	4.66k	if (!strcmp(key, "newobj")) return PDF_TOK_NEWOBJ;
526	4.66k	break;
527	7.30k	case 'o':
528	7.30k	if (!strcmp(key, "obj")) return PDF_TOK_OBJ;
529	5.97k	break;
530	8.77k	case 'e':
531	8.77k	if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ;
532	7.80k	if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM;
533	7.51k	break;
534	8.58k	case 's':
535	8.58k	if (!strcmp(key, "stream")) return PDF_TOK_STREAM;
536	8.10k	if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF;
537	8.10k	break;
538	8.10k	case 'x':
539	579	if (!strcmp(key, "xref")) return PDF_TOK_XREF;
540	579	break;
541	218k	}
542
543	397k	while (*key)
544	283k	{
545	283k	if (!fz_isprint(*key))
546	79.2k	return PDF_TOK_ERROR;
547	204k	++key;
548	204k	}
549
550	114k	return PDF_TOK_KEYWORD;
551	193k	}
552
553		void pdf_lexbuf_init(fz_context ctx, pdf_lexbuf lb, int size)
554	23	{
555	23	lb->size = lb->base_size = size;
556	23	lb->len = 0;
557	23	lb->scratch = &lb->buffer[0];
558	23	}
559
560		void pdf_lexbuf_fin(fz_context ctx, pdf_lexbuf lb)
561	23	{
562	23	if (lb && lb->size != lb->base_size)
563	2	fz_free(ctx, lb->scratch);
564	23	}
565
566		ptrdiff_t pdf_lexbuf_grow(fz_context ctx, pdf_lexbuf lb)
567	4	{
568	4	char *old = lb->scratch;
569	4	size_t newsize = lb->size * 2;
570	4	if (lb->size == lb->base_size)
571	2	{
572	2	lb->scratch = Memento_label(fz_malloc(ctx, newsize), "pdf_lexbuf");
573	2	memcpy(lb->scratch, lb->buffer, lb->size);
574	2	}
575	2	else
576	2	{
577	2	lb->scratch = fz_realloc(ctx, lb->scratch, newsize);
578	2	}
579	4	lb->size = newsize;
580	4	return lb->scratch - old;
581	4	}
582
583		pdf_token
584		pdf_lex(fz_context ctx, fz_stream f, pdf_lexbuf *buf)
585	240k	{
586	464k	while (1)
587	464k	{
588	464k	int c = lex_byte(ctx, f);
589	464k	switch (c)
590	464k	{
591	27	case EOF:
592	27	return PDF_TOK_EOF;
593	223k	case IS_WHITE:
594	223k	lex_white(ctx, f);
595	223k	break;
596	26	case '%':
597	26	lex_comment(ctx, f);
598	26	break;
599	39.4k	case '/':
600	39.4k	lex_name(ctx, f, buf);
601	39.4k	return PDF_TOK_NAME;
602	289	case '(':
603	289	return lex_string(ctx, f, buf);
604	213	case ')':
605	213	return PDF_TOK_ERROR;
606	69.8k	case '<':
607	69.8k	c = lex_byte(ctx, f);
608	69.8k	if (c == '<')
609	2.96k	return PDF_TOK_OPEN_DICT;
610	66.8k	if (c != EOF)
611	66.8k	fz_unread_byte(ctx, f);
612	66.8k	return lex_hex_string(ctx, f, buf);
613	2.59k	case '>':
614	2.59k	c = lex_byte(ctx, f);
615	2.59k	if (c == '>')
616	2.59k	return PDF_TOK_CLOSE_DICT;
617	0	if (c != EOF)
618	0	fz_unread_byte(ctx, f);
619	0	return PDF_TOK_ERROR;
620	1.75k	case '[':
621	1.75k	return PDF_TOK_OPEN_ARRAY;
622	1.71k	case ']':
623	1.71k	return PDF_TOK_CLOSE_ARRAY;
624	0	case '{':
625	0	return PDF_TOK_OPEN_BRACE;
626	0	case '}':
627	0	return PDF_TOK_CLOSE_BRACE;
628	99.3k	case IS_NUMBER:
629	99.3k	return lex_number(ctx, f, buf, c);
630	25.5k	default: /* isregular: !isdelim && !iswhite && c != EOF */
631	25.5k	fz_unread_byte(ctx, f);
632	25.5k	lex_name(ctx, f, buf);
633	25.5k	return pdf_token_from_keyword(buf->scratch);
634	464k	}
635	464k	}
636	240k	}
637
638		pdf_token
639		pdf_lex_no_string(fz_context ctx, fz_stream f, pdf_lexbuf *buf)
640	305k	{
641	527k	while (1)
642	527k	{
643	527k	int c = lex_byte(ctx, f);
644	527k	switch (c)
645	527k	{
646	3	case EOF:
647	3	return PDF_TOK_EOF;
648	217k	case IS_WHITE:
649	217k	lex_white(ctx, f);
650	217k	break;
651	4.64k	case '%':
652	4.64k	lex_comment(ctx, f);
653	4.64k	break;
654	7.20k	case '/':
655	7.20k	lex_name(ctx, f, buf);
656	7.20k	return PDF_TOK_NAME;
657	4.61k	case '(':
658	4.61k	return PDF_TOK_ERROR; /* no strings allowed */
659	4.64k	case ')':
660	4.64k	return PDF_TOK_ERROR; /* no strings allowed */
661	7.33k	case '<':
662	7.33k	c = lex_byte(ctx, f);
663	7.33k	if (c == '<')
664	16	return PDF_TOK_OPEN_DICT;
665	7.31k	if (c != EOF)
666	7.31k	fz_unread_byte(ctx, f);
667	7.31k	return PDF_TOK_ERROR; /* no strings allowed */
668	9.05k	case '>':
669	9.05k	c = lex_byte(ctx, f);
670	9.05k	if (c == '>')
671	50	return PDF_TOK_CLOSE_DICT;
672	9.00k	if (c != EOF)
673	9.00k	fz_unread_byte(ctx, f);
674	9.00k	return PDF_TOK_ERROR;
675	4.55k	case '[':
676	4.55k	return PDF_TOK_OPEN_ARRAY;
677	4.67k	case ']':
678	4.67k	return PDF_TOK_CLOSE_ARRAY;
679	4.56k	case '{':
680	4.56k	return PDF_TOK_OPEN_BRACE;
681	4.60k	case '}':
682	4.60k	return PDF_TOK_CLOSE_BRACE;
683	60.7k	case IS_NUMBER:
684	60.7k	return lex_number(ctx, f, buf, c);
685	193k	default: /* isregular: !isdelim && !iswhite && c != EOF */
686	193k	fz_unread_byte(ctx, f);
687	193k	lex_name(ctx, f, buf);
688	193k	return pdf_token_from_keyword(buf->scratch);
689	527k	}
690	527k	}
691	305k	}
692
693		void pdf_append_token(fz_context ctx, fz_buffer fzbuf, int tok, pdf_lexbuf *buf)
694	0	{
695	0	switch (tok)
696	0	{
697	0	case PDF_TOK_NAME:
698	0	fz_append_printf(ctx, fzbuf, "/%s", buf->scratch);
699	0	break;
700	0	case PDF_TOK_STRING:
701	0	if (buf->len >= buf->size)
702	0	pdf_lexbuf_grow(ctx, buf);
703	0	buf->scratch[buf->len] = 0;
704	0	fz_append_pdf_string(ctx, fzbuf, buf->scratch);
705	0	break;
706	0	case PDF_TOK_OPEN_DICT:
707	0	fz_append_string(ctx, fzbuf, "<<");
708	0	break;
709	0	case PDF_TOK_CLOSE_DICT:
710	0	fz_append_string(ctx, fzbuf, ">>");
711	0	break;
712	0	case PDF_TOK_OPEN_ARRAY:
713	0	fz_append_byte(ctx, fzbuf, '[');
714	0	break;
715	0	case PDF_TOK_CLOSE_ARRAY:
716	0	fz_append_byte(ctx, fzbuf, ']');
717	0	break;
718	0	case PDF_TOK_OPEN_BRACE:
719	0	fz_append_byte(ctx, fzbuf, '{');
720	0	break;
721	0	case PDF_TOK_CLOSE_BRACE:
722	0	fz_append_byte(ctx, fzbuf, '}');
723	0	break;
724	0	case PDF_TOK_INT:
725	0	fz_append_printf(ctx, fzbuf, "%ld", buf->i);
726	0	break;
727	0	case PDF_TOK_REAL:
728	0	fz_append_printf(ctx, fzbuf, "%g", buf->f);
729	0	break;
730	0	default:
731	0	fz_append_data(ctx, fzbuf, buf->scratch, buf->len);
732	0	break;
733	0	}
734	0	}