Coverage Report

Created: 2023-09-25 06:35

/src/xpdf-4.04/xpdf/Lexer.cc
Line
Count
Source (jump to first uncovered line)
1
//========================================================================
2
//
3
// Lexer.cc
4
//
5
// Copyright 1996-2003 Glyph & Cog, LLC
6
//
7
//========================================================================
8
9
#include <aconf.h>
10
11
#ifdef USE_GCC_PRAGMAS
12
#pragma implementation
13
#endif
14
15
#include <stdlib.h>
16
#include <stddef.h>
17
#include <string.h>
18
#include <ctype.h>
19
#include "gmempp.h"
20
#include "Lexer.h"
21
#include "Error.h"
22
23
//------------------------------------------------------------------------
24
25
// A '1' in this array means the character is white space.  A '1' or
26
// '2' means the character ends a name or command.
27
static char specialChars[256] = {
28
  1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,   // 0x
29
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 1x
30
  1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2,   // 2x
31
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,   // 3x
32
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 4x
33
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 5x
34
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 6x
35
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 7x
36
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 8x
37
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 9x
38
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ax
39
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // bx
40
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // cx
41
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // dx
42
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ex
43
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    // fx
44
};
45
46
//------------------------------------------------------------------------
47
// Lexer
48
//------------------------------------------------------------------------
49
50
132k
Lexer::Lexer(XRef *xref, Stream *str) {
51
132k
  Object obj;
52
53
132k
  curStr.initStream(str);
54
132k
  streams = new Array(xref);
55
132k
  streams->add(curStr.copy(&obj));
56
132k
  strPtr = 0;
57
132k
  freeArray = gTrue;
58
132k
  curStr.streamReset();
59
132k
}
60
61
0
Lexer::Lexer(XRef *xref, Object *obj) {
62
0
  Object obj2;
63
64
0
  if (obj->isStream()) {
65
0
    streams = new Array(xref);
66
0
    freeArray = gTrue;
67
0
    streams->add(obj->copy(&obj2));
68
0
  } else {
69
0
    streams = obj->getArray();
70
0
    freeArray = gFalse;
71
0
  }
72
0
  strPtr = 0;
73
0
  if (streams->getLength() > 0) {
74
0
    streams->get(strPtr, &curStr);
75
0
    curStr.streamReset();
76
0
  }
77
0
}
78
79
132k
Lexer::~Lexer() {
80
132k
  if (!curStr.isNone()) {
81
55.8k
    curStr.streamClose();
82
55.8k
    curStr.free();
83
55.8k
  }
84
132k
  if (freeArray) {
85
132k
    delete streams;
86
132k
  }
87
132k
}
88
89
752M
int Lexer::getChar() {
90
752M
  int c;
91
92
752M
  c = EOF;
93
752M
  while (!curStr.isNone() && (c = curStr.streamGetChar()) == EOF) {
94
76.3k
    curStr.streamClose();
95
76.3k
    curStr.free();
96
76.3k
    ++strPtr;
97
76.3k
    if (strPtr < streams->getLength()) {
98
0
      streams->get(strPtr, &curStr);
99
0
      curStr.streamReset();
100
0
    }
101
76.3k
  }
102
752M
  return c;
103
752M
}
104
105
161M
int Lexer::lookChar() {
106
161M
  if (curStr.isNone()) {
107
209
    return EOF;
108
209
  }
109
161M
  return curStr.streamLookChar();
110
161M
}
111
112
44.6M
Object *Lexer::getObj(Object *obj) {
113
44.6M
  char *p;
114
44.6M
  int c, c2;
115
44.6M
  GBool comment, neg, doubleMinus, done, invalid;
116
44.6M
  int numParen;
117
44.6M
  int xi;
118
44.6M
  double xf, scale;
119
44.6M
  GString *s;
120
44.6M
  int n, m;
121
122
  // skip whitespace and comments
123
44.6M
  comment = gFalse;
124
126M
  while (1) {
125
126M
    if ((c = getChar()) == EOF) {
126
10.6M
      return obj->initEOF();
127
10.6M
    }
128
115M
    if (comment) {
129
15.6M
      if (c == '\r' || c == '\n')
130
147k
  comment = gFalse;
131
99.8M
    } else if (c == '%') {
132
147k
      comment = gTrue;
133
99.6M
    } else if (specialChars[c] != 1) {
134
34.0M
      break;
135
34.0M
    }
136
115M
  }
137
138
  // start reading token
139
34.0M
  switch (c) {
140
141
  // number
142
2.85M
  case '0': case '1': case '2': case '3': case '4':
143
3.86M
  case '5': case '6': case '7': case '8': case '9':
144
4.33M
  case '+': case '-': case '.':
145
    // Adobe's number lexer has some "interesting" behavior:
146
    // "--123" is interpreted as 0
147
    // "--123.4" is interpreted as -123.4 [I've seen this in the wild]
148
    // "50-100" is interpreted as 50 [I've seen this in the wild]
149
    // "50--100" is interpreted as 50
150
    // "50-100.0" is an error -- but older versions of Acrobat may
151
    //   have interpreted it as 50100.0 (?)
152
    // "50--100.0" is an error -- but older versions of Acrobat may
153
    //   have interpreted it as 50100.0 (?)
154
    // "50.0-100" is interpreted as 50.0 (or maybe 50.0100?)
155
    // "50.0--100" is interpreted as 50.0 (or maybe 50.0100?)
156
    // "-50-100" is interpreted as -50
157
    // "-" is interpreted as 0
158
    // "-." is interpreted as 0.0
159
4.33M
    neg = gFalse;
160
4.33M
    doubleMinus = gFalse;
161
4.33M
    xf = xi = 0;
162
4.33M
    if (c == '+') {
163
      // just ignore it
164
4.31M
    } else if (c == '-') {
165
245k
      neg = gTrue;
166
245k
      if (lookChar() == '-') {
167
10.6k
  doubleMinus = gTrue;
168
19.1k
  do {
169
19.1k
    getChar();
170
19.1k
  } while (lookChar() == '-');
171
10.6k
      }
172
4.07M
    } else if (c == '.') {
173
204k
      goto doReal;
174
3.86M
    } else {
175
3.86M
      xf = xi = c - '0';
176
3.86M
    }
177
8.33M
    while (1) {
178
8.33M
      c = lookChar();
179
8.33M
      if (isdigit(c)) {
180
4.20M
  getChar();
181
4.20M
  xi = xi * 10 + (c - '0');
182
4.20M
  if (xf < 1e20) {
183
4.16M
    xf = xf * 10 + (c - '0');
184
4.16M
  }
185
4.20M
      } else if (c == '.') {
186
96.9k
  getChar();
187
96.9k
  goto doReal;
188
4.03M
      } else {
189
4.03M
  break;
190
4.03M
      }
191
8.33M
    }
192
4.04M
    while ((c = lookChar()) == '-' || isdigit(c)) {
193
4.85k
      getChar();
194
4.85k
    }
195
4.03M
    if (neg) {
196
239k
      xi = -xi;
197
239k
    }
198
4.03M
    if (doubleMinus) {
199
10.6k
      xi = 0;
200
10.6k
    }
201
4.03M
    obj->initInt(xi);
202
4.03M
    break;
203
301k
  doReal:
204
301k
    scale = 0.1;
205
460k
    while (1) {
206
460k
      c = lookChar();
207
460k
      if (c == '-') {
208
6.02k
  error(errSyntaxWarning, getPos(), "Badly formatted number");
209
6.02k
  getChar();
210
6.02k
  continue;
211
6.02k
      }
212
454k
      if (!isdigit(c)) {
213
301k
  break;
214
301k
      }
215
152k
      getChar();
216
152k
      xf = xf + scale * (c - '0');
217
152k
      scale *= 0.1;
218
152k
    }
219
301k
    while ((c = lookChar()) == '-' || isdigit(c)) {
220
0
      getChar();
221
0
    }
222
301k
    if (neg) {
223
6.17k
      xf = -xf;
224
6.17k
    }
225
301k
    obj->initReal(xf);
226
301k
    break;
227
228
  // string
229
105k
  case '(':
230
105k
    p = tokBuf;
231
105k
    n = 0;
232
105k
    numParen = 1;
233
105k
    done = gFalse;
234
105k
    s = NULL;
235
437M
    do {
236
437M
      c2 = EOF;
237
437M
      switch (c = getChar()) {
238
239
6.01k
      case EOF:
240
6.01k
  error(errSyntaxError, getPos(), "Unterminated string");
241
6.01k
  done = gTrue;
242
6.01k
  break;
243
244
463k
      case '(':
245
463k
  ++numParen;
246
463k
  c2 = c;
247
463k
  break;
248
249
520k
      case ')':
250
520k
  if (--numParen == 0) {
251
99.0k
    done = gTrue;
252
420k
  } else {
253
420k
    c2 = c;
254
420k
  }
255
520k
  break;
256
257
4.54M
      case '\r':
258
  // The PDF spec says that any literal end-of-line sequence
259
  // (LF, CR, CR+LF) is translated to a single LF char.
260
4.54M
  c = lookChar();
261
4.54M
  if (c == '\n') {
262
121k
    getChar();
263
121k
  }
264
4.54M
  c2 = '\n';
265
4.54M
  break;
266
267
235k
      case '\\':
268
235k
  switch (c = getChar()) {
269
6.28k
  case 'n':
270
6.28k
    c2 = '\n';
271
6.28k
    break;
272
13.8k
  case 'r':
273
13.8k
    c2 = '\r';
274
13.8k
    break;
275
23.1k
  case 't':
276
23.1k
    c2 = '\t';
277
23.1k
    break;
278
533
  case 'b':
279
533
    c2 = '\b';
280
533
    break;
281
1.41k
  case 'f':
282
1.41k
    c2 = '\f';
283
1.41k
    break;
284
5.77k
  case '\\':
285
9.05k
  case '(':
286
11.4k
  case ')':
287
11.4k
    c2 = c;
288
11.4k
    break;
289
9.00k
  case '0': case '1': case '2': case '3':
290
32.5k
  case '4': case '5': case '6': case '7':
291
32.5k
    c2 = c - '0';
292
32.5k
    c = lookChar();
293
32.5k
    if (c >= '0' && c <= '7') {
294
6.00k
      getChar();
295
6.00k
      c2 = (c2 << 3) + (c - '0');
296
6.00k
      c = lookChar();
297
6.00k
      if (c >= '0' && c <= '7') {
298
985
        getChar();
299
985
        c2 = (c2 << 3) + (c - '0');
300
985
      }
301
6.00k
    }
302
32.5k
    break;
303
1.63k
  case '\r':
304
1.63k
    c = lookChar();
305
1.63k
    if (c == '\n') {
306
869
      getChar();
307
869
    }
308
1.63k
    break;
309
5.76k
  case '\n':
310
5.76k
    break;
311
1
  case EOF:
312
1
    error(errSyntaxError, getPos(), "Unterminated string");
313
1
    done = gTrue;
314
1
    break;
315
138k
  default:
316
138k
    c2 = c;
317
138k
    break;
318
235k
  }
319
235k
  break;
320
321
431M
      default:
322
431M
  c2 = c;
323
431M
  break;
324
437M
      }
325
326
437M
      if (c2 != EOF) {
327
437M
  if (n == tokBufSize) {
328
3.37M
    if (!s)
329
56.2k
      s = new GString(tokBuf, tokBufSize);
330
3.31M
    else
331
3.31M
      s->append(tokBuf, tokBufSize);
332
3.37M
    p = tokBuf;
333
3.37M
    n = 0;
334
3.37M
  }
335
437M
  *p++ = (char)c2;
336
437M
  ++n;
337
437M
      }
338
437M
    } while (!done);
339
105k
    if (!s)
340
48.8k
      s = new GString(tokBuf, n);
341
56.2k
    else
342
56.2k
      s->append(tokBuf, n);
343
105k
    obj->initString(s);
344
105k
    break;
345
346
  // name
347
4.75M
  case '/':
348
4.75M
    p = tokBuf;
349
4.75M
    n = 0;
350
4.75M
    s = NULL;
351
4.75M
    invalid = gFalse;
352
16.3M
    while ((c = lookChar()) != EOF && !specialChars[c]) {
353
11.5M
      getChar();
354
11.5M
      if (c == '#') {
355
712k
  c2 = lookChar();
356
712k
  if (c2 >= '0' && c2 <= '9') {
357
2.10k
    c = c2 - '0';
358
709k
  } else if (c2 >= 'A' && c2 <= 'F') {
359
46.7k
    c = c2 - 'A' + 10;
360
663k
  } else if (c2 >= 'a' && c2 <= 'f') {
361
75.0k
    c = c2 - 'a' + 10;
362
588k
  } else {
363
588k
    error(errSyntaxError, getPos(), "Invalid hex escape in name");
364
588k
    goto notEscChar;
365
588k
  }
366
123k
  getChar();
367
123k
  c2 = lookChar();
368
123k
  if (c2 >= '0' && c2 <= '9') {
369
25.1k
    c = (c << 4) + (c2 - '0');
370
98.6k
  } else if (c2 >= 'A' && c2 <= 'F') {
371
46.3k
    c = (c << 4) + (c2 - 'A' + 10);
372
52.3k
  } else if (c2 >= 'a' && c2 <= 'f') {
373
258
    c = (c << 4) + (c2 - 'a' + 10);
374
52.0k
  } else {
375
52.0k
    error(errSyntaxError, getPos(), "Invalid hex escape in name");
376
52.0k
    goto notEscChar;
377
52.0k
  }
378
71.7k
  getChar();
379
71.7k
  if (c == 0) {
380
261
    invalid = gTrue;
381
261
  }
382
71.7k
      }
383
11.5M
     notEscChar:
384
      // the PDF spec claims that names are limited to 127 chars, but
385
      // Distiller 8 will produce longer names, and Acrobat 8 will
386
      // accept longer names
387
11.5M
      ++n;
388
11.5M
      if (n < tokBufSize) {
389
10.4M
  *p++ = (char)c;
390
10.4M
      } else if (n == tokBufSize) {
391
562
  *p = (char)c;
392
562
  s = new GString(tokBuf, n);
393
1.08M
      } else {
394
1.08M
  s->append((char)c);
395
1.08M
      }
396
11.5M
    }
397
4.75M
    if (invalid) {
398
131
      error(errSyntaxError, getPos(), "Null character in name");
399
131
      obj->initError();
400
131
      if (s) {
401
89
  delete s;
402
89
      }
403
4.75M
    } else if (n < tokBufSize) {
404
4.75M
      *p = '\0';
405
4.75M
      obj->initName(tokBuf);
406
4.75M
    } else {
407
473
      obj->initName(s->getCString());
408
473
      delete s;
409
473
    }
410
4.75M
    break;
411
412
  // array punctuation
413
4.86M
  case '[':
414
5.21M
  case ']':
415
5.21M
    tokBuf[0] = (char)c;
416
5.21M
    tokBuf[1] = '\0';
417
5.21M
    obj->initCmd(tokBuf);
418
5.21M
    break;
419
420
  // hex string or dict punctuation
421
3.30M
  case '<':
422
3.30M
    c = lookChar();
423
424
    // dict punctuation
425
3.30M
    if (c == '<') {
426
2.69M
      getChar();
427
2.69M
      tokBuf[0] = tokBuf[1] = '<';
428
2.69M
      tokBuf[2] = '\0';
429
2.69M
      obj->initCmd(tokBuf);
430
431
    // hex string
432
2.69M
    } else {
433
606k
      p = tokBuf;
434
606k
      m = n = 0;
435
606k
      c2 = 0;
436
606k
      s = NULL;
437
58.7M
      while (1) {
438
58.7M
  c = getChar();
439
58.7M
  if (c == '>') {
440
606k
    break;
441
58.1M
  } else if (c == EOF) {
442
688
    error(errSyntaxError, getPos(), "Unterminated hex string");
443
688
    break;
444
58.1M
  } else if (specialChars[c] != 1) {
445
51.6M
    c2 = c2 << 4;
446
51.6M
    if (c >= '0' && c <= '9')
447
4.22M
      c2 += c - '0';
448
47.4M
    else if (c >= 'A' && c <= 'F')
449
678k
      c2 += c - 'A' + 10;
450
46.7M
    else if (c >= 'a' && c <= 'f')
451
6.64M
      c2 += c - 'a' + 10;
452
40.1M
    else
453
40.1M
      error(errSyntaxError, getPos(),
454
40.1M
      "Illegal character <{0:02x}> in hex string", c);
455
51.6M
    if (++m == 2) {
456
25.7M
      if (n == tokBufSize) {
457
48.0k
        if (!s)
458
23.1k
    s = new GString(tokBuf, tokBufSize);
459
24.9k
        else
460
24.9k
    s->append(tokBuf, tokBufSize);
461
48.0k
        p = tokBuf;
462
48.0k
        n = 0;
463
48.0k
      }
464
25.7M
      *p++ = (char)c2;
465
25.7M
      ++n;
466
25.7M
      c2 = 0;
467
25.7M
      m = 0;
468
25.7M
    }
469
51.6M
  }
470
58.7M
      }
471
606k
      if (!s)
472
583k
  s = new GString(tokBuf, n);
473
23.1k
      else
474
23.1k
  s->append(tokBuf, n);
475
606k
      if (m == 1)
476
224k
  s->append((char)(c2 << 4));
477
606k
      obj->initString(s);
478
606k
    }
479
3.30M
    break;
480
481
  // dict punctuation
482
2.36M
  case '>':
483
2.36M
    c = lookChar();
484
2.36M
    if (c == '>') {
485
1.42M
      getChar();
486
1.42M
      tokBuf[0] = tokBuf[1] = '>';
487
1.42M
      tokBuf[2] = '\0';
488
1.42M
      obj->initCmd(tokBuf);
489
1.42M
    } else {
490
935k
      error(errSyntaxError, getPos(), "Illegal character '>'");
491
935k
      obj->initError();
492
935k
    }
493
2.36M
    break;
494
495
  // error
496
698k
  case ')':
497
1.17M
  case '{':
498
1.27M
  case '}':
499
1.27M
    error(errSyntaxError, getPos(), "Illegal character '{0:c}'", c);
500
1.27M
    obj->initError();
501
1.27M
    break;
502
503
  // command
504
12.6M
  default:
505
12.6M
    p = tokBuf;
506
12.6M
    *p++ = (char)c;
507
12.6M
    n = 1;
508
120M
    while ((c = lookChar()) != EOF && !specialChars[c]) {
509
108M
      getChar();
510
108M
      if (++n == tokBufSize) {
511
348k
  error(errSyntaxError, getPos(), "Command token too long");
512
348k
  break;
513
348k
      }
514
107M
      *p++ = (char)c;
515
107M
    }
516
12.6M
    *p = '\0';
517
12.6M
    if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) {
518
67.2k
      obj->initBool(gTrue);
519
12.5M
    } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) {
520
492
      obj->initBool(gFalse);
521
12.5M
    } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) {
522
283
      obj->initNull();
523
12.5M
    } else {
524
12.5M
      obj->initCmd(tokBuf);
525
12.5M
    }
526
12.6M
    break;
527
34.0M
  }
528
529
34.0M
  return obj;
530
34.0M
}
531
532
26.8k
void Lexer::skipToNextLine() {
533
26.8k
  int c;
534
535
94.1k
  while (1) {
536
94.1k
    c = getChar();
537
94.1k
    if (c == EOF || c == '\n') {
538
7.83k
      return;
539
7.83k
    }
540
86.3k
    if (c == '\r') {
541
19.0k
      if ((c = lookChar()) == '\n') {
542
15.5k
  getChar();
543
15.5k
      }
544
19.0k
      return;
545
19.0k
    }
546
86.3k
  }
547
26.8k
}
548
549
14.2k
void Lexer::skipToEOF() {
550
712k
  while (getChar() != EOF) ;
551
14.2k
}
552
553
26.7M
GBool Lexer::isSpace(int c) {
554
26.7M
  return c >= 0 && c <= 0xff && specialChars[c] == 1;
555
26.7M
}