Coverage Report

Created: 2026-06-22 07:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/xpdf-4.06/xpdf/Lexer.cc
Line
Count
Source
1
//========================================================================
2
//
3
// Lexer.cc
4
//
5
// Copyright 1996-2003 Glyph & Cog, LLC
6
//
7
//========================================================================
8
9
#include <aconf.h>
10
11
#include <stdlib.h>
12
#include <stddef.h>
13
#include <string.h>
14
#include <ctype.h>
15
#include "gmempp.h"
16
#include "Lexer.h"
17
#include "Error.h"
18
19
//------------------------------------------------------------------------
20
21
// A '1' in this array means the character is white space.  A '1' or
22
// '2' means the character ends a name or command.
23
static char specialChars[256] = {
24
  1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,   // 0x
25
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 1x
26
  1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2,   // 2x
27
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,   // 3x
28
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 4x
29
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 5x
30
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 6x
31
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 7x
32
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 8x
33
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 9x
34
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ax
35
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // bx
36
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // cx
37
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // dx
38
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ex
39
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    // fx
40
};
41
42
//------------------------------------------------------------------------
43
// Lexer
44
//------------------------------------------------------------------------
45
46
658k
Lexer::Lexer(XRef *xref, Stream *str) {
47
658k
  Object obj;
48
49
658k
  curStr.initStream(str);
50
658k
  streams = new Array(xref);
51
658k
  streams->add(curStr.copy(&obj));
52
658k
  strPtr = 0;
53
658k
  freeArray = gTrue;
54
658k
  curStr.streamReset();
55
658k
}
56
57
102k
Lexer::Lexer(XRef *xref, Object *obj) {
58
102k
  Object obj2;
59
60
102k
  if (obj->isStream()) {
61
101k
    streams = new Array(xref);
62
101k
    freeArray = gTrue;
63
101k
    streams->add(obj->copy(&obj2));
64
101k
  } else {
65
774
    streams = obj->getArray();
66
774
    freeArray = gFalse;
67
774
  }
68
102k
  strPtr = 0;
69
102k
  if (streams->getLength() > 0) {
70
102k
    streams->get(strPtr, &curStr);
71
102k
    curStr.streamReset();
72
102k
  }
73
102k
}
74
75
760k
Lexer::~Lexer() {
76
760k
  if (!curStr.isNone()) {
77
318k
    curStr.streamClose();
78
318k
    curStr.free();
79
318k
  }
80
760k
  if (freeArray) {
81
759k
    delete streams;
82
759k
  }
83
760k
}
84
85
1.88G
int Lexer::getChar() {
86
1.88G
  int c;
87
88
1.88G
  c = EOF;
89
1.88G
  while (!curStr.isNone() && (c = curStr.streamGetChar()) == EOF) {
90
442k
    curStr.streamClose();
91
442k
    curStr.free();
92
442k
    ++strPtr;
93
442k
    if (strPtr < streams->getLength()) {
94
1.12k
      streams->get(strPtr, &curStr);
95
1.12k
      curStr.streamReset();
96
1.12k
    }
97
442k
  }
98
1.88G
  return c;
99
1.88G
}
100
101
525M
int Lexer::lookChar() {
102
525M
  if (curStr.isNone()) {
103
2.65k
    return EOF;
104
2.65k
  }
105
525M
  return curStr.streamLookChar();
106
525M
}
107
108
261M
Object *Lexer::getObj(Object *obj) {
109
261M
  char *p;
110
261M
  int c, c2;
111
261M
  GBool comment, neg, doubleMinus, done, invalid;
112
261M
  int numParen, nErrors;
113
261M
  int xi;
114
261M
  double xf, scale;
115
261M
  GString *s;
116
261M
  int n, m;
117
118
  // skip whitespace and comments
119
261M
  comment = gFalse;
120
953M
  while (1) {
121
953M
    if ((c = getChar()) == EOF) {
122
143M
      return obj->initEOF();
123
143M
    }
124
809M
    if (comment) {
125
54.1M
      if (c == '\r' || c == '\n')
126
671k
  comment = gFalse;
127
755M
    } else if (c == '%') {
128
675k
      comment = gTrue;
129
754M
    } else if (specialChars[c] != 1) {
130
118M
      break;
131
118M
    }
132
809M
  }
133
134
  // start reading token
135
118M
  switch (c) {
136
137
  // number
138
21.3M
  case '0': case '1': case '2': case '3': case '4':
139
31.8M
  case '5': case '6': case '7': case '8': case '9':
140
34.7M
  case '+': case '-': case '.':
141
    // Adobe's number lexer has some "interesting" behavior:
142
    // "--123" is interpreted as 0
143
    // "--123.4" is interpreted as -123.4 [I've seen this in the wild]
144
    // "50-100" is interpreted as 50 [I've seen this in the wild]
145
    // "50--100" is interpreted as 50
146
    // "50-100.0" is an error -- but older versions of Acrobat may
147
    //   have interpreted it as 50100.0 (?)
148
    // "50--100.0" is an error -- but older versions of Acrobat may
149
    //   have interpreted it as 50100.0 (?)
150
    // "50.0-100" is interpreted as 50.0 (or maybe 50.0100?)
151
    // "50.0--100" is interpreted as 50.0 (or maybe 50.0100?)
152
    // "-50-100" is interpreted as -50
153
    // "-" is interpreted as 0
154
    // "-." is interpreted as 0.0
155
34.7M
    neg = gFalse;
156
34.7M
    doubleMinus = gFalse;
157
34.7M
    xf = xi = 0;
158
34.7M
    if (c == '+') {
159
      // just ignore it
160
34.7M
    } else if (c == '-') {
161
1.80M
      neg = gTrue;
162
1.80M
      if (lookChar() == '-') {
163
20.9k
  doubleMinus = gTrue;
164
35.3k
  do {
165
35.3k
    getChar();
166
35.3k
  } while (lookChar() == '-');
167
20.9k
      }
168
32.9M
    } else if (c == '.') {
169
1.04M
      goto doReal;
170
31.8M
    } else {
171
31.8M
      xf = xi = c - '0';
172
31.8M
    }
173
84.7M
    while (1) {
174
84.7M
      c = lookChar();
175
84.7M
      if (isdigit(c)) {
176
51.0M
  getChar();
177
51.0M
  xi = xi * 10 + (c - '0');
178
51.0M
  if (xf < 1e20) {
179
43.0M
    xf = xf * 10 + (c - '0');
180
43.0M
  }
181
51.0M
      } else if (c == '.') {
182
4.84M
  getChar();
183
4.84M
  goto doReal;
184
28.8M
      } else {
185
28.8M
  break;
186
28.8M
      }
187
84.7M
    }
188
29.2M
    while ((c = lookChar()) == '-' || isdigit(c)) {
189
324k
      getChar();
190
324k
    }
191
28.8M
    if (neg) {
192
958k
      xi = -xi;
193
958k
    }
194
28.8M
    if (doubleMinus) {
195
17.4k
      xi = 0;
196
17.4k
    }
197
28.8M
    obj->initInt(xi);
198
28.8M
    break;
199
5.89M
  doReal:
200
5.89M
    scale = 0.1;
201
18.1M
    while (1) {
202
18.1M
      c = lookChar();
203
18.1M
      if (c == '-') {
204
36.2k
  error(errSyntaxWarning, getPos(), "Badly formatted number");
205
36.2k
  getChar();
206
36.2k
  continue;
207
36.2k
      }
208
18.0M
      if (!isdigit(c)) {
209
5.89M
  break;
210
5.89M
      }
211
12.1M
      getChar();
212
12.1M
      xf = xf + scale * (c - '0');
213
12.1M
      scale *= 0.1;
214
12.1M
    }
215
5.89M
    while ((c = lookChar()) == '-' || isdigit(c)) {
216
0
      getChar();
217
0
    }
218
5.89M
    if (neg) {
219
850k
      xf = -xf;
220
850k
    }
221
5.89M
    obj->initReal(xf);
222
5.89M
    break;
223
224
  // string
225
722k
  case '(':
226
722k
    p = tokBuf;
227
722k
    n = 0;
228
722k
    numParen = 1;
229
722k
    done = gFalse;
230
722k
    s = NULL;
231
437M
    do {
232
437M
      c2 = EOF;
233
437M
      switch (c = getChar()) {
234
235
21.8k
      case EOF:
236
21.8k
  error(errSyntaxError, getPos(), "Unterminated string");
237
21.8k
  done = gTrue;
238
21.8k
  break;
239
240
1.12M
      case '(':
241
1.12M
  ++numParen;
242
1.12M
  c2 = c;
243
1.12M
  break;
244
245
1.55M
      case ')':
246
1.55M
  if (--numParen == 0) {
247
700k
    done = gTrue;
248
858k
  } else {
249
858k
    c2 = c;
250
858k
  }
251
1.55M
  break;
252
253
8.40M
      case '\r':
254
  // The PDF spec says that any literal end-of-line sequence
255
  // (LF, CR, CR+LF) is translated to a single LF char.
256
8.40M
  c = lookChar();
257
8.40M
  if (c == '\n') {
258
167k
    getChar();
259
167k
  }
260
8.40M
  c2 = '\n';
261
8.40M
  break;
262
263
541k
      case '\\':
264
541k
  switch (c = getChar()) {
265
6.75k
  case 'n':
266
6.75k
    c2 = '\n';
267
6.75k
    break;
268
67.6k
  case 'r':
269
67.6k
    c2 = '\r';
270
67.6k
    break;
271
24.8k
  case 't':
272
24.8k
    c2 = '\t';
273
24.8k
    break;
274
2.16k
  case 'b':
275
2.16k
    c2 = '\b';
276
2.16k
    break;
277
813
  case 'f':
278
813
    c2 = '\f';
279
813
    break;
280
36.9k
  case '\\':
281
76.8k
  case '(':
282
120k
  case ')':
283
120k
    c2 = c;
284
120k
    break;
285
90.7k
  case '0': case '1': case '2': case '3':
286
126k
  case '4': case '5': case '6': case '7':
287
126k
    c2 = c - '0';
288
126k
    c = lookChar();
289
126k
    if (c >= '0' && c <= '7') {
290
87.3k
      getChar();
291
87.3k
      c2 = (c2 << 3) + (c - '0');
292
87.3k
      c = lookChar();
293
87.3k
      if (c >= '0' && c <= '7') {
294
72.0k
        getChar();
295
72.0k
        c2 = (c2 << 3) + (c - '0');
296
72.0k
      }
297
87.3k
    }
298
126k
    break;
299
3.85k
  case '\r':
300
3.85k
    c = lookChar();
301
3.85k
    if (c == '\n') {
302
1.14k
      getChar();
303
1.14k
    }
304
3.85k
    break;
305
8.22k
  case '\n':
306
8.22k
    break;
307
383
  case EOF:
308
383
    error(errSyntaxError, getPos(), "Unterminated string");
309
383
    done = gTrue;
310
383
    break;
311
179k
  default:
312
179k
    c2 = c;
313
179k
    break;
314
541k
  }
315
541k
  break;
316
317
425M
      default:
318
425M
  c2 = c;
319
425M
  break;
320
437M
      }
321
322
437M
      if (c2 != EOF) {
323
436M
  if (n == tokBufSize) {
324
3.26M
    if (!s)
325
106k
      s = new GString(tokBuf, tokBufSize);
326
3.16M
    else
327
3.16M
      s->append(tokBuf, tokBufSize);
328
3.26M
    p = tokBuf;
329
3.26M
    n = 0;
330
3.26M
  }
331
436M
  *p++ = (char)c2;
332
436M
  ++n;
333
436M
      }
334
437M
    } while (!done);
335
722k
    if (!s)
336
616k
      s = new GString(tokBuf, n);
337
106k
    else
338
106k
      s->append(tokBuf, n);
339
722k
    obj->initString(s);
340
722k
    break;
341
342
  // name
343
10.5M
  case '/':
344
10.5M
    p = tokBuf;
345
10.5M
    n = 0;
346
10.5M
    s = NULL;
347
10.5M
    invalid = gFalse;
348
57.6M
    while ((c = lookChar()) != EOF && !specialChars[c]) {
349
47.0M
      getChar();
350
47.0M
      if (c == '#') {
351
840k
  c2 = lookChar();
352
840k
  if (c2 >= '0' && c2 <= '9') {
353
2.41k
    c = c2 - '0';
354
838k
  } else if (c2 >= 'A' && c2 <= 'F') {
355
47.6k
    c = c2 - 'A' + 10;
356
790k
  } else if (c2 >= 'a' && c2 <= 'f') {
357
98.6k
    c = c2 - 'a' + 10;
358
692k
  } else {
359
692k
    error(errSyntaxError, getPos(), "Invalid hex escape in name");
360
692k
    goto notEscChar;
361
692k
  }
362
148k
  getChar();
363
148k
  c2 = lookChar();
364
148k
  if (c2 >= '0' && c2 <= '9') {
365
27.8k
    c = (c << 4) + (c2 - '0');
366
120k
  } else if (c2 >= 'A' && c2 <= 'F') {
367
47.2k
    c = (c << 4) + (c2 - 'A' + 10);
368
73.7k
  } else if (c2 >= 'a' && c2 <= 'f') {
369
1.05k
    c = (c << 4) + (c2 - 'a' + 10);
370
72.6k
  } else {
371
72.6k
    error(errSyntaxError, getPos(), "Invalid hex escape in name");
372
72.6k
    goto notEscChar;
373
72.6k
  }
374
76.0k
  getChar();
375
76.0k
  if (c == 0) {
376
501
    invalid = gTrue;
377
501
  }
378
76.0k
      }
379
47.0M
     notEscChar:
380
      // the PDF spec claims that names are limited to 127 chars, but
381
      // Distiller 8 will produce longer names, and Acrobat 8 will
382
      // accept longer names
383
47.0M
      ++n;
384
47.0M
      if (n < tokBufSize) {
385
41.7M
  *p++ = (char)c;
386
41.7M
      } else if (n == tokBufSize) {
387
9.24k
  *p = (char)c;
388
9.24k
  s = new GString(tokBuf, n);
389
5.31M
      } else {
390
5.31M
  s->append((char)c);
391
5.31M
      }
392
47.0M
    }
393
10.5M
    if (invalid) {
394
501
      error(errSyntaxError, getPos(), "Null character in name");
395
501
      obj->initError();
396
501
      if (s) {
397
142
  delete s;
398
142
      }
399
10.5M
    } else if (n < tokBufSize) {
400
10.5M
      *p = '\0';
401
10.5M
      obj->initName(tokBuf);
402
10.5M
    } else {
403
9.09k
      obj->initName(s->getCString());
404
9.09k
      delete s;
405
9.09k
    }
406
10.5M
    break;
407
408
  // array punctuation
409
10.3M
  case '[':
410
11.1M
  case ']':
411
11.1M
    tokBuf[0] = (char)c;
412
11.1M
    tokBuf[1] = '\0';
413
11.1M
    obj->initCmd(tokBuf);
414
11.1M
    break;
415
416
  // hex string or dict punctuation
417
19.0M
  case '<':
418
19.0M
    c = lookChar();
419
420
    // dict punctuation
421
19.0M
    if (c == '<') {
422
3.82M
      getChar();
423
3.82M
      tokBuf[0] = tokBuf[1] = '<';
424
3.82M
      tokBuf[2] = '\0';
425
3.82M
      obj->initCmd(tokBuf);
426
427
    // hex string
428
15.2M
    } else {
429
15.2M
      p = tokBuf;
430
15.2M
      m = n = 0;
431
15.2M
      c2 = 0;
432
15.2M
      s = NULL;
433
15.2M
      nErrors = 0;
434
102M
      while (nErrors < 100) {
435
102M
  c = getChar();
436
102M
  if (c == '>') {
437
15.0M
    break;
438
87.3M
  } else if (c == EOF) {
439
6.81k
    error(errSyntaxError, getPos(), "Unterminated hex string");
440
6.81k
    break;
441
87.3M
  } else if (specialChars[c] != 1) {
442
69.6M
    c2 = c2 << 4;
443
69.6M
    if (c >= '0' && c <= '9') {
444
9.99M
      c2 += c - '0';
445
59.6M
    } else if (c >= 'A' && c <= 'F') {
446
1.13M
      c2 += c - 'A' + 10;
447
58.4M
    } else if (c >= 'a' && c <= 'f') {
448
21.1M
      c2 += c - 'a' + 10;
449
37.3M
    } else {
450
37.3M
      error(errSyntaxError, getPos(),
451
37.3M
      "Illegal character <{0:02x}> in hex string", c);
452
37.3M
      ++nErrors;
453
37.3M
    }
454
69.6M
    if (++m == 2) {
455
27.5M
      if (n == tokBufSize) {
456
11.9k
        if (!s)
457
8.50k
    s = new GString(tokBuf, tokBufSize);
458
3.47k
        else
459
3.47k
    s->append(tokBuf, tokBufSize);
460
11.9k
        p = tokBuf;
461
11.9k
        n = 0;
462
11.9k
      }
463
27.5M
      *p++ = (char)c2;
464
27.5M
      ++n;
465
27.5M
      c2 = 0;
466
27.5M
      m = 0;
467
27.5M
    }
468
69.6M
  }
469
102M
      }
470
15.2M
      if (!s)
471
15.2M
  s = new GString(tokBuf, n);
472
8.50k
      else
473
8.50k
  s->append(tokBuf, n);
474
15.2M
      if (m == 1)
475
14.6M
  s->append((char)(c2 << 4));
476
15.2M
      obj->initString(s);
477
15.2M
    }
478
19.0M
    break;
479
480
  // dict punctuation
481
4.98M
  case '>':
482
4.98M
    c = lookChar();
483
4.98M
    if (c == '>') {
484
2.46M
      getChar();
485
2.46M
      tokBuf[0] = tokBuf[1] = '>';
486
2.46M
      tokBuf[2] = '\0';
487
2.46M
      obj->initCmd(tokBuf);
488
2.52M
    } else {
489
2.52M
      error(errSyntaxError, getPos(), "Illegal character '>'");
490
2.52M
      obj->initError();
491
2.52M
    }
492
4.98M
    break;
493
494
  // error
495
1.55M
  case ')':
496
2.20M
  case '{':
497
2.34M
  case '}':
498
2.34M
    error(errSyntaxError, getPos(), "Illegal character '{0:c}'", c);
499
2.34M
    obj->initError();
500
2.34M
    break;
501
502
  // command
503
34.4M
  default:
504
34.4M
    p = tokBuf;
505
34.4M
    *p++ = (char)c;
506
34.4M
    n = 1;
507
294M
    while ((c = lookChar()) != EOF && !specialChars[c]) {
508
260M
      getChar();
509
260M
      if (++n == tokBufSize) {
510
1.07M
  error(errSyntaxError, getPos(), "Command token too long");
511
1.07M
  break;
512
1.07M
      }
513
259M
      *p++ = (char)c;
514
259M
    }
515
34.4M
    *p = '\0';
516
34.4M
    if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) {
517
153k
      obj->initBool(gTrue);
518
34.2M
    } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) {
519
9.82k
      obj->initBool(gFalse);
520
34.2M
    } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) {
521
16.4k
      obj->initNull();
522
34.2M
    } else {
523
34.2M
      obj->initCmd(tokBuf);
524
34.2M
    }
525
34.4M
    break;
526
118M
  }
527
528
118M
  return obj;
529
118M
}
530
531
198k
void Lexer::skipToNextLine() {
532
198k
  int c;
533
534
611k
  while (1) {
535
611k
    c = getChar();
536
611k
    if (c == EOF || c == '\n') {
537
97.9k
      return;
538
97.9k
    }
539
513k
    if (c == '\r') {
540
100k
      if ((c = lookChar()) == '\n') {
541
89.2k
  getChar();
542
89.2k
      }
543
100k
      return;
544
100k
    }
545
513k
  }
546
198k
}
547
548
61.3k
void Lexer::skipToEOF() {
549
3.33M
  while (getChar() != EOF) ;
550
61.3k
}
551
552
196M
GBool Lexer::isSpace(int c) {
553
196M
  return c >= 0 && c <= 0xff && specialChars[c] == 1;
554
196M
}