Coverage Report

Created: 2025-06-13 07:02

/src/tesseract/src/ccutil/scanutils.cpp
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2006 Google Inc.
2
// All Rights Reserved.
3
// Author: renn
4
//
5
// Licensed under the Apache License, Version 2.0 (the "License");
6
// you may not use this file except in compliance with the License.
7
// You may obtain a copy of the License at
8
// http://www.apache.org/licenses/LICENSE-2.0
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
#ifdef HAVE_CONFIG_H
16
#  include "config_auto.h"
17
#endif
18
19
#include <cctype>
20
#include <climits> // for CHAR_BIT
21
#include <cmath>
22
#include <cstdarg>
23
#include <cstddef>
24
#include <cstdint>
25
#include <cstdio>
26
#include <cstring>
27
#include <limits> // for std::numeric_limits
28
29
#include "scanutils.h"
30
31
// Bit flags that describe the conversion specification currently being parsed.
// Each enumerator occupies its own bit so the flags can be OR-ed together.
enum Flags {
  FL_SPLAT = 1 << 0, // '*' was given: drop the value, do not assign
  FL_INV = 1 << 1,   // Character set with inverse
  FL_WIDTH = 1 << 2, // An explicit field width was specified
  FL_MINUS = 1 << 3, // Negative number
};
37
38
// Size ranks of the integer types a conversion can store into. The integer
// ranks are consecutive, so a length modifier can move one step narrower or
// wider by decrementing or incrementing the rank.
enum Ranks {
  RANK_CHAR = -2,
  RANK_SHORT = RANK_CHAR + 1,
  RANK_INT = RANK_SHORT + 1,
  RANK_LONG = RANK_INT + 1,
  RANK_LONGLONG = RANK_LONG + 1,
  RANK_PTR = std::numeric_limits<int>::max() // Special value used for pointers
};
46
47
const enum Ranks kMinRank = RANK_CHAR;
48
const enum Ranks kMaxRank = RANK_LONGLONG;
49
50
const enum Ranks kIntMaxRank = RANK_LONGLONG;
51
const enum Ranks kSizeTRank = RANK_LONG;
52
const enum Ranks kPtrDiffRank = RANK_LONG;
53
54
// Reason why format processing stopped early. BAIL_NONE is zero so a Bail
// value can be tested directly as a boolean.
enum Bail {
  BAIL_NONE = 0, // No error condition
  BAIL_EOF = 1,  // Hit EOF
  BAIL_ERR = 2   // Conversion mismatch
};
59
60
// Helper functions ------------------------------------------------------------
61
0
// Returns the number of bits in an object of type long.
inline size_t LongBit() {
  return sizeof(long) * CHAR_BIT;
}
64
65
0
// Reads characters from s until one is found that is not ASCII whitespace,
// hands that character to ungetc(), and returns it. The returned value is
// whatever the last fgetc() produced, so it is EOF once the input is exhausted.
static inline int SkipSpace(FILE *s) {
  int next = fgetc(s);
  while (isascii(next) && isspace(next)) {
    next = fgetc(s);
  }
  ungetc(next, s); // Keep the terminating character available for the caller
  return next;
}
73
74
0
static inline void SetBit(unsigned long *bitmap, unsigned int bit) {
75
0
  bitmap[bit / LongBit()] |= 1UL << (bit % LongBit());
76
0
}
77
78
0
static inline int TestBit(unsigned long *bitmap, unsigned int bit) {
79
0
  return static_cast<int>(bitmap[bit / LongBit()] >> (bit % LongBit())) & 1;
80
0
}
81
82
0
// Returns the numeric value of the character ch when read as a digit in the
// given base, or -1 if ch is not a digit of that base.
//
// Decimal digits '0'..'9' are accepted when base >= 10; for any smaller base
// only the octal digits '0'..'7' are accepted. Letters are accepted for base 16
// only, and only 'A'..'F' / 'a'..'f', so that no value >= 16 is ever returned
// for a hexadecimal number.
static inline int DigitValue(int ch, int base) {
  if (ch >= '0' && ch <= '9') {
    if (base >= 10 || ch <= '7') {
      return ch - '0';
    }
  } else if (base == 16) {
    if (ch >= 'A' && ch <= 'F') {
      return ch - 'A' + 10;
    }
    if (ch >= 'a' && ch <= 'f') {
      return ch - 'a' + 10;
    }
  }
  return -1;
}
94
95
// IO (re-)implementations -----------------------------------------------------
96
0
// Reads an optionally signed integer from s and returns it as a uintmax_t.
//
// Leading ASCII whitespace is skipped and a single '+' or '-' is accepted.
// base is the numeric base of the digits. If base is 0 it is derived from the
// input: a "0x"/"0X" prefix selects 16, any other leading '0' selects 8, and
// everything else is read as decimal. If base is 16 an optional "0x"/"0X"
// prefix is skipped.
//
// A leading '-' negates the result in unsigned arithmetic. The first character
// that is not part of the number is handed back to the stream with ungetc().
// Overflow is not detected, and 0 is returned when no digit was read.
static uintmax_t streamtoumax(FILE *s, int base) {
  int minus = 0;
  uintmax_t v = 0;
  int d, c = 0;

  for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s)) {
    ;
  }

  // Single optional + or -
  if (c == '-' || c == '+') {
    minus = (c == '-');
    c = fgetc(s);
  }

  // Assign correct base
  if (base == 0) {
    if (c == '0') {
      c = fgetc(s);
      if (c == 'x' || c == 'X') {
        base = 16;
        c = fgetc(s);
      } else {
        base = 8;
      }
    } else {
      // No prefix: the number is decimal. Leaving base at 0 would make the
      // accumulation below compute v * 0 + d and keep only the last digit.
      base = 10;
    }
  } else if (base == 16) {
    if (c == '0') {
      c = fgetc(s);
      if (c == 'x' || c == 'X') {
        c = fgetc(s);
      }
    }
  }

  // Actual number parsing
  for (; (c != EOF) && (d = DigitValue(c, base)) >= 0; c = fgetc(s)) {
    v = v * base + d;
  }

  ungetc(c, s);
  return minus ? -v : v;
}
139
140
0
// Reads a decimal floating point number from s and returns it as a double.
//
// Accepted syntax: optional ASCII whitespace, an optional '+' or '-', decimal
// digits, an optional '.' followed by decimal digits, and an optional exponent
// ('e' or 'E', optional sign, decimal digits). The first character that is not
// part of the number is handed back to the stream with ungetc().
//
// The integer and fractional parts are accumulated in 64-bit integers. Digits
// that no longer fit are still consumed: surplus integer digits only scale the
// result by ten, surplus fractional digits are ignored, and the exponent stops
// growing once it is far outside the range of a double. This keeps very long
// digit strings from wrapping the accumulators or overflowing a signed int.
static double streamtofloat(FILE *s) {
  const uint64_t kMaxValue = std::numeric_limits<uint64_t>::max();
  const int kExponentLimit = 100000; // Any exponent this large already gives inf or 0
  bool minus = false;
  uint64_t v = 0;  // Integer part
  int dropped = 0; // Integer digits that did not fit into v
  int d, c;
  uint64_t k = 1;  // Power of ten matching the number of digits held in w
  uint64_t w = 0;  // Fractional digits read as an integer

  for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s)) {
    ;
  }

  // Single optional + or -
  if (c == '-' || c == '+') {
    minus = (c == '-');
    c = fgetc(s);
  }

  // Actual number parsing
  for (; c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
    if (dropped == 0 && v <= (kMaxValue - static_cast<uint64_t>(d)) / 10) {
      v = v * 10 + d;
    } else if (dropped < kExponentLimit) {
      // v cannot take another digit; remember only the order of magnitude.
      dropped++;
    }
  }
  if (c == '.') {
    for (c = fgetc(s); c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
      if (k <= kMaxValue / 10) {
        w = w * 10 + d;
        k *= 10;
      }
      // Otherwise the digit is far below the precision of a double; skip it.
    }
  }
  double f = static_cast<double>(v);
  if (dropped > 0) {
    f *= pow(10.0, static_cast<double>(dropped));
  }
  f += static_cast<double>(w) / k;
  if (c == 'e' || c == 'E') {
    c = fgetc(s);
    int expsign = 1;
    if (c == '-' || c == '+') {
      expsign = (c == '-') ? -1 : 1;
      c = fgetc(s);
    }
    int exponent = 0;
    for (; (c != EOF) && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
      if (exponent < kExponentLimit) {
        exponent = exponent * 10 + d;
      }
    }
    exponent *= expsign;
    f *= pow(10.0, static_cast<double>(exponent));
  }
  ungetc(c, s);

  return minus ? -f : f;
}
186
187
static int tvfscanf(FILE *stream, const char *format, va_list ap);
188
189
0
int tfscanf(FILE *stream, const char *format, ...) {
190
0
  va_list ap;
191
0
  int rv;
192
193
0
  va_start(ap, format);
194
0
  rv = tvfscanf(stream, format, ap);
195
0
  va_end(ap);
196
197
0
  return rv;
198
0
}
199
200
0
static int tvfscanf(FILE *stream, const char *format, va_list ap) {
201
0
  const char *p = format;
202
0
  char ch;
203
0
  int q = 0;
204
0
  uintmax_t val = 0;
205
0
  int rank = RANK_INT; // Default rank
206
0
  unsigned int width = UINT_MAX;
207
0
  int base;
208
0
  int flags = 0;
209
0
  enum {
210
0
    ST_NORMAL,      // Ground state
211
0
    ST_FLAGS,       // Special flags
212
0
    ST_WIDTH,       // Field width
213
0
    ST_MODIFIERS,   // Length or conversion modifiers
214
0
    ST_MATCH_INIT,  // Initial state of %[ sequence
215
0
    ST_MATCH,       // Main state of %[ sequence
216
0
    ST_MATCH_RANGE, // After - in a %[ sequence
217
0
  } state = ST_NORMAL;
218
0
  char *sarg = nullptr; // %s %c or %[ string argument
219
0
  enum Bail bail = BAIL_NONE;
220
0
  int converted = 0; // Successful conversions
221
0
  unsigned long
222
0
      matchmap[((1 << CHAR_BIT) + (CHAR_BIT * sizeof(long) - 1)) / (CHAR_BIT * sizeof(long))];
223
0
  int matchinv = 0; // Is match map inverted?
224
0
  unsigned char range_start = 0;
225
0
  auto start_off = std::ftell(stream);
226
227
  // Skip leading spaces
228
0
  SkipSpace(stream);
229
230
0
  while ((ch = *p++) && !bail) {
231
0
    switch (state) {
232
0
      case ST_NORMAL:
233
0
        if (ch == '%') {
234
0
          state = ST_FLAGS;
235
0
          flags = 0;
236
0
          rank = RANK_INT;
237
0
          width = UINT_MAX;
238
0
        } else if (isascii(ch) && isspace(ch)) {
239
0
          SkipSpace(stream);
240
0
        } else {
241
0
          if (fgetc(stream) != ch) {
242
0
            bail = BAIL_ERR; // Match failure
243
0
          }
244
0
        }
245
0
        break;
246
247
0
      case ST_FLAGS:
248
0
        if (ch == '*') {
249
0
          flags |= FL_SPLAT;
250
0
        } else if ('0' <= ch && ch <= '9') {
251
0
          width = (ch - '0');
252
0
          state = ST_WIDTH;
253
0
          flags |= FL_WIDTH;
254
0
        } else {
255
0
          state = ST_MODIFIERS;
256
0
          p--; // Process this character again
257
0
        }
258
0
        break;
259
260
0
      case ST_WIDTH:
261
0
        if (ch >= '0' && ch <= '9') {
262
0
          width = width * 10 + (ch - '0');
263
0
        } else {
264
0
          state = ST_MODIFIERS;
265
0
          p--; // Process this character again
266
0
        }
267
0
        break;
268
269
0
      case ST_MODIFIERS:
270
0
        switch (ch) {
271
          // Length modifiers - nonterminal sequences
272
0
          case 'h':
273
0
            rank--; // Shorter rank
274
0
            break;
275
0
          case 'l':
276
0
            rank++; // Longer rank
277
0
            break;
278
0
          case 'j':
279
0
            rank = kIntMaxRank;
280
0
            break;
281
0
          case 'z':
282
0
            rank = kSizeTRank;
283
0
            break;
284
0
          case 't':
285
0
            rank = kPtrDiffRank;
286
0
            break;
287
0
          case 'L':
288
0
          case 'q':
289
0
            rank = RANK_LONGLONG; // long double/long long
290
0
            break;
291
292
0
          default:
293
            // Output modifiers - terminal sequences
294
0
            state = ST_NORMAL;   // Next state will be normal
295
0
            if (rank < kMinRank) { // Canonicalize rank
296
0
              rank = kMinRank;
297
0
            } else if (rank > kMaxRank) {
298
0
              rank = kMaxRank;
299
0
            }
300
301
0
            switch (ch) {
302
0
              case 'P': // Upper case pointer
303
0
              case 'p': // Pointer
304
0
                rank = RANK_PTR;
305
0
                base = 0;
306
0
                goto scan_int;
307
308
0
              case 'i': // Base-independent integer
309
0
                base = 0;
310
0
                goto scan_int;
311
312
0
              case 'd': // Decimal integer
313
0
                base = 10;
314
0
                goto scan_int;
315
316
0
              case 'o': // Octal integer
317
0
                base = 8;
318
0
                goto scan_int;
319
320
0
              case 'u': // Unsigned decimal integer
321
0
                base = 10;
322
0
                goto scan_int;
323
324
0
              case 'x': // Hexadecimal integer
325
0
              case 'X':
326
0
                base = 16;
327
0
                goto scan_int;
328
329
0
              case 'n': // Number of characters consumed
330
0
                val = std::ftell(stream) - start_off;
331
0
                goto set_integer;
332
333
0
              scan_int:
334
0
                q = SkipSpace(stream);
335
0
                if (q <= 0) {
336
0
                  bail = BAIL_EOF;
337
0
                  break;
338
0
                }
339
0
                val = streamtoumax(stream, base);
340
                // fall through
341
342
0
              set_integer:
343
0
                if (!(flags & FL_SPLAT)) {
344
0
                  converted++;
345
0
                  switch (rank) {
346
0
                    case RANK_CHAR:
347
0
                      *va_arg(ap, unsigned char *) = static_cast<unsigned char>(val);
348
0
                      break;
349
0
                    case RANK_SHORT:
350
0
                      *va_arg(ap, unsigned short *) = static_cast<unsigned short>(val);
351
0
                      break;
352
0
                    case RANK_INT:
353
0
                      *va_arg(ap, unsigned int *) = static_cast<unsigned int>(val);
354
0
                      break;
355
0
                    case RANK_LONG:
356
0
                      *va_arg(ap, unsigned long *) = static_cast<unsigned long>(val);
357
0
                      break;
358
0
                    case RANK_LONGLONG:
359
0
                      *va_arg(ap, unsigned long long *) = static_cast<unsigned long long>(val);
360
0
                      break;
361
0
                    case RANK_PTR:
362
0
                      *va_arg(ap, void **) = reinterpret_cast<void *>(static_cast<uintptr_t>(val));
363
0
                      break;
364
0
                  }
365
0
                }
366
0
                break;
367
368
0
              case 'f': // Preliminary float value parsing
369
0
              case 'g':
370
0
              case 'G':
371
0
              case 'e':
372
0
              case 'E':
373
0
                q = SkipSpace(stream);
374
0
                if (q <= 0) {
375
0
                  bail = BAIL_EOF;
376
0
                  break;
377
0
                }
378
379
0
                {
380
0
                  double fval = streamtofloat(stream);
381
0
                  if (!(flags & FL_SPLAT)) {
382
0
                    if (rank == RANK_INT) {
383
0
                      *va_arg(ap, float *) = static_cast<float>(fval);
384
0
                    } else if (rank == RANK_LONG) {
385
0
                      *va_arg(ap, double *) = static_cast<double>(fval);
386
0
                    }
387
0
                    converted++;
388
0
                  }
389
0
                }
390
0
                break;
391
392
0
              case 'c':                                 // Character
393
0
                width = (flags & FL_WIDTH) ? width : 1; // Default width == 1
394
0
                sarg = va_arg(ap, char *);
395
0
                while (width--) {
396
0
                  if ((q = fgetc(stream)) <= 0) {
397
0
                    bail = BAIL_EOF;
398
0
                    break;
399
0
                  }
400
0
                  if (!(flags & FL_SPLAT)) {
401
0
                    *sarg++ = q;
402
0
                    converted++;
403
0
                  }
404
0
                }
405
0
                break;
406
407
0
              case 's': // String
408
0
              {
409
0
                if (!(flags & FL_SPLAT)) {
410
0
                  sarg = va_arg(ap, char *);
411
0
                }
412
0
                unsigned length = 0;
413
0
                while (width--) {
414
0
                  q = fgetc(stream);
415
0
                  if ((isascii(q) && isspace(q)) || (q <= 0)) {
416
0
                    ungetc(q, stream);
417
0
                    break;
418
0
                  }
419
0
                  if (!(flags & FL_SPLAT)) {
420
0
                    sarg[length] = q;
421
0
                  }
422
0
                  length++;
423
0
                }
424
0
                if (length == 0) {
425
0
                  bail = BAIL_EOF;
426
0
                } else if (!(flags & FL_SPLAT)) {
427
0
                  sarg[length] = '\0'; // Terminate output
428
0
                  converted++;
429
0
                }
430
0
              } break;
431
432
0
              case '[': // Character range
433
0
                sarg = va_arg(ap, char *);
434
0
                state = ST_MATCH_INIT;
435
0
                matchinv = 0;
436
0
                memset(matchmap, 0, sizeof matchmap);
437
0
                break;
438
439
0
              case '%': // %% sequence
440
0
                if (fgetc(stream) != '%') {
441
0
                  bail = BAIL_ERR;
442
0
                }
443
0
                break;
444
445
0
              default:           // Anything else
446
0
                bail = BAIL_ERR; // Unknown sequence
447
0
                break;
448
0
            }
449
0
        }
450
0
        break;
451
452
0
      case ST_MATCH_INIT: // Initial state for %[ match
453
0
        if (ch == '^' && !(flags & FL_INV)) {
454
0
          matchinv = 1;
455
0
        } else {
456
0
          SetBit(matchmap, static_cast<unsigned char>(ch));
457
0
          state = ST_MATCH;
458
0
        }
459
0
        break;
460
461
0
      case ST_MATCH: // Main state for %[ match
462
0
        if (ch == ']') {
463
0
          goto match_run;
464
0
        } else if (ch == '-') {
465
0
          range_start = static_cast<unsigned char>(ch);
466
0
          state = ST_MATCH_RANGE;
467
0
        } else {
468
0
          SetBit(matchmap, static_cast<unsigned char>(ch));
469
0
        }
470
0
        break;
471
472
0
      case ST_MATCH_RANGE: // %[ match after -
473
0
        if (ch == ']') {
474
0
          SetBit(matchmap, static_cast<unsigned char>('-'));
475
0
          goto match_run;
476
0
        } else {
477
0
          int i;
478
0
          for (i = range_start; i < (static_cast<unsigned char>(ch)); i++) {
479
0
            SetBit(matchmap, i);
480
0
          }
481
0
          state = ST_MATCH;
482
0
        }
483
0
        break;
484
485
0
      match_run: // Match expression finished
486
0
        char *oarg = sarg;
487
0
        while (width) {
488
0
          q = fgetc(stream);
489
0
          auto qc = static_cast<unsigned char>(q);
490
0
          if (q <= 0 || !(TestBit(matchmap, qc) ^ matchinv)) {
491
0
            ungetc(q, stream);
492
0
            break;
493
0
          }
494
0
          if (!(flags & FL_SPLAT)) {
495
0
            *sarg = q;
496
0
          }
497
0
          sarg++;
498
0
        }
499
0
        if (oarg == sarg) {
500
0
          bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;
501
0
        } else if (!(flags & FL_SPLAT)) {
502
0
          *sarg = '\0';
503
0
          converted++;
504
0
        }
505
0
        break;
506
0
    }
507
0
  }
508
509
0
  if (bail == BAIL_EOF && !converted) {
510
0
    converted = -1; // Return EOF (-1)
511
0
  }
512
513
0
  return converted;
514
0
}