Coverage Report

Created: 2023-06-07 06:17

/src/CMake/Utilities/cmexpat/lib/xmltok.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
                            __  __            _
3
                         ___\ \/ /_ __   __ _| |_
4
                        / _ \\  /| '_ \ / _` | __|
5
                       |  __//  \| |_) | (_| | |_
6
                        \___/_/\_\ .__/ \__,_|\__|
7
                                 |_| XML parser
8
9
   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10
   Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11
   Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12
   Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
13
   Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14
   Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15
   Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
16
   Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
17
   Copyright (c) 2016      Don Lewis <truckman@apache.org>
18
   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
19
   Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
20
   Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
21
   Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
22
   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
23
   Copyright (c) 2021      Dong-hee Na <donghee.na@python.org>
24
   Licensed under the MIT license:
25
26
   Permission is  hereby granted,  free of charge,  to any  person obtaining
27
   a  copy  of  this  software   and  associated  documentation  files  (the
28
   "Software"),  to  deal in  the  Software  without restriction,  including
29
   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
30
   distribute, sublicense, and/or sell copies of the Software, and to permit
31
   persons  to whom  the Software  is  furnished to  do so,  subject to  the
32
   following conditions:
33
34
   The above copyright  notice and this permission notice  shall be included
35
   in all copies or substantial portions of the Software.
36
37
   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
38
   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
39
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
40
   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
41
   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
42
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
43
   USE OR OTHER DEALINGS IN THE SOFTWARE.
44
*/
45
46
#include <expat_config.h>
47
48
#include <stddef.h>
49
#include <string.h> /* memcpy */
50
#include <stdbool.h>
51
52
#ifdef _WIN32
53
#  include "winconfig.h"
54
#endif
55
56
#include "expat_external.h"
57
#include "internal.h"
58
#include "xmltok.h"
59
#include "nametab.h"
60
61
#ifdef XML_DTD
62
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
63
#else
64
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
65
#endif
66
67
#define VTABLE1                                                                \
68
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
69
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
70
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
71
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
72
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
73
      PREFIX(updatePosition), PREFIX(isPublicId)
74
75
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
76
77
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
78
2.55k
  (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F)))
79
80
/* A 2 byte UTF-8 representation splits the character's 11 bits between
81
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
82
   pages, 3 bits to add to that index and 5 bits to generate the mask.
83
*/
84
#define UTF8_GET_NAMING2(pages, byte)                                          \
85
8.29M
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
86
8.29M
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
87
8.29M
   & (1u << (((byte)[1]) & 0x1F)))
88
89
/* A 3 byte UTF-8 representation splits the character's 16 bits between
90
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
91
   into pages, 3 bits to add to that index and 5 bits to generate the
92
   mask.
93
*/
94
#define UTF8_GET_NAMING3(pages, byte)                                          \
95
11.4k
  (namingBitmap                                                                \
96
11.4k
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
97
11.4k
         << 3)                                                                 \
98
11.4k
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
99
11.4k
   & (1u << (((byte)[2]) & 0x1F)))
100
101
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
102
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
103
   with the additional restriction of not allowing the Unicode
104
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
105
   Implementation details:
106
     (A & 0x80) == 0     means A < 0x80
107
   and
108
     (A & 0xC0) == 0xC0  means A > 0xBF
109
*/
110
111
/* Non-zero when the two bytes at p do not form a valid 2-byte UTF-8
   sequence: the lead byte is below 0xC2 (overlong), or the second byte is
   not a continuation byte (10xxxxxx).  p must point to unsigned char.
   Every use of the argument is parenthesised so that expressions such as
   `buf + 1` can be passed safely. */
#define UTF8_INVALID2(p)                                                       \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
113
114
/* Non-zero when the three bytes at p do not form an acceptable 3-byte UTF-8
   sequence.  Rejected are: a third or second byte that is not a
   continuation byte, EF BF BE / EF BF BF (U+FFFE / U+FFFF), E0 followed by
   a byte below 0xA0 (overlong), and ED followed by a byte above 0x9F
   (surrogate range).  p must point to unsigned char.  Every use of the
   argument is parenthesised so that expressions such as `buf + 1` can be
   passed safely. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((*(p)) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((*(p)) == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
122
123
/* Non-zero when the four bytes at p do not form an acceptable 4-byte UTF-8
   sequence.  Rejected are: a fourth, third or second byte that is not a
   continuation byte, F0 followed by a byte below 0x90 (overlong), and F4
   followed by a byte above 0x8F (beyond U+10FFFF).  p must point to
   unsigned char.  Every use of the argument is parenthesised so that
   expressions such as `buf + 1` can be passed safely. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((*(p)) == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
130
131
static int PTRFASTCALL
132
101
isNever(const ENCODING *enc, const char *p) {
133
101
  UNUSED_P(enc);
134
101
  UNUSED_P(p);
135
101
  return 0;
136
101
}
137
138
static int PTRFASTCALL
139
8.27M
utf8_isName2(const ENCODING *enc, const char *p) {
140
8.27M
  UNUSED_P(enc);
141
8.27M
  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
142
8.27M
}
143
144
static int PTRFASTCALL
145
3.23k
utf8_isName3(const ENCODING *enc, const char *p) {
146
3.23k
  UNUSED_P(enc);
147
3.23k
  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
148
3.23k
}
149
150
#define utf8_isName4 isNever
151
152
static int PTRFASTCALL
153
13.8k
utf8_isNmstrt2(const ENCODING *enc, const char *p) {
154
13.8k
  UNUSED_P(enc);
155
13.8k
  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
156
13.8k
}
157
158
static int PTRFASTCALL
159
8.26k
utf8_isNmstrt3(const ENCODING *enc, const char *p) {
160
8.26k
  UNUSED_P(enc);
161
8.26k
  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
162
8.26k
}
163
164
#define utf8_isNmstrt4 isNever
165
166
static int PTRFASTCALL
167
51.1M
utf8_isInvalid2(const ENCODING *enc, const char *p) {
168
51.1M
  UNUSED_P(enc);
169
51.1M
  return UTF8_INVALID2((const unsigned char *)p);
170
51.1M
}
171
172
static int PTRFASTCALL
173
34.3k
utf8_isInvalid3(const ENCODING *enc, const char *p) {
174
34.3k
  UNUSED_P(enc);
175
34.3k
  return UTF8_INVALID3((const unsigned char *)p);
176
34.3k
}
177
178
static int PTRFASTCALL
179
41.6k
utf8_isInvalid4(const ENCODING *enc, const char *p) {
180
41.6k
  UNUSED_P(enc);
181
41.6k
  return UTF8_INVALID4((const unsigned char *)p);
182
41.6k
}
183
184
struct normal_encoding {
185
  ENCODING enc;
186
  unsigned char type[256];
187
#ifdef XML_MIN_SIZE
188
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
189
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
190
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
191
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
192
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
193
#endif /* XML_MIN_SIZE */
194
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
195
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
196
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
197
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
198
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
199
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
200
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
201
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
202
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
203
};
204
205
59.5M
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
206
207
#ifdef XML_MIN_SIZE
208
209
#  define STANDARD_VTABLE(E)                                                   \
210
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
211
212
#else
213
214
#  define STANDARD_VTABLE(E) /* as nothing */
215
216
#endif
217
218
#define NORMAL_VTABLE(E)                                                       \
219
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
220
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
221
222
#define NULL_VTABLE                                                            \
223
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
224
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
225
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
226
227
static int FASTCALL checkCharRefNumber(int);
228
229
#include "xmltok_impl.h"
230
#include "ascii.h"
231
232
#ifdef XML_MIN_SIZE
233
#  define sb_isNameMin isNever
234
#  define sb_isNmstrtMin isNever
235
#endif
236
237
#ifdef XML_MIN_SIZE
238
#  define MINBPC(enc) ((enc)->minBytesPerChar)
239
#else
240
/* minimum bytes per character */
241
5.79G
#  define MINBPC(enc) 1
242
#endif
243
244
/* Looks up the byte type of the single byte at p in the encoding's type
   table.  The table is only read, so the lookup goes through the
   const-preserving AS_NORMAL_ENCODING() helper instead of casting the
   const qualifier away. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)])
246
247
#ifdef XML_MIN_SIZE
248
static int PTRFASTCALL
249
sb_byteType(const ENCODING *enc, const char *p) {
250
  return SB_BYTE_TYPE(enc, p);
251
}
252
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
253
#else
254
5.19G
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
255
#endif
256
257
#ifdef XML_MIN_SIZE
258
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
259
static int PTRFASTCALL
260
sb_byteToAscii(const ENCODING *enc, const char *p) {
261
  UNUSED_P(enc);
262
  return *p;
263
}
264
#else
265
2.70M
#  define BYTE_TO_ASCII(enc, p) (*(p))
266
#endif
267
268
8.28M
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
269
22.1k
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
270
#ifdef XML_MIN_SIZE
271
#  define IS_INVALID_CHAR(enc, p, n)                                           \
272
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
273
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
274
#else
275
#  define IS_INVALID_CHAR(enc, p, n)                                           \
276
59.5M
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
277
#endif
278
279
#ifdef XML_MIN_SIZE
280
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
281
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
282
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
283
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
284
#else
285
0
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
286
0
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
287
#endif
288
289
#ifdef XML_MIN_SIZE
290
#  define CHAR_MATCHES(enc, p, c)                                              \
291
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
292
static int PTRCALL
293
sb_charMatches(const ENCODING *enc, const char *p, int c) {
294
  UNUSED_P(enc);
295
  return *p == c;
296
}
297
#else
298
/* c is an ASCII character */
299
5.47M
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
300
#endif
301
302
14.5M
#define PREFIX(ident) normal_##ident
303
#define XML_TOK_IMPL_C
304
#include "xmltok_impl.c"
305
#undef XML_TOK_IMPL_C
306
307
#undef MINBPC
308
#undef BYTE_TYPE
309
#undef BYTE_TO_ASCII
310
#undef CHAR_MATCHES
311
#undef IS_NAME_CHAR
312
#undef IS_NAME_CHAR_MINBPC
313
#undef IS_NMSTRT_CHAR
314
#undef IS_NMSTRT_CHAR_MINBPC
315
#undef IS_INVALID_CHAR
316
317
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
318
       UTF8_cval1 = 0x00,
319
       UTF8_cval2 = 0xc0,
320
       UTF8_cval3 = 0xe0,
321
       UTF8_cval4 = 0xf0
322
};
323
324
/* Moves *fromLimRef backwards, if necessary, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned backwards from its end.  An ASCII byte stops the
   scan immediately.  A lead byte stops the scan when enough bytes follow
   it to complete its character, in which case the limit is placed just
   after that character.  A lead byte without enough following bytes is
   dropped together with everything after it, and scanning continues.
   If the scan reaches `from`, the limit ends up equal to `from`. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t trailing = 0; /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char byte = (unsigned char)end[-1];
    size_t seqLen = 0; /* length announced by a lead byte; 0 otherwise */

    if ((byte & 0x80u) == 0x00u) {
      break; /* 1-byte character, matching 0b0xxxxxxx */
    }

    if ((byte & 0xf8u) == 0xf0u) {
      seqLen = 4; /* lead byte 0b11110xxx */
    } else if ((byte & 0xf0u) == 0xe0u) {
      seqLen = 3; /* lead byte 0b1110xxxx */
    } else if ((byte & 0xe0u) == 0xc0u) {
      seqLen = 2; /* lead byte 0b110xxxxx */
    }

    if (seqLen != 0) {
      if (trailing + 1 >= seqLen) {
        /* The character is complete: keep all of its bytes. */
        end += seqLen - 1;
        break;
      }
      /* Truncated character: drop it and restart the count. */
      trailing = 0;
    }

    end--;
    trailing++;
  }

  *fromLimRef = end;
}
362
363
static enum XML_Convert_Result PTRCALL
364
utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
365
19.1M
            char **toP, const char *toLim) {
366
19.1M
  bool input_incomplete = false;
367
19.1M
  bool output_exhausted = false;
368
369
  /* Avoid copying partial characters (due to limited space). */
370
19.1M
  const ptrdiff_t bytesAvailable = fromLim - *fromP;
371
19.1M
  const ptrdiff_t bytesStorable = toLim - *toP;
372
19.1M
  UNUSED_P(enc);
373
19.1M
  if (bytesAvailable > bytesStorable) {
374
82.1k
    fromLim = *fromP + bytesStorable;
375
82.1k
    output_exhausted = true;
376
82.1k
  }
377
378
  /* Avoid copying partial characters (from incomplete input). */
379
19.1M
  {
380
19.1M
    const char *const fromLimBefore = fromLim;
381
19.1M
    _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
382
19.1M
    if (fromLim < fromLimBefore) {
383
4.49k
      input_incomplete = true;
384
4.49k
    }
385
19.1M
  }
386
387
19.1M
  {
388
19.1M
    const ptrdiff_t bytesToCopy = fromLim - *fromP;
389
19.1M
    memcpy(*toP, *fromP, bytesToCopy);
390
19.1M
    *fromP += bytesToCopy;
391
19.1M
    *toP += bytesToCopy;
392
19.1M
  }
393
394
19.1M
  if (output_exhausted) /* needs to go first */
395
82.1k
    return XML_CONVERT_OUTPUT_EXHAUSTED;
396
19.0M
  else if (input_incomplete)
397
0
    return XML_CONVERT_INPUT_INCOMPLETE;
398
19.0M
  else
399
19.0M
    return XML_CONVERT_COMPLETED;
400
19.1M
}
401
402
static enum XML_Convert_Result PTRCALL
403
utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
404
0
             unsigned short **toP, const unsigned short *toLim) {
405
0
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
406
0
  unsigned short *to = *toP;
407
0
  const char *from = *fromP;
408
0
  while (from < fromLim && to < toLim) {
409
0
    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
410
0
    case BT_LEAD2:
411
0
      if (fromLim - from < 2) {
412
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
413
0
        goto after;
414
0
      }
415
0
      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
416
0
      from += 2;
417
0
      break;
418
0
    case BT_LEAD3:
419
0
      if (fromLim - from < 3) {
420
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
421
0
        goto after;
422
0
      }
423
0
      *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
424
0
                               | (from[2] & 0x3f));
425
0
      from += 3;
426
0
      break;
427
0
    case BT_LEAD4: {
428
0
      unsigned long n;
429
0
      if (toLim - to < 2) {
430
0
        res = XML_CONVERT_OUTPUT_EXHAUSTED;
431
0
        goto after;
432
0
      }
433
0
      if (fromLim - from < 4) {
434
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
435
0
        goto after;
436
0
      }
437
0
      n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
438
0
          | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
439
0
      n -= 0x10000;
440
0
      to[0] = (unsigned short)((n >> 10) | 0xD800);
441
0
      to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
442
0
      to += 2;
443
0
      from += 4;
444
0
    } break;
445
0
    default:
446
0
      *to++ = *from++;
447
0
      break;
448
0
    }
449
0
  }
450
0
  if (from < fromLim)
451
0
    res = XML_CONVERT_OUTPUT_EXHAUSTED;
452
0
after:
453
0
  *fromP = from;
454
0
  *toP = to;
455
0
  return res;
456
0
}
457
458
#ifdef XML_NS
459
static const struct normal_encoding utf8_encoding_ns
460
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
461
       {
462
#  include "asciitab.h"
463
#  include "utf8tab.h"
464
       },
465
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
466
#endif
467
468
static const struct normal_encoding utf8_encoding
469
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
470
       {
471
#define BT_COLON BT_NMSTRT
472
#include "asciitab.h"
473
#undef BT_COLON
474
#include "utf8tab.h"
475
       },
476
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
477
478
#ifdef XML_NS
479
480
static const struct normal_encoding internal_utf8_encoding_ns
481
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
482
       {
483
#  include "iasciitab.h"
484
#  include "utf8tab.h"
485
       },
486
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
487
488
#endif
489
490
static const struct normal_encoding internal_utf8_encoding
491
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
492
       {
493
#define BT_COLON BT_NMSTRT
494
#include "iasciitab.h"
495
#undef BT_COLON
496
#include "utf8tab.h"
497
       },
498
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
499
500
static enum XML_Convert_Result PTRCALL
501
latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
502
885k
              char **toP, const char *toLim) {
503
885k
  UNUSED_P(enc);
504
68.0M
  for (;;) {
505
68.0M
    unsigned char c;
506
68.0M
    if (*fromP == fromLim)
507
870k
      return XML_CONVERT_COMPLETED;
508
67.1M
    c = (unsigned char)**fromP;
509
67.1M
    if (c & 0x80) {
510
65.0M
      if (toLim - *toP < 2)
511
13.9k
        return XML_CONVERT_OUTPUT_EXHAUSTED;
512
65.0M
      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
513
65.0M
      *(*toP)++ = (char)((c & 0x3f) | 0x80);
514
65.0M
      (*fromP)++;
515
65.0M
    } else {
516
2.12M
      if (*toP == toLim)
517
955
        return XML_CONVERT_OUTPUT_EXHAUSTED;
518
2.12M
      *(*toP)++ = *(*fromP)++;
519
2.12M
    }
520
67.1M
  }
521
885k
}
522
523
static enum XML_Convert_Result PTRCALL
524
latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
525
0
               unsigned short **toP, const unsigned short *toLim) {
526
0
  UNUSED_P(enc);
527
0
  while (*fromP < fromLim && *toP < toLim)
528
0
    *(*toP)++ = (unsigned char)*(*fromP)++;
529
530
0
  if ((*toP == toLim) && (*fromP < fromLim))
531
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
532
0
  else
533
0
    return XML_CONVERT_COMPLETED;
534
0
}
535
536
#ifdef XML_NS
537
538
static const struct normal_encoding latin1_encoding_ns
539
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
540
       {
541
#  include "asciitab.h"
542
#  include "latin1tab.h"
543
       },
544
       STANDARD_VTABLE(sb_) NULL_VTABLE};
545
546
#endif
547
548
static const struct normal_encoding latin1_encoding
549
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
550
       {
551
#define BT_COLON BT_NMSTRT
552
#include "asciitab.h"
553
#undef BT_COLON
554
#include "latin1tab.h"
555
       },
556
       STANDARD_VTABLE(sb_) NULL_VTABLE};
557
558
static enum XML_Convert_Result PTRCALL
559
ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
560
1.78k
             char **toP, const char *toLim) {
561
1.78k
  UNUSED_P(enc);
562
38.0k
  while (*fromP < fromLim && *toP < toLim)
563
36.2k
    *(*toP)++ = *(*fromP)++;
564
565
1.78k
  if ((*toP == toLim) && (*fromP < fromLim))
566
230
    return XML_CONVERT_OUTPUT_EXHAUSTED;
567
1.55k
  else
568
1.55k
    return XML_CONVERT_COMPLETED;
569
1.78k
}
570
571
#ifdef XML_NS
572
573
static const struct normal_encoding ascii_encoding_ns
574
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
575
       {
576
#  include "asciitab.h"
577
           /* BT_NONXML == 0 */
578
       },
579
       STANDARD_VTABLE(sb_) NULL_VTABLE};
580
581
#endif
582
583
static const struct normal_encoding ascii_encoding
584
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
585
       {
586
#define BT_COLON BT_NMSTRT
587
#include "asciitab.h"
588
#undef BT_COLON
589
           /* BT_NONXML == 0 */
590
       },
591
       STANDARD_VTABLE(sb_) NULL_VTABLE};
592
593
static int PTRFASTCALL
594
4.61k
unicode_byte_type(char hi, char lo) {
595
4.61k
  switch ((unsigned char)hi) {
596
  /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
597
18
  case 0xD8:
598
43
  case 0xD9:
599
61
  case 0xDA:
600
89
  case 0xDB:
601
89
    return BT_LEAD4;
602
  /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
603
5
  case 0xDC:
604
9
  case 0xDD:
605
14
  case 0xDE:
606
19
  case 0xDF:
607
19
    return BT_TRAIL;
608
53
  case 0xFF:
609
53
    switch ((unsigned char)lo) {
610
9
    case 0xFF: /* noncharacter-FFFF */
611
9
    case 0xFE: /* noncharacter-FFFE */
612
9
      return BT_NONXML;
613
53
    }
614
44
    break;
615
4.61k
  }
616
4.50k
  return BT_NONASCII;
617
4.61k
}
618
619
/* Defines E##toUtf8, which converts 16-bit code units from
   [*fromP, fromLim) to UTF-8 written to [*toP, toLim).  GET_LO and GET_HI
   must be defined at the point of instantiation; they select the low and
   high byte of a code unit and thereby fix the byte order.

   Only whole code units are read.  Each unit is encoded according to its
   high byte: one byte for values below 0x80, two bytes for high bytes up
   to 0x07, four bytes for a high surrogate (0xD8-0xDB) combined with the
   unit that follows it, and three bytes for everything else.

   The generated function returns XML_CONVERT_OUTPUT_EXHAUSTED when the
   next character does not fit in the output and
   XML_CONVERT_INPUT_INCOMPLETE when a high surrogate is the last available
   unit; in both cases *fromP is left at the unit that could not be
   converted.  Otherwise it returns XML_CONVERT_COMPLETED. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *src = *fromP;                                                  \
    UNUSED_P(enc);                                                             \
    fromLim = src + (((fromLim - src) >> 1) << 1); /* shrink to even */        \
    for (; src < fromLim; src += 2) {                                          \
      const unsigned char low = GET_LO(src);                                   \
      const unsigned char high = GET_HI(src);                                  \
      if (high == 0 && low < 0x80) {                                           \
        /* one byte */                                                         \
        if (*toP == toLim) {                                                   \
          *fromP = src;                                                        \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = low;                                                       \
      } else if (high < 0x8) {                                                 \
        /* two bytes: 11 bits divided 5, 6 */                                  \
        if (toLim - *toP < 2) {                                                \
          *fromP = src;                                                        \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((low >> 6) | (high << 2) | UTF8_cval2);                   \
        *(*toP)++ = ((low & 0x3f) | 0x80);                                     \
      } else if (high >= 0xD8 && high <= 0xDB) {                               \
        /* four bytes: high surrogate plus the following unit */               \
        int plane;                                                             \
        unsigned char low2;                                                    \
        if (toLim - *toP < 4) {                                                \
          *fromP = src;                                                        \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - src < 4) {                                               \
          *fromP = src;                                                        \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((high & 0x3) << 2) | ((low >> 6) & 0x3)) + 1;                \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((low >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);        \
        src += 2;                                                              \
        low2 = GET_LO(src);                                                    \
        *(*toP)++ = (((low & 0x3) << 4) | ((GET_HI(src) & 0x3) << 2)           \
                     | (low2 >> 6) | 0x80);                                    \
        *(*toP)++ = ((low2 & 0x3f) | 0x80);                                    \
      } else {                                                                 \
        /* three bytes: 16 bits divided 4, 6, 6 */                             \
        if (toLim - *toP < 3) {                                                \
          *fromP = src;                                                        \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((high >> 4) | UTF8_cval3);                                \
        *(*toP)++ = (((high & 0xf) << 2) | (low >> 6) | 0x80);                 \
        *(*toP)++ = ((low & 0x3f) | 0x80);                                     \
      }                                                                        \
    }                                                                          \
    *fromP = src;                                                              \
    if (src < fromLim)                                                         \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    return XML_CONVERT_COMPLETED;                                              \
  }
Unexecuted instantiation: xmltok.c:little2_toUtf8
Unexecuted instantiation: xmltok.c:big2_toUtf8
695
696
/* Defines E##toUtf16, which copies 16-bit code units from
   [*fromP, fromLim) into [*toP, toLim), assembling each unit from the
   bytes selected by GET_HI and GET_LO (which must be defined at the point
   of instantiation).

   Only whole code units are read.  When the input is larger than the
   output and the last input unit has a high byte in 0xD8-0xDF, that unit
   is held back and the result becomes XML_CONVERT_INPUT_INCOMPLETE.
   XML_CONVERT_OUTPUT_EXHAUSTED is returned when the output is full while
   input remains. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result status = XML_CONVERT_COMPLETED;                    \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      status = XML_CONVERT_INPUT_INCOMPLETE;                                   \
    }                                                                          \
    while (*fromP < fromLim && *toP < toLim) {                                 \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
      *fromP += 2;                                                             \
    }                                                                          \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    return status;                                                             \
  }
Unexecuted instantiation: xmltok.c:little2_toUtf16
Unexecuted instantiation: xmltok.c:big2_toUtf16
716
717
/* Little-endian accessors for a two-byte unit: byte 0 is the low byte and
   byte 1 the high byte.  They are only in effect while the little2_
   converters below are instantiated, then undefined again. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
718
0
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
719
0
#define GET_HI(ptr) ((unsigned char)(ptr)[1])
720
721
/* Emits little2_toUtf8 and little2_toUtf16. */
DEFINE_UTF16_TO_UTF8(little2_)
722
DEFINE_UTF16_TO_UTF16(little2_)
723
724
#undef SET2
725
#undef GET_LO
726
#undef GET_HI
727
728
/* Big-endian accessors: byte 0 is the high byte and byte 1 the low byte. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
729
0
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
730
0
#define GET_HI(ptr) ((unsigned char)(ptr)[0])
731
732
/* Emits big2_toUtf8 and big2_toUtf16. */
DEFINE_UTF16_TO_UTF8(big2_)
733
DEFINE_UTF16_TO_UTF16(big2_)
734
735
#undef SET2
736
#undef GET_LO
737
#undef GET_HI
738
739
/* Character classification helpers for little-endian two-byte input, where
   p[0] is the low byte and p[1] the high byte of the unit.  Every macro
   argument is parenthesised so that expression arguments expand safely. */

/* Byte type of the unit at p: looked up in the encoding's type table when
   the high byte is zero, otherwise computed by unicode_byte_type(). */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]  \
               : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* Non-zero when the high byte is zero and the low byte equals c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Name-character test via the namePages table (high byte, low byte). */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
/* Name-start-character test via the nmstrtPages table. */
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
748
749
/* With XML_MIN_SIZE defined, the little-endian two-byte character tests are
   emitted as functions and VTABLE is redefined to end in the little2_
   converters.  Otherwise the same tests are supplied as macros and
   xmltok_impl.c is included with PREFIX set to little2_. */
#ifdef XML_MIN_SIZE
750
751
/* Function form of LITTLE2_BYTE_TYPE. */
static int PTRFASTCALL
752
little2_byteType(const ENCODING *enc, const char *p) {
753
  return LITTLE2_BYTE_TYPE(enc, p);
754
}
755
756
/* Function form of LITTLE2_BYTE_TO_ASCII; enc is unused. */
static int PTRFASTCALL
757
little2_byteToAscii(const ENCODING *enc, const char *p) {
758
  UNUSED_P(enc);
759
  return LITTLE2_BYTE_TO_ASCII(p);
760
}
761
762
/* Function form of LITTLE2_CHAR_MATCHES; enc is unused. */
static int PTRCALL
763
little2_charMatches(const ENCODING *enc, const char *p, int c) {
764
  UNUSED_P(enc);
765
  return LITTLE2_CHAR_MATCHES(p, c);
766
}
767
768
/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC; enc is unused. */
static int PTRFASTCALL
769
little2_isNameMin(const ENCODING *enc, const char *p) {
770
  UNUSED_P(enc);
771
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
772
}
773
774
/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC; enc is unused. */
static int PTRFASTCALL
775
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
776
  UNUSED_P(enc);
777
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
778
}
779
780
#  undef VTABLE
781
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
782
783
#else /* not XML_MIN_SIZE */
784
785
/* Parameters consumed by xmltok_impl.c for this instantiation.  The
   multi-byte IS_NAME_CHAR / IS_NMSTRT_CHAR tests are constant 0 here; only
   the *_MINBPC forms do real work. */
#  undef PREFIX
786
0
#  define PREFIX(ident) little2_##ident
787
4.50k
#  define MINBPC(enc) 2
788
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
789
2.10k
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
790
0
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
791
0
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
792
17
#  define IS_NAME_CHAR(enc, p, n) 0
793
1.08k
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
794
17
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
795
96
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
796
797
#  define XML_TOK_IMPL_C
798
#  include "xmltok_impl.c"
799
#  undef XML_TOK_IMPL_C
800
801
/* Remove the per-instantiation parameters again. */
#  undef MINBPC
802
#  undef BYTE_TYPE
803
#  undef BYTE_TO_ASCII
804
#  undef CHAR_MATCHES
805
#  undef IS_NAME_CHAR
806
#  undef IS_NAME_CHAR_MINBPC
807
#  undef IS_NMSTRT_CHAR
808
#  undef IS_NMSTRT_CHAR_MINBPC
809
#  undef IS_INVALID_CHAR
810
811
#endif /* not XML_MIN_SIZE */
812
813
/* Encoding tables for little-endian two-byte input.  In each table the last
   value of the inner initializer is 1 when BYTEORDER is 1234 and 0 otherwise
   (presumably the isUtf16 flag, i.e. input already in host order -- TODO
   confirm against the ENCODING declaration).  The *_ns variants exist only
   when XML_NS is defined and include asciitab.h unmodified. */
#ifdef XML_NS
814
815
static const struct normal_encoding little2_encoding_ns
816
    = {{VTABLE, 2, 0,
817
#  if BYTEORDER == 1234
818
        1
819
#  else
820
        0
821
#  endif
822
       },
823
       {
824
#  include "asciitab.h"
825
#  include "latin1tab.h"
826
       },
827
       STANDARD_VTABLE(little2_) NULL_VTABLE};
828
829
#endif
830
831
/* Non-namespace variant: BT_COLON is defined as BT_NMSTRT while asciitab.h
   is included, so any BT_COLON entry in that table becomes BT_NMSTRT. */
static const struct normal_encoding little2_encoding
832
    = {{VTABLE, 2, 0,
833
#if BYTEORDER == 1234
834
        1
835
#else
836
        0
837
#endif
838
       },
839
       {
840
#define BT_COLON BT_NMSTRT
841
#include "asciitab.h"
842
#undef BT_COLON
843
#include "latin1tab.h"
844
       },
845
       STANDARD_VTABLE(little2_) NULL_VTABLE};
846
847
/* Internal little-endian tables, built from iasciitab.h instead of
   asciitab.h and compiled unless BYTEORDER is 4321. */
#if BYTEORDER != 4321
848
849
#  ifdef XML_NS
850
851
static const struct normal_encoding internal_little2_encoding_ns
852
    = {{VTABLE, 2, 0, 1},
853
       {
854
#    include "iasciitab.h"
855
#    include "latin1tab.h"
856
       },
857
       STANDARD_VTABLE(little2_) NULL_VTABLE};
858
859
#  endif
860
861
static const struct normal_encoding internal_little2_encoding
862
    = {{VTABLE, 2, 0, 1},
863
       {
864
#  define BT_COLON BT_NMSTRT
865
#  include "iasciitab.h"
866
#  undef BT_COLON
867
#  include "latin1tab.h"
868
       },
869
       STANDARD_VTABLE(little2_) NULL_VTABLE};
870
871
#endif
872
873
/* Character classification helpers for big-endian two-byte input, where
   p[0] is the high byte and p[1] the low byte of the unit.  Every macro
   argument is parenthesised so that expression arguments expand safely. */

/* Byte type of the unit at p: looked up in the encoding's type table when
   the high byte is zero, otherwise computed by unicode_byte_type(). */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0                                                                 \
       ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]]        \
       : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the high byte is zero and the low byte equals c. */
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Name-character test via the namePages table (high byte, low byte). */
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
/* Name-start-character test via the nmstrtPages table. */
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
883
884
/* With XML_MIN_SIZE defined, the big-endian two-byte character tests are
   emitted as functions and VTABLE is redefined to end in the big2_
   converters.  Otherwise the same tests are supplied as macros and
   xmltok_impl.c is included with PREFIX set to big2_. */
#ifdef XML_MIN_SIZE
885
886
/* Function form of BIG2_BYTE_TYPE. */
static int PTRFASTCALL
887
big2_byteType(const ENCODING *enc, const char *p) {
888
  return BIG2_BYTE_TYPE(enc, p);
889
}
890
891
/* Function form of BIG2_BYTE_TO_ASCII; enc is unused. */
static int PTRFASTCALL
892
big2_byteToAscii(const ENCODING *enc, const char *p) {
893
  UNUSED_P(enc);
894
  return BIG2_BYTE_TO_ASCII(p);
895
}
896
897
/* Function form of BIG2_CHAR_MATCHES; enc is unused. */
static int PTRCALL
898
big2_charMatches(const ENCODING *enc, const char *p, int c) {
899
  UNUSED_P(enc);
900
  return BIG2_CHAR_MATCHES(p, c);
901
}
902
903
/* Function form of BIG2_IS_NAME_CHAR_MINBPC; enc is unused. */
static int PTRFASTCALL
904
big2_isNameMin(const ENCODING *enc, const char *p) {
905
  UNUSED_P(enc);
906
  return BIG2_IS_NAME_CHAR_MINBPC(p);
907
}
908
909
/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC; enc is unused. */
static int PTRFASTCALL
910
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
911
  UNUSED_P(enc);
912
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
913
}
914
915
#  undef VTABLE
916
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
917
918
#else /* not XML_MIN_SIZE */
919
920
/* Parameters consumed by xmltok_impl.c for this instantiation.  The
   multi-byte IS_NAME_CHAR / IS_NMSTRT_CHAR tests are constant 0 here; only
   the *_MINBPC forms do real work. */
#  undef PREFIX
921
0
#  define PREFIX(ident) big2_##ident
922
5.30k
#  define MINBPC(enc) 2
923
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
924
2.51k
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
925
0
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
926
0
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
927
4
#  define IS_NAME_CHAR(enc, p, n) 0
928
1.27k
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
929
4
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
930
90
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
931
932
#  define XML_TOK_IMPL_C
933
#  include "xmltok_impl.c"
934
#  undef XML_TOK_IMPL_C
935
936
/* Remove the per-instantiation parameters again. */
#  undef MINBPC
937
#  undef BYTE_TYPE
938
#  undef BYTE_TO_ASCII
939
#  undef CHAR_MATCHES
940
#  undef IS_NAME_CHAR
941
#  undef IS_NAME_CHAR_MINBPC
942
#  undef IS_NMSTRT_CHAR
943
#  undef IS_NMSTRT_CHAR_MINBPC
944
#  undef IS_INVALID_CHAR
945
946
#endif /* not XML_MIN_SIZE */
947
948
/* Encoding tables for big-endian two-byte input.  In each table the last
   value of the inner initializer is 1 when BYTEORDER is 4321 and 0 otherwise
   (presumably the isUtf16 flag, i.e. input already in host order -- TODO
   confirm against the ENCODING declaration).  The *_ns variants exist only
   when XML_NS is defined and include asciitab.h unmodified. */
#ifdef XML_NS
949
950
static const struct normal_encoding big2_encoding_ns
951
    = {{VTABLE, 2, 0,
952
#  if BYTEORDER == 4321
953
        1
954
#  else
955
        0
956
#  endif
957
       },
958
       {
959
#  include "asciitab.h"
960
#  include "latin1tab.h"
961
       },
962
       STANDARD_VTABLE(big2_) NULL_VTABLE};
963
964
#endif
965
966
/* Non-namespace variant: BT_COLON is defined as BT_NMSTRT while asciitab.h
   is included, so any BT_COLON entry in that table becomes BT_NMSTRT. */
static const struct normal_encoding big2_encoding
967
    = {{VTABLE, 2, 0,
968
#if BYTEORDER == 4321
969
        1
970
#else
971
        0
972
#endif
973
       },
974
       {
975
#define BT_COLON BT_NMSTRT
976
#include "asciitab.h"
977
#undef BT_COLON
978
#include "latin1tab.h"
979
       },
980
       STANDARD_VTABLE(big2_) NULL_VTABLE};
981
982
/* Internal big-endian tables, built from iasciitab.h instead of asciitab.h
   and compiled unless BYTEORDER is 1234. */
#if BYTEORDER != 1234
983
984
#  ifdef XML_NS
985
986
static const struct normal_encoding internal_big2_encoding_ns
987
    = {{VTABLE, 2, 0, 1},
988
       {
989
#    include "iasciitab.h"
990
#    include "latin1tab.h"
991
       },
992
       STANDARD_VTABLE(big2_) NULL_VTABLE};
993
994
#  endif
995
996
static const struct normal_encoding internal_big2_encoding
997
    = {{VTABLE, 2, 0, 1},
998
       {
999
#  define BT_COLON BT_NMSTRT
1000
#  include "iasciitab.h"
1001
#  undef BT_COLON
1002
#  include "latin1tab.h"
1003
       },
1004
       STANDARD_VTABLE(big2_) NULL_VTABLE};
1005
1006
#endif
1007
1008
#undef PREFIX
1009
1010
static int FASTCALL
1011
2.78k
streqci(const char *s1, const char *s2) {
1012
15.3k
  for (;;) {
1013
15.3k
    char c1 = *s1++;
1014
15.3k
    char c2 = *s2++;
1015
15.3k
    if (ASCII_a <= c1 && c1 <= ASCII_z)
1016
522
      c1 += ASCII_A - ASCII_a;
1017
15.3k
    if (ASCII_a <= c2 && c2 <= ASCII_z)
1018
      /* The following line will never get executed.  streqci() is
1019
       * only called from two places, both of which guarantee to put
1020
       * upper-case strings into s2.
1021
       */
1022
0
      c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1023
15.3k
    if (c1 != c2)
1024
1.54k
      return 0;
1025
13.8k
    if (! c1)
1026
1.24k
      break;
1027
13.8k
  }
1028
1.24k
  return 1;
1029
2.78k
}
1030
1031
/* Position updater that ignores enc and always delegates to
   normal_updatePosition() with the UTF-8 encoding. */
static void PTRCALL
1032
initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1033
24
                   POSITION *pos) {
1034
24
  UNUSED_P(enc);
1035
24
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1036
24
}
1037
1038
static int
1039
71.9k
toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1040
71.9k
  char buf[1];
1041
71.9k
  char *p = buf;
1042
71.9k
  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1043
71.9k
  if (p == buf)
1044
200
    return -1;
1045
71.7k
  else
1046
71.7k
    return buf[0];
1047
71.9k
}
1048
1049
static int FASTCALL
1050
33.5k
isSpace(int c) {
1051
33.5k
  switch (c) {
1052
3.72k
  case 0x20:
1053
4.59k
  case 0xD:
1054
5.61k
  case 0xA:
1055
6.57k
  case 0x9:
1056
6.57k
    return 1;
1057
33.5k
  }
1058
26.9k
  return 0;
1059
33.5k
}
1060
1061
/* Return 1 if there's just optional white space or there's an S
1062
   followed by name=val.
1063
*/
1064
static int
1065
parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1066
                     const char **namePtr, const char **nameEndPtr,
1067
4.21k
                     const char **valPtr, const char **nextTokPtr) {
1068
4.21k
  int c;
1069
4.21k
  char open;
1070
4.21k
  if (ptr == end) {
1071
1.28k
    *namePtr = NULL;
1072
1.28k
    return 1;
1073
1.28k
  }
1074
2.93k
  if (! isSpace(toAscii(enc, ptr, end))) {
1075
3
    *nextTokPtr = ptr;
1076
3
    return 0;
1077
3
  }
1078
3.74k
  do {
1079
3.74k
    ptr += enc->minBytesPerChar;
1080
3.74k
  } while (isSpace(toAscii(enc, ptr, end)));
1081
2.93k
  if (ptr == end) {
1082
24
    *namePtr = NULL;
1083
24
    return 1;
1084
24
  }
1085
2.90k
  *namePtr = ptr;
1086
23.9k
  for (;;) {
1087
23.9k
    c = toAscii(enc, ptr, end);
1088
23.9k
    if (c == -1) {
1089
17
      *nextTokPtr = ptr;
1090
17
      return 0;
1091
17
    }
1092
23.9k
    if (c == ASCII_EQUALS) {
1093
2.83k
      *nameEndPtr = ptr;
1094
2.83k
      break;
1095
2.83k
    }
1096
21.1k
    if (isSpace(c)) {
1097
61
      *nameEndPtr = ptr;
1098
1.00k
      do {
1099
1.00k
        ptr += enc->minBytesPerChar;
1100
1.00k
      } while (isSpace(c = toAscii(enc, ptr, end)));
1101
61
      if (c != ASCII_EQUALS) {
1102
49
        *nextTokPtr = ptr;
1103
49
        return 0;
1104
49
      }
1105
12
      break;
1106
61
    }
1107
21.0k
    ptr += enc->minBytesPerChar;
1108
21.0k
  }
1109
2.84k
  if (ptr == *namePtr) {
1110
1
    *nextTokPtr = ptr;
1111
1
    return 0;
1112
1
  }
1113
2.84k
  ptr += enc->minBytesPerChar;
1114
2.84k
  c = toAscii(enc, ptr, end);
1115
3.72k
  while (isSpace(c)) {
1116
888
    ptr += enc->minBytesPerChar;
1117
888
    c = toAscii(enc, ptr, end);
1118
888
  }
1119
2.84k
  if (c != ASCII_QUOT && c != ASCII_APOS) {
1120
43
    *nextTokPtr = ptr;
1121
43
    return 0;
1122
43
  }
1123
2.79k
  open = (char)c;
1124
2.79k
  ptr += enc->minBytesPerChar;
1125
2.79k
  *valPtr = ptr;
1126
34.3k
  for (;; ptr += enc->minBytesPerChar) {
1127
34.3k
    c = toAscii(enc, ptr, end);
1128
34.3k
    if (c == open)
1129
2.73k
      break;
1130
31.5k
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1131
31.5k
        && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1132
31.5k
        && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1133
64
      *nextTokPtr = ptr;
1134
64
      return 0;
1135
64
    }
1136
31.5k
  }
1137
2.73k
  *nextTokPtr = ptr + enc->minBytesPerChar;
1138
2.73k
  return 1;
1139
2.79k
}
1140
1141
/* Pseudo-attribute names and standalone values matched by doParseXmlDecl(),
   spelled with the ASCII_* constants and NUL-terminated. */
static const char KW_version[]
1142
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1143
1144
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1145
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};
1146
1147
static const char KW_standalone[]
1148
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1149
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1150
1151
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1152
1153
static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1154
1155
static int
1156
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1157
                                                 const char *),
1158
               int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1159
               const char *end, const char **badPtr, const char **versionPtr,
1160
               const char **versionEndPtr, const char **encodingName,
1161
1.56k
               const ENCODING **encoding, int *standalone) {
1162
1.56k
  const char *val = NULL;
1163
1.56k
  const char *name = NULL;
1164
1.56k
  const char *nameEnd = NULL;
1165
1.56k
  ptr += 5 * enc->minBytesPerChar;
1166
1.56k
  end -= 2 * enc->minBytesPerChar;
1167
1.56k
  if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1168
1.56k
      || ! name) {
1169
184
    *badPtr = ptr;
1170
184
    return 0;
1171
184
  }
1172
1.37k
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1173
5
    if (! isGeneralTextEntity) {
1174
5
      *badPtr = name;
1175
5
      return 0;
1176
5
    }
1177
1.37k
  } else {
1178
1.37k
    if (versionPtr)
1179
1.37k
      *versionPtr = val;
1180
1.37k
    if (versionEndPtr)
1181
1.37k
      *versionEndPtr = ptr;
1182
1.37k
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1183
12
      *badPtr = ptr;
1184
12
      return 0;
1185
12
    }
1186
1.36k
    if (! name) {
1187
5
      if (isGeneralTextEntity) {
1188
        /* a TextDecl must have an EncodingDecl */
1189
0
        *badPtr = ptr;
1190
0
        return 0;
1191
0
      }
1192
5
      return 1;
1193
5
    }
1194
1.36k
  }
1195
1.35k
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1196
1.28k
    int c = toAscii(enc, val, end);
1197
1.28k
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1198
5
      *badPtr = val;
1199
5
      return 0;
1200
5
    }
1201
1.28k
    if (encodingName)
1202
1.28k
      *encodingName = val;
1203
1.28k
    if (encoding)
1204
1.28k
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1205
1.28k
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1206
2
      *badPtr = ptr;
1207
2
      return 0;
1208
2
    }
1209
1.28k
    if (! name)
1210
1.28k
      return 1;
1211
1.28k
  }
1212
68
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1213
68
      || isGeneralTextEntity) {
1214
5
    *badPtr = name;
1215
5
    return 0;
1216
5
  }
1217
63
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1218
35
    if (standalone)
1219
35
      *standalone = 1;
1220
35
  } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1221
27
    if (standalone)
1222
27
      *standalone = 0;
1223
27
  } else {
1224
1
    *badPtr = val;
1225
1
    return 0;
1226
1
  }
1227
1.00k
  while (isSpace(toAscii(enc, ptr, end)))
1228
938
    ptr += enc->minBytesPerChar;
1229
62
  if (ptr != end) {
1230
16
    *badPtr = ptr;
1231
16
    return 0;
1232
16
  }
1233
46
  return 1;
1234
62
}
1235
1236
static int FASTCALL
1237
754k
checkCharRefNumber(int result) {
1238
754k
  switch (result >> 8) {
1239
1
  case 0xD8:
1240
2
  case 0xD9:
1241
2
  case 0xDA:
1242
4
  case 0xDB:
1243
5
  case 0xDC:
1244
8
  case 0xDD:
1245
10
  case 0xDE:
1246
11
  case 0xDF:
1247
11
    return -1;
1248
6.12k
  case 0:
1249
6.12k
    if (latin1_encoding.type[result] == BT_NONXML)
1250
33
      return -1;
1251
6.09k
    break;
1252
6.21k
  case 0xFF:
1253
6.21k
    if (result == 0xFFFE || result == 0xFFFF)
1254
2
      return -1;
1255
6.21k
    break;
1256
754k
  }
1257
754k
  return result;
1258
754k
}
1259
1260
int FASTCALL
1261
754k
XmlUtf8Encode(int c, char *buf) {
1262
754k
  enum {
1263
    /* minN is minimum legal resulting value for N byte sequence */
1264
754k
    min2 = 0x80,
1265
754k
    min3 = 0x800,
1266
754k
    min4 = 0x10000
1267
754k
  };
1268
1269
754k
  if (c < 0)
1270
0
    return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1271
754k
  if (c < min2) {
1272
4.35k
    buf[0] = (char)(c | UTF8_cval1);
1273
4.35k
    return 1;
1274
4.35k
  }
1275
750k
  if (c < min3) {
1276
566k
    buf[0] = (char)((c >> 6) | UTF8_cval2);
1277
566k
    buf[1] = (char)((c & 0x3f) | 0x80);
1278
566k
    return 2;
1279
566k
  }
1280
183k
  if (c < min4) {
1281
11.9k
    buf[0] = (char)((c >> 12) | UTF8_cval3);
1282
11.9k
    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1283
11.9k
    buf[2] = (char)((c & 0x3f) | 0x80);
1284
11.9k
    return 3;
1285
11.9k
  }
1286
171k
  if (c < 0x110000) {
1287
171k
    buf[0] = (char)((c >> 18) | UTF8_cval4);
1288
171k
    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1289
171k
    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1290
171k
    buf[3] = (char)((c & 0x3f) | 0x80);
1291
171k
    return 4;
1292
171k
  }
1293
0
  return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1294
171k
}
1295
1296
int FASTCALL
1297
0
XmlUtf16Encode(int charNum, unsigned short *buf) {
1298
0
  if (charNum < 0)
1299
0
    return 0;
1300
0
  if (charNum < 0x10000) {
1301
0
    buf[0] = (unsigned short)charNum;
1302
0
    return 1;
1303
0
  }
1304
0
  if (charNum < 0x110000) {
1305
0
    charNum -= 0x10000;
1306
0
    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1307
0
    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1308
0
    return 2;
1309
0
  }
1310
0
  return 0;
1311
0
}
1312
1313
/* Encoding object for a caller-described encoding, filled in by
   XmlInitUnknownEncoding(). */
struct unknown_encoding {
1314
  /* Starts as a copy of latin1_encoding; type[] is then rewritten per byte. */
  struct normal_encoding normal;
1315
  /* Callback used for bytes that start a multi-byte sequence; may be NULL. */
  CONVERTER convert;
1316
  /* Passed as the first argument of convert. */
  void *userData;
1317
  /* Per-byte UTF-16 unit; 0 means the byte must go through convert. */
  unsigned short utf16[256];
1318
  /* Per-byte UTF-8: [0] is the byte count, the bytes follow; a count of 0
     means the byte must go through convert. */
  char utf8[256][4];
1319
};
1320
1321
0
/* Reinterprets an ENCODING pointer as a const struct unknown_encoding *. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1322
1323
int
1324
0
XmlSizeOfUnknownEncoding(void) {
1325
0
  return sizeof(struct unknown_encoding);
1326
0
}
1327
1328
static int PTRFASTCALL
1329
0
unknown_isName(const ENCODING *enc, const char *p) {
1330
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1331
0
  int c = uenc->convert(uenc->userData, p);
1332
0
  if (c & ~0xFFFF)
1333
0
    return 0;
1334
0
  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1335
0
}
1336
1337
static int PTRFASTCALL
1338
0
unknown_isNmstrt(const ENCODING *enc, const char *p) {
1339
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1340
0
  int c = uenc->convert(uenc->userData, p);
1341
0
  if (c & ~0xFFFF)
1342
0
    return 0;
1343
0
  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1344
0
}
1345
1346
static int PTRFASTCALL
1347
0
unknown_isInvalid(const ENCODING *enc, const char *p) {
1348
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1349
0
  int c = uenc->convert(uenc->userData, p);
1350
0
  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1351
0
}
1352
1353
static enum XML_Convert_Result PTRCALL
1354
unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1355
0
               char **toP, const char *toLim) {
1356
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1357
0
  char buf[XML_UTF8_ENCODE_MAX];
1358
0
  for (;;) {
1359
0
    const char *utf8;
1360
0
    int n;
1361
0
    if (*fromP == fromLim)
1362
0
      return XML_CONVERT_COMPLETED;
1363
0
    utf8 = uenc->utf8[(unsigned char)**fromP];
1364
0
    n = *utf8++;
1365
0
    if (n == 0) {
1366
0
      int c = uenc->convert(uenc->userData, *fromP);
1367
0
      n = XmlUtf8Encode(c, buf);
1368
0
      if (n > toLim - *toP)
1369
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
1370
0
      utf8 = buf;
1371
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1372
0
                 - (BT_LEAD2 - 2));
1373
0
    } else {
1374
0
      if (n > toLim - *toP)
1375
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
1376
0
      (*fromP)++;
1377
0
    }
1378
0
    memcpy(*toP, utf8, n);
1379
0
    *toP += n;
1380
0
  }
1381
0
}
1382
1383
static enum XML_Convert_Result PTRCALL
1384
unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1385
0
                unsigned short **toP, const unsigned short *toLim) {
1386
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1387
0
  while (*fromP < fromLim && *toP < toLim) {
1388
0
    unsigned short c = uenc->utf16[(unsigned char)**fromP];
1389
0
    if (c == 0) {
1390
0
      c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1391
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1392
0
                 - (BT_LEAD2 - 2));
1393
0
    } else
1394
0
      (*fromP)++;
1395
0
    *(*toP)++ = c;
1396
0
  }
1397
1398
0
  if ((*toP == toLim) && (*fromP < fromLim))
1399
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
1400
0
  else
1401
0
    return XML_CONVERT_COMPLETED;
1402
0
}
1403
1404
ENCODING *
1405
XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1406
0
                       void *userData) {
1407
0
  int i;
1408
0
  struct unknown_encoding *e = (struct unknown_encoding *)mem;
1409
0
  memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1410
0
  for (i = 0; i < 128; i++)
1411
0
    if (latin1_encoding.type[i] != BT_OTHER
1412
0
        && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1413
0
      return 0;
1414
0
  for (i = 0; i < 256; i++) {
1415
0
    int c = table[i];
1416
0
    if (c == -1) {
1417
0
      e->normal.type[i] = BT_MALFORM;
1418
      /* This shouldn't really get used. */
1419
0
      e->utf16[i] = 0xFFFF;
1420
0
      e->utf8[i][0] = 1;
1421
0
      e->utf8[i][1] = 0;
1422
0
    } else if (c < 0) {
1423
0
      if (c < -4)
1424
0
        return 0;
1425
      /* Multi-byte sequences need a converter function */
1426
0
      if (! convert)
1427
0
        return 0;
1428
0
      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1429
0
      e->utf8[i][0] = 0;
1430
0
      e->utf16[i] = 0;
1431
0
    } else if (c < 0x80) {
1432
0
      if (latin1_encoding.type[c] != BT_OTHER
1433
0
          && latin1_encoding.type[c] != BT_NONXML && c != i)
1434
0
        return 0;
1435
0
      e->normal.type[i] = latin1_encoding.type[c];
1436
0
      e->utf8[i][0] = 1;
1437
0
      e->utf8[i][1] = (char)c;
1438
0
      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1439
0
    } else if (checkCharRefNumber(c) < 0) {
1440
0
      e->normal.type[i] = BT_NONXML;
1441
      /* This shouldn't really get used. */
1442
0
      e->utf16[i] = 0xFFFF;
1443
0
      e->utf8[i][0] = 1;
1444
0
      e->utf8[i][1] = 0;
1445
0
    } else {
1446
0
      if (c > 0xFFFF)
1447
0
        return 0;
1448
0
      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1449
0
        e->normal.type[i] = BT_NMSTRT;
1450
0
      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1451
0
        e->normal.type[i] = BT_NAME;
1452
0
      else
1453
0
        e->normal.type[i] = BT_OTHER;
1454
0
      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1455
0
      e->utf16[i] = (unsigned short)c;
1456
0
    }
1457
0
  }
1458
0
  e->userData = userData;
1459
0
  e->convert = convert;
1460
0
  if (convert) {
1461
0
    e->normal.isName2 = unknown_isName;
1462
0
    e->normal.isName3 = unknown_isName;
1463
0
    e->normal.isName4 = unknown_isName;
1464
0
    e->normal.isNmstrt2 = unknown_isNmstrt;
1465
0
    e->normal.isNmstrt3 = unknown_isNmstrt;
1466
0
    e->normal.isNmstrt4 = unknown_isNmstrt;
1467
0
    e->normal.isInvalid2 = unknown_isInvalid;
1468
0
    e->normal.isInvalid3 = unknown_isInvalid;
1469
0
    e->normal.isInvalid4 = unknown_isInvalid;
1470
0
  }
1471
0
  e->normal.enc.utf8Convert = unknown_toUtf8;
1472
0
  e->normal.enc.utf16Convert = unknown_toUtf16;
1473
0
  return &(e->normal.enc);
1474
0
}
1475
1476
/* If this enumeration is changed, getEncodingIndex and encodings
1477
must also be changed. */
1478
/* Encoding indices.  Values 0 up to NO_ENC - 1 are returned by
   getEncodingIndex() as positions in its encodingNames array. */
enum {
1479
  /* Returned by getEncodingIndex() for a name it does not know. */
  UNKNOWN_ENC = -1,
1480
  ISO_8859_1_ENC = 0,
1481
  US_ASCII_ENC,
1482
  UTF_8_ENC,
1483
  UTF_16_ENC,
1484
  UTF_16BE_ENC,
1485
  UTF_16LE_ENC,
1486
  /* must match encodingNames up to here */
1487
  /* Returned by getEncodingIndex() when the name is NULL. */
  NO_ENC
1488
};
1489
1490
/* Upper-case, NUL-terminated names of the built-in encodings, spelled with
   the ASCII_* constants.  getEncodingIndex() compares against these
   case-insensitively. */
static const char KW_ISO_8859_1[]
1491
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
1492
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
1493
static const char KW_US_ASCII[]
1494
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1495
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
1496
static const char KW_UTF_8[]
1497
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1498
static const char KW_UTF_16[]
1499
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1500
static const char KW_UTF_16BE[]
1501
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1502
       ASCII_6, ASCII_B, ASCII_E, '\0'};
1503
static const char KW_UTF_16LE[]
1504
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1505
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1506
1507
static int FASTCALL
1508
16.2k
getEncodingIndex(const char *name) {
1509
16.2k
  static const char *const encodingNames[] = {
1510
16.2k
      KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1511
16.2k
  };
1512
16.2k
  int i;
1513
16.2k
  if (name == NULL)
1514
14.9k
    return NO_ENC;
1515
1.54k
  for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1516
1.50k
    if (streqci(name, encodingNames[i]))
1517
1.24k
      return i;
1518
36
  return UNKNOWN_ENC;
1519
1.27k
}
1520
1521
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX stores
   i narrowed to char.  The argument i is parenthesised so that the cast
   applies to the whole expression passed in.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1527
1528
/* This is what detects the encoding.  encodingTable maps from
1529
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1530
   the external (protocol) specified encoding; state is
1531
   XML_CONTENT_STATE if we're parsing an external text entity, and
1532
   XML_PROLOG_STATE otherwise.
1533
*/
1534
1535
static int
1536
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1537
7.48k
         int state, const char *ptr, const char *end, const char **nextTokPtr) {
1538
7.48k
  const ENCODING **encPtr;
1539
1540
7.48k
  if (ptr >= end)
1541
2
    return XML_TOK_NONE;
1542
7.48k
  encPtr = enc->encPtr;
1543
7.48k
  if (ptr + 1 == end) {
1544
    /* only a single byte available for auto-detection */
1545
14
#ifndef XML_DTD /* FIXME */
1546
    /* a well-formed document entity must have more than one byte */
1547
14
    if (state != XML_CONTENT_STATE)
1548
14
      return XML_TOK_PARTIAL;
1549
0
#endif
1550
    /* so we're parsing an external text entity... */
1551
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
1552
0
    switch (INIT_ENC_INDEX(enc)) {
1553
0
    case UTF_16_ENC:
1554
0
    case UTF_16LE_ENC:
1555
0
    case UTF_16BE_ENC:
1556
0
      return XML_TOK_PARTIAL;
1557
0
    }
1558
0
    switch ((unsigned char)*ptr) {
1559
0
    case 0xFE:
1560
0
    case 0xFF:
1561
0
    case 0xEF: /* possibly first byte of UTF-8 BOM */
1562
0
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1563
0
        break;
1564
      /* fall through */
1565
0
    case 0x00:
1566
0
    case 0x3C:
1567
0
      return XML_TOK_PARTIAL;
1568
0
    }
1569
7.47k
  } else {
1570
7.47k
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1571
90
    case 0xFEFF:
1572
90
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1573
0
        break;
1574
90
      *nextTokPtr = ptr + 2;
1575
90
      *encPtr = encodingTable[UTF_16BE_ENC];
1576
90
      return XML_TOK_BOM;
1577
    /* 00 3C is handled in the default case */
1578
0
    case 0x3C00:
1579
0
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1580
0
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1581
0
          && state == XML_CONTENT_STATE)
1582
0
        break;
1583
0
      *encPtr = encodingTable[UTF_16LE_ENC];
1584
0
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1585
111
    case 0xFFFE:
1586
111
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1587
0
        break;
1588
111
      *nextTokPtr = ptr + 2;
1589
111
      *encPtr = encodingTable[UTF_16LE_ENC];
1590
111
      return XML_TOK_BOM;
1591
64
    case 0xEFBB:
1592
      /* Maybe a UTF-8 BOM (EF BB BF) */
1593
      /* If there's an explicitly specified (external) encoding
1594
         of ISO-8859-1 or some flavour of UTF-16
1595
         and this is an external text entity,
1596
         don't look for the BOM,
1597
         because it might be a legal data.
1598
      */
1599
64
      if (state == XML_CONTENT_STATE) {
1600
0
        int e = INIT_ENC_INDEX(enc);
1601
0
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1602
0
            || e == UTF_16_ENC)
1603
0
          break;
1604
0
      }
1605
64
      if (ptr + 2 == end)
1606
2
        return XML_TOK_PARTIAL;
1607
62
      if ((unsigned char)ptr[2] == 0xBF) {
1608
51
        *nextTokPtr = ptr + 3;
1609
51
        *encPtr = encodingTable[UTF_8_ENC];
1610
51
        return XML_TOK_BOM;
1611
51
      }
1612
11
      break;
1613
7.20k
    default:
1614
7.20k
      if (ptr[0] == '\0') {
1615
        /* 0 isn't a legal data character. Furthermore a document
1616
           entity can only start with ASCII characters.  So the only
1617
           way this can fail to be big-endian UTF-16 if it it's an
1618
           external parsed general entity that's labelled as
1619
           UTF-16LE.
1620
        */
1621
0
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1622
0
          break;
1623
0
        *encPtr = encodingTable[UTF_16BE_ENC];
1624
0
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1625
7.20k
      } else if (ptr[1] == '\0') {
1626
        /* We could recover here in the case:
1627
            - parsing an external entity
1628
            - second byte is 0
1629
            - no externally specified encoding
1630
            - no encoding declaration
1631
           by assuming UTF-16LE.  But we don't, because this would mean when
1632
           presented just with a single byte, we couldn't reliably determine
1633
           whether we needed further bytes.
1634
        */
1635
0
        if (state == XML_CONTENT_STATE)
1636
0
          break;
1637
0
        *encPtr = encodingTable[UTF_16LE_ENC];
1638
0
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1639
0
      }
1640
7.20k
      break;
1641
7.47k
    }
1642
7.47k
  }
1643
7.21k
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1644
7.21k
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1645
7.48k
}
1646
1647
40.2k
/* First inclusion of xmltok_ns.c: NS() and ns() expand to their argument
   unchanged (presumably so that the identifiers xmltok_ns.c wraps in them
   get no suffix -- that file is not visible here). */
#define NS(x) x
1648
7.48k
#define ns(x) x
1649
#define XML_TOK_NS_C
1650
#include "xmltok_ns.c"
1651
#undef XML_TOK_NS_C
1652
#undef NS
1653
#undef ns
1654
1655
#ifdef XML_NS
1656
1657
/* Second inclusion of xmltok_ns.c: NS() appends "NS" and ns() appends "_ns"
   to its argument. */
#  define NS(x) x##NS
1658
#  define ns(x) x##_ns
1659
1660
#  define XML_TOK_NS_C
1661
#  include "xmltok_ns.c"
1662
#  undef XML_TOK_NS_C
1663
1664
#  undef NS
1665
#  undef ns
1666
1667
ENCODING *
1668
XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1669
                         void *userData) {
1670
  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1671
  if (enc)
1672
    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1673
  return enc;
1674
}
1675
1676
#endif /* XML_NS */