Coverage Report

Created: 2026-04-01 07:17

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/ghostpdl/expat/lib/xmltok.c
Line
Count
Source
1
/*
2
                            __  __            _
3
                         ___\ \/ /_ __   __ _| |_
4
                        / _ \\  /| '_ \ / _` | __|
5
                       |  __//  \| |_) | (_| | |_
6
                        \___/_/\_\ .__/ \__,_|\__|
7
                                 |_| XML parser
8
9
   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10
   Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11
   Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12
   Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
13
   Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14
   Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15
   Copyright (c) 2016-2024 Sebastian Pipping <sebastian@pipping.org>
16
   Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
17
   Copyright (c) 2016      Don Lewis <truckman@apache.org>
18
   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
19
   Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
20
   Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
21
   Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
22
   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
23
   Copyright (c) 2021      Donghee Na <donghee.na@python.org>
24
   Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
25
   Copyright (c) 2022      Sean McBride <sean@rogue-research.com>
26
   Copyright (c) 2023      Hanno Böck <hanno@gentoo.org>
27
   Licensed under the MIT license:
28
29
   Permission is  hereby granted,  free of charge,  to any  person obtaining
30
   a  copy  of  this  software   and  associated  documentation  files  (the
31
   "Software"),  to  deal in  the  Software  without restriction,  including
32
   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
33
   distribute, sublicense, and/or sell copies of the Software, and to permit
34
   persons  to whom  the Software  is  furnished to  do so,  subject to  the
35
   following conditions:
36
37
   The above copyright  notice and this permission notice  shall be included
38
   in all copies or substantial portions of the Software.
39
40
   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
41
   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
42
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
43
   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
44
   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
45
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
46
   USE OR OTHER DEALINGS IN THE SOFTWARE.
47
*/
48
49
#include "expat_config.h"
50
51
#include <stddef.h>
52
#include <string.h> /* memcpy */
53
54
#if defined(_MSC_VER) && _MSC_VER < 1700
55
typedef unsigned char bool;
56
#define true (bool)1
57
#define false (bool)0
58
#else
59
#include <stdbool.h>
60
#endif
61
62
#ifdef _WIN32
63
#  include "winconfig.h"
64
#endif
65
66
#include "expat_external.h"
67
#include "internal.h"
68
#include "xmltok.h"
69
#include "nametab.h"
70
71
#ifdef XML_DTD
72
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
73
#else
74
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
75
#endif
76
77
#define VTABLE1                                                                \
78
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
79
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
80
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
81
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
82
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
83
      PREFIX(updatePosition), PREFIX(isPublicId)
84
85
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
86
87
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
88
0
  (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
89
90
/* A 2 byte UTF-8 representation splits the characters 11 bits between
91
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
92
   pages, 3 bits to add to that index and 5 bits to generate the mask.
93
*/
94
#define UTF8_GET_NAMING2(pages, byte)                                          \
95
0
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
96
0
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
97
0
   & (1u << (((byte)[1]) & 0x1F)))
98
99
/* A 3 byte UTF-8 representation splits the characters 16 bits between
100
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
101
   into pages, 3 bits to add to that index and 5 bits to generate the
102
   mask.
103
*/
104
#define UTF8_GET_NAMING3(pages, byte)                                          \
105
0
  (namingBitmap                                                                \
106
0
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
107
0
         << 3)                                                                 \
108
0
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
109
0
   & (1u << (((byte)[2]) & 0x1F)))
110
111
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
112
   of Unicode 3.2: https://www.unicode.org/unicode/reports/tr28/
113
   with the additional restriction of not allowing the Unicode
114
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
115
   Implementation details:
116
     (A & 0x80) == 0     means A < 0x80
117
   and
118
     (A & 0xC0) == 0xC0  means A > 0xBF
119
*/
120
121
#define UTF8_INVALID2(p)                                                       \
122
0
  ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
123
124
#define UTF8_INVALID3(p)                                                       \
125
429
  (((p)[2] & 0x80) == 0                                                        \
126
429
   || ((*p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                          \
127
429
                                      : ((p)[2] & 0xC0) == 0xC0)               \
128
429
   || ((*p) == 0xE0                                                            \
129
429
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
130
429
           : ((p)[1] & 0x80) == 0                                              \
131
429
                 || ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
132
133
#define UTF8_INVALID4(p)                                                       \
134
0
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
135
0
   || ((p)[2] & 0xC0) == 0xC0                                                  \
136
0
   || ((*p) == 0xF0                                                            \
137
0
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
138
0
           : ((p)[1] & 0x80) == 0                                              \
139
0
                 || ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
140
141
static int PTRFASTCALL
142
0
isNever(const ENCODING *enc, const char *p) {
143
0
  UNUSED_P(enc);
144
0
  UNUSED_P(p);
145
0
  return 0;
146
0
}
147
148
static int PTRFASTCALL
149
0
utf8_isName2(const ENCODING *enc, const char *p) {
150
0
  UNUSED_P(enc);
151
0
  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
152
0
}
153
154
static int PTRFASTCALL
155
0
utf8_isName3(const ENCODING *enc, const char *p) {
156
0
  UNUSED_P(enc);
157
0
  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
158
0
}
159
160
#define utf8_isName4 isNever
161
162
static int PTRFASTCALL
163
0
utf8_isNmstrt2(const ENCODING *enc, const char *p) {
164
0
  UNUSED_P(enc);
165
0
  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
166
0
}
167
168
static int PTRFASTCALL
169
0
utf8_isNmstrt3(const ENCODING *enc, const char *p) {
170
0
  UNUSED_P(enc);
171
0
  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
172
0
}
173
174
#define utf8_isNmstrt4 isNever
175
176
static int PTRFASTCALL
177
0
utf8_isInvalid2(const ENCODING *enc, const char *p) {
178
0
  UNUSED_P(enc);
179
0
  return UTF8_INVALID2((const unsigned char *)p);
180
0
}
181
182
static int PTRFASTCALL
183
429
utf8_isInvalid3(const ENCODING *enc, const char *p) {
184
429
  UNUSED_P(enc);
185
429
  return UTF8_INVALID3((const unsigned char *)p);
186
429
}
187
188
static int PTRFASTCALL
189
0
utf8_isInvalid4(const ENCODING *enc, const char *p) {
190
0
  UNUSED_P(enc);
191
0
  return UTF8_INVALID4((const unsigned char *)p);
192
0
}
193
194
/* Encoding descriptor used by the tokenizer instantiations in this file.
   AS_NORMAL_ENCODING casts an ENCODING pointer to a pointer to this struct,
   which is why the ENCODING is the first member.  Instances are filled in
   positionally by the VTABLE1 / STANDARD_VTABLE / NORMAL_VTABLE /
   NULL_VTABLE initializer macros, so member order matters. */
struct normal_encoding {
195
  ENCODING enc;
196
  /* byte type table; SB_BYTE_TYPE indexes it with the byte value cast to
     unsigned char */
  unsigned char type[256];
197
#ifdef XML_MIN_SIZE
198
  /* In XML_MIN_SIZE builds the BYTE_TYPE, IS_NAME_CHAR_MINBPC,
     IS_NMSTRT_CHAR_MINBPC, BYTE_TO_ASCII and CHAR_MATCHES macros dispatch
     through these five pointers. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
199
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
200
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
201
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
202
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
203
#endif /* XML_MIN_SIZE */
204
  /* Called through IS_NAME_CHAR(enc, p, n) for n = 2, 3, 4. */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
205
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
206
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
207
  /* Called through IS_NMSTRT_CHAR(enc, p, n) for n = 2, 3, 4. */
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
208
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
209
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
210
  /* Called through IS_INVALID_CHAR(enc, p, n) for n = 2, 3, 4.  Encodings
     initialized with NULL_VTABLE leave these nine pointers NULL. */
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
211
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
212
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
213
};
214
215
429
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
216
217
#ifdef XML_MIN_SIZE
218
219
#  define STANDARD_VTABLE(E)                                                   \
220
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
221
222
#else
223
224
#  define STANDARD_VTABLE(E) /* as nothing */
225
226
#endif
227
228
#define NORMAL_VTABLE(E)                                                       \
229
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
230
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
231
232
#define NULL_VTABLE                                                            \
233
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
234
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
235
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
236
237
static int FASTCALL checkCharRefNumber(int result);
238
239
#include "xmltok_impl.h"
240
#include "ascii.h"
241
242
#ifdef XML_MIN_SIZE
243
#  define sb_isNameMin isNever
244
#  define sb_isNmstrtMin isNever
245
#endif
246
247
#ifdef XML_MIN_SIZE
248
#  define MINBPC(enc) ((enc)->minBytesPerChar)
249
#else
250
/* minimum bytes per character */
251
7.99M
#  define MINBPC(enc) 1
252
#endif
253
254
#define SB_BYTE_TYPE(enc, p)                                                   \
255
5.10M
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
256
257
#ifdef XML_MIN_SIZE
258
static int PTRFASTCALL
259
sb_byteType(const ENCODING *enc, const char *p) {
260
  return SB_BYTE_TYPE(enc, p);
261
}
262
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
263
#else
264
5.10M
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
265
#endif
266
267
#ifdef XML_MIN_SIZE
268
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
269
static int PTRFASTCALL
270
sb_byteToAscii(const ENCODING *enc, const char *p) {
271
  UNUSED_P(enc);
272
  return *p;
273
}
274
#else
275
167k
#  define BYTE_TO_ASCII(enc, p) (*(p))
276
#endif
277
278
0
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
279
0
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
280
#ifdef XML_MIN_SIZE
281
#  define IS_INVALID_CHAR(enc, p, n)                                           \
282
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
283
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
284
#else
285
#  define IS_INVALID_CHAR(enc, p, n)                                           \
286
429
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
287
#endif
288
289
#ifdef XML_MIN_SIZE
290
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
291
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
292
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
293
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
294
#else
295
0
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
296
0
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
297
#endif
298
299
#ifdef XML_MIN_SIZE
300
#  define CHAR_MATCHES(enc, p, c)                                              \
301
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
302
static int PTRCALL
303
sb_charMatches(const ENCODING *enc, const char *p, int c) {
304
  UNUSED_P(enc);
305
  return *p == c;
306
}
307
#else
308
/* c is an ASCII character */
309
12.1k
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
310
#endif
311
312
22.6k
#define PREFIX(ident) normal_##ident
313
#define XML_TOK_IMPL_C
314
#include "xmltok_impl.c"
315
#undef XML_TOK_IMPL_C
316
317
#undef MINBPC
318
#undef BYTE_TYPE
319
#undef BYTE_TO_ASCII
320
#undef CHAR_MATCHES
321
#undef IS_NAME_CHAR
322
#undef IS_NAME_CHAR_MINBPC
323
#undef IS_NMSTRT_CHAR
324
#undef IS_NMSTRT_CHAR_MINBPC
325
#undef IS_INVALID_CHAR
326
327
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
328
       UTF8_cval1 = 0x00,
329
       UTF8_cval2 = 0xc0,
330
       UTF8_cval3 = 0xe0,
331
       UTF8_cval4 = 0xf0
332
};
333
334
/* Moves *fromLimRef backwards, if necessary, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned from its end towards `from`:
   - a byte of the form 0xxxxxxx ends the scan, the limit stays where the
     scan currently is;
   - a lead byte (110xxxxx, 1110xxxx, 11110xxx) ends the scan with the whole
     character kept when enough bytes follow it, otherwise the lead byte is
     dropped and the scan goes on;
   - every other byte is skipped.
   If `from` is reached, the limit ends up equal to `from`. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t seen = 0; /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char byte = (unsigned char)end[-1];
    size_t seqLen;

    if ((byte & 0x80u) == 0x00u)
      break; /* 1-byte character, nothing to trim here */

    if ((byte & 0xf8u) == 0xf0u)
      seqLen = 4; /* lead byte 0b11110xxx */
    else if ((byte & 0xf0u) == 0xe0u)
      seqLen = 3; /* lead byte 0b1110xxxx */
    else if ((byte & 0xe0u) == 0xc0u)
      seqLen = 2; /* lead byte 0b110xxxxx */
    else
      seqLen = 0; /* not a lead byte */

    if (seqLen != 0) {
      if (seen + 1 >= seqLen) {
        /* character is complete: put the limit right behind it */
        end += seqLen - 1;
        break;
      }
      seen = 0; /* incomplete: the lead byte gets dropped below */
    }

    end--;
    seen++;
  }

  *fromLimRef = end;
}
372
373
static enum XML_Convert_Result PTRCALL
374
utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
375
123k
            char **toP, const char *toLim) {
376
123k
  bool input_incomplete = false;
377
123k
  bool output_exhausted = false;
378
379
  /* Avoid copying partial characters (due to limited space). */
380
123k
  const ptrdiff_t bytesAvailable = fromLim - *fromP;
381
123k
  const ptrdiff_t bytesStorable = toLim - *toP;
382
123k
  UNUSED_P(enc);
383
123k
  if (bytesAvailable > bytesStorable) {
384
2.31k
    fromLim = *fromP + bytesStorable;
385
2.31k
    output_exhausted = true;
386
2.31k
  }
387
388
  /* Avoid copying partial characters (from incomplete input). */
389
123k
  {
390
123k
    const char *const fromLimBefore = fromLim;
391
123k
    _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
392
123k
    if (fromLim < fromLimBefore) {
393
0
      input_incomplete = true;
394
0
    }
395
123k
  }
396
397
123k
  {
398
123k
    const ptrdiff_t bytesToCopy = fromLim - *fromP;
399
123k
    memcpy(*toP, *fromP, bytesToCopy);
400
123k
    *fromP += bytesToCopy;
401
123k
    *toP += bytesToCopy;
402
123k
  }
403
404
123k
  if (output_exhausted) /* needs to go first */
405
2.31k
    return XML_CONVERT_OUTPUT_EXHAUSTED;
406
121k
  else if (input_incomplete)
407
0
    return XML_CONVERT_INPUT_INCOMPLETE;
408
121k
  else
409
121k
    return XML_CONVERT_COMPLETED;
410
123k
}
411
412
static enum XML_Convert_Result PTRCALL
413
utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
414
0
             unsigned short **toP, const unsigned short *toLim) {
415
0
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
416
0
  unsigned short *to = *toP;
417
0
  const char *from = *fromP;
418
0
  while (from < fromLim && to < toLim) {
419
0
    switch (SB_BYTE_TYPE(enc, from)) {
420
0
    case BT_LEAD2:
421
0
      if (fromLim - from < 2) {
422
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
423
0
        goto after;
424
0
      }
425
0
      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
426
0
      from += 2;
427
0
      break;
428
0
    case BT_LEAD3:
429
0
      if (fromLim - from < 3) {
430
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
431
0
        goto after;
432
0
      }
433
0
      *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
434
0
                               | (from[2] & 0x3f));
435
0
      from += 3;
436
0
      break;
437
0
    case BT_LEAD4: {
438
0
      unsigned long n;
439
0
      if (toLim - to < 2) {
440
0
        res = XML_CONVERT_OUTPUT_EXHAUSTED;
441
0
        goto after;
442
0
      }
443
0
      if (fromLim - from < 4) {
444
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
445
0
        goto after;
446
0
      }
447
0
      n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
448
0
          | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
449
0
      n -= 0x10000;
450
0
      to[0] = (unsigned short)((n >> 10) | 0xD800);
451
0
      to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
452
0
      to += 2;
453
0
      from += 4;
454
0
    } break;
455
0
    default:
456
0
      *to++ = *from++;
457
0
      break;
458
0
    }
459
0
  }
460
0
  if (from < fromLim)
461
0
    res = XML_CONVERT_OUTPUT_EXHAUSTED;
462
0
after:
463
0
  *fromP = from;
464
0
  *toP = to;
465
0
  return res;
466
0
}
467
468
#ifdef XML_NS
469
static const struct normal_encoding utf8_encoding_ns
470
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
471
       {
472
#  include "asciitab.h"
473
#  include "utf8tab.h"
474
       },
475
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
476
#endif
477
478
static const struct normal_encoding utf8_encoding
479
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
480
       {
481
#define BT_COLON BT_NMSTRT
482
#include "asciitab.h"
483
#undef BT_COLON
484
#include "utf8tab.h"
485
       },
486
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
487
488
#ifdef XML_NS
489
490
static const struct normal_encoding internal_utf8_encoding_ns
491
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
492
       {
493
#  include "iasciitab.h"
494
#  include "utf8tab.h"
495
       },
496
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
497
498
#endif
499
500
static const struct normal_encoding internal_utf8_encoding
501
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
502
       {
503
#define BT_COLON BT_NMSTRT
504
#include "iasciitab.h"
505
#undef BT_COLON
506
#include "utf8tab.h"
507
       },
508
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
509
510
static enum XML_Convert_Result PTRCALL
511
latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
512
0
              char **toP, const char *toLim) {
513
0
  UNUSED_P(enc);
514
0
  for (;;) {
515
0
    unsigned char c;
516
0
    if (*fromP == fromLim)
517
0
      return XML_CONVERT_COMPLETED;
518
0
    c = (unsigned char)**fromP;
519
0
    if (c & 0x80) {
520
0
      if (toLim - *toP < 2)
521
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
522
0
      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
523
0
      *(*toP)++ = (char)((c & 0x3f) | 0x80);
524
0
      (*fromP)++;
525
0
    } else {
526
0
      if (*toP == toLim)
527
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
528
0
      *(*toP)++ = *(*fromP)++;
529
0
    }
530
0
  }
531
0
}
532
533
static enum XML_Convert_Result PTRCALL
534
latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
535
0
               unsigned short **toP, const unsigned short *toLim) {
536
0
  UNUSED_P(enc);
537
0
  while (*fromP < fromLim && *toP < toLim)
538
0
    *(*toP)++ = (unsigned char)*(*fromP)++;
539
540
0
  if ((*toP == toLim) && (*fromP < fromLim))
541
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
542
0
  else
543
0
    return XML_CONVERT_COMPLETED;
544
0
}
545
546
#ifdef XML_NS
547
548
static const struct normal_encoding latin1_encoding_ns
549
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
550
       {
551
#  include "asciitab.h"
552
#  include "latin1tab.h"
553
       },
554
       STANDARD_VTABLE(sb_) NULL_VTABLE};
555
556
#endif
557
558
static const struct normal_encoding latin1_encoding
559
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
560
       {
561
#define BT_COLON BT_NMSTRT
562
#include "asciitab.h"
563
#undef BT_COLON
564
#include "latin1tab.h"
565
       },
566
       STANDARD_VTABLE(sb_) NULL_VTABLE};
567
568
static enum XML_Convert_Result PTRCALL
569
ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
570
0
             char **toP, const char *toLim) {
571
0
  UNUSED_P(enc);
572
0
  while (*fromP < fromLim && *toP < toLim)
573
0
    *(*toP)++ = *(*fromP)++;
574
575
0
  if ((*toP == toLim) && (*fromP < fromLim))
576
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
577
0
  else
578
0
    return XML_CONVERT_COMPLETED;
579
0
}
580
581
#ifdef XML_NS
582
583
static const struct normal_encoding ascii_encoding_ns
584
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
585
       {
586
#  include "asciitab.h"
587
           /* BT_NONXML == 0 */
588
       },
589
       STANDARD_VTABLE(sb_) NULL_VTABLE};
590
591
#endif
592
593
static const struct normal_encoding ascii_encoding
594
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
595
       {
596
#define BT_COLON BT_NMSTRT
597
#include "asciitab.h"
598
#undef BT_COLON
599
           /* BT_NONXML == 0 */
600
       },
601
       STANDARD_VTABLE(sb_) NULL_VTABLE};
602
603
static int PTRFASTCALL
604
0
unicode_byte_type(char hi, char lo) {
605
0
  switch ((unsigned char)hi) {
606
  /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
607
0
  case 0xD8:
608
0
  case 0xD9:
609
0
  case 0xDA:
610
0
  case 0xDB:
611
0
    return BT_LEAD4;
612
  /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
613
0
  case 0xDC:
614
0
  case 0xDD:
615
0
  case 0xDE:
616
0
  case 0xDF:
617
0
    return BT_TRAIL;
618
0
  case 0xFF:
619
0
    switch ((unsigned char)lo) {
620
0
    case 0xFF: /* noncharacter-FFFF */
621
0
    case 0xFE: /* noncharacter-FFFE */
622
0
      return BT_NONXML;
623
0
    }
624
0
    break;
625
0
  }
626
0
  return BT_NONASCII;
627
0
}
628
629
/* Generates E##toUtf8, a converter from 16-bit units to UTF-8.  GET_LO and
   GET_HI must be defined at the point of instantiation; they pick the low
   and high byte of the unit at a given address.

   The generated function consumes whole 16-bit units from *fromP (an odd
   trailing byte is ignored) and appends 1 to 4 bytes per character at *toP.
   *toP is advanced as bytes are written; *fromP is updated on every return.
   It returns XML_CONVERT_OUTPUT_EXHAUSTED when the next character does not
   fit before toLim, XML_CONVERT_INPUT_INCOMPLETE when a high surrogate is
   not followed by another unit, and XML_CONVERT_COMPLETED otherwise. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      const unsigned char lo = GET_LO(from);                                   \
      const unsigned char hi = GET_HI(from);                                   \
      if (hi == 0 && lo < 0x80) {                                              \
        /* below 0x80: a single byte */                                        \
        if (*toP == toLim) {                                                   \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = lo;                                                        \
      } else if (hi <= 0x7) {                                                  \
        /* below 0x800: 11 bits divided 5, 6 amongst 2 bytes */                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
      } else if (hi >= 0xD8 && hi <= 0xDB) {                                   \
        /* high surrogate: combine with the following unit, 4 bytes out */     \
        int plane;                                                             \
        unsigned char lo2;                                                     \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2; /* move on to the second unit of the pair */                \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
      } else {                                                                 \
        /* everything else: 16 bits divided 4, 6, 6 amongst 3 bytes */         \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    return from < fromLim ? XML_CONVERT_INPUT_INCOMPLETE                       \
                          : XML_CONVERT_COMPLETED;                             \
  }
Unexecuted instantiation: xmltok.c:little2_toUtf8
Unexecuted instantiation: xmltok.c:big2_toUtf8
705
706
/* Generates E##toUtf16, which copies 16-bit units from a byte stream into
   an array of unsigned short.  GET_LO and GET_HI must be defined at the
   point of instantiation; they pick the low and high byte of the unit at a
   given address.

   The generated function ignores an odd trailing input byte.  When the
   input does not fit into the output and its last unit has a high byte in
   0xD8..0xDF, that unit is held back and the result (unless overridden by
   output exhaustion) is XML_CONVERT_INPUT_INCOMPLETE.  It returns
   XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while input remains.
   *fromP and *toP are advanced past what was copied. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    const char *src = *fromP;                                                  \
    unsigned short *dst = *toP;                                                \
    UNUSED_P(enc);                                                             \
    fromLim = src + (((fromLim - src) >> 1) << 1); /* shrink to even */        \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - src > ((toLim - dst) << 1)                                   \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    while (src < fromLim && dst < toLim) {                                     \
      *dst++ = (GET_HI(src) << 8) | GET_LO(src);                               \
      src += 2;                                                                \
    }                                                                          \
    *fromP = src;                                                              \
    *toP = dst;                                                                \
    if (dst == toLim && src < fromLim)                                         \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    return res;                                                                \
  }
Unexecuted instantiation: xmltok.c:little2_toUtf16
Unexecuted instantiation: xmltok.c:big2_toUtf16
726
727
0
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
728
0
#define GET_HI(ptr) ((unsigned char)(ptr)[1])
729
730
DEFINE_UTF16_TO_UTF8(little2_)
731
DEFINE_UTF16_TO_UTF16(little2_)
732
733
#undef GET_LO
734
#undef GET_HI
735
736
0
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
737
0
#define GET_HI(ptr) ((unsigned char)(ptr)[0])
738
739
DEFINE_UTF16_TO_UTF8(big2_)
740
DEFINE_UTF16_TO_UTF16(big2_)
741
742
#undef GET_LO
743
#undef GET_HI
744
745
/* Character-classification helpers for 2-byte input whose low byte is
   stored first: (p)[0] is the low byte and (p)[1] the high byte.

   LITTLE2_BYTE_TYPE      byte type via SB_BYTE_TYPE when the high byte is 0,
                          otherwise via unicode_byte_type(hi, lo).
   LITTLE2_BYTE_TO_ASCII  the low byte when the high byte is 0, else -1.
   LITTLE2_CHAR_MATCHES   non-zero iff the unit is exactly (0, c).
   LITTLE2_IS_*_MINBPC    name / name-start lookup through UCS2_GET_NAMING.

   Every use of the pointer argument is parenthesized so that expression
   arguments such as `ptr + 2` expand correctly. */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? SB_BYTE_TYPE(enc, p) : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
753
754
#ifdef XML_MIN_SIZE
755
756
/* Function form of LITTLE2_BYTE_TYPE; this definition sits inside the
   XML_MIN_SIZE branch.  Returns whatever that macro yields for the 2-byte
   unit at p. */
static int PTRFASTCALL
757
little2_byteType(const ENCODING *enc, const char *p) {
758
  return LITTLE2_BYTE_TYPE(enc, p);
759
}
760
761
/* Function form of LITTLE2_BYTE_TO_ASCII (XML_MIN_SIZE branch): returns
   p[0] when p[1] is 0, otherwise -1.  enc is unused. */
static int PTRFASTCALL
762
little2_byteToAscii(const ENCODING *enc, const char *p) {
763
  UNUSED_P(enc);
764
  return LITTLE2_BYTE_TO_ASCII(p);
765
}
766
767
/* Function form of LITTLE2_CHAR_MATCHES (XML_MIN_SIZE branch): non-zero
   iff p[1] is 0 and p[0] equals c.  enc is unused. */
static int PTRCALL
768
little2_charMatches(const ENCODING *enc, const char *p, int c) {
769
  UNUSED_P(enc);
770
  return LITTLE2_CHAR_MATCHES(p, c);
771
}
772
773
/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC (XML_MIN_SIZE branch):
   looks the unit at p up in namePages via UCS2_GET_NAMING.  enc is
   unused. */
static int PTRFASTCALL
774
little2_isNameMin(const ENCODING *enc, const char *p) {
775
  UNUSED_P(enc);
776
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
777
}
778
779
/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC (XML_MIN_SIZE branch):
   looks the unit at p up in nmstrtPages via UCS2_GET_NAMING.  enc is
   unused. */
static int PTRFASTCALL
780
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
781
  UNUSED_P(enc);
782
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
783
}
784
785
#  undef VTABLE
786
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
787
788
#else /* not XML_MIN_SIZE */
789
790
#  undef PREFIX
791
0
#  define PREFIX(ident) little2_##ident
792
0
#  define MINBPC(enc) 2
793
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
794
0
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
795
0
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
796
0
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
797
0
#  define IS_NAME_CHAR(enc, p, n) 0
798
0
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
799
0
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
800
0
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
801
802
#  define XML_TOK_IMPL_C
803
#  include "xmltok_impl.c"
804
#  undef XML_TOK_IMPL_C
805
806
#  undef MINBPC
807
#  undef BYTE_TYPE
808
#  undef BYTE_TO_ASCII
809
#  undef CHAR_MATCHES
810
#  undef IS_NAME_CHAR
811
#  undef IS_NAME_CHAR_MINBPC
812
#  undef IS_NMSTRT_CHAR
813
#  undef IS_NMSTRT_CHAR_MINBPC
814
#  undef IS_INVALID_CHAR
815
816
#endif /* not XML_MIN_SIZE */
817
818
#ifdef XML_NS
819
820
/* Encoding table for the little2_ tokenizer, defined only under XML_NS.
   asciitab.h is included unmodified here (no BT_COLON remapping).
   The fourth scalar is 1 only when BYTEORDER == 1234.
   NOTE(review): the scalars after VTABLE are presumably minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against the struct declaration. */
static const struct normal_encoding little2_encoding_ns
821
    = {{VTABLE, 2, 0,
822
#  if BYTEORDER == 1234
823
        1
824
#  else
825
        0
826
#  endif
827
       },
828
       {
829
#  include "asciitab.h"
830
#  include "latin1tab.h"
831
       },
832
       STANDARD_VTABLE(little2_) NULL_VTABLE};
833
834
#endif
835
836
/* Encoding table for the little2_ tokenizer.  While asciitab.h is being
   included BT_COLON is temporarily defined as BT_NMSTRT, so any table
   entry spelled BT_COLON ends up with the BT_NMSTRT type.
   The fourth scalar is 1 only when BYTEORDER == 1234.
   NOTE(review): the scalars after VTABLE are presumably minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against the struct declaration. */
static const struct normal_encoding little2_encoding
837
    = {{VTABLE, 2, 0,
838
#if BYTEORDER == 1234
839
        1
840
#else
841
        0
842
#endif
843
       },
844
       {
845
#define BT_COLON BT_NMSTRT
846
#include "asciitab.h"
847
#undef BT_COLON
848
#include "latin1tab.h"
849
       },
850
       STANDARD_VTABLE(little2_) NULL_VTABLE};
851
852
#if BYTEORDER != 4321
853
854
#  ifdef XML_NS
855
856
/* "Internal" variant of the little2_ table, defined only when
   BYTEORDER != 4321 and XML_NS is set.  It differs from
   little2_encoding_ns in using iasciitab.h instead of asciitab.h and in
   hard-coding the fourth scalar to 1. */
static const struct normal_encoding internal_little2_encoding_ns
857
    = {{VTABLE, 2, 0, 1},
858
       {
859
#    include "iasciitab.h"
860
#    include "latin1tab.h"
861
       },
862
       STANDARD_VTABLE(little2_) NULL_VTABLE};
863
864
#  endif
865
866
/* "Internal" variant of the little2_ table, defined only when
   BYTEORDER != 4321.  Uses iasciitab.h (with BT_COLON temporarily mapped
   to BT_NMSTRT) and hard-codes the fourth scalar to 1. */
static const struct normal_encoding internal_little2_encoding
867
    = {{VTABLE, 2, 0, 1},
868
       {
869
#  define BT_COLON BT_NMSTRT
870
#  include "iasciitab.h"
871
#  undef BT_COLON
872
#  include "latin1tab.h"
873
       },
874
       STANDARD_VTABLE(little2_) NULL_VTABLE};
875
876
#endif
877
878
/* Character-classification helpers for 2-byte input whose high byte is
   stored first: (p)[0] is the high byte and (p)[1] the low byte.

   BIG2_BYTE_TYPE      byte type via SB_BYTE_TYPE on the low byte when the
                       high byte is 0, otherwise unicode_byte_type(hi, lo).
   BIG2_BYTE_TO_ASCII  the low byte when the high byte is 0, else -1.
   BIG2_CHAR_MATCHES   non-zero iff the unit is exactly (0, c).
   BIG2_IS_*_MINBPC    name / name-start lookup through UCS2_GET_NAMING.

   Every use of the pointer argument is parenthesized so that expression
   arguments such as `ptr + 2` expand correctly. */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0 ? SB_BYTE_TYPE(enc, (p) + 1)                                    \
               : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
886
887
#ifdef XML_MIN_SIZE
888
889
/* Function form of BIG2_BYTE_TYPE; this definition sits inside the
   XML_MIN_SIZE branch.  Returns whatever that macro yields for the 2-byte
   unit at p. */
static int PTRFASTCALL
890
big2_byteType(const ENCODING *enc, const char *p) {
891
  return BIG2_BYTE_TYPE(enc, p);
892
}
893
894
/* Function form of BIG2_BYTE_TO_ASCII (XML_MIN_SIZE branch): returns p[1]
   when p[0] is 0, otherwise -1.  enc is unused. */
static int PTRFASTCALL
895
big2_byteToAscii(const ENCODING *enc, const char *p) {
896
  UNUSED_P(enc);
897
  return BIG2_BYTE_TO_ASCII(p);
898
}
899
900
/* Function form of BIG2_CHAR_MATCHES (XML_MIN_SIZE branch): non-zero iff
   p[0] is 0 and p[1] equals c.  enc is unused. */
static int PTRCALL
901
big2_charMatches(const ENCODING *enc, const char *p, int c) {
902
  UNUSED_P(enc);
903
  return BIG2_CHAR_MATCHES(p, c);
904
}
905
906
/* Function form of BIG2_IS_NAME_CHAR_MINBPC (XML_MIN_SIZE branch): looks
   the unit at p up in namePages via UCS2_GET_NAMING.  enc is unused. */
static int PTRFASTCALL
907
big2_isNameMin(const ENCODING *enc, const char *p) {
908
  UNUSED_P(enc);
909
  return BIG2_IS_NAME_CHAR_MINBPC(p);
910
}
911
912
/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC (XML_MIN_SIZE branch): looks
   the unit at p up in nmstrtPages via UCS2_GET_NAMING.  enc is unused. */
static int PTRFASTCALL
913
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
914
  UNUSED_P(enc);
915
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
916
}
917
918
#  undef VTABLE
919
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
920
921
#else /* not XML_MIN_SIZE */
922
923
#  undef PREFIX
924
0
#  define PREFIX(ident) big2_##ident
925
0
#  define MINBPC(enc) 2
926
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
927
0
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
928
0
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
929
0
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
930
0
#  define IS_NAME_CHAR(enc, p, n) 0
931
0
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
932
0
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
933
0
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
934
935
#  define XML_TOK_IMPL_C
936
#  include "xmltok_impl.c"
937
#  undef XML_TOK_IMPL_C
938
939
#  undef MINBPC
940
#  undef BYTE_TYPE
941
#  undef BYTE_TO_ASCII
942
#  undef CHAR_MATCHES
943
#  undef IS_NAME_CHAR
944
#  undef IS_NAME_CHAR_MINBPC
945
#  undef IS_NMSTRT_CHAR
946
#  undef IS_NMSTRT_CHAR_MINBPC
947
#  undef IS_INVALID_CHAR
948
949
#endif /* not XML_MIN_SIZE */
950
951
#ifdef XML_NS
952
953
/* Encoding table for the big2_ tokenizer, defined only under XML_NS.
   asciitab.h is included unmodified here (no BT_COLON remapping).
   The fourth scalar is 1 only when BYTEORDER == 4321.
   NOTE(review): the scalars after VTABLE are presumably minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against the struct declaration. */
static const struct normal_encoding big2_encoding_ns
954
    = {{VTABLE, 2, 0,
955
#  if BYTEORDER == 4321
956
        1
957
#  else
958
        0
959
#  endif
960
       },
961
       {
962
#  include "asciitab.h"
963
#  include "latin1tab.h"
964
       },
965
       STANDARD_VTABLE(big2_) NULL_VTABLE};
966
967
#endif
968
969
/* Encoding table for the big2_ tokenizer.  While asciitab.h is being
   included BT_COLON is temporarily defined as BT_NMSTRT, so any table
   entry spelled BT_COLON ends up with the BT_NMSTRT type.
   The fourth scalar is 1 only when BYTEORDER == 4321.
   NOTE(review): the scalars after VTABLE are presumably minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against the struct declaration. */
static const struct normal_encoding big2_encoding
970
    = {{VTABLE, 2, 0,
971
#if BYTEORDER == 4321
972
        1
973
#else
974
        0
975
#endif
976
       },
977
       {
978
#define BT_COLON BT_NMSTRT
979
#include "asciitab.h"
980
#undef BT_COLON
981
#include "latin1tab.h"
982
       },
983
       STANDARD_VTABLE(big2_) NULL_VTABLE};
984
985
#if BYTEORDER != 1234
986
987
#  ifdef XML_NS
988
989
/* "Internal" variant of the big2_ table, defined only when
   BYTEORDER != 1234 and XML_NS is set.  It differs from big2_encoding_ns
   in using iasciitab.h instead of asciitab.h and in hard-coding the fourth
   scalar to 1. */
static const struct normal_encoding internal_big2_encoding_ns
990
    = {{VTABLE, 2, 0, 1},
991
       {
992
#    include "iasciitab.h"
993
#    include "latin1tab.h"
994
       },
995
       STANDARD_VTABLE(big2_) NULL_VTABLE};
996
997
#  endif
998
999
/* "Internal" variant of the big2_ table, defined only when
   BYTEORDER != 1234.  Uses iasciitab.h (with BT_COLON temporarily mapped
   to BT_NMSTRT) and hard-codes the fourth scalar to 1. */
static const struct normal_encoding internal_big2_encoding
1000
    = {{VTABLE, 2, 0, 1},
1001
       {
1002
#  define BT_COLON BT_NMSTRT
1003
#  include "iasciitab.h"
1004
#  undef BT_COLON
1005
#  include "latin1tab.h"
1006
       },
1007
       STANDARD_VTABLE(big2_) NULL_VTABLE};
1008
1009
#endif
1010
1011
#undef PREFIX
1012
1013
static int FASTCALL
1014
276
streqci(const char *s1, const char *s2) {
1015
966
  for (;;) {
1016
966
    char c1 = *s1++;
1017
966
    char c2 = *s2++;
1018
966
    if (ASCII_a <= c1 && c1 <= ASCII_z)
1019
621
      c1 += ASCII_A - ASCII_a;
1020
966
    if (ASCII_a <= c2 && c2 <= ASCII_z)
1021
      /* The following line will never get executed.  streqci() is
1022
       * only called from two places, both of which guarantee to put
1023
       * upper-case strings into s2.
1024
       */
1025
0
      c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1026
966
    if (c1 != c2)
1027
207
      return 0;
1028
759
    if (! c1)
1029
69
      break;
1030
759
  }
1031
69
  return 1;
1032
276
}
1033
1034
/* Position updater that ignores the encoding it is handed: enc is unused
   and the work is delegated to normal_updatePosition() with
   utf8_encoding.enc, passing ptr, end and pos through unchanged. */
static void PTRCALL
1035
initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1036
0
                   POSITION *pos) {
1037
0
  UNUSED_P(enc);
1038
0
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1039
0
}
1040
1041
static int
1042
2.34k
toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1043
2.34k
  char buf[1];
1044
2.34k
  char *p = buf;
1045
2.34k
  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1046
2.34k
  if (p == buf)
1047
0
    return -1;
1048
2.34k
  else
1049
2.34k
    return buf[0];
1050
2.34k
}
1051
1052
static int FASTCALL
1053
1.44k
isSpace(int c) {
1054
1.44k
  switch (c) {
1055
138
  case 0x20:
1056
138
  case 0xD:
1057
138
  case 0xA:
1058
138
  case 0x9:
1059
138
    return 1;
1060
1.44k
  }
1061
1.31k
  return 0;
1062
1.44k
}
1063
1064
/* Return 1 if there's just optional white space or there's an S
1065
   followed by name=val.
1066
*/
1067
static int
1068
parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1069
                     const char **namePtr, const char **nameEndPtr,
1070
207
                     const char **valPtr, const char **nextTokPtr) {
1071
207
  int c;
1072
207
  char open;
1073
207
  if (ptr == end) {
1074
69
    *namePtr = NULL;
1075
69
    return 1;
1076
69
  }
1077
138
  if (! isSpace(toAscii(enc, ptr, end))) {
1078
0
    *nextTokPtr = ptr;
1079
0
    return 0;
1080
0
  }
1081
138
  do {
1082
138
    ptr += enc->minBytesPerChar;
1083
138
  } while (isSpace(toAscii(enc, ptr, end)));
1084
138
  if (ptr == end) {
1085
0
    *namePtr = NULL;
1086
0
    return 1;
1087
0
  }
1088
138
  *namePtr = ptr;
1089
1.17k
  for (;;) {
1090
1.17k
    c = toAscii(enc, ptr, end);
1091
1.17k
    if (c == -1) {
1092
0
      *nextTokPtr = ptr;
1093
0
      return 0;
1094
0
    }
1095
1.17k
    if (c == ASCII_EQUALS) {
1096
138
      *nameEndPtr = ptr;
1097
138
      break;
1098
138
    }
1099
1.03k
    if (isSpace(c)) {
1100
0
      *nameEndPtr = ptr;
1101
0
      do {
1102
0
        ptr += enc->minBytesPerChar;
1103
0
      } while (isSpace(c = toAscii(enc, ptr, end)));
1104
0
      if (c != ASCII_EQUALS) {
1105
0
        *nextTokPtr = ptr;
1106
0
        return 0;
1107
0
      }
1108
0
      break;
1109
0
    }
1110
1.03k
    ptr += enc->minBytesPerChar;
1111
1.03k
  }
1112
138
  if (ptr == *namePtr) {
1113
0
    *nextTokPtr = ptr;
1114
0
    return 0;
1115
0
  }
1116
138
  ptr += enc->minBytesPerChar;
1117
138
  c = toAscii(enc, ptr, end);
1118
138
  while (isSpace(c)) {
1119
0
    ptr += enc->minBytesPerChar;
1120
0
    c = toAscii(enc, ptr, end);
1121
0
  }
1122
138
  if (c != ASCII_QUOT && c != ASCII_APOS) {
1123
0
    *nextTokPtr = ptr;
1124
0
    return 0;
1125
0
  }
1126
138
  open = (char)c;
1127
138
  ptr += enc->minBytesPerChar;
1128
138
  *valPtr = ptr;
1129
690
  for (;; ptr += enc->minBytesPerChar) {
1130
690
    c = toAscii(enc, ptr, end);
1131
690
    if (c == open)
1132
138
      break;
1133
552
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1134
345
        && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1135
69
        && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1136
0
      *nextTokPtr = ptr;
1137
0
      return 0;
1138
0
    }
1139
552
  }
1140
138
  *nextTokPtr = ptr + enc->minBytesPerChar;
1141
138
  return 1;
1142
138
}
1143
1144
/* NUL-terminated keyword strings used by doParseXmlDecl(), spelled out with
   ASCII_* constants rather than string literals: "version", "encoding",
   "standalone", "yes" and "no".
   NOTE(review): presumably spelled this way so the bytes are ASCII
   regardless of the compiler's execution character set -- confirm. */
static const char KW_version[]
1145
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1146
1147
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1148
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};
1149
1150
static const char KW_standalone[]
1151
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1152
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1153
1154
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1155
1156
static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1157
1158
static int
1159
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1160
                                                 const char *),
1161
               int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1162
               const char *end, const char **badPtr, const char **versionPtr,
1163
               const char **versionEndPtr, const char **encodingName,
1164
69
               const ENCODING **encoding, int *standalone) {
1165
69
  const char *val = NULL;
1166
69
  const char *name = NULL;
1167
69
  const char *nameEnd = NULL;
1168
69
  ptr += 5 * enc->minBytesPerChar;
1169
69
  end -= 2 * enc->minBytesPerChar;
1170
69
  if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1171
69
      || ! name) {
1172
0
    *badPtr = ptr;
1173
0
    return 0;
1174
0
  }
1175
69
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1176
0
    if (! isGeneralTextEntity) {
1177
0
      *badPtr = name;
1178
0
      return 0;
1179
0
    }
1180
69
  } else {
1181
69
    if (versionPtr)
1182
69
      *versionPtr = val;
1183
69
    if (versionEndPtr)
1184
69
      *versionEndPtr = ptr;
1185
69
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1186
0
      *badPtr = ptr;
1187
0
      return 0;
1188
0
    }
1189
69
    if (! name) {
1190
0
      if (isGeneralTextEntity) {
1191
        /* a TextDecl must have an EncodingDecl */
1192
0
        *badPtr = ptr;
1193
0
        return 0;
1194
0
      }
1195
0
      return 1;
1196
0
    }
1197
69
  }
1198
69
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1199
69
    int c = toAscii(enc, val, end);
1200
69
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1201
0
      *badPtr = val;
1202
0
      return 0;
1203
0
    }
1204
69
    if (encodingName)
1205
69
      *encodingName = val;
1206
69
    if (encoding)
1207
69
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1208
69
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1209
0
      *badPtr = ptr;
1210
0
      return 0;
1211
0
    }
1212
69
    if (! name)
1213
69
      return 1;
1214
69
  }
1215
0
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1216
0
      || isGeneralTextEntity) {
1217
0
    *badPtr = name;
1218
0
    return 0;
1219
0
  }
1220
0
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1221
0
    if (standalone)
1222
0
      *standalone = 1;
1223
0
  } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1224
0
    if (standalone)
1225
0
      *standalone = 0;
1226
0
  } else {
1227
0
    *badPtr = val;
1228
0
    return 0;
1229
0
  }
1230
0
  while (isSpace(toAscii(enc, ptr, end)))
1231
0
    ptr += enc->minBytesPerChar;
1232
0
  if (ptr != end) {
1233
0
    *badPtr = ptr;
1234
0
    return 0;
1235
0
  }
1236
0
  return 1;
1237
0
}
1238
1239
static int FASTCALL
1240
0
checkCharRefNumber(int result) {
1241
0
  switch (result >> 8) {
1242
0
  case 0xD8:
1243
0
  case 0xD9:
1244
0
  case 0xDA:
1245
0
  case 0xDB:
1246
0
  case 0xDC:
1247
0
  case 0xDD:
1248
0
  case 0xDE:
1249
0
  case 0xDF:
1250
0
    return -1;
1251
0
  case 0:
1252
0
    if (latin1_encoding.type[result] == BT_NONXML)
1253
0
      return -1;
1254
0
    break;
1255
0
  case 0xFF:
1256
0
    if (result == 0xFFFE || result == 0xFFFF)
1257
0
      return -1;
1258
0
    break;
1259
0
  }
1260
0
  return result;
1261
0
}
1262
1263
int FASTCALL
1264
0
XmlUtf8Encode(int c, char *buf) {
1265
0
  enum {
1266
    /* minN is minimum legal resulting value for N byte sequence */
1267
0
    min2 = 0x80,
1268
0
    min3 = 0x800,
1269
0
    min4 = 0x10000
1270
0
  };
1271
1272
0
  if (c < 0)
1273
0
    return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1274
0
  if (c < min2) {
1275
0
    buf[0] = (char)(c | UTF8_cval1);
1276
0
    return 1;
1277
0
  }
1278
0
  if (c < min3) {
1279
0
    buf[0] = (char)((c >> 6) | UTF8_cval2);
1280
0
    buf[1] = (char)((c & 0x3f) | 0x80);
1281
0
    return 2;
1282
0
  }
1283
0
  if (c < min4) {
1284
0
    buf[0] = (char)((c >> 12) | UTF8_cval3);
1285
0
    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1286
0
    buf[2] = (char)((c & 0x3f) | 0x80);
1287
0
    return 3;
1288
0
  }
1289
0
  if (c < 0x110000) {
1290
0
    buf[0] = (char)((c >> 18) | UTF8_cval4);
1291
0
    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1292
0
    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1293
0
    buf[3] = (char)((c & 0x3f) | 0x80);
1294
0
    return 4;
1295
0
  }
1296
0
  return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1297
0
}
1298
1299
int FASTCALL
1300
0
XmlUtf16Encode(int charNum, unsigned short *buf) {
1301
0
  if (charNum < 0)
1302
0
    return 0;
1303
0
  if (charNum < 0x10000) {
1304
0
    buf[0] = (unsigned short)charNum;
1305
0
    return 1;
1306
0
  }
1307
0
  if (charNum < 0x110000) {
1308
0
    charNum -= 0x10000;
1309
0
    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1310
0
    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1311
0
    return 2;
1312
0
  }
1313
0
  return 0;
1314
0
}
1315
1316
/* Encoding object built by XmlInitUnknownEncoding() from a caller-supplied
   256-entry byte table. */
struct unknown_encoding {
1317
  /* starts as a copy of latin1_encoding, then patched per table entry */
  struct normal_encoding normal;
1318
  /* callback invoked for bytes whose utf8/utf16 entry is 0 */
  CONVERTER convert;
1319
  /* passed as the first argument to convert */
  void *userData;
1320
  /* per-byte UTF-16 unit; 0 means "call convert" */
  unsigned short utf16[256];
1321
  /* per-byte UTF-8: [0] is the byte count (0 means "call convert"),
     the remaining elements hold the bytes */
  char utf8[256][4];
1322
};
1323
1324
0
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1325
1326
/* Returns sizeof(struct unknown_encoding), converted to int.
   NOTE(review): presumably used by callers to size the `mem` buffer handed
   to XmlInitUnknownEncoding() -- confirm against the callers. */
int
1327
0
XmlSizeOfUnknownEncoding(void) {
1328
0
  return sizeof(struct unknown_encoding);
1329
0
}
1330
1331
static int PTRFASTCALL
1332
0
unknown_isName(const ENCODING *enc, const char *p) {
1333
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1334
0
  int c = uenc->convert(uenc->userData, p);
1335
0
  if (c & ~0xFFFF)
1336
0
    return 0;
1337
0
  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1338
0
}
1339
1340
static int PTRFASTCALL
1341
0
unknown_isNmstrt(const ENCODING *enc, const char *p) {
1342
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1343
0
  int c = uenc->convert(uenc->userData, p);
1344
0
  if (c & ~0xFFFF)
1345
0
    return 0;
1346
0
  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1347
0
}
1348
1349
static int PTRFASTCALL
1350
0
unknown_isInvalid(const ENCODING *enc, const char *p) {
1351
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1352
0
  int c = uenc->convert(uenc->userData, p);
1353
0
  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1354
0
}
1355
1356
static enum XML_Convert_Result PTRCALL
1357
unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1358
0
               char **toP, const char *toLim) {
1359
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1360
0
  char buf[XML_UTF8_ENCODE_MAX];
1361
0
  for (;;) {
1362
0
    const char *utf8;
1363
0
    int n;
1364
0
    if (*fromP == fromLim)
1365
0
      return XML_CONVERT_COMPLETED;
1366
0
    utf8 = uenc->utf8[(unsigned char)**fromP];
1367
0
    n = *utf8++;
1368
0
    if (n == 0) {
1369
0
      int c = uenc->convert(uenc->userData, *fromP);
1370
0
      n = XmlUtf8Encode(c, buf);
1371
0
      if (n > toLim - *toP)
1372
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
1373
0
      utf8 = buf;
1374
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1375
0
                 - (BT_LEAD2 - 2));
1376
0
    } else {
1377
0
      if (n > toLim - *toP)
1378
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
1379
0
      (*fromP)++;
1380
0
    }
1381
0
    memcpy(*toP, utf8, n);
1382
0
    *toP += n;
1383
0
  }
1384
0
}
1385
1386
static enum XML_Convert_Result PTRCALL
1387
unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1388
0
                unsigned short **toP, const unsigned short *toLim) {
1389
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1390
0
  while (*fromP < fromLim && *toP < toLim) {
1391
0
    unsigned short c = uenc->utf16[(unsigned char)**fromP];
1392
0
    if (c == 0) {
1393
0
      c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1394
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1395
0
                 - (BT_LEAD2 - 2));
1396
0
    } else
1397
0
      (*fromP)++;
1398
0
    *(*toP)++ = c;
1399
0
  }
1400
1401
0
  if ((*toP == toLim) && (*fromP < fromLim))
1402
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
1403
0
  else
1404
0
    return XML_CONVERT_COMPLETED;
1405
0
}
1406
1407
ENCODING *
1408
XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1409
0
                       void *userData) {
1410
0
  int i;
1411
0
  struct unknown_encoding *e = (struct unknown_encoding *)mem;
1412
0
  memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1413
0
  for (i = 0; i < 128; i++)
1414
0
    if (latin1_encoding.type[i] != BT_OTHER
1415
0
        && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1416
0
      return 0;
1417
0
  for (i = 0; i < 256; i++) {
1418
0
    int c = table[i];
1419
0
    if (c == -1) {
1420
0
      e->normal.type[i] = BT_MALFORM;
1421
      /* This shouldn't really get used. */
1422
0
      e->utf16[i] = 0xFFFF;
1423
0
      e->utf8[i][0] = 1;
1424
0
      e->utf8[i][1] = 0;
1425
0
    } else if (c < 0) {
1426
0
      if (c < -4)
1427
0
        return 0;
1428
      /* Multi-byte sequences need a converter function */
1429
0
      if (! convert)
1430
0
        return 0;
1431
0
      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1432
0
      e->utf8[i][0] = 0;
1433
0
      e->utf16[i] = 0;
1434
0
    } else if (c < 0x80) {
1435
0
      if (latin1_encoding.type[c] != BT_OTHER
1436
0
          && latin1_encoding.type[c] != BT_NONXML && c != i)
1437
0
        return 0;
1438
0
      e->normal.type[i] = latin1_encoding.type[c];
1439
0
      e->utf8[i][0] = 1;
1440
0
      e->utf8[i][1] = (char)c;
1441
0
      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1442
0
    } else if (checkCharRefNumber(c) < 0) {
1443
0
      e->normal.type[i] = BT_NONXML;
1444
      /* This shouldn't really get used. */
1445
0
      e->utf16[i] = 0xFFFF;
1446
0
      e->utf8[i][0] = 1;
1447
0
      e->utf8[i][1] = 0;
1448
0
    } else {
1449
0
      if (c > 0xFFFF)
1450
0
        return 0;
1451
0
      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1452
0
        e->normal.type[i] = BT_NMSTRT;
1453
0
      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1454
0
        e->normal.type[i] = BT_NAME;
1455
0
      else
1456
0
        e->normal.type[i] = BT_OTHER;
1457
0
      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1458
0
      e->utf16[i] = (unsigned short)c;
1459
0
    }
1460
0
  }
1461
0
  e->userData = userData;
1462
0
  e->convert = convert;
1463
0
  if (convert) {
1464
0
    e->normal.isName2 = unknown_isName;
1465
0
    e->normal.isName3 = unknown_isName;
1466
0
    e->normal.isName4 = unknown_isName;
1467
0
    e->normal.isNmstrt2 = unknown_isNmstrt;
1468
0
    e->normal.isNmstrt3 = unknown_isNmstrt;
1469
0
    e->normal.isNmstrt4 = unknown_isNmstrt;
1470
0
    e->normal.isInvalid2 = unknown_isInvalid;
1471
0
    e->normal.isInvalid3 = unknown_isInvalid;
1472
0
    e->normal.isInvalid4 = unknown_isInvalid;
1473
0
  }
1474
0
  e->normal.enc.utf8Convert = unknown_toUtf8;
1475
0
  e->normal.enc.utf16Convert = unknown_toUtf16;
1476
0
  return &(e->normal.enc);
1477
0
}
1478
1479
/* If this enumeration is changed, getEncodingIndex and encodings
1480
must also be changed. */
1481
/* Encoding indices.  getEncodingIndex() returns the position of a name in
   its encodingNames[] array for the values ISO_8859_1_ENC..UTF_16LE_ENC,
   NO_ENC for a NULL name and UNKNOWN_ENC for an unrecognised one. */
enum {
1482
  UNKNOWN_ENC = -1,
1483
  ISO_8859_1_ENC = 0,
1484
  US_ASCII_ENC,
1485
  UTF_8_ENC,
1486
  UTF_16_ENC,
1487
  UTF_16BE_ENC,
1488
  UTF_16LE_ENC,
1489
  /* must match encodingNames up to here */
1490
  NO_ENC
1491
};
1492
1493
/* NUL-terminated, upper-case encoding names spelled out with ASCII_*
   constants: "ISO-8859-1", "US-ASCII", "UTF-8", "UTF-16", "UTF-16BE" and
   "UTF-16LE".  getEncodingIndex() lists them in its encodingNames[] array
   and matches them with streqci(). */
static const char KW_ISO_8859_1[]
1494
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
1495
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
1496
static const char KW_US_ASCII[]
1497
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1498
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
1499
static const char KW_UTF_8[]
1500
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1501
static const char KW_UTF_16[]
1502
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1503
static const char KW_UTF_16BE[]
1504
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1505
       ASCII_6, ASCII_B, ASCII_E, '\0'};
1506
static const char KW_UTF_16LE[]
1507
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1508
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1509
1510
static int FASTCALL
1511
247
getEncodingIndex(const char *name) {
1512
247
  static const char *const encodingNames[] = {
1513
247
      KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1514
247
  };
1515
247
  int i;
1516
247
  if (name == NULL)
1517
178
    return NO_ENC;
1518
207
  for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1519
207
    if (streqci(name, encodingNames[i]))
1520
69
      return i;
1521
0
  return UNKNOWN_ENC;
1522
69
}
1523
1524
/* For binary compatibility, we store the index of the encoding
1525
   specified at initialization in the isUtf16 member.
1526
*/
1527
1528
20
/* Accessors for the encoding index kept in the initEnc.isUtf16 member.
   INIT_ENC_INDEX reads it back as an int; SET_INIT_ENC_INDEX stores i
   narrowed to char.  The argument i is parenthesized so that the cast
   applies to the whole expression, not just its first operand. */
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1530
1531
/* This is what detects the encoding.  encodingTable maps from
1532
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1533
   the external (protocol) specified encoding; state is
1534
   XML_CONTENT_STATE if we're parsing an external text entity, and
1535
   XML_PROLOG_STATE otherwise.
1536
*/
1537
1538
static int
1539
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1540
89
         int state, const char *ptr, const char *end, const char **nextTokPtr) {
1541
89
  const ENCODING **encPtr;
1542
1543
89
  if (ptr >= end)
1544
0
    return XML_TOK_NONE;
1545
89
  encPtr = enc->encPtr;
1546
89
  if (ptr + 1 == end) {
1547
    /* only a single byte available for auto-detection */
1548
#ifndef XML_DTD /* FIXME */
1549
    /* a well-formed document entity must have more than one byte */
1550
    if (state != XML_CONTENT_STATE)
1551
      return XML_TOK_PARTIAL;
1552
#endif
1553
    /* so we're parsing an external text entity... */
1554
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
1555
0
    switch (INIT_ENC_INDEX(enc)) {
1556
0
    case UTF_16_ENC:
1557
0
    case UTF_16LE_ENC:
1558
0
    case UTF_16BE_ENC:
1559
0
      return XML_TOK_PARTIAL;
1560
0
    }
1561
0
    switch ((unsigned char)*ptr) {
1562
0
    case 0xFE:
1563
0
    case 0xFF:
1564
0
    case 0xEF: /* possibly first byte of UTF-8 BOM */
1565
0
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1566
0
        break;
1567
      /* fall through */
1568
0
    case 0x00:
1569
0
    case 0x3C:
1570
0
      return XML_TOK_PARTIAL;
1571
0
    }
1572
89
  } else {
1573
89
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1574
0
    case 0xFEFF:
1575
0
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1576
0
        break;
1577
0
      *nextTokPtr = ptr + 2;
1578
0
      *encPtr = encodingTable[UTF_16BE_ENC];
1579
0
      return XML_TOK_BOM;
1580
    /* 00 3C is handled in the default case */
1581
0
    case 0x3C00:
1582
0
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1583
0
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1584
0
          && state == XML_CONTENT_STATE)
1585
0
        break;
1586
0
      *encPtr = encodingTable[UTF_16LE_ENC];
1587
0
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1588
0
    case 0xFFFE:
1589
0
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1590
0
        break;
1591
0
      *nextTokPtr = ptr + 2;
1592
0
      *encPtr = encodingTable[UTF_16LE_ENC];
1593
0
      return XML_TOK_BOM;
1594
69
    case 0xEFBB:
1595
      /* Maybe a UTF-8 BOM (EF BB BF) */
1596
      /* If there's an explicitly specified (external) encoding
1597
         of ISO-8859-1 or some flavour of UTF-16
1598
         and this is an external text entity,
1599
         don't look for the BOM,
1600
         because it might be a legal data.
1601
      */
1602
69
      if (state == XML_CONTENT_STATE) {
1603
0
        int e = INIT_ENC_INDEX(enc);
1604
0
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1605
0
            || e == UTF_16_ENC)
1606
0
          break;
1607
0
      }
1608
69
      if (ptr + 2 == end)
1609
0
        return XML_TOK_PARTIAL;
1610
69
      if ((unsigned char)ptr[2] == 0xBF) {
1611
69
        *nextTokPtr = ptr + 3;
1612
69
        *encPtr = encodingTable[UTF_8_ENC];
1613
69
        return XML_TOK_BOM;
1614
69
      }
1615
0
      break;
1616
20
    default:
1617
20
      if (ptr[0] == '\0') {
1618
        /* 0 isn't a legal data character. Furthermore a document
1619
           entity can only start with ASCII characters.  So the only
1620
           way this can fail to be big-endian UTF-16 if it it's an
1621
           external parsed general entity that's labelled as
1622
           UTF-16LE.
1623
        */
1624
0
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1625
0
          break;
1626
0
        *encPtr = encodingTable[UTF_16BE_ENC];
1627
0
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1628
20
      } else if (ptr[1] == '\0') {
1629
        /* We could recover here in the case:
1630
            - parsing an external entity
1631
            - second byte is 0
1632
            - no externally specified encoding
1633
            - no encoding declaration
1634
           by assuming UTF-16LE.  But we don't, because this would mean when
1635
           presented just with a single byte, we couldn't reliably determine
1636
           whether we needed further bytes.
1637
        */
1638
0
        if (state == XML_CONTENT_STATE)
1639
0
          break;
1640
0
        *encPtr = encodingTable[UTF_16LE_ENC];
1641
0
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1642
0
      }
1643
20
      break;
1644
89
    }
1645
89
  }
1646
20
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1647
20
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1648
89
}
1649
1650
523
#define NS(x) x
1651
69
#define ns(x) x
1652
#define XML_TOK_NS_C
1653
#include "xmltok_ns.c"
1654
#undef XML_TOK_NS_C
1655
#undef NS
1656
#undef ns
1657
1658
#ifdef XML_NS
1659
1660
60
#  define NS(x) x##NS
1661
20
#  define ns(x) x##_ns
1662
1663
#  define XML_TOK_NS_C
1664
#  include "xmltok_ns.c"
1665
#  undef XML_TOK_NS_C
1666
1667
#  undef NS
1668
#  undef ns
1669
1670
ENCODING *
1671
XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1672
0
                         void *userData) {
1673
0
  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1674
0
  if (enc)
1675
0
    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1676
0
  return enc;
1677
0
}
1678
1679
#endif /* XML_NS */