Coverage Report

Created: 2022-08-24 06:17

/src/CMake/Utilities/cmexpat/lib/xmltok.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
                            __  __            _
3
                         ___\ \/ /_ __   __ _| |_
4
                        / _ \\  /| '_ \ / _` | __|
5
                       |  __//  \| |_) | (_| | |_
6
                        \___/_/\_\ .__/ \__,_|\__|
7
                                 |_| XML parser
8
9
   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10
   Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11
   Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12
   Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
13
   Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14
   Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15
   Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
16
   Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
17
   Copyright (c) 2016      Don Lewis <truckman@apache.org>
18
   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
19
   Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
20
   Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
21
   Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
22
   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
23
   Copyright (c) 2021      Dong-hee Na <donghee.na@python.org>
24
   Licensed under the MIT license:
25
26
   Permission is  hereby granted,  free of charge,  to any  person obtaining
27
   a  copy  of  this  software   and  associated  documentation  files  (the
28
   "Software"),  to  deal in  the  Software  without restriction,  including
29
   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
30
   distribute, sublicense, and/or sell copies of the Software, and to permit
31
   persons  to whom  the Software  is  furnished to  do so,  subject to  the
32
   following conditions:
33
34
   The above copyright  notice and this permission notice  shall be included
35
   in all copies or substantial portions of the Software.
36
37
   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
38
   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
39
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
40
   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
41
   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
42
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
43
   USE OR OTHER DEALINGS IN THE SOFTWARE.
44
*/
45
46
#include <expat_config.h>
47
48
#include <stddef.h>
49
#include <string.h> /* memcpy */
50
#include <stdbool.h>
51
52
#ifdef _WIN32
53
#  include "winconfig.h"
54
#endif
55
56
#include "expat_external.h"
57
#include "internal.h"
58
#include "xmltok.h"
59
#include "nametab.h"
60
61
#ifdef XML_DTD
62
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
63
#else
64
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
65
#endif
66
67
#define VTABLE1                                                                \
68
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
69
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
70
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
71
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
72
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
73
      PREFIX(updatePosition), PREFIX(isPublicId)
74
75
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
76
77
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
78
139k
  (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F)))
79
80
/* A 2 byte UTF-8 representation splits the characters 11 bits between
81
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
82
   pages, 3 bits to add to that index and 5 bits to generate the mask.
83
*/
84
#define UTF8_GET_NAMING2(pages, byte)                                          \
85
8.33M
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
86
8.33M
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
87
8.33M
   & (1u << (((byte)[1]) & 0x1F)))
88
89
/* A 3 byte UTF-8 representation splits the characters 16 bits between
90
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
91
   into pages, 3 bits to add to that index and 5 bits to generate the
92
   mask.
93
*/
94
#define UTF8_GET_NAMING3(pages, byte)                                          \
95
17.7k
  (namingBitmap                                                                \
96
17.7k
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
97
17.7k
         << 3)                                                                 \
98
17.7k
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
99
17.7k
   & (1u << (((byte)[2]) & 0x1F)))
100
101
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
102
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
103
   with the additional restriction of not allowing the Unicode
104
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
105
   Implementation details:
106
     (A & 0x80) == 0     means A < 0x80
107
   and
108
     (A & 0xC0) == 0xC0  means A > 0xBF
109
*/
110
111
#define UTF8_INVALID2(p)                                                       \
112
31.1M
  ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
113
114
#define UTF8_INVALID3(p)                                                       \
115
40.6k
  (((p)[2] & 0x80) == 0                                                        \
116
40.6k
   || ((*p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                          \
117
40.5k
                                      : ((p)[2] & 0xC0) == 0xC0)               \
118
40.6k
   || ((*p) == 0xE0                                                            \
119
40.4k
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
120
40.4k
           : ((p)[1] & 0x80) == 0                                              \
121
31.1k
                 || ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
122
123
#define UTF8_INVALID4(p)                                                       \
124
13.5k
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
125
13.5k
   || ((p)[2] & 0xC0) == 0xC0                                                  \
126
13.5k
   || ((*p) == 0xF0                                                            \
127
13.3k
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
128
13.3k
           : ((p)[1] & 0x80) == 0                                              \
129
10.9k
                 || ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
130
131
/* Predicate that rejects every input: both arguments are ignored and the
   result is always 0.  In this file it is the target of the utf8_isName4
   and utf8_isNmstrt4 aliases (and of sb_isNameMin / sb_isNmstrtMin when
   XML_MIN_SIZE is defined). */
static int PTRFASTCALL
132
177
isNever(const ENCODING *enc, const char *p) {
133
177
  UNUSED_P(enc);
134
177
  UNUSED_P(p);
135
177
  return 0;
136
177
}
137
138
/* Name-character test for a 2-byte UTF-8 sequence.
   Reads p[0] and p[1] as unsigned bytes and looks the character up through
   UTF8_GET_NAMING2 using the namePages table.  Returns the selected
   namingBitmap bit (nonzero when set, not normalised to 1) or 0.
   enc is unused. */
static int PTRFASTCALL
139
8.27M
utf8_isName2(const ENCODING *enc, const char *p) {
140
8.27M
  UNUSED_P(enc);
141
8.27M
  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
142
8.27M
}
143
144
/* Name-character test for a 3-byte UTF-8 sequence.
   Reads p[0]..p[2] as unsigned bytes and looks the character up through
   UTF8_GET_NAMING3 using the namePages table.  Returns the selected
   namingBitmap bit (nonzero when set, not normalised to 1) or 0.
   enc is unused. */
static int PTRFASTCALL
145
7.34k
utf8_isName3(const ENCODING *enc, const char *p) {
146
7.34k
  UNUSED_P(enc);
147
7.34k
  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
148
7.34k
}
149
150
#define utf8_isName4 isNever
151
152
/* Name-start-character test for a 2-byte UTF-8 sequence.
   Same lookup as the 2-byte name test but against the nmstrtPages table:
   reads p[0] and p[1] as unsigned bytes via UTF8_GET_NAMING2 and returns
   the selected namingBitmap bit (nonzero when set) or 0.  enc is unused. */
static int PTRFASTCALL
153
57.8k
utf8_isNmstrt2(const ENCODING *enc, const char *p) {
154
57.8k
  UNUSED_P(enc);
155
57.8k
  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
156
57.8k
}
157
158
/* Name-start-character test for a 3-byte UTF-8 sequence.
   Same lookup as the 3-byte name test but against the nmstrtPages table:
   reads p[0]..p[2] as unsigned bytes via UTF8_GET_NAMING3 and returns the
   selected namingBitmap bit (nonzero when set) or 0.  enc is unused. */
static int PTRFASTCALL
159
10.4k
utf8_isNmstrt3(const ENCODING *enc, const char *p) {
160
10.4k
  UNUSED_P(enc);
161
10.4k
  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
162
10.4k
}
163
164
#define utf8_isNmstrt4 isNever
165
166
/* Validity check for a 2-byte UTF-8 sequence starting at p.
   Returns nonzero (invalid) when the first byte is below 0xC2 or the
   second byte is outside 0x80..0xBF, as spelled out by UTF8_INVALID2;
   returns 0 otherwise.  Reads p[0] and p[1] as unsigned bytes.
   enc is unused. */
static int PTRFASTCALL
167
31.1M
utf8_isInvalid2(const ENCODING *enc, const char *p) {
168
31.1M
  UNUSED_P(enc);
169
31.1M
  return UTF8_INVALID2((const unsigned char *)p);
170
31.1M
}
171
172
/* Validity check for a 3-byte UTF-8 sequence starting at p.
   Per UTF8_INVALID3 the sequence is reported invalid (nonzero) when:
     - the third byte is outside 0x80..0xBF, or, for the prefix EF BF,
       above 0xBD (rejecting EF BF BE and EF BF BF);
     - the first byte is 0xE0 and the second is outside 0xA0..0xBF;
     - the first byte is 0xED and the second is outside 0x80..0x9F;
     - otherwise, the second byte is outside 0x80..0xBF.
   Returns 0 when none of these apply.  Reads p[0]..p[2] as unsigned
   bytes.  enc is unused. */
static int PTRFASTCALL
173
40.6k
utf8_isInvalid3(const ENCODING *enc, const char *p) {
174
40.6k
  UNUSED_P(enc);
175
40.6k
  return UTF8_INVALID3((const unsigned char *)p);
176
40.6k
}
177
178
/* Validity check for a 4-byte UTF-8 sequence starting at p.
   Per UTF8_INVALID4 the sequence is reported invalid (nonzero) when:
     - the third or fourth byte is outside 0x80..0xBF;
     - the first byte is 0xF0 and the second is outside 0x90..0xBF;
     - the first byte is 0xF4 and the second is outside 0x80..0x8F;
     - otherwise, the second byte is outside 0x80..0xBF.
   Returns 0 when none of these apply.  Reads p[0]..p[3] as unsigned
   bytes.  enc is unused. */
static int PTRFASTCALL
179
13.5k
utf8_isInvalid4(const ENCODING *enc, const char *p) {
180
13.5k
  UNUSED_P(enc);
181
13.5k
  return UTF8_INVALID4((const unsigned char *)p);
182
13.5k
}
183
184
/* Encoding descriptor used by the byte-oriented tokenizers in this file.
   The generic ENCODING is the first member; AS_NORMAL_ENCODING and
   SB_BYTE_TYPE cast an ENCODING pointer to this struct to reach the
   remaining fields. */
struct normal_encoding {
185
  ENCODING enc;
186
  /* Byte-type table, indexed by a byte converted to unsigned char. */
  unsigned char type[256];
187
#ifdef XML_MIN_SIZE
188
  /* Extra per-encoding hooks, present only in XML_MIN_SIZE builds; they
     are filled in by STANDARD_VTABLE and called through the BYTE_TYPE,
     IS_NAME_CHAR_MINBPC, IS_NMSTRT_CHAR_MINBPC, BYTE_TO_ASCII and
     CHAR_MATCHES macros. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
189
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
190
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
191
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
192
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
193
#endif /* XML_MIN_SIZE */
194
  /* Predicates for 2-, 3- and 4-byte sequences: name character,
     name-start character and invalid-sequence tests.  Filled in by
     NORMAL_VTABLE, or set to NULL by NULL_VTABLE. */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
195
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
196
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
197
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
198
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
199
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
200
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
201
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
202
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
203
};
204
205
39.5M
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
206
207
#ifdef XML_MIN_SIZE
208
209
#  define STANDARD_VTABLE(E)                                                   \
210
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
211
212
#else
213
214
#  define STANDARD_VTABLE(E) /* as nothing */
215
216
#endif
217
218
#define NORMAL_VTABLE(E)                                                       \
219
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
220
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
221
222
#define NULL_VTABLE                                                            \
223
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
224
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
225
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
226
227
static int FASTCALL checkCharRefNumber(int);
228
229
#include "xmltok_impl.h"
230
#include "ascii.h"
231
232
#ifdef XML_MIN_SIZE
233
#  define sb_isNameMin isNever
234
#  define sb_isNmstrtMin isNever
235
#endif
236
237
#ifdef XML_MIN_SIZE
238
#  define MINBPC(enc) ((enc)->minBytesPerChar)
239
#else
240
/* minimum bytes per character */
241
19.7G
#  define MINBPC(enc) 1
242
#endif
243
244
#define SB_BYTE_TYPE(enc, p)                                                   \
245
13.7G
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
246
247
#ifdef XML_MIN_SIZE
248
static int PTRFASTCALL
249
sb_byteType(const ENCODING *enc, const char *p) {
250
  return SB_BYTE_TYPE(enc, p);
251
}
252
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
253
#else
254
13.7G
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
255
#endif
256
257
#ifdef XML_MIN_SIZE
258
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
259
static int PTRFASTCALL
260
sb_byteToAscii(const ENCODING *enc, const char *p) {
261
  UNUSED_P(enc);
262
  return *p;
263
}
264
#else
265
809k
#  define BYTE_TO_ASCII(enc, p) (*(p))
266
#endif
267
268
8.28M
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
269
68.3k
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
270
#ifdef XML_MIN_SIZE
271
#  define IS_INVALID_CHAR(enc, p, n)                                           \
272
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
273
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
274
#else
275
#  define IS_INVALID_CHAR(enc, p, n)                                           \
276
39.5M
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
277
#endif
278
279
#ifdef XML_MIN_SIZE
280
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
281
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
282
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
283
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
284
#else
285
0
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
286
0
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
287
#endif
288
289
#ifdef XML_MIN_SIZE
290
#  define CHAR_MATCHES(enc, p, c)                                              \
291
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
292
static int PTRCALL
293
sb_charMatches(const ENCODING *enc, const char *p, int c) {
294
  UNUSED_P(enc);
295
  return *p == c;
296
}
297
#else
298
/* c is an ASCII character */
299
2.36M
#  define CHAR_MATCHES(enc, p, c) (*(p) == c)
300
#endif
301
302
16.6M
#define PREFIX(ident) normal_##ident
303
#define XML_TOK_IMPL_C
304
#include "xmltok_impl.c"
305
#undef XML_TOK_IMPL_C
306
307
#undef MINBPC
308
#undef BYTE_TYPE
309
#undef BYTE_TO_ASCII
310
#undef CHAR_MATCHES
311
#undef IS_NAME_CHAR
312
#undef IS_NAME_CHAR_MINBPC
313
#undef IS_NMSTRT_CHAR
314
#undef IS_NMSTRT_CHAR_MINBPC
315
#undef IS_INVALID_CHAR
316
317
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
318
       UTF8_cval1 = 0x00,
319
       UTF8_cval2 = 0xc0,
320
       UTF8_cval3 = 0xe0,
321
       UTF8_cval4 = 0xf0
322
};
323
324
/* Moves *fromLimRef backwards so that the range [from, *fromLimRef) does
   not end in the middle of a multi-byte UTF-8 character.

   from        start of the buffer (never moved, never dereferenced below
               this address)
   fromLimRef  in: one past the last available byte;
               out: one past the last byte of the last complete character
               found (equal to from if the scan reaches the buffer start)

   The scan walks backwards one byte at a time.  `walked` counts the bytes
   already stepped over behind the current position.  When a lead byte of
   an N-byte character is found and at least N-1 bytes follow it, the
   limit is placed right after that character; otherwise the incomplete
   sequence is dropped and the scan continues.  Bytes matching none of the
   four tests (e.g. continuation bytes 0b10xxxxxx) are simply stepped
   over. */
void
325
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
326
102M
                                           const char **fromLimRef) {
327
102M
  const char *fromLim = *fromLimRef;
328
102M
  size_t walked = 0;
329
103M
  for (; fromLim > from; fromLim--, walked++) {
330
103M
    const unsigned char prev = (unsigned char)fromLim[-1];
331
103M
    if ((prev & 0xf8u)
332
103M
        == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */
333
1.01k
      if (walked + 1 >= 4) {
334
791
        /* fromLim is one past the lead byte here; step over the three
           trailing bytes so the whole character is kept. */
        fromLim += 4 - 1;
335
791
        break;
336
791
      } else {
337
224
        /* NOTE(review): the loop increment still runs after this reset, so
           the dropped lead byte itself counts as one walked byte for the
           next lead byte encountered -- presumably harmless on input that
           was already validated; confirm. */
        walked = 0;
338
224
      }
339
103M
    } else if ((prev & 0xf0u)
340
103M
               == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */
341
8.37k
      if (walked + 1 >= 3) {
342
6.83k
        fromLim += 3 - 1;
343
6.83k
        break;
344
6.83k
      } else {
345
1.54k
        walked = 0;
346
1.54k
      }
347
103M
    } else if ((prev & 0xe0u)
348
103M
               == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */
349
626k
      if (walked + 1 >= 2) {
350
614k
        fromLim += 2 - 1;
351
614k
        break;
352
614k
      } else {
353
11.5k
        walked = 0;
354
11.5k
      }
355
102M
    } else if ((prev & 0x80u)
356
102M
               == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
357
102M
      break;
358
102M
    }
359
103M
  }
360
102M
  *fromLimRef = fromLim;
361
102M
}
362
363
/* Copies UTF-8 input to a UTF-8 output buffer without splitting a
   multi-byte character.

   enc      unused
   fromP    in/out: current read position; advanced by the bytes copied
   fromLim  one past the last input byte
   toP      in/out: current write position; advanced by the bytes copied
   toLim    one past the last writable output byte

   Returns XML_CONVERT_OUTPUT_EXHAUSTED when the input did not fit into the
   output (checked first), XML_CONVERT_INPUT_INCOMPLETE when the copied
   range had to be shortened to end on a character boundary, and
   XML_CONVERT_COMPLETED otherwise. */
static enum XML_Convert_Result PTRCALL
364
utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
365
102M
            char **toP, const char *toLim) {
366
102M
  bool input_incomplete = false;
367
102M
  bool output_exhausted = false;
368
369
  /* Avoid copying partial characters (due to limited space). */
370
102M
  const ptrdiff_t bytesAvailable = fromLim - *fromP;
371
102M
  const ptrdiff_t bytesStorable = toLim - *toP;
372
102M
  UNUSED_P(enc);
373
102M
  if (bytesAvailable > bytesStorable) {
374
279k
    /* Clamp the input range to what the output can hold; the clamped end
       may fall inside a character and is fixed up by the trim below. */
    fromLim = *fromP + bytesStorable;
375
279k
    output_exhausted = true;
376
279k
  }
377
378
  /* Avoid copying partial characters (from incomplete input). */
379
102M
  {
380
102M
    const char *const fromLimBefore = fromLim;
381
102M
    _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
382
102M
    if (fromLim < fromLimBefore) {
383
13.3k
      input_incomplete = true;
384
13.3k
    }
385
102M
  }
386
387
102M
  {
388
102M
    /* Copy the (possibly empty) trimmed range and advance both cursors. */
    const ptrdiff_t bytesToCopy = fromLim - *fromP;
389
102M
    memcpy(*toP, *fromP, bytesToCopy);
390
102M
    *fromP += bytesToCopy;
391
102M
    *toP += bytesToCopy;
392
102M
  }
393
394
102M
  if (output_exhausted) /* needs to go first */
395
279k
    return XML_CONVERT_OUTPUT_EXHAUSTED;
396
102M
  else if (input_incomplete)
397
0
    return XML_CONVERT_INPUT_INCOMPLETE;
398
102M
  else
399
102M
    return XML_CONVERT_COMPLETED;
400
102M
}
401
402
/* Converts UTF-8 input to 16-bit code units.

   enc      encoding whose byte-type table classifies each lead byte
            (accessed as struct normal_encoding)
   fromP    in/out: read position; set to the first unconsumed byte
   fromLim  one past the last input byte
   toP      in/out: write position; set to one past the last unit written
   toLim    one past the last writable output unit

   2- and 3-byte sequences produce one unit; 4-byte sequences produce a
   surrogate pair (0xD800-based unit followed by 0xDC00-based unit); any
   other byte is copied as a single unit.

   Returns XML_CONVERT_INPUT_INCOMPLETE when a multi-byte sequence is cut
   off by fromLim, XML_CONVERT_OUTPUT_EXHAUSTED when the output fills up
   before the input is consumed (or a surrogate pair does not fit), and
   XML_CONVERT_COMPLETED otherwise. */
static enum XML_Convert_Result PTRCALL
403
utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
404
0
             unsigned short **toP, const unsigned short *toLim) {
405
0
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
406
0
  unsigned short *to = *toP;
407
0
  const char *from = *fromP;
408
0
  while (from < fromLim && to < toLim) {
409
0
    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
410
0
    case BT_LEAD2:
411
0
      if (fromLim - from < 2) {
412
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
413
0
        goto after;
414
0
      }
415
0
      /* 5 payload bits from the lead byte, 6 from the trailing byte. */
      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
416
0
      from += 2;
417
0
      break;
418
0
    case BT_LEAD3:
419
0
      if (fromLim - from < 3) {
420
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
421
0
        goto after;
422
0
      }
423
0
      /* 4 + 6 + 6 payload bits. */
      *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
424
0
                               | (from[2] & 0x3f));
425
0
      from += 3;
426
0
      break;
427
0
    case BT_LEAD4: {
428
0
      unsigned long n;
429
0
      /* Output room is checked before input completeness, so a pair that
         does not fit is reported as OUTPUT_EXHAUSTED. */
      if (toLim - to < 2) {
430
0
        res = XML_CONVERT_OUTPUT_EXHAUSTED;
431
0
        goto after;
432
0
      }
433
0
      if (fromLim - from < 4) {
434
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
435
0
        goto after;
436
0
      }
437
0
      /* 3 + 6 + 6 + 6 payload bits, then split into a surrogate pair. */
      n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
438
0
          | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
439
0
      n -= 0x10000;
440
0
      to[0] = (unsigned short)((n >> 10) | 0xD800);
441
0
      to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
442
0
      to += 2;
443
0
      from += 4;
444
0
    } break;
445
0
    default:
446
0
      *to++ = *from++;
447
0
      break;
448
0
    }
449
0
  }
450
0
  /* Reached only when the loop ended normally: leftover input here means
     the output buffer filled up. */
  if (from < fromLim)
451
0
    res = XML_CONVERT_OUTPUT_EXHAUSTED;
452
0
after:
453
0
  *fromP = from;
454
0
  *toP = to;
455
0
  return res;
456
0
}
457
458
#ifdef XML_NS
459
static const struct normal_encoding utf8_encoding_ns
460
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
461
       {
462
#  include "asciitab.h"
463
#  include "utf8tab.h"
464
       },
465
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
466
#endif
467
468
static const struct normal_encoding utf8_encoding
469
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
470
       {
471
#define BT_COLON BT_NMSTRT
472
#include "asciitab.h"
473
#undef BT_COLON
474
#include "utf8tab.h"
475
       },
476
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
477
478
#ifdef XML_NS
479
480
static const struct normal_encoding internal_utf8_encoding_ns
481
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
482
       {
483
#  include "iasciitab.h"
484
#  include "utf8tab.h"
485
       },
486
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
487
488
#endif
489
490
static const struct normal_encoding internal_utf8_encoding
491
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
492
       {
493
#define BT_COLON BT_NMSTRT
494
#include "iasciitab.h"
495
#undef BT_COLON
496
#include "utf8tab.h"
497
       },
498
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
499
500
/* Converts single-byte input to UTF-8, one input byte at a time.
   Bytes with the high bit set are written as a 2-byte sequence (lead byte
   built from UTF8_cval2, then a 0x80-based trailing byte); other bytes are
   copied unchanged.

   enc      unused
   fromP    in/out: read position, advanced past each converted byte
   fromLim  end of input; the loop stops when *fromP equals it
   toP      in/out: write position, advanced past each byte written
   toLim    one past the last writable output byte

   Returns XML_CONVERT_COMPLETED once *fromP reaches fromLim, or
   XML_CONVERT_OUTPUT_EXHAUSTED when the next character does not fit (the
   input byte is then left unconsumed). */
static enum XML_Convert_Result PTRCALL
501
latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
502
2.17M
              char **toP, const char *toLim) {
503
2.17M
  UNUSED_P(enc);
504
99.2M
  for (;;) {
505
99.2M
    unsigned char c;
506
99.2M
    if (*fromP == fromLim)
507
2.15M
      return XML_CONVERT_COMPLETED;
508
97.0M
    c = (unsigned char)**fromP;
509
97.0M
    if (c & 0x80) {
510
87.7M
      /* Needs two output bytes. */
      if (toLim - *toP < 2)
511
19.4k
        return XML_CONVERT_OUTPUT_EXHAUSTED;
512
87.7M
      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
513
87.7M
      *(*toP)++ = (char)((c & 0x3f) | 0x80);
514
87.7M
      (*fromP)++;
515
87.7M
    } else {
516
9.28M
      if (*toP == toLim)
517
2.58k
        return XML_CONVERT_OUTPUT_EXHAUSTED;
518
9.27M
      *(*toP)++ = *(*fromP)++;
519
9.27M
    }
520
97.0M
  }
521
2.17M
}
522
523
/* Converts single-byte input to 16-bit units by widening each byte
   (converted to unsigned char first, so values 0x80..0xFF stay positive).

   enc      unused
   fromP    in/out: read position, advanced past each converted byte
   fromLim  one past the last input byte
   toP      in/out: write position, advanced past each unit written
   toLim    one past the last writable output unit

   Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while input
   remains, XML_CONVERT_COMPLETED otherwise. */
static enum XML_Convert_Result PTRCALL
524
latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
525
0
               unsigned short **toP, const unsigned short *toLim) {
526
0
  UNUSED_P(enc);
527
0
  while (*fromP < fromLim && *toP < toLim)
528
0
    *(*toP)++ = (unsigned char)*(*fromP)++;
529
530
0
  if ((*toP == toLim) && (*fromP < fromLim))
531
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
532
0
  else
533
0
    return XML_CONVERT_COMPLETED;
534
0
}
535
536
#ifdef XML_NS
537
538
static const struct normal_encoding latin1_encoding_ns
539
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
540
       {
541
#  include "asciitab.h"
542
#  include "latin1tab.h"
543
       },
544
       STANDARD_VTABLE(sb_) NULL_VTABLE};
545
546
#endif
547
548
static const struct normal_encoding latin1_encoding
549
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
550
       {
551
#define BT_COLON BT_NMSTRT
552
#include "asciitab.h"
553
#undef BT_COLON
554
#include "latin1tab.h"
555
       },
556
       STANDARD_VTABLE(sb_) NULL_VTABLE};
557
558
/* Copies input bytes to the output unchanged, one at a time, until either
   the input or the output runs out.

   enc      unused
   fromP    in/out: read position, advanced past each copied byte
   fromLim  one past the last input byte
   toP      in/out: write position, advanced past each copied byte
   toLim    one past the last writable output byte

   Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while input
   remains, XML_CONVERT_COMPLETED otherwise. */
static enum XML_Convert_Result PTRCALL
559
ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
560
5.32k
             char **toP, const char *toLim) {
561
5.32k
  UNUSED_P(enc);
562
82.6k
  while (*fromP < fromLim && *toP < toLim)
563
77.3k
    *(*toP)++ = *(*fromP)++;
564
565
5.32k
  if ((*toP == toLim) && (*fromP < fromLim))
566
278
    return XML_CONVERT_OUTPUT_EXHAUSTED;
567
5.04k
  else
568
5.04k
    return XML_CONVERT_COMPLETED;
569
5.32k
}
570
571
#ifdef XML_NS
572
573
static const struct normal_encoding ascii_encoding_ns
574
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
575
       {
576
#  include "asciitab.h"
577
           /* BT_NONXML == 0 */
578
       },
579
       STANDARD_VTABLE(sb_) NULL_VTABLE};
580
581
#endif
582
583
static const struct normal_encoding ascii_encoding
584
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
585
       {
586
#define BT_COLON BT_NMSTRT
587
#include "asciitab.h"
588
#undef BT_COLON
589
           /* BT_NONXML == 0 */
590
       },
591
       STANDARD_VTABLE(sb_) NULL_VTABLE};
592
593
/* Classifies a 16-bit code unit given as its high and low bytes.

   hi, lo  the two bytes of the unit (compared as unsigned char)

   Returns BT_LEAD4 for hi in 0xD8..0xDB, BT_TRAIL for hi in 0xDC..0xDF,
   BT_NONXML for the units 0xFFFE and 0xFFFF, and BT_NONASCII for
   everything else.  In this file it is called from LITTLE2_BYTE_TYPE only
   when the high byte is nonzero. */
static int PTRFASTCALL
594
277k
unicode_byte_type(char hi, char lo) {
595
277k
  switch ((unsigned char)hi) {
596
  /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
597
36
  case 0xD8:
598
66
  case 0xD9:
599
102
  case 0xDA:
600
124
  case 0xDB:
601
124
    return BT_LEAD4;
602
  /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
603
4
  case 0xDC:
604
8
  case 0xDD:
605
14
  case 0xDE:
606
20
  case 0xDF:
607
20
    return BT_TRAIL;
608
70
  case 0xFF:
609
70
    switch ((unsigned char)lo) {
610
8
    case 0xFF: /* noncharacter-FFFF */
611
8
    case 0xFE: /* noncharacter-FFFE */
612
8
      return BT_NONXML;
613
70
    }
614
62
    /* Any other 0xFFxx unit falls out to the BT_NONASCII return below. */
    break;
615
277k
  }
616
277k
  return BT_NONASCII;
617
277k
}
618
619
/* Generates `E##toUtf8`, a converter from 2-byte code units to UTF-8.
   GET_LO and GET_HI must be defined at the point of expansion; they pick
   the low and high byte of a unit and thereby fix the byte order.

   The generated function first shrinks fromLim so the input length is
   even, then handles one unit per iteration, keyed on the high byte:
     - hi == 0 and lo < 0x80: one output byte;
     - hi in 0x00..0x07 otherwise: two output bytes;
     - hi in 0xD8..0xDB: four output bytes built from this unit and the
       following one (needs 4 input bytes, else INPUT_INCOMPLETE);
     - anything else, including hi in 0xDC..0xDF: three output bytes.
   Whenever the output lacks room for the next character, *fromP is set to
   the current unit and XML_CONVERT_OUTPUT_EXHAUSTED is returned; *toP is
   advanced in place as bytes are written.

   NOTE(review): the loop only exits normally once from >= fromLim, so the
   final `from < fromLim` INPUT_INCOMPLETE branch looks unreachable, and an
   odd trailing input byte is left unconsumed with XML_CONVERT_COMPLETED
   returned -- confirm against callers. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
620
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
621
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
622
0
      char **toP, const char *toLim) {                                         \
623
0
    const char *from = *fromP;                                                 \
624
0
    UNUSED_P(enc);                                                             \
625
0
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
626
0
    for (; from < fromLim; from += 2) {                                        \
627
0
      int plane;                                                               \
628
0
      unsigned char lo2;                                                       \
629
0
      unsigned char lo = GET_LO(from);                                         \
630
0
      unsigned char hi = GET_HI(from);                                         \
631
0
      switch (hi) {                                                            \
632
0
      case 0:                                                                  \
633
0
        if (lo < 0x80) {                                                       \
634
0
          if (*toP == toLim) {                                                 \
635
0
            *fromP = from;                                                     \
636
0
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
637
0
          }                                                                    \
638
0
          *(*toP)++ = lo;                                                      \
639
0
          break;                                                               \
640
0
        }                                                                      \
641
0
        /* fall through */                                                     \
642
0
      case 0x1:                                                                \
643
0
      case 0x2:                                                                \
644
0
      case 0x3:                                                                \
645
0
      case 0x4:                                                                \
646
0
      case 0x5:                                                                \
647
0
      case 0x6:                                                                \
648
0
      case 0x7:                                                                \
649
0
        if (toLim - *toP < 2) {                                                \
650
0
          *fromP = from;                                                       \
651
0
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
652
0
        }                                                                      \
653
0
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
654
0
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
655
0
        break;                                                                 \
656
0
      default:                                                                 \
657
0
        if (toLim - *toP < 3) {                                                \
658
0
          *fromP = from;                                                       \
659
0
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
660
0
        }                                                                      \
661
0
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
662
0
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
663
0
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
664
0
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
665
0
        break;                                                                 \
666
0
      case 0xD8:                                                               \
667
0
      case 0xD9:                                                               \
668
0
      case 0xDA:                                                               \
669
0
      case 0xDB:                                                               \
670
0
        if (toLim - *toP < 4) {                                                \
671
0
          *fromP = from;                                                       \
672
0
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
673
0
        }                                                                      \
674
0
        if (fromLim - from < 4) {                                              \
675
0
          *fromP = from;                                                       \
676
0
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
677
0
        }                                                                      \
678
0
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
679
0
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
680
0
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
681
0
        from += 2;                                                             \
682
0
        lo2 = GET_LO(from);                                                    \
683
0
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
684
0
                     | (lo2 >> 6) | 0x80);                                     \
685
0
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
686
0
        break;                                                                 \
687
0
      }                                                                        \
688
0
    }                                                                          \
689
0
    *fromP = from;                                                             \
690
0
    if (from < fromLim)                                                        \
691
0
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
692
0
    else                                                                       \
693
0
      return XML_CONVERT_COMPLETED;                                            \
694
0
  }
Unexecuted instantiation: xmltok.c:little2_toUtf8
Unexecuted instantiation: xmltok.c:big2_toUtf8
695
696
/* Generates `E##toUtf16`, which copies 2-byte input units into an array
   of unsigned short, assembling each unit as (GET_HI << 8) | GET_LO.
   GET_LO and GET_HI must be defined at the point of expansion.

   The generated function shrinks fromLim so the input length is even.
   If the input is longer than the output can hold and the last input
   unit has a high byte in 0xD8..0xDF (mask 0xF8), that unit is dropped
   from the range and the provisional result becomes
   XML_CONVERT_INPUT_INCOMPLETE.  It then copies units while both input
   and output remain, advancing *fromP and *toP in place.

   Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while input
   remains; otherwise the provisional result (COMPLETED or
   INPUT_INCOMPLETE). */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
697
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
698
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
699
0
      unsigned short **toP, const unsigned short *toLim) {                     \
700
0
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
701
0
    UNUSED_P(enc);                                                             \
702
0
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
703
0
    /* Avoid copying first half only of surrogate */                           \
704
0
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
705
0
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
706
0
      fromLim -= 2;                                                            \
707
0
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
708
0
    }                                                                          \
709
0
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
710
0
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
711
0
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
712
0
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
713
0
    else                                                                       \
714
0
      return res;                                                              \
715
0
  }
Unexecuted instantiation: xmltok.c:little2_toUtf16
Unexecuted instantiation: xmltok.c:big2_toUtf16
716
717
#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
718
0
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
719
0
#define GET_HI(ptr) ((unsigned char)(ptr)[1])
720
721
DEFINE_UTF16_TO_UTF8(little2_)
722
DEFINE_UTF16_TO_UTF16(little2_)
723
724
#undef SET2
725
#undef GET_LO
726
#undef GET_HI
727
728
#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
729
0
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
730
0
#define GET_HI(ptr) ((unsigned char)(ptr)[0])
731
732
DEFINE_UTF16_TO_UTF8(big2_)
733
DEFINE_UTF16_TO_UTF16(big2_)
734
735
#undef SET2
736
#undef GET_LO
737
#undef GET_HI
738
739
/* Accessors for a little-endian 16-bit code unit at p:
   p[0] is the low byte and p[1] is the high byte.
   Every macro argument is parenthesized so that expression arguments
   (e.g. `ptr + 2`, `a & b`) bind as the caller intended. */

/* Byte type of the unit at p: when the high byte is zero the type is read
   from the encoding's type table indexed by the low byte; otherwise it is
   computed by unicode_byte_type(high, low). */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]  \
               : unicode_byte_type((p)[1], (p)[0]))
/* Low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* Non-zero when the high byte is zero and the low byte equals c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Name-character test via the namePages bitmap (high byte, low byte). */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
/* Name-start-character test via the nmstrtPages bitmap. */
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
748
749
#ifdef XML_MIN_SIZE
750
751
static int PTRFASTCALL
752
little2_byteType(const ENCODING *enc, const char *p) {
753
  return LITTLE2_BYTE_TYPE(enc, p);
754
}
755
756
static int PTRFASTCALL
757
little2_byteToAscii(const ENCODING *enc, const char *p) {
758
  UNUSED_P(enc);
759
  return LITTLE2_BYTE_TO_ASCII(p);
760
}
761
762
static int PTRCALL
763
little2_charMatches(const ENCODING *enc, const char *p, int c) {
764
  UNUSED_P(enc);
765
  return LITTLE2_CHAR_MATCHES(p, c);
766
}
767
768
static int PTRFASTCALL
769
little2_isNameMin(const ENCODING *enc, const char *p) {
770
  UNUSED_P(enc);
771
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
772
}
773
774
static int PTRFASTCALL
775
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
776
  UNUSED_P(enc);
777
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
778
}
779
780
#  undef VTABLE
781
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
782
783
#else /* not XML_MIN_SIZE */
784
785
#  undef PREFIX
786
0
#  define PREFIX(ident) little2_##ident
787
547k
#  define MINBPC(enc) 2
788
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
789
273k
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
790
0
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
791
0
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
792
25
#  define IS_NAME_CHAR(enc, p, n) 0
793
136k
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
794
25
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
795
133
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
796
797
#  define XML_TOK_IMPL_C
798
#  include "xmltok_impl.c"
799
#  undef XML_TOK_IMPL_C
800
801
#  undef MINBPC
802
#  undef BYTE_TYPE
803
#  undef BYTE_TO_ASCII
804
#  undef CHAR_MATCHES
805
#  undef IS_NAME_CHAR
806
#  undef IS_NAME_CHAR_MINBPC
807
#  undef IS_NMSTRT_CHAR
808
#  undef IS_NMSTRT_CHAR_MINBPC
809
#  undef IS_INVALID_CHAR
810
811
#endif /* not XML_MIN_SIZE */
812
813
#ifdef XML_NS
814
815
static const struct normal_encoding little2_encoding_ns
816
    = {{VTABLE, 2, 0,
817
#  if BYTEORDER == 1234
818
        1
819
#  else
820
        0
821
#  endif
822
       },
823
       {
824
#  include "asciitab.h"
825
#  include "latin1tab.h"
826
       },
827
       STANDARD_VTABLE(little2_) NULL_VTABLE};
828
829
#endif
830
831
static const struct normal_encoding little2_encoding
832
    = {{VTABLE, 2, 0,
833
#if BYTEORDER == 1234
834
        1
835
#else
836
        0
837
#endif
838
       },
839
       {
840
#define BT_COLON BT_NMSTRT
841
#include "asciitab.h"
842
#undef BT_COLON
843
#include "latin1tab.h"
844
       },
845
       STANDARD_VTABLE(little2_) NULL_VTABLE};
846
847
#if BYTEORDER != 4321
848
849
#  ifdef XML_NS
850
851
static const struct normal_encoding internal_little2_encoding_ns
852
    = {{VTABLE, 2, 0, 1},
853
       {
854
#    include "iasciitab.h"
855
#    include "latin1tab.h"
856
       },
857
       STANDARD_VTABLE(little2_) NULL_VTABLE};
858
859
#  endif
860
861
static const struct normal_encoding internal_little2_encoding
862
    = {{VTABLE, 2, 0, 1},
863
       {
864
#  define BT_COLON BT_NMSTRT
865
#  include "iasciitab.h"
866
#  undef BT_COLON
867
#  include "latin1tab.h"
868
       },
869
       STANDARD_VTABLE(little2_) NULL_VTABLE};
870
871
#endif
872
873
/* Accessors for a big-endian 16-bit code unit at p:
   p[0] is the high byte and p[1] is the low byte.
   Every macro argument is parenthesized so that expression arguments
   (e.g. `ptr + 2`, `a & b`) bind as the caller intended. */

/* Byte type of the unit at p: when the high byte is zero the type is read
   from the encoding's type table indexed by the low byte; otherwise it is
   computed by unicode_byte_type(high, low). */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0                                                                 \
       ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]]        \
       : unicode_byte_type((p)[0], (p)[1]))
/* Low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the high byte is zero and the low byte equals c. */
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Name-character test via the namePages bitmap (high byte, low byte). */
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
/* Name-start-character test via the nmstrtPages bitmap. */
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
883
884
#ifdef XML_MIN_SIZE
885
886
static int PTRFASTCALL
887
big2_byteType(const ENCODING *enc, const char *p) {
888
  return BIG2_BYTE_TYPE(enc, p);
889
}
890
891
static int PTRFASTCALL
892
big2_byteToAscii(const ENCODING *enc, const char *p) {
893
  UNUSED_P(enc);
894
  return BIG2_BYTE_TO_ASCII(p);
895
}
896
897
static int PTRCALL
898
big2_charMatches(const ENCODING *enc, const char *p, int c) {
899
  UNUSED_P(enc);
900
  return BIG2_CHAR_MATCHES(p, c);
901
}
902
903
static int PTRFASTCALL
904
big2_isNameMin(const ENCODING *enc, const char *p) {
905
  UNUSED_P(enc);
906
  return BIG2_IS_NAME_CHAR_MINBPC(p);
907
}
908
909
static int PTRFASTCALL
910
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
911
  UNUSED_P(enc);
912
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
913
}
914
915
#  undef VTABLE
916
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
917
918
#else /* not XML_MIN_SIZE */
919
920
#  undef PREFIX
921
0
#  define PREFIX(ident) big2_##ident
922
8.47k
#  define MINBPC(enc) 2
923
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
924
4.07k
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
925
0
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
926
0
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
927
21
#  define IS_NAME_CHAR(enc, p, n) 0
928
2.10k
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
929
21
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
930
99
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
931
932
#  define XML_TOK_IMPL_C
933
#  include "xmltok_impl.c"
934
#  undef XML_TOK_IMPL_C
935
936
#  undef MINBPC
937
#  undef BYTE_TYPE
938
#  undef BYTE_TO_ASCII
939
#  undef CHAR_MATCHES
940
#  undef IS_NAME_CHAR
941
#  undef IS_NAME_CHAR_MINBPC
942
#  undef IS_NMSTRT_CHAR
943
#  undef IS_NMSTRT_CHAR_MINBPC
944
#  undef IS_INVALID_CHAR
945
946
#endif /* not XML_MIN_SIZE */
947
948
#ifdef XML_NS
949
950
static const struct normal_encoding big2_encoding_ns
951
    = {{VTABLE, 2, 0,
952
#  if BYTEORDER == 4321
953
        1
954
#  else
955
        0
956
#  endif
957
       },
958
       {
959
#  include "asciitab.h"
960
#  include "latin1tab.h"
961
       },
962
       STANDARD_VTABLE(big2_) NULL_VTABLE};
963
964
#endif
965
966
static const struct normal_encoding big2_encoding
967
    = {{VTABLE, 2, 0,
968
#if BYTEORDER == 4321
969
        1
970
#else
971
        0
972
#endif
973
       },
974
       {
975
#define BT_COLON BT_NMSTRT
976
#include "asciitab.h"
977
#undef BT_COLON
978
#include "latin1tab.h"
979
       },
980
       STANDARD_VTABLE(big2_) NULL_VTABLE};
981
982
#if BYTEORDER != 1234
983
984
#  ifdef XML_NS
985
986
static const struct normal_encoding internal_big2_encoding_ns
987
    = {{VTABLE, 2, 0, 1},
988
       {
989
#    include "iasciitab.h"
990
#    include "latin1tab.h"
991
       },
992
       STANDARD_VTABLE(big2_) NULL_VTABLE};
993
994
#  endif
995
996
static const struct normal_encoding internal_big2_encoding
997
    = {{VTABLE, 2, 0, 1},
998
       {
999
#  define BT_COLON BT_NMSTRT
1000
#  include "iasciitab.h"
1001
#  undef BT_COLON
1002
#  include "latin1tab.h"
1003
       },
1004
       STANDARD_VTABLE(big2_) NULL_VTABLE};
1005
1006
#endif
1007
1008
#undef PREFIX
1009
1010
static int FASTCALL
1011
3.92k
streqci(const char *s1, const char *s2) {
1012
21.2k
  for (;;) {
1013
21.2k
    char c1 = *s1++;
1014
21.2k
    char c2 = *s2++;
1015
21.2k
    if (ASCII_a <= c1 && c1 <= ASCII_z)
1016
1.09k
      c1 += ASCII_A - ASCII_a;
1017
21.2k
    if (ASCII_a <= c2 && c2 <= ASCII_z)
1018
      /* The following line will never get executed.  streqci() is
1019
       * only called from two places, both of which guarantee to put
1020
       * upper-case strings into s2.
1021
       */
1022
0
      c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1023
21.2k
    if (c1 != c2)
1024
2.23k
      return 0;
1025
19.0k
    if (! c1)
1026
1.68k
      break;
1027
19.0k
  }
1028
1.68k
  return 1;
1029
3.92k
}
1030
1031
/* Position-update handler that ignores the encoding it is given and
   delegates to normal_updatePosition using utf8_encoding, updating *pos
   for the bytes in [ptr, end). */
static void PTRCALL
1032
initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1033
27
                   POSITION *pos) {
1034
27
  UNUSED_P(enc);
1035
27
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1036
27
}
1037
1038
static int
1039
241k
toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1040
241k
  char buf[1];
1041
241k
  char *p = buf;
1042
241k
  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1043
241k
  if (p == buf)
1044
309
    return -1;
1045
241k
  else
1046
241k
    return buf[0];
1047
241k
}
1048
1049
static int FASTCALL
1050
60.9k
isSpace(int c) {
1051
60.9k
  switch (c) {
1052
5.02k
  case 0x20:
1053
6.05k
  case 0xD:
1054
6.93k
  case 0xA:
1055
8.13k
  case 0x9:
1056
8.13k
    return 1;
1057
60.9k
  }
1058
52.8k
  return 0;
1059
60.9k
}
1060
1061
/* Return 1 if there's just optional white space or there's an S
1062
   followed by name=val.
1063
*/
1064
static int
1065
parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1066
                     const char **namePtr, const char **nameEndPtr,
1067
5.75k
                     const char **valPtr, const char **nextTokPtr) {
1068
5.75k
  int c;
1069
5.75k
  char open;
1070
5.75k
  if (ptr == end) {
1071
1.75k
    *namePtr = NULL;
1072
1.75k
    return 1;
1073
1.75k
  }
1074
4.00k
  if (! isSpace(toAscii(enc, ptr, end))) {
1075
3
    *nextTokPtr = ptr;
1076
3
    return 0;
1077
3
  }
1078
4.97k
  do {
1079
4.97k
    ptr += enc->minBytesPerChar;
1080
4.97k
  } while (isSpace(toAscii(enc, ptr, end)));
1081
4.00k
  if (ptr == end) {
1082
48
    *namePtr = NULL;
1083
48
    return 1;
1084
48
  }
1085
3.95k
  *namePtr = ptr;
1086
48.7k
  for (;;) {
1087
48.7k
    c = toAscii(enc, ptr, end);
1088
48.7k
    if (c == -1) {
1089
19
      *nextTokPtr = ptr;
1090
19
      return 0;
1091
19
    }
1092
48.7k
    if (c == ASCII_EQUALS) {
1093
3.85k
      *nameEndPtr = ptr;
1094
3.85k
      break;
1095
3.85k
    }
1096
44.8k
    if (isSpace(c)) {
1097
78
      *nameEndPtr = ptr;
1098
1.20k
      do {
1099
1.20k
        ptr += enc->minBytesPerChar;
1100
1.20k
      } while (isSpace(c = toAscii(enc, ptr, end)));
1101
78
      if (c != ASCII_EQUALS) {
1102
63
        *nextTokPtr = ptr;
1103
63
        return 0;
1104
63
      }
1105
15
      break;
1106
78
    }
1107
44.8k
    ptr += enc->minBytesPerChar;
1108
44.8k
  }
1109
3.87k
  if (ptr == *namePtr) {
1110
3
    *nextTokPtr = ptr;
1111
3
    return 0;
1112
3
  }
1113
3.87k
  ptr += enc->minBytesPerChar;
1114
3.87k
  c = toAscii(enc, ptr, end);
1115
4.84k
  while (isSpace(c)) {
1116
975
    ptr += enc->minBytesPerChar;
1117
975
    c = toAscii(enc, ptr, end);
1118
975
  }
1119
3.87k
  if (c != ASCII_QUOT && c != ASCII_APOS) {
1120
53
    *nextTokPtr = ptr;
1121
53
    return 0;
1122
53
  }
1123
3.81k
  open = (char)c;
1124
3.81k
  ptr += enc->minBytesPerChar;
1125
3.81k
  *valPtr = ptr;
1126
174k
  for (;; ptr += enc->minBytesPerChar) {
1127
174k
    c = toAscii(enc, ptr, end);
1128
174k
    if (c == open)
1129
3.71k
      break;
1130
171k
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1131
171k
        && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1132
171k
        && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1133
101
      *nextTokPtr = ptr;
1134
101
      return 0;
1135
101
    }
1136
171k
  }
1137
3.71k
  *nextTokPtr = ptr + enc->minBytesPerChar;
1138
3.71k
  return 1;
1139
3.81k
}
1140
1141
static const char KW_version[]
1142
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1143
1144
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1145
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};
1146
1147
static const char KW_standalone[]
1148
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1149
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1150
1151
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1152
1153
static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1154
1155
static int
1156
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1157
                                                 const char *),
1158
               int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1159
               const char *end, const char **badPtr, const char **versionPtr,
1160
               const char **versionEndPtr, const char **encodingName,
1161
2.13k
               const ENCODING **encoding, int *standalone) {
1162
2.13k
  const char *val = NULL;
1163
2.13k
  const char *name = NULL;
1164
2.13k
  const char *nameEnd = NULL;
1165
2.13k
  ptr += 5 * enc->minBytesPerChar;
1166
2.13k
  end -= 2 * enc->minBytesPerChar;
1167
2.13k
  if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1168
2.13k
      || ! name) {
1169
262
    *badPtr = ptr;
1170
262
    return 0;
1171
262
  }
1172
1.86k
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1173
3
    if (! isGeneralTextEntity) {
1174
3
      *badPtr = name;
1175
3
      return 0;
1176
3
    }
1177
1.86k
  } else {
1178
1.86k
    if (versionPtr)
1179
1.86k
      *versionPtr = val;
1180
1.86k
    if (versionEndPtr)
1181
1.86k
      *versionEndPtr = ptr;
1182
1.86k
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1183
15
      *badPtr = ptr;
1184
15
      return 0;
1185
15
    }
1186
1.85k
    if (! name) {
1187
6
      if (isGeneralTextEntity) {
1188
        /* a TextDecl must have an EncodingDecl */
1189
0
        *badPtr = ptr;
1190
0
        return 0;
1191
0
      }
1192
6
      return 1;
1193
6
    }
1194
1.85k
  }
1195
1.84k
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1196
1.76k
    int c = toAscii(enc, val, end);
1197
1.76k
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1198
4
      *badPtr = val;
1199
4
      return 0;
1200
4
    }
1201
1.76k
    if (encodingName)
1202
1.76k
      *encodingName = val;
1203
1.76k
    if (encoding)
1204
1.76k
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1205
1.76k
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1206
7
      *badPtr = ptr;
1207
7
      return 0;
1208
7
    }
1209
1.75k
    if (! name)
1210
1.75k
      return 1;
1211
1.75k
  }
1212
81
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1213
81
      || isGeneralTextEntity) {
1214
4
    *badPtr = name;
1215
4
    return 0;
1216
4
  }
1217
77
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1218
37
    if (standalone)
1219
37
      *standalone = 1;
1220
40
  } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1221
39
    if (standalone)
1222
39
      *standalone = 0;
1223
39
  } else {
1224
1
    *badPtr = val;
1225
1
    return 0;
1226
1
  }
1227
1.05k
  while (isSpace(toAscii(enc, ptr, end)))
1228
976
    ptr += enc->minBytesPerChar;
1229
76
  if (ptr != end) {
1230
3
    *badPtr = ptr;
1231
3
    return 0;
1232
3
  }
1233
73
  return 1;
1234
76
}
1235
1236
static int FASTCALL
1237
157k
checkCharRefNumber(int result) {
1238
157k
  switch (result >> 8) {
1239
1
  case 0xD8:
1240
2
  case 0xD9:
1241
3
  case 0xDA:
1242
4
  case 0xDB:
1243
5
  case 0xDC:
1244
10
  case 0xDD:
1245
12
  case 0xDE:
1246
13
  case 0xDF:
1247
13
    return -1;
1248
16.6k
  case 0:
1249
16.6k
    if (latin1_encoding.type[result] == BT_NONXML)
1250
37
      return -1;
1251
16.6k
    break;
1252
16.6k
  case 0xFF:
1253
7.43k
    if (result == 0xFFFE || result == 0xFFFF)
1254
5
      return -1;
1255
7.43k
    break;
1256
157k
  }
1257
157k
  return result;
1258
157k
}
1259
1260
int FASTCALL
1261
157k
XmlUtf8Encode(int c, char *buf) {
1262
157k
  enum {
1263
    /* minN is minimum legal resulting value for N byte sequence */
1264
157k
    min2 = 0x80,
1265
157k
    min3 = 0x800,
1266
157k
    min4 = 0x10000
1267
157k
  };
1268
1269
157k
  if (c < 0)
1270
0
    return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1271
157k
  if (c < min2) {
1272
10.9k
    buf[0] = (char)(c | UTF8_cval1);
1273
10.9k
    return 1;
1274
10.9k
  }
1275
146k
  if (c < min3) {
1276
6.95k
    buf[0] = (char)((c >> 6) | UTF8_cval2);
1277
6.95k
    buf[1] = (char)((c & 0x3f) | 0x80);
1278
6.95k
    return 2;
1279
6.95k
  }
1280
139k
  if (c < min4) {
1281
110k
    buf[0] = (char)((c >> 12) | UTF8_cval3);
1282
110k
    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1283
110k
    buf[2] = (char)((c & 0x3f) | 0x80);
1284
110k
    return 3;
1285
110k
  }
1286
28.9k
  if (c < 0x110000) {
1287
28.9k
    buf[0] = (char)((c >> 18) | UTF8_cval4);
1288
28.9k
    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1289
28.9k
    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1290
28.9k
    buf[3] = (char)((c & 0x3f) | 0x80);
1291
28.9k
    return 4;
1292
28.9k
  }
1293
0
  return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1294
28.9k
}
1295
1296
int FASTCALL
1297
0
XmlUtf16Encode(int charNum, unsigned short *buf) {
1298
0
  if (charNum < 0)
1299
0
    return 0;
1300
0
  if (charNum < 0x10000) {
1301
0
    buf[0] = (unsigned short)charNum;
1302
0
    return 1;
1303
0
  }
1304
0
  if (charNum < 0x110000) {
1305
0
    charNum -= 0x10000;
1306
0
    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1307
0
    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1308
0
    return 2;
1309
0
  }
1310
0
  return 0;
1311
0
}
1312
1313
/* Encoding object for a caller-described encoding; it is populated by
   XmlInitUnknownEncoding from a 256-entry table and an optional converter. */
struct unknown_encoding {
1314
  /* Base encoding; XmlInitUnknownEncoding returns &normal.enc and
     AS_UNKNOWN_ENCODING casts such a pointer back to this struct. */
  struct normal_encoding normal;
1315
  /* Converter called (with userData) for bytes that start a multi-byte
     sequence; may be NULL when the table has no such bytes. */
  CONVERTER convert;
1316
  /* Opaque pointer passed as the first argument to convert. */
  void *userData;
1317
  /* Per-byte UTF-16 unit; 0 makes unknown_toUtf16 call convert instead. */
  unsigned short utf16[256];
1318
  /* Per-byte UTF-8 form: [0] is the byte count (0 makes unknown_toUtf8
     call convert instead), followed by the encoded bytes. */
  char utf8[256][4];
1319
};
1320
1321
0
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1322
1323
int
1324
0
XmlSizeOfUnknownEncoding(void) {
1325
0
  return sizeof(struct unknown_encoding);
1326
0
}
1327
1328
static int PTRFASTCALL
1329
0
unknown_isName(const ENCODING *enc, const char *p) {
1330
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1331
0
  int c = uenc->convert(uenc->userData, p);
1332
0
  if (c & ~0xFFFF)
1333
0
    return 0;
1334
0
  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1335
0
}
1336
1337
static int PTRFASTCALL
1338
0
unknown_isNmstrt(const ENCODING *enc, const char *p) {
1339
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1340
0
  int c = uenc->convert(uenc->userData, p);
1341
0
  if (c & ~0xFFFF)
1342
0
    return 0;
1343
0
  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1344
0
}
1345
1346
static int PTRFASTCALL
1347
0
unknown_isInvalid(const ENCODING *enc, const char *p) {
1348
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1349
0
  int c = uenc->convert(uenc->userData, p);
1350
0
  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1351
0
}
1352
1353
static enum XML_Convert_Result PTRCALL
1354
unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1355
0
               char **toP, const char *toLim) {
1356
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1357
0
  char buf[XML_UTF8_ENCODE_MAX];
1358
0
  for (;;) {
1359
0
    const char *utf8;
1360
0
    int n;
1361
0
    if (*fromP == fromLim)
1362
0
      return XML_CONVERT_COMPLETED;
1363
0
    utf8 = uenc->utf8[(unsigned char)**fromP];
1364
0
    n = *utf8++;
1365
0
    if (n == 0) {
1366
0
      int c = uenc->convert(uenc->userData, *fromP);
1367
0
      n = XmlUtf8Encode(c, buf);
1368
0
      if (n > toLim - *toP)
1369
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
1370
0
      utf8 = buf;
1371
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1372
0
                 - (BT_LEAD2 - 2));
1373
0
    } else {
1374
0
      if (n > toLim - *toP)
1375
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
1376
0
      (*fromP)++;
1377
0
    }
1378
0
    memcpy(*toP, utf8, n);
1379
0
    *toP += n;
1380
0
  }
1381
0
}
1382
1383
static enum XML_Convert_Result PTRCALL
1384
unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1385
0
                unsigned short **toP, const unsigned short *toLim) {
1386
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1387
0
  while (*fromP < fromLim && *toP < toLim) {
1388
0
    unsigned short c = uenc->utf16[(unsigned char)**fromP];
1389
0
    if (c == 0) {
1390
0
      c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1391
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1392
0
                 - (BT_LEAD2 - 2));
1393
0
    } else
1394
0
      (*fromP)++;
1395
0
    *(*toP)++ = c;
1396
0
  }
1397
1398
0
  if ((*toP == toLim) && (*fromP < fromLim))
1399
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
1400
0
  else
1401
0
    return XML_CONVERT_COMPLETED;
1402
0
}
1403
1404
ENCODING *
1405
XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1406
0
                       void *userData) {
1407
0
  int i;
1408
0
  struct unknown_encoding *e = (struct unknown_encoding *)mem;
1409
0
  memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1410
0
  for (i = 0; i < 128; i++)
1411
0
    if (latin1_encoding.type[i] != BT_OTHER
1412
0
        && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1413
0
      return 0;
1414
0
  for (i = 0; i < 256; i++) {
1415
0
    int c = table[i];
1416
0
    if (c == -1) {
1417
0
      e->normal.type[i] = BT_MALFORM;
1418
      /* This shouldn't really get used. */
1419
0
      e->utf16[i] = 0xFFFF;
1420
0
      e->utf8[i][0] = 1;
1421
0
      e->utf8[i][1] = 0;
1422
0
    } else if (c < 0) {
1423
0
      if (c < -4)
1424
0
        return 0;
1425
      /* Multi-byte sequences need a converter function */
1426
0
      if (! convert)
1427
0
        return 0;
1428
0
      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1429
0
      e->utf8[i][0] = 0;
1430
0
      e->utf16[i] = 0;
1431
0
    } else if (c < 0x80) {
1432
0
      if (latin1_encoding.type[c] != BT_OTHER
1433
0
          && latin1_encoding.type[c] != BT_NONXML && c != i)
1434
0
        return 0;
1435
0
      e->normal.type[i] = latin1_encoding.type[c];
1436
0
      e->utf8[i][0] = 1;
1437
0
      e->utf8[i][1] = (char)c;
1438
0
      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1439
0
    } else if (checkCharRefNumber(c) < 0) {
1440
0
      e->normal.type[i] = BT_NONXML;
1441
      /* This shouldn't really get used. */
1442
0
      e->utf16[i] = 0xFFFF;
1443
0
      e->utf8[i][0] = 1;
1444
0
      e->utf8[i][1] = 0;
1445
0
    } else {
1446
0
      if (c > 0xFFFF)
1447
0
        return 0;
1448
0
      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1449
0
        e->normal.type[i] = BT_NMSTRT;
1450
0
      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1451
0
        e->normal.type[i] = BT_NAME;
1452
0
      else
1453
0
        e->normal.type[i] = BT_OTHER;
1454
0
      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1455
0
      e->utf16[i] = (unsigned short)c;
1456
0
    }
1457
0
  }
1458
0
  e->userData = userData;
1459
0
  e->convert = convert;
1460
0
  if (convert) {
1461
0
    e->normal.isName2 = unknown_isName;
1462
0
    e->normal.isName3 = unknown_isName;
1463
0
    e->normal.isName4 = unknown_isName;
1464
0
    e->normal.isNmstrt2 = unknown_isNmstrt;
1465
0
    e->normal.isNmstrt3 = unknown_isNmstrt;
1466
0
    e->normal.isNmstrt4 = unknown_isNmstrt;
1467
0
    e->normal.isInvalid2 = unknown_isInvalid;
1468
0
    e->normal.isInvalid3 = unknown_isInvalid;
1469
0
    e->normal.isInvalid4 = unknown_isInvalid;
1470
0
  }
1471
0
  e->normal.enc.utf8Convert = unknown_toUtf8;
1472
0
  e->normal.enc.utf16Convert = unknown_toUtf16;
1473
0
  return &(e->normal.enc);
1474
0
}
1475
1476
/* If this enumeration is changed, getEncodingIndex and encodings
1477
must also be changed. */
1478
/* Encoding indices.  The values from ISO_8859_1_ENC through UTF_16LE_ENC
   are the positions of the corresponding names in the encodingNames array
   of getEncodingIndex, so the two must stay in the same order. */
enum {
1479
  /* returned by getEncodingIndex when the name matches no known encoding */
  UNKNOWN_ENC = -1,
1480
  ISO_8859_1_ENC = 0,
1481
  US_ASCII_ENC,
1482
  UTF_8_ENC,
1483
  UTF_16_ENC,
1484
  UTF_16BE_ENC,
1485
  UTF_16LE_ENC,
1486
  /* must match encodingNames up to here */
1487
  /* returned by getEncodingIndex when no name is given (NULL) */
  NO_ENC
1488
};
1489
1490
static const char KW_ISO_8859_1[]
1491
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
1492
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
1493
static const char KW_US_ASCII[]
1494
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1495
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
1496
static const char KW_UTF_8[]
1497
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1498
static const char KW_UTF_16[]
1499
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1500
static const char KW_UTF_16BE[]
1501
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1502
       ASCII_6, ASCII_B, ASCII_E, '\0'};
1503
static const char KW_UTF_16LE[]
1504
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1505
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1506
1507
static int FASTCALL
1508
22.2k
getEncodingIndex(const char *name) {
1509
22.2k
  static const char *const encodingNames[] = {
1510
22.2k
      KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1511
22.2k
  };
1512
22.2k
  int i;
1513
22.2k
  if (name == NULL)
1514
20.5k
    return NO_ENC;
1515
2.24k
  for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1516
2.17k
    if (streqci(name, encodingNames[i]))
1517
1.68k
      return i;
1518
68
  return UNKNOWN_ENC;
1519
1.75k
}
1520
1521
/* For binary compatibility, we store the index of the encoding
1522
   specified at initialization in the isUtf16 member.
1523
*/
1524
1525
10.2k
/* Read the encoding index stored in the isUtf16 member of enc->initEnc. */
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
/* Store encoding index i (narrowed to char) in that member.  The argument
   is parenthesized so the cast applies to the whole expression passed in. */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1527
1528
/* This is what detects the encoding.  encodingTable maps from
1529
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1530
   the external (protocol) specified encoding; state is
1531
   XML_CONTENT_STATE if we're parsing an external text entity, and
1532
   XML_PROLOG_STATE otherwise.
1533
*/
1534
1535
static int
1536
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1537
10.2k
         int state, const char *ptr, const char *end, const char **nextTokPtr) {
1538
10.2k
  const ENCODING **encPtr;
1539
1540
10.2k
  if (ptr >= end)
1541
1
    return XML_TOK_NONE;
1542
10.2k
  encPtr = enc->encPtr;
1543
10.2k
  if (ptr + 1 == end) {
1544
    /* only a single byte available for auto-detection */
1545
16
#ifndef XML_DTD /* FIXME */
1546
    /* a well-formed document entity must have more than one byte */
1547
16
    if (state != XML_CONTENT_STATE)
1548
16
      return XML_TOK_PARTIAL;
1549
0
#endif
1550
    /* so we're parsing an external text entity... */
1551
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
1552
0
    switch (INIT_ENC_INDEX(enc)) {
1553
0
    case UTF_16_ENC:
1554
0
    case UTF_16LE_ENC:
1555
0
    case UTF_16BE_ENC:
1556
0
      return XML_TOK_PARTIAL;
1557
0
    }
1558
0
    switch ((unsigned char)*ptr) {
1559
0
    case 0xFE:
1560
0
    case 0xFF:
1561
0
    case 0xEF: /* possibly first byte of UTF-8 BOM */
1562
0
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1563
0
        break;
1564
      /* fall through */
1565
0
    case 0x00:
1566
0
    case 0x3C:
1567
0
      return XML_TOK_PARTIAL;
1568
0
    }
1569
10.2k
  } else {
1570
10.2k
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1571
113
    case 0xFEFF:
1572
113
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1573
0
        break;
1574
113
      *nextTokPtr = ptr + 2;
1575
113
      *encPtr = encodingTable[UTF_16BE_ENC];
1576
113
      return XML_TOK_BOM;
1577
    /* 00 3C is handled in the default case */
1578
0
    case 0x3C00:
1579
0
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1580
0
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1581
0
          && state == XML_CONTENT_STATE)
1582
0
        break;
1583
0
      *encPtr = encodingTable[UTF_16LE_ENC];
1584
0
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1585
153
    case 0xFFFE:
1586
153
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1587
0
        break;
1588
153
      *nextTokPtr = ptr + 2;
1589
153
      *encPtr = encodingTable[UTF_16LE_ENC];
1590
153
      return XML_TOK_BOM;
1591
51
    case 0xEFBB:
1592
      /* Maybe a UTF-8 BOM (EF BB BF) */
1593
      /* If there's an explicitly specified (external) encoding
1594
         of ISO-8859-1 or some flavour of UTF-16
1595
         and this is an external text entity,
1596
         don't look for the BOM,
1597
         because it might be a legal data.
1598
      */
1599
51
      if (state == XML_CONTENT_STATE) {
1600
0
        int e = INIT_ENC_INDEX(enc);
1601
0
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1602
0
            || e == UTF_16_ENC)
1603
0
          break;
1604
0
      }
1605
51
      if (ptr + 2 == end)
1606
2
        return XML_TOK_PARTIAL;
1607
49
      if ((unsigned char)ptr[2] == 0xBF) {
1608
36
        *nextTokPtr = ptr + 3;
1609
36
        *encPtr = encodingTable[UTF_8_ENC];
1610
36
        return XML_TOK_BOM;
1611
36
      }
1612
13
      break;
1613
9.92k
    default:
1614
9.92k
      if (ptr[0] == '\0') {
1615
        /* 0 isn't a legal data character. Furthermore a document
1616
           entity can only start with ASCII characters.  So the only
1617
           way this can fail to be big-endian UTF-16 if it it's an
1618
           external parsed general entity that's labelled as
1619
           UTF-16LE.
1620
        */
1621
0
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1622
0
          break;
1623
0
        *encPtr = encodingTable[UTF_16BE_ENC];
1624
0
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1625
9.92k
      } else if (ptr[1] == '\0') {
1626
        /* We could recover here in the case:
1627
            - parsing an external entity
1628
            - second byte is 0
1629
            - no externally specified encoding
1630
            - no encoding declaration
1631
           by assuming UTF-16LE.  But we don't, because this would mean when
1632
           presented just with a single byte, we couldn't reliably determine
1633
           whether we needed further bytes.
1634
        */
1635
0
        if (state == XML_CONTENT_STATE)
1636
0
          break;
1637
0
        *encPtr = encodingTable[UTF_16LE_ENC];
1638
0
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1639
0
      }
1640
9.92k
      break;
1641
10.2k
    }
1642
10.2k
  }
1643
9.93k
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1644
9.93k
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1645
10.2k
}
1646
1647
55.0k
#define NS(x) x
1648
10.2k
#define ns(x) x
1649
#define XML_TOK_NS_C
1650
#include "xmltok_ns.c"
1651
#undef XML_TOK_NS_C
1652
#undef NS
1653
#undef ns
1654
1655
#ifdef XML_NS
1656
1657
#  define NS(x) x##NS
1658
#  define ns(x) x##_ns
1659
1660
#  define XML_TOK_NS_C
1661
#  include "xmltok_ns.c"
1662
#  undef XML_TOK_NS_C
1663
1664
#  undef NS
1665
#  undef ns
1666
1667
ENCODING *
1668
XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1669
                         void *userData) {
1670
  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1671
  if (enc)
1672
    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1673
  return enc;
1674
}
1675
1676
#endif /* XML_NS */