Coverage Report

Created: 2024-07-27 06:44

/src/CMake/Utilities/cmexpat/lib/xmltok.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
                            __  __            _
3
                         ___\ \/ /_ __   __ _| |_
4
                        / _ \\  /| '_ \ / _` | __|
5
                       |  __//  \| |_) | (_| | |_
6
                        \___/_/\_\ .__/ \__,_|\__|
7
                                 |_| XML parser
8
9
   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10
   Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11
   Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12
   Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
13
   Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14
   Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15
   Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
16
   Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
17
   Copyright (c) 2016      Don Lewis <truckman@apache.org>
18
   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
19
   Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
20
   Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
21
   Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
22
   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
23
   Copyright (c) 2021      Dong-hee Na <donghee.na@python.org>
24
   Licensed under the MIT license:
25
26
   Permission is  hereby granted,  free of charge,  to any  person obtaining
27
   a  copy  of  this  software   and  associated  documentation  files  (the
28
   "Software"),  to  deal in  the  Software  without restriction,  including
29
   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
30
   distribute, sublicense, and/or sell copies of the Software, and to permit
31
   persons  to whom  the Software  is  furnished to  do so,  subject to  the
32
   following conditions:
33
34
   The above copyright  notice and this permission notice  shall be included
35
   in all copies or substantial portions of the Software.
36
37
   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
38
   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
39
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
40
   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
41
   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
42
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
43
   USE OR OTHER DEALINGS IN THE SOFTWARE.
44
*/
45
46
#include <expat_config.h>
47
48
#include <stddef.h>
49
#include <string.h> /* memcpy */
50
#include <stdbool.h>
51
52
#ifdef _WIN32
53
#  include "winconfig.h"
54
#endif
55
56
#include "expat_external.h"
57
#include "internal.h"
58
#include "xmltok.h"
59
#include "nametab.h"
60
61
/* Optional extra entry for the first brace group of VTABLE1: the
   ignore-section tokenizer is listed only when XML_DTD is defined. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment naming the PREFIX()-qualified tokenizer functions.
   PREFIX must be defined at the point of expansion. */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-qualified UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
76
77
/* Tests one bit of namingBitmap for a 16-bit character given as high byte
   `hi` and low byte `lo`: pages[hi] selects a group of eight bitmap words,
   the top three bits of lo pick the word and the bottom five bits pick the
   bit.  Evaluates to nonzero when the bit is set.  Every argument is
   parenthesized so arbitrary expressions can be passed; lo is evaluated
   twice, so it must not have side effects. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
79
80
/* A 2 byte UTF-8 representation splits the characters 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.

   `byte` must point at unsigned bytes; the macro evaluates to nonzero when
   the corresponding bit of namingBitmap is set.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))
88
89
/* A 3 byte UTF-8 representation splits the characters 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.

   `byte` must point at unsigned bytes; the macro evaluates to nonzero when
   the corresponding bit of namingBitmap is set.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))
100
101
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF
*/

/* Nonzero when the two bytes at p are not a valid 2-byte sequence: the lead
   byte is below 0xC2 or the second byte is outside 0x80..0xBF.  p must point
   at unsigned bytes and is evaluated more than once; it is parenthesized at
   every use so that pointer expressions such as `buf + 1` work. */
#define UTF8_INVALID2(p)                                                       \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
113
114
/* Nonzero when the three bytes at p are not an accepted 3-byte sequence.
   The third byte must be a continuation byte (and at most 0xBD after
   EF,BF, which excludes U+FFFE and U+FFFF); the second byte must be
   0xA0..0xBF after 0xE0, 0x80..0x9F after 0xED, and 0x80..0xBF otherwise.
   p must point at unsigned bytes and is evaluated more than once; it is
   parenthesized at every use so that pointer expressions can be passed. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((p)[0] == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((p)[0] == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
122
123
/* Nonzero when the four bytes at p are not an accepted 4-byte sequence.
   The third and fourth bytes must be continuation bytes; the second byte
   must be 0x90..0xBF after 0xF0, 0x80..0x8F after 0xF4, and 0x80..0xBF
   otherwise.  p must point at unsigned bytes and is evaluated more than
   once; it is parenthesized at every use so that pointer expressions can be
   passed. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((p)[0] == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
130
131
static int PTRFASTCALL
132
142
isNever(const ENCODING *enc, const char *p) {
133
142
  UNUSED_P(enc);
134
142
  UNUSED_P(p);
135
142
  return 0;
136
142
}
137
138
static int PTRFASTCALL
139
13.4M
utf8_isName2(const ENCODING *enc, const char *p) {
140
13.4M
  UNUSED_P(enc);
141
13.4M
  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
142
13.4M
}
143
144
static int PTRFASTCALL
145
100k
utf8_isName3(const ENCODING *enc, const char *p) {
146
100k
  UNUSED_P(enc);
147
100k
  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
148
100k
}
149
150
#define utf8_isName4 isNever
151
152
static int PTRFASTCALL
153
49.6k
utf8_isNmstrt2(const ENCODING *enc, const char *p) {
154
49.6k
  UNUSED_P(enc);
155
49.6k
  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
156
49.6k
}
157
158
static int PTRFASTCALL
159
15.6k
utf8_isNmstrt3(const ENCODING *enc, const char *p) {
160
15.6k
  UNUSED_P(enc);
161
15.6k
  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
162
15.6k
}
163
164
#define utf8_isNmstrt4 isNever
165
166
static int PTRFASTCALL
167
133M
utf8_isInvalid2(const ENCODING *enc, const char *p) {
168
133M
  UNUSED_P(enc);
169
133M
  return UTF8_INVALID2((const unsigned char *)p);
170
133M
}
171
172
static int PTRFASTCALL
173
391k
utf8_isInvalid3(const ENCODING *enc, const char *p) {
174
391k
  UNUSED_P(enc);
175
391k
  return UTF8_INVALID3((const unsigned char *)p);
176
391k
}
177
178
static int PTRFASTCALL
179
87.8k
utf8_isInvalid4(const ENCODING *enc, const char *p) {
180
87.8k
  UNUSED_P(enc);
181
87.8k
  return UTF8_INVALID4((const unsigned char *)p);
182
87.8k
}
183
184
struct normal_encoding {
185
  ENCODING enc;
186
  unsigned char type[256];
187
#ifdef XML_MIN_SIZE
188
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
189
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
190
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
191
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
192
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
193
#endif /* XML_MIN_SIZE */
194
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
195
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
196
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
197
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
198
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
199
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
200
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
201
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
202
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
203
};
204
205
147M
/* Views an ENCODING pointer as a pointer to its enclosing normal_encoding. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

/* Initializers for the XML_MIN_SIZE-only members of struct normal_encoding;
   expands to nothing in the regular build.  Note the trailing comma. */
#ifdef XML_MIN_SIZE

#  define STANDARD_VTABLE(E)                                                   \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine multi-byte predicates, taken from the functions
   (or macros) whose names start with the prefix E. */
#define NORMAL_VTABLE(E)                                                       \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* Initializers that leave all nine multi-byte predicates NULL. */
#define NULL_VTABLE                                                            \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
226
227
static int FASTCALL checkCharRefNumber(int);
228
229
#include "xmltok_impl.h"
230
#include "ascii.h"
231
232
/* In the XML_MIN_SIZE build the single-byte "Min" name predicates are
   isNever. */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

/* Minimum bytes per character: read from the encoding in the XML_MIN_SIZE
   build, the constant 1 otherwise. */
#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif
243
244
/* Byte type of the single byte at p, looked up in the type table of the
   normal_encoding that enc points to. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

/* BYTE_TYPE goes through the byteType callback in the XML_MIN_SIZE build
   and is a direct table lookup otherwise. */
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
256
257
/* BYTE_TO_ASCII yields the byte at p as a character value.  The
   XML_MIN_SIZE build goes through the byteToAscii callback, for which
   sb_byteToAscii is the single-byte implementation. */
#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif
267
268
13.5M
/* Dispatch to the encoding's n-byte name / name-start predicates
   (n is pasted onto the member name, e.g. isName2). */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))

/* Dispatch to the encoding's n-byte invalid-sequence predicate.  The
   XML_MIN_SIZE variant treats a NULL callback as "not invalid"; the regular
   variant calls the callback unconditionally. */
#ifdef XML_MIN_SIZE
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif

/* Name predicates for characters of minimum width: callbacks in the
   XML_MIN_SIZE build, constant 0 otherwise. */
#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
288
289
/* CHAR_MATCHES tests whether the character at p equals c.  The XML_MIN_SIZE
   build goes through the charMatches callback, for which sb_charMatches is
   the single-byte implementation.  In the direct form both p and c are
   parenthesized so that any expression can be passed for either. */
#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
301
302
31.4M
/* Instantiate the tokenizer implementation with the normal_ name prefix,
   using the helper macros defined above. */
#define PREFIX(ident) normal_##ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Drop the helper macros defined for this instantiation. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
316
317
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
       UTF8_cval1 = 0x00,
       UTF8_cval2 = 0xc0,
       UTF8_cval3 = 0xe0,
       UTF8_cval4 = 0xf0
};
323
324
/* Moves *fromLimRef backwards, if necessary, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned backwards from its end.  `walked` counts the bytes
   stepped over since the scan started or since the last truncated lead byte
   was dropped.  When a lead byte is found, the character is kept (and the
   limit restored to just past it) if enough bytes follow it; otherwise the
   lead byte is dropped and the scan continues.  A single-byte character
   (0b0xxxxxxx) ends the scan at once.

   from        start of the buffer; the limit never moves below it
   fromLimRef  in: end of the candidate range; out: the trimmed end
*/
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *fromLim = *fromLimRef;
  size_t walked = 0;
  for (; fromLim > from; fromLim--, walked++) {
    const unsigned char prev = (unsigned char)fromLim[-1];
    if ((prev & 0xf8u)
        == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */
      if (walked + 1 >= 4) {
        fromLim += 4 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0xf0u)
               == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */
      if (walked + 1 >= 3) {
        fromLim += 3 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0xe0u)
               == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */
      if (walked + 1 >= 2) {
        fromLim += 2 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0x80u)
               == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
      break;
    }
  }
  *fromLimRef = fromLim;
}
362
363
static enum XML_Convert_Result PTRCALL
364
utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
365
53.1M
            char **toP, const char *toLim) {
366
53.1M
  bool input_incomplete = false;
367
53.1M
  bool output_exhausted = false;
368
369
  /* Avoid copying partial characters (due to limited space). */
370
53.1M
  const ptrdiff_t bytesAvailable = fromLim - *fromP;
371
53.1M
  const ptrdiff_t bytesStorable = toLim - *toP;
372
53.1M
  UNUSED_P(enc);
373
53.1M
  if (bytesAvailable > bytesStorable) {
374
223k
    fromLim = *fromP + bytesStorable;
375
223k
    output_exhausted = true;
376
223k
  }
377
378
  /* Avoid copying partial characters (from incomplete input). */
379
53.1M
  {
380
53.1M
    const char *const fromLimBefore = fromLim;
381
53.1M
    _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
382
53.1M
    if (fromLim < fromLimBefore) {
383
3.23k
      input_incomplete = true;
384
3.23k
    }
385
53.1M
  }
386
387
53.1M
  {
388
53.1M
    const ptrdiff_t bytesToCopy = fromLim - *fromP;
389
53.1M
    memcpy(*toP, *fromP, bytesToCopy);
390
53.1M
    *fromP += bytesToCopy;
391
53.1M
    *toP += bytesToCopy;
392
53.1M
  }
393
394
53.1M
  if (output_exhausted) /* needs to go first */
395
223k
    return XML_CONVERT_OUTPUT_EXHAUSTED;
396
52.9M
  else if (input_incomplete)
397
0
    return XML_CONVERT_INPUT_INCOMPLETE;
398
52.9M
  else
399
52.9M
    return XML_CONVERT_COMPLETED;
400
53.1M
}
401
402
static enum XML_Convert_Result PTRCALL
403
utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
404
0
             unsigned short **toP, const unsigned short *toLim) {
405
0
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
406
0
  unsigned short *to = *toP;
407
0
  const char *from = *fromP;
408
0
  while (from < fromLim && to < toLim) {
409
0
    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
410
0
    case BT_LEAD2:
411
0
      if (fromLim - from < 2) {
412
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
413
0
        goto after;
414
0
      }
415
0
      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
416
0
      from += 2;
417
0
      break;
418
0
    case BT_LEAD3:
419
0
      if (fromLim - from < 3) {
420
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
421
0
        goto after;
422
0
      }
423
0
      *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
424
0
                               | (from[2] & 0x3f));
425
0
      from += 3;
426
0
      break;
427
0
    case BT_LEAD4: {
428
0
      unsigned long n;
429
0
      if (toLim - to < 2) {
430
0
        res = XML_CONVERT_OUTPUT_EXHAUSTED;
431
0
        goto after;
432
0
      }
433
0
      if (fromLim - from < 4) {
434
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
435
0
        goto after;
436
0
      }
437
0
      n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
438
0
          | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
439
0
      n -= 0x10000;
440
0
      to[0] = (unsigned short)((n >> 10) | 0xD800);
441
0
      to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
442
0
      to += 2;
443
0
      from += 4;
444
0
    } break;
445
0
    default:
446
0
      *to++ = *from++;
447
0
      break;
448
0
    }
449
0
  }
450
0
  if (from < fromLim)
451
0
    res = XML_CONVERT_OUTPUT_EXHAUSTED;
452
0
after:
453
0
  *fromP = from;
454
0
  *toP = to;
455
0
  return res;
456
0
}
457
458
#ifdef XML_NS
459
static const struct normal_encoding utf8_encoding_ns
460
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
461
       {
462
#  include "asciitab.h"
463
#  include "utf8tab.h"
464
       },
465
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
466
#endif
467
468
static const struct normal_encoding utf8_encoding
469
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
470
       {
471
#define BT_COLON BT_NMSTRT
472
#include "asciitab.h"
473
#undef BT_COLON
474
#include "utf8tab.h"
475
       },
476
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
477
478
#ifdef XML_NS
479
480
static const struct normal_encoding internal_utf8_encoding_ns
481
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
482
       {
483
#  include "iasciitab.h"
484
#  include "utf8tab.h"
485
       },
486
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
487
488
#endif
489
490
static const struct normal_encoding internal_utf8_encoding
491
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
492
       {
493
#define BT_COLON BT_NMSTRT
494
#include "iasciitab.h"
495
#undef BT_COLON
496
#include "utf8tab.h"
497
       },
498
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
499
500
static enum XML_Convert_Result PTRCALL
501
latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
502
156k
              char **toP, const char *toLim) {
503
156k
  UNUSED_P(enc);
504
93.5M
  for (;;) {
505
93.5M
    unsigned char c;
506
93.5M
    if (*fromP == fromLim)
507
145k
      return XML_CONVERT_COMPLETED;
508
93.4M
    c = (unsigned char)**fromP;
509
93.4M
    if (c & 0x80) {
510
90.2M
      if (toLim - *toP < 2)
511
8.38k
        return XML_CONVERT_OUTPUT_EXHAUSTED;
512
90.2M
      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
513
90.2M
      *(*toP)++ = (char)((c & 0x3f) | 0x80);
514
90.2M
      (*fromP)++;
515
90.2M
    } else {
516
3.21M
      if (*toP == toLim)
517
1.82k
        return XML_CONVERT_OUTPUT_EXHAUSTED;
518
3.21M
      *(*toP)++ = *(*fromP)++;
519
3.21M
    }
520
93.4M
  }
521
156k
}
522
523
static enum XML_Convert_Result PTRCALL
524
latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
525
0
               unsigned short **toP, const unsigned short *toLim) {
526
0
  UNUSED_P(enc);
527
0
  while (*fromP < fromLim && *toP < toLim)
528
0
    *(*toP)++ = (unsigned char)*(*fromP)++;
529
530
0
  if ((*toP == toLim) && (*fromP < fromLim))
531
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
532
0
  else
533
0
    return XML_CONVERT_COMPLETED;
534
0
}
535
536
#ifdef XML_NS
537
538
static const struct normal_encoding latin1_encoding_ns
539
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
540
       {
541
#  include "asciitab.h"
542
#  include "latin1tab.h"
543
       },
544
       STANDARD_VTABLE(sb_) NULL_VTABLE};
545
546
#endif
547
548
static const struct normal_encoding latin1_encoding
549
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
550
       {
551
#define BT_COLON BT_NMSTRT
552
#include "asciitab.h"
553
#undef BT_COLON
554
#include "latin1tab.h"
555
       },
556
       STANDARD_VTABLE(sb_) NULL_VTABLE};
557
558
static enum XML_Convert_Result PTRCALL
559
ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
560
949
             char **toP, const char *toLim) {
561
949
  UNUSED_P(enc);
562
7.82k
  while (*fromP < fromLim && *toP < toLim)
563
6.87k
    *(*toP)++ = *(*fromP)++;
564
565
949
  if ((*toP == toLim) && (*fromP < fromLim))
566
195
    return XML_CONVERT_OUTPUT_EXHAUSTED;
567
754
  else
568
754
    return XML_CONVERT_COMPLETED;
569
949
}
570
571
#ifdef XML_NS
572
573
static const struct normal_encoding ascii_encoding_ns
574
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
575
       {
576
#  include "asciitab.h"
577
           /* BT_NONXML == 0 */
578
       },
579
       STANDARD_VTABLE(sb_) NULL_VTABLE};
580
581
#endif
582
583
static const struct normal_encoding ascii_encoding
584
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
585
       {
586
#define BT_COLON BT_NMSTRT
587
#include "asciitab.h"
588
#undef BT_COLON
589
           /* BT_NONXML == 0 */
590
       },
591
       STANDARD_VTABLE(sb_) NULL_VTABLE};
592
593
static int PTRFASTCALL
594
537k
unicode_byte_type(char hi, char lo) {
595
537k
  switch ((unsigned char)hi) {
596
  /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
597
22
  case 0xD8:
598
41
  case 0xD9:
599
58
  case 0xDA:
600
75
  case 0xDB:
601
75
    return BT_LEAD4;
602
  /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
603
4
  case 0xDC:
604
9
  case 0xDD:
605
14
  case 0xDE:
606
20
  case 0xDF:
607
20
    return BT_TRAIL;
608
57
  case 0xFF:
609
57
    switch ((unsigned char)lo) {
610
5
    case 0xFF: /* noncharacter-FFFF */
611
6
    case 0xFE: /* noncharacter-FFFE */
612
6
      return BT_NONXML;
613
57
    }
614
51
    break;
615
537k
  }
616
537k
  return BT_NONASCII;
617
537k
}
618
619
/* Defines the static function E##toUtf8, which converts UTF-16 code units
   from *fromP into UTF-8 at *toP.  GET_LO and GET_HI must be defined at the
   point of instantiation; they select the byte order.

   The input limit is first rounded down to a whole number of 2-byte units.
   Units with a high byte of 0 and a low byte below 0x80 produce one output
   byte, other units with a high byte up to 0x07 produce two, high
   surrogates (0xD8..0xDB) consume a second unit and produce four, and
   everything else produces three.

   The generated function stores the position reached in *fromP and returns
   XML_CONVERT_OUTPUT_EXHAUSTED when the next character does not fit before
   toLim, XML_CONVERT_INPUT_INCOMPLETE when a high surrogate is not followed
   by another unit, and XML_CONVERT_COMPLETED otherwise. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
Unexecuted instantiation: xmltok.c:little2_toUtf8
Unexecuted instantiation: xmltok.c:big2_toUtf8
695
696
/* Defines the static function E##toUtf16, which copies UTF-16 code units
   from the byte stream at *fromP into unsigned short values at *toP.
   GET_LO and GET_HI must be defined at the point of instantiation; they
   select the byte order.

   The input limit is first rounded down to a whole number of 2-byte units.
   If the input will not fit into the output and its last unit is a high
   surrogate, that unit is left out and XML_CONVERT_INPUT_INCOMPLETE is
   remembered as the result.

   The generated function returns XML_CONVERT_OUTPUT_EXHAUSTED when the
   output is full while input remains, and the remembered result
   otherwise. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
Unexecuted instantiation: xmltok.c:little2_toUtf16
Unexecuted instantiation: xmltok.c:big2_toUtf16
716
717
/* Byte accessors for little-endian 2-byte units: byte 0 is the low byte,
   byte 1 the high byte.  They are defined only long enough to instantiate
   the little2_ converters and are then removed. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
718
0
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
719
0
#define GET_HI(ptr) ((unsigned char)(ptr)[1])
720
721
DEFINE_UTF16_TO_UTF8(little2_)
722
DEFINE_UTF16_TO_UTF16(little2_)
723
724
#undef SET2
725
#undef GET_LO
726
#undef GET_HI
727
728
/* Same accessors for big-endian 2-byte units: byte 0 is the high byte,
   byte 1 the low byte.  Used to instantiate the big2_ converters. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
729
0
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
730
0
#define GET_HI(ptr) ((unsigned char)(ptr)[0])
731
732
DEFINE_UTF16_TO_UTF8(big2_)
733
DEFINE_UTF16_TO_UTF16(big2_)
734
735
#undef SET2
736
#undef GET_LO
737
#undef GET_HI
738
739
/* Character classification helpers for little-endian 2-byte units: p[0]
   is the low byte and p[1] the high byte.  Every macro argument is
   parenthesised so that compound expressions (e.g. a conditional passed
   as c, or pointer arithmetic passed as p) expand with the intended
   precedence. */

/* Byte type of the unit at p: units with a zero high byte are looked up
   in the encoding's type table, all others go through
   unicode_byte_type(). */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]  \
               : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* Non-zero when the unit at p has a zero high byte and low byte c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Name / name-start tests via the naming bitmaps, indexed by
   (high byte, low byte). */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
748
749
/* Two ways of wiring up the little-endian 2-byte tokenizer:
   - XML_MIN_SIZE: the LITTLE2_* macros are wrapped in small functions and
     VTABLE is redefined to VTABLE1 plus the little2 converters.
   - otherwise: xmltok_impl.c is included with PREFIX producing little2_
     names and the character macros bound directly to LITTLE2_*. */
#ifdef XML_MIN_SIZE
750
751
static int PTRFASTCALL
752
little2_byteType(const ENCODING *enc, const char *p) {
753
  return LITTLE2_BYTE_TYPE(enc, p);
754
}
755
756
static int PTRFASTCALL
757
little2_byteToAscii(const ENCODING *enc, const char *p) {
758
  UNUSED_P(enc);
759
  return LITTLE2_BYTE_TO_ASCII(p);
760
}
761
762
static int PTRCALL
763
little2_charMatches(const ENCODING *enc, const char *p, int c) {
764
  UNUSED_P(enc);
765
  return LITTLE2_CHAR_MATCHES(p, c);
766
}
767
768
static int PTRFASTCALL
769
little2_isNameMin(const ENCODING *enc, const char *p) {
770
  UNUSED_P(enc);
771
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
772
}
773
774
static int PTRFASTCALL
775
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
776
  UNUSED_P(enc);
777
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
778
}
779
780
#  undef VTABLE
781
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
782
783
#else /* not XML_MIN_SIZE */
784
785
#  undef PREFIX
786
0
#  define PREFIX(ident) little2_##ident
787
759k
#  define MINBPC(enc) 2
788
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
789
379k
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
790
0
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
791
0
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
792
5
/* The n-byte name tests are constant 0 here; only the MINBPC variants
   consult the naming tables. */
#  define IS_NAME_CHAR(enc, p, n) 0
793
189k
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
794
5
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
795
121
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
796
797
#  define XML_TOK_IMPL_C
798
#  include "xmltok_impl.c"
799
#  undef XML_TOK_IMPL_C
800
801
#  undef MINBPC
802
#  undef BYTE_TYPE
803
#  undef BYTE_TO_ASCII
804
#  undef CHAR_MATCHES
805
#  undef IS_NAME_CHAR
806
#  undef IS_NAME_CHAR_MINBPC
807
#  undef IS_NMSTRT_CHAR
808
#  undef IS_NMSTRT_CHAR_MINBPC
809
#  undef IS_INVALID_CHAR
810
811
#endif /* not XML_MIN_SIZE */
812
813
/* Namespace-aware little-endian 2-byte encoding (XML_NS builds only).
   The byte-type table is asciitab.h followed by latin1tab.h.  The last
   scalar in the first initializer group is 1 only when BYTEORDER is 1234.
   NOTE(review): the three scalars after VTABLE presumably initialise
   minBytesPerChar, isUtf8 and isUtf16 - confirm against the ENCODING
   declaration. */
#ifdef XML_NS
814
815
static const struct normal_encoding little2_encoding_ns
816
    = {{VTABLE, 2, 0,
817
#  if BYTEORDER == 1234
818
        1
819
#  else
820
        0
821
#  endif
822
       },
823
       {
824
#  include "asciitab.h"
825
#  include "latin1tab.h"
826
       },
827
       STANDARD_VTABLE(little2_) NULL_VTABLE};
828
829
#endif
830
831
/* Little-endian 2-byte encoding without namespace processing.  Identical
   in layout to the _ns variant, except that BT_COLON is defined as
   BT_NMSTRT while asciitab.h is included, so any table entry that header
   spells BT_COLON becomes BT_NMSTRT here. */
static const struct normal_encoding little2_encoding
832
    = {{VTABLE, 2, 0,
833
#if BYTEORDER == 1234
834
        1
835
#else
836
        0
837
#endif
838
       },
839
       {
840
#define BT_COLON BT_NMSTRT
841
#include "asciitab.h"
842
#undef BT_COLON
843
#include "latin1tab.h"
844
       },
845
       STANDARD_VTABLE(little2_) NULL_VTABLE};
846
847
/* "Internal" little-endian 2-byte encodings, compiled unless BYTEORDER is
   4321.  They differ from little2_encoding(_ns) in using iasciitab.h
   instead of asciitab.h and in always having 1 as the last scalar of the
   first initializer group.  The _ns variant exists only under XML_NS; the
   plain variant maps BT_COLON to BT_NMSTRT while including iasciitab.h. */
#if BYTEORDER != 4321
848
849
#  ifdef XML_NS
850
851
static const struct normal_encoding internal_little2_encoding_ns
852
    = {{VTABLE, 2, 0, 1},
853
       {
854
#    include "iasciitab.h"
855
#    include "latin1tab.h"
856
       },
857
       STANDARD_VTABLE(little2_) NULL_VTABLE};
858
859
#  endif
860
861
static const struct normal_encoding internal_little2_encoding
862
    = {{VTABLE, 2, 0, 1},
863
       {
864
#  define BT_COLON BT_NMSTRT
865
#  include "iasciitab.h"
866
#  undef BT_COLON
867
#  include "latin1tab.h"
868
       },
869
       STANDARD_VTABLE(little2_) NULL_VTABLE};
870
871
#endif
872
873
/* Character classification helpers for big-endian 2-byte units: p[0] is
   the high byte and p[1] the low byte.  Every macro argument is
   parenthesised so that compound expressions (e.g. a conditional passed
   as c, or pointer arithmetic passed as p) expand with the intended
   precedence. */

/* Byte type of the unit at p: units with a zero high byte are looked up
   in the encoding's type table, all others go through
   unicode_byte_type(). */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0                                                                 \
       ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]]        \
       : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the unit at p has a zero high byte and low byte c. */
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Name / name-start tests via the naming bitmaps, indexed by
   (high byte, low byte). */
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
883
884
/* Two ways of wiring up the big-endian 2-byte tokenizer:
   - XML_MIN_SIZE: the BIG2_* macros are wrapped in small functions and
     VTABLE is redefined to VTABLE1 plus the big2 converters.
   - otherwise: xmltok_impl.c is included with PREFIX producing big2_
     names and the character macros bound directly to BIG2_*. */
#ifdef XML_MIN_SIZE
885
886
static int PTRFASTCALL
887
big2_byteType(const ENCODING *enc, const char *p) {
888
  return BIG2_BYTE_TYPE(enc, p);
889
}
890
891
static int PTRFASTCALL
892
big2_byteToAscii(const ENCODING *enc, const char *p) {
893
  UNUSED_P(enc);
894
  return BIG2_BYTE_TO_ASCII(p);
895
}
896
897
static int PTRCALL
898
big2_charMatches(const ENCODING *enc, const char *p, int c) {
899
  UNUSED_P(enc);
900
  return BIG2_CHAR_MATCHES(p, c);
901
}
902
903
static int PTRFASTCALL
904
big2_isNameMin(const ENCODING *enc, const char *p) {
905
  UNUSED_P(enc);
906
  return BIG2_IS_NAME_CHAR_MINBPC(p);
907
}
908
909
static int PTRFASTCALL
910
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
911
  UNUSED_P(enc);
912
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
913
}
914
915
#  undef VTABLE
916
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
917
918
#else /* not XML_MIN_SIZE */
919
920
#  undef PREFIX
921
0
#  define PREFIX(ident) big2_##ident
922
317k
#  define MINBPC(enc) 2
923
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
924
158k
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
925
0
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
926
0
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
927
16
/* The n-byte name tests are constant 0 here; only the MINBPC variants
   consult the naming tables. */
#  define IS_NAME_CHAR(enc, p, n) 0
928
79.3k
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
929
16
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
930
92
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
931
932
#  define XML_TOK_IMPL_C
933
#  include "xmltok_impl.c"
934
#  undef XML_TOK_IMPL_C
935
936
#  undef MINBPC
937
#  undef BYTE_TYPE
938
#  undef BYTE_TO_ASCII
939
#  undef CHAR_MATCHES
940
#  undef IS_NAME_CHAR
941
#  undef IS_NAME_CHAR_MINBPC
942
#  undef IS_NMSTRT_CHAR
943
#  undef IS_NMSTRT_CHAR_MINBPC
944
#  undef IS_INVALID_CHAR
945
946
#endif /* not XML_MIN_SIZE */
947
948
/* Namespace-aware big-endian 2-byte encoding (XML_NS builds only).  The
   byte-type table is asciitab.h followed by latin1tab.h.  The last scalar
   in the first initializer group is 1 only when BYTEORDER is 4321.
   NOTE(review): the three scalars after VTABLE presumably initialise
   minBytesPerChar, isUtf8 and isUtf16 - confirm against the ENCODING
   declaration. */
#ifdef XML_NS
949
950
static const struct normal_encoding big2_encoding_ns
951
    = {{VTABLE, 2, 0,
952
#  if BYTEORDER == 4321
953
        1
954
#  else
955
        0
956
#  endif
957
       },
958
       {
959
#  include "asciitab.h"
960
#  include "latin1tab.h"
961
       },
962
       STANDARD_VTABLE(big2_) NULL_VTABLE};
963
964
#endif
965
966
/* Big-endian 2-byte encoding without namespace processing.  Identical in
   layout to the _ns variant, except that BT_COLON is defined as BT_NMSTRT
   while asciitab.h is included, so any table entry that header spells
   BT_COLON becomes BT_NMSTRT here. */
static const struct normal_encoding big2_encoding
967
    = {{VTABLE, 2, 0,
968
#if BYTEORDER == 4321
969
        1
970
#else
971
        0
972
#endif
973
       },
974
       {
975
#define BT_COLON BT_NMSTRT
976
#include "asciitab.h"
977
#undef BT_COLON
978
#include "latin1tab.h"
979
       },
980
       STANDARD_VTABLE(big2_) NULL_VTABLE};
981
982
/* "Internal" big-endian 2-byte encodings, compiled unless BYTEORDER is
   1234.  They differ from big2_encoding(_ns) in using iasciitab.h instead
   of asciitab.h and in always having 1 as the last scalar of the first
   initializer group.  The _ns variant exists only under XML_NS; the plain
   variant maps BT_COLON to BT_NMSTRT while including iasciitab.h. */
#if BYTEORDER != 1234
983
984
#  ifdef XML_NS
985
986
static const struct normal_encoding internal_big2_encoding_ns
987
    = {{VTABLE, 2, 0, 1},
988
       {
989
#    include "iasciitab.h"
990
#    include "latin1tab.h"
991
       },
992
       STANDARD_VTABLE(big2_) NULL_VTABLE};
993
994
#  endif
995
996
static const struct normal_encoding internal_big2_encoding
997
    = {{VTABLE, 2, 0, 1},
998
       {
999
#  define BT_COLON BT_NMSTRT
1000
#  include "iasciitab.h"
1001
#  undef BT_COLON
1002
#  include "latin1tab.h"
1003
       },
1004
       STANDARD_VTABLE(big2_) NULL_VTABLE};
1005
1006
#endif
1007
1008
#undef PREFIX
1009
1010
static int FASTCALL
1011
3.22k
streqci(const char *s1, const char *s2) {
1012
17.6k
  for (;;) {
1013
17.6k
    char c1 = *s1++;
1014
17.6k
    char c2 = *s2++;
1015
17.6k
    if (ASCII_a <= c1 && c1 <= ASCII_z)
1016
610
      c1 += ASCII_A - ASCII_a;
1017
17.6k
    if (ASCII_a <= c2 && c2 <= ASCII_z)
1018
      /* The following line will never get executed.  streqci() is
1019
       * only called from two places, both of which guarantee to put
1020
       * upper-case strings into s2.
1021
       */
1022
0
      c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1023
17.6k
    if (c1 != c2)
1024
1.80k
      return 0;
1025
15.8k
    if (! c1)
1026
1.41k
      break;
1027
15.8k
  }
1028
1.41k
  return 1;
1029
3.22k
}
1030
1031
static void PTRCALL
1032
initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1033
27
                   POSITION *pos) {
1034
27
  UNUSED_P(enc);
1035
27
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1036
27
}
1037
1038
static int
1039
215k
toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1040
215k
  char buf[1];
1041
215k
  char *p = buf;
1042
215k
  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1043
215k
  if (p == buf)
1044
566
    return -1;
1045
214k
  else
1046
214k
    return buf[0];
1047
215k
}
1048
1049
static int FASTCALL
1050
41.0k
isSpace(int c) {
1051
41.0k
  switch (c) {
1052
4.21k
  case 0x20:
1053
6.87k
  case 0xD:
1054
9.07k
  case 0xA:
1055
10.0k
  case 0x9:
1056
10.0k
    return 1;
1057
41.0k
  }
1058
31.0k
  return 0;
1059
41.0k
}
1060
1061
/* Return 1 if there's just optional white space or there's an S
1062
   followed by name=val.
1063
*/
1064
static int
1065
parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1066
                     const char **namePtr, const char **nameEndPtr,
1067
4.83k
                     const char **valPtr, const char **nextTokPtr) {
1068
4.83k
  int c;
1069
4.83k
  char open;
1070
4.83k
  if (ptr == end) {
1071
1.16k
    *namePtr = NULL;
1072
1.16k
    return 1;
1073
1.16k
  }
1074
3.66k
  if (! isSpace(toAscii(enc, ptr, end))) {
1075
8
    *nextTokPtr = ptr;
1076
8
    return 0;
1077
8
  }
1078
4.61k
  do {
1079
4.61k
    ptr += enc->minBytesPerChar;
1080
4.61k
  } while (isSpace(toAscii(enc, ptr, end)));
1081
3.65k
  if (ptr == end) {
1082
349
    *namePtr = NULL;
1083
349
    return 1;
1084
349
  }
1085
3.31k
  *namePtr = ptr;
1086
27.2k
  for (;;) {
1087
27.2k
    c = toAscii(enc, ptr, end);
1088
27.2k
    if (c == -1) {
1089
15
      *nextTokPtr = ptr;
1090
15
      return 0;
1091
15
    }
1092
27.2k
    if (c == ASCII_EQUALS) {
1093
3.22k
      *nameEndPtr = ptr;
1094
3.22k
      break;
1095
3.22k
    }
1096
24.0k
    if (isSpace(c)) {
1097
75
      *nameEndPtr = ptr;
1098
3.40k
      do {
1099
3.40k
        ptr += enc->minBytesPerChar;
1100
3.40k
      } while (isSpace(c = toAscii(enc, ptr, end)));
1101
75
      if (c != ASCII_EQUALS) {
1102
54
        *nextTokPtr = ptr;
1103
54
        return 0;
1104
54
      }
1105
21
      break;
1106
75
    }
1107
23.9k
    ptr += enc->minBytesPerChar;
1108
23.9k
  }
1109
3.24k
  if (ptr == *namePtr) {
1110
2
    *nextTokPtr = ptr;
1111
2
    return 0;
1112
2
  }
1113
3.23k
  ptr += enc->minBytesPerChar;
1114
3.23k
  c = toAscii(enc, ptr, end);
1115
4.19k
  while (isSpace(c)) {
1116
956
    ptr += enc->minBytesPerChar;
1117
956
    c = toAscii(enc, ptr, end);
1118
956
  }
1119
3.23k
  if (c != ASCII_QUOT && c != ASCII_APOS) {
1120
44
    *nextTokPtr = ptr;
1121
44
    return 0;
1122
44
  }
1123
3.19k
  open = (char)c;
1124
3.19k
  ptr += enc->minBytesPerChar;
1125
3.19k
  *valPtr = ptr;
1126
169k
  for (;; ptr += enc->minBytesPerChar) {
1127
169k
    c = toAscii(enc, ptr, end);
1128
169k
    if (c == open)
1129
3.12k
      break;
1130
166k
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1131
166k
        && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1132
166k
        && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1133
70
      *nextTokPtr = ptr;
1134
70
      return 0;
1135
70
    }
1136
166k
  }
1137
3.12k
  *nextTokPtr = ptr + enc->minBytesPerChar;
1138
3.12k
  return 1;
1139
3.19k
}
1140
1141
/* NUL-terminated keyword strings spelled with the ASCII_* constants.
   doParseXmlDecl() compares pseudo-attribute names against KW_version,
   KW_encoding and KW_standalone, and the standalone value against KW_yes
   and KW_no. */
static const char KW_version[]
1142
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1143
1144
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1145
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};
1146
1147
static const char KW_standalone[]
1148
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1149
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1150
1151
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1152
1153
static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1154
1155
static int
1156
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1157
                                                 const char *),
1158
               int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1159
               const char *end, const char **badPtr, const char **versionPtr,
1160
               const char **versionEndPtr, const char **encodingName,
1161
1.78k
               const ENCODING **encoding, int *standalone) {
1162
1.78k
  const char *val = NULL;
1163
1.78k
  const char *name = NULL;
1164
1.78k
  const char *nameEnd = NULL;
1165
1.78k
  ptr += 5 * enc->minBytesPerChar;
1166
1.78k
  end -= 2 * enc->minBytesPerChar;
1167
1.78k
  if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1168
1.78k
      || ! name) {
1169
212
    *badPtr = ptr;
1170
212
    return 0;
1171
212
  }
1172
1.57k
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1173
3
    if (! isGeneralTextEntity) {
1174
3
      *badPtr = name;
1175
3
      return 0;
1176
3
    }
1177
1.57k
  } else {
1178
1.57k
    if (versionPtr)
1179
1.57k
      *versionPtr = val;
1180
1.57k
    if (versionEndPtr)
1181
1.57k
      *versionEndPtr = ptr;
1182
1.57k
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1183
10
      *badPtr = ptr;
1184
10
      return 0;
1185
10
    }
1186
1.56k
    if (! name) {
1187
13
      if (isGeneralTextEntity) {
1188
        /* a TextDecl must have an EncodingDecl */
1189
0
        *badPtr = ptr;
1190
0
        return 0;
1191
0
      }
1192
13
      return 1;
1193
13
    }
1194
1.56k
  }
1195
1.54k
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1196
1.47k
    int c = toAscii(enc, val, end);
1197
1.47k
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1198
3
      *badPtr = val;
1199
3
      return 0;
1200
3
    }
1201
1.47k
    if (encodingName)
1202
1.47k
      *encodingName = val;
1203
1.47k
    if (encoding)
1204
1.47k
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1205
1.47k
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1206
11
      *badPtr = ptr;
1207
11
      return 0;
1208
11
    }
1209
1.46k
    if (! name)
1210
1.45k
      return 1;
1211
1.46k
  }
1212
76
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1213
76
      || isGeneralTextEntity) {
1214
3
    *badPtr = name;
1215
3
    return 0;
1216
3
  }
1217
73
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1218
34
    if (standalone)
1219
34
      *standalone = 1;
1220
39
  } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1221
38
    if (standalone)
1222
38
      *standalone = 0;
1223
38
  } else {
1224
1
    *badPtr = val;
1225
1
    return 0;
1226
1
  }
1227
1.13k
  while (isSpace(toAscii(enc, ptr, end)))
1228
1.06k
    ptr += enc->minBytesPerChar;
1229
72
  if (ptr != end) {
1230
2
    *badPtr = ptr;
1231
2
    return 0;
1232
2
  }
1233
70
  return 1;
1234
72
}
1235
1236
static int FASTCALL
1237
64.6k
checkCharRefNumber(int result) {
1238
64.6k
  switch (result >> 8) {
1239
1
  case 0xD8:
1240
2
  case 0xD9:
1241
3
  case 0xDA:
1242
4
  case 0xDB:
1243
5
  case 0xDC:
1244
6
  case 0xDD:
1245
8
  case 0xDE:
1246
9
  case 0xDF:
1247
9
    return -1;
1248
8.86k
  case 0:
1249
8.86k
    if (latin1_encoding.type[result] == BT_NONXML)
1250
25
      return -1;
1251
8.83k
    break;
1252
8.83k
  case 0xFF:
1253
977
    if (result == 0xFFFE || result == 0xFFFF)
1254
4
      return -1;
1255
973
    break;
1256
64.6k
  }
1257
64.5k
  return result;
1258
64.6k
}
1259
1260
int FASTCALL
1261
64.1k
XmlUtf8Encode(int c, char *buf) {
1262
64.1k
  enum {
1263
    /* minN is minimum legal resulting value for N byte sequence */
1264
64.1k
    min2 = 0x80,
1265
64.1k
    min3 = 0x800,
1266
64.1k
    min4 = 0x10000
1267
64.1k
  };
1268
1269
64.1k
  if (c < 0)
1270
0
    return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1271
64.1k
  if (c < min2) {
1272
5.36k
    buf[0] = (char)(c | UTF8_cval1);
1273
5.36k
    return 1;
1274
5.36k
  }
1275
58.8k
  if (c < min3) {
1276
7.16k
    buf[0] = (char)((c >> 6) | UTF8_cval2);
1277
7.16k
    buf[1] = (char)((c & 0x3f) | 0x80);
1278
7.16k
    return 2;
1279
7.16k
  }
1280
51.6k
  if (c < min4) {
1281
19.8k
    buf[0] = (char)((c >> 12) | UTF8_cval3);
1282
19.8k
    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1283
19.8k
    buf[2] = (char)((c & 0x3f) | 0x80);
1284
19.8k
    return 3;
1285
19.8k
  }
1286
31.7k
  if (c < 0x110000) {
1287
31.7k
    buf[0] = (char)((c >> 18) | UTF8_cval4);
1288
31.7k
    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1289
31.7k
    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1290
31.7k
    buf[3] = (char)((c & 0x3f) | 0x80);
1291
31.7k
    return 4;
1292
31.7k
  }
1293
0
  return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1294
31.7k
}
1295
1296
int FASTCALL
1297
0
XmlUtf16Encode(int charNum, unsigned short *buf) {
1298
0
  if (charNum < 0)
1299
0
    return 0;
1300
0
  if (charNum < 0x10000) {
1301
0
    buf[0] = (unsigned short)charNum;
1302
0
    return 1;
1303
0
  }
1304
0
  if (charNum < 0x110000) {
1305
0
    charNum -= 0x10000;
1306
0
    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1307
0
    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1308
0
    return 2;
1309
0
  }
1310
0
  return 0;
1311
0
}
1312
1313
/* Encoding object for a caller-supplied single-/multi-byte encoding.
   The unknown_* functions cast an ENCODING pointer to this struct via
   AS_UNKNOWN_ENCODING. */
struct unknown_encoding {
1314
  /* base encoding; XmlInitUnknownEncoding() hands out &normal.enc */
  struct normal_encoding normal;
1315
  /* callback that maps a multi-byte sequence to a character number */
  CONVERTER convert;
1316
  /* opaque pointer passed as the first argument of convert */
  void *userData;
1317
  /* per lead byte: the UTF-16 unit, or 0 when convert must be called */
  unsigned short utf16[256];
1318
  /* per lead byte: [0] is the UTF-8 length (0 when convert must be
     called), followed by the UTF-8 bytes */
  char utf8[256][4];
1319
};
1320
1321
0
/* View an ENCODING pointer as the struct unknown_encoding it belongs to. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1322
1323
int
1324
0
XmlSizeOfUnknownEncoding(void) {
1325
0
  return sizeof(struct unknown_encoding);
1326
0
}
1327
1328
static int PTRFASTCALL
1329
0
unknown_isName(const ENCODING *enc, const char *p) {
1330
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1331
0
  int c = uenc->convert(uenc->userData, p);
1332
0
  if (c & ~0xFFFF)
1333
0
    return 0;
1334
0
  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1335
0
}
1336
1337
static int PTRFASTCALL
1338
0
unknown_isNmstrt(const ENCODING *enc, const char *p) {
1339
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1340
0
  int c = uenc->convert(uenc->userData, p);
1341
0
  if (c & ~0xFFFF)
1342
0
    return 0;
1343
0
  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1344
0
}
1345
1346
static int PTRFASTCALL
1347
0
unknown_isInvalid(const ENCODING *enc, const char *p) {
1348
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1349
0
  int c = uenc->convert(uenc->userData, p);
1350
0
  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1351
0
}
1352
1353
static enum XML_Convert_Result PTRCALL
1354
unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1355
0
               char **toP, const char *toLim) {
1356
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1357
0
  char buf[XML_UTF8_ENCODE_MAX];
1358
0
  for (;;) {
1359
0
    const char *utf8;
1360
0
    int n;
1361
0
    if (*fromP == fromLim)
1362
0
      return XML_CONVERT_COMPLETED;
1363
0
    utf8 = uenc->utf8[(unsigned char)**fromP];
1364
0
    n = *utf8++;
1365
0
    if (n == 0) {
1366
0
      int c = uenc->convert(uenc->userData, *fromP);
1367
0
      n = XmlUtf8Encode(c, buf);
1368
0
      if (n > toLim - *toP)
1369
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
1370
0
      utf8 = buf;
1371
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1372
0
                 - (BT_LEAD2 - 2));
1373
0
    } else {
1374
0
      if (n > toLim - *toP)
1375
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
1376
0
      (*fromP)++;
1377
0
    }
1378
0
    memcpy(*toP, utf8, n);
1379
0
    *toP += n;
1380
0
  }
1381
0
}
1382
1383
static enum XML_Convert_Result PTRCALL
1384
unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1385
0
                unsigned short **toP, const unsigned short *toLim) {
1386
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1387
0
  while (*fromP < fromLim && *toP < toLim) {
1388
0
    unsigned short c = uenc->utf16[(unsigned char)**fromP];
1389
0
    if (c == 0) {
1390
0
      c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1391
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1392
0
                 - (BT_LEAD2 - 2));
1393
0
    } else
1394
0
      (*fromP)++;
1395
0
    *(*toP)++ = c;
1396
0
  }
1397
1398
0
  if ((*toP == toLim) && (*fromP < fromLim))
1399
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
1400
0
  else
1401
0
    return XML_CONVERT_COMPLETED;
1402
0
}
1403
1404
ENCODING *
1405
XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1406
0
                       void *userData) {
1407
0
  int i;
1408
0
  struct unknown_encoding *e = (struct unknown_encoding *)mem;
1409
0
  memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1410
0
  for (i = 0; i < 128; i++)
1411
0
    if (latin1_encoding.type[i] != BT_OTHER
1412
0
        && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1413
0
      return 0;
1414
0
  for (i = 0; i < 256; i++) {
1415
0
    int c = table[i];
1416
0
    if (c == -1) {
1417
0
      e->normal.type[i] = BT_MALFORM;
1418
      /* This shouldn't really get used. */
1419
0
      e->utf16[i] = 0xFFFF;
1420
0
      e->utf8[i][0] = 1;
1421
0
      e->utf8[i][1] = 0;
1422
0
    } else if (c < 0) {
1423
0
      if (c < -4)
1424
0
        return 0;
1425
      /* Multi-byte sequences need a converter function */
1426
0
      if (! convert)
1427
0
        return 0;
1428
0
      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1429
0
      e->utf8[i][0] = 0;
1430
0
      e->utf16[i] = 0;
1431
0
    } else if (c < 0x80) {
1432
0
      if (latin1_encoding.type[c] != BT_OTHER
1433
0
          && latin1_encoding.type[c] != BT_NONXML && c != i)
1434
0
        return 0;
1435
0
      e->normal.type[i] = latin1_encoding.type[c];
1436
0
      e->utf8[i][0] = 1;
1437
0
      e->utf8[i][1] = (char)c;
1438
0
      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1439
0
    } else if (checkCharRefNumber(c) < 0) {
1440
0
      e->normal.type[i] = BT_NONXML;
1441
      /* This shouldn't really get used. */
1442
0
      e->utf16[i] = 0xFFFF;
1443
0
      e->utf8[i][0] = 1;
1444
0
      e->utf8[i][1] = 0;
1445
0
    } else {
1446
0
      if (c > 0xFFFF)
1447
0
        return 0;
1448
0
      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1449
0
        e->normal.type[i] = BT_NMSTRT;
1450
0
      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1451
0
        e->normal.type[i] = BT_NAME;
1452
0
      else
1453
0
        e->normal.type[i] = BT_OTHER;
1454
0
      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1455
0
      e->utf16[i] = (unsigned short)c;
1456
0
    }
1457
0
  }
1458
0
  e->userData = userData;
1459
0
  e->convert = convert;
1460
0
  if (convert) {
1461
0
    e->normal.isName2 = unknown_isName;
1462
0
    e->normal.isName3 = unknown_isName;
1463
0
    e->normal.isName4 = unknown_isName;
1464
0
    e->normal.isNmstrt2 = unknown_isNmstrt;
1465
0
    e->normal.isNmstrt3 = unknown_isNmstrt;
1466
0
    e->normal.isNmstrt4 = unknown_isNmstrt;
1467
0
    e->normal.isInvalid2 = unknown_isInvalid;
1468
0
    e->normal.isInvalid3 = unknown_isInvalid;
1469
0
    e->normal.isInvalid4 = unknown_isInvalid;
1470
0
  }
1471
0
  e->normal.enc.utf8Convert = unknown_toUtf8;
1472
0
  e->normal.enc.utf16Convert = unknown_toUtf16;
1473
0
  return &(e->normal.enc);
1474
0
}
1475
1476
/* If this enumeration is changed, getEncodingIndex and encodings
1477
must also be changed.  getEncodingIndex() returns the position of a name
in its encodingNames array as one of these values, so the constants from
ISO_8859_1_ENC to UTF_16LE_ENC must stay in the same order as that array.
UNKNOWN_ENC is returned for an unrecognised name and NO_ENC when no name
was given. */
1478
enum {
1479
  UNKNOWN_ENC = -1,
1480
  ISO_8859_1_ENC = 0,
1481
  US_ASCII_ENC,
1482
  UTF_8_ENC,
1483
  UTF_16_ENC,
1484
  UTF_16BE_ENC,
1485
  UTF_16LE_ENC,
1486
  /* must match encodingNames up to here */
1487
  NO_ENC
1488
};
1489
1490
/* NUL-terminated, upper-case names of the built-in encodings, spelled
   with the ASCII_* constants.  getEncodingIndex() matches a requested
   name against these, ignoring ASCII case. */
static const char KW_ISO_8859_1[]
1491
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
1492
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
1493
static const char KW_US_ASCII[]
1494
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1495
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
1496
static const char KW_UTF_8[]
1497
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1498
static const char KW_UTF_16[]
1499
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1500
static const char KW_UTF_16BE[]
1501
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1502
       ASCII_6, ASCII_B, ASCII_E, '\0'};
1503
static const char KW_UTF_16LE[]
1504
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1505
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1506
1507
static int FASTCALL
1508
21.5k
getEncodingIndex(const char *name) {
1509
21.5k
  static const char *const encodingNames[] = {
1510
21.5k
      KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1511
21.5k
  };
1512
21.5k
  int i;
1513
21.5k
  if (name == NULL)
1514
20.0k
    return NO_ENC;
1515
1.80k
  for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1516
1.76k
    if (streqci(name, encodingNames[i]))
1517
1.41k
      return i;
1518
48
  return UNKNOWN_ENC;
1519
1.46k
}
1520
1521
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int.  SET_INIT_ENC_INDEX
   stores i converted to char; i is parenthesised so that the cast applies
   to the whole argument expression rather than to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1527
1528
/* This is what detects the encoding.  encodingTable maps from
1529
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1530
   the external (protocol) specified encoding; state is
1531
   XML_CONTENT_STATE if we're parsing an external text entity, and
1532
   XML_PROLOG_STATE otherwise.
1533
*/
1534
1535
static int
1536
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1537
10.0k
         int state, const char *ptr, const char *end, const char **nextTokPtr) {
1538
10.0k
  const ENCODING **encPtr;
1539
1540
10.0k
  if (ptr >= end)
1541
2
    return XML_TOK_NONE;
1542
10.0k
  encPtr = enc->encPtr;
1543
10.0k
  if (ptr + 1 == end) {
1544
    /* only a single byte available for auto-detection */
1545
16
#ifndef XML_DTD /* FIXME */
1546
    /* a well-formed document entity must have more than one byte */
1547
16
    if (state != XML_CONTENT_STATE)
1548
16
      return XML_TOK_PARTIAL;
1549
0
#endif
1550
    /* so we're parsing an external text entity... */
1551
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
1552
0
    switch (INIT_ENC_INDEX(enc)) {
1553
0
    case UTF_16_ENC:
1554
0
    case UTF_16LE_ENC:
1555
0
    case UTF_16BE_ENC:
1556
0
      return XML_TOK_PARTIAL;
1557
0
    }
1558
0
    switch ((unsigned char)*ptr) {
1559
0
    case 0xFE:
1560
0
    case 0xFF:
1561
0
    case 0xEF: /* possibly first byte of UTF-8 BOM */
1562
0
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1563
0
        break;
1564
      /* fall through */
1565
0
    case 0x00:
1566
0
    case 0x3C:
1567
0
      return XML_TOK_PARTIAL;
1568
0
    }
1569
10.0k
  } else {
1570
10.0k
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1571
103
    case 0xFEFF:
1572
103
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1573
0
        break;
1574
103
      *nextTokPtr = ptr + 2;
1575
103
      *encPtr = encodingTable[UTF_16BE_ENC];
1576
103
      return XML_TOK_BOM;
1577
    /* 00 3C is handled in the default case */
1578
0
    case 0x3C00:
1579
0
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1580
0
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1581
0
          && state == XML_CONTENT_STATE)
1582
0
        break;
1583
0
      *encPtr = encodingTable[UTF_16LE_ENC];
1584
0
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1585
115
    case 0xFFFE:
1586
115
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1587
0
        break;
1588
115
      *nextTokPtr = ptr + 2;
1589
115
      *encPtr = encodingTable[UTF_16LE_ENC];
1590
115
      return XML_TOK_BOM;
1591
139
    case 0xEFBB:
1592
      /* Maybe a UTF-8 BOM (EF BB BF) */
1593
      /* If there's an explicitly specified (external) encoding
1594
         of ISO-8859-1 or some flavour of UTF-16
1595
         and this is an external text entity,
1596
         don't look for the BOM,
1597
         because it might be a legal data.
1598
      */
1599
139
      if (state == XML_CONTENT_STATE) {
1600
0
        int e = INIT_ENC_INDEX(enc);
1601
0
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1602
0
            || e == UTF_16_ENC)
1603
0
          break;
1604
0
      }
1605
139
      if (ptr + 2 == end)
1606
2
        return XML_TOK_PARTIAL;
1607
137
      if ((unsigned char)ptr[2] == 0xBF) {
1608
133
        *nextTokPtr = ptr + 3;
1609
133
        *encPtr = encodingTable[UTF_8_ENC];
1610
133
        return XML_TOK_BOM;
1611
133
      }
1612
4
      break;
1613
9.65k
    default:
1614
9.65k
      if (ptr[0] == '\0') {
1615
        /* 0 isn't a legal data character. Furthermore a document
1616
           entity can only start with ASCII characters.  So the only
1617
           way this can fail to be big-endian UTF-16 if it it's an
1618
           external parsed general entity that's labelled as
1619
           UTF-16LE.
1620
        */
1621
0
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1622
0
          break;
1623
0
        *encPtr = encodingTable[UTF_16BE_ENC];
1624
0
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1625
9.65k
      } else if (ptr[1] == '\0') {
1626
        /* We could recover here in the case:
1627
            - parsing an external entity
1628
            - second byte is 0
1629
            - no externally specified encoding
1630
            - no encoding declaration
1631
           by assuming UTF-16LE.  But we don't, because this would mean when
1632
           presented just with a single byte, we couldn't reliably determine
1633
           whether we needed further bytes.
1634
        */
1635
0
        if (state == XML_CONTENT_STATE)
1636
0
          break;
1637
0
        *encPtr = encodingTable[UTF_16LE_ENC];
1638
0
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1639
0
      }
1640
9.65k
      break;
1641
10.0k
    }
1642
10.0k
  }
1643
9.65k
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1644
9.65k
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1645
10.0k
}
1646
1647
53.3k
/* Instantiate xmltok_ns.c with NS()/ns() as identity macros, i.e. with
   the identifiers it builds left without a suffix. */
#define NS(x) x
1648
10.0k
#define ns(x) x
1649
#define XML_TOK_NS_C
1650
#include "xmltok_ns.c"
1651
#undef XML_TOK_NS_C
1652
#undef NS
1653
#undef ns
1654
1655
#ifdef XML_NS
1656
1657
/* Instantiate xmltok_ns.c a second time, now with NS() appending "NS"
   and ns() appending "_ns" to the identifiers it builds. */
#  define NS(x) x##NS
1658
#  define ns(x) x##_ns
1659
1660
#  define XML_TOK_NS_C
1661
#  include "xmltok_ns.c"
1662
#  undef XML_TOK_NS_C
1663
1664
#  undef NS
1665
#  undef ns
1666
1667
ENCODING *
1668
XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1669
                         void *userData) {
1670
  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1671
  if (enc)
1672
    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1673
  return enc;
1674
}
1675
1676
#endif /* XML_NS */