Coverage Report

Created: 2026-01-25 07:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libexpat/expat/lib/xmltok.c
Line
Count
Source
1
/*
2
                            __  __            _
3
                         ___\ \/ /_ __   __ _| |_
4
                        / _ \\  /| '_ \ / _` | __|
5
                       |  __//  \| |_) | (_| | |_
6
                        \___/_/\_\ .__/ \__,_|\__|
7
                                 |_| XML parser
8
9
   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10
   Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11
   Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12
   Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
13
   Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14
   Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15
   Copyright (c) 2016-2024 Sebastian Pipping <sebastian@pipping.org>
16
   Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
17
   Copyright (c) 2016      Don Lewis <truckman@apache.org>
18
   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
19
   Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
20
   Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
21
   Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
22
   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
23
   Copyright (c) 2021      Donghee Na <donghee.na@python.org>
24
   Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
25
   Copyright (c) 2022      Sean McBride <sean@rogue-research.com>
26
   Copyright (c) 2023      Hanno Böck <hanno@gentoo.org>
27
   Licensed under the MIT license:
28
29
   Permission is  hereby granted,  free of charge,  to any  person obtaining
30
   a  copy  of  this  software   and  associated  documentation  files  (the
31
   "Software"),  to  deal in  the  Software  without restriction,  including
32
   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
33
   distribute, sublicense, and/or sell copies of the Software, and to permit
34
   persons  to whom  the Software  is  furnished to  do so,  subject to  the
35
   following conditions:
36
37
   The above copyright  notice and this permission notice  shall be included
38
   in all copies or substantial portions of the Software.
39
40
   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
41
   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
42
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
43
   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
44
   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
45
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
46
   USE OR OTHER DEALINGS IN THE SOFTWARE.
47
*/
48
49
#include "expat_config.h"
50
51
#include <stddef.h>
52
#include <string.h> /* memcpy */
53
#include <stdbool.h>
54
55
#ifdef _WIN32
56
#  include "winconfig.h"
57
#endif
58
59
#include "internal.h"
60
#include "xmltok.h"
61
#include "nametab.h"
62
63
/* Optional extra entry for the first braced group of VTABLE1.  It is only
   present when XML_DTD is defined and carries its own leading comma, so the
   group stays well-formed when it expands to nothing. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment listing the PREFIX()-qualified functions of one
   tokenizer flavour: a braced group of scanners, a braced group of literal
   scanners, and then the remaining entries up to isPublicId. */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii),                                                \
      PREFIX(nameLength),                                                      \
      PREFIX(skipS),                                                           \
      PREFIX(getAtts),                                                         \
      PREFIX(charRefNumber),                                                   \
      PREFIX(predefinedEntityName),                                            \
      PREFIX(updatePosition),                                                  \
      PREFIX(isPublicId)

/* VTABLE1 followed by the two PREFIX()-qualified converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
78
79
/* Look up a UCS-2 character (high byte `hi`, low byte `lo`) in namingBitmap.
   pages[hi] selects a group of eight bitmap words, the top three bits of
   `lo` select the word within that group and the low five bits of `lo`
   select the bit.  Evaluates to non-zero when that bit is set.

   Every parameter is parenthesized so that expression arguments (for
   example `table + 1`) are indexed as a whole. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
81
82
/* A 2 byte UTF-8 representation splits the characters 11 bits between
83
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
84
   pages, 3 bits to add to that index and 5 bits to generate the mask.
85
*/
86
/* namingBitmap lookup for the 2-byte UTF-8 sequence at `byte`.
   Bits 4..2 of the first byte index `pages`; bits 1..0 of the first byte
   together with bit 5 of the second byte pick the word inside the page, and
   the low five bits of the second byte pick the bit.  Evaluates to non-zero
   when that bit is set. */
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[((byte)[0] >> 2) & 0x07] << 3)                        \
                + (((byte)[0] & 0x03) << 1) + (((byte)[1] >> 5) & 0x01)]       \
   & (1u << ((byte)[1] & 0x1F)))
90
91
/* A 3 byte UTF-8 representation splits the characters 16 bits between
92
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
93
   into pages, 3 bits to add to that index and 5 bits to generate the
94
   mask.
95
*/
96
/* namingBitmap lookup for the 3-byte UTF-8 sequence at `byte`.
   The low four bits of the first byte and bits 5..2 of the second byte index
   `pages`; bits 1..0 of the second byte together with bit 5 of the third
   byte pick the word inside the page, and the low five bits of the third
   byte pick the bit.  Evaluates to non-zero when that bit is set. */
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[(((byte)[0] & 0x0F) << 4) + (((byte)[1] >> 2) & 0x0F)] << 3)  \
        + (((byte)[1] & 0x03) << 1) + (((byte)[2] >> 5) & 0x01)]               \
   & (1u << ((byte)[2] & 0x1F)))
102
103
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
104
   of Unicode 3.2: https://www.unicode.org/unicode/reports/tr28/
105
   with the additional restriction of not allowing the Unicode
106
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
107
   Implementation details:
108
     (A & 0x80) == 0     means A < 0x80
109
   and
110
     (A & 0xC0) == 0xC0  means A > 0xBF
111
*/
112
113
/* Non-zero when the two bytes at p (a pointer to unsigned char) do not form
   an acceptable 2-byte UTF-8 sequence: the lead byte is below 0xC2, or the
   second byte is outside 0x80..0xBF.

   The lead byte is read as (p)[0] so that the argument is parenthesized on
   every use; with a bare `*p`, an argument such as `buf + 1` would be
   evaluated as `*buf + 1`. */
#define UTF8_INVALID2(p)                                                       \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
115
116
/* Non-zero when the three bytes at p (a pointer to unsigned char) do not
   form an acceptable 3-byte UTF-8 sequence:
     - the third byte must lie in 0x80..0xBF, and after EF BF it must not
       exceed 0xBD (this rejects EF BF BE and EF BF BF);
     - the second byte must lie in 0x80..0xBF, narrowed to 0xA0..0xBF after
       a lead byte of 0xE0 and to 0x80..0x9F after a lead byte of 0xED.

   The lead byte is read as (p)[0] so that the argument is parenthesized on
   every use; with a bare `*p`, an argument such as `buf + 1` would be
   evaluated as `*buf + 1`. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((p)[0] == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((p)[0] == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
124
125
/* Non-zero when the four bytes at p (a pointer to unsigned char) do not form
   an acceptable 4-byte UTF-8 sequence:
     - the third and fourth bytes must lie in 0x80..0xBF;
     - the second byte must lie in 0x80..0xBF, narrowed to 0x90..0xBF after
       a lead byte of 0xF0 and to 0x80..0x8F after a lead byte of 0xF4.

   The lead byte is read as (p)[0] so that the argument is parenthesized on
   every use; with a bare `*p`, an argument such as `buf + 1` would be
   evaluated as `*buf + 1`. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((p)[0] == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
132
133
static int PTRFASTCALL
134
55
isNever(const ENCODING *enc, const char *p) {
135
55
  UNUSED_P(enc);
136
55
  UNUSED_P(p);
137
55
  return 0;
138
55
}
139
140
static int PTRFASTCALL
141
95
utf8_isName2(const ENCODING *enc, const char *p) {
142
95
  UNUSED_P(enc);
143
95
  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
144
95
}
145
146
static int PTRFASTCALL
147
96
utf8_isName3(const ENCODING *enc, const char *p) {
148
96
  UNUSED_P(enc);
149
96
  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
150
96
}
151
152
#define utf8_isName4 isNever
153
154
static int PTRFASTCALL
155
46
utf8_isNmstrt2(const ENCODING *enc, const char *p) {
156
46
  UNUSED_P(enc);
157
46
  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
158
46
}
159
160
static int PTRFASTCALL
161
75
utf8_isNmstrt3(const ENCODING *enc, const char *p) {
162
75
  UNUSED_P(enc);
163
75
  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
164
75
}
165
166
#define utf8_isNmstrt4 isNever
167
168
static int PTRFASTCALL
169
519
utf8_isInvalid2(const ENCODING *enc, const char *p) {
170
519
  UNUSED_P(enc);
171
519
  return UTF8_INVALID2((const unsigned char *)p);
172
519
}
173
174
static int PTRFASTCALL
175
1.04k
utf8_isInvalid3(const ENCODING *enc, const char *p) {
176
1.04k
  UNUSED_P(enc);
177
1.04k
  return UTF8_INVALID3((const unsigned char *)p);
178
1.04k
}
179
180
static int PTRFASTCALL
181
597
utf8_isInvalid4(const ENCODING *enc, const char *p) {
182
597
  UNUSED_P(enc);
183
597
  return UTF8_INVALID4((const unsigned char *)p);
184
597
}
185
186
struct normal_encoding {
187
  ENCODING enc;
188
  unsigned char type[256];
189
#ifdef XML_MIN_SIZE
190
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
191
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
192
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
193
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
194
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
195
#endif /* XML_MIN_SIZE */
196
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
197
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
198
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
199
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
200
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
201
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
202
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
203
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
204
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
205
};
206
207
2.52k
/* Reinterpret an ENCODING pointer as a pointer to struct normal_encoding. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

/* Initializers for the XML_MIN_SIZE-only members of struct normal_encoding,
   built from identifier prefix E.  The expansion ends with a comma so that it
   can be followed directly by NORMAL_VTABLE or NULL_VTABLE; without
   XML_MIN_SIZE it expands to nothing. */
#ifdef XML_MIN_SIZE
#  define STANDARD_VTABLE(E)                                                   \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
#else
#  define STANDARD_VTABLE(E) /* as nothing */
#endif

/* Initializers for the nine per-length predicate members, built from
   identifier prefix E. */
#define NORMAL_VTABLE(E)                                                       \
  E##isName2, E##isName3, E##isName4,                                          \
      E##isNmstrt2, E##isNmstrt3, E##isNmstrt4,                                \
      E##isInvalid2, E##isInvalid3, E##isInvalid4

/* Same nine slots as NORMAL_VTABLE, all set to NULL. */
#define NULL_VTABLE                                                            \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
228
229
static int FASTCALL checkCharRefNumber(int result);
230
231
#include "xmltok_impl.h"
232
#include "ascii.h"
233
234
/* Accessor macros consumed by the "normal_" instantiation of
   xmltok_impl.c.  Definitions that do not depend on the build flavour come
   first; everything that differs between XML_MIN_SIZE and regular builds is
   gathered in a single conditional below. */

/* Byte class of the byte at p, read from the encoding's type table. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

/* Name / name-start tests for an n-byte character (n is pasted into the
   member name) always go through the encoding's function pointers. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))

#ifdef XML_MIN_SIZE

/* In this flavour every accessor dispatches through the encoding object. */
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
/* The isInvalid pointers may be NULL (see NULL_VTABLE), hence the guard. */
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))

/* Table-driven byte class lookup for the byteType slot. */
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}

/* Returns the byte at p unchanged, for the byteToAscii slot. */
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}

/* Non-zero when the byte at p equals c, for the charMatches slot. */
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}

#else /* !XML_MIN_SIZE */

/* minimum bytes per character */
#  define MINBPC(enc) 1
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) (*(p))
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)

#endif /* XML_MIN_SIZE */
303
304
33.2k
#define PREFIX(ident) normal_##ident
305
#define XML_TOK_IMPL_C
306
#include "xmltok_impl.c"
307
#undef XML_TOK_IMPL_C
308
309
#undef MINBPC
310
#undef BYTE_TYPE
311
#undef BYTE_TO_ASCII
312
#undef CHAR_MATCHES
313
#undef IS_NAME_CHAR
314
#undef IS_NAME_CHAR_MINBPC
315
#undef IS_NMSTRT_CHAR
316
#undef IS_NMSTRT_CHAR_MINBPC
317
#undef IS_INVALID_CHAR
318
319
/* UTF8_cvalN is the value of the masked first byte of an N-byte UTF-8
   sequence. */
enum {
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xC0,
  UTF8_cval3 = 0xE0,
  UTF8_cval4 = 0xF0
};
325
326
/* Move *fromLimRef backwards, if necessary, so that [from, *fromLimRef) does
   not end in the middle of a multi-byte UTF-8 character.

   The buffer is scanned backwards from the current limit.  An ASCII byte
   ends the scan with the limit just after it.  A lead byte ends the scan
   when enough bytes have been passed since the last reset to complete its
   character; the limit is then placed right after that character.  A lead
   byte with too few bytes after it is dropped from the range and the scan
   continues.  If the scan reaches `from`, the limit ends up equal to `from`.

   from        start of the buffer
   fromLimRef  in: one past the last byte; out: the possibly reduced limit */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t passed = 0; /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char byte = (unsigned char)end[-1];
    size_t seqLen; /* length announced by a lead byte, 0 for other bytes */

    if ((byte & 0x80u) == 0x00u) {
      break; /* 1-byte character, matching 0b0xxxxxxx */
    }

    if ((byte & 0xf8u) == 0xf0u) {
      seqLen = 4; /* lead byte 0b11110xxx */
    } else if ((byte & 0xf0u) == 0xe0u) {
      seqLen = 3; /* lead byte 0b1110xxxx */
    } else if ((byte & 0xe0u) == 0xc0u) {
      seqLen = 2; /* lead byte 0b110xxxxx */
    } else {
      seqLen = 0; /* not a lead byte: keep scanning */
    }

    if (seqLen != 0) {
      if (passed + 1 >= seqLen) {
        /* Character is complete: put the limit right behind it. */
        end += seqLen - 1;
        break;
      }
      /* Character is cut off: restart the count before stepping over it. */
      passed = 0;
    }

    end--;
    passed++;
  }

  *fromLimRef = end;
}
364
365
static enum XML_Convert_Result PTRCALL
366
utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
367
65.8k
            char **toP, const char *toLim) {
368
65.8k
  bool input_incomplete = false;
369
65.8k
  bool output_exhausted = false;
370
371
  /* Avoid copying partial characters (due to limited space). */
372
65.8k
  const ptrdiff_t bytesAvailable = fromLim - *fromP;
373
65.8k
  const ptrdiff_t bytesStorable = toLim - *toP;
374
65.8k
  UNUSED_P(enc);
375
65.8k
  if (bytesAvailable > bytesStorable) {
376
323
    fromLim = *fromP + bytesStorable;
377
323
    output_exhausted = true;
378
323
  }
379
380
  /* Avoid copying partial characters (from incomplete input). */
381
65.8k
  {
382
65.8k
    const char *const fromLimBefore = fromLim;
383
65.8k
    _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
384
65.8k
    if (fromLim < fromLimBefore) {
385
0
      input_incomplete = true;
386
0
    }
387
65.8k
  }
388
389
65.8k
  {
390
65.8k
    const ptrdiff_t bytesToCopy = fromLim - *fromP;
391
65.8k
    memcpy(*toP, *fromP, bytesToCopy);
392
65.8k
    *fromP += bytesToCopy;
393
65.8k
    *toP += bytesToCopy;
394
65.8k
  }
395
396
65.8k
  if (output_exhausted) /* needs to go first */
397
323
    return XML_CONVERT_OUTPUT_EXHAUSTED;
398
65.5k
  else if (input_incomplete)
399
0
    return XML_CONVERT_INPUT_INCOMPLETE;
400
65.5k
  else
401
65.5k
    return XML_CONVERT_COMPLETED;
402
65.8k
}
403
404
static enum XML_Convert_Result PTRCALL
405
utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
406
0
             unsigned short **toP, const unsigned short *toLim) {
407
0
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
408
0
  unsigned short *to = *toP;
409
0
  const char *from = *fromP;
410
0
  while (from < fromLim && to < toLim) {
411
0
    switch (SB_BYTE_TYPE(enc, from)) {
412
0
    case BT_LEAD2:
413
0
      if (fromLim - from < 2) {
414
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
415
0
        goto after;
416
0
      }
417
0
      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
418
0
      from += 2;
419
0
      break;
420
0
    case BT_LEAD3:
421
0
      if (fromLim - from < 3) {
422
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
423
0
        goto after;
424
0
      }
425
0
      *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
426
0
                               | (from[2] & 0x3f));
427
0
      from += 3;
428
0
      break;
429
0
    case BT_LEAD4: {
430
0
      unsigned long n;
431
0
      if (toLim - to < 2) {
432
0
        res = XML_CONVERT_OUTPUT_EXHAUSTED;
433
0
        goto after;
434
0
      }
435
0
      if (fromLim - from < 4) {
436
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
437
0
        goto after;
438
0
      }
439
0
      n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
440
0
          | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
441
0
      n -= 0x10000;
442
0
      to[0] = (unsigned short)((n >> 10) | 0xD800);
443
0
      to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
444
0
      to += 2;
445
0
      from += 4;
446
0
    } break;
447
0
    default:
448
0
      *to++ = *from++;
449
0
      break;
450
0
    }
451
0
  }
452
0
  if (from < fromLim)
453
0
    res = XML_CONVERT_OUTPUT_EXHAUSTED;
454
0
after:
455
0
  *fromP = from;
456
0
  *toP = to;
457
0
  return res;
458
0
}
459
460
#ifdef XML_NS
461
static const struct normal_encoding utf8_encoding_ns
462
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
463
       {
464
#  include "asciitab.h"
465
#  include "utf8tab.h"
466
       },
467
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
468
#endif
469
470
static const struct normal_encoding utf8_encoding
471
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
472
       {
473
#define BT_COLON BT_NMSTRT
474
#include "asciitab.h"
475
#undef BT_COLON
476
#include "utf8tab.h"
477
       },
478
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
479
480
#ifdef XML_NS
481
482
static const struct normal_encoding internal_utf8_encoding_ns
483
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
484
       {
485
#  include "iasciitab.h"
486
#  include "utf8tab.h"
487
       },
488
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
489
490
#endif
491
492
static const struct normal_encoding internal_utf8_encoding
493
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
494
       {
495
#define BT_COLON BT_NMSTRT
496
#include "iasciitab.h"
497
#undef BT_COLON
498
#include "utf8tab.h"
499
       },
500
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
501
502
static enum XML_Convert_Result PTRCALL
503
latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
504
0
              char **toP, const char *toLim) {
505
0
  UNUSED_P(enc);
506
0
  for (;;) {
507
0
    unsigned char c;
508
0
    if (*fromP == fromLim)
509
0
      return XML_CONVERT_COMPLETED;
510
0
    c = (unsigned char)**fromP;
511
0
    if (c & 0x80) {
512
0
      if (toLim - *toP < 2)
513
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
514
0
      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
515
0
      *(*toP)++ = (char)((c & 0x3f) | 0x80);
516
0
      (*fromP)++;
517
0
    } else {
518
0
      if (*toP == toLim)
519
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
520
0
      *(*toP)++ = *(*fromP)++;
521
0
    }
522
0
  }
523
0
}
524
525
static enum XML_Convert_Result PTRCALL
526
latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
527
0
               unsigned short **toP, const unsigned short *toLim) {
528
0
  UNUSED_P(enc);
529
0
  while (*fromP < fromLim && *toP < toLim)
530
0
    *(*toP)++ = (unsigned char)*(*fromP)++;
531
532
0
  if ((*toP == toLim) && (*fromP < fromLim))
533
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
534
0
  else
535
0
    return XML_CONVERT_COMPLETED;
536
0
}
537
538
#ifdef XML_NS
539
540
static const struct normal_encoding latin1_encoding_ns
541
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
542
       {
543
#  include "asciitab.h"
544
#  include "latin1tab.h"
545
       },
546
       STANDARD_VTABLE(sb_) NULL_VTABLE};
547
548
#endif
549
550
static const struct normal_encoding latin1_encoding
551
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
552
       {
553
#define BT_COLON BT_NMSTRT
554
#include "asciitab.h"
555
#undef BT_COLON
556
#include "latin1tab.h"
557
       },
558
       STANDARD_VTABLE(sb_) NULL_VTABLE};
559
560
static enum XML_Convert_Result PTRCALL
561
ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
562
0
             char **toP, const char *toLim) {
563
0
  UNUSED_P(enc);
564
0
  while (*fromP < fromLim && *toP < toLim)
565
0
    *(*toP)++ = *(*fromP)++;
566
567
0
  if ((*toP == toLim) && (*fromP < fromLim))
568
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
569
0
  else
570
0
    return XML_CONVERT_COMPLETED;
571
0
}
572
573
#ifdef XML_NS
574
575
static const struct normal_encoding ascii_encoding_ns
576
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
577
       {
578
#  include "asciitab.h"
579
           /* BT_NONXML == 0 */
580
       },
581
       STANDARD_VTABLE(sb_) NULL_VTABLE};
582
583
#endif
584
585
static const struct normal_encoding ascii_encoding
586
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
587
       {
588
#define BT_COLON BT_NMSTRT
589
#include "asciitab.h"
590
#undef BT_COLON
591
           /* BT_NONXML == 0 */
592
       },
593
       STANDARD_VTABLE(sb_) NULL_VTABLE};
594
595
static int PTRFASTCALL
596
7.68k
unicode_byte_type(char hi, char lo) {
597
7.68k
  switch ((unsigned char)hi) {
598
  /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
599
58
  case 0xD8:
600
67
  case 0xD9:
601
82
  case 0xDA:
602
110
  case 0xDB:
603
110
    return BT_LEAD4;
604
  /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
605
10
  case 0xDC:
606
13
  case 0xDD:
607
27
  case 0xDE:
608
46
  case 0xDF:
609
46
    return BT_TRAIL;
610
261
  case 0xFF:
611
261
    switch ((unsigned char)lo) {
612
70
    case 0xFF: /* noncharacter-FFFF */
613
70
    case 0xFE: /* noncharacter-FFFE */
614
70
      return BT_NONXML;
615
261
    }
616
191
    break;
617
7.68k
  }
618
7.46k
  return BT_NONASCII;
619
7.68k
}
620
621
/* Generates the static function E##toUtf8, which transcodes 16-bit code
   units read from *fromP into UTF-8 bytes written at *toP.  The byte order
   is chosen by whatever GET_LO / GET_HI expand to where the macro is
   instantiated.  The enc argument is not used.

   fromLim is first rounded down so that an even number of input bytes is
   processed.  Each unit is then dispatched on its high byte:
     - 0x00 with low byte < 0x80: one output byte;
     - 0x00 (low byte >= 0x80) through 0x07: two output bytes;
     - 0xD8 through 0xDB: four output bytes, built from this unit and the
       one after it (the following unit is consumed without being checked
       here -- presumably the tokenizer validated it earlier; confirm);
     - anything else, including 0xDC-0xDF: three output bytes.

   Before writing, each case checks the remaining output room; if it is
   too small, *fromP is set to the current unit and
   XML_CONVERT_OUTPUT_EXHAUSTED is returned.  In the four-byte case, fewer
   than 4 remaining input bytes yields XML_CONVERT_INPUT_INCOMPLETE instead
   (the output check is made first).  When the loop runs to its end, *fromP
   is updated and XML_CONVERT_COMPLETED is returned unless from is still
   below fromLim.
*/
#define DEFINE_UTF16_TO_UTF8(E)                                                \
622
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
623
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
624
87
      char **toP, const char *toLim) {                                         \
625
87
    const char *from = *fromP;                                                 \
626
87
    UNUSED_P(enc);                                                             \
627
87
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
628
698
    for (; from < fromLim; from += 2) {                                        \
629
613
      int plane;                                                               \
630
613
      unsigned char lo2;                                                       \
631
613
      unsigned char lo = GET_LO(from);                                         \
632
613
      unsigned char hi = GET_HI(from);                                         \
633
613
      switch (hi) {                                                            \
634
78
      case 0:                                                                  \
635
78
        if (lo < 0x80) {                                                       \
636
67
          if (*toP == toLim) {                                                 \
637
0
            *fromP = from;                                                     \
638
0
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
639
0
          }                                                                    \
640
67
          *(*toP)++ = lo;                                                      \
641
67
          break;                                                               \
642
67
        }                                                                      \
643
78
        /* fall through */                                                     \
644
78
      case 0x1:                                                                \
645
46
      case 0x2:                                                                \
646
46
      case 0x3:                                                                \
647
48
      case 0x4:                                                                \
648
48
      case 0x5:                                                                \
649
48
      case 0x6:                                                                \
650
48
      case 0x7:                                                                \
651
48
        if (toLim - *toP < 2) {                                                \
652
0
          *fromP = from;                                                       \
653
0
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
654
0
        }                                                                      \
655
48
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
656
48
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
657
48
        break;                                                                 \
658
498
      default:                                                                 \
659
498
        if (toLim - *toP < 3) {                                                \
660
2
          *fromP = from;                                                       \
661
2
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
662
2
        }                                                                      \
663
498
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
664
498
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
665
496
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
666
496
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
667
496
        break;                                                                 \
668
498
      case 0xD8:                                                               \
669
0
      case 0xD9:                                                               \
670
0
      case 0xDA:                                                               \
671
0
      case 0xDB:                                                               \
672
0
        if (toLim - *toP < 4) {                                                \
673
0
          *fromP = from;                                                       \
674
0
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
675
0
        }                                                                      \
676
0
        if (fromLim - from < 4) {                                              \
677
0
          *fromP = from;                                                       \
678
0
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
679
0
        }                                                                      \
680
0
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
681
0
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
682
0
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
683
0
        from += 2;                                                             \
684
0
        lo2 = GET_LO(from);                                                    \
685
0
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
686
0
                     | (lo2 >> 6) | 0x80);                                     \
687
0
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
688
0
        break;                                                                 \
689
613
      }                                                                        \
690
613
    }                                                                          \
691
87
    *fromP = from;                                                             \
692
85
    if (from < fromLim)                                                        \
693
85
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
694
85
    else                                                                       \
695
85
      return XML_CONVERT_COMPLETED;                                            \
696
85
  }
xmltok.c:little2_toUtf8
Line
Count
Source
624
63
      char **toP, const char *toLim) {                                         \
625
63
    const char *from = *fromP;                                                 \
626
63
    UNUSED_P(enc);                                                             \
627
63
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
628
542
    for (; from < fromLim; from += 2) {                                        \
629
479
      int plane;                                                               \
630
479
      unsigned char lo2;                                                       \
631
479
      unsigned char lo = GET_LO(from);                                         \
632
479
      unsigned char hi = GET_HI(from);                                         \
633
479
      switch (hi) {                                                            \
634
58
      case 0:                                                                  \
635
58
        if (lo < 0x80) {                                                       \
636
48
          if (*toP == toLim) {                                                 \
637
0
            *fromP = from;                                                     \
638
0
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
639
0
          }                                                                    \
640
48
          *(*toP)++ = lo;                                                      \
641
48
          break;                                                               \
642
48
        }                                                                      \
643
58
        /* fall through */                                                     \
644
58
      case 0x1:                                                                \
645
44
      case 0x2:                                                                \
646
44
      case 0x3:                                                                \
647
46
      case 0x4:                                                                \
648
46
      case 0x5:                                                                \
649
46
      case 0x6:                                                                \
650
46
      case 0x7:                                                                \
651
46
        if (toLim - *toP < 2) {                                                \
652
0
          *fromP = from;                                                       \
653
0
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
654
0
        }                                                                      \
655
46
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
656
46
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
657
46
        break;                                                                 \
658
385
      default:                                                                 \
659
385
        if (toLim - *toP < 3) {                                                \
660
0
          *fromP = from;                                                       \
661
0
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
662
0
        }                                                                      \
663
385
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
664
385
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
665
385
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
666
385
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
667
385
        break;                                                                 \
668
385
      case 0xD8:                                                               \
669
0
      case 0xD9:                                                               \
670
0
      case 0xDA:                                                               \
671
0
      case 0xDB:                                                               \
672
0
        if (toLim - *toP < 4) {                                                \
673
0
          *fromP = from;                                                       \
674
0
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
675
0
        }                                                                      \
676
0
        if (fromLim - from < 4) {                                              \
677
0
          *fromP = from;                                                       \
678
0
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
679
0
        }                                                                      \
680
0
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
681
0
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
682
0
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
683
0
        from += 2;                                                             \
684
0
        lo2 = GET_LO(from);                                                    \
685
0
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
686
0
                     | (lo2 >> 6) | 0x80);                                     \
687
0
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
688
0
        break;                                                                 \
689
479
      }                                                                        \
690
479
    }                                                                          \
691
63
    *fromP = from;                                                             \
692
63
    if (from < fromLim)                                                        \
693
63
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
694
63
    else                                                                       \
695
63
      return XML_CONVERT_COMPLETED;                                            \
696
63
  }
xmltok.c:big2_toUtf8
Line
Count
Source
624
24
      char **toP, const char *toLim) {                                         \
625
24
    const char *from = *fromP;                                                 \
626
24
    UNUSED_P(enc);                                                             \
627
24
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
628
156
    for (; from < fromLim; from += 2) {                                        \
629
134
      int plane;                                                               \
630
134
      unsigned char lo2;                                                       \
631
134
      unsigned char lo = GET_LO(from);                                         \
632
134
      unsigned char hi = GET_HI(from);                                         \
633
134
      switch (hi) {                                                            \
634
20
      case 0:                                                                  \
635
20
        if (lo < 0x80) {                                                       \
636
19
          if (*toP == toLim) {                                                 \
637
0
            *fromP = from;                                                     \
638
0
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
639
0
          }                                                                    \
640
19
          *(*toP)++ = lo;                                                      \
641
19
          break;                                                               \
642
19
        }                                                                      \
643
20
        /* fall through */                                                     \
644
20
      case 0x1:                                                                \
645
2
      case 0x2:                                                                \
646
2
      case 0x3:                                                                \
647
2
      case 0x4:                                                                \
648
2
      case 0x5:                                                                \
649
2
      case 0x6:                                                                \
650
2
      case 0x7:                                                                \
651
2
        if (toLim - *toP < 2) {                                                \
652
0
          *fromP = from;                                                       \
653
0
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
654
0
        }                                                                      \
655
2
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
656
2
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
657
2
        break;                                                                 \
658
113
      default:                                                                 \
659
113
        if (toLim - *toP < 3) {                                                \
660
2
          *fromP = from;                                                       \
661
2
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
662
2
        }                                                                      \
663
113
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
664
113
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
665
111
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
666
111
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
667
111
        break;                                                                 \
668
113
      case 0xD8:                                                               \
669
0
      case 0xD9:                                                               \
670
0
      case 0xDA:                                                               \
671
0
      case 0xDB:                                                               \
672
0
        if (toLim - *toP < 4) {                                                \
673
0
          *fromP = from;                                                       \
674
0
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
675
0
        }                                                                      \
676
0
        if (fromLim - from < 4) {                                              \
677
0
          *fromP = from;                                                       \
678
0
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
679
0
        }                                                                      \
680
0
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
681
0
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
682
0
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
683
0
        from += 2;                                                             \
684
0
        lo2 = GET_LO(from);                                                    \
685
0
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
686
0
                     | (lo2 >> 6) | 0x80);                                     \
687
0
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
688
0
        break;                                                                 \
689
134
      }                                                                        \
690
134
    }                                                                          \
691
24
    *fromP = from;                                                             \
692
22
    if (from < fromLim)                                                        \
693
22
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
694
22
    else                                                                       \
695
22
      return XML_CONVERT_COMPLETED;                                            \
696
22
  }
697
698
/* Generates the static function E##toUtf16, which copies 16-bit code units
   from the byte stream at *fromP into the unsigned short array at *toP.
   Each output value is assembled as (GET_HI << 8) | GET_LO, so the byte
   order is fixed by the GET_LO / GET_HI definitions in force where the
   macro is instantiated.  The enc argument is not used.

   fromLim is first rounded down to an even number of input bytes.  If the
   input holds more units than the output has room for and the last input
   unit has a high byte in 0xD8-0xDF ((hi & 0xF8) == 0xD8), that last unit
   is excluded and the pending result becomes XML_CONVERT_INPUT_INCOMPLETE,
   so that the first half of a surrogate pair is not copied on its own.

   Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is full with input
   still remaining; otherwise the pending result (XML_CONVERT_COMPLETED
   unless changed as described above).
*/
#define DEFINE_UTF16_TO_UTF16(E)                                               \
699
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
700
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
701
0
      unsigned short **toP, const unsigned short *toLim) {                     \
702
0
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
703
0
    UNUSED_P(enc);                                                             \
704
0
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
705
0
    /* Avoid copying first half only of surrogate */                           \
706
0
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
707
0
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
708
0
      fromLim -= 2;                                                            \
709
0
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
710
0
    }                                                                          \
711
0
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
712
0
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
713
0
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
714
0
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
715
0
    else                                                                       \
716
0
      return res;                                                              \
717
0
  }
Unexecuted instantiation: xmltok.c:little2_toUtf16
Unexecuted instantiation: xmltok.c:big2_toUtf16
718
719
479
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
720
479
#define GET_HI(ptr) ((unsigned char)(ptr)[1])
721
722
DEFINE_UTF16_TO_UTF8(little2_)
723
DEFINE_UTF16_TO_UTF16(little2_)
724
725
#undef GET_LO
726
#undef GET_HI
727
728
134
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
729
134
#define GET_HI(ptr) ((unsigned char)(ptr)[0])
730
731
DEFINE_UTF16_TO_UTF8(big2_)
732
DEFINE_UTF16_TO_UTF16(big2_)
733
734
#undef GET_LO
735
#undef GET_HI
736
737
/* Helpers for little-endian two-byte input: (p)[0] is the low byte and
   (p)[1] the high byte of the code unit at p.

   LITTLE2_BYTE_TYPE             byte type of the unit: the single-byte table
                                 lookup when the high byte is 0, otherwise
                                 unicode_byte_type(high, low).
   LITTLE2_BYTE_TO_ASCII         the low byte when the high byte is 0,
                                 else -1.
   LITTLE2_CHAR_MATCHES          non-zero when the unit equals c and its
                                 high byte is 0.
   LITTLE2_IS_NAME_CHAR_MINBPC   UCS2_GET_NAMING lookup in namePages.
   LITTLE2_IS_NMSTRT_CHAR_MINBPC UCS2_GET_NAMING lookup in nmstrtPages.

   Every use of the argument p is parenthesised so that an expression
   argument such as `ptr + 2` indexes and casts as intended.
*/
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? SB_BYTE_TYPE(enc, (p)) : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
745
746
#ifdef XML_MIN_SIZE
747
748
static int PTRFASTCALL
749
little2_byteType(const ENCODING *enc, const char *p) {
750
  return LITTLE2_BYTE_TYPE(enc, p);
751
}
752
753
static int PTRFASTCALL
754
little2_byteToAscii(const ENCODING *enc, const char *p) {
755
  UNUSED_P(enc);
756
  return LITTLE2_BYTE_TO_ASCII(p);
757
}
758
759
static int PTRCALL
760
little2_charMatches(const ENCODING *enc, const char *p, int c) {
761
  UNUSED_P(enc);
762
  return LITTLE2_CHAR_MATCHES(p, c);
763
}
764
765
static int PTRFASTCALL
766
little2_isNameMin(const ENCODING *enc, const char *p) {
767
  UNUSED_P(enc);
768
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
769
}
770
771
static int PTRFASTCALL
772
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
773
  UNUSED_P(enc);
774
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
775
}
776
777
#  undef VTABLE
778
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
779
780
#else /* not XML_MIN_SIZE */
781
782
#  undef PREFIX
783
281
#  define PREFIX(ident) little2_##ident
784
13.6k
#  define MINBPC(enc) 2
785
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
786
6.59k
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
787
1
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
788
9
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
789
2
#  define IS_NAME_CHAR(enc, p, n) 0
790
1.17k
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
791
2
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
792
256
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
793
794
#  define XML_TOK_IMPL_C
795
#  include "xmltok_impl.c"
796
#  undef XML_TOK_IMPL_C
797
798
#  undef MINBPC
799
#  undef BYTE_TYPE
800
#  undef BYTE_TO_ASCII
801
#  undef CHAR_MATCHES
802
#  undef IS_NAME_CHAR
803
#  undef IS_NAME_CHAR_MINBPC
804
#  undef IS_NMSTRT_CHAR
805
#  undef IS_NMSTRT_CHAR_MINBPC
806
#  undef IS_INVALID_CHAR
807
808
#endif /* not XML_MIN_SIZE */
809
810
#ifdef XML_NS
811
812
/* Little-endian two-byte encoding descriptor used when XML_NS is defined.

   After the VTABLE entries come 2, 0 and a flag that is 1 only when
   BYTEORDER == 1234; presumably these are minBytesPerChar, isUtf8 and
   isUtf16 -- confirm against the ENCODING struct declaration.

   The byte-type table is filled from asciitab.h followed by latin1tab.h.
   LITTLE2_BYTE_TYPE consults it (via SB_BYTE_TYPE) only for units whose
   high byte is zero.
*/
static const struct normal_encoding little2_encoding_ns
813
    = {{VTABLE, 2, 0,
814
#  if BYTEORDER == 1234
815
        1
816
#  else
817
        0
818
#  endif
819
       },
820
       {
821
#  include "asciitab.h"
822
#  include "latin1tab.h"
823
       },
824
       STANDARD_VTABLE(little2_) NULL_VTABLE};
825
826
#endif
827
828
/* Little-endian two-byte encoding descriptor (variant without the _ns
   suffix).

   After the VTABLE entries come 2, 0 and a flag that is 1 only when
   BYTEORDER == 1234; presumably these are minBytesPerChar, isUtf8 and
   isUtf16 -- confirm against the ENCODING struct declaration.

   BT_COLON is temporarily defined as BT_NMSTRT while asciitab.h is included,
   so the entry asciitab.h spells as BT_COLON (presumably ':') is classified
   as a name-start character.  latin1tab.h supplies the rest of the table.
*/
static const struct normal_encoding little2_encoding
829
    = {{VTABLE, 2, 0,
830
#if BYTEORDER == 1234
831
        1
832
#else
833
        0
834
#endif
835
       },
836
       {
837
#define BT_COLON BT_NMSTRT
838
#include "asciitab.h"
839
#undef BT_COLON
840
#include "latin1tab.h"
841
       },
842
       STANDARD_VTABLE(little2_) NULL_VTABLE};
843
844
#if BYTEORDER != 4321
845
846
#  ifdef XML_NS
847
848
/* "Internal" little-endian two-byte descriptor for XML_NS builds; only
   compiled when BYTEORDER != 4321.

   Differs from little2_encoding_ns in two visible ways: the last flag after
   VTABLE is unconditionally 1, and the first half of the byte-type table
   comes from iasciitab.h instead of asciitab.h (how the two headers differ
   is not visible here -- see iasciitab.h).
*/
static const struct normal_encoding internal_little2_encoding_ns
849
    = {{VTABLE, 2, 0, 1},
850
       {
851
#    include "iasciitab.h"
852
#    include "latin1tab.h"
853
       },
854
       STANDARD_VTABLE(little2_) NULL_VTABLE};
855
856
#  endif
857
858
/* "Internal" little-endian two-byte descriptor (variant without the _ns
   suffix); only compiled when BYTEORDER != 4321.

   Differs from little2_encoding in that the last flag after VTABLE is
   unconditionally 1 and the first half of the byte-type table comes from
   iasciitab.h instead of asciitab.h (how the two headers differ is not
   visible here -- see iasciitab.h).  BT_COLON is mapped to BT_NMSTRT for
   the duration of that include.
*/
static const struct normal_encoding internal_little2_encoding
859
    = {{VTABLE, 2, 0, 1},
860
       {
861
#  define BT_COLON BT_NMSTRT
862
#  include "iasciitab.h"
863
#  undef BT_COLON
864
#  include "latin1tab.h"
865
       },
866
       STANDARD_VTABLE(little2_) NULL_VTABLE};
867
868
#endif
869
870
/* Helpers for big-endian two-byte input: (p)[0] is the high byte and
   (p)[1] the low byte of the code unit at p.

   BIG2_BYTE_TYPE             byte type of the unit: the single-byte table
                              lookup on the low byte when the high byte is
                              0, otherwise unicode_byte_type(high, low).
   BIG2_BYTE_TO_ASCII         the low byte when the high byte is 0, else -1.
   BIG2_CHAR_MATCHES          non-zero when the unit equals c and its high
                              byte is 0.
   BIG2_IS_NAME_CHAR_MINBPC   UCS2_GET_NAMING lookup in namePages.
   BIG2_IS_NMSTRT_CHAR_MINBPC UCS2_GET_NAMING lookup in nmstrtPages.

   Every use of the argument p is parenthesised so that an expression
   argument such as `ptr + 2` indexes and casts as intended.
*/
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0 ? SB_BYTE_TYPE(enc, (p) + 1)                                    \
               : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
878
879
#ifdef XML_MIN_SIZE
880
881
static int PTRFASTCALL
882
big2_byteType(const ENCODING *enc, const char *p) {
883
  return BIG2_BYTE_TYPE(enc, p);
884
}
885
886
static int PTRFASTCALL
887
big2_byteToAscii(const ENCODING *enc, const char *p) {
888
  UNUSED_P(enc);
889
  return BIG2_BYTE_TO_ASCII(p);
890
}
891
892
static int PTRCALL
893
big2_charMatches(const ENCODING *enc, const char *p, int c) {
894
  UNUSED_P(enc);
895
  return BIG2_CHAR_MATCHES(p, c);
896
}
897
898
static int PTRFASTCALL
899
big2_isNameMin(const ENCODING *enc, const char *p) {
900
  UNUSED_P(enc);
901
  return BIG2_IS_NAME_CHAR_MINBPC(p);
902
}
903
904
static int PTRFASTCALL
905
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
906
  UNUSED_P(enc);
907
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
908
}
909
910
#  undef VTABLE
911
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
912
913
#else /* not XML_MIN_SIZE */
914
915
#  undef PREFIX
916
244
#  define PREFIX(ident) big2_##ident
917
10.1k
#  define MINBPC(enc) 2
918
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
919
4.64k
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
920
0
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
921
4
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
922
0
#  define IS_NAME_CHAR(enc, p, n) 0
923
800
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
924
0
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
925
147
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
926
927
#  define XML_TOK_IMPL_C
928
#  include "xmltok_impl.c"
929
#  undef XML_TOK_IMPL_C
930
931
#  undef MINBPC
932
#  undef BYTE_TYPE
933
#  undef BYTE_TO_ASCII
934
#  undef CHAR_MATCHES
935
#  undef IS_NAME_CHAR
936
#  undef IS_NAME_CHAR_MINBPC
937
#  undef IS_NMSTRT_CHAR
938
#  undef IS_NMSTRT_CHAR_MINBPC
939
#  undef IS_INVALID_CHAR
940
941
#endif /* not XML_MIN_SIZE */
942
943
#ifdef XML_NS
944
945
/* Big-endian two-byte encoding descriptor used when XML_NS is defined.

   After the VTABLE entries come 2, 0 and a flag that is 1 only when
   BYTEORDER == 4321; presumably these are minBytesPerChar, isUtf8 and
   isUtf16 -- confirm against the ENCODING struct declaration.

   The byte-type table is filled from asciitab.h followed by latin1tab.h.
   BIG2_BYTE_TYPE consults it (via SB_BYTE_TYPE) only for units whose high
   byte is zero.
*/
static const struct normal_encoding big2_encoding_ns
946
    = {{VTABLE, 2, 0,
947
#  if BYTEORDER == 4321
948
        1
949
#  else
950
        0
951
#  endif
952
       },
953
       {
954
#  include "asciitab.h"
955
#  include "latin1tab.h"
956
       },
957
       STANDARD_VTABLE(big2_) NULL_VTABLE};
958
959
#endif
960
961
/* Big-endian two-byte encoding descriptor (variant without the _ns suffix).

   After the VTABLE entries come 2, 0 and a flag that is 1 only when
   BYTEORDER == 4321; presumably these are minBytesPerChar, isUtf8 and
   isUtf16 -- confirm against the ENCODING struct declaration.

   BT_COLON is temporarily defined as BT_NMSTRT while asciitab.h is included,
   so the entry asciitab.h spells as BT_COLON (presumably ':') is classified
   as a name-start character.  latin1tab.h supplies the rest of the table.
*/
static const struct normal_encoding big2_encoding
962
    = {{VTABLE, 2, 0,
963
#if BYTEORDER == 4321
964
        1
965
#else
966
        0
967
#endif
968
       },
969
       {
970
#define BT_COLON BT_NMSTRT
971
#include "asciitab.h"
972
#undef BT_COLON
973
#include "latin1tab.h"
974
       },
975
       STANDARD_VTABLE(big2_) NULL_VTABLE};
976
977
#if BYTEORDER != 1234
978
979
#  ifdef XML_NS
980
981
/* "Internal" big-endian two-byte descriptor for XML_NS builds; only
   compiled when BYTEORDER != 1234.

   Differs from big2_encoding_ns in two visible ways: the last flag after
   VTABLE is unconditionally 1, and the first half of the byte-type table
   comes from iasciitab.h instead of asciitab.h (how the two headers differ
   is not visible here -- see iasciitab.h).
*/
static const struct normal_encoding internal_big2_encoding_ns
982
    = {{VTABLE, 2, 0, 1},
983
       {
984
#    include "iasciitab.h"
985
#    include "latin1tab.h"
986
       },
987
       STANDARD_VTABLE(big2_) NULL_VTABLE};
988
989
#  endif
990
991
/* "Internal" big-endian two-byte descriptor (variant without the _ns
   suffix); only compiled when BYTEORDER != 1234.

   Differs from big2_encoding in that the last flag after VTABLE is
   unconditionally 1 and the first half of the byte-type table comes from
   iasciitab.h instead of asciitab.h (how the two headers differ is not
   visible here -- see iasciitab.h).  BT_COLON is mapped to BT_NMSTRT for
   the duration of that include.
*/
static const struct normal_encoding internal_big2_encoding
992
    = {{VTABLE, 2, 0, 1},
993
       {
994
#  define BT_COLON BT_NMSTRT
995
#  include "iasciitab.h"
996
#  undef BT_COLON
997
#  include "latin1tab.h"
998
       },
999
       STANDARD_VTABLE(big2_) NULL_VTABLE};
1000
1001
#endif
1002
1003
#undef PREFIX
1004
1005
static int FASTCALL
1006
0
streqci(const char *s1, const char *s2) {
1007
0
  for (;;) {
1008
0
    char c1 = *s1++;
1009
0
    char c2 = *s2++;
1010
0
    if (ASCII_a <= c1 && c1 <= ASCII_z)
1011
0
      c1 += ASCII_A - ASCII_a;
1012
0
    if (ASCII_a <= c2 && c2 <= ASCII_z)
1013
      /* The following line will never get executed.  streqci() is
1014
       * only called from two places, both of which guarantee to put
1015
       * upper-case strings into s2.
1016
       */
1017
0
      c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1018
0
    if (c1 != c2)
1019
0
      return 0;
1020
0
    if (! c1)
1021
0
      break;
1022
0
  }
1023
0
  return 1;
1024
0
}
1025
1026
static void PTRCALL
1027
initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1028
34
                   POSITION *pos) {
1029
34
  UNUSED_P(enc);
1030
34
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1031
34
}
1032
1033
static int
1034
0
toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1035
0
  char buf[1];
1036
0
  char *p = buf;
1037
0
  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1038
0
  if (p == buf)
1039
0
    return -1;
1040
0
  else
1041
0
    return buf[0];
1042
0
}
1043
1044
static int FASTCALL
1045
0
isSpace(int c) {
1046
0
  switch (c) {
1047
0
  case 0x20:
1048
0
  case 0xD:
1049
0
  case 0xA:
1050
0
  case 0x9:
1051
0
    return 1;
1052
0
  }
1053
0
  return 0;
1054
0
}
1055
1056
/* Return 1 if there's just optional white space or there's an S
1057
   followed by name=val.
1058
*/
1059
static int
1060
parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1061
                     const char **namePtr, const char **nameEndPtr,
1062
0
                     const char **valPtr, const char **nextTokPtr) {
1063
0
  int c;
1064
0
  char open;
1065
0
  if (ptr == end) {
1066
0
    *namePtr = NULL;
1067
0
    return 1;
1068
0
  }
1069
0
  if (! isSpace(toAscii(enc, ptr, end))) {
1070
0
    *nextTokPtr = ptr;
1071
0
    return 0;
1072
0
  }
1073
0
  do {
1074
0
    ptr += enc->minBytesPerChar;
1075
0
  } while (isSpace(toAscii(enc, ptr, end)));
1076
0
  if (ptr == end) {
1077
0
    *namePtr = NULL;
1078
0
    return 1;
1079
0
  }
1080
0
  *namePtr = ptr;
1081
0
  for (;;) {
1082
0
    c = toAscii(enc, ptr, end);
1083
0
    if (c == -1) {
1084
0
      *nextTokPtr = ptr;
1085
0
      return 0;
1086
0
    }
1087
0
    if (c == ASCII_EQUALS) {
1088
0
      *nameEndPtr = ptr;
1089
0
      break;
1090
0
    }
1091
0
    if (isSpace(c)) {
1092
0
      *nameEndPtr = ptr;
1093
0
      do {
1094
0
        ptr += enc->minBytesPerChar;
1095
0
      } while (isSpace(c = toAscii(enc, ptr, end)));
1096
0
      if (c != ASCII_EQUALS) {
1097
0
        *nextTokPtr = ptr;
1098
0
        return 0;
1099
0
      }
1100
0
      break;
1101
0
    }
1102
0
    ptr += enc->minBytesPerChar;
1103
0
  }
1104
0
  if (ptr == *namePtr) {
1105
0
    *nextTokPtr = ptr;
1106
0
    return 0;
1107
0
  }
1108
0
  ptr += enc->minBytesPerChar;
1109
0
  c = toAscii(enc, ptr, end);
1110
0
  while (isSpace(c)) {
1111
0
    ptr += enc->minBytesPerChar;
1112
0
    c = toAscii(enc, ptr, end);
1113
0
  }
1114
0
  if (c != ASCII_QUOT && c != ASCII_APOS) {
1115
0
    *nextTokPtr = ptr;
1116
0
    return 0;
1117
0
  }
1118
0
  open = (char)c;
1119
0
  ptr += enc->minBytesPerChar;
1120
0
  *valPtr = ptr;
1121
0
  for (;; ptr += enc->minBytesPerChar) {
1122
0
    c = toAscii(enc, ptr, end);
1123
0
    if (c == open)
1124
0
      break;
1125
0
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1126
0
        && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1127
0
        && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1128
0
      *nextTokPtr = ptr;
1129
0
      return 0;
1130
0
    }
1131
0
  }
1132
0
  *nextTokPtr = ptr + enc->minBytesPerChar;
1133
0
  return 1;
1134
0
}
1135
1136
/* NUL-terminated keywords that doParseXmlDecl() compares against via
   XmlNameMatchesAscii().  They are spelled with the ASCII_* constants
   rather than string literals -- presumably so the bytes are ASCII
   regardless of the compiler's execution character set (TODO confirm). */

/* Pseudo-attribute name "version". */
static const char KW_version[]
1137
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1138
1139
/* Pseudo-attribute name "encoding". */
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1140
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};
1141
1142
/* Pseudo-attribute name "standalone". */
static const char KW_standalone[]
1143
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1144
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1145
1146
/* Accepted "standalone" value "yes". */
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1147
1148
/* Accepted "standalone" value "no". */
static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1149
1150
static int
1151
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1152
                                                 const char *),
1153
               int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1154
               const char *end, const char **badPtr, const char **versionPtr,
1155
               const char **versionEndPtr, const char **encodingName,
1156
0
               const ENCODING **encoding, int *standalone) {
1157
0
  const char *val = NULL;
1158
0
  const char *name = NULL;
1159
0
  const char *nameEnd = NULL;
1160
0
  ptr += 5 * enc->minBytesPerChar;
1161
0
  end -= 2 * enc->minBytesPerChar;
1162
0
  if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1163
0
      || ! name) {
1164
0
    *badPtr = ptr;
1165
0
    return 0;
1166
0
  }
1167
0
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1168
0
    if (! isGeneralTextEntity) {
1169
0
      *badPtr = name;
1170
0
      return 0;
1171
0
    }
1172
0
  } else {
1173
0
    if (versionPtr)
1174
0
      *versionPtr = val;
1175
0
    if (versionEndPtr)
1176
0
      *versionEndPtr = ptr;
1177
0
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1178
0
      *badPtr = ptr;
1179
0
      return 0;
1180
0
    }
1181
0
    if (! name) {
1182
0
      if (isGeneralTextEntity) {
1183
        /* a TextDecl must have an EncodingDecl */
1184
0
        *badPtr = ptr;
1185
0
        return 0;
1186
0
      }
1187
0
      return 1;
1188
0
    }
1189
0
  }
1190
0
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1191
0
    int c = toAscii(enc, val, end);
1192
0
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1193
0
      *badPtr = val;
1194
0
      return 0;
1195
0
    }
1196
0
    if (encodingName)
1197
0
      *encodingName = val;
1198
0
    if (encoding)
1199
0
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1200
0
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1201
0
      *badPtr = ptr;
1202
0
      return 0;
1203
0
    }
1204
0
    if (! name)
1205
0
      return 1;
1206
0
  }
1207
0
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1208
0
      || isGeneralTextEntity) {
1209
0
    *badPtr = name;
1210
0
    return 0;
1211
0
  }
1212
0
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1213
0
    if (standalone)
1214
0
      *standalone = 1;
1215
0
  } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1216
0
    if (standalone)
1217
0
      *standalone = 0;
1218
0
  } else {
1219
0
    *badPtr = val;
1220
0
    return 0;
1221
0
  }
1222
0
  while (isSpace(toAscii(enc, ptr, end)))
1223
0
    ptr += enc->minBytesPerChar;
1224
0
  if (ptr != end) {
1225
0
    *badPtr = ptr;
1226
0
    return 0;
1227
0
  }
1228
0
  return 1;
1229
0
}
1230
1231
static int FASTCALL
1232
0
checkCharRefNumber(int result) {
1233
0
  switch (result >> 8) {
1234
0
  case 0xD8:
1235
0
  case 0xD9:
1236
0
  case 0xDA:
1237
0
  case 0xDB:
1238
0
  case 0xDC:
1239
0
  case 0xDD:
1240
0
  case 0xDE:
1241
0
  case 0xDF:
1242
0
    return -1;
1243
0
  case 0:
1244
0
    if (latin1_encoding.type[result] == BT_NONXML)
1245
0
      return -1;
1246
0
    break;
1247
0
  case 0xFF:
1248
0
    if (result == 0xFFFE || result == 0xFFFF)
1249
0
      return -1;
1250
0
    break;
1251
0
  }
1252
0
  return result;
1253
0
}
1254
1255
int FASTCALL
1256
0
XmlUtf8Encode(int c, char *buf) {
1257
0
  enum {
1258
    /* minN is minimum legal resulting value for N byte sequence */
1259
0
    min2 = 0x80,
1260
0
    min3 = 0x800,
1261
0
    min4 = 0x10000
1262
0
  };
1263
1264
0
  if (c < 0)
1265
0
    return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1266
0
  if (c < min2) {
1267
0
    buf[0] = (char)(c | UTF8_cval1);
1268
0
    return 1;
1269
0
  }
1270
0
  if (c < min3) {
1271
0
    buf[0] = (char)((c >> 6) | UTF8_cval2);
1272
0
    buf[1] = (char)((c & 0x3f) | 0x80);
1273
0
    return 2;
1274
0
  }
1275
0
  if (c < min4) {
1276
0
    buf[0] = (char)((c >> 12) | UTF8_cval3);
1277
0
    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1278
0
    buf[2] = (char)((c & 0x3f) | 0x80);
1279
0
    return 3;
1280
0
  }
1281
0
  if (c < 0x110000) {
1282
0
    buf[0] = (char)((c >> 18) | UTF8_cval4);
1283
0
    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1284
0
    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1285
0
    buf[3] = (char)((c & 0x3f) | 0x80);
1286
0
    return 4;
1287
0
  }
1288
0
  return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1289
0
}
1290
1291
int FASTCALL
1292
0
XmlUtf16Encode(int charNum, unsigned short *buf) {
1293
0
  if (charNum < 0)
1294
0
    return 0;
1295
0
  if (charNum < 0x10000) {
1296
0
    buf[0] = (unsigned short)charNum;
1297
0
    return 1;
1298
0
  }
1299
0
  if (charNum < 0x110000) {
1300
0
    charNum -= 0x10000;
1301
0
    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1302
0
    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1303
0
    return 2;
1304
0
  }
1305
0
  return 0;
1306
0
}
1307
1308
/* State for a caller-described single/multi-byte encoding, filled in by
   XmlInitUnknownEncoding(). */
struct unknown_encoding {
1309
  /* Must stay the first member: AS_UNKNOWN_ENCODING() casts an encoding
     pointer back to this struct, and XmlInitUnknownEncoding() returns
     &normal.enc after copying latin1_encoding over this member. */
  struct normal_encoding normal;
1310
  /* Callback used for bytes that start a multi-byte sequence. */
  CONVERTER convert;
1311
  /* Opaque pointer passed as the first argument to convert. */
  void *userData;
1312
  /* Per input byte: the UTF-16 code unit, or 0 when convert must be
     called for that byte (see unknown_toUtf16). */
  unsigned short utf16[256];
1313
  /* Per input byte: [0] holds the UTF-8 length (0 means "call convert"),
     the following bytes hold the encoded sequence (see unknown_toUtf8). */
  char utf8[256][4];
1314
};
1315
1316
0
/* View an encoding pointer as the unknown_encoding that contains it. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1317
1318
int
1319
0
XmlSizeOfUnknownEncoding(void) {
1320
0
  return sizeof(struct unknown_encoding);
1321
0
}
1322
1323
static int PTRFASTCALL
1324
0
unknown_isName(const ENCODING *enc, const char *p) {
1325
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1326
0
  int c = uenc->convert(uenc->userData, p);
1327
0
  if (c & ~0xFFFF)
1328
0
    return 0;
1329
0
  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1330
0
}
1331
1332
static int PTRFASTCALL
1333
0
unknown_isNmstrt(const ENCODING *enc, const char *p) {
1334
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1335
0
  int c = uenc->convert(uenc->userData, p);
1336
0
  if (c & ~0xFFFF)
1337
0
    return 0;
1338
0
  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1339
0
}
1340
1341
static int PTRFASTCALL
1342
0
unknown_isInvalid(const ENCODING *enc, const char *p) {
1343
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1344
0
  int c = uenc->convert(uenc->userData, p);
1345
0
  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1346
0
}
1347
1348
static enum XML_Convert_Result PTRCALL
1349
unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1350
0
               char **toP, const char *toLim) {
1351
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1352
0
  char buf[XML_UTF8_ENCODE_MAX];
1353
0
  for (;;) {
1354
0
    const char *utf8;
1355
0
    int n;
1356
0
    if (*fromP == fromLim)
1357
0
      return XML_CONVERT_COMPLETED;
1358
0
    utf8 = uenc->utf8[(unsigned char)**fromP];
1359
0
    n = *utf8++;
1360
0
    if (n == 0) {
1361
0
      int c = uenc->convert(uenc->userData, *fromP);
1362
0
      n = XmlUtf8Encode(c, buf);
1363
0
      if (n > toLim - *toP)
1364
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
1365
0
      utf8 = buf;
1366
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1367
0
                 - (BT_LEAD2 - 2));
1368
0
    } else {
1369
0
      if (n > toLim - *toP)
1370
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
1371
0
      (*fromP)++;
1372
0
    }
1373
0
    memcpy(*toP, utf8, n);
1374
0
    *toP += n;
1375
0
  }
1376
0
}
1377
1378
static enum XML_Convert_Result PTRCALL
1379
unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1380
0
                unsigned short **toP, const unsigned short *toLim) {
1381
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1382
0
  while (*fromP < fromLim && *toP < toLim) {
1383
0
    unsigned short c = uenc->utf16[(unsigned char)**fromP];
1384
0
    if (c == 0) {
1385
0
      c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1386
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1387
0
                 - (BT_LEAD2 - 2));
1388
0
    } else
1389
0
      (*fromP)++;
1390
0
    *(*toP)++ = c;
1391
0
  }
1392
1393
0
  if ((*toP == toLim) && (*fromP < fromLim))
1394
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
1395
0
  else
1396
0
    return XML_CONVERT_COMPLETED;
1397
0
}
1398
1399
ENCODING *
1400
XmlInitUnknownEncoding(void *mem, const int *table, CONVERTER convert,
1401
0
                       void *userData) {
1402
0
  int i;
1403
0
  struct unknown_encoding *e = (struct unknown_encoding *)mem;
1404
0
  memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1405
0
  for (i = 0; i < 128; i++)
1406
0
    if (latin1_encoding.type[i] != BT_OTHER
1407
0
        && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1408
0
      return 0;
1409
0
  for (i = 0; i < 256; i++) {
1410
0
    int c = table[i];
1411
0
    if (c == -1) {
1412
0
      e->normal.type[i] = BT_MALFORM;
1413
      /* This shouldn't really get used. */
1414
0
      e->utf16[i] = 0xFFFF;
1415
0
      e->utf8[i][0] = 1;
1416
0
      e->utf8[i][1] = 0;
1417
0
    } else if (c < 0) {
1418
0
      if (c < -4)
1419
0
        return 0;
1420
      /* Multi-byte sequences need a converter function */
1421
0
      if (! convert)
1422
0
        return 0;
1423
0
      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1424
0
      e->utf8[i][0] = 0;
1425
0
      e->utf16[i] = 0;
1426
0
    } else if (c < 0x80) {
1427
0
      if (latin1_encoding.type[c] != BT_OTHER
1428
0
          && latin1_encoding.type[c] != BT_NONXML && c != i)
1429
0
        return 0;
1430
0
      e->normal.type[i] = latin1_encoding.type[c];
1431
0
      e->utf8[i][0] = 1;
1432
0
      e->utf8[i][1] = (char)c;
1433
0
      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1434
0
    } else if (checkCharRefNumber(c) < 0) {
1435
0
      e->normal.type[i] = BT_NONXML;
1436
      /* This shouldn't really get used. */
1437
0
      e->utf16[i] = 0xFFFF;
1438
0
      e->utf8[i][0] = 1;
1439
0
      e->utf8[i][1] = 0;
1440
0
    } else {
1441
0
      if (c > 0xFFFF)
1442
0
        return 0;
1443
0
      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1444
0
        e->normal.type[i] = BT_NMSTRT;
1445
0
      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1446
0
        e->normal.type[i] = BT_NAME;
1447
0
      else
1448
0
        e->normal.type[i] = BT_OTHER;
1449
0
      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1450
0
      e->utf16[i] = (unsigned short)c;
1451
0
    }
1452
0
  }
1453
0
  e->userData = userData;
1454
0
  e->convert = convert;
1455
0
  if (convert) {
1456
0
    e->normal.isName2 = unknown_isName;
1457
0
    e->normal.isName3 = unknown_isName;
1458
0
    e->normal.isName4 = unknown_isName;
1459
0
    e->normal.isNmstrt2 = unknown_isNmstrt;
1460
0
    e->normal.isNmstrt3 = unknown_isNmstrt;
1461
0
    e->normal.isNmstrt4 = unknown_isNmstrt;
1462
0
    e->normal.isInvalid2 = unknown_isInvalid;
1463
0
    e->normal.isInvalid3 = unknown_isInvalid;
1464
0
    e->normal.isInvalid4 = unknown_isInvalid;
1465
0
  }
1466
0
  e->normal.enc.utf8Convert = unknown_toUtf8;
1467
0
  e->normal.enc.utf16Convert = unknown_toUtf16;
1468
0
  return &(e->normal.enc);
1469
0
}
1470
1471
/* Indices of the built-in encodings.

   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed.  The values from ISO_8859_1_ENC through
   UTF_16LE_ENC are positions in the encodingNames array.
*/
enum {
  UNKNOWN_ENC = -1,   /* name given but not recognised */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC = 1,
  UTF_8_ENC = 2,
  UTF_16_ENC = 3,
  UTF_16BE_ENC = 4,
  UTF_16LE_ENC = 5,
  /* must match encodingNames up to here */
  NO_ENC = 6          /* no encoding name supplied */
};
1484
1485
/* NUL-terminated, upper-case names of the built-in encodings.  They are
   listed in encodingNames inside getEncodingIndex(), where their order
   defines the index that function returns. */

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[]
1486
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
1487
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
1488
/* "US-ASCII" */
static const char KW_US_ASCII[]
1489
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1490
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
1491
/* "UTF-8" */
static const char KW_UTF_8[]
1492
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1493
/* "UTF-16" */
static const char KW_UTF_16[]
1494
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1495
/* "UTF-16BE" */
static const char KW_UTF_16BE[]
1496
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1497
       ASCII_6, ASCII_B, ASCII_E, '\0'};
1498
/* "UTF-16LE" */
static const char KW_UTF_16LE[]
1499
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1500
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1501
1502
static int FASTCALL
1503
6.14k
getEncodingIndex(const char *name) {
1504
6.14k
  static const char *const encodingNames[] = {
1505
6.14k
      KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1506
6.14k
  };
1507
6.14k
  int i;
1508
6.14k
  if (name == NULL)
1509
6.14k
    return NO_ENC;
1510
0
  for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1511
0
    if (streqci(name, encodingNames[i]))
1512
0
      return i;
1513
0
  return UNKNOWN_ENC;
1514
0
}
1515
1516
/* For binary compatibility, we store the index of the encoding
1517
   specified at initialization in the isUtf16 member.
1518
*/
1519
1520
2.73k
/* Read the encoding index stored in the isUtf16 member of enc->initEnc. */
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
/* Store encoding index i in the isUtf16 member of enc->initEnc.  The
   argument is parenthesised so the cast applies to the whole expression
   passed as i, not just to its first operand. */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1522
1523
/* This is what detects the encoding.  encodingTable maps from
1524
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1525
   the external (protocol) specified encoding; state is
1526
   XML_CONTENT_STATE if we're parsing an external text entity, and
1527
   XML_PROLOG_STATE otherwise.
1528
*/
1529
1530
static int
1531
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1532
3.07k
         int state, const char *ptr, const char *end, const char **nextTokPtr) {
1533
3.07k
  const ENCODING **encPtr;
1534
1535
3.07k
  if (ptr >= end)
1536
10
    return XML_TOK_NONE;
1537
3.06k
  encPtr = enc->encPtr;
1538
3.06k
  if (ptr + 1 == end) {
1539
    /* only a single byte available for auto-detection */
1540
#ifndef XML_DTD /* FIXME */
1541
    /* a well-formed document entity must have more than one byte */
1542
    if (state != XML_CONTENT_STATE)
1543
      return XML_TOK_PARTIAL;
1544
#endif
1545
    /* so we're parsing an external text entity... */
1546
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
1547
11
    switch (INIT_ENC_INDEX(enc)) {
1548
0
    case UTF_16_ENC:
1549
0
    case UTF_16LE_ENC:
1550
0
    case UTF_16BE_ENC:
1551
0
      return XML_TOK_PARTIAL;
1552
11
    }
1553
11
    switch ((unsigned char)*ptr) {
1554
0
    case 0xFE:
1555
4
    case 0xFF:
1556
4
    case 0xEF: /* possibly first byte of UTF-8 BOM */
1557
4
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1558
0
        break;
1559
      /* fall through */
1560
4
    case 0x00:
1561
7
    case 0x3C:
1562
7
      return XML_TOK_PARTIAL;
1563
11
    }
1564
3.05k
  } else {
1565
3.05k
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1566
0
    case 0xFEFF:
1567
0
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1568
0
        break;
1569
0
      *nextTokPtr = ptr + 2;
1570
0
      *encPtr = encodingTable[UTF_16BE_ENC];
1571
0
      return XML_TOK_BOM;
1572
    /* 00 3C is handled in the default case */
1573
149
    case 0x3C00:
1574
149
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1575
149
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1576
0
          && state == XML_CONTENT_STATE)
1577
0
        break;
1578
149
      *encPtr = encodingTable[UTF_16LE_ENC];
1579
149
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1580
29
    case 0xFFFE:
1581
29
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1582
0
        break;
1583
29
      *nextTokPtr = ptr + 2;
1584
29
      *encPtr = encodingTable[UTF_16LE_ENC];
1585
29
      return XML_TOK_BOM;
1586
0
    case 0xEFBB:
1587
      /* Maybe a UTF-8 BOM (EF BB BF) */
1588
      /* If there's an explicitly specified (external) encoding
1589
         of ISO-8859-1 or some flavour of UTF-16
1590
         and this is an external text entity,
1591
         don't look for the BOM,
1592
         because it might be a legal data.
1593
      */
1594
0
      if (state == XML_CONTENT_STATE) {
1595
0
        int e = INIT_ENC_INDEX(enc);
1596
0
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1597
0
            || e == UTF_16_ENC)
1598
0
          break;
1599
0
      }
1600
0
      if (ptr + 2 == end)
1601
0
        return XML_TOK_PARTIAL;
1602
0
      if ((unsigned char)ptr[2] == 0xBF) {
1603
0
        *nextTokPtr = ptr + 3;
1604
0
        *encPtr = encodingTable[UTF_8_ENC];
1605
0
        return XML_TOK_BOM;
1606
0
      }
1607
0
      break;
1608
2.87k
    default:
1609
2.87k
      if (ptr[0] == '\0') {
1610
        /* 0 isn't a legal data character. Furthermore a document
1611
           entity can only start with ASCII characters.  So the only
1612
           way this can fail to be big-endian UTF-16 if it it's an
1613
           external parsed general entity that's labelled as
1614
           UTF-16LE.
1615
        */
1616
279
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1617
0
          break;
1618
279
        *encPtr = encodingTable[UTF_16BE_ENC];
1619
279
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1620
2.59k
      } else if (ptr[1] == '\0') {
1621
        /* We could recover here in the case:
1622
            - parsing an external entity
1623
            - second byte is 0
1624
            - no externally specified encoding
1625
            - no encoding declaration
1626
           by assuming UTF-16LE.  But we don't, because this would mean when
1627
           presented just with a single byte, we couldn't reliably determine
1628
           whether we needed further bytes.
1629
        */
1630
205
        if (state == XML_CONTENT_STATE)
1631
0
          break;
1632
205
        *encPtr = encodingTable[UTF_16LE_ENC];
1633
205
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1634
205
      }
1635
2.38k
      break;
1636
3.05k
    }
1637
3.05k
  }
1638
2.39k
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1639
2.39k
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1640
3.06k
}
1641
1642
6.14k
/* First inclusion of "xmltok_ns.c": NS(x) and ns(x) both expand to x
   unchanged, so the identifiers that file builds with these macros keep
   their plain names.  XML_TOK_NS_C is defined only for the duration of
   the include -- presumably as a guard checked by that file (confirm
   there). */
#define NS(x) x
1643
0
#define ns(x) x
1644
#define XML_TOK_NS_C
1645
#include "xmltok_ns.c"
1646
#undef XML_TOK_NS_C
1647
#undef NS
1648
#undef ns
1649
1650
#ifdef XML_NS
1651
1652
9.21k
/* Second inclusion of "xmltok_ns.c" (inside #ifdef XML_NS): NS(x)
   appends "NS" and ns(x) appends "_ns" to the identifier, so the same
   file yields a second, differently named set of definitions. */
#  define NS(x) x##NS
1653
3.07k
#  define ns(x) x##_ns
1654
1655
#  define XML_TOK_NS_C
1656
#  include "xmltok_ns.c"
1657
#  undef XML_TOK_NS_C
1658
1659
#  undef NS
1660
#  undef ns
1661
1662
ENCODING *
1663
XmlInitUnknownEncodingNS(void *mem, const int *table, CONVERTER convert,
1664
0
                         void *userData) {
1665
0
  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1666
0
  if (enc)
1667
0
    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1668
0
  return enc;
1669
0
}
1670
1671
#endif /* XML_NS */