Coverage Report

Created: 2026-05-31 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libexpat/expat/lib/xmltok.c
Line
Count
Source
1
/*
2
                            __  __            _
3
                         ___\ \/ /_ __   __ _| |_
4
                        / _ \\  /| '_ \ / _` | __|
5
                       |  __//  \| |_) | (_| | |_
6
                        \___/_/\_\ .__/ \__,_|\__|
7
                                 |_| XML parser
8
9
   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10
   Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11
   Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12
   Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
13
   Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14
   Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15
   Copyright (c) 2016-2024 Sebastian Pipping <sebastian@pipping.org>
16
   Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
17
   Copyright (c) 2016      Don Lewis <truckman@apache.org>
18
   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
19
   Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
20
   Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
21
   Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
22
   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
23
   Copyright (c) 2021      Donghee Na <donghee.na@python.org>
24
   Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
25
   Copyright (c) 2022      Sean McBride <sean@rogue-research.com>
26
   Copyright (c) 2023      Hanno Böck <hanno@gentoo.org>
27
   Copyright (c) 2025      Alfonso Gregory <gfunni234@gmail.com>
28
   Licensed under the MIT license:
29
30
   Permission is  hereby granted,  free of charge,  to any  person obtaining
31
   a  copy  of  this  software   and  associated  documentation  files  (the
32
   "Software"),  to  deal in  the  Software  without restriction,  including
33
   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
34
   distribute, sublicense, and/or sell copies of the Software, and to permit
35
   persons  to whom  the Software  is  furnished to  do so,  subject to  the
36
   following conditions:
37
38
   The above copyright  notice and this permission notice  shall be included
39
   in all copies or substantial portions of the Software.
40
41
   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
42
   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
43
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
44
   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
45
   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
46
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
47
   USE OR OTHER DEALINGS IN THE SOFTWARE.
48
*/
49
50
#include "expat_config.h"
51
52
#include <stddef.h>
53
#include <string.h> /* memcpy */
54
#include <stdbool.h>
55
56
#ifdef _WIN32
57
#  include "winconfig.h"
58
#endif
59
60
#include "internal.h"
61
#include "fallthrough.h"
62
#include "xmltok.h"
63
#include "nametab.h"
64
65
/* Optional trailing entry for the first initializer group of VTABLE1:
   with XML_DTD defined it expands to a leading comma followed by
   PREFIX(ignoreSectionTok); otherwise it expands to nothing. */
#ifdef XML_DTD
66
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
67
#else
68
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
69
#endif
70
71
/* Initializer fragment naming the PREFIX()-qualified tokenizer and helper
   functions.  It forms the leading part of the ENCODING initializers used
   by the encoding tables in this file, which append their own toUtf8 and
   toUtf16 converters after it.  PREFIX must be defined at the point of
   expansion. */
#define VTABLE1                                                                \
72
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
73
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
74
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
75
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
76
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
77
      PREFIX(updatePosition), PREFIX(isPublicId)
78
79
/* VTABLE1 followed by the PREFIX()-qualified toUtf8 and toUtf16 entries. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
80
81
/* Looks up the naming bit of the character whose high byte is `hi` and
   whose low byte is `lo`: (pages)[hi] selects a group of 8 words in
   namingBitmap, the upper 3 bits of `lo` pick the word inside that group
   and the lower 5 bits of `lo` pick the bit inside the word.  Evaluates to
   non-zero when the bit is set.

   Every argument is parenthesized so that arbitrary expressions (for
   example `table + 1`) can be passed; `lo` is evaluated twice, so it must
   not have side effects. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
83
84
/* A 2 byte UTF-8 representation splits the character's 11 bits between
85
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
86
   pages, 3 bits to add to that index and 5 bits to generate the mask.
87
*/
88
/* Evaluates to non-zero when the naming bit is set for the character
   encoded by the two bytes at `byte`.  `byte` is indexed at [0] and [1];
   it is expanded several times, so it must be free of side effects.
   NOTE(review): the masks assume `byte` points to unsigned char, as the
   callers in this file arrange with a cast -- confirm for new callers. */
#define UTF8_GET_NAMING2(pages, byte)                                          \
89
1.27k
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
90
1.27k
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
91
1.27k
   & (1u << (((byte)[1]) & 0x1F)))
92
93
/* A 3 byte UTF-8 representation splits the character's 16 bits between
94
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
95
   into pages, 3 bits to add to that index and 5 bits to generate the
96
   mask.
97
*/
98
/* Evaluates to non-zero when the naming bit is set for the character
   encoded by the three bytes at `byte`.  `byte` is indexed at [0], [1] and
   [2]; it is expanded several times, so it must be free of side effects.
   NOTE(review): the shifts assume `byte` points to unsigned char, as the
   callers in this file arrange with a cast -- confirm for new callers. */
#define UTF8_GET_NAMING3(pages, byte)                                          \
99
1.82k
  (namingBitmap                                                                \
100
1.82k
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
101
1.82k
         << 3)                                                                 \
102
1.82k
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
103
1.82k
   & (1u << (((byte)[2]) & 0x1F)))
104
105
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
106
   of Unicode 3.2: https://www.unicode.org/unicode/reports/tr28/
107
   with the additional restriction of not allowing the Unicode
108
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
109
   Implementation details:
110
     (A & 0x80) == 0     means A < 0x80
111
   and
112
     (A & 0xC0) == 0xC0  means A > 0xBF
113
*/
114
115
/* Evaluates to non-zero when the two bytes at p are not a valid 2-byte
   UTF-8 sequence: the lead byte must be at least 0xC2 and the second byte
   must lie in 0x80..0xBF.

   p must point to unsigned char.  The argument is fully parenthesized, so
   pointer expressions such as `buf + 1` are handled correctly; it is
   expanded several times and must be free of side effects. */
#define UTF8_INVALID2(p)                                                       \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
117
118
/* Evaluates to non-zero when the three bytes at p are not an acceptable
   3-byte UTF-8 sequence:
     - the third byte must lie in 0x80..0xBF, and for EF,BF,xx it must not
       exceed 0xBD (this rejects the code points 0xFFFE and 0xFFFF);
     - after a lead byte of 0xE0 the second byte must lie in 0xA0..0xBF;
     - after a lead byte of 0xED the second byte must lie in 0x80..0x9F;
     - otherwise the second byte must lie in 0x80..0xBF.

   p must point to unsigned char.  The argument is fully parenthesized, so
   pointer expressions such as `buf + 1` are handled correctly; it is
   expanded several times and must be free of side effects. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((*(p)) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((*(p)) == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
126
127
/* Evaluates to non-zero when the four bytes at p are not a valid 4-byte
   UTF-8 sequence:
     - the third and fourth bytes must lie in 0x80..0xBF;
     - after a lead byte of 0xF0 the second byte must lie in 0x90..0xBF;
     - after a lead byte of 0xF4 the second byte must lie in 0x80..0x8F;
     - otherwise the second byte must lie in 0x80..0xBF.

   p must point to unsigned char.  The argument is fully parenthesized, so
   pointer expressions such as `buf + 1` are handled correctly; it is
   expanded several times and must be free of side effects. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((*(p)) == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
134
135
static int PTRFASTCALL
136
189
isNever(const ENCODING *enc, const char *p) {
137
189
  UNUSED_P(enc);
138
189
  UNUSED_P(p);
139
189
  return 0;
140
189
}
141
142
static int PTRFASTCALL
143
977
utf8_isName2(const ENCODING *enc, const char *p) {
144
977
  UNUSED_P(enc);
145
977
  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
146
977
}
147
148
static int PTRFASTCALL
149
1.64k
utf8_isName3(const ENCODING *enc, const char *p) {
150
1.64k
  UNUSED_P(enc);
151
1.64k
  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
152
1.64k
}
153
154
#define utf8_isName4 isNever
155
156
static int PTRFASTCALL
157
301
utf8_isNmstrt2(const ENCODING *enc, const char *p) {
158
301
  UNUSED_P(enc);
159
301
  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
160
301
}
161
162
static int PTRFASTCALL
163
179
utf8_isNmstrt3(const ENCODING *enc, const char *p) {
164
179
  UNUSED_P(enc);
165
179
  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
166
179
}
167
168
#define utf8_isNmstrt4 isNever
169
170
static int PTRFASTCALL
171
4.62k
utf8_isInvalid2(const ENCODING *enc, const char *p) {
172
4.62k
  UNUSED_P(enc);
173
4.62k
  return UTF8_INVALID2((const unsigned char *)p);
174
4.62k
}
175
176
static int PTRFASTCALL
177
7.75k
utf8_isInvalid3(const ENCODING *enc, const char *p) {
178
7.75k
  UNUSED_P(enc);
179
7.75k
  return UTF8_INVALID3((const unsigned char *)p);
180
7.75k
}
181
182
static int PTRFASTCALL
183
2.37k
utf8_isInvalid4(const ENCODING *enc, const char *p) {
184
2.37k
  UNUSED_P(enc);
185
2.37k
  return UTF8_INVALID4((const unsigned char *)p);
186
2.37k
}
187
188
/* An ENCODING extended with a per-byte type table and per-encoding
   character-class callbacks.  The ENCODING member comes first, and
   AS_NORMAL_ENCODING / SB_BYTE_TYPE cast an ENCODING pointer to this
   struct to reach the extra members. */
struct normal_encoding {
189
  ENCODING enc;
190
  /* Byte type for each possible byte value; indexed by SB_BYTE_TYPE. */
  unsigned char type[256];
191
#ifdef XML_MIN_SIZE
192
  /* Callbacks that exist only in XML_MIN_SIZE builds; they are filled in
     by STANDARD_VTABLE and reached through the BYTE_TYPE, BYTE_TO_ASCII,
     CHAR_MATCHES and IS_*_CHAR_MINBPC macros. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
193
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
194
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
195
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
196
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
197
#endif /* XML_MIN_SIZE */
198
  /* Name / name-start / invalid-sequence tests; the numeric suffix is the
     suffix pasted by IS_NAME_CHAR, IS_NMSTRT_CHAR and IS_INVALID_CHAR.
     Filled in by NORMAL_VTABLE or set to NULL by NULL_VTABLE. */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
199
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
200
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
201
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
202
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
203
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
204
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
205
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
206
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
207
};
208
209
18.0k
/* Views an ENCODING pointer as a pointer to const struct normal_encoding. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
210
211
/* STANDARD_VTABLE(E): initializers for the XML_MIN_SIZE-only callbacks of
   struct normal_encoding, named by pasting prefix E onto each member name
   and ending with a trailing comma; expands to nothing in other builds. */
#ifdef XML_MIN_SIZE
212
213
#  define STANDARD_VTABLE(E)                                                   \
214
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
215
216
#else
217
218
#  define STANDARD_VTABLE(E) /* as nothing */
219
220
#endif
221
222
/* NORMAL_VTABLE(E): initializers for the nine isName / isNmstrt / isInvalid
   callbacks of struct normal_encoding, named by pasting prefix E onto each
   member name, in member order. */
#define NORMAL_VTABLE(E)                                                       \
223
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
224
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
225
226
/* NULL_VTABLE: sets the same nine callbacks to NULL. */
#define NULL_VTABLE                                                            \
227
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
228
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
229
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
230
231
static int FASTCALL checkCharRefNumber(int result);
232
233
#include "xmltok_impl.h"
234
#include "ascii.h"
235
236
/* In XML_MIN_SIZE builds the sb_ prefixed isNameMin / isNmstrtMin table
   entries (see STANDARD_VTABLE) are both isNever. */
#ifdef XML_MIN_SIZE
237
#  define sb_isNameMin isNever
238
#  define sb_isNmstrtMin isNever
239
#endif
240
241
/* MINBPC(enc): read from the encoding in XML_MIN_SIZE builds; otherwise the
   constant 1 for the tokenizer instantiated further down with
   PREFIX(ident) = normal_##ident. */
#ifdef XML_MIN_SIZE
242
#  define MINBPC(enc) ((enc)->minBytesPerChar)
243
#else
244
/* minimum bytes per character */
245
78.3M
#  define MINBPC(enc) 1
246
#endif
247
248
/* Byte type of the single byte at p: the byte value, taken as unsigned
   char, indexes the type[] table of the normal_encoding behind enc. */
#define SB_BYTE_TYPE(enc, p)                                                   \
249
53.5M
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
250
251
/* BYTE_TYPE(enc, p): in XML_MIN_SIZE builds this calls the encoding's
   byteType callback (sb_byteType is the table-lookup implementation);
   otherwise it is SB_BYTE_TYPE directly. */
#ifdef XML_MIN_SIZE
252
static int PTRFASTCALL
253
sb_byteType(const ENCODING *enc, const char *p) {
254
  return SB_BYTE_TYPE(enc, p);
255
}
256
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
257
#else
258
53.4M
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
259
#endif
260
261
/* BYTE_TO_ASCII(enc, p): in XML_MIN_SIZE builds this calls the encoding's
   byteToAscii callback (sb_byteToAscii returns the byte at p unchanged);
   otherwise it is simply *(p). */
#ifdef XML_MIN_SIZE
262
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
263
static int PTRFASTCALL
264
sb_byteToAscii(const ENCODING *enc, const char *p) {
265
  UNUSED_P(enc);
266
  return *p;
267
}
268
#else
269
192k
#  define BYTE_TO_ASCII(enc, p) (*(p))
270
#endif
271
272
2.72k
/* IS_NAME_CHAR / IS_NMSTRT_CHAR (enc, p, n): call the isName<n> /
   isNmstrt<n> callback of the normal_encoding behind enc; n is pasted onto
   the member name. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
273
570
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
274
/* IS_INVALID_CHAR(enc, p, n): calls the isInvalid<n> callback.  The
   XML_MIN_SIZE variant first checks that the callback pointer is non-NULL;
   the other variant calls it unconditionally.
   NOTE(review): NULL_VTABLE installs NULL for these callbacks, so the
   unconditional variant presumably relies on such encodings never reaching
   this macro -- confirm against the byte-type tables. */
#ifdef XML_MIN_SIZE
275
#  define IS_INVALID_CHAR(enc, p, n)                                           \
276
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
277
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
278
#else
279
#  define IS_INVALID_CHAR(enc, p, n)                                           \
280
18.7k
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
281
#endif
282
283
/* IS_NAME_CHAR_MINBPC / IS_NMSTRT_CHAR_MINBPC: call the isNameMin /
   isNmstrtMin callbacks in XML_MIN_SIZE builds; otherwise the constant 0. */
#ifdef XML_MIN_SIZE
284
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
285
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
286
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
287
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
288
#else
289
0
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
290
0
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
291
#endif
292
293
/* CHAR_MATCHES(enc, p, c): true when the character at p equals c.  In
   XML_MIN_SIZE builds this goes through the encoding's charMatches
   callback (sb_charMatches compares the single byte at p with c);
   otherwise it is a direct byte comparison. */
#ifdef XML_MIN_SIZE
294
#  define CHAR_MATCHES(enc, p, c)                                              \
295
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
296
static int PTRCALL
297
sb_charMatches(const ENCODING *enc, const char *p, int c) {
298
  UNUSED_P(enc);
299
  return *p == c;
300
}
301
#else
302
/* c is an ASCII character */
303
34.3k
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
304
#endif
305
306
414k
/* Instantiate the tokenizer implementation from xmltok_impl.c with the
   macros defined above; every PREFIX(name) in it becomes normal_name.
   XML_TOK_IMPL_C is defined only for the duration of the include. */
#define PREFIX(ident) normal_##ident
307
#define XML_TOK_IMPL_C
308
#include "xmltok_impl.c"
309
#undef XML_TOK_IMPL_C
310
311
/* Drop the per-instantiation macros so they can be defined again later. */
#undef MINBPC
312
#undef BYTE_TYPE
313
#undef BYTE_TO_ASCII
314
#undef CHAR_MATCHES
315
#undef IS_NAME_CHAR
316
#undef IS_NAME_CHAR_MINBPC
317
#undef IS_NMSTRT_CHAR
318
#undef IS_NMSTRT_CHAR_MINBPC
319
#undef IS_INVALID_CHAR
320
321
/* UTF8_cvalN is the value of the masked first byte of an N-byte UTF-8
   sequence. */
enum {
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xC0,
  UTF8_cval3 = 0xE0,
  UTF8_cval4 = 0xF0
};
327
328
/* Classifies one byte of UTF-8 text.  Returns the total length in bytes
   (2, 3 or 4) of the sequence that the byte introduces, 1 for a single-byte
   character (0b0xxxxxxx), and 0 for any other byte (for example a
   continuation byte). */
static size_t
utf8_sequence_length_from_lead(unsigned char byte) {
  if ((byte & 0xf8u) == 0xf0u) /* 0b11110xxx */
    return 4;
  if ((byte & 0xf0u) == 0xe0u) /* 0b1110xxxx */
    return 3;
  if ((byte & 0xe0u) == 0xc0u) /* 0b110xxxxx */
    return 2;
  if ((byte & 0x80u) == 0x00u) /* 0b0xxxxxxx */
    return 1;
  return 0;
}

/* Moves *fromLimRef backwards, if necessary, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   from        start of the byte range
   fromLimRef  in: one past the last byte of the range;
               out: the same position or an earlier one

   The range is scanned backwards from its end.  A lead byte that has all
   of its bytes inside the range puts the limit right after that character;
   a lead byte that does not is dropped and the scan continues before it.
   A single-byte character ends the scan with the limit right after it. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *fromLim = *fromLimRef;
  size_t walked = 0; /* bytes stepped over since the last reset */

  for (; fromLim > from; fromLim--, walked++) {
    const size_t seqLen
        = utf8_sequence_length_from_lead((unsigned char)fromLim[-1]);

    if (seqLen == 0)
      continue; /* not a lead byte: keep walking backwards */

    if (seqLen == 1)
      break; /* single-byte character: the limit is already fine */

    if (walked + 1 >= seqLen) {
      /* The character is complete: put the limit right after it. */
      fromLim += seqLen - 1;
      break;
    }

    /* Incomplete character: drop it and restart the count. */
    walked = 0;
  }
  *fromLimRef = fromLim;
}
366
367
static enum XML_Convert_Result PTRCALL
368
utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
369
708k
            char **toP, const char *toLim) {
370
708k
  bool input_incomplete = false;
371
708k
  bool output_exhausted = false;
372
373
  /* Avoid copying partial characters (due to limited space). */
374
708k
  const ptrdiff_t bytesAvailable = fromLim - *fromP;
375
708k
  const ptrdiff_t bytesStorable = toLim - *toP;
376
708k
  UNUSED_P(enc);
377
708k
  if (bytesAvailable > bytesStorable) {
378
2.68k
    fromLim = *fromP + bytesStorable;
379
2.68k
    output_exhausted = true;
380
2.68k
  }
381
382
  /* Avoid copying partial characters (from incomplete input). */
383
708k
  {
384
708k
    const char *const fromLimBefore = fromLim;
385
708k
    _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
386
708k
    if (fromLim < fromLimBefore) {
387
18
      input_incomplete = true;
388
18
    }
389
708k
  }
390
391
708k
  {
392
708k
    const ptrdiff_t bytesToCopy = fromLim - *fromP;
393
708k
    memcpy(*toP, *fromP, bytesToCopy);
394
708k
    *fromP += bytesToCopy;
395
708k
    *toP += bytesToCopy;
396
708k
  }
397
398
708k
  if (output_exhausted) /* needs to go first */
399
2.68k
    return XML_CONVERT_OUTPUT_EXHAUSTED;
400
705k
  else if (input_incomplete)
401
0
    return XML_CONVERT_INPUT_INCOMPLETE;
402
705k
  else
403
705k
    return XML_CONVERT_COMPLETED;
404
708k
}
405
406
static enum XML_Convert_Result PTRCALL
407
utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
408
0
             unsigned short **toP, const unsigned short *toLim) {
409
0
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
410
0
  unsigned short *to = *toP;
411
0
  const char *from = *fromP;
412
0
  while (from < fromLim && to < toLim) {
413
0
    switch (SB_BYTE_TYPE(enc, from)) {
414
0
    case BT_LEAD2:
415
0
      if (fromLim - from < 2) {
416
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
417
0
        goto after;
418
0
      }
419
0
      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
420
0
      from += 2;
421
0
      break;
422
0
    case BT_LEAD3:
423
0
      if (fromLim - from < 3) {
424
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
425
0
        goto after;
426
0
      }
427
0
      *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
428
0
                               | (from[2] & 0x3f));
429
0
      from += 3;
430
0
      break;
431
0
    case BT_LEAD4: {
432
0
      unsigned long n;
433
0
      if (toLim - to < 2) {
434
0
        res = XML_CONVERT_OUTPUT_EXHAUSTED;
435
0
        goto after;
436
0
      }
437
0
      if (fromLim - from < 4) {
438
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
439
0
        goto after;
440
0
      }
441
0
      n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
442
0
          | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
443
0
      n -= 0x10000;
444
0
      to[0] = (unsigned short)((n >> 10) | 0xD800);
445
0
      to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
446
0
      to += 2;
447
0
      from += 4;
448
0
    } break;
449
0
    default:
450
0
      *to++ = *from++;
451
0
      break;
452
0
    }
453
0
  }
454
0
  if (from < fromLim)
455
0
    res = XML_CONVERT_OUTPUT_EXHAUSTED;
456
0
after:
457
0
  *fromP = from;
458
0
  *toP = to;
459
0
  return res;
460
0
}
461
462
/* UTF-8 encoding table, built only when XML_NS is defined.  The byte-type
   table comes from asciitab.h followed by utf8tab.h, included without
   redefining BT_COLON (the table without the _ns suffix maps BT_COLON to
   BT_NMSTRT instead).
   NOTE(review): the three integers after the converters initialize
   ENCODING members declared in xmltok.h, which is not visible here --
   confirm their meaning there. */
#ifdef XML_NS
463
static const struct normal_encoding utf8_encoding_ns
464
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
465
       {
466
#  include "asciitab.h"
467
#  include "utf8tab.h"
468
       },
469
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
470
#endif
471
472
/* UTF-8 encoding table.  The byte-type table comes from asciitab.h followed
   by utf8tab.h; BT_COLON is defined as BT_NMSTRT while asciitab.h is
   included, so entries spelled BT_COLON there become BT_NMSTRT here. */
static const struct normal_encoding utf8_encoding
473
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
474
       {
475
#define BT_COLON BT_NMSTRT
476
#include "asciitab.h"
477
#undef BT_COLON
478
#include "utf8tab.h"
479
       },
480
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
481
482
/* Same layout as utf8_encoding_ns, but the first part of the byte-type
   table comes from iasciitab.h instead of asciitab.h.  Built only when
   XML_NS is defined. */
#ifdef XML_NS
483
484
static const struct normal_encoding internal_utf8_encoding_ns
485
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
486
       {
487
#  include "iasciitab.h"
488
#  include "utf8tab.h"
489
       },
490
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
491
492
#endif
493
494
/* Same layout as utf8_encoding, but the first part of the byte-type table
   comes from iasciitab.h instead of asciitab.h.  BT_COLON is defined as
   BT_NMSTRT while iasciitab.h is included. */
static const struct normal_encoding internal_utf8_encoding
495
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
496
       {
497
#define BT_COLON BT_NMSTRT
498
#include "iasciitab.h"
499
#undef BT_COLON
500
#include "utf8tab.h"
501
       },
502
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
503
504
static enum XML_Convert_Result PTRCALL
505
latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
506
0
              char **toP, const char *toLim) {
507
0
  UNUSED_P(enc);
508
0
  for (;;) {
509
0
    unsigned char c;
510
0
    if (*fromP == fromLim)
511
0
      return XML_CONVERT_COMPLETED;
512
0
    c = (unsigned char)**fromP;
513
0
    if (c & 0x80) {
514
0
      if (toLim - *toP < 2)
515
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
516
0
      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
517
0
      *(*toP)++ = (char)((c & 0x3f) | 0x80);
518
0
      (*fromP)++;
519
0
    } else {
520
0
      if (*toP == toLim)
521
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
522
0
      *(*toP)++ = *(*fromP)++;
523
0
    }
524
0
  }
525
0
}
526
527
static enum XML_Convert_Result PTRCALL
528
latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
529
0
               unsigned short **toP, const unsigned short *toLim) {
530
0
  UNUSED_P(enc);
531
0
  while (*fromP < fromLim && *toP < toLim)
532
0
    *(*toP)++ = (unsigned char)*(*fromP)++;
533
534
0
  if ((*toP == toLim) && (*fromP < fromLim))
535
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
536
0
  else
537
0
    return XML_CONVERT_COMPLETED;
538
0
}
539
540
/* Latin-1 encoding table, built only when XML_NS is defined.  The byte-type
   table comes from asciitab.h followed by latin1tab.h, included without
   redefining BT_COLON.  The nine isName / isNmstrt / isInvalid callbacks
   are NULL (NULL_VTABLE). */
#ifdef XML_NS
541
542
static const struct normal_encoding latin1_encoding_ns
543
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
544
       {
545
#  include "asciitab.h"
546
#  include "latin1tab.h"
547
       },
548
       STANDARD_VTABLE(sb_) NULL_VTABLE};
549
550
#endif
551
552
/* Latin-1 encoding table.  The byte-type table comes from asciitab.h
   followed by latin1tab.h; BT_COLON is defined as BT_NMSTRT while
   asciitab.h is included.  The nine isName / isNmstrt / isInvalid
   callbacks are NULL (NULL_VTABLE). */
static const struct normal_encoding latin1_encoding
553
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
554
       {
555
#define BT_COLON BT_NMSTRT
556
#include "asciitab.h"
557
#undef BT_COLON
558
#include "latin1tab.h"
559
       },
560
       STANDARD_VTABLE(sb_) NULL_VTABLE};
561
562
static enum XML_Convert_Result PTRCALL
563
ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
564
0
             char **toP, const char *toLim) {
565
0
  UNUSED_P(enc);
566
0
  while (*fromP < fromLim && *toP < toLim)
567
0
    *(*toP)++ = *(*fromP)++;
568
569
0
  if ((*toP == toLim) && (*fromP < fromLim))
570
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
571
0
  else
572
0
    return XML_CONVERT_COMPLETED;
573
0
}
574
575
/* ASCII encoding table, built only when XML_NS is defined.  Only
   asciitab.h is included for the byte-type table, so the remaining
   entries of type[256] are zero-initialized, which is BT_NONXML (see the
   comment inside the initializer).  Conversion to UTF-16 reuses
   latin1_toUtf16.  The nine isName / isNmstrt / isInvalid callbacks are
   NULL (NULL_VTABLE). */
#ifdef XML_NS
576
577
static const struct normal_encoding ascii_encoding_ns
578
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
579
       {
580
#  include "asciitab.h"
581
           /* BT_NONXML == 0 */
582
       },
583
       STANDARD_VTABLE(sb_) NULL_VTABLE};
584
585
#endif
586
587
/* ASCII encoding table.  Only asciitab.h is included for the byte-type
   table (with BT_COLON defined as BT_NMSTRT), so the remaining entries of
   type[256] are zero-initialized, which is BT_NONXML (see the comment
   inside the initializer).  Conversion to UTF-16 reuses latin1_toUtf16.
   The nine isName / isNmstrt / isInvalid callbacks are NULL
   (NULL_VTABLE). */
static const struct normal_encoding ascii_encoding
588
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
589
       {
590
#define BT_COLON BT_NMSTRT
591
#include "asciitab.h"
592
#undef BT_COLON
593
           /* BT_NONXML == 0 */
594
       },
595
       STANDARD_VTABLE(sb_) NULL_VTABLE};
596
597
static int PTRFASTCALL
598
157k
unicode_byte_type(char hi, char lo) {
599
157k
  switch ((unsigned char)hi) {
600
  /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
601
2.38k
  case 0xD8:
602
3.38k
  case 0xD9:
603
4.69k
  case 0xDA:
604
6.19k
  case 0xDB:
605
6.19k
    return BT_LEAD4;
606
  /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
607
389
  case 0xDC:
608
740
  case 0xDD:
609
1.10k
  case 0xDE:
610
1.54k
  case 0xDF:
611
1.54k
    return BT_TRAIL;
612
3.90k
  case 0xFF:
613
3.90k
    switch ((unsigned char)lo) {
614
747
    case 0xFF: /* noncharacter-FFFF */
615
767
    case 0xFE: /* noncharacter-FFFE */
616
767
      return BT_NONXML;
617
3.90k
    }
618
3.14k
    break;
619
157k
  }
620
149k
  return BT_NONASCII;
621
157k
}
622
623
/* Defines E##toUtf8(), a UTF-16 to UTF-8 converter.  The byte order comes
   from whichever GET_LO()/GET_HI() macros are in effect where the macro is
   instantiated.

   Only whole 16-bit units are processed; a trailing odd byte is never
   consumed.  On return *fromP points at the first unit that was not
   converted and *toP just past the last byte written.  The result is
   XML_CONVERT_OUTPUT_EXHAUSTED when the next character does not fit into
   [*toP, toLim), XML_CONVERT_INPUT_INCOMPLETE when a high surrogate is not
   followed by another unit within the input, and XML_CONVERT_COMPLETED
   otherwise.  The unit following a high surrogate is not validated here. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *src = *fromP;                                                  \
    UNUSED_P(enc);                                                             \
    fromLim = src + (((fromLim - src) >> 1) << 1); /* whole units only */      \
    while (src < fromLim) {                                                    \
      const unsigned char lo = GET_LO(src);                                    \
      const unsigned char hi = GET_HI(src);                                    \
      if (hi == 0 && lo < 0x80) {                                              \
        /* 7 bits: copied through as a single byte */                          \
        if (*toP == toLim) {                                                   \
          *fromP = src;                                                        \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = lo;                                                        \
      } else if (hi < 0x8) {                                                   \
        /* 11 bits divided 5, 6 amongst 2 bytes */                             \
        if (toLim - *toP < 2) {                                                \
          *fromP = src;                                                        \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
      } else if (hi >= 0xD8 && hi <= 0xDB) {                                   \
        /* High surrogate: merged with the following unit into 4 bytes */      \
        int plane;                                                             \
        unsigned char lo2;                                                     \
        if (toLim - *toP < 4) {                                                \
          *fromP = src;                                                        \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - src < 4) {                                               \
          *fromP = src;                                                        \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        src += 2; /* step onto the second unit of the pair */                  \
        lo2 = GET_LO(src);                                                     \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(src) & 0x3) << 2)            \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
      } else {                                                                 \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        if (toLim - *toP < 3) {                                                \
          *fromP = src;                                                        \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
      }                                                                        \
      src += 2;                                                                \
    }                                                                          \
    *fromP = src;                                                              \
    if (src < fromLim)                                                         \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    return XML_CONVERT_COMPLETED;                                              \
  }
xmltok.c:little2_toUtf8
Line
Count
Source
626
990
      char **toP, const char *toLim) {                                         \
627
990
    const char *from = *fromP;                                                 \
628
990
    UNUSED_P(enc);                                                             \
629
990
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
630
8.25k
    for (; from < fromLim; from += 2) {                                        \
631
7.34k
      int plane;                                                               \
632
7.34k
      unsigned char lo2;                                                       \
633
7.34k
      unsigned char lo = GET_LO(from);                                         \
634
7.34k
      unsigned char hi = GET_HI(from);                                         \
635
7.34k
      switch (hi) {                                                            \
636
682
      case 0:                                                                  \
637
682
        if (lo < 0x80) {                                                       \
638
496
          if (*toP == toLim) {                                                 \
639
8
            *fromP = from;                                                     \
640
8
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
641
8
          }                                                                    \
642
496
          *(*toP)++ = lo;                                                      \
643
488
          break;                                                               \
644
496
        }                                                                      \
645
682
        EXPAT_FALLTHROUGH;                                                     \
646
826
      case 0x1:                                                                \
647
839
      case 0x2:                                                                \
648
860
      case 0x3:                                                                \
649
876
      case 0x4:                                                                \
650
890
      case 0x5:                                                                \
651
907
      case 0x6:                                                                \
652
907
      case 0x7:                                                                \
653
907
        if (toLim - *toP < 2) {                                                \
654
19
          *fromP = from;                                                       \
655
19
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
656
19
        }                                                                      \
657
907
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
658
888
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
659
888
        break;                                                                 \
660
5.94k
      default:                                                                 \
661
5.94k
        if (toLim - *toP < 3) {                                                \
662
59
          *fromP = from;                                                       \
663
59
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
664
59
        }                                                                      \
665
5.94k
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
666
5.94k
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
667
5.88k
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
668
5.88k
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
669
5.88k
        break;                                                                 \
670
5.94k
      case 0xD8:                                                               \
671
0
      case 0xD9:                                                               \
672
0
      case 0xDA:                                                               \
673
0
      case 0xDB:                                                               \
674
0
        if (toLim - *toP < 4) {                                                \
675
0
          *fromP = from;                                                       \
676
0
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
677
0
        }                                                                      \
678
0
        if (fromLim - from < 4) {                                              \
679
0
          *fromP = from;                                                       \
680
0
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
681
0
        }                                                                      \
682
0
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
683
0
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
684
0
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
685
0
        from += 2;                                                             \
686
0
        lo2 = GET_LO(from);                                                    \
687
0
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
688
0
                     | (lo2 >> 6) | 0x80);                                     \
689
0
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
690
0
        break;                                                                 \
691
7.34k
      }                                                                        \
692
7.34k
    }                                                                          \
693
990
    *fromP = from;                                                             \
694
904
    if (from < fromLim)                                                        \
695
904
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
696
904
    else                                                                       \
697
904
      return XML_CONVERT_COMPLETED;                                            \
698
904
  }
xmltok.c:big2_toUtf8
Line
Count
Source
626
1.01k
      char **toP, const char *toLim) {                                         \
627
1.01k
    const char *from = *fromP;                                                 \
628
1.01k
    UNUSED_P(enc);                                                             \
629
1.01k
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
630
6.10k
    for (; from < fromLim; from += 2) {                                        \
631
5.17k
      int plane;                                                               \
632
5.17k
      unsigned char lo2;                                                       \
633
5.17k
      unsigned char lo = GET_LO(from);                                         \
634
5.17k
      unsigned char hi = GET_HI(from);                                         \
635
5.17k
      switch (hi) {                                                            \
636
688
      case 0:                                                                  \
637
688
        if (lo < 0x80) {                                                       \
638
638
          if (*toP == toLim) {                                                 \
639
6
            *fromP = from;                                                     \
640
6
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
641
6
          }                                                                    \
642
638
          *(*toP)++ = lo;                                                      \
643
632
          break;                                                               \
644
638
        }                                                                      \
645
688
        EXPAT_FALLTHROUGH;                                                     \
646
147
      case 0x1:                                                                \
647
166
      case 0x2:                                                                \
648
176
      case 0x3:                                                                \
649
215
      case 0x4:                                                                \
650
235
      case 0x5:                                                                \
651
245
      case 0x6:                                                                \
652
245
      case 0x7:                                                                \
653
245
        if (toLim - *toP < 2) {                                                \
654
13
          *fromP = from;                                                       \
655
13
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
656
13
        }                                                                      \
657
245
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
658
232
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
659
232
        break;                                                                 \
660
4.29k
      default:                                                                 \
661
4.29k
        if (toLim - *toP < 3) {                                                \
662
68
          *fromP = from;                                                       \
663
68
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
664
68
        }                                                                      \
665
4.29k
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
666
4.29k
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
667
4.22k
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
668
4.22k
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
669
4.22k
        break;                                                                 \
670
4.29k
      case 0xD8:                                                               \
671
0
      case 0xD9:                                                               \
672
0
      case 0xDA:                                                               \
673
0
      case 0xDB:                                                               \
674
0
        if (toLim - *toP < 4) {                                                \
675
0
          *fromP = from;                                                       \
676
0
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
677
0
        }                                                                      \
678
0
        if (fromLim - from < 4) {                                              \
679
0
          *fromP = from;                                                       \
680
0
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
681
0
        }                                                                      \
682
0
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
683
0
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
684
0
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
685
0
        from += 2;                                                             \
686
0
        lo2 = GET_LO(from);                                                    \
687
0
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
688
0
                     | (lo2 >> 6) | 0x80);                                     \
689
0
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
690
0
        break;                                                                 \
691
5.17k
      }                                                                        \
692
5.17k
    }                                                                          \
693
1.01k
    *fromP = from;                                                             \
694
924
    if (from < fromLim)                                                        \
695
924
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
696
924
    else                                                                       \
697
924
      return XML_CONVERT_COMPLETED;                                            \
698
924
  }
699
700
/* Defines E##toUtf16(), which copies UTF-16 input into an array of
   unsigned short, assembling each unit from the bytes selected by the
   GET_HI()/GET_LO() macros in effect where the macro is instantiated.

   Only whole 16-bit units are processed.  *fromP and *toP are advanced past
   what was copied.  Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output
   filled up with input still pending; otherwise XML_CONVERT_INPUT_INCOMPLETE
   if a final unit was held back by the surrogate check below, or
   XML_CONVERT_COMPLETED. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result status = XML_CONVERT_COMPLETED;                    \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* whole units */     \
    /* Avoid copying first half only of surrogate: when the input exceeds  */  \
    /* the output space and its last unit has a high byte 0xD8..0xDF, that */  \
    /* unit is left unconverted.                                           */  \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      status = XML_CONVERT_INPUT_INCOMPLETE;                                   \
    }                                                                          \
    while (*fromP < fromLim && *toP < toLim) {                                 \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
      *fromP += 2;                                                             \
    }                                                                          \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    return status;                                                             \
  }
Unexecuted instantiation: xmltok.c:little2_toUtf16
Unexecuted instantiation: xmltok.c:big2_toUtf16
720
721
7.34k
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
722
7.34k
#define GET_HI(ptr) ((unsigned char)(ptr)[1])
723
724
DEFINE_UTF16_TO_UTF8(little2_)
725
DEFINE_UTF16_TO_UTF16(little2_)
726
727
#undef GET_LO
728
#undef GET_HI
729
730
5.17k
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
731
5.17k
#define GET_HI(ptr) ((unsigned char)(ptr)[0])
732
733
DEFINE_UTF16_TO_UTF8(big2_)
734
DEFINE_UTF16_TO_UTF16(big2_)
735
736
#undef GET_LO
737
#undef GET_HI
738
739
/* Accessors for little-endian UTF-16: (p) addresses one 2-byte unit with
   the low-order byte at (p)[0] and the high-order byte at (p)[1].  Every use
   of the parameter is parenthesized so that expression arguments such as
   `ptr + 2` expand correctly. */

/* Byte type of the unit: the single-byte table lookup when the high byte is
   zero, otherwise unicode_byte_type(high, low). */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? SB_BYTE_TYPE(enc, p) : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* Non-zero when the high byte is zero and the low byte equals c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Name-character test via the namePages table (high byte, low byte). */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
/* Name-start-character test via the nmstrtPages table. */
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
747
748
#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: the little-endian UTF-16 accessors are provided as
   real functions that simply forward to the LITTLE2_* macros. */

/* Byte type of the 2-byte unit at p. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p) {
  return LITTLE2_BYTE_TYPE(enc, p);
}

/* Low byte of the unit at p when its high byte is zero, otherwise -1. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_BYTE_TO_ASCII(p);
}

/* Non-zero when the unit at p has a zero high byte and low byte c. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return LITTLE2_CHAR_MATCHES(p, c);
}

/* Name-character lookup for the unit at p. */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
}

/* Name-start-character lookup for the unit at p. */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
}

/* From here on VTABLE carries the little-endian converters. */
#  undef VTABLE
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: xmltok_impl.c is included with the parameter macros below,
   producing functions named little2_<ident> that work on 2-byte units. */
#  undef PREFIX
#  define PREFIX(ident) little2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
/* The n-byte name tests are constant false here; name classification goes
   through the *_MINBPC variants. */
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Drop the parameter macros so the next instantiation starts clean. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
811
812
#ifdef XML_NS

/* Little-endian UTF-16 encoding table for namespace-aware builds (XML_NS
   only).  asciitab.h is included unmodified, so the BT_COLON entry keeps its
   own byte type. */
static const struct normal_encoding little2_encoding_ns = {
    /* Three numeric settings follow VTABLE -- presumably minBytesPerChar,
       isUtf8 and isUtf16 (confirm against the ENCODING declaration).  The
       last one is 1 only when BYTEORDER is 1234. */
    {VTABLE, 2, 0,
#  if BYTEORDER == 1234
     1
#  else
     0
#  endif
    },
    /* Byte-type table for units whose high byte is zero. */
    {
#  include "asciitab.h"
#  include "latin1tab.h"
    },
    STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
829
830
static const struct normal_encoding little2_encoding
831
    = {{VTABLE, 2, 0,
832
#if BYTEORDER == 1234
833
        1
834
#else
835
        0
836
#endif
837
       },
838
       {
839
#define BT_COLON BT_NMSTRT
840
#include "asciitab.h"
841
#undef BT_COLON
842
#include "latin1tab.h"
843
       },
844
       STANDARD_VTABLE(little2_) NULL_VTABLE};
845
846
#if BYTEORDER != 4321
847
848
#  ifdef XML_NS
849
850
static const struct normal_encoding internal_little2_encoding_ns
851
    = {{VTABLE, 2, 0, 1},
852
       {
853
#    include "iasciitab.h"
854
#    include "latin1tab.h"
855
       },
856
       STANDARD_VTABLE(little2_) NULL_VTABLE};
857
858
#  endif
859
860
static const struct normal_encoding internal_little2_encoding
861
    = {{VTABLE, 2, 0, 1},
862
       {
863
#  define BT_COLON BT_NMSTRT
864
#  include "iasciitab.h"
865
#  undef BT_COLON
866
#  include "latin1tab.h"
867
       },
868
       STANDARD_VTABLE(little2_) NULL_VTABLE};
869
870
#endif
871
872
/* Accessors for big-endian UTF-16: (p) addresses one 2-byte unit with the
   high-order byte at (p)[0] and the low-order byte at (p)[1].  Every use of
   the parameter is parenthesized so that expression arguments such as
   `ptr + 2` expand correctly. */

/* Byte type of the unit: the single-byte table lookup on the low byte when
   the high byte is zero, otherwise unicode_byte_type(high, low). */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0 ? SB_BYTE_TYPE(enc, (p) + 1)                                    \
               : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the high byte is zero and the low byte equals c. */
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Name-character test via the namePages table (high byte, low byte). */
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
/* Name-start-character test via the nmstrtPages table. */
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
880
881
#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: the big-endian UTF-16 accessors are provided as real
   functions that simply forward to the BIG2_* macros. */

/* Byte type of the 2-byte unit at p. */
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p) {
  return BIG2_BYTE_TYPE(enc, p);
}

/* Low byte of the unit at p when its high byte is zero, otherwise -1. */
static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_BYTE_TO_ASCII(p);
}

/* Non-zero when the unit at p has a zero high byte and low byte c. */
static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return BIG2_CHAR_MATCHES(p, c);
}

/* Name-character lookup for the unit at p. */
static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NAME_CHAR_MINBPC(p);
}

/* Name-start-character lookup for the unit at p. */
static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
}

/* From here on VTABLE carries the big-endian converters. */
#  undef VTABLE
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: xmltok_impl.c is included with the parameter macros below,
   producing functions named big2_<ident> that work on 2-byte units. */
#  undef PREFIX
#  define PREFIX(ident) big2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
/* The n-byte name tests are constant false here; name classification goes
   through the *_MINBPC variants. */
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Drop the parameter macros again once the instantiation is done. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
944
945
#ifdef XML_NS

/* Big-endian UTF-16 encoding table for namespace-aware builds (XML_NS
   only).  asciitab.h is included unmodified, so the BT_COLON entry keeps its
   own byte type. */
static const struct normal_encoding big2_encoding_ns = {
    /* Three numeric settings follow VTABLE -- presumably minBytesPerChar,
       isUtf8 and isUtf16 (confirm against the ENCODING declaration).  The
       last one is 1 only when BYTEORDER is 4321. */
    {VTABLE, 2, 0,
#  if BYTEORDER == 4321
     1
#  else
     0
#  endif
    },
    /* Byte-type table for units whose high byte is zero. */
    {
#  include "asciitab.h"
#  include "latin1tab.h"
    },
    STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif
962
963
static const struct normal_encoding big2_encoding
964
    = {{VTABLE, 2, 0,
965
#if BYTEORDER == 4321
966
        1
967
#else
968
        0
969
#endif
970
       },
971
       {
972
#define BT_COLON BT_NMSTRT
973
#include "asciitab.h"
974
#undef BT_COLON
975
#include "latin1tab.h"
976
       },
977
       STANDARD_VTABLE(big2_) NULL_VTABLE};
978
979
#if BYTEORDER != 1234
980
981
#  ifdef XML_NS
982
983
static const struct normal_encoding internal_big2_encoding_ns
984
    = {{VTABLE, 2, 0, 1},
985
       {
986
#    include "iasciitab.h"
987
#    include "latin1tab.h"
988
       },
989
       STANDARD_VTABLE(big2_) NULL_VTABLE};
990
991
#  endif
992
993
static const struct normal_encoding internal_big2_encoding
994
    = {{VTABLE, 2, 0, 1},
995
       {
996
#  define BT_COLON BT_NMSTRT
997
#  include "iasciitab.h"
998
#  undef BT_COLON
999
#  include "latin1tab.h"
1000
       },
1001
       STANDARD_VTABLE(big2_) NULL_VTABLE};
1002
1003
#endif
1004
1005
#undef PREFIX
1006
1007
static int FASTCALL
1008
0
streqci(const char *s1, const char *s2) {
1009
0
  for (;;) {
1010
0
    char c1 = *s1++;
1011
0
    char c2 = *s2++;
1012
0
    if (ASCII_a <= c1 && c1 <= ASCII_z)
1013
0
      c1 += ASCII_A - ASCII_a;
1014
0
    if (ASCII_a <= c2 && c2 <= ASCII_z)
1015
      /* The following line will never get executed.  streqci() is
1016
       * only called from two places, both of which guarantee to put
1017
       * upper-case strings into s2.
1018
       */
1019
0
      c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1020
0
    if (c1 != c2)
1021
0
      return 0;
1022
0
    if (! c1)
1023
0
      break;
1024
0
  }
1025
0
  return 1;
1026
0
}
1027
1028
static void PTRCALL
1029
initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1030
278
                   POSITION *pos) {
1031
278
  UNUSED_P(enc);
1032
278
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1033
278
}
1034
1035
static int
1036
0
toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1037
0
  char buf[1];
1038
0
  char *p = buf;
1039
0
  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1040
0
  if (p == buf)
1041
0
    return -1;
1042
0
  else
1043
0
    return buf[0];
1044
0
}
1045
1046
static int FASTCALL
1047
0
isSpace(int c) {
1048
0
  switch (c) {
1049
0
  case 0x20:
1050
0
  case 0xD:
1051
0
  case 0xA:
1052
0
  case 0x9:
1053
0
    return 1;
1054
0
  }
1055
0
  return 0;
1056
0
}
1057
1058
/* Return 1 if there's just optional white space or there's an S
1059
   followed by name=val.
1060
*/
1061
static int
1062
parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1063
                     const char **namePtr, const char **nameEndPtr,
1064
0
                     const char **valPtr, const char **nextTokPtr) {
1065
0
  int c;
1066
0
  char open;
1067
0
  if (ptr == end) {
1068
0
    *namePtr = NULL;
1069
0
    return 1;
1070
0
  }
1071
0
  if (! isSpace(toAscii(enc, ptr, end))) {
1072
0
    *nextTokPtr = ptr;
1073
0
    return 0;
1074
0
  }
1075
0
  do {
1076
0
    ptr += enc->minBytesPerChar;
1077
0
  } while (isSpace(toAscii(enc, ptr, end)));
1078
0
  if (ptr == end) {
1079
0
    *namePtr = NULL;
1080
0
    return 1;
1081
0
  }
1082
0
  *namePtr = ptr;
1083
0
  for (;;) {
1084
0
    c = toAscii(enc, ptr, end);
1085
0
    if (c == -1) {
1086
0
      *nextTokPtr = ptr;
1087
0
      return 0;
1088
0
    }
1089
0
    if (c == ASCII_EQUALS) {
1090
0
      *nameEndPtr = ptr;
1091
0
      break;
1092
0
    }
1093
0
    if (isSpace(c)) {
1094
0
      *nameEndPtr = ptr;
1095
0
      do {
1096
0
        ptr += enc->minBytesPerChar;
1097
0
      } while (isSpace(c = toAscii(enc, ptr, end)));
1098
0
      if (c != ASCII_EQUALS) {
1099
0
        *nextTokPtr = ptr;
1100
0
        return 0;
1101
0
      }
1102
0
      break;
1103
0
    }
1104
0
    ptr += enc->minBytesPerChar;
1105
0
  }
1106
0
  if (ptr == *namePtr) {
1107
0
    *nextTokPtr = ptr;
1108
0
    return 0;
1109
0
  }
1110
0
  ptr += enc->minBytesPerChar;
1111
0
  c = toAscii(enc, ptr, end);
1112
0
  while (isSpace(c)) {
1113
0
    ptr += enc->minBytesPerChar;
1114
0
    c = toAscii(enc, ptr, end);
1115
0
  }
1116
0
  if (c != ASCII_QUOT && c != ASCII_APOS) {
1117
0
    *nextTokPtr = ptr;
1118
0
    return 0;
1119
0
  }
1120
0
  open = (char)c;
1121
0
  ptr += enc->minBytesPerChar;
1122
0
  *valPtr = ptr;
1123
0
  for (;; ptr += enc->minBytesPerChar) {
1124
0
    c = toAscii(enc, ptr, end);
1125
0
    if (c == open)
1126
0
      break;
1127
0
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1128
0
        && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1129
0
        && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1130
0
      *nextTokPtr = ptr;
1131
0
      return 0;
1132
0
    }
1133
0
  }
1134
0
  *nextTokPtr = ptr + enc->minBytesPerChar;
1135
0
  return 1;
1136
0
}
1137
1138
static const char KW_version[]
1139
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1140
1141
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1142
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};
1143
1144
static const char KW_standalone[]
1145
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1146
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1147
1148
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1149
1150
static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1151
1152
static int
1153
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1154
                                                 const char *),
1155
               int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1156
               const char *end, const char **badPtr, const char **versionPtr,
1157
               const char **versionEndPtr, const char **encodingName,
1158
0
               const ENCODING **encoding, int *standalone) {
1159
0
  const char *val = NULL;
1160
0
  const char *name = NULL;
1161
0
  const char *nameEnd = NULL;
1162
0
  ptr += 5 * enc->minBytesPerChar;
1163
0
  end -= 2 * enc->minBytesPerChar;
1164
0
  if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1165
0
      || ! name) {
1166
0
    *badPtr = ptr;
1167
0
    return 0;
1168
0
  }
1169
0
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1170
0
    if (! isGeneralTextEntity) {
1171
0
      *badPtr = name;
1172
0
      return 0;
1173
0
    }
1174
0
  } else {
1175
0
    if (versionPtr)
1176
0
      *versionPtr = val;
1177
0
    if (versionEndPtr)
1178
0
      *versionEndPtr = ptr;
1179
0
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1180
0
      *badPtr = ptr;
1181
0
      return 0;
1182
0
    }
1183
0
    if (! name) {
1184
0
      if (isGeneralTextEntity) {
1185
        /* a TextDecl must have an EncodingDecl */
1186
0
        *badPtr = ptr;
1187
0
        return 0;
1188
0
      }
1189
0
      return 1;
1190
0
    }
1191
0
  }
1192
0
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1193
0
    int c = toAscii(enc, val, end);
1194
0
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1195
0
      *badPtr = val;
1196
0
      return 0;
1197
0
    }
1198
0
    if (encodingName)
1199
0
      *encodingName = val;
1200
0
    if (encoding)
1201
0
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1202
0
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1203
0
      *badPtr = ptr;
1204
0
      return 0;
1205
0
    }
1206
0
    if (! name)
1207
0
      return 1;
1208
0
  }
1209
0
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1210
0
      || isGeneralTextEntity) {
1211
0
    *badPtr = name;
1212
0
    return 0;
1213
0
  }
1214
0
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1215
0
    if (standalone)
1216
0
      *standalone = 1;
1217
0
  } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1218
0
    if (standalone)
1219
0
      *standalone = 0;
1220
0
  } else {
1221
0
    *badPtr = val;
1222
0
    return 0;
1223
0
  }
1224
0
  while (isSpace(toAscii(enc, ptr, end)))
1225
0
    ptr += enc->minBytesPerChar;
1226
0
  if (ptr != end) {
1227
0
    *badPtr = ptr;
1228
0
    return 0;
1229
0
  }
1230
0
  return 1;
1231
0
}
1232
1233
static int FASTCALL
1234
10
checkCharRefNumber(int result) {
1235
10
  switch (result >> 8) {
1236
0
  case 0xD8:
1237
0
  case 0xD9:
1238
0
  case 0xDA:
1239
0
  case 0xDB:
1240
0
  case 0xDC:
1241
0
  case 0xDD:
1242
0
  case 0xDE:
1243
0
  case 0xDF:
1244
0
    return -1;
1245
10
  case 0:
1246
10
    if (latin1_encoding.type[result] == BT_NONXML)
1247
10
      return -1;
1248
0
    break;
1249
0
  case 0xFF:
1250
0
    if (result == 0xFFFE || result == 0xFFFF)
1251
0
      return -1;
1252
0
    break;
1253
10
  }
1254
0
  return result;
1255
10
}
1256
1257
int FASTCALL
1258
0
XmlUtf8Encode(int c, char *buf) {
1259
0
  enum {
1260
    /* minN is minimum legal resulting value for N byte sequence */
1261
0
    min2 = 0x80,
1262
0
    min3 = 0x800,
1263
0
    min4 = 0x10000
1264
0
  };
1265
1266
0
  if (c < 0)
1267
0
    return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1268
0
  if (c < min2) {
1269
0
    buf[0] = (char)(c | UTF8_cval1);
1270
0
    return 1;
1271
0
  }
1272
0
  if (c < min3) {
1273
0
    buf[0] = (char)((c >> 6) | UTF8_cval2);
1274
0
    buf[1] = (char)((c & 0x3f) | 0x80);
1275
0
    return 2;
1276
0
  }
1277
0
  if (c < min4) {
1278
0
    buf[0] = (char)((c >> 12) | UTF8_cval3);
1279
0
    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1280
0
    buf[2] = (char)((c & 0x3f) | 0x80);
1281
0
    return 3;
1282
0
  }
1283
0
  if (c < 0x110000) {
1284
0
    buf[0] = (char)((c >> 18) | UTF8_cval4);
1285
0
    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1286
0
    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1287
0
    buf[3] = (char)((c & 0x3f) | 0x80);
1288
0
    return 4;
1289
0
  }
1290
0
  return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1291
0
}
1292
1293
int FASTCALL
1294
0
XmlUtf16Encode(int charNum, unsigned short *buf) {
1295
0
  if (charNum < 0)
1296
0
    return 0;
1297
0
  if (charNum < 0x10000) {
1298
0
    buf[0] = (unsigned short)charNum;
1299
0
    return 1;
1300
0
  }
1301
0
  if (charNum < 0x110000) {
1302
0
    charNum -= 0x10000;
1303
0
    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1304
0
    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1305
0
    return 2;
1306
0
  }
1307
0
  return 0;
1308
0
}
1309
1310
struct unknown_encoding {
1311
  struct normal_encoding normal;
1312
  CONVERTER convert;
1313
  void *userData;
1314
  unsigned short utf16[256];
1315
  char utf8[256][4];
1316
};
1317
1318
0
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1319
1320
int
1321
0
XmlSizeOfUnknownEncoding(void) {
1322
0
  return sizeof(struct unknown_encoding);
1323
0
}
1324
1325
static int PTRFASTCALL
1326
0
unknown_isName(const ENCODING *enc, const char *p) {
1327
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1328
0
  int c = uenc->convert(uenc->userData, p);
1329
0
  if (c & ~0xFFFF)
1330
0
    return 0;
1331
0
  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1332
0
}
1333
1334
static int PTRFASTCALL
1335
0
unknown_isNmstrt(const ENCODING *enc, const char *p) {
1336
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1337
0
  int c = uenc->convert(uenc->userData, p);
1338
0
  if (c & ~0xFFFF)
1339
0
    return 0;
1340
0
  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1341
0
}
1342
1343
static int PTRFASTCALL
1344
0
unknown_isInvalid(const ENCODING *enc, const char *p) {
1345
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1346
0
  int c = uenc->convert(uenc->userData, p);
1347
0
  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1348
0
}
1349
1350
static enum XML_Convert_Result PTRCALL
1351
unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1352
0
               char **toP, const char *toLim) {
1353
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1354
0
  char buf[XML_UTF8_ENCODE_MAX];
1355
0
  for (;;) {
1356
0
    const char *utf8;
1357
0
    int n;
1358
0
    if (*fromP == fromLim)
1359
0
      return XML_CONVERT_COMPLETED;
1360
0
    utf8 = uenc->utf8[(unsigned char)**fromP];
1361
0
    n = *utf8++;
1362
0
    if (n == 0) {
1363
0
      int c = uenc->convert(uenc->userData, *fromP);
1364
0
      n = XmlUtf8Encode(c, buf);
1365
0
      if (n > toLim - *toP)
1366
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
1367
0
      utf8 = buf;
1368
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1369
0
                 - (BT_LEAD2 - 2));
1370
0
    } else {
1371
0
      if (n > toLim - *toP)
1372
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
1373
0
      (*fromP)++;
1374
0
    }
1375
0
    memcpy(*toP, utf8, n);
1376
0
    *toP += n;
1377
0
  }
1378
0
}
1379
1380
static enum XML_Convert_Result PTRCALL
1381
unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1382
0
                unsigned short **toP, const unsigned short *toLim) {
1383
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1384
0
  while (*fromP < fromLim && *toP < toLim) {
1385
0
    unsigned short c = uenc->utf16[(unsigned char)**fromP];
1386
0
    if (c == 0) {
1387
0
      c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1388
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1389
0
                 - (BT_LEAD2 - 2));
1390
0
    } else
1391
0
      (*fromP)++;
1392
0
    *(*toP)++ = c;
1393
0
  }
1394
1395
0
  if ((*toP == toLim) && (*fromP < fromLim))
1396
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
1397
0
  else
1398
0
    return XML_CONVERT_COMPLETED;
1399
0
}
1400
1401
ENCODING *
1402
XmlInitUnknownEncoding(void *mem, const int *table, CONVERTER convert,
1403
0
                       void *userData) {
1404
0
  int i;
1405
0
  struct unknown_encoding *e = (struct unknown_encoding *)mem;
1406
0
  memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1407
0
  for (i = 0; i < 128; i++)
1408
0
    if (latin1_encoding.type[i] != BT_OTHER
1409
0
        && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1410
0
      return 0;
1411
0
  for (i = 0; i < 256; i++) {
1412
0
    int c = table[i];
1413
0
    if (c == -1) {
1414
0
      e->normal.type[i] = BT_MALFORM;
1415
      /* This shouldn't really get used. */
1416
0
      e->utf16[i] = 0xFFFF;
1417
0
      e->utf8[i][0] = 1;
1418
0
      e->utf8[i][1] = 0;
1419
0
    } else if (c < 0) {
1420
0
      if (c < -4)
1421
0
        return 0;
1422
      /* Multi-byte sequences need a converter function */
1423
0
      if (! convert)
1424
0
        return 0;
1425
0
      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1426
0
      e->utf8[i][0] = 0;
1427
0
      e->utf16[i] = 0;
1428
0
    } else if (c < 0x80) {
1429
0
      if (latin1_encoding.type[c] != BT_OTHER
1430
0
          && latin1_encoding.type[c] != BT_NONXML && c != i)
1431
0
        return 0;
1432
0
      e->normal.type[i] = latin1_encoding.type[c];
1433
0
      e->utf8[i][0] = 1;
1434
0
      e->utf8[i][1] = (char)c;
1435
0
      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1436
0
    } else if (checkCharRefNumber(c) < 0) {
1437
0
      e->normal.type[i] = BT_NONXML;
1438
      /* This shouldn't really get used. */
1439
0
      e->utf16[i] = 0xFFFF;
1440
0
      e->utf8[i][0] = 1;
1441
0
      e->utf8[i][1] = 0;
1442
0
    } else {
1443
0
      if (c > 0xFFFF)
1444
0
        return 0;
1445
0
      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1446
0
        e->normal.type[i] = BT_NMSTRT;
1447
0
      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1448
0
        e->normal.type[i] = BT_NAME;
1449
0
      else
1450
0
        e->normal.type[i] = BT_OTHER;
1451
0
      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1452
0
      e->utf16[i] = (unsigned short)c;
1453
0
    }
1454
0
  }
1455
0
  e->userData = userData;
1456
0
  e->convert = convert;
1457
0
  if (convert) {
1458
0
    e->normal.isName2 = unknown_isName;
1459
0
    e->normal.isName3 = unknown_isName;
1460
0
    e->normal.isName4 = unknown_isName;
1461
0
    e->normal.isNmstrt2 = unknown_isNmstrt;
1462
0
    e->normal.isNmstrt3 = unknown_isNmstrt;
1463
0
    e->normal.isNmstrt4 = unknown_isNmstrt;
1464
0
    e->normal.isInvalid2 = unknown_isInvalid;
1465
0
    e->normal.isInvalid3 = unknown_isInvalid;
1466
0
    e->normal.isInvalid4 = unknown_isInvalid;
1467
0
  }
1468
0
  e->normal.enc.utf8Convert = unknown_toUtf8;
1469
0
  e->normal.enc.utf16Convert = unknown_toUtf16;
1470
0
  return &(e->normal.enc);
1471
0
}
1472
1473
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC = 1,
  UTF_8_ENC = 2,
  UTF_16_ENC = 3,
  UTF_16BE_ENC = 4,
  UTF_16LE_ENC = 5,
  /* must match encodingNames up to here */
  NO_ENC = 6
};
1486
1487
static const char KW_ISO_8859_1[]
1488
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
1489
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
1490
static const char KW_US_ASCII[]
1491
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1492
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
1493
static const char KW_UTF_8[]
1494
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1495
static const char KW_UTF_16[]
1496
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1497
static const char KW_UTF_16BE[]
1498
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1499
       ASCII_6, ASCII_B, ASCII_E, '\0'};
1500
static const char KW_UTF_16LE[]
1501
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1502
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1503
1504
static int FASTCALL
1505
35.2k
getEncodingIndex(const char *name) {
1506
35.2k
  static const char *const encodingNames[] = {
1507
35.2k
      KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1508
35.2k
  };
1509
35.2k
  int i;
1510
35.2k
  if (name == NULL)
1511
35.2k
    return NO_ENC;
1512
0
  for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1513
0
    if (streqci(name, encodingNames[i]))
1514
0
      return i;
1515
0
  return UNKNOWN_ENC;
1516
0
}
1517
1518
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i, converted to char.  The argument is
   parenthesized so that an expression such as `x >> 8` is converted as a
   whole instead of having the cast bind to its first operand only.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1524
1525
/* This is what detects the encoding.  encodingTable maps from
1526
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1527
   the external (protocol) specified encoding; state is
1528
   XML_CONTENT_STATE if we're parsing an external text entity, and
1529
   XML_PROLOG_STATE otherwise.
1530
*/
1531
1532
static int
1533
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1534
17.6k
         int state, const char *ptr, const char *end, const char **nextTokPtr) {
1535
17.6k
  const ENCODING **encPtr;
1536
1537
17.6k
  if (ptr >= end)
1538
57
    return XML_TOK_NONE;
1539
17.5k
  encPtr = enc->encPtr;
1540
17.5k
  if (ptr + 1 == end) {
1541
    /* only a single byte available for auto-detection */
1542
#ifndef XML_DTD /* FIXME */
1543
    /* a well-formed document entity must have more than one byte */
1544
    if (state != XML_CONTENT_STATE)
1545
      return XML_TOK_PARTIAL;
1546
#endif
1547
    /* so we're parsing an external text entity... */
1548
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
1549
180
    switch (INIT_ENC_INDEX(enc)) {
1550
0
    case UTF_16_ENC:
1551
0
    case UTF_16LE_ENC:
1552
0
    case UTF_16BE_ENC:
1553
0
      return XML_TOK_PARTIAL;
1554
180
    }
1555
180
    switch ((unsigned char)*ptr) {
1556
10
    case 0xFE:
1557
18
    case 0xFF:
1558
21
    case 0xEF: /* possibly first byte of UTF-8 BOM */
1559
21
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1560
0
        break;
1561
21
      EXPAT_FALLTHROUGH;
1562
21
    case 0x00:
1563
76
    case 0x3C:
1564
76
      return XML_TOK_PARTIAL;
1565
180
    }
1566
17.3k
  } else {
1567
17.3k
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1568
25
    case 0xFEFF:
1569
25
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1570
0
        break;
1571
25
      *nextTokPtr = ptr + 2;
1572
25
      *encPtr = encodingTable[UTF_16BE_ENC];
1573
25
      return XML_TOK_BOM;
1574
    /* 00 3C is handled in the default case */
1575
2.42k
    case 0x3C00:
1576
2.42k
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1577
2.42k
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1578
0
          && state == XML_CONTENT_STATE)
1579
0
        break;
1580
2.42k
      *encPtr = encodingTable[UTF_16LE_ENC];
1581
2.42k
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1582
377
    case 0xFFFE:
1583
377
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1584
0
        break;
1585
377
      *nextTokPtr = ptr + 2;
1586
377
      *encPtr = encodingTable[UTF_16LE_ENC];
1587
377
      return XML_TOK_BOM;
1588
35
    case 0xEFBB:
1589
      /* Maybe a UTF-8 BOM (EF BB BF) */
1590
      /* If there's an explicitly specified (external) encoding
1591
         of ISO-8859-1 or some flavour of UTF-16
1592
         and this is an external text entity,
1593
         don't look for the BOM,
1594
         because it might be a legal data.
1595
      */
1596
35
      if (state == XML_CONTENT_STATE) {
1597
0
        int e = INIT_ENC_INDEX(enc);
1598
0
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1599
0
            || e == UTF_16_ENC)
1600
0
          break;
1601
0
      }
1602
35
      if (ptr + 2 == end)
1603
6
        return XML_TOK_PARTIAL;
1604
29
      if ((unsigned char)ptr[2] == 0xBF) {
1605
10
        *nextTokPtr = ptr + 3;
1606
10
        *encPtr = encodingTable[UTF_8_ENC];
1607
10
        return XML_TOK_BOM;
1608
10
      }
1609
19
      break;
1610
14.5k
    default:
1611
14.5k
      if (ptr[0] == '\0') {
1612
        /* 0 isn't a legal data character. Furthermore a document
1613
           entity can only start with ASCII characters.  So the only
1614
           way this can fail to be big-endian UTF-16 if it it's an
1615
           external parsed general entity that's labelled as
1616
           UTF-16LE.
1617
        */
1618
4.51k
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1619
0
          break;
1620
4.51k
        *encPtr = encodingTable[UTF_16BE_ENC];
1621
4.51k
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1622
9.99k
      } else if (ptr[1] == '\0') {
1623
        /* We could recover here in the case:
1624
            - parsing an external entity
1625
            - second byte is 0
1626
            - no externally specified encoding
1627
            - no encoding declaration
1628
           by assuming UTF-16LE.  But we don't, because this would mean when
1629
           presented just with a single byte, we couldn't reliably determine
1630
           whether we needed further bytes.
1631
        */
1632
1.54k
        if (state == XML_CONTENT_STATE)
1633
0
          break;
1634
1.54k
        *encPtr = encodingTable[UTF_16LE_ENC];
1635
1.54k
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1636
1.54k
      }
1637
8.45k
      break;
1638
17.3k
    }
1639
17.3k
  }
1640
8.57k
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1641
8.57k
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1642
17.5k
}
1643
1644
35.2k
#define NS(x) x
1645
0
#define ns(x) x
1646
#define XML_TOK_NS_C
1647
#include "xmltok_ns.c"
1648
#undef XML_TOK_NS_C
1649
#undef NS
1650
#undef ns
1651
1652
#ifdef XML_NS
1653
1654
52.8k
#  define NS(x) x##NS
1655
17.6k
#  define ns(x) x##_ns
1656
1657
#  define XML_TOK_NS_C
1658
#  include "xmltok_ns.c"
1659
#  undef XML_TOK_NS_C
1660
1661
#  undef NS
1662
#  undef ns
1663
1664
ENCODING *
1665
XmlInitUnknownEncodingNS(void *mem, const int *table, CONVERTER convert,
1666
0
                         void *userData) {
1667
0
  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1668
0
  if (enc)
1669
0
    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1670
0
  return enc;
1671
0
}
1672
1673
#endif /* XML_NS */