Coverage Report

Created: 2024-05-20 06:20

/src/expat/expat/lib/xmltok.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
                            __  __            _
3
                         ___\ \/ /_ __   __ _| |_
4
                        / _ \\  /| '_ \ / _` | __|
5
                       |  __//  \| |_) | (_| | |_
6
                        \___/_/\_\ .__/ \__,_|\__|
7
                                 |_| XML parser
8
9
   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10
   Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11
   Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12
   Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
13
   Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14
   Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15
   Copyright (c) 2016-2024 Sebastian Pipping <sebastian@pipping.org>
16
   Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
17
   Copyright (c) 2016      Don Lewis <truckman@apache.org>
18
   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
19
   Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
20
   Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
21
   Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
22
   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
23
   Copyright (c) 2021      Donghee Na <donghee.na@python.org>
24
   Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
25
   Copyright (c) 2022      Sean McBride <sean@rogue-research.com>
26
   Copyright (c) 2023      Hanno Böck <hanno@gentoo.org>
27
   Licensed under the MIT license:
28
29
   Permission is  hereby granted,  free of charge,  to any  person obtaining
30
   a  copy  of  this  software   and  associated  documentation  files  (the
31
   "Software"),  to  deal in  the  Software  without restriction,  including
32
   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
33
   distribute, sublicense, and/or sell copies of the Software, and to permit
34
   persons  to whom  the Software  is  furnished to  do so,  subject to  the
35
   following conditions:
36
37
   The above copyright  notice and this permission notice  shall be included
38
   in all copies or substantial portions of the Software.
39
40
   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
41
   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
42
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
43
   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
44
   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
45
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
46
   USE OR OTHER DEALINGS IN THE SOFTWARE.
47
*/
48
49
#include "expat_config.h"
50
51
#include <stddef.h>
52
#include <string.h> /* memcpy */
53
#include <stdbool.h>
54
55
#ifdef _WIN32
56
#  include "winconfig.h"
57
#endif
58
59
#include "expat_external.h"
60
#include "internal.h"
61
#include "xmltok.h"
62
#include "nametab.h"
63
64
#ifdef XML_DTD
65
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
66
#else
67
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
68
#endif
69
70
#define VTABLE1                                                                \
71
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
72
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
73
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
74
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
75
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
76
      PREFIX(updatePosition), PREFIX(isPublicId)
77
78
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
79
80
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
81
8.52M
  (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
82
83
/* A 2 byte UTF-8 representation splits the character's 11 bits between
84
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
85
   pages, 3 bits to add to that index and 5 bits to generate the mask.
86
*/
87
#define UTF8_GET_NAMING2(pages, byte)                                          \
88
974k
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
89
974k
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
90
974k
   & (1u << (((byte)[1]) & 0x1F)))
91
92
/* A 3 byte UTF-8 representation splits the character's 16 bits between
93
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
94
   into pages, 3 bits to add to that index and 5 bits to generate the
95
   mask.
96
*/
97
#define UTF8_GET_NAMING3(pages, byte)                                          \
98
1.39M
  (namingBitmap                                                                \
99
1.39M
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
100
1.39M
         << 3)                                                                 \
101
1.39M
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
102
1.39M
   & (1u << (((byte)[2]) & 0x1F)))
103
104
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
105
   of Unicode 3.2: https://www.unicode.org/unicode/reports/tr28/
106
   with the additional restriction of not allowing the Unicode
107
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
108
   Implementation details:
109
     (A & 0x80) == 0     means A < 0x80
110
   and
111
     (A & 0xC0) == 0xC0  means A > 0xBF
112
*/
113
114
#define UTF8_INVALID2(p)                                                       \
115
2.16M
  ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
116
117
#define UTF8_INVALID3(p)                                                       \
118
239M
  (((p)[2] & 0x80) == 0                                                        \
119
239M
   || ((*p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                          \
120
239M
                                      : ((p)[2] & 0xC0) == 0xC0)               \
121
239M
   || ((*p) == 0xE0                                                            \
122
239M
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
123
239M
           : ((p)[1] & 0x80) == 0                                              \
124
224M
                 || ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
125
126
#define UTF8_INVALID4(p)                                                       \
127
5.33M
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
128
5.33M
   || ((p)[2] & 0xC0) == 0xC0                                                  \
129
5.33M
   || ((*p) == 0xF0                                                            \
130
5.33M
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
131
5.33M
           : ((p)[1] & 0x80) == 0                                              \
132
5.01M
                 || ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
133
134
/* Predicate that rejects everything: both arguments are ignored and the
   result is always 0.  In this file it is aliased as utf8_isName4 and
   utf8_isNmstrt4, and (under XML_MIN_SIZE) as sb_isNameMin and
   sb_isNmstrtMin. */
static int PTRFASTCALL
135
385
isNever(const ENCODING *enc, const char *p) {
136
385
  UNUSED_P(enc);
137
385
  UNUSED_P(p);
138
385
  return 0;
139
385
}
140
141
/* Name-character test for a 2-byte UTF-8 sequence.
   Looks the sequence at p up through UTF8_GET_NAMING2 using the namePages
   table and returns the selected bit of namingBitmap: non-zero if the bit
   is set, 0 otherwise.  The macro reads p[0] and p[1], so both bytes must
   be readable.  enc is unused. */
static int PTRFASTCALL
142
608k
utf8_isName2(const ENCODING *enc, const char *p) {
143
608k
  UNUSED_P(enc);
144
608k
  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
145
608k
}
146
147
/* Name-character test for a 3-byte UTF-8 sequence.
   Looks the sequence at p up through UTF8_GET_NAMING3 using the namePages
   table and returns the selected bit of namingBitmap: non-zero if the bit
   is set, 0 otherwise.  The macro reads p[0], p[1] and p[2], so all three
   bytes must be readable.  enc is unused. */
static int PTRFASTCALL
148
1.38M
utf8_isName3(const ENCODING *enc, const char *p) {
149
1.38M
  UNUSED_P(enc);
150
1.38M
  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
151
1.38M
}
152
153
#define utf8_isName4 isNever
154
155
/* Name-start-character test for a 2-byte UTF-8 sequence.
   Same lookup as utf8_isName2 but against the nmstrtPages table.  Returns
   non-zero if the selected bit of namingBitmap is set, 0 otherwise.  Reads
   p[0] and p[1].  enc is unused. */
static int PTRFASTCALL
156
366k
utf8_isNmstrt2(const ENCODING *enc, const char *p) {
157
366k
  UNUSED_P(enc);
158
366k
  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
159
366k
}
160
161
/* Name-start-character test for a 3-byte UTF-8 sequence.
   Same lookup as utf8_isName3 but against the nmstrtPages table.  Returns
   non-zero if the selected bit of namingBitmap is set, 0 otherwise.  Reads
   p[0], p[1] and p[2].  enc is unused. */
static int PTRFASTCALL
162
14.4k
utf8_isNmstrt3(const ENCODING *enc, const char *p) {
163
14.4k
  UNUSED_P(enc);
164
14.4k
  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
165
14.4k
}
166
167
#define utf8_isNmstrt4 isNever
168
169
/* Validity check for a 2-byte UTF-8 sequence starting at p.
   Returns non-zero (invalid) when the lead byte is below 0xC2 or when the
   second byte is not in the continuation range 0x80..0xBF; returns 0 when
   the sequence passes.  p is reinterpreted as unsigned bytes so the
   comparisons do not depend on the signedness of char.  enc is unused. */
static int PTRFASTCALL
170
2.16M
utf8_isInvalid2(const ENCODING *enc, const char *p) {
171
2.16M
  UNUSED_P(enc);
172
2.16M
  return UTF8_INVALID2((const unsigned char *)p);
173
2.16M
}
174
175
/* Validity check for a 3-byte UTF-8 sequence starting at p.
   Returns non-zero (invalid) per UTF8_INVALID3, i.e. when
     - the third byte is not a continuation byte (0x80..0xBF),
     - the sequence is EF,BF,BE or EF,BF,BF (U+FFFE / U+FFFF),
     - the lead is 0xE0 and the second byte is outside 0xA0..0xBF,
     - the lead is 0xED and the second byte is outside 0x80..0x9F, or
     - for any other lead, the second byte is not a continuation byte.
   Returns 0 otherwise.  enc is unused. */
static int PTRFASTCALL
176
239M
utf8_isInvalid3(const ENCODING *enc, const char *p) {
177
239M
  UNUSED_P(enc);
178
239M
  return UTF8_INVALID3((const unsigned char *)p);
179
239M
}
180
181
/* Validity check for a 4-byte UTF-8 sequence starting at p.
   Returns non-zero (invalid) per UTF8_INVALID4, i.e. when
     - the third or fourth byte is not a continuation byte (0x80..0xBF),
     - the lead is 0xF0 and the second byte is outside 0x90..0xBF,
     - the lead is 0xF4 and the second byte is outside 0x80..0x8F, or
     - for any other lead, the second byte is not a continuation byte.
   Returns 0 otherwise.  enc is unused. */
static int PTRFASTCALL
182
5.33M
utf8_isInvalid4(const ENCODING *enc, const char *p) {
183
5.33M
  UNUSED_P(enc);
184
5.33M
  return UTF8_INVALID4((const unsigned char *)p);
185
5.33M
}
186
187
/* An ENCODING extended with a per-byte classification table and a set of
   per-encoding character-class callbacks.  The ENCODING is the first
   member, which is what lets AS_NORMAL_ENCODING and SB_BYTE_TYPE cast a
   const ENCODING * to a const struct normal_encoding *. */
struct normal_encoding {
188
  /* Base encoding; must stay the first member (see the casts above). */
  ENCODING enc;
189
  /* Byte type for every possible byte value; SB_BYTE_TYPE indexes it with
     (unsigned char)*p. */
  unsigned char type[256];
190
#ifdef XML_MIN_SIZE
191
  /* Extra callbacks used only in XML_MIN_SIZE builds; they back the
     BYTE_TYPE, IS_NAME_CHAR_MINBPC, IS_NMSTRT_CHAR_MINBPC, BYTE_TO_ASCII
     and CHAR_MATCHES macros respectively. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
192
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
193
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
194
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
195
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
196
#endif /* XML_MIN_SIZE */
197
  /* Name-character tests for 2-, 3- and 4-byte sequences
     (called through IS_NAME_CHAR). */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
198
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
199
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
200
  /* Name-start-character tests for 2-, 3- and 4-byte sequences
     (called through IS_NMSTRT_CHAR). */
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
201
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
202
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
203
  /* Malformed-sequence tests for 2-, 3- and 4-byte sequences (called
     through IS_INVALID_CHAR).  May be NULL: NULL_VTABLE fills them with
     NULL, and the XML_MIN_SIZE form of IS_INVALID_CHAR checks for that
     before calling. */
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
204
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
205
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
206
};
207
208
249M
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
209
210
#ifdef XML_MIN_SIZE
211
212
#  define STANDARD_VTABLE(E)                                                   \
213
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
214
215
#else
216
217
#  define STANDARD_VTABLE(E) /* as nothing */
218
219
#endif
220
221
#define NORMAL_VTABLE(E)                                                       \
222
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
223
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
224
225
#define NULL_VTABLE                                                            \
226
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
227
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
228
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
229
230
static int FASTCALL checkCharRefNumber(int result);
231
232
#include "xmltok_impl.h"
233
#include "ascii.h"
234
235
#ifdef XML_MIN_SIZE
236
#  define sb_isNameMin isNever
237
#  define sb_isNmstrtMin isNever
238
#endif
239
240
#ifdef XML_MIN_SIZE
241
#  define MINBPC(enc) ((enc)->minBytesPerChar)
242
#else
243
/* minimum bytes per character */
244
4.89G
#  define MINBPC(enc) 1
245
#endif
246
247
#define SB_BYTE_TYPE(enc, p)                                                   \
248
4.32G
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
249
250
#ifdef XML_MIN_SIZE
251
/* XML_MIN_SIZE only: function form of SB_BYTE_TYPE.  Returns the entry of
   the encoding's type[] table for the byte at p. */
static int PTRFASTCALL
252
sb_byteType(const ENCODING *enc, const char *p) {
253
  return SB_BYTE_TYPE(enc, p);
254
}
255
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
256
#else
257
4.25G
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
258
#endif
259
260
#ifdef XML_MIN_SIZE
261
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
262
/* XML_MIN_SIZE only: returns the byte at p converted to int (the same
   value the non-XML_MIN_SIZE BYTE_TO_ASCII macro yields).  enc is
   unused. */
static int PTRFASTCALL
263
sb_byteToAscii(const ENCODING *enc, const char *p) {
264
  UNUSED_P(enc);
265
  return *p;
266
}
267
#else
268
183k
#  define BYTE_TO_ASCII(enc, p) (*(p))
269
#endif
270
271
1.99M
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
272
381k
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
273
#ifdef XML_MIN_SIZE
274
#  define IS_INVALID_CHAR(enc, p, n)                                           \
275
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
276
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
277
#else
278
#  define IS_INVALID_CHAR(enc, p, n)                                           \
279
249M
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
280
#endif
281
282
#ifdef XML_MIN_SIZE
283
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
284
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
285
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
286
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
287
#else
288
0
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
289
0
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
290
#endif
291
292
#ifdef XML_MIN_SIZE
293
#  define CHAR_MATCHES(enc, p, c)                                              \
294
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
295
/* XML_MIN_SIZE only: returns 1 if the byte at p equals c, 0 otherwise
   (function form of the non-XML_MIN_SIZE CHAR_MATCHES macro).  enc is
   unused. */
static int PTRCALL
296
sb_charMatches(const ENCODING *enc, const char *p, int c) {
297
  UNUSED_P(enc);
298
  return *p == c;
299
}
300
#else
301
/* c is an ASCII character */
302
15.9M
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
303
#endif
304
305
10.0M
#define PREFIX(ident) normal_##ident
306
#define XML_TOK_IMPL_C
307
#include "xmltok_impl.c"
308
#undef XML_TOK_IMPL_C
309
310
#undef MINBPC
311
#undef BYTE_TYPE
312
#undef BYTE_TO_ASCII
313
#undef CHAR_MATCHES
314
#undef IS_NAME_CHAR
315
#undef IS_NAME_CHAR_MINBPC
316
#undef IS_NMSTRT_CHAR
317
#undef IS_NMSTRT_CHAR_MINBPC
318
#undef IS_INVALID_CHAR
319
320
/* Marker bits carried by the first byte of a 1-, 2-, 3- and 4-byte UTF-8
   sequence.  An encoder ORs the marker with the top payload bits, as
   latin1_toUtf8 does with UTF8_cval2. */
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
321
       UTF8_cval1 = 0x00,
322
       UTF8_cval2 = 0xc0,
323
       UTF8_cval3 = 0xe0,
324
       UTF8_cval4 = 0xf0
325
};
326
327
/* Pulls *fromLimRef back, if needed, so that the range [from, *fromLimRef)
   does not end in the middle of a multi-byte UTF-8 character.

   from        start of the buffer; the limit is never moved below it.
   fromLimRef  in: one past the last available byte;
               out: one past the last byte of the last complete character
               (never greater than the value passed in).

   The scan runs backwards from the limit.  `walked` counts the bytes
   already stepped over.  When a lead byte of an N-byte character is found
   and at least N bytes are available from it (walked + 1 >= N), the limit
   is placed right after that character and the scan stops.  If fewer than
   N bytes are available the lead byte is dropped and the scan continues.
   A byte below 0x80 stops the scan with the limit right after it.  Bytes
   matching none of the four patterns (continuation bytes 0b10xxxxxx and
   0xF8..0xFF) are simply stepped over. */
void
328
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
329
21.5M
                                           const char **fromLimRef) {
330
21.5M
  const char *fromLim = *fromLimRef;
331
21.5M
  size_t walked = 0;
332
33.9M
  for (; fromLim > from; fromLim--, walked++) {
333
33.9M
    /* Byte just before the current limit, as an unsigned value. */
    const unsigned char prev = (unsigned char)fromLim[-1];
334
33.9M
    if ((prev & 0xf8u)
335
33.9M
        == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */
336
2.43M
      if (walked + 1 >= 4) {
337
2.43M
        /* fromLim is one past the lead byte here, so adding N - 1 puts it
           one past the whole N-byte character. */
        fromLim += 4 - 1;
338
2.43M
        break;
339
2.43M
      } else {
340
199
        /* NOTE(review): the loop increment still runs after this reset, so
           walked is 1 (not 0) on the next iteration.  This looks harmless
           when a complete character precedes the dropped lead byte --
           confirm for malformed input. */
        walked = 0;
341
199
      }
342
31.5M
    } else if ((prev & 0xf0u)
343
31.5M
               == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */
344
2.51M
      if (walked + 1 >= 3) {
345
2.51M
        fromLim += 3 - 1;
346
2.51M
        break;
347
2.51M
      } else {
348
4.99k
        walked = 0;
349
4.99k
      }
350
29.0M
    } else if ((prev & 0xe0u)
351
29.0M
               == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */
352
87.1k
      if (walked + 1 >= 2) {
353
86.9k
        fromLim += 2 - 1;
354
86.9k
        break;
355
86.9k
      } else {
356
157
        walked = 0;
357
157
      }
358
28.9M
    } else if ((prev & 0x80u)
359
28.9M
               == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
360
16.5M
      break;
361
16.5M
    }
362
33.9M
  }
363
21.5M
  *fromLimRef = fromLim;
364
21.5M
}
365
366
/* UTF-8 to UTF-8 "conversion": copies bytes from *fromP to *toP without
   ever splitting a multi-byte character.

   enc      unused.
   fromP    in/out: read position; advanced by the number of bytes copied.
   fromLim  one past the last input byte.
   toP      in/out: write position; advanced by the number of bytes copied.
   toLim    one past the last writable output byte.

   The copy length is first capped to the free output space, then trimmed
   back to a character boundary with
   _INTERNAL_trim_to_complete_utf8_characters.

   Returns XML_CONVERT_OUTPUT_EXHAUSTED if the input had to be capped to
   fit the output (this takes precedence), XML_CONVERT_INPUT_INCOMPLETE if
   only the trimming step dropped trailing bytes, and
   XML_CONVERT_COMPLETED otherwise. */
static enum XML_Convert_Result PTRCALL
367
utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
368
21.5M
            char **toP, const char *toLim) {
369
21.5M
  bool input_incomplete = false;
370
21.5M
  bool output_exhausted = false;
371
372
  /* Avoid copying partial characters (due to limited space). */
373
21.5M
  const ptrdiff_t bytesAvailable = fromLim - *fromP;
374
21.5M
  const ptrdiff_t bytesStorable = toLim - *toP;
375
21.5M
  UNUSED_P(enc);
376
21.5M
  if (bytesAvailable > bytesStorable) {
377
51.6k
    fromLim = *fromP + bytesStorable;
378
51.6k
    output_exhausted = true;
379
51.6k
  }
380
381
  /* Avoid copying partial characters (from incomplete input). */
382
21.5M
  {
383
21.5M
    const char *const fromLimBefore = fromLim;
384
21.5M
    _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
385
21.5M
    if (fromLim < fromLimBefore) {
386
5.34k
      input_incomplete = true;
387
5.34k
    }
388
21.5M
  }
389
390
21.5M
  {
391
21.5M
    /* fromLim now marks the end of the last whole character that fits. */
    const ptrdiff_t bytesToCopy = fromLim - *fromP;
392
21.5M
    memcpy(*toP, *fromP, bytesToCopy);
393
21.5M
    *fromP += bytesToCopy;
394
21.5M
    *toP += bytesToCopy;
395
21.5M
  }
396
397
21.5M
  if (output_exhausted) /* needs to go first */
398
51.6k
    return XML_CONVERT_OUTPUT_EXHAUSTED;
399
21.5M
  else if (input_incomplete)
400
0
    return XML_CONVERT_INPUT_INCOMPLETE;
401
21.5M
  else
402
21.5M
    return XML_CONVERT_COMPLETED;
403
21.5M
}
404
405
/* Decodes UTF-8 from *fromP into 16-bit code units at *toP.

   enc      supplies the byte-type table read through SB_BYTE_TYPE.
   fromP    in/out: read position; on return, the first unconsumed byte.
   fromLim  one past the last input byte.
   toP      in/out: write position; on return, the first unwritten unit.
   toLim    one past the last writable output unit.

   Bytes typed BT_LEAD2 / BT_LEAD3 start a 2- / 3-byte sequence that yields
   one code unit.  BT_LEAD4 starts a 4-byte sequence that yields a
   surrogate pair and therefore needs two free output units.  Any other
   byte type is copied as a single unit.  Trailing bytes are only masked,
   not validated, in this function.

   Returns XML_CONVERT_INPUT_INCOMPLETE when a multi-byte sequence is cut
   off by fromLim, XML_CONVERT_OUTPUT_EXHAUSTED when the output fills up
   before the input is consumed (or a surrogate pair does not fit), and
   XML_CONVERT_COMPLETED otherwise. */
static enum XML_Convert_Result PTRCALL
406
utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
407
0
             unsigned short **toP, const unsigned short *toLim) {
408
0
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
409
0
  unsigned short *to = *toP;
410
0
  const char *from = *fromP;
411
0
  while (from < fromLim && to < toLim) {
412
0
    switch (SB_BYTE_TYPE(enc, from)) {
413
0
    case BT_LEAD2:
414
0
      if (fromLim - from < 2) {
415
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
416
0
        goto after;
417
0
      }
418
0
      /* 5 payload bits from the lead byte, 6 from the trailing byte. */
      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
419
0
      from += 2;
420
0
      break;
421
0
    case BT_LEAD3:
422
0
      if (fromLim - from < 3) {
423
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
424
0
        goto after;
425
0
      }
426
0
      /* 4 + 6 + 6 payload bits. */
      *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
427
0
                               | (from[2] & 0x3f));
428
0
      from += 3;
429
0
      break;
430
0
    case BT_LEAD4: {
431
0
      unsigned long n;
432
0
      /* Output space is checked before input length, so a character that
         is both truncated and does not fit reports OUTPUT_EXHAUSTED. */
      if (toLim - to < 2) {
433
0
        res = XML_CONVERT_OUTPUT_EXHAUSTED;
434
0
        goto after;
435
0
      }
436
0
      if (fromLim - from < 4) {
437
0
        res = XML_CONVERT_INPUT_INCOMPLETE;
438
0
        goto after;
439
0
      }
440
0
      /* 3 + 6 + 6 + 6 payload bits. */
      n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
441
0
          | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
442
0
      /* Subtract 0x10000, then split into two 10-bit halves: the upper
         half goes into the 0xD800 unit, the lower into the 0xDC00 unit. */
      n -= 0x10000;
443
0
      to[0] = (unsigned short)((n >> 10) | 0xD800);
444
0
      to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
445
0
      to += 2;
446
0
      from += 4;
447
0
    } break;
448
0
    default:
449
0
      /* NOTE(review): *from is a plain char.  If char is signed and a
         byte >= 0x80 reaches this branch, it is sign-extended before the
         conversion to unsigned short.  Presumably only ASCII-range bytes
         are expected here -- confirm against utf8tab.h. */
      *to++ = *from++;
450
0
      break;
451
0
    }
452
0
  }
453
0
  /* The loop ended normally with input left over: output is full. */
  if (from < fromLim)
454
0
    res = XML_CONVERT_OUTPUT_EXHAUSTED;
455
0
after:
456
0
  *fromP = from;
457
0
  *toP = to;
458
0
  return res;
459
0
}
460
461
#ifdef XML_NS
462
/* UTF-8 encoding descriptor, compiled only when XML_NS is defined.
   Converters: utf8_toUtf8 / utf8_toUtf16.  The byte-type table is built
   from asciitab.h (included without remapping BT_COLON) followed by
   utf8tab.h.  The multi-byte callbacks are the utf8_* functions. */
static const struct normal_encoding utf8_encoding_ns
463
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
464
       {
465
#  include "asciitab.h"
466
#  include "utf8tab.h"
467
       },
468
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
469
#endif
470
471
/* UTF-8 encoding descriptor.  Converters: utf8_toUtf8 / utf8_toUtf16.
   The byte-type table is built from asciitab.h followed by utf8tab.h.
   BT_COLON is defined as BT_NMSTRT while asciitab.h is included, so any
   entry written as BT_COLON there becomes BT_NMSTRT in this table.  The
   multi-byte callbacks are the utf8_* functions. */
static const struct normal_encoding utf8_encoding
472
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
473
       {
474
#define BT_COLON BT_NMSTRT
475
#include "asciitab.h"
476
#undef BT_COLON
477
#include "utf8tab.h"
478
       },
479
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
480
481
#ifdef XML_NS
482
483
/* Variant of utf8_encoding_ns (compiled only when XML_NS is defined) whose
   byte-type table starts from iasciitab.h instead of asciitab.h;
   everything else is identical. */
static const struct normal_encoding internal_utf8_encoding_ns
484
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
485
       {
486
#  include "iasciitab.h"
487
#  include "utf8tab.h"
488
       },
489
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
490
491
#endif
492
493
/* Variant of utf8_encoding whose byte-type table starts from iasciitab.h
   instead of asciitab.h.  BT_COLON is again defined as BT_NMSTRT while
   that header is included; everything else is identical. */
static const struct normal_encoding internal_utf8_encoding
494
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
495
       {
496
#define BT_COLON BT_NMSTRT
497
#include "iasciitab.h"
498
#undef BT_COLON
499
#include "utf8tab.h"
500
       },
501
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
502
503
/* Converts single-byte input (one byte per character, value 0x00..0xFF)
   to UTF-8.

   enc      unused.
   fromP    in/out: read position; advanced past every converted byte.
   fromLim  one past the last input byte.  The loop stops on equality with
            *fromP, so fromLim must be reachable from *fromP.
   toP      in/out: write position; advanced past every byte written.
   toLim    one past the last writable output byte.

   Bytes below 0x80 are copied unchanged.  Bytes with the top bit set are
   written as a 2-byte sequence.

   Returns XML_CONVERT_COMPLETED once all input is consumed, or
   XML_CONVERT_OUTPUT_EXHAUSTED if the next character does not fit; in the
   latter case *fromP is left pointing at that unconverted byte. */
static enum XML_Convert_Result PTRCALL
504
latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
505
25.7k
              char **toP, const char *toLim) {
506
25.7k
  UNUSED_P(enc);
507
11.6M
  for (;;) {
508
11.6M
    unsigned char c;
509
11.6M
    if (*fromP == fromLim)
510
24.2k
      return XML_CONVERT_COMPLETED;
511
11.6M
    c = (unsigned char)**fromP;
512
11.6M
    if (c & 0x80) {
513
8.84M
      /* Needs two output bytes; check before writing either. */
      if (toLim - *toP < 2)
514
801
        return XML_CONVERT_OUTPUT_EXHAUSTED;
515
8.84M
      /* Lead byte: 2-byte marker plus the top two bits of c. */
      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
516
8.84M
      /* Trailing byte: 0b10 prefix plus the low six bits of c. */
      *(*toP)++ = (char)((c & 0x3f) | 0x80);
517
8.84M
      (*fromP)++;
518
8.84M
    } else {
519
2.82M
      if (*toP == toLim)
520
624
        return XML_CONVERT_OUTPUT_EXHAUSTED;
521
2.82M
      *(*toP)++ = *(*fromP)++;
522
2.82M
    }
523
11.6M
  }
524
25.7k
}
525
526
/* Converts single-byte input to 16-bit code units by widening each byte
   (taken as unsigned char) to one unsigned short.

   enc      unused.
   fromP    in/out: read position; advanced past every converted byte.
   fromLim  one past the last input byte.
   toP      in/out: write position; advanced past every unit written.
   toLim    one past the last writable output unit.

   Returns XML_CONVERT_OUTPUT_EXHAUSTED if the output is full while input
   remains, XML_CONVERT_COMPLETED otherwise.  Also used as the UTF-16
   converter of the ASCII encoding tables in this file. */
static enum XML_Convert_Result PTRCALL
527
latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
528
0
               unsigned short **toP, const unsigned short *toLim) {
529
0
  UNUSED_P(enc);
530
0
  while (*fromP < fromLim && *toP < toLim)
531
0
    *(*toP)++ = (unsigned char)*(*fromP)++;
532
533
0
  if ((*toP == toLim) && (*fromP < fromLim))
534
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
535
0
  else
536
0
    return XML_CONVERT_COMPLETED;
537
0
}
538
539
#ifdef XML_NS
540
541
/* Single-byte encoding descriptor using latin1_toUtf8 / latin1_toUtf16,
   compiled only when XML_NS is defined.  The byte-type table is built
   from asciitab.h (included without remapping BT_COLON) followed by
   latin1tab.h.  All multi-byte callbacks are NULL (NULL_VTABLE). */
static const struct normal_encoding latin1_encoding_ns
542
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
543
       {
544
#  include "asciitab.h"
545
#  include "latin1tab.h"
546
       },
547
       STANDARD_VTABLE(sb_) NULL_VTABLE};
548
549
#endif
550
551
/* Single-byte encoding descriptor using latin1_toUtf8 / latin1_toUtf16.
   The byte-type table is built from asciitab.h followed by latin1tab.h.
   BT_COLON is defined as BT_NMSTRT while asciitab.h is included, so any
   entry written as BT_COLON there becomes BT_NMSTRT in this table.  All
   multi-byte callbacks are NULL (NULL_VTABLE). */
static const struct normal_encoding latin1_encoding
552
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
553
       {
554
#define BT_COLON BT_NMSTRT
555
#include "asciitab.h"
556
#undef BT_COLON
557
#include "latin1tab.h"
558
       },
559
       STANDARD_VTABLE(sb_) NULL_VTABLE};
560
561
/* Copies input bytes to the output unchanged, one byte per character.

   enc      unused.
   fromP    in/out: read position; advanced past every copied byte.
   fromLim  one past the last input byte.
   toP      in/out: write position; advanced past every byte written.
   toLim    one past the last writable output byte.

   Returns XML_CONVERT_OUTPUT_EXHAUSTED if the output is full while input
   remains, XML_CONVERT_COMPLETED otherwise. */
static enum XML_Convert_Result PTRCALL
562
ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
563
778
             char **toP, const char *toLim) {
564
778
  UNUSED_P(enc);
565
15.8k
  while (*fromP < fromLim && *toP < toLim)
566
15.0k
    *(*toP)++ = *(*fromP)++;
567
568
778
  if ((*toP == toLim) && (*fromP < fromLim))
569
212
    return XML_CONVERT_OUTPUT_EXHAUSTED;
570
566
  else
571
566
    return XML_CONVERT_COMPLETED;
572
778
}
573
574
#ifdef XML_NS
575
576
/* ASCII encoding descriptor, compiled only when XML_NS is defined.
   Converters: ascii_toUtf8 and (shared) latin1_toUtf16.  Only asciitab.h
   contributes to the byte-type table; the entries it does not cover are
   left zero-initialised, which is BT_NONXML per the note inside the
   initializer.  All multi-byte callbacks are NULL (NULL_VTABLE). */
static const struct normal_encoding ascii_encoding_ns
577
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
578
       {
579
#  include "asciitab.h"
580
           /* BT_NONXML == 0 */
581
       },
582
       STANDARD_VTABLE(sb_) NULL_VTABLE};
583
584
#endif
585
586
/* ASCII encoding descriptor.  Converters: ascii_toUtf8 and (shared)
   latin1_toUtf16.  Only asciitab.h contributes to the byte-type table,
   with BT_COLON defined as BT_NMSTRT while it is included.  The entries
   it does not cover are left zero-initialised, which is BT_NONXML per the
   note inside the initializer.  All multi-byte callbacks are NULL
   (NULL_VTABLE). */
static const struct normal_encoding ascii_encoding
587
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
588
       {
589
#define BT_COLON BT_NMSTRT
590
#include "asciitab.h"
591
#undef BT_COLON
592
           /* BT_NONXML == 0 */
593
       },
594
       STANDARD_VTABLE(sb_) NULL_VTABLE};
595
596
/* Classifies a 16-bit code unit given as its high and low bytes.

   hi, lo  the two bytes of the unit; both are cast to unsigned char
           before comparison, so the result does not depend on the
           signedness of char.  lo is only examined when hi is 0xFF.

   Returns
     BT_LEAD4     for hi in 0xD8..0xDB (first unit of a surrogate pair),
     BT_TRAIL     for hi in 0xDC..0xDF (second unit of a surrogate pair),
     BT_NONXML    for the units 0xFFFE and 0xFFFF,
     BT_NONASCII  for everything else. */
static int PTRFASTCALL
597
1.20G
unicode_byte_type(char hi, char lo) {
598
1.20G
  switch ((unsigned char)hi) {
599
  /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
600
10.0M
  case 0xD8:
601
14.3M
  case 0xD9:
602
40.7M
  case 0xDA:
603
50.6M
  case 0xDB:
604
50.6M
    return BT_LEAD4;
605
  /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
606
837
  case 0xDC:
607
1.70k
  case 0xDD:
608
2.53k
  case 0xDE:
609
3.35k
  case 0xDF:
610
3.35k
    return BT_TRAIL;
611
2.86M
  case 0xFF:
612
2.86M
    switch ((unsigned char)lo) {
613
972
    case 0xFF: /* noncharacter-FFFF */
614
1.06k
    case 0xFE: /* noncharacter-FFFE */
615
1.06k
      return BT_NONXML;
616
2.86M
    }
617
2.86M
    /* Any other 0xFFxx unit falls through to the default result. */
    break;
618
1.20G
  }
619
1.15G
  return BT_NONASCII;
620
1.20G
}
621
622
/* DEFINE_UTF16_TO_UTF8(E) expands to the definition of E##toUtf8, a
 * UTF-16 -> UTF-8 converter.  GET_LO / GET_HI must be defined at the point
 * of expansion; they select which byte of each 2-byte unit is the low and
 * which the high byte.
 *
 * Behaviour of the generated function:
 *  - fromLim is first rounded down so that an even number of input bytes is
 *    considered; enc is unused.
 *  - hi == 0 and lo < 0x80         -> 1 output byte.
 *  - hi in 0x00..0x07 otherwise    -> 2 output bytes.
 *  - hi in 0xD8..0xDB (lead unit)  -> 4 output bytes; the following unit is
 *    consumed as well.  Only the low bits of that second unit are used; it
 *    is not checked to be a trail surrogate here.  The "+ 1" in the plane
 *    computation restores the 0x10000 offset of surrogate-pair encoding.
 *  - every other hi value          -> 3 output bytes.
 *  - Before writing, the remaining output space (toLim - *toP) is checked;
 *    if it is too small, *fromP is set to the current unit and
 *    XML_CONVERT_OUTPUT_EXHAUSTED is returned.
 *  - A lead unit with fewer than 4 input bytes left returns
 *    XML_CONVERT_INPUT_INCOMPLETE with *fromP at that unit.
 *  - *toP is advanced as bytes are written; *fromP is only stored on return.
 */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
623
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
624
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
625
1.89M
      char **toP, const char *toLim) {                                         \
626
1.89M
    const char *from = *fromP;                                                 \
627
1.89M
    UNUSED_P(enc);                                                             \
628
1.89M
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
629
286M
    for (; from < fromLim; from += 2) {                                        \
630
284M
      int plane;                                                               \
631
284M
      unsigned char lo2;                                                       \
632
284M
      unsigned char lo = GET_LO(from);                                         \
633
284M
      unsigned char hi = GET_HI(from);                                         \
634
284M
      switch (hi) {                                                            \
635
5.55M
      case 0:                                                                  \
636
5.55M
        if (lo < 0x80) {                                                       \
637
4.91M
          if (*toP == toLim) {                                                 \
638
2.44k
            *fromP = from;                                                     \
639
2.44k
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
640
2.44k
          }                                                                    \
641
4.91M
          *(*toP)++ = lo;                                                      \
642
4.91M
          break;                                                               \
643
4.91M
        }                                                                      \
644
5.55M
        /* fall through */                                                     \
645
5.55M
      case 0x1:                                                                \
646
905k
      case 0x2:                                                                \
647
1.00M
      case 0x3:                                                                \
648
1.12M
      case 0x4:                                                                \
649
1.19M
      case 0x5:                                                                \
650
1.28M
      case 0x6:                                                                \
651
1.34M
      case 0x7:                                                                \
652
1.34M
        if (toLim - *toP < 2) {                                                \
653
713
          *fromP = from;                                                       \
654
713
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
655
713
        }                                                                      \
656
1.34M
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
657
1.34M
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
658
1.34M
        break;                                                                 \
659
268M
      default:                                                                 \
660
268M
        if (toLim - *toP < 3) {                                                \
661
46.4k
          *fromP = from;                                                       \
662
46.4k
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
663
46.4k
        }                                                                      \
664
268M
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
665
268M
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
666
268M
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
667
268M
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
668
268M
        break;                                                                 \
669
268M
      case 0xD8:                                                               \
670
3.31M
      case 0xD9:                                                               \
671
8.15M
      case 0xDA:                                                               \
672
10.1M
      case 0xDB:                                                               \
673
10.1M
        if (toLim - *toP < 4) {                                                \
674
3.66k
          *fromP = from;                                                       \
675
3.66k
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
676
3.66k
        }                                                                      \
677
10.1M
        if (fromLim - from < 4) {                                              \
678
0
          *fromP = from;                                                       \
679
0
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
680
0
        }                                                                      \
681
10.1M
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
682
10.1M
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
683
10.1M
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
684
10.1M
        from += 2;                                                             \
685
10.1M
        lo2 = GET_LO(from);                                                    \
686
10.1M
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
687
10.1M
                     | (lo2 >> 6) | 0x80);                                     \
688
10.1M
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
689
10.1M
        break;                                                                 \
690
284M
      }                                                                        \
691
284M
    }                                                                          \
692
1.89M
    *fromP = from;                                                             \
693
1.84M
    if (from < fromLim)                                                        \
694
1.84M
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
695
1.84M
    else                                                                       \
696
1.84M
      return XML_CONVERT_COMPLETED;                                            \
697
1.84M
  }
xmltok.c:little2_toUtf8
Line
Count
Source
625
1.11M
      char **toP, const char *toLim) {                                         \
626
1.11M
    const char *from = *fromP;                                                 \
627
1.11M
    UNUSED_P(enc);                                                             \
628
1.11M
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
629
108M
    for (; from < fromLim; from += 2) {                                        \
630
107M
      int plane;                                                               \
631
107M
      unsigned char lo2;                                                       \
632
107M
      unsigned char lo = GET_LO(from);                                         \
633
107M
      unsigned char hi = GET_HI(from);                                         \
634
107M
      switch (hi) {                                                            \
635
3.36M
      case 0:                                                                  \
636
3.36M
        if (lo < 0x80) {                                                       \
637
3.00M
          if (*toP == toLim) {                                                 \
638
1.61k
            *fromP = from;                                                     \
639
1.61k
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
640
1.61k
          }                                                                    \
641
3.00M
          *(*toP)++ = lo;                                                      \
642
2.99M
          break;                                                               \
643
3.00M
        }                                                                      \
644
3.36M
        /* fall through */                                                     \
645
3.36M
      case 0x1:                                                                \
646
495k
      case 0x2:                                                                \
647
563k
      case 0x3:                                                                \
648
600k
      case 0x4:                                                                \
649
635k
      case 0x5:                                                                \
650
724k
      case 0x6:                                                                \
651
769k
      case 0x7:                                                                \
652
769k
        if (toLim - *toP < 2) {                                                \
653
429
          *fromP = from;                                                       \
654
429
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
655
429
        }                                                                      \
656
769k
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
657
768k
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
658
768k
        break;                                                                 \
659
98.9M
      default:                                                                 \
660
98.9M
        if (toLim - *toP < 3) {                                                \
661
22.1k
          *fromP = from;                                                       \
662
22.1k
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
663
22.1k
        }                                                                      \
664
98.9M
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
665
98.9M
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
666
98.8M
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
667
98.8M
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
668
98.8M
        break;                                                                 \
669
98.9M
      case 0xD8:                                                               \
670
2.11M
      case 0xD9:                                                               \
671
4.22M
      case 0xDA:                                                               \
672
5.07M
      case 0xDB:                                                               \
673
5.07M
        if (toLim - *toP < 4) {                                                \
674
1.48k
          *fromP = from;                                                       \
675
1.48k
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
676
1.48k
        }                                                                      \
677
5.07M
        if (fromLim - from < 4) {                                              \
678
0
          *fromP = from;                                                       \
679
0
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
680
0
        }                                                                      \
681
5.07M
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
682
5.07M
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
683
5.07M
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
684
5.07M
        from += 2;                                                             \
685
5.07M
        lo2 = GET_LO(from);                                                    \
686
5.07M
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
687
5.07M
                     | (lo2 >> 6) | 0x80);                                     \
688
5.07M
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
689
5.07M
        break;                                                                 \
690
107M
      }                                                                        \
691
107M
    }                                                                          \
692
1.11M
    *fromP = from;                                                             \
693
1.08M
    if (from < fromLim)                                                        \
694
1.08M
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
695
1.08M
    else                                                                       \
696
1.08M
      return XML_CONVERT_COMPLETED;                                            \
697
1.08M
  }
xmltok.c:big2_toUtf8
Line
Count
Source
625
786k
      char **toP, const char *toLim) {                                         \
626
786k
    const char *from = *fromP;                                                 \
627
786k
    UNUSED_P(enc);                                                             \
628
786k
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
629
177M
    for (; from < fromLim; from += 2) {                                        \
630
177M
      int plane;                                                               \
631
177M
      unsigned char lo2;                                                       \
632
177M
      unsigned char lo = GET_LO(from);                                         \
633
177M
      unsigned char hi = GET_HI(from);                                         \
634
177M
      switch (hi) {                                                            \
635
2.19M
      case 0:                                                                  \
636
2.19M
        if (lo < 0x80) {                                                       \
637
1.91M
          if (*toP == toLim) {                                                 \
638
836
            *fromP = from;                                                     \
639
836
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
640
836
          }                                                                    \
641
1.91M
          *(*toP)++ = lo;                                                      \
642
1.91M
          break;                                                               \
643
1.91M
        }                                                                      \
644
2.19M
        /* fall through */                                                     \
645
2.19M
      case 0x1:                                                                \
646
409k
      case 0x2:                                                                \
647
438k
      case 0x3:                                                                \
648
525k
      case 0x4:                                                                \
649
558k
      case 0x5:                                                                \
650
563k
      case 0x6:                                                                \
651
572k
      case 0x7:                                                                \
652
572k
        if (toLim - *toP < 2) {                                                \
653
284
          *fromP = from;                                                       \
654
284
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
655
284
        }                                                                      \
656
572k
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
657
571k
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
658
571k
        break;                                                                 \
659
169M
      default:                                                                 \
660
169M
        if (toLim - *toP < 3) {                                                \
661
24.2k
          *fromP = from;                                                       \
662
24.2k
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
663
24.2k
        }                                                                      \
664
169M
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
665
169M
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
666
169M
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
667
169M
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
668
169M
        break;                                                                 \
669
169M
      case 0xD8:                                                               \
670
1.19M
      case 0xD9:                                                               \
671
3.92M
      case 0xDA:                                                               \
672
5.08M
      case 0xDB:                                                               \
673
5.08M
        if (toLim - *toP < 4) {                                                \
674
2.17k
          *fromP = from;                                                       \
675
2.17k
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
676
2.17k
        }                                                                      \
677
5.08M
        if (fromLim - from < 4) {                                              \
678
0
          *fromP = from;                                                       \
679
0
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
680
0
        }                                                                      \
681
5.08M
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
682
5.08M
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
683
5.08M
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
684
5.08M
        from += 2;                                                             \
685
5.08M
        lo2 = GET_LO(from);                                                    \
686
5.08M
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
687
5.08M
                     | (lo2 >> 6) | 0x80);                                     \
688
5.08M
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
689
5.08M
        break;                                                                 \
690
177M
      }                                                                        \
691
177M
    }                                                                          \
692
786k
    *fromP = from;                                                             \
693
759k
    if (from < fromLim)                                                        \
694
759k
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
695
759k
    else                                                                       \
696
759k
      return XML_CONVERT_COMPLETED;                                            \
697
759k
  }
698
699
/* DEFINE_UTF16_TO_UTF16(E) expands to the definition of E##toUtf16, which
 * copies 2-byte input units into an unsigned short output buffer, composing
 * each value as (GET_HI << 8) | GET_LO.  GET_LO / GET_HI must be defined at
 * the point of expansion.
 *
 * Behaviour of the generated function:
 *  - fromLim is rounded down to an even number of input bytes; enc is
 *    unused.
 *  - If there are more input bytes than the output can hold
 *    ((toLim - *toP) << 1) and the last input unit has a high byte in
 *    0xD8..0xDF ((hi & 0xF8) == 0xD8), that last unit is dropped from this
 *    call and the pending result becomes XML_CONVERT_INPUT_INCOMPLETE.
 *  - Units are then copied while both input and output remain; *fromP and
 *    *toP are advanced in place.
 *  - Returns XML_CONVERT_OUTPUT_EXHAUSTED if the output filled while input
 *    remained, otherwise the pending result (COMPLETED or INPUT_INCOMPLETE).
 */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
700
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
701
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
702
0
      unsigned short **toP, const unsigned short *toLim) {                     \
703
0
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
704
0
    UNUSED_P(enc);                                                             \
705
0
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
706
0
    /* Avoid copying first half only of surrogate */                           \
707
0
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
708
0
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
709
0
      fromLim -= 2;                                                            \
710
0
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
711
0
    }                                                                          \
712
0
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
713
0
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
714
0
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
715
0
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
716
0
    else                                                                       \
717
0
      return res;                                                              \
718
0
  }
Unexecuted instantiation: xmltok.c:little2_toUtf16
Unexecuted instantiation: xmltok.c:big2_toUtf16
719
720
112M
/* Instantiate the UTF-16 converters for both byte orders.
 *
 * First pass: low byte at index 0, high byte at index 1 -> defines
 * little2_toUtf8 and little2_toUtf16.  Second pass: the indices are swapped
 * -> defines big2_toUtf8 and big2_toUtf16.  GET_LO / GET_HI are undefined
 * again after each pass so they do not leak into later code.
 */
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
721
112M
#define GET_HI(ptr) ((unsigned char)(ptr)[1])
722
723
DEFINE_UTF16_TO_UTF8(little2_)
724
DEFINE_UTF16_TO_UTF16(little2_)
725
726
#undef GET_LO
727
#undef GET_HI
728
729
182M
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
730
182M
#define GET_HI(ptr) ((unsigned char)(ptr)[0])
731
732
DEFINE_UTF16_TO_UTF8(big2_)
733
DEFINE_UTF16_TO_UTF16(big2_)
734
735
#undef GET_LO
736
#undef GET_HI
737
738
/* Character helpers for 2-byte units stored low byte first: p[0] is the low
 * byte and p[1] the high byte.
 *
 *  LITTLE2_BYTE_TYPE     - high byte zero: defer to SB_BYTE_TYPE(enc, p);
 *                          otherwise unicode_byte_type(high, low).
 *  LITTLE2_BYTE_TO_ASCII - the low byte if the high byte is zero, else -1.
 *  LITTLE2_CHAR_MATCHES  - true when the unit is exactly the value c with a
 *                          zero high byte.
 *  LITTLE2_IS_NAME_CHAR_MINBPC / LITTLE2_IS_NMSTRT_CHAR_MINBPC - look the
 *                          unit up via UCS2_GET_NAMING in namePages /
 *                          nmstrtPages, passing (high, low).
 *
 * NOTE(review): the two UCS2_GET_NAMING wrappers use p unparenthesized
 * (p[1], p[0]); callers visible here pass a plain identifier -- confirm
 * before passing an expression.
 */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
739
492M
  ((p)[1] == 0 ? SB_BYTE_TYPE(enc, p) : unicode_byte_type((p)[1], (p)[0]))
740
427k
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
741
441k
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
742
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
743
4.62M
  UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
744
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
745
37.3k
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
746
747
#ifdef XML_MIN_SIZE
748
749
/* XML_MIN_SIZE build: function form of LITTLE2_BYTE_TYPE; returns the byte
 * type of the 2-byte unit at p.
 */
static int PTRFASTCALL
750
little2_byteType(const ENCODING *enc, const char *p) {
751
  return LITTLE2_BYTE_TYPE(enc, p);
752
}
753
754
/* XML_MIN_SIZE build: function form of LITTLE2_BYTE_TO_ASCII; returns the
 * low byte of the unit at p when its high byte is zero, otherwise -1.
 * enc is unused.
 */
static int PTRFASTCALL
755
little2_byteToAscii(const ENCODING *enc, const char *p) {
756
  UNUSED_P(enc);
757
  return LITTLE2_BYTE_TO_ASCII(p);
758
}
759
760
/* XML_MIN_SIZE build: function form of LITTLE2_CHAR_MATCHES; nonzero when
 * the unit at p has a zero high byte and a low byte equal to c.
 * enc is unused.
 */
static int PTRCALL
760
little2_charMatches(const ENCODING *enc, const char *p, int c) {
761
  UNUSED_P(enc);
762
  return LITTLE2_CHAR_MATCHES(p, c);
763
}
765
766
/* XML_MIN_SIZE build: function form of LITTLE2_IS_NAME_CHAR_MINBPC; looks
 * the unit at p up in namePages.  enc is unused.
 */
static int PTRFASTCALL
767
little2_isNameMin(const ENCODING *enc, const char *p) {
768
  UNUSED_P(enc);
769
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
770
}
771
772
/* XML_MIN_SIZE build: function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC; looks
 * the unit at p up in nmstrtPages.  enc is unused.
 */
static int PTRFASTCALL
773
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
774
  UNUSED_P(enc);
775
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
776
}
777
778
/* XML_MIN_SIZE build: rebind VTABLE so the little2 encoding initializers
 * that follow expand to VTABLE1 plus the little2 converters.
 */
#  undef VTABLE
779
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
780
781
#else /* not XML_MIN_SIZE */
782
783
/* Non-XML_MIN_SIZE build: configure the template in xmltok_impl.c for
 * 2-byte, low-byte-first units and include it.
 *
 * PREFIX makes generated identifiers start with little2_, MINBPC is fixed
 * at 2, and the single-unit predicates forward to the LITTLE2_* macros.
 * The multi-byte predicates IS_NAME_CHAR / IS_NMSTRT_CHAR are constant 0.
 * All of these except PREFIX are undefined again after the include.
 * IS_INVALID_CHAR is also undefined although it is not defined in this
 * span -- presumably it may be set elsewhere; verify.
 */
#  undef PREFIX
784
480k
#  define PREFIX(ident) little2_##ident
785
924M
#  define MINBPC(enc) 2
786
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
787
492M
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
788
427k
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
789
441k
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
790
119
#  define IS_NAME_CHAR(enc, p, n) 0
791
4.62M
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
792
119
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
793
37.3k
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
794
795
#  define XML_TOK_IMPL_C
796
#  include "xmltok_impl.c"
797
#  undef XML_TOK_IMPL_C
798
799
#  undef MINBPC
800
#  undef BYTE_TYPE
801
#  undef BYTE_TO_ASCII
802
#  undef CHAR_MATCHES
803
#  undef IS_NAME_CHAR
804
#  undef IS_NAME_CHAR_MINBPC
805
#  undef IS_NMSTRT_CHAR
806
#  undef IS_NMSTRT_CHAR_MINBPC
807
#  undef IS_INVALID_CHAR
808
809
#endif /* not XML_MIN_SIZE */
810
811
#ifdef XML_NS
812
813
/* Low-byte-first 2-byte encoding descriptor, compiled only under XML_NS.
 *
 * The byte-type table is built from asciitab.h followed by latin1tab.h,
 * with BT_COLON left as defined.  The last scalar in the header is 1 only
 * when BYTEORDER == 1234 and 0 otherwise.
 * NOTE(review): "2, 0, <flag>" are presumably minBytesPerChar, the UTF-8
 * flag and the UTF-16 flag -- confirm against the ENCODING struct.
 */
static const struct normal_encoding little2_encoding_ns
814
    = {{VTABLE, 2, 0,
815
#  if BYTEORDER == 1234
816
        1
817
#  else
818
        0
819
#  endif
820
       },
821
       {
822
#  include "asciitab.h"
823
#  include "latin1tab.h"
824
       },
825
       STANDARD_VTABLE(little2_) NULL_VTABLE};
826
827
#endif
828
829
/* Low-byte-first 2-byte encoding descriptor with BT_COLON remapped.
 *
 * Same layout as the _ns variant, but BT_COLON is defined as BT_NMSTRT
 * while asciitab.h is included, so table entries spelled BT_COLON become
 * BT_NMSTRT.  latin1tab.h is included after BT_COLON is undefined again.
 * The last header scalar is 1 only when BYTEORDER == 1234.
 */
static const struct normal_encoding little2_encoding
830
    = {{VTABLE, 2, 0,
831
#if BYTEORDER == 1234
832
        1
833
#else
834
        0
835
#endif
836
       },
837
       {
838
#define BT_COLON BT_NMSTRT
839
#include "asciitab.h"
840
#undef BT_COLON
841
#include "latin1tab.h"
842
       },
843
       STANDARD_VTABLE(little2_) NULL_VTABLE};
844
845
#if BYTEORDER != 4321
846
847
#  ifdef XML_NS
848
849
/* Internal low-byte-first descriptor; compiled only when BYTEORDER != 4321
 * and XML_NS is defined.
 *
 * Differs from little2_encoding_ns in two visible ways: the last header
 * scalar is unconditionally 1, and the first half of the byte-type table
 * comes from iasciitab.h instead of asciitab.h.
 */
static const struct normal_encoding internal_little2_encoding_ns
850
    = {{VTABLE, 2, 0, 1},
851
       {
852
#    include "iasciitab.h"
853
#    include "latin1tab.h"
854
       },
855
       STANDARD_VTABLE(little2_) NULL_VTABLE};
856
857
#  endif
858
859
/* Internal low-byte-first descriptor with BT_COLON remapped; compiled only
 * when BYTEORDER != 4321.
 *
 * Uses iasciitab.h (with BT_COLON temporarily defined as BT_NMSTRT) and
 * latin1tab.h for the byte-type table; the last header scalar is
 * unconditionally 1.
 */
static const struct normal_encoding internal_little2_encoding
860
    = {{VTABLE, 2, 0, 1},
861
       {
862
#  define BT_COLON BT_NMSTRT
863
#  include "iasciitab.h"
864
#  undef BT_COLON
865
#  include "latin1tab.h"
866
       },
867
       STANDARD_VTABLE(little2_) NULL_VTABLE};
868
869
#endif
870
871
/* Character helpers for 2-byte units stored high byte first: p[0] is the
 * high byte and p[1] the low byte.
 *
 *  BIG2_BYTE_TYPE     - high byte zero: defer to SB_BYTE_TYPE on the low
 *                       byte (p + 1); otherwise unicode_byte_type(high, low).
 *  BIG2_BYTE_TO_ASCII - the low byte if the high byte is zero, else -1.
 *  BIG2_CHAR_MATCHES  - true when the unit is exactly the value c with a
 *                       zero high byte.
 *  BIG2_IS_NAME_CHAR_MINBPC / BIG2_IS_NMSTRT_CHAR_MINBPC - look the unit up
 *                       via UCS2_GET_NAMING in namePages / nmstrtPages,
 *                       passing (high, low).
 *
 * NOTE(review): p is used unparenthesized in "p + 1" and in the two
 * UCS2_GET_NAMING wrappers; callers visible here pass a plain identifier --
 * confirm before passing an expression.
 */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
872
773M
  ((p)[0] == 0 ? SB_BYTE_TYPE(enc, p + 1) : unicode_byte_type((p)[0], (p)[1]))
873
160k
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
874
602k
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
875
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
876
3.82M
  UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
877
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
878
38.8k
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
879
880
#ifdef XML_MIN_SIZE
881
882
/* XML_MIN_SIZE build: function form of BIG2_BYTE_TYPE; returns the byte
 * type of the 2-byte unit at p.
 */
static int PTRFASTCALL
883
big2_byteType(const ENCODING *enc, const char *p) {
884
  return BIG2_BYTE_TYPE(enc, p);
885
}
886
887
/* XML_MIN_SIZE build: function form of BIG2_BYTE_TO_ASCII; returns the low
 * byte of the unit at p when its high byte is zero, otherwise -1.
 * enc is unused.
 */
static int PTRFASTCALL
888
big2_byteToAscii(const ENCODING *enc, const char *p) {
889
  UNUSED_P(enc);
890
  return BIG2_BYTE_TO_ASCII(p);
891
}
892
893
/* XML_MIN_SIZE build: function form of BIG2_CHAR_MATCHES; nonzero when the
 * unit at p has a zero high byte and a low byte equal to c.
 * enc is unused.
 */
static int PTRCALL
894
big2_charMatches(const ENCODING *enc, const char *p, int c) {
895
  UNUSED_P(enc);
896
  return BIG2_CHAR_MATCHES(p, c);
897
}
898
899
/* XML_MIN_SIZE build: function form of BIG2_IS_NAME_CHAR_MINBPC; looks the
 * unit at p up in namePages.  enc is unused.
 */
static int PTRFASTCALL
900
big2_isNameMin(const ENCODING *enc, const char *p) {
901
  UNUSED_P(enc);
902
  return BIG2_IS_NAME_CHAR_MINBPC(p);
903
}
904
905
/* XML_MIN_SIZE build: function form of BIG2_IS_NMSTRT_CHAR_MINBPC; looks
 * the unit at p up in nmstrtPages.  enc is unused.
 */
static int PTRFASTCALL
906
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
907
  UNUSED_P(enc);
908
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
909
}
910
911
/* XML_MIN_SIZE build: rebind VTABLE so the big2 encoding initializers that
 * follow expand to VTABLE1 plus the big2 converters.
 */
#  undef VTABLE
912
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
913
914
#else /* not XML_MIN_SIZE */
915
916
/* Non-XML_MIN_SIZE build: configure the template in xmltok_impl.c for
 * 2-byte, high-byte-first units and include it.
 *
 * PREFIX makes generated identifiers start with big2_, MINBPC is fixed at
 * 2, and the single-unit predicates forward to the BIG2_* macros.  The
 * multi-byte predicates IS_NAME_CHAR / IS_NMSTRT_CHAR are constant 0.
 * All of these except PREFIX are undefined again after the include.
 * IS_INVALID_CHAR is also undefined although it is not defined in this
 * span -- presumably it may be set elsewhere; verify.
 */
#  undef PREFIX
917
448k
#  define PREFIX(ident) big2_##ident
918
1.47G
#  define MINBPC(enc) 2
919
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
920
773M
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
921
160k
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
922
602k
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
923
137
#  define IS_NAME_CHAR(enc, p, n) 0
924
3.82M
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
925
137
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
926
38.8k
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
927
928
#  define XML_TOK_IMPL_C
929
#  include "xmltok_impl.c"
930
#  undef XML_TOK_IMPL_C
931
932
#  undef MINBPC
933
#  undef BYTE_TYPE
934
#  undef BYTE_TO_ASCII
935
#  undef CHAR_MATCHES
936
#  undef IS_NAME_CHAR
937
#  undef IS_NAME_CHAR_MINBPC
938
#  undef IS_NMSTRT_CHAR
939
#  undef IS_NMSTRT_CHAR_MINBPC
940
#  undef IS_INVALID_CHAR
941
942
#endif /* not XML_MIN_SIZE */
943
944
#ifdef XML_NS
945
946
/* High-byte-first 2-byte encoding descriptor, compiled only under XML_NS.
 *
 * The byte-type table is built from asciitab.h followed by latin1tab.h,
 * with BT_COLON left as defined.  The last scalar in the header is 1 only
 * when BYTEORDER == 4321 and 0 otherwise.
 * NOTE(review): "2, 0, <flag>" are presumably minBytesPerChar, the UTF-8
 * flag and the UTF-16 flag -- confirm against the ENCODING struct.
 */
static const struct normal_encoding big2_encoding_ns
947
    = {{VTABLE, 2, 0,
948
#  if BYTEORDER == 4321
949
        1
950
#  else
951
        0
952
#  endif
953
       },
954
       {
955
#  include "asciitab.h"
956
#  include "latin1tab.h"
957
       },
958
       STANDARD_VTABLE(big2_) NULL_VTABLE};
959
960
#endif
961
962
/* High-byte-first 2-byte encoding descriptor with BT_COLON remapped.
 *
 * Same layout as the _ns variant, but BT_COLON is defined as BT_NMSTRT
 * while asciitab.h is included, so table entries spelled BT_COLON become
 * BT_NMSTRT.  latin1tab.h is included after BT_COLON is undefined again.
 * The last header scalar is 1 only when BYTEORDER == 4321.
 */
static const struct normal_encoding big2_encoding
963
    = {{VTABLE, 2, 0,
964
#if BYTEORDER == 4321
965
        1
966
#else
967
        0
968
#endif
969
       },
970
       {
971
#define BT_COLON BT_NMSTRT
972
#include "asciitab.h"
973
#undef BT_COLON
974
#include "latin1tab.h"
975
       },
976
       STANDARD_VTABLE(big2_) NULL_VTABLE};
977
978
#if BYTEORDER != 1234
979
980
#  ifdef XML_NS
981
982
/* Internal high-byte-first descriptor; compiled only when BYTEORDER != 1234
 * and XML_NS is defined.
 *
 * Differs from big2_encoding_ns in two visible ways: the last header scalar
 * is unconditionally 1, and the first half of the byte-type table comes
 * from iasciitab.h instead of asciitab.h.
 */
static const struct normal_encoding internal_big2_encoding_ns
983
    = {{VTABLE, 2, 0, 1},
984
       {
985
#    include "iasciitab.h"
986
#    include "latin1tab.h"
987
       },
988
       STANDARD_VTABLE(big2_) NULL_VTABLE};
989
990
#  endif
991
992
/* Internal high-byte-first descriptor with BT_COLON remapped; compiled only
 * when BYTEORDER != 1234.
 *
 * Uses iasciitab.h (with BT_COLON temporarily defined as BT_NMSTRT) and
 * latin1tab.h for the byte-type table; the last header scalar is
 * unconditionally 1.
 */
static const struct normal_encoding internal_big2_encoding
993
    = {{VTABLE, 2, 0, 1},
994
       {
995
#  define BT_COLON BT_NMSTRT
996
#  include "iasciitab.h"
997
#  undef BT_COLON
998
#  include "latin1tab.h"
999
       },
1000
       STANDARD_VTABLE(big2_) NULL_VTABLE};
1001
1002
#endif
1003
1004
#undef PREFIX
1005
1006
/* Compare two NUL-terminated strings, ignoring the case of ASCII letters.
 *
 * Each character in the range ASCII_a..ASCII_z is shifted to its upper-case
 * counterpart before comparison; all other characters are compared as-is.
 *
 * Returns 1 when the strings are equal up to and including the terminating
 * NUL, and 0 at the first mismatch.
 */
static int FASTCALL
1007
130k
streqci(const char *s1, const char *s2) {
1008
669k
  for (;;) {
1009
669k
    char c1 = *s1++;
1010
669k
    char c2 = *s2++;
1011
669k
    if (ASCII_a <= c1 && c1 <= ASCII_z)
1012
1.36k
      c1 += ASCII_A - ASCII_a;
1013
669k
    if (ASCII_a <= c2 && c2 <= ASCII_z)
1014
      /* The following line will never get executed.  streqci() is
1015
       * only called from two places, both of which guarantee to put
1016
       * upper-case strings into s2.
1017
       */
1018
0
      c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1019
669k
    if (c1 != c2)
1020
108k
      return 0;
1021
561k
    if (! c1) /* both strings ended together (c1 == c2 == NUL) */
1022
21.7k
      break;
1023
561k
  }
1024
21.7k
  return 1;
1025
130k
}
1026
1027
/* Update pos for the text between ptr and end by delegating to
 * normal_updatePosition with the UTF-8 encoding (utf8_encoding.enc).
 * The enc argument passed by the caller is ignored.
 */
static void PTRCALL
1028
initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1029
74
                   POSITION *pos) {
1030
74
  UNUSED_P(enc);
1031
74
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1032
74
}
1033
1034
/* Convert the character at ptr to a single byte using enc's UTF-8
 * converter.
 *
 * XmlUtf8Convert is given a one-byte output buffer.  If it wrote nothing
 * (the output cursor did not move), -1 is returned; otherwise the single
 * byte written is returned as a plain char value.
 */
static int
1035
43.3k
toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1036
43.3k
  char buf[1];
1037
43.3k
  char *p = buf;
1038
43.3k
  XmlUtf8Convert(enc, &ptr, end, &p, p + 1); /* room for one output byte */
1039
43.3k
  if (p == buf)
1040
552
    return -1;
1041
42.8k
  else
1042
42.8k
    return buf[0];
1043
43.3k
}
1044
1045
/* Return 1 if c is one of 0x20 (space), 0xD (CR), 0xA (LF) or 0x9 (tab),
 * and 0 for any other value (including -1).
 */
static int FASTCALL
1046
23.7k
isSpace(int c) {
1047
23.7k
  switch (c) {
1048
1.88k
  case 0x20:
1049
2.53k
  case 0xD:
1050
2.96k
  case 0xA:
1051
5.61k
  case 0x9:
1052
5.61k
    return 1;
1053
23.7k
  }
1054
18.1k
  return 0;
1055
23.7k
}
1056
1057
/* Return 1 if there's just optional white space or there's an S
1058
   followed by name=val.
1059
*/
1060
static int
1061
parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1062
                     const char **namePtr, const char **nameEndPtr,
1063
2.68k
                     const char **valPtr, const char **nextTokPtr) {
1064
2.68k
  int c;
1065
2.68k
  char open;
1066
2.68k
  if (ptr == end) {
1067
524
    *namePtr = NULL;
1068
524
    return 1;
1069
524
  }
1070
2.15k
  if (! isSpace(toAscii(enc, ptr, end))) {
1071
33
    *nextTokPtr = ptr;
1072
33
    return 0;
1073
33
  }
1074
3.10k
  do {
1075
3.10k
    ptr += enc->minBytesPerChar;
1076
3.10k
  } while (isSpace(toAscii(enc, ptr, end)));
1077
2.12k
  if (ptr == end) {
1078
42
    *namePtr = NULL;
1079
42
    return 1;
1080
42
  }
1081
2.08k
  *namePtr = ptr;
1082
16.0k
  for (;;) {
1083
16.0k
    c = toAscii(enc, ptr, end);
1084
16.0k
    if (c == -1) {
1085
108
      *nextTokPtr = ptr;
1086
108
      return 0;
1087
108
    }
1088
15.8k
    if (c == ASCII_EQUALS) {
1089
1.80k
      *nameEndPtr = ptr;
1090
1.80k
      break;
1091
1.80k
    }
1092
14.0k
    if (isSpace(c)) {
1093
166
      *nameEndPtr = ptr;
1094
902
      do {
1095
902
        ptr += enc->minBytesPerChar;
1096
902
      } while (isSpace(c = toAscii(enc, ptr, end)));
1097
166
      if (c != ASCII_EQUALS) {
1098
105
        *nextTokPtr = ptr;
1099
105
        return 0;
1100
105
      }
1101
61
      break;
1102
166
    }
1103
13.9k
    ptr += enc->minBytesPerChar;
1104
13.9k
  }
1105
1.87k
  if (ptr == *namePtr) {
1106
9
    *nextTokPtr = ptr;
1107
9
    return 0;
1108
9
  }
1109
1.86k
  ptr += enc->minBytesPerChar;
1110
1.86k
  c = toAscii(enc, ptr, end);
1111
3.26k
  while (isSpace(c)) {
1112
1.40k
    ptr += enc->minBytesPerChar;
1113
1.40k
    c = toAscii(enc, ptr, end);
1114
1.40k
  }
1115
1.86k
  if (c != ASCII_QUOT && c != ASCII_APOS) {
1116
77
    *nextTokPtr = ptr;
1117
77
    return 0;
1118
77
  }
1119
1.78k
  open = (char)c;
1120
1.78k
  ptr += enc->minBytesPerChar;
1121
1.78k
  *valPtr = ptr;
1122
17.0k
  for (;; ptr += enc->minBytesPerChar) {
1123
17.0k
    c = toAscii(enc, ptr, end);
1124
17.0k
    if (c == open)
1125
1.53k
      break;
1126
15.5k
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1127
15.5k
        && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1128
15.5k
        && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1129
254
      *nextTokPtr = ptr;
1130
254
      return 0;
1131
254
    }
1132
15.5k
  }
1133
1.53k
  *nextTokPtr = ptr + enc->minBytesPerChar;
1134
1.53k
  return 1;
1135
1.78k
}
1136
1137
static const char KW_version[]
1138
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1139
1140
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1141
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};
1142
1143
static const char KW_standalone[]
1144
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1145
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1146
1147
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1148
1149
static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1150
1151
static int
1152
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1153
                                                 const char *),
1154
               int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1155
               const char *end, const char **badPtr, const char **versionPtr,
1156
               const char **versionEndPtr, const char **encodingName,
1157
1.46k
               const ENCODING **encoding, int *standalone) {
1158
1.46k
  const char *val = NULL;
1159
1.46k
  const char *name = NULL;
1160
1.46k
  const char *nameEnd = NULL;
1161
1.46k
  ptr += 5 * enc->minBytesPerChar;
1162
1.46k
  end -= 2 * enc->minBytesPerChar;
1163
1.46k
  if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1164
1.46k
      || ! name) {
1165
542
    *badPtr = ptr;
1166
542
    return 0;
1167
542
  }
1168
920
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1169
260
    if (! isGeneralTextEntity) {
1170
98
      *badPtr = name;
1171
98
      return 0;
1172
98
    }
1173
660
  } else {
1174
660
    if (versionPtr)
1175
660
      *versionPtr = val;
1176
660
    if (versionEndPtr)
1177
660
      *versionEndPtr = ptr;
1178
660
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1179
46
      *badPtr = ptr;
1180
46
      return 0;
1181
46
    }
1182
614
    if (! name) {
1183
22
      if (isGeneralTextEntity) {
1184
        /* a TextDecl must have an EncodingDecl */
1185
14
        *badPtr = ptr;
1186
14
        return 0;
1187
14
      }
1188
8
      return 1;
1189
22
    }
1190
614
  }
1191
754
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1192
587
    int c = toAscii(enc, val, end);
1193
587
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1194
27
      *badPtr = val;
1195
27
      return 0;
1196
27
    }
1197
560
    if (encodingName)
1198
560
      *encodingName = val;
1199
560
    if (encoding)
1200
560
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1201
560
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1202
57
      *badPtr = ptr;
1203
57
      return 0;
1204
57
    }
1205
503
    if (! name)
1206
485
      return 1;
1207
503
  }
1208
185
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1209
185
      || isGeneralTextEntity) {
1210
115
    *badPtr = name;
1211
115
    return 0;
1212
115
  }
1213
70
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1214
54
    if (standalone)
1215
54
      *standalone = 1;
1216
54
  } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1217
13
    if (standalone)
1218
13
      *standalone = 0;
1219
13
  } else {
1220
3
    *badPtr = val;
1221
3
    return 0;
1222
3
  }
1223
266
  while (isSpace(toAscii(enc, ptr, end)))
1224
199
    ptr += enc->minBytesPerChar;
1225
67
  if (ptr != end) {
1226
7
    *badPtr = ptr;
1227
7
    return 0;
1228
7
  }
1229
60
  return 1;
1230
67
}
1231
1232
static int FASTCALL
1233
53.9k
checkCharRefNumber(int result) {
1234
53.9k
  switch (result >> 8) {
1235
18
  case 0xD8:
1236
36
  case 0xD9:
1237
56
  case 0xDA:
1238
74
  case 0xDB:
1239
92
  case 0xDC:
1240
110
  case 0xDD:
1241
128
  case 0xDE:
1242
146
  case 0xDF:
1243
146
    return -1;
1244
8.28k
  case 0:
1245
8.28k
    if (latin1_encoding.type[result] == BT_NONXML)
1246
216
      return -1;
1247
8.07k
    break;
1248
8.07k
  case 0xFF:
1249
1.29k
    if (result == 0xFFFE || result == 0xFFFF)
1250
39
      return -1;
1251
1.25k
    break;
1252
53.9k
  }
1253
53.5k
  return result;
1254
53.9k
}
1255
1256
int FASTCALL
1257
53.4k
XmlUtf8Encode(int c, char *buf) {
1258
53.4k
  enum {
1259
    /* minN is minimum legal resulting value for N byte sequence */
1260
53.4k
    min2 = 0x80,
1261
53.4k
    min3 = 0x800,
1262
53.4k
    min4 = 0x10000
1263
53.4k
  };
1264
1265
53.4k
  if (c < 0)
1266
0
    return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1267
53.4k
  if (c < min2) {
1268
4.15k
    buf[0] = (char)(c | UTF8_cval1);
1269
4.15k
    return 1;
1270
4.15k
  }
1271
49.2k
  if (c < min3) {
1272
14.3k
    buf[0] = (char)((c >> 6) | UTF8_cval2);
1273
14.3k
    buf[1] = (char)((c & 0x3f) | 0x80);
1274
14.3k
    return 2;
1275
14.3k
  }
1276
34.8k
  if (c < min4) {
1277
24.6k
    buf[0] = (char)((c >> 12) | UTF8_cval3);
1278
24.6k
    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1279
24.6k
    buf[2] = (char)((c & 0x3f) | 0x80);
1280
24.6k
    return 3;
1281
24.6k
  }
1282
10.1k
  if (c < 0x110000) {
1283
10.1k
    buf[0] = (char)((c >> 18) | UTF8_cval4);
1284
10.1k
    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1285
10.1k
    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1286
10.1k
    buf[3] = (char)((c & 0x3f) | 0x80);
1287
10.1k
    return 4;
1288
10.1k
  }
1289
0
  return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1290
10.1k
}
1291
1292
int FASTCALL
1293
0
XmlUtf16Encode(int charNum, unsigned short *buf) {
1294
0
  if (charNum < 0)
1295
0
    return 0;
1296
0
  if (charNum < 0x10000) {
1297
0
    buf[0] = (unsigned short)charNum;
1298
0
    return 1;
1299
0
  }
1300
0
  if (charNum < 0x110000) {
1301
0
    charNum -= 0x10000;
1302
0
    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1303
0
    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1304
0
    return 2;
1305
0
  }
1306
0
  return 0;
1307
0
}
1308
1309
struct unknown_encoding {
1310
  struct normal_encoding normal;
1311
  CONVERTER convert;
1312
  void *userData;
1313
  unsigned short utf16[256];
1314
  char utf8[256][4];
1315
};
1316
1317
0
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1318
1319
int
1320
0
XmlSizeOfUnknownEncoding(void) {
1321
0
  return sizeof(struct unknown_encoding);
1322
0
}
1323
1324
static int PTRFASTCALL
1325
0
unknown_isName(const ENCODING *enc, const char *p) {
1326
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1327
0
  int c = uenc->convert(uenc->userData, p);
1328
0
  if (c & ~0xFFFF)
1329
0
    return 0;
1330
0
  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1331
0
}
1332
1333
static int PTRFASTCALL
1334
0
unknown_isNmstrt(const ENCODING *enc, const char *p) {
1335
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1336
0
  int c = uenc->convert(uenc->userData, p);
1337
0
  if (c & ~0xFFFF)
1338
0
    return 0;
1339
0
  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1340
0
}
1341
1342
static int PTRFASTCALL
1343
0
unknown_isInvalid(const ENCODING *enc, const char *p) {
1344
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1345
0
  int c = uenc->convert(uenc->userData, p);
1346
0
  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1347
0
}
1348
1349
static enum XML_Convert_Result PTRCALL
1350
unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1351
0
               char **toP, const char *toLim) {
1352
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1353
0
  char buf[XML_UTF8_ENCODE_MAX];
1354
0
  for (;;) {
1355
0
    const char *utf8;
1356
0
    int n;
1357
0
    if (*fromP == fromLim)
1358
0
      return XML_CONVERT_COMPLETED;
1359
0
    utf8 = uenc->utf8[(unsigned char)**fromP];
1360
0
    n = *utf8++;
1361
0
    if (n == 0) {
1362
0
      int c = uenc->convert(uenc->userData, *fromP);
1363
0
      n = XmlUtf8Encode(c, buf);
1364
0
      if (n > toLim - *toP)
1365
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
1366
0
      utf8 = buf;
1367
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1368
0
                 - (BT_LEAD2 - 2));
1369
0
    } else {
1370
0
      if (n > toLim - *toP)
1371
0
        return XML_CONVERT_OUTPUT_EXHAUSTED;
1372
0
      (*fromP)++;
1373
0
    }
1374
0
    memcpy(*toP, utf8, n);
1375
0
    *toP += n;
1376
0
  }
1377
0
}
1378
1379
static enum XML_Convert_Result PTRCALL
1380
unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1381
0
                unsigned short **toP, const unsigned short *toLim) {
1382
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1383
0
  while (*fromP < fromLim && *toP < toLim) {
1384
0
    unsigned short c = uenc->utf16[(unsigned char)**fromP];
1385
0
    if (c == 0) {
1386
0
      c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1387
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1388
0
                 - (BT_LEAD2 - 2));
1389
0
    } else
1390
0
      (*fromP)++;
1391
0
    *(*toP)++ = c;
1392
0
  }
1393
1394
0
  if ((*toP == toLim) && (*fromP < fromLim))
1395
0
    return XML_CONVERT_OUTPUT_EXHAUSTED;
1396
0
  else
1397
0
    return XML_CONVERT_COMPLETED;
1398
0
}
1399
1400
ENCODING *
1401
XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1402
0
                       void *userData) {
1403
0
  int i;
1404
0
  struct unknown_encoding *e = (struct unknown_encoding *)mem;
1405
0
  memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1406
0
  for (i = 0; i < 128; i++)
1407
0
    if (latin1_encoding.type[i] != BT_OTHER
1408
0
        && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1409
0
      return 0;
1410
0
  for (i = 0; i < 256; i++) {
1411
0
    int c = table[i];
1412
0
    if (c == -1) {
1413
0
      e->normal.type[i] = BT_MALFORM;
1414
      /* This shouldn't really get used. */
1415
0
      e->utf16[i] = 0xFFFF;
1416
0
      e->utf8[i][0] = 1;
1417
0
      e->utf8[i][1] = 0;
1418
0
    } else if (c < 0) {
1419
0
      if (c < -4)
1420
0
        return 0;
1421
      /* Multi-byte sequences need a converter function */
1422
0
      if (! convert)
1423
0
        return 0;
1424
0
      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1425
0
      e->utf8[i][0] = 0;
1426
0
      e->utf16[i] = 0;
1427
0
    } else if (c < 0x80) {
1428
0
      if (latin1_encoding.type[c] != BT_OTHER
1429
0
          && latin1_encoding.type[c] != BT_NONXML && c != i)
1430
0
        return 0;
1431
0
      e->normal.type[i] = latin1_encoding.type[c];
1432
0
      e->utf8[i][0] = 1;
1433
0
      e->utf8[i][1] = (char)c;
1434
0
      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1435
0
    } else if (checkCharRefNumber(c) < 0) {
1436
0
      e->normal.type[i] = BT_NONXML;
1437
      /* This shouldn't really get used. */
1438
0
      e->utf16[i] = 0xFFFF;
1439
0
      e->utf8[i][0] = 1;
1440
0
      e->utf8[i][1] = 0;
1441
0
    } else {
1442
0
      if (c > 0xFFFF)
1443
0
        return 0;
1444
0
      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1445
0
        e->normal.type[i] = BT_NMSTRT;
1446
0
      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1447
0
        e->normal.type[i] = BT_NAME;
1448
0
      else
1449
0
        e->normal.type[i] = BT_OTHER;
1450
0
      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1451
0
      e->utf16[i] = (unsigned short)c;
1452
0
    }
1453
0
  }
1454
0
  e->userData = userData;
1455
0
  e->convert = convert;
1456
0
  if (convert) {
1457
0
    e->normal.isName2 = unknown_isName;
1458
0
    e->normal.isName3 = unknown_isName;
1459
0
    e->normal.isName4 = unknown_isName;
1460
0
    e->normal.isNmstrt2 = unknown_isNmstrt;
1461
0
    e->normal.isNmstrt3 = unknown_isNmstrt;
1462
0
    e->normal.isNmstrt4 = unknown_isNmstrt;
1463
0
    e->normal.isInvalid2 = unknown_isInvalid;
1464
0
    e->normal.isInvalid3 = unknown_isInvalid;
1465
0
    e->normal.isInvalid4 = unknown_isInvalid;
1466
0
  }
1467
0
  e->normal.enc.utf8Convert = unknown_toUtf8;
1468
0
  e->normal.enc.utf16Convert = unknown_toUtf16;
1469
0
  return &(e->normal.enc);
1470
0
}
1471
1472
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed. */
enum {
  UNKNOWN_ENC = -1, /* name not recognized */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC = 1,
  UTF_8_ENC = 2,
  UTF_16_ENC = 3,
  UTF_16BE_ENC = 4,
  UTF_16LE_ENC = 5,
  /* must match encodingNames up to here */
  NO_ENC = 6 /* no encoding name was given */
};
1485
1486
static const char KW_ISO_8859_1[]
1487
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
1488
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
1489
static const char KW_US_ASCII[]
1490
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1491
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
1492
static const char KW_UTF_8[]
1493
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1494
static const char KW_UTF_16[]
1495
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1496
static const char KW_UTF_16BE[]
1497
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1498
       ASCII_6, ASCII_B, ASCII_E, '\0'};
1499
static const char KW_UTF_16LE[]
1500
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1501
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1502
1503
static int FASTCALL
1504
189k
getEncodingIndex(const char *name) {
1505
189k
  static const char *const encodingNames[] = {
1506
189k
      KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1507
189k
  };
1508
189k
  int i;
1509
189k
  if (name == NULL)
1510
167k
    return NO_ENC;
1511
129k
  for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1512
129k
    if (streqci(name, encodingNames[i]))
1513
21.7k
      return i;
1514
140
  return UNKNOWN_ENC;
1515
21.9k
}
1516
1517
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i, narrowed to char.  The
   argument is parenthesized so that the cast applies to the whole
   expression handed in, not just to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1523
1524
/* This is what detects the encoding.  encodingTable maps from
1525
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1526
   the external (protocol) specified encoding; state is
1527
   XML_CONTENT_STATE if we're parsing an external text entity, and
1528
   XML_PROLOG_STATE otherwise.
1529
*/
1530
1531
static int
1532
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1533
85.6k
         int state, const char *ptr, const char *end, const char **nextTokPtr) {
1534
85.6k
  const ENCODING **encPtr;
1535
1536
85.6k
  if (ptr >= end)
1537
0
    return XML_TOK_NONE;
1538
85.6k
  encPtr = enc->encPtr;
1539
85.6k
  if (ptr + 1 == end) {
1540
    /* only a single byte available for auto-detection */
1541
#ifndef XML_DTD /* FIXME */
1542
    /* a well-formed document entity must have more than one byte */
1543
    if (state != XML_CONTENT_STATE)
1544
      return XML_TOK_PARTIAL;
1545
#endif
1546
    /* so we're parsing an external text entity... */
1547
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
1548
220
    switch (INIT_ENC_INDEX(enc)) {
1549
0
    case UTF_16_ENC:
1550
55
    case UTF_16LE_ENC:
1551
55
    case UTF_16BE_ENC:
1552
55
      return XML_TOK_PARTIAL;
1553
220
    }
1554
165
    switch ((unsigned char)*ptr) {
1555
3
    case 0xFE:
1556
6
    case 0xFF:
1557
9
    case 0xEF: /* possibly first byte of UTF-8 BOM */
1558
9
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1559
0
        break;
1560
      /* fall through */
1561
12
    case 0x00:
1562
15
    case 0x3C:
1563
15
      return XML_TOK_PARTIAL;
1564
165
    }
1565
85.4k
  } else {
1566
85.4k
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1567
616
    case 0xFEFF:
1568
616
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1569
0
        break;
1570
616
      *nextTokPtr = ptr + 2;
1571
616
      *encPtr = encodingTable[UTF_16BE_ENC];
1572
616
      return XML_TOK_BOM;
1573
    /* 00 3C is handled in the default case */
1574
16.9k
    case 0x3C00:
1575
16.9k
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1576
16.9k
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1577
16.9k
          && state == XML_CONTENT_STATE)
1578
0
        break;
1579
16.9k
      *encPtr = encodingTable[UTF_16LE_ENC];
1580
16.9k
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1581
3.86k
    case 0xFFFE:
1582
3.86k
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1583
0
        break;
1584
3.86k
      *nextTokPtr = ptr + 2;
1585
3.86k
      *encPtr = encodingTable[UTF_16LE_ENC];
1586
3.86k
      return XML_TOK_BOM;
1587
9.98k
    case 0xEFBB:
1588
      /* Maybe a UTF-8 BOM (EF BB BF) */
1589
      /* If there's an explicitly specified (external) encoding
1590
         of ISO-8859-1 or some flavour of UTF-16
1591
         and this is an external text entity,
1592
         don't look for the BOM,
1593
         because it might be a legal data.
1594
      */
1595
9.98k
      if (state == XML_CONTENT_STATE) {
1596
2.49k
        int e = INIT_ENC_INDEX(enc);
1597
2.49k
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1598
2.49k
            || e == UTF_16_ENC)
1599
0
          break;
1600
2.49k
      }
1601
9.98k
      if (ptr + 2 == end)
1602
4
        return XML_TOK_PARTIAL;
1603
9.98k
      if ((unsigned char)ptr[2] == 0xBF) {
1604
9.88k
        *nextTokPtr = ptr + 3;
1605
9.88k
        *encPtr = encodingTable[UTF_8_ENC];
1606
9.88k
        return XML_TOK_BOM;
1607
9.88k
      }
1608
92
      break;
1609
54.0k
    default:
1610
54.0k
      if (ptr[0] == '\0') {
1611
        /* 0 isn't a legal data character. Furthermore a document
1612
           entity can only start with ASCII characters.  So the only
1613
           way this can fail to be big-endian UTF-16 if it it's an
1614
           external parsed general entity that's labelled as
1615
           UTF-16LE.
1616
        */
1617
26.9k
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1618
0
          break;
1619
26.9k
        *encPtr = encodingTable[UTF_16BE_ENC];
1620
26.9k
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1621
27.1k
      } else if (ptr[1] == '\0') {
1622
        /* We could recover here in the case:
1623
            - parsing an external entity
1624
            - second byte is 0
1625
            - no externally specified encoding
1626
            - no encoding declaration
1627
           by assuming UTF-16LE.  But we don't, because this would mean when
1628
           presented just with a single byte, we couldn't reliably determine
1629
           whether we needed further bytes.
1630
        */
1631
5.14k
        if (state == XML_CONTENT_STATE)
1632
1.28k
          break;
1633
3.85k
        *encPtr = encodingTable[UTF_16LE_ENC];
1634
3.85k
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1635
5.14k
      }
1636
21.9k
      break;
1637
85.4k
    }
1638
85.4k
  }
1639
23.5k
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1640
23.5k
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1641
85.6k
}
1642
1643
400k
/* First expansion of "xmltok_ns.c": NS() and ns() are identity macros
   here, so the names wrapped in them are emitted unchanged. */
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns
1650
1651
#ifdef XML_NS
1652
1653
64.7k
/* Second expansion of "xmltok_ns.c": NS() appends "NS" and ns()
   appends "_ns" to every name wrapped in them. */
#  define NS(x) x##NS
#  define ns(x) x##_ns

#  define XML_TOK_NS_C
#  include "xmltok_ns.c"
#  undef XML_TOK_NS_C

#  undef NS
#  undef ns
1662
1663
ENCODING *
1664
XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1665
0
                         void *userData) {
1666
0
  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1667
0
  if (enc)
1668
0
    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1669
0
  return enc;
1670
0
}
1671
1672
#endif /* XML_NS */