Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/parser/expat/lib/xmltok.c
Line
Count
Source (jump to first uncovered line)
1
/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2
   See the file COPYING for copying permission.
3
*/
4
5
#include <stddef.h>
6
7
#ifdef COMPILED_FROM_DSP
8
#include "winconfig.h"
9
#elif defined(MACOS_CLASSIC)
10
#include "macconfig.h"
11
#elif defined(__amigaos4__)
12
#include "amigaconfig.h"
13
#else
14
#ifdef HAVE_EXPAT_CONFIG_H
15
#include <expat_config.h>
16
#endif
17
#endif /* ndef COMPILED_FROM_DSP */
18
19
#include "expat_external.h"
20
#include "internal.h"
21
#include "xmltok.h"
22
#include "nametab.h"
23
24
/* Optional extra initializer that VTABLE1 places right after
   PREFIX(cdataSectionTok): a leading comma plus PREFIX(ignoreSectionTok)
   when XML_DTD is defined, nothing at all otherwise. */
#ifdef XML_DTD
25
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
26
#else
27
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
28
#endif
29
30
/* Leading part of the initializer used for the `enc` member of every
   struct normal_encoding below.  Each entry is a function named through
   PREFIX(), so the expansion depends on how PREFIX is defined at the point
   of use.  The first brace group holds the tokenizer functions, the second
   the two literal-value tokenizers, followed by the remaining helpers. */
#define VTABLE1 \
31
  { PREFIX(prologTok), PREFIX(contentTok), \
32
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
33
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
34
  PREFIX(sameName), \
35
  PREFIX(nameMatchesAscii), \
36
  PREFIX(nameLength), \
37
  PREFIX(skipS), \
38
  PREFIX(getAtts), \
39
  PREFIX(charRefNumber), \
40
  PREFIX(predefinedEntityName), \
41
  PREFIX(updatePosition), \
42
  PREFIX(isPublicId)
43
44
/* VTABLE1 followed by the two PREFIX()-named conversion functions. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
45
46
/* Test the naming-bitmap bit for the 16-bit character whose high byte is
   `hi` and low byte is `lo`.  pages[hi] selects a group of eight bitmap
   words, the top three bits of `lo` select the word inside that group and
   the low five bits of `lo` select the bit.  Evaluates to non-zero when
   the bit is set.
   The mask is built from an unsigned 1: with a plain int, selecting bit 31
   would shift into the sign bit, which is undefined behaviour in C. */
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
48
49
/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
   `byte` must point at unsigned bytes.  The mask uses an unsigned 1 so
   that selecting bit 31 never shifts into the sign bit of an int
   (undefined behaviour).
*/
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))
58
59
/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.  `byte` must point at unsigned bytes.  The mask uses an unsigned
   1 so that selecting bit 31 never shifts into the sign bit of an int
   (undefined behaviour).
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))
71
72
/* Length-dispatching wrapper: for n == 2 or n == 3 it reinterprets p as
   unsigned bytes and defers to UTF8_GET_NAMING2 / UTF8_GET_NAMING3; for
   any other n it evaluates to 0. */
#define UTF8_GET_NAMING(pages, p, n) \
73
  ((n) == 2 \
74
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
75
  : ((n) == 3 \
76
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
77
     : 0))
78
79
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF
*/

/* Non-zero when the two bytes at p (unsigned bytes) are not a valid
   2-byte sequence: the lead byte is below 0xC2 or the second byte is not
   in 0x80..0xBF.  The argument is fully parenthesized so that pointer
   expressions such as `buf + 1` are dereferenced as a whole. */
#define UTF8_INVALID2(p) \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
91
92
/* Non-zero when the three bytes at p (unsigned bytes) are not an accepted
   3-byte sequence.  Rejected are: a third byte outside 0x80..0xBF, the
   sequences EF,BF,BE and EF,BF,BF, a second byte below 0xA0 after E0,
   a second byte above 0x9F after ED, and any other second byte outside
   0x80..0xBF.  The argument is fully parenthesized so that pointer
   expressions such as `buf + 1` are dereferenced as a whole. */
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
  || \
  ((*(p)) == 0xEF && (p)[1] == 0xBF \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
  || \
  ((*(p)) == 0xE0 \
    ? \
    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*(p)) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
108
109
/* Non-zero when the four bytes at p (unsigned bytes) are not an accepted
   4-byte sequence.  Rejected are: a third or fourth byte outside
   0x80..0xBF, a second byte below 0x90 after F0, a second byte above 0x8F
   after F4, and any other second byte outside 0x80..0xBF.  The argument is
   fully parenthesized so that pointer expressions such as `buf + 1` are
   dereferenced as a whole. */
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  || \
  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  || \
  ((*(p)) == 0xF0 \
    ? \
    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*(p)) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
121
122
static int PTRFASTCALL
123
isNever(const ENCODING *enc, const char *p)
124
0
{
125
0
  return 0;
126
0
}
127
128
static int PTRFASTCALL
129
utf8_isName2(const ENCODING *enc, const char *p)
130
0
{
131
0
  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
132
0
}
133
134
static int PTRFASTCALL
135
utf8_isName3(const ENCODING *enc, const char *p)
136
0
{
137
0
  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
138
0
}
139
140
/* The 4-byte name check is isNever, i.e. it always answers 0. */
#define utf8_isName4 isNever
141
142
static int PTRFASTCALL
143
utf8_isNmstrt2(const ENCODING *enc, const char *p)
144
0
{
145
0
  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
146
0
}
147
148
static int PTRFASTCALL
149
utf8_isNmstrt3(const ENCODING *enc, const char *p)
150
0
{
151
0
  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
152
0
}
153
154
/* The 4-byte name-start check is isNever, i.e. it always answers 0. */
#define utf8_isNmstrt4 isNever
155
156
static int PTRFASTCALL
157
utf8_isInvalid2(const ENCODING *enc, const char *p)
158
0
{
159
0
  return UTF8_INVALID2((const unsigned char *)p);
160
0
}
161
162
static int PTRFASTCALL
163
utf8_isInvalid3(const ENCODING *enc, const char *p)
164
0
{
165
0
  return UTF8_INVALID3((const unsigned char *)p);
166
0
}
167
168
static int PTRFASTCALL
169
utf8_isInvalid4(const ENCODING *enc, const char *p)
170
0
{
171
0
  return UTF8_INVALID4((const unsigned char *)p);
172
0
}
173
174
/* An ENCODING extended with a per-byte classification table and the
   per-encoding character predicates.  `enc` is the first member, which is
   what lets the code in this file cast an ENCODING pointer to a
   struct normal_encoding pointer. */
struct normal_encoding {
175
  ENCODING enc;
176
  /* byte type (BT_* value) for each possible byte value */
  unsigned char type[256];
177
#ifdef XML_MIN_SIZE
178
  /* Extra indirections present only in XML_MIN_SIZE builds; filled in by
     STANDARD_VTABLE(). */
  int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
179
  int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
180
  int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
181
  int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
182
  int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
183
#endif /* XML_MIN_SIZE */
184
  /* Name, name-start and invalid-sequence checks for 2-, 3- and 4-byte
     characters; filled in by NORMAL_VTABLE(). */
  int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
185
  int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
186
  int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
187
  int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
188
  int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
189
  int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
190
  int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
191
  int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
192
  int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
193
};
194
195
0
/* Reinterpret an ENCODING pointer as a pointer to const
   struct normal_encoding. */
#define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))
196
197
/* Initializers for the XML_MIN_SIZE-only members of struct
   normal_encoding, built by pasting the prefix E onto each member name.
   Note the trailing comma: the expansion is meant to be followed by more
   initializers.  Outside XML_MIN_SIZE builds it expands to nothing. */
#ifdef XML_MIN_SIZE
198
199
#define STANDARD_VTABLE(E) \
200
 E ## byteType, \
201
 E ## isNameMin, \
202
 E ## isNmstrtMin, \
203
 E ## byteToAscii, \
204
 E ## charMatches,
205
206
#else
207
208
#define STANDARD_VTABLE(E) /* as nothing */
209
210
#endif
211
212
/* Initializers for the nine multi-byte predicate members of struct
   normal_encoding (isName2 .. isInvalid4), built by pasting the prefix E
   onto each member name, in member order. */
#define NORMAL_VTABLE(E) \
213
 E ## isName2, \
214
 E ## isName3, \
215
 E ## isName4, \
216
 E ## isNmstrt2, \
217
 E ## isNmstrt3, \
218
 E ## isNmstrt4, \
219
 E ## isInvalid2, \
220
 E ## isInvalid3, \
221
 E ## isInvalid4
222
223
static int FASTCALL checkCharRefNumber(int);
224
225
#include "xmltok_impl.h"
226
#include "ascii.h"
227
228
/* In XML_MIN_SIZE builds the sb_ ("single byte") prefix maps its
   isNameMin / isNmstrtMin slots to isNever. */
#ifdef XML_MIN_SIZE
229
#define sb_isNameMin isNever
230
#define sb_isNmstrtMin isNever
231
#endif
232
233
/* MINBPC for the normal_ instantiation: read from the encoding in
   XML_MIN_SIZE builds, the constant 1 otherwise. */
#ifdef XML_MIN_SIZE
234
#define MINBPC(enc) ((enc)->minBytesPerChar)
235
#else
236
/* minimum bytes per character */
237
0
#define MINBPC(enc) 1
238
#endif
239
240
/* Classify the byte at p by looking it up, as an unsigned value, in the
   encoding's 256-entry type table.  NOTE(review): the cast to
   struct normal_encoding * drops the const that AS_NORMAL_ENCODING keeps. */
#define SB_BYTE_TYPE(enc, p) \
241
0
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
242
243
/* BYTE_TYPE for the normal_ instantiation: an indirect call through the
   byteType member in XML_MIN_SIZE builds (sb_byteType is the function
   form of SB_BYTE_TYPE), a direct table lookup otherwise. */
#ifdef XML_MIN_SIZE
244
static int PTRFASTCALL
245
sb_byteType(const ENCODING *enc, const char *p)
246
{
247
  return SB_BYTE_TYPE(enc, p);
248
}
249
#define BYTE_TYPE(enc, p) \
250
 (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
251
#else
252
0
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
253
#endif
254
255
/* BYTE_TO_ASCII for the normal_ instantiation: the byte at p itself.
   XML_MIN_SIZE builds go through the byteToAscii member, for which
   sb_byteToAscii is the single-byte implementation. */
#ifdef XML_MIN_SIZE
256
#define BYTE_TO_ASCII(enc, p) \
257
 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
258
static int PTRFASTCALL
259
sb_byteToAscii(const ENCODING *enc, const char *p)
260
{
261
  /* plain char is returned as int; enc is not used */
  return *p;
262
}
263
#else
264
0
#define BYTE_TO_ASCII(enc, p) (*(p))
265
#endif
266
267
/* Multi-byte character checks for the normal_ instantiation.  The length
   n is pasted onto the member name, so e.g. IS_NAME_CHAR(enc, p, 2) calls
   the isName2 member of the encoding. */
#define IS_NAME_CHAR(enc, p, n) \
268
0
 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
269
#define IS_NMSTRT_CHAR(enc, p, n) \
270
0
 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
271
#define IS_INVALID_CHAR(enc, p, n) \
272
0
 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
273
274
/* Checks for a character of exactly MINBPC bytes: dispatched through the
   isNameMin / isNmstrtMin members in XML_MIN_SIZE builds, the constant 0
   otherwise. */
#ifdef XML_MIN_SIZE
275
#define IS_NAME_CHAR_MINBPC(enc, p) \
276
 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
277
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
278
 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
279
#else
280
0
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
281
0
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
282
#endif
283
284
/* CHAR_MATCHES for the normal_ instantiation: true when the byte at p
   equals the ASCII character c.  XML_MIN_SIZE builds dispatch through the
   charMatches member, for which sb_charMatches is the single-byte
   implementation.  In the macro form the argument c is parenthesized so
   that an expression with lower precedence than == (for example a
   conditional expression) is compared as a whole. */
#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
296
297
0
/* Instantiate the shared implementation in xmltok_impl.c with every
   function name prefixed by normal_, then drop the configuration macros
   defined above so a later instantiation can redefine them. */
#define PREFIX(ident) normal_ ## ident
298
#include "xmltok_impl.c"
299
300
#undef MINBPC
301
#undef BYTE_TYPE
302
#undef BYTE_TO_ASCII
303
#undef CHAR_MATCHES
304
#undef IS_NAME_CHAR
305
#undef IS_NAME_CHAR_MINBPC
306
#undef IS_NMSTRT_CHAR
307
#undef IS_NMSTRT_CHAR_MINBPC
308
#undef IS_INVALID_CHAR
309
310
/* Marker bits for the first byte of a UTF-8 sequence; the converters in
   this file OR them into the lead byte they emit. */
enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
311
  UTF8_cval1 = 0x00,
312
  UTF8_cval2 = 0xc0,
313
  UTF8_cval3 = 0xe0,
314
  UTF8_cval4 = 0xf0
315
};
316
317
static void PTRCALL
318
utf8_toUtf8(const ENCODING *enc,
319
            const char **fromP, const char *fromLim,
320
            char **toP, const char *toLim)
321
0
{
322
0
  char *to;
323
0
  const char *from;
324
0
  if (fromLim - *fromP > toLim - *toP) {
325
0
    /* Avoid copying partial characters. */
326
0
    for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
327
0
      if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
328
0
        break;
329
0
  }
330
0
  for (to = *toP, from = *fromP; from != fromLim; from++, to++)
331
0
    *to = *from;
332
0
  *fromP = from;
333
0
  *toP = to;
334
0
}
335
336
static void PTRCALL
337
utf8_toUtf16(const ENCODING *enc,
338
             const char **fromP, const char *fromLim,
339
             unsigned short **toP, const unsigned short *toLim)
340
0
{
341
0
  unsigned short *to = *toP;
342
0
  const char *from = *fromP;
343
0
  while (from != fromLim && to != toLim) {
344
0
    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
345
0
    case BT_LEAD2:
346
0
      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
347
0
      from += 2;
348
0
      break;
349
0
    case BT_LEAD3:
350
0
      *to++ = (unsigned short)(((from[0] & 0xf) << 12)
351
0
                               | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
352
0
      from += 3;
353
0
      break;
354
0
    case BT_LEAD4:
355
0
      {
356
0
        unsigned long n;
357
0
        if (to + 1 == toLim)
358
0
          goto after;
359
0
        n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
360
0
            | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
361
0
        n -= 0x10000;
362
0
        to[0] = (unsigned short)((n >> 10) | 0xD800);
363
0
        to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
364
0
        to += 2;
365
0
        from += 4;
366
0
      }
367
0
      break;
368
0
    default:
369
0
      *to++ = *from++;
370
0
      break;
371
0
    }
372
0
  }
373
0
after:
374
0
  *fromP = from;
375
0
  *toP = to;
376
0
}
377
378
/* The UTF-8 encoding objects.  Each one initializes the ENCODING member
   with VTABLE1 plus the two UTF-8 converters and three integers
   (presumably minBytesPerChar, isUtf8 and isUtf16 -- TODO confirm against
   the ENCODING declaration), fills the byte type table from two included
   table headers, and then sets the predicate members.
   The *_ns variants exist only when XML_NS is defined.  The other
   variants define BT_COLON as BT_NMSTRT while the ASCII table is
   included, so any BT_COLON entry in that table becomes BT_NMSTRT.
   The internal_* variants take their ASCII part from iasciitab.h instead
   of asciitab.h. */
#ifdef XML_NS
379
static const struct normal_encoding utf8_encoding_ns = {
380
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
381
  {
382
#include "asciitab.h"
383
#include "utf8tab.h"
384
  },
385
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
386
};
387
#endif
388
389
static const struct normal_encoding utf8_encoding = {
390
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
391
  {
392
#define BT_COLON BT_NMSTRT
393
#include "asciitab.h"
394
#undef BT_COLON
395
#include "utf8tab.h"
396
  },
397
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
398
};
399
400
#ifdef XML_NS
401
402
static const struct normal_encoding internal_utf8_encoding_ns = {
403
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
404
  {
405
#include "iasciitab.h"
406
#include "utf8tab.h"
407
  },
408
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
409
};
410
411
#endif
412
413
static const struct normal_encoding internal_utf8_encoding = {
414
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
415
  {
416
#define BT_COLON BT_NMSTRT
417
#include "iasciitab.h"
418
#undef BT_COLON
419
#include "utf8tab.h"
420
  },
421
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
422
};
423
424
static void PTRCALL
425
latin1_toUtf8(const ENCODING *enc,
426
              const char **fromP, const char *fromLim,
427
              char **toP, const char *toLim)
428
0
{
429
0
  for (;;) {
430
0
    unsigned char c;
431
0
    if (*fromP == fromLim)
432
0
      break;
433
0
    c = (unsigned char)**fromP;
434
0
    if (c & 0x80) {
435
0
      if (toLim - *toP < 2)
436
0
        break;
437
0
      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
438
0
      *(*toP)++ = (char)((c & 0x3f) | 0x80);
439
0
      (*fromP)++;
440
0
    }
441
0
    else {
442
0
      if (*toP == toLim)
443
0
        break;
444
0
      *(*toP)++ = *(*fromP)++;
445
0
    }
446
0
  }
447
0
}
448
449
static void PTRCALL
450
latin1_toUtf16(const ENCODING *enc,
451
               const char **fromP, const char *fromLim,
452
               unsigned short **toP, const unsigned short *toLim)
453
0
{
454
0
  while (*fromP != fromLim && *toP != toLim)
455
0
    *(*toP)++ = (unsigned char)*(*fromP)++;
456
0
}
457
458
/* The Latin-1 encoding objects: VTABLE1 plus the Latin-1 converters, a
   byte type table made of asciitab.h followed by latin1tab.h, and only
   the STANDARD_VTABLE members (the multi-byte predicate members are left
   without an explicit initializer).  The _ns variant exists only when
   XML_NS is defined; the other variant includes the ASCII table with
   BT_COLON defined as BT_NMSTRT. */
#ifdef XML_NS
459
460
static const struct normal_encoding latin1_encoding_ns = {
461
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
462
  {
463
#include "asciitab.h"
464
#include "latin1tab.h"
465
  },
466
  STANDARD_VTABLE(sb_)
467
};
468
469
#endif
470
471
static const struct normal_encoding latin1_encoding = {
472
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
473
  {
474
#define BT_COLON BT_NMSTRT
475
#include "asciitab.h"
476
#undef BT_COLON
477
#include "latin1tab.h"
478
  },
479
  STANDARD_VTABLE(sb_)
480
};
481
482
static void PTRCALL
483
ascii_toUtf8(const ENCODING *enc,
484
             const char **fromP, const char *fromLim,
485
             char **toP, const char *toLim)
486
0
{
487
0
  while (*fromP != fromLim && *toP != toLim)
488
0
    *(*toP)++ = *(*fromP)++;
489
0
}
490
491
/* The US-ASCII encoding objects: VTABLE1 with ascii_toUtf8 and the
   Latin-1 to UTF-16 converter.  Only the ASCII half of the byte type
   table is given; the remaining entries are left zero, which the inline
   comment equates with BT_NONXML.  The _ns variant exists only when
   XML_NS is defined; the other variant includes the ASCII table with
   BT_COLON defined as BT_NMSTRT. */
#ifdef XML_NS
492
493
static const struct normal_encoding ascii_encoding_ns = {
494
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
495
  {
496
#include "asciitab.h"
497
/* BT_NONXML == 0 */
498
  },
499
  STANDARD_VTABLE(sb_)
500
};
501
502
#endif
503
504
static const struct normal_encoding ascii_encoding = {
505
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
506
  {
507
#define BT_COLON BT_NMSTRT
508
#include "asciitab.h"
509
#undef BT_COLON
510
/* BT_NONXML == 0 */
511
  },
512
  STANDARD_VTABLE(sb_)
513
};
514
515
static int PTRFASTCALL
516
unicode_byte_type(char hi, char lo)
517
0
{
518
0
  switch ((unsigned char)hi) {
519
0
  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
520
0
    return BT_LEAD4;
521
0
  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
522
0
    return BT_TRAIL;
523
0
  case 0xFF:
524
0
    switch ((unsigned char)lo) {
525
0
    case 0xFF:
526
0
    case 0xFE:
527
0
      return BT_NONXML;
528
0
    }
529
0
    break;
530
0
  }
531
0
  return BT_NONASCII;
532
0
}
533
534
/* Define E##toUtf8, which converts 2-byte units from [*fromP, fromLim)
   to UTF-8 in [*toP, toLim).  GET_LO and GET_HI must be defined at the
   point of expansion and select the byte order.  On return *fromP points
   at the first unit that was not converted.

   Conversion stops early when the next character does not fit in the
   output.  Two input checks were missing before and are now made:
   - an odd trailing byte is ignored, so the loop ends exactly at fromLim
     instead of stepping over it;
   - a first surrogate (high byte 0xD8..0xDB) is converted only when the
     following unit is also inside the input, instead of reading it from
     beyond fromLim. */
#define DEFINE_UTF16_TO_UTF8(E) \
static void  PTRCALL \
E ## toUtf8(const ENCODING *enc, \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *from = *fromP; \
  /* shrink the input to a whole number of 2-byte units */ \
  fromLim = from + (((fromLim - from) >> 1) << 1); \
  for (; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim -  *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      /* the second surrogate must lie inside the input as well */ \
      if (fromLim - from < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
Unexecuted instantiation: xmltok.c:little2_toUtf8
Unexecuted instantiation: xmltok.c:big2_toUtf8
596
597
/* Define E##toUtf16, which copies 2-byte units from [*fromP, fromLim)
   into 16-bit units in [*toP, toLim), assembling each unit from GET_HI
   and GET_LO (which must be defined at the point of expansion).  *fromP
   and *toP are advanced past what was copied.

   An odd trailing input byte is now ignored.  Before, an odd byte count
   let the loop step over fromLim, and a 1-byte input could make the
   surrogate check read before the start of the input.

   NOTE(review): the surrogate check looks only at the last unit of the
   input, not at the unit where the output runs out -- confirm that this
   is what callers rely on. */
#define DEFINE_UTF16_TO_UTF16(E) \
static void  PTRCALL \
E ## toUtf16(const ENCODING *enc, \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  /* shrink the input to a whole number of 2-byte units */ \
  fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
Unexecuted instantiation: xmltok.c:little2_toUtf16
Unexecuted instantiation: xmltok.c:big2_toUtf16
610
611
/* Little-endian byte access (low byte first), used to instantiate
   little2_toUtf8 and little2_toUtf16; the helper macros are removed
   again afterwards. */
#define SET2(ptr, ch) \
612
  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
613
0
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
614
0
#define GET_HI(ptr) ((unsigned char)(ptr)[1])
615
616
DEFINE_UTF16_TO_UTF8(little2_)
617
DEFINE_UTF16_TO_UTF16(little2_)
618
619
#undef SET2
620
#undef GET_LO
621
#undef GET_HI
622
623
/* Big-endian byte access (high byte first), used to instantiate
   big2_toUtf8 and big2_toUtf16; the helper macros are removed again
   afterwards. */
#define SET2(ptr, ch) \
624
  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
625
0
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
626
0
#define GET_HI(ptr) ((unsigned char)(ptr)[0])
627
628
DEFINE_UTF16_TO_UTF8(big2_)
629
DEFINE_UTF16_TO_UTF16(big2_)
630
631
#undef SET2
632
#undef GET_LO
633
#undef GET_HI
634
635
/* Character access for little-endian 2-byte text: p[0] is the low byte
   and p[1] the high byte.  Every use of an argument is parenthesized so
   that pointer expressions (p + 2) and low-precedence expressions for c
   expand correctly. */

/* Byte type of the unit at p: a table lookup on the low byte when the
   high byte is 0, unicode_byte_type() otherwise. */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is 0, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* True when the unit at p is the ASCII character c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Naming-bitmap lookups for the unit at p. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
645
646
/* Little-endian 2-byte tokenizer.
   XML_MIN_SIZE builds: no second copy of xmltok_impl.c is compiled.
   Instead the LITTLE2_* macros are wrapped in functions that the
   encoding tables reference through STANDARD_VTABLE(little2_), and
   VTABLE is redefined to use the little2_ converters.
   Other builds: xmltok_impl.c is instantiated again with the little2_
   prefix and a fixed MINBPC of 2. */
#ifdef XML_MIN_SIZE
647
648
static int PTRFASTCALL
649
little2_byteType(const ENCODING *enc, const char *p)
650
{
651
  return LITTLE2_BYTE_TYPE(enc, p);
652
}
653
654
static int PTRFASTCALL
655
little2_byteToAscii(const ENCODING *enc, const char *p)
656
{
657
  return LITTLE2_BYTE_TO_ASCII(enc, p);
658
}
659
660
static int PTRCALL
661
little2_charMatches(const ENCODING *enc, const char *p, int c)
662
{
663
  return LITTLE2_CHAR_MATCHES(enc, p, c);
664
}
665
666
static int PTRFASTCALL
667
little2_isNameMin(const ENCODING *enc, const char *p)
668
{
669
  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
670
}
671
672
static int PTRFASTCALL
673
little2_isNmstrtMin(const ENCODING *enc, const char *p)
674
{
675
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
676
}
677
678
#undef VTABLE
679
#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
680
681
#else /* not XML_MIN_SIZE */
682
683
#undef PREFIX
684
0
#define PREFIX(ident) little2_ ## ident
685
0
#define MINBPC(enc) 2
686
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
687
0
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
688
0
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
689
0
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
690
0
/* characters longer than MINBPC are never name or name-start characters */
#define IS_NAME_CHAR(enc, p, n) 0
691
0
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
692
0
#define IS_NMSTRT_CHAR(enc, p, n) (0)
693
0
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
694
695
#include "xmltok_impl.c"
696
697
#undef MINBPC
698
#undef BYTE_TYPE
699
#undef BYTE_TO_ASCII
700
#undef CHAR_MATCHES
701
#undef IS_NAME_CHAR
702
#undef IS_NAME_CHAR_MINBPC
703
#undef IS_NMSTRT_CHAR
704
#undef IS_NMSTRT_CHAR_MINBPC
705
#undef IS_INVALID_CHAR
706
707
#endif /* not XML_MIN_SIZE */
708
709
/* The little-endian 2-byte encoding objects.  The ENCODING member is
   VTABLE followed by 2, 0 and a last value that is 1 when BYTEORDER is
   1234 and 0 otherwise (presumably minBytesPerChar, isUtf8 and isUtf16
   -- TODO confirm against the ENCODING declaration).  The byte type
   table is asciitab.h followed by latin1tab.h.
   The *_ns variants exist only when XML_NS is defined; the others
   include the ASCII table with BT_COLON defined as BT_NMSTRT.
   The internal_* variants are compiled unless BYTEORDER is 4321, use
   iasciitab.h, and always have 1 as the last ENCODING value. */
#ifdef XML_NS
710
711
static const struct normal_encoding little2_encoding_ns = {
712
  { VTABLE, 2, 0,
713
#if BYTEORDER == 1234
714
    1
715
#else
716
    0
717
#endif
718
  },
719
  {
720
#include "asciitab.h"
721
#include "latin1tab.h"
722
  },
723
  STANDARD_VTABLE(little2_)
724
};
725
726
#endif
727
728
static const struct normal_encoding little2_encoding = {
729
  { VTABLE, 2, 0,
730
#if BYTEORDER == 1234
731
    1
732
#else
733
    0
734
#endif
735
  },
736
  {
737
#define BT_COLON BT_NMSTRT
738
#include "asciitab.h"
739
#undef BT_COLON
740
#include "latin1tab.h"
741
  },
742
  STANDARD_VTABLE(little2_)
743
};
744
745
#if BYTEORDER != 4321
746
747
#ifdef XML_NS
748
749
static const struct normal_encoding internal_little2_encoding_ns = {
750
  { VTABLE, 2, 0, 1 },
751
  {
752
#include "iasciitab.h"
753
#include "latin1tab.h"
754
  },
755
  STANDARD_VTABLE(little2_)
756
};
757
758
#endif
759
760
static const struct normal_encoding internal_little2_encoding = {
761
  { VTABLE, 2, 0, 1 },
762
  {
763
#define BT_COLON BT_NMSTRT
764
#include "iasciitab.h"
765
#undef BT_COLON
766
#include "latin1tab.h"
767
  },
768
  STANDARD_VTABLE(little2_)
769
};
770
771
#endif
772
773
774
/* Character access for big-endian 2-byte text: p[0] is the high byte
   and p[1] the low byte.  Every use of an argument is parenthesized so
   that pointer expressions (p + 2) and low-precedence expressions for c
   expand correctly. */

/* Byte type of the unit at p: a table lookup on the low byte when the
   high byte is 0, unicode_byte_type() otherwise. */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is 0, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* True when the unit at p is the ASCII character c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Naming-bitmap lookups for the unit at p. */
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
784
785
/* Big-endian 2-byte tokenizer.
   XML_MIN_SIZE builds: no further copy of xmltok_impl.c is compiled.
   Instead the BIG2_* macros are wrapped in functions that the encoding
   tables reference through STANDARD_VTABLE(big2_), and VTABLE is
   redefined to use the big2_ converters.
   Other builds: xmltok_impl.c is instantiated again with the big2_
   prefix and a fixed MINBPC of 2. */
#ifdef XML_MIN_SIZE
786
787
static int PTRFASTCALL
788
big2_byteType(const ENCODING *enc, const char *p)
789
{
790
  return BIG2_BYTE_TYPE(enc, p);
791
}
792
793
static int PTRFASTCALL
794
big2_byteToAscii(const ENCODING *enc, const char *p)
795
{
796
  return BIG2_BYTE_TO_ASCII(enc, p);
797
}
798
799
static int PTRCALL
800
big2_charMatches(const ENCODING *enc, const char *p, int c)
801
{
802
  return BIG2_CHAR_MATCHES(enc, p, c);
803
}
804
805
static int PTRFASTCALL
806
big2_isNameMin(const ENCODING *enc, const char *p)
807
{
808
  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
809
}
810
811
static int PTRFASTCALL
812
big2_isNmstrtMin(const ENCODING *enc, const char *p)
813
{
814
  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
815
}
816
817
#undef VTABLE
818
#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
819
820
#else /* not XML_MIN_SIZE */
821
822
#undef PREFIX
823
0
#define PREFIX(ident) big2_ ## ident
824
0
#define MINBPC(enc) 2
825
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
826
0
#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
827
0
#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
828
0
#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
829
0
/* characters longer than MINBPC are never name or name-start characters */
#define IS_NAME_CHAR(enc, p, n) 0
830
0
#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
831
0
#define IS_NMSTRT_CHAR(enc, p, n) (0)
832
0
#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
833
834
#include "xmltok_impl.c"
835
836
#undef MINBPC
837
#undef BYTE_TYPE
838
#undef BYTE_TO_ASCII
839
#undef CHAR_MATCHES
840
#undef IS_NAME_CHAR
841
#undef IS_NAME_CHAR_MINBPC
842
#undef IS_NMSTRT_CHAR
843
#undef IS_NMSTRT_CHAR_MINBPC
844
#undef IS_INVALID_CHAR
845
846
#endif /* not XML_MIN_SIZE */
847
848
/* The big-endian 2-byte encoding objects.  The ENCODING member is VTABLE
   followed by 2, 0 and a last value that is 1 when BYTEORDER is 4321 and
   0 otherwise (presumably minBytesPerChar, isUtf8 and isUtf16 -- TODO
   confirm against the ENCODING declaration).  The byte type table is
   asciitab.h followed by latin1tab.h.
   The *_ns variants exist only when XML_NS is defined; the others
   include the ASCII table with BT_COLON defined as BT_NMSTRT.
   The internal_* variants are compiled unless BYTEORDER is 1234, use
   iasciitab.h, and always have 1 as the last ENCODING value. */
#ifdef XML_NS
849
850
static const struct normal_encoding big2_encoding_ns = {
851
  { VTABLE, 2, 0,
852
#if BYTEORDER == 4321
853
  1
854
#else
855
  0
856
#endif
857
  },
858
  {
859
#include "asciitab.h"
860
#include "latin1tab.h"
861
  },
862
  STANDARD_VTABLE(big2_)
863
};
864
865
#endif
866
867
static const struct normal_encoding big2_encoding = {
868
  { VTABLE, 2, 0,
869
#if BYTEORDER == 4321
870
  1
871
#else
872
  0
873
#endif
874
  },
875
  {
876
#define BT_COLON BT_NMSTRT
877
#include "asciitab.h"
878
#undef BT_COLON
879
#include "latin1tab.h"
880
  },
881
  STANDARD_VTABLE(big2_)
882
};
883
884
#if BYTEORDER != 1234
885
886
#ifdef XML_NS
887
888
static const struct normal_encoding internal_big2_encoding_ns = {
889
  { VTABLE, 2, 0, 1 },
890
  {
891
#include "iasciitab.h"
892
#include "latin1tab.h"
893
  },
894
  STANDARD_VTABLE(big2_)
895
};
896
897
#endif
898
899
static const struct normal_encoding internal_big2_encoding = {
900
  { VTABLE, 2, 0, 1 },
901
  {
902
#define BT_COLON BT_NMSTRT
903
#include "iasciitab.h"
904
#undef BT_COLON
905
#include "latin1tab.h"
906
  },
907
  STANDARD_VTABLE(big2_)
908
};
909
910
#endif
911
912
#undef PREFIX
913
914
static int FASTCALL
915
streqci(const char *s1, const char *s2)
916
0
{
917
0
  for (;;) {
918
0
    char c1 = *s1++;
919
0
    char c2 = *s2++;
920
0
    if (ASCII_a <= c1 && c1 <= ASCII_z)
921
0
      c1 += ASCII_A - ASCII_a;
922
0
    if (ASCII_a <= c2 && c2 <= ASCII_z)
923
0
      c2 += ASCII_A - ASCII_a;
924
0
    if (c1 != c2)
925
0
      return 0;
926
0
    if (!c1)
927
0
      break;
928
0
  }
929
0
  return 1;
930
0
}
931
932
static void PTRCALL
933
initUpdatePosition(const ENCODING *enc, const char *ptr,
934
                   const char *end, POSITION *pos)
935
0
{
936
0
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
937
0
}
938
939
static int
940
toAscii(const ENCODING *enc, const char *ptr, const char *end)
941
0
{
942
0
  char buf[1];
943
0
  char *p = buf;
944
0
  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
945
0
  if (p == buf)
946
0
    return -1;
947
0
  else
948
0
    return buf[0];
949
0
}
950
951
static int FASTCALL
952
isSpace(int c)
953
0
{
954
0
  switch (c) {
955
0
  case 0x20:
956
0
  case 0xD:
957
0
  case 0xA:
958
0
  case 0x9:
959
0
    return 1;
960
0
  }
961
0
  return 0;
962
0
}
963
964
/* Return 1 if there's just optional white space or there's an S
965
   followed by name=val.
966
*/
967
static int
968
parsePseudoAttribute(const ENCODING *enc,
969
                     const char *ptr,
970
                     const char *end,
971
                     const char **namePtr,
972
                     const char **nameEndPtr,
973
                     const char **valPtr,
974
                     const char **nextTokPtr)
975
0
{
976
0
  int c;
977
0
  char open;
978
0
  if (ptr == end) {
979
0
    *namePtr = NULL;
980
0
    return 1;
981
0
  }
982
0
  if (!isSpace(toAscii(enc, ptr, end))) {
983
0
    *nextTokPtr = ptr;
984
0
    return 0;
985
0
  }
986
0
  do {
987
0
    ptr += enc->minBytesPerChar;
988
0
  } while (isSpace(toAscii(enc, ptr, end)));
989
0
  if (ptr == end) {
990
0
    *namePtr = NULL;
991
0
    return 1;
992
0
  }
993
0
  *namePtr = ptr;
994
0
  for (;;) {
995
0
    c = toAscii(enc, ptr, end);
996
0
    if (c == -1) {
997
0
      *nextTokPtr = ptr;
998
0
      return 0;
999
0
    }
1000
0
    if (c == ASCII_EQUALS) {
1001
0
      *nameEndPtr = ptr;
1002
0
      break;
1003
0
    }
1004
0
    if (isSpace(c)) {
1005
0
      *nameEndPtr = ptr;
1006
0
      do {
1007
0
        ptr += enc->minBytesPerChar;
1008
0
      } while (isSpace(c = toAscii(enc, ptr, end)));
1009
0
      if (c != ASCII_EQUALS) {
1010
0
        *nextTokPtr = ptr;
1011
0
        return 0;
1012
0
      }
1013
0
      break;
1014
0
    }
1015
0
    ptr += enc->minBytesPerChar;
1016
0
  }
1017
0
  if (ptr == *namePtr) {
1018
0
    *nextTokPtr = ptr;
1019
0
    return 0;
1020
0
  }
1021
0
  ptr += enc->minBytesPerChar;
1022
0
  c = toAscii(enc, ptr, end);
1023
0
  while (isSpace(c)) {
1024
0
    ptr += enc->minBytesPerChar;
1025
0
    c = toAscii(enc, ptr, end);
1026
0
  }
1027
0
  if (c != ASCII_QUOT && c != ASCII_APOS) {
1028
0
    *nextTokPtr = ptr;
1029
0
    return 0;
1030
0
  }
1031
0
  open = (char)c;
1032
0
  ptr += enc->minBytesPerChar;
1033
0
  *valPtr = ptr;
1034
0
  for (;; ptr += enc->minBytesPerChar) {
1035
0
    c = toAscii(enc, ptr, end);
1036
0
    if (c == open)
1037
0
      break;
1038
0
    if (!(ASCII_a <= c && c <= ASCII_z)
1039
0
        && !(ASCII_A <= c && c <= ASCII_Z)
1040
0
        && !(ASCII_0 <= c && c <= ASCII_9)
1041
0
        && c != ASCII_PERIOD
1042
0
        && c != ASCII_MINUS
1043
0
        && c != ASCII_UNDERSCORE) {
1044
0
      *nextTokPtr = ptr;
1045
0
      return 0;
1046
0
    }
1047
0
  }
1048
0
  *nextTokPtr = ptr + enc->minBytesPerChar;
1049
0
  return 1;
1050
0
}
1051
1052
/* NUL-terminated keyword strings for the XML declaration, spelled with
   the ASCII_* constants rather than character literals.  KW_version and
   KW_XML_1_0 are passed to XmlNameMatchesAscii() in doParseXmlDecl. */
static const char KW_version[] = {
1053
  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1054
};
1055
1056
static const char KW_encoding[] = {
1057
  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1058
};
1059
1060
static const char KW_standalone[] = {
1061
  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
1062
  ASCII_n, ASCII_e, '\0'
1063
};
1064
1065
static const char KW_yes[] = {
1066
  ASCII_y, ASCII_e, ASCII_s,  '\0'
1067
};
1068
1069
static const char KW_no[] = {
1070
  ASCII_n, ASCII_o,  '\0'
1071
};
1072
1073
/* BEGIN MOZILLA CHANGE (http://bugzilla.mozilla.org/show_bug.cgi?id=62157) */
1074
/* the only version string that doParseXmlDecl accepts: "1.0" */
static const char KW_XML_1_0[] = {
1075
  ASCII_1, ASCII_PERIOD, ASCII_0, '\0'
1076
};
1077
/* END MOZILLA CHANGE */
1078
1079
static int
1080
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1081
                                                 const char *,
1082
                                                 const char *),
1083
               int isGeneralTextEntity,
1084
               const ENCODING *enc,
1085
               const char *ptr,
1086
               const char *end,
1087
               const char **badPtr,
1088
               const char **versionPtr,
1089
               const char **versionEndPtr,
1090
               const char **encodingName,
1091
               const ENCODING **encoding,
1092
               int *standalone)
1093
0
{
1094
0
  const char *val = NULL;
1095
0
  const char *name = NULL;
1096
0
  const char *nameEnd = NULL;
1097
0
  ptr += 5 * enc->minBytesPerChar;
1098
0
  end -= 2 * enc->minBytesPerChar;
1099
0
  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1100
0
      || !name) {
1101
0
    *badPtr = ptr;
1102
0
    return 0;
1103
0
  }
1104
0
  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1105
0
    if (!isGeneralTextEntity) {
1106
0
      *badPtr = name;
1107
0
      return 0;
1108
0
    }
1109
0
  }
1110
0
  else {
1111
0
    if (versionPtr)
1112
0
      *versionPtr = val;
1113
0
    if (versionEndPtr)
1114
0
      *versionEndPtr = ptr;
1115
0
/* BEGIN MOZILLA CHANGE (http://bugzilla.mozilla.org/show_bug.cgi?id=62157) */
1116
0
     /* Anything else but a version="1.0" is invalid for us, until we support later versions. */
1117
0
     if (!XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_XML_1_0)) {
1118
0
       *badPtr = val;
1119
0
       return 0;
1120
0
     }
1121
0
/* END MOZILLA CHANGE */
1122
0
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1123
0
      *badPtr = ptr;
1124
0
      return 0;
1125
0
    }
1126
0
    if (!name) {
1127
0
      if (isGeneralTextEntity) {
1128
0
        /* a TextDecl must have an EncodingDecl */
1129
0
        *badPtr = ptr;
1130
0
        return 0;
1131
0
      }
1132
0
      return 1;
1133
0
    }
1134
0
  }
1135
0
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1136
0
    int c = toAscii(enc, val, end);
1137
0
    if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1138
0
      *badPtr = val;
1139
0
      return 0;
1140
0
    }
1141
0
    if (encodingName)
1142
0
      *encodingName = val;
1143
0
    if (encoding)
1144
0
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1145
0
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1146
0
      *badPtr = ptr;
1147
0
      return 0;
1148
0
    }
1149
0
    if (!name)
1150
0
      return 1;
1151
0
  }
1152
0
  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1153
0
      || isGeneralTextEntity) {
1154
0
    *badPtr = name;
1155
0
    return 0;
1156
0
  }
1157
0
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1158
0
    if (standalone)
1159
0
      *standalone = 1;
1160
0
  }
1161
0
  else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1162
0
    if (standalone)
1163
0
      *standalone = 0;
1164
0
  }
1165
0
  else {
1166
0
    *badPtr = val;
1167
0
    return 0;
1168
0
  }
1169
0
  while (isSpace(toAscii(enc, ptr, end)))
1170
0
    ptr += enc->minBytesPerChar;
1171
0
  if (ptr != end) {
1172
0
    *badPtr = ptr;
1173
0
    return 0;
1174
0
  }
1175
0
  return 1;
1176
0
}
1177
1178
static int FASTCALL
1179
checkCharRefNumber(int result)
1180
0
{
1181
0
  switch (result >> 8) {
1182
0
  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1183
0
  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1184
0
    return -1;
1185
0
  case 0:
1186
0
    if (latin1_encoding.type[result] == BT_NONXML)
1187
0
      return -1;
1188
0
    break;
1189
0
  case 0xFF:
1190
0
    if (result == 0xFFFE || result == 0xFFFF)
1191
0
      return -1;
1192
0
    break;
1193
0
  }
1194
0
  return result;
1195
0
}
1196
1197
int FASTCALL
1198
XmlUtf8Encode(int c, char *buf)
1199
0
{
1200
0
  enum {
1201
0
    /* minN is minimum legal resulting value for N byte sequence */
1202
0
    min2 = 0x80,
1203
0
    min3 = 0x800,
1204
0
    min4 = 0x10000
1205
0
  };
1206
0
1207
0
  if (c < 0)
1208
0
    return 0;
1209
0
  if (c < min2) {
1210
0
    buf[0] = (char)(c | UTF8_cval1);
1211
0
    return 1;
1212
0
  }
1213
0
  if (c < min3) {
1214
0
    buf[0] = (char)((c >> 6) | UTF8_cval2);
1215
0
    buf[1] = (char)((c & 0x3f) | 0x80);
1216
0
    return 2;
1217
0
  }
1218
0
  if (c < min4) {
1219
0
    buf[0] = (char)((c >> 12) | UTF8_cval3);
1220
0
    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1221
0
    buf[2] = (char)((c & 0x3f) | 0x80);
1222
0
    return 3;
1223
0
  }
1224
0
  if (c < 0x110000) {
1225
0
    buf[0] = (char)((c >> 18) | UTF8_cval4);
1226
0
    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1227
0
    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1228
0
    buf[3] = (char)((c & 0x3f) | 0x80);
1229
0
    return 4;
1230
0
  }
1231
0
  return 0;
1232
0
}
1233
1234
int FASTCALL
1235
XmlUtf16Encode(int charNum, unsigned short *buf)
1236
0
{
1237
0
  if (charNum < 0)
1238
0
    return 0;
1239
0
  if (charNum < 0x10000) {
1240
0
    buf[0] = (unsigned short)charNum;
1241
0
    return 1;
1242
0
  }
1243
0
  if (charNum < 0x110000) {
1244
0
    charNum -= 0x10000;
1245
0
    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1246
0
    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1247
0
    return 2;
1248
0
  }
1249
0
  return 0;
1250
0
}
1251
1252
/* State of an encoding built at run time by XmlInitUnknownEncoding. */
struct unknown_encoding {
1253
  /* Kept as the first member: XmlInitUnknownEncoding hands out
     &normal.enc and AS_UNKNOWN_ENCODING casts that pointer back. */
  struct normal_encoding normal;
1254
  /* Callback invoked as convert(userData, p) for bytes that start a
     multi-byte sequence; its int result is used as the character number. */
  CONVERTER convert;
1255
  /* Passed back unchanged as the first argument of convert. */
  void *userData;
1256
  /* UTF-16 unit for each input byte; 0 means "call convert". */
  unsigned short utf16[256];
1257
  /* Per input byte: element 0 is the UTF-8 length and the following
     elements are the UTF-8 bytes; a length of 0 means "call convert". */
  char utf8[256][4];
1258
};
1259
1260
0
/* View an ENCODING pointer as the unknown_encoding it was created from. */
#define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
1261
1262
int
1263
XmlSizeOfUnknownEncoding(void)
1264
0
{
1265
0
  return sizeof(struct unknown_encoding);
1266
0
}
1267
1268
static int PTRFASTCALL
1269
unknown_isName(const ENCODING *enc, const char *p)
1270
0
{
1271
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1272
0
  int c = uenc->convert(uenc->userData, p);
1273
0
  if (c & ~0xFFFF)
1274
0
    return 0;
1275
0
  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1276
0
}
1277
1278
static int PTRFASTCALL
1279
unknown_isNmstrt(const ENCODING *enc, const char *p)
1280
0
{
1281
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1282
0
  int c = uenc->convert(uenc->userData, p);
1283
0
  if (c & ~0xFFFF)
1284
0
    return 0;
1285
0
  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1286
0
}
1287
1288
static int PTRFASTCALL
1289
unknown_isInvalid(const ENCODING *enc, const char *p)
1290
0
{
1291
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1292
0
  int c = uenc->convert(uenc->userData, p);
1293
0
  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1294
0
}
1295
1296
static void PTRCALL
1297
unknown_toUtf8(const ENCODING *enc,
1298
               const char **fromP, const char *fromLim,
1299
               char **toP, const char *toLim)
1300
0
{
1301
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1302
0
  char buf[XML_UTF8_ENCODE_MAX];
1303
0
  for (;;) {
1304
0
    const char *utf8;
1305
0
    int n;
1306
0
    if (*fromP == fromLim)
1307
0
      break;
1308
0
    utf8 = uenc->utf8[(unsigned char)**fromP];
1309
0
    n = *utf8++;
1310
0
    if (n == 0) {
1311
0
      int c = uenc->convert(uenc->userData, *fromP);
1312
0
      n = XmlUtf8Encode(c, buf);
1313
0
      if (n > toLim - *toP)
1314
0
        break;
1315
0
      utf8 = buf;
1316
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1317
0
                 - (BT_LEAD2 - 2));
1318
0
    }
1319
0
    else {
1320
0
      if (n > toLim - *toP)
1321
0
        break;
1322
0
      (*fromP)++;
1323
0
    }
1324
0
    do {
1325
0
      *(*toP)++ = *utf8++;
1326
0
    } while (--n != 0);
1327
0
  }
1328
0
}
1329
1330
static void PTRCALL
1331
unknown_toUtf16(const ENCODING *enc,
1332
                const char **fromP, const char *fromLim,
1333
                unsigned short **toP, const unsigned short *toLim)
1334
0
{
1335
0
  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1336
0
  while (*fromP != fromLim && *toP != toLim) {
1337
0
    unsigned short c = uenc->utf16[(unsigned char)**fromP];
1338
0
    if (c == 0) {
1339
0
      c = (unsigned short)
1340
0
          uenc->convert(uenc->userData, *fromP);
1341
0
      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1342
0
                 - (BT_LEAD2 - 2));
1343
0
    }
1344
0
    else
1345
0
      (*fromP)++;
1346
0
    *(*toP)++ = c;
1347
0
  }
1348
0
}
1349
1350
ENCODING *
1351
XmlInitUnknownEncoding(void *mem,
1352
                       int *table,
1353
                       CONVERTER convert, 
1354
                       void *userData)
1355
0
{
1356
0
  int i;
1357
0
  struct unknown_encoding *e = (struct unknown_encoding *)mem;
1358
0
  for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1359
0
    ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1360
0
  for (i = 0; i < 128; i++)
1361
0
    if (latin1_encoding.type[i] != BT_OTHER
1362
0
        && latin1_encoding.type[i] != BT_NONXML
1363
0
        && table[i] != i)
1364
0
      return 0;
1365
0
  for (i = 0; i < 256; i++) {
1366
0
    int c = table[i];
1367
0
    if (c == -1) {
1368
0
      e->normal.type[i] = BT_MALFORM;
1369
0
      /* This shouldn't really get used. */
1370
0
      e->utf16[i] = 0xFFFF;
1371
0
      e->utf8[i][0] = 1;
1372
0
      e->utf8[i][1] = 0;
1373
0
    }
1374
0
    else if (c < 0) {
1375
0
      if (c < -4)
1376
0
        return 0;
1377
0
      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1378
0
      e->utf8[i][0] = 0;
1379
0
      e->utf16[i] = 0;
1380
0
    }
1381
0
    else if (c < 0x80) {
1382
0
      if (latin1_encoding.type[c] != BT_OTHER
1383
0
          && latin1_encoding.type[c] != BT_NONXML
1384
0
          && c != i)
1385
0
        return 0;
1386
0
      e->normal.type[i] = latin1_encoding.type[c];
1387
0
      e->utf8[i][0] = 1;
1388
0
      e->utf8[i][1] = (char)c;
1389
0
      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1390
0
    }
1391
0
    else if (checkCharRefNumber(c) < 0) {
1392
0
      e->normal.type[i] = BT_NONXML;
1393
0
      /* This shouldn't really get used. */
1394
0
      e->utf16[i] = 0xFFFF;
1395
0
      e->utf8[i][0] = 1;
1396
0
      e->utf8[i][1] = 0;
1397
0
    }
1398
0
    else {
1399
0
      if (c > 0xFFFF)
1400
0
        return 0;
1401
0
      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1402
0
        e->normal.type[i] = BT_NMSTRT;
1403
0
      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1404
0
        e->normal.type[i] = BT_NAME;
1405
0
      else
1406
0
        e->normal.type[i] = BT_OTHER;
1407
0
      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1408
0
      e->utf16[i] = (unsigned short)c;
1409
0
    }
1410
0
  }
1411
0
  e->userData = userData;
1412
0
  e->convert = convert;
1413
0
  if (convert) {
1414
0
    e->normal.isName2 = unknown_isName;
1415
0
    e->normal.isName3 = unknown_isName;
1416
0
    e->normal.isName4 = unknown_isName;
1417
0
    e->normal.isNmstrt2 = unknown_isNmstrt;
1418
0
    e->normal.isNmstrt3 = unknown_isNmstrt;
1419
0
    e->normal.isNmstrt4 = unknown_isNmstrt;
1420
0
    e->normal.isInvalid2 = unknown_isInvalid;
1421
0
    e->normal.isInvalid3 = unknown_isInvalid;
1422
0
    e->normal.isInvalid4 = unknown_isInvalid;
1423
0
  }
1424
0
  e->normal.enc.utf8Convert = unknown_toUtf8;
1425
0
  e->normal.enc.utf16Convert = unknown_toUtf16;
1426
0
  return &(e->normal.enc);
1427
0
}
1428
1429
/* If this enumeration is changed, getEncodingIndex and encodings
1430
must also be changed.
   The non-negative values up to UTF_16LE_ENC are the positions of the
   corresponding names in getEncodingIndex's encodingNames array.
   getEncodingIndex returns UNKNOWN_ENC for a name it does not recognise
   and NO_ENC when it is given no name at all. */
1431
enum {
1432
  UNKNOWN_ENC = -1,
1433
  ISO_8859_1_ENC = 0,
1434
  US_ASCII_ENC,
1435
  UTF_8_ENC,
1436
  UTF_16_ENC,
1437
  UTF_16BE_ENC,
1438
  UTF_16LE_ENC,
1439
  /* must match encodingNames up to here */
1440
  NO_ENC
1441
};
1442
1443
/* NUL-terminated names of the built-in encodings, spelled with the ASCII_*
   constants.  getEncodingIndex compares a caller-supplied name against
   them with streqci. */
/* "ISO-8859-1" */
static const char KW_ISO_8859_1[] = {
1444
  ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
1445
  ASCII_MINUS, ASCII_1, '\0'
1446
};
1447
/* "US-ASCII" */
static const char KW_US_ASCII[] = {
1448
  ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
1449
  '\0'
1450
};
1451
/* "UTF-8" */
static const char KW_UTF_8[] =  {
1452
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1453
};
1454
/* "UTF-16" */
static const char KW_UTF_16[] = {
1455
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1456
};
1457
/* "UTF-16BE" */
static const char KW_UTF_16BE[] = {
1458
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
1459
  '\0'
1460
};
1461
/* "UTF-16LE" */
static const char KW_UTF_16LE[] = {
1462
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
1463
  '\0'
1464
};
1465
1466
static int FASTCALL
1467
getEncodingIndex(const char *name)
1468
0
{
1469
0
  static const char * const encodingNames[] = {
1470
0
    KW_ISO_8859_1,
1471
0
    KW_US_ASCII,
1472
0
    KW_UTF_8,
1473
0
    KW_UTF_16,
1474
0
    KW_UTF_16BE,
1475
0
    KW_UTF_16LE,
1476
0
  };
1477
0
  int i;
1478
0
  if (name == NULL)
1479
0
    return NO_ENC;
1480
0
  for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1481
0
    if (streqci(name, encodingNames[i]))
1482
0
      return i;
1483
0
  return UNKNOWN_ENC;
1484
0
}
1485
1486
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.
*/

/* Read the stored encoding index of an INIT_ENCODING as an int. */
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
/* Store an encoding index.  The argument is parenthesized so that an
   expression argument is converted to char as a whole instead of having
   the cast bind only to its first operand. */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1492
1493
/* This is what detects the encoding.  encodingTable maps from
1494
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1495
   the external (protocol) specified encoding; state is
1496
   XML_CONTENT_STATE if we're parsing an external text entity, and
1497
   XML_PROLOG_STATE otherwise.
1498
*/
1499
1500
1501
static int
1502
initScan(const ENCODING * const *encodingTable,
1503
         const INIT_ENCODING *enc,
1504
         int state,
1505
         const char *ptr,
1506
         const char *end,
1507
         const char **nextTokPtr)
1508
0
{
1509
0
  const ENCODING **encPtr;
1510
0
1511
0
  if (ptr == end)
1512
0
    return XML_TOK_NONE;
1513
0
  encPtr = enc->encPtr;
1514
0
  if (ptr + 1 == end) {
1515
0
    /* only a single byte available for auto-detection */
1516
#ifndef XML_DTD /* FIXME */
1517
    /* a well-formed document entity must have more than one byte */
1518
    if (state != XML_CONTENT_STATE)
1519
      return XML_TOK_PARTIAL;
1520
#endif
1521
    /* so we're parsing an external text entity... */
1522
0
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
1523
0
    switch (INIT_ENC_INDEX(enc)) {
1524
0
    case UTF_16_ENC:
1525
0
    case UTF_16LE_ENC:
1526
0
    case UTF_16BE_ENC:
1527
0
      return XML_TOK_PARTIAL;
1528
0
    }
1529
0
    switch ((unsigned char)*ptr) {
1530
0
    case 0xFE:
1531
0
    case 0xFF:
1532
0
    case 0xEF: /* possibly first byte of UTF-8 BOM */
1533
0
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1534
0
          && state == XML_CONTENT_STATE)
1535
0
        break;
1536
0
      /* fall through */
1537
0
    case 0x00:
1538
0
    case 0x3C:
1539
0
      return XML_TOK_PARTIAL;
1540
0
    }
1541
0
  }
1542
0
  else {
1543
0
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1544
0
    case 0xFEFF:
1545
0
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1546
0
          && state == XML_CONTENT_STATE)
1547
0
        break;
1548
0
      *nextTokPtr = ptr + 2;
1549
0
      *encPtr = encodingTable[UTF_16BE_ENC];
1550
0
      return XML_TOK_BOM;
1551
0
    /* 00 3C is handled in the default case */
1552
0
    case 0x3C00:
1553
0
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1554
0
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1555
0
          && state == XML_CONTENT_STATE)
1556
0
        break;
1557
0
      *encPtr = encodingTable[UTF_16LE_ENC];
1558
0
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1559
0
    case 0xFFFE:
1560
0
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1561
0
          && state == XML_CONTENT_STATE)
1562
0
        break;
1563
0
      *nextTokPtr = ptr + 2;
1564
0
      *encPtr = encodingTable[UTF_16LE_ENC];
1565
0
      return XML_TOK_BOM;
1566
0
    case 0xEFBB:
1567
0
      /* Maybe a UTF-8 BOM (EF BB BF) */
1568
0
      /* If there's an explicitly specified (external) encoding
1569
0
         of ISO-8859-1 or some flavour of UTF-16
1570
0
         and this is an external text entity,
1571
0
         don't look for the BOM,
1572
0
         because it might be a legal data.
1573
0
      */
1574
0
      if (state == XML_CONTENT_STATE) {
1575
0
        int e = INIT_ENC_INDEX(enc);
1576
0
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
1577
0
            || e == UTF_16LE_ENC || e == UTF_16_ENC)
1578
0
          break;
1579
0
      }
1580
0
      if (ptr + 2 == end)
1581
0
        return XML_TOK_PARTIAL;
1582
0
      if ((unsigned char)ptr[2] == 0xBF) {
1583
0
        *nextTokPtr = ptr + 3;
1584
0
        *encPtr = encodingTable[UTF_8_ENC];
1585
0
        return XML_TOK_BOM;
1586
0
      }
1587
0
      break;
1588
0
    default:
1589
0
      if (ptr[0] == '\0') {
1590
0
        /* 0 isn't a legal data character. Furthermore a document
1591
0
           entity can only start with ASCII characters.  So the only
1592
0
           way this can fail to be big-endian UTF-16 if it it's an
1593
0
           external parsed general entity that's labelled as
1594
0
           UTF-16LE.
1595
0
        */
1596
0
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1597
0
          break;
1598
0
        *encPtr = encodingTable[UTF_16BE_ENC];
1599
0
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1600
0
      }
1601
0
      else if (ptr[1] == '\0') {
1602
0
        /* We could recover here in the case:
1603
0
            - parsing an external entity
1604
0
            - second byte is 0
1605
0
            - no externally specified encoding
1606
0
            - no encoding declaration
1607
0
           by assuming UTF-16LE.  But we don't, because this would mean when
1608
0
           presented just with a single byte, we couldn't reliably determine
1609
0
           whether we needed further bytes.
1610
0
        */
1611
0
        if (state == XML_CONTENT_STATE)
1612
0
          break;
1613
0
        *encPtr = encodingTable[UTF_16LE_ENC];
1614
0
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1615
0
      }
1616
0
      break;
1617
0
    }
1618
0
  }
1619
0
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1620
0
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1621
0
}
1622
1623
1624
0
/* xmltok_ns.c is included as a template.  For this first inclusion NS()
   and ns() expand to their argument unchanged, so the names it defines
   carry no suffix. */
#define NS(x) x
1625
0
#define ns(x) x
1626
#include "xmltok_ns.c"
1627
#undef NS
1628
#undef ns
1629
1630
#ifdef XML_NS
1631
1632
0
/* Second inclusion, only when XML_NS is defined: NS() appends "NS" and
   ns() appends "_ns" to each name, giving a separately named set of
   definitions (presumably the namespace-aware variants). */
#define NS(x) x ## NS
1633
0
#define ns(x) x ## _ns
1634
1635
#include "xmltok_ns.c"
1636
1637
#undef NS
1638
#undef ns
1639
1640
ENCODING *
1641
XmlInitUnknownEncodingNS(void *mem,
1642
                         int *table,
1643
                         CONVERTER convert, 
1644
                         void *userData)
1645
0
{
1646
0
  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1647
0
  if (enc)
1648
0
    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1649
0
  return enc;
1650
0
}
1651
1652
#endif /* XML_NS */
1653
1654
/* BEGIN MOZILLA CHANGE (Mozilla extensions for QName checking) */
1655
#ifdef MOZILLA_CLIENT
1656
#include "moz_extensions.c"
1657
#endif /* MOZILLA_CLIENT */
1658
/* END MOZILLA CHANGE */