Coverage Report

Created: 2025-07-07 10:01

/work/workdir/UnpackedTarball/libxml2/encoding.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * encoding.c : implements the encoding conversion functions needed for XML
3
 *
4
 * Related specs:
5
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6
 * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
8
 * [ISO-8859-1]   ISO Latin-1 characters codes.
9
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
10
 *                Worldwide Character Encoding -- Version 1.0", Addison-
11
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
12
 *                described in Unicode Technical Report #4.
13
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
14
 *                Information Interchange, ANSI X3.4-1986.
15
 *
16
 * See Copyright for the status of this software.
17
 *
18
 * daniel@veillard.com
19
 *
20
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
21
 */
22
23
#define IN_LIBXML
24
#include "libxml.h"
25
26
#include <string.h>
27
#include <limits.h>
28
#include <ctype.h>
29
#include <stdlib.h>
30
31
#ifdef LIBXML_ICONV_ENABLED
32
#include <iconv.h>
33
#include <errno.h>
34
#endif
35
36
#include <libxml/encoding.h>
37
#include <libxml/xmlmemory.h>
38
#include <libxml/parser.h>
39
#ifdef LIBXML_HTML_ENABLED
40
#include <libxml/HTMLparser.h>
41
#endif
42
#include <libxml/xmlerror.h>
43
44
#include "private/buf.h"
45
#include "private/enc.h"
46
#include "private/entities.h"
47
#include "private/error.h"
48
#include "private/memory.h"
49
50
#ifdef LIBXML_ICU_ENABLED
51
#include <unicode/ucnv.h>
52
#endif
53
54
2.84k
#define XML_HANDLER_STATIC (1 << 0)
55
4.70k
#define XML_HANDLER_LEGACY (1 << 1)
56
57
typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
58
typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
59
struct _xmlCharEncodingAlias {
60
    const char *name;
61
    const char *alias;
62
};
63
64
static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
65
static int xmlCharEncodingAliasesNb = 0;
66
static int xmlCharEncodingAliasesMax = 0;
67
68
static int xmlLittleEndian = 1;
69
70
typedef struct {
71
    const char *name;
72
    xmlCharEncoding enc;
73
} xmlEncTableEntry;
74
75
static const xmlEncTableEntry xmlEncTable[] = {
76
    { "ASCII", XML_CHAR_ENCODING_ASCII },
77
    { "EUC-JP", XML_CHAR_ENCODING_EUC_JP },
78
    { "HTML", XML_CHAR_ENCODING_HTML },
79
    { "ISO LATIN 1", XML_CHAR_ENCODING_8859_1 },
80
    { "ISO LATIN 2", XML_CHAR_ENCODING_8859_2 },
81
    { "ISO-10646-UCS-2", XML_CHAR_ENCODING_UCS2 },
82
    { "ISO-10646-UCS-4", XML_CHAR_ENCODING_UCS4LE },
83
    { "ISO-2022-JP", XML_CHAR_ENCODING_2022_JP },
84
    { "ISO-8859-1", XML_CHAR_ENCODING_8859_1 },
85
    { "ISO-8859-10", XML_CHAR_ENCODING_8859_10 },
86
    { "ISO-8859-11", XML_CHAR_ENCODING_8859_11 },
87
    { "ISO-8859-13", XML_CHAR_ENCODING_8859_13 },
88
    { "ISO-8859-14", XML_CHAR_ENCODING_8859_14 },
89
    { "ISO-8859-15", XML_CHAR_ENCODING_8859_15 },
90
    { "ISO-8859-16", XML_CHAR_ENCODING_8859_16 },
91
    { "ISO-8859-2", XML_CHAR_ENCODING_8859_2 },
92
    { "ISO-8859-3", XML_CHAR_ENCODING_8859_3 },
93
    { "ISO-8859-4", XML_CHAR_ENCODING_8859_4 },
94
    { "ISO-8859-5", XML_CHAR_ENCODING_8859_5 },
95
    { "ISO-8859-6", XML_CHAR_ENCODING_8859_6 },
96
    { "ISO-8859-7", XML_CHAR_ENCODING_8859_7 },
97
    { "ISO-8859-8", XML_CHAR_ENCODING_8859_8 },
98
    { "ISO-8859-9", XML_CHAR_ENCODING_8859_9 },
99
    { "ISO-LATIN-1", XML_CHAR_ENCODING_8859_1 },
100
    { "ISO-LATIN-2", XML_CHAR_ENCODING_8859_2 },
101
    { "SHIFT_JIS", XML_CHAR_ENCODING_SHIFT_JIS },
102
    { "UCS-2", XML_CHAR_ENCODING_UCS2 },
103
    { "UCS-4", XML_CHAR_ENCODING_UCS4LE },
104
    { "UCS2", XML_CHAR_ENCODING_UCS2 },
105
    { "UCS4", XML_CHAR_ENCODING_UCS4LE },
106
    { "US-ASCII", XML_CHAR_ENCODING_ASCII },
107
    { "UTF-16", XML_CHAR_ENCODING_UTF16 },
108
    { "UTF-16BE", XML_CHAR_ENCODING_UTF16BE },
109
    { "UTF-16LE", XML_CHAR_ENCODING_UTF16LE },
110
    { "UTF-8", XML_CHAR_ENCODING_UTF8 },
111
    { "UTF16", XML_CHAR_ENCODING_UTF16 },
112
    { "UTF8", XML_CHAR_ENCODING_UTF8 }
113
};
114
115
static xmlCharEncError
116
asciiToAscii(void *vctxt, unsigned char* out, int *outlen,
117
             const unsigned char* in, int *inlen, int flush);
118
static xmlCharEncError
119
UTF8ToUTF8(void *vctxt, unsigned char* out, int *outlen,
120
           const unsigned char* inb, int *inlenb, int flush);
121
static xmlCharEncError
122
latin1ToUTF8(void *vctxt, unsigned char* out, int *outlen,
123
             const unsigned char* in, int *inlen, int flush);
124
static xmlCharEncError
125
UTF16LEToUTF8(void *vctxt, unsigned char* out, int *outlen,
126
              const unsigned char* inb, int *inlenb, int flush);
127
static xmlCharEncError
128
UTF16BEToUTF8(void *vctxt, unsigned char* out, int *outlen,
129
              const unsigned char* inb, int *inlenb, int flush);
130
131
#ifdef LIBXML_OUTPUT_ENABLED
132
133
static xmlCharEncError
134
UTF8ToLatin1(void *vctxt, unsigned char* outb, int *outlen,
135
             const unsigned char* in, int *inlen, int flush);
136
static xmlCharEncError
137
UTF8ToUTF16(void *vctxt, unsigned char* outb, int *outlen,
138
            const unsigned char* in, int *inlen, int flush);
139
static xmlCharEncError
140
UTF8ToUTF16LE(void *vctxt, unsigned char* outb, int *outlen,
141
              const unsigned char* in, int *inlen, int flush);
142
static xmlCharEncError
143
UTF8ToUTF16BE(void *vctxt, unsigned char* outb, int *outlen,
144
              const unsigned char* in, int *inlen, int flush);
145
146
#else /* LIBXML_OUTPUT_ENABLED */
147
148
#define UTF8ToLatin1 NULL
149
#define UTF8ToUTF16 NULL
150
#define UTF8ToUTF16LE NULL
151
#define UTF8ToUTF16BE NULL
152
153
#endif /* LIBXML_OUTPUT_ENABLED */
154
155
#if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
156
static xmlCharEncError
157
UTF8ToHtmlWrapper(void *vctxt, unsigned char *out, int *outlen,
158
                  const unsigned char *in, int *inlen, int flush);
159
#else
160
#define UTF8ToHtmlWrapper NULL
161
#endif
162
163
#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
164
    defined(LIBXML_ISO8859X_ENABLED)
165
166
#include "iso8859x.inc"
167
168
static xmlCharEncError
169
ISO8859xToUTF8(void *vctxt, unsigned char* out, int *outlen,
170
               const unsigned char* in, int *inlen, int flush);
171
static xmlCharEncError
172
UTF8ToISO8859x(void *vctxt, unsigned char *out, int *outlen,
173
               const unsigned char *in, int *inlen, int flush);
174
175
#define MAKE_ISO_HANDLER(name, n) \
176
    { (char *) name, { ISO8859xToUTF8 }, { UTF8ToISO8859x }, \
177
      (void *) xmlunicodetable_ISO8859_##n, \
178
      (void *) xmltranscodetable_ISO8859_##n, \
179
      NULL, XML_HANDLER_STATIC }
180
181
#else /* LIBXML_ISO8859X_ENABLED */
182
183
#define MAKE_ISO_HANDLER(name, n) \
184
    { (char *) name, { NULL }, { NULL }, NULL, NULL, NULL, \
185
      XML_HANDLER_STATIC }
186
187
#endif /* LIBXML_ISO8859X_ENABLED */
188
189
#define MAKE_HANDLER(name, in, out) \
190
    { (char *) name, { in }, { out }, NULL, NULL, NULL, XML_HANDLER_STATIC }
191
192
/*
193
 * The layout must match enum xmlCharEncoding.
194
 *
195
 * Names should match the IANA registry if possible:
196
 * https://www.iana.org/assignments/character-sets/character-sets.xhtml
197
 */
198
static const xmlCharEncodingHandler defaultHandlers[31] = {
199
    MAKE_HANDLER(NULL, NULL, NULL), /* NONE */
200
    MAKE_HANDLER("UTF-8", UTF8ToUTF8, UTF8ToUTF8),
201
    MAKE_HANDLER("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE),
202
    MAKE_HANDLER("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE),
203
    MAKE_HANDLER("UCS-4LE", NULL, NULL),
204
    MAKE_HANDLER("UCS-4BE", NULL, NULL),
205
    MAKE_HANDLER("IBM037", NULL, NULL),
206
    MAKE_HANDLER(NULL, NULL, NULL), /* UCS4_2143 */
207
    MAKE_HANDLER(NULL, NULL, NULL), /* UCS4_3412 */
208
    MAKE_HANDLER("UCS-2", NULL, NULL),
209
    MAKE_HANDLER("ISO-8859-1", latin1ToUTF8, UTF8ToLatin1),
210
    MAKE_ISO_HANDLER("ISO-8859-2", 2),
211
    MAKE_ISO_HANDLER("ISO-8859-3", 3),
212
    MAKE_ISO_HANDLER("ISO-8859-4", 4),
213
    MAKE_ISO_HANDLER("ISO-8859-5", 5),
214
    MAKE_ISO_HANDLER("ISO-8859-6", 6),
215
    MAKE_ISO_HANDLER("ISO-8859-7", 7),
216
    MAKE_ISO_HANDLER("ISO-8859-8", 8),
217
    MAKE_ISO_HANDLER("ISO-8859-9", 9),
218
    MAKE_HANDLER("ISO-2022-JP", NULL, NULL),
219
    MAKE_HANDLER("Shift_JIS", NULL, NULL),
220
    MAKE_HANDLER("EUC-JP", NULL, NULL),
221
    MAKE_HANDLER("US-ASCII", asciiToAscii, asciiToAscii),
222
    MAKE_HANDLER("UTF-16", UTF16LEToUTF8, UTF8ToUTF16),
223
    MAKE_HANDLER("HTML", NULL, UTF8ToHtmlWrapper),
224
    MAKE_ISO_HANDLER("ISO-8859-10", 10),
225
    MAKE_ISO_HANDLER("ISO-8859-11", 11),
226
    MAKE_ISO_HANDLER("ISO-8859-13", 13),
227
    MAKE_ISO_HANDLER("ISO-8859-14", 14),
228
    MAKE_ISO_HANDLER("ISO-8859-15", 15),
229
    MAKE_ISO_HANDLER("ISO-8859-16", 16),
230
};
231
232
#define NUM_DEFAULT_HANDLERS \
233
11.3k
    (sizeof(defaultHandlers) / sizeof(defaultHandlers[0]))
234
235
/* the size should be growable, but it's not a big deal ... */
236
0
#define MAX_ENCODING_HANDLERS 50
237
static xmlCharEncodingHandlerPtr *globalHandlers = NULL;
238
static int nbCharEncodingHandler = 0;
239
240
#ifdef LIBXML_ICONV_ENABLED
241
static xmlParserErrors
242
xmlCharEncIconv(const char *name, xmlCharEncFlags flags,
243
                xmlCharEncodingHandler **out);
244
#endif
245
246
#ifdef LIBXML_ICU_ENABLED
247
static xmlParserErrors
248
xmlCharEncUconv(const char *name, xmlCharEncFlags flags,
249
                xmlCharEncodingHandler **out);
250
#endif
251
252
/************************************************************************
253
 *                  *
254
 *    Generic encoding handling routines      *
255
 *                  *
256
 ************************************************************************/
257
258
/**
259
 * xmlDetectCharEncoding:
260
 * @in:  a pointer to the first bytes of the XML entity, must be at least
261
 *       2 bytes long (at least 4 if encoding is UTF4 variant).
262
 * @len:  pointer to the length of the buffer
263
 *
264
 * Guess the encoding of the entity using the first bytes of the entity content
265
 * according to the non-normative appendix F of the XML-1.0 recommendation.
266
 *
267
 * Returns one of the XML_CHAR_ENCODING_... values.
268
 */
269
xmlCharEncoding
270
xmlDetectCharEncoding(const unsigned char* in, int len)
271
0
{
272
0
    if (in == NULL)
273
0
        return(XML_CHAR_ENCODING_NONE);
274
0
    if (len >= 4) {
275
0
  if ((in[0] == 0x00) && (in[1] == 0x00) &&
276
0
      (in[2] == 0x00) && (in[3] == 0x3C))
277
0
      return(XML_CHAR_ENCODING_UCS4BE);
278
0
  if ((in[0] == 0x3C) && (in[1] == 0x00) &&
279
0
      (in[2] == 0x00) && (in[3] == 0x00))
280
0
      return(XML_CHAR_ENCODING_UCS4LE);
281
0
  if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
282
0
      (in[2] == 0xA7) && (in[3] == 0x94))
283
0
      return(XML_CHAR_ENCODING_EBCDIC);
284
0
  if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
285
0
      (in[2] == 0x78) && (in[3] == 0x6D))
286
0
      return(XML_CHAR_ENCODING_UTF8);
287
  /*
288
   * Although not part of the recommendation, we also
289
   * attempt an "auto-recognition" of UTF-16LE and
290
   * UTF-16BE encodings.
291
   */
292
0
  if ((in[0] == 0x3C) && (in[1] == 0x00) &&
293
0
      (in[2] == 0x3F) && (in[3] == 0x00))
294
0
      return(XML_CHAR_ENCODING_UTF16LE);
295
0
  if ((in[0] == 0x00) && (in[1] == 0x3C) &&
296
0
      (in[2] == 0x00) && (in[3] == 0x3F))
297
0
      return(XML_CHAR_ENCODING_UTF16BE);
298
0
    }
299
0
    if (len >= 3) {
300
  /*
301
   * Errata on XML-1.0 June 20 2001
302
   * We now allow an UTF8 encoded BOM
303
   */
304
0
  if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
305
0
      (in[2] == 0xBF))
306
0
      return(XML_CHAR_ENCODING_UTF8);
307
0
    }
308
    /* For UTF-16 we can recognize by the BOM */
309
0
    if (len >= 2) {
310
0
  if ((in[0] == 0xFE) && (in[1] == 0xFF))
311
0
      return(XML_CHAR_ENCODING_UTF16BE);
312
0
  if ((in[0] == 0xFF) && (in[1] == 0xFE))
313
0
      return(XML_CHAR_ENCODING_UTF16LE);
314
0
    }
315
0
    return(XML_CHAR_ENCODING_NONE);
316
0
}
317
318
/**
319
 * xmlCleanupEncodingAliases:
320
 *
321
 * DEPRECATED: This function modifies global state and is not
322
 * thread-safe.
323
 *
324
 * Unregisters all aliases
325
 */
326
void
327
0
xmlCleanupEncodingAliases(void) {
328
0
    int i;
329
330
0
    if (xmlCharEncodingAliases == NULL)
331
0
  return;
332
333
0
    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
334
0
  if (xmlCharEncodingAliases[i].name != NULL)
335
0
      xmlFree((char *) xmlCharEncodingAliases[i].name);
336
0
  if (xmlCharEncodingAliases[i].alias != NULL)
337
0
      xmlFree((char *) xmlCharEncodingAliases[i].alias);
338
0
    }
339
0
    xmlCharEncodingAliasesNb = 0;
340
0
    xmlCharEncodingAliasesMax = 0;
341
0
    xmlFree(xmlCharEncodingAliases);
342
0
    xmlCharEncodingAliases = NULL;
343
0
}
344
345
/**
346
 * xmlGetEncodingAlias:
347
 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
348
 *
349
 * DEPRECATED: This function is not thread-safe.
350
 *
351
 * Lookup an encoding name for the given alias.
352
 *
353
 * Returns NULL if not found, otherwise the original name
354
 */
355
const char *
356
180k
xmlGetEncodingAlias(const char *alias) {
357
180k
    int i;
358
180k
    char upper[100];
359
360
180k
    if (alias == NULL)
361
0
  return(NULL);
362
363
180k
    if (xmlCharEncodingAliases == NULL)
364
180k
  return(NULL);
365
366
0
    for (i = 0;i < 99;i++) {
367
0
        upper[i] = (char) toupper((unsigned char) alias[i]);
368
0
  if (upper[i] == 0) break;
369
0
    }
370
0
    upper[i] = 0;
371
372
    /*
373
     * Walk down the list looking for a definition of the alias
374
     */
375
0
    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
376
0
  if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
377
0
      return(xmlCharEncodingAliases[i].name);
378
0
  }
379
0
    }
380
0
    return(NULL);
381
0
}
382
383
/**
384
 * xmlAddEncodingAlias:
385
 * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
386
 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
387
 *
388
 * DEPRECATED: This function modifies global state and is not
389
 * thread-safe.
390
 *
391
 * Registers an alias @alias for an encoding named @name. Existing alias
392
 * will be overwritten.
393
 *
394
 * Returns 0 in case of success, -1 in case of error
395
 */
396
int
397
0
xmlAddEncodingAlias(const char *name, const char *alias) {
398
0
    int i;
399
0
    char upper[100];
400
0
    char *nameCopy, *aliasCopy;
401
402
0
    if ((name == NULL) || (alias == NULL))
403
0
  return(-1);
404
405
0
    for (i = 0;i < 99;i++) {
406
0
        upper[i] = (char) toupper((unsigned char) alias[i]);
407
0
  if (upper[i] == 0) break;
408
0
    }
409
0
    upper[i] = 0;
410
411
0
    if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
412
0
        xmlCharEncodingAliasPtr tmp;
413
0
        int newSize;
414
415
0
        newSize = xmlGrowCapacity(xmlCharEncodingAliasesMax, sizeof(tmp[0]),
416
0
                                  20, XML_MAX_ITEMS);
417
0
        if (newSize < 0)
418
0
            return(-1);
419
0
        tmp = xmlRealloc(xmlCharEncodingAliases, newSize * sizeof(tmp[0]));
420
0
        if (tmp == NULL)
421
0
            return(-1);
422
0
        xmlCharEncodingAliases = tmp;
423
0
        xmlCharEncodingAliasesMax = newSize;
424
0
    }
425
426
    /*
427
     * Walk down the list looking for a definition of the alias
428
     */
429
0
    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
430
0
  if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
431
      /*
432
       * Replace the definition.
433
       */
434
0
      nameCopy = xmlMemStrdup(name);
435
0
            if (nameCopy == NULL)
436
0
                return(-1);
437
0
      xmlFree((char *) xmlCharEncodingAliases[i].name);
438
0
      xmlCharEncodingAliases[i].name = nameCopy;
439
0
      return(0);
440
0
  }
441
0
    }
442
    /*
443
     * Add the definition
444
     */
445
0
    nameCopy = xmlMemStrdup(name);
446
0
    if (nameCopy == NULL)
447
0
        return(-1);
448
0
    aliasCopy = xmlMemStrdup(upper);
449
0
    if (aliasCopy == NULL) {
450
0
        xmlFree(nameCopy);
451
0
        return(-1);
452
0
    }
453
0
    xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = nameCopy;
454
0
    xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = aliasCopy;
455
0
    xmlCharEncodingAliasesNb++;
456
0
    return(0);
457
0
}
458
459
/**
460
 * xmlDelEncodingAlias:
461
 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
462
 *
463
 * DEPRECATED: This function modifies global state and is not
464
 * thread-safe.
465
 *
466
 * Unregisters an encoding alias @alias
467
 *
468
 * Returns 0 in case of success, -1 in case of error
469
 */
470
int
471
0
xmlDelEncodingAlias(const char *alias) {
472
0
    int i;
473
474
0
    if (alias == NULL)
475
0
  return(-1);
476
477
0
    if (xmlCharEncodingAliases == NULL)
478
0
  return(-1);
479
    /*
480
     * Walk down the list looking for a definition of the alias
481
     */
482
0
    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
483
0
  if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
484
0
      xmlFree((char *) xmlCharEncodingAliases[i].name);
485
0
      xmlFree((char *) xmlCharEncodingAliases[i].alias);
486
0
      xmlCharEncodingAliasesNb--;
487
0
      memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
488
0
        sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
489
0
      return(0);
490
0
  }
491
0
    }
492
0
    return(-1);
493
0
}
494
495
static int
496
900k
xmlCompareEncTableEntries(const void *vkey, const void *ventry) {
497
900k
    const char *key = vkey;
498
900k
    const xmlEncTableEntry *entry = ventry;
499
500
900k
    return(xmlStrcasecmp(BAD_CAST key, BAD_CAST entry->name));
501
900k
}
502
503
static xmlCharEncoding
504
xmlParseCharEncodingInternal(const char *name)
505
180k
{
506
180k
    const xmlEncTableEntry *entry;
507
508
180k
    if (name == NULL)
509
0
       return(XML_CHAR_ENCODING_NONE);
510
511
180k
    entry = bsearch(name, xmlEncTable,
512
180k
                    sizeof(xmlEncTable) / sizeof(xmlEncTable[0]),
513
180k
                    sizeof(xmlEncTable[0]), xmlCompareEncTableEntries);
514
180k
    if (entry != NULL)
515
179k
        return(entry->enc);
516
517
253
    return(XML_CHAR_ENCODING_ERROR);
518
180k
}
519
520
/**
521
 * xmlParseCharEncoding:
522
 * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
523
 *
524
 * Compare the string to the encoding schemes already known. Note
525
 * that the comparison is case insensitive accordingly to the section
526
 * [XML] 4.3.3 Character Encoding in Entities.
527
 *
528
 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
529
 * if not recognized.
530
 */
531
xmlCharEncoding
532
xmlParseCharEncoding(const char *name)
533
0
{
534
0
    xmlCharEncoding enc = xmlParseCharEncodingInternal(name);
535
536
    /* Backward compatibility */
537
0
    if (enc == XML_CHAR_ENCODING_UTF16)
538
0
        enc = XML_CHAR_ENCODING_UTF16LE;
539
540
0
    return(enc);
541
0
}
542
543
/**
544
 * xmlGetCharEncodingName:
545
 * @enc:  the encoding
546
 *
547
 * The "canonical" name for XML encoding.
548
 * C.f. http://www.w3.org/TR/REC-xml#charencoding
549
 * Section 4.3.3  Character Encoding in Entities
550
 *
551
 * Returns the canonical name for the given encoding
552
 */
553
const char*
554
0
xmlGetCharEncodingName(xmlCharEncoding enc) {
555
0
    switch (enc) {
556
0
        case XML_CHAR_ENCODING_UTF16LE:
557
0
      return("UTF-16");
558
0
        case XML_CHAR_ENCODING_UTF16BE:
559
0
      return("UTF-16");
560
0
        case XML_CHAR_ENCODING_UCS4LE:
561
0
            return("UCS-4");
562
0
        case XML_CHAR_ENCODING_UCS4BE:
563
0
            return("UCS-4");
564
0
        default:
565
0
            break;
566
0
    }
567
568
0
    if ((enc <= 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS))
569
0
        return(NULL);
570
571
0
    return(defaultHandlers[enc].name);
572
0
}
573
574
/************************************************************************
575
 *                  *
576
 *      Char encoding handlers        *
577
 *                  *
578
 ************************************************************************/
579
580
/**
581
 * xmlNewCharEncodingHandler:
582
 * @name:  the encoding name, in UTF-8 format (ASCII actually)
583
 * @input:  the xmlCharEncodingInputFunc to read that encoding
584
 * @output:  the xmlCharEncodingOutputFunc to write that encoding
585
 *
586
 * DEPRECATED: This function modifies global state and is not
587
 * thread-safe.
588
 *
589
 * Create and registers an xmlCharEncodingHandler.
590
 *
591
 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
592
 */
593
xmlCharEncodingHandlerPtr
594
xmlNewCharEncodingHandler(const char *name,
595
                          xmlCharEncodingInputFunc input,
596
0
                          xmlCharEncodingOutputFunc output) {
597
0
    xmlCharEncodingHandlerPtr handler;
598
0
    const char *alias;
599
0
    char upper[500];
600
0
    int i;
601
0
    char *up = NULL;
602
603
    /*
604
     * Do the alias resolution
605
     */
606
0
    alias = xmlGetEncodingAlias(name);
607
0
    if (alias != NULL)
608
0
  name = alias;
609
610
    /*
611
     * Keep only the uppercase version of the encoding.
612
     */
613
0
    if (name == NULL)
614
0
  return(NULL);
615
0
    for (i = 0;i < 499;i++) {
616
0
        upper[i] = (char) toupper((unsigned char) name[i]);
617
0
  if (upper[i] == 0) break;
618
0
    }
619
0
    upper[i] = 0;
620
0
    up = xmlMemStrdup(upper);
621
0
    if (up == NULL)
622
0
  return(NULL);
623
624
    /*
625
     * allocate and fill-up an handler block.
626
     */
627
0
    handler = (xmlCharEncodingHandlerPtr)
628
0
              xmlMalloc(sizeof(xmlCharEncodingHandler));
629
0
    if (handler == NULL) {
630
0
        xmlFree(up);
631
0
  return(NULL);
632
0
    }
633
0
    memset(handler, 0, sizeof(xmlCharEncodingHandler));
634
0
    handler->input.legacyFunc = input;
635
0
    handler->output.legacyFunc = output;
636
0
    handler->name = up;
637
0
    handler->flags = XML_HANDLER_STATIC | XML_HANDLER_LEGACY;
638
639
    /*
640
     * registers and returns the handler.
641
     */
642
0
    xmlRegisterCharEncodingHandler(handler);
643
0
    return(handler);
644
0
}
645
646
/**
647
 * xmlCharEncNewCustomHandler:
648
 * @name:  the encoding name
649
 * @input:  input callback which converts to UTF-8
650
 * @output:  output callback which converts from UTF-8
651
 * @ctxtDtor:  context destructor
652
 * @inputCtxt:  context for input callback
653
 * @outputCtxt:  context for output callback
654
 * @out:  pointer to resulting handler
655
 *
656
 * Create a custom xmlCharEncodingHandler.
657
 *
658
 * Returns an xmlParserErrors code.
659
 */
660
xmlParserErrors
661
xmlCharEncNewCustomHandler(const char *name,
662
                           xmlCharEncConvFunc input, xmlCharEncConvFunc output,
663
                           xmlCharEncConvCtxtDtor ctxtDtor,
664
                           void *inputCtxt, void *outputCtxt,
665
0
                           xmlCharEncodingHandler **out) {
666
0
    xmlCharEncodingHandler *handler;
667
668
0
    if (out == NULL)
669
0
        return(XML_ERR_ARGUMENT);
670
671
0
    handler = xmlMalloc(sizeof(*handler));
672
0
    if (handler == NULL)
673
0
        goto error;
674
0
    memset(handler, 0, sizeof(*handler));
675
676
0
    if (name != NULL) {
677
0
        handler->name = xmlMemStrdup(name);
678
0
        if (handler->name == NULL)
679
0
            goto error;
680
0
    }
681
682
0
    handler->input.func = input;
683
0
    handler->output.func = output;
684
0
    handler->ctxtDtor = ctxtDtor;
685
0
    handler->inputCtxt = inputCtxt;
686
0
    handler->outputCtxt = outputCtxt;
687
688
0
    *out = handler;
689
0
    return(XML_ERR_OK);
690
691
0
error:
692
0
    xmlFree(handler);
693
694
0
    if (ctxtDtor != NULL) {
695
0
        if (inputCtxt != NULL)
696
0
            ctxtDtor(inputCtxt);
697
0
        if (outputCtxt != NULL)
698
0
            ctxtDtor(outputCtxt);
699
0
    }
700
701
0
    return(XML_ERR_NO_MEMORY);
702
0
}
703
704
/**
 * xmlInitCharEncodingHandlers:
 *
 * DEPRECATED: Alias for xmlInitParser.
 *
 * Does nothing beyond calling xmlInitParser.
 */
void
xmlInitCharEncodingHandlers(void)
{
    xmlInitParser();
}
713
714
/**
715
 * xmlInitEncodingInternal:
716
 *
717
 * Initialize the char encoding support.
718
 */
719
void
720
18
xmlInitEncodingInternal(void) {
721
18
    unsigned short int tst = 0x1234;
722
18
    unsigned char *ptr = (unsigned char *) &tst;
723
724
18
    if (*ptr == 0x12) xmlLittleEndian = 0;
725
18
    else xmlLittleEndian = 1;
726
18
}
727
728
/**
729
 * xmlCleanupCharEncodingHandlers:
730
 *
731
 * DEPRECATED: This function will be made private. Call xmlCleanupParser
732
 * to free global state but see the warnings there. xmlCleanupParser
733
 * should be only called once at program exit. In most cases, you don't
734
 * have call cleanup functions at all.
735
 *
736
 * Cleanup the memory allocated for the char encoding support, it
737
 * unregisters all the encoding handlers and the aliases.
738
 */
739
void
740
0
xmlCleanupCharEncodingHandlers(void) {
741
0
    xmlCleanupEncodingAliases();
742
743
0
    if (globalHandlers == NULL) return;
744
745
0
    for (;nbCharEncodingHandler > 0;) {
746
0
        xmlCharEncodingHandler *handler;
747
748
0
        nbCharEncodingHandler--;
749
0
        handler = globalHandlers[nbCharEncodingHandler];
750
0
  if (handler != NULL) {
751
0
      if (handler->name != NULL)
752
0
    xmlFree(handler->name);
753
0
      xmlFree(handler);
754
0
  }
755
0
    }
756
0
    xmlFree(globalHandlers);
757
0
    globalHandlers = NULL;
758
0
    nbCharEncodingHandler = 0;
759
0
}
760
761
/**
762
 * xmlRegisterCharEncodingHandler:
763
 * @handler:  the xmlCharEncodingHandlerPtr handler block
764
 *
765
 * DEPRECATED: This function modifies global state and is not
766
 * thread-safe.
767
 *
768
 * Register the char encoding handler.
769
 */
770
void
771
0
xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
772
0
    if (handler == NULL)
773
0
        return;
774
0
    if (globalHandlers == NULL) {
775
0
        globalHandlers = xmlMalloc(
776
0
                MAX_ENCODING_HANDLERS * sizeof(globalHandlers[0]));
777
0
        if (globalHandlers == NULL)
778
0
            goto free_handler;
779
0
    }
780
781
0
    if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS)
782
0
        goto free_handler;
783
0
    globalHandlers[nbCharEncodingHandler++] = handler;
784
0
    return;
785
786
0
free_handler:
787
0
    if (handler != NULL) {
788
0
        if (handler->name != NULL) {
789
0
            xmlFree(handler->name);
790
0
        }
791
0
        xmlFree(handler);
792
0
    }
793
0
}
794
795
/**
796
 * xmlFindExtraHandler:
797
 * @norig:  name of the char encoding
798
 * @name:  potentially aliased name of the encoding
799
 * @flags:  bit mask of flags
800
 * @impl:  a conversion implementation (optional)
801
 * @implCtxt:  user data for conversion implementation (optional)
802
 * @out:  pointer to resulting handler
803
 *
804
 * Search the non-default handlers for an exact match.
805
 *
806
 * Returns an xmlParserErrors error code.
807
 */
808
static xmlParserErrors
809
xmlFindExtraHandler(const char *norig, const char *name, xmlCharEncFlags flags,
810
                    xmlCharEncConvImpl impl, void *implCtxt,
811
275
                    xmlCharEncodingHandler **out) {
812
    /*
813
     * Try custom implementation before deprecated global handlers.
814
     *
815
     * Note that we pass the original name without deprecated
816
     * alias resolution.
817
     */
818
275
    if (impl != NULL)
819
0
        return(impl(implCtxt, norig, flags, out));
820
821
    /*
822
     * Deprecated
823
     */
824
275
    if (globalHandlers != NULL) {
825
0
        int i;
826
827
0
        for (i = 0; i < nbCharEncodingHandler; i++) {
828
0
            xmlCharEncodingHandler *h = globalHandlers[i];
829
830
0
            if (!xmlStrcasecmp((const xmlChar *) name,
831
0
                               (const xmlChar *) h->name)) {
832
0
                if ((((flags & XML_ENC_INPUT) == 0) || (h->input.func)) &&
833
0
                    (((flags & XML_ENC_OUTPUT) == 0) || (h->output.func))) {
834
0
                    *out = h;
835
0
                    return(XML_ERR_OK);
836
0
                }
837
0
            }
838
0
        }
839
0
    }
840
841
#ifdef LIBXML_ICONV_ENABLED
842
    {
843
        int ret = xmlCharEncIconv(name, flags, out);
844
845
        if (ret == XML_ERR_OK)
846
            return(XML_ERR_OK);
847
        if (ret != XML_ERR_UNSUPPORTED_ENCODING)
848
            return(ret);
849
    }
850
#endif /* LIBXML_ICONV_ENABLED */
851
852
#ifdef LIBXML_ICU_ENABLED
853
    {
854
        int ret = xmlCharEncUconv(name, flags, out);
855
856
        if (ret == XML_ERR_OK)
857
            return(XML_ERR_OK);
858
        if (ret != XML_ERR_UNSUPPORTED_ENCODING)
859
            return(ret);
860
    }
861
#endif /* LIBXML_ICU_ENABLED */
862
863
275
    return(XML_ERR_UNSUPPORTED_ENCODING);
864
275
}
865
866
/**
867
 * xmlLookupCharEncodingHandler:
868
 * @enc:  an xmlCharEncoding value.
869
 * @out:  pointer to result
870
 *
871
 * Find or create a handler matching the encoding. The following
872
 * converters are looked up in order:
873
 *
874
 * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
875
 * - User-registered global handler (deprecated)
876
 * - iconv if enabled
877
 * - ICU if enabled
878
 *
879
 * The handler must be closed with xmlCharEncCloseFunc.
880
 *
881
 * If the encoding is UTF-8, a NULL handler and no error code will
882
 * be returned.
883
 *
884
 * Available since 2.13.0.
885
 *
886
 * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
887
 * xmlParserErrors error code.
888
 */
889
xmlParserErrors
890
xmlLookupCharEncodingHandler(xmlCharEncoding enc,
891
10.4k
                             xmlCharEncodingHandler **out) {
892
10.4k
    const xmlCharEncodingHandler *handler;
893
894
10.4k
    if (out == NULL)
895
0
        return(XML_ERR_ARGUMENT);
896
10.4k
    *out = NULL;
897
898
10.4k
    if ((enc <= 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS))
899
0
        return(XML_ERR_UNSUPPORTED_ENCODING);
900
901
    /* Return NULL handler for UTF-8 */
902
10.4k
    if ((enc == XML_CHAR_ENCODING_UTF8) ||
903
10.4k
        (enc == XML_CHAR_ENCODING_NONE))
904
8.51k
        return(XML_ERR_OK);
905
906
1.89k
    handler = &defaultHandlers[enc];
907
1.89k
    if ((handler->input.func != NULL) || (handler->output.func != NULL)) {
908
1.88k
        *out = (xmlCharEncodingHandler *) handler;
909
1.88k
        return(XML_ERR_OK);
910
1.88k
    }
911
912
17
    if (handler->name != NULL) {
913
17
        xmlCharEncFlags flags = XML_ENC_INPUT;
914
915
17
#ifdef LIBXML_OUTPUT_ENABLED
916
17
        flags |= XML_ENC_OUTPUT;
917
17
#endif
918
17
        return(xmlFindExtraHandler(handler->name, handler->name, flags,
919
17
                                   NULL, NULL, out));
920
17
    }
921
922
0
    return(XML_ERR_UNSUPPORTED_ENCODING);
923
17
}
924
925
/**
926
 * xmlGetCharEncodingHandler:
927
 * @enc:  an xmlCharEncoding value.
928
 *
929
 * DEPRECATED: Use xmlLookupCharEncodingHandler which has better error
930
 * reporting.
931
 *
932
 * Returns the handler or NULL if no handler was found or an error
933
 * occurred.
934
 */
935
xmlCharEncodingHandlerPtr
936
8.47k
xmlGetCharEncodingHandler(xmlCharEncoding enc) {
937
8.47k
    xmlCharEncodingHandler *ret;
938
939
8.47k
    xmlLookupCharEncodingHandler(enc, &ret);
940
8.47k
    return(ret);
941
8.47k
}
942
943
/**
944
 * xmlCreateCharEncodingHandler:
945
 * @name:  a string describing the char encoding.
946
 * @flags:  bit mask of flags
947
 * @impl:  a conversion implementation (optional)
948
 * @implCtxt:  user data for conversion implementation (optional)
949
 * @out:  pointer to result
950
 *
951
 * Find or create a handler matching the encoding. The following
952
 * converters are looked up in order:
953
 *
954
 * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
955
 * - Custom implementation if provided
956
 * - User-registered global handler (deprecated)
957
 * - iconv if enabled
958
 * - ICU if enabled
959
 *
960
 * The handler must be closed with xmlCharEncCloseFunc.
961
 *
962
 * If the encoding is UTF-8, a NULL handler and no error code will
963
 * be returned.
964
 *
965
 * @flags can contain XML_ENC_INPUT, XML_ENC_OUTPUT or both.
966
 *
967
 * Available since 2.14.0.
968
 *
969
 * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
970
 * xmlParserErrors error code.
971
 */
972
xmlParserErrors
973
xmlCreateCharEncodingHandler(const char *name, xmlCharEncFlags flags,
974
                             xmlCharEncConvImpl impl, void *implCtxt,
975
180k
                             xmlCharEncodingHandler **out) {
976
180k
    const xmlCharEncodingHandler *handler;
977
180k
    const char *norig, *nalias;
978
180k
    xmlCharEncoding enc;
979
980
180k
    if (out == NULL)
981
0
        return(XML_ERR_ARGUMENT);
982
180k
    *out = NULL;
983
984
180k
    if ((name == NULL) || (flags == 0))
985
0
        return(XML_ERR_ARGUMENT);
986
987
180k
    norig = name;
988
180k
    nalias = xmlGetEncodingAlias(name);
989
180k
    if (nalias != NULL)
990
0
  name = nalias;
991
992
180k
    enc = xmlParseCharEncodingInternal(name);
993
994
    /* Return NULL handler for UTF-8 */
995
180k
    if (enc == XML_CHAR_ENCODING_UTF8)
996
178k
        return(XML_ERR_OK);
997
998
1.22k
    if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) {
999
968
        handler = &defaultHandlers[enc];
1000
968
        if ((((flags & XML_ENC_INPUT) == 0) || (handler->input.func)) &&
1001
968
            (((flags & XML_ENC_OUTPUT) == 0) || (handler->output.func))) {
1002
963
            *out = (xmlCharEncodingHandler *) handler;
1003
963
            return(XML_ERR_OK);
1004
963
        }
1005
968
    }
1006
1007
258
    return(xmlFindExtraHandler(norig, name, flags, impl, implCtxt, out));
1008
1.22k
}
1009
1010
/**
1011
 * xmlOpenCharEncodingHandler:
1012
 * @name:  a string describing the char encoding.
1013
 * @output:  boolean, use handler for output
1014
 * @out:  pointer to result
1015
 *
1016
 * Find or create a handler matching the encoding. The following
1017
 * converters are looked up in order:
1018
 *
1019
 * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
1020
 * - User-registered global handler (deprecated)
1021
 * - iconv if enabled
1022
 * - ICU if enabled
1023
 *
1024
 * The handler must be closed with xmlCharEncCloseFunc.
1025
 *
1026
 * If the encoding is UTF-8, a NULL handler and no error code will
1027
 * be returned.
1028
 *
1029
 * Available since 2.13.0.
1030
 *
1031
 * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
1032
 * xmlParserErrors error code.
1033
 */
1034
xmlParserErrors
1035
xmlOpenCharEncodingHandler(const char *name, int output,
1036
0
                           xmlCharEncodingHandler **out) {
1037
0
    xmlCharEncFlags flags = output ? XML_ENC_OUTPUT : XML_ENC_INPUT;
1038
1039
0
    return(xmlCreateCharEncodingHandler(name, flags, NULL, NULL, out));
1040
0
}
1041
1042
/**
1043
 * xmlFindCharEncodingHandler:
1044
 * @name:  a string describing the char encoding.
1045
 *
1046
 * DEPRECATED: Use xmlOpenCharEncodingHandler which has better error
1047
 * reporting.
1048
 *
1049
 * If the encoding is UTF-8, this will return a no-op handler that
1050
 * shouldn't be used.
1051
 *
1052
 * Returns the handler or NULL if no handler was found or an error
1053
 * occurred.
1054
 */
1055
xmlCharEncodingHandlerPtr
1056
0
xmlFindCharEncodingHandler(const char *name) {
1057
0
    xmlCharEncodingHandler *ret;
1058
0
    xmlCharEncFlags flags;
1059
1060
    /*
1061
     * This handler shouldn't be used, but we must return a non-NULL
1062
     * handler.
1063
     */
1064
0
    if ((xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF-8") == 0) ||
1065
0
        (xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF8") == 0))
1066
0
        return((xmlCharEncodingHandlerPtr)
1067
0
                &defaultHandlers[XML_CHAR_ENCODING_UTF8]);
1068
1069
0
    flags = XML_ENC_INPUT;
1070
0
#ifdef LIBXML_OUTPUT_ENABLED
1071
0
    flags |= XML_ENC_OUTPUT;
1072
0
#endif
1073
0
    xmlCreateCharEncodingHandler(name, flags, NULL, NULL, &ret);
1074
0
    return(ret);
1075
0
}
1076
1077
/************************************************************************
1078
 *                  *
1079
 *    ICONV based generic conversion functions    *
1080
 *                  *
1081
 ************************************************************************/
1082
1083
#ifdef LIBXML_ICONV_ENABLED
1084
/* Per-direction state of an iconv-backed encoding handler. */
typedef struct {
    /* Conversion descriptor; (iconv_t) -1 while no descriptor is open. */
    iconv_t cd;
} xmlIconvCtxt;
1087
1088
/**
1089
 * xmlIconvConvert:
1090
 * @vctxt:  conversion context
1091
 * @out:  a pointer to an array of bytes to store the result
1092
 * @outlen:  the length of @out
1093
 * @in:  a pointer to an array of input bytes
1094
 * @inlen:  the length of @in
1095
 * @flush:  end of input
1096
 *
1097
 * The value of @inlen after return is the number of octets consumed
1098
 *     as the return value is positive, else unpredictable.
1099
 * The value of @outlen after return is the number of octets produced.
1100
 *
1101
 * Returns an XML_ENC_ERR code.
1102
 */
1103
static xmlCharEncError
1104
xmlIconvConvert(void *vctxt, unsigned char *out, int *outlen,
1105
                const unsigned char *in, int *inlen,
1106
                int flush ATTRIBUTE_UNUSED) {
1107
    xmlIconvCtxt *ctxt = vctxt;
1108
    size_t icv_inlen, icv_outlen;
1109
    const char *icv_in = (const char *) in;
1110
    char *icv_out = (char *) out;
1111
    size_t ret;
1112
1113
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
1114
        if (outlen != NULL) *outlen = 0;
1115
        return(XML_ENC_ERR_INTERNAL);
1116
    }
1117
    icv_inlen = *inlen;
1118
    icv_outlen = *outlen;
1119
    /*
1120
     * Some versions take const, other versions take non-const input.
1121
     */
1122
    ret = iconv(ctxt->cd, (void *) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
1123
    *inlen -= icv_inlen;
1124
    *outlen -= icv_outlen;
1125
    if (ret == (size_t) -1) {
1126
        if (errno == EILSEQ)
1127
            return(XML_ENC_ERR_INPUT);
1128
        if (errno == E2BIG)
1129
            return(XML_ENC_ERR_SPACE);
1130
        /*
1131
         * EINVAL means a truncated multi-byte sequence at the end
1132
         * of the input buffer. We treat this as success.
1133
         */
1134
        if (errno == EINVAL)
1135
            return(XML_ENC_ERR_SUCCESS);
1136
#ifdef __APPLE__
1137
        /*
1138
         * Apple's new libiconv can return EOPNOTSUPP under
1139
         * unknown circumstances (detected when fuzzing).
1140
         */
1141
        if (errno == EOPNOTSUPP)
1142
            return(XML_ENC_ERR_INPUT);
1143
#endif
1144
        return(XML_ENC_ERR_INTERNAL);
1145
    }
1146
    return(XML_ENC_ERR_SUCCESS);
1147
}
1148
1149
static void
1150
xmlIconvFree(void *vctxt) {
1151
    xmlIconvCtxt *ctxt = vctxt;
1152
1153
    if (ctxt == NULL)
1154
        return;
1155
1156
    if (ctxt->cd != (iconv_t) -1)
1157
        iconv_close(ctxt->cd);
1158
1159
    xmlFree(ctxt);
1160
}
1161
1162
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && \
1163
    defined(__GLIBC__)
1164
#include <libxml/parserInternals.h>
1165
1166
/*
 * Fuzzy comparison of two encoding names: every character that is not
 * an ASCII letter is skipped and letters are compared without regard
 * to case.
 *
 * Returns 1 if the names match, 0 otherwise.
 */
static int
xmlEncodingMatch(const char *name1, const char *name2) {
    for (;;) {
        /* Advance both cursors to the next letter or to the terminator. */
        while ((*name1 != 0) && (!IS_ASCII_LETTER(*name1)))
            name1++;
        while ((*name2 != 0) && (!IS_ASCII_LETTER(*name2)))
            name2++;

        /* Once either name is exhausted, both must be for a match. */
        if ((*name1 == 0) || (*name2 == 0))
            return((*name1 == 0) && (*name2 == 0));

        /* Setting bit 0x20 folds ASCII letters to lower case. */
        if ((*name1 | 0x20) != (*name2 | 0x20))
            return(0);

        name1++;
        name2++;
    }
}
1186
#endif /* FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION */
1187
1188
static xmlParserErrors
1189
xmlCharEncIconv(const char *name, xmlCharEncFlags flags,
1190
                xmlCharEncodingHandler **out) {
1191
    xmlCharEncConvFunc inFunc = NULL, outFunc = NULL;
1192
    xmlIconvCtxt *inputCtxt = NULL, *outputCtxt = NULL;
1193
    iconv_t icv_in;
1194
    iconv_t icv_out;
1195
    xmlParserErrors ret;
1196
1197
    /*
1198
     * POSIX allows "indicator suffixes" like "//IGNORE" to be
1199
     * passed to iconv_open. This can change the behavior in
1200
     * unexpected ways.
1201
     *
1202
     * Many iconv implementations also support non-standard
1203
     * codesets like "wchar_t", "char" or the empty string "".
1204
     * It would make sense to disallow them, but codeset names
1205
     * are matched fuzzily, so a string like "w-C.hA_rt" could
1206
     * be interpreted as "wchar_t".
1207
     *
1208
     * When escaping characters that aren't supported in the
1209
     * target encoding, we also rely on GNU libiconv behavior to
1210
     * stop conversion without trying any kind of fallback.
1211
     * This violates the POSIX spec which says:
1212
     *
1213
     * > If iconv() encounters a character in the input buffer
1214
     * > that is valid, but for which an identical character does
1215
     * > not exist in the output codeset [...] iconv() shall
1216
     * > perform an implementation-defined conversion on the
1217
     * > character.
1218
     *
1219
     * See: https://sourceware.org/bugzilla/show_bug.cgi?id=29913
1220
     *
1221
     * Unfortunately, strict POSIX compliance makes it impossible
1222
     * to detect untranslatable characters.
1223
     */
1224
    if (strstr(name, "//") != NULL) {
1225
        ret = XML_ERR_UNSUPPORTED_ENCODING;
1226
        goto error;
1227
    }
1228
1229
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && \
1230
    defined(__GLIBC__)
1231
    /*
1232
     * This glibc bug can lead to unpredictable results with the
1233
     * push parser.
1234
     *
1235
     * https://sourceware.org/bugzilla/show_bug.cgi?id=32633
1236
     */
1237
    if ((xmlEncodingMatch(name, "TSCII")) ||
1238
        (xmlEncodingMatch(name, "BIG5-HKSCS"))) {
1239
        ret = XML_ERR_UNSUPPORTED_ENCODING;
1240
        goto error;
1241
    }
1242
#endif
1243
1244
    if (flags & XML_ENC_INPUT) {
1245
        inputCtxt = xmlMalloc(sizeof(xmlIconvCtxt));
1246
        if (inputCtxt == NULL) {
1247
            ret = XML_ERR_NO_MEMORY;
1248
            goto error;
1249
        }
1250
        inputCtxt->cd = (iconv_t) -1;
1251
1252
        icv_in = iconv_open("UTF-8", name);
1253
        if (icv_in == (iconv_t) -1) {
1254
            if (errno == EINVAL)
1255
                ret = XML_ERR_UNSUPPORTED_ENCODING;
1256
            else if (errno == ENOMEM)
1257
                ret = XML_ERR_NO_MEMORY;
1258
            else
1259
                ret = XML_ERR_SYSTEM;
1260
            goto error;
1261
        }
1262
        inputCtxt->cd = icv_in;
1263
1264
        inFunc = xmlIconvConvert;
1265
    }
1266
1267
    if (flags & XML_ENC_OUTPUT) {
1268
        outputCtxt = xmlMalloc(sizeof(xmlIconvCtxt));
1269
        if (outputCtxt == NULL) {
1270
            ret = XML_ERR_NO_MEMORY;
1271
            goto error;
1272
        }
1273
        outputCtxt->cd = (iconv_t) -1;
1274
1275
        icv_out = iconv_open(name, "UTF-8");
1276
        if (icv_out == (iconv_t) -1) {
1277
            if (errno == EINVAL)
1278
                ret = XML_ERR_UNSUPPORTED_ENCODING;
1279
            else if (errno == ENOMEM)
1280
                ret = XML_ERR_NO_MEMORY;
1281
            else
1282
                ret = XML_ERR_SYSTEM;
1283
            goto error;
1284
        }
1285
        outputCtxt->cd = icv_out;
1286
1287
        outFunc = xmlIconvConvert;
1288
    }
1289
1290
    return(xmlCharEncNewCustomHandler(name, inFunc, outFunc, xmlIconvFree,
1291
                                      inputCtxt, outputCtxt, out));
1292
1293
error:
1294
    if (inputCtxt != NULL)
1295
        xmlIconvFree(inputCtxt);
1296
    if (outputCtxt != NULL)
1297
        xmlIconvFree(outputCtxt);
1298
    return(ret);
1299
}
1300
#endif /* LIBXML_ICONV_ENABLED */
1301
1302
/************************************************************************
1303
 *                  *
1304
 *    ICU based generic conversion functions    *
1305
 *                  *
1306
 ************************************************************************/
1307
1308
#ifdef LIBXML_ICU_ENABLED
1309
/* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */
1310
#define ICU_PIVOT_BUF_SIZE 1024
1311
1312
typedef struct _uconv_t xmlUconvCtxt;
1313
struct _uconv_t {
1314
  UConverter *uconv; /* for conversion between an encoding and UTF-16 */
1315
  UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
1316
  UChar      *pivot_source;
1317
  UChar      *pivot_target;
1318
  int        isInput;
1319
  UChar      pivot_buf[ICU_PIVOT_BUF_SIZE];
1320
};
1321
1322
/**
1323
 * xmlUconvConvert:
1324
 * @vctxt:  conversion context
1325
 * @out:  a pointer to an array of bytes to store the result
1326
 * @outlen:  the length of @out
1327
 * @in:  a pointer to an array of input bytes
1328
 * @inlen:  the length of @in
1329
 * @flush:  end of input
1330
 *
1331
 * Returns an XML_ENC_ERR code.
1332
 *
1333
 * The value of @inlen after return is the number of octets consumed
1334
 *     as the return value is positive, else unpredictable.
1335
 * The value of @outlen after return is the number of octets produced.
1336
 */
1337
static xmlCharEncError
1338
xmlUconvConvert(void *vctxt, unsigned char *out, int *outlen,
1339
                const unsigned char *in, int *inlen, int flush) {
1340
    xmlUconvCtxt *cd = vctxt;
1341
    const char *ucv_in = (const char *) in;
1342
    char *ucv_out = (char *) out;
1343
    UConverter *target, *source;
1344
    UErrorCode err = U_ZERO_ERROR;
1345
    int ret;
1346
1347
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
1348
        if (outlen != NULL)
1349
            *outlen = 0;
1350
        return(XML_ENC_ERR_INTERNAL);
1351
    }
1352
1353
    /*
1354
     * The ICU API can consume input, including partial sequences,
1355
     * even if the output buffer would overflow. The remaining input
1356
     * must be processed by calling ucnv_convertEx with a possibly
1357
     * empty input buffer.
1358
     */
1359
    if (cd->isInput) {
1360
        source = cd->uconv;
1361
        target = cd->utf8;
1362
    } else {
1363
        source = cd->utf8;
1364
        target = cd->uconv;
1365
    }
1366
1367
    ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen,
1368
                   &ucv_in, ucv_in + *inlen, cd->pivot_buf,
1369
                   &cd->pivot_source, &cd->pivot_target,
1370
                   cd->pivot_buf + ICU_PIVOT_BUF_SIZE,
1371
                   /* reset */ 0, flush, &err);
1372
1373
    *inlen = ucv_in - (const char*) in;
1374
    *outlen = ucv_out - (char *) out;
1375
1376
    if (U_SUCCESS(err)) {
1377
        ret = XML_ENC_ERR_SUCCESS;
1378
    } else {
1379
        switch (err) {
1380
            case U_TRUNCATED_CHAR_FOUND:
1381
                /* Should only happen with flush */
1382
                ret = XML_ENC_ERR_INPUT;
1383
                break;
1384
1385
            case U_BUFFER_OVERFLOW_ERROR:
1386
                ret = XML_ENC_ERR_SPACE;
1387
                break;
1388
1389
            case U_INVALID_CHAR_FOUND:
1390
            case U_ILLEGAL_CHAR_FOUND:
1391
            case U_ILLEGAL_ESCAPE_SEQUENCE:
1392
            case U_UNSUPPORTED_ESCAPE_SEQUENCE:
1393
                ret = XML_ENC_ERR_INPUT;
1394
                break;
1395
1396
            case U_MEMORY_ALLOCATION_ERROR:
1397
                ret = XML_ENC_ERR_MEMORY;
1398
                break;
1399
1400
            default:
1401
                ret = XML_ENC_ERR_INTERNAL;
1402
                break;
1403
        }
1404
    }
1405
1406
    return(ret);
1407
}
1408
1409
static xmlParserErrors
1410
openIcuConverter(const char* name, int isInput, xmlUconvCtxt **out)
1411
{
1412
    UErrorCode status;
1413
    xmlUconvCtxt *conv;
1414
1415
    *out = NULL;
1416
1417
    conv = (xmlUconvCtxt *) xmlMalloc(sizeof(xmlUconvCtxt));
1418
    if (conv == NULL)
1419
        return(XML_ERR_NO_MEMORY);
1420
1421
    conv->isInput = isInput;
1422
    conv->pivot_source = conv->pivot_buf;
1423
    conv->pivot_target = conv->pivot_buf;
1424
1425
    status = U_ZERO_ERROR;
1426
    conv->uconv = ucnv_open(name, &status);
1427
    if (U_FAILURE(status))
1428
        goto error;
1429
1430
    status = U_ZERO_ERROR;
1431
    if (isInput) {
1432
        ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP,
1433
                                                NULL, NULL, NULL, &status);
1434
    }
1435
    else {
1436
        ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP,
1437
                                                NULL, NULL, NULL, &status);
1438
    }
1439
    if (U_FAILURE(status))
1440
        goto error;
1441
1442
    status = U_ZERO_ERROR;
1443
    conv->utf8 = ucnv_open("UTF-8", &status);
1444
    if (U_FAILURE(status))
1445
        goto error;
1446
1447
    *out = conv;
1448
    return(XML_ERR_OK);
1449
1450
error:
1451
    if (conv->uconv)
1452
        ucnv_close(conv->uconv);
1453
    xmlFree(conv);
1454
1455
    if (status == U_FILE_ACCESS_ERROR)
1456
        return(XML_ERR_UNSUPPORTED_ENCODING);
1457
    if (status == U_MEMORY_ALLOCATION_ERROR)
1458
        return(XML_ERR_NO_MEMORY);
1459
    return(XML_ERR_SYSTEM);
1460
}
1461
1462
static void
1463
closeIcuConverter(xmlUconvCtxt *conv)
1464
{
1465
    if (conv == NULL)
1466
        return;
1467
    ucnv_close(conv->uconv);
1468
    ucnv_close(conv->utf8);
1469
    xmlFree(conv);
1470
}
1471
1472
static void
1473
xmlUconvFree(void *vctxt) {
1474
    closeIcuConverter(vctxt);
1475
}
1476
1477
static xmlParserErrors
1478
xmlCharEncUconv(const char *name, xmlCharEncFlags flags,
1479
                xmlCharEncodingHandler **out) {
1480
    xmlCharEncConvFunc inFunc = NULL, outFunc = NULL;
1481
    xmlUconvCtxt *ucv_in = NULL;
1482
    xmlUconvCtxt *ucv_out = NULL;
1483
    int ret;
1484
1485
    if (flags & XML_ENC_INPUT) {
1486
        ret = openIcuConverter(name, 1, &ucv_in);
1487
        if (ret != 0)
1488
            goto error;
1489
        inFunc = xmlUconvConvert;
1490
    }
1491
1492
    if (flags & XML_ENC_OUTPUT) {
1493
        ret = openIcuConverter(name, 0, &ucv_out);
1494
        if (ret != 0)
1495
            goto error;
1496
        outFunc = xmlUconvConvert;
1497
    }
1498
1499
    return(xmlCharEncNewCustomHandler(name, inFunc, outFunc, xmlUconvFree,
1500
                                      ucv_in, ucv_out, out));
1501
1502
error:
1503
    if (ucv_in != NULL)
1504
        closeIcuConverter(ucv_in);
1505
    if (ucv_out != NULL)
1506
        closeIcuConverter(ucv_out);
1507
    return(ret);
1508
}
1509
#endif /* LIBXML_ICU_ENABLED */
1510
1511
/************************************************************************
1512
 *                  *
1513
 *    The real API used by libxml for on-the-fly conversion *
1514
 *                  *
1515
 ************************************************************************/
1516
1517
/**
1518
 * xmlEncConvertError:
1519
 * @code:  XML_ENC_ERR code
1520
 *
1521
 * Convert XML_ENC_ERR to libxml2 error codes.
1522
 */
1523
static xmlParserErrors
1524
162
xmlEncConvertError(xmlCharEncError code) {
1525
162
    xmlParserErrors ret;
1526
1527
162
    switch (code) {
1528
0
        case XML_ENC_ERR_SUCCESS:
1529
0
            ret = XML_ERR_OK;
1530
0
            break;
1531
162
        case XML_ENC_ERR_INPUT:
1532
162
            ret = XML_ERR_INVALID_ENCODING;
1533
162
            break;
1534
0
        case XML_ENC_ERR_MEMORY:
1535
0
            ret = XML_ERR_NO_MEMORY;
1536
0
            break;
1537
0
        default:
1538
0
            ret = XML_ERR_INTERNAL_ERROR;
1539
0
            break;
1540
162
    }
1541
1542
162
    return(ret);
1543
162
}
1544
1545
/**
1546
 * xmlEncInputChunk:
1547
 * @handler:  encoding handler
1548
 * @out:  a pointer to an array of bytes to store the result
1549
 * @outlen:  the length of @out
1550
 * @in:  a pointer to an array of input bytes
1551
 * @inlen:  the length of @in
1552
 * @flush:  end of input
1553
 *
1554
 * The value of @inlen after return is the number of octets consumed
1555
 *     as the return value is 0, else unpredictable.
1556
 * The value of @outlen after return is the number of octets produced.
1557
 *
1558
 * Returns an XML_ENC_ERR code.
1559
 */
1560
xmlCharEncError
1561
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
1562
                 int *outlen, const unsigned char *in, int *inlen,
1563
4.70k
                 int flush) {
1564
4.70k
    xmlCharEncError ret;
1565
1566
4.70k
    if (handler->flags & XML_HANDLER_LEGACY) {
1567
0
        xmlCharEncodingInputFunc func = handler->input.legacyFunc;
1568
1569
0
        if (func == NULL) {
1570
0
            *outlen = 0;
1571
0
            *inlen = 0;
1572
0
            return(XML_ENC_ERR_INTERNAL);
1573
0
        }
1574
1575
0
        ret = func(out, outlen, in, inlen);
1576
4.70k
    } else {
1577
4.70k
        xmlCharEncConvFunc func = handler->input.func;
1578
4.70k
        int oldInlen;
1579
1580
4.70k
        if (func == NULL) {
1581
0
            *outlen = 0;
1582
0
            *inlen = 0;
1583
0
            return(XML_ENC_ERR_INTERNAL);
1584
0
        }
1585
1586
4.70k
        oldInlen = *inlen;
1587
4.70k
        ret = func(handler->inputCtxt, out, outlen, in, inlen, flush);
1588
1589
        /*
1590
         * Check for truncated multi-byte sequence.
1591
         */
1592
4.70k
        if ((flush) && (ret == XML_ENC_ERR_SUCCESS) && (*inlen != oldInlen))
1593
1
            ret = XML_ENC_ERR_INPUT;
1594
4.70k
    }
1595
1596
4.70k
    if (ret > 0)
1597
2.92k
        ret = XML_ENC_ERR_SUCCESS;
1598
1599
4.70k
    return(ret);
1600
4.70k
}
1601
1602
/**
1603
 * xmlEncOutputChunk:
1604
 * @handler:  encoding handler
1605
 * @out:  a pointer to an array of bytes to store the result
1606
 * @outlen:  the length of @out
1607
 * @in:  a pointer to an array of input bytes
1608
 * @inlen:  the length of @in
1609
 *
1610
 * Returns an XML_ENC_ERR code.
1611
 *
1612
 * The value of @inlen after return is the number of octets consumed
1613
 *     as the return value is 0, else unpredictable.
1614
 * The value of @outlen after return is the number of octets produced.
1615
 */
1616
static xmlCharEncError
1617
xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
1618
0
                  int *outlen, const unsigned char *in, int *inlen) {
1619
0
    xmlCharEncError ret;
1620
1621
0
    if (handler->flags & XML_HANDLER_LEGACY) {
1622
0
        xmlCharEncodingOutputFunc func = handler->output.legacyFunc;
1623
1624
0
        if (func == NULL) {
1625
0
            *outlen = 0;
1626
0
            *inlen = 0;
1627
0
            return(XML_ENC_ERR_INTERNAL);
1628
0
        }
1629
1630
0
        ret = func(out, outlen, in, inlen);
1631
0
    } else {
1632
0
        xmlCharEncConvFunc func = handler->output.func;
1633
1634
0
        if (func == NULL) {
1635
0
            *outlen = 0;
1636
0
            *inlen = 0;
1637
0
            return(XML_ENC_ERR_INTERNAL);
1638
0
        }
1639
1640
0
        ret = func(handler->outputCtxt, out, outlen, in, inlen, /* flush */ 0);
1641
0
    }
1642
1643
0
    if (ret > 0)
1644
0
        ret = XML_ENC_ERR_SUCCESS;
1645
1646
0
    return(ret);
1647
0
}
1648
1649
/**
1650
 * xmlCharEncFirstLine:
1651
 * @handler:   char encoding transformation data structure
1652
 * @out:  an xmlBuffer for the output.
1653
 * @in:  an xmlBuffer for the input
1654
 *
1655
 * DEPERECATED: Don't use.
1656
 *
1657
 * Returns the number of bytes written or an XML_ENC_ERR code.
1658
 */
1659
int
1660
xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1661
0
                    xmlBufferPtr in) {
1662
0
    return(xmlCharEncInFunc(handler, out, in));
1663
0
}
1664
1665
/**
1666
 * xmlCharEncInput:
1667
 * @input: a parser input buffer
1668
 * @sizeOut:  pointer to output size
1669
 * @flush:  end of input
1670
 *
1671
 * @sizeOut should be set to the maximum output size (or SIZE_MAX).
1672
 * After return, it is set to the number of bytes written.
1673
 *
1674
 * Generic front-end for the encoding handler on parser input
1675
 *
1676
 * Returns an XML_ENC_ERR code.
1677
 */
1678
xmlCharEncError
1679
xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut, int flush)
1680
3.44k
{
1681
3.44k
    xmlBufPtr out, in;
1682
3.44k
    const xmlChar *dataIn;
1683
3.44k
    size_t availIn;
1684
3.44k
    size_t maxOut;
1685
3.44k
    size_t totalIn, totalOut;
1686
3.44k
    xmlCharEncError ret;
1687
1688
3.44k
    out = input->buffer;
1689
3.44k
    in = input->raw;
1690
1691
3.44k
    maxOut = *sizeOut;
1692
3.44k
    totalOut = 0;
1693
1694
3.44k
    *sizeOut = 0;
1695
1696
3.44k
    availIn = xmlBufUse(in);
1697
3.44k
    if ((availIn == 0) && (!flush))
1698
225
        return(0);
1699
3.22k
    dataIn = xmlBufContent(in);
1700
3.22k
    totalIn = 0;
1701
1702
4.70k
    while (1) {
1703
4.70k
        size_t availOut;
1704
4.70k
        int completeOut, completeIn;
1705
4.70k
        int c_out, c_in;
1706
1707
4.70k
        availOut = xmlBufAvail(out);
1708
4.70k
        if (availOut > INT_MAX / 2)
1709
0
            availOut = INT_MAX / 2;
1710
1711
4.70k
        if (availOut < maxOut) {
1712
4.64k
            c_out = availOut;
1713
4.64k
            completeOut = 0;
1714
4.64k
        } else {
1715
60
            c_out = maxOut;
1716
60
            completeOut = 1;
1717
60
        }
1718
1719
4.70k
        if (availIn > INT_MAX / 2) {
1720
0
            c_in = INT_MAX / 2;
1721
0
            completeIn = 0;
1722
4.70k
        } else {
1723
4.70k
            c_in = availIn;
1724
4.70k
            completeIn = 1;
1725
4.70k
        }
1726
1727
4.70k
        ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
1728
4.70k
                               dataIn, &c_in, flush && completeIn);
1729
1730
4.70k
        totalIn += c_in;
1731
4.70k
        dataIn += c_in;
1732
4.70k
        availIn -= c_in;
1733
1734
4.70k
        totalOut += c_out;
1735
4.70k
        maxOut -= c_out;
1736
4.70k
        xmlBufAddLen(out, c_out);
1737
1738
4.70k
        if ((ret != XML_ENC_ERR_SUCCESS) && (ret != XML_ENC_ERR_SPACE)) {
1739
162
            input->error = xmlEncConvertError(ret);
1740
162
            return(ret);
1741
162
        }
1742
1743
4.53k
        if ((completeOut) && (completeIn))
1744
59
            break;
1745
4.48k
        if ((completeOut) && (ret == XML_ENC_ERR_SPACE))
1746
0
            break;
1747
4.48k
        if ((completeIn) && (ret == XML_ENC_ERR_SUCCESS))
1748
3.00k
            break;
1749
1750
1.47k
        if (ret == XML_ENC_ERR_SPACE) {
1751
1.47k
            if (xmlBufGrow(out, 4096) < 0) {
1752
0
                input->error = XML_ERR_NO_MEMORY;
1753
0
                return(XML_ENC_ERR_MEMORY);
1754
0
            }
1755
1.47k
        }
1756
1.47k
    }
1757
1758
3.06k
    xmlBufShrink(in, totalIn);
1759
1760
3.06k
    if (input->rawconsumed > ULONG_MAX - (unsigned long) totalIn)
1761
0
        input->rawconsumed = ULONG_MAX;
1762
3.06k
    else
1763
3.06k
        input->rawconsumed += totalIn;
1764
1765
3.06k
    *sizeOut = totalOut;
1766
3.06k
    return(XML_ENC_ERR_SUCCESS);
1767
3.22k
}
1768
1769
/**
1770
 * xmlCharEncInFunc:
1771
 * @handler:  char encoding transformation data structure
1772
 * @out:  an xmlBuffer for the output.
1773
 * @in:  an xmlBuffer for the input
1774
 *
1775
 * Generic front-end for the encoding handler input function
1776
 *
1777
 * Returns the number of bytes written or an XML_ENC_ERR code.
1778
 */
1779
int
1780
xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
1781
                 xmlBufferPtr in)
1782
0
{
1783
0
    int ret;
1784
0
    int written;
1785
0
    int toconv;
1786
1787
0
    if (handler == NULL)
1788
0
        return(XML_ENC_ERR_INTERNAL);
1789
0
    if (out == NULL)
1790
0
        return(XML_ENC_ERR_INTERNAL);
1791
0
    if (in == NULL)
1792
0
        return(XML_ENC_ERR_INTERNAL);
1793
1794
0
    toconv = in->use;
1795
0
    if (toconv == 0)
1796
0
        return (0);
1797
0
    written = out->size - out->use -1; /* count '\0' */
1798
0
    if (toconv * 2 >= written) {
1799
0
        xmlBufferGrow(out, out->size + toconv * 2);
1800
0
        written = out->size - out->use - 1;
1801
0
    }
1802
0
    ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
1803
0
                           in->content, &toconv, /* flush */ 0);
1804
0
    xmlBufferShrink(in, toconv);
1805
0
    out->use += written;
1806
0
    out->content[out->use] = 0;
1807
1808
0
    return (written? written : ret);
1809
0
}
1810
1811
#ifdef LIBXML_OUTPUT_ENABLED
1812
/**
1813
 * xmlCharEncOutput:
1814
 * @output: a parser output buffer
1815
 * @init: is this an initialization call without data
1816
 *
1817
 * Generic front-end for the encoding handler on parser output
1818
 * a first call with @init == 1 has to be made first to initiate the
1819
 * output in case of non-stateless encoding needing to initiate their
1820
 * state or the output (like the BOM in UTF16).
1821
 * In case of UTF8 sequence conversion errors for the given encoder,
1822
 * the content will be automatically remapped to a CharRef sequence.
1823
 *
1824
 * Returns the number of bytes written or an XML_ENC_ERR code.
1825
 */
1826
int
1827
xmlCharEncOutput(xmlOutputBufferPtr output, int init)
1828
0
{
1829
0
    int ret;
1830
0
    size_t written;
1831
0
    int writtentot = 0;
1832
0
    size_t toconv;
1833
0
    int c_in;
1834
0
    int c_out;
1835
0
    xmlBufPtr in;
1836
0
    xmlBufPtr out;
1837
1838
0
    if ((output == NULL) || (output->encoder == NULL) ||
1839
0
        (output->buffer == NULL) || (output->conv == NULL))
1840
0
        return(XML_ENC_ERR_INTERNAL);
1841
0
    out = output->conv;
1842
0
    in = output->buffer;
1843
1844
0
retry:
1845
1846
0
    written = xmlBufAvail(out);
1847
1848
    /*
1849
     * First specific handling of the initialization call
1850
     */
1851
0
    if (init) {
1852
0
        c_in = 0;
1853
0
        c_out = written;
1854
        /* TODO: Check return value. */
1855
0
        xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
1856
0
                          NULL, &c_in);
1857
0
        xmlBufAddLen(out, c_out);
1858
0
        return(c_out);
1859
0
    }
1860
1861
    /*
1862
     * Conversion itself.
1863
     */
1864
0
    toconv = xmlBufUse(in);
1865
0
    if (toconv > 64 * 1024)
1866
0
        toconv = 64 * 1024;
1867
0
    if (toconv * 4 >= written) {
1868
0
        if (xmlBufGrow(out, toconv * 4) < 0) {
1869
0
            ret = XML_ENC_ERR_MEMORY;
1870
0
            goto error;
1871
0
        }
1872
0
        written = xmlBufAvail(out);
1873
0
    }
1874
0
    if (written > 256 * 1024)
1875
0
        written = 256 * 1024;
1876
1877
0
    c_in = toconv;
1878
0
    c_out = written;
1879
0
    ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
1880
0
                            xmlBufContent(in), &c_in);
1881
0
    xmlBufShrink(in, c_in);
1882
0
    xmlBufAddLen(out, c_out);
1883
0
    writtentot += c_out;
1884
1885
0
    if (ret == XML_ENC_ERR_SPACE)
1886
0
        goto retry;
1887
1888
    /*
1889
     * Attempt to handle error cases
1890
     */
1891
0
    if (ret == XML_ENC_ERR_INPUT) {
1892
0
        xmlChar charref[20];
1893
0
        int len = xmlBufUse(in);
1894
0
        xmlChar *content = xmlBufContent(in);
1895
0
        int cur, charrefLen;
1896
1897
0
        cur = xmlGetUTF8Char(content, &len);
1898
0
        if (cur <= 0)
1899
0
            goto error;
1900
1901
        /*
1902
         * Removes the UTF8 sequence, and replace it by a charref
1903
         * and continue the transcoding phase, hoping the error
1904
         * did not mangle the encoder state.
1905
         */
1906
0
        charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
1907
0
        xmlBufGrow(out, charrefLen * 4);
1908
0
        c_out = xmlBufAvail(out);
1909
0
        c_in = charrefLen;
1910
0
        ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
1911
0
                                charref, &c_in);
1912
0
        if ((ret < 0) || (c_in != charrefLen)) {
1913
0
            ret = XML_ENC_ERR_INTERNAL;
1914
0
            goto error;
1915
0
        }
1916
1917
0
        xmlBufShrink(in, len);
1918
0
        xmlBufAddLen(out, c_out);
1919
0
        writtentot += c_out;
1920
0
        goto retry;
1921
0
    }
1922
1923
0
error:
1924
0
    if (((writtentot <= 0) && (ret != 0)) ||
1925
0
        (ret == XML_ENC_ERR_MEMORY)) {
1926
0
        if (output->error == 0)
1927
0
            output->error = xmlEncConvertError(ret);
1928
0
        return(ret);
1929
0
    }
1930
1931
0
    return(writtentot);
1932
0
}
1933
#endif
1934
1935
/**
1936
 * xmlCharEncOutFunc:
1937
 * @handler:  char encoding transformation data structure
1938
 * @out:  an xmlBuffer for the output.
1939
 * @in:  an xmlBuffer for the input
1940
 *
1941
 * Generic front-end for the encoding handler output function
1942
 * a first call with @in == NULL has to be made firs to initiate the
1943
 * output in case of non-stateless encoding needing to initiate their
1944
 * state or the output (like the BOM in UTF16).
1945
 * In case of UTF8 sequence conversion errors for the given encoder,
1946
 * the content will be automatically remapped to a CharRef sequence.
1947
 *
1948
 * Returns the number of bytes written or an XML_ENC_ERR code.
1949
 */
1950
int
1951
xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1952
0
                  xmlBufferPtr in) {
1953
0
    int ret;
1954
0
    int written;
1955
0
    int writtentot = 0;
1956
0
    int toconv;
1957
1958
0
    if (handler == NULL) return(XML_ENC_ERR_INTERNAL);
1959
0
    if (out == NULL) return(XML_ENC_ERR_INTERNAL);
1960
1961
0
retry:
1962
1963
0
    written = out->size - out->use;
1964
1965
0
    if (written > 0)
1966
0
  written--; /* Gennady: count '/0' */
1967
1968
    /*
1969
     * First specific handling of in = NULL, i.e. the initialization call
1970
     */
1971
0
    if (in == NULL) {
1972
0
        toconv = 0;
1973
        /* TODO: Check return value. */
1974
0
        xmlEncOutputChunk(handler, &out->content[out->use], &written,
1975
0
                          NULL, &toconv);
1976
0
        out->use += written;
1977
0
        out->content[out->use] = 0;
1978
0
        return(0);
1979
0
    }
1980
1981
    /*
1982
     * Conversion itself.
1983
     */
1984
0
    toconv = in->use;
1985
0
    if (toconv * 4 >= written) {
1986
0
        xmlBufferGrow(out, toconv * 4);
1987
0
  written = out->size - out->use - 1;
1988
0
    }
1989
0
    ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
1990
0
                            in->content, &toconv);
1991
0
    xmlBufferShrink(in, toconv);
1992
0
    out->use += written;
1993
0
    writtentot += written;
1994
0
    out->content[out->use] = 0;
1995
1996
0
    if (ret == XML_ENC_ERR_SPACE)
1997
0
        goto retry;
1998
1999
    /*
2000
     * Attempt to handle error cases
2001
     */
2002
0
    if (ret == XML_ENC_ERR_INPUT) {
2003
0
        xmlChar charref[20];
2004
0
        int len = in->use;
2005
0
        const xmlChar *utf = (const xmlChar *) in->content;
2006
0
        int cur, charrefLen;
2007
2008
0
        cur = xmlGetUTF8Char(utf, &len);
2009
0
        if (cur <= 0)
2010
0
            return(ret);
2011
2012
        /*
2013
         * Removes the UTF8 sequence, and replace it by a charref
2014
         * and continue the transcoding phase, hoping the error
2015
         * did not mangle the encoder state.
2016
         */
2017
0
        charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
2018
0
        xmlBufferShrink(in, len);
2019
0
        xmlBufferGrow(out, charrefLen * 4);
2020
0
        written = out->size - out->use - 1;
2021
0
        toconv = charrefLen;
2022
0
        ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
2023
0
                                charref, &toconv);
2024
0
        if ((ret < 0) || (toconv != charrefLen))
2025
0
            return(XML_ENC_ERR_INTERNAL);
2026
2027
0
        out->use += written;
2028
0
        writtentot += written;
2029
0
        out->content[out->use] = 0;
2030
0
        goto retry;
2031
0
    }
2032
0
    return(writtentot ? writtentot : ret);
2033
0
}
2034
2035
/**
2036
 * xmlCharEncCloseFunc:
2037
 * @handler:  char encoding transformation data structure
2038
 *
2039
 * Releases an xmlCharEncodingHandler. Must be called after
2040
 * a handler is no longer in use.
2041
 *
2042
 * Returns 0.
2043
 */
2044
int
2045
2.84k
xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2046
2.84k
    if (handler == NULL)
2047
0
        return(0);
2048
2049
2.84k
    if (handler->flags & XML_HANDLER_STATIC)
2050
2.84k
        return(0);
2051
2052
0
    xmlFree(handler->name);
2053
0
    if (handler->ctxtDtor != NULL) {
2054
0
        handler->ctxtDtor(handler->inputCtxt);
2055
0
        handler->ctxtDtor(handler->outputCtxt);
2056
0
    }
2057
0
    xmlFree(handler);
2058
0
    return(0);
2059
2.84k
}
2060
2061
/**
2062
 * xmlByteConsumed:
2063
 * @ctxt: an XML parser context
2064
 *
2065
 * DEPRECATED: Don't use.
2066
 *
2067
 * This function provides the current index of the parser relative
2068
 * to the start of the current entity. This function is computed in
2069
 * bytes from the beginning starting at zero and finishing at the
2070
 * size in byte of the file if parsing a file. The function is
2071
 * of constant cost if the input is UTF-8 but can be costly if run
2072
 * on non-UTF-8 input.
2073
 *
2074
 * Returns the index in bytes from the beginning of the entity or -1
2075
 *         in case the index could not be computed.
2076
 */
2077
long
2078
0
xmlByteConsumed(xmlParserCtxtPtr ctxt) {
2079
0
    xmlParserInputPtr in;
2080
2081
0
    if (ctxt == NULL)
2082
0
        return(-1);
2083
0
    in = ctxt->input;
2084
0
    if (in == NULL)
2085
0
        return(-1);
2086
2087
0
    if ((in->buf != NULL) && (in->buf->encoder != NULL)) {
2088
0
        int unused = 0;
2089
0
  xmlCharEncodingHandler * handler = in->buf->encoder;
2090
2091
        /*
2092
   * Encoding conversion, compute the number of unused original
2093
   * bytes from the input not consumed and subtract that from
2094
   * the raw consumed value, this is not a cheap operation
2095
   */
2096
0
        if (in->end - in->cur > 0) {
2097
0
      unsigned char *convbuf;
2098
0
      const unsigned char *cur = (const unsigned char *)in->cur;
2099
0
      int toconv, ret;
2100
2101
0
            convbuf = xmlMalloc(32000);
2102
0
            if (convbuf == NULL)
2103
0
                return(-1);
2104
2105
0
            toconv = in->end - cur;
2106
0
            unused = 32000;
2107
0
            ret = xmlEncOutputChunk(handler, convbuf, &unused, cur, &toconv);
2108
2109
0
            xmlFree(convbuf);
2110
2111
0
            if (ret != XML_ENC_ERR_SUCCESS)
2112
0
                return(-1);
2113
0
  }
2114
2115
0
  if (in->buf->rawconsumed < (unsigned long) unused)
2116
0
      return(-1);
2117
0
  return(in->buf->rawconsumed - unused);
2118
0
    }
2119
2120
0
    return(in->consumed + (in->cur - in->base));
2121
0
}
2122
2123
/************************************************************************
2124
 *                  *
2125
 *    Conversions To/From UTF8 encoding     *
2126
 *                  *
2127
 ************************************************************************/
2128
2129
static xmlCharEncError
2130
asciiToAscii(void *vctxt ATTRIBUTE_UNUSED,
2131
             unsigned char* out, int *poutlen,
2132
             const unsigned char* in, int *pinlen,
2133
201
             int flush ATTRIBUTE_UNUSED) {
2134
201
    const unsigned char *inend;
2135
201
    const unsigned char *instart = in;
2136
201
    int inlen, outlen, ret;
2137
2138
201
    if (in == NULL) {
2139
0
        *pinlen = 0;
2140
0
        *poutlen = 0;
2141
0
        return(XML_ENC_ERR_SUCCESS);
2142
0
    }
2143
2144
201
    inlen = *pinlen;
2145
201
    outlen = *poutlen;
2146
2147
201
    if (outlen < inlen) {
2148
43
        inlen = outlen;
2149
43
        ret = XML_ENC_ERR_SPACE;
2150
158
    } else {
2151
158
        ret = inlen;
2152
158
    }
2153
2154
201
    inend = in + inlen;
2155
201
    *poutlen = inlen;
2156
201
    *pinlen = inlen;
2157
2158
426k
    while (in < inend) {
2159
426k
  unsigned c = *in;
2160
2161
426k
        if (c >= 0x80) {
2162
40
      *poutlen = in - instart;
2163
40
      *pinlen = in - instart;
2164
40
      return(XML_ENC_ERR_INPUT);
2165
40
  }
2166
2167
426k
        in++;
2168
426k
  *out++ = c;
2169
426k
    }
2170
2171
161
    return(ret);
2172
201
}
2173
2174
static xmlCharEncError
2175
latin1ToUTF8(void *vctxt ATTRIBUTE_UNUSED,
2176
             unsigned char* out, int *outlen,
2177
             const unsigned char* in, int *inlen,
2178
1.00k
             int flush ATTRIBUTE_UNUSED) {
2179
1.00k
    unsigned char* outstart = out;
2180
1.00k
    const unsigned char* instart = in;
2181
1.00k
    unsigned char* outend;
2182
1.00k
    const unsigned char* inend;
2183
1.00k
    int ret = XML_ENC_ERR_SPACE;
2184
2185
1.00k
    if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
2186
0
  return(XML_ENC_ERR_INTERNAL);
2187
2188
1.00k
    outend = out + *outlen;
2189
1.00k
    inend = in + *inlen;
2190
2191
4.52M
    while (in < inend) {
2192
4.52M
        unsigned c = *in;
2193
2194
4.52M
  if (c < 0x80) {
2195
3.58M
            if (out >= outend)
2196
316
                goto done;
2197
3.58M
            *out++ = c;
2198
3.58M
  } else {
2199
941k
            if (outend - out < 2)
2200
158
                goto done;
2201
941k
      *out++ = (c >> 6) | 0xC0;
2202
941k
            *out++ = (c & 0x3F) | 0x80;
2203
941k
        }
2204
2205
4.52M
        in++;
2206
4.52M
    }
2207
2208
528
    ret = out - outstart;
2209
2210
1.00k
done:
2211
1.00k
    *outlen = out - outstart;
2212
1.00k
    *inlen = in - instart;
2213
1.00k
    return(ret);
2214
528
}
2215
2216
/**
2217
 * xmlIsolat1ToUTF8:
2218
 * @out:  a pointer to an array of bytes to store the result
2219
 * @outlen:  the length of @out
2220
 * @in:  a pointer to an array of ISO Latin 1 chars
2221
 * @inlen:  the length of @in
2222
 *
2223
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
2224
 * block of chars out.
2225
 *
2226
 * Returns the number of bytes written or an XML_ENC_ERR code.
2227
 *
2228
 * The value of @inlen after return is the number of octets consumed
2229
 *     if the return value is positive, else unpredictable.
2230
 * The value of @outlen after return is the number of octets produced.
2231
 */
2232
int
2233
xmlIsolat1ToUTF8(unsigned char* out, int *outlen,
2234
0
                 const unsigned char* in, int *inlen) {
2235
0
    return(latin1ToUTF8(/* ctxt */ NULL, out, outlen, in, inlen,
2236
0
                        /* flush */ 0));
2237
0
}
2238
2239
static xmlCharEncError
2240
UTF8ToUTF8(void *vctxt ATTRIBUTE_UNUSED,
2241
           unsigned char* out, int *outlen,
2242
           const unsigned char* in, int *inlen,
2243
0
           int flush ATTRIBUTE_UNUSED) {
2244
0
    int len;
2245
0
    int ret;
2246
2247
0
    if (in == NULL) {
2248
0
        *inlen = 0;
2249
0
        *outlen = 0;
2250
0
        return(XML_ENC_ERR_SUCCESS);
2251
0
    }
2252
2253
0
    if (*outlen < *inlen) {
2254
0
  len = *outlen;
2255
0
        ret = XML_ENC_ERR_SPACE;
2256
0
    } else {
2257
0
  len = *inlen;
2258
0
        ret = len;
2259
0
    }
2260
2261
0
    memcpy(out, in, len);
2262
2263
0
    *outlen = len;
2264
0
    *inlen = len;
2265
0
    return(ret);
2266
0
}
2267
2268
2269
#ifdef LIBXML_OUTPUT_ENABLED
2270
static xmlCharEncError
2271
UTF8ToLatin1(void *vctxt ATTRIBUTE_UNUSED,
2272
             unsigned char* out, int *outlen,
2273
             const unsigned char* in, int *inlen,
2274
0
             int flush ATTRIBUTE_UNUSED) {
2275
0
    const unsigned char* outend;
2276
0
    const unsigned char* outstart = out;
2277
0
    const unsigned char* instart = in;
2278
0
    const unsigned char* inend;
2279
0
    unsigned c;
2280
0
    int ret = XML_ENC_ERR_SPACE;
2281
2282
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
2283
0
        return(XML_ENC_ERR_INTERNAL);
2284
2285
0
    if (in == NULL) {
2286
0
        *inlen = 0;
2287
0
        *outlen = 0;
2288
0
        return(XML_ENC_ERR_SUCCESS);
2289
0
    }
2290
2291
0
    inend = in + *inlen;
2292
0
    outend = out + *outlen;
2293
0
    while (in < inend) {
2294
0
        if (out >= outend)
2295
0
            goto done;
2296
2297
0
  c = *in;
2298
2299
0
        if (c < 0x80) {
2300
0
            *out++ = c;
2301
0
        } else if ((c >= 0xC2) && (c <= 0xC3)) {
2302
0
            if (inend - in < 2)
2303
0
                break;
2304
0
            in++;
2305
0
            *out++ = (unsigned char) ((c << 6) | (*in & 0x3F));
2306
0
        } else {
2307
0
            ret = XML_ENC_ERR_INPUT;
2308
0
            goto done;
2309
0
  }
2310
2311
0
        in++;
2312
0
    }
2313
2314
0
    ret = out - outstart;
2315
2316
0
done:
2317
0
    *outlen = out - outstart;
2318
0
    *inlen = in - instart;
2319
0
    return(ret);
2320
0
}
2321
2322
/**
2323
 * xmlUTF8ToIsolat1:
2324
 * @out:  a pointer to an array of bytes to store the result
2325
 * @outlen:  the length of @out
2326
 * @in:  a pointer to an array of UTF-8 chars
2327
 * @inlen:  the length of @in
2328
 *
2329
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
2330
 * block of chars out.
2331
 *
2332
 * Returns the number of bytes written or an XML_ENC_ERR code.
2333
 *
2334
 * The value of @inlen after return is the number of octets consumed
2335
 *     if the return value is positive, else unpredictable.
2336
 * The value of @outlen after return is the number of octets produced.
2337
 */
2338
int
2339
xmlUTF8ToIsolat1(unsigned char* out, int *outlen,
2340
0
              const unsigned char* in, int *inlen) {
2341
0
    if ((out == NULL) || (outlen == NULL) || (in == NULL) || (inlen == NULL))
2342
0
        return(XML_ENC_ERR_INTERNAL);
2343
2344
0
    return(UTF8ToLatin1(/* ctxt */ NULL, out, outlen, in, inlen,
2345
0
                        /* flush */ 0));
2346
0
}
2347
#endif /* LIBXML_OUTPUT_ENABLED */
2348
2349
static xmlCharEncError
2350
UTF16LEToUTF8(void *vctxt ATTRIBUTE_UNUSED,
2351
              unsigned char *out, int *outlen,
2352
              const unsigned char *in, int *inlen,
2353
1.42k
              int flush ATTRIBUTE_UNUSED) {
2354
1.42k
    const unsigned char *instart = in;
2355
1.42k
    const unsigned char *inend = in + (*inlen & ~1);
2356
1.42k
    unsigned char *outstart = out;
2357
1.42k
    unsigned char *outend = out + *outlen;
2358
1.42k
    unsigned c, d;
2359
1.42k
    int ret = XML_ENC_ERR_SPACE;
2360
2361
1.22M
    while (in < inend) {
2362
1.22M
        c = in[0] | (in[1] << 8);
2363
2364
1.22M
        if (c < 0x80) {
2365
128k
            if (out >= outend)
2366
13
                goto done;
2367
128k
            out[0] = c;
2368
128k
            in += 2;
2369
128k
            out += 1;
2370
1.09M
        } else if (c < 0x800) {
2371
21.9k
            if (outend - out < 2)
2372
12
                goto done;
2373
21.9k
            out[0] = (c >> 6)   | 0xC0;
2374
21.9k
            out[1] = (c & 0x3F) | 0x80;
2375
21.9k
            in += 2;
2376
21.9k
            out += 2;
2377
1.06M
        } else if ((c & 0xF800) != 0xD800) {
2378
1.06M
            if (outend - out < 3)
2379
242
                goto done;
2380
1.06M
            out[0] =  (c >> 12)         | 0xE0;
2381
1.06M
            out[1] = ((c >>  6) & 0x3F) | 0x80;
2382
1.06M
            out[2] =  (c        & 0x3F) | 0x80;
2383
1.06M
            in += 2;
2384
1.06M
            out += 3;
2385
1.06M
        } else {
2386
            /* Surrogate pair */
2387
923
            if ((c & 0xFC00) != 0xD800) {
2388
34
                ret = XML_ENC_ERR_INPUT;
2389
34
                goto done;
2390
34
            }
2391
889
      if (inend - in < 4)
2392
9
    break;
2393
880
            d = in[2] | (in[3] << 8);
2394
880
            if ((d & 0xFC00) != 0xDC00) {
2395
25
                ret = XML_ENC_ERR_INPUT;
2396
25
                goto done;
2397
25
            }
2398
855
      if (outend - out < 4)
2399
3
    goto done;
2400
852
            c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000);
2401
852
            out[0] =  (c >> 18)         | 0xF0;
2402
852
            out[1] = ((c >> 12) & 0x3F) | 0x80;
2403
852
            out[2] = ((c >>  6) & 0x3F) | 0x80;
2404
852
            out[3] =  (c        & 0x3F) | 0x80;
2405
852
            in += 4;
2406
852
            out += 4;
2407
852
        }
2408
1.22M
    }
2409
2410
1.09k
    ret = out - outstart;
2411
2412
1.42k
done:
2413
1.42k
    *outlen = out - outstart;
2414
1.42k
    *inlen = in - instart;
2415
1.42k
    return(ret);
2416
1.09k
}
2417
2418
#ifdef LIBXML_OUTPUT_ENABLED
2419
static xmlCharEncError
2420
UTF8ToUTF16LE(void *vctxt ATTRIBUTE_UNUSED,
2421
              unsigned char *out, int *outlen,
2422
              const unsigned char *in, int *inlen,
2423
0
              int flush ATTRIBUTE_UNUSED) {
2424
0
    const unsigned char *instart = in;
2425
0
    const unsigned char *inend;
2426
0
    unsigned char *outstart = out;
2427
0
    unsigned char *outend;
2428
0
    unsigned c, d;
2429
0
    int ret = XML_ENC_ERR_SPACE;
2430
2431
    /* UTF16LE encoding has no BOM */
2432
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
2433
0
        return(XML_ENC_ERR_INTERNAL);
2434
0
    if (in == NULL) {
2435
0
  *outlen = 0;
2436
0
  *inlen = 0;
2437
0
  return(0);
2438
0
    }
2439
0
    inend = in + *inlen;
2440
0
    outend = out + (*outlen & ~1);
2441
0
    while (in < inend) {
2442
0
        c = in[0];
2443
2444
0
        if (c < 0x80) {
2445
0
            if (out >= outend)
2446
0
                goto done;
2447
0
            out[0] = c;
2448
0
            out[1] = 0;
2449
0
            in += 1;
2450
0
            out += 2;
2451
0
        } else {
2452
0
            int i, len;
2453
0
            unsigned min;
2454
2455
0
            if (c < 0xE0) {
2456
0
                if (c < 0xC2) {
2457
0
                    ret = XML_ENC_ERR_INPUT;
2458
0
                    goto done;
2459
0
                }
2460
0
                c &= 0x1F;
2461
0
                len = 2;
2462
0
                min = 0x80;
2463
0
            } else if (c < 0xF0) {
2464
0
                c &= 0x0F;
2465
0
                len = 3;
2466
0
                min = 0x800;
2467
0
            } else {
2468
0
                c &= 0x0F;
2469
0
                len = 4;
2470
0
                min = 0x10000;
2471
0
            }
2472
2473
0
            if (inend - in < len)
2474
0
                break;
2475
2476
0
            for (i = 1; i < len; i++) {
2477
0
                if ((in[i] & 0xC0) != 0x80) {
2478
0
                    ret = XML_ENC_ERR_INPUT;
2479
0
                    goto done;
2480
0
                }
2481
0
                c = (c << 6) | (in[i] & 0x3F);
2482
0
            }
2483
2484
0
            if ((c < min) ||
2485
0
                ((c >= 0xD800) && (c <= 0xDFFF)) ||
2486
0
                (c > 0x10FFFF)) {
2487
0
                ret = XML_ENC_ERR_INPUT;
2488
0
                goto done;
2489
0
            }
2490
2491
0
            if (c < 0x10000) {
2492
0
                if (out >= outend)
2493
0
                    goto done;
2494
0
                out[0] = c & 0xFF;
2495
0
                out[1] = c >> 8;
2496
0
                out += 2;
2497
0
            } else {
2498
0
                if (outend - out < 4)
2499
0
                    goto done;
2500
0
                c -= 0x10000;
2501
0
                d = (c & 0x03FF) | 0xDC00;
2502
0
                c = (c >> 10)    | 0xD800;
2503
0
                out[0] = c & 0xFF;
2504
0
                out[1] = c >> 8;
2505
0
                out[2] = d & 0xFF;
2506
0
                out[3] = d >> 8;
2507
0
                out += 4;
2508
0
            }
2509
2510
0
            in += len;
2511
0
        }
2512
0
    }
2513
2514
0
    ret = out - outstart;
2515
2516
0
done:
2517
0
    *outlen = out - outstart;
2518
0
    *inlen = in - instart;
2519
0
    return(ret);
2520
0
}
2521
2522
static xmlCharEncError
2523
UTF8ToUTF16(void *vctxt,
2524
            unsigned char* outb, int *outlen,
2525
            const unsigned char* in, int *inlen,
2526
0
            int flush) {
2527
0
    if (in == NULL) {
2528
  /*
2529
   * initialization, add the Byte Order Mark for UTF-16LE
2530
   */
2531
0
        if (*outlen >= 2) {
2532
0
      outb[0] = 0xFF;
2533
0
      outb[1] = 0xFE;
2534
0
      *outlen = 2;
2535
0
      *inlen = 0;
2536
0
      return(2);
2537
0
  }
2538
0
  *outlen = 0;
2539
0
  *inlen = 0;
2540
0
  return(0);
2541
0
    }
2542
0
    return (UTF8ToUTF16LE(vctxt, outb, outlen, in, inlen, flush));
2543
0
}
2544
#endif /* LIBXML_OUTPUT_ENABLED */
2545
2546
static xmlCharEncError
2547
UTF16BEToUTF8(void *vctxt ATTRIBUTE_UNUSED,
2548
              unsigned char *out, int *outlen,
2549
              const unsigned char *in, int *inlen,
2550
1.33k
              int flush ATTRIBUTE_UNUSED) {
2551
1.33k
    const unsigned char *instart = in;
2552
1.33k
    const unsigned char *inend = in + (*inlen & ~1);
2553
1.33k
    unsigned char *outstart = out;
2554
1.33k
    unsigned char *outend = out + *outlen;
2555
1.33k
    unsigned c, d;
2556
1.33k
    int ret = XML_ENC_ERR_SPACE;
2557
2558
1.58M
    while (in < inend) {
2559
1.58M
        c = (in[0] << 8) | in[1];
2560
2561
1.58M
        if (c < 0x80) {
2562
109k
            if (out >= outend)
2563
16
                goto done;
2564
109k
            out[0] = c;
2565
109k
            in += 2;
2566
109k
            out += 1;
2567
1.47M
        } else if (c < 0x800) {
2568
88.0k
            if (outend - out < 2)
2569
21
                goto done;
2570
88.0k
            out[0] = (c >> 6)   | 0xC0;
2571
88.0k
            out[1] = (c & 0x3F) | 0x80;
2572
88.0k
            in += 2;
2573
88.0k
            out += 2;
2574
1.38M
        } else if ((c & 0xF800) != 0xD800) {
2575
1.38M
            if (outend - out < 3)
2576
296
                goto done;
2577
1.38M
            out[0] =  (c >> 12)         | 0xE0;
2578
1.38M
            out[1] = ((c >>  6) & 0x3F) | 0x80;
2579
1.38M
            out[2] =  (c        & 0x3F) | 0x80;
2580
1.38M
            in += 2;
2581
1.38M
            out += 3;
2582
1.38M
        } else {
2583
            /* Surrogate pair */
2584
923
            if ((c & 0xFC00) != 0xD800) {
2585
24
                ret = XML_ENC_ERR_INPUT;
2586
24
                goto done;
2587
24
            }
2588
899
      if (inend - in < 4)
2589
16
    break;
2590
883
            d = (in[2] << 8) | in[3];
2591
883
            if ((d & 0xFC00) != 0xDC00) {
2592
36
                ret = XML_ENC_ERR_INPUT;
2593
36
                goto done;
2594
36
            }
2595
847
      if (outend - out < 4)
2596
1
    goto done;
2597
846
            c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000);
2598
846
            out[0] =  (c >> 18)         | 0xF0;
2599
846
            out[1] = ((c >> 12) & 0x3F) | 0x80;
2600
846
            out[2] = ((c >>  6) & 0x3F) | 0x80;
2601
846
            out[3] =  (c        & 0x3F) | 0x80;
2602
846
            in += 4;
2603
846
            out += 4;
2604
846
        }
2605
1.58M
    }
2606
2607
939
    ret = out - outstart;
2608
2609
1.33k
done:
2610
1.33k
    *outlen = out - outstart;
2611
1.33k
    *inlen = in - instart;
2612
1.33k
    return(ret);
2613
939
}
2614
2615
#ifdef LIBXML_OUTPUT_ENABLED
2616
static xmlCharEncError
2617
UTF8ToUTF16BE(void *vctxt ATTRIBUTE_UNUSED,
2618
              unsigned char *out, int *outlen,
2619
              const unsigned char *in, int *inlen,
2620
0
              int flush ATTRIBUTE_UNUSED) {
2621
0
    const unsigned char *instart = in;
2622
0
    const unsigned char *inend;
2623
0
    unsigned char *outstart = out;
2624
0
    unsigned char *outend;
2625
0
    unsigned c, d;
2626
0
    int ret = XML_ENC_ERR_SPACE;
2627
2628
    /* UTF-16BE has no BOM */
2629
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2630
0
    if (in == NULL) {
2631
0
  *outlen = 0;
2632
0
  *inlen = 0;
2633
0
  return(0);
2634
0
    }
2635
0
    inend = in + *inlen;
2636
0
    outend = out + (*outlen & ~1);
2637
0
    while (in < inend) {
2638
0
        c = in[0];
2639
2640
0
        if (c < 0x80) {
2641
0
            if (out >= outend)
2642
0
                goto done;
2643
0
            out[0] = 0;
2644
0
            out[1] = c;
2645
0
            in += 1;
2646
0
            out += 2;
2647
0
        } else {
2648
0
            int i, len;
2649
0
            unsigned min;
2650
2651
0
            if (c < 0xE0) {
2652
0
                if (c < 0xC2) {
2653
0
                    ret = XML_ENC_ERR_INPUT;
2654
0
                    goto done;
2655
0
                }
2656
0
                c &= 0x1F;
2657
0
                len = 2;
2658
0
                min = 0x80;
2659
0
            } else if (c < 0xF0) {
2660
0
                c &= 0x0F;
2661
0
                len = 3;
2662
0
                min = 0x800;
2663
0
            } else {
2664
0
                c &= 0x0F;
2665
0
                len = 4;
2666
0
                min = 0x10000;
2667
0
            }
2668
2669
0
            if (inend - in < len)
2670
0
                break;
2671
2672
0
            for (i = 1; i < len; i++) {
2673
0
                if ((in[i] & 0xC0) != 0x80) {
2674
0
                    ret = XML_ENC_ERR_INPUT;
2675
0
                    goto done;
2676
0
                }
2677
0
                c = (c << 6) | (in[i] & 0x3F);
2678
0
            }
2679
2680
0
            if ((c < min) ||
2681
0
                ((c >= 0xD800) && (c <= 0xDFFF)) ||
2682
0
                (c > 0x10FFFF)) {
2683
0
                ret = XML_ENC_ERR_INPUT;
2684
0
                goto done;
2685
0
            }
2686
2687
0
            if (c < 0x10000) {
2688
0
                if (out >= outend)
2689
0
                    goto done;
2690
0
                out[0] = c >> 8;
2691
0
                out[1] = c & 0xFF;
2692
0
                out += 2;
2693
0
            } else {
2694
0
                if (outend - out < 4)
2695
0
                    goto done;
2696
0
                c -= 0x10000;
2697
0
                d = (c & 0x03FF) | 0xDC00;
2698
0
                c = (c >> 10)    | 0xD800;
2699
0
                out[0] = c >> 8;
2700
0
                out[1] = c & 0xFF;
2701
0
                out[2] = d >> 8;
2702
0
                out[3] = d & 0xFF;
2703
0
                out += 4;
2704
0
            }
2705
2706
0
            in += len;
2707
0
        }
2708
0
    }
2709
2710
0
    ret = out - outstart;
2711
2712
0
done:
2713
0
    *outlen = out - outstart;
2714
0
    *inlen = in - instart;
2715
0
    return(ret);
2716
0
}
2717
#endif /* LIBXML_OUTPUT_ENABLED */
2718
2719
#if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
2720
static xmlCharEncError
2721
UTF8ToHtmlWrapper(void *vctxt ATTRIBUTE_UNUSED,
2722
                  unsigned char *out, int *outlen,
2723
                  const unsigned char *in, int *inlen,
2724
0
                  int flush ATTRIBUTE_UNUSED) {
2725
0
    return(htmlUTF8ToHtml(out, outlen, in, inlen));
2726
0
}
2727
#endif
2728
2729
#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
2730
    defined(LIBXML_ISO8859X_ENABLED)
2731
2732
static xmlCharEncError
2733
UTF8ToISO8859x(void *vctxt,
2734
               unsigned char *out, int *outlen,
2735
               const unsigned char *in, int *inlen,
2736
0
               int flush ATTRIBUTE_UNUSED) {
2737
0
    const unsigned char *xlattable = vctxt;
2738
0
    const unsigned char *instart = in;
2739
0
    const unsigned char *inend;
2740
0
    unsigned char *outstart = out;
2741
0
    unsigned char *outend;
2742
0
    int ret = XML_ENC_ERR_SPACE;
2743
2744
0
    if (in == NULL) {
2745
        /*
2746
        * initialization nothing to do
2747
        */
2748
0
        *outlen = 0;
2749
0
        *inlen = 0;
2750
0
        return(XML_ENC_ERR_SUCCESS);
2751
0
    }
2752
2753
0
    inend = in + *inlen;
2754
0
    outend = out + *outlen;
2755
0
    while (in < inend) {
2756
0
        unsigned d = *in;
2757
2758
0
        if  (d < 0x80)  {
2759
0
            if (out >= outend)
2760
0
                goto done;
2761
0
            in += 1;
2762
0
        } else if (d < 0xE0) {
2763
0
            unsigned c;
2764
2765
0
            if (inend - in < 2)
2766
0
                break;
2767
0
            c = in[1] & 0x3F;
2768
0
            d = d & 0x1F;
2769
0
            d = xlattable [48 + c + xlattable [d] * 64];
2770
0
            if (d == 0) {
2771
                /* not in character set */
2772
0
                ret = XML_ENC_ERR_INPUT;
2773
0
                goto done;
2774
0
            }
2775
0
            if (out >= outend)
2776
0
                goto done;
2777
0
            in += 2;
2778
0
        } else if (d < 0xF0) {
2779
0
            unsigned c1;
2780
0
            unsigned c2;
2781
2782
0
            if (inend - in < 3)
2783
0
                break;
2784
0
            c1 = in[1] & 0x3F;
2785
0
            c2 = in[2] & 0x3F;
2786
0
      d = d & 0x0F;
2787
0
      d = xlattable [48 + c2 + xlattable [48 + c1 +
2788
0
      xlattable [32 + d] * 64] * 64];
2789
0
            if (d == 0) {
2790
                /* not in character set */
2791
0
                ret = XML_ENC_ERR_INPUT;
2792
0
                goto done;
2793
0
            }
2794
0
            if (out >= outend)
2795
0
                goto done;
2796
0
            in += 3;
2797
0
        } else {
2798
            /* cannot transcode >= U+010000 */
2799
0
                ret = XML_ENC_ERR_INPUT;
2800
0
                goto done;
2801
0
        }
2802
2803
0
        *out++ = d;
2804
0
    }
2805
2806
0
    ret = out - outstart;
2807
2808
0
done:
2809
0
    *outlen = out - outstart;
2810
0
    *inlen = in - instart;
2811
0
    return(ret);
2812
0
}
2813
2814
static xmlCharEncError
2815
ISO8859xToUTF8(void *vctxt,
2816
               unsigned char* out, int *outlen,
2817
               const unsigned char* in, int *inlen,
2818
737
               int flush ATTRIBUTE_UNUSED) {
2819
737
    unsigned short const *unicodetable = vctxt;
2820
737
    const unsigned char* instart = in;
2821
737
    const unsigned char* inend;
2822
737
    unsigned char* outstart = out;
2823
737
    unsigned char* outend;
2824
737
    int ret = XML_ENC_ERR_SPACE;
2825
2826
737
    outend = out + *outlen;
2827
737
    inend = in + *inlen;
2828
2829
3.42M
    while (in < inend) {
2830
3.42M
        unsigned c = *in;
2831
2832
3.42M
        if (c < 0x80) {
2833
2.50M
            if (out >= outend)
2834
216
                goto done;
2835
2.50M
            *out++ = c;
2836
2.50M
        } else {
2837
919k
            c = unicodetable[c - 0x80];
2838
919k
            if (c == 0) {
2839
                /* undefined code point */
2840
2
                ret = XML_ENC_ERR_INPUT;
2841
2
                goto done;
2842
2
            }
2843
919k
            if (c < 0x800) {
2844
918k
                if (outend - out < 2)
2845
143
                    goto done;
2846
918k
                *out++ = ((c >>  6) & 0x1F) | 0xC0;
2847
918k
                *out++ = (c & 0x3F) | 0x80;
2848
918k
            } else {
2849
1.41k
                if (outend - out < 3)
2850
3
                    goto done;
2851
1.41k
                *out++ = ((c >>  12) & 0x0F) | 0xE0;
2852
1.41k
                *out++ = ((c >>  6) & 0x3F) | 0x80;
2853
1.41k
                *out++ = (c & 0x3F) | 0x80;
2854
1.41k
            }
2855
919k
        }
2856
2857
3.42M
        in += 1;
2858
3.42M
    }
2859
2860
373
    ret = out - outstart;
2861
2862
737
done:
2863
737
    *outlen = out - outstart;
2864
737
    *inlen = in - instart;
2865
737
    return(ret);
2866
373
}
2867
2868
#endif
2869