Coverage Report

Created: 2024-09-06 07:53

/src/libxml2/encoding.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * encoding.c : implements the encoding conversion functions needed for XML
3
 *
4
 * Related specs:
5
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6
 * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
8
 * [ISO-8859-1]   ISO Latin-1 characters codes.
9
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
10
 *                Worldwide Character Encoding -- Version 1.0", Addison-
11
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
12
 *                described in Unicode Technical Report #4.
13
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
14
 *                Information Interchange, ANSI X3.4-1986.
15
 *
16
 * See Copyright for the status of this software.
17
 *
18
 * daniel@veillard.com
19
 *
20
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
21
 */
22
23
#define IN_LIBXML
24
#include "libxml.h"
25
26
#include <string.h>
27
#include <limits.h>
28
#include <ctype.h>
29
#include <stdlib.h>
30
31
#ifdef LIBXML_ICONV_ENABLED
32
#include <iconv.h>
33
#include <errno.h>
34
#endif
35
36
#include <libxml/encoding.h>
37
#include <libxml/xmlmemory.h>
38
#include <libxml/parser.h>
39
#ifdef LIBXML_HTML_ENABLED
40
#include <libxml/HTMLparser.h>
41
#endif
42
#include <libxml/xmlerror.h>
43
44
#include "private/buf.h"
45
#include "private/enc.h"
46
#include "private/entities.h"
47
#include "private/error.h"
48
49
#ifdef LIBXML_ICU_ENABLED
50
#include <unicode/ucnv.h>
51
#endif
52
53
3.42k
#define XML_HANDLER_STATIC 1
54
55
typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
56
typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
57
struct _xmlCharEncodingAlias {
58
    const char *name;
59
    const char *alias;
60
};
61
62
static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
63
static int xmlCharEncodingAliasesNb = 0;
64
static int xmlCharEncodingAliasesMax = 0;
65
66
static int xmlLittleEndian = 1;
67
68
typedef struct {
69
    const char *name;
70
    xmlCharEncoding enc;
71
} xmlEncTableEntry;
72
73
static const xmlEncTableEntry xmlEncTable[] = {
74
    { "ASCII", XML_CHAR_ENCODING_ASCII },
75
    { "EUC-JP", XML_CHAR_ENCODING_EUC_JP },
76
    { "HTML", XML_CHAR_ENCODING_HTML },
77
    { "ISO LATIN 1", XML_CHAR_ENCODING_8859_1 },
78
    { "ISO LATIN 2", XML_CHAR_ENCODING_8859_2 },
79
    { "ISO-10646-UCS-2", XML_CHAR_ENCODING_UCS2 },
80
    { "ISO-10646-UCS-4", XML_CHAR_ENCODING_UCS4LE },
81
    { "ISO-2022-JP", XML_CHAR_ENCODING_2022_JP },
82
    { "ISO-8859-1", XML_CHAR_ENCODING_8859_1 },
83
    { "ISO-8859-10", XML_CHAR_ENCODING_8859_10 },
84
    { "ISO-8859-11", XML_CHAR_ENCODING_8859_11 },
85
    { "ISO-8859-13", XML_CHAR_ENCODING_8859_13 },
86
    { "ISO-8859-14", XML_CHAR_ENCODING_8859_14 },
87
    { "ISO-8859-15", XML_CHAR_ENCODING_8859_15 },
88
    { "ISO-8859-16", XML_CHAR_ENCODING_8859_16 },
89
    { "ISO-8859-2", XML_CHAR_ENCODING_8859_2 },
90
    { "ISO-8859-3", XML_CHAR_ENCODING_8859_3 },
91
    { "ISO-8859-4", XML_CHAR_ENCODING_8859_4 },
92
    { "ISO-8859-5", XML_CHAR_ENCODING_8859_5 },
93
    { "ISO-8859-6", XML_CHAR_ENCODING_8859_6 },
94
    { "ISO-8859-7", XML_CHAR_ENCODING_8859_7 },
95
    { "ISO-8859-8", XML_CHAR_ENCODING_8859_8 },
96
    { "ISO-8859-9", XML_CHAR_ENCODING_8859_9 },
97
    { "ISO-LATIN-1", XML_CHAR_ENCODING_8859_1 },
98
    { "ISO-LATIN-2", XML_CHAR_ENCODING_8859_2 },
99
    { "SHIFT_JIS", XML_CHAR_ENCODING_SHIFT_JIS },
100
    { "UCS-2", XML_CHAR_ENCODING_UCS2 },
101
    { "UCS-4", XML_CHAR_ENCODING_UCS4LE },
102
    { "UCS2", XML_CHAR_ENCODING_UCS2 },
103
    { "UCS4", XML_CHAR_ENCODING_UCS4LE },
104
    { "US-ASCII", XML_CHAR_ENCODING_ASCII },
105
    { "UTF-16", XML_CHAR_ENCODING_UTF16 },
106
    { "UTF-16BE", XML_CHAR_ENCODING_UTF16BE },
107
    { "UTF-16LE", XML_CHAR_ENCODING_UTF16LE },
108
    { "UTF-8", XML_CHAR_ENCODING_UTF8 },
109
    { "UTF16", XML_CHAR_ENCODING_UTF16LE },
110
    { "UTF8", XML_CHAR_ENCODING_UTF8 }
111
};
112
113
static int
114
asciiToAscii(unsigned char* out, int *outlen,
115
             const unsigned char* in, int *inlen, void *vctxt);
116
static int
117
UTF8ToUTF8(unsigned char* out, int *outlen,
118
           const unsigned char* inb, int *inlenb, void *vctxt);
119
static int
120
latin1ToUTF8(unsigned char* out, int *outlen,
121
             const unsigned char* in, int *inlen, void *vctxt);
122
static int
123
UTF16LEToUTF8(unsigned char* out, int *outlen,
124
              const unsigned char* inb, int *inlenb, void *vctxt);
125
static int
126
UTF16BEToUTF8(unsigned char* out, int *outlen,
127
              const unsigned char* inb, int *inlenb, void *vctxt);
128
129
#ifdef LIBXML_OUTPUT_ENABLED
130
131
static int
132
UTF8ToLatin1(unsigned char* outb, int *outlen,
133
             const unsigned char* in, int *inlen, void *vctxt);
134
static int
135
UTF8ToUTF16(unsigned char* outb, int *outlen,
136
            const unsigned char* in, int *inlen, void *vctxt);
137
static int
138
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
139
              const unsigned char* in, int *inlen, void *vctxt);
140
static int
141
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
142
              const unsigned char* in, int *inlen, void *vctxt);
143
144
#else /* LIBXML_OUTPUT_ENABLED */
145
146
#define UTF8ToLatin1 NULL
147
#define UTF8ToUTF16 NULL
148
#define UTF8ToUTF16LE NULL
149
#define UTF8ToUTF16BE NULL
150
151
#endif /* LIBXML_OUTPUT_ENABLED */
152
153
#if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
154
static int
155
UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
156
                  const unsigned char *in, int *inlen, void *vctxt);
157
#else
158
#define UTF8ToHtmlWrapper NULL
159
#endif
160
161
#ifdef LIBXML_ICONV_ENABLED
162
  #define EMPTY_ICONV , (iconv_t) 0, (iconv_t) 0
163
#else
164
  #define EMPTY_ICONV
165
#endif
166
167
#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
168
    defined(LIBXML_ISO8859X_ENABLED)
169
170
#include "iso8859x.inc"
171
172
static int
173
ISO8859xToUTF8(unsigned char* out, int *outlen,
174
               const unsigned char* in, int *inlen, void *vctxt);
175
static int
176
UTF8ToISO8859x(unsigned char *out, int *outlen,
177
               const unsigned char *in, int *inlen, void *vctxt);
178
179
/*
 * Build a static handler entry for ISO-8859-<n> backed by the generated
 * conversion tables. The input slot decodes to UTF-8 and the output slot
 * encodes from UTF-8; each is cast to the matching function typedef,
 * mirroring MAKE_HANDLER.
 */
#define MAKE_ISO_HANDLER(name, n) \
    { (char *) name, \
      (xmlCharEncodingInputFunc) (void (*)(void)) ISO8859xToUTF8, \
      (xmlCharEncodingOutputFunc) (void (*)(void)) UTF8ToISO8859x \
      EMPTY_ICONV, \
      (void *) xmlunicodetable_ISO8859_##n, \
      (void *) xmltranscodetable_ISO8859_##n, \
      NULL, XML_HANDLER_STATIC }
187
188
#else /* LIBXML_ISO8859X_ENABLED */
189
190
#define MAKE_ISO_HANDLER(name, n) \
191
    { (char *) name, NULL, NULL EMPTY_ICONV, NULL, NULL, NULL, \
192
      XML_HANDLER_STATIC }
193
194
#endif /* LIBXML_ISO8859X_ENABLED */
195
196
#define MAKE_HANDLER(name, in, out) \
197
    { (char *) name, \
198
      (xmlCharEncodingInputFunc) (void (*)(void)) in, \
199
      (xmlCharEncodingOutputFunc) (void (*)(void)) out \
200
      EMPTY_ICONV, NULL, NULL, NULL, XML_HANDLER_STATIC }
201
202
/*
203
 * The layout must match enum xmlCharEncoding.
204
 *
205
 * Names should match the IANA registry if possible:
206
 * https://www.iana.org/assignments/character-sets/character-sets.xhtml
207
 */
208
static const xmlCharEncodingHandler defaultHandlers[31] = {
209
    MAKE_HANDLER(NULL, NULL, NULL), /* NONE */
210
    MAKE_HANDLER("UTF-8", UTF8ToUTF8, UTF8ToUTF8),
211
    MAKE_HANDLER("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE),
212
    MAKE_HANDLER("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE),
213
    MAKE_HANDLER("UCS-4LE", NULL, NULL),
214
    MAKE_HANDLER("UCS-4BE", NULL, NULL),
215
    MAKE_HANDLER("IBM037", NULL, NULL),
216
    MAKE_HANDLER("ISO-10646-UCS-4", NULL, NULL), /* UCS4_2143 */
217
    MAKE_HANDLER("ISO-10646-UCS-4", NULL, NULL), /* UCS4_3412 */
218
    MAKE_HANDLER("ISO-10646-UCS-2", NULL, NULL),
219
    MAKE_HANDLER("ISO-8859-1", latin1ToUTF8, UTF8ToLatin1),
220
    MAKE_ISO_HANDLER("ISO-8859-2", 2),
221
    MAKE_ISO_HANDLER("ISO-8859-3", 3),
222
    MAKE_ISO_HANDLER("ISO-8859-4", 4),
223
    MAKE_ISO_HANDLER("ISO-8859-5", 5),
224
    MAKE_ISO_HANDLER("ISO-8859-6", 6),
225
    MAKE_ISO_HANDLER("ISO-8859-7", 7),
226
    MAKE_ISO_HANDLER("ISO-8859-8", 8),
227
    MAKE_ISO_HANDLER("ISO-8859-9", 9),
228
    MAKE_HANDLER("ISO-2022-JP", NULL, NULL),
229
    MAKE_HANDLER("Shift_JIS", NULL, NULL),
230
    MAKE_HANDLER("EUC-JP", NULL, NULL),
231
    MAKE_HANDLER("US-ASCII", asciiToAscii, asciiToAscii),
232
    MAKE_HANDLER("UTF-16", UTF16LEToUTF8, UTF8ToUTF16),
233
    MAKE_HANDLER("HTML", NULL, UTF8ToHtmlWrapper),
234
    MAKE_ISO_HANDLER("ISO-8859-10", 10),
235
    MAKE_ISO_HANDLER("ISO-8859-11", 11),
236
    MAKE_ISO_HANDLER("ISO-8859-13", 13),
237
    MAKE_ISO_HANDLER("ISO-8859-14", 14),
238
    MAKE_ISO_HANDLER("ISO-8859-15", 15),
239
    MAKE_ISO_HANDLER("ISO-8859-16", 16),
240
};
241
242
#define NUM_DEFAULT_HANDLERS \
243
2.02k
    (sizeof(defaultHandlers) / sizeof(defaultHandlers[0]))
244
245
/* the size should be growable, but it's not a big deal ... */
246
0
#define MAX_ENCODING_HANDLERS 50
247
static xmlCharEncodingHandlerPtr *globalHandlers = NULL;
248
static int nbCharEncodingHandler = 0;
249
250
#ifdef LIBXML_ICONV_ENABLED
251
static int
252
xmlCharEncIconv(void *vctxt, const char *name, xmlCharEncConverter *conv);
253
#endif
254
255
#ifdef LIBXML_ICU_ENABLED
256
static int
257
xmlCharEncUconv(void *vctxt, const char *name, xmlCharEncConverter *conv);
258
#endif
259
260
/************************************************************************
261
 *                  *
262
 *    Generic encoding handling routines      *
263
 *                  *
264
 ************************************************************************/
265
266
/**
267
 * xmlDetectCharEncoding:
268
 * @in:  a pointer to the first bytes of the XML entity, must be at least
269
 *       2 bytes long (at least 4 if encoding is UTF4 variant).
270
 * @len:  pointer to the length of the buffer
271
 *
272
 * Guess the encoding of the entity using the first bytes of the entity content
273
 * according to the non-normative appendix F of the XML-1.0 recommendation.
274
 *
275
 * Returns one of the XML_CHAR_ENCODING_... values.
276
 */
277
xmlCharEncoding
278
xmlDetectCharEncoding(const unsigned char* in, int len)
279
0
{
280
0
    if (in == NULL)
281
0
        return(XML_CHAR_ENCODING_NONE);
282
0
    if (len >= 4) {
283
0
  if ((in[0] == 0x00) && (in[1] == 0x00) &&
284
0
      (in[2] == 0x00) && (in[3] == 0x3C))
285
0
      return(XML_CHAR_ENCODING_UCS4BE);
286
0
  if ((in[0] == 0x3C) && (in[1] == 0x00) &&
287
0
      (in[2] == 0x00) && (in[3] == 0x00))
288
0
      return(XML_CHAR_ENCODING_UCS4LE);
289
0
  if ((in[0] == 0x00) && (in[1] == 0x00) &&
290
0
      (in[2] == 0x3C) && (in[3] == 0x00))
291
0
      return(XML_CHAR_ENCODING_UCS4_2143);
292
0
  if ((in[0] == 0x00) && (in[1] == 0x3C) &&
293
0
      (in[2] == 0x00) && (in[3] == 0x00))
294
0
      return(XML_CHAR_ENCODING_UCS4_3412);
295
0
  if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
296
0
      (in[2] == 0xA7) && (in[3] == 0x94))
297
0
      return(XML_CHAR_ENCODING_EBCDIC);
298
0
  if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
299
0
      (in[2] == 0x78) && (in[3] == 0x6D))
300
0
      return(XML_CHAR_ENCODING_UTF8);
301
  /*
302
   * Although not part of the recommendation, we also
303
   * attempt an "auto-recognition" of UTF-16LE and
304
   * UTF-16BE encodings.
305
   */
306
0
  if ((in[0] == 0x3C) && (in[1] == 0x00) &&
307
0
      (in[2] == 0x3F) && (in[3] == 0x00))
308
0
      return(XML_CHAR_ENCODING_UTF16LE);
309
0
  if ((in[0] == 0x00) && (in[1] == 0x3C) &&
310
0
      (in[2] == 0x00) && (in[3] == 0x3F))
311
0
      return(XML_CHAR_ENCODING_UTF16BE);
312
0
    }
313
0
    if (len >= 3) {
314
  /*
315
   * Errata on XML-1.0 June 20 2001
316
   * We now allow an UTF8 encoded BOM
317
   */
318
0
  if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
319
0
      (in[2] == 0xBF))
320
0
      return(XML_CHAR_ENCODING_UTF8);
321
0
    }
322
    /* For UTF-16 we can recognize by the BOM */
323
0
    if (len >= 2) {
324
0
  if ((in[0] == 0xFE) && (in[1] == 0xFF))
325
0
      return(XML_CHAR_ENCODING_UTF16BE);
326
0
  if ((in[0] == 0xFF) && (in[1] == 0xFE))
327
0
      return(XML_CHAR_ENCODING_UTF16LE);
328
0
    }
329
0
    return(XML_CHAR_ENCODING_NONE);
330
0
}
331
332
/**
333
 * xmlCleanupEncodingAliases:
334
 *
335
 * DEPRECATED: This function modifies global state and is not
336
 * thread-safe.
337
 *
338
 * Unregisters all aliases
339
 */
340
void
341
9.89k
xmlCleanupEncodingAliases(void) {
342
9.89k
    int i;
343
344
9.89k
    if (xmlCharEncodingAliases == NULL)
345
9.89k
  return;
346
347
0
    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
348
0
  if (xmlCharEncodingAliases[i].name != NULL)
349
0
      xmlFree((char *) xmlCharEncodingAliases[i].name);
350
0
  if (xmlCharEncodingAliases[i].alias != NULL)
351
0
      xmlFree((char *) xmlCharEncodingAliases[i].alias);
352
0
    }
353
0
    xmlCharEncodingAliasesNb = 0;
354
0
    xmlCharEncodingAliasesMax = 0;
355
0
    xmlFree(xmlCharEncodingAliases);
356
0
    xmlCharEncodingAliases = NULL;
357
0
}
358
359
/**
360
 * xmlGetEncodingAlias:
361
 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
362
 *
363
 * DEPRECATED: This function is not thread-safe.
364
 *
365
 * Lookup an encoding name for the given alias.
366
 *
367
 * Returns NULL if not found, otherwise the original name
368
 */
369
const char *
370
2.89k
xmlGetEncodingAlias(const char *alias) {
371
2.89k
    int i;
372
2.89k
    char upper[100];
373
374
2.89k
    if (alias == NULL)
375
0
  return(NULL);
376
377
2.89k
    if (xmlCharEncodingAliases == NULL)
378
2.89k
  return(NULL);
379
380
0
    for (i = 0;i < 99;i++) {
381
0
        upper[i] = (char) toupper((unsigned char) alias[i]);
382
0
  if (upper[i] == 0) break;
383
0
    }
384
0
    upper[i] = 0;
385
386
    /*
387
     * Walk down the list looking for a definition of the alias
388
     */
389
0
    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
390
0
  if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
391
0
      return(xmlCharEncodingAliases[i].name);
392
0
  }
393
0
    }
394
0
    return(NULL);
395
0
}
396
397
/**
398
 * xmlAddEncodingAlias:
399
 * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
400
 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
401
 *
402
 * DEPRECATED: This function modifies global state and is not
403
 * thread-safe.
404
 *
405
 * Registers an alias @alias for an encoding named @name. Existing alias
406
 * will be overwritten.
407
 *
408
 * Returns 0 in case of success, -1 in case of error
409
 */
410
int
411
0
xmlAddEncodingAlias(const char *name, const char *alias) {
412
0
    int i;
413
0
    char upper[100];
414
0
    char *nameCopy, *aliasCopy;
415
416
0
    if ((name == NULL) || (alias == NULL))
417
0
  return(-1);
418
419
0
    for (i = 0;i < 99;i++) {
420
0
        upper[i] = (char) toupper((unsigned char) alias[i]);
421
0
  if (upper[i] == 0) break;
422
0
    }
423
0
    upper[i] = 0;
424
425
0
    if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
426
0
        xmlCharEncodingAliasPtr tmp;
427
0
        size_t newSize = xmlCharEncodingAliasesMax ?
428
0
                         xmlCharEncodingAliasesMax * 2 :
429
0
                         20;
430
431
0
        tmp = (xmlCharEncodingAliasPtr)
432
0
              xmlRealloc(xmlCharEncodingAliases,
433
0
                         newSize * sizeof(xmlCharEncodingAlias));
434
0
        if (tmp == NULL)
435
0
            return(-1);
436
0
        xmlCharEncodingAliases = tmp;
437
0
        xmlCharEncodingAliasesMax = newSize;
438
0
    }
439
440
    /*
441
     * Walk down the list looking for a definition of the alias
442
     */
443
0
    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
444
0
  if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
445
      /*
446
       * Replace the definition.
447
       */
448
0
      nameCopy = xmlMemStrdup(name);
449
0
            if (nameCopy == NULL)
450
0
                return(-1);
451
0
      xmlFree((char *) xmlCharEncodingAliases[i].name);
452
0
      xmlCharEncodingAliases[i].name = nameCopy;
453
0
      return(0);
454
0
  }
455
0
    }
456
    /*
457
     * Add the definition
458
     */
459
0
    nameCopy = xmlMemStrdup(name);
460
0
    if (nameCopy == NULL)
461
0
        return(-1);
462
0
    aliasCopy = xmlMemStrdup(upper);
463
0
    if (aliasCopy == NULL) {
464
0
        xmlFree(nameCopy);
465
0
        return(-1);
466
0
    }
467
0
    xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = nameCopy;
468
0
    xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = aliasCopy;
469
0
    xmlCharEncodingAliasesNb++;
470
0
    return(0);
471
0
}
472
473
/**
474
 * xmlDelEncodingAlias:
475
 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
476
 *
477
 * DEPRECATED: This function modifies global state and is not
478
 * thread-safe.
479
 *
480
 * Unregisters an encoding alias @alias
481
 *
482
 * Returns 0 in case of success, -1 in case of error
483
 */
484
int
485
0
xmlDelEncodingAlias(const char *alias) {
486
0
    int i;
487
488
0
    if (alias == NULL)
489
0
  return(-1);
490
491
0
    if (xmlCharEncodingAliases == NULL)
492
0
  return(-1);
493
    /*
494
     * Walk down the list looking for a definition of the alias
495
     */
496
0
    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
497
0
  if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
498
0
      xmlFree((char *) xmlCharEncodingAliases[i].name);
499
0
      xmlFree((char *) xmlCharEncodingAliases[i].alias);
500
0
      xmlCharEncodingAliasesNb--;
501
0
      memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
502
0
        sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
503
0
      return(0);
504
0
  }
505
0
    }
506
0
    return(-1);
507
0
}
508
509
static int
510
14.8k
xmlCompareEncTableEntries(const void *vkey, const void *ventry) {
511
14.8k
    const char *key = vkey;
512
14.8k
    const xmlEncTableEntry *entry = ventry;
513
514
14.8k
    return(xmlStrcasecmp(BAD_CAST key, BAD_CAST entry->name));
515
14.8k
}
516
517
static xmlCharEncoding
518
xmlParseCharEncodingInternal(const char *name)
519
2.89k
{
520
2.89k
    const xmlEncTableEntry *entry;
521
522
2.89k
    if (name == NULL)
523
0
       return(XML_CHAR_ENCODING_NONE);
524
525
2.89k
    entry = bsearch(name, xmlEncTable,
526
2.89k
                    sizeof(xmlEncTable) / sizeof(xmlEncTable[0]),
527
2.89k
                    sizeof(xmlEncTable[0]), xmlCompareEncTableEntries);
528
2.89k
    if (entry != NULL)
529
434
        return(entry->enc);
530
531
2.46k
    return(XML_CHAR_ENCODING_ERROR);
532
2.89k
}
533
534
/**
535
 * xmlParseCharEncoding:
536
 * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
537
 *
538
 * Compare the string to the encoding schemes already known. Note
539
 * that the comparison is case insensitive accordingly to the section
540
 * [XML] 4.3.3 Character Encoding in Entities.
541
 *
542
 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
543
 * if not recognized.
544
 */
545
xmlCharEncoding
546
xmlParseCharEncoding(const char *name)
547
0
{
548
0
    xmlCharEncoding enc = xmlParseCharEncodingInternal(name);
549
550
    /* Backward compatibility */
551
0
    if (enc == XML_CHAR_ENCODING_UTF16)
552
0
        enc = XML_CHAR_ENCODING_UTF16LE;
553
554
0
    return(enc);
555
0
}
556
557
/**
558
 * xmlGetCharEncodingName:
559
 * @enc:  the encoding
560
 *
561
 * The "canonical" name for XML encoding.
562
 * C.f. http://www.w3.org/TR/REC-xml#charencoding
563
 * Section 4.3.3  Character Encoding in Entities
564
 *
565
 * Returns the canonical name for the given encoding
566
 */
567
const char*
568
0
xmlGetCharEncodingName(xmlCharEncoding enc) {
569
0
    switch (enc) {
570
0
        case XML_CHAR_ENCODING_UTF16LE:
571
0
      return("UTF-16");
572
0
        case XML_CHAR_ENCODING_UTF16BE:
573
0
      return("UTF-16");
574
0
        case XML_CHAR_ENCODING_UCS4LE:
575
0
            return("ISO-10646-UCS-4");
576
0
        case XML_CHAR_ENCODING_UCS4BE:
577
0
            return("ISO-10646-UCS-4");
578
0
        default:
579
0
            break;
580
0
    }
581
582
0
    if ((enc <= 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS))
583
0
        return(NULL);
584
585
0
    return(defaultHandlers[enc].name);
586
0
}
587
588
/************************************************************************
589
 *                  *
590
 *      Char encoding handlers        *
591
 *                  *
592
 ************************************************************************/
593
594
/**
595
 * xmlNewCharEncodingHandler:
596
 * @name:  the encoding name, in UTF-8 format (ASCII actually)
597
 * @input:  the xmlCharEncodingInputFunc to read that encoding
598
 * @output:  the xmlCharEncodingOutputFunc to write that encoding
599
 *
600
 * DEPRECATED: This function modifies global state and is not
601
 * thread-safe.
602
 *
603
 * Create and registers an xmlCharEncodingHandler.
604
 *
605
 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
606
 */
607
xmlCharEncodingHandlerPtr
608
xmlNewCharEncodingHandler(const char *name,
609
                          xmlCharEncodingInputFunc input,
610
0
                          xmlCharEncodingOutputFunc output) {
611
0
    xmlCharEncodingHandlerPtr handler;
612
0
    const char *alias;
613
0
    char upper[500];
614
0
    int i;
615
0
    char *up = NULL;
616
617
    /*
618
     * Do the alias resolution
619
     */
620
0
    alias = xmlGetEncodingAlias(name);
621
0
    if (alias != NULL)
622
0
  name = alias;
623
624
    /*
625
     * Keep only the uppercase version of the encoding.
626
     */
627
0
    if (name == NULL)
628
0
  return(NULL);
629
0
    for (i = 0;i < 499;i++) {
630
0
        upper[i] = (char) toupper((unsigned char) name[i]);
631
0
  if (upper[i] == 0) break;
632
0
    }
633
0
    upper[i] = 0;
634
0
    up = xmlMemStrdup(upper);
635
0
    if (up == NULL)
636
0
  return(NULL);
637
638
    /*
639
     * allocate and fill-up an handler block.
640
     */
641
0
    handler = (xmlCharEncodingHandlerPtr)
642
0
              xmlMalloc(sizeof(xmlCharEncodingHandler));
643
0
    if (handler == NULL) {
644
0
        xmlFree(up);
645
0
  return(NULL);
646
0
    }
647
0
    memset(handler, 0, sizeof(xmlCharEncodingHandler));
648
0
    handler->input = input;
649
0
    handler->output = output;
650
0
    handler->name = up;
651
0
    handler->flags = XML_HANDLER_STATIC;
652
653
0
#ifdef LIBXML_ICONV_ENABLED
654
0
    handler->iconv_in = NULL;
655
0
    handler->iconv_out = NULL;
656
0
#endif
657
658
    /*
659
     * registers and returns the handler.
660
     */
661
0
    xmlRegisterCharEncodingHandler(handler);
662
0
    return(handler);
663
0
}
664
665
/**
 * xmlInitCharEncodingHandlers:
 *
 * DEPRECATED: Alias for xmlInitParser. Kept only so that existing
 * callers keep linking; it simply forwards to xmlInitParser().
 */
void
xmlInitCharEncodingHandlers(void)
{
    xmlInitParser();
}
674
675
/**
676
 * xmlInitEncodingInternal:
677
 *
678
 * Initialize the char encoding support.
679
 */
680
void
681
9.89k
xmlInitEncodingInternal(void) {
682
9.89k
    unsigned short int tst = 0x1234;
683
9.89k
    unsigned char *ptr = (unsigned char *) &tst;
684
685
9.89k
    if (*ptr == 0x12) xmlLittleEndian = 0;
686
9.89k
    else xmlLittleEndian = 1;
687
9.89k
}
688
689
/**
690
 * xmlCleanupCharEncodingHandlers:
691
 *
692
 * DEPRECATED: This function will be made private. Call xmlCleanupParser
693
 * to free global state but see the warnings there. xmlCleanupParser
694
 * should be only called once at program exit. In most cases, you don't
695
 * have call cleanup functions at all.
696
 *
697
 * Cleanup the memory allocated for the char encoding support, it
698
 * unregisters all the encoding handlers and the aliases.
699
 */
700
void
701
9.89k
xmlCleanupCharEncodingHandlers(void) {
702
9.89k
    xmlCleanupEncodingAliases();
703
704
9.89k
    if (globalHandlers == NULL) return;
705
706
0
    for (;nbCharEncodingHandler > 0;) {
707
0
        xmlCharEncodingHandler *handler;
708
709
0
        nbCharEncodingHandler--;
710
0
        handler = globalHandlers[nbCharEncodingHandler];
711
0
  if (handler != NULL) {
712
0
      if (handler->name != NULL)
713
0
    xmlFree(handler->name);
714
0
      xmlFree(handler);
715
0
  }
716
0
    }
717
0
    xmlFree(globalHandlers);
718
0
    globalHandlers = NULL;
719
0
    nbCharEncodingHandler = 0;
720
0
}
721
722
/**
723
 * xmlRegisterCharEncodingHandler:
724
 * @handler:  the xmlCharEncodingHandlerPtr handler block
725
 *
726
 * DEPRECATED: This function modifies global state and is not
727
 * thread-safe.
728
 *
729
 * Register the char encoding handler.
730
 */
731
void
732
0
xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
733
0
    if (handler == NULL)
734
0
        return;
735
0
    if (globalHandlers == NULL) {
736
0
        globalHandlers = xmlMalloc(
737
0
                MAX_ENCODING_HANDLERS * sizeof(globalHandlers[0]));
738
0
        if (globalHandlers == NULL)
739
0
            goto free_handler;
740
0
    }
741
742
0
    if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS)
743
0
        goto free_handler;
744
0
    globalHandlers[nbCharEncodingHandler++] = handler;
745
0
    return;
746
747
0
free_handler:
748
0
    if (handler != NULL) {
749
0
        if (handler->name != NULL) {
750
0
            xmlFree(handler->name);
751
0
        }
752
0
        xmlFree(handler);
753
0
    }
754
0
}
755
756
static int
757
xmlInvokeConvImpl(xmlCharEncConvImpl impl, void *implCtxt,
758
2.56k
                  const char *name, xmlCharEncodingHandler *handler) {
759
2.56k
    xmlCharEncConverter conv = { NULL, NULL, NULL, NULL, NULL };
760
2.56k
    int ret;
761
762
2.56k
    ret = impl(implCtxt, name, &conv);
763
764
2.56k
    if (ret == XML_ERR_OK) {
765
2.52k
        handler->input =
766
2.52k
            (xmlCharEncodingInputFunc) (void (*)(void)) conv.input;
767
2.52k
        handler->output =
768
2.52k
            (xmlCharEncodingOutputFunc) (void (*)(void)) conv.output;
769
2.52k
        handler->ctxtDtor = conv.ctxtDtor;
770
2.52k
        handler->inputCtxt = conv.inputCtxt;
771
2.52k
        handler->outputCtxt = conv.outputCtxt;
772
2.52k
    }
773
774
2.56k
    return(ret);
775
2.56k
}
776
777
/**
778
 * xmlFindExtraHandler:
779
 * @norig:  name of the char encoding
780
 * @name:  potentially aliased name of the encoding
781
 * @output:  boolean, use handler for output
782
 * @impl:  a conversion implementation (optional)
783
 * @implCtxt:  user data for conversion implementation (optional)
784
 * @out:  pointer to resulting handler
785
 *
786
 * Search the non-default handlers for an exact match.
787
 *
788
 * Returns an xmlParserErrors error code.
789
 */
790
static int
791
xmlFindExtraHandler(const char *norig, const char *name, int output,
792
                    xmlCharEncConvImpl impl, void *implCtxt,
793
2.56k
                    xmlCharEncodingHandler **out) {
794
2.56k
    xmlCharEncodingHandler *handler;
795
2.56k
    int ret;
796
2.56k
    int i;
797
798
2.56k
    handler = xmlMalloc(sizeof(*handler));
799
2.56k
    if (handler == NULL)
800
0
        return(XML_ERR_NO_MEMORY);
801
2.56k
    memset(handler, 0, sizeof(*handler));
802
803
2.56k
    handler->name = xmlMemStrdup(name);
804
2.56k
    if (handler->name == NULL) {
805
0
        ret = XML_ERR_NO_MEMORY;
806
0
        goto done;
807
0
    }
808
809
    /*
810
     * Try custom implementation before deprecated global handlers.
811
     *
812
     * Note that we pass the original name without deprecated
813
     * alias resolution.
814
     */
815
2.56k
    if (impl != NULL) {
816
0
        ret = xmlInvokeConvImpl(impl, implCtxt, norig, handler);
817
0
        if (ret != XML_ERR_OK)
818
0
            goto done;
819
820
0
        *out = handler;
821
0
        return(XML_ERR_OK);
822
0
    }
823
824
    /*
825
     * Deprecated
826
     */
827
2.56k
    if (globalHandlers != NULL) {
828
0
        for (i = 0; i < nbCharEncodingHandler; i++) {
829
0
            xmlCharEncodingHandler *h = globalHandlers[i];
830
831
0
            if (!xmlStrcasecmp((const xmlChar *) name,
832
0
                               (const xmlChar *) h->name)) {
833
0
                if ((output ? h->output : h->input) != NULL) {
834
0
                    *out = h;
835
0
                    ret = XML_ERR_OK;
836
0
                    goto done;
837
0
                }
838
0
            }
839
0
        }
840
0
    }
841
842
2.56k
#ifdef LIBXML_ICONV_ENABLED
843
2.56k
    ret = xmlInvokeConvImpl(xmlCharEncIconv, handler, name, handler);
844
2.56k
    if (ret == XML_ERR_OK) {
845
2.52k
        *out = handler;
846
2.52k
        return(XML_ERR_OK);
847
2.52k
    }
848
49
    if (ret != XML_ERR_UNSUPPORTED_ENCODING)
849
0
        goto done;
850
49
#endif /* LIBXML_ICONV_ENABLED */
851
852
#ifdef LIBXML_ICU_ENABLED
853
    ret = xmlInvokeConvImpl(xmlCharEncUconv, handler, name, handler);
854
    if (ret == XML_ERR_OK) {
855
        *out = handler;
856
        return(XML_ERR_OK);
857
    }
858
    if (ret != XML_ERR_UNSUPPORTED_ENCODING)
859
        goto done;
860
#endif /* LIBXML_ICU_ENABLED */
861
862
49
    ret = XML_ERR_UNSUPPORTED_ENCODING;
863
864
49
done:
865
49
    if (handler != NULL) {
866
49
        xmlFree(handler->name);
867
49
        xmlFree(handler);
868
49
    }
869
870
49
    return(ret);
871
49
}
872
873
/**
874
 * xmlLookupCharEncodingHandler:
875
 * @enc:  an xmlCharEncoding value.
876
 * @out:  pointer to result
877
 *
878
 * Find or create a handler matching the encoding. The following
879
 * converters are looked up in order:
880
 *
881
 * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
882
 * - User-registered global handler (deprecated)
883
 * - iconv if enabled
884
 * - ICU if enabled
885
 *
886
 * The handler must be closed with xmlCharEncCloseFunc.
887
 *
888
 * If the encoding is UTF-8, a NULL handler and no error code will
889
 * be returned.
890
 *
891
 * Available since 2.13.0.
892
 *
893
 * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
894
 * xmlParserErrors error code.
895
 */
896
int
897
xmlLookupCharEncodingHandler(xmlCharEncoding enc,
898
1.67k
                             xmlCharEncodingHandler **out) {
899
1.67k
    const xmlCharEncodingHandler *handler;
900
901
1.67k
    if (out == NULL)
902
0
        return(XML_ERR_ARGUMENT);
903
1.67k
    *out = NULL;
904
905
1.67k
    if ((enc <= 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS))
906
0
        return(XML_ERR_UNSUPPORTED_ENCODING);
907
908
    /* Return NULL handler for UTF-8 */
909
1.67k
    if ((enc == XML_CHAR_ENCODING_UTF8) ||
910
1.67k
        (enc == XML_CHAR_ENCODING_NONE))
911
1.01k
        return(XML_ERR_OK);
912
913
664
    handler = &defaultHandlers[enc];
914
664
    if ((handler->input != NULL) || (handler->output != NULL)) {
915
645
        *out = (xmlCharEncodingHandler *) handler;
916
645
        return(XML_ERR_OK);
917
645
    }
918
919
19
    if (handler->name != NULL)
920
19
        return(xmlFindExtraHandler(handler->name, handler->name, 0,
921
19
                                   NULL, NULL, out));
922
923
0
    return(XML_ERR_UNSUPPORTED_ENCODING);
924
19
}
925
926
/**
927
 * xmlGetCharEncodingHandler:
928
 * @enc:  an xmlCharEncoding value.
929
 *
930
 * DEPRECATED: Use xmlLookupCharEncodingHandler which has better error
931
 * reporting.
932
 *
933
 * Returns the handler or NULL if no handler was found or an error
934
 * occurred.
935
 */
936
xmlCharEncodingHandlerPtr
937
0
xmlGetCharEncodingHandler(xmlCharEncoding enc) {
938
0
    xmlCharEncodingHandler *ret;
939
940
0
    xmlLookupCharEncodingHandler(enc, &ret);
941
0
    return(ret);
942
0
}
943
944
/**
945
 * xmlCreateCharEncodingHandler:
946
 * @name:  a string describing the char encoding.
947
 * @output:  boolean, use handler for output
948
 * @impl:  a conversion implementation (optional)
949
 * @implCtxt:  user data for conversion implementation (optional)
950
 * @out:  pointer to result
951
 *
952
 * Find or create a handler matching the encoding. The following
953
 * converters are looked up in order:
954
 *
955
 * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
956
 * - Custom implementation if provided
957
 * - User-registered global handler (deprecated)
958
 * - iconv if enabled
959
 * - ICU if enabled
960
 *
961
 * The handler must be closed with xmlCharEncCloseFunc.
962
 *
963
 * If the encoding is UTF-8, a NULL handler and no error code will
964
 * be returned.
965
 *
966
 * Available since 2.14.0.
967
 *
968
 * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
969
 * xmlParserErrors error code.
970
 */
971
int
972
xmlCreateCharEncodingHandler(const char *name, int output,
973
                             xmlCharEncConvImpl impl, void *implCtxt,
974
2.89k
                             xmlCharEncodingHandler **out) {
975
2.89k
    const xmlCharEncodingHandler *handler;
976
2.89k
    const char *norig, *nalias;
977
2.89k
    xmlCharEncoding enc;
978
979
2.89k
    if (out == NULL)
980
0
        return(XML_ERR_ARGUMENT);
981
2.89k
    *out = NULL;
982
983
2.89k
    if (name == NULL)
984
0
        return(XML_ERR_ARGUMENT);
985
986
2.89k
    norig = name;
987
2.89k
    nalias = xmlGetEncodingAlias(name);
988
2.89k
    if (nalias != NULL)
989
0
  name = nalias;
990
991
2.89k
    enc = xmlParseCharEncodingInternal(name);
992
993
    /* Return NULL handler for UTF-8 */
994
2.89k
    if (enc == XML_CHAR_ENCODING_UTF8)
995
87
        return(XML_ERR_OK);
996
997
2.81k
    if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) {
998
347
        handler = &defaultHandlers[enc];
999
347
        if ((output ? handler->output : handler->input) != NULL) {
1000
261
            *out = (xmlCharEncodingHandler *) handler;
1001
261
            return(XML_ERR_OK);
1002
261
        }
1003
347
    }
1004
1005
2.55k
    return(xmlFindExtraHandler(norig, name, output, impl, implCtxt, out));
1006
2.81k
}
1007
1008
/**
1009
 * xmlOpenCharEncodingHandler:
1010
 * @name:  a string describing the char encoding.
1011
 * @output:  boolean, use handler for output
1012
 * @out:  pointer to result
1013
 *
1014
 * Find or create a handler matching the encoding. The following
1015
 * converters are looked up in order:
1016
 *
1017
 * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
1018
 * - User-registered global handler (deprecated)
1019
 * - iconv if enabled
1020
 * - ICU if enabled
1021
 *
1022
 * The handler must be closed with xmlCharEncCloseFunc.
1023
 *
1024
 * If the encoding is UTF-8, a NULL handler and no error code will
1025
 * be returned.
1026
 *
1027
 * Available since 2.13.0.
1028
 *
1029
 * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
1030
 * xmlParserErrors error code.
1031
 */
1032
int
1033
xmlOpenCharEncodingHandler(const char *name, int output,
1034
0
                           xmlCharEncodingHandler **out) {
1035
0
    return(xmlCreateCharEncodingHandler(name, output, NULL, NULL, out));
1036
0
}
1037
1038
/**
1039
 * xmlFindCharEncodingHandler:
1040
 * @name:  a string describing the char encoding.
1041
 *
1042
 * DEPRECATED: Use xmlOpenCharEncodingHandler which has better error
1043
 * reporting.
1044
 *
1045
 * If the encoding is UTF-8, this will return a no-op handler that
1046
 * shouldn't be used.
1047
 *
1048
 * Returns the handler or NULL if no handler was found or an error
1049
 * occurred.
1050
 */
1051
xmlCharEncodingHandlerPtr
1052
0
xmlFindCharEncodingHandler(const char *name) {
1053
0
    xmlCharEncodingHandler *ret;
1054
1055
    /*
1056
     * This handler shouldn't be used, but we must return a non-NULL
1057
     * handler.
1058
     */
1059
0
    if ((xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF-8") == 0) ||
1060
0
        (xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF8") == 0))
1061
0
        return((xmlCharEncodingHandlerPtr)
1062
0
                &defaultHandlers[XML_CHAR_ENCODING_UTF8]);
1063
1064
0
    xmlOpenCharEncodingHandler(name, 0, &ret);
1065
0
    return(ret);
1066
0
}
1067
1068
/************************************************************************
1069
 *                  *
1070
 *    ICONV based generic conversion functions    *
1071
 *                  *
1072
 ************************************************************************/
1073
1074
#ifdef LIBXML_ICONV_ENABLED
1075
/*
 * Per-direction iconv state. One instance is allocated for decoding
 * to UTF-8 and another one for encoding from UTF-8.
 */
typedef struct {
    iconv_t cd; /* conversion descriptor, (iconv_t) -1 while not open */
} xmlIconvCtxt;
1078
1079
/**
1080
 * xmlIconvConvert:
1081
 * @vctxt:  conversion context
1082
 * @out:  a pointer to an array of bytes to store the result
1083
 * @outlen:  the length of @out
1084
 * @in:  a pointer to an array of input bytes
1085
 * @inlen:  the length of @in
1086
 *
1087
 * Returns an XML_ENC_ERR code.
1088
 *
1089
 * The value of @inlen after return is the number of octets consumed
1090
 *     as the return value is positive, else unpredictable.
1091
 * The value of @outlen after return is the number of octets produced.
1092
 */
1093
static int
1094
xmlIconvConvert(unsigned char *out, int *outlen,
1095
42.8k
                const unsigned char *in, int *inlen, void *vctxt) {
1096
42.8k
    xmlIconvCtxt *ctxt = vctxt;
1097
42.8k
    size_t icv_inlen, icv_outlen;
1098
42.8k
    const char *icv_in = (const char *) in;
1099
42.8k
    char *icv_out = (char *) out;
1100
42.8k
    size_t ret;
1101
1102
42.8k
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
1103
0
        if (outlen != NULL) *outlen = 0;
1104
0
        return(XML_ENC_ERR_INTERNAL);
1105
0
    }
1106
42.8k
    icv_inlen = *inlen;
1107
42.8k
    icv_outlen = *outlen;
1108
    /*
1109
     * Some versions take const, other versions take non-const input.
1110
     */
1111
42.8k
    ret = iconv(ctxt->cd, (void *) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
1112
42.8k
    *inlen -= icv_inlen;
1113
42.8k
    *outlen -= icv_outlen;
1114
42.8k
    if (ret == (size_t) -1) {
1115
7.87k
        if (errno == EILSEQ)
1116
228
            return(XML_ENC_ERR_INPUT);
1117
7.64k
        if (errno == E2BIG)
1118
6.33k
            return(XML_ENC_ERR_SPACE);
1119
        /*
1120
         * EINVAL means a truncated multi-byte sequence at the end
1121
         * of the input buffer. We treat this as success.
1122
         */
1123
1.30k
        if (errno == EINVAL)
1124
1.30k
            return(XML_ENC_ERR_SUCCESS);
1125
0
        return(XML_ENC_ERR_INTERNAL);
1126
1.30k
    }
1127
34.9k
    return(XML_ENC_ERR_SUCCESS);
1128
42.8k
}
1129
1130
static void
1131
5.08k
xmlIconvFree(void *vctxt) {
1132
5.08k
    xmlIconvCtxt *ctxt = vctxt;
1133
1134
5.08k
    if (ctxt->cd != (iconv_t) -1)
1135
5.04k
        iconv_close(ctxt->cd);
1136
1137
5.08k
    xmlFree(ctxt);
1138
5.08k
}
1139
1140
static int
1141
2.56k
xmlCharEncIconv(void *vctxt, const char *name, xmlCharEncConverter *conv) {
1142
2.56k
    xmlCharEncodingHandler *handler = vctxt;
1143
2.56k
    xmlIconvCtxt *inputCtxt = NULL, *outputCtxt = NULL;
1144
2.56k
    iconv_t icv_in;
1145
2.56k
    iconv_t icv_out;
1146
2.56k
    int ret;
1147
1148
2.56k
    inputCtxt = xmlMalloc(sizeof(xmlIconvCtxt));
1149
2.56k
    if (inputCtxt == NULL) {
1150
0
        ret = XML_ERR_NO_MEMORY;
1151
0
        goto error;
1152
0
    }
1153
2.56k
    inputCtxt->cd = (iconv_t) -1;
1154
1155
2.56k
    icv_in = iconv_open("UTF-8", name);
1156
2.56k
    if (icv_in == (iconv_t) -1) {
1157
49
        if (errno == EINVAL)
1158
49
            ret = XML_ERR_UNSUPPORTED_ENCODING;
1159
0
        else if (errno == ENOMEM)
1160
0
            ret = XML_ERR_NO_MEMORY;
1161
0
        else
1162
0
            ret = XML_ERR_SYSTEM;
1163
49
        goto error;
1164
49
    }
1165
2.52k
    inputCtxt->cd = icv_in;
1166
1167
2.52k
    outputCtxt = xmlMalloc(sizeof(xmlIconvCtxt));
1168
2.52k
    if (outputCtxt == NULL) {
1169
0
        ret = XML_ERR_NO_MEMORY;
1170
0
        goto error;
1171
0
    }
1172
2.52k
    outputCtxt->cd = (iconv_t) -1;
1173
1174
2.52k
    icv_out = iconv_open(name, "UTF-8");
1175
2.52k
    if (icv_out == (iconv_t) -1) {
1176
0
        if (errno == EINVAL)
1177
0
            ret = XML_ERR_UNSUPPORTED_ENCODING;
1178
0
        else if (errno == ENOMEM)
1179
0
            ret = XML_ERR_NO_MEMORY;
1180
0
        else
1181
0
            ret = XML_ERR_SYSTEM;
1182
0
        goto error;
1183
0
    }
1184
2.52k
    outputCtxt->cd = icv_out;
1185
1186
2.52k
    conv->input = xmlIconvConvert;
1187
2.52k
    conv->output = xmlIconvConvert;
1188
2.52k
    conv->ctxtDtor = xmlIconvFree;
1189
2.52k
    conv->inputCtxt = inputCtxt;
1190
2.52k
    conv->outputCtxt = outputCtxt;
1191
1192
    /* Backward compatibility */
1193
2.52k
    if (handler != NULL) {
1194
2.52k
        handler->iconv_in = icv_in;
1195
2.52k
        handler->iconv_out = icv_out;
1196
2.52k
    }
1197
1198
2.52k
    return(XML_ERR_OK);
1199
1200
49
error:
1201
49
    if (inputCtxt != NULL)
1202
49
        xmlIconvFree(inputCtxt);
1203
49
    if (outputCtxt != NULL)
1204
0
        xmlIconvFree(outputCtxt);
1205
49
    return(ret);
1206
2.52k
}
1207
#endif /* LIBXML_ICONV_ENABLED */
1208
1209
/************************************************************************
1210
 *                  *
1211
 *    ICU based generic conversion functions    *
1212
 *                  *
1213
 ************************************************************************/
1214
1215
#ifdef LIBXML_ICU_ENABLED
1216
/* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */
1217
#define ICU_PIVOT_BUF_SIZE 1024
1218
1219
typedef struct _uconv_t xmlUconvCtxt;
1220
struct _uconv_t {
1221
  UConverter *uconv; /* for conversion between an encoding and UTF-16 */
1222
  UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
1223
  UChar      *pivot_source;
1224
  UChar      *pivot_target;
1225
  int        isInput;
1226
  UChar      pivot_buf[ICU_PIVOT_BUF_SIZE];
1227
};
1228
1229
/**
1230
 * xmlUconvConvert:
1231
 * @vctxt:  converison context
1232
 * @out:  a pointer to an array of bytes to store the result
1233
 * @outlen:  the length of @out
1234
 * @in:  a pointer to an array of input bytes
1235
 * @inlen:  the length of @in
1236
 *
1237
 * Returns an XML_ENC_ERR code.
1238
 *
1239
 * The value of @inlen after return is the number of octets consumed
1240
 *     as the return value is positive, else unpredictable.
1241
 * The value of @outlen after return is the number of octets produced.
1242
 */
1243
static int
1244
xmlUconvConvert(unsigned char *out, int *outlen,
1245
                const unsigned char *in, int *inlen, void *vctxt) {
1246
    xmlUconvCtxt *cd = vctxt;
1247
    const char *ucv_in = (const char *) in;
1248
    char *ucv_out = (char *) out;
1249
    UConverter *target, *source;
1250
    UErrorCode err = U_ZERO_ERROR;
1251
    int ret;
1252
1253
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
1254
        if (outlen != NULL)
1255
            *outlen = 0;
1256
        return(XML_ENC_ERR_INTERNAL);
1257
    }
1258
1259
    /*
1260
     * Note that the ICU API is stateful. It can always consume a certain
1261
     * amount of input even if the output buffer would overflow. The
1262
     * remaining input must be processed by calling ucnv_convertEx with a
1263
     * possibly empty input buffer.
1264
     *
1265
     * ucnv_convertEx is always called with reset and flush set to 0,
1266
     * so we don't mess up the state. This should never generate
1267
     * U_TRUNCATED_CHAR_FOUND errors.
1268
     */
1269
    if (cd->isInput) {
1270
        source = cd->uconv;
1271
        target = cd->utf8;
1272
    } else {
1273
        source = cd->utf8;
1274
        target = cd->uconv;
1275
    }
1276
1277
    ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen,
1278
                   &ucv_in, ucv_in + *inlen, cd->pivot_buf,
1279
                   &cd->pivot_source, &cd->pivot_target,
1280
                   cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
1281
1282
    *inlen = ucv_in - (const char*) in;
1283
    *outlen = ucv_out - (char *) out;
1284
1285
    if (U_SUCCESS(err)) {
1286
        ret = XML_ENC_ERR_SUCCESS;
1287
    } else {
1288
        switch (err) {
1289
            case U_TRUNCATED_CHAR_FOUND:
1290
                /* Shouldn't happen without flush */
1291
                ret = XML_ENC_ERR_SUCCESS;
1292
                break;
1293
1294
            case U_BUFFER_OVERFLOW_ERROR:
1295
                ret = XML_ENC_ERR_SPACE;
1296
                break;
1297
1298
            case U_INVALID_CHAR_FOUND:
1299
            case U_ILLEGAL_CHAR_FOUND:
1300
                ret = XML_ENC_ERR_INPUT;
1301
                break;
1302
1303
            case U_MEMORY_ALLOCATION_ERROR:
1304
                ret = XML_ERR_NO_MEMORY;
1305
                break;
1306
1307
            default:
1308
                ret = XML_ENC_ERR_INTERNAL;
1309
                break;
1310
        }
1311
    }
1312
1313
    return(ret);
1314
}
1315
1316
static int
1317
openIcuConverter(const char* name, int isInput, xmlUconvCtxt **out)
1318
{
1319
    UErrorCode status;
1320
    xmlUconvCtxt *conv;
1321
1322
    *out = NULL;
1323
1324
    conv = (xmlUconvCtxt *) xmlMalloc(sizeof(xmlUconvCtxt));
1325
    if (conv == NULL)
1326
        return(XML_ERR_NO_MEMORY);
1327
1328
    conv->isInput = isInput;
1329
    conv->pivot_source = conv->pivot_buf;
1330
    conv->pivot_target = conv->pivot_buf;
1331
1332
    status = U_ZERO_ERROR;
1333
    conv->uconv = ucnv_open(name, &status);
1334
    if (U_FAILURE(status))
1335
        goto error;
1336
1337
    status = U_ZERO_ERROR;
1338
    if (isInput) {
1339
        ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP,
1340
                                                NULL, NULL, NULL, &status);
1341
    }
1342
    else {
1343
        ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP,
1344
                                                NULL, NULL, NULL, &status);
1345
    }
1346
    if (U_FAILURE(status))
1347
        goto error;
1348
1349
    status = U_ZERO_ERROR;
1350
    conv->utf8 = ucnv_open("UTF-8", &status);
1351
    if (U_FAILURE(status))
1352
        goto error;
1353
1354
    *out = conv;
1355
    return(0);
1356
1357
error:
1358
    if (conv->uconv)
1359
        ucnv_close(conv->uconv);
1360
    xmlFree(conv);
1361
1362
    if (status == U_FILE_ACCESS_ERROR)
1363
        return(XML_ERR_UNSUPPORTED_ENCODING);
1364
    if (status == U_MEMORY_ALLOCATION_ERROR)
1365
        return(XML_ERR_NO_MEMORY);
1366
    return(XML_ERR_SYSTEM);
1367
}
1368
1369
static void
1370
closeIcuConverter(xmlUconvCtxt *conv)
1371
{
1372
    if (conv == NULL)
1373
        return;
1374
    ucnv_close(conv->uconv);
1375
    ucnv_close(conv->utf8);
1376
    xmlFree(conv);
1377
}
1378
1379
static void
1380
xmlUconvFree(void *vctxt) {
1381
    closeIcuConverter(vctxt);
1382
}
1383
1384
static int
1385
xmlCharEncUconv(void *vctxt ATTRIBUTE_UNUSED, const char *name,
1386
                xmlCharEncConverter *conv) {
1387
    xmlUconvCtxt *ucv_in = NULL;
1388
    xmlUconvCtxt *ucv_out = NULL;
1389
    int ret;
1390
1391
    ret = openIcuConverter(name, 1, &ucv_in);
1392
    if (ret != 0)
1393
        goto error;
1394
    ret = openIcuConverter(name, 0, &ucv_out);
1395
    if (ret != 0)
1396
        goto error;
1397
1398
    conv->input = xmlUconvConvert;
1399
    conv->output = xmlUconvConvert;
1400
    conv->ctxtDtor = xmlUconvFree;
1401
    conv->inputCtxt = ucv_in;
1402
    conv->outputCtxt = ucv_out;
1403
1404
    return(XML_ERR_OK);
1405
1406
error:
1407
    if (ucv_in != NULL)
1408
        closeIcuConverter(ucv_in);
1409
    if (ucv_out != NULL)
1410
        closeIcuConverter(ucv_out);
1411
    return(ret);
1412
}
1413
#endif /* LIBXML_ICU_ENABLED */
1414
1415
/************************************************************************
1416
 *                  *
1417
 *    The real API used by libxml for on-the-fly conversion *
1418
 *                  *
1419
 ************************************************************************/
1420
1421
/**
1422
 * xmlEncConvertError:
1423
 * @code:  XML_ENC_ERR code
1424
 *
1425
 * Convert XML_ENC_ERR to libxml2 error codes.
1426
 */
1427
static int
1428
269
xmlEncConvertError(int code) {
1429
269
    int ret;
1430
1431
269
    switch (code) {
1432
0
        case XML_ENC_ERR_SUCCESS:
1433
0
            ret = XML_ERR_OK;
1434
0
            break;
1435
269
        case XML_ENC_ERR_INPUT:
1436
269
            ret = XML_ERR_INVALID_ENCODING;
1437
269
            break;
1438
0
        case XML_ENC_ERR_MEMORY:
1439
0
            ret = XML_ERR_NO_MEMORY;
1440
0
            break;
1441
0
        default:
1442
0
            ret = XML_ERR_INTERNAL_ERROR;
1443
0
            break;
1444
269
    }
1445
1446
269
    return(ret);
1447
269
}
1448
1449
/**
1450
 * xmlEncInputChunk:
1451
 * @handler:  encoding handler
1452
 * @out:  a pointer to an array of bytes to store the result
1453
 * @outlen:  the length of @out
1454
 * @in:  a pointer to an array of input bytes
1455
 * @inlen:  the length of @in
1456
 *
1457
 * The value of @inlen after return is the number of octets consumed
1458
 *     as the return value is 0, else unpredictable.
1459
 * The value of @outlen after return is the number of octets produced.
1460
 *
1461
 * Returns an XML_ENC_ERR code.
1462
 */
1463
int
1464
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
1465
66.8k
                 int *outlen, const unsigned char *in, int *inlen) {
1466
66.8k
    int ret;
1467
1468
66.8k
    if (handler->input != NULL) {
1469
66.8k
        xmlCharEncConvFunc conv =
1470
66.8k
            (xmlCharEncConvFunc) (void (*)(void)) handler->input;
1471
1472
66.8k
        ret = conv(out, outlen, in, inlen, handler->inputCtxt);
1473
66.8k
        if (ret > 0)
1474
6.84k
            ret = XML_ENC_ERR_SUCCESS;
1475
66.8k
    }
1476
0
    else {
1477
0
        *outlen = 0;
1478
0
        *inlen = 0;
1479
0
        ret = XML_ENC_ERR_INTERNAL;
1480
0
    }
1481
1482
66.8k
    return(ret);
1483
66.8k
}
1484
1485
/**
1486
 * xmlEncOutputChunk:
1487
 * @handler:  encoding handler
1488
 * @out:  a pointer to an array of bytes to store the result
1489
 * @outlen:  the length of @out
1490
 * @in:  a pointer to an array of input bytes
1491
 * @inlen:  the length of @in
1492
 *
1493
 * Returns an XML_ENC_ERR code.
1494
 *
1495
 * The value of @inlen after return is the number of octets consumed
1496
 *     as the return value is 0, else unpredictable.
1497
 * The value of @outlen after return is the number of octets produced.
1498
 */
1499
static int
1500
xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
1501
0
                  int *outlen, const unsigned char *in, int *inlen) {
1502
0
    int ret;
1503
1504
0
    if (handler->output != NULL) {
1505
0
        xmlCharEncConvFunc conv =
1506
0
            (xmlCharEncConvFunc) (void (*)(void)) handler->output;
1507
1508
0
        ret = conv(out, outlen, in, inlen, handler->outputCtxt);
1509
0
        if (ret > 0)
1510
0
            ret = XML_ENC_ERR_SUCCESS;
1511
0
    }
1512
0
    else {
1513
0
        *outlen = 0;
1514
0
        *inlen = 0;
1515
0
        ret = XML_ENC_ERR_INTERNAL;
1516
0
    }
1517
1518
0
    return(ret);
1519
0
}
1520
1521
/**
1522
 * xmlCharEncFirstLine:
1523
 * @handler:   char encoding transformation data structure
1524
 * @out:  an xmlBuffer for the output.
1525
 * @in:  an xmlBuffer for the input
1526
 *
1527
 * DEPERECATED: Don't use.
1528
 *
1529
 * Returns the number of bytes written or an XML_ENC_ERR code.
1530
 */
1531
int
1532
xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1533
0
                    xmlBufferPtr in) {
1534
0
    return(xmlCharEncInFunc(handler, out, in));
1535
0
}
1536
1537
/**
1538
 * xmlCharEncInput:
1539
 * @input: a parser input buffer
1540
 * @sizeOut:  pointer to output size
1541
 *
1542
 * @sizeOut should be set to the maximum output size (or SIZE_MAX).
1543
 * After return, it is set to the number of bytes written.
1544
 *
1545
 * Generic front-end for the encoding handler on parser input
1546
 *
1547
 * Returns an XML_ENC_ERR code.
1548
 */
1549
int
1550
xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut)
1551
258k
{
1552
258k
    xmlBufPtr out, in;
1553
258k
    const xmlChar *dataIn;
1554
258k
    size_t availIn;
1555
258k
    size_t maxOut;
1556
258k
    size_t totalIn, totalOut;
1557
258k
    int ret;
1558
1559
258k
    out = input->buffer;
1560
258k
    in = input->raw;
1561
1562
258k
    maxOut = *sizeOut;
1563
258k
    totalOut = 0;
1564
1565
258k
    *sizeOut = 0;
1566
1567
258k
    availIn = xmlBufUse(in);
1568
258k
    if (availIn == 0)
1569
200k
        return(0);
1570
57.8k
    dataIn = xmlBufContent(in);
1571
57.8k
    totalIn = 0;
1572
1573
66.4k
    while (1) {
1574
66.4k
        size_t availOut;
1575
66.4k
        int completeOut, completeIn;
1576
66.4k
        int c_out, c_in;
1577
1578
66.4k
        availOut = xmlBufAvail(out);
1579
66.4k
        if (availOut > INT_MAX / 2)
1580
0
            availOut = INT_MAX / 2;
1581
1582
66.4k
        if (availOut < maxOut) {
1583
63.4k
            c_out = availOut;
1584
63.4k
            completeOut = 0;
1585
63.4k
        } else {
1586
3.08k
            c_out = maxOut;
1587
3.08k
            completeOut = 1;
1588
3.08k
        }
1589
1590
66.4k
        if (availIn > INT_MAX / 2) {
1591
0
            c_in = INT_MAX / 2;
1592
0
            completeIn = 0;
1593
66.4k
        } else {
1594
66.4k
            c_in = availIn;
1595
66.4k
            completeIn = 1;
1596
66.4k
        }
1597
1598
66.4k
        ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
1599
66.4k
                               dataIn, &c_in);
1600
1601
66.4k
        totalIn += c_in;
1602
66.4k
        dataIn += c_in;
1603
66.4k
        availIn -= c_in;
1604
1605
66.4k
        totalOut += c_out;
1606
66.4k
        maxOut -= c_out;
1607
66.4k
        xmlBufAddLen(out, c_out);
1608
1609
66.4k
        if ((ret != XML_ENC_ERR_SUCCESS) && (ret != XML_ENC_ERR_SPACE)) {
1610
269
            input->error = xmlEncConvertError(ret);
1611
269
            return(ret);
1612
269
        }
1613
1614
66.2k
        if ((completeOut) && (completeIn))
1615
3.00k
            break;
1616
63.2k
        if ((completeOut) && (ret == XML_ENC_ERR_SPACE))
1617
0
            break;
1618
63.2k
        if ((completeIn) && (ret == XML_ENC_ERR_SUCCESS))
1619
54.5k
            break;
1620
1621
8.65k
        if (ret == XML_ENC_ERR_SPACE) {
1622
8.65k
            if (xmlBufGrow(out, 4096) < 0) {
1623
0
                input->error = XML_ERR_NO_MEMORY;
1624
0
                return(XML_ENC_ERR_MEMORY);
1625
0
            }
1626
8.65k
        }
1627
8.65k
    }
1628
1629
57.5k
    xmlBufShrink(in, totalIn);
1630
1631
57.5k
    if (input->rawconsumed > ULONG_MAX - (unsigned long) totalIn)
1632
0
        input->rawconsumed = ULONG_MAX;
1633
57.5k
    else
1634
57.5k
        input->rawconsumed += totalIn;
1635
1636
57.5k
    *sizeOut = totalOut;
1637
57.5k
    return(XML_ERR_OK);
1638
57.8k
}
1639
1640
/**
1641
 * xmlCharEncInFunc:
1642
 * @handler:  char encoding transformation data structure
1643
 * @out:  an xmlBuffer for the output.
1644
 * @in:  an xmlBuffer for the input
1645
 *
1646
 * Generic front-end for the encoding handler input function
1647
 *
1648
 * Returns the number of bytes written or an XML_ENC_ERR code.
1649
 */
1650
int
1651
xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
1652
                 xmlBufferPtr in)
1653
0
{
1654
0
    int ret;
1655
0
    int written;
1656
0
    int toconv;
1657
1658
0
    if (handler == NULL)
1659
0
        return(XML_ENC_ERR_INTERNAL);
1660
0
    if (out == NULL)
1661
0
        return(XML_ENC_ERR_INTERNAL);
1662
0
    if (in == NULL)
1663
0
        return(XML_ENC_ERR_INTERNAL);
1664
1665
0
    toconv = in->use;
1666
0
    if (toconv == 0)
1667
0
        return (0);
1668
0
    written = out->size - out->use -1; /* count '\0' */
1669
0
    if (toconv * 2 >= written) {
1670
0
        xmlBufferGrow(out, out->size + toconv * 2);
1671
0
        written = out->size - out->use - 1;
1672
0
    }
1673
0
    ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
1674
0
                           in->content, &toconv);
1675
0
    xmlBufferShrink(in, toconv);
1676
0
    out->use += written;
1677
0
    out->content[out->use] = 0;
1678
1679
0
    return (written? written : ret);
1680
0
}
1681
1682
#ifdef LIBXML_OUTPUT_ENABLED
1683
/**
1684
 * xmlCharEncOutput:
1685
 * @output: a parser output buffer
1686
 * @init: is this an initialization call without data
1687
 *
1688
 * Generic front-end for the encoding handler on parser output
1689
 * a first call with @init == 1 has to be made first to initiate the
1690
 * output in case of non-stateless encoding needing to initiate their
1691
 * state or the output (like the BOM in UTF16).
1692
 * In case of UTF8 sequence conversion errors for the given encoder,
1693
 * the content will be automatically remapped to a CharRef sequence.
1694
 *
1695
 * Returns the number of bytes written or an XML_ENC_ERR code.
1696
 */
1697
int
1698
xmlCharEncOutput(xmlOutputBufferPtr output, int init)
1699
0
{
1700
0
    int ret;
1701
0
    size_t written;
1702
0
    int writtentot = 0;
1703
0
    size_t toconv;
1704
0
    int c_in;
1705
0
    int c_out;
1706
0
    xmlBufPtr in;
1707
0
    xmlBufPtr out;
1708
1709
0
    if ((output == NULL) || (output->encoder == NULL) ||
1710
0
        (output->buffer == NULL) || (output->conv == NULL))
1711
0
        return(XML_ENC_ERR_INTERNAL);
1712
0
    out = output->conv;
1713
0
    in = output->buffer;
1714
1715
0
retry:
1716
1717
0
    written = xmlBufAvail(out);
1718
1719
    /*
1720
     * First specific handling of the initialization call
1721
     */
1722
0
    if (init) {
1723
0
        c_in = 0;
1724
0
        c_out = written;
1725
        /* TODO: Check return value. */
1726
0
        xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
1727
0
                          NULL, &c_in);
1728
0
        xmlBufAddLen(out, c_out);
1729
0
        return(c_out);
1730
0
    }
1731
1732
    /*
1733
     * Conversion itself.
1734
     */
1735
0
    toconv = xmlBufUse(in);
1736
0
    if (toconv > 64 * 1024)
1737
0
        toconv = 64 * 1024;
1738
0
    if (toconv * 4 >= written) {
1739
0
        if (xmlBufGrow(out, toconv * 4) < 0) {
1740
0
            ret = XML_ENC_ERR_MEMORY;
1741
0
            goto error;
1742
0
        }
1743
0
        written = xmlBufAvail(out);
1744
0
    }
1745
0
    if (written > 256 * 1024)
1746
0
        written = 256 * 1024;
1747
1748
0
    c_in = toconv;
1749
0
    c_out = written;
1750
0
    ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
1751
0
                            xmlBufContent(in), &c_in);
1752
0
    xmlBufShrink(in, c_in);
1753
0
    xmlBufAddLen(out, c_out);
1754
0
    writtentot += c_out;
1755
1756
0
    if (ret == XML_ENC_ERR_SPACE)
1757
0
        goto retry;
1758
1759
    /*
1760
     * Attempt to handle error cases
1761
     */
1762
0
    if (ret == XML_ENC_ERR_INPUT) {
1763
0
        xmlChar charref[20];
1764
0
        int len = xmlBufUse(in);
1765
0
        xmlChar *content = xmlBufContent(in);
1766
0
        int cur, charrefLen;
1767
1768
0
        cur = xmlGetUTF8Char(content, &len);
1769
0
        if (cur <= 0)
1770
0
            goto error;
1771
1772
        /*
1773
         * Removes the UTF8 sequence, and replace it by a charref
1774
         * and continue the transcoding phase, hoping the error
1775
         * did not mangle the encoder state.
1776
         */
1777
0
        charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
1778
0
        xmlBufGrow(out, charrefLen * 4);
1779
0
        c_out = xmlBufAvail(out);
1780
0
        c_in = charrefLen;
1781
0
        ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
1782
0
                                charref, &c_in);
1783
0
        if ((ret < 0) || (c_in != charrefLen)) {
1784
0
            ret = XML_ENC_ERR_INTERNAL;
1785
0
            goto error;
1786
0
        }
1787
1788
0
        xmlBufShrink(in, len);
1789
0
        xmlBufAddLen(out, c_out);
1790
0
        writtentot += c_out;
1791
0
        goto retry;
1792
0
    }
1793
1794
0
error:
1795
0
    if (((writtentot <= 0) && (ret != 0)) ||
1796
0
        (ret == XML_ENC_ERR_MEMORY)) {
1797
0
        if (output->error == 0)
1798
0
            output->error = xmlEncConvertError(ret);
1799
0
        return(ret);
1800
0
    }
1801
1802
0
    return(writtentot);
1803
0
}
1804
#endif
1805
1806
/**
1807
 * xmlCharEncOutFunc:
1808
 * @handler:  char encoding transformation data structure
1809
 * @out:  an xmlBuffer for the output.
1810
 * @in:  an xmlBuffer for the input
1811
 *
1812
 * Generic front-end for the encoding handler output function
1813
 * a first call with @in == NULL has to be made firs to initiate the
1814
 * output in case of non-stateless encoding needing to initiate their
1815
 * state or the output (like the BOM in UTF16).
1816
 * In case of UTF8 sequence conversion errors for the given encoder,
1817
 * the content will be automatically remapped to a CharRef sequence.
1818
 *
1819
 * Returns the number of bytes written or an XML_ENC_ERR code.
1820
 */
1821
int
1822
xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1823
0
                  xmlBufferPtr in) {
1824
0
    int ret;
1825
0
    int written;
1826
0
    int writtentot = 0;
1827
0
    int toconv;
1828
1829
0
    if (handler == NULL) return(XML_ENC_ERR_INTERNAL);
1830
0
    if (out == NULL) return(XML_ENC_ERR_INTERNAL);
1831
1832
0
retry:
1833
1834
0
    written = out->size - out->use;
1835
1836
0
    if (written > 0)
1837
0
  written--; /* Gennady: count '/0' */
1838
1839
    /*
1840
     * First specific handling of in = NULL, i.e. the initialization call
1841
     */
1842
0
    if (in == NULL) {
1843
0
        toconv = 0;
1844
        /* TODO: Check return value. */
1845
0
        xmlEncOutputChunk(handler, &out->content[out->use], &written,
1846
0
                          NULL, &toconv);
1847
0
        out->use += written;
1848
0
        out->content[out->use] = 0;
1849
0
        return(0);
1850
0
    }
1851
1852
    /*
1853
     * Conversion itself.
1854
     */
1855
0
    toconv = in->use;
1856
0
    if (toconv * 4 >= written) {
1857
0
        xmlBufferGrow(out, toconv * 4);
1858
0
  written = out->size - out->use - 1;
1859
0
    }
1860
0
    ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
1861
0
                            in->content, &toconv);
1862
0
    xmlBufferShrink(in, toconv);
1863
0
    out->use += written;
1864
0
    writtentot += written;
1865
0
    out->content[out->use] = 0;
1866
1867
0
    if (ret == XML_ENC_ERR_SPACE)
1868
0
        goto retry;
1869
1870
    /*
1871
     * Attempt to handle error cases
1872
     */
1873
0
    if (ret == XML_ENC_ERR_INPUT) {
1874
0
        xmlChar charref[20];
1875
0
        int len = in->use;
1876
0
        const xmlChar *utf = (const xmlChar *) in->content;
1877
0
        int cur, charrefLen;
1878
1879
0
        cur = xmlGetUTF8Char(utf, &len);
1880
0
        if (cur <= 0)
1881
0
            return(ret);
1882
1883
        /*
1884
         * Removes the UTF8 sequence, and replace it by a charref
1885
         * and continue the transcoding phase, hoping the error
1886
         * did not mangle the encoder state.
1887
         */
1888
0
        charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
1889
0
        xmlBufferShrink(in, len);
1890
0
        xmlBufferGrow(out, charrefLen * 4);
1891
0
        written = out->size - out->use - 1;
1892
0
        toconv = charrefLen;
1893
0
        ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
1894
0
                                charref, &toconv);
1895
0
        if ((ret < 0) || (toconv != charrefLen))
1896
0
            return(XML_ENC_ERR_INTERNAL);
1897
1898
0
        out->use += written;
1899
0
        writtentot += written;
1900
0
        out->content[out->use] = 0;
1901
0
        goto retry;
1902
0
    }
1903
0
    return(writtentot ? writtentot : ret);
1904
0
}
1905
1906
/**
1907
 * xmlCharEncCloseFunc:
1908
 * @handler:  char encoding transformation data structure
1909
 *
1910
 * Releases an xmlCharEncodingHandler. Must be called after
1911
 * a handler is no longer in use.
1912
 *
1913
 * Returns 0.
1914
 */
1915
int
1916
3.42k
xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
1917
3.42k
    if (handler == NULL)
1918
0
        return(0);
1919
1920
3.42k
    if (handler->flags & XML_HANDLER_STATIC)
1921
906
        return(0);
1922
1923
2.52k
    xmlFree(handler->name);
1924
2.52k
    if (handler->ctxtDtor != NULL) {
1925
2.52k
        handler->ctxtDtor(handler->inputCtxt);
1926
2.52k
        handler->ctxtDtor(handler->outputCtxt);
1927
2.52k
    }
1928
2.52k
    xmlFree(handler);
1929
2.52k
    return(0);
1930
3.42k
}
1931
1932
/**
1933
 * xmlByteConsumed:
1934
 * @ctxt: an XML parser context
1935
 *
1936
 * DEPRECATED: Don't use.
1937
 *
1938
 * This function provides the current index of the parser relative
1939
 * to the start of the current entity. This function is computed in
1940
 * bytes from the beginning starting at zero and finishing at the
1941
 * size in byte of the file if parsing a file. The function is
1942
 * of constant cost if the input is UTF-8 but can be costly if run
1943
 * on non-UTF-8 input.
1944
 *
1945
 * Returns the index in bytes from the beginning of the entity or -1
1946
 *         in case the index could not be computed.
1947
 */
1948
long
1949
0
xmlByteConsumed(xmlParserCtxtPtr ctxt) {
1950
0
    xmlParserInputPtr in;
1951
1952
0
    if (ctxt == NULL)
1953
0
        return(-1);
1954
0
    in = ctxt->input;
1955
0
    if (in == NULL)
1956
0
        return(-1);
1957
1958
0
    if ((in->buf != NULL) && (in->buf->encoder != NULL)) {
1959
0
        int unused = 0;
1960
0
  xmlCharEncodingHandler * handler = in->buf->encoder;
1961
1962
        /*
1963
   * Encoding conversion, compute the number of unused original
1964
   * bytes from the input not consumed and subtract that from
1965
   * the raw consumed value, this is not a cheap operation
1966
   */
1967
0
        if (in->end - in->cur > 0) {
1968
0
      unsigned char *convbuf;
1969
0
      const unsigned char *cur = (const unsigned char *)in->cur;
1970
0
      int toconv, ret;
1971
1972
0
            convbuf = xmlMalloc(32000);
1973
0
            if (convbuf == NULL)
1974
0
                return(-1);
1975
1976
0
            toconv = in->end - cur;
1977
0
            unused = 32000;
1978
0
            ret = xmlEncOutputChunk(handler, convbuf, &unused, cur, &toconv);
1979
1980
0
            xmlFree(convbuf);
1981
1982
0
            if (ret != XML_ENC_ERR_SUCCESS)
1983
0
                return(-1);
1984
0
  }
1985
1986
0
  if (in->buf->rawconsumed < (unsigned long) unused)
1987
0
      return(-1);
1988
0
  return(in->buf->rawconsumed - unused);
1989
0
    }
1990
1991
0
    return(in->consumed + (in->cur - in->base));
1992
0
}
1993
1994
/************************************************************************
1995
 *                  *
1996
 *    Conversions To/From UTF8 encoding     *
1997
 *                  *
1998
 ************************************************************************/
1999
2000
static int
2001
asciiToAscii(unsigned char* out, int *poutlen,
2002
             const unsigned char* in, int *pinlen,
2003
241
             void *vctxt ATTRIBUTE_UNUSED) {
2004
241
    const unsigned char *inend;
2005
241
    const unsigned char *instart = in;
2006
241
    int inlen, outlen, ret;
2007
2008
241
    if (in == NULL) {
2009
0
        *pinlen = 0;
2010
0
        *poutlen = 0;
2011
0
        return(XML_ENC_ERR_SUCCESS);
2012
0
    }
2013
2014
241
    inlen = *pinlen;
2015
241
    outlen = *poutlen;
2016
2017
241
    if (outlen < inlen) {
2018
51
        inlen = outlen;
2019
51
        ret = XML_ENC_ERR_SPACE;
2020
190
    } else {
2021
190
        ret = inlen;
2022
190
    }
2023
2024
241
    inend = in + inlen;
2025
241
    *poutlen = inlen;
2026
241
    *pinlen = inlen;
2027
2028
640k
    while (in < inend) {
2029
640k
  unsigned c = *in;
2030
2031
640k
        if (c >= 0x80) {
2032
11
      *poutlen = in - instart;
2033
11
      *pinlen = in - instart;
2034
11
      return(XML_ENC_ERR_INPUT);
2035
11
  }
2036
2037
640k
        in++;
2038
640k
  *out++ = c;
2039
640k
    }
2040
2041
230
    return(ret);
2042
241
}
2043
2044
static int
2045
latin1ToUTF8(unsigned char* out, int *outlen,
2046
             const unsigned char* in, int *inlen,
2047
5.31k
             void *vctxt ATTRIBUTE_UNUSED) {
2048
5.31k
    unsigned char* outstart = out;
2049
5.31k
    const unsigned char* instart = in;
2050
5.31k
    unsigned char* outend;
2051
5.31k
    const unsigned char* inend;
2052
5.31k
    int ret = XML_ENC_ERR_SPACE;
2053
2054
5.31k
    if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
2055
0
  return(XML_ENC_ERR_INTERNAL);
2056
2057
5.31k
    outend = out + *outlen;
2058
5.31k
    inend = in + *inlen;
2059
2060
14.0M
    while (in < inend) {
2061
14.0M
        unsigned c = *in;
2062
2063
14.0M
  if (c < 0x80) {
2064
12.3M
            if (out >= outend)
2065
1.38k
                goto done;
2066
12.3M
            *out++ = c;
2067
12.3M
  } else {
2068
1.69M
            if (outend - out < 2)
2069
397
                goto done;
2070
1.69M
      *out++ = (c >> 6) | 0xC0;
2071
1.69M
            *out++ = (c & 0x3F) | 0x80;
2072
1.69M
        }
2073
2074
14.0M
        in++;
2075
14.0M
    }
2076
2077
3.53k
    ret = out - outstart;
2078
2079
5.31k
done:
2080
5.31k
    *outlen = out - outstart;
2081
5.31k
    *inlen = in - instart;
2082
5.31k
    return(ret);
2083
3.53k
}
2084
2085
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. This is the public entry point; the work is done
 * by latin1ToUTF8() called without a context.
 *
 * Returns the number of bytes written or an XML_ENC_ERR code.
 *
 * The value of @inlen after return is the number of octets consumed
 *     if the return value is positive, else unpredictable.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    int res;

    res = latin1ToUTF8(out, outlen, in, inlen, NULL);
    return(res);
}
2106
2107
static int
2108
UTF8ToUTF8(unsigned char* out, int *outlen,
2109
           const unsigned char* in, int *inlen,
2110
0
           void *vctxt ATTRIBUTE_UNUSED) {
2111
0
    int len;
2112
0
    int ret;
2113
2114
0
    if (in == NULL) {
2115
0
        *inlen = 0;
2116
0
        *outlen = 0;
2117
0
        return(XML_ENC_ERR_SUCCESS);
2118
0
    }
2119
2120
0
    if (*outlen < *inlen) {
2121
0
  len = *outlen;
2122
0
        ret = XML_ENC_ERR_SPACE;
2123
0
    } else {
2124
0
  len = *inlen;
2125
0
        ret = len;
2126
0
    }
2127
2128
0
    memcpy(out, in, len);
2129
2130
0
    *outlen = len;
2131
0
    *inlen = len;
2132
0
    return(ret);
2133
0
}
2134
2135
2136
#ifdef LIBXML_OUTPUT_ENABLED
2137
static int
2138
UTF8ToLatin1(unsigned char* out, int *outlen,
2139
             const unsigned char* in, int *inlen,
2140
0
             void *vctxt ATTRIBUTE_UNUSED) {
2141
0
    const unsigned char* outend;
2142
0
    const unsigned char* outstart = out;
2143
0
    const unsigned char* instart = in;
2144
0
    const unsigned char* inend;
2145
0
    unsigned c;
2146
0
    int ret = XML_ENC_ERR_SPACE;
2147
2148
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
2149
0
        return(XML_ENC_ERR_INTERNAL);
2150
2151
0
    if (in == NULL) {
2152
0
        *inlen = 0;
2153
0
        *outlen = 0;
2154
0
        return(XML_ENC_ERR_SUCCESS);
2155
0
    }
2156
2157
0
    inend = in + *inlen;
2158
0
    outend = out + *outlen;
2159
0
    while (in < inend) {
2160
0
        if (out >= outend)
2161
0
            goto done;
2162
2163
0
  c = *in;
2164
2165
0
        if (c < 0x80) {
2166
0
            *out++ = c;
2167
0
        } else if ((c >= 0xC2) && (c <= 0xC3)) {
2168
0
            if (inend - in < 2)
2169
0
                break;
2170
0
            in++;
2171
0
            *out++ = (unsigned char) ((c << 6) | (*in & 0x3F));
2172
0
        } else {
2173
0
            ret = XML_ENC_ERR_INPUT;
2174
0
            goto done;
2175
0
  }
2176
2177
0
        in++;
2178
0
    }
2179
2180
0
    ret = out - outstart;
2181
2182
0
done:
2183
0
    *outlen = out - outstart;
2184
0
    *inlen = in - instart;
2185
0
    return(ret);
2186
0
}
2187
2188
/**
2189
 * UTF8Toisolat1:
2190
 * @out:  a pointer to an array of bytes to store the result
2191
 * @outlen:  the length of @out
2192
 * @in:  a pointer to an array of UTF-8 chars
2193
 * @inlen:  the length of @in
2194
 *
2195
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
2196
 * block of chars out.
2197
 *
2198
 * Returns the number of bytes written or an XML_ENC_ERR code.
2199
 *
2200
 * The value of @inlen after return is the number of octets consumed
2201
 *     if the return value is positive, else unpredictable.
2202
 * The value of @outlen after return is the number of octets produced.
2203
 */
2204
int
2205
UTF8Toisolat1(unsigned char* out, int *outlen,
2206
0
              const unsigned char* in, int *inlen) {
2207
0
    if ((out == NULL) || (outlen == NULL) || (in == NULL) || (inlen == NULL))
2208
0
        return(XML_ENC_ERR_INTERNAL);
2209
2210
0
    return(UTF8ToLatin1(out, outlen, in, inlen, NULL));
2211
0
}
2212
#endif /* LIBXML_OUTPUT_ENABLED */
2213
2214
static int
2215
UTF16LEToUTF8(unsigned char *out, int *outlen,
2216
              const unsigned char *in, int *inlen,
2217
9.48k
              void *vctxt ATTRIBUTE_UNUSED) {
2218
9.48k
    const unsigned char *instart = in;
2219
9.48k
    const unsigned char *inend = in + (*inlen & ~1);
2220
9.48k
    unsigned char *outstart = out;
2221
9.48k
    unsigned char *outend = out + *outlen;
2222
9.48k
    unsigned c, d;
2223
9.48k
    int ret = XML_ENC_ERR_SPACE;
2224
2225
2.43M
    while (in < inend) {
2226
2.42M
        c = in[0] | (in[1] << 8);
2227
2228
2.42M
        if (c < 0x80) {
2229
585k
            if (out >= outend)
2230
104
                goto done;
2231
585k
            out[0] = c;
2232
585k
            in += 2;
2233
585k
            out += 1;
2234
1.83M
        } else if (c < 0x800) {
2235
203k
            if (outend - out < 2)
2236
92
                goto done;
2237
203k
            out[0] = (c >> 6)   | 0xC0;
2238
203k
            out[1] = (c & 0x3F) | 0x80;
2239
203k
            in += 2;
2240
203k
            out += 2;
2241
1.63M
        } else if ((c & 0xF800) != 0xD800) {
2242
1.63M
            if (outend - out < 3)
2243
436
                goto done;
2244
1.63M
            out[0] =  (c >> 12)         | 0xE0;
2245
1.63M
            out[1] = ((c >>  6) & 0x3F) | 0x80;
2246
1.63M
            out[2] =  (c        & 0x3F) | 0x80;
2247
1.63M
            in += 2;
2248
1.63M
            out += 3;
2249
1.63M
        } else {
2250
            /* Surrogate pair */
2251
852
            if ((c & 0xFC00) != 0xD800) {
2252
10
                ret = XML_ENC_ERR_INPUT;
2253
10
                goto done;
2254
10
            }
2255
842
      if (inend - in < 4)
2256
446
    break;
2257
396
            d = in[2] | (in[3] << 8);
2258
396
            if ((d & 0xFC00) != 0xDC00) {
2259
4
                ret = XML_ENC_ERR_INPUT;
2260
4
                goto done;
2261
4
            }
2262
392
      if (outend - out < 4)
2263
0
    goto done;
2264
392
            c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000);
2265
392
            out[0] =  (c >> 18)         | 0xF0;
2266
392
            out[1] = ((c >> 12) & 0x3F) | 0x80;
2267
392
            out[2] = ((c >>  6) & 0x3F) | 0x80;
2268
392
            out[3] =  (c        & 0x3F) | 0x80;
2269
392
            in += 4;
2270
392
            out += 4;
2271
392
        }
2272
2.42M
    }
2273
2274
8.83k
    ret = out - outstart;
2275
2276
9.48k
done:
2277
9.48k
    *outlen = out - outstart;
2278
9.48k
    *inlen = in - instart;
2279
9.48k
    return(ret);
2280
8.83k
}
2281
2282
#ifdef LIBXML_OUTPUT_ENABLED
2283
static int
2284
UTF8ToUTF16LE(unsigned char *out, int *outlen,
2285
              const unsigned char *in, int *inlen,
2286
0
              void *vctxt ATTRIBUTE_UNUSED) {
2287
0
    const unsigned char *instart = in;
2288
0
    const unsigned char *inend;
2289
0
    unsigned char *outstart = out;
2290
0
    unsigned char *outend;
2291
0
    unsigned c, d;
2292
0
    int ret = XML_ENC_ERR_SPACE;
2293
2294
    /* UTF16LE encoding has no BOM */
2295
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
2296
0
        return(XML_ENC_ERR_INTERNAL);
2297
0
    if (in == NULL) {
2298
0
  *outlen = 0;
2299
0
  *inlen = 0;
2300
0
  return(0);
2301
0
    }
2302
0
    inend = in + *inlen;
2303
0
    outend = out + (*outlen & ~1);
2304
0
    while (in < inend) {
2305
0
        c = in[0];
2306
2307
0
        if (c < 0x80) {
2308
0
            if (out >= outend)
2309
0
                goto done;
2310
0
            out[0] = c;
2311
0
            out[1] = 0;
2312
0
            in += 1;
2313
0
            out += 2;
2314
0
        } else {
2315
0
            int i, len;
2316
0
            unsigned min;
2317
2318
0
            if (c < 0xE0) {
2319
0
                if (c < 0xC2) {
2320
0
                    ret = XML_ENC_ERR_INPUT;
2321
0
                    goto done;
2322
0
                }
2323
0
                c &= 0x1F;
2324
0
                len = 2;
2325
0
                min = 0x80;
2326
0
            } else if (c < 0xF0) {
2327
0
                c &= 0x0F;
2328
0
                len = 3;
2329
0
                min = 0x800;
2330
0
            } else {
2331
0
                c &= 0x0F;
2332
0
                len = 4;
2333
0
                min = 0x10000;
2334
0
            }
2335
2336
0
            if (inend - in < len)
2337
0
                break;
2338
2339
0
            for (i = 1; i < len; i++) {
2340
0
                if ((in[i] & 0xC0) != 0x80) {
2341
0
                    ret = XML_ENC_ERR_INPUT;
2342
0
                    goto done;
2343
0
                }
2344
0
                c = (c << 6) | (in[i] & 0x3F);
2345
0
            }
2346
2347
0
            if ((c < min) ||
2348
0
                ((c >= 0xD800) && (c <= 0xDFFF)) ||
2349
0
                (c > 0x10FFFF)) {
2350
0
                ret = XML_ENC_ERR_INPUT;
2351
0
                goto done;
2352
0
            }
2353
2354
0
            if (c < 0x10000) {
2355
0
                if (out >= outend)
2356
0
                    goto done;
2357
0
                out[0] = c & 0xFF;
2358
0
                out[1] = c >> 8;
2359
0
                out += 2;
2360
0
            } else {
2361
0
                if (outend - out < 4)
2362
0
                    goto done;
2363
0
                c -= 0x10000;
2364
0
                d = (c & 0x03FF) | 0xDC00;
2365
0
                c = (c >> 10)    | 0xD800;
2366
0
                out[0] = c & 0xFF;
2367
0
                out[1] = c >> 8;
2368
0
                out[2] = d & 0xFF;
2369
0
                out[3] = d >> 8;
2370
0
                out += 4;
2371
0
            }
2372
2373
0
            in += len;
2374
0
        }
2375
0
    }
2376
2377
0
    ret = out - outstart;
2378
2379
0
done:
2380
0
    *outlen = out - outstart;
2381
0
    *inlen = in - instart;
2382
0
    return(ret);
2383
0
}
2384
2385
static int
2386
UTF8ToUTF16(unsigned char* outb, int *outlen,
2387
            const unsigned char* in, int *inlen,
2388
0
            void *vctxt ATTRIBUTE_UNUSED) {
2389
0
    if (in == NULL) {
2390
  /*
2391
   * initialization, add the Byte Order Mark for UTF-16LE
2392
   */
2393
0
        if (*outlen >= 2) {
2394
0
      outb[0] = 0xFF;
2395
0
      outb[1] = 0xFE;
2396
0
      *outlen = 2;
2397
0
      *inlen = 0;
2398
0
      return(2);
2399
0
  }
2400
0
  *outlen = 0;
2401
0
  *inlen = 0;
2402
0
  return(0);
2403
0
    }
2404
0
    return (UTF8ToUTF16LE(outb, outlen, in, inlen, NULL));
2405
0
}
2406
#endif /* LIBXML_OUTPUT_ENABLED */
2407
2408
static int
2409
UTF16BEToUTF8(unsigned char *out, int *outlen,
2410
              const unsigned char *in, int *inlen,
2411
8.95k
              void *vctxt ATTRIBUTE_UNUSED) {
2412
8.95k
    const unsigned char *instart = in;
2413
8.95k
    const unsigned char *inend = in + (*inlen & ~1);
2414
8.95k
    unsigned char *outstart = out;
2415
8.95k
    unsigned char *outend = out + *outlen;
2416
8.95k
    unsigned c, d;
2417
8.95k
    int ret = XML_ENC_ERR_SPACE;
2418
2419
2.88M
    while (in < inend) {
2420
2.87M
        c = (in[0] << 8) | in[1];
2421
2422
2.87M
        if (c < 0x80) {
2423
1.10M
            if (out >= outend)
2424
166
                goto done;
2425
1.10M
            out[0] = c;
2426
1.10M
            in += 2;
2427
1.10M
            out += 1;
2428
1.76M
        } else if (c < 0x800) {
2429
148k
            if (outend - out < 2)
2430
53
                goto done;
2431
148k
            out[0] = (c >> 6)   | 0xC0;
2432
148k
            out[1] = (c & 0x3F) | 0x80;
2433
148k
            in += 2;
2434
148k
            out += 2;
2435
1.61M
        } else if ((c & 0xF800) != 0xD800) {
2436
1.61M
            if (outend - out < 3)
2437
651
                goto done;
2438
1.61M
            out[0] =  (c >> 12)         | 0xE0;
2439
1.61M
            out[1] = ((c >>  6) & 0x3F) | 0x80;
2440
1.61M
            out[2] =  (c        & 0x3F) | 0x80;
2441
1.61M
            in += 2;
2442
1.61M
            out += 3;
2443
1.61M
        } else {
2444
            /* Surrogate pair */
2445
1.21k
            if ((c & 0xFC00) != 0xD800) {
2446
9
                ret = XML_ENC_ERR_INPUT;
2447
9
                goto done;
2448
9
            }
2449
1.20k
      if (inend - in < 4)
2450
740
    break;
2451
466
            d = (in[2] << 8) | in[3];
2452
466
            if ((d & 0xFC00) != 0xDC00) {
2453
7
                ret = XML_ENC_ERR_INPUT;
2454
7
                goto done;
2455
7
            }
2456
459
      if (outend - out < 4)
2457
3
    goto done;
2458
456
            c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000);
2459
456
            out[0] =  (c >> 18)         | 0xF0;
2460
456
            out[1] = ((c >> 12) & 0x3F) | 0x80;
2461
456
            out[2] = ((c >>  6) & 0x3F) | 0x80;
2462
456
            out[3] =  (c        & 0x3F) | 0x80;
2463
456
            in += 4;
2464
456
            out += 4;
2465
456
        }
2466
2.87M
    }
2467
2468
8.06k
    ret = out - outstart;
2469
2470
8.95k
done:
2471
8.95k
    *outlen = out - outstart;
2472
8.95k
    *inlen = in - instart;
2473
8.95k
    return(ret);
2474
8.06k
}
2475
2476
#ifdef LIBXML_OUTPUT_ENABLED
2477
static int
2478
UTF8ToUTF16BE(unsigned char *out, int *outlen,
2479
              const unsigned char *in, int *inlen,
2480
0
              void *vctxt ATTRIBUTE_UNUSED) {
2481
0
    const unsigned char *instart = in;
2482
0
    const unsigned char *inend;
2483
0
    unsigned char *outstart = out;
2484
0
    unsigned char *outend;
2485
0
    unsigned c, d;
2486
0
    int ret = XML_ENC_ERR_SPACE;
2487
2488
    /* UTF-16BE has no BOM */
2489
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2490
0
    if (in == NULL) {
2491
0
  *outlen = 0;
2492
0
  *inlen = 0;
2493
0
  return(0);
2494
0
    }
2495
0
    inend = in + *inlen;
2496
0
    outend = out + (*outlen & ~1);
2497
0
    while (in < inend) {
2498
0
        c = in[0];
2499
2500
0
        if (c < 0x80) {
2501
0
            if (out >= outend)
2502
0
                goto done;
2503
0
            out[0] = 0;
2504
0
            out[1] = c;
2505
0
            in += 1;
2506
0
            out += 2;
2507
0
        } else {
2508
0
            int i, len;
2509
0
            unsigned min;
2510
2511
0
            if (c < 0xE0) {
2512
0
                if (c < 0xC2) {
2513
0
                    ret = XML_ENC_ERR_INPUT;
2514
0
                    goto done;
2515
0
                }
2516
0
                c &= 0x1F;
2517
0
                len = 2;
2518
0
                min = 0x80;
2519
0
            } else if (c < 0xF0) {
2520
0
                c &= 0x0F;
2521
0
                len = 3;
2522
0
                min = 0x800;
2523
0
            } else {
2524
0
                c &= 0x0F;
2525
0
                len = 4;
2526
0
                min = 0x10000;
2527
0
            }
2528
2529
0
            if (inend - in < len)
2530
0
                break;
2531
2532
0
            for (i = 1; i < len; i++) {
2533
0
                if ((in[i] & 0xC0) != 0x80) {
2534
0
                    ret = XML_ENC_ERR_INPUT;
2535
0
                    goto done;
2536
0
                }
2537
0
                c = (c << 6) | (in[i] & 0x3F);
2538
0
            }
2539
2540
0
            if ((c < min) ||
2541
0
                ((c >= 0xD800) && (c <= 0xDFFF)) ||
2542
0
                (c > 0x10FFFF)) {
2543
0
                ret = XML_ENC_ERR_INPUT;
2544
0
                goto done;
2545
0
            }
2546
2547
0
            if (c < 0x10000) {
2548
0
                if (out >= outend)
2549
0
                    goto done;
2550
0
                out[0] = c >> 8;
2551
0
                out[1] = c & 0xFF;
2552
0
                out += 2;
2553
0
            } else {
2554
0
                if (outend - out < 4)
2555
0
                    goto done;
2556
0
                c -= 0x10000;
2557
0
                d = (c & 0x03FF) | 0xDC00;
2558
0
                c = (c >> 10)    | 0xD800;
2559
0
                out[0] = c >> 8;
2560
0
                out[1] = c & 0xFF;
2561
0
                out[2] = d >> 8;
2562
0
                out[3] = d & 0xFF;
2563
0
                out += 4;
2564
0
            }
2565
2566
0
            in += len;
2567
0
        }
2568
0
    }
2569
2570
0
    ret = out - outstart;
2571
2572
0
done:
2573
0
    *outlen = out - outstart;
2574
0
    *inlen = in - instart;
2575
0
    return(ret);
2576
0
}
2577
#endif /* LIBXML_OUTPUT_ENABLED */
2578
2579
#if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
2580
static int
2581
UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
2582
                  const unsigned char *in, int *inlen,
2583
0
                  void *vctxt ATTRIBUTE_UNUSED) {
2584
0
    return(UTF8ToHtml(out, outlen, in, inlen));
2585
0
}
2586
#endif
2587
2588
#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
2589
    defined(LIBXML_ISO8859X_ENABLED)
2590
2591
static int
2592
UTF8ToISO8859x(unsigned char *out, int *outlen,
2593
               const unsigned char *in, int *inlen, void *vctxt) {
2594
    const unsigned char *xlattable = vctxt;
2595
    const unsigned char *instart = in;
2596
    const unsigned char *inend;
2597
    unsigned char *outstart = out;
2598
    unsigned char *outend;
2599
    int ret = XML_ENC_ERR_SPACE;
2600
2601
    if (in == NULL) {
2602
        /*
2603
        * initialization nothing to do
2604
        */
2605
        *outlen = 0;
2606
        *inlen = 0;
2607
        return(XML_ENC_ERR_SUCCESS);
2608
    }
2609
2610
    inend = in + *inlen;
2611
    outend = out + *outlen;
2612
    while (in < inend) {
2613
        unsigned d = *in;
2614
2615
        if  (d < 0x80)  {
2616
            if (out >= outend)
2617
                goto done;
2618
            in += 1;
2619
        } else if (d < 0xE0) {
2620
            unsigned c;
2621
2622
            if (inend - in < 2)
2623
                break;
2624
            c = in[1] & 0x3F;
2625
            d = d & 0x1F;
2626
            d = xlattable [48 + c + xlattable [d] * 64];
2627
            if (d == 0) {
2628
                /* not in character set */
2629
                ret = XML_ENC_ERR_INPUT;
2630
                goto done;
2631
            }
2632
            if (out >= outend)
2633
                goto done;
2634
            in += 2;
2635
        } else if (d < 0xF0) {
2636
            unsigned c1;
2637
            unsigned c2;
2638
2639
            if (inend - in < 3)
2640
                break;
2641
            c1 = in[1] & 0x3F;
2642
            c2 = in[2] & 0x3F;
2643
      d = d & 0x0F;
2644
      d = xlattable [48 + c2 + xlattable [48 + c1 +
2645
      xlattable [32 + d] * 64] * 64];
2646
            if (d == 0) {
2647
                /* not in character set */
2648
                ret = XML_ENC_ERR_INPUT;
2649
                goto done;
2650
            }
2651
            if (out >= outend)
2652
                goto done;
2653
            in += 3;
2654
        } else {
2655
            /* cannot transcode >= U+010000 */
2656
                ret = XML_ENC_ERR_INPUT;
2657
                goto done;
2658
        }
2659
2660
        *out++ = d;
2661
    }
2662
2663
    ret = out - outstart;
2664
2665
done:
2666
    *outlen = out - outstart;
2667
    *inlen = in - instart;
2668
    return(ret);
2669
}
2670
2671
static int
2672
ISO8859xToUTF8(unsigned char* out, int *outlen,
2673
               const unsigned char* in, int *inlen, void *vctxt) {
2674
    unsigned short const *unicodetable = vctxt;
2675
    const unsigned char* instart = in;
2676
    const unsigned char* inend;
2677
    unsigned char* outstart = out;
2678
    unsigned char* outend;
2679
    int ret = XML_ENC_ERR_SPACE;
2680
2681
    outend = out + *outlen;
2682
    inend = in + *inlen;
2683
2684
    while (in < inend) {
2685
        unsigned c = *in;
2686
2687
        if (c < 0x80) {
2688
            if (out >= outend)
2689
                goto done;
2690
            *out++ = c;
2691
        } else {
2692
            c = unicodetable[c - 0x80];
2693
            if (c == 0) {
2694
                /* undefined code point */
2695
                ret = XML_ENC_ERR_INPUT;
2696
                goto done;
2697
            }
2698
            if (c < 0x800) {
2699
                if (outend - out < 2)
2700
                    goto done;
2701
                *out++ = ((c >>  6) & 0x1F) | 0xC0;
2702
                *out++ = (c & 0x3F) | 0x80;
2703
            } else {
2704
                if (outend - out < 3)
2705
                    goto done;
2706
                *out++ = ((c >>  12) & 0x0F) | 0xE0;
2707
                *out++ = ((c >>  6) & 0x3F) | 0x80;
2708
                *out++ = (c & 0x3F) | 0x80;
2709
            }
2710
        }
2711
2712
        in += 1;
2713
    }
2714
2715
    ret = out - outstart;
2716
2717
done:
2718
    *outlen = out - outstart;
2719
    *inlen = in - instart;
2720
    return(ret);
2721
}
2722
2723
#endif
2724