Coverage Report

Created: 2025-07-11 06:13

/src/libxml2/encoding.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * encoding.c : implements the encoding conversion functions needed for XML
3
 *
4
 * Related specs:
5
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6
 * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
8
 * [ISO-8859-1]   ISO Latin-1 characters codes.
9
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
10
 *                Worldwide Character Encoding -- Version 1.0", Addison-
11
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
12
 *                described in Unicode Technical Report \#4.
13
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
14
 *                Information Interchange, ANSI X3.4-1986.
15
 *
16
 * See Copyright for the status of this software.
17
 *
18
 * Author: Daniel Veillard
19
 *
20
 * Original code for IsoLatin1 and UTF-16 by Martin J. Duerst
21
 */
22
23
#define IN_LIBXML
24
#include "libxml.h"
25
26
#include <string.h>
27
#include <limits.h>
28
#include <ctype.h>
29
#include <stdlib.h>
30
31
#ifdef LIBXML_ICONV_ENABLED
32
#include <iconv.h>
33
#include <errno.h>
34
#endif
35
36
#include <libxml/encoding.h>
37
#include <libxml/xmlmemory.h>
38
#include <libxml/parser.h>
39
#ifdef LIBXML_HTML_ENABLED
40
#include <libxml/HTMLparser.h>
41
#endif
42
#include <libxml/xmlerror.h>
43
44
#include "private/buf.h"
45
#include "private/enc.h"
46
#include "private/error.h"
47
#include "private/memory.h"
48
49
#ifdef LIBXML_ICU_ENABLED
50
#include <unicode/ucnv.h>
51
#endif
52
53
128k
#define XML_HANDLER_STATIC (1 << 0)
54
237M
#define XML_HANDLER_LEGACY (1 << 1)
55
56
/*
 * One entry of the deprecated global alias table. Both strings are heap
 * copies that are released when the entry is deleted or the table is
 * cleaned up.
 */
struct _xmlCharEncodingAlias {
    const char *name;   /* encoding name the alias resolves to */
    const char *alias;  /* alias, stored upper-cased */
};
typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
62
63
static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
64
static int xmlCharEncodingAliasesNb = 0;
65
static int xmlCharEncodingAliasesMax = 0;
66
67
static int xmlLittleEndian = 1;
68
69
typedef struct {
70
    const char *name;
71
    xmlCharEncoding enc;
72
} xmlEncTableEntry;
73
74
static const xmlEncTableEntry xmlEncTable[] = {
75
    { "ansi_x3.4-1968", XML_CHAR_ENCODING_ASCII },
76
    { "arabic", XML_CHAR_ENCODING_8859_6 },
77
    { "ascii", XML_CHAR_ENCODING_ASCII },
78
    { "asmo-708", XML_CHAR_ENCODING_8859_6 },
79
    { "cp1252", XML_CHAR_ENCODING_WINDOWS_1252 },
80
    { "cp819", XML_CHAR_ENCODING_8859_1 },
81
    { "cseucpkdfmtjapanese", XML_CHAR_ENCODING_EUC_JP },
82
    { "csiso2022jp", XML_CHAR_ENCODING_2022_JP },
83
    { "csiso88596e", XML_CHAR_ENCODING_8859_6 },
84
    { "csiso88596i", XML_CHAR_ENCODING_8859_6 },
85
    { "csiso88598e", XML_CHAR_ENCODING_8859_8 },
86
    { "csiso88598i", XML_CHAR_ENCODING_8859_8 },
87
    { "csisolatin1", XML_CHAR_ENCODING_8859_1 },
88
    { "csisolatin2", XML_CHAR_ENCODING_8859_2 },
89
    { "csisolatin3", XML_CHAR_ENCODING_8859_3 },
90
    { "csisolatin4", XML_CHAR_ENCODING_8859_4 },
91
    { "csisolatin5", XML_CHAR_ENCODING_8859_9 },
92
    { "csisolatin6", XML_CHAR_ENCODING_8859_10 },
93
    { "csisolatin9", XML_CHAR_ENCODING_8859_15 },
94
    { "csisolatinarabic", XML_CHAR_ENCODING_8859_6 },
95
    { "csisolatincyrillic", XML_CHAR_ENCODING_8859_5 },
96
    { "csisolatingreek", XML_CHAR_ENCODING_8859_7 },
97
    { "csisolatinhebrew", XML_CHAR_ENCODING_8859_8 },
98
    { "csshiftjis", XML_CHAR_ENCODING_SHIFT_JIS },
99
    { "csunicode", XML_CHAR_ENCODING_UTF16 },
100
    { "cyrillic", XML_CHAR_ENCODING_8859_5 },
101
    { "ecma-114", XML_CHAR_ENCODING_8859_6 },
102
    { "ecma-118", XML_CHAR_ENCODING_8859_7 },
103
    { "elot_928", XML_CHAR_ENCODING_8859_7 },
104
    { "euc-jp", XML_CHAR_ENCODING_EUC_JP },
105
    { "greek", XML_CHAR_ENCODING_8859_7 },
106
    { "greek8", XML_CHAR_ENCODING_8859_7 },
107
    { "html", XML_CHAR_ENCODING_HTML },
108
    { "ibm819", XML_CHAR_ENCODING_8859_1 },
109
    { "iso latin 1", XML_CHAR_ENCODING_8859_1 },
110
    { "iso latin 2", XML_CHAR_ENCODING_8859_2 },
111
    { "iso-10646-ucs-2", XML_CHAR_ENCODING_UCS2 },
112
    { "iso-10646-ucs-4", XML_CHAR_ENCODING_UCS4LE },
113
    { "iso-2022-jp", XML_CHAR_ENCODING_2022_JP },
114
    { "iso-8859-1", XML_CHAR_ENCODING_8859_1 },
115
    { "iso-8859-10", XML_CHAR_ENCODING_8859_10 },
116
    { "iso-8859-11", XML_CHAR_ENCODING_8859_11 },
117
    { "iso-8859-13", XML_CHAR_ENCODING_8859_13 },
118
    { "iso-8859-14", XML_CHAR_ENCODING_8859_14 },
119
    { "iso-8859-15", XML_CHAR_ENCODING_8859_15 },
120
    { "iso-8859-16", XML_CHAR_ENCODING_8859_16 },
121
    { "iso-8859-2", XML_CHAR_ENCODING_8859_2 },
122
    { "iso-8859-3", XML_CHAR_ENCODING_8859_3 },
123
    { "iso-8859-4", XML_CHAR_ENCODING_8859_4 },
124
    { "iso-8859-5", XML_CHAR_ENCODING_8859_5 },
125
    { "iso-8859-6", XML_CHAR_ENCODING_8859_6 },
126
    { "iso-8859-6-e", XML_CHAR_ENCODING_8859_6 },
127
    { "iso-8859-6-i", XML_CHAR_ENCODING_8859_6 },
128
    { "iso-8859-7", XML_CHAR_ENCODING_8859_7 },
129
    { "iso-8859-8", XML_CHAR_ENCODING_8859_8 },
130
    { "iso-8859-8-i", XML_CHAR_ENCODING_8859_8 },
131
    { "iso-8859-9", XML_CHAR_ENCODING_8859_9 },
132
    { "iso-ir-100", XML_CHAR_ENCODING_8859_1 },
133
    { "iso-ir-101", XML_CHAR_ENCODING_8859_2 },
134
    { "iso-ir-109", XML_CHAR_ENCODING_8859_3 },
135
    { "iso-ir-110", XML_CHAR_ENCODING_8859_4 },
136
    { "iso-ir-126", XML_CHAR_ENCODING_8859_7 },
137
    { "iso-ir-127", XML_CHAR_ENCODING_8859_6 },
138
    { "iso-ir-138", XML_CHAR_ENCODING_8859_8 },
139
    { "iso-ir-144", XML_CHAR_ENCODING_8859_5 },
140
    { "iso-ir-148", XML_CHAR_ENCODING_8859_9 },
141
    { "iso-ir-157", XML_CHAR_ENCODING_8859_10 },
142
    { "iso-latin-1", XML_CHAR_ENCODING_8859_1 },
143
    { "iso-latin-2", XML_CHAR_ENCODING_8859_2 },
144
    { "iso8859-1", XML_CHAR_ENCODING_8859_1 },
145
    { "iso8859-10", XML_CHAR_ENCODING_8859_1 },
146
    { "iso8859-13", XML_CHAR_ENCODING_8859_1 },
147
    { "iso8859-14", XML_CHAR_ENCODING_8859_1 },
148
    { "iso8859-15", XML_CHAR_ENCODING_8859_1 },
149
    { "iso8859-2", XML_CHAR_ENCODING_8859_2 },
150
    { "iso8859-3", XML_CHAR_ENCODING_8859_3 },
151
    { "iso8859-4", XML_CHAR_ENCODING_8859_4 },
152
    { "iso8859-5", XML_CHAR_ENCODING_8859_5 },
153
    { "iso8859-6", XML_CHAR_ENCODING_8859_6 },
154
    { "iso8859-7", XML_CHAR_ENCODING_8859_7 },
155
    { "iso8859-8", XML_CHAR_ENCODING_8859_8 },
156
    { "iso8859-9", XML_CHAR_ENCODING_8859_9 },
157
    { "iso88591", XML_CHAR_ENCODING_8859_1 },
158
    { "iso885910", XML_CHAR_ENCODING_8859_10 },
159
    { "iso885913", XML_CHAR_ENCODING_8859_13 },
160
    { "iso885914", XML_CHAR_ENCODING_8859_14 },
161
    { "iso885915", XML_CHAR_ENCODING_8859_15 },
162
    { "iso88592", XML_CHAR_ENCODING_8859_2 },
163
    { "iso88593", XML_CHAR_ENCODING_8859_3 },
164
    { "iso88594", XML_CHAR_ENCODING_8859_4 },
165
    { "iso88595", XML_CHAR_ENCODING_8859_5 },
166
    { "iso88596", XML_CHAR_ENCODING_8859_6 },
167
    { "iso88597", XML_CHAR_ENCODING_8859_7 },
168
    { "iso88598", XML_CHAR_ENCODING_8859_8 },
169
    { "iso88599", XML_CHAR_ENCODING_8859_9 },
170
    { "iso_8859-1", XML_CHAR_ENCODING_8859_1 },
171
    { "iso_8859-1:1987", XML_CHAR_ENCODING_8859_1 },
172
    { "iso_8859-2", XML_CHAR_ENCODING_8859_2 },
173
    { "iso_8859-2:1987", XML_CHAR_ENCODING_8859_2 },
174
    { "iso_8859-3", XML_CHAR_ENCODING_8859_3 },
175
    { "iso_8859-3:1988", XML_CHAR_ENCODING_8859_3 },
176
    { "iso_8859-4", XML_CHAR_ENCODING_8859_4 },
177
    { "iso_8859-4:1988", XML_CHAR_ENCODING_8859_4 },
178
    { "iso_8859-5", XML_CHAR_ENCODING_8859_5 },
179
    { "iso_8859-5:1988", XML_CHAR_ENCODING_8859_5 },
180
    { "iso_8859-6", XML_CHAR_ENCODING_8859_6 },
181
    { "iso_8859-6:1987", XML_CHAR_ENCODING_8859_6 },
182
    { "iso_8859-7", XML_CHAR_ENCODING_8859_7 },
183
    { "iso_8859-7:1987", XML_CHAR_ENCODING_8859_7 },
184
    { "iso_8859-8", XML_CHAR_ENCODING_8859_8 },
185
    { "iso_8859-8:1988", XML_CHAR_ENCODING_8859_8 },
186
    { "iso_8859-9", XML_CHAR_ENCODING_8859_9 },
187
    { "iso_8859-9:1989", XML_CHAR_ENCODING_8859_9 },
188
    { "l1", XML_CHAR_ENCODING_8859_1 },
189
    { "l2", XML_CHAR_ENCODING_8859_2 },
190
    { "l3", XML_CHAR_ENCODING_8859_3 },
191
    { "l4", XML_CHAR_ENCODING_8859_4 },
192
    { "l5", XML_CHAR_ENCODING_8859_9 },
193
    { "l6", XML_CHAR_ENCODING_8859_10 },
194
    { "l9", XML_CHAR_ENCODING_8859_15 },
195
    { "latin1", XML_CHAR_ENCODING_8859_1 },
196
    { "latin2", XML_CHAR_ENCODING_8859_2 },
197
    { "latin3", XML_CHAR_ENCODING_8859_3 },
198
    { "latin4", XML_CHAR_ENCODING_8859_4 },
199
    { "latin5", XML_CHAR_ENCODING_8859_9 },
200
    { "latin6", XML_CHAR_ENCODING_8859_10 },
201
    { "logical", XML_CHAR_ENCODING_8859_8 },
202
    { "ms932", XML_CHAR_ENCODING_SHIFT_JIS },
203
    { "ms_kanji", XML_CHAR_ENCODING_SHIFT_JIS },
204
    { "shift-jis", XML_CHAR_ENCODING_SHIFT_JIS },
205
    { "shift_jis", XML_CHAR_ENCODING_SHIFT_JIS },
206
    { "sjis", XML_CHAR_ENCODING_SHIFT_JIS },
207
    { "sun_eu_greek", XML_CHAR_ENCODING_8859_7 },
208
    { "ucs-2", XML_CHAR_ENCODING_UCS2 },
209
    { "ucs-4", XML_CHAR_ENCODING_UCS4LE },
210
    { "ucs2", XML_CHAR_ENCODING_UCS2 },
211
    { "ucs4", XML_CHAR_ENCODING_UCS4LE },
212
    { "unicode", XML_CHAR_ENCODING_UTF16 },
213
    { "unicode-1-1-utf-8", XML_CHAR_ENCODING_UTF8 },
214
    { "unicode11utf8", XML_CHAR_ENCODING_UTF8 },
215
    { "unicode20utf8", XML_CHAR_ENCODING_UTF8 },
216
    { "unicodefffe", XML_CHAR_ENCODING_UTF16BE },
217
    { "unicodefeff", XML_CHAR_ENCODING_UTF16LE },
218
    { "us-ascii", XML_CHAR_ENCODING_ASCII },
219
    { "utf-16", XML_CHAR_ENCODING_UTF16 },
220
    { "utf-16be", XML_CHAR_ENCODING_UTF16BE },
221
    { "utf-16le", XML_CHAR_ENCODING_UTF16LE },
222
    { "utf-8", XML_CHAR_ENCODING_UTF8 },
223
    { "utf16", XML_CHAR_ENCODING_UTF16 },
224
    { "utf8", XML_CHAR_ENCODING_UTF8 },
225
    { "visual", XML_CHAR_ENCODING_8859_8 },
226
    { "windows-1252", XML_CHAR_ENCODING_WINDOWS_1252 },
227
    { "windows-31j", XML_CHAR_ENCODING_SHIFT_JIS },
228
    { "x-cp1252", XML_CHAR_ENCODING_WINDOWS_1252 },
229
    { "x-euc-jp", XML_CHAR_ENCODING_EUC_JP },
230
    { "x-sjis", XML_CHAR_ENCODING_SHIFT_JIS },
231
    { "x-unicode20utf8", XML_CHAR_ENCODING_UTF8 }
232
};
233
234
static xmlCharEncError
235
asciiToAscii(void *vctxt, unsigned char* out, int *outlen,
236
             const unsigned char* in, int *inlen, int flush);
237
static xmlCharEncError
238
UTF8ToUTF8(void *vctxt, unsigned char* out, int *outlen,
239
           const unsigned char* inb, int *inlenb, int flush);
240
static xmlCharEncError
241
latin1ToUTF8(void *vctxt, unsigned char* out, int *outlen,
242
             const unsigned char* in, int *inlen, int flush);
243
static xmlCharEncError
244
UTF16LEToUTF8(void *vctxt, unsigned char* out, int *outlen,
245
              const unsigned char* inb, int *inlenb, int flush);
246
static xmlCharEncError
247
UTF16BEToUTF8(void *vctxt, unsigned char* out, int *outlen,
248
              const unsigned char* inb, int *inlenb, int flush);
249
250
#ifdef LIBXML_OUTPUT_ENABLED
251
252
static xmlCharEncError
253
UTF8ToLatin1(void *vctxt, unsigned char* outb, int *outlen,
254
             const unsigned char* in, int *inlen, int flush);
255
static xmlCharEncError
256
UTF8ToUTF16(void *vctxt, unsigned char* outb, int *outlen,
257
            const unsigned char* in, int *inlen, int flush);
258
static xmlCharEncError
259
UTF8ToUTF16LE(void *vctxt, unsigned char* outb, int *outlen,
260
              const unsigned char* in, int *inlen, int flush);
261
static xmlCharEncError
262
UTF8ToUTF16BE(void *vctxt, unsigned char* outb, int *outlen,
263
              const unsigned char* in, int *inlen, int flush);
264
265
#else /* LIBXML_OUTPUT_ENABLED */
266
267
#define UTF8ToLatin1 NULL
268
#define UTF8ToUTF16 NULL
269
#define UTF8ToUTF16LE NULL
270
#define UTF8ToUTF16BE NULL
271
272
#endif /* LIBXML_OUTPUT_ENABLED */
273
274
#if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
275
static xmlCharEncError
276
UTF8ToHtmlWrapper(void *vctxt, unsigned char *out, int *outlen,
277
                  const unsigned char *in, int *inlen, int flush);
278
#else
279
#define UTF8ToHtmlWrapper NULL
280
#endif
281
282
#include "codegen/charset.inc"
283
284
static xmlCharEncError
285
EightBitToUtf8(void *vctxt, unsigned char* out, int *outlen,
286
               const unsigned char* in, int *inlen, int flush);
287
static xmlCharEncError
288
Utf8ToEightBit(void *vctxt, unsigned char *out, int *outlen,
289
               const unsigned char *in, int *inlen, int flush);
290
291
#define MAKE_8BIT_HANDLER(name, table) \
292
    { (char *) name, { EightBitToUtf8 }, { Utf8ToEightBit }, \
293
      (void *) xmlunicodetable_##table, \
294
      (void *) xmltranscodetable_##table, \
295
      NULL, XML_HANDLER_STATIC }
296
297
#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
298
    defined(LIBXML_ISO8859X_ENABLED)
299
300
#define MAKE_ISO_HANDLER(name, n) MAKE_8BIT_HANDLER(name, ISO8859_##n)
301
302
#else /* LIBXML_ISO8859X_ENABLED */
303
304
#define MAKE_ISO_HANDLER(name, n) \
305
    { (char *) name, { NULL }, { NULL }, NULL, NULL, NULL, \
306
      XML_HANDLER_STATIC }
307
308
#endif /* LIBXML_ISO8859X_ENABLED */
309
310
#define MAKE_HANDLER(name, in, out) \
311
    { (char *) name, { in }, { out }, NULL, NULL, NULL, XML_HANDLER_STATIC }
312
313
/*
314
 * The layout must match enum xmlCharEncoding.
315
 *
316
 * Names should match the IANA registry if possible:
317
 * https://www.iana.org/assignments/character-sets/character-sets.xhtml
318
 */
319
static const xmlCharEncodingHandler defaultHandlers[32] = {
320
    MAKE_HANDLER(NULL, NULL, NULL), /* NONE */
321
    MAKE_HANDLER("UTF-8", UTF8ToUTF8, UTF8ToUTF8),
322
    MAKE_HANDLER("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE),
323
    MAKE_HANDLER("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE),
324
    MAKE_HANDLER("UCS-4LE", NULL, NULL),
325
    MAKE_HANDLER("UCS-4BE", NULL, NULL),
326
    MAKE_HANDLER("IBM037", NULL, NULL),
327
    MAKE_HANDLER(NULL, NULL, NULL), /* UCS4_2143 */
328
    MAKE_HANDLER(NULL, NULL, NULL), /* UCS4_3412 */
329
    MAKE_HANDLER("UCS-2", NULL, NULL),
330
    MAKE_HANDLER("ISO-8859-1", latin1ToUTF8, UTF8ToLatin1),
331
    MAKE_ISO_HANDLER("ISO-8859-2", 2),
332
    MAKE_ISO_HANDLER("ISO-8859-3", 3),
333
    MAKE_ISO_HANDLER("ISO-8859-4", 4),
334
    MAKE_ISO_HANDLER("ISO-8859-5", 5),
335
    MAKE_ISO_HANDLER("ISO-8859-6", 6),
336
    MAKE_ISO_HANDLER("ISO-8859-7", 7),
337
    MAKE_ISO_HANDLER("ISO-8859-8", 8),
338
    MAKE_ISO_HANDLER("ISO-8859-9", 9),
339
    MAKE_HANDLER("ISO-2022-JP", NULL, NULL),
340
    MAKE_HANDLER("Shift_JIS", NULL, NULL),
341
    MAKE_HANDLER("EUC-JP", NULL, NULL),
342
    MAKE_HANDLER("US-ASCII", asciiToAscii, asciiToAscii),
343
    MAKE_HANDLER("UTF-16", UTF16LEToUTF8, UTF8ToUTF16),
344
    MAKE_HANDLER("HTML", NULL, UTF8ToHtmlWrapper),
345
    MAKE_ISO_HANDLER("ISO-8859-10", 10),
346
    MAKE_ISO_HANDLER("ISO-8859-11", 11),
347
    MAKE_ISO_HANDLER("ISO-8859-13", 13),
348
    MAKE_ISO_HANDLER("ISO-8859-14", 14),
349
    MAKE_ISO_HANDLER("ISO-8859-15", 15),
350
    MAKE_ISO_HANDLER("ISO-8859-16", 16),
351
    MAKE_8BIT_HANDLER("windows-1252", windows_1252)
352
};
353
354
#define NUM_DEFAULT_HANDLERS \
355
85.1k
    (sizeof(defaultHandlers) / sizeof(defaultHandlers[0]))
356
357
/* the size should be growable, but it's not a big deal ... */
358
0
#define MAX_ENCODING_HANDLERS 50
359
static xmlCharEncodingHandlerPtr *globalHandlers = NULL;
360
static int nbCharEncodingHandler = 0;
361
362
#ifdef LIBXML_ICONV_ENABLED
363
static xmlParserErrors
364
xmlCharEncIconv(const char *name, xmlCharEncFlags flags,
365
                xmlCharEncodingHandler **out);
366
#endif
367
368
#ifdef LIBXML_ICU_ENABLED
369
static xmlParserErrors
370
xmlCharEncUconv(const char *name, xmlCharEncFlags flags,
371
                xmlCharEncodingHandler **out);
372
#endif
373
374
/************************************************************************
375
 *                  *
376
 *    Generic encoding handling routines      *
377
 *                  *
378
 ************************************************************************/
379
380
/**
381
 * Guess the encoding of the entity using the first bytes of the entity content
382
 * according to the non-normative appendix F of the XML-1.0 recommendation.
383
 *
384
 * @param in  a pointer to the first bytes of the XML entity, must be at least
385
 *            2 bytes long (at least 4 if encoding is UTF4 variant).
386
 * @param len  pointer to the length of the buffer
387
 * @returns a xmlCharEncoding value.
388
 */
389
xmlCharEncoding
390
xmlDetectCharEncoding(const unsigned char* in, int len)
391
0
{
392
0
    if (in == NULL)
393
0
        return(XML_CHAR_ENCODING_NONE);
394
0
    if (len >= 4) {
395
0
  if ((in[0] == 0x00) && (in[1] == 0x00) &&
396
0
      (in[2] == 0x00) && (in[3] == 0x3C))
397
0
      return(XML_CHAR_ENCODING_UCS4BE);
398
0
  if ((in[0] == 0x3C) && (in[1] == 0x00) &&
399
0
      (in[2] == 0x00) && (in[3] == 0x00))
400
0
      return(XML_CHAR_ENCODING_UCS4LE);
401
0
  if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
402
0
      (in[2] == 0xA7) && (in[3] == 0x94))
403
0
      return(XML_CHAR_ENCODING_EBCDIC);
404
0
  if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
405
0
      (in[2] == 0x78) && (in[3] == 0x6D))
406
0
      return(XML_CHAR_ENCODING_UTF8);
407
  /*
408
   * Although not part of the recommendation, we also
409
   * attempt an "auto-recognition" of UTF-16LE and
410
   * UTF-16BE encodings.
411
   */
412
0
  if ((in[0] == 0x3C) && (in[1] == 0x00) &&
413
0
      (in[2] == 0x3F) && (in[3] == 0x00))
414
0
      return(XML_CHAR_ENCODING_UTF16LE);
415
0
  if ((in[0] == 0x00) && (in[1] == 0x3C) &&
416
0
      (in[2] == 0x00) && (in[3] == 0x3F))
417
0
      return(XML_CHAR_ENCODING_UTF16BE);
418
0
    }
419
0
    if (len >= 3) {
420
  /*
421
   * Errata on XML-1.0 June 20 2001
422
   * We now allow an UTF8 encoded BOM
423
   */
424
0
  if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
425
0
      (in[2] == 0xBF))
426
0
      return(XML_CHAR_ENCODING_UTF8);
427
0
    }
428
    /* For UTF-16 we can recognize by the BOM */
429
0
    if (len >= 2) {
430
0
  if ((in[0] == 0xFE) && (in[1] == 0xFF))
431
0
      return(XML_CHAR_ENCODING_UTF16BE);
432
0
  if ((in[0] == 0xFF) && (in[1] == 0xFE))
433
0
      return(XML_CHAR_ENCODING_UTF16LE);
434
0
    }
435
0
    return(XML_CHAR_ENCODING_NONE);
436
0
}
437
438
/**
439
 * Unregisters all aliases.
440
 *
441
 * @deprecated This function modifies global state and is not
442
 * thread-safe. See #xmlCtxtSetCharEncConvImpl for an alternative.
443
 *
444
 */
445
void
446
0
xmlCleanupEncodingAliases(void) {
447
0
    int i;
448
449
0
    if (xmlCharEncodingAliases == NULL)
450
0
  return;
451
452
0
    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
453
0
  if (xmlCharEncodingAliases[i].name != NULL)
454
0
      xmlFree((char *) xmlCharEncodingAliases[i].name);
455
0
  if (xmlCharEncodingAliases[i].alias != NULL)
456
0
      xmlFree((char *) xmlCharEncodingAliases[i].alias);
457
0
    }
458
0
    xmlCharEncodingAliasesNb = 0;
459
0
    xmlCharEncodingAliasesMax = 0;
460
0
    xmlFree(xmlCharEncodingAliases);
461
0
    xmlCharEncodingAliases = NULL;
462
0
}
463
464
/**
465
 * Lookup an encoding name for the given alias.
466
 *
467
 * @deprecated This function is not thread-safe.
468
 *
469
 * @param alias  the alias name as parsed, in UTF-8 format (ASCII actually)
470
 * @returns NULL if not found, otherwise the original name.
471
 */
472
const char *
473
137k
xmlGetEncodingAlias(const char *alias) {
474
137k
    int i;
475
137k
    char upper[100];
476
477
137k
    if (alias == NULL)
478
0
  return(NULL);
479
480
137k
    if (xmlCharEncodingAliases == NULL)
481
137k
  return(NULL);
482
483
0
    for (i = 0;i < 99;i++) {
484
0
        upper[i] = (char) toupper((unsigned char) alias[i]);
485
0
  if (upper[i] == 0) break;
486
0
    }
487
0
    upper[i] = 0;
488
489
    /*
490
     * Walk down the list looking for a definition of the alias
491
     */
492
0
    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
493
0
  if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
494
0
      return(xmlCharEncodingAliases[i].name);
495
0
  }
496
0
    }
497
0
    return(NULL);
498
0
}
499
500
/**
501
 * Registers an alias `alias` for an encoding named `name`. Existing
502
 * aliases will be overwritten.
503
 *
504
 * @deprecated This function modifies global state and is not
505
 * thread-safe. See #xmlCtxtSetCharEncConvImpl for an alternative.
506
 *
507
 * @param name  the encoding name as parsed, in UTF-8 format (ASCII actually)
508
 * @param alias  the alias name as parsed, in UTF-8 format (ASCII actually)
509
 * @returns 0 in case of success, -1 in case of error.
510
 */
511
int
512
0
xmlAddEncodingAlias(const char *name, const char *alias) {
513
0
    int i;
514
0
    char upper[100];
515
0
    char *nameCopy, *aliasCopy;
516
517
0
    if ((name == NULL) || (alias == NULL))
518
0
  return(-1);
519
520
0
    for (i = 0;i < 99;i++) {
521
0
        upper[i] = (char) toupper((unsigned char) alias[i]);
522
0
  if (upper[i] == 0) break;
523
0
    }
524
0
    upper[i] = 0;
525
526
0
    if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
527
0
        xmlCharEncodingAliasPtr tmp;
528
0
        int newSize;
529
530
0
        newSize = xmlGrowCapacity(xmlCharEncodingAliasesMax, sizeof(tmp[0]),
531
0
                                  20, XML_MAX_ITEMS);
532
0
        if (newSize < 0)
533
0
            return(-1);
534
0
        tmp = xmlRealloc(xmlCharEncodingAliases, newSize * sizeof(tmp[0]));
535
0
        if (tmp == NULL)
536
0
            return(-1);
537
0
        xmlCharEncodingAliases = tmp;
538
0
        xmlCharEncodingAliasesMax = newSize;
539
0
    }
540
541
    /*
542
     * Walk down the list looking for a definition of the alias
543
     */
544
0
    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
545
0
  if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
546
      /*
547
       * Replace the definition.
548
       */
549
0
      nameCopy = xmlMemStrdup(name);
550
0
            if (nameCopy == NULL)
551
0
                return(-1);
552
0
      xmlFree((char *) xmlCharEncodingAliases[i].name);
553
0
      xmlCharEncodingAliases[i].name = nameCopy;
554
0
      return(0);
555
0
  }
556
0
    }
557
    /*
558
     * Add the definition
559
     */
560
0
    nameCopy = xmlMemStrdup(name);
561
0
    if (nameCopy == NULL)
562
0
        return(-1);
563
0
    aliasCopy = xmlMemStrdup(upper);
564
0
    if (aliasCopy == NULL) {
565
0
        xmlFree(nameCopy);
566
0
        return(-1);
567
0
    }
568
0
    xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = nameCopy;
569
0
    xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = aliasCopy;
570
0
    xmlCharEncodingAliasesNb++;
571
0
    return(0);
572
0
}
573
574
/**
575
 * Unregisters an encoding alias.
576
 *
577
 * @deprecated This function modifies global state and is not
578
 * thread-safe. See #xmlCtxtSetCharEncConvImpl for an alternative.
579
 *
580
 * @param alias  the alias name as parsed, in UTF-8 format (ASCII actually)
581
 * @returns 0 in case of success, -1 in case of error.
582
 */
583
int
584
0
xmlDelEncodingAlias(const char *alias) {
585
0
    int i;
586
587
0
    if (alias == NULL)
588
0
  return(-1);
589
590
0
    if (xmlCharEncodingAliases == NULL)
591
0
  return(-1);
592
    /*
593
     * Walk down the list looking for a definition of the alias
594
     */
595
0
    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
596
0
  if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
597
0
      xmlFree((char *) xmlCharEncodingAliases[i].name);
598
0
      xmlFree((char *) xmlCharEncodingAliases[i].alias);
599
0
      xmlCharEncodingAliasesNb--;
600
0
      memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
601
0
        sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
602
0
      return(0);
603
0
  }
604
0
    }
605
0
    return(-1);
606
0
}
607
608
static int
609
672k
xmlCompareEncTableEntries(const void *vkey, const void *ventry) {
610
672k
    const char *key = vkey;
611
672k
    const xmlEncTableEntry *entry = ventry;
612
613
672k
    return(xmlStrcasecmp(BAD_CAST key, BAD_CAST entry->name));
614
672k
}
615
616
static xmlCharEncoding
617
xmlParseCharEncodingInternal(const char *name)
618
137k
{
619
137k
    const xmlEncTableEntry *entry;
620
621
137k
    if (name == NULL)
622
0
       return(XML_CHAR_ENCODING_NONE);
623
624
137k
    entry = bsearch(name, xmlEncTable,
625
137k
                    sizeof(xmlEncTable) / sizeof(xmlEncTable[0]),
626
137k
                    sizeof(xmlEncTable[0]), xmlCompareEncTableEntries);
627
137k
    if (entry != NULL)
628
82.5k
        return(entry->enc);
629
630
54.8k
    return(XML_CHAR_ENCODING_ERROR);
631
137k
}
632
633
/**
634
 * Compare the string to the encoding schemes already known. Note
635
 * that the comparison is case insensitive accordingly to the section
636
 * [XML] 4.3.3 Character Encoding in Entities.
637
 *
638
 * @param name  the encoding name as parsed, in UTF-8 format (ASCII actually)
639
 * @returns one of the xmlCharEncoding values or XML_CHAR_ENCODING_NONE
640
 * if not recognized.
641
 */
642
xmlCharEncoding
643
xmlParseCharEncoding(const char *name)
644
0
{
645
0
    xmlCharEncoding enc = xmlParseCharEncodingInternal(name);
646
647
    /* Backward compatibility */
648
0
    if (enc == XML_CHAR_ENCODING_UTF16)
649
0
        enc = XML_CHAR_ENCODING_UTF16LE;
650
651
0
    return(enc);
652
0
}
653
654
/**
655
 * The "canonical" name for XML encoding.
656
 * C.f. http://www.w3.org/TR/REC-xml#charencoding
657
 * Section 4.3.3  Character Encoding in Entities
658
 *
659
 * @param enc  the encoding
660
 * @returns the canonical name for the given encoding.
661
 */
662
const char*
663
821k
xmlGetCharEncodingName(xmlCharEncoding enc) {
664
821k
    switch (enc) {
665
0
        case XML_CHAR_ENCODING_UTF16LE:
666
0
      return("UTF-16");
667
0
        case XML_CHAR_ENCODING_UTF16BE:
668
0
      return("UTF-16");
669
0
        case XML_CHAR_ENCODING_UCS4LE:
670
0
            return("UCS-4");
671
0
        case XML_CHAR_ENCODING_UCS4BE:
672
0
            return("UCS-4");
673
821k
        default:
674
821k
            break;
675
821k
    }
676
677
821k
    if ((enc <= 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS))
678
821k
        return(NULL);
679
680
0
    return(defaultHandlers[enc].name);
681
821k
}
682
683
/************************************************************************
684
 *                  *
685
 *      Char encoding handlers        *
686
 *                  *
687
 ************************************************************************/
688
689
/**
690
 * Create and registers an xmlCharEncodingHandler.
691
 *
692
 * @deprecated This function modifies global state and is not
693
 * thread-safe. See #xmlCtxtSetCharEncConvImpl for an alternative.
694
 *
695
 * @param name  the encoding name, in UTF-8 format (ASCII actually)
696
 * @param input  the xmlCharEncodingInputFunc to read that encoding
697
 * @param output  the xmlCharEncodingOutputFunc to write that encoding
698
 * @returns the xmlCharEncodingHandler created (or NULL in case of error).
699
 */
700
xmlCharEncodingHandler *
701
xmlNewCharEncodingHandler(const char *name,
702
                          xmlCharEncodingInputFunc input,
703
0
                          xmlCharEncodingOutputFunc output) {
704
0
    xmlCharEncodingHandlerPtr handler;
705
0
    const char *alias;
706
0
    char upper[500];
707
0
    int i;
708
0
    char *up = NULL;
709
710
    /*
711
     * Do the alias resolution
712
     */
713
0
    alias = xmlGetEncodingAlias(name);
714
0
    if (alias != NULL)
715
0
  name = alias;
716
717
    /*
718
     * Keep only the uppercase version of the encoding.
719
     */
720
0
    if (name == NULL)
721
0
  return(NULL);
722
0
    for (i = 0;i < 499;i++) {
723
0
        upper[i] = (char) toupper((unsigned char) name[i]);
724
0
  if (upper[i] == 0) break;
725
0
    }
726
0
    upper[i] = 0;
727
0
    up = xmlMemStrdup(upper);
728
0
    if (up == NULL)
729
0
  return(NULL);
730
731
    /*
732
     * allocate and fill-up an handler block.
733
     */
734
0
    handler = (xmlCharEncodingHandlerPtr)
735
0
              xmlMalloc(sizeof(xmlCharEncodingHandler));
736
0
    if (handler == NULL) {
737
0
        xmlFree(up);
738
0
  return(NULL);
739
0
    }
740
0
    memset(handler, 0, sizeof(xmlCharEncodingHandler));
741
0
    handler->input.legacyFunc = input;
742
0
    handler->output.legacyFunc = output;
743
0
    handler->name = up;
744
0
    handler->flags = XML_HANDLER_STATIC | XML_HANDLER_LEGACY;
745
746
    /*
747
     * registers and returns the handler.
748
     */
749
0
    xmlRegisterCharEncodingHandler(handler);
750
0
    return(handler);
751
0
}
752
753
/**
754
 * Create a custom xmlCharEncodingHandler.
755
 *
756
 * @param name  the encoding name
757
 * @param input  input callback which converts to UTF-8
758
 * @param output  output callback which converts from UTF-8
759
 * @param ctxtDtor  context destructor
760
 * @param inputCtxt  context for input callback
761
 * @param outputCtxt  context for output callback
762
 * @param out  pointer to resulting handler
763
 * @returns an xmlParserErrors code.
764
 */
765
xmlParserErrors
766
xmlCharEncNewCustomHandler(const char *name,
767
                           xmlCharEncConvFunc input, xmlCharEncConvFunc output,
768
                           xmlCharEncConvCtxtDtor ctxtDtor,
769
                           void *inputCtxt, void *outputCtxt,
770
52.4k
                           xmlCharEncodingHandler **out) {
771
52.4k
    xmlCharEncodingHandler *handler;
772
773
52.4k
    if (out == NULL)
774
0
        return(XML_ERR_ARGUMENT);
775
776
52.4k
    handler = xmlMalloc(sizeof(*handler));
777
52.4k
    if (handler == NULL)
778
11
        goto error;
779
52.4k
    memset(handler, 0, sizeof(*handler));
780
781
52.4k
    if (name != NULL) {
782
52.4k
        handler->name = xmlMemStrdup(name);
783
52.4k
        if (handler->name == NULL)
784
6
            goto error;
785
52.4k
    }
786
787
52.4k
    handler->input.func = input;
788
52.4k
    handler->output.func = output;
789
52.4k
    handler->ctxtDtor = ctxtDtor;
790
52.4k
    handler->inputCtxt = inputCtxt;
791
52.4k
    handler->outputCtxt = outputCtxt;
792
793
52.4k
    *out = handler;
794
52.4k
    return(XML_ERR_OK);
795
796
17
error:
797
17
    xmlFree(handler);
798
799
17
    if (ctxtDtor != NULL) {
800
17
        if (inputCtxt != NULL)
801
17
            ctxtDtor(inputCtxt);
802
17
        if (outputCtxt != NULL)
803
2
            ctxtDtor(outputCtxt);
804
17
    }
805
806
17
    return(XML_ERR_NO_MEMORY);
807
52.4k
}
808
809
/**
 * Does nothing but call #xmlInitParser.
 *
 * @deprecated Alias for #xmlInitParser.
 */
void
xmlInitCharEncodingHandlers(void)
{
    xmlInitParser();
}
816
817
/**
818
 * Initialize the char encoding support.
819
 */
820
void
821
2
xmlInitEncodingInternal(void) {
822
2
    unsigned short int tst = 0x1234;
823
2
    unsigned char *ptr = (unsigned char *) &tst;
824
825
2
    if (*ptr == 0x12) xmlLittleEndian = 0;
826
2
    else xmlLittleEndian = 1;
827
2
}
828
829
/**
830
 * Cleanup the memory allocated for the char encoding support, it
831
 * unregisters all the encoding handlers and the aliases.
832
 *
833
 * @deprecated This function will be made private. Call #xmlCleanupParser
834
 * to free global state but see the warnings there. #xmlCleanupParser
835
 * should be only called once at program exit. In most cases, you don't
836
 * have call cleanup functions at all.
837
 *
838
 */
839
void
840
0
xmlCleanupCharEncodingHandlers(void) {
841
0
    xmlCleanupEncodingAliases();
842
843
0
    if (globalHandlers == NULL) return;
844
845
0
    for (;nbCharEncodingHandler > 0;) {
846
0
        xmlCharEncodingHandler *handler;
847
848
0
        nbCharEncodingHandler--;
849
0
        handler = globalHandlers[nbCharEncodingHandler];
850
0
  if (handler != NULL) {
851
0
      if (handler->name != NULL)
852
0
    xmlFree(handler->name);
853
0
      xmlFree(handler);
854
0
  }
855
0
    }
856
0
    xmlFree(globalHandlers);
857
0
    globalHandlers = NULL;
858
0
    nbCharEncodingHandler = 0;
859
0
}
860
861
/**
862
 * Register the char encoding handler.
863
 *
864
 * @deprecated This function modifies global state and is not
865
 * thread-safe. See #xmlCtxtSetCharEncConvImpl for an alternative.
866
 *
867
 * @param handler  the xmlCharEncodingHandler handler block
868
 */
869
void
870
0
xmlRegisterCharEncodingHandler(xmlCharEncodingHandler *handler) {
871
0
    if (handler == NULL)
872
0
        return;
873
0
    if (globalHandlers == NULL) {
874
0
        globalHandlers = xmlMalloc(
875
0
                MAX_ENCODING_HANDLERS * sizeof(globalHandlers[0]));
876
0
        if (globalHandlers == NULL)
877
0
            goto free_handler;
878
0
    }
879
880
0
    if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS)
881
0
        goto free_handler;
882
0
    globalHandlers[nbCharEncodingHandler++] = handler;
883
0
    return;
884
885
0
free_handler:
886
0
    if (handler != NULL) {
887
0
        if (handler->name != NULL) {
888
0
            xmlFree(handler->name);
889
0
        }
890
0
        xmlFree(handler);
891
0
    }
892
0
}
893
894
/**
895
 * Search the non-default handlers for an exact match.
896
 *
897
 * @param norig  name of the char encoding
898
 * @param name  potentially aliased name of the encoding
899
 * @param flags  bit mask of flags
900
 * @param impl  a conversion implementation (optional)
901
 * @param implCtxt  user data for conversion implementation (optional)
902
 * @param out  pointer to resulting handler
903
 * @returns an xmlParserErrors error code.
904
 */
905
static xmlParserErrors
906
xmlFindExtraHandler(const char *norig, const char *name, xmlCharEncFlags flags,
907
                    xmlCharEncConvImpl impl, void *implCtxt,
908
60.5k
                    xmlCharEncodingHandler **out) {
909
    /*
910
     * Try custom implementation before deprecated global handlers.
911
     *
912
     * Note that we pass the original name without deprecated
913
     * alias resolution.
914
     */
915
60.5k
    if (impl != NULL)
916
0
        return(impl(implCtxt, norig, flags, out));
917
918
    /*
919
     * Deprecated
920
     */
921
60.5k
    if (globalHandlers != NULL) {
922
0
        int i;
923
924
0
        for (i = 0; i < nbCharEncodingHandler; i++) {
925
0
            xmlCharEncodingHandler *h = globalHandlers[i];
926
927
0
            if (!xmlStrcasecmp((const xmlChar *) name,
928
0
                               (const xmlChar *) h->name)) {
929
0
                if ((((flags & XML_ENC_INPUT) == 0) || (h->input.func)) &&
930
0
                    (((flags & XML_ENC_OUTPUT) == 0) || (h->output.func))) {
931
0
                    *out = h;
932
0
                    return(XML_ERR_OK);
933
0
                }
934
0
            }
935
0
        }
936
0
    }
937
938
60.5k
#ifdef LIBXML_ICONV_ENABLED
939
60.5k
    {
940
60.5k
        int ret = xmlCharEncIconv(name, flags, out);
941
942
60.5k
        if (ret == XML_ERR_OK)
943
52.4k
            return(XML_ERR_OK);
944
8.17k
        if (ret != XML_ERR_UNSUPPORTED_ENCODING)
945
28
            return(ret);
946
8.17k
    }
947
8.14k
#endif /* LIBXML_ICONV_ENABLED */
948
949
#ifdef LIBXML_ICU_ENABLED
950
    {
951
        int ret = xmlCharEncUconv(name, flags, out);
952
953
        if (ret == XML_ERR_OK)
954
            return(XML_ERR_OK);
955
        if (ret != XML_ERR_UNSUPPORTED_ENCODING)
956
            return(ret);
957
    }
958
#endif /* LIBXML_ICU_ENABLED */
959
960
8.14k
    return(XML_ERR_UNSUPPORTED_ENCODING);
961
8.17k
}
962
963
/**
964
 * Find or create a handler matching the encoding. The following
965
 * converters are looked up in order:
966
 *
967
 * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
968
 * - User-registered global handler (deprecated)
969
 * - iconv if enabled
970
 * - ICU if enabled
971
 *
972
 * The handler must be closed with #xmlCharEncCloseFunc.
973
 *
974
 * If the encoding is UTF-8, a NULL handler and no error code will
975
 * be returned.
976
 *
977
 * @since 2.13.0
978
 *
979
 * @param enc  an xmlCharEncoding value.
980
 * @param out  pointer to result
981
 * @returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
982
 * xmlParserErrors error code.
983
 */
984
xmlParserErrors
985
xmlLookupCharEncodingHandler(xmlCharEncoding enc,
986
11.7k
                             xmlCharEncodingHandler **out) {
987
11.7k
    const xmlCharEncodingHandler *handler;
988
989
11.7k
    if (out == NULL)
990
0
        return(XML_ERR_ARGUMENT);
991
11.7k
    *out = NULL;
992
993
11.7k
    if ((enc <= 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS))
994
0
        return(XML_ERR_UNSUPPORTED_ENCODING);
995
996
    /* Return NULL handler for UTF-8 */
997
11.7k
    if ((enc == XML_CHAR_ENCODING_UTF8) ||
998
11.7k
        (enc == XML_CHAR_ENCODING_NONE))
999
3.76k
        return(XML_ERR_OK);
1000
1001
7.96k
    handler = &defaultHandlers[enc];
1002
7.96k
    if ((handler->input.func != NULL) || (handler->output.func != NULL)) {
1003
7.07k
        *out = (xmlCharEncodingHandler *) handler;
1004
7.07k
        return(XML_ERR_OK);
1005
7.07k
    }
1006
1007
893
    if (handler->name != NULL) {
1008
893
        xmlCharEncFlags flags = XML_ENC_INPUT;
1009
1010
893
#ifdef LIBXML_OUTPUT_ENABLED
1011
893
        flags |= XML_ENC_OUTPUT;
1012
893
#endif
1013
893
        return(xmlFindExtraHandler(handler->name, handler->name, flags,
1014
893
                                   NULL, NULL, out));
1015
893
    }
1016
1017
0
    return(XML_ERR_UNSUPPORTED_ENCODING);
1018
893
}
1019
1020
/**
1021
 * @deprecated Use #xmlLookupCharEncodingHandler which has better error
1022
 * reporting.
1023
 *
1024
 * @param enc  an xmlCharEncoding value.
1025
 * @returns the handler or NULL if no handler was found or an error
1026
 * occurred.
1027
 */
1028
xmlCharEncodingHandler *
1029
0
xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1030
0
    xmlCharEncodingHandler *ret;
1031
1032
0
    xmlLookupCharEncodingHandler(enc, &ret);
1033
0
    return(ret);
1034
0
}
1035
1036
/**
1037
 * Find or create a handler matching the encoding. The following
1038
 * converters are looked up in order:
1039
 *
1040
 * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
1041
 * - Custom implementation if provided
1042
 * - User-registered global handler (deprecated)
1043
 * - iconv if enabled
1044
 * - ICU if enabled
1045
 *
1046
 * The handler must be closed with #xmlCharEncCloseFunc.
1047
 *
1048
 * If the encoding is UTF-8, a NULL handler and no error code will
1049
 * be returned.
1050
 *
1051
 * `flags` can contain XML_ENC_INPUT, XML_ENC_OUTPUT or both.
1052
 *
1053
 * @since 2.14.0
1054
 *
1055
 * @param name  a string describing the char encoding.
1056
 * @param flags  bit mask of flags
1057
 * @param impl  a conversion implementation (optional)
1058
 * @param implCtxt  user data for conversion implementation (optional)
1059
 * @param out  pointer to result
1060
 * @returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
1061
 * xmlParserErrors error code.
1062
 */
1063
xmlParserErrors
1064
xmlCreateCharEncodingHandler(const char *name, xmlCharEncFlags flags,
1065
                             xmlCharEncConvImpl impl, void *implCtxt,
1066
137k
                             xmlCharEncodingHandler **out) {
1067
137k
    const xmlCharEncodingHandler *handler;
1068
137k
    const char *norig, *nalias;
1069
137k
    xmlCharEncoding enc;
1070
1071
137k
    if (out == NULL)
1072
0
        return(XML_ERR_ARGUMENT);
1073
137k
    *out = NULL;
1074
1075
137k
    if ((name == NULL) || (flags == 0))
1076
0
        return(XML_ERR_ARGUMENT);
1077
1078
137k
    norig = name;
1079
137k
    nalias = xmlGetEncodingAlias(name);
1080
137k
    if (nalias != NULL)
1081
0
  name = nalias;
1082
1083
137k
    enc = xmlParseCharEncodingInternal(name);
1084
1085
    /* Return NULL handler for UTF-8 */
1086
137k
    if (enc == XML_CHAR_ENCODING_UTF8)
1087
9.10k
        return(XML_ERR_OK);
1088
1089
128k
    if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) {
1090
73.4k
        if (flags & XML_ENC_HTML) {
1091
            /*
1092
             * TODO: HTML5 only allows a fixed set of charset
1093
             * labels. We should add an option to enable or
1094
             * disable this restriction.
1095
             *
1096
             * TODO: Map ISO-8859-9 to windows-1254.
1097
             */
1098
0
            switch (enc) {
1099
0
                case XML_CHAR_ENCODING_ASCII:
1100
0
                case XML_CHAR_ENCODING_8859_1:
1101
0
                    enc = XML_CHAR_ENCODING_WINDOWS_1252;
1102
0
                    break;
1103
0
                case XML_CHAR_ENCODING_UCS2:
1104
0
                case XML_CHAR_ENCODING_UTF16:
1105
0
                    enc = XML_CHAR_ENCODING_UTF16LE;
1106
0
                    break;
1107
0
                default:
1108
0
                    break;
1109
0
            }
1110
0
        }
1111
1112
73.4k
        handler = &defaultHandlers[enc];
1113
73.4k
        if ((((flags & XML_ENC_INPUT) == 0) || (handler->input.func)) &&
1114
73.4k
            (((flags & XML_ENC_OUTPUT) == 0) || (handler->output.func))) {
1115
68.5k
            xmlCharEncodingHandler *ret;
1116
1117
            /*
1118
             * Return a copy of the handler with the original name.
1119
             */
1120
1121
68.5k
            ret = xmlMalloc(sizeof(*ret));
1122
68.5k
            if (ret == NULL)
1123
5
                return(XML_ERR_NO_MEMORY);
1124
68.5k
            memset(ret, 0, sizeof(*ret));
1125
1126
68.5k
            ret->name = xmlMemStrdup(norig);
1127
68.5k
            if (ret->name == NULL) {
1128
4
                xmlFree(ret);
1129
4
                return(XML_ERR_NO_MEMORY);
1130
4
            }
1131
68.5k
            ret->input = handler->input;
1132
68.5k
            ret->output = handler->output;
1133
68.5k
            ret->inputCtxt = handler->inputCtxt;
1134
68.5k
            ret->outputCtxt = handler->outputCtxt;
1135
68.5k
            ret->ctxtDtor = handler->ctxtDtor;
1136
1137
68.5k
            *out = ret;
1138
68.5k
            return(XML_ERR_OK);
1139
68.5k
        }
1140
73.4k
    }
1141
1142
59.6k
    return(xmlFindExtraHandler(norig, name, flags, impl, implCtxt, out));
1143
128k
}
1144
1145
/**
1146
 * Find or create a handler matching the encoding. The following
1147
 * converters are looked up in order:
1148
 *
1149
 * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
1150
 * - User-registered global handler (deprecated)
1151
 * - iconv if enabled
1152
 * - ICU if enabled
1153
 *
1154
 * The handler must be closed with #xmlCharEncCloseFunc.
1155
 *
1156
 * If the encoding is UTF-8, a NULL handler and no error code will
1157
 * be returned.
1158
 *
1159
 * @since 2.13.0
1160
 *
1161
 * @param name  a string describing the char encoding.
1162
 * @param output  boolean, use handler for output
1163
 * @param out  pointer to result
1164
 * @returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
1165
 * xmlParserErrors error code.
1166
 */
1167
xmlParserErrors
1168
xmlOpenCharEncodingHandler(const char *name, int output,
1169
0
                           xmlCharEncodingHandler **out) {
1170
0
    xmlCharEncFlags flags = output ? XML_ENC_OUTPUT : XML_ENC_INPUT;
1171
1172
0
    return(xmlCreateCharEncodingHandler(name, flags, NULL, NULL, out));
1173
0
}
1174
1175
/**
1176
 * If the encoding is UTF-8, this will return a no-op handler that
1177
 * shouldn't be used.
1178
 *
1179
 * @deprecated Use #xmlOpenCharEncodingHandler which has better error
1180
 * reporting.
1181
 *
1182
 * @param name  a string describing the char encoding.
1183
 * @returns the handler or NULL if no handler was found or an error
1184
 * occurred.
1185
 */
1186
xmlCharEncodingHandler *
1187
428
xmlFindCharEncodingHandler(const char *name) {
1188
428
    xmlCharEncodingHandler *ret;
1189
428
    xmlCharEncFlags flags;
1190
1191
    /*
1192
     * This handler shouldn't be used, but we must return a non-NULL
1193
     * handler.
1194
     */
1195
428
    if ((xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF-8") == 0) ||
1196
428
        (xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF8") == 0))
1197
0
        return((xmlCharEncodingHandlerPtr)
1198
0
                &defaultHandlers[XML_CHAR_ENCODING_UTF8]);
1199
1200
428
    flags = XML_ENC_INPUT;
1201
428
#ifdef LIBXML_OUTPUT_ENABLED
1202
428
    flags |= XML_ENC_OUTPUT;
1203
428
#endif
1204
428
    xmlCreateCharEncodingHandler(name, flags, NULL, NULL, &ret);
1205
428
    return(ret);
1206
428
}
1207
1208
/************************************************************************
1209
 *                  *
1210
 *    ICONV based generic conversion functions    *
1211
 *                  *
1212
 ************************************************************************/
1213
1214
#ifdef LIBXML_ICONV_ENABLED
1215
/* Conversion context handed to the iconv-based converter. */
typedef struct {
    iconv_t cd; /* iconv descriptor, (iconv_t) -1 while not open */
} xmlIconvCtxt;
1218
1219
/**
1220
 * The value of `inlen` after return is the number of bytes consumed.
1221
 * The value of `outlen` after return is the number of bytes produced.
1222
 *
1223
 * @param vctxt  conversion context
1224
 * @param out  a pointer to an array of bytes to store the result
1225
 * @param outlen  the length of `out`
1226
 * @param in  a pointer to an array of input bytes
1227
 * @param inlen  the length of `in`
1228
 * @param flush  end of input
1229
 * @returns an xmlCharEncError code.
1230
 */
1231
static xmlCharEncError
1232
xmlIconvConvert(void *vctxt, unsigned char *out, int *outlen,
1233
                const unsigned char *in, int *inlen,
1234
5.26M
                int flush ATTRIBUTE_UNUSED) {
1235
5.26M
    xmlIconvCtxt *ctxt = vctxt;
1236
5.26M
    size_t icv_inlen, icv_outlen;
1237
5.26M
    const char *icv_in = (const char *) in;
1238
5.26M
    char *icv_out = (char *) out;
1239
5.26M
    size_t ret;
1240
1241
5.26M
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
1242
44
        if (outlen != NULL) *outlen = 0;
1243
44
        return(XML_ENC_ERR_INTERNAL);
1244
44
    }
1245
5.26M
    icv_inlen = *inlen;
1246
5.26M
    icv_outlen = *outlen;
1247
    /*
1248
     * Some versions take const, other versions take non-const input.
1249
     */
1250
5.26M
    ret = iconv(ctxt->cd, (void *) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
1251
5.26M
    *inlen -= icv_inlen;
1252
5.26M
    *outlen -= icv_outlen;
1253
5.26M
    if (ret == (size_t) -1) {
1254
2.66M
        if (errno == EILSEQ)
1255
2.40M
            return(XML_ENC_ERR_INPUT);
1256
263k
        if (errno == E2BIG)
1257
259k
            return(XML_ENC_ERR_SPACE);
1258
        /*
1259
         * EINVAL means a truncated multi-byte sequence at the end
1260
         * of the input buffer. We treat this as success.
1261
         */
1262
3.66k
        if (errno == EINVAL)
1263
3.66k
            return(XML_ENC_ERR_SUCCESS);
1264
#ifdef __APPLE__
1265
        /*
1266
         * Apple's new libiconv can return EOPNOTSUPP under
1267
         * unknown circumstances (detected when fuzzing).
1268
         */
1269
        if (errno == EOPNOTSUPP)
1270
            return(XML_ENC_ERR_INPUT);
1271
#endif
1272
0
        return(XML_ENC_ERR_INTERNAL);
1273
3.66k
    }
1274
2.60M
    return(XML_ENC_ERR_SUCCESS);
1275
5.26M
}
1276
1277
static void
1278
112k
xmlIconvFree(void *vctxt) {
1279
112k
    xmlIconvCtxt *ctxt = vctxt;
1280
1281
112k
    if (ctxt == NULL)
1282
51.4k
        return;
1283
1284
61.2k
    if (ctxt->cd != (iconv_t) -1)
1285
53.3k
        iconv_close(ctxt->cd);
1286
1287
61.2k
    xmlFree(ctxt);
1288
61.2k
}
1289
1290
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && \
1291
    defined(__GLIBC__)
1292
#include <libxml/parserInternals.h>
1293
1294
/*
 * Fuzzy match for encoding names.
 *
 * Characters that aren't ASCII letters are skipped in both names.
 * The remaining letters are compared pairwise with bit 0x20 set on
 * both sides, which makes the comparison case-insensitive.
 *
 * Returns 1 if both names are exhausted at the same time without a
 * mismatch, 0 otherwise.
 */
static int
xmlEncodingMatch(const char *name1, const char *name2) {
    const char *a = name1;
    const char *b = name2;

    for (;;) {
        /* Advance each cursor to its next letter or terminator. */
        while ((*a != 0) && (!IS_ASCII_LETTER(*a)))
            a++;
        while ((*b != 0) && (!IS_ASCII_LETTER(*b)))
            b++;

        if ((*a == 0) || (*b == 0))
            break;

        if ((*a | 0x20) != (*b | 0x20))
            return(0);

        a++;
        b++;
    }

    /* A match requires that neither name has letters left over. */
    return((*a == 0) && (*b == 0));
}
1314
#endif /* FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION */
1315
1316
static xmlParserErrors
1317
xmlCharEncIconv(const char *name, xmlCharEncFlags flags,
1318
60.5k
                xmlCharEncodingHandler **out) {
1319
60.5k
    xmlCharEncConvFunc inFunc = NULL, outFunc = NULL;
1320
60.5k
    xmlIconvCtxt *inputCtxt = NULL, *outputCtxt = NULL;
1321
60.5k
    iconv_t icv_in;
1322
60.5k
    iconv_t icv_out;
1323
60.5k
    xmlParserErrors ret;
1324
1325
    /*
1326
     * POSIX allows "indicator suffixes" like "//IGNORE" to be
1327
     * passed to iconv_open. This can change the behavior in
1328
     * unexpected ways.
1329
     *
1330
     * Many iconv implementations also support non-standard
1331
     * codesets like "wchar_t", "char" or the empty string "".
1332
     * It would make sense to disallow them, but codeset names
1333
     * are matched fuzzily, so a string like "w-C.hA_rt" could
1334
     * be interpreted as "wchar_t".
1335
     *
1336
     * When escaping characters that aren't supported in the
1337
     * target encoding, we also rely on GNU libiconv behavior to
1338
     * stop conversion without trying any kind of fallback.
1339
     * This violates the POSIX spec which says:
1340
     *
1341
     * > If iconv() encounters a character in the input buffer
1342
     * > that is valid, but for which an identical character does
1343
     * > not exist in the output codeset [...] iconv() shall
1344
     * > perform an implementation-defined conversion on the
1345
     * > character.
1346
     *
1347
     * See: https://sourceware.org/bugzilla/show_bug.cgi?id=29913
1348
     *
1349
     * Unfortunately, strict POSIX compliance makes it impossible
1350
     * to detect untranslatable characters.
1351
     */
1352
60.5k
    if (strstr(name, "//") != NULL) {
1353
5
        ret = XML_ERR_UNSUPPORTED_ENCODING;
1354
5
        goto error;
1355
5
    }
1356
1357
60.5k
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && \
1358
60.5k
    defined(__GLIBC__)
1359
    /*
1360
     * This glibc bug can lead to unpredictable results with the
1361
     * push parser.
1362
     *
1363
     * https://sourceware.org/bugzilla/show_bug.cgi?id=32633
1364
     */
1365
60.5k
    if ((xmlEncodingMatch(name, "TSCII")) ||
1366
60.5k
        (xmlEncodingMatch(name, "BIG5-HKSCS"))) {
1367
235
        ret = XML_ERR_UNSUPPORTED_ENCODING;
1368
235
        goto error;
1369
235
    }
1370
60.3k
#endif
1371
1372
60.3k
    if (flags & XML_ENC_INPUT) {
1373
60.3k
        inputCtxt = xmlMalloc(sizeof(xmlIconvCtxt));
1374
60.3k
        if (inputCtxt == NULL) {
1375
9
            ret = XML_ERR_NO_MEMORY;
1376
9
            goto error;
1377
9
        }
1378
60.3k
        inputCtxt->cd = (iconv_t) -1;
1379
1380
60.3k
        icv_in = iconv_open("UTF-8", name);
1381
60.3k
        if (icv_in == (iconv_t) -1) {
1382
7.90k
            if (errno == EINVAL)
1383
7.90k
                ret = XML_ERR_UNSUPPORTED_ENCODING;
1384
0
            else if (errno == ENOMEM)
1385
0
                ret = XML_ERR_NO_MEMORY;
1386
0
            else
1387
0
                ret = XML_ERR_SYSTEM;
1388
7.90k
            goto error;
1389
7.90k
        }
1390
52.4k
        inputCtxt->cd = icv_in;
1391
1392
52.4k
        inFunc = xmlIconvConvert;
1393
52.4k
    }
1394
1395
52.4k
    if (flags & XML_ENC_OUTPUT) {
1396
938
        outputCtxt = xmlMalloc(sizeof(xmlIconvCtxt));
1397
938
        if (outputCtxt == NULL) {
1398
2
            ret = XML_ERR_NO_MEMORY;
1399
2
            goto error;
1400
2
        }
1401
936
        outputCtxt->cd = (iconv_t) -1;
1402
1403
936
        icv_out = iconv_open(name, "UTF-8");
1404
936
        if (icv_out == (iconv_t) -1) {
1405
0
            if (errno == EINVAL)
1406
0
                ret = XML_ERR_UNSUPPORTED_ENCODING;
1407
0
            else if (errno == ENOMEM)
1408
0
                ret = XML_ERR_NO_MEMORY;
1409
0
            else
1410
0
                ret = XML_ERR_SYSTEM;
1411
0
            goto error;
1412
0
        }
1413
936
        outputCtxt->cd = icv_out;
1414
1415
936
        outFunc = xmlIconvConvert;
1416
936
    }
1417
1418
52.4k
    return(xmlCharEncNewCustomHandler(name, inFunc, outFunc, xmlIconvFree,
1419
52.4k
                                      inputCtxt, outputCtxt, out));
1420
1421
8.15k
error:
1422
8.15k
    if (inputCtxt != NULL)
1423
7.90k
        xmlIconvFree(inputCtxt);
1424
8.15k
    if (outputCtxt != NULL)
1425
0
        xmlIconvFree(outputCtxt);
1426
8.15k
    return(ret);
1427
52.4k
}
1428
#endif /* LIBXML_ICONV_ENABLED */
1429
1430
/************************************************************************
1431
 *                  *
1432
 *    ICU based generic conversion functions    *
1433
 *                  *
1434
 ************************************************************************/
1435
1436
#ifdef LIBXML_ICU_ENABLED
1437
/* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */
1438
#define ICU_PIVOT_BUF_SIZE 1024
1439
1440
typedef struct _uconv_t xmlUconvCtxt;
1441
struct _uconv_t {
1442
  UConverter *uconv; /* for conversion between an encoding and UTF-16 */
1443
  UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
1444
  UChar      *pivot_source;
1445
  UChar      *pivot_target;
1446
  int        isInput;
1447
  UChar      pivot_buf[ICU_PIVOT_BUF_SIZE];
1448
};
1449
1450
/**
1451
 * The value of `inlen` after return is the number of bytes consumed.
1452
 * The value of `outlen` after return is the number of bytes produced.
1453
 *
1454
 * @param vctxt  conversion context
1455
 * @param out  a pointer to an array of bytes to store the result
1456
 * @param outlen  the length of `out`
1457
 * @param in  a pointer to an array of input bytes
1458
 * @param inlen  the length of `in`
1459
 * @param flush  end of input
1460
 * @returns an xmlCharEncError code.
1461
 */
1462
static xmlCharEncError
1463
xmlUconvConvert(void *vctxt, unsigned char *out, int *outlen,
1464
                const unsigned char *in, int *inlen, int flush) {
1465
    xmlUconvCtxt *cd = vctxt;
1466
    const char *ucv_in = (const char *) in;
1467
    char *ucv_out = (char *) out;
1468
    UConverter *target, *source;
1469
    UErrorCode err = U_ZERO_ERROR;
1470
    int ret;
1471
1472
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
1473
        if (outlen != NULL)
1474
            *outlen = 0;
1475
        return(XML_ENC_ERR_INTERNAL);
1476
    }
1477
1478
    /*
1479
     * The ICU API can consume input, including partial sequences,
1480
     * even if the output buffer would overflow. The remaining input
1481
     * must be processed by calling ucnv_convertEx with a possibly
1482
     * empty input buffer.
1483
     */
1484
    if (cd->isInput) {
1485
        source = cd->uconv;
1486
        target = cd->utf8;
1487
    } else {
1488
        source = cd->utf8;
1489
        target = cd->uconv;
1490
    }
1491
1492
    ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen,
1493
                   &ucv_in, ucv_in + *inlen, cd->pivot_buf,
1494
                   &cd->pivot_source, &cd->pivot_target,
1495
                   cd->pivot_buf + ICU_PIVOT_BUF_SIZE,
1496
                   /* reset */ 0, flush, &err);
1497
1498
    *inlen = ucv_in - (const char*) in;
1499
    *outlen = ucv_out - (char *) out;
1500
1501
    if (U_SUCCESS(err)) {
1502
        ret = XML_ENC_ERR_SUCCESS;
1503
    } else {
1504
        switch (err) {
1505
            case U_TRUNCATED_CHAR_FOUND:
1506
                /* Should only happen with flush */
1507
                ret = XML_ENC_ERR_INPUT;
1508
                break;
1509
1510
            case U_BUFFER_OVERFLOW_ERROR:
1511
                ret = XML_ENC_ERR_SPACE;
1512
                break;
1513
1514
            case U_INVALID_CHAR_FOUND:
1515
            case U_ILLEGAL_CHAR_FOUND:
1516
            case U_ILLEGAL_ESCAPE_SEQUENCE:
1517
            case U_UNSUPPORTED_ESCAPE_SEQUENCE:
1518
                ret = XML_ENC_ERR_INPUT;
1519
                break;
1520
1521
            case U_MEMORY_ALLOCATION_ERROR:
1522
                ret = XML_ENC_ERR_MEMORY;
1523
                break;
1524
1525
            default:
1526
                ret = XML_ENC_ERR_INTERNAL;
1527
                break;
1528
        }
1529
    }
1530
1531
    return(ret);
1532
}
1533
1534
static xmlParserErrors
1535
openIcuConverter(const char* name, int isInput, xmlUconvCtxt **out)
1536
{
1537
    UErrorCode status;
1538
    xmlUconvCtxt *conv;
1539
1540
    *out = NULL;
1541
1542
    conv = (xmlUconvCtxt *) xmlMalloc(sizeof(xmlUconvCtxt));
1543
    if (conv == NULL)
1544
        return(XML_ERR_NO_MEMORY);
1545
1546
    conv->isInput = isInput;
1547
    conv->pivot_source = conv->pivot_buf;
1548
    conv->pivot_target = conv->pivot_buf;
1549
1550
    status = U_ZERO_ERROR;
1551
    conv->uconv = ucnv_open(name, &status);
1552
    if (U_FAILURE(status))
1553
        goto error;
1554
1555
    status = U_ZERO_ERROR;
1556
    if (isInput) {
1557
        ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP,
1558
                                                NULL, NULL, NULL, &status);
1559
    }
1560
    else {
1561
        ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP,
1562
                                                NULL, NULL, NULL, &status);
1563
    }
1564
    if (U_FAILURE(status))
1565
        goto error;
1566
1567
    status = U_ZERO_ERROR;
1568
    conv->utf8 = ucnv_open("UTF-8", &status);
1569
    if (U_FAILURE(status))
1570
        goto error;
1571
1572
    *out = conv;
1573
    return(XML_ERR_OK);
1574
1575
error:
1576
    if (conv->uconv)
1577
        ucnv_close(conv->uconv);
1578
    xmlFree(conv);
1579
1580
    if (status == U_FILE_ACCESS_ERROR)
1581
        return(XML_ERR_UNSUPPORTED_ENCODING);
1582
    if (status == U_MEMORY_ALLOCATION_ERROR)
1583
        return(XML_ERR_NO_MEMORY);
1584
    return(XML_ERR_SYSTEM);
1585
}
1586
1587
static void
1588
closeIcuConverter(xmlUconvCtxt *conv)
1589
{
1590
    if (conv == NULL)
1591
        return;
1592
    ucnv_close(conv->uconv);
1593
    ucnv_close(conv->utf8);
1594
    xmlFree(conv);
1595
}
1596
1597
static void
1598
xmlUconvFree(void *vctxt) {
1599
    closeIcuConverter(vctxt);
1600
}
1601
1602
static xmlParserErrors
1603
xmlCharEncUconv(const char *name, xmlCharEncFlags flags,
1604
                xmlCharEncodingHandler **out) {
1605
    xmlCharEncConvFunc inFunc = NULL, outFunc = NULL;
1606
    xmlUconvCtxt *ucv_in = NULL;
1607
    xmlUconvCtxt *ucv_out = NULL;
1608
    int ret;
1609
1610
    if (flags & XML_ENC_INPUT) {
1611
        ret = openIcuConverter(name, 1, &ucv_in);
1612
        if (ret != 0)
1613
            goto error;
1614
        inFunc = xmlUconvConvert;
1615
    }
1616
1617
    if (flags & XML_ENC_OUTPUT) {
1618
        ret = openIcuConverter(name, 0, &ucv_out);
1619
        if (ret != 0)
1620
            goto error;
1621
        outFunc = xmlUconvConvert;
1622
    }
1623
1624
    return(xmlCharEncNewCustomHandler(name, inFunc, outFunc, xmlUconvFree,
1625
                                      ucv_in, ucv_out, out));
1626
1627
error:
1628
    if (ucv_in != NULL)
1629
        closeIcuConverter(ucv_in);
1630
    if (ucv_out != NULL)
1631
        closeIcuConverter(ucv_out);
1632
    return(ret);
1633
}
1634
#endif /* LIBXML_ICU_ENABLED */
1635
1636
/************************************************************************
1637
 *                  *
1638
 *    The real API used by libxml for on-the-fly conversion *
1639
 *                  *
1640
 ************************************************************************/
1641
1642
/**
1643
 * Convert xmlCharEncError to xmlParserErrors code.
1644
 *
1645
 * @param code  xmlCharEncError code
1646
 */
1647
static xmlParserErrors
1648
3.95k
xmlEncConvertError(xmlCharEncError code) {
1649
3.95k
    xmlParserErrors ret;
1650
1651
3.95k
    switch (code) {
1652
0
        case XML_ENC_ERR_SUCCESS:
1653
0
            ret = XML_ERR_OK;
1654
0
            break;
1655
3.94k
        case XML_ENC_ERR_INPUT:
1656
3.94k
            ret = XML_ERR_INVALID_ENCODING;
1657
3.94k
            break;
1658
3
        case XML_ENC_ERR_MEMORY:
1659
3
            ret = XML_ERR_NO_MEMORY;
1660
3
            break;
1661
4
        default:
1662
4
            ret = XML_ERR_INTERNAL_ERROR;
1663
4
            break;
1664
3.95k
    }
1665
1666
3.95k
    return(ret);
1667
3.95k
}
1668
1669
/**
1670
 * The value of `inlen` after return is the number of bytes consumed.
1671
 * The value of `outlen` after return is the number of bytes produced.
1672
 *
1673
 * @param handler  encoding handler
1674
 * @param out  a pointer to an array of bytes to store the result
1675
 * @param outlen  the length of `out`
1676
 * @param in  a pointer to an array of input bytes
1677
 * @param inlen  the length of `in`
1678
 * @param flush  end of input
1679
 * @returns an xmlCharEncError code.
1680
 */
1681
xmlCharEncError
1682
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
1683
                 int *outlen, const unsigned char *in, int *inlen,
1684
512k
                 int flush) {
1685
512k
    xmlCharEncError ret;
1686
1687
512k
    if (handler->flags & XML_HANDLER_LEGACY) {
1688
0
        xmlCharEncodingInputFunc func = handler->input.legacyFunc;
1689
1690
0
        if (func == NULL) {
1691
0
            *outlen = 0;
1692
0
            *inlen = 0;
1693
0
            return(XML_ENC_ERR_INTERNAL);
1694
0
        }
1695
1696
0
        ret = func(out, outlen, in, inlen);
1697
512k
    } else {
1698
512k
        xmlCharEncConvFunc func = handler->input.func;
1699
512k
        int oldInlen;
1700
1701
512k
        if (func == NULL) {
1702
0
            *outlen = 0;
1703
0
            *inlen = 0;
1704
0
            return(XML_ENC_ERR_INTERNAL);
1705
0
        }
1706
1707
512k
        oldInlen = *inlen;
1708
512k
        ret = func(handler->inputCtxt, out, outlen, in, inlen, flush);
1709
1710
        /*
1711
         * Check for truncated multi-byte sequence.
1712
         */
1713
512k
        if ((flush) && (ret == XML_ENC_ERR_SUCCESS) && (*inlen != oldInlen))
1714
2
            ret = XML_ENC_ERR_INPUT;
1715
512k
    }
1716
1717
512k
    if (ret > 0)
1718
74.6k
        ret = XML_ENC_ERR_SUCCESS;
1719
1720
512k
    return(ret);
1721
512k
}
1722
1723
/**
1724
 * The value of `inlen` after return is the number of bytes consumed.
1725
 * The value of `outlen` after return is the number of bytes produced.
1726
 *
1727
 * @param handler  encoding handler
1728
 * @param out  a pointer to an array of bytes to store the result
1729
 * @param outlen  the length of `out`
1730
 * @param in  a pointer to an array of input bytes
1731
 * @param inlen  the length of `in`
1732
 * @returns an xmlCharEncError code.
1733
 */
1734
static xmlCharEncError
1735
xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
1736
236M
                  int *outlen, const unsigned char *in, int *inlen) {
1737
236M
    xmlCharEncError ret;
1738
1739
236M
    if (handler->flags & XML_HANDLER_LEGACY) {
1740
0
        xmlCharEncodingOutputFunc func = handler->output.legacyFunc;
1741
1742
0
        if (func == NULL) {
1743
0
            *outlen = 0;
1744
0
            *inlen = 0;
1745
0
            return(XML_ENC_ERR_INTERNAL);
1746
0
        }
1747
1748
0
        ret = func(out, outlen, in, inlen);
1749
236M
    } else {
1750
236M
        xmlCharEncConvFunc func = handler->output.func;
1751
1752
236M
        if (func == NULL) {
1753
0
            *outlen = 0;
1754
0
            *inlen = 0;
1755
0
            return(XML_ENC_ERR_INTERNAL);
1756
0
        }
1757
1758
236M
        ret = func(handler->outputCtxt, out, outlen, in, inlen, /* flush */ 0);
1759
236M
    }
1760
1761
236M
    if (ret > 0)
1762
116M
        ret = XML_ENC_ERR_SUCCESS;
1763
1764
236M
    return(ret);
1765
236M
}
1766
1767
/**
1768
 * DEPERECATED: Don't use.
1769
 *
1770
 * @param handler  encoding handler
1771
 * @param out  an xmlBuffer for the output.
1772
 * @param in  an xmlBuffer for the input
1773
 * @returns the number of bytes written or an xmlCharEncError code.
1774
 */
1775
int
1776
xmlCharEncFirstLine(xmlCharEncodingHandler *handler, struct _xmlBuffer *out,
1777
0
                    struct _xmlBuffer *in) {
1778
0
    return(xmlCharEncInFunc(handler, out, in));
1779
0
}
1780
1781
/**
1782
 * Generic front-end for input encoding conversion.
1783
 *
1784
 * `sizeOut` should be set to the maximum output size (or SIZE_MAX).
1785
 * After return, it is set to the number of bytes written.
1786
 *
1787
 * @param input  a parser input buffer
1788
 * @param sizeOut  pointer to output size
1789
 * @param flush  end of input
1790
 * @returns an xmlCharEncError code.
1791
 */
1792
xmlCharEncError
1793
xmlCharEncInput(xmlParserInputBuffer *input, size_t *sizeOut, int flush)
1794
8.82M
{
1795
8.82M
    xmlBufPtr out, in;
1796
8.82M
    const xmlChar *dataIn;
1797
8.82M
    size_t availIn;
1798
8.82M
    size_t maxOut;
1799
8.82M
    size_t totalIn, totalOut;
1800
8.82M
    xmlCharEncError ret;
1801
1802
8.82M
    out = input->buffer;
1803
8.82M
    in = input->raw;
1804
1805
8.82M
    maxOut = *sizeOut;
1806
8.82M
    totalOut = 0;
1807
1808
8.82M
    *sizeOut = 0;
1809
1810
8.82M
    availIn = xmlBufUse(in);
1811
8.82M
    if ((availIn == 0) && (!flush))
1812
8.36M
        return(0);
1813
460k
    dataIn = xmlBufContent(in);
1814
460k
    totalIn = 0;
1815
1816
510k
    while (1) {
1817
510k
        size_t availOut;
1818
510k
        int completeOut, completeIn;
1819
510k
        int c_out, c_in;
1820
1821
510k
        availOut = xmlBufAvail(out);
1822
510k
        if (availOut > INT_MAX / 2)
1823
0
            availOut = INT_MAX / 2;
1824
1825
510k
        if (availOut < maxOut) {
1826
73.5k
            c_out = availOut;
1827
73.5k
            completeOut = 0;
1828
436k
        } else {
1829
436k
            c_out = maxOut;
1830
436k
            completeOut = 1;
1831
436k
        }
1832
1833
510k
        if (availIn > INT_MAX / 2) {
1834
0
            c_in = INT_MAX / 2;
1835
0
            completeIn = 0;
1836
510k
        } else {
1837
510k
            c_in = availIn;
1838
510k
            completeIn = 1;
1839
510k
        }
1840
1841
510k
        ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
1842
510k
                               dataIn, &c_in, flush && completeIn);
1843
1844
510k
        totalIn += c_in;
1845
510k
        dataIn += c_in;
1846
510k
        availIn -= c_in;
1847
1848
510k
        totalOut += c_out;
1849
510k
        maxOut -= c_out;
1850
510k
        xmlBufAddLen(out, c_out);
1851
1852
510k
        if ((ret != XML_ENC_ERR_SUCCESS) && (ret != XML_ENC_ERR_SPACE)) {
1853
3.94k
            input->error = xmlEncConvertError(ret);
1854
3.94k
            return(ret);
1855
3.94k
        }
1856
1857
506k
        if ((completeOut) && (completeIn))
1858
433k
            break;
1859
72.7k
        if ((completeOut) && (ret == XML_ENC_ERR_SPACE))
1860
0
            break;
1861
72.7k
        if ((completeIn) && (ret == XML_ENC_ERR_SUCCESS))
1862
22.6k
            break;
1863
1864
50.1k
        if (ret == XML_ENC_ERR_SPACE) {
1865
50.1k
            if (xmlBufGrow(out, 4096) < 0) {
1866
15
                input->error = XML_ERR_NO_MEMORY;
1867
15
                return(XML_ENC_ERR_MEMORY);
1868
15
            }
1869
50.1k
        }
1870
50.1k
    }
1871
1872
456k
    xmlBufShrink(in, totalIn);
1873
1874
456k
    if (input->rawconsumed > ULONG_MAX - (unsigned long) totalIn)
1875
0
        input->rawconsumed = ULONG_MAX;
1876
456k
    else
1877
456k
        input->rawconsumed += totalIn;
1878
1879
456k
    *sizeOut = totalOut;
1880
456k
    return(XML_ENC_ERR_SUCCESS);
1881
460k
}
1882
1883
/**
1884
 * Generic front-end for input encoding conversion.
1885
 *
1886
 * @param handler  encoding handler
1887
 * @param out  an xmlBuffer for the output.
1888
 * @param in  an xmlBuffer for the input
1889
 * @returns the number of bytes written or an xmlCharEncError code.
1890
 */
1891
int
1892
xmlCharEncInFunc(xmlCharEncodingHandler * handler, struct _xmlBuffer *out,
1893
                 struct _xmlBuffer *in)
1894
0
{
1895
0
    int ret;
1896
0
    int written;
1897
0
    int toconv;
1898
1899
0
    if (handler == NULL)
1900
0
        return(XML_ENC_ERR_INTERNAL);
1901
0
    if (out == NULL)
1902
0
        return(XML_ENC_ERR_INTERNAL);
1903
0
    if (in == NULL)
1904
0
        return(XML_ENC_ERR_INTERNAL);
1905
1906
0
    toconv = in->use;
1907
0
    if (toconv == 0)
1908
0
        return (0);
1909
0
    written = out->size - out->use -1; /* count '\0' */
1910
0
    if (toconv * 2 >= written) {
1911
0
        xmlBufferGrow(out, out->size + toconv * 2);
1912
0
        written = out->size - out->use - 1;
1913
0
    }
1914
0
    ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
1915
0
                           in->content, &toconv, /* flush */ 0);
1916
0
    xmlBufferShrink(in, toconv);
1917
0
    out->use += written;
1918
0
    out->content[out->use] = 0;
1919
1920
0
    return (written? written : ret);
1921
0
}
1922
1923
/*
 * @param buf  a char buffer
 * @param val  a codepoint
 *
 * Serializes a decimal char ref like `&#38;`.
 *
 * At most seven decimal digits are emitted, so at most 10 bytes are
 * written. Does not include a terminating zero byte.
 *
 * @returns the number of bytes written.
 */
static int
xmlSerializeDecCharRef(char *buf, int val) {
    char *digits = buf + 2;
    int ndigits, pos;

    buf[0] = '&';
    buf[1] = '#';

    /* Number of decimal digits, capped at seven. */
    if (val < 10)
        ndigits = 1;
    else if (val < 100)
        ndigits = 2;
    else if (val < 1000)
        ndigits = 3;
    else if (val < 10000)
        ndigits = 4;
    else if (val < 100000)
        ndigits = 5;
    else if (val < 1000000)
        ndigits = 6;
    else
        ndigits = 7;

    /* Fill in the digits from least to most significant. */
    pos = ndigits;
    while (pos-- > 0) {
        digits[pos] = '0' + val % 10;
        val /= 10;
    }

    digits[ndigits] = ';';

    /* '&', '#' and ';' plus the digits. */
    return(ndigits + 3);
}
1960
1961
#ifdef LIBXML_OUTPUT_ENABLED
1962
/**
1963
 * Generic front-end for output encoding conversion.
1964
 *
1965
 * A first call with `init` set to 1 has to be made to write a BOM.
1966
 *
1967
 * When using GNU libiconv, unsupported characters in the output
1968
 * encoding will be automatically replaced with a numeric character
1969
 * reference.
1970
 *
1971
 * @param output  a parser output buffer
1972
 * @param init  is this an initialization call without data
1973
 * @returns the number of bytes written or an xmlCharEncError code.
1974
 */
1975
int
1976
xmlCharEncOutput(xmlOutputBuffer *output, int init)
1977
750k
{
1978
750k
    int ret;
1979
750k
    size_t written;
1980
750k
    int writtentot = 0;
1981
750k
    size_t toconv;
1982
750k
    int c_in;
1983
750k
    int c_out;
1984
750k
    xmlBufPtr in;
1985
750k
    xmlBufPtr out;
1986
1987
750k
    if ((output == NULL) || (output->encoder == NULL) ||
1988
750k
        (output->buffer == NULL) || (output->conv == NULL))
1989
0
        return(XML_ENC_ERR_INTERNAL);
1990
750k
    out = output->conv;
1991
750k
    in = output->buffer;
1992
1993
118M
retry:
1994
1995
118M
    written = xmlBufAvail(out);
1996
1997
    /*
1998
     * First specific handling of the initialization call
1999
     */
2000
118M
    if (init) {
2001
395
        c_in = 0;
2002
395
        c_out = written;
2003
        /* TODO: Check return value. */
2004
395
        xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
2005
395
                          NULL, &c_in);
2006
395
        xmlBufAddLen(out, c_out);
2007
395
        return(c_out);
2008
395
    }
2009
2010
    /*
2011
     * Conversion itself.
2012
     */
2013
118M
    toconv = xmlBufUse(in);
2014
118M
    if (toconv > 64 * 1024)
2015
36.7M
        toconv = 64 * 1024;
2016
118M
    if (toconv * 4 >= written) {
2017
1.31k
        if (xmlBufGrow(out, toconv * 4) < 0) {
2018
3
            ret = XML_ENC_ERR_MEMORY;
2019
3
            goto error;
2020
3
        }
2021
1.31k
        written = xmlBufAvail(out);
2022
1.31k
    }
2023
118M
    if (written > 256 * 1024)
2024
112M
        written = 256 * 1024;
2025
2026
118M
    c_in = toconv;
2027
118M
    c_out = written;
2028
118M
    ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
2029
118M
                            xmlBufContent(in), &c_in);
2030
118M
    xmlBufShrink(in, c_in);
2031
118M
    xmlBufAddLen(out, c_out);
2032
118M
    writtentot += c_out;
2033
2034
118M
    if (ret == XML_ENC_ERR_SPACE)
2035
0
        goto retry;
2036
2037
    /*
2038
     * Attempt to handle error cases
2039
     */
2040
118M
    if (ret == XML_ENC_ERR_INPUT) {
2041
117M
        xmlChar charref[20];
2042
117M
        int len = xmlBufUse(in);
2043
117M
        xmlChar *content = xmlBufContent(in);
2044
117M
        int cur, charrefLen;
2045
2046
117M
        cur = xmlGetUTF8Char(content, &len);
2047
117M
        if (cur <= 0)
2048
6
            goto error;
2049
2050
        /*
2051
         * Removes the UTF8 sequence, and replace it by a charref
2052
         * and continue the transcoding phase, hoping the error
2053
         * did not mangle the encoder state.
2054
         */
2055
117M
        charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
2056
117M
        xmlBufGrow(out, charrefLen * 4);
2057
117M
        c_out = xmlBufAvail(out);
2058
117M
        c_in = charrefLen;
2059
117M
        ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
2060
117M
                                charref, &c_in);
2061
117M
        if ((ret < 0) || (c_in != charrefLen)) {
2062
8
            ret = XML_ENC_ERR_INTERNAL;
2063
8
            goto error;
2064
8
        }
2065
2066
117M
        xmlBufShrink(in, len);
2067
117M
        xmlBufAddLen(out, c_out);
2068
117M
        writtentot += c_out;
2069
117M
        goto retry;
2070
117M
    }
2071
2072
749k
error:
2073
749k
    if (((writtentot <= 0) && (ret != 0)) ||
2074
749k
        (ret == XML_ENC_ERR_MEMORY)) {
2075
11
        if (output->error == 0)
2076
11
            output->error = xmlEncConvertError(ret);
2077
11
        return(ret);
2078
11
    }
2079
2080
749k
    return(writtentot);
2081
749k
}
2082
#endif
2083
2084
/**
2085
 * Generic front-end for output encoding conversion.
2086
 *
2087
 * A first call with `in` set to NULL has to be made to write a BOM.
2088
 *
2089
 * When using GNU libiconv, unsupported characters in the output
2090
 * encoding will be automatically replaced with a numeric character
2091
 * reference.
2092
 *
2093
 * @param handler  encoding handler
2094
 * @param out  an xmlBuffer for the output.
2095
 * @param in  an xmlBuffer for the input
2096
 * @returns the number of bytes written or an xmlCharEncError code.
2097
 */
2098
int
2099
xmlCharEncOutFunc(xmlCharEncodingHandler *handler, struct _xmlBuffer *out,
2100
0
                  struct _xmlBuffer *in) {
2101
0
    int ret;
2102
0
    int written;
2103
0
    int writtentot = 0;
2104
0
    int toconv;
2105
2106
0
    if (handler == NULL) return(XML_ENC_ERR_INTERNAL);
2107
0
    if (out == NULL) return(XML_ENC_ERR_INTERNAL);
2108
2109
0
retry:
2110
2111
0
    written = out->size - out->use;
2112
2113
0
    if (written > 0)
2114
0
  written--; /* Gennady: count '/0' */
2115
2116
    /*
2117
     * First specific handling of in = NULL, i.e. the initialization call
2118
     */
2119
0
    if (in == NULL) {
2120
0
        toconv = 0;
2121
        /* TODO: Check return value. */
2122
0
        xmlEncOutputChunk(handler, &out->content[out->use], &written,
2123
0
                          NULL, &toconv);
2124
0
        out->use += written;
2125
0
        out->content[out->use] = 0;
2126
0
        return(0);
2127
0
    }
2128
2129
    /*
2130
     * Conversion itself.
2131
     */
2132
0
    toconv = in->use;
2133
0
    if (toconv * 4 >= written) {
2134
0
        xmlBufferGrow(out, toconv * 4);
2135
0
  written = out->size - out->use - 1;
2136
0
    }
2137
0
    ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
2138
0
                            in->content, &toconv);
2139
0
    xmlBufferShrink(in, toconv);
2140
0
    out->use += written;
2141
0
    writtentot += written;
2142
0
    out->content[out->use] = 0;
2143
2144
0
    if (ret == XML_ENC_ERR_SPACE)
2145
0
        goto retry;
2146
2147
    /*
2148
     * Attempt to handle error cases
2149
     */
2150
0
    if (ret == XML_ENC_ERR_INPUT) {
2151
0
        xmlChar charref[20];
2152
0
        int len = in->use;
2153
0
        const xmlChar *utf = (const xmlChar *) in->content;
2154
0
        int cur, charrefLen;
2155
2156
0
        cur = xmlGetUTF8Char(utf, &len);
2157
0
        if (cur <= 0)
2158
0
            return(ret);
2159
2160
        /*
2161
         * Removes the UTF8 sequence, and replace it by a charref
2162
         * and continue the transcoding phase, hoping the error
2163
         * did not mangle the encoder state.
2164
         */
2165
0
        charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
2166
0
        xmlBufferShrink(in, len);
2167
0
        xmlBufferGrow(out, charrefLen * 4);
2168
0
        written = out->size - out->use - 1;
2169
0
        toconv = charrefLen;
2170
0
        ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
2171
0
                                charref, &toconv);
2172
0
        if ((ret < 0) || (toconv != charrefLen))
2173
0
            return(XML_ENC_ERR_INTERNAL);
2174
2175
0
        out->use += written;
2176
0
        writtentot += written;
2177
0
        out->content[out->use] = 0;
2178
0
        goto retry;
2179
0
    }
2180
0
    return(writtentot ? writtentot : ret);
2181
0
}
2182
2183
/**
2184
 * Releases an xmlCharEncodingHandler. Must be called after
2185
 * a handler is no longer in use.
2186
 *
2187
 * @param handler  encoding handler
2188
 * @returns 0.
2189
 */
2190
int
2191
128k
xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2192
128k
    if (handler == NULL)
2193
0
        return(0);
2194
2195
128k
    if (handler->flags & XML_HANDLER_STATIC)
2196
7.07k
        return(0);
2197
2198
120k
    xmlFree(handler->name);
2199
120k
    if (handler->ctxtDtor != NULL) {
2200
52.4k
        handler->ctxtDtor(handler->inputCtxt);
2201
52.4k
        handler->ctxtDtor(handler->outputCtxt);
2202
52.4k
    }
2203
120k
    xmlFree(handler);
2204
120k
    return(0);
2205
128k
}
2206
2207
/**
2208
 * This function provides the current index of the parser relative
2209
 * to the start of the current entity. This function is computed in
2210
 * bytes from the beginning starting at zero and finishing at the
2211
 * size in byte of the file if parsing a file. The function is
2212
 * of constant cost if the input is UTF-8 but can be costly if run
2213
 * on non-UTF-8 input.
2214
 *
2215
 * @deprecated Don't use.
2216
 *
2217
 * @param ctxt  an XML parser context
2218
 * @returns the index in bytes from the beginning of the entity or -1
2219
 *         in case the index could not be computed.
2220
 */
2221
long
2222
0
xmlByteConsumed(xmlParserCtxt *ctxt) {
2223
0
    xmlParserInputPtr in;
2224
2225
0
    if (ctxt == NULL)
2226
0
        return(-1);
2227
0
    in = ctxt->input;
2228
0
    if (in == NULL)
2229
0
        return(-1);
2230
2231
0
    if ((in->buf != NULL) && (in->buf->encoder != NULL)) {
2232
0
        int unused = 0;
2233
0
  xmlCharEncodingHandler * handler = in->buf->encoder;
2234
2235
        /*
2236
   * Encoding conversion, compute the number of unused original
2237
   * bytes from the input not consumed and subtract that from
2238
   * the raw consumed value, this is not a cheap operation
2239
   */
2240
0
        if (in->end - in->cur > 0) {
2241
0
      unsigned char *convbuf;
2242
0
      const unsigned char *cur = (const unsigned char *)in->cur;
2243
0
      int toconv, ret;
2244
2245
0
            convbuf = xmlMalloc(32000);
2246
0
            if (convbuf == NULL)
2247
0
                return(-1);
2248
2249
0
            toconv = in->end - cur;
2250
0
            unused = 32000;
2251
0
            ret = xmlEncOutputChunk(handler, convbuf, &unused, cur, &toconv);
2252
2253
0
            xmlFree(convbuf);
2254
2255
0
            if (ret != XML_ENC_ERR_SUCCESS)
2256
0
                return(-1);
2257
0
  }
2258
2259
0
  if (in->buf->rawconsumed < (unsigned long) unused)
2260
0
      return(-1);
2261
0
  return(in->buf->rawconsumed - unused);
2262
0
    }
2263
2264
0
    return(in->consumed + (in->cur - in->base));
2265
0
}
2266
2267
/************************************************************************
2268
 *                  *
2269
 *    Conversions To/From UTF8 encoding     *
2270
 *                  *
2271
 ************************************************************************/
2272
2273
static xmlCharEncError
2274
asciiToAscii(void *vctxt ATTRIBUTE_UNUSED,
2275
             unsigned char* out, int *poutlen,
2276
             const unsigned char* in, int *pinlen,
2277
231M
             int flush ATTRIBUTE_UNUSED) {
2278
231M
    const unsigned char *inend;
2279
231M
    const unsigned char *instart = in;
2280
231M
    int inlen, outlen, ret;
2281
2282
231M
    if (in == NULL) {
2283
145
        *pinlen = 0;
2284
145
        *poutlen = 0;
2285
145
        return(XML_ENC_ERR_SUCCESS);
2286
145
    }
2287
2288
231M
    inlen = *pinlen;
2289
231M
    outlen = *poutlen;
2290
2291
231M
    if (outlen < inlen) {
2292
262
        inlen = outlen;
2293
262
        ret = XML_ENC_ERR_SPACE;
2294
231M
    } else {
2295
231M
        ret = inlen;
2296
231M
    }
2297
2298
231M
    inend = in + inlen;
2299
231M
    *poutlen = inlen;
2300
231M
    *pinlen = inlen;
2301
2302
1.08G
    while (in < inend) {
2303
970M
  unsigned c = *in;
2304
2305
970M
        if (c >= 0x80) {
2306
115M
      *poutlen = in - instart;
2307
115M
      *pinlen = in - instart;
2308
115M
      return(XML_ENC_ERR_INPUT);
2309
115M
  }
2310
2311
854M
        in++;
2312
854M
  *out++ = c;
2313
854M
    }
2314
2315
116M
    return(ret);
2316
231M
}
2317
2318
static xmlCharEncError
2319
latin1ToUTF8(void *vctxt ATTRIBUTE_UNUSED,
2320
             unsigned char* out, int *outlen,
2321
             const unsigned char* in, int *inlen,
2322
126k
             int flush ATTRIBUTE_UNUSED) {
2323
126k
    unsigned char* outstart = out;
2324
126k
    const unsigned char* instart = in;
2325
126k
    unsigned char* outend;
2326
126k
    const unsigned char* inend;
2327
126k
    int ret = XML_ENC_ERR_SPACE;
2328
2329
126k
    if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
2330
0
  return(XML_ENC_ERR_INTERNAL);
2331
2332
126k
    outend = out + *outlen;
2333
126k
    inend = in + *inlen;
2334
2335
157M
    while (in < inend) {
2336
157M
        unsigned c = *in;
2337
2338
157M
  if (c < 0x80) {
2339
83.2M
            if (out >= outend)
2340
15.1k
                goto done;
2341
83.2M
            *out++ = c;
2342
83.2M
  } else {
2343
74.0M
            if (outend - out < 2)
2344
32.4k
                goto done;
2345
73.9M
      *out++ = (c >> 6) | 0xC0;
2346
73.9M
            *out++ = (c & 0x3F) | 0x80;
2347
73.9M
        }
2348
2349
157M
        in++;
2350
157M
    }
2351
2352
78.5k
    ret = out - outstart;
2353
2354
126k
done:
2355
126k
    *outlen = out - outstart;
2356
126k
    *inlen = in - instart;
2357
126k
    return(ret);
2358
78.5k
}
2359
2360
/**
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. This is a public wrapper around latin1ToUTF8().
 *
 * The value of `inlen` after return is the number of bytes consumed.
 * The value of `outlen` after return is the number of bytes produced.
 *
 * @param out  a pointer to an array of bytes to store the result
 * @param outlen  the length of `out`
 * @param in  a pointer to an array of ISO Latin 1 chars
 * @param inlen  the length of `in`
 * @returns the number of bytes written or an xmlCharEncError code.
 */
int
xmlIsolat1ToUTF8(unsigned char* out, int *outlen,
                 const unsigned char* in, int *inlen) {
    int res;

    res = latin1ToUTF8(/* ctxt */ NULL, out, outlen, in, inlen,
                       /* flush */ 0);

    return(res);
}
2379
2380
static xmlCharEncError
2381
UTF8ToUTF8(void *vctxt ATTRIBUTE_UNUSED,
2382
           unsigned char* out, int *outlen,
2383
           const unsigned char* in, int *inlen,
2384
0
           int flush ATTRIBUTE_UNUSED) {
2385
0
    int len;
2386
0
    int ret;
2387
2388
0
    if (in == NULL) {
2389
0
        *inlen = 0;
2390
0
        *outlen = 0;
2391
0
        return(XML_ENC_ERR_SUCCESS);
2392
0
    }
2393
2394
0
    if (*outlen < *inlen) {
2395
0
  len = *outlen;
2396
0
        ret = XML_ENC_ERR_SPACE;
2397
0
    } else {
2398
0
  len = *inlen;
2399
0
        ret = len;
2400
0
    }
2401
2402
0
    memcpy(out, in, len);
2403
2404
0
    *outlen = len;
2405
0
    *inlen = len;
2406
0
    return(ret);
2407
0
}
2408
2409
2410
#ifdef LIBXML_OUTPUT_ENABLED
2411
static xmlCharEncError
2412
UTF8ToLatin1(void *vctxt ATTRIBUTE_UNUSED,
2413
             unsigned char* out, int *outlen,
2414
             const unsigned char* in, int *inlen,
2415
6.79k
             int flush ATTRIBUTE_UNUSED) {
2416
6.79k
    const unsigned char* outend;
2417
6.79k
    const unsigned char* outstart = out;
2418
6.79k
    const unsigned char* instart = in;
2419
6.79k
    const unsigned char* inend;
2420
6.79k
    unsigned c;
2421
6.79k
    int ret = XML_ENC_ERR_SPACE;
2422
2423
6.79k
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
2424
0
        return(XML_ENC_ERR_INTERNAL);
2425
2426
6.79k
    if (in == NULL) {
2427
46
        *inlen = 0;
2428
46
        *outlen = 0;
2429
46
        return(XML_ENC_ERR_SUCCESS);
2430
46
    }
2431
2432
6.74k
    inend = in + *inlen;
2433
6.74k
    outend = out + *outlen;
2434
43.0k
    while (in < inend) {
2435
39.5k
        if (out >= outend)
2436
0
            goto done;
2437
2438
39.5k
  c = *in;
2439
2440
39.5k
        if (c < 0x80) {
2441
35.1k
            *out++ = c;
2442
35.1k
        } else if ((c >= 0xC2) && (c <= 0xC3)) {
2443
1.06k
            if (inend - in < 2)
2444
0
                break;
2445
1.06k
            in++;
2446
1.06k
            *out++ = (unsigned char) ((c << 6) | (*in & 0x3F));
2447
3.30k
        } else {
2448
3.30k
            ret = XML_ENC_ERR_INPUT;
2449
3.30k
            goto done;
2450
3.30k
  }
2451
2452
36.2k
        in++;
2453
36.2k
    }
2454
2455
3.44k
    ret = out - outstart;
2456
2457
6.74k
done:
2458
6.74k
    *outlen = out - outstart;
2459
6.74k
    *inlen = in - instart;
2460
6.74k
    return(ret);
2461
3.44k
}
2462
2463
/**
2464
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
2465
 * block of chars out.
2466
 *
2467
 * The value of `inlen` after return is the number of bytes consumed.
2468
 * The value of `outlen` after return is the number of bytes produced.
2469
 *
2470
 * @param out  a pointer to an array of bytes to store the result
2471
 * @param outlen  the length of `out`
2472
 * @param in  a pointer to an array of UTF-8 chars
2473
 * @param inlen  the length of `in`
2474
 * @returns the number of bytes written or an xmlCharEncError code.
2475
 */
2476
int
2477
xmlUTF8ToIsolat1(unsigned char* out, int *outlen,
2478
0
              const unsigned char* in, int *inlen) {
2479
0
    if ((out == NULL) || (outlen == NULL) || (in == NULL) || (inlen == NULL))
2480
0
        return(XML_ENC_ERR_INTERNAL);
2481
2482
0
    return(UTF8ToLatin1(/* ctxt */ NULL, out, outlen, in, inlen,
2483
0
                        /* flush */ 0));
2484
0
}
2485
#endif /* LIBXML_OUTPUT_ENABLED */
2486
2487
static xmlCharEncError
2488
UTF16LEToUTF8(void *vctxt ATTRIBUTE_UNUSED,
2489
              unsigned char *out, int *outlen,
2490
              const unsigned char *in, int *inlen,
2491
15.8k
              int flush ATTRIBUTE_UNUSED) {
2492
15.8k
    const unsigned char *instart = in;
2493
15.8k
    const unsigned char *inend = in + (*inlen & ~1);
2494
15.8k
    unsigned char *outstart = out;
2495
15.8k
    unsigned char *outend = out + *outlen;
2496
15.8k
    unsigned c, d;
2497
15.8k
    int ret = XML_ENC_ERR_SPACE;
2498
2499
1.79M
    while (in < inend) {
2500
1.77M
        c = in[0] | (in[1] << 8);
2501
2502
1.77M
        if (c < 0x80) {
2503
48.6k
            if (out >= outend)
2504
300
                goto done;
2505
48.3k
            out[0] = c;
2506
48.3k
            in += 2;
2507
48.3k
            out += 1;
2508
1.73M
        } else if (c < 0x800) {
2509
15.4k
            if (outend - out < 2)
2510
304
                goto done;
2511
15.1k
            out[0] = (c >> 6)   | 0xC0;
2512
15.1k
            out[1] = (c & 0x3F) | 0x80;
2513
15.1k
            in += 2;
2514
15.1k
            out += 2;
2515
1.71M
        } else if ((c & 0xF800) != 0xD800) {
2516
1.70M
            if (outend - out < 3)
2517
239
                goto done;
2518
1.70M
            out[0] =  (c >> 12)         | 0xE0;
2519
1.70M
            out[1] = ((c >>  6) & 0x3F) | 0x80;
2520
1.70M
            out[2] =  (c        & 0x3F) | 0x80;
2521
1.70M
            in += 2;
2522
1.70M
            out += 3;
2523
1.70M
        } else {
2524
            /* Surrogate pair */
2525
6.08k
            if ((c & 0xFC00) != 0xD800) {
2526
598
                ret = XML_ENC_ERR_INPUT;
2527
598
                goto done;
2528
598
            }
2529
5.48k
      if (inend - in < 4)
2530
360
    break;
2531
5.12k
            d = in[2] | (in[3] << 8);
2532
5.12k
            if ((d & 0xFC00) != 0xDC00) {
2533
645
                ret = XML_ENC_ERR_INPUT;
2534
645
                goto done;
2535
645
            }
2536
4.48k
      if (outend - out < 4)
2537
224
    goto done;
2538
4.25k
            c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000);
2539
4.25k
            out[0] =  (c >> 18)         | 0xF0;
2540
4.25k
            out[1] = ((c >> 12) & 0x3F) | 0x80;
2541
4.25k
            out[2] = ((c >>  6) & 0x3F) | 0x80;
2542
4.25k
            out[3] =  (c        & 0x3F) | 0x80;
2543
4.25k
            in += 4;
2544
4.25k
            out += 4;
2545
4.25k
        }
2546
1.77M
    }
2547
2548
13.5k
    ret = out - outstart;
2549
2550
15.8k
done:
2551
15.8k
    *outlen = out - outstart;
2552
15.8k
    *inlen = in - instart;
2553
15.8k
    return(ret);
2554
13.5k
}
2555
2556
#ifdef LIBXML_OUTPUT_ENABLED
2557
static xmlCharEncError
2558
UTF8ToUTF16LE(void *vctxt ATTRIBUTE_UNUSED,
2559
              unsigned char *out, int *outlen,
2560
              const unsigned char *in, int *inlen,
2561
20.9k
              int flush ATTRIBUTE_UNUSED) {
2562
20.9k
    const unsigned char *instart = in;
2563
20.9k
    const unsigned char *inend;
2564
20.9k
    unsigned char *outstart = out;
2565
20.9k
    unsigned char *outend;
2566
20.9k
    unsigned c, d;
2567
20.9k
    int ret = XML_ENC_ERR_SPACE;
2568
2569
    /* UTF16LE encoding has no BOM */
2570
20.9k
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
2571
0
        return(XML_ENC_ERR_INTERNAL);
2572
20.9k
    if (in == NULL) {
2573
40
  *outlen = 0;
2574
40
  *inlen = 0;
2575
40
  return(0);
2576
40
    }
2577
20.9k
    inend = in + *inlen;
2578
20.9k
    outend = out + (*outlen & ~1);
2579
604M
    while (in < inend) {
2580
604M
        c = in[0];
2581
2582
604M
        if (c < 0x80) {
2583
30.8M
            if (out >= outend)
2584
0
                goto done;
2585
30.8M
            out[0] = c;
2586
30.8M
            out[1] = 0;
2587
30.8M
            in += 1;
2588
30.8M
            out += 2;
2589
573M
        } else {
2590
573M
            int i, len;
2591
573M
            unsigned min;
2592
2593
573M
            if (c < 0xE0) {
2594
573M
                if (c < 0xC2) {
2595
0
                    ret = XML_ENC_ERR_INPUT;
2596
0
                    goto done;
2597
0
                }
2598
573M
                c &= 0x1F;
2599
573M
                len = 2;
2600
573M
                min = 0x80;
2601
573M
            } else if (c < 0xF0) {
2602
222
                c &= 0x0F;
2603
222
                len = 3;
2604
222
                min = 0x800;
2605
222
            } else {
2606
83
                c &= 0x0F;
2607
83
                len = 4;
2608
83
                min = 0x10000;
2609
83
            }
2610
2611
573M
            if (inend - in < len)
2612
7.94k
                break;
2613
2614
1.14G
            for (i = 1; i < len; i++) {
2615
573M
                if ((in[i] & 0xC0) != 0x80) {
2616
0
                    ret = XML_ENC_ERR_INPUT;
2617
0
                    goto done;
2618
0
                }
2619
573M
                c = (c << 6) | (in[i] & 0x3F);
2620
573M
            }
2621
2622
573M
            if ((c < min) ||
2623
573M
                ((c >= 0xD800) && (c <= 0xDFFF)) ||
2624
573M
                (c > 0x10FFFF)) {
2625
0
                ret = XML_ENC_ERR_INPUT;
2626
0
                goto done;
2627
0
            }
2628
2629
573M
            if (c < 0x10000) {
2630
573M
                if (out >= outend)
2631
0
                    goto done;
2632
573M
                out[0] = c & 0xFF;
2633
573M
                out[1] = c >> 8;
2634
573M
                out += 2;
2635
573M
            } else {
2636
83
                if (outend - out < 4)
2637
0
                    goto done;
2638
83
                c -= 0x10000;
2639
83
                d = (c & 0x03FF) | 0xDC00;
2640
83
                c = (c >> 10)    | 0xD800;
2641
83
                out[0] = c & 0xFF;
2642
83
                out[1] = c >> 8;
2643
83
                out[2] = d & 0xFF;
2644
83
                out[3] = d >> 8;
2645
83
                out += 4;
2646
83
            }
2647
2648
573M
            in += len;
2649
573M
        }
2650
604M
    }
2651
2652
20.9k
    ret = out - outstart;
2653
2654
20.9k
done:
2655
20.9k
    *outlen = out - outstart;
2656
20.9k
    *inlen = in - instart;
2657
20.9k
    return(ret);
2658
20.9k
}
2659
2660
static xmlCharEncError
2661
UTF8ToUTF16(void *vctxt,
2662
            unsigned char* outb, int *outlen,
2663
            const unsigned char* in, int *inlen,
2664
20.7k
            int flush) {
2665
20.7k
    if (in == NULL) {
2666
  /*
2667
   * initialization, add the Byte Order Mark for UTF-16LE
2668
   */
2669
59
        if (*outlen >= 2) {
2670
59
      outb[0] = 0xFF;
2671
59
      outb[1] = 0xFE;
2672
59
      *outlen = 2;
2673
59
      *inlen = 0;
2674
59
      return(2);
2675
59
  }
2676
0
  *outlen = 0;
2677
0
  *inlen = 0;
2678
0
  return(0);
2679
59
    }
2680
20.7k
    return (UTF8ToUTF16LE(vctxt, outb, outlen, in, inlen, flush));
2681
20.7k
}
2682
#endif /* LIBXML_OUTPUT_ENABLED */
2683
2684
static xmlCharEncError
2685
UTF16BEToUTF8(void *vctxt ATTRIBUTE_UNUSED,
2686
              unsigned char *out, int *outlen,
2687
              const unsigned char *in, int *inlen,
2688
23.8k
              int flush ATTRIBUTE_UNUSED) {
2689
23.8k
    const unsigned char *instart = in;
2690
23.8k
    const unsigned char *inend = in + (*inlen & ~1);
2691
23.8k
    unsigned char *outstart = out;
2692
23.8k
    unsigned char *outend = out + *outlen;
2693
23.8k
    unsigned c, d;
2694
23.8k
    int ret = XML_ENC_ERR_SPACE;
2695
2696
12.2M
    while (in < inend) {
2697
12.2M
        c = (in[0] << 8) | in[1];
2698
2699
12.2M
        if (c < 0x80) {
2700
56.7k
            if (out >= outend)
2701
247
                goto done;
2702
56.5k
            out[0] = c;
2703
56.5k
            in += 2;
2704
56.5k
            out += 1;
2705
12.1M
        } else if (c < 0x800) {
2706
27.0k
            if (outend - out < 2)
2707
341
                goto done;
2708
26.7k
            out[0] = (c >> 6)   | 0xC0;
2709
26.7k
            out[1] = (c & 0x3F) | 0x80;
2710
26.7k
            in += 2;
2711
26.7k
            out += 2;
2712
12.1M
        } else if ((c & 0xF800) != 0xD800) {
2713
12.1M
            if (outend - out < 3)
2714
10.2k
                goto done;
2715
12.1M
            out[0] =  (c >> 12)         | 0xE0;
2716
12.1M
            out[1] = ((c >>  6) & 0x3F) | 0x80;
2717
12.1M
            out[2] =  (c        & 0x3F) | 0x80;
2718
12.1M
            in += 2;
2719
12.1M
            out += 3;
2720
12.1M
        } else {
2721
            /* Surrogate pair */
2722
2.80k
            if ((c & 0xFC00) != 0xD800) {
2723
616
                ret = XML_ENC_ERR_INPUT;
2724
616
                goto done;
2725
616
            }
2726
2.18k
      if (inend - in < 4)
2727
1.13k
    break;
2728
1.05k
            d = (in[2] << 8) | in[3];
2729
1.05k
            if ((d & 0xFC00) != 0xDC00) {
2730
344
                ret = XML_ENC_ERR_INPUT;
2731
344
                goto done;
2732
344
            }
2733
707
      if (outend - out < 4)
2734
77
    goto done;
2735
630
            c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000);
2736
630
            out[0] =  (c >> 18)         | 0xF0;
2737
630
            out[1] = ((c >> 12) & 0x3F) | 0x80;
2738
630
            out[2] = ((c >>  6) & 0x3F) | 0x80;
2739
630
            out[3] =  (c        & 0x3F) | 0x80;
2740
630
            in += 4;
2741
630
            out += 4;
2742
630
        }
2743
12.2M
    }
2744
2745
11.9k
    ret = out - outstart;
2746
2747
23.8k
done:
2748
23.8k
    *outlen = out - outstart;
2749
23.8k
    *inlen = in - instart;
2750
23.8k
    return(ret);
2751
11.9k
}
2752
2753
#ifdef LIBXML_OUTPUT_ENABLED
2754
static xmlCharEncError
2755
UTF8ToUTF16BE(void *vctxt ATTRIBUTE_UNUSED,
2756
              unsigned char *out, int *outlen,
2757
              const unsigned char *in, int *inlen,
2758
2.22k
              int flush ATTRIBUTE_UNUSED) {
2759
2.22k
    const unsigned char *instart = in;
2760
2.22k
    const unsigned char *inend;
2761
2.22k
    unsigned char *outstart = out;
2762
2.22k
    unsigned char *outend;
2763
2.22k
    unsigned c, d;
2764
2.22k
    int ret = XML_ENC_ERR_SPACE;
2765
2766
    /* UTF-16BE has no BOM */
2767
2.22k
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2768
2.22k
    if (in == NULL) {
2769
61
  *outlen = 0;
2770
61
  *inlen = 0;
2771
61
  return(0);
2772
61
    }
2773
2.16k
    inend = in + *inlen;
2774
2.16k
    outend = out + (*outlen & ~1);
2775
6.33M
    while (in < inend) {
2776
6.33M
        c = in[0];
2777
2778
6.33M
        if (c < 0x80) {
2779
952k
            if (out >= outend)
2780
0
                goto done;
2781
952k
            out[0] = 0;
2782
952k
            out[1] = c;
2783
952k
            in += 1;
2784
952k
            out += 2;
2785
5.38M
        } else {
2786
5.38M
            int i, len;
2787
5.38M
            unsigned min;
2788
2789
5.38M
            if (c < 0xE0) {
2790
5.37M
                if (c < 0xC2) {
2791
0
                    ret = XML_ENC_ERR_INPUT;
2792
0
                    goto done;
2793
0
                }
2794
5.37M
                c &= 0x1F;
2795
5.37M
                len = 2;
2796
5.37M
                min = 0x80;
2797
5.37M
            } else if (c < 0xF0) {
2798
3.68k
                c &= 0x0F;
2799
3.68k
                len = 3;
2800
3.68k
                min = 0x800;
2801
3.68k
            } else {
2802
2.39k
                c &= 0x0F;
2803
2.39k
                len = 4;
2804
2.39k
                min = 0x10000;
2805
2.39k
            }
2806
2807
5.38M
            if (inend - in < len)
2808
5
                break;
2809
2810
10.7M
            for (i = 1; i < len; i++) {
2811
5.39M
                if ((in[i] & 0xC0) != 0x80) {
2812
6
                    ret = XML_ENC_ERR_INPUT;
2813
6
                    goto done;
2814
6
                }
2815
5.39M
                c = (c << 6) | (in[i] & 0x3F);
2816
5.39M
            }
2817
2818
5.38M
            if ((c < min) ||
2819
5.38M
                ((c >= 0xD800) && (c <= 0xDFFF)) ||
2820
5.38M
                (c > 0x10FFFF)) {
2821
0
                ret = XML_ENC_ERR_INPUT;
2822
0
                goto done;
2823
0
            }
2824
2825
5.38M
            if (c < 0x10000) {
2826
5.38M
                if (out >= outend)
2827
0
                    goto done;
2828
5.38M
                out[0] = c >> 8;
2829
5.38M
                out[1] = c & 0xFF;
2830
5.38M
                out += 2;
2831
5.38M
            } else {
2832
2.39k
                if (outend - out < 4)
2833
0
                    goto done;
2834
2.39k
                c -= 0x10000;
2835
2.39k
                d = (c & 0x03FF) | 0xDC00;
2836
2.39k
                c = (c >> 10)    | 0xD800;
2837
2.39k
                out[0] = c >> 8;
2838
2.39k
                out[1] = c & 0xFF;
2839
2.39k
                out[2] = d >> 8;
2840
2.39k
                out[3] = d & 0xFF;
2841
2.39k
                out += 4;
2842
2.39k
            }
2843
2844
5.38M
            in += len;
2845
5.38M
        }
2846
6.33M
    }
2847
2848
2.16k
    ret = out - outstart;
2849
2850
2.16k
done:
2851
2.16k
    *outlen = out - outstart;
2852
2.16k
    *inlen = in - instart;
2853
2.16k
    return(ret);
2854
2.16k
}
2855
#endif /* LIBXML_OUTPUT_ENABLED */
2856
2857
#if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
2858
static xmlCharEncError
2859
UTF8ToHtmlWrapper(void *vctxt ATTRIBUTE_UNUSED,
2860
                  unsigned char *out, int *outlen,
2861
                  const unsigned char *in, int *inlen,
2862
0
                  int flush ATTRIBUTE_UNUSED) {
2863
0
    return(htmlUTF8ToHtml(out, outlen, in, inlen));
2864
0
}
2865
#endif
2866
2867
static xmlCharEncError
2868
Utf8ToEightBit(void *vctxt,
2869
               unsigned char *out, int *outlen,
2870
               const unsigned char *in, int *inlen,
2871
0
               int flush ATTRIBUTE_UNUSED) {
2872
0
    const unsigned char *xlattable = vctxt;
2873
0
    const unsigned char *instart = in;
2874
0
    const unsigned char *inend;
2875
0
    unsigned char *outstart = out;
2876
0
    unsigned char *outend;
2877
0
    int ret = XML_ENC_ERR_SPACE;
2878
2879
0
    if (in == NULL) {
2880
        /*
2881
        * initialization nothing to do
2882
        */
2883
0
        *outlen = 0;
2884
0
        *inlen = 0;
2885
0
        return(XML_ENC_ERR_SUCCESS);
2886
0
    }
2887
2888
0
    inend = in + *inlen;
2889
0
    outend = out + *outlen;
2890
0
    while (in < inend) {
2891
0
        unsigned d = *in;
2892
2893
0
        if  (d < 0x80)  {
2894
0
            if (out >= outend)
2895
0
                goto done;
2896
0
            in += 1;
2897
0
        } else if (d < 0xE0) {
2898
0
            unsigned c;
2899
2900
0
            if (inend - in < 2)
2901
0
                break;
2902
0
            c = in[1] & 0x3F;
2903
0
            d = d & 0x1F;
2904
0
            d = xlattable [48 + c + xlattable [d] * 64];
2905
0
            if (d == 0) {
2906
                /* not in character set */
2907
0
                ret = XML_ENC_ERR_INPUT;
2908
0
                goto done;
2909
0
            }
2910
0
            if (out >= outend)
2911
0
                goto done;
2912
0
            in += 2;
2913
0
        } else if (d < 0xF0) {
2914
0
            unsigned c1;
2915
0
            unsigned c2;
2916
2917
0
            if (inend - in < 3)
2918
0
                break;
2919
0
            c1 = in[1] & 0x3F;
2920
0
            c2 = in[2] & 0x3F;
2921
0
      d = d & 0x0F;
2922
0
      d = xlattable [48 + c2 + xlattable [48 + c1 +
2923
0
      xlattable [32 + d] * 64] * 64];
2924
0
            if (d == 0) {
2925
                /* not in character set */
2926
0
                ret = XML_ENC_ERR_INPUT;
2927
0
                goto done;
2928
0
            }
2929
0
            if (out >= outend)
2930
0
                goto done;
2931
0
            in += 3;
2932
0
        } else {
2933
            /* cannot transcode >= U+010000 */
2934
0
                ret = XML_ENC_ERR_INPUT;
2935
0
                goto done;
2936
0
        }
2937
2938
0
        *out++ = d;
2939
0
    }
2940
2941
0
    ret = out - outstart;
2942
2943
0
done:
2944
0
    *outlen = out - outstart;
2945
0
    *inlen = in - instart;
2946
0
    return(ret);
2947
0
}
2948
2949
static xmlCharEncError
2950
EightBitToUtf8(void *vctxt,
2951
               unsigned char* out, int *outlen,
2952
               const unsigned char* in, int *inlen,
2953
1.47k
               int flush ATTRIBUTE_UNUSED) {
2954
1.47k
    unsigned short const *unicodetable = vctxt;
2955
1.47k
    const unsigned char* instart = in;
2956
1.47k
    const unsigned char* inend;
2957
1.47k
    unsigned char* outstart = out;
2958
1.47k
    unsigned char* outend;
2959
1.47k
    int ret = XML_ENC_ERR_SPACE;
2960
2961
1.47k
    outend = out + *outlen;
2962
1.47k
    inend = in + *inlen;
2963
2964
2.46M
    while (in < inend) {
2965
2.46M
        unsigned c = *in;
2966
2967
2.46M
        if (c < 0x80) {
2968
1.89M
            if (out >= outend)
2969
511
                goto done;
2970
1.89M
            *out++ = c;
2971
1.89M
        } else {
2972
575k
            c = unicodetable[c - 0x80];
2973
575k
            if (c == 0) {
2974
                /* undefined code point */
2975
0
                ret = XML_ENC_ERR_INPUT;
2976
0
                goto done;
2977
0
            }
2978
575k
            if (c < 0x800) {
2979
411k
                if (outend - out < 2)
2980
304
                    goto done;
2981
411k
                *out++ = ((c >>  6) & 0x1F) | 0xC0;
2982
411k
                *out++ = (c & 0x3F) | 0x80;
2983
411k
            } else {
2984
163k
                if (outend - out < 3)
2985
278
                    goto done;
2986
163k
                *out++ = ((c >>  12) & 0x0F) | 0xE0;
2987
163k
                *out++ = ((c >>  6) & 0x3F) | 0x80;
2988
163k
                *out++ = (c & 0x3F) | 0x80;
2989
163k
            }
2990
575k
        }
2991
2992
2.46M
        in += 1;
2993
2.46M
    }
2994
2995
386
    ret = out - outstart;
2996
2997
1.47k
done:
2998
1.47k
    *outlen = out - outstart;
2999
1.47k
    *inlen = in - instart;
3000
1.47k
    return(ret);
3001
386
}
3002