Coverage Report

Created: 2025-08-26 06:42

/src/libxml2/xmlstring.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * string.c : an XML string utilities module
3
 *
4
 * This module provides various utility functions for manipulating
5
 * the xmlChar* type. All functions named xmlStr* have been moved here
6
 * from the parser.c file (their original home).
7
 *
8
 * See Copyright for the status of this software.
9
 *
10
 * UTF8 string routines from: William Brack
11
 *
12
 * Author: Daniel Veillard
13
 */
14
15
#define IN_LIBXML
16
#include "libxml.h"
17
18
#include <stdlib.h>
19
#include <string.h>
20
#include <limits.h>
21
#include <libxml/xmlmemory.h>
22
#include <libxml/parserInternals.h>
23
#include <libxml/xmlstring.h>
24
25
#include "private/parser.h"
26
#include "private/string.h"
27
28
#ifndef va_copy
29
  #ifdef __va_copy
30
    #define va_copy(dest, src) __va_copy(dest, src)
31
  #else
32
    #define va_copy(dest, src) memcpy(&(dest), &(src), sizeof(va_list))
33
  #endif
34
#endif
35
36
/************************************************************************
37
 *                                                                      *
38
 *                Commodity functions to handle xmlChars                *
39
 *                                                                      *
40
 ************************************************************************/
41
42
/**
43
 * a strndup for array of xmlChar's
44
 *
45
 * @param cur  the input xmlChar *
46
 * @param len  the len of `cur`
47
 * @returns a new xmlChar * or NULL
48
 */
49
xmlChar *
50
115M
xmlStrndup(const xmlChar *cur, int len) {
51
115M
    xmlChar *ret;
52
53
115M
    if ((cur == NULL) || (len < 0)) return(NULL);
54
115M
    ret = xmlMalloc((size_t) len + 1);
55
115M
    if (ret == NULL) {
56
21.8k
        return(NULL);
57
21.8k
    }
58
115M
    memcpy(ret, cur, len);
59
115M
    ret[len] = 0;
60
115M
    return(ret);
61
115M
}
62
63
/**
64
 * a strdup for array of xmlChar's. Since they are supposed to be
65
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
66
 * a termination mark of '0'.
67
 *
68
 * @param cur  the input xmlChar *
69
 * @returns a new xmlChar * or NULL
70
 */
71
xmlChar *
72
73.5M
xmlStrdup(const xmlChar *cur) {
73
73.5M
    const xmlChar *p = cur;
74
75
73.5M
    if (cur == NULL) return(NULL);
76
22.4G
    while (*p != 0) p++; /* non input consuming */
77
73.5M
    return(xmlStrndup(cur, p - cur));
78
73.5M
}
79
80
/**
81
 * a strndup for char's to xmlChar's
82
 *
83
 * @param cur  the input char *
84
 * @param len  the len of `cur`
85
 * @returns a new xmlChar * or NULL
86
 */
87
88
xmlChar *
89
2.30M
xmlCharStrndup(const char *cur, int len) {
90
2.30M
    int i;
91
2.30M
    xmlChar *ret;
92
93
2.30M
    if ((cur == NULL) || (len < 0)) return(NULL);
94
2.30M
    ret = xmlMalloc((size_t) len + 1);
95
2.30M
    if (ret == NULL) {
96
1.10k
        return(NULL);
97
1.10k
    }
98
136M
    for (i = 0;i < len;i++) {
99
        /* Explicit sign change */
100
134M
        ret[i] = (xmlChar) cur[i];
101
134M
        if (ret[i] == 0) return(ret);
102
134M
    }
103
2.30M
    ret[len] = 0;
104
2.30M
    return(ret);
105
2.30M
}
106
107
/**
108
 * a strdup for char's to xmlChar's
109
 *
110
 * @param cur  the input char *
111
 * @returns a new xmlChar * or NULL
112
 */
113
114
xmlChar *
115
2.30M
xmlCharStrdup(const char *cur) {
116
2.30M
    const char *p = cur;
117
118
2.30M
    if (cur == NULL) return(NULL);
119
136M
    while (*p != '\0') p++; /* non input consuming */
120
2.30M
    return(xmlCharStrndup(cur, p - cur));
121
2.30M
}
122
123
/**
124
 * a strcmp for xmlChar's
125
 *
126
 * @param str1  the first xmlChar *
127
 * @param str2  the second xmlChar *
128
 * @returns the integer result of the comparison
129
 */
130
131
int
132
29.6k
xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
133
29.6k
    if (str1 == str2) return(0);
134
14.9k
    if (str1 == NULL) return(-1);
135
13.5k
    if (str2 == NULL) return(1);
136
12.2k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
137
12.2k
    return(strcmp((const char *)str1, (const char *)str2));
138
#else
139
    do {
140
        int tmp = *str1++ - *str2;
141
        if (tmp != 0) return(tmp);
142
    } while (*str2++ != 0);
143
    return 0;
144
#endif
145
13.5k
}
146
147
/**
148
 * Check if both strings are equal of have same content.
149
 * Should be a bit more readable and faster than #xmlStrcmp
150
 *
151
 * @param str1  the first xmlChar *
152
 * @param str2  the second xmlChar *
153
 * @returns 1 if they are equal, 0 if they are different
154
 */
155
156
int
157
232M
xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
158
232M
    if (str1 == str2) return(1);
159
230M
    if (str1 == NULL) return(0);
160
223M
    if (str2 == NULL) return(0);
161
213M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
162
213M
    return(strcmp((const char *)str1, (const char *)str2) == 0);
163
#else
164
    do {
165
        if (*str1++ != *str2) return(0);
166
    } while (*str2++);
167
    return(1);
168
#endif
169
223M
}
170
171
/**
172
 * Check if a QName is Equal to a given string
173
 *
174
 * @param pref  the prefix of the QName
175
 * @param name  the localname of the QName
176
 * @param str  the second xmlChar *
177
 * @returns 1 if they are equal, 0 if they are different
178
 */
179
180
int
181
524k
xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
182
524k
    if (pref == NULL) return(xmlStrEqual(name, str));
183
208k
    if (name == NULL) return(0);
184
208k
    if (str == NULL) return(0);
185
186
1.43M
    do {
187
1.43M
        if (*pref++ != *str) return(0);
188
1.43M
    } while ((*str++) && (*pref));
189
208k
    if (*str++ != ':') return(0);
190
22.4M
    do {
191
22.4M
        if (*name++ != *str) return(0);
192
22.4M
    } while (*str++);
193
208k
    return(1);
194
208k
}
195
196
/**
197
 * a strncmp for xmlChar's
198
 *
199
 * @param str1  the first xmlChar *
200
 * @param str2  the second xmlChar *
201
 * @param len  the max comparison length
202
 * @returns the integer result of the comparison
203
 */
204
205
int
206
55.2M
xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
207
55.2M
    if (len <= 0) return(0);
208
55.2M
    if (str1 == str2) return(0);
209
55.2M
    if (str1 == NULL) return(-1);
210
55.2M
    if (str2 == NULL) return(1);
211
55.2M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
212
55.2M
    return(strncmp((const char *)str1, (const char *)str2, len));
213
#else
214
    do {
215
        int tmp = *str1++ - *str2;
216
        if (tmp != 0 || --len == 0) return(tmp);
217
    } while (*str2++ != 0);
218
    return 0;
219
#endif
220
55.2M
}
221
222
/*
 * Case folding table used by the case-insensitive comparison helpers:
 * 'A'..'Z' (0x41..0x5A) map to 'a'..'z' (0x61..0x7A), every other
 * byte value maps to itself.
 */
static const xmlChar casemap[256] = {
    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
    0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
    0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    /* 0x5B ('[') is not a letter and must map to itself, not to '{' */
    0x78,0x79,0x7A,0x5B,0x5C,0x5D,0x5E,0x5F,
    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
    0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
    0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
    0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
    0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
    0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
    0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
    0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
    0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
    0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
    0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
    0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
    0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
    0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
    0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
};
256
257
/**
258
 * a strcasecmp for xmlChar's
259
 *
260
 * @param str1  the first xmlChar *
261
 * @param str2  the second xmlChar *
262
 * @returns the integer result of the comparison
263
 */
264
265
int
266
8.63M
xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
267
8.63M
    register int tmp;
268
269
8.63M
    if (str1 == str2) return(0);
270
8.63M
    if (str1 == NULL) return(-1);
271
8.63M
    if (str2 == NULL) return(1);
272
13.3M
    do {
273
13.3M
        tmp = casemap[*str1++] - casemap[*str2];
274
13.3M
        if (tmp != 0) return(tmp);
275
13.3M
    } while (*str2++ != 0);
276
517k
    return 0;
277
8.63M
}
278
279
/**
280
 * a strncasecmp for xmlChar's
281
 *
282
 * @param str1  the first xmlChar *
283
 * @param str2  the second xmlChar *
284
 * @param len  the max comparison length
285
 * @returns the integer result of the comparison
286
 */
287
288
int
289
89.7k
xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
290
89.7k
    register int tmp;
291
292
89.7k
    if (len <= 0) return(0);
293
89.7k
    if (str1 == str2) return(0);
294
89.7k
    if (str1 == NULL) return(-1);
295
89.7k
    if (str2 == NULL) return(1);
296
139k
    do {
297
139k
        tmp = casemap[*str1++] - casemap[*str2];
298
139k
        if (tmp != 0 || --len == 0) return(tmp);
299
139k
    } while (*str2++ != 0);
300
0
    return 0;
301
89.7k
}
302
303
/**
304
 * a strchr for xmlChar's
305
 *
306
 * @param str  the xmlChar * array
307
 * @param val  the xmlChar to search
308
 * @returns the xmlChar * for the first occurrence or NULL.
309
 */
310
311
const xmlChar *
312
479M
xmlStrchr(const xmlChar *str, xmlChar val) {
313
479M
    if (str == NULL) return(NULL);
314
3.06G
    while (*str != 0) { /* non input consuming */
315
2.60G
        if (*str == val) return((xmlChar *) str);
316
2.58G
        str++;
317
2.58G
    }
318
464M
    return(NULL);
319
479M
}
320
321
/**
322
 * a strstr for xmlChar's
323
 *
324
 * @param str  the xmlChar * array (haystack)
325
 * @param val  the xmlChar to search (needle)
326
 * @returns the xmlChar * for the first occurrence or NULL.
327
 */
328
329
const xmlChar *
330
2.79M
xmlStrstr(const xmlChar *str, const xmlChar *val) {
331
2.79M
    int n;
332
333
2.79M
    if (str == NULL) return(NULL);
334
2.79M
    if (val == NULL) return(NULL);
335
2.79M
    n = xmlStrlen(val);
336
337
2.79M
    if (n == 0) return(str);
338
1.19G
    while (*str != 0) { /* non input consuming */
339
1.19G
        if (*str == *val) {
340
695k
            if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
341
695k
        }
342
1.19G
        str++;
343
1.19G
    }
344
2.62M
    return(NULL);
345
2.79M
}
346
347
/**
348
 * a case-ignoring strstr for xmlChar's
349
 *
350
 * @param str  the xmlChar * array (haystack)
351
 * @param val  the xmlChar to search (needle)
352
 * @returns the xmlChar * for the first occurrence or NULL.
353
 */
354
355
const xmlChar *
356
0
xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
357
0
    int n;
358
359
0
    if (str == NULL) return(NULL);
360
0
    if (val == NULL) return(NULL);
361
0
    n = xmlStrlen(val);
362
363
0
    if (n == 0) return(str);
364
0
    while (*str != 0) { /* non input consuming */
365
0
        if (casemap[*str] == casemap[*val])
366
0
            if (!xmlStrncasecmp(str, val, n)) return(str);
367
0
        str++;
368
0
    }
369
0
    return(NULL);
370
0
}
371
372
/**
373
 * Extract a substring of a given string
374
 *
375
 * @param str  the xmlChar * array (haystack)
376
 * @param start  the index of the first char (zero based)
377
 * @param len  the length of the substring
378
 * @returns the xmlChar * for the first occurrence or NULL.
379
 */
380
381
xmlChar *
382
0
xmlStrsub(const xmlChar *str, int start, int len) {
383
0
    int i;
384
385
0
    if (str == NULL) return(NULL);
386
0
    if (start < 0) return(NULL);
387
0
    if (len < 0) return(NULL);
388
389
0
    for (i = 0;i < start;i++) {
390
0
        if (*str == 0) return(NULL);
391
0
        str++;
392
0
    }
393
0
    if (*str == 0) return(NULL);
394
0
    return(xmlStrndup(str, len));
395
0
}
396
397
/**
398
 * length of a xmlChar's string
399
 *
400
 * @param str  the xmlChar * array
401
 * @returns the number of xmlChar contained in the ARRAY.
402
 */
403
404
int
405
15.0M
xmlStrlen(const xmlChar *str) {
406
15.0M
    size_t len = str ? strlen((const char *)str) : 0;
407
15.0M
    return(len > INT_MAX ? 0 : len);
408
15.0M
}
409
410
/**
411
 * a strncat for array of xmlChar's, it will extend `cur` with the len
412
 * first bytes of `add`. Note that if `len` < 0 then this is an API error
413
 * and NULL will be returned.
414
 *
415
 * @param cur  the original xmlChar * array
416
 * @param add  the xmlChar * array added
417
 * @param len  the length of `add`
418
 * @returns a new xmlChar *, the original `cur` is reallocated and should
419
 * not be freed.
420
 */
421
422
xmlChar *
423
5.64M
xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
424
5.64M
    int size;
425
5.64M
    xmlChar *ret;
426
427
5.64M
    if ((add == NULL) || (len == 0))
428
14.7k
        return(cur);
429
5.62M
    if (len < 0)
430
0
  return(NULL);
431
5.62M
    if (cur == NULL)
432
4.75k
        return(xmlStrndup(add, len));
433
434
5.62M
    size = xmlStrlen(cur);
435
5.62M
    if ((size < 0) || (size > INT_MAX - len))
436
0
        return(NULL);
437
5.62M
    ret = (xmlChar *) xmlRealloc(cur, (size_t) size + len + 1);
438
5.62M
    if (ret == NULL) {
439
1.71k
        xmlFree(cur);
440
1.71k
        return(NULL);
441
1.71k
    }
442
5.62M
    memcpy(&ret[size], add, len);
443
5.62M
    ret[size + len] = 0;
444
5.62M
    return(ret);
445
5.62M
}
446
447
/**
448
 * same as #xmlStrncat, but creates a new string.  The original
449
 * two strings are not freed. If `len` is < 0 then the length
450
 * will be calculated automatically.
451
 *
452
 * @param str1  first xmlChar string
453
 * @param str2  second xmlChar string
454
 * @param len  the len of `str2` or < 0
455
 * @returns a new xmlChar * or NULL
456
 */
457
xmlChar *
458
31.9k
xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
459
31.9k
    int size;
460
31.9k
    xmlChar *ret;
461
462
31.9k
    if (len < 0) {
463
27.0k
        len = xmlStrlen(str2);
464
27.0k
        if (len < 0)
465
0
            return(NULL);
466
27.0k
    }
467
31.9k
    if (str1 == NULL)
468
675
        return(xmlStrndup(str2, len));
469
31.3k
    if ((str2 == NULL) || (len == 0))
470
1.54k
        return(xmlStrdup(str1));
471
472
29.7k
    size = xmlStrlen(str1);
473
29.7k
    if ((size < 0) || (size > INT_MAX - len))
474
0
        return(NULL);
475
29.7k
    ret = (xmlChar *) xmlMalloc((size_t) size + len + 1);
476
29.7k
    if (ret == NULL)
477
4
        return(NULL);
478
29.7k
    memcpy(ret, str1, size);
479
29.7k
    memcpy(&ret[size], str2, len);
480
29.7k
    ret[size + len] = 0;
481
29.7k
    return(ret);
482
29.7k
}
483
484
/**
485
 * a strcat for array of xmlChar's. Since they are supposed to be
486
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
487
 * a termination mark of '0'.
488
 *
489
 * @param cur  the original xmlChar * array
490
 * @param add  the xmlChar * array added
491
 * @returns a new xmlChar * containing the concatenated string. The original
492
 * `cur` is reallocated and should not be freed.
493
 */
494
xmlChar *
495
5.65M
xmlStrcat(xmlChar *cur, const xmlChar *add) {
496
5.65M
    const xmlChar *p = add;
497
498
5.65M
    if (add == NULL) return(cur);
499
5.65M
    if (cur == NULL)
500
16.0k
        return(xmlStrdup(add));
501
502
188M
    while (*p != 0) p++; /* non input consuming */
503
5.63M
    return(xmlStrncat(cur, add, p - add));
504
5.65M
}
505
506
/**
507
 * Formats `msg` and places result into `buf`.
508
 *
509
 * @param buf  the result buffer.
510
 * @param len  the result buffer length.
511
 * @param msg  the message with printf formatting.
512
 * @param ...   extra parameters for the message.
513
 * @returns the number of characters written to `buf` or -1 if an error occurs.
514
 */
515
int
516
0
xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
517
0
    va_list args;
518
0
    int ret;
519
520
0
    if((buf == NULL) || (msg == NULL)) {
521
0
        return(-1);
522
0
    }
523
524
0
    va_start(args, msg);
525
0
    ret = vsnprintf((char *) buf, len, (const char *) msg, args);
526
0
    va_end(args);
527
0
    buf[len - 1] = 0; /* be safe ! */
528
529
0
    return(ret);
530
0
}
531
532
/**
533
 * Formats `msg` and places result into `buf`.
534
 *
535
 * @param buf  the result buffer.
536
 * @param len  the result buffer length.
537
 * @param msg  the message with printf formatting.
538
 * @param ap  extra parameters for the message.
539
 * @returns the number of characters written to `buf` or -1 if an error occurs.
540
 */
541
int
542
0
xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
543
0
    int ret;
544
545
0
    if((buf == NULL) || (msg == NULL)) {
546
0
        return(-1);
547
0
    }
548
549
0
    ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
550
0
    buf[len - 1] = 0; /* be safe ! */
551
552
0
    return(ret);
553
0
}
554
555
/**
556
 * Creates a newly allocated string according to format.
557
 *
558
 * @param out  pointer to the resulting string
559
 * @param maxSize  maximum size of the output buffer
560
 * @param msg  printf format string
561
 * @param ap  arguments for format string
562
 * @returns 0 on success, 1 if the result was truncated or on other
563
 * errors, -1 if a memory allocation failed.
564
 */
565
int
566
12.2M
xmlStrVASPrintf(xmlChar **out, int maxSize, const char *msg, va_list ap) {
567
12.2M
    char empty[1];
568
12.2M
    va_list copy;
569
12.2M
    xmlChar *buf;
570
12.2M
    int res, size;
571
12.2M
    int truncated = 0;
572
573
12.2M
    if (out == NULL)
574
0
        return(1);
575
12.2M
    *out = NULL;
576
12.2M
    if (msg == NULL)
577
0
        return(1);
578
12.2M
    if (maxSize < 32)
579
0
        maxSize = 32;
580
581
12.2M
    va_copy(copy, ap);
582
12.2M
    res = vsnprintf(empty, 1, msg, copy);
583
12.2M
    va_end(copy);
584
585
12.2M
    if (res > 0) {
586
        /* snprintf seems to work according to C99. */
587
588
12.2M
        if (res < maxSize) {
589
12.2M
            size = res + 1;
590
12.2M
        } else {
591
3.60k
            size = maxSize;
592
3.60k
            truncated = 1;
593
3.60k
        }
594
12.2M
        buf = xmlMalloc(size);
595
12.2M
        if (buf == NULL)
596
10.3k
            return(-1);
597
12.2M
        if (vsnprintf((char *) buf, size, msg, ap) < 0) {
598
0
            xmlFree(buf);
599
0
            return(1);
600
0
        }
601
12.2M
    } else {
602
        /*
603
         * Unfortunately, older snprintf implementations don't follow the
604
         * C99 spec. If the output exceeds the size of the buffer, they can
605
         * return -1, 0 or the number of characters written instead of the
606
         * needed size. Older MSCVRT also won't write a terminating null
607
         * byte if the buffer is too small.
608
         *
609
         * If the value returned is non-negative and strictly less than
610
         * the buffer size (without terminating null), the result should
611
         * have been written completely, so we double the buffer size
612
         * until this condition is true. This assumes that snprintf will
613
         * eventually return a non-negative value. Otherwise, we will
614
         * allocate more and more memory until we run out.
615
         *
616
         * Note that this code path is also executed on conforming
617
         * platforms if the output is the empty string.
618
         */
619
620
0
        buf = NULL;
621
0
        size = 32;
622
0
        while (1) {
623
0
            buf = xmlMalloc(size);
624
0
            if (buf == NULL)
625
0
                return(-1);
626
627
0
            va_copy(copy, ap);
628
0
            res = vsnprintf((char *) buf, size, msg, copy);
629
0
            va_end(copy);
630
0
            if ((res >= 0) && (res < size - 1))
631
0
                break;
632
633
0
            if (size >= maxSize) {
634
0
                truncated = 1;
635
0
                break;
636
0
            }
637
638
0
            xmlFree(buf);
639
640
0
            if (size > maxSize / 2)
641
0
                size = maxSize;
642
0
            else
643
0
                size *= 2;
644
0
        }
645
0
    }
646
647
    /*
648
     * If the output was truncated, make sure that the buffer doesn't
649
     * end with a truncated UTF-8 sequence.
650
     */
651
12.2M
    if (truncated != 0) {
652
3.60k
        int i = size - 1;
653
654
37.1k
        while (i > 0) {
655
            /* Break after ASCII */
656
37.1k
            if (buf[i-1] < 0x80)
657
1.21k
                break;
658
35.9k
            i -= 1;
659
            /* Break before non-ASCII */
660
35.9k
            if (buf[i] >= 0xc0)
661
2.38k
                break;
662
35.9k
        }
663
664
3.60k
        buf[i] = 0;
665
3.60k
    }
666
667
12.2M
    *out = (xmlChar *) buf;
668
12.2M
    return(truncated);
669
12.2M
}
670
671
/**
672
 * See xmlStrVASPrintf.
673
 *
674
 * @param out  pointer to the resulting string
675
 * @param maxSize  maximum size of the output buffer
676
 * @param msg  printf format string
677
 * @param ...  arguments for format string
678
 * @returns 0 on success, 1 if the result was truncated or on other
679
 * errors, -1 if a memory allocation failed.
680
 */
681
int
682
0
xmlStrASPrintf(xmlChar **out, int maxSize, const char *msg, ...) {
683
0
    va_list ap;
684
0
    int ret;
685
686
0
    va_start(ap, msg);
687
0
    ret = xmlStrVASPrintf(out, maxSize, msg, ap);
688
0
    va_end(ap);
689
690
0
    return(ret);
691
0
}
692
693
/************************************************************************
694
 *                                                                      *
695
 *              Generic UTF8 handling routines                          *
696
 *                                                                      *
697
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
698
 *                                                                      *
699
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
700
 * 0000 0000-0000 007F   0xxxxxxx                                       *
701
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
702
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
703
 *                                                                      *
704
 * I hope we won't use values > 0xFFFF anytime soon !                   *
705
 *                                                                      *
706
 ************************************************************************/
707
708
709
/**
710
 * calculates the internal size of a UTF8 character
711
 *
712
 * @param utf  pointer to the UTF8 character
713
 * @returns the numbers of bytes in the character, -1 on format error
714
 */
715
int
716
0
xmlUTF8Size(const xmlChar *utf) {
717
0
    xmlChar mask;
718
0
    int len;
719
720
0
    if (utf == NULL)
721
0
        return -1;
722
0
    if (*utf < 0x80)
723
0
        return 1;
724
    /* check valid UTF8 character */
725
0
    if (!(*utf & 0x40))
726
0
        return -1;
727
    /* determine number of bytes in char */
728
0
    len = 2;
729
0
    for (mask=0x20; mask != 0; mask>>=1) {
730
0
        if (!(*utf & mask))
731
0
            return len;
732
0
        len++;
733
0
    }
734
0
    return -1;
735
0
}
736
737
/**
738
 * compares the two UCS4 values
739
 *
740
 * @param utf1  pointer to first UTF8 char
741
 * @param utf2  pointer to second UTF8 char
742
 * @returns result of the compare as with #xmlStrncmp
743
 */
744
int
745
0
xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
746
747
0
    if (utf1 == NULL ) {
748
0
        if (utf2 == NULL)
749
0
            return 0;
750
0
        return -1;
751
0
    }
752
0
    return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
753
0
}
754
755
/**
756
 * compute the length of an UTF8 string, it doesn't do a full UTF8
757
 * checking of the content of the string.
758
 *
759
 * @param utf  a sequence of UTF-8 encoded bytes
760
 * @returns the number of characters in the string or -1 in case of error
761
 */
762
int
763
18.5k
xmlUTF8Strlen(const xmlChar *utf) {
764
18.5k
    size_t ret = 0;
765
766
18.5k
    if (utf == NULL)
767
16
        return(-1);
768
769
37.6M
    while (*utf != 0) {
770
37.6M
        if (utf[0] & 0x80) {
771
349k
            if ((utf[1] & 0xc0) != 0x80)
772
0
                return(-1);
773
349k
            if ((utf[0] & 0xe0) == 0xe0) {
774
343k
                if ((utf[2] & 0xc0) != 0x80)
775
0
                    return(-1);
776
343k
                if ((utf[0] & 0xf0) == 0xf0) {
777
7.59k
                    if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
778
772
                        return(-1);
779
6.82k
                    utf += 4;
780
336k
                } else {
781
336k
                    utf += 3;
782
336k
                }
783
343k
            } else {
784
5.21k
                utf += 2;
785
5.21k
            }
786
37.3M
        } else {
787
37.3M
            utf++;
788
37.3M
        }
789
37.6M
        ret++;
790
37.6M
    }
791
17.7k
    return(ret > INT_MAX ? 0 : ret);
792
18.5k
}
793
794
/**
 * Read the first UTF8 character from `utf`
 *
 * Overlong encodings, surrogates (0xD800-0xDFFF), values above
 * 0x10FFFF and invalid lead bytes (0x80-0xC1 and 0xF8-0xFF) are
 * rejected.
 *
 * @param utf  a sequence of UTF-8 encoded bytes
 * @param len  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 * @returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;

    c = utf[0];
    if (c < 0x80) {
        if (*len < 1)
            goto error;
        /* 1-byte code */
        *len = 1;
    } else {
        if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
            goto error;
        if (c < 0xe0) {
            /* Rejects stray continuation bytes and overlong 2-byte codes */
            if (c < 0xc2)
                goto error;
            /* 2-byte code */
            *len = 2;
            c = (c & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        } else {
            if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
                goto error;
            if (c < 0xf0) {
                /* 3-byte code */
                *len = 3;
                c = (c & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
                /* Overlong encodings and surrogates */
                if ((c < 0x800) || ((c >= 0xd800) && (c < 0xe000)))
                    goto error;
            } else {
                /*
                 * Only 0xF0-0xF7 can start a 4-byte code. Without
                 * this check, 0xF8-0xFF would be masked down below
                 * and decoded as if they were 0xF0-0xF7.
                 */
                if ((c & 0xf8) != 0xf0)
                    goto error;
                if ((*len < 4) || ((utf[3] & 0xc0) != 0x80))
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (c & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
                /* Overlong encodings and values beyond Unicode */
                if ((c < 0x10000) || (c >= 0x110000))
                    goto error;
            }
        }
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
861
862
/**
 * Checks `utf` for being valid UTF-8. `utf` is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * @param utf  Pointer to putative UTF-8 encoded string.
 * @returns value: true if `utf` is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    unsigned char lead;
    int width;
    int k;

    if (utf == NULL)
        return(0);
    /*
     * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    for (lead = utf[0]; lead != 0; lead = utf[0]) { /* 0-terminated */
        if ((lead & 0x80) == 0x00)          /* 1-byte code, starts with 0 */
            width = 1;
        else if ((lead & 0xe0) == 0xc0)     /* 2-byte code, starts with 110 */
            width = 2;
        else if ((lead & 0xf0) == 0xe0)     /* 3-byte code, starts with 1110 */
            width = 3;
        else if ((lead & 0xf8) == 0xf0)     /* 4-byte code, starts with 11110 */
            width = 4;
        else                                /* unknown encoding */
            return(0);

        /*
         * Check the trailing bytes in order. Stopping at the first
         * bad one makes sure we never look past the terminator.
         */
        for (k = 1; k < width; k++) {
            if ((utf[k] & 0xc0) != 0x80)
                return(0);
        }
        utf += width;
    }
    return(1);
}
914
915
/**
916
 * storage size of an UTF8 string
917
 * the behaviour is not guaranteed if the input string is not UTF-8
918
 *
919
 * @param utf  a sequence of UTF-8 encoded bytes
920
 * @param len  the number of characters in the array
921
 * @returns the storage size of
922
 * the first 'len' characters of ARRAY
923
 */
924
925
int
926
9.84M
xmlUTF8Strsize(const xmlChar *utf, int len) {
927
9.84M
    const xmlChar *ptr=utf;
928
9.84M
    int ch;
929
9.84M
    size_t ret;
930
931
9.84M
    if (utf == NULL)
932
0
        return(0);
933
934
9.84M
    if (len <= 0)
935
0
        return(0);
936
937
45.0M
    while ( len-- > 0) {
938
35.2M
        if ( !*ptr )
939
2.69k
            break;
940
35.1M
        ch = *ptr++;
941
35.1M
        if ((ch & 0x80))
942
1.79M
            while ((ch<<=1) & 0x80 ) {
943
1.19M
    if (*ptr == 0) break;
944
1.19M
                ptr++;
945
1.19M
      }
946
35.1M
    }
947
9.84M
    ret = ptr - utf;
948
9.84M
    return (ret > INT_MAX ? 0 : ret);
949
9.84M
}
950
951
952
/**
953
 * a strndup for array of UTF8's
954
 *
955
 * @param utf  the input UTF8 *
956
 * @param len  the len of `utf` (in chars)
957
 * @returns a new UTF8 * or NULL
958
 */
959
xmlChar *
960
3.17k
xmlUTF8Strndup(const xmlChar *utf, int len) {
961
3.17k
    xmlChar *ret;
962
3.17k
    int i;
963
964
3.17k
    if ((utf == NULL) || (len < 0)) return(NULL);
965
3.17k
    i = xmlUTF8Strsize(utf, len);
966
3.17k
    ret = xmlMalloc((size_t) i + 1);
967
3.17k
    if (ret == NULL) {
968
3
        return(NULL);
969
3
    }
970
3.17k
    memcpy(ret, utf, i);
971
3.17k
    ret[i] = 0;
972
3.17k
    return(ret);
973
3.17k
}
974
975
/**
 * Locate a character inside a UTF-8 string by its character index,
 * the equivalent of indexing into an array of characters.
 *
 * @param utf  the input UTF-8 string
 * @param pos  the position of the desired character (in characters)
 * @returns a pointer to the first byte of that character, or NULL if
 * `utf` is NULL, `pos` is negative, the string ends before `pos`
 * characters were skipped, or a malformed sequence is met on the way.
 */
const xmlChar *
xmlUTF8Strpos(const xmlChar *utf, int pos) {
    int remaining;

    if ((utf == NULL) || (pos < 0))
        return(NULL);

    for (remaining = pos; remaining != 0; remaining--) {
        int lead = *utf++;

        if (lead == 0)
            return(NULL);
        if ((lead & 0x80) == 0)
            continue;

        /* a multi-byte character must start with a 11xxxxxx byte */
        if ((lead & 0xc0) != 0xc0)
            return(NULL);

        /* step over the trailing bytes, each must look like 10xxxxxx */
        for (lead <<= 1; lead & 0x80; lead <<= 1) {
            if ((*utf & 0xc0) != 0x80)
                return(NULL);
            utf++;
        }
    }

    return((xmlChar *) utf);
}
1006
1007
/**
1008
 * a function to provide the relative location of a UTF8 char
1009
 *
1010
 * @param utf  the input UTF8 *
1011
 * @param utfchar  the UTF8 character to be found
1012
 * @returns the relative character position of the desired char
1013
 * or -1 if not found
1014
 */
1015
int
1016
4.96M
xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
1017
4.96M
    size_t i;
1018
4.96M
    int size;
1019
4.96M
    int ch;
1020
1021
4.96M
    if (utf==NULL || utfchar==NULL) return -1;
1022
4.96M
    size = xmlUTF8Strsize(utfchar, 1);
1023
57.5M
        for(i=0; (ch=*utf) != 0; i++) {
1024
52.8M
            if (xmlStrncmp(utf, utfchar, size)==0)
1025
285k
                return(i > INT_MAX ? 0 : i);
1026
52.5M
            utf++;
1027
52.5M
            if ( ch & 0x80 ) {
1028
                /* if not simple ascii, verify proper format */
1029
6.23M
                if ( (ch & 0xc0) != 0xc0 )
1030
0
                    return(-1);
1031
                /* then skip over remaining bytes for this char */
1032
18.6M
                while ( (ch <<= 1) & 0x80 )
1033
12.4M
                    if ( (*utf++ & 0xc0) != 0x80 )
1034
34
                        return(-1);
1035
6.23M
            }
1036
52.5M
        }
1037
1038
4.67M
    return(-1);
1039
4.96M
}
1040
/**
1041
 * Create a substring from a given UTF-8 string
1042
 * Note:  positions are given in units of UTF-8 chars
1043
 *
1044
 * @param utf  a sequence of UTF-8 encoded bytes
1045
 * @param start  relative pos of first char
1046
 * @param len  total number to copy
1047
 * @returns a pointer to a newly created string or NULL if the
1048
 * start index is out of bounds or a memory allocation failed.
1049
 * If len is too large, the result is truncated.
1050
 */
1051
1052
xmlChar *
1053
3.17k
xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
1054
3.17k
    int i;
1055
3.17k
    int ch;
1056
1057
3.17k
    if (utf == NULL) return(NULL);
1058
3.17k
    if (start < 0) return(NULL);
1059
3.17k
    if (len < 0) return(NULL);
1060
1061
    /*
1062
     * Skip over any leading chars
1063
     */
1064
10.0M
    for (i = 0; i < start; i++) {
1065
10.0M
        ch = *utf++;
1066
10.0M
        if (ch == 0)
1067
0
            return(NULL);
1068
        /* skip over remaining bytes for this char */
1069
10.0M
        if (ch & 0x80) {
1070
1.89k
            ch <<= 1;
1071
5.11k
            while (ch & 0x80) {
1072
3.22k
                if (*utf++ == 0)
1073
0
                    return(NULL);
1074
3.22k
                ch <<= 1;
1075
3.22k
            }
1076
1.89k
        }
1077
10.0M
    }
1078
1079
3.17k
    return(xmlUTF8Strndup(utf, len));
1080
3.17k
}
1081
1082
/**
1083
 * Replaces a string with an escaped string.
1084
 *
1085
 * `msg` must be a heap-allocated buffer created by libxml2 that may be
1086
 * returned, or that may be freed and replaced.
1087
 *
1088
 * @param msg  a pointer to the string in which to escape '%' characters.
1089
 * @returns the same string with all '%' characters escaped.
1090
 */
1091
xmlChar *
1092
xmlEscapeFormatString(xmlChar **msg)
1093
697k
{
1094
697k
    xmlChar *msgPtr = NULL;
1095
697k
    xmlChar *result = NULL;
1096
697k
    xmlChar *resultPtr = NULL;
1097
697k
    size_t count = 0;
1098
697k
    size_t msgLen = 0;
1099
697k
    size_t resultLen = 0;
1100
1101
697k
    if (!msg || !*msg)
1102
334
        return(NULL);
1103
1104
30.8M
    for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1105
30.1M
        ++msgLen;
1106
30.1M
        if (*msgPtr == '%')
1107
689
            ++count;
1108
30.1M
    }
1109
1110
697k
    if (count == 0)
1111
696k
        return(*msg);
1112
1113
402
    if ((count > INT_MAX) || (msgLen > INT_MAX - count))
1114
0
        return(NULL);
1115
402
    resultLen = msgLen + count + 1;
1116
402
    result = xmlMalloc(resultLen);
1117
402
    if (result == NULL) {
1118
        /* Clear *msg to prevent format string vulnerabilities in
1119
           out-of-memory situations. */
1120
1
        xmlFree(*msg);
1121
1
        *msg = NULL;
1122
1
        return(NULL);
1123
1
    }
1124
1125
112k
    for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1126
112k
        *resultPtr = *msgPtr;
1127
112k
        if (*msgPtr == '%')
1128
688
            *(++resultPtr) = '%';
1129
112k
    }
1130
401
    result[resultLen - 1] = '\0';
1131
1132
401
    xmlFree(*msg);
1133
401
    *msg = result;
1134
1135
401
    return *msg;
1136
402
}
1137