Coverage Report

Created: 2026-04-12 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libxml2/xmlstring.c
Line
Count
Source
1
/*
2
 * string.c : an XML string utilities module
3
 *
4
 * This module provides various utility functions for manipulating
5
 * the xmlChar* type. All functions named xmlStr* have been moved here
6
 * from the parser.c file (their original home).
7
 *
8
 * See Copyright for the status of this software.
9
 *
10
 * UTF8 string routines from: William Brack
11
 *
12
 * Author: Daniel Veillard
13
 */
14
15
#define IN_LIBXML
16
#include "libxml.h"
17
18
#include <stdlib.h>
19
#include <string.h>
20
#include <limits.h>
21
#include <libxml/xmlmemory.h>
22
#include <libxml/parserInternals.h>
23
#include <libxml/xmlstring.h>
24
25
#include "private/parser.h"
26
#include "private/string.h"
27
28
/*
 * Fallback definition of va_copy for toolchains whose headers do not
 * provide it as a macro: use __va_copy when that macro exists,
 * otherwise copy the va_list object bytewise with memcpy.
 */
#ifndef va_copy
29
  #ifdef __va_copy
30
    #define va_copy(dest, src) __va_copy(dest, src)
31
  #else
32
    #define va_copy(dest, src) memcpy(&(dest), &(src), sizeof(va_list))
33
  #endif
34
#endif
35
36
/************************************************************************
37
 *                                                                      *
38
 *                Commodity functions to handle xmlChars                *
39
 *                                                                      *
40
 ************************************************************************/
41
42
/**
43
 * a strndup for array of xmlChar's
44
 *
45
 * @param cur  the input xmlChar *
46
 * @param len  the len of `cur`
47
 * @returns a new xmlChar * or NULL
48
 */
49
xmlChar *
50
122M
xmlStrndup(const xmlChar *cur, int len) {
51
122M
    xmlChar *ret;
52
53
122M
    if ((cur == NULL) || (len < 0)) return(NULL);
54
122M
    ret = xmlMalloc((size_t) len + 1);
55
122M
    if (ret == NULL) {
56
22.3k
        return(NULL);
57
22.3k
    }
58
122M
    memcpy(ret, cur, len);
59
122M
    ret[len] = 0;
60
122M
    return(ret);
61
122M
}
62
63
/**
64
 * a strdup for array of xmlChar's. Since they are supposed to be
65
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
66
 * a termination mark of '0'.
67
 *
68
 * @param cur  the input xmlChar *
69
 * @returns a new xmlChar * or NULL
70
 */
71
xmlChar *
72
87.3M
xmlStrdup(const xmlChar *cur) {
73
87.3M
    const xmlChar *p = cur;
74
75
87.3M
    if (cur == NULL) return(NULL);
76
52.2G
    while (*p != 0) p++; /* non input consuming */
77
87.3M
    return(xmlStrndup(cur, p - cur));
78
87.3M
}
79
80
/**
81
 * a strndup for char's to xmlChar's
82
 *
83
 * @param cur  the input char *
84
 * @param len  the len of `cur`
85
 * @returns a new xmlChar * or NULL
86
 */
87
88
xmlChar *
89
2.30M
xmlCharStrndup(const char *cur, int len) {
90
2.30M
    int i;
91
2.30M
    xmlChar *ret;
92
93
2.30M
    if ((cur == NULL) || (len < 0)) return(NULL);
94
2.30M
    ret = xmlMalloc((size_t) len + 1);
95
2.30M
    if (ret == NULL) {
96
1.19k
        return(NULL);
97
1.19k
    }
98
149M
    for (i = 0;i < len;i++) {
99
        /* Explicit sign change */
100
146M
        ret[i] = (xmlChar) cur[i];
101
146M
        if (ret[i] == 0) return(ret);
102
146M
    }
103
2.30M
    ret[len] = 0;
104
2.30M
    return(ret);
105
2.30M
}
106
107
/**
108
 * a strdup for char's to xmlChar's
109
 *
110
 * @param cur  the input char *
111
 * @returns a new xmlChar * or NULL
112
 */
113
114
xmlChar *
115
2.30M
xmlCharStrdup(const char *cur) {
116
2.30M
    const char *p = cur;
117
118
2.30M
    if (cur == NULL) return(NULL);
119
149M
    while (*p != '\0') p++; /* non input consuming */
120
2.30M
    return(xmlCharStrndup(cur, p - cur));
121
2.30M
}
122
123
/**
124
 * a strcmp for xmlChar's
125
 *
126
 * @param str1  the first xmlChar *
127
 * @param str2  the second xmlChar *
128
 * @returns the integer result of the comparison
129
 */
130
131
int
132
28.3k
xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
133
28.3k
    if (str1 == str2) return(0);
134
13.3k
    if (str1 == NULL) return(-1);
135
11.9k
    if (str2 == NULL) return(1);
136
10.6k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
137
10.6k
    return(strcmp((const char *)str1, (const char *)str2));
138
#else
139
    do {
140
        int tmp = *str1++ - *str2;
141
        if (tmp != 0) return(tmp);
142
    } while (*str2++ != 0);
143
    return 0;
144
#endif
145
11.9k
}
146
147
/**
148
 * Check if both strings are equal of have same content.
149
 * Should be a bit more readable and faster than #xmlStrcmp
150
 *
151
 * @param str1  the first xmlChar *
152
 * @param str2  the second xmlChar *
153
 * @returns 1 if they are equal, 0 if they are different
154
 */
155
156
int
157
258M
xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
158
258M
    if (str1 == str2) return(1);
159
256M
    if (str1 == NULL) return(0);
160
232M
    if (str2 == NULL) return(0);
161
225M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
162
225M
    return(strcmp((const char *)str1, (const char *)str2) == 0);
163
#else
164
    do {
165
        if (*str1++ != *str2) return(0);
166
    } while (*str2++);
167
    return(1);
168
#endif
169
232M
}
170
171
/**
172
 * Check if a QName is Equal to a given string
173
 *
174
 * @param pref  the prefix of the QName
175
 * @param name  the localname of the QName
176
 * @param str  the second xmlChar *
177
 * @returns 1 if they are equal, 0 if they are different
178
 */
179
180
int
181
478k
xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
182
478k
    if (pref == NULL) return(xmlStrEqual(name, str));
183
218k
    if (name == NULL) return(0);
184
218k
    if (str == NULL) return(0);
185
186
754k
    do {
187
754k
        if (*pref++ != *str) return(0);
188
754k
    } while ((*str++) && (*pref));
189
218k
    if (*str++ != ':') return(0);
190
27.1M
    do {
191
27.1M
        if (*name++ != *str) return(0);
192
27.1M
    } while (*str++);
193
217k
    return(1);
194
217k
}
195
196
/**
197
 * a strncmp for xmlChar's
198
 *
199
 * @param str1  the first xmlChar *
200
 * @param str2  the second xmlChar *
201
 * @param len  the max comparison length
202
 * @returns the integer result of the comparison
203
 */
204
205
int
206
94.6M
xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
207
94.6M
    if (len <= 0) return(0);
208
94.6M
    if (str1 == str2) return(0);
209
94.6M
    if (str1 == NULL) return(-1);
210
94.6M
    if (str2 == NULL) return(1);
211
94.6M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
212
94.6M
    return(strncmp((const char *)str1, (const char *)str2, len));
213
#else
214
    do {
215
        int tmp = *str1++ - *str2;
216
        if (tmp != 0 || --len == 0) return(tmp);
217
    } while (*str2++ != 0);
218
    return 0;
219
#endif
220
94.6M
}
221
222
/*
 * Byte-wise case folding table used by the case-insensitive string
 * functions in this file. ASCII upper-case letters 'A'..'Z'
 * (0x41..0x5A) map to their lower-case counterparts (0x61..0x7A);
 * every other byte maps to itself.
 *
 * Entry 0x5B ('[') used to map to 0x7B ('{'), which made '[' and '{'
 * compare as equal; it now maps to itself like all other non-letters.
 */
static const xmlChar casemap[256] = {
    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
    0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
    0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x5B,0x5C,0x5D,0x5E,0x5F,
    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
    0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
    0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
    0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
    0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
    0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
    0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
    0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
    0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
    0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
    0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
    0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
    0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
    0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
    0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
};
256
257
/**
258
 * a strcasecmp for xmlChar's
259
 *
260
 * @param str1  the first xmlChar *
261
 * @param str2  the second xmlChar *
262
 * @returns the integer result of the comparison
263
 */
264
265
int
266
6.88M
xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
267
6.88M
    register int tmp;
268
269
6.88M
    if (str1 == str2) return(0);
270
6.88M
    if (str1 == NULL) return(-1);
271
6.88M
    if (str2 == NULL) return(1);
272
10.8M
    do {
273
10.8M
        tmp = casemap[*str1++] - casemap[*str2];
274
10.8M
        if (tmp != 0) return(tmp);
275
10.8M
    } while (*str2++ != 0);
276
357k
    return 0;
277
6.88M
}
278
279
/**
280
 * a strncasecmp for xmlChar's
281
 *
282
 * @param str1  the first xmlChar *
283
 * @param str2  the second xmlChar *
284
 * @param len  the max comparison length
285
 * @returns the integer result of the comparison
286
 */
287
288
int
289
84.5k
xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
290
84.5k
    register int tmp;
291
292
84.5k
    if (len <= 0) return(0);
293
84.5k
    if (str1 == str2) return(0);
294
84.5k
    if (str1 == NULL) return(-1);
295
84.5k
    if (str2 == NULL) return(1);
296
129k
    do {
297
129k
        tmp = casemap[*str1++] - casemap[*str2];
298
129k
        if (tmp != 0 || --len == 0) return(tmp);
299
129k
    } while (*str2++ != 0);
300
0
    return 0;
301
84.5k
}
302
303
/**
304
 * a strchr for xmlChar's
305
 *
306
 * @param str  the xmlChar * array
307
 * @param val  the xmlChar to search
308
 * @returns the xmlChar * for the first occurrence or NULL.
309
 */
310
311
const xmlChar *
312
325M
xmlStrchr(const xmlChar *str, xmlChar val) {
313
325M
    if (str == NULL) return(NULL);
314
1.76G
    while (*str != 0) { /* non input consuming */
315
1.45G
        if (*str == val) return((xmlChar *) str);
316
1.43G
        str++;
317
1.43G
    }
318
312M
    return(NULL);
319
324M
}
320
321
/**
322
 * a strstr for xmlChar's
323
 *
324
 * @param str  the xmlChar * array (haystack)
325
 * @param val  the xmlChar to search (needle)
326
 * @returns the xmlChar * for the first occurrence or NULL.
327
 */
328
329
const xmlChar *
330
2.93M
xmlStrstr(const xmlChar *str, const xmlChar *val) {
331
2.93M
    int n;
332
333
2.93M
    if (str == NULL) return(NULL);
334
2.93M
    if (val == NULL) return(NULL);
335
2.93M
    n = xmlStrlen(val);
336
337
2.93M
    if (n == 0) return(str);
338
522M
    while (*str != 0) { /* non input consuming */
339
519M
        if (*str == *val) {
340
575k
            if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
341
575k
        }
342
519M
        str++;
343
519M
    }
344
2.76M
    return(NULL);
345
2.93M
}
346
347
/**
348
 * a case-ignoring strstr for xmlChar's
349
 *
350
 * @param str  the xmlChar * array (haystack)
351
 * @param val  the xmlChar to search (needle)
352
 * @returns the xmlChar * for the first occurrence or NULL.
353
 */
354
355
const xmlChar *
356
0
xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
357
0
    int n;
358
359
0
    if (str == NULL) return(NULL);
360
0
    if (val == NULL) return(NULL);
361
0
    n = xmlStrlen(val);
362
363
0
    if (n == 0) return(str);
364
0
    while (*str != 0) { /* non input consuming */
365
0
        if (casemap[*str] == casemap[*val])
366
0
            if (!xmlStrncasecmp(str, val, n)) return(str);
367
0
        str++;
368
0
    }
369
0
    return(NULL);
370
0
}
371
372
/**
373
 * Extract a substring of a given string
374
 *
375
 * @param str  the xmlChar * array (haystack)
376
 * @param start  the index of the first char (zero based)
377
 * @param len  the length of the substring
378
 * @returns the xmlChar * for the first occurrence or NULL.
379
 */
380
381
xmlChar *
382
0
xmlStrsub(const xmlChar *str, int start, int len) {
383
0
    int i;
384
385
0
    if (str == NULL) return(NULL);
386
0
    if (start < 0) return(NULL);
387
0
    if (len < 0) return(NULL);
388
389
0
    for (i = 0;i < start;i++) {
390
0
        if (*str == 0) return(NULL);
391
0
        str++;
392
0
    }
393
0
    if (*str == 0) return(NULL);
394
0
    return(xmlStrndup(str, len));
395
0
}
396
397
/**
398
 * length of a xmlChar's string
399
 *
400
 * @param str  the xmlChar * array
401
 * @returns the number of xmlChar contained in the ARRAY.
402
 */
403
404
int
405
17.0M
xmlStrlen(const xmlChar *str) {
406
17.0M
    size_t len = str ? strlen((const char *)str) : 0;
407
17.0M
    return(len > INT_MAX ? 0 : len);
408
17.0M
}
409
410
/**
411
 * a strncat for array of xmlChar's, it will extend `cur` with the len
412
 * first bytes of `add`. Note that if `len` < 0 then this is an API error
413
 * and NULL will be returned.
414
 *
415
 * @param cur  the original xmlChar * array
416
 * @param add  the xmlChar * array added
417
 * @param len  the length of `add`
418
 * @returns a new xmlChar *, the original `cur` is reallocated and should
419
 * not be freed.
420
 */
421
422
xmlChar *
423
6.64M
xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
424
6.64M
    int size;
425
6.64M
    xmlChar *ret;
426
427
6.64M
    if ((add == NULL) || (len == 0))
428
10.1k
        return(cur);
429
430
6.63M
    if (len < 0) {
431
0
        if (cur != NULL)
432
0
            xmlFree(cur);
433
0
        return(NULL);
434
0
    }
435
436
6.63M
    if (cur == NULL)
437
7.10k
        return(xmlStrndup(add, len));
438
439
6.62M
    size = xmlStrlen(cur);
440
6.62M
    if ((size < 0) || (size > INT_MAX - len)) {
441
0
        xmlFree(cur);
442
0
        return(NULL);
443
0
    }
444
6.62M
    ret = (xmlChar *) xmlRealloc(cur, (size_t) size + len + 1);
445
6.62M
    if (ret == NULL) {
446
1.81k
        xmlFree(cur);
447
1.81k
        return(NULL);
448
1.81k
    }
449
6.62M
    memcpy(&ret[size], add, len);
450
6.62M
    ret[size + len] = 0;
451
6.62M
    return(ret);
452
6.62M
}
453
454
/**
455
 * same as #xmlStrncat, but creates a new string.  The original
456
 * two strings are not freed. If `len` is < 0 then the length
457
 * will be calculated automatically.
458
 *
459
 * @param str1  first xmlChar string
460
 * @param str2  second xmlChar string
461
 * @param len  the len of `str2` or < 0
462
 * @returns a new xmlChar * or NULL
463
 */
464
xmlChar *
465
28.5k
xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
466
28.5k
    int size;
467
28.5k
    xmlChar *ret;
468
469
28.5k
    if (len < 0) {
470
23.4k
        len = xmlStrlen(str2);
471
23.4k
        if (len < 0)
472
0
            return(NULL);
473
23.4k
    }
474
28.5k
    if (str1 == NULL)
475
660
        return(xmlStrndup(str2, len));
476
27.8k
    if ((str2 == NULL) || (len == 0))
477
1.00k
        return(xmlStrdup(str1));
478
479
26.8k
    size = xmlStrlen(str1);
480
26.8k
    if ((size < 0) || (size > INT_MAX - len))
481
0
        return(NULL);
482
26.8k
    ret = (xmlChar *) xmlMalloc((size_t) size + len + 1);
483
26.8k
    if (ret == NULL)
484
8
        return(NULL);
485
26.8k
    memcpy(ret, str1, size);
486
26.8k
    memcpy(&ret[size], str2, len);
487
26.8k
    ret[size + len] = 0;
488
26.8k
    return(ret);
489
26.8k
}
490
491
/**
492
 * a strcat for array of xmlChar's. Since they are supposed to be
493
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
494
 * a termination mark of '0'.
495
 *
496
 * @param cur  the original xmlChar * array
497
 * @param add  the xmlChar * array added
498
 * @returns a new xmlChar * containing the concatenated string. The original
499
 * `cur` is reallocated and should not be freed.
500
 */
501
xmlChar *
502
6.65M
xmlStrcat(xmlChar *cur, const xmlChar *add) {
503
6.65M
    const xmlChar *p = add;
504
505
6.65M
    if (add == NULL) return(cur);
506
6.65M
    if (cur == NULL)
507
18.7k
        return(xmlStrdup(add));
508
509
174M
    while (*p != 0) p++; /* non input consuming */
510
6.63M
    return(xmlStrncat(cur, add, p - add));
511
6.65M
}
512
513
/**
514
 * Formats `msg` and places result into `buf`.
515
 *
516
 * @param buf  the result buffer.
517
 * @param len  the result buffer length.
518
 * @param msg  the message with printf formatting.
519
 * @param ...   extra parameters for the message.
520
 * @returns the number of characters written to `buf` or -1 if an error occurs.
521
 */
522
int
523
0
xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
524
0
    va_list args;
525
0
    int ret;
526
527
0
    if((buf == NULL) || (msg == NULL) || (len <= 0)) {
528
0
        return(-1);
529
0
    }
530
531
0
    va_start(args, msg);
532
0
    ret = vsnprintf((char *) buf, len, (const char *) msg, args);
533
0
    va_end(args);
534
0
    buf[len - 1] = 0; /* be safe ! */
535
536
0
    return(ret);
537
0
}
538
539
/**
540
 * Formats `msg` and places result into `buf`.
541
 *
542
 * @param buf  the result buffer.
543
 * @param len  the result buffer length.
544
 * @param msg  the message with printf formatting.
545
 * @param ap  extra parameters for the message.
546
 * @returns the number of characters written to `buf` or -1 if an error occurs.
547
 */
548
int
549
0
xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
550
0
    int ret;
551
552
0
    if((buf == NULL) || (msg == NULL) || (len <= 0)) {
553
0
        return(-1);
554
0
    }
555
556
0
    ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
557
0
    buf[len - 1] = 0; /* be safe ! */
558
559
0
    return(ret);
560
0
}
561
562
/**
563
 * Creates a newly allocated string according to format.
564
 *
565
 * @param out  pointer to the resulting string
566
 * @param maxSize  maximum size of the output buffer
567
 * @param msg  printf format string
568
 * @param ap  arguments for format string
569
 * @returns 0 on success, 1 if the result was truncated or on other
570
 * errors, -1 if a memory allocation failed.
571
 */
572
int
573
13.1M
xmlStrVASPrintf(xmlChar **out, int maxSize, const char *msg, va_list ap) {
574
13.1M
    char empty[1];
575
13.1M
    va_list copy;
576
13.1M
    xmlChar *buf;
577
13.1M
    int res, size;
578
13.1M
    int truncated = 0;
579
580
13.1M
    if (out == NULL)
581
0
        return(1);
582
13.1M
    *out = NULL;
583
13.1M
    if (msg == NULL)
584
0
        return(1);
585
13.1M
    if (maxSize < 32)
586
0
        maxSize = 32;
587
588
13.1M
    va_copy(copy, ap);
589
13.1M
    res = vsnprintf(empty, 1, msg, copy);
590
13.1M
    va_end(copy);
591
592
13.1M
    if (res > 0) {
593
        /* snprintf seems to work according to C99. */
594
595
13.1M
        if (res < maxSize) {
596
13.0M
            size = res + 1;
597
13.0M
        } else {
598
4.40k
            size = maxSize;
599
4.40k
            truncated = 1;
600
4.40k
        }
601
13.1M
        buf = xmlMalloc(size);
602
13.1M
        if (buf == NULL)
603
10.3k
            return(-1);
604
13.0M
        if (vsnprintf((char *) buf, size, msg, ap) < 0) {
605
0
            xmlFree(buf);
606
0
            return(1);
607
0
        }
608
13.0M
    } else {
609
        /*
610
         * Unfortunately, older snprintf implementations don't follow the
611
         * C99 spec. If the output exceeds the size of the buffer, they can
612
         * return -1, 0 or the number of characters written instead of the
613
         * needed size. Older MSCVRT also won't write a terminating null
614
         * byte if the buffer is too small.
615
         *
616
         * If the value returned is non-negative and strictly less than
617
         * the buffer size (without terminating null), the result should
618
         * have been written completely, so we double the buffer size
619
         * until this condition is true. This assumes that snprintf will
620
         * eventually return a non-negative value. Otherwise, we will
621
         * allocate more and more memory until we run out.
622
         *
623
         * Note that this code path is also executed on conforming
624
         * platforms if the output is the empty string.
625
         */
626
627
0
        buf = NULL;
628
0
        size = 32;
629
0
        while (1) {
630
0
            buf = xmlMalloc(size);
631
0
            if (buf == NULL)
632
0
                return(-1);
633
634
0
            va_copy(copy, ap);
635
0
            res = vsnprintf((char *) buf, size, msg, copy);
636
0
            va_end(copy);
637
0
            if ((res >= 0) && (res < size - 1))
638
0
                break;
639
640
0
            if (size >= maxSize) {
641
0
                truncated = 1;
642
0
                break;
643
0
            }
644
645
0
            xmlFree(buf);
646
647
0
            if (size > maxSize / 2)
648
0
                size = maxSize;
649
0
            else
650
0
                size *= 2;
651
0
        }
652
0
    }
653
654
    /*
655
     * If the output was truncated, make sure that the buffer doesn't
656
     * end with a truncated UTF-8 sequence.
657
     */
658
13.0M
    if (truncated != 0) {
659
4.40k
        int i = size - 1;
660
661
41.7k
        while (i > 0) {
662
            /* Break after ASCII */
663
41.7k
            if (buf[i-1] < 0x80)
664
1.45k
                break;
665
40.3k
            i -= 1;
666
            /* Break before non-ASCII */
667
40.3k
            if (buf[i] >= 0xc0)
668
2.95k
                break;
669
40.3k
        }
670
671
4.40k
        buf[i] = 0;
672
4.40k
    }
673
674
13.0M
    *out = (xmlChar *) buf;
675
13.0M
    return(truncated);
676
13.1M
}
677
678
/**
679
 * See xmlStrVASPrintf.
680
 *
681
 * @param out  pointer to the resulting string
682
 * @param maxSize  maximum size of the output buffer
683
 * @param msg  printf format string
684
 * @param ...  arguments for format string
685
 * @returns 0 on success, 1 if the result was truncated or on other
686
 * errors, -1 if a memory allocation failed.
687
 */
688
int
689
0
xmlStrASPrintf(xmlChar **out, int maxSize, const char *msg, ...) {
690
0
    va_list ap;
691
0
    int ret;
692
693
0
    va_start(ap, msg);
694
0
    ret = xmlStrVASPrintf(out, maxSize, msg, ap);
695
0
    va_end(ap);
696
697
0
    return(ret);
698
0
}
699
700
/************************************************************************
701
 *                                                                      *
702
 *              Generic UTF8 handling routines                          *
703
 *                                                                      *
704
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
705
 *                                                                      *
706
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
707
 * 0000 0000-0000 007F   0xxxxxxx                                       *
708
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
709
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
710
 *                                                                      *
711
 * I hope we won't use values > 0xFFFF anytime soon !                   *
712
 *                                                                      *
713
 ************************************************************************/
714
715
716
/**
717
 * calculates the internal size of a UTF8 character
718
 *
719
 * @param utf  pointer to the UTF8 character
720
 * @returns the numbers of bytes in the character, -1 on format error
721
 */
722
int
723
0
xmlUTF8Size(const xmlChar *utf) {
724
0
    xmlChar mask;
725
0
    int len;
726
727
0
    if (utf == NULL)
728
0
        return -1;
729
0
    if (*utf < 0x80)
730
0
        return 1;
731
    /* check valid UTF8 character */
732
0
    if (!(*utf & 0x40))
733
0
        return -1;
734
    /* determine number of bytes in char */
735
0
    len = 2;
736
0
    for (mask=0x20; mask != 0; mask>>=1) {
737
0
        if (!(*utf & mask))
738
0
            return len;
739
0
        len++;
740
0
    }
741
0
    return -1;
742
0
}
743
744
/**
745
 * compares the two UCS4 values
746
 *
747
 * @param utf1  pointer to first UTF8 char
748
 * @param utf2  pointer to second UTF8 char
749
 * @returns result of the compare as with #xmlStrncmp
750
 */
751
int
752
0
xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
753
754
0
    if (utf1 == NULL ) {
755
0
        if (utf2 == NULL)
756
0
            return 0;
757
0
        return -1;
758
0
    }
759
0
    return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
760
0
}
761
762
/**
763
 * compute the length of an UTF8 string, it doesn't do a full UTF8
764
 * checking of the content of the string.
765
 *
766
 * @param utf  a sequence of UTF-8 encoded bytes
767
 * @returns the number of characters in the string or -1 in case of error
768
 */
769
int
770
29.3k
xmlUTF8Strlen(const xmlChar *utf) {
771
29.3k
    size_t ret = 0;
772
773
29.3k
    if (utf == NULL)
774
25
        return(-1);
775
776
22.2M
    while (*utf != 0) {
777
22.2M
        if (utf[0] & 0x80) {
778
18.7M
            if ((utf[1] & 0xc0) != 0x80)
779
0
                return(-1);
780
18.7M
            if ((utf[0] & 0xe0) == 0xe0) {
781
18.7M
                if ((utf[2] & 0xc0) != 0x80)
782
0
                    return(-1);
783
18.7M
                if ((utf[0] & 0xf0) == 0xf0) {
784
7.10k
                    if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
785
752
                        return(-1);
786
6.35k
                    utf += 4;
787
18.7M
                } else {
788
18.7M
                    utf += 3;
789
18.7M
                }
790
18.7M
            } else {
791
8.82k
                utf += 2;
792
8.82k
            }
793
18.7M
        } else {
794
3.45M
            utf++;
795
3.45M
        }
796
22.2M
        ret++;
797
22.2M
    }
798
28.5k
    return(ret > INT_MAX ? 0 : ret);
799
29.2k
}
800
801
/**
 * Decode the first UTF8 character from `utf`. Overlong encodings,
 * surrogates (0xD800..0xDFFF) and values of 0x110000 and above are
 * rejected.
 *
 * @param utf  a sequence of UTF-8 encoded bytes
 * @param len  on input, the number of bytes available in `utf`;
 *        the character must fit entirely within that many bytes.
 * @returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int val;

    if ((utf == NULL) || (len == NULL))
        goto error;

    val = utf[0];

    if (val < 0x80) {
        /* 1-byte code */
        if (*len < 1)
            goto error;
        *len = 1;
        return val;
    }

    /* Every multi-byte form needs a first continuation byte. */
    if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
        goto error;

    if (val < 0xe0) {
        /* 2-byte code; leads below 0xc2 are stray or overlong */
        if (val < 0xc2)
            goto error;
        *len = 2;
        return ((val & 0x1f) << 6) | (utf[1] & 0x3f);
    }

    if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
        goto error;

    if (val < 0xf0) {
        /* 3-byte code */
        *len = 3;
        val = ((val & 0xf) << 12) |
              ((utf[1] & 0x3f) << 6) |
              (utf[2] & 0x3f);
        /* reject overlong forms and surrogates */
        if ((val < 0x800) || ((val >= 0xd800) && (val < 0xe000)))
            goto error;
        return val;
    }

    if ((*len < 4) || ((utf[3] & 0xc0) != 0x80))
        goto error;

    /* 4-byte code */
    *len = 4;
    val = ((val & 0x7) << 18) |
          ((utf[1] & 0x3f) << 12) |
          ((utf[2] & 0x3f) << 6) |
          (utf[3] & 0x3f);
    /* reject overlong forms and values outside the Unicode range */
    if ((val < 0x10000) || (val >= 0x110000))
        goto error;
    return val;

error:
    if (len != NULL)
        *len = 0;
    return -1;
}
868
869
/**
 * Checks whether the null-terminated string `utf` has the byte
 * structure of UTF-8. Only the lead/continuation bit patterns are
 * examined, so sequences longer than necessary for their value are
 * accepted, and the 0x10ffff maximum value is not enforced; the
 * 4-byte maximum length is.
 *
 * @param utf  Pointer to putative UTF-8 encoded string.
 * @returns 1 if `utf` is valid, 0 if it is NULL or malformed.
 */
int
xmlCheckUTF8(const unsigned char *utf)
{
    if (utf == NULL)
        return 0;

    /*
     * Accepted sequences (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    while (*utf != 0) {         /* string is 0-terminated */
        unsigned char lead = *utf;
        int trail;
        int k;

        if ((lead & 0x80) == 0x00)          /* 1-byte code, starts with 0 */
            trail = 0;
        else if ((lead & 0xe0) == 0xc0)     /* 2-byte code, starts with 110 */
            trail = 1;
        else if ((lead & 0xf0) == 0xe0)     /* 3-byte code, starts with 1110 */
            trail = 2;
        else if ((lead & 0xf8) == 0xf0)     /* 4-byte code, starts with 11110 */
            trail = 3;
        else                                /* unknown encoding */
            return 0;

        /*
         * Check the continuation bytes in order. A terminator fails
         * the test, so nothing beyond it is read.
         */
        for (k = 1; k <= trail; k++) {
            if ((utf[k] & 0xc0) != 0x80)
                return 0;
        }
        utf += trail + 1;
    }
    return 1;
}
921
922
/**
923
 * storage size of an UTF8 string
924
 * the behaviour is not guaranteed if the input string is not UTF-8
925
 *
926
 * @param utf  a sequence of UTF-8 encoded bytes
927
 * @param len  the number of characters in the array
928
 * @returns the storage size of
929
 * the first 'len' characters of ARRAY
930
 */
931
932
int
933
10.1M
xmlUTF8Strsize(const xmlChar *utf, int len) {
934
10.1M
    const xmlChar *ptr=utf;
935
10.1M
    int ch;
936
10.1M
    size_t ret;
937
938
10.1M
    if (utf == NULL)
939
0
        return(0);
940
941
10.1M
    if (len <= 0)
942
0
        return(0);
943
944
39.9M
    while ( len-- > 0) {
945
29.7M
        if ( !*ptr )
946
3.33k
            break;
947
29.7M
        ch = *ptr++;
948
29.7M
        if ((ch & 0x80))
949
61.8M
            while ((ch<<=1) & 0x80 ) {
950
41.2M
    if (*ptr == 0) break;
951
41.2M
                ptr++;
952
41.2M
      }
953
29.7M
    }
954
10.1M
    ret = ptr - utf;
955
10.1M
    return (ret > INT_MAX ? 0 : ret);
956
10.1M
}
957
958
959
/**
960
 * a strndup for array of UTF8's
961
 *
962
 * @param utf  the input UTF8 *
963
 * @param len  the len of `utf` (in chars)
964
 * @returns a new UTF8 * or NULL
965
 */
966
xmlChar *
967
4.39k
xmlUTF8Strndup(const xmlChar *utf, int len) {
968
4.39k
    xmlChar *ret;
969
4.39k
    int i;
970
971
4.39k
    if ((utf == NULL) || (len < 0)) return(NULL);
972
4.39k
    i = xmlUTF8Strsize(utf, len);
973
4.39k
    ret = xmlMalloc((size_t) i + 1);
974
4.39k
    if (ret == NULL) {
975
4
        return(NULL);
976
4
    }
977
4.38k
    memcpy(ret, utf, i);
978
4.38k
    ret[i] = 0;
979
4.38k
    return(ret);
980
4.39k
}
981
982
/**
983
 * a function to provide the equivalent of fetching a
984
 * character from a string array
985
 *
986
 * @param utf  the input UTF8 *
987
 * @param pos  the position of the desired UTF8 char (in chars)
988
 * @returns a pointer to the UTF8 character or NULL
989
 */
990
const xmlChar *
991
432k
xmlUTF8Strpos(const xmlChar *utf, int pos) {
992
432k
    int ch;
993
994
432k
    if (utf == NULL) return(NULL);
995
432k
    if (pos < 0)
996
0
        return(NULL);
997
11.9M
    while (pos--) {
998
11.5M
        ch = *utf++;
999
11.5M
        if (ch == 0)
1000
0
            return(NULL);
1001
11.5M
        if ( ch & 0x80 ) {
1002
            /* if not simple ascii, verify proper format */
1003
909k
            if ( (ch & 0xc0) != 0xc0 )
1004
0
                return(NULL);
1005
            /* then skip over remaining bytes for this char */
1006
2.73M
            while ( (ch <<= 1) & 0x80 )
1007
1.82M
                if ( (*utf++ & 0xc0) != 0x80 )
1008
0
                    return(NULL);
1009
909k
        }
1010
11.5M
    }
1011
432k
    return((xmlChar *)utf);
1012
432k
}
1013
1014
/**
1015
 * a function to provide the relative location of a UTF8 char
1016
 *
1017
 * @param utf  the input UTF8 *
1018
 * @param utfchar  the UTF8 character to be found
1019
 * @returns the relative character position of the desired char
1020
 * or -1 if not found
1021
 */
1022
int
1023
5.15M
xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
1024
5.15M
    size_t i;
1025
5.15M
    int size;
1026
5.15M
    int ch;
1027
1028
5.15M
    if (utf==NULL || utfchar==NULL) return -1;
1029
5.15M
    size = xmlUTF8Strsize(utfchar, 1);
1030
97.5M
        for(i=0; (ch=*utf) != 0; i++) {
1031
92.9M
            if (xmlStrncmp(utf, utfchar, size)==0)
1032
576k
                return(i > INT_MAX ? 0 : i);
1033
92.3M
            utf++;
1034
92.3M
            if ( ch & 0x80 ) {
1035
                /* if not simple ascii, verify proper format */
1036
7.08M
                if ( (ch & 0xc0) != 0xc0 )
1037
0
                    return(-1);
1038
                /* then skip over remaining bytes for this char */
1039
21.1M
                while ( (ch <<= 1) & 0x80 )
1040
14.0M
                    if ( (*utf++ & 0xc0) != 0x80 )
1041
257
                        return(-1);
1042
7.08M
            }
1043
92.3M
        }
1044
1045
4.57M
    return(-1);
1046
5.15M
}
1047
/**
1048
 * Create a substring from a given UTF-8 string
1049
 * Note:  positions are given in units of UTF-8 chars
1050
 *
1051
 * @param utf  a sequence of UTF-8 encoded bytes
1052
 * @param start  relative pos of first char
1053
 * @param len  total number to copy
1054
 * @returns a pointer to a newly created string or NULL if the
1055
 * start index is out of bounds or a memory allocation failed.
1056
 * If len is too large, the result is truncated.
1057
 */
1058
1059
xmlChar *
1060
4.39k
xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
1061
4.39k
    int i;
1062
4.39k
    int ch;
1063
1064
4.39k
    if (utf == NULL) return(NULL);
1065
4.39k
    if (start < 0) return(NULL);
1066
4.39k
    if (len < 0) return(NULL);
1067
1068
    /*
1069
     * Skip over any leading chars
1070
     */
1071
43.3k
    for (i = 0; i < start; i++) {
1072
38.9k
        ch = *utf++;
1073
38.9k
        if (ch == 0)
1074
0
            return(NULL);
1075
        /* skip over remaining bytes for this char */
1076
38.9k
        if (ch & 0x80) {
1077
1.36k
            ch <<= 1;
1078
3.90k
            while (ch & 0x80) {
1079
2.54k
                if (*utf++ == 0)
1080
0
                    return(NULL);
1081
2.54k
                ch <<= 1;
1082
2.54k
            }
1083
1.36k
        }
1084
38.9k
    }
1085
1086
4.39k
    return(xmlUTF8Strndup(utf, len));
1087
4.39k
}
1088
1089
/**
1090
 * Replaces a string with an escaped string.
1091
 *
1092
 * `msg` must be a heap-allocated buffer created by libxml2 that may be
1093
 * returned, or that may be freed and replaced.
1094
 *
1095
 * @param msg  a pointer to the string in which to escape '%' characters.
1096
 * @returns the same string with all '%' characters escaped.
1097
 */
1098
xmlChar *
1099
xmlEscapeFormatString(xmlChar **msg)
1100
853k
{
1101
853k
    xmlChar *msgPtr = NULL;
1102
853k
    xmlChar *result = NULL;
1103
853k
    xmlChar *resultPtr = NULL;
1104
853k
    size_t count = 0;
1105
853k
    size_t msgLen = 0;
1106
853k
    size_t resultLen = 0;
1107
1108
853k
    if (!msg || !*msg)
1109
360
        return(NULL);
1110
1111
37.1M
    for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1112
36.3M
        ++msgLen;
1113
36.3M
        if (*msgPtr == '%')
1114
806
            ++count;
1115
36.3M
    }
1116
1117
853k
    if (count == 0)
1118
852k
        return(*msg);
1119
1120
368
    if ((count > INT_MAX) || (msgLen > INT_MAX - count))
1121
0
        return(NULL);
1122
368
    resultLen = msgLen + count + 1;
1123
368
    result = xmlMalloc(resultLen);
1124
368
    if (result == NULL) {
1125
        /* Clear *msg to prevent format string vulnerabilities in
1126
           out-of-memory situations. */
1127
1
        xmlFree(*msg);
1128
1
        *msg = NULL;
1129
1
        return(NULL);
1130
1
    }
1131
1132
1.01M
    for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1133
1.01M
        *resultPtr = *msgPtr;
1134
1.01M
        if (*msgPtr == '%')
1135
805
            *(++resultPtr) = '%';
1136
1.01M
    }
1137
367
    result[resultLen - 1] = '\0';
1138
1139
367
    xmlFree(*msg);
1140
367
    *msg = result;
1141
1142
367
    return *msg;
1143
368
}
1144