Coverage Report

Created: 2023-06-07 06:14

/src/libxml2/xmlstring.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * xmlstring.c : an XML string utilities module
3
 *
4
 * This module provides various utility functions for manipulating
5
 * the xmlChar* type. All functions named xmlStr* have been moved here
6
 * from the parser.c file (their original home).
7
 *
8
 * See Copyright for the status of this software.
9
 *
10
 * UTF8 string routines from:
11
 * William Brack <wbrack@mmm.com.hk>
12
 *
13
 * daniel@veillard.com
14
 */
15
16
#define IN_LIBXML
17
#include "libxml.h"
18
19
#include <stdlib.h>
20
#include <string.h>
21
#include <limits.h>
22
#include <libxml/xmlmemory.h>
23
#include <libxml/parserInternals.h>
24
#include <libxml/xmlstring.h>
25
26
#include "private/parser.h"
27
#include "private/string.h"
28
29
/************************************************************************
30
 *                                                                      *
31
 *                Commodity functions to handle xmlChars                *
32
 *                                                                      *
33
 ************************************************************************/
34
35
/**
36
 * xmlStrndup:
37
 * @cur:  the input xmlChar *
38
 * @len:  the len of @cur
39
 *
40
 * a strndup for array of xmlChar's
41
 *
42
 * Returns a new xmlChar * or NULL
43
 */
44
xmlChar *
45
61.9M
xmlStrndup(const xmlChar *cur, int len) {
46
61.9M
    xmlChar *ret;
47
48
61.9M
    if ((cur == NULL) || (len < 0)) return(NULL);
49
61.9M
    ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
50
61.9M
    if (ret == NULL) {
51
337k
        return(NULL);
52
337k
    }
53
61.6M
    memcpy(ret, cur, len);
54
61.6M
    ret[len] = 0;
55
61.6M
    return(ret);
56
61.9M
}
57
58
/**
59
 * xmlStrdup:
60
 * @cur:  the input xmlChar *
61
 *
62
 * a strdup for array of xmlChar's. Since they are supposed to be
63
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
64
 * a termination mark of '0'.
65
 *
66
 * Returns a new xmlChar * or NULL
67
 */
68
xmlChar *
69
37.3M
xmlStrdup(const xmlChar *cur) {
70
37.3M
    const xmlChar *p = cur;
71
72
37.3M
    if (cur == NULL) return(NULL);
73
8.13G
    while (*p != 0) p++; /* non input consuming */
74
33.1M
    return(xmlStrndup(cur, p - cur));
75
37.3M
}
76
77
/**
78
 * xmlCharStrndup:
79
 * @cur:  the input char *
80
 * @len:  the len of @cur
81
 *
82
 * a strndup for char's to xmlChar's
83
 *
84
 * Returns a new xmlChar * or NULL
85
 */
86
87
xmlChar *
88
1.20M
xmlCharStrndup(const char *cur, int len) {
89
1.20M
    int i;
90
1.20M
    xmlChar *ret;
91
92
1.20M
    if ((cur == NULL) || (len < 0)) return(NULL);
93
1.20M
    ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
94
1.20M
    if (ret == NULL) {
95
183
        return(NULL);
96
183
    }
97
37.7M
    for (i = 0;i < len;i++) {
98
        /* Explicit sign change */
99
36.5M
        ret[i] = (xmlChar) cur[i];
100
36.5M
        if (ret[i] == 0) return(ret);
101
36.5M
    }
102
1.20M
    ret[len] = 0;
103
1.20M
    return(ret);
104
1.20M
}
105
106
/**
107
 * xmlCharStrdup:
108
 * @cur:  the input char *
109
 *
110
 * a strdup for char's to xmlChar's
111
 *
112
 * Returns a new xmlChar * or NULL
113
 */
114
115
xmlChar *
116
1.20M
xmlCharStrdup(const char *cur) {
117
1.20M
    const char *p = cur;
118
119
1.20M
    if (cur == NULL) return(NULL);
120
37.7M
    while (*p != '\0') p++; /* non input consuming */
121
1.20M
    return(xmlCharStrndup(cur, p - cur));
122
1.20M
}
123
124
/**
125
 * xmlStrcmp:
126
 * @str1:  the first xmlChar *
127
 * @str2:  the second xmlChar *
128
 *
129
 * a strcmp for xmlChar's
130
 *
131
 * Returns the integer result of the comparison
132
 */
133
134
int
135
1.29M
xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
136
1.29M
    if (str1 == str2) return(0);
137
1.29M
    if (str1 == NULL) return(-1);
138
1.29M
    if (str2 == NULL) return(1);
139
1.29M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
140
1.29M
    return(strcmp((const char *)str1, (const char *)str2));
141
#else
142
    do {
143
        int tmp = *str1++ - *str2;
144
        if (tmp != 0) return(tmp);
145
    } while (*str2++ != 0);
146
    return 0;
147
#endif
148
1.29M
}
149
150
/**
151
 * xmlStrEqual:
152
 * @str1:  the first xmlChar *
153
 * @str2:  the second xmlChar *
154
 *
155
 * Check if both strings are equal of have same content.
156
 * Should be a bit more readable and faster than xmlStrcmp()
157
 *
158
 * Returns 1 if they are equal, 0 if they are different
159
 */
160
161
int
162
469M
xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
163
469M
    if (str1 == str2) return(1);
164
327M
    if (str1 == NULL) return(0);
165
327M
    if (str2 == NULL) return(0);
166
327M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
167
327M
    return(strcmp((const char *)str1, (const char *)str2) == 0);
168
#else
169
    do {
170
        if (*str1++ != *str2) return(0);
171
    } while (*str2++);
172
    return(1);
173
#endif
174
327M
}
175
176
/**
177
 * xmlStrQEqual:
178
 * @pref:  the prefix of the QName
179
 * @name:  the localname of the QName
180
 * @str:  the second xmlChar *
181
 *
182
 * Check if a QName is Equal to a given string
183
 *
184
 * Returns 1 if they are equal, 0 if they are different
185
 */
186
187
int
188
183k
xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
189
183k
    if (pref == NULL) return(xmlStrEqual(name, str));
190
151k
    if (name == NULL) return(0);
191
151k
    if (str == NULL) return(0);
192
193
760k
    do {
194
760k
        if (*pref++ != *str) return(0);
195
760k
    } while ((*str++) && (*pref));
196
149k
    if (*str++ != ':') return(0);
197
6.44M
    do {
198
6.44M
        if (*name++ != *str) return(0);
199
6.44M
    } while (*str++);
200
147k
    return(1);
201
149k
}
202
203
/**
204
 * xmlStrncmp:
205
 * @str1:  the first xmlChar *
206
 * @str2:  the second xmlChar *
207
 * @len:  the max comparison length
208
 *
209
 * a strncmp for xmlChar's
210
 *
211
 * Returns the integer result of the comparison
212
 */
213
214
int
215
15.3M
xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
216
15.3M
    if (len <= 0) return(0);
217
15.2M
    if (str1 == str2) return(0);
218
15.2M
    if (str1 == NULL) return(-1);
219
15.2M
    if (str2 == NULL) return(1);
220
15.2M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
221
15.2M
    return(strncmp((const char *)str1, (const char *)str2, len));
222
#else
223
    do {
224
        int tmp = *str1++ - *str2;
225
        if (tmp != 0 || --len == 0) return(tmp);
226
    } while (*str2++ != 0);
227
    return 0;
228
#endif
229
15.2M
}
230
231
/*
 * Byte-wise case folding table used by the case-insensitive helpers:
 * 'A'..'Z' (0x41..0x5A) map to 'a'..'z' (0x61..0x7A), every other
 * byte maps to itself.
 */
static const xmlChar casemap[256] = {
    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
    0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
    0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    /* 0x5B ('[') is not a letter and must map to itself, not to '{' */
    0x78,0x79,0x7A,0x5B,0x5C,0x5D,0x5E,0x5F,
    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
    0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
    0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
    0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
    0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
    0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
    0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
    0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
    0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
    0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
    0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
    0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
    0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
    0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
    0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
};
265
266
/**
267
 * xmlStrcasecmp:
268
 * @str1:  the first xmlChar *
269
 * @str2:  the second xmlChar *
270
 *
271
 * a strcasecmp for xmlChar's
272
 *
273
 * Returns the integer result of the comparison
274
 */
275
276
int
277
4.83M
xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
278
4.83M
    register int tmp;
279
280
4.83M
    if (str1 == str2) return(0);
281
4.83M
    if (str1 == NULL) return(-1);
282
4.81M
    if (str2 == NULL) return(1);
283
5.98M
    do {
284
5.98M
        tmp = casemap[*str1++] - casemap[*str2];
285
5.98M
        if (tmp != 0) return(tmp);
286
5.98M
    } while (*str2++ != 0);
287
20.2k
    return 0;
288
4.81M
}
289
290
/**
291
 * xmlStrncasecmp:
292
 * @str1:  the first xmlChar *
293
 * @str2:  the second xmlChar *
294
 * @len:  the max comparison length
295
 *
296
 * a strncasecmp for xmlChar's
297
 *
298
 * Returns the integer result of the comparison
299
 */
300
301
int
302
162M
xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
303
162M
    register int tmp;
304
305
162M
    if (len <= 0) return(0);
306
162M
    if (str1 == str2) return(0);
307
162M
    if (str1 == NULL) return(-1);
308
162M
    if (str2 == NULL) return(1);
309
333M
    do {
310
333M
        tmp = casemap[*str1++] - casemap[*str2];
311
333M
        if (tmp != 0 || --len == 0) return(tmp);
312
333M
    } while (*str2++ != 0);
313
0
    return 0;
314
162M
}
315
316
/**
317
 * xmlStrchr:
318
 * @str:  the xmlChar * array
319
 * @val:  the xmlChar to search
320
 *
321
 * a strchr for xmlChar's
322
 *
323
 * Returns the xmlChar * for the first occurrence or NULL.
324
 */
325
326
const xmlChar *
327
18.6M
xmlStrchr(const xmlChar *str, xmlChar val) {
328
18.6M
    if (str == NULL) return(NULL);
329
629M
    while (*str != 0) { /* non input consuming */
330
612M
        if (*str == val) return((xmlChar *) str);
331
611M
        str++;
332
611M
    }
333
16.8M
    return(NULL);
334
18.5M
}
335
336
/**
337
 * xmlStrstr:
338
 * @str:  the xmlChar * array (haystack)
339
 * @val:  the xmlChar to search (needle)
340
 *
341
 * a strstr for xmlChar's
342
 *
343
 * Returns the xmlChar * for the first occurrence or NULL.
344
 */
345
346
const xmlChar *
347
6.64M
xmlStrstr(const xmlChar *str, const xmlChar *val) {
348
6.64M
    int n;
349
350
6.64M
    if (str == NULL) return(NULL);
351
6.64M
    if (val == NULL) return(NULL);
352
6.64M
    n = xmlStrlen(val);
353
354
6.64M
    if (n == 0) return(str);
355
165M
    while (*str != 0) { /* non input consuming */
356
159M
        if (*str == *val) {
357
1.74M
            if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
358
1.74M
        }
359
159M
        str++;
360
159M
    }
361
6.52M
    return(NULL);
362
6.64M
}
363
364
/**
365
 * xmlStrcasestr:
366
 * @str:  the xmlChar * array (haystack)
367
 * @val:  the xmlChar to search (needle)
368
 *
369
 * a case-ignoring strstr for xmlChar's
370
 *
371
 * Returns the xmlChar * for the first occurrence or NULL.
372
 */
373
374
const xmlChar *
375
0
xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
376
0
    int n;
377
378
0
    if (str == NULL) return(NULL);
379
0
    if (val == NULL) return(NULL);
380
0
    n = xmlStrlen(val);
381
382
0
    if (n == 0) return(str);
383
0
    while (*str != 0) { /* non input consuming */
384
0
        if (casemap[*str] == casemap[*val])
385
0
            if (!xmlStrncasecmp(str, val, n)) return(str);
386
0
        str++;
387
0
    }
388
0
    return(NULL);
389
0
}
390
391
/**
392
 * xmlStrsub:
393
 * @str:  the xmlChar * array (haystack)
394
 * @start:  the index of the first char (zero based)
395
 * @len:  the length of the substring
396
 *
397
 * Extract a substring of a given string
398
 *
399
 * Returns the xmlChar * for the first occurrence or NULL.
400
 */
401
402
xmlChar *
403
0
xmlStrsub(const xmlChar *str, int start, int len) {
404
0
    int i;
405
406
0
    if (str == NULL) return(NULL);
407
0
    if (start < 0) return(NULL);
408
0
    if (len < 0) return(NULL);
409
410
0
    for (i = 0;i < start;i++) {
411
0
        if (*str == 0) return(NULL);
412
0
        str++;
413
0
    }
414
0
    if (*str == 0) return(NULL);
415
0
    return(xmlStrndup(str, len));
416
0
}
417
418
/**
419
 * xmlStrlen:
420
 * @str:  the xmlChar * array
421
 *
422
 * length of a xmlChar's string
423
 *
424
 * Returns the number of xmlChar contained in the ARRAY.
425
 */
426
427
int
428
135M
xmlStrlen(const xmlChar *str) {
429
135M
    size_t len = str ? strlen((const char *)str) : 0;
430
135M
    return(len > INT_MAX ? 0 : len);
431
135M
}
432
433
/**
434
 * xmlStrncat:
435
 * @cur:  the original xmlChar * array
436
 * @add:  the xmlChar * array added
437
 * @len:  the length of @add
438
 *
439
 * a strncat for array of xmlChar's, it will extend @cur with the len
440
 * first bytes of @add. Note that if @len < 0 then this is an API error
441
 * and NULL will be returned.
442
 *
443
 * Returns a new xmlChar *, the original @cur is reallocated and should
444
 * not be freed.
445
 */
446
447
xmlChar *
448
636k
xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
449
636k
    int size;
450
636k
    xmlChar *ret;
451
452
636k
    if ((add == NULL) || (len == 0))
453
25.2k
        return(cur);
454
610k
    if (len < 0)
455
0
  return(NULL);
456
610k
    if (cur == NULL)
457
30.7k
        return(xmlStrndup(add, len));
458
459
580k
    size = xmlStrlen(cur);
460
580k
    if ((size < 0) || (size > INT_MAX - len))
461
0
        return(NULL);
462
580k
    ret = (xmlChar *) xmlRealloc(cur, (size_t) size + len + 1);
463
580k
    if (ret == NULL) {
464
1.12k
        return(cur);
465
1.12k
    }
466
579k
    memcpy(&ret[size], add, len);
467
579k
    ret[size + len] = 0;
468
579k
    return(ret);
469
580k
}
470
471
/**
472
 * xmlStrncatNew:
473
 * @str1:  first xmlChar string
474
 * @str2:  second xmlChar string
475
 * @len:  the len of @str2 or < 0
476
 *
477
 * same as xmlStrncat, but creates a new string.  The original
478
 * two strings are not freed. If @len is < 0 then the length
479
 * will be calculated automatically.
480
 *
481
 * Returns a new xmlChar * or NULL
482
 */
483
xmlChar *
484
102k
xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
485
102k
    int size;
486
102k
    xmlChar *ret;
487
488
102k
    if (len < 0) {
489
0
        len = xmlStrlen(str2);
490
0
        if (len < 0)
491
0
            return(NULL);
492
0
    }
493
102k
    if ((str2 == NULL) || (len == 0))
494
0
        return(xmlStrdup(str1));
495
102k
    if (str1 == NULL)
496
3
        return(xmlStrndup(str2, len));
497
498
102k
    size = xmlStrlen(str1);
499
102k
    if ((size < 0) || (size > INT_MAX - len))
500
0
        return(NULL);
501
102k
    ret = (xmlChar *) xmlMalloc((size_t) size + len + 1);
502
102k
    if (ret == NULL) {
503
298
        return(xmlStrndup(str1, size));
504
298
    }
505
101k
    memcpy(ret, str1, size);
506
101k
    memcpy(&ret[size], str2, len);
507
101k
    ret[size + len] = 0;
508
101k
    return(ret);
509
102k
}
510
511
/**
512
 * xmlStrcat:
513
 * @cur:  the original xmlChar * array
514
 * @add:  the xmlChar * array added
515
 *
516
 * a strcat for array of xmlChar's. Since they are supposed to be
517
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
518
 * a termination mark of '0'.
519
 *
520
 * Returns a new xmlChar * containing the concatenated string. The original
521
 * @cur is reallocated and should not be freed.
522
 */
523
xmlChar *
524
1.09M
xmlStrcat(xmlChar *cur, const xmlChar *add) {
525
1.09M
    const xmlChar *p = add;
526
527
1.09M
    if (add == NULL) return(cur);
528
1.06M
    if (cur == NULL)
529
547k
        return(xmlStrdup(add));
530
531
1.97G
    while (*p != 0) p++; /* non input consuming */
532
513k
    return(xmlStrncat(cur, add, p - add));
533
1.06M
}
534
535
/**
536
 * xmlStrPrintf:
537
 * @buf:   the result buffer.
538
 * @len:   the result buffer length.
539
 * @msg:   the message with printf formatting.
540
 * @...:   extra parameters for the message.
541
 *
542
 * Formats @msg and places result into @buf.
543
 *
544
 * Returns the number of characters written to @buf or -1 if an error occurs.
545
 */
546
int
547
63.2k
xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
548
63.2k
    va_list args;
549
63.2k
    int ret;
550
551
63.2k
    if((buf == NULL) || (msg == NULL)) {
552
0
        return(-1);
553
0
    }
554
555
63.2k
    va_start(args, msg);
556
63.2k
    ret = vsnprintf((char *) buf, len, (const char *) msg, args);
557
63.2k
    va_end(args);
558
63.2k
    buf[len - 1] = 0; /* be safe ! */
559
560
63.2k
    return(ret);
561
63.2k
}
562
563
/**
564
 * xmlStrVPrintf:
565
 * @buf:   the result buffer.
566
 * @len:   the result buffer length.
567
 * @msg:   the message with printf formatting.
568
 * @ap:    extra parameters for the message.
569
 *
570
 * Formats @msg and places result into @buf.
571
 *
572
 * Returns the number of characters written to @buf or -1 if an error occurs.
573
 */
574
int
575
0
xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
576
0
    int ret;
577
578
0
    if((buf == NULL) || (msg == NULL)) {
579
0
        return(-1);
580
0
    }
581
582
0
    ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
583
0
    buf[len - 1] = 0; /* be safe ! */
584
585
0
    return(ret);
586
0
}
587
588
/************************************************************************
589
 *                                                                      *
590
 *              Generic UTF8 handling routines                          *
591
 *                                                                      *
592
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
593
 *                                                                      *
594
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
595
 * 0000 0000-0000 007F   0xxxxxxx                                       *
596
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
597
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
598
 *                                                                      *
599
 * I hope we won't use values > 0xFFFF anytime soon !                   *
600
 *                                                                      *
601
 ************************************************************************/
602
603
604
/**
605
 * xmlUTF8Size:
606
 * @utf: pointer to the UTF8 character
607
 *
608
 * calculates the internal size of a UTF8 character
609
 *
610
 * returns the numbers of bytes in the character, -1 on format error
611
 */
612
int
613
1.72M
xmlUTF8Size(const xmlChar *utf) {
614
1.72M
    xmlChar mask;
615
1.72M
    int len;
616
617
1.72M
    if (utf == NULL)
618
0
        return -1;
619
1.72M
    if (*utf < 0x80)
620
1.42M
        return 1;
621
    /* check valid UTF8 character */
622
300k
    if (!(*utf & 0x40))
623
6.39k
        return -1;
624
    /* determine number of bytes in char */
625
294k
    len = 2;
626
297k
    for (mask=0x20; mask != 0; mask>>=1) {
627
297k
        if (!(*utf & mask))
628
294k
            return len;
629
3.62k
        len++;
630
3.62k
    }
631
277
    return -1;
632
294k
}
633
634
/**
635
 * xmlUTF8Charcmp:
636
 * @utf1: pointer to first UTF8 char
637
 * @utf2: pointer to second UTF8 char
638
 *
639
 * compares the two UCS4 values
640
 *
641
 * returns result of the compare as with xmlStrncmp
642
 */
643
int
644
1.72M
xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
645
646
1.72M
    if (utf1 == NULL ) {
647
0
        if (utf2 == NULL)
648
0
            return 0;
649
0
        return -1;
650
0
    }
651
1.72M
    return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
652
1.72M
}
653
654
/**
655
 * xmlUTF8Strlen:
656
 * @utf:  a sequence of UTF-8 encoded bytes
657
 *
658
 * compute the length of an UTF8 string, it doesn't do a full UTF8
659
 * checking of the content of the string.
660
 *
661
 * Returns the number of characters in the string or -1 in case of error
662
 */
663
int
664
8.88k
xmlUTF8Strlen(const xmlChar *utf) {
665
8.88k
    size_t ret = 0;
666
667
8.88k
    if (utf == NULL)
668
8
        return(-1);
669
670
10.8M
    while (*utf != 0) {
671
10.8M
        if (utf[0] & 0x80) {
672
3.68M
            if ((utf[1] & 0xc0) != 0x80)
673
36
                return(-1);
674
3.68M
            if ((utf[0] & 0xe0) == 0xe0) {
675
3.49M
                if ((utf[2] & 0xc0) != 0x80)
676
0
                    return(-1);
677
3.49M
                if ((utf[0] & 0xf0) == 0xf0) {
678
227
                    if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
679
0
                        return(-1);
680
227
                    utf += 4;
681
3.49M
                } else {
682
3.49M
                    utf += 3;
683
3.49M
                }
684
3.49M
            } else {
685
194k
                utf += 2;
686
194k
            }
687
7.19M
        } else {
688
7.19M
            utf++;
689
7.19M
        }
690
10.8M
        ret++;
691
10.8M
    }
692
8.83k
    return(ret > INT_MAX ? 0 : ret);
693
8.87k
}
694
695
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. Malformed input is
 * rejected: a trailing byte used as lead byte, a truncated sequence,
 * an overlong encoding, a UTF-16 surrogate (0xD800-0xDFFF) and values
 * above 0x10FFFF all count as errors.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c < 0x80) {
        /* 1-byte code */
        *len = 1;
    } else {
        if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
            goto error;
        if (c < 0xe0) {
            /*
             * 0x80-0xBF are trailing bytes and cannot start a
             * character; 0xC0 and 0xC1 only encode overlong forms.
             */
            if (c < 0xc2)
                goto error;
            /* 2-byte code */
            c = (c & 0x1f) << 6;
            c |= utf[1] & 0x3f;
            *len = 2;
        } else {
            if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
                goto error;
            if (c < 0xf0) {
                /* 3-byte code */
                c = (c & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
                /* overlong form or surrogate */
                if ((c < 0x800) || ((c >= 0xd800) && (c < 0xe000)))
                    goto error;
                *len = 3;
            } else {
                if (*len < 4)
                    goto error;
                if (((c & 0xf8) != 0xf0) || ((utf[3] & 0xc0) != 0x80))
                    goto error;
                /* 4-byte code */
                c = (c & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
                /* overlong form or beyond the Unicode range */
                if ((c < 0x10000) || (c >= 0x110000))
                    goto error;
                *len = 4;
            }
        }
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
764
765
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being structurally valid UTF-8. @utf is assumed to
 * be null-terminated. Each character must be one of:
 *    0xxxxxxx                                      1-byte
 *    110xxxxx 10xxxxxx                             2-byte
 *    1110xxxx 10xxxxxx 10xxxxxx                    3-byte
 *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           4-byte
 * This function is not super-strict: it accepts longer sequences than
 * necessary for a value, and while it enforces the 4-byte maximum
 * size it does not check for the 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    if (utf == NULL)
        return(0);

    while (*utf != 0) {      /* string is 0-terminated */
        unsigned char lead = *utf;
        int width;
        int i;

        if ((lead & 0x80) == 0x00)          /* starts with 0 */
            width = 1;
        else if ((lead & 0xe0) == 0xc0)     /* starts with 110 */
            width = 2;
        else if ((lead & 0xf0) == 0xe0)     /* starts with 1110 */
            width = 3;
        else if ((lead & 0xf8) == 0xf0)     /* starts with 11110 */
            width = 4;
        else                                /* unknown encoding */
            return(0);

        /* every following byte must look like 10xxxxxx */
        for (i = 1; i < width; i++) {
            if ((utf[i] & 0xc0) != 0x80)
                return(0);
        }
        utf += width;
    }
    return(1);
}
819
820
/**
821
 * xmlUTF8Strsize:
822
 * @utf:  a sequence of UTF-8 encoded bytes
823
 * @len:  the number of characters in the array
824
 *
825
 * storage size of an UTF8 string
826
 * the behaviour is not guaranteed if the input string is not UTF-8
827
 *
828
 * Returns the storage size of
829
 * the first 'len' characters of ARRAY
830
 */
831
832
int
833
7.09M
xmlUTF8Strsize(const xmlChar *utf, int len) {
834
7.09M
    const xmlChar *ptr=utf;
835
7.09M
    int ch;
836
7.09M
    size_t ret;
837
838
7.09M
    if (utf == NULL)
839
0
        return(0);
840
841
7.09M
    if (len <= 0)
842
705
        return(0);
843
844
15.1M
    while ( len-- > 0) {
845
8.08M
        if ( !*ptr )
846
3.58k
            break;
847
8.08M
        if ( (ch = *ptr++) & 0x80)
848
6.42M
            while ((ch<<=1) & 0x80 ) {
849
3.84M
    if (*ptr == 0) break;
850
3.84M
                ptr++;
851
3.84M
      }
852
8.08M
    }
853
7.09M
    ret = ptr - utf;
854
7.09M
    return (ret > INT_MAX ? 0 : ret);
855
7.09M
}
856
857
858
/**
859
 * xmlUTF8Strndup:
860
 * @utf:  the input UTF8 *
861
 * @len:  the len of @utf (in chars)
862
 *
863
 * a strndup for array of UTF8's
864
 *
865
 * Returns a new UTF8 * or NULL
866
 */
867
xmlChar *
868
7.30k
xmlUTF8Strndup(const xmlChar *utf, int len) {
869
7.30k
    xmlChar *ret;
870
7.30k
    int i;
871
872
7.30k
    if ((utf == NULL) || (len < 0)) return(NULL);
873
7.30k
    i = xmlUTF8Strsize(utf, len);
874
7.30k
    ret = (xmlChar *) xmlMallocAtomic((size_t) i + 1);
875
7.30k
    if (ret == NULL) {
876
5
        return(NULL);
877
5
    }
878
7.30k
    memcpy(ret, utf, i);
879
7.30k
    ret[i] = 0;
880
7.30k
    return(ret);
881
7.30k
}
882
883
/**
884
 * xmlUTF8Strpos:
885
 * @utf:  the input UTF8 *
886
 * @pos:  the position of the desired UTF8 char (in chars)
887
 *
888
 * a function to provide the equivalent of fetching a
889
 * character from a string array
890
 *
891
 * Returns a pointer to the UTF8 character or NULL
892
 */
893
const xmlChar *
894
3.23k
xmlUTF8Strpos(const xmlChar *utf, int pos) {
895
3.23k
    int ch;
896
897
3.23k
    if (utf == NULL) return(NULL);
898
3.23k
    if (pos < 0)
899
0
        return(NULL);
900
41.2k
    while (pos--) {
901
37.9k
        if ((ch=*utf++) == 0) return(NULL);
902
37.9k
        if ( ch & 0x80 ) {
903
            /* if not simple ascii, verify proper format */
904
14.8k
            if ( (ch & 0xc0) != 0xc0 )
905
0
                return(NULL);
906
            /* then skip over remaining bytes for this char */
907
29.8k
            while ( (ch <<= 1) & 0x80 )
908
14.9k
                if ( (*utf++ & 0xc0) != 0x80 )
909
0
                    return(NULL);
910
14.8k
        }
911
37.9k
    }
912
3.23k
    return((xmlChar *)utf);
913
3.23k
}
914
915
/**
916
 * xmlUTF8Strloc:
917
 * @utf:  the input UTF8 *
918
 * @utfchar:  the UTF8 character to be found
919
 *
920
 * a function to provide the relative location of a UTF8 char
921
 *
922
 * Returns the relative character position of the desired char
923
 * or -1 if not found
924
 */
925
int
926
1.61M
xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
927
1.61M
    size_t i;
928
1.61M
    int size;
929
1.61M
    int ch;
930
931
1.61M
    if (utf==NULL || utfchar==NULL) return -1;
932
1.61M
    size = xmlUTF8Strsize(utfchar, 1);
933
11.6M
        for(i=0; (ch=*utf) != 0; i++) {
934
10.0M
            if (xmlStrncmp(utf, utfchar, size)==0)
935
4.74k
                return(i > INT_MAX ? 0 : i);
936
10.0M
            utf++;
937
10.0M
            if ( ch & 0x80 ) {
938
                /* if not simple ascii, verify proper format */
939
18.8k
                if ( (ch & 0xc0) != 0xc0 )
940
0
                    return(-1);
941
                /* then skip over remaining bytes for this char */
942
37.8k
                while ( (ch <<= 1) & 0x80 )
943
19.0k
                    if ( (*utf++ & 0xc0) != 0x80 )
944
0
                        return(-1);
945
18.8k
            }
946
10.0M
        }
947
948
1.61M
    return(-1);
949
1.61M
}
950
/**
951
 * xmlUTF8Strsub:
952
 * @utf:  a sequence of UTF-8 encoded bytes
953
 * @start: relative pos of first char
954
 * @len:   total number to copy
955
 *
956
 * Create a substring from a given UTF-8 string
957
 * Note:  positions are given in units of UTF-8 chars
958
 *
959
 * Returns a pointer to a newly created string
960
 * or NULL if any problem
961
 */
962
963
xmlChar *
964
5.52k
xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
965
5.52k
    int i;
966
5.52k
    int ch;
967
968
5.52k
    if (utf == NULL) return(NULL);
969
5.52k
    if (start < 0) return(NULL);
970
5.52k
    if (len < 0) return(NULL);
971
972
    /*
973
     * Skip over any leading chars
974
     */
975
11.3k
    for (i = 0;i < start;i++) {
976
7.53k
        if ((ch=*utf++) == 0) return(NULL);
977
5.87k
        if ( ch & 0x80 ) {
978
            /* if not simple ascii, verify proper format */
979
437
            if ( (ch & 0xc0) != 0xc0 )
980
0
                return(NULL);
981
            /* then skip over remaining bytes for this char */
982
903
            while ( (ch <<= 1) & 0x80 )
983
466
                if ( (*utf++ & 0xc0) != 0x80 )
984
0
                    return(NULL);
985
437
        }
986
5.87k
    }
987
988
3.86k
    return(xmlUTF8Strndup(utf, len));
989
5.52k
}
990
991
/**
992
 * xmlEscapeFormatString:
993
 * @msg:  a pointer to the string in which to escape '%' characters.
994
 * Must be a heap-allocated buffer created by libxml2 that may be
995
 * returned, or that may be freed and replaced.
996
 *
997
 * Replaces the string pointed to by 'msg' with an escaped string.
998
 * Returns the same string with all '%' characters escaped.
999
 */
1000
xmlChar *
1001
xmlEscapeFormatString(xmlChar **msg)
1002
0
{
1003
0
    xmlChar *msgPtr = NULL;
1004
0
    xmlChar *result = NULL;
1005
0
    xmlChar *resultPtr = NULL;
1006
0
    size_t count = 0;
1007
0
    size_t msgLen = 0;
1008
0
    size_t resultLen = 0;
1009
1010
0
    if (!msg || !*msg)
1011
0
        return(NULL);
1012
1013
0
    for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1014
0
        ++msgLen;
1015
0
        if (*msgPtr == '%')
1016
0
            ++count;
1017
0
    }
1018
1019
0
    if (count == 0)
1020
0
        return(*msg);
1021
1022
0
    if ((count > INT_MAX) || (msgLen > INT_MAX - count))
1023
0
        return(NULL);
1024
0
    resultLen = msgLen + count + 1;
1025
0
    result = (xmlChar *) xmlMallocAtomic(resultLen);
1026
0
    if (result == NULL) {
1027
        /* Clear *msg to prevent format string vulnerabilities in
1028
           out-of-memory situations. */
1029
0
        xmlFree(*msg);
1030
0
        *msg = NULL;
1031
0
        return(NULL);
1032
0
    }
1033
1034
0
    for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1035
0
        *resultPtr = *msgPtr;
1036
0
        if (*msgPtr == '%')
1037
0
            *(++resultPtr) = '%';
1038
0
    }
1039
0
    result[resultLen - 1] = '\0';
1040
1041
0
    xmlFree(*msg);
1042
0
    *msg = result;
1043
1044
0
    return *msg;
1045
0
}
1046