Coverage Report

Created: 2022-11-15 06:34

/src/libxml2/xmlstring.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * xmlstring.c : an XML string utilities module
3
 *
4
 * This module provides various utility functions for manipulating
5
 * the xmlChar* type. All functions named xmlStr* have been moved here
6
 * from the parser.c file (their original home).
7
 *
8
 * See Copyright for the status of this software.
9
 *
10
 * UTF8 string routines from:
11
 * William Brack <wbrack@mmm.com.hk>
12
 *
13
 * daniel@veillard.com
14
 */
15
16
#define IN_LIBXML
17
#include "libxml.h"
18
19
#include <stdlib.h>
20
#include <string.h>
21
#include <limits.h>
22
#include <libxml/xmlmemory.h>
23
#include <libxml/parserInternals.h>
24
#include <libxml/xmlstring.h>
25
26
#include "private/parser.h"
27
#include "private/string.h"
28
29
/************************************************************************
30
 *                                                                      *
31
 *                Commodity functions to handle xmlChars                *
32
 *                                                                      *
33
 ************************************************************************/
34
35
/**
36
 * xmlStrndup:
37
 * @cur:  the input xmlChar *
38
 * @len:  the len of @cur
39
 *
40
 * a strndup for array of xmlChar's
41
 *
42
 * Returns a new xmlChar * or NULL
43
 */
44
xmlChar *
45
1.07M
xmlStrndup(const xmlChar *cur, int len) {
46
1.07M
    xmlChar *ret;
47
48
1.07M
    if ((cur == NULL) || (len < 0)) return(NULL);
49
1.07M
    ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
50
1.07M
    if (ret == NULL) {
51
0
        xmlErrMemory(NULL, NULL);
52
0
        return(NULL);
53
0
    }
54
1.07M
    memcpy(ret, cur, len);
55
1.07M
    ret[len] = 0;
56
1.07M
    return(ret);
57
1.07M
}
58
59
/**
60
 * xmlStrdup:
61
 * @cur:  the input xmlChar *
62
 *
63
 * a strdup for array of xmlChar's. Since they are supposed to be
64
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
65
 * a termination mark of '0'.
66
 *
67
 * Returns a new xmlChar * or NULL
68
 */
69
xmlChar *
70
1.55M
xmlStrdup(const xmlChar *cur) {
71
1.55M
    const xmlChar *p = cur;
72
73
1.55M
    if (cur == NULL) return(NULL);
74
1.19G
    while (*p != 0) p++; /* non input consuming */
75
708k
    return(xmlStrndup(cur, p - cur));
76
1.55M
}
77
78
/**
79
 * xmlCharStrndup:
80
 * @cur:  the input char *
81
 * @len:  the len of @cur
82
 *
83
 * a strndup for char's to xmlChar's
84
 *
85
 * Returns a new xmlChar * or NULL
86
 */
87
88
xmlChar *
89
2.61k
xmlCharStrndup(const char *cur, int len) {
90
2.61k
    int i;
91
2.61k
    xmlChar *ret;
92
93
2.61k
    if ((cur == NULL) || (len < 0)) return(NULL);
94
2.61k
    ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
95
2.61k
    if (ret == NULL) {
96
0
        xmlErrMemory(NULL, NULL);
97
0
        return(NULL);
98
0
    }
99
1.13M
    for (i = 0;i < len;i++) {
100
        /* Explicit sign change */
101
1.13M
        ret[i] = (xmlChar) cur[i];
102
1.13M
        if (ret[i] == 0) return(ret);
103
1.13M
    }
104
2.61k
    ret[len] = 0;
105
2.61k
    return(ret);
106
2.61k
}
107
108
/**
109
 * xmlCharStrdup:
110
 * @cur:  the input char *
111
 *
112
 * a strdup for char's to xmlChar's
113
 *
114
 * Returns a new xmlChar * or NULL
115
 */
116
117
xmlChar *
118
2.61k
xmlCharStrdup(const char *cur) {
119
2.61k
    const char *p = cur;
120
121
2.61k
    if (cur == NULL) return(NULL);
122
1.13M
    while (*p != '\0') p++; /* non input consuming */
123
2.61k
    return(xmlCharStrndup(cur, p - cur));
124
2.61k
}
125
126
/**
127
 * xmlStrcmp:
128
 * @str1:  the first xmlChar *
129
 * @str2:  the second xmlChar *
130
 *
131
 * a strcmp for xmlChar's
132
 *
133
 * Returns the integer result of the comparison
134
 */
135
136
int
137
26.0k
xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
138
26.0k
    if (str1 == str2) return(0);
139
26.0k
    if (str1 == NULL) return(-1);
140
26.0k
    if (str2 == NULL) return(1);
141
26.0k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
142
26.0k
    return(strcmp((const char *)str1, (const char *)str2));
143
#else
144
    do {
145
        int tmp = *str1++ - *str2;
146
        if (tmp != 0) return(tmp);
147
    } while (*str2++ != 0);
148
    return 0;
149
#endif
150
26.0k
}
151
152
/**
153
 * xmlStrEqual:
154
 * @str1:  the first xmlChar *
155
 * @str2:  the second xmlChar *
156
 *
157
 * Check if both strings are equal of have same content.
158
 * Should be a bit more readable and faster than xmlStrcmp()
159
 *
160
 * Returns 1 if they are equal, 0 if they are different
161
 */
162
163
int
164
4.91M
xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
165
4.91M
    if (str1 == str2) return(1);
166
4.86M
    if (str1 == NULL) return(0);
167
4.86M
    if (str2 == NULL) return(0);
168
4.86M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
169
4.86M
    return(strcmp((const char *)str1, (const char *)str2) == 0);
170
#else
171
    do {
172
        if (*str1++ != *str2) return(0);
173
    } while (*str2++);
174
    return(1);
175
#endif
176
4.86M
}
177
178
/**
179
 * xmlStrQEqual:
180
 * @pref:  the prefix of the QName
181
 * @name:  the localname of the QName
182
 * @str:  the second xmlChar *
183
 *
184
 * Check if a QName is Equal to a given string
185
 *
186
 * Returns 1 if they are equal, 0 if they are different
187
 */
188
189
int
190
384
xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
191
384
    if (pref == NULL) return(xmlStrEqual(name, str));
192
171
    if (name == NULL) return(0);
193
171
    if (str == NULL) return(0);
194
195
375
    do {
196
375
        if (*pref++ != *str) return(0);
197
375
    } while ((*str++) && (*pref));
198
117
    if (*str++ != ':') return(0);
199
1.24k
    do {
200
1.24k
        if (*name++ != *str) return(0);
201
1.24k
    } while (*str++);
202
109
    return(1);
203
117
}
204
205
/**
206
 * xmlStrncmp:
207
 * @str1:  the first xmlChar *
208
 * @str2:  the second xmlChar *
209
 * @len:  the max comparison length
210
 *
211
 * a strncmp for xmlChar's
212
 *
213
 * Returns the integer result of the comparison
214
 */
215
216
int
217
11.5k
xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
218
11.5k
    if (len <= 0) return(0);
219
11.3k
    if (str1 == str2) return(0);
220
11.3k
    if (str1 == NULL) return(-1);
221
10.4k
    if (str2 == NULL) return(1);
222
10.4k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
223
10.4k
    return(strncmp((const char *)str1, (const char *)str2, len));
224
#else
225
    do {
226
        int tmp = *str1++ - *str2;
227
        if (tmp != 0 || --len == 0) return(tmp);
228
    } while (*str2++ != 0);
229
    return 0;
230
#endif
231
10.4k
}
232
233
/*
 * casemap:
 *
 * Byte translation table for case-insensitive comparison: the ASCII
 * upper-case letters 'A'..'Z' (0x41..0x5A) map to 'a'..'z'
 * (0x61..0x7A); every other byte value maps to itself. In particular
 * '[' (0x5B) maps to itself, so it is not confused with '{' (0x7B).
 */
static const xmlChar casemap[256] = {
    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
    0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
    0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x5B,0x5C,0x5D,0x5E,0x5F,
    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
    0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
    0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
    0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
    0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
    0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
    0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
    0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
    0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
    0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
    0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
    0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
    0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
    0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
    0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
};
267
268
/**
269
 * xmlStrcasecmp:
270
 * @str1:  the first xmlChar *
271
 * @str2:  the second xmlChar *
272
 *
273
 * a strcasecmp for xmlChar's
274
 *
275
 * Returns the integer result of the comparison
276
 */
277
278
int
279
70.4k
xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
280
70.4k
    register int tmp;
281
282
70.4k
    if (str1 == str2) return(0);
283
70.4k
    if (str1 == NULL) return(-1);
284
69.6k
    if (str2 == NULL) return(1);
285
100k
    do {
286
100k
        tmp = casemap[*str1++] - casemap[*str2];
287
100k
        if (tmp != 0) return(tmp);
288
100k
    } while (*str2++ != 0);
289
7.03k
    return 0;
290
69.6k
}
291
292
/**
293
 * xmlStrncasecmp:
294
 * @str1:  the first xmlChar *
295
 * @str2:  the second xmlChar *
296
 * @len:  the max comparison length
297
 *
298
 * a strncasecmp for xmlChar's
299
 *
300
 * Returns the integer result of the comparison
301
 */
302
303
int
304
10.8k
xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
305
10.8k
    register int tmp;
306
307
10.8k
    if (len <= 0) return(0);
308
10.8k
    if (str1 == str2) return(0);
309
10.8k
    if (str1 == NULL) return(-1);
310
10.8k
    if (str2 == NULL) return(1);
311
13.0k
    do {
312
13.0k
        tmp = casemap[*str1++] - casemap[*str2];
313
13.0k
        if (tmp != 0 || --len == 0) return(tmp);
314
13.0k
    } while (*str2++ != 0);
315
0
    return 0;
316
10.8k
}
317
318
/**
319
 * xmlStrchr:
320
 * @str:  the xmlChar * array
321
 * @val:  the xmlChar to search
322
 *
323
 * a strchr for xmlChar's
324
 *
325
 * Returns the xmlChar * for the first occurrence or NULL.
326
 */
327
328
const xmlChar *
329
77.6M
xmlStrchr(const xmlChar *str, xmlChar val) {
330
77.6M
    if (str == NULL) return(NULL);
331
1.27G
    while (*str != 0) { /* non input consuming */
332
1.19G
        if (*str == val) return((xmlChar *) str);
333
1.19G
        str++;
334
1.19G
    }
335
77.5M
    return(NULL);
336
77.6M
}
337
338
/**
339
 * xmlStrstr:
340
 * @str:  the xmlChar * array (haystack)
341
 * @val:  the xmlChar to search (needle)
342
 *
343
 * a strstr for xmlChar's
344
 *
345
 * Returns the xmlChar * for the first occurrence or NULL.
346
 */
347
348
const xmlChar *
349
34.9k
xmlStrstr(const xmlChar *str, const xmlChar *val) {
350
34.9k
    int n;
351
352
34.9k
    if (str == NULL) return(NULL);
353
34.9k
    if (val == NULL) return(NULL);
354
34.9k
    n = xmlStrlen(val);
355
356
34.9k
    if (n == 0) return(str);
357
3.50M
    while (*str != 0) { /* non input consuming */
358
3.47M
        if (*str == *val) {
359
6.69k
            if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
360
6.69k
        }
361
3.47M
        str++;
362
3.47M
    }
363
33.4k
    return(NULL);
364
34.9k
}
365
366
/**
367
 * xmlStrcasestr:
368
 * @str:  the xmlChar * array (haystack)
369
 * @val:  the xmlChar to search (needle)
370
 *
371
 * a case-ignoring strstr for xmlChar's
372
 *
373
 * Returns the xmlChar * for the first occurrence or NULL.
374
 */
375
376
const xmlChar *
377
0
xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
378
0
    int n;
379
380
0
    if (str == NULL) return(NULL);
381
0
    if (val == NULL) return(NULL);
382
0
    n = xmlStrlen(val);
383
384
0
    if (n == 0) return(str);
385
0
    while (*str != 0) { /* non input consuming */
386
0
        if (casemap[*str] == casemap[*val])
387
0
            if (!xmlStrncasecmp(str, val, n)) return(str);
388
0
        str++;
389
0
    }
390
0
    return(NULL);
391
0
}
392
393
/**
394
 * xmlStrsub:
395
 * @str:  the xmlChar * array (haystack)
396
 * @start:  the index of the first char (zero based)
397
 * @len:  the length of the substring
398
 *
399
 * Extract a substring of a given string
400
 *
401
 * Returns the xmlChar * for the first occurrence or NULL.
402
 */
403
404
xmlChar *
405
0
xmlStrsub(const xmlChar *str, int start, int len) {
406
0
    int i;
407
408
0
    if (str == NULL) return(NULL);
409
0
    if (start < 0) return(NULL);
410
0
    if (len < 0) return(NULL);
411
412
0
    for (i = 0;i < start;i++) {
413
0
        if (*str == 0) return(NULL);
414
0
        str++;
415
0
    }
416
0
    if (*str == 0) return(NULL);
417
0
    return(xmlStrndup(str, len));
418
0
}
419
420
/**
421
 * xmlStrlen:
422
 * @str:  the xmlChar * array
423
 *
424
 * length of a xmlChar's string
425
 *
426
 * Returns the number of xmlChar contained in the ARRAY.
427
 */
428
429
int
430
594k
xmlStrlen(const xmlChar *str) {
431
594k
    size_t len = str ? strlen((const char *)str) : 0;
432
594k
    return(len > INT_MAX ? 0 : len);
433
594k
}
434
435
/**
436
 * xmlStrncat:
437
 * @cur:  the original xmlChar * array
438
 * @add:  the xmlChar * array added
439
 * @len:  the length of @add
440
 *
441
 * a strncat for array of xmlChar's, it will extend @cur with the len
442
 * first bytes of @add. Note that if @len < 0 then this is an API error
443
 * and NULL will be returned.
444
 *
445
 * Returns a new xmlChar *, the original @cur is reallocated and should
446
 * not be freed.
447
 */
448
449
xmlChar *
450
99.2k
xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
451
99.2k
    int size;
452
99.2k
    xmlChar *ret;
453
454
99.2k
    if ((add == NULL) || (len == 0))
455
15.6k
        return(cur);
456
83.6k
    if (len < 0)
457
0
  return(NULL);
458
83.6k
    if (cur == NULL)
459
273
        return(xmlStrndup(add, len));
460
461
83.3k
    size = xmlStrlen(cur);
462
83.3k
    if ((size < 0) || (size > INT_MAX - len))
463
0
        return(NULL);
464
83.3k
    ret = (xmlChar *) xmlRealloc(cur, (size_t) size + len + 1);
465
83.3k
    if (ret == NULL) {
466
0
        xmlErrMemory(NULL, NULL);
467
0
        return(cur);
468
0
    }
469
83.3k
    memcpy(&ret[size], add, len);
470
83.3k
    ret[size + len] = 0;
471
83.3k
    return(ret);
472
83.3k
}
473
474
/**
475
 * xmlStrncatNew:
476
 * @str1:  first xmlChar string
477
 * @str2:  second xmlChar string
478
 * @len:  the len of @str2 or < 0
479
 *
480
 * same as xmlStrncat, but creates a new string.  The original
481
 * two strings are not freed. If @len is < 0 then the length
482
 * will be calculated automatically.
483
 *
484
 * Returns a new xmlChar * or NULL
485
 */
486
xmlChar *
487
407
xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
488
407
    int size;
489
407
    xmlChar *ret;
490
491
407
    if (len < 0) {
492
0
        len = xmlStrlen(str2);
493
0
        if (len < 0)
494
0
            return(NULL);
495
0
    }
496
407
    if ((str2 == NULL) || (len == 0))
497
0
        return(xmlStrdup(str1));
498
407
    if (str1 == NULL)
499
0
        return(xmlStrndup(str2, len));
500
501
407
    size = xmlStrlen(str1);
502
407
    if ((size < 0) || (size > INT_MAX - len))
503
0
        return(NULL);
504
407
    ret = (xmlChar *) xmlMalloc((size_t) size + len + 1);
505
407
    if (ret == NULL) {
506
0
        xmlErrMemory(NULL, NULL);
507
0
        return(xmlStrndup(str1, size));
508
0
    }
509
407
    memcpy(ret, str1, size);
510
407
    memcpy(&ret[size], str2, len);
511
407
    ret[size + len] = 0;
512
407
    return(ret);
513
407
}
514
515
/**
516
 * xmlStrcat:
517
 * @cur:  the original xmlChar * array
518
 * @add:  the xmlChar * array added
519
 *
520
 * a strcat for array of xmlChar's. Since they are supposed to be
521
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
522
 * a termination mark of '0'.
523
 *
524
 * Returns a new xmlChar * containing the concatenated string. The original
525
 * @cur is reallocated and should not be freed.
526
 */
527
xmlChar *
528
77.2k
xmlStrcat(xmlChar *cur, const xmlChar *add) {
529
77.2k
    const xmlChar *p = add;
530
531
77.2k
    if (add == NULL) return(cur);
532
77.2k
    if (cur == NULL)
533
14.0k
        return(xmlStrdup(add));
534
535
99.0M
    while (*p != 0) p++; /* non input consuming */
536
63.2k
    return(xmlStrncat(cur, add, p - add));
537
77.2k
}
538
539
/**
540
 * xmlStrPrintf:
541
 * @buf:   the result buffer.
542
 * @len:   the result buffer length.
543
 * @msg:   the message with printf formatting.
544
 * @...:   extra parameters for the message.
545
 *
546
 * Formats @msg and places result into @buf.
547
 *
548
 * Returns the number of characters written to @buf or -1 if an error occurs.
549
 */
550
int XMLCDECL
551
0
xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
552
0
    va_list args;
553
0
    int ret;
554
555
0
    if((buf == NULL) || (msg == NULL)) {
556
0
        return(-1);
557
0
    }
558
559
0
    va_start(args, msg);
560
0
    ret = vsnprintf((char *) buf, len, (const char *) msg, args);
561
0
    va_end(args);
562
0
    buf[len - 1] = 0; /* be safe ! */
563
564
0
    return(ret);
565
0
}
566
567
/**
568
 * xmlStrVPrintf:
569
 * @buf:   the result buffer.
570
 * @len:   the result buffer length.
571
 * @msg:   the message with printf formatting.
572
 * @ap:    extra parameters for the message.
573
 *
574
 * Formats @msg and places result into @buf.
575
 *
576
 * Returns the number of characters written to @buf or -1 if an error occurs.
577
 */
578
int
579
0
xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
580
0
    int ret;
581
582
0
    if((buf == NULL) || (msg == NULL)) {
583
0
        return(-1);
584
0
    }
585
586
0
    ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
587
0
    buf[len - 1] = 0; /* be safe ! */
588
589
0
    return(ret);
590
0
}
591
592
/************************************************************************
593
 *                                                                      *
594
 *              Generic UTF8 handling routines                          *
595
 *                                                                      *
596
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
597
 *                                                                      *
598
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
599
 * 0000 0000-0000 007F   0xxxxxxx                                       *
600
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
601
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
602
 *                                                                      *
603
 * I hope we won't use values > 0xFFFF anytime soon !                   *
604
 *                                                                      *
605
 ************************************************************************/
606
607
608
/**
609
 * xmlUTF8Size:
610
 * @utf: pointer to the UTF8 character
611
 *
612
 * calculates the internal size of a UTF8 character
613
 *
614
 * returns the numbers of bytes in the character, -1 on format error
615
 */
616
int
617
0
xmlUTF8Size(const xmlChar *utf) {
618
0
    xmlChar mask;
619
0
    int len;
620
621
0
    if (utf == NULL)
622
0
        return -1;
623
0
    if (*utf < 0x80)
624
0
        return 1;
625
    /* check valid UTF8 character */
626
0
    if (!(*utf & 0x40))
627
0
        return -1;
628
    /* determine number of bytes in char */
629
0
    len = 2;
630
0
    for (mask=0x20; mask != 0; mask>>=1) {
631
0
        if (!(*utf & mask))
632
0
            return len;
633
0
        len++;
634
0
    }
635
0
    return -1;
636
0
}
637
638
/**
639
 * xmlUTF8Charcmp:
640
 * @utf1: pointer to first UTF8 char
641
 * @utf2: pointer to second UTF8 char
642
 *
643
 * compares the two UCS4 values
644
 *
645
 * returns result of the compare as with xmlStrncmp
646
 */
647
int
648
0
xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
649
650
0
    if (utf1 == NULL ) {
651
0
        if (utf2 == NULL)
652
0
            return 0;
653
0
        return -1;
654
0
    }
655
0
    return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
656
0
}
657
658
/**
659
 * xmlUTF8Strlen:
660
 * @utf:  a sequence of UTF-8 encoded bytes
661
 *
662
 * compute the length of an UTF8 string, it doesn't do a full UTF8
663
 * checking of the content of the string.
664
 *
665
 * Returns the number of characters in the string or -1 in case of error
666
 */
667
int
668
136
xmlUTF8Strlen(const xmlChar *utf) {
669
136
    size_t ret = 0;
670
671
136
    if (utf == NULL)
672
0
        return(-1);
673
674
37.6k
    while (*utf != 0) {
675
37.5k
        if (utf[0] & 0x80) {
676
6.56k
            if ((utf[1] & 0xc0) != 0x80)
677
0
                return(-1);
678
6.56k
            if ((utf[0] & 0xe0) == 0xe0) {
679
1.40k
                if ((utf[2] & 0xc0) != 0x80)
680
0
                    return(-1);
681
1.40k
                if ((utf[0] & 0xf0) == 0xf0) {
682
0
                    if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
683
0
                        return(-1);
684
0
                    utf += 4;
685
1.40k
                } else {
686
1.40k
                    utf += 3;
687
1.40k
                }
688
5.16k
            } else {
689
5.16k
                utf += 2;
690
5.16k
            }
691
30.9k
        } else {
692
30.9k
            utf++;
693
30.9k
        }
694
37.5k
        ret++;
695
37.5k
    }
696
136
    return(ret > INT_MAX ? 0 : ret);
697
136
}
698
699
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. Malformed input is
 * rejected: a continuation byte in lead position, a truncated
 * sequence, an overlong encoding, a surrogate (0xD800..0xDFFF) and a
 * value above 0x10FFFF are all errors.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if ((utf == NULL) || (len == NULL))
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c < 0x80) {
        /* 1-byte code */
        *len = 1;
        return((int) c);
    }

    /*
     * 0x80..0xBF are continuation bytes and cannot start a character;
     * 0xC0 and 0xC1 could only start an overlong 2-byte encoding.
     */
    if (c < 0xc2)
        goto error;
    if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
        goto error;
    if (c < 0xe0) {
        /* 2-byte code */
        c = (c & 0x1f) << 6;
        c |= utf[1] & 0x3f;
        *len = 2;
        return((int) c);
    }

    if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
        goto error;
    if (c < 0xf0) {
        /* 3-byte code */
        c = (c & 0xf) << 12;
        c |= (utf[1] & 0x3f) << 6;
        c |= utf[2] & 0x3f;
        /* reject overlong forms and UTF-16 surrogates */
        if ((c < 0x800) || ((c >= 0xd800) && (c <= 0xdfff)))
            goto error;
        *len = 3;
        return((int) c);
    }

    /* 4-byte code: the lead byte must be 11110xxx */
    if ((c & 0xf8) != 0xf0)
        goto error;
    if ((*len < 4) || ((utf[3] & 0xc0) != 0x80))
        goto error;
    c = (c & 0x7) << 18;
    c |= (utf[1] & 0x3f) << 12;
    c |= (utf[2] & 0x3f) << 6;
    c |= utf[3] & 0x3f;
    /* reject overlong forms and values beyond the Unicode range */
    if ((c < 0x10000) || (c > 0x10ffff))
        goto error;
    *len = 4;
    return((int) c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
768
769
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. Only the shape of each sequence is verified:
 *
 *    0xxxxxxx                                      valid 1-byte
 *    110xxxxx 10xxxxxx                             valid 2-byte
 *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
 *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
 *
 * The check is not super-strict: sequences longer than necessary for
 * the encoded value are accepted, and although the 4-byte maximum size
 * is enforced the 0x10ffff maximum value is not.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *p = utf;

    if (utf == NULL)
        return(0);

    while (*p != 0) {           /* string is 0-terminated */
        unsigned char lead = *p;
        int trailing;           /* continuation bytes required after lead */
        int k;

        if ((lead & 0x80) == 0x00)          /* 1-byte code, starts with 0 */
            trailing = 0;
        else if ((lead & 0xe0) == 0xc0)     /* 2-byte code, starts with 110 */
            trailing = 1;
        else if ((lead & 0xf0) == 0xe0)     /* 3-byte code, starts with 1110 */
            trailing = 2;
        else if ((lead & 0xf8) == 0xf0)     /* 4-byte code, starts with 11110 */
            trailing = 3;
        else                                /* unknown encoding */
            return(0);

        /* stops at the first bad byte, so the terminator is never passed */
        for (k = 1; k <= trailing; k++) {
            if ((p[k] & 0xc0) != 0x80)
                return(0);
        }
        p += trailing + 1;
    }
    return(1);
}
823
824
/**
825
 * xmlUTF8Strsize:
826
 * @utf:  a sequence of UTF-8 encoded bytes
827
 * @len:  the number of characters in the array
828
 *
829
 * storage size of an UTF8 string
830
 * the behaviour is not guaranteed if the input string is not UTF-8
831
 *
832
 * Returns the storage size of
833
 * the first 'len' characters of ARRAY
834
 */
835
836
int
837
0
xmlUTF8Strsize(const xmlChar *utf, int len) {
838
0
    const xmlChar *ptr=utf;
839
0
    int ch;
840
0
    size_t ret;
841
842
0
    if (utf == NULL)
843
0
        return(0);
844
845
0
    if (len <= 0)
846
0
        return(0);
847
848
0
    while ( len-- > 0) {
849
0
        if ( !*ptr )
850
0
            break;
851
0
        if ( (ch = *ptr++) & 0x80)
852
0
            while ((ch<<=1) & 0x80 ) {
853
0
    if (*ptr == 0) break;
854
0
                ptr++;
855
0
      }
856
0
    }
857
0
    ret = ptr - utf;
858
0
    return (ret > INT_MAX ? 0 : ret);
859
0
}
860
861
862
/**
863
 * xmlUTF8Strndup:
864
 * @utf:  the input UTF8 *
865
 * @len:  the len of @utf (in chars)
866
 *
867
 * a strndup for array of UTF8's
868
 *
869
 * Returns a new UTF8 * or NULL
870
 */
871
xmlChar *
872
0
xmlUTF8Strndup(const xmlChar *utf, int len) {
873
0
    xmlChar *ret;
874
0
    int i;
875
876
0
    if ((utf == NULL) || (len < 0)) return(NULL);
877
0
    i = xmlUTF8Strsize(utf, len);
878
0
    ret = (xmlChar *) xmlMallocAtomic((size_t) i + 1);
879
0
    if (ret == NULL) {
880
0
        return(NULL);
881
0
    }
882
0
    memcpy(ret, utf, i);
883
0
    ret[i] = 0;
884
0
    return(ret);
885
0
}
886
887
/**
888
 * xmlUTF8Strpos:
889
 * @utf:  the input UTF8 *
890
 * @pos:  the position of the desired UTF8 char (in chars)
891
 *
892
 * a function to provide the equivalent of fetching a
893
 * character from a string array
894
 *
895
 * Returns a pointer to the UTF8 character or NULL
896
 */
897
const xmlChar *
898
0
xmlUTF8Strpos(const xmlChar *utf, int pos) {
899
0
    int ch;
900
901
0
    if (utf == NULL) return(NULL);
902
0
    if (pos < 0)
903
0
        return(NULL);
904
0
    while (pos--) {
905
0
        if ((ch=*utf++) == 0) return(NULL);
906
0
        if ( ch & 0x80 ) {
907
            /* if not simple ascii, verify proper format */
908
0
            if ( (ch & 0xc0) != 0xc0 )
909
0
                return(NULL);
910
            /* then skip over remaining bytes for this char */
911
0
            while ( (ch <<= 1) & 0x80 )
912
0
                if ( (*utf++ & 0xc0) != 0x80 )
913
0
                    return(NULL);
914
0
        }
915
0
    }
916
0
    return((xmlChar *)utf);
917
0
}
918
919
/**
920
 * xmlUTF8Strloc:
921
 * @utf:  the input UTF8 *
922
 * @utfchar:  the UTF8 character to be found
923
 *
924
 * a function to provide the relative location of a UTF8 char
925
 *
926
 * Returns the relative character position of the desired char
927
 * or -1 if not found
928
 */
929
int
930
0
xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
931
0
    size_t i;
932
0
    int size;
933
0
    int ch;
934
935
0
    if (utf==NULL || utfchar==NULL) return -1;
936
0
    size = xmlUTF8Strsize(utfchar, 1);
937
0
        for(i=0; (ch=*utf) != 0; i++) {
938
0
            if (xmlStrncmp(utf, utfchar, size)==0)
939
0
                return(i > INT_MAX ? 0 : i);
940
0
            utf++;
941
0
            if ( ch & 0x80 ) {
942
                /* if not simple ascii, verify proper format */
943
0
                if ( (ch & 0xc0) != 0xc0 )
944
0
                    return(-1);
945
                /* then skip over remaining bytes for this char */
946
0
                while ( (ch <<= 1) & 0x80 )
947
0
                    if ( (*utf++ & 0xc0) != 0x80 )
948
0
                        return(-1);
949
0
            }
950
0
        }
951
952
0
    return(-1);
953
0
}
954
/**
955
 * xmlUTF8Strsub:
956
 * @utf:  a sequence of UTF-8 encoded bytes
957
 * @start: relative pos of first char
958
 * @len:   total number to copy
959
 *
960
 * Create a substring from a given UTF-8 string
961
 * Note:  positions are given in units of UTF-8 chars
962
 *
963
 * Returns a pointer to a newly created string
964
 * or NULL if any problem
965
 */
966
967
xmlChar *
968
1
xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
969
1
    int i;
970
1
    int ch;
971
972
1
    if (utf == NULL) return(NULL);
973
1
    if (start < 0) return(NULL);
974
1
    if (len < 0) return(NULL);
975
976
    /*
977
     * Skip over any leading chars
978
     */
979
165
    for (i = 0;i < start;i++) {
980
165
        if ((ch=*utf++) == 0) return(NULL);
981
164
        if ( ch & 0x80 ) {
982
            /* if not simple ascii, verify proper format */
983
3
            if ( (ch & 0xc0) != 0xc0 )
984
0
                return(NULL);
985
            /* then skip over remaining bytes for this char */
986
8
            while ( (ch <<= 1) & 0x80 )
987
5
                if ( (*utf++ & 0xc0) != 0x80 )
988
0
                    return(NULL);
989
3
        }
990
164
    }
991
992
0
    return(xmlUTF8Strndup(utf, len));
993
1
}
994
995
/**
996
 * xmlEscapeFormatString:
997
 * @msg:  a pointer to the string in which to escape '%' characters.
998
 * Must be a heap-allocated buffer created by libxml2 that may be
999
 * returned, or that may be freed and replaced.
1000
 *
1001
 * Replaces the string pointed to by 'msg' with an escaped string.
1002
 * Returns the same string with all '%' characters escaped.
1003
 */
1004
xmlChar *
1005
xmlEscapeFormatString(xmlChar **msg)
1006
0
{
1007
0
    xmlChar *msgPtr = NULL;
1008
0
    xmlChar *result = NULL;
1009
0
    xmlChar *resultPtr = NULL;
1010
0
    size_t count = 0;
1011
0
    size_t msgLen = 0;
1012
0
    size_t resultLen = 0;
1013
1014
0
    if (!msg || !*msg)
1015
0
        return(NULL);
1016
1017
0
    for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1018
0
        ++msgLen;
1019
0
        if (*msgPtr == '%')
1020
0
            ++count;
1021
0
    }
1022
1023
0
    if (count == 0)
1024
0
        return(*msg);
1025
1026
0
    if ((count > INT_MAX) || (msgLen > INT_MAX - count))
1027
0
        return(NULL);
1028
0
    resultLen = msgLen + count + 1;
1029
0
    result = (xmlChar *) xmlMallocAtomic(resultLen);
1030
0
    if (result == NULL) {
1031
        /* Clear *msg to prevent format string vulnerabilities in
1032
           out-of-memory situations. */
1033
0
        xmlFree(*msg);
1034
0
        *msg = NULL;
1035
0
        xmlErrMemory(NULL, NULL);
1036
0
        return(NULL);
1037
0
    }
1038
1039
0
    for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1040
0
        *resultPtr = *msgPtr;
1041
0
        if (*msgPtr == '%')
1042
0
            *(++resultPtr) = '%';
1043
0
    }
1044
0
    result[resultLen - 1] = '\0';
1045
1046
0
    xmlFree(*msg);
1047
0
    *msg = result;
1048
1049
0
    return *msg;
1050
0
}
1051