Coverage Report

Created: 2022-11-15 06:15

/src/libxml2/xmlstring.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * string.c : an XML string utilities module
3
 *
4
 * This module provides various utility functions for manipulating
5
 * the xmlChar* type. All functions named xmlStr* have been moved here
6
 * from the parser.c file (their original home).
7
 *
8
 * See Copyright for the status of this software.
9
 *
10
 * UTF8 string routines from:
11
 * William Brack <wbrack@mmm.com.hk>
12
 *
13
 * daniel@veillard.com
14
 */
15
16
#define IN_LIBXML
17
#include "libxml.h"
18
19
#include <stdlib.h>
20
#include <string.h>
21
#include <limits.h>
22
#include <libxml/xmlmemory.h>
23
#include <libxml/parserInternals.h>
24
#include <libxml/xmlstring.h>
25
26
#include "private/parser.h"
27
#include "private/string.h"
28
29
/************************************************************************
30
 *                                                                      *
31
 *                Commodity functions to handle xmlChars                *
32
 *                                                                      *
33
 ************************************************************************/
34
35
/**
36
 * xmlStrndup:
37
 * @cur:  the input xmlChar *
38
 * @len:  the len of @cur
39
 *
40
 * a strndup for array of xmlChar's
41
 *
42
 * Returns a new xmlChar * or NULL
43
 */
44
xmlChar *
45
25.9M
xmlStrndup(const xmlChar *cur, int len) {
46
25.9M
    xmlChar *ret;
47
48
25.9M
    if ((cur == NULL) || (len < 0)) return(NULL);
49
25.9M
    ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
50
25.9M
    if (ret == NULL) {
51
0
        xmlErrMemory(NULL, NULL);
52
0
        return(NULL);
53
0
    }
54
25.9M
    memcpy(ret, cur, len);
55
25.9M
    ret[len] = 0;
56
25.9M
    return(ret);
57
25.9M
}
58
59
/**
60
 * xmlStrdup:
61
 * @cur:  the input xmlChar *
62
 *
63
 * a strdup for array of xmlChar's. Since they are supposed to be
64
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
65
 * a termination mark of '0'.
66
 *
67
 * Returns a new xmlChar * or NULL
68
 */
69
xmlChar *
70
10.8M
xmlStrdup(const xmlChar *cur) {
71
10.8M
    const xmlChar *p = cur;
72
73
10.8M
    if (cur == NULL) return(NULL);
74
1.34G
    while (*p != 0) p++; /* non input consuming */
75
10.8M
    return(xmlStrndup(cur, p - cur));
76
10.8M
}
77
78
/**
79
 * xmlCharStrndup:
80
 * @cur:  the input char *
81
 * @len:  the len of @cur
82
 *
83
 * a strndup for char's to xmlChar's
84
 *
85
 * Returns a new xmlChar * or NULL
86
 */
87
88
xmlChar *
89
712
xmlCharStrndup(const char *cur, int len) {
90
712
    int i;
91
712
    xmlChar *ret;
92
93
712
    if ((cur == NULL) || (len < 0)) return(NULL);
94
712
    ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
95
712
    if (ret == NULL) {
96
0
        xmlErrMemory(NULL, NULL);
97
0
        return(NULL);
98
0
    }
99
138M
    for (i = 0;i < len;i++) {
100
        /* Explicit sign change */
101
138M
        ret[i] = (xmlChar) cur[i];
102
138M
        if (ret[i] == 0) return(ret);
103
138M
    }
104
712
    ret[len] = 0;
105
712
    return(ret);
106
712
}
107
108
/**
109
 * xmlCharStrdup:
110
 * @cur:  the input char *
111
 *
112
 * a strdup for char's to xmlChar's
113
 *
114
 * Returns a new xmlChar * or NULL
115
 */
116
117
xmlChar *
118
712
xmlCharStrdup(const char *cur) {
119
712
    const char *p = cur;
120
121
712
    if (cur == NULL) return(NULL);
122
138M
    while (*p != '\0') p++; /* non input consuming */
123
712
    return(xmlCharStrndup(cur, p - cur));
124
712
}
125
126
/**
127
 * xmlStrcmp:
128
 * @str1:  the first xmlChar *
129
 * @str2:  the second xmlChar *
130
 *
131
 * a strcmp for xmlChar's
132
 *
133
 * Returns the integer result of the comparison
134
 */
135
136
int
137
0
xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
138
0
    if (str1 == str2) return(0);
139
0
    if (str1 == NULL) return(-1);
140
0
    if (str2 == NULL) return(1);
141
0
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
142
0
    return(strcmp((const char *)str1, (const char *)str2));
143
#else
144
    do {
145
        int tmp = *str1++ - *str2;
146
        if (tmp != 0) return(tmp);
147
    } while (*str2++ != 0);
148
    return 0;
149
#endif
150
0
}
151
152
/**
153
 * xmlStrEqual:
154
 * @str1:  the first xmlChar *
155
 * @str2:  the second xmlChar *
156
 *
157
 * Check if both strings are equal of have same content.
158
 * Should be a bit more readable and faster than xmlStrcmp()
159
 *
160
 * Returns 1 if they are equal, 0 if they are different
161
 */
162
163
int
164
3.95M
xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
165
3.95M
    if (str1 == str2) return(1);
166
2.06M
    if (str1 == NULL) return(0);
167
2.06M
    if (str2 == NULL) return(0);
168
2.06M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
169
2.06M
    return(strcmp((const char *)str1, (const char *)str2) == 0);
170
#else
171
    do {
172
        if (*str1++ != *str2) return(0);
173
    } while (*str2++);
174
    return(1);
175
#endif
176
2.06M
}
177
178
/**
179
 * xmlStrQEqual:
180
 * @pref:  the prefix of the QName
181
 * @name:  the localname of the QName
182
 * @str:  the second xmlChar *
183
 *
184
 * Check if a QName is Equal to a given string
185
 *
186
 * Returns 1 if they are equal, 0 if they are different
187
 */
188
189
int
190
0
xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
191
0
    if (pref == NULL) return(xmlStrEqual(name, str));
192
0
    if (name == NULL) return(0);
193
0
    if (str == NULL) return(0);
194
195
0
    do {
196
0
        if (*pref++ != *str) return(0);
197
0
    } while ((*str++) && (*pref));
198
0
    if (*str++ != ':') return(0);
199
0
    do {
200
0
        if (*name++ != *str) return(0);
201
0
    } while (*str++);
202
0
    return(1);
203
0
}
204
205
/**
206
 * xmlStrncmp:
207
 * @str1:  the first xmlChar *
208
 * @str2:  the second xmlChar *
209
 * @len:  the max comparison length
210
 *
211
 * a strncmp for xmlChar's
212
 *
213
 * Returns the integer result of the comparison
214
 */
215
216
int
217
46.3M
xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
218
46.3M
    if (len <= 0) return(0);
219
46.3M
    if (str1 == str2) return(0);
220
46.3M
    if (str1 == NULL) return(-1);
221
46.3M
    if (str2 == NULL) return(1);
222
46.3M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
223
46.3M
    return(strncmp((const char *)str1, (const char *)str2, len));
224
#else
225
    do {
226
        int tmp = *str1++ - *str2;
227
        if (tmp != 0 || --len == 0) return(tmp);
228
    } while (*str2++ != 0);
229
    return 0;
230
#endif
231
46.3M
}
232
233
/*
 * Byte-wise case folding table used by the case-insensitive helpers:
 * 'A'..'Z' (0x41..0x5A) map to 'a'..'z' (0x61..0x7A), every other
 * byte value, including '[' (0x5B), maps to itself.
 */
static const xmlChar casemap[256] = {
    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
    0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
    0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
    /* 0x40 '@', then 'A'..'Z' folded to lower case */
    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    /* 'X','Y','Z' fold; '[' '\' ']' '^' '_' stay as they are */
    0x78,0x79,0x7A,0x5B,0x5C,0x5D,0x5E,0x5F,
    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
    0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
    0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
    0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
    0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
    0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
    0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
    0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
    0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
    0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
    0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
    0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
    0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
    0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
    0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
};
267
268
/**
269
 * xmlStrcasecmp:
270
 * @str1:  the first xmlChar *
271
 * @str2:  the second xmlChar *
272
 *
273
 * a strcasecmp for xmlChar's
274
 *
275
 * Returns the integer result of the comparison
276
 */
277
278
int
279
0
xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
280
0
    register int tmp;
281
282
0
    if (str1 == str2) return(0);
283
0
    if (str1 == NULL) return(-1);
284
0
    if (str2 == NULL) return(1);
285
0
    do {
286
0
        tmp = casemap[*str1++] - casemap[*str2];
287
0
        if (tmp != 0) return(tmp);
288
0
    } while (*str2++ != 0);
289
0
    return 0;
290
0
}
291
292
/**
293
 * xmlStrncasecmp:
294
 * @str1:  the first xmlChar *
295
 * @str2:  the second xmlChar *
296
 * @len:  the max comparison length
297
 *
298
 * a strncasecmp for xmlChar's
299
 *
300
 * Returns the integer result of the comparison
301
 */
302
303
int
304
106M
xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
305
106M
    register int tmp;
306
307
106M
    if (len <= 0) return(0);
308
106M
    if (str1 == str2) return(0);
309
106M
    if (str1 == NULL) return(-1);
310
106M
    if (str2 == NULL) return(1);
311
223M
    do {
312
223M
        tmp = casemap[*str1++] - casemap[*str2];
313
223M
        if (tmp != 0 || --len == 0) return(tmp);
314
223M
    } while (*str2++ != 0);
315
0
    return 0;
316
106M
}
317
318
/**
319
 * xmlStrchr:
320
 * @str:  the xmlChar * array
321
 * @val:  the xmlChar to search
322
 *
323
 * a strchr for xmlChar's
324
 *
325
 * Returns the xmlChar * for the first occurrence or NULL.
326
 */
327
328
const xmlChar *
329
2.32M
xmlStrchr(const xmlChar *str, xmlChar val) {
330
2.32M
    if (str == NULL) return(NULL);
331
47.4M
    while (*str != 0) { /* non input consuming */
332
45.0M
        if (*str == val) return((xmlChar *) str);
333
45.0M
        str++;
334
45.0M
    }
335
2.32M
    return(NULL);
336
2.32M
}
337
338
/**
339
 * xmlStrstr:
340
 * @str:  the xmlChar * array (haystack)
341
 * @val:  the xmlChar to search (needle)
342
 *
343
 * a strstr for xmlChar's
344
 *
345
 * Returns the xmlChar * for the first occurrence or NULL.
346
 */
347
348
const xmlChar *
349
6.45M
xmlStrstr(const xmlChar *str, const xmlChar *val) {
350
6.45M
    int n;
351
352
6.45M
    if (str == NULL) return(NULL);
353
6.45M
    if (val == NULL) return(NULL);
354
6.45M
    n = xmlStrlen(val);
355
356
6.45M
    if (n == 0) return(str);
357
14.3M
    while (*str != 0) { /* non input consuming */
358
7.92M
        if (*str == *val) {
359
1.13k
            if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
360
1.13k
        }
361
7.92M
        str++;
362
7.92M
    }
363
6.44M
    return(NULL);
364
6.45M
}
365
366
/**
367
 * xmlStrcasestr:
368
 * @str:  the xmlChar * array (haystack)
369
 * @val:  the xmlChar to search (needle)
370
 *
371
 * a case-ignoring strstr for xmlChar's
372
 *
373
 * Returns the xmlChar * for the first occurrence or NULL.
374
 */
375
376
const xmlChar *
377
0
xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
378
0
    int n;
379
380
0
    if (str == NULL) return(NULL);
381
0
    if (val == NULL) return(NULL);
382
0
    n = xmlStrlen(val);
383
384
0
    if (n == 0) return(str);
385
0
    while (*str != 0) { /* non input consuming */
386
0
        if (casemap[*str] == casemap[*val])
387
0
            if (!xmlStrncasecmp(str, val, n)) return(str);
388
0
        str++;
389
0
    }
390
0
    return(NULL);
391
0
}
392
393
/**
394
 * xmlStrsub:
395
 * @str:  the xmlChar * array (haystack)
396
 * @start:  the index of the first char (zero based)
397
 * @len:  the length of the substring
398
 *
399
 * Extract a substring of a given string
400
 *
401
 * Returns the xmlChar * for the first occurrence or NULL.
402
 */
403
404
xmlChar *
405
0
xmlStrsub(const xmlChar *str, int start, int len) {
406
0
    int i;
407
408
0
    if (str == NULL) return(NULL);
409
0
    if (start < 0) return(NULL);
410
0
    if (len < 0) return(NULL);
411
412
0
    for (i = 0;i < start;i++) {
413
0
        if (*str == 0) return(NULL);
414
0
        str++;
415
0
    }
416
0
    if (*str == 0) return(NULL);
417
0
    return(xmlStrndup(str, len));
418
0
}
419
420
/**
421
 * xmlStrlen:
422
 * @str:  the xmlChar * array
423
 *
424
 * length of a xmlChar's string
425
 *
426
 * Returns the number of xmlChar contained in the ARRAY.
427
 */
428
429
int
430
17.3M
xmlStrlen(const xmlChar *str) {
431
17.3M
    size_t len = str ? strlen((const char *)str) : 0;
432
17.3M
    return(len > INT_MAX ? 0 : len);
433
17.3M
}
434
435
/**
436
 * xmlStrncat:
437
 * @cur:  the original xmlChar * array
438
 * @add:  the xmlChar * array added
439
 * @len:  the length of @add
440
 *
441
 * a strncat for array of xmlChar's, it will extend @cur with the len
442
 * first bytes of @add. Note that if @len < 0 then this is an API error
443
 * and NULL will be returned.
444
 *
445
 * Returns a new xmlChar *, the original @cur is reallocated and should
446
 * not be freed.
447
 */
448
449
xmlChar *
450
15
xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
451
15
    int size;
452
15
    xmlChar *ret;
453
454
15
    if ((add == NULL) || (len == 0))
455
0
        return(cur);
456
15
    if (len < 0)
457
0
  return(NULL);
458
15
    if (cur == NULL)
459
0
        return(xmlStrndup(add, len));
460
461
15
    size = xmlStrlen(cur);
462
15
    if ((size < 0) || (size > INT_MAX - len))
463
0
        return(NULL);
464
15
    ret = (xmlChar *) xmlRealloc(cur, (size_t) size + len + 1);
465
15
    if (ret == NULL) {
466
0
        xmlErrMemory(NULL, NULL);
467
0
        return(cur);
468
0
    }
469
15
    memcpy(&ret[size], add, len);
470
15
    ret[size + len] = 0;
471
15
    return(ret);
472
15
}
473
474
/**
475
 * xmlStrncatNew:
476
 * @str1:  first xmlChar string
477
 * @str2:  second xmlChar string
478
 * @len:  the len of @str2 or < 0
479
 *
480
 * same as xmlStrncat, but creates a new string.  The original
481
 * two strings are not freed. If @len is < 0 then the length
482
 * will be calculated automatically.
483
 *
484
 * Returns a new xmlChar * or NULL
485
 */
486
xmlChar *
487
0
xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
488
0
    int size;
489
0
    xmlChar *ret;
490
491
0
    if (len < 0) {
492
0
        len = xmlStrlen(str2);
493
0
        if (len < 0)
494
0
            return(NULL);
495
0
    }
496
0
    if ((str2 == NULL) || (len == 0))
497
0
        return(xmlStrdup(str1));
498
0
    if (str1 == NULL)
499
0
        return(xmlStrndup(str2, len));
500
501
0
    size = xmlStrlen(str1);
502
0
    if ((size < 0) || (size > INT_MAX - len))
503
0
        return(NULL);
504
0
    ret = (xmlChar *) xmlMalloc((size_t) size + len + 1);
505
0
    if (ret == NULL) {
506
0
        xmlErrMemory(NULL, NULL);
507
0
        return(xmlStrndup(str1, size));
508
0
    }
509
0
    memcpy(ret, str1, size);
510
0
    memcpy(&ret[size], str2, len);
511
0
    ret[size + len] = 0;
512
0
    return(ret);
513
0
}
514
515
/**
516
 * xmlStrcat:
517
 * @cur:  the original xmlChar * array
518
 * @add:  the xmlChar * array added
519
 *
520
 * a strcat for array of xmlChar's. Since they are supposed to be
521
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
522
 * a termination mark of '0'.
523
 *
524
 * Returns a new xmlChar * containing the concatenated string. The original
525
 * @cur is reallocated and should not be freed.
526
 */
527
xmlChar *
528
1.73k
xmlStrcat(xmlChar *cur, const xmlChar *add) {
529
1.73k
    const xmlChar *p = add;
530
531
1.73k
    if (add == NULL) return(cur);
532
1.73k
    if (cur == NULL)
533
1.71k
        return(xmlStrdup(add));
534
535
327k
    while (*p != 0) p++; /* non input consuming */
536
15
    return(xmlStrncat(cur, add, p - add));
537
1.73k
}
538
539
/**
540
 * xmlStrPrintf:
541
 * @buf:   the result buffer.
542
 * @len:   the result buffer length.
543
 * @msg:   the message with printf formatting.
544
 * @...:   extra parameters for the message.
545
 *
546
 * Formats @msg and places result into @buf.
547
 *
548
 * Returns the number of characters written to @buf or -1 if an error occurs.
549
 */
550
int XMLCDECL
551
765
xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
552
765
    va_list args;
553
765
    int ret;
554
555
765
    if((buf == NULL) || (msg == NULL)) {
556
0
        return(-1);
557
0
    }
558
559
765
    va_start(args, msg);
560
765
    ret = vsnprintf((char *) buf, len, (const char *) msg, args);
561
765
    va_end(args);
562
765
    buf[len - 1] = 0; /* be safe ! */
563
564
765
    return(ret);
565
765
}
566
567
/**
568
 * xmlStrVPrintf:
569
 * @buf:   the result buffer.
570
 * @len:   the result buffer length.
571
 * @msg:   the message with printf formatting.
572
 * @ap:    extra parameters for the message.
573
 *
574
 * Formats @msg and places result into @buf.
575
 *
576
 * Returns the number of characters written to @buf or -1 if an error occurs.
577
 */
578
int
579
0
xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
580
0
    int ret;
581
582
0
    if((buf == NULL) || (msg == NULL)) {
583
0
        return(-1);
584
0
    }
585
586
0
    ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
587
0
    buf[len - 1] = 0; /* be safe ! */
588
589
0
    return(ret);
590
0
}
591
592
/************************************************************************
593
 *                                                                      *
594
 *              Generic UTF8 handling routines                          *
595
 *                                                                      *
596
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
597
 *                                                                      *
598
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
599
 * 0000 0000-0000 007F   0xxxxxxx                                       *
600
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
601
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
602
 *                                                                      *
603
 * I hope we won't use values > 0xFFFF anytime soon !                   *
604
 *                                                                      *
605
 ************************************************************************/
606
607
608
/**
609
 * xmlUTF8Size:
610
 * @utf: pointer to the UTF8 character
611
 *
612
 * calculates the internal size of a UTF8 character
613
 *
614
 * returns the numbers of bytes in the character, -1 on format error
615
 */
616
int
617
1.84k
xmlUTF8Size(const xmlChar *utf) {
618
1.84k
    xmlChar mask;
619
1.84k
    int len;
620
621
1.84k
    if (utf == NULL)
622
0
        return -1;
623
1.84k
    if (*utf < 0x80)
624
1.67k
        return 1;
625
    /* check valid UTF8 character */
626
168
    if (!(*utf & 0x40))
627
59
        return -1;
628
    /* determine number of bytes in char */
629
109
    len = 2;
630
238
    for (mask=0x20; mask != 0; mask>>=1) {
631
218
        if (!(*utf & mask))
632
89
            return len;
633
129
        len++;
634
129
    }
635
20
    return -1;
636
109
}
637
638
/**
639
 * xmlUTF8Charcmp:
640
 * @utf1: pointer to first UTF8 char
641
 * @utf2: pointer to second UTF8 char
642
 *
643
 * compares the two UCS4 values
644
 *
645
 * returns result of the compare as with xmlStrncmp
646
 */
647
int
648
1.84k
xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
649
650
1.84k
    if (utf1 == NULL ) {
651
0
        if (utf2 == NULL)
652
0
            return 0;
653
0
        return -1;
654
0
    }
655
1.84k
    return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
656
1.84k
}
657
658
/**
659
 * xmlUTF8Strlen:
660
 * @utf:  a sequence of UTF-8 encoded bytes
661
 *
662
 * compute the length of an UTF8 string, it doesn't do a full UTF8
663
 * checking of the content of the string.
664
 *
665
 * Returns the number of characters in the string or -1 in case of error
666
 */
667
int
668
57
xmlUTF8Strlen(const xmlChar *utf) {
669
57
    size_t ret = 0;
670
671
57
    if (utf == NULL)
672
0
        return(-1);
673
674
7.03M
    while (*utf != 0) {
675
7.03M
        if (utf[0] & 0x80) {
676
764
            if ((utf[1] & 0xc0) != 0x80)
677
10
                return(-1);
678
754
            if ((utf[0] & 0xe0) == 0xe0) {
679
56
                if ((utf[2] & 0xc0) != 0x80)
680
0
                    return(-1);
681
56
                if ((utf[0] & 0xf0) == 0xf0) {
682
7
                    if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
683
0
                        return(-1);
684
7
                    utf += 4;
685
49
                } else {
686
49
                    utf += 3;
687
49
                }
688
698
            } else {
689
698
                utf += 2;
690
698
            }
691
7.03M
        } else {
692
7.03M
            utf++;
693
7.03M
        }
694
7.03M
        ret++;
695
7.03M
    }
696
47
    return(ret > INT_MAX ? 0 : ret);
697
57
}
698
699
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. Malformed input is
 * rejected: a continuation byte in lead position, a missing or
 * invalid continuation byte, an overlong encoding, a surrogate
 * (U+D800..U+DFFF), a value above U+10FFFF, or a sequence that needs
 * more than *@len bytes.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if ((utf == NULL) || (len == NULL))
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c < 0x80) {
        /* 1-byte code */
        *len = 1;
        return((int) c);
    }

    /* the length is tested first so utf[1] is only read if present */
    if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
        goto error;

    if (c < 0xe0) {
        /*
         * 2-byte code. 0x80..0xBF are continuation bytes and cannot
         * start a character; 0xC0 and 0xC1 can only produce overlong
         * encodings of values below 0x80.
         */
        if (c < 0xc2)
            goto error;
        c = (c & 0x1f) << 6;
        c |= utf[1] & 0x3f;
        *len = 2;
        return((int) c);
    }

    if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
        goto error;

    if (c < 0xf0) {
        /* 3-byte code */
        c = (c & 0xf) << 12;
        c |= (utf[1] & 0x3f) << 6;
        c |= utf[2] & 0x3f;
        /* reject overlong forms and UTF-16 surrogates */
        if ((c < 0x800) || ((c >= 0xd800) && (c < 0xe000)))
            goto error;
        *len = 3;
        return((int) c);
    }

    /* 4-byte code: the lead byte must be 11110xxx */
    if ((c & 0xf8) != 0xf0)
        goto error;
    if ((*len < 4) || ((utf[3] & 0xc0) != 0x80))
        goto error;
    c = (c & 0x7) << 18;
    c |= (utf[1] & 0x3f) << 12;
    c |= (utf[2] & 0x3f) << 6;
    c |= utf[3] & 0x3f;
    /* reject overlong forms and values beyond the Unicode range */
    if ((c < 0x10000) || (c >= 0x110000))
        goto error;
    *len = 4;
    return((int) c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
768
769
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    if (utf == NULL)
        return(0);

    /*
     * The accepted sequences are (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    while (*utf != 0) {         /* string is 0-terminated */
        unsigned char lead = *utf;
        int width;
        int k;

        if ((lead & 0x80) == 0x00)
            width = 1;          /* 1-byte code, starts with 0 */
        else if ((lead & 0xe0) == 0xc0)
            width = 2;          /* 2-byte code, starts with 110 */
        else if ((lead & 0xf0) == 0xe0)
            width = 3;          /* 3-byte code, starts with 1110 */
        else if ((lead & 0xf8) == 0xf0)
            width = 4;          /* 4-byte code, starts with 11110 */
        else
            return(0);          /* unknown encoding */

        /*
         * Every byte after the lead must look like 10xxxxxx. A
         * premature terminator fails this test, so the scan stops
         * before reading past the end of the string.
         */
        for (k = 1; k < width; k++) {
            if ((utf[k] & 0xc0) != 0x80)
                return(0);
        }
        utf += width;
    }
    return(1);
}
823
824
/**
825
 * xmlUTF8Strsize:
826
 * @utf:  a sequence of UTF-8 encoded bytes
827
 * @len:  the number of characters in the array
828
 *
829
 * storage size of an UTF8 string
830
 * the behaviour is not guaranteed if the input string is not UTF-8
831
 *
832
 * Returns the storage size of
833
 * the first 'len' characters of ARRAY
834
 */
835
836
int
837
44.0M
xmlUTF8Strsize(const xmlChar *utf, int len) {
838
44.0M
    const xmlChar *ptr=utf;
839
44.0M
    int ch;
840
44.0M
    size_t ret;
841
842
44.0M
    if (utf == NULL)
843
0
        return(0);
844
845
44.0M
    if (len <= 0)
846
0
        return(0);
847
848
91.0M
    while ( len-- > 0) {
849
46.9M
        if ( !*ptr )
850
0
            break;
851
46.9M
        if ( (ch = *ptr++) & 0x80)
852
159k
            while ((ch<<=1) & 0x80 ) {
853
109k
    if (*ptr == 0) break;
854
109k
                ptr++;
855
109k
      }
856
46.9M
    }
857
44.0M
    ret = ptr - utf;
858
44.0M
    return (ret > INT_MAX ? 0 : ret);
859
44.0M
}
860
861
862
/**
863
 * xmlUTF8Strndup:
864
 * @utf:  the input UTF8 *
865
 * @len:  the len of @utf (in chars)
866
 *
867
 * a strndup for array of UTF8's
868
 *
869
 * Returns a new UTF8 * or NULL
870
 */
871
xmlChar *
872
10
xmlUTF8Strndup(const xmlChar *utf, int len) {
873
10
    xmlChar *ret;
874
10
    int i;
875
876
10
    if ((utf == NULL) || (len < 0)) return(NULL);
877
10
    i = xmlUTF8Strsize(utf, len);
878
10
    ret = (xmlChar *) xmlMallocAtomic((size_t) i + 1);
879
10
    if (ret == NULL) {
880
0
        return(NULL);
881
0
    }
882
10
    memcpy(ret, utf, i);
883
10
    ret[i] = 0;
884
10
    return(ret);
885
10
}
886
887
/**
888
 * xmlUTF8Strpos:
889
 * @utf:  the input UTF8 *
890
 * @pos:  the position of the desired UTF8 char (in chars)
891
 *
892
 * a function to provide the equivalent of fetching a
893
 * character from a string array
894
 *
895
 * Returns a pointer to the UTF8 character or NULL
896
 */
897
const xmlChar *
898
608
xmlUTF8Strpos(const xmlChar *utf, int pos) {
899
608
    int ch;
900
901
608
    if (utf == NULL) return(NULL);
902
608
    if (pos < 0)
903
0
        return(NULL);
904
10.9M
    while (pos--) {
905
10.9M
        if ((ch=*utf++) == 0) return(NULL);
906
10.9M
        if ( ch & 0x80 ) {
907
            /* if not simple ascii, verify proper format */
908
576
            if ( (ch & 0xc0) != 0xc0 )
909
576
                return(NULL);
910
            /* then skip over remaining bytes for this char */
911
0
            while ( (ch <<= 1) & 0x80 )
912
0
                if ( (*utf++ & 0xc0) != 0x80 )
913
0
                    return(NULL);
914
0
        }
915
10.9M
    }
916
32
    return((xmlChar *)utf);
917
608
}
918
919
/**
920
 * xmlUTF8Strloc:
921
 * @utf:  the input UTF8 *
922
 * @utfchar:  the UTF8 character to be found
923
 *
924
 * a function to provide the relative location of a UTF8 char
925
 *
926
 * Returns the relative character position of the desired char
927
 * or -1 if not found
928
 */
929
int
930
854
xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
931
854
    size_t i;
932
854
    int size;
933
854
    int ch;
934
935
854
    if (utf==NULL || utfchar==NULL) return -1;
936
854
    size = xmlUTF8Strsize(utfchar, 1);
937
35.3M
        for(i=0; (ch=*utf) != 0; i++) {
938
35.3M
            if (xmlStrncmp(utf, utfchar, size)==0)
939
632
                return(i > INT_MAX ? 0 : i);
940
35.3M
            utf++;
941
35.3M
            if ( ch & 0x80 ) {
942
                /* if not simple ascii, verify proper format */
943
3.01k
                if ( (ch & 0xc0) != 0xc0 )
944
0
                    return(-1);
945
                /* then skip over remaining bytes for this char */
946
8.86k
                while ( (ch <<= 1) & 0x80 )
947
6.07k
                    if ( (*utf++ & 0xc0) != 0x80 )
948
222
                        return(-1);
949
3.01k
            }
950
35.3M
        }
951
952
0
    return(-1);
953
854
}
954
/**
955
 * xmlUTF8Strsub:
956
 * @utf:  a sequence of UTF-8 encoded bytes
957
 * @start: relative pos of first char
958
 * @len:   total number to copy
959
 *
960
 * Create a substring from a given UTF-8 string
961
 * Note:  positions are given in units of UTF-8 chars
962
 *
963
 * Returns a pointer to a newly created string
964
 * or NULL if any problem
965
 */
966
967
xmlChar *
968
0
xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
969
0
    int i;
970
0
    int ch;
971
972
0
    if (utf == NULL) return(NULL);
973
0
    if (start < 0) return(NULL);
974
0
    if (len < 0) return(NULL);
975
976
    /*
977
     * Skip over any leading chars
978
     */
979
0
    for (i = 0;i < start;i++) {
980
0
        if ((ch=*utf++) == 0) return(NULL);
981
0
        if ( ch & 0x80 ) {
982
            /* if not simple ascii, verify proper format */
983
0
            if ( (ch & 0xc0) != 0xc0 )
984
0
                return(NULL);
985
            /* then skip over remaining bytes for this char */
986
0
            while ( (ch <<= 1) & 0x80 )
987
0
                if ( (*utf++ & 0xc0) != 0x80 )
988
0
                    return(NULL);
989
0
        }
990
0
    }
991
992
0
    return(xmlUTF8Strndup(utf, len));
993
0
}
994
995
/**
996
 * xmlEscapeFormatString:
997
 * @msg:  a pointer to the string in which to escape '%' characters.
998
 * Must be a heap-allocated buffer created by libxml2 that may be
999
 * returned, or that may be freed and replaced.
1000
 *
1001
 * Replaces the string pointed to by 'msg' with an escaped string.
1002
 * Returns the same string with all '%' characters escaped.
1003
 */
1004
xmlChar *
1005
xmlEscapeFormatString(xmlChar **msg)
1006
0
{
1007
0
    xmlChar *msgPtr = NULL;
1008
0
    xmlChar *result = NULL;
1009
0
    xmlChar *resultPtr = NULL;
1010
0
    size_t count = 0;
1011
0
    size_t msgLen = 0;
1012
0
    size_t resultLen = 0;
1013
1014
0
    if (!msg || !*msg)
1015
0
        return(NULL);
1016
1017
0
    for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1018
0
        ++msgLen;
1019
0
        if (*msgPtr == '%')
1020
0
            ++count;
1021
0
    }
1022
1023
0
    if (count == 0)
1024
0
        return(*msg);
1025
1026
0
    if ((count > INT_MAX) || (msgLen > INT_MAX - count))
1027
0
        return(NULL);
1028
0
    resultLen = msgLen + count + 1;
1029
0
    result = (xmlChar *) xmlMallocAtomic(resultLen);
1030
0
    if (result == NULL) {
1031
        /* Clear *msg to prevent format string vulnerabilities in
1032
           out-of-memory situations. */
1033
0
        xmlFree(*msg);
1034
0
        *msg = NULL;
1035
0
        xmlErrMemory(NULL, NULL);
1036
0
        return(NULL);
1037
0
    }
1038
1039
0
    for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1040
0
        *resultPtr = *msgPtr;
1041
0
        if (*msgPtr == '%')
1042
0
            *(++resultPtr) = '%';
1043
0
    }
1044
0
    result[resultLen - 1] = '\0';
1045
1046
0
    xmlFree(*msg);
1047
0
    *msg = result;
1048
1049
0
    return *msg;
1050
0
}
1051