Coverage Report

Created: 2022-11-15 06:34

/src/libxml2/xmlstring.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * xmlstring.c : an XML string utilities module
3
 *
4
 * This module provides various utility functions for manipulating
5
 * the xmlChar* type. All functions named xmlStr* have been moved here
6
 * from the parser.c file (their original home).
7
 *
8
 * See Copyright for the status of this software.
9
 *
10
 * UTF8 string routines from:
11
 * William Brack <wbrack@mmm.com.hk>
12
 *
13
 * daniel@veillard.com
14
 */
15
16
#define IN_LIBXML
17
#include "libxml.h"
18
19
#include <stdlib.h>
20
#include <string.h>
21
#include <limits.h>
22
#include <libxml/xmlmemory.h>
23
#include <libxml/parserInternals.h>
24
#include <libxml/xmlstring.h>
25
26
#include "private/parser.h"
27
#include "private/string.h"
28
29
/************************************************************************
30
 *                                                                      *
31
 *                Commodity functions to handle xmlChars                *
32
 *                                                                      *
33
 ************************************************************************/
34
35
/**
36
 * xmlStrndup:
37
 * @cur:  the input xmlChar *
38
 * @len:  the len of @cur
39
 *
40
 * a strndup for array of xmlChar's
41
 *
42
 * Returns a new xmlChar * or NULL
43
 */
44
xmlChar *
45
27.0M
xmlStrndup(const xmlChar *cur, int len) {
46
27.0M
    xmlChar *ret;
47
48
27.0M
    if ((cur == NULL) || (len < 0)) return(NULL);
49
27.0M
    ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
50
27.0M
    if (ret == NULL) {
51
0
        xmlErrMemory(NULL, NULL);
52
0
        return(NULL);
53
0
    }
54
27.0M
    memcpy(ret, cur, len);
55
27.0M
    ret[len] = 0;
56
27.0M
    return(ret);
57
27.0M
}
58
59
/**
60
 * xmlStrdup:
61
 * @cur:  the input xmlChar *
62
 *
63
 * a strdup for array of xmlChar's. Since they are supposed to be
64
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
65
 * a termination mark of '0'.
66
 *
67
 * Returns a new xmlChar * or NULL
68
 */
69
xmlChar *
70
12.4M
xmlStrdup(const xmlChar *cur) {
71
12.4M
    const xmlChar *p = cur;
72
73
12.4M
    if (cur == NULL) return(NULL);
74
2.54G
    while (*p != 0) p++; /* non input consuming */
75
11.5M
    return(xmlStrndup(cur, p - cur));
76
12.4M
}
77
78
/**
79
 * xmlCharStrndup:
80
 * @cur:  the input char *
81
 * @len:  the len of @cur
82
 *
83
 * a strndup for char's to xmlChar's
84
 *
85
 * Returns a new xmlChar * or NULL
86
 */
87
88
xmlChar *
89
3.32k
xmlCharStrndup(const char *cur, int len) {
90
3.32k
    int i;
91
3.32k
    xmlChar *ret;
92
93
3.32k
    if ((cur == NULL) || (len < 0)) return(NULL);
94
3.32k
    ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
95
3.32k
    if (ret == NULL) {
96
0
        xmlErrMemory(NULL, NULL);
97
0
        return(NULL);
98
0
    }
99
139M
    for (i = 0;i < len;i++) {
100
        /* Explicit sign change */
101
139M
        ret[i] = (xmlChar) cur[i];
102
139M
        if (ret[i] == 0) return(ret);
103
139M
    }
104
3.32k
    ret[len] = 0;
105
3.32k
    return(ret);
106
3.32k
}
107
108
/**
109
 * xmlCharStrdup:
110
 * @cur:  the input char *
111
 *
112
 * a strdup for char's to xmlChar's
113
 *
114
 * Returns a new xmlChar * or NULL
115
 */
116
117
xmlChar *
118
3.32k
xmlCharStrdup(const char *cur) {
119
3.32k
    const char *p = cur;
120
121
3.32k
    if (cur == NULL) return(NULL);
122
139M
    while (*p != '\0') p++; /* non input consuming */
123
3.32k
    return(xmlCharStrndup(cur, p - cur));
124
3.32k
}
125
126
/**
127
 * xmlStrcmp:
128
 * @str1:  the first xmlChar *
129
 * @str2:  the second xmlChar *
130
 *
131
 * a strcmp for xmlChar's
132
 *
133
 * Returns the integer result of the comparison
134
 */
135
136
int
137
26.0k
xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
138
26.0k
    if (str1 == str2) return(0);
139
26.0k
    if (str1 == NULL) return(-1);
140
26.0k
    if (str2 == NULL) return(1);
141
26.0k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
142
26.0k
    return(strcmp((const char *)str1, (const char *)str2));
143
#else
144
    do {
145
        int tmp = *str1++ - *str2;
146
        if (tmp != 0) return(tmp);
147
    } while (*str2++ != 0);
148
    return 0;
149
#endif
150
26.0k
}
151
152
/**
153
 * xmlStrEqual:
154
 * @str1:  the first xmlChar *
155
 * @str2:  the second xmlChar *
156
 *
157
 * Check if both strings are equal of have same content.
158
 * Should be a bit more readable and faster than xmlStrcmp()
159
 *
160
 * Returns 1 if they are equal, 0 if they are different
161
 */
162
163
int
164
8.86M
xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
165
8.86M
    if (str1 == str2) return(1);
166
6.92M
    if (str1 == NULL) return(0);
167
6.92M
    if (str2 == NULL) return(0);
168
6.92M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
169
6.92M
    return(strcmp((const char *)str1, (const char *)str2) == 0);
170
#else
171
    do {
172
        if (*str1++ != *str2) return(0);
173
    } while (*str2++);
174
    return(1);
175
#endif
176
6.92M
}
177
178
/**
179
 * xmlStrQEqual:
180
 * @pref:  the prefix of the QName
181
 * @name:  the localname of the QName
182
 * @str:  the second xmlChar *
183
 *
184
 * Check if a QName is Equal to a given string
185
 *
186
 * Returns 1 if they are equal, 0 if they are different
187
 */
188
189
int
190
384
xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
191
384
    if (pref == NULL) return(xmlStrEqual(name, str));
192
171
    if (name == NULL) return(0);
193
171
    if (str == NULL) return(0);
194
195
375
    do {
196
375
        if (*pref++ != *str) return(0);
197
375
    } while ((*str++) && (*pref));
198
117
    if (*str++ != ':') return(0);
199
1.24k
    do {
200
1.24k
        if (*name++ != *str) return(0);
201
1.24k
    } while (*str++);
202
109
    return(1);
203
117
}
204
205
/**
206
 * xmlStrncmp:
207
 * @str1:  the first xmlChar *
208
 * @str2:  the second xmlChar *
209
 * @len:  the max comparison length
210
 *
211
 * a strncmp for xmlChar's
212
 *
213
 * Returns the integer result of the comparison
214
 */
215
216
int
217
46.3M
xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
218
46.3M
    if (len <= 0) return(0);
219
46.3M
    if (str1 == str2) return(0);
220
46.3M
    if (str1 == NULL) return(-1);
221
46.3M
    if (str2 == NULL) return(1);
222
46.3M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
223
46.3M
    return(strncmp((const char *)str1, (const char *)str2, len));
224
#else
225
    do {
226
        int tmp = *str1++ - *str2;
227
        if (tmp != 0 || --len == 0) return(tmp);
228
    } while (*str2++ != 0);
229
    return 0;
230
#endif
231
46.3M
}
232
233
/*
 * Byte-wise case folding table used by the case-insensitive comparison
 * routines: 'A'..'Z' (0x41..0x5A) map to 'a'..'z' (0x61..0x7A), every
 * other byte maps to itself. In particular '[' (0x5B) must stay 0x5B so
 * that it does not compare equal to '{' (0x7B).
 */
static const xmlChar casemap[256] = {
    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
    0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
    0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x5B,0x5C,0x5D,0x5E,0x5F,
    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
    0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
    0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
    0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
    0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
    0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
    0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
    0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
    0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
    0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
    0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
    0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
    0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
    0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
    0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
};
267
268
/**
269
 * xmlStrcasecmp:
270
 * @str1:  the first xmlChar *
271
 * @str2:  the second xmlChar *
272
 *
273
 * a strcasecmp for xmlChar's
274
 *
275
 * Returns the integer result of the comparison
276
 */
277
278
int
279
70.4k
xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
280
70.4k
    register int tmp;
281
282
70.4k
    if (str1 == str2) return(0);
283
70.4k
    if (str1 == NULL) return(-1);
284
69.6k
    if (str2 == NULL) return(1);
285
100k
    do {
286
100k
        tmp = casemap[*str1++] - casemap[*str2];
287
100k
        if (tmp != 0) return(tmp);
288
100k
    } while (*str2++ != 0);
289
7.03k
    return 0;
290
69.6k
}
291
292
/**
293
 * xmlStrncasecmp:
294
 * @str1:  the first xmlChar *
295
 * @str2:  the second xmlChar *
296
 * @len:  the max comparison length
297
 *
298
 * a strncasecmp for xmlChar's
299
 *
300
 * Returns the integer result of the comparison
301
 */
302
303
int
304
106M
xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
305
106M
    register int tmp;
306
307
106M
    if (len <= 0) return(0);
308
106M
    if (str1 == str2) return(0);
309
106M
    if (str1 == NULL) return(-1);
310
106M
    if (str2 == NULL) return(1);
311
223M
    do {
312
223M
        tmp = casemap[*str1++] - casemap[*str2];
313
223M
        if (tmp != 0 || --len == 0) return(tmp);
314
223M
    } while (*str2++ != 0);
315
0
    return 0;
316
106M
}
317
318
/**
319
 * xmlStrchr:
320
 * @str:  the xmlChar * array
321
 * @val:  the xmlChar to search
322
 *
323
 * a strchr for xmlChar's
324
 *
325
 * Returns the xmlChar * for the first occurrence or NULL.
326
 */
327
328
const xmlChar *
329
79.9M
xmlStrchr(const xmlChar *str, xmlChar val) {
330
79.9M
    if (str == NULL) return(NULL);
331
1.32G
    while (*str != 0) { /* non input consuming */
332
1.24G
        if (*str == val) return((xmlChar *) str);
333
1.24G
        str++;
334
1.24G
    }
335
79.8M
    return(NULL);
336
79.9M
}
337
338
/**
339
 * xmlStrstr:
340
 * @str:  the xmlChar * array (haystack)
341
 * @val:  the xmlChar to search (needle)
342
 *
343
 * a strstr for xmlChar's
344
 *
345
 * Returns the xmlChar * for the first occurrence or NULL.
346
 */
347
348
const xmlChar *
349
6.48M
xmlStrstr(const xmlChar *str, const xmlChar *val) {
350
6.48M
    int n;
351
352
6.48M
    if (str == NULL) return(NULL);
353
6.48M
    if (val == NULL) return(NULL);
354
6.48M
    n = xmlStrlen(val);
355
356
6.48M
    if (n == 0) return(str);
357
17.8M
    while (*str != 0) { /* non input consuming */
358
11.4M
        if (*str == *val) {
359
7.82k
            if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
360
7.82k
        }
361
11.3M
        str++;
362
11.3M
    }
363
6.48M
    return(NULL);
364
6.48M
}
365
366
/**
367
 * xmlStrcasestr:
368
 * @str:  the xmlChar * array (haystack)
369
 * @val:  the xmlChar to search (needle)
370
 *
371
 * a case-ignoring strstr for xmlChar's
372
 *
373
 * Returns the xmlChar * for the first occurrence or NULL.
374
 */
375
376
const xmlChar *
377
0
xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
378
0
    int n;
379
380
0
    if (str == NULL) return(NULL);
381
0
    if (val == NULL) return(NULL);
382
0
    n = xmlStrlen(val);
383
384
0
    if (n == 0) return(str);
385
0
    while (*str != 0) { /* non input consuming */
386
0
        if (casemap[*str] == casemap[*val])
387
0
            if (!xmlStrncasecmp(str, val, n)) return(str);
388
0
        str++;
389
0
    }
390
0
    return(NULL);
391
0
}
392
393
/**
394
 * xmlStrsub:
395
 * @str:  the xmlChar * array (haystack)
396
 * @start:  the index of the first char (zero based)
397
 * @len:  the length of the substring
398
 *
399
 * Extract a substring of a given string
400
 *
401
 * Returns the xmlChar * for the first occurrence or NULL.
402
 */
403
404
xmlChar *
405
0
xmlStrsub(const xmlChar *str, int start, int len) {
406
0
    int i;
407
408
0
    if (str == NULL) return(NULL);
409
0
    if (start < 0) return(NULL);
410
0
    if (len < 0) return(NULL);
411
412
0
    for (i = 0;i < start;i++) {
413
0
        if (*str == 0) return(NULL);
414
0
        str++;
415
0
    }
416
0
    if (*str == 0) return(NULL);
417
0
    return(xmlStrndup(str, len));
418
0
}
419
420
/**
421
 * xmlStrlen:
422
 * @str:  the xmlChar * array
423
 *
424
 * length of a xmlChar's string
425
 *
426
 * Returns the number of xmlChar contained in the ARRAY.
427
 */
428
429
int
430
17.9M
xmlStrlen(const xmlChar *str) {
431
17.9M
    size_t len = str ? strlen((const char *)str) : 0;
432
17.9M
    return(len > INT_MAX ? 0 : len);
433
17.9M
}
434
435
/**
436
 * xmlStrncat:
437
 * @cur:  the original xmlChar * array
438
 * @add:  the xmlChar * array added
439
 * @len:  the length of @add
440
 *
441
 * a strncat for array of xmlChar's, it will extend @cur with the len
442
 * first bytes of @add. Note that if @len < 0 then this is an API error
443
 * and NULL will be returned.
444
 *
445
 * Returns a new xmlChar *, the original @cur is reallocated and should
446
 * not be freed.
447
 */
448
449
xmlChar *
450
99.2k
xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
451
99.2k
    int size;
452
99.2k
    xmlChar *ret;
453
454
99.2k
    if ((add == NULL) || (len == 0))
455
15.6k
        return(cur);
456
83.6k
    if (len < 0)
457
0
  return(NULL);
458
83.6k
    if (cur == NULL)
459
273
        return(xmlStrndup(add, len));
460
461
83.3k
    size = xmlStrlen(cur);
462
83.3k
    if ((size < 0) || (size > INT_MAX - len))
463
0
        return(NULL);
464
83.3k
    ret = (xmlChar *) xmlRealloc(cur, (size_t) size + len + 1);
465
83.3k
    if (ret == NULL) {
466
0
        xmlErrMemory(NULL, NULL);
467
0
        return(cur);
468
0
    }
469
83.3k
    memcpy(&ret[size], add, len);
470
83.3k
    ret[size + len] = 0;
471
83.3k
    return(ret);
472
83.3k
}
473
474
/**
475
 * xmlStrncatNew:
476
 * @str1:  first xmlChar string
477
 * @str2:  second xmlChar string
478
 * @len:  the len of @str2 or < 0
479
 *
480
 * same as xmlStrncat, but creates a new string.  The original
481
 * two strings are not freed. If @len is < 0 then the length
482
 * will be calculated automatically.
483
 *
484
 * Returns a new xmlChar * or NULL
485
 */
486
xmlChar *
487
407
xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
488
407
    int size;
489
407
    xmlChar *ret;
490
491
407
    if (len < 0) {
492
0
        len = xmlStrlen(str2);
493
0
        if (len < 0)
494
0
            return(NULL);
495
0
    }
496
407
    if ((str2 == NULL) || (len == 0))
497
0
        return(xmlStrdup(str1));
498
407
    if (str1 == NULL)
499
0
        return(xmlStrndup(str2, len));
500
501
407
    size = xmlStrlen(str1);
502
407
    if ((size < 0) || (size > INT_MAX - len))
503
0
        return(NULL);
504
407
    ret = (xmlChar *) xmlMalloc((size_t) size + len + 1);
505
407
    if (ret == NULL) {
506
0
        xmlErrMemory(NULL, NULL);
507
0
        return(xmlStrndup(str1, size));
508
0
    }
509
407
    memcpy(ret, str1, size);
510
407
    memcpy(&ret[size], str2, len);
511
407
    ret[size + len] = 0;
512
407
    return(ret);
513
407
}
514
515
/**
516
 * xmlStrcat:
517
 * @cur:  the original xmlChar * array
518
 * @add:  the xmlChar * array added
519
 *
520
 * a strcat for array of xmlChar's. Since they are supposed to be
521
 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
522
 * a termination mark of '0'.
523
 *
524
 * Returns a new xmlChar * containing the concatenated string. The original
525
 * @cur is reallocated and should not be freed.
526
 */
527
xmlChar *
528
79.0k
xmlStrcat(xmlChar *cur, const xmlChar *add) {
529
79.0k
    const xmlChar *p = add;
530
531
79.0k
    if (add == NULL) return(cur);
532
79.0k
    if (cur == NULL)
533
15.7k
        return(xmlStrdup(add));
534
535
99.3M
    while (*p != 0) p++; /* non input consuming */
536
63.2k
    return(xmlStrncat(cur, add, p - add));
537
79.0k
}
538
539
/**
540
 * xmlStrPrintf:
541
 * @buf:   the result buffer.
542
 * @len:   the result buffer length.
543
 * @msg:   the message with printf formatting.
544
 * @...:   extra parameters for the message.
545
 *
546
 * Formats @msg and places result into @buf.
547
 *
548
 * Returns the number of characters written to @buf or -1 if an error occurs.
549
 */
550
int XMLCDECL
551
765
xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
552
765
    va_list args;
553
765
    int ret;
554
555
765
    if((buf == NULL) || (msg == NULL)) {
556
0
        return(-1);
557
0
    }
558
559
765
    va_start(args, msg);
560
765
    ret = vsnprintf((char *) buf, len, (const char *) msg, args);
561
765
    va_end(args);
562
765
    buf[len - 1] = 0; /* be safe ! */
563
564
765
    return(ret);
565
765
}
566
567
/**
568
 * xmlStrVPrintf:
569
 * @buf:   the result buffer.
570
 * @len:   the result buffer length.
571
 * @msg:   the message with printf formatting.
572
 * @ap:    extra parameters for the message.
573
 *
574
 * Formats @msg and places result into @buf.
575
 *
576
 * Returns the number of characters written to @buf or -1 if an error occurs.
577
 */
578
int
579
0
xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
580
0
    int ret;
581
582
0
    if((buf == NULL) || (msg == NULL)) {
583
0
        return(-1);
584
0
    }
585
586
0
    ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
587
0
    buf[len - 1] = 0; /* be safe ! */
588
589
0
    return(ret);
590
0
}
591
592
/************************************************************************
593
 *                                                                      *
594
 *              Generic UTF8 handling routines                          *
595
 *                                                                      *
596
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
597
 *                                                                      *
598
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
599
 * 0000 0000-0000 007F   0xxxxxxx                                       *
600
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
601
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
602
 *                                                                      *
603
 * I hope we won't use values > 0xFFFF anytime soon !                   *
604
 *                                                                      *
605
 ************************************************************************/
606
607
608
/**
609
 * xmlUTF8Size:
610
 * @utf: pointer to the UTF8 character
611
 *
612
 * calculates the internal size of a UTF8 character
613
 *
614
 * returns the numbers of bytes in the character, -1 on format error
615
 */
616
int
617
1.84k
xmlUTF8Size(const xmlChar *utf) {
618
1.84k
    xmlChar mask;
619
1.84k
    int len;
620
621
1.84k
    if (utf == NULL)
622
0
        return -1;
623
1.84k
    if (*utf < 0x80)
624
1.67k
        return 1;
625
    /* check valid UTF8 character */
626
168
    if (!(*utf & 0x40))
627
59
        return -1;
628
    /* determine number of bytes in char */
629
109
    len = 2;
630
238
    for (mask=0x20; mask != 0; mask>>=1) {
631
218
        if (!(*utf & mask))
632
89
            return len;
633
129
        len++;
634
129
    }
635
20
    return -1;
636
109
}
637
638
/**
639
 * xmlUTF8Charcmp:
640
 * @utf1: pointer to first UTF8 char
641
 * @utf2: pointer to second UTF8 char
642
 *
643
 * compares the two UCS4 values
644
 *
645
 * returns result of the compare as with xmlStrncmp
646
 */
647
int
648
1.84k
xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
649
650
1.84k
    if (utf1 == NULL ) {
651
0
        if (utf2 == NULL)
652
0
            return 0;
653
0
        return -1;
654
0
    }
655
1.84k
    return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
656
1.84k
}
657
658
/**
659
 * xmlUTF8Strlen:
660
 * @utf:  a sequence of UTF-8 encoded bytes
661
 *
662
 * compute the length of an UTF8 string, it doesn't do a full UTF8
663
 * checking of the content of the string.
664
 *
665
 * Returns the number of characters in the string or -1 in case of error
666
 */
667
int
668
193
xmlUTF8Strlen(const xmlChar *utf) {
669
193
    size_t ret = 0;
670
671
193
    if (utf == NULL)
672
0
        return(-1);
673
674
7.07M
    while (*utf != 0) {
675
7.07M
        if (utf[0] & 0x80) {
676
7.33k
            if ((utf[1] & 0xc0) != 0x80)
677
10
                return(-1);
678
7.32k
            if ((utf[0] & 0xe0) == 0xe0) {
679
1.46k
                if ((utf[2] & 0xc0) != 0x80)
680
0
                    return(-1);
681
1.46k
                if ((utf[0] & 0xf0) == 0xf0) {
682
7
                    if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
683
0
                        return(-1);
684
7
                    utf += 4;
685
1.45k
                } else {
686
1.45k
                    utf += 3;
687
1.45k
                }
688
5.86k
            } else {
689
5.86k
                utf += 2;
690
5.86k
            }
691
7.06M
        } else {
692
7.06M
            utf++;
693
7.06M
        }
694
7.07M
        ret++;
695
7.07M
    }
696
183
    return(ret > INT_MAX ? 0 : ret);
697
193
}
698
699
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. Malformed input is rejected:
 * a continuation byte in lead position, a missing or invalid
 * continuation byte, a lead byte announcing more than 4 bytes, an
 * overlong encoding, a surrogate (U+D800..U+DFFF) or a value above
 * U+10FFFF.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c < 0x80) {
        /* 1-byte code */
        *len = 1;
    } else {
        /* 10xxxxxx is a continuation byte and can not start a char */
        if (c < 0xc0)
            goto error;
        if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
            goto error;
        if (c < 0xe0) {
            /* 2-byte code */
            *len = 2;
            c = (c & 0x1f) << 6;
            c |= utf[1] & 0x3f;
            /* overlong: fits in one byte */
            if (c < 0x80)
                goto error;
        } else {
            if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
                goto error;
            if (c < 0xf0) {
                /* 3-byte code */
                *len = 3;
                c = (c & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
                /* overlong, or a UTF-16 surrogate */
                if ((c < 0x800) || ((c >= 0xd800) && (c < 0xe000)))
                    goto error;
            } else {
                /* the lead byte is checked before utf[3] is read */
                if ((*len < 4) || ((c & 0xf8) != 0xf0) ||
                    ((utf[3] & 0xc0) != 0x80))
                    goto error;
                /* 4-byte code */
                *len = 4;
                c = (c & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
                /* overlong, or beyond the Unicode range */
                if ((c < 0x10000) || (c >= 0x110000))
                    goto error;
            }
        }
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
768
769
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *p;

    if (utf == NULL)
        return(0);

    /*
     * Each character is 1 to 4 bytes long. The accepted shapes are
     * (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    p = utf;
    while (*p != 0) {           /* string is 0-terminated */
        unsigned char lead = *p;
        int trail;              /* continuation bytes expected */
        int k;

        if ((lead & 0x80) == 0x00)          /* 1-byte code, starts with 0 */
            trail = 0;
        else if ((lead & 0xe0) == 0xc0)     /* 2-byte code, starts with 110 */
            trail = 1;
        else if ((lead & 0xf0) == 0xe0)     /* 3-byte code, starts with 1110 */
            trail = 2;
        else if ((lead & 0xf8) == 0xf0)     /* 4-byte code, starts with 11110 */
            trail = 3;
        else                                /* unknown encoding */
            return(0);

        /* checked in order, so nothing past a terminating 0 is read */
        for (k = 1; k <= trail; k++) {
            if ((p[k] & 0xc0) != 0x80)
                return(0);
        }
        p += trail + 1;
    }
    return(1);
}
823
824
/**
825
 * xmlUTF8Strsize:
826
 * @utf:  a sequence of UTF-8 encoded bytes
827
 * @len:  the number of characters in the array
828
 *
829
 * storage size of an UTF8 string
830
 * the behaviour is not guaranteed if the input string is not UTF-8
831
 *
832
 * Returns the storage size of
833
 * the first 'len' characters of ARRAY
834
 */
835
836
int
837
44.0M
xmlUTF8Strsize(const xmlChar *utf, int len) {
838
44.0M
    const xmlChar *ptr=utf;
839
44.0M
    int ch;
840
44.0M
    size_t ret;
841
842
44.0M
    if (utf == NULL)
843
0
        return(0);
844
845
44.0M
    if (len <= 0)
846
0
        return(0);
847
848
91.0M
    while ( len-- > 0) {
849
46.9M
        if ( !*ptr )
850
0
            break;
851
46.9M
        if ( (ch = *ptr++) & 0x80)
852
159k
            while ((ch<<=1) & 0x80 ) {
853
109k
    if (*ptr == 0) break;
854
109k
                ptr++;
855
109k
      }
856
46.9M
    }
857
44.0M
    ret = ptr - utf;
858
44.0M
    return (ret > INT_MAX ? 0 : ret);
859
44.0M
}
860
861
862
/**
863
 * xmlUTF8Strndup:
864
 * @utf:  the input UTF8 *
865
 * @len:  the len of @utf (in chars)
866
 *
867
 * a strndup for array of UTF8's
868
 *
869
 * Returns a new UTF8 * or NULL
870
 */
871
xmlChar *
872
10
xmlUTF8Strndup(const xmlChar *utf, int len) {
873
10
    xmlChar *ret;
874
10
    int i;
875
876
10
    if ((utf == NULL) || (len < 0)) return(NULL);
877
10
    i = xmlUTF8Strsize(utf, len);
878
10
    ret = (xmlChar *) xmlMallocAtomic((size_t) i + 1);
879
10
    if (ret == NULL) {
880
0
        return(NULL);
881
0
    }
882
10
    memcpy(ret, utf, i);
883
10
    ret[i] = 0;
884
10
    return(ret);
885
10
}
886
887
/**
888
 * xmlUTF8Strpos:
889
 * @utf:  the input UTF8 *
890
 * @pos:  the position of the desired UTF8 char (in chars)
891
 *
892
 * a function to provide the equivalent of fetching a
893
 * character from a string array
894
 *
895
 * Returns a pointer to the UTF8 character or NULL
896
 */
897
const xmlChar *
898
608
xmlUTF8Strpos(const xmlChar *utf, int pos) {
899
608
    int ch;
900
901
608
    if (utf == NULL) return(NULL);
902
608
    if (pos < 0)
903
0
        return(NULL);
904
10.9M
    while (pos--) {
905
10.9M
        if ((ch=*utf++) == 0) return(NULL);
906
10.9M
        if ( ch & 0x80 ) {
907
            /* if not simple ascii, verify proper format */
908
576
            if ( (ch & 0xc0) != 0xc0 )
909
576
                return(NULL);
910
            /* then skip over remaining bytes for this char */
911
0
            while ( (ch <<= 1) & 0x80 )
912
0
                if ( (*utf++ & 0xc0) != 0x80 )
913
0
                    return(NULL);
914
0
        }
915
10.9M
    }
916
32
    return((xmlChar *)utf);
917
608
}
918
919
/**
920
 * xmlUTF8Strloc:
921
 * @utf:  the input UTF8 *
922
 * @utfchar:  the UTF8 character to be found
923
 *
924
 * a function to provide the relative location of a UTF8 char
925
 *
926
 * Returns the relative character position of the desired char
927
 * or -1 if not found
928
 */
929
int
930
854
xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
931
854
    size_t i;
932
854
    int size;
933
854
    int ch;
934
935
854
    if (utf==NULL || utfchar==NULL) return -1;
936
854
    size = xmlUTF8Strsize(utfchar, 1);
937
35.3M
        for(i=0; (ch=*utf) != 0; i++) {
938
35.3M
            if (xmlStrncmp(utf, utfchar, size)==0)
939
632
                return(i > INT_MAX ? 0 : i);
940
35.3M
            utf++;
941
35.3M
            if ( ch & 0x80 ) {
942
                /* if not simple ascii, verify proper format */
943
3.01k
                if ( (ch & 0xc0) != 0xc0 )
944
0
                    return(-1);
945
                /* then skip over remaining bytes for this char */
946
8.86k
                while ( (ch <<= 1) & 0x80 )
947
6.07k
                    if ( (*utf++ & 0xc0) != 0x80 )
948
222
                        return(-1);
949
3.01k
            }
950
35.3M
        }
951
952
0
    return(-1);
953
854
}
954
/**
955
 * xmlUTF8Strsub:
956
 * @utf:  a sequence of UTF-8 encoded bytes
957
 * @start: relative pos of first char
958
 * @len:   total number to copy
959
 *
960
 * Create a substring from a given UTF-8 string
961
 * Note:  positions are given in units of UTF-8 chars
962
 *
963
 * Returns a pointer to a newly created string
964
 * or NULL if any problem
965
 */
966
967
xmlChar *
968
1
xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
969
1
    int i;
970
1
    int ch;
971
972
1
    if (utf == NULL) return(NULL);
973
1
    if (start < 0) return(NULL);
974
1
    if (len < 0) return(NULL);
975
976
    /*
977
     * Skip over any leading chars
978
     */
979
165
    for (i = 0;i < start;i++) {
980
165
        if ((ch=*utf++) == 0) return(NULL);
981
164
        if ( ch & 0x80 ) {
982
            /* if not simple ascii, verify proper format */
983
3
            if ( (ch & 0xc0) != 0xc0 )
984
0
                return(NULL);
985
            /* then skip over remaining bytes for this char */
986
8
            while ( (ch <<= 1) & 0x80 )
987
5
                if ( (*utf++ & 0xc0) != 0x80 )
988
0
                    return(NULL);
989
3
        }
990
164
    }
991
992
0
    return(xmlUTF8Strndup(utf, len));
993
1
}
994
995
/**
996
 * xmlEscapeFormatString:
997
 * @msg:  a pointer to the string in which to escape '%' characters.
998
 * Must be a heap-allocated buffer created by libxml2 that may be
999
 * returned, or that may be freed and replaced.
1000
 *
1001
 * Replaces the string pointed to by 'msg' with an escaped string.
1002
 * Returns the same string with all '%' characters escaped.
1003
 */
1004
xmlChar *
1005
xmlEscapeFormatString(xmlChar **msg)
1006
0
{
1007
0
    xmlChar *msgPtr = NULL;
1008
0
    xmlChar *result = NULL;
1009
0
    xmlChar *resultPtr = NULL;
1010
0
    size_t count = 0;
1011
0
    size_t msgLen = 0;
1012
0
    size_t resultLen = 0;
1013
1014
0
    if (!msg || !*msg)
1015
0
        return(NULL);
1016
1017
0
    for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1018
0
        ++msgLen;
1019
0
        if (*msgPtr == '%')
1020
0
            ++count;
1021
0
    }
1022
1023
0
    if (count == 0)
1024
0
        return(*msg);
1025
1026
0
    if ((count > INT_MAX) || (msgLen > INT_MAX - count))
1027
0
        return(NULL);
1028
0
    resultLen = msgLen + count + 1;
1029
0
    result = (xmlChar *) xmlMallocAtomic(resultLen);
1030
0
    if (result == NULL) {
1031
        /* Clear *msg to prevent format string vulnerabilities in
1032
           out-of-memory situations. */
1033
0
        xmlFree(*msg);
1034
0
        *msg = NULL;
1035
0
        xmlErrMemory(NULL, NULL);
1036
0
        return(NULL);
1037
0
    }
1038
1039
0
    for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1040
0
        *resultPtr = *msgPtr;
1041
0
        if (*msgPtr == '%')
1042
0
            *(++resultPtr) = '%';
1043
0
    }
1044
0
    result[resultLen - 1] = '\0';
1045
1046
0
    xmlFree(*msg);
1047
0
    *msg = result;
1048
1049
0
    return *msg;
1050
0
}
1051