Coverage Report

Created: 2025-07-11 06:53

/src/htslib/textutils.c
Line
Count
Source (jump to first uncovered line)
1
/*  textutils.c -- non-bioinformatics utility routines for text etc.
2
3
    Copyright (C) 2016, 2018-2020 Genome Research Ltd.
4
5
    Author: John Marshall <jm18@sanger.ac.uk>
6
7
Permission is hereby granted, free of charge, to any person obtaining a copy
8
of this software and associated documentation files (the "Software"), to deal
9
in the Software without restriction, including without limitation the rights
10
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
copies of the Software, and to permit persons to whom the Software is
12
furnished to do so, subject to the following conditions:
13
14
The above copyright notice and this permission notice shall be included in
15
all copies or substantial portions of the Software.
16
17
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23
DEALINGS IN THE SOFTWARE.  */
24
25
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
26
#include <config.h>
27
28
#include <stdio.h>
29
#include <string.h>
30
31
#include "htslib/hfile.h"
32
#include "htslib/kstring.h"
33
#include "htslib/sam.h"  // For stringify_argv() declaration
34
35
#include "hts_internal.h"
36
37
/* Convert a single hexadecimal digit character to its numeric value.
 *
 * Returns 0..15 for '0'-'9', 'a'-'f' or 'A'-'F', and -1 for any other
 * character.  In particular dehex('\0') is -1, which lets callers scan
 * NUL-terminated text without a separate end-of-string check.
 */
static int dehex(char c)
{
    if ('0' <= c && c <= '9') return c - '0';
    if ('a' <= c && c <= 'f') return 10 + (c - 'a');
    if ('A' <= c && c <= 'F') return 10 + (c - 'A');
    return -1;
}
44
45
/* Decode %XX percent-escapes in the NUL-terminated string s.
 *
 * The decoded bytes are written to dest followed by a terminating NUL,
 * and the number of decoded bytes (excluding that NUL) is stored in
 * *destlen.  A '%' that is not followed by two hex digits is copied
 * through unchanged.  The output is never longer than the input, so dest
 * needs room for strlen(s) + 1 bytes.
 *
 * Always returns 0.
 */
int hts_decode_percent(char *dest, size_t *destlen, const char *s)
{
    char *out = dest;

    for (;;) {
        int high, low;
        char ch = *s;

        if (ch == '\0') break;

        // dehex('\0') is -1, so the && chain never reads past the terminator
        if (ch == '%' && (high = dehex(s[1])) >= 0 && (low = dehex(s[2])) >= 0) {
            *out++ = (high << 4) | low;
            s += 3;
            continue;
        }

        *out++ = ch;
        s++;
    }

    *out = '\0';
    *destlen = out - dest;
    return 0;
}
62
63
/* Map one character of the standard base64 alphabet to its 6-bit value.
 *
 * 'A'-'Z' give 0..25, 'a'-'z' give 26..51, '0'-'9' give 52..61, '+' is 62
 * and '/' is 63.  Every other character (including '=' and '\0') gives -1.
 */
static int debase64(char c)
{
    if ('A' <= c && c <= 'Z') return c - 'A';
    if ('a' <= c && c <= 'z') return 26 + (c - 'a');
    if ('0' <= c && c <= '9') return 52 + (c - '0');

    switch (c) {
    case '+': return 62;
    case '/': return 63;
    default:  return -1;
    }
}
72
73
/* Return a buffer size, in bytes, for decoding len base64 characters.
 *
 * The result is three bytes for each group counted by (len + 2) / 4.
 */
size_t hts_base64_decoded_length(size_t len)
{
    return ((len + 2) / 4) * 3;
}
78
79
/* Decode base64 text from s into dest.
 *
 * Decoding stops at the first character that is not part of the base64
 * alphabet (so '=' padding or the terminating NUL both end the input).
 * Complete groups of four characters produce three bytes; a trailing
 * partial group of two or three characters produces one or two bytes.
 * The number of bytes written is stored in *destlen.  No NUL terminator
 * is appended to dest.
 *
 * Always returns 0.
 */
int hts_decode_base64(char *dest, size_t *destlen, const char *s)
{
    char *out = dest;
    int v[4];
    int n;

    for (;;) {
        // Collect up to four sextets, stopping at the first invalid character
        for (n = 0; n < 4; n++) {
            v[n] = debase64(s[n]);
            if (v[n] < 0) break;
        }
        if (n < 4) break;
        s += 4;

        *out++ = (v[0] << 2) | (v[1] >> 4);
        *out++ = (v[1] << 4) | (v[2] >> 2);
        *out++ = (v[2] << 6) | v[3];
    }

    // n is now the number of valid sextets in the final, incomplete group
    if (n >= 2) *out++ = (v[0] << 2) | (v[1] >> 4);
    if (n >= 3) *out++ = (v[1] << 4) | (v[2] >> 2);

    *destlen = out - dest;
    return 0;
}
102
103
/* Write the UTF-8 encoding of code point x at s.
 *
 * Emits one byte for x < 0x80, two for x < 0x800, three for x < 0x10000
 * and four otherwise.  Returns a pointer just past the last byte written;
 * no NUL terminator is added.
 */
static char *encode_utf8(char *s, unsigned x)
{
    if (x < 0x80) {
        s[0] = x;
        return s + 1;
    }

    if (x < 0x800) {
        s[0] = 0xC0 | (x >> 6);
        s[1] = 0x80 | (x & 0x3F);
        return s + 2;
    }

    if (x < 0x10000) {
        s[0] = 0xE0 | (x >> 12);
        s[1] = 0x80 | ((x >> 6) & 0x3F);
        s[2] = 0x80 | (x & 0x3F);
        return s + 3;
    }

    s[0] = 0xF0 | (x >> 18);
    s[1] = 0x80 | ((x >> 12) & 0x3F);
    s[2] = 0x80 | ((x >> 6) & 0x3F);
    s[3] = 0x80 | (x & 0x3F);
    return s + 4;
}
124
125
/* Decode the body of a JSON string in place.
 *
 * s points just after the opening double quote.  Backslash escapes are
 * decoded and the result is written back over the input starting at s
 * (the decoded text is never longer than the encoded text), followed by
 * a NUL terminator.
 *
 * Returns a pointer to where scanning should resume: just past the closing
 * quote, or at the input's NUL terminator if the string was unterminated.
 *
 * \b \f \n \r \t map to their control characters and \uXXXX is replaced by
 * the UTF-8 encoding of that code point.  Any other escaped character,
 * including a \u that is not followed by four hex digits, stands for
 * itself.
 */
static char *sscan_string(char *s)
{
    char *d = s;
    int d1, d2, d3, d4;

    for (;;) switch (*s) {
    case '\\':
        switch (s[1]) {
        case '\0': *d = '\0'; return s+1;
        case 'b': *d++ = '\b'; s += 2; break;
        case 'f': *d++ = '\f'; s += 2; break;
        case 'n': *d++ = '\n'; s += 2; break;
        case 'r': *d++ = '\r'; s += 2; break;
        case 't': *d++ = '\t'; s += 2; break;
        default:  *d++ = s[1]; s += 2; break;
        case 'u':
            // dehex('\0') is -1, so this chain stops at the end of the input
            if ((d1 = dehex(s[2])) >= 0 && (d2 = dehex(s[3])) >= 0 &&
                (d3 = dehex(s[4])) >= 0 && (d4 = dehex(s[5])) >= 0) {
                d = encode_utf8(d, d1 << 12 | d2 << 8 | d3 << 4 | d4);
                s += 6;
            }
            else {
                // Malformed \u escape: s must still advance, otherwise this
                // loop would revisit the same backslash forever.  Treat it
                // like any other unrecognised escape.
                *d++ = 'u';
                s += 2;
            }
            break;
        }
        break;

    case '"':
        *d = '\0';
        return s+1;

    case '\0':
        *d = '\0';
        return s;

    default:
        *d++ = *s++;
        break;
    }
}
163
164
/* Read the body of a JSON string from fp, appending the decoded text to d.
 *
 * Reading starts just after the opening double quote and stops after the
 * closing quote or at EOF.  \b \f \n \r \t are decoded to their control
 * characters, \uXXXX is appended as UTF-8, and any other escaped character
 * is appended as itself.  If a \u escape is cut short by EOF or a non-hex
 * character, the characters read for it are discarded.
 *
 * Returns 0 on success, or -1 if any append to d reported failure.
 */
static int fscan_string(hFILE *fp, kstring_t *d)
{
    int failed = 0;
    int ch;

    for (;;) {
        ch = hgetc(fp);
        if (ch == EOF || ch == '"') break;

        if (ch != '\\') {
            if (kputc(ch, d) < 0) failed = 1;
            continue;
        }

        // Escape sequence: look at the character after the backslash
        ch = hgetc(fp);
        if (ch == EOF) break;

        if (ch == 'u') {
            int h1, h2, h3, h4;

            if ((ch = hgetc(fp)) != EOF && (h1 = dehex(ch)) >= 0 &&
                (ch = hgetc(fp)) != EOF && (h2 = dehex(ch)) >= 0 &&
                (ch = hgetc(fp)) != EOF && (h3 = dehex(ch)) >= 0 &&
                (ch = hgetc(fp)) != EOF && (h4 = dehex(ch)) >= 0) {
                char utf8[8];
                char *end = encode_utf8(utf8, h1 << 12 | h2 << 8 | h3 << 4 | h4);
                if (kputsn(utf8, end - utf8, d) < 0) failed = 1;
            }
            continue;
        }

        switch (ch) {
        case 'b': ch = '\b'; break;
        case 'f': ch = '\f'; break;
        case 'n': ch = '\n'; break;
        case 'r': ch = '\r'; break;
        case 't': ch = '\t'; break;
        default:  break;  // any other escaped character stands for itself
        }
        if (kputc(ch, d) < 0) failed = 1;
    }

    return failed ? -1 : 0;
}
201
202
/* Classify an unquoted JSON token from its text in token->str.
 *
 * Returns 'n' if the text starts with '-' or a decimal digit, 'b' if it is
 * exactly "true" or "false", '.' if it is exactly "null", and '?' for
 * anything else.
 */
static char token_type(hts_json_token *token)
{
    const char *text = token->str;

    if (*text == '-' || (*text >= '0' && *text <= '9'))
        return 'n';

    if (strcmp(text, "true") == 0 || strcmp(text, "false") == 0)
        return 'b';

    if (strcmp(text, "null") == 0)
        return '.';

    return '?';
}
221
222
HTSLIB_EXPORT
223
0
hts_json_token * hts_json_alloc_token(void) {
224
0
    return calloc(1, sizeof(hts_json_token));
225
0
}
226
227
HTSLIB_EXPORT
228
0
char hts_json_token_type(hts_json_token *token) {
229
0
    return token->type;
230
0
}
231
232
HTSLIB_EXPORT
233
0
void hts_json_free_token(hts_json_token *token) {
234
0
    free(token);
235
0
}
236
237
HTSLIB_EXPORT
238
0
char *hts_json_token_str(hts_json_token *token) {
239
0
    return token->str;
240
0
}
241
242
HTSLIB_EXPORT
243
char hts_json_snext(char *str, size_t *state, hts_json_token *token)
244
0
{
245
0
    char *s = &str[*state >> 2];
246
0
    int hidden = *state & 3;
247
248
0
    if (hidden) {
249
0
        *state &= ~3;
250
0
        return token->type = "?}]?"[hidden];
251
0
    }
252
253
0
#define STATE(s,h)  (((s) - str) << 2 | (h))
254
255
0
    for (;;) switch (*s) {
256
0
    case ' ':
257
0
    case '\t':
258
0
    case '\r':
259
0
    case '\n':
260
0
    case ',':
261
0
    case ':':
262
0
        s++;
263
0
        continue;
264
265
0
    case '\0':
266
0
        return token->type = '\0';
267
268
0
    case '{':
269
0
    case '[':
270
0
    case '}':
271
0
    case ']':
272
0
        *state = STATE(s+1, 0);
273
0
        return token->type = *s;
274
275
0
    case '"':
276
0
        token->str = s+1;
277
0
        *state = STATE(sscan_string(s+1), 0);
278
0
        return token->type = 's';
279
280
0
    default:
281
0
        token->str = s;
282
0
        s += strcspn(s, " \t\r\n,]}");
283
0
        hidden = (*s == '}')? 1 : (*s == ']')? 2 : 0;
284
0
        if (*s != '\0') *s++ = '\0';
285
0
        *state = STATE(s, hidden);
286
0
        return token->type = token_type(token);
287
0
    }
288
289
0
#undef STATE
290
0
}
291
292
HTSLIB_EXPORT
293
char hts_json_fnext(struct hFILE *fp, hts_json_token *token, kstring_t *kstr)
294
0
{
295
0
    char peek;
296
0
    int c;
297
298
0
    for (;;) switch (c = hgetc(fp)) {
299
0
    case ' ':
300
0
    case '\t':
301
0
    case '\r':
302
0
    case '\n':
303
0
    case ',':
304
0
    case ':':
305
0
        continue;
306
307
0
    case EOF:
308
0
        return token->type = '\0';
309
310
0
    case '{':
311
0
    case '[':
312
0
    case '}':
313
0
    case ']':
314
0
        return token->type = c;
315
316
0
    case '"':
317
0
        kstr->l = 0;
318
0
        fscan_string(fp, kstr);
319
0
        if (kstr->l == 0) kputsn("", 0, kstr);
320
0
        token->str = kstr->s;
321
0
        return token->type = 's';
322
323
0
    default:
324
0
        kstr->l = 0;
325
0
        kputc(c, kstr);
326
0
        while (hpeek(fp, &peek, 1) == 1 && !strchr(" \t\r\n,]}", peek)) {
327
0
            if ((c = hgetc(fp)) == EOF) break;
328
0
            kputc(c, kstr);
329
0
        }
330
0
        token->str = kstr->s;
331
0
        return token->type = token_type(token);
332
0
    }
333
0
}
334
335
336
typedef char hts_json_nextfn(void *arg1, void *arg2, hts_json_token *token);
337
338
static char skip_value(char type, hts_json_nextfn *next, void *arg1, void *arg2)
339
0
{
340
0
    hts_json_token token;
341
0
    int level;
342
343
0
    switch (type? type : next(arg1, arg2, &token)) {
344
0
    case '\0':
345
0
        return '\0';
346
347
0
    case '?':
348
0
    case '}':
349
0
    case ']':
350
0
        return '?';
351
352
0
    case '{':
353
0
    case '[':
354
0
        level = 1;
355
0
        break;
356
357
0
    default:
358
0
        return 'v';
359
0
    }
360
361
0
    while (level > 0)
362
0
        switch (next(arg1, arg2, &token)) {
363
0
        case '\0':
364
0
            return '\0';
365
366
0
        case '?':
367
0
            return '?';
368
369
0
        case '{':
370
0
        case '[':
371
0
            level++;
372
0
            break;
373
374
0
        case '}':
375
0
        case ']':
376
0
            --level;
377
0
            break;
378
379
0
        default:
380
0
            break;
381
0
        }
382
383
0
    return 'v';
384
0
}
385
386
/* Adapter giving hts_json_snext() the hts_json_nextfn signature:
 * arg1 is the string being parsed and arg2 points to its scan state.
 */
static char snext(void *arg1, void *arg2, hts_json_token *token)
{
    char *str = arg1;
    size_t *state = arg2;

    return hts_json_snext(str, state, token);
}
390
391
HTSLIB_EXPORT
392
char hts_json_sskip_value(char *str, size_t *state, char type)
393
0
{
394
0
    return skip_value(type, snext, str, state);
395
0
}
396
397
/* Adapter giving hts_json_fnext() the hts_json_nextfn signature:
 * arg1 is the file being read and arg2 is the scratch kstring.
 */
static char fnext(void *arg1, void *arg2, hts_json_token *token)
{
    struct hFILE *fp = arg1;
    kstring_t *scratch = arg2;

    return hts_json_fnext(fp, token, scratch);
}
401
402
HTSLIB_EXPORT
403
char hts_json_fskip_value(struct hFILE *fp, char type)
404
0
{
405
0
    kstring_t str = { 0, 0, NULL };
406
0
    char ret = skip_value(type, fnext, fp, &str);
407
0
    free(str.s);
408
0
    return ret;
409
0
}
410
411
/*
 * Join a command line into a single string, for use when building the
 * CL tag of an @PG record.
 *
 * The argc entries of argv are concatenated with a single space between
 * consecutive arguments, and any tab character inside an argument is
 * replaced by a space.
 *
 * Returns a malloc()ed string that the caller must free(),
 *         or NULL if the allocation failed.
 */
char *stringify_argv(int argc, char *argv[]) {
    size_t total = 1;   /* room for the terminating NUL */
    char *joined, *out;
    const char *in;
    int arg;

    /* Measure: every argument plus one separator between neighbours */
    for (arg = 0; arg < argc; arg++)
        total += strlen(argv[arg]) + (arg > 0 ? 1 : 0);

    joined = malloc(total);
    if (joined == NULL)
        return NULL;

    /* Fill */
    out = joined;
    for (arg = 0; arg < argc; arg++) {
        if (arg > 0) *out++ = ' ';
        for (in = argv[arg]; *in != '\0'; in++)
            *out++ = (*in == '\t') ? ' ' : *in;
    }
    *out = '\0';

    return joined;
}
449
450
/* Utility function for printing possibly malicious text data
 *
 * Writes an escaped, NUL-terminated rendering of s into buf and returns buf.
 *
 *   buf, buflen  Output buffer and its size in bytes.
 *   quote        If non-zero, the output is wrapped in this character and
 *                occurrences of it in s are backslash-escaped.
 *   s, len       Input text.  If len is SIZE_MAX, s is read up to its NUL
 *                terminator; otherwise exactly len bytes are read.
 *
 * Newline, carriage return, tab, NUL and backslash are written as \n, \r,
 * \t, \0 and \\; other characters for which isprint_c() is false are
 * written as \xHH.
 *
 * If the rendering does not fit, it is cut short and "..." is appended
 * after the closing quote (if any).  When buf is too small even for that
 * marker, an empty (but still quoted) string is produced instead.  When it
 * cannot hold both quotes and a terminator, an empty string is produced;
 * nothing at all is written if buflen is 0.
 */
const char *
hts_strprint(char *buf, size_t buflen, char quote, const char *s, size_t len)
{
    const char *slim = (len < SIZE_MAX)? &s[len] : NULL;
    char *t = buf, *bufend = buf + buflen;

    size_t qlen = quote? 1 : 0;

    // The code below always keeps room for a closing quote and the NUL, so
    // the buffer must be able to hold both quotes plus the terminator.
    if (buflen <= 2 * qlen) {
        if (buflen > 0) buf[0] = '\0';
        return buf;
    }

    if (quote) *t++ = quote;

    for (; slim? (s < slim) : (*s); s++) {
        char c;
        size_t clen;
        switch (*s) {
        case '\n': c = 'n'; clen = 2; break;
        case '\r': c = 'r'; clen = 2; break;
        case '\t': c = 't'; clen = 2; break;
        case '\0': c = '0'; clen = 2; break;
        case '\\': c = '\\'; clen = 2; break;
        default:
            c = *s;
            if (c == quote) clen = 2;
            else clen = isprint_c(c)? 1 : 4;
            break;
        }

        if (t-buf + clen + qlen >= buflen) {
            // Out of space: back up far enough to fit the closing quote and
            // "...", but never past the opening quote or the start of buf.
            while (t > buf + qlen && t-buf + 3 + qlen >= buflen) t--;
            if (quote) *t++ = quote;
            if (t-buf + 3 < buflen) strcpy(t, "...");
            else *t = '\0';  // no room for the marker in a tiny buffer
            return buf;
        }

        if (clen == 4) {
            snprintf(t, bufend - t, "\\x%02X", (unsigned char) c);
            t += clen;
        }
        else {
            if (clen == 2) *t++ = '\\';
            *t++ = c;
        }
    }

    if (quote) *t++ = quote;
    *t = '\0';
    return buf;
}