Coverage Report

Created: 2025-11-16 06:31

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/textutils.c
Line
Count
Source
1
/*  textutils.c -- non-bioinformatics utility routines for text etc.
2
3
    Copyright (C) 2016, 2018-2020 Genome Research Ltd.
4
5
    Author: John Marshall <jm18@sanger.ac.uk>
6
7
Permission is hereby granted, free of charge, to any person obtaining a copy
8
of this software and associated documentation files (the "Software"), to deal
9
in the Software without restriction, including without limitation the rights
10
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
copies of the Software, and to permit persons to whom the Software is
12
furnished to do so, subject to the following conditions:
13
14
The above copyright notice and this permission notice shall be included in
15
all copies or substantial portions of the Software.
16
17
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23
DEALINGS IN THE SOFTWARE.  */
24
25
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
26
#include <config.h>
27
28
#include <stdio.h>
29
#include <string.h>
30
31
#include "htslib/hfile.h"
32
#include "htslib/kstring.h"
33
#include "htslib/sam.h"  // For stringify_argv() declaration
34
35
#include "hts_internal.h"
36
37
/* Map one hexadecimal digit (either case) to its numeric value 0..15.
 * Any other character, including '\0', yields -1. */
static int dehex(char c)
{
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'A' && c <= 'F') return 10 + (c - 'A');
    if (c >= 'a' && c <= 'f') return 10 + (c - 'a');
    return -1;
}
44
45
/* Copy s to dest, replacing each well-formed "%XX" escape (two hex digits)
 * with the byte it encodes.  A '%' that is not followed by two hex digits is
 * copied through unchanged.
 *
 * dest is always NUL-terminated and *destlen receives the number of bytes
 * written before the terminator.  Each step writes one byte and consumes at
 * least one, so the output is never longer than the input.
 *
 * Always returns 0.
 */
int hts_decode_percent(char *dest, size_t *destlen, const char *s)
{
    char *out = dest;

    while (*s != '\0') {
        int hi, lo;

        // dehex('\0') is -1, so the && chain never reads past the terminator
        if (s[0] == '%' && (hi = dehex(s[1])) >= 0 && (lo = dehex(s[2])) >= 0) {
            *out++ = (hi << 4) | lo;
            s += 3;
            continue;
        }

        *out++ = *s++;
    }

    *out = '\0';
    *destlen = out - dest;
    return 0;
}
62
63
/* Map one character of the standard base64 alphabet (A-Z, a-z, 0-9, '+', '/')
 * to its 6-bit value.  Anything else, including '=' and '\0', yields -1. */
static int debase64(char c)
{
    if (c >= 'A' && c <= 'Z') return c - 'A';
    if (c >= 'a' && c <= 'z') return 26 + (c - 'a');
    if (c >= '0' && c <= '9') return 52 + (c - '0');

    switch (c) {
    case '+': return 62;
    case '/': return 63;
    default:  return -1;
    }
}
72
73
/* Return 3 * ((len + 2) / 4) for a base64 text of len characters, i.e. three
 * bytes per quartet with the quartet count rounded as (len + 2) / 4.
 *
 * The quartet count is computed as len/4 plus a rounding term so that the
 * "+ 2" cannot wrap around when len is close to SIZE_MAX; the result is the
 * same as (len + 2) / 4 for every len where that expression does not overflow.
 */
size_t hts_base64_decoded_length(size_t len)
{
    size_t nquartets = len / 4 + (len % 4 + 2) / 4;
    return 3 * nquartets;
}
78
79
/* Decode base64 text from s into dest, stopping at the first character that
 * is not in the base64 alphabet (e.g. '=' padding or the terminating NUL).
 *
 * Complete groups of four characters produce three bytes.  A trailing group
 * of two valid characters produces one byte, of three produces two bytes; a
 * single leftover character produces nothing.
 *
 * dest is NOT NUL-terminated; *destlen receives the number of bytes written.
 * Always returns 0.
 */
int hts_decode_base64(char *dest, size_t *destlen, const char *s)
{
    char *out = dest;

    for (;;) {
        int q[4];
        int n;

        // Read up to four sextets; the offending character is consumed too
        for (n = 0; n < 4; n++) {
            q[n] = debase64(*s++);
            if (q[n] < 0) break;
        }

        if (n < 4) {
            // Flush whatever whole bytes the partial group contains
            if (n >= 2) *out++ = (q[0] << 2) | (q[1] >> 4);
            if (n >= 3) *out++ = (q[1] << 4) | (q[2] >> 2);
            break;
        }

        *out++ = (q[0] << 2) | (q[1] >> 4);
        *out++ = (q[1] << 4) | (q[2] >> 2);
        *out++ = (q[2] << 6) | q[3];
    }

    *destlen = out - dest;
    return 0;
}
102
103
/* Write the UTF-8 encoding of code point x at s and return a pointer just
 * past the last byte written (1 to 4 bytes, chosen by the magnitude of x).
 * No validation of x is performed and no NUL terminator is added. */
static char *encode_utf8(char *s, unsigned x)
{
    if (x < 0x80) {
        *s++ = x;
    }
    else if (x < 0x800) {
        *s++ = 0xC0 | (x >> 6);
        *s++ = 0x80 | (x & 0x3F);
    }
    else if (x < 0x10000) {
        *s++ = 0xE0 | (x >> 12);
        *s++ = 0x80 | ((x >> 6) & 0x3F);
        *s++ = 0x80 | (x & 0x3F);
    }
    else {
        *s++ = 0xF0 | (x >> 18);
        *s++ = 0x80 | ((x >> 12) & 0x3F);
        *s++ = 0x80 | ((x >> 6) & 0x3F);
        *s++ = 0x80 | (x & 0x3F);
    }

    return s;
}
124
125
/* Decode, in place, the body of a JSON-style string starting at s (the
 * character after the opening quote).  Backslash escapes are translated
 * (\b \f \n \r \t, \uXXXX to UTF-8, any other \c to c itself) and the decoded
 * text is NUL-terminated.  The write pointer never overtakes the read
 * pointer, so decoding over the input is safe.
 *
 * Returns a pointer to where scanning should resume: just past the closing
 * '"', or at the input's NUL terminator if the string was unterminated
 * (including when it ends with a lone backslash).
 */
static char *sscan_string(char *s)
{
    char *d = s;
    int d1, d2, d3, d4;

    for (;;) switch (*s) {
    case '\\':
        switch (s[1]) {
        case '\0': *d = '\0'; return s+1;
        case 'b': *d++ = '\b'; s += 2; break;
        case 'f': *d++ = '\f'; s += 2; break;
        case 'n': *d++ = '\n'; s += 2; break;
        case 'r': *d++ = '\r'; s += 2; break;
        case 't': *d++ = '\t'; s += 2; break;
        default:  *d++ = s[1]; s += 2; break;
        case 'u':
            // dehex('\0') is -1, so the && chain stops at the terminator
            if ((d1 = dehex(s[2])) >= 0 && (d2 = dehex(s[3])) >= 0 &&
                (d3 = dehex(s[4])) >= 0 && (d4 = dehex(s[5])) >= 0) {
                d = encode_utf8(d, d1 << 12 | d2 << 8 | d3 << 4 | d4);
                s += 6;
            }
            else {
                // Malformed \u escape: s must still advance or this loop
                // would never terminate.  Treat it like any other unknown
                // escape and copy the 'u' through literally.
                *d++ = 'u';
                s += 2;
            }
            break;
        }
        break;

    case '"':
        *d = '\0';
        return s+1;

    case '\0':
        *d = '\0';
        return s;

    default:
        *d++ = *s++;
        break;
    }
}
163
164
/* Read the body of a JSON-style string from fp (positioned just after the
 * opening quote) and append the decoded text to d.  Reading stops after the
 * closing '"' has been consumed, or at EOF.
 *
 * Escapes \b \f \n \r \t are translated, \uXXXX is appended as UTF-8, and any
 * other \c appends c itself.  A \u escape that hits EOF or a non-hex
 * character appends nothing; the characters read so far are discarded.
 *
 * Returns 0 on success, or -1 if any append to d reported failure.
 */
static int fscan_string(hFILE *fp, kstring_t *d)
{
    uint32_t failed = 0;
    int ch;

    for (;;) {
        ch = hgetc(fp);
        if (ch == EOF || ch == '"') break;

        if (ch != '\\') {
            failed |= kputc(ch, d) < 0;
            continue;
        }

        // Backslash escape: look at the character that follows
        if ((ch = hgetc(fp)) == EOF) break;

        if (ch == 'u') {
            int hex[4];
            int n;

            for (n = 0; n < 4; n++) {
                if ((ch = hgetc(fp)) == EOF) break;
                if ((hex[n] = dehex(ch)) < 0) break;
            }

            if (n == 4) {
                char buf[8];
                char *lim = encode_utf8(buf, hex[0] << 12 | hex[1] << 8 |
                                             hex[2] << 4 | hex[3]);
                failed |= kputsn(buf, lim - buf, d) < 0;
            }
            continue;
        }

        switch (ch) {
        case 'b': ch = '\b'; break;
        case 'f': ch = '\f'; break;
        case 'n': ch = '\n'; break;
        case 'r': ch = '\r'; break;
        case 't': ch = '\t'; break;
        default:  break;  // unknown escape: keep the character as-is
        }
        failed |= kputc(ch, d) < 0;
    }

    return failed == 0 ? 0 : -1;
}
201
202
/* Classify an unquoted token by its text (token->str):
 *   'b' for exactly "true" or "false",
 *   '.' for exactly "null",
 *   'n' if it starts with '-' or a decimal digit (the rest is not checked),
 *   '?' for anything else.
 */
static char token_type(hts_json_token *token)
{
    const char *text = token->str;
    char first = text[0];

    if (first == '-' || (first >= '0' && first <= '9'))
        return 'n';

    if (strcmp(text, "false") == 0 || strcmp(text, "true") == 0)
        return 'b';

    if (strcmp(text, "null") == 0)
        return '.';

    return '?';
}
221
222
HTSLIB_EXPORT
223
0
/* Allocate a zero-initialised token.  Returns NULL if allocation fails. */
hts_json_token * hts_json_alloc_token(void)
{
    hts_json_token *token = calloc(1, sizeof *token);
    return token;
}
226
227
HTSLIB_EXPORT
228
0
/* Accessor: return the type character stored in the token. */
char hts_json_token_type(hts_json_token *token)
{
    char type = token->type;
    return type;
}
231
232
HTSLIB_EXPORT
233
0
/* Release the token structure itself with free(); the text that token->str
 * points at is not freed here. */
void hts_json_free_token(hts_json_token *token)
{
    free(token);
}
236
237
HTSLIB_EXPORT
238
0
/* Accessor: return the text pointer stored in the token. */
char *hts_json_token_str(hts_json_token *token)
{
    char *text = token->str;
    return text;
}
241
242
HTSLIB_EXPORT
243
/* Return the type of the next JSON token in the writable buffer str, storing
 * it in token->type as well.  String and bare-word tokens are NUL-terminated
 * in place and token->str is pointed at them.
 *
 * *state packs the resume offset into str (upper bits) with a 2-bit code for
 * a "hidden" bracket: when a bare word is ended by '}' or ']', that bracket
 * is overwritten by the NUL terminator, so it is remembered in the low bits
 * (1 for '}', 2 for ']') and reported by the following call.
 *
 * Returns '\0' at end of input (leaving *state unchanged), one of "{}[]" for
 * brackets, 's' for a quoted string, or whatever token_type() reports for a
 * bare word.
 */
char hts_json_snext(char *str, size_t *state, hts_json_token *token)
{
    int hidden = *state & 3;
    char *p;

    if (hidden) {
        // Deliver the bracket swallowed by the previous bare word
        *state &= ~3;
        return token->type = "?}]?"[hidden];
    }

    // Whitespace, commas and colons are all treated as separators
    p = &str[*state >> 2];
    p += strspn(p, " \t\r\n,:");

    switch (*p) {
    case '\0':
        return token->type = '\0';

    case '{':
    case '[':
    case '}':
    case ']':
        *state = ((p + 1) - str) << 2 | 0;
        return token->type = *p;

    case '"':
        token->str = p + 1;
        *state = (sscan_string(p + 1) - str) << 2 | 0;
        return token->type = 's';

    default:
        break;
    }

    // Bare word: runs up to the next separator or closing bracket
    token->str = p;
    p += strcspn(p, " \t\r\n,]}");

    if (*p == '}') hidden = 1;
    else if (*p == ']') hidden = 2;
    else hidden = 0;

    if (*p != '\0') *p++ = '\0';
    *state = (p - str) << 2 | hidden;
    return token->type = token_type(token);
}
291
292
HTSLIB_EXPORT
293
/* Return the type of the next JSON token read from fp, storing it in
 * token->type as well.  Text for string and bare-word tokens is accumulated
 * in kstr (reset to empty first) and token->str is pointed at kstr->s.
 *
 * Returns '\0' at EOF, one of "{}[]" for brackets, 's' for a quoted string,
 * or whatever token_type() reports for a bare word.  The results of
 * fscan_string() and of the appends to kstr are not checked.
 */
char hts_json_fnext(struct hFILE *fp, hts_json_token *token, kstring_t *kstr)
{
    char ahead;
    int ch;

    // Whitespace, commas and colons are all treated as separators
    do {
        ch = hgetc(fp);
    } while (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' ||
             ch == ',' || ch == ':');

    if (ch == EOF)
        return token->type = '\0';

    if (ch == '{' || ch == '[' || ch == '}' || ch == ']')
        return token->type = ch;

    kstr->l = 0;

    if (ch == '"') {
        fscan_string(fp, kstr);
        // Appending nothing for an empty string, presumably so kstr->s gets allocated
        if (kstr->l == 0) kputsn("", 0, kstr);
        token->str = kstr->s;
        return token->type = 's';
    }

    // Bare word: peek ahead so the terminating character stays unread
    kputc(ch, kstr);
    while (hpeek(fp, &ahead, 1) == 1 && !strchr(" \t\r\n,]}", ahead)) {
        if ((ch = hgetc(fp)) == EOF) break;
        kputc(ch, kstr);
    }
    token->str = kstr->s;
    return token->type = token_type(token);
}
334
335
336
typedef char hts_json_nextfn(void *arg1, void *arg2, hts_json_token *token);
337
338
static char skip_value(char type, hts_json_nextfn *next, void *arg1, void *arg2)
339
0
{
340
0
    hts_json_token token;
341
0
    int level;
342
343
0
    switch (type? type : next(arg1, arg2, &token)) {
344
0
    case '\0':
345
0
        return '\0';
346
347
0
    case '?':
348
0
    case '}':
349
0
    case ']':
350
0
        return '?';
351
352
0
    case '{':
353
0
    case '[':
354
0
        level = 1;
355
0
        break;
356
357
0
    default:
358
0
        return 'v';
359
0
    }
360
361
0
    while (level > 0)
362
0
        switch (next(arg1, arg2, &token)) {
363
0
        case '\0':
364
0
            return '\0';
365
366
0
        case '?':
367
0
            return '?';
368
369
0
        case '{':
370
0
        case '[':
371
0
            level++;
372
0
            break;
373
374
0
        case '}':
375
0
        case ']':
376
0
            --level;
377
0
            break;
378
379
0
        default:
380
0
            break;
381
0
        }
382
383
0
    return 'v';
384
0
}
385
386
/* hts_json_nextfn adaptor for in-memory parsing:
 * arg1 is the text buffer, arg2 the size_t parser state. */
static char snext(void *arg1, void *arg2, hts_json_token *token)
{
    char *str = arg1;
    size_t *state = arg2;

    return hts_json_snext(str, state, token);
}
390
391
HTSLIB_EXPORT
392
char hts_json_sskip_value(char *str, size_t *state, char type)
393
0
{
394
0
    return skip_value(type, snext, str, state);
395
0
}
396
397
/* hts_json_nextfn adaptor for file parsing:
 * arg1 is the hFILE, arg2 the kstring_t scratch buffer. */
static char fnext(void *arg1, void *arg2, hts_json_token *token)
{
    struct hFILE *fp = arg1;
    kstring_t *scratch = arg2;

    return hts_json_fnext(fp, token, scratch);
}
401
402
HTSLIB_EXPORT
403
char hts_json_fskip_value(struct hFILE *fp, char type)
404
0
{
405
0
    kstring_t str = { 0, 0, NULL };
406
0
    char ret = skip_value(type, fnext, fp, &str);
407
0
    free(str.s);
408
0
    return ret;
409
0
}
410
411
/*
 * A function to help with construction of CL tags in @PG records.
 * Joins the argc strings of argv into one string, separated by single
 * spaces; any tab character inside an argument is replaced by a space.
 * The result should be deallocated with free() by the calling function.
 *
 * Returns malloced char * on success (an empty string if argc <= 0)
 *         NULL on allocation failure
 */
char *stringify_argv(int argc, char *argv[])
{
    size_t total = 1;  /* room for the NUL terminator */
    char *joined, *out;
    int i;

    /* Measure: every argument plus one separator between neighbours */
    for (i = 0; i < argc; i++)
        total += strlen(argv[i]) + (i > 0 ? 1 : 0);

    joined = malloc(total);
    if (joined == NULL)
        return NULL;

    /* Copy */
    out = joined;
    for (i = 0; i < argc; i++) {
        const char *in;

        if (i > 0) *out++ = ' ';
        for (in = argv[i]; *in != '\0'; in++)
            *out++ = (*in == '\t') ? ' ' : *in;
    }
    *out = '\0';

    return joined;
}
449
450
/* Utility function for printing possibly malicious text data
 *
 * Writes a printable, NUL-terminated rendering of s into buf[0..buflen-1]:
 *  - newline, CR, tab, NUL and backslash become \n \r \t \0 \\;
 *  - the quote character (if non-zero) is backslash-escaped, and the whole
 *    text is wrapped in a pair of quote characters;
 *  - any other character that isprint_c() rejects becomes \xHH.
 *
 * If len is SIZE_MAX, s is read up to its NUL terminator; otherwise exactly
 * len bytes are read.
 *
 * When the rendering does not fit, already-written text is trimmed and the
 * result ends with "..." placed after the closing quote.  If buf is too
 * small even for that, only as many '.' characters as fit are written.
 * Nothing is ever written outside buf[0..buflen-1].
 *
 * Returns buf, or a pointer to a static empty string when buflen is 0.
 */
const char *
hts_strprint(char *buf, size_t buflen, char quote, const char *s, size_t len)
{
    const char *slim = (len < SIZE_MAX)? &s[len] : NULL;
    char *t = buf, *bufend = buf + buflen;
    size_t qlen = quote? 1 : 0;
    size_t ndots;

    if (buflen == 0) return "";

    // Need room for at least both quotes and the terminator
    if (buflen < 2 * qlen + 1) goto too_small;

    if (quote) *t++ = quote;

    for (; slim? (s < slim) : (*s); s++) {
        char c;
        size_t clen;
        switch (*s) {
        case '\n': c = 'n'; clen = 2; break;
        case '\r': c = 'r'; clen = 2; break;
        case '\t': c = 't'; clen = 2; break;
        case '\0': c = '0'; clen = 2; break;
        case '\\': c = '\\'; clen = 2; break;
        default:
            c = *s;
            if (c == quote) clen = 2;
            else clen = isprint_c(c)? 1 : 4;
            break;
        }

        // Always keep room for the closing quote and the terminator
        if ((size_t) (t - buf) + clen + qlen >= buflen) {
            // Trim until closing quote + "..." + NUL fit, but never step
            // back over the opening quote (or below buf itself).
            while (t > buf + qlen && (size_t) (t - buf) + 3 + qlen >= buflen)
                t--;
            if ((size_t) (t - buf) + 3 + qlen >= buflen) goto too_small;

            if (quote) *t++ = quote;
            strcpy(t, "...");
            return buf;
        }

        if (clen == 4) {
            snprintf(t, bufend - t, "\\x%02X", (unsigned char) c);
            t += clen;
        }
        else {
            if (clen == 2) *t++ = '\\';
            *t++ = c;
        }
    }

    if (quote) *t++ = quote;
    *t = '\0';
    return buf;

 too_small:
    // Degenerate buffer: emit a (possibly shortened) bare ellipsis
    ndots = (buflen - 1 < 3)? buflen - 1 : 3;
    memset(buf, '.', ndots);
    buf[ndots] = '\0';
    return buf;
}