Coverage Report

Created: 2023-01-17 06:24

/src/htslib/textutils.c
Line
Count
Source (jump to first uncovered line)
1
/*  textutils.c -- non-bioinformatics utility routines for text etc.
2
3
    Copyright (C) 2016, 2018-2020 Genome Research Ltd.
4
5
    Author: John Marshall <jm18@sanger.ac.uk>
6
7
Permission is hereby granted, free of charge, to any person obtaining a copy
8
of this software and associated documentation files (the "Software"), to deal
9
in the Software without restriction, including without limitation the rights
10
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
copies of the Software, and to permit persons to whom the Software is
12
furnished to do so, subject to the following conditions:
13
14
The above copyright notice and this permission notice shall be included in
15
all copies or substantial portions of the Software.
16
17
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23
DEALINGS IN THE SOFTWARE.  */
24
25
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
26
#include <config.h>
27
28
#include <stdio.h>
29
#include <string.h>
30
31
#include "htslib/hfile.h"
32
#include "htslib/kstring.h"
33
#include "htslib/sam.h"  // For stringify_argv() declaration
34
35
#include "hts_internal.h"
36
37
/* Convert a single hexadecimal digit character to its value (0..15).
 * Accepts both lower- and upper-case letters.  Any other character yields
 * -1, so in particular dehex('\0') == -1, which lets callers stop safely
 * at the end of a string. */
static int dehex(char c)
{
    if ('0' <= c && c <= '9') return c - '0';
    if ('a' <= c && c <= 'f') return 10 + (c - 'a');
    if ('A' <= c && c <= 'F') return 10 + (c - 'A');
    return -1;
}
44
45
/* Decode percent-encoded text.
 *
 * Copies the NUL-terminated string s to dest, replacing each "%XX" (where
 * XX are two hexadecimal digits) with the byte it denotes.  A '%' that is
 * not followed by two hex digits is copied through unchanged.
 *
 * dest receives at most strlen(s) bytes plus a terminating NUL.  The
 * decoded length (excluding the NUL) is stored in *destlen.
 *
 * Always returns 0.
 */
int hts_decode_percent(char *dest, size_t *destlen, const char *s)
{
    char *out = dest;

    while (*s != '\0') {
        int hi = -1, lo = -1;

        /* dehex('\0') is -1, so this never looks past the terminator */
        if (*s == '%') {
            hi = dehex(s[1]);
            if (hi >= 0) lo = dehex(s[2]);
        }

        if (lo >= 0) {
            *out++ = (hi << 4) | lo;
            s += 3;
        }
        else {
            *out++ = *s++;
        }
    }

    *out = '\0';
    *destlen = out - dest;
    return 0;
}
62
63
/* Map one character of the standard base64 alphabet (A-Z, a-z, 0-9, '+',
 * '/') to its 6-bit value.  Anything else, including the '=' padding
 * character and '\0', yields -1. */
static int debase64(char c)
{
    if (c >= 'A' && c <= 'Z') return c - 'A';
    if (c >= 'a' && c <= 'z') return 26 + (c - 'a');
    if (c >= '0' && c <= '9') return 52 + (c - '0');

    switch (c) {
    case '+': return 62;
    case '/': return 63;
    default:  return -1;
    }
}
72
73
/* Size, in bytes, of a buffer able to hold the decoding of len base64
 * characters: three bytes per group of four characters, where a trailing
 * group of two or three characters is counted as a whole group. */
size_t hts_base64_decoded_length(size_t len)
{
    return ((len + 2) / 4) * 3;
}
78
79
/* Decode base64 text.
 *
 * Reads characters from s until the first one that is not in the base64
 * alphabet (such as '=' padding or the terminating NUL), writing the
 * decoded bytes to dest.  Complete groups of four characters produce three
 * bytes; a trailing partial group of two or three characters produces one
 * or two bytes respectively.
 *
 * The number of bytes written is stored in *destlen.  The output is NOT
 * NUL-terminated.
 *
 * Always returns 0.
 */
int hts_decode_base64(char *dest, size_t *destlen, const char *s)
{
    char *out = dest;
    int q0, q1, q2, q3;

    for (;;) {
        /* Stop consuming input at the first invalid character; every
           later sextet of this group is then marked invalid (-1) too. */
        q0 = debase64(*s++);
        q1 = q2 = q3 = -1;
        if (q0 >= 0) q1 = debase64(*s++);
        if (q1 >= 0) q2 = debase64(*s++);
        if (q2 >= 0) q3 = debase64(*s++);
        if (q3 < 0) break;

        *out++ = (q0 << 2) | (q1 >> 4);
        *out++ = (q1 << 4) | (q2 >> 2);
        *out++ = (q2 << 6) | q3;
    }

    /* Flush whatever whole bytes the final, incomplete group contains */
    if (q1 >= 0) *out++ = (q0 << 2) | (q1 >> 4);
    if (q2 >= 0) *out++ = (q1 << 4) | (q2 >> 2);

    *destlen = out - dest;
    return 0;
}
102
103
/* Write the UTF-8 encoding of code point x at s (one to four bytes,
 * chosen by the magnitude of x) and return a pointer just past the last
 * byte written.  No NUL terminator is added and x is not validated. */
static char *encode_utf8(char *s, unsigned x)
{
    if (x < 0x80) {
        s[0] = x;
        return s + 1;
    }

    if (x < 0x800) {
        s[0] = 0xC0 | (x >> 6);
        s[1] = 0x80 | (x & 0x3F);
        return s + 2;
    }

    if (x < 0x10000) {
        s[0] = 0xE0 | (x >> 12);
        s[1] = 0x80 | ((x >> 6) & 0x3F);
        s[2] = 0x80 | (x & 0x3F);
        return s + 3;
    }

    s[0] = 0xF0 | (x >> 18);
    s[1] = 0x80 | ((x >> 12) & 0x3F);
    s[2] = 0x80 | ((x >> 6) & 0x3F);
    s[3] = 0x80 | (x & 0x3F);
    return s + 4;
}
124
125
/* Decode, in place, the body of a JSON string literal.
 *
 * s points just after the opening double quote.  Backslash escapes are
 * expanded as the text is compacted towards the start of the buffer:
 * \b \f \n \r \t become the corresponding control characters, \uXXXX
 * becomes the UTF-8 encoding of that 16-bit value, and any other escaped
 * character stands for itself.  The decoded text is NUL-terminated.
 *
 * Returns a pointer to where scanning should resume: just past the closing
 * quote, or at the buffer's NUL terminator if the string was unterminated.
 */
static char *sscan_string(char *s)
{
    char *d = s;
    int d1, d2, d3, d4;

    for (;;) switch (*s) {
    case '\\':
        switch (s[1]) {
        case '\0': *d = '\0'; return s+1;
        case 'b': *d++ = '\b'; s += 2; break;
        case 'f': *d++ = '\f'; s += 2; break;
        case 'n': *d++ = '\n'; s += 2; break;
        case 'r': *d++ = '\r'; s += 2; break;
        case 't': *d++ = '\t'; s += 2; break;
        case 'u':
            /* dehex('\0') is -1 and the tests short-circuit, so this
               never reads beyond the end of the buffer. */
            if ((d1 = dehex(s[2])) >= 0 && (d2 = dehex(s[3])) >= 0 &&
                (d3 = dehex(s[4])) >= 0 && (d4 = dehex(s[5])) >= 0) {
                d = encode_utf8(d, d1 << 12 | d2 << 8 | d3 << 4 | d4);
                s += 6;
            }
            else {
                /* Malformed \u escape.  s MUST still advance, otherwise
                   this loop would re-examine the same backslash forever.
                   Treat it like any other unrecognised escape: the
                   escaped character stands for itself. */
                *d++ = 'u';
                s += 2;
            }
            break;
        default:  *d++ = s[1]; s += 2; break;
        }
        break;

    case '"':
        *d = '\0';
        return s+1;

    case '\0':
        *d = '\0';
        return s;

    default:
        *d++ = *s++;
        break;
    }
}
163
164
/* Read the body of a JSON string literal from fp, appending the decoded
 * text to d.
 *
 * The opening double quote must already have been consumed.  Reading stops
 * after the closing quote or at EOF.  Backslash escapes are expanded:
 * \b \f \n \r \t become the corresponding control characters, \uXXXX
 * becomes the UTF-8 encoding of that 16-bit value, and any other escaped
 * character stands for itself.
 *
 * Returns 0 on success, or -1 if any append to d failed.
 */
static int fscan_string(hFILE *fp, kstring_t *d)
{
    int c, d1, d2, d3, d4;
    uint32_t e = 0;   /* becomes non-zero once any kstring append fails */

    while ((c = hgetc(fp)) != EOF) switch (c) {
    case '\\':
        if ((c = hgetc(fp)) == EOF) return e == 0 ? 0 : -1;
        switch (c) {
        case 'b': e |= kputc('\b', d) < 0; break;
        case 'f': e |= kputc('\f', d) < 0; break;
        case 'n': e |= kputc('\n', d) < 0; break;
        case 'r': e |= kputc('\r', d) < 0; break;
        case 't': e |= kputc('\t', d) < 0; break;
        case 'u': {
            /* Look ahead rather than consuming: if the escape turns out to
               be malformed, the characters after "\u" (which may include
               the closing quote) must remain in the stream so that the
               end of the string is still recognised. */
            char hex[4];
            if (hpeek(fp, hex, 4) == 4 &&
                (d1 = dehex(hex[0])) >= 0 && (d2 = dehex(hex[1])) >= 0 &&
                (d3 = dehex(hex[2])) >= 0 && (d4 = dehex(hex[3])) >= 0) {
                char buf[8];
                char *lim = encode_utf8(buf, d1 << 12 | d2 << 8 | d3 << 4 | d4);
                int i;
                e |= kputsn(buf, lim - buf, d) < 0;
                /* Now actually consume the four digits just peeked at */
                for (i = 0; i < 4; i++) (void) hgetc(fp);
            }
            else {
                /* Malformed \u escape: like any other unrecognised
                   escape, the escaped character stands for itself. */
                e |= kputc('u', d) < 0;
            }
            break;
        }
        default:  e |= kputc(c,    d) < 0; break;
        }
        break;

    case '"':
        return e == 0 ? 0 : -1;

    default:
        e |= kputc(c, d) < 0;
        break;
    }
    return e == 0 ? 0 : -1;
}
201
202
/* Classify an unquoted JSON token by inspecting token->str.
 *
 * Returns 'b' for the literals "true" and "false", '.' for "null", 'n' for
 * anything beginning with '-' or a decimal digit (the remainder of the
 * text is not validated), and '?' for anything else.
 */
static char token_type(hts_json_token *token)
{
    const char *text = token->str;
    const char first = *text;

    if (first == '-' || (first >= '0' && first <= '9'))
        return 'n';

    if (first == 'f')
        return (strcmp(text, "false") == 0)? 'b' : '?';

    if (first == 'n')
        return (strcmp(text, "null") == 0)? '.' : '?';

    if (first == 't')
        return (strcmp(text, "true") == 0)? 'b' : '?';

    return '?';
}
221
222
HTSLIB_EXPORT
223
0
hts_json_token * hts_json_alloc_token() {
224
0
    return calloc(1, sizeof(hts_json_token));
225
0
}
226
227
HTSLIB_EXPORT
228
0
char hts_json_token_type(hts_json_token *token) {
229
0
    return token->type;
230
0
}
231
232
HTSLIB_EXPORT
233
0
void hts_json_free_token(hts_json_token *token) {
234
0
    free(token);
235
0
}
236
237
HTSLIB_EXPORT
238
0
char *hts_json_token_str(hts_json_token *token) {
239
0
    return token->str;
240
0
}
241
242
HTSLIB_EXPORT
243
char hts_json_snext(char *str, size_t *state, hts_json_token *token)
244
0
{
245
0
    char *s = &str[*state >> 2];
246
0
    int hidden = *state & 3;
247
248
0
    if (hidden) {
249
0
        *state &= ~3;
250
0
        return token->type = "?}]?"[hidden];
251
0
    }
252
253
0
#define STATE(s,h)  (((s) - str) << 2 | (h))
254
255
0
    for (;;) switch (*s) {
256
0
    case ' ':
257
0
    case '\t':
258
0
    case '\r':
259
0
    case '\n':
260
0
    case ',':
261
0
    case ':':
262
0
        s++;
263
0
        continue;
264
265
0
    case '\0':
266
0
        return token->type = '\0';
267
268
0
    case '{':
269
0
    case '[':
270
0
    case '}':
271
0
    case ']':
272
0
        *state = STATE(s+1, 0);
273
0
        return token->type = *s;
274
275
0
    case '"':
276
0
        token->str = s+1;
277
0
        *state = STATE(sscan_string(s+1), 0);
278
0
        return token->type = 's';
279
280
0
    default:
281
0
        token->str = s;
282
0
        s += strcspn(s, " \t\r\n,]}");
283
0
        hidden = (*s == '}')? 1 : (*s == ']')? 2 : 0;
284
0
        if (*s != '\0') *s++ = '\0';
285
0
        *state = STATE(s, hidden);
286
0
        return token->type = token_type(token);
287
0
    }
288
289
0
#undef STATE
290
0
}
291
292
HTSLIB_EXPORT
293
char hts_json_fnext(struct hFILE *fp, hts_json_token *token, kstring_t *kstr)
294
0
{
295
0
    char peek;
296
0
    int c;
297
298
0
    for (;;) switch (c = hgetc(fp)) {
299
0
    case ' ':
300
0
    case '\t':
301
0
    case '\r':
302
0
    case '\n':
303
0
    case ',':
304
0
    case ':':
305
0
        continue;
306
307
0
    case EOF:
308
0
        return token->type = '\0';
309
310
0
    case '{':
311
0
    case '[':
312
0
    case '}':
313
0
    case ']':
314
0
        return token->type = c;
315
316
0
    case '"':
317
0
        kstr->l = 0;
318
0
        fscan_string(fp, kstr);
319
0
        if (kstr->l == 0) kputsn("", 0, kstr);
320
0
        token->str = kstr->s;
321
0
        return token->type = 's';
322
323
0
    default:
324
0
        kstr->l = 0;
325
0
        kputc(c, kstr);
326
0
        while (hpeek(fp, &peek, 1) == 1 && !strchr(" \t\r\n,]}", peek)) {
327
0
            if ((c = hgetc(fp)) == EOF) break;
328
0
            kputc(c, kstr);
329
0
        }
330
0
        token->str = kstr->s;
331
0
        return token->type = token_type(token);
332
0
    }
333
0
}
334
335
336
/* Signature of the token-fetching callbacks given to skip_value().
 * arg1 and arg2 are opaque to skip_value() and are passed through
 * unchanged; the callback fills in *token and returns the token's type. */
typedef char hts_json_nextfn(void *arg1, void *arg2, hts_json_token *token);
337
338
/* Skip one complete JSON value, using next(arg1, arg2, &token) to fetch
 * tokens.
 *
 * type is the type of a token the caller has already read, or '\0' to have
 * this function read the first token itself.  If the value opens with '{'
 * or '[', tokens are consumed until the matching close (brackets are
 * counted by nesting depth only; '}' and ']' are not distinguished).
 *
 * Returns 'v' once a value has been skipped, '\0' if the input ended
 * first, or '?' on an unrecognised token or a close bracket where a value
 * was expected.
 */
static char skip_value(char type, hts_json_nextfn *next, void *arg1, void *arg2)
{
    hts_json_token token;
    int depth;
    char kind = type? type : next(arg1, arg2, &token);

    if (kind == '\0')
        return '\0';
    if (kind == '?' || kind == '}' || kind == ']')
        return '?';
    if (kind != '{' && kind != '[')
        return 'v';     /* a scalar is a complete value by itself */

    for (depth = 1; depth > 0; ) {
        kind = next(arg1, arg2, &token);

        if (kind == '\0')
            return '\0';
        if (kind == '?')
            return '?';

        if (kind == '{' || kind == '[')
            depth++;
        else if (kind == '}' || kind == ']')
            depth--;
    }

    return 'v';
}
385
386
/* hts_json_nextfn adaptor for hts_json_snext():
 * arg1 is the string being parsed and arg2 is its size_t state. */
static char snext(void *arg1, void *arg2, hts_json_token *token)
{
    char *str = arg1;
    size_t *state = arg2;

    return hts_json_snext(str, state, token);
}
390
391
HTSLIB_EXPORT
392
char hts_json_sskip_value(char *str, size_t *state, char type)
393
0
{
394
0
    return skip_value(type, snext, str, state);
395
0
}
396
397
/* hts_json_nextfn adaptor for hts_json_fnext():
 * arg1 is the hFILE being read and arg2 is the kstring_t scratch buffer.
 * Note the argument order differs from hts_json_fnext() itself. */
static char fnext(void *arg1, void *arg2, hts_json_token *token)
{
    struct hFILE *fp = arg1;
    kstring_t *scratch = arg2;

    return hts_json_fnext(fp, token, scratch);
}
401
402
HTSLIB_EXPORT
403
char hts_json_fskip_value(struct hFILE *fp, char type)
404
0
{
405
0
    kstring_t str = { 0, 0, NULL };
406
0
    char ret = skip_value(type, fnext, fp, &str);
407
0
    free(str.s);
408
0
    return ret;
409
0
}
410
411
/*
 * A function to help with construction of CL tags in @PG records.
 * Takes an argc, argv pair and returns the arguments joined into a single
 * space-separated string.  Tab characters within arguments are replaced
 * by spaces.
 *
 * The caller is responsible for freeing the returned string.
 *
 * Returns malloced char * on success (an empty string when argc <= 0)
 *         NULL on allocation failure
 */
char *stringify_argv(int argc, char *argv[]) {
    char *joined, *out;
    size_t total = 1;   /* room for the terminating NUL */
    int a;

    /* Measure: every argument, plus one separator before all but the first */
    for (a = 0; a < argc; a++)
        total += strlen(argv[a]) + (a > 0 ? 1 : 0);

    joined = malloc(total);
    if (joined == NULL)
        return NULL;

    /* Fill */
    out = joined;
    for (a = 0; a < argc; a++) {
        const char *in;

        if (a > 0) *out++ = ' ';
        for (in = argv[a]; *in != '\0'; in++)
            *out++ = (*in == '\t') ? ' ' : *in;
    }
    *out = '\0';

    return joined;
}
449
450
/* Utility function for printing possibly malicious text data.
 *
 * Writes a printable, NUL-terminated rendering of s into buf (capacity
 * buflen) and returns buf.
 *
 *   - If len < SIZE_MAX, exactly len bytes of s are rendered; otherwise s
 *     is read up to its NUL terminator.
 *   - If quote is non-zero, the output is wrapped in that character and
 *     occurrences of it in the data are backslash-escaped.
 *   - Newline, carriage return, tab, NUL and backslash are written as
 *     \n \r \t \0 and \\ respectively; any other byte for which
 *     isprint_c() is false is written as \xHH.
 *   - If the rendering does not fit, it is cut short and followed by the
 *     closing quote (if any) and "...".
 *
 * NOTE(review): when truncating, t is moved backwards without being
 * checked against buf, so buflen is presumably required to be large enough
 * to hold the quotes plus "..." -- confirm against the documented minimum
 * at the declaration.
 */
const char *
hts_strprint(char *buf, size_t buflen, char quote, const char *s, size_t len)
{
    const char *end = (len < SIZE_MAX)? &s[len] : NULL;
    char *t = buf;
    size_t qlen = quote? 1 : 0;   /* space to reserve for the closing quote */

    if (quote) *t++ = quote;

    while (end? (s < end) : (*s)) {
        char c = *s;
        size_t clen;              /* output characters this byte needs */

        if      (c == '\n') { c = 'n'; clen = 2; }
        else if (c == '\r') { c = 'r'; clen = 2; }
        else if (c == '\t') { c = 't'; clen = 2; }
        else if (c == '\0') { c = '0'; clen = 2; }
        else if (c == '\\' || c == quote) clen = 2;
        else clen = isprint_c(c)? 1 : 4;

        if (t-buf + clen + qlen >= buflen) {
            /* Out of room: back up far enough to fit the closing quote,
               the ellipsis and the terminating NUL. */
            while (t-buf + 3 + qlen >= buflen) t--;
            if (quote) *t++ = quote;
            strcpy(t, "...");
            return buf;
        }

        if (clen == 4) {
            sprintf(t, "\\x%02X", (unsigned char) c);
            t += clen;
        }
        else {
            if (clen == 2) *t++ = '\\';
            *t++ = c;
        }

        s++;
    }

    if (quote) *t++ = quote;
    *t = '\0';
    return buf;
}