Coverage Report

Created: 2024-07-23 06:11

/src/htslib/vcf.c
Line
Count
Source (jump to first uncovered line)
1
/*  vcf.c -- VCF/BCF API functions.
2
3
    Copyright (C) 2012, 2013 Broad Institute.
4
    Copyright (C) 2012-2024 Genome Research Ltd.
5
    Portions copyright (C) 2014 Intel Corporation.
6
7
    Author: Heng Li <lh3@sanger.ac.uk>
8
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
of this software and associated documentation files (the "Software"), to deal
11
in the Software without restriction, including without limitation the rights
12
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
copies of the Software, and to permit persons to whom the Software is
14
furnished to do so, subject to the following conditions:
15
16
The above copyright notice and this permission notice shall be included in
17
all copies or substantial portions of the Software.
18
19
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
DEALINGS IN THE SOFTWARE.  */
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdio.h>
31
#include <assert.h>
32
#include <string.h>
33
#include <strings.h>
34
#include <stdlib.h>
35
#include <limits.h>
36
#include <stdint.h>
37
#include <inttypes.h>
38
#include <errno.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
#include "htslib/vcf.h"
45
#include "htslib/bgzf.h"
46
#include "htslib/tbx.h"
47
#include "htslib/hfile.h"
48
#include "hts_internal.h"
49
#include "htslib/hts_endian.h"
50
#include "htslib/khash_str2int.h"
51
#include "htslib/kstring.h"
52
#include "htslib/sam.h"
53
#include "htslib/khash.h"
54
55
#if 0
56
// This helps on Intel a bit, often 6-7% faster VCF parsing.
57
// Conversely sometimes harms AMD Zen4 as ~9% slower.
58
// Possibly related to IPC differences.  However for now it's just a
59
// curiousity we ignore and stick with the simpler code.
60
//
61
// Left here as a hint for future explorers.
62
static inline int xstreq(const char *a, const char *b) {
63
    while (*a && *a == *b)
64
        a++, b++;
65
    return *a == *b;
66
}
67
68
#define KHASH_MAP_INIT_XSTR(name, khval_t) \
69
  KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq)
70
71
KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t)
72
#else
73
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
74
#endif
75
76
typedef khash_t(vdict) vdict_t;
77
78
KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
79
typedef khash_t(hdict) hdict_t;
80
81
82
#include "htslib/kseq.h"
83
HTSLIB_EXPORT
84
uint32_t bcf_float_missing    = 0x7F800001;
85
86
HTSLIB_EXPORT
87
uint32_t bcf_float_vector_end = 0x7F800002;
88
89
HTSLIB_EXPORT
90
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
91
92
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
93
94
/*
95
    Partial support for 64-bit POS and Number=1 INFO tags.
96
    Notes:
97
     - the support for 64-bit values is motivated by POS and INFO/END for large genomes
98
     - the use of 64-bit values does not conform to the specification
99
     - cannot output 64-bit BCF and if it does, it is not compatible with anything
100
     - experimental, use at your risk
101
*/
102
#ifdef VCF_ALLOW_INT64
103
    #define BCF_MAX_BT_INT64 (0x7fffffffffffffff)       /* INT64_MAX, for internal use only */
104
    #define BCF_MIN_BT_INT64 -9223372036854775800LL     /* INT64_MIN + 8, for internal use only */
105
#endif
106
107
2.31k
#define BCF_IS_64BIT (1<<30)
108
109
110
// Opaque structure with auxilary data which allows to extend bcf_hdr_t without breaking ABI.
111
// Note that this preserving API and ABI requires that the first element is vdict_t struct
112
// rather than a pointer, as user programs may (and in some cases do) access the dictionary
113
// directly as (vdict_t*)hdr->dict.
114
typedef struct
115
{
116
    vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
117
    hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
118
    size_t *key_len;// length of h->id[BCF_DT_ID] strings
119
}
120
bcf_hdr_aux_t;
121
122
static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr)
123
389k
{
124
389k
    return (bcf_hdr_aux_t *)hdr->dict[0];
125
389k
}
126
127
static char *find_chrom_header_line(char *s)
128
0
{
129
0
    char *nl;
130
0
    if (strncmp(s, "#CHROM\t", 7) == 0) return s;
131
0
    else if ((nl = strstr(s, "\n#CHROM\t")) != NULL) return nl+1;
132
0
    else return NULL;
133
0
}
134
135
/*************************
136
 *** VCF header parser ***
137
 *************************/
138
139
static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
140
10.4k
{
141
10.4k
    const char *ss = s;
142
11.3k
    while ( *ss && isspace_c(*ss) && ss - s < len) ss++;
143
10.4k
    if ( !*ss || ss - s == len)
144
10
    {
145
10
        hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
146
10
        return -1;
147
10
    }
148
149
10.4k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
150
10.4k
    int ret;
151
10.4k
    char *sdup = malloc(len + 1);
152
10.4k
    if (!sdup) return -1;
153
10.4k
    memcpy(sdup, s, len);
154
10.4k
    sdup[len] = 0;
155
156
    // Ensure space is available in h->samples
157
10.4k
    size_t n = kh_size(d);
158
10.4k
    char **new_samples = realloc(h->samples, sizeof(char*) * (n + 1));
159
10.4k
    if (!new_samples) {
160
0
        free(sdup);
161
0
        return -1;
162
0
    }
163
10.4k
    h->samples = new_samples;
164
165
10.4k
    int k = kh_put(vdict, d, sdup, &ret);
166
10.4k
    if (ret < 0) {
167
0
        free(sdup);
168
0
        return -1;
169
0
    }
170
10.4k
    if (ret) { // absent
171
10.4k
        kh_val(d, k) = bcf_idinfo_def;
172
10.4k
        kh_val(d, k).id = n;
173
10.4k
    } else {
174
2
        hts_log_error("Duplicated sample name '%s'", sdup);
175
2
        free(sdup);
176
2
        return -1;
177
2
    }
178
10.4k
    h->samples[n] = sdup;
179
10.4k
    h->dirty = 1;
180
10.4k
    return 0;
181
10.4k
}
182
183
int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
184
0
{
185
0
    if (!s) {
186
        // Allowed for backwards-compatibility, calling with s == NULL
187
        // used to trigger bcf_hdr_sync(h);
188
0
        return 0;
189
0
    }
190
0
    return bcf_hdr_add_sample_len(h, s, strlen(s));
191
0
}
192
193
int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str)
194
7.22k
{
195
7.22k
    const char *mandatory = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
196
7.22k
    if ( strncmp(str,mandatory,strlen(mandatory)) )
197
118
    {
198
118
        hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str);
199
118
        return -1;
200
118
    }
201
202
7.10k
    const char *beg = str + strlen(mandatory), *end;
203
7.10k
    if ( !*beg || *beg=='\n' ) return 0;
204
2.56k
    if ( strncmp(beg,"\tFORMAT\t",8) )
205
100
    {
206
100
        hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str);
207
100
        return -1;
208
100
    }
209
2.46k
    beg += 8;
210
211
2.46k
    int ret = 0;
212
10.7k
    while ( *beg )
213
10.4k
    {
214
10.4k
        end = beg;
215
999M
        while ( *end && *end!='\t' && *end!='\n' ) end++;
216
10.4k
        if ( bcf_hdr_add_sample_len(hdr, beg, end-beg) < 0 ) ret = -1;
217
10.4k
        if ( !*end || *end=='\n' || ret<0 ) break;
218
8.24k
        beg = end + 1;
219
8.24k
    }
220
2.46k
    return ret;
221
2.56k
}
222
223
int bcf_hdr_sync(bcf_hdr_t *h)
224
137k
{
225
137k
    int i;
226
548k
    for (i = 0; i < 3; i++)
227
411k
    {
228
411k
        vdict_t *d = (vdict_t*)h->dict[i];
229
411k
        khint_t k;
230
411k
        if ( h->n[i] < kh_size(d) )
231
2.19k
        {
232
2.19k
            bcf_idpair_t *new_idpair;
233
            // this should be true only for i=2, BCF_DT_SAMPLE
234
2.19k
            new_idpair = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t));
235
2.19k
            if (!new_idpair) return -1;
236
2.19k
            h->n[i] = kh_size(d);
237
2.19k
            h->id[i] = new_idpair;
238
2.19k
        }
239
4.77G
        for (k=kh_begin(d); k<kh_end(d); k++)
240
4.77G
        {
241
4.77G
            if (!kh_exist(d,k)) continue;
242
44.8M
            h->id[i][kh_val(d,k).id].key = kh_key(d,k);
243
44.8M
            h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
244
44.8M
        }
245
411k
    }
246
247
    // Invalidate key length cache
248
137k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
249
137k
    if (aux && aux->key_len) {
250
3.45k
        free(aux->key_len);
251
3.45k
        aux->key_len = NULL;
252
3.45k
    }
253
254
137k
    h->dirty = 0;
255
137k
    return 0;
256
137k
}
257
258
void bcf_hrec_destroy(bcf_hrec_t *hrec)
259
226k
{
260
226k
    if (!hrec) return;
261
220k
    free(hrec->key);
262
220k
    if ( hrec->value ) free(hrec->value);
263
220k
    int i;
264
830k
    for (i=0; i<hrec->nkeys; i++)
265
610k
    {
266
610k
        free(hrec->keys[i]);
267
610k
        free(hrec->vals[i]);
268
610k
    }
269
220k
    free(hrec->keys);
270
220k
    free(hrec->vals);
271
220k
    free(hrec);
272
220k
}
273
274
// Copies all fields except IDX.
275
bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
276
0
{
277
0
    int save_errno;
278
0
    bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
279
0
    if (!out) return NULL;
280
281
0
    out->type = hrec->type;
282
0
    if ( hrec->key ) {
283
0
        out->key = strdup(hrec->key);
284
0
        if (!out->key) goto fail;
285
0
    }
286
0
    if ( hrec->value ) {
287
0
        out->value = strdup(hrec->value);
288
0
        if (!out->value) goto fail;
289
0
    }
290
0
    out->nkeys = hrec->nkeys;
291
0
    out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
292
0
    if (!out->keys) goto fail;
293
0
    out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
294
0
    if (!out->vals) goto fail;
295
0
    int i, j = 0;
296
0
    for (i=0; i<hrec->nkeys; i++)
297
0
    {
298
0
        if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
299
0
        if ( hrec->keys[i] ) {
300
0
            out->keys[j] = strdup(hrec->keys[i]);
301
0
            if (!out->keys[j]) goto fail;
302
0
        }
303
0
        if ( hrec->vals[i] ) {
304
0
            out->vals[j] = strdup(hrec->vals[i]);
305
0
            if (!out->vals[j]) goto fail;
306
0
        }
307
0
        j++;
308
0
    }
309
0
    if ( i!=j ) out->nkeys -= i-j;   // IDX was omitted
310
0
    return out;
311
312
0
 fail:
313
0
    save_errno = errno;
314
0
    hts_log_error("%s", strerror(errno));
315
0
    bcf_hrec_destroy(out);
316
0
    errno = save_errno;
317
0
    return NULL;
318
0
}
319
320
void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
321
0
{
322
0
    fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
323
0
    int i;
324
0
    for (i=0; i<hrec->nkeys; i++)
325
0
        fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
326
0
    fprintf(fp, "\n");
327
0
}
328
329
void bcf_header_debug(bcf_hdr_t *hdr)
330
0
{
331
0
    int i, j;
332
0
    for (i=0; i<hdr->nhrec; i++)
333
0
    {
334
0
        if ( !hdr->hrec[i]->value )
335
0
        {
336
0
            fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
337
0
            fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
338
0
            for (j=1; j<hdr->hrec[i]->nkeys; j++)
339
0
                fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
340
0
            fprintf(stderr,">\n");
341
0
        }
342
0
        else
343
0
            fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
344
0
    }
345
0
}
346
347
int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
348
466k
{
349
466k
    char **tmp;
350
466k
    size_t n = hrec->nkeys + 1;
351
466k
    assert(len > 0 && len < SIZE_MAX);
352
466k
    tmp = realloc(hrec->keys, sizeof(char*)*n);
353
466k
    if (!tmp) return -1;
354
466k
    hrec->keys = tmp;
355
466k
    tmp = realloc(hrec->vals, sizeof(char*)*n);
356
466k
    if (!tmp) return -1;
357
466k
    hrec->vals = tmp;
358
359
466k
    hrec->keys[hrec->nkeys] = (char*) malloc((len+1)*sizeof(char));
360
466k
    if (!hrec->keys[hrec->nkeys]) return -1;
361
466k
    memcpy(hrec->keys[hrec->nkeys],str,len);
362
466k
    hrec->keys[hrec->nkeys][len] = 0;
363
466k
    hrec->vals[hrec->nkeys] = NULL;
364
466k
    hrec->nkeys = n;
365
466k
    return 0;
366
466k
}
367
368
int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
369
466k
{
370
466k
    if ( hrec->vals[i] ) {
371
0
        free(hrec->vals[i]);
372
0
        hrec->vals[i] = NULL;
373
0
    }
374
466k
    if ( !str ) return 0;
375
466k
    if ( is_quoted )
376
138k
    {
377
138k
        if (len >= SIZE_MAX - 3) {
378
0
            errno = ENOMEM;
379
0
            return -1;
380
0
        }
381
138k
        hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
382
138k
        if (!hrec->vals[i]) return -1;
383
138k
        hrec->vals[i][0] = '"';
384
138k
        memcpy(&hrec->vals[i][1],str,len);
385
138k
        hrec->vals[i][len+1] = '"';
386
138k
        hrec->vals[i][len+2] = 0;
387
138k
    }
388
327k
    else
389
327k
    {
390
327k
        if (len == SIZE_MAX) {
391
0
            errno = ENOMEM;
392
0
            return -1;
393
0
        }
394
327k
        hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
395
327k
        if (!hrec->vals[i]) return -1;
396
327k
        memcpy(hrec->vals[i],str,len);
397
327k
        hrec->vals[i][len] = 0;
398
327k
    }
399
466k
    return 0;
400
466k
}
401
402
int hrec_add_idx(bcf_hrec_t *hrec, int idx)
403
143k
{
404
143k
    int n = hrec->nkeys + 1;
405
143k
    char **tmp = (char**) realloc(hrec->keys, sizeof(char*)*n);
406
143k
    if (!tmp) return -1;
407
143k
    hrec->keys = tmp;
408
409
143k
    tmp = (char**) realloc(hrec->vals, sizeof(char*)*n);
410
143k
    if (!tmp) return -1;
411
143k
    hrec->vals = tmp;
412
413
143k
    hrec->keys[hrec->nkeys] = strdup("IDX");
414
143k
    if (!hrec->keys[hrec->nkeys]) return -1;
415
416
143k
    kstring_t str = {0,0,0};
417
143k
    if (kputw(idx, &str) < 0) {
418
0
        free(hrec->keys[hrec->nkeys]);
419
0
        return -1;
420
0
    }
421
143k
    hrec->vals[hrec->nkeys] = str.s;
422
143k
    hrec->nkeys = n;
423
143k
    return 0;
424
143k
}
425
426
int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
427
154k
{
428
154k
    int i;
429
233k
    for (i=0; i<hrec->nkeys; i++)
430
193k
        if ( !strcasecmp(key,hrec->keys[i]) ) return i;
431
40.1k
    return -1;
432
154k
}
433
434
static void bcf_hrec_set_type(bcf_hrec_t *hrec)
435
420k
{
436
420k
    if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG;
437
384k
    else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
438
246k
    else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
439
102k
    else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
440
78.0k
    else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR;
441
64.1k
    else hrec->type = BCF_HL_GEN;
442
420k
}
443
444
445
/**
446
    The arrays were generated with
447
448
    valid_ctg:
449
        perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
450
451
    valid_tag:
452
        perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
453
*/
454
static const uint8_t valid_ctg[256] =
455
{
456
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
457
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
458
    0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
459
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
460
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
461
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
462
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
463
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
464
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
465
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
466
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
467
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
468
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
469
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
470
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
471
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
472
};
473
static const uint8_t valid_tag[256] =
474
{
475
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
476
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
477
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
478
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
479
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
480
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
481
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
482
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
483
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
484
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
485
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
486
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
487
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
488
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
489
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
490
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
491
};
492
493
/**
494
    bcf_hrec_check() - check the validity of structured header lines
495
496
    Returns 0 on success or negative value on error.
497
498
    Currently the return status is not checked by the caller
499
    and only a warning is printed on stderr. This should be improved
500
    to propagate the error all the way up to the caller and let it
501
    decide what to do: throw an error or proceed anyway.
502
 */
503
static int bcf_hrec_check(bcf_hrec_t *hrec)
504
210k
{
505
210k
    int i;
506
210k
    bcf_hrec_set_type(hrec);
507
508
210k
    if ( hrec->type==BCF_HL_CTG )
509
18.0k
    {
510
18.0k
        i = bcf_hrec_find_key(hrec,"ID");
511
18.0k
        if ( i<0 ) goto err_missing_id;
512
14.7k
        char *val = hrec->vals[i];
513
14.7k
        if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg;
514
182k
        while ( *(++val) )
515
181k
            if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg;
516
751
        return 0;
517
1.61k
    }
518
192k
    if ( hrec->type==BCF_HL_INFO )
519
69.1k
    {
520
69.1k
        i = bcf_hrec_find_key(hrec,"ID");
521
69.1k
        if ( i<0 ) goto err_missing_id;
522
62.2k
        char *val = hrec->vals[i];
523
62.2k
        if ( !strcmp(val,"1000G") ) return 0;
524
62.2k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
525
21.9k
        while ( *(++val) )
526
17.7k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
527
4.16k
        return 0;
528
6.72k
    }
529
123k
    if ( hrec->type==BCF_HL_FMT )
530
12.0k
    {
531
12.0k
        i = bcf_hrec_find_key(hrec,"ID");
532
12.0k
        if ( i<0 ) goto err_missing_id;
533
10.7k
        char *val = hrec->vals[i];
534
10.7k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
535
340k
        while ( *(++val) )
536
337k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
537
2.80k
        return 0;
538
4.87k
    }
539
111k
    return 0;
540
541
11.4k
  err_missing_id:
542
11.4k
    hts_log_warning("Missing ID attribute in one or more header lines");
543
11.4k
    return -1;
544
545
13.9k
  err_invalid_ctg:
546
13.9k
    hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]);
547
13.9k
    return -1;
548
549
66.0k
  err_invalid_tag:
550
66.0k
    hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]);
551
66.0k
    return -1;
552
123k
}
553
554
static inline int is_escaped(const char *min, const char *str)
555
138k
{
556
138k
    int n = 0;
557
139k
    while ( --str>=min && *str=='\\' ) n++;
558
138k
    return n%2;
559
138k
}
560
561
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
562
238k
{
563
238k
    bcf_hrec_t *hrec = NULL;
564
238k
    const char *p = line;
565
238k
    if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
566
225k
    p += 2;
567
568
225k
    const char *q = p;
569
1.75M
    while ( *q && *q!='=' && *q != '\n' ) q++;
570
225k
    ptrdiff_t n = q-p;
571
225k
    if ( *q!='=' || !n ) // wrong format
572
5.72k
        goto malformed_line;
573
574
220k
    hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
575
220k
    if (!hrec) { *len = -1; return NULL; }
576
220k
    hrec->key = (char*) malloc(sizeof(char)*(n+1));
577
220k
    if (!hrec->key) goto fail;
578
220k
    memcpy(hrec->key,p,n);
579
220k
    hrec->key[n] = 0;
580
220k
    hrec->type = -1;
581
582
220k
    p = ++q;
583
220k
    if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
584
35.5k
    {
585
12.1M
        while ( *q && *q!='\n' ) q++;
586
35.5k
        hrec->value = (char*) malloc((q-p+1)*sizeof(char));
587
35.5k
        if (!hrec->value) goto fail;
588
35.5k
        memcpy(hrec->value, p, q-p);
589
35.5k
        hrec->value[q-p] = 0;
590
35.5k
        *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
591
35.5k
        return hrec;
592
35.5k
    }
593
594
    // structured line, e.g.
595
    // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
596
    // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
597
184k
    int nopen = 1;
598
650k
    while ( *q && *q!='\n' && nopen>0 )
599
475k
    {
600
475k
        p = ++q;
601
476k
        while ( *q && *q==' ' ) { p++; q++; }
602
        // ^[A-Za-z_][0-9A-Za-z_.]*$
603
475k
        if (p==q && *q && (isalpha_c(*q) || *q=='_'))
604
472k
        {
605
472k
            q++;
606
2.59M
            while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
607
472k
        }
608
475k
        n = q-p;
609
475k
        int m = 0;
610
476k
        while ( *q && *q==' ' ) { q++; m++; }
611
475k
        if ( *q!='=' || !n )
612
9.79k
            goto malformed_line;
613
614
466k
        if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
615
466k
        p = ++q;
616
467k
        while ( *q && *q==' ' ) { p++; q++; }
617
618
466k
        int quoted = 0;
619
466k
        char ending = '\0';
620
466k
        switch (*p) {
621
138k
        case '"':
622
138k
            quoted = 1;
623
138k
            ending = '"';
624
138k
            p++;
625
138k
            break;
626
201
        case '[':
627
201
            quoted = 1;
628
201
            ending = ']';
629
201
            break;
630
466k
        }
631
466k
        if ( quoted ) q++;
632
393M
        while ( *q && *q != '\n' )
633
393M
        {
634
393M
            if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; }
635
390M
            else
636
390M
            {
637
390M
                if ( *q=='<' ) nopen++;
638
390M
                if ( *q=='>' ) nopen--;
639
390M
                if ( !nopen ) break;
640
390M
                if ( *q==',' && nopen==1 ) break;
641
390M
            }
642
392M
            q++;
643
392M
        }
644
466k
        const char *r = q;
645
466k
        if (quoted && ending == ']') {
646
201
            if (*q == ending) {
647
159
                r++;
648
159
                q++;
649
159
                quoted = 0;
650
159
            } else {
651
42
                char buffer[320];
652
42
                hts_log_error("Missing ']' in header line %s",
653
42
                              hts_strprint(buffer, sizeof(buffer), '"',
654
42
                                           line, q-line));
655
42
                goto fail;
656
42
            }
657
201
        }
658
466k
        while ( r > p && r[-1] == ' ' ) r--;
659
466k
        if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
660
0
            goto fail;
661
466k
        if ( quoted && *q==ending ) q++;
662
466k
        if ( *q=='>' )
663
141k
        {
664
141k
            if (nopen) nopen--;     // this can happen with nested angle brackets <>
665
141k
            q++;
666
141k
        }
667
466k
    }
668
174k
    if ( nopen )
669
33.2k
        hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]);
670
671
    // Skip to end of line
672
174k
    int nonspace = 0;
673
174k
    p = q;
674
1.82M
    while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
675
174k
    if (nonspace) {
676
1.18k
        char buffer[320];
677
1.18k
        hts_log_warning("Dropped trailing junk from header line '%s'",
678
1.18k
                        hts_strprint(buffer, sizeof(buffer),
679
1.18k
                                     '"', line, q - line));
680
1.18k
    }
681
682
174k
    *len = q - line + (*q ? 1 : 0);
683
174k
    return hrec;
684
685
42
 fail:
686
42
    *len = -1;
687
42
    bcf_hrec_destroy(hrec);
688
42
    return NULL;
689
690
15.5k
 malformed_line:
691
15.5k
    {
692
15.5k
        char buffer[320];
693
764k
        while ( *q && *q!='\n' ) q++;  // Ensure *len includes full line
694
15.5k
        hts_log_error("Could not parse the header line: %s",
695
15.5k
                      hts_strprint(buffer, sizeof(buffer),
696
15.5k
                                   '"', line, q - line));
697
15.5k
        *len = q - line + (*q ? 1 : 0);
698
15.5k
        bcf_hrec_destroy(hrec);
699
15.5k
        return NULL;
700
184k
    }
701
184k
}
702
703
static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
704
142k
{
705
142k
    size_t new_n;
706
707
    // If available, preserve existing IDX
708
142k
    if ( idinfo->id==-1 )
709
141k
        idinfo->id = hdr->n[dict_type];
710
576
    else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
711
8
    {
712
8
        hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
713
8
            idinfo->id, tag);
714
8
        errno = EINVAL;
715
8
        return -1;
716
8
    }
717
718
142k
    new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type];
719
142k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
720
    // hts_resize() can attempt to allocate up to 2 * requested items
721
142k
    if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t)))
722
30
        return -1;
723
142k
#endif
724
142k
    if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
725
142k
                   &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
726
0
        return -1;
727
0
    }
728
142k
    hdr->n[dict_type] = new_n;
729
730
    // NB: the next kh_put call can invalidate the idinfo pointer, therefore
731
    // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
732
142k
    hdr->id[dict_type][idinfo->id].key = tag;
733
734
142k
    return 0;
735
142k
}
736
737
// returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
738
static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
739
210k
{
740
    // contig
741
210k
    int i, ret, replacing = 0;
742
210k
    khint_t k;
743
210k
    char *str = NULL;
744
745
210k
    bcf_hrec_set_type(hrec);
746
747
210k
    if ( hrec->type==BCF_HL_CTG )
748
18.0k
    {
749
18.0k
        hts_pos_t len = 0;
750
751
        // Get the contig ID ($str) and length ($j)
752
18.0k
        i = bcf_hrec_find_key(hrec,"length");
753
18.0k
        if ( i<0 ) len = 0;
754
1.74k
        else {
755
1.74k
            char *end = hrec->vals[i];
756
1.74k
            len = strtoll(hrec->vals[i], &end, 10);
757
1.74k
            if (end == hrec->vals[i] || len < 0) return 0;
758
1.74k
        }
759
760
16.7k
        i = bcf_hrec_find_key(hrec,"ID");
761
16.7k
        if ( i<0 ) return 0;
762
14.7k
        str = strdup(hrec->vals[i]);
763
14.7k
        if (!str) return -1;
764
765
        // Register in the dictionary
766
14.7k
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
767
14.7k
        khint_t k = kh_get(vdict, d, str);
768
14.7k
        if ( k != kh_end(d) ) { // already present
769
1.39k
            free(str); str=NULL;
770
1.39k
            if (kh_val(d, k).hrec[0] != NULL) // and not removed
771
1.39k
                return 0;
772
0
            replacing = 1;
773
13.3k
        } else {
774
13.3k
            k = kh_put(vdict, d, str, &ret);
775
13.3k
            if (ret < 0) { free(str); return -1; }
776
13.3k
        }
777
778
13.3k
        int idx = bcf_hrec_find_key(hrec,"IDX");
779
13.3k
        if ( idx!=-1 )
780
6.15k
        {
781
6.15k
            char *tmp = hrec->vals[idx];
782
6.15k
            idx = strtol(hrec->vals[idx], &tmp, 10);
783
6.15k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
784
6.03k
            {
785
6.03k
                if (!replacing) {
786
6.03k
                    kh_del(vdict, d, k);
787
6.03k
                    free(str);
788
6.03k
                }
789
6.03k
                hts_log_warning("Error parsing the IDX tag, skipping");
790
6.03k
                return 0;
791
6.03k
            }
792
6.15k
        }
793
794
7.30k
        kh_val(d, k) = bcf_idinfo_def;
795
7.30k
        kh_val(d, k).id = idx;
796
7.30k
        kh_val(d, k).info[0] = len;
797
7.30k
        kh_val(d, k).hrec[0] = hrec;
798
7.30k
        if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
799
15
            if (!replacing) {
800
15
                kh_del(vdict, d, k);
801
15
                free(str);
802
15
            }
803
15
            return -1;
804
15
        }
805
7.28k
        if ( idx==-1 ) {
806
7.18k
            if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
807
0
               return -1;
808
0
            }
809
7.18k
        }
810
811
7.28k
        return 1;
812
7.28k
    }
813
814
192k
    if ( hrec->type==BCF_HL_STR ) return 1;
815
185k
    if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0;
816
817
    // INFO/FILTER/FORMAT
818
153k
    char *id = NULL;
819
153k
    uint32_t type = UINT32_MAX, var = UINT32_MAX;
820
153k
    int num = -1, idx = -1;
821
574k
    for (i=0; i<hrec->nkeys; i++)
822
422k
    {
823
422k
        if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
824
279k
        else if ( !strcmp(hrec->keys[i], "IDX") )
825
4.96k
        {
826
4.96k
            char *tmp = hrec->vals[i];
827
4.96k
            idx = strtol(hrec->vals[i], &tmp, 10);
828
4.96k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
829
1.44k
            {
830
1.44k
                hts_log_warning("Error parsing the IDX tag, skipping");
831
1.44k
                return 0;
832
1.44k
            }
833
4.96k
        }
834
274k
        else if ( !strcmp(hrec->keys[i], "Type") )
835
71.6k
        {
836
71.6k
            if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
837
69.3k
            else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
838
67.9k
            else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
839
4.82k
            else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
840
4.73k
            else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
841
2.80k
            else
842
2.80k
            {
843
2.80k
                hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
844
2.80k
                type = BCF_HT_STR;
845
2.80k
            }
846
71.6k
        }
847
202k
        else if ( !strcmp(hrec->keys[i], "Number") )
848
64.7k
        {
849
64.7k
            if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
850
64.1k
            else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
851
64.1k
            else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
852
64.1k
            else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
853
63.7k
            else
854
63.7k
            {
855
63.7k
                sscanf(hrec->vals[i],"%d",&num);
856
63.7k
                var = BCF_VL_FIXED;
857
63.7k
            }
858
64.7k
            if (var != BCF_VL_FIXED) num = 0xfffff;
859
64.7k
        }
860
422k
    }
861
151k
    if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
862
80.7k
        if (type == -1) {
863
10.5k
            hts_log_warning("%s %s field has no Type defined. Assuming String",
864
10.5k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
865
10.5k
            type = BCF_HT_STR;
866
10.5k
        }
867
80.7k
        if (var == -1) {
868
15.9k
            hts_log_warning("%s %s field has no Number defined. Assuming '.'",
869
15.9k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
870
15.9k
            var = BCF_VL_VAR;
871
15.9k
        }
872
80.7k
        if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) )
873
678
        {
874
678
            hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id);
875
678
            var = BCF_VL_FIXED;
876
678
            num = 0;
877
678
        }
878
80.7k
    }
879
151k
    uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
880
151k
                     (var & 0xf) << 8 |
881
151k
                     (type & 0xf) << 4 |
882
151k
                     (((uint32_t) hrec->type) & 0xf));
883
884
151k
    if ( !id ) return 0;
885
142k
    str = strdup(id);
886
142k
    if (!str) return -1;
887
888
142k
    vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
889
142k
    k = kh_get(vdict, d, str);
890
142k
    if ( k != kh_end(d) )
891
7.24k
    {
892
        // already present
893
7.24k
        free(str);
894
7.24k
        if ( kh_val(d, k).hrec[info&0xf] ) return 0;
895
2.13k
        kh_val(d, k).info[info&0xf] = info;
896
2.13k
        kh_val(d, k).hrec[info&0xf] = hrec;
897
2.13k
        if ( idx==-1 ) {
898
2.13k
            if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
899
0
                return -1;
900
0
            }
901
2.13k
        }
902
2.13k
        return 1;
903
2.13k
    }
904
135k
    k = kh_put(vdict, d, str, &ret);
905
135k
    if (ret < 0) {
906
0
        free(str);
907
0
        return -1;
908
0
    }
909
135k
    kh_val(d, k) = bcf_idinfo_def;
910
135k
    kh_val(d, k).info[info&0xf] = info;
911
135k
    kh_val(d, k).hrec[info&0xf] = hrec;
912
135k
    kh_val(d, k).id = idx;
913
135k
    if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
914
23
        kh_del(vdict, d, k);
915
23
        free(str);
916
23
        return -1;
917
23
    }
918
135k
    if ( idx==-1 ) {
919
134k
        if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
920
0
            return -1;
921
0
        }
922
134k
    }
923
924
135k
    return 1;
925
135k
}
926
927
static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
928
0
{
929
0
    if (hrec->type == BCF_HL_FLT ||
930
0
        hrec->type == BCF_HL_INFO ||
931
0
        hrec->type == BCF_HL_FMT ||
932
0
        hrec->type == BCF_HL_CTG) {
933
0
        int id = bcf_hrec_find_key(hrec, "ID");
934
0
        if (id < 0 || !hrec->vals[id])
935
0
            return;
936
0
        vdict_t *dict = (hrec->type == BCF_HL_CTG
937
0
                         ? (vdict_t*)hdr->dict[BCF_DT_CTG]
938
0
                         : (vdict_t*)hdr->dict[BCF_DT_ID]);
939
0
        khint_t k = kh_get(vdict, dict, hrec->vals[id]);
940
0
        if (k != kh_end(dict))
941
0
            kh_val(dict, k).hrec[hrec->type==BCF_HL_CTG ? 0 : hrec->type] = NULL;
942
0
    }
943
0
}
944
945
static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
946
0
{
947
0
    kstring_t str = KS_INITIALIZE;
948
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
949
0
    khint_t k;
950
0
    int id;
951
952
0
    switch (hrec->type) {
953
0
    case BCF_HL_GEN:
954
0
        if (ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0)
955
0
            str.l = 0;
956
0
        break;
957
0
    case BCF_HL_STR:
958
0
        id = bcf_hrec_find_key(hrec, "ID");
959
0
        if (id < 0)
960
0
            return;
961
0
        if (!hrec->vals[id] ||
962
0
            ksprintf(&str, "##%s=<ID=%s>", hrec->key, hrec->vals[id]) < 0)
963
0
            str.l = 0;
964
0
        break;
965
0
    default:
966
0
        return;
967
0
    }
968
0
    if (str.l) {
969
0
        k = kh_get(hdict, aux->gen, str.s);
970
0
    } else {
971
        // Couldn't get a string for some reason, so try the hard way...
972
0
        for (k = kh_begin(aux->gen); k < kh_end(aux->gen); k++) {
973
0
            if (kh_exist(aux->gen, k) && kh_val(aux->gen, k) == hrec)
974
0
                break;
975
0
        }
976
0
    }
977
0
    if (k != kh_end(aux->gen) && kh_val(aux->gen, k) == hrec) {
978
0
        kh_val(aux->gen, k) = NULL;
979
0
        free((char *) kh_key(aux->gen, k));
980
0
        kh_key(aux->gen, k) = NULL;
981
0
        kh_del(hdict, aux->gen, k);
982
0
    }
983
0
    free(str.s);
984
0
}
985
986
int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp)
987
0
{
988
    // currently only for bcf_hdr_set_version
989
0
    assert( hrec->type==BCF_HL_GEN );
990
0
    int ret;
991
0
    khint_t k;
992
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
993
0
    for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
994
0
    {
995
0
        if ( !kh_exist(aux->gen,k) ) continue;
996
0
        if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue;
997
0
        break;
998
0
    }
999
0
    assert( k<kh_end(aux->gen) );   // something went wrong, should never happen
1000
0
    free((char*)kh_key(aux->gen,k));
1001
0
    kh_del(hdict,aux->gen,k);
1002
0
    kstring_t str = {0,0,0};
1003
0
    if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 )
1004
0
    {
1005
0
        free(str.s);
1006
0
        return -1;
1007
0
    }
1008
0
    k = kh_put(hdict, aux->gen, str.s, &ret);
1009
0
    if ( ret<0 )
1010
0
    {
1011
0
        free(str.s);
1012
0
        return -1;
1013
0
    }
1014
0
    free(hrec->value);
1015
0
    hrec->value = strdup(tmp->value);
1016
0
    if ( !hrec->value ) return -1;
1017
0
    return 0;
1018
0
}
1019
1020
int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1021
211k
{
1022
211k
    kstring_t str = {0,0,0};
1023
211k
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1024
1025
211k
    int res;
1026
211k
    if ( !hrec ) return 0;
1027
1028
210k
    bcf_hrec_check(hrec);   // todo: check return status and propagate errors up
1029
1030
210k
    res = bcf_hdr_register_hrec(hdr,hrec);
1031
210k
    if (res < 0) return -1;
1032
210k
    if ( !res )
1033
58.9k
    {
1034
        // If one of the hashed field, then it is already present
1035
58.9k
        if ( hrec->type != BCF_HL_GEN )
1036
26.9k
        {
1037
26.9k
            bcf_hrec_destroy(hrec);
1038
26.9k
            return 0;
1039
26.9k
        }
1040
1041
        // Is one of the generic fields and already present?
1042
32.0k
        if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 )
1043
0
        {
1044
0
            free(str.s);
1045
0
            return -1;
1046
0
        }
1047
32.0k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1048
32.0k
        if ( k != kh_end(aux->gen) )
1049
15.8k
        {
1050
            // duplicate record
1051
15.8k
            bcf_hrec_destroy(hrec);
1052
15.8k
            free(str.s);
1053
15.8k
            return 0;
1054
15.8k
        }
1055
32.0k
    }
1056
1057
167k
    int i;
1058
167k
    if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 )
1059
3.73k
    {
1060
3.73k
        if ( ksprintf(&str, "##%s=<ID=%s>", hrec->key,hrec->vals[i]) < 0 )
1061
0
        {
1062
0
            free(str.s);
1063
0
            return -1;
1064
0
        }
1065
3.73k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1066
3.73k
        if ( k != kh_end(aux->gen) )
1067
2.64k
        {
1068
            // duplicate record
1069
2.64k
            bcf_hrec_destroy(hrec);
1070
2.64k
            free(str.s);
1071
2.64k
            return 0;
1072
2.64k
        }
1073
3.73k
    }
1074
1075
    // New record, needs to be added
1076
164k
    int n = hdr->nhrec + 1;
1077
164k
    bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
1078
164k
    if (!new_hrec) {
1079
0
        free(str.s);
1080
0
        bcf_hdr_unregister_hrec(hdr, hrec);
1081
0
        return -1;
1082
0
    }
1083
164k
    hdr->hrec = new_hrec;
1084
1085
164k
    if ( str.s )
1086
17.3k
    {
1087
17.3k
        khint_t k = kh_put(hdict, aux->gen, str.s, &res);
1088
17.3k
        if ( res<0 )
1089
0
        {
1090
0
            free(str.s);
1091
0
            return -1;
1092
0
        }
1093
17.3k
        kh_val(aux->gen,k) = hrec;
1094
17.3k
    }
1095
1096
164k
    hdr->hrec[hdr->nhrec] = hrec;
1097
164k
    hdr->dirty = 1;
1098
164k
    hdr->nhrec = n;
1099
1100
164k
    return hrec->type==BCF_HL_GEN ? 0 : 1;
1101
164k
}
1102
1103
bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
1104
0
{
1105
0
    int i;
1106
0
    if ( type==BCF_HL_GEN )
1107
0
    {
1108
        // e.g. ##fileformat=VCFv4.2
1109
        //      ##source=GenomicsDBImport
1110
        //      ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364
1111
0
        if ( value )
1112
0
        {
1113
0
            kstring_t str = {0,0,0};
1114
0
            ksprintf(&str, "##%s=%s", key,value);
1115
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1116
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1117
0
            free(str.s);
1118
0
            if ( k == kh_end(aux->gen) ) return NULL;
1119
0
            return kh_val(aux->gen, k);
1120
0
        }
1121
0
        for (i=0; i<hdr->nhrec; i++)
1122
0
        {
1123
0
            if ( hdr->hrec[i]->type!=type ) continue;
1124
0
            if ( strcmp(hdr->hrec[i]->key,key) ) continue;
1125
0
            return hdr->hrec[i];
1126
0
        }
1127
0
        return NULL;
1128
0
    }
1129
0
    else if ( type==BCF_HL_STR )
1130
0
    {
1131
        // e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport....">
1132
        //      ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
1133
0
        if (!str_class) return NULL;
1134
0
        if ( !strcmp("ID",key) )
1135
0
        {
1136
0
            kstring_t str = {0,0,0};
1137
0
            ksprintf(&str, "##%s=<%s=%s>",str_class,key,value);
1138
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1139
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1140
0
            free(str.s);
1141
0
            if ( k == kh_end(aux->gen) ) return NULL;
1142
0
            return kh_val(aux->gen, k);
1143
0
        }
1144
0
        for (i=0; i<hdr->nhrec; i++)
1145
0
        {
1146
0
            if ( hdr->hrec[i]->type!=type ) continue;
1147
0
            if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
1148
0
            int j = bcf_hrec_find_key(hdr->hrec[i],key);
1149
0
            if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
1150
0
        }
1151
0
        return NULL;
1152
0
    }
1153
0
    vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1154
0
    khint_t k = kh_get(vdict, d, value);
1155
0
    if ( k == kh_end(d) ) return NULL;
1156
0
    return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
1157
0
}
1158
1159
void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
1160
6.99k
{
1161
6.99k
    static int PL_warned = 0, GL_warned = 0;
1162
1163
6.99k
    if ( !PL_warned )
1164
6.99k
    {
1165
6.99k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
1166
6.99k
        if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
1167
0
        {
1168
0
            hts_log_warning("PL should be declared as Number=G");
1169
0
            PL_warned = 1;
1170
0
        }
1171
6.99k
    }
1172
6.99k
    if ( !GL_warned )
1173
6.99k
    {
1174
6.99k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GL");
1175
6.99k
        if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
1176
0
        {
1177
0
            hts_log_warning("GL should be declared as Number=G");
1178
0
            GL_warned = 1;
1179
0
        }
1180
6.99k
    }
1181
6.99k
}
1182
1183
int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
1184
9.47k
{
1185
9.47k
    int len, done = 0;
1186
9.47k
    char *p = htxt;
1187
1188
    // Check sanity: "fileformat" string must come as first
1189
9.47k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
1190
9.47k
    if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
1191
1.04k
        hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
1192
9.47k
    if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1193
0
        bcf_hrec_destroy(hrec);
1194
0
        return -1;
1195
0
    }
1196
1197
    // The filter PASS must appear first in the dictionary
1198
9.47k
    hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
1199
9.47k
    if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) {
1200
0
        bcf_hrec_destroy(hrec);
1201
0
        return -1;
1202
0
    }
1203
1204
    // Parse the whole header
1205
26.7k
    do {
1206
88.6k
        while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) {
1207
61.9k
            if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1208
36
                bcf_hrec_destroy(hrec);
1209
36
                return -1;
1210
36
            }
1211
61.9k
            p += len;
1212
61.9k
        }
1213
26.6k
        assert(hrec == NULL);
1214
26.6k
        if (len < 0) {
1215
            // len < 0 indicates out-of-memory, or similar error
1216
12
            hts_log_error("Could not parse header line: %s", strerror(errno));
1217
12
            return -1;
1218
26.6k
        } else if (len > 0) {
1219
            // Bad header line.  bcf_hdr_parse_line() will have logged it.
1220
            // Skip and try again on the next line (p + len will be the start
1221
            // of the next one).
1222
15.3k
            p += len;
1223
15.3k
            continue;
1224
15.3k
        }
1225
1226
        // Next should be the sample line.  If not, it was a malformed
1227
        // header, in which case print a warning and skip (many VCF
1228
        // operations do not really care about a few malformed lines).
1229
        // In the future we may want to add a strict mode that errors in
1230
        // this case.
1231
11.3k
        if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) {
1232
4.08k
            char *eol = strchr(p, '\n');
1233
4.08k
            if (*p != '\0') {
1234
1.99k
                char buffer[320];
1235
1.99k
                hts_log_warning("Could not parse header line: %s",
1236
1.99k
                                hts_strprint(buffer, sizeof(buffer),
1237
1.99k
                                               '"', p,
1238
1.99k
                                               eol ? (eol - p) : SIZE_MAX));
1239
1.99k
            }
1240
4.08k
            if (eol) {
1241
1.88k
                p = eol + 1; // Try from the next line.
1242
2.20k
            } else {
1243
2.20k
                done = -1; // No more lines left, give up.
1244
2.20k
            }
1245
7.22k
        } else {
1246
7.22k
            done = 1; // Sample line found
1247
7.22k
        }
1248
26.6k
    } while (!done);
1249
1250
9.42k
    if (done < 0) {
1251
        // No sample line is fatal.
1252
2.20k
        hts_log_error("Could not parse the header, sample line not found");
1253
2.20k
        return -1;
1254
2.20k
    }
1255
1256
7.22k
    if (bcf_hdr_parse_sample_line(hdr,p) < 0)
1257
230
        return -1;
1258
6.99k
    if (bcf_hdr_sync(hdr) < 0)
1259
0
        return -1;
1260
6.99k
    bcf_hdr_check_sanity(hdr);
1261
6.99k
    return 0;
1262
6.99k
}
1263
1264
int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
1265
0
{
1266
0
    int len;
1267
0
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
1268
0
    if ( !hrec ) return -1;
1269
0
    if (bcf_hdr_add_hrec(hdr, hrec) < 0)
1270
0
        return -1;
1271
0
    return 0;
1272
0
}
1273
1274
void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
1275
0
{
1276
0
    int i = 0;
1277
0
    bcf_hrec_t *hrec;
1278
0
    if ( !key )
1279
0
    {
1280
        // no key, remove all entries of this type
1281
0
        while ( i<hdr->nhrec )
1282
0
        {
1283
0
            if ( hdr->hrec[i]->type!=type ) { i++; continue; }
1284
0
            hrec = hdr->hrec[i];
1285
0
            bcf_hdr_unregister_hrec(hdr, hrec);
1286
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1287
0
            hdr->dirty = 1;
1288
0
            hdr->nhrec--;
1289
0
            if ( i < hdr->nhrec )
1290
0
                memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1291
0
            bcf_hrec_destroy(hrec);
1292
0
        }
1293
0
        return;
1294
0
    }
1295
0
    while (1)
1296
0
    {
1297
0
        if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
1298
0
        {
1299
0
            hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
1300
0
            if ( !hrec ) return;
1301
1302
0
            for (i=0; i<hdr->nhrec; i++)
1303
0
                if ( hdr->hrec[i]==hrec ) break;
1304
0
            assert( i<hdr->nhrec );
1305
1306
0
            vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1307
0
            khint_t k = kh_get(vdict, d, key);
1308
0
            kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
1309
0
        }
1310
0
        else
1311
0
        {
1312
0
            for (i=0; i<hdr->nhrec; i++)
1313
0
            {
1314
0
                if ( hdr->hrec[i]->type!=type ) continue;
1315
0
                if ( type==BCF_HL_GEN )
1316
0
                {
1317
0
                    if ( !strcmp(hdr->hrec[i]->key,key) ) break;
1318
0
                }
1319
0
                else
1320
0
                {
1321
                    // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
1322
0
                    int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
1323
0
                    if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
1324
0
                }
1325
0
            }
1326
0
            if ( i==hdr->nhrec ) return;
1327
0
            hrec = hdr->hrec[i];
1328
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1329
0
        }
1330
1331
0
        hdr->nhrec--;
1332
0
        if ( i < hdr->nhrec )
1333
0
            memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1334
0
        bcf_hrec_destroy(hrec);
1335
0
        hdr->dirty = 1;
1336
0
    }
1337
0
}
1338
1339
int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
1340
0
{
1341
0
    char tmp[256], *line = tmp;
1342
0
    va_list ap;
1343
0
    va_start(ap, fmt);
1344
0
    int n = vsnprintf(line, sizeof(tmp), fmt, ap);
1345
0
    va_end(ap);
1346
1347
0
    if (n >= sizeof(tmp)) {
1348
0
        n++; // For trailing NUL
1349
0
        line = (char*)malloc(n);
1350
0
        if (!line)
1351
0
            return -1;
1352
1353
0
        va_start(ap, fmt);
1354
0
        vsnprintf(line, n, fmt, ap);
1355
0
        va_end(ap);
1356
0
    }
1357
1358
0
    int ret = bcf_hdr_append(hdr, line);
1359
1360
0
    if (line != tmp) free(line);
1361
0
    return ret;
1362
0
}
1363
1364
1365
/**********************
1366
 *** BCF header I/O ***
1367
 **********************/
1368
1369
const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
1370
0
{
1371
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1372
0
    if ( !hrec )
1373
0
    {
1374
0
        hts_log_warning("No version string found, assuming VCFv4.2");
1375
0
        return "VCFv4.2";
1376
0
    }
1377
0
    return hrec->value;
1378
0
}
1379
1380
int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
1381
0
{
1382
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1383
0
    if ( !hrec )
1384
0
    {
1385
0
        int len;
1386
0
        kstring_t str = {0,0,0};
1387
0
        if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1;
1388
0
        hrec = bcf_hdr_parse_line(hdr, str.s, &len);
1389
0
        free(str.s);
1390
0
    }
1391
0
    else
1392
0
    {
1393
0
        bcf_hrec_t *tmp = bcf_hrec_dup(hrec);
1394
0
        if ( !tmp ) return -1;
1395
0
        free(tmp->value);
1396
0
        tmp->value = strdup(version);
1397
0
        if ( !tmp->value ) return -1;
1398
0
        bcf_hdr_update_hrec(hdr, hrec, tmp);
1399
0
        bcf_hrec_destroy(tmp);
1400
0
    }
1401
0
    hdr->dirty = 1;
1402
0
    return 0; // FIXME: check for errs in this function (return < 0 if so)
1403
0
}
1404
1405
bcf_hdr_t *bcf_hdr_init(const char *mode)
1406
9.60k
{
1407
9.60k
    int i;
1408
9.60k
    bcf_hdr_t *h;
1409
9.60k
    h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
1410
9.60k
    if (!h) return NULL;
1411
38.4k
    for (i = 0; i < 3; ++i) {
1412
28.8k
        if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
1413
        // Supersize the hash to make collisions very unlikely
1414
28.8k
        static int dsize[3] = {16384,16384,2048}; // info, contig, format
1415
28.8k
        if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail;
1416
28.8k
    }
1417
1418
9.60k
    bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
1419
9.60k
    if ( !aux ) goto fail;
1420
9.60k
    if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
1421
9.60k
    aux->key_len = NULL;
1422
9.60k
    aux->dict = *((vdict_t*)h->dict[0]);
1423
9.60k
    free(h->dict[0]);
1424
9.60k
    h->dict[0] = aux;
1425
1426
9.60k
    if ( strchr(mode,'w') )
1427
0
    {
1428
0
        bcf_hdr_append(h, "##fileformat=VCFv4.2");
1429
        // The filter PASS must appear first in the dictionary
1430
0
        bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1431
0
    }
1432
9.60k
    return h;
1433
1434
0
 fail:
1435
0
    for (i = 0; i < 3; ++i)
1436
0
        kh_destroy(vdict, h->dict[i]);
1437
0
    free(h);
1438
0
    return NULL;
1439
9.60k
}
1440
1441
void bcf_hdr_destroy(bcf_hdr_t *h)
1442
9.60k
{
1443
9.60k
    int i;
1444
9.60k
    khint_t k;
1445
9.60k
    if (!h) return;
1446
38.4k
    for (i = 0; i < 3; ++i) {
1447
28.8k
        vdict_t *d = (vdict_t*)h->dict[i];
1448
28.8k
        if (d == 0) continue;
1449
334M
        for (k = kh_begin(d); k != kh_end(d); ++k)
1450
334M
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
1451
28.8k
        if ( i==0 )
1452
9.60k
        {
1453
9.60k
            bcf_hdr_aux_t *aux = get_hdr_aux(h);
1454
54.8k
            for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1455
45.2k
                if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
1456
9.60k
            kh_destroy(hdict, aux->gen);
1457
9.60k
            free(aux->key_len); // may exist for dict[0] only
1458
9.60k
        }
1459
28.8k
        kh_destroy(vdict, d);
1460
28.8k
        free(h->id[i]);
1461
28.8k
    }
1462
174k
    for (i=0; i<h->nhrec; i++)
1463
164k
        bcf_hrec_destroy(h->hrec[i]);
1464
9.60k
    if (h->nhrec) free(h->hrec);
1465
9.60k
    if (h->samples) free(h->samples);
1466
9.60k
    free(h->keep_samples);
1467
9.60k
    free(h->transl[0]); free(h->transl[1]);
1468
9.60k
    free(h->mem.s);
1469
9.60k
    free(h);
1470
9.60k
}
1471
1472
bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
1473
9.60k
{
1474
9.60k
    if (hfp->format.format == vcf)
1475
8.46k
        return vcf_hdr_read(hfp);
1476
1.14k
    if (hfp->format.format != bcf) {
1477
0
        hts_log_error("Input is not detected as bcf or vcf format");
1478
0
        return NULL;
1479
0
    }
1480
1481
1.14k
    assert(hfp->is_bgzf);
1482
1483
1.14k
    BGZF *fp = hfp->fp.bgzf;
1484
1.14k
    uint8_t magic[5];
1485
1.14k
    bcf_hdr_t *h;
1486
1.14k
    h = bcf_hdr_init("r");
1487
1.14k
    if (!h) {
1488
0
        hts_log_error("Failed to allocate bcf header");
1489
0
        return NULL;
1490
0
    }
1491
1.14k
    if (bgzf_read(fp, magic, 5) != 5)
1492
8
    {
1493
8
        hts_log_error("Failed to read the header (reading BCF in text mode?)");
1494
8
        bcf_hdr_destroy(h);
1495
8
        return NULL;
1496
8
    }
1497
1.13k
    if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
1498
20
    {
1499
20
        if (!strncmp((char*)magic, "BCF", 3))
1500
20
            hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
1501
0
        else
1502
0
            hts_log_error("Invalid BCF2 magic string");
1503
20
        bcf_hdr_destroy(h);
1504
20
        return NULL;
1505
20
    }
1506
1.11k
    uint8_t buf[4];
1507
1.11k
    size_t hlen;
1508
1.11k
    char *htxt = NULL;
1509
1.11k
    if (bgzf_read(fp, buf, 4) != 4) goto fail;
1510
1.10k
    hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24);
1511
1.10k
    if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
1512
1.10k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1513
1.10k
    if (hlen > FUZZ_ALLOC_LIMIT/2) { errno = ENOMEM; goto fail; }
1514
1.09k
#endif
1515
1.09k
    htxt = (char*)malloc(hlen + 1);
1516
1.09k
    if (!htxt) goto fail;
1517
1.09k
    if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
1518
1.04k
    htxt[hlen] = '\0'; // Ensure htxt is terminated
1519
1.04k
    if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail;
1520
714
    free(htxt);
1521
714
    return h;
1522
398
 fail:
1523
398
    hts_log_error("Failed to read BCF header");
1524
398
    free(htxt);
1525
398
    bcf_hdr_destroy(h);
1526
398
    return NULL;
1527
1.04k
}
1528
1529
int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
1530
6.99k
{
1531
6.99k
    if (!h) {
1532
0
        errno = EINVAL;
1533
0
        return -1;
1534
0
    }
1535
6.99k
    if ( h->dirty ) {
1536
0
        if (bcf_hdr_sync(h) < 0) return -1;
1537
0
    }
1538
6.99k
    hfp->format.category = variant_data;
1539
6.99k
    if (hfp->format.format == vcf || hfp->format.format == text_format) {
1540
3.49k
        hfp->format.format = vcf;
1541
3.49k
        return vcf_hdr_write(hfp, h);
1542
3.49k
    }
1543
1544
3.49k
    if (hfp->format.format == binary_format)
1545
3.49k
        hfp->format.format = bcf;
1546
1547
3.49k
    kstring_t htxt = {0,0,0};
1548
3.49k
    if (bcf_hdr_format(h, 1, &htxt) < 0) {
1549
0
        free(htxt.s);
1550
0
        return -1;
1551
0
    }
1552
3.49k
    kputc('\0', &htxt); // include the \0 byte
1553
1554
3.49k
    BGZF *fp = hfp->fp.bgzf;
1555
3.49k
    if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
1556
3.49k
    uint8_t hlen[4];
1557
3.49k
    u32_to_le(htxt.l, hlen);
1558
3.49k
    if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
1559
3.49k
    if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
1560
3.49k
    if ( bgzf_flush(fp) < 0) return -1;
1561
1562
3.49k
    free(htxt.s);
1563
3.49k
    return 0;
1564
3.49k
}
1565
1566
/********************
1567
 *** BCF site I/O ***
1568
 ********************/
1569
1570
bcf1_t *bcf_init(void)
1571
6.99k
{
1572
6.99k
    bcf1_t *v;
1573
6.99k
    v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
1574
6.99k
    return v;
1575
6.99k
}
1576
1577
void bcf_clear(bcf1_t *v)
1578
43.8k
{
1579
43.8k
    int i;
1580
43.8k
    for (i=0; i<v->d.m_info; i++)
1581
0
    {
1582
0
        if ( v->d.info[i].vptr_free )
1583
0
        {
1584
0
            free(v->d.info[i].vptr - v->d.info[i].vptr_off);
1585
0
            v->d.info[i].vptr_free = 0;
1586
0
        }
1587
0
    }
1588
43.8k
    for (i=0; i<v->d.m_fmt; i++)
1589
0
    {
1590
0
        if ( v->d.fmt[i].p_free )
1591
0
        {
1592
0
            free(v->d.fmt[i].p - v->d.fmt[i].p_off);
1593
0
            v->d.fmt[i].p_free = 0;
1594
0
        }
1595
0
    }
1596
43.8k
    v->rid = v->pos = v->rlen = v->unpacked = 0;
1597
43.8k
    bcf_float_set_missing(v->qual);
1598
43.8k
    v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
1599
43.8k
    v->shared.l = v->indiv.l = 0;
1600
43.8k
    v->d.var_type = -1;
1601
43.8k
    v->d.shared_dirty = 0;
1602
43.8k
    v->d.indiv_dirty  = 0;
1603
43.8k
    v->d.n_flt = 0;
1604
43.8k
    v->errcode = 0;
1605
43.8k
    if (v->d.m_als) v->d.als[0] = 0;
1606
43.8k
    if (v->d.m_id) v->d.id[0] = 0;
1607
43.8k
}
1608
1609
void bcf_empty(bcf1_t *v)
1610
6.99k
{
1611
6.99k
    bcf_clear1(v);
1612
6.99k
    free(v->d.id);
1613
6.99k
    free(v->d.als);
1614
6.99k
    free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
1615
6.99k
    if (v->d.var ) free(v->d.var);
1616
6.99k
    free(v->shared.s); free(v->indiv.s);
1617
6.99k
    memset(&v->d,0,sizeof(v->d));
1618
6.99k
    memset(&v->shared,0,sizeof(v->shared));
1619
6.99k
    memset(&v->indiv,0,sizeof(v->indiv));
1620
6.99k
}
1621
1622
void bcf_destroy(bcf1_t *v)
1623
6.99k
{
1624
6.99k
    if (!v) return;
1625
6.99k
    bcf_empty1(v);
1626
6.99k
    free(v);
1627
6.99k
}
1628
1629
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
1630
750
{
1631
750
    uint8_t x[32];
1632
750
    ssize_t ret;
1633
750
    uint32_t shared_len, indiv_len;
1634
750
    if ((ret = bgzf_read(fp, x, 32)) != 32) {
1635
22
        if (ret == 0) return -1;
1636
20
        return -2;
1637
22
    }
1638
728
    bcf_clear1(v);
1639
728
    shared_len = le_to_u32(x);
1640
728
    if (shared_len < 24) return -2;
1641
726
    shared_len -= 24; // to exclude six 32-bit integers
1642
726
    indiv_len = le_to_u32(x + 4);
1643
726
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1644
    // ks_resize() normally allocates 1.5 * requested size to allow for growth
1645
726
    if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2;
1646
690
#endif
1647
690
    if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2;
1648
690
    if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2;
1649
690
    v->rid  = le_to_i32(x + 8);
1650
690
    v->pos  = le_to_u32(x + 12);
1651
690
    if ( v->pos==UINT32_MAX ) v->pos = -1;  // this is for telomere coordinate, e.g. MT:0
1652
690
    v->rlen = le_to_i32(x + 16);
1653
690
    v->qual = le_to_float(x + 20);
1654
690
    v->n_info = le_to_u16(x + 24);
1655
690
    v->n_allele = le_to_u16(x + 26);
1656
690
    v->n_sample = le_to_u32(x + 28) & 0xffffff;
1657
690
    v->n_fmt = x[31];
1658
690
    v->shared.l = shared_len;
1659
690
    v->indiv.l = indiv_len;
1660
    // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
1661
690
    if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
1662
1663
690
    if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
1664
570
    if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
1665
544
    return 0;
1666
570
}
1667
1668
0
#define bit_array_size(n) ((n)/8+1)
1669
0
#define bit_array_set(a,i)   ((a)[(i)/8] |=   1 << ((i)%8))
1670
0
#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1671
0
#define bit_array_test(a,i)  ((a)[(i)/8] &   (1 << ((i)%8)))
1672
1673
static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1674
2.85k
                                   int32_t *val) {
1675
2.85k
    uint32_t t;
1676
2.85k
    if (end - p < 2) return -1;
1677
2.82k
    t = *p++ & 0xf;
1678
    /* Use if .. else if ... else instead of switch to force order.  Assumption
1679
       is that small integers are more frequent than big ones. */
1680
2.82k
    if (t == BCF_BT_INT8) {
1681
1.51k
        *val = *(int8_t *) p++;
1682
1.51k
    } else {
1683
1.30k
        if (end - p < (1<<bcf_type_shift[t])) return -1;
1684
1.29k
        if (t == BCF_BT_INT16) {
1685
480
            *val = le_to_i16(p);
1686
480
            p += 2;
1687
812
        } else if (t == BCF_BT_INT32) {
1688
566
            *val = le_to_i32(p);
1689
566
            p += 4;
1690
#ifdef VCF_ALLOW_INT64
1691
        } else if (t == BCF_BT_INT64) {
1692
            // This case should never happen because there should be no
1693
            // 64-bit BCFs at all, definitely not coming from htslib
1694
            *val = le_to_i64(p);
1695
            p += 8;
1696
#endif
1697
566
        } else {
1698
246
            return -1;
1699
246
        }
1700
1.29k
    }
1701
2.56k
    *q = p;
1702
2.56k
    return 0;
1703
2.82k
}
1704
1705
static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1706
9.43k
                             int *num, int *type) {
1707
9.43k
    int r;
1708
9.43k
    if (p >= end) return -1;
1709
9.40k
    *type = *p & 0xf;
1710
9.40k
    if (*p>>4 != 15) {
1711
9.13k
        *q = p + 1;
1712
9.13k
        *num = *p >> 4;
1713
9.13k
        return 0;
1714
9.13k
    }
1715
268
    r = bcf_dec_typed_int1_safe(p + 1, end, q, num);
1716
268
    if (r) return r;
1717
170
    return *num >= 0 ? 0 : -1;
1718
268
}
1719
1720
784
static const char *get_type_name(int type) {
1721
784
    const char *types[9] = {
1722
784
        "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
1723
784
        "unknown", "float", "unknown", "char", "unknown"
1724
784
    };
1725
784
    int t = (type >= 0 && type < 8) ? type : 8;
1726
784
    return types[t];
1727
784
}
1728
1729
static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
1730
608
                                 char *type, uint32_t *reports, int i) {
1731
608
    if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG)
1732
82
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
1733
608
                        ": Invalid FORMAT %s %d",
1734
608
                        bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
1735
608
    (*reports)++;
1736
608
}
1737
1738
544
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
1739
544
    uint8_t *ptr, *end;
1740
544
    size_t bytes;
1741
544
    uint32_t err = 0;
1742
544
    int type = 0;
1743
544
    int num  = 0;
1744
544
    int reflen = 0;
1745
544
    uint32_t i, reports;
1746
544
    const uint32_t is_integer = ((1 << BCF_BT_INT8)  |
1747
544
                                 (1 << BCF_BT_INT16) |
1748
#ifdef VCF_ALLOW_INT64
1749
                                 (1 << BCF_BT_INT64) |
1750
#endif
1751
544
                                 (1 << BCF_BT_INT32));
1752
544
    const uint32_t is_valid_type = (is_integer          |
1753
544
                                    (1 << BCF_BT_NULL)  |
1754
544
                                    (1 << BCF_BT_FLOAT) |
1755
544
                                    (1 << BCF_BT_CHAR));
1756
544
    int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
1757
1758
    // Check for valid contig ID
1759
544
    if (rec->rid < 0
1760
544
        || (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
1761
490
                    || hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
1762
490
        hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
1763
490
        err |= BCF_ERR_CTG_INVALID;
1764
490
    }
1765
1766
    // Check ID
1767
544
    ptr = (uint8_t *) rec->shared.s;
1768
544
    end = ptr + rec->shared.l;
1769
544
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1770
518
    if (type != BCF_BT_CHAR) {
1771
434
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
1772
434
        err |= BCF_ERR_TAG_INVALID;
1773
434
    }
1774
518
    bytes = (size_t) num << bcf_type_shift[type];
1775
518
    if (end - ptr < bytes) goto bad_shared;
1776
508
    ptr += bytes;
1777
1778
    // Check REF and ALT
1779
508
    if (rec->n_allele < 1) {
1780
246
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele",
1781
246
                        bcf_seqname_safe(hdr,rec), rec->pos+1);
1782
246
        err |= BCF_ERR_TAG_UNDEF;
1783
246
    }
1784
1785
508
    reports = 0;
1786
6.49k
    for (i = 0; i < rec->n_allele; i++) {
1787
6.08k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1788
6.02k
        if (type != BCF_BT_CHAR) {
1789
5.70k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1790
206
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
1791
5.70k
            err |= BCF_ERR_CHAR;
1792
5.70k
        }
1793
6.02k
        if (i == 0) reflen = num;
1794
6.02k
        bytes = (size_t) num << bcf_type_shift[type];
1795
6.02k
        if (end - ptr < bytes) goto bad_shared;
1796
5.99k
        ptr += bytes;
1797
5.99k
    }
1798
1799
    // Check FILTER
1800
412
    reports = 0;
1801
412
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1802
396
    if (num > 0) {
1803
236
        bytes = (size_t) num << bcf_type_shift[type];
1804
236
        if (((1 << type) & is_integer) == 0) {
1805
122
            hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
1806
122
            err |= BCF_ERR_TAG_INVALID;
1807
122
            if (end - ptr < bytes) goto bad_shared;
1808
102
            ptr += bytes;
1809
114
        } else {
1810
114
            if (end - ptr < bytes) goto bad_shared;
1811
4.27k
            for (i = 0; i < num; i++) {
1812
4.16k
                int32_t key = bcf_dec_int1(ptr, type, &ptr);
1813
4.16k
                if (key < 0
1814
4.16k
                    || (hdr && (key >= max_id
1815
3.62k
                                || hdr->id[BCF_DT_ID][key].key == NULL))) {
1816
3.62k
                    if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1817
110
                        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
1818
3.62k
                    err |= BCF_ERR_TAG_UNDEF;
1819
3.62k
                }
1820
4.16k
            }
1821
112
        }
1822
236
    }
1823
1824
    // Check INFO
1825
374
    reports = 0;
1826
374
    bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
1827
1.98k
    for (i = 0; i < rec->n_info; i++) {
1828
1.81k
        int32_t key = -1;
1829
1.81k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
1830
1.65k
        if (key < 0 || (hdr && (key >= max_id
1831
1.53k
                                || id_tmp[key].key == NULL))) {
1832
1.30k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1833
164
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
1834
1.30k
            err |= BCF_ERR_TAG_UNDEF;
1835
1.30k
        }
1836
1.65k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1837
1.62k
        if (((1 << type) & is_valid_type) == 0
1838
1.62k
            || (type == BCF_BT_NULL && num > 0)) {
1839
218
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1840
22
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
1841
218
            err |= BCF_ERR_TAG_INVALID;
1842
218
        }
1843
1.62k
        bytes = (size_t) num << bcf_type_shift[type];
1844
1.62k
        if (end - ptr < bytes) goto bad_shared;
1845
1.60k
        ptr += bytes;
1846
1.60k
    }
1847
1848
    // Check FORMAT and individual information
1849
164
    ptr = (uint8_t *) rec->indiv.s;
1850
164
    end = ptr + rec->indiv.l;
1851
164
    reports = 0;
1852
842
    for (i = 0; i < rec->n_fmt; i++) {
1853
766
        int32_t key = -1;
1854
766
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
1855
734
        if (key < 0
1856
734
            || (hdr && (key >= max_id
1857
664
                        || id_tmp[key].key == NULL))) {
1858
544
            bcf_record_check_err(hdr, rec, "id", &reports, key);
1859
544
            err |= BCF_ERR_TAG_UNDEF;
1860
544
        }
1861
734
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
1862
720
        if (((1 << type) & is_valid_type) == 0
1863
720
            || (type == BCF_BT_NULL && num > 0)) {
1864
64
            bcf_record_check_err(hdr, rec, "type", &reports, type);
1865
64
            err |= BCF_ERR_TAG_INVALID;
1866
64
        }
1867
720
        bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
1868
720
        if (end - ptr < bytes) goto bad_indiv;
1869
678
        ptr += bytes;
1870
678
    }
1871
1872
76
    if (!err && rec->rlen < 0) {
1873
        // Treat bad rlen as a warning instead of an error, and try to
1874
        // fix up by using the length of the stored REF allele.
1875
34
        static int warned = 0;
1876
34
        if (!warned) {
1877
1
            hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
1878
1
                            "Only one invalid RLEN will be reported.",
1879
1
                            bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
1880
1
            warned = 1;
1881
1
        }
1882
34
        rec->rlen = reflen >= 0 ? reflen : 0;
1883
34
    }
1884
1885
76
    rec->errcode |= err;
1886
1887
76
    return err ? -2 : 0; // Return -2 so bcf_read() reports an error
1888
1889
380
 bad_shared:
1890
380
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
1891
380
    return -2;
1892
1893
88
 bad_indiv:
1894
88
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
1895
88
    return -2;
1896
164
}
1897
1898
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
1899
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
1900
0
{
1901
0
    if ( !hdr->keep_samples ) return 0;
1902
0
    if ( !bcf_hdr_nsamples(hdr) )
1903
0
    {
1904
0
        rec->indiv.l = rec->n_sample = 0;
1905
0
        return 0;
1906
0
    }
1907
1908
0
    int i, j;
1909
0
    uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
1910
0
    bcf_dec_t *dec = &rec->d;
1911
0
    hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
1912
0
    for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
1913
1914
0
    for (i=0; i<rec->n_fmt; i++)
1915
0
    {
1916
0
        ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
1917
0
        src = dec->fmt[i].p - dec->fmt[i].size;
1918
0
        if ( dst )
1919
0
        {
1920
0
            memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
1921
0
            dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
1922
0
        }
1923
0
        dst = dec->fmt[i].p;
1924
0
        for (j=0; j<hdr->nsamples_ori; j++)
1925
0
        {
1926
0
            src += dec->fmt[i].size;
1927
0
            if ( !bit_array_test(hdr->keep_samples,j) ) continue;
1928
0
            memmove(dst, src, dec->fmt[i].size);
1929
0
            dst += dec->fmt[i].size;
1930
0
        }
1931
0
        rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
1932
0
        dec->fmt[i].p_len = dst - dec->fmt[i].p;
1933
0
    }
1934
0
    rec->unpacked |= BCF_UN_FMT;
1935
1936
0
    rec->n_sample = bcf_hdr_nsamples(hdr);
1937
0
    return 0;
1938
0
}
1939
1940
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
1941
38.5k
{
1942
38.5k
    if (fp->format.format == vcf) return vcf_read(fp,h,v);
1943
750
    int ret = bcf_read1_core(fp->fp.bgzf, v);
1944
750
    if (ret == 0) ret = bcf_record_check(h, v);
1945
750
    if ( ret!=0 || !h->keep_samples ) return ret;
1946
0
    return bcf_subset_format(h,v);
1947
750
}
1948
1949
int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1950
0
{
1951
0
    bcf1_t *v = (bcf1_t *) vv;
1952
0
    int ret = bcf_read1_core(fp, v);
1953
0
    if (ret == 0) ret = bcf_record_check(NULL, v);
1954
0
    if (ret  >= 0)
1955
0
        *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
1956
0
    return ret;
1957
0
}
1958
1959
static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
1960
0
{
1961
    // single typed string
1962
0
    if ( line->d.id && strcmp(line->d.id, ".") ) {
1963
0
        return bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
1964
0
    } else {
1965
0
        return bcf_enc_size(str, 0, BCF_BT_CHAR);
1966
0
    }
1967
0
}
1968
static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
1969
0
{
1970
    // list of typed strings
1971
0
    int i;
1972
0
    for (i=0; i<line->n_allele; i++) {
1973
0
        if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0)
1974
0
            return -1;
1975
0
    }
1976
0
    if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
1977
0
    return 0;
1978
0
}
1979
static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
1980
0
{
1981
    // typed vector of integers
1982
0
    if ( line->d.n_flt ) {
1983
0
        return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
1984
0
    } else {
1985
0
        return bcf_enc_vint(str, 0, 0, -1);
1986
0
    }
1987
0
}
1988
1989
static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
1990
0
{
1991
    // pairs of typed vectors
1992
0
    int i, irm = -1, e = 0;
1993
0
    for (i=0; i<line->n_info; i++)
1994
0
    {
1995
0
        bcf_info_t *info = &line->d.info[i];
1996
0
        if ( !info->vptr )
1997
0
        {
1998
            // marked for removal
1999
0
            if ( irm < 0 ) irm = i;
2000
0
            continue;
2001
0
        }
2002
0
        e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0;
2003
0
        if ( irm >=0 )
2004
0
        {
2005
0
            bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
2006
0
            while ( irm<=i && line->d.info[irm].vptr ) irm++;
2007
0
        }
2008
0
    }
2009
0
    if ( irm>=0 ) line->n_info = irm;
2010
0
    return e == 0 ? 0 : -1;
2011
0
}
2012
2013
static int bcf1_sync(bcf1_t *line)
2014
112
{
2015
112
    char *shared_ori = line->shared.s;
2016
112
    size_t prev_len;
2017
2018
112
    kstring_t tmp = {0,0,0};
2019
112
    if ( !line->shared.l )
2020
0
    {
2021
        // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
2022
0
        tmp = line->shared;
2023
0
        bcf1_sync_id(line, &tmp);
2024
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2025
2026
0
        bcf1_sync_alleles(line, &tmp);
2027
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2028
2029
0
        bcf1_sync_filter(line, &tmp);
2030
0
        line->unpack_size[2] = tmp.l - prev_len;
2031
2032
0
        bcf1_sync_info(line, &tmp);
2033
0
        line->shared = tmp;
2034
0
    }
2035
112
    else if ( line->d.shared_dirty )
2036
0
    {
2037
        // The line was edited, update the BCF data block.
2038
2039
0
        if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
2040
2041
        // ptr_ori points to the original unchanged BCF data.
2042
0
        uint8_t *ptr_ori = (uint8_t *) line->shared.s;
2043
2044
        // ID: single typed string
2045
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ID )
2046
0
            bcf1_sync_id(line, &tmp);
2047
0
        else
2048
0
            kputsn_(ptr_ori, line->unpack_size[0], &tmp);
2049
0
        ptr_ori += line->unpack_size[0];
2050
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2051
2052
        // REF+ALT: list of typed strings
2053
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
2054
0
            bcf1_sync_alleles(line, &tmp);
2055
0
        else
2056
0
        {
2057
0
            kputsn_(ptr_ori, line->unpack_size[1], &tmp);
2058
0
            if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2059
0
        }
2060
0
        ptr_ori += line->unpack_size[1];
2061
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2062
2063
0
        if ( line->unpacked & BCF_UN_FLT )
2064
0
        {
2065
            // FILTER: typed vector of integers
2066
0
            if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
2067
0
                bcf1_sync_filter(line, &tmp);
2068
0
            else if ( line->d.n_flt )
2069
0
                kputsn_(ptr_ori, line->unpack_size[2], &tmp);
2070
0
            else
2071
0
                bcf_enc_vint(&tmp, 0, 0, -1);
2072
0
            ptr_ori += line->unpack_size[2];
2073
0
            line->unpack_size[2] = tmp.l - prev_len;
2074
2075
0
            if ( line->unpacked & BCF_UN_INFO )
2076
0
            {
2077
                // INFO: pairs of typed vectors
2078
0
                if ( line->d.shared_dirty & BCF1_DIRTY_INF )
2079
0
                {
2080
0
                    bcf1_sync_info(line, &tmp);
2081
0
                    ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
2082
0
                }
2083
0
            }
2084
0
        }
2085
2086
0
        int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
2087
0
        if ( size ) kputsn_(ptr_ori, size, &tmp);
2088
2089
0
        free(line->shared.s);
2090
0
        line->shared = tmp;
2091
0
    }
2092
112
    if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
2093
0
    {
2094
        // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
2095
0
        size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
2096
0
        int i;
2097
0
        for (i=0; i<line->n_info; i++)
2098
0
        {
2099
0
            uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
2100
0
            line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
2101
0
            off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
2102
0
            if ( vptr_free )
2103
0
            {
2104
0
                free(vptr_free);
2105
0
                line->d.info[i].vptr_free = 0;
2106
0
            }
2107
0
        }
2108
0
    }
2109
2110
112
    if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
2111
0
    {
2112
        // The genotype fields changed or are not present
2113
0
        tmp.l = tmp.m = 0; tmp.s = NULL;
2114
0
        int i, irm = -1;
2115
0
        for (i=0; i<line->n_fmt; i++)
2116
0
        {
2117
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
2118
0
            if ( !fmt->p )
2119
0
            {
2120
                // marked for removal
2121
0
                if ( irm < 0 ) irm = i;
2122
0
                continue;
2123
0
            }
2124
0
            kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
2125
0
            if ( irm >=0 )
2126
0
            {
2127
0
                bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
2128
0
                while ( irm<=i && line->d.fmt[irm].p ) irm++;
2129
0
            }
2130
2131
0
        }
2132
0
        if ( irm>=0 ) line->n_fmt = irm;
2133
0
        free(line->indiv.s);
2134
0
        line->indiv = tmp;
2135
2136
        // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
2137
0
        size_t off_new = 0;
2138
0
        for (i=0; i<line->n_fmt; i++)
2139
0
        {
2140
0
            uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
2141
0
            line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
2142
0
            off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
2143
0
            if ( p_free )
2144
0
            {
2145
0
                free(p_free);
2146
0
                line->d.fmt[i].p_free = 0;
2147
0
            }
2148
0
        }
2149
0
    }
2150
112
    if ( !line->n_sample ) line->n_fmt = 0;
2151
112
    line->d.shared_dirty = line->d.indiv_dirty = 0;
2152
112
    return 0;
2153
112
}
2154
2155
bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
2156
0
{
2157
0
    bcf1_sync(src);
2158
2159
0
    bcf_clear(dst);
2160
0
    dst->rid  = src->rid;
2161
0
    dst->pos  = src->pos;
2162
0
    dst->rlen = src->rlen;
2163
0
    dst->qual = src->qual;
2164
0
    dst->n_info = src->n_info; dst->n_allele = src->n_allele;
2165
0
    dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
2166
2167
0
    if ( dst->shared.m < src->shared.l )
2168
0
    {
2169
0
        dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l);
2170
0
        dst->shared.m = src->shared.l;
2171
0
    }
2172
0
    dst->shared.l = src->shared.l;
2173
0
    memcpy(dst->shared.s,src->shared.s,dst->shared.l);
2174
2175
0
    if ( dst->indiv.m < src->indiv.l )
2176
0
    {
2177
0
        dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l);
2178
0
        dst->indiv.m = src->indiv.l;
2179
0
    }
2180
0
    dst->indiv.l = src->indiv.l;
2181
0
    memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
2182
2183
0
    return dst;
2184
0
}
2185
bcf1_t *bcf_dup(bcf1_t *src)
2186
0
{
2187
0
    bcf1_t *out = bcf_init1();
2188
0
    return bcf_copy(out, src);
2189
0
}
2190
2191
int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
2192
34.2k
{
2193
34.2k
    if ( h->dirty ) {
2194
0
        if (bcf_hdr_sync(h) < 0) return -1;
2195
0
    }
2196
34.2k
    if ( bcf_hdr_nsamples(h)!=v->n_sample )
2197
66
    {
2198
66
        hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
2199
66
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
2200
66
        return -1;
2201
66
    }
2202
2203
34.2k
    if ( hfp->format.format == vcf || hfp->format.format == text_format )
2204
31.4k
        return vcf_write(hfp,h,v);
2205
2206
2.80k
    if ( v->errcode & ~BCF_ERR_LIMITS ) // todo: unsure about the other BCF_ERR_LIMITS branches in vcf_parse_format_alloc4()
2207
2.69k
    {
2208
        // vcf_parse1() encountered a new contig or tag, undeclared in the
2209
        // header.  At this point, the header must have been printed,
2210
        // proceeding would lead to a broken BCF file. Errors must be checked
2211
        // and cleared by the caller before we can proceed.
2212
2.69k
        char errdescription[1024] = "";
2213
2.69k
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1);
2214
2.69k
        return -1;
2215
2.69k
    }
2216
112
    bcf1_sync(v);   // check if the BCF record was modified
2217
2218
112
    if ( v->unpacked & BCF_IS_64BIT )
2219
1
    {
2220
1
        hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
2221
1
        return -1;
2222
1
    }
2223
2224
111
    BGZF *fp = hfp->fp.bgzf;
2225
111
    uint8_t x[32];
2226
111
    u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
2227
111
    u32_to_le(v->indiv.l, x + 4);
2228
111
    i32_to_le(v->rid, x + 8);
2229
111
    u32_to_le(v->pos, x + 12);
2230
111
    u32_to_le(v->rlen, x + 16);
2231
111
    float_to_le(v->qual, x + 20);
2232
111
    u16_to_le(v->n_info, x + 24);
2233
111
    u16_to_le(v->n_allele, x + 26);
2234
111
    u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28);
2235
111
    if ( bgzf_write(fp, x, 32) != 32 ) return -1;
2236
111
    if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
2237
111
    if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
2238
2239
111
    if (hfp->idx) {
2240
0
        if (bgzf_idx_push(fp, hfp->idx, v->rid, v->pos, v->pos + v->rlen,
2241
0
                          bgzf_tell(fp), 1) < 0)
2242
0
            return -1;
2243
0
    }
2244
2245
111
    return 0;
2246
111
}
2247
2248
/**********************
2249
 *** VCF header I/O ***
2250
 **********************/
2251
2252
0
static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
2253
0
    bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t));
2254
0
    int save_errno;
2255
0
    if (!hrec) goto fail;
2256
2257
0
    hrec->key = strdup("contig");
2258
0
    if (!hrec->key) goto fail;
2259
2260
0
    if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail;
2261
0
    if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0)
2262
0
        goto fail;
2263
0
    if (bcf_hdr_add_hrec(h, hrec) < 0)
2264
0
        goto fail;
2265
0
    return 0;
2266
2267
0
 fail:
2268
0
    save_errno = errno;
2269
0
    hts_log_error("%s", strerror(errno));
2270
0
    if (hrec) bcf_hrec_destroy(hrec);
2271
0
    errno = save_errno;
2272
0
    return -1;
2273
0
}
2274
2275
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
2276
8.46k
{
2277
8.46k
    kstring_t txt, *s = &fp->line;
2278
8.46k
    int ret;
2279
8.46k
    bcf_hdr_t *h;
2280
8.46k
    tbx_t *idx = NULL;
2281
8.46k
    const char **names = NULL;
2282
8.46k
    h = bcf_hdr_init("r");
2283
8.46k
    if (!h) {
2284
0
        hts_log_error("Failed to allocate bcf header");
2285
0
        return NULL;
2286
0
    }
2287
8.46k
    txt.l = txt.m = 0; txt.s = 0;
2288
106k
    while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
2289
105k
        int e = 0;
2290
105k
        if (s->l == 0) continue;
2291
101k
        if (s->s[0] != '#') {
2292
28
            hts_log_error("No sample line");
2293
28
            goto error;
2294
28
        }
2295
101k
        if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
2296
0
            kstring_t tmp = { 0, 0, NULL };
2297
0
            hFILE *f = hopen(fp->fn_aux, "r");
2298
0
            if (f == NULL) {
2299
0
                hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
2300
0
                goto error;
2301
0
            }
2302
0
            while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) {
2303
0
                char *tab = strchr(tmp.s, '\t');
2304
0
                if (tab == NULL) continue;
2305
0
                e |= (kputs("##contig=<ID=", &txt) < 0);
2306
0
                e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0);
2307
0
                e |= (kputs(",length=", &txt) < 0);
2308
0
                e |= (kputl(atol(tab), &txt) < 0);
2309
0
                e |= (kputsn(">\n", 2, &txt) < 0);
2310
0
            }
2311
0
            free(tmp.s);
2312
0
            if (hclose(f) != 0) {
2313
0
                hts_log_error("Error on closing %s", fp->fn_aux);
2314
0
                goto error;
2315
0
            }
2316
0
            if (e) goto error;
2317
0
        }
2318
101k
        if (kputsn(s->s, s->l, &txt) < 0) goto error;
2319
101k
        if (kputc('\n', &txt) < 0) goto error;
2320
101k
        if (s->s[1] != '#') break;
2321
101k
    }
2322
8.43k
    if ( ret < -1 ) goto error;
2323
8.43k
    if ( !txt.s )
2324
0
    {
2325
0
        hts_log_error("Could not read the header");
2326
0
        goto error;
2327
0
    }
2328
8.43k
    if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
2329
2330
    // check tabix index, are all contigs listed in the header? add the missing ones
2331
6.27k
    idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL);
2332
6.27k
    if ( idx )
2333
0
    {
2334
0
        int i, n, need_sync = 0;
2335
0
        names = tbx_seqnames(idx, &n);
2336
0
        if (!names) goto error;
2337
0
        for (i=0; i<n; i++)
2338
0
        {
2339
0
            bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
2340
0
            if ( hrec ) continue;
2341
0
            if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
2342
0
            need_sync = 1;
2343
0
        }
2344
0
        if ( need_sync ) {
2345
0
            if (bcf_hdr_sync(h) < 0) goto error;
2346
0
        }
2347
0
        free(names);
2348
0
        tbx_destroy(idx);
2349
0
    }
2350
6.27k
    free(txt.s);
2351
6.27k
    return h;
2352
2353
2.18k
 error:
2354
2.18k
    if (idx) tbx_destroy(idx);
2355
2.18k
    free(names);
2356
2.18k
    free(txt.s);
2357
2.18k
    if (h) bcf_hdr_destroy(h);
2358
2.18k
    return NULL;
2359
6.27k
}
2360
2361
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
2362
0
{
2363
0
    int i = 0, n = 0, save_errno;
2364
0
    char **lines = hts_readlines(fname, &n);
2365
0
    if ( !lines ) return 1;
2366
0
    for (i=0; i<n-1; i++)
2367
0
    {
2368
0
        int k;
2369
0
        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
2370
0
        if (!hrec) goto fail;
2371
0
        if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
2372
0
            bcf_hrec_destroy(hrec);
2373
0
            goto fail;
2374
0
        }
2375
0
        free(lines[i]);
2376
0
        lines[i] = NULL;
2377
0
    }
2378
0
    if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
2379
0
    if (bcf_hdr_sync(hdr) < 0) goto fail;
2380
0
    free(lines[n-1]);
2381
0
    free(lines);
2382
0
    return 0;
2383
2384
0
 fail:
2385
0
    save_errno = errno;
2386
0
    for (; i < n; i++)
2387
0
        free(lines[i]);
2388
0
    free(lines);
2389
0
    errno = save_errno;
2390
0
    return 1;
2391
0
}
2392
2393
static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
2394
22.3k
{
2395
22.3k
    uint32_t e = 0;
2396
22.3k
    if ( !hrec->value )
2397
12.3k
    {
2398
12.3k
        int j, nout = 0;
2399
12.3k
        e |= ksprintf(str, "##%s=<", hrec->key) < 0;
2400
51.1k
        for (j=0; j<hrec->nkeys; j++)
2401
38.7k
        {
2402
            // do not output IDX if output is VCF
2403
38.7k
            if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
2404
33.1k
            if ( nout ) e |= kputc(',',str) < 0;
2405
33.1k
            e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0;
2406
33.1k
            nout++;
2407
33.1k
        }
2408
12.3k
        e |= ksprintf(str,">\n") < 0;
2409
12.3k
    }
2410
9.90k
    else
2411
9.90k
        e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
2412
2413
22.3k
    return e == 0 ? 0 : -1;
2414
22.3k
}
2415
2416
int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
2417
0
{
2418
0
    return _bcf_hrec_format(hrec,0,str);
2419
0
}
2420
2421
int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
2422
6.99k
{
2423
6.99k
    int i, r = 0;
2424
29.2k
    for (i=0; i<hdr->nhrec; i++)
2425
22.3k
        r |= _bcf_hrec_format(hdr->hrec[i], is_bcf, str) < 0;
2426
2427
6.99k
    r |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0;
2428
6.99k
    if ( bcf_hdr_nsamples(hdr) )
2429
2.19k
    {
2430
2.19k
        r |= ksprintf(str, "\tFORMAT") < 0;
2431
12.5k
        for (i=0; i<bcf_hdr_nsamples(hdr); i++)
2432
10.3k
            r |= ksprintf(str, "\t%s", hdr->samples[i]) < 0;
2433
2.19k
    }
2434
6.99k
    r |= ksprintf(str, "\n") < 0;
2435
2436
6.99k
    return r ? -1 : 0;
2437
6.99k
}
2438
2439
char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
2440
0
{
2441
0
    kstring_t txt = {0,0,0};
2442
0
    if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
2443
0
        return NULL;
2444
0
    if ( len ) *len = txt.l;
2445
0
    return txt.s;
2446
0
}
2447
2448
const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
2449
0
{
2450
0
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2451
0
    int i, tid, m = kh_size(d);
2452
0
    const char **names = (const char**) calloc(m,sizeof(const char*));
2453
0
    if ( !names )
2454
0
    {
2455
0
        hts_log_error("Failed to allocate memory");
2456
0
        *n = 0;
2457
0
        return NULL;
2458
0
    }
2459
0
    khint_t k;
2460
0
    for (k=kh_begin(d); k<kh_end(d); k++)
2461
0
    {
2462
0
        if ( !kh_exist(d,k) ) continue;
2463
0
        if ( !kh_val(d, k).hrec[0] ) continue;  // removed via bcf_hdr_remove
2464
0
        tid = kh_val(d,k).id;
2465
0
        if ( tid >= m )
2466
0
        {
2467
            // This can happen after a contig has been removed from BCF header via bcf_hdr_remove()
2468
0
            if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 )
2469
0
            {
2470
0
                hts_log_error("Failed to allocate memory");
2471
0
                *n = 0;
2472
0
                free(names);
2473
0
                return NULL;
2474
0
            }
2475
0
            m = tid + 1;
2476
0
        }
2477
0
        names[tid] = kh_key(d,k);
2478
0
    }
2479
    // ensure there are no gaps
2480
0
    for (i=0,tid=0; tid<m; i++,tid++)
2481
0
    {
2482
0
        while ( tid<m && !names[tid] ) tid++;
2483
0
        if ( tid==m ) break;
2484
0
        if ( i==tid ) continue;
2485
0
        names[i] = names[tid];
2486
0
        names[tid] = 0;
2487
0
    }
2488
0
    *n = i;
2489
0
    return names;
2490
0
}
2491
2492
int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
2493
3.49k
{
2494
3.49k
    kstring_t htxt = {0,0,0};
2495
3.49k
    if (bcf_hdr_format(h, 0, &htxt) < 0) {
2496
0
        free(htxt.s);
2497
0
        return -1;
2498
0
    }
2499
3.49k
    while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
2500
3.49k
    int ret;
2501
3.49k
    if ( fp->format.compression!=no_compression ) {
2502
0
        ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
2503
0
        if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2504
3.49k
    } else {
2505
3.49k
        ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
2506
3.49k
    }
2507
3.49k
    free(htxt.s);
2508
3.49k
    return ret<0 ? -1 : 0;
2509
3.49k
}
2510
2511
/***********************
2512
 *** Typed value I/O ***
2513
 ***********************/
2514
2515
int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
2516
391k
{
2517
391k
    int32_t max = INT32_MIN, min = INT32_MAX;
2518
391k
    int i;
2519
391k
    if (n <= 0) {
2520
2.00k
        return bcf_enc_size(s, 0, BCF_BT_NULL);
2521
389k
    } else if (n == 1) {
2522
25.3k
        return bcf_enc_int1(s, a[0]);
2523
364k
    } else {
2524
364k
        if (wsize <= 0) wsize = n;
2525
2526
        // Equivalent to:
2527
        // for (i = 0; i < n; ++i) {
2528
        //     if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
2529
        //         continue;
2530
        //     if (max < a[i]) max = a[i];
2531
        //     if (min > a[i]) min = a[i];
2532
        // }
2533
364k
        int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN};
2534
364k
        int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX};
2535
101M
        for (i = 0; i < (n&~3); i+=4) {
2536
            // bcf_int32_missing    == INT32_MIN and
2537
            // bcf_int32_vector_end == INT32_MIN+1.
2538
            // We skip these, but can mostly avoid explicit checking
2539
100M
            if (max4[0] < a[i+0]) max4[0] = a[i+0];
2540
100M
            if (max4[1] < a[i+1]) max4[1] = a[i+1];
2541
100M
            if (max4[2] < a[i+2]) max4[2] = a[i+2];
2542
100M
            if (max4[3] < a[i+3]) max4[3] = a[i+3];
2543
100M
            if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0];
2544
100M
            if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1];
2545
100M
            if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2];
2546
100M
            if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3];
2547
100M
        }
2548
364k
        min = min4[0];
2549
364k
        if (min > min4[1]) min = min4[1];
2550
364k
        if (min > min4[2]) min = min4[2];
2551
364k
        if (min > min4[3]) min = min4[3];
2552
364k
        max = max4[0];
2553
364k
        if (max < max4[1]) max = max4[1];
2554
364k
        if (max < max4[2]) max = max4[2];
2555
364k
        if (max < max4[3]) max = max4[3];
2556
930k
        for (; i < n; ++i) {
2557
565k
            if (max < a[i]) max = a[i];
2558
565k
            if (min > a[i] && a[i] > INT32_MIN+1) min = a[i];
2559
565k
        }
2560
2561
364k
        if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
2562
54.6k
            if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
2563
54.6k
                ks_resize(s, s->l + n) < 0)
2564
0
                return -1;
2565
54.6k
            uint8_t *p = (uint8_t *) s->s + s->l;
2566
17.5M
            for (i = 0; i < n; ++i, p++) {
2567
17.5M
                if ( a[i]==bcf_int32_vector_end )   *p = bcf_int8_vector_end;
2568
17.5M
                else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing;
2569
270k
                else *p = a[i];
2570
17.5M
            }
2571
54.6k
            s->l += n;
2572
309k
        } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
2573
231k
            uint8_t *p;
2574
231k
            if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
2575
231k
                ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
2576
0
                return -1;
2577
231k
            p = (uint8_t *) s->s + s->l;
2578
104M
            for (i = 0; i < n; ++i)
2579
104M
            {
2580
104M
                int16_t x;
2581
104M
                if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
2582
104M
                else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
2583
2.65M
                else x = a[i];
2584
104M
                i16_to_le(x, p);
2585
104M
                p += sizeof(int16_t);
2586
104M
            }
2587
231k
            s->l += n * sizeof(int16_t);
2588
231k
        } else {
2589
77.9k
            uint8_t *p;
2590
77.9k
            if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
2591
77.9k
                ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
2592
0
                return -1;
2593
77.9k
            p = (uint8_t *) s->s + s->l;
2594
281M
            for (i = 0; i < n; ++i) {
2595
281M
                i32_to_le(a[i], p);
2596
281M
                p += sizeof(int32_t);
2597
281M
            }
2598
77.9k
            s->l += n * sizeof(int32_t);
2599
77.9k
        }
2600
364k
    }
2601
2602
364k
    return 0;
2603
391k
}
2604
2605
#ifdef VCF_ALLOW_INT64
2606
static int bcf_enc_long1(kstring_t *s, int64_t x) {
2607
    uint32_t e = 0;
2608
    if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
2609
        return bcf_enc_int1(s, x);
2610
    if (x == bcf_int64_vector_end) {
2611
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2612
        e |= kputc(bcf_int8_vector_end, s) < 0;
2613
    } else if (x == bcf_int64_missing) {
2614
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2615
        e |= kputc(bcf_int8_missing, s) < 0;
2616
    } else {
2617
        e |= bcf_enc_size(s, 1, BCF_BT_INT64);
2618
        e |= ks_expand(s, 8);
2619
        if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; }
2620
    }
2621
    return e == 0 ? 0 : -1;
2622
}
2623
#endif
2624
2625
781k
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
2626
781k
    uint8_t *p;
2627
781k
    size_t i;
2628
781k
    size_t bytes = n * sizeof(float);
2629
2630
781k
    if (bytes / sizeof(float) != n) return -1;
2631
781k
    if (ks_resize(s, s->l + bytes) < 0) return -1;
2632
2633
781k
    p = (uint8_t *) s->s + s->l;
2634
148M
    for (i = 0; i < n; i++) {
2635
147M
        float_to_le(a[i], p);
2636
147M
        p += sizeof(float);
2637
147M
    }
2638
781k
    s->l += bytes;
2639
2640
781k
    return 0;
2641
781k
}
2642
2643
int bcf_enc_vfloat(kstring_t *s, int n, float *a)
2644
781k
{
2645
781k
    assert(n >= 0);
2646
781k
    bcf_enc_size(s, n, BCF_BT_FLOAT);
2647
781k
    serialize_float_array(s, n, a);
2648
781k
    return 0; // FIXME: check for errs in this function
2649
781k
}
2650
2651
int bcf_enc_vchar(kstring_t *s, int l, const char *a)
2652
2.48M
{
2653
2.48M
    bcf_enc_size(s, l, BCF_BT_CHAR);
2654
2.48M
    kputsn(a, l, s);
2655
2.48M
    return 0; // FIXME: check for errs in this function
2656
2.48M
}
2657
2658
// Special case of n==1 as it also occurs quite often in FORMAT data.
2659
// This version is also small enough to get inlined.
2660
11.8k
static inline int bcf_fmt_array1(kstring_t *s, int type, void *data) {
2661
11.8k
    uint32_t e = 0;
2662
11.8k
    uint8_t *p = (uint8_t *)data;
2663
11.8k
    int32_t v;
2664
2665
    // helps gcc more than clang here. In billions of cycles:
2666
    //          bcf_fmt_array1  bcf_fmt_array
2667
    // gcc7:    23.2            24.3
2668
    // gcc13:   21.6            23.0
2669
    // clang13: 27.1            27.8
2670
11.8k
    switch (type) {
2671
11.8k
    case BCF_BT_CHAR:
2672
11.8k
        e |= kputc_(*p == bcf_str_missing ? '.' : *p, s) < 0;
2673
11.8k
        break;
2674
2675
0
    case BCF_BT_INT8:
2676
0
        if (*(int8_t *)p != bcf_int8_vector_end) {
2677
0
            e |= ((*(int8_t *)p == bcf_int8_missing)
2678
0
                  ? kputc_('.', s)
2679
0
                  : kputw(*(int8_t *)p, s)) < 0;
2680
0
        }
2681
0
        break;
2682
0
    case BCF_BT_INT16:
2683
0
        v = le_to_i16(p);
2684
0
        if (v != bcf_int16_vector_end) {
2685
0
            e |= (v == bcf_int16_missing
2686
0
                  ? kputc_('.', s)
2687
0
                  : kputw(v, s)) < 0;
2688
0
        }
2689
0
        break;
2690
2691
0
    case BCF_BT_INT32:
2692
0
        v = le_to_i32(p);
2693
0
        if (v != bcf_int32_vector_end) {
2694
0
            e |= (v == bcf_int32_missing
2695
0
                  ? kputc_('.', s)
2696
0
                  : kputw(v, s)) < 0;
2697
0
        }
2698
0
        break;
2699
2700
0
    case BCF_BT_FLOAT:
2701
0
        v = le_to_u32(p);
2702
0
        if (v != bcf_float_vector_end) {
2703
0
            e |= (v == bcf_float_missing
2704
0
                  ? kputc_('.', s)
2705
0
                  : kputd(le_to_float(p), s)) < 0;
2706
0
        }
2707
0
        break;
2708
2709
0
    default:
2710
0
        hts_log_error("Unexpected type %d", type);
2711
0
        return -1;
2712
11.8k
    }
2713
2714
11.8k
    return e == 0 ? 0 : -1;
2715
11.8k
}
2716
2717
int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
2718
2.71M
{
2719
2.71M
    int j = 0;
2720
2.71M
    uint32_t e = 0;
2721
2.71M
    if (n == 0) {
2722
1.39M
        return kputc_('.', s) >= 0 ? 0 : -1;
2723
1.39M
    }
2724
2725
1.31M
    if (type == BCF_BT_CHAR)
2726
112k
    {
2727
112k
        char *p = (char *)data;
2728
2729
        // Note bcf_str_missing is already accounted for in n==0 above.
2730
112k
        if (n >= 8) {
2731
40.2k
            char *p_end = memchr(p, 0, n);
2732
40.2k
            e |= kputsn(p, p_end ? p_end-p : n, s) < 0;
2733
72.4k
        } else {
2734
234k
            for (j = 0; j < n && *p; ++j, ++p)
2735
161k
               e |= kputc(*p, s) < 0;
2736
72.4k
        }
2737
112k
    }
2738
1.20M
    else
2739
1.20M
    {
2740
1.20M
        #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
2741
1.20M
            uint8_t *p = (uint8_t *) data; \
2742
275M
            for (j=0; j<n; j++, p += sizeof(type_t))    \
2743
273M
            { \
2744
273M
                type_t v = convert(p); \
2745
273M
                if ( is_vector_end ) break; \
2746
273M
                if ( j ) e |= kputc_(',', s) < 0; \
2747
273M
                e |= (is_missing ? kputc('.', s) : kprint) < 0; \
2748
273M
            } \
2749
1.20M
        }
2750
1.20M
        switch (type) {
2751
342k
            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, v==bcf_int8_missing,  v==bcf_int8_vector_end,  kputw(v, s)); break;
2752
223k
            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
2753
245k
            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
2754
389k
            case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
2755
0
            default: hts_log_error("Unexpected type %d", type); exit(1); break;
2756
1.20M
        }
2757
1.20M
        #undef BRANCH
2758
1.20M
    }
2759
1.31M
    return e == 0 ? 0 : -1;
2760
1.31M
}
2761
2762
uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
2763
1.41M
{
2764
1.41M
    int x, type;
2765
1.41M
    x = bcf_dec_size(ptr, &ptr, &type);
2766
1.41M
    bcf_fmt_array(s, x, type, ptr);
2767
1.41M
    return ptr + (x << bcf_type_shift[type]);
2768
1.41M
}
2769
2770
/********************
2771
 *** VCF site I/O ***
2772
 ********************/
2773
2774
typedef struct {
2775
    int key;            // Key for h->id[BCF_DT_ID][key] vdict
2776
    int max_m;          // number of elements in field array (ie commas)
2777
    int size;           // field size (max_l or max_g*4 if is_gt)
2778
    int offset;         // offset of buf into h->mem
2779
    uint32_t is_gt:1,   // is genotype
2780
             max_g:31;  // maximum number of genotypes
2781
    uint32_t max_l;     // length of field
2782
    uint32_t y;         // h->id[0][fmt[j].key].val->info[BCF_HL_FMT]
2783
    uint8_t *buf;       // Pointer into h->mem
2784
} fmt_aux_t;
2785
2786
// fmt_aux_t field notes:
2787
// max_* are biggest sizes of the various FORMAT fields across all samples.
2788
// We use these after pivoting the data to ensure easy random access
2789
// of a specific sample.
2790
//
2791
// max_m is only used for type BCF_HT_REAL or BCF_HT_INT
2792
// max_g is only used for is_gt == 1 (will be BCF_HT_STR)
2793
// max_l is only used for is_gt == 0 (will be BCF_HT_STR)
2794
//
2795
// These are computed in vcf_parse_format_max3 and used in
2796
// vcf_parse_format_alloc4 to get the size.
2797
//
2798
// size is computed from max_g, max_l, max_m and is_gt.  Once computed
2799
// the max values are never accessed again.
2800
//
2801
// In theory all 4 vars could be coalesced into a single variable, but this
2802
// significantly harms speed (even if done via a union).  It's about 25-30%
2803
// slower.
2804
2805
static inline int align_mem(kstring_t *s)
2806
56.7k
{
2807
56.7k
    int e = 0;
2808
56.7k
    if (s->l&7) {
2809
10.5k
        uint64_t zero = 0;
2810
10.5k
        e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0;
2811
10.5k
    }
2812
56.7k
    return e == 0 ? 0 : -1;
2813
56.7k
}
2814
2815
57.6k
#define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
2816
2817
// detect FORMAT "."
2818
static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2819
12.6k
                                   const char *p, const char *q) {
2820
12.6k
    const char *end = s->s + s->l;
2821
12.6k
    if ( q>=end )
2822
37
    {
2823
37
        hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
2824
37
        v->errcode |= BCF_ERR_NCOLS;
2825
37
        return -1;
2826
37
    }
2827
2828
12.5k
    v->n_fmt = 0;
2829
12.5k
    if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
2830
124
    {
2831
124
        v->n_sample = bcf_hdr_nsamples(h);
2832
124
        return 1;
2833
124
    }
2834
2835
12.4k
    return 0;
2836
12.5k
}
2837
2838
// get format information from the dictionary
2839
static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2840
12.4k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
2841
12.4k
    const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2842
12.4k
    char *t;
2843
12.4k
    int j;
2844
12.4k
    ks_tokaux_t aux1;
2845
2846
70.0k
    for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
2847
57.6k
        if (j >= MAX_N_FMT) {
2848
3
            v->errcode |= BCF_ERR_LIMITS;
2849
3
            hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
2850
3
                bcf_seqname_safe(h,v), v->pos+1);
2851
3
            return -1;
2852
3
        }
2853
2854
57.6k
        *(char*)aux1.p = 0;
2855
57.6k
        khint_t k = kh_get(vdict, d, t);
2856
57.6k
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
2857
9.59k
            if ( t[0]=='.' && t[1]==0 )
2858
3
            {
2859
3
                hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2860
3
                v->errcode |= BCF_ERR_TAG_INVALID;
2861
3
                return -1;
2862
3
            }
2863
9.58k
            hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);
2864
9.58k
            kstring_t tmp = {0,0,0};
2865
9.58k
            int l;
2866
9.58k
            ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
2867
9.58k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2868
9.58k
            free(tmp.s);
2869
9.58k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
2870
9.58k
            if (res < 0) bcf_hrec_destroy(hrec);
2871
9.58k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
2872
2873
9.58k
            k = kh_get(vdict, d, t);
2874
9.58k
            v->errcode |= BCF_ERR_TAG_UNDEF;
2875
9.58k
            if (res || k == kh_end(d)) {
2876
20
                hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
2877
20
                v->errcode |= BCF_ERR_TAG_INVALID;
2878
20
                return -1;
2879
20
            }
2880
9.58k
        }
2881
57.6k
        fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
2882
57.6k
        fmt[j].key = kh_val(d, k).id;
2883
57.6k
        fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]);
2884
57.6k
        fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
2885
57.6k
        v->n_fmt++;
2886
57.6k
    }
2887
12.4k
    return 0;
2888
12.4k
}
2889
2890
// compute max
2891
static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2892
12.4k
                                 char *p, char *q, fmt_aux_t *fmt) {
2893
12.4k
    int n_sample_ori = -1;
2894
12.4k
    char *r = q + 1;  // r: position in the format string
2895
12.4k
    int l = 0, m = 1, g = 1, j;
2896
12.4k
    v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
2897
12.4k
    const char *end = s->s + s->l;
2898
2899
30.6k
    while ( r<end )
2900
30.5k
    {
2901
        // can we skip some samples?
2902
30.5k
        if ( h->keep_samples )
2903
0
        {
2904
0
            n_sample_ori++;
2905
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
2906
0
            {
2907
0
                while ( *r!='\t' && r<end ) r++;
2908
0
                if ( *r=='\t' ) { *r = 0; r++; }
2909
0
                continue;
2910
0
            }
2911
0
        }
2912
2913
        // collect fmt stats: max vector size, length, number of alleles
2914
30.5k
        j = 0;  // j-th format field
2915
30.5k
        fmt_aux_t *f = fmt;
2916
30.5k
        static char meta[256] = {
2917
            // \0 \t , / : |
2918
30.5k
            1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2919
30.5k
            0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1, 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
2920
30.5k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2921
30.5k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
2922
30.5k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2923
30.5k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2924
30.5k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2925
30.5k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2926
30.5k
        };
2927
2928
30.5k
        char *r_start = r;
2929
8.75M
        for (;;) {
2930
            // Quickly skip ahead to an appropriate meta-character
2931
10.0M
            while (!meta[(unsigned char)*r]) r++;
2932
2933
8.75M
            switch (*r) {
2934
8.68M
            case ',':
2935
8.68M
                m++;
2936
8.68M
                break;
2937
2938
771
            case '|':
2939
25.7k
            case '/':
2940
25.7k
                if (f->is_gt) g++;
2941
25.7k
                break;
2942
2943
8.83k
            case '\t':
2944
8.83k
                *r = 0; // fall through
2945
2946
8.83k
            default: // valid due to while loop above.
2947
30.4k
            case '\0':
2948
45.8k
            case ':':
2949
45.8k
                l = r - r_start; r_start = r;
2950
45.8k
                if (f->max_m < m) f->max_m = m;
2951
45.8k
                if (f->max_l < l) f->max_l = l;
2952
45.8k
                if (f->is_gt && f->max_g < g) f->max_g = g;
2953
45.8k
                l = 0, m = g = 1;
2954
45.8k
                if ( *r==':' ) {
2955
15.3k
                    j++; f++;
2956
15.3k
                    if ( j>=v->n_fmt ) {
2957
27
                        hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
2958
27
                                      h->id[BCF_DT_CTG][v->rid].key, v->pos+1);
2959
27
                        v->errcode |= BCF_ERR_NCOLS;
2960
27
                        return -1;
2961
27
                    }
2962
30.4k
                } else goto end_for;
2963
15.3k
                break;
2964
8.75M
            }
2965
8.72M
            if ( r>=end ) break;
2966
8.72M
            r++;
2967
8.72M
        }
2968
30.4k
    end_for:
2969
30.4k
        v->n_sample++;
2970
30.4k
        if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
2971
18.2k
        r++;
2972
18.2k
    }
2973
2974
12.3k
    return 0;
2975
12.4k
}
2976
2977
// allocate memory for arrays
2978
static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2979
                                   const char *p, const char *q,
2980
12.3k
                                   fmt_aux_t *fmt) {
2981
12.3k
    kstring_t *mem = (kstring_t*)&h->mem;
2982
2983
12.3k
    int j;
2984
69.1k
    for (j = 0; j < v->n_fmt; ++j) {
2985
56.7k
        fmt_aux_t *f = &fmt[j];
2986
56.7k
        if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
2987
2988
56.7k
        if ((f->y>>4&0xf) == BCF_HT_STR) {
2989
56.7k
            f->size = f->is_gt? f->max_g << 2 : f->max_l;
2990
56.7k
        } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
2991
0
            f->size = f->max_m << 2;
2992
0
        } else {
2993
0
            hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
2994
0
            v->errcode |= BCF_ERR_TAG_INVALID;
2995
0
            return -1;
2996
0
        }
2997
2998
56.7k
        if (align_mem(mem) < 0) {
2999
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3000
0
            v->errcode |= BCF_ERR_LIMITS;
3001
0
            return -1;
3002
0
        }
3003
3004
        // Limit the total memory to ~2Gb per VCF row.  This should mean
3005
        // malformed VCF data is less likely to take excessive memory and/or
3006
        // time.
3007
56.7k
        if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
3008
0
            static int warned = 0;
3009
0
            if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3010
0
            warned = 1;
3011
0
            v->errcode |= BCF_ERR_LIMITS;
3012
0
            f->size = -1;
3013
0
            f->offset = 0;
3014
0
            continue;
3015
0
        }
3016
3017
56.7k
        f->offset = mem->l;
3018
56.7k
        if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
3019
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3020
0
            v->errcode |= BCF_ERR_LIMITS;
3021
0
            return -1;
3022
0
        }
3023
56.7k
        mem->l += v->n_sample * f->size;
3024
56.7k
    }
3025
3026
12.3k
    {
3027
12.3k
        int j;
3028
69.1k
        for (j = 0; j < v->n_fmt; ++j)
3029
56.7k
            fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
3030
12.3k
    }
3031
3032
    // check for duplicate tags
3033
12.3k
    int i;
3034
56.7k
    for (i=1; i<v->n_fmt; i++)
3035
44.3k
    {
3036
44.3k
        fmt_aux_t *ifmt = &fmt[i];
3037
44.3k
        if ( ifmt->size==-1 ) continue; // already marked for removal
3038
252k
        for (j=0; j<i; j++)
3039
236k
        {
3040
236k
            fmt_aux_t *jfmt = &fmt[j];
3041
236k
            if ( jfmt->size==-1 ) continue; // already marked for removal
3042
126k
            if ( ifmt->key!=jfmt->key ) continue;
3043
27.7k
            static int warned = 0;
3044
27.7k
            if ( !warned ) hts_log_warning("Duplicate FORMAT tag %s at %s:%"PRIhts_pos, bcf_hdr_int2id(h,BCF_DT_ID,ifmt->key), bcf_seqname_safe(h,v), v->pos+1);
3045
27.7k
            warned = 1;
3046
27.7k
            v->errcode |= BCF_ERR_TAG_INVALID;
3047
27.7k
            ifmt->size = -1;
3048
27.7k
            ifmt->offset = 0;
3049
27.7k
            break;
3050
126k
        }
3051
44.3k
    }
3052
12.3k
    return 0;
3053
12.3k
}
3054
3055
// Fill the sample fields
3056
static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3057
12.3k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3058
12.3k
    static int extreme_val_warned = 0;
3059
12.3k
    int n_sample_ori = -1;
3060
    // At beginning of the loop t points to the first char of a format
3061
12.3k
    const char *t = q + 1;
3062
12.3k
    int m = 0;   // m: sample id
3063
12.3k
    const int nsamples = bcf_hdr_nsamples(h);
3064
3065
12.3k
    const char *end = s->s + s->l;
3066
42.5k
    while ( t<end )
3067
38.0k
    {
3068
        // can we skip some samples?
3069
38.0k
        if ( h->keep_samples )
3070
0
        {
3071
0
            n_sample_ori++;
3072
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3073
0
            {
3074
0
                while ( *t && t<end ) t++;
3075
0
                t++;
3076
0
                continue;
3077
0
            }
3078
0
        }
3079
38.0k
        if ( m == nsamples ) break;
3080
3081
30.4k
        int j = 0; // j-th format field, m-th sample
3082
45.7k
        while ( t < end )
3083
43.7k
        {
3084
43.7k
            fmt_aux_t *z = &fmt[j++];
3085
43.7k
            const int htype = z->y>>4&0xf;
3086
43.7k
            if (!z->buf) {
3087
19
                hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
3088
19
                              z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3089
19
                v->errcode |= BCF_ERR_LIMITS;
3090
19
                return -1;
3091
19
            }
3092
3093
43.7k
            if ( z->size==-1 )
3094
6.37k
            {
3095
                // this field is to be ignored, it's either too big or a duplicate
3096
107k
                while ( *t != ':' && *t ) t++;
3097
6.37k
            }
3098
37.3k
            else if (htype == BCF_HT_STR) {
3099
37.3k
                int l;
3100
37.3k
                if (z->is_gt) {
3101
                    // Genotypes.
3102
                    // <val>([|/]<val>)+... where <val> is [0-9]+ or ".".
3103
4.54k
                    int32_t is_phased = 0;
3104
4.54k
                    uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
3105
4.54k
                    uint32_t unreadable = 0;
3106
4.54k
                    uint32_t max = 0;
3107
4.54k
                    int overflow = 0;
3108
26.7k
                    for (l = 0;; ++t) {
3109
26.7k
                        if (*t == '.') {
3110
6.80k
                            ++t, x[l++] = is_phased;
3111
19.9k
                        } else {
3112
19.9k
                            const char *tt = t;
3113
19.9k
                            uint32_t val;
3114
                            // Or "v->n_allele < 10", but it doesn't
3115
                            // seem to be any faster and this feels safer.
3116
19.9k
                            if (*t >= '0' && *t <= '9' &&
3117
19.9k
                                !(t[1] >= '0' && t[1] <= '9')) {
3118
9.99k
                                val = *t++ - '0';
3119
9.99k
                            } else {
3120
9.94k
                                val = hts_str2uint(t, (char **)&t,
3121
9.94k
                                                   sizeof(val) * CHAR_MAX - 2,
3122
9.94k
                                                   &overflow);
3123
9.94k
                                unreadable |= tt == t;
3124
9.94k
                            }
3125
19.9k
                            if (max < val) max = val;
3126
19.9k
                            x[l++] = (val + 1) << 1 | is_phased;
3127
19.9k
                        }
3128
26.7k
                        is_phased = (*t == '|');
3129
26.7k
                        if (*t != '|' && *t != '/') break;
3130
26.7k
                    }
3131
                    // Possibly check max against v->n_allele instead?
3132
4.54k
                    if (overflow || max > (INT32_MAX >> 1) - 1) {
3133
173
                        hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3134
173
                        return -1;
3135
173
                    }
3136
4.37k
                    if (unreadable) {
3137
52
                        hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3138
52
                        return -1;
3139
52
                    }
3140
4.32k
                    if ( !l ) x[l++] = 0;   // An empty field, insert missing value
3141
5.95k
                    for (; l < z->size>>2; ++l)
3142
1.63k
                        x[l] = bcf_int32_vector_end;
3143
3144
32.8k
                } else {
3145
                    // Otherwise arbitrary strings
3146
32.8k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3147
9.54M
                    for (l = 0; *t != ':' && *t; ++t)
3148
9.51M
                        x[l++] = *t;
3149
32.8k
                    if (z->size > l)
3150
17.7k
                        memset(&x[l], 0, (z->size-l) * sizeof(*x));
3151
32.8k
                }
3152
3153
37.3k
            } else if (htype == BCF_HT_INT) {
3154
                // One or more integers in an array
3155
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3156
0
                int l;
3157
0
                for (l = 0;; ++t) {
3158
0
                    if (*t == '.') {
3159
0
                        x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
3160
0
                    } else {
3161
0
                        int overflow = 0;
3162
0
                        char *te;
3163
0
                        long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3164
0
                        if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3165
0
                        {
3166
0
                            if ( !extreme_val_warned )
3167
0
                            {
3168
0
                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,
3169
0
                                                h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
3170
0
                                extreme_val_warned = 1;
3171
0
                            }
3172
0
                            tmp_val = bcf_int32_missing;
3173
0
                        }
3174
0
                        x[l++] = tmp_val;
3175
0
                        t = te;
3176
0
                    }
3177
0
                    if (*t != ',') break;
3178
0
                }
3179
0
                if ( !l )
3180
0
                    x[l++] = bcf_int32_missing;
3181
0
                for (; l < z->size>>2; ++l)
3182
0
                    x[l] = bcf_int32_vector_end;
3183
3184
0
            } else if (htype == BCF_HT_REAL) {
3185
                // One of more floating point values in an array
3186
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3187
0
                int l;
3188
0
                for (l = 0;; ++t) {
3189
0
                    if (*t == '.' && !isdigit_c(t[1])) {
3190
0
                        bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
3191
0
                    } else {
3192
0
                        int overflow = 0;
3193
0
                        char *te;
3194
0
                        float tmp_val = hts_str2dbl(t, &te, &overflow);
3195
0
                        if ( (te==t || overflow) && !extreme_val_warned )
3196
0
                        {
3197
0
                            hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
3198
0
                            extreme_val_warned = 1;
3199
0
                        }
3200
0
                        x[l++] = tmp_val;
3201
0
                        t = te;
3202
0
                    }
3203
0
                    if (*t != ',') break;
3204
0
                }
3205
0
                if ( !l )
3206
                    // An empty field, insert missing value
3207
0
                    bcf_float_set_missing(x[l++]);
3208
0
                for (; l < z->size>>2; ++l)
3209
0
                    bcf_float_set_vector_end(x[l]);
3210
0
            } else {
3211
0
                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1);
3212
0
                v->errcode |= BCF_ERR_TAG_INVALID;
3213
0
                return -1;
3214
0
            }
3215
3216
43.5k
            if (*t == '\0') {
3217
28.1k
                break;
3218
28.1k
            }
3219
15.3k
            else if (*t == ':') {
3220
15.3k
                t++;
3221
15.3k
            }
3222
27
            else {
3223
27
                char buffer[8];
3224
27
                hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
3225
27
                    hts_strprint(buffer, sizeof buffer, '\'', t, 1),
3226
27
                    h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
3227
27
                v->errcode |= BCF_ERR_CHAR;
3228
27
                return -1;
3229
27
            }
3230
43.5k
        }
3231
3232
        // fill end-of-vector values
3233
508k
        for (; j < v->n_fmt; ++j) {
3234
478k
            fmt_aux_t *z = &fmt[j];
3235
478k
            const int htype = z->y>>4&0xf;
3236
478k
            int l;
3237
3238
478k
            if (z->size == -1) // this field is to be ignored
3239
402k
                continue;
3240
3241
76.0k
            if (htype == BCF_HT_STR) {
3242
76.0k
                if (z->is_gt) {
3243
10.1k
                    int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3244
10.1k
                    if (z->size) x[0] = bcf_int32_missing;
3245
38.6k
                    for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3246
65.9k
                } else {
3247
65.9k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3248
65.9k
                    if ( z->size ) {
3249
21.7k
                        x[0] = '.';
3250
21.7k
                        memset(&x[1], 0, (z->size-1) * sizeof(*x));
3251
21.7k
                    }
3252
65.9k
                }
3253
76.0k
            } else if (htype == BCF_HT_INT) {
3254
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3255
0
                x[0] = bcf_int32_missing;
3256
0
                for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3257
0
            } else if (htype == BCF_HT_REAL) {
3258
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3259
0
                bcf_float_set_missing(x[0]);
3260
0
                for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
3261
0
            }
3262
76.0k
        }
3263
3264
30.1k
        m++; t++;
3265
30.1k
    }
3266
3267
12.1k
    return 0;
3268
12.3k
}
3269
3270
// write individual genotype information
3271
static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3272
12.1k
                                const char *p, const char *q, fmt_aux_t *fmt) {
3273
12.1k
    kstring_t *str = &v->indiv;
3274
12.1k
    int i, need_downsize = 0;
3275
12.1k
    if (v->n_sample > 0) {
3276
62.8k
        for (i = 0; i < v->n_fmt; ++i) {
3277
50.7k
            fmt_aux_t *z = &fmt[i];
3278
50.7k
            if ( z->size==-1 ) {
3279
23.1k
                need_downsize = 1;
3280
23.1k
                continue;
3281
23.1k
            }
3282
27.6k
            bcf_enc_int1(str, z->key);
3283
27.6k
            if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
3284
22.5k
                bcf_enc_size(str, z->size, BCF_BT_CHAR);
3285
22.5k
                kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
3286
22.5k
            } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
3287
5.17k
                bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
3288
5.17k
            } else {
3289
0
                bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
3290
0
                if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
3291
0
                                          (float *) z->buf) != 0) {
3292
0
                    v->errcode |= BCF_ERR_LIMITS;
3293
0
                    hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3294
0
                    return -1;
3295
0
                }
3296
0
            }
3297
27.6k
        }
3298
3299
12.0k
    }
3300
12.1k
    if ( need_downsize ) {
3301
3.64k
        i = 0;
3302
37.6k
        while ( i < v->n_fmt ) {
3303
34.0k
            if ( fmt[i].size==-1 )
3304
23.1k
            {
3305
23.1k
                v->n_fmt--;
3306
23.1k
                if ( i < v->n_fmt ) memmove(&fmt[i],&fmt[i+1],sizeof(*fmt)*(v->n_fmt-i));
3307
23.1k
            }
3308
10.9k
            else
3309
10.9k
                i++;
3310
34.0k
        }
3311
3.64k
    }
3312
12.1k
    return 0;
3313
12.1k
}
3314
3315
// validity checking
3316
12.1k
static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
3317
12.1k
    if ( v->n_sample!=bcf_hdr_nsamples(h) )
3318
146
    {
3319
146
        hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
3320
146
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
3321
146
        v->errcode |= BCF_ERR_NCOLS;
3322
146
        return -1;
3323
146
    }
3324
11.9k
    if ( v->indiv.l > 0xffffffff )
3325
0
    {
3326
0
        hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
3327
0
        v->errcode |= BCF_ERR_LIMITS;
3328
3329
        // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
3330
0
        v->n_fmt = 0;
3331
0
        return -1;
3332
0
    }
3333
3334
11.9k
    return 0;
3335
11.9k
}
3336
3337
// p,q is the start and the end of the FORMAT field
3338
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3339
                            char *p, char *q)
3340
22.2k
{
3341
22.2k
    if ( !bcf_hdr_nsamples(h) ) return 0;
3342
12.6k
    kstring_t *mem = (kstring_t*)&h->mem;
3343
12.6k
    mem->l = 0;
3344
3345
12.6k
    fmt_aux_t fmt[MAX_N_FMT];
3346
3347
    // detect FORMAT "."
3348
12.6k
    int ret; // +ve = ok, -ve = err
3349
12.6k
    if ((ret = vcf_parse_format_empty1(s, h, v, p, q)))
3350
161
        return ret ? 0 : -1;
3351
3352
    // get format information from the dictionary
3353
12.4k
    if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
3354
26
        return -1;
3355
3356
    // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is
3357
    // stored as per-type arrays AAA... BBB... CCC...  This is basically
3358
    // a data rotation or pivot.
3359
3360
    // The size of elements in the array grow to their maximum needed,
3361
    // permitting fast random access.  This means however we have to first
3362
    // scan the whole FORMAT line to find the maximum of each type, and
3363
    // then scan it again to find the store the data.
3364
    // We break this down into compute-max, allocate, fill-out-buffers
3365
3366
    // TODO: ?
3367
    // The alternative would be to pivot on the first pass, with fixed
3368
    // size entries for numerics and concatenated strings otherwise, also
3369
    // tracking maximum sizes.  Then on a second pass we reallocate and
3370
    // copy the data again to a uniformly sized array.  Two passes through
3371
    // memory, but without doubling string parsing.
3372
3373
    // compute max
3374
12.4k
    if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
3375
27
        return -1;
3376
3377
    // allocate memory for arrays
3378
12.3k
    if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
3379
0
        return -1;
3380
3381
    // fill the sample fields; at beginning of the loop
3382
12.3k
    if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
3383
271
        return -1;
3384
3385
    // write individual genotype information
3386
12.1k
    if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
3387
0
        return -1;
3388
3389
    // validity checking
3390
12.1k
    if (vcf_parse_format_check7(h, v) < 0)
3391
146
        return -1;
3392
3393
11.9k
    return 0;
3394
12.1k
}
3395
3396
6.97k
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
3397
    // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
3398
    // been already printed, but will enable tools like vcfcheck to proceed.
3399
3400
6.97k
    kstring_t tmp = {0,0,0};
3401
6.97k
    khint_t k;
3402
6.97k
    int l;
3403
6.97k
    if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0)
3404
0
        return kh_end(d);
3405
6.97k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3406
6.97k
    free(tmp.s);
3407
6.97k
    int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3408
6.97k
    if (res < 0) bcf_hrec_destroy(hrec);
3409
6.97k
    if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3410
6.97k
    k = kh_get(vdict, d, p);
3411
3412
6.97k
    return k;
3413
6.97k
}
3414
3415
33.7k
static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3416
33.7k
    int i, n_flt = 1, max_n_flt = 0;
3417
33.7k
    char *r, *t;
3418
33.7k
    int32_t *a_flt = NULL;
3419
33.7k
    ks_tokaux_t aux1;
3420
33.7k
    khint_t k;
3421
33.7k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3422
    // count the number of filters
3423
33.7k
    if (*(q-1) == ';') *(q-1) = 0;
3424
515M
    for (r = p; *r; ++r)
3425
515M
        if (*r == ';') ++n_flt;
3426
33.7k
    if (n_flt > max_n_flt) {
3427
33.7k
        a_flt = malloc(n_flt * sizeof(*a_flt));
3428
33.7k
        if (!a_flt) {
3429
0
            hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3430
0
            v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3431
0
            return -1;
3432
0
        }
3433
33.7k
        max_n_flt = n_flt;
3434
33.7k
    }
3435
    // add filters
3436
2.84M
    for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
3437
2.81M
        *(char*)aux1.p = 0;
3438
2.81M
        k = kh_get(vdict, d, t);
3439
2.81M
        if (k == kh_end(d))
3440
60.1k
        {
3441
            // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
3442
            // been already printed, but will enable tools like vcfcheck to proceed.
3443
60.1k
            hts_log_warning("FILTER '%s' is not defined in the header", t);
3444
60.1k
            kstring_t tmp = {0,0,0};
3445
60.1k
            int l;
3446
60.1k
            ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
3447
60.1k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3448
60.1k
            free(tmp.s);
3449
60.1k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3450
60.1k
            if (res < 0) bcf_hrec_destroy(hrec);
3451
60.1k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3452
60.1k
            k = kh_get(vdict, d, t);
3453
60.1k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3454
60.1k
            if (res || k == kh_end(d)) {
3455
63
                hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3456
63
                v->errcode |= BCF_ERR_TAG_INVALID;
3457
63
                free(a_flt);
3458
63
                return -1;
3459
63
            }
3460
60.1k
        }
3461
2.81M
        a_flt[i++] = kh_val(d, k).id;
3462
2.81M
    }
3463
3464
33.7k
    bcf_enc_vint(str, n_flt, a_flt, -1);
3465
33.7k
    free(a_flt);
3466
3467
33.7k
    return 0;
3468
33.7k
}
3469
3470
27.9k
static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3471
27.9k
    static int extreme_int_warned = 0, negative_rlen_warned = 0;
3472
27.9k
    int max_n_val = 0, overflow = 0;
3473
27.9k
    char *r, *key;
3474
27.9k
    khint_t k;
3475
27.9k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3476
27.9k
    int32_t *a_val = NULL;
3477
3478
27.9k
    v->n_info = 0;
3479
27.9k
    if (*(q-1) == ';') *(q-1) = 0;
3480
5.81M
    for (r = key = p;; ++r) {
3481
5.81M
        int c;
3482
5.81M
        char *val, *end;
3483
426M
        while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
3484
5.81M
        if (v->n_info == UINT16_MAX) {
3485
14
            hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
3486
14
                          bcf_seqname_safe(h,v), v->pos+1);
3487
14
            v->errcode |= BCF_ERR_LIMITS;
3488
14
            goto fail;
3489
14
        }
3490
5.81M
        val = end = NULL;
3491
5.81M
        c = *r; *r = 0;
3492
5.81M
        if (c == '=') {
3493
2.50M
            val = r + 1;
3494
3495
706M
            for (end = val; *end != ';' && *end != 0; ++end);
3496
2.50M
            c = *end; *end = 0;
3497
3.30M
        } else end = r;
3498
5.81M
        if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
3499
5.77M
        k = kh_get(vdict, d, key);
3500
5.77M
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
3501
53.8k
        {
3502
53.8k
            hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
3503
53.8k
            kstring_t tmp = {0,0,0};
3504
53.8k
            int l;
3505
53.8k
            ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
3506
53.8k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3507
53.8k
            free(tmp.s);
3508
53.8k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3509
53.8k
            if (res < 0) bcf_hrec_destroy(hrec);
3510
53.8k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3511
53.8k
            k = kh_get(vdict, d, key);
3512
53.8k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3513
53.8k
            if (res || k == kh_end(d)) {
3514
129
                hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
3515
129
                v->errcode |= BCF_ERR_TAG_INVALID;
3516
129
                goto fail;
3517
129
            }
3518
53.8k
        }
3519
5.77M
        uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
3520
5.77M
        ++v->n_info;
3521
5.77M
        bcf_enc_int1(str, kh_val(d, k).id);
3522
5.77M
        if (val == 0) {
3523
3.26M
            bcf_enc_size(str, 0, BCF_BT_NULL);
3524
3.26M
        } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
3525
64.3k
            bcf_enc_vchar(str, end - val, val);
3526
2.44M
        } else { // int/float value/array
3527
2.44M
            int i, n_val;
3528
2.44M
            char *t, *te;
3529
686M
            for (t = val, n_val = 1; *t; ++t) // count the number of values
3530
683M
                if (*t == ',') ++n_val;
3531
            // Check both int and float size in one step for simplicity
3532
2.44M
            if (n_val > max_n_val) {
3533
6.57k
                int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val));
3534
6.57k
                if (!a_tmp) {
3535
0
                    hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3536
0
                    v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3537
0
                    goto fail;
3538
0
                }
3539
6.57k
                a_val = a_tmp;
3540
6.57k
                max_n_val = n_val;
3541
6.57k
            }
3542
2.44M
            if ((y>>4&0xf) == BCF_HT_INT) {
3543
1.66M
                i = 0, t = val;
3544
1.66M
                int64_t val1;
3545
1.66M
                int is_int64 = 0;
3546
#ifdef VCF_ALLOW_INT64
3547
                if ( n_val==1 )
3548
                {
3549
                    overflow = 0;
3550
                    long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3551
                    if ( te==val ) tmp_val = bcf_int32_missing;
3552
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
3553
                    {
3554
                        if ( !extreme_int_warned )
3555
                        {
3556
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3557
                            extreme_int_warned = 1;
3558
                        }
3559
                        tmp_val = bcf_int32_missing;
3560
                    }
3561
                    else
3562
                        is_int64 = 1;
3563
                    val1 = tmp_val;
3564
                    t = te;
3565
                    i = 1;  // this is just to avoid adding another nested block...
3566
                }
3567
#endif
3568
404M
                for (; i < n_val; ++i, ++t)
3569
403M
                {
3570
403M
                    overflow = 0;
3571
403M
                    long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3572
403M
                    if ( te==t ) tmp_val = bcf_int32_missing;
3573
3.32M
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3574
474k
                    {
3575
474k
                        if ( !extreme_int_warned )
3576
1
                        {
3577
1
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3578
1
                            extreme_int_warned = 1;
3579
1
                        }
3580
474k
                        tmp_val = bcf_int32_missing;
3581
474k
                    }
3582
403M
                    a_val[i] = tmp_val;
3583
481M
                    for (t = te; *t && *t != ','; t++);
3584
403M
                }
3585
1.66M
                if (n_val == 1) {
3586
#ifdef VCF_ALLOW_INT64
3587
                    if ( is_int64 )
3588
                    {
3589
                        v->unpacked |= BCF_IS_64BIT;
3590
                        bcf_enc_long1(str, val1);
3591
                    }
3592
                    else
3593
                        bcf_enc_int1(str, (int32_t)val1);
3594
#else
3595
1.30M
                    val1 = a_val[0];
3596
1.30M
                    bcf_enc_int1(str, (int32_t)val1);
3597
1.30M
#endif
3598
1.30M
                } else {
3599
351k
                    bcf_enc_vint(str, n_val, a_val, -1);
3600
351k
                }
3601
1.66M
                if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
3602
1.66M
                    && memcmp(key, "END", 4) == 0)
3603
0
                {
3604
0
                    if ( val1 <= v->pos )
3605
0
                    {
3606
0
                        if ( !negative_rlen_warned )
3607
0
                        {
3608
0
                            hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
3609
0
                            negative_rlen_warned = 1;
3610
0
                        }
3611
0
                    }
3612
0
                    else
3613
0
                        v->rlen = val1 - v->pos;
3614
0
                }
3615
1.66M
            } else if ((y>>4&0xf) == BCF_HT_REAL) {
3616
781k
                float *val_f = (float *)a_val;
3617
148M
                for (i = 0, t = val; i < n_val; ++i, ++t)
3618
147M
                {
3619
147M
                    overflow = 0;
3620
147M
                    val_f[i] = hts_str2dbl(t, &te, &overflow);
3621
147M
                    if ( te==t || overflow ) // conversion failed
3622
145M
                        bcf_float_set_missing(val_f[i]);
3623
181M
                    for (t = te; *t && *t != ','; t++);
3624
147M
                }
3625
781k
                bcf_enc_vfloat(str, n_val, val_f);
3626
781k
            }
3627
2.44M
        }
3628
5.77M
        if (c == 0) break;
3629
5.75M
        r = end;
3630
5.75M
        key = r + 1;
3631
5.75M
    }
3632
3633
27.8k
    free(a_val);
3634
27.8k
    return 0;
3635
3636
143
 fail:
3637
143
    free(a_val);
3638
143
    return -1;
3639
27.9k
}
3640
3641
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
3642
36.1k
{
3643
36.1k
    int ret = -2, overflow = 0;
3644
36.1k
    char *p, *q, *r, *t;
3645
36.1k
    kstring_t *str;
3646
36.1k
    khint_t k;
3647
36.1k
    ks_tokaux_t aux;
3648
3649
//#define NOT_DOT(p) strcmp((p), ".")
3650
//#define NOT_DOT(p) (!(*p == '.' && !p[1]))
3651
//#define NOT_DOT(p) ((*p) != '.' || (p)[1])
3652
//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2))
3653
175k
#define NOT_DOT(p) (memcmp(p, ".\0", 2))
3654
3655
36.1k
    if (!s || !h || !v || !(s->s))
3656
0
        return ret;
3657
3658
    // Assumed in lots of places, but we may as well spot this early
3659
36.1k
    assert(sizeof(float) == sizeof(int32_t));
3660
3661
    // Ensure string we parse has space to permit some over-flow when during
3662
    // parsing.  Eg to do memcmp(key, "END", 4) in vcf_parse_info over
3663
    // the more straight forward looking strcmp, giving a speed advantage.
3664
36.1k
    if (ks_resize(s, s->l+4) < 0)
3665
0
        return -1;
3666
3667
    // Force our memory to be initialised so we avoid the technicality of
3668
    // undefined behaviour in using a 4-byte memcmp.  (The reality is this
3669
    // almost certainly is never detected by the compiler so has no impact,
3670
    // but equally so this code has minimal (often beneficial) impact on
3671
    // performance too.)
3672
36.1k
    s->s[s->l+0] = 0;
3673
36.1k
    s->s[s->l+1] = 0;
3674
36.1k
    s->s[s->l+2] = 0;
3675
36.1k
    s->s[s->l+3] = 0;
3676
3677
36.1k
    bcf_clear1(v);
3678
36.1k
    str = &v->shared;
3679
36.1k
    memset(&aux, 0, sizeof(ks_tokaux_t));
3680
3681
    // CHROM
3682
36.1k
    if (!(p = kstrtok(s->s, "\t", &aux)))
3683
0
        goto err;
3684
36.1k
    *(q = (char*)aux.p) = 0;
3685
3686
36.1k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
3687
36.1k
    k = kh_get(vdict, d, p);
3688
36.1k
    if (k == kh_end(d)) {
3689
6.97k
        hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
3690
6.97k
        v->errcode = BCF_ERR_CTG_UNDEF;
3691
6.97k
        if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
3692
115
            hts_log_error("Could not add dummy header for contig '%s'", p);
3693
115
            v->errcode |= BCF_ERR_CTG_INVALID;
3694
115
            goto err;
3695
115
        }
3696
6.97k
    }
3697
36.0k
    v->rid = kh_val(d, k).id;
3698
3699
    // POS
3700
36.0k
    if (!(p = kstrtok(0, 0, &aux)))
3701
419
        goto err;
3702
35.6k
    *(q = (char*)aux.p) = 0;
3703
3704
35.6k
    overflow = 0;
3705
35.6k
    char *tmp = p;
3706
35.6k
    v->pos = hts_str2uint(p, &p, 63, &overflow);
3707
35.6k
    if (overflow) {
3708
63
        hts_log_error("Position value '%s' is too large", tmp);
3709
63
        goto err;
3710
35.5k
    } else if ( *p ) {
3711
97
        hts_log_error("Could not parse the position '%s'", tmp);
3712
97
        goto err;
3713
35.4k
    } else {
3714
35.4k
        v->pos -= 1;
3715
35.4k
    }
3716
35.4k
    if (v->pos >= INT32_MAX)
3717
2.20k
        v->unpacked |= BCF_IS_64BIT;
3718
3719
    // ID
3720
35.4k
    if (!(p = kstrtok(0, 0, &aux)))
3721
127
        goto err;
3722
35.3k
    *(q = (char*)aux.p) = 0;
3723
3724
35.3k
    if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p);
3725
308
    else bcf_enc_size(str, 0, BCF_BT_CHAR);
3726
3727
    // REF
3728
35.3k
    if (!(p = kstrtok(0, 0, &aux)))
3729
61
        goto err;
3730
35.2k
    *(q = (char*)aux.p) = 0;
3731
3732
35.2k
    bcf_enc_vchar(str, q - p, p);
3733
35.2k
    v->n_allele = 1, v->rlen = q - p;
3734
3735
    // ALT
3736
35.2k
    if (!(p = kstrtok(0, 0, &aux)))
3737
37
        goto err;
3738
35.2k
    *(q = (char*)aux.p) = 0;
3739
3740
35.2k
    if (NOT_DOT(p)) {
3741
83.0M
        for (r = t = p;; ++r) {
3742
83.0M
            if (*r == ',' || *r == 0) {
3743
2.35M
                if (v->n_allele == UINT16_MAX) {
3744
3
                    hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
3745
3
                                  bcf_seqname_safe(h,v), v->pos+1);
3746
3
                    v->errcode |= BCF_ERR_LIMITS;
3747
3
                    goto err;
3748
3
                }
3749
2.35M
                bcf_enc_vchar(str, r - t, t);
3750
2.35M
                t = r + 1;
3751
2.35M
                ++v->n_allele;
3752
2.35M
            }
3753
83.0M
            if (r == q) break;
3754
83.0M
        }
3755
34.6k
    }
3756
3757
    // QUAL
3758
35.2k
    if (!(p = kstrtok(0, 0, &aux)))
3759
79
        goto err;
3760
35.1k
    *(q = (char*)aux.p) = 0;
3761
3762
35.1k
    if (NOT_DOT(p)) v->qual = atof(p);
3763
1.57k
    else bcf_float_set_missing(v->qual);
3764
35.1k
    if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
3765
3766
    // FILTER
3767
35.1k
    if (!(p = kstrtok(0, 0, &aux)))
3768
62
        goto err;
3769
35.0k
    *(q = (char*)aux.p) = 0;
3770
3771
35.0k
    if (NOT_DOT(p)) {
3772
33.7k
        if (vcf_parse_filter(str, h, v, p, q)) {
3773
63
            goto err;
3774
63
        }
3775
33.7k
    } else bcf_enc_vint(str, 0, 0, -1);
3776
35.0k
    if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
3777
3778
    // INFO
3779
35.0k
    if (!(p = kstrtok(0, 0, &aux)))
3780
165
        goto err;
3781
34.8k
    *(q = (char*)aux.p) = 0;
3782
3783
34.8k
    if (NOT_DOT(p)) {
3784
27.9k
        if (vcf_parse_info(str, h, v, p, q)) {
3785
143
            goto err;
3786
143
        }
3787
27.9k
    }
3788
34.7k
    if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
3789
3790
    // FORMAT; optional
3791
34.7k
    p = kstrtok(0, 0, &aux);
3792
34.7k
    if (p) {
3793
22.2k
        *(q = (char*)aux.p) = 0;
3794
3795
22.2k
        return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
3796
22.2k
    } else {
3797
12.4k
        return 0;
3798
12.4k
    }
3799
3800
0
 end:
3801
0
    ret = 0;
3802
3803
1.43k
 err:
3804
1.43k
    return ret;
3805
0
}
3806
3807
int vcf_open_mode(char *mode, const char *fn, const char *format)
3808
0
{
3809
0
    if (format == NULL) {
3810
        // Try to pick a format based on the filename extension
3811
0
        char extension[HTS_MAX_EXT_LEN];
3812
0
        if (find_file_extension(fn, extension) < 0) return -1;
3813
0
        return vcf_open_mode(mode, fn, extension);
3814
0
    }
3815
0
    else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
3816
0
    else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
3817
0
    else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
3818
0
    else return -1;
3819
3820
0
    return 0;
3821
0
}
3822
3823
int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
3824
37.7k
{
3825
37.7k
    int ret;
3826
37.7k
    ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
3827
37.7k
    if (ret < 0) return ret;
3828
36.1k
    return vcf_parse1(&fp->line, h, v);
3829
37.7k
}
3830
3831
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
3832
0
{
3833
0
    uint8_t *ptr_start = ptr;
3834
0
    fmt->id = bcf_dec_typed_int1(ptr, &ptr);
3835
0
    fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
3836
0
    fmt->size = fmt->n << bcf_type_shift[fmt->type];
3837
0
    fmt->p = ptr;
3838
0
    fmt->p_off  = ptr - ptr_start;
3839
0
    fmt->p_free = 0;
3840
0
    ptr += n_sample * fmt->size;
3841
0
    fmt->p_len = ptr - fmt->p;
3842
0
    return ptr;
3843
0
}
3844
3845
static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
3846
0
{
3847
0
    uint8_t *ptr_start = ptr;
3848
0
    int64_t len = 0;
3849
0
    info->key = bcf_dec_typed_int1(ptr, &ptr);
3850
0
    len = info->len = bcf_dec_size(ptr, &ptr, &info->type);
3851
0
    info->vptr = ptr;
3852
0
    info->vptr_off  = ptr - ptr_start;
3853
0
    info->vptr_free = 0;
3854
0
    info->v1.i = 0;
3855
0
    if (info->len == 1) {
3856
0
        switch(info->type) {
3857
0
        case BCF_BT_INT8:
3858
0
        case BCF_BT_CHAR:
3859
0
            info->v1.i = *(int8_t*)ptr;
3860
0
            break;
3861
0
        case BCF_BT_INT16:
3862
0
            info->v1.i = le_to_i16(ptr);
3863
0
            len <<= 1;
3864
0
            break;
3865
0
        case BCF_BT_INT32:
3866
0
            info->v1.i = le_to_i32(ptr);
3867
0
            len <<= 2;
3868
0
            break;
3869
0
        case BCF_BT_FLOAT:
3870
0
            info->v1.f = le_to_float(ptr);
3871
0
            len <<= 2;
3872
0
            break;
3873
0
        case BCF_BT_INT64:
3874
0
            info->v1.i = le_to_i64(ptr);
3875
0
            len <<= 3;
3876
0
            break;
3877
0
        }
3878
0
    } else {
3879
0
        len <<= bcf_type_shift[info->type];
3880
0
    }
3881
0
    ptr += len;
3882
3883
0
    info->vptr_len = ptr - info->vptr;
3884
0
    return ptr;
3885
0
}
3886
3887
int bcf_unpack(bcf1_t *b, int which)
3888
31.4k
{
3889
31.4k
    if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
3890
31.4k
    uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
3891
31.4k
    int i;
3892
31.4k
    bcf_dec_t *d = &b->d;
3893
31.4k
    if (which & BCF_UN_FLT) which |= BCF_UN_STR;
3894
31.4k
    if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
3895
31.4k
    if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
3896
31.4k
    {
3897
31.4k
        kstring_t tmp;
3898
3899
        // ID
3900
31.4k
        tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
3901
31.4k
        ptr_ori = ptr;
3902
31.4k
        ptr = bcf_fmt_sized_array(&tmp, ptr);
3903
31.4k
        b->unpack_size[0] = ptr - ptr_ori;
3904
31.4k
        kputc_('\0', &tmp);
3905
31.4k
        d->id = tmp.s; d->m_id = tmp.m;
3906
3907
        // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
3908
31.4k
        hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
3909
31.4k
        tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
3910
31.4k
        ptr_ori = ptr;
3911
1.41M
        for (i = 0; i < b->n_allele; ++i) {
3912
            // Use offset within tmp.s as realloc may change pointer
3913
1.38M
            d->allele[i] = (char *)(intptr_t)tmp.l;
3914
1.38M
            ptr = bcf_fmt_sized_array(&tmp, ptr);
3915
1.38M
            kputc_('\0', &tmp);
3916
1.38M
        }
3917
31.4k
        b->unpack_size[1] = ptr - ptr_ori;
3918
31.4k
        d->als = tmp.s; d->m_als = tmp.m;
3919
3920
        // Convert our offsets within tmp.s back to pointers again
3921
1.41M
        for (i = 0; i < b->n_allele; ++i)
3922
1.38M
            d->allele[i] = d->als + (ptrdiff_t)d->allele[i];
3923
31.4k
        b->unpacked |= BCF_UN_STR;
3924
31.4k
    }
3925
31.4k
    if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
3926
31.4k
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
3927
31.4k
        ptr_ori = ptr;
3928
31.4k
        if (*ptr>>4) {
3929
30.1k
            int type;
3930
30.1k
            d->n_flt = bcf_dec_size(ptr, &ptr, &type);
3931
30.1k
            hts_expand(int, d->n_flt, d->m_flt, d->flt);
3932
1.92M
            for (i = 0; i < d->n_flt; ++i)
3933
1.89M
                d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
3934
30.1k
        } else ++ptr, d->n_flt = 0;
3935
31.4k
        b->unpack_size[2] = ptr - ptr_ori;
3936
31.4k
        b->unpacked |= BCF_UN_FLT;
3937
31.4k
    }
3938
31.4k
    if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
3939
0
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
3940
0
        hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
3941
0
        for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
3942
0
        for (i = 0; i < b->n_info; ++i)
3943
0
            ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
3944
0
        b->unpacked |= BCF_UN_INFO;
3945
0
    }
3946
31.4k
    if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
3947
0
        ptr = (uint8_t*)b->indiv.s;
3948
0
        hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
3949
0
        for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
3950
0
        for (i = 0; i < b->n_fmt; ++i)
3951
0
            ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
3952
0
        b->unpacked |= BCF_UN_FMT;
3953
0
    }
3954
31.4k
    return 0;
3955
31.4k
}
3956
3957
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
3958
31.4k
{
3959
31.4k
    int i;
3960
31.4k
    int32_t max_dt_id = h->n[BCF_DT_ID];
3961
31.4k
    const char *chrom = bcf_seqname(h, v);
3962
31.4k
    if (!chrom) {
3963
0
        hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
3964
0
                      v->rid);
3965
0
        errno = EINVAL;
3966
0
        return -1;
3967
0
    }
3968
3969
31.4k
    bcf_unpack((bcf1_t*)v, BCF_UN_ALL & ~(BCF_UN_INFO|BCF_UN_FMT));
3970
3971
    // Cache of key lengths so we don't keep repeatedly using them.
3972
    // This assumes we're not modifying the header between successive calls
3973
    // to vcf_format, but that would lead to many other forms of breakage
3974
    // so it feels like a valid assumption to make.
3975
    //
3976
    // We cannot just do this in bcf_hdr_sync as some code (eg bcftools
3977
    // annotate) manipulates the headers directly without calling sync to
3978
    // refresh the data structures.  So we must do just-in-time length
3979
    // calculation during writes instead.
3980
31.4k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
3981
31.4k
    if (!aux->key_len) {
3982
5.67k
        if (!(aux->key_len = calloc(h->n[BCF_DT_ID]+1, sizeof(*aux->key_len))))
3983
0
            return -1;
3984
5.67k
    }
3985
31.4k
    size_t *key_len = aux->key_len;
3986
3987
31.4k
    kputs(chrom, s); // CHROM
3988
31.4k
    kputc_('\t', s); kputll(v->pos + 1, s); // POS
3989
31.4k
    kputc_('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
3990
31.4k
    kputc_('\t', s); // REF
3991
31.4k
    if (v->n_allele > 0) kputs(v->d.allele[0], s);
3992
0
    else kputc_('.', s);
3993
31.4k
    kputc_('\t', s); // ALT
3994
31.4k
    if (v->n_allele > 1) {
3995
1.38M
        for (i = 1; i < v->n_allele; ++i) {
3996
1.35M
            if (i > 1) kputc_(',', s);
3997
1.35M
            kputs(v->d.allele[i], s);
3998
1.35M
        }
3999
30.8k
    } else kputc_('.', s);
4000
31.4k
    kputc_('\t', s); // QUAL
4001
31.4k
    if ( bcf_float_is_missing(v->qual) ) kputc_('.', s); // QUAL
4002
29.9k
    else kputd(v->qual, s);
4003
31.4k
    kputc_('\t', s); // FILTER
4004
31.4k
    if (v->d.n_flt) {
4005
1.92M
        for (i = 0; i < v->d.n_flt; ++i) {
4006
1.89M
            int32_t idx = v->d.flt[i];
4007
1.89M
            if (idx < 0 || idx >= max_dt_id
4008
1.89M
                || h->id[BCF_DT_ID][idx].key == NULL) {
4009
0
                hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
4010
0
                              idx, bcf_seqname_safe(h, v), v->pos + 1);
4011
0
                errno = EINVAL;
4012
0
                return -1;
4013
0
            }
4014
1.89M
            if (i) kputc_(';', s);
4015
1.89M
            if (!key_len[idx])
4016
117k
                key_len[idx] = strlen(h->id[BCF_DT_ID][idx].key);
4017
1.89M
            kputsn(h->id[BCF_DT_ID][idx].key, key_len[idx], s);
4018
1.89M
        }
4019
30.1k
    } else kputc_('.', s);
4020
4021
31.4k
    kputc_('\t', s); // INFO
4022
31.4k
    if (v->n_info) {
4023
11.8k
        uint8_t *ptr = v->shared.s
4024
11.8k
            ? (uint8_t *)v->shared.s + v->unpack_size[0] +
4025
11.8k
               v->unpack_size[1] + v->unpack_size[2]
4026
11.8k
            : NULL;
4027
11.8k
        int first = 1;
4028
11.8k
        bcf_info_t *info = v->d.info;
4029
4030
        // Note if we duplicate this code into custom packed and unpacked
4031
        // implementations then we gain a bit more speed, particularly with
4032
        // clang 13 (up to 5%).  Not sure why this is, but code duplication
4033
        // isn't pleasant and it's still faster adding packed support than
4034
        // not so it's a win, just not as good as it should be.
4035
11.8k
        const int info_packed = !(v->unpacked & BCF_UN_INFO) && v->shared.l;
4036
2.40M
        for (i = 0; i < v->n_info; ++i) {
4037
2.38M
            bcf_info_t in, *z;
4038
2.38M
            if (info_packed) {
4039
                // Use a local bcf_info_t when data is packed
4040
2.38M
                z = &in;
4041
2.38M
                z->key  = bcf_dec_typed_int1(ptr, &ptr);
4042
2.38M
                z->len  = bcf_dec_size(ptr, &ptr, &z->type);
4043
2.38M
                z->vptr = ptr;
4044
2.38M
                ptr += z->len << bcf_type_shift[z->type];
4045
2.38M
            } else {
4046
                // Else previously unpacked INFO struct
4047
0
                z = &info[i];
4048
4049
                // Also potentially since deleted
4050
0
                if ( !z->vptr ) continue;
4051
0
            }
4052
4053
2.38M
            bcf_idpair_t *id = z->key >= 0 && z->key < max_dt_id
4054
2.38M
                ? &h->id[BCF_DT_ID][z->key]
4055
2.38M
                : NULL;
4056
4057
2.38M
            if (!id || !id->key) {
4058
0
                hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
4059
0
                              z->key,
4060
0
                              z->key < 0 ? "negative"
4061
0
                              : (z->key >= max_dt_id ? "too large" : "not present in the header"),
4062
0
                              bcf_seqname_safe(h, v), v->pos+1);
4063
0
                errno = EINVAL;
4064
0
                return -1;
4065
0
            }
4066
4067
            // KEY
4068
2.38M
            if (!key_len[z->key])
4069
25.9k
                key_len[z->key] = strlen(id->key);
4070
2.38M
            size_t id_len = key_len[z->key];
4071
2.38M
            if (ks_resize(s, s->l + 3 + id_len) < 0)
4072
0
                return -1;
4073
2.38M
            char *sptr = s->s + s->l;
4074
2.38M
            if ( !first ) {
4075
2.37M
                *sptr++ = ';';
4076
2.37M
                s->l++;
4077
2.37M
            }
4078
2.38M
            first = 0;
4079
2.38M
            memcpy(sptr, id->key, id_len);
4080
2.38M
            s->l += id_len;
4081
4082
            // VALUE
4083
2.38M
            if (z->len <= 0) continue;
4084
1.23M
            sptr[id_len] = '=';
4085
1.23M
            s->l++;
4086
4087
1.23M
            if (z->len != 1 || info_packed) {
4088
1.23M
                bcf_fmt_array(s, z->len, z->type, z->vptr);
4089
1.23M
            } else {
4090
                // Single length vectors are unpacked into their
4091
                // own info.v1 union and handled separately.
4092
0
                if (z->type == BCF_BT_FLOAT) {
4093
0
                    if ( bcf_float_is_missing(z->v1.f) )
4094
0
                        kputc_('.', s);
4095
0
                    else
4096
0
                        kputd(z->v1.f, s);
4097
0
                } else if (z->type == BCF_BT_CHAR) {
4098
0
                    kputc_(z->v1.i, s);
4099
0
                } else if (z->type < BCF_BT_INT64) {
4100
0
                    int64_t missing[] = {
4101
0
                        0, // BCF_BT_NULL
4102
0
                        bcf_int8_missing,
4103
0
                        bcf_int16_missing,
4104
0
                        bcf_int32_missing,
4105
0
                    };
4106
0
                    if (z->v1.i == missing[z->type])
4107
0
                        kputc_('.', s);
4108
0
                    else
4109
0
                        kputw(z->v1.i, s);
4110
0
                } else if (z->type == BCF_BT_INT64) {
4111
0
                    if (z->v1.i == bcf_int64_missing)
4112
0
                        kputc_('.', s);
4113
0
                    else
4114
0
                        kputll(z->v1.i, s);
4115
0
                } else {
4116
0
                    hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
4117
0
                    errno = EINVAL;
4118
0
                    return -1;
4119
0
                }
4120
0
            }
4121
1.23M
        }
4122
11.8k
        if ( first ) kputc_('.', s);
4123
19.5k
    } else kputc_('.', s);
4124
4125
    // FORMAT and individual information
4126
31.4k
    if (v->n_sample) {
4127
11.2k
        int i,j;
4128
11.2k
        if ( v->n_fmt) {
4129
11.1k
            uint8_t *ptr = (uint8_t *)v->indiv.s;
4130
11.1k
            int gt_i = -1;
4131
11.1k
            bcf_fmt_t *fmt = v->d.fmt;
4132
11.1k
            int first = 1;
4133
11.1k
            int fmt_packed = !(v->unpacked & BCF_UN_FMT);
4134
4135
11.1k
            if (fmt_packed) {
4136
                // Local fmt as we have an array of num FORMAT keys,
4137
                // each of which points to N.Sample values.
4138
4139
                // No real gain to be had in handling unpacked data here,
4140
                // but it doesn't cost us much in complexity either and
4141
                // it gives us flexibility.
4142
11.1k
                fmt = malloc(v->n_fmt * sizeof(*fmt));
4143
11.1k
                if (!fmt)
4144
0
                    return -1;
4145
11.1k
            }
4146
4147
            // KEYS
4148
35.1k
            for (i = 0; i < (int)v->n_fmt; ++i) {
4149
23.9k
                bcf_fmt_t *z;
4150
23.9k
                z = &fmt[i];
4151
23.9k
                if (fmt_packed) {
4152
23.9k
                    z->id   = bcf_dec_typed_int1(ptr, &ptr);
4153
23.9k
                    z->n    = bcf_dec_size(ptr, &ptr, &z->type);
4154
23.9k
                    z->p    = ptr;
4155
23.9k
                    z->size = z->n << bcf_type_shift[z->type];
4156
23.9k
                    ptr += v->n_sample * z->size;
4157
23.9k
                }
4158
23.9k
                if ( !z->p ) continue;
4159
23.9k
                kputc_(!first ? ':' : '\t', s); first = 0;
4160
4161
23.9k
                bcf_idpair_t *id = z->id >= 0 && z->id < max_dt_id
4162
23.9k
                    ? &h->id[BCF_DT_ID][z->id]
4163
23.9k
                    : NULL;
4164
4165
23.9k
                if (!id || !id->key) {
4166
0
                    hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1);
4167
0
                    errno = EINVAL;
4168
0
                    return -1;
4169
0
                }
4170
4171
23.9k
                if (!key_len[z->id])
4172
12.9k
                    key_len[z->id] = strlen(id->key);
4173
23.9k
                size_t id_len = key_len[z->id];
4174
23.9k
                kputsn(id->key, id_len, s);
4175
23.9k
                if (id_len == 2 && id->key[0] == 'G' && id->key[1] == 'T')
4176
4.74k
                    gt_i = i;
4177
23.9k
            }
4178
11.1k
            if ( first ) kputsn("\t.", 2, s);
4179
4180
            // VALUES per sample
4181
36.1k
            for (j = 0; j < v->n_sample; ++j) {
4182
25.0k
                kputc_('\t', s);
4183
25.0k
                first = 1;
4184
25.0k
                bcf_fmt_t *f = fmt;
4185
78.0k
                for (i = 0; i < (int)v->n_fmt; i++, f++) {
4186
65.7k
                    if ( !f->p ) continue;
4187
65.7k
                    if (!first) kputc_(':', s);
4188
65.7k
                    first = 0;
4189
65.7k
                    if (gt_i == i) {
4190
12.7k
                        bcf_format_gt(f,j,s);
4191
12.7k
                        break;
4192
12.7k
                    }
4193
52.9k
                    else if (f->n == 1)
4194
10.5k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4195
42.3k
                    else
4196
42.3k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4197
65.7k
                }
4198
4199
                // Simpler loop post GT and at least 1 iteration
4200
41.2k
                for (i++, f++; i < (int)v->n_fmt; i++, f++) {
4201
16.2k
                    if ( !f->p ) continue;
4202
16.2k
                    kputc_(':', s);
4203
16.2k
                    if (f->n == 1)
4204
1.27k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4205
14.9k
                    else
4206
14.9k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4207
16.2k
                }
4208
25.0k
                if ( first ) kputc_('.', s);
4209
25.0k
            }
4210
11.1k
            if (fmt_packed)
4211
11.1k
                free(fmt);
4212
11.1k
        }
4213
103
        else
4214
496
            for (j=0; j<=v->n_sample; j++)
4215
393
                kputsn("\t.", 2, s);
4216
11.2k
    }
4217
31.4k
    kputc('\n', s);
4218
31.4k
    return 0;
4219
31.4k
}
4220
4221
int vcf_write_line(htsFile *fp, kstring_t *line)
4222
0
{
4223
0
    int ret;
4224
0
    if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
4225
0
    if ( fp->format.compression!=no_compression )
4226
0
        ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
4227
0
    else
4228
0
        ret = hwrite(fp->fp.hfile, line->s, line->l);
4229
0
    return ret==line->l ? 0 : -1;
4230
0
}
4231
4232
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4233
31.4k
{
4234
31.4k
    ssize_t ret;
4235
31.4k
    fp->line.l = 0;
4236
31.4k
    if (vcf_format1(h, v, &fp->line) != 0)
4237
0
        return -1;
4238
31.4k
    if ( fp->format.compression!=no_compression ) {
4239
0
        if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4240
0
            return -1;
4241
0
        ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
4242
31.4k
    } else {
4243
31.4k
        ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
4244
31.4k
    }
4245
4246
31.4k
    if (fp->idx && fp->format.compression == bgzf) {
4247
0
        int tid;
4248
0
        if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
4249
0
            return -1;
4250
4251
0
        if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
4252
0
                          tid, v->pos, v->pos + v->rlen,
4253
0
                          bgzf_tell(fp->fp.bgzf), 1) < 0)
4254
0
            return -1;
4255
0
    }
4256
4257
31.4k
    return ret==fp->line.l ? 0 : -1;
4258
31.4k
}
4259
4260
/************************
4261
 * Data access routines *
4262
 ************************/
4263
4264
int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
4265
13.9k
{
4266
13.9k
    khint_t k;
4267
13.9k
    vdict_t *d = (vdict_t*)h->dict[which];
4268
13.9k
    k = kh_get(vdict, d, id);
4269
13.9k
    return k == kh_end(d)? -1 : kh_val(d, k).id;
4270
13.9k
}
4271
4272
4273
/********************
4274
 *** BCF indexing ***
4275
 ********************/
4276
4277
// Calculate number of index levels given min_shift and the header contig
4278
// list.  Also returns number of contigs in *nids_out.
4279
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
4280
                               int starting_n_lvls, int *nids_out)
4281
0
{
4282
0
    int n_lvls, i, nids = 0;
4283
0
    int64_t max_len = 0, s;
4284
4285
0
    for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
4286
0
    {
4287
0
        if ( !h->id[BCF_DT_CTG][i].val ) continue;
4288
0
        if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
4289
0
            max_len = h->id[BCF_DT_CTG][i].val->info[0];
4290
0
        nids++;
4291
0
    }
4292
0
    if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
4293
0
    max_len += 256;
4294
0
    s = hts_bin_maxpos(min_shift, starting_n_lvls);
4295
0
    for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3);
4296
4297
0
    if (nids_out) *nids_out = nids;
4298
0
    return n_lvls;
4299
0
}
4300
4301
hts_idx_t *bcf_index(htsFile *fp, int min_shift)
4302
0
{
4303
0
    int n_lvls;
4304
0
    bcf1_t *b = NULL;
4305
0
    hts_idx_t *idx = NULL;
4306
0
    bcf_hdr_t *h;
4307
0
    int r;
4308
0
    h = bcf_hdr_read(fp);
4309
0
    if ( !h ) return NULL;
4310
0
    int nids = 0;
4311
0
    n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
4312
0
    idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4313
0
    if (!idx) goto fail;
4314
0
    b = bcf_init1();
4315
0
    if (!b) goto fail;
4316
0
    while ((r = bcf_read1(fp,h, b)) >= 0) {
4317
0
        int ret;
4318
0
        ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
4319
0
        if (ret < 0) goto fail;
4320
0
    }
4321
0
    if (r < -1) goto fail;
4322
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
4323
0
    bcf_destroy1(b);
4324
0
    bcf_hdr_destroy(h);
4325
0
    return idx;
4326
4327
0
 fail:
4328
0
    hts_idx_destroy(idx);
4329
0
    bcf_destroy1(b);
4330
0
    bcf_hdr_destroy(h);
4331
0
    return NULL;
4332
0
}
4333
4334
hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
4335
0
{
4336
0
    return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn);
4337
0
}
4338
4339
hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
4340
0
{
4341
0
    return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
4342
0
}
4343
4344
int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
4345
0
{
4346
0
    htsFile *fp;
4347
0
    hts_idx_t *idx;
4348
0
    tbx_t *tbx;
4349
0
    int ret;
4350
0
    if ((fp = hts_open(fn, "rb")) == 0) return -2;
4351
0
    if (n_threads)
4352
0
        hts_set_threads(fp, n_threads);
4353
0
    if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
4354
0
    switch (fp->format.format) {
4355
0
        case bcf:
4356
0
            if (!min_shift) {
4357
0
                hts_log_error("TBI indices for BCF files are not supported");
4358
0
                ret = -1;
4359
0
            } else {
4360
0
                idx = bcf_index(fp, min_shift);
4361
0
                if (idx) {
4362
0
                    ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
4363
0
                    if (ret < 0) ret = -4;
4364
0
                    hts_idx_destroy(idx);
4365
0
                }
4366
0
                else ret = -1;
4367
0
            }
4368
0
            break;
4369
4370
0
        case vcf:
4371
0
            tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
4372
0
            if (tbx) {
4373
0
                ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
4374
0
                if (ret < 0) ret = -4;
4375
0
                tbx_destroy(tbx);
4376
0
            }
4377
0
            else ret = -1;
4378
0
            break;
4379
4380
0
        default:
4381
0
            ret = -3;
4382
0
            break;
4383
0
    }
4384
0
    hts_close(fp);
4385
0
    return ret;
4386
0
}
4387
4388
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
4389
0
{
4390
0
    return bcf_index_build3(fn, fnidx, min_shift, 0);
4391
0
}
4392
4393
int bcf_index_build(const char *fn, int min_shift)
4394
0
{
4395
0
    return bcf_index_build3(fn, NULL, min_shift, 0);
4396
0
}
4397
4398
// Initialise fp->idx for the current format type.
4399
// This must be called after the header has been written but no other data.
4400
0
static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4401
0
    int n_lvls, fmt;
4402
4403
0
    if (min_shift == 0) {
4404
0
        min_shift = 14;
4405
0
        n_lvls = 5;
4406
0
        fmt = HTS_FMT_TBI;
4407
0
    } else {
4408
        // Set initial n_lvls to match tbx_index()
4409
0
        int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
4410
        // Increase if necessary
4411
0
        n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL);
4412
0
        fmt = HTS_FMT_CSI;
4413
0
    }
4414
4415
0
    fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4416
0
    if (!fp->idx) return -1;
4417
4418
    // Tabix meta data, added even in CSI for VCF
4419
0
    uint8_t conf[4*7];
4420
0
    u32_to_le(TBX_VCF, conf+0);  // fmt
4421
0
    u32_to_le(1,       conf+4);  // name col
4422
0
    u32_to_le(2,       conf+8);  // beg col
4423
0
    u32_to_le(0,       conf+12); // end col
4424
0
    u32_to_le('#',     conf+16); // comment
4425
0
    u32_to_le(0,       conf+20); // n.skip
4426
0
    u32_to_le(0,       conf+24); // ref name len
4427
0
    if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
4428
0
        hts_idx_destroy(fp->idx);
4429
0
        fp->idx = NULL;
4430
0
        return -1;
4431
0
    }
4432
0
    fp->fnidx = fnidx;
4433
4434
0
    return 0;
4435
0
}
4436
4437
// Initialise fp->idx for the current format type.
4438
// This must be called after the header has been written but no other data.
4439
0
int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4440
0
    int n_lvls, nids = 0;
4441
4442
0
    if (fp->format.compression != bgzf) {
4443
0
        hts_log_error("Indexing is only supported on BGZF-compressed files");
4444
0
        return -3; // Matches no-compression return for bcf_index_build3()
4445
0
    }
4446
4447
0
    if (fp->format.format == vcf)
4448
0
        return vcf_idx_init(fp, h, min_shift, fnidx);
4449
4450
0
    if (!min_shift)
4451
0
        min_shift = 14;
4452
4453
0
    n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
4454
4455
0
    fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4456
0
    if (!fp->idx) return -1;
4457
0
    fp->fnidx = fnidx;
4458
4459
0
    return 0;
4460
0
}
4461
4462
// Finishes an index. Call after the last record has been written.
4463
// Returns 0 on success, <0 on failure.
4464
//
4465
// NB: same format as SAM/BAM as it uses bgzf.
4466
0
int bcf_idx_save(htsFile *fp) {
4467
0
    return sam_idx_save(fp);
4468
0
}
4469
4470
/*****************
4471
 *** Utilities ***
4472
 *****************/
4473
4474
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
4475
0
{
4476
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
4477
0
    for (i=0; i<src->nhrec; i++)
4478
0
    {
4479
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4480
0
        {
4481
0
            int j;
4482
0
            for (j=0; j<ndst_ori; j++)
4483
0
            {
4484
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4485
4486
                // Checking only the key part of generic lines, otherwise
4487
                // the VCFs are too verbose. Should we perhaps add a flag
4488
                // to bcf_hdr_combine() and make this optional?
4489
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4490
0
            }
4491
0
            if ( j>=ndst_ori ) {
4492
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4493
0
                if (res < 0) return -1;
4494
0
                need_sync += res;
4495
0
            }
4496
0
        }
4497
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4498
0
        {
4499
            // NB: we are ignoring fields without ID
4500
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4501
0
            if ( j>=0 )
4502
0
            {
4503
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4504
0
                if ( !rec ) {
4505
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4506
0
                    if (res < 0) return -1;
4507
0
                    need_sync += res;
4508
0
                }
4509
0
            }
4510
0
        }
4511
0
        else
4512
0
        {
4513
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4514
0
            assert( j>=0 ); // this should always be true for valid VCFs
4515
4516
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4517
0
            if ( !rec ) {
4518
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4519
0
                if (res < 0) return -1;
4520
0
                need_sync += res;
4521
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4522
0
            {
4523
                // Check that both records are of the same type. The bcf_hdr_id2length
4524
                // macro cannot be used here because dst header is not synced yet.
4525
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4526
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4527
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4528
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4529
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4530
0
                {
4531
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4532
0
                        src->hrec[i]->vals[0]);
4533
0
                    ret |= 1;
4534
0
                }
4535
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4536
0
                {
4537
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4538
0
                        src->hrec[i]->vals[0]);
4539
0
                    ret |= 1;
4540
0
                }
4541
0
            }
4542
0
        }
4543
0
    }
4544
0
    if ( need_sync ) {
4545
0
        if (bcf_hdr_sync(dst) < 0) return -1;
4546
0
    }
4547
0
    return ret;
4548
0
}
4549
4550
bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
4551
0
{
4552
0
    if ( !dst )
4553
0
    {
4554
        // this will effectively strip existing IDX attributes from src to become dst
4555
0
        dst = bcf_hdr_init("r");
4556
0
        kstring_t htxt = {0,0,0};
4557
0
        if (bcf_hdr_format(src, 0, &htxt) < 0) {
4558
0
            free(htxt.s);
4559
0
            return NULL;
4560
0
        }
4561
0
        if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
4562
0
            bcf_hdr_destroy(dst);
4563
0
            dst = NULL;
4564
0
        }
4565
0
        free(htxt.s);
4566
0
        return dst;
4567
0
    }
4568
4569
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, res;
4570
0
    for (i=0; i<src->nhrec; i++)
4571
0
    {
4572
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4573
0
        {
4574
0
            int j;
4575
0
            for (j=0; j<ndst_ori; j++)
4576
0
            {
4577
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4578
4579
                // Checking only the key part of generic lines, otherwise
4580
                // the VCFs are too verbose. Should we perhaps add a flag
4581
                // to bcf_hdr_combine() and make this optional?
4582
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4583
0
            }
4584
0
            if ( j>=ndst_ori ) {
4585
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4586
0
                if (res < 0) return NULL;
4587
0
                need_sync += res;
4588
0
            }
4589
0
        }
4590
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4591
0
        {
4592
            // NB: we are ignoring fields without ID
4593
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4594
0
            if ( j>=0 )
4595
0
            {
4596
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4597
0
                if ( !rec ) {
4598
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4599
0
                    if (res < 0) return NULL;
4600
0
                    need_sync += res;
4601
0
                }
4602
0
            }
4603
0
        }
4604
0
        else
4605
0
        {
4606
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4607
0
            assert( j>=0 ); // this should always be true for valid VCFs
4608
4609
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4610
0
            if ( !rec ) {
4611
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4612
0
                if (res < 0) return NULL;
4613
0
                need_sync += res;
4614
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4615
0
            {
4616
                // Check that both records are of the same type. The bcf_hdr_id2length
4617
                // macro cannot be used here because dst header is not synced yet.
4618
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4619
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4620
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4621
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4622
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4623
0
                {
4624
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4625
0
                        src->hrec[i]->vals[0]);
4626
0
                }
4627
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4628
0
                {
4629
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4630
0
                        src->hrec[i]->vals[0]);
4631
0
                }
4632
0
            }
4633
0
        }
4634
0
    }
4635
0
    if ( need_sync ) {
4636
0
        if (bcf_hdr_sync(dst) < 0) return NULL;
4637
0
    }
4638
0
    return dst;
4639
0
}
4640
4641
int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
4642
0
{
4643
0
    int i;
4644
0
    if ( line->errcode )
4645
0
    {
4646
0
        char errordescription[1024] = "";
4647
0
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)),  bcf_seqname_safe(src_hdr,line), line->pos+1);
4648
0
        exit(1);
4649
0
    }
4650
0
    if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
4651
0
    if ( !src_hdr->ntransl )  // called for the first time, see what needs translating
4652
0
    {
4653
0
        int dict;
4654
0
        for (dict=0; dict<2; dict++)    // BCF_DT_ID and BCF_DT_CTG
4655
0
        {
4656
0
            src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
4657
0
            for (i=0; i<src_hdr->n[dict]; i++)
4658
0
            {
4659
0
                if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
4660
0
                {
4661
0
                    src_hdr->transl[dict][i] = -1;
4662
0
                    continue;
4663
0
                }
4664
0
                src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
4665
0
                if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
4666
0
            }
4667
0
        }
4668
0
        if ( !src_hdr->ntransl )
4669
0
        {
4670
0
            free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
4671
0
            free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
4672
0
            src_hdr->ntransl = -1;
4673
0
        }
4674
0
        if ( src_hdr->ntransl==-1 ) return 0;
4675
0
    }
4676
0
    bcf_unpack(line,BCF_UN_ALL);
4677
4678
    // CHROM
4679
0
    if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
4680
4681
    // FILTER
4682
0
    for (i=0; i<line->d.n_flt; i++)
4683
0
    {
4684
0
        int src_id = line->d.flt[i];
4685
0
        if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
4686
0
            line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
4687
0
        line->d.shared_dirty |= BCF1_DIRTY_FLT;
4688
0
    }
4689
4690
    // INFO
4691
0
    for (i=0; i<line->n_info; i++)
4692
0
    {
4693
0
        int src_id = line->d.info[i].key;
4694
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
4695
0
        if ( dst_id<0 ) continue;
4696
0
        line->d.info[i].key = dst_id;
4697
0
        if ( !line->d.info[i].vptr ) continue;  // skip deleted
4698
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4699
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4700
0
        if ( src_size==dst_size )   // can overwrite
4701
0
        {
4702
0
            uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
4703
0
            if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
4704
0
            else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
4705
0
            else { *(uint32_t*)vptr = (uint32_t)dst_id; }
4706
0
        }
4707
0
        else    // must realloc
4708
0
        {
4709
0
            bcf_info_t *info = &line->d.info[i];
4710
0
            kstring_t str = {0,0,0};
4711
0
            bcf_enc_int1(&str, dst_id);
4712
0
            bcf_enc_size(&str, info->len,info->type);
4713
0
            uint32_t vptr_off = str.l;
4714
0
            kputsn((char*)info->vptr, info->vptr_len, &str);
4715
0
            if( info->vptr_free ) free(info->vptr - info->vptr_off);
4716
0
            info->vptr_off = vptr_off;
4717
0
            info->vptr = (uint8_t*)str.s + info->vptr_off;
4718
0
            info->vptr_free = 1;
4719
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
4720
0
        }
4721
0
    }
4722
4723
    // FORMAT
4724
0
    for (i=0; i<line->n_fmt; i++)
4725
0
    {
4726
0
        int src_id = line->d.fmt[i].id;
4727
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
4728
0
        if ( dst_id<0 ) continue;
4729
0
        line->d.fmt[i].id = dst_id;
4730
0
        if( !line->d.fmt[i].p ) continue;  // skip deleted
4731
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4732
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4733
0
        if ( src_size==dst_size )   // can overwrite
4734
0
        {
4735
0
            uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off;    // pointer to the vector size (4bits) and BT type (4bits)
4736
0
            if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
4737
0
            else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
4738
0
            else { i32_to_le(dst_id, p + 1); }
4739
0
        }
4740
0
        else    // must realloc
4741
0
        {
4742
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
4743
0
            kstring_t str = {0,0,0};
4744
0
            bcf_enc_int1(&str, dst_id);
4745
0
            bcf_enc_size(&str, fmt->n, fmt->type);
4746
0
            uint32_t p_off = str.l;
4747
0
            kputsn((char*)fmt->p, fmt->p_len, &str);
4748
0
            if( fmt->p_free ) free(fmt->p - fmt->p_off);
4749
0
            fmt->p_off = p_off;
4750
0
            fmt->p = (uint8_t*)str.s + fmt->p_off;
4751
0
            fmt->p_free = 1;
4752
0
            line->d.indiv_dirty = 1;
4753
0
        }
4754
0
    }
4755
0
    return 0;
4756
0
}
4757
4758
bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
4759
0
{
4760
0
    bcf_hdr_t *hout = bcf_hdr_init("r");
4761
0
    if (!hout) {
4762
0
        hts_log_error("Failed to allocate bcf header");
4763
0
        return NULL;
4764
0
    }
4765
0
    kstring_t htxt = {0,0,0};
4766
0
    if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
4767
0
        free(htxt.s);
4768
0
        return NULL;
4769
0
    }
4770
0
    if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
4771
0
        bcf_hdr_destroy(hout);
4772
0
        hout = NULL;
4773
0
    }
4774
0
    free(htxt.s);
4775
0
    return hout;
4776
0
}
4777
4778
bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
4779
0
{
4780
0
    void *names_hash = khash_str2int_init();
4781
0
    kstring_t htxt = {0,0,0};
4782
0
    kstring_t str = {0,0,0};
4783
0
    bcf_hdr_t *h = bcf_hdr_init("w");
4784
0
    int r = 0;
4785
0
    if (!h || !names_hash) {
4786
0
        hts_log_error("Failed to allocate bcf header");
4787
0
        goto err;
4788
0
    }
4789
0
    if (bcf_hdr_format(h0, 1, &htxt) < 0) {
4790
0
        hts_log_error("Failed to get header text");
4791
0
        goto err;
4792
0
    }
4793
0
    bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
4794
0
    int j;
4795
0
    for (j=0; j<n; j++) imap[j] = -1;
4796
0
    if ( bcf_hdr_nsamples(h0) > 0) {
4797
0
        char *p = find_chrom_header_line(htxt.s);
4798
0
        int i = 0, end = n? 8 : 7;
4799
0
        while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
4800
0
        if (i != end) {
4801
0
            hts_log_error("Wrong number of columns in header #CHROM line");
4802
0
            goto err;
4803
0
        }
4804
0
        r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
4805
0
        for (i = 0; i < n; ++i) {
4806
0
            if ( khash_str2int_has_key(names_hash,samples[i]) )
4807
0
            {
4808
0
                hts_log_error("Duplicate sample name \"%s\"", samples[i]);
4809
0
                goto err;
4810
0
            }
4811
0
            imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
4812
0
            if (imap[i] < 0) continue;
4813
0
            r |= kputc('\t', &str) < 0;
4814
0
            r |= kputs(samples[i], &str) < 0;
4815
0
            r |= khash_str2int_inc(names_hash,samples[i]) < 0;
4816
0
        }
4817
0
    } else r |= kputsn(htxt.s, htxt.l, &str) < 0;
4818
0
    while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
4819
0
    r |= kputc('\n',&str) < 0;
4820
0
    if (r) {
4821
0
        hts_log_error("%s", strerror(errno));
4822
0
        goto err;
4823
0
    }
4824
0
    if ( bcf_hdr_parse(h, str.s) < 0 ) {
4825
0
        bcf_hdr_destroy(h);
4826
0
        h = NULL;
4827
0
    }
4828
0
    free(str.s);
4829
0
    free(htxt.s);
4830
0
    khash_str2int_destroy(names_hash);
4831
0
    return h;
4832
4833
0
 err:
4834
0
    ks_free(&str);
4835
0
    ks_free(&htxt);
4836
0
    khash_str2int_destroy(names_hash);
4837
0
    bcf_hdr_destroy(h);
4838
0
    return NULL;
4839
0
}
4840
4841
int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
4842
0
{
4843
0
    if ( samples && !strcmp("-",samples) ) return 0;            // keep all samples
4844
4845
0
    int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
4846
0
    hdr->keep_samples = (uint8_t*) calloc(narr,1);
4847
0
    if (!hdr->keep_samples) return -1;
4848
4849
0
    hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
4850
0
    if ( !samples )
4851
0
    {
4852
        // exclude all samples
4853
0
        khint_t k;
4854
0
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict;
4855
0
        new_dict = kh_init(vdict);
4856
0
        if (!new_dict) return -1;
4857
4858
0
        bcf_hdr_nsamples(hdr) = 0;
4859
4860
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
4861
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
4862
0
        kh_destroy(vdict, d);
4863
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
4864
0
        if (bcf_hdr_sync(hdr) < 0) return -1;
4865
4866
0
        return 0;
4867
0
    }
4868
4869
0
    if ( samples[0]=='^' )
4870
0
        for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
4871
4872
0
    int idx, n, ret = 0;
4873
0
    char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
4874
0
    if ( !smpls ) return -1;
4875
0
    for (i=0; i<n; i++)
4876
0
    {
4877
0
        idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
4878
0
        if ( idx<0 )
4879
0
        {
4880
0
            if ( !ret ) ret = i+1;
4881
0
            continue;
4882
0
        }
4883
0
        assert( idx<bcf_hdr_nsamples(hdr) );
4884
0
        if (  samples[0]=='^' )
4885
0
            bit_array_clear(hdr->keep_samples, idx);
4886
0
        else
4887
0
            bit_array_set(hdr->keep_samples, idx);
4888
0
    }
4889
0
    for (i=0; i<n; i++) free(smpls[i]);
4890
0
    free(smpls);
4891
4892
0
    bcf_hdr_nsamples(hdr) = 0;
4893
0
    for (i=0; i<hdr->nsamples_ori; i++)
4894
0
        if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
4895
4896
0
    if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
4897
0
    else
4898
0
    {
4899
        // Make new list and dictionary with desired samples
4900
0
        char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
4901
0
        vdict_t *new_dict, *d;
4902
0
        int k, res;
4903
0
        if (!samples) return -1;
4904
4905
0
        new_dict = kh_init(vdict);
4906
0
        if (!new_dict) {
4907
0
            free(samples);
4908
0
            return -1;
4909
0
        }
4910
0
        idx = 0;
4911
0
        for (i=0; i<hdr->nsamples_ori; i++) {
4912
0
            if ( bit_array_test(hdr->keep_samples,i) ) {
4913
0
                samples[idx] = hdr->samples[i];
4914
0
                k = kh_put(vdict, new_dict, hdr->samples[i], &res);
4915
0
                if (res < 0) {
4916
0
                    free(samples);
4917
0
                    kh_destroy(vdict, new_dict);
4918
0
                    return -1;
4919
0
                }
4920
0
                kh_val(new_dict, k) = bcf_idinfo_def;
4921
0
                kh_val(new_dict, k).id = idx;
4922
0
                idx++;
4923
0
            }
4924
0
        }
4925
4926
        // Delete desired samples from old dictionary, so we don't free them
4927
0
        d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
4928
0
        for (i=0; i < idx; i++) {
4929
0
            int k = kh_get(vdict, d, samples[i]);
4930
0
            if (k < kh_end(d)) kh_del(vdict, d, k);
4931
0
        }
4932
4933
        // Free everything else
4934
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
4935
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
4936
0
        kh_destroy(vdict, d);
4937
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
4938
4939
0
        free(hdr->samples);
4940
0
        hdr->samples = samples;
4941
4942
0
        if (bcf_hdr_sync(hdr) < 0)
4943
0
            return -1;
4944
0
    }
4945
4946
0
    return ret;
4947
0
}
4948
4949
int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
4950
0
{
4951
0
    kstring_t ind;
4952
0
    ind.s = 0; ind.l = ind.m = 0;
4953
0
    if (n) {
4954
0
        bcf_fmt_t fmt[MAX_N_FMT];
4955
0
        int i, j;
4956
0
        uint8_t *ptr = (uint8_t*)v->indiv.s;
4957
0
        for (i = 0; i < v->n_fmt; ++i)
4958
0
            ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
4959
0
        for (i = 0; i < (int)v->n_fmt; ++i) {
4960
0
            bcf_fmt_t *f = &fmt[i];
4961
0
            bcf_enc_int1(&ind, f->id);
4962
0
            bcf_enc_size(&ind, f->n, f->type);
4963
0
            for (j = 0; j < n; ++j)
4964
0
                if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
4965
0
        }
4966
0
        for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
4967
0
        v->n_sample = i;
4968
0
    } else v->n_sample = 0;
4969
0
    if ( !v->n_sample ) v->n_fmt = 0;
4970
0
    free(v->indiv.s);
4971
0
    v->indiv = ind;
4972
0
    v->unpacked &= ~BCF_UN_FMT;    // only BCF is ready for output, VCF will need to unpack again
4973
0
    return 0;
4974
0
}
4975
4976
int bcf_is_snp(bcf1_t *v)
4977
0
{
4978
0
    int i;
4979
0
    bcf_unpack(v, BCF_UN_STR);
4980
0
    for (i = 0; i < v->n_allele; ++i)
4981
0
    {
4982
0
        if ( v->d.allele[i][1]==0 && v->d.allele[i][0]!='*' ) continue;
4983
4984
        // mpileup's <X> allele, see also below. This is not completely satisfactory,
4985
        // a general library is here narrowly tailored to fit samtools.
4986
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
4987
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='*' && v->d.allele[i][2]=='>' ) continue;
4988
4989
0
        break;
4990
0
    }
4991
0
    return i == v->n_allele;
4992
0
}
4993
4994
static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var)
4995
0
{
4996
0
    if ( *alt == '*' && !alt[1] ) { var->n = 0; var->type = VCF_OVERLAP; return; }  // overlapping variant
4997
4998
    // The most frequent case
4999
0
    if ( !ref[1] && !alt[1] )
5000
0
    {
5001
0
        if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
5002
0
        if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
5003
0
        var->n = 1; var->type = VCF_SNP; return;
5004
0
    }
5005
0
    if ( alt[0]=='<' )
5006
0
    {
5007
0
        if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
5008
0
        if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }
5009
0
        if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; }
5010
0
        var->type = VCF_OTHER;
5011
0
        return;
5012
0
    }
5013
5014
    // Catch "joined before" breakend case
5015
0
    if ( alt[0]==']' || alt[0] == '[' )
5016
0
    {
5017
0
        var->type = VCF_BND; return;
5018
0
    }
5019
5020
    // Iterate through alt characters that match the reference
5021
0
    const char *r = ref, *a = alt;
5022
0
    while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; }     // unfortunately, matching REF,ALT case is not guaranteed
5023
5024
0
    if ( *a && !*r )
5025
0
    {
5026
0
        if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend
5027
0
        while ( *a ) a++;
5028
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return;
5029
0
    }
5030
0
    else if ( *r && !*a )
5031
0
    {
5032
0
        while ( *r ) r++;
5033
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return;
5034
0
    }
5035
0
    else if ( !*r && !*a )
5036
0
    {
5037
0
        var->n = 0; var->type = VCF_REF; return;
5038
0
    }
5039
5040
0
    const char *re = r, *ae = a;
5041
0
    while ( re[1] ) re++;
5042
0
    while ( ae[1] ) ae++;
5043
0
    while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
5044
0
    if ( ae==a )
5045
0
    {
5046
0
        if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
5047
0
        var->n = -(re-r);
5048
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; }
5049
0
        var->type = VCF_OTHER; return;
5050
0
    }
5051
0
    else if ( re==r )
5052
0
    {
5053
0
        var->n = ae-a;
5054
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; }
5055
0
        var->type = VCF_OTHER; return;
5056
0
    }
5057
5058
0
    var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
5059
0
    var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
5060
5061
    // should do also complex events, SVs, etc...
5062
0
}
5063
5064
static int bcf_set_variant_types(bcf1_t *b)
5065
0
{
5066
0
    if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
5067
0
    bcf_dec_t *d = &b->d;
5068
0
    if ( d->n_var < b->n_allele )
5069
0
    {
5070
0
        bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*b->n_allele);
5071
0
        if (!new_var)
5072
0
            return -1;
5073
0
        d->var = new_var;
5074
0
        d->n_var = b->n_allele;
5075
0
    }
5076
0
    int i;
5077
0
    b->d.var_type = 0;
5078
0
    d->var[0].type = VCF_REF;
5079
0
    d->var[0].n    = 0;
5080
0
    for (i=1; i<b->n_allele; i++)
5081
0
    {
5082
0
        bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
5083
0
        b->d.var_type |= d->var[i].type;
5084
        //fprintf(stderr,"[set_variant_type] %d   %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
5085
0
    }
5086
0
    return 0;
5087
0
}
5088
5089
// bcf_get_variant_type/bcf_get_variant_types should only return the following,
5090
// to be compatible with callers that are not expecting newer values
5091
// like VCF_INS, VCF_DEL.  The full set is available from the newer
5092
// vcf_has_variant_type* interfaces.
5093
0
#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP)
5094
int bcf_get_variant_types(bcf1_t *rec)
5095
0
{
5096
0
    if ( rec->d.var_type==-1 ) {
5097
0
        if (bcf_set_variant_types(rec) != 0) {
5098
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5099
0
            exit(1); // Due to legacy API having no way to report failures
5100
0
        }
5101
0
    }
5102
0
    return rec->d.var_type & ORIG_VAR_TYPES;
5103
0
}
5104
5105
int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
5106
0
{
5107
0
    if ( rec->d.var_type==-1 ) {
5108
0
        if (bcf_set_variant_types(rec) != 0) {
5109
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5110
0
            exit(1); // Due to legacy API having no way to report failures
5111
0
        }
5112
0
    }
5113
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) {
5114
0
        hts_log_error("Requested allele outside valid range");
5115
0
        exit(1);
5116
0
    }
5117
0
    return rec->d.var[ith_allele].type & ORIG_VAR_TYPES;
5118
0
}
5119
#undef ORIG_VAR_TYPES
5120
5121
int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask)
5122
0
{
5123
0
    if ( rec->d.var_type==-1 ) {
5124
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5125
0
    }
5126
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1;
5127
0
    if (bitmask == VCF_REF) {  // VCF_REF is 0, so handled as a special case
5128
0
        return rec->d.var[ith_allele].type == VCF_REF;
5129
0
    }
5130
0
    return bitmask & rec->d.var[ith_allele].type;
5131
0
}
5132
5133
int bcf_variant_length(bcf1_t *rec, int ith_allele)
5134
0
{
5135
0
    if ( rec->d.var_type==-1 ) {
5136
0
        if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing;
5137
0
    }
5138
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing;
5139
0
    return rec->d.var[ith_allele].n;
5140
0
}
5141
5142
int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask,
5143
                          enum bcf_variant_match mode)
5144
0
{
5145
0
    if ( rec->d.var_type==-1 ) {
5146
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5147
0
    }
5148
0
    uint32_t type = rec->d.var_type;
5149
0
    if ( mode==bcf_match_overlap ) return bitmask & type;
5150
5151
    // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may
5152
    // ask for say `VCF_INS` or `VCF_INDEL` only
5153
0
    if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL;
5154
0
    else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL);
5155
5156
0
    if ( mode==bcf_match_subset )
5157
0
    {
5158
0
        if ( ~bitmask & type ) return 0;
5159
0
        else return bitmask & type;
5160
0
    }
5161
    // mode == bcf_match_exact
5162
0
    return type==bitmask ? type : 0;
5163
0
}
5164
5165
int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5166
0
{
5167
0
    static int negative_rlen_warned = 0;
5168
0
    int is_end_tag;
5169
5170
    // Is the field already present?
5171
0
    int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5172
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1;    // No such INFO field in the header
5173
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5174
5175
0
    is_end_tag = strcmp(key, "END") == 0;
5176
5177
0
    for (i=0; i<line->n_info; i++)
5178
0
        if ( inf_id==line->d.info[i].key ) break;
5179
0
    bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
5180
5181
0
    if ( !n || (type==BCF_HT_STR && !values) )
5182
0
    {
5183
0
        if ( n==0 && is_end_tag )
5184
0
            line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
5185
0
        if ( inf )
5186
0
        {
5187
            // Mark the tag for removal, free existing memory if necessary
5188
0
            if ( inf->vptr_free )
5189
0
            {
5190
0
                free(inf->vptr - inf->vptr_off);
5191
0
                inf->vptr_free = 0;
5192
0
            }
5193
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5194
0
            inf->vptr = NULL;
5195
0
            inf->vptr_off = inf->vptr_len = 0;
5196
0
        }
5197
0
        return 0;
5198
0
    }
5199
5200
0
    if (is_end_tag)
5201
0
    {
5202
0
        if (n != 1)
5203
0
        {
5204
0
            hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5205
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5206
0
            return -1;
5207
0
        }
5208
0
        if (type != BCF_HT_INT && type != BCF_HT_LONG)
5209
0
        {
5210
0
            hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5211
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5212
0
            return -1;
5213
0
        }
5214
0
    }
5215
5216
    // Encode the values and determine the size required to accommodate the values
5217
0
    kstring_t str = {0,0,0};
5218
0
    bcf_enc_int1(&str, inf_id);
5219
0
    if ( type==BCF_HT_INT )
5220
0
        bcf_enc_vint(&str, n, (int32_t*)values, -1);
5221
0
    else if ( type==BCF_HT_REAL )
5222
0
        bcf_enc_vfloat(&str, n, (float*)values);
5223
0
    else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
5224
0
    {
5225
0
        if ( values==NULL )
5226
0
            bcf_enc_size(&str, 0, BCF_BT_NULL);
5227
0
        else
5228
0
            bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
5229
0
    }
5230
#ifdef VCF_ALLOW_INT64
5231
    else if ( type==BCF_HT_LONG )
5232
    {
5233
        if (n != 1) {
5234
            hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5235
            abort();
5236
        }
5237
        bcf_enc_long1(&str, *(int64_t *) values);
5238
    }
5239
#endif
5240
0
    else
5241
0
    {
5242
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5243
0
        abort();
5244
0
    }
5245
5246
    // Is the INFO tag already present
5247
0
    if ( inf )
5248
0
    {
5249
        // Is it big enough to accommodate new block?
5250
0
        if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off )
5251
0
        {
5252
0
            if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
5253
0
            uint8_t *ptr = inf->vptr - inf->vptr_off;
5254
0
            memcpy(ptr, str.s, str.l);
5255
0
            free(str.s);
5256
0
            int vptr_free = inf->vptr_free;
5257
0
            bcf_unpack_info_core1(ptr, inf);
5258
0
            inf->vptr_free = vptr_free;
5259
0
        }
5260
0
        else
5261
0
        {
5262
0
            if ( inf->vptr_free )
5263
0
                free(inf->vptr - inf->vptr_off);
5264
0
            bcf_unpack_info_core1((uint8_t*)str.s, inf);
5265
0
            inf->vptr_free = 1;
5266
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5267
0
        }
5268
0
    }
5269
0
    else
5270
0
    {
5271
        // The tag is not present, create new one
5272
0
        line->n_info++;
5273
0
        hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
5274
0
        inf = &line->d.info[line->n_info-1];
5275
0
        bcf_unpack_info_core1((uint8_t*)str.s, inf);
5276
0
        inf->vptr_free = 1;
5277
0
        line->d.shared_dirty |= BCF1_DIRTY_INF;
5278
0
    }
5279
0
    line->unpacked |= BCF_UN_INFO;
5280
5281
0
   if ( n==1 && is_end_tag) {
5282
0
        hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values;
5283
0
        if ( (type == BCF_HT_INT && end!=bcf_int32_missing) || (type == BCF_HT_LONG && end!=bcf_int64_missing) )
5284
0
        {
5285
0
            if ( end <= line->pos )
5286
0
            {
5287
0
                if ( !negative_rlen_warned )
5288
0
                {
5289
0
                    hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1);
5290
0
                    negative_rlen_warned = 1;
5291
0
                }
5292
0
                line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
5293
0
            }
5294
0
            else
5295
0
                line->rlen = end - line->pos;
5296
0
        }
5297
0
    }
5298
0
    return 0;
5299
0
}
5300
5301
int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
5302
0
{
5303
0
    if ( !n )
5304
0
        return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
5305
5306
0
    int i, max_len = 0;
5307
0
    for (i=0; i<n; i++)
5308
0
    {
5309
0
        int len = strlen(values[i]);
5310
0
        if ( len > max_len ) max_len = len;
5311
0
    }
5312
0
    char *out = (char*) malloc(max_len*n);
5313
0
    if ( !out ) return -2;
5314
0
    for (i=0; i<n; i++)
5315
0
    {
5316
0
        char *dst = out+i*max_len;
5317
0
        const char *src = values[i];
5318
0
        int j = 0;
5319
0
        while ( src[j] ) { dst[j] = src[j]; j++; }
5320
0
        for (; j<max_len; j++) dst[j] = 0;
5321
0
    }
5322
0
    int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
5323
0
    free(out);
5324
0
    return ret;
5325
0
}
5326
5327
int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5328
0
{
5329
    // Is the field already present?
5330
0
    int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5331
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
5332
0
    {
5333
0
        if ( !n ) return 0;
5334
0
        return -1;  // the key not present in the header
5335
0
    }
5336
5337
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5338
5339
0
    for (i=0; i<line->n_fmt; i++)
5340
0
        if ( line->d.fmt[i].id==fmt_id ) break;
5341
0
    bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
5342
5343
0
    if ( !n )
5344
0
    {
5345
0
        if ( fmt )
5346
0
        {
5347
            // Mark the tag for removal, free existing memory if necessary
5348
0
            if ( fmt->p_free )
5349
0
            {
5350
0
                free(fmt->p - fmt->p_off);
5351
0
                fmt->p_free = 0;
5352
0
            }
5353
0
            line->d.indiv_dirty = 1;
5354
0
            fmt->p = NULL;
5355
0
        }
5356
0
        return 0;
5357
0
    }
5358
5359
0
    line->n_sample = bcf_hdr_nsamples(hdr);
5360
0
    int nps = n / line->n_sample;  // number of values per sample
5361
0
    assert( nps && nps*line->n_sample==n );     // must be divisible by n_sample
5362
5363
    // Encode the values and determine the size required to accommodate the values
5364
0
    kstring_t str = {0,0,0};
5365
0
    bcf_enc_int1(&str, fmt_id);
5366
0
    assert(values != NULL);
5367
0
    if ( type==BCF_HT_INT )
5368
0
        bcf_enc_vint(&str, n, (int32_t*)values, nps);
5369
0
    else if ( type==BCF_HT_REAL )
5370
0
    {
5371
0
        bcf_enc_size(&str, nps, BCF_BT_FLOAT);
5372
0
        serialize_float_array(&str, nps*line->n_sample, (float *) values);
5373
0
    }
5374
0
    else if ( type==BCF_HT_STR )
5375
0
    {
5376
0
        bcf_enc_size(&str, nps, BCF_BT_CHAR);
5377
0
        kputsn((char*)values, nps*line->n_sample, &str);
5378
0
    }
5379
0
    else
5380
0
    {
5381
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5382
0
        abort();
5383
0
    }
5384
5385
0
    if ( !fmt )
5386
0
    {
5387
        // Not present, new format field
5388
0
        line->n_fmt++;
5389
0
        hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
5390
5391
        // Special case: VCF specification requires that GT is always first
5392
0
        if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
5393
0
        {
5394
0
            for (i=line->n_fmt-1; i>0; i--)
5395
0
                line->d.fmt[i] = line->d.fmt[i-1];
5396
0
            fmt = &line->d.fmt[0];
5397
0
        }
5398
0
        else
5399
0
            fmt = &line->d.fmt[line->n_fmt-1];
5400
0
        bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5401
0
        line->d.indiv_dirty = 1;
5402
0
        fmt->p_free = 1;
5403
0
    }
5404
0
    else
5405
0
    {
5406
        // The tag is already present, check if it is big enough to accommodate the new block
5407
0
        if ( fmt->p && str.l <= fmt->p_len + fmt->p_off )
5408
0
        {
5409
            // good, the block is big enough
5410
0
            if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
5411
0
            uint8_t *ptr = fmt->p - fmt->p_off;
5412
0
            memcpy(ptr, str.s, str.l);
5413
0
            free(str.s);
5414
0
            int p_free = fmt->p_free;
5415
0
            bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
5416
0
            fmt->p_free = p_free;
5417
0
        }
5418
0
        else
5419
0
        {
5420
0
            if ( fmt->p_free )
5421
0
                free(fmt->p - fmt->p_off);
5422
0
            bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5423
0
            fmt->p_free = 1;
5424
0
            line->d.indiv_dirty = 1;
5425
0
        }
5426
0
    }
5427
0
    line->unpacked |= BCF_UN_FMT;
5428
0
    return 0;
5429
0
}
5430
5431
5432
int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
5433
0
{
5434
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5435
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5436
0
    line->d.n_flt = n;
5437
0
    if ( !n ) return 0;
5438
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5439
0
    int i;
5440
0
    for (i=0; i<n; i++)
5441
0
        line->d.flt[i] = flt_ids[i];
5442
0
    return 0;
5443
0
}
5444
5445
int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
5446
0
{
5447
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5448
0
    int i;
5449
0
    for (i=0; i<line->d.n_flt; i++)
5450
0
        if ( flt_id==line->d.flt[i] ) break;
5451
0
    if ( i<line->d.n_flt ) return 0;    // this filter is already set
5452
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5453
0
    if ( flt_id==0 )    // set to PASS
5454
0
        line->d.n_flt = 1;
5455
0
    else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
5456
0
        line->d.n_flt = 1;
5457
0
    else
5458
0
        line->d.n_flt++;
5459
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5460
0
    line->d.flt[line->d.n_flt-1] = flt_id;
5461
0
    return 1;
5462
0
}
5463
int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
5464
0
{
5465
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5466
0
    int i;
5467
0
    for (i=0; i<line->d.n_flt; i++)
5468
0
        if ( flt_id==line->d.flt[i] ) break;
5469
0
    if ( i==line->d.n_flt ) return 0;   // the filter is not present
5470
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5471
0
    if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
5472
0
    line->d.n_flt--;
5473
0
    if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
5474
0
    return 0;
5475
0
}
5476
5477
int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
5478
0
{
5479
0
    if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
5480
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
5481
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1;  // not defined in the header
5482
5483
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5484
0
    if ( id==0 && !line->d.n_flt) return 1; // PASS
5485
5486
0
    int i;
5487
0
    for (i=0; i<line->d.n_flt; i++)
5488
0
        if ( line->d.flt[i]==id ) return 1;
5489
0
    return 0;
5490
0
}
5491
5492
static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
5493
0
{
5494
0
    line->d.shared_dirty |= BCF1_DIRTY_ALS;
5495
5496
0
    line->n_allele = nals;
5497
0
    hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
5498
5499
0
    char *als = line->d.als;
5500
0
    int n = 0;
5501
0
    while (n<nals)
5502
0
    {
5503
0
        line->d.allele[n] = als;
5504
0
        while ( *als ) als++;
5505
0
        als++;
5506
0
        n++;
5507
0
    }
5508
5509
    // Update REF length. Note that END is 1-based while line->pos 0-based
5510
0
    bcf_info_t *end_info = bcf_get_info(hdr,line,"END");
5511
0
    if ( end_info )
5512
0
    {
5513
0
        if ( end_info->type==BCF_HT_INT && end_info->v1.i==bcf_int32_missing ) end_info = NULL;
5514
0
        else if ( end_info->type==BCF_HT_LONG && end_info->v1.i==bcf_int64_missing ) end_info = NULL;
5515
0
    }
5516
0
    if ( end_info && end_info->v1.i > line->pos )
5517
0
        line->rlen = end_info->v1.i - line->pos;
5518
0
    else if ( nals > 0 )
5519
0
        line->rlen = strlen(line->d.allele[0]);
5520
0
    else
5521
0
        line->rlen = 0;
5522
5523
0
    return 0;
5524
0
}
5525
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
5526
0
{
5527
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5528
0
    char *free_old = NULL;
5529
0
    char buffer[256];
5530
0
    size_t used = 0;
5531
5532
    // The pointers in alleles may point into the existing line->d.als memory,
5533
    // so care needs to be taken not to clobber them while updating.  Usually
5534
    // they will be short so we can copy through an intermediate buffer.
5535
    // If they're longer, or won't fit in the existing allocation we
5536
    // can allocate a new buffer to write into.  Note that in either case
5537
    // pointers to line->d.als memory in alleles may not be valid when we've
5538
    // finished.
5539
0
    int i;
5540
0
    size_t avail = line->d.m_als < sizeof(buffer) ? line->d.m_als : sizeof(buffer);
5541
0
    for (i=0; i<nals; i++) {
5542
0
        size_t sz = strlen(alleles[i]) + 1;
5543
0
        if (avail - used < sz)
5544
0
            break;
5545
0
        memcpy(buffer + used, alleles[i], sz);
5546
0
        used += sz;
5547
0
    }
5548
5549
    // Did we miss anything?
5550
0
    if (i < nals) {
5551
0
        int j;
5552
0
        size_t needed = used;
5553
0
        char *new_als;
5554
0
        for (j = i; j < nals; j++)
5555
0
            needed += strlen(alleles[j]) + 1;
5556
0
        if (needed < line->d.m_als) // Don't shrink the buffer
5557
0
            needed = line->d.m_als;
5558
0
        if (needed > INT_MAX) {
5559
0
            hts_log_error("REF + alleles too long to fit in a BCF record");
5560
0
            return -1;
5561
0
        }
5562
0
        new_als = malloc(needed);
5563
0
        if (!new_als)
5564
0
            return -1;
5565
0
        free_old = line->d.als;
5566
0
        line->d.als = new_als;
5567
0
        line->d.m_als = needed;
5568
0
    }
5569
5570
    // Copy from the temp buffer to the destination
5571
0
    if (used) {
5572
0
        assert(used <= line->d.m_als);
5573
0
        memcpy(line->d.als, buffer, used);
5574
0
    }
5575
5576
    // Add in any remaining entries - if this happens we will always be
5577
    // writing to a newly-allocated buffer.
5578
0
    for (; i < nals; i++) {
5579
0
        size_t sz = strlen(alleles[i]) + 1;
5580
0
        memcpy(line->d.als + used, alleles[i], sz);
5581
0
        used += sz;
5582
0
    }
5583
5584
0
    if (free_old)
5585
0
        free(free_old);
5586
0
    return _bcf1_sync_alleles(hdr,line,nals);
5587
0
}
5588
5589
int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
5590
0
{
5591
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5592
0
    kstring_t tmp;
5593
0
    tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
5594
0
    kputs(alleles_string, &tmp);
5595
0
    line->d.als = tmp.s; line->d.m_als = tmp.m;
5596
5597
0
    int nals = 1;
5598
0
    char *t = line->d.als;
5599
0
    while (*t)
5600
0
    {
5601
0
        if ( *t==',' ) { *t = 0; nals++; }
5602
0
        t++;
5603
0
    }
5604
0
    return _bcf1_sync_alleles(hdr, line, nals);
5605
0
}
5606
5607
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
5608
0
{
5609
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5610
0
    kstring_t tmp;
5611
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
5612
0
    if ( id )
5613
0
        kputs(id, &tmp);
5614
0
    else
5615
0
        kputs(".", &tmp);
5616
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
5617
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
5618
0
    return 0;
5619
0
}
5620
5621
int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
5622
0
{
5623
0
    if ( !id ) return 0;
5624
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5625
5626
0
    kstring_t tmp;
5627
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
5628
5629
0
    int len = strlen(id);
5630
0
    char *dst = line->d.id;
5631
0
    while ( *dst && (dst=strstr(dst,id)) )
5632
0
    {
5633
0
        if ( dst[len]!=0 && dst[len]!=';' ) dst++;              // a prefix, not a match
5634
0
        else if ( dst==line->d.id || dst[-1]==';' ) return 0;   // already present
5635
0
        dst++;  // a suffix, not a match
5636
0
    }
5637
0
    if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
5638
0
    {
5639
0
        tmp.l = strlen(line->d.id);
5640
0
        kputc(';',&tmp);
5641
0
    }
5642
0
    kputs(id,&tmp);
5643
5644
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
5645
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
5646
0
    return 0;
5647
5648
0
}
5649
5650
bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
5651
0
{
5652
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
5653
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL;   // no such FMT field in the header
5654
0
    return bcf_get_fmt_id(line, id);
5655
0
}
5656
5657
bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
5658
0
{
5659
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
5660
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL;   // no such INFO field in the header
5661
0
    return bcf_get_info_id(line, id);
5662
0
}
5663
5664
bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
5665
0
{
5666
0
    int i;
5667
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5668
0
    for (i=0; i<line->n_fmt; i++)
5669
0
    {
5670
0
        if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
5671
0
    }
5672
0
    return NULL;
5673
0
}
5674
5675
bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
5676
0
{
5677
0
    int i;
5678
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5679
0
    for (i=0; i<line->n_info; i++)
5680
0
    {
5681
0
        if ( line->d.info[i].key==id ) return &line->d.info[i];
5682
0
    }
5683
0
    return NULL;
5684
0
}
5685
5686
5687
int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
5688
0
{
5689
0
    int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
5690
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1;    // no such INFO field in the header
5691
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2;     // expected different type
5692
5693
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5694
5695
0
    for (i=0; i<line->n_info; i++)
5696
0
        if ( line->d.info[i].key==tag_id ) break;
5697
0
    if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3;       // the tag is not present in this record
5698
0
    if ( type==BCF_HT_FLAG ) return 1;
5699
5700
0
    bcf_info_t *info = &line->d.info[i];
5701
0
    if ( !info->vptr ) return -3;           // the tag was marked for removal
5702
0
    if ( type==BCF_HT_STR )
5703
0
    {
5704
0
        if ( *ndst < info->len+1 )
5705
0
        {
5706
0
            *ndst = info->len + 1;
5707
0
            *dst  = realloc(*dst, *ndst);
5708
0
        }
5709
0
        memcpy(*dst,info->vptr,info->len);
5710
0
        ((uint8_t*)*dst)[info->len] = 0;
5711
0
        return info->len;
5712
0
    }
5713
5714
    // Make sure the buffer is big enough
5715
0
    int size1;
5716
0
    switch (type) {
5717
0
        case BCF_HT_INT:  size1 = sizeof(int32_t); break;
5718
0
        case BCF_HT_LONG: size1 = sizeof(int64_t); break;
5719
0
        case BCF_HT_REAL: size1 = sizeof(float); break;
5720
0
        default:
5721
0
            hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5722
0
            return -2;
5723
0
    }
5724
0
    if ( *ndst < info->len )
5725
0
    {
5726
0
        *ndst = info->len;
5727
0
        *dst  = realloc(*dst, *ndst * size1);
5728
0
    }
5729
5730
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \
5731
0
        out_type_t *tmp = (out_type_t *) *dst; \
5732
0
        int j; \
5733
0
        for (j=0; j<info->len; j++) \
5734
0
        { \
5735
0
            type_t p = convert(info->vptr + j * sizeof(type_t)); \
5736
0
            if ( is_vector_end ) break; \
5737
0
            if ( is_missing ) set_missing; \
5738
0
            else set_regular; \
5739
0
            tmp++; \
5740
0
        } \
5741
0
        ret = j; \
5742
0
    } while (0)
5743
0
    switch (info->type) {
5744
0
        case BCF_BT_INT8:
5745
0
            if (type == BCF_HT_LONG) {
5746
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int64_missing, *tmp=p, int64_t);
5747
0
            } else {
5748
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=p, int32_t);
5749
0
            }
5750
0
            break;
5751
0
        case BCF_BT_INT16:
5752
0
            if (type == BCF_HT_LONG) {
5753
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
5754
0
            } else {
5755
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
5756
0
            }
5757
0
            break;
5758
0
        case BCF_BT_INT32:
5759
0
            if (type == BCF_HT_LONG) {
5760
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break;
5761
0
            } else {
5762
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
5763
0
            }
5764
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
5765
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
5766
0
    }
5767
0
    #undef BRANCH
5768
0
    return ret;  // set by BRANCH
5769
0
}
5770
5771
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
5772
0
{
5773
0
    int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
5774
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
5775
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;     // expected different type
5776
5777
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5778
5779
0
    for (i=0; i<line->n_fmt; i++)
5780
0
        if ( line->d.fmt[i].id==tag_id ) break;
5781
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
5782
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
5783
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
5784
5785
0
    int nsmpl = bcf_hdr_nsamples(hdr);
5786
0
    if ( !*dst )
5787
0
    {
5788
0
        *dst = (char**) malloc(sizeof(char*)*nsmpl);
5789
0
        if ( !*dst ) return -4;     // could not alloc
5790
0
        (*dst)[0] = NULL;
5791
0
    }
5792
0
    int n = (fmt->n+1)*nsmpl;
5793
0
    if ( *ndst < n )
5794
0
    {
5795
0
        (*dst)[0] = realloc((*dst)[0], n);
5796
0
        if ( !(*dst)[0] ) return -4;    // could not alloc
5797
0
        *ndst = n;
5798
0
    }
5799
0
    for (i=0; i<nsmpl; i++)
5800
0
    {
5801
0
        uint8_t *src = fmt->p + i*fmt->n;
5802
0
        uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
5803
0
        memcpy(tmp,src,fmt->n);
5804
0
        tmp[fmt->n] = 0;
5805
0
        (*dst)[i] = (char*) tmp;
5806
0
    }
5807
0
    return n;
5808
0
}
5809
5810
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
5811
0
{
5812
0
    int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
5813
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
5814
0
    if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
5815
0
    {
5816
        // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
5817
0
        if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
5818
0
    }
5819
0
    else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2;     // expected different type
5820
5821
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5822
5823
0
    for (i=0; i<line->n_fmt; i++)
5824
0
        if ( line->d.fmt[i].id==tag_id ) break;
5825
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
5826
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
5827
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
5828
5829
0
    if ( type==BCF_HT_STR )
5830
0
    {
5831
0
        int n = fmt->n*bcf_hdr_nsamples(hdr);
5832
0
        if ( *ndst < n )
5833
0
        {
5834
0
            *dst  = realloc(*dst, n);
5835
0
            if ( !*dst ) return -4;     // could not alloc
5836
0
            *ndst = n;
5837
0
        }
5838
0
        memcpy(*dst,fmt->p,n);
5839
0
        return n;
5840
0
    }
5841
5842
    // Make sure the buffer is big enough
5843
0
    int nsmpl = bcf_hdr_nsamples(hdr);
5844
0
    int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
5845
0
    if ( *ndst < fmt->n*nsmpl )
5846
0
    {
5847
0
        *ndst = fmt->n*nsmpl;
5848
0
        *dst  = realloc(*dst, *ndst*size1);
5849
0
        if ( !*dst ) return -4;     // could not alloc
5850
0
    }
5851
5852
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
5853
0
        out_type_t *tmp = (out_type_t *) *dst; \
5854
0
        uint8_t *fmt_p = fmt->p; \
5855
0
        for (i=0; i<nsmpl; i++) \
5856
0
        { \
5857
0
            for (j=0; j<fmt->n; j++) \
5858
0
            { \
5859
0
                type_t p = convert(fmt_p + j * sizeof(type_t)); \
5860
0
                if ( is_missing ) set_missing; \
5861
0
                else if ( is_vector_end ) { set_vector_end; break; } \
5862
0
                else set_regular; \
5863
0
                tmp++; \
5864
0
            } \
5865
0
            for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
5866
0
            fmt_p += fmt->size; \
5867
0
        } \
5868
0
    }
5869
0
    switch (fmt->type) {
5870
0
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
5871
0
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
5872
0
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
5873
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
5874
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1);
5875
0
    }
5876
0
    #undef BRANCH
5877
0
    return nsmpl*fmt->n;
5878
0
}
5879
5880
//error description structure definition
5881
typedef struct err_desc {
5882
    int  errorcode;
5883
    const char *description;
5884
}err_desc;
5885
5886
// error descriptions
5887
static const err_desc errdesc_bcf[] = {
5888
    { BCF_ERR_CTG_UNDEF, "Contig not defined in header"},
5889
    { BCF_ERR_TAG_UNDEF, "Tag not defined in header" },
5890
    { BCF_ERR_NCOLS, "Incorrect number of columns" },
5891
    { BCF_ERR_LIMITS, "Limits reached" },
5892
    { BCF_ERR_CHAR, "Invalid character" },
5893
    { BCF_ERR_CTG_INVALID, "Invalid contig" },
5894
    { BCF_ERR_TAG_INVALID, "Invalid tag" },
5895
};
5896
5897
/// append given description to buffer based on available size and add ... when not enough space
5898
    /** @param buffer       buffer to which description to be appended
5899
        @param offset       offset at which to be appended
5900
        @param maxbuffer    maximum size of the buffer
5901
        @param description  the description to be appended
5902
on failure returns -1 - when buffer is not big enough; returns -1 on invalid params and on too small buffer which are improbable due to validation at caller site
5903
on success returns 0
5904
    */
5905
5.71k
static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) {
5906
5907
5.71k
    if (!description || !buffer || !offset || (maxbuffer < 4))
5908
0
        return -1;
5909
5910
5.71k
    size_t rembuffer = maxbuffer - *offset;
5911
5.71k
    if (rembuffer > (strlen(description) + (rembuffer == maxbuffer ? 0 : 1))) {    //add description with optionally required ','
5912
5.71k
        *offset += snprintf(buffer + *offset, rembuffer, "%s%s", (rembuffer == maxbuffer)? "": ",", description);
5913
5.71k
    } else {    //not enough space for description, put ...
5914
0
        size_t tmppos = (rembuffer <= 4) ? maxbuffer - 4 : *offset;
5915
0
        snprintf(buffer + tmppos, 4, "...");    //ignore offset update
5916
0
        return -1;
5917
0
    }
5918
5.71k
    return 0;
5919
5.71k
}
5920
5921
//get description for given error code. return NULL on error
5922
2.69k
const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) {
5923
2.69k
    size_t usedup = 0;
5924
2.69k
    int ret = 0;
5925
2.69k
    int idx;
5926
5927
2.69k
    if (!buffer || maxbuffer < 4)
5928
0
        return NULL;           //invalid / insufficient buffer
5929
5930
2.69k
    if (!errorcode) {
5931
0
        buffer[0] = '\0';      //no error, set null
5932
0
        return buffer;
5933
0
    }
5934
5935
21.5k
    for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) {
5936
18.8k
        if (errorcode & errdesc_bcf[idx].errorcode) {    //error is set, add description
5937
5.71k
            ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description);
5938
5.71k
            if (ret < 0)
5939
0
                break;         //not enough space, ... added, no need to continue
5940
5941
5.71k
            errorcode &= ~errdesc_bcf[idx].errorcode;    //reset the error
5942
5.71k
        }
5943
18.8k
    }
5944
5945
2.69k
    if (errorcode && (ret >= 0))  {     //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§
5946
0
        add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error");
5947
0
    }
5948
2.69k
    return buffer;
5949
2.69k
}
5950