Coverage Report

Created: 2023-11-19 06:13

/src/htslib/vcf.c
Line
Count
Source (jump to first uncovered line)
1
/*  vcf.c -- VCF/BCF API functions.
2
3
    Copyright (C) 2012, 2013 Broad Institute.
4
    Copyright (C) 2012-2023 Genome Research Ltd.
5
    Portions copyright (C) 2014 Intel Corporation.
6
7
    Author: Heng Li <lh3@sanger.ac.uk>
8
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
of this software and associated documentation files (the "Software"), to deal
11
in the Software without restriction, including without limitation the rights
12
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
copies of the Software, and to permit persons to whom the Software is
14
furnished to do so, subject to the following conditions:
15
16
The above copyright notice and this permission notice shall be included in
17
all copies or substantial portions of the Software.
18
19
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
DEALINGS IN THE SOFTWARE.  */
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdio.h>
31
#include <assert.h>
32
#include <string.h>
33
#include <strings.h>
34
#include <stdlib.h>
35
#include <limits.h>
36
#include <stdint.h>
37
#include <inttypes.h>
38
#include <errno.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
#include "htslib/vcf.h"
45
#include "htslib/bgzf.h"
46
#include "htslib/tbx.h"
47
#include "htslib/hfile.h"
48
#include "hts_internal.h"
49
#include "htslib/hts_endian.h"
50
#include "htslib/khash_str2int.h"
51
#include "htslib/kstring.h"
52
#include "htslib/sam.h"
53
#include "htslib/khash.h"
54
55
#if 0
56
// This helps on Intel a bit, often 6-7% faster VCF parsing.
57
// Conversely sometimes harms AMD Zen4 as ~9% slower.
58
// Possibly related to IPC differences.  However for now it's just a
59
// curiousity we ignore and stick with the simpler code.
60
//
61
// Left here as a hint for future explorers.
62
static inline int xstreq(const char *a, const char *b) {
63
    while (*a && *a == *b)
64
        a++, b++;
65
    return *a == *b;
66
}
67
68
#define KHASH_MAP_INIT_XSTR(name, khval_t) \
69
  KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq)
70
71
KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t)
72
#else
73
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
74
#endif
75
76
typedef khash_t(vdict) vdict_t;
77
78
KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
79
typedef khash_t(hdict) hdict_t;
80
81
82
#include "htslib/kseq.h"
83
HTSLIB_EXPORT
84
uint32_t bcf_float_missing    = 0x7F800001;
85
86
HTSLIB_EXPORT
87
uint32_t bcf_float_vector_end = 0x7F800002;
88
89
HTSLIB_EXPORT
90
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
91
92
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
93
94
/*
95
    Partial support for 64-bit POS and Number=1 INFO tags.
96
    Notes:
97
     - the support for 64-bit values is motivated by POS and INFO/END for large genomes
98
     - the use of 64-bit values does not conform to the specification
99
     - cannot output 64-bit BCF and if it does, it is not compatible with anything
100
     - experimental, use at your risk
101
*/
102
#ifdef VCF_ALLOW_INT64
103
    #define BCF_MAX_BT_INT64 (0x7fffffffffffffff)       /* INT64_MAX, for internal use only */
104
    #define BCF_MIN_BT_INT64 -9223372036854775800LL     /* INT64_MIN + 8, for internal use only */
105
#endif
106
107
618
#define BCF_IS_64BIT (1<<30)
108
109
110
// Opaque structure with auxilary data which allows to extend bcf_hdr_t without breaking ABI.
111
// Note that this preserving API and ABI requires that the first element is vdict_t struct
112
// rather than a pointer, as user programs may (and in some cases do) access the dictionary
113
// directly as (vdict_t*)hdr->dict.
114
typedef struct
115
{
116
    vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
117
    hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
118
    size_t *key_len;// length of h->id[BCF_DT_ID] strings
119
}
120
bcf_hdr_aux_t;
121
122
static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr)
123
224k
{
124
224k
    return (bcf_hdr_aux_t *)hdr->dict[0];
125
224k
}
126
127
static char *find_chrom_header_line(char *s)
128
0
{
129
0
    char *nl;
130
0
    if (strncmp(s, "#CHROM\t", 7) == 0) return s;
131
0
    else if ((nl = strstr(s, "\n#CHROM\t")) != NULL) return nl+1;
132
0
    else return NULL;
133
0
}
134
135
/*************************
136
 *** VCF header parser ***
137
 *************************/
138
139
static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
140
11.4k
{
141
11.4k
    const char *ss = s;
142
12.0k
    while ( *ss && isspace_c(*ss) && ss - s < len) ss++;
143
11.4k
    if ( !*ss || ss - s == len)
144
5
    {
145
5
        hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
146
5
        return -1;
147
5
    }
148
149
11.4k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
150
11.4k
    int ret;
151
11.4k
    char *sdup = malloc(len + 1);
152
11.4k
    if (!sdup) return -1;
153
11.4k
    memcpy(sdup, s, len);
154
11.4k
    sdup[len] = 0;
155
156
    // Ensure space is available in h->samples
157
11.4k
    size_t n = kh_size(d);
158
11.4k
    char **new_samples = realloc(h->samples, sizeof(char*) * (n + 1));
159
11.4k
    if (!new_samples) {
160
0
        free(sdup);
161
0
        return -1;
162
0
    }
163
11.4k
    h->samples = new_samples;
164
165
11.4k
    int k = kh_put(vdict, d, sdup, &ret);
166
11.4k
    if (ret < 0) {
167
0
        free(sdup);
168
0
        return -1;
169
0
    }
170
11.4k
    if (ret) { // absent
171
11.4k
        kh_val(d, k) = bcf_idinfo_def;
172
11.4k
        kh_val(d, k).id = n;
173
11.4k
    } else {
174
4
        hts_log_error("Duplicated sample name '%s'", sdup);
175
4
        free(sdup);
176
4
        return -1;
177
4
    }
178
11.4k
    h->samples[n] = sdup;
179
11.4k
    h->dirty = 1;
180
11.4k
    return 0;
181
11.4k
}
182
183
int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
184
0
{
185
0
    if (!s) {
186
        // Allowed for backwards-compatibility, calling with s == NULL
187
        // used to trigger bcf_hdr_sync(h);
188
0
        return 0;
189
0
    }
190
0
    return bcf_hdr_add_sample_len(h, s, strlen(s));
191
0
}
192
193
int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str)
194
5.14k
{
195
5.14k
    const char *mandatory = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
196
5.14k
    if ( strncmp(str,mandatory,strlen(mandatory)) )
197
33
    {
198
33
        hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str);
199
33
        return -1;
200
33
    }
201
202
5.11k
    const char *beg = str + strlen(mandatory), *end;
203
5.11k
    if ( !*beg || *beg=='\n' ) return 0;
204
2.14k
    if ( strncmp(beg,"\tFORMAT\t",8) )
205
19
    {
206
19
        hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str);
207
19
        return -1;
208
19
    }
209
2.12k
    beg += 8;
210
211
2.12k
    int ret = 0;
212
11.7k
    while ( *beg )
213
11.4k
    {
214
11.4k
        end = beg;
215
362M
        while ( *end && *end!='\t' && *end!='\n' ) end++;
216
11.4k
        if ( bcf_hdr_add_sample_len(hdr, beg, end-beg) < 0 ) ret = -1;
217
11.4k
        if ( !*end || *end=='\n' || ret<0 ) break;
218
9.58k
        beg = end + 1;
219
9.58k
    }
220
2.12k
    return ret;
221
2.14k
}
222
223
int bcf_hdr_sync(bcf_hdr_t *h)
224
74.6k
{
225
74.6k
    int i;
226
298k
    for (i = 0; i < 3; i++)
227
223k
    {
228
223k
        vdict_t *d = (vdict_t*)h->dict[i];
229
223k
        khint_t k;
230
223k
        if ( h->n[i] < kh_size(d) )
231
1.88k
        {
232
1.88k
            bcf_idpair_t *new_idpair;
233
            // this should be true only for i=2, BCF_DT_SAMPLE
234
1.88k
            new_idpair = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t));
235
1.88k
            if (!new_idpair) return -1;
236
1.88k
            h->n[i] = kh_size(d);
237
1.88k
            h->id[i] = new_idpair;
238
1.88k
        }
239
2.59G
        for (k=kh_begin(d); k<kh_end(d); k++)
240
2.59G
        {
241
2.59G
            if (!kh_exist(d,k)) continue;
242
11.0M
            h->id[i][kh_val(d,k).id].key = kh_key(d,k);
243
11.0M
            h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
244
11.0M
        }
245
223k
    }
246
247
    // Invalidate key length cache
248
74.6k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
249
74.6k
    if (aux && aux->key_len) {
250
4.13k
        free(aux->key_len);
251
4.13k
        aux->key_len = NULL;
252
4.13k
    }
253
254
74.6k
    h->dirty = 0;
255
74.6k
    return 0;
256
74.6k
}
257
258
void bcf_hrec_destroy(bcf_hrec_t *hrec)
259
124k
{
260
124k
    if (!hrec) return;
261
122k
    free(hrec->key);
262
122k
    if ( hrec->value ) free(hrec->value);
263
122k
    int i;
264
477k
    for (i=0; i<hrec->nkeys; i++)
265
355k
    {
266
355k
        free(hrec->keys[i]);
267
355k
        free(hrec->vals[i]);
268
355k
    }
269
122k
    free(hrec->keys);
270
122k
    free(hrec->vals);
271
122k
    free(hrec);
272
122k
}
273
274
// Copies all fields except IDX.
275
bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
276
0
{
277
0
    int save_errno;
278
0
    bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
279
0
    if (!out) return NULL;
280
281
0
    out->type = hrec->type;
282
0
    if ( hrec->key ) {
283
0
        out->key = strdup(hrec->key);
284
0
        if (!out->key) goto fail;
285
0
    }
286
0
    if ( hrec->value ) {
287
0
        out->value = strdup(hrec->value);
288
0
        if (!out->value) goto fail;
289
0
    }
290
0
    out->nkeys = hrec->nkeys;
291
0
    out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
292
0
    if (!out->keys) goto fail;
293
0
    out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
294
0
    if (!out->vals) goto fail;
295
0
    int i, j = 0;
296
0
    for (i=0; i<hrec->nkeys; i++)
297
0
    {
298
0
        if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
299
0
        if ( hrec->keys[i] ) {
300
0
            out->keys[j] = strdup(hrec->keys[i]);
301
0
            if (!out->keys[j]) goto fail;
302
0
        }
303
0
        if ( hrec->vals[i] ) {
304
0
            out->vals[j] = strdup(hrec->vals[i]);
305
0
            if (!out->vals[j]) goto fail;
306
0
        }
307
0
        j++;
308
0
    }
309
0
    if ( i!=j ) out->nkeys -= i-j;   // IDX was omitted
310
0
    return out;
311
312
0
 fail:
313
0
    save_errno = errno;
314
0
    hts_log_error("%s", strerror(errno));
315
0
    bcf_hrec_destroy(out);
316
0
    errno = save_errno;
317
0
    return NULL;
318
0
}
319
320
void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
321
0
{
322
0
    fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
323
0
    int i;
324
0
    for (i=0; i<hrec->nkeys; i++)
325
0
        fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
326
0
    fprintf(fp, "\n");
327
0
}
328
329
void bcf_header_debug(bcf_hdr_t *hdr)
330
0
{
331
0
    int i, j;
332
0
    for (i=0; i<hdr->nhrec; i++)
333
0
    {
334
0
        if ( !hdr->hrec[i]->value )
335
0
        {
336
0
            fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
337
0
            fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
338
0
            for (j=1; j<hdr->hrec[i]->nkeys; j++)
339
0
                fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
340
0
            fprintf(stderr,">\n");
341
0
        }
342
0
        else
343
0
            fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
344
0
    }
345
0
}
346
347
int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
348
276k
{
349
276k
    char **tmp;
350
276k
    size_t n = hrec->nkeys + 1;
351
276k
    assert(len > 0 && len < SIZE_MAX);
352
276k
    tmp = realloc(hrec->keys, sizeof(char*)*n);
353
276k
    if (!tmp) return -1;
354
276k
    hrec->keys = tmp;
355
276k
    tmp = realloc(hrec->vals, sizeof(char*)*n);
356
276k
    if (!tmp) return -1;
357
276k
    hrec->vals = tmp;
358
359
276k
    hrec->keys[hrec->nkeys] = (char*) malloc((len+1)*sizeof(char));
360
276k
    if (!hrec->keys[hrec->nkeys]) return -1;
361
276k
    memcpy(hrec->keys[hrec->nkeys],str,len);
362
276k
    hrec->keys[hrec->nkeys][len] = 0;
363
276k
    hrec->vals[hrec->nkeys] = NULL;
364
276k
    hrec->nkeys = n;
365
276k
    return 0;
366
276k
}
367
368
int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
369
276k
{
370
276k
    if ( hrec->vals[i] ) {
371
0
        free(hrec->vals[i]);
372
0
        hrec->vals[i] = NULL;
373
0
    }
374
276k
    if ( !str ) return 0;
375
276k
    if ( is_quoted )
376
97.3k
    {
377
97.3k
        if (len >= SIZE_MAX - 3) {
378
0
            errno = ENOMEM;
379
0
            return -1;
380
0
        }
381
97.3k
        hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
382
97.3k
        if (!hrec->vals[i]) return -1;
383
97.3k
        hrec->vals[i][0] = '"';
384
97.3k
        memcpy(&hrec->vals[i][1],str,len);
385
97.3k
        hrec->vals[i][len+1] = '"';
386
97.3k
        hrec->vals[i][len+2] = 0;
387
97.3k
    }
388
179k
    else
389
179k
    {
390
179k
        if (len == SIZE_MAX) {
391
0
            errno = ENOMEM;
392
0
            return -1;
393
0
        }
394
179k
        hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
395
179k
        if (!hrec->vals[i]) return -1;
396
179k
        memcpy(hrec->vals[i],str,len);
397
179k
        hrec->vals[i][len] = 0;
398
179k
    }
399
276k
    return 0;
400
276k
}
401
402
int hrec_add_idx(bcf_hrec_t *hrec, int idx)
403
78.2k
{
404
78.2k
    int n = hrec->nkeys + 1;
405
78.2k
    char **tmp = (char**) realloc(hrec->keys, sizeof(char*)*n);
406
78.2k
    if (!tmp) return -1;
407
78.2k
    hrec->keys = tmp;
408
409
78.2k
    tmp = (char**) realloc(hrec->vals, sizeof(char*)*n);
410
78.2k
    if (!tmp) return -1;
411
78.2k
    hrec->vals = tmp;
412
413
78.2k
    hrec->keys[hrec->nkeys] = strdup("IDX");
414
78.2k
    if (!hrec->keys[hrec->nkeys]) return -1;
415
416
78.2k
    kstring_t str = {0,0,0};
417
78.2k
    if (kputw(idx, &str) < 0) {
418
0
        free(hrec->keys[hrec->nkeys]);
419
0
        return -1;
420
0
    }
421
78.2k
    hrec->vals[hrec->nkeys] = str.s;
422
78.2k
    hrec->nkeys = n;
423
78.2k
    return 0;
424
78.2k
}
425
426
int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
427
92.4k
{
428
92.4k
    int i;
429
156k
    for (i=0; i<hrec->nkeys; i++)
430
127k
        if ( !strcasecmp(key,hrec->keys[i]) ) return i;
431
29.2k
    return -1;
432
92.4k
}
433
434
static void bcf_hrec_set_type(bcf_hrec_t *hrec)
435
237k
{
436
237k
    if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG;
437
217k
    else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
438
145k
    else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
439
76.9k
    else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
440
53.0k
    else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR;
441
36.0k
    else hrec->type = BCF_HL_GEN;
442
237k
}
443
444
445
/**
446
    The arrays were generated with
447
448
    valid_ctg:
449
        perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
450
451
    valid_tag:
452
        perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
453
*/
454
static const uint8_t valid_ctg[256] =
455
{
456
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
457
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
458
    0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
459
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
460
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
461
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
462
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
463
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
464
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
465
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
466
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
467
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
468
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
469
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
470
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
471
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
472
};
473
static const uint8_t valid_tag[256] =
474
{
475
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
476
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
477
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
478
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
479
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
480
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
481
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
482
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
483
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
484
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
485
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
486
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
487
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
488
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
489
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
490
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
491
};
492
493
/**
494
    bcf_hrec_check() - check the validity of structured header lines
495
496
    Returns 0 on success or negative value on error.
497
498
    Currently the return status is not checked by the caller
499
    and only a warning is printed on stderr. This should be improved
500
    to propagate the error all the way up to the caller and let it
501
    decide what to do: throw an error or proceed anyway.
502
 */
503
static int bcf_hrec_check(bcf_hrec_t *hrec)
504
118k
{
505
118k
    int i;
506
118k
    bcf_hrec_set_type(hrec);
507
508
118k
    if ( hrec->type==BCF_HL_CTG )
509
9.82k
    {
510
9.82k
        i = bcf_hrec_find_key(hrec,"ID");
511
9.82k
        if ( i<0 ) goto err_missing_id;
512
8.33k
        char *val = hrec->vals[i];
513
8.33k
        if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg;
514
175k
        while ( *(++val) )
515
175k
            if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg;
516
676
        return 0;
517
2.47k
    }
518
108k
    if ( hrec->type==BCF_HL_INFO )
519
36.0k
    {
520
36.0k
        i = bcf_hrec_find_key(hrec,"ID");
521
36.0k
        if ( i<0 ) goto err_missing_id;
522
31.2k
        char *val = hrec->vals[i];
523
31.2k
        if ( !strcmp(val,"1000G") ) return 0;
524
30.8k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
525
8.40k
        while ( *(++val) )
526
6.72k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
527
1.68k
        return 0;
528
3.83k
    }
529
72.7k
    if ( hrec->type==BCF_HL_FMT )
530
11.9k
    {
531
11.9k
        i = bcf_hrec_find_key(hrec,"ID");
532
11.9k
        if ( i<0 ) goto err_missing_id;
533
11.2k
        char *val = hrec->vals[i];
534
11.2k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
535
11.8k
        while ( *(++val) )
536
8.73k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
537
3.10k
        return 0;
538
4.96k
    }
539
60.7k
    return 0;
540
541
7.03k
  err_missing_id:
542
7.03k
    hts_log_warning("Missing ID attribute in one or more header lines");
543
7.03k
    return -1;
544
545
7.66k
  err_invalid_ctg:
546
7.66k
    hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]);
547
7.66k
    return -1;
548
549
37.3k
  err_invalid_tag:
550
37.3k
    hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]);
551
37.3k
    return -1;
552
72.7k
}
553
554
static inline int is_escaped(const char *min, const char *str)
555
93.6k
{
556
93.6k
    int n = 0;
557
94.5k
    while ( --str>=min && *str=='\\' ) n++;
558
93.6k
    return n%2;
559
93.6k
}
560
561
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
562
132k
{
563
132k
    bcf_hrec_t *hrec = NULL;
564
132k
    const char *p = line;
565
132k
    if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
566
124k
    p += 2;
567
568
124k
    const char *q = p;
569
1.12M
    while ( *q && *q!='=' && *q != '\n' ) q++;
570
124k
    ptrdiff_t n = q-p;
571
124k
    if ( *q!='=' || !n ) // wrong format
572
2.24k
        goto malformed_line;
573
574
122k
    hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
575
122k
    if (!hrec) { *len = -1; return NULL; }
576
122k
    hrec->key = (char*) malloc(sizeof(char)*(n+1));
577
122k
    if (!hrec->key) goto fail;
578
122k
    memcpy(hrec->key,p,n);
579
122k
    hrec->key[n] = 0;
580
122k
    hrec->type = -1;
581
582
122k
    p = ++q;
583
122k
    if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
584
19.5k
    {
585
10.4M
        while ( *q && *q!='\n' ) q++;
586
19.5k
        hrec->value = (char*) malloc((q-p+1)*sizeof(char));
587
19.5k
        if (!hrec->value) goto fail;
588
19.5k
        memcpy(hrec->value, p, q-p);
589
19.5k
        hrec->value[q-p] = 0;
590
19.5k
        *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
591
19.5k
        return hrec;
592
19.5k
    }
593
594
    // structured line, e.g.
595
    // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
596
    // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
597
102k
    int nopen = 1;
598
379k
    while ( *q && *q!='\n' && nopen>0 )
599
280k
    {
600
280k
        p = ++q;
601
280k
        while ( *q && *q==' ' ) { p++; q++; }
602
        // ^[A-Za-z_][0-9A-Za-z_.]*$
603
280k
        if (p==q && *q && (isalpha_c(*q) || *q=='_'))
604
278k
        {
605
278k
            q++;
606
1.41M
            while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
607
278k
        }
608
280k
        n = q-p;
609
280k
        int m = 0;
610
280k
        while ( *q && *q==' ' ) { q++; m++; }
611
280k
        if ( *q!='=' || !n )
612
3.52k
            goto malformed_line;
613
614
276k
        if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
615
276k
        p = ++q;
616
278k
        while ( *q && *q==' ' ) { p++; q++; }
617
618
276k
        int quoted = 0;
619
276k
        char ending = '\0';
620
276k
        switch (*p) {
621
97.3k
        case '"':
622
97.3k
            quoted = 1;
623
97.3k
            ending = '"';
624
97.3k
            p++;
625
97.3k
            break;
626
343
        case '[':
627
343
            quoted = 1;
628
343
            ending = ']';
629
343
            break;
630
276k
        }
631
276k
        if ( quoted ) q++;
632
674M
        while ( *q && *q != '\n' )
633
674M
        {
634
674M
            if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; }
635
673M
            else
636
673M
            {
637
673M
                if ( *q=='<' ) nopen++;
638
673M
                if ( *q=='>' ) nopen--;
639
673M
                if ( !nopen ) break;
640
673M
                if ( *q==',' && nopen==1 ) break;
641
673M
            }
642
674M
            q++;
643
674M
        }
644
276k
        const char *r = q;
645
276k
        if (quoted && ending == ']') {
646
343
            if (*q == ending) {
647
333
                r++;
648
333
                q++;
649
333
                quoted = 0;
650
333
            } else {
651
10
                char buffer[320];
652
10
                hts_log_error("Missing ']' in header line %s",
653
10
                              hts_strprint(buffer, sizeof(buffer), '"',
654
10
                                           line, q-line));
655
10
                goto fail;
656
10
            }
657
343
        }
658
277k
        while ( r > p && r[-1] == ' ' ) r--;
659
276k
        if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
660
0
            goto fail;
661
276k
        if ( quoted && *q==ending ) q++;
662
276k
        if ( *q=='>' )
663
76.3k
        {
664
76.3k
            if (nopen) nopen--;     // this can happen with nested angle brackets <>
665
76.3k
            q++;
666
76.3k
        }
667
276k
    }
668
99.0k
    if ( nopen )
669
22.6k
        hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]);
670
671
    // Skip to end of line
672
99.0k
    int nonspace = 0;
673
99.0k
    p = q;
674
9.42M
    while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
675
99.0k
    if (nonspace) {
676
807
        char buffer[320];
677
807
        hts_log_warning("Dropped trailing junk from header line '%s'",
678
807
                        hts_strprint(buffer, sizeof(buffer),
679
807
                                     '"', line, q - line));
680
807
    }
681
682
99.0k
    *len = q - line + (*q ? 1 : 0);
683
99.0k
    return hrec;
684
685
10
 fail:
686
10
    *len = -1;
687
10
    bcf_hrec_destroy(hrec);
688
10
    return NULL;
689
690
5.76k
 malformed_line:
691
5.76k
    {
692
5.76k
        char buffer[320];
693
151M
        while ( *q && *q!='\n' ) q++;  // Ensure *len includes full line
694
5.76k
        hts_log_error("Could not parse the header line: %s",
695
5.76k
                      hts_strprint(buffer, sizeof(buffer),
696
5.76k
                                   '"', line, q - line));
697
5.76k
        *len = q - line + (*q ? 1 : 0);
698
5.76k
        bcf_hrec_destroy(hrec);
699
5.76k
        return NULL;
700
102k
    }
701
102k
}
702
703
static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
704
75.9k
{
705
75.9k
    size_t new_n;
706
707
    // If available, preserve existing IDX
708
75.9k
    if ( idinfo->id==-1 )
709
75.5k
        idinfo->id = hdr->n[dict_type];
710
382
    else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
711
5
    {
712
5
        hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
713
5
            idinfo->id, tag);
714
5
        errno = EINVAL;
715
5
        return -1;
716
5
    }
717
718
75.9k
    new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type];
719
75.9k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
720
    // hts_resize() can attempt to allocate up to 2 * requested items
721
75.9k
    if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t)))
722
14
        return -1;
723
75.8k
#endif
724
75.8k
    if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
725
75.8k
                   &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
726
0
        return -1;
727
0
    }
728
75.8k
    hdr->n[dict_type] = new_n;
729
730
    // NB: the next kh_put call can invalidate the idinfo pointer, therefore
731
    // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
732
75.8k
    hdr->id[dict_type][idinfo->id].key = tag;
733
734
75.8k
    return 0;
735
75.8k
}
736
737
// returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
738
static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
739
118k
{
740
    // contig
741
118k
    int i, ret, replacing = 0;
742
118k
    khint_t k;
743
118k
    char *str = NULL;
744
745
118k
    bcf_hrec_set_type(hrec);
746
747
118k
    if ( hrec->type==BCF_HL_CTG )
748
9.82k
    {
749
9.82k
        hts_pos_t len = 0;
750
751
        // Get the contig ID ($str) and length ($j)
752
9.82k
        i = bcf_hrec_find_key(hrec,"length");
753
9.82k
        if ( i<0 ) len = 0;
754
819
        else {
755
819
            char *end = hrec->vals[i];
756
819
            len = strtoll(hrec->vals[i], &end, 10);
757
819
            if (end == hrec->vals[i] || len < 0) return 0;
758
819
        }
759
760
9.38k
        i = bcf_hrec_find_key(hrec,"ID");
761
9.38k
        if ( i<0 ) return 0;
762
8.33k
        str = strdup(hrec->vals[i]);
763
8.33k
        if (!str) return -1;
764
765
        // Register in the dictionary
766
8.33k
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
767
8.33k
        khint_t k = kh_get(vdict, d, str);
768
8.33k
        if ( k != kh_end(d) ) { // already present
769
1.41k
            free(str); str=NULL;
770
1.41k
            if (kh_val(d, k).hrec[0] != NULL) // and not removed
771
1.41k
                return 0;
772
0
            replacing = 1;
773
6.92k
        } else {
774
6.92k
            k = kh_put(vdict, d, str, &ret);
775
6.92k
            if (ret < 0) { free(str); return -1; }
776
6.92k
        }
777
778
6.92k
        int idx = bcf_hrec_find_key(hrec,"IDX");
779
6.92k
        if ( idx!=-1 )
780
964
        {
781
964
            char *tmp = hrec->vals[idx];
782
964
            idx = strtol(hrec->vals[idx], &tmp, 10);
783
964
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
784
823
            {
785
823
                if (!replacing) {
786
823
                    kh_del(vdict, d, k);
787
823
                    free(str);
788
823
                }
789
823
                hts_log_warning("Error parsing the IDX tag, skipping");
790
823
                return 0;
791
823
            }
792
964
        }
793
794
6.09k
        kh_val(d, k) = bcf_idinfo_def;
795
6.09k
        kh_val(d, k).id = idx;
796
6.09k
        kh_val(d, k).info[0] = len;
797
6.09k
        kh_val(d, k).hrec[0] = hrec;
798
6.09k
        if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
799
11
            if (!replacing) {
800
11
                kh_del(vdict, d, k);
801
11
                free(str);
802
11
            }
803
11
            return -1;
804
11
        }
805
6.08k
        if ( idx==-1 ) {
806
5.95k
            if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
807
0
               return -1;
808
0
            }
809
5.95k
        }
810
811
6.08k
        return 1;
812
6.08k
    }
813
814
108k
    if ( hrec->type==BCF_HL_STR ) return 1;
815
100k
    if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0;
816
817
    // INFO/FILTER/FORMAT
818
82.2k
    char *id = NULL;
819
82.2k
    uint32_t type = UINT32_MAX, var = UINT32_MAX;
820
82.2k
    int num = -1, idx = -1;
821
311k
    for (i=0; i<hrec->nkeys; i++)
822
230k
    {
823
230k
        if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
824
154k
        else if ( !strcmp(hrec->keys[i], "IDX") )
825
2.06k
        {
826
2.06k
            char *tmp = hrec->vals[i];
827
2.06k
            idx = strtol(hrec->vals[i], &tmp, 10);
828
2.06k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
829
931
            {
830
931
                hts_log_warning("Error parsing the IDX tag, skipping");
831
931
                return 0;
832
931
            }
833
2.06k
        }
834
152k
        else if ( !strcmp(hrec->keys[i], "Type") )
835
42.3k
        {
836
42.3k
            if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
837
40.7k
            else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
838
39.7k
            else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
839
3.28k
            else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
840
3.07k
            else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
841
1.76k
            else
842
1.76k
            {
843
1.76k
                hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
844
1.76k
                type = BCF_HT_STR;
845
1.76k
            }
846
42.3k
        }
847
110k
        else if ( !strcmp(hrec->keys[i], "Number") )
848
37.9k
        {
849
37.9k
            if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
850
37.7k
            else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
851
37.5k
            else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
852
37.3k
            else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
853
37.1k
            else
854
37.1k
            {
855
37.1k
                sscanf(hrec->vals[i],"%d",&num);
856
37.1k
                var = BCF_VL_FIXED;
857
37.1k
            }
858
37.9k
            if (var != BCF_VL_FIXED) num = 0xfffff;
859
37.9k
        }
860
230k
    }
861
81.3k
    if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
862
47.3k
        if (type == -1) {
863
5.69k
            hts_log_warning("%s %s field has no Type defined. Assuming String",
864
5.69k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
865
5.69k
            type = BCF_HT_STR;
866
5.69k
        }
867
47.3k
        if (var == -1) {
868
9.33k
            hts_log_warning("%s %s field has no Number defined. Assuming '.'",
869
9.33k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
870
9.33k
            var = BCF_VL_VAR;
871
9.33k
        }
872
47.3k
        if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) )
873
522
        {
874
522
            hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id);
875
522
            var = BCF_VL_FIXED;
876
522
            num = 0;
877
522
        }
878
47.3k
    }
879
81.3k
    uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
880
81.3k
                     (var & 0xf) << 8 |
881
81.3k
                     (type & 0xf) << 4 |
882
81.3k
                     (((uint32_t) hrec->type) & 0xf));
883
884
81.3k
    if ( !id ) return 0;
885
76.0k
    str = strdup(id);
886
76.0k
    if (!str) return -1;
887
888
76.0k
    vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
889
76.0k
    k = kh_get(vdict, d, str);
890
76.0k
    if ( k != kh_end(d) )
891
6.19k
    {
892
        // already present
893
6.19k
        free(str);
894
6.19k
        if ( kh_val(d, k).hrec[info&0xf] ) return 0;
895
2.71k
        kh_val(d, k).info[info&0xf] = info;
896
2.71k
        kh_val(d, k).hrec[info&0xf] = hrec;
897
2.71k
        if ( idx==-1 ) {
898
2.68k
            if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
899
0
                return -1;
900
0
            }
901
2.68k
        }
902
2.71k
        return 1;
903
2.71k
    }
904
69.8k
    k = kh_put(vdict, d, str, &ret);
905
69.8k
    if (ret < 0) {
906
0
        free(str);
907
0
        return -1;
908
0
    }
909
69.8k
    kh_val(d, k) = bcf_idinfo_def;
910
69.8k
    kh_val(d, k).info[info&0xf] = info;
911
69.8k
    kh_val(d, k).hrec[info&0xf] = hrec;
912
69.8k
    kh_val(d, k).id = idx;
913
69.8k
    if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
914
8
        kh_del(vdict, d, k);
915
8
        free(str);
916
8
        return -1;
917
8
    }
918
69.8k
    if ( idx==-1 ) {
919
69.5k
        if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
920
0
            return -1;
921
0
        }
922
69.5k
    }
923
924
69.8k
    return 1;
925
69.8k
}
926
927
static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
928
0
{
929
0
    if (hrec->type == BCF_HL_FLT ||
930
0
        hrec->type == BCF_HL_INFO ||
931
0
        hrec->type == BCF_HL_FMT ||
932
0
        hrec->type == BCF_HL_CTG) {
933
0
        int id = bcf_hrec_find_key(hrec, "ID");
934
0
        if (id < 0 || !hrec->vals[id])
935
0
            return;
936
0
        vdict_t *dict = (hrec->type == BCF_HL_CTG
937
0
                         ? (vdict_t*)hdr->dict[BCF_DT_CTG]
938
0
                         : (vdict_t*)hdr->dict[BCF_DT_ID]);
939
0
        khint_t k = kh_get(vdict, dict, hrec->vals[id]);
940
0
        if (k != kh_end(dict))
941
0
            kh_val(dict, k).hrec[hrec->type==BCF_HL_CTG ? 0 : hrec->type] = NULL;
942
0
    }
943
0
}
944
945
static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
946
0
{
947
0
    kstring_t str = KS_INITIALIZE;
948
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
949
0
    khint_t k;
950
0
    int id;
951
952
0
    switch (hrec->type) {
953
0
    case BCF_HL_GEN:
954
0
        if (ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0)
955
0
            str.l = 0;
956
0
        break;
957
0
    case BCF_HL_STR:
958
0
        id = bcf_hrec_find_key(hrec, "ID");
959
0
        if (id < 0)
960
0
            return;
961
0
        if (!hrec->vals[id] ||
962
0
            ksprintf(&str, "##%s=<ID=%s>", hrec->key, hrec->vals[id]) < 0)
963
0
            str.l = 0;
964
0
        break;
965
0
    default:
966
0
        return;
967
0
    }
968
0
    if (str.l) {
969
0
        k = kh_get(hdict, aux->gen, str.s);
970
0
    } else {
971
        // Couldn't get a string for some reason, so try the hard way...
972
0
        for (k = kh_begin(aux->gen); k < kh_end(aux->gen); k++) {
973
0
            if (kh_exist(aux->gen, k) && kh_val(aux->gen, k) == hrec)
974
0
                break;
975
0
        }
976
0
    }
977
0
    if (k != kh_end(aux->gen) && kh_val(aux->gen, k) == hrec) {
978
0
        kh_val(aux->gen, k) = NULL;
979
0
        free((char *) kh_key(aux->gen, k));
980
0
        kh_key(aux->gen, k) = NULL;
981
0
        kh_del(hdict, aux->gen, k);
982
0
    }
983
0
    free(str.s);
984
0
}
985
986
int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp)
987
0
{
988
    // currently only for bcf_hdr_set_version
989
0
    assert( hrec->type==BCF_HL_GEN );
990
0
    int ret;
991
0
    khint_t k;
992
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
993
0
    for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
994
0
    {
995
0
        if ( !kh_exist(aux->gen,k) ) continue;
996
0
        if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue;
997
0
        break;
998
0
    }
999
0
    assert( k<kh_end(aux->gen) );   // something went wrong, should never happen
1000
0
    free((char*)kh_key(aux->gen,k));
1001
0
    kh_del(hdict,aux->gen,k);
1002
0
    kstring_t str = {0,0,0};
1003
0
    if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 )
1004
0
    {
1005
0
        free(str.s);
1006
0
        return -1;
1007
0
    }
1008
0
    k = kh_put(hdict, aux->gen, str.s, &ret);
1009
0
    if ( ret<0 )
1010
0
    {
1011
0
        free(str.s);
1012
0
        return -1;
1013
0
    }
1014
0
    free(hrec->value);
1015
0
    hrec->value = strdup(tmp->value);
1016
0
    if ( !hrec->value ) return -1;
1017
0
    return 0;
1018
0
}
1019
1020
int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1021
119k
{
1022
119k
    kstring_t str = {0,0,0};
1023
119k
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1024
1025
119k
    int res;
1026
119k
    if ( !hrec ) return 0;
1027
1028
118k
    bcf_hrec_check(hrec);   // todo: check return status and propagate errors up
1029
1030
118k
    res = bcf_hdr_register_hrec(hdr,hrec);
1031
118k
    if (res < 0) return -1;
1032
118k
    if ( !res )
1033
31.4k
    {
1034
        // If one of the hashed field, then it is already present
1035
31.4k
        if ( hrec->type != BCF_HL_GEN )
1036
13.4k
        {
1037
13.4k
            bcf_hrec_destroy(hrec);
1038
13.4k
            return 0;
1039
13.4k
        }
1040
1041
        // Is one of the generic fields and already present?
1042
18.0k
        if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 )
1043
0
        {
1044
0
            free(str.s);
1045
0
            return -1;
1046
0
        }
1047
18.0k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1048
18.0k
        if ( k != kh_end(aux->gen) )
1049
7.75k
        {
1050
            // duplicate record
1051
7.75k
            bcf_hrec_destroy(hrec);
1052
7.75k
            free(str.s);
1053
7.75k
            return 0;
1054
7.75k
        }
1055
18.0k
    }
1056
1057
97.3k
    int i;
1058
97.3k
    if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 )
1059
2.27k
    {
1060
2.27k
        if ( ksprintf(&str, "##%s=<ID=%s>", hrec->key,hrec->vals[i]) < 0 )
1061
0
        {
1062
0
            free(str.s);
1063
0
            return -1;
1064
0
        }
1065
2.27k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1066
2.27k
        if ( k != kh_end(aux->gen) )
1067
1.47k
        {
1068
            // duplicate record
1069
1.47k
            bcf_hrec_destroy(hrec);
1070
1.47k
            free(str.s);
1071
1.47k
            return 0;
1072
1.47k
        }
1073
2.27k
    }
1074
1075
    // New record, needs to be added
1076
95.8k
    int n = hdr->nhrec + 1;
1077
95.8k
    bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
1078
95.8k
    if (!new_hrec) {
1079
0
        free(str.s);
1080
0
        bcf_hdr_unregister_hrec(hdr, hrec);
1081
0
        return -1;
1082
0
    }
1083
95.8k
    hdr->hrec = new_hrec;
1084
1085
95.8k
    if ( str.s )
1086
11.0k
    {
1087
11.0k
        khint_t k = kh_put(hdict, aux->gen, str.s, &res);
1088
11.0k
        if ( res<0 )
1089
0
        {
1090
0
            free(str.s);
1091
0
            return -1;
1092
0
        }
1093
11.0k
        kh_val(aux->gen,k) = hrec;
1094
11.0k
    }
1095
1096
95.8k
    hdr->hrec[hdr->nhrec] = hrec;
1097
95.8k
    hdr->dirty = 1;
1098
95.8k
    hdr->nhrec = n;
1099
1100
95.8k
    return hrec->type==BCF_HL_GEN ? 0 : 1;
1101
95.8k
}
1102
1103
bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
1104
0
{
1105
0
    int i;
1106
0
    if ( type==BCF_HL_GEN )
1107
0
    {
1108
        // e.g. ##fileformat=VCFv4.2
1109
        //      ##source=GenomicsDBImport
1110
        //      ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364
1111
0
        if ( value )
1112
0
        {
1113
0
            kstring_t str = {0,0,0};
1114
0
            ksprintf(&str, "##%s=%s", key,value);
1115
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1116
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1117
0
            free(str.s);
1118
0
            if ( k == kh_end(aux->gen) ) return NULL;
1119
0
            return kh_val(aux->gen, k);
1120
0
        }
1121
0
        for (i=0; i<hdr->nhrec; i++)
1122
0
        {
1123
0
            if ( hdr->hrec[i]->type!=type ) continue;
1124
0
            if ( strcmp(hdr->hrec[i]->key,key) ) continue;
1125
0
            return hdr->hrec[i];
1126
0
        }
1127
0
        return NULL;
1128
0
    }
1129
0
    else if ( type==BCF_HL_STR )
1130
0
    {
1131
        // e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport....">
1132
        //      ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
1133
0
        if (!str_class) return NULL;
1134
0
        if ( !strcmp("ID",key) )
1135
0
        {
1136
0
            kstring_t str = {0,0,0};
1137
0
            ksprintf(&str, "##%s=<%s=%s>",str_class,key,value);
1138
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1139
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1140
0
            free(str.s);
1141
0
            if ( k == kh_end(aux->gen) ) return NULL;
1142
0
            return kh_val(aux->gen, k);
1143
0
        }
1144
0
        for (i=0; i<hdr->nhrec; i++)
1145
0
        {
1146
0
            if ( hdr->hrec[i]->type!=type ) continue;
1147
0
            if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
1148
0
            int j = bcf_hrec_find_key(hdr->hrec[i],key);
1149
0
            if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
1150
0
        }
1151
0
        return NULL;
1152
0
    }
1153
0
    vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1154
0
    khint_t k = kh_get(vdict, d, value);
1155
0
    if ( k == kh_end(d) ) return NULL;
1156
0
    return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
1157
0
}
1158
1159
void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
1160
5.08k
{
1161
5.08k
    static int PL_warned = 0, GL_warned = 0;
1162
1163
5.08k
    if ( !PL_warned )
1164
5.08k
    {
1165
5.08k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
1166
5.08k
        if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
1167
0
        {
1168
0
            hts_log_warning("PL should be declared as Number=G");
1169
0
            PL_warned = 1;
1170
0
        }
1171
5.08k
    }
1172
5.08k
    if ( !GL_warned )
1173
5.08k
    {
1174
5.08k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GL");
1175
5.08k
        if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
1176
0
        {
1177
0
            hts_log_warning("GL should be declared as Number=G");
1178
0
            GL_warned = 1;
1179
0
        }
1180
5.08k
    }
1181
5.08k
}
1182
1183
int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
1184
5.99k
{
1185
5.99k
    int len, done = 0;
1186
5.99k
    char *p = htxt;
1187
1188
    // Check sanity: "fileformat" string must come as first
1189
5.99k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
1190
5.99k
    if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
1191
643
        hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
1192
5.99k
    if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1193
0
        bcf_hrec_destroy(hrec);
1194
0
        return -1;
1195
0
    }
1196
1197
    // The filter PASS must appear first in the dictionary
1198
5.99k
    hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
1199
5.99k
    if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) {
1200
0
        bcf_hrec_destroy(hrec);
1201
0
        return -1;
1202
0
    }
1203
1204
    // Parse the whole header
1205
13.1k
    do {
1206
50.3k
        while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) {
1207
37.1k
            if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1208
19
                bcf_hrec_destroy(hrec);
1209
19
                return -1;
1210
19
            }
1211
37.1k
            p += len;
1212
37.1k
        }
1213
13.1k
        assert(hrec == NULL);
1214
13.1k
        if (len < 0) {
1215
            // len < 0 indicates out-of-memory, or similar error
1216
3
            hts_log_error("Could not parse header line: %s", strerror(errno));
1217
3
            return -1;
1218
13.1k
        } else if (len > 0) {
1219
            // Bad header line.  bcf_hdr_parse_line() will have logged it.
1220
            // Skip and try again on the next line (p + len will be the start
1221
            // of the next one).
1222
5.66k
            p += len;
1223
5.66k
            continue;
1224
5.66k
        }
1225
1226
        // Next should be the sample line.  If not, it was a malformed
1227
        // header, in which case print a warning and skip (many VCF
1228
        // operations do not really care about a few malformed lines).
1229
        // In the future we may want to add a strict mode that errors in
1230
        // this case.
1231
7.50k
        if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) {
1232
2.36k
            char *eol = strchr(p, '\n');
1233
2.36k
            if (*p != '\0') {
1234
1.56k
                char buffer[320];
1235
1.56k
                hts_log_warning("Could not parse header line: %s",
1236
1.56k
                                hts_strprint(buffer, sizeof(buffer),
1237
1.56k
                                               '"', p,
1238
1.56k
                                               eol ? (eol - p) : SIZE_MAX));
1239
1.56k
            }
1240
2.36k
            if (eol) {
1241
1.54k
                p = eol + 1; // Try from the next line.
1242
1.54k
            } else {
1243
822
                done = -1; // No more lines left, give up.
1244
822
            }
1245
5.14k
        } else {
1246
5.14k
            done = 1; // Sample line found
1247
5.14k
        }
1248
13.1k
    } while (!done);
1249
1250
5.96k
    if (done < 0) {
1251
        // No sample line is fatal.
1252
822
        hts_log_error("Could not parse the header, sample line not found");
1253
822
        return -1;
1254
822
    }
1255
1256
5.14k
    if (bcf_hdr_parse_sample_line(hdr,p) < 0)
1257
61
        return -1;
1258
5.08k
    if (bcf_hdr_sync(hdr) < 0)
1259
0
        return -1;
1260
5.08k
    bcf_hdr_check_sanity(hdr);
1261
5.08k
    return 0;
1262
5.08k
}
1263
1264
int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
1265
0
{
1266
0
    int len;
1267
0
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
1268
0
    if ( !hrec ) return -1;
1269
0
    if (bcf_hdr_add_hrec(hdr, hrec) < 0)
1270
0
        return -1;
1271
0
    return 0;
1272
0
}
1273
1274
void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
1275
0
{
1276
0
    int i = 0;
1277
0
    bcf_hrec_t *hrec;
1278
0
    if ( !key )
1279
0
    {
1280
        // no key, remove all entries of this type
1281
0
        while ( i<hdr->nhrec )
1282
0
        {
1283
0
            if ( hdr->hrec[i]->type!=type ) { i++; continue; }
1284
0
            hrec = hdr->hrec[i];
1285
0
            bcf_hdr_unregister_hrec(hdr, hrec);
1286
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1287
0
            hdr->dirty = 1;
1288
0
            hdr->nhrec--;
1289
0
            if ( i < hdr->nhrec )
1290
0
                memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1291
0
            bcf_hrec_destroy(hrec);
1292
0
        }
1293
0
        return;
1294
0
    }
1295
0
    while (1)
1296
0
    {
1297
0
        if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
1298
0
        {
1299
0
            hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
1300
0
            if ( !hrec ) return;
1301
1302
0
            for (i=0; i<hdr->nhrec; i++)
1303
0
                if ( hdr->hrec[i]==hrec ) break;
1304
0
            assert( i<hdr->nhrec );
1305
1306
0
            vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1307
0
            khint_t k = kh_get(vdict, d, key);
1308
0
            kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
1309
0
        }
1310
0
        else
1311
0
        {
1312
0
            for (i=0; i<hdr->nhrec; i++)
1313
0
            {
1314
0
                if ( hdr->hrec[i]->type!=type ) continue;
1315
0
                if ( type==BCF_HL_GEN )
1316
0
                {
1317
0
                    if ( !strcmp(hdr->hrec[i]->key,key) ) break;
1318
0
                }
1319
0
                else
1320
0
                {
1321
                    // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
1322
0
                    int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
1323
0
                    if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
1324
0
                }
1325
0
            }
1326
0
            if ( i==hdr->nhrec ) return;
1327
0
            hrec = hdr->hrec[i];
1328
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1329
0
        }
1330
1331
0
        hdr->nhrec--;
1332
0
        if ( i < hdr->nhrec )
1333
0
            memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1334
0
        bcf_hrec_destroy(hrec);
1335
0
        hdr->dirty = 1;
1336
0
    }
1337
0
}
1338
1339
int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
1340
0
{
1341
0
    char tmp[256], *line = tmp;
1342
0
    va_list ap;
1343
0
    va_start(ap, fmt);
1344
0
    int n = vsnprintf(line, sizeof(tmp), fmt, ap);
1345
0
    va_end(ap);
1346
1347
0
    if (n >= sizeof(tmp)) {
1348
0
        n++; // For trailing NUL
1349
0
        line = (char*)malloc(n);
1350
0
        if (!line)
1351
0
            return -1;
1352
1353
0
        va_start(ap, fmt);
1354
0
        vsnprintf(line, n, fmt, ap);
1355
0
        va_end(ap);
1356
0
    }
1357
1358
0
    int ret = bcf_hdr_append(hdr, line);
1359
1360
0
    if (line != tmp) free(line);
1361
0
    return ret;
1362
0
}
1363
1364
1365
/**********************
1366
 *** BCF header I/O ***
1367
 **********************/
1368
1369
const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
1370
0
{
1371
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1372
0
    if ( !hrec )
1373
0
    {
1374
0
        hts_log_warning("No version string found, assuming VCFv4.2");
1375
0
        return "VCFv4.2";
1376
0
    }
1377
0
    return hrec->value;
1378
0
}
1379
1380
int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
1381
0
{
1382
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1383
0
    if ( !hrec )
1384
0
    {
1385
0
        int len;
1386
0
        kstring_t str = {0,0,0};
1387
0
        if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1;
1388
0
        hrec = bcf_hdr_parse_line(hdr, str.s, &len);
1389
0
        free(str.s);
1390
0
    }
1391
0
    else
1392
0
    {
1393
0
        bcf_hrec_t *tmp = bcf_hrec_dup(hrec);
1394
0
        if ( !tmp ) return -1;
1395
0
        free(tmp->value);
1396
0
        tmp->value = strdup(version);
1397
0
        if ( !tmp->value ) return -1;
1398
0
        bcf_hdr_update_hrec(hdr, hrec, tmp);
1399
0
        bcf_hrec_destroy(tmp);
1400
0
    }
1401
0
    hdr->dirty = 1;
1402
0
    return 0; // FIXME: check for errs in this function (return < 0 if so)
1403
0
}
1404
1405
bcf_hdr_t *bcf_hdr_init(const char *mode)
1406
6.03k
{
1407
6.03k
    int i;
1408
6.03k
    bcf_hdr_t *h;
1409
6.03k
    h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
1410
6.03k
    if (!h) return NULL;
1411
24.1k
    for (i = 0; i < 3; ++i) {
1412
18.0k
        if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
1413
        // Supersize the hash to make collisions very unlikely
1414
18.0k
        static int dsize[3] = {16384,16384,2048}; // info, contig, format
1415
18.0k
        if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail;
1416
18.0k
    }
1417
1418
6.03k
    bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
1419
6.03k
    if ( !aux ) goto fail;
1420
6.03k
    if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
1421
6.03k
    aux->key_len = NULL;
1422
6.03k
    aux->dict = *((vdict_t*)h->dict[0]);
1423
6.03k
    free(h->dict[0]);
1424
6.03k
    h->dict[0] = aux;
1425
1426
6.03k
    if ( strchr(mode,'w') )
1427
0
    {
1428
0
        bcf_hdr_append(h, "##fileformat=VCFv4.2");
1429
        // The filter PASS must appear first in the dictionary
1430
0
        bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1431
0
    }
1432
6.03k
    return h;
1433
1434
0
 fail:
1435
0
    for (i = 0; i < 3; ++i)
1436
0
        kh_destroy(vdict, h->dict[i]);
1437
0
    free(h);
1438
0
    return NULL;
1439
6.03k
}
1440
1441
void bcf_hdr_destroy(bcf_hdr_t *h)
1442
6.03k
{
1443
6.03k
    int i;
1444
6.03k
    khint_t k;
1445
6.03k
    if (!h) return;
1446
24.1k
    for (i = 0; i < 3; ++i) {
1447
18.0k
        vdict_t *d = (vdict_t*)h->dict[i];
1448
18.0k
        if (d == 0) continue;
1449
210M
        for (k = kh_begin(d); k != kh_end(d); ++k)
1450
210M
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
1451
18.0k
        if ( i==0 )
1452
6.03k
        {
1453
6.03k
            bcf_hdr_aux_t *aux = get_hdr_aux(h);
1454
34.8k
            for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1455
28.7k
                if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
1456
6.03k
            kh_destroy(hdict, aux->gen);
1457
6.03k
            free(aux->key_len); // may exist for dict[0] only
1458
6.03k
        }
1459
18.0k
        kh_destroy(vdict, d);
1460
18.0k
        free(h->id[i]);
1461
18.0k
    }
1462
101k
    for (i=0; i<h->nhrec; i++)
1463
95.8k
        bcf_hrec_destroy(h->hrec[i]);
1464
6.03k
    if (h->nhrec) free(h->hrec);
1465
6.03k
    if (h->samples) free(h->samples);
1466
6.03k
    free(h->keep_samples);
1467
6.03k
    free(h->transl[0]); free(h->transl[1]);
1468
6.03k
    free(h->mem.s);
1469
6.03k
    free(h);
1470
6.03k
}
1471
1472
bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
1473
6.03k
{
1474
6.03k
    if (hfp->format.format == vcf)
1475
5.38k
        return vcf_hdr_read(hfp);
1476
649
    if (hfp->format.format != bcf) {
1477
0
        hts_log_error("Input is not detected as bcf or vcf format");
1478
0
        return NULL;
1479
0
    }
1480
1481
649
    assert(hfp->is_bgzf);
1482
1483
649
    BGZF *fp = hfp->fp.bgzf;
1484
649
    uint8_t magic[5];
1485
649
    bcf_hdr_t *h;
1486
649
    h = bcf_hdr_init("r");
1487
649
    if (!h) {
1488
0
        hts_log_error("Failed to allocate bcf header");
1489
0
        return NULL;
1490
0
    }
1491
649
    if (bgzf_read(fp, magic, 5) != 5)
1492
0
    {
1493
0
        hts_log_error("Failed to read the header (reading BCF in text mode?)");
1494
0
        bcf_hdr_destroy(h);
1495
0
        return NULL;
1496
0
    }
1497
649
    if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
1498
0
    {
1499
0
        if (!strncmp((char*)magic, "BCF", 3))
1500
0
            hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
1501
0
        else
1502
0
            hts_log_error("Invalid BCF2 magic string");
1503
0
        bcf_hdr_destroy(h);
1504
0
        return NULL;
1505
0
    }
1506
649
    uint8_t buf[4];
1507
649
    size_t hlen;
1508
649
    char *htxt = NULL;
1509
649
    if (bgzf_read(fp, buf, 4) != 4) goto fail;
1510
649
    hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24);
1511
649
    if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
1512
649
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1513
649
    if (hlen > FUZZ_ALLOC_LIMIT) { errno = ENOMEM; goto fail; }
1514
649
#endif
1515
649
    htxt = (char*)malloc(hlen + 1);
1516
649
    if (!htxt) goto fail;
1517
649
    if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
1518
643
    htxt[hlen] = '\0'; // Ensure htxt is terminated
1519
643
    if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail;
1520
616
    free(htxt);
1521
616
    return h;
1522
33
 fail:
1523
33
    hts_log_error("Failed to read BCF header");
1524
33
    free(htxt);
1525
33
    bcf_hdr_destroy(h);
1526
33
    return NULL;
1527
643
}
1528
1529
int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
1530
5.08k
{
1531
5.08k
    if (!h) {
1532
0
        errno = EINVAL;
1533
0
        return -1;
1534
0
    }
1535
5.08k
    if ( h->dirty ) {
1536
0
        if (bcf_hdr_sync(h) < 0) return -1;
1537
0
    }
1538
5.08k
    hfp->format.category = variant_data;
1539
5.08k
    if (hfp->format.format == vcf || hfp->format.format == text_format) {
1540
5.08k
        hfp->format.format = vcf;
1541
5.08k
        return vcf_hdr_write(hfp, h);
1542
5.08k
    }
1543
1544
0
    if (hfp->format.format == binary_format)
1545
0
        hfp->format.format = bcf;
1546
1547
0
    kstring_t htxt = {0,0,0};
1548
0
    if (bcf_hdr_format(h, 1, &htxt) < 0) {
1549
0
        free(htxt.s);
1550
0
        return -1;
1551
0
    }
1552
0
    kputc('\0', &htxt); // include the \0 byte
1553
1554
0
    BGZF *fp = hfp->fp.bgzf;
1555
0
    if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
1556
0
    uint8_t hlen[4];
1557
0
    u32_to_le(htxt.l, hlen);
1558
0
    if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
1559
0
    if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
1560
1561
0
    free(htxt.s);
1562
0
    return 0;
1563
0
}
1564
1565
/********************
1566
 *** BCF site I/O ***
1567
 ********************/
1568
1569
bcf1_t *bcf_init()
1570
5.08k
{
1571
5.08k
    bcf1_t *v;
1572
5.08k
    v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
1573
5.08k
    return v;
1574
5.08k
}
1575
1576
void bcf_clear(bcf1_t *v)
1577
32.1k
{
1578
32.1k
    int i;
1579
32.1k
    for (i=0; i<v->d.m_info; i++)
1580
0
    {
1581
0
        if ( v->d.info[i].vptr_free )
1582
0
        {
1583
0
            free(v->d.info[i].vptr - v->d.info[i].vptr_off);
1584
0
            v->d.info[i].vptr_free = 0;
1585
0
        }
1586
0
    }
1587
32.1k
    for (i=0; i<v->d.m_fmt; i++)
1588
0
    {
1589
0
        if ( v->d.fmt[i].p_free )
1590
0
        {
1591
0
            free(v->d.fmt[i].p - v->d.fmt[i].p_off);
1592
0
            v->d.fmt[i].p_free = 0;
1593
0
        }
1594
0
    }
1595
32.1k
    v->rid = v->pos = v->rlen = v->unpacked = 0;
1596
32.1k
    bcf_float_set_missing(v->qual);
1597
32.1k
    v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
1598
32.1k
    v->shared.l = v->indiv.l = 0;
1599
32.1k
    v->d.var_type = -1;
1600
32.1k
    v->d.shared_dirty = 0;
1601
32.1k
    v->d.indiv_dirty  = 0;
1602
32.1k
    v->d.n_flt = 0;
1603
32.1k
    v->errcode = 0;
1604
32.1k
    if (v->d.m_als) v->d.als[0] = 0;
1605
32.1k
    if (v->d.m_id) v->d.id[0] = 0;
1606
32.1k
}
1607
1608
void bcf_empty(bcf1_t *v)
1609
5.08k
{
1610
5.08k
    bcf_clear1(v);
1611
5.08k
    free(v->d.id);
1612
5.08k
    free(v->d.als);
1613
5.08k
    free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
1614
5.08k
    if (v->d.var ) free(v->d.var);
1615
5.08k
    free(v->shared.s); free(v->indiv.s);
1616
5.08k
    memset(&v->d,0,sizeof(v->d));
1617
5.08k
    memset(&v->shared,0,sizeof(v->shared));
1618
5.08k
    memset(&v->indiv,0,sizeof(v->indiv));
1619
5.08k
}
1620
1621
void bcf_destroy(bcf1_t *v)
1622
5.08k
{
1623
5.08k
    if (!v) return;
1624
5.08k
    bcf_empty1(v);
1625
5.08k
    free(v);
1626
5.08k
}
1627
1628
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
1629
2.71k
{
1630
2.71k
    uint8_t x[32];
1631
2.71k
    ssize_t ret;
1632
2.71k
    uint32_t shared_len, indiv_len;
1633
2.71k
    if ((ret = bgzf_read(fp, x, 32)) != 32) {
1634
11
        if (ret == 0) return -1;
1635
10
        return -2;
1636
11
    }
1637
2.70k
    bcf_clear1(v);
1638
2.70k
    shared_len = le_to_u32(x);
1639
2.70k
    if (shared_len < 24) return -2;
1640
2.69k
    shared_len -= 24; // to exclude six 32-bit integers
1641
2.69k
    indiv_len = le_to_u32(x + 4);
1642
2.69k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1643
    // ks_resize() normally allocates 1.5 * requested size to allow for growth
1644
2.69k
    if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2;
1645
2.68k
#endif
1646
2.68k
    if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2;
1647
2.68k
    if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2;
1648
2.68k
    v->rid  = le_to_i32(x + 8);
1649
2.68k
    v->pos  = le_to_u32(x + 12);
1650
2.68k
    if ( v->pos==UINT32_MAX ) v->pos = -1;  // this is for telomere coordinate, e.g. MT:0
1651
2.68k
    v->rlen = le_to_i32(x + 16);
1652
2.68k
    v->qual = le_to_float(x + 20);
1653
2.68k
    v->n_info = le_to_u16(x + 24);
1654
2.68k
    v->n_allele = le_to_u16(x + 26);
1655
2.68k
    v->n_sample = le_to_u32(x + 28) & 0xffffff;
1656
2.68k
    v->n_fmt = x[31];
1657
2.68k
    v->shared.l = shared_len;
1658
2.68k
    v->indiv.l = indiv_len;
1659
    // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
1660
2.68k
    if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
1661
1662
2.68k
    if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
1663
2.56k
    if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
1664
2.55k
    return 0;
1665
2.56k
}
1666
1667
0
#define bit_array_size(n) ((n)/8+1)
1668
0
#define bit_array_set(a,i)   ((a)[(i)/8] |=   1 << ((i)%8))
1669
0
#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1670
0
#define bit_array_test(a,i)  ((a)[(i)/8] &   (1 << ((i)%8)))
1671
1672
static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1673
9.98k
                                   int32_t *val) {
1674
9.98k
    uint32_t t;
1675
9.98k
    if (end - p < 2) return -1;
1676
9.95k
    t = *p++ & 0xf;
1677
    /* Use if .. else if ... else instead of switch to force order.  Assumption
1678
       is that small integers are more frequent than big ones. */
1679
9.95k
    if (t == BCF_BT_INT8) {
1680
4.10k
        *val = *(int8_t *) p++;
1681
5.84k
    } else {
1682
5.84k
        if (end - p < (1<<bcf_type_shift[t])) return -1;
1683
5.83k
        if (t == BCF_BT_INT16) {
1684
2.78k
            *val = le_to_i16(p);
1685
2.78k
            p += 2;
1686
3.04k
        } else if (t == BCF_BT_INT32) {
1687
2.84k
            *val = le_to_i32(p);
1688
2.84k
            p += 4;
1689
#ifdef VCF_ALLOW_INT64
1690
        } else if (t == BCF_BT_INT64) {
1691
            // This case should never happen because there should be no
1692
            // 64-bit BCFs at all, definitely not coming from htslib
1693
            *val = le_to_i64(p);
1694
            p += 8;
1695
#endif
1696
2.84k
        } else {
1697
199
            return -1;
1698
199
        }
1699
5.83k
    }
1700
9.74k
    *q = p;
1701
9.74k
    return 0;
1702
9.95k
}
1703
1704
static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1705
24.6k
                             int *num, int *type) {
1706
24.6k
    int r;
1707
24.6k
    if (p >= end) return -1;
1708
24.6k
    *type = *p & 0xf;
1709
24.6k
    if (*p>>4 != 15) {
1710
24.0k
        *q = p + 1;
1711
24.0k
        *num = *p >> 4;
1712
24.0k
        return 0;
1713
24.0k
    }
1714
623
    r = bcf_dec_typed_int1_safe(p + 1, end, q, num);
1715
623
    if (r) return r;
1716
552
    return *num >= 0 ? 0 : -1;
1717
623
}
1718
1719
670
static const char *get_type_name(int type) {
1720
670
    const char *types[9] = {
1721
670
        "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
1722
670
        "unknown", "float", "unknown", "char", "unknown"
1723
670
    };
1724
670
    int t = (type >= 0 && type < 8) ? type : 8;
1725
670
    return types[t];
1726
670
}
1727
1728
static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
1729
1.76k
                                 char *type, uint32_t *reports, int i) {
1730
1.76k
    if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG)
1731
107
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
1732
1.76k
                        ": Invalid FORMAT %s %d",
1733
1.76k
                        bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
1734
1.76k
    (*reports)++;
1735
1.76k
}
1736
1737
2.55k
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
1738
2.55k
    uint8_t *ptr, *end;
1739
2.55k
    size_t bytes;
1740
2.55k
    uint32_t err = 0;
1741
2.55k
    int type = 0;
1742
2.55k
    int num  = 0;
1743
2.55k
    int reflen = 0;
1744
2.55k
    uint32_t i, reports;
1745
2.55k
    const uint32_t is_integer = ((1 << BCF_BT_INT8)  |
1746
2.55k
                                 (1 << BCF_BT_INT16) |
1747
#ifdef VCF_ALLOW_INT64
1748
                                 (1 << BCF_BT_INT64) |
1749
#endif
1750
2.55k
                                 (1 << BCF_BT_INT32));
1751
2.55k
    const uint32_t is_valid_type = (is_integer          |
1752
2.55k
                                    (1 << BCF_BT_NULL)  |
1753
2.55k
                                    (1 << BCF_BT_FLOAT) |
1754
2.55k
                                    (1 << BCF_BT_CHAR));
1755
2.55k
    int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
1756
1757
    // Check for valid contig ID
1758
2.55k
    if (rec->rid < 0
1759
2.55k
        || (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
1760
2.39k
                    || hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
1761
423
        hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
1762
423
        err |= BCF_ERR_CTG_INVALID;
1763
423
    }
1764
1765
    // Check ID
1766
2.55k
    ptr = (uint8_t *) rec->shared.s;
1767
2.55k
    end = ptr + rec->shared.l;
1768
2.55k
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1769
2.54k
    if (type != BCF_BT_CHAR) {
1770
405
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
1771
405
        err |= BCF_ERR_TAG_INVALID;
1772
405
    }
1773
2.54k
    bytes = (size_t) num << bcf_type_shift[type];
1774
2.54k
    if (end - ptr < bytes) goto bad_shared;
1775
2.53k
    ptr += bytes;
1776
1777
    // Check REF and ALT
1778
2.53k
    if (rec->n_allele < 1) {
1779
276
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele",
1780
276
                        bcf_seqname_safe(hdr,rec), rec->pos+1);
1781
276
        err |= BCF_ERR_TAG_UNDEF;
1782
276
    }
1783
1784
2.53k
    reports = 0;
1785
12.9k
    for (i = 0; i < rec->n_allele; i++) {
1786
10.4k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1787
10.4k
        if (type != BCF_BT_CHAR) {
1788
7.55k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1789
135
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
1790
7.55k
            err |= BCF_ERR_CHAR;
1791
7.55k
        }
1792
10.4k
        if (i == 0) reflen = num;
1793
10.4k
        bytes = (size_t) num << bcf_type_shift[type];
1794
10.4k
        if (end - ptr < bytes) goto bad_shared;
1795
10.3k
        ptr += bytes;
1796
10.3k
    }
1797
1798
    // Check FILTER
1799
2.45k
    reports = 0;
1800
2.45k
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1801
2.44k
    if (num > 0) {
1802
237
        bytes = (size_t) num << bcf_type_shift[type];
1803
237
        if (((1 << type) & is_integer) == 0) {
1804
117
            hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
1805
117
            err |= BCF_ERR_TAG_INVALID;
1806
117
            if (end - ptr < bytes) goto bad_shared;
1807
103
            ptr += bytes;
1808
120
        } else {
1809
120
            if (end - ptr < bytes) goto bad_shared;
1810
3.95k
            for (i = 0; i < num; i++) {
1811
3.84k
                int32_t key = bcf_dec_int1(ptr, type, &ptr);
1812
3.84k
                if (key < 0
1813
3.84k
                    || (hdr && (key >= max_id
1814
3.41k
                                || hdr->id[BCF_DT_ID][key].key == NULL))) {
1815
3.41k
                    if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1816
117
                        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
1817
3.41k
                    err |= BCF_ERR_TAG_UNDEF;
1818
3.41k
                }
1819
3.84k
            }
1820
118
        }
1821
237
    }
1822
1823
    // Check INFO
1824
2.43k
    reports = 0;
1825
2.43k
    bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
1826
9.84k
    for (i = 0; i < rec->n_info; i++) {
1827
7.60k
        int32_t key = -1;
1828
7.60k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
1829
7.47k
        if (key < 0 || (hdr && (key >= max_id
1830
7.16k
                                || id_tmp[key].key == NULL))) {
1831
7.16k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1832
163
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
1833
7.16k
            err |= BCF_ERR_TAG_UNDEF;
1834
7.16k
        }
1835
7.47k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1836
7.44k
        if (((1 << type) & is_valid_type) == 0
1837
7.44k
            || (type == BCF_BT_NULL && num > 0)) {
1838
1.14k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1839
13
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
1840
1.14k
            err |= BCF_ERR_TAG_INVALID;
1841
1.14k
        }
1842
7.44k
        bytes = (size_t) num << bcf_type_shift[type];
1843
7.44k
        if (end - ptr < bytes) goto bad_shared;
1844
7.41k
        ptr += bytes;
1845
7.41k
    }
1846
1847
    // Check FORMAT and individual information
1848
2.23k
    ptr = (uint8_t *) rec->indiv.s;
1849
2.23k
    end = ptr + rec->indiv.l;
1850
2.23k
    reports = 0;
1851
3.88k
    for (i = 0; i < rec->n_fmt; i++) {
1852
1.76k
        int32_t key = -1;
1853
1.76k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
1854
1.71k
        if (key < 0
1855
1.71k
            || (hdr && (key >= max_id
1856
1.61k
                        || id_tmp[key].key == NULL))) {
1857
1.61k
            bcf_record_check_err(hdr, rec, "id", &reports, key);
1858
1.61k
            err |= BCF_ERR_TAG_UNDEF;
1859
1.61k
        }
1860
1.71k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
1861
1.69k
        if (((1 << type) & is_valid_type) == 0
1862
1.69k
            || (type == BCF_BT_NULL && num > 0)) {
1863
157
            bcf_record_check_err(hdr, rec, "type", &reports, type);
1864
157
            err |= BCF_ERR_TAG_INVALID;
1865
157
        }
1866
1.69k
        bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
1867
1.69k
        if (end - ptr < bytes) goto bad_indiv;
1868
1.64k
        ptr += bytes;
1869
1.64k
    }
1870
1871
2.12k
    if (!err && rec->rlen < 0) {
1872
        // Treat bad rlen as a warning instead of an error, and try to
1873
        // fix up by using the length of the stored REF allele.
1874
1.26k
        static int warned = 0;
1875
1.26k
        if (!warned) {
1876
1
            hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
1877
1
                            "Only one invalid RLEN will be reported.",
1878
1
                            bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
1879
1
            warned = 1;
1880
1
        }
1881
1.26k
        rec->rlen = reflen >= 0 ? reflen : 0;
1882
1.26k
    }
1883
1884
2.12k
    rec->errcode |= err;
1885
1886
2.12k
    return err ? -2 : 0; // Return -2 so bcf_read() reports an error
1887
1888
311
 bad_shared:
1889
311
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
1890
311
    return -2;
1891
1892
112
 bad_indiv:
1893
112
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
1894
112
    return -2;
1895
2.23k
}
1896
1897
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
1898
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
1899
0
{
1900
0
    if ( !hdr->keep_samples ) return 0;
1901
0
    if ( !bcf_hdr_nsamples(hdr) )
1902
0
    {
1903
0
        rec->indiv.l = rec->n_sample = 0;
1904
0
        return 0;
1905
0
    }
1906
1907
0
    int i, j;
1908
0
    uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
1909
0
    bcf_dec_t *dec = &rec->d;
1910
0
    hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
1911
0
    for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
1912
1913
0
    for (i=0; i<rec->n_fmt; i++)
1914
0
    {
1915
0
        ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
1916
0
        src = dec->fmt[i].p - dec->fmt[i].size;
1917
0
        if ( dst )
1918
0
        {
1919
0
            memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
1920
0
            dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
1921
0
        }
1922
0
        dst = dec->fmt[i].p;
1923
0
        for (j=0; j<hdr->nsamples_ori; j++)
1924
0
        {
1925
0
            src += dec->fmt[i].size;
1926
0
            if ( !bit_array_test(hdr->keep_samples,j) ) continue;
1927
0
            memmove(dst, src, dec->fmt[i].size);
1928
0
            dst += dec->fmt[i].size;
1929
0
        }
1930
0
        rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
1931
0
        dec->fmt[i].p_len = dst - dec->fmt[i].p;
1932
0
    }
1933
0
    rec->unpacked |= BCF_UN_FMT;
1934
1935
0
    rec->n_sample = bcf_hdr_nsamples(hdr);
1936
0
    return 0;
1937
0
}
1938
1939
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
1940
29.6k
{
1941
29.6k
    if (fp->format.format == vcf) return vcf_read(fp,h,v);
1942
2.71k
    int ret = bcf_read1_core(fp->fp.bgzf, v);
1943
2.71k
    if (ret == 0) ret = bcf_record_check(h, v);
1944
2.71k
    if ( ret!=0 || !h->keep_samples ) return ret;
1945
0
    return bcf_subset_format(h,v);
1946
2.71k
}
1947
1948
int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1949
0
{
1950
0
    bcf1_t *v = (bcf1_t *) vv;
1951
0
    int ret = bcf_read1_core(fp, v);
1952
0
    if (ret == 0) ret = bcf_record_check(NULL, v);
1953
0
    if (ret  >= 0)
1954
0
        *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
1955
0
    return ret;
1956
0
}
1957
1958
static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
1959
0
{
1960
    // single typed string
1961
0
    if ( line->d.id && strcmp(line->d.id, ".") ) {
1962
0
        return bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
1963
0
    } else {
1964
0
        return bcf_enc_size(str, 0, BCF_BT_CHAR);
1965
0
    }
1966
0
}
1967
static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
1968
0
{
1969
    // list of typed strings
1970
0
    int i;
1971
0
    for (i=0; i<line->n_allele; i++) {
1972
0
        if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0)
1973
0
            return -1;
1974
0
    }
1975
0
    if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
1976
0
    return 0;
1977
0
}
1978
static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
1979
0
{
1980
    // typed vector of integers
1981
0
    if ( line->d.n_flt ) {
1982
0
        return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
1983
0
    } else {
1984
0
        return bcf_enc_vint(str, 0, 0, -1);
1985
0
    }
1986
0
}
1987
1988
static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
1989
0
{
1990
    // pairs of typed vectors
1991
0
    int i, irm = -1, e = 0;
1992
0
    for (i=0; i<line->n_info; i++)
1993
0
    {
1994
0
        bcf_info_t *info = &line->d.info[i];
1995
0
        if ( !info->vptr )
1996
0
        {
1997
            // marked for removal
1998
0
            if ( irm < 0 ) irm = i;
1999
0
            continue;
2000
0
        }
2001
0
        e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0;
2002
0
        if ( irm >=0 )
2003
0
        {
2004
0
            bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
2005
0
            while ( irm<=i && line->d.info[irm].vptr ) irm++;
2006
0
        }
2007
0
    }
2008
0
    if ( irm>=0 ) line->n_info = irm;
2009
0
    return e == 0 ? 0 : -1;
2010
0
}
2011
2012
static int bcf1_sync(bcf1_t *line)
2013
0
{
2014
0
    char *shared_ori = line->shared.s;
2015
0
    size_t prev_len;
2016
2017
0
    kstring_t tmp = {0,0,0};
2018
0
    if ( !line->shared.l )
2019
0
    {
2020
        // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
2021
0
        tmp = line->shared;
2022
0
        bcf1_sync_id(line, &tmp);
2023
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2024
2025
0
        bcf1_sync_alleles(line, &tmp);
2026
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2027
2028
0
        bcf1_sync_filter(line, &tmp);
2029
0
        line->unpack_size[2] = tmp.l - prev_len;
2030
2031
0
        bcf1_sync_info(line, &tmp);
2032
0
        line->shared = tmp;
2033
0
    }
2034
0
    else if ( line->d.shared_dirty )
2035
0
    {
2036
        // The line was edited, update the BCF data block.
2037
2038
0
        if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
2039
2040
        // ptr_ori points to the original unchanged BCF data.
2041
0
        uint8_t *ptr_ori = (uint8_t *) line->shared.s;
2042
2043
        // ID: single typed string
2044
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ID )
2045
0
            bcf1_sync_id(line, &tmp);
2046
0
        else
2047
0
            kputsn_(ptr_ori, line->unpack_size[0], &tmp);
2048
0
        ptr_ori += line->unpack_size[0];
2049
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2050
2051
        // REF+ALT: list of typed strings
2052
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
2053
0
            bcf1_sync_alleles(line, &tmp);
2054
0
        else
2055
0
        {
2056
0
            kputsn_(ptr_ori, line->unpack_size[1], &tmp);
2057
0
            if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2058
0
        }
2059
0
        ptr_ori += line->unpack_size[1];
2060
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2061
2062
0
        if ( line->unpacked & BCF_UN_FLT )
2063
0
        {
2064
            // FILTER: typed vector of integers
2065
0
            if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
2066
0
                bcf1_sync_filter(line, &tmp);
2067
0
            else if ( line->d.n_flt )
2068
0
                kputsn_(ptr_ori, line->unpack_size[2], &tmp);
2069
0
            else
2070
0
                bcf_enc_vint(&tmp, 0, 0, -1);
2071
0
            ptr_ori += line->unpack_size[2];
2072
0
            line->unpack_size[2] = tmp.l - prev_len;
2073
2074
0
            if ( line->unpacked & BCF_UN_INFO )
2075
0
            {
2076
                // INFO: pairs of typed vectors
2077
0
                if ( line->d.shared_dirty & BCF1_DIRTY_INF )
2078
0
                {
2079
0
                    bcf1_sync_info(line, &tmp);
2080
0
                    ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
2081
0
                }
2082
0
            }
2083
0
        }
2084
2085
0
        int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
2086
0
        if ( size ) kputsn_(ptr_ori, size, &tmp);
2087
2088
0
        free(line->shared.s);
2089
0
        line->shared = tmp;
2090
0
    }
2091
0
    if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
2092
0
    {
2093
        // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
2094
0
        size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
2095
0
        int i;
2096
0
        for (i=0; i<line->n_info; i++)
2097
0
        {
2098
0
            uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
2099
0
            line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
2100
0
            off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
2101
0
            if ( vptr_free )
2102
0
            {
2103
0
                free(vptr_free);
2104
0
                line->d.info[i].vptr_free = 0;
2105
0
            }
2106
0
        }
2107
0
    }
2108
2109
0
    if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
2110
0
    {
2111
        // The genotype fields changed or are not present
2112
0
        tmp.l = tmp.m = 0; tmp.s = NULL;
2113
0
        int i, irm = -1;
2114
0
        for (i=0; i<line->n_fmt; i++)
2115
0
        {
2116
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
2117
0
            if ( !fmt->p )
2118
0
            {
2119
                // marked for removal
2120
0
                if ( irm < 0 ) irm = i;
2121
0
                continue;
2122
0
            }
2123
0
            kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
2124
0
            if ( irm >=0 )
2125
0
            {
2126
0
                bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
2127
0
                while ( irm<=i && line->d.fmt[irm].p ) irm++;
2128
0
            }
2129
2130
0
        }
2131
0
        if ( irm>=0 ) line->n_fmt = irm;
2132
0
        free(line->indiv.s);
2133
0
        line->indiv = tmp;
2134
2135
        // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
2136
0
        size_t off_new = 0;
2137
0
        for (i=0; i<line->n_fmt; i++)
2138
0
        {
2139
0
            uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
2140
0
            line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
2141
0
            off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
2142
0
            if ( p_free )
2143
0
            {
2144
0
                free(p_free);
2145
0
                line->d.fmt[i].p_free = 0;
2146
0
            }
2147
0
        }
2148
0
    }
2149
0
    if ( !line->n_sample ) line->n_fmt = 0;
2150
0
    line->d.shared_dirty = line->d.indiv_dirty = 0;
2151
0
    return 0;
2152
0
}
2153
2154
bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
2155
0
{
2156
0
    bcf1_sync(src);
2157
2158
0
    bcf_clear(dst);
2159
0
    dst->rid  = src->rid;
2160
0
    dst->pos  = src->pos;
2161
0
    dst->rlen = src->rlen;
2162
0
    dst->qual = src->qual;
2163
0
    dst->n_info = src->n_info; dst->n_allele = src->n_allele;
2164
0
    dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
2165
2166
0
    if ( dst->shared.m < src->shared.l )
2167
0
    {
2168
0
        dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l);
2169
0
        dst->shared.m = src->shared.l;
2170
0
    }
2171
0
    dst->shared.l = src->shared.l;
2172
0
    memcpy(dst->shared.s,src->shared.s,dst->shared.l);
2173
2174
0
    if ( dst->indiv.m < src->indiv.l )
2175
0
    {
2176
0
        dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l);
2177
0
        dst->indiv.m = src->indiv.l;
2178
0
    }
2179
0
    dst->indiv.l = src->indiv.l;
2180
0
    memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
2181
2182
0
    return dst;
2183
0
}
2184
bcf1_t *bcf_dup(bcf1_t *src)
2185
0
{
2186
0
    bcf1_t *out = bcf_init1();
2187
0
    return bcf_copy(out, src);
2188
0
}
2189
2190
int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
2191
24.6k
{
2192
24.6k
    if ( h->dirty ) {
2193
0
        if (bcf_hdr_sync(h) < 0) return -1;
2194
0
    }
2195
24.6k
    if ( bcf_hdr_nsamples(h)!=v->n_sample )
2196
122
    {
2197
122
        hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
2198
122
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
2199
122
        return -1;
2200
122
    }
2201
2202
24.5k
    if ( hfp->format.format == vcf || hfp->format.format == text_format )
2203
24.5k
        return vcf_write(hfp,h,v);
2204
2205
0
    if ( v->errcode )
2206
0
    {
2207
        // vcf_parse1() encountered a new contig or tag, undeclared in the
2208
        // header.  At this point, the header must have been printed,
2209
        // proceeding would lead to a broken BCF file. Errors must be checked
2210
        // and cleared by the caller before we can proceed.
2211
0
        char errdescription[1024] = "";
2212
0
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1);
2213
0
        return -1;
2214
0
    }
2215
0
    bcf1_sync(v);   // check if the BCF record was modified
2216
2217
0
    if ( v->unpacked & BCF_IS_64BIT )
2218
0
    {
2219
0
        hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
2220
0
        return -1;
2221
0
    }
2222
2223
0
    BGZF *fp = hfp->fp.bgzf;
2224
0
    uint8_t x[32];
2225
0
    u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
2226
0
    u32_to_le(v->indiv.l, x + 4);
2227
0
    i32_to_le(v->rid, x + 8);
2228
0
    u32_to_le(v->pos, x + 12);
2229
0
    u32_to_le(v->rlen, x + 16);
2230
0
    float_to_le(v->qual, x + 20);
2231
0
    u16_to_le(v->n_info, x + 24);
2232
0
    u16_to_le(v->n_allele, x + 26);
2233
0
    u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28);
2234
0
    if ( bgzf_write(fp, x, 32) != 32 ) return -1;
2235
0
    if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
2236
0
    if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
2237
2238
0
    if (hfp->idx) {
2239
0
        if (bgzf_idx_push(fp, hfp->idx, v->rid, v->pos, v->pos + v->rlen,
2240
0
                          bgzf_tell(fp), 1) < 0)
2241
0
            return -1;
2242
0
    }
2243
2244
0
    return 0;
2245
0
}
2246
2247
/**********************
2248
 *** VCF header I/O ***
2249
 **********************/
2250
2251
0
static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
2252
0
    bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t));
2253
0
    int save_errno;
2254
0
    if (!hrec) goto fail;
2255
2256
0
    hrec->key = strdup("contig");
2257
0
    if (!hrec->key) goto fail;
2258
2259
0
    if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail;
2260
0
    if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0)
2261
0
        goto fail;
2262
0
    if (bcf_hdr_add_hrec(h, hrec) < 0)
2263
0
        goto fail;
2264
0
    return 0;
2265
2266
0
 fail:
2267
0
    save_errno = errno;
2268
0
    hts_log_error("%s", strerror(errno));
2269
0
    if (hrec) bcf_hrec_destroy(hrec);
2270
0
    errno = save_errno;
2271
0
    return -1;
2272
0
}
2273
2274
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
2275
5.38k
{
2276
5.38k
    kstring_t txt, *s = &fp->line;
2277
5.38k
    int ret;
2278
5.38k
    bcf_hdr_t *h;
2279
5.38k
    tbx_t *idx = NULL;
2280
5.38k
    const char **names = NULL;
2281
5.38k
    h = bcf_hdr_init("r");
2282
5.38k
    if (!h) {
2283
0
        hts_log_error("Failed to allocate bcf header");
2284
0
        return NULL;
2285
0
    }
2286
5.38k
    txt.l = txt.m = 0; txt.s = 0;
2287
63.4k
    while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
2288
62.7k
        int e = 0;
2289
62.7k
        if (s->l == 0) continue;
2290
62.3k
        if (s->s[0] != '#') {
2291
16
            hts_log_error("No sample line");
2292
16
            goto error;
2293
16
        }
2294
62.2k
        if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
2295
0
            kstring_t tmp = { 0, 0, NULL };
2296
0
            hFILE *f = hopen(fp->fn_aux, "r");
2297
0
            if (f == NULL) {
2298
0
                hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
2299
0
                goto error;
2300
0
            }
2301
0
            while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) {
2302
0
                char *tab = strchr(tmp.s, '\t');
2303
0
                if (tab == NULL) continue;
2304
0
                e |= (kputs("##contig=<ID=", &txt) < 0);
2305
0
                e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0);
2306
0
                e |= (kputs(",length=", &txt) < 0);
2307
0
                e |= (kputl(atol(tab), &txt) < 0);
2308
0
                e |= (kputsn(">\n", 2, &txt) < 0);
2309
0
            }
2310
0
            free(tmp.s);
2311
0
            if (hclose(f) != 0) {
2312
0
                hts_log_error("Error on closing %s", fp->fn_aux);
2313
0
                goto error;
2314
0
            }
2315
0
            if (e) goto error;
2316
0
        }
2317
62.2k
        if (kputsn(s->s, s->l, &txt) < 0) goto error;
2318
62.2k
        if (kputc('\n', &txt) < 0) goto error;
2319
62.2k
        if (s->s[1] != '#') break;
2320
62.2k
    }
2321
5.36k
    if ( ret < -1 ) goto error;
2322
5.34k
    if ( !txt.s )
2323
0
    {
2324
0
        hts_log_error("Could not read the header");
2325
0
        goto error;
2326
0
    }
2327
5.34k
    if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
2328
2329
    // check tabix index, are all contigs listed in the header? add the missing ones
2330
4.46k
    idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL);
2331
4.46k
    if ( idx )
2332
0
    {
2333
0
        int i, n, need_sync = 0;
2334
0
        names = tbx_seqnames(idx, &n);
2335
0
        if (!names) goto error;
2336
0
        for (i=0; i<n; i++)
2337
0
        {
2338
0
            bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
2339
0
            if ( hrec ) continue;
2340
0
            if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
2341
0
            need_sync = 1;
2342
0
        }
2343
0
        if ( need_sync ) {
2344
0
            if (bcf_hdr_sync(h) < 0) goto error;
2345
0
        }
2346
0
        free(names);
2347
0
        tbx_destroy(idx);
2348
0
    }
2349
4.46k
    free(txt.s);
2350
4.46k
    return h;
2351
2352
914
 error:
2353
914
    if (idx) tbx_destroy(idx);
2354
914
    free(names);
2355
914
    free(txt.s);
2356
914
    if (h) bcf_hdr_destroy(h);
2357
914
    return NULL;
2358
4.46k
}
2359
2360
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
2361
0
{
2362
0
    int i = 0, n = 0, save_errno;
2363
0
    char **lines = hts_readlines(fname, &n);
2364
0
    if ( !lines ) return 1;
2365
0
    for (i=0; i<n-1; i++)
2366
0
    {
2367
0
        int k;
2368
0
        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
2369
0
        if (!hrec) goto fail;
2370
0
        if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
2371
0
            bcf_hrec_destroy(hrec);
2372
0
            goto fail;
2373
0
        }
2374
0
        free(lines[i]);
2375
0
        lines[i] = NULL;
2376
0
    }
2377
0
    if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
2378
0
    if (bcf_hdr_sync(hdr) < 0) goto fail;
2379
0
    free(lines[n-1]);
2380
0
    free(lines);
2381
0
    return 0;
2382
2383
0
 fail:
2384
0
    save_errno = errno;
2385
0
    for (; i < n; i++)
2386
0
        free(lines[i]);
2387
0
    free(lines);
2388
0
    errno = save_errno;
2389
0
    return 1;
2390
0
}
2391
2392
static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
2393
19.5k
{
2394
19.5k
    uint32_t e = 0;
2395
19.5k
    if ( !hrec->value )
2396
12.5k
    {
2397
12.5k
        int j, nout = 0;
2398
12.5k
        e |= ksprintf(str, "##%s=<", hrec->key) < 0;
2399
59.8k
        for (j=0; j<hrec->nkeys; j++)
2400
47.3k
        {
2401
            // do not output IDX if output is VCF
2402
47.3k
            if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
2403
39.6k
            if ( nout ) e |= kputc(',',str) < 0;
2404
39.6k
            e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0;
2405
39.6k
            nout++;
2406
39.6k
        }
2407
12.5k
        e |= ksprintf(str,">\n") < 0;
2408
12.5k
    }
2409
7.04k
    else
2410
7.04k
        e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
2411
2412
19.5k
    return e == 0 ? 0 : -1;
2413
19.5k
}
2414
2415
int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
2416
0
{
2417
0
    return _bcf_hrec_format(hrec,0,str);
2418
0
}
2419
2420
int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
2421
5.08k
{
2422
5.08k
    int i, r = 0;
2423
24.6k
    for (i=0; i<hdr->nhrec; i++)
2424
19.5k
        r |= _bcf_hrec_format(hdr->hrec[i], is_bcf, str) < 0;
2425
2426
5.08k
    r |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0;
2427
5.08k
    if ( bcf_hdr_nsamples(hdr) )
2428
1.88k
    {
2429
1.88k
        r |= ksprintf(str, "\tFORMAT") < 0;
2430
13.2k
        for (i=0; i<bcf_hdr_nsamples(hdr); i++)
2431
11.3k
            r |= ksprintf(str, "\t%s", hdr->samples[i]) < 0;
2432
1.88k
    }
2433
5.08k
    r |= ksprintf(str, "\n") < 0;
2434
2435
5.08k
    return r ? -1 : 0;
2436
5.08k
}
2437
2438
char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
2439
0
{
2440
0
    kstring_t txt = {0,0,0};
2441
0
    if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
2442
0
        return NULL;
2443
0
    if ( len ) *len = txt.l;
2444
0
    return txt.s;
2445
0
}
2446
2447
const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
2448
0
{
2449
0
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2450
0
    int i, tid, m = kh_size(d);
2451
0
    const char **names = (const char**) calloc(m,sizeof(const char*));
2452
0
    if ( !names )
2453
0
    {
2454
0
        hts_log_error("Failed to allocate memory");
2455
0
        *n = 0;
2456
0
        return NULL;
2457
0
    }
2458
0
    khint_t k;
2459
0
    for (k=kh_begin(d); k<kh_end(d); k++)
2460
0
    {
2461
0
        if ( !kh_exist(d,k) ) continue;
2462
0
        if ( !kh_val(d, k).hrec[0] ) continue;  // removed via bcf_hdr_remove
2463
0
        tid = kh_val(d,k).id;
2464
0
        if ( tid >= m )
2465
0
        {
2466
            // This can happen after a contig has been removed from BCF header via bcf_hdr_remove()
2467
0
            if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 )
2468
0
            {
2469
0
                hts_log_error("Failed to allocate memory");
2470
0
                *n = 0;
2471
0
                free(names);
2472
0
                return NULL;
2473
0
            }
2474
0
            m = tid + 1;
2475
0
        }
2476
0
        names[tid] = kh_key(d,k);
2477
0
    }
2478
    // ensure there are no gaps
2479
0
    for (i=0,tid=0; tid<m; i++,tid++)
2480
0
    {
2481
0
        while ( tid<m && !names[tid] ) tid++;
2482
0
        if ( tid==m ) break;
2483
0
        if ( i==tid ) continue;
2484
0
        names[i] = names[tid];
2485
0
        names[tid] = 0;
2486
0
    }
2487
0
    *n = i;
2488
0
    return names;
2489
0
}
2490
2491
int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
2492
5.08k
{
2493
5.08k
    kstring_t htxt = {0,0,0};
2494
5.08k
    if (bcf_hdr_format(h, 0, &htxt) < 0) {
2495
0
        free(htxt.s);
2496
0
        return -1;
2497
0
    }
2498
5.08k
    while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
2499
5.08k
    int ret;
2500
5.08k
    if ( fp->format.compression!=no_compression ) {
2501
0
        ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
2502
0
        if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2503
5.08k
    } else {
2504
5.08k
        ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
2505
5.08k
    }
2506
5.08k
    free(htxt.s);
2507
5.08k
    return ret<0 ? -1 : 0;
2508
5.08k
}
2509
2510
/***********************
2511
 *** Typed value I/O ***
2512
 ***********************/
2513
2514
int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
2515
363k
{
2516
363k
    int32_t max = INT32_MIN, min = INT32_MAX;
2517
363k
    int i;
2518
363k
    if (n <= 0) {
2519
3.56k
        return bcf_enc_size(s, 0, BCF_BT_NULL);
2520
359k
    } else if (n == 1) {
2521
18.5k
        return bcf_enc_int1(s, a[0]);
2522
341k
    } else {
2523
341k
        if (wsize <= 0) wsize = n;
2524
2525
        // Equivalent to:
2526
        // for (i = 0; i < n; ++i) {
2527
        //     if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
2528
        //         continue;
2529
        //     if (max < a[i]) max = a[i];
2530
        //     if (min > a[i]) min = a[i];
2531
        // }
2532
341k
        int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN};
2533
341k
        int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX};
2534
187M
        for (i = 0; i < (n&~3); i+=4) {
2535
            // bcf_int32_missing    == INT32_MIN and
2536
            // bcf_int32_vector_end == INT32_MIN+1.
2537
            // We skip these, but can mostly avoid explicit checking
2538
187M
            if (max4[0] < a[i+0]) max4[0] = a[i+0];
2539
187M
            if (max4[1] < a[i+1]) max4[1] = a[i+1];
2540
187M
            if (max4[2] < a[i+2]) max4[2] = a[i+2];
2541
187M
            if (max4[3] < a[i+3]) max4[3] = a[i+3];
2542
187M
            if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0];
2543
187M
            if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1];
2544
187M
            if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2];
2545
187M
            if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3];
2546
187M
        }
2547
341k
        min = min4[0];
2548
341k
        if (min > min4[1]) min = min4[1];
2549
341k
        if (min > min4[2]) min = min4[2];
2550
341k
        if (min > min4[3]) min = min4[3];
2551
341k
        max = max4[0];
2552
341k
        if (max < max4[1]) max = max4[1];
2553
341k
        if (max < max4[2]) max = max4[2];
2554
341k
        if (max < max4[3]) max = max4[3];
2555
821k
        for (; i < n; ++i) {
2556
480k
            if (max < a[i]) max = a[i];
2557
480k
            if (min > a[i] && a[i] > INT32_MIN+1) min = a[i];
2558
480k
        }
2559
2560
341k
        if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
2561
48.5k
            if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
2562
48.5k
                ks_resize(s, s->l + n) < 0)
2563
0
                return -1;
2564
48.5k
            uint8_t *p = (uint8_t *) s->s + s->l;
2565
72.2M
            for (i = 0; i < n; ++i, p++) {
2566
72.2M
                if ( a[i]==bcf_int32_vector_end )   *p = bcf_int8_vector_end;
2567
69.7M
                else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing;
2568
3.14M
                else *p = a[i];
2569
72.2M
            }
2570
48.5k
            s->l += n;
2571
292k
        } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
2572
178k
            uint8_t *p;
2573
178k
            if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
2574
178k
                ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
2575
0
                return -1;
2576
178k
            p = (uint8_t *) s->s + s->l;
2577
109M
            for (i = 0; i < n; ++i)
2578
109M
            {
2579
109M
                int16_t x;
2580
109M
                if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
2581
108M
                else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
2582
1.42M
                else x = a[i];
2583
109M
                i16_to_le(x, p);
2584
109M
                p += sizeof(int16_t);
2585
109M
            }
2586
178k
            s->l += n * sizeof(int16_t);
2587
178k
        } else {
2588
114k
            uint8_t *p;
2589
114k
            if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
2590
114k
                ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
2591
0
                return -1;
2592
114k
            p = (uint8_t *) s->s + s->l;
2593
569M
            for (i = 0; i < n; ++i) {
2594
569M
                i32_to_le(a[i], p);
2595
569M
                p += sizeof(int32_t);
2596
569M
            }
2597
114k
            s->l += n * sizeof(int32_t);
2598
114k
        }
2599
341k
    }
2600
2601
341k
    return 0;
2602
363k
}
2603
2604
#ifdef VCF_ALLOW_INT64
2605
static int bcf_enc_long1(kstring_t *s, int64_t x) {
2606
    uint32_t e = 0;
2607
    if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
2608
        return bcf_enc_int1(s, x);
2609
    if (x == bcf_int64_vector_end) {
2610
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2611
        e |= kputc(bcf_int8_vector_end, s) < 0;
2612
    } else if (x == bcf_int64_missing) {
2613
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2614
        e |= kputc(bcf_int8_missing, s) < 0;
2615
    } else {
2616
        e |= bcf_enc_size(s, 1, BCF_BT_INT64);
2617
        e |= ks_expand(s, 8);
2618
        if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; }
2619
    }
2620
    return e == 0 ? 0 : -1;
2621
}
2622
#endif
2623
2624
858k
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
2625
858k
    uint8_t *p;
2626
858k
    size_t i;
2627
858k
    size_t bytes = n * sizeof(float);
2628
2629
858k
    if (bytes / sizeof(float) != n) return -1;
2630
858k
    if (ks_resize(s, s->l + bytes) < 0) return -1;
2631
2632
858k
    p = (uint8_t *) s->s + s->l;
2633
290M
    for (i = 0; i < n; i++) {
2634
289M
        float_to_le(a[i], p);
2635
289M
        p += sizeof(float);
2636
289M
    }
2637
858k
    s->l += bytes;
2638
2639
858k
    return 0;
2640
858k
}
2641
2642
int bcf_enc_vfloat(kstring_t *s, int n, float *a)
2643
858k
{
2644
858k
    assert(n >= 0);
2645
858k
    bcf_enc_size(s, n, BCF_BT_FLOAT);
2646
858k
    serialize_float_array(s, n, a);
2647
858k
    return 0; // FIXME: check for errs in this function
2648
858k
}
2649
2650
int bcf_enc_vchar(kstring_t *s, int l, const char *a)
2651
3.77M
{
2652
3.77M
    bcf_enc_size(s, l, BCF_BT_CHAR);
2653
3.77M
    kputsn(a, l, s);
2654
3.77M
    return 0; // FIXME: check for errs in this function
2655
3.77M
}
2656
2657
// Special case of n==1 as it also occurs quite often in FORMAT data.
2658
// This version is also small enough to get inlined.
2659
286k
static inline int bcf_fmt_array1(kstring_t *s, int type, void *data) {
2660
286k
    uint32_t e = 0;
2661
286k
    uint8_t *p = (uint8_t *)data;
2662
286k
    int32_t v;
2663
2664
    // helps gcc more than clang here. In billions of cycles:
2665
    //          bcf_fmt_array1  bcf_fmt_array
2666
    // gcc7:    23.2            24.3
2667
    // gcc13:   21.6            23.0
2668
    // clang13: 27.1            27.8
2669
286k
    switch (type) {
2670
277k
    case BCF_BT_CHAR:
2671
277k
        e |= kputc_(*p == bcf_str_missing ? '.' : *p, s) < 0;
2672
277k
        break;
2673
2674
3.28k
    case BCF_BT_INT8:
2675
3.28k
        if (*(int8_t *)p != bcf_int8_vector_end) {
2676
3.28k
            e |= ((*(int8_t *)p == bcf_int8_missing)
2677
3.28k
                  ? kputc_('.', s)
2678
3.28k
                  : kputw(*(int8_t *)p, s)) < 0;
2679
3.28k
        }
2680
3.28k
        break;
2681
4.19k
    case BCF_BT_INT16:
2682
4.19k
        v = le_to_i16(p);
2683
4.19k
        if (v != bcf_int16_vector_end) {
2684
4.19k
            e |= (v == bcf_int16_missing
2685
4.19k
                  ? kputc_('.', s)
2686
4.19k
                  : kputw(v, s)) < 0;
2687
4.19k
        }
2688
4.19k
        break;
2689
2690
883
    case BCF_BT_INT32:
2691
883
        v = le_to_i32(p);
2692
883
        if (v != bcf_int32_vector_end) {
2693
883
            e |= (v == bcf_int32_missing
2694
883
                  ? kputc_('.', s)
2695
883
                  : kputw(v, s)) < 0;
2696
883
        }
2697
883
        break;
2698
2699
0
    case BCF_BT_FLOAT:
2700
0
        v = le_to_u32(p);
2701
0
        if (v != bcf_float_vector_end) {
2702
0
            e |= (v == bcf_float_missing
2703
0
                  ? kputc_('.', s)
2704
0
                  : kputd(le_to_float(p), s)) < 0;
2705
0
        }
2706
0
        break;
2707
2708
0
    default:
2709
0
        hts_log_error("Unexpected type %d", type);
2710
0
        return -1;
2711
286k
    }
2712
2713
286k
    return e == 0 ? 0 : -1;
2714
286k
}
2715
2716
int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
2717
7.23M
{
2718
7.23M
    int j = 0;
2719
7.23M
    uint32_t e = 0;
2720
7.23M
    if (n == 0) {
2721
4.75M
        return kputc_('.', s) >= 0 ? 0 : -1;
2722
4.75M
    }
2723
2724
2.48M
    if (type == BCF_BT_CHAR)
2725
199k
    {
2726
199k
        char *p = (char *)data;
2727
2728
        // Note bcf_str_missing is already accounted for in n==0 above.
2729
199k
        if (n >= 8) {
2730
87.4k
            char *p_end = memchr(p, 0, n);
2731
87.4k
            e |= kputsn(p, p_end ? p_end-p : n, s) < 0;
2732
112k
        } else {
2733
275k
            for (j = 0; j < n && *p; ++j, ++p)
2734
162k
               e |= kputc(*p, s) < 0;
2735
112k
        }
2736
199k
    }
2737
2.28M
    else
2738
2.28M
    {
2739
2.28M
        #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
2740
2.28M
            uint8_t *p = (uint8_t *) data; \
2741
946M
            for (j=0; j<n; j++, p += sizeof(type_t))    \
2742
944M
            { \
2743
944M
                type_t v = convert(p); \
2744
944M
                if ( is_vector_end ) break; \
2745
944M
                if ( j ) e |= kputc_(',', s) < 0; \
2746
944M
                e |= (is_missing ? kputc('.', s) : kprint) < 0; \
2747
944M
            } \
2748
2.28M
        }
2749
2.28M
        switch (type) {
2750
653k
            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, v==bcf_int8_missing,  v==bcf_int8_vector_end,  kputw(v, s)); break;
2751
372k
            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
2752
468k
            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
2753
790k
            case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
2754
0
            default: hts_log_error("Unexpected type %d", type); exit(1); break;
2755
2.28M
        }
2756
2.28M
        #undef BRANCH
2757
2.28M
    }
2758
2.48M
    return e == 0 ? 0 : -1;
2759
2.48M
}
2760
2761
uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
2762
3.60M
{
2763
3.60M
    int x, type;
2764
3.60M
    x = bcf_dec_size(ptr, &ptr, &type);
2765
3.60M
    bcf_fmt_array(s, x, type, ptr);
2766
3.60M
    return ptr + (x << bcf_type_shift[type]);
2767
3.60M
}
2768
2769
/********************
2770
 *** VCF site I/O ***
2771
 ********************/
2772
2773
typedef struct {
2774
    int key;            // Key for h->id[BCF_DT_ID][key] vdict
2775
    int max_m;          // number of elements in field array (ie commas)
2776
    int size;           // field size (max_l or max_g*4 if is_gt)
2777
    int offset;         // offset of buf into h->mem
2778
    uint32_t is_gt:1,   // is genotype
2779
             max_g:31;  // maximum number of genotypes
2780
    uint32_t max_l;     // length of field
2781
    uint32_t y;         // h->id[0][fmt[j].key].val->info[BCF_HL_FMT]
2782
    uint8_t *buf;       // Pointer into h->mem
2783
} fmt_aux_t;
2784
2785
// fmt_aux_t field notes:
2786
// max_* are biggest sizes of the various FORMAT fields across all samples.
2787
// We use these after pivoting the data to ensure easy random access
2788
// of a specific sample.
2789
//
2790
// max_m is only used for type BCF_HT_REAL or BCF_HT_INT
2791
// max_g is only used for is_gt == 1 (will be BCF_HT_STR)
2792
// max_l is only used for is_gt == 0 (will be BCF_HT_STR)
2793
//
2794
// These are computed in vcf_parse_format_max3 and used in
2795
// vcf_parse_format_alloc4 to get the size.
2796
//
2797
// size is computed from max_g, max_l, max_m and is_gt.  Once computed
2798
// the max values are never accessed again.
2799
//
2800
// In theory all 4 vars could be coalesced into a single variable, but this
2801
// significantly harms speed (even if done via a union).  It's about 25-30%
2802
// slower.
2803
2804
static inline int align_mem(kstring_t *s)
2805
121k
{
2806
121k
    int e = 0;
2807
121k
    if (s->l&7) {
2808
14.2k
        uint64_t zero = 0;
2809
14.2k
        e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0;
2810
14.2k
    }
2811
121k
    return e == 0 ? 0 : -1;
2812
121k
}
2813
2814
124k
#define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
2815
2816
// detect FORMAT "."
2817
static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2818
9.07k
                                   const char *p, const char *q) {
2819
9.07k
    const char *end = s->s + s->l;
2820
9.07k
    if ( q>=end )
2821
55
    {
2822
55
        hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
2823
55
        v->errcode |= BCF_ERR_NCOLS;
2824
55
        return -1;
2825
55
    }
2826
2827
9.02k
    v->n_fmt = 0;
2828
9.02k
    if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
2829
294
    {
2830
294
        v->n_sample = bcf_hdr_nsamples(h);
2831
294
        return 1;
2832
294
    }
2833
2834
8.72k
    return 0;
2835
9.02k
}
2836
2837
// get format information from the dictionary
2838
static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2839
8.72k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
2840
8.72k
    const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2841
8.72k
    char *t;
2842
8.72k
    int j;
2843
8.72k
    ks_tokaux_t aux1;
2844
2845
133k
    for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
2846
124k
        if (j >= MAX_N_FMT) {
2847
11
            v->errcode |= BCF_ERR_LIMITS;
2848
11
            hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
2849
11
                bcf_seqname_safe(h,v), v->pos+1);
2850
11
            return -1;
2851
11
        }
2852
2853
124k
        *(char*)aux1.p = 0;
2854
124k
        khint_t k = kh_get(vdict, d, t);
2855
124k
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
2856
9.89k
            if ( t[0]=='.' && t[1]==0 )
2857
1
            {
2858
1
                hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2859
1
                v->errcode |= BCF_ERR_TAG_INVALID;
2860
1
                return -1;
2861
1
            }
2862
9.88k
            hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);
2863
9.88k
            kstring_t tmp = {0,0,0};
2864
9.88k
            int l;
2865
9.88k
            ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
2866
9.88k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2867
9.88k
            free(tmp.s);
2868
9.88k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
2869
9.88k
            if (res < 0) bcf_hrec_destroy(hrec);
2870
9.88k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
2871
2872
9.88k
            k = kh_get(vdict, d, t);
2873
9.88k
            v->errcode |= BCF_ERR_TAG_UNDEF;
2874
9.88k
            if (res || k == kh_end(d)) {
2875
17
                hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
2876
17
                v->errcode |= BCF_ERR_TAG_INVALID;
2877
17
                return -1;
2878
17
            }
2879
9.88k
        }
2880
124k
        fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
2881
124k
        fmt[j].key = kh_val(d, k).id;
2882
124k
        fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]);
2883
124k
        fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
2884
124k
        v->n_fmt++;
2885
124k
    }
2886
8.69k
    return 0;
2887
8.72k
}
2888
2889
// compute max
2890
static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2891
8.69k
                                 char *p, char *q, fmt_aux_t *fmt) {
2892
8.69k
    int n_sample_ori = -1;
2893
8.69k
    char *r = q + 1;  // r: position in the format string
2894
8.69k
    int l = 0, m = 1, g = 1, j;
2895
8.69k
    v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
2896
8.69k
    const char *end = s->s + s->l;
2897
2898
46.9k
    while ( r<end )
2899
46.7k
    {
2900
        // can we skip some samples?
2901
46.7k
        if ( h->keep_samples )
2902
0
        {
2903
0
            n_sample_ori++;
2904
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
2905
0
            {
2906
0
                while ( *r!='\t' && r<end ) r++;
2907
0
                if ( *r=='\t' ) { *r = 0; r++; }
2908
0
                continue;
2909
0
            }
2910
0
        }
2911
2912
        // collect fmt stats: max vector size, length, number of alleles
2913
46.7k
        j = 0;  // j-th format field
2914
46.7k
        fmt_aux_t *f = fmt;
2915
46.7k
        static char meta[256] = {
2916
            // \0 \t , / : |
2917
46.7k
            1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2918
46.7k
            0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1, 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
2919
46.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2920
46.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
2921
46.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2922
46.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2923
46.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2924
46.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2925
46.7k
        };
2926
2927
46.7k
        char *r_start = r;
2928
11.1M
        for (;;) {
2929
            // Quickly skip ahead to an appropriate meta-character
2930
18.1M
            while (!meta[(unsigned char)*r]) r++;
2931
2932
11.1M
            switch (*r) {
2933
7.68M
            case ',':
2934
7.68M
                m++;
2935
7.68M
                break;
2936
2937
407
            case '|':
2938
3.36M
            case '/':
2939
3.36M
                if (f->is_gt) g++;
2940
3.36M
                break;
2941
2942
9.33k
            case '\t':
2943
9.33k
                *r = 0; // fall through
2944
2945
9.33k
            default: // valid due to while loop above.
2946
46.7k
            case '\0':
2947
73.5k
            case ':':
2948
73.5k
                l = r - r_start; r_start = r;
2949
73.5k
                if (f->max_m < m) f->max_m = m;
2950
73.5k
                if (f->max_l < l) f->max_l = l;
2951
73.5k
                if (f->is_gt && f->max_g < g) f->max_g = g;
2952
73.5k
                l = 0, m = g = 1;
2953
73.5k
                if ( *r==':' ) {
2954
26.8k
                    j++; f++;
2955
26.8k
                    if ( j>=v->n_fmt ) {
2956
17
                        hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
2957
17
                                      h->id[BCF_DT_CTG][v->rid].key, v->pos+1);
2958
17
                        v->errcode |= BCF_ERR_NCOLS;
2959
17
                        return -1;
2960
17
                    }
2961
46.7k
                } else goto end_for;
2962
26.8k
                break;
2963
11.1M
            }
2964
11.0M
            if ( r>=end ) break;
2965
11.0M
            r++;
2966
11.0M
        }
2967
46.7k
    end_for:
2968
46.7k
        v->n_sample++;
2969
46.7k
        if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
2970
38.2k
        r++;
2971
38.2k
    }
2972
2973
8.68k
    return 0;
2974
8.69k
}
2975
2976
// allocate memory for arrays
2977
static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2978
                                   const char *p, const char *q,
2979
8.68k
                                   fmt_aux_t *fmt) {
2980
8.68k
    kstring_t *mem = (kstring_t*)&h->mem;
2981
2982
8.68k
    int j;
2983
129k
    for (j = 0; j < v->n_fmt; ++j) {
2984
121k
        fmt_aux_t *f = &fmt[j];
2985
121k
        if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
2986
2987
121k
        if ((f->y>>4&0xf) == BCF_HT_STR) {
2988
121k
            f->size = f->is_gt? f->max_g << 2 : f->max_l;
2989
121k
        } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
2990
0
            f->size = f->max_m << 2;
2991
2
        } else {
2992
2
            hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
2993
2
            v->errcode |= BCF_ERR_TAG_INVALID;
2994
2
            return -1;
2995
2
        }
2996
2997
121k
        if (align_mem(mem) < 0) {
2998
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2999
0
            v->errcode |= BCF_ERR_LIMITS;
3000
0
            return -1;
3001
0
        }
3002
3003
        // Limit the total memory to ~2Gb per VCF row.  This should mean
3004
        // malformed VCF data is less likely to take excessive memory and/or
3005
        // time.
3006
121k
        if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
3007
0
            hts_log_error("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3008
0
            v->errcode |= BCF_ERR_LIMITS;
3009
0
            return -1;
3010
0
        }
3011
3012
121k
        f->offset = mem->l;
3013
121k
        if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
3014
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3015
0
            v->errcode |= BCF_ERR_LIMITS;
3016
0
            return -1;
3017
0
        }
3018
121k
        mem->l += v->n_sample * f->size;
3019
121k
    }
3020
3021
8.67k
    {
3022
8.67k
        int j;
3023
129k
        for (j = 0; j < v->n_fmt; ++j)
3024
121k
            fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
3025
8.67k
    }
3026
3027
8.67k
    return 0;
3028
8.68k
}
3029
3030
// Fill the sample fields
3031
static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3032
8.67k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3033
8.67k
    static int extreme_val_warned = 0;
3034
8.67k
    int n_sample_ori = -1;
3035
    // At beginning of the loop t points to the first char of a format
3036
8.67k
    const char *t = q + 1;
3037
8.67k
    int m = 0;   // m: sample id
3038
8.67k
    const int nsamples = bcf_hdr_nsamples(h);
3039
3040
8.67k
    const char *end = s->s + s->l;
3041
54.7k
    while ( t<end )
3042
51.5k
    {
3043
        // can we skip some samples?
3044
51.5k
        if ( h->keep_samples )
3045
0
        {
3046
0
            n_sample_ori++;
3047
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3048
0
            {
3049
0
                while ( *t && t<end ) t++;
3050
0
                t++;
3051
0
                continue;
3052
0
            }
3053
0
        }
3054
51.5k
        if ( m == nsamples ) break;
3055
3056
46.3k
        int j = 0; // j-th format field, m-th sample
3057
72.4k
        while ( t < end )
3058
71.1k
        {
3059
71.1k
            fmt_aux_t *z = &fmt[j++];
3060
71.1k
            const int htype = z->y>>4&0xf;
3061
71.1k
            if (!z->buf) {
3062
12
                hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
3063
12
                              z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3064
12
                v->errcode |= BCF_ERR_LIMITS;
3065
12
                return -1;
3066
12
            }
3067
3068
71.1k
            if (htype == BCF_HT_STR) {
3069
71.1k
                int l;
3070
71.1k
                if (z->is_gt) {
3071
                    // Genotypes.
3072
                    // <val>([|/]<val>)+... where <val> is [0-9]+ or ".".
3073
5.03k
                    int32_t is_phased = 0;
3074
5.03k
                    uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
3075
5.03k
                    uint32_t unreadable = 0;
3076
5.03k
                    uint32_t max = 0;
3077
5.03k
                    int overflow = 0;
3078
1.53M
                    for (l = 0;; ++t) {
3079
1.53M
                        if (*t == '.') {
3080
883k
                            ++t, x[l++] = is_phased;
3081
883k
                        } else {
3082
648k
                            const char *tt = t;
3083
648k
                            uint32_t val;
3084
                            // Or "v->n_allele < 10", but it doesn't
3085
                            // seem to be any faster and this feels safer.
3086
648k
                            if (*t >= '0' && *t <= '9' &&
3087
648k
                                !(t[1] >= '0' && t[1] <= '9')) {
3088
615k
                                val = *t++ - '0';
3089
615k
                            } else {
3090
32.7k
                                val = hts_str2uint(t, (char **)&t,
3091
32.7k
                                                   sizeof(val) * CHAR_MAX - 2,
3092
32.7k
                                                   &overflow);
3093
32.7k
                                unreadable |= tt == t;
3094
32.7k
                            }
3095
648k
                            if (max < val) max = val;
3096
648k
                            x[l++] = (val + 1) << 1 | is_phased;
3097
648k
                        }
3098
1.53M
                        is_phased = (*t == '|');
3099
1.53M
                        if (*t != '|' && *t != '/') break;
3100
1.53M
                    }
3101
                    // Possibly check max against v->n_allele instead?
3102
5.03k
                    if (overflow || max > (INT32_MAX >> 1) - 1) {
3103
159
                        hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3104
159
                        return -1;
3105
159
                    }
3106
4.87k
                    if (unreadable) {
3107
82
                        hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3108
82
                        return -1;
3109
82
                    }
3110
4.79k
                    if ( !l ) x[l++] = 0;   // An empty field, insert missing value
3111
3.52M
                    for (; l < z->size>>2; ++l)
3112
3.52M
                        x[l] = bcf_int32_vector_end;
3113
3114
66.1k
                } else {
3115
                    // Otherwise arbitrary strings
3116
66.1k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3117
11.0M
                    for (l = 0; *t != ':' && *t; ++t)
3118
10.9M
                        x[l++] = *t;
3119
66.1k
                    if (z->size > l)
3120
52.9k
                        memset(&x[l], 0, (z->size-l) * sizeof(*x));
3121
66.1k
                }
3122
3123
71.1k
            } else if (htype == BCF_HT_INT) {
3124
                // One or more integers in an array
3125
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3126
0
                int l;
3127
0
                for (l = 0;; ++t) {
3128
0
                    if (*t == '.') {
3129
0
                        x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
3130
0
                    } else {
3131
0
                        int overflow = 0;
3132
0
                        char *te;
3133
0
                        long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3134
0
                        if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3135
0
                        {
3136
0
                            if ( !extreme_val_warned )
3137
0
                            {
3138
0
                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,
3139
0
                                                h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
3140
0
                                extreme_val_warned = 1;
3141
0
                            }
3142
0
                            tmp_val = bcf_int32_missing;
3143
0
                        }
3144
0
                        x[l++] = tmp_val;
3145
0
                        t = te;
3146
0
                    }
3147
0
                    if (*t != ',') break;
3148
0
                }
3149
0
                if ( !l )
3150
0
                    x[l++] = bcf_int32_missing;
3151
0
                for (; l < z->size>>2; ++l)
3152
0
                    x[l] = bcf_int32_vector_end;
3153
3154
0
            } else if (htype == BCF_HT_REAL) {
3155
                // One of more floating point values in an array
3156
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3157
0
                int l;
3158
0
                for (l = 0;; ++t) {
3159
0
                    if (*t == '.' && !isdigit_c(t[1])) {
3160
0
                        bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
3161
0
                    } else {
3162
0
                        int overflow = 0;
3163
0
                        char *te;
3164
0
                        float tmp_val = hts_str2dbl(t, &te, &overflow);
3165
0
                        if ( (te==t || overflow) && !extreme_val_warned )
3166
0
                        {
3167
0
                            hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
3168
0
                            extreme_val_warned = 1;
3169
0
                        }
3170
0
                        x[l++] = tmp_val;
3171
0
                        t = te;
3172
0
                    }
3173
0
                    if (*t != ',') break;
3174
0
                }
3175
0
                if ( !l )
3176
                    // An empty field, insert missing value
3177
0
                    bcf_float_set_missing(x[l++]);
3178
0
                for (; l < z->size>>2; ++l)
3179
0
                    bcf_float_set_vector_end(x[l]);
3180
0
            } else {
3181
0
                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1);
3182
0
                v->errcode |= BCF_ERR_TAG_INVALID;
3183
0
                return -1;
3184
0
            }
3185
3186
70.9k
            if (*t == '\0') {
3187
44.8k
                break;
3188
44.8k
            }
3189
26.1k
            else if (*t == ':') {
3190
26.0k
                t++;
3191
26.0k
            }
3192
64
            else {
3193
64
                char buffer[8];
3194
64
                hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
3195
64
                    hts_strprint(buffer, sizeof buffer, '\'', t, 1),
3196
64
                    h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
3197
64
                v->errcode |= BCF_ERR_CHAR;
3198
64
                return -1;
3199
64
            }
3200
70.9k
        }
3201
3202
        // fill end-of-vector values
3203
1.66M
        for (; j < v->n_fmt; ++j) {
3204
1.62M
            fmt_aux_t *z = &fmt[j];
3205
1.62M
            const int htype = z->y>>4&0xf;
3206
1.62M
            int l;
3207
1.62M
            if (htype == BCF_HT_STR) {
3208
1.62M
                if (z->is_gt) {
3209
38.5k
                    int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3210
38.5k
                    if (z->size) x[0] = bcf_int32_missing;
3211
11.9M
                    for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3212
1.58M
                } else {
3213
1.58M
                    char *x = (char*)z->buf + z->size * (size_t)m;
3214
1.58M
                    if ( z->size ) {
3215
334k
                        x[0] = '.';
3216
334k
                        memset(&x[1], 0, (z->size-1) * sizeof(*x));
3217
334k
                    }
3218
1.58M
                }
3219
1.62M
            } else if (htype == BCF_HT_INT) {
3220
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3221
0
                x[0] = bcf_int32_missing;
3222
0
                for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3223
0
            } else if (htype == BCF_HT_REAL) {
3224
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3225
0
                bcf_float_set_missing(x[0]);
3226
0
                for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
3227
0
            }
3228
1.62M
        }
3229
3230
46.0k
        m++; t++;
3231
46.0k
    }
3232
3233
8.36k
    return 0;
3234
8.67k
}
3235
3236
// write individual genotype information
3237
static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3238
8.36k
                                const char *p, const char *q, fmt_aux_t *fmt) {
3239
8.36k
    kstring_t *str = &v->indiv;
3240
8.36k
    int i;
3241
8.36k
    if (v->n_sample > 0) {
3242
126k
        for (i = 0; i < v->n_fmt; ++i) {
3243
118k
            fmt_aux_t *z = &fmt[i];
3244
118k
            bcf_enc_int1(str, z->key);
3245
118k
            if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
3246
111k
                bcf_enc_size(str, z->size, BCF_BT_CHAR);
3247
111k
                kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
3248
111k
            } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
3249
6.73k
                bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
3250
6.73k
            } else {
3251
0
                bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
3252
0
                if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
3253
0
                                          (float *) z->buf) != 0) {
3254
0
                    v->errcode |= BCF_ERR_LIMITS;
3255
0
                    hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3256
0
                    return -1;
3257
0
                }
3258
0
            }
3259
118k
        }
3260
8.34k
    }
3261
3262
8.36k
    return 0;
3263
8.36k
}
3264
3265
// validity checking
3266
8.36k
static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
3267
8.36k
    if ( v->n_sample!=bcf_hdr_nsamples(h) )
3268
109
    {
3269
109
        hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
3270
109
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
3271
109
        v->errcode |= BCF_ERR_NCOLS;
3272
109
        return -1;
3273
109
    }
3274
8.25k
    if ( v->indiv.l > 0xffffffff )
3275
0
    {
3276
0
        hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
3277
0
        v->errcode |= BCF_ERR_LIMITS;
3278
3279
        // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
3280
0
        v->n_fmt = 0;
3281
0
        return -1;
3282
0
    }
3283
3284
8.25k
    return 0;
3285
8.25k
}
3286
3287
// p,q is the start and the end of the FORMAT field
3288
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3289
                            char *p, char *q)
3290
17.7k
{
3291
17.7k
    if ( !bcf_hdr_nsamples(h) ) return 0;
3292
9.07k
    kstring_t *mem = (kstring_t*)&h->mem;
3293
9.07k
    mem->l = 0;
3294
3295
9.07k
    fmt_aux_t fmt[MAX_N_FMT];
3296
3297
    // detect FORMAT "."
3298
9.07k
    int ret; // +ve = ok, -ve = err
3299
9.07k
    if ((ret = vcf_parse_format_empty1(s, h, v, p, q)))
3300
349
        return ret ? 0 : -1;
3301
3302
    // get format information from the dictionary
3303
8.72k
    if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
3304
29
        return -1;
3305
3306
    // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is
3307
    // stored as per-type arrays AAA... BBB... CCC...  This is basically
3308
    // a data rotation or pivot.
3309
3310
    // The size of elements in the array grow to their maximum needed,
3311
    // permitting fast random access.  This means however we have to first
3312
    // scan the whole FORMAT line to find the maximum of each type, and
3313
    // then scan it again to find the store the data.
3314
    // We break this down into compute-max, allocate, fill-out-buffers
3315
3316
    // TODO: ?
3317
    // The alternative would be to pivot on the first pass, with fixed
3318
    // size entries for numerics and concatenated strings otherwise, also
3319
    // tracking maximum sizes.  Then on a second pass we reallocate and
3320
    // copy the data again to a uniformly sized array.  Two passes through
3321
    // memory, but without doubling string parsing.
3322
3323
    // compute max
3324
8.69k
    if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
3325
17
        return -1;
3326
3327
    // allocate memory for arrays
3328
8.68k
    if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
3329
2
        return -1;
3330
3331
    // fill the sample fields; at beginning of the loop
3332
8.67k
    if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
3333
317
        return -1;
3334
3335
    // write individual genotype information
3336
8.36k
    if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
3337
0
        return -1;
3338
3339
    // validity checking
3340
8.36k
    if (vcf_parse_format_check7(h, v) < 0)
3341
109
        return -1;
3342
3343
8.25k
    return 0;
3344
8.36k
}
3345
3346
5.46k
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
3347
    // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
3348
    // been already printed, but will enable tools like vcfcheck to proceed.
3349
3350
5.46k
    kstring_t tmp = {0,0,0};
3351
5.46k
    khint_t k;
3352
5.46k
    int l;
3353
5.46k
    if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0)
3354
0
        return kh_end(d);
3355
5.46k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3356
5.46k
    free(tmp.s);
3357
5.46k
    int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3358
5.46k
    if (res < 0) bcf_hrec_destroy(hrec);
3359
5.46k
    if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3360
5.46k
    k = kh_get(vdict, d, p);
3361
3362
5.46k
    return k;
3363
5.46k
}
3364
3365
22.6k
static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3366
22.6k
    int i, n_flt = 1, max_n_flt = 0;
3367
22.6k
    char *r, *t;
3368
22.6k
    int32_t *a_flt = NULL;
3369
22.6k
    ks_tokaux_t aux1;
3370
22.6k
    khint_t k;
3371
22.6k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3372
    // count the number of filters
3373
22.6k
    if (*(q-1) == ';') *(q-1) = 0;
3374
271M
    for (r = p; *r; ++r)
3375
271M
        if (*r == ';') ++n_flt;
3376
22.6k
    if (n_flt > max_n_flt) {
3377
22.6k
        a_flt = malloc(n_flt * sizeof(*a_flt));
3378
22.6k
        if (!a_flt) {
3379
0
            hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3380
0
            v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3381
0
            return -1;
3382
0
        }
3383
22.6k
        max_n_flt = n_flt;
3384
22.6k
    }
3385
    // add filters
3386
3.52M
    for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
3387
3.50M
        *(char*)aux1.p = 0;
3388
3.50M
        k = kh_get(vdict, d, t);
3389
3.50M
        if (k == kh_end(d))
3390
27.5k
        {
3391
            // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
3392
            // been already printed, but will enable tools like vcfcheck to proceed.
3393
27.5k
            hts_log_warning("FILTER '%s' is not defined in the header", t);
3394
27.5k
            kstring_t tmp = {0,0,0};
3395
27.5k
            int l;
3396
27.5k
            ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
3397
27.5k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3398
27.5k
            free(tmp.s);
3399
27.5k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3400
27.5k
            if (res < 0) bcf_hrec_destroy(hrec);
3401
27.5k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3402
27.5k
            k = kh_get(vdict, d, t);
3403
27.5k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3404
27.5k
            if (res || k == kh_end(d)) {
3405
46
                hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3406
46
                v->errcode |= BCF_ERR_TAG_INVALID;
3407
46
                free(a_flt);
3408
46
                return -1;
3409
46
            }
3410
27.5k
        }
3411
3.50M
        a_flt[i++] = kh_val(d, k).id;
3412
3.50M
    }
3413
3414
22.6k
    bcf_enc_vint(str, n_flt, a_flt, -1);
3415
22.6k
    free(a_flt);
3416
3417
22.6k
    return 0;
3418
22.6k
}
3419
3420
21.1k
static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3421
21.1k
    static int extreme_int_warned = 0, negative_rlen_warned = 0;
3422
21.1k
    int max_n_val = 0, overflow = 0;
3423
21.1k
    char *r, *key;
3424
21.1k
    khint_t k;
3425
21.1k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3426
21.1k
    int32_t *a_val = NULL;
3427
3428
21.1k
    v->n_info = 0;
3429
21.1k
    if (*(q-1) == ';') *(q-1) = 0;
3430
3.65M
    for (r = key = p;; ++r) {
3431
3.65M
        int c;
3432
3.65M
        char *val, *end;
3433
329M
        while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
3434
3.65M
        if (v->n_info == UINT16_MAX) {
3435
2
            hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
3436
2
                          bcf_seqname_safe(h,v), v->pos+1);
3437
2
            v->errcode |= BCF_ERR_LIMITS;
3438
2
            goto fail;
3439
2
        }
3440
3.65M
        val = end = NULL;
3441
3.65M
        c = *r; *r = 0;
3442
3.65M
        if (c == '=') {
3443
2.49M
            val = r + 1;
3444
3445
1.32G
            for (end = val; *end != ';' && *end != 0; ++end);
3446
2.49M
            c = *end; *end = 0;
3447
2.49M
        } else end = r;
3448
3.65M
        if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
3449
3.60M
        k = kh_get(vdict, d, key);
3450
3.60M
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
3451
27.3k
        {
3452
27.3k
            hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
3453
27.3k
            kstring_t tmp = {0,0,0};
3454
27.3k
            int l;
3455
27.3k
            ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
3456
27.3k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3457
27.3k
            free(tmp.s);
3458
27.3k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3459
27.3k
            if (res < 0) bcf_hrec_destroy(hrec);
3460
27.3k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3461
27.3k
            k = kh_get(vdict, d, key);
3462
27.3k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3463
27.3k
            if (res || k == kh_end(d)) {
3464
94
                hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
3465
94
                v->errcode |= BCF_ERR_TAG_INVALID;
3466
94
                goto fail;
3467
94
            }
3468
27.3k
        }
3469
3.60M
        uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
3470
3.60M
        ++v->n_info;
3471
3.60M
        bcf_enc_int1(str, kh_val(d, k).id);
3472
3.60M
        if (val == 0) {
3473
1.11M
            bcf_enc_size(str, 0, BCF_BT_NULL);
3474
2.48M
        } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
3475
77.3k
            bcf_enc_vchar(str, end - val, val);
3476
2.41M
        } else { // int/float value/array
3477
2.41M
            int i, n_val;
3478
2.41M
            char *t, *te;
3479
1.27G
            for (t = val, n_val = 1; *t; ++t) // count the number of values
3480
1.26G
                if (*t == ',') ++n_val;
3481
            // Check both int and float size in one step for simplicity
3482
2.41M
            if (n_val > max_n_val) {
3483
3.82k
                int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val));
3484
3.82k
                if (!a_tmp) {
3485
0
                    hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3486
0
                    v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3487
0
                    goto fail;
3488
0
                }
3489
3.82k
                a_val = a_tmp;
3490
3.82k
                max_n_val = n_val;
3491
3.82k
            }
3492
2.41M
            if ((y>>4&0xf) == BCF_HT_INT) {
3493
1.55M
                i = 0, t = val;
3494
1.55M
                int64_t val1;
3495
1.55M
                int is_int64 = 0;
3496
#ifdef VCF_ALLOW_INT64
3497
                if ( n_val==1 )
3498
                {
3499
                    overflow = 0;
3500
                    long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3501
                    if ( te==val ) tmp_val = bcf_int32_missing;
3502
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
3503
                    {
3504
                        if ( !extreme_int_warned )
3505
                        {
3506
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3507
                            extreme_int_warned = 1;
3508
                        }
3509
                        tmp_val = bcf_int32_missing;
3510
                    }
3511
                    else
3512
                        is_int64 = 1;
3513
                    val1 = tmp_val;
3514
                    t = te;
3515
                    i = 1;  // this is just to avoid adding another nested block...
3516
                }
3517
#endif
3518
743M
                for (; i < n_val; ++i, ++t)
3519
741M
                {
3520
741M
                    overflow = 0;
3521
741M
                    long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3522
741M
                    if ( te==t ) tmp_val = bcf_int32_missing;
3523
4.98M
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3524
1.05M
                    {
3525
1.05M
                        if ( !extreme_int_warned )
3526
1
                        {
3527
1
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3528
1
                            extreme_int_warned = 1;
3529
1
                        }
3530
1.05M
                        tmp_val = bcf_int32_missing;
3531
1.05M
                    }
3532
741M
                    a_val[i] = tmp_val;
3533
861M
                    for (t = te; *t && *t != ','; t++);
3534
741M
                }
3535
1.55M
                if (n_val == 1) {
3536
#ifdef VCF_ALLOW_INT64
3537
                    if ( is_int64 )
3538
                    {
3539
                        v->unpacked |= BCF_IS_64BIT;
3540
                        bcf_enc_long1(str, val1);
3541
                    }
3542
                    else
3543
                        bcf_enc_int1(str, (int32_t)val1);
3544
#else
3545
1.21M
                    val1 = a_val[0];
3546
1.21M
                    bcf_enc_int1(str, (int32_t)val1);
3547
1.21M
#endif
3548
1.21M
                } else {
3549
333k
                    bcf_enc_vint(str, n_val, a_val, -1);
3550
333k
                }
3551
1.55M
                if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
3552
1.55M
                    && memcmp(key, "END", 4) == 0)
3553
0
                {
3554
0
                    if ( val1 <= v->pos )
3555
0
                    {
3556
0
                        if ( !negative_rlen_warned )
3557
0
                        {
3558
0
                            hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
3559
0
                            negative_rlen_warned = 1;
3560
0
                        }
3561
0
                    }
3562
0
                    else
3563
0
                        v->rlen = val1 - v->pos;
3564
0
                }
3565
1.55M
            } else if ((y>>4&0xf) == BCF_HT_REAL) {
3566
858k
                float *val_f = (float *)a_val;
3567
290M
                for (i = 0, t = val; i < n_val; ++i, ++t)
3568
289M
                {
3569
289M
                    overflow = 0;
3570
289M
                    val_f[i] = hts_str2dbl(t, &te, &overflow);
3571
289M
                    if ( te==t || overflow ) // conversion failed
3572
288M
                        bcf_float_set_missing(val_f[i]);
3573
371M
                    for (t = te; *t && *t != ','; t++);
3574
289M
                }
3575
858k
                bcf_enc_vfloat(str, n_val, val_f);
3576
858k
            }
3577
2.41M
        }
3578
3.60M
        if (c == 0) break;
3579
3.59M
        r = end;
3580
3.59M
        key = r + 1;
3581
3.59M
    }
3582
3583
21.0k
    free(a_val);
3584
21.0k
    return 0;
3585
3586
96
 fail:
3587
96
    free(a_val);
3588
96
    return -1;
3589
21.1k
}
3590
3591
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
3592
24.3k
{
3593
24.3k
    int ret = -2, overflow = 0;
3594
24.3k
    char *p, *q, *r, *t;
3595
24.3k
    kstring_t *str;
3596
24.3k
    khint_t k;
3597
24.3k
    ks_tokaux_t aux;
3598
3599
//#define NOT_DOT(p) strcmp((p), ".")
3600
//#define NOT_DOT(p) (!(*p == '.' && !p[1]))
3601
//#define NOT_DOT(p) ((*p) != '.' || (p)[1])
3602
//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2))
3603
117k
#define NOT_DOT(p) (memcmp(p, ".\0", 2))
3604
3605
24.3k
    if (!s || !h || !v || !(s->s))
3606
0
        return ret;
3607
3608
    // Assumed in lots of places, but we may as well spot this early
3609
24.3k
    assert(sizeof(float) == sizeof(int32_t));
3610
3611
    // Ensure string we parse has space to permit some over-flow when during
3612
    // parsing.  Eg to do memcmp(key, "END", 4) in vcf_parse_info over
3613
    // the more straight forward looking strcmp, giving a speed advantage.
3614
24.3k
    if (ks_resize(s, s->l+4) < 0)
3615
0
        return -1;
3616
3617
    // Force our memory to be initialised so we avoid the technicality of
3618
    // undefined behaviour in using a 4-byte memcmp.  (The reality is this
3619
    // almost certainly is never detected by the compiler so has no impact,
3620
    // but equally so this code has minimal (often beneficial) impact on
3621
    // performance too.)
3622
24.3k
    s->s[s->l+0] = 0;
3623
24.3k
    s->s[s->l+1] = 0;
3624
24.3k
    s->s[s->l+2] = 0;
3625
24.3k
    s->s[s->l+3] = 0;
3626
3627
24.3k
    bcf_clear1(v);
3628
24.3k
    str = &v->shared;
3629
24.3k
    memset(&aux, 0, sizeof(ks_tokaux_t));
3630
3631
    // CHROM
3632
24.3k
    if (!(p = kstrtok(s->s, "\t", &aux)))
3633
0
        goto err;
3634
24.3k
    *(q = (char*)aux.p) = 0;
3635
3636
24.3k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
3637
24.3k
    k = kh_get(vdict, d, p);
3638
24.3k
    if (k == kh_end(d)) {
3639
5.46k
        hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
3640
5.46k
        v->errcode = BCF_ERR_CTG_UNDEF;
3641
5.46k
        if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
3642
55
            hts_log_error("Could not add dummy header for contig '%s'", p);
3643
55
            v->errcode |= BCF_ERR_CTG_INVALID;
3644
55
            goto err;
3645
55
        }
3646
5.46k
    }
3647
24.3k
    v->rid = kh_val(d, k).id;
3648
3649
    // POS
3650
24.3k
    if (!(p = kstrtok(0, 0, &aux)))
3651
482
        goto err;
3652
23.8k
    *(q = (char*)aux.p) = 0;
3653
3654
23.8k
    overflow = 0;
3655
23.8k
    char *tmp = p;
3656
23.8k
    v->pos = hts_str2uint(p, &p, 63, &overflow);
3657
23.8k
    if (overflow) {
3658
36
        hts_log_error("Position value '%s' is too large", tmp);
3659
36
        goto err;
3660
23.8k
    } else if ( *p ) {
3661
101
        hts_log_error("Could not parse the position '%s'", tmp);
3662
101
        goto err;
3663
23.7k
    } else {
3664
23.7k
        v->pos -= 1;
3665
23.7k
    }
3666
23.7k
    if (v->pos >= INT32_MAX)
3667
618
        v->unpacked |= BCF_IS_64BIT;
3668
3669
    // ID
3670
23.7k
    if (!(p = kstrtok(0, 0, &aux)))
3671
109
        goto err;
3672
23.5k
    *(q = (char*)aux.p) = 0;
3673
3674
23.5k
    if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p);
3675
127
    else bcf_enc_size(str, 0, BCF_BT_CHAR);
3676
3677
    // REF
3678
23.5k
    if (!(p = kstrtok(0, 0, &aux)))
3679
73
        goto err;
3680
23.5k
    *(q = (char*)aux.p) = 0;
3681
3682
23.5k
    bcf_enc_vchar(str, q - p, p);
3683
23.5k
    v->n_allele = 1, v->rlen = q - p;
3684
3685
    // ALT
3686
23.5k
    if (!(p = kstrtok(0, 0, &aux)))
3687
38
        goto err;
3688
23.4k
    *(q = (char*)aux.p) = 0;
3689
3690
23.4k
    if (NOT_DOT(p)) {
3691
263M
        for (r = t = p;; ++r) {
3692
263M
            if (*r == ',' || *r == 0) {
3693
3.65M
                if (v->n_allele == UINT16_MAX) {
3694
1
                    hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
3695
1
                                  bcf_seqname_safe(h,v), v->pos+1);
3696
1
                    v->errcode |= BCF_ERR_LIMITS;
3697
1
                    goto err;
3698
1
                }
3699
3.65M
                bcf_enc_vchar(str, r - t, t);
3700
3.65M
                t = r + 1;
3701
3.65M
                ++v->n_allele;
3702
3.65M
            }
3703
263M
            if (r == q) break;
3704
263M
        }
3705
23.0k
    }
3706
3707
    // QUAL
3708
23.4k
    if (!(p = kstrtok(0, 0, &aux)))
3709
69
        goto err;
3710
23.4k
    *(q = (char*)aux.p) = 0;
3711
3712
23.4k
    if (NOT_DOT(p)) v->qual = atof(p);
3713
430
    else bcf_float_set_missing(v->qual);
3714
23.4k
    if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
3715
3716
    // FILTER
3717
23.4k
    if (!(p = kstrtok(0, 0, &aux)))
3718
42
        goto err;
3719
23.3k
    *(q = (char*)aux.p) = 0;
3720
3721
23.3k
    if (NOT_DOT(p)) {
3722
22.6k
        if (vcf_parse_filter(str, h, v, p, q)) {
3723
46
            goto err;
3724
46
        }
3725
22.6k
    } else bcf_enc_vint(str, 0, 0, -1);
3726
23.3k
    if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
3727
3728
    // INFO
3729
23.3k
    if (!(p = kstrtok(0, 0, &aux)))
3730
185
        goto err;
3731
23.1k
    *(q = (char*)aux.p) = 0;
3732
3733
23.1k
    if (NOT_DOT(p)) {
3734
21.1k
        if (vcf_parse_info(str, h, v, p, q)) {
3735
96
            goto err;
3736
96
        }
3737
21.1k
    }
3738
23.0k
    if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
3739
3740
    // FORMAT; optional
3741
23.0k
    p = kstrtok(0, 0, &aux);
3742
23.0k
    if (p) {
3743
17.7k
        *(q = (char*)aux.p) = 0;
3744
3745
17.7k
        return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
3746
17.7k
    } else {
3747
5.30k
        return 0;
3748
5.30k
    }
3749
3750
0
 end:
3751
0
    ret = 0;
3752
3753
1.33k
 err:
3754
1.33k
    return ret;
3755
0
}
3756
3757
int vcf_open_mode(char *mode, const char *fn, const char *format)
3758
0
{
3759
0
    if (format == NULL) {
3760
        // Try to pick a format based on the filename extension
3761
0
        char extension[HTS_MAX_EXT_LEN];
3762
0
        if (find_file_extension(fn, extension) < 0) return -1;
3763
0
        return vcf_open_mode(mode, fn, extension);
3764
0
    }
3765
0
    else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
3766
0
    else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
3767
0
    else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
3768
0
    else return -1;
3769
3770
0
    return 0;
3771
0
}
3772
3773
int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
3774
26.9k
{
3775
26.9k
    int ret;
3776
26.9k
    ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
3777
26.9k
    if (ret < 0) return ret;
3778
24.3k
    return vcf_parse1(&fp->line, h, v);
3779
26.9k
}
3780
3781
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
3782
0
{
3783
0
    uint8_t *ptr_start = ptr;
3784
0
    fmt->id = bcf_dec_typed_int1(ptr, &ptr);
3785
0
    fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
3786
0
    fmt->size = fmt->n << bcf_type_shift[fmt->type];
3787
0
    fmt->p = ptr;
3788
0
    fmt->p_off  = ptr - ptr_start;
3789
0
    fmt->p_free = 0;
3790
0
    ptr += n_sample * fmt->size;
3791
0
    fmt->p_len = ptr - fmt->p;
3792
0
    return ptr;
3793
0
}
3794
3795
static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
3796
0
{
3797
0
    uint8_t *ptr_start = ptr;
3798
0
    int64_t len = 0;
3799
0
    info->key = bcf_dec_typed_int1(ptr, &ptr);
3800
0
    len = info->len = bcf_dec_size(ptr, &ptr, &info->type);
3801
0
    info->vptr = ptr;
3802
0
    info->vptr_off  = ptr - ptr_start;
3803
0
    info->vptr_free = 0;
3804
0
    info->v1.i = 0;
3805
0
    if (info->len == 1) {
3806
0
        switch(info->type) {
3807
0
        case BCF_BT_INT8:
3808
0
        case BCF_BT_CHAR:
3809
0
            info->v1.i = *(int8_t*)ptr;
3810
0
            break;
3811
0
        case BCF_BT_INT16:
3812
0
            info->v1.i = le_to_i16(ptr);
3813
0
            len <<= 1;
3814
0
            break;
3815
0
        case BCF_BT_INT32:
3816
0
            info->v1.i = le_to_i32(ptr);
3817
0
            len <<= 2;
3818
0
            break;
3819
0
        case BCF_BT_FLOAT:
3820
0
            info->v1.f = le_to_float(ptr);
3821
0
            len <<= 2;
3822
0
            break;
3823
0
        case BCF_BT_INT64:
3824
0
            info->v1.i = le_to_i64(ptr);
3825
0
            len <<= 3;
3826
0
            break;
3827
0
        }
3828
0
    } else {
3829
0
        len <<= bcf_type_shift[info->type];
3830
0
    }
3831
0
    ptr += len;
3832
3833
0
    info->vptr_len = ptr - info->vptr;
3834
0
    return ptr;
3835
0
}
3836
3837
int bcf_unpack(bcf1_t *b, int which)
3838
24.5k
{
3839
24.5k
    if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
3840
24.5k
    uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
3841
24.5k
    int i;
3842
24.5k
    bcf_dec_t *d = &b->d;
3843
24.5k
    if (which & BCF_UN_FLT) which |= BCF_UN_STR;
3844
24.5k
    if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
3845
24.5k
    if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
3846
24.5k
    {
3847
24.5k
        kstring_t tmp;
3848
3849
        // ID
3850
24.5k
        tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
3851
24.5k
        ptr_ori = ptr;
3852
24.5k
        ptr = bcf_fmt_sized_array(&tmp, ptr);
3853
24.5k
        b->unpack_size[0] = ptr - ptr_ori;
3854
24.5k
        kputc_('\0', &tmp);
3855
24.5k
        d->id = tmp.s; d->m_id = tmp.m;
3856
3857
        // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
3858
24.5k
        hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
3859
24.5k
        tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
3860
24.5k
        ptr_ori = ptr;
3861
3.60M
        for (i = 0; i < b->n_allele; ++i) {
3862
            // Use offset within tmp.s as realloc may change pointer
3863
3.57M
            d->allele[i] = (char *)(intptr_t)tmp.l;
3864
3.57M
            ptr = bcf_fmt_sized_array(&tmp, ptr);
3865
3.57M
            kputc_('\0', &tmp);
3866
3.57M
        }
3867
24.5k
        b->unpack_size[1] = ptr - ptr_ori;
3868
24.5k
        d->als = tmp.s; d->m_als = tmp.m;
3869
3870
        // Convert our offsets within tmp.s back to pointers again
3871
3.60M
        for (i = 0; i < b->n_allele; ++i)
3872
3.57M
            d->allele[i] = d->als + (ptrdiff_t)d->allele[i];
3873
24.5k
        b->unpacked |= BCF_UN_STR;
3874
24.5k
    }
3875
24.5k
    if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
3876
24.5k
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
3877
24.5k
        ptr_ori = ptr;
3878
24.5k
        if (*ptr>>4) {
3879
21.9k
            int type;
3880
21.9k
            d->n_flt = bcf_dec_size(ptr, &ptr, &type);
3881
21.9k
            hts_expand(int, d->n_flt, d->m_flt, d->flt);
3882
3.25M
            for (i = 0; i < d->n_flt; ++i)
3883
3.23M
                d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
3884
21.9k
        } else ++ptr, d->n_flt = 0;
3885
24.5k
        b->unpack_size[2] = ptr - ptr_ori;
3886
24.5k
        b->unpacked |= BCF_UN_FLT;
3887
24.5k
    }
3888
24.5k
    if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
3889
0
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
3890
0
        hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
3891
0
        for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
3892
0
        for (i = 0; i < b->n_info; ++i)
3893
0
            ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
3894
0
        b->unpacked |= BCF_UN_INFO;
3895
0
    }
3896
24.5k
    if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
3897
0
        ptr = (uint8_t*)b->indiv.s;
3898
0
        hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
3899
0
        for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
3900
0
        for (i = 0; i < b->n_fmt; ++i)
3901
0
            ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
3902
0
        b->unpacked |= BCF_UN_FMT;
3903
0
    }
3904
24.5k
    return 0;
3905
24.5k
}
3906
3907
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
3908
24.5k
{
3909
24.5k
    int i;
3910
24.5k
    int32_t max_dt_id = h->n[BCF_DT_ID];
3911
24.5k
    const char *chrom = bcf_seqname(h, v);
3912
24.5k
    if (!chrom) {
3913
0
        hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
3914
0
                      v->rid);
3915
0
        errno = EINVAL;
3916
0
        return -1;
3917
0
    }
3918
3919
24.5k
    bcf_unpack((bcf1_t*)v, BCF_UN_ALL & ~(BCF_UN_INFO|BCF_UN_FMT));
3920
3921
    // Cache of key lengths so we don't keep repeatedly using them.
3922
    // This assumes we're not modifying the header between successive calls
3923
    // to vcf_format, but that would lead to many other forms of breakage
3924
    // so it feels like a valid assumption to make.
3925
    //
3926
    // We cannot just do this in bcf_hdr_sync as some code (eg bcftools
3927
    // annotate) manipulates the headers directly without calling sync to
3928
    // refresh the data structures.  So we must do just-in-time length
3929
    // calculation during writes instead.
3930
24.5k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
3931
24.5k
    if (!aux->key_len) {
3932
7.48k
        if (!(aux->key_len = calloc(h->n[BCF_DT_ID]+1, sizeof(*aux->key_len))))
3933
0
            return -1;
3934
7.48k
    }
3935
24.5k
    size_t *key_len = aux->key_len;
3936
3937
24.5k
    kputs(chrom, s); // CHROM
3938
24.5k
    kputc_('\t', s); kputll(v->pos + 1, s); // POS
3939
24.5k
    kputc_('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
3940
24.5k
    kputc_('\t', s); // REF
3941
24.5k
    if (v->n_allele > 0) kputs(v->d.allele[0], s);
3942
0
    else kputc_('.', s);
3943
24.5k
    kputc_('\t', s); // ALT
3944
24.5k
    if (v->n_allele > 1) {
3945
3.57M
        for (i = 1; i < v->n_allele; ++i) {
3946
3.55M
            if (i > 1) kputc_(',', s);
3947
3.55M
            kputs(v->d.allele[i], s);
3948
3.55M
        }
3949
21.9k
    } else kputc_('.', s);
3950
24.5k
    kputc_('\t', s); // QUAL
3951
24.5k
    if ( bcf_float_is_missing(v->qual) ) kputc_('.', s); // QUAL
3952
24.1k
    else kputd(v->qual, s);
3953
24.5k
    kputc_('\t', s); // FILTER
3954
24.5k
    if (v->d.n_flt) {
3955
3.25M
        for (i = 0; i < v->d.n_flt; ++i) {
3956
3.23M
            int32_t idx = v->d.flt[i];
3957
3.23M
            if (idx < 0 || idx >= max_dt_id
3958
3.23M
                || h->id[BCF_DT_ID][idx].key == NULL) {
3959
0
                hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
3960
0
                              idx, bcf_seqname_safe(h, v), v->pos + 1);
3961
0
                errno = EINVAL;
3962
0
                return -1;
3963
0
            }
3964
3.23M
            if (i) kputc_(';', s);
3965
3.23M
            if (!key_len[idx])
3966
2.40M
                key_len[idx] = strlen(h->id[BCF_DT_ID][idx].key);
3967
3.23M
            kputsn(h->id[BCF_DT_ID][idx].key, key_len[idx], s);
3968
3.23M
        }
3969
21.7k
    } else kputc_('.', s);
3970
3971
24.5k
    kputc_('\t', s); // INFO
3972
24.5k
    if (v->n_info) {
3973
8.28k
        uint8_t *ptr = (uint8_t *)v->shared.s + v->unpack_size[0] + v->unpack_size[1] + v->unpack_size[2];
3974
8.28k
        int first = 1;
3975
8.28k
        bcf_info_t *info = v->d.info;
3976
3977
        // Note if we duplicate this code into custom packed and unpacked
3978
        // implementations then we gain a bit more speed, particularly with
3979
        // clang 13 (up to 5%).  Not sure why this is, but code duplication
3980
        // isn't pleasant and it's still faster adding packed support than
3981
        // not so it's a win, just not as good as it should be.
3982
8.28k
        const int info_packed = !(v->unpacked & BCF_UN_INFO) && v->shared.l;
3983
2.89M
        for (i = 0; i < v->n_info; ++i) {
3984
2.88M
            bcf_info_t in, *z;
3985
2.88M
            if (info_packed) {
3986
                // Use a local bcf_info_t when data is packed
3987
2.88M
                z = &in;
3988
2.88M
                z->key  = bcf_dec_typed_int1(ptr, &ptr);
3989
2.88M
                z->len  = bcf_dec_size(ptr, &ptr, &z->type);
3990
2.88M
                z->vptr = ptr;
3991
2.88M
                ptr += z->len << bcf_type_shift[z->type];
3992
2.88M
            } else {
3993
                // Else previously unpacked INFO struct
3994
0
                z = &info[i];
3995
3996
                // Also potentially since deleted
3997
0
                if ( !z->vptr ) continue;
3998
0
            }
3999
4000
2.88M
            bcf_idpair_t *id = z->key >= 0 && z->key < max_dt_id
4001
2.88M
                ? &h->id[BCF_DT_ID][z->key]
4002
2.88M
                : NULL;
4003
4004
2.88M
            if (!id || !id->key) {
4005
0
                hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
4006
0
                              z->key,
4007
0
                              z->key < 0 ? "negative"
4008
0
                              : (z->key >= max_dt_id ? "too large" : "not present in the header"),
4009
0
                              bcf_seqname_safe(h, v), v->pos+1);
4010
0
                errno = EINVAL;
4011
0
                return -1;
4012
0
            }
4013
4014
            // KEY
4015
2.88M
            if (!key_len[z->key])
4016
19.1k
                key_len[z->key] = strlen(id->key);
4017
2.88M
            size_t id_len = key_len[z->key];
4018
2.88M
            if (ks_resize(s, s->l + 3 + id_len) < 0)
4019
0
                return -1;
4020
2.88M
            char *sptr = s->s + s->l;
4021
2.88M
            if ( !first ) {
4022
2.87M
                *sptr++ = ';';
4023
2.87M
                s->l++;
4024
2.87M
            }
4025
2.88M
            first = 0;
4026
2.88M
            memcpy(sptr, id->key, id_len);
4027
2.88M
            s->l += id_len;
4028
4029
            // VALUE
4030
2.88M
            if (z->len <= 0) continue;
4031
2.30M
            sptr[id_len] = '=';
4032
2.30M
            s->l++;
4033
4034
2.30M
            if (z->len != 1 || info_packed) {
4035
2.30M
                bcf_fmt_array(s, z->len, z->type, z->vptr);
4036
2.30M
            } else {
4037
                // Single length vectors are unpacked into their
4038
                // own info.v1 union and handled separately.
4039
0
                if (z->type == BCF_BT_FLOAT) {
4040
0
                    if ( bcf_float_is_missing(z->v1.f) )
4041
0
                        kputc_('.', s);
4042
0
                    else
4043
0
                        kputd(z->v1.f, s);
4044
0
                } else if (z->type == BCF_BT_CHAR) {
4045
0
                    kputc_(z->v1.i, s);
4046
0
                } else if (z->type < BCF_BT_INT64) {
4047
0
                    int64_t missing[] = {
4048
0
                        0, // BCF_BT_NULL
4049
0
                        bcf_int8_missing,
4050
0
                        bcf_int16_missing,
4051
0
                        bcf_int32_missing,
4052
0
                    };
4053
0
                    if (z->v1.i == missing[z->type])
4054
0
                        kputc_('.', s);
4055
0
                    else
4056
0
                        kputw(z->v1.i, s);
4057
0
                } else if (z->type == BCF_BT_INT64) {
4058
0
                    if (z->v1.i == bcf_int64_missing)
4059
0
                        kputc_('.', s);
4060
0
                    else
4061
0
                        kputll(z->v1.i, s);
4062
0
                } else {
4063
0
                    hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
4064
0
                    errno = EINVAL;
4065
0
                    return -1;
4066
0
                }
4067
0
            }
4068
2.30M
        }
4069
8.28k
        if ( first ) kputc_('.', s);
4070
16.2k
    } else kputc_('.', s);
4071
4072
    // FORMAT and individual information
4073
24.5k
    if (v->n_sample) {
4074
8.54k
        int i,j;
4075
8.54k
        if ( v->n_fmt) {
4076
8.25k
            uint8_t *ptr = (uint8_t *)v->indiv.s;
4077
8.25k
            int gt_i = -1;
4078
8.25k
            bcf_fmt_t *fmt = v->d.fmt;
4079
8.25k
            int first = 1;
4080
8.25k
            int fmt_packed = !(v->unpacked & BCF_UN_FMT);
4081
4082
8.25k
            if (fmt_packed) {
4083
                // Local fmt as we have an array of num FORMAT keys,
4084
                // each of which points to N.Sample values.
4085
4086
                // No real gain to be had in handling unpacked data here,
4087
                // but it doesn't cost us much in complexity either and
4088
                // it gives us flexibility.
4089
8.25k
                fmt = malloc(v->n_fmt * sizeof(*fmt));
4090
8.25k
                if (!fmt)
4091
0
                    return -1;
4092
8.25k
            }
4093
4094
            // KEYS
4095
123k
            for (i = 0; i < (int)v->n_fmt; ++i) {
4096
114k
                bcf_fmt_t *z;
4097
114k
                z = &fmt[i];
4098
114k
                if (fmt_packed) {
4099
114k
                    z->id   = bcf_dec_typed_int1(ptr, &ptr);
4100
114k
                    z->n    = bcf_dec_size(ptr, &ptr, &z->type);
4101
114k
                    z->p    = ptr;
4102
114k
                    z->size = z->n << bcf_type_shift[z->type];
4103
114k
                    ptr += v->n_sample * z->size;
4104
114k
                }
4105
114k
                if ( !z->p ) continue;
4106
114k
                kputc_(!first ? ':' : '\t', s); first = 0;
4107
4108
114k
                bcf_idpair_t *id = z->id >= 0 && z->id < max_dt_id
4109
114k
                    ? &h->id[BCF_DT_ID][z->id]
4110
114k
                    : NULL;
4111
4112
114k
                if (!id || !id->key) {
4113
0
                    hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1);
4114
0
                    errno = EINVAL;
4115
0
                    return -1;
4116
0
                }
4117
4118
114k
                if (!key_len[z->id])
4119
98.2k
                    key_len[z->id] = strlen(id->key);
4120
114k
                size_t id_len = key_len[z->id];
4121
114k
                kputsn(id->key, id_len, s);
4122
114k
                if (id_len == 2 && id->key[0] == 'G' && id->key[1] == 'T')
4123
6.66k
                    gt_i = i;
4124
114k
            }
4125
8.25k
            if ( first ) kputsn("\t.", 2, s);
4126
4127
            // VALUES per sample
4128
53.3k
            for (j = 0; j < v->n_sample; ++j) {
4129
45.0k
                kputc_('\t', s);
4130
45.0k
                first = 1;
4131
45.0k
                bcf_fmt_t *f = fmt;
4132
1.60M
                for (i = 0; i < (int)v->n_fmt; i++, f++) {
4133
1.58M
                    if ( !f->p ) continue;
4134
1.58M
                    if (!first) kputc_(':', s);
4135
1.58M
                    first = 0;
4136
1.58M
                    if (gt_i == i) {
4137
26.3k
                        bcf_format_gt(f,j,s);
4138
26.3k
                        break;
4139
26.3k
                    }
4140
1.55M
                    else if (f->n == 1)
4141
284k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4142
1.27M
                    else
4143
1.27M
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4144
1.58M
                }
4145
4146
                // Simpler loop post GT and at least 1 iteration
4147
99.5k
                for (i++, f++; i < (int)v->n_fmt; i++, f++) {
4148
54.4k
                    if ( !f->p ) continue;
4149
54.4k
                    kputc_(':', s);
4150
54.4k
                    if (f->n == 1)
4151
1.88k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4152
52.6k
                    else
4153
52.6k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4154
54.4k
                }
4155
45.0k
                if ( first ) kputc_('.', s);
4156
45.0k
            }
4157
8.25k
            if (fmt_packed)
4158
8.25k
                free(fmt);
4159
8.25k
        }
4160
294
        else
4161
2.38k
            for (j=0; j<=v->n_sample; j++)
4162
2.09k
                kputsn("\t.", 2, s);
4163
8.54k
    }
4164
24.5k
    kputc('\n', s);
4165
24.5k
    return 0;
4166
24.5k
}
4167
4168
int vcf_write_line(htsFile *fp, kstring_t *line)
4169
0
{
4170
0
    int ret;
4171
0
    if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
4172
0
    if ( fp->format.compression!=no_compression )
4173
0
        ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
4174
0
    else
4175
0
        ret = hwrite(fp->fp.hfile, line->s, line->l);
4176
0
    return ret==line->l ? 0 : -1;
4177
0
}
4178
4179
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4180
24.5k
{
4181
24.5k
    ssize_t ret;
4182
24.5k
    fp->line.l = 0;
4183
24.5k
    if (vcf_format1(h, v, &fp->line) != 0)
4184
0
        return -1;
4185
24.5k
    if ( fp->format.compression!=no_compression ) {
4186
0
        if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4187
0
            return -1;
4188
0
        ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
4189
24.5k
    } else {
4190
24.5k
        ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
4191
24.5k
    }
4192
4193
24.5k
    if (fp->idx && fp->format.compression == bgzf) {
4194
0
        int tid;
4195
0
        if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
4196
0
            return -1;
4197
4198
0
        if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
4199
0
                          tid, v->pos, v->pos + v->rlen,
4200
0
                          bgzf_tell(fp->fp.bgzf), 1) < 0)
4201
0
            return -1;
4202
0
    }
4203
4204
24.5k
    return ret==fp->line.l ? 0 : -1;
4205
24.5k
}
4206
4207
/************************
4208
 * Data access routines *
4209
 ************************/
4210
4211
int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
4212
10.1k
{
4213
10.1k
    khint_t k;
4214
10.1k
    vdict_t *d = (vdict_t*)h->dict[which];
4215
10.1k
    k = kh_get(vdict, d, id);
4216
10.1k
    return k == kh_end(d)? -1 : kh_val(d, k).id;
4217
10.1k
}
4218
4219
4220
/********************
4221
 *** BCF indexing ***
4222
 ********************/
4223
4224
// Calculate number of index levels given min_shift and the header contig
4225
// list.  Also returns number of contigs in *nids_out.
4226
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
4227
                               int starting_n_lvls, int *nids_out)
4228
0
{
4229
0
    int n_lvls, i, nids = 0;
4230
0
    int64_t max_len = 0, s;
4231
4232
0
    for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
4233
0
    {
4234
0
        if ( !h->id[BCF_DT_CTG][i].val ) continue;
4235
0
        if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
4236
0
            max_len = h->id[BCF_DT_CTG][i].val->info[0];
4237
0
        nids++;
4238
0
    }
4239
0
    if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
4240
0
    max_len += 256;
4241
0
    s = 1LL << (min_shift + starting_n_lvls * 3);
4242
0
    for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3);
4243
4244
0
    if (nids_out) *nids_out = nids;
4245
0
    return n_lvls;
4246
0
}
4247
4248
hts_idx_t *bcf_index(htsFile *fp, int min_shift)
4249
0
{
4250
0
    int n_lvls;
4251
0
    bcf1_t *b = NULL;
4252
0
    hts_idx_t *idx = NULL;
4253
0
    bcf_hdr_t *h;
4254
0
    int r;
4255
0
    h = bcf_hdr_read(fp);
4256
0
    if ( !h ) return NULL;
4257
0
    int nids = 0;
4258
0
    n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
4259
0
    idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4260
0
    if (!idx) goto fail;
4261
0
    b = bcf_init1();
4262
0
    if (!b) goto fail;
4263
0
    while ((r = bcf_read1(fp,h, b)) >= 0) {
4264
0
        int ret;
4265
0
        ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
4266
0
        if (ret < 0) goto fail;
4267
0
    }
4268
0
    if (r < -1) goto fail;
4269
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
4270
0
    bcf_destroy1(b);
4271
0
    bcf_hdr_destroy(h);
4272
0
    return idx;
4273
4274
0
 fail:
4275
0
    hts_idx_destroy(idx);
4276
0
    bcf_destroy1(b);
4277
0
    bcf_hdr_destroy(h);
4278
0
    return NULL;
4279
0
}
4280
4281
hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
4282
0
{
4283
0
    return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn);
4284
0
}
4285
4286
hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
4287
0
{
4288
0
    return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
4289
0
}
4290
4291
int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
4292
0
{
4293
0
    htsFile *fp;
4294
0
    hts_idx_t *idx;
4295
0
    tbx_t *tbx;
4296
0
    int ret;
4297
0
    if ((fp = hts_open(fn, "rb")) == 0) return -2;
4298
0
    if (n_threads)
4299
0
        hts_set_threads(fp, n_threads);
4300
0
    if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
4301
0
    switch (fp->format.format) {
4302
0
        case bcf:
4303
0
            if (!min_shift) {
4304
0
                hts_log_error("TBI indices for BCF files are not supported");
4305
0
                ret = -1;
4306
0
            } else {
4307
0
                idx = bcf_index(fp, min_shift);
4308
0
                if (idx) {
4309
0
                    ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
4310
0
                    if (ret < 0) ret = -4;
4311
0
                    hts_idx_destroy(idx);
4312
0
                }
4313
0
                else ret = -1;
4314
0
            }
4315
0
            break;
4316
4317
0
        case vcf:
4318
0
            tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
4319
0
            if (tbx) {
4320
0
                ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
4321
0
                if (ret < 0) ret = -4;
4322
0
                tbx_destroy(tbx);
4323
0
            }
4324
0
            else ret = -1;
4325
0
            break;
4326
4327
0
        default:
4328
0
            ret = -3;
4329
0
            break;
4330
0
    }
4331
0
    hts_close(fp);
4332
0
    return ret;
4333
0
}
4334
4335
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
4336
0
{
4337
0
    return bcf_index_build3(fn, fnidx, min_shift, 0);
4338
0
}
4339
4340
int bcf_index_build(const char *fn, int min_shift)
4341
0
{
4342
0
    return bcf_index_build3(fn, NULL, min_shift, 0);
4343
0
}
4344
4345
// Initialise fp->idx for the current format type.
4346
// This must be called after the header has been written but no other data.
4347
0
static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4348
0
    int n_lvls, fmt;
4349
4350
0
    if (min_shift == 0) {
4351
0
        min_shift = 14;
4352
0
        n_lvls = 5;
4353
0
        fmt = HTS_FMT_TBI;
4354
0
    } else {
4355
        // Set initial n_lvls to match tbx_index()
4356
0
        int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
4357
        // Increase if necessary
4358
0
        n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL);
4359
0
        fmt = HTS_FMT_CSI;
4360
0
    }
4361
4362
0
    fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4363
0
    if (!fp->idx) return -1;
4364
4365
    // Tabix meta data, added even in CSI for VCF
4366
0
    uint8_t conf[4*7];
4367
0
    u32_to_le(TBX_VCF, conf+0);  // fmt
4368
0
    u32_to_le(1,       conf+4);  // name col
4369
0
    u32_to_le(2,       conf+8);  // beg col
4370
0
    u32_to_le(0,       conf+12); // end col
4371
0
    u32_to_le('#',     conf+16); // comment
4372
0
    u32_to_le(0,       conf+20); // n.skip
4373
0
    u32_to_le(0,       conf+24); // ref name len
4374
0
    if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
4375
0
        hts_idx_destroy(fp->idx);
4376
0
        fp->idx = NULL;
4377
0
        return -1;
4378
0
    }
4379
0
    fp->fnidx = fnidx;
4380
4381
0
    return 0;
4382
0
}
4383
4384
// Initialise fp->idx for the current format type.
4385
// This must be called after the header has been written but no other data.
4386
0
int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4387
0
    int n_lvls, nids = 0;
4388
4389
0
    if (fp->format.compression != bgzf) {
4390
0
        hts_log_error("Indexing is only supported on BGZF-compressed files");
4391
0
        return -3; // Matches no-compression return for bcf_index_build3()
4392
0
    }
4393
4394
0
    if (fp->format.format == vcf)
4395
0
        return vcf_idx_init(fp, h, min_shift, fnidx);
4396
4397
0
    if (!min_shift)
4398
0
        min_shift = 14;
4399
4400
0
    n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
4401
4402
0
    fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4403
0
    if (!fp->idx) return -1;
4404
0
    fp->fnidx = fnidx;
4405
4406
0
    return 0;
4407
0
}
4408
4409
// Finishes an index. Call after the last record has been written.
4410
// Returns 0 on success, <0 on failure.
4411
//
4412
// NB: same format as SAM/BAM as it uses bgzf.
4413
0
int bcf_idx_save(htsFile *fp) {
4414
0
    return sam_idx_save(fp);
4415
0
}
4416
4417
/*****************
4418
 *** Utilities ***
4419
 *****************/
4420
4421
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
4422
0
{
4423
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
4424
0
    for (i=0; i<src->nhrec; i++)
4425
0
    {
4426
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4427
0
        {
4428
0
            int j;
4429
0
            for (j=0; j<ndst_ori; j++)
4430
0
            {
4431
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4432
4433
                // Checking only the key part of generic lines, otherwise
4434
                // the VCFs are too verbose. Should we perhaps add a flag
4435
                // to bcf_hdr_combine() and make this optional?
4436
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4437
0
            }
4438
0
            if ( j>=ndst_ori ) {
4439
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4440
0
                if (res < 0) return -1;
4441
0
                need_sync += res;
4442
0
            }
4443
0
        }
4444
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4445
0
        {
4446
            // NB: we are ignoring fields without ID
4447
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4448
0
            if ( j>=0 )
4449
0
            {
4450
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4451
0
                if ( !rec ) {
4452
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4453
0
                    if (res < 0) return -1;
4454
0
                    need_sync += res;
4455
0
                }
4456
0
            }
4457
0
        }
4458
0
        else
4459
0
        {
4460
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4461
0
            assert( j>=0 ); // this should always be true for valid VCFs
4462
4463
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4464
0
            if ( !rec ) {
4465
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4466
0
                if (res < 0) return -1;
4467
0
                need_sync += res;
4468
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4469
0
            {
4470
                // Check that both records are of the same type. The bcf_hdr_id2length
4471
                // macro cannot be used here because dst header is not synced yet.
4472
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4473
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4474
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4475
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4476
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4477
0
                {
4478
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4479
0
                        src->hrec[i]->vals[0]);
4480
0
                    ret |= 1;
4481
0
                }
4482
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4483
0
                {
4484
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4485
0
                        src->hrec[i]->vals[0]);
4486
0
                    ret |= 1;
4487
0
                }
4488
0
            }
4489
0
        }
4490
0
    }
4491
0
    if ( need_sync ) {
4492
0
        if (bcf_hdr_sync(dst) < 0) return -1;
4493
0
    }
4494
0
    return ret;
4495
0
}
4496
4497
bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
4498
0
{
4499
0
    if ( !dst )
4500
0
    {
4501
        // this will effectively strip existing IDX attributes from src to become dst
4502
0
        dst = bcf_hdr_init("r");
4503
0
        kstring_t htxt = {0,0,0};
4504
0
        if (bcf_hdr_format(src, 0, &htxt) < 0) {
4505
0
            free(htxt.s);
4506
0
            return NULL;
4507
0
        }
4508
0
        if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
4509
0
            bcf_hdr_destroy(dst);
4510
0
            dst = NULL;
4511
0
        }
4512
0
        free(htxt.s);
4513
0
        return dst;
4514
0
    }
4515
4516
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, res;
4517
0
    for (i=0; i<src->nhrec; i++)
4518
0
    {
4519
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4520
0
        {
4521
0
            int j;
4522
0
            for (j=0; j<ndst_ori; j++)
4523
0
            {
4524
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4525
4526
                // Checking only the key part of generic lines, otherwise
4527
                // the VCFs are too verbose. Should we perhaps add a flag
4528
                // to bcf_hdr_combine() and make this optional?
4529
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4530
0
            }
4531
0
            if ( j>=ndst_ori ) {
4532
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4533
0
                if (res < 0) return NULL;
4534
0
                need_sync += res;
4535
0
            }
4536
0
        }
4537
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4538
0
        {
4539
            // NB: we are ignoring fields without ID
4540
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4541
0
            if ( j>=0 )
4542
0
            {
4543
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4544
0
                if ( !rec ) {
4545
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4546
0
                    if (res < 0) return NULL;
4547
0
                    need_sync += res;
4548
0
                }
4549
0
            }
4550
0
        }
4551
0
        else
4552
0
        {
4553
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4554
0
            assert( j>=0 ); // this should always be true for valid VCFs
4555
4556
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4557
0
            if ( !rec ) {
4558
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4559
0
                if (res < 0) return NULL;
4560
0
                need_sync += res;
4561
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4562
0
            {
4563
                // Check that both records are of the same type. The bcf_hdr_id2length
4564
                // macro cannot be used here because dst header is not synced yet.
4565
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4566
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4567
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4568
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4569
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4570
0
                {
4571
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4572
0
                        src->hrec[i]->vals[0]);
4573
0
                }
4574
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4575
0
                {
4576
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4577
0
                        src->hrec[i]->vals[0]);
4578
0
                }
4579
0
            }
4580
0
        }
4581
0
    }
4582
0
    if ( need_sync ) {
4583
0
        if (bcf_hdr_sync(dst) < 0) return NULL;
4584
0
    }
4585
0
    return dst;
4586
0
}
4587
4588
int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
4589
0
{
4590
0
    int i;
4591
0
    if ( line->errcode )
4592
0
    {
4593
0
        char errordescription[1024] = "";
4594
0
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)),  bcf_seqname_safe(src_hdr,line), line->pos+1);
4595
0
        exit(1);
4596
0
    }
4597
0
    if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
4598
0
    if ( !src_hdr->ntransl )  // called for the first time, see what needs translating
4599
0
    {
4600
0
        int dict;
4601
0
        for (dict=0; dict<2; dict++)    // BCF_DT_ID and BCF_DT_CTG
4602
0
        {
4603
0
            src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
4604
0
            for (i=0; i<src_hdr->n[dict]; i++)
4605
0
            {
4606
0
                if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
4607
0
                {
4608
0
                    src_hdr->transl[dict][i] = -1;
4609
0
                    continue;
4610
0
                }
4611
0
                src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
4612
0
                if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
4613
0
            }
4614
0
        }
4615
0
        if ( !src_hdr->ntransl )
4616
0
        {
4617
0
            free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
4618
0
            free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
4619
0
            src_hdr->ntransl = -1;
4620
0
        }
4621
0
        if ( src_hdr->ntransl==-1 ) return 0;
4622
0
    }
4623
0
    bcf_unpack(line,BCF_UN_ALL);
4624
4625
    // CHROM
4626
0
    if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
4627
4628
    // FILTER
4629
0
    for (i=0; i<line->d.n_flt; i++)
4630
0
    {
4631
0
        int src_id = line->d.flt[i];
4632
0
        if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
4633
0
            line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
4634
0
        line->d.shared_dirty |= BCF1_DIRTY_FLT;
4635
0
    }
4636
4637
    // INFO
4638
0
    for (i=0; i<line->n_info; i++)
4639
0
    {
4640
0
        int src_id = line->d.info[i].key;
4641
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
4642
0
        if ( dst_id<0 ) continue;
4643
0
        line->d.info[i].key = dst_id;
4644
0
        if ( !line->d.info[i].vptr ) continue;  // skip deleted
4645
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4646
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4647
0
        if ( src_size==dst_size )   // can overwrite
4648
0
        {
4649
0
            uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
4650
0
            if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
4651
0
            else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
4652
0
            else { *(uint32_t*)vptr = (uint32_t)dst_id; }
4653
0
        }
4654
0
        else    // must realloc
4655
0
        {
4656
0
            bcf_info_t *info = &line->d.info[i];
4657
0
            kstring_t str = {0,0,0};
4658
0
            bcf_enc_int1(&str, dst_id);
4659
0
            bcf_enc_size(&str, info->len,info->type);
4660
0
            uint32_t vptr_off = str.l;
4661
0
            kputsn((char*)info->vptr, info->vptr_len, &str);
4662
0
            if( info->vptr_free ) free(info->vptr - info->vptr_off);
4663
0
            info->vptr_off = vptr_off;
4664
0
            info->vptr = (uint8_t*)str.s + info->vptr_off;
4665
0
            info->vptr_free = 1;
4666
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
4667
0
        }
4668
0
    }
4669
4670
    // FORMAT
4671
0
    for (i=0; i<line->n_fmt; i++)
4672
0
    {
4673
0
        int src_id = line->d.fmt[i].id;
4674
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
4675
0
        if ( dst_id<0 ) continue;
4676
0
        line->d.fmt[i].id = dst_id;
4677
0
        if( !line->d.fmt[i].p ) continue;  // skip deleted
4678
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4679
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4680
0
        if ( src_size==dst_size )   // can overwrite
4681
0
        {
4682
0
            uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off;    // pointer to the vector size (4bits) and BT type (4bits)
4683
0
            if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
4684
0
            else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
4685
0
            else { i32_to_le(dst_id, p + 1); }
4686
0
        }
4687
0
        else    // must realloc
4688
0
        {
4689
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
4690
0
            kstring_t str = {0,0,0};
4691
0
            bcf_enc_int1(&str, dst_id);
4692
0
            bcf_enc_size(&str, fmt->n, fmt->type);
4693
0
            uint32_t p_off = str.l;
4694
0
            kputsn((char*)fmt->p, fmt->p_len, &str);
4695
0
            if( fmt->p_free ) free(fmt->p - fmt->p_off);
4696
0
            fmt->p_off = p_off;
4697
0
            fmt->p = (uint8_t*)str.s + fmt->p_off;
4698
0
            fmt->p_free = 1;
4699
0
            line->d.indiv_dirty = 1;
4700
0
        }
4701
0
    }
4702
0
    return 0;
4703
0
}
4704
4705
bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
4706
0
{
4707
0
    bcf_hdr_t *hout = bcf_hdr_init("r");
4708
0
    if (!hout) {
4709
0
        hts_log_error("Failed to allocate bcf header");
4710
0
        return NULL;
4711
0
    }
4712
0
    kstring_t htxt = {0,0,0};
4713
0
    if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
4714
0
        free(htxt.s);
4715
0
        return NULL;
4716
0
    }
4717
0
    if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
4718
0
        bcf_hdr_destroy(hout);
4719
0
        hout = NULL;
4720
0
    }
4721
0
    free(htxt.s);
4722
0
    return hout;
4723
0
}
4724
4725
bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
4726
0
{
4727
0
    void *names_hash = khash_str2int_init();
4728
0
    kstring_t htxt = {0,0,0};
4729
0
    kstring_t str = {0,0,0};
4730
0
    bcf_hdr_t *h = bcf_hdr_init("w");
4731
0
    int r = 0;
4732
0
    if (!h || !names_hash) {
4733
0
        hts_log_error("Failed to allocate bcf header");
4734
0
        goto err;
4735
0
    }
4736
0
    if (bcf_hdr_format(h0, 1, &htxt) < 0) {
4737
0
        hts_log_error("Failed to get header text");
4738
0
        goto err;
4739
0
    }
4740
0
    bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
4741
0
    int j;
4742
0
    for (j=0; j<n; j++) imap[j] = -1;
4743
0
    if ( bcf_hdr_nsamples(h0) > 0) {
4744
0
        char *p = find_chrom_header_line(htxt.s);
4745
0
        int i = 0, end = n? 8 : 7;
4746
0
        while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
4747
0
        if (i != end) {
4748
0
            hts_log_error("Wrong number of columns in header #CHROM line");
4749
0
            goto err;
4750
0
        }
4751
0
        r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
4752
0
        for (i = 0; i < n; ++i) {
4753
0
            if ( khash_str2int_has_key(names_hash,samples[i]) )
4754
0
            {
4755
0
                hts_log_error("Duplicate sample name \"%s\"", samples[i]);
4756
0
                goto err;
4757
0
            }
4758
0
            imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
4759
0
            if (imap[i] < 0) continue;
4760
0
            r |= kputc('\t', &str) < 0;
4761
0
            r |= kputs(samples[i], &str) < 0;
4762
0
            r |= khash_str2int_inc(names_hash,samples[i]) < 0;
4763
0
        }
4764
0
    } else r |= kputsn(htxt.s, htxt.l, &str) < 0;
4765
0
    while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
4766
0
    r |= kputc('\n',&str) < 0;
4767
0
    if (r) {
4768
0
        hts_log_error("%s", strerror(errno));
4769
0
        goto err;
4770
0
    }
4771
0
    if ( bcf_hdr_parse(h, str.s) < 0 ) {
4772
0
        bcf_hdr_destroy(h);
4773
0
        h = NULL;
4774
0
    }
4775
0
    free(str.s);
4776
0
    free(htxt.s);
4777
0
    khash_str2int_destroy(names_hash);
4778
0
    return h;
4779
4780
0
 err:
4781
0
    ks_free(&str);
4782
0
    ks_free(&htxt);
4783
0
    khash_str2int_destroy(names_hash);
4784
0
    bcf_hdr_destroy(h);
4785
0
    return NULL;
4786
0
}
4787
4788
int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
4789
0
{
4790
0
    if ( samples && !strcmp("-",samples) ) return 0;            // keep all samples
4791
4792
0
    int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
4793
0
    hdr->keep_samples = (uint8_t*) calloc(narr,1);
4794
0
    if (!hdr->keep_samples) return -1;
4795
4796
0
    hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
4797
0
    if ( !samples )
4798
0
    {
4799
        // exclude all samples
4800
0
        khint_t k;
4801
0
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict;
4802
0
        new_dict = kh_init(vdict);
4803
0
        if (!new_dict) return -1;
4804
4805
0
        bcf_hdr_nsamples(hdr) = 0;
4806
4807
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
4808
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
4809
0
        kh_destroy(vdict, d);
4810
0
        hdr->dict[