Coverage Report

Created: 2024-02-25 06:34

/src/htslib/vcf.c
Line
Count
Source (jump to first uncovered line)
1
/*  vcf.c -- VCF/BCF API functions.
2
3
    Copyright (C) 2012, 2013 Broad Institute.
4
    Copyright (C) 2012-2023 Genome Research Ltd.
5
    Portions copyright (C) 2014 Intel Corporation.
6
7
    Author: Heng Li <lh3@sanger.ac.uk>
8
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
of this software and associated documentation files (the "Software"), to deal
11
in the Software without restriction, including without limitation the rights
12
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
copies of the Software, and to permit persons to whom the Software is
14
furnished to do so, subject to the following conditions:
15
16
The above copyright notice and this permission notice shall be included in
17
all copies or substantial portions of the Software.
18
19
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
DEALINGS IN THE SOFTWARE.  */
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdio.h>
31
#include <assert.h>
32
#include <string.h>
33
#include <strings.h>
34
#include <stdlib.h>
35
#include <limits.h>
36
#include <stdint.h>
37
#include <inttypes.h>
38
#include <errno.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
#include "htslib/vcf.h"
45
#include "htslib/bgzf.h"
46
#include "htslib/tbx.h"
47
#include "htslib/hfile.h"
48
#include "hts_internal.h"
49
#include "htslib/hts_endian.h"
50
#include "htslib/khash_str2int.h"
51
#include "htslib/kstring.h"
52
#include "htslib/sam.h"
53
#include "htslib/khash.h"
54
55
#if 0
56
// This helps on Intel a bit, often 6-7% faster VCF parsing.
57
// Conversely sometimes harms AMD Zen4 as ~9% slower.
58
// Possibly related to IPC differences.  However for now it's just a
59
// curiousity we ignore and stick with the simpler code.
60
//
61
// Left here as a hint for future explorers.
62
static inline int xstreq(const char *a, const char *b) {
63
    while (*a && *a == *b)
64
        a++, b++;
65
    return *a == *b;
66
}
67
68
#define KHASH_MAP_INIT_XSTR(name, khval_t) \
69
  KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq)
70
71
KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t)
72
#else
73
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
74
#endif
75
76
typedef khash_t(vdict) vdict_t;
77
78
KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
79
typedef khash_t(hdict) hdict_t;
80
81
82
#include "htslib/kseq.h"
83
HTSLIB_EXPORT
84
uint32_t bcf_float_missing    = 0x7F800001;
85
86
HTSLIB_EXPORT
87
uint32_t bcf_float_vector_end = 0x7F800002;
88
89
HTSLIB_EXPORT
90
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
91
92
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
93
94
/*
95
    Partial support for 64-bit POS and Number=1 INFO tags.
96
    Notes:
97
     - the support for 64-bit values is motivated by POS and INFO/END for large genomes
98
     - the use of 64-bit values does not conform to the specification
99
     - cannot output 64-bit BCF and if it does, it is not compatible with anything
100
     - experimental, use at your risk
101
*/
102
#ifdef VCF_ALLOW_INT64
103
    #define BCF_MAX_BT_INT64 (0x7fffffffffffffff)       /* INT64_MAX, for internal use only */
104
    #define BCF_MIN_BT_INT64 -9223372036854775800LL     /* INT64_MIN + 8, for internal use only */
105
#endif
106
107
385
#define BCF_IS_64BIT (1<<30)
108
109
110
// Opaque structure with auxilary data which allows to extend bcf_hdr_t without breaking ABI.
111
// Note that this preserving API and ABI requires that the first element is vdict_t struct
112
// rather than a pointer, as user programs may (and in some cases do) access the dictionary
113
// directly as (vdict_t*)hdr->dict.
114
typedef struct
115
{
116
    vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
117
    hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
118
    size_t *key_len;// length of h->id[BCF_DT_ID] strings
119
}
120
bcf_hdr_aux_t;
121
122
static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr)
123
307k
{
124
307k
    return (bcf_hdr_aux_t *)hdr->dict[0];
125
307k
}
126
127
static char *find_chrom_header_line(char *s)
128
0
{
129
0
    char *nl;
130
0
    if (strncmp(s, "#CHROM\t", 7) == 0) return s;
131
0
    else if ((nl = strstr(s, "\n#CHROM\t")) != NULL) return nl+1;
132
0
    else return NULL;
133
0
}
134
135
/*************************
136
 *** VCF header parser ***
137
 *************************/
138
139
static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
140
12.8k
{
141
12.8k
    const char *ss = s;
142
13.1k
    while ( *ss && isspace_c(*ss) && ss - s < len) ss++;
143
12.8k
    if ( !*ss || ss - s == len)
144
2
    {
145
2
        hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
146
2
        return -1;
147
2
    }
148
149
12.8k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
150
12.8k
    int ret;
151
12.8k
    char *sdup = malloc(len + 1);
152
12.8k
    if (!sdup) return -1;
153
12.8k
    memcpy(sdup, s, len);
154
12.8k
    sdup[len] = 0;
155
156
    // Ensure space is available in h->samples
157
12.8k
    size_t n = kh_size(d);
158
12.8k
    char **new_samples = realloc(h->samples, sizeof(char*) * (n + 1));
159
12.8k
    if (!new_samples) {
160
0
        free(sdup);
161
0
        return -1;
162
0
    }
163
12.8k
    h->samples = new_samples;
164
165
12.8k
    int k = kh_put(vdict, d, sdup, &ret);
166
12.8k
    if (ret < 0) {
167
0
        free(sdup);
168
0
        return -1;
169
0
    }
170
12.8k
    if (ret) { // absent
171
12.8k
        kh_val(d, k) = bcf_idinfo_def;
172
12.8k
        kh_val(d, k).id = n;
173
12.8k
    } else {
174
0
        hts_log_error("Duplicated sample name '%s'", sdup);
175
0
        free(sdup);
176
0
        return -1;
177
0
    }
178
12.8k
    h->samples[n] = sdup;
179
12.8k
    h->dirty = 1;
180
12.8k
    return 0;
181
12.8k
}
182
183
int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
184
0
{
185
0
    if (!s) {
186
        // Allowed for backwards-compatibility, calling with s == NULL
187
        // used to trigger bcf_hdr_sync(h);
188
0
        return 0;
189
0
    }
190
0
    return bcf_hdr_add_sample_len(h, s, strlen(s));
191
0
}
192
193
int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str)
194
4.90k
{
195
4.90k
    const char *mandatory = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
196
4.90k
    if ( strncmp(str,mandatory,strlen(mandatory)) )
197
104
    {
198
104
        hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str);
199
104
        return -1;
200
104
    }
201
202
4.80k
    const char *beg = str + strlen(mandatory), *end;
203
4.80k
    if ( !*beg || *beg=='\n' ) return 0;
204
1.63k
    if ( strncmp(beg,"\tFORMAT\t",8) )
205
42
    {
206
42
        hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str);
207
42
        return -1;
208
42
    }
209
1.59k
    beg += 8;
210
211
1.59k
    int ret = 0;
212
12.9k
    while ( *beg )
213
12.8k
    {
214
12.8k
        end = beg;
215
324M
        while ( *end && *end!='\t' && *end!='\n' ) end++;
216
12.8k
        if ( bcf_hdr_add_sample_len(hdr, beg, end-beg) < 0 ) ret = -1;
217
12.8k
        if ( !*end || *end=='\n' || ret<0 ) break;
218
11.3k
        beg = end + 1;
219
11.3k
    }
220
1.59k
    return ret;
221
1.63k
}
222
223
int bcf_hdr_sync(bcf_hdr_t *h)
224
109k
{
225
109k
    int i;
226
439k
    for (i = 0; i < 3; i++)
227
329k
    {
228
329k
        vdict_t *d = (vdict_t*)h->dict[i];
229
329k
        khint_t k;
230
329k
        if ( h->n[i] < kh_size(d) )
231
1.56k
        {
232
1.56k
            bcf_idpair_t *new_idpair;
233
            // this should be true only for i=2, BCF_DT_SAMPLE
234
1.56k
            new_idpair = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t));
235
1.56k
            if (!new_idpair) return -1;
236
1.56k
            h->n[i] = kh_size(d);
237
1.56k
            h->id[i] = new_idpair;
238
1.56k
        }
239
3.82G
        for (k=kh_begin(d); k<kh_end(d); k++)
240
3.82G
        {
241
3.82G
            if (!kh_exist(d,k)) continue;
242
22.7M
            h->id[i][kh_val(d,k).id].key = kh_key(d,k);
243
22.7M
            h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
244
22.7M
        }
245
329k
    }
246
247
    // Invalidate key length cache
248
109k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
249
109k
    if (aux && aux->key_len) {
250
2.53k
        free(aux->key_len);
251
2.53k
        aux->key_len = NULL;
252
2.53k
    }
253
254
109k
    h->dirty = 0;
255
109k
    return 0;
256
109k
}
257
258
void bcf_hrec_destroy(bcf_hrec_t *hrec)
259
188k
{
260
188k
    if (!hrec) return;
261
181k
    free(hrec->key);
262
181k
    if ( hrec->value ) free(hrec->value);
263
181k
    int i;
264
687k
    for (i=0; i<hrec->nkeys; i++)
265
506k
    {
266
506k
        free(hrec->keys[i]);
267
506k
        free(hrec->vals[i]);
268
506k
    }
269
181k
    free(hrec->keys);
270
181k
    free(hrec->vals);
271
181k
    free(hrec);
272
181k
}
273
274
// Copies all fields except IDX.
275
bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
276
0
{
277
0
    int save_errno;
278
0
    bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
279
0
    if (!out) return NULL;
280
281
0
    out->type = hrec->type;
282
0
    if ( hrec->key ) {
283
0
        out->key = strdup(hrec->key);
284
0
        if (!out->key) goto fail;
285
0
    }
286
0
    if ( hrec->value ) {
287
0
        out->value = strdup(hrec->value);
288
0
        if (!out->value) goto fail;
289
0
    }
290
0
    out->nkeys = hrec->nkeys;
291
0
    out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
292
0
    if (!out->keys) goto fail;
293
0
    out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
294
0
    if (!out->vals) goto fail;
295
0
    int i, j = 0;
296
0
    for (i=0; i<hrec->nkeys; i++)
297
0
    {
298
0
        if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
299
0
        if ( hrec->keys[i] ) {
300
0
            out->keys[j] = strdup(hrec->keys[i]);
301
0
            if (!out->keys[j]) goto fail;
302
0
        }
303
0
        if ( hrec->vals[i] ) {
304
0
            out->vals[j] = strdup(hrec->vals[i]);
305
0
            if (!out->vals[j]) goto fail;
306
0
        }
307
0
        j++;
308
0
    }
309
0
    if ( i!=j ) out->nkeys -= i-j;   // IDX was omitted
310
0
    return out;
311
312
0
 fail:
313
0
    save_errno = errno;
314
0
    hts_log_error("%s", strerror(errno));
315
0
    bcf_hrec_destroy(out);
316
0
    errno = save_errno;
317
0
    return NULL;
318
0
}
319
320
void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
321
0
{
322
0
    fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
323
0
    int i;
324
0
    for (i=0; i<hrec->nkeys; i++)
325
0
        fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
326
0
    fprintf(fp, "\n");
327
0
}
328
329
void bcf_header_debug(bcf_hdr_t *hdr)
330
0
{
331
0
    int i, j;
332
0
    for (i=0; i<hdr->nhrec; i++)
333
0
    {
334
0
        if ( !hdr->hrec[i]->value )
335
0
        {
336
0
            fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
337
0
            fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
338
0
            for (j=1; j<hdr->hrec[i]->nkeys; j++)
339
0
                fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
340
0
            fprintf(stderr,">\n");
341
0
        }
342
0
        else
343
0
            fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
344
0
    }
345
0
}
346
347
int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
348
392k
{
349
392k
    char **tmp;
350
392k
    size_t n = hrec->nkeys + 1;
351
392k
    assert(len > 0 && len < SIZE_MAX);
352
392k
    tmp = realloc(hrec->keys, sizeof(char*)*n);
353
392k
    if (!tmp) return -1;
354
392k
    hrec->keys = tmp;
355
392k
    tmp = realloc(hrec->vals, sizeof(char*)*n);
356
392k
    if (!tmp) return -1;
357
392k
    hrec->vals = tmp;
358
359
392k
    hrec->keys[hrec->nkeys] = (char*) malloc((len+1)*sizeof(char));
360
392k
    if (!hrec->keys[hrec->nkeys]) return -1;
361
392k
    memcpy(hrec->keys[hrec->nkeys],str,len);
362
392k
    hrec->keys[hrec->nkeys][len] = 0;
363
392k
    hrec->vals[hrec->nkeys] = NULL;
364
392k
    hrec->nkeys = n;
365
392k
    return 0;
366
392k
}
367
368
int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
369
392k
{
370
392k
    if ( hrec->vals[i] ) {
371
0
        free(hrec->vals[i]);
372
0
        hrec->vals[i] = NULL;
373
0
    }
374
392k
    if ( !str ) return 0;
375
392k
    if ( is_quoted )
376
108k
    {
377
108k
        if (len >= SIZE_MAX - 3) {
378
0
            errno = ENOMEM;
379
0
            return -1;
380
0
        }
381
108k
        hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
382
108k
        if (!hrec->vals[i]) return -1;
383
108k
        hrec->vals[i][0] = '"';
384
108k
        memcpy(&hrec->vals[i][1],str,len);
385
108k
        hrec->vals[i][len+1] = '"';
386
108k
        hrec->vals[i][len+2] = 0;
387
108k
    }
388
283k
    else
389
283k
    {
390
283k
        if (len == SIZE_MAX) {
391
0
            errno = ENOMEM;
392
0
            return -1;
393
0
        }
394
283k
        hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
395
283k
        if (!hrec->vals[i]) return -1;
396
283k
        memcpy(hrec->vals[i],str,len);
397
283k
        hrec->vals[i][len] = 0;
398
283k
    }
399
392k
    return 0;
400
392k
}
401
402
int hrec_add_idx(bcf_hrec_t *hrec, int idx)
403
114k
{
404
114k
    int n = hrec->nkeys + 1;
405
114k
    char **tmp = (char**) realloc(hrec->keys, sizeof(char*)*n);
406
114k
    if (!tmp) return -1;
407
114k
    hrec->keys = tmp;
408
409
114k
    tmp = (char**) realloc(hrec->vals, sizeof(char*)*n);
410
114k
    if (!tmp) return -1;
411
114k
    hrec->vals = tmp;
412
413
114k
    hrec->keys[hrec->nkeys] = strdup("IDX");
414
114k
    if (!hrec->keys[hrec->nkeys]) return -1;
415
416
114k
    kstring_t str = {0,0,0};
417
114k
    if (kputw(idx, &str) < 0) {
418
0
        free(hrec->keys[hrec->nkeys]);
419
0
        return -1;
420
0
    }
421
114k
    hrec->vals[hrec->nkeys] = str.s;
422
114k
    hrec->nkeys = n;
423
114k
    return 0;
424
114k
}
425
426
int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
427
149k
{
428
149k
    int i;
429
227k
    for (i=0; i<hrec->nkeys; i++)
430
175k
        if ( !strcasecmp(key,hrec->keys[i]) ) return i;
431
51.4k
    return -1;
432
149k
}
433
434
static void bcf_hrec_set_type(bcf_hrec_t *hrec)
435
347k
{
436
347k
    if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG;
437
302k
    else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
438
185k
    else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
439
84.1k
    else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
440
60.2k
    else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR;
441
43.6k
    else hrec->type = BCF_HL_GEN;
442
347k
}
443
444
445
/**
446
    The arrays were generated with
447
448
    valid_ctg:
449
        perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
450
451
    valid_tag:
452
        perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
453
*/
454
static const uint8_t valid_ctg[256] =
455
{
456
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
457
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
458
    0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
459
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
460
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
461
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
462
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
463
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
464
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
465
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
466
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
467
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
468
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
469
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
470
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
471
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
472
};
473
static const uint8_t valid_tag[256] =
474
{
475
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
476
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
477
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
478
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
479
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
480
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
481
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
482
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
483
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
484
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
485
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
486
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
487
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
488
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
489
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
490
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
491
};
492
493
/**
494
    bcf_hrec_check() - check the validity of structured header lines
495
496
    Returns 0 on success or negative value on error.
497
498
    Currently the return status is not checked by the caller
499
    and only a warning is printed on stderr. This should be improved
500
    to propagate the error all the way up to the caller and let it
501
    decide what to do: throw an error or proceed anyway.
502
 */
503
static int bcf_hrec_check(bcf_hrec_t *hrec)
504
173k
{
505
173k
    int i;
506
173k
    bcf_hrec_set_type(hrec);
507
508
173k
    if ( hrec->type==BCF_HL_CTG )
509
22.3k
    {
510
22.3k
        i = bcf_hrec_find_key(hrec,"ID");
511
22.3k
        if ( i<0 ) goto err_missing_id;
512
9.90k
        char *val = hrec->vals[i];
513
9.90k
        if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg;
514
102k
        while ( *(++val) )
515
101k
            if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg;
516
1.46k
        return 0;
517
2.62k
    }
518
151k
    if ( hrec->type==BCF_HL_INFO )
519
58.5k
    {
520
58.5k
        i = bcf_hrec_find_key(hrec,"ID");
521
58.5k
        if ( i<0 ) goto err_missing_id;
522
52.8k
        char *val = hrec->vals[i];
523
52.8k
        if ( !strcmp(val,"1000G") ) return 0;
524
52.7k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
525
12.8k
        while ( *(++val) )
526
10.4k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
527
2.42k
        return 0;
528
4.49k
    }
529
92.6k
    if ( hrec->type==BCF_HL_FMT )
530
11.9k
    {
531
11.9k
        i = bcf_hrec_find_key(hrec,"ID");
532
11.9k
        if ( i<0 ) goto err_missing_id;
533
10.8k
        char *val = hrec->vals[i];
534
10.8k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
535
33.7k
        while ( *(++val) )
536
31.6k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
537
2.18k
        return 0;
538
3.98k
    }
539
80.7k
    return 0;
540
541
19.2k
  err_missing_id:
542
19.2k
    hts_log_warning("Missing ID attribute in one or more header lines");
543
19.2k
    return -1;
544
545
8.43k
  err_invalid_ctg:
546
8.43k
    hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]);
547
8.43k
    return -1;
548
549
59.0k
  err_invalid_tag:
550
59.0k
    hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]);
551
59.0k
    return -1;
552
92.6k
}
553
554
static inline int is_escaped(const char *min, const char *str)
555
108k
{
556
108k
    int n = 0;
557
108k
    while ( --str>=min && *str=='\\' ) n++;
558
108k
    return n%2;
559
108k
}
560
561
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
562
196k
{
563
196k
    bcf_hrec_t *hrec = NULL;
564
196k
    const char *p = line;
565
196k
    if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
566
188k
    p += 2;
567
568
188k
    const char *q = p;
569
1.42M
    while ( *q && *q!='=' && *q != '\n' ) q++;
570
188k
    ptrdiff_t n = q-p;
571
188k
    if ( *q!='=' || !n ) // wrong format
572
7.40k
        goto malformed_line;
573
574
181k
    hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
575
181k
    if (!hrec) { *len = -1; return NULL; }
576
181k
    hrec->key = (char*) malloc(sizeof(char)*(n+1));
577
181k
    if (!hrec->key) goto fail;
578
181k
    memcpy(hrec->key,p,n);
579
181k
    hrec->key[n] = 0;
580
181k
    hrec->type = -1;
581
582
181k
    p = ++q;
583
181k
    if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
584
23.0k
    {
585
12.9M
        while ( *q && *q!='\n' ) q++;
586
23.0k
        hrec->value = (char*) malloc((q-p+1)*sizeof(char));
587
23.0k
        if (!hrec->value) goto fail;
588
23.0k
        memcpy(hrec->value, p, q-p);
589
23.0k
        hrec->value[q-p] = 0;
590
23.0k
        *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
591
23.0k
        return hrec;
592
23.0k
    }
593
594
    // structured line, e.g.
595
    // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
596
    // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
597
158k
    int nopen = 1;
598
550k
    while ( *q && *q!='\n' && nopen>0 )
599
399k
    {
600
399k
        p = ++q;
601
399k
        while ( *q && *q==' ' ) { p++; q++; }
602
        // ^[A-Za-z_][0-9A-Za-z_.]*$
603
399k
        if (p==q && *q && (isalpha_c(*q) || *q=='_'))
604
397k
        {
605
397k
            q++;
606
2.30M
            while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
607
397k
        }
608
399k
        n = q-p;
609
399k
        int m = 0;
610
399k
        while ( *q && *q==' ' ) { q++; m++; }
611
399k
        if ( *q!='=' || !n )
612
7.47k
            goto malformed_line;
613
614
392k
        if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
615
392k
        p = ++q;
616
394k
        while ( *q && *q==' ' ) { p++; q++; }
617
618
392k
        int quoted = 0;
619
392k
        char ending = '\0';
620
392k
        switch (*p) {
621
108k
        case '"':
622
108k
            quoted = 1;
623
108k
            ending = '"';
624
108k
            p++;
625
108k
            break;
626
21
        case '[':
627
21
            quoted = 1;
628
21
            ending = ']';
629
21
            break;
630
392k
        }
631
392k
        if ( quoted ) q++;
632
393M
        while ( *q && *q != '\n' )
633
393M
        {
634
393M
            if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; }
635
392M
            else
636
392M
            {
637
392M
                if ( *q=='<' ) nopen++;
638
392M
                if ( *q=='>' ) nopen--;
639
392M
                if ( !nopen ) break;
640
392M
                if ( *q==',' && nopen==1 ) break;
641
392M
            }
642
392M
            q++;
643
392M
        }
644
392k
        const char *r = q;
645
392k
        if (quoted && ending == ']') {
646
21
            if (*q == ending) {
647
4
                r++;
648
4
                q++;
649
4
                quoted = 0;
650
17
            } else {
651
17
                char buffer[320];
652
17
                hts_log_error("Missing ']' in header line %s",
653
17
                              hts_strprint(buffer, sizeof(buffer), '"',
654
17
                                           line, q-line));
655
17
                goto fail;
656
17
            }
657
21
        }
658
392k
        while ( r > p && r[-1] == ' ' ) r--;
659
392k
        if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
660
0
            goto fail;
661
392k
        if ( quoted && *q==ending ) q++;
662
392k
        if ( *q=='>' )
663
112k
        {
664
112k
            if (nopen) nopen--;     // this can happen with nested angle brackets <>
665
112k
            q++;
666
112k
        }
667
392k
    }
668
150k
    if ( nopen )
669
38.5k
        hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]);
670
671
    // Skip to end of line
672
150k
    int nonspace = 0;
673
150k
    p = q;
674
6.85M
    while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
675
150k
    if (nonspace) {
676
1.34k
        char buffer[320];
677
1.34k
        hts_log_warning("Dropped trailing junk from header line '%s'",
678
1.34k
                        hts_strprint(buffer, sizeof(buffer),
679
1.34k
                                     '"', line, q - line));
680
1.34k
    }
681
682
150k
    *len = q - line + (*q ? 1 : 0);
683
150k
    return hrec;
684
685
17
 fail:
686
17
    *len = -1;
687
17
    bcf_hrec_destroy(hrec);
688
17
    return NULL;
689
690
14.8k
 malformed_line:
691
14.8k
    {
692
14.8k
        char buffer[320];
693
1.75M
        while ( *q && *q!='\n' ) q++;  // Ensure *len includes full line
694
14.8k
        hts_log_error("Could not parse the header line: %s",
695
14.8k
                      hts_strprint(buffer, sizeof(buffer),
696
14.8k
                                   '"', line, q - line));
697
14.8k
        *len = q - line + (*q ? 1 : 0);
698
14.8k
        bcf_hrec_destroy(hrec);
699
14.8k
        return NULL;
700
158k
    }
701
158k
}
702
703
static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
704
112k
{
705
112k
    size_t new_n;
706
707
    // If available, preserve existing IDX
708
112k
    if ( idinfo->id==-1 )
709
112k
        idinfo->id = hdr->n[dict_type];
710
346
    else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
711
15
    {
712
15
        hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
713
15
            idinfo->id, tag);
714
15
        errno = EINVAL;
715
15
        return -1;
716
15
    }
717
718
112k
    new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type];
719
112k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
720
    // hts_resize() can attempt to allocate up to 2 * requested items
721
112k
    if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t)))
722
6
        return -1;
723
112k
#endif
724
112k
    if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
725
112k
                   &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
726
0
        return -1;
727
0
    }
728
112k
    hdr->n[dict_type] = new_n;
729
730
    // NB: the next kh_put call can invalidate the idinfo pointer, therefore
731
    // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
732
112k
    hdr->id[dict_type][idinfo->id].key = tag;
733
734
112k
    return 0;
735
112k
}
736
737
// returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
738
static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
739
173k
{
740
    // contig
741
173k
    int i, ret, replacing = 0;
742
173k
    khint_t k;
743
173k
    char *str = NULL;
744
745
173k
    bcf_hrec_set_type(hrec);
746
747
173k
    if ( hrec->type==BCF_HL_CTG )
748
22.3k
    {
749
22.3k
        hts_pos_t len = 0;
750
751
        // Get the contig ID ($str) and length ($j)
752
22.3k
        i = bcf_hrec_find_key(hrec,"length");
753
22.3k
        if ( i<0 ) len = 0;
754
7.58k
        else {
755
7.58k
            char *end = hrec->vals[i];
756
7.58k
            len = strtoll(hrec->vals[i], &end, 10);
757
7.58k
            if (end == hrec->vals[i] || len < 0) return 0;
758
7.58k
        }
759
760
18.3k
        i = bcf_hrec_find_key(hrec,"ID");
761
18.3k
        if ( i<0 ) return 0;
762
9.90k
        str = strdup(hrec->vals[i]);
763
9.90k
        if (!str) return -1;
764
765
        // Register in the dictionary
766
9.90k
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
767
9.90k
        khint_t k = kh_get(vdict, d, str);
768
9.90k
        if ( k != kh_end(d) ) { // already present
769
2.77k
            free(str); str=NULL;
770
2.77k
            if (kh_val(d, k).hrec[0] != NULL) // and not removed
771
2.77k
                return 0;
772
0
            replacing = 1;
773
7.12k
        } else {
774
7.12k
            k = kh_put(vdict, d, str, &ret);
775
7.12k
            if (ret < 0) { free(str); return -1; }
776
7.12k
        }
777
778
7.12k
        int idx = bcf_hrec_find_key(hrec,"IDX");
779
7.12k
        if ( idx!=-1 )
780
1.71k
        {
781
1.71k
            char *tmp = hrec->vals[idx];
782
1.71k
            idx = strtol(hrec->vals[idx], &tmp, 10);
783
1.71k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
784
1.65k
            {
785
1.65k
                if (!replacing) {
786
1.65k
                    kh_del(vdict, d, k);
787
1.65k
                    free(str);
788
1.65k
                }
789
1.65k
                hts_log_warning("Error parsing the IDX tag, skipping");
790
1.65k
                return 0;
791
1.65k
            }
792
1.71k
        }
793
794
5.47k
        kh_val(d, k) = bcf_idinfo_def;
795
5.47k
        kh_val(d, k).id = idx;
796
5.47k
        kh_val(d, k).info[0] = len;
797
5.47k
        kh_val(d, k).hrec[0] = hrec;
798
5.47k
        if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
799
6
            if (!replacing) {
800
6
                kh_del(vdict, d, k);
801
6
                free(str);
802
6
            }
803
6
            return -1;
804
6
        }
805
5.47k
        if ( idx==-1 ) {
806
5.41k
            if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
807
0
               return -1;
808
0
            }
809
5.41k
        }
810
811
5.47k
        return 1;
812
5.47k
    }
813
814
151k
    if ( hrec->type==BCF_HL_STR ) return 1;
815
142k
    if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0;
816
817
    // INFO/FILTER/FORMAT
818
121k
    char *id = NULL;
819
121k
    uint32_t type = UINT32_MAX, var = UINT32_MAX;
820
121k
    int num = -1, idx = -1;
821
468k
    for (i=0; i<hrec->nkeys; i++)
822
348k
    {
823
348k
        if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
824
235k
        else if ( !strcmp(hrec->keys[i], "IDX") )
825
4.53k
        {
826
4.53k
            char *tmp = hrec->vals[i];
827
4.53k
            idx = strtol(hrec->vals[i], &tmp, 10);
828
4.53k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
829
1.31k
            {
830
1.31k
                hts_log_warning("Error parsing the IDX tag, skipping");
831
1.31k
                return 0;
832
1.31k
            }
833
4.53k
        }
834
230k
        else if ( !strcmp(hrec->keys[i], "Type") )
835
62.6k
        {
836
62.6k
            if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
837
60.9k
            else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
838
59.5k
            else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
839
4.35k
            else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
840
4.16k
            else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
841
2.47k
            else
842
2.47k
            {
843
2.47k
                hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
844
2.47k
                type = BCF_HT_STR;
845
2.47k
            }
846
62.6k
        }
847
168k
        else if ( !strcmp(hrec->keys[i], "Number") )
848
57.3k
        {
849
57.3k
            if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
850
57.0k
            else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
851
57.0k
            else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
852
57.0k
            else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
853
57.0k
            else
854
57.0k
            {
855
57.0k
                sscanf(hrec->vals[i],"%d",&num);
856
57.0k
                var = BCF_VL_FIXED;
857
57.0k
            }
858
57.3k
            if (var != BCF_VL_FIXED) num = 0xfffff;
859
57.3k
        }
860
348k
    }
861
119k
    if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
862
69.3k
        if (type == -1) {
863
6.72k
            hts_log_warning("%s %s field has no Type defined. Assuming String",
864
6.72k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
865
6.72k
            type = BCF_HT_STR;
866
6.72k
        }
867
69.3k
        if (var == -1) {
868
11.9k
            hts_log_warning("%s %s field has no Number defined. Assuming '.'",
869
11.9k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
870
11.9k
            var = BCF_VL_VAR;
871
11.9k
        }
872
69.3k
        if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) )
873
438
        {
874
438
            hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id);
875
438
            var = BCF_VL_FIXED;
876
438
            num = 0;
877
438
        }
878
69.3k
    }
879
119k
    uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
880
119k
                     (var & 0xf) << 8 |
881
119k
                     (type & 0xf) << 4 |
882
119k
                     (((uint32_t) hrec->type) & 0xf));
883
884
119k
    if ( !id ) return 0;
885
113k
    str = strdup(id);
886
113k
    if (!str) return -1;
887
888
113k
    vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
889
113k
    k = kh_get(vdict, d, str);
890
113k
    if ( k != kh_end(d) )
891
6.20k
    {
892
        // already present
893
6.20k
        free(str);
894
6.20k
        if ( kh_val(d, k).hrec[info&0xf] ) return 0;
895
1.99k
        kh_val(d, k).info[info&0xf] = info;
896
1.99k
        kh_val(d, k).hrec[info&0xf] = hrec;
897
1.99k
        if ( idx==-1 ) {
898
1.92k
            if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
899
0
                return -1;
900
0
            }
901
1.92k
        }
902
1.99k
        return 1;
903
1.99k
    }
904
107k
    k = kh_put(vdict, d, str, &ret);
905
107k
    if (ret < 0) {
906
0
        free(str);
907
0
        return -1;
908
0
    }
909
107k
    kh_val(d, k) = bcf_idinfo_def;
910
107k
    kh_val(d, k).info[info&0xf] = info;
911
107k
    kh_val(d, k).hrec[info&0xf] = hrec;
912
107k
    kh_val(d, k).id = idx;
913
107k
    if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
914
15
        kh_del(vdict, d, k);
915
15
        free(str);
916
15
        return -1;
917
15
    }
918
107k
    if ( idx==-1 ) {
919
106k
        if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
920
0
            return -1;
921
0
        }
922
106k
    }
923
924
107k
    return 1;
925
107k
}
926
927
static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
928
0
{
929
0
    if (hrec->type == BCF_HL_FLT ||
930
0
        hrec->type == BCF_HL_INFO ||
931
0
        hrec->type == BCF_HL_FMT ||
932
0
        hrec->type == BCF_HL_CTG) {
933
0
        int id = bcf_hrec_find_key(hrec, "ID");
934
0
        if (id < 0 || !hrec->vals[id])
935
0
            return;
936
0
        vdict_t *dict = (hrec->type == BCF_HL_CTG
937
0
                         ? (vdict_t*)hdr->dict[BCF_DT_CTG]
938
0
                         : (vdict_t*)hdr->dict[BCF_DT_ID]);
939
0
        khint_t k = kh_get(vdict, dict, hrec->vals[id]);
940
0
        if (k != kh_end(dict))
941
0
            kh_val(dict, k).hrec[hrec->type==BCF_HL_CTG ? 0 : hrec->type] = NULL;
942
0
    }
943
0
}
944
945
static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
946
0
{
947
0
    kstring_t str = KS_INITIALIZE;
948
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
949
0
    khint_t k;
950
0
    int id;
951
952
0
    switch (hrec->type) {
953
0
    case BCF_HL_GEN:
954
0
        if (ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0)
955
0
            str.l = 0;
956
0
        break;
957
0
    case BCF_HL_STR:
958
0
        id = bcf_hrec_find_key(hrec, "ID");
959
0
        if (id < 0)
960
0
            return;
961
0
        if (!hrec->vals[id] ||
962
0
            ksprintf(&str, "##%s=<ID=%s>", hrec->key, hrec->vals[id]) < 0)
963
0
            str.l = 0;
964
0
        break;
965
0
    default:
966
0
        return;
967
0
    }
968
0
    if (str.l) {
969
0
        k = kh_get(hdict, aux->gen, str.s);
970
0
    } else {
971
        // Couldn't get a string for some reason, so try the hard way...
972
0
        for (k = kh_begin(aux->gen); k < kh_end(aux->gen); k++) {
973
0
            if (kh_exist(aux->gen, k) && kh_val(aux->gen, k) == hrec)
974
0
                break;
975
0
        }
976
0
    }
977
0
    if (k != kh_end(aux->gen) && kh_val(aux->gen, k) == hrec) {
978
0
        kh_val(aux->gen, k) = NULL;
979
0
        free((char *) kh_key(aux->gen, k));
980
0
        kh_key(aux->gen, k) = NULL;
981
0
        kh_del(hdict, aux->gen, k);
982
0
    }
983
0
    free(str.s);
984
0
}
985
986
int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp)
987
0
{
988
    // currently only for bcf_hdr_set_version
989
0
    assert( hrec->type==BCF_HL_GEN );
990
0
    int ret;
991
0
    khint_t k;
992
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
993
0
    for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
994
0
    {
995
0
        if ( !kh_exist(aux->gen,k) ) continue;
996
0
        if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue;
997
0
        break;
998
0
    }
999
0
    assert( k<kh_end(aux->gen) );   // something went wrong, should never happen
1000
0
    free((char*)kh_key(aux->gen,k));
1001
0
    kh_del(hdict,aux->gen,k);
1002
0
    kstring_t str = {0,0,0};
1003
0
    if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 )
1004
0
    {
1005
0
        free(str.s);
1006
0
        return -1;
1007
0
    }
1008
0
    k = kh_put(hdict, aux->gen, str.s, &ret);
1009
0
    if ( ret<0 )
1010
0
    {
1011
0
        free(str.s);
1012
0
        return -1;
1013
0
    }
1014
0
    free(hrec->value);
1015
0
    hrec->value = strdup(tmp->value);
1016
0
    if ( !hrec->value ) return -1;
1017
0
    return 0;
1018
0
}
1019
1020
int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1021
174k
{
1022
174k
    kstring_t str = {0,0,0};
1023
174k
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1024
1025
174k
    int res;
1026
174k
    if ( !hrec ) return 0;
1027
1028
173k
    bcf_hrec_check(hrec);   // todo: check return status and propagate errors up
1029
1030
173k
    res = bcf_hdr_register_hrec(hdr,hrec);
1031
173k
    if (res < 0) return -1;
1032
173k
    if ( !res )
1033
50.7k
    {
1034
        // If one of the hashed field, then it is already present
1035
50.7k
        if ( hrec->type != BCF_HL_GEN )
1036
28.9k
        {
1037
28.9k
            bcf_hrec_destroy(hrec);
1038
28.9k
            return 0;
1039
28.9k
        }
1040
1041
        // Is one of the generic fields and already present?
1042
21.8k
        if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 )
1043
0
        {
1044
0
            free(str.s);
1045
0
            return -1;
1046
0
        }
1047
21.8k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1048
21.8k
        if ( k != kh_end(aux->gen) )
1049
10.5k
        {
1050
            // duplicate record
1051
10.5k
            bcf_hrec_destroy(hrec);
1052
10.5k
            free(str.s);
1053
10.5k
            return 0;
1054
10.5k
        }
1055
21.8k
    }
1056
1057
134k
    int i;
1058
134k
    if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 )
1059
4.83k
    {
1060
4.83k
        if ( ksprintf(&str, "##%s=<ID=%s>", hrec->key,hrec->vals[i]) < 0 )
1061
0
        {
1062
0
            free(str.s);
1063
0
            return -1;
1064
0
        }
1065
4.83k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1066
4.83k
        if ( k != kh_end(aux->gen) )
1067
3.53k
        {
1068
            // duplicate record
1069
3.53k
            bcf_hrec_destroy(hrec);
1070
3.53k
            free(str.s);
1071
3.53k
            return 0;
1072
3.53k
        }
1073
4.83k
    }
1074
1075
    // New record, needs to be added
1076
130k
    int n = hdr->nhrec + 1;
1077
130k
    bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
1078
130k
    if (!new_hrec) {
1079
0
        free(str.s);
1080
0
        bcf_hdr_unregister_hrec(hdr, hrec);
1081
0
        return -1;
1082
0
    }
1083
130k
    hdr->hrec = new_hrec;
1084
1085
130k
    if ( str.s )
1086
12.5k
    {
1087
12.5k
        khint_t k = kh_put(hdict, aux->gen, str.s, &res);
1088
12.5k
        if ( res<0 )
1089
0
        {
1090
0
            free(str.s);
1091
0
            return -1;
1092
0
        }
1093
12.5k
        kh_val(aux->gen,k) = hrec;
1094
12.5k
    }
1095
1096
130k
    hdr->hrec[hdr->nhrec] = hrec;
1097
130k
    hdr->dirty = 1;
1098
130k
    hdr->nhrec = n;
1099
1100
130k
    return hrec->type==BCF_HL_GEN ? 0 : 1;
1101
130k
}
1102
1103
bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
1104
0
{
1105
0
    int i;
1106
0
    if ( type==BCF_HL_GEN )
1107
0
    {
1108
        // e.g. ##fileformat=VCFv4.2
1109
        //      ##source=GenomicsDBImport
1110
        //      ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364
1111
0
        if ( value )
1112
0
        {
1113
0
            kstring_t str = {0,0,0};
1114
0
            ksprintf(&str, "##%s=%s", key,value);
1115
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1116
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1117
0
            free(str.s);
1118
0
            if ( k == kh_end(aux->gen) ) return NULL;
1119
0
            return kh_val(aux->gen, k);
1120
0
        }
1121
0
        for (i=0; i<hdr->nhrec; i++)
1122
0
        {
1123
0
            if ( hdr->hrec[i]->type!=type ) continue;
1124
0
            if ( strcmp(hdr->hrec[i]->key,key) ) continue;
1125
0
            return hdr->hrec[i];
1126
0
        }
1127
0
        return NULL;
1128
0
    }
1129
0
    else if ( type==BCF_HL_STR )
1130
0
    {
1131
        // e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport....">
1132
        //      ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
1133
0
        if (!str_class) return NULL;
1134
0
        if ( !strcmp("ID",key) )
1135
0
        {
1136
0
            kstring_t str = {0,0,0};
1137
0
            ksprintf(&str, "##%s=<%s=%s>",str_class,key,value);
1138
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1139
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1140
0
            free(str.s);
1141
0
            if ( k == kh_end(aux->gen) ) return NULL;
1142
0
            return kh_val(aux->gen, k);
1143
0
        }
1144
0
        for (i=0; i<hdr->nhrec; i++)
1145
0
        {
1146
0
            if ( hdr->hrec[i]->type!=type ) continue;
1147
0
            if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
1148
0
            int j = bcf_hrec_find_key(hdr->hrec[i],key);
1149
0
            if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
1150
0
        }
1151
0
        return NULL;
1152
0
    }
1153
0
    vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1154
0
    khint_t k = kh_get(vdict, d, value);
1155
0
    if ( k == kh_end(d) ) return NULL;
1156
0
    return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
1157
0
}
1158
1159
void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
1160
4.76k
{
1161
4.76k
    static int PL_warned = 0, GL_warned = 0;
1162
1163
4.76k
    if ( !PL_warned )
1164
4.76k
    {
1165
4.76k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
1166
4.76k
        if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
1167
0
        {
1168
0
            hts_log_warning("PL should be declared as Number=G");
1169
0
            PL_warned = 1;
1170
0
        }
1171
4.76k
    }
1172
4.76k
    if ( !GL_warned )
1173
4.76k
    {
1174
4.76k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GL");
1175
4.76k
        if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
1176
0
        {
1177
0
            hts_log_warning("GL should be declared as Number=G");
1178
0
            GL_warned = 1;
1179
0
        }
1180
4.76k
    }
1181
4.76k
}
1182
1183
int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
1184
5.60k
{
1185
5.60k
    int len, done = 0;
1186
5.60k
    char *p = htxt;
1187
1188
    // Check sanity: "fileformat" string must come as first
1189
5.60k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
1190
5.60k
    if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
1191
452
        hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
1192
5.60k
    if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1193
0
        bcf_hrec_destroy(hrec);
1194
0
        return -1;
1195
0
    }
1196
1197
    // The filter PASS must appear first in the dictionary
1198
5.60k
    hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
1199
5.60k
    if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) {
1200
0
        bcf_hrec_destroy(hrec);
1201
0
        return -1;
1202
0
    }
1203
1204
    // Parse the whole header
1205
22.2k
    do {
1206
79.8k
        while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) {
1207
57.6k
            if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1208
20
                bcf_hrec_destroy(hrec);
1209
20
                return -1;
1210
20
            }
1211
57.5k
            p += len;
1212
57.5k
        }
1213
22.2k
        assert(hrec == NULL);
1214
22.2k
        if (len < 0) {
1215
            // len < 0 indicates out-of-memory, or similar error
1216
2
            hts_log_error("Could not parse header line: %s", strerror(errno));
1217
2
            return -1;
1218
22.2k
        } else if (len > 0) {
1219
            // Bad header line.  bcf_hdr_parse_line() will have logged it.
1220
            // Skip and try again on the next line (p + len will be the start
1221
            // of the next one).
1222
14.7k
            p += len;
1223
14.7k
            continue;
1224
14.7k
        }
1225
1226
        // Next should be the sample line.  If not, it was a malformed
1227
        // header, in which case print a warning and skip (many VCF
1228
        // operations do not really care about a few malformed lines).
1229
        // In the future we may want to add a strict mode that errors in
1230
        // this case.
1231
7.47k
        if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) {
1232
2.56k
            char *eol = strchr(p, '\n');
1233
2.56k
            if (*p != '\0') {
1234
1.92k
                char buffer[320];
1235
1.92k
                hts_log_warning("Could not parse header line: %s",
1236
1.92k
                                hts_strprint(buffer, sizeof(buffer),
1237
1.92k
                                               '"', p,
1238
1.92k
                                               eol ? (eol - p) : SIZE_MAX));
1239
1.92k
            }
1240
2.56k
            if (eol) {
1241
1.88k
                p = eol + 1; // Try from the next line.
1242
1.88k
            } else {
1243
678
                done = -1; // No more lines left, give up.
1244
678
            }
1245
4.90k
        } else {
1246
4.90k
            done = 1; // Sample line found
1247
4.90k
        }
1248
22.2k
    } while (!done);
1249
1250
5.58k
    if (done < 0) {
1251
        // No sample line is fatal.
1252
678
        hts_log_error("Could not parse the header, sample line not found");
1253
678
        return -1;
1254
678
    }
1255
1256
4.90k
    if (bcf_hdr_parse_sample_line(hdr,p) < 0)
1257
148
        return -1;
1258
4.76k
    if (bcf_hdr_sync(hdr) < 0)
1259
0
        return -1;
1260
4.76k
    bcf_hdr_check_sanity(hdr);
1261
4.76k
    return 0;
1262
4.76k
}
1263
1264
int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
1265
0
{
1266
0
    int len;
1267
0
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
1268
0
    if ( !hrec ) return -1;
1269
0
    if (bcf_hdr_add_hrec(hdr, hrec) < 0)
1270
0
        return -1;
1271
0
    return 0;
1272
0
}
1273
1274
void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
1275
0
{
1276
0
    int i = 0;
1277
0
    bcf_hrec_t *hrec;
1278
0
    if ( !key )
1279
0
    {
1280
        // no key, remove all entries of this type
1281
0
        while ( i<hdr->nhrec )
1282
0
        {
1283
0
            if ( hdr->hrec[i]->type!=type ) { i++; continue; }
1284
0
            hrec = hdr->hrec[i];
1285
0
            bcf_hdr_unregister_hrec(hdr, hrec);
1286
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1287
0
            hdr->dirty = 1;
1288
0
            hdr->nhrec--;
1289
0
            if ( i < hdr->nhrec )
1290
0
                memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1291
0
            bcf_hrec_destroy(hrec);
1292
0
        }
1293
0
        return;
1294
0
    }
1295
0
    while (1)
1296
0
    {
1297
0
        if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
1298
0
        {
1299
0
            hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
1300
0
            if ( !hrec ) return;
1301
1302
0
            for (i=0; i<hdr->nhrec; i++)
1303
0
                if ( hdr->hrec[i]==hrec ) break;
1304
0
            assert( i<hdr->nhrec );
1305
1306
0
            vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1307
0
            khint_t k = kh_get(vdict, d, key);
1308
0
            kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
1309
0
        }
1310
0
        else
1311
0
        {
1312
0
            for (i=0; i<hdr->nhrec; i++)
1313
0
            {
1314
0
                if ( hdr->hrec[i]->type!=type ) continue;
1315
0
                if ( type==BCF_HL_GEN )
1316
0
                {
1317
0
                    if ( !strcmp(hdr->hrec[i]->key,key) ) break;
1318
0
                }
1319
0
                else
1320
0
                {
1321
                    // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
1322
0
                    int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
1323
0
                    if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
1324
0
                }
1325
0
            }
1326
0
            if ( i==hdr->nhrec ) return;
1327
0
            hrec = hdr->hrec[i];
1328
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1329
0
        }
1330
1331
0
        hdr->nhrec--;
1332
0
        if ( i < hdr->nhrec )
1333
0
            memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1334
0
        bcf_hrec_destroy(hrec);
1335
0
        hdr->dirty = 1;
1336
0
    }
1337
0
}
1338
1339
int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
1340
0
{
1341
0
    char tmp[256], *line = tmp;
1342
0
    va_list ap;
1343
0
    va_start(ap, fmt);
1344
0
    int n = vsnprintf(line, sizeof(tmp), fmt, ap);
1345
0
    va_end(ap);
1346
1347
0
    if (n >= sizeof(tmp)) {
1348
0
        n++; // For trailing NUL
1349
0
        line = (char*)malloc(n);
1350
0
        if (!line)
1351
0
            return -1;
1352
1353
0
        va_start(ap, fmt);
1354
0
        vsnprintf(line, n, fmt, ap);
1355
0
        va_end(ap);
1356
0
    }
1357
1358
0
    int ret = bcf_hdr_append(hdr, line);
1359
1360
0
    if (line != tmp) free(line);
1361
0
    return ret;
1362
0
}
1363
1364
1365
/**********************
1366
 *** BCF header I/O ***
1367
 **********************/
1368
1369
const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
1370
0
{
1371
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1372
0
    if ( !hrec )
1373
0
    {
1374
0
        hts_log_warning("No version string found, assuming VCFv4.2");
1375
0
        return "VCFv4.2";
1376
0
    }
1377
0
    return hrec->value;
1378
0
}
1379
1380
int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
1381
0
{
1382
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1383
0
    if ( !hrec )
1384
0
    {
1385
0
        int len;
1386
0
        kstring_t str = {0,0,0};
1387
0
        if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1;
1388
0
        hrec = bcf_hdr_parse_line(hdr, str.s, &len);
1389
0
        free(str.s);
1390
0
    }
1391
0
    else
1392
0
    {
1393
0
        bcf_hrec_t *tmp = bcf_hrec_dup(hrec);
1394
0
        if ( !tmp ) return -1;
1395
0
        free(tmp->value);
1396
0
        tmp->value = strdup(version);
1397
0
        if ( !tmp->value ) return -1;
1398
0
        bcf_hdr_update_hrec(hdr, hrec, tmp);
1399
0
        bcf_hrec_destroy(tmp);
1400
0
    }
1401
0
    hdr->dirty = 1;
1402
0
    return 0; // FIXME: check for errs in this function (return < 0 if so)
1403
0
}
1404
1405
bcf_hdr_t *bcf_hdr_init(const char *mode)
1406
5.64k
{
1407
5.64k
    int i;
1408
5.64k
    bcf_hdr_t *h;
1409
5.64k
    h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
1410
5.64k
    if (!h) return NULL;
1411
22.5k
    for (i = 0; i < 3; ++i) {
1412
16.9k
        if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
1413
        // Supersize the hash to make collisions very unlikely
1414
16.9k
        static int dsize[3] = {16384,16384,2048}; // info, contig, format
1415
16.9k
        if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail;
1416
16.9k
    }
1417
1418
5.64k
    bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
1419
5.64k
    if ( !aux ) goto fail;
1420
5.64k
    if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
1421
5.64k
    aux->key_len = NULL;
1422
5.64k
    aux->dict = *((vdict_t*)h->dict[0]);
1423
5.64k
    free(h->dict[0]);
1424
5.64k
    h->dict[0] = aux;
1425
1426
5.64k
    if ( strchr(mode,'w') )
1427
0
    {
1428
0
        bcf_hdr_append(h, "##fileformat=VCFv4.2");
1429
        // The filter PASS must appear first in the dictionary
1430
0
        bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1431
0
    }
1432
5.64k
    return h;
1433
1434
0
 fail:
1435
0
    for (i = 0; i < 3; ++i)
1436
0
        kh_destroy(vdict, h->dict[i]);
1437
0
    free(h);
1438
0
    return NULL;
1439
5.64k
}
1440
1441
void bcf_hdr_destroy(bcf_hdr_t *h)
1442
5.64k
{
1443
5.64k
    int i;
1444
5.64k
    khint_t k;
1445
5.64k
    if (!h) return;
1446
22.5k
    for (i = 0; i < 3; ++i) {
1447
16.9k
        vdict_t *d = (vdict_t*)h->dict[i];
1448
16.9k
        if (d == 0) continue;
1449
196M
        for (k = kh_begin(d); k != kh_end(d); ++k)
1450
196M
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
1451
16.9k
        if ( i==0 )
1452
5.64k
        {
1453
5.64k
            bcf_hdr_aux_t *aux = get_hdr_aux(h);
1454
35.3k
            for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1455
29.7k
                if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
1456
5.64k
            kh_destroy(hdict, aux->gen);
1457
5.64k
            free(aux->key_len); // may exist for dict[0] only
1458
5.64k
        }
1459
16.9k
        kh_destroy(vdict, d);
1460
16.9k
        free(h->id[i]);
1461
16.9k
    }
1462
136k
    for (i=0; i<h->nhrec; i++)
1463
130k
        bcf_hrec_destroy(h->hrec[i]);
1464
5.64k
    if (h->nhrec) free(h->hrec);
1465
5.64k
    if (h->samples) free(h->samples);
1466
5.64k
    free(h->keep_samples);
1467
5.64k
    free(h->transl[0]); free(h->transl[1]);
1468
5.64k
    free(h->mem.s);
1469
5.64k
    free(h);
1470
5.64k
}
1471
1472
bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
1473
5.64k
{
1474
5.64k
    if (hfp->format.format == vcf)
1475
5.18k
        return vcf_hdr_read(hfp);
1476
456
    if (hfp->format.format != bcf) {
1477
0
        hts_log_error("Input is not detected as bcf or vcf format");
1478
0
        return NULL;
1479
0
    }
1480
1481
456
    assert(hfp->is_bgzf);
1482
1483
456
    BGZF *fp = hfp->fp.bgzf;
1484
456
    uint8_t magic[5];
1485
456
    bcf_hdr_t *h;
1486
456
    h = bcf_hdr_init("r");
1487
456
    if (!h) {
1488
0
        hts_log_error("Failed to allocate bcf header");
1489
0
        return NULL;
1490
0
    }
1491
456
    if (bgzf_read(fp, magic, 5) != 5)
1492
0
    {
1493
0
        hts_log_error("Failed to read the header (reading BCF in text mode?)");
1494
0
        bcf_hdr_destroy(h);
1495
0
        return NULL;
1496
0
    }
1497
456
    if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
1498
0
    {
1499
0
        if (!strncmp((char*)magic, "BCF", 3))
1500
0
            hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
1501
0
        else
1502
0
            hts_log_error("Invalid BCF2 magic string");
1503
0
        bcf_hdr_destroy(h);
1504
0
        return NULL;
1505
0
    }
1506
456
    uint8_t buf[4];
1507
456
    size_t hlen;
1508
456
    char *htxt = NULL;
1509
456
    if (bgzf_read(fp, buf, 4) != 4) goto fail;
1510
456
    hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24);
1511
456
    if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
1512
456
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1513
456
    if (hlen > FUZZ_ALLOC_LIMIT/2) { errno = ENOMEM; goto fail; }
1514
454
#endif
1515
454
    htxt = (char*)malloc(hlen + 1);
1516
454
    if (!htxt) goto fail;
1517
454
    if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
1518
452
    htxt[hlen] = '\0'; // Ensure htxt is terminated
1519
452
    if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail;
1520
414
    free(htxt);
1521
414
    return h;
1522
42
 fail:
1523
42
    hts_log_error("Failed to read BCF header");
1524
42
    free(htxt);
1525
42
    bcf_hdr_destroy(h);
1526
42
    return NULL;
1527
452
}
1528
1529
int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
1530
4.76k
{
1531
4.76k
    if (!h) {
1532
0
        errno = EINVAL;
1533
0
        return -1;
1534
0
    }
1535
4.76k
    if ( h->dirty ) {
1536
0
        if (bcf_hdr_sync(h) < 0) return -1;
1537
0
    }
1538
4.76k
    hfp->format.category = variant_data;
1539
4.76k
    if (hfp->format.format == vcf || hfp->format.format == text_format) {
1540
2.38k
        hfp->format.format = vcf;
1541
2.38k
        return vcf_hdr_write(hfp, h);
1542
2.38k
    }
1543
1544
2.38k
    if (hfp->format.format == binary_format)
1545
2.38k
        hfp->format.format = bcf;
1546
1547
2.38k
    kstring_t htxt = {0,0,0};
1548
2.38k
    if (bcf_hdr_format(h, 1, &htxt) < 0) {
1549
0
        free(htxt.s);
1550
0
        return -1;
1551
0
    }
1552
2.38k
    kputc('\0', &htxt); // include the \0 byte
1553
1554
2.38k
    BGZF *fp = hfp->fp.bgzf;
1555
2.38k
    if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
1556
2.38k
    uint8_t hlen[4];
1557
2.38k
    u32_to_le(htxt.l, hlen);
1558
2.38k
    if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
1559
2.38k
    if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
1560
2.38k
    if ( bgzf_flush(fp) < 0) return -1;
1561
1562
2.38k
    free(htxt.s);
1563
2.38k
    return 0;
1564
2.38k
}
1565
1566
/********************
1567
 *** BCF site I/O ***
1568
 ********************/
1569
1570
bcf1_t *bcf_init()
1571
4.76k
{
1572
4.76k
    bcf1_t *v;
1573
4.76k
    v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
1574
4.76k
    return v;
1575
4.76k
}
1576
1577
void bcf_clear(bcf1_t *v)
1578
26.5k
{
1579
26.5k
    int i;
1580
26.5k
    for (i=0; i<v->d.m_info; i++)
1581
0
    {
1582
0
        if ( v->d.info[i].vptr_free )
1583
0
        {
1584
0
            free(v->d.info[i].vptr - v->d.info[i].vptr_off);
1585
0
            v->d.info[i].vptr_free = 0;
1586
0
        }
1587
0
    }
1588
26.5k
    for (i=0; i<v->d.m_fmt; i++)
1589
0
    {
1590
0
        if ( v->d.fmt[i].p_free )
1591
0
        {
1592
0
            free(v->d.fmt[i].p - v->d.fmt[i].p_off);
1593
0
            v->d.fmt[i].p_free = 0;
1594
0
        }
1595
0
    }
1596
26.5k
    v->rid = v->pos = v->rlen = v->unpacked = 0;
1597
26.5k
    bcf_float_set_missing(v->qual);
1598
26.5k
    v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
1599
26.5k
    v->shared.l = v->indiv.l = 0;
1600
26.5k
    v->d.var_type = -1;
1601
26.5k
    v->d.shared_dirty = 0;
1602
26.5k
    v->d.indiv_dirty  = 0;
1603
26.5k
    v->d.n_flt = 0;
1604
26.5k
    v->errcode = 0;
1605
26.5k
    if (v->d.m_als) v->d.als[0] = 0;
1606
26.5k
    if (v->d.m_id) v->d.id[0] = 0;
1607
26.5k
}
1608
1609
void bcf_empty(bcf1_t *v)
1610
4.76k
{
1611
4.76k
    bcf_clear1(v);
1612
4.76k
    free(v->d.id);
1613
4.76k
    free(v->d.als);
1614
4.76k
    free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
1615
4.76k
    if (v->d.var ) free(v->d.var);
1616
4.76k
    free(v->shared.s); free(v->indiv.s);
1617
4.76k
    memset(&v->d,0,sizeof(v->d));
1618
4.76k
    memset(&v->shared,0,sizeof(v->shared));
1619
4.76k
    memset(&v->indiv,0,sizeof(v->indiv));
1620
4.76k
}
1621
1622
void bcf_destroy(bcf1_t *v)
1623
4.76k
{
1624
4.76k
    if (!v) return;
1625
4.76k
    bcf_empty1(v);
1626
4.76k
    free(v);
1627
4.76k
}
1628
1629
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
1630
896
{
1631
896
    uint8_t x[32];
1632
896
    ssize_t ret;
1633
896
    uint32_t shared_len, indiv_len;
1634
896
    if ((ret = bgzf_read(fp, x, 32)) != 32) {
1635
22
        if (ret == 0) return -1;
1636
20
        return -2;
1637
22
    }
1638
874
    bcf_clear1(v);
1639
874
    shared_len = le_to_u32(x);
1640
874
    if (shared_len < 24) return -2;
1641
872
    shared_len -= 24; // to exclude six 32-bit integers
1642
872
    indiv_len = le_to_u32(x + 4);
1643
872
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1644
    // ks_resize() normally allocates 1.5 * requested size to allow for growth
1645
872
    if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2;
1646
844
#endif
1647
844
    if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2;
1648
844
    if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2;
1649
844
    v->rid  = le_to_i32(x + 8);
1650
844
    v->pos  = le_to_u32(x + 12);
1651
844
    if ( v->pos==UINT32_MAX ) v->pos = -1;  // this is for telomere coordinate, e.g. MT:0
1652
844
    v->rlen = le_to_i32(x + 16);
1653
844
    v->qual = le_to_float(x + 20);
1654
844
    v->n_info = le_to_u16(x + 24);
1655
844
    v->n_allele = le_to_u16(x + 26);
1656
844
    v->n_sample = le_to_u32(x + 28) & 0xffffff;
1657
844
    v->n_fmt = x[31];
1658
844
    v->shared.l = shared_len;
1659
844
    v->indiv.l = indiv_len;
1660
    // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
1661
844
    if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
1662
1663
844
    if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
1664
812
    if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
1665
792
    return 0;
1666
812
}
1667
1668
0
#define bit_array_size(n) ((n)/8+1)
1669
0
#define bit_array_set(a,i)   ((a)[(i)/8] |=   1 << ((i)%8))
1670
0
#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1671
0
#define bit_array_test(a,i)  ((a)[(i)/8] &   (1 << ((i)%8)))
1672
1673
static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1674
10.7k
                                   int32_t *val) {
1675
10.7k
    uint32_t t;
1676
10.7k
    if (end - p < 2) return -1;
1677
10.7k
    t = *p++ & 0xf;
1678
    /* Use if .. else if ... else instead of switch to force order.  Assumption
1679
       is that small integers are more frequent than big ones. */
1680
10.7k
    if (t == BCF_BT_INT8) {
1681
5.00k
        *val = *(int8_t *) p++;
1682
5.74k
    } else {
1683
5.74k
        if (end - p < (1<<bcf_type_shift[t])) return -1;
1684
5.73k
        if (t == BCF_BT_INT16) {
1685
2.76k
            *val = le_to_i16(p);
1686
2.76k
            p += 2;
1687
2.97k
        } else if (t == BCF_BT_INT32) {
1688
2.81k
            *val = le_to_i32(p);
1689
2.81k
            p += 4;
1690
#ifdef VCF_ALLOW_INT64
1691
        } else if (t == BCF_BT_INT64) {
1692
            // This case should never happen because there should be no
1693
            // 64-bit BCFs at all, definitely not coming from htslib
1694
            *val = le_to_i64(p);
1695
            p += 8;
1696
#endif
1697
2.81k
        } else {
1698
160
            return -1;
1699
160
        }
1700
5.73k
    }
1701
10.5k
    *q = p;
1702
10.5k
    return 0;
1703
10.7k
}
1704
1705
static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1706
19.2k
                             int *num, int *type) {
1707
19.2k
    int r;
1708
19.2k
    if (p >= end) return -1;
1709
19.2k
    *type = *p & 0xf;
1710
19.2k
    if (*p>>4 != 15) {
1711
18.5k
        *q = p + 1;
1712
18.5k
        *num = *p >> 4;
1713
18.5k
        return 0;
1714
18.5k
    }
1715
674
    r = bcf_dec_typed_int1_safe(p + 1, end, q, num);
1716
674
    if (r) return r;
1717
632
    return *num >= 0 ? 0 : -1;
1718
674
}
1719
1720
486
static const char *get_type_name(int type) {
1721
486
    const char *types[9] = {
1722
486
        "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
1723
486
        "unknown", "float", "unknown", "char", "unknown"
1724
486
    };
1725
486
    int t = (type >= 0 && type < 8) ? type : 8;
1726
486
    return types[t];
1727
486
}
1728
1729
static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
1730
838
                                 char *type, uint32_t *reports, int i) {
1731
838
    if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG)
1732
72
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
1733
838
                        ": Invalid FORMAT %s %d",
1734
838
                        bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
1735
838
    (*reports)++;
1736
838
}
1737
1738
792
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
1739
792
    uint8_t *ptr, *end;
1740
792
    size_t bytes;
1741
792
    uint32_t err = 0;
1742
792
    int type = 0;
1743
792
    int num  = 0;
1744
792
    int reflen = 0;
1745
792
    uint32_t i, reports;
1746
792
    const uint32_t is_integer = ((1 << BCF_BT_INT8)  |
1747
792
                                 (1 << BCF_BT_INT16) |
1748
#ifdef VCF_ALLOW_INT64
1749
                                 (1 << BCF_BT_INT64) |
1750
#endif
1751
792
                                 (1 << BCF_BT_INT32));
1752
792
    const uint32_t is_valid_type = (is_integer          |
1753
792
                                    (1 << BCF_BT_NULL)  |
1754
792
                                    (1 << BCF_BT_FLOAT) |
1755
792
                                    (1 << BCF_BT_CHAR));
1756
792
    int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
1757
1758
    // Check for valid contig ID
1759
792
    if (rec->rid < 0
1760
792
        || (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
1761
684
                    || hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
1762
266
        hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
1763
266
        err |= BCF_ERR_CTG_INVALID;
1764
266
    }
1765
1766
    // Check ID
1767
792
    ptr = (uint8_t *) rec->shared.s;
1768
792
    end = ptr + rec->shared.l;
1769
792
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1770
788
    if (type != BCF_BT_CHAR) {
1771
270
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
1772
270
        err |= BCF_ERR_TAG_INVALID;
1773
270
    }
1774
788
    bytes = (size_t) num << bcf_type_shift[type];
1775
788
    if (end - ptr < bytes) goto bad_shared;
1776
786
    ptr += bytes;
1777
1778
    // Check REF and ALT
1779
786
    if (rec->n_allele < 1) {
1780
130
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele",
1781
130
                        bcf_seqname_safe(hdr,rec), rec->pos+1);
1782
130
        err |= BCF_ERR_TAG_UNDEF;
1783
130
    }
1784
1785
786
    reports = 0;
1786
8.53k
    for (i = 0; i < rec->n_allele; i++) {
1787
7.82k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1788
7.78k
        if (type != BCF_BT_CHAR) {
1789
6.92k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1790
158
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
1791
6.92k
            err |= BCF_ERR_CHAR;
1792
6.92k
        }
1793
7.78k
        if (i == 0) reflen = num;
1794
7.78k
        bytes = (size_t) num << bcf_type_shift[type];
1795
7.78k
        if (end - ptr < bytes) goto bad_shared;
1796
7.75k
        ptr += bytes;
1797
7.75k
    }
1798
1799
    // Check FILTER
1800
716
    reports = 0;
1801
716
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1802
710
    if (num > 0) {
1803
120
        bytes = (size_t) num << bcf_type_shift[type];
1804
120
        if (((1 << type) & is_integer) == 0) {
1805
44
            hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
1806
44
            err |= BCF_ERR_TAG_INVALID;
1807
44
            if (end - ptr < bytes) goto bad_shared;
1808
40
            ptr += bytes;
1809
76
        } else {
1810
76
            if (end - ptr < bytes) goto bad_shared;
1811
4.48k
            for (i = 0; i < num; i++) {
1812
4.40k
                int32_t key = bcf_dec_int1(ptr, type, &ptr);
1813
4.40k
                if (key < 0
1814
4.40k
                    || (hdr && (key >= max_id
1815
3.75k
                                || hdr->id[BCF_DT_ID][key].key == NULL))) {
1816
3.75k
                    if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1817
74
                        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
1818
3.75k
                    err |= BCF_ERR_TAG_UNDEF;
1819
3.75k
                }
1820
4.40k
            }
1821
74
        }
1822
120
    }
1823
1824
    // Check INFO
1825
704
    reports = 0;
1826
704
    bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
1827
9.81k
    for (i = 0; i < rec->n_info; i++) {
1828
9.22k
        int32_t key = -1;
1829
9.22k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
1830
9.13k
        if (key < 0 || (hdr && (key >= max_id
1831
8.85k
                                || id_tmp[key].key == NULL))) {
1832
8.85k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1833
114
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
1834
8.85k
            err |= BCF_ERR_TAG_UNDEF;
1835
8.85k
        }
1836
9.13k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1837
9.12k
        if (((1 << type) & is_valid_type) == 0
1838
9.12k
            || (type == BCF_BT_NULL && num > 0)) {
1839
1.22k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1840
14
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
1841
1.22k
            err |= BCF_ERR_TAG_INVALID;
1842
1.22k
        }
1843
9.12k
        bytes = (size_t) num << bcf_type_shift[type];
1844
9.12k
        if (end - ptr < bytes) goto bad_shared;
1845
9.11k
        ptr += bytes;
1846
9.11k
    }
1847
1848
    // Check FORMAT and individual information
1849
594
    ptr = (uint8_t *) rec->indiv.s;
1850
594
    end = ptr + rec->indiv.l;
1851
594
    reports = 0;
1852
1.37k
    for (i = 0; i < rec->n_fmt; i++) {
1853
862
        int32_t key = -1;
1854
862
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
1855
818
        if (key < 0
1856
818
            || (hdr && (key >= max_id
1857
692
                        || id_tmp[key].key == NULL))) {
1858
692
            bcf_record_check_err(hdr, rec, "id", &reports, key);
1859
692
            err |= BCF_ERR_TAG_UNDEF;
1860
692
        }
1861
818
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
1862
810
        if (((1 << type) & is_valid_type) == 0
1863
810
            || (type == BCF_BT_NULL && num > 0)) {
1864
146
            bcf_record_check_err(hdr, rec, "type", &reports, type);
1865
146
            err |= BCF_ERR_TAG_INVALID;
1866
146
        }
1867
810
        bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
1868
810
        if (end - ptr < bytes) goto bad_indiv;
1869
780
        ptr += bytes;
1870
780
    }
1871
1872
512
    if (!err && rec->rlen < 0) {
1873
        // Treat bad rlen as a warning instead of an error, and try to
1874
        // fix up by using the length of the stored REF allele.
1875
324
        static int warned = 0;
1876
324
        if (!warned) {
1877
1
            hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
1878
1
                            "Only one invalid RLEN will be reported.",
1879
1
                            bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
1880
1
            warned = 1;
1881
1
        }
1882
324
        rec->rlen = reflen >= 0 ? reflen : 0;
1883
324
    }
1884
1885
512
    rec->errcode |= err;
1886
1887
512
    return err ? -2 : 0; // Return -2 so bcf_read() reports an error
1888
1889
198
 bad_shared:
1890
198
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
1891
198
    return -2;
1892
1893
82
 bad_indiv:
1894
82
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
1895
82
    return -2;
1896
594
}
1897
1898
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
1899
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
1900
0
{
1901
0
    if ( !hdr->keep_samples ) return 0;
1902
0
    if ( !bcf_hdr_nsamples(hdr) )
1903
0
    {
1904
0
        rec->indiv.l = rec->n_sample = 0;
1905
0
        return 0;
1906
0
    }
1907
1908
0
    int i, j;
1909
0
    uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
1910
0
    bcf_dec_t *dec = &rec->d;
1911
0
    hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
1912
0
    for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
1913
1914
0
    for (i=0; i<rec->n_fmt; i++)
1915
0
    {
1916
0
        ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
1917
0
        src = dec->fmt[i].p - dec->fmt[i].size;
1918
0
        if ( dst )
1919
0
        {
1920
0
            memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
1921
0
            dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
1922
0
        }
1923
0
        dst = dec->fmt[i].p;
1924
0
        for (j=0; j<hdr->nsamples_ori; j++)
1925
0
        {
1926
0
            src += dec->fmt[i].size;
1927
0
            if ( !bit_array_test(hdr->keep_samples,j) ) continue;
1928
0
            memmove(dst, src, dec->fmt[i].size);
1929
0
            dst += dec->fmt[i].size;
1930
0
        }
1931
0
        rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
1932
0
        dec->fmt[i].p_len = dst - dec->fmt[i].p;
1933
0
    }
1934
0
    rec->unpacked |= BCF_UN_FMT;
1935
1936
0
    rec->n_sample = bcf_hdr_nsamples(hdr);
1937
0
    return 0;
1938
0
}
1939
1940
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
1941
22.7k
{
1942
22.7k
    if (fp->format.format == vcf) return vcf_read(fp,h,v);
1943
896
    int ret = bcf_read1_core(fp->fp.bgzf, v);
1944
896
    if (ret == 0) ret = bcf_record_check(h, v);
1945
896
    if ( ret!=0 || !h->keep_samples ) return ret;
1946
0
    return bcf_subset_format(h,v);
1947
896
}
1948
1949
int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1950
0
{
1951
0
    bcf1_t *v = (bcf1_t *) vv;
1952
0
    int ret = bcf_read1_core(fp, v);
1953
0
    if (ret == 0) ret = bcf_record_check(NULL, v);
1954
0
    if (ret  >= 0)
1955
0
        *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
1956
0
    return ret;
1957
0
}
1958
1959
static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
1960
0
{
1961
    // single typed string
1962
0
    if ( line->d.id && strcmp(line->d.id, ".") ) {
1963
0
        return bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
1964
0
    } else {
1965
0
        return bcf_enc_size(str, 0, BCF_BT_CHAR);
1966
0
    }
1967
0
}
1968
static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
1969
0
{
1970
    // list of typed strings
1971
0
    int i;
1972
0
    for (i=0; i<line->n_allele; i++) {
1973
0
        if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0)
1974
0
            return -1;
1975
0
    }
1976
0
    if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
1977
0
    return 0;
1978
0
}
1979
static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
1980
0
{
1981
    // typed vector of integers
1982
0
    if ( line->d.n_flt ) {
1983
0
        return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
1984
0
    } else {
1985
0
        return bcf_enc_vint(str, 0, 0, -1);
1986
0
    }
1987
0
}
1988
1989
static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
1990
0
{
1991
    // pairs of typed vectors
1992
0
    int i, irm = -1, e = 0;
1993
0
    for (i=0; i<line->n_info; i++)
1994
0
    {
1995
0
        bcf_info_t *info = &line->d.info[i];
1996
0
        if ( !info->vptr )
1997
0
        {
1998
            // marked for removal
1999
0
            if ( irm < 0 ) irm = i;
2000
0
            continue;
2001
0
        }
2002
0
        e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0;
2003
0
        if ( irm >=0 )
2004
0
        {
2005
0
            bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
2006
0
            while ( irm<=i && line->d.info[irm].vptr ) irm++;
2007
0
        }
2008
0
    }
2009
0
    if ( irm>=0 ) line->n_info = irm;
2010
0
    return e == 0 ? 0 : -1;
2011
0
}
2012
2013
static int bcf1_sync(bcf1_t *line)
2014
241
{
2015
241
    char *shared_ori = line->shared.s;
2016
241
    size_t prev_len;
2017
2018
241
    kstring_t tmp = {0,0,0};
2019
241
    if ( !line->shared.l )
2020
0
    {
2021
        // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
2022
0
        tmp = line->shared;
2023
0
        bcf1_sync_id(line, &tmp);
2024
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2025
2026
0
        bcf1_sync_alleles(line, &tmp);
2027
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2028
2029
0
        bcf1_sync_filter(line, &tmp);
2030
0
        line->unpack_size[2] = tmp.l - prev_len;
2031
2032
0
        bcf1_sync_info(line, &tmp);
2033
0
        line->shared = tmp;
2034
0
    }
2035
241
    else if ( line->d.shared_dirty )
2036
0
    {
2037
        // The line was edited, update the BCF data block.
2038
2039
0
        if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
2040
2041
        // ptr_ori points to the original unchanged BCF data.
2042
0
        uint8_t *ptr_ori = (uint8_t *) line->shared.s;
2043
2044
        // ID: single typed string
2045
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ID )
2046
0
            bcf1_sync_id(line, &tmp);
2047
0
        else
2048
0
            kputsn_(ptr_ori, line->unpack_size[0], &tmp);
2049
0
        ptr_ori += line->unpack_size[0];
2050
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2051
2052
        // REF+ALT: list of typed strings
2053
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
2054
0
            bcf1_sync_alleles(line, &tmp);
2055
0
        else
2056
0
        {
2057
0
            kputsn_(ptr_ori, line->unpack_size[1], &tmp);
2058
0
            if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2059
0
        }
2060
0
        ptr_ori += line->unpack_size[1];
2061
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2062
2063
0
        if ( line->unpacked & BCF_UN_FLT )
2064
0
        {
2065
            // FILTER: typed vector of integers
2066
0
            if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
2067
0
                bcf1_sync_filter(line, &tmp);
2068
0
            else if ( line->d.n_flt )
2069
0
                kputsn_(ptr_ori, line->unpack_size[2], &tmp);
2070
0
            else
2071
0
                bcf_enc_vint(&tmp, 0, 0, -1);
2072
0
            ptr_ori += line->unpack_size[2];
2073
0
            line->unpack_size[2] = tmp.l - prev_len;
2074
2075
0
            if ( line->unpacked & BCF_UN_INFO )
2076
0
            {
2077
                // INFO: pairs of typed vectors
2078
0
                if ( line->d.shared_dirty & BCF1_DIRTY_INF )
2079
0
                {
2080
0
                    bcf1_sync_info(line, &tmp);
2081
0
                    ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
2082
0
                }
2083
0
            }
2084
0
        }
2085
2086
0
        int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
2087
0
        if ( size ) kputsn_(ptr_ori, size, &tmp);
2088
2089
0
        free(line->shared.s);
2090
0
        line->shared = tmp;
2091
0
    }
2092
241
    if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
2093
0
    {
2094
        // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
2095
0
        size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
2096
0
        int i;
2097
0
        for (i=0; i<line->n_info; i++)
2098
0
        {
2099
0
            uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
2100
0
            line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
2101
0
            off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
2102
0
            if ( vptr_free )
2103
0
            {
2104
0
                free(vptr_free);
2105
0
                line->d.info[i].vptr_free = 0;
2106
0
            }
2107
0
        }
2108
0
    }
2109
2110
241
    if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
2111
0
    {
2112
        // The genotype fields changed or are not present
2113
0
        tmp.l = tmp.m = 0; tmp.s = NULL;
2114
0
        int i, irm = -1;
2115
0
        for (i=0; i<line->n_fmt; i++)
2116
0
        {
2117
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
2118
0
            if ( !fmt->p )
2119
0
            {
2120
                // marked for removal
2121
0
                if ( irm < 0 ) irm = i;
2122
0
                continue;
2123
0
            }
2124
0
            kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
2125
0
            if ( irm >=0 )
2126
0
            {
2127
0
                bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
2128
0
                while ( irm<=i && line->d.fmt[irm].p ) irm++;
2129
0
            }
2130
2131
0
        }
2132
0
        if ( irm>=0 ) line->n_fmt = irm;
2133
0
        free(line->indiv.s);
2134
0
        line->indiv = tmp;
2135
2136
        // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
2137
0
        size_t off_new = 0;
2138
0
        for (i=0; i<line->n_fmt; i++)
2139
0
        {
2140
0
            uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
2141
0
            line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
2142
0
            off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
2143
0
            if ( p_free )
2144
0
            {
2145
0
                free(p_free);
2146
0
                line->d.fmt[i].p_free = 0;
2147
0
            }
2148
0
        }
2149
0
    }
2150
241
    if ( !line->n_sample ) line->n_fmt = 0;
2151
241
    line->d.shared_dirty = line->d.indiv_dirty = 0;
2152
241
    return 0;
2153
241
}
2154
2155
bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
2156
0
{
2157
0
    bcf1_sync(src);
2158
2159
0
    bcf_clear(dst);
2160
0
    dst->rid  = src->rid;
2161
0
    dst->pos  = src->pos;
2162
0
    dst->rlen = src->rlen;
2163
0
    dst->qual = src->qual;
2164
0
    dst->n_info = src->n_info; dst->n_allele = src->n_allele;
2165
0
    dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
2166
2167
0
    if ( dst->shared.m < src->shared.l )
2168
0
    {
2169
0
        dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l);
2170
0
        dst->shared.m = src->shared.l;
2171
0
    }
2172
0
    dst->shared.l = src->shared.l;
2173
0
    memcpy(dst->shared.s,src->shared.s,dst->shared.l);
2174
2175
0
    if ( dst->indiv.m < src->indiv.l )
2176
0
    {
2177
0
        dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l);
2178
0
        dst->indiv.m = src->indiv.l;
2179
0
    }
2180
0
    dst->indiv.l = src->indiv.l;
2181
0
    memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
2182
2183
0
    return dst;
2184
0
}
2185
bcf1_t *bcf_dup(bcf1_t *src)
2186
0
{
2187
0
    bcf1_t *out = bcf_init1();
2188
0
    return bcf_copy(out, src);
2189
0
}
2190
2191
int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
2192
20.0k
{
2193
20.0k
    if ( h->dirty ) {
2194
0
        if (bcf_hdr_sync(h) < 0) return -1;
2195
0
    }
2196
20.0k
    if ( bcf_hdr_nsamples(h)!=v->n_sample )
2197
73
    {
2198
73
        hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
2199
73
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
2200
73
        return -1;
2201
73
    }
2202
2203
19.9k
    if ( hfp->format.format == vcf || hfp->format.format == text_format )
2204
17.7k
        return vcf_write(hfp,h,v);
2205
2206
2.20k
    if ( v->errcode & ~BCF_ERR_LIMITS ) // todo: unsure about the other BCF_ERR_LIMITS branches in vcf_parse_format_alloc4()
2207
1.96k
    {
2208
        // vcf_parse1() encountered a new contig or tag, undeclared in the
2209
        // header.  At this point, the header must have been printed,
2210
        // proceeding would lead to a broken BCF file. Errors must be checked
2211
        // and cleared by the caller before we can proceed.
2212
1.96k
        char errdescription[1024] = "";
2213
1.96k
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1);
2214
1.96k
        return -1;
2215
1.96k
    }
2216
241
    bcf1_sync(v);   // check if the BCF record was modified
2217
2218
241
    if ( v->unpacked & BCF_IS_64BIT )
2219
0
    {
2220
0
        hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
2221
0
        return -1;
2222
0
    }
2223
2224
241
    BGZF *fp = hfp->fp.bgzf;
2225
241
    uint8_t x[32];
2226
241
    u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
2227
241
    u32_to_le(v->indiv.l, x + 4);
2228
241
    i32_to_le(v->rid, x + 8);
2229
241
    u32_to_le(v->pos, x + 12);
2230
241
    u32_to_le(v->rlen, x + 16);
2231
241
    float_to_le(v->qual, x + 20);
2232
241
    u16_to_le(v->n_info, x + 24);
2233
241
    u16_to_le(v->n_allele, x + 26);
2234
241
    u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28);
2235
241
    if ( bgzf_write(fp, x, 32) != 32 ) return -1;
2236
241
    if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
2237
241
    if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
2238
2239
241
    if (hfp->idx) {
2240
0
        if (bgzf_idx_push(fp, hfp->idx, v->rid, v->pos, v->pos + v->rlen,
2241
0
                          bgzf_tell(fp), 1) < 0)
2242
0
            return -1;
2243
0
    }
2244
2245
241
    return 0;
2246
241
}
2247
2248
/**********************
2249
 *** VCF header I/O ***
2250
 **********************/
2251
2252
0
static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
2253
0
    bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t));
2254
0
    int save_errno;
2255
0
    if (!hrec) goto fail;
2256
2257
0
    hrec->key = strdup("contig");
2258
0
    if (!hrec->key) goto fail;
2259
2260
0
    if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail;
2261
0
    if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0)
2262
0
        goto fail;
2263
0
    if (bcf_hdr_add_hrec(h, hrec) < 0)
2264
0
        goto fail;
2265
0
    return 0;
2266
2267
0
 fail:
2268
0
    save_errno = errno;
2269
0
    hts_log_error("%s", strerror(errno));
2270
0
    if (hrec) bcf_hrec_destroy(hrec);
2271
0
    errno = save_errno;
2272
0
    return -1;
2273
0
}
2274
2275
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
2276
5.18k
{
2277
5.18k
    kstring_t txt, *s = &fp->line;
2278
5.18k
    int ret;
2279
5.18k
    bcf_hdr_t *h;
2280
5.18k
    tbx_t *idx = NULL;
2281
5.18k
    const char **names = NULL;
2282
5.18k
    h = bcf_hdr_init("r");
2283
5.18k
    if (!h) {
2284
0
        hts_log_error("Failed to allocate bcf header");
2285
0
        return NULL;
2286
0
    }
2287
5.18k
    txt.l = txt.m = 0; txt.s = 0;
2288
78.5k
    while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
2289
78.0k
        int e = 0;
2290
78.0k
        if (s->l == 0) continue;
2291
77.8k
        if (s->s[0] != '#') {
2292
14
            hts_log_error("No sample line");
2293
14
            goto error;
2294
14
        }
2295
77.8k
        if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
2296
0
            kstring_t tmp = { 0, 0, NULL };
2297
0
            hFILE *f = hopen(fp->fn_aux, "r");
2298
0
            if (f == NULL) {
2299
0
                hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
2300
0
                goto error;
2301
0
            }
2302
0
            while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) {
2303
0
                char *tab = strchr(tmp.s, '\t');
2304
0
                if (tab == NULL) continue;
2305
0
                e |= (kputs("##contig=<ID=", &txt) < 0);
2306
0
                e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0);
2307
0
                e |= (kputs(",length=", &txt) < 0);
2308
0
                e |= (kputl(atol(tab), &txt) < 0);
2309
0
                e |= (kputsn(">\n", 2, &txt) < 0);
2310
0
            }
2311
0
            free(tmp.s);
2312
0
            if (hclose(f) != 0) {
2313
0
                hts_log_error("Error on closing %s", fp->fn_aux);
2314
0
                goto error;
2315
0
            }
2316
0
            if (e) goto error;
2317
0
        }
2318
77.8k
        if (kputsn(s->s, s->l, &txt) < 0) goto error;
2319
77.8k
        if (kputc('\n', &txt) < 0) goto error;
2320
77.8k
        if (s->s[1] != '#') break;
2321
77.8k
    }
2322
5.17k
    if ( ret < -1 ) goto error;
2323
5.15k
    if ( !txt.s )
2324
0
    {
2325
0
        hts_log_error("Could not read the header");
2326
0
        goto error;
2327
0
    }
2328
5.15k
    if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
2329
2330
    // check tabix index, are all contigs listed in the header? add the missing ones
2331
4.34k
    idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL);
2332
4.34k
    if ( idx )
2333
0
    {
2334
0
        int i, n, need_sync = 0;
2335
0
        names = tbx_seqnames(idx, &n);
2336
0
        if (!names) goto error;
2337
0
        for (i=0; i<n; i++)
2338
0
        {
2339
0
            bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
2340
0
            if ( hrec ) continue;
2341
0
            if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
2342
0
            need_sync = 1;
2343
0
        }
2344
0
        if ( need_sync ) {
2345
0
            if (bcf_hdr_sync(h) < 0) goto error;
2346
0
        }
2347
0
        free(names);
2348
0
        tbx_destroy(idx);
2349
0
    }
2350
4.34k
    free(txt.s);
2351
4.34k
    return h;
2352
2353
838
 error:
2354
838
    if (idx) tbx_destroy(idx);
2355
838
    free(names);
2356
838
    free(txt.s);
2357
838
    if (h) bcf_hdr_destroy(h);
2358
838
    return NULL;
2359
4.34k
}
2360
2361
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
2362
0
{
2363
0
    int i = 0, n = 0, save_errno;
2364
0
    char **lines = hts_readlines(fname, &n);
2365
0
    if ( !lines ) return 1;
2366
0
    for (i=0; i<n-1; i++)
2367
0
    {
2368
0
        int k;
2369
0
        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
2370
0
        if (!hrec) goto fail;
2371
0
        if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
2372
0
            bcf_hrec_destroy(hrec);
2373
0
            goto fail;
2374
0
        }
2375
0
        free(lines[i]);
2376
0
        lines[i] = NULL;
2377
0
    }
2378
0
    if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
2379
0
    if (bcf_hdr_sync(hdr) < 0) goto fail;
2380
0
    free(lines[n-1]);
2381
0
    free(lines);
2382
0
    return 0;
2383
2384
0
 fail:
2385
0
    save_errno = errno;
2386
0
    for (; i < n; i++)
2387
0
        free(lines[i]);
2388
0
    free(lines);
2389
0
    errno = save_errno;
2390
0
    return 1;
2391
0
}
2392
2393
static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
2394
16.7k
{
2395
16.7k
    uint32_t e = 0;
2396
16.7k
    if ( !hrec->value )
2397
8.90k
    {
2398
8.90k
        int j, nout = 0;
2399
8.90k
        e |= ksprintf(str, "##%s=<", hrec->key) < 0;
2400
36.1k
        for (j=0; j<hrec->nkeys; j++)
2401
27.2k
        {
2402
            // do not output IDX if output is VCF
2403
27.2k
            if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
2404
23.1k
            if ( nout ) e |= kputc(',',str) < 0;
2405
23.1k
            e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0;
2406
23.1k
            nout++;
2407
23.1k
        }
2408
8.90k
        e |= ksprintf(str,">\n") < 0;
2409
8.90k
    }
2410
7.87k
    else
2411
7.87k
        e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
2412
2413
16.7k
    return e == 0 ? 0 : -1;
2414
16.7k
}
2415
2416
int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
2417
0
{
2418
0
    return _bcf_hrec_format(hrec,0,str);
2419
0
}
2420
2421
int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
2422
4.76k
{
2423
4.76k
    int i, r = 0;
2424
21.5k
    for (i=0; i<hdr->nhrec; i++)
2425
16.7k
        r |= _bcf_hrec_format(hdr->hrec[i], is_bcf, str) < 0;
2426
2427
4.76k
    r |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0;
2428
4.76k
    if ( bcf_hdr_nsamples(hdr) )
2429
1.56k
    {
2430
1.56k
        r |= ksprintf(str, "\tFORMAT") < 0;
2431
14.4k
        for (i=0; i<bcf_hdr_nsamples(hdr); i++)
2432
12.8k
            r |= ksprintf(str, "\t%s", hdr->samples[i]) < 0;
2433
1.56k
    }
2434
4.76k
    r |= ksprintf(str, "\n") < 0;
2435
2436
4.76k
    return r ? -1 : 0;
2437
4.76k
}
2438
2439
char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
2440
0
{
2441
0
    kstring_t txt = {0,0,0};
2442
0
    if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
2443
0
        return NULL;
2444
0
    if ( len ) *len = txt.l;
2445
0
    return txt.s;
2446
0
}
2447
2448
const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
2449
0
{
2450
0
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2451
0
    int i, tid, m = kh_size(d);
2452
0
    const char **names = (const char**) calloc(m,sizeof(const char*));
2453
0
    if ( !names )
2454
0
    {
2455
0
        hts_log_error("Failed to allocate memory");
2456
0
        *n = 0;
2457
0
        return NULL;
2458
0
    }
2459
0
    khint_t k;
2460
0
    for (k=kh_begin(d); k<kh_end(d); k++)
2461
0
    {
2462
0
        if ( !kh_exist(d,k) ) continue;
2463
0
        if ( !kh_val(d, k).hrec[0] ) continue;  // removed via bcf_hdr_remove
2464
0
        tid = kh_val(d,k).id;
2465
0
        if ( tid >= m )
2466
0
        {
2467
            // This can happen after a contig has been removed from BCF header via bcf_hdr_remove()
2468
0
            if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 )
2469
0
            {
2470
0
                hts_log_error("Failed to allocate memory");
2471
0
                *n = 0;
2472
0
                free(names);
2473
0
                return NULL;
2474
0
            }
2475
0
            m = tid + 1;
2476
0
        }
2477
0
        names[tid] = kh_key(d,k);
2478
0
    }
2479
    // ensure there are no gaps
2480
0
    for (i=0,tid=0; tid<m; i++,tid++)
2481
0
    {
2482
0
        while ( tid<m && !names[tid] ) tid++;
2483
0
        if ( tid==m ) break;
2484
0
        if ( i==tid ) continue;
2485
0
        names[i] = names[tid];
2486
0
        names[tid] = 0;
2487
0
    }
2488
0
    *n = i;
2489
0
    return names;
2490
0
}
2491
2492
int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
2493
2.38k
{
2494
2.38k
    kstring_t htxt = {0,0,0};
2495
2.38k
    if (bcf_hdr_format(h, 0, &htxt) < 0) {
2496
0
        free(htxt.s);
2497
0
        return -1;
2498
0
    }
2499
2.38k
    while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
2500
2.38k
    int ret;
2501
2.38k
    if ( fp->format.compression!=no_compression ) {
2502
0
        ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
2503
0
        if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2504
2.38k
    } else {
2505
2.38k
        ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
2506
2.38k
    }
2507
2.38k
    free(htxt.s);
2508
2.38k
    return ret<0 ? -1 : 0;
2509
2.38k
}
2510
2511
/***********************
2512
 *** Typed value I/O ***
2513
 ***********************/
2514
2515
int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
2516
554k
{
2517
554k
    int32_t max = INT32_MIN, min = INT32_MAX;
2518
554k
    int i;
2519
554k
    if (n <= 0) {
2520
2.04k
        return bcf_enc_size(s, 0, BCF_BT_NULL);
2521
552k
    } else if (n == 1) {
2522
15.1k
        return bcf_enc_int1(s, a[0]);
2523
537k
    } else {
2524
537k
        if (wsize <= 0) wsize = n;
2525
2526
        // Equivalent to:
2527
        // for (i = 0; i < n; ++i) {
2528
        //     if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
2529
        //         continue;
2530
        //     if (max < a[i]) max = a[i];
2531
        //     if (min > a[i]) min = a[i];
2532
        // }
2533
537k
        int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN};
2534
537k
        int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX};
2535
131M
        for (i = 0; i < (n&~3); i+=4) {
2536
            // bcf_int32_missing    == INT32_MIN and
2537
            // bcf_int32_vector_end == INT32_MIN+1.
2538
            // We skip these, but can mostly avoid explicit checking
2539
131M
            if (max4[0] < a[i+0]) max4[0] = a[i+0];
2540
131M
            if (max4[1] < a[i+1]) max4[1] = a[i+1];
2541
131M
            if (max4[2] < a[i+2]) max4[2] = a[i+2];
2542
131M
            if (max4[3] < a[i+3]) max4[3] = a[i+3];
2543
131M
            if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0];
2544
131M
            if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1];
2545
131M
            if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2];
2546
131M
            if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3];
2547
131M
        }
2548
537k
        min = min4[0];
2549
537k
        if (min > min4[1]) min = min4[1];
2550
537k
        if (min > min4[2]) min = min4[2];
2551
537k
        if (min > min4[3]) min = min4[3];
2552
537k
        max = max4[0];
2553
537k
        if (max < max4[1]) max = max4[1];
2554
537k
        if (max < max4[2]) max = max4[2];
2555
537k
        if (max < max4[3]) max = max4[3];
2556
1.30M
        for (; i < n; ++i) {
2557
764k
            if (max < a[i]) max = a[i];
2558
764k
            if (min > a[i] && a[i] > INT32_MIN+1) min = a[i];
2559
764k
        }
2560
2561
537k
        if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
2562
83.2k
            if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
2563
83.2k
                ks_resize(s, s->l + n) < 0)
2564
0
                return -1;
2565
83.2k
            uint8_t *p = (uint8_t *) s->s + s->l;
2566
23.4M
            for (i = 0; i < n; ++i, p++) {
2567
23.3M
                if ( a[i]==bcf_int32_vector_end )   *p = bcf_int8_vector_end;
2568
22.8M
                else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing;
2569
447k
                else *p = a[i];
2570
23.3M
            }
2571
83.2k
            s->l += n;
2572
454k
        } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
2573
317k
            uint8_t *p;
2574
317k
            if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
2575
317k
                ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
2576
0
                return -1;
2577
317k
            p = (uint8_t *) s->s + s->l;
2578
151M
            for (i = 0; i < n; ++i)
2579
151M
            {
2580
151M
                int16_t x;
2581
151M
                if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
2582
150M
                else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
2583
2.96M
                else x = a[i];
2584
151M
                i16_to_le(x, p);
2585
151M
                p += sizeof(int16_t);
2586
151M
            }
2587
317k
            s->l += n * sizeof(int16_t);
2588
317k
        } else {
2589
137k
            uint8_t *p;
2590
137k
            if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
2591
137k
                ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
2592
0
                return -1;
2593
137k
            p = (uint8_t *) s->s + s->l;
2594
351M
            for (i = 0; i < n; ++i) {
2595
351M
                i32_to_le(a[i], p);
2596
351M
                p += sizeof(int32_t);
2597
351M
            }
2598
137k
            s->l += n * sizeof(int32_t);
2599
137k
        }
2600
537k
    }
2601
2602
537k
    return 0;
2603
554k
}
2604
2605
#ifdef VCF_ALLOW_INT64
2606
static int bcf_enc_long1(kstring_t *s, int64_t x) {
2607
    uint32_t e = 0;
2608
    if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
2609
        return bcf_enc_int1(s, x);
2610
    if (x == bcf_int64_vector_end) {
2611
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2612
        e |= kputc(bcf_int8_vector_end, s) < 0;
2613
    } else if (x == bcf_int64_missing) {
2614
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2615
        e |= kputc(bcf_int8_missing, s) < 0;
2616
    } else {
2617
        e |= bcf_enc_size(s, 1, BCF_BT_INT64);
2618
        e |= ks_expand(s, 8);
2619
        if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; }
2620
    }
2621
    return e == 0 ? 0 : -1;
2622
}
2623
#endif
2624
2625
455k
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
2626
455k
    uint8_t *p;
2627
455k
    size_t i;
2628
455k
    size_t bytes = n * sizeof(float);
2629
2630
455k
    if (bytes / sizeof(float) != n) return -1;
2631
455k
    if (ks_resize(s, s->l + bytes) < 0) return -1;
2632
2633
455k
    p = (uint8_t *) s->s + s->l;
2634
100M
    for (i = 0; i < n; i++) {
2635
99.8M
        float_to_le(a[i], p);
2636
99.8M
        p += sizeof(float);
2637
99.8M
    }
2638
455k
    s->l += bytes;
2639
2640
455k
    return 0;
2641
455k
}
2642
2643
int bcf_enc_vfloat(kstring_t *s, int n, float *a)
2644
455k
{
2645
455k
    assert(n >= 0);
2646
455k
    bcf_enc_size(s, n, BCF_BT_FLOAT);
2647
455k
    serialize_float_array(s, n, a);
2648
455k
    return 0; // FIXME: check for errs in this function
2649
455k
}
2650
2651
int bcf_enc_vchar(kstring_t *s, int l, const char *a)
2652
4.06M
{
2653
4.06M
    bcf_enc_size(s, l, BCF_BT_CHAR);
2654
4.06M
    kputsn(a, l, s);
2655
4.06M
    return 0; // FIXME: check for errs in this function
2656
4.06M
}
2657
2658
// Special case of n==1 as it also occurs quite often in FORMAT data.
2659
// This version is also small enough to get inlined.
2660
320k
static inline int bcf_fmt_array1(kstring_t *s, int type, void *data) {
2661
320k
    uint32_t e = 0;
2662
320k
    uint8_t *p = (uint8_t *)data;
2663
320k
    int32_t v;
2664
2665
    // helps gcc more than clang here. In billions of cycles:
2666
    //          bcf_fmt_array1  bcf_fmt_array
2667
    // gcc7:    23.2            24.3
2668
    // gcc13:   21.6            23.0
2669
    // clang13: 27.1            27.8
2670
320k
    switch (type) {
2671
316k
    case BCF_BT_CHAR:
2672
316k
        e |= kputc_(*p == bcf_str_missing ? '.' : *p, s) < 0;
2673
316k
        break;
2674
2675
2.72k
    case BCF_BT_INT8:
2676
2.72k
        if (*(int8_t *)p != bcf_int8_vector_end) {
2677
2.72k
            e |= ((*(int8_t *)p == bcf_int8_missing)
2678
2.72k
                  ? kputc_('.', s)
2679
2.72k
                  : kputw(*(int8_t *)p, s)) < 0;
2680
2.72k
        }
2681
2.72k
        break;
2682
748
    case BCF_BT_INT16:
2683
748
        v = le_to_i16(p);
2684
748
        if (v != bcf_int16_vector_end) {
2685
748
            e |= (v == bcf_int16_missing
2686
748
                  ? kputc_('.', s)
2687
748
                  : kputw(v, s)) < 0;
2688
748
        }
2689
748
        break;
2690
2691
642
    case BCF_BT_INT32:
2692
642
        v = le_to_i32(p);
2693
642
        if (v != bcf_int32_vector_end) {
2694
642
            e |= (v == bcf_int32_missing
2695
642
                  ? kputc_('.', s)
2696
642
                  : kputw(v, s)) < 0;
2697
642
        }
2698
642
        break;
2699
2700
0
    case BCF_BT_FLOAT:
2701
0
        v = le_to_u32(p);
2702
0
        if (v != bcf_float_vector_end) {
2703
0
            e |= (v == bcf_float_missing
2704
0
                  ? kputc_('.', s)
2705
0
                  : kputd(le_to_float(p), s)) < 0;
2706
0
        }
2707
0
        break;
2708
2709
0
    default:
2710
0
        hts_log_error("Unexpected type %d", type);
2711
0
        return -1;
2712
320k
    }
2713
2714
320k
    return e == 0 ? 0 : -1;
2715
320k
}
2716
2717
int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
2718
4.36M
{
2719
4.36M
    int j = 0;
2720
4.36M
    uint32_t e = 0;
2721
4.36M
    if (n == 0) {
2722
2.86M
        return kputc_('.', s) >= 0 ? 0 : -1;
2723
2.86M
    }
2724
2725
1.49M
    if (type == BCF_BT_CHAR)
2726
150k
    {
2727
150k
        char *p = (char *)data;
2728
2729
        // Note bcf_str_missing is already accounted for in n==0 above.
2730
150k
        if (n >= 8) {
2731
69.8k
            char *p_end = memchr(p, 0, n);
2732
69.8k
            e |= kputsn(p, p_end ? p_end-p : n, s) < 0;
2733
80.2k
        } else {
2734
235k
            for (j = 0; j < n && *p; ++j, ++p)
2735
155k
               e |= kputc(*p, s) < 0;
2736
80.2k
        }
2737
150k
    }
2738
1.34M
    else
2739
1.34M
    {
2740
1.34M
        #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
2741
1.34M
            uint8_t *p = (uint8_t *) data; \
2742
280M
            for (j=0; j<n; j++, p += sizeof(type_t))    \
2743
279M
            { \
2744
279M
                type_t v = convert(p); \
2745
279M
                if ( is_vector_end ) break; \
2746
279M
                if ( j ) e |= kputc_(',', s) < 0; \
2747
279M
                e |= (is_missing ? kputc('.', s) : kprint) < 0; \
2748
279M
            } \
2749
1.34M
        }
2750
1.34M
        switch (type) {
2751
492k
            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, v==bcf_int8_missing,  v==bcf_int8_vector_end,  kputw(v, s)); break;
2752
299k
            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
2753
345k
            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
2754
206k
            case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
2755
0
            default: hts_log_error("Unexpected type %d", type); exit(1); break;
2756
1.34M
        }
2757
1.34M
        #undef BRANCH
2758
1.34M
    }
2759
1.49M
    return e == 0 ? 0 : -1;
2760
1.49M
}
2761
2762
uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
2763
2.40M
{
2764
2.40M
    int x, type;
2765
2.40M
    x = bcf_dec_size(ptr, &ptr, &type);
2766
2.40M
    bcf_fmt_array(s, x, type, ptr);
2767
2.40M
    return ptr + (x << bcf_type_shift[type]);
2768
2.40M
}
2769
2770
/********************
2771
 *** VCF site I/O ***
2772
 ********************/
2773
2774
typedef struct {
2775
    int key;            // Key for h->id[BCF_DT_ID][key] vdict
2776
    int max_m;          // number of elements in field array (ie commas)
2777
    int size;           // field size (max_l or max_g*4 if is_gt)
2778
    int offset;         // offset of buf into h->mem
2779
    uint32_t is_gt:1,   // is genotype
2780
             max_g:31;  // maximum number of genotypes
2781
    uint32_t max_l;     // length of field
2782
    uint32_t y;         // h->id[0][fmt[j].key].val->info[BCF_HL_FMT]
2783
    uint8_t *buf;       // Pointer into h->mem
2784
} fmt_aux_t;
2785
2786
// fmt_aux_t field notes:
2787
// max_* are biggest sizes of the various FORMAT fields across all samples.
2788
// We use these after pivoting the data to ensure easy random access
2789
// of a specific sample.
2790
//
2791
// max_m is only used for type BCF_HT_REAL or BCF_HT_INT
2792
// max_g is only used for is_gt == 1 (will be BCF_HT_STR)
2793
// max_l is only used for is_gt == 0 (will be BCF_HT_STR)
2794
//
2795
// These are computed in vcf_parse_format_max3 and used in
2796
// vcf_parse_format_alloc4 to get the size.
2797
//
2798
// size is computed from max_g, max_l, max_m and is_gt.  Once computed
2799
// the max values are never accessed again.
2800
//
2801
// In theory all 4 vars could be coalesced into a single variable, but this
2802
// significantly harms speed (even if done via a union).  It's about 25-30%
2803
// slower.
2804
2805
static inline int align_mem(kstring_t *s)
2806
99.5k
{
2807
99.5k
    int e = 0;
2808
99.5k
    if (s->l&7) {
2809
26.0k
        uint64_t zero = 0;
2810
26.0k
        e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0;
2811
26.0k
    }
2812
99.5k
    return e == 0 ? 0 : -1;
2813
99.5k
}
2814
2815
101k
#define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
2816
2817
// detect FORMAT "."
2818
static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2819
8.71k
                                   const char *p, const char *q) {
2820
8.71k
    const char *end = s->s + s->l;
2821
8.71k
    if ( q>=end )
2822
41
    {
2823
41
        hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
2824
41
        v->errcode |= BCF_ERR_NCOLS;
2825
41
        return -1;
2826
41
    }
2827
2828
8.67k
    v->n_fmt = 0;
2829
8.67k
    if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
2830
462
    {
2831
462
        v->n_sample = bcf_hdr_nsamples(h);
2832
462
        return 1;
2833
462
    }
2834
2835
8.21k
    return 0;
2836
8.67k
}
2837
2838
// get format information from the dictionary
2839
static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2840
8.21k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
2841
8.21k
    const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2842
8.21k
    char *t;
2843
8.21k
    int j;
2844
8.21k
    ks_tokaux_t aux1;
2845
2846
110k
    for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
2847
101k
        if (j >= MAX_N_FMT) {
2848
5
            v->errcode |= BCF_ERR_LIMITS;
2849
5
            hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
2850
5
                bcf_seqname_safe(h,v), v->pos+1);
2851
5
            return -1;
2852
5
        }
2853
2854
101k
        *(char*)aux1.p = 0;
2855
101k
        khint_t k = kh_get(vdict, d, t);
2856
101k
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
2857
8.73k
            if ( t[0]=='.' && t[1]==0 )
2858
1
            {
2859
1
                hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2860
1
                v->errcode |= BCF_ERR_TAG_INVALID;
2861
1
                return -1;
2862
1
            }
2863
8.72k
            hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);
2864
8.72k
            kstring_t tmp = {0,0,0};
2865
8.72k
            int l;
2866
8.72k
            ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
2867
8.72k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2868
8.72k
            free(tmp.s);
2869
8.72k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
2870
8.72k
            if (res < 0) bcf_hrec_destroy(hrec);
2871
8.72k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
2872
2873
8.72k
            k = kh_get(vdict, d, t);
2874
8.72k
            v->errcode |= BCF_ERR_TAG_UNDEF;
2875
8.72k
            if (res || k == kh_end(d)) {
2876
17
                hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
2877
17
                v->errcode |= BCF_ERR_TAG_INVALID;
2878
17
                return -1;
2879
17
            }
2880
8.72k
        }
2881
101k
        fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
2882
101k
        fmt[j].key = kh_val(d, k).id;
2883
101k
        fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]);
2884
101k
        fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
2885
101k
        v->n_fmt++;
2886
101k
    }
2887
8.18k
    return 0;
2888
8.21k
}
2889
2890
// compute max
2891
static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2892
8.18k
                                 char *p, char *q, fmt_aux_t *fmt) {
2893
8.18k
    int n_sample_ori = -1;
2894
8.18k
    char *r = q + 1;  // r: position in the format string
2895
8.18k
    int l = 0, m = 1, g = 1, j;
2896
8.18k
    v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
2897
8.18k
    const char *end = s->s + s->l;
2898
2899
33.4k
    while ( r<end )
2900
33.3k
    {
2901
        // can we skip some samples?
2902
33.3k
        if ( h->keep_samples )
2903
0
        {
2904
0
            n_sample_ori++;
2905
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
2906
0
            {
2907
0
                while ( *r!='\t' && r<end ) r++;
2908
0
                if ( *r=='\t' ) { *r = 0; r++; }
2909
0
                continue;
2910
0
            }
2911
0
        }
2912
2913
        // collect fmt stats: max vector size, length, number of alleles
2914
33.3k
        j = 0;  // j-th format field
2915
33.3k
        fmt_aux_t *f = fmt;
2916
33.3k
        static char meta[256] = {
2917
            // \0 \t , / : |
2918
33.3k
            1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2919
33.3k
            0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1, 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
2920
33.3k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2921
33.3k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
2922
33.3k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2923
33.3k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2924
33.3k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2925
33.3k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2926
33.3k
        };
2927
2928
33.3k
        char *r_start = r;
2929
8.17M
        for (;;) {
2930
            // Quickly skip ahead to an appropriate meta-character
2931
11.7M
            while (!meta[(unsigned char)*r]) r++;
2932
2933
8.17M
            switch (*r) {
2934
7.22M
            case ',':
2935
7.22M
                m++;
2936
7.22M
                break;
2937
2938
15.7k
            case '|':
2939
867k
            case '/':
2940
867k
                if (f->is_gt) g++;
2941
867k
                break;
2942
2943
12.0k
            case '\t':
2944
12.0k
                *r = 0; // fall through
2945
2946
12.0k
            default: // valid due to while loop above.
2947
33.3k
            case '\0':
2948
74.1k
            case ':':
2949
74.1k
                l = r - r_start; r_start = r;
2950
74.1k
                if (f->max_m < m) f->max_m = m;
2951
74.1k
                if (f->max_l < l) f->max_l = l;
2952
74.1k
                if (f->is_gt && f->max_g < g) f->max_g = g;
2953
74.1k
                l = 0, m = g = 1;
2954
74.1k
                if ( *r==':' ) {
2955
40.8k
                    j++; f++;
2956
40.8k
                    if ( j>=v->n_fmt ) {
2957
10
                        hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
2958
10
                                      h->id[BCF_DT_CTG][v->rid].key, v->pos+1);
2959
10
                        v->errcode |= BCF_ERR_NCOLS;
2960
10
                        return -1;
2961
10
                    }
2962
40.8k
                } else goto end_for;
2963
40.8k
                break;
2964
8.17M
            }
2965
8.13M
            if ( r>=end ) break;
2966
8.13M
            r++;
2967
8.13M
        }
2968
33.3k
    end_for:
2969
33.3k
        v->n_sample++;
2970
33.3k
        if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
2971
25.3k
        r++;
2972
25.3k
    }
2973
2974
8.17k
    return 0;
2975
8.18k
}
2976
2977
// allocate memory for arrays
2978
static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2979
                                   const char *p, const char *q,
2980
8.17k
                                   fmt_aux_t *fmt) {
2981
8.17k
    kstring_t *mem = (kstring_t*)&h->mem;
2982
2983
8.17k
    int j;
2984
107k
    for (j = 0; j < v->n_fmt; ++j) {
2985
99.5k
        fmt_aux_t *f = &fmt[j];
2986
99.5k
        if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
2987
2988
99.5k
        if ((f->y>>4&0xf) == BCF_HT_STR) {
2989
99.5k
            f->size = f->is_gt? f->max_g << 2 : f->max_l;
2990
99.5k
        } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
2991
0
            f->size = f->max_m << 2;
2992
6
        } else {
2993
6
            hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
2994
6
            v->errcode |= BCF_ERR_TAG_INVALID;
2995
6
            return -1;
2996
6
        }
2997
2998
99.5k
        if (align_mem(mem) < 0) {
2999
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3000
0
            v->errcode |= BCF_ERR_LIMITS;
3001
0
            return -1;
3002
0
        }
3003
3004
        // Limit the total memory to ~2Gb per VCF row.  This should mean
3005
        // malformed VCF data is less likely to take excessive memory and/or
3006
        // time.
3007
99.5k
        if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
3008
0
            static int warned = 0;
3009
0
            if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3010
0
            warned = 1;
3011
0
            v->errcode |= BCF_ERR_LIMITS;
3012
0
            f->size = -1;
3013
0
            f->offset = 0;
3014
0
            continue;
3015
0
        }
3016
3017
99.5k
        f->offset = mem->l;
3018
99.5k
        if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
3019
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3020
0
            v->errcode |= BCF_ERR_LIMITS;
3021
0
            return -1;
3022
0
        }
3023
99.5k
        mem->l += v->n_sample * f->size;
3024
99.5k
    }
3025
3026
8.17k
    {
3027
8.17k
        int j;
3028
107k
        for (j = 0; j < v->n_fmt; ++j)
3029
99.5k
            fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
3030
8.17k
    }
3031
3032
8.17k
    return 0;
3033
8.17k
}
3034
3035
// Fill the sample fields
3036
static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3037
8.17k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3038
8.17k
    static int extreme_val_warned = 0;
3039
8.17k
    int n_sample_ori = -1;
3040
    // At beginning of the loop t points to the first char of a format
3041
8.17k
    const char *t = q + 1;
3042
8.17k
    int m = 0;   // m: sample id
3043
8.17k
    const int nsamples = bcf_hdr_nsamples(h);
3044
3045
8.17k
    const char *end = s->s + s->l;
3046
41.1k
    while ( t<end )
3047
38.3k
    {
3048
        // can we skip some samples?
3049
38.3k
        if ( h->keep_samples )
3050
0
        {
3051
0
            n_sample_ori++;
3052
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3053
0
            {
3054
0
                while ( *t && t<end ) t++;
3055
0
                t++;
3056
0
                continue;
3057
0
            }
3058
0
        }
3059
38.3k
        if ( m == nsamples ) break;
3060
3061
33.1k
        int j = 0; // j-th format field, m-th sample
3062
73.4k
        while ( t < end )
3063
73.1k
        {
3064
73.1k
            fmt_aux_t *z = &fmt[j++];
3065
73.1k
            const int htype = z->y>>4&0xf;
3066
73.1k
            if (!z->buf) {
3067
3
                hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
3068
3
                              z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3069
3
                v->errcode |= BCF_ERR_LIMITS;
3070
3
                return -1;
3071
3
            }
3072
3073
73.1k
            if ( z->size==-1 )
3074
0
            {
3075
                // this field is to be ignored, it's too big
3076
0
                while ( *t != ':' && *t ) t++;
3077
0
            }
3078
73.1k
            else if (htype == BCF_HT_STR) {
3079
73.1k
                int l;
3080
73.1k
                if (z->is_gt) {
3081
                    // Genotypes.
3082
                    // <val>([|/]<val>)+... where <val> is [0-9]+ or ".".
3083
4.50k
                    int32_t is_phased = 0;
3084
4.50k
                    uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
3085
4.50k
                    uint32_t unreadable = 0;
3086
4.50k
                    uint32_t max = 0;
3087
4.50k
                    int overflow = 0;
3088
450k
                    for (l = 0;; ++t) {
3089
450k
                        if (*t == '.') {
3090
256k
                            ++t, x[l++] = is_phased;
3091
256k
                        } else {
3092
193k
                            const char *tt = t;
3093
193k
                            uint32_t val;
3094
                            // Or "v->n_allele < 10", but it doesn't
3095
                            // seem to be any faster and this feels safer.
3096
193k
                            if (*t >= '0' && *t <= '9' &&
3097
193k
                                !(t[1] >= '0' && t[1] <= '9')) {
3098
180k
                                val = *t++ - '0';
3099
180k
                            } else {
3100
13.7k
                                val = hts_str2uint(t, (char **)&t,
3101
13.7k
                                                   sizeof(val) * CHAR_MAX - 2,
3102
13.7k
                                                   &overflow);
3103
13.7k
                                unreadable |= tt == t;
3104
13.7k
                            }
3105
193k
                            if (max < val) max = val;
3106
193k
                            x[l++] = (val + 1) << 1 | is_phased;
3107
193k
                        }
3108
450k
                        is_phased = (*t == '|');
3109
450k
                        if (*t != '|' && *t != '/') break;
3110
450k
                    }
3111
                    // Possibly check max against v->n_allele instead?
3112
4.50k
                    if (overflow || max > (INT32_MAX >> 1) - 1) {
3113
107
                        hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3114
107
                        return -1;
3115
107
                    }
3116
4.39k
                    if (unreadable) {
3117
44
                        hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3118
44
                        return -1;
3119
44
                    }
3120
4.35k
                    if ( !l ) x[l++] = 0;   // An empty field, insert missing value
3121
589k
                    for (; l < z->size>>2; ++l)
3122
585k
                        x[l] = bcf_int32_vector_end;
3123
3124
68.6k
                } else {
3125
                    // Otherwise arbitrary strings
3126
68.6k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3127
9.85M
                    for (l = 0; *t != ':' && *t; ++t)
3128
9.78M
                        x[l++] = *t;
3129
68.6k
                    if (z->size > l)
3130
56.6k
                        memset(&x[l], 0, (z->size-l) * sizeof(*x));
3131
68.6k
                }
3132
3133
73.1k
            } else if (htype == BCF_HT_INT) {
3134
                // One or more integers in an array
3135
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3136
0
                int l;
3137
0
                for (l = 0;; ++t) {
3138
0
                    if (*t == '.') {
3139
0
                        x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
3140
0
                    } else {
3141
0
                        int overflow = 0;
3142
0
                        char *te;
3143
0
                        long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3144
0
                        if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3145
0
                        {
3146
0
                            if ( !extreme_val_warned )
3147
0
                            {
3148
0
                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,
3149
0
                                                h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
3150
0
                                extreme_val_warned = 1;
3151
0
                            }
3152
0
                            tmp_val = bcf_int32_missing;
3153
0
                        }
3154
0
                        x[l++] = tmp_val;
3155
0
                        t = te;
3156
0
                    }
3157
0
                    if (*t != ',') break;
3158
0
                }
3159
0
                if ( !l )
3160
0
                    x[l++] = bcf_int32_missing;
3161
0
                for (; l < z->size>>2; ++l)
3162
0
                    x[l] = bcf_int32_vector_end;
3163
3164
0
            } else if (htype == BCF_HT_REAL) {
3165
                // One of more floating point values in an array
3166
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3167
0
                int l;
3168
0
                for (l = 0;; ++t) {
3169
0
                    if (*t == '.' && !isdigit_c(t[1])) {
3170
0
                        bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
3171
0
                    } else {
3172
0
                        int overflow = 0;
3173
0
                        char *te;
3174
0
                        float tmp_val = hts_str2dbl(t, &te, &overflow);
3175
0
                        if ( (te==t || overflow) && !extreme_val_warned )
3176
0
                        {
3177
0
                            hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
3178
0
                            extreme_val_warned = 1;
3179
0
                        }
3180
0
                        x[l++] = tmp_val;
3181
0
                        t = te;
3182
0
                    }
3183
0
                    if (*t != ',') break;
3184
0
                }
3185
0
                if ( !l )
3186
                    // An empty field, insert missing value
3187
0
                    bcf_float_set_missing(x[l++]);
3188
0
                for (; l < z->size>>2; ++l)
3189
0
                    bcf_float_set_vector_end(x[l]);
3190
0
            } else {
3191
0
                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1);
3192
0
                v->errcode |= BCF_ERR_TAG_INVALID;
3193
0
                return -1;
3194
0
            }
3195
3196
73.0k
            if (*t == '\0') {
3197
32.6k
                break;
3198
32.6k
            }
3199
40.3k
            else if (*t == ':') {
3200
40.3k
                t++;
3201
40.3k
            }
3202
29
            else {
3203
29
                char buffer[8];
3204
29
                hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
3205
29
                    hts_strprint(buffer, sizeof buffer, '\'', t, 1),
3206
29
                    h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
3207
29
                v->errcode |= BCF_ERR_CHAR;
3208
29
                return -1;
3209
29
            }
3210
73.0k
        }
3211
3212
        // fill end-of-vector values
3213
1.28M
        for (; j < v->n_fmt; ++j) {
3214
1.24M
            fmt_aux_t *z = &fmt[j];
3215
1.24M
            const int htype = z->y>>4&0xf;
3216
1.24M
            int l;
3217
1.24M
            if (htype == BCF_HT_STR) {
3218
1.24M
                if (z->is_gt) {
3219
13.9k
                    int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3220
13.9k
                    if (z->size) x[0] = bcf_int32_missing;
3221
2.93M
                    for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3222
1.23M
                } else {
3223
1.23M
                    char *x = (char*)z->buf + z->size * (size_t)m;
3224
1.23M
                    if ( z->size ) {
3225
499k
                        x[0] = '.';
3226
499k
                        memset(&x[1], 0, (z->size-1) * sizeof(*x));
3227
499k
                    }
3228
1.23M
                }
3229
1.24M
            } else if (htype == BCF_HT_INT) {
3230
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3231
0
                x[0] = bcf_int32_missing;
3232
0
                for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3233
0
            } else if (htype == BCF_HT_REAL) {
3234
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3235
0
                bcf_float_set_missing(x[0]);
3236
0
                for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
3237
0
            }
3238
1.24M
        }
3239
3240
32.9k
        m++; t++;
3241
32.9k
    }
3242
3243
7.98k
    return 0;
3244
8.17k
}
3245
3246
// write individual genotype information
3247
static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3248
7.98k
                                const char *p, const char *q, fmt_aux_t *fmt) {
3249
7.98k
    kstring_t *str = &v->indiv;
3250
7.98k
    int i, need_downsize = 0;
3251
7.98k
    if (v->n_sample > 0) {
3252
103k
        for (i = 0; i < v->n_fmt; ++i) {
3253
95.5k
            fmt_aux_t *z = &fmt[i];
3254
95.5k
            if ( z->size==-1 ) {
3255
0
                need_downsize = 1;
3256
0
                continue;
3257
0
            }
3258
95.5k
            bcf_enc_int1(str, z->key);
3259
95.5k
            if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
3260
91.3k
                bcf_enc_size(str, z->size, BCF_BT_CHAR);
3261
91.3k
                kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
3262
91.3k
            } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
3263
4.27k
                bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
3264
4.27k
            } else {
3265
0
                bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
3266
0
                if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
3267
0
                                          (float *) z->buf) != 0) {
3268
0
                    v->errcode |= BCF_ERR_LIMITS;
3269
0
                    hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3270
0
                    return -1;
3271
0
                }
3272
0
            }
3273
95.5k
        }
3274
3275
7.97k
    }
3276
7.98k
    if ( need_downsize ) {
3277
0
        i = 1;
3278
0
        while ( i < v->n_fmt ) {
3279
0
            if ( fmt[i].size==-1 )
3280
0
            {
3281
0
                memmove(&fmt[i-1],&fmt[i],sizeof(*fmt));
3282
0
                v->n_fmt--;
3283
0
            }
3284
0
            else
3285
0
                i++;
3286
0
        }
3287
0
    }
3288
3289
7.98k
    return 0;
3290
7.98k
}
3291
3292
// validity checking
3293
7.98k
static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
3294
7.98k
    if ( v->n_sample!=bcf_hdr_nsamples(h) )
3295
110
    {
3296
110
        hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
3297
110
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
3298
110
        v->errcode |= BCF_ERR_NCOLS;
3299
110
        return -1;
3300
110
    }
3301
7.87k
    if ( v->indiv.l > 0xffffffff )
3302
0
    {
3303
0
        hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
3304
0
        v->errcode |= BCF_ERR_LIMITS;
3305
3306
        // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
3307
0
        v->n_fmt = 0;
3308
0
        return -1;
3309
0
    }
3310
3311
7.87k
    return 0;
3312
7.87k
}
3313
3314
// p,q is the start and the end of the FORMAT field
3315
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3316
                            char *p, char *q)
3317
15.2k
{
3318
15.2k
    if ( !bcf_hdr_nsamples(h) ) return 0;
3319
8.71k
    kstring_t *mem = (kstring_t*)&h->mem;
3320
8.71k
    mem->l = 0;
3321
3322
8.71k
    fmt_aux_t fmt[MAX_N_FMT];
3323
3324
    // detect FORMAT "."
3325
8.71k
    int ret; // +ve = ok, -ve = err
3326
8.71k
    if ((ret = vcf_parse_format_empty1(s, h, v, p, q)))
3327
503
        return ret ? 0 : -1;
3328
3329
    // get format information from the dictionary
3330
8.21k
    if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
3331
23
        return -1;
3332
3333
    // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is
3334
    // stored as per-type arrays AAA... BBB... CCC...  This is basically
3335
    // a data rotation or pivot.
3336
3337
    // The size of elements in the array grow to their maximum needed,
3338
    // permitting fast random access.  This means however we have to first
3339
    // scan the whole FORMAT line to find the maximum of each type, and
3340
    // then scan it again to find the store the data.
3341
    // We break this down into compute-max, allocate, fill-out-buffers
3342
3343
    // TODO: ?
3344
    // The alternative would be to pivot on the first pass, with fixed
3345
    // size entries for numerics and concatenated strings otherwise, also
3346
    // tracking maximum sizes.  Then on a second pass we reallocate and
3347
    // copy the data again to a uniformly sized array.  Two passes through
3348
    // memory, but without doubling string parsing.
3349
3350
    // compute max
3351
8.18k
    if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
3352
10
        return -1;
3353
3354
    // allocate memory for arrays
3355
8.17k
    if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
3356
6
        return -1;
3357
3358
    // fill the sample fields; at beginning of the loop
3359
8.17k
    if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
3360
183
        return -1;
3361
3362
    // write individual genotype information
3363
7.98k
    if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
3364
0
        return -1;
3365
3366
    // validity checking
3367
7.98k
    if (vcf_parse_format_check7(h, v) < 0)
3368
110
        return -1;
3369
3370
7.87k
    return 0;
3371
7.98k
}
3372
3373
5.06k
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
3374
    // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
3375
    // been already printed, but will enable tools like vcfcheck to proceed.
3376
3377
5.06k
    kstring_t tmp = {0,0,0};
3378
5.06k
    khint_t k;
3379
5.06k
    int l;
3380
5.06k
    if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0)
3381
0
        return kh_end(d);
3382
5.06k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3383
5.06k
    free(tmp.s);
3384
5.06k
    int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3385
5.06k
    if (res < 0) bcf_hrec_destroy(hrec);
3386
5.06k
    if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3387
5.06k
    k = kh_get(vdict, d, p);
3388
3389
5.06k
    return k;
3390
5.06k
}
3391
3392
19.5k
static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3393
19.5k
    int i, n_flt = 1, max_n_flt = 0;
3394
19.5k
    char *r, *t;
3395
19.5k
    int32_t *a_flt = NULL;
3396
19.5k
    ks_tokaux_t aux1;
3397
19.5k
    khint_t k;
3398
19.5k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3399
    // count the number of filters
3400
19.5k
    if (*(q-1) == ';') *(q-1) = 0;
3401
472M
    for (r = p; *r; ++r)
3402
472M
        if (*r == ';') ++n_flt;
3403
19.5k
    if (n_flt > max_n_flt) {
3404
19.5k
        a_flt = malloc(n_flt * sizeof(*a_flt));
3405
19.5k
        if (!a_flt) {
3406
0
            hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3407
0
            v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3408
0
            return -1;
3409
0
        }
3410
19.5k
        max_n_flt = n_flt;
3411
19.5k
    }
3412
    // add filters
3413
2.22M
    for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
3414
2.20M
        *(char*)aux1.p = 0;
3415
2.20M
        k = kh_get(vdict, d, t);
3416
2.20M
        if (k == kh_end(d))
3417
44.7k
        {
3418
            // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
3419
            // been already printed, but will enable tools like vcfcheck to proceed.
3420
44.7k
            hts_log_warning("FILTER '%s' is not defined in the header", t);
3421
44.7k
            kstring_t tmp = {0,0,0};
3422
44.7k
            int l;
3423
44.7k
            ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
3424
44.7k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3425
44.7k
            free(tmp.s);
3426
44.7k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3427
44.7k
            if (res < 0) bcf_hrec_destroy(hrec);
3428
44.7k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3429
44.7k
            k = kh_get(vdict, d, t);
3430
44.7k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3431
44.7k
            if (res || k == kh_end(d)) {
3432
71
                hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3433
71
                v->errcode |= BCF_ERR_TAG_INVALID;
3434
71
                free(a_flt);
3435
71
                return -1;
3436
71
            }
3437
44.7k
        }
3438
2.20M
        a_flt[i++] = kh_val(d, k).id;
3439
2.20M
    }
3440
3441
19.4k
    bcf_enc_vint(str, n_flt, a_flt, -1);
3442
19.4k
    free(a_flt);
3443
3444
19.4k
    return 0;
3445
19.5k
}
3446
3447
19.0k
static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3448
19.0k
    static int extreme_int_warned = 0, negative_rlen_warned = 0;
3449
19.0k
    int max_n_val = 0, overflow = 0;
3450
19.0k
    char *r, *key;
3451
19.0k
    khint_t k;
3452
19.0k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3453
19.0k
    int32_t *a_val = NULL;
3454
3455
19.0k
    v->n_info = 0;
3456
19.0k
    if (*(q-1) == ';') *(q-1) = 0;
3457
5.40M
    for (r = key = p;; ++r) {
3458
5.40M
        int c;
3459
5.40M
        char *val, *end;
3460
495M
        while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
3461
5.40M
        if (v->n_info == UINT16_MAX) {
3462
2
            hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
3463
2
                          bcf_seqname_safe(h,v), v->pos+1);
3464
2
            v->errcode |= BCF_ERR_LIMITS;
3465
2
            goto fail;
3466
2
        }
3467
5.40M
        val = end = NULL;
3468
5.40M
        c = *r; *r = 0;
3469
5.40M
        if (c == '=') {
3470
3.03M
            val = r + 1;
3471
3472
842M
            for (end = val; *end != ';' && *end != 0; ++end);
3473
3.03M
            c = *end; *end = 0;
3474
3.03M
        } else end = r;
3475
5.40M
        if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
3476
5.35M
        k = kh_get(vdict, d, key);
3477
5.35M
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
3478
46.8k
        {
3479
46.8k
            hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
3480
46.8k
            kstring_t tmp = {0,0,0};
3481
46.8k
            int l;
3482
46.8k
            ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
3483
46.8k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3484
46.8k
            free(tmp.s);
3485
46.8k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3486
46.8k
            if (res < 0) bcf_hrec_destroy(hrec);
3487
46.8k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3488
46.8k
            k = kh_get(vdict, d, key);
3489
46.8k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3490
46.8k
            if (res || k == kh_end(d)) {
3491
130
                hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
3492
130
                v->errcode |= BCF_ERR_TAG_INVALID;
3493
130
                goto fail;
3494
130
            }
3495
46.8k
        }
3496
5.35M
        uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
3497
5.35M
        ++v->n_info;
3498
5.35M
        bcf_enc_int1(str, kh_val(d, k).id);
3499
5.35M
        if (val == 0) {
3500
2.31M
            bcf_enc_size(str, 0, BCF_BT_NULL);
3501
3.03M
        } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
3502
74.3k
            bcf_enc_vchar(str, end - val, val);
3503
2.96M
        } else { // int/float value/array
3504
2.96M
            int i, n_val;
3505
2.96M
            char *t, *te;
3506
785M
            for (t = val, n_val = 1; *t; ++t) // count the number of values
3507
782M
                if (*t == ',') ++n_val;
3508
            // Check both int and float size in one step for simplicity
3509
2.96M
            if (n_val > max_n_val) {
3510
5.81k
                int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val));
3511
5.81k
                if (!a_tmp) {
3512
0
                    hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3513
0
                    v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3514
0
                    goto fail;
3515
0
                }
3516
5.81k
                a_val = a_tmp;
3517
5.81k
                max_n_val = n_val;
3518
5.81k
            }
3519
2.96M
            if ((y>>4&0xf) == BCF_HT_INT) {
3520
2.50M
                i = 0, t = val;
3521
2.50M
                int64_t val1;
3522
2.50M
                int is_int64 = 0;
3523
#ifdef VCF_ALLOW_INT64
3524
                if ( n_val==1 )
3525
                {
3526
                    overflow = 0;
3527
                    long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3528
                    if ( te==val ) tmp_val = bcf_int32_missing;
3529
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
3530
                    {
3531
                        if ( !extreme_int_warned )
3532
                        {
3533
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3534
                            extreme_int_warned = 1;
3535
                        }
3536
                        tmp_val = bcf_int32_missing;
3537
                    }
3538
                    else
3539
                        is_int64 = 1;
3540
                    val1 = tmp_val;
3541
                    t = te;
3542
                    i = 1;  // this is just to avoid adding another nested block...
3543
                }
3544
#endif
3545
527M
                for (; i < n_val; ++i, ++t)
3546
524M
                {
3547
524M
                    overflow = 0;
3548
524M
                    long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3549
524M
                    if ( te==t ) tmp_val = bcf_int32_missing;
3550
5.97M
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3551
1.05M
                    {
3552
1.05M
                        if ( !extreme_int_warned )
3553
1
                        {
3554
1
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3555
1
                            extreme_int_warned = 1;
3556
1
                        }
3557
1.05M
                        tmp_val = bcf_int32_missing;
3558
1.05M
                    }
3559
524M
                    a_val[i] = tmp_val;
3560
632M
                    for (t = te; *t && *t != ','; t++);
3561
524M
                }
3562
2.50M
                if (n_val == 1) {
3563
#ifdef VCF_ALLOW_INT64
3564
                    if ( is_int64 )
3565
                    {
3566
                        v->unpacked |= BCF_IS_64BIT;
3567
                        bcf_enc_long1(str, val1);
3568
                    }
3569
                    else
3570
                        bcf_enc_int1(str, (int32_t)val1);
3571
#else
3572
1.97M
                    val1 = a_val[0];
3573
1.97M
                    bcf_enc_int1(str, (int32_t)val1);
3574
1.97M
#endif
3575
1.97M
                } else {
3576
530k
                    bcf_enc_vint(str, n_val, a_val, -1);
3577
530k
                }
3578
2.50M
                if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
3579
2.50M
                    && memcmp(key, "END", 4) == 0)
3580
0
                {
3581
0
                    if ( val1 <= v->pos )
3582
0
                    {
3583
0
                        if ( !negative_rlen_warned )
3584
0
                        {
3585
0
                            hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
3586
0
                            negative_rlen_warned = 1;
3587
0
                        }
3588
0
                    }
3589
0
                    else
3590
0
                        v->rlen = val1 - v->pos;
3591
0
                }
3592
2.50M
            } else if ((y>>4&0xf) == BCF_HT_REAL) {
3593
455k
                float *val_f = (float *)a_val;
3594
100M
                for (i = 0, t = val; i < n_val; ++i, ++t)
3595
99.8M
                {
3596
99.8M
                    overflow = 0;
3597
99.8M
                    val_f[i] = hts_str2dbl(t, &te, &overflow);
3598
99.8M
                    if ( te==t || overflow ) // conversion failed
3599
98.9M
                        bcf_float_set_missing(val_f[i]);
3600
121M
                    for (t = te; *t && *t != ','; t++);
3601
99.8M
                }
3602
455k
                bcf_enc_vfloat(str, n_val, val_f);
3603
455k
            }
3604
2.96M
        }
3605
5.35M
        if (c == 0) break;
3606
5.34M
        r = end;
3607
5.34M
        key = r + 1;
3608
5.34M
    }
3609
3610
18.9k
    free(a_val);
3611
18.9k
    return 0;
3612
3613
132
 fail:
3614
132
    free(a_val);
3615
132
    return -1;
3616
19.0k
}
3617
3618
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
3619
20.8k
{
3620
20.8k
    int ret = -2, overflow = 0;
3621
20.8k
    char *p, *q, *r, *t;
3622
20.8k
    kstring_t *str;
3623
20.8k
    khint_t k;
3624
20.8k
    ks_tokaux_t aux;
3625
3626
//#define NOT_DOT(p) strcmp((p), ".")
3627
//#define NOT_DOT(p) (!(*p == '.' && !p[1]))
3628
//#define NOT_DOT(p) ((*p) != '.' || (p)[1])
3629
//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2))
3630
100k
#define NOT_DOT(p) (memcmp(p, ".\0", 2))
3631
3632
20.8k
    if (!s || !h || !v || !(s->s))
3633
0
        return ret;
3634
3635
    // Assumed in lots of places, but we may as well spot this early
3636
20.8k
    assert(sizeof(float) == sizeof(int32_t));
3637
3638
    // Ensure string we parse has space to permit some over-flow when during
3639
    // parsing.  Eg to do memcmp(key, "END", 4) in vcf_parse_info over
3640
    // the more straight forward looking strcmp, giving a speed advantage.
3641
20.8k
    if (ks_resize(s, s->l+4) < 0)
3642
0
        return -1;
3643
3644
    // Force our memory to be initialised so we avoid the technicality of
3645
    // undefined behaviour in using a 4-byte memcmp.  (The reality is this
3646
    // almost certainly is never detected by the compiler so has no impact,
3647
    // but equally so this code has minimal (often beneficial) impact on
3648
    // performance too.)
3649
20.8k
    s->s[s->l+0] = 0;
3650
20.8k
    s->s[s->l+1] = 0;
3651
20.8k
    s->s[s->l+2] = 0;
3652
20.8k
    s->s[s->l+3] = 0;
3653
3654
20.8k
    bcf_clear1(v);
3655
20.8k
    str = &v->shared;
3656
20.8k
    memset(&aux, 0, sizeof(ks_tokaux_t));
3657
3658
    // CHROM
3659
20.8k
    if (!(p = kstrtok(s->s, "\t", &aux)))
3660
0
        goto err;
3661
20.8k
    *(q = (char*)aux.p) = 0;
3662
3663
20.8k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
3664
20.8k
    k = kh_get(vdict, d, p);
3665
20.8k
    if (k == kh_end(d)) {
3666
5.06k
        hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
3667
5.06k
        v->errcode = BCF_ERR_CTG_UNDEF;
3668
5.06k
        if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
3669
71
            hts_log_error("Could not add dummy header for contig '%s'", p);
3670
71
            v->errcode |= BCF_ERR_CTG_INVALID;
3671
71
            goto err;
3672
71
        }
3673
5.06k
    }
3674
20.7k
    v->rid = kh_val(d, k).id;
3675
3676
    // POS
3677
20.7k
    if (!(p = kstrtok(0, 0, &aux)))
3678
377
        goto err;
3679
20.4k
    *(q = (char*)aux.p) = 0;
3680
3681
20.4k
    overflow = 0;
3682
20.4k
    char *tmp = p;
3683
20.4k
    v->pos = hts_str2uint(p, &p, 63, &overflow);
3684
20.4k
    if (overflow) {
3685
8
        hts_log_error("Position value '%s' is too large", tmp);
3686
8
        goto err;
3687
20.4k
    } else if ( *p ) {
3688
76
        hts_log_error("Could not parse the position '%s'", tmp);
3689
76
        goto err;
3690
20.3k
    } else {
3691
20.3k
        v->pos -= 1;
3692
20.3k
    }
3693
20.3k
    if (v->pos >= INT32_MAX)
3694
144
        v->unpacked |= BCF_IS_64BIT;
3695
3696
    // ID
3697
20.3k
    if (!(p = kstrtok(0, 0, &aux)))
3698
36
        goto err;
3699
20.3k
    *(q = (char*)aux.p) = 0;
3700
3701
20.3k
    if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p);
3702
391
    else bcf_enc_size(str, 0, BCF_BT_CHAR);
3703
3704
    // REF
3705
20.3k
    if (!(p = kstrtok(0, 0, &aux)))
3706
33
        goto err;
3707
20.2k
    *(q = (char*)aux.p) = 0;
3708
3709
20.2k
    bcf_enc_vchar(str, q - p, p);
3710
20.2k
    v->n_allele = 1, v->rlen = q - p;
3711
3712
    // ALT
3713
20.2k
    if (!(p = kstrtok(0, 0, &aux)))
3714
25
        goto err;
3715
20.2k
    *(q = (char*)aux.p) = 0;
3716
3717
20.2k
    if (NOT_DOT(p)) {
3718
186M
        for (r = t = p;; ++r) {
3719
186M
            if (*r == ',' || *r == 0) {
3720
3.95M
                if (v->n_allele == UINT16_MAX) {
3721
3
                    hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
3722
3
                                  bcf_seqname_safe(h,v), v->pos+1);
3723
3
                    v->errcode |= BCF_ERR_LIMITS;
3724
3
                    goto err;
3725
3
                }
3726
3.95M
                bcf_enc_vchar(str, r - t, t);
3727
3.95M
                t = r + 1;
3728
3.95M
                ++v->n_allele;
3729
3.95M
            }
3730
186M
            if (r == q) break;
3731
186M
        }
3732
20.1k
    }
3733
3734
    // QUAL
3735
20.2k
    if (!(p = kstrtok(0, 0, &aux)))
3736
43
        goto err;
3737
20.1k
    *(q = (char*)aux.p) = 0;
3738
3739
20.1k
    if (NOT_DOT(p)) v->qual = atof(p);
3740
465
    else bcf_float_set_missing(v->qual);
3741
20.1k
    if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
3742
3743
    // FILTER
3744
20.1k
    if (!(p = kstrtok(0, 0, &aux)))
3745
36
        goto err;
3746
20.1k
    *(q = (char*)aux.p) = 0;
3747
3748
20.1k
    if (NOT_DOT(p)) {
3749
19.5k
        if (vcf_parse_filter(str, h, v, p, q)) {
3750
71
            goto err;
3751
71
        }
3752
19.5k
    } else bcf_enc_vint(str, 0, 0, -1);
3753
20.0k
    if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
3754
3755
    // INFO
3756
20.0k
    if (!(p = kstrtok(0, 0, &aux)))
3757
86
        goto err;
3758
20.0k
    *(q = (char*)aux.p) = 0;
3759
3760
20.0k
    if (NOT_DOT(p)) {
3761
19.0k
        if (vcf_parse_info(str, h, v, p, q)) {
3762
132
            goto err;
3763
132
        }
3764
19.0k
    }
3765
19.8k
    if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
3766
3767
    // FORMAT; optional
3768
19.8k
    p = kstrtok(0, 0, &aux);
3769
19.8k
    if (p) {
3770
15.2k
        *(q = (char*)aux.p) = 0;
3771
3772
15.2k
        return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
3773
15.2k
    } else {
3774
4.59k
        return 0;
3775
4.59k
    }
3776
3777
0
 end:
3778
0
    ret = 0;
3779
3780
997
 err:
3781
997
    return ret;
3782
0
}
3783
3784
int vcf_open_mode(char *mode, const char *fn, const char *format)
3785
0
{
3786
0
    if (format == NULL) {
3787
        // Try to pick a format based on the filename extension
3788
0
        char extension[HTS_MAX_EXT_LEN];
3789
0
        if (find_file_extension(fn, extension) < 0) return -1;
3790
0
        return vcf_open_mode(mode, fn, extension);
3791
0
    }
3792
0
    else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
3793
0
    else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
3794
0
    else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
3795
0
    else return -1;
3796
3797
0
    return 0;
3798
0
}
3799
3800
int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
3801
21.8k
{
3802
21.8k
    int ret;
3803
21.8k
    ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
3804
21.8k
    if (ret < 0) return ret;
3805
20.8k
    return vcf_parse1(&fp->line, h, v);
3806
21.8k
}
3807
3808
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
3809
0
{
3810
0
    uint8_t *ptr_start = ptr;
3811
0
    fmt->id = bcf_dec_typed_int1(ptr, &ptr);
3812
0
    fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
3813
0
    fmt->size = fmt->n << bcf_type_shift[fmt->type];
3814
0
    fmt->p = ptr;
3815
0
    fmt->p_off  = ptr - ptr_start;
3816
0
    fmt->p_free = 0;
3817
0
    ptr += n_sample * fmt->size;
3818
0
    fmt->p_len = ptr - fmt->p;
3819
0
    return ptr;
3820
0
}
3821
3822
static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
3823
0
{
3824
0
    uint8_t *ptr_start = ptr;
3825
0
    int64_t len = 0;
3826
0
    info->key = bcf_dec_typed_int1(ptr, &ptr);
3827
0
    len = info->len = bcf_dec_size(ptr, &ptr, &info->type);
3828
0
    info->vptr = ptr;
3829
0
    info->vptr_off  = ptr - ptr_start;
3830
0
    info->vptr_free = 0;
3831
0
    info->v1.i = 0;
3832
0
    if (info->len == 1) {
3833
0
        switch(info->type) {
3834
0
        case BCF_BT_INT8:
3835
0
        case BCF_BT_CHAR:
3836
0
            info->v1.i = *(int8_t*)ptr;
3837
0
            break;
3838
0
        case BCF_BT_INT16:
3839
0
            info->v1.i = le_to_i16(ptr);
3840
0
            len <<= 1;
3841
0
            break;
3842
0
        case BCF_BT_INT32:
3843
0
            info->v1.i = le_to_i32(ptr);
3844
0
            len <<= 2;
3845
0
            break;
3846
0
        case BCF_BT_FLOAT:
3847
0
            info->v1.f = le_to_float(ptr);
3848
0
            len <<= 2;
3849
0
            break;
3850
0
        case BCF_BT_INT64:
3851
0
            info->v1.i = le_to_i64(ptr);
3852
0
            len <<= 3;
3853
0
            break;
3854
0
        }
3855
0
    } else {
3856
0
        len <<= bcf_type_shift[info->type];
3857
0
    }
3858
0
    ptr += len;
3859
3860
0
    info->vptr_len = ptr - info->vptr;
3861
0
    return ptr;
3862
0
}
3863
3864
int bcf_unpack(bcf1_t *b, int which)
3865
17.7k
{
3866
17.7k
    if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
3867
17.7k
    uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
3868
17.7k
    int i;
3869
17.7k
    bcf_dec_t *d = &b->d;
3870
17.7k
    if (which & BCF_UN_FLT) which |= BCF_UN_STR;
3871
17.7k
    if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
3872
17.7k
    if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
3873
17.7k
    {
3874
17.7k
        kstring_t tmp;
3875
3876
        // ID
3877
17.7k
        tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
3878
17.7k
        ptr_ori = ptr;
3879
17.7k
        ptr = bcf_fmt_sized_array(&tmp, ptr);
3880
17.7k
        b->unpack_size[0] = ptr - ptr_ori;
3881
17.7k
        kputc_('\0', &tmp);
3882
17.7k
        d->id = tmp.s; d->m_id = tmp.m;
3883
3884
        // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
3885
17.7k
        hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
3886
17.7k
        tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
3887
17.7k
        ptr_ori = ptr;
3888
2.40M
        for (i = 0; i < b->n_allele; ++i) {
3889
            // Use offset within tmp.s as realloc may change pointer
3890
2.38M
            d->allele[i] = (char *)(intptr_t)tmp.l;
3891
2.38M
            ptr = bcf_fmt_sized_array(&tmp, ptr);
3892
2.38M
            kputc_('\0', &tmp);
3893
2.38M
        }
3894
17.7k
        b->unpack_size[1] = ptr - ptr_ori;
3895
17.7k
        d->als = tmp.s; d->m_als = tmp.m;
3896
3897
        // Convert our offsets within tmp.s back to pointers again
3898
2.40M
        for (i = 0; i < b->n_allele; ++i)
3899
2.38M
            d->allele[i] = d->als + (ptrdiff_t)d->allele[i];
3900
17.7k
        b->unpacked |= BCF_UN_STR;
3901
17.7k
    }
3902
17.7k
    if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
3903
17.7k
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
3904
17.7k
        ptr_ori = ptr;
3905
17.7k
        if (*ptr>>4) {
3906
16.9k
            int type;
3907
16.9k
            d->n_flt = bcf_dec_size(ptr, &ptr, &type);
3908
16.9k
            hts_expand(int, d->n_flt, d->m_flt, d->flt);
3909
1.72M
            for (i = 0; i < d->n_flt; ++i)
3910
1.70M
                d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
3911
16.9k
        } else ++ptr, d->n_flt = 0;
3912
17.7k
        b->unpack_size[2] = ptr - ptr_ori;
3913
17.7k
        b->unpacked |= BCF_UN_FLT;
3914
17.7k
    }
3915
17.7k
    if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
3916
0
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
3917
0
        hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
3918
0
        for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
3919
0
        for (i = 0; i < b->n_info; ++i)
3920
0
            ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
3921
0
        b->unpacked |= BCF_UN_INFO;
3922
0
    }
3923
17.7k
    if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
3924
0
        ptr = (uint8_t*)b->indiv.s;
3925
0
        hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
3926
0
        for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
3927
0
        for (i = 0; i < b->n_fmt; ++i)
3928
0
            ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
3929
0
        b->unpacked |= BCF_UN_FMT;
3930
0
    }
3931
17.7k
    return 0;
3932
17.7k
}
3933
3934
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
3935
17.7k
{
3936
17.7k
    int i;
3937
17.7k
    int32_t max_dt_id = h->n[BCF_DT_ID];
3938
17.7k
    const char *chrom = bcf_seqname(h, v);
3939
17.7k
    if (!chrom) {
3940
0
        hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
3941
0
                      v->rid);
3942
0
        errno = EINVAL;
3943
0
        return -1;
3944
0
    }
3945
3946
17.7k
    bcf_unpack((bcf1_t*)v, BCF_UN_ALL & ~(BCF_UN_INFO|BCF_UN_FMT));
3947
3948
    // Cache of key lengths so we don't keep repeatedly using them.
3949
    // This assumes we're not modifying the header between successive calls
3950
    // to vcf_format, but that would lead to many other forms of breakage
3951
    // so it feels like a valid assumption to make.
3952
    //
3953
    // We cannot just do this in bcf_hdr_sync as some code (eg bcftools
3954
    // annotate) manipulates the headers directly without calling sync to
3955
    // refresh the data structures.  So we must do just-in-time length
3956
    // calculation during writes instead.
3957
17.7k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
3958
17.7k
    if (!aux->key_len) {
3959
4.12k
        if (!(aux->key_len = calloc(h->n[BCF_DT_ID]+1, sizeof(*aux->key_len))))
3960
0
            return -1;
3961
4.12k
    }
3962
17.7k
    size_t *key_len = aux->key_len;
3963
3964
17.7k
    kputs(chrom, s); // CHROM
3965
17.7k
    kputc_('\t', s); kputll(v->pos + 1, s); // POS
3966
17.7k
    kputc_('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
3967
17.7k
    kputc_('\t', s); // REF
3968
17.7k
    if (v->n_allele > 0) kputs(v->d.allele[0], s);
3969
0
    else kputc_('.', s);
3970
17.7k
    kputc_('\t', s); // ALT
3971
17.7k
    if (v->n_allele > 1) {
3972
2.38M
        for (i = 1; i < v->n_allele; ++i) {
3973
2.37M
            if (i > 1) kputc_(',', s);
3974
2.37M
            kputs(v->d.allele[i], s);
3975
2.37M
        }
3976
17.4k
    } else kputc_('.', s);
3977
17.7k
    kputc_('\t', s); // QUAL
3978
17.7k
    if ( bcf_float_is_missing(v->qual) ) kputc_('.', s); // QUAL
3979
17.2k
    else kputd(v->qual, s);
3980
17.7k
    kputc_('\t', s); // FILTER
3981
17.7k
    if (v->d.n_flt) {
3982
1.72M
        for (i = 0; i < v->d.n_flt; ++i) {
3983
1.70M
            int32_t idx = v->d.flt[i];
3984
1.70M
            if (idx < 0 || idx >= max_dt_id
3985
1.70M
                || h->id[BCF_DT_ID][idx].key == NULL) {
3986
0
                hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
3987
0
                              idx, bcf_seqname_safe(h, v), v->pos + 1);
3988
0
                errno = EINVAL;
3989
0
                return -1;
3990
0
            }
3991
1.70M
            if (i) kputc_(';', s);
3992
1.70M
            if (!key_len[idx])
3993
77.1k
                key_len[idx] = strlen(h->id[BCF_DT_ID][idx].key);
3994
1.70M
            kputsn(h->id[BCF_DT_ID][idx].key, key_len[idx], s);
3995
1.70M
        }
3996
16.9k
    } else kputc_('.', s);
3997
3998
17.7k
    kputc_('\t', s); // INFO
3999
17.7k
    if (v->n_info) {
4000
6.05k
        uint8_t *ptr = (uint8_t *)v->shared.s + v->unpack_size[0] + v->unpack_size[1] + v->unpack_size[2];
4001
6.05k
        int first = 1;
4002
6.05k
        bcf_info_t *info = v->d.info;
4003
4004
        // Note if we duplicate this code into custom packed and unpacked
4005
        // implementations then we gain a bit more speed, particularly with
4006
        // clang 13 (up to 5%).  Not sure why this is, but code duplication
4007
        // isn't pleasant and it's still faster adding packed support than
4008
        // not so it's a win, just not as good as it should be.
4009
6.05k
        const int info_packed = !(v->unpacked & BCF_UN_INFO) && v->shared.l;
4010
2.68M
        for (i = 0; i < v->n_info; ++i) {
4011
2.68M
            bcf_info_t in, *z;
4012
2.68M
            if (info_packed) {
4013
                // Use a local bcf_info_t when data is packed
4014
2.68M
                z = &in;
4015
2.68M
                z->key  = bcf_dec_typed_int1(ptr, &ptr);
4016
2.68M
                z->len  = bcf_dec_size(ptr, &ptr, &z->type);
4017
2.68M
                z->vptr = ptr;
4018
2.68M
                ptr += z->len << bcf_type_shift[z->type];
4019
2.68M
            } else {
4020
                // Else previously unpacked INFO struct
4021
0
                z = &info[i];
4022
4023
                // Also potentially since deleted
4024
0
                if ( !z->vptr ) continue;
4025
0
            }
4026
4027
2.68M
            bcf_idpair_t *id = z->key >= 0 && z->key < max_dt_id
4028
2.68M
                ? &h->id[BCF_DT_ID][z->key]
4029
2.68M
                : NULL;
4030
4031
2.68M
            if (!id || !id->key) {
4032
0
                hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
4033
0
                              z->key,
4034
0
                              z->key < 0 ? "negative"
4035
0
                              : (z->key >= max_dt_id ? "too large" : "not present in the header"),
4036
0
                              bcf_seqname_safe(h, v), v->pos+1);
4037
0
                errno = EINVAL;
4038
0
                return -1;
4039
0
            }
4040
4041
            // KEY
4042
2.68M
            if (!key_len[z->key])
4043
28.3k
                key_len[z->key] = strlen(id->key);
4044
2.68M
            size_t id_len = key_len[z->key];
4045
2.68M
            if (ks_resize(s, s->l + 3 + id_len) < 0)
4046
0
                return -1;
4047
2.68M
            char *sptr = s->s + s->l;
4048
2.68M
            if ( !first ) {
4049
2.67M
                *sptr++ = ';';
4050
2.67M
                s->l++;
4051
2.67M
            }
4052
2.68M
            first = 0;
4053
2.68M
            memcpy(sptr, id->key, id_len);
4054
2.68M
            s->l += id_len;
4055
4056
            // VALUE
4057
2.68M
            if (z->len <= 0) continue;
4058
1.38M
            sptr[id_len] = '=';
4059
1.38M
            s->l++;
4060
4061
1.38M
            if (z->len != 1 || info_packed) {
4062
1.38M
                bcf_fmt_array(s, z->len, z->type, z->vptr);
4063
1.38M
            } else {
4064
                // Single length vectors are unpacked into their
4065
                // own info.v1 union and handled separately.
4066
0
                if (z->type == BCF_BT_FLOAT) {
4067
0
                    if ( bcf_float_is_missing(z->v1.f) )
4068
0
                        kputc_('.', s);
4069
0
                    else
4070
0
                        kputd(z->v1.f, s);
4071
0
                } else if (z->type == BCF_BT_CHAR) {
4072
0
                    kputc_(z->v1.i, s);
4073
0
                } else if (z->type < BCF_BT_INT64) {
4074
0
                    int64_t missing[] = {
4075
0
                        0, // BCF_BT_NULL
4076
0
                        bcf_int8_missing,
4077
0
                        bcf_int16_missing,
4078
0
                        bcf_int32_missing,
4079
0
                    };
4080
0
                    if (z->v1.i == missing[z->type])
4081
0
                        kputc_('.', s);
4082
0
                    else
4083
0
                        kputw(z->v1.i, s);
4084
0
                } else if (z->type == BCF_BT_INT64) {
4085
0
                    if (z->v1.i == bcf_int64_missing)
4086
0
                        kputc_('.', s);
4087
0
                    else
4088
0
                        kputll(z->v1.i, s);
4089
0
                } else {
4090
0
                    hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
4091
0
                    errno = EINVAL;
4092
0
                    return -1;
4093
0
                }
4094
0
            }
4095
1.38M
        }
4096
6.05k
        if ( first ) kputc_('.', s);
4097
11.6k
    } else kputc_('.', s);
4098
4099
    // FORMAT and individual information
4100
17.7k
    if (v->n_sample) {
4101
7.68k
        int i,j;
4102
7.68k
        if ( v->n_fmt) {
4103
7.23k
            uint8_t *ptr = (uint8_t *)v->indiv.s;
4104
7.23k
            int gt_i = -1;
4105
7.23k
            bcf_fmt_t *fmt = v->d.fmt;
4106
7.23k
            int first = 1;
4107
7.23k
            int fmt_packed = !(v->unpacked & BCF_UN_FMT);
4108
4109
7.23k
            if (fmt_packed) {
4110
                // Local fmt as we have an array of num FORMAT keys,
4111
                // each of which points to N.Sample values.
4112
4113
                // No real gain to be had in handling unpacked data here,
4114
                // but it doesn't cost us much in complexity either and
4115
                // it gives us flexibility.
4116
7.23k
                fmt = malloc(v->n_fmt * sizeof(*fmt));
4117
7.23k
                if (!fmt)
4118
0
                    return -1;
4119
7.23k
            }
4120
4121
            // KEYS
4122
79.9k
            for (i = 0; i < (int)v->n_fmt; ++i) {
4123
72.7k
                bcf_fmt_t *z;
4124
72.7k
                z = &fmt[i];
4125
72.7k
                if (fmt_packed) {
4126
72.7k
                    z->id   = bcf_dec_typed_int1(ptr, &ptr);
4127
72.7k
                    z->n    = bcf_dec_size(ptr, &ptr, &z->type);
4128
72.7k
                    z->p    = ptr;
4129
72.7k
                    z->size = z->n << bcf_type_shift[z->type];
4130
72.7k
                    ptr += v->n_sample * z->size;
4131
72.7k
                }
4132
72.7k
                if ( !z->p ) continue;
4133
72.7k
                kputc_(!first ? ':' : '\t', s); first = 0;
4134
4135
72.7k
                bcf_idpair_t *id = z->id >= 0 && z->id < max_dt_id
4136
72.7k
                    ? &h->id[BCF_DT_ID][z->id]
4137
72.7k
                    : NULL;
4138
4139
72.7k
                if (!id || !id->key) {
4140
0
                    hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1);
4141
0
                    errno = EINVAL;
4142
0
                    return -1;
4143
0
                }
4144
4145
72.7k
                if (!key_len[z->id])
4146
64.1k
                    key_len[z->id] = strlen(id->key);
4147
72.7k
                size_t id_len = key_len[z->id];
4148
72.7k
                kputsn(id->key, id_len, s);
4149
72.7k
                if (id_len == 2 && id->key[0] == 'G' && id->key[1] == 'T')
4150
3.69k
                    gt_i = i;
4151
72.7k
            }
4152
7.23k
            if ( first ) kputsn("\t.", 2, s);
4153
4154
            // VALUES per sample
4155
33.6k
            for (j = 0; j < v->n_sample; ++j) {
4156
26.4k
                kputc_('\t', s);
4157
26.4k
                first = 1;
4158
26.4k
                bcf_fmt_t *f = fmt;
4159
895k
                for (i = 0; i < (int)v->n_fmt; i++, f++) {
4160
878k
                    if ( !f->p ) continue;
4161
878k
                    if (!first) kputc_(':', s);
4162
878k
                    first = 0;
4163
878k
                    if (gt_i == i) {
4164
10.0k
                        bcf_format_gt(f,j,s);
4165
10.0k
                        break;
4166
10.0k
                    }
4167
868k
                    else if (f->n == 1)
4168
320k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4169
548k
                    else
4170
548k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4171
878k
                }
4172
4173
                // Simpler loop post GT and at least 1 iteration
4174
53.8k
                for (i++, f++; i < (int)v->n_fmt; i++, f++) {
4175
27.4k
                    if ( !f->p ) continue;
4176
27.4k
                    kputc_(':', s);
4177
27.4k
                    if (f->n == 1)
4178
673
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4179
26.7k
                    else
4180
26.7k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4181
27.4k
                }
4182
26.4k
                if ( first ) kputc_('.', s);
4183
26.4k
            }
4184
7.23k
            if (fmt_packed)
4185
7.23k
                free(fmt);
4186
7.23k
        }
4187
449
        else
4188
1.41k
            for (j=0; j<=v->n_sample; j++)
4189
968
                kputsn("\t.", 2, s);
4190
7.68k
    }
4191
17.7k
    kputc('\n', s);
4192
17.7k
    return 0;
4193
17.7k
}
4194
4195
int vcf_write_line(htsFile *fp, kstring_t *line)
4196
0
{
4197
0
    int ret;
4198
0
    if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
4199
0
    if ( fp->format.compression!=no_compression )
4200
0
        ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
4201
0
    else
4202
0
        ret = hwrite(fp->fp.hfile, line->s, line->l);
4203
0
    return ret==line->l ? 0 : -1;
4204
0
}
4205
4206
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4207
17.7k
{
4208
17.7k
    ssize_t ret;
4209
17.7k
    fp->line.l = 0;
4210
17.7k
    if (vcf_format1(h, v, &fp->line) != 0)
4211
0
        return -1;
4212
17.7k
    if ( fp->format.compression!=no_compression ) {
4213
0
        if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4214
0
            return -1;
4215
0
        ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
4216
17.7k
    } else {
4217
17.7k
        ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
4218
17.7k
    }
4219
4220
17.7k
    if (fp->idx && fp->format.compression == bgzf) {
4221
0
        int tid;
4222
0
        if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
4223
0
            return -1;
4224
4225
0
        if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
4226
0
                          tid, v->pos, v->pos + v->rlen,
4227
0
                          bgzf_tell(fp->fp.bgzf), 1) < 0)
4228
0
            return -1;
4229
0
    }
4230
4231
17.7k
    return ret==fp->line.l ? 0 : -1;
4232
17.7k
}
4233
4234
/************************
4235
 * Data access routines *
4236
 ************************/
4237
4238
int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
4239
9.52k
{
4240
9.52k
    khint_t k;
4241
9.52k
    vdict_t *d = (vdict_t*)h->dict[which];
4242
9.52k
    k = kh_get(vdict, d, id);
4243
9.52k
    return k == kh_end(d)? -1 : kh_val(d, k).id;
4244
9.52k
}
4245
4246
4247
/********************
4248
 *** BCF indexing ***
4249
 ********************/
4250
4251
// Calculate number of index levels given min_shift and the header contig
4252
// list.  Also returns number of contigs in *nids_out.
4253
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
4254
                               int starting_n_lvls, int *nids_out)
4255
0
{
4256
0
    int n_lvls, i, nids = 0;
4257
0
    int64_t max_len = 0, s;
4258
4259
0
    for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
4260
0
    {
4261
0
        if ( !h->id[BCF_DT_CTG][i].val ) continue;
4262
0
        if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
4263
0
            max_len = h->id[BCF_DT_CTG][i].val->info[0];
4264
0
        nids++;
4265
0
    }
4266
0
    if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
4267
0
    max_len += 256;
4268
0
    s = 1LL << (min_shift + starting_n_lvls * 3);
4269
0
    for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3);
4270
4271
0
    if (nids_out) *nids_out = nids;
4272
0
    return n_lvls;
4273
0
}
4274
4275
hts_idx_t *bcf_index(htsFile *fp, int min_shift)
4276
0
{
4277
0
    int n_lvls;
4278
0
    bcf1_t *b = NULL;
4279
0
    hts_idx_t *idx = NULL;
4280
0
    bcf_hdr_t *h;
4281
0
    int r;
4282
0
    h = bcf_hdr_read(fp);
4283
0
    if ( !h ) return NULL;
4284
0
    int nids = 0;
4285
0
    n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
4286
0
    idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4287
0
    if (!idx) goto fail;
4288
0
    b = bcf_init1();
4289
0
    if (!b) goto fail;
4290
0
    while ((r = bcf_read1(fp,h, b)) >= 0) {
4291
0
        int ret;
4292
0
        ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
4293
0
        if (ret < 0) goto fail;
4294
0
    }
4295
0
    if (r < -1) goto fail;
4296
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
4297
0
    bcf_destroy1(b);
4298
0
    bcf_hdr_destroy(h);
4299
0
    return idx;
4300
4301
0
 fail:
4302
0
    hts_idx_destroy(idx);
4303
0
    bcf_destroy1(b);
4304
0
    bcf_hdr_destroy(h);
4305
0
    return NULL;
4306
0
}
4307
4308
hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
4309
0
{
4310
0
    return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn);
4311
0
}
4312
4313
hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
4314
0
{
4315
0
    return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
4316
0
}
4317
4318
int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
4319
0
{
4320
0
    htsFile *fp;
4321
0
    hts_idx_t *idx;
4322
0
    tbx_t *tbx;
4323
0
    int ret;
4324
0
    if ((fp = hts_open(fn, "rb")) == 0) return -2;
4325
0
    if (n_threads)
4326
0
        hts_set_threads(fp, n_threads);
4327
0
    if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
4328
0
    switch (fp->format.format) {
4329
0
        case bcf:
4330
0
            if (!min_shift) {
4331
0
                hts_log_error("TBI indices for BCF files are not supported");
4332
0
                ret = -1;
4333
0
            } else {
4334
0
                idx = bcf_index(fp, min_shift);
4335
0
                if (idx) {
4336
0
                    ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
4337
0
                    if (ret < 0) ret = -4;
4338
0
                    hts_idx_destroy(idx);
4339
0
                }
4340
0
                else ret = -1;
4341
0
            }
4342
0
            break;
4343
4344
0
        case vcf:
4345
0
            tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
4346
0
            if (tbx) {
4347
0
                ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
4348
0
                if (ret < 0) ret = -4;
4349
0
                tbx_destroy(tbx);
4350
0
            }
4351
0
            else ret = -1;
4352
0
            break;
4353
4354
0
        default:
4355
0
            ret = -3;
4356
0
            break;
4357
0
    }
4358
0
    hts_close(fp);
4359
0
    return ret;
4360
0
}
4361
4362
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
4363
0
{
4364
0
    return bcf_index_build3(fn, fnidx, min_shift, 0);
4365
0
}
4366
4367
int bcf_index_build(const char *fn, int min_shift)
4368
0
{
4369
0
    return bcf_index_build3(fn, NULL, min_shift, 0);
4370
0
}
4371
4372
// Initialise fp->idx for the current format type.
4373
// This must be called after the header has been written but no other data.
4374
0
static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4375
0
    int n_lvls, fmt;
4376
4377
0
    if (min_shift == 0) {
4378
0
        min_shift = 14;
4379
0
        n_lvls = 5;
4380
0
        fmt = HTS_FMT_TBI;
4381
0
    } else {
4382
        // Set initial n_lvls to match tbx_index()
4383
0
        int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
4384
        // Increase if necessary
4385
0
        n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL);
4386
0
        fmt = HTS_FMT_CSI;
4387
0
    }
4388
4389
0
    fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4390
0
    if (!fp->idx) return -1;
4391
4392
    // Tabix meta data, added even in CSI for VCF
4393
0
    uint8_t conf[4*7];
4394
0
    u32_to_le(TBX_VCF, conf+0);  // fmt
4395
0
    u32_to_le(1,       conf+4);  // name col
4396
0
    u32_to_le(2,       conf+8);  // beg col
4397
0
    u32_to_le(0,       conf+12); // end col
4398
0
    u32_to_le('#',     conf+16); // comment
4399
0
    u32_to_le(0,       conf+20); // n.skip
4400
0
    u32_to_le(0,       conf+24); // ref name len
4401
0
    if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
4402
0
        hts_idx_destroy(fp->idx);
4403
0
        fp->idx = NULL;
4404
0
        return -1;
4405
0
    }
4406
0
    fp->fnidx = fnidx;
4407
4408
0
    return 0;
4409
0
}
4410
4411
// Initialise fp->idx for the current format type.
4412
// This must be called after the header has been written but no other data.
4413
0
int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4414
0
    int n_lvls, nids = 0;
4415
4416
0
    if (fp->format.compression != bgzf) {
4417
0
        hts_log_error("Indexing is only supported on BGZF-compressed files");
4418
0
        return -3; // Matches no-compression return for bcf_index_build3()
4419
0
    }
4420
4421
0
    if (fp->format.format == vcf)
4422
0
        return vcf_idx_init(fp, h, min_shift, fnidx);
4423
4424
0
    if (!min_shift)
4425
0
        min_shift = 14;
4426
4427
0
    n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
4428
4429
0
    fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4430
0
    if (!fp->idx) return -1;
4431
0
    fp->fnidx = fnidx;
4432
4433
0
    return 0;
4434
0
}
4435
4436
// Finishes an index. Call after the last record has been written.
4437
// Returns 0 on success, <0 on failure.
4438
//
4439
// NB: same format as SAM/BAM as it uses bgzf.
4440
0
int bcf_idx_save(htsFile *fp) {
4441
0
    return sam_idx_save(fp);
4442
0
}
4443
4444
/*****************
4445
 *** Utilities ***
4446
 *****************/
4447
4448
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
4449
0
{
4450
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
4451
0
    for (i=0; i<src->nhrec; i++)
4452
0
    {
4453
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4454
0
        {
4455
0
            int j;
4456
0
            for (j=0; j<ndst_ori; j++)
4457
0
            {
4458
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4459
4460
                // Checking only the key part of generic lines, otherwise
4461
                // the VCFs are too verbose. Should we perhaps add a flag
4462
                // to bcf_hdr_combine() and make this optional?
4463
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4464
0
            }
4465
0
            if ( j>=ndst_ori ) {
4466
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4467
0
                if (res < 0) return -1;
4468
0
                need_sync += res;
4469
0
            }
4470
0
        }
4471
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4472
0
        {
4473
            // NB: we are ignoring fields without ID
4474
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4475
0
            if ( j>=0 )
4476
0
            {
4477
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4478
0
                if ( !rec ) {
4479
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4480
0
                    if (res < 0) return -1;
4481
0
                    need_sync += res;
4482
0
                }
4483
0
            }
4484
0
        }
4485
0
        else
4486
0
        {
4487
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4488
0
            assert( j>=0 ); // this should always be true for valid VCFs
4489
4490
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4491
0
            if ( !rec ) {
4492
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4493
0
                if (res < 0) return -1;
4494
0
                need_sync += res;
4495
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4496
0
            {
4497
                // Check that both records are of the same type. The bcf_hdr_id2length
4498
                // macro cannot be used here because dst header is not synced yet.
4499
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4500
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4501
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4502
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4503
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4504
0
                {
4505
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4506
0
                        src->hrec[i]->vals[0]);
4507
0
                    ret |= 1;
4508
0
                }
4509
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4510
0
                {
4511
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4512
0
                        src->hrec[i]->vals[0]);
4513
0
                    ret |= 1;
4514
0
                }
4515
0
            }
4516
0
        }
4517
0
    }
4518
0
    if ( need_sync ) {
4519
0
        if (bcf_hdr_sync(dst) < 0) return -1;
4520
0
    }
4521
0
    return ret;
4522
0
}
4523
4524
bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
4525
0
{
4526
0
    if ( !dst )
4527
0
    {
4528
        // this will effectively strip existing IDX attributes from src to become dst
4529
0
        dst = bcf_hdr_init("r");
4530
0
        kstring_t htxt = {0,0,0};
4531
0
        if (bcf_hdr_format(src, 0, &htxt) < 0) {
4532
0
            free(htxt.s);
4533
0
            return NULL;
4534
0
        }
4535
0
        if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
4536
0
            bcf_hdr_destroy(dst);
4537
0
            dst = NULL;
4538
0
        }
4539
0
        free(htxt.s);
4540
0
        return dst;
4541
0
    }
4542
4543
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, res;
4544
0
    for (i=0; i<src->nhrec; i++)
4545
0
    {
4546
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4547
0
        {
4548
0
            int j;
4549
0
            for (j=0; j<ndst_ori; j++)
4550
0
            {
4551
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4552
4553
                // Checking only the key part of generic lines, otherwise
4554
                // the VCFs are too verbose. Should we perhaps add a flag
4555
                // to bcf_hdr_combine() and make this optional?
4556
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4557
0
            }
4558
0
            if ( j>=ndst_ori ) {
4559
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4560
0
                if (res < 0) return NULL;
4561
0
                need_sync += res;
4562
0
            }
4563
0
        }
4564
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4565
0
        {
4566
            // NB: we are ignoring fields without ID
4567
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4568
0
            if ( j>=0 )
4569
0
            {
4570
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4571
0
                if ( !rec ) {
4572
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4573
0
                    if (res < 0) return NULL;
4574
0
                    need_sync += res;
4575
0
                }
4576
0
            }
4577
0
        }
4578
0
        else
4579
0
        {
4580
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4581
0
            assert( j>=0 ); // this should always be true for valid VCFs
4582
4583
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4584
0
            if ( !rec ) {
4585
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4586
0
                if (res < 0) return NULL;
4587
0
                need_sync += res;
4588
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4589
0
            {
4590
                // Check that both records are of the same type. The bcf_hdr_id2length
4591
                // macro cannot be used here because dst header is not synced yet.
4592
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4593
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4594
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4595
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4596
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4597
0
                {
4598
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4599
0
                        src->hrec[i]->vals[0]);
4600
0
                }
4601
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4602
0
                {
4603
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4604
0
                        src->hrec[i]->vals[0]);
4605
0
                }
4606
0
            }
4607
0
        }
4608
0
    }
4609
0
    if ( need_sync ) {
4610
0
        if (bcf_hdr_sync(dst) < 0) return NULL;
4611
0
    }
4612
0
    return dst;
4613
0
}
4614
4615
int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
4616
0
{
4617
0
    int i;
4618
0
    if ( line->errcode )
4619
0
    {
4620
0
        char errordescription[1024] = "";
4621
0
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)),  bcf_seqname_safe(src_hdr,line), line->pos+1);
4622
0
        exit(1);
4623
0
    }
4624
0
    if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
4625
0
    if ( !src_hdr->ntransl )  // called for the first time, see what needs translating
4626
0
    {
4627
0
        int dict;
4628
0
        for (dict=0; dict<2; dict++)    // BCF_DT_ID and BCF_DT_CTG
4629
0
        {
4630
0
            src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
4631
0
            for (i=0; i<src_hdr->n[dict]; i++)
4632
0
            {
4633
0
                if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
4634
0
                {
4635
0
                    src_hdr->transl[dict][i] = -1;
4636
0
                    continue;
4637
0
                }
4638
0
                src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
4639
0
                if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
4640
0
            }
4641
0
        }
4642
0
        if ( !src_hdr->ntransl )
4643
0
        {
4644
0
            free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
4645
0
            free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
4646
0
            src_hdr->ntransl = -1;
4647
0
        }
4648
0
        if ( src_hdr->ntransl==-1 ) return 0;
4649
0
    }
4650
0
    bcf_unpack(line,BCF_UN_ALL);
4651
4652
    // CHROM
4653
0
    if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
4654
4655
    // FILTER
4656
0
    for (i=0; i<line->d.n_flt; i++)
4657
0
    {
4658
0
        int src_id = line->d.flt[i];
4659
0
        if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
4660
0
            line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
4661
0
        line->d.shared_dirty |= BCF1_DIRTY_FLT;
4662
0
    }
4663
4664
    // INFO
4665
0
    for (i=0; i<line->n_info; i++)
4666
0
    {
4667
0
        int src_id = line->d.info[i].key;
4668
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
4669
0
        if ( dst_id<0 ) continue;
4670
0
        line->d.info[i].key = dst_id;
4671
0
        if ( !line->d.info[i].vptr ) continue;  // skip deleted
4672
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4673
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4674
0
        if ( src_size==dst_size )   // can overwrite
4675
0
        {
4676
0
            uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
4677
0
            if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
4678
0
            else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
4679
0
            else { *(uint32_t*)vptr = (uint32_t)dst_id; }
4680
0
        }
4681
0
        else    // must realloc
4682
0
        {
4683
0
            bcf_info_t *info = &line->d.info[i];
4684
0
            kstring_t str = {0,0,0};
4685
0
            bcf_enc_int1(&str, dst_id);
4686
0
            bcf_enc_size(&str, info->len,info->type);
4687
0
            uint32_t vptr_off = str.l;
4688
0
            kputsn((char*)info->vptr, info->vptr_len, &str);
4689
0
            if( info->vptr_free ) free(info->vptr - info->vptr_off);
4690
0
            info->vptr_off = vptr_off;
4691
0
            info->vptr = (uint8_t*)str.s + info->vptr_off;
4692
0
            info->vptr_free = 1;
4693
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
4694
0
        }
4695
0
    }
4696
4697
    // FORMAT
4698
0
    for (i=0; i<line->n_fmt; i++)
4699
0
    {
4700
0
        int src_id = line->d.fmt[i].id;
4701
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
4702
0
        if ( dst_id<0 ) continue;
4703
0
        line->d.fmt[i].id = dst_id;
4704
0
        if( !line->d.fmt[i].p ) continue;  // skip deleted
4705
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4706
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4707
0
        if ( src_size==dst_size )   // can overwrite
4708
0
        {
4709
0
            uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off;    // pointer to the vector size (4bits) and BT type (4bits)
4710
0
            if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
4711
0
            else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
4712
0
            else { i32_to_le(dst_id, p + 1); }
4713
0
        }
4714
0
        else    // must realloc
4715
0
        {
4716
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
4717
0
            kstring_t str = {0,0,0};
4718
0
            bcf_enc_int1(&str, dst_id);
4719
0
            bcf_enc_size(&str, fmt->n, fmt->type);
4720
0
            uint32_t p_off = str.l;
4721
0
            kputsn((char*)fmt->p, fmt->p_len, &str);
4722
0
            if( fmt->p_free ) free(fmt->p - fmt->p_off);
4723
0
            fmt->p_off = p_off;
4724
0
            fmt->p = (uint8_t*)str.s + fmt->p_off;
4725
0
            fmt->p_free = 1;
4726
0
            line->d.indiv_dirty = 1;
4727
0
        }
4728
0
    }
4729
0
    return 0;
4730
0
}
4731
4732
bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
4733
0
{
4734
0
    bcf_hdr_t *hout = bcf_hdr_init("r");
4735
0
    if (!hout) {
4736
0
        hts_log_error("Failed to allocate bcf header");
4737
0
        return NULL;
4738
0
    }
4739
0
    kstring_t htxt = {0,0,0};
4740
0
    if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
4741
0
        free(htxt.s);
4742
0
        return NULL;
4743
0
    }
4744
0
    if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
4745
0
        bcf_hdr_destroy(hout);
4746
0
        hout = NULL;
4747
0
    }
4748
0
    free(htxt.s);
4749
0
    return hout;
4750
0
}
4751
4752
bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
4753
0
{
4754
0
    void *names_hash = khash_str2int_init();
4755
0
    kstring_t htxt = {0,0,0};
4756
0
    kstring_t str = {0,0,0};
4757
0
    bcf_hdr_t *h = bcf_hdr_init("w");
4758
0
    int r = 0;
4759
0
    if (!h || !names_hash) {
4760
0
        hts_log_error("Failed to allocate bcf header");
4761
0
        goto err;
4762
0
    }
4763
0
    if (bcf_hdr_format(h0, 1, &htxt) < 0) {
4764
0
        hts_log_error("Failed to get header text");
4765
0
        goto err;
4766
0
    }
4767
0
    bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
4768
0
    int j;
4769
0
    for (j=0; j<n; j++) imap[j] = -1;
4770
0
    if ( bcf_hdr_nsamples(h0) > 0) {
4771
0
        char *p = find_chrom_header_line(htxt.s);
4772
0
        int i = 0, end = n? 8 : 7;
4773
0
        while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
4774
0
        if (i != end) {
4775
0
            hts_log_error("Wrong number of columns in header #CHROM line");
4776
0
            goto err;
4777
0
        }
4778
0
        r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
4779
0
        for (i = 0; i < n; ++i) {
4780
0
            if ( khash_str2int_has_key(names_hash,samples[i]) )
4781
0
            {
4782
0
                hts_log_error("Duplicate sample name \"%s\"", samples[i]);
4783
0
                goto err;
4784
0
            }
4785
0
            imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
4786
0
            if (imap[i] < 0) continue;
4787
0
            r |= kputc('\t', &str) < 0;
4788
0
            r |= kputs(samples[i], &str) < 0;
4789
0
            r |= khash_str2int_inc(names_hash,samples[i]) < 0;
4790
0
        }
4791
0
    } else r |= kputsn(htxt.s, htxt.l, &str) < 0;
4792
0
    while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
4793
0
    r |= kputc('\n',&str) < 0;
4794
0
    if (r) {
4795
0
        hts_log_error("%s", strerror(errno));
4796
0
        goto err;
4797
0
    }
4798
0
    if ( bcf_hdr_parse(h, str.s) < 0 ) {
4799
0
        bcf_hdr_destroy(h);
4800
0
        h = NULL;
4801
0
    }
4802
0
    free(str.s);
4803
0
    free(htxt.s);
4804
0
    khash_str2int_destroy(names_hash);
4805
0
    return h;
4806
4807
0
 err:
4808
0
    ks_free(&str);
4809
0
    ks_free(&htxt);
4810
0
    khash_str2int_destroy(names_hash);
4811
0
    bcf_hdr_destroy(h);
4812
0
    return NULL;
4813
0
}
4814
4815
int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
4816
0
{
4817
0
    if ( samples && !strcmp("-",samples) ) return 0;            // keep all samples
4818
4819
0
    int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
4820
0
    hdr->keep_samples = (uint8_t*) calloc(narr,1);
4821
0
    if (!hdr->keep_samples) return -1;
4822
4823
0
    hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
4824
0
    if ( !samples )
4825
0
    {
4826
        // exclude all samples
4827
0
        khint_t k;
4828
0
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict;
4829
0
        new_dict = kh_init(vdict);
4830
0
        if (!new_dict) return -1;
4831
4832
0
        bcf_hdr_nsamples(hdr) = 0;
4833
4834
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
4835
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
4836
0
        kh_destroy(vdict, d);
4837
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
4838
0
        if (bcf_hdr_sync(hdr) < 0) return -1;
4839
4840
0
        return 0;
4841
0
    }
4842
4843
0
    if ( samples[0]=='^' )
4844
0
        for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
4845
4846
0
    int idx, n, ret = 0;
4847
0
    char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
4848
0
    if ( !smpls ) return -1;
4849
0
    for (i=0; i<n; i++)
4850
0
    {
4851
0
        idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
4852
0
        if ( idx<0 )
4853
0
        {
4854
0
            if ( !ret ) ret = i+1;
4855
0
            continue;
4856
0
        }
4857
0
        assert( idx<bcf_hdr_nsamples(hdr) );
4858
0
        if (  samples[0]=='^' )
4859
0
            bit_array_clear(hdr->keep_samples, idx);
4860
0
        else
4861
0
            bit_array_set(hdr->keep_samples, idx);
4862
0
    }
4863
0
    for (i=0; i<n; i++) free(smpls[i]);
4864
0
    free(smpls);
4865
4866
0
    bcf_hdr_nsamples(hdr) = 0;
4867
0
    for (i=0; i<hdr->nsamples_ori; i++)
4868
0
        if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
4869
4870
0
    if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
4871
0
    else
4872
0
    {
4873
        // Make new list and dictionary with desired samples
4874
0
        char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
4875
0
        vdict_t *new_dict, *d;
4876
0
        int k, res;
4877
0
        if (!samples) return -1;
4878
4879
0
        new_dict = kh_init(vdict);
4880
0
        if (!new_dict) {
4881
0
            free(samples);
4882
0
            return -1;
4883
0
        }
4884
0
        idx = 0;
4885
0
        for (i=0; i<hdr->nsamples_ori; i++) {
4886
0
            if ( bit_array_test(hdr->keep_samples,i) ) {
4887
0
                samples[idx] = hdr->samples[i];
4888
0
                k = kh_put(vdict, new_dict, hdr->samples[i], &res);
4889
0
                if (res < 0) {
4890
0
                    free(samples);
4891
0
                    kh_destroy(vdict, new_dict);
4892
0
                    return -1;
4893
0
                }
4894
0
                kh_val(new_dict, k) = bcf_idinfo_def;
4895
0
                kh_val(new_dict, k).id = idx;
4896
0
                idx++;
4897
0
            }
4898
0
        }
4899
4900
        // Delete desired samples from old dictionary, so we don't free them
4901
0
        d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
4902
0
        for (i=0; i < idx; i++) {
4903
0
            int k = kh_get(vdict, d, samples[i]);
4904
0
            if (k < kh_end(d)) kh_del(vdict, d, k);
4905
0
        }
4906
4907
        // Free everything else
4908
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
4909
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
4910
0
        kh_destroy(vdict, d);
4911
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
4912
4913
0
        free(hdr->samples);
4914
0
        hdr->samples = samples;
4915
4916
0
        if (bcf_hdr_sync(hdr) < 0)
4917
0
            return -1;
4918
0
    }
4919
4920
0
    return ret;
4921
0
}
4922
4923
int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
4924
0
{
4925
0
    kstring_t ind;
4926
0
    ind.s = 0; ind.l = ind.m = 0;
4927
0
    if (n) {
4928
0
        bcf_fmt_t fmt[MAX_N_FMT];
4929
0
        int i, j;
4930
0
        uint8_t *ptr = (uint8_t*)v->indiv.s;
4931
0
        for (i = 0; i < v->n_fmt; ++i)
4932
0
            ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
4933
0
        for (i = 0; i < (int)v->n_fmt; ++i) {
4934
0
            bcf_fmt_t *f = &fmt[i];
4935
0
            bcf_enc_int1(&ind, f->id);
4936
0
            bcf_enc_size(&ind, f->n, f->type);
4937
0
            for (j = 0; j < n; ++j)
4938
0
                if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
4939
0
        }
4940
0
        for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
4941
0
        v->n_sample = i;
4942
0
    } else v->n_sample = 0;
4943
0
    if ( !v->n_sample ) v->n_fmt = 0;
4944
0
    free(v->indiv.s);
4945
0
    v->indiv = ind;
4946
0
    v->unpacked &= ~BCF_UN_FMT;    // only BCF is ready for output, VCF will need to unpack again
4947
0
    return 0;
4948
0
}
4949
4950
int bcf_is_snp(bcf1_t *v)
4951
0
{
4952
0
    int i;
4953
0
    bcf_unpack(v, BCF_UN_STR);
4954
0
    for (i = 0; i < v->n_allele; ++i)
4955
0
    {
4956
0
        if ( v->d.allele[i][1]==0 && v->d.allele[i][0]!='*' ) continue;
4957
4958
        // mpileup's <X> allele, see also below. This is not completely satisfactory,
4959
        // a general library is here narrowly tailored to fit samtools.
4960
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
4961
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='*' && v->d.allele[i][2]=='>' ) continue;
4962
4963
0
        break;
4964
0
    }
4965
0
    return i == v->n_allele;
4966
0
}
4967
4968
static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var)
4969
0
{
4970
0
    if ( *alt == '*' && !alt[1] ) { var->n = 0; var->type = VCF_OVERLAP; return; }  // overlapping variant
4971
4972
    // The most frequent case
4973
0
    if ( !ref[1] && !alt[1] )
4974
0
    {
4975
0
        if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
4976
0
        if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
4977
0
        var->n = 1; var->type = VCF_SNP; return;
4978
0
    }
4979
0
    if ( alt[0]=='<' )
4980
0
    {
4981
0
        if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
4982
0
        if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }
4983
0
        if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; }
4984
0
        var->type = VCF_OTHER;
4985
0
        return;
4986
0
    }
4987
4988
    // Catch "joined before" breakend case
4989
0
    if ( alt[0]==']' || alt[0] == '[' )
4990
0
    {
4991
0
        var->type = VCF_BND; return;
4992
0
    }
4993
4994
    // Iterate through alt characters that match the reference
4995
0
    const char *r = ref, *a = alt;
4996
0
    while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; }     // unfortunately, matching REF,ALT case is not guaranteed
4997
4998
0
    if ( *a && !*r )
4999
0
    {
5000
0
        if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend
5001
0
        while ( *a ) a++;
5002
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return;
5003
0
    }
5004
0
    else if ( *r && !*a )
5005
0
    {
5006
0
        while ( *r ) r++;
5007
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return;
5008
0
    }
5009
0
    else if ( !*r && !*a )
5010
0
    {
5011
0
        var->n = 0; var->type = VCF_REF; return;
5012
0
    }
5013
5014
0
    const char *re = r, *ae = a;
5015
0
    while ( re[1] ) re++;
5016
0
    while ( ae[1] ) ae++;
5017
0
    while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
5018
0
    if ( ae==a )
5019
0
    {
5020
0
        if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
5021
0
        var->n = -(re-r);
5022
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; }
5023
0
        var->type = VCF_OTHER; return;
5024
0
    }
5025
0
    else if ( re==r )
5026
0
    {
5027
0
        var->n = ae-a;
5028
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; }
5029
0
        var->type = VCF_OTHER; return;
5030
0
    }
5031
5032
0
    var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
5033
0
    var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
5034
5035
    // should do also complex events, SVs, etc...
5036
0
}
5037
5038
static int bcf_set_variant_types(bcf1_t *b)
5039
0
{
5040
0
    if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
5041
0
    bcf_dec_t *d = &b->d;
5042
0
    if ( d->n_var < b->n_allele )
5043
0
    {
5044
0
        bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*b->n_allele);
5045
0
        if (!new_var)
5046
0
            return -1;
5047
0
        d->var = new_var;
5048
0
        d->n_var = b->n_allele;
5049
0
    }
5050
0
    int i;
5051
0
    b->d.var_type = 0;
5052
0
    d->var[0].type = VCF_REF;
5053
0
    d->var[0].n    = 0;
5054
0
    for (i=1; i<b->n_allele; i++)
5055
0
    {
5056
0
        bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
5057
0
        b->d.var_type |= d->var[i].type;
5058
        //fprintf(stderr,"[set_variant_type] %d   %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
5059
0
    }
5060
0
    return 0;
5061
0
}
5062
5063
// bcf_get_variant_type/bcf_get_variant_types should only return the following,
5064
// to be compatible with callers that are not expecting newer values
5065
// like VCF_INS, VCF_DEL.  The full set is available from the newer
5066
// vcf_has_variant_type* interfaces.
5067
0
#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP)
5068
int bcf_get_variant_types(bcf1_t *rec)
5069
0
{
5070
0
    if ( rec->d.var_type==-1 ) {
5071
0
        if (bcf_set_variant_types(rec) != 0) {
5072
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5073
0
            exit(1); // Due to legacy API having no way to report failures
5074
0
        }
5075
0
    }
5076
0
    return rec->d.var_type & ORIG_VAR_TYPES;
5077
0
}
5078
5079
int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
5080
0
{
5081
0
    if ( rec->d.var_type==-1 ) {
5082
0
        if (bcf_set_variant_types(rec) != 0) {
5083
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5084
0
            exit(1); // Due to legacy API having no way to report failures
5085
0
        }
5086
0
    }
5087
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) {
5088
0
        hts_log_error("Requested allele outside valid range");
5089
0
        exit(1);
5090
0
    }
5091
0
    return rec->d.var[ith_allele].type & ORIG_VAR_TYPES;
5092
0
}
5093
#undef ORIG_VAR_TYPES
5094
5095
int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask)
5096
0
{
5097
0
    if ( rec->d.var_type==-1 ) {
5098
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5099
0
    }
5100
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1;
5101
0
    if (bitmask == VCF_REF) {  // VCF_REF is 0, so handled as a special case
5102
0
        return rec->d.var[ith_allele].type == VCF_REF;
5103
0
    }
5104
0
    return bitmask & rec->d.var[ith_allele].type;
5105
0
}
5106
5107
int bcf_variant_length(bcf1_t *rec, int ith_allele)
5108
0
{
5109
0
    if ( rec->d.var_type==-1 ) {
5110
0
        if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing;
5111
0
    }
5112
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing;
5113
0
    return rec->d.var[ith_allele].n;
5114
0
}
5115
5116
int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask,
5117
                          enum bcf_variant_match mode)
5118
0
{
5119
0
    if ( rec->d.var_type==-1 ) {
5120
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5121
0
    }
5122
0
    uint32_t type = rec->d.var_type;
5123
0
    if ( mode==bcf_match_overlap ) return bitmask & type;
5124
5125
    // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may
5126
    // ask for say `VCF_INS` or `VCF_INDEL` only
5127
0
    if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL;
5128
0
    else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL);
5129
5130
0
    if ( mode==bcf_match_subset )
5131
0
    {
5132
0
        if ( ~bitmask & type ) return 0;
5133
0
        else return bitmask & type;
5134
0
    }
5135
    // mode == bcf_match_exact
5136
0
    return type==bitmask ? type : 0;
5137
0
}
5138
5139
int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5140
0
{
5141
0
    static int negative_rlen_warned = 0;
5142
0
    int is_end_tag;
5143
5144
    // Is the field already present?
5145
0
    int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5146
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1;    // No such INFO field in the header
5147
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5148
5149
0
    is_end_tag = strcmp(key, "END") == 0;
5150
5151
0
    for (i=0; i<line->n_info; i++)
5152
0
        if ( inf_id==line->d.info[i].key ) break;
5153
0
    bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
5154
5155
0
    if ( !n || (type==BCF_HT_STR && !values) )
5156
0
    {
5157
0
        if ( n==0 && is_end_tag )
5158
0
            line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
5159
0
        if ( inf )
5160
0
        {
5161
            // Mark the tag for removal, free existing memory if necessary
5162
0
            if ( inf->vptr_free )
5163
0
            {
5164
0
                free(inf->vptr - inf->vptr_off);
5165
0
                inf->vptr_free = 0;
5166
0
            }
5167
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5168
0
            inf->vptr = NULL;
5169
0
            inf->vptr_off = inf->vptr_len = 0;
5170
0
        }
5171
0
        return 0;
5172
0
    }
5173
5174
0
    if (is_end_tag)
5175
0
    {
5176
0
        if (n != 1)
5177
0
        {
5178
0
            hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5179
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5180
0
            return -1;
5181
0
        }
5182
0
        if (type != BCF_HT_INT && type != BCF_HT_LONG)
5183
0
        {
5184
0
            hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5185
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5186
0
            return -1;
5187
0
        }
5188
0
    }
5189
5190
    // Encode the values and determine the size required to accommodate the values
5191
0
    kstring_t str = {0,0,0};
5192
0
    bcf_enc_int1(&str, inf_id);
5193
0
    if ( type==BCF_HT_INT )
5194
0
        bcf_enc_vint(&str, n, (int32_t*)values, -1);
5195
0
    else if ( type==BCF_HT_REAL )
5196
0
        bcf_enc_vfloat(&str, n, (float*)values);
5197
0
    else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
5198
0
    {
5199
0
        if ( values==NULL )
5200
0
            bcf_enc_size(&str, 0, BCF_BT_NULL);
5201
0
        else
5202
0
            bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
5203
0
    }
5204
#ifdef VCF_ALLOW_INT64
5205
    else if ( type==BCF_HT_LONG )
5206
    {
5207
        if (n != 1) {
5208
            hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5209
            abort();
5210
        }
5211
        bcf_enc_long1(&str, *(int64_t *) values);
5212
    }
5213
#endif
5214
0
    else
5215
0
    {
5216
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5217
0
        abort();
5218
0
    }
5219
5220
    // Is the INFO tag already present
5221
0
    if ( inf )
5222
0
    {
5223
        // Is it big enough to accommodate new block?
5224
0
        if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off )
5225
0
        {
5226
0
            if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
5227
0
            uint8_t *ptr = inf->vptr - inf->vptr_off;
5228
0
            memcpy(ptr, str.s, str.l);
5229
0
            free(str.s);
5230
0
            int vptr_free = inf->vptr_free;
5231
0
            bcf_unpack_info_core1(ptr, inf);
5232
0
            inf->vptr_free = vptr_free;
5233
0
        }
5234
0
        else
5235
0
        {
5236
0
            if ( inf->vptr_free )
5237
0
                free(inf->vptr - inf->vptr_off);
5238
0
            bcf_unpack_info_core1((uint8_t*)str.s, inf);
5239
0
            inf->vptr_free = 1;
5240
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5241
0
        }
5242
0
    }
5243
0
    else
5244
0
    {
5245
        // The tag is not present, create new one
5246
0
        line->n_info++;
5247
0
        hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
5248
0
        inf = &line->d.info[line->n_info-1];
5249
0
        bcf_unpack_info_core1((uint8_t*)str.s, inf);
5250
0
        inf->vptr_free = 1;
5251
0
        line->d.shared_dirty |= BCF1_DIRTY_INF;
5252
0
    }
5253
0
    line->unpacked |= BCF_UN_INFO;
5254
5255
0
   if ( n==1 && is_end_tag) {
5256
0
        hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values;
5257
0
        if ( (type == BCF_HT_INT && end!=bcf_int32_missing) || (type == BCF_HT_LONG && end!=bcf_int64_missing) )
5258
0
        {
5259
0
            if ( end <= line->pos )
5260
0
            {
5261
0
                if ( !negative_rlen_warned )
5262
0
                {
5263
0
                    hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1);
5264
0
                    negative_rlen_warned = 1;
5265
0
                }
5266
0
                line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
5267
0
            }
5268
0
            else
5269
0
                line->rlen = end - line->pos;
5270
0
        }
5271
0
    }
5272
0
    return 0;
5273
0
}
5274
5275
int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
5276
0
{
5277
0
    if ( !n )
5278
0
        return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
5279
5280
0
    int i, max_len = 0;
5281
0
    for (i=0; i<n; i++)
5282
0
    {
5283
0
        int len = strlen(values[i]);
5284
0
        if ( len > max_len ) max_len = len;
5285
0
    }
5286
0
    char *out = (char*) malloc(max_len*n);
5287
0
    if ( !out ) return -2;
5288
0
    for (i=0; i<n; i++)
5289
0
    {
5290
0
        char *dst = out+i*max_len;
5291
0
        const char *src = values[i];
5292
0
        int j = 0;
5293
0
        while ( src[j] ) { dst[j] = src[j]; j++; }
5294
0
        for (; j<max_len; j++) dst[j] = 0;
5295
0
    }
5296
0
    int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
5297
0
    free(out);
5298
0
    return ret;
5299
0
}
5300
5301
int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5302
0
{
5303
    // Is the field already present?
5304
0
    int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5305
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
5306
0
    {
5307
0
        if ( !n ) return 0;
5308
0
        return -1;  // the key not present in the header
5309
0
    }
5310
5311
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5312
5313
0
    for (i=0; i<line->n_fmt; i++)
5314
0
        if ( line->d.fmt[i].id==fmt_id ) break;
5315
0
    bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
5316
5317
0
    if ( !n )
5318
0
    {
5319
0
        if ( fmt )
5320
0
        {
5321
            // Mark the tag for removal, free existing memory if necessary
5322
0
            if ( fmt->p_free )
5323
0
            {
5324
0
                free(fmt->p - fmt->p_off);
5325
0
                fmt->p_free = 0;
5326
0
            }
5327
0
            line->d.indiv_dirty = 1;
5328
0
            fmt->p = NULL;
5329
0
        }
5330
0
        return 0;
5331
0
    }
5332
5333
0
    line->n_sample = bcf_hdr_nsamples(hdr);
5334
0
    int nps = n / line->n_sample;  // number of values per sample
5335
0
    assert( nps && nps*line->n_sample==n );     // must be divisible by n_sample
5336
5337
    // Encode the values and determine the size required to accommodate the values
5338
0
    kstring_t str = {0,0,0};
5339
0
    bcf_enc_int1(&str, fmt_id);
5340
0
    assert(values != NULL);
5341
0
    if ( type==BCF_HT_INT )
5342
0
        bcf_enc_vint(&str, n, (int32_t*)values, nps);
5343
0
    else if ( type==BCF_HT_REAL )
5344
0
    {
5345
0
        bcf_enc_size(&str, nps, BCF_BT_FLOAT);
5346
0
        serialize_float_array(&str, nps*line->n_sample, (float *) values);
5347
0
    }
5348
0
    else if ( type==BCF_HT_STR )
5349
0
    {
5350
0
        bcf_enc_size(&str, nps, BCF_BT_CHAR);
5351
0
        kputsn((char*)values, nps*line->n_sample, &str);
5352
0
    }
5353
0
    else
5354
0
    {
5355
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5356
0
        abort();
5357
0
    }
5358
5359
0
    if ( !fmt )
5360
0
    {
5361
        // Not present, new format field
5362
0
        line->n_fmt++;
5363
0
        hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
5364
5365
        // Special case: VCF specification requires that GT is always first
5366
0
        if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
5367
0
        {
5368
0
            for (i=line->n_fmt-1; i>0; i--)
5369
0
                line->d.fmt[i] = line->d.fmt[i-1];
5370
0
            fmt = &line->d.fmt[0];
5371
0
        }
5372
0
        else
5373
0
            fmt = &line->d.fmt[line->n_fmt-1];
5374
0
        bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5375
0
        line->d.indiv_dirty = 1;
5376
0
        fmt->p_free = 1;
5377
0
    }
5378
0
    else
5379
0
    {
5380
        // The tag is already present, check if it is big enough to accommodate the new block
5381
0
        if ( fmt->p && str.l <= fmt->p_len + fmt->p_off )
5382
0
        {
5383
            // good, the block is big enough
5384
0
            if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
5385
0
            uint8_t *ptr = fmt->p - fmt->p_off;
5386
0
            memcpy(ptr, str.s, str.l);
5387
0
            free(str.s);
5388
0
            int p_free = fmt->p_free;
5389
0
            bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
5390
0
            fmt->p_free = p_free;
5391
0
        }
5392
0
        else
5393
0
        {
5394
0
            if ( fmt->p_free )
5395
0
                free(fmt->p - fmt->p_off);
5396
0
            bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5397
0
            fmt->p_free = 1;
5398
0
            line->d.indiv_dirty = 1;
5399
0
        }
5400
0
    }
5401
0
    line->unpacked |= BCF_UN_FMT;
5402
0
    return 0;
5403
0
}
5404
5405
5406
int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
5407
0
{
5408
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5409
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5410
0
    line->d.n_flt = n;
5411
0
    if ( !n ) return 0;
5412
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5413
0
    int i;
5414
0
    for (i=0; i<n; i++)
5415
0
        line->d.flt[i] = flt_ids[i];
5416
0
    return 0;
5417
0
}
5418
5419
int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
5420
0
{
5421
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5422
0
    int i;
5423
0
    for (i=0; i<line->d.n_flt; i++)
5424
0
        if ( flt_id==line->d.flt[i] ) break;
5425
0
    if ( i<line->d.n_flt ) return 0;    // this filter is already set
5426
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5427
0
    if ( flt_id==0 )    // set to PASS
5428
0
        line->d.n_flt = 1;
5429
0
    else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
5430
0
        line->d.n_flt = 1;
5431
0
    else
5432
0
        line->d.n_flt++;
5433
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5434
0
    line->d.flt[line->d.n_flt-1] = flt_id;
5435
0
    return 1;
5436
0
}
5437
int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
5438
0
{
5439
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5440
0
    int i;
5441
0
    for (i=0; i<line->d.n_flt; i++)
5442
0
        if ( flt_id==line->d.flt[i] ) break;
5443
0
    if ( i==line->d.n_flt ) return 0;   // the filter is not present
5444
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5445
0
    if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
5446
0
    line->d.n_flt--;
5447
0
    if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
5448
0
    return 0;
5449
0
}
5450
5451
int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
5452
0
{
5453
0
    if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
5454
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
5455
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1;  // not defined in the header
5456
5457
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5458
0
    if ( id==0 && !line->d.n_flt) return 1; // PASS
5459
5460
0
    int i;
5461
0
    for (i=0; i<line->d.n_flt; i++)
5462
0
        if ( line->d.flt[i]==id ) return 1;
5463
0
    return 0;
5464
0
}
5465
5466
static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
5467
0
{
5468
0
    line->d.shared_dirty |= BCF1_DIRTY_ALS;
5469
5470
0
    line->n_allele = nals;
5471
0
    hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
5472
5473
0
    char *als = line->d.als;
5474
0
    int n = 0;
5475
0
    while (n<nals)
5476
0
    {
5477
0
        line->d.allele[n] = als;
5478
0
        while ( *als ) als++;
5479
0
        als++;
5480
0
        n++;
5481
0
    }
5482
5483
    // Update REF length. Note that END is 1-based while line->pos 0-based
5484
0
    bcf_info_t *end_info = bcf_get_info(hdr,line,"END");
5485
0
    if ( end_info )
5486
0
    {
5487
0
        if ( end_info->type==BCF_HT_INT && end_info->v1.i==bcf_int32_missing ) end_info = NULL;
5488
0
        else if ( end_info->type==BCF_HT_LONG && end_info->v1.i==bcf_int64_missing ) end_info = NULL;
5489
0
    }
5490
0
    if ( end_info && end_info->v1.i > line->pos )
5491
0
        line->rlen = end_info->v1.i - line->pos;
5492
0
    else if ( nals > 0 )
5493
0
        line->rlen = strlen(line->d.allele[0]);
5494
0
    else
5495
0
        line->rlen = 0;
5496
5497
0
    return 0;
5498
0
}
5499
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
5500
0
{
5501
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5502
0
    char *free_old = NULL;
5503
0
    char buffer[256];
5504
0
    size_t used = 0;
5505
5506
    // The pointers in alleles may point into the existing line->d.als memory,
5507
    // so care needs to be taken not to clobber them while updating.  Usually
5508
    // they will be short so we can copy through an intermediate buffer.
5509
    // If they're longer, or won't fit in the existing allocation we
5510
    // can allocate a new buffer to write into.  Note that in either case
5511
    // pointers to line->d.als memory in alleles may not be valid when we've
5512
    // finished.
5513
0
    int i;
5514
0
    size_t avail = line->d.m_als < sizeof(buffer) ? line->d.m_als : sizeof(buffer);
5515
0
    for (i=0; i<nals; i++) {
5516
0
        size_t sz = strlen(alleles[i]) + 1;
5517
0
        if (avail - used < sz)
5518
0
            break;
5519
0
        memcpy(buffer + used, alleles[i], sz);
5520
0
        used += sz;
5521
0
    }
5522
5523
    // Did we miss anything?
5524
0
    if (i < nals) {
5525
0
        int j;
5526
0
        size_t needed = used;
5527
0
        char *new_als;
5528
0
        for (j = i; j < nals; j++)
5529
0
            needed += strlen(alleles[j]) + 1;
5530
0
        if (needed < line->d.m_als) // Don't shrink the buffer
5531
0
            needed = line->d.m_als;
5532
0
        if (needed > INT_MAX) {
5533
0
            hts_log_error("REF + alleles too long to fit in a BCF record");
5534
0
            return -1;
5535
0
        }
5536
0
        new_als = malloc(needed);
5537
0
        if (!new_als)
5538
0
            return -1;
5539
0
        free_old = line->d.als;
5540
0
        line->d.als = new_als;
5541
0
        line->d.m_als = needed;
5542
0
    }
5543
5544
    // Copy from the temp buffer to the destination
5545
0
    if (used) {
5546
0
        assert(used <= line->d.m_als);
5547
0
        memcpy(line->d.als, buffer, used);
5548
0
    }
5549
5550
    // Add in any remaining entries - if this happens we will always be
5551
    // writing to a newly-allocated buffer.
5552
0
    for (; i < nals; i++) {
5553
0
        size_t sz = strlen(alleles[i]) + 1;
5554
0
        memcpy(line->d.als + used, alleles[i], sz);
5555
0
        used += sz;
5556
0
    }
5557
5558
0
    if (free_old)
5559
0
        free(free_old);
5560
0
    return _bcf1_sync_alleles(hdr,line,nals);
5561
0
}
5562
5563
int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
5564
0
{
5565
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5566
0
    kstring_t tmp;
5567
0
    tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
5568
0
    kputs(alleles_string, &tmp);
5569
0
    line->d.als = tmp.s; line->d.m_als = tmp.m;
5570
5571
0
    int nals = 1;
5572
0
    char *t = line->d.als;
5573
0
    while (*t)
5574
0
    {
5575
0
        if ( *t==',' ) { *t = 0; nals++; }
5576
0
        t++;
5577
0
    }
5578
0
    return _bcf1_sync_alleles(hdr, line, nals);
5579
0
}
5580
5581
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
5582
0
{
5583
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5584
0
    kstring_t tmp;
5585
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
5586
0
    if ( id )
5587
0
        kputs(id, &tmp);
5588
0
    else
5589
0
        kputs(".", &tmp);
5590
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
5591
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
5592
0
    return 0;
5593
0
}
5594
5595
int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
5596
0
{
5597
0
    if ( !id ) return 0;
5598
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5599
5600
0
    kstring_t tmp;
5601
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
5602
5603
0
    int len = strlen(id);
5604
0
    char *dst = line->d.id;
5605
0
    while ( *dst && (dst=strstr(dst,id)) )
5606
0
    {
5607
0
        if ( dst[len]!=0 && dst[len]!=';' ) dst++;              // a prefix, not a match
5608
0
        else if ( dst==line->d.id || dst[-1]==';' ) return 0;   // already present
5609
0
        dst++;  // a suffix, not a match
5610
0
    }
5611
0
    if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
5612
0
    {
5613
0
        tmp.l = strlen(line->d.id);
5614
0
        kputc(';',&tmp);
5615
0
    }
5616
0
    kputs(id,&tmp);
5617
5618
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
5619
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
5620
0
    return 0;
5621
5622
0
}
5623
5624
bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
5625
0
{
5626
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
5627
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL;   // no such FMT field in the header
5628
0
    return bcf_get_fmt_id(line, id);
5629
0
}
5630
5631
bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
5632
0
{
5633
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
5634
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL;   // no such INFO field in the header
5635
0
    return bcf_get_info_id(line, id);
5636
0
}
5637
5638
bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
5639
0
{
5640
0
    int i;
5641
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5642
0
    for (i=0; i<line->n_fmt; i++)
5643
0
    {
5644
0
        if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
5645
0
    }
5646
0
    return NULL;
5647
0
}
5648
5649
bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
5650
0
{
5651
0
    int i;
5652
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5653
0
    for (i=0; i<line->n_info; i++)
5654
0
    {
5655
0
        if ( line->d.info[i].key==id ) return &line->d.info[i];
5656
0
    }
5657
0
    return NULL;
5658
0
}
5659
5660
5661
int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
5662
0
{
5663
0
    int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
5664
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1;    // no such INFO field in the header
5665
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2;     // expected different type
5666
5667
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5668
5669
0
    for (i=0; i<line->n_info; i++)
5670
0
        if ( line->d.info[i].key==tag_id ) break;
5671
0
    if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3;       // the tag is not present in this record
5672
0
    if ( type==BCF_HT_FLAG ) return 1;
5673
5674
0
    bcf_info_t *info = &line->d.info[i];
5675
0
    if ( !info->vptr ) return -3;           // the tag was marked for removal
5676
0
    if ( type==BCF_HT_STR )
5677
0
    {
5678
0
        if ( *ndst < info->len+1 )
5679
0
        {
5680
0
            *ndst = info->len + 1;
5681
0
            *dst  = realloc(*dst, *ndst);
5682
0
        }
5683
0
        memcpy(*dst,info->vptr,info->len);
5684
0
        ((uint8_t*)*dst)[info->len] = 0;
5685
0
        return info->len;
5686
0
    }
5687
5688
    // Make sure the buffer is big enough
5689
0
    int size1;
5690
0
    switch (type) {
5691
0
        case BCF_HT_INT:  size1 = sizeof(int32_t); break;
5692
0
        case BCF_HT_LONG: size1 = sizeof(int64_t); break;
5693
0
        case BCF_HT_REAL: size1 = sizeof(float); break;
5694
0
        default:
5695
0
            hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5696
0
            return -2;
5697
0
    }
5698
0
    if ( *ndst < info->len )
5699
0
    {
5700
0
        *ndst = info->len;
5701
0
        *dst  = realloc(*dst, *ndst * size1);
5702
0
    }
5703
5704
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \
5705
0
        out_type_t *tmp = (out_type_t *) *dst; \
5706
0
        int j; \
5707
0
        for (j=0; j<info->len; j++) \
5708
0
        { \
5709
0
            type_t p = convert(info->vptr + j * sizeof(type_t)); \
5710
0
            if ( is_vector_end ) break; \
5711
0
            if ( is_missing ) set_missing; \
5712
0
            else set_regular; \
5713
0
            tmp++; \
5714
0
        } \
5715
0
        ret = j; \
5716
0
    } while (0)
5717
0
    switch (info->type) {
5718
0
        case BCF_BT_INT8:
5719
0
            if (type == BCF_HT_LONG) {
5720
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int64_missing, *tmp=p, int64_t);
5721
0
            } else {
5722
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=p, int32_t);
5723
0
            }
5724
0
            break;
5725
0
        case BCF_BT_INT16:
5726
0
            if (type == BCF_HT_LONG) {
5727
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
5728
0
            } else {
5729
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
5730
0
            }
5731
0
            break;
5732
0
        case BCF_BT_INT32:
5733
0
            if (type == BCF_HT_LONG) {
5734
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break;
5735
0
            } else {
5736
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
5737
0
            }
5738
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
5739
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
5740
0
    }
5741
0
    #undef BRANCH
5742
0
    return ret;  // set by BRANCH
5743
0
}
5744
5745
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
5746
0
{
5747
0
    int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
5748
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
5749
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;     // expected different type
5750
5751
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5752
5753
0
    for (i=0; i<line->n_fmt; i++)
5754
0
        if ( line->d.fmt[i].id==tag_id ) break;
5755
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
5756
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
5757
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
5758
5759
0
    int nsmpl = bcf_hdr_nsamples(hdr);
5760
0
    if ( !*dst )
5761
0
    {
5762
0
        *dst = (char**) malloc(sizeof(char*)*nsmpl);
5763
0
        if ( !*dst ) return -4;     // could not alloc
5764
0
        (*dst)[0] = NULL;
5765
0
    }
5766
0
    int n = (fmt->n+1)*nsmpl;
5767
0
    if ( *ndst < n )
5768
0
    {
5769
0
        (*dst)[0] = realloc((*dst)[0], n);
5770
0
        if ( !(*dst)[0] ) return -4;    // could not alloc
5771
0
        *ndst = n;
5772
0
    }
5773
0
    for (i=0; i<nsmpl; i++)
5774
0
    {
5775
0
        uint8_t *src = fmt->p + i*fmt->n;
5776
0
        uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
5777
0
        memcpy(tmp,src,fmt->n);
5778
0
        tmp[fmt->n] = 0;
5779
0
        (*dst)[i] = (char*) tmp;
5780
0
    }
5781
0
    return n;
5782
0
}
5783
5784
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
5785
0
{
5786
0
    int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
5787
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
5788
0
    if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
5789
0
    {
5790
        // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
5791
0
        if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
5792
0
    }
5793
0
    else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2;     // expected different type
5794
5795
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5796
5797
0
    for (i=0; i<line->n_fmt; i++)
5798
0
        if ( line->d.fmt[i].id==tag_id ) break;
5799
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
5800
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
5801
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
5802
5803
0
    if ( type==BCF_HT_STR )
5804
0
    {
5805
0
        int n = fmt->n*bcf_hdr_nsamples(hdr);
5806
0
        if ( *ndst < n )
5807
0
        {
5808
0
            *dst  = realloc(*dst, n);
5809
0
            if ( !*dst ) return -4;     // could not alloc
5810
0
            *ndst = n;
5811
0
        }
5812
0
        memcpy(*dst,fmt->p,n);
5813
0
        return n;
5814
0
    }
5815
5816
    // Make sure the buffer is big enough
5817
0
    int nsmpl = bcf_hdr_nsamples(hdr);
5818
0
    int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
5819
0
    if ( *ndst < fmt->n*nsmpl )
5820
0
    {
5821
0
        *ndst = fmt->n*nsmpl;
5822
0
        *dst  = realloc(*dst, *ndst*size1);
5823
0
        if ( !*dst ) return -4;     // could not alloc
5824
0
    }
5825
5826
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
5827
0
        out_type_t *tmp = (out_type_t *) *dst; \
5828
0
        uint8_t *fmt_p = fmt->p; \
5829
0
        for (i=0; i<nsmpl; i++) \
5830
0
        { \
5831
0
            for (j=0; j<fmt->n; j++) \
5832
0
            { \
5833
0
                type_t p = convert(fmt_p + j * sizeof(type_t)); \
5834
0
                if ( is_missing ) set_missing; \
5835
0
                else if ( is_vector_end ) { set_vector_end; break; } \
5836
0
                else set_regular; \
5837
0
                tmp++; \
5838
0
            } \
5839
0
            for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
5840
0
            fmt_p += fmt->size; \
5841
0
        } \
5842
0
    }
5843
0
    switch (fmt->type) {
5844
0
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
5845
0
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
5846
0
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
5847
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
5848
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1);
5849
0
    }
5850
0
    #undef BRANCH
5851
0
    return nsmpl*fmt->n;
5852
0
}
5853
5854
//error description structure definition
5855
typedef struct err_desc {
5856
    int  errorcode;
5857
    const char *description;
5858
}err_desc;
5859
5860
// error descriptions
5861
static const err_desc errdesc_bcf[] = {
5862
    { BCF_ERR_CTG_UNDEF, "Contig not defined in header"},
5863
    { BCF_ERR_TAG_UNDEF, "Tag not defined in header" },
5864
    { BCF_ERR_NCOLS, "Incorrect number of columns" },
5865
    { BCF_ERR_LIMITS, "Limits reached" },
5866
    { BCF_ERR_CHAR, "Invalid character" },
5867
    { BCF_ERR_CTG_INVALID, "Invalid contig" },
5868
    { BCF_ERR_TAG_INVALID, "Invalid tag" },
5869
};
5870
5871
/// append given description to buffer based on available size and add ... when not enough space
5872
    /** @param buffer       buffer to which description to be appended
5873
        @param offset       offset at which to be appended
5874
        @param maxbuffer    maximum size of the buffer
5875
        @param description  the description to be appended
5876
on failure returns -1 - when buffer is not big enough; returns -1 on invalid params and on too small buffer which are improbable due to validation at caller site
5877
on success returns 0
5878
    */
5879
3.91k
static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) {
5880
5881
3.91k
    if (!description || !buffer || !offset || (maxbuffer < 4))
5882
0
        return -1;
5883
5884
3.91k
    size_t rembuffer = maxbuffer - *offset;
5885
3.91k
    if (rembuffer > (strlen(description) + (rembuffer == maxbuffer ? 0 : 1))) {    //add description with optionally required ','
5886
3.91k
        *offset += snprintf(buffer + *offset, rembuffer, "%s%s", (rembuffer == maxbuffer)? "": ",", description);
5887
3.91k
    } else {    //not enough space for description, put ...
5888
0
        size_t tmppos = (rembuffer <= 4) ? maxbuffer - 4 : *offset;
5889
0
        snprintf(buffer + tmppos, 4, "...");    //ignore offset update
5890
0
        return -1;
5891
0
    }
5892
3.91k
    return 0;
5893
3.91k
}
5894
5895
//get description for given error code. return NULL on error
5896
1.96k
const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) {
5897
1.96k
    size_t usedup = 0;
5898
1.96k
    int ret = 0;
5899
1.96k
    int idx;
5900
5901
1.96k
    if (!buffer || maxbuffer < 4)
5902
0
        return NULL;           //invalid / insufficient buffer
5903
5904
1.96k
    if (!errorcode) {
5905
0
        buffer[0] = '\0';      //no error, set null
5906
0
        return buffer;
5907
0
    }
5908
5909
15.7k
    for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) {
5910
13.7k
        if (errorcode & errdesc_bcf[idx].errorcode) {    //error is set, add description
5911
3.91k
            ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description);
5912
3.91k
            if (ret < 0)
5913
0
                break;         //not enough space, ... added, no need to continue
5914
5915
3.91k
            errorcode &= ~errdesc_bcf[idx].errorcode;    //reset the error
5916
3.91k
        }
5917
13.7k
    }
5918
5919
1.96k
    if (errorcode && (ret >= 0))  {     //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§
5920
0
        add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error");
5921
0
    }
5922
1.96k
    return buffer;
5923
1.96k
}
5924