Coverage Report

Created: 2026-05-30 06:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/vcf.c
Line
Count
Source
1
/*  vcf.c -- VCF/BCF API functions.
2
3
    Copyright (C) 2012, 2013 Broad Institute.
4
    Copyright (C) 2012-2025 Genome Research Ltd.
5
    Portions copyright (C) 2014 Intel Corporation.
6
7
    Author: Heng Li <lh3@sanger.ac.uk>
8
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
of this software and associated documentation files (the "Software"), to deal
11
in the Software without restriction, including without limitation the rights
12
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
copies of the Software, and to permit persons to whom the Software is
14
furnished to do so, subject to the following conditions:
15
16
The above copyright notice and this permission notice shall be included in
17
all copies or substantial portions of the Software.
18
19
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
DEALINGS IN THE SOFTWARE.  */
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdio.h>
31
#include <assert.h>
32
#include <string.h>
33
#include <strings.h>
34
#include <stdlib.h>
35
#include <limits.h>
36
#include <stdint.h>
37
#include <inttypes.h>
38
#include <errno.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
#include "htslib/vcf.h"
45
#include "htslib/bgzf.h"
46
#include "htslib/tbx.h"
47
#include "htslib/hfile.h"
48
#include "hts_internal.h"
49
#include "htslib/hts_alloc.h"
50
#include "htslib/hts_endian.h"
51
#include "htslib/khash_str2int.h"
52
#include "htslib/kstring.h"
53
#include "htslib/sam.h"
54
#include "htslib/khash.h"
55
#include "bgzf_internal.h"
56
57
#if 0
58
// This helps on Intel a bit, often 6-7% faster VCF parsing.
59
// Conversely sometimes harms AMD Zen4 as ~9% slower.
60
// Possibly related to IPC differences.  However for now it's just a
61
// curiousity we ignore and stick with the simpler code.
62
//
63
// Left here as a hint for future explorers.
64
static inline int xstreq(const char *a, const char *b) {
65
    while (*a && *a == *b)
66
        a++, b++;
67
    return *a == *b;
68
}
69
70
#define KHASH_MAP_INIT_XSTR(name, khval_t) \
71
  KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq)
72
73
KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t)
74
#else
75
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
76
#endif
77
78
typedef khash_t(vdict) vdict_t;
79
80
KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
81
typedef khash_t(hdict) hdict_t;
82
83
84
#include "htslib/kseq.h"
85
HTSLIB_EXPORT
86
uint32_t bcf_float_missing    = 0x7F800001;
87
88
HTSLIB_EXPORT
89
uint32_t bcf_float_vector_end = 0x7F800002;
90
91
HTSLIB_EXPORT
92
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
93
94
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
95
96
/*
97
    Partial support for 64-bit POS and Number=1 INFO tags.
98
    Notes:
99
     - the support for 64-bit values is motivated by POS and INFO/END for large genomes
100
     - the use of 64-bit values does not conform to the specification
101
     - cannot output 64-bit BCF and if it does, it is not compatible with anything
102
     - experimental, use at your risk
103
*/
104
#ifdef VCF_ALLOW_INT64
105
    #define BCF_MAX_BT_INT64 (0x7fffffffffffffff)       /* INT64_MAX, for internal use only */
106
    #define BCF_MIN_BT_INT64 -9223372036854775800LL     /* INT64_MIN + 8, for internal use only */
107
#endif
108
109
817
#define BCF_IS_64BIT (1<<30)
110
111
112
// Opaque structure with auxilary data which allows to extend bcf_hdr_t without breaking ABI.
113
// Note that this preserving API and ABI requires that the first element is vdict_t struct
114
// rather than a pointer, as user programs may (and in some cases do) access the dictionary
115
// directly as (vdict_t*)hdr->dict.
116
typedef struct
117
{
118
    vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
119
    hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
120
    size_t *key_len;// length of h->id[BCF_DT_ID] strings
121
    int version;    //cached version
122
    uint32_t ref_count; // reference count, low bit indicates bcf_hdr_destroy() has been called
123
}
124
bcf_hdr_aux_t;
125
126
static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr)
127
320k
{
128
320k
    return (bcf_hdr_aux_t *)hdr->dict[0];
129
320k
}
130
131
//version macros
132
183k
#define VCF_DEF 4002000
133
43.2k
#define VCF44   4004000
134
37.4k
#define VCF45   4005000
135
136
#define VCF_MAJOR_VER(x) ( (x) / 10000 / 100 )
137
#define VCF_MINOR_VER(x) ( ((x) % 1000000) / 1000 )
138
139
/**
140
 *  bcf_get_version - get the version as int
141
 *  @param hdr   - bcf header, to get version
142
 *  @param verstr- version string, which is already available
143
 *  Returns version on success and default version on failure
144
 *  version = major * 100 * 10000 + minor * 1000
145
 */
146
static int bcf_get_version(const bcf_hdr_t *hdr, const char *verstr)
147
22.0k
{
148
22.0k
    const char *version = NULL, vcf[] = "VCFv";
149
22.0k
    char *major = NULL, *minor = NULL;
150
22.0k
    int ver = -1;
151
22.0k
    long tmp = 0;
152
22.0k
    bcf_hdr_aux_t *aux = NULL;
153
154
22.0k
    if (!hdr && !verstr) {  //invalid input
155
0
        goto fail;
156
0
    }
157
158
22.0k
    if (hdr) {
159
15.6k
        if ((aux = get_hdr_aux(hdr)) && aux->version != 0) {    //use cached version
160
14.5k
            return aux->version;
161
14.5k
        }
162
        //get from header
163
1.08k
        version = bcf_hdr_get_version(hdr);
164
6.38k
    } else {
165
        //get from version string
166
6.38k
        version = verstr;
167
6.38k
    }
168
7.47k
    if (!(major = strstr(version, vcf))) {  //bad format
169
5.63k
        goto fail;
170
5.63k
    }
171
1.84k
    major += sizeof(vcf) - 1;
172
1.84k
    if (!(minor = strchr(major, '.'))) {    //bad format
173
200
        goto fail;
174
200
    }
175
1.64k
    tmp = strtol(major, NULL, 10);
176
1.64k
    if ((!tmp && errno == EINVAL) ||
177
1.57k
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
178
78
        goto fail;
179
78
    }
180
1.56k
    ver = tmp * 100 * 10000;
181
1.56k
    tmp = strtol(++minor, NULL, 10);
182
1.56k
    if ((!tmp && errno == EINVAL) ||
183
1.48k
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
184
350
        goto fail;
185
350
    }
186
1.21k
    ver += tmp * 1000;
187
1.21k
    return ver;
188
189
6.26k
fail:
190
6.26k
    hts_log_warning("Couldn't get VCF version, considering as %d.%d",
191
6.26k
        VCF_MAJOR_VER(VCF_DEF), VCF_MINOR_VER(VCF_DEF));
192
6.26k
    return VCF_DEF;
193
1.56k
}
194
195
// Header reference counting
196
197
static void bcf_hdr_incr_ref(bcf_hdr_t *h)
198
2.76k
{
199
2.76k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
200
2.76k
    aux->ref_count += 2;
201
2.76k
}
202
203
static void bcf_hdr_decr_ref(bcf_hdr_t *h)
204
2.76k
{
205
2.76k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
206
2.76k
    if (aux->ref_count >= 2)
207
2.76k
        aux->ref_count -= 2;
208
209
2.76k
    if (aux->ref_count == 0)
210
2.55k
        bcf_hdr_destroy(h);
211
2.76k
}
212
213
static void hdr_bgzf_private_data_cleanup(void *data)
214
2.76k
{
215
2.76k
    bcf_hdr_t *h = (bcf_hdr_t *) data;
216
2.76k
    bcf_hdr_decr_ref(h);
217
2.76k
}
218
219
static char *find_chrom_header_line(char *s)
220
0
{
221
0
    char *nl;
222
0
    if (strncmp(s, "#CHROM\t", 7) == 0) return s;
223
0
    else if ((nl = strstr(s, "\n#CHROM\t")) != NULL) return nl+1;
224
0
    else return NULL;
225
0
}
226
227
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v);
228
229
/*************************
230
 *** VCF header parser ***
231
 *************************/
232
233
static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
234
7.63k
{
235
7.63k
    const char *ss = s;
236
8.06k
    while ( *ss && isspace_c(*ss) && ss - s < len) ss++;
237
7.63k
    if ( !*ss || ss - s == len)
238
10
    {
239
10
        hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
240
10
        return -1;
241
10
    }
242
243
7.62k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
244
7.62k
    int ret;
245
7.62k
    char *sdup = malloc(len + 1);
246
7.62k
    if (!sdup) return -1;
247
7.62k
    memcpy(sdup, s, len);
248
7.62k
    sdup[len] = 0;
249
250
    // Ensure space is available in h->samples
251
7.62k
    size_t n = kh_size(d);
252
7.62k
    char **new_samples = hts_realloc_ps(h->samples, sizeof(*h->samples), n, 1);
253
7.62k
    if (!new_samples) {
254
0
        free(sdup);
255
0
        return -1;
256
0
    }
257
7.62k
    h->samples = new_samples;
258
259
7.62k
    int k = kh_put(vdict, d, sdup, &ret);
260
7.62k
    if (ret < 0) {
261
0
        free(sdup);
262
0
        return -1;
263
0
    }
264
7.62k
    if (ret) { // absent
265
7.62k
        kh_val(d, k) = bcf_idinfo_def;
266
7.62k
        kh_val(d, k).id = n;
267
7.62k
    } else {
268
6
        hts_log_error("Duplicated sample name '%s'", sdup);
269
6
        free(sdup);
270
6
        return -1;
271
6
    }
272
7.62k
    h->samples[n] = sdup;
273
7.62k
    h->dirty = 1;
274
7.62k
    return 0;
275
7.62k
}
276
277
int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
278
0
{
279
0
    if (!s) {
280
        // Allowed for backwards-compatibility, calling with s == NULL
281
        // used to trigger bcf_hdr_sync(h);
282
0
        return 0;
283
0
    }
284
0
    return bcf_hdr_add_sample_len(h, s, strlen(s));
285
0
}
286
287
int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str)
288
4.89k
{
289
4.89k
    const char *mandatory = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
290
4.89k
    if ( strncmp(str,mandatory,strlen(mandatory)) )
291
184
    {
292
184
        hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str);
293
184
        return -1;
294
184
    }
295
296
4.71k
    const char *beg = str + strlen(mandatory), *end;
297
4.71k
    if ( !*beg || *beg=='\n' ) return 0;
298
1.59k
    if ( strncmp(beg,"\tFORMAT\t",8) )
299
22
    {
300
22
        hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str);
301
22
        return -1;
302
22
    }
303
1.57k
    beg += 8;
304
305
1.57k
    int ret = 0;
306
7.75k
    while ( *beg )
307
7.63k
    {
308
7.63k
        end = beg;
309
169M
        while ( *end && *end!='\t' && *end!='\n' ) end++;
310
7.63k
        if ( bcf_hdr_add_sample_len(hdr, beg, end-beg) < 0 ) ret = -1;
311
7.63k
        if ( !*end || *end=='\n' || ret<0 ) break;
312
6.18k
        beg = end + 1;
313
6.18k
    }
314
1.57k
    return ret;
315
1.59k
}
316
317
int bcf_hdr_sync(bcf_hdr_t *h)
318
85.2k
{
319
85.2k
    int i;
320
341k
    for (i = 0; i < 3; i++)
321
255k
    {
322
255k
        vdict_t *d = (vdict_t*)h->dict[i];
323
255k
        khint_t k;
324
255k
        if ( h->n[i] < kh_size(d) )
325
1.44k
        {
326
1.44k
            bcf_idpair_t *new_idpair;
327
            // this should be true only for i=2, BCF_DT_SAMPLE
328
1.44k
            new_idpair = hts_realloc_p(h->id[i], sizeof(bcf_idpair_t), kh_size(d));
329
1.44k
            if (!new_idpair) return -1;
330
1.44k
            h->n[i] = kh_size(d);
331
1.44k
            h->id[i] = new_idpair;
332
1.44k
        }
333
2.96G
        for (k=kh_begin(d); k<kh_end(d); k++)
334
2.96G
        {
335
2.96G
            if (!kh_exist(d,k)) continue;
336
22.9M
            h->id[i][kh_val(d,k).id].key = kh_key(d,k);
337
22.9M
            h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
338
22.9M
        }
339
255k
    }
340
341
    // Invalidate key length cache
342
85.2k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
343
85.2k
    if (aux && aux->key_len) {
344
3.05k
        free(aux->key_len);
345
3.05k
        aux->key_len = NULL;
346
3.05k
    }
347
348
85.2k
    h->dirty = 0;
349
85.2k
    return 0;
350
85.2k
}
351
352
void bcf_hrec_destroy(bcf_hrec_t *hrec)
353
174k
{
354
174k
    if (!hrec) return;
355
168k
    free(hrec->key);
356
168k
    if ( hrec->value ) free(hrec->value);
357
168k
    int i;
358
547k
    for (i=0; i<hrec->nkeys; i++)
359
378k
    {
360
378k
        free(hrec->keys[i]);
361
378k
        free(hrec->vals[i]);
362
378k
    }
363
168k
    free(hrec->keys);
364
168k
    free(hrec->vals);
365
168k
    free(hrec);
366
168k
}
367
368
// Copies all fields except IDX.
369
bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
370
0
{
371
0
    int save_errno;
372
0
    bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
373
0
    if (!out) return NULL;
374
375
0
    out->type = hrec->type;
376
0
    if ( hrec->key ) {
377
0
        out->key = strdup(hrec->key);
378
0
        if (!out->key) goto fail;
379
0
    }
380
0
    if ( hrec->value ) {
381
0
        out->value = strdup(hrec->value);
382
0
        if (!out->value) goto fail;
383
0
    }
384
0
    out->nkeys = hrec->nkeys;
385
0
    out->keys = hts_malloc_p(sizeof(char*), hrec->nkeys);
386
0
    if (!out->keys) goto fail;
387
0
    out->vals = hts_malloc_p(sizeof(char*), hrec->nkeys);
388
0
    if (!out->vals) goto fail;
389
0
    int i, j = 0;
390
0
    for (i=0; i<hrec->nkeys; i++)
391
0
    {
392
0
        if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
393
0
        if ( hrec->keys[i] ) {
394
0
            out->keys[j] = strdup(hrec->keys[i]);
395
0
            if (!out->keys[j]) goto fail;
396
0
        }
397
0
        if ( hrec->vals[i] ) {
398
0
            out->vals[j] = strdup(hrec->vals[i]);
399
0
            if (!out->vals[j]) goto fail;
400
0
        }
401
0
        j++;
402
0
    }
403
0
    if ( i!=j ) out->nkeys -= i-j;   // IDX was omitted
404
0
    return out;
405
406
0
 fail:
407
0
    save_errno = errno;
408
0
    hts_log_error("%s", strerror(errno));
409
0
    bcf_hrec_destroy(out);
410
0
    errno = save_errno;
411
0
    return NULL;
412
0
}
413
414
void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
415
0
{
416
0
    fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
417
0
    int i;
418
0
    for (i=0; i<hrec->nkeys; i++)
419
0
        fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
420
0
    fprintf(fp, "\n");
421
0
}
422
423
void bcf_header_debug(bcf_hdr_t *hdr)
424
0
{
425
0
    int i, j;
426
0
    for (i=0; i<hdr->nhrec; i++)
427
0
    {
428
0
        if ( !hdr->hrec[i]->value )
429
0
        {
430
0
            fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
431
0
            fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
432
0
            for (j=1; j<hdr->hrec[i]->nkeys; j++)
433
0
                fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
434
0
            fprintf(stderr,">\n");
435
0
        }
436
0
        else
437
0
            fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
438
0
    }
439
0
}
440
441
int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
442
288k
{
443
288k
    char **tmp;
444
288k
    size_t n = hrec->nkeys + 1;
445
288k
    assert(len > 0 && len < SIZE_MAX);
446
288k
    tmp = hts_realloc_p(hrec->keys, sizeof(char*), n);
447
288k
    if (!tmp) return -1;
448
288k
    hrec->keys = tmp;
449
288k
    tmp = hts_realloc_p(hrec->vals, sizeof(char*), n);
450
288k
    if (!tmp) return -1;
451
288k
    hrec->vals = tmp;
452
453
288k
    hrec->keys[hrec->nkeys] = hts_malloc_ps(sizeof(char), len, 1);
454
288k
    if (!hrec->keys[hrec->nkeys]) return -1;
455
288k
    memcpy(hrec->keys[hrec->nkeys],str,len);
456
288k
    hrec->keys[hrec->nkeys][len] = 0;
457
288k
    hrec->vals[hrec->nkeys] = NULL;
458
288k
    hrec->nkeys = n;
459
288k
    return 0;
460
288k
}
461
462
int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
463
288k
{
464
288k
    if ( hrec->vals[i] ) {
465
0
        free(hrec->vals[i]);
466
0
        hrec->vals[i] = NULL;
467
0
    }
468
288k
    if ( !str ) return 0;
469
288k
    if ( is_quoted )
470
84.1k
    {
471
84.1k
        if (len >= SIZE_MAX - 3) {
472
0
            errno = ENOMEM;
473
0
            return -1;
474
0
        }
475
84.1k
        hrec->vals[i] = hts_malloc_ps(sizeof(char), len, 3);
476
84.1k
        if (!hrec->vals[i]) return -1;
477
84.1k
        hrec->vals[i][0] = '"';
478
84.1k
        memcpy(&hrec->vals[i][1],str,len);
479
84.1k
        hrec->vals[i][len+1] = '"';
480
84.1k
        hrec->vals[i][len+2] = 0;
481
84.1k
    }
482
204k
    else
483
204k
    {
484
204k
        if (len == SIZE_MAX) {
485
0
            errno = ENOMEM;
486
0
            return -1;
487
0
        }
488
204k
        hrec->vals[i] = hts_malloc_ps(sizeof(char), len, 1);
489
204k
        if (!hrec->vals[i]) return -1;
490
204k
        memcpy(hrec->vals[i],str,len);
491
204k
        hrec->vals[i][len] = 0;
492
204k
    }
493
288k
    return 0;
494
288k
}
495
496
int hrec_add_idx(bcf_hrec_t *hrec, int idx)
497
90.1k
{
498
90.1k
    int n = hrec->nkeys + 1;
499
90.1k
    char **tmp = hts_realloc_p(hrec->keys, sizeof(char*), n);
500
90.1k
    if (!tmp) return -1;
501
90.1k
    hrec->keys = tmp;
502
503
90.1k
    tmp = hts_realloc_p(hrec->vals, sizeof(char*), n);
504
90.1k
    if (!tmp) return -1;
505
90.1k
    hrec->vals = tmp;
506
507
90.1k
    hrec->keys[hrec->nkeys] = strdup("IDX");
508
90.1k
    if (!hrec->keys[hrec->nkeys]) return -1;
509
510
90.1k
    kstring_t str = {0,0,0};
511
90.1k
    if (kputw(idx, &str) < 0) {
512
0
        free(hrec->keys[hrec->nkeys]);
513
0
        return -1;
514
0
    }
515
90.1k
    hrec->vals[hrec->nkeys] = str.s;
516
90.1k
    hrec->nkeys = n;
517
90.1k
    return 0;
518
90.1k
}
519
520
int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
521
114k
{
522
114k
    int i;
523
175k
    for (i=0; i<hrec->nkeys; i++)
524
129k
        if ( !strcasecmp(key,hrec->keys[i]) ) return i;
525
45.6k
    return -1;
526
114k
}
527
528
static void bcf_hrec_set_type(bcf_hrec_t *hrec)
529
316k
{
530
316k
    if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG;
531
286k
    else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
532
193k
    else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
533
99.2k
    else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
534
79.1k
    else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR;
535
63.1k
    else hrec->type = BCF_HL_GEN;
536
316k
}
537
538
539
/**
540
    The arrays were generated with
541
542
    valid_ctg:
543
        perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
544
545
    valid_tag:
546
        perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
547
*/
548
static const uint8_t valid_ctg[256] =
549
{
550
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
552
    0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
553
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
554
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
555
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
556
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
557
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
558
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
559
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
560
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
561
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
562
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
563
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
564
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
565
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
566
};
567
static const uint8_t valid_tag[256] =
568
{
569
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
570
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
571
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
572
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
573
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
574
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
575
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
576
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
577
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
578
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
579
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
580
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
581
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
582
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
583
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
584
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
585
};
586
587
/**
588
    bcf_hrec_check() - check the validity of structured header lines
589
590
    Returns 0 on success or negative value on error.
591
592
    Currently the return status is not checked by the caller
593
    and only a warning is printed on stderr. This should be improved
594
    to propagate the error all the way up to the caller and let it
595
    decide what to do: throw an error or proceed anyway.
596
 */
597
static int bcf_hrec_check(bcf_hrec_t *hrec)
598
158k
{
599
158k
    int i;
600
158k
    bcf_hrec_set_type(hrec);
601
602
158k
    if ( hrec->type==BCF_HL_CTG )
603
14.7k
    {
604
14.7k
        i = bcf_hrec_find_key(hrec,"ID");
605
14.7k
        if ( i<0 ) goto err_missing_id;
606
10.1k
        char *val = hrec->vals[i];
607
10.1k
        if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg;
608
55.2k
        while ( *(++val) )
609
54.9k
            if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg;
610
353
        return 0;
611
1.48k
    }
612
143k
    if ( hrec->type==BCF_HL_INFO )
613
46.3k
    {
614
46.3k
        i = bcf_hrec_find_key(hrec,"ID");
615
46.3k
        if ( i<0 ) goto err_missing_id;
616
32.5k
        char *val = hrec->vals[i];
617
32.5k
        if ( !strcmp(val,"1000G") ) return 0;
618
32.5k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
619
15.3k
        while ( *(++val) )
620
13.3k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
621
1.99k
        return 0;
622
3.95k
    }
623
96.9k
    if ( hrec->type==BCF_HL_FMT )
624
10.0k
    {
625
10.0k
        i = bcf_hrec_find_key(hrec,"ID");
626
10.0k
        if ( i<0 ) goto err_missing_id;
627
8.82k
        char *val = hrec->vals[i];
628
8.82k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
629
10.3k
        while ( *(++val) )
630
7.84k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
631
2.47k
        return 0;
632
4.37k
    }
633
86.9k
    return 0;
634
635
19.6k
  err_missing_id:
636
19.6k
    hts_log_warning("Missing ID attribute in one or more header lines");
637
19.6k
    return -1;
638
639
9.76k
  err_invalid_ctg:
640
9.76k
    hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]);
641
9.76k
    return -1;
642
643
36.8k
  err_invalid_tag:
644
36.8k
    hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]);
645
36.8k
    return -1;
646
96.9k
}
647
648
static inline int is_escaped(const char *min, const char *str)
649
84.2k
{
650
84.2k
    int n = 0;
651
84.5k
    while ( --str>=min && *str=='\\' ) n++;
652
84.2k
    return n%2;
653
84.2k
}
654
655
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
656
182k
{
657
182k
    bcf_hrec_t *hrec = NULL;
658
182k
    const char *p = line;
659
182k
    if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
660
174k
    p += 2;
661
662
174k
    const char *q = p;
663
1.61M
    while ( *q && *q!='=' && *q != '\n' ) q++;
664
174k
    ptrdiff_t n = q-p;
665
174k
    if ( *q!='=' || !n ) // wrong format
666
5.91k
        goto malformed_line;
667
668
168k
    hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
669
168k
    if (!hrec) { *len = -1; return NULL; }
670
168k
    hrec->key = hts_malloc_ps(sizeof(char), n, 1);
671
168k
    if (!hrec->key) goto fail;
672
168k
    memcpy(hrec->key,p,n);
673
168k
    hrec->key[n] = 0;
674
168k
    hrec->type = -1;
675
676
168k
    p = ++q;
677
168k
    if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
678
37.1k
    {
679
8.83M
        while ( *q && *q!='\n' ) q++;
680
37.1k
        hrec->value = hts_malloc_p(sizeof(char), (q-p+1));
681
37.1k
        if (!hrec->value) goto fail;
682
37.1k
        memcpy(hrec->value, p, q-p);
683
37.1k
        hrec->value[q-p] = 0;
684
37.1k
        *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
685
37.1k
        return hrec;
686
37.1k
    }
687
688
    // structured line, e.g.
689
    // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
690
    // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
691
131k
    int nopen = 1;
692
420k
    while ( *q && *q!='\n' && nopen>0 )
693
299k
    {
694
299k
        p = ++q;
695
302k
        while ( *q && *q==' ' ) { p++; q++; }
696
        // ^[A-Za-z_][0-9A-Za-z_.]*$
697
299k
        if (p==q && *q && (isalpha_c(*q) || *q=='_'))
698
295k
        {
699
295k
            q++;
700
1.72M
            while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
701
295k
        }
702
299k
        n = q-p;
703
299k
        int m = 0;
704
299k
        while ( *q && *q==' ' ) { q++; m++; }
705
299k
        if ( *q!='=' || !n )
706
10.4k
            goto malformed_line;
707
708
288k
        if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
709
288k
        p = ++q;
710
291k
        while ( *q && *q==' ' ) { p++; q++; }
711
712
288k
        int quoted = 0;
713
288k
        char ending = '\0';
714
288k
        switch (*p) {
715
84.1k
        case '"':
716
84.1k
            quoted = 1;
717
84.1k
            ending = '"';
718
84.1k
            p++;
719
84.1k
            break;
720
285
        case '[':
721
285
            quoted = 1;
722
285
            ending = ']';
723
285
            break;
724
288k
        }
725
288k
        if ( quoted ) q++;
726
220M
        while ( *q && *q != '\n' )
727
220M
        {
728
220M
            if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; }
729
219M
            else
730
219M
            {
731
219M
                if ( *q=='<' ) nopen++;
732
219M
                if ( *q=='>' ) nopen--;
733
219M
                if ( !nopen ) break;
734
219M
                if ( *q==',' && nopen==1 ) break;
735
219M
            }
736
220M
            q++;
737
220M
        }
738
288k
        const char *r = q;
739
288k
        if (quoted && ending == ']') {
740
285
            if (*q == ending) {
741
230
                r++;
742
230
                q++;
743
230
                quoted = 0;
744
230
            } else {
745
55
                char buffer[320];
746
55
                hts_log_error("Missing ']' in header line %s",
747
55
                              hts_strprint(buffer, sizeof(buffer), '"',
748
55
                                           line, q-line));
749
55
                goto fail;
750
55
            }
751
285
        }
752
289k
        while ( r > p && r[-1] == ' ' ) r--;
753
288k
        if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
754
0
            goto fail;
755
288k
        if ( quoted && *q==ending ) q++;
756
288k
        if ( *q=='>' )
757
93.0k
        {
758
93.0k
            if (nopen) nopen--;     // this can happen with nested angle brackets <>
759
93.0k
            q++;
760
93.0k
        }
761
288k
    }
762
120k
    if ( nopen )
763
27.8k
        hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]);
764
765
    // Skip to end of line
766
120k
    int nonspace = 0;
767
120k
    p = q;
768
998k
    while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
769
120k
    if (nonspace) {
770
1.17k
        char buffer[320];
771
1.17k
        hts_log_warning("Dropped trailing junk from header line '%s'",
772
1.17k
                        hts_strprint(buffer, sizeof(buffer),
773
1.17k
                                     '"', line, q - line));
774
1.17k
    }
775
776
120k
    *len = q - line + (*q ? 1 : 0);
777
120k
    return hrec;
778
779
55
 fail:
780
55
    *len = -1;
781
55
    bcf_hrec_destroy(hrec);
782
55
    return NULL;
783
784
16.3k
 malformed_line:
785
16.3k
    {
786
16.3k
        char buffer[320];
787
6.80M
        while ( *q && *q!='\n' ) q++;  // Ensure *len includes full line
788
16.3k
        hts_log_error("Could not parse the header line: %s",
789
16.3k
                      hts_strprint(buffer, sizeof(buffer),
790
16.3k
                                   '"', line, q - line));
791
16.3k
        *len = q - line + (*q ? 1 : 0);
792
16.3k
        bcf_hrec_destroy(hrec);
793
16.3k
        return NULL;
794
131k
    }
795
131k
}
796
797
static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
798
89.1k
{
799
89.1k
    size_t new_n;
800
801
    // If available, preserve existing IDX
802
89.1k
    if ( idinfo->id==-1 )
803
88.5k
        idinfo->id = hdr->n[dict_type];
804
555
    else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
805
9
    {
806
9
        hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
807
9
            idinfo->id, tag);
808
9
        errno = EINVAL;
809
9
        return -1;
810
9
    }
811
812
89.0k
    new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type];
813
89.0k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
814
    // hts_resize() can attempt to allocate up to 2 * requested items
815
89.0k
    if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t)))
816
28
        return -1;
817
89.0k
#endif
818
89.0k
    if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
819
89.0k
                   &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
820
0
        return -1;
821
0
    }
822
89.0k
    hdr->n[dict_type] = new_n;
823
824
    // NB: the next kh_put call can invalidate the idinfo pointer, therefore
825
    // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
826
89.0k
    hdr->id[dict_type][idinfo->id].key = tag;
827
828
89.0k
    return 0;
829
89.0k
}
830
831
// returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
832
static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
833
158k
{
834
    // contig
835
158k
    int i, ret, replacing = 0;
836
158k
    khint_t k;
837
158k
    char *str = NULL;
838
839
158k
    bcf_hrec_set_type(hrec);
840
841
158k
    if ( hrec->type==BCF_HL_CTG )
842
14.7k
    {
843
14.7k
        hts_pos_t len = 0;
844
845
        // Get the contig ID ($str) and length ($j)
846
14.7k
        i = bcf_hrec_find_key(hrec,"length");
847
14.7k
        if ( i<0 ) len = 0;
848
1.68k
        else {
849
1.68k
            char *end = hrec->vals[i];
850
1.68k
            len = strtoll(hrec->vals[i], &end, 10);
851
1.68k
            if (end == hrec->vals[i] || len < 0) return 0;
852
1.68k
        }
853
854
14.1k
        i = bcf_hrec_find_key(hrec,"ID");
855
14.1k
        if ( i<0 ) return 0;
856
10.1k
        str = strdup(hrec->vals[i]);
857
10.1k
        if (!str) return -1;
858
859
        // Register in the dictionary
860
10.1k
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
861
10.1k
        khint_t k = kh_get(vdict, d, str);
862
10.1k
        if ( k != kh_end(d) ) { // already present
863
3.52k
            free(str); str=NULL;
864
3.52k
            if (kh_val(d, k).hrec[0] != NULL) // and not removed
865
3.52k
                return 0;
866
0
            replacing = 1;
867
6.60k
        } else {
868
6.60k
            k = kh_put(vdict, d, str, &ret);
869
6.60k
            if (ret < 0) { free(str); return -1; }
870
6.60k
        }
871
872
6.60k
        int idx = bcf_hrec_find_key(hrec,"IDX");
873
6.60k
        if ( idx!=-1 )
874
1.24k
        {
875
1.24k
            char *tmp = hrec->vals[idx];
876
1.24k
            idx = strtol(hrec->vals[idx], &tmp, 10);
877
1.24k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
878
1.18k
            {
879
1.18k
                if (!replacing) {
880
1.18k
                    kh_del(vdict, d, k);
881
1.18k
                    free(str);
882
1.18k
                }
883
1.18k
                hts_log_warning("Error parsing the IDX tag, skipping");
884
1.18k
                return 0;
885
1.18k
            }
886
1.24k
        }
887
888
5.41k
        kh_val(d, k) = bcf_idinfo_def;
889
5.41k
        kh_val(d, k).id = idx;
890
5.41k
        kh_val(d, k).info[0] = len;
891
5.41k
        kh_val(d, k).hrec[0] = hrec;
892
5.41k
        if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
893
21
            if (!replacing) {
894
21
                kh_del(vdict, d, k);
895
21
                free(str);
896
21
            }
897
21
            return -1;
898
21
        }
899
5.39k
        if ( idx==-1 ) {
900
5.36k
            if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
901
0
               return -1;
902
0
            }
903
5.36k
        }
904
905
5.39k
        return 1;
906
5.39k
    }
907
908
143k
    if ( hrec->type==BCF_HL_STR ) return 1;
909
135k
    if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0;
910
911
    // INFO/FILTER/FORMAT
912
103k
    char *id = NULL;
913
103k
    uint32_t type = UINT32_MAX, var = UINT32_MAX;
914
103k
    int num = -1, idx = -1;
915
361k
    for (i=0; i<hrec->nkeys; i++)
916
258k
    {
917
258k
        if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
918
169k
        else if ( !strcmp(hrec->keys[i], "IDX") )
919
1.24k
        {
920
1.24k
            char *tmp = hrec->vals[i];
921
1.24k
            idx = strtol(hrec->vals[i], &tmp, 10);
922
1.24k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
923
264
            {
924
264
                hts_log_warning("Error parsing the IDX tag, skipping");
925
264
                return 0;
926
264
            }
927
1.24k
        }
928
168k
        else if ( !strcmp(hrec->keys[i], "Type") )
929
42.3k
        {
930
42.3k
            if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
931
40.6k
            else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
932
39.7k
            else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
933
5.17k
            else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
934
5.01k
            else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
935
3.84k
            else
936
3.84k
            {
937
3.84k
                hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
938
3.84k
                type = BCF_HT_STR;
939
3.84k
            }
940
42.3k
        }
941
126k
        else if ( !strcmp(hrec->keys[i], "Number") )
942
39.4k
        {
943
39.4k
            int is_fmt = hrec->type == BCF_HL_FMT;
944
39.4k
            if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
945
36.7k
            else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
946
36.6k
            else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
947
35.7k
            else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
948
35.7k
            else if ( is_fmt && !strcmp(hrec->vals[i],"P") )  var = BCF_VL_P;
949
35.7k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LA") ) var = BCF_VL_LA;
950
35.7k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LR") ) var = BCF_VL_LR;
951
35.7k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LG") ) var = BCF_VL_LG;
952
35.7k
            else if ( is_fmt && !strcmp(hrec->vals[i],"M") )  var = BCF_VL_M;
953
35.7k
            else
954
35.7k
            {
955
35.7k
                if (sscanf(hrec->vals[i],"%d",&num) == 1)
956
35.2k
                    var = BCF_VL_FIXED;
957
35.7k
            }
958
39.4k
            if (var != BCF_VL_FIXED) num = 0xfffff;
959
39.4k
        }
960
258k
    }
961
103k
    if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
962
56.1k
        if (type == -1) {
963
14.7k
            hts_log_warning("%s %s field has no Type defined. Assuming String",
964
14.7k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
965
14.7k
            type = BCF_HT_STR;
966
14.7k
        }
967
56.1k
        if (var == UINT32_MAX) {
968
17.2k
            hts_log_warning("%s %s field has no Number defined. Assuming '.'",
969
17.2k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
970
17.2k
            var = BCF_VL_VAR;
971
17.2k
        }
972
56.1k
        if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) )
973
856
        {
974
856
            hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id);
975
856
            var = BCF_VL_FIXED;
976
856
            num = 0;
977
856
        }
978
56.1k
    }
979
103k
    uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
980
103k
                     (var & 0xf) << 8 |
981
103k
                     (type & 0xf) << 4 |
982
103k
                     (((uint32_t) hrec->type) & 0xf));
983
984
103k
    if ( !id ) return 0;
985
88.5k
    str = strdup(id);
986
88.5k
    if (!str) return -1;
987
988
88.5k
    vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
989
88.5k
    k = kh_get(vdict, d, str);
990
88.5k
    if ( k != kh_end(d) )
991
4.83k
    {
992
        // already present
993
4.83k
        free(str);
994
4.83k
        if ( kh_val(d, k).hrec[info&0xf] ) return 0;
995
1.58k
        kh_val(d, k).info[info&0xf] = info;
996
1.58k
        kh_val(d, k).hrec[info&0xf] = hrec;
997
1.58k
        if ( idx==-1 ) {
998
1.58k
            if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
999
0
                return -1;
1000
0
            }
1001
1.58k
        }
1002
1.58k
        return 1;
1003
1.58k
    }
1004
83.6k
    k = kh_put(vdict, d, str, &ret);
1005
83.6k
    if (ret < 0) {
1006
0
        free(str);
1007
0
        return -1;
1008
0
    }
1009
83.6k
    kh_val(d, k) = bcf_idinfo_def;
1010
83.6k
    kh_val(d, k).info[info&0xf] = info;
1011
83.6k
    kh_val(d, k).hrec[info&0xf] = hrec;
1012
83.6k
    kh_val(d, k).id = idx;
1013
83.6k
    if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
1014
16
        kh_del(vdict, d, k);
1015
16
        free(str);
1016
16
        return -1;
1017
16
    }
1018
83.6k
    if ( idx==-1 ) {
1019
83.1k
        if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
1020
0
            return -1;
1021
0
        }
1022
83.1k
    }
1023
1024
83.6k
    return 1;
1025
83.6k
}
1026
1027
static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1028
0
{
1029
0
    if (hrec->type == BCF_HL_FLT ||
1030
0
        hrec->type == BCF_HL_INFO ||
1031
0
        hrec->type == BCF_HL_FMT ||
1032
0
        hrec->type == BCF_HL_CTG) {
1033
0
        int id = bcf_hrec_find_key(hrec, "ID");
1034
0
        if (id < 0 || !hrec->vals[id])
1035
0
            return;
1036
0
        vdict_t *dict = (hrec->type == BCF_HL_CTG
1037
0
                         ? (vdict_t*)hdr->dict[BCF_DT_CTG]
1038
0
                         : (vdict_t*)hdr->dict[BCF_DT_ID]);
1039
0
        khint_t k = kh_get(vdict, dict, hrec->vals[id]);
1040
0
        if (k != kh_end(dict))
1041
0
            kh_val(dict, k).hrec[hrec->type==BCF_HL_CTG ? 0 : hrec->type] = NULL;
1042
0
    }
1043
0
}
1044
1045
static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1046
0
{
1047
0
    kstring_t str = KS_INITIALIZE;
1048
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1049
0
    khint_t k;
1050
0
    int id;
1051
1052
0
    switch (hrec->type) {
1053
0
    case BCF_HL_GEN:
1054
0
        if (ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0)
1055
0
            str.l = 0;
1056
0
        break;
1057
0
    case BCF_HL_STR:
1058
0
        id = bcf_hrec_find_key(hrec, "ID");
1059
0
        if (id < 0)
1060
0
            return;
1061
0
        if (!hrec->vals[id] ||
1062
0
            ksprintf(&str, "##%s=<ID=%s>", hrec->key, hrec->vals[id]) < 0)
1063
0
            str.l = 0;
1064
0
        break;
1065
0
    default:
1066
0
        return;
1067
0
    }
1068
0
    if (str.l) {
1069
0
        k = kh_get(hdict, aux->gen, str.s);
1070
0
    } else {
1071
        // Couldn't get a string for some reason, so try the hard way...
1072
0
        for (k = kh_begin(aux->gen); k < kh_end(aux->gen); k++) {
1073
0
            if (kh_exist(aux->gen, k) && kh_val(aux->gen, k) == hrec)
1074
0
                break;
1075
0
        }
1076
0
    }
1077
0
    if (k != kh_end(aux->gen) && kh_val(aux->gen, k) == hrec) {
1078
0
        kh_val(aux->gen, k) = NULL;
1079
0
        free((char *) kh_key(aux->gen, k));
1080
0
        kh_key(aux->gen, k) = NULL;
1081
0
        kh_del(hdict, aux->gen, k);
1082
0
    }
1083
0
    free(str.s);
1084
0
}
1085
1086
int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp)
1087
0
{
1088
0
    assert( hrec->type==BCF_HL_GEN );
1089
0
    int ret;
1090
0
    khint_t k;
1091
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1092
0
    for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1093
0
    {
1094
0
        if ( !kh_exist(aux->gen,k) ) continue;
1095
0
        if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue;
1096
0
        break;
1097
0
    }
1098
0
    assert( k<kh_end(aux->gen) );   // something went wrong, should never happen
1099
0
    free((char*)kh_key(aux->gen,k));
1100
0
    kh_del(hdict,aux->gen,k);
1101
0
    kstring_t str = {0,0,0};
1102
0
    if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 )
1103
0
    {
1104
0
        free(str.s);
1105
0
        return -1;
1106
0
    }
1107
0
    k = kh_put(hdict, aux->gen, str.s, &ret);
1108
0
    if ( ret<0 )
1109
0
    {
1110
0
        free(str.s);
1111
0
        return -1;
1112
0
    }
1113
0
    free(hrec->value);
1114
0
    hrec->value = strdup(tmp->value);
1115
0
    if ( !hrec->value ) return -1;
1116
0
    kh_val(aux->gen,k) = hrec;
1117
1118
0
    if (!strcmp(hrec->key,"fileformat")) {
1119
        //update version
1120
0
        get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value);
1121
0
    }
1122
0
    return 0;
1123
0
}
1124
1125
int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1126
158k
{
1127
158k
    kstring_t str = {0,0,0};
1128
158k
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1129
1130
158k
    int res;
1131
158k
    if ( !hrec ) return 0;
1132
1133
158k
    bcf_hrec_check(hrec);   // todo: check return status and propagate errors up
1134
1135
158k
    res = bcf_hdr_register_hrec(hdr,hrec);
1136
158k
    if (res < 0) return -1;
1137
158k
    if ( !res )
1138
59.3k
    {
1139
        // If one of the hashed field, then it is already present
1140
59.3k
        if ( hrec->type != BCF_HL_GEN )
1141
27.8k
        {
1142
27.8k
            bcf_hrec_destroy(hrec);
1143
27.8k
            return 0;
1144
27.8k
        }
1145
        // Is one of the generic fields and already present?
1146
31.5k
        if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 )
1147
0
        {
1148
0
            free(str.s);
1149
0
            return -1;
1150
0
        }
1151
31.5k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1152
31.5k
        if ( k != kh_end(aux->gen) )
1153
18.2k
        {
1154
            // duplicate record
1155
18.2k
            bcf_hrec_destroy(hrec);
1156
18.2k
            free(str.s);
1157
18.2k
            return 0;
1158
18.2k
        }
1159
13.3k
        if (!strcmp(hrec->key, "fileformat")) {
1160
6.38k
            aux->version = bcf_get_version(NULL, hrec->value);
1161
6.38k
        }
1162
13.3k
    }
1163
1164
111k
    int i;
1165
111k
    if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 )
1166
4.46k
    {
1167
4.46k
        if ( ksprintf(&str, "##%s=<ID=%s>", hrec->key,hrec->vals[i]) < 0 )
1168
0
        {
1169
0
            free(str.s);
1170
0
            return -1;
1171
0
        }
1172
4.46k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1173
4.46k
        if ( k != kh_end(aux->gen) )
1174
3.20k
        {
1175
            // duplicate record
1176
3.20k
            bcf_hrec_destroy(hrec);
1177
3.20k
            free(str.s);
1178
3.20k
            return 0;
1179
3.20k
        }
1180
4.46k
    }
1181
1182
    // New record, needs to be added
1183
108k
    int n = hdr->nhrec + 1;
1184
108k
    bcf_hrec_t **new_hrec = hts_realloc_p(hdr->hrec, sizeof(bcf_hrec_t*), n);
1185
108k
    if (!new_hrec) {
1186
0
        free(str.s);
1187
0
        bcf_hdr_unregister_hrec(hdr, hrec);
1188
0
        return -1;
1189
0
    }
1190
108k
    hdr->hrec = new_hrec;
1191
1192
108k
    if ( str.s )
1193
14.5k
    {
1194
14.5k
        khint_t k = kh_put(hdict, aux->gen, str.s, &res);
1195
14.5k
        if ( res<0 )
1196
0
        {
1197
0
            free(str.s);
1198
0
            return -1;
1199
0
        }
1200
14.5k
        kh_val(aux->gen,k) = hrec;
1201
14.5k
    }
1202
1203
108k
    hdr->hrec[hdr->nhrec] = hrec;
1204
108k
    hdr->dirty = 1;
1205
108k
    hdr->nhrec = n;
1206
1207
108k
    return hrec->type==BCF_HL_GEN ? 0 : 1;
1208
108k
}
1209
1210
bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
1211
1.08k
{
1212
1.08k
    int i;
1213
1.08k
    if ( type==BCF_HL_GEN )
1214
1.08k
    {
1215
        // e.g. ##fileformat=VCFv4.2
1216
        //      ##source=GenomicsDBImport
1217
        //      ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364
1218
1.08k
        if ( value )
1219
0
        {
1220
0
            kstring_t str = {0,0,0};
1221
0
            ksprintf(&str, "##%s=%s", key,value);
1222
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1223
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1224
0
            free(str.s);
1225
0
            if ( k == kh_end(aux->gen) ) return NULL;
1226
0
            return kh_val(aux->gen, k);
1227
0
        }
1228
2.33k
        for (i=0; i<hdr->nhrec; i++)
1229
1.49k
        {
1230
1.49k
            if ( hdr->hrec[i]->type!=type ) continue;
1231
360
            if ( strcmp(hdr->hrec[i]->key,key) ) continue;
1232
254
            return hdr->hrec[i];
1233
360
        }
1234
832
        return NULL;
1235
1.08k
    }
1236
0
    else if ( type==BCF_HL_STR )
1237
0
    {
1238
        // e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport....">
1239
        //      ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
1240
0
        if (!str_class) return NULL;
1241
0
        if ( !strcmp("ID",key) )
1242
0
        {
1243
0
            kstring_t str = {0,0,0};
1244
0
            ksprintf(&str, "##%s=<%s=%s>",str_class,key,value);
1245
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1246
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1247
0
            free(str.s);
1248
0
            if ( k == kh_end(aux->gen) ) return NULL;
1249
0
            return kh_val(aux->gen, k);
1250
0
        }
1251
0
        for (i=0; i<hdr->nhrec; i++)
1252
0
        {
1253
0
            if ( hdr->hrec[i]->type!=type ) continue;
1254
0
            if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
1255
0
            int j = bcf_hrec_find_key(hdr->hrec[i],key);
1256
0
            if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
1257
0
        }
1258
0
        return NULL;
1259
0
    }
1260
0
    vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1261
0
    khint_t k = kh_get(vdict, d, value);
1262
0
    if ( k == kh_end(d) ) return NULL;
1263
0
    return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
1264
0
}
1265
1266
// Check the VCF header is correctly formatted as per the specification.
1267
// Note the code that calls this doesn't bother to check return values and
1268
// we have so many broken VCFs in the wild that for now we just reprt a
1269
// warning and continue anyway.  So currently this is a void function.
1270
void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
1271
4.67k
{
1272
4.67k
    int version = bcf_get_version(hdr, NULL);
1273
1274
4.67k
    struct tag {
1275
4.67k
        char name[10];
1276
4.67k
        char number_str[3];
1277
4.67k
        int number;
1278
4.67k
        int version;
1279
4.67k
        int type;
1280
4.67k
    };
1281
1282
4.67k
    char type_str[][8] = {"Flag", "Integer", "Float", "String"};
1283
1284
4.67k
    struct tag info_tags[] = {
1285
4.67k
        {"AD",        "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1286
4.67k
        {"ADF",       "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1287
4.67k
        {"ADR",       "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1288
4.67k
        {"AC",        "A",  BCF_VL_A,     VCF_DEF, BCF_HT_INT},
1289
4.67k
        {"AF",        "A",  BCF_VL_A,     VCF_DEF, BCF_HT_REAL},
1290
4.67k
        {"CIGAR",     "A",  BCF_VL_A,     VCF_DEF, BCF_HT_STR},
1291
4.67k
        {"AA",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1292
4.67k
        {"AN",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1293
4.67k
        {"BQ",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL},
1294
4.67k
        {"DB",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1295
4.67k
        {"DP",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1296
4.67k
        {"END",       "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1297
4.67k
        {"H2",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1298
4.67k
        {"H3",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1299
4.67k
        {"MQ",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL},
1300
4.67k
        {"MQ0",       "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1301
4.67k
        {"NS",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1302
4.67k
        {"SB",        "4",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1303
4.67k
        {"SOMATIC",   "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1304
4.67k
        {"VALIDATED", "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1305
4.67k
        {"1000G",     "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1306
4.67k
    };
1307
4.67k
    static int info_warned[sizeof(info_tags)/sizeof(*info_tags)] = {0};
1308
1309
4.67k
    struct tag fmt_tags[] = {
1310
4.67k
        {"AD",   "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1311
4.67k
        {"ADF",  "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1312
4.67k
        {"ADR",  "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1313
4.67k
        {"EC",   "A",  BCF_VL_A,     VCF_DEF, BCF_HT_INT},
1314
4.67k
        {"GL",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_REAL},
1315
4.67k
        {"GP",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_REAL},
1316
4.67k
        {"PL",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_INT},
1317
4.67k
        {"PP",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_INT},
1318
4.67k
        {"DP",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1319
4.67k
        {"LEN",  "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1320
4.67k
        {"FT",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1321
4.67k
        {"GQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1322
4.67k
        {"GT",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1323
4.67k
        {"HQ",   "2",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1324
4.67k
        {"MQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1325
4.67k
        {"PQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1326
4.67k
        {"PS",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1327
4.67k
        {"PSL",  "P",  BCF_VL_P,     VCF44,   BCF_HT_STR},
1328
4.67k
        {"PSO",  "P",  BCF_VL_P,     VCF44,   BCF_HT_INT},
1329
4.67k
        {"PSQ",  "P",  BCF_VL_P,     VCF44,   BCF_HT_INT},
1330
4.67k
        {"LGL",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1331
4.67k
        {"LGP",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1332
4.67k
        {"LPL",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1333
4.67k
        {"LPP",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1334
4.67k
        {"LEC",  "LA", BCF_VL_LA,    VCF45,   BCF_HT_INT},
1335
4.67k
        {"LAD",  "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1336
4.67k
        {"LADF", "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1337
4.67k
        {"LADR", "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1338
4.67k
    };
1339
4.67k
    static int fmt_warned[sizeof(fmt_tags)/sizeof(*fmt_tags)] = {0};
1340
1341
    // Check INFO tag numbers.  We shouldn't really permit ".", but it's
1342
    // commonly misused so we let it slide unless it's a new tag and the
1343
    // file format claims to be new also.  We also cannot distinguish between
1344
    // Number=1 and Number=2, but we at least report the correct term if we
1345
    // get, say, Number=G in its place.
1346
    // Also check the types.
1347
4.67k
    int i;
1348
102k
    for (i = 0; i < sizeof(info_tags)/sizeof(*info_tags); i++) {
1349
98.1k
        if (info_warned[i])
1350
2.49k
            continue;
1351
95.6k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, info_tags[i].name);
1352
95.6k
        if (bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, id)) {
1353
1
            if (bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != info_tags[i].number &&
1354
1
                bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != BCF_VL_VAR) {
1355
0
                info_warned[i] = 1;
1356
1
            } else if (bcf_hdr_id2length(hdr, BCF_HL_INFO, id) == BCF_VL_FIXED &&
1357
0
                       bcf_hdr_id2number(hdr, BCF_HL_INFO, id) != atoi(info_tags[i].number_str)) {
1358
0
                info_warned[i] = 1;
1359
0
            }
1360
1361
1
            if (info_warned[i]) {
1362
0
                hts_log_warning("%s should be declared as Number=%s",
1363
0
                                info_tags[i].name, info_tags[i].number_str);
1364
0
            }
1365
1366
1
            if (bcf_hdr_id2type(hdr, BCF_HL_INFO, id) != info_tags[i].type) {
1367
1
                hts_log_warning("%s should be declared as Type=%s",
1368
1
                                info_tags[i].name, type_str[info_tags[i].type]);
1369
1
                info_warned[i] = 1;
1370
1
            }
1371
1
        }
1372
95.6k
    }
1373
1374
    // Check FORMAT tag numbers and types.
1375
135k
    for (i = 0; i < sizeof(fmt_tags)/sizeof(*fmt_tags); i++) {
1376
130k
        if (fmt_warned[i])
1377
0
            continue;
1378
130k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, fmt_tags[i].name);
1379
130k
        if (bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, id)) {
1380
0
            if (bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != fmt_tags[i].number) {
1381
                // Permit "Number=." if this tag predates the vcf version it is
1382
                // defined within.  This is a common tactic for callers to use
1383
                // new tags with older formats in order to avoid parsing failures
1384
                // with some software.
1385
                // We don't care for 4.3 and earlier as that's more of a wild-west
1386
                // and it's not abnormal to see incorrect usage of Number=. there.
1387
0
                if ((version < VCF44 &&
1388
0
                     bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != BCF_VL_VAR) ||
1389
0
                    (version >= VCF44 && version >= fmt_tags[i].version)) {
1390
0
                    fmt_warned[i] = 1;
1391
0
                }
1392
0
            } else if (bcf_hdr_id2length(hdr, BCF_HL_FMT, id) == BCF_VL_FIXED &&
1393
0
                       bcf_hdr_id2number(hdr, BCF_HL_FMT, id) != atoi(fmt_tags[i].number_str)) {
1394
0
                fmt_warned[i] = 1;
1395
0
            }
1396
1397
0
            if (fmt_warned[i]) {
1398
0
                hts_log_warning("%s should be declared as Number=%s",
1399
0
                                fmt_tags[i].name, fmt_tags[i].number_str);
1400
0
            }
1401
1402
0
            if (bcf_hdr_id2type(hdr, BCF_HL_FMT, id) != fmt_tags[i].type) {
1403
0
                hts_log_warning("%s should be declared as Type=%s",
1404
0
                                fmt_tags[i].name, type_str[fmt_tags[i].type]);
1405
0
                fmt_warned[i] = 1;
1406
0
            }
1407
0
        }
1408
130k
    }
1409
4.67k
}
1410
1411
int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
1412
6.33k
{
1413
6.33k
    int len, done = 0;
1414
6.33k
    char *p = htxt;
1415
1416
    // Check sanity: "fileformat" string must come as first
1417
6.33k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
1418
6.33k
    if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
1419
606
        hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
1420
6.33k
    if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1421
0
        bcf_hrec_destroy(hrec);
1422
0
        return -1;
1423
0
    }
1424
1425
    // The filter PASS must appear first in the dictionary
1426
6.33k
    hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
1427
6.33k
    if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) {
1428
0
        bcf_hrec_destroy(hrec);
1429
0
        return -1;
1430
0
    }
1431
1432
    // Parse the whole header
1433
23.9k
    do {
1434
89.1k
        while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) {
1435
65.2k
            if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1436
36
                bcf_hrec_destroy(hrec);
1437
36
                return -1;
1438
36
            }
1439
65.2k
            p += len;
1440
65.2k
        }
1441
23.9k
        assert(hrec == NULL);
1442
23.8k
        if (len < 0) {
1443
            // len < 0 indicates out-of-memory, or similar error
1444
38
            hts_log_error("Could not parse header line: %s", strerror(errno));
1445
38
            return -1;
1446
23.8k
        } else if (len > 0) {
1447
            // Bad header line.  bcf_hdr_parse_line() will have logged it.
1448
            // Skip and try again on the next line (p + len will be the start
1449
            // of the next one).
1450
16.2k
            p += len;
1451
16.2k
            continue;
1452
16.2k
        }
1453
1454
        // Next should be the sample line.  If not, it was a malformed
1455
        // header, in which case print a warning and skip (many VCF
1456
        // operations do not really care about a few malformed lines).
1457
        // In the future we may want to add a strict mode that errors in
1458
        // this case.
1459
7.57k
        if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) {
1460
2.67k
            char *eol = strchr(p, '\n');
1461
2.67k
            if (*p != '\0') {
1462
1.38k
                char buffer[320];
1463
1.38k
                hts_log_warning("Could not parse header line: %s",
1464
1.38k
                                hts_strprint(buffer, sizeof(buffer),
1465
1.38k
                                               '"', p,
1466
1.38k
                                               eol ? (eol - p) : SIZE_MAX));
1467
1.38k
            }
1468
2.67k
            if (eol) {
1469
1.32k
                p = eol + 1; // Try from the next line.
1470
1.35k
            } else {
1471
1.35k
                done = -1; // No more lines left, give up.
1472
1.35k
            }
1473
4.89k
        } else {
1474
4.89k
            done = 1; // Sample line found
1475
4.89k
        }
1476
23.8k
    } while (!done);
1477
1478
6.25k
    if (done < 0) {
1479
        // No sample line is fatal.
1480
1.35k
        hts_log_error("Could not parse the header, sample line not found");
1481
1.35k
        return -1;
1482
1.35k
    }
1483
1484
4.89k
    if (bcf_hdr_parse_sample_line(hdr,p) < 0)
1485
222
        return -1;
1486
4.67k
    if (bcf_hdr_sync(hdr) < 0)
1487
0
        return -1;
1488
4.67k
    bcf_hdr_check_sanity(hdr);
1489
4.67k
    return 0;
1490
4.67k
}
1491
1492
int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
1493
0
{
1494
0
    int len;
1495
0
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
1496
0
    if ( !hrec ) return -1;
1497
0
    if (bcf_hdr_add_hrec(hdr, hrec) < 0)
1498
0
        return -1;
1499
0
    return 0;
1500
0
}
1501
1502
void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
1503
0
{
1504
0
    int i = 0;
1505
0
    bcf_hrec_t *hrec;
1506
0
    if ( !key )
1507
0
    {
1508
        // no key, remove all entries of this type
1509
0
        while ( i<hdr->nhrec )
1510
0
        {
1511
0
            if ( hdr->hrec[i]->type!=type ) { i++; continue; }
1512
0
            hrec = hdr->hrec[i];
1513
0
            bcf_hdr_unregister_hrec(hdr, hrec);
1514
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1515
0
            hdr->dirty = 1;
1516
0
            hdr->nhrec--;
1517
0
            if ( i < hdr->nhrec )
1518
0
                memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1519
0
            bcf_hrec_destroy(hrec);
1520
0
        }
1521
0
        return;
1522
0
    }
1523
0
    while (1)
1524
0
    {
1525
0
        if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
1526
0
        {
1527
0
            hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
1528
0
            if ( !hrec ) return;
1529
1530
0
            for (i=0; i<hdr->nhrec; i++)
1531
0
                if ( hdr->hrec[i]==hrec ) break;
1532
0
            assert( i<hdr->nhrec );
1533
1534
0
            vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1535
0
            khint_t k = kh_get(vdict, d, key);
1536
0
            kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
1537
0
        }
1538
0
        else
1539
0
        {
1540
0
            for (i=0; i<hdr->nhrec; i++)
1541
0
            {
1542
0
                if ( hdr->hrec[i]->type!=type ) continue;
1543
0
                if ( type==BCF_HL_GEN )
1544
0
                {
1545
0
                    if ( !strcmp(hdr->hrec[i]->key,key) ) break;
1546
0
                }
1547
0
                else
1548
0
                {
1549
                    // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
1550
0
                    int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
1551
0
                    if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
1552
0
                }
1553
0
            }
1554
0
            if ( i==hdr->nhrec ) return;
1555
0
            hrec = hdr->hrec[i];
1556
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1557
0
        }
1558
1559
0
        hdr->nhrec--;
1560
0
        if ( i < hdr->nhrec )
1561
0
            memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1562
0
        bcf_hrec_destroy(hrec);
1563
0
        hdr->dirty = 1;
1564
0
    }
1565
0
}
1566
1567
int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
1568
0
{
1569
0
    char tmp[256], *line = tmp;
1570
0
    va_list ap;
1571
0
    va_start(ap, fmt);
1572
0
    int n = vsnprintf(line, sizeof(tmp), fmt, ap);
1573
0
    va_end(ap);
1574
1575
0
    if (n >= sizeof(tmp)) {
1576
0
        n++; // For trailing NUL
1577
0
        line = (char*)malloc(n);
1578
0
        if (!line)
1579
0
            return -1;
1580
1581
0
        va_start(ap, fmt);
1582
0
        vsnprintf(line, n, fmt, ap);
1583
0
        va_end(ap);
1584
0
    }
1585
1586
0
    int ret = bcf_hdr_append(hdr, line);
1587
1588
0
    if (line != tmp) free(line);
1589
0
    return ret;
1590
0
}
1591
1592
1593
/**********************
1594
 *** BCF header I/O ***
1595
 **********************/
1596
1597
const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
1598
1.08k
{
1599
1.08k
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1600
1.08k
    if ( !hrec )
1601
832
    {
1602
832
        hts_log_warning("No version string found, assuming VCFv4.2");
1603
832
        return "VCFv4.2";
1604
832
    }
1605
254
    return hrec->value;
1606
1.08k
}
1607
1608
int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
1609
0
{
1610
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1611
0
    if ( !hrec )
1612
0
    {
1613
0
        int len;
1614
0
        kstring_t str = {0,0,0};
1615
0
        if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1;
1616
0
        hrec = bcf_hdr_parse_line(hdr, str.s, &len);
1617
0
        free(str.s);
1618
1619
0
        get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value);
1620
0
    }
1621
0
    else
1622
0
    {
1623
0
        bcf_hrec_t *tmp = bcf_hrec_dup(hrec);
1624
0
        if ( !tmp ) return -1;
1625
0
        free(tmp->value);
1626
0
        tmp->value = strdup(version);
1627
0
        if ( !tmp->value ) return -1;
1628
0
        bcf_hdr_update_hrec(hdr, hrec, tmp);
1629
0
        bcf_hrec_destroy(tmp);
1630
0
    }
1631
0
    hdr->dirty = 1;
1632
    //TODO rlen may change, deal with it
1633
0
    return 0; // FIXME: check for errs in this function (return < 0 if so)
1634
0
}
1635
1636
bcf_hdr_t *bcf_hdr_init(const char *mode)
1637
6.44k
{
1638
6.44k
    int i;
1639
6.44k
    bcf_hdr_t *h;
1640
6.44k
    h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
1641
6.44k
    if (!h) return NULL;
1642
25.7k
    for (i = 0; i < 3; ++i) {
1643
19.3k
        if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
1644
        // Supersize the hash to make collisions very unlikely
1645
19.3k
        static int dsize[3] = {16384,16384,2048}; // info, contig, format
1646
19.3k
        if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail;
1647
19.3k
    }
1648
1649
6.44k
    bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
1650
6.44k
    if ( !aux ) goto fail;
1651
6.44k
    if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
1652
6.44k
    aux->key_len = NULL;
1653
6.44k
    aux->dict = *((vdict_t*)h->dict[0]);
1654
6.44k
    aux->version = 0;
1655
6.44k
    aux->ref_count = 1;
1656
6.44k
    free(h->dict[0]);
1657
6.44k
    h->dict[0] = aux;
1658
1659
6.44k
    if ( strchr(mode,'w') )
1660
0
    {
1661
0
        bcf_hdr_append(h, "##fileformat=VCFv4.2");
1662
        // The filter PASS must appear first in the dictionary
1663
0
        bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1664
0
        aux->version = VCF_DEF;
1665
0
    }
1666
6.44k
    return h;
1667
1668
0
 fail:
1669
0
    for (i = 0; i < 3; ++i)
1670
0
        kh_destroy(vdict, h->dict[i]);
1671
0
    free(h);
1672
0
    return NULL;
1673
6.44k
}
1674
1675
void bcf_hdr_destroy(bcf_hdr_t *h)
1676
8.99k
{
1677
8.99k
    int i;
1678
8.99k
    khint_t k;
1679
8.99k
    if (!h) return;
1680
8.99k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
1681
8.99k
    if (aux->ref_count > 1) // Refs still held, so delay destruction
1682
2.55k
    {
1683
2.55k
        aux->ref_count &= ~1;
1684
2.55k
        return;
1685
2.55k
    }
1686
25.7k
    for (i = 0; i < 3; ++i) {
1687
19.3k
        vdict_t *d = (vdict_t*)h->dict[i];
1688
19.3k
        if (d == 0) continue;
1689
224M
        for (k = kh_begin(d); k != kh_end(d); ++k)
1690
224M
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
1691
19.3k
        if ( i==0 )
1692
6.44k
        {
1693
41.4k
            for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1694
35.0k
                if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
1695
6.44k
            kh_destroy(hdict, aux->gen);
1696
6.44k
            free(aux->key_len); // may exist for dict[0] only
1697
6.44k
        }
1698
19.3k
        kh_destroy(vdict, d);
1699
19.3k
        free(h->id[i]);
1700
19.3k
    }
1701
115k
    for (i=0; i<h->nhrec; i++)
1702
108k
        bcf_hrec_destroy(h->hrec[i]);
1703
6.44k
    if (h->nhrec) free(h->hrec);
1704
6.44k
    if (h->samples) free(h->samples);
1705
6.44k
    free(h->keep_samples);
1706
6.44k
    free(h->transl[0]); free(h->transl[1]);
1707
6.44k
    free(h->mem.s);
1708
6.44k
    free(h);
1709
6.44k
}
1710
1711
bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
1712
6.44k
{
1713
6.44k
    if (hfp->format.format == vcf)
1714
5.75k
        return vcf_hdr_read(hfp);
1715
686
    if (hfp->format.format != bcf) {
1716
0
        hts_log_error("Input is not detected as bcf or vcf format");
1717
0
        return NULL;
1718
0
    }
1719
1720
686
    assert(hfp->is_bgzf);
1721
1722
686
    BGZF *fp = hfp->fp.bgzf;
1723
686
    uint8_t magic[5];
1724
686
    bcf_hdr_t *h;
1725
686
    h = bcf_hdr_init("r");
1726
686
    if (!h) {
1727
0
        hts_log_error("Failed to allocate bcf header");
1728
0
        return NULL;
1729
0
    }
1730
686
    if (bgzf_read(fp, magic, 5) != 5)
1731
6
    {
1732
6
        hts_log_error("Failed to read the header (reading BCF in text mode?)");
1733
6
        bcf_hdr_destroy(h);
1734
6
        return NULL;
1735
6
    }
1736
680
    if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
1737
14
    {
1738
14
        if (!strncmp((char*)magic, "BCF", 3))
1739
14
            hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
1740
0
        else
1741
0
            hts_log_error("Invalid BCF2 magic string");
1742
14
        bcf_hdr_destroy(h);
1743
14
        return NULL;
1744
14
    }
1745
666
    uint8_t buf[4];
1746
666
    size_t hlen;
1747
666
    char *htxt = NULL;
1748
666
    if (bgzf_read(fp, buf, 4) != 4) goto fail;
1749
662
    hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24);
1750
662
    if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
1751
662
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1752
662
    if (hlen > FUZZ_ALLOC_LIMIT/2) { errno = ENOMEM; goto fail; }
1753
658
#endif
1754
658
    htxt = (char*)malloc(hlen + 1);
1755
658
    if (!htxt) goto fail;
1756
658
    if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
1757
606
    htxt[hlen] = '\0'; // Ensure htxt is terminated
1758
606
    if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail;
1759
428
    free(htxt);
1760
1761
428
    bcf_hdr_incr_ref(h);
1762
428
    bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup);
1763
1764
428
    return h;
1765
238
 fail:
1766
238
    hts_log_error("Failed to read BCF header");
1767
238
    free(htxt);
1768
238
    bcf_hdr_destroy(h);
1769
238
    return NULL;
1770
606
}
1771
1772
int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
1773
4.67k
{
1774
4.67k
    if (!h) {
1775
0
        errno = EINVAL;
1776
0
        return -1;
1777
0
    }
1778
4.67k
    if ( h->dirty ) {
1779
0
        if (bcf_hdr_sync(h) < 0) return -1;
1780
0
    }
1781
4.67k
    hfp->format.category = variant_data;
1782
4.67k
    if (hfp->format.format == vcf || hfp->format.format == text_format) {
1783
2.33k
        hfp->format.format = vcf;
1784
2.33k
        return vcf_hdr_write(hfp, h);
1785
2.33k
    }
1786
1787
2.33k
    if (hfp->format.format == binary_format)
1788
2.33k
        hfp->format.format = bcf;
1789
1790
2.33k
    kstring_t htxt = {0,0,0};
1791
2.33k
    if (bcf_hdr_format(h, 1, &htxt) < 0) {
1792
0
        free(htxt.s);
1793
0
        return -1;
1794
0
    }
1795
2.33k
    kputc('\0', &htxt); // include the \0 byte
1796
1797
2.33k
    BGZF *fp = hfp->fp.bgzf;
1798
2.33k
    if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
1799
2.33k
    uint8_t hlen[4];
1800
2.33k
    u32_to_le(htxt.l, hlen);
1801
2.33k
    if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
1802
2.33k
    if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
1803
2.33k
    if ( bgzf_flush(fp) < 0) return -1;
1804
1805
2.33k
    bcf_hdr_incr_ref(h);
1806
2.33k
    bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup);
1807
1808
2.33k
    free(htxt.s);
1809
2.33k
    return 0;
1810
2.33k
}
1811
1812
/********************
1813
 *** BCF site I/O ***
1814
 ********************/
1815
1816
bcf1_t *bcf_init(void)
1817
4.67k
{
1818
4.67k
    bcf1_t *v;
1819
4.67k
    v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
1820
4.67k
    return v;
1821
4.67k
}
1822
1823
void bcf_clear(bcf1_t *v)
1824
39.0k
{
1825
39.0k
    int i;
1826
39.0k
    for (i=0; i<v->d.m_info; i++)
1827
0
    {
1828
0
        if ( v->d.info[i].vptr_free )
1829
0
        {
1830
0
            free(v->d.info[i].vptr - v->d.info[i].vptr_off);
1831
0
            v->d.info[i].vptr_free = 0;
1832
0
        }
1833
0
    }
1834
39.0k
    for (i=0; i<v->d.m_fmt; i++)
1835
0
    {
1836
0
        if ( v->d.fmt[i].p_free )
1837
0
        {
1838
0
            free(v->d.fmt[i].p - v->d.fmt[i].p_off);
1839
0
            v->d.fmt[i].p_free = 0;
1840
0
        }
1841
0
    }
1842
39.0k
    v->rid = v->pos = v->rlen = v->unpacked = 0;
1843
39.0k
    bcf_float_set_missing(v->qual);
1844
39.0k
    v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
1845
39.0k
    v->shared.l = v->indiv.l = 0;
1846
39.0k
    v->d.var_type = -1;
1847
39.0k
    v->d.shared_dirty = 0;
1848
39.0k
    v->d.indiv_dirty  = 0;
1849
39.0k
    v->d.n_flt = 0;
1850
39.0k
    v->errcode = 0;
1851
39.0k
    if (v->d.m_als) v->d.als[0] = 0;
1852
39.0k
    if (v->d.m_id) v->d.id[0] = 0;
1853
39.0k
}
1854
1855
void bcf_empty(bcf1_t *v)
1856
4.67k
{
1857
4.67k
    bcf_clear1(v);
1858
4.67k
    free(v->d.id);
1859
4.67k
    free(v->d.als);
1860
4.67k
    free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
1861
4.67k
    if (v->d.var ) free(v->d.var);
1862
4.67k
    free(v->shared.s); free(v->indiv.s);
1863
4.67k
    memset(&v->d,0,sizeof(v->d));
1864
4.67k
    memset(&v->shared,0,sizeof(v->shared));
1865
4.67k
    memset(&v->indiv,0,sizeof(v->indiv));
1866
4.67k
}
1867
1868
void bcf_destroy(bcf1_t *v)
1869
4.67k
{
1870
4.67k
    if (!v) return;
1871
4.67k
    bcf_empty1(v);
1872
4.67k
    free(v);
1873
4.67k
}
1874
1875
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
1876
472
{
1877
472
    uint8_t x[32];
1878
472
    ssize_t ret;
1879
472
    uint32_t shared_len, indiv_len;
1880
472
    if ((ret = bgzf_read(fp, x, 32)) != 32) {
1881
22
        if (ret == 0) return -1;
1882
8
        return -2;
1883
22
    }
1884
450
    bcf_clear1(v);
1885
450
    shared_len = le_to_u32(x);
1886
450
    if (shared_len < 24) return -2;
1887
450
    shared_len -= 24; // to exclude six 32-bit integers
1888
450
    indiv_len = le_to_u32(x + 4);
1889
450
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1890
    // ks_resize() normally allocates 1.5 * requested size to allow for growth
1891
450
    if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2;
1892
440
#endif
1893
440
    if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2;
1894
440
    if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2;
1895
440
    v->rid  = le_to_i32(x + 8);
1896
440
    v->pos  = le_to_u32(x + 12);
1897
440
    if ( v->pos==UINT32_MAX ) v->pos = -1;  // this is for telomere coordinate, e.g. MT:0
1898
440
    v->rlen = le_to_i32(x + 16);
1899
440
    v->qual = le_to_float(x + 20);
1900
440
    v->n_info = le_to_u16(x + 24);
1901
440
    v->n_allele = le_to_u16(x + 26);
1902
440
    v->n_sample = le_to_u32(x + 28) & 0xffffff;
1903
440
    v->n_fmt = x[31];
1904
440
    v->shared.l = shared_len;
1905
440
    v->indiv.l = indiv_len;
1906
    // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
1907
440
    if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
1908
1909
440
    if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
1910
406
    if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
1911
404
    return 0;
1912
406
}
1913
1914
0
#define bit_array_size(n) ((n)/8+1)
1915
0
#define bit_array_set(a,i)   ((a)[(i)/8] |=   1 << ((i)%8))
1916
0
#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1917
0
#define bit_array_test(a,i)  ((a)[(i)/8] &   (1 << ((i)%8)))
1918
1919
static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1920
4.82k
                                   int32_t *val) {
1921
4.82k
    uint32_t t;
1922
4.82k
    if (end - p < 2) return -1;
1923
4.80k
    t = *p++ & 0xf;
1924
    /* Use if .. else if ... else instead of switch to force order.  Assumption
1925
       is that small integers are more frequent than big ones. */
1926
4.80k
    if (t == BCF_BT_INT8) {
1927
2.35k
        *val = *(int8_t *) p++;
1928
2.44k
    } else {
1929
2.44k
        if (end - p < (1<<bcf_type_shift[t])) return -1;
1930
2.42k
        if (t == BCF_BT_INT16) {
1931
1.60k
            *val = le_to_i16(p);
1932
1.60k
            p += 2;
1933
1.60k
        } else if (t == BCF_BT_INT32) {
1934
672
            *val = le_to_i32(p);
1935
672
            p += 4;
1936
#ifdef VCF_ALLOW_INT64
1937
        } else if (t == BCF_BT_INT64) {
1938
            // This case should never happen because there should be no
1939
            // 64-bit BCFs at all, definitely not coming from htslib
1940
            *val = le_to_i64(p);
1941
            p += 8;
1942
#endif
1943
672
        } else {
1944
144
            return -1;
1945
144
        }
1946
2.42k
    }
1947
4.63k
    *q = p;
1948
4.63k
    return 0;
1949
4.80k
}
1950
1951
static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1952
16.8k
                             int *num, int *type) {
1953
16.8k
    int r;
1954
16.8k
    if (p >= end) return -1;
1955
16.7k
    *type = *p & 0xf;
1956
16.7k
    if (*p>>4 != 15) {
1957
16.3k
        *q = p + 1;
1958
16.3k
        *num = *p >> 4;
1959
16.3k
        return 0;
1960
16.3k
    }
1961
404
    r = bcf_dec_typed_int1_safe(p + 1, end, q, num);
1962
404
    if (r) return r;
1963
336
    return *num >= 0 ? 0 : -1;
1964
404
}
1965
1966
556
static const char *get_type_name(int type) {
1967
556
    const char *types[9] = {
1968
556
        "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
1969
556
        "unknown", "float", "unknown", "char", "unknown"
1970
556
    };
1971
556
    int t = (type >= 0 && type < 8) ? type : 8;
1972
556
    return types[t];
1973
556
}
1974
1975
/**
1976
 *  updatephasing - updates 1st phasing based on other phasing status
1977
 *  @param p - pointer to phase value array
1978
 *  @param end - end of array
1979
 *  @param q - pointer to consumed data
1980
 *  @param samples - no. of samples in array
1981
 *  @param ploidy - no. of phasing values per sample
1982
 *  @param type - value type (one of BCF_BT_...)
1983
 *  Returns 0 on success and 1 on failure
1984
 *  Update for haploids made only if it is not unknown (.)
1985
 */
1986
static int updatephasing(uint8_t *p, uint8_t *end, uint8_t **q, int samples, int ploidy, int type)
1987
0
{
1988
0
    int j, k;
1989
0
    unsigned int inc = 1 << bcf_type_shift[type];
1990
0
    ptrdiff_t bytes = samples * ploidy * inc;
1991
1992
0
    if (samples < 0 || ploidy < 0 || end - p < bytes)
1993
0
        return 1;
1994
1995
    /*
1996
     * This works because phasing is stored in the least-significant bit
1997
     * of the GT encoding, and the data is always stored little-endian.
1998
     * Thus it's possible to get the desired result by doing bit operations
1999
     * on the least-significant byte of each value and ignoring the
2000
     * higher bytes (for 16-bit and 32-bit values).
2001
     */
2002
2003
0
    switch (ploidy) {
2004
0
    case 1:
2005
        // Trivial case - haploid data is phased by default
2006
0
        for (j = 0; j < samples; ++j) {
2007
0
            if (*p) *p |= 1;    //only if not unknown (.)
2008
0
            p += inc;
2009
0
        }
2010
0
        break;
2011
0
    case 2:
2012
        // Mostly trivial case - first is phased if second is.
2013
0
        for (j = 0; j < samples; ++j) {
2014
0
            *p |= (p[inc] & 1);
2015
0
            p += 2 * inc;
2016
0
        }
2017
0
        break;
2018
0
    default:
2019
        // Generic case - first is phased if all other alleles are.
2020
0
        for (j = 0; j < samples; ++j) {
2021
0
            uint8_t allphased = 1;
2022
0
            for (k = 1; k < ploidy; ++k)
2023
0
                allphased &= (p[inc * k]);
2024
0
            *p |= allphased;
2025
0
            p += ploidy * inc;
2026
0
        }
2027
0
    }
2028
0
    *q = p;
2029
0
    return 0;
2030
0
}
2031
2032
static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
2033
2.25k
                                 char *type, uint32_t *reports, int i) {
2034
2.25k
    if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG)
2035
86
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
2036
2.25k
                        ": Invalid FORMAT %s %d",
2037
2.25k
                        bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
2038
2.25k
    (*reports)++;
2039
2.25k
}
2040
2041
404
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
2042
404
    uint8_t *ptr, *end;
2043
404
    size_t bytes;
2044
404
    uint32_t err = 0;
2045
404
    int type = 0;
2046
404
    int num  = 0;
2047
404
    uint32_t i, reports;
2048
404
    const uint32_t is_integer = ((1 << BCF_BT_INT8)  |
2049
404
                                 (1 << BCF_BT_INT16) |
2050
#ifdef VCF_ALLOW_INT64
2051
                                 (1 << BCF_BT_INT64) |
2052
#endif
2053
404
                                 (1 << BCF_BT_INT32));
2054
404
    const uint32_t is_valid_type = (is_integer          |
2055
404
                                    (1 << BCF_BT_NULL)  |
2056
404
                                    (1 << BCF_BT_FLOAT) |
2057
404
                                    (1 << BCF_BT_CHAR));
2058
404
    int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
2059
    /* set phasing for 1st allele as in v44 for versions upto v43, to have
2060
    consistent binary values irrespective of version; not run for v >= v44,
2061
    to retain explicit phasing in v44 and higher */
2062
404
    int idgt = hdr ?
2063
404
                    bcf_get_version(hdr, NULL) < VCF44 ?
2064
404
                        bcf_hdr_id2int(hdr, BCF_DT_ID, "GT") : -1 :
2065
404
                    -1;
2066
2067
    // Check for valid contig ID
2068
404
    if (rec->rid < 0
2069
302
        || (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
2070
340
                    || hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
2071
340
        hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
2072
340
        err |= BCF_ERR_CTG_INVALID;
2073
340
    }
2074
2075
    // Check ID
2076
404
    ptr = (uint8_t *) rec->shared.s;
2077
404
    end = ptr + rec->shared.l;
2078
404
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2079
396
    if (type != BCF_BT_CHAR) {
2080
334
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
2081
334
        err |= BCF_ERR_TAG_INVALID;
2082
334
    }
2083
396
    bytes = (size_t) num << bcf_type_shift[type];
2084
396
    if (end - ptr < bytes) goto bad_shared;
2085
390
    ptr += bytes;
2086
2087
    // Check REF and ALT
2088
390
    if (rec->n_allele < 1) {
2089
184
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele",
2090
184
                        bcf_seqname_safe(hdr,rec), rec->pos+1);
2091
184
        err |= BCF_ERR_TAG_UNDEF;
2092
184
    }
2093
2094
390
    reports = 0;
2095
12.1k
    for (i = 0; i < rec->n_allele; i++) {
2096
11.7k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2097
11.7k
        if (type != BCF_BT_CHAR) {
2098
11.3k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2099
142
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
2100
11.3k
            err |= BCF_ERR_CHAR;
2101
11.3k
        }
2102
11.7k
        bytes = (size_t) num << bcf_type_shift[type];
2103
11.7k
        if (end - ptr < bytes) goto bad_shared;
2104
11.7k
        ptr += bytes;
2105
11.7k
    }
2106
2107
    // Check FILTER
2108
322
    reports = 0;
2109
322
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2110
306
    if (num > 0) {
2111
168
        bytes = (size_t) num << bcf_type_shift[type];
2112
168
        if (((1 << type) & is_integer) == 0) {
2113
68
            hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
2114
68
            err |= BCF_ERR_TAG_INVALID;
2115
68
            if (end - ptr < bytes) goto bad_shared;
2116
62
            ptr += bytes;
2117
100
        } else {
2118
100
            if (end - ptr < bytes) goto bad_shared;
2119
3.72k
            for (i = 0; i < num; i++) {
2120
3.62k
                int32_t key = bcf_dec_int1(ptr, type, &ptr);
2121
3.62k
                if (key < 0
2122
2.77k
                    || (hdr && (key >= max_id
2123
2.51k
                                || hdr->id[BCF_DT_ID][key].key == NULL))) {
2124
2.51k
                    if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2125
88
                        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
2126
2.51k
                    err |= BCF_ERR_TAG_UNDEF;
2127
2.51k
                }
2128
3.62k
            }
2129
98
        }
2130
168
    }
2131
2132
    // Check INFO
2133
298
    reports = 0;
2134
298
    bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
2135
2.30k
    for (i = 0; i < rec->n_info; i++) {
2136
2.14k
        int32_t key = -1;
2137
2.14k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
2138
2.04k
        if (key < 0 || (hdr && (key >= max_id
2139
1.60k
                                || id_tmp[key].key == NULL))) {
2140
1.60k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2141
92
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
2142
1.60k
            err |= BCF_ERR_TAG_UNDEF;
2143
1.60k
        }
2144
2.04k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2145
2.01k
        if (((1 << type) & is_valid_type) == 0
2146
1.95k
            || (type == BCF_BT_NULL && num > 0)) {
2147
80
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2148
12
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
2149
80
            err |= BCF_ERR_TAG_INVALID;
2150
80
        }
2151
2.01k
        bytes = (size_t) num << bcf_type_shift[type];
2152
2.01k
        if (end - ptr < bytes) goto bad_shared;
2153
2.00k
        ptr += bytes;
2154
2.00k
    }
2155
2156
    // Check FORMAT and individual information
2157
162
    ptr = (uint8_t *) rec->indiv.s;
2158
162
    end = ptr + rec->indiv.l;
2159
162
    reports = 0;
2160
2.34k
    for (i = 0; i < rec->n_fmt; i++) {
2161
2.28k
        int32_t key = -1;
2162
2.28k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
2163
2.25k
        if (key < 0
2164
2.13k
            || (hdr && (key >= max_id
2165
2.09k
                        || id_tmp[key].key == NULL))) {
2166
2.09k
            bcf_record_check_err(hdr, rec, "id", &reports, key);
2167
2.09k
            err |= BCF_ERR_TAG_UNDEF;
2168
2.09k
        }
2169
2.25k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
2170
2.22k
        if (((1 << type) & is_valid_type) == 0
2171
2.12k
            || (type == BCF_BT_NULL && num > 0)) {
2172
154
            bcf_record_check_err(hdr, rec, "type", &reports, type);
2173
154
            err |= BCF_ERR_TAG_INVALID;
2174
154
        }
2175
2.22k
        if (idgt >= 0 && idgt == key) {
2176
            // check first GT phasing bit and fix up if necessary
2177
0
            if (updatephasing(ptr, end, &ptr, rec->n_sample, num, type)) {
2178
0
                err |= BCF_ERR_TAG_INVALID;
2179
0
            }
2180
2.22k
        } else {
2181
2.22k
            bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
2182
2.22k
            if (end - ptr < bytes) goto bad_indiv;
2183
2.18k
            ptr += bytes;
2184
2.18k
        }
2185
2.22k
    }
2186
2187
64
    if (!err && rec->rlen < 0) {
2188
        // Treat bad rlen as a warning instead of an error, and try to
2189
        // fix up by using the length of the stored REF allele.
2190
16
        static int warned = 0;
2191
16
        if (!warned) {
2192
1
            hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
2193
1
                            "Only one invalid RLEN will be reported.",
2194
1
                            bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
2195
1
            warned = 1;
2196
1
        }
2197
        //find rlen considering reflen, END, SVLEN, fmt LEN
2198
16
        hts_pos_t len = get_rlen(hdr, rec);
2199
16
        rec->rlen = len >= 0 ? len : 0;
2200
16
    }
2201
2202
64
    rec->errcode |= err;
2203
2204
64
    return err ? -2 : 0; // Return -2 so bcf_read() reports an error
2205
2206
242
 bad_shared:
2207
242
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
2208
242
    return -2;
2209
2210
98
 bad_indiv:
2211
98
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
2212
98
    return -2;
2213
162
}
2214
2215
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
2216
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
2217
0
{
2218
0
    if ( !hdr->keep_samples ) return 0;
2219
0
    if ( !bcf_hdr_nsamples(hdr) )
2220
0
    {
2221
0
        rec->indiv.l = rec->n_sample = 0;
2222
0
        return 0;
2223
0
    }
2224
2225
0
    int i, j;
2226
0
    uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
2227
0
    bcf_dec_t *dec = &rec->d;
2228
0
    hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
2229
0
    for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
2230
2231
0
    for (i=0; i<rec->n_fmt; i++)
2232
0
    {
2233
0
        ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
2234
0
        src = dec->fmt[i].p - dec->fmt[i].size;
2235
0
        if ( dst )
2236
0
        {
2237
0
            memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
2238
0
            dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
2239
0
        }
2240
0
        dst = dec->fmt[i].p;
2241
0
        for (j=0; j<hdr->nsamples_ori; j++)
2242
0
        {
2243
0
            src += dec->fmt[i].size;
2244
0
            if ( !bit_array_test(hdr->keep_samples,j) ) continue;
2245
0
            memmove(dst, src, dec->fmt[i].size);
2246
0
            dst += dec->fmt[i].size;
2247
0
        }
2248
0
        rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
2249
0
        dec->fmt[i].p_len = dst - dec->fmt[i].p;
2250
0
    }
2251
0
    rec->unpacked |= BCF_UN_FMT;
2252
2253
0
    rec->n_sample = bcf_hdr_nsamples(hdr);
2254
0
    return 0;
2255
0
}
2256
2257
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2258
35.5k
{
2259
35.5k
    if (fp->format.format == vcf) return vcf_read(fp, h, v);
2260
472
    if (!h)
2261
0
        h = (const bcf_hdr_t *) bgzf_get_private_data(fp->fp.bgzf);
2262
472
    int ret = bcf_read1_core(fp->fp.bgzf, v);
2263
472
    if (ret == 0) ret = bcf_record_check(h, v);
2264
472
    if ( ret!=0 || !h->keep_samples ) return ret;
2265
0
    return bcf_subset_format(h,v);
2266
472
}
2267
2268
int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
2269
0
{
2270
0
    bcf1_t *v = (bcf1_t *) vv;
2271
0
    const bcf_hdr_t *hdr = (const bcf_hdr_t *) bgzf_get_private_data(fp);
2272
0
    int ret = bcf_read1_core(fp, v);
2273
0
    if (ret == 0) ret = bcf_record_check(hdr, v);
2274
0
    if (ret  >= 0)
2275
0
        *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
2276
0
    return ret;
2277
0
}
2278
2279
static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
2280
0
{
2281
    // single typed string
2282
0
    if ( line->d.id && strcmp(line->d.id, ".") ) {
2283
0
        return bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
2284
0
    } else {
2285
0
        return bcf_enc_size(str, 0, BCF_BT_CHAR);
2286
0
    }
2287
0
}
2288
static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
2289
0
{
2290
    // list of typed strings
2291
0
    int i;
2292
0
    for (i=0; i<line->n_allele; i++) {
2293
0
        if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0)
2294
0
            return -1;
2295
0
    }
2296
0
    if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2297
0
    return 0;
2298
0
}
2299
static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
2300
0
{
2301
    // typed vector of integers
2302
0
    if ( line->d.n_flt ) {
2303
0
        return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
2304
0
    } else {
2305
0
        return bcf_enc_vint(str, 0, 0, -1);
2306
0
    }
2307
0
}
2308
2309
static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
2310
0
{
2311
    // pairs of typed vectors
2312
0
    int i, irm = -1, e = 0;
2313
0
    for (i=0; i<line->n_info; i++)
2314
0
    {
2315
0
        bcf_info_t *info = &line->d.info[i];
2316
0
        if ( !info->vptr )
2317
0
        {
2318
            // marked for removal
2319
0
            if ( irm < 0 ) irm = i;
2320
0
            continue;
2321
0
        }
2322
0
        e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0;
2323
0
        if ( irm >=0 )
2324
0
        {
2325
0
            bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
2326
0
            while ( irm<=i && line->d.info[irm].vptr ) irm++;
2327
0
        }
2328
0
    }
2329
0
    if ( irm>=0 ) line->n_info = irm;
2330
0
    return e == 0 ? 0 : -1;
2331
0
}
2332
2333
static int bcf1_sync(bcf1_t *line)
2334
23
{
2335
23
    char *shared_ori = line->shared.s;
2336
23
    size_t prev_len;
2337
2338
23
    kstring_t tmp = {0,0,0};
2339
23
    if ( !line->shared.l )
2340
0
    {
2341
        // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
2342
0
        tmp = line->shared;
2343
0
        bcf1_sync_id(line, &tmp);
2344
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2345
2346
0
        bcf1_sync_alleles(line, &tmp);
2347
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2348
2349
0
        bcf1_sync_filter(line, &tmp);
2350
0
        line->unpack_size[2] = tmp.l - prev_len;
2351
2352
0
        bcf1_sync_info(line, &tmp);
2353
0
        line->shared = tmp;
2354
0
    }
2355
23
    else if ( line->d.shared_dirty )
2356
0
    {
2357
        // The line was edited, update the BCF data block.
2358
2359
0
        if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
2360
2361
        // ptr_ori points to the original unchanged BCF data.
2362
0
        uint8_t *ptr_ori = (uint8_t *) line->shared.s;
2363
2364
        // ID: single typed string
2365
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ID )
2366
0
            bcf1_sync_id(line, &tmp);
2367
0
        else
2368
0
            kputsn_(ptr_ori, line->unpack_size[0], &tmp);
2369
0
        ptr_ori += line->unpack_size[0];
2370
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2371
2372
        // REF+ALT: list of typed strings
2373
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
2374
0
            bcf1_sync_alleles(line, &tmp);
2375
0
        else
2376
0
        {
2377
0
            kputsn_(ptr_ori, line->unpack_size[1], &tmp);
2378
0
            if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2379
0
        }
2380
0
        ptr_ori += line->unpack_size[1];
2381
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2382
2383
0
        if ( line->unpacked & BCF_UN_FLT )
2384
0
        {
2385
            // FILTER: typed vector of integers
2386
0
            if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
2387
0
                bcf1_sync_filter(line, &tmp);
2388
0
            else if ( line->d.n_flt )
2389
0
                kputsn_(ptr_ori, line->unpack_size[2], &tmp);
2390
0
            else
2391
0
                bcf_enc_vint(&tmp, 0, 0, -1);
2392
0
            ptr_ori += line->unpack_size[2];
2393
0
            line->unpack_size[2] = tmp.l - prev_len;
2394
2395
0
            if ( line->unpacked & BCF_UN_INFO )
2396
0
            {
2397
                // INFO: pairs of typed vectors
2398
0
                if ( line->d.shared_dirty & BCF1_DIRTY_INF )
2399
0
                {
2400
0
                    bcf1_sync_info(line, &tmp);
2401
0
                    ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
2402
0
                }
2403
0
            }
2404
0
        }
2405
2406
0
        int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
2407
0
        if ( size ) kputsn_(ptr_ori, size, &tmp);
2408
2409
0
        free(line->shared.s);
2410
0
        line->shared = tmp;
2411
0
    }
2412
23
    if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
2413
0
    {
2414
        // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
2415
0
        size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
2416
0
        int i;
2417
0
        for (i=0; i<line->n_info; i++)
2418
0
        {
2419
0
            uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
2420
0
            line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
2421
0
            off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
2422
0
            if ( vptr_free )
2423
0
            {
2424
0
                free(vptr_free);
2425
0
                line->d.info[i].vptr_free = 0;
2426
0
            }
2427
0
        }
2428
0
    }
2429
2430
23
    if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
2431
0
    {
2432
        // The genotype fields changed or are not present
2433
0
        tmp.l = tmp.m = 0; tmp.s = NULL;
2434
0
        int i, irm = -1;
2435
0
        for (i=0; i<line->n_fmt; i++)
2436
0
        {
2437
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
2438
0
            if ( !fmt->p )
2439
0
            {
2440
                // marked for removal
2441
0
                if ( irm < 0 ) irm = i;
2442
0
                continue;
2443
0
            }
2444
0
            kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
2445
0
            if ( irm >=0 )
2446
0
            {
2447
0
                bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
2448
0
                while ( irm<=i && line->d.fmt[irm].p ) irm++;
2449
0
            }
2450
2451
0
        }
2452
0
        if ( irm>=0 ) line->n_fmt = irm;
2453
0
        free(line->indiv.s);
2454
0
        line->indiv = tmp;
2455
2456
        // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
2457
0
        size_t off_new = 0;
2458
0
        for (i=0; i<line->n_fmt; i++)
2459
0
        {
2460
0
            uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
2461
0
            line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
2462
0
            off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
2463
0
            if ( p_free )
2464
0
            {
2465
0
                free(p_free);
2466
0
                line->d.fmt[i].p_free = 0;
2467
0
            }
2468
0
        }
2469
0
    }
2470
23
    if ( !line->n_sample ) line->n_fmt = 0;
2471
23
    line->d.shared_dirty = line->d.indiv_dirty = 0;
2472
23
    return 0;
2473
23
}
2474
2475
bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
2476
0
{
2477
0
    bcf1_sync(src);
2478
2479
0
    bcf_clear(dst);
2480
0
    dst->rid  = src->rid;
2481
0
    dst->pos  = src->pos;
2482
0
    dst->rlen = src->rlen;
2483
0
    dst->qual = src->qual;
2484
0
    dst->n_info = src->n_info; dst->n_allele = src->n_allele;
2485
0
    dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
2486
2487
0
    if ( dst->shared.m < src->shared.l )
2488
0
    {
2489
0
        dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l);
2490
0
        dst->shared.m = src->shared.l;
2491
0
    }
2492
0
    dst->shared.l = src->shared.l;
2493
0
    memcpy(dst->shared.s,src->shared.s,dst->shared.l);
2494
2495
0
    if ( dst->indiv.m < src->indiv.l )
2496
0
    {
2497
0
        dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l);
2498
0
        dst->indiv.m = src->indiv.l;
2499
0
    }
2500
0
    dst->indiv.l = src->indiv.l;
2501
0
    memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
2502
2503
0
    return dst;
2504
0
}
2505
bcf1_t *bcf_dup(bcf1_t *src)
2506
0
{
2507
0
    bcf1_t *out = bcf_init1();
2508
0
    return bcf_copy(out, src);
2509
0
}
2510
2511
int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
2512
32.8k
{
2513
32.8k
    if ( h->dirty ) {
2514
0
        if (bcf_hdr_sync(h) < 0) return -1;
2515
0
    }
2516
32.8k
    if ( bcf_hdr_nsamples(h)!=v->n_sample )
2517
84
    {
2518
84
        hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
2519
84
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
2520
84
        return -1;
2521
84
    }
2522
2523
32.7k
    if ( hfp->format.format == vcf || hfp->format.format == text_format )
2524
30.8k
        return vcf_write(hfp,h,v);
2525
2526
1.90k
    if ( v->errcode & ~BCF_ERR_LIMITS ) // todo: unsure about the other BCF_ERR_LIMITS branches in vcf_parse_format_alloc4()
2527
1.87k
    {
2528
        // vcf_parse1() encountered a new contig or tag, undeclared in the
2529
        // header.  At this point, the header must have been printed,
2530
        // proceeding would lead to a broken BCF file. Errors must be checked
2531
        // and cleared by the caller before we can proceed.
2532
1.87k
        char errdescription[1024] = "";
2533
1.87k
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1);
2534
1.87k
        return -1;
2535
1.87k
    }
2536
23
    bcf1_sync(v);   // check if the BCF record was modified
2537
2538
23
    if ( v->unpacked & BCF_IS_64BIT )
2539
0
    {
2540
0
        hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
2541
0
        return -1;
2542
0
    }
2543
2544
23
    BGZF *fp = hfp->fp.bgzf;
2545
23
    uint8_t x[32];
2546
23
    u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
2547
23
    u32_to_le(v->indiv.l, x + 4);
2548
23
    i32_to_le(v->rid, x + 8);
2549
23
    u32_to_le(v->pos, x + 12);
2550
23
    u32_to_le(v->rlen, x + 16);
2551
23
    float_to_le(v->qual, x + 20);
2552
23
    u16_to_le(v->n_info, x + 24);
2553
23
    u16_to_le(v->n_allele, x + 26);
2554
23
    u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28);
2555
23
    if ( bgzf_write(fp, x, 32) != 32 ) return -1;
2556
23
    if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
2557
23
    if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
2558
2559
23
    if (hfp->idx) {
2560
0
        if (bgzf_idx_push(fp, hfp->idx, v->rid, v->pos, v->pos + v->rlen,
2561
0
                          bgzf_tell(fp), 1) < 0)
2562
0
            return -1;
2563
0
    }
2564
2565
23
    return 0;
2566
23
}
2567
2568
/**********************
2569
 *** VCF header I/O ***
2570
 **********************/
2571
2572
0
static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
2573
0
    bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t));
2574
0
    int save_errno;
2575
0
    if (!hrec) goto fail;
2576
2577
0
    hrec->key = strdup("contig");
2578
0
    if (!hrec->key) goto fail;
2579
2580
0
    if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail;
2581
0
    if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0)
2582
0
        goto fail;
2583
0
    if (bcf_hdr_add_hrec(h, hrec) < 0)
2584
0
        goto fail;
2585
0
    return 0;
2586
2587
0
 fail:
2588
0
    save_errno = errno;
2589
0
    hts_log_error("%s", strerror(errno));
2590
0
    if (hrec) bcf_hrec_destroy(hrec);
2591
0
    errno = save_errno;
2592
0
    return -1;
2593
0
}
2594
2595
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
2596
5.75k
{
2597
5.75k
    kstring_t txt, *s = &fp->line;
2598
5.75k
    int ret;
2599
5.75k
    bcf_hdr_t *h;
2600
5.75k
    tbx_t *idx = NULL;
2601
5.75k
    const char **names = NULL;
2602
5.75k
    h = bcf_hdr_init("r");
2603
5.75k
    if (!h) {
2604
0
        hts_log_error("Failed to allocate bcf header");
2605
0
        return NULL;
2606
0
    }
2607
5.75k
    txt.l = txt.m = 0; txt.s = 0;
2608
93.0k
    while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
2609
91.9k
        int e = 0;
2610
91.9k
        if (s->l == 0) continue;
2611
87.8k
        if (s->s[0] != '#') {
2612
18
            hts_log_error("No sample line");
2613
18
            goto error;
2614
18
        }
2615
87.8k
        if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
2616
0
            kstring_t tmp = { 0, 0, NULL };
2617
0
            hFILE *f = hopen(fp->fn_aux, "r");
2618
0
            if (f == NULL) {
2619
0
                hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
2620
0
                goto error;
2621
0
            }
2622
0
            while (tmp.l = 0, khgetline(&tmp, f) >= 0) {
2623
0
                char *tab = strchr(tmp.s, '\t');
2624
0
                if (tab == NULL) continue;
2625
0
                e |= (kputs("##contig=<ID=", &txt) < 0);
2626
0
                e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0);
2627
0
                e |= (kputs(",length=", &txt) < 0);
2628
0
                e |= (kputl(atol(tab), &txt) < 0);
2629
0
                e |= (kputsn(">\n", 2, &txt) < 0);
2630
0
            }
2631
0
            free(tmp.s);
2632
0
            if (hclose(f) != 0) {
2633
0
                hts_log_error("Error on closing %s", fp->fn_aux);
2634
0
                goto error;
2635
0
            }
2636
0
            if (e) goto error;
2637
0
        }
2638
87.8k
        if (kputsn(s->s, s->l, &txt) < 0) goto error;
2639
87.8k
        if (kputc('\n', &txt) < 0) goto error;
2640
87.8k
        if (s->s[1] != '#') break;
2641
87.8k
    }
2642
5.73k
    if ( ret < -1 ) goto error;
2643
5.72k
    if ( !txt.s )
2644
0
    {
2645
0
        hts_log_error("Could not read the header");
2646
0
        goto error;
2647
0
    }
2648
5.72k
    if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
2649
2650
    // check tabix index, are all contigs listed in the header? add the missing ones
2651
4.24k
    idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL);
2652
4.24k
    if ( idx )
2653
0
    {
2654
0
        int i, n, need_sync = 0;
2655
0
        names = tbx_seqnames(idx, &n);
2656
0
        if (!names) goto error;
2657
0
        for (i=0; i<n; i++)
2658
0
        {
2659
0
            bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
2660
0
            if ( hrec ) continue;
2661
0
            if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
2662
0
            need_sync = 1;
2663
0
        }
2664
0
        if ( need_sync ) {
2665
0
            if (bcf_hdr_sync(h) < 0) goto error;
2666
0
        }
2667
0
        free(names);
2668
0
        tbx_destroy(idx);
2669
0
    }
2670
4.24k
    free(txt.s);
2671
4.24k
    return h;
2672
2673
1.50k
 error:
2674
1.50k
    if (idx) tbx_destroy(idx);
2675
1.50k
    free(names);
2676
1.50k
    free(txt.s);
2677
1.50k
    if (h) bcf_hdr_destroy(h);
2678
1.50k
    return NULL;
2679
4.24k
}
2680
2681
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
2682
0
{
2683
0
    int i = 0, n = 0, save_errno;
2684
0
    char **lines = hts_readlines(fname, &n);
2685
0
    if ( !lines ) return 1;
2686
0
    for (i=0; i<n-1; i++)
2687
0
    {
2688
0
        int k;
2689
0
        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
2690
0
        if (!hrec) goto fail;
2691
0
        if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
2692
0
            bcf_hrec_destroy(hrec);
2693
0
            goto fail;
2694
0
        }
2695
0
        free(lines[i]);
2696
0
        lines[i] = NULL;
2697
0
    }
2698
0
    if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
2699
0
    if (bcf_hdr_sync(hdr) < 0) goto fail;
2700
0
    free(lines[n-1]);
2701
0
    free(lines);
2702
0
    return 0;
2703
2704
0
 fail:
2705
0
    save_errno = errno;
2706
0
    for (; i < n; i++)
2707
0
        free(lines[i]);
2708
0
    free(lines);
2709
0
    errno = save_errno;
2710
0
    return 1;
2711
0
}
2712
2713
static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
2714
15.2k
{
2715
15.2k
    uint32_t e = 0;
2716
15.2k
    if ( !hrec->value )
2717
8.42k
    {
2718
8.42k
        int j, nout = 0;
2719
8.42k
        e |= ksprintf(str, "##%s=<", hrec->key) < 0;
2720
33.4k
        for (j=0; j<hrec->nkeys; j++)
2721
24.9k
        {
2722
            // do not output IDX if output is VCF
2723
24.9k
            if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
2724
21.1k
            if ( nout ) e |= kputc(',',str) < 0;
2725
21.1k
            e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0;
2726
21.1k
            nout++;
2727
21.1k
        }
2728
8.42k
        e |= ksprintf(str,">\n") < 0;
2729
8.42k
    }
2730
6.81k
    else
2731
6.81k
        e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
2732
2733
15.2k
    return e == 0 ? 0 : -1;
2734
15.2k
}
2735
2736
int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
2737
0
{
2738
0
    return _bcf_hrec_format(hrec,0,str);
2739
0
}
2740
2741
int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
2742
4.67k
{
2743
4.67k
    int i, r = 0;
2744
19.9k
    for (i=0; i<hdr->nhrec; i++)
2745
15.2k
        r |= _bcf_hrec_format(hdr->hrec[i], is_bcf, str) < 0;
2746
2747
4.67k
    r |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0;
2748
4.67k
    if ( bcf_hdr_nsamples(hdr) )
2749
1.44k
    {
2750
1.44k
        r |= ksprintf(str, "\tFORMAT") < 0;
2751
8.97k
        for (i=0; i<bcf_hdr_nsamples(hdr); i++)
2752
7.53k
            r |= ksprintf(str, "\t%s", hdr->samples[i]) < 0;
2753
1.44k
    }
2754
4.67k
    r |= ksprintf(str, "\n") < 0;
2755
2756
4.67k
    return r ? -1 : 0;
2757
4.67k
}
2758
2759
char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
2760
0
{
2761
0
    kstring_t txt = {0,0,0};
2762
0
    if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
2763
0
        return NULL;
2764
0
    if ( len ) *len = txt.l;
2765
0
    return txt.s;
2766
0
}
2767
2768
const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
2769
0
{
2770
0
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2771
0
    int i, tid, m = kh_size(d);
2772
0
    const char **names = (const char**) calloc(m,sizeof(const char*));
2773
0
    if ( !names )
2774
0
    {
2775
0
        hts_log_error("Failed to allocate memory");
2776
0
        *n = 0;
2777
0
        return NULL;
2778
0
    }
2779
0
    khint_t k;
2780
0
    for (k=kh_begin(d); k<kh_end(d); k++)
2781
0
    {
2782
0
        if ( !kh_exist(d,k) ) continue;
2783
0
        if ( !kh_val(d, k).hrec[0] ) continue;  // removed via bcf_hdr_remove
2784
0
        tid = kh_val(d,k).id;
2785
0
        if ( tid >= m )
2786
0
        {
2787
            // This can happen after a contig has been removed from BCF header via bcf_hdr_remove()
2788
0
            if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 )
2789
0
            {
2790
0
                hts_log_error("Failed to allocate memory");
2791
0
                *n = 0;
2792
0
                free(names);
2793
0
                return NULL;
2794
0
            }
2795
0
            m = tid + 1;
2796
0
        }
2797
0
        names[tid] = kh_key(d,k);
2798
0
    }
2799
    // ensure there are no gaps
2800
0
    for (i=0,tid=0; tid<m; i++,tid++)
2801
0
    {
2802
0
        while ( tid<m && !names[tid] ) tid++;
2803
0
        if ( tid==m ) break;
2804
0
        if ( i==tid ) continue;
2805
0
        names[i] = names[tid];
2806
0
        names[tid] = 0;
2807
0
    }
2808
0
    *n = i;
2809
0
    return names;
2810
0
}
2811
2812
int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
2813
2.33k
{
2814
2.33k
    kstring_t htxt = {0,0,0};
2815
2.33k
    if (bcf_hdr_format(h, 0, &htxt) < 0) {
2816
0
        free(htxt.s);
2817
0
        return -1;
2818
0
    }
2819
2.33k
    while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
2820
2.33k
    int ret;
2821
2.33k
    if ( fp->format.compression!=no_compression ) {
2822
0
        ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
2823
0
        if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2824
2.33k
    } else {
2825
2.33k
        ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
2826
2.33k
    }
2827
2.33k
    free(htxt.s);
2828
2.33k
    return ret<0 ? -1 : 0;
2829
2.33k
}
2830
2831
/***********************
2832
 *** Typed value I/O ***
2833
 ***********************/
2834
2835
int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
2836
213k
{
2837
213k
    int32_t max = INT32_MIN, min = INT32_MAX;
2838
213k
    int i;
2839
213k
    if (n <= 0) {
2840
2.75k
        return bcf_enc_size(s, 0, BCF_BT_NULL);
2841
210k
    } else if (n == 1) {
2842
29.9k
        return bcf_enc_int1(s, a[0]);
2843
180k
    } else {
2844
180k
        if (wsize <= 0) wsize = n;
2845
2846
        // Equivalent to:
2847
        // for (i = 0; i < n; ++i) {
2848
        //     if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
2849
        //         continue;
2850
        //     if (max < a[i]) max = a[i];
2851
        //     if (min > a[i]) min = a[i];
2852
        // }
2853
180k
        int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN};
2854
180k
        int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX};
2855
24.9M
        for (i = 0; i < (n&~3); i+=4) {
2856
            // bcf_int32_missing    == INT32_MIN and
2857
            // bcf_int32_vector_end == INT32_MIN+1.
2858
            // We skip these, but can mostly avoid explicit checking
2859
24.7M
            if (max4[0] < a[i+0]) max4[0] = a[i+0];
2860
24.7M
            if (max4[1] < a[i+1]) max4[1] = a[i+1];
2861
24.7M
            if (max4[2] < a[i+2]) max4[2] = a[i+2];
2862
24.7M
            if (max4[3] < a[i+3]) max4[3] = a[i+3];
2863
24.7M
            if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0];
2864
24.7M
            if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1];
2865
24.7M
            if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2];
2866
24.7M
            if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3];
2867
24.7M
        }
2868
180k
        min = min4[0];
2869
180k
        if (min > min4[1]) min = min4[1];
2870
180k
        if (min > min4[2]) min = min4[2];
2871
180k
        if (min > min4[3]) min = min4[3];
2872
180k
        max = max4[0];
2873
180k
        if (max < max4[1]) max = max4[1];
2874
180k
        if (max < max4[2]) max = max4[2];
2875
180k
        if (max < max4[3]) max = max4[3];
2876
430k
        for (; i < n; ++i) {
2877
249k
            if (max < a[i]) max = a[i];
2878
249k
            if (min > a[i] && a[i] > INT32_MIN+1) min = a[i];
2879
249k
        }
2880
2881
180k
        if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
2882
20.4k
            if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
2883
20.4k
                ks_resize(s, s->l + n) < 0)
2884
0
                return -1;
2885
20.4k
            uint8_t *p = (uint8_t *) s->s + s->l;
2886
4.25M
            for (i = 0; i < n; ++i, p++) {
2887
4.23M
                if ( a[i]==bcf_int32_vector_end )   *p = bcf_int8_vector_end;
2888
4.22M
                else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing;
2889
105k
                else *p = a[i];
2890
4.23M
            }
2891
20.4k
            s->l += n;
2892
160k
        } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
2893
115k
            uint8_t *p;
2894
115k
            if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
2895
115k
                ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
2896
0
                return -1;
2897
115k
            p = (uint8_t *) s->s + s->l;
2898
44.0M
            for (i = 0; i < n; ++i)
2899
43.9M
            {
2900
43.9M
                int16_t x;
2901
43.9M
                if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
2902
43.9M
                else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
2903
875k
                else x = a[i];
2904
43.9M
                i16_to_le(x, p);
2905
43.9M
                p += sizeof(int16_t);
2906
43.9M
            }
2907
115k
            s->l += n * sizeof(int16_t);
2908
115k
        } else {
2909
45.2k
            uint8_t *p;
2910
45.2k
            if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
2911
45.2k
                ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
2912
0
                return -1;
2913
45.2k
            p = (uint8_t *) s->s + s->l;
2914
51.1M
            for (i = 0; i < n; ++i) {
2915
51.0M
                i32_to_le(a[i], p);
2916
51.0M
                p += sizeof(int32_t);
2917
51.0M
            }
2918
45.2k
            s->l += n * sizeof(int32_t);
2919
45.2k
        }
2920
180k
    }
2921
2922
180k
    return 0;
2923
213k
}
2924
2925
#ifdef VCF_ALLOW_INT64
2926
static int bcf_enc_long1(kstring_t *s, int64_t x) {
2927
    uint32_t e = 0;
2928
    if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
2929
        return bcf_enc_int1(s, x);
2930
    if (x == bcf_int64_vector_end) {
2931
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2932
        e |= kputc(bcf_int8_vector_end, s) < 0;
2933
    } else if (x == bcf_int64_missing) {
2934
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2935
        e |= kputc(bcf_int8_missing, s) < 0;
2936
    } else {
2937
        e |= bcf_enc_size(s, 1, BCF_BT_INT64);
2938
        e |= ks_expand(s, 8);
2939
        if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; }
2940
    }
2941
    return e == 0 ? 0 : -1;
2942
}
2943
#endif
2944
2945
432k
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
2946
432k
    uint8_t *p;
2947
432k
    size_t i;
2948
432k
    size_t bytes = n * sizeof(float);
2949
2950
432k
    if (bytes / sizeof(float) != n) return -1;
2951
432k
    if (ks_resize(s, s->l + bytes) < 0) return -1;
2952
2953
432k
    p = (uint8_t *) s->s + s->l;
2954
81.5M
    for (i = 0; i < n; i++) {
2955
81.0M
        float_to_le(a[i], p);
2956
81.0M
        p += sizeof(float);
2957
81.0M
    }
2958
432k
    s->l += bytes;
2959
2960
432k
    return 0;
2961
432k
}
2962
2963
int bcf_enc_vfloat(kstring_t *s, int n, float *a)
2964
432k
{
2965
432k
    assert(n >= 0);
2966
432k
    bcf_enc_size(s, n, BCF_BT_FLOAT);
2967
432k
    serialize_float_array(s, n, a);
2968
432k
    return 0; // FIXME: check for errs in this function
2969
432k
}
2970
2971
int bcf_enc_vchar(kstring_t *s, int l, const char *a)
2972
2.78M
{
2973
2.78M
    bcf_enc_size(s, l, BCF_BT_CHAR);
2974
2.78M
    kputsn(a, l, s);
2975
2.78M
    return 0; // FIXME: check for errs in this function
2976
2.78M
}
2977
2978
// Special case of n==1 as it also occurs quite often in FORMAT data.
2979
// This version is also small enough to get inlined.
2980
5.87k
static inline int bcf_fmt_array1(kstring_t *s, int type, void *data) {
2981
5.87k
    uint32_t e = 0;
2982
5.87k
    uint8_t *p = (uint8_t *)data;
2983
5.87k
    int32_t v;
2984
2985
    // helps gcc more than clang here. In billions of cycles:
2986
    //          bcf_fmt_array1  bcf_fmt_array
2987
    // gcc7:    23.2            24.3
2988
    // gcc13:   21.6            23.0
2989
    // clang13: 27.1            27.8
2990
5.87k
    switch (type) {
2991
5.87k
    case BCF_BT_CHAR:
2992
5.87k
        e |= kputc_(*p == bcf_str_missing ? '.' : *p, s) < 0;
2993
5.87k
        break;
2994
2995
0
    case BCF_BT_INT8:
2996
0
        if (*(int8_t *)p != bcf_int8_vector_end) {
2997
0
            e |= ((*(int8_t *)p == bcf_int8_missing)
2998
0
                  ? kputc_('.', s)
2999
0
                  : kputw(*(int8_t *)p, s)) < 0;
3000
0
        }
3001
0
        break;
3002
0
    case BCF_BT_INT16:
3003
0
        v = le_to_i16(p);
3004
0
        if (v != bcf_int16_vector_end) {
3005
0
            e |= (v == bcf_int16_missing
3006
0
                  ? kputc_('.', s)
3007
0
                  : kputw(v, s)) < 0;
3008
0
        }
3009
0
        break;
3010
3011
0
    case BCF_BT_INT32:
3012
0
        v = le_to_i32(p);
3013
0
        if (v != bcf_int32_vector_end) {
3014
0
            e |= (v == bcf_int32_missing
3015
0
                  ? kputc_('.', s)
3016
0
                  : kputw(v, s)) < 0;
3017
0
        }
3018
0
        break;
3019
3020
0
    case BCF_BT_FLOAT:
3021
0
        v = le_to_u32(p);
3022
0
        if (v != bcf_float_vector_end) {
3023
0
            e |= (v == bcf_float_missing
3024
0
                  ? kputc_('.', s)
3025
0
                  : kputd(le_to_float(p), s)) < 0;
3026
0
        }
3027
0
        break;
3028
3029
0
    default:
3030
0
        hts_log_error("Unexpected type %d", type);
3031
0
        return -1;
3032
5.87k
    }
3033
3034
5.87k
    return e == 0 ? 0 : -1;
3035
5.87k
}
3036
3037
int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
3038
2.33M
{
3039
2.33M
    int j = 0;
3040
2.33M
    uint32_t e = 0;
3041
2.33M
    if (n == 0) {
3042
1.40M
        return kputc_('.', s) >= 0 ? 0 : -1;
3043
1.40M
    }
3044
3045
925k
    if (type == BCF_BT_CHAR)
3046
284k
    {
3047
284k
        char *p = (char *)data;
3048
3049
        // Note bcf_str_missing is already accounted for in n==0 above.
3050
284k
        if (n >= 8) {
3051
66.7k
            char *p_end = memchr(p, 0, n);
3052
66.7k
            e |= kputsn(p, p_end ? p_end-p : n, s) < 0;
3053
218k
        } else {
3054
771k
            for (j = 0; j < n && *p; ++j, ++p)
3055
553k
               e |= kputc(*p, s) < 0;
3056
218k
        }
3057
284k
    }
3058
640k
    else
3059
640k
    {
3060
640k
        #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
3061
640k
            uint8_t *p = (uint8_t *) data; \
3062
89.1M
            for (j=0; j<n; j++, p += sizeof(type_t))    \
3063
88.5M
            { \
3064
88.5M
                type_t v = convert(p); \
3065
88.5M
                if ( is_vector_end ) break; \
3066
88.5M
                if ( j ) e |= kputc_(',', s) < 0; \
3067
88.5M
                e |= (is_missing ? kputc('.', s) : kprint) < 0; \
3068
88.5M
            } \
3069
640k
        }
3070
640k
        switch (type) {
3071
180k
            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, v==bcf_int8_missing,  v==bcf_int8_vector_end,  kputw(v, s)); break;
3072
116k
            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
3073
128k
            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
3074
215k
            case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
3075
0
            default: hts_log_error("Unexpected type %d", type); exit(1); break;
3076
640k
        }
3077
640k
        #undef BRANCH
3078
640k
    }
3079
925k
    return e == 0 ? 0 : -1;
3080
925k
}
3081
3082
uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
3083
1.56M
{
3084
1.56M
    int x, type;
3085
1.56M
    x = bcf_dec_size(ptr, &ptr, &type);
3086
1.56M
    bcf_fmt_array(s, x, type, ptr);
3087
1.56M
    return ptr + (x << bcf_type_shift[type]);
3088
1.56M
}
3089
3090
/********************
3091
 *** VCF site I/O ***
3092
 ********************/
3093
3094
typedef struct {
3095
    int key;            // Key for h->id[BCF_DT_ID][key] vdict
3096
    int max_m;          // number of elements in field array (ie commas)
3097
    int size;           // field size (max_l or max_g*4 if is_gt)
3098
    int offset;         // offset of buf into h->mem
3099
    uint32_t is_gt:1,   // is genotype
3100
             max_g:31;  // maximum number of genotypes
3101
    uint32_t max_l;     // length of field
3102
    uint32_t y;         // h->id[0][fmt[j].key].val->info[BCF_HL_FMT]
3103
    uint8_t *buf;       // Pointer into h->mem
3104
} fmt_aux_t;
3105
3106
// fmt_aux_t field notes:
3107
// max_* are biggest sizes of the various FORMAT fields across all samples.
3108
// We use these after pivoting the data to ensure easy random access
3109
// of a specific sample.
3110
//
3111
// max_m is only used for type BCF_HT_REAL or BCF_HT_INT
3112
// max_g is only used for is_gt == 1 (will be BCF_HT_STR)
3113
// max_l is only used for is_gt == 0 (will be BCF_HT_STR)
3114
//
3115
// These are computed in vcf_parse_format_max3 and used in
3116
// vcf_parse_format_alloc4 to get the size.
3117
//
3118
// size is computed from max_g, max_l, max_m and is_gt.  Once computed
3119
// the max values are never accessed again.
3120
//
3121
// In theory all 4 vars could be coalesced into a single variable, but this
3122
// significantly harms speed (even if done via a union).  It's about 25-30%
3123
// slower.
3124
3125
static inline int align_mem(kstring_t *s)
3126
81.7k
{
3127
81.7k
    int e = 0;
3128
81.7k
    if (s->l&7) {
3129
10.7k
        uint64_t zero = 0;
3130
10.7k
        e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0;
3131
10.7k
    }
3132
81.7k
    return e == 0 ? 0 : -1;
3133
81.7k
}
3134
3135
82.9k
#define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
3136
3137
// detect FORMAT "."
3138
static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3139
10.9k
                                   const char *p, const char *q) {
3140
10.9k
    const char *end = s->s + s->l;
3141
10.9k
    if ( q>=end )
3142
57
    {
3143
57
        hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
3144
57
        v->errcode |= BCF_ERR_NCOLS;
3145
57
        return -1;
3146
57
    }
3147
3148
10.8k
    v->n_fmt = 0;
3149
10.8k
    if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
3150
242
    {
3151
242
        v->n_sample = bcf_hdr_nsamples(h);
3152
242
        return 1;
3153
242
    }
3154
3155
10.6k
    return 0;
3156
10.8k
}
3157
3158
// get format information from the dictionary
3159
static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3160
10.6k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3161
10.6k
    const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3162
10.6k
    char *t;
3163
10.6k
    int j;
3164
10.6k
    ks_tokaux_t aux1;
3165
3166
93.4k
    for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
3167
82.9k
        if (j >= MAX_N_FMT) {
3168
3
            v->errcode |= BCF_ERR_LIMITS;
3169
3
            hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
3170
3
                bcf_seqname_safe(h,v), v->pos+1);
3171
3
            return -1;
3172
3
        }
3173
3174
82.8k
        *(char*)aux1.p = 0;
3175
82.8k
        khint_t k = kh_get(vdict, d, t);
3176
82.8k
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
3177
6.16k
            if ( t[0]=='.' && t[1]==0 )
3178
3
            {
3179
3
                hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3180
3
                v->errcode |= BCF_ERR_TAG_INVALID;
3181
3
                return -1;
3182
3
            }
3183
6.16k
            hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);
3184
6.16k
            kstring_t tmp = {0,0,0};
3185
6.16k
            int l;
3186
6.16k
            ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
3187
6.16k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3188
6.16k
            free(tmp.s);
3189
6.16k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3190
6.16k
            if (res < 0) bcf_hrec_destroy(hrec);
3191
6.16k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3192
3193
6.16k
            k = kh_get(vdict, d, t);
3194
6.16k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3195
6.16k
            if (res || k == kh_end(d)) {
3196
12
                hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3197
12
                v->errcode |= BCF_ERR_TAG_INVALID;
3198
12
                return -1;
3199
12
            }
3200
6.16k
        }
3201
82.8k
        fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
3202
82.8k
        fmt[j].key = kh_val(d, k).id;
3203
82.8k
        fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]);
3204
82.8k
        fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
3205
82.8k
        v->n_fmt++;
3206
82.8k
    }
3207
10.5k
    return 0;
3208
10.6k
}
3209
3210
// compute max
3211
static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3212
10.5k
                                 char *p, char *q, fmt_aux_t *fmt) {
3213
10.5k
    int n_sample_ori = -1;
3214
10.5k
    char *r = q + 1;  // r: position in the format string
3215
10.5k
    int l = 0, m = 1, g = 1, j;
3216
10.5k
    v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
3217
10.5k
    const char *end = s->s + s->l;
3218
3219
27.8k
    while ( r<end )
3220
27.7k
    {
3221
        // can we skip some samples?
3222
27.7k
        if ( h->keep_samples )
3223
0
        {
3224
0
            n_sample_ori++;
3225
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3226
0
            {
3227
0
                while ( *r!='\t' && r<end ) r++;
3228
0
                if ( *r=='\t' ) { *r = 0; r++; }
3229
0
                continue;
3230
0
            }
3231
0
        }
3232
3233
        // collect fmt stats: max vector size, length, number of alleles
3234
27.7k
        j = 0;  // j-th format field
3235
27.7k
        fmt_aux_t *f = fmt;
3236
27.7k
        static char meta[256] = {
3237
            // \0 \t , / : |
3238
27.7k
            1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3239
27.7k
            0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1, 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
3240
27.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3241
27.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
3242
27.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3243
27.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3244
27.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3245
27.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3246
27.7k
        };
3247
3248
27.7k
        char *r_start = r;
3249
2.69M
        for (;;) {
3250
            // Quickly skip ahead to an appropriate meta-character
3251
3.37M
            while (!meta[(unsigned char)*r]) r++;
3252
3253
2.69M
            switch (*r) {
3254
2.63M
            case ',':
3255
2.63M
                m++;
3256
2.63M
                break;
3257
3258
1.62k
            case '|':
3259
18.8k
            case '/':
3260
18.8k
                if (f->is_gt) g++;
3261
18.8k
                break;
3262
3263
13.0k
            case '\t':
3264
13.0k
                *r = 0; // fall through
3265
3266
13.0k
            default: // valid due to while loop above.
3267
27.7k
            case '\0':
3268
44.1k
            case ':':
3269
44.1k
                l = r - r_start; r_start = r;
3270
44.1k
                if (f->max_m < m) f->max_m = m;
3271
44.1k
                if (f->max_l < l) f->max_l = l;
3272
44.1k
                if (f->is_gt && f->max_g < g) f->max_g = g;
3273
44.1k
                l = 0, m = g = 1;
3274
44.1k
                if ( *r==':' ) {
3275
16.4k
                    j++; f++;
3276
16.4k
                    if ( j>=v->n_fmt ) {
3277
24
                        hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
3278
24
                                      h->id[BCF_DT_CTG][v->rid].key, v->pos+1);
3279
24
                        v->errcode |= BCF_ERR_NCOLS;
3280
24
                        return -1;
3281
24
                    }
3282
27.7k
                } else goto end_for;
3283
16.3k
                break;
3284
2.69M
            }
3285
2.66M
            if ( r>=end ) break;
3286
2.66M
            r++;
3287
2.66M
        }
3288
27.7k
    end_for:
3289
27.7k
        v->n_sample++;
3290
27.7k
        if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
3291
17.2k
        r++;
3292
17.2k
    }
3293
3294
10.5k
    return 0;
3295
10.5k
}
3296
3297
// allocate memory for arrays
3298
static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3299
                                   const char *p, const char *q,
3300
10.5k
                                   fmt_aux_t *fmt) {
3301
10.5k
    kstring_t *mem = (kstring_t*)&h->mem;
3302
3303
10.5k
    int j;
3304
92.2k
    for (j = 0; j < v->n_fmt; ++j) {
3305
81.7k
        fmt_aux_t *f = &fmt[j];
3306
81.7k
        if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
3307
3308
81.7k
        if ((f->y>>4&0xf) == BCF_HT_STR) {
3309
81.7k
            f->size = f->is_gt? f->max_g << 2 : f->max_l;
3310
81.7k
        } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
3311
0
            f->size = f->max_m << 2;
3312
0
        } else {
3313
0
            hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3314
0
            v->errcode |= BCF_ERR_TAG_INVALID;
3315
0
            return -1;
3316
0
        }
3317
3318
81.7k
        if (align_mem(mem) < 0) {
3319
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3320
0
            v->errcode |= BCF_ERR_LIMITS;
3321
0
            return -1;
3322
0
        }
3323
3324
        // Limit the total memory to ~2Gb per VCF row.  This should mean
3325
        // malformed VCF data is less likely to take excessive memory and/or
3326
        // time.
3327
81.7k
        if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
3328
0
            static int warned = 0;
3329
0
            if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3330
0
            warned = 1;
3331
0
            v->errcode |= BCF_ERR_LIMITS;
3332
0
            f->size = -1;
3333
0
            f->offset = 0;
3334
0
            continue;
3335
0
        }
3336
3337
81.7k
        f->offset = mem->l;
3338
81.7k
        if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
3339
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3340
0
            v->errcode |= BCF_ERR_LIMITS;
3341
0
            return -1;
3342
0
        }
3343
81.7k
        mem->l += v->n_sample * f->size;
3344
81.7k
    }
3345
3346
10.5k
    {
3347
10.5k
        int j;
3348
92.2k
        for (j = 0; j < v->n_fmt; ++j)
3349
81.7k
            fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
3350
10.5k
    }
3351
3352
    // check for duplicate tags
3353
10.5k
    int i;
3354
81.7k
    for (i=1; i<v->n_fmt; i++)
3355
71.1k
    {
3356
71.1k
        fmt_aux_t *ifmt = &fmt[i];
3357
71.1k
        if ( ifmt->size==-1 ) continue; // already marked for removal
3358
390k
        for (j=0; j<i; j++)
3359
374k
        {
3360
374k
            fmt_aux_t *jfmt = &fmt[j];
3361
374k
            if ( jfmt->size==-1 ) continue; // already marked for removal
3362
174k
            if ( ifmt->key!=jfmt->key ) continue;
3363
55.2k
            static int warned = 0;
3364
55.2k
            if ( !warned ) hts_log_warning("Duplicate FORMAT tag %s at %s:%"PRIhts_pos, bcf_hdr_int2id(h,BCF_DT_ID,ifmt->key), bcf_seqname_safe(h,v), v->pos+1);
3365
55.2k
            warned = 1;
3366
55.2k
            v->errcode |= BCF_ERR_TAG_INVALID;
3367
55.2k
            ifmt->size = -1;
3368
55.2k
            ifmt->offset = 0;
3369
55.2k
            break;
3370
174k
        }
3371
71.1k
    }
3372
10.5k
    return 0;
3373
10.5k
}
3374
3375
// Fill the sample fields
3376
static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3377
10.5k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3378
10.5k
    static int extreme_val_warned = 0;
3379
10.5k
    int n_sample_ori = -1;
3380
    // At beginning of the loop t points to the first char of a format
3381
10.5k
    const char *t = q + 1;
3382
10.5k
    int m = 0;   // m: sample id
3383
10.5k
    const int nsamples = bcf_hdr_nsamples(h);
3384
10.5k
    const char *end = s->s + s->l;
3385
3386
10.5k
    int ver = bcf_get_version(h, NULL);
3387
3388
38.0k
    while ( t<end )
3389
36.1k
    {
3390
        // can we skip some samples?
3391
36.1k
        if ( h->keep_samples )
3392
0
        {
3393
0
            n_sample_ori++;
3394
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3395
0
            {
3396
0
                while ( *t && t<end ) t++;
3397
0
                t++;
3398
0
                continue;
3399
0
            }
3400
0
        }
3401
36.1k
        if ( m == nsamples ) break;
3402
3403
27.5k
        int j = 0; // j-th format field, m-th sample
3404
43.6k
        while ( t < end )
3405
43.1k
        {
3406
43.1k
            fmt_aux_t *z = &fmt[j++];
3407
43.1k
            const int htype = z->y>>4&0xf;
3408
43.1k
            if (!z->buf) {
3409
7
                hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
3410
7
                              z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3411
7
                v->errcode |= BCF_ERR_LIMITS;
3412
7
                return -1;
3413
7
            }
3414
3415
43.1k
            if ( z->size==-1 )
3416
5.42k
            {
3417
                // this field is to be ignored, it's either too big or a duplicate
3418
43.7k
                while ( *t != ':' && *t ) t++;
3419
5.42k
            }
3420
37.7k
            else if (htype == BCF_HT_STR) {
3421
37.7k
                int l;
3422
37.7k
                if (z->is_gt) {
3423
                    // Genotypes.
3424
                    //([/|])?<val>)([|/]<val>)+... where <val> is [0-9]+ or ".".
3425
6.40k
                    int32_t is_phased = 0;
3426
6.40k
                    uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
3427
6.40k
                    uint32_t unreadable = 0;
3428
6.40k
                    uint32_t max = 0;
3429
6.40k
                    int overflow = 0, ploidy = 0, anyunphased = 0, \
3430
6.40k
                        phasingprfx = 0, unknown1 = 0;
3431
3432
                    /* with prefixed phasing, it is explicitly given for 1st one
3433
                    with non-prefixed, set based on ploidy and phasing of other
3434
                    alleles. */
3435
6.40k
                    if (ver >= VCF44 && (*t == '|' || *t == '/')) {
3436
                        // cache prefix and phasing status
3437
753
                        is_phased = *t++ == '|';
3438
753
                        phasingprfx = 1;
3439
753
                    }
3440
3441
22.0k
                    for (l = 0;; ++t) {
3442
22.0k
                        ploidy++;
3443
22.0k
                        if (*t == '.') {
3444
4.16k
                            ++t, x[l++] = is_phased;
3445
4.16k
                            if (l==1) {   //for 1st allele only
3446
811
                                unknown1 = 1;
3447
811
                            }
3448
17.9k
                        } else {
3449
17.9k
                            const char *tt = t;
3450
17.9k
                            uint32_t val;
3451
                            // Or "v->n_allele < 10", but it doesn't
3452
                            // seem to be any faster and this feels safer.
3453
17.9k
                            if (*t >= '0' && *t <= '9' &&
3454
17.0k
                                !(t[1] >= '0' && t[1] <= '9')) {
3455
9.19k
                                val = *t++ - '0';
3456
9.19k
                            } else {
3457
8.71k
                                val = hts_str2uint(t, (char **)&t,
3458
8.71k
                                                   sizeof(val) * CHAR_MAX - 2,
3459
8.71k
                                                   &overflow);
3460
8.71k
                                unreadable |= tt == t;
3461
8.71k
                            }
3462
17.9k
                            if (max < val) max = val;
3463
17.9k
                            x[l++] = (val + 1) << 1 | is_phased;
3464
17.9k
                        }
3465
22.0k
                        anyunphased |= (ploidy != 1) && !is_phased;
3466
22.0k
                        is_phased = (*t == '|');
3467
22.0k
                        if (*t != '|' && *t != '/') break;
3468
22.0k
                    }
3469
6.40k
                    if (!phasingprfx) { //get GT in v44 way when no prefixed phasing
3470
                        /* no explicit phasing for 1st allele, set based on
3471
                         other alleles and ploidy */
3472
5.65k
                        if (ploidy == 1) {  //implicitly phased
3473
1.31k
                            if (!unknown1) {
3474
810
                                x[0] |= 1;
3475
810
                            }
3476
4.34k
                        } else {            //set by other unphased alleles
3477
4.34k
                            x[0] |= (anyunphased)? 0 : 1;
3478
4.34k
                        }
3479
5.65k
                    }
3480
                    // Possibly check max against v->n_allele instead?
3481
6.40k
                    if (overflow || max > (INT32_MAX >> 1) - 1) {
3482
60
                        hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3483
60
                        return -1;
3484
60
                    }
3485
6.34k
                    if (unreadable) {
3486
43
                        hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3487
43
                        return -1;
3488
43
                    }
3489
6.30k
                    if ( !l ) x[l++] = 0;   // An empty field, insert missing value
3490
8.88k
                    for (; l < z->size>>2; ++l)
3491
2.58k
                        x[l] = bcf_int32_vector_end;
3492
3493
31.2k
                } else {
3494
                    // Otherwise arbitrary strings
3495
31.2k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3496
2.91M
                    for (l = 0; *t != ':' && *t; ++t)
3497
2.88M
                        x[l++] = *t;
3498
31.2k
                    if (z->size > l)
3499
18.2k
                        memset(&x[l], 0, (z->size-l) * sizeof(*x));
3500
31.2k
                }
3501
3502
37.7k
            } else if (htype == BCF_HT_INT) {
3503
                // One or more integers in an array
3504
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3505
0
                int l;
3506
0
                for (l = 0;; ++t) {
3507
0
                    if (*t == '.') {
3508
0
                        x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
3509
0
                    } else {
3510
0
                        int overflow = 0;
3511
0
                        char *te;
3512
0
                        long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3513
0
                        if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3514
0
                        {
3515
0
                            if ( !extreme_val_warned )
3516
0
                            {
3517
0
                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,
3518
0
                                                h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
3519
0
                                extreme_val_warned = 1;
3520
0
                            }
3521
0
                            tmp_val = bcf_int32_missing;
3522
0
                        }
3523
0
                        x[l++] = tmp_val;
3524
0
                        t = te;
3525
0
                    }
3526
0
                    if (*t != ',') break;
3527
0
                }
3528
0
                if ( !l )
3529
0
                    x[l++] = bcf_int32_missing;
3530
0
                for (; l < z->size>>2; ++l)
3531
0
                    x[l] = bcf_int32_vector_end;
3532
3533
0
            } else if (htype == BCF_HT_REAL) {
3534
                // One of more floating point values in an array
3535
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3536
0
                int l;
3537
0
                for (l = 0;; ++t) {
3538
0
                    if (*t == '.' && !isdigit_c(t[1])) {
3539
0
                        bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
3540
0
                    } else {
3541
0
                        int overflow = 0;
3542
0
                        char *te;
3543
0
                        float tmp_val = hts_str2dbl(t, &te, &overflow);
3544
0
                        if ( (te==t || overflow) && !extreme_val_warned )
3545
0
                        {
3546
0
                            hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
3547
0
                            extreme_val_warned = 1;
3548
0
                        }
3549
0
                        x[l++] = tmp_val;
3550
0
                        t = te;
3551
0
                    }
3552
0
                    if (*t != ',') break;
3553
0
                }
3554
0
                if ( !l )
3555
                    // An empty field, insert missing value
3556
0
                    bcf_float_set_missing(x[l++]);
3557
0
                for (; l < z->size>>2; ++l)
3558
0
                    bcf_float_set_vector_end(x[l]);
3559
0
            } else {
3560
0
                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1);
3561
0
                v->errcode |= BCF_ERR_TAG_INVALID;
3562
0
                return -1;
3563
0
            }
3564
3565
43.0k
            if (*t == '\0') {
3566
26.9k
                break;
3567
26.9k
            }
3568
16.1k
            else if (*t == ':') {
3569
16.0k
                t++;
3570
16.0k
            }
3571
25
            else {
3572
25
                char buffer[8];
3573
25
                hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
3574
25
                    hts_strprint(buffer, sizeof buffer, '\'', t, 1),
3575
25
                    h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
3576
25
                v->errcode |= BCF_ERR_CHAR;
3577
25
                return -1;
3578
25
            }
3579
43.0k
        }
3580
3581
        // fill end-of-vector values
3582
394k
        for (; j < v->n_fmt; ++j) {
3583
367k
            fmt_aux_t *z = &fmt[j];
3584
367k
            const int htype = z->y>>4&0xf;
3585
367k
            int l;
3586
3587
367k
            if (z->size == -1) // this field is to be ignored
3588
300k
                continue;
3589
3590
67.0k
            if (htype == BCF_HT_STR) {
3591
67.0k
                if (z->is_gt) {
3592
11.3k
                    int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3593
11.3k
                    if (z->size) x[0] = bcf_int32_missing;
3594
33.7k
                    for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3595
55.7k
                } else {
3596
55.7k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3597
55.7k
                    if ( z->size ) {
3598
9.56k
                        x[0] = '.';
3599
9.56k
                        memset(&x[1], 0, (z->size-1) * sizeof(*x));
3600
9.56k
                    }
3601
55.7k
                }
3602
67.0k
            } else if (htype == BCF_HT_INT) {
3603
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3604
0
                x[0] = bcf_int32_missing;
3605
0
                for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3606
0
            } else if (htype == BCF_HT_REAL) {
3607
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3608
0
                bcf_float_set_missing(x[0]);
3609
0
                for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
3610
0
            }
3611
67.0k
        }
3612
3613
27.4k
        m++; t++;
3614
27.4k
    }
3615
3616
10.4k
    return 0;
3617
10.5k
}
3618
3619
// write individual genotype information
3620
static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3621
10.4k
                                const char *p, const char *q, fmt_aux_t *fmt) {
3622
10.4k
    kstring_t *str = &v->indiv;
3623
10.4k
    int i, need_downsize = 0;
3624
10.4k
    if (v->n_sample > 0) {
3625
89.1k
        for (i = 0; i < v->n_fmt; ++i) {
3626
78.7k
            fmt_aux_t *z = &fmt[i];
3627
78.7k
            if ( z->size==-1 ) {
3628
52.7k
                need_downsize = 1;
3629
52.7k
                continue;
3630
52.7k
            }
3631
25.9k
            bcf_enc_int1(str, z->key);
3632
25.9k
            if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
3633
20.8k
                bcf_enc_size(str, z->size, BCF_BT_CHAR);
3634
20.8k
                kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
3635
20.8k
            } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
3636
5.10k
                bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
3637
5.10k
            } else {
3638
0
                bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
3639
0
                if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
3640
0
                                          (float *) z->buf) != 0) {
3641
0
                    v->errcode |= BCF_ERR_LIMITS;
3642
0
                    hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3643
0
                    return -1;
3644
0
                }
3645
0
            }
3646
25.9k
        }
3647
3648
10.4k
    }
3649
10.4k
    if ( need_downsize ) {
3650
3.85k
        i = 0;
3651
69.6k
        while ( i < v->n_fmt ) {
3652
65.8k
            if ( fmt[i].size==-1 )
3653
52.7k
            {
3654
52.7k
                v->n_fmt--;
3655
52.7k
                if ( i < v->n_fmt ) memmove(&fmt[i],&fmt[i+1],sizeof(*fmt)*(v->n_fmt-i));
3656
52.7k
            }
3657
13.1k
            else
3658
13.1k
                i++;
3659
65.8k
        }
3660
3.85k
    }
3661
10.4k
    return 0;
3662
10.4k
}
3663
3664
// validity checking
3665
10.4k
static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
3666
10.4k
    if ( v->n_sample!=bcf_hdr_nsamples(h) )
3667
89
    {
3668
89
        hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
3669
89
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
3670
89
        v->errcode |= BCF_ERR_NCOLS;
3671
89
        return -1;
3672
89
    }
3673
10.3k
    if ( v->indiv.l > 0xffffffff )
3674
0
    {
3675
0
        hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
3676
0
        v->errcode |= BCF_ERR_LIMITS;
3677
3678
        // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
3679
0
        v->n_fmt = 0;
3680
0
        return -1;
3681
0
    }
3682
3683
10.3k
    return 0;
3684
10.3k
}
3685
3686
// p,q is the start and the end of the FORMAT field
3687
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3688
                            char *p, char *q)
3689
28.7k
{
3690
28.7k
    if ( !bcf_hdr_nsamples(h) ) return 0;
3691
10.9k
    kstring_t *mem = (kstring_t*)&h->mem;
3692
10.9k
    mem->l = 0;
3693
3694
10.9k
    fmt_aux_t fmt[MAX_N_FMT];
3695
3696
    // detect FORMAT "."
3697
10.9k
    int ret; // +ve = ok, -ve = err
3698
10.9k
    if ((ret = vcf_parse_format_empty1(s, h, v, p, q)))
3699
299
        return ret ? 0 : -1;
3700
3701
    // get format information from the dictionary
3702
10.6k
    if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
3703
18
        return -1;
3704
3705
    // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is
3706
    // stored as per-type arrays AAA... BBB... CCC...  This is basically
3707
    // a data rotation or pivot.
3708
3709
    // The size of elements in the array grow to their maximum needed,
3710
    // permitting fast random access.  This means however we have to first
3711
    // scan the whole FORMAT line to find the maximum of each type, and
3712
    // then scan it again to find the store the data.
3713
    // We break this down into compute-max, allocate, fill-out-buffers
3714
3715
    // TODO: ?
3716
    // The alternative would be to pivot on the first pass, with fixed
3717
    // size entries for numerics and concatenated strings otherwise, also
3718
    // tracking maximum sizes.  Then on a second pass we reallocate and
3719
    // copy the data again to a uniformly sized array.  Two passes through
3720
    // memory, but without doubling string parsing.
3721
3722
    // compute max
3723
10.5k
    if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
3724
24
        return -1;
3725
3726
    // allocate memory for arrays
3727
10.5k
    if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
3728
0
        return -1;
3729
3730
    // fill the sample fields; at beginning of the loop
3731
10.5k
    if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
3732
135
        return -1;
3733
3734
    // write individual genotype information
3735
10.4k
    if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
3736
0
        return -1;
3737
3738
    // validity checking
3739
10.4k
    if (vcf_parse_format_check7(h, v) < 0)
3740
89
        return -1;
3741
3742
10.3k
    return 0;
3743
10.4k
}
3744
3745
5.05k
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
3746
    // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
3747
    // been already printed, but will enable tools like vcfcheck to proceed.
3748
3749
5.05k
    kstring_t tmp = {0,0,0};
3750
5.05k
    khint_t k;
3751
5.05k
    int l;
3752
5.05k
    if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0)
3753
0
        return kh_end(d);
3754
5.05k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3755
5.05k
    free(tmp.s);
3756
5.05k
    int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3757
5.05k
    if (res < 0) bcf_hrec_destroy(hrec);
3758
5.05k
    if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3759
5.05k
    k = kh_get(vdict, d, p);
3760
3761
5.05k
    return k;
3762
5.05k
}
3763
3764
30.9k
static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3765
30.9k
    int i, n_flt = 1, max_n_flt = 0;
3766
30.9k
    char *r, *t;
3767
30.9k
    int32_t *a_flt = NULL;
3768
30.9k
    ks_tokaux_t aux1;
3769
30.9k
    khint_t k;
3770
30.9k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3771
    // count the number of filters
3772
30.9k
    if (*(q-1) == ';') *(q-1) = 0;
3773
341M
    for (r = p; *r; ++r)
3774
341M
        if (*r == ';') ++n_flt;
3775
30.9k
    if (n_flt > max_n_flt) {
3776
30.9k
        a_flt = hts_malloc_p(sizeof(*a_flt), n_flt);
3777
30.9k
        if (!a_flt) {
3778
0
            hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3779
0
            v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3780
0
            return -1;
3781
0
        }
3782
30.9k
        max_n_flt = n_flt;
3783
30.9k
    }
3784
    // add filters
3785
1.66M
    for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
3786
1.63M
        *(char*)aux1.p = 0;
3787
1.63M
        k = kh_get(vdict, d, t);
3788
1.63M
        if (k == kh_end(d))
3789
41.0k
        {
3790
            // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
3791
            // been already printed, but will enable tools like vcfcheck to proceed.
3792
41.0k
            hts_log_warning("FILTER '%s' is not defined in the header", t);
3793
41.0k
            kstring_t tmp = {0,0,0};
3794
41.0k
            int l;
3795
41.0k
            ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
3796
41.0k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3797
41.0k
            free(tmp.s);
3798
41.0k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3799
41.0k
            if (res < 0) bcf_hrec_destroy(hrec);
3800
41.0k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3801
41.0k
            k = kh_get(vdict, d, t);
3802
41.0k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3803
41.0k
            if (res || k == kh_end(d)) {
3804
58
                hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3805
58
                v->errcode |= BCF_ERR_TAG_INVALID;
3806
58
                free(a_flt);
3807
58
                return -1;
3808
58
            }
3809
41.0k
        }
3810
1.63M
        a_flt[i++] = kh_val(d, k).id;
3811
1.63M
    }
3812
3813
30.8k
    bcf_enc_vint(str, n_flt, a_flt, -1);
3814
30.8k
    free(a_flt);
3815
3816
30.8k
    return 0;
3817
30.9k
}
3818
3819
32.6k
static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3820
32.6k
    static int extreme_int_warned = 0, negative_rlen_warned = 0;
3821
32.6k
    int max_n_val = 0, overflow = 0;
3822
32.6k
    char *r, *key;
3823
32.6k
    khint_t k;
3824
32.6k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3825
32.6k
    int32_t *a_val = NULL;
3826
3827
32.6k
    v->n_info = 0;
3828
32.6k
    if (*(q-1) == ';') *(q-1) = 0;
3829
3.10M
    for (r = key = p;; ++r) {
3830
3.10M
        int c;
3831
3.10M
        char *val, *end;
3832
231M
        while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
3833
3.10M
        if (v->n_info == UINT16_MAX) {
3834
5
            hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
3835
5
                          bcf_seqname_safe(h,v), v->pos+1);
3836
5
            v->errcode |= BCF_ERR_LIMITS;
3837
5
            goto fail;
3838
5
        }
3839
3.10M
        val = end = NULL;
3840
3.10M
        c = *r; *r = 0;
3841
3.10M
        if (c == '=') {
3842
1.43M
            val = r + 1;
3843
3844
289M
            for (end = val; *end != ';' && *end != 0; ++end);
3845
1.43M
            c = *end; *end = 0;
3846
1.66M
        } else end = r;
3847
3.10M
        if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
3848
3.06M
        k = kh_get(vdict, d, key);
3849
3.06M
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
3850
28.5k
        {
3851
28.5k
            hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
3852
28.5k
            kstring_t tmp = {0,0,0};
3853
28.5k
            int l;
3854
28.5k
            ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
3855
28.5k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3856
28.5k
            free(tmp.s);
3857
28.5k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3858
28.5k
            if (res < 0) bcf_hrec_destroy(hrec);
3859
28.5k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3860
28.5k
            k = kh_get(vdict, d, key);
3861
28.5k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3862
28.5k
            if (res || k == kh_end(d)) {
3863
74
                hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
3864
74
                v->errcode |= BCF_ERR_TAG_INVALID;
3865
74
                goto fail;
3866
74
            }
3867
28.5k
        }
3868
3.06M
        uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
3869
3.06M
        ++v->n_info;
3870
3.06M
        bcf_enc_int1(str, kh_val(d, k).id);
3871
3.06M
        if (val == 0) {
3872
1.63M
            bcf_enc_size(str, 0, BCF_BT_NULL);
3873
1.63M
        } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
3874
119k
            bcf_enc_vchar(str, end - val, val);
3875
1.31M
        } else { // int/float value/array
3876
1.31M
            int i, n_val;
3877
1.31M
            char *t, *te;
3878
250M
            for (t = val, n_val = 1; *t; ++t) // count the number of values
3879
249M
                if (*t == ',') ++n_val;
3880
            // Check both int and float size in one step for simplicity
3881
1.31M
            if (n_val > max_n_val) {
3882
4.59k
                int32_t *a_tmp = hts_realloc_p(a_val, sizeof(*a_val), n_val);
3883
4.59k
                if (!a_tmp) {
3884
0
                    hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3885
0
                    v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3886
0
                    goto fail;
3887
0
                }
3888
4.59k
                a_val = a_tmp;
3889
4.59k
                max_n_val = n_val;
3890
4.59k
            }
3891
1.31M
            if ((y>>4&0xf) == BCF_HT_INT) {
3892
882k
                i = 0, t = val;
3893
882k
                int64_t val1;
3894
882k
                int is_int64 = 0;
3895
#ifdef VCF_ALLOW_INT64
3896
                if ( n_val==1 )
3897
                {
3898
                    overflow = 0;
3899
                    long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3900
                    if ( te==val ) tmp_val = bcf_int32_missing;
3901
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
3902
                    {
3903
                        if ( !extreme_int_warned )
3904
                        {
3905
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3906
                            extreme_int_warned = 1;
3907
                        }
3908
                        tmp_val = bcf_int32_missing;
3909
                    }
3910
                    else
3911
                        is_int64 = 1;
3912
                    val1 = tmp_val;
3913
                    t = te;
3914
                    i = 1;  // this is just to avoid adding another nested block...
3915
                }
3916
#endif
3917
100M
                for (; i < n_val; ++i, ++t)
3918
99.3M
                {
3919
99.3M
                    overflow = 0;
3920
99.3M
                    long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3921
99.3M
                    if ( te==t ) tmp_val = bcf_int32_missing;
3922
1.56M
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3923
185k
                    {
3924
185k
                        if ( !extreme_int_warned )
3925
1
                        {
3926
1
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3927
1
                            extreme_int_warned = 1;
3928
1
                        }
3929
185k
                        tmp_val = bcf_int32_missing;
3930
185k
                    }
3931
99.3M
                    a_val[i] = tmp_val;
3932
131M
                    for (t = te; *t && *t != ','; t++);
3933
99.3M
                }
3934
882k
                if (n_val == 1) {
3935
#ifdef VCF_ALLOW_INT64
3936
                    if ( is_int64 )
3937
                    {
3938
                        v->unpacked |= BCF_IS_64BIT;
3939
                        bcf_enc_long1(str, val1);
3940
                    }
3941
                    else
3942
                        bcf_enc_int1(str, (int32_t)val1);
3943
#else
3944
707k
                    val1 = a_val[0];
3945
707k
                    bcf_enc_int1(str, (int32_t)val1);
3946
707k
#endif
3947
707k
                } else {
3948
175k
                    bcf_enc_vint(str, n_val, a_val, -1);
3949
175k
                }
3950
882k
                if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
3951
588k
                    && memcmp(key, "END", 4) == 0)
3952
0
                {
3953
0
                    if ( val1 <= v->pos )
3954
0
                    {
3955
0
                        if ( !negative_rlen_warned )
3956
0
                        {
3957
0
                            hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
3958
0
                            negative_rlen_warned = 1;
3959
0
                        }
3960
0
                    }
3961
0
                }
3962
882k
            } else if ((y>>4&0xf) == BCF_HT_REAL) {
3963
432k
                float *val_f = (float *)a_val;
3964
81.5M
                for (i = 0, t = val; i < n_val; ++i, ++t)
3965
81.0M
                {
3966
81.0M
                    overflow = 0;
3967
81.0M
                    val_f[i] = hts_str2dbl(t, &te, &overflow);
3968
81.0M
                    if ( te==t || overflow ) // conversion failed
3969
80.0M
                        bcf_float_set_missing(val_f[i]);
3970
107M
                    for (t = te; *t && *t != ','; t++);
3971
81.0M
                }
3972
432k
                bcf_enc_vfloat(str, n_val, val_f);
3973
432k
            }
3974
1.31M
        }
3975
3.06M
        if (c == 0) break;
3976
3.05M
        r = end;
3977
3.05M
        key = r + 1;
3978
3.05M
    }
3979
3980
32.6k
    free(a_val);
3981
32.6k
    return 0;
3982
3983
79
 fail:
3984
79
    free(a_val);
3985
79
    return -1;
3986
32.6k
}
3987
3988
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
3989
33.9k
{
3990
33.9k
    int ret = -2, overflow = 0;
3991
33.9k
    char *p, *q, *r, *t;
3992
33.9k
    kstring_t *str;
3993
33.9k
    khint_t k;
3994
33.9k
    ks_tokaux_t aux;
3995
3996
//#define NOT_DOT(p) strcmp((p), ".")
3997
//#define NOT_DOT(p) (!(*p == '.' && !p[1]))
3998
//#define NOT_DOT(p) ((*p) != '.' || (p)[1])
3999
//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2))
4000
166k
#define NOT_DOT(p) (memcmp(p, ".\0", 2))
4001
4002
33.9k
    if (!s || !h || !v || !(s->s))
4003
0
        return ret;
4004
4005
    // Assumed in lots of places, but we may as well spot this early
4006
33.9k
    assert(sizeof(float) == sizeof(int32_t));
4007
4008
    // Ensure string we parse has space to permit some over-flow when during
4009
    // parsing.  Eg to do memcmp(key, "END", 4) in vcf_parse_info over
4010
    // the more straight forward looking strcmp, giving a speed advantage.
4011
33.9k
    if (ks_resize(s, s->l+4) < 0)
4012
0
        return -2;
4013
4014
    // Force our memory to be initialised so we avoid the technicality of
4015
    // undefined behaviour in using a 4-byte memcmp.  (The reality is this
4016
    // almost certainly is never detected by the compiler so has no impact,
4017
    // but equally so this code has minimal (often beneficial) impact on
4018
    // performance too.)
4019
33.9k
    s->s[s->l+0] = 0;
4020
33.9k
    s->s[s->l+1] = 0;
4021
33.9k
    s->s[s->l+2] = 0;
4022
33.9k
    s->s[s->l+3] = 0;
4023
4024
33.9k
    bcf_clear1(v);
4025
33.9k
    str = &v->shared;
4026
33.9k
    memset(&aux, 0, sizeof(ks_tokaux_t));
4027
4028
    // CHROM
4029
33.9k
    if (!(p = kstrtok(s->s, "\t", &aux)))
4030
0
        goto err;
4031
33.9k
    *(q = (char*)aux.p) = 0;
4032
4033
33.9k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
4034
33.9k
    k = kh_get(vdict, d, p);
4035
33.9k
    if (k == kh_end(d)) {
4036
5.05k
        hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
4037
5.05k
        v->errcode = BCF_ERR_CTG_UNDEF;
4038
5.05k
        if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
4039
63
            hts_log_error("Could not add dummy header for contig '%s'", p);
4040
63
            v->errcode |= BCF_ERR_CTG_INVALID;
4041
63
            goto err;
4042
63
        }
4043
5.05k
    }
4044
33.8k
    v->rid = kh_val(d, k).id;
4045
4046
    // POS
4047
33.8k
    if (!(p = kstrtok(0, 0, &aux)))
4048
219
        goto err;
4049
33.6k
    *(q = (char*)aux.p) = 0;
4050
4051
33.6k
    overflow = 0;
4052
33.6k
    char *tmp = p;
4053
33.6k
    v->pos = hts_str2uint(p, &p, 62, &overflow);
4054
33.6k
    if (overflow) {
4055
37
        hts_log_error("Position value '%s' is too large", tmp);
4056
37
        goto err;
4057
33.6k
    } else if ( *p ) {
4058
72
        hts_log_error("Could not parse the position '%s'", tmp);
4059
72
        goto err;
4060
33.5k
    } else {
4061
33.5k
        v->pos -= 1;
4062
33.5k
    }
4063
33.5k
    if (v->pos >= INT32_MAX)
4064
794
        v->unpacked |= BCF_IS_64BIT;
4065
4066
    // ID
4067
33.5k
    if (!(p = kstrtok(0, 0, &aux)))
4068
49
        goto err;
4069
33.5k
    *(q = (char*)aux.p) = 0;
4070
4071
33.5k
    if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p);
4072
81
    else bcf_enc_size(str, 0, BCF_BT_CHAR);
4073
4074
    // REF
4075
33.5k
    if (!(p = kstrtok(0, 0, &aux)))
4076
44
        goto err;
4077
33.4k
    *(q = (char*)aux.p) = 0;
4078
4079
33.4k
    bcf_enc_vchar(str, q - p, p);
4080
33.4k
    v->n_allele = 1, v->rlen = q - p;
4081
4082
    // ALT
4083
33.4k
    if (!(p = kstrtok(0, 0, &aux)))
4084
22
        goto err;
4085
33.4k
    *(q = (char*)aux.p) = 0;
4086
4087
33.4k
    if (NOT_DOT(p)) {
4088
67.2M
        for (r = t = p;; ++r) {
4089
67.2M
            if (*r == ',' || *r == 0) {
4090
2.59M
                if (v->n_allele == UINT16_MAX) {
4091
4
                    hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
4092
4
                                  bcf_seqname_safe(h,v), v->pos+1);
4093
4
                    v->errcode |= BCF_ERR_LIMITS;
4094
4
                    goto err;
4095
4
                }
4096
2.59M
                bcf_enc_vchar(str, r - t, t);
4097
2.59M
                t = r + 1;
4098
2.59M
                ++v->n_allele;
4099
2.59M
            }
4100
67.2M
            if (r == q) break;
4101
67.2M
        }
4102
32.9k
    }
4103
4104
    // QUAL
4105
33.4k
    if (!(p = kstrtok(0, 0, &aux)))
4106
70
        goto err;
4107
33.3k
    *(q = (char*)aux.p) = 0;
4108
4109
33.3k
    if (NOT_DOT(p)) v->qual = atof(p);
4110
591
    else bcf_float_set_missing(v->qual);
4111
33.3k
    if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
4112
4113
    // FILTER
4114
33.3k
    if (!(p = kstrtok(0, 0, &aux)))
4115
62
        goto err;
4116
33.3k
    *(q = (char*)aux.p) = 0;
4117
4118
33.3k
    if (NOT_DOT(p)) {
4119
30.9k
        if (vcf_parse_filter(str, h, v, p, q)) {
4120
58
            goto err;
4121
58
        }
4122
30.9k
    } else bcf_enc_vint(str, 0, 0, -1);
4123
33.2k
    if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
4124
4125
    // INFO
4126
33.2k
    if (!(p = kstrtok(0, 0, &aux)))
4127
94
        goto err;
4128
33.1k
    *(q = (char*)aux.p) = 0;
4129
4130
33.1k
    if (NOT_DOT(p)) {
4131
32.6k
        if (vcf_parse_info(str, h, v, p, q)) {
4132
79
            goto err;
4133
79
        }
4134
32.6k
    }
4135
33.0k
    if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
4136
4137
    // FORMAT; optional
4138
33.0k
    p = kstrtok(0, 0, &aux);
4139
33.0k
    if (p) {
4140
28.7k
        *(q = (char*)aux.p) = 0;
4141
4142
28.7k
        if (vcf_parse_format(s, h, v, p, q)) {
4143
266
            goto err;
4144
266
        }
4145
28.7k
    }
4146
4147
32.8k
 end:
4148
32.8k
    v->rlen = get_rlen(h, v);    //set rlen based on version
4149
32.8k
    ret = 0;
4150
4151
33.9k
 err:
4152
33.9k
    return ret;
4153
32.8k
}
4154
4155
int vcf_open_mode(char *mode, const char *fn, const char *format)
4156
0
{
4157
0
    if (format == NULL) {
4158
        // Try to pick a format based on the filename extension
4159
0
        char extension[HTS_MAX_EXT_LEN];
4160
0
        if (find_file_extension(fn, extension) < 0) return -1;
4161
0
        return vcf_open_mode(mode, fn, extension);
4162
0
    }
4163
0
    else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
4164
0
    else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
4165
0
    else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
4166
0
    else return -1;
4167
4168
0
    return 0;
4169
0
}
4170
4171
int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4172
35.0k
{
4173
35.0k
    int ret;
4174
35.0k
    ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4175
35.0k
    if (ret < 0) return ret;
4176
33.9k
    return vcf_parse1(&fp->line, h, v);
4177
35.0k
}
4178
4179
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
4180
0
{
4181
0
    uint8_t *ptr_start = ptr;
4182
0
    fmt->id = bcf_dec_typed_int1(ptr, &ptr);
4183
0
    fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
4184
0
    fmt->size = fmt->n << bcf_type_shift[fmt->type];
4185
0
    fmt->p = ptr;
4186
0
    fmt->p_off  = ptr - ptr_start;
4187
0
    fmt->p_free = 0;
4188
0
    ptr += n_sample * fmt->size;
4189
0
    fmt->p_len = ptr - fmt->p;
4190
0
    return ptr;
4191
0
}
4192
4193
static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
4194
1.97k
{
4195
1.97k
    uint8_t *ptr_start = ptr;
4196
1.97k
    int64_t len = 0;
4197
1.97k
    info->key = bcf_dec_typed_int1(ptr, &ptr);
4198
1.97k
    len = info->len = bcf_dec_size(ptr, &ptr, &info->type);
4199
1.97k
    info->vptr = ptr;
4200
1.97k
    info->vptr_off  = ptr - ptr_start;
4201
1.97k
    info->vptr_free = 0;
4202
1.97k
    info->v1.i = 0;
4203
1.97k
    if (info->len == 1) {
4204
920
        switch(info->type) {
4205
0
        case BCF_BT_INT8:
4206
920
        case BCF_BT_CHAR:
4207
920
            info->v1.i = *(int8_t*)ptr;
4208
920
            break;
4209
0
        case BCF_BT_INT16:
4210
0
            info->v1.i = le_to_i16(ptr);
4211
0
            len <<= 1;
4212
0
            break;
4213
0
        case BCF_BT_INT32:
4214
0
            info->v1.i = le_to_i32(ptr);
4215
0
            len <<= 2;
4216
0
            break;
4217
0
        case BCF_BT_FLOAT:
4218
0
            info->v1.f = le_to_float(ptr);
4219
0
            len <<= 2;
4220
0
            break;
4221
0
        case BCF_BT_INT64:
4222
0
            info->v1.i = le_to_i64(ptr);
4223
0
            len <<= 3;
4224
0
            break;
4225
920
        }
4226
1.05k
    } else {
4227
1.05k
        len <<= bcf_type_shift[info->type];
4228
1.05k
    }
4229
1.97k
    ptr += len;
4230
4231
1.97k
    info->vptr_len = ptr - info->vptr;
4232
1.97k
    return ptr;
4233
1.97k
}
4234
4235
int bcf_unpack(bcf1_t *b, int which)
4236
30.8k
{
4237
30.8k
    if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
4238
30.8k
    uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
4239
30.8k
    int i;
4240
30.8k
    bcf_dec_t *d = &b->d;
4241
30.8k
    if (which & BCF_UN_FLT) which |= BCF_UN_STR;
4242
30.8k
    if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
4243
30.8k
    if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
4244
30.8k
    {
4245
30.8k
        kstring_t tmp;
4246
4247
        // ID
4248
30.8k
        tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
4249
30.8k
        ptr_ori = ptr;
4250
30.8k
        ptr = bcf_fmt_sized_array(&tmp, ptr);
4251
30.8k
        b->unpack_size[0] = ptr - ptr_ori;
4252
30.8k
        kputc_('\0', &tmp);
4253
30.8k
        d->id = tmp.s; d->m_id = tmp.m;
4254
4255
        // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
4256
30.8k
        hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
4257
30.8k
        tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
4258
30.8k
        ptr_ori = ptr;
4259
1.56M
        for (i = 0; i < b->n_allele; ++i) {
4260
            // Use offset within tmp.s as realloc may change pointer
4261
1.53M
            d->allele[i] = (char *)(intptr_t)tmp.l;
4262
1.53M
            ptr = bcf_fmt_sized_array(&tmp, ptr);
4263
1.53M
            kputc_('\0', &tmp);
4264
1.53M
        }
4265
30.8k
        b->unpack_size[1] = ptr - ptr_ori;
4266
30.8k
        d->als = tmp.s; d->m_als = tmp.m;
4267
4268
        // Convert our offsets within tmp.s back to pointers again
4269
1.56M
        for (i = 0; i < b->n_allele; ++i)
4270
1.53M
            d->allele[i] = d->als + (ptrdiff_t)d->allele[i];
4271
30.8k
        b->unpacked |= BCF_UN_STR;
4272
30.8k
    }
4273
30.8k
    if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
4274
30.8k
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
4275
30.8k
        ptr_ori = ptr;
4276
30.8k
        if (*ptr>>4) {
4277
28.5k
            int type;
4278
28.5k
            d->n_flt = bcf_dec_size(ptr, &ptr, &type);
4279
28.5k
            hts_expand(int, d->n_flt, d->m_flt, d->flt);
4280
366k
            for (i = 0; i < d->n_flt; ++i)
4281
338k
                d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
4282
28.5k
        } else ++ptr, d->n_flt = 0;
4283
30.8k
        b->unpack_size[2] = ptr - ptr_ori;
4284
30.8k
        b->unpacked |= BCF_UN_FLT;
4285
30.8k
    }
4286
30.8k
    if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
4287
0
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
4288
0
        hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
4289
0
        for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
4290
0
        for (i = 0; i < b->n_info; ++i)
4291
0
            ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
4292
0
        b->unpacked |= BCF_UN_INFO;
4293
0
    }
4294
30.8k
    if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
4295
0
        ptr = (uint8_t*)b->indiv.s;
4296
0
        hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
4297
0
        for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
4298
0
        for (i = 0; i < b->n_fmt; ++i)
4299
0
            ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
4300
0
        b->unpacked |= BCF_UN_FMT;
4301
0
    }
4302
30.8k
    return 0;
4303
30.8k
}
4304
4305
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
4306
30.8k
{
4307
30.8k
    int i;
4308
30.8k
    int32_t max_dt_id = h->n[BCF_DT_ID];
4309
30.8k
    const char *chrom = bcf_seqname(h, v);
4310
30.8k
    if (!chrom) {
4311
0
        hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
4312
0
                      v->rid);
4313
0
        errno = EINVAL;
4314
0
        return -1;
4315
0
    }
4316
4317
30.8k
    bcf_unpack((bcf1_t*)v, BCF_UN_ALL & ~(BCF_UN_INFO|BCF_UN_FMT));
4318
4319
    // Cache of key lengths so we don't keep repeatedly using them.
4320
    // This assumes we're not modifying the header between successive calls
4321
    // to vcf_format, but that would lead to many other forms of breakage
4322
    // so it feels like a valid assumption to make.
4323
    //
4324
    // We cannot just do this in bcf_hdr_sync as some code (eg bcftools
4325
    // annotate) manipulates the headers directly without calling sync to
4326
    // refresh the data structures.  So we must do just-in-time length
4327
    // calculation during writes instead.
4328
30.8k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
4329
30.8k
    if (!aux->key_len) {
4330
4.58k
        if (!(aux->key_len = calloc(h->n[BCF_DT_ID]+1, sizeof(*aux->key_len))))
4331
0
            return -1;
4332
4.58k
    }
4333
30.8k
    size_t *key_len = aux->key_len;
4334
4335
30.8k
    kputs(chrom, s); // CHROM
4336
30.8k
    kputc_('\t', s); kputll(v->pos + 1, s); // POS
4337
30.8k
    kputc_('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
4338
30.8k
    kputc_('\t', s); // REF
4339
30.8k
    if (v->n_allele > 0) kputs(v->d.allele[0], s);
4340
0
    else kputc_('.', s);
4341
30.8k
    kputc_('\t', s); // ALT
4342
30.8k
    if (v->n_allele > 1) {
4343
1.53M
        for (i = 1; i < v->n_allele; ++i) {
4344
1.50M
            if (i > 1) kputc_(',', s);
4345
1.50M
            kputs(v->d.allele[i], s);
4346
1.50M
        }
4347
30.4k
    } else kputc_('.', s);
4348
30.8k
    kputc_('\t', s); // QUAL
4349
30.8k
    if ( bcf_float_is_missing(v->qual) ) kputc_('.', s); // QUAL
4350
30.3k
    else kputd(v->qual, s);
4351
30.8k
    kputc_('\t', s); // FILTER
4352
30.8k
    if (v->d.n_flt) {
4353
366k
        for (i = 0; i < v->d.n_flt; ++i) {
4354
338k
            int32_t idx = v->d.flt[i];
4355
338k
            if (idx < 0 || idx >= max_dt_id
4356
338k
                || h->id[BCF_DT_ID][idx].key == NULL) {
4357
0
                hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
4358
0
                              idx, bcf_seqname_safe(h, v), v->pos + 1);
4359
0
                errno = EINVAL;
4360
0
                return -1;
4361
0
            }
4362
338k
            if (i) kputc_(';', s);
4363
338k
            if (!key_len[idx])
4364
49.3k
                key_len[idx] = strlen(h->id[BCF_DT_ID][idx].key);
4365
338k
            kputsn(h->id[BCF_DT_ID][idx].key, key_len[idx], s);
4366
338k
        }
4367
28.5k
    } else kputc_('.', s);
4368
4369
30.8k
    kputc_('\t', s); // INFO
4370
30.8k
    if (v->n_info) {
4371
16.5k
        uint8_t *ptr = v->shared.s
4372
16.5k
            ? (uint8_t *)v->shared.s + v->unpack_size[0] +
4373
16.5k
               v->unpack_size[1] + v->unpack_size[2]
4374
16.5k
            : NULL;
4375
16.5k
        int first = 1;
4376
16.5k
        bcf_info_t *info = v->d.info;
4377
4378
        // Note if we duplicate this code into custom packed and unpacked
4379
        // implementations then we gain a bit more speed, particularly with
4380
        // clang 13 (up to 5%).  Not sure why this is, but code duplication
4381
        // isn't pleasant and it's still faster adding packed support than
4382
        // not so it's a win, just not as good as it should be.
4383
16.5k
        const int info_packed = !(v->unpacked & BCF_UN_INFO) && v->shared.l;
4384
1.50M
        for (i = 0; i < v->n_info; ++i) {
4385
1.48M
            bcf_info_t in, *z;
4386
1.48M
            if (info_packed) {
4387
                // Use a local bcf_info_t when data is packed
4388
1.48M
                z = &in;
4389
1.48M
                z->key  = bcf_dec_typed_int1(ptr, &ptr);
4390
1.48M
                z->len  = bcf_dec_size(ptr, &ptr, &z->type);
4391
1.48M
                z->vptr = ptr;
4392
1.48M
                ptr += z->len << bcf_type_shift[z->type];
4393
1.48M
            } else {
4394
                // Else previously unpacked INFO struct
4395
0
                z = &info[i];
4396
4397
                // Also potentially since deleted
4398
0
                if ( !z->vptr ) continue;
4399
0
            }
4400
4401
1.48M
            bcf_idpair_t *id = z->key >= 0 && z->key < max_dt_id
4402
1.48M
                ? &h->id[BCF_DT_ID][z->key]
4403
1.48M
                : NULL;
4404
4405
1.48M
            if (!id || !id->key) {
4406
0
                hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
4407
0
                              z->key,
4408
0
                              z->key < 0 ? "negative"
4409
0
                              : (z->key >= max_dt_id ? "too large" : "not present in the header"),
4410
0
                              bcf_seqname_safe(h, v), v->pos+1);
4411
0
                errno = EINVAL;
4412
0
                return -1;
4413
0
            }
4414
4415
            // KEY
4416
1.48M
            if (!key_len[z->key])
4417
20.2k
                key_len[z->key] = strlen(id->key);
4418
1.48M
            size_t id_len = key_len[z->key];
4419
1.48M
            if (ks_resize(s, s->l + 3 + id_len) < 0)
4420
0
                return -1;
4421
1.48M
            char *sptr = s->s + s->l;
4422
1.48M
            if ( !first ) {
4423
1.47M
                *sptr++ = ';';
4424
1.47M
                s->l++;
4425
1.47M
            }
4426
1.48M
            first = 0;
4427
1.48M
            memcpy(sptr, id->key, id_len);
4428
1.48M
            s->l += id_len;
4429
4430
            // VALUE
4431
1.48M
            if (z->len <= 0) continue;
4432
702k
            sptr[id_len] = '=';
4433
702k
            s->l++;
4434
4435
702k
            if (z->len != 1 || info_packed) {
4436
702k
                bcf_fmt_array(s, z->len, z->type, z->vptr);
4437
702k
            } else {
4438
                // Single length vectors are unpacked into their
4439
                // own info.v1 union and handled separately.
4440
0
                if (z->type == BCF_BT_FLOAT) {
4441
0
                    if ( bcf_float_is_missing(z->v1.f) )
4442
0
                        kputc_('.', s);
4443
0
                    else
4444
0
                        kputd(z->v1.f, s);
4445
0
                } else if (z->type == BCF_BT_CHAR) {
4446
0
                    kputc_(z->v1.i, s);
4447
0
                } else if (z->type < BCF_BT_INT64) {
4448
0
                    int64_t missing[] = {
4449
0
                        0, // BCF_BT_NULL
4450
0
                        bcf_int8_missing,
4451
0
                        bcf_int16_missing,
4452
0
                        bcf_int32_missing,
4453
0
                    };
4454
0
                    if (z->v1.i == missing[z->type])
4455
0
                        kputc_('.', s);
4456
0
                    else
4457
0
                        kputw(z->v1.i, s);
4458
0
                } else if (z->type == BCF_BT_INT64) {
4459
0
                    if (z->v1.i == bcf_int64_missing)
4460
0
                        kputc_('.', s);
4461
0
                    else
4462
0
                        kputll(z->v1.i, s);
4463
0
                } else {
4464
0
                    hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
4465
0
                    errno = EINVAL;
4466
0
                    return -1;
4467
0
                }
4468
0
            }
4469
702k
        }
4470
16.5k
        if ( first ) kputc_('.', s);
4471
16.5k
    } else kputc_('.', s);
4472
4473
    // FORMAT and individual information
4474
30.8k
    if (v->n_sample) {
4475
9.99k
        int i,j;
4476
9.99k
        if ( v->n_fmt) {
4477
9.77k
            uint8_t *ptr = (uint8_t *)v->indiv.s;
4478
9.77k
            int gt_i = -1;
4479
9.77k
            bcf_fmt_t *fmt = v->d.fmt;
4480
9.77k
            int first = 1, ret = 0;
4481
9.77k
            int fmt_packed = !(v->unpacked & BCF_UN_FMT);
4482
4483
9.77k
            if (fmt_packed) {
4484
                // Local fmt as we have an array of num FORMAT keys,
4485
                // each of which points to N.Sample values.
4486
4487
                // No real gain to be had in handling unpacked data here,
4488
                // but it doesn't cost us much in complexity either and
4489
                // it gives us flexibility.
4490
9.77k
                fmt = hts_malloc_p(sizeof(*fmt), v->n_fmt);
4491
9.77k
                if (!fmt)
4492
0
                    return -1;
4493
9.77k
            }
4494
4495
            // KEYS
4496
33.2k
            for (i = 0; i < (int)v->n_fmt; ++i) {
4497
23.4k
                bcf_fmt_t *z;
4498
23.4k
                z = &fmt[i];
4499
23.4k
                if (fmt_packed) {
4500
23.4k
                    z->id   = bcf_dec_typed_int1(ptr, &ptr);
4501
23.4k
                    z->n    = bcf_dec_size(ptr, &ptr, &z->type);
4502
23.4k
                    z->p    = ptr;
4503
23.4k
                    z->size = z->n << bcf_type_shift[z->type];
4504
23.4k
                    ptr += v->n_sample * z->size;
4505
23.4k
                }
4506
23.4k
                if ( !z->p ) continue;
4507
23.4k
                kputc_(!first ? ':' : '\t', s); first = 0;
4508
4509
23.4k
                bcf_idpair_t *id = z->id >= 0 && z->id < max_dt_id
4510
23.4k
                    ? &h->id[BCF_DT_ID][z->id]
4511
23.4k
                    : NULL;
4512
4513
23.4k
                if (!id || !id->key) {
4514
0
                    hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1);
4515
0
                    errno = EINVAL;
4516
0
                    if (fmt_packed)
4517
0
                        free(fmt);
4518
0
                    return -1;
4519
0
                }
4520
4521
23.4k
                if (!key_len[z->id])
4522
11.4k
                    key_len[z->id] = strlen(id->key);
4523
23.4k
                size_t id_len = key_len[z->id];
4524
23.4k
                kputsn(id->key, id_len, s);
4525
23.4k
                if (id_len == 2 && id->key[0] == 'G' && id->key[1] == 'T')
4526
4.77k
                    gt_i = i;
4527
23.4k
            }
4528
9.77k
            if ( first ) kputsn("\t.", 2, s);
4529
4530
            // VALUES per sample
4531
33.7k
            for (j = 0; j < v->n_sample; ++j) {
4532
23.9k
                kputc_('\t', s);
4533
23.9k
                first = 1;
4534
23.9k
                bcf_fmt_t *f = fmt;
4535
66.3k
                for (i = 0; i < (int)v->n_fmt; i++, f++) {
4536
58.4k
                    if ( !f->p ) continue;
4537
58.4k
                    if (!first) kputc_(':', s);
4538
58.4k
                    first = 0;
4539
58.4k
                    if (gt_i == i) {
4540
16.0k
                        if ((ret = bcf_format_gt_v2(h, f,j,s)) < 0) {
4541
0
                            hts_log_error("Failed to format GT value for sample %d, returned %d", i, ret);
4542
0
                            errno = EINVAL;
4543
0
                            if (fmt_packed)
4544
0
                                free(fmt);
4545
0
                            return -1;
4546
0
                        }
4547
16.0k
                        break;
4548
16.0k
                    }
4549
42.4k
                    else if (f->n == 1)
4550
4.49k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4551
37.9k
                    else
4552
37.9k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4553
58.4k
                }
4554
4555
                // Simpler loop post GT and at least 1 iteration
4556
46.0k
                for (i++, f++; i < (int)v->n_fmt; i++, f++) {
4557
22.1k
                    if ( !f->p ) continue;
4558
22.1k
                    kputc_(':', s);
4559
22.1k
                    if (f->n == 1)
4560
1.38k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4561
20.7k
                    else
4562
20.7k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4563
22.1k
                }
4564
23.9k
                if ( first ) kputc_('.', s);
4565
23.9k
            }
4566
9.77k
            if (fmt_packed)
4567
9.77k
                free(fmt);
4568
9.77k
        }
4569
221
        else
4570
1.06k
            for (j=0; j<=v->n_sample; j++)
4571
839
                kputsn("\t.", 2, s);
4572
9.99k
    }
4573
30.8k
    kputc('\n', s);
4574
30.8k
    return 0;
4575
30.8k
}
4576
4577
int vcf_write_line(htsFile *fp, kstring_t *line)
4578
0
{
4579
0
    int ret;
4580
0
    if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
4581
0
    if ( fp->format.compression!=no_compression )
4582
0
        ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
4583
0
    else
4584
0
        ret = hwrite(fp->fp.hfile, line->s, line->l);
4585
0
    return ret==line->l ? 0 : -1;
4586
0
}
4587
4588
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4589
30.8k
{
4590
30.8k
    ssize_t ret;
4591
30.8k
    fp->line.l = 0;
4592
30.8k
    if (vcf_format1(h, v, &fp->line) != 0)
4593
0
        return -1;
4594
30.8k
    if ( fp->format.compression!=no_compression ) {
4595
0
        if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4596
0
            return -1;
4597
0
        if (fp->idx && !fp->fp.bgzf->mt)
4598
0
            hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
4599
0
        ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
4600
30.8k
    } else {
4601
30.8k
        ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
4602
30.8k
    }
4603
4604
30.8k
    if (fp->idx && fp->format.compression == bgzf) {
4605
0
        int tid;
4606
0
        if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
4607
0
            return -1;
4608
4609
0
        if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
4610
0
                          tid, v->pos, v->pos + v->rlen,
4611
0
                          bgzf_tell(fp->fp.bgzf), 1) < 0)
4612
0
            return -1;
4613
0
    }
4614
4615
30.8k
    return ret==fp->line.l ? 0 : -1;
4616
30.8k
}
4617
4618
/************************
4619
 * Data access routines *
4620
 ************************/
4621
4622
int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
4623
260k
{
4624
260k
    khint_t k;
4625
260k
    vdict_t *d = (vdict_t*)h->dict[which];
4626
260k
    k = kh_get(vdict, d, id);
4627
260k
    return k == kh_end(d)? -1 : kh_val(d, k).id;
4628
260k
}
4629
4630
4631
/********************
4632
 *** BCF indexing ***
4633
 ********************/
4634
4635
// Calculate number of index levels given min_shift and the header contig
4636
// list.  Also returns number of contigs in *nids_out.
4637
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int *min_shift_in_out,
4638
                               int starting_n_lvls, int *nids_out)
4639
0
{
4640
0
    int n_lvls = starting_n_lvls, i, nids = 0;
4641
0
    int64_t max_len = 0;
4642
4643
0
    for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
4644
0
    {
4645
0
        if ( !h->id[BCF_DT_CTG][i].val ) continue;
4646
0
        if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
4647
0
            max_len = h->id[BCF_DT_CTG][i].val->info[0];
4648
0
        nids++;
4649
0
    }
4650
0
    if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
4651
4652
0
    hts_adjust_csi_settings(max_len, min_shift_in_out, &n_lvls);
4653
4654
0
    if (nids_out) *nids_out = nids;
4655
0
    return n_lvls;
4656
0
}
4657
4658
hts_idx_t *bcf_index(htsFile *fp, int min_shift)
4659
0
{
4660
0
    int n_lvls;
4661
0
    bcf1_t *b = NULL;
4662
0
    hts_idx_t *idx = NULL;
4663
0
    bcf_hdr_t *h;
4664
0
    int r;
4665
0
    h = bcf_hdr_read(fp);
4666
0
    if ( !h ) return NULL;
4667
0
    int nids = 0;
4668
0
    n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
4669
0
    idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4670
0
    if (!idx) goto fail;
4671
0
    b = bcf_init1();
4672
0
    if (!b) goto fail;
4673
0
    while ((r = bcf_read1(fp,h, b)) >= 0) {
4674
0
        int ret;
4675
0
        ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
4676
0
        if (ret < 0) goto fail;
4677
0
    }
4678
0
    if (r < -1) goto fail;
4679
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
4680
0
    bcf_destroy1(b);
4681
0
    bcf_hdr_destroy(h);
4682
0
    return idx;
4683
4684
0
 fail:
4685
0
    hts_idx_destroy(idx);
4686
0
    bcf_destroy1(b);
4687
0
    bcf_hdr_destroy(h);
4688
0
    return NULL;
4689
0
}
4690
4691
hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
4692
0
{
4693
0
    return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn);
4694
0
}
4695
4696
hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
4697
0
{
4698
0
    return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
4699
0
}
4700
4701
int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
4702
0
{
4703
0
    htsFile *fp;
4704
0
    hts_idx_t *idx;
4705
0
    tbx_t *tbx;
4706
0
    int ret;
4707
0
    if ((fp = hts_open(fn, "rb")) == 0) return -2;
4708
0
    if (n_threads)
4709
0
        hts_set_threads(fp, n_threads);
4710
0
    if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
4711
0
    switch (fp->format.format) {
4712
0
        case bcf:
4713
0
            if (!min_shift) {
4714
0
                hts_log_error("TBI indices for BCF files are not supported");
4715
0
                ret = -1;
4716
0
            } else {
4717
0
                idx = bcf_index(fp, min_shift);
4718
0
                if (idx) {
4719
0
                    ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
4720
0
                    if (ret < 0) ret = -4;
4721
0
                    hts_idx_destroy(idx);
4722
0
                }
4723
0
                else ret = -1;
4724
0
            }
4725
0
            break;
4726
4727
0
        case vcf:
4728
0
            tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
4729
0
            if (tbx) {
4730
0
                ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
4731
0
                if (ret < 0) ret = -4;
4732
0
                tbx_destroy(tbx);
4733
0
            }
4734
0
            else ret = -1;
4735
0
            break;
4736
4737
0
        default:
4738
0
            ret = -3;
4739
0
            break;
4740
0
    }
4741
0
    hts_close(fp);
4742
0
    return ret;
4743
0
}
4744
4745
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
4746
0
{
4747
0
    return bcf_index_build3(fn, fnidx, min_shift, 0);
4748
0
}
4749
4750
int bcf_index_build(const char *fn, int min_shift)
4751
0
{
4752
0
    return bcf_index_build3(fn, NULL, min_shift, 0);
4753
0
}
4754
4755
// Initialise fp->idx for the current format type.
4756
// This must be called after the header has been written but no other data.
4757
0
static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4758
0
    int n_lvls, fmt;
4759
4760
0
    if (min_shift == 0) {
4761
0
        min_shift = 14;
4762
0
        n_lvls = 5;
4763
0
        fmt = HTS_FMT_TBI;
4764
0
    } else {
4765
        // Set initial n_lvls to match tbx_index()
4766
0
        int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
4767
        // Increase if necessary
4768
0
        n_lvls = idx_calc_n_lvls_ids(h, &min_shift, starting_n_lvls, NULL);
4769
0
        fmt = HTS_FMT_CSI;
4770
0
    }
4771
4772
0
    fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4773
0
    if (!fp->idx) return -1;
4774
4775
    // Tabix meta data, added even in CSI for VCF
4776
0
    uint8_t conf[4*7];
4777
0
    u32_to_le(TBX_VCF, conf+0);  // fmt
4778
0
    u32_to_le(1,       conf+4);  // name col
4779
0
    u32_to_le(2,       conf+8);  // beg col
4780
0
    u32_to_le(0,       conf+12); // end col
4781
0
    u32_to_le('#',     conf+16); // comment
4782
0
    u32_to_le(0,       conf+20); // n.skip
4783
0
    u32_to_le(0,       conf+24); // ref name len
4784
0
    if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
4785
0
        hts_idx_destroy(fp->idx);
4786
0
        fp->idx = NULL;
4787
0
        return -1;
4788
0
    }
4789
0
    fp->fnidx = fnidx;
4790
4791
0
    return 0;
4792
0
}
4793
4794
// Initialise fp->idx for the current format type.
4795
// This must be called after the header has been written but no other data.
4796
0
int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4797
0
    int n_lvls, nids = 0;
4798
4799
0
    if (fp->format.compression != bgzf) {
4800
0
        hts_log_error("Indexing is only supported on BGZF-compressed files");
4801
0
        return -3; // Matches no-compression return for bcf_index_build3()
4802
0
    }
4803
4804
0
    if (fp->format.format == vcf)
4805
0
        return vcf_idx_init(fp, h, min_shift, fnidx);
4806
4807
0
    if (!min_shift)
4808
0
        min_shift = 14;
4809
4810
0
    n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
4811
4812
0
    fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4813
0
    if (!fp->idx) return -1;
4814
0
    fp->fnidx = fnidx;
4815
4816
0
    return 0;
4817
0
}
4818
4819
// Finishes an index. Call after the last record has been written.
4820
// Returns 0 on success, <0 on failure.
4821
//
4822
// NB: same format as SAM/BAM as it uses bgzf.
4823
0
int bcf_idx_save(htsFile *fp) {
4824
0
    return sam_idx_save(fp);
4825
0
}
4826
4827
// Wrap around bcf_hdr_name2id() to get the right signature for hts_name2id_f
4828
0
static int bcf_hdr_name2id_wrapper(void *vhdr, const char *ref) {
4829
0
    return bcf_hdr_name2id((bcf_hdr_t *) vhdr, ref);
4830
0
}
4831
4832
hts_itr_t *bcf_itr_querys1(const hts_idx_t *idx, bcf_hdr_t *hdr,
4833
0
                           const char *region) {
4834
0
    return hts_itr_querys(idx, region, bcf_hdr_name2id_wrapper, hdr,
4835
0
                          hts_itr_query, bcf_readrec);
4836
0
}
4837
4838
4839
/*****************
4840
 *** Utilities ***
4841
 *****************/
4842
4843
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
4844
0
{
4845
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
4846
0
    for (i=0; i<src->nhrec; i++)
4847
0
    {
4848
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4849
0
        {
4850
0
            int j;
4851
0
            for (j=0; j<ndst_ori; j++)
4852
0
            {
4853
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4854
4855
                // Checking only the key part of generic lines, otherwise
4856
                // the VCFs are too verbose. Should we perhaps add a flag
4857
                // to bcf_hdr_combine() and make this optional?
4858
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4859
0
            }
4860
0
            if ( j>=ndst_ori ) {
4861
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4862
0
                if (res < 0) return -1;
4863
0
                need_sync += res;
4864
0
            }
4865
0
        }
4866
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4867
0
        {
4868
            // NB: we are ignoring fields without ID
4869
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4870
0
            if ( j>=0 )
4871
0
            {
4872
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4873
0
                if ( !rec ) {
4874
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4875
0
                    if (res < 0) return -1;
4876
0
                    need_sync += res;
4877
0
                }
4878
0
            }
4879
0
        }
4880
0
        else
4881
0
        {
4882
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4883
0
            assert( j>=0 ); // this should always be true for valid VCFs
4884
4885
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4886
0
            if ( !rec ) {
4887
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4888
0
                if (res < 0) return -1;
4889
0
                need_sync += res;
4890
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4891
0
            {
4892
                // Check that both records are of the same type. The bcf_hdr_id2length
4893
                // macro cannot be used here because dst header is not synced yet.
4894
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4895
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4896
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4897
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4898
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4899
0
                {
4900
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4901
0
                        src->hrec[i]->vals[0]);
4902
0
                    ret |= 1;
4903
0
                }
4904
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4905
0
                {
4906
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4907
0
                        src->hrec[i]->vals[0]);
4908
0
                    ret |= 1;
4909
0
                }
4910
0
            }
4911
0
        }
4912
0
    }
4913
0
    if ( need_sync ) {
4914
0
        if (bcf_hdr_sync(dst) < 0) return -1;
4915
0
    }
4916
0
    return ret;
4917
0
}
4918
4919
bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
4920
0
{
4921
0
    if ( !dst )
4922
0
    {
4923
        // this will effectively strip existing IDX attributes from src to become dst
4924
0
        dst = bcf_hdr_init("r");
4925
0
        kstring_t htxt = {0,0,0};
4926
0
        if (bcf_hdr_format(src, 0, &htxt) < 0) {
4927
0
            free(htxt.s);
4928
0
            return NULL;
4929
0
        }
4930
0
        if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
4931
0
            bcf_hdr_destroy(dst);
4932
0
            dst = NULL;
4933
0
        }
4934
0
        free(htxt.s);
4935
0
        return dst;
4936
0
    }
4937
4938
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, res;
4939
0
    for (i=0; i<src->nhrec; i++)
4940
0
    {
4941
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4942
0
        {
4943
0
            int j;
4944
0
            for (j=0; j<ndst_ori; j++)
4945
0
            {
4946
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4947
4948
                // Checking only the key part of generic lines, otherwise
4949
                // the VCFs are too verbose. Should we perhaps add a flag
4950
                // to bcf_hdr_combine() and make this optional?
4951
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4952
0
            }
4953
0
            if ( j>=ndst_ori ) {
4954
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4955
0
                if (res < 0) return NULL;
4956
0
                need_sync += res;
4957
0
            }
4958
0
            else if ( !strcmp(src->hrec[i]->key,"fileformat") )
4959
0
            {
4960
0
                int ver_src = bcf_get_version(src,src->hrec[i]->value);
4961
0
                int ver_dst = bcf_get_version(dst,dst->hrec[j]->value);
4962
0
                if ( ver_src > ver_dst )
4963
0
                {
4964
0
                    if (bcf_hdr_set_version(dst,src->hrec[i]->value) < 0)
4965
0
                        return NULL;
4966
0
                    need_sync = 1;
4967
0
                }
4968
0
            }
4969
0
        }
4970
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4971
0
        {
4972
            // NB: we are ignoring fields without ID
4973
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4974
0
            if ( j>=0 )
4975
0
            {
4976
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4977
0
                if ( !rec ) {
4978
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4979
0
                    if (res < 0) return NULL;
4980
0
                    need_sync += res;
4981
0
                }
4982
0
            }
4983
0
        }
4984
0
        else
4985
0
        {
4986
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4987
0
            assert( j>=0 ); // this should always be true for valid VCFs
4988
4989
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4990
0
            if ( !rec ) {
4991
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4992
0
                if (res < 0) return NULL;
4993
0
                need_sync += res;
4994
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4995
0
            {
4996
                // Check that both records are of the same type. The bcf_hdr_id2length
4997
                // macro cannot be used here because dst header is not synced yet.
4998
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4999
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
5000
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
5001
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
5002
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
5003
0
                {
5004
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
5005
0
                        src->hrec[i]->vals[0]);
5006
0
                }
5007
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
5008
0
                {
5009
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
5010
0
                        src->hrec[i]->vals[0]);
5011
0
                }
5012
0
            }
5013
0
        }
5014
0
    }
5015
0
    if ( need_sync ) {
5016
0
        if (bcf_hdr_sync(dst) < 0) return NULL;
5017
0
    }
5018
0
    return dst;
5019
0
}
5020
5021
int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
5022
0
{
5023
0
    int i;
5024
0
    if ( line->errcode )
5025
0
    {
5026
0
        char errordescription[1024] = "";
5027
0
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)),  bcf_seqname_safe(src_hdr,line), line->pos+1);
5028
0
        exit(1);
5029
0
    }
5030
0
    if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
5031
0
    if ( !src_hdr->ntransl )  // called for the first time, see what needs translating
5032
0
    {
5033
0
        int dict;
5034
0
        for (dict=0; dict<2; dict++)    // BCF_DT_ID and BCF_DT_CTG
5035
0
        {
5036
0
            src_hdr->transl[dict] = hts_malloc_p(sizeof(int), src_hdr->n[dict]);
5037
0
            for (i=0; i<src_hdr->n[dict]; i++)
5038
0
            {
5039
0
                if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
5040
0
                {
5041
0
                    src_hdr->transl[dict][i] = -1;
5042
0
                    continue;
5043
0
                }
5044
0
                src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
5045
0
                if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
5046
0
            }
5047
0
        }
5048
0
        if ( !src_hdr->ntransl )
5049
0
        {
5050
0
            free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
5051
0
            free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
5052
0
            src_hdr->ntransl = -1;
5053
0
        }
5054
0
        if ( src_hdr->ntransl==-1 ) return 0;
5055
0
    }
5056
0
    bcf_unpack(line,BCF_UN_ALL);
5057
5058
    // CHROM
5059
0
    if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
5060
5061
    // FILTER
5062
0
    for (i=0; i<line->d.n_flt; i++)
5063
0
    {
5064
0
        int src_id = line->d.flt[i];
5065
0
        if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
5066
0
            line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
5067
0
        line->d.shared_dirty |= BCF1_DIRTY_FLT;
5068
0
    }
5069
5070
    // INFO
5071
0
    for (i=0; i<line->n_info; i++)
5072
0
    {
5073
0
        int src_id = line->d.info[i].key;
5074
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
5075
0
        if ( dst_id<0 ) continue;
5076
0
        line->d.info[i].key = dst_id;
5077
0
        if ( !line->d.info[i].vptr ) continue;  // skip deleted
5078
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5079
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5080
0
        if ( src_size==dst_size )   // can overwrite
5081
0
        {
5082
0
            uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
5083
0
            if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
5084
0
            else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
5085
0
            else { *(uint32_t*)vptr = (uint32_t)dst_id; }
5086
0
        }
5087
0
        else    // must realloc
5088
0
        {
5089
0
            bcf_info_t *info = &line->d.info[i];
5090
0
            kstring_t str = {0,0,0};
5091
0
            bcf_enc_int1(&str, dst_id);
5092
0
            bcf_enc_size(&str, info->len,info->type);
5093
0
            uint32_t vptr_off = str.l;
5094
0
            kputsn((char*)info->vptr, info->vptr_len, &str);
5095
0
            if( info->vptr_free ) free(info->vptr - info->vptr_off);
5096
0
            info->vptr_off = vptr_off;
5097
0
            info->vptr = (uint8_t*)str.s + info->vptr_off;
5098
0
            info->vptr_free = 1;
5099
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5100
0
        }
5101
0
    }
5102
5103
    // FORMAT
5104
0
    for (i=0; i<line->n_fmt; i++)
5105
0
    {
5106
0
        int src_id = line->d.fmt[i].id;
5107
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
5108
0
        if ( dst_id<0 ) continue;
5109
0
        line->d.fmt[i].id = dst_id;
5110
0
        if( !line->d.fmt[i].p ) continue;  // skip deleted
5111
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5112
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5113
0
        if ( src_size==dst_size )   // can overwrite
5114
0
        {
5115
0
            uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off;    // pointer to the vector size (4bits) and BT type (4bits)
5116
0
            if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
5117
0
            else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
5118
0
            else { i32_to_le(dst_id, p + 1); }
5119
0
        }
5120
0
        else    // must realloc
5121
0
        {
5122
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
5123
0
            kstring_t str = {0,0,0};
5124
0
            bcf_enc_int1(&str, dst_id);
5125
0
            bcf_enc_size(&str, fmt->n, fmt->type);
5126
0
            uint32_t p_off = str.l;
5127
0
            kputsn((char*)fmt->p, fmt->p_len, &str);
5128
0
            if( fmt->p_free ) free(fmt->p - fmt->p_off);
5129
0
            fmt->p_off = p_off;
5130
0
            fmt->p = (uint8_t*)str.s + fmt->p_off;
5131
0
            fmt->p_free = 1;
5132
0
            line->d.indiv_dirty = 1;
5133
0
        }
5134
0
    }
5135
0
    return 0;
5136
0
}
5137
5138
bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
5139
0
{
5140
0
    bcf_hdr_t *hout = bcf_hdr_init("r");
5141
0
    if (!hout) {
5142
0
        hts_log_error("Failed to allocate bcf header");
5143
0
        return NULL;
5144
0
    }
5145
0
    kstring_t htxt = {0,0,0};
5146
0
    if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
5147
0
        free(htxt.s);
5148
0
        return NULL;
5149
0
    }
5150
0
    if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
5151
0
        bcf_hdr_destroy(hout);
5152
0
        hout = NULL;
5153
0
    }
5154
0
    free(htxt.s);
5155
0
    return hout;
5156
0
}
5157
5158
bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
5159
0
{
5160
0
    void *names_hash = khash_str2int_init();
5161
0
    kstring_t htxt = {0,0,0};
5162
0
    kstring_t str = {0,0,0};
5163
0
    bcf_hdr_t *h = bcf_hdr_init("w");
5164
0
    int r = 0;
5165
0
    if (!h || !names_hash) {
5166
0
        hts_log_error("Failed to allocate bcf header");
5167
0
        goto err;
5168
0
    }
5169
0
    if (bcf_hdr_format(h0, 1, &htxt) < 0) {
5170
0
        hts_log_error("Failed to get header text");
5171
0
        goto err;
5172
0
    }
5173
0
    bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
5174
0
    int j;
5175
0
    for (j=0; j<n; j++) imap[j] = -1;
5176
0
    if ( bcf_hdr_nsamples(h0) > 0) {
5177
0
        char *p = find_chrom_header_line(htxt.s);
5178
0
        int i = 0, end = n? 8 : 7;
5179
0
        while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
5180
0
        if (i != end) {
5181
0
            hts_log_error("Wrong number of columns in header #CHROM line");
5182
0
            goto err;
5183
0
        }
5184
0
        r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
5185
0
        for (i = 0; i < n; ++i) {
5186
0
            if ( khash_str2int_has_key(names_hash,samples[i]) )
5187
0
            {
5188
0
                hts_log_error("Duplicate sample name \"%s\"", samples[i]);
5189
0
                goto err;
5190
0
            }
5191
0
            imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
5192
0
            if (imap[i] < 0) continue;
5193
0
            r |= kputc('\t', &str) < 0;
5194
0
            r |= kputs(samples[i], &str) < 0;
5195
0
            r |= khash_str2int_inc(names_hash,samples[i]) < 0;
5196
0
        }
5197
0
    } else r |= kputsn(htxt.s, htxt.l, &str) < 0;
5198
0
    while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
5199
0
    r |= kputc('\n',&str) < 0;
5200
0
    if (r) {
5201
0
        hts_log_error("%s", strerror(errno));
5202
0
        goto err;
5203
0
    }
5204
0
    if ( bcf_hdr_parse(h, str.s) < 0 ) {
5205
0
        bcf_hdr_destroy(h);
5206
0
        h = NULL;
5207
0
    }
5208
0
    free(str.s);
5209
0
    free(htxt.s);
5210
0
    khash_str2int_destroy(names_hash);
5211
0
    return h;
5212
5213
0
 err:
5214
0
    ks_free(&str);
5215
0
    ks_free(&htxt);
5216
0
    khash_str2int_destroy(names_hash);
5217
0
    bcf_hdr_destroy(h);
5218
0
    return NULL;
5219
0
}
5220
5221
int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
5222
0
{
5223
0
    if ( samples && !strcmp("-",samples) ) return 0;            // keep all samples
5224
5225
0
    int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
5226
0
    hdr->keep_samples = (uint8_t*) calloc(narr,1);
5227
0
    if (!hdr->keep_samples) return -1;
5228
5229
0
    hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
5230
0
    if ( !samples )
5231
0
    {
5232
        // exclude all samples
5233
0
        khint_t k;
5234
0
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict;
5235
0
        new_dict = kh_init(vdict);
5236
0
        if (!new_dict) return -1;
5237
5238
0
        bcf_hdr_nsamples(hdr) = 0;
5239
5240
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
5241
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
5242
0
        kh_destroy(vdict, d);
5243
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
5244
0
        if (bcf_hdr_sync(hdr) < 0) return -1;
5245
5246
0
        return 0;
5247
0
    }
5248
5249
0
    if ( samples[0]=='^' )
5250
0
        for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
5251
5252
0
    int idx, n, ret = 0;
5253
0
    char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
5254
0
    if ( !smpls ) return -1;
5255
0
    for (i=0; i<n; i++)
5256
0
    {
5257
0
        idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
5258
0
        if ( idx<0 )
5259
0
        {
5260
0
            if ( !ret ) ret = i+1;
5261
0
            continue;
5262
0
        }
5263
0
        assert( idx<bcf_hdr_nsamples(hdr) );
5264
0
        if (  samples[0]=='^' )
5265
0
            bit_array_clear(hdr->keep_samples, idx);
5266
0
        else
5267
0
            bit_array_set(hdr->keep_samples, idx);
5268
0
    }
5269
0
    for (i=0; i<n; i++) free(smpls[i]);
5270
0
    free(smpls);
5271
5272
0
    bcf_hdr_nsamples(hdr) = 0;
5273
0
    for (i=0; i<hdr->nsamples_ori; i++)
5274
0
        if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
5275
5276
0
    if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
5277
0
    else
5278
0
    {
5279
        // Make new list and dictionary with desired samples
5280
0
        char **samples = hts_malloc_p(sizeof(char*), bcf_hdr_nsamples(hdr));
5281
0
        vdict_t *new_dict, *d;
5282
0
        int k, res;
5283
0
        if (!samples) return -1;
5284
5285
0
        new_dict = kh_init(vdict);
5286
0
        if (!new_dict) {
5287
0
            free(samples);
5288
0
            return -1;
5289
0
        }
5290
0
        idx = 0;
5291
0
        for (i=0; i<hdr->nsamples_ori; i++) {
5292
0
            if ( bit_array_test(hdr->keep_samples,i) ) {
5293
0
                samples[idx] = hdr->samples[i];
5294
0
                k = kh_put(vdict, new_dict, hdr->samples[i], &res);
5295
0
                if (res < 0) {
5296
0
                    free(samples);
5297
0
                    kh_destroy(vdict, new_dict);
5298
0
                    return -1;
5299
0
                }
5300
0
                kh_val(new_dict, k) = bcf_idinfo_def;
5301
0
                kh_val(new_dict, k).id = idx;
5302
0
                idx++;
5303
0
            }
5304
0
        }
5305
5306
        // Delete desired samples from old dictionary, so we don't free them
5307
0
        d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
5308
0
        for (i=0; i < idx; i++) {
5309
0
            int k = kh_get(vdict, d, samples[i]);
5310
0
            if (k < kh_end(d)) kh_del(vdict, d, k);
5311
0
        }
5312
5313
        // Free everything else
5314
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
5315
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
5316
0
        kh_destroy(vdict, d);
5317
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
5318
5319
0
        free(hdr->samples);
5320
0
        hdr->samples = samples;
5321
5322
0
        if (bcf_hdr_sync(hdr) < 0)
5323
0
            return -1;
5324
0
    }
5325
5326
0
    return ret;
5327
0
}
5328
5329
int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
5330
0
{
5331
0
    kstring_t ind;
5332
0
    ind.s = 0; ind.l = ind.m = 0;
5333
0
    if (n) {
5334
0
        bcf_fmt_t fmt[MAX_N_FMT];
5335
0
        int i, j;
5336
0
        uint8_t *ptr = (uint8_t*)v->indiv.s;
5337
0
        for (i = 0; i < v->n_fmt; ++i)
5338
0
            ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
5339
0
        for (i = 0; i < (int)v->n_fmt; ++i) {
5340
0
            bcf_fmt_t *f = &fmt[i];
5341
0
            bcf_enc_int1(&ind, f->id);
5342
0
            bcf_enc_size(&ind, f->n, f->type);
5343
0
            for (j = 0; j < n; ++j)
5344
0
                if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
5345
0
        }
5346
0
        for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
5347
0
        v->n_sample = i;
5348
0
    } else v->n_sample = 0;
5349
0
    if ( !v->n_sample ) v->n_fmt = 0;
5350
0
    free(v->indiv.s);
5351
0
    v->indiv = ind;
5352
0
    v->unpacked &= ~BCF_UN_FMT;    // only BCF is ready for output, VCF will need to unpack again
5353
0
    return 0;
5354
0
}
5355
5356
int bcf_is_snp(bcf1_t *v)
5357
0
{
5358
0
    int i;
5359
0
    bcf_unpack(v, BCF_UN_STR);
5360
0
    for (i = 0; i < v->n_allele; ++i)
5361
0
    {
5362
0
        if ( v->d.allele[i][1]==0 && v->d.allele[i][0]!='*' ) continue;
5363
5364
        // mpileup's <X> allele, see also below. This is not completely satisfactory,
5365
        // a general library is here narrowly tailored to fit samtools.
5366
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
5367
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='*' && v->d.allele[i][2]=='>' ) continue;
5368
5369
0
        break;
5370
0
    }
5371
0
    return i == v->n_allele;
5372
0
}
5373
5374
static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var)
5375
0
{
5376
0
    if ( *alt == '*' && !alt[1] ) { var->n = 0; var->type = VCF_OVERLAP; return; }  // overlapping variant
5377
5378
    // The most frequent case
5379
0
    if ( !ref[1] && !alt[1] )
5380
0
    {
5381
0
        if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
5382
0
        if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
5383
0
        var->n = 1; var->type = VCF_SNP; return;
5384
0
    }
5385
0
    if ( alt[0]=='<' )
5386
0
    {
5387
0
        if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
5388
0
        if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }
5389
0
        if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; }
5390
0
        var->type = VCF_OTHER;
5391
0
        return;
5392
0
    }
5393
5394
    // Catch "joined before" breakend case
5395
0
    if ( alt[0]==']' || alt[0] == '[' )
5396
0
    {
5397
0
        var->type = VCF_BND; return;
5398
0
    }
5399
5400
    // Iterate through alt characters that match the reference
5401
0
    const char *r = ref, *a = alt;
5402
0
    while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; }     // unfortunately, matching REF,ALT case is not guaranteed
5403
5404
0
    if ( *a && !*r )
5405
0
    {
5406
0
        while ( *a ) a++;
5407
0
        if ( *(a-1)==']' || *(a-1)=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend
5408
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return;
5409
0
    }
5410
0
    else if ( *r && !*a )
5411
0
    {
5412
0
        while ( *r ) r++;
5413
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return;
5414
0
    }
5415
0
    else if ( !*r && !*a )
5416
0
    {
5417
0
        var->n = 0; var->type = VCF_REF; return;
5418
0
    }
5419
5420
0
    const char *re = r, *ae = a;
5421
0
    while ( re[1] ) re++;
5422
0
    while ( ae[1] ) ae++;
5423
0
    if ( ae[0]==']' || ae[0]=='[' ) { var->type = VCF_BND; return; }    // "joined after" breakend
5424
0
    while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
5425
0
    if ( ae==a )
5426
0
    {
5427
0
        if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
5428
0
        var->n = -(re-r);
5429
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; }
5430
0
        var->type = VCF_OTHER; return;
5431
0
    }
5432
0
    else if ( re==r )
5433
0
    {
5434
0
        var->n = ae-a;
5435
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; }
5436
0
        var->type = VCF_OTHER; return;
5437
0
    }
5438
5439
0
    var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
5440
0
    var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
5441
5442
    // should do also complex events, SVs, etc...
5443
0
}
5444
5445
static int bcf_set_variant_types(bcf1_t *b)
5446
0
{
5447
0
    if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
5448
0
    bcf_dec_t *d = &b->d;
5449
0
    if ( d->n_var < b->n_allele )
5450
0
    {
5451
0
        bcf_variant_t *new_var = hts_realloc_p(d->var, sizeof(bcf_variant_t),
5452
0
                                              b->n_allele);
5453
0
        if (!new_var)
5454
0
            return -1;
5455
0
        d->var = new_var;
5456
0
        d->n_var = b->n_allele;
5457
0
    }
5458
0
    int i;
5459
0
    b->d.var_type = 0;
5460
0
    d->var[0].type = VCF_REF;
5461
0
    d->var[0].n    = 0;
5462
0
    for (i=1; i<b->n_allele; i++)
5463
0
    {
5464
0
        bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
5465
0
        b->d.var_type |= d->var[i].type;
5466
        //fprintf(stderr,"[set_variant_type] %d   %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
5467
0
    }
5468
0
    return 0;
5469
0
}
5470
5471
// bcf_get_variant_type/bcf_get_variant_types should only return the following,
5472
// to be compatible with callers that are not expecting newer values
5473
// like VCF_INS, VCF_DEL.  The full set is available from the newer
5474
// vcf_has_variant_type* interfaces.
5475
0
#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP)
5476
int bcf_get_variant_types(bcf1_t *rec)
5477
0
{
5478
0
    if ( rec->d.var_type==-1 ) {
5479
0
        if (bcf_set_variant_types(rec) != 0) {
5480
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5481
0
            exit(1); // Due to legacy API having no way to report failures
5482
0
        }
5483
0
    }
5484
0
    return rec->d.var_type & ORIG_VAR_TYPES;
5485
0
}
5486
5487
int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
5488
0
{
5489
0
    if ( rec->d.var_type==-1 ) {
5490
0
        if (bcf_set_variant_types(rec) != 0) {
5491
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5492
0
            exit(1); // Due to legacy API having no way to report failures
5493
0
        }
5494
0
    }
5495
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) {
5496
0
        hts_log_error("Requested allele outside valid range");
5497
0
        exit(1);
5498
0
    }
5499
0
    return rec->d.var[ith_allele].type & ORIG_VAR_TYPES;
5500
0
}
5501
#undef ORIG_VAR_TYPES
5502
5503
int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask)
5504
0
{
5505
0
    if ( rec->d.var_type==-1 ) {
5506
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5507
0
    }
5508
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1;
5509
0
    if (bitmask == VCF_REF) {  // VCF_REF is 0, so handled as a special case
5510
0
        return rec->d.var[ith_allele].type == VCF_REF;
5511
0
    }
5512
0
    return bitmask & rec->d.var[ith_allele].type;
5513
0
}
5514
5515
int bcf_variant_length(bcf1_t *rec, int ith_allele)
5516
0
{
5517
0
    if ( rec->d.var_type==-1 ) {
5518
0
        if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing;
5519
0
    }
5520
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing;
5521
0
    return rec->d.var[ith_allele].n;
5522
0
}
5523
5524
int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask,
5525
                          enum bcf_variant_match mode)
5526
0
{
5527
0
    if ( rec->d.var_type==-1 ) {
5528
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5529
0
    }
5530
0
    uint32_t type = rec->d.var_type;
5531
0
    if ( mode==bcf_match_overlap ) return bitmask & type;
5532
5533
    // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may
5534
    // ask for say `VCF_INS` or `VCF_INDEL` only
5535
0
    if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL;
5536
0
    else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL);
5537
5538
0
    if ( mode==bcf_match_subset )
5539
0
    {
5540
0
        if ( ~bitmask & type ) return 0;
5541
0
        else return bitmask & type;
5542
0
    }
5543
    // mode == bcf_match_exact
5544
0
    if ( bitmask==VCF_REF ) return type==bitmask ? 1 : 0;
5545
0
    return type==bitmask ? type : 0;
5546
0
}
5547
5548
int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5549
0
{
5550
0
    static int negative_rlen_warned = 0;
5551
0
    int is_end_tag, is_svlen_tag = 0;
5552
5553
    // Is the field already present?
5554
0
    int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5555
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1;    // No such INFO field in the header
5556
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5557
5558
0
    is_end_tag = strcmp(key, "END") == 0;
5559
0
    is_svlen_tag = strcmp(key, "SVLEN") == 0;
5560
5561
0
    for (i=0; i<line->n_info; i++)
5562
0
        if ( inf_id==line->d.info[i].key ) break;
5563
0
    bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
5564
5565
0
    if ( !n || (type==BCF_HT_STR && !values) )
5566
0
    {
5567
0
        if ( inf )
5568
0
        {
5569
            // Mark the tag for removal, free existing memory if necessary
5570
0
            if ( inf->vptr_free )
5571
0
            {
5572
0
                free(inf->vptr - inf->vptr_off);
5573
0
                inf->vptr_free = 0;
5574
0
            }
5575
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5576
0
            inf->vptr = NULL;
5577
0
            inf->vptr_off = inf->vptr_len = 0;
5578
0
        }
5579
0
        if ( n==0 && (is_end_tag || is_svlen_tag) ) {
5580
0
            line->rlen = get_rlen(hdr, line);
5581
0
        }
5582
0
        return 0;
5583
0
    }
5584
5585
0
    if (is_end_tag)
5586
0
    {
5587
0
        if (n != 1)
5588
0
        {
5589
0
            hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5590
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5591
0
            return -1;
5592
0
        }
5593
0
        if (type != BCF_HT_INT && type != BCF_HT_LONG)
5594
0
        {
5595
0
            hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5596
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5597
0
            return -1;
5598
0
        }
5599
0
    }
5600
5601
    // Encode the values and determine the size required to accommodate the values
5602
0
    kstring_t str = {0,0,0};
5603
0
    bcf_enc_int1(&str, inf_id);
5604
0
    if ( type==BCF_HT_INT )
5605
0
        bcf_enc_vint(&str, n, (int32_t*)values, -1);
5606
0
    else if ( type==BCF_HT_REAL )
5607
0
        bcf_enc_vfloat(&str, n, (float*)values);
5608
0
    else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
5609
0
    {
5610
0
        if ( values==NULL )
5611
0
            bcf_enc_size(&str, 0, BCF_BT_NULL);
5612
0
        else
5613
0
            bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
5614
0
    }
5615
#ifdef VCF_ALLOW_INT64
5616
    else if ( type==BCF_HT_LONG )
5617
    {
5618
        if (n != 1) {
5619
            hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5620
            abort();
5621
        }
5622
        bcf_enc_long1(&str, *(int64_t *) values);
5623
    }
5624
#endif
5625
0
    else
5626
0
    {
5627
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5628
0
        abort();
5629
0
    }
5630
5631
    // Is the INFO tag already present
5632
0
    if ( inf )
5633
0
    {
5634
        // Is it big enough to accommodate new block?
5635
0
        if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off )
5636
0
        {
5637
0
            if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
5638
0
            uint8_t *ptr = inf->vptr - inf->vptr_off;
5639
0
            memcpy(ptr, str.s, str.l);
5640
0
            free(str.s);
5641
0
            int vptr_free = inf->vptr_free;
5642
0
            bcf_unpack_info_core1(ptr, inf);
5643
0
            inf->vptr_free = vptr_free;
5644
0
        }
5645
0
        else
5646
0
        {
5647
0
            if ( inf->vptr_free )
5648
0
                free(inf->vptr - inf->vptr_off);
5649
0
            bcf_unpack_info_core1((uint8_t*)str.s, inf);
5650
0
            inf->vptr_free = 1;
5651
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5652
0
        }
5653
0
    }
5654
0
    else
5655
0
    {
5656
        // The tag is not present, create new one
5657
0
        line->n_info++;
5658
0
        hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
5659
0
        inf = &line->d.info[line->n_info-1];
5660
0
        bcf_unpack_info_core1((uint8_t*)str.s, inf);
5661
0
        inf->vptr_free = 1;
5662
0
        line->d.shared_dirty |= BCF1_DIRTY_INF;
5663
0
    }
5664
0
    line->unpacked |= BCF_UN_INFO;
5665
5666
0
   if ( n==1 && is_end_tag) {
5667
0
        hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values;
5668
0
        if ( (type == BCF_HT_INT && end!=bcf_int32_missing) || (type == BCF_HT_LONG && end!=bcf_int64_missing) )
5669
0
        {
5670
0
            if ( end <= line->pos )
5671
0
            {
5672
0
                if ( !negative_rlen_warned )
5673
0
                {
5674
0
                    hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1);
5675
0
                    negative_rlen_warned = 1;
5676
0
                }
5677
0
            }
5678
0
        }
5679
0
    }
5680
0
    if (is_svlen_tag || is_end_tag) {
5681
0
        line->rlen = get_rlen(hdr, line);
5682
0
    }
5683
0
    return 0;
5684
0
}
5685
5686
int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
5687
0
{
5688
0
    if ( !n )
5689
0
        return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
5690
5691
0
    int i, max_len = 0;
5692
0
    for (i=0; i<n; i++)
5693
0
    {
5694
0
        int len = strlen(values[i]);
5695
0
        if ( len > max_len ) max_len = len;
5696
0
    }
5697
0
    char *out = hts_malloc_p(max_len, n);
5698
0
    if ( !out ) return -2;
5699
0
    for (i=0; i<n; i++)
5700
0
    {
5701
0
        char *dst = out+i*max_len;
5702
0
        const char *src = values[i];
5703
0
        int j = 0;
5704
0
        while ( src[j] ) { dst[j] = src[j]; j++; }
5705
0
        for (; j<max_len; j++) dst[j] = 0;
5706
0
    }
5707
0
    int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
5708
0
    free(out);
5709
0
    return ret;
5710
0
}
5711
5712
int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5713
0
{
5714
    // Is the field already present?
5715
0
    int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5716
0
    int is_len = 0;
5717
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
5718
0
    {
5719
0
        if ( !n ) return 0;
5720
0
        return -1;  // the key not present in the header
5721
0
    }
5722
5723
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5724
5725
0
    for (i=0; i<line->n_fmt; i++)
5726
0
        if ( line->d.fmt[i].id==fmt_id ) break;
5727
0
    bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
5728
5729
0
    is_len = strcmp(key, "LEN") == 0;
5730
0
    if ( !n )
5731
0
    {
5732
0
        if ( fmt )
5733
0
        {
5734
            // Mark the tag for removal, free existing memory if necessary
5735
0
            if ( fmt->p_free )
5736
0
            {
5737
0
                free(fmt->p - fmt->p_off);
5738
0
                fmt->p_free = 0;
5739
0
            }
5740
0
            line->d.indiv_dirty = 1;
5741
0
            fmt->p = NULL;
5742
0
        }
5743
0
        if (is_len) {
5744
0
            line->rlen = get_rlen(hdr, line);
5745
0
        }
5746
0
        return 0;
5747
0
    }
5748
5749
0
    line->n_sample = bcf_hdr_nsamples(hdr);
5750
0
    int nps = n / line->n_sample;  // number of values per sample
5751
0
    assert( nps && nps*line->n_sample==n );     // must be divisible by n_sample
5752
5753
    // Encode the values and determine the size required to accommodate the values
5754
0
    kstring_t str = {0,0,0};
5755
0
    bcf_enc_int1(&str, fmt_id);
5756
0
    assert(values != NULL);
5757
0
    if ( type==BCF_HT_INT )
5758
0
        bcf_enc_vint(&str, n, (int32_t*)values, nps);
5759
0
    else if ( type==BCF_HT_REAL )
5760
0
    {
5761
0
        bcf_enc_size(&str, nps, BCF_BT_FLOAT);
5762
0
        serialize_float_array(&str, nps*line->n_sample, (float *) values);
5763
0
    }
5764
0
    else if ( type==BCF_HT_STR )
5765
0
    {
5766
0
        bcf_enc_size(&str, nps, BCF_BT_CHAR);
5767
0
        kputsn((char*)values, nps*line->n_sample, &str);
5768
0
    }
5769
0
    else
5770
0
    {
5771
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5772
0
        abort();
5773
0
    }
5774
5775
0
    if ( !fmt )
5776
0
    {
5777
        // Not present, new format field
5778
0
        line->n_fmt++;
5779
0
        hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
5780
5781
        // Special case: VCF specification requires that GT is always first
5782
0
        if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
5783
0
        {
5784
0
            for (i=line->n_fmt-1; i>0; i--)
5785
0
                line->d.fmt[i] = line->d.fmt[i-1];
5786
0
            fmt = &line->d.fmt[0];
5787
0
        }
5788
0
        else
5789
0
            fmt = &line->d.fmt[line->n_fmt-1];
5790
0
        bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5791
0
        line->d.indiv_dirty = 1;
5792
0
        fmt->p_free = 1;
5793
0
    }
5794
0
    else
5795
0
    {
5796
        // The tag is already present, check if it is big enough to accommodate the new block
5797
0
        if ( fmt->p && str.l <= fmt->p_len + fmt->p_off )
5798
0
        {
5799
            // good, the block is big enough
5800
0
            if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
5801
0
            uint8_t *ptr = fmt->p - fmt->p_off;
5802
0
            memcpy(ptr, str.s, str.l);
5803
0
            free(str.s);
5804
0
            int p_free = fmt->p_free;
5805
0
            bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
5806
0
            fmt->p_free = p_free;
5807
0
        }
5808
0
        else
5809
0
        {
5810
0
            if ( fmt->p_free )
5811
0
                free(fmt->p - fmt->p_off);
5812
0
            bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5813
0
            fmt->p_free = 1;
5814
0
            line->d.indiv_dirty = 1;
5815
0
        }
5816
0
    }
5817
0
    line->unpacked |= BCF_UN_FMT;
5818
5819
0
    if (is_len) {
5820
0
        line->rlen = get_rlen(hdr, line);
5821
0
    }
5822
0
    return 0;
5823
0
}
5824
5825
5826
int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
5827
0
{
5828
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5829
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5830
0
    line->d.n_flt = n;
5831
0
    if ( !n ) return 0;
5832
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5833
0
    int i;
5834
0
    for (i=0; i<n; i++)
5835
0
        line->d.flt[i] = flt_ids[i];
5836
0
    return 0;
5837
0
}
5838
5839
int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
5840
0
{
5841
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5842
0
    int i;
5843
0
    for (i=0; i<line->d.n_flt; i++)
5844
0
        if ( flt_id==line->d.flt[i] ) break;
5845
0
    if ( i<line->d.n_flt ) return 0;    // this filter is already set
5846
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5847
0
    if ( flt_id==0 )    // set to PASS
5848
0
        line->d.n_flt = 1;
5849
0
    else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
5850
0
        line->d.n_flt = 1;
5851
0
    else
5852
0
        line->d.n_flt++;
5853
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5854
0
    line->d.flt[line->d.n_flt-1] = flt_id;
5855
0
    return 1;
5856
0
}
5857
int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
5858
0
{
5859
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5860
0
    int i;
5861
0
    for (i=0; i<line->d.n_flt; i++)
5862
0
        if ( flt_id==line->d.flt[i] ) break;
5863
0
    if ( i==line->d.n_flt ) return 0;   // the filter is not present
5864
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5865
0
    if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
5866
0
    line->d.n_flt--;
5867
0
    if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
5868
0
    return 0;
5869
0
}
5870
5871
int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
5872
0
{
5873
0
    if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
5874
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
5875
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1;  // not defined in the header
5876
5877
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5878
0
    if ( id==0 && !line->d.n_flt) return 1; // PASS
5879
5880
0
    int i;
5881
0
    for (i=0; i<line->d.n_flt; i++)
5882
0
        if ( line->d.flt[i]==id ) return 1;
5883
0
    return 0;
5884
0
}
5885
5886
static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
5887
0
{
5888
0
    line->d.shared_dirty |= BCF1_DIRTY_ALS;
5889
0
    line->d.var_type = -1;
5890
5891
0
    line->n_allele = nals;
5892
0
    hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
5893
5894
0
    char *als = line->d.als;
5895
0
    int n = 0;
5896
0
    while (n<nals)
5897
0
    {
5898
0
        line->d.allele[n] = als;
5899
0
        while ( *als ) als++;
5900
0
        als++;
5901
0
        n++;
5902
0
    }
5903
    // Update REF length. Note that END is 1-based while line->pos 0-based
5904
0
    line->rlen = get_rlen(hdr, line);
5905
5906
0
    return 0;
5907
0
}
5908
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
5909
0
{
5910
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5911
0
    char *free_old = NULL;
5912
0
    char buffer[256];
5913
0
    size_t used = 0;
5914
5915
    // The pointers in alleles may point into the existing line->d.als memory,
5916
    // so care needs to be taken not to clobber them while updating.  Usually
5917
    // they will be short so we can copy through an intermediate buffer.
5918
    // If they're longer, or won't fit in the existing allocation we
5919
    // can allocate a new buffer to write into.  Note that in either case
5920
    // pointers to line->d.als memory in alleles may not be valid when we've
5921
    // finished.
5922
0
    int i;
5923
0
    size_t avail = line->d.m_als < sizeof(buffer) ? line->d.m_als : sizeof(buffer);
5924
0
    for (i=0; i<nals; i++) {
5925
0
        size_t sz = strlen(alleles[i]) + 1;
5926
0
        if (avail - used < sz)
5927
0
            break;
5928
0
        memcpy(buffer + used, alleles[i], sz);
5929
0
        used += sz;
5930
0
    }
5931
5932
    // Did we miss anything?
5933
0
    if (i < nals) {
5934
0
        int j;
5935
0
        size_t needed = used;
5936
0
        char *new_als;
5937
0
        for (j = i; j < nals; j++)
5938
0
            needed += strlen(alleles[j]) + 1;
5939
0
        if (needed < line->d.m_als) // Don't shrink the buffer
5940
0
            needed = line->d.m_als;
5941
0
        if (needed > INT_MAX) {
5942
0
            hts_log_error("REF + alleles too long to fit in a BCF record");
5943
0
            return -1;
5944
0
        }
5945
0
        new_als = malloc(needed);
5946
0
        if (!new_als)
5947
0
            return -1;
5948
0
        free_old = line->d.als;
5949
0
        line->d.als = new_als;
5950
0
        line->d.m_als = needed;
5951
0
    }
5952
5953
    // Copy from the temp buffer to the destination
5954
0
    if (used) {
5955
0
        assert(used <= line->d.m_als);
5956
0
        memcpy(line->d.als, buffer, used);
5957
0
    }
5958
5959
    // Add in any remaining entries - if this happens we will always be
5960
    // writing to a newly-allocated buffer.
5961
0
    for (; i < nals; i++) {
5962
0
        size_t sz = strlen(alleles[i]) + 1;
5963
0
        memcpy(line->d.als + used, alleles[i], sz);
5964
0
        used += sz;
5965
0
    }
5966
5967
0
    if (free_old)
5968
0
        free(free_old);
5969
0
    return _bcf1_sync_alleles(hdr,line,nals);
5970
0
}
5971
5972
int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
5973
0
{
5974
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5975
0
    kstring_t tmp;
5976
0
    tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
5977
0
    kputs(alleles_string, &tmp);
5978
0
    line->d.als = tmp.s; line->d.m_als = tmp.m;
5979
5980
0
    int nals = 1;
5981
0
    char *t = line->d.als;
5982
0
    while (*t)
5983
0
    {
5984
0
        if ( *t==',' ) { *t = 0; nals++; }
5985
0
        t++;
5986
0
    }
5987
0
    return _bcf1_sync_alleles(hdr, line, nals);
5988
0
}
5989
5990
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
5991
0
{
5992
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5993
0
    kstring_t tmp;
5994
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
5995
0
    if ( id )
5996
0
        kputs(id, &tmp);
5997
0
    else
5998
0
        kputs(".", &tmp);
5999
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
6000
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
6001
0
    return 0;
6002
0
}
6003
6004
int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
6005
0
{
6006
0
    if ( !id ) return 0;
6007
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
6008
6009
0
    kstring_t tmp;
6010
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
6011
6012
0
    int len = strlen(id);
6013
0
    char *dst = line->d.id;
6014
0
    while ( *dst && (dst=strstr(dst,id)) )
6015
0
    {
6016
0
        if ( dst[len]!=0 && dst[len]!=';' ) dst++;              // a prefix, not a match
6017
0
        else if ( dst==line->d.id || dst[-1]==';' ) return 0;   // already present
6018
0
        dst++;  // a suffix, not a match
6019
0
    }
6020
0
    if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
6021
0
    {
6022
0
        tmp.l = strlen(line->d.id);
6023
0
        kputc(';',&tmp);
6024
0
    }
6025
0
    kputs(id,&tmp);
6026
6027
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
6028
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
6029
0
    return 0;
6030
6031
0
}
6032
6033
bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
6034
0
{
6035
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
6036
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL;   // no such FMT field in the header
6037
0
    return bcf_get_fmt_id(line, id);
6038
0
}
6039
6040
bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
6041
0
{
6042
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
6043
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL;   // no such INFO field in the header
6044
0
    return bcf_get_info_id(line, id);
6045
0
}
6046
6047
bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
6048
0
{
6049
0
    int i;
6050
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6051
0
    for (i=0; i<line->n_fmt; i++)
6052
0
    {
6053
0
        if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
6054
0
    }
6055
0
    return NULL;
6056
0
}
6057
6058
bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
6059
0
{
6060
0
    int i;
6061
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
6062
0
    for (i=0; i<line->n_info; i++)
6063
0
    {
6064
0
        if ( line->d.info[i].key==id ) return &line->d.info[i];
6065
0
    }
6066
0
    return NULL;
6067
0
}
6068
6069
6070
int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
6071
0
{
6072
0
    int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6073
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1;    // no such INFO field in the header
6074
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2;     // expected different type
6075
6076
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
6077
6078
0
    for (i=0; i<line->n_info; i++)
6079
0
        if ( line->d.info[i].key==tag_id ) break;
6080
0
    if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3;       // the tag is not present in this record
6081
0
    if ( type==BCF_HT_FLAG ) return 1;
6082
6083
0
    bcf_info_t *info = &line->d.info[i];
6084
0
    if ( !info->vptr ) return -3;           // the tag was marked for removal
6085
0
    if ( type==BCF_HT_STR )
6086
0
    {
6087
0
        if ( *ndst < info->len+1 )
6088
0
        {
6089
0
            *ndst = info->len + 1;
6090
0
            *dst  = realloc(*dst, *ndst);
6091
0
        }
6092
0
        memcpy(*dst,info->vptr,info->len);
6093
0
        ((uint8_t*)*dst)[info->len] = 0;
6094
0
        return info->len;
6095
0
    }
6096
6097
    // Make sure the buffer is big enough
6098
0
    int size1;
6099
0
    switch (type) {
6100
0
        case BCF_HT_INT:  size1 = sizeof(int32_t); break;
6101
0
        case BCF_HT_LONG: size1 = sizeof(int64_t); break;
6102
0
        case BCF_HT_REAL: size1 = sizeof(float); break;
6103
0
        default:
6104
0
            hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
6105
0
            return -2;
6106
0
    }
6107
0
    if ( *ndst < info->len )
6108
0
    {
6109
0
        *ndst = info->len;
6110
0
        *dst  = hts_realloc_p(*dst, *ndst, size1);
6111
0
    }
6112
6113
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \
6114
0
        out_type_t *tmp = (out_type_t *) *dst; \
6115
0
        int j; \
6116
0
        for (j=0; j<info->len; j++) \
6117
0
        { \
6118
0
            type_t p = convert(info->vptr + j * sizeof(type_t)); \
6119
0
            if ( is_vector_end ) break; \
6120
0
            if ( is_missing ) set_missing; \
6121
0
            else set_regular; \
6122
0
            tmp++; \
6123
0
        } \
6124
0
        ret = j; \
6125
0
    } while (0)
6126
0
    switch (info->type) {
6127
0
        case BCF_BT_INT8:
6128
0
            if (type == BCF_HT_LONG) {
6129
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int64_missing, *tmp=p, int64_t);
6130
0
            } else {
6131
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=p, int32_t);
6132
0
            }
6133
0
            break;
6134
0
        case BCF_BT_INT16:
6135
0
            if (type == BCF_HT_LONG) {
6136
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
6137
0
            } else {
6138
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
6139
0
            }
6140
0
            break;
6141
0
        case BCF_BT_INT32:
6142
0
            if (type == BCF_HT_LONG) {
6143
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break;
6144
0
            } else {
6145
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
6146
0
            }
6147
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
6148
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
6149
0
    }
6150
0
    #undef BRANCH
6151
0
    return ret;  // set by BRANCH
6152
0
}
6153
6154
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
6155
0
{
6156
0
    int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6157
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
6158
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;     // expected different type
6159
6160
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6161
6162
0
    for (i=0; i<line->n_fmt; i++)
6163
0
        if ( line->d.fmt[i].id==tag_id ) break;
6164
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
6165
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
6166
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
6167
6168
0
    int nsmpl = bcf_hdr_nsamples(hdr);
6169
0
    if ( !*dst )
6170
0
    {
6171
0
        *dst = hts_malloc_p(sizeof(char*), nsmpl);
6172
0
        if ( !*dst ) return -4;     // could not alloc
6173
0
        (*dst)[0] = NULL;
6174
0
    }
6175
0
    int n = (fmt->n+1)*nsmpl;
6176
0
    if ( *ndst < n )
6177
0
    {
6178
0
        (*dst)[0] = realloc((*dst)[0], n);
6179
0
        if ( !(*dst)[0] ) return -4;    // could not alloc
6180
0
        *ndst = n;
6181
0
    }
6182
0
    for (i=0; i<nsmpl; i++)
6183
0
    {
6184
0
        uint8_t *src = fmt->p + i*fmt->n;
6185
0
        uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
6186
0
        memcpy(tmp,src,fmt->n);
6187
0
        tmp[fmt->n] = 0;
6188
0
        (*dst)[i] = (char*) tmp;
6189
0
    }
6190
0
    return n;
6191
0
}
6192
6193
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
6194
0
{
6195
0
    int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6196
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
6197
0
    if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
6198
0
    {
6199
        // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
6200
0
        if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
6201
0
    }
6202
0
    else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2;     // expected different type
6203
6204
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6205
6206
0
    for (i=0; i<line->n_fmt; i++)
6207
0
        if ( line->d.fmt[i].id==tag_id ) break;
6208
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
6209
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
6210
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
6211
6212
0
    if ( type==BCF_HT_STR )
6213
0
    {
6214
0
        int n = fmt->n*bcf_hdr_nsamples(hdr);
6215
0
        if ( *ndst < n )
6216
0
        {
6217
0
            *dst  = realloc(*dst, n);
6218
0
            if ( !*dst ) return -4;     // could not alloc
6219
0
            *ndst = n;
6220
0
        }
6221
0
        memcpy(*dst,fmt->p,n);
6222
0
        return n;
6223
0
    }
6224
6225
    // Make sure the buffer is big enough
6226
0
    int nsmpl = bcf_hdr_nsamples(hdr);
6227
0
    int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
6228
0
    if ( *ndst < fmt->n*nsmpl )
6229
0
    {
6230
0
        *ndst = fmt->n*nsmpl;
6231
0
        *dst  = hts_realloc_p(*dst, *ndst, size1);
6232
0
        if ( !*dst ) return -4;     // could not alloc
6233
0
    }
6234
6235
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
6236
0
        out_type_t *tmp = (out_type_t *) *dst; \
6237
0
        uint8_t *fmt_p = fmt->p; \
6238
0
        for (i=0; i<nsmpl; i++) \
6239
0
        { \
6240
0
            for (j=0; j<fmt->n; j++) \
6241
0
            { \
6242
0
                type_t p = convert(fmt_p + j * sizeof(type_t)); \
6243
0
                if ( is_missing ) set_missing; \
6244
0
                else if ( is_vector_end ) { set_vector_end; break; } \
6245
0
                else set_regular; \
6246
0
                tmp++; \
6247
0
            } \
6248
0
            for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
6249
0
            fmt_p += fmt->size; \
6250
0
        } \
6251
0
    }
6252
0
    switch (fmt->type) {
6253
0
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6254
0
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6255
0
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6256
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
6257
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1);
6258
0
    }
6259
0
    #undef BRANCH
6260
6261
0
    return nsmpl*fmt->n;
6262
0
}
6263
6264
//error description structure definition
6265
typedef struct err_desc {
6266
    int  errorcode;
6267
    const char *description;
6268
}err_desc;
6269
6270
// error descriptions
6271
static const err_desc errdesc_bcf[] = {
6272
    { BCF_ERR_CTG_UNDEF, "Contig not defined in header"},
6273
    { BCF_ERR_TAG_UNDEF, "Tag not defined in header" },
6274
    { BCF_ERR_NCOLS, "Incorrect number of columns" },
6275
    { BCF_ERR_LIMITS, "Limits reached" },
6276
    { BCF_ERR_CHAR, "Invalid character" },
6277
    { BCF_ERR_CTG_INVALID, "Invalid contig" },
6278
    { BCF_ERR_TAG_INVALID, "Invalid tag" },
6279
};
6280
6281
/// append given description to buffer based on available size and add ... when not enough space
6282
    /** @param buffer       buffer to which description to be appended
6283
        @param offset       offset at which to be appended
6284
        @param maxbuffer    maximum size of the buffer
6285
        @param description  the description to be appended
6286
on failure returns -1 - when buffer is not big enough; returns -1 on invalid params and on too small buffer which are improbable due to validation at caller site
6287
on success returns 0
6288
    */
6289
3.93k
static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) {
6290
6291
3.93k
    if (!description || !buffer || !offset || (maxbuffer < 4))
6292
0
        return -1;
6293
6294
3.93k
    size_t rembuffer = maxbuffer - *offset;
6295
3.93k
    if (rembuffer > (strlen(description) + (rembuffer == maxbuffer ? 0 : 1))) {    //add description with optionally required ','
6296
3.93k
        *offset += snprintf(buffer + *offset, rembuffer, "%s%s", (rembuffer == maxbuffer)? "": ",", description);
6297
3.93k
    } else {    //not enough space for description, put ...
6298
0
        size_t tmppos = (rembuffer <= 4) ? maxbuffer - 4 : *offset;
6299
0
        snprintf(buffer + tmppos, 4, "...");    //ignore offset update
6300
0
        return -1;
6301
0
    }
6302
3.93k
    return 0;
6303
3.93k
}
6304
6305
//get description for given error code. return NULL on error
6306
1.87k
const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) {
6307
1.87k
    size_t usedup = 0;
6308
1.87k
    int ret = 0;
6309
1.87k
    int idx;
6310
6311
1.87k
    if (!buffer || maxbuffer < 4)
6312
0
        return NULL;           //invalid / insufficient buffer
6313
6314
1.87k
    if (!errorcode) {
6315
0
        buffer[0] = '\0';      //no error, set null
6316
0
        return buffer;
6317
0
    }
6318
6319
15.0k
    for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) {
6320
13.1k
        if (errorcode & errdesc_bcf[idx].errorcode) {    //error is set, add description
6321
3.93k
            ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description);
6322
3.93k
            if (ret < 0)
6323
0
                break;         //not enough space, ... added, no need to continue
6324
6325
3.93k
            errorcode &= ~errdesc_bcf[idx].errorcode;    //reset the error
6326
3.93k
        }
6327
13.1k
    }
6328
6329
1.87k
    if (errorcode && (ret >= 0))  {     //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§
6330
0
        add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error");
6331
0
    }
6332
1.87k
    return buffer;
6333
1.87k
}
6334
6335
/**
6336
 *  bcf_format_gt_v2 - formats GT information on a string
6337
 *  @param hdr - bcf header, to get version
6338
 *  @param fmt - pointer to bcf format data
6339
 *  @param isample - position of interested sample in data
6340
 *  @param str - pointer to output string
6341
 *  Returns 0 on success and -1 on failure
6342
 *  This method is preferred over bcf_format_gt as this supports vcf4.4 and
6343
 *  prefixed phasing. Explicit / prefixed phasing for 1st allele is used only
6344
 *  when it is a must to correctly express phasing.
6345
 * correctly express phasing.
6346
 */
6347
int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str)
6348
16.0k
{
6349
16.0k
    uint32_t e = 0;
6350
16.0k
    int ploidy = 1, anyunphased = 0;
6351
16.0k
    int32_t val0 = 0;
6352
16.0k
    size_t pos = str ? str->l : 0;
6353
6354
16.0k
    #define BRANCH(type_t, convert, missing, vector_end) { \
6355
14.4k
        uint8_t *ptr = fmt->p + isample*fmt->size; \
6356
14.4k
        int i; \
6357
37.5k
        for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \
6358
32.4k
        { \
6359
32.4k
            type_t val = convert(ptr); \
6360
32.4k
            if ( val == vector_end ) break; \
6361
32.4k
            if (!i) { val0 = val; } \
6362
23.0k
            if (i) { \
6363
8.66k
                e |= kputc("/|"[val & 1], str) < 0; \
6364
8.66k
                anyunphased |= !(val & 1); \
6365
8.66k
            } \
6366
23.0k
            if (!(val >> 1)) e |= kputc('.', str) < 0; \
6367
23.0k
            else e |= kputw((val >> 1) - 1, str) < 0; \
6368
23.0k
        } \
6369
14.4k
        if (i == 0) e |= kputc('.', str) < 0; \
6370
14.4k
        ploidy = i; \
6371
14.4k
    }
6372
16.0k
    switch (fmt->type) {
6373
8.30k
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  bcf_int8_missing,
6374
8.30k
            bcf_int8_vector_end); break;
6375
957
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing,
6376
957
            bcf_int16_vector_end); break;
6377
5.16k
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing,
6378
5.16k
            bcf_int32_vector_end); break;
6379
1.60k
        case BCF_BT_NULL:  e |= kputc('.', str) < 0; break;
6380
0
        default: hts_log_error("Unexpected type %d", fmt->type); return -2;
6381
16.0k
    }
6382
16.0k
    #undef BRANCH
6383
6384
16.0k
    if (hdr && get_hdr_aux(hdr)->version >= VCF44) {
6385
        //output which supports prefixed phasing
6386
6387
        /* update 1st allele's phasing if required and append rest to it.
6388
        use prefixed phasing only when it is a must. i.e. without which the
6389
        inferred value will be incorrect */
6390
9.42k
        if (val0 & 1) {
6391
            /* 1st one is phased, if ploidy is > 1 and an unphased allele exists
6392
             need to specify explicitly */
6393
1.09k
            e |= (ploidy > 1 && anyunphased) ?
6394
88
                    (kinsert_char('|', pos, str) < 0) :
6395
1.09k
                        (ploidy <= 1 && !((val0 >> 1)) ? //|. needs explicit o/p
6396
0
                            (kinsert_char('|', pos, str) < 0) :
6397
1.00k
                            0);
6398
8.33k
        } else {
6399
            /* 1st allele is unphased, if ploidy is = 1 or allele is '.' or
6400
             ploidy > 1 and no other unphased allele exist, need to specify
6401
             explicitly */
6402
8.33k
            e |= ((ploidy <= 1 && val0 != 0) || (ploidy > 1 && !anyunphased)) ?
6403
5.14k
                    (kinsert_char('/', pos, str) < 0) :
6404
8.33k
                    0;
6405
8.33k
        }
6406
9.42k
    }
6407
16.0k
    return e == 0 ? 0 : -1;
6408
16.0k
}
6409
6410
/**
6411
 *  get_rlen - calculates and returns rlen value
6412
 *  @param h - bcf header
6413
 *  @param v - bcf data
6414
 *  Returns rlen calculated on success and -1 on failure.
6415
 *  rlen calculation is dependent on vcf version and a few other field data.
6416
 *  When bcf decoded data is available, refers it. When not available, retrieves
6417
 *  required field data by seeking on the data stream.
6418
 *  Ideally pos & version be set appropriately before any info/format field
6419
 *  update to have proper rlen calculation.
6420
 *  As version is not kept properly updated in practice, it is ignored in calcs.
6421
 */
6422
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v)
6423
32.8k
{
6424
32.8k
    uint8_t *f = (uint8_t*)v->shared.s, *t = NULL,
6425
32.8k
        *e = (uint8_t*)v->shared.s + v->shared.l;
6426
32.8k
    int size, type, id, lenid, endid, svlenid, i, bad, gvcf = 0, use_svlen = 0;
6427
32.8k
    bcf_info_t *endinfo = NULL, *svleninfo = NULL, end_lcl, svlen_lcl;
6428
32.8k
    bcf_fmt_t *lenfmt = NULL, len_lcl;
6429
6430
    //holds SVLEN allele status for the max no of alleles
6431
32.8k
    uint8_t svlenals[8192];
6432
    //pos from info END, fmt LEN, info SVLEN
6433
32.8k
    hts_pos_t end = 0, end_fmtlen = 0, end_svlen = 0, hpos;
6434
32.8k
    int64_t len_ref = 0, len = 0, tmp;
6435
32.8k
    endid = bcf_hdr_id2int(h, BCF_DT_ID, "END");
6436
6437
    //initialise bytes which are to be used
6438
32.8k
    memset(svlenals, 0, 1 + v->n_allele / 8);
6439
6440
    //use decoded data where ever available and where not, get from stream
6441
32.8k
    if (v->unpacked & BCF_UN_STR || v->d.shared_dirty & BCF1_DIRTY_ALS) {
6442
0
        for (i = 1; i < v->n_allele; ++i) {
6443
            // check only symbolic alt alleles
6444
0
            if (v->d.allele[i][0] != '<')
6445
0
                continue;
6446
0
            if (svlen_on_ref_for_vcf_alt(v->d.allele[i], -1)) {
6447
                // del, dup or cnv allele, note to check corresponding svlen val
6448
0
                svlenals[i >> 3] |= 1 << (i & 7);
6449
0
                use_svlen = 1;
6450
0
            } else if (!strcmp(v->d.allele[i], "<*>") ||
6451
0
                         !strcmp(v->d.allele[i], "<NON_REF>")) {
6452
0
                gvcf = 1;   //gvcf present, have to check for LEN field
6453
0
            }
6454
0
        }
6455
0
        f += v->unpack_size[0] + v->unpack_size[1];
6456
0
        len_ref = v->n_allele ? strlen(v->d.allele[0]) : 0;
6457
32.8k
    } else if (f < e) {
6458
        //skip ID
6459
32.8k
        size = bcf_dec_size(f, &f, &type);
6460
32.8k
        f += size << bcf_type_shift[type];
6461
        // REF, ALT
6462
2.37M
        for (i = 0; i < v->n_allele; ++i) {
6463
            //check all alleles, w/o NUL
6464
2.33M
            size = bcf_dec_size(f, &f, &type);
6465
2.33M
            if (!i) {   //REF length
6466
32.8k
                len_ref = size;
6467
2.30M
            } else if (size > 0 && *f == '<') {
6468
22.1k
                if (svlen_on_ref_for_vcf_alt((char *) f, size)) {
6469
                    // del, dup or cnv allele, note to check corresponding svlen val
6470
27
                    svlenals[i >> 3] |= 1 << (i & 7);
6471
27
                    use_svlen = 1;
6472
22.1k
                } else if ((size == 3 && !strncmp((char*)f, "<*>", size)) ||
6473
14.7k
                    (size == 9 && !strncmp((char*)f, "<NON_REF>", size))) {
6474
7.77k
                    gvcf = 1;   //gvcf present, have to check for LEN field
6475
7.77k
                }
6476
22.1k
            }
6477
2.33M
            f += size << bcf_type_shift[type];
6478
2.33M
        }
6479
32.8k
    }
6480
    // FILTER
6481
32.8k
    if (v->unpacked & BCF_UN_FLT) {
6482
0
        f += v->unpack_size[2];
6483
32.8k
    } else if (f < e) {
6484
32.8k
        size = bcf_dec_size(f, &f, &type);
6485
32.8k
        f += size << bcf_type_shift[type];
6486
32.8k
    }
6487
6488
    // Only do SVLEN lookup if there are suitable symbolic alleles
6489
32.8k
    svlenid = use_svlen ? bcf_hdr_id2int(h, BCF_DT_ID, "SVLEN") : -1;
6490
6491
    // INFO
6492
32.8k
    if (svlenid >= 0 || endid >= 0 ) {  //only if end/svlen present
6493
13.2k
        if (v->unpacked & BCF_UN_INFO || v->d.shared_dirty & BCF1_DIRTY_INF) {
6494
0
            endinfo = bcf_get_info(h, v, "END");
6495
0
            svleninfo = bcf_get_info(h, v, "SVLEN");
6496
13.2k
        } else if (f < e) {
6497
21.2k
            for (i = 0; i < v->n_info; ++i) {
6498
16.6k
                id = bcf_dec_typed_int1(f, &t);
6499
16.6k
                if (id == endid) {  //END
6500
1.97k
                    t = bcf_unpack_info_core1(f, &end_lcl);
6501
1.97k
                    endinfo = &end_lcl;
6502
1.97k
                    if (svleninfo || svlenid < 0) {
6503
1.97k
                        break;  //already got svlen or no need to search further
6504
1.97k
                    }
6505
14.7k
                } else if (id == svlenid) { //SVLEN
6506
0
                    t = bcf_unpack_info_core1(f, &svlen_lcl);
6507
0
                    svleninfo = &svlen_lcl;
6508
0
                    if (endinfo || endid < 0 ) {
6509
0
                        break;  //already got end or no need to search further
6510
0
                    }
6511
14.7k
                } else {
6512
14.7k
                    f = t;
6513
14.7k
                    size = bcf_dec_size(f, &t, &type);
6514
14.7k
                    t += size << bcf_type_shift[type];
6515
14.7k
                }
6516
14.7k
                f = t;
6517
14.7k
            }
6518
6.54k
        }
6519
13.2k
    }
6520
6521
    // Only do LEN lookup if a <*> allele was found
6522
32.8k
    lenid = gvcf ? bcf_hdr_id2int(h, BCF_DT_ID, "LEN") : -1;
6523
6524
    // FORMAT
6525
32.8k
    if (lenid >= 0) {
6526
        //with LEN and has gvcf allele
6527
0
        f = (uint8_t*)v->indiv.s; t = NULL; e = (uint8_t*)v->indiv.s + v->indiv.l;
6528
0
        if (v->unpacked & BCF_UN_FMT || v->d.indiv_dirty) {
6529
0
            lenfmt = bcf_get_fmt(h, v, "LEN");
6530
0
        } else if (f < e) {
6531
0
            for (i = 0; i < v->n_fmt; ++i) {
6532
0
                id = bcf_dec_typed_int1(f, &t);
6533
0
                if (id == lenid) {
6534
0
                        t = bcf_unpack_fmt_core1(f, v->n_sample, &len_lcl);
6535
0
                    lenfmt = &len_lcl;
6536
0
                    break;  //that's all needed
6537
0
                } else {
6538
0
                    f = t;
6539
0
                    size = bcf_dec_size(f, &t, &type);
6540
0
                    t += size * v->n_sample << bcf_type_shift[type];
6541
0
                }
6542
0
                f = t;
6543
0
            }
6544
0
        }
6545
0
    }
6546
    //got required data, find end and rlen
6547
32.8k
    if (endinfo && endinfo->vptr) { //end position given by info END
6548
        //end info exists, not being deleted
6549
1.97k
        end = endinfo->v1.i;
6550
1.97k
        switch(endinfo->type) {
6551
0
            case BCF_BT_INT8:  end = end == bcf_int8_missing ? 0 : end;  break;
6552
0
            case BCF_BT_INT16: end = end == bcf_int16_missing ? 0 : end; break;
6553
0
            case BCF_BT_INT32: end = end == bcf_int32_missing ? 0 : end; break;
6554
0
            case BCF_BT_INT64: end = end == bcf_int64_missing ? 0 : end; break;
6555
1.97k
            default: end = 0; break; //invalid
6556
1.97k
        }
6557
1.97k
    }
6558
6559
32.8k
    if (svleninfo && svleninfo->vptr) {
6560
        //svlen info exists, not being deleted
6561
0
        bad = 0;
6562
        //get largest svlen corresponding to a <DEL> symbolic allele
6563
0
        for (i = 0; i < svleninfo->len && i + 1 < v->n_allele; ++i) {
6564
0
            if (!(svlenals[i >> 3] & (1 << ((i + 1) & 7))))
6565
0
                continue;
6566
6567
0
            switch(svleninfo->type) {
6568
0
                case BCF_BT_INT8:
6569
0
                    tmp = le_to_i8(&svleninfo->vptr[i]);
6570
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6571
0
                break;
6572
0
                case BCF_BT_INT16:
6573
0
                    tmp = le_to_i16(&svleninfo->vptr[i * 2]);
6574
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6575
0
                break;
6576
0
                case BCF_BT_INT32:
6577
0
                    tmp = le_to_i32(&svleninfo->vptr[i * 4]);
6578
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6579
0
                break;
6580
0
                case BCF_BT_INT64:
6581
0
                    tmp = le_to_i64(&svleninfo->vptr[i * 8]);
6582
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6583
0
                break;
6584
0
                default: //invalid
6585
0
                    tmp = 0;
6586
0
                    bad = 1;
6587
0
                break;
6588
0
            }
6589
0
            if (bad) {  //stop svlen check
6590
0
                len = 0;
6591
0
                break;
6592
0
            }
6593
6594
0
            tmp = tmp < 0 ? llabs(tmp) : tmp;
6595
0
            if (len < tmp) len = tmp;
6596
0
        }
6597
0
    }
6598
32.8k
    if ((!svleninfo || !len) && end) { //no svlen, infer from end
6599
0
        len = end > v->pos ? end - v->pos - 1 : 0;
6600
0
    }
6601
32.8k
    end_svlen = v->pos + len + 1;   //end position found from SVLEN
6602
6603
32.8k
    len = 0;
6604
32.8k
    if (lenfmt && lenfmt->p) {
6605
        //fmt len exists, not being deleted, has gvcf and version >= 4.5
6606
0
        int j = 0;
6607
0
        int64_t offset = 0;
6608
0
        bad = 0;
6609
0
        for (i = 0; i < v->n_sample; ++i) {
6610
0
            for (j = 0; j < lenfmt->n; ++j) {
6611
0
                switch(lenfmt->type) {
6612
0
                case BCF_BT_INT8:
6613
0
                    tmp = le_to_i8(lenfmt->p + offset + j);
6614
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6615
0
                break;
6616
0
                case BCF_BT_INT16:
6617
0
                    tmp = le_to_i16(lenfmt->p + offset + j * 2);
6618
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6619
0
                break;
6620
0
                case BCF_BT_INT32:
6621
0
                    tmp = le_to_i32(lenfmt->p + offset + j * 4);
6622
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6623
0
                break;
6624
0
                case BCF_BT_INT64:
6625
0
                    tmp = le_to_i64(lenfmt->p + offset + j * 8);
6626
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6627
0
                break;
6628
0
                default: //invalid
6629
0
                    bad = 1;
6630
0
                break;
6631
0
                }
6632
0
                if (bad) {  //stop LEN check
6633
0
                    len = 0;
6634
0
                    break;
6635
0
                }
6636
                //assumes only gvcf have valid LEN
6637
0
                if (len < tmp) len = tmp;
6638
0
            }
6639
0
            offset += j << bcf_type_shift[lenfmt->type];
6640
0
        }
6641
0
    }
6642
32.8k
    if ((!lenfmt || !len) && end) { //no fmt len, infer from end
6643
0
        len = end > v->pos ? end - v->pos : 0;
6644
0
    }
6645
32.8k
    end_fmtlen = v->pos + len;  //end position found from LEN
6646
6647
    //get largest pos, based on END, SVLEN, fmt LEN and length using it
6648
32.8k
    hpos = end < end_svlen ?
6649
8.03k
            end_svlen < end_fmtlen ? end_fmtlen : end_svlen :
6650
32.8k
            end < end_fmtlen ? end_fmtlen : end;
6651
32.8k
    len = hpos - v->pos;
6652
6653
    //NOTE: 'end' calculation be in sync with tbx.c:tbx_parse1
6654
6655
    /* rlen to be calculated based on version, END, SVLEN, fmt LEN, ref len.
6656
    Relevance of these fields vary across different vcf versions.
6657
    Many times, these info/fmt fields are used without version updates;
6658
    hence these fields are used for calculation disregarding vcf version */
6659
32.8k
    return len < len_ref ? len_ref : len;
6660
32.8k
}